github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/libraries/doltcore/sqle/binlogreplication/binlog_json_serialization.go

// Copyright 2024 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package binlogreplication

import (
	"encoding/binary"
	"fmt"
	"math"
	"sort"

	gmstypes "github.com/dolthub/go-mysql-server/sql/types"
)

const jsonTypeSmallObject = byte(0x00)
const jsonTypeLargeObject = byte(0x01)
const jsonTypeSmallArray = byte(0x02)
const jsonTypeLargeArray = byte(0x03)
const jsonTypeLiteral = byte(0x04)
const jsonTypeInt16 = byte(0x05)
const jsonTypeUint16 = byte(0x06)
const jsonTypeInt32 = byte(0x07)
const jsonTypeUint32 = byte(0x08)
const jsonTypeInt64 = byte(0x09)
const jsonTypeUint64 = byte(0x0a)
const jsonTypeDouble = byte(0x0b)
const jsonTypeString = byte(0x0c)
const jsonTypeCustom = byte(0x0f)

const jsonLiteralValueNull = byte(0x00)
const jsonLiteralValueTrue = byte(0x01)
const jsonLiteralValueFalse = byte(0x02)

// maxOffsetSize is used to determine if a byte offset into an array or object encoding will exceed the capacity
// of a uint16 and whether the encoding needs to switch to the large array or large object format.
const maxOffsetSize = uint32(65_535)

// encodeJsonDoc encodes the specified |jsonDoc| into MySQL's custom/internal binary encoding
// so that it can be included in a binlog event.
//
// The internal MySQL JSON binary format is documented here:
// https://dev.mysql.com/doc/dev/mysql-server/latest/json__binary_8h.html
//
// And a third-party description is here:
// https://lafengnan.gitbooks.io/blog/content/mysql/chapter2.html
func encodeJsonDoc(jsonDoc gmstypes.JSONDocument) (buffer []byte, err error) {
	typeId, encodedValue, err := encodeJsonValue(jsonDoc.Val)
	if err != nil {
		return nil, err
	}
	buffer = append(buffer, typeId)
	return append(buffer, encodedValue...), nil
}

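// NOTE: The example below is an illustrative sketch, not part of the original
// implementation, and the helper name is hypothetical. It shows how encodeJsonDoc
// might be called and what the resulting payload looks like for a trivial document:
// a 1-byte type ID followed by the encoded value.
func exampleEncodeJsonDoc() {
	// A bare JSON string "ok" encodes as 0x0c (jsonTypeString), 0x02 (string
	// length, see appendStringLength), then the bytes of "ok".
	encoded, err := encodeJsonDoc(gmstypes.JSONDocument{Val: "ok"})
	if err != nil {
		panic(err)
	}
	fmt.Printf("%#v\n", encoded) // []byte{0xc, 0x2, 0x6f, 0x6b}
}
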
// encodeJsonArray encodes the specified |jsonArray| into MySQL's internal JSON encoding and returns
// the type ID indicating whether this is a small or large array, the encoded array data, and any
// error encountered. The |largeEncoding| param controls whether this function will use the small
// array encoding (i.e. using 2 bytes for counts, sizes, and offsets), or the large array encoding
// (i.e. using 4 bytes for counts, sizes, and offsets).
//
// A JSON Array is encoded into the following components:
// - Type Identifier: Always 1 byte; jsonTypeSmallArray or jsonTypeLargeArray; not included in the returned []byte.
// - Count: 2 bytes for small encoding, otherwise 4; number of elements in the array
// - Size: 2 bytes for small encoding, otherwise 4; total size of the encoded array (everything but the Type ID)
// - Value Entries: 1 per value; 1 byte for type ID, variable sized offset (or inlined literal value)
// - Values: 1 per value; encoded value bytes
func encodeJsonArray(jsonArray []any, largeEncoding bool) (typeId byte, encodedArray []byte, err error) {
	if !largeEncoding && len(jsonArray) > int(maxOffsetSize) {
		return 0, nil, fmt.Errorf(
			"too many elements in JSON array (%d) to serialize in small array encoding", len(jsonArray))
	}

	var valueEntriesBuffer []byte
	var valuesBuffer []byte
	nextValuesOffset := calculateInitialArrayValuesOffset(len(jsonArray), largeEncoding)

	for _, element := range jsonArray {
		typeId, encodedValue, err := encodeJsonValue(element)
		if err != nil {
			return 0, nil, err
		}

		// Literals can be inlined in the value-entries section
		if typeId == jsonTypeLiteral {
			valueEntriesBuffer = append(valueEntriesBuffer, typeId)
			if len(encodedValue) != 1 {
				return 0, nil, fmt.Errorf("unexpected buffer length")
			}
			valueEntriesBuffer = appendForEncoding(valueEntriesBuffer, uint32(encodedValue[0]), largeEncoding)
		} else {
			if !largeEncoding && nextValuesOffset > maxOffsetSize-uint32(len(encodedValue)) {
				return 0, nil, fmt.Errorf("offset too large for small array encoding")
			}

			valueEntriesBuffer = append(valueEntriesBuffer, typeId)
			valueEntriesBuffer = appendForEncoding(valueEntriesBuffer, nextValuesOffset, largeEncoding)
			valuesBuffer = append(valuesBuffer, encodedValue...)
			nextValuesOffset += uint32(len(encodedValue))
		}
	}

	// element count (uint16 for small arrays)
	encodedArray = appendForEncoding(encodedArray, uint32(len(jsonArray)), largeEncoding)

	// Grab the total size of the array data from the next offset position pointing to the end of the values buffer
	arrayPayloadLength := nextValuesOffset

	encodedArray = appendForEncoding(encodedArray, arrayPayloadLength, largeEncoding)
	encodedArray = append(encodedArray, valueEntriesBuffer...)
	encodedArray = append(encodedArray, valuesBuffer...)

	if !largeEncoding {
		return jsonTypeSmallArray, encodedArray, nil
	} else {
		return jsonTypeLargeArray, encodedArray, nil
	}
}

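// NOTE: Illustrative sketch, not part of the original implementation; the helper
// name is hypothetical. For the array ["hi", true], the small encoding produced by
// encodeJsonArray works out to 13 bytes:
//
//	0x02 0x00        count   (2 elements)
//	0x0d 0x00        size    (13 bytes, including the count and size fields)
//	0x0c 0x0a 0x00   entry 0 (jsonTypeString at offset 10)
//	0x04 0x01 0x00   entry 1 (jsonTypeLiteral, true inlined in the offset field)
//	0x02 0x68 0x69   value 0 (length-prefixed "hi")
func exampleEncodeJsonSmallArray() {
	typeId, encoded, err := encodeJsonArray([]any{"hi", true}, false)
	if err != nil {
		panic(err)
	}
	fmt.Printf("typeId=%#x len=%d bytes=%#v\n", typeId, len(encoded), encoded)
}
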
// encodeJsonObject encodes the specified |jsonObject| into MySQL's internal JSON encoding and returns
// the type ID indicating whether this is a small or large object, the encoded object data, and any
// error encountered. The |largeEncoding| param controls whether this function will use the small
// object encoding (i.e. using 2 bytes for counts, sizes, and offsets), or the large object encoding
// (i.e. using 4 bytes for counts, sizes, and offsets).
//
// A JSON Object is encoded into the following components:
// - Type Identifier: Always 1 byte; either jsonTypeSmallObject or jsonTypeLargeObject (not included in returned []byte)
// - Count: 2 bytes for small encoding, otherwise 4; number of keys in the object
// - Size: 2 bytes for small encoding, otherwise 4; total size of the encoded object (i.e. everything but the Type ID)
// - Key Entries: 1 per key; key offset (2 bytes for small encoding, otherwise 4), plus 2 bytes for key length
// - Value Entries: 1 per value; 1 byte for type ID, plus 2 bytes (small encoding) or 4 bytes (large encoding) for the offset or an inlined literal value
// - Keys: 1 per key; encoded string bytes
// - Values: 1 per value; encoded value bytes
func encodeJsonObject(jsonObject map[string]any, largeEncoding bool) (typeId byte, encodedObject []byte, err error) {
	var keyEntriesBuffer []byte
	var keysBuffer []byte
	nextKeysOffset := calculateInitialObjectKeysOffset(len(jsonObject), largeEncoding)

	// Sort the keys so that we can process the keys and values in a consistent order. MySQL seems to sort
	// json keys internally first by length, then alphabetically, but correct replication doesn't seem to
	// rely on matching that behavior.
	sortedKeys := make([]string, 0, len(jsonObject))
	for key := range jsonObject {
		sortedKeys = append(sortedKeys, key)
	}
	sort.Strings(sortedKeys)

	// Process keys first, since value entry data depends on offsets that we don't know until we
	// process all the keys.
	for _, key := range sortedKeys {
		// NOTE: Don't use encodeJsonValue for the key – its length gets encoded slightly differently
		//       for JSON objects.
		encodedValue := []byte(key)

		if !largeEncoding && nextKeysOffset > maxOffsetSize-uint32(len(encodedValue)) {
			return 0, nil, fmt.Errorf("offset too large for small object encoding")
		}

		keyEntriesBuffer = appendForEncoding(keyEntriesBuffer, nextKeysOffset, largeEncoding)
		// key length is stored as a 2-byte, little-endian value
		keyEntriesBuffer = append(keyEntriesBuffer, byte(len(encodedValue)), byte(len(encodedValue)>>8))
		keysBuffer = append(keysBuffer, encodedValue...)
		nextKeysOffset += uint32(len(encodedValue))
	}

	// Process values – since the object values are written after the keys, and we need to store the
	// offsets to those locations in the value entries that appear before the keys and the values, we
	// have to make a second pass through the object to process the values once we know the final
	// length of the keys section.
	var valueEntriesBuffer []byte
	var valuesBuffer []byte
	nextValuesOffset := nextKeysOffset
	for _, key := range sortedKeys {
		value := jsonObject[key]
		typeId, encodedValue, err := encodeJsonValue(value)
		if err != nil {
			return 0, nil, err
		}

		// Literals may be inlined in the value-entries section
		if typeId == jsonTypeLiteral {
			valueEntriesBuffer = append(valueEntriesBuffer, typeId)
			if len(encodedValue) != 1 {
				return 0, nil, fmt.Errorf("unexpected buffer length")
			}
			valueEntriesBuffer = appendForEncoding(valueEntriesBuffer, uint32(encodedValue[0]), largeEncoding)
		} else {
			if !largeEncoding && nextValuesOffset > maxOffsetSize-uint32(len(encodedValue)) {
				return 0, nil, fmt.Errorf("offset too large for small object encoding")
			}

			valueEntriesBuffer = append(valueEntriesBuffer, typeId)
			valueEntriesBuffer = appendForEncoding(valueEntriesBuffer, nextValuesOffset, largeEncoding)
			valuesBuffer = append(valuesBuffer, encodedValue...)
			nextValuesOffset += uint32(len(encodedValue))
		}
	}

	// element count (uint16 for small objects)
	encodedObject = appendForEncoding(encodedObject, uint32(len(jsonObject)), largeEncoding)

	// Grab the total size of the object data from the next offset position pointing to the end of the values buffer
	objectPayloadLength := nextValuesOffset

	encodedObject = appendForEncoding(encodedObject, objectPayloadLength, largeEncoding)
	encodedObject = append(encodedObject, keyEntriesBuffer...)
	encodedObject = append(encodedObject, valueEntriesBuffer...)
	encodedObject = append(encodedObject, keysBuffer...)
	encodedObject = append(encodedObject, valuesBuffer...)

	if !largeEncoding {
		return jsonTypeSmallObject, encodedObject, nil
	} else {
		return jsonTypeLargeObject, encodedObject, nil
	}
}

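// NOTE: Illustrative sketch, not part of the original implementation; the helper
// name is hypothetical. For the object {"a": true}, the small encoding produced by
// encodeJsonObject works out to 12 bytes:
//
//	0x01 0x00             count       (1 key/value pair)
//	0x0c 0x00             size        (12 bytes, including the count and size fields)
//	0x0b 0x00 0x01 0x00   key entry   (key at offset 11, key length 1)
//	0x04 0x01 0x00        value entry (jsonTypeLiteral, true inlined in the offset field)
//	0x61                  key bytes   ("a"); no value bytes, since the literal was inlined
func exampleEncodeJsonSmallObject() {
	typeId, encoded, err := encodeJsonObject(map[string]any{"a": true}, false)
	if err != nil {
		panic(err)
	}
	fmt.Printf("typeId=%#x len=%d bytes=%#v\n", typeId, len(encoded), encoded)
}
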
// encodeJsonValue encodes the specified |jsonValue| into MySQL's internal JSON encoding and returns
// the type ID indicating what type of value this is, the encoded value, and any error encountered.
func encodeJsonValue(jsonValue any) (typeId byte, buffer []byte, err error) {
	if jsonValue == nil {
		buffer = append(buffer, jsonLiteralValueNull)
		return jsonTypeLiteral, buffer, nil
	}

	switch v := jsonValue.(type) {
	case bool:
		if v {
			buffer = append(buffer, jsonLiteralValueTrue)
		} else {
			buffer = append(buffer, jsonLiteralValueFalse)
		}
		return jsonTypeLiteral, buffer, nil

	case string:
		// String lengths use a special encoding that can span multiple bytes
		buffer, err = appendStringLength(buffer, len(v))
		if err != nil {
			return 0, nil, err
		}

		buffer = append(buffer, []byte(v)...)
		return jsonTypeString, buffer, nil

	case float64:
		// NOTE: all our numbers end up being represented as float64s currently when we parse stored JSON
		bits := math.Float64bits(v)
		buffer = append(buffer, make([]byte, 8)...)
		binary.LittleEndian.PutUint64(buffer, bits)
		return jsonTypeDouble, buffer, nil

	case []any:
		// MySQL attempts to use the small encoding first, and if offset sizes overflow, then it switches to the
		// large encoding. Trying the small encoding first is a little messy/inefficient, but because of the way
		// the binary format is designed, we can't know if/when we'll need the large format without serializing
		// the data first.
		id, encodedArray, err := encodeJsonArray(v, false)
		if err == nil {
			return id, encodedArray, nil
		}
		return encodeJsonArray(v, true)

	case map[string]any:
		// See the comment above about MySQL's JSON serialization format, and why we try the small encoding first,
		// before we know if we need the large encoding or not.
		id, encodedObject, err := encodeJsonObject(v, false)
		if err == nil {
			return id, encodedObject, nil
		}
		return encodeJsonObject(v, true)

	default:
		return 0x00, nil, fmt.Errorf("unexpected type in JSON document: %T", v)
	}
}

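// NOTE: Illustrative sketch, not part of the original implementation; the helper
// name is hypothetical. A float64 encodes as its IEEE 754 bit pattern in 8
// little-endian bytes, tagged with jsonTypeDouble.
func exampleEncodeJsonDouble() {
	typeId, encoded, err := encodeJsonValue(1.5)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%#x %#v\n", typeId, encoded) // 0xb []byte{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xf8, 0x3f}
}
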
// appendForEncoding appends the |value| to the specified |bytes| and returns the updated byte slice. If
// |largeEncoding| is true, then 4 bytes are added to |bytes| to represent |value|, otherwise 2 bytes are used.
// This is a helper function for serializing the smallArray/largeArray and smallObject/largeObject formats, since
// they are identical formats, except that offsets, counts, and sizes are stored as 2 bytes in the small encodings,
// and stored as 4 bytes in the large encodings.
func appendForEncoding(bytes []byte, value uint32, largeEncoding bool) []byte {
	if !largeEncoding {
		bytes = append(bytes, byte(value), byte(value>>8))
	} else {
		bytes = append(bytes, byte(value), byte(value>>8), byte(value>>16), byte(value>>24))
	}
	return bytes
}

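// NOTE: Illustrative sketch, not part of the original implementation; the helper
// name is hypothetical. Both encodings are little-endian; the same value is
// serialized as either 2 or 4 bytes depending on |largeEncoding|.
func exampleAppendForEncoding() {
	small := appendForEncoding(nil, 0x0102, false)
	large := appendForEncoding(nil, 0x01020304, true)
	fmt.Printf("%#v %#v\n", small, large) // []byte{0x2, 0x1} []byte{0x4, 0x3, 0x2, 0x1}
}
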
// appendStringLength appends a variable number of bytes to the specified |bytes| to encode |length|, the
// length of a string. For string lengths, if the length is larger than 127 bytes, we set the high bit of
// the first byte and use two bytes to encode the length. Similarly, if the high bit of the second byte is
// also set, the length is encoded over three bytes.
func appendStringLength(bytes []byte, length int) ([]byte, error) {
	switch {
	case length > 0x1FFFFF:
		return nil, fmt.Errorf("strings larger than 2,097,151 bytes not supported")

	case length > 0x3FFF: // 16,383
		return append(bytes,
			byte(length&0x7F|0x80),
			byte(length>>7|0x80),
			byte(length>>14)), nil

	case length > 0x7F: // 127
		return append(bytes,
			byte(length&0x7F|0x80),
			byte(length>>7)), nil

	default:
		return append(bytes, byte(length)), nil
	}
}

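// NOTE: Illustrative sketch, not part of the original implementation; the helper
// name is hypothetical. Lengths up to 127 fit in a single byte; larger lengths
// spill into a second (and then a third) byte, 7 bits at a time, with the high
// bit of a byte signaling that another length byte follows.
func exampleAppendStringLength() {
	oneByte, _ := appendStringLength(nil, 90)       // []byte{0x5a}
	twoBytes, _ := appendStringLength(nil, 200)     // []byte{0xc8, 0x01}: 0x48 + (0x01 << 7) = 200
	threeBytes, _ := appendStringLength(nil, 20000) // []byte{0xa0, 0x9c, 0x01}: 0x20 + (0x1c << 7) + (0x01 << 14) = 20000
	fmt.Println(oneByte, twoBytes, threeBytes)
}
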
// calculateInitialArrayValuesOffset returns the initial offset value for the first array value in the
// encoded array byte slice. When |largeEncoding| is false, this value includes the two bytes for the
// element count, the two bytes for the encoded size, and three bytes (one byte for type ID, and two
// bytes for the offset) for each element in the array, specified by |arrayLength|. When |largeEncoding|
// is true, this value includes four bytes for the element count, four bytes for the encoded size, and
// five bytes (one byte for type ID, and four bytes for the offset) for each element in the array,
// specified by |arrayLength|.
func calculateInitialArrayValuesOffset(arrayLength int, largeEncoding bool) uint32 {
	if !largeEncoding {
		return uint32(2 + 2 + (arrayLength * 3))
	}
	return uint32(4 + 4 + (arrayLength * 5))
}

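// NOTE: Illustrative arithmetic, not part of the original implementation; the helper
// name is hypothetical. For a 2-element array, the first value starts at offset
// 2+2+2*3 = 10 in the small encoding and 4+4+2*5 = 18 in the large encoding, which
// matches the offsets written into the value entries by encodeJsonArray above.
func exampleInitialArrayValuesOffset() {
	fmt.Println(calculateInitialArrayValuesOffset(2, false)) // 10
	fmt.Println(calculateInitialArrayValuesOffset(2, true))  // 18
}
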
// calculateInitialObjectKeysOffset returns the initial offset value for the first key in the encoded
// object byte slice. When |largeEncoding| is false, the first key offset position is calculated by adding
// 2 bytes (the key/value pair count field), 2 bytes (the size of encoded object field), 4 bytes (2 bytes
// for the key offset, and 2 bytes for the length of the key) per key/value pair, specified by |objectLength|,
// and another 3 bytes (1 byte for the value's type ID, and 2 bytes for the offset to the value's data) for
// each key/value pair. When |largeEncoding| is true, the first key offset position is calculated by adding
// 4 bytes (the key/value pair count field), 4 bytes (the size of encoded object field), 6 bytes (4 bytes
// for the key offset, and 2 bytes for the length of the key) per key/value pair, specified by |objectLength|,
// and another 5 bytes (1 byte for the value's type ID, and 4 bytes for the offset to the value's data) for
// each key/value pair.
func calculateInitialObjectKeysOffset(objectLength int, largeEncoding bool) uint32 {
	if !largeEncoding {
		return uint32(2 + 2 + objectLength*4 + objectLength*3)
	}
	return uint32(4 + 4 + objectLength*6 + objectLength*5)
}
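
// NOTE: Illustrative arithmetic, not part of the original implementation; the helper
// name is hypothetical. For an object with a single key/value pair, the first key
// starts at offset 2+2+4+3 = 11 in the small encoding and 4+4+6+5 = 19 in the large
// encoding, which matches the offsets written into the key entries by encodeJsonObject above.
func exampleInitialObjectKeysOffset() {
	fmt.Println(calculateInitialObjectKeysOffset(1, false)) // 11
	fmt.Println(calculateInitialObjectKeysOffset(1, true))  // 19
}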