github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/libraries/doltcore/sqle/binlogreplication/binlog_json_serialization.go (about) 1 // Copyright 2024 Dolthub, Inc. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package binlogreplication 16 17 import ( 18 "encoding/binary" 19 "fmt" 20 "math" 21 "sort" 22 23 gmstypes "github.com/dolthub/go-mysql-server/sql/types" 24 ) 25 26 const jsonTypeSmallObject = byte(0x00) 27 const jsonTypeLargeObject = byte(0x01) 28 const jsonTypeSmallArray = byte(0x02) 29 const jsonTypeLargeArray = byte(0x03) 30 const jsonTypeLiteral = byte(0x04) 31 const jsonTypeInt16 = byte(0x05) 32 const jsonTypeUint16 = byte(0x06) 33 const jsonTypeInt32 = byte(0x07) 34 const jsonTypeUint32 = byte(0x08) 35 const jsonTypeInt64 = byte(0x09) 36 const jsonTypeUint64 = byte(0x0a) 37 const jsonTypeDouble = byte(0x0b) 38 const jsonTypeString = byte(0x0c) 39 const jsonTypeCustom = byte(0x0f) 40 41 const jsonLiteralValueNull = byte(0x00) 42 const jsonLiteralValueTrue = byte(0x01) 43 const jsonLiteralValueFalse = byte(0x02) 44 45 // maxOffsetSize is used to determine if an byte offset into an array or object encoding will exceed the capacity 46 // of a uint16 and whether the encoding needs to switch to the large array or large object format. 
47 const maxOffsetSize = uint32(65_535) 48 49 // encodeJsonDoc encodes the specified |jsonDoc| into MySQL's custom/internal binary encoding 50 // so that it can be included in a binlog event. 51 // 52 // The internal MySQL JSON binary format is documented here: 53 // https://dev.mysql.com/doc/dev/mysql-server/latest/json__binary_8h.html 54 // 55 // And a third-party description is here: 56 // https://lafengnan.gitbooks.io/blog/content/mysql/chapter2.html 57 func encodeJsonDoc(jsonDoc gmstypes.JSONDocument) (buffer []byte, err error) { 58 typeId, encodedValue, err := encodeJsonValue(jsonDoc.Val) 59 if err != nil { 60 return nil, err 61 } 62 buffer = append(buffer, typeId) 63 return append(buffer, encodedValue...), nil 64 } 65 66 // encodeJsonArray encodes the specified |jsonArray| into MySQL's internal JSON encoding and returns 67 // the type ID indicating whether this is a small or large array, the encoded array data, and any 68 // error encountered. The |largeEncoding| param controls whether this function will use the small 69 // array encoding (i.e. using 2 bytes for counts, sizes, and offsets), or the large array encoding 70 // (i.e. using 4 bytes for counts, sizes, and offsets). 71 // 72 // A JSON Array is encoded into the following components: 73 // - Type Identifier: Always 1 byte; jsonTypeSmallArray or jsonTypeLargeArray; not included in the returned []byte. 
74 // - Count: 2 bytes for small encoding, otherwise 4; number of elements in the array 75 // - Size: 2 bytes for small encoding, otherwise 4; total size of the encoded array (everything but the Type ID) 76 // - Value Entries: 1 per value; 1 byte for type ID, variable sized offset (or inlined literal value) 77 // - Values: 1 per value; encoded value bytes 78 func encodeJsonArray(jsonArray []any, largeEncoding bool) (typeId byte, encodedArray []byte, err error) { 79 if !largeEncoding && len(jsonArray) > int(maxOffsetSize) { 80 return 0, nil, fmt.Errorf( 81 "too many elements in JSON array (%d) to serialize in small array encoding", len(jsonArray)) 82 } 83 84 var valueEntriesBuffer []byte 85 var valuesBuffer []byte 86 nextValuesOffset := calculateInitialArrayValuesOffset(len(jsonArray), largeEncoding) 87 88 for _, element := range jsonArray { 89 typeId, encodedValue, err := encodeJsonValue(element) 90 if err != nil { 91 return 0, nil, err 92 } 93 94 // Literals can be inlined in the value-entries section 95 if typeId == jsonTypeLiteral { 96 valueEntriesBuffer = append(valueEntriesBuffer, typeId) 97 if len(encodedValue) != 1 { 98 return 0, nil, fmt.Errorf("unexpected buffer length") 99 } 100 valueEntriesBuffer = appendForEncoding(valueEntriesBuffer, uint32(encodedValue[0]), largeEncoding) 101 } else { 102 if !largeEncoding && nextValuesOffset > maxOffsetSize-uint32(len(encodedValue)) { 103 return 0, nil, fmt.Errorf("offset too large for small array encoding") 104 } 105 106 valueEntriesBuffer = append(valueEntriesBuffer, typeId) 107 valueEntriesBuffer = appendForEncoding(valueEntriesBuffer, nextValuesOffset, largeEncoding) 108 valuesBuffer = append(valuesBuffer, encodedValue...) 
109 nextValuesOffset += uint32(len(encodedValue)) 110 } 111 } 112 113 // element count (uint16 for small arrays) 114 encodedArray = appendForEncoding(encodedArray, uint32(len(jsonArray)), largeEncoding) 115 116 // Grab the total size of the array data from the next offset position pointing to the end of the values buffer 117 arrayPayloadLength := nextValuesOffset 118 119 encodedArray = appendForEncoding(encodedArray, arrayPayloadLength, largeEncoding) 120 encodedArray = append(encodedArray, valueEntriesBuffer...) 121 encodedArray = append(encodedArray, valuesBuffer...) 122 123 if !largeEncoding { 124 return jsonTypeSmallArray, encodedArray, nil 125 } else { 126 return jsonTypeLargeArray, encodedArray, nil 127 } 128 } 129 130 // encodeJsonObject encodes the specified |jsonObject| into MySQL's internal JSON encoding and returns 131 // the type ID indicating whether this is a small or large object, the encoded object data, and any 132 // error encountered. The |largeEncoding| param controls whether this function will use the small 133 // object encoding (i.e. using 2 bytes for counts, sizes, and offsets), or the large object encoding 134 // (i.e. using 4 bytes for counts, sizes, and offsets). 135 // 136 // A JSON Object is encoded into the following components: 137 // - Type Identifier: Always 1 byte; either jsonTypeSmallObject or jsonTypeLargeObject (not included in returned []byte) 138 // - Count: variable based on small/large encoding; holds the number of keys in the object 139 // - Size: variable based on small/large encoding; total size of the encoded object (i.e. 
everything but the Type ID) 140 // - Key Entries: 1 per key; variable length key offset (based on small/large encoding), plus 2 bytes for key length 141 // - Value Entries (variable): 1 per value; 1 byte for type ID, 2 bytes for offset or inlined literal value for jsonTypeSmallObject, otherwise 4 142 // - Keys (variable): 1 per key; encoded string bytes 143 // - Values (variable): 1 per value; encoded value bytes 144 func encodeJsonObject(jsonObject map[string]any, largeEncoding bool) (typeId byte, encodedObject []byte, err error) { 145 var keyEntriesBuffer []byte 146 var keysBuffer []byte 147 nextKeysOffset := calculateInitialObjectKeysOffset(len(jsonObject), largeEncoding) 148 149 // Sort the keys so that we can process the keys and values in a consistent order. MySQL seems to sort 150 // json keys internally first by length, then alphabetically, but correct replication doesn't seem to 151 // rely on matching that behavior. 152 sortedKeys := make([]string, 0, len(jsonObject)) 153 for key := range jsonObject { 154 sortedKeys = append(sortedKeys, key) 155 } 156 sort.Strings(sortedKeys) 157 158 // Process keys first, since value entry data depends on offsets that we don't know until we 159 // process all the keys. 160 for _, key := range sortedKeys { 161 // NOTE: Don't use encodeJsonValue for the key – its length gets encoded slightly differently 162 // for JSON objects. 163 encodedValue := []byte(key) 164 165 if !largeEncoding && nextKeysOffset > maxOffsetSize-uint32(len(encodedValue)) { 166 return 0, nil, fmt.Errorf("offset too large for small object encoding") 167 } 168 169 keyEntriesBuffer = appendForEncoding(keyEntriesBuffer, nextKeysOffset, largeEncoding) 170 keyEntriesBuffer = append(keyEntriesBuffer, byte(len(encodedValue)), byte(len(encodedValue)<<8)) 171 keysBuffer = append(keysBuffer, encodedValue...) 
172 nextKeysOffset += uint32(len(encodedValue)) 173 } 174 175 // Process values – since the object values are written after the keys, and we need to store the 176 // offsets to those locations in the value entries that appear before the keys and the values, we 177 // have to make a second pass through the object to process the values once we know the final 178 // length of the keys section. 179 var valueEntriesBuffer []byte 180 var valuesBuffer []byte 181 nextValuesOffset := nextKeysOffset 182 for _, key := range sortedKeys { 183 value := jsonObject[key] 184 typeId, encodedValue, err := encodeJsonValue(value) 185 if err != nil { 186 return 0, nil, err 187 } 188 189 // Literals may be inlined in the value-entries section 190 if typeId == jsonTypeLiteral { 191 valueEntriesBuffer = append(valueEntriesBuffer, typeId) 192 if len(encodedValue) != 1 { 193 return 0, nil, fmt.Errorf("unexpected buffer length") 194 } 195 valueEntriesBuffer = appendForEncoding(valueEntriesBuffer, uint32(encodedValue[0]), largeEncoding) 196 } else { 197 if !largeEncoding && nextValuesOffset > maxOffsetSize-uint32(len(encodedValue)) { 198 return 0, nil, fmt.Errorf("offset too large for small object encoding") 199 } 200 201 valueEntriesBuffer = append(valueEntriesBuffer, typeId) 202 valueEntriesBuffer = appendForEncoding(valueEntriesBuffer, nextValuesOffset, largeEncoding) 203 valuesBuffer = append(valuesBuffer, encodedValue...) 204 nextValuesOffset += uint32(len(encodedValue)) 205 } 206 } 207 208 // element count (uint16 for small objects) 209 encodedObject = appendForEncoding(encodedObject, uint32(len(jsonObject)), largeEncoding) 210 211 // Grab the total size of the object data from the next offset position pointing to the end of the values buffer 212 objectPayloadLength := nextValuesOffset 213 214 encodedObject = appendForEncoding(encodedObject, objectPayloadLength, largeEncoding) 215 encodedObject = append(encodedObject, keyEntriesBuffer...) 
216 encodedObject = append(encodedObject, valueEntriesBuffer...) 217 encodedObject = append(encodedObject, keysBuffer...) 218 encodedObject = append(encodedObject, valuesBuffer...) 219 220 if !largeEncoding { 221 return jsonTypeSmallObject, encodedObject, nil 222 } else { 223 return jsonTypeLargeObject, encodedObject, nil 224 } 225 } 226 227 // encodeJsonObject encodes the specified |jsonValue| into MySQL's internal JSON encoding and returns 228 // the type ID indicating what type of value this is, the encoded value, and any error encountered. 229 func encodeJsonValue(jsonValue any) (typeId byte, buffer []byte, err error) { 230 if jsonValue == nil { 231 buffer = append(buffer, jsonLiteralValueNull) 232 return jsonTypeLiteral, buffer, nil 233 } 234 235 switch v := jsonValue.(type) { 236 case bool: 237 if v { 238 buffer = append(buffer, jsonLiteralValueTrue) 239 } else { 240 buffer = append(buffer, jsonLiteralValueFalse) 241 } 242 return jsonTypeLiteral, buffer, nil 243 244 case string: 245 // String lengths use a special encoding that can span multiple bytes 246 buffer, err = appendStringLength(buffer, len(v)) 247 if err != nil { 248 return 0, nil, err 249 } 250 251 buffer = append(buffer, []byte(v)...) 252 return jsonTypeString, buffer, nil 253 254 case float64: 255 // NOTE: all our numbers end up being represented as float64s currently when we parse stored JSON 256 bits := math.Float64bits(v) 257 buffer = append(buffer, make([]byte, 8)...) 258 binary.LittleEndian.PutUint64(buffer, bits) 259 return jsonTypeDouble, buffer, nil 260 261 case []any: 262 // MySQL attempts to use the small encoding first, and if offset sizes overflow, then it switches to the 263 // large encoding. This is a little messy/inefficient to try the small encoding first, but because of the 264 // way the binary format is designed, we can't know if/when we'll need the large format without serializing 265 // the data first. 
266 id, encodedArray, err := encodeJsonArray(v, false) 267 if err == nil { 268 return id, encodedArray, nil 269 } 270 return encodeJsonArray(v, true) 271 272 case map[string]any: 273 // See the comment above about MySQL's JSON serialization format, and why we try the small encoding first, 274 // before we know if we need the large encoding or not. 275 id, encodedObject, err := encodeJsonObject(v, false) 276 if err == nil { 277 return id, encodedObject, nil 278 } 279 return encodeJsonObject(v, true) 280 281 default: 282 return 0x00, nil, fmt.Errorf("unexpected type in JSON document: %T", v) 283 } 284 } 285 286 // appendForEncoding appends the |value| to the specified |bytes| and returns the updated byte slice. If 287 // |largeEncoding| is true, then 4 bytes are added to |bytes| to represent |value|, otherwise 2 bytes are used. 288 // This is a helper function for serializing the smallArray/largeArray and smallObject/largeObject formats, since 289 // they are identical formats, except that offsets, counts, and sizes are stored as 2 bytes in the small encodings, 290 // and stored as 4 bytes in the large encodings. 291 func appendForEncoding(bytes []byte, value uint32, largeEncoding bool) []byte { 292 if !largeEncoding { 293 bytes = append(bytes, byte(value), byte(value>>8)) 294 } else { 295 bytes = append(bytes, byte(value), byte(value>>8), byte(value>>16), byte(value>>24)) 296 } 297 return bytes 298 } 299 300 // appendStringLength appends a variable number of bytes to the specified |bytes| to encode |length|, the 301 // length of a string. For string lengths, if the length is larger than 127 bytes, we set the high bit of 302 // the first byte and use two bytes to encode the length. Similarly, if the high bit of the second byte is 303 // also set, the length is encoded over three bytes. 
func appendStringLength(bytes []byte, length int) ([]byte, error) {
	// A negative length cannot be encoded; reject it instead of emitting a truncated byte.
	if length < 0 {
		return nil, fmt.Errorf("invalid negative string length: %d", length)
	}
	// The variable-length format tops out at a 32-bit length (five encoded bytes).
	if uint64(length) > math.MaxUint32 {
		return nil, fmt.Errorf("strings larger than 4,294,967,295 bytes not supported")
	}

	// Emit seven bits at a time, least-significant group first. Every byte except the last has its
	// high bit set to signal that another length byte follows.
	remaining := uint32(length)
	for remaining > 0x7F {
		bytes = append(bytes, byte(remaining&0x7F)|0x80)
		remaining >>= 7
	}
	return append(bytes, byte(remaining)), nil
}

// calculateInitialArrayValuesOffset returns the initial offset value for the first array value in the
// encoded array byte slice. When |largeEncoding| is false, this value includes the two bytes for the
// element count, the two bytes for the encoded size, and three bytes (one byte for type ID, and two
// bytes for the offset) for each element in the array, specified by |arrayLength|. When |largeEncoding|
// is true, this value includes four bytes for the element count, four bytes for the encoded size, and
// five bytes (one byte for type ID, and four bytes for the offset) for each element in the array,
// specified by |arrayLength|.
func calculateInitialArrayValuesOffset(arrayLength int, largeEncoding bool) uint32 {
	// headerSize covers the count and size fields; entrySize covers one type ID byte plus one offset.
	headerSize, entrySize := 2+2, 1+2
	if largeEncoding {
		headerSize, entrySize = 4+4, 1+4
	}
	return uint32(headerSize + arrayLength*entrySize)
}

// calculateInitialObjectKeysOffset returns the initial offset value for the first key in the encoded
// object byte slice.
// When |largeEncoding| is false, the first key offset position is the sum of:
// 2 bytes for the key/value pair count field, 2 bytes for the size of encoded object field, and, for each
// of the |objectLength| key/value pairs, a 4 byte key entry (2 bytes for the key offset and 2 bytes for
// the length of the key) plus a 3 byte value entry (1 byte for the value's type ID and 2 bytes for the
// offset to the value's data). When |largeEncoding| is true, the count and size fields take 4 bytes each,
// each key entry takes 6 bytes (4 bytes for the key offset and 2 bytes for the length of the key), and
// each value entry takes 5 bytes (1 byte for the value's type ID and 4 bytes for the offset to the
// value's data).
func calculateInitialObjectKeysOffset(objectLength int, largeEncoding bool) uint32 {
	// Sizes of the fixed header, of one key entry, and of one value entry for the chosen encoding.
	headerSize, keyEntrySize, valueEntrySize := 2+2, 2+2, 1+2
	if largeEncoding {
		headerSize, keyEntrySize, valueEntrySize = 4+4, 4+2, 1+4
	}
	return uint32(headerSize + objectLength*(keyEntrySize+valueEntrySize))
}