github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/dbnode/encoding/proto/common.go (about) 1 // Copyright (c) 2019 Uber Technologies, Inc. 2 // 3 // Permission is hereby granted, free of charge, to any person obtaining a copy 4 // of this software and associated documentation files (the "Software"), to deal 5 // in the Software without restriction, including without limitation the rights 6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 // copies of the Software, and to permit persons to whom the Software is 8 // furnished to do so, subject to the following conditions: 9 // 10 // The above copyright notice and this permission notice shall be included in 11 // all copies or substantial portions of the Software. 12 // 13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 // THE SOFTWARE. 20 21 package proto 22 23 import ( 24 "reflect" 25 "sort" 26 27 "github.com/m3db/m3/src/dbnode/encoding/m3tsz" 28 29 dpb "github.com/golang/protobuf/protoc-gen-go/descriptor" 30 "github.com/jhump/protoreflect/desc" 31 ) 32 33 // Schema represents a schema for a protobuf message. 34 type Schema *desc.MessageDescriptor 35 36 const ( 37 // ~1GiB is an intentionally large number to avoid users ever running into any 38 // limitations, but we want some theoretical maximum so that in the case of data / memory 39 // corruption the iterator can avoid panicing due to trying to allocate a massive byte slice 40 // (MAX_UINT64 for example) and return a reasonable error message instead. 41 maxMarshalledProtoMessageSize = 2 << 29 42 43 // maxCustomFieldNum is included for the same rationale as maxMarshalledProtoMessageSize. 44 maxCustomFieldNum = 10000 45 46 protoFieldTypeNotFound dpb.FieldDescriptorProto_Type = -1 47 ) 48 49 type customFieldType int8 50 51 const ( 52 // All the protobuf field types that we can perform custom encoding / 53 // compression on will get mapped to one of these types. This prevents 54 // us from having to reference the protobuf type all over the encoder 55 // and iterators and also simplifies the logic because the protobuf 56 // format has several instances of multiple types that we will treat the 57 // same. For example, in our encoding scheme the proto types: 58 // int32, sfixed32, and enums are all are treated as int32s and there 59 // is no reasonm to distinguish between them for the purposes of encoding 60 // and decoding. 61 notCustomEncodedField customFieldType = iota 62 signedInt64Field 63 signedInt32Field 64 unsignedInt64Field 65 unsignedInt32Field 66 float64Field 67 float32Field 68 bytesField 69 boolField 70 71 numCustomTypes = 9 72 ) 73 74 // -1 because iota's are zero-indexed so the highest value will be the number of 75 // custom types - 1. 76 var numBitsToEncodeCustomType = numBitsRequiredForNumUpToN(numCustomTypes - 1) 77 78 const ( 79 // Single bit op codes that get encoded into the compressed stream and 80 // inform the iterator / decoder how it should interpret subsequent 81 // bits. 82 opCodeNoMoreDataOrTimeUnitChangeAndOrSchemaChange = 0 83 opCodeMoreData = 1 84 85 opCodeNoMoreData = 0 86 opCodeTimeUnitChangeAndOrSchemaChange = 1 87 88 opCodeTimeUnitUnchanged = 0 89 opCodeTimeUnitChange = 1 90 91 opCodeSchemaUnchanged = 0 92 opCodeSchemaChange = 1 93 94 opCodeNoChange = 0 95 opCodeChange = 1 96 97 opCodeInterpretSubsequentBitsAsLRUIndex = 0 98 opCodeInterpretSubsequentBitsAsBytesLengthVarInt = 1 99 100 opCodeNoFieldsSetToDefaultProtoMarshal = 0 101 opCodeFieldsSetToDefaultProtoMarshal = 1 102 103 opCodeIntDeltaPositive = 0 104 opCodeIntDeltaNegative = 1 105 106 opCodeBitsetValueIsNotSet = 0 107 opCodeBitsetValueIsSet = 1 108 109 opCodeBoolTrue = 1 110 opCodeBoolFalse = 0 111 ) 112 113 var ( 114 typeOfBytes = reflect.TypeOf(([]byte)(nil)) 115 116 // Maps protobuf types to our custom type as described above. 117 mapProtoTypeToCustomFieldType = map[dpb.FieldDescriptorProto_Type]customFieldType{ 118 dpb.FieldDescriptorProto_TYPE_DOUBLE: float64Field, 119 dpb.FieldDescriptorProto_TYPE_FLOAT: float32Field, 120 121 dpb.FieldDescriptorProto_TYPE_INT64: signedInt64Field, 122 dpb.FieldDescriptorProto_TYPE_SFIXED64: signedInt64Field, 123 124 dpb.FieldDescriptorProto_TYPE_UINT64: unsignedInt64Field, 125 dpb.FieldDescriptorProto_TYPE_FIXED64: unsignedInt64Field, 126 127 dpb.FieldDescriptorProto_TYPE_INT32: signedInt32Field, 128 dpb.FieldDescriptorProto_TYPE_SFIXED32: signedInt32Field, 129 // Signed because thats how Proto encodes it (can technically have negative 130 // enum values but its not recommended for compression reasons). 131 dpb.FieldDescriptorProto_TYPE_ENUM: signedInt32Field, 132 133 dpb.FieldDescriptorProto_TYPE_UINT32: unsignedInt32Field, 134 dpb.FieldDescriptorProto_TYPE_FIXED32: unsignedInt32Field, 135 136 dpb.FieldDescriptorProto_TYPE_SINT32: signedInt32Field, 137 dpb.FieldDescriptorProto_TYPE_SINT64: signedInt64Field, 138 139 dpb.FieldDescriptorProto_TYPE_STRING: bytesField, 140 dpb.FieldDescriptorProto_TYPE_BYTES: bytesField, 141 142 dpb.FieldDescriptorProto_TYPE_BOOL: boolField, 143 } 144 ) 145 146 type marshalledField struct { 147 fieldNum int32 148 marshalled []byte 149 } 150 151 type sortedMarshalledFields []marshalledField 152 153 // customFieldState is used to track any required state for encoding / decoding a single 154 // field in the encoder / iterator respectively. 155 type customFieldState struct { 156 // Bytes State. TODO(rartoul): Wrap this up in an encoderAndIterator like 157 // the floats and ints. 158 bytesFieldDict []encoderBytesFieldDictState 159 iteratorBytesFieldDict [][]byte 160 // Float state. Works as both an encoder and iterator (I.E the encoder calls 161 // the encode methods and the iterator calls the read methods). 162 floatEncAndIter m3tsz.FloatEncoderAndIterator 163 // Int state. 164 intEncAndIter intEncoderAndIterator 165 166 fieldNum int 167 protoFieldType dpb.FieldDescriptorProto_Type 168 fieldType customFieldType 169 } 170 171 type encoderBytesFieldDictState struct { 172 // We store the hash so we can perform fast equality checks, and 173 // we store the startPos + length so that when we have a value 174 // that matches a hash, we can be certain its not a hash collision 175 // by comparing the bytes against those we already wrote into the 176 // stream. 177 hash uint64 178 startPos uint32 179 length uint32 180 } 181 182 func newCustomFieldState( 183 fieldNum int, 184 protoFieldType dpb.FieldDescriptorProto_Type, 185 customFieldType customFieldType, 186 ) customFieldState { 187 s := customFieldState{ 188 fieldNum: fieldNum, 189 fieldType: customFieldType, 190 protoFieldType: protoFieldType} 191 if isUnsignedInt(customFieldType) { 192 s.intEncAndIter.unsigned = true 193 } 194 return s 195 } 196 197 // TODO(rartoul): Improve this function to be less naive and actually explore nested messages 198 // for fields that we can use our custom compression on: https://github.com/m3db/m3/issues/1471 199 func customAndNonCustomFields( 200 customFields []customFieldState, 201 nonCustomFields []marshalledField, 202 schema *desc.MessageDescriptor, 203 ) ([]customFieldState, []marshalledField) { 204 fields := schema.GetFields() 205 numCustomFields := numCustomFields(schema) 206 numNonCustomFields := len(fields) - numCustomFields 207 208 if cap(customFields) >= numCustomFields { 209 for i := range customFields { 210 customFields[i] = customFieldState{} 211 } 212 customFields = customFields[:0] 213 } else { 214 customFields = make([]customFieldState, 0, numCustomFields) 215 } 216 217 if cap(nonCustomFields) >= numNonCustomFields { 218 for i := range nonCustomFields { 219 nonCustomFields[i] = marshalledField{} 220 } 221 nonCustomFields = nonCustomFields[:0] 222 } else { 223 nonCustomFields = make([]marshalledField, 0, numNonCustomFields) 224 } 225 226 var ( 227 prevFieldNum int32 = -1 228 isSorted = true 229 ) 230 for _, field := range fields { 231 var ( 232 fieldType = field.GetType() 233 fieldNum = field.GetNumber() 234 ) 235 if fieldNum < prevFieldNum { 236 isSorted = false 237 } 238 239 customFieldType, ok := isCustomField(fieldType, field.IsRepeated()) 240 if !ok { 241 nonCustomFields = append(nonCustomFields, marshalledField{fieldNum: fieldNum}) 242 continue 243 } 244 245 fieldState := newCustomFieldState(int(fieldNum), fieldType, customFieldType) 246 customFields = append(customFields, fieldState) 247 } 248 249 if !isSorted { 250 sort.Slice(customFields, func(a, b int) bool { 251 return customFields[a].fieldNum < customFields[b].fieldNum 252 }) 253 sort.Slice(nonCustomFields, func(a, b int) bool { 254 return nonCustomFields[a].fieldNum < nonCustomFields[b].fieldNum 255 }) 256 } 257 258 return customFields, nonCustomFields 259 } 260 261 func isCustomFloatEncodedField(t customFieldType) bool { 262 return t == float64Field || t == float32Field 263 } 264 265 func isCustomIntEncodedField(t customFieldType) bool { 266 return t == signedInt64Field || 267 t == unsignedInt64Field || 268 t == signedInt32Field || 269 t == unsignedInt32Field 270 } 271 272 func isUnsignedInt(t customFieldType) bool { 273 return t == unsignedInt64Field || t == unsignedInt32Field 274 } 275 276 func numCustomFields(schema *desc.MessageDescriptor) int { 277 var ( 278 fields = schema.GetFields() 279 numCustomFields = 0 280 ) 281 282 for _, field := range fields { 283 if _, ok := isCustomField(field.GetType(), field.IsRepeated()); ok { 284 numCustomFields++ 285 } 286 } 287 288 return numCustomFields 289 } 290 291 func isCustomField(fieldType dpb.FieldDescriptorProto_Type, isRepeated bool) (customFieldType, bool) { 292 if isRepeated { 293 return -1, false 294 } 295 296 customFieldType, ok := mapProtoTypeToCustomFieldType[fieldType] 297 return customFieldType, ok 298 } 299 300 func fieldsContains(fieldNum int32, fields []*desc.FieldDescriptor) bool { 301 for _, field := range fields { 302 if field.GetNumber() == fieldNum { 303 return true 304 } 305 } 306 return false 307 } 308 309 // numBitsRequiredForNumUpToN returns the number of bits that are required 310 // to represent all the possible numbers between 0 and n as a uint64. 311 // 312 // 4 --> 2 313 // 8 --> 3 314 // 16 --> 4 315 // 32 --> 5 316 // 64 --> 6 317 // 128 --> 7 318 func numBitsRequiredForNumUpToN(n int) int { 319 count := 0 320 for n > 0 { 321 count++ 322 n = n >> 1 323 } 324 return count 325 } 326 327 func (m sortedMarshalledFields) Len() int { 328 return len(m) 329 } 330 331 func (m sortedMarshalledFields) Less(i, j int) bool { 332 return m[i].fieldNum < m[j].fieldNum 333 } 334 335 func (m sortedMarshalledFields) Swap(i, j int) { 336 m[i], m[j] = m[j], m[i] 337 }