github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/dbnode/encoding/proto/common.go (about)

     1  // Copyright (c) 2019 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package proto
    22  
    23  import (
    24  	"reflect"
    25  	"sort"
    26  
    27  	"github.com/m3db/m3/src/dbnode/encoding/m3tsz"
    28  
    29  	dpb "github.com/golang/protobuf/protoc-gen-go/descriptor"
    30  	"github.com/jhump/protoreflect/desc"
    31  )
    32  
// Schema represents a schema for a protobuf message. It is a defined type
// whose underlying type is *desc.MessageDescriptor.
type Schema *desc.MessageDescriptor
    35  
const (
	// ~1GiB (2 << 29 == 1 << 30 bytes) is an intentionally large number to avoid
	// users ever running into any limitations, but we want some theoretical
	// maximum so that in the case of data / memory corruption the iterator can
	// avoid panicking due to trying to allocate a massive byte slice
	// (MAX_UINT64 for example) and return a reasonable error message instead.
	maxMarshalledProtoMessageSize = 2 << 29

	// maxCustomFieldNum is included for the same rationale as maxMarshalledProtoMessageSize.
	maxCustomFieldNum = 10000

	// protoFieldTypeNotFound is a sentinel proto field type.
	// NOTE(review): -1 presumably does not collide with any real
	// FieldDescriptorProto_Type value; confirm against the descriptor enum and
	// the call sites (neither is visible in this file).
	protoFieldTypeNotFound dpb.FieldDescriptorProto_Type = -1
)
    48  
// customFieldType identifies which custom encoding scheme (if any) applies to
// a protobuf field.
type customFieldType int8

const (
	// All the protobuf field types that we can perform custom encoding /
	// compression on will get mapped to one of these types. This prevents
	// us from having to reference the protobuf type all over the encoder
	// and iterators and also simplifies the logic because the protobuf
	// format has several instances of multiple types that we will treat the
	// same. For example, in our encoding scheme the proto types:
	// int32, sfixed32, and enums are all treated as int32s and there
	// is no reason to distinguish between them for the purposes of encoding
	// and decoding.
	notCustomEncodedField customFieldType = iota
	signedInt64Field
	signedInt32Field
	unsignedInt64Field
	unsignedInt32Field
	float64Field
	float32Field
	bytesField
	boolField

	// numCustomTypes is the number of customFieldType values declared above
	// (notCustomEncodedField through boolField). It is an untyped constant that
	// is maintained by hand, so it must be updated whenever a value is added to
	// or removed from the list.
	numCustomTypes = 9
)
    73  
// numBitsToEncodeCustomType is the number of bits needed to represent the
// largest customFieldType value.
//
// -1 because iota's are zero-indexed so the highest value will be the number of
// custom types - 1.
var numBitsToEncodeCustomType = numBitsRequiredForNumUpToN(numCustomTypes - 1)
    77  
const (
	// Single bit op codes that get encoded into the compressed stream and
	// inform the iterator / decoder how it should interpret subsequent
	// bits.
	//
	// The constants are declared in 0/1 pairs (one group per blank-line
	// separated section) and the same values repeat across pairs.
	// NOTE(review): which pair applies to a given bit is presumably determined
	// by its position in the stream; confirm against the encoder / iterator,
	// which are not visible in this file.
	opCodeNoMoreDataOrTimeUnitChangeAndOrSchemaChange = 0
	opCodeMoreData                                    = 1

	opCodeNoMoreData                      = 0
	opCodeTimeUnitChangeAndOrSchemaChange = 1

	opCodeTimeUnitUnchanged = 0
	opCodeTimeUnitChange    = 1

	opCodeSchemaUnchanged = 0
	opCodeSchemaChange    = 1

	opCodeNoChange = 0
	opCodeChange   = 1

	opCodeInterpretSubsequentBitsAsLRUIndex          = 0
	opCodeInterpretSubsequentBitsAsBytesLengthVarInt = 1

	opCodeNoFieldsSetToDefaultProtoMarshal = 0
	opCodeFieldsSetToDefaultProtoMarshal   = 1

	opCodeIntDeltaPositive = 0
	opCodeIntDeltaNegative = 1

	opCodeBitsetValueIsNotSet = 0
	opCodeBitsetValueIsSet    = 1

	opCodeBoolTrue  = 1
	opCodeBoolFalse = 0
)
   112  
var (
	// typeOfBytes is the reflect.Type of []byte.
	typeOfBytes = reflect.TypeOf(([]byte)(nil))

	// Maps protobuf types to our custom type as described above. Proto types
	// that have no entry here are reported as not custom encoded by
	// isCustomField.
	mapProtoTypeToCustomFieldType = map[dpb.FieldDescriptorProto_Type]customFieldType{
		dpb.FieldDescriptorProto_TYPE_DOUBLE: float64Field,
		dpb.FieldDescriptorProto_TYPE_FLOAT:  float32Field,

		dpb.FieldDescriptorProto_TYPE_INT64:    signedInt64Field,
		dpb.FieldDescriptorProto_TYPE_SFIXED64: signedInt64Field,

		dpb.FieldDescriptorProto_TYPE_UINT64:  unsignedInt64Field,
		dpb.FieldDescriptorProto_TYPE_FIXED64: unsignedInt64Field,

		dpb.FieldDescriptorProto_TYPE_INT32:    signedInt32Field,
		dpb.FieldDescriptorProto_TYPE_SFIXED32: signedInt32Field,
		// Signed because that's how Proto encodes it (can technically have negative
		// enum values but it's not recommended for compression reasons).
		dpb.FieldDescriptorProto_TYPE_ENUM: signedInt32Field,

		dpb.FieldDescriptorProto_TYPE_UINT32:  unsignedInt32Field,
		dpb.FieldDescriptorProto_TYPE_FIXED32: unsignedInt32Field,

		dpb.FieldDescriptorProto_TYPE_SINT32: signedInt32Field,
		dpb.FieldDescriptorProto_TYPE_SINT64: signedInt64Field,

		// Strings and bytes share the same custom type.
		dpb.FieldDescriptorProto_TYPE_STRING: bytesField,
		dpb.FieldDescriptorProto_TYPE_BYTES:  bytesField,

		dpb.FieldDescriptorProto_TYPE_BOOL: boolField,
	}
)
   145  
// marshalledField pairs a protobuf field number with a byte slice for that
// field.
// NOTE(review): marshalled presumably holds the proto-marshalled encoding of
// the field; confirm against the encoder, which populates it.
type marshalledField struct {
	fieldNum   int32
	marshalled []byte
}
   150  
// sortedMarshalledFields is a slice of marshalledField whose Len, Less and
// Swap methods order elements by ascending fieldNum.
type sortedMarshalledFields []marshalledField
   152  
// customFieldState is used to track any required state for encoding / decoding a single
// field in the encoder / iterator respectively.
type customFieldState struct {
	// Bytes State. TODO(rartoul): Wrap this up in an encoderAndIterator like
	// the floats and ints.
	bytesFieldDict         []encoderBytesFieldDictState
	iteratorBytesFieldDict [][]byte
	// Float state. Works as both an encoder and iterator (I.E the encoder calls
	// the encode methods and the iterator calls the read methods).
	floatEncAndIter m3tsz.FloatEncoderAndIterator
	// Int state. newCustomFieldState flags it as unsigned for the unsigned
	// integer field types.
	intEncAndIter intEncoderAndIterator

	// fieldNum is the protobuf field number, protoFieldType is the field's
	// protobuf type and fieldType is the custom type it was mapped to; all
	// three are set by newCustomFieldState.
	fieldNum       int
	protoFieldType dpb.FieldDescriptorProto_Type
	fieldType      customFieldType
}
   170  
// encoderBytesFieldDictState describes a single bytes value tracked in
// customFieldState.bytesFieldDict.
type encoderBytesFieldDictState struct {
	// We store the hash so we can perform fast equality checks, and
	// we store the startPos + length so that when we have a value
	// that matches a hash, we can be certain it's not a hash collision
	// by comparing the bytes against those we already wrote into the
	// stream.
	hash     uint64
	startPos uint32
	length   uint32
}
   181  
   182  func newCustomFieldState(
   183  	fieldNum int,
   184  	protoFieldType dpb.FieldDescriptorProto_Type,
   185  	customFieldType customFieldType,
   186  ) customFieldState {
   187  	s := customFieldState{
   188  		fieldNum:       fieldNum,
   189  		fieldType:      customFieldType,
   190  		protoFieldType: protoFieldType}
   191  	if isUnsignedInt(customFieldType) {
   192  		s.intEncAndIter.unsigned = true
   193  	}
   194  	return s
   195  }
   196  
   197  // TODO(rartoul): Improve this function to be less naive and actually explore nested messages
   198  // for fields that we can use our custom compression on: https://github.com/m3db/m3/issues/1471
   199  func customAndNonCustomFields(
   200  	customFields []customFieldState,
   201  	nonCustomFields []marshalledField,
   202  	schema *desc.MessageDescriptor,
   203  ) ([]customFieldState, []marshalledField) {
   204  	fields := schema.GetFields()
   205  	numCustomFields := numCustomFields(schema)
   206  	numNonCustomFields := len(fields) - numCustomFields
   207  
   208  	if cap(customFields) >= numCustomFields {
   209  		for i := range customFields {
   210  			customFields[i] = customFieldState{}
   211  		}
   212  		customFields = customFields[:0]
   213  	} else {
   214  		customFields = make([]customFieldState, 0, numCustomFields)
   215  	}
   216  
   217  	if cap(nonCustomFields) >= numNonCustomFields {
   218  		for i := range nonCustomFields {
   219  			nonCustomFields[i] = marshalledField{}
   220  		}
   221  		nonCustomFields = nonCustomFields[:0]
   222  	} else {
   223  		nonCustomFields = make([]marshalledField, 0, numNonCustomFields)
   224  	}
   225  
   226  	var (
   227  		prevFieldNum int32 = -1
   228  		isSorted           = true
   229  	)
   230  	for _, field := range fields {
   231  		var (
   232  			fieldType = field.GetType()
   233  			fieldNum  = field.GetNumber()
   234  		)
   235  		if fieldNum < prevFieldNum {
   236  			isSorted = false
   237  		}
   238  
   239  		customFieldType, ok := isCustomField(fieldType, field.IsRepeated())
   240  		if !ok {
   241  			nonCustomFields = append(nonCustomFields, marshalledField{fieldNum: fieldNum})
   242  			continue
   243  		}
   244  
   245  		fieldState := newCustomFieldState(int(fieldNum), fieldType, customFieldType)
   246  		customFields = append(customFields, fieldState)
   247  	}
   248  
   249  	if !isSorted {
   250  		sort.Slice(customFields, func(a, b int) bool {
   251  			return customFields[a].fieldNum < customFields[b].fieldNum
   252  		})
   253  		sort.Slice(nonCustomFields, func(a, b int) bool {
   254  			return nonCustomFields[a].fieldNum < nonCustomFields[b].fieldNum
   255  		})
   256  	}
   257  
   258  	return customFields, nonCustomFields
   259  }
   260  
   261  func isCustomFloatEncodedField(t customFieldType) bool {
   262  	return t == float64Field || t == float32Field
   263  }
   264  
   265  func isCustomIntEncodedField(t customFieldType) bool {
   266  	return t == signedInt64Field ||
   267  		t == unsignedInt64Field ||
   268  		t == signedInt32Field ||
   269  		t == unsignedInt32Field
   270  }
   271  
   272  func isUnsignedInt(t customFieldType) bool {
   273  	return t == unsignedInt64Field || t == unsignedInt32Field
   274  }
   275  
   276  func numCustomFields(schema *desc.MessageDescriptor) int {
   277  	var (
   278  		fields          = schema.GetFields()
   279  		numCustomFields = 0
   280  	)
   281  
   282  	for _, field := range fields {
   283  		if _, ok := isCustomField(field.GetType(), field.IsRepeated()); ok {
   284  			numCustomFields++
   285  		}
   286  	}
   287  
   288  	return numCustomFields
   289  }
   290  
   291  func isCustomField(fieldType dpb.FieldDescriptorProto_Type, isRepeated bool) (customFieldType, bool) {
   292  	if isRepeated {
   293  		return -1, false
   294  	}
   295  
   296  	customFieldType, ok := mapProtoTypeToCustomFieldType[fieldType]
   297  	return customFieldType, ok
   298  }
   299  
   300  func fieldsContains(fieldNum int32, fields []*desc.FieldDescriptor) bool {
   301  	for _, field := range fields {
   302  		if field.GetNumber() == fieldNum {
   303  			return true
   304  		}
   305  	}
   306  	return false
   307  }
   308  
   309  // numBitsRequiredForNumUpToN returns the number of bits that are required
   310  // to represent all the possible numbers between 0 and n as a uint64.
   311  //
   312  // 4   --> 2
   313  // 8   --> 3
   314  // 16  --> 4
   315  // 32  --> 5
   316  // 64  --> 6
   317  // 128 --> 7
   318  func numBitsRequiredForNumUpToN(n int) int {
   319  	count := 0
   320  	for n > 0 {
   321  		count++
   322  		n = n >> 1
   323  	}
   324  	return count
   325  }
   326  
   327  func (m sortedMarshalledFields) Len() int {
   328  	return len(m)
   329  }
   330  
   331  func (m sortedMarshalledFields) Less(i, j int) bool {
   332  	return m[i].fieldNum < m[j].fieldNum
   333  }
   334  
   335  func (m sortedMarshalledFields) Swap(i, j int) {
   336  	m[i], m[j] = m[j], m[i]
   337  }