github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/dbnode/encoding/proto/encoder.go

github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/dbnode/encoding/proto/encoder.go (about)

     1  // Copyright (c) 2019 Uber Technologies, Inc.
     2  //
     3  // Permission is hereby granted, free of charge, to any person obtaining a copy
     4  // of this software and associated documentation files (the "Software"), to deal
     5  // in the Software without restriction, including without limitation the rights
     6  // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     7  // copies of the Software, and to permit persons to whom the Software is
     8  // furnished to do so, subject to the following conditions:
     9  //
    10  // The above copyright notice and this permission notice shall be included in
    11  // all copies or substantial portions of the Software.
    12  //
    13  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    14  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    15  // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    16  // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    17  // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    18  // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    19  // THE SOFTWARE.
    20  
    21  package proto
    22  
    23  import (
    24  	"bytes"
    25  	"encoding/binary"
    26  	"fmt"
    27  
    28  	"github.com/m3db/m3/src/dbnode/encoding"
    29  	"github.com/m3db/m3/src/dbnode/encoding/m3tsz"
    30  	"github.com/m3db/m3/src/dbnode/namespace"
    31  	"github.com/m3db/m3/src/dbnode/ts"
    32  	"github.com/m3db/m3/src/dbnode/x/xio"
    33  	"github.com/m3db/m3/src/x/checked"
    34  	"github.com/m3db/m3/src/x/context"
    35  	"github.com/m3db/m3/src/x/instrument"
    36  	xtime "github.com/m3db/m3/src/x/time"
    37  
    38  	"github.com/cespare/xxhash/v2"
    39  	"github.com/jhump/protoreflect/desc"
    40  )
    41  
    42  // Make sure encoder implements encoding.Encoder.
    43  var _ encoding.Encoder = &Encoder{}
    44  
const (
	// currentEncodingSchemeVersion is the encoding scheme version; it is the
	// first varint written into every stream header by encodeStreamHeader.
	currentEncodingSchemeVersion = 1
)
    48  
var (
	// encErrPrefix is prepended to every error message produced by the encoder.
	encErrPrefix = "proto encoder:"
	// errEncoderSchemaIsRequired is reported by Encode when no schema has been set.
	errEncoderSchemaIsRequired = fmt.Errorf("%s schema is required", encErrPrefix)
	// errEncoderMessageHasUnknownFields is not referenced in the visible part of
	// this file. NOTE(review): presumably used by the unmarshalling path; confirm.
	errEncoderMessageHasUnknownFields = fmt.Errorf("%s message has unknown fields", encErrPrefix)
	// errEncoderClosed is returned by isUsable once the encoder has been closed.
	errEncoderClosed = fmt.Errorf("%s encoder is closed", encErrPrefix)
	// errNoEncodedDatapoints is returned by LastEncoded and LastAnnotationChecksum
	// when nothing has been encoded since the last reset.
	errNoEncodedDatapoints = fmt.Errorf("%s encoder has no encoded datapoints", encErrPrefix)
)
    56  
// Encoder compresses arbitrary ProtoBuf streams given a schema.
type Encoder struct {
	// opts supplies the pools, the default time unit and the bytes-field
	// dictionary LRU size used throughout the encoder.
	opts encoding.Options

	// stream is the output bit stream that all encoded data is written to.
	stream encoding.OStream
	// schemaDesc is the schema descriptor most recently passed to SetSchema.
	schemaDesc namespace.SchemaDescr
	// schema is the message descriptor obtained from schemaDesc; Encode fails
	// while it is nil.
	schema *desc.MessageDescriptor

	// numEncoded counts the messages encoded since the last reset.
	numEncoded int
	// lastEncodedDP is the datapoint handed to the most recent successful Encode
	// (with Value forced to zero).
	lastEncodedDP ts.Datapoint
	// customFields holds per-field state for the fields that get custom
	// (non-protobuf) compression; sorted by field number.
	customFields []customFieldState
	// nonCustomFields holds the last marshalled value of every field that is
	// encoded as plain marshalled protobuf.
	nonCustomFields []marshalledField
	// prevAnnotation is the annotation of the most recently encoded message; it
	// is hashed by LastAnnotationChecksum.
	prevAnnotation ts.Annotation

	// Fields that are reused between function calls to
	// avoid allocations.
	// NOTE(review): varIntBuf is presumably the scratch space used by
	// encodeVarInt, which is not visible in this part of the file.
	varIntBuf [8]byte
	// fieldsChangedToDefault collects, per message, the non-custom field numbers
	// whose value changed back to the default (see encodeNonCustomValues).
	fieldsChangedToDefault []int32
	// marshalBuf accumulates the marshalled bytes of changed non-custom fields
	// for the message currently being encoded.
	marshalBuf []byte

	// unmarshaller is lazily created on the first call to Encode.
	unmarshaller customFieldUnmarshaller

	// hasEncodedSchema is true once the custom schema types have been written
	// to the stream; resetSchema clears it when a new schema is installed.
	hasEncodedSchema bool
	// closed is set by Close and cleared by reset.
	closed bool

	// stats tracks the number of uncompressed bytes passed to Encode.
	stats encoderStats
	// timestampEncoder encodes the timestamp portion of every message.
	timestampEncoder m3tsz.TimestampEncoder
}
    85  
// EncoderStats contains statistics about the encoders compression performance.
type EncoderStats struct {
	// UncompressedBytes is the total length of the marshalled protobuf messages
	// passed to Encode.
	UncompressedBytes int
	// CompressedBytes is the current length of the encoded stream (Encoder.Len).
	CompressedBytes int
}
    91  
    92  type encoderStats struct {
    93  	uncompressedBytes int
    94  }
    95  
    96  func (s *encoderStats) IncUncompressedBytes(x int) {
    97  	s.uncompressedBytes += x
    98  }
    99  
   100  // NewEncoder creates a new protobuf encoder.
   101  func NewEncoder(start xtime.UnixNano, opts encoding.Options) *Encoder {
   102  	initAllocIfEmpty := opts.EncoderPool() == nil
   103  	stream := encoding.NewOStream(nil, initAllocIfEmpty, opts.BytesPool())
   104  	return &Encoder{
   105  		opts:   opts,
   106  		stream: stream,
   107  		timestampEncoder: m3tsz.NewTimestampEncoder(
   108  			start, opts.DefaultTimeUnit(), opts),
   109  		varIntBuf: [8]byte{},
   110  	}
   111  }
   112  
   113  // Encode encodes a timestamp and a protobuf message. The function signature is strange
   114  // in order to implement the encoding.Encoder interface. It accepts a ts.Datapoint, but
   115  // only the Timestamp field will be used, the Value field will be ignored and will always
   116  // return 0 on subsequent iteration. In addition, the provided annotation is expected to
   117  // be a marshalled protobuf message that matches the configured schema.
   118  func (enc *Encoder) Encode(dp ts.Datapoint, timeUnit xtime.Unit, protoBytes ts.Annotation) error {
   119  	if unusableErr := enc.isUsable(); unusableErr != nil {
   120  		return unusableErr
   121  	}
   122  
   123  	if enc.schema == nil {
   124  		// It is a programmatic error that schema is not set at all prior to encoding, panic to fix it asap.
   125  		return instrument.InvariantErrorf(errEncoderSchemaIsRequired.Error())
   126  	}
   127  
   128  	// Proto encoder value is meaningless, but make sure its always zero just to be safe so that
   129  	// it doesn't cause LastEncoded() to produce invalid results.
   130  	dp.Value = float64(0)
   131  
   132  	if enc.unmarshaller == nil {
   133  		// Lazy init.
   134  		enc.unmarshaller = newCustomFieldUnmarshaller(customUnmarshallerOptions{})
   135  	}
   136  	// resetAndUnmarshal before any data is written so that the marshalled message can be validated
   137  	// upfront, otherwise errors could be encountered mid-write leaving the stream in a corrupted state.
   138  	if err := enc.unmarshaller.resetAndUnmarshal(enc.schema, protoBytes); err != nil {
   139  		return fmt.Errorf(
   140  			"%s error unmarshalling message: %v", encErrPrefix, err)
   141  	}
   142  
   143  	if enc.numEncoded == 0 {
   144  		enc.encodeStreamHeader()
   145  	}
   146  
   147  	var (
   148  		needToEncodeSchema   = !enc.hasEncodedSchema
   149  		needToEncodeTimeUnit = timeUnit != enc.timestampEncoder.TimeUnit
   150  	)
   151  	if needToEncodeSchema || needToEncodeTimeUnit {
   152  		enc.encodeSchemaAndOrTimeUnit(needToEncodeSchema, needToEncodeTimeUnit, timeUnit)
   153  	} else {
   154  		// Control bit that indicates the stream has more data but no time unit or schema changes.
   155  		enc.stream.WriteBit(opCodeMoreData)
   156  	}
   157  
   158  	err := enc.timestampEncoder.WriteTime(enc.stream, dp.TimestampNanos, nil, timeUnit)
   159  	if err != nil {
   160  		return fmt.Errorf(
   161  			"%s error encoding timestamp: %v", encErrPrefix, err)
   162  	}
   163  
   164  	if err := enc.encodeProto(protoBytes); err != nil {
   165  		return fmt.Errorf(
   166  			"%s error encoding proto portion of message: %v", encErrPrefix, err)
   167  	}
   168  
   169  	enc.numEncoded++
   170  	enc.lastEncodedDP = dp
   171  	enc.prevAnnotation = protoBytes
   172  	enc.stats.IncUncompressedBytes(len(protoBytes))
   173  	return nil
   174  }
   175  
   176  func (enc *Encoder) encodeSchemaAndOrTimeUnit(
   177  	needToEncodeSchema bool,
   178  	needToEncodeTimeUnit bool,
   179  	timeUnit xtime.Unit,
   180  ) {
   181  	// First bit means either there is no more data OR the time unit and/or schema has changed.
   182  	enc.stream.WriteBit(opCodeNoMoreDataOrTimeUnitChangeAndOrSchemaChange)
   183  	// Next bit means there is more data, but the time unit and/or schema has changed.
   184  	enc.stream.WriteBit(opCodeTimeUnitChangeAndOrSchemaChange)
   185  
   186  	// Next bit is a boolean indicating whether the time unit has changed.
   187  	if needToEncodeTimeUnit {
   188  		enc.stream.WriteBit(opCodeTimeUnitChange)
   189  	} else {
   190  		enc.stream.WriteBit(opCodeTimeUnitUnchanged)
   191  	}
   192  
   193  	// Next bit is a boolean indicating whether the schema has changed.
   194  	if needToEncodeSchema {
   195  		enc.stream.WriteBit(opCodeSchemaChange)
   196  	} else {
   197  		enc.stream.WriteBit(opCodeSchemaUnchanged)
   198  	}
   199  
   200  	if needToEncodeTimeUnit {
   201  		// The encoder manages encoding time unit changes manually (instead of deferring to
   202  		// the timestamp encoder) because by default the WriteTime() API will use a marker
   203  		// encoding scheme that relies on looking ahead into the stream for bit combinations that
   204  		// could not possibly exist in the M3TSZ encoding scheme.
   205  		// The protobuf encoder can't rely on this behavior because its possible for the protobuf
   206  		// encoder to encode a legitimate bit combination that matches the "impossible" M3TSZ
   207  		// markers exactly.
   208  		enc.timestampEncoder.WriteTimeUnit(enc.stream, timeUnit)
   209  	}
   210  
   211  	if needToEncodeSchema {
   212  		enc.encodeCustomSchemaTypes()
   213  		enc.hasEncodedSchema = true
   214  	}
   215  }
   216  
   217  // Stream returns a copy of the underlying data stream.
   218  func (enc *Encoder) Stream(ctx context.Context) (xio.SegmentReader, bool) {
   219  	seg := enc.segmentZeroCopy(ctx)
   220  	if seg.Len() == 0 {
   221  		return nil, false
   222  	}
   223  
   224  	if readerPool := enc.opts.SegmentReaderPool(); readerPool != nil {
   225  		reader := readerPool.Get()
   226  		reader.Reset(seg)
   227  		return reader, true
   228  	}
   229  	return xio.NewSegmentReader(seg), true
   230  }
   231  
   232  func (enc *Encoder) segmentZeroCopy(ctx context.Context) ts.Segment {
   233  	length := enc.stream.Len()
   234  	if enc.stream.Len() == 0 {
   235  		return ts.Segment{}
   236  	}
   237  
   238  	// We need a tail to capture an immutable snapshot of the encoder data
   239  	// as the last byte can change after this method returns.
   240  	rawBuffer, _ := enc.stream.RawBytes()
   241  	lastByte := rawBuffer[length-1]
   242  
   243  	// Take ref up to last byte.
   244  	headBytes := rawBuffer[:length-1]
   245  
   246  	// Zero copy from the output stream.
   247  	var head checked.Bytes
   248  	if pool := enc.opts.CheckedBytesWrapperPool(); pool != nil {
   249  		head = pool.Get(headBytes)
   250  	} else {
   251  		head = checked.NewBytes(headBytes, nil)
   252  	}
   253  
   254  	// Make sure the ostream bytes ref is delayed from finalizing
   255  	// until this operation is complete (since this is zero copy).
   256  	buffer, _ := enc.stream.CheckedBytes()
   257  	ctx.RegisterCloser(buffer.DelayFinalizer())
   258  
   259  	// Take a shared ref to a known good tail.
   260  	tail := tails[lastByte]
   261  
   262  	// Only discard the head since tails are shared for process life time.
   263  	return ts.NewSegment(head, tail, 0, ts.FinalizeHead)
   264  }
   265  
   266  func (enc *Encoder) segmentTakeOwnership() ts.Segment {
   267  	length := enc.stream.Len()
   268  	if length == 0 {
   269  		return ts.Segment{}
   270  	}
   271  
   272  	// Take ref from the ostream.
   273  	head := enc.stream.Discard()
   274  
   275  	return ts.NewSegment(head, nil, 0, ts.FinalizeHead)
   276  }
   277  
   278  // NumEncoded returns the number of encoded messages.
   279  func (enc *Encoder) NumEncoded() int {
   280  	return enc.numEncoded
   281  }
   282  
   283  // LastEncoded returns the last encoded datapoint. Does not include
   284  // annotation / protobuf message for interface purposes.
   285  func (enc *Encoder) LastEncoded() (ts.Datapoint, error) {
   286  	if unusableErr := enc.isUsable(); unusableErr != nil {
   287  		return ts.Datapoint{}, unusableErr
   288  	}
   289  
   290  	if enc.numEncoded == 0 {
   291  		return ts.Datapoint{}, errNoEncodedDatapoints
   292  	}
   293  
   294  	// Value is meaningless for proto encoder and should already be zero,
   295  	// but set it again to be safe.
   296  	enc.lastEncodedDP.Value = 0
   297  	return enc.lastEncodedDP, nil
   298  }
   299  
   300  // LastAnnotationChecksum returns the checksum of the last encoded annotation (which contain the bytes
   301  // used for ProtoBuf data).
   302  func (enc *Encoder) LastAnnotationChecksum() (uint64, error) {
   303  	if enc.numEncoded == 0 {
   304  		return 0, errNoEncodedDatapoints
   305  	}
   306  
   307  	return xxhash.Sum64(enc.prevAnnotation), nil
   308  }
   309  
   310  // Len returns the length of the data stream.
   311  func (enc *Encoder) Len() int {
   312  	return enc.stream.Len()
   313  }
   314  
   315  // Empty returns true when underlying stream is empty.
   316  func (enc *Encoder) Empty() bool {
   317  	return enc.stream.Empty()
   318  }
   319  
   320  // Stats returns EncoderStats which contain statistics about the encoders compression
   321  // ratio.
   322  func (enc *Encoder) Stats() EncoderStats {
   323  	return EncoderStats{
   324  		UncompressedBytes: enc.stats.uncompressedBytes,
   325  		CompressedBytes:   enc.Len(),
   326  	}
   327  }
   328  
   329  func (enc *Encoder) encodeStreamHeader() {
   330  	enc.encodeVarInt(currentEncodingSchemeVersion)
   331  	enc.encodeVarInt(uint64(enc.opts.ByteFieldDictionaryLRUSize()))
   332  }
   333  
   334  func (enc *Encoder) encodeCustomSchemaTypes() {
   335  	if len(enc.customFields) == 0 {
   336  		enc.encodeVarInt(0)
   337  		return
   338  	}
   339  
   340  	// Field numbers are 1-indexed so encoding the maximum field number
   341  	// at the beginning is equivalent to encoding the number of types
   342  	// we need to read after if we imagine that we're encoding a 1-indexed
   343  	// bitset where the position in the bitset encodes the field number (I.E
   344  	// the first value is the type for field number 1) and the values are
   345  	// the number of bits required to unique identify a custom type instead of
   346  	// just being a single bit (3 bits in the case of version 1 of the encoding
   347  	// scheme.)
   348  	maxFieldNum := enc.customFields[len(enc.customFields)-1].fieldNum
   349  	enc.encodeVarInt(uint64(maxFieldNum))
   350  
   351  	// Start at 1 because we're zero-indexed.
   352  	for i := 1; i <= maxFieldNum; i++ {
   353  		customTypeBits := uint64(notCustomEncodedField)
   354  		for _, customField := range enc.customFields {
   355  			if customField.fieldNum == i {
   356  				customTypeBits = uint64(customField.fieldType)
   357  				break
   358  			}
   359  		}
   360  
   361  		enc.stream.WriteBits(
   362  			customTypeBits,
   363  			numBitsToEncodeCustomType)
   364  	}
   365  }
   366  
   367  func (enc *Encoder) encodeProto(buf []byte) error {
   368  	var (
   369  		sortedTopLevelScalarValues    = enc.unmarshaller.sortedCustomFieldValues()
   370  		sortedTopLevelScalarValuesIdx = 0
   371  		lastMarshalledValue           unmarshalValue
   372  	)
   373  
   374  	// Loop through the customFields slice and sortedTopLevelScalarValues slice (both
   375  	// of which are sorted by field number) at the same time and match each customField
   376  	// to its encoded value in the stream (if any).
   377  	for i, customField := range enc.customFields {
   378  		if sortedTopLevelScalarValuesIdx < len(sortedTopLevelScalarValues) {
   379  			lastMarshalledValue = sortedTopLevelScalarValues[sortedTopLevelScalarValuesIdx]
   380  		}
   381  
   382  		lastMarshalledValueFieldNumber := -1
   383  
   384  		hasNext := sortedTopLevelScalarValuesIdx < len(sortedTopLevelScalarValues)
   385  		if hasNext {
   386  			lastMarshalledValueFieldNumber = int(lastMarshalledValue.fieldNumber)
   387  		}
   388  
   389  		// Since both the customFields slice and the sortedTopLevelScalarValues slice
   390  		// are sorted by field number, if the scalar slice contains no more values or
   391  		// it contains a next value, but the field number is not equal to the field number
   392  		// of the current customField, it is safe to conclude that the current customField's
   393  		// value was not encoded in this message which means that it should be interpreted
   394  		// as the default value for that field according to the proto3 specification.
   395  		noMarshalledValue := (!hasNext ||
   396  			customField.fieldNum != lastMarshalledValueFieldNumber)
   397  		if noMarshalledValue {
   398  			err := enc.encodeZeroValue(i)
   399  			if err != nil {
   400  				return err
   401  			}
   402  			continue
   403  		}
   404  
   405  		switch {
   406  		case isCustomFloatEncodedField(customField.fieldType):
   407  			enc.encodeTSZValue(i, lastMarshalledValue.asFloat64())
   408  
   409  		case isCustomIntEncodedField(customField.fieldType):
   410  			if isUnsignedInt(customField.fieldType) {
   411  				enc.encodeUnsignedIntValue(i, lastMarshalledValue.asUint64())
   412  			} else {
   413  				enc.encodeSignedIntValue(i, lastMarshalledValue.asInt64())
   414  			}
   415  
   416  		case customField.fieldType == bytesField:
   417  			err := enc.encodeBytesValue(i, lastMarshalledValue.asBytes())
   418  			if err != nil {
   419  				return err
   420  			}
   421  
   422  		case customField.fieldType == boolField:
   423  			enc.encodeBoolValue(i, lastMarshalledValue.asBool())
   424  
   425  		default:
   426  			// This should never happen.
   427  			return fmt.Errorf(
   428  				"%s error no logic for custom encoding field number: %d",
   429  				encErrPrefix, customField.fieldNum)
   430  		}
   431  
   432  		sortedTopLevelScalarValuesIdx++
   433  	}
   434  
   435  	if err := enc.encodeNonCustomValues(); err != nil {
   436  		return err
   437  	}
   438  
   439  	return nil
   440  }
   441  
   442  func (enc *Encoder) encodeZeroValue(i int) error {
   443  	customField := enc.customFields[i]
   444  	switch {
   445  	case isCustomFloatEncodedField(customField.fieldType):
   446  		var zeroFloat64 float64
   447  		enc.encodeTSZValue(i, zeroFloat64)
   448  		return nil
   449  
   450  	case isCustomIntEncodedField(customField.fieldType):
   451  		if isUnsignedInt(customField.fieldType) {
   452  			var zeroUInt64 uint64
   453  			enc.encodeUnsignedIntValue(i, zeroUInt64)
   454  		} else {
   455  			var zeroInt64 int64
   456  			enc.encodeSignedIntValue(i, zeroInt64)
   457  		}
   458  		return nil
   459  
   460  	case customField.fieldType == bytesField:
   461  		var zeroBytes []byte
   462  		return enc.encodeBytesValue(i, zeroBytes)
   463  
   464  	case customField.fieldType == boolField:
   465  		enc.encodeBoolValue(i, false)
   466  		return nil
   467  
   468  	default:
   469  		// This should never happen.
   470  		return fmt.Errorf(
   471  			"%s error no logic for custom encoding field number: %d",
   472  			encErrPrefix, customField.fieldNum)
   473  	}
   474  }
   475  
   476  // Reset resets the encoder for reuse.
   477  func (enc *Encoder) Reset(
   478  	start xtime.UnixNano,
   479  	capacity int,
   480  	descr namespace.SchemaDescr,
   481  ) {
   482  	enc.SetSchema(descr)
   483  	enc.reset(start, capacity)
   484  }
   485  
   486  // SetSchema sets the schema for the encoder.
   487  func (enc *Encoder) SetSchema(descr namespace.SchemaDescr) {
   488  	if descr == nil {
   489  		enc.schemaDesc = nil
   490  		enc.resetSchema(nil)
   491  		return
   492  	}
   493  
   494  	// Noop if schema has not changed.
   495  	if enc.schemaDesc != nil && len(descr.DeployId()) != 0 && enc.schemaDesc.DeployId() == descr.DeployId() {
   496  		return
   497  	}
   498  
   499  	enc.schemaDesc = descr
   500  	enc.resetSchema(descr.Get().MessageDescriptor)
   501  }
   502  
   503  func (enc *Encoder) reset(start xtime.UnixNano, capacity int) {
   504  	enc.stream.Reset(enc.newBuffer(capacity))
   505  	enc.timestampEncoder = m3tsz.NewTimestampEncoder(
   506  		start, enc.opts.DefaultTimeUnit(), enc.opts)
   507  	enc.lastEncodedDP = ts.Datapoint{}
   508  
   509  	// Prevent this from growing too large and remaining in the pools.
   510  	enc.marshalBuf = nil
   511  
   512  	if enc.schema != nil {
   513  		enc.customFields, enc.nonCustomFields = customAndNonCustomFields(enc.customFields, enc.nonCustomFields, enc.schema)
   514  	}
   515  
   516  	enc.closed = false
   517  	enc.numEncoded = 0
   518  }
   519  
   520  func (enc *Encoder) resetSchema(schema *desc.MessageDescriptor) {
   521  	enc.schema = schema
   522  	if enc.schema == nil {
   523  		// Clear but don't set to nil so they don't need to be reallocated
   524  		// next time.
   525  		customFields := enc.customFields
   526  		for i := range customFields {
   527  			customFields[i] = customFieldState{}
   528  		}
   529  		enc.customFields = customFields[:0]
   530  
   531  		nonCustomFields := enc.nonCustomFields
   532  		for i := range nonCustomFields {
   533  			nonCustomFields[i] = marshalledField{}
   534  		}
   535  		enc.nonCustomFields = nonCustomFields[:0]
   536  		return
   537  	}
   538  
   539  	enc.customFields, enc.nonCustomFields = customAndNonCustomFields(enc.customFields, enc.nonCustomFields, enc.schema)
   540  	enc.hasEncodedSchema = false
   541  }
   542  
   543  // Close closes the encoder.
   544  func (enc *Encoder) Close() {
   545  	if enc.closed {
   546  		return
   547  	}
   548  
   549  	enc.Reset(0, 0, nil)
   550  	enc.stream.Reset(nil)
   551  	enc.closed = true
   552  
   553  	if pool := enc.opts.EncoderPool(); pool != nil {
   554  		pool.Put(enc)
   555  	}
   556  }
   557  
   558  // Discard closes the encoder and transfers ownership of the data stream to
   559  // the caller.
   560  func (enc *Encoder) Discard() ts.Segment {
   561  	segment := enc.segmentTakeOwnership()
   562  	// Close the encoder since its no longer needed
   563  	enc.Close()
   564  	return segment
   565  }
   566  
   567  // DiscardReset does the same thing as Discard except it also resets the encoder
   568  // for reuse.
   569  func (enc *Encoder) DiscardReset(
   570  	start xtime.UnixNano,
   571  	capacity int,
   572  	descr namespace.SchemaDescr,
   573  ) ts.Segment {
   574  	segment := enc.segmentTakeOwnership()
   575  	enc.Reset(start, capacity, descr)
   576  	return segment
   577  }
   578  
   579  // Bytes returns the raw bytes of the underlying data stream. Does not
   580  // transfer ownership and is generally unsafe.
   581  func (enc *Encoder) Bytes() ([]byte, error) {
   582  	if unusableErr := enc.isUsable(); unusableErr != nil {
   583  		return nil, unusableErr
   584  	}
   585  
   586  	bytes, _ := enc.stream.RawBytes()
   587  	return bytes, nil
   588  }
   589  
   590  func (enc *Encoder) encodeTSZValue(i int, val float64) {
   591  	enc.customFields[i].floatEncAndIter.WriteFloat(enc.stream, val)
   592  }
   593  
   594  func (enc *Encoder) encodeSignedIntValue(i int, val int64) {
   595  	enc.customFields[i].intEncAndIter.encodeSignedIntValue(enc.stream, val)
   596  }
   597  
   598  func (enc *Encoder) encodeUnsignedIntValue(i int, val uint64) {
   599  	enc.customFields[i].intEncAndIter.encodeUnsignedIntValue(enc.stream, val)
   600  }
   601  
   602  func (enc *Encoder) encodeBytesValue(i int, val []byte) error {
   603  	var (
   604  		customField      = enc.customFields[i]
   605  		hash             = xxhash.Sum64(val)
   606  		numPreviousBytes = len(customField.bytesFieldDict)
   607  		lastStateIdx     = numPreviousBytes - 1
   608  		lastState        encoderBytesFieldDictState
   609  	)
   610  	if numPreviousBytes > 0 {
   611  		lastState = customField.bytesFieldDict[lastStateIdx]
   612  	}
   613  
   614  	if numPreviousBytes > 0 && hash == lastState.hash {
   615  		streamBytes, _ := enc.stream.RawBytes()
   616  		match, err := enc.bytesMatchEncodedDictionaryValue(
   617  			streamBytes, lastState, val)
   618  		if err != nil {
   619  			return fmt.Errorf(
   620  				"%s error checking if bytes match last encoded dictionary bytes: %v",
   621  				encErrPrefix, err)
   622  		}
   623  		if match {
   624  			// No changes control bit.
   625  			enc.stream.WriteBit(opCodeNoChange)
   626  			return nil
   627  		}
   628  	}
   629  
   630  	// Bytes changed control bit.
   631  	enc.stream.WriteBit(opCodeChange)
   632  
   633  	streamBytes, _ := enc.stream.RawBytes()
   634  	for j, state := range customField.bytesFieldDict {
   635  		if hash != state.hash {
   636  			continue
   637  		}
   638  
   639  		match, err := enc.bytesMatchEncodedDictionaryValue(
   640  			streamBytes, state, val)
   641  		if err != nil {
   642  			return fmt.Errorf(
   643  				"%s error checking if bytes match encoded dictionary bytes: %v",
   644  				encErrPrefix, err)
   645  		}
   646  		if !match {
   647  			continue
   648  		}
   649  
   650  		// Control bit means interpret next n bits as the index for the previous write
   651  		// that this matches where n is the number of bits required to represent all
   652  		// possible array indices in the configured LRU size.
   653  		enc.stream.WriteBit(opCodeInterpretSubsequentBitsAsLRUIndex)
   654  		enc.stream.WriteBits(
   655  			uint64(j),
   656  			numBitsRequiredForNumUpToN(
   657  				enc.opts.ByteFieldDictionaryLRUSize()))
   658  		enc.moveToEndOfBytesDict(i, j)
   659  		return nil
   660  	}
   661  
   662  	// Control bit means interpret subsequent bits as varInt encoding length of a new
   663  	// []byte we haven't seen before.
   664  	enc.stream.WriteBit(opCodeInterpretSubsequentBitsAsBytesLengthVarInt)
   665  
   666  	length := len(val)
   667  	enc.encodeVarInt(uint64(length))
   668  
   669  	// Add padding bits until we reach the next byte. This ensures that the startPos
   670  	// that we're going to store in the dictionary LRU will be aligned on a physical
   671  	// byte boundary which makes retrieving the bytes again later for comparison much
   672  	// easier.
   673  	//
   674  	// Note that this will waste up to a maximum of 7 bits per []byte that we encode
   675  	// which is acceptable for now, but in the future we may want to make the code able
   676  	// to do the comparison even if the bytes aren't aligned on a byte boundary in order
   677  	// to improve the compression.
   678  	//
   679  	// Also this implementation had the side-effect of making encoding and decoding of
   680  	// []byte values much faster because for long []byte the encoder and iterator can avoid
   681  	// bit manipulation and calling WriteByte() / ReadByte() in a loop and can instead read the
   682  	// entire []byte in one go.
   683  	enc.padToNextByte()
   684  
   685  	// Track the byte position we're going to start at so we can store it in the LRU after.
   686  	streamBytes, _ = enc.stream.RawBytes()
   687  	bytePos := len(streamBytes)
   688  
   689  	// Write the actual bytes.
   690  	enc.stream.WriteBytes(val)
   691  
   692  	enc.addToBytesDict(i, encoderBytesFieldDictState{
   693  		hash:     hash,
   694  		startPos: uint32(bytePos),
   695  		length:   uint32(length),
   696  	})
   697  	return nil
   698  }
   699  
   700  func (enc *Encoder) encodeBoolValue(i int, val bool) {
   701  	if val {
   702  		enc.stream.WriteBit(opCodeBoolTrue)
   703  	} else {
   704  		enc.stream.WriteBit(opCodeBoolFalse)
   705  	}
   706  }
   707  
   708  func (enc *Encoder) encodeNonCustomValues() error {
   709  	if len(enc.nonCustomFields) == 0 {
   710  		// Fast path, skip all the encoding logic entirely because there are
   711  		// no fields that require proto encoding.
   712  		// TODO(rartoul): Note that the encoding scheme could be further optimized
   713  		// such that if there are no fields that require proto encoding then we don't
   714  		// need to waste this bit per write.
   715  		enc.stream.WriteBit(opCodeNoChange)
   716  		return nil
   717  	}
   718  
   719  	// Reset for re-use.
   720  	enc.fieldsChangedToDefault = enc.fieldsChangedToDefault[:0]
   721  
   722  	var (
   723  		incomingNonCustomFields = enc.unmarshaller.sortedNonCustomFieldValues()
   724  		// Matching entries in two sorted lists in which every element in each list is unique so keep
   725  		// track of the last index at which a match was found so that subsequent inner loops can start
   726  		// at the next index.
   727  		lastMatchIdx     = -1
   728  		numChangedValues = 0
   729  	)
   730  	enc.marshalBuf = enc.marshalBuf[:0] // Reset buf for reuse.
   731  
   732  	for i, existingField := range enc.nonCustomFields {
   733  		var curVal []byte
   734  		for i := lastMatchIdx + 1; i < len(incomingNonCustomFields); i++ {
   735  			incomingField := incomingNonCustomFields[i]
   736  			if existingField.fieldNum == incomingField.fieldNum {
   737  				curVal = incomingField.marshalled
   738  				lastMatchIdx = i
   739  				break
   740  			}
   741  		}
   742  
   743  		prevVal := existingField.marshalled
   744  		if bytes.Equal(prevVal, curVal) {
   745  			// No change, nothing to encode.
   746  			continue
   747  		}
   748  
   749  		numChangedValues++
   750  		if curVal == nil {
   751  			// Interpret as default value.
   752  			enc.fieldsChangedToDefault = append(enc.fieldsChangedToDefault, existingField.fieldNum)
   753  		}
   754  		enc.marshalBuf = append(enc.marshalBuf, curVal...)
   755  
   756  		// Need to copy since the encoder no longer owns the original source of the bytes once
   757  		// this function returns.
   758  		enc.nonCustomFields[i].marshalled = append(enc.nonCustomFields[i].marshalled[:0], curVal...)
   759  	}
   760  
   761  	if numChangedValues <= 0 {
   762  		// Only want to skip encoding if nothing has changed AND we've already
   763  		// encoded the first message.
   764  		enc.stream.WriteBit(opCodeNoChange)
   765  		return nil
   766  	}
   767  
   768  	// Control bit indicating that proto values have changed.
   769  	enc.stream.WriteBit(opCodeChange)
   770  	if len(enc.fieldsChangedToDefault) > 0 {
   771  		// Control bit indicating that some fields have been set to default values
   772  		// and that a bitset will follow specifying which fields have changed.
   773  		enc.stream.WriteBit(opCodeFieldsSetToDefaultProtoMarshal)
   774  		enc.encodeBitset(enc.fieldsChangedToDefault)
   775  	} else {
   776  		// Control bit indicating that none of the changed fields have been set to
   777  		// their default values so we can do a clean merge on read.
   778  		enc.stream.WriteBit(opCodeNoFieldsSetToDefaultProtoMarshal)
   779  	}
   780  
   781  	// This wastes up to 7 bits of space per encoded message but significantly improves encoding and
   782  	// decoding speed due to the fact that the OStream and IStream can write and read the data with
   783  	// the equivalent of one memcpy as opposed to having to decode one byte at a time due to lack
   784  	// of alignment.
   785  	enc.padToNextByte()
   786  	enc.encodeVarInt(uint64(len(enc.marshalBuf)))
   787  	enc.stream.WriteBytes(enc.marshalBuf)
   788  
   789  	return nil
   790  }
   791  
   792  func (enc *Encoder) isUsable() error {
   793  	if enc.closed {
   794  		return errEncoderClosed
   795  	}
   796  
   797  	return nil
   798  }
   799  
   800  func (enc *Encoder) bytesMatchEncodedDictionaryValue(
   801  	streamBytes []byte,
   802  	dictState encoderBytesFieldDictState,
   803  	currBytes []byte,
   804  ) (bool, error) {
   805  	var (
   806  		prevEncodedBytesStart = dictState.startPos
   807  		prevEncodedBytesEnd   = prevEncodedBytesStart + dictState.length
   808  	)
   809  
   810  	if prevEncodedBytesEnd > uint32(len(streamBytes)) {
   811  		// Should never happen.
   812  		return false, fmt.Errorf(
   813  			"bytes position in LRU is outside of stream bounds, streamSize: %d, startPos: %d, length: %d",
   814  			len(streamBytes), prevEncodedBytesStart, dictState.length)
   815  	}
   816  
   817  	return bytes.Equal(streamBytes[prevEncodedBytesStart:prevEncodedBytesEnd], currBytes), nil
   818  }
   819  
   820  // padToNextByte will add padding bits in the current byte until the ostream
   821  // reaches the beginning of the next byte. This allows us begin encoding data
   822  // with the guarantee that we're aligned at a physical byte boundary.
   823  func (enc *Encoder) padToNextByte() {
   824  	_, bitPos := enc.stream.RawBytes()
   825  	for bitPos%8 != 0 {
   826  		enc.stream.WriteBit(0)
   827  		bitPos++
   828  	}
   829  }
   830  
   831  func (enc *Encoder) moveToEndOfBytesDict(fieldIdx, i int) {
   832  	existing := enc.customFields[fieldIdx].bytesFieldDict
   833  	for j := i; j < len(existing); j++ {
   834  		nextIdx := j + 1
   835  		if nextIdx >= len(existing) {
   836  			break
   837  		}
   838  
   839  		currVal := existing[j]
   840  		nextVal := existing[nextIdx]
   841  		existing[j] = nextVal
   842  		existing[nextIdx] = currVal
   843  	}
   844  }
   845  
   846  func (enc *Encoder) addToBytesDict(fieldIdx int, state encoderBytesFieldDictState) {
   847  	existing := enc.customFields[fieldIdx].bytesFieldDict
   848  	if len(existing) < enc.opts.ByteFieldDictionaryLRUSize() {
   849  		enc.customFields[fieldIdx].bytesFieldDict = append(existing, state)
   850  		return
   851  	}
   852  
   853  	// Shift everything down 1 and replace the last value to evict the
   854  	// least recently used entry and add the newest one.
   855  	//     [1,2,3]
   856  	// becomes
   857  	//     [2,3,3]
   858  	// after shift, and then becomes
   859  	//     [2,3,4]
   860  	// after replacing the last value.
   861  	for i := range existing {
   862  		nextIdx := i + 1
   863  		if nextIdx >= len(existing) {
   864  			break
   865  		}
   866  
   867  		existing[i] = existing[nextIdx]
   868  	}
   869  
   870  	existing[len(existing)-1] = state
   871  }
   872  
   873  // encodeBitset writes out a bitset in the form of:
   874  //
   875  //      varint(number of bits)|bitset
   876  //
   877  // I.E first it encodes a varint which specifies the number of following
   878  // bits to interpret as a bitset and then it encodes the provided values
   879  // as zero-indexed bitset.
   880  func (enc *Encoder) encodeBitset(values []int32) {
   881  	var max int32
   882  	for _, v := range values {
   883  		if v > max {
   884  			max = v
   885  		}
   886  	}
   887  
   888  	// Encode a varint that indicates how many of the remaining
   889  	// bits to interpret as a bitset.
   890  	enc.encodeVarInt(uint64(max))
   891  
   892  	// Encode the bitset
   893  	for i := int32(0); i < max; i++ {
   894  		wroteExists := false
   895  
   896  		for _, v := range values {
   897  			// Subtract one because the values are 1-indexed but the bitset
   898  			// is 0-indexed.
   899  			if i == v-1 {
   900  				enc.stream.WriteBit(opCodeBitsetValueIsSet)
   901  				wroteExists = true
   902  				break
   903  			}
   904  		}
   905  
   906  		if wroteExists {
   907  			continue
   908  		}
   909  
   910  		enc.stream.WriteBit(opCodeBitsetValueIsNotSet)
   911  	}
   912  }
   913  
   914  func (enc *Encoder) encodeVarInt(x uint64) {
   915  	var (
   916  		// Convert array to slice we can reuse the buffer.
   917  		buf      = enc.varIntBuf[:]
   918  		numBytes = binary.PutUvarint(buf, x)
   919  	)
   920  
   921  	// Reslice so we only write out as many bytes as is required
   922  	// to represent the number.
   923  	buf = buf[:numBytes]
   924  	enc.stream.WriteBytes(buf)
   925  }
   926  
   927  func (enc *Encoder) newBuffer(capacity int) checked.Bytes {
   928  	if bytesPool := enc.opts.BytesPool(); bytesPool != nil {
   929  		return bytesPool.Get(capacity)
   930  	}
   931  	return checked.NewBytes(make([]byte, 0, capacity), nil)
   932  }
   933  
   934  // tails is a list of all possible tails based on the
   935  // byte value of the last byte. For the proto encoder
   936  // they are all the same.
   937  var tails [256]checked.Bytes
   938  
   939  func init() {
   940  	for i := 0; i < 256; i++ {
   941  		tails[i] = checked.NewBytes([]byte{byte(i)}, nil)
   942  	}
   943  }