github.com/m3db/m3@v1.5.1-0.20231129193456-75a402aa583b/src/dbnode/encoding/proto/encoder.go (about) 1 // Copyright (c) 2019 Uber Technologies, Inc. 2 // 3 // Permission is hereby granted, free of charge, to any person obtaining a copy 4 // of this software and associated documentation files (the "Software"), to deal 5 // in the Software without restriction, including without limitation the rights 6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 // copies of the Software, and to permit persons to whom the Software is 8 // furnished to do so, subject to the following conditions: 9 // 10 // The above copyright notice and this permission notice shall be included in 11 // all copies or substantial portions of the Software. 12 // 13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 // THE SOFTWARE. 20 21 package proto 22 23 import ( 24 "bytes" 25 "encoding/binary" 26 "fmt" 27 28 "github.com/m3db/m3/src/dbnode/encoding" 29 "github.com/m3db/m3/src/dbnode/encoding/m3tsz" 30 "github.com/m3db/m3/src/dbnode/namespace" 31 "github.com/m3db/m3/src/dbnode/ts" 32 "github.com/m3db/m3/src/dbnode/x/xio" 33 "github.com/m3db/m3/src/x/checked" 34 "github.com/m3db/m3/src/x/context" 35 "github.com/m3db/m3/src/x/instrument" 36 xtime "github.com/m3db/m3/src/x/time" 37 38 "github.com/cespare/xxhash/v2" 39 "github.com/jhump/protoreflect/desc" 40 ) 41 42 // Make sure encoder implements encoding.Encoder. 
var _ encoding.Encoder = &Encoder{}

const (
	// currentEncodingSchemeVersion is written as the first varint of every
	// stream header (see encodeStreamHeader).
	currentEncodingSchemeVersion = 1
)

var (
	// encErrPrefix is prepended to every error message produced by the encoder.
	encErrPrefix                      = "proto encoder:"
	errEncoderSchemaIsRequired        = fmt.Errorf("%s schema is required", encErrPrefix)
	errEncoderMessageHasUnknownFields = fmt.Errorf("%s message has unknown fields", encErrPrefix)
	errEncoderClosed                  = fmt.Errorf("%s encoder is closed", encErrPrefix)
	errNoEncodedDatapoints            = fmt.Errorf("%s encoder has no encoded datapoints", encErrPrefix)
)

// Encoder compresses arbitrary ProtoBuf streams given a schema.
type Encoder struct {
	// opts supplies the pools, the default time unit and the bytes field
	// dictionary LRU size used by the encoder.
	opts encoding.Options

	// stream is the output bit stream that all encoded data is written to.
	stream encoding.OStream
	// schemaDesc is the descriptor most recently accepted by SetSchema and
	// schema is the message descriptor obtained from it.
	schemaDesc namespace.SchemaDescr
	schema     *desc.MessageDescriptor

	// numEncoded counts the messages encoded since the last reset.
	numEncoded int
	// lastEncodedDP is the datapoint of the last successful Encode call.
	lastEncodedDP ts.Datapoint
	// customFields holds per-field compression state for the fields that
	// are custom encoded, nonCustomFields holds the last marshalled bytes
	// of the fields that are not.
	customFields    []customFieldState
	nonCustomFields []marshalledField
	// prevAnnotation holds the protobuf bytes of the last successful Encode
	// call; it is what LastAnnotationChecksum hashes.
	prevAnnotation ts.Annotation

	// Fields that are reused between function calls to
	// avoid allocations.
	//
	// NOTE(review): binary.PutUvarint can need up to binary.MaxVarintLen64
	// (10) bytes for a uint64; an 8 byte buffer only holds values below 1<<56.
	varIntBuf              [8]byte
	fieldsChangedToDefault []int32
	marshalBuf             []byte

	// unmarshaller is lazily created on the first call to Encode.
	unmarshaller customFieldUnmarshaller

	// hasEncodedSchema records whether the custom schema types have been
	// written to the stream for the current schema.
	hasEncodedSchema bool
	// closed is set by Close and cleared by reset.
	closed bool

	stats            encoderStats
	timestampEncoder m3tsz.TimestampEncoder
}

// EncoderStats contains statistics about the encoders compression performance.
type EncoderStats struct {
	// UncompressedBytes is the total size of the marshalled messages passed to Encode.
	UncompressedBytes int
	// CompressedBytes is the current length of the encoded stream.
	CompressedBytes int
}

// encoderStats is the internal accumulator backing EncoderStats.
type encoderStats struct {
	uncompressedBytes int
}

// IncUncompressedBytes adds x to the running total of uncompressed bytes.
func (s *encoderStats) IncUncompressedBytes(x int) {
	s.uncompressedBytes += x
}

// NewEncoder creates a new protobuf encoder.
101 func NewEncoder(start xtime.UnixNano, opts encoding.Options) *Encoder { 102 initAllocIfEmpty := opts.EncoderPool() == nil 103 stream := encoding.NewOStream(nil, initAllocIfEmpty, opts.BytesPool()) 104 return &Encoder{ 105 opts: opts, 106 stream: stream, 107 timestampEncoder: m3tsz.NewTimestampEncoder( 108 start, opts.DefaultTimeUnit(), opts), 109 varIntBuf: [8]byte{}, 110 } 111 } 112 113 // Encode encodes a timestamp and a protobuf message. The function signature is strange 114 // in order to implement the encoding.Encoder interface. It accepts a ts.Datapoint, but 115 // only the Timestamp field will be used, the Value field will be ignored and will always 116 // return 0 on subsequent iteration. In addition, the provided annotation is expected to 117 // be a marshalled protobuf message that matches the configured schema. 118 func (enc *Encoder) Encode(dp ts.Datapoint, timeUnit xtime.Unit, protoBytes ts.Annotation) error { 119 if unusableErr := enc.isUsable(); unusableErr != nil { 120 return unusableErr 121 } 122 123 if enc.schema == nil { 124 // It is a programmatic error that schema is not set at all prior to encoding, panic to fix it asap. 125 return instrument.InvariantErrorf(errEncoderSchemaIsRequired.Error()) 126 } 127 128 // Proto encoder value is meaningless, but make sure its always zero just to be safe so that 129 // it doesn't cause LastEncoded() to produce invalid results. 130 dp.Value = float64(0) 131 132 if enc.unmarshaller == nil { 133 // Lazy init. 134 enc.unmarshaller = newCustomFieldUnmarshaller(customUnmarshallerOptions{}) 135 } 136 // resetAndUnmarshal before any data is written so that the marshalled message can be validated 137 // upfront, otherwise errors could be encountered mid-write leaving the stream in a corrupted state. 
138 if err := enc.unmarshaller.resetAndUnmarshal(enc.schema, protoBytes); err != nil { 139 return fmt.Errorf( 140 "%s error unmarshalling message: %v", encErrPrefix, err) 141 } 142 143 if enc.numEncoded == 0 { 144 enc.encodeStreamHeader() 145 } 146 147 var ( 148 needToEncodeSchema = !enc.hasEncodedSchema 149 needToEncodeTimeUnit = timeUnit != enc.timestampEncoder.TimeUnit 150 ) 151 if needToEncodeSchema || needToEncodeTimeUnit { 152 enc.encodeSchemaAndOrTimeUnit(needToEncodeSchema, needToEncodeTimeUnit, timeUnit) 153 } else { 154 // Control bit that indicates the stream has more data but no time unit or schema changes. 155 enc.stream.WriteBit(opCodeMoreData) 156 } 157 158 err := enc.timestampEncoder.WriteTime(enc.stream, dp.TimestampNanos, nil, timeUnit) 159 if err != nil { 160 return fmt.Errorf( 161 "%s error encoding timestamp: %v", encErrPrefix, err) 162 } 163 164 if err := enc.encodeProto(protoBytes); err != nil { 165 return fmt.Errorf( 166 "%s error encoding proto portion of message: %v", encErrPrefix, err) 167 } 168 169 enc.numEncoded++ 170 enc.lastEncodedDP = dp 171 enc.prevAnnotation = protoBytes 172 enc.stats.IncUncompressedBytes(len(protoBytes)) 173 return nil 174 } 175 176 func (enc *Encoder) encodeSchemaAndOrTimeUnit( 177 needToEncodeSchema bool, 178 needToEncodeTimeUnit bool, 179 timeUnit xtime.Unit, 180 ) { 181 // First bit means either there is no more data OR the time unit and/or schema has changed. 182 enc.stream.WriteBit(opCodeNoMoreDataOrTimeUnitChangeAndOrSchemaChange) 183 // Next bit means there is more data, but the time unit and/or schema has changed. 184 enc.stream.WriteBit(opCodeTimeUnitChangeAndOrSchemaChange) 185 186 // Next bit is a boolean indicating whether the time unit has changed. 187 if needToEncodeTimeUnit { 188 enc.stream.WriteBit(opCodeTimeUnitChange) 189 } else { 190 enc.stream.WriteBit(opCodeTimeUnitUnchanged) 191 } 192 193 // Next bit is a boolean indicating whether the schema has changed. 
194 if needToEncodeSchema { 195 enc.stream.WriteBit(opCodeSchemaChange) 196 } else { 197 enc.stream.WriteBit(opCodeSchemaUnchanged) 198 } 199 200 if needToEncodeTimeUnit { 201 // The encoder manages encoding time unit changes manually (instead of deferring to 202 // the timestamp encoder) because by default the WriteTime() API will use a marker 203 // encoding scheme that relies on looking ahead into the stream for bit combinations that 204 // could not possibly exist in the M3TSZ encoding scheme. 205 // The protobuf encoder can't rely on this behavior because its possible for the protobuf 206 // encoder to encode a legitimate bit combination that matches the "impossible" M3TSZ 207 // markers exactly. 208 enc.timestampEncoder.WriteTimeUnit(enc.stream, timeUnit) 209 } 210 211 if needToEncodeSchema { 212 enc.encodeCustomSchemaTypes() 213 enc.hasEncodedSchema = true 214 } 215 } 216 217 // Stream returns a copy of the underlying data stream. 218 func (enc *Encoder) Stream(ctx context.Context) (xio.SegmentReader, bool) { 219 seg := enc.segmentZeroCopy(ctx) 220 if seg.Len() == 0 { 221 return nil, false 222 } 223 224 if readerPool := enc.opts.SegmentReaderPool(); readerPool != nil { 225 reader := readerPool.Get() 226 reader.Reset(seg) 227 return reader, true 228 } 229 return xio.NewSegmentReader(seg), true 230 } 231 232 func (enc *Encoder) segmentZeroCopy(ctx context.Context) ts.Segment { 233 length := enc.stream.Len() 234 if enc.stream.Len() == 0 { 235 return ts.Segment{} 236 } 237 238 // We need a tail to capture an immutable snapshot of the encoder data 239 // as the last byte can change after this method returns. 240 rawBuffer, _ := enc.stream.RawBytes() 241 lastByte := rawBuffer[length-1] 242 243 // Take ref up to last byte. 244 headBytes := rawBuffer[:length-1] 245 246 // Zero copy from the output stream. 
247 var head checked.Bytes 248 if pool := enc.opts.CheckedBytesWrapperPool(); pool != nil { 249 head = pool.Get(headBytes) 250 } else { 251 head = checked.NewBytes(headBytes, nil) 252 } 253 254 // Make sure the ostream bytes ref is delayed from finalizing 255 // until this operation is complete (since this is zero copy). 256 buffer, _ := enc.stream.CheckedBytes() 257 ctx.RegisterCloser(buffer.DelayFinalizer()) 258 259 // Take a shared ref to a known good tail. 260 tail := tails[lastByte] 261 262 // Only discard the head since tails are shared for process life time. 263 return ts.NewSegment(head, tail, 0, ts.FinalizeHead) 264 } 265 266 func (enc *Encoder) segmentTakeOwnership() ts.Segment { 267 length := enc.stream.Len() 268 if length == 0 { 269 return ts.Segment{} 270 } 271 272 // Take ref from the ostream. 273 head := enc.stream.Discard() 274 275 return ts.NewSegment(head, nil, 0, ts.FinalizeHead) 276 } 277 278 // NumEncoded returns the number of encoded messages. 279 func (enc *Encoder) NumEncoded() int { 280 return enc.numEncoded 281 } 282 283 // LastEncoded returns the last encoded datapoint. Does not include 284 // annotation / protobuf message for interface purposes. 285 func (enc *Encoder) LastEncoded() (ts.Datapoint, error) { 286 if unusableErr := enc.isUsable(); unusableErr != nil { 287 return ts.Datapoint{}, unusableErr 288 } 289 290 if enc.numEncoded == 0 { 291 return ts.Datapoint{}, errNoEncodedDatapoints 292 } 293 294 // Value is meaningless for proto encoder and should already be zero, 295 // but set it again to be safe. 296 enc.lastEncodedDP.Value = 0 297 return enc.lastEncodedDP, nil 298 } 299 300 // LastAnnotationChecksum returns the checksum of the last encoded annotation (which contain the bytes 301 // used for ProtoBuf data). 
302 func (enc *Encoder) LastAnnotationChecksum() (uint64, error) { 303 if enc.numEncoded == 0 { 304 return 0, errNoEncodedDatapoints 305 } 306 307 return xxhash.Sum64(enc.prevAnnotation), nil 308 } 309 310 // Len returns the length of the data stream. 311 func (enc *Encoder) Len() int { 312 return enc.stream.Len() 313 } 314 315 // Empty returns true when underlying stream is empty. 316 func (enc *Encoder) Empty() bool { 317 return enc.stream.Empty() 318 } 319 320 // Stats returns EncoderStats which contain statistics about the encoders compression 321 // ratio. 322 func (enc *Encoder) Stats() EncoderStats { 323 return EncoderStats{ 324 UncompressedBytes: enc.stats.uncompressedBytes, 325 CompressedBytes: enc.Len(), 326 } 327 } 328 329 func (enc *Encoder) encodeStreamHeader() { 330 enc.encodeVarInt(currentEncodingSchemeVersion) 331 enc.encodeVarInt(uint64(enc.opts.ByteFieldDictionaryLRUSize())) 332 } 333 334 func (enc *Encoder) encodeCustomSchemaTypes() { 335 if len(enc.customFields) == 0 { 336 enc.encodeVarInt(0) 337 return 338 } 339 340 // Field numbers are 1-indexed so encoding the maximum field number 341 // at the beginning is equivalent to encoding the number of types 342 // we need to read after if we imagine that we're encoding a 1-indexed 343 // bitset where the position in the bitset encodes the field number (I.E 344 // the first value is the type for field number 1) and the values are 345 // the number of bits required to unique identify a custom type instead of 346 // just being a single bit (3 bits in the case of version 1 of the encoding 347 // scheme.) 348 maxFieldNum := enc.customFields[len(enc.customFields)-1].fieldNum 349 enc.encodeVarInt(uint64(maxFieldNum)) 350 351 // Start at 1 because we're zero-indexed. 
352 for i := 1; i <= maxFieldNum; i++ { 353 customTypeBits := uint64(notCustomEncodedField) 354 for _, customField := range enc.customFields { 355 if customField.fieldNum == i { 356 customTypeBits = uint64(customField.fieldType) 357 break 358 } 359 } 360 361 enc.stream.WriteBits( 362 customTypeBits, 363 numBitsToEncodeCustomType) 364 } 365 } 366 367 func (enc *Encoder) encodeProto(buf []byte) error { 368 var ( 369 sortedTopLevelScalarValues = enc.unmarshaller.sortedCustomFieldValues() 370 sortedTopLevelScalarValuesIdx = 0 371 lastMarshalledValue unmarshalValue 372 ) 373 374 // Loop through the customFields slice and sortedTopLevelScalarValues slice (both 375 // of which are sorted by field number) at the same time and match each customField 376 // to its encoded value in the stream (if any). 377 for i, customField := range enc.customFields { 378 if sortedTopLevelScalarValuesIdx < len(sortedTopLevelScalarValues) { 379 lastMarshalledValue = sortedTopLevelScalarValues[sortedTopLevelScalarValuesIdx] 380 } 381 382 lastMarshalledValueFieldNumber := -1 383 384 hasNext := sortedTopLevelScalarValuesIdx < len(sortedTopLevelScalarValues) 385 if hasNext { 386 lastMarshalledValueFieldNumber = int(lastMarshalledValue.fieldNumber) 387 } 388 389 // Since both the customFields slice and the sortedTopLevelScalarValues slice 390 // are sorted by field number, if the scalar slice contains no more values or 391 // it contains a next value, but the field number is not equal to the field number 392 // of the current customField, it is safe to conclude that the current customField's 393 // value was not encoded in this message which means that it should be interpreted 394 // as the default value for that field according to the proto3 specification. 
395 noMarshalledValue := (!hasNext || 396 customField.fieldNum != lastMarshalledValueFieldNumber) 397 if noMarshalledValue { 398 err := enc.encodeZeroValue(i) 399 if err != nil { 400 return err 401 } 402 continue 403 } 404 405 switch { 406 case isCustomFloatEncodedField(customField.fieldType): 407 enc.encodeTSZValue(i, lastMarshalledValue.asFloat64()) 408 409 case isCustomIntEncodedField(customField.fieldType): 410 if isUnsignedInt(customField.fieldType) { 411 enc.encodeUnsignedIntValue(i, lastMarshalledValue.asUint64()) 412 } else { 413 enc.encodeSignedIntValue(i, lastMarshalledValue.asInt64()) 414 } 415 416 case customField.fieldType == bytesField: 417 err := enc.encodeBytesValue(i, lastMarshalledValue.asBytes()) 418 if err != nil { 419 return err 420 } 421 422 case customField.fieldType == boolField: 423 enc.encodeBoolValue(i, lastMarshalledValue.asBool()) 424 425 default: 426 // This should never happen. 427 return fmt.Errorf( 428 "%s error no logic for custom encoding field number: %d", 429 encErrPrefix, customField.fieldNum) 430 } 431 432 sortedTopLevelScalarValuesIdx++ 433 } 434 435 if err := enc.encodeNonCustomValues(); err != nil { 436 return err 437 } 438 439 return nil 440 } 441 442 func (enc *Encoder) encodeZeroValue(i int) error { 443 customField := enc.customFields[i] 444 switch { 445 case isCustomFloatEncodedField(customField.fieldType): 446 var zeroFloat64 float64 447 enc.encodeTSZValue(i, zeroFloat64) 448 return nil 449 450 case isCustomIntEncodedField(customField.fieldType): 451 if isUnsignedInt(customField.fieldType) { 452 var zeroUInt64 uint64 453 enc.encodeUnsignedIntValue(i, zeroUInt64) 454 } else { 455 var zeroInt64 int64 456 enc.encodeSignedIntValue(i, zeroInt64) 457 } 458 return nil 459 460 case customField.fieldType == bytesField: 461 var zeroBytes []byte 462 return enc.encodeBytesValue(i, zeroBytes) 463 464 case customField.fieldType == boolField: 465 enc.encodeBoolValue(i, false) 466 return nil 467 468 default: 469 // This should never 
happen. 470 return fmt.Errorf( 471 "%s error no logic for custom encoding field number: %d", 472 encErrPrefix, customField.fieldNum) 473 } 474 } 475 476 // Reset resets the encoder for reuse. 477 func (enc *Encoder) Reset( 478 start xtime.UnixNano, 479 capacity int, 480 descr namespace.SchemaDescr, 481 ) { 482 enc.SetSchema(descr) 483 enc.reset(start, capacity) 484 } 485 486 // SetSchema sets the schema for the encoder. 487 func (enc *Encoder) SetSchema(descr namespace.SchemaDescr) { 488 if descr == nil { 489 enc.schemaDesc = nil 490 enc.resetSchema(nil) 491 return 492 } 493 494 // Noop if schema has not changed. 495 if enc.schemaDesc != nil && len(descr.DeployId()) != 0 && enc.schemaDesc.DeployId() == descr.DeployId() { 496 return 497 } 498 499 enc.schemaDesc = descr 500 enc.resetSchema(descr.Get().MessageDescriptor) 501 } 502 503 func (enc *Encoder) reset(start xtime.UnixNano, capacity int) { 504 enc.stream.Reset(enc.newBuffer(capacity)) 505 enc.timestampEncoder = m3tsz.NewTimestampEncoder( 506 start, enc.opts.DefaultTimeUnit(), enc.opts) 507 enc.lastEncodedDP = ts.Datapoint{} 508 509 // Prevent this from growing too large and remaining in the pools. 510 enc.marshalBuf = nil 511 512 if enc.schema != nil { 513 enc.customFields, enc.nonCustomFields = customAndNonCustomFields(enc.customFields, enc.nonCustomFields, enc.schema) 514 } 515 516 enc.closed = false 517 enc.numEncoded = 0 518 } 519 520 func (enc *Encoder) resetSchema(schema *desc.MessageDescriptor) { 521 enc.schema = schema 522 if enc.schema == nil { 523 // Clear but don't set to nil so they don't need to be reallocated 524 // next time. 
525 customFields := enc.customFields 526 for i := range customFields { 527 customFields[i] = customFieldState{} 528 } 529 enc.customFields = customFields[:0] 530 531 nonCustomFields := enc.nonCustomFields 532 for i := range nonCustomFields { 533 nonCustomFields[i] = marshalledField{} 534 } 535 enc.nonCustomFields = nonCustomFields[:0] 536 return 537 } 538 539 enc.customFields, enc.nonCustomFields = customAndNonCustomFields(enc.customFields, enc.nonCustomFields, enc.schema) 540 enc.hasEncodedSchema = false 541 } 542 543 // Close closes the encoder. 544 func (enc *Encoder) Close() { 545 if enc.closed { 546 return 547 } 548 549 enc.Reset(0, 0, nil) 550 enc.stream.Reset(nil) 551 enc.closed = true 552 553 if pool := enc.opts.EncoderPool(); pool != nil { 554 pool.Put(enc) 555 } 556 } 557 558 // Discard closes the encoder and transfers ownership of the data stream to 559 // the caller. 560 func (enc *Encoder) Discard() ts.Segment { 561 segment := enc.segmentTakeOwnership() 562 // Close the encoder since its no longer needed 563 enc.Close() 564 return segment 565 } 566 567 // DiscardReset does the same thing as Discard except it also resets the encoder 568 // for reuse. 569 func (enc *Encoder) DiscardReset( 570 start xtime.UnixNano, 571 capacity int, 572 descr namespace.SchemaDescr, 573 ) ts.Segment { 574 segment := enc.segmentTakeOwnership() 575 enc.Reset(start, capacity, descr) 576 return segment 577 } 578 579 // Bytes returns the raw bytes of the underlying data stream. Does not 580 // transfer ownership and is generally unsafe. 
581 func (enc *Encoder) Bytes() ([]byte, error) { 582 if unusableErr := enc.isUsable(); unusableErr != nil { 583 return nil, unusableErr 584 } 585 586 bytes, _ := enc.stream.RawBytes() 587 return bytes, nil 588 } 589 590 func (enc *Encoder) encodeTSZValue(i int, val float64) { 591 enc.customFields[i].floatEncAndIter.WriteFloat(enc.stream, val) 592 } 593 594 func (enc *Encoder) encodeSignedIntValue(i int, val int64) { 595 enc.customFields[i].intEncAndIter.encodeSignedIntValue(enc.stream, val) 596 } 597 598 func (enc *Encoder) encodeUnsignedIntValue(i int, val uint64) { 599 enc.customFields[i].intEncAndIter.encodeUnsignedIntValue(enc.stream, val) 600 } 601 602 func (enc *Encoder) encodeBytesValue(i int, val []byte) error { 603 var ( 604 customField = enc.customFields[i] 605 hash = xxhash.Sum64(val) 606 numPreviousBytes = len(customField.bytesFieldDict) 607 lastStateIdx = numPreviousBytes - 1 608 lastState encoderBytesFieldDictState 609 ) 610 if numPreviousBytes > 0 { 611 lastState = customField.bytesFieldDict[lastStateIdx] 612 } 613 614 if numPreviousBytes > 0 && hash == lastState.hash { 615 streamBytes, _ := enc.stream.RawBytes() 616 match, err := enc.bytesMatchEncodedDictionaryValue( 617 streamBytes, lastState, val) 618 if err != nil { 619 return fmt.Errorf( 620 "%s error checking if bytes match last encoded dictionary bytes: %v", 621 encErrPrefix, err) 622 } 623 if match { 624 // No changes control bit. 625 enc.stream.WriteBit(opCodeNoChange) 626 return nil 627 } 628 } 629 630 // Bytes changed control bit. 
631 enc.stream.WriteBit(opCodeChange) 632 633 streamBytes, _ := enc.stream.RawBytes() 634 for j, state := range customField.bytesFieldDict { 635 if hash != state.hash { 636 continue 637 } 638 639 match, err := enc.bytesMatchEncodedDictionaryValue( 640 streamBytes, state, val) 641 if err != nil { 642 return fmt.Errorf( 643 "%s error checking if bytes match encoded dictionary bytes: %v", 644 encErrPrefix, err) 645 } 646 if !match { 647 continue 648 } 649 650 // Control bit means interpret next n bits as the index for the previous write 651 // that this matches where n is the number of bits required to represent all 652 // possible array indices in the configured LRU size. 653 enc.stream.WriteBit(opCodeInterpretSubsequentBitsAsLRUIndex) 654 enc.stream.WriteBits( 655 uint64(j), 656 numBitsRequiredForNumUpToN( 657 enc.opts.ByteFieldDictionaryLRUSize())) 658 enc.moveToEndOfBytesDict(i, j) 659 return nil 660 } 661 662 // Control bit means interpret subsequent bits as varInt encoding length of a new 663 // []byte we haven't seen before. 664 enc.stream.WriteBit(opCodeInterpretSubsequentBitsAsBytesLengthVarInt) 665 666 length := len(val) 667 enc.encodeVarInt(uint64(length)) 668 669 // Add padding bits until we reach the next byte. This ensures that the startPos 670 // that we're going to store in the dictionary LRU will be aligned on a physical 671 // byte boundary which makes retrieving the bytes again later for comparison much 672 // easier. 673 // 674 // Note that this will waste up to a maximum of 7 bits per []byte that we encode 675 // which is acceptable for now, but in the future we may want to make the code able 676 // to do the comparison even if the bytes aren't aligned on a byte boundary in order 677 // to improve the compression. 
678 // 679 // Also this implementation had the side-effect of making encoding and decoding of 680 // []byte values much faster because for long []byte the encoder and iterator can avoid 681 // bit manipulation and calling WriteByte() / ReadByte() in a loop and can instead read the 682 // entire []byte in one go. 683 enc.padToNextByte() 684 685 // Track the byte position we're going to start at so we can store it in the LRU after. 686 streamBytes, _ = enc.stream.RawBytes() 687 bytePos := len(streamBytes) 688 689 // Write the actual bytes. 690 enc.stream.WriteBytes(val) 691 692 enc.addToBytesDict(i, encoderBytesFieldDictState{ 693 hash: hash, 694 startPos: uint32(bytePos), 695 length: uint32(length), 696 }) 697 return nil 698 } 699 700 func (enc *Encoder) encodeBoolValue(i int, val bool) { 701 if val { 702 enc.stream.WriteBit(opCodeBoolTrue) 703 } else { 704 enc.stream.WriteBit(opCodeBoolFalse) 705 } 706 } 707 708 func (enc *Encoder) encodeNonCustomValues() error { 709 if len(enc.nonCustomFields) == 0 { 710 // Fast path, skip all the encoding logic entirely because there are 711 // no fields that require proto encoding. 712 // TODO(rartoul): Note that the encoding scheme could be further optimized 713 // such that if there are no fields that require proto encoding then we don't 714 // need to waste this bit per write. 715 enc.stream.WriteBit(opCodeNoChange) 716 return nil 717 } 718 719 // Reset for re-use. 720 enc.fieldsChangedToDefault = enc.fieldsChangedToDefault[:0] 721 722 var ( 723 incomingNonCustomFields = enc.unmarshaller.sortedNonCustomFieldValues() 724 // Matching entries in two sorted lists in which every element in each list is unique so keep 725 // track of the last index at which a match was found so that subsequent inner loops can start 726 // at the next index. 727 lastMatchIdx = -1 728 numChangedValues = 0 729 ) 730 enc.marshalBuf = enc.marshalBuf[:0] // Reset buf for reuse. 
731 732 for i, existingField := range enc.nonCustomFields { 733 var curVal []byte 734 for i := lastMatchIdx + 1; i < len(incomingNonCustomFields); i++ { 735 incomingField := incomingNonCustomFields[i] 736 if existingField.fieldNum == incomingField.fieldNum { 737 curVal = incomingField.marshalled 738 lastMatchIdx = i 739 break 740 } 741 } 742 743 prevVal := existingField.marshalled 744 if bytes.Equal(prevVal, curVal) { 745 // No change, nothing to encode. 746 continue 747 } 748 749 numChangedValues++ 750 if curVal == nil { 751 // Interpret as default value. 752 enc.fieldsChangedToDefault = append(enc.fieldsChangedToDefault, existingField.fieldNum) 753 } 754 enc.marshalBuf = append(enc.marshalBuf, curVal...) 755 756 // Need to copy since the encoder no longer owns the original source of the bytes once 757 // this function returns. 758 enc.nonCustomFields[i].marshalled = append(enc.nonCustomFields[i].marshalled[:0], curVal...) 759 } 760 761 if numChangedValues <= 0 { 762 // Only want to skip encoding if nothing has changed AND we've already 763 // encoded the first message. 764 enc.stream.WriteBit(opCodeNoChange) 765 return nil 766 } 767 768 // Control bit indicating that proto values have changed. 769 enc.stream.WriteBit(opCodeChange) 770 if len(enc.fieldsChangedToDefault) > 0 { 771 // Control bit indicating that some fields have been set to default values 772 // and that a bitset will follow specifying which fields have changed. 773 enc.stream.WriteBit(opCodeFieldsSetToDefaultProtoMarshal) 774 enc.encodeBitset(enc.fieldsChangedToDefault) 775 } else { 776 // Control bit indicating that none of the changed fields have been set to 777 // their default values so we can do a clean merge on read. 
778 enc.stream.WriteBit(opCodeNoFieldsSetToDefaultProtoMarshal) 779 } 780 781 // This wastes up to 7 bits of space per encoded message but significantly improves encoding and 782 // decoding speed due to the fact that the OStream and IStream can write and read the data with 783 // the equivalent of one memcpy as opposed to having to decode one byte at a time due to lack 784 // of alignment. 785 enc.padToNextByte() 786 enc.encodeVarInt(uint64(len(enc.marshalBuf))) 787 enc.stream.WriteBytes(enc.marshalBuf) 788 789 return nil 790 } 791 792 func (enc *Encoder) isUsable() error { 793 if enc.closed { 794 return errEncoderClosed 795 } 796 797 return nil 798 } 799 800 func (enc *Encoder) bytesMatchEncodedDictionaryValue( 801 streamBytes []byte, 802 dictState encoderBytesFieldDictState, 803 currBytes []byte, 804 ) (bool, error) { 805 var ( 806 prevEncodedBytesStart = dictState.startPos 807 prevEncodedBytesEnd = prevEncodedBytesStart + dictState.length 808 ) 809 810 if prevEncodedBytesEnd > uint32(len(streamBytes)) { 811 // Should never happen. 812 return false, fmt.Errorf( 813 "bytes position in LRU is outside of stream bounds, streamSize: %d, startPos: %d, length: %d", 814 len(streamBytes), prevEncodedBytesStart, dictState.length) 815 } 816 817 return bytes.Equal(streamBytes[prevEncodedBytesStart:prevEncodedBytesEnd], currBytes), nil 818 } 819 820 // padToNextByte will add padding bits in the current byte until the ostream 821 // reaches the beginning of the next byte. This allows us begin encoding data 822 // with the guarantee that we're aligned at a physical byte boundary. 
823 func (enc *Encoder) padToNextByte() { 824 _, bitPos := enc.stream.RawBytes() 825 for bitPos%8 != 0 { 826 enc.stream.WriteBit(0) 827 bitPos++ 828 } 829 } 830 831 func (enc *Encoder) moveToEndOfBytesDict(fieldIdx, i int) { 832 existing := enc.customFields[fieldIdx].bytesFieldDict 833 for j := i; j < len(existing); j++ { 834 nextIdx := j + 1 835 if nextIdx >= len(existing) { 836 break 837 } 838 839 currVal := existing[j] 840 nextVal := existing[nextIdx] 841 existing[j] = nextVal 842 existing[nextIdx] = currVal 843 } 844 } 845 846 func (enc *Encoder) addToBytesDict(fieldIdx int, state encoderBytesFieldDictState) { 847 existing := enc.customFields[fieldIdx].bytesFieldDict 848 if len(existing) < enc.opts.ByteFieldDictionaryLRUSize() { 849 enc.customFields[fieldIdx].bytesFieldDict = append(existing, state) 850 return 851 } 852 853 // Shift everything down 1 and replace the last value to evict the 854 // least recently used entry and add the newest one. 855 // [1,2,3] 856 // becomes 857 // [2,3,3] 858 // after shift, and then becomes 859 // [2,3,4] 860 // after replacing the last value. 861 for i := range existing { 862 nextIdx := i + 1 863 if nextIdx >= len(existing) { 864 break 865 } 866 867 existing[i] = existing[nextIdx] 868 } 869 870 existing[len(existing)-1] = state 871 } 872 873 // encodeBitset writes out a bitset in the form of: 874 // 875 // varint(number of bits)|bitset 876 // 877 // I.E first it encodes a varint which specifies the number of following 878 // bits to interpret as a bitset and then it encodes the provided values 879 // as zero-indexed bitset. 880 func (enc *Encoder) encodeBitset(values []int32) { 881 var max int32 882 for _, v := range values { 883 if v > max { 884 max = v 885 } 886 } 887 888 // Encode a varint that indicates how many of the remaining 889 // bits to interpret as a bitset. 
890 enc.encodeVarInt(uint64(max)) 891 892 // Encode the bitset 893 for i := int32(0); i < max; i++ { 894 wroteExists := false 895 896 for _, v := range values { 897 // Subtract one because the values are 1-indexed but the bitset 898 // is 0-indexed. 899 if i == v-1 { 900 enc.stream.WriteBit(opCodeBitsetValueIsSet) 901 wroteExists = true 902 break 903 } 904 } 905 906 if wroteExists { 907 continue 908 } 909 910 enc.stream.WriteBit(opCodeBitsetValueIsNotSet) 911 } 912 } 913 914 func (enc *Encoder) encodeVarInt(x uint64) { 915 var ( 916 // Convert array to slice we can reuse the buffer. 917 buf = enc.varIntBuf[:] 918 numBytes = binary.PutUvarint(buf, x) 919 ) 920 921 // Reslice so we only write out as many bytes as is required 922 // to represent the number. 923 buf = buf[:numBytes] 924 enc.stream.WriteBytes(buf) 925 } 926 927 func (enc *Encoder) newBuffer(capacity int) checked.Bytes { 928 if bytesPool := enc.opts.BytesPool(); bytesPool != nil { 929 return bytesPool.Get(capacity) 930 } 931 return checked.NewBytes(make([]byte, 0, capacity), nil) 932 } 933 934 // tails is a list of all possible tails based on the 935 // byte value of the last byte. For the proto encoder 936 // they are all the same. 937 var tails [256]checked.Bytes 938 939 func init() { 940 for i := 0; i < 256; i++ { 941 tails[i] = checked.NewBytes([]byte{byte(i)}, nil) 942 } 943 }