github.com/minio/simdjson-go@v0.4.6-0.20231116094823-04d21cddf993/parsed_serialize.go

/*
 * MinIO Cloud Storage, (C) 2020 MinIO, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package simdjson

import (
	"bufio"
	"bytes"
	"encoding/binary"
	"errors"
	"fmt"
	"io"
	"math"
	"runtime"
	"sync"
	"unsafe"

	"github.com/klauspost/compress/s2"
	"github.com/klauspost/compress/zstd"
)

const (
	stringBits        = 14
	stringSize        = 1 << stringBits
	stringmask        = stringSize - 1
	serializedVersion = 3
)

// A Serializer serializes parsed JSON and allows reading it back.
// A Serializer can be reused, but not used concurrently.
type Serializer struct {
	// Compressed strings
	sMsg []byte

	// Uncompressed tags
	tagsBuf []byte
	// Values
	valuesBuf     []byte
	valuesCompBuf []byte
	tagsCompBuf   []byte

	compValues, compTags uint8
	compStrings          uint8
	fasterComp           bool

	// Deduplicated strings
	stringWr     io.Writer
	stringsTable [stringSize]uint32
	stringBuf    []byte

	maxBlockSize uint64
}

// NewSerializer will create and initialize a Serializer.
func NewSerializer() *Serializer {
	initSerializerOnce.Do(initSerializer)
	var s Serializer
	s.CompressMode(CompressDefault)
	s.maxBlockSize = 1 << 31
	return &s
}
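// Illustrative round trip (a sketch, not part of the original file; it
// assumes the package-level Parse function from this package):
//
//	pj, err := Parse([]byte(`{"hello":"world"}`), nil)
//	if err != nil {
//		// handle parse error
//	}
//	s := NewSerializer()
//	blob := s.Serialize(nil, *pj)         // serialize with CompressDefault
//	back, err := s.Deserialize(blob, nil) // reconstruct the tape
//	if err != nil {
//		// handle corrupt/truncated input
//	}
//	_ = back
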
type CompressMode uint8

const (
	// CompressNone no compression whatsoever.
	CompressNone CompressMode = iota

	// CompressFast will apply light compression,
	// but will not deduplicate strings which may affect deserialization speed.
	CompressFast

	// CompressDefault applies light compression and deduplicates strings.
	CompressDefault

	// CompressBest applies the strongest compression,
	// compressing all blocks with zstd.
	CompressBest
)

func (s *Serializer) CompressMode(c CompressMode) {
	switch c {
	case CompressNone:
		s.compValues = blockTypeUncompressed
		s.compTags = blockTypeUncompressed
		s.compStrings = blockTypeUncompressed
	case CompressFast:
		s.compValues = blockTypeS2
		s.compTags = blockTypeS2
		s.compStrings = blockTypeS2
		s.fasterComp = true
	case CompressDefault:
		s.compValues = blockTypeS2
		s.compTags = blockTypeZstd
		s.compStrings = blockTypeS2
	case CompressBest:
		s.compValues = blockTypeZstd
		s.compTags = blockTypeZstd
		s.compStrings = blockTypeZstd
	default:
		panic("unknown compression mode")
	}
}

func serializeNDStream(dst io.Writer, in <-chan Stream, reuse chan<- *ParsedJson, concurrency int, comp CompressMode) error {
	if concurrency <= 0 {
		concurrency = (runtime.GOMAXPROCS(0) + 1) / 2
	}
	var wg sync.WaitGroup
	wg.Add(concurrency)
	type workload struct {
		pj  *ParsedJson
		dst chan []byte
	}
	var readCh = make(chan workload, concurrency)
	var writeCh = make(chan chan []byte, concurrency)
	dstPool := sync.Pool{
		New: func() interface{} {
			return make([]byte, 0, 64<<10)
		},
	}
	for i := 0; i < concurrency; i++ {
		go func() {
			s := NewSerializer()
			s.CompressMode(comp)
			defer wg.Done()
			for input := range readCh {
				res := s.Serialize(dstPool.Get().([]byte)[:0], *input.pj)
				input.dst <- res
				select {
				case reuse <- input.pj:
				default:
				}
			}
		}()
	}
	var writeErr error
	var wwg sync.WaitGroup
	wwg.Add(1)
	go func() {
		defer wwg.Done()
		for block := range writeCh {
			b := <-block
			var n int
			n, writeErr = dst.Write(b)
			if n != len(b) {
				writeErr = io.ErrShortWrite
			}
		}
	}()
	var readErr error
	var rwg sync.WaitGroup
	rwg.Add(1)
	go func() {
		defer rwg.Done()
		defer close(readCh)
		for block := range in {
			if block.Error != nil {
				readErr = block.Error
				continue
			}
			w := workload{
				pj:  block.Value,
				dst: make(chan []byte, 1),
			}
			// Queue the result channel for the writer first,
			// so output is written in input order.
			writeCh <- w.dst
			readCh <- w
		}
	}()
	rwg.Wait()
	if readErr != nil {
		wg.Wait()
		close(writeCh)
		wwg.Wait()
		return readErr
	}
	// Read done, wait for workers...
	wg.Wait()
	close(writeCh)
	// Wait for writer...
	wwg.Wait()
	return writeErr
}

const (
	tagFloatWithFlag = Tag('e')
)
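// serializeNDStream is unexported; inside this package it could be fed from a
// line-delimited JSON stream roughly like this (a sketch, assuming the
// package's ParseNDStream helper):
//
//	stream := make(chan Stream, 4)
//	reuse := make(chan *ParsedJson, 4)
//	go ParseNDStream(r, stream, reuse)
//	// Concurrency 0 lets serializeNDStream pick ~half of GOMAXPROCS.
//	err := serializeNDStream(w, stream, reuse, 0, CompressDefault)
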
// Serialize the data in pj and return the data.
// An optional destination can be provided.
func (s *Serializer) Serialize(dst []byte, pj ParsedJson) []byte {
	// Blocks:
	//  - Compressed size of entire block following. Can be 0 if empty. (varuint)
	//  - Block type, byte:
	//     0: uncompressed, rest is data.
	//     1: S2 compressed stream.
	//     2: Zstd block.
	//  - Compressed data.
	//
	// Serialized format:
	//  - Header: Version (byte)
	//  - Compressed size of remaining data (varuint). Excludes previous and size of this.
	//  - Tape size, uncompressed (varuint)
	//  - Strings size, uncompressed (varuint)
	//  - Strings Block: Compressed block. See above.
	//  - Message size, uncompressed (varuint)
	//  - Message Block: Compressed block. See above.
	//  - Uncompressed size of tags (varuint)
	//  - Tags Block: Compressed block. See above.
	//  - Uncompressed values size (varuint)
	//  - Values Block: Compressed block. See above.
	//
	// Reconstruction:
	//
	// Read next tag. Depending on the tag, read a number of values:
	//  - Null, BoolTrue/BoolFalse: No value.
	//  - Nop: No value stored; skip distances must be reconstructed.
	//  - TagObjectStart, TagArrayStart, TagRoot: (Offset - Current offset). Write end tag for object and array.
	//  - TagObjectEnd, TagArrayEnd: No value stored, derived from start.
	//  - TagInteger, TagUint, TagFloat: 64 bits.
	//  - TagString: offset, length stored.
	//  - tagFloatWithFlag (v2): Contains float parsing flag.
	//
	// If any tags or values are left over once the tape is full, the input is considered invalid.

	var wg sync.WaitGroup

	// Reset lookup table.
	// Offsets are offset by 1, so 0 indicates an unfilled entry.
	for i := range s.stringsTable[:] {
		s.stringsTable[i] = 0
	}
	if len(s.stringBuf) > 0 {
		s.stringBuf = s.stringBuf[:0]
	}
	if len(s.sMsg) > 0 {
		s.sMsg = s.sMsg[:0]
	}

	msgWr, msgDone := encBlock(s.compStrings, s.sMsg, s.fasterComp)
	s.stringWr = msgWr

	const tagBufSize = 64 << 10
	const valBufSize = 64 << 10

	valWr, valDone := encBlock(s.compValues, s.valuesCompBuf, s.fasterComp)
	tagWr, tagDone := encBlock(s.compTags, s.tagsCompBuf, s.fasterComp)
	// Pessimistically allocate for maximum possible size.
	if cap(s.tagsBuf) <= tagBufSize {
		s.tagsBuf = make([]byte, tagBufSize)
	}
	s.tagsBuf = s.tagsBuf[:tagBufSize]

	// At most one value per 2 tape entries
	if cap(s.valuesBuf) < valBufSize+4 {
		s.valuesBuf = make([]byte, valBufSize+4)
	}

	s.valuesBuf = s.valuesBuf[:0]
	off := 0
	tagsOff := 0
	var tmp [8]byte
	rawValues := 0
	rawTags := 0
	for off < len(pj.Tape) {
		if tagsOff >= tagBufSize {
			rawTags += tagsOff
			tagWr.Write(s.tagsBuf[:tagsOff])
			tagsOff = 0
		}
		if len(s.valuesBuf) >= valBufSize {
			rawValues += len(s.valuesBuf)
			valWr.Write(s.valuesBuf)
			s.valuesBuf = s.valuesBuf[:0]
		}
		entry := pj.Tape[off]
		ntype := Tag(entry >> 56)
		payload := entry & JSONVALUEMASK

		switch ntype {
		case TagNop:
			// We recreate the skip count when we unmarshal
		case TagString:
			sb, err := pj.stringByteAt(payload, pj.Tape[off+1])
			if err != nil {
				panic(err)
			}
			offset := s.indexString(sb)

			binary.LittleEndian.PutUint64(tmp[:], offset)
			s.valuesBuf = append(s.valuesBuf, tmp[:]...)
			binary.LittleEndian.PutUint64(tmp[:], uint64(len(sb)))
			s.valuesBuf = append(s.valuesBuf, tmp[:]...)
			off++
		case TagUint:
			binary.LittleEndian.PutUint64(tmp[:], pj.Tape[off+1])
			s.valuesBuf = append(s.valuesBuf, tmp[:]...)
			off++
		case TagInteger:
			binary.LittleEndian.PutUint64(tmp[:], pj.Tape[off+1])
			s.valuesBuf = append(s.valuesBuf, tmp[:]...)
			off++
		case TagFloat:
			if payload == 0 {
				binary.LittleEndian.PutUint64(tmp[:], pj.Tape[off+1])
				s.valuesBuf = append(s.valuesBuf, tmp[:]...)
				off++
			} else {
				ntype = tagFloatWithFlag
				binary.LittleEndian.PutUint64(tmp[:], entry)
				s.valuesBuf = append(s.valuesBuf, tmp[:]...)
				binary.LittleEndian.PutUint64(tmp[:], pj.Tape[off+1])
				s.valuesBuf = append(s.valuesBuf, tmp[:]...)
				off++
			}
		case TagNull, TagBoolTrue, TagBoolFalse:
			// No value.
		case TagObjectStart, TagArrayStart, TagRoot:
			// TagObjectStart and TagArrayStart always point forward.
			// TagRoot can point either direction, so we rely on under/overflow.
			binary.LittleEndian.PutUint64(tmp[:], payload-uint64(off))
			s.valuesBuf = append(s.valuesBuf, tmp[:]...)
		case TagObjectEnd, TagArrayEnd, TagEnd:
			// Value can be deduced from the start tag, or there is no value.
		default:
			wg.Wait()
			panic(fmt.Errorf("unknown tag: %d", int(ntype)))
		}
		s.tagsBuf[tagsOff] = uint8(ntype)
		tagsOff++
		off++
	}
	if tagsOff > 0 {
		rawTags += tagsOff
		tagWr.Write(s.tagsBuf[:tagsOff])
	}
	if len(s.valuesBuf) > 0 {
		rawValues += len(s.valuesBuf)
		valWr.Write(s.valuesBuf)
	}
	wg.Add(3)
	go func() {
		var err error
		s.tagsCompBuf, err = tagDone()
		if err != nil {
			panic(err)
		}
		wg.Done()
	}()
	go func() {
		var err error
		s.valuesCompBuf, err = valDone()
		if err != nil {
			panic(err)
		}
		wg.Done()
	}()
	go func() {
		var err error
		s.sMsg, err = msgDone()
		if err != nil {
			panic(err)
		}
		wg.Done()
	}()

	// Wait for compressors
	wg.Wait()

	// Version
	dst = append(dst, serializedVersion)

	// Size of varints...
	varInts := binary.PutUvarint(tmp[:], uint64(0)) +
		binary.PutUvarint(tmp[:], uint64(len(s.sMsg))) +
		binary.PutUvarint(tmp[:], uint64(rawTags)) +
		binary.PutUvarint(tmp[:], uint64(len(s.tagsCompBuf))) +
		binary.PutUvarint(tmp[:], uint64(rawValues)) +
		binary.PutUvarint(tmp[:], uint64(len(s.valuesCompBuf))) +
		binary.PutUvarint(tmp[:], uint64(len(s.stringBuf))) +
		binary.PutUvarint(tmp[:], uint64(len(pj.Tape)))

	n := binary.PutUvarint(tmp[:], uint64(1+len(s.sMsg)+len(s.tagsCompBuf)+len(s.valuesCompBuf)+varInts))
	dst = append(dst, tmp[:n]...)

	// Tape elements, uncompressed.
	n = binary.PutUvarint(tmp[:], uint64(len(pj.Tape)))
	dst = append(dst, tmp[:n]...)

	// Strings uncompressed size
	dst = append(dst, 0)
	// Strings
	dst = append(dst, 0)

	// Messages uncompressed size
	n = binary.PutUvarint(tmp[:], uint64(len(s.stringBuf)))
	dst = append(dst, tmp[:n]...)
	// Message
	n = binary.PutUvarint(tmp[:], uint64(len(s.sMsg)))
	dst = append(dst, tmp[:n]...)
	dst = append(dst, s.sMsg...)

	// Tags
	n = binary.PutUvarint(tmp[:], uint64(rawTags))
	dst = append(dst, tmp[:n]...)
	n = binary.PutUvarint(tmp[:], uint64(len(s.tagsCompBuf)))
	dst = append(dst, tmp[:n]...)
	dst = append(dst, s.tagsCompBuf...)

	// Values
	n = binary.PutUvarint(tmp[:], uint64(rawValues))
	dst = append(dst, tmp[:n]...)
	n = binary.PutUvarint(tmp[:], uint64(len(s.valuesCompBuf)))
	dst = append(dst, tmp[:n]...)
	dst = append(dst, s.valuesCompBuf...)
	if false {
		fmt.Println("strings:", len(pj.Strings.B)+len(pj.Message), "->", len(s.sMsg),
			"tags:", rawTags, "->", len(s.tagsCompBuf),
			"values:", rawValues, "->", len(s.valuesCompBuf),
			"Total:", len(pj.Message)+len(pj.Strings.B)+len(pj.Tape)*8, "->", len(dst))
	}

	return dst
}
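// Comparing the modes on a document is a quick way to choose one (a sketch;
// it assumes a previously parsed ParsedJson pj, and the size/speed trade-off
// depends heavily on the input):
//
//	s := NewSerializer()
//	for _, mode := range []CompressMode{CompressNone, CompressFast, CompressDefault, CompressBest} {
//		s.CompressMode(mode)
//		fmt.Println(mode, "->", len(s.Serialize(nil, *pj)), "bytes")
//	}
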
// splitBlocks will split a stream of serialized documents into individual
// blocks and send each block on out, which is closed when the stream ends.
func (s *Serializer) splitBlocks(r io.Reader, out chan []byte) error {
	br := bufio.NewReader(r)
	defer close(out)
	for {
		if v, err := br.ReadByte(); err != nil {
			return err
		} else if v != 1 {
			return errors.New("unknown version")
		}

		// Comp size
		c, err := binary.ReadUvarint(br)
		if err != nil {
			return err
		}
		if c > s.maxBlockSize {
			return errors.New("compressed block too big")
		}
		block := make([]byte, c)
		n, err := io.ReadFull(br, block)
		if err != nil {
			return err
		}
		if n > 0 {
			out <- block
		}
	}
}
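// Draining splitBlocks (a sketch): the method closes out when it is done and
// returns io.EOF on a cleanly terminated stream.
//
//	out := make(chan []byte, 4)
//	errCh := make(chan error, 1)
//	go func() { errCh <- s.splitBlocks(r, out) }()
//	for blob := range out {
//		_ = blob // one serialized document per block
//	}
//	if err := <-errCh; err != io.EOF {
//		// handle stream error
//	}
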
// Deserialize the content in src.
// Only basic sanity checks will be performed.
// Slight corruption will likely go through unnoticed.
// An optional destination can be provided.
func (s *Serializer) Deserialize(src []byte, dst *ParsedJson) (*ParsedJson, error) {
	br := bytes.NewBuffer(src)

	if v, err := br.ReadByte(); err != nil {
		return dst, err
	} else if v > serializedVersion {
		// v3 reads v2.
		// v2 reads v1.
		return dst, errors.New("unknown version")
	}

	if dst == nil {
		dst = &ParsedJson{}
	}

	// Comp size
	if c, err := binary.ReadUvarint(br); err != nil {
		return dst, err
	} else if int(c) > br.Len() {
		return dst, fmt.Errorf("stream too short, want %d, only have %d left", c, br.Len())
	}

	// Tape size
	if ts, err := binary.ReadUvarint(br); err != nil {
		return dst, err
	} else {
		if uint64(cap(dst.Tape)) < ts {
			dst.Tape = make([]uint64, ts)
		}
		dst.Tape = dst.Tape[:ts]
	}

	// String size
	if ss, err := binary.ReadUvarint(br); err != nil {
		return dst, err
	} else {
		if dst.Strings == nil || uint64(cap(dst.Strings.B)) < ss {
			dst.Strings = &TStrings{B: make([]byte, ss)}
		}
		dst.Strings.B = dst.Strings.B[:ss]
	}

	// Decompress strings
	var sWG sync.WaitGroup
	var stringsErr, msgErr error
	err := s.decBlock(br, dst.Strings.B, &sWG, &stringsErr)
	if err != nil {
		return dst, err
	}

	// Message size
	if ss, err := binary.ReadUvarint(br); err != nil {
		return dst, err
	} else {
		if uint64(cap(dst.Message)) < ss || dst.Message == nil {
			dst.Message = make([]byte, ss)
		}
		dst.Message = dst.Message[:ss]
	}

	// Messages
	err = s.decBlock(br, dst.Message, &sWG, &msgErr)
	if err != nil {
		return dst, err
	}
	defer sWG.Wait()

	// Decompress tags
	if tags, err := binary.ReadUvarint(br); err != nil {
		return dst, err
	} else {
		if uint64(cap(s.tagsBuf)) < tags {
			s.tagsBuf = make([]byte, tags)
		}
		s.tagsBuf = s.tagsBuf[:tags]
	}

	var wg sync.WaitGroup
	var tagsErr error
	err = s.decBlock(br, s.tagsBuf, &wg, &tagsErr)
	if err != nil {
		return dst, fmt.Errorf("decompressing tags: %w", err)
	}
	defer wg.Wait()

	// Decompress values
	if vals, err := binary.ReadUvarint(br); err != nil {
		return dst, err
	} else {
		if uint64(cap(s.valuesBuf)) < vals {
			s.valuesBuf = make([]byte, vals)
		}
		s.valuesBuf = s.valuesBuf[:vals]
	}

	var valsErr error
	err = s.decBlock(br, s.valuesBuf, &wg, &valsErr)
	if err != nil {
		return dst, fmt.Errorf("decompressing values: %w", err)
	}

	// Wait until we have what we need for the tape.
	wg.Wait()
	switch {
	case tagsErr != nil:
		return dst, fmt.Errorf("decompressing tags: %w", tagsErr)
	case valsErr != nil:
		return dst, fmt.Errorf("decompressing values: %w", valsErr)
	}

	// Reconstruct tape:
	var off int
	values := s.valuesBuf
	nSkips := 0
	for _, t := range s.tagsBuf {
		if off == len(dst.Tape) {
			return dst, errors.New("tags extended beyond tape")
		}
		tag := Tag(t)

		tagDst := uint64(t) << 56
		if nSkips > 0 && tag != TagNop {
			// We owe skips. Add with jumps.
			for i := 0; i < nSkips; i++ {
				dst.Tape[off] = (uint64(TagNop) << JSONTAGOFFSET) | uint64(nSkips-i)
				off++
			}
			nSkips = 0
		}
		switch tag {
		case TagNop:
			nSkips++
		case TagString:
			if len(values) < 16 {
				return dst, fmt.Errorf("reading %v: no values left", tag)
			}
			sOffset := binary.LittleEndian.Uint64(values[:8])
			sLen := binary.LittleEndian.Uint64(values[8:16])
			values = values[16:]

			dst.Tape[off] = tagDst | sOffset
			dst.Tape[off+1] = sLen
			off += 2
		case TagFloat, TagInteger, TagUint:
			if len(values) < 8 {
				return dst, fmt.Errorf("reading %v: no values left", tag)
			}
			dst.Tape[off] = tagDst
			dst.Tape[off+1] = binary.LittleEndian.Uint64(values[:8])
			values = values[8:]
			off += 2
		case tagFloatWithFlag:
			// Tape contains full value
			if len(values) < 16 {
				return dst, fmt.Errorf("reading %v: no values left", tag)
			}
			dst.Tape[off] = binary.LittleEndian.Uint64(values[:8])
			dst.Tape[off+1] = binary.LittleEndian.Uint64(values[8:16])
			values = values[16:]
			off += 2
		case TagNull, TagBoolTrue, TagBoolFalse, TagEnd:
			dst.Tape[off] = tagDst
			off++
		case TagObjectStart, TagArrayStart:
			if len(values) < 8 {
				return dst, fmt.Errorf("reading %v: no values left", tag)
			}
			// Always forward
			val := binary.LittleEndian.Uint64(values[:8])
			values = values[8:]
			val += uint64(off)
			if val > uint64(len(dst.Tape)) {
				return dst, fmt.Errorf("%v extends beyond tape (%d). offset:%d", tag, len(dst.Tape), val)
			}

			dst.Tape[off] = tagDst | val
			// Write closing...
			dst.Tape[val-1] = uint64(tagOpenToClose[tag])<<56 | uint64(off)

			off++
		case TagRoot:
			if len(values) < 8 {
				return dst, fmt.Errorf("reading %v: no values left", tag)
			}
			// TagRoot can point either direction; the addition relies on wraparound.
			val := binary.LittleEndian.Uint64(values[:8])
			values = values[8:]
			val += uint64(off)
			if val > uint64(len(dst.Tape)) {
				return dst, fmt.Errorf("%v extends beyond tape (%d). offset:%d", tag, len(dst.Tape), val)
			}

			dst.Tape[off] = tagDst | val

			off++
		case TagObjectEnd, TagArrayEnd:
			// This should already have been written by the matching start tag.
			if dst.Tape[off]&JSONTAGMASK != tagDst {
				return dst, fmt.Errorf("reading %v, offset:%d, start tag did not match %x != %x", tag, off, dst.Tape[off]>>56, t)
			}
			off++
		default:
			return nil, fmt.Errorf("unknown tag: %v", tag)
		}
	}
	if nSkips > 0 {
		// We owe skips. Add with jumps.
		for i := 0; i < nSkips; i++ {
			dst.Tape[off] = (uint64(TagNop) << JSONTAGOFFSET) | uint64(nSkips-i)
			off++
		}
		nSkips = 0
	}
	sWG.Wait()
	if off != len(dst.Tape) {
		return dst, fmt.Errorf("tags did not fill tape, want %d, got %d", len(dst.Tape), off)
	}
	if len(values) > 0 {
		return dst, fmt.Errorf("%d value bytes left over after tape was filled", len(values))
	}
	if stringsErr != nil {
		return dst, fmt.Errorf("reading strings: %w", stringsErr)
	}
	if msgErr != nil {
		return dst, fmt.Errorf("reading message: %w", msgErr)
	}
	return dst, nil
}
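// Reusing the destination amortizes tape and buffer allocations across many
// documents (a sketch; Iter is assumed to be this package's tape iterator):
//
//	var dst *ParsedJson
//	for _, blob := range blobs {
//		var err error
//		if dst, err = s.Deserialize(blob, dst); err != nil {
//			// handle error
//		}
//		iter := dst.Iter()
//		_ = iter // walk the reconstructed tape
//	}
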
// decBlock will decode a single block into dst.
// Uncompressed blocks are copied inline; S2 and zstd blocks are decompressed
// in a goroutine tracked by wg, which stores its result in *dstErr.
func (s *Serializer) decBlock(br *bytes.Buffer, dst []byte, wg *sync.WaitGroup, dstErr *error) error {
	size, err := binary.ReadUvarint(br)
	if err != nil {
		return err
	}
	if size > uint64(br.Len()) {
		return fmt.Errorf("block size (%d) extends beyond input %d", size, br.Len())
	}
	if size == 0 && len(dst) == 0 {
		// Nothing, no compress type
		return nil
	}
	if size < 1 {
		return fmt.Errorf("block size (%d) too small %d", size, br.Len())
	}

	typ, err := br.ReadByte()
	if err != nil {
		return err
	}
	size--
	compressed := br.Next(int(size))
	if len(compressed) != int(size) {
		return errors.New("short block section")
	}
	switch typ {
	case blockTypeUncompressed:
		// uncompressed
		if len(compressed) != len(dst) {
			return fmt.Errorf("short uncompressed block: in (%d) != out (%d)", len(compressed), len(dst))
		}
		copy(dst, compressed)
	case blockTypeS2:
		wg.Add(1)
		go func() {
			defer wg.Done()
			buf := bytes.NewBuffer(compressed)
			dec := s2Readers.Get().(*s2.Reader)
			dec.Reset(buf)
			_, err := io.ReadFull(dec, dst)
			dec.Reset(nil)
			s2Readers.Put(dec)
			*dstErr = err
		}()
	case blockTypeZstd:
		wg.Add(1)
		go func() {
			defer wg.Done()
			want := len(dst)
			// Use a local error so we do not race on the enclosing function's err.
			dec, err := zDec.DecodeAll(compressed, dst[:0])
			if err == nil && want != len(dec) {
				err = errors.New("zstd decompressed size mismatch")
			}
			*dstErr = err
		}()
	default:
		return fmt.Errorf("unknown compression type: %d", typ)
	}
	return nil
}
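// The framing decBlock consumes, decoded by hand for the uncompressed case
// (a sketch; br is a *bytes.Buffer positioned at a block):
//
//	size, _ := binary.ReadUvarint(br) // block length, including the type byte
//	typ, _ := br.ReadByte()           // blockTypeUncompressed / blockTypeS2 / blockTypeZstd
//	payload := br.Next(int(size) - 1) // for type 0 this is copied straight to dst
//	_, _ = typ, payload
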
const (
	blockTypeUncompressed byte = 0
	blockTypeS2           byte = 1
	blockTypeZstd         byte = 2
)

var zDec *zstd.Decoder

var zEncFast = sync.Pool{New: func() interface{} {
	e, _ := zstd.NewWriter(nil, zstd.WithEncoderLevel(zstd.SpeedDefault), zstd.WithEncoderCRC(false))
	return e
}}

var s2FastWriters = sync.Pool{New: func() interface{} {
	return s2.NewWriter(nil)
}}

var s2Writers = sync.Pool{New: func() interface{} {
	return s2.NewWriter(nil, s2.WriterBetterCompression())
}}

var s2Readers = sync.Pool{New: func() interface{} {
	return s2.NewReader(nil)
}}

var initSerializerOnce sync.Once

func initSerializer() {
	zDec, _ = zstd.NewReader(nil)
}

type encodedResult func() ([]byte, error)
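// encBlock and decBlock are symmetric once the size varuint is prepended
// (a sketch within this package; payload is any byte slice):
//
//	w, done := encBlock(blockTypeS2, nil, false)
//	w.Write(payload)
//	block, _ := done() // type byte + compressed payload
//	framed := binary.AppendUvarint(nil, uint64(len(block)))
//	framed = append(framed, block...)
//	out := make([]byte, len(payload))
//	var wg sync.WaitGroup
//	var decErr error
//	s := NewSerializer()
//	_ = s.decBlock(bytes.NewBuffer(framed), out, &wg, &decErr)
//	wg.Wait() // out now equals payload if decErr == nil
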
// encBlock will encode a block of data with the given mode.
// The returned writer receives the payload; calling the returned function
// flushes the compressor and returns the finished block.
func encBlock(mode byte, buf []byte, fast bool) (io.Writer, encodedResult) {
	dst := bytes.NewBuffer(buf[:0])
	dst.WriteByte(mode)
	switch mode {
	case blockTypeUncompressed:
		return dst, func() ([]byte, error) {
			return dst.Bytes(), nil
		}
	case blockTypeS2:
		var enc *s2.Writer
		var put *sync.Pool
		if fast {
			enc = s2FastWriters.Get().(*s2.Writer)
			put = &s2FastWriters
		} else {
			enc = s2Writers.Get().(*s2.Writer)
			put = &s2Writers
		}
		enc.Reset(dst)
		return enc, func() (i []byte, err error) {
			err = enc.Close()
			if err != nil {
				return nil, err
			}
			enc.Reset(nil)
			put.Put(enc)
			return dst.Bytes(), nil
		}
	case blockTypeZstd:
		enc := zEncFast.Get().(*zstd.Encoder)
		enc.Reset(dst)
		return enc, func() (i []byte, err error) {
			err = enc.Close()
			if err != nil {
				return nil, err
			}
			enc.Reset(nil)
			zEncFast.Put(enc)
			return dst.Bytes(), nil
		}
	}
	panic("unknown compression mode")
}

// indexString will deduplicate strings and populate the string buffer,
// returning the offset of sb within s.stringBuf.
func (s *Serializer) indexString(sb []byte) (offset uint64) {
	// Only possible on 64 bit platforms, so it will never trigger on 32 bit platforms.
	if uint64(len(sb)) >= math.MaxUint32 {
		panic("string too long")
	}

	h := memHash(sb) & stringmask
	off := int(s.stringsTable[h]) - 1
	end := off + len(sb)
	if off >= 0 && end <= len(s.stringBuf) {
		found := s.stringBuf[off:end]
		if bytes.Equal(found, sb) {
			return uint64(off)
		}
		// It didn't match :(
	}
	off = len(s.stringBuf)
	s.stringBuf = append(s.stringBuf, sb...)
	s.stringsTable[h] = uint32(off + 1)
	s.stringWr.Write(sb)
	return uint64(off)
}

//go:noescape
//go:linkname memhash runtime.memhash
func memhash(p unsafe.Pointer, h, s uintptr) uintptr

// memHash is the hash function used by go map, it utilizes available hardware
// instructions (behaves as aeshash if aes instruction is available).
// NOTE: The hash seed changes for every process. So, this cannot be used as a
// persistent hash.
func memHash(data []byte) uint64 {
	ss := (*stringStruct)(unsafe.Pointer(&data))
	return uint64(memhash(ss.str, 0, uintptr(ss.len)))
}

type stringStruct struct {
	str unsafe.Pointer
	len int
}
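// memHash is stable within a process but not across processes (a sketch):
//
//	h1 := memHash([]byte("repeated")) & stringmask
//	h2 := memHash([]byte("repeated")) & stringmask
//	// h1 == h2 here, but a different run of the program may produce a
//	// different value, so these hashes must never be persisted.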