github.com/dolthub/dolt/go@v0.40.5-0.20240520175717-68db7794bea6/store/nbs/table_reader.go

// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file incorporates work covered by the following copyright and
// permission notice:
//
// Copyright 2016 Attic Labs, Inc. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0

package nbs

import (
	"context"
	"encoding/binary"
	"errors"
	"io"
	"sort"

	"github.com/golang/snappy"
	"golang.org/x/sync/errgroup"

	"github.com/dolthub/dolt/go/store/chunks"
	"github.com/dolthub/dolt/go/store/hash"
)

// Do not read more than 128MB at a time.
const maxReadSize = 128 * 1024 * 1024

// CompressedChunk represents a chunk of data in a table file which is still compressed via snappy.
type CompressedChunk struct {
	// H is the hash of the chunk
	H hash.Hash

	// FullCompressedChunk is the entirety of the compressed chunk data including the crc
	FullCompressedChunk []byte

	// CompressedData is just the snappy encoded byte buffer that stores the chunk data
	CompressedData []byte
}

// NewCompressedChunk creates a CompressedChunk
func NewCompressedChunk(h hash.Hash, buff []byte) (CompressedChunk, error) {
	dataLen := uint64(len(buff)) - checksumSize

	chksum := binary.BigEndian.Uint32(buff[dataLen:])
	compressedData := buff[:dataLen]

	if chksum != crc(compressedData) {
		return CompressedChunk{}, errors.New("checksum error")
	}

	return CompressedChunk{H: h, FullCompressedChunk: buff, CompressedData: compressedData}, nil
}

// ToChunk snappy decodes the compressed data and returns a chunks.Chunk
func (cmp CompressedChunk) ToChunk() (chunks.Chunk, error) {
	data, err := snappy.Decode(nil, cmp.CompressedData)
	if err != nil {
		return chunks.Chunk{}, err
	}

	return chunks.NewChunkWithHash(cmp.H, data), nil
}

// ChunkToCompressedChunk snappy encodes the chunk's data and appends a
// 4-byte big-endian crc of the encoded bytes.
func ChunkToCompressedChunk(chunk chunks.Chunk) CompressedChunk {
	compressed := snappy.Encode(nil, chunk.Data())
	length := len(compressed)
	// todo: this append allocates a new buffer and copies |compressed|.
	// This is costly, but maybe better, as it allows us to reclaim the
	// extra space allocated in snappy.Encode (see snappy.MaxEncodedLen).
	compressed = append(compressed, []byte{0, 0, 0, 0}...)
	binary.BigEndian.PutUint32(compressed[length:], crc(compressed[:length]))
	return CompressedChunk{H: chunk.Hash(), FullCompressedChunk: compressed, CompressedData: compressed[:length]}
}

// Hash returns the hash of the data
func (cmp CompressedChunk) Hash() hash.Hash {
	return cmp.H
}
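// compressRoundTripSketch is an editorial illustration, not part of the
// original reader: it sketches the round trip between chunks.Chunk and
// CompressedChunk using only the helpers above. FullCompressedChunk is the
// snappy payload followed by a 4-byte big-endian crc, which NewCompressedChunk
// re-verifies exactly as table readers do after a physical read.
func compressRoundTripSketch(c chunks.Chunk) (bool, error) {
	cmp := ChunkToCompressedChunk(c)

	// Re-parse the serialized form (payload + crc), as get() does below.
	reparsed, err := NewCompressedChunk(cmp.Hash(), cmp.FullCompressedChunk)
	if err != nil {
		return false, err
	}

	// Decode back to a chunk and confirm the payload survived intact.
	out, err := reparsed.ToChunk()
	if err != nil {
		return false, err
	}
	return out.Hash() == c.Hash() && len(out.Data()) == len(c.Data()), nil
}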
// IsEmpty returns true if the chunk contains no data.
func (cmp CompressedChunk) IsEmpty() bool {
	return len(cmp.CompressedData) == 0 || (len(cmp.CompressedData) == 1 && cmp.CompressedData[0] == 0)
}

// CompressedSize returns the size of this CompressedChunk.
func (cmp CompressedChunk) CompressedSize() int {
	return len(cmp.CompressedData)
}

var EmptyCompressedChunk CompressedChunk

func init() {
	EmptyCompressedChunk = ChunkToCompressedChunk(chunks.EmptyChunk)
}

// ErrInvalidTableFile is an error returned when a table file is corrupt or invalid.
var ErrInvalidTableFile = errors.New("invalid or corrupt table file")
var ErrUnsupportedTableFileFormat = errors.New("unsupported table file format")

type indexEntry interface {
	Offset() uint64
	Length() uint32
}

type indexResult struct {
	offset uint64
	length uint32
}

func (ir indexResult) Offset() uint64 {
	return ir.offset
}

func (ir indexResult) Length() uint32 {
	return ir.length
}

type tableReaderAt interface {
	ReadAtWithStats(ctx context.Context, p []byte, off int64, stats *Stats) (n int, err error)
	Reader(ctx context.Context) (io.ReadCloser, error)
	Close() error
	clone() (tableReaderAt, error)
}

// tableReader implements get & has queries against a single nbs table. goroutine safe.
// |blockSize| refers to the block-size of the underlying storage. We assume that, each
// time we read data, we actually have to read in blocks of this size. So, we're willing
// to tolerate up to |blockSize| overhead each time we read a chunk, if it helps us group
// more chunks together into a single read request to backing storage.
type tableReader struct {
	prefixes  []uint64
	idx       tableIndex
	r         tableReaderAt
	blockSize uint64
}

// newTableReader returns a tableReader that serves lookups from the parsed
// |index|. r should allow retrieving any desired range of bytes from the table.
func newTableReader(index tableIndex, r tableReaderAt, blockSize uint64) (tableReader, error) {
	p, err := index.prefixes()
	if err != nil {
		return tableReader{}, err
	}
	return tableReader{
		prefixes:  p,
		idx:       index,
		r:         r,
		blockSize: blockSize,
	}, nil
}
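// prefixLowerBoundSketch is an editorial illustration, not part of the
// original reader: it restates the binary search used by hasMany and
// findOffsets below. It returns the index of the first entry in |prefixes|
// (sorted ascending) that is >= |target|, or len(prefixes) if every entry is
// smaller. This mirrors where filterIdx lands in those methods, which
// additionally resume the search from the previous filterIdx because
// requests arrive sorted by prefix.
func prefixLowerBoundSketch(prefixes []uint64, target uint64) int {
	lo, hi := 0, len(prefixes)
	for lo < hi {
		mid := lo + (hi-lo)/2 // lo <= mid < hi
		if prefixes[mid] < target {
			lo = mid + 1 // prefixes[lo-1] < target
		} else {
			hi = mid // prefixes[hi] >= target
		}
	}
	return lo
}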
// Scan across (logically) two ordered slices of address prefixes.
func (tr tableReader) hasMany(addrs []hasRecord) (bool, error) {
	filterIdx := uint32(0)
	filterLen := uint32(tr.idx.chunkCount())

	var remaining bool
	for i, addr := range addrs {
		if addr.has {
			continue
		}

		// Use binary search to find the location of the addr.prefix in
		// the prefixes array. filterIdx will be at the first entry
		// where its prefix >= addr.prefix after this search.
		//
		// TODO: This is worse than a linear scan for small table files
		// or for very large queries.
		j := filterLen
		for filterIdx < j {
			h := filterIdx + (j-filterIdx)/2
			// filterIdx <= h < j
			if tr.prefixes[h] < addr.prefix {
				filterIdx = h + 1 // tr.prefixes[filterIdx-1] < addr.prefix
			} else {
				j = h // tr.prefixes[j] >= addr.prefix
			}
		}

		if filterIdx >= filterLen {
			return true, nil
		}

		if addr.prefix != tr.prefixes[filterIdx] {
			remaining = true
			continue
		}

		// prefixes are equal, so locate and compare against the corresponding suffix
		for j := filterIdx; j < filterLen && addr.prefix == tr.prefixes[j]; j++ {
			m, err := tr.idx.entrySuffixMatches(j, addr.a)
			if err != nil {
				return false, err
			}
			if m {
				addrs[i].has = true
				break
			}
		}

		if !addrs[i].has {
			remaining = true
		}
	}

	return remaining, nil
}

func (tr tableReader) count() (uint32, error) {
	return tr.idx.chunkCount(), nil
}

func (tr tableReader) uncompressedLen() (uint64, error) {
	return tr.idx.totalUncompressedData(), nil
}

func (tr tableReader) index() (tableIndex, error) {
	return tr.idx, nil
}

// returns true iff |h| can be found in this table.
func (tr tableReader) has(h hash.Hash) (bool, error) {
	_, ok, err := tr.idx.lookup(&h)
	return ok, err
}

// returns the chunk data associated with |h|, if present, and nil if absent.
// The returned bytes are the snappy-decoded chunk payload, freshly allocated
// rather than a reference into the table's storage.
func (tr tableReader) get(ctx context.Context, h hash.Hash, stats *Stats) ([]byte, error) {
	e, found, err := tr.idx.lookup(&h)
	if err != nil {
		return nil, err
	}
	if !found {
		return nil, nil
	}

	offset := e.Offset()
	length := uint64(e.Length())
	buff := make([]byte, length) // TODO: Avoid this allocation for every get

	n, err := tr.r.ReadAtWithStats(ctx, buff, int64(offset), stats)
	if err != nil {
		return nil, err
	}

	if n != int(length) {
		return nil, errors.New("failed to read all data")
	}

	cmp, err := NewCompressedChunk(h, buff)
	if err != nil {
		return nil, err
	}

	if len(cmp.CompressedData) == 0 {
		return nil, errors.New("failed to get data")
	}

	chnk, err := cmp.ToChunk()
	if err != nil {
		return nil, err
	}

	return chnk.Data(), nil
}

type offsetRec struct {
	a      *hash.Hash
	offset uint64
	length uint32
}

type offsetRecSlice []offsetRec

func (hs offsetRecSlice) Len() int           { return len(hs) }
func (hs offsetRecSlice) Less(i, j int) bool { return hs[i].offset < hs[j].offset }
func (hs offsetRecSlice) Swap(i, j int)      { hs[i], hs[j] = hs[j], hs[i] }

var _ chunkReader = tableReader{}

func (tr tableReader) readCompressedAtOffsets(
	ctx context.Context,
	rb readBatch,
	found func(context.Context, CompressedChunk),
	stats *Stats,
) error {
	return tr.readAtOffsetsWithCB(ctx, rb, stats, func(ctx context.Context, cmp CompressedChunk) error {
		found(ctx, cmp)
		return nil
	})
}

func (tr tableReader) readAtOffsets(
	ctx context.Context,
	rb readBatch,
	found func(context.Context, *chunks.Chunk),
	stats *Stats,
) error {
	return tr.readAtOffsetsWithCB(ctx, rb, stats, func(ctx context.Context, cmp CompressedChunk) error {
		chk, err := cmp.ToChunk()
		if err != nil {
			return err
		}

		found(ctx, &chk)
		return nil
	})
}
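// readBatchWindowSketch is an editorial illustration, not part of the original
// reader: it shows the buffer arithmetic behind readAtOffsetsWithCB and
// readBatch.ExtractChunkFromRead (both below). A batch is satisfied with one
// physical read spanning Start()..End(), and each record's chunk is recovered
// by slicing that buffer at (record offset - batch start).
func readBatchWindowSketch() (starts []uint64) {
	h := chunks.EmptyChunk.Hash()
	rb := readBatch{
		{a: &h, offset: 1024, length: 128},
		{a: &h, offset: 1200, length: 64},
	}
	for _, rec := range rb {
		// Position of this chunk inside the single buffer of length
		// rb.End()-rb.Start() that one ReadAtWithStats call fills.
		starts = append(starts, rec.offset-rb.Start())
	}
	return starts // [0, 176] with these inputs
}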
func (tr tableReader) readAtOffsetsWithCB(
	ctx context.Context,
	rb readBatch,
	stats *Stats,
	cb func(ctx context.Context, cmp CompressedChunk) error,
) error {
	readLength := rb.End() - rb.Start()
	buff := make([]byte, readLength)

	n, err := tr.r.ReadAtWithStats(ctx, buff, int64(rb.Start()), stats)
	if err != nil {
		return err
	}

	if uint64(n) != readLength {
		return errors.New("failed to read all data")
	}

	for i := range rb {
		cmp, err := rb.ExtractChunkFromRead(buff, i)
		if err != nil {
			return err
		}

		err = cb(ctx, cmp)
		if err != nil {
			return err
		}
	}

	return nil
}

// getMany retrieves multiple stored chunks and optimizes by attempting to read in larger physical
// blocks which contain multiple stored chunks. |reqs| must be sorted by address prefix.
func (tr tableReader) getMany(
	ctx context.Context,
	eg *errgroup.Group,
	reqs []getRecord,
	found func(context.Context, *chunks.Chunk),
	stats *Stats) (bool, error) {

	// Pass #1: Iterate over |reqs| and |tr.prefixes| (both sorted by address) and build the set
	// of table locations which must be read in order to satisfy the getMany operation.
	offsetRecords, remaining, err := tr.findOffsets(reqs)
	if err != nil {
		return false, err
	}
	err = tr.getManyAtOffsets(ctx, eg, offsetRecords, found, stats)
	return remaining, err
}
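// getManyUsageSketch is an editorial illustration, not part of the original
// reader: it shows how a caller might drive getMany, assuming |reqs| has
// already been built and sorted by address prefix. getMany only schedules
// reads on the errgroup, so results are not complete until eg.Wait returns,
// and the |found| callback may run concurrently from several goroutines.
func getManyUsageSketch(ctx context.Context, tr tableReader, reqs []getRecord, stats *Stats) ([]*chunks.Chunk, bool, error) {
	eg, egCtx := errgroup.WithContext(ctx)

	// findOffsets yields at most one location per request, so a buffer of
	// len(reqs) keeps the concurrent sends below from blocking.
	results := make(chan *chunks.Chunk, len(reqs))
	remaining, err := tr.getMany(egCtx, eg, reqs, func(_ context.Context, c *chunks.Chunk) {
		results <- c
	}, stats)

	// Reads scheduled on |eg| run concurrently; wait for them even if
	// getMany itself reported an error.
	if werr := eg.Wait(); err == nil {
		err = werr
	}
	if err != nil {
		return nil, false, err
	}
	close(results)

	chks := make([]*chunks.Chunk, 0, len(reqs))
	for c := range results {
		chks = append(chks, c)
	}
	return chks, remaining, nil
}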
func (tr tableReader) getManyCompressed(ctx context.Context, eg *errgroup.Group, reqs []getRecord, found func(context.Context, CompressedChunk), stats *Stats) (bool, error) {
	// Pass #1: Iterate over |reqs| and |tr.prefixes| (both sorted by address) and build the set
	// of table locations which must be read in order to satisfy the getMany operation.
	offsetRecords, remaining, err := tr.findOffsets(reqs)
	if err != nil {
		return false, err
	}
	err = tr.getManyCompressedAtOffsets(ctx, eg, offsetRecords, found, stats)
	return remaining, err
}

func (tr tableReader) getManyCompressedAtOffsets(ctx context.Context, eg *errgroup.Group, offsetRecords offsetRecSlice, found func(context.Context, CompressedChunk), stats *Stats) error {
	return tr.getManyAtOffsetsWithReadFunc(ctx, eg, offsetRecords, stats, func(
		ctx context.Context,
		rb readBatch,
		stats *Stats) error {
		return tr.readCompressedAtOffsets(ctx, rb, found, stats)
	})
}

func (tr tableReader) getManyAtOffsets(
	ctx context.Context,
	eg *errgroup.Group,
	offsetRecords offsetRecSlice,
	found func(context.Context, *chunks.Chunk),
	stats *Stats,
) error {
	return tr.getManyAtOffsetsWithReadFunc(ctx, eg, offsetRecords, stats, func(
		ctx context.Context,
		rb readBatch,
		stats *Stats) error {
		return tr.readAtOffsets(ctx, rb, found, stats)
	})
}

type readBatch offsetRecSlice

func (r readBatch) Start() uint64 {
	return r[0].offset
}

func (r readBatch) End() uint64 {
	last := r[len(r)-1]
	return last.offset + uint64(last.length)
}

func (s readBatch) ExtractChunkFromRead(buff []byte, idx int) (CompressedChunk, error) {
	rec := s[idx]
	chunkStart := rec.offset - s.Start()
	return NewCompressedChunk(hash.Hash(*rec.a), buff[chunkStart:chunkStart+uint64(rec.length)])
}

func toReadBatches(offsets offsetRecSlice, blockSize uint64) []readBatch {
	res := make([]readBatch, 0)
	var batch readBatch
	for i := 0; i < len(offsets); {
		rec := offsets[i]
		if batch == nil {
			batch = readBatch{rec}
			i++
			continue
		}

		if _, canRead := canReadAhead(rec, batch.Start(), batch.End(), blockSize); canRead {
			batch = append(batch, rec)
			i++
			continue
		}

		res = append(res, batch)
		batch = nil
	}
	if batch != nil {
		res = append(res, batch)
	}
	return res
}

func (tr tableReader) getManyAtOffsetsWithReadFunc(
	ctx context.Context,
	eg *errgroup.Group,
	offsetRecords offsetRecSlice,
	stats *Stats,
	readAtOffsets func(
		ctx context.Context,
		rb readBatch,
		stats *Stats) error,
) error {
	batches := toReadBatches(offsetRecords, tr.blockSize)
	for i := range batches {
		if ctx.Err() != nil {
			return ctx.Err()
		}
		i := i
		eg.Go(func() error {
			return readAtOffsets(ctx, batches[i], stats)
		})
	}
	return nil
}
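// toReadBatchesCoalescingSketch is an editorial illustration, not part of the
// original reader: it exercises the grouping policy implemented by
// toReadBatches (above) and canReadAhead (below) on synthetic records.
// Records whose gap from the running batch is at most |blockSize| are
// coalesced into one physical read; a larger gap, or a batch that has reached
// maxReadSize, starts a new batch. The input must already be sorted by
// offset, which findOffsets guarantees.
func toReadBatchesCoalescingSketch() int {
	h := chunks.EmptyChunk.Hash()
	recs := offsetRecSlice{
		{a: &h, offset: 0, length: 4096},
		{a: &h, offset: 4096, length: 4096},    // contiguous: same batch
		{a: &h, offset: 1 << 20, length: 4096}, // gap of ~1MB: new batch
	}
	return len(toReadBatches(recs, 4096)) // 2 with these inputs
}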
// findOffsets iterates over |reqs| and |prefixes| (both sorted by
// address) to build the set of table locations which must be read in order to
// find each chunk specified by |reqs|. If this table contains all requested
// chunks, remaining will be set to false upon return. If some are not here,
// then remaining will be true. The result offsetRecSlice is sorted in offset
// order.
func (tr tableReader) findOffsets(reqs []getRecord) (ors offsetRecSlice, remaining bool, err error) {
	filterIdx := uint32(0)
	filterLen := uint32(len(tr.prefixes))
	ors = make(offsetRecSlice, 0, len(reqs))

	// Iterate over |reqs| and |tr.prefixes| (both sorted by address) and build the set
	// of table locations which must be read in order to satisfy |reqs|.
	for i, req := range reqs {
		if req.found {
			continue
		}

		// Use binary search to find the location of req.prefix in
		// the prefixes array. filterIdx will be at the first entry
		// where its prefix >= req.prefix after this search.
		//
		// TODO: This is worse than a linear scan for small table files
		// or for very large queries.
		j := filterLen
		for filterIdx < j {
			h := filterIdx + (j-filterIdx)/2
			// filterIdx <= h < j
			if tr.prefixes[h] < req.prefix {
				filterIdx = h + 1 // tr.prefixes[filterIdx-1] < req.prefix
			} else {
				j = h // tr.prefixes[j] >= req.prefix
			}
		}

		if filterIdx >= filterLen {
			remaining = true // last prefix visited.
			break
		}

		if req.prefix != tr.prefixes[filterIdx] {
			remaining = true
			continue
		}

		// record all offsets within the table which contain the data required.
		for j := filterIdx; j < filterLen && req.prefix == tr.prefixes[j]; j++ {
			m, err := tr.idx.entrySuffixMatches(j, req.a)
			if err != nil {
				return nil, false, err
			}
			if m {
				reqs[i].found = true
				entry, err := tr.idx.indexEntry(j, nil)
				if err != nil {
					return nil, false, err
				}
				ors = append(ors, offsetRec{req.a, entry.Offset(), entry.Length()})
				break
			}
		}

		if !reqs[i].found {
			remaining = true
		}
	}

	sort.Sort(ors)
	return ors, remaining, nil
}

func canReadAhead(fRec offsetRec, curStart, curEnd, blockSize uint64) (newEnd uint64, canRead bool) {
	if fRec.offset < curEnd {
		// |offsetRecords| will contain an offsetRecord for *every* chunkRecord whose address
		// prefix matches the prefix of a requested address. If the set of requests contains
		// addresses which share a common prefix, then it's possible for multiple offsetRecords
		// to reference the same table offset position. In that case, we'll see sequential
		// offsetRecords with the same fRec.offset.
		return curEnd, true
	}

	if curEnd-curStart >= maxReadSize {
		return curEnd, false
	}

	if fRec.offset-curEnd > blockSize {
		return curEnd, false
	}

	return fRec.offset + uint64(fRec.length), true
}
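// canReadAheadCasesSketch is an editorial illustration, not part of the
// original reader: it spells out the three outcomes of canReadAhead (above)
// with synthetic offsets. Overlapping or duplicate records are always
// absorbed into the current read, gaps of at most |blockSize| are worth
// absorbing, and anything else (including a read that has hit maxReadSize)
// forces a new batch.
func canReadAheadCasesSketch() (overlap, smallGap, largeGap bool) {
	h := chunks.EmptyChunk.Hash()
	const blockSize = 4096

	// A record that begins before the current read ends (e.g. a duplicate
	// offset produced by two requests sharing an address prefix).
	_, overlap = canReadAhead(offsetRec{a: &h, offset: 100, length: 50}, 0, 200, blockSize)

	// A record exactly |blockSize| past the current read end.
	_, smallGap = canReadAhead(offsetRec{a: &h, offset: 200 + blockSize, length: 50}, 0, 200, blockSize)

	// A record one byte further than that.
	_, largeGap = canReadAhead(offsetRec{a: &h, offset: 200 + blockSize + 1, length: 50}, 0, 200, blockSize)

	return overlap, smallGap, largeGap // true, true, false
}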
func (tr tableReader) calcReads(reqs []getRecord, blockSize uint64) (reads int, remaining bool, err error) {
	var offsetRecords offsetRecSlice
	// Pass #1: Build the set of table locations which must be read in order to find all the elements of |reqs| which are present in this table.
	offsetRecords, remaining, err = tr.findOffsets(reqs)
	if err != nil {
		return 0, false, err
	}

	// Now |offsetRecords| contains all locations within the table which must
	// be searched (note that there may be duplicates of a particular
	// location). Scan forward, grouping sequences of reads into large physical
	// reads.

	var readStart, readEnd uint64
	readStarted := false

	for i := 0; i < len(offsetRecords); {
		rec := offsetRecords[i]
		length := rec.length

		if !readStarted {
			readStarted = true
			reads++
			readStart = rec.offset
			readEnd = readStart + uint64(length)
			i++
			continue
		}

		if newReadEnd, canRead := canReadAhead(rec, readStart, readEnd, tr.blockSize); canRead {
			readEnd = newReadEnd
			i++
			continue
		}

		readStarted = false
	}

	return
}

func (tr tableReader) extract(ctx context.Context, chunks chan<- extractRecord) error {
	sendChunk := func(or offsetRec) error {
		buff := make([]byte, or.length)
		n, err := tr.r.ReadAtWithStats(ctx, buff, int64(or.offset), &Stats{})
		if err != nil {
			return err
		}
		if uint32(n) != or.length {
			return errors.New("did not read all data")
		}
		cmp, err := NewCompressedChunk(hash.Hash(*or.a), buff)
		if err != nil {
			return err
		}

		chnk, err := cmp.ToChunk()
		if err != nil {
			return err
		}

		chunks <- extractRecord{a: *or.a, data: chnk.Data()}
		return nil
	}

	var ors offsetRecSlice
	for i := uint32(0); i < tr.idx.chunkCount(); i++ {
		h := new(hash.Hash)
		e, err := tr.idx.indexEntry(i, h)
		if err != nil {
			return err
		}
		ors = append(ors, offsetRec{h, e.Offset(), e.Length()})
	}
	sort.Sort(ors)
	for _, or := range ors {
		err := sendChunk(or)
		if err != nil {
			return err
		}
	}

	return nil
}

func (tr tableReader) reader(ctx context.Context) (io.ReadCloser, uint64, error) {
	i, _ := tr.index()
	sz := i.tableFileSize()
	r, err := tr.r.Reader(ctx)
	if err != nil {
		return nil, 0, err
	}
	return r, sz, nil
}

func (tr tableReader) getRecordRanges(requests []getRecord) (map[hash.Hash]Range, error) {
	// findOffsets sets getRecord.found
	recs, _, err := tr.findOffsets(requests)
	if err != nil {
		return nil, err
	}
	ranges := make(map[hash.Hash]Range, len(recs))
	for _, r := range recs {
		ranges[*r.a] = Range{
			Offset: r.offset,
			Length: r.length,
		}
	}
	return ranges, nil
}

func (tr tableReader) currentSize() uint64 {
	return tr.idx.tableFileSize()
}

func (tr tableReader) close() error {
	err := tr.idx.Close()
	if err != nil {
		tr.r.Close()
		return err
	}
	return tr.r.Close()
}

func (tr tableReader) clone() (tableReader, error) {
	idx, err := tr.idx.clone()
	if err != nil {
		return tableReader{}, err
	}
	r, err := tr.r.clone()
	if err != nil {
		idx.Close()
		return tableReader{}, err
	}
	return tableReader{
		prefixes:  tr.prefixes,
		idx:       idx,
		r:         r,
		blockSize: tr.blockSize,
	}, nil
}
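// extractUsageSketch is an editorial illustration, not part of the original
// reader: it shows one way to consume tableReader.extract (above). extract
// writes every chunk in the table to the supplied channel in table-offset
// order and blocks whenever the channel is full, so the channel has to be
// drained concurrently with the call.
func extractUsageSketch(ctx context.Context, tr tableReader) (int, error) {
	recs := make(chan extractRecord, 16)
	errc := make(chan error, 1)
	go func() {
		defer close(recs)
		errc <- tr.extract(ctx, recs)
	}()

	n := 0
	for range recs {
		n++ // count extracted chunks; a real caller would use each record's data
	}
	return n, <-errc
}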