github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/store/nbs/table_reader.go

// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file incorporates work covered by the following copyright and
// permission notice:
//
//     Copyright 2016 Attic Labs, Inc. All rights reserved.
//     Licensed under the Apache License, version 2.0:
//     http://www.apache.org/licenses/LICENSE-2.0

package nbs

import (
	"bytes"
	"context"
	"encoding/binary"
	"errors"
	"io"
	"os"
	"sort"
	"sync/atomic"

	"github.com/dolthub/mmap-go"
	"github.com/golang/snappy"
	"golang.org/x/sync/errgroup"

	"github.com/dolthub/dolt/go/store/chunks"
	"github.com/dolthub/dolt/go/store/hash"
)

// CompressedChunk represents a chunk of data in a table file which is still compressed via snappy.
type CompressedChunk struct {
	// H is the hash of the chunk
	H hash.Hash

	// FullCompressedChunk is the entirety of the compressed chunk data including the crc
	FullCompressedChunk []byte

	// CompressedData is just the snappy encoded byte buffer that stores the chunk data
	CompressedData []byte
}

// NewCompressedChunk creates a CompressedChunk
func NewCompressedChunk(h hash.Hash, buff []byte) (CompressedChunk, error) {
	dataLen := uint64(len(buff)) - checksumSize

	chksum := binary.BigEndian.Uint32(buff[dataLen:])
	compressedData := buff[:dataLen]

	if chksum != crc(compressedData) {
		return CompressedChunk{}, errors.New("checksum error")
	}

	return CompressedChunk{H: h, FullCompressedChunk: buff, CompressedData: compressedData}, nil
}

// ToChunk snappy decodes the compressed data and returns a chunks.Chunk
func (cmp CompressedChunk) ToChunk() (chunks.Chunk, error) {
	data, err := snappy.Decode(nil, cmp.CompressedData)

	if err != nil {
		return chunks.Chunk{}, err
	}

	return chunks.NewChunkWithHash(cmp.H, data), nil
}

// ChunkToCompressedChunk snappy encodes a chunk's data and appends a big-endian
// CRC of the compressed bytes.
func ChunkToCompressedChunk(chunk chunks.Chunk) CompressedChunk {
	compressed := snappy.Encode(nil, chunk.Data())
	length := len(compressed)
	compressed = append(compressed, []byte{0, 0, 0, 0}...)
	binary.BigEndian.PutUint32(compressed[length:], crc(compressed[:length]))
	return CompressedChunk{H: chunk.Hash(), FullCompressedChunk: compressed, CompressedData: compressed[:length]}
}

// Hash returns the hash of the data
func (cmp CompressedChunk) Hash() hash.Hash {
	return cmp.H
}

// IsEmpty returns true if the chunk contains no data.
func (cmp CompressedChunk) IsEmpty() bool {
	return len(cmp.CompressedData) == 0 || (len(cmp.CompressedData) == 1 && cmp.CompressedData[0] == 0)
}

var EmptyCompressedChunk CompressedChunk

func init() {
	EmptyCompressedChunk = ChunkToCompressedChunk(chunks.EmptyChunk)
}
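
// The record layout produced by ChunkToCompressedChunk and verified by
// NewCompressedChunk above is simply:
//
//	[snappy-encoded chunk data][4-byte big-endian CRC of the encoded data]
//
// so, illustratively, a chunk round-trips as:
//
//	cmp := ChunkToCompressedChunk(chk)
//	rt, _ := NewCompressedChunk(chk.Hash(), cmp.FullCompressedChunk)
//	decoded, _ := rt.ToChunk() // decoded.Data() equals chk.Data()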

// ErrInvalidTableFile is an error returned when a table file is corrupt or invalid.
var ErrInvalidTableFile = errors.New("invalid or corrupt table file")

type onHeapTableIndex struct {
	chunkCount            uint32
	totalUncompressedData uint64
	prefixes, offsets     []uint64
	lengths, ordinals     []uint32
	suffixes              []byte
}

type indexEntry interface {
	Offset() uint64
	Length() uint32
}

type indexResult struct {
	o uint64
	l uint32
}

func (ir indexResult) Offset() uint64 {
	return ir.o
}

func (ir indexResult) Length() uint32 {
	return ir.l
}

// An mmapIndexEntry is an addrSuffix, a BigEndian uint64 for the offset and a
// BigEndian uint32 for the chunk size.
const mmapIndexEntrySize = addrSuffixSize + uint64Size + lengthSize

type mmapOrdinalSlice []mmapOrdinal

func (s mmapOrdinalSlice) Len() int           { return len(s) }
func (s mmapOrdinalSlice) Less(i, j int) bool { return s[i].offset < s[j].offset }
func (s mmapOrdinalSlice) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }

func (i mmapTableIndex) Ordinals() []uint32 {
	s := mmapOrdinalSlice(make([]mmapOrdinal, i.chunkCount))
	for idx := 0; uint32(idx) < i.chunkCount; idx++ {
		mi := idx * mmapIndexEntrySize
		e := mmapIndexEntry(i.data[mi : mi+mmapIndexEntrySize])
		s[idx] = mmapOrdinal{idx, e.Offset()}
	}
	sort.Sort(s)
	res := make([]uint32, i.chunkCount)
	for j, r := range s {
		res[r.idx] = uint32(j)
	}
	return res
}

type mmapTableIndex struct {
	chunkCount            uint32
	totalUncompressedData uint64
	fileSz                uint64
	prefixes              []uint64
	data                  mmap.MMap
	refCnt                *int32
}

func (i mmapTableIndex) Prefixes() []uint64 {
	return i.prefixes
}

type mmapOrdinal struct {
	idx    int
	offset uint64
}

func (i mmapTableIndex) TableFileSize() uint64 {
	return i.fileSz
}

func (i mmapTableIndex) ChunkCount() uint32 {
	return i.chunkCount
}

func (i mmapTableIndex) TotalUncompressedData() uint64 {
	return i.totalUncompressedData
}

func (i mmapTableIndex) Close() error {
	cnt := atomic.AddInt32(i.refCnt, -1)
	if cnt == 0 {
		return i.data.Unmap()
	}
	if cnt < 0 {
		panic("Close() called and reduced ref count to < 0.")
	}
	return nil
}

func (i mmapTableIndex) Clone() tableIndex {
	cnt := atomic.AddInt32(i.refCnt, 1)
	if cnt == 1 {
		panic("Clone() called after last Close(). This index is no longer valid.")
	}
	return i
}
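
// A note on lifetime, implied by Close and Clone above: the mapped index is
// shared via |refCnt|. Clone increments the count and Close decrements it, and
// the underlying mmap region is only unmapped when the count reaches zero, so
// every Clone must eventually be paired with exactly one Close.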

func (i mmapTableIndex) prefixIdx(prefix uint64) (idx uint32) {
	// NOTE: The golang impl of sort.Search is basically inlined here. This method can be called in
	// an extremely tight loop and inlining the code was a significant perf improvement.
	idx, j := 0, i.chunkCount
	for idx < j {
		h := idx + (j-idx)/2 // avoid overflow when computing h
		// i ≤ h < j
		if i.prefixes[h] < prefix {
			idx = h + 1 // preserves f(i-1) == false
		} else {
			j = h // preserves f(j) == true
		}
	}
	return
}

func (i mmapTableIndex) Lookup(h *addr) (indexEntry, bool) {
	prefix := binary.BigEndian.Uint64(h[:])
	for idx := i.prefixIdx(prefix); idx < i.chunkCount && i.prefixes[idx] == prefix; idx++ {
		mi := idx * mmapIndexEntrySize
		e := mmapIndexEntry(i.data[mi : mi+mmapIndexEntrySize])
		if bytes.Equal(e.suffix(), h[addrPrefixSize:]) {
			return e, true
		}
	}
	return mmapIndexEntry{}, false
}

func (i mmapTableIndex) EntrySuffixMatches(idx uint32, h *addr) bool {
	mi := idx * mmapIndexEntrySize
	e := mmapIndexEntry(i.data[mi : mi+mmapIndexEntrySize])
	return bytes.Equal(e.suffix(), h[addrPrefixSize:])
}

func (i mmapTableIndex) IndexEntry(idx uint32, a *addr) indexEntry {
	mi := idx * mmapIndexEntrySize
	e := mmapIndexEntry(i.data[mi : mi+mmapIndexEntrySize])
	if a != nil {
		binary.BigEndian.PutUint64(a[:], i.prefixes[idx])
		copy(a[addrPrefixSize:], e.suffix())
	}
	return e
}

type mmapIndexEntry []byte

const mmapIndexEntryOffsetStart = addrSuffixSize
const mmapIndexEntryLengthStart = addrSuffixSize + uint64Size

func (e mmapIndexEntry) suffix() []byte {
	return e[:addrSuffixSize]
}

func (e mmapIndexEntry) Offset() uint64 {
	return binary.BigEndian.Uint64(e[mmapIndexEntryOffsetStart:])
}

func (e mmapIndexEntry) Length() uint32 {
	return binary.BigEndian.Uint32(e[mmapIndexEntryLengthStart:])
}

// mmapOffheapSize returns the size of the mapping needed to hold |chunks|
// index entries, rounded up to a whole number of 4KiB pages.
func mmapOffheapSize(chunks int) int {
	pageSize := 4096
	esz := addrSuffixSize + uint64Size + lengthSize
	min := esz * chunks
	if min%pageSize == 0 {
		return min
	} else {
		return (min/pageSize + 1) * pageSize
	}
}

// newMmapTableIndex copies the entries of a parsed onHeapTableIndex into an
// mmap'd region (an anonymous mapping when |f| is nil), one fixed-size entry
// per chunk, kept in the same prefix-sorted order as the index's prefixes.
func newMmapTableIndex(ti onHeapTableIndex, f *os.File) (mmapTableIndex, error) {
	flags := 0
	if f == nil {
		flags = mmap.ANON
	}
	arr, err := mmap.MapRegion(f, mmapOffheapSize(len(ti.ordinals)), mmap.RDWR, flags, 0)
	if err != nil {
		return mmapTableIndex{}, err
	}
	for i := range ti.ordinals {
		idx := i * mmapIndexEntrySize
		si := addrSuffixSize * ti.ordinals[i]
		copy(arr[idx:], ti.suffixes[si:si+addrSuffixSize])
		binary.BigEndian.PutUint64(arr[idx+mmapIndexEntryOffsetStart:], ti.offsets[ti.ordinals[i]])
		binary.BigEndian.PutUint32(arr[idx+mmapIndexEntryLengthStart:], ti.lengths[ti.ordinals[i]])
	}

	refCnt := new(int32)
	*refCnt = 1
	return mmapTableIndex{
		ti.chunkCount,
		ti.totalUncompressedData,
		ti.TableFileSize(),
		ti.Prefixes(),
		arr,
		refCnt,
	}, nil
}

type tableReaderAt interface {
	ReadAtWithStats(ctx context.Context, p []byte, off int64, stats *Stats) (n int, err error)
}

// tableReader implements get & has queries against a single nbs table. goroutine safe.
// |blockSize| refers to the block-size of the underlying storage. We assume that, each
// time we read data, we actually have to read in blocks of this size. So, we're willing
// to tolerate up to |blockSize| overhead each time we read a chunk, if it helps us group
// more chunks together into a single read request to backing storage.
type tableReader struct {
	tableIndex
	prefixes              []uint64
	chunkCount            uint32
	totalUncompressedData uint64
	r                     tableReaderAt
	blockSize             uint64
}

type tableIndex interface {
	// ChunkCount returns the total number of chunks in the indexed file.
	ChunkCount() uint32
	// EntrySuffixMatches returns true if the entry at index |idx| matches
	// the suffix of the address |h|. Used by |Lookup| after finding
	// matching indexes based on |Prefixes|.
	EntrySuffixMatches(idx uint32, h *addr) bool
	// IndexEntry returns the |indexEntry| at |idx|. Optionally puts the
	// full address of that entry in |a| if |a| is not |nil|.
	IndexEntry(idx uint32, a *addr) indexEntry
	// Lookup returns an |indexEntry| for the chunk corresponding to the
	// provided address |h|. The second return value is |true| if an entry
	// exists and |false| otherwise.
	Lookup(h *addr) (indexEntry, bool)
	// Ordinals returns a slice mapping each entry in the index to the
	// ordinal of its chunk within the indexed file: the |i|th element of
	// the result is the position, in file order, of the chunk described
	// by the |i|th index entry.
	Ordinals() []uint32
	// Prefixes returns the sorted slice of |uint64| |addr| prefixes; each
	// entry corresponds to an indexed chunk address.
	Prefixes() []uint64
	// TableFileSize returns the total size of the indexed table file, in bytes.
	TableFileSize() uint64
	// TotalUncompressedData returns the total uncompressed data size of
	// the table file. Used for informational statistics only.
	TotalUncompressedData() uint64

	// Close releases any resources used by this tableIndex.
	Close() error

	// Clone returns a |tableIndex| with the same contents which can be
	// |Close|d independently.
	Clone() tableIndex
}

var _ tableIndex = mmapTableIndex{}
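
// For reference, the tail of a table file that parseTableIndex below walks
// backwards through is laid out (front to back) as:
//
//	[prefix tuples: chunkCount * prefixTupleSize bytes]
//	[chunk lengths: chunkCount * lengthSize bytes]
//	[address suffixes: chunkCount * addrSuffixSize bytes]
//	[chunk count: uint32][total uncompressed data: uint64][magic number]
//
// All integers are big-endian.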

// parseTableIndex parses a valid nbs tableIndex from a byte stream. |buff| must end with an NBS index
// and footer, though it may contain an unspecified number of bytes before that data.
// The returned tableIndex doesn't keep alive any references to |buff|.
func parseTableIndex(buff []byte) (onHeapTableIndex, error) {
	pos := int64(len(buff))

	// footer
	pos -= magicNumberSize

	if string(buff[pos:]) != magicNumber {
		return onHeapTableIndex{}, ErrInvalidTableFile
	}

	// total uncompressed chunk data
	pos -= uint64Size

	if pos < 0 {
		return onHeapTableIndex{}, ErrInvalidTableFile
	}

	totalUncompressedData := binary.BigEndian.Uint64(buff[pos:])

	pos -= uint32Size

	if pos < 0 {
		return onHeapTableIndex{}, ErrInvalidTableFile
	}

	chunkCount := binary.BigEndian.Uint32(buff[pos:])

	// index
	suffixesSize := int64(chunkCount) * addrSuffixSize
	pos -= suffixesSize

	if pos < 0 {
		return onHeapTableIndex{}, ErrInvalidTableFile
	}

	suffixes := make([]byte, suffixesSize)
	copy(suffixes, buff[pos:])

	lengthsSize := int64(chunkCount) * lengthSize
	pos -= lengthsSize

	if pos < 0 {
		return onHeapTableIndex{}, ErrInvalidTableFile
	}

	lengths, offsets := computeOffsets(chunkCount, buff[pos:pos+lengthsSize])

	tuplesSize := int64(chunkCount) * prefixTupleSize
	pos -= tuplesSize

	if pos < 0 {
		return onHeapTableIndex{}, ErrInvalidTableFile
	}

	prefixes, ordinals := computePrefixes(chunkCount, buff[pos:pos+tuplesSize])

	return onHeapTableIndex{
		chunkCount, totalUncompressedData,
		prefixes, offsets,
		lengths, ordinals,
		suffixes,
	}, nil
}

// computeOffsets decodes the big-endian chunk lengths in |buff| and derives the
// offset of each chunk as the running sum of the lengths before it.
func computeOffsets(count uint32, buff []byte) (lengths []uint32, offsets []uint64) {
	lengths = make([]uint32, count)
	offsets = make([]uint64, count)

	lengths[0] = binary.BigEndian.Uint32(buff)

	for i := uint64(1); i < uint64(count); i++ {
		lengths[i] = binary.BigEndian.Uint32(buff[i*lengthSize:])
		offsets[i] = offsets[i-1] + uint64(lengths[i-1])
	}
	return
}

// computePrefixes decodes the prefix tuples in |buff| into parallel slices of
// address prefixes and chunk ordinals.
func computePrefixes(count uint32, buff []byte) (prefixes []uint64, ordinals []uint32) {
	prefixes = make([]uint64, count)
	ordinals = make([]uint32, count)

	for i := uint64(0); i < uint64(count); i++ {
		idx := i * prefixTupleSize
		prefixes[i] = binary.BigEndian.Uint64(buff[idx:])
		ordinals[i] = binary.BigEndian.Uint32(buff[idx+addrPrefixSize:])
	}
	return
}

func (ti onHeapTableIndex) prefixIdxToOrdinal(idx uint32) uint32 {
	return ti.ordinals[idx]
}

// TableFileSize returns the size of the table file that this index references.
// This assumes that the index follows immediately after the last chunk in the
// file and that the last chunk in the file is in the index.
func (ti onHeapTableIndex) TableFileSize() uint64 {
	if ti.chunkCount == 0 {
		return footerSize
	}
	offset, length := ti.offsets[ti.chunkCount-1], uint64(ti.lengths[ti.chunkCount-1])
	return offset + length + indexSize(ti.chunkCount) + footerSize
}

// prefixIdx returns the first position in |ti.prefixes| whose value ==
// |prefix|. Returns |ti.chunkCount| if absent.
func (ti onHeapTableIndex) prefixIdx(prefix uint64) (idx uint32) {
	// NOTE: The golang impl of sort.Search is basically inlined here. This method can be called in
	// an extremely tight loop and inlining the code was a significant perf improvement.
	idx, j := 0, ti.chunkCount
	for idx < j {
		h := idx + (j-idx)/2 // avoid overflow when computing h
		// i ≤ h < j
		if ti.prefixes[h] < prefix {
			idx = h + 1 // preserves f(i-1) == false
		} else {
			j = h // preserves f(j) == true
		}
	}

	return
}

// EntrySuffixMatches returns true iff the suffix for prefix entry |idx|
// matches the address |h|.
func (ti onHeapTableIndex) EntrySuffixMatches(idx uint32, h *addr) bool {
	li := uint64(ti.ordinals[idx]) * addrSuffixSize
	return bytes.Equal(h[addrPrefixSize:], ti.suffixes[li:li+addrSuffixSize])
}

// lookupOrdinal returns the ordinal of |h| if present. Returns |ti.chunkCount|
// if absent.
func (ti onHeapTableIndex) lookupOrdinal(h *addr) uint32 {
	prefix := h.Prefix()

	for idx := ti.prefixIdx(prefix); idx < ti.chunkCount && ti.prefixes[idx] == prefix; idx++ {
		if ti.EntrySuffixMatches(idx, h) {
			return ti.ordinals[idx]
		}
	}

	return ti.chunkCount
}

func (ti onHeapTableIndex) IndexEntry(idx uint32, a *addr) indexEntry {
	ord := ti.ordinals[idx]
	if a != nil {
		binary.BigEndian.PutUint64(a[:], ti.prefixes[idx])
		li := uint64(ord) * addrSuffixSize
		copy(a[addrPrefixSize:], ti.suffixes[li:li+addrSuffixSize])
	}
	return indexResult{ti.offsets[ord], ti.lengths[ord]}
}

func (ti onHeapTableIndex) Lookup(h *addr) (indexEntry, bool) {
	ord := ti.lookupOrdinal(h)
	if ord == ti.chunkCount {
		return indexResult{}, false
	}
	return indexResult{ti.offsets[ord], ti.lengths[ord]}, true
}

func (ti onHeapTableIndex) Prefixes() []uint64 {
	return ti.prefixes
}

func (ti onHeapTableIndex) Ordinals() []uint32 {
	return ti.ordinals
}

func (i onHeapTableIndex) ChunkCount() uint32 {
	return i.chunkCount
}

func (i onHeapTableIndex) TotalUncompressedData() uint64 {
	return i.totalUncompressedData
}

func (i onHeapTableIndex) Close() error {
	return nil
}

func (i onHeapTableIndex) Clone() tableIndex {
	return i
}

// newTableReader wraps an already-parsed |index| and returns a reader over the
// corresponding nbs table. |r| should allow retrieving any desired range of
// bytes from the table.
func newTableReader(index tableIndex, r tableReaderAt, blockSize uint64) tableReader {
	return tableReader{
		index,
		index.Prefixes(),
		index.ChunkCount(),
		index.TotalUncompressedData(),
		r,
		blockSize,
	}
}
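
// As a rough usage sketch (the variable names below are placeholders, not
// identifiers defined in this package): given the raw bytes of a table file
// and a tableReaderAt over the same data, a reader is assembled and queried as
//
//	idx, err := parseTableIndex(tableFileBytes)
//	if err != nil {
//		// handle the corrupt-table error
//	}
//	tr := newTableReader(idx, readerAt, blockSize)
//	ok, err := tr.has(someAddr)
//
// The real call sites live elsewhere in this package.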

// Scan across (logically) two ordered slices of address prefixes.
func (tr tableReader) hasMany(addrs []hasRecord) (bool, error) {
	// TODO: Use findInIndex if (tr.chunkCount - len(addrs)*Log2(tr.chunkCount)) > (tr.chunkCount - len(addrs))

	filterIdx := uint32(0)
	filterLen := uint32(tr.chunkCount)

	var remaining bool
	for i, addr := range addrs {
		if addr.has {
			continue
		}

		for filterIdx < filterLen && addr.prefix > tr.prefixes[filterIdx] {
			filterIdx++
		}

		if filterIdx >= filterLen {
			return true, nil
		}

		if addr.prefix != tr.prefixes[filterIdx] {
			remaining = true
			continue
		}

		// prefixes are equal, so locate and compare against the corresponding suffix
		for j := filterIdx; j < filterLen && addr.prefix == tr.prefixes[j]; j++ {
			if tr.EntrySuffixMatches(j, addr.a) {
				addrs[i].has = true
				break
			}
		}

		if !addrs[i].has {
			remaining = true
		}
	}

	return remaining, nil
}

func (tr tableReader) count() (uint32, error) {
	return tr.chunkCount, nil
}

func (tr tableReader) uncompressedLen() (uint64, error) {
	return tr.totalUncompressedData, nil
}

func (tr tableReader) index() (tableIndex, error) {
	return tr.tableIndex, nil
}

// returns true iff |h| can be found in this table.
func (tr tableReader) has(h addr) (bool, error) {
	_, ok := tr.Lookup(&h)
	return ok, nil
}

// returns the storage associated with |h|, iff present. Returns nil if absent. On success,
// the returned byte slice directly references the underlying storage.
func (tr tableReader) get(ctx context.Context, h addr, stats *Stats) ([]byte, error) {
	e, found := tr.Lookup(&h)
	if !found {
		return nil, nil
	}

	offset := e.Offset()
	length := uint64(e.Length())
	buff := make([]byte, length) // TODO: Avoid this allocation for every get

	n, err := tr.r.ReadAtWithStats(ctx, buff, int64(offset), stats)

	if err != nil {
		return nil, err
	}

	if n != int(length) {
		return nil, errors.New("failed to read all data")
	}

	cmp, err := NewCompressedChunk(hash.Hash(h), buff)

	if err != nil {
		return nil, err
	}

	if len(cmp.CompressedData) == 0 {
		return nil, errors.New("failed to get data")
	}

	chnk, err := cmp.ToChunk()

	if err != nil {
		return nil, err
	}

	return chnk.Data(), nil
}

type offsetRec struct {
	a      *addr
	offset uint64
	length uint32
}

type offsetRecSlice []offsetRec

func (hs offsetRecSlice) Len() int           { return len(hs) }
func (hs offsetRecSlice) Less(i, j int) bool { return hs[i].offset < hs[j].offset }
func (hs offsetRecSlice) Swap(i, j int)      { hs[i], hs[j] = hs[j], hs[i] }

var _ chunkReadPlanner = tableReader{}
var _ chunkReader = tableReader{}

func (tr tableReader) readCompressedAtOffsets(
	ctx context.Context,
	rb readBatch,
	found func(CompressedChunk),
	stats *Stats,
) error {
	return tr.readAtOffsetsWithCB(ctx, rb, stats, func(cmp CompressedChunk) error {
		found(cmp)
		return nil
	})
}

func (tr tableReader) readAtOffsets(
	ctx context.Context,
	rb readBatch,
	found func(*chunks.Chunk),
	stats *Stats,
) error {
	return tr.readAtOffsetsWithCB(ctx, rb, stats, func(cmp CompressedChunk) error {
		chk, err := cmp.ToChunk()

		if err != nil {
			return err
		}

		found(&chk)
		return nil
	})
}

func (tr tableReader) readAtOffsetsWithCB(
	ctx context.Context,
	rb readBatch,
	stats *Stats,
	cb func(cmp CompressedChunk) error,
) error {
	readLength := rb.End() - rb.Start()
	buff := make([]byte, readLength)

	n, err := tr.r.ReadAtWithStats(ctx, buff, int64(rb.Start()), stats)
	if err != nil {
		return err
	}

	if uint64(n) != readLength {
		return errors.New("failed to read all data")
	}

	for i := range rb {
		cmp, err := rb.ExtractChunkFromRead(buff, i)
		if err != nil {
			return err
		}

		err = cb(cmp)
		if err != nil {
			return err
		}
	}

	return nil
}

// getMany retrieves multiple stored blocks and optimizes by attempting to read in larger physical
// blocks which contain multiple stored blocks. |reqs| must be sorted by address prefix.
func (tr tableReader) getMany(
	ctx context.Context,
	eg *errgroup.Group,
	reqs []getRecord,
	found func(*chunks.Chunk),
	stats *Stats) (bool, error) {

	// Pass #1: Iterate over |reqs| and |tr.prefixes| (both sorted by address) and build the set
	// of table locations which must be read in order to satisfy the getMany operation.
	offsetRecords, remaining := tr.findOffsets(reqs)
	err := tr.getManyAtOffsets(ctx, eg, offsetRecords, found, stats)
	return remaining, err
}

func (tr tableReader) getManyCompressed(ctx context.Context, eg *errgroup.Group, reqs []getRecord, found func(CompressedChunk), stats *Stats) (bool, error) {
	// Pass #1: Iterate over |reqs| and |tr.prefixes| (both sorted by address) and build the set
	// of table locations which must be read in order to satisfy the getMany operation.
	offsetRecords, remaining := tr.findOffsets(reqs)
	err := tr.getManyCompressedAtOffsets(ctx, eg, offsetRecords, found, stats)
	return remaining, err
}

func (tr tableReader) getManyCompressedAtOffsets(ctx context.Context, eg *errgroup.Group, offsetRecords offsetRecSlice, found func(CompressedChunk), stats *Stats) error {
	return tr.getManyAtOffsetsWithReadFunc(ctx, eg, offsetRecords, stats, func(
		ctx context.Context,
		rb readBatch,
		stats *Stats) error {
		return tr.readCompressedAtOffsets(ctx, rb, found, stats)
	})
}

func (tr tableReader) getManyAtOffsets(
	ctx context.Context,
	eg *errgroup.Group,
	offsetRecords offsetRecSlice,
	found func(*chunks.Chunk),
	stats *Stats,
) error {
	return tr.getManyAtOffsetsWithReadFunc(ctx, eg, offsetRecords, stats, func(
		ctx context.Context,
		rb readBatch,
		stats *Stats) error {
		return tr.readAtOffsets(ctx, rb, found, stats)
	})
}

type readBatch offsetRecSlice

func (r readBatch) Start() uint64 {
	return r[0].offset
}

func (r readBatch) End() uint64 {
	last := r[len(r)-1]
	return last.offset + uint64(last.length)
}

func (s readBatch) ExtractChunkFromRead(buff []byte, idx int) (CompressedChunk, error) {
	rec := s[idx]
	chunkStart := rec.offset - s.Start()
	return NewCompressedChunk(hash.Hash(*rec.a), buff[chunkStart:chunkStart+uint64(rec.length)])
}

// toReadBatches groups the offset-sorted |offsets| into batches, each of which
// can be satisfied by a single physical read (see canReadAhead).
func toReadBatches(offsets offsetRecSlice, blockSize uint64) []readBatch {
	res := make([]readBatch, 0)
	var batch readBatch
	for i := 0; i < len(offsets); {
		rec := offsets[i]
		if batch == nil {
			batch = readBatch{rec}
			i++
			continue
		}

		if _, canRead := canReadAhead(rec, batch.End(), blockSize); canRead {
			batch = append(batch, rec)
			i++
			continue
		}

		res = append(res, batch)
		batch = nil
	}
	if batch != nil {
		res = append(res, batch)
	}
	return res
}

func (tr tableReader) getManyAtOffsetsWithReadFunc(
	ctx context.Context,
	eg *errgroup.Group,
	offsetRecords offsetRecSlice,
	stats *Stats,
	readAtOffsets func(
		ctx context.Context,
		rb readBatch,
		stats *Stats) error,
) error {
	batches := toReadBatches(offsetRecords, tr.blockSize)
	var idx int32
	readBatches := func() error {
		for {
			if ctx.Err() != nil {
				return ctx.Err()
			}
			i := atomic.AddInt32(&idx, 1) - 1
			if int(i) >= len(batches) {
				return nil
			}
			rb := batches[i]
			err := readAtOffsets(ctx, rb, stats)
			if err != nil {
				return err
			}
		}
	}
	ioParallelism := 4
	for i := 0; i < ioParallelism; i++ {
		eg.Go(readBatches)
	}

	return nil
}

// findOffsets iterates over |reqs| and |tr.prefixes| (both sorted by
// address) to build the set of table locations which must be read in order to
// find each chunk specified by |reqs|. If this table contains all requested
// chunks, |remaining| is false upon return; if some are absent, |remaining| is
// true. The resulting offsetRecSlice is sorted in offset order.
func (tr tableReader) findOffsets(reqs []getRecord) (ors offsetRecSlice, remaining bool) {
	filterIdx := uint32(0)
	filterLen := uint32(len(tr.prefixes))
	ors = make(offsetRecSlice, 0, len(reqs))

	// Iterate over |reqs| and |tr.prefixes| (both sorted by address) and build the set
	// of table locations which must be read in order to satisfy |reqs|.
	for i, req := range reqs {
		if req.found {
			continue
		}

		// advance within the prefixes until we reach one which is >= req.prefix
		for filterIdx < filterLen && tr.prefixes[filterIdx] < req.prefix {
			filterIdx++
		}

		if filterIdx >= filterLen {
			remaining = true // last prefix visited.
			break
		}

		if req.prefix != tr.prefixes[filterIdx] {
			remaining = true
			continue
		}

		// record all offsets within the table which contain the data required.
		for j := filterIdx; j < filterLen && req.prefix == tr.prefixes[j]; j++ {
			if tr.EntrySuffixMatches(j, req.a) {
				reqs[i].found = true
				entry := tr.IndexEntry(j, nil)
				ors = append(ors, offsetRec{req.a, entry.Offset(), entry.Length()})
				break
			}
		}
	}

	sort.Sort(ors)
	return ors, remaining
}

func canReadAhead(fRec offsetRec, readEnd, blockSize uint64) (newEnd uint64, canRead bool) {
	if fRec.offset < readEnd {
		// |offsetRecords| will contain an offsetRecord for *every* chunkRecord whose address
		// prefix matches the prefix of a requested address. If the set of requests contains
		// addresses which share a common prefix, then it's possible for multiple offsetRecords
		// to reference the same table offset position. In that case, we'll see sequential
		// offsetRecords with the same fRec.offset.
		return readEnd, true
	}

	if fRec.offset-readEnd > blockSize {
		return readEnd, false
	}

	return fRec.offset + uint64(fRec.length), true
}
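
// In short, canReadAhead above is the coalescing policy used by both
// toReadBatches and calcReads: a record whose offset falls inside the current
// read is always absorbed (duplicate offsets arise when requested addresses
// share a prefix), a gap of more than |blockSize| bytes ends the current read,
// and anything in between extends the read through the end of the record.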

func (tr tableReader) calcReads(reqs []getRecord, blockSize uint64) (reads int, remaining bool, err error) {
	var offsetRecords offsetRecSlice
	// Pass #1: Build the set of table locations which must be read in order to find all the elements of |reqs| which are present in this table.
	offsetRecords, remaining = tr.findOffsets(reqs)

	// Now |offsetRecords| contains all locations within the table which must
	// be searched (note that there may be duplicates of a particular
	// location). Scan forward, grouping sequences of reads into large physical
	// reads.

	var readStart, readEnd uint64
	readStarted := false

	for i := 0; i < len(offsetRecords); {
		rec := offsetRecords[i]
		length := rec.length

		if !readStarted {
			readStarted = true
			reads++
			readStart = rec.offset
			readEnd = readStart + uint64(length)
			i++
			continue
		}

		if newReadEnd, canRead := canReadAhead(rec, readEnd, tr.blockSize); canRead {
			readEnd = newReadEnd
			i++
			continue
		}

		readStarted = false
	}

	return
}

func (tr tableReader) extract(ctx context.Context, chunks chan<- extractRecord) error {
	sendChunk := func(or offsetRec) error {
		buff := make([]byte, or.length)
		n, err := tr.r.ReadAtWithStats(ctx, buff, int64(or.offset), &Stats{})
		if err != nil {
			return err
		}
		if uint32(n) != or.length {
			return errors.New("did not read all data")
		}
		cmp, err := NewCompressedChunk(hash.Hash(*or.a), buff)

		if err != nil {
			return err
		}

		chnk, err := cmp.ToChunk()

		if err != nil {
			return err
		}

		chunks <- extractRecord{a: *or.a, data: chnk.Data()}
		return nil
	}

	var ors offsetRecSlice
	for i := uint32(0); i < tr.chunkCount; i++ {
		a := new(addr)
		e := tr.IndexEntry(i, a)
		ors = append(ors, offsetRec{a, e.Offset(), e.Length()})
	}
	sort.Sort(ors)
	for _, or := range ors {
		err := sendChunk(or)
		if err != nil {
			return err
		}
	}

	return nil
}

func (tr tableReader) reader(ctx context.Context) (io.Reader, error) {
	i, _ := tr.index()
	return io.LimitReader(&readerAdapter{tr.r, 0, ctx}, int64(i.TableFileSize())), nil
}

func (tr tableReader) Close() error {
	return tr.tableIndex.Close()
}

func (tr tableReader) Clone() tableReader {
	return tableReader{tr.tableIndex.Clone(), tr.prefixes, tr.chunkCount, tr.totalUncompressedData, tr.r, tr.blockSize}
}

// readerAdapter adapts a tableReaderAt into an io.Reader that reads forward
// from offset 0.
type readerAdapter struct {
	rat tableReaderAt
	off int64
	ctx context.Context
}

func (ra *readerAdapter) Read(p []byte) (n int, err error) {
	n, err = ra.rat.ReadAtWithStats(ra.ctx, p, ra.off, &Stats{})
	ra.off += int64(n)
	return
}