github.com/hasnat/dolt/go@v0.0.0-20210628190320-9eb5d843fbb7/store/nbs/table_reader.go

// Copyright 2019 Dolthub, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This file incorporates work covered by the following copyright and
// permission notice:
//
//     Copyright 2016 Attic Labs, Inc. All rights reserved.
//     Licensed under the Apache License, version 2.0:
//     http://www.apache.org/licenses/LICENSE-2.0

package nbs

import (
	"bytes"
	"context"
	"encoding/binary"
	"errors"
	"io"
	"os"
	"sort"
	"sync/atomic"

	"github.com/dolthub/mmap-go"
	"github.com/golang/snappy"
	"golang.org/x/sync/errgroup"

	"github.com/dolthub/dolt/go/store/chunks"
	"github.com/dolthub/dolt/go/store/hash"
)

// CompressedChunk represents a chunk of data in a table file which is still compressed via snappy.
type CompressedChunk struct {
	// H is the hash of the chunk
	H hash.Hash

	// FullCompressedChunk is the entirety of the compressed chunk data including the crc
	FullCompressedChunk []byte

	// CompressedData is just the snappy encoded byte buffer that stores the chunk data
	CompressedData []byte
}

// NewCompressedChunk creates a CompressedChunk
func NewCompressedChunk(h hash.Hash, buff []byte) (CompressedChunk, error) {
	dataLen := uint64(len(buff)) - checksumSize

	chksum := binary.BigEndian.Uint32(buff[dataLen:])
	compressedData := buff[:dataLen]

	if chksum != crc(compressedData) {
		return CompressedChunk{}, errors.New("checksum error")
	}

	return CompressedChunk{H: h, FullCompressedChunk: buff, CompressedData: compressedData}, nil
}

// ToChunk snappy decodes the compressed data and returns a chunks.Chunk
func (cmp CompressedChunk) ToChunk() (chunks.Chunk, error) {
	data, err := snappy.Decode(nil, cmp.CompressedData)

	if err != nil {
		return chunks.Chunk{}, err
	}

	return chunks.NewChunkWithHash(cmp.H, data), nil
}

// ChunkToCompressedChunk snappy encodes a chunk's data and appends a big-endian
// CRC of the compressed bytes.
func ChunkToCompressedChunk(chunk chunks.Chunk) CompressedChunk {
	compressed := snappy.Encode(nil, chunk.Data())
	length := len(compressed)
	compressed = append(compressed, []byte{0, 0, 0, 0}...)
	binary.BigEndian.PutUint32(compressed[length:], crc(compressed[:length]))
	return CompressedChunk{H: chunk.Hash(), FullCompressedChunk: compressed, CompressedData: compressed[:length]}
}

// Hash returns the hash of the data
func (cmp CompressedChunk) Hash() hash.Hash {
	return cmp.H
}

// IsEmpty returns true if the chunk contains no data.
func (cmp CompressedChunk) IsEmpty() bool {
	return len(cmp.CompressedData) == 0 || (len(cmp.CompressedData) == 1 && cmp.CompressedData[0] == 0)
}

var EmptyCompressedChunk CompressedChunk

func init() {
	EmptyCompressedChunk = ChunkToCompressedChunk(chunks.EmptyChunk)
}
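
// The record layout produced by ChunkToCompressedChunk and verified by
// NewCompressedChunk above is simply:
//
//	[snappy-encoded chunk data][4-byte big-endian CRC of the encoded data]
//
// so, illustratively, a chunk round-trips as:
//
//	cmp := ChunkToCompressedChunk(chk)
//	rt, _ := NewCompressedChunk(chk.Hash(), cmp.FullCompressedChunk)
//	decoded, _ := rt.ToChunk() // decoded.Data() equals chk.Data()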

// ErrInvalidTableFile is an error returned when a table file is corrupt or invalid.
var ErrInvalidTableFile = errors.New("invalid or corrupt table file")

type onHeapTableIndex struct {
	chunkCount            uint32
	totalUncompressedData uint64
	prefixes, offsets     []uint64
	lengths, ordinals     []uint32
	suffixes              []byte
}

type indexEntry interface {
	Offset() uint64
	Length() uint32
}

type indexResult struct {
	o uint64
	l uint32
}

func (ir indexResult) Offset() uint64 {
	return ir.o
}

func (ir indexResult) Length() uint32 {
	return ir.l
}

// An mmapIndexEntry is an addrSuffix, a BigEndian uint64 for the offset and a
// BigEndian uint32 for the chunk size.
const mmapIndexEntrySize = addrSuffixSize + uint64Size + lengthSize

type mmapOrdinalSlice []mmapOrdinal

func (s mmapOrdinalSlice) Len() int           { return len(s) }
func (s mmapOrdinalSlice) Less(i, j int) bool { return s[i].offset < s[j].offset }
func (s mmapOrdinalSlice) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }

func (i mmapTableIndex) Ordinals() []uint32 {
	s := mmapOrdinalSlice(make([]mmapOrdinal, i.chunkCount))
	for idx := 0; uint32(idx) < i.chunkCount; idx++ {
		mi := idx * mmapIndexEntrySize
		e := mmapIndexEntry(i.data[mi : mi+mmapIndexEntrySize])
		s[idx] = mmapOrdinal{idx, e.Offset()}
	}
	sort.Sort(s)
	res := make([]uint32, i.chunkCount)
	for j, r := range s {
		res[r.idx] = uint32(j)
	}
	return res
}

type mmapTableIndex struct {
	chunkCount            uint32
	totalUncompressedData uint64
	fileSz                uint64
	prefixes              []uint64
	data                  mmap.MMap
	refCnt                *int32
}

func (i mmapTableIndex) Prefixes() []uint64 {
	return i.prefixes
}

type mmapOrdinal struct {
	idx    int
	offset uint64
}

func (i mmapTableIndex) TableFileSize() uint64 {
	return i.fileSz
}

func (i mmapTableIndex) ChunkCount() uint32 {
	return i.chunkCount
}

func (i mmapTableIndex) TotalUncompressedData() uint64 {
	return i.totalUncompressedData
}

func (i mmapTableIndex) Close() error {
	cnt := atomic.AddInt32(i.refCnt, -1)
	if cnt == 0 {
		return i.data.Unmap()
	}
	if cnt < 0 {
		panic("Close() called and reduced ref count to < 0.")
	}
	return nil
}

func (i mmapTableIndex) Clone() tableIndex {
	cnt := atomic.AddInt32(i.refCnt, 1)
	if cnt == 1 {
		panic("Clone() called after last Close(). This index is no longer valid.")
	}
	return i
}
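
// A note on lifetime, implied by Close and Clone above: the mapped index is
// shared via |refCnt|. Clone increments the count and Close decrements it, and
// the underlying mmap region is only unmapped when the count reaches zero, so
// every Clone must eventually be paired with exactly one Close.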

func (i mmapTableIndex) prefixIdx(prefix uint64) (idx uint32) {
	// NOTE: The golang impl of sort.Search is basically inlined here. This method can be called in
	// an extremely tight loop and inlining the code was a significant perf improvement.
	idx, j := 0, i.chunkCount
	for idx < j {
		h := idx + (j-idx)/2 // avoid overflow when computing h
		// i ≤ h < j
		if i.prefixes[h] < prefix {
			idx = h + 1 // preserves f(i-1) == false
		} else {
			j = h // preserves f(j) == true
		}
	}
	return
}

func (i mmapTableIndex) Lookup(h *addr) (indexEntry, bool) {
	prefix := binary.BigEndian.Uint64(h[:])
	for idx := i.prefixIdx(prefix); idx < i.chunkCount && i.prefixes[idx] == prefix; idx++ {
		mi := idx * mmapIndexEntrySize
		e := mmapIndexEntry(i.data[mi : mi+mmapIndexEntrySize])
		if bytes.Equal(e.suffix(), h[addrPrefixSize:]) {
			return e, true
		}
	}
	return mmapIndexEntry{}, false
}

func (i mmapTableIndex) EntrySuffixMatches(idx uint32, h *addr) bool {
	mi := idx * mmapIndexEntrySize
	e := mmapIndexEntry(i.data[mi : mi+mmapIndexEntrySize])
	return bytes.Equal(e.suffix(), h[addrPrefixSize:])
}

func (i mmapTableIndex) IndexEntry(idx uint32, a *addr) indexEntry {
	mi := idx * mmapIndexEntrySize
	e := mmapIndexEntry(i.data[mi : mi+mmapIndexEntrySize])
	if a != nil {
		binary.BigEndian.PutUint64(a[:], i.prefixes[idx])
		copy(a[addrPrefixSize:], e.suffix())
	}
	return e
}

type mmapIndexEntry []byte

const mmapIndexEntryOffsetStart = addrSuffixSize
const mmapIndexEntryLengthStart = addrSuffixSize + uint64Size

func (e mmapIndexEntry) suffix() []byte {
	return e[:addrSuffixSize]
}

func (e mmapIndexEntry) Offset() uint64 {
	return binary.BigEndian.Uint64(e[mmapIndexEntryOffsetStart:])
}

func (e mmapIndexEntry) Length() uint32 {
	return binary.BigEndian.Uint32(e[mmapIndexEntryLengthStart:])
}

// mmapOffheapSize returns the size of the mapping needed to hold |chunks|
// index entries, rounded up to a whole number of 4KiB pages.
func mmapOffheapSize(chunks int) int {
	pageSize := 4096
	esz := addrSuffixSize + uint64Size + lengthSize
	min := esz * chunks
	if min%pageSize == 0 {
		return min
	} else {
		return (min/pageSize + 1) * pageSize
	}
}

// newMmapTableIndex copies the entries of a parsed onHeapTableIndex into an
// mmap'd region (an anonymous mapping when |f| is nil), one fixed-size entry
// per chunk, kept in the same prefix-sorted order as the index's prefixes.
func newMmapTableIndex(ti onHeapTableIndex, f *os.File) (mmapTableIndex, error) {
	flags := 0
	if f == nil {
		flags = mmap.ANON
	}
	arr, err := mmap.MapRegion(f, mmapOffheapSize(len(ti.ordinals)), mmap.RDWR, flags, 0)
	if err != nil {
		return mmapTableIndex{}, err
	}
	for i := range ti.ordinals {
		idx := i * mmapIndexEntrySize
		si := addrSuffixSize * ti.ordinals[i]
		copy(arr[idx:], ti.suffixes[si:si+addrSuffixSize])
		binary.BigEndian.PutUint64(arr[idx+mmapIndexEntryOffsetStart:], ti.offsets[ti.ordinals[i]])
		binary.BigEndian.PutUint32(arr[idx+mmapIndexEntryLengthStart:], ti.lengths[ti.ordinals[i]])
	}

	refCnt := new(int32)
	*refCnt = 1
	return mmapTableIndex{
		ti.chunkCount,
		ti.totalUncompressedData,
		ti.TableFileSize(),
		ti.Prefixes(),
		arr,
		refCnt,
	}, nil
}

type tableReaderAt interface {
	ReadAtWithStats(ctx context.Context, p []byte, off int64, stats *Stats) (n int, err error)
}

// tableReader implements get & has queries against a single nbs table. goroutine safe.
// |blockSize| refers to the block-size of the underlying storage. We assume that, each
// time we read data, we actually have to read in blocks of this size. So, we're willing
// to tolerate up to |blockSize| overhead each time we read a chunk, if it helps us group
// more chunks together into a single read request to backing storage.
type tableReader struct {
	tableIndex
	prefixes              []uint64
	chunkCount            uint32
	totalUncompressedData uint64
	r                     tableReaderAt
	blockSize             uint64
}

type tableIndex interface {
	// ChunkCount returns the total number of chunks in the indexed file.
	ChunkCount() uint32
	// EntrySuffixMatches returns true if the entry at index |idx| matches
	// the suffix of the address |h|. Used by |Lookup| after finding
	// matching indexes based on |Prefixes|.
	EntrySuffixMatches(idx uint32, h *addr) bool
	// IndexEntry returns the |indexEntry| at |idx|. Optionally puts the
	// full address of that entry in |a| if |a| is not |nil|.
	IndexEntry(idx uint32, a *addr) indexEntry
	// Lookup returns an |indexEntry| for the chunk corresponding to the
	// provided address |h|. The second return value is |true| if an entry
	// exists and |false| otherwise.
	Lookup(h *addr) (indexEntry, bool)
	// Ordinals returns a slice mapping each entry in the index to the
	// ordinal of its chunk within the indexed file: the |i|th element of
	// the result is the position, in file order, of the chunk described
	// by the |i|th index entry.
	Ordinals() []uint32
	// Prefixes returns the sorted slice of |uint64| |addr| prefixes; each
	// entry corresponds to an indexed chunk address.
	Prefixes() []uint64
	// TableFileSize returns the total size of the indexed table file, in bytes.
	TableFileSize() uint64
	// TotalUncompressedData returns the total uncompressed data size of
	// the table file. Used for informational statistics only.
	TotalUncompressedData() uint64

	// Close releases any resources used by this tableIndex.
	Close() error

	// Clone returns a |tableIndex| with the same contents which can be
	// |Close|d independently.
	Clone() tableIndex
}

var _ tableIndex = mmapTableIndex{}
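
// For reference, the tail of a table file that parseTableIndex below walks
// backwards through is laid out (front to back) as:
//
//	[prefix tuples: chunkCount * prefixTupleSize bytes]
//	[chunk lengths: chunkCount * lengthSize bytes]
//	[address suffixes: chunkCount * addrSuffixSize bytes]
//	[chunk count: uint32][total uncompressed data: uint64][magic number]
//
// All integers are big-endian.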

// parseTableIndex parses a valid nbs tableIndex from a byte stream. |buff| must end with an NBS index
// and footer, though it may contain an unspecified number of bytes before that data.
// The returned tableIndex doesn't keep alive any references to |buff|.
func parseTableIndex(buff []byte) (onHeapTableIndex, error) {
	pos := int64(len(buff))

	// footer
	pos -= magicNumberSize

	if string(buff[pos:]) != magicNumber {
		return onHeapTableIndex{}, ErrInvalidTableFile
	}

	// total uncompressed chunk data
	pos -= uint64Size

	if pos < 0 {
		return onHeapTableIndex{}, ErrInvalidTableFile
	}

	totalUncompressedData := binary.BigEndian.Uint64(buff[pos:])

	pos -= uint32Size

	if pos < 0 {
		return onHeapTableIndex{}, ErrInvalidTableFile
	}

	chunkCount := binary.BigEndian.Uint32(buff[pos:])

	// index
	suffixesSize := int64(chunkCount) * addrSuffixSize
	pos -= suffixesSize

	if pos < 0 {
		return onHeapTableIndex{}, ErrInvalidTableFile
	}

	suffixes := make([]byte, suffixesSize)
	copy(suffixes, buff[pos:])

	lengthsSize := int64(chunkCount) * lengthSize
	pos -= lengthsSize

	if pos < 0 {
		return onHeapTableIndex{}, ErrInvalidTableFile
	}

	lengths, offsets := computeOffsets(chunkCount, buff[pos:pos+lengthsSize])

	tuplesSize := int64(chunkCount) * prefixTupleSize
	pos -= tuplesSize

	if pos < 0 {
		return onHeapTableIndex{}, ErrInvalidTableFile
	}

	prefixes, ordinals := computePrefixes(chunkCount, buff[pos:pos+tuplesSize])

	return onHeapTableIndex{
		chunkCount, totalUncompressedData,
		prefixes, offsets,
		lengths, ordinals,
		suffixes,
	}, nil
}

// computeOffsets decodes the big-endian chunk lengths in |buff| and derives the
// offset of each chunk as the running sum of the lengths before it.
func computeOffsets(count uint32, buff []byte) (lengths []uint32, offsets []uint64) {
	lengths = make([]uint32, count)
	offsets = make([]uint64, count)

	lengths[0] = binary.BigEndian.Uint32(buff)

	for i := uint64(1); i < uint64(count); i++ {
		lengths[i] = binary.BigEndian.Uint32(buff[i*lengthSize:])
		offsets[i] = offsets[i-1] + uint64(lengths[i-1])
	}
	return
}

// computePrefixes decodes the prefix tuples in |buff| into parallel slices of
// address prefixes and chunk ordinals.
func computePrefixes(count uint32, buff []byte) (prefixes []uint64, ordinals []uint32) {
	prefixes = make([]uint64, count)
	ordinals = make([]uint32, count)

	for i := uint64(0); i < uint64(count); i++ {
		idx := i * prefixTupleSize
		prefixes[i] = binary.BigEndian.Uint64(buff[idx:])
		ordinals[i] = binary.BigEndian.Uint32(buff[idx+addrPrefixSize:])
	}
	return
}

func (ti onHeapTableIndex) prefixIdxToOrdinal(idx uint32) uint32 {
	return ti.ordinals[idx]
}

// TableFileSize returns the size of the table file that this index references.
// This assumes that the index follows immediately after the last chunk in the
// file and that the last chunk in the file is in the index.
func (ti onHeapTableIndex) TableFileSize() uint64 {
	if ti.chunkCount == 0 {
		return footerSize
	}
	offset, length := ti.offsets[ti.chunkCount-1], uint64(ti.lengths[ti.chunkCount-1])
	return offset + length + indexSize(ti.chunkCount) + footerSize
}

// prefixIdx returns the first position in |ti.prefixes| whose value ==
// |prefix|. Returns |ti.chunkCount| if absent.
func (ti onHeapTableIndex) prefixIdx(prefix uint64) (idx uint32) {
	// NOTE: The golang impl of sort.Search is basically inlined here. This method can be called in
	// an extremely tight loop and inlining the code was a significant perf improvement.
	idx, j := 0, ti.chunkCount
	for idx < j {
		h := idx + (j-idx)/2 // avoid overflow when computing h
		// i ≤ h < j
		if ti.prefixes[h] < prefix {
			idx = h + 1 // preserves f(i-1) == false
		} else {
			j = h // preserves f(j) == true
		}
	}

	return
}

// EntrySuffixMatches returns true iff the suffix for prefix entry |idx|
// matches the address |h|.
func (ti onHeapTableIndex) EntrySuffixMatches(idx uint32, h *addr) bool {
	li := uint64(ti.ordinals[idx]) * addrSuffixSize
	return bytes.Equal(h[addrPrefixSize:], ti.suffixes[li:li+addrSuffixSize])
}

// lookupOrdinal returns the ordinal of |h| if present. Returns |ti.chunkCount|
// if absent.
func (ti onHeapTableIndex) lookupOrdinal(h *addr) uint32 {
	prefix := h.Prefix()

	for idx := ti.prefixIdx(prefix); idx < ti.chunkCount && ti.prefixes[idx] == prefix; idx++ {
		if ti.EntrySuffixMatches(idx, h) {
			return ti.ordinals[idx]
		}
	}

	return ti.chunkCount
}

func (ti onHeapTableIndex) IndexEntry(idx uint32, a *addr) indexEntry {
	ord := ti.ordinals[idx]
	if a != nil {
		binary.BigEndian.PutUint64(a[:], ti.prefixes[idx])
		li := uint64(ord) * addrSuffixSize
		copy(a[addrPrefixSize:], ti.suffixes[li:li+addrSuffixSize])
	}
	return indexResult{ti.offsets[ord], ti.lengths[ord]}
}

func (ti onHeapTableIndex) Lookup(h *addr) (indexEntry, bool) {
	ord := ti.lookupOrdinal(h)
	if ord == ti.chunkCount {
		return indexResult{}, false
	}
	return indexResult{ti.offsets[ord], ti.lengths[ord]}, true
}

func (ti onHeapTableIndex) Prefixes() []uint64 {
	return ti.prefixes
}

func (ti onHeapTableIndex) Ordinals() []uint32 {
	return ti.ordinals
}

func (i onHeapTableIndex) ChunkCount() uint32 {
	return i.chunkCount
}

func (i onHeapTableIndex) TotalUncompressedData() uint64 {
	return i.totalUncompressedData
}

func (i onHeapTableIndex) Close() error {
	return nil
}

func (i onHeapTableIndex) Clone() tableIndex {
	return i
}

// newTableReader wraps an already-parsed |index| and returns a reader over the
// corresponding nbs table. |r| should allow retrieving any desired range of
// bytes from the table.
func newTableReader(index tableIndex, r tableReaderAt, blockSize uint64) tableReader {
	return tableReader{
		index,
		index.Prefixes(),
		index.ChunkCount(),
		index.TotalUncompressedData(),
		r,
		blockSize,
	}
}
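
// As a rough usage sketch (the variable names below are placeholders, not
// identifiers defined in this package): given the raw bytes of a table file
// and a tableReaderAt over the same data, a reader is assembled and queried as
//
//	idx, err := parseTableIndex(tableFileBytes)
//	if err != nil {
//		// handle the corrupt-table error
//	}
//	tr := newTableReader(idx, readerAt, blockSize)
//	ok, err := tr.has(someAddr)
//
// The real call sites live elsewhere in this package.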

// Scan across (logically) two ordered slices of address prefixes.
func (tr tableReader) hasMany(addrs []hasRecord) (bool, error) {
	// TODO: Use findInIndex if (tr.chunkCount - len(addrs)*Log2(tr.chunkCount)) > (tr.chunkCount - len(addrs))

	filterIdx := uint32(0)
	filterLen := uint32(tr.chunkCount)

	var remaining bool
	for i, addr := range addrs {
		if addr.has {
			continue
		}

		for filterIdx < filterLen && addr.prefix > tr.prefixes[filterIdx] {
			filterIdx++
		}

		if filterIdx >= filterLen {
			return true, nil
		}

		if addr.prefix != tr.prefixes[filterIdx] {
			remaining = true
			continue
		}

		// prefixes are equal, so locate and compare against the corresponding suffix
		for j := filterIdx; j < filterLen && addr.prefix == tr.prefixes[j]; j++ {
			if tr.EntrySuffixMatches(j, addr.a) {
				addrs[i].has = true
				break
			}
		}

		if !addrs[i].has {
			remaining = true
		}
	}

	return remaining, nil
}

func (tr tableReader) count() (uint32, error) {
	return tr.chunkCount, nil
}

func (tr tableReader) uncompressedLen() (uint64, error) {
	return tr.totalUncompressedData, nil
}

func (tr tableReader) index() (tableIndex, error) {
	return tr.tableIndex, nil
}

// returns true iff |h| can be found in this table.
func (tr tableReader) has(h addr) (bool, error) {
	_, ok := tr.Lookup(&h)
	return ok, nil
}

// returns the storage associated with |h|, iff present. Returns nil if absent. On success,
// the returned byte slice directly references the underlying storage.
func (tr tableReader) get(ctx context.Context, h addr, stats *Stats) ([]byte, error) {
	e, found := tr.Lookup(&h)
	if !found {
		return nil, nil
	}

	offset := e.Offset()
	length := uint64(e.Length())
	buff := make([]byte, length) // TODO: Avoid this allocation for every get

	n, err := tr.r.ReadAtWithStats(ctx, buff, int64(offset), stats)

	if err != nil {
		return nil, err
	}

	if n != int(length) {
		return nil, errors.New("failed to read all data")
	}

	cmp, err := NewCompressedChunk(hash.Hash(h), buff)

	if err != nil {
		return nil, err
	}

	if len(cmp.CompressedData) == 0 {
		return nil, errors.New("failed to get data")
	}

	chnk, err := cmp.ToChunk()

	if err != nil {
		return nil, err
	}

	return chnk.Data(), nil
}

type offsetRec struct {
	a      *addr
	offset uint64
	length uint32
}

type offsetRecSlice []offsetRec

func (hs offsetRecSlice) Len() int           { return len(hs) }
func (hs offsetRecSlice) Less(i, j int) bool { return hs[i].offset < hs[j].offset }
func (hs offsetRecSlice) Swap(i, j int)      { hs[i], hs[j] = hs[j], hs[i] }

var _ chunkReadPlanner = tableReader{}
var _ chunkReader = tableReader{}

func (tr tableReader) readCompressedAtOffsets(
	ctx context.Context,
	rb readBatch,
	found func(CompressedChunk),
	stats *Stats,
) error {
	return tr.readAtOffsetsWithCB(ctx, rb, stats, func(cmp CompressedChunk) error {
		found(cmp)
		return nil
	})
}

func (tr tableReader) readAtOffsets(
	ctx context.Context,
	rb readBatch,
	found func(*chunks.Chunk),
	stats *Stats,
) error {
	return tr.readAtOffsetsWithCB(ctx, rb, stats, func(cmp CompressedChunk) error {
		chk, err := cmp.ToChunk()

		if err != nil {
			return err
		}

		found(&chk)
		return nil
	})
}

func (tr tableReader) readAtOffsetsWithCB(
	ctx context.Context,
	rb readBatch,
	stats *Stats,
	cb func(cmp CompressedChunk) error,
) error {
	readLength := rb.End() - rb.Start()
	buff := make([]byte, readLength)

	n, err := tr.r.ReadAtWithStats(ctx, buff, int64(rb.Start()), stats)
	if err != nil {
		return err
	}

	if uint64(n) != readLength {
		return errors.New("failed to read all data")
	}

	for i := range rb {
		cmp, err := rb.ExtractChunkFromRead(buff, i)
		if err != nil {
			return err
		}

		err = cb(cmp)
		if err != nil {
			return err
		}
	}

	return nil
}

// getMany retrieves multiple stored blocks and optimizes by attempting to read in larger physical
// blocks which contain multiple stored blocks. |reqs| must be sorted by address prefix.
func (tr tableReader) getMany(
	ctx context.Context,
	eg *errgroup.Group,
	reqs []getRecord,
	found func(*chunks.Chunk),
	stats *Stats) (bool, error) {

	// Pass #1: Iterate over |reqs| and |tr.prefixes| (both sorted by address) and build the set
	// of table locations which must be read in order to satisfy the getMany operation.
	offsetRecords, remaining := tr.findOffsets(reqs)
	err := tr.getManyAtOffsets(ctx, eg, offsetRecords, found, stats)
	return remaining, err
}

func (tr tableReader) getManyCompressed(ctx context.Context, eg *errgroup.Group, reqs []getRecord, found func(CompressedChunk), stats *Stats) (bool, error) {
	// Pass #1: Iterate over |reqs| and |tr.prefixes| (both sorted by address) and build the set
	// of table locations which must be read in order to satisfy the getMany operation.
	offsetRecords, remaining := tr.findOffsets(reqs)
	err := tr.getManyCompressedAtOffsets(ctx, eg, offsetRecords, found, stats)
	return remaining, err
}

func (tr tableReader) getManyCompressedAtOffsets(ctx context.Context, eg *errgroup.Group, offsetRecords offsetRecSlice, found func(CompressedChunk), stats *Stats) error {
	return tr.getManyAtOffsetsWithReadFunc(ctx, eg, offsetRecords, stats, func(
		ctx context.Context,
		rb readBatch,
		stats *Stats) error {
		return tr.readCompressedAtOffsets(ctx, rb, found, stats)
	})
}

func (tr tableReader) getManyAtOffsets(
	ctx context.Context,
	eg *errgroup.Group,
	offsetRecords offsetRecSlice,
	found func(*chunks.Chunk),
	stats *Stats,
) error {
	return tr.getManyAtOffsetsWithReadFunc(ctx, eg, offsetRecords, stats, func(
		ctx context.Context,
		rb readBatch,
		stats *Stats) error {
		return tr.readAtOffsets(ctx, rb, found, stats)
	})
}

type readBatch offsetRecSlice

func (r readBatch) Start() uint64 {
	return r[0].offset
}

func (r readBatch) End() uint64 {
	last := r[len(r)-1]
	return last.offset + uint64(last.length)
}

func (s readBatch) ExtractChunkFromRead(buff []byte, idx int) (CompressedChunk, error) {
	rec := s[idx]
	chunkStart := rec.offset - s.Start()
	return NewCompressedChunk(hash.Hash(*rec.a), buff[chunkStart:chunkStart+uint64(rec.length)])
}

// toReadBatches groups the offset-sorted |offsets| into batches, each of which
// can be satisfied by a single physical read (see canReadAhead).
func toReadBatches(offsets offsetRecSlice, blockSize uint64) []readBatch {
	res := make([]readBatch, 0)
	var batch readBatch
	for i := 0; i < len(offsets); {
		rec := offsets[i]
		if batch == nil {
			batch = readBatch{rec}
			i++
			continue
		}

		if _, canRead := canReadAhead(rec, batch.End(), blockSize); canRead {
			batch = append(batch, rec)
			i++
			continue
		}

		res = append(res, batch)
		batch = nil
	}
	if batch != nil {
		res = append(res, batch)
	}
	return res
}

func (tr tableReader) getManyAtOffsetsWithReadFunc(
	ctx context.Context,
	eg *errgroup.Group,
	offsetRecords offsetRecSlice,
	stats *Stats,
	readAtOffsets func(
		ctx context.Context,
		rb readBatch,
		stats *Stats) error,
) error {
	batches := toReadBatches(offsetRecords, tr.blockSize)
	var idx int32
	readBatches := func() error {
		for {
			if ctx.Err() != nil {
				return ctx.Err()
			}
			i := atomic.AddInt32(&idx, 1) - 1
			if int(i) >= len(batches) {
				return nil
			}
			rb := batches[i]
			err := readAtOffsets(ctx, rb, stats)
			if err != nil {
				return err
			}
		}
	}
	ioParallelism := 4
	for i := 0; i < ioParallelism; i++ {
		eg.Go(readBatches)
	}

	return nil
}

// findOffsets iterates over |reqs| and |tr.prefixes| (both sorted by
// address) to build the set of table locations which must be read in order to
// find each chunk specified by |reqs|. If this table contains all requested
// chunks, |remaining| is false upon return; if some are absent, |remaining| is
// true. The resulting offsetRecSlice is sorted in offset order.
func (tr tableReader) findOffsets(reqs []getRecord) (ors offsetRecSlice, remaining bool) {
	filterIdx := uint32(0)
	filterLen := uint32(len(tr.prefixes))
	ors = make(offsetRecSlice, 0, len(reqs))

	// Iterate over |reqs| and |tr.prefixes| (both sorted by address) and build the set
	// of table locations which must be read in order to satisfy |reqs|.
	for i, req := range reqs {
		if req.found {
			continue
		}

		// advance within the prefixes until we reach one which is >= req.prefix
		for filterIdx < filterLen && tr.prefixes[filterIdx] < req.prefix {
			filterIdx++
		}

		if filterIdx >= filterLen {
			remaining = true // last prefix visited.
			break
		}

		if req.prefix != tr.prefixes[filterIdx] {
			remaining = true
			continue
		}

		// record all offsets within the table which contain the data required.
		for j := filterIdx; j < filterLen && req.prefix == tr.prefixes[j]; j++ {
			if tr.EntrySuffixMatches(j, req.a) {
				reqs[i].found = true
				entry := tr.IndexEntry(j, nil)
				ors = append(ors, offsetRec{req.a, entry.Offset(), entry.Length()})
				break
			}
		}
	}

	sort.Sort(ors)
	return ors, remaining
}

func canReadAhead(fRec offsetRec, readEnd, blockSize uint64) (newEnd uint64, canRead bool) {
	if fRec.offset < readEnd {
		// |offsetRecords| will contain an offsetRecord for *every* chunkRecord whose address
		// prefix matches the prefix of a requested address. If the set of requests contains
		// addresses which share a common prefix, then it's possible for multiple offsetRecords
		// to reference the same table offset position. In that case, we'll see sequential
		// offsetRecords with the same fRec.offset.
		return readEnd, true
	}

	if fRec.offset-readEnd > blockSize {
		return readEnd, false
	}

	return fRec.offset + uint64(fRec.length), true
}
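
// In short, canReadAhead above is the coalescing policy used by both
// toReadBatches and calcReads: a record whose offset falls inside the current
// read is always absorbed (duplicate offsets arise when requested addresses
// share a prefix), a gap of more than |blockSize| bytes ends the current read,
// and anything in between extends the read through the end of the record.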

func (tr tableReader) calcReads(reqs []getRecord, blockSize uint64) (reads int, remaining bool, err error) {
	var offsetRecords offsetRecSlice
	// Pass #1: Build the set of table locations which must be read in order to find all the elements of |reqs| which are present in this table.
	offsetRecords, remaining = tr.findOffsets(reqs)

	// Now |offsetRecords| contains all locations within the table which must
	// be searched (note that there may be duplicates of a particular
	// location). Scan forward, grouping sequences of reads into large physical
	// reads.

	var readStart, readEnd uint64
	readStarted := false

	for i := 0; i < len(offsetRecords); {
		rec := offsetRecords[i]
		length := rec.length

		if !readStarted {
			readStarted = true
			reads++
			readStart = rec.offset
			readEnd = readStart + uint64(length)
			i++
			continue
		}

		if newReadEnd, canRead := canReadAhead(rec, readEnd, tr.blockSize); canRead {
			readEnd = newReadEnd
			i++
			continue
		}

		readStarted = false
	}

	return
}

func (tr tableReader) extract(ctx context.Context, chunks chan<- extractRecord) error {
	sendChunk := func(or offsetRec) error {
		buff := make([]byte, or.length)
		n, err := tr.r.ReadAtWithStats(ctx, buff, int64(or.offset), &Stats{})
		if err != nil {
			return err
		}
		if uint32(n) != or.length {
			return errors.New("did not read all data")
		}
		cmp, err := NewCompressedChunk(hash.Hash(*or.a), buff)

		if err != nil {
			return err
		}

		chnk, err := cmp.ToChunk()

		if err != nil {
			return err
		}

		chunks <- extractRecord{a: *or.a, data: chnk.Data()}
		return nil
	}

	var ors offsetRecSlice
	for i := uint32(0); i < tr.chunkCount; i++ {
		a := new(addr)
		e := tr.IndexEntry(i, a)
		ors = append(ors, offsetRec{a, e.Offset(), e.Length()})
	}
	sort.Sort(ors)
	for _, or := range ors {
		err := sendChunk(or)
		if err != nil {
			return err
		}
	}

	return nil
}

func (tr tableReader) reader(ctx context.Context) (io.Reader, error) {
	i, _ := tr.index()
	return io.LimitReader(&readerAdapter{tr.r, 0, ctx}, int64(i.TableFileSize())), nil
}

func (tr tableReader) Close() error {
	return tr.tableIndex.Close()
}

func (tr tableReader) Clone() tableReader {
	return tableReader{tr.tableIndex.Clone(), tr.prefixes, tr.chunkCount, tr.totalUncompressedData, tr.r, tr.blockSize}
}

// readerAdapter adapts a tableReaderAt into an io.Reader that reads forward
// from offset 0.
type readerAdapter struct {
	rat tableReaderAt
	off int64
	ctx context.Context
}

func (ra *readerAdapter) Read(p []byte) (n int, err error) {
	n, err = ra.rat.ReadAtWithStats(ra.ctx, p, ra.off, &Stats{})
	ra.off += int64(n)
	return
}