github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/sstable/reader.go (about)

     1  // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package sstable
     6  
     7  import (
     8  	"bytes"
     9  	"cmp"
    10  	"context"
    11  	"encoding/binary"
    12  	"io"
    13  	"os"
    14  	"slices"
    15  	"time"
    16  
    17  	"github.com/cespare/xxhash/v2"
    18  	"github.com/cockroachdb/errors"
    19  	"github.com/cockroachdb/pebble/internal/base"
    20  	"github.com/cockroachdb/pebble/internal/bytealloc"
    21  	"github.com/cockroachdb/pebble/internal/cache"
    22  	"github.com/cockroachdb/pebble/internal/crc"
    23  	"github.com/cockroachdb/pebble/internal/invariants"
    24  	"github.com/cockroachdb/pebble/internal/keyspan"
    25  	"github.com/cockroachdb/pebble/internal/private"
    26  	"github.com/cockroachdb/pebble/objstorage"
    27  	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing"
    28  )
    29  
    30  var errCorruptIndexEntry = base.CorruptionErrorf("pebble/table: corrupt index entry")
    31  var errReaderClosed = errors.New("pebble/table: reader is closed")
    32  
    33  // decodeBlockHandle returns the block handle encoded at the start of src, as
    34  // well as the number of bytes it occupies. It returns zero if given invalid
    35  // input. A block handle for a data block or a first/lower level index block
    36  // should not be decoded using decodeBlockHandle since the caller may validate
    37  // that the number of bytes decoded is equal to the length of src, which will
    38  // be false if the properties are not decoded. In those cases the caller
    39  // should use decodeBlockHandleWithProperties.
    40  func decodeBlockHandle(src []byte) (BlockHandle, int) {
    41  	offset, n := binary.Uvarint(src)
    42  	length, m := binary.Uvarint(src[n:])
    43  	if n == 0 || m == 0 {
    44  		return BlockHandle{}, 0
    45  	}
    46  	return BlockHandle{offset, length}, n + m
    47  }
    48  
    49  // decodeBlockHandleWithProperties returns the block handle and properties
    50  // encoded in src. src needs to be exactly the length that was encoded. This
    51  // method must be used for data block and first/lower level index blocks. The
    52  // properties in the block handle point to the bytes in src.
    53  func decodeBlockHandleWithProperties(src []byte) (BlockHandleWithProperties, error) {
    54  	bh, n := decodeBlockHandle(src)
    55  	if n == 0 {
    56  		return BlockHandleWithProperties{}, errors.Errorf("invalid BlockHandle")
    57  	}
    58  	return BlockHandleWithProperties{
    59  		BlockHandle: bh,
    60  		Props:       src[n:],
    61  	}, nil
    62  }
    63  
    64  func encodeBlockHandle(dst []byte, b BlockHandle) int {
    65  	n := binary.PutUvarint(dst, b.Offset)
    66  	m := binary.PutUvarint(dst[n:], b.Length)
    67  	return n + m
    68  }
    69  
    70  func encodeBlockHandleWithProperties(dst []byte, b BlockHandleWithProperties) []byte {
    71  	n := encodeBlockHandle(dst, b.BlockHandle)
    72  	dst = append(dst[:n], b.Props...)
    73  	return dst
    74  }
    75  
// block is a []byte that holds a sequence of key/value pairs plus an index
// over those pairs. It is the in-memory (decompressed) representation of a
// single sstable block.
type block []byte
    79  
// loadBlockResult describes the outcome of attempting to load a block.
type loadBlockResult int8

const (
	// loadBlockOK indicates the block was loaded successfully.
	loadBlockOK loadBlockResult = iota
	// Could be due to error or because no block left to load.
	loadBlockFailed
	// loadBlockIrrelevant indicates the block did not need to be loaded for
	// the current operation (presumably skipped by a filter — confirm at call
	// sites, which are outside this view).
	loadBlockIrrelevant
)
    88  
    89  type blockTransform func([]byte) ([]byte, error)
    90  
// ReaderOption provides an interface to do work on Reader while it is being
// opened.
type ReaderOption interface {
	// readerApply is called on the reader during opening in order to set internal
	// parameters.
	readerApply(*Reader)
}
    98  
// Comparers is a map from comparer name to comparer. It is used for debugging
// tools which may be used on multiple databases configured with different
// comparers. Comparers implements the OpenOption interface and can be passed
// as a parameter to NewReader.
type Comparers map[string]*Comparer
   104  
   105  func (c Comparers) readerApply(r *Reader) {
   106  	if r.Compare != nil || r.Properties.ComparerName == "" {
   107  		return
   108  	}
   109  	if comparer, ok := c[r.Properties.ComparerName]; ok {
   110  		r.Compare = comparer.Compare
   111  		r.FormatKey = comparer.FormatKey
   112  		r.Split = comparer.Split
   113  	}
   114  }
   115  
// Mergers is a map from merger name to merger. It is used for debugging tools
// which may be used on multiple databases configured with different
// mergers. Mergers implements the OpenOption interface and can be passed as
// a parameter to NewReader.
type Mergers map[string]*Merger
   121  
   122  func (m Mergers) readerApply(r *Reader) {
   123  	if r.mergerOK || r.Properties.MergerName == "" {
   124  		return
   125  	}
   126  	_, r.mergerOK = m[r.Properties.MergerName]
   127  }
   128  
// cacheOpts is a Reader open option for specifying the cache ID and sstable file
// number. If not specified, a unique cache ID will be used.
type cacheOpts struct {
	// cacheID is used (with fileNum and a block offset) to key this table's
	// blocks in the block cache; see Reader.readBlock.
	cacheID uint64
	// fileNum is the sstable's on-disk file number.
	fileNum base.DiskFileNum
}
   135  
// Marker function to indicate the option should be applied before reading the
// sstable properties and, in the write path, before writing the default
// sstable properties.
func (c *cacheOpts) preApply() {}
   140  
   141  func (c *cacheOpts) readerApply(r *Reader) {
   142  	if r.cacheID == 0 {
   143  		r.cacheID = c.cacheID
   144  	}
   145  	if r.fileNum.FileNum() == 0 {
   146  		r.fileNum = c.fileNum
   147  	}
   148  }
   149  
   150  func (c *cacheOpts) writerApply(w *Writer) {
   151  	if w.cacheID == 0 {
   152  		w.cacheID = c.cacheID
   153  	}
   154  	if w.fileNum.FileNum() == 0 {
   155  		w.fileNum = c.fileNum
   156  	}
   157  }
   158  
// rawTombstonesOpt is a Reader open option for specifying that range
// tombstones returned by Reader.NewRangeDelIter() should not be
// fragmented. Used by debug tools to get a raw view of the tombstones
// contained in an sstable.
type rawTombstonesOpt struct{}
   164  
   165  func (rawTombstonesOpt) preApply() {}
   166  
// readerApply disables the v1 range-del transform so tombstones are surfaced
// exactly as stored (see readMetaindex, which checks r.rawTombstones).
func (rawTombstonesOpt) readerApply(r *Reader) {
	r.rawTombstones = true
}
   170  
// init registers sstable option constructors with the internal/private
// package so other pebble packages can construct these options without
// depending on sstable's unexported types.
func init() {
	private.SSTableCacheOpts = func(cacheID uint64, fileNum base.DiskFileNum) interface{} {
		return &cacheOpts{cacheID, fileNum}
	}
	private.SSTableRawTombstonesOpt = rawTombstonesOpt{}
}
   177  
// CommonReader abstracts functionality over a Reader or a VirtualReader. This
// can be used by code which doesn't care to distinguish between a reader and a
// virtual reader.
type CommonReader interface {
	// NewRawRangeKeyIter returns an iterator over the table's range keys, or
	// nil if there are none.
	NewRawRangeKeyIter() (keyspan.FragmentIterator, error)
	// NewRawRangeDelIter returns an iterator over the table's range
	// deletions, or nil if there are none.
	NewRawRangeDelIter() (keyspan.FragmentIterator, error)
	// NewIterWithBlockPropertyFiltersAndContextEtc returns a point iterator
	// over the table's contents; see the Reader method of the same name.
	NewIterWithBlockPropertyFiltersAndContextEtc(
		ctx context.Context, lower, upper []byte,
		filterer *BlockPropertiesFilterer,
		hideObsoletePoints, useFilterBlock bool,
		stats *base.InternalIteratorStats,
		categoryAndQoS CategoryAndQoS,
		statsCollector *CategoryStatsCollector,
		rp ReaderProvider,
	) (Iterator, error)
	// NewCompactionIter returns an iterator for compaction use; see the
	// Reader method of the same name.
	NewCompactionIter(
		bytesIterated *uint64,
		categoryAndQoS CategoryAndQoS,
		statsCollector *CategoryStatsCollector,
		rp ReaderProvider,
		bufferPool *BufferPool,
	) (Iterator, error)
	// EstimateDiskUsage estimates the on-disk size of the key range
	// [start, end).
	EstimateDiskUsage(start, end []byte) (uint64, error)
	// CommonProperties returns properties shared by physical and virtual
	// sstables.
	CommonProperties() *CommonProperties
}
   203  
// Reader is a table reader.
type Reader struct {
	// readable provides access to the underlying sstable bytes; set to nil on
	// Close.
	readable          objstorage.Readable
	// cacheID and fileNum together with a block offset key this table's
	// blocks in the block cache.
	cacheID           uint64
	fileNum           base.DiskFileNum
	// err records a sticky error; Close sets it to errReaderClosed.
	err               error
	// Block handles for the table's metadata blocks, populated from the
	// metaindex (see readMetaindex).
	indexBH           BlockHandle
	filterBH          BlockHandle
	rangeDelBH        BlockHandle
	rangeKeyBH        BlockHandle
	// rangeDelTransform, when non-nil, converts v1 range-del blocks on read.
	rangeDelTransform blockTransform
	valueBIH          valueBlocksIndexHandle
	propertiesBH      BlockHandle
	metaIndexBH       BlockHandle
	footerBH          BlockHandle
	opts              ReaderOptions
	// Comparator functions resolved from the table's recorded comparer name.
	Compare           Compare
	FormatKey         base.FormatKey
	Split             Split
	tableFilter       *tableFilterReader
	// Keep types that are not multiples of 8 bytes at the end and with
	// decreasing size.
	Properties    Properties
	tableFormat   TableFormat
	rawTombstones bool
	mergerOK      bool
	checksumType  ChecksumType
	// metaBufferPool is a buffer pool used exclusively when opening a table and
	// loading its meta blocks. metaBufferPoolAlloc is used to batch-allocate
	// the BufferPool.pool slice as a part of the Reader allocation. It's
	// capacity 3 to accommodate the meta block (1), and both the compressed
	// properties block (1) and decompressed properties block (1)
	// simultaneously.
	metaBufferPool      BufferPool
	metaBufferPoolAlloc [3]allocedBuffer
}
   240  
   241  // Close implements DB.Close, as documented in the pebble package.
   242  func (r *Reader) Close() error {
   243  	r.opts.Cache.Unref()
   244  
   245  	if r.readable != nil {
   246  		r.err = firstError(r.err, r.readable.Close())
   247  		r.readable = nil
   248  	}
   249  
   250  	if r.err != nil {
   251  		return r.err
   252  	}
   253  	// Make any future calls to Get, NewIter or Close return an error.
   254  	r.err = errReaderClosed
   255  	return nil
   256  }
   257  
   258  // NewIterWithBlockPropertyFilters returns an iterator for the contents of the
   259  // table. If an error occurs, NewIterWithBlockPropertyFilters cleans up after
   260  // itself and returns a nil iterator.
   261  func (r *Reader) NewIterWithBlockPropertyFilters(
   262  	lower, upper []byte,
   263  	filterer *BlockPropertiesFilterer,
   264  	useFilterBlock bool,
   265  	stats *base.InternalIteratorStats,
   266  	categoryAndQoS CategoryAndQoS,
   267  	statsCollector *CategoryStatsCollector,
   268  	rp ReaderProvider,
   269  ) (Iterator, error) {
   270  	return r.newIterWithBlockPropertyFiltersAndContext(
   271  		context.Background(), lower, upper, filterer, false, useFilterBlock, stats,
   272  		categoryAndQoS, statsCollector, rp, nil)
   273  }
   274  
   275  // NewIterWithBlockPropertyFiltersAndContextEtc is similar to
   276  // NewIterWithBlockPropertyFilters and additionally accepts a context for
   277  // tracing.
   278  //
   279  // If hideObsoletePoints, the callee assumes that filterer already includes
   280  // obsoleteKeyBlockPropertyFilter. The caller can satisfy this contract by
   281  // first calling TryAddBlockPropertyFilterForHideObsoletePoints.
   282  func (r *Reader) NewIterWithBlockPropertyFiltersAndContextEtc(
   283  	ctx context.Context,
   284  	lower, upper []byte,
   285  	filterer *BlockPropertiesFilterer,
   286  	hideObsoletePoints, useFilterBlock bool,
   287  	stats *base.InternalIteratorStats,
   288  	categoryAndQoS CategoryAndQoS,
   289  	statsCollector *CategoryStatsCollector,
   290  	rp ReaderProvider,
   291  ) (Iterator, error) {
   292  	return r.newIterWithBlockPropertyFiltersAndContext(
   293  		ctx, lower, upper, filterer, hideObsoletePoints, useFilterBlock, stats, categoryAndQoS,
   294  		statsCollector, rp, nil)
   295  }
   296  
   297  // TryAddBlockPropertyFilterForHideObsoletePoints is expected to be called
   298  // before the call to NewIterWithBlockPropertyFiltersAndContextEtc, to get the
   299  // value of hideObsoletePoints and potentially add a block property filter.
   300  func (r *Reader) TryAddBlockPropertyFilterForHideObsoletePoints(
   301  	snapshotForHideObsoletePoints uint64,
   302  	fileLargestSeqNum uint64,
   303  	pointKeyFilters []BlockPropertyFilter,
   304  ) (hideObsoletePoints bool, filters []BlockPropertyFilter) {
   305  	hideObsoletePoints = r.tableFormat >= TableFormatPebblev4 &&
   306  		snapshotForHideObsoletePoints > fileLargestSeqNum
   307  	if hideObsoletePoints {
   308  		pointKeyFilters = append(pointKeyFilters, obsoleteKeyBlockPropertyFilter{})
   309  	}
   310  	return hideObsoletePoints, pointKeyFilters
   311  }
   312  
   313  func (r *Reader) newIterWithBlockPropertyFiltersAndContext(
   314  	ctx context.Context,
   315  	lower, upper []byte,
   316  	filterer *BlockPropertiesFilterer,
   317  	hideObsoletePoints bool,
   318  	useFilterBlock bool,
   319  	stats *base.InternalIteratorStats,
   320  	categoryAndQoS CategoryAndQoS,
   321  	statsCollector *CategoryStatsCollector,
   322  	rp ReaderProvider,
   323  	v *virtualState,
   324  ) (Iterator, error) {
   325  	// NB: pebble.tableCache wraps the returned iterator with one which performs
   326  	// reference counting on the Reader, preventing the Reader from being closed
   327  	// until the final iterator closes.
   328  	if r.Properties.IndexType == twoLevelIndex {
   329  		i := twoLevelIterPool.Get().(*twoLevelIterator)
   330  		err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, hideObsoletePoints, stats,
   331  			categoryAndQoS, statsCollector, rp, nil /* bufferPool */)
   332  		if err != nil {
   333  			return nil, err
   334  		}
   335  		return i, nil
   336  	}
   337  
   338  	i := singleLevelIterPool.Get().(*singleLevelIterator)
   339  	err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, hideObsoletePoints, stats,
   340  		categoryAndQoS, statsCollector, rp, nil /* bufferPool */)
   341  	if err != nil {
   342  		return nil, err
   343  	}
   344  	return i, nil
   345  }
   346  
   347  // NewIter returns an iterator for the contents of the table. If an error
   348  // occurs, NewIter cleans up after itself and returns a nil iterator. NewIter
   349  // must only be used when the Reader is guaranteed to outlive any LazyValues
   350  // returned from the iter.
   351  func (r *Reader) NewIter(lower, upper []byte) (Iterator, error) {
   352  	return r.NewIterWithBlockPropertyFilters(
   353  		lower, upper, nil, true /* useFilterBlock */, nil, /* stats */
   354  		CategoryAndQoS{}, nil /*statsCollector */, TrivialReaderProvider{Reader: r})
   355  }
   356  
   357  // NewCompactionIter returns an iterator similar to NewIter but it also increments
   358  // the number of bytes iterated. If an error occurs, NewCompactionIter cleans up
   359  // after itself and returns a nil iterator.
   360  func (r *Reader) NewCompactionIter(
   361  	bytesIterated *uint64,
   362  	categoryAndQoS CategoryAndQoS,
   363  	statsCollector *CategoryStatsCollector,
   364  	rp ReaderProvider,
   365  	bufferPool *BufferPool,
   366  ) (Iterator, error) {
   367  	return r.newCompactionIter(bytesIterated, categoryAndQoS, statsCollector, rp, nil, bufferPool)
   368  }
   369  
// newCompactionIter constructs an iterator tuned for compaction reads: no
// bounds, no filter block, and a caller-supplied buffer pool so blocks bypass
// the block cache. Obsolete points are hidden only for foreign virtual
// sstables. The returned wrapper tracks bytes iterated via bytesIterated.
func (r *Reader) newCompactionIter(
	bytesIterated *uint64,
	categoryAndQoS CategoryAndQoS,
	statsCollector *CategoryStatsCollector,
	rp ReaderProvider,
	v *virtualState,
	bufferPool *BufferPool,
) (Iterator, error) {
	if r.Properties.IndexType == twoLevelIndex {
		i := twoLevelIterPool.Get().(*twoLevelIterator)
		err := i.init(
			context.Background(),
			r, v, nil /* lower */, nil /* upper */, nil,
			false /* useFilter */, v != nil && v.isForeign, /* hideObsoletePoints */
			nil /* stats */, categoryAndQoS, statsCollector, rp, bufferPool,
		)
		if err != nil {
			return nil, err
		}
		i.setupForCompaction()
		return &twoLevelCompactionIterator{
			twoLevelIterator: i,
			bytesIterated:    bytesIterated,
		}, nil
	}
	i := singleLevelIterPool.Get().(*singleLevelIterator)
	err := i.init(
		context.Background(), r, v, nil /* lower */, nil, /* upper */
		nil, false /* useFilter */, v != nil && v.isForeign, /* hideObsoletePoints */
		nil /* stats */, categoryAndQoS, statsCollector, rp, bufferPool,
	)
	if err != nil {
		return nil, err
	}
	i.setupForCompaction()
	return &compactionIterator{
		singleLevelIterator: i,
		bytesIterated:       bytesIterated,
	}, nil
}
   410  
   411  // NewRawRangeDelIter returns an internal iterator for the contents of the
   412  // range-del block for the table. Returns nil if the table does not contain
   413  // any range deletions.
   414  //
   415  // TODO(sumeer): plumb context.Context since this path is relevant in the user-facing
   416  // iterator. Add WithContext methods since the existing ones are public.
   417  func (r *Reader) NewRawRangeDelIter() (keyspan.FragmentIterator, error) {
   418  	if r.rangeDelBH.Length == 0 {
   419  		return nil, nil
   420  	}
   421  	h, err := r.readRangeDel(nil /* stats */, nil /* iterStats */)
   422  	if err != nil {
   423  		return nil, err
   424  	}
   425  	i := &fragmentBlockIter{elideSameSeqnum: true}
   426  	if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum, false); err != nil {
   427  		return nil, err
   428  	}
   429  	return i, nil
   430  }
   431  
   432  // NewRawRangeKeyIter returns an internal iterator for the contents of the
   433  // range-key block for the table. Returns nil if the table does not contain any
   434  // range keys.
   435  //
   436  // TODO(sumeer): plumb context.Context since this path is relevant in the user-facing
   437  // iterator. Add WithContext methods since the existing ones are public.
   438  func (r *Reader) NewRawRangeKeyIter() (keyspan.FragmentIterator, error) {
   439  	if r.rangeKeyBH.Length == 0 {
   440  		return nil, nil
   441  	}
   442  	h, err := r.readRangeKey(nil /* stats */, nil /* iterStats */)
   443  	if err != nil {
   444  		return nil, err
   445  	}
   446  	i := rangeKeyFragmentBlockIterPool.Get().(*rangeKeyFragmentBlockIter)
   447  	if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum, false); err != nil {
   448  		return nil, err
   449  	}
   450  	return i, nil
   451  }
   452  
// rangeKeyFragmentBlockIter is a pooled fragmentBlockIter for range keys; its
// Close returns the iterator to rangeKeyFragmentBlockIterPool.
type rangeKeyFragmentBlockIter struct {
	fragmentBlockIter
}
   456  
   457  func (i *rangeKeyFragmentBlockIter) Close() error {
   458  	err := i.fragmentBlockIter.Close()
   459  	i.fragmentBlockIter = i.fragmentBlockIter.resetForReuse()
   460  	rangeKeyFragmentBlockIterPool.Put(i)
   461  	return err
   462  }
   463  
   464  func (r *Reader) readIndex(
   465  	ctx context.Context, stats *base.InternalIteratorStats, iterStats *iterStatsAccumulator,
   466  ) (bufferHandle, error) {
   467  	ctx = objiotracing.WithBlockType(ctx, objiotracing.MetadataBlock)
   468  	return r.readBlock(ctx, r.indexBH, nil, nil, stats, iterStats, nil /* buffer pool */)
   469  }
   470  
   471  func (r *Reader) readFilter(
   472  	ctx context.Context, stats *base.InternalIteratorStats, iterStats *iterStatsAccumulator,
   473  ) (bufferHandle, error) {
   474  	ctx = objiotracing.WithBlockType(ctx, objiotracing.FilterBlock)
   475  	return r.readBlock(ctx, r.filterBH, nil /* transform */, nil /* readHandle */, stats, iterStats, nil /* buffer pool */)
   476  }
   477  
   478  func (r *Reader) readRangeDel(
   479  	stats *base.InternalIteratorStats, iterStats *iterStatsAccumulator,
   480  ) (bufferHandle, error) {
   481  	ctx := objiotracing.WithBlockType(context.Background(), objiotracing.MetadataBlock)
   482  	return r.readBlock(ctx, r.rangeDelBH, r.rangeDelTransform, nil /* readHandle */, stats, iterStats, nil /* buffer pool */)
   483  }
   484  
   485  func (r *Reader) readRangeKey(
   486  	stats *base.InternalIteratorStats, iterStats *iterStatsAccumulator,
   487  ) (bufferHandle, error) {
   488  	ctx := objiotracing.WithBlockType(context.Background(), objiotracing.MetadataBlock)
   489  	return r.readBlock(ctx, r.rangeKeyBH, nil /* transform */, nil /* readHandle */, stats, iterStats, nil /* buffer pool */)
   490  }
   491  
   492  func checkChecksum(
   493  	checksumType ChecksumType, b []byte, bh BlockHandle, fileNum base.FileNum,
   494  ) error {
   495  	expectedChecksum := binary.LittleEndian.Uint32(b[bh.Length+1:])
   496  	var computedChecksum uint32
   497  	switch checksumType {
   498  	case ChecksumTypeCRC32c:
   499  		computedChecksum = crc.New(b[:bh.Length+1]).Value()
   500  	case ChecksumTypeXXHash64:
   501  		computedChecksum = uint32(xxhash.Sum64(b[:bh.Length+1]))
   502  	default:
   503  		return errors.Errorf("unsupported checksum type: %d", checksumType)
   504  	}
   505  
   506  	if expectedChecksum != computedChecksum {
   507  		return base.CorruptionErrorf(
   508  			"pebble/table: invalid table %s (checksum mismatch at %d/%d)",
   509  			errors.Safe(fileNum), errors.Safe(bh.Offset), errors.Safe(bh.Length))
   510  	}
   511  	return nil
   512  }
   513  
// cacheValueOrBuf holds a block's backing storage, which is either a
// BufferPool buffer or a block-cache value — exactly one of the two is set.
type cacheValueOrBuf struct {
	// buf.Valid() returns true if backed by a BufferPool.
	buf Buf
	// v is non-nil if backed by the block cache.
	v *cache.Value
}
   520  
   521  func (b cacheValueOrBuf) get() []byte {
   522  	if b.buf.Valid() {
   523  		return b.buf.p.pool[b.buf.i].b
   524  	}
   525  	return b.v.Buf()
   526  }
   527  
   528  func (b cacheValueOrBuf) release() {
   529  	if b.buf.Valid() {
   530  		b.buf.Release()
   531  	} else {
   532  		cache.Free(b.v)
   533  	}
   534  }
   535  
   536  func (b cacheValueOrBuf) truncate(n int) {
   537  	if b.buf.Valid() {
   538  		b.buf.p.pool[b.buf.i].b = b.buf.p.pool[b.buf.i].b[:n]
   539  	} else {
   540  		b.v.Truncate(n)
   541  	}
   542  }
   543  
// readBlock reads the block described by bh, consulting the block cache
// first. On a miss it reads from the underlying readable (or readHandle, if
// provided), verifies the trailer checksum, decompresses if needed, applies
// the optional transform, and — unless a bufferPool is supplied — inserts the
// result into the block cache. stats/iterStats, when non-nil, are updated
// with bytes read and cache-hit accounting.
func (r *Reader) readBlock(
	ctx context.Context,
	bh BlockHandle,
	transform blockTransform,
	readHandle objstorage.ReadHandle,
	stats *base.InternalIteratorStats,
	iterStats *iterStatsAccumulator,
	bufferPool *BufferPool,
) (handle bufferHandle, _ error) {
	if h := r.opts.Cache.Get(r.cacheID, r.fileNum, bh.Offset); h.Get() != nil {
		// Cache hit.
		if readHandle != nil {
			readHandle.RecordCacheHit(ctx, int64(bh.Offset), int64(bh.Length+blockTrailerLen))
		}
		if stats != nil {
			stats.BlockBytes += bh.Length
			stats.BlockBytesInCache += bh.Length
		}
		if iterStats != nil {
			iterStats.reportStats(bh.Length, bh.Length)
		}
		// This block is already in the cache; return a handle to existing value
		// in the cache.
		return bufferHandle{h: h}, nil
	}

	// Cache miss.
	var compressed cacheValueOrBuf
	if bufferPool != nil {
		compressed = cacheValueOrBuf{
			buf: bufferPool.Alloc(int(bh.Length + blockTrailerLen)),
		}
	} else {
		compressed = cacheValueOrBuf{
			v: cache.Alloc(int(bh.Length + blockTrailerLen)),
		}
	}

	readStartTime := time.Now()
	var err error
	if readHandle != nil {
		err = readHandle.ReadAt(ctx, compressed.get(), int64(bh.Offset))
	} else {
		err = r.readable.ReadAt(ctx, compressed.get(), int64(bh.Offset))
	}
	readDuration := time.Since(readStartTime)
	// TODO(sumeer): should the threshold be configurable.
	const slowReadTracingThreshold = 5 * time.Millisecond
	// The invariants.Enabled path is for deterministic testing.
	if invariants.Enabled {
		readDuration = slowReadTracingThreshold
	}
	// Call IsTracingEnabled to avoid the allocations of boxing integers into an
	// interface{}, unless necessary.
	if readDuration >= slowReadTracingThreshold && r.opts.LoggerAndTracer.IsTracingEnabled(ctx) {
		r.opts.LoggerAndTracer.Eventf(ctx, "reading %d bytes took %s",
			int(bh.Length+blockTrailerLen), readDuration.String())
	}
	if stats != nil {
		stats.BlockReadDuration += readDuration
	}
	if err != nil {
		compressed.release()
		return bufferHandle{}, err
	}
	if err := checkChecksum(r.checksumType, compressed.get(), bh, r.fileNum.FileNum()); err != nil {
		compressed.release()
		return bufferHandle{}, err
	}

	// The byte at bh.Length is the block-type/compression marker; drop the
	// trailer now that the checksum has been verified.
	typ := blockType(compressed.get()[bh.Length])
	compressed.truncate(int(bh.Length))

	var decompressed cacheValueOrBuf
	if typ == noCompressionBlockType {
		decompressed = compressed
	} else {
		// Decode the length of the decompressed value.
		decodedLen, prefixLen, err := decompressedLen(typ, compressed.get())
		if err != nil {
			compressed.release()
			return bufferHandle{}, err
		}

		if bufferPool != nil {
			decompressed = cacheValueOrBuf{buf: bufferPool.Alloc(decodedLen)}
		} else {
			decompressed = cacheValueOrBuf{v: cache.Alloc(decodedLen)}
		}
		if _, err := decompressInto(typ, compressed.get()[prefixLen:], decompressed.get()); err != nil {
			compressed.release()
			return bufferHandle{}, err
		}
		compressed.release()
	}

	if transform != nil {
		// Transforming blocks is very rare, so the extra copy of the
		// transformed data is not problematic.
		tmpTransformed, err := transform(decompressed.get())
		if err != nil {
			decompressed.release()
			return bufferHandle{}, err
		}

		var transformed cacheValueOrBuf
		if bufferPool != nil {
			transformed = cacheValueOrBuf{buf: bufferPool.Alloc(len(tmpTransformed))}
		} else {
			transformed = cacheValueOrBuf{v: cache.Alloc(len(tmpTransformed))}
		}
		copy(transformed.get(), tmpTransformed)
		decompressed.release()
		decompressed = transformed
	}

	if stats != nil {
		stats.BlockBytes += bh.Length
	}
	if iterStats != nil {
		iterStats.reportStats(bh.Length, 0)
	}
	// Pool-backed blocks bypass the cache entirely; otherwise hand ownership
	// of the value to the block cache.
	if decompressed.buf.Valid() {
		return bufferHandle{b: decompressed.buf}, nil
	}
	h := r.opts.Cache.Set(r.cacheID, r.fileNum, bh.Offset, decompressed.v)
	return bufferHandle{h: h}, nil
}
   672  
// transformRangeDelV1 is a blockTransform that rewrites a v1 range-del block
// into v2 format. It decodes each tombstone, sorts and fragments the set, and
// re-encodes the result with a fresh block writer.
func (r *Reader) transformRangeDelV1(b []byte) ([]byte, error) {
	// Convert v1 (RocksDB format) range-del blocks to v2 blocks on the fly. The
	// v1 format range-del blocks have unfragmented and unsorted range
	// tombstones. We need properly fragmented and sorted range tombstones in
	// order to serve from them directly.
	iter := &blockIter{}
	if err := iter.init(r.Compare, b, r.Properties.GlobalSeqNum, false); err != nil {
		return nil, err
	}
	// Collect one single-key Span per encoded tombstone: the internal key's
	// user key is the start, the in-place value is the exclusive end.
	var tombstones []keyspan.Span
	for key, value := iter.First(); key != nil; key, value = iter.Next() {
		t := keyspan.Span{
			Start: key.UserKey,
			End:   value.InPlaceValue(),
			Keys:  []keyspan.Key{{Trailer: key.Trailer}},
		}
		tombstones = append(tombstones, t)
	}
	keyspan.Sort(r.Compare, tombstones)

	// Fragment the tombstones, outputting them directly to a block writer.
	rangeDelBlock := blockWriter{
		restartInterval: 1,
	}
	frag := keyspan.Fragmenter{
		Cmp:    r.Compare,
		Format: r.FormatKey,
		Emit: func(s keyspan.Span) {
			for _, k := range s.Keys {
				startIK := InternalKey{UserKey: s.Start, Trailer: k.Trailer}
				rangeDelBlock.add(startIK, s.End)
			}
		},
	}
	for i := range tombstones {
		frag.Add(tombstones[i])
	}
	frag.Finish()

	// Return the contents of the constructed v2 format range-del block.
	return rangeDelBlock.finish(), nil
}
   715  
// readMetaindex reads and decodes the metaindex block, populating the
// reader's block handles (properties, range-del, range-key, value-blocks
// index, filter) from the named entries it contains.
func (r *Reader) readMetaindex(metaindexBH BlockHandle) error {
	// We use a BufferPool when reading metaindex blocks in order to avoid
	// populating the block cache with these blocks. In heavy-write workloads,
	// especially with high compaction concurrency, new tables may be created
	// frequently. Populating the block cache with these metaindex blocks adds
	// additional contention on the block cache mutexes (see #1997).
	// Additionally, these blocks are exceedingly unlikely to be read again
	// while they're still in the block cache except in misconfigurations with
	// excessive sstables counts or a table cache that's far too small.
	r.metaBufferPool.initPreallocated(r.metaBufferPoolAlloc[:0])
	// When we're finished, release the buffers we've allocated back to memory
	// allocator. We don't expect to use metaBufferPool again.
	defer r.metaBufferPool.Release()

	b, err := r.readBlock(
		context.Background(), metaindexBH, nil /* transform */, nil /* readHandle */, nil, /* stats */
		nil /* iterStats */, &r.metaBufferPool)
	if err != nil {
		return err
	}
	data := b.Get()
	defer b.Release()

	if uint64(len(data)) != metaindexBH.Length {
		return base.CorruptionErrorf("pebble/table: unexpected metaindex block size: %d vs %d",
			errors.Safe(len(data)), errors.Safe(metaindexBH.Length))
	}

	i, err := newRawBlockIter(bytes.Compare, data)
	if err != nil {
		return err
	}

	// Each metaindex entry maps a block name to an encoded handle. The value
	// blocks index uses its own richer handle encoding and is handled apart.
	meta := map[string]BlockHandle{}
	for valid := i.First(); valid; valid = i.Next() {
		value := i.Value()
		if bytes.Equal(i.Key().UserKey, []byte(metaValueIndexName)) {
			vbih, n, err := decodeValueBlocksIndexHandle(i.Value())
			if err != nil {
				return err
			}
			// n must consume the entire value; anything else is corruption.
			if n == 0 || n != len(value) {
				return base.CorruptionErrorf("pebble/table: invalid table (bad value blocks index handle)")
			}
			r.valueBIH = vbih
		} else {
			bh, n := decodeBlockHandle(value)
			if n == 0 || n != len(value) {
				return base.CorruptionErrorf("pebble/table: invalid table (bad block handle)")
			}
			meta[string(i.Key().UserKey)] = bh
		}
	}
	if err := i.Close(); err != nil {
		return err
	}

	// Load and parse the properties block, if present.
	if bh, ok := meta[metaPropertiesName]; ok {
		b, err = r.readBlock(
			context.Background(), bh, nil /* transform */, nil /* readHandle */, nil, /* stats */
			nil /* iterStats */, nil /* buffer pool */)
		if err != nil {
			return err
		}
		r.propertiesBH = bh
		err := r.Properties.load(b.Get(), bh.Offset, r.opts.DeniedUserProperties)
		b.Release()
		if err != nil {
			return err
		}
	}

	// Prefer the v2 range-del block; fall back to v1, converting on read
	// unless raw tombstones were requested.
	if bh, ok := meta[metaRangeDelV2Name]; ok {
		r.rangeDelBH = bh
	} else if bh, ok := meta[metaRangeDelName]; ok {
		r.rangeDelBH = bh
		if !r.rawTombstones {
			r.rangeDelTransform = r.transformRangeDelV1
		}
	}

	if bh, ok := meta[metaRangeKeyName]; ok {
		r.rangeKeyBH = bh
	}

	// Resolve a filter block against the configured filter policies; the
	// first policy with a matching metaindex entry wins.
	for name, fp := range r.opts.Filters {
		types := []struct {
			ftype  FilterType
			prefix string
		}{
			{TableFilter, "fullfilter."},
		}
		var done bool
		for _, t := range types {
			if bh, ok := meta[t.prefix+name]; ok {
				r.filterBH = bh

				switch t.ftype {
				case TableFilter:
					r.tableFilter = newTableFilterReader(fp)
				default:
					return base.CorruptionErrorf("unknown filter type: %v", errors.Safe(t.ftype))
				}

				done = true
				break
			}
		}
		if done {
			break
		}
	}
	return nil
}
   830  
   831  // Layout returns the layout (block organization) for an sstable.
   832  func (r *Reader) Layout() (*Layout, error) {
   833  	if r.err != nil {
   834  		return nil, r.err
   835  	}
   836  
   837  	l := &Layout{
   838  		Data:       make([]BlockHandleWithProperties, 0, r.Properties.NumDataBlocks),
   839  		Filter:     r.filterBH,
   840  		RangeDel:   r.rangeDelBH,
   841  		RangeKey:   r.rangeKeyBH,
   842  		ValueIndex: r.valueBIH.h,
   843  		Properties: r.propertiesBH,
   844  		MetaIndex:  r.metaIndexBH,
   845  		Footer:     r.footerBH,
   846  		Format:     r.tableFormat,
   847  	}
   848  
   849  	indexH, err := r.readIndex(context.Background(), nil, nil)
   850  	if err != nil {
   851  		return nil, err
   852  	}
   853  	defer indexH.Release()
   854  
   855  	var alloc bytealloc.A
   856  
   857  	if r.Properties.IndexPartitions == 0 {
   858  		l.Index = append(l.Index, r.indexBH)
   859  		iter, _ := newBlockIter(r.Compare, indexH.Get())
   860  		for key, value := iter.First(); key != nil; key, value = iter.Next() {
   861  			dataBH, err := decodeBlockHandleWithProperties(value.InPlaceValue())
   862  			if err != nil {
   863  				return nil, errCorruptIndexEntry
   864  			}
   865  			if len(dataBH.Props) > 0 {
   866  				alloc, dataBH.Props = alloc.Copy(dataBH.Props)
   867  			}
   868  			l.Data = append(l.Data, dataBH)
   869  		}
   870  	} else {
   871  		l.TopIndex = r.indexBH
   872  		topIter, _ := newBlockIter(r.Compare, indexH.Get())
   873  		iter := &blockIter{}
   874  		for key, value := topIter.First(); key != nil; key, value = topIter.Next() {
   875  			indexBH, err := decodeBlockHandleWithProperties(value.InPlaceValue())
   876  			if err != nil {
   877  				return nil, errCorruptIndexEntry
   878  			}
   879  			l.Index = append(l.Index, indexBH.BlockHandle)
   880  
   881  			subIndex, err := r.readBlock(context.Background(), indexBH.BlockHandle,
   882  				nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* iterStats */, nil /* buffer pool */)
   883  			if err != nil {
   884  				return nil, err
   885  			}
   886  			if err := iter.init(r.Compare, subIndex.Get(), 0, /* globalSeqNum */
   887  				false /* hideObsoletePoints */); err != nil {
   888  				return nil, err
   889  			}
   890  			for key, value := iter.First(); key != nil; key, value = iter.Next() {
   891  				dataBH, err := decodeBlockHandleWithProperties(value.InPlaceValue())
   892  				if len(dataBH.Props) > 0 {
   893  					alloc, dataBH.Props = alloc.Copy(dataBH.Props)
   894  				}
   895  				if err != nil {
   896  					return nil, errCorruptIndexEntry
   897  				}
   898  				l.Data = append(l.Data, dataBH)
   899  			}
   900  			subIndex.Release()
   901  			*iter = iter.resetForReuse()
   902  		}
   903  	}
   904  	if r.valueBIH.h.Length != 0 {
   905  		vbiH, err := r.readBlock(context.Background(), r.valueBIH.h, nil, nil, nil, nil, nil /* buffer pool */)
   906  		if err != nil {
   907  			return nil, err
   908  		}
   909  		defer vbiH.Release()
   910  		vbiBlock := vbiH.Get()
   911  		indexEntryLen := int(r.valueBIH.blockNumByteLength + r.valueBIH.blockOffsetByteLength +
   912  			r.valueBIH.blockLengthByteLength)
   913  		i := 0
   914  		for len(vbiBlock) != 0 {
   915  			if len(vbiBlock) < indexEntryLen {
   916  				return nil, errors.Errorf(
   917  					"remaining value index block %d does not contain a full entry of length %d",
   918  					len(vbiBlock), indexEntryLen)
   919  			}
   920  			n := int(r.valueBIH.blockNumByteLength)
   921  			bn := int(littleEndianGet(vbiBlock, n))
   922  			if bn != i {
   923  				return nil, errors.Errorf("unexpected block num %d, expected %d",
   924  					bn, i)
   925  			}
   926  			i++
   927  			vbiBlock = vbiBlock[n:]
   928  			n = int(r.valueBIH.blockOffsetByteLength)
   929  			blockOffset := littleEndianGet(vbiBlock, n)
   930  			vbiBlock = vbiBlock[n:]
   931  			n = int(r.valueBIH.blockLengthByteLength)
   932  			blockLen := littleEndianGet(vbiBlock, n)
   933  			vbiBlock = vbiBlock[n:]
   934  			l.ValueBlock = append(l.ValueBlock, BlockHandle{Offset: blockOffset, Length: blockLen})
   935  		}
   936  	}
   937  
   938  	return l, nil
   939  }
   940  
   941  // ValidateBlockChecksums validates the checksums for each block in the SSTable.
   942  func (r *Reader) ValidateBlockChecksums() error {
   943  	// Pre-compute the BlockHandles for the underlying file.
   944  	l, err := r.Layout()
   945  	if err != nil {
   946  		return err
   947  	}
   948  
   949  	// Construct the set of blocks to check. Note that the footer is not checked
   950  	// as it is not a block with a checksum.
   951  	blocks := make([]BlockHandle, len(l.Data))
   952  	for i := range l.Data {
   953  		blocks[i] = l.Data[i].BlockHandle
   954  	}
   955  	blocks = append(blocks, l.Index...)
   956  	blocks = append(blocks, l.TopIndex, l.Filter, l.RangeDel, l.RangeKey, l.Properties, l.MetaIndex)
   957  
   958  	// Sorting by offset ensures we are performing a sequential scan of the
   959  	// file.
   960  	slices.SortFunc(blocks, func(a, b BlockHandle) int {
   961  		return cmp.Compare(a.Offset, b.Offset)
   962  	})
   963  
   964  	// Check all blocks sequentially. Make use of read-ahead, given we are
   965  	// scanning the entire file from start to end.
   966  	rh := r.readable.NewReadHandle(context.TODO())
   967  	defer rh.Close()
   968  
   969  	for _, bh := range blocks {
   970  		// Certain blocks may not be present, in which case we skip them.
   971  		if bh.Length == 0 {
   972  			continue
   973  		}
   974  
   975  		// Read the block, which validates the checksum.
   976  		h, err := r.readBlock(context.Background(), bh, nil, rh, nil, nil /* iterStats */, nil /* buffer pool */)
   977  		if err != nil {
   978  			return err
   979  		}
   980  		h.Release()
   981  	}
   982  
   983  	return nil
   984  }
   985  
   986  // CommonProperties implemented the CommonReader interface.
   987  func (r *Reader) CommonProperties() *CommonProperties {
   988  	return &r.Properties.CommonProperties
   989  }
   990  
// EstimateDiskUsage returns the total size of data blocks overlapping the range
// `[start, end]`. Even if a data block partially overlaps, or we cannot
// determine overlap due to abbreviated index keys, the full data block size is
// included in the estimation.
//
// This function does not account for any metablock space usage. Assumes there
// is at least partial overlap, i.e., `[start, end]` falls neither completely
// before nor completely after the file's range.
//
// Only blocks containing point keys are considered. Range deletion and range
// key blocks are not considered.
//
// TODO(ajkr): account for metablock space usage. Perhaps look at the fraction of
// data blocks overlapped and add that same fraction of the metadata blocks to the
// estimate.
func (r *Reader) EstimateDiskUsage(start, end []byte) (uint64, error) {
	if r.err != nil {
		return 0, r.err
	}

	indexH, err := r.readIndex(context.Background(), nil, nil)
	if err != nil {
		return 0, err
	}
	defer indexH.Release()

	// Iterators over the bottom-level index blocks containing start and end.
	// These may be different in case of partitioned index but will both point
	// to the same blockIter over the single index in the unpartitioned case.
	var startIdxIter, endIdxIter *blockIter
	if r.Properties.IndexPartitions == 0 {
		// Unpartitioned index: a single iterator serves both bounds.
		iter, err := newBlockIter(r.Compare, indexH.Get())
		if err != nil {
			return 0, err
		}
		startIdxIter = iter
		endIdxIter = iter
	} else {
		// Partitioned index: seek the top-level index once for each bound to
		// locate (and read) the second-level index block covering it.
		topIter, err := newBlockIter(r.Compare, indexH.Get())
		if err != nil {
			return 0, err
		}

		key, val := topIter.SeekGE(start, base.SeekGEFlagsNone)
		if key == nil {
			// The range falls completely after this file, or an error occurred.
			return 0, topIter.Error()
		}
		startIdxBH, err := decodeBlockHandleWithProperties(val.InPlaceValue())
		if err != nil {
			return 0, errCorruptIndexEntry
		}
		startIdxBlock, err := r.readBlock(context.Background(), startIdxBH.BlockHandle,
			nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* iterStats */, nil /* buffer pool */)
		if err != nil {
			return 0, err
		}
		defer startIdxBlock.Release()
		startIdxIter, err = newBlockIter(r.Compare, startIdxBlock.Get())
		if err != nil {
			return 0, err
		}

		key, val = topIter.SeekGE(end, base.SeekGEFlagsNone)
		if key == nil {
			// No entry >= end: the range extends past the file's last block.
			// endIdxIter stays nil, which the interpolation below treats as
			// "include all data blocks through the end of the file".
			if err := topIter.Error(); err != nil {
				return 0, err
			}
		} else {
			endIdxBH, err := decodeBlockHandleWithProperties(val.InPlaceValue())
			if err != nil {
				return 0, errCorruptIndexEntry
			}
			endIdxBlock, err := r.readBlock(context.Background(),
				endIdxBH.BlockHandle, nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* iterStats */, nil /* buffer pool */)
			if err != nil {
				return 0, err
			}
			defer endIdxBlock.Release()
			endIdxIter, err = newBlockIter(r.Compare, endIdxBlock.Get())
			if err != nil {
				return 0, err
			}
		}
	}
	// startIdxIter should not be nil at this point, while endIdxIter can be if the
	// range spans past the end of the file.

	key, val := startIdxIter.SeekGE(start, base.SeekGEFlagsNone)
	if key == nil {
		// The range falls completely after this file, or an error occurred.
		return 0, startIdxIter.Error()
	}
	startBH, err := decodeBlockHandleWithProperties(val.InPlaceValue())
	if err != nil {
		return 0, errCorruptIndexEntry
	}

	includeInterpolatedValueBlocksSize := func(dataBlockSize uint64) uint64 {
		// INVARIANT: r.Properties.DataSize > 0 since startIdxIter is not nil.
		// Linearly interpolate what is stored in value blocks.
		//
		// TODO(sumeer): if we need more accuracy, without loading any data blocks
		// (which contain the value handles, and which may also be insufficient if
		// the values are in separate files), we will need to accumulate the
		// logical size of the key-value pairs and store the cumulative value for
		// each data block in the index block entry. This increases the size of
		// the BlockHandle, so wait until this becomes necessary.
		return dataBlockSize +
			uint64((float64(dataBlockSize)/float64(r.Properties.DataSize))*
				float64(r.Properties.ValueBlocksSize))
	}
	if endIdxIter == nil {
		// The range spans beyond this file. Include data blocks through the last.
		return includeInterpolatedValueBlocksSize(r.Properties.DataSize - startBH.Offset), nil
	}
	key, val = endIdxIter.SeekGE(end, base.SeekGEFlagsNone)
	if key == nil {
		if err := endIdxIter.Error(); err != nil {
			return 0, err
		}
		// The range spans beyond this file. Include data blocks through the last.
		return includeInterpolatedValueBlocksSize(r.Properties.DataSize - startBH.Offset), nil
	}
	endBH, err := decodeBlockHandleWithProperties(val.InPlaceValue())
	if err != nil {
		return 0, errCorruptIndexEntry
	}
	// Charge from the start of the first overlapping block through the end of
	// the last overlapping block (including its trailer).
	return includeInterpolatedValueBlocksSize(
		endBH.Offset + endBH.Length + blockTrailerLen - startBH.Offset), nil
}
  1122  
  1123  // TableFormat returns the format version for the table.
  1124  func (r *Reader) TableFormat() (TableFormat, error) {
  1125  	if r.err != nil {
  1126  		return TableFormatUnspecified, r.err
  1127  	}
  1128  	return r.tableFormat, nil
  1129  }
  1130  
// NewReader returns a new table reader for the file. Closing the reader will
// close the file.
func NewReader(f objstorage.Readable, o ReaderOptions, extraOpts ...ReaderOption) (*Reader, error) {
	o = o.ensureDefaults()
	r := &Reader{
		readable: f,
		opts:     o,
	}
	// Take a reference on the supplied cache, or create a private one.
	// r.Close releases it on every error path below.
	if r.opts.Cache == nil {
		r.opts.Cache = cache.New(0)
	} else {
		r.opts.Cache.Ref()
	}

	if f == nil {
		r.err = errors.New("pebble/table: nil file")
		return nil, r.Close()
	}

	// Note that the extra options are applied twice. First here for pre-apply
	// options, and then below for post-apply options. Pre and post refer to
	// before and after reading the metaindex and properties.
	type preApply interface{ preApply() }
	for _, opt := range extraOpts {
		if _, ok := opt.(preApply); ok {
			opt.readerApply(r)
		}
	}
	// A pre-apply option may have set the cache ID; otherwise allocate one.
	if r.cacheID == 0 {
		r.cacheID = r.opts.Cache.NewID()
	}

	footer, err := readFooter(f)
	if err != nil {
		r.err = err
		return nil, r.Close()
	}
	r.checksumType = footer.checksum
	r.tableFormat = footer.format
	// Read the metaindex.
	if err := r.readMetaindex(footer.metaindexBH); err != nil {
		r.err = err
		return nil, r.Close()
	}
	r.indexBH = footer.indexBH
	r.metaIndexBH = footer.metaindexBH
	r.footerBH = footer.footerBH

	// Install the comparer only if it matches the one the table was written
	// with (or the table recorded none). A mismatch leaves r.Compare nil,
	// which is reported as an error below unless a post-apply option fixes it.
	if r.Properties.ComparerName == "" || o.Comparer.Name == r.Properties.ComparerName {
		r.Compare = o.Comparer.Compare
		r.FormatKey = o.Comparer.FormatKey
		r.Split = o.Comparer.Split
	}

	if o.MergerName == r.Properties.MergerName {
		r.mergerOK = true
	}

	// Apply the extra options again now that the comparer and merger names are
	// known.
	for _, opt := range extraOpts {
		if _, ok := opt.(preApply); !ok {
			opt.readerApply(r)
		}
	}

	// Validate that comparer and merger were resolved, either from the
	// options or by a post-apply option.
	if r.Compare == nil {
		r.err = errors.Errorf("pebble/table: %d: unknown comparer %s",
			errors.Safe(r.fileNum), errors.Safe(r.Properties.ComparerName))
	}
	if !r.mergerOK {
		// "nullptr" is the merger name recorded by RocksDB when no merger is
		// configured; treat it like an empty name.
		if name := r.Properties.MergerName; name != "" && name != "nullptr" {
			r.err = errors.Errorf("pebble/table: %d: unknown merger %s",
				errors.Safe(r.fileNum), errors.Safe(r.Properties.MergerName))
		}
	}
	if r.err != nil {
		return nil, r.Close()
	}

	return r, nil
}
  1213  
// ReadableFile describes the smallest subset of vfs.File that is required for
// reading SSTs.
type ReadableFile interface {
	io.ReaderAt
	io.Closer
	// Stat returns file metadata; NewSimpleReadable uses only its Size.
	Stat() (os.FileInfo, error)
}
  1221  
  1222  // NewSimpleReadable wraps a ReadableFile in a objstorage.Readable
  1223  // implementation (which does not support read-ahead)
  1224  func NewSimpleReadable(r ReadableFile) (objstorage.Readable, error) {
  1225  	info, err := r.Stat()
  1226  	if err != nil {
  1227  		return nil, err
  1228  	}
  1229  	res := &simpleReadable{
  1230  		f:    r,
  1231  		size: info.Size(),
  1232  	}
  1233  	res.rh = objstorage.MakeNoopReadHandle(res)
  1234  	return res, nil
  1235  }
  1236  
// simpleReadable wraps a ReadableFile to implement objstorage.Readable.
type simpleReadable struct {
	f    ReadableFile
	size int64                      // file size, captured once at construction via Stat
	rh   objstorage.NoopReadHandle // reusable no-op handle returned by NewReadHandle
}

// Compile-time assertion that *simpleReadable satisfies objstorage.Readable.
var _ objstorage.Readable = (*simpleReadable)(nil)
  1245  
  1246  // ReadAt is part of the objstorage.Readable interface.
  1247  func (s *simpleReadable) ReadAt(_ context.Context, p []byte, off int64) error {
  1248  	n, err := s.f.ReadAt(p, off)
  1249  	if invariants.Enabled && err == nil && n != len(p) {
  1250  		panic("short read")
  1251  	}
  1252  	return err
  1253  }
  1254  
  1255  // Close is part of the objstorage.Readable interface.
  1256  func (s *simpleReadable) Close() error {
  1257  	return s.f.Close()
  1258  }
  1259  
  1260  // Size is part of the objstorage.Readable interface.
  1261  func (s *simpleReadable) Size() int64 {
  1262  	return s.size
  1263  }
  1264  
  1265  // NewReaddHandle is part of the objstorage.Readable interface.
  1266  func (s *simpleReadable) NewReadHandle(_ context.Context) objstorage.ReadHandle {
  1267  	return &s.rh
  1268  }