github.com/cockroachdb/pebble@v1.1.2/sstable/reader.go (about)

     1  // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package sstable
     6  
     7  import (
     8  	"bytes"
     9  	"context"
    10  	"encoding/binary"
    11  	"io"
    12  	"os"
    13  	"sort"
    14  	"time"
    15  
    16  	"github.com/cespare/xxhash/v2"
    17  	"github.com/cockroachdb/errors"
    18  	"github.com/cockroachdb/pebble/internal/base"
    19  	"github.com/cockroachdb/pebble/internal/bytealloc"
    20  	"github.com/cockroachdb/pebble/internal/cache"
    21  	"github.com/cockroachdb/pebble/internal/crc"
    22  	"github.com/cockroachdb/pebble/internal/invariants"
    23  	"github.com/cockroachdb/pebble/internal/keyspan"
    24  	"github.com/cockroachdb/pebble/internal/private"
    25  	"github.com/cockroachdb/pebble/objstorage"
    26  	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing"
    27  )
    28  
// errCorruptIndexEntry is returned when an index block entry fails to decode.
var errCorruptIndexEntry = base.CorruptionErrorf("pebble/table: corrupt index entry")

// errReaderClosed is stashed in Reader.err by Close so that later use of the
// reader fails loudly instead of touching released resources.
var errReaderClosed = errors.New("pebble/table: reader is closed")
    31  
    32  // decodeBlockHandle returns the block handle encoded at the start of src, as
    33  // well as the number of bytes it occupies. It returns zero if given invalid
    34  // input. A block handle for a data block or a first/lower level index block
    35  // should not be decoded using decodeBlockHandle since the caller may validate
    36  // that the number of bytes decoded is equal to the length of src, which will
    37  // be false if the properties are not decoded. In those cases the caller
    38  // should use decodeBlockHandleWithProperties.
    39  func decodeBlockHandle(src []byte) (BlockHandle, int) {
    40  	offset, n := binary.Uvarint(src)
    41  	length, m := binary.Uvarint(src[n:])
    42  	if n == 0 || m == 0 {
    43  		return BlockHandle{}, 0
    44  	}
    45  	return BlockHandle{offset, length}, n + m
    46  }
    47  
    48  // decodeBlockHandleWithProperties returns the block handle and properties
    49  // encoded in src. src needs to be exactly the length that was encoded. This
    50  // method must be used for data block and first/lower level index blocks. The
    51  // properties in the block handle point to the bytes in src.
    52  func decodeBlockHandleWithProperties(src []byte) (BlockHandleWithProperties, error) {
    53  	bh, n := decodeBlockHandle(src)
    54  	if n == 0 {
    55  		return BlockHandleWithProperties{}, errors.Errorf("invalid BlockHandle")
    56  	}
    57  	return BlockHandleWithProperties{
    58  		BlockHandle: bh,
    59  		Props:       src[n:],
    60  	}, nil
    61  }
    62  
    63  func encodeBlockHandle(dst []byte, b BlockHandle) int {
    64  	n := binary.PutUvarint(dst, b.Offset)
    65  	m := binary.PutUvarint(dst[n:], b.Length)
    66  	return n + m
    67  }
    68  
    69  func encodeBlockHandleWithProperties(dst []byte, b BlockHandleWithProperties) []byte {
    70  	n := encodeBlockHandle(dst, b.BlockHandle)
    71  	dst = append(dst[:n], b.Props...)
    72  	return dst
    73  }
    74  
// block is a []byte that holds a sequence of key/value pairs plus an index
// over those pairs.
type block []byte

// loadBlockResult describes the outcome of attempting to load a block during
// iteration.
type loadBlockResult int8

const (
	// loadBlockOK indicates the block was loaded successfully.
	loadBlockOK loadBlockResult = iota
	// Could be due to error or because no block left to load.
	loadBlockFailed
	// loadBlockIrrelevant indicates the block was skipped rather than loaded
	// — presumably excluded by a block property filter; confirm against the
	// iterator callers.
	loadBlockIrrelevant
)

// blockTransform optionally rewrites a raw block's contents before it is
// returned to the caller (see transformRangeDelV1 for the one use in view).
type blockTransform func([]byte) ([]byte, error)
    89  
// ReaderOption provide an interface to do work on Reader while it is being
// opened.
type ReaderOption interface {
	// readerApply is called on the reader during opening in order to set internal
	// parameters. Implementations in this file include Comparers, Mergers,
	// cacheOpts and rawTombstonesOpt.
	readerApply(*Reader)
}
    97  
    98  // Comparers is a map from comparer name to comparer. It is used for debugging
    99  // tools which may be used on multiple databases configured with different
   100  // comparers. Comparers implements the OpenOption interface and can be passed
   101  // as a parameter to NewReader.
   102  type Comparers map[string]*Comparer
   103  
   104  func (c Comparers) readerApply(r *Reader) {
   105  	if r.Compare != nil || r.Properties.ComparerName == "" {
   106  		return
   107  	}
   108  	if comparer, ok := c[r.Properties.ComparerName]; ok {
   109  		r.Compare = comparer.Compare
   110  		r.FormatKey = comparer.FormatKey
   111  		r.Split = comparer.Split
   112  	}
   113  }
   114  
   115  // Mergers is a map from merger name to merger. It is used for debugging tools
   116  // which may be used on multiple databases configured with different
   117  // mergers. Mergers implements the OpenOption interface and can be passed as
   118  // a parameter to NewReader.
   119  type Mergers map[string]*Merger
   120  
   121  func (m Mergers) readerApply(r *Reader) {
   122  	if r.mergerOK || r.Properties.MergerName == "" {
   123  		return
   124  	}
   125  	_, r.mergerOK = m[r.Properties.MergerName]
   126  }
   127  
// cacheOpts is a Reader open option for specifying the cache ID and sstable file
// number. If not specified, a unique cache ID will be used.
type cacheOpts struct {
	// cacheID identifies this file's blocks within the block cache.
	cacheID uint64
	// fileNum is the on-disk file number of the sstable.
	fileNum base.DiskFileNum
}

// Marker function to indicate the option should be applied before reading the
// sstable properties and, in the write path, before writing the default
// sstable properties.
func (c *cacheOpts) preApply() {}
   139  
   140  func (c *cacheOpts) readerApply(r *Reader) {
   141  	if r.cacheID == 0 {
   142  		r.cacheID = c.cacheID
   143  	}
   144  	if r.fileNum.FileNum() == 0 {
   145  		r.fileNum = c.fileNum
   146  	}
   147  }
   148  
   149  func (c *cacheOpts) writerApply(w *Writer) {
   150  	if w.cacheID == 0 {
   151  		w.cacheID = c.cacheID
   152  	}
   153  	if w.fileNum.FileNum() == 0 {
   154  		w.fileNum = c.fileNum
   155  	}
   156  }
   157  
// rawTombstonesOpt is a Reader open option for specifying that range
// tombstones returned by Reader.NewRangeDelIter() should not be
// fragmented. Used by debug tools to get a raw view of the tombstones
// contained in an sstable.
type rawTombstonesOpt struct{}

// preApply marks this option as one applied before properties are read.
func (rawTombstonesOpt) preApply() {}

// readerApply enables raw (untransformed) tombstones on the reader; see
// readMetaindex, which skips installing rangeDelTransform when set.
func (rawTombstonesOpt) readerApply(r *Reader) {
	r.rawTombstones = true
}
   169  
// init publishes constructors for sstable-internal options through the
// private package, letting other pebble packages build them without
// depending on these unexported types directly.
func init() {
	private.SSTableCacheOpts = func(cacheID uint64, fileNum base.DiskFileNum) interface{} {
		return &cacheOpts{cacheID, fileNum}
	}
	private.SSTableRawTombstonesOpt = rawTombstonesOpt{}
}
   176  
// CommonReader abstracts functionality over a Reader or a VirtualReader. This
// can be used by code which doesn't care to distinguish between a reader and a
// virtual reader.
type CommonReader interface {
	// NewRawRangeKeyIter returns an iterator over the range-key block, or nil
	// if the table has none.
	NewRawRangeKeyIter() (keyspan.FragmentIterator, error)
	// NewRawRangeDelIter returns an iterator over the range-del block, or nil
	// if the table has none.
	NewRawRangeDelIter() (keyspan.FragmentIterator, error)
	NewIterWithBlockPropertyFiltersAndContextEtc(
		ctx context.Context, lower, upper []byte,
		filterer *BlockPropertiesFilterer,
		hideObsoletePoints, useFilterBlock bool,
		stats *base.InternalIteratorStats,
		rp ReaderProvider,
	) (Iterator, error)
	NewCompactionIter(
		bytesIterated *uint64,
		rp ReaderProvider,
		bufferPool *BufferPool,
	) (Iterator, error)
	// EstimateDiskUsage estimates the on-disk bytes spanned by [start, end].
	EstimateDiskUsage(start, end []byte) (uint64, error)
	// CommonProperties returns properties shared by physical and virtual
	// sstables.
	CommonProperties() *CommonProperties
}
   198  
// Reader is a table reader.
type Reader struct {
	// readable is the underlying object the table's blocks are read from; it
	// is closed and nil'ed out by Close.
	readable objstorage.Readable
	// cacheID and fileNum key this file's blocks in the block cache.
	cacheID uint64
	fileNum base.DiskFileNum
	// err is sticky; Close sets it to errReaderClosed on success.
	err error
	// Block handles for the table's metadata sections, populated from the
	// footer and metaindex (see readMetaindex).
	indexBH    BlockHandle
	filterBH   BlockHandle
	rangeDelBH BlockHandle
	rangeKeyBH BlockHandle
	// rangeDelTransform, when non-nil, converts v1 range-del blocks on read
	// (see transformRangeDelV1).
	rangeDelTransform blockTransform
	valueBIH          valueBlocksIndexHandle
	propertiesBH      BlockHandle
	metaIndexBH       BlockHandle
	footerBH          BlockHandle
	opts              ReaderOptions
	Compare           Compare
	FormatKey         base.FormatKey
	Split             Split
	tableFilter       *tableFilterReader
	// Keep types that are not multiples of 8 bytes at the end and with
	// decreasing size.
	Properties    Properties
	tableFormat   TableFormat
	rawTombstones bool
	mergerOK      bool
	checksumType  ChecksumType
	// metaBufferPool is a buffer pool used exclusively when opening a table and
	// loading its meta blocks. metaBufferPoolAlloc is used to batch-allocate
	// the BufferPool.pool slice as a part of the Reader allocation. It's
	// capacity 3 to accommodate the meta block (1), and both the compressed
	// properties block (1) and decompressed properties block (1)
	// simultaneously.
	metaBufferPool      BufferPool
	metaBufferPoolAlloc [3]allocedBuffer
}
   235  
// Close implements DB.Close, as documented in the pebble package.
//
// Close releases the cache reference and the underlying readable. If an
// error was recorded earlier (or closing the readable fails) that error is
// returned; otherwise err is set to errReaderClosed so subsequent calls on
// the reader fail.
func (r *Reader) Close() error {
	r.opts.Cache.Unref()

	if r.readable != nil {
		// firstError keeps any pre-existing error in preference to the
		// close error.
		r.err = firstError(r.err, r.readable.Close())
		r.readable = nil
	}

	if r.err != nil {
		return r.err
	}
	// Make any future calls to Get, NewIter or Close return an error.
	r.err = errReaderClosed
	return nil
}
   252  
// NewIterWithBlockPropertyFilters returns an iterator for the contents of the
// table. If an error occurs, NewIterWithBlockPropertyFilters cleans up after
// itself and returns a nil iterator.
//
// It delegates to newIterWithBlockPropertyFiltersAndContext with a background
// context, hideObsoletePoints=false and no virtual state.
func (r *Reader) NewIterWithBlockPropertyFilters(
	lower, upper []byte,
	filterer *BlockPropertiesFilterer,
	useFilterBlock bool,
	stats *base.InternalIteratorStats,
	rp ReaderProvider,
) (Iterator, error) {
	return r.newIterWithBlockPropertyFiltersAndContext(
		context.Background(),
		lower, upper, filterer, false, useFilterBlock, stats, rp, nil,
	)
}
   268  
// NewIterWithBlockPropertyFiltersAndContextEtc is similar to
// NewIterWithBlockPropertyFilters and additionally accepts a context for
// tracing.
//
// If hideObsoletePoints, the callee assumes that filterer already includes
// obsoleteKeyBlockPropertyFilter. The caller can satisfy this contract by
// first calling TryAddBlockPropertyFilterForHideObsoletePoints.
func (r *Reader) NewIterWithBlockPropertyFiltersAndContextEtc(
	ctx context.Context,
	lower, upper []byte,
	filterer *BlockPropertiesFilterer,
	hideObsoletePoints, useFilterBlock bool,
	stats *base.InternalIteratorStats,
	rp ReaderProvider,
) (Iterator, error) {
	// Forwarded verbatim with no virtual state (v == nil).
	return r.newIterWithBlockPropertyFiltersAndContext(
		ctx, lower, upper, filterer, hideObsoletePoints, useFilterBlock, stats, rp, nil,
	)
}
   288  
   289  // TryAddBlockPropertyFilterForHideObsoletePoints is expected to be called
   290  // before the call to NewIterWithBlockPropertyFiltersAndContextEtc, to get the
   291  // value of hideObsoletePoints and potentially add a block property filter.
   292  func (r *Reader) TryAddBlockPropertyFilterForHideObsoletePoints(
   293  	snapshotForHideObsoletePoints uint64,
   294  	fileLargestSeqNum uint64,
   295  	pointKeyFilters []BlockPropertyFilter,
   296  ) (hideObsoletePoints bool, filters []BlockPropertyFilter) {
   297  	hideObsoletePoints = r.tableFormat >= TableFormatPebblev4 &&
   298  		snapshotForHideObsoletePoints > fileLargestSeqNum
   299  	if hideObsoletePoints {
   300  		pointKeyFilters = append(pointKeyFilters, obsoleteKeyBlockPropertyFilter{})
   301  	}
   302  	return hideObsoletePoints, pointKeyFilters
   303  }
   304  
// newIterWithBlockPropertyFiltersAndContext constructs a point iterator over
// the table, choosing a two-level or single-level iterator based on the
// table's index type. Iterators are drawn from sync pools; init returns them
// to the pool on failure (per the pool usage pattern here), so on error a nil
// iterator is returned.
func (r *Reader) newIterWithBlockPropertyFiltersAndContext(
	ctx context.Context,
	lower, upper []byte,
	filterer *BlockPropertiesFilterer,
	hideObsoletePoints bool,
	useFilterBlock bool,
	stats *base.InternalIteratorStats,
	rp ReaderProvider,
	v *virtualState,
) (Iterator, error) {
	// NB: pebble.tableCache wraps the returned iterator with one which performs
	// reference counting on the Reader, preventing the Reader from being closed
	// until the final iterator closes.
	if r.Properties.IndexType == twoLevelIndex {
		i := twoLevelIterPool.Get().(*twoLevelIterator)
		err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, hideObsoletePoints, stats, rp, nil /* bufferPool */)
		if err != nil {
			return nil, err
		}
		return i, nil
	}

	i := singleLevelIterPool.Get().(*singleLevelIterator)
	err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, hideObsoletePoints, stats, rp, nil /* bufferPool */)
	if err != nil {
		return nil, err
	}
	return i, nil
}
   334  
// NewIter returns an iterator for the contents of the table. If an error
// occurs, NewIter cleans up after itself and returns a nil iterator. NewIter
// must only be used when the Reader is guaranteed to outlive any LazyValues
// returned from the iter.
func (r *Reader) NewIter(lower, upper []byte) (Iterator, error) {
	// No block property filtering, filter block enabled, no stats; the
	// TrivialReaderProvider pins this reader for value retrieval.
	return r.NewIterWithBlockPropertyFilters(
		lower, upper, nil, true /* useFilterBlock */, nil, /* stats */
		TrivialReaderProvider{Reader: r})
}
   344  
// NewCompactionIter returns an iterator similar to NewIter but it also increments
// the number of bytes iterated. If an error occurs, NewCompactionIter cleans up
// after itself and returns a nil iterator.
func (r *Reader) NewCompactionIter(
	bytesIterated *uint64, rp ReaderProvider, bufferPool *BufferPool,
) (Iterator, error) {
	// Physical-table path: no virtual state.
	return r.newCompactionIter(bytesIterated, rp, nil, bufferPool)
}
   353  
// newCompactionIter builds a compaction iterator (two-level or single-level
// depending on the index type), configured for sequential scans: no bounds,
// no filter block, and obsolete points hidden only for foreign virtual
// tables. The wrapper types accumulate bytesIterated as the scan progresses.
func (r *Reader) newCompactionIter(
	bytesIterated *uint64, rp ReaderProvider, v *virtualState, bufferPool *BufferPool,
) (Iterator, error) {
	if r.Properties.IndexType == twoLevelIndex {
		i := twoLevelIterPool.Get().(*twoLevelIterator)
		err := i.init(
			context.Background(),
			r, v, nil /* lower */, nil /* upper */, nil,
			false /* useFilter */, v != nil && v.isForeign, /* hideObsoletePoints */
			nil /* stats */, rp, bufferPool,
		)
		if err != nil {
			return nil, err
		}
		i.setupForCompaction()
		return &twoLevelCompactionIterator{
			twoLevelIterator: i,
			bytesIterated:    bytesIterated,
		}, nil
	}
	i := singleLevelIterPool.Get().(*singleLevelIterator)
	err := i.init(
		context.Background(), r, v, nil /* lower */, nil, /* upper */
		nil, false /* useFilter */, v != nil && v.isForeign, /* hideObsoletePoints */
		nil /* stats */, rp, bufferPool,
	)
	if err != nil {
		return nil, err
	}
	i.setupForCompaction()
	return &compactionIterator{
		singleLevelIterator: i,
		bytesIterated:       bytesIterated,
	}, nil
}
   389  
   390  // NewRawRangeDelIter returns an internal iterator for the contents of the
   391  // range-del block for the table. Returns nil if the table does not contain
   392  // any range deletions.
   393  //
   394  // TODO(sumeer): plumb context.Context since this path is relevant in the user-facing
   395  // iterator. Add WithContext methods since the existing ones are public.
   396  func (r *Reader) NewRawRangeDelIter() (keyspan.FragmentIterator, error) {
   397  	if r.rangeDelBH.Length == 0 {
   398  		return nil, nil
   399  	}
   400  	h, err := r.readRangeDel(nil /* stats */)
   401  	if err != nil {
   402  		return nil, err
   403  	}
   404  	i := &fragmentBlockIter{elideSameSeqnum: true}
   405  	if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum, false); err != nil {
   406  		return nil, err
   407  	}
   408  	return i, nil
   409  }
   410  
   411  // NewRawRangeKeyIter returns an internal iterator for the contents of the
   412  // range-key block for the table. Returns nil if the table does not contain any
   413  // range keys.
   414  //
   415  // TODO(sumeer): plumb context.Context since this path is relevant in the user-facing
   416  // iterator. Add WithContext methods since the existing ones are public.
   417  func (r *Reader) NewRawRangeKeyIter() (keyspan.FragmentIterator, error) {
   418  	if r.rangeKeyBH.Length == 0 {
   419  		return nil, nil
   420  	}
   421  	h, err := r.readRangeKey(nil /* stats */)
   422  	if err != nil {
   423  		return nil, err
   424  	}
   425  	i := rangeKeyFragmentBlockIterPool.Get().(*rangeKeyFragmentBlockIter)
   426  	if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum, false); err != nil {
   427  		return nil, err
   428  	}
   429  	return i, nil
   430  }
   431  
// rangeKeyFragmentBlockIter wraps fragmentBlockIter so that Close returns the
// iterator to rangeKeyFragmentBlockIterPool for reuse.
type rangeKeyFragmentBlockIter struct {
	fragmentBlockIter
}

// Close closes the embedded iterator, resets it for reuse and puts the
// wrapper back in the pool. Any close error is still reported to the caller.
func (i *rangeKeyFragmentBlockIter) Close() error {
	err := i.fragmentBlockIter.Close()
	i.fragmentBlockIter = i.fragmentBlockIter.resetForReuse()
	rangeKeyFragmentBlockIterPool.Put(i)
	return err
}
   442  
// readIndex reads the (top-level) index block, tagging the trace context as a
// metadata read.
func (r *Reader) readIndex(
	ctx context.Context, stats *base.InternalIteratorStats,
) (bufferHandle, error) {
	ctx = objiotracing.WithBlockType(ctx, objiotracing.MetadataBlock)
	return r.readBlock(ctx, r.indexBH, nil, nil, stats, nil /* buffer pool */)
}
   449  
// readFilter reads the filter block, tagging the trace context accordingly.
func (r *Reader) readFilter(
	ctx context.Context, stats *base.InternalIteratorStats,
) (bufferHandle, error) {
	ctx = objiotracing.WithBlockType(ctx, objiotracing.FilterBlock)
	return r.readBlock(ctx, r.filterBH, nil /* transform */, nil /* readHandle */, stats, nil /* buffer pool */)
}
   456  
// readRangeDel reads the range-del block, applying rangeDelTransform (v1→v2
// conversion) when it is set.
func (r *Reader) readRangeDel(stats *base.InternalIteratorStats) (bufferHandle, error) {
	ctx := objiotracing.WithBlockType(context.Background(), objiotracing.MetadataBlock)
	return r.readBlock(ctx, r.rangeDelBH, r.rangeDelTransform, nil /* readHandle */, stats, nil /* buffer pool */)
}
   461  
// readRangeKey reads the range-key block with no transform.
func (r *Reader) readRangeKey(stats *base.InternalIteratorStats) (bufferHandle, error) {
	ctx := objiotracing.WithBlockType(context.Background(), objiotracing.MetadataBlock)
	return r.readBlock(ctx, r.rangeKeyBH, nil /* transform */, nil /* readHandle */, stats, nil /* buffer pool */)
}
   466  
   467  func checkChecksum(
   468  	checksumType ChecksumType, b []byte, bh BlockHandle, fileNum base.FileNum,
   469  ) error {
   470  	expectedChecksum := binary.LittleEndian.Uint32(b[bh.Length+1:])
   471  	var computedChecksum uint32
   472  	switch checksumType {
   473  	case ChecksumTypeCRC32c:
   474  		computedChecksum = crc.New(b[:bh.Length+1]).Value()
   475  	case ChecksumTypeXXHash64:
   476  		computedChecksum = uint32(xxhash.Sum64(b[:bh.Length+1]))
   477  	default:
   478  		return errors.Errorf("unsupported checksum type: %d", checksumType)
   479  	}
   480  
   481  	if expectedChecksum != computedChecksum {
   482  		return base.CorruptionErrorf(
   483  			"pebble/table: invalid table %s (checksum mismatch at %d/%d)",
   484  			errors.Safe(fileNum), errors.Safe(bh.Offset), errors.Safe(bh.Length))
   485  	}
   486  	return nil
   487  }
   488  
// cacheValueOrBuf holds block storage backed by exactly one of two sources:
// a BufferPool buffer or a block-cache value.
type cacheValueOrBuf struct {
	// buf.Valid() returns true if backed by a BufferPool.
	buf Buf
	// v is non-nil if backed by the block cache.
	v *cache.Value
}
   495  
   496  func (b cacheValueOrBuf) get() []byte {
   497  	if b.buf.Valid() {
   498  		return b.buf.p.pool[b.buf.i].b
   499  	}
   500  	return b.v.Buf()
   501  }
   502  
   503  func (b cacheValueOrBuf) release() {
   504  	if b.buf.Valid() {
   505  		b.buf.Release()
   506  	} else {
   507  		cache.Free(b.v)
   508  	}
   509  }
   510  
   511  func (b cacheValueOrBuf) truncate(n int) {
   512  	if b.buf.Valid() {
   513  		b.buf.p.pool[b.buf.i].b = b.buf.p.pool[b.buf.i].b[:n]
   514  	} else {
   515  		b.v.Truncate(n)
   516  	}
   517  }
   518  
// readBlock returns the block at bh, consulting the block cache first. On a
// miss it reads from the readable (optionally via readHandle), verifies the
// checksum, decompresses, applies the optional transform, and — unless the
// block was allocated from bufferPool — inserts the result into the block
// cache. The returned bufferHandle owns the block's lifetime.
func (r *Reader) readBlock(
	ctx context.Context,
	bh BlockHandle,
	transform blockTransform,
	readHandle objstorage.ReadHandle,
	stats *base.InternalIteratorStats,
	bufferPool *BufferPool,
) (handle bufferHandle, _ error) {
	if h := r.opts.Cache.Get(r.cacheID, r.fileNum, bh.Offset); h.Get() != nil {
		// Cache hit.
		if readHandle != nil {
			readHandle.RecordCacheHit(ctx, int64(bh.Offset), int64(bh.Length+blockTrailerLen))
		}
		if stats != nil {
			stats.BlockBytes += bh.Length
			stats.BlockBytesInCache += bh.Length
		}
		// This block is already in the cache; return a handle to existing value
		// in the cache.
		return bufferHandle{h: h}, nil
	}

	// Cache miss.

	if sema := r.opts.LoadBlockSema; sema != nil {
		if err := sema.Acquire(ctx, 1); err != nil {
			// An error here can only come from the context.
			return bufferHandle{}, err
		}
		defer sema.Release(1)
	}

	// Allocate space for the compressed block plus its trailer, from the
	// buffer pool when one was supplied, else from the block cache allocator.
	var compressed cacheValueOrBuf
	if bufferPool != nil {
		compressed = cacheValueOrBuf{
			buf: bufferPool.Alloc(int(bh.Length + blockTrailerLen)),
		}
	} else {
		compressed = cacheValueOrBuf{
			v: cache.Alloc(int(bh.Length + blockTrailerLen)),
		}
	}

	readStartTime := time.Now()
	var err error
	if readHandle != nil {
		err = readHandle.ReadAt(ctx, compressed.get(), int64(bh.Offset))
	} else {
		err = r.readable.ReadAt(ctx, compressed.get(), int64(bh.Offset))
	}
	readDuration := time.Since(readStartTime)
	// TODO(sumeer): should the threshold be configurable.
	const slowReadTracingThreshold = 5 * time.Millisecond
	// The invariants.Enabled path is for deterministic testing.
	if invariants.Enabled {
		readDuration = slowReadTracingThreshold
	}
	// Call IsTracingEnabled to avoid the allocations of boxing integers into an
	// interface{}, unless necessary.
	if readDuration >= slowReadTracingThreshold && r.opts.LoggerAndTracer.IsTracingEnabled(ctx) {
		r.opts.LoggerAndTracer.Eventf(ctx, "reading %d bytes took %s",
			int(bh.Length+blockTrailerLen), readDuration.String())
	}
	if stats != nil {
		stats.BlockBytes += bh.Length
		stats.BlockReadDuration += readDuration
	}
	if err != nil {
		compressed.release()
		return bufferHandle{}, err
	}
	if err := checkChecksum(r.checksumType, compressed.get(), bh, r.fileNum.FileNum()); err != nil {
		compressed.release()
		return bufferHandle{}, err
	}

	// The byte immediately after the payload encodes the compression type.
	typ := blockType(compressed.get()[bh.Length])
	compressed.truncate(int(bh.Length))

	var decompressed cacheValueOrBuf
	if typ == noCompressionBlockType {
		decompressed = compressed
	} else {
		// Decode the length of the decompressed value.
		decodedLen, prefixLen, err := decompressedLen(typ, compressed.get())
		if err != nil {
			compressed.release()
			return bufferHandle{}, err
		}

		if bufferPool != nil {
			decompressed = cacheValueOrBuf{buf: bufferPool.Alloc(decodedLen)}
		} else {
			decompressed = cacheValueOrBuf{v: cache.Alloc(decodedLen)}
		}
		if _, err := decompressInto(typ, compressed.get()[prefixLen:], decompressed.get()); err != nil {
			compressed.release()
			return bufferHandle{}, err
		}
		compressed.release()
	}

	if transform != nil {
		// Transforming blocks is very rare, so the extra copy of the
		// transformed data is not problematic.
		tmpTransformed, err := transform(decompressed.get())
		if err != nil {
			decompressed.release()
			return bufferHandle{}, err
		}

		var transformed cacheValueOrBuf
		if bufferPool != nil {
			transformed = cacheValueOrBuf{buf: bufferPool.Alloc(len(tmpTransformed))}
		} else {
			transformed = cacheValueOrBuf{v: cache.Alloc(len(tmpTransformed))}
		}
		copy(transformed.get(), tmpTransformed)
		decompressed.release()
		decompressed = transformed
	}

	if decompressed.buf.Valid() {
		// Pool-backed blocks bypass the block cache entirely.
		return bufferHandle{b: decompressed.buf}, nil
	}
	h := r.opts.Cache.Set(r.cacheID, r.fileNum, bh.Offset, decompressed.v)
	return bufferHandle{h: h}, nil
}
   647  
// transformRangeDelV1 rewrites a v1 (RocksDB format) range-del block into the
// v2 format: it reads every tombstone, sorts them, fragments overlapping
// spans, and re-encodes the result with a fresh block writer.
func (r *Reader) transformRangeDelV1(b []byte) ([]byte, error) {
	// Convert v1 (RocksDB format) range-del blocks to v2 blocks on the fly. The
	// v1 format range-del blocks have unfragmented and unsorted range
	// tombstones. We need properly fragmented and sorted range tombstones in
	// order to serve from them directly.
	iter := &blockIter{}
	if err := iter.init(r.Compare, b, r.Properties.GlobalSeqNum, false); err != nil {
		return nil, err
	}
	// Collect every tombstone as a single-key span.
	var tombstones []keyspan.Span
	for key, value := iter.First(); key != nil; key, value = iter.Next() {
		t := keyspan.Span{
			Start: key.UserKey,
			End:   value.InPlaceValue(),
			Keys:  []keyspan.Key{{Trailer: key.Trailer}},
		}
		tombstones = append(tombstones, t)
	}
	keyspan.Sort(r.Compare, tombstones)

	// Fragment the tombstones, outputting them directly to a block writer.
	rangeDelBlock := blockWriter{
		restartInterval: 1,
	}
	frag := keyspan.Fragmenter{
		Cmp:    r.Compare,
		Format: r.FormatKey,
		Emit: func(s keyspan.Span) {
			for _, k := range s.Keys {
				startIK := InternalKey{UserKey: s.Start, Trailer: k.Trailer}
				rangeDelBlock.add(startIK, s.End)
			}
		},
	}
	for i := range tombstones {
		frag.Add(tombstones[i])
	}
	frag.Finish()

	// Return the contents of the constructed v2 format range-del block.
	return rangeDelBlock.finish(), nil
}
   690  
// readMetaindex loads the metaindex block at metaindexBH and populates the
// reader's block handles: properties, range-del (v1 or v2), range-key, the
// value-blocks index, and the table filter, when present.
func (r *Reader) readMetaindex(metaindexBH BlockHandle) error {
	// We use a BufferPool when reading metaindex blocks in order to avoid
	// populating the block cache with these blocks. In heavy-write workloads,
	// especially with high compaction concurrency, new tables may be created
	// frequently. Populating the block cache with these metaindex blocks adds
	// additional contention on the block cache mutexes (see #1997).
	// Additionally, these blocks are exceedingly unlikely to be read again
	// while they're still in the block cache except in misconfigurations with
	// excessive sstables counts or a table cache that's far too small.
	r.metaBufferPool.initPreallocated(r.metaBufferPoolAlloc[:0])
	// When we're finished, release the buffers we've allocated back to memory
	// allocator. We don't expect to use metaBufferPool again.
	defer r.metaBufferPool.Release()

	b, err := r.readBlock(
		context.Background(), metaindexBH, nil /* transform */, nil /* readHandle */, nil /* stats */, &r.metaBufferPool)
	if err != nil {
		return err
	}
	data := b.Get()
	defer b.Release()

	if uint64(len(data)) != metaindexBH.Length {
		return base.CorruptionErrorf("pebble/table: unexpected metaindex block size: %d vs %d",
			errors.Safe(len(data)), errors.Safe(metaindexBH.Length))
	}

	i, err := newRawBlockIter(bytes.Compare, data)
	if err != nil {
		return err
	}

	// Each metaindex entry maps a block name to an encoded handle. The
	// value-blocks index uses its own handle encoding; everything else is a
	// plain BlockHandle. Both decoders must consume the value exactly.
	meta := map[string]BlockHandle{}
	for valid := i.First(); valid; valid = i.Next() {
		value := i.Value()
		if bytes.Equal(i.Key().UserKey, []byte(metaValueIndexName)) {
			vbih, n, err := decodeValueBlocksIndexHandle(i.Value())
			if err != nil {
				return err
			}
			if n == 0 || n != len(value) {
				return base.CorruptionErrorf("pebble/table: invalid table (bad value blocks index handle)")
			}
			r.valueBIH = vbih
		} else {
			bh, n := decodeBlockHandle(value)
			if n == 0 || n != len(value) {
				return base.CorruptionErrorf("pebble/table: invalid table (bad block handle)")
			}
			meta[string(i.Key().UserKey)] = bh
		}
	}
	if err := i.Close(); err != nil {
		return err
	}

	if bh, ok := meta[metaPropertiesName]; ok {
		b, err = r.readBlock(
			context.Background(), bh, nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */)
		if err != nil {
			return err
		}
		r.propertiesBH = bh
		err := r.Properties.load(b.Get(), bh.Offset, r.opts.DeniedUserProperties)
		b.Release()
		if err != nil {
			return err
		}
	}

	// Prefer the v2 range-del block; fall back to v1, installing the on-read
	// transform unless raw tombstones were requested.
	if bh, ok := meta[metaRangeDelV2Name]; ok {
		r.rangeDelBH = bh
	} else if bh, ok := meta[metaRangeDelName]; ok {
		r.rangeDelBH = bh
		if !r.rawTombstones {
			r.rangeDelTransform = r.transformRangeDelV1
		}
	}

	if bh, ok := meta[metaRangeKeyName]; ok {
		r.rangeKeyBH = bh
	}

	// Install the first configured filter policy that has a matching filter
	// block in the table. NOTE(review): map iteration order is random, so with
	// multiple configured policies matching, the winner is arbitrary — confirm
	// callers configure at most one matching policy.
	for name, fp := range r.opts.Filters {
		types := []struct {
			ftype  FilterType
			prefix string
		}{
			{TableFilter, "fullfilter."},
		}
		var done bool
		for _, t := range types {
			if bh, ok := meta[t.prefix+name]; ok {
				r.filterBH = bh

				switch t.ftype {
				case TableFilter:
					r.tableFilter = newTableFilterReader(fp)
				default:
					return base.CorruptionErrorf("unknown filter type: %v", errors.Safe(t.ftype))
				}

				done = true
				break
			}
		}
		if done {
			break
		}
	}
	return nil
}
   803  
   804  // Layout returns the layout (block organization) for an sstable.
   805  func (r *Reader) Layout() (*Layout, error) {
   806  	if r.err != nil {
   807  		return nil, r.err
   808  	}
   809  
   810  	l := &Layout{
   811  		Data:       make([]BlockHandleWithProperties, 0, r.Properties.NumDataBlocks),
   812  		Filter:     r.filterBH,
   813  		RangeDel:   r.rangeDelBH,
   814  		RangeKey:   r.rangeKeyBH,
   815  		ValueIndex: r.valueBIH.h,
   816  		Properties: r.propertiesBH,
   817  		MetaIndex:  r.metaIndexBH,
   818  		Footer:     r.footerBH,
   819  		Format:     r.tableFormat,
   820  	}
   821  
   822  	indexH, err := r.readIndex(context.Background(), nil)
   823  	if err != nil {
   824  		return nil, err
   825  	}
   826  	defer indexH.Release()
   827  
   828  	var alloc bytealloc.A
   829  
   830  	if r.Properties.IndexPartitions == 0 {
   831  		l.Index = append(l.Index, r.indexBH)
   832  		iter, _ := newBlockIter(r.Compare, indexH.Get())
   833  		for key, value := iter.First(); key != nil; key, value = iter.Next() {
   834  			dataBH, err := decodeBlockHandleWithProperties(value.InPlaceValue())
   835  			if err != nil {
   836  				return nil, errCorruptIndexEntry
   837  			}
   838  			if len(dataBH.Props) > 0 {
   839  				alloc, dataBH.Props = alloc.Copy(dataBH.Props)
   840  			}
   841  			l.Data = append(l.Data, dataBH)
   842  		}
   843  	} else {
   844  		l.TopIndex = r.indexBH
   845  		topIter, _ := newBlockIter(r.Compare, indexH.Get())
   846  		iter := &blockIter{}
   847  		for key, value := topIter.First(); key != nil; key, value = topIter.Next() {
   848  			indexBH, err := decodeBlockHandleWithProperties(value.InPlaceValue())
   849  			if err != nil {
   850  				return nil, errCorruptIndexEntry
   851  			}
   852  			l.Index = append(l.Index, indexBH.BlockHandle)
   853  
   854  			subIndex, err := r.readBlock(context.Background(), indexBH.BlockHandle,
   855  				nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */)
   856  			if err != nil {
   857  				return nil, err
   858  			}
   859  			if err := iter.init(r.Compare, subIndex.Get(), 0, /* globalSeqNum */
   860  				false /* hideObsoletePoints */); err != nil {
   861  				return nil, err
   862  			}
   863  			for key, value := iter.First(); key != nil; key, value = iter.Next() {
   864  				dataBH, err := decodeBlockHandleWithProperties(value.InPlaceValue())
   865  				if len(dataBH.Props) > 0 {
   866  					alloc, dataBH.Props = alloc.Copy(dataBH.Props)
   867  				}
   868  				if err != nil {
   869  					return nil, errCorruptIndexEntry
   870  				}
   871  				l.Data = append(l.Data, dataBH)
   872  			}
   873  			subIndex.Release()
   874  			*iter = iter.resetForReuse()
   875  		}
   876  	}
   877  	if r.valueBIH.h.Length != 0 {
   878  		vbiH, err := r.readBlock(context.Background(), r.valueBIH.h, nil, nil, nil, nil /* buffer pool */)
   879  		if err != nil {
   880  			return nil, err
   881  		}
   882  		defer vbiH.Release()
   883  		vbiBlock := vbiH.Get()
   884  		indexEntryLen := int(r.valueBIH.blockNumByteLength + r.valueBIH.blockOffsetByteLength +
   885  			r.valueBIH.blockLengthByteLength)
   886  		i := 0
   887  		for len(vbiBlock) != 0 {
   888  			if len(vbiBlock) < indexEntryLen {
   889  				return nil, errors.Errorf(
   890  					"remaining value index block %d does not contain a full entry of length %d",
   891  					len(vbiBlock), indexEntryLen)
   892  			}
   893  			n := int(r.valueBIH.blockNumByteLength)
   894  			bn := int(littleEndianGet(vbiBlock, n))
   895  			if bn != i {
   896  				return nil, errors.Errorf("unexpected block num %d, expected %d",
   897  					bn, i)
   898  			}
   899  			i++
   900  			vbiBlock = vbiBlock[n:]
   901  			n = int(r.valueBIH.blockOffsetByteLength)
   902  			blockOffset := littleEndianGet(vbiBlock, n)
   903  			vbiBlock = vbiBlock[n:]
   904  			n = int(r.valueBIH.blockLengthByteLength)
   905  			blockLen := littleEndianGet(vbiBlock, n)
   906  			vbiBlock = vbiBlock[n:]
   907  			l.ValueBlock = append(l.ValueBlock, BlockHandle{Offset: blockOffset, Length: blockLen})
   908  		}
   909  	}
   910  
   911  	return l, nil
   912  }
   913  
   914  // ValidateBlockChecksums validates the checksums for each block in the SSTable.
   915  func (r *Reader) ValidateBlockChecksums() error {
   916  	// Pre-compute the BlockHandles for the underlying file.
   917  	l, err := r.Layout()
   918  	if err != nil {
   919  		return err
   920  	}
   921  
   922  	// Construct the set of blocks to check. Note that the footer is not checked
   923  	// as it is not a block with a checksum.
   924  	blocks := make([]BlockHandle, len(l.Data))
   925  	for i := range l.Data {
   926  		blocks[i] = l.Data[i].BlockHandle
   927  	}
   928  	blocks = append(blocks, l.Index...)
   929  	blocks = append(blocks, l.TopIndex, l.Filter, l.RangeDel, l.RangeKey, l.Properties, l.MetaIndex)
   930  
   931  	// Sorting by offset ensures we are performing a sequential scan of the
   932  	// file.
   933  	sort.Slice(blocks, func(i, j int) bool {
   934  		return blocks[i].Offset < blocks[j].Offset
   935  	})
   936  
   937  	// Check all blocks sequentially. Make use of read-ahead, given we are
   938  	// scanning the entire file from start to end.
   939  	rh := r.readable.NewReadHandle(context.TODO())
   940  	defer rh.Close()
   941  
   942  	for _, bh := range blocks {
   943  		// Certain blocks may not be present, in which case we skip them.
   944  		if bh.Length == 0 {
   945  			continue
   946  		}
   947  
   948  		// Read the block, which validates the checksum.
   949  		h, err := r.readBlock(context.Background(), bh, nil, rh, nil, nil /* buffer pool */)
   950  		if err != nil {
   951  			return err
   952  		}
   953  		h.Release()
   954  	}
   955  
   956  	return nil
   957  }
   958  
// CommonProperties implements the CommonReader interface.
func (r *Reader) CommonProperties() *CommonProperties {
	return &r.Properties.CommonProperties
}
   963  
// EstimateDiskUsage returns the total size of data blocks overlapping the range
// `[start, end]`. Even if a data block partially overlaps, or we cannot
// determine overlap due to abbreviated index keys, the full data block size is
// included in the estimation.
//
// This function does not account for any metablock space usage. Assumes there
// is at least partial overlap, i.e., `[start, end]` falls neither completely
// before nor completely after the file's range.
//
// Only blocks containing point keys are considered. Range deletion and range
// key blocks are not considered.
//
// TODO(ajkr): account for metablock space usage. Perhaps look at the fraction of
// data blocks overlapped and add that same fraction of the metadata blocks to the
// estimate.
func (r *Reader) EstimateDiskUsage(start, end []byte) (uint64, error) {
	if r.err != nil {
		return 0, r.err
	}

	indexH, err := r.readIndex(context.Background(), nil)
	if err != nil {
		return 0, err
	}
	defer indexH.Release()

	// Iterators over the bottom-level index blocks containing start and end.
	// These may be different in case of partitioned index but will both point
	// to the same blockIter over the single index in the unpartitioned case.
	var startIdxIter, endIdxIter *blockIter
	if r.Properties.IndexPartitions == 0 {
		iter, err := newBlockIter(r.Compare, indexH.Get())
		if err != nil {
			return 0, err
		}
		startIdxIter = iter
		endIdxIter = iter
	} else {
		// Partitioned (two-level) index: use the top-level index to locate the
		// second-level index blocks containing start and end respectively.
		topIter, err := newBlockIter(r.Compare, indexH.Get())
		if err != nil {
			return 0, err
		}

		key, val := topIter.SeekGE(start, base.SeekGEFlagsNone)
		if key == nil {
			// The range falls completely after this file, or an error occurred.
			return 0, topIter.Error()
		}
		startIdxBH, err := decodeBlockHandleWithProperties(val.InPlaceValue())
		if err != nil {
			return 0, errCorruptIndexEntry
		}
		startIdxBlock, err := r.readBlock(context.Background(), startIdxBH.BlockHandle,
			nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */)
		if err != nil {
			return 0, err
		}
		defer startIdxBlock.Release()
		startIdxIter, err = newBlockIter(r.Compare, startIdxBlock.Get())
		if err != nil {
			return 0, err
		}

		key, val = topIter.SeekGE(end, base.SeekGEFlagsNone)
		if key == nil {
			// endIdxIter stays nil: the range extends past the last index
			// partition. This is handled below.
			if err := topIter.Error(); err != nil {
				return 0, err
			}
		} else {
			endIdxBH, err := decodeBlockHandleWithProperties(val.InPlaceValue())
			if err != nil {
				return 0, errCorruptIndexEntry
			}
			endIdxBlock, err := r.readBlock(context.Background(),
				endIdxBH.BlockHandle, nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */)
			if err != nil {
				return 0, err
			}
			defer endIdxBlock.Release()
			endIdxIter, err = newBlockIter(r.Compare, endIdxBlock.Get())
			if err != nil {
				return 0, err
			}
		}
	}
	// startIdxIter should not be nil at this point, while endIdxIter can be if the
	// range spans past the end of the file.

	key, val := startIdxIter.SeekGE(start, base.SeekGEFlagsNone)
	if key == nil {
		// The range falls completely after this file, or an error occurred.
		return 0, startIdxIter.Error()
	}
	startBH, err := decodeBlockHandleWithProperties(val.InPlaceValue())
	if err != nil {
		return 0, errCorruptIndexEntry
	}

	includeInterpolatedValueBlocksSize := func(dataBlockSize uint64) uint64 {
		// INVARIANT: r.Properties.DataSize > 0 since startIdxIter is not nil.
		// Linearly interpolate what is stored in value blocks.
		//
		// TODO(sumeer): if we need more accuracy, without loading any data blocks
		// (which contain the value handles, and which may also be insufficient if
		// the values are in separate files), we will need to accumulate the
		// logical size of the key-value pairs and store the cumulative value for
		// each data block in the index block entry. This increases the size of
		// the BlockHandle, so wait until this becomes necessary.
		return dataBlockSize +
			uint64((float64(dataBlockSize)/float64(r.Properties.DataSize))*
				float64(r.Properties.ValueBlocksSize))
	}
	if endIdxIter == nil {
		// The range spans beyond this file. Include data blocks through the last.
		return includeInterpolatedValueBlocksSize(r.Properties.DataSize - startBH.Offset), nil
	}
	key, val = endIdxIter.SeekGE(end, base.SeekGEFlagsNone)
	if key == nil {
		if err := endIdxIter.Error(); err != nil {
			return 0, err
		}
		// The range spans beyond this file. Include data blocks through the last.
		return includeInterpolatedValueBlocksSize(r.Properties.DataSize - startBH.Offset), nil
	}
	endBH, err := decodeBlockHandleWithProperties(val.InPlaceValue())
	if err != nil {
		return 0, errCorruptIndexEntry
	}
	return includeInterpolatedValueBlocksSize(
		endBH.Offset + endBH.Length + blockTrailerLen - startBH.Offset), nil
}
  1095  
  1096  // TableFormat returns the format version for the table.
  1097  func (r *Reader) TableFormat() (TableFormat, error) {
  1098  	if r.err != nil {
  1099  		return TableFormatUnspecified, r.err
  1100  	}
  1101  	return r.tableFormat, nil
  1102  }
  1103  
// NewReader returns a new table reader for the file. Closing the reader will
// close the file.
func NewReader(f objstorage.Readable, o ReaderOptions, extraOpts ...ReaderOption) (*Reader, error) {
	o = o.ensureDefaults()
	r := &Reader{
		readable: f,
		opts:     o,
	}
	// Either create a private cache or take a reference on the provided one;
	// the matching release happens in Close.
	if r.opts.Cache == nil {
		r.opts.Cache = cache.New(0)
	} else {
		r.opts.Cache.Ref()
	}

	// The nil check happens after the cache setup above so that the Close call
	// below can unconditionally release the cache reference.
	if f == nil {
		r.err = errors.New("pebble/table: nil file")
		return nil, r.Close()
	}

	// Note that the extra options are applied twice. First here for pre-apply
	// options, and then below for post-apply options. Pre and post refer to
	// before and after reading the metaindex and properties.
	type preApply interface{ preApply() }
	for _, opt := range extraOpts {
		if _, ok := opt.(preApply); ok {
			opt.readerApply(r)
		}
	}
	// A pre-apply option may have supplied a cache ID; otherwise allocate one.
	if r.cacheID == 0 {
		r.cacheID = r.opts.Cache.NewID()
	}

	footer, err := readFooter(f)
	if err != nil {
		r.err = err
		return nil, r.Close()
	}
	r.checksumType = footer.checksum
	r.tableFormat = footer.format
	// Read the metaindex.
	if err := r.readMetaindex(footer.metaindexBH); err != nil {
		r.err = err
		return nil, r.Close()
	}
	r.indexBH = footer.indexBH
	r.metaIndexBH = footer.metaindexBH
	r.footerBH = footer.footerBH

	// Only install the comparer if it matches the name recorded in the table's
	// properties (or the table recorded none).
	if r.Properties.ComparerName == "" || o.Comparer.Name == r.Properties.ComparerName {
		r.Compare = o.Comparer.Compare
		r.FormatKey = o.Comparer.FormatKey
		r.Split = o.Comparer.Split
	}

	if o.MergerName == r.Properties.MergerName {
		r.mergerOK = true
	}

	// Apply the extra options again now that the comparer and merger names are
	// known.
	for _, opt := range extraOpts {
		if _, ok := opt.(preApply); !ok {
			opt.readerApply(r)
		}
	}

	// A post-apply option may have installed a matching comparer or merger; if
	// not, the table is unreadable with the supplied options.
	if r.Compare == nil {
		r.err = errors.Errorf("pebble/table: %d: unknown comparer %s",
			errors.Safe(r.fileNum), errors.Safe(r.Properties.ComparerName))
	}
	if !r.mergerOK {
		// "nullptr" is the merger name recorded by RocksDB when no merger was
		// configured; treat it like an empty name.
		if name := r.Properties.MergerName; name != "" && name != "nullptr" {
			r.err = errors.Errorf("pebble/table: %d: unknown merger %s",
				errors.Safe(r.fileNum), errors.Safe(r.Properties.MergerName))
		}
	}
	if r.err != nil {
		return nil, r.Close()
	}

	return r, nil
}
  1186  
// ReadableFile describes the smallest subset of vfs.File that is required for
// reading SSTs: random-access reads, closing, and size discovery via Stat.
type ReadableFile interface {
	io.ReaderAt
	io.Closer
	Stat() (os.FileInfo, error)
}
  1194  
  1195  // NewSimpleReadable wraps a ReadableFile in a objstorage.Readable
  1196  // implementation (which does not support read-ahead)
  1197  func NewSimpleReadable(r ReadableFile) (objstorage.Readable, error) {
  1198  	info, err := r.Stat()
  1199  	if err != nil {
  1200  		return nil, err
  1201  	}
  1202  	res := &simpleReadable{
  1203  		f:    r,
  1204  		size: info.Size(),
  1205  	}
  1206  	res.rh = objstorage.MakeNoopReadHandle(res)
  1207  	return res, nil
  1208  }
  1209  
// simpleReadable wraps a ReadableFile to implement objstorage.Readable.
type simpleReadable struct {
	f    ReadableFile
	size int64 // captured from f.Stat() at construction time
	rh   objstorage.NoopReadHandle
}

// Assert at compile time that simpleReadable satisfies objstorage.Readable.
var _ objstorage.Readable = (*simpleReadable)(nil)
  1218  
  1219  // ReadAt is part of the objstorage.Readable interface.
  1220  func (s *simpleReadable) ReadAt(_ context.Context, p []byte, off int64) error {
  1221  	n, err := s.f.ReadAt(p, off)
  1222  	if invariants.Enabled && err == nil && n != len(p) {
  1223  		panic("short read")
  1224  	}
  1225  	return err
  1226  }
  1227  
// Close is part of the objstorage.Readable interface. It closes the wrapped
// file.
func (s *simpleReadable) Close() error {
	return s.f.Close()
}
  1232  
// Size is part of the objstorage.Readable interface. It returns the file size
// captured at construction time.
func (s *simpleReadable) Size() int64 {
	return s.size
}
  1237  
// NewReadHandle is part of the objstorage.Readable interface. It returns the
// pre-constructed no-op handle (no read-ahead support).
func (s *simpleReadable) NewReadHandle(_ context.Context) objstorage.ReadHandle {
	return &s.rh
}