github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/sstable/reader.go

github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/sstable/reader.go (about)

     1  // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package sstable
     6  
     7  import (
     8  	"bytes"
     9  	"context"
    10  	"encoding/binary"
    11  	"io"
    12  	"os"
    13  	"sort"
    14  	"time"
    15  
    16  	"github.com/cespare/xxhash/v2"
    17  	"github.com/cockroachdb/errors"
    18  	"github.com/cockroachdb/pebble/internal/base"
    19  	"github.com/cockroachdb/pebble/internal/bytealloc"
    20  	"github.com/cockroachdb/pebble/internal/cache"
    21  	"github.com/cockroachdb/pebble/internal/crc"
    22  	"github.com/cockroachdb/pebble/internal/invariants"
    23  	"github.com/cockroachdb/pebble/internal/keyspan"
    24  	"github.com/cockroachdb/pebble/internal/private"
    25  	"github.com/cockroachdb/pebble/objstorage"
    26  	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing"
    27  )
    28  
    29  var errCorruptIndexEntry = base.CorruptionErrorf("pebble/table: corrupt index entry")
    30  var errReaderClosed = errors.New("pebble/table: reader is closed")
    31  
    32  // decodeBlockHandle returns the block handle encoded at the start of src, as
    33  // well as the number of bytes it occupies. It returns zero if given invalid
    34  // input. A block handle for a data block or a first/lower level index block
    35  // should not be decoded using decodeBlockHandle since the caller may validate
    36  // that the number of bytes decoded is equal to the length of src, which will
    37  // be false if the properties are not decoded. In those cases the caller
    38  // should use decodeBlockHandleWithProperties.
    39  func decodeBlockHandle(src []byte) (BlockHandle, int) {
    40  	offset, n := binary.Uvarint(src)
    41  	length, m := binary.Uvarint(src[n:])
    42  	if n == 0 || m == 0 {
    43  		return BlockHandle{}, 0
    44  	}
    45  	return BlockHandle{offset, length}, n + m
    46  }
    47  
    48  // decodeBlockHandleWithProperties returns the block handle and properties
    49  // encoded in src. src needs to be exactly the length that was encoded. This
    50  // method must be used for data block and first/lower level index blocks. The
    51  // properties in the block handle point to the bytes in src.
    52  func decodeBlockHandleWithProperties(src []byte) (BlockHandleWithProperties, error) {
    53  	bh, n := decodeBlockHandle(src)
    54  	if n == 0 {
    55  		return BlockHandleWithProperties{}, errors.Errorf("invalid BlockHandle")
    56  	}
    57  	return BlockHandleWithProperties{
    58  		BlockHandle: bh,
    59  		Props:       src[n:],
    60  	}, nil
    61  }
    62  
    63  func encodeBlockHandle(dst []byte, b BlockHandle) int {
    64  	n := binary.PutUvarint(dst, b.Offset)
    65  	m := binary.PutUvarint(dst[n:], b.Length)
    66  	return n + m
    67  }
    68  
    69  func encodeBlockHandleWithProperties(dst []byte, b BlockHandleWithProperties) []byte {
    70  	n := encodeBlockHandle(dst, b.BlockHandle)
    71  	dst = append(dst[:n], b.Props...)
    72  	return dst
    73  }
    74  
// block is a []byte that holds a sequence of key/value pairs plus an index
// over those pairs.
type block []byte

// loadBlockResult is a three-valued outcome for a block load attempt.
// NOTE(review): the type is not used in this part of the file; presumably it
// is returned by the iterators' block-loading helpers — confirm.
type loadBlockResult int8

const (
	// loadBlockOK indicates the block was loaded.
	loadBlockOK loadBlockResult = iota
	// Could be due to error or because no block left to load.
	loadBlockFailed
	// NOTE(review): presumably means the block was skipped because it cannot
	// contain relevant keys (e.g. excluded by a filter) — confirm at the use
	// sites.
	loadBlockIrrelevant
)

// blockTransform rewrites the contents of a block. Reader.readBlock applies
// it to the decompressed block and returns (and, when no buffer pool is in
// use, caches) a copy of the transformed result instead of the original.
type blockTransform func([]byte) ([]byte, error)
    89  
// ReaderOption provides an interface to do work on a Reader while it is being
// opened. Implementations in this file include Comparers, Mergers, cacheOpts
// and rawTombstonesOpt.
type ReaderOption interface {
	// readerApply is called on the reader during opening in order to set internal
	// parameters.
	readerApply(*Reader)
}
    97  
    98  // Comparers is a map from comparer name to comparer. It is used for debugging
    99  // tools which may be used on multiple databases configured with different
   100  // comparers. Comparers implements the OpenOption interface and can be passed
   101  // as a parameter to NewReader.
   102  type Comparers map[string]*Comparer
   103  
   104  func (c Comparers) readerApply(r *Reader) {
   105  	if r.Compare != nil || r.Properties.ComparerName == "" {
   106  		return
   107  	}
   108  	if comparer, ok := c[r.Properties.ComparerName]; ok {
   109  		r.Compare = comparer.Compare
   110  		r.FormatKey = comparer.FormatKey
   111  		r.Split = comparer.Split
   112  	}
   113  }
   114  
   115  // Mergers is a map from merger name to merger. It is used for debugging tools
   116  // which may be used on multiple databases configured with different
   117  // mergers. Mergers implements the OpenOption interface and can be passed as
   118  // a parameter to NewReader.
   119  type Mergers map[string]*Merger
   120  
   121  func (m Mergers) readerApply(r *Reader) {
   122  	if r.mergerOK || r.Properties.MergerName == "" {
   123  		return
   124  	}
   125  	_, r.mergerOK = m[r.Properties.MergerName]
   126  }
   127  
   128  // cacheOpts is a Reader open option for specifying the cache ID and sstable file
   129  // number. If not specified, a unique cache ID will be used.
   130  type cacheOpts struct {
   131  	cacheID uint64
   132  	fileNum base.DiskFileNum
   133  }
   134  
   135  // Marker function to indicate the option should be applied before reading the
   136  // sstable properties and, in the write path, before writing the default
   137  // sstable properties.
   138  func (c *cacheOpts) preApply() {}
   139  
   140  func (c *cacheOpts) readerApply(r *Reader) {
   141  	if r.cacheID == 0 {
   142  		r.cacheID = c.cacheID
   143  	}
   144  	if r.fileNum.FileNum() == 0 {
   145  		r.fileNum = c.fileNum
   146  	}
   147  }
   148  
   149  func (c *cacheOpts) writerApply(w *Writer) {
   150  	if w.cacheID == 0 {
   151  		w.cacheID = c.cacheID
   152  	}
   153  	if w.fileNum.FileNum() == 0 {
   154  		w.fileNum = c.fileNum
   155  	}
   156  }
   157  
// rawTombstonesOpt is a Reader open option for specifying that range
// tombstones returned by Reader.NewRangeDelIter() should not be
// fragmented. Used by debug tools to get a raw view of the tombstones
// contained in an sstable.
type rawTombstonesOpt struct{}

// preApply is a marker method with no behavior of its own.
// NOTE(review): presumably, as for cacheOpts.preApply, it marks the option as
// one to apply before the sstable properties are read — confirm.
func (rawTombstonesOpt) preApply() {}

// readerApply sets r.rawTombstones. When that flag is set, readMetaindex does
// not install the v1 range-del block transform that sorts and fragments the
// tombstones.
func (rawTombstonesOpt) readerApply(r *Reader) {
	r.rawTombstones = true
}
   169  
   170  func init() {
   171  	private.SSTableCacheOpts = func(cacheID uint64, fileNum base.DiskFileNum) interface{} {
   172  		return &cacheOpts{cacheID, fileNum}
   173  	}
   174  	private.SSTableRawTombstonesOpt = rawTombstonesOpt{}
   175  }
   176  
// CommonReader abstracts functionality over a Reader or a VirtualReader. This
// can be used by code which doesn't care to distinguish between a reader and a
// virtual reader.
type CommonReader interface {
	// NewRawRangeKeyIter returns an iterator over the table's range-key block.
	// The Reader implementation returns a nil iterator when the table has no
	// range keys.
	NewRawRangeKeyIter() (keyspan.FragmentIterator, error)
	// NewRawRangeDelIter returns an iterator over the table's range-del block.
	// The Reader implementation returns a nil iterator when the table has no
	// range deletions.
	NewRawRangeDelIter() (keyspan.FragmentIterator, error)
	// NewIterWithBlockPropertyFiltersAndContextEtc returns a point iterator
	// restricted to [lower, upper), taking a context for tracing. If
	// hideObsoletePoints is set, filterer is assumed to already include the
	// obsolete-key block property filter.
	NewIterWithBlockPropertyFiltersAndContextEtc(
		ctx context.Context, lower, upper []byte,
		filterer *BlockPropertiesFilterer,
		hideObsoletePoints, useFilterBlock bool,
		stats *base.InternalIteratorStats,
		rp ReaderProvider,
	) (Iterator, error)
	// NewCompactionIter returns an iterator for use by compactions; it also
	// increments *bytesIterated as it advances.
	NewCompactionIter(
		bytesIterated *uint64,
		rp ReaderProvider,
		bufferPool *BufferPool,
	) (Iterator, error)
	// EstimateDiskUsage estimates the on-disk size of the keys in the given
	// range. NOTE(review): the implementation is outside this part of the
	// file; confirm the bounds' inclusivity there.
	EstimateDiskUsage(start, end []byte) (uint64, error)
	// CommonProperties returns the table's common properties.
	CommonProperties() *CommonProperties
}
   198  
// Reader is a table reader.
//
// Field notes, based on how the fields are used in this file:
//   - readable is the underlying object the table is read from; Close closes
//     it and sets it to nil.
//   - cacheID and fileNum, together with a block's offset, form the block
//     cache key used by readBlock.
//   - err is a sticky error. Close sets it to errReaderClosed after a
//     successful close.
//   - indexBH, filterBH, rangeDelBH, rangeKeyBH, propertiesBH and valueBIH
//     locate the corresponding blocks. All but indexBH are populated by
//     readMetaindex; NOTE(review): indexBH, metaIndexBH and footerBH are
//     presumably populated from the footer when the table is opened —
//     confirm.
//   - rangeDelTransform is set by readMetaindex when the table only has a v1
//     range-del block and rawTombstones is false.
//   - Compare, FormatKey and Split may be filled in by the Comparers option.
//   - tableFilter is set by readMetaindex when a filter block matching one of
//     opts.Filters is found.
//   - rawTombstones is set by rawTombstonesOpt; mergerOK by the Mergers option.
type Reader struct {
	readable          objstorage.Readable
	cacheID           uint64
	fileNum           base.DiskFileNum
	err               error
	indexBH           BlockHandle
	filterBH          BlockHandle
	rangeDelBH        BlockHandle
	rangeKeyBH        BlockHandle
	rangeDelTransform blockTransform
	valueBIH          valueBlocksIndexHandle
	propertiesBH      BlockHandle
	metaIndexBH       BlockHandle
	footerBH          BlockHandle
	opts              ReaderOptions
	Compare           Compare
	FormatKey         base.FormatKey
	Split             Split
	tableFilter       *tableFilterReader
	// Keep types that are not multiples of 8 bytes at the end and with
	// decreasing size.
	Properties    Properties
	tableFormat   TableFormat
	rawTombstones bool
	mergerOK      bool
	checksumType  ChecksumType
	// metaBufferPool is a buffer pool used exclusively when opening a table and
	// loading its meta blocks. metaBufferPoolAlloc is used to batch-allocate
	// the BufferPool.pool slice as a part of the Reader allocation. Its
	// capacity is 3 to accommodate the meta block (1), and both the compressed
	// properties block (1) and decompressed properties block (1)
	// simultaneously.
	metaBufferPool      BufferPool
	metaBufferPoolAlloc [3]allocedBuffer
}
   235  
   236  // Close implements DB.Close, as documented in the pebble package.
   237  func (r *Reader) Close() error {
   238  	r.opts.Cache.Unref()
   239  
   240  	if r.readable != nil {
   241  		r.err = firstError(r.err, r.readable.Close())
   242  		r.readable = nil
   243  	}
   244  
   245  	if r.err != nil {
   246  		return r.err
   247  	}
   248  	// Make any future calls to Get, NewIter or Close return an error.
   249  	r.err = errReaderClosed
   250  	return nil
   251  }
   252  
   253  // NewIterWithBlockPropertyFilters returns an iterator for the contents of the
   254  // table. If an error occurs, NewIterWithBlockPropertyFilters cleans up after
   255  // itself and returns a nil iterator.
   256  func (r *Reader) NewIterWithBlockPropertyFilters(
   257  	lower, upper []byte,
   258  	filterer *BlockPropertiesFilterer,
   259  	useFilterBlock bool,
   260  	stats *base.InternalIteratorStats,
   261  	rp ReaderProvider,
   262  ) (Iterator, error) {
   263  	return r.newIterWithBlockPropertyFiltersAndContext(
   264  		context.Background(),
   265  		lower, upper, filterer, false, useFilterBlock, stats, rp, nil,
   266  	)
   267  }
   268  
   269  // NewIterWithBlockPropertyFiltersAndContextEtc is similar to
   270  // NewIterWithBlockPropertyFilters and additionally accepts a context for
   271  // tracing.
   272  //
   273  // If hideObsoletePoints, the callee assumes that filterer already includes
   274  // obsoleteKeyBlockPropertyFilter. The caller can satisfy this contract by
   275  // first calling TryAddBlockPropertyFilterForHideObsoletePoints.
   276  func (r *Reader) NewIterWithBlockPropertyFiltersAndContextEtc(
   277  	ctx context.Context,
   278  	lower, upper []byte,
   279  	filterer *BlockPropertiesFilterer,
   280  	hideObsoletePoints, useFilterBlock bool,
   281  	stats *base.InternalIteratorStats,
   282  	rp ReaderProvider,
   283  ) (Iterator, error) {
   284  	return r.newIterWithBlockPropertyFiltersAndContext(
   285  		ctx, lower, upper, filterer, hideObsoletePoints, useFilterBlock, stats, rp, nil,
   286  	)
   287  }
   288  
   289  // TryAddBlockPropertyFilterForHideObsoletePoints is expected to be called
   290  // before the call to NewIterWithBlockPropertyFiltersAndContextEtc, to get the
   291  // value of hideObsoletePoints and potentially add a block property filter.
   292  func (r *Reader) TryAddBlockPropertyFilterForHideObsoletePoints(
   293  	snapshotForHideObsoletePoints uint64,
   294  	fileLargestSeqNum uint64,
   295  	pointKeyFilters []BlockPropertyFilter,
   296  ) (hideObsoletePoints bool, filters []BlockPropertyFilter) {
   297  	hideObsoletePoints = r.tableFormat >= TableFormatPebblev4 &&
   298  		snapshotForHideObsoletePoints > fileLargestSeqNum
   299  	if hideObsoletePoints {
   300  		pointKeyFilters = append(pointKeyFilters, obsoleteKeyBlockPropertyFilter{})
   301  	}
   302  	return hideObsoletePoints, pointKeyFilters
   303  }
   304  
   305  func (r *Reader) newIterWithBlockPropertyFiltersAndContext(
   306  	ctx context.Context,
   307  	lower, upper []byte,
   308  	filterer *BlockPropertiesFilterer,
   309  	hideObsoletePoints bool,
   310  	useFilterBlock bool,
   311  	stats *base.InternalIteratorStats,
   312  	rp ReaderProvider,
   313  	v *virtualState,
   314  ) (Iterator, error) {
   315  	// NB: pebble.tableCache wraps the returned iterator with one which performs
   316  	// reference counting on the Reader, preventing the Reader from being closed
   317  	// until the final iterator closes.
   318  	if r.Properties.IndexType == twoLevelIndex {
   319  		i := twoLevelIterPool.Get().(*twoLevelIterator)
   320  		err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, hideObsoletePoints, stats, rp, nil /* bufferPool */)
   321  		if err != nil {
   322  			return nil, err
   323  		}
   324  		return i, nil
   325  	}
   326  
   327  	i := singleLevelIterPool.Get().(*singleLevelIterator)
   328  	err := i.init(ctx, r, v, lower, upper, filterer, useFilterBlock, hideObsoletePoints, stats, rp, nil /* bufferPool */)
   329  	if err != nil {
   330  		return nil, err
   331  	}
   332  	return i, nil
   333  }
   334  
   335  // NewIter returns an iterator for the contents of the table. If an error
   336  // occurs, NewIter cleans up after itself and returns a nil iterator. NewIter
   337  // must only be used when the Reader is guaranteed to outlive any LazyValues
   338  // returned from the iter.
   339  func (r *Reader) NewIter(lower, upper []byte) (Iterator, error) {
   340  	return r.NewIterWithBlockPropertyFilters(
   341  		lower, upper, nil, true /* useFilterBlock */, nil, /* stats */
   342  		TrivialReaderProvider{Reader: r})
   343  }
   344  
   345  // NewCompactionIter returns an iterator similar to NewIter but it also increments
   346  // the number of bytes iterated. If an error occurs, NewCompactionIter cleans up
   347  // after itself and returns a nil iterator.
   348  func (r *Reader) NewCompactionIter(
   349  	bytesIterated *uint64, rp ReaderProvider, bufferPool *BufferPool,
   350  ) (Iterator, error) {
   351  	return r.newCompactionIter(bytesIterated, rp, nil, bufferPool)
   352  }
   353  
   354  func (r *Reader) newCompactionIter(
   355  	bytesIterated *uint64, rp ReaderProvider, v *virtualState, bufferPool *BufferPool,
   356  ) (Iterator, error) {
   357  	if r.Properties.IndexType == twoLevelIndex {
   358  		i := twoLevelIterPool.Get().(*twoLevelIterator)
   359  		err := i.init(
   360  			context.Background(),
   361  			r, v, nil /* lower */, nil /* upper */, nil,
   362  			false /* useFilter */, v != nil && v.isForeign, /* hideObsoletePoints */
   363  			nil /* stats */, rp, bufferPool,
   364  		)
   365  		if err != nil {
   366  			return nil, err
   367  		}
   368  		i.setupForCompaction()
   369  		return &twoLevelCompactionIterator{
   370  			twoLevelIterator: i,
   371  			bytesIterated:    bytesIterated,
   372  		}, nil
   373  	}
   374  	i := singleLevelIterPool.Get().(*singleLevelIterator)
   375  	err := i.init(
   376  		context.Background(), r, v, nil /* lower */, nil, /* upper */
   377  		nil, false /* useFilter */, v != nil && v.isForeign, /* hideObsoletePoints */
   378  		nil /* stats */, rp, bufferPool,
   379  	)
   380  	if err != nil {
   381  		return nil, err
   382  	}
   383  	i.setupForCompaction()
   384  	return &compactionIterator{
   385  		singleLevelIterator: i,
   386  		bytesIterated:       bytesIterated,
   387  	}, nil
   388  }
   389  
   390  // NewRawRangeDelIter returns an internal iterator for the contents of the
   391  // range-del block for the table. Returns nil if the table does not contain
   392  // any range deletions.
   393  //
   394  // TODO(sumeer): plumb context.Context since this path is relevant in the user-facing
   395  // iterator. Add WithContext methods since the existing ones are public.
   396  func (r *Reader) NewRawRangeDelIter() (keyspan.FragmentIterator, error) {
   397  	if r.rangeDelBH.Length == 0 {
   398  		return nil, nil
   399  	}
   400  	h, err := r.readRangeDel(nil /* stats */)
   401  	if err != nil {
   402  		return nil, err
   403  	}
   404  	i := &fragmentBlockIter{elideSameSeqnum: true}
   405  	if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum, false); err != nil {
   406  		return nil, err
   407  	}
   408  	return i, nil
   409  }
   410  
   411  // NewRawRangeKeyIter returns an internal iterator for the contents of the
   412  // range-key block for the table. Returns nil if the table does not contain any
   413  // range keys.
   414  //
   415  // TODO(sumeer): plumb context.Context since this path is relevant in the user-facing
   416  // iterator. Add WithContext methods since the existing ones are public.
   417  func (r *Reader) NewRawRangeKeyIter() (keyspan.FragmentIterator, error) {
   418  	if r.rangeKeyBH.Length == 0 {
   419  		return nil, nil
   420  	}
   421  	h, err := r.readRangeKey(nil /* stats */)
   422  	if err != nil {
   423  		return nil, err
   424  	}
   425  	i := rangeKeyFragmentBlockIterPool.Get().(*rangeKeyFragmentBlockIter)
   426  	if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum, false); err != nil {
   427  		return nil, err
   428  	}
   429  	return i, nil
   430  }
   431  
   432  type rangeKeyFragmentBlockIter struct {
   433  	fragmentBlockIter
   434  }
   435  
   436  func (i *rangeKeyFragmentBlockIter) Close() error {
   437  	err := i.fragmentBlockIter.Close()
   438  	i.fragmentBlockIter = i.fragmentBlockIter.resetForReuse()
   439  	rangeKeyFragmentBlockIterPool.Put(i)
   440  	return err
   441  }
   442  
   443  func (r *Reader) readIndex(
   444  	ctx context.Context, stats *base.InternalIteratorStats,
   445  ) (bufferHandle, error) {
   446  	ctx = objiotracing.WithBlockType(ctx, objiotracing.MetadataBlock)
   447  	return r.readBlock(ctx, r.indexBH, nil, nil, stats, nil /* buffer pool */)
   448  }
   449  
   450  func (r *Reader) readFilter(
   451  	ctx context.Context, stats *base.InternalIteratorStats,
   452  ) (bufferHandle, error) {
   453  	ctx = objiotracing.WithBlockType(ctx, objiotracing.FilterBlock)
   454  	return r.readBlock(ctx, r.filterBH, nil /* transform */, nil /* readHandle */, stats, nil /* buffer pool */)
   455  }
   456  
   457  func (r *Reader) readRangeDel(stats *base.InternalIteratorStats) (bufferHandle, error) {
   458  	ctx := objiotracing.WithBlockType(context.Background(), objiotracing.MetadataBlock)
   459  	return r.readBlock(ctx, r.rangeDelBH, r.rangeDelTransform, nil /* readHandle */, stats, nil /* buffer pool */)
   460  }
   461  
   462  func (r *Reader) readRangeKey(stats *base.InternalIteratorStats) (bufferHandle, error) {
   463  	ctx := objiotracing.WithBlockType(context.Background(), objiotracing.MetadataBlock)
   464  	return r.readBlock(ctx, r.rangeKeyBH, nil /* transform */, nil /* readHandle */, stats, nil /* buffer pool */)
   465  }
   466  
   467  func checkChecksum(
   468  	checksumType ChecksumType, b []byte, bh BlockHandle, fileNum base.FileNum,
   469  ) error {
   470  	expectedChecksum := binary.LittleEndian.Uint32(b[bh.Length+1:])
   471  	var computedChecksum uint32
   472  	switch checksumType {
   473  	case ChecksumTypeCRC32c:
   474  		computedChecksum = crc.New(b[:bh.Length+1]).Value()
   475  	case ChecksumTypeXXHash64:
   476  		computedChecksum = uint32(xxhash.Sum64(b[:bh.Length+1]))
   477  	default:
   478  		return errors.Errorf("unsupported checksum type: %d", checksumType)
   479  	}
   480  
   481  	if expectedChecksum != computedChecksum {
   482  		return base.CorruptionErrorf(
   483  			"pebble/table: invalid table %s (checksum mismatch at %d/%d)",
   484  			errors.Safe(fileNum), errors.Safe(bh.Offset), errors.Safe(bh.Length))
   485  	}
   486  	return nil
   487  }
   488  
   489  type cacheValueOrBuf struct {
   490  	// buf.Valid() returns true if backed by a BufferPool.
   491  	buf Buf
   492  	// v is non-nil if backed by the block cache.
   493  	v *cache.Value
   494  }
   495  
   496  func (b cacheValueOrBuf) get() []byte {
   497  	if b.buf.Valid() {
   498  		return b.buf.p.pool[b.buf.i].b
   499  	}
   500  	return b.v.Buf()
   501  }
   502  
   503  func (b cacheValueOrBuf) release() {
   504  	if b.buf.Valid() {
   505  		b.buf.Release()
   506  	} else {
   507  		cache.Free(b.v)
   508  	}
   509  }
   510  
   511  func (b cacheValueOrBuf) truncate(n int) {
   512  	if b.buf.Valid() {
   513  		b.buf.p.pool[b.buf.i].b = b.buf.p.pool[b.buf.i].b[:n]
   514  	} else {
   515  		b.v.Truncate(n)
   516  	}
   517  }
   518  
   519  func (r *Reader) readBlock(
   520  	ctx context.Context,
   521  	bh BlockHandle,
   522  	transform blockTransform,
   523  	readHandle objstorage.ReadHandle,
   524  	stats *base.InternalIteratorStats,
   525  	bufferPool *BufferPool,
   526  ) (handle bufferHandle, _ error) {
   527  	if h := r.opts.Cache.Get(r.cacheID, r.fileNum, bh.Offset); h.Get() != nil {
   528  		// Cache hit.
   529  		if readHandle != nil {
   530  			readHandle.RecordCacheHit(ctx, int64(bh.Offset), int64(bh.Length+blockTrailerLen))
   531  		}
   532  		if stats != nil {
   533  			stats.BlockBytes += bh.Length
   534  			stats.BlockBytesInCache += bh.Length
   535  		}
   536  		// This block is already in the cache; return a handle to existing vlaue
   537  		// in the cache.
   538  		return bufferHandle{h: h}, nil
   539  	}
   540  
   541  	// Cache miss.
   542  	var compressed cacheValueOrBuf
   543  	if bufferPool != nil {
   544  		compressed = cacheValueOrBuf{
   545  			buf: bufferPool.Alloc(int(bh.Length + blockTrailerLen)),
   546  		}
   547  	} else {
   548  		compressed = cacheValueOrBuf{
   549  			v: cache.Alloc(int(bh.Length + blockTrailerLen)),
   550  		}
   551  	}
   552  
   553  	readStartTime := time.Now()
   554  	var err error
   555  	if readHandle != nil {
   556  		err = readHandle.ReadAt(ctx, compressed.get(), int64(bh.Offset))
   557  	} else {
   558  		err = r.readable.ReadAt(ctx, compressed.get(), int64(bh.Offset))
   559  	}
   560  	readDuration := time.Since(readStartTime)
   561  	// TODO(sumeer): should the threshold be configurable.
   562  	const slowReadTracingThreshold = 5 * time.Millisecond
   563  	// The invariants.Enabled path is for deterministic testing.
   564  	if invariants.Enabled {
   565  		readDuration = slowReadTracingThreshold
   566  	}
   567  	// Call IsTracingEnabled to avoid the allocations of boxing integers into an
   568  	// interface{}, unless necessary.
   569  	if readDuration >= slowReadTracingThreshold && r.opts.LoggerAndTracer.IsTracingEnabled(ctx) {
   570  		r.opts.LoggerAndTracer.Eventf(ctx, "reading %d bytes took %s",
   571  			int(bh.Length+blockTrailerLen), readDuration.String())
   572  	}
   573  	if stats != nil {
   574  		stats.BlockReadDuration += readDuration
   575  	}
   576  	if err != nil {
   577  		compressed.release()
   578  		return bufferHandle{}, err
   579  	}
   580  	if err := checkChecksum(r.checksumType, compressed.get(), bh, r.fileNum.FileNum()); err != nil {
   581  		compressed.release()
   582  		return bufferHandle{}, err
   583  	}
   584  
   585  	typ := blockType(compressed.get()[bh.Length])
   586  	compressed.truncate(int(bh.Length))
   587  
   588  	var decompressed cacheValueOrBuf
   589  	if typ == noCompressionBlockType {
   590  		decompressed = compressed
   591  	} else {
   592  		// Decode the length of the decompressed value.
   593  		decodedLen, prefixLen, err := decompressedLen(typ, compressed.get())
   594  		if err != nil {
   595  			compressed.release()
   596  			return bufferHandle{}, err
   597  		}
   598  
   599  		if bufferPool != nil {
   600  			decompressed = cacheValueOrBuf{buf: bufferPool.Alloc(decodedLen)}
   601  		} else {
   602  			decompressed = cacheValueOrBuf{v: cache.Alloc(decodedLen)}
   603  		}
   604  		if _, err := decompressInto(typ, compressed.get()[prefixLen:], decompressed.get()); err != nil {
   605  			compressed.release()
   606  			return bufferHandle{}, err
   607  		}
   608  		compressed.release()
   609  	}
   610  
   611  	if transform != nil {
   612  		// Transforming blocks is very rare, so the extra copy of the
   613  		// transformed data is not problematic.
   614  		tmpTransformed, err := transform(decompressed.get())
   615  		if err != nil {
   616  			decompressed.release()
   617  			return bufferHandle{}, err
   618  		}
   619  
   620  		var transformed cacheValueOrBuf
   621  		if bufferPool != nil {
   622  			transformed = cacheValueOrBuf{buf: bufferPool.Alloc(len(tmpTransformed))}
   623  		} else {
   624  			transformed = cacheValueOrBuf{v: cache.Alloc(len(tmpTransformed))}
   625  		}
   626  		copy(transformed.get(), tmpTransformed)
   627  		decompressed.release()
   628  		decompressed = transformed
   629  	}
   630  
   631  	if stats != nil {
   632  		stats.BlockBytes += bh.Length
   633  	}
   634  	if decompressed.buf.Valid() {
   635  		return bufferHandle{b: decompressed.buf}, nil
   636  	}
   637  	h := r.opts.Cache.Set(r.cacheID, r.fileNum, bh.Offset, decompressed.v)
   638  	return bufferHandle{h: h}, nil
   639  }
   640  
   641  func (r *Reader) transformRangeDelV1(b []byte) ([]byte, error) {
   642  	// Convert v1 (RocksDB format) range-del blocks to v2 blocks on the fly. The
   643  	// v1 format range-del blocks have unfragmented and unsorted range
   644  	// tombstones. We need properly fragmented and sorted range tombstones in
   645  	// order to serve from them directly.
   646  	iter := &blockIter{}
   647  	if err := iter.init(r.Compare, b, r.Properties.GlobalSeqNum, false); err != nil {
   648  		return nil, err
   649  	}
   650  	var tombstones []keyspan.Span
   651  	for key, value := iter.First(); key != nil; key, value = iter.Next() {
   652  		t := keyspan.Span{
   653  			Start: key.UserKey,
   654  			End:   value.InPlaceValue(),
   655  			Keys:  []keyspan.Key{{Trailer: key.Trailer}},
   656  		}
   657  		tombstones = append(tombstones, t)
   658  	}
   659  	keyspan.Sort(r.Compare, tombstones)
   660  
   661  	// Fragment the tombstones, outputting them directly to a block writer.
   662  	rangeDelBlock := blockWriter{
   663  		restartInterval: 1,
   664  	}
   665  	frag := keyspan.Fragmenter{
   666  		Cmp:    r.Compare,
   667  		Format: r.FormatKey,
   668  		Emit: func(s keyspan.Span) {
   669  			for _, k := range s.Keys {
   670  				startIK := InternalKey{UserKey: s.Start, Trailer: k.Trailer}
   671  				rangeDelBlock.add(startIK, s.End)
   672  			}
   673  		},
   674  	}
   675  	for i := range tombstones {
   676  		frag.Add(tombstones[i])
   677  	}
   678  	frag.Finish()
   679  
   680  	// Return the contents of the constructed v2 format range-del block.
   681  	return rangeDelBlock.finish(), nil
   682  }
   683  
   684  func (r *Reader) readMetaindex(metaindexBH BlockHandle) error {
   685  	// We use a BufferPool when reading metaindex blocks in order to avoid
   686  	// populating the block cache with these blocks. In heavy-write workloads,
   687  	// especially with high compaction concurrency, new tables may be created
   688  	// frequently. Populating the block cache with these metaindex blocks adds
   689  	// additional contention on the block cache mutexes (see #1997).
   690  	// Additionally, these blocks are exceedingly unlikely to be read again
   691  	// while they're still in the block cache except in misconfigurations with
   692  	// excessive sstables counts or a table cache that's far too small.
   693  	r.metaBufferPool.initPreallocated(r.metaBufferPoolAlloc[:0])
   694  	// When we're finished, release the buffers we've allocated back to memory
   695  	// allocator. We don't expect to use metaBufferPool again.
   696  	defer r.metaBufferPool.Release()
   697  
   698  	b, err := r.readBlock(
   699  		context.Background(), metaindexBH, nil /* transform */, nil /* readHandle */, nil /* stats */, &r.metaBufferPool)
   700  	if err != nil {
   701  		return err
   702  	}
   703  	data := b.Get()
   704  	defer b.Release()
   705  
   706  	if uint64(len(data)) != metaindexBH.Length {
   707  		return base.CorruptionErrorf("pebble/table: unexpected metaindex block size: %d vs %d",
   708  			errors.Safe(len(data)), errors.Safe(metaindexBH.Length))
   709  	}
   710  
   711  	i, err := newRawBlockIter(bytes.Compare, data)
   712  	if err != nil {
   713  		return err
   714  	}
   715  
   716  	meta := map[string]BlockHandle{}
   717  	for valid := i.First(); valid; valid = i.Next() {
   718  		value := i.Value()
   719  		if bytes.Equal(i.Key().UserKey, []byte(metaValueIndexName)) {
   720  			vbih, n, err := decodeValueBlocksIndexHandle(i.Value())
   721  			if err != nil {
   722  				return err
   723  			}
   724  			if n == 0 || n != len(value) {
   725  				return base.CorruptionErrorf("pebble/table: invalid table (bad value blocks index handle)")
   726  			}
   727  			r.valueBIH = vbih
   728  		} else {
   729  			bh, n := decodeBlockHandle(value)
   730  			if n == 0 || n != len(value) {
   731  				return base.CorruptionErrorf("pebble/table: invalid table (bad block handle)")
   732  			}
   733  			meta[string(i.Key().UserKey)] = bh
   734  		}
   735  	}
   736  	if err := i.Close(); err != nil {
   737  		return err
   738  	}
   739  
   740  	if bh, ok := meta[metaPropertiesName]; ok {
   741  		b, err = r.readBlock(
   742  			context.Background(), bh, nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */)
   743  		if err != nil {
   744  			return err
   745  		}
   746  		r.propertiesBH = bh
   747  		err := r.Properties.load(b.Get(), bh.Offset, r.opts.DeniedUserProperties)
   748  		b.Release()
   749  		if err != nil {
   750  			return err
   751  		}
   752  	}
   753  
   754  	if bh, ok := meta[metaRangeDelV2Name]; ok {
   755  		r.rangeDelBH = bh
   756  	} else if bh, ok := meta[metaRangeDelName]; ok {
   757  		r.rangeDelBH = bh
   758  		if !r.rawTombstones {
   759  			r.rangeDelTransform = r.transformRangeDelV1
   760  		}
   761  	}
   762  
   763  	if bh, ok := meta[metaRangeKeyName]; ok {
   764  		r.rangeKeyBH = bh
   765  	}
   766  
   767  	for name, fp := range r.opts.Filters {
   768  		types := []struct {
   769  			ftype  FilterType
   770  			prefix string
   771  		}{
   772  			{TableFilter, "fullfilter."},
   773  		}
   774  		var done bool
   775  		for _, t := range types {
   776  			if bh, ok := meta[t.prefix+name]; ok {
   777  				r.filterBH = bh
   778  
   779  				switch t.ftype {
   780  				case TableFilter:
   781  					r.tableFilter = newTableFilterReader(fp)
   782  				default:
   783  					return base.CorruptionErrorf("unknown filter type: %v", errors.Safe(t.ftype))
   784  				}
   785  
   786  				done = true
   787  				break
   788  			}
   789  		}
   790  		if done {
   791  			break
   792  		}
   793  	}
   794  	return nil
   795  }
   796  
   797  // Layout returns the layout (block organization) for an sstable.
   798  func (r *Reader) Layout() (*Layout, error) {
   799  	if r.err != nil {
   800  		return nil, r.err
   801  	}
   802  
   803  	l := &Layout{
   804  		Data:       make([]BlockHandleWithProperties, 0, r.Properties.NumDataBlocks),
   805  		Filter:     r.filterBH,
   806  		RangeDel:   r.rangeDelBH,
   807  		RangeKey:   r.rangeKeyBH,
   808  		ValueIndex: r.valueBIH.h,
   809  		Properties: r.propertiesBH,
   810  		MetaIndex:  r.metaIndexBH,
   811  		Footer:     r.footerBH,
   812  		Format:     r.tableFormat,
   813  	}
   814  
   815  	indexH, err := r.readIndex(context.Background(), nil)
   816  	if err != nil {
   817  		return nil, err
   818  	}
   819  	defer indexH.Release()
   820  
   821  	var alloc bytealloc.A
   822  
   823  	if r.Properties.IndexPartitions == 0 {
   824  		l.Index = append(l.Index, r.indexBH)
   825  		iter, _ := newBlockIter(r.Compare, indexH.Get())
   826  		for key, value := iter.First(); key != nil; key, value = iter.Next() {
   827  			dataBH, err := decodeBlockHandleWithProperties(value.InPlaceValue())
   828  			if err != nil {
   829  				return nil, errCorruptIndexEntry
   830  			}
   831  			if len(dataBH.Props) > 0 {
   832  				alloc, dataBH.Props = alloc.Copy(dataBH.Props)
   833  			}
   834  			l.Data = append(l.Data, dataBH)
   835  		}
   836  	} else {
   837  		l.TopIndex = r.indexBH
   838  		topIter, _ := newBlockIter(r.Compare, indexH.Get())
   839  		iter := &blockIter{}
   840  		for key, value := topIter.First(); key != nil; key, value = topIter.Next() {
   841  			indexBH, err := decodeBlockHandleWithProperties(value.InPlaceValue())
   842  			if err != nil {
   843  				return nil, errCorruptIndexEntry
   844  			}
   845  			l.Index = append(l.Index, indexBH.BlockHandle)
   846  
   847  			subIndex, err := r.readBlock(context.Background(), indexBH.BlockHandle,
   848  				nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */)
   849  			if err != nil {
   850  				return nil, err
   851  			}
   852  			if err := iter.init(r.Compare, subIndex.Get(), 0, /* globalSeqNum */
   853  				false /* hideObsoletePoints */); err != nil {
   854  				return nil, err
   855  			}
   856  			for key, value := iter.First(); key != nil; key, value = iter.Next() {
   857  				dataBH, err := decodeBlockHandleWithProperties(value.InPlaceValue())
   858  				if len(dataBH.Props) > 0 {
   859  					alloc, dataBH.Props = alloc.Copy(dataBH.Props)
   860  				}
   861  				if err != nil {
   862  					return nil, errCorruptIndexEntry
   863  				}
   864  				l.Data = append(l.Data, dataBH)
   865  			}
   866  			subIndex.Release()
   867  			*iter = iter.resetForReuse()
   868  		}
   869  	}
   870  	if r.valueBIH.h.Length != 0 {
   871  		vbiH, err := r.readBlock(context.Background(), r.valueBIH.h, nil, nil, nil, nil /* buffer pool */)
   872  		if err != nil {
   873  			return nil, err
   874  		}
   875  		defer vbiH.Release()
   876  		vbiBlock := vbiH.Get()
   877  		indexEntryLen := int(r.valueBIH.blockNumByteLength + r.valueBIH.blockOffsetByteLength +
   878  			r.valueBIH.blockLengthByteLength)
   879  		i := 0
   880  		for len(vbiBlock) != 0 {
   881  			if len(vbiBlock) < indexEntryLen {
   882  				return nil, errors.Errorf(
   883  					"remaining value index block %d does not contain a full entry of length %d",
   884  					len(vbiBlock), indexEntryLen)
   885  			}
   886  			n := int(r.valueBIH.blockNumByteLength)
   887  			bn := int(littleEndianGet(vbiBlock, n))
   888  			if bn != i {
   889  				return nil, errors.Errorf("unexpected block num %d, expected %d",
   890  					bn, i)
   891  			}
   892  			i++
   893  			vbiBlock = vbiBlock[n:]
   894  			n = int(r.valueBIH.blockOffsetByteLength)
   895  			blockOffset := littleEndianGet(vbiBlock, n)
   896  			vbiBlock = vbiBlock[n:]
   897  			n = int(r.valueBIH.blockLengthByteLength)
   898  			blockLen := littleEndianGet(vbiBlock, n)
   899  			vbiBlock = vbiBlock[n:]
   900  			l.ValueBlock = append(l.ValueBlock, BlockHandle{Offset: blockOffset, Length: blockLen})
   901  		}
   902  	}
   903  
   904  	return l, nil
   905  }
   906  
   907  // ValidateBlockChecksums validates the checksums for each block in the SSTable.
   908  func (r *Reader) ValidateBlockChecksums() error {
   909  	// Pre-compute the BlockHandles for the underlying file.
   910  	l, err := r.Layout()
   911  	if err != nil {
   912  		return err
   913  	}
   914  
   915  	// Construct the set of blocks to check. Note that the footer is not checked
   916  	// as it is not a block with a checksum.
   917  	blocks := make([]BlockHandle, len(l.Data))
   918  	for i := range l.Data {
   919  		blocks[i] = l.Data[i].BlockHandle
   920  	}
   921  	blocks = append(blocks, l.Index...)
   922  	blocks = append(blocks, l.TopIndex, l.Filter, l.RangeDel, l.RangeKey, l.Properties, l.MetaIndex)
   923  
   924  	// Sorting by offset ensures we are performing a sequential scan of the
   925  	// file.
   926  	sort.Slice(blocks, func(i, j int) bool {
   927  		return blocks[i].Offset < blocks[j].Offset
   928  	})
   929  
   930  	// Check all blocks sequentially. Make use of read-ahead, given we are
   931  	// scanning the entire file from start to end.
   932  	rh := r.readable.NewReadHandle(context.TODO())
   933  	defer rh.Close()
   934  
   935  	for _, bh := range blocks {
   936  		// Certain blocks may not be present, in which case we skip them.
   937  		if bh.Length == 0 {
   938  			continue
   939  		}
   940  
   941  		// Read the block, which validates the checksum.
   942  		h, err := r.readBlock(context.Background(), bh, nil, rh, nil, nil /* buffer pool */)
   943  		if err != nil {
   944  			return err
   945  		}
   946  		h.Release()
   947  	}
   948  
   949  	return nil
   950  }
   951  
   952  // CommonProperties implemented the CommonReader interface.
   953  func (r *Reader) CommonProperties() *CommonProperties {
   954  	return &r.Properties.CommonProperties
   955  }
   956  
   957  // EstimateDiskUsage returns the total size of data blocks overlapping the range
   958  // `[start, end]`. Even if a data block partially overlaps, or we cannot
   959  // determine overlap due to abbreviated index keys, the full data block size is
   960  // included in the estimation.
   961  //
   962  // This function does not account for any metablock space usage. Assumes there
   963  // is at least partial overlap, i.e., `[start, end]` falls neither completely
   964  // before nor completely after the file's range.
   965  //
   966  // Only blocks containing point keys are considered. Range deletion and range
   967  // key blocks are not considered.
   968  //
   969  // TODO(ajkr): account for metablock space usage. Perhaps look at the fraction of
   970  // data blocks overlapped and add that same fraction of the metadata blocks to the
   971  // estimate.
   972  func (r *Reader) EstimateDiskUsage(start, end []byte) (uint64, error) {
   973  	if r.err != nil {
   974  		return 0, r.err
   975  	}
   976  
   977  	indexH, err := r.readIndex(context.Background(), nil)
   978  	if err != nil {
   979  		return 0, err
   980  	}
   981  	defer indexH.Release()
   982  
   983  	// Iterators over the bottom-level index blocks containing start and end.
   984  	// These may be different in case of partitioned index but will both point
   985  	// to the same blockIter over the single index in the unpartitioned case.
   986  	var startIdxIter, endIdxIter *blockIter
   987  	if r.Properties.IndexPartitions == 0 {
   988  		iter, err := newBlockIter(r.Compare, indexH.Get())
   989  		if err != nil {
   990  			return 0, err
   991  		}
   992  		startIdxIter = iter
   993  		endIdxIter = iter
   994  	} else {
   995  		topIter, err := newBlockIter(r.Compare, indexH.Get())
   996  		if err != nil {
   997  			return 0, err
   998  		}
   999  
  1000  		key, val := topIter.SeekGE(start, base.SeekGEFlagsNone)
  1001  		if key == nil {
  1002  			// The range falls completely after this file, or an error occurred.
  1003  			return 0, topIter.Error()
  1004  		}
  1005  		startIdxBH, err := decodeBlockHandleWithProperties(val.InPlaceValue())
  1006  		if err != nil {
  1007  			return 0, errCorruptIndexEntry
  1008  		}
  1009  		startIdxBlock, err := r.readBlock(context.Background(), startIdxBH.BlockHandle,
  1010  			nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */)
  1011  		if err != nil {
  1012  			return 0, err
  1013  		}
  1014  		defer startIdxBlock.Release()
  1015  		startIdxIter, err = newBlockIter(r.Compare, startIdxBlock.Get())
  1016  		if err != nil {
  1017  			return 0, err
  1018  		}
  1019  
  1020  		key, val = topIter.SeekGE(end, base.SeekGEFlagsNone)
  1021  		if key == nil {
  1022  			if err := topIter.Error(); err != nil {
  1023  				return 0, err
  1024  			}
  1025  		} else {
  1026  			endIdxBH, err := decodeBlockHandleWithProperties(val.InPlaceValue())
  1027  			if err != nil {
  1028  				return 0, errCorruptIndexEntry
  1029  			}
  1030  			endIdxBlock, err := r.readBlock(context.Background(),
  1031  				endIdxBH.BlockHandle, nil /* transform */, nil /* readHandle */, nil /* stats */, nil /* buffer pool */)
  1032  			if err != nil {
  1033  				return 0, err
  1034  			}
  1035  			defer endIdxBlock.Release()
  1036  			endIdxIter, err = newBlockIter(r.Compare, endIdxBlock.Get())
  1037  			if err != nil {
  1038  				return 0, err
  1039  			}
  1040  		}
  1041  	}
  1042  	// startIdxIter should not be nil at this point, while endIdxIter can be if the
  1043  	// range spans past the end of the file.
  1044  
  1045  	key, val := startIdxIter.SeekGE(start, base.SeekGEFlagsNone)
  1046  	if key == nil {
  1047  		// The range falls completely after this file, or an error occurred.
  1048  		return 0, startIdxIter.Error()
  1049  	}
  1050  	startBH, err := decodeBlockHandleWithProperties(val.InPlaceValue())
  1051  	if err != nil {
  1052  		return 0, errCorruptIndexEntry
  1053  	}
  1054  
  1055  	includeInterpolatedValueBlocksSize := func(dataBlockSize uint64) uint64 {
  1056  		// INVARIANT: r.Properties.DataSize > 0 since startIdxIter is not nil.
  1057  		// Linearly interpolate what is stored in value blocks.
  1058  		//
  1059  		// TODO(sumeer): if we need more accuracy, without loading any data blocks
  1060  		// (which contain the value handles, and which may also be insufficient if
  1061  		// the values are in separate files), we will need to accumulate the
  1062  		// logical size of the key-value pairs and store the cumulative value for
  1063  		// each data block in the index block entry. This increases the size of
  1064  		// the BlockHandle, so wait until this becomes necessary.
  1065  		return dataBlockSize +
  1066  			uint64((float64(dataBlockSize)/float64(r.Properties.DataSize))*
  1067  				float64(r.Properties.ValueBlocksSize))
  1068  	}
  1069  	if endIdxIter == nil {
  1070  		// The range spans beyond this file. Include data blocks through the last.
  1071  		return includeInterpolatedValueBlocksSize(r.Properties.DataSize - startBH.Offset), nil
  1072  	}
  1073  	key, val = endIdxIter.SeekGE(end, base.SeekGEFlagsNone)
  1074  	if key == nil {
  1075  		if err := endIdxIter.Error(); err != nil {
  1076  			return 0, err
  1077  		}
  1078  		// The range spans beyond this file. Include data blocks through the last.
  1079  		return includeInterpolatedValueBlocksSize(r.Properties.DataSize - startBH.Offset), nil
  1080  	}
  1081  	endBH, err := decodeBlockHandleWithProperties(val.InPlaceValue())
  1082  	if err != nil {
  1083  		return 0, errCorruptIndexEntry
  1084  	}
  1085  	return includeInterpolatedValueBlocksSize(
  1086  		endBH.Offset + endBH.Length + blockTrailerLen - startBH.Offset), nil
  1087  }
  1088  
  1089  // TableFormat returns the format version for the table.
  1090  func (r *Reader) TableFormat() (TableFormat, error) {
  1091  	if r.err != nil {
  1092  		return TableFormatUnspecified, r.err
  1093  	}
  1094  	return r.tableFormat, nil
  1095  }
  1096  
  1097  // NewReader returns a new table reader for the file. Closing the reader will
  1098  // close the file.
  1099  func NewReader(f objstorage.Readable, o ReaderOptions, extraOpts ...ReaderOption) (*Reader, error) {
  1100  	o = o.ensureDefaults()
  1101  	r := &Reader{
  1102  		readable: f,
  1103  		opts:     o,
  1104  	}
  1105  	if r.opts.Cache == nil {
  1106  		r.opts.Cache = cache.New(0)
  1107  	} else {
  1108  		r.opts.Cache.Ref()
  1109  	}
  1110  
  1111  	if f == nil {
  1112  		r.err = errors.New("pebble/table: nil file")
  1113  		return nil, r.Close()
  1114  	}
  1115  
  1116  	// Note that the extra options are applied twice. First here for pre-apply
  1117  	// options, and then below for post-apply options. Pre and post refer to
  1118  	// before and after reading the metaindex and properties.
  1119  	type preApply interface{ preApply() }
  1120  	for _, opt := range extraOpts {
  1121  		if _, ok := opt.(preApply); ok {
  1122  			opt.readerApply(r)
  1123  		}
  1124  	}
  1125  	if r.cacheID == 0 {
  1126  		r.cacheID = r.opts.Cache.NewID()
  1127  	}
  1128  
  1129  	footer, err := readFooter(f)
  1130  	if err != nil {
  1131  		r.err = err
  1132  		return nil, r.Close()
  1133  	}
  1134  	r.checksumType = footer.checksum
  1135  	r.tableFormat = footer.format
  1136  	// Read the metaindex.
  1137  	if err := r.readMetaindex(footer.metaindexBH); err != nil {
  1138  		r.err = err
  1139  		return nil, r.Close()
  1140  	}
  1141  	r.indexBH = footer.indexBH
  1142  	r.metaIndexBH = footer.metaindexBH
  1143  	r.footerBH = footer.footerBH
  1144  
  1145  	if r.Properties.ComparerName == "" || o.Comparer.Name == r.Properties.ComparerName {
  1146  		r.Compare = o.Comparer.Compare
  1147  		r.FormatKey = o.Comparer.FormatKey
  1148  		r.Split = o.Comparer.Split
  1149  	}
  1150  
  1151  	if o.MergerName == r.Properties.MergerName {
  1152  		r.mergerOK = true
  1153  	}
  1154  
  1155  	// Apply the extra options again now that the comparer and merger names are
  1156  	// known.
  1157  	for _, opt := range extraOpts {
  1158  		if _, ok := opt.(preApply); !ok {
  1159  			opt.readerApply(r)
  1160  		}
  1161  	}
  1162  
  1163  	if r.Compare == nil {
  1164  		r.err = errors.Errorf("pebble/table: %d: unknown comparer %s",
  1165  			errors.Safe(r.fileNum), errors.Safe(r.Properties.ComparerName))
  1166  	}
  1167  	if !r.mergerOK {
  1168  		if name := r.Properties.MergerName; name != "" && name != "nullptr" {
  1169  			r.err = errors.Errorf("pebble/table: %d: unknown merger %s",
  1170  				errors.Safe(r.fileNum), errors.Safe(r.Properties.MergerName))
  1171  		}
  1172  	}
  1173  	if r.err != nil {
  1174  		return nil, r.Close()
  1175  	}
  1176  
  1177  	return r, nil
  1178  }
  1179  
  1180  // ReadableFile describes the smallest subset of vfs.File that is required for
  1181  // reading SSTs.
  1182  type ReadableFile interface {
  1183  	io.ReaderAt
  1184  	io.Closer
  1185  	Stat() (os.FileInfo, error)
  1186  }
  1187  
  1188  // NewSimpleReadable wraps a ReadableFile in a objstorage.Readable
  1189  // implementation (which does not support read-ahead)
  1190  func NewSimpleReadable(r ReadableFile) (objstorage.Readable, error) {
  1191  	info, err := r.Stat()
  1192  	if err != nil {
  1193  		return nil, err
  1194  	}
  1195  	res := &simpleReadable{
  1196  		f:    r,
  1197  		size: info.Size(),
  1198  	}
  1199  	res.rh = objstorage.MakeNoopReadHandle(res)
  1200  	return res, nil
  1201  }
  1202  
  1203  // simpleReadable wraps a ReadableFile to implement objstorage.Readable.
  1204  type simpleReadable struct {
  1205  	f    ReadableFile
  1206  	size int64
  1207  	rh   objstorage.NoopReadHandle
  1208  }
  1209  
  1210  var _ objstorage.Readable = (*simpleReadable)(nil)
  1211  
  1212  // ReadAt is part of the objstorage.Readable interface.
  1213  func (s *simpleReadable) ReadAt(_ context.Context, p []byte, off int64) error {
  1214  	n, err := s.f.ReadAt(p, off)
  1215  	if invariants.Enabled && err == nil && n != len(p) {
  1216  		panic("short read")
  1217  	}
  1218  	return err
  1219  }
  1220  
  1221  // Close is part of the objstorage.Readable interface.
  1222  func (s *simpleReadable) Close() error {
  1223  	return s.f.Close()
  1224  }
  1225  
  1226  // Size is part of the objstorage.Readable interface.
  1227  func (s *simpleReadable) Size() int64 {
  1228  	return s.size
  1229  }
  1230  
  1231  // NewReaddHandle is part of the objstorage.Readable interface.
  1232  func (s *simpleReadable) NewReadHandle(_ context.Context) objstorage.ReadHandle {
  1233  	return &s.rh
  1234  }