github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/sstable/reader.go

// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package sstable

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
	"os"
	"sort"
	"sync"
	"unsafe"

	"github.com/cespare/xxhash/v2"
	"github.com/cockroachdb/errors"
	"github.com/zuoyebang/bitalostable/internal/base"
	"github.com/zuoyebang/bitalostable/internal/cache"
	"github.com/zuoyebang/bitalostable/internal/crc"
	"github.com/zuoyebang/bitalostable/internal/invariants"
	"github.com/zuoyebang/bitalostable/internal/keyspan"
	"github.com/zuoyebang/bitalostable/internal/private"
	"github.com/zuoyebang/bitalostable/vfs"
)

var errCorruptIndexEntry = base.CorruptionErrorf("bitalostable/table: corrupt index entry")
var errReaderClosed = errors.New("bitalostable/table: reader is closed")

const (
	// Constants for dynamic readahead of data blocks. Note that the size values
	// make sense as some multiple of the default block size; and they should
	// both be larger than the default block size.
	minFileReadsForReadahead = 2
	// TODO(bilal): Have the initial size value be a factor of the block size,
	// as opposed to a hardcoded value.
	initialReadaheadSize = 64 << 10  /* 64KB */
	maxReadaheadSize     = 256 << 10 /* 256KB */
)

// decodeBlockHandle returns the block handle encoded at the start of src, as
// well as the number of bytes it occupies. It returns zero if given invalid
// input. A block handle for a data block or a first/lower level index block
// should not be decoded using decodeBlockHandle since the caller may validate
// that the number of bytes decoded is equal to the length of src, which will
// be false if the properties are not decoded. In those cases the caller
// should use decodeBlockHandleWithProperties.
func decodeBlockHandle(src []byte) (BlockHandle, int) {
	offset, n := binary.Uvarint(src)
	length, m := binary.Uvarint(src[n:])
	if n == 0 || m == 0 {
		return BlockHandle{}, 0
	}
	return BlockHandle{offset, length}, n + m
}

// decodeBlockHandleWithProperties returns the block handle and properties
// encoded in src. src needs to be exactly the length that was encoded. This
// method must be used for data block and first/lower level index blocks. The
// properties in the block handle point to the bytes in src.
func decodeBlockHandleWithProperties(src []byte) (BlockHandleWithProperties, error) {
	bh, n := decodeBlockHandle(src)
	if n == 0 {
		return BlockHandleWithProperties{}, errors.Errorf("invalid BlockHandle")
	}
	return BlockHandleWithProperties{
		BlockHandle: bh,
		Props:       src[n:],
	}, nil
}

func encodeBlockHandle(dst []byte, b BlockHandle) int {
	n := binary.PutUvarint(dst, b.Offset)
	m := binary.PutUvarint(dst[n:], b.Length)
	return n + m
}

func encodeBlockHandleWithProperties(dst []byte, b BlockHandleWithProperties) []byte {
	n := encodeBlockHandle(dst, b.BlockHandle)
	dst = append(dst[:n], b.Props...)
	return dst
}

// block is a []byte that holds a sequence of key/value pairs plus an index
// over those pairs.
type block []byte

// Iterator iterates over an entire table of data.
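//
// An illustrative usage sketch (hedged; a concrete implementation is normally
// obtained via Reader.NewIter, and error handling is elided):
//
//	var it Iterator // e.g. returned by Reader.NewIter
//	for k, v := it.SeekGE(searchKey, base.SeekGEFlagsNone); k != nil; k, v = it.Next() {
//		_ = v // process the entry
//	}
//	if it.MaybeFilteredKeys() {
//		// The last positioning op may have skipped keys due to
//		// block-property filters; see MaybeFilteredKeys below.
//	}
//	_ = it.Close()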
90 type Iterator interface { 91 base.InternalIterator 92 93 // MaybeFilteredKeys may be called when an iterator is exhausted to indicate 94 // whether or not the last positioning method may have skipped any keys due 95 // to block-property filters. This is used by the Pebble levelIter to 96 // control when an iterator steps to the next sstable. 97 // 98 // MaybeFilteredKeys may always return false positives, that is it may 99 // return true when no keys were filtered. It should only be called when the 100 // iterator is exhausted. It must never return false negatives when the 101 // iterator is exhausted. 102 MaybeFilteredKeys() bool 103 104 SetCloseHook(fn func(i Iterator) error) 105 } 106 107 // singleLevelIterator iterates over an entire table of data. To seek for a given 108 // key, it first looks in the index for the block that contains that key, and then 109 // looks inside that block. 110 type singleLevelIterator struct { 111 cmp Compare 112 // Global lower/upper bound for the iterator. 113 lower []byte 114 upper []byte 115 bpfs *BlockPropertiesFilterer 116 // Per-block lower/upper bound. Nil if the bound does not apply to the block 117 // because we determined the block lies completely within the bound. 118 blockLower []byte 119 blockUpper []byte 120 reader *Reader 121 index blockIter 122 data blockIter 123 dataRS readaheadState 124 // dataBH refers to the last data block that the iterator considered 125 // loading. It may not actually have loaded the block, due to an error or 126 // because it was considered irrelevant. 127 dataBH BlockHandle 128 err error 129 closeHook func(i Iterator) error 130 stats *base.InternalIteratorStats 131 132 // boundsCmp and positionedUsingLatestBounds are for optimizing iteration 133 // that uses multiple adjacent bounds. The seek after setting a new bound 134 // can use the fact that the iterator is either within the previous bounds 135 // or exactly one key before or after the bounds. If the new bounds is 136 // after/before the previous bounds, and we are already positioned at a 137 // block that is relevant for the new bounds, we can try to first position 138 // using Next/Prev (repeatedly) instead of doing a more expensive seek. 139 // 140 // When there are wide files at higher levels that match the bounds 141 // but don't have any data for the bound, we will already be 142 // positioned at the key beyond the bounds and won't need to do much 143 // work -- given that most data is in L6, such files are likely to 144 // dominate the performance of the mergingIter, and may be the main 145 // benefit of this performance optimization (of course it also helps 146 // when the file that has the data has successive seeks that stay in 147 // the same block). 148 // 149 // Specifically, boundsCmp captures the relationship between the previous 150 // and current bounds, if the iterator had been positioned after setting 151 // the previous bounds. If it was not positioned, i.e., Seek/First/Last 152 // were not called, we don't know where it is positioned and cannot 153 // optimize. 154 // 155 // Example: Bounds moving forward, and iterator exhausted in forward direction. 156 // bounds = [f, h), ^ shows block iterator position 157 // file contents [ a b c d e f g h i j k ] 158 // ^ 159 // new bounds = [j, k). Since positionedUsingLatestBounds=true, boundsCmp is 160 // set to +1. 
SeekGE(j) can use next (the optimization also requires that j 161 // is within the block, but that is not for correctness, but to limit the 162 // optimization to when it will actually be an optimization). 163 // 164 // Example: Bounds moving forward. 165 // bounds = [f, h), ^ shows block iterator position 166 // file contents [ a b c d e f g h i j k ] 167 // ^ 168 // new bounds = [j, k). Since positionedUsingLatestBounds=true, boundsCmp is 169 // set to +1. SeekGE(j) can use next. 170 // 171 // Example: Bounds moving forward, but iterator not positioned using previous 172 // bounds. 173 // bounds = [f, h), ^ shows block iterator position 174 // file contents [ a b c d e f g h i j k ] 175 // ^ 176 // new bounds = [i, j). Iterator is at j since it was never positioned using 177 // [f, h). So positionedUsingLatestBounds=false, and boundsCmp is set to 0. 178 // SeekGE(i) will not use next. 179 // 180 // Example: Bounds moving forward and sparse file 181 // bounds = [f, h), ^ shows block iterator position 182 // file contents [ a z ] 183 // ^ 184 // new bounds = [j, k). Since positionedUsingLatestBounds=true, boundsCmp is 185 // set to +1. SeekGE(j) notices that the iterator is already past j and does 186 // not need to do anything. 187 // 188 // Similar examples can be constructed for backward iteration. 189 // 190 // This notion of exactly one key before or after the bounds is not quite 191 // true when block properties are used to ignore blocks. In that case we 192 // can't stop precisely at the first block that is past the bounds since 193 // we are using the index entries to enforce the bounds. 194 // 195 // e.g. 3 blocks with keys [b, c] [f, g], [i, j, k] with index entries d, 196 // h, l. And let the lower bound be k, and we are reverse iterating. If 197 // the block [i, j, k] is ignored due to the block interval annotations we 198 // do need to move the index to block [f, g] since the index entry for the 199 // [i, j, k] block is l which is not less than the lower bound of k. So we 200 // have passed the entries i, j. 201 // 202 // This behavior is harmless since the block property filters are fixed 203 // for the lifetime of the iterator so i, j are irrelevant. In addition, 204 // the current code will not load the [f, g] block, so the seek 205 // optimization that attempts to use Next/Prev do not apply anyway. 206 boundsCmp int 207 positionedUsingLatestBounds bool 208 209 // exhaustedBounds represents whether the iterator is exhausted for 210 // iteration by reaching the upper or lower bound. +1 when exhausted 211 // the upper bound, -1 when exhausted the lower bound, and 0 when 212 // neither. It is used for invariant checking. 213 exhaustedBounds int8 214 215 // maybeFilteredKeysSingleLevel indicates whether the last iterator 216 // positioning operation may have skipped any data blocks due to 217 // block-property filters when positioning the index. 218 maybeFilteredKeysSingleLevel bool 219 220 // useFilter specifies whether the filter block in this sstable, if present, 221 // should be used for prefix seeks or not. In some cases it is beneficial 222 // to skip a filter block even if it exists (eg. if probability of a match 223 // is high). 224 useFilter bool 225 lastBloomFilterMatched bool 226 } 227 228 // singleLevelIterator implements the base.InternalIterator interface. 
var _ base.InternalIterator = (*singleLevelIterator)(nil)

var singleLevelIterPool = sync.Pool{
	New: func() interface{} {
		i := &singleLevelIterator{}
		// Note: this is a no-op if invariants are disabled or race is enabled.
		invariants.SetFinalizer(i, checkSingleLevelIterator)
		return i
	},
}

var twoLevelIterPool = sync.Pool{
	New: func() interface{} {
		i := &twoLevelIterator{}
		// Note: this is a no-op if invariants are disabled or race is enabled.
		invariants.SetFinalizer(i, checkTwoLevelIterator)
		return i
	},
}

// TODO(jackson): rangedel fragmentBlockIters can't be pooled because of some
// code paths that double Close the iters. Fix the double close and pool the
// *fragmentBlockIter type directly.

var rangeKeyFragmentBlockIterPool = sync.Pool{
	New: func() interface{} {
		i := &rangeKeyFragmentBlockIter{}
		// Note: this is a no-op if invariants are disabled or race is enabled.
		invariants.SetFinalizer(i, checkRangeKeyFragmentBlockIterator)
		return i
	},
}

func checkSingleLevelIterator(obj interface{}) {
	i := obj.(*singleLevelIterator)
	if p := i.data.cacheHandle.Get(); p != nil {
		fmt.Fprintf(os.Stderr, "singleLevelIterator.data.cacheHandle is not nil: %p\n", p)
		os.Exit(1)
	}
	if p := i.index.cacheHandle.Get(); p != nil {
		fmt.Fprintf(os.Stderr, "singleLevelIterator.index.cacheHandle is not nil: %p\n", p)
		os.Exit(1)
	}
}

func checkTwoLevelIterator(obj interface{}) {
	i := obj.(*twoLevelIterator)
	if p := i.data.cacheHandle.Get(); p != nil {
		fmt.Fprintf(os.Stderr, "twoLevelIterator.data.cacheHandle is not nil: %p\n", p)
		os.Exit(1)
	}
	if p := i.index.cacheHandle.Get(); p != nil {
		fmt.Fprintf(os.Stderr, "twoLevelIterator.index.cacheHandle is not nil: %p\n", p)
		os.Exit(1)
	}
}

func checkRangeKeyFragmentBlockIterator(obj interface{}) {
	i := obj.(*rangeKeyFragmentBlockIter)
	if p := i.blockIter.cacheHandle.Get(); p != nil {
		fmt.Fprintf(os.Stderr, "fragmentBlockIter.blockIter.cacheHandle is not nil: %p\n", p)
		os.Exit(1)
	}
}

// init initializes a singleLevelIterator for reading from the table. It is
// synonymous with Reader.NewIter, but allows for reuse of the iterator
// between different Readers.
func (i *singleLevelIterator) init(
	r *Reader,
	lower, upper []byte,
	filterer *BlockPropertiesFilterer,
	useFilter bool,
	stats *base.InternalIteratorStats,
) error {
	if r.err != nil {
		return r.err
	}
	indexH, err := r.readIndex()
	if err != nil {
		return err
	}

	i.lower = lower
	i.upper = upper
	i.bpfs = filterer
	i.useFilter = useFilter
	i.reader = r
	i.cmp = r.Compare
	i.stats = stats
	err = i.index.initHandle(i.cmp, indexH, r.Properties.GlobalSeqNum)
	if err != nil {
		// blockIter.Close releases indexH and always returns a nil error
		_ = i.index.Close()
		return err
	}
	i.dataRS.size = initialReadaheadSize
	return nil
}

// setupForCompaction sets up the singleLevelIterator for use with compactionIter.
// Currently, it skips readahead ramp-up. It should be called after init is called.
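//
// A hedged, illustrative call-order sketch (construction is normally driven by
// the Reader rather than written out like this):
//
//	it := singleLevelIterPool.Get().(*singleLevelIterator)
//	if err := it.init(r, nil /* lower */, nil /* upper */, nil /* filterer */, false /* useFilter */, nil /* stats */); err != nil {
//		return err
//	}
//	it.setupForCompaction() // enables sequential-read mode; call only after a successful init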
331 func (i *singleLevelIterator) setupForCompaction() { 332 if i.reader.fs != nil { 333 f, err := i.reader.fs.Open(i.reader.filename, vfs.SequentialReadsOption) 334 if err == nil { 335 // Given that this iterator is for a compaction, we can assume that it 336 // will be read sequentially and we can skip the readahead ramp-up. 337 i.dataRS.sequentialFile = f 338 } 339 } 340 } 341 342 func (i *singleLevelIterator) resetForReuse() singleLevelIterator { 343 return singleLevelIterator{ 344 index: i.index.resetForReuse(), 345 data: i.data.resetForReuse(), 346 } 347 } 348 349 func (i *singleLevelIterator) initBounds() { 350 // Trim the iteration bounds for the current block. We don't have to check 351 // the bounds on each iteration if the block is entirely contained within the 352 // iteration bounds. 353 i.blockLower = i.lower 354 if i.blockLower != nil { 355 key, _ := i.data.First() 356 if key != nil && i.cmp(i.blockLower, key.UserKey) < 0 { 357 // The lower-bound is less than the first key in the block. No need 358 // to check the lower-bound again for this block. 359 i.blockLower = nil 360 } 361 } 362 i.blockUpper = i.upper 363 if i.blockUpper != nil && i.cmp(i.blockUpper, i.index.Key().UserKey) > 0 { 364 // The upper-bound is greater than the index key which itself is greater 365 // than or equal to every key in the block. No need to check the 366 // upper-bound again for this block. 367 i.blockUpper = nil 368 } 369 } 370 371 type loadBlockResult int8 372 373 const ( 374 loadBlockOK loadBlockResult = iota 375 // Could be due to error or because no block left to load. 376 loadBlockFailed 377 loadBlockIrrelevant 378 ) 379 380 // loadBlock loads the block at the current index position and leaves i.data 381 // unpositioned. If unsuccessful, it sets i.err to any error encountered, which 382 // may be nil if we have simply exhausted the entire table. 383 func (i *singleLevelIterator) loadBlock(dir int8) loadBlockResult { 384 if !i.index.valid() { 385 // Ensure the data block iterator is invalidated even if loading of the block 386 // fails. 387 i.data.invalidate() 388 return loadBlockFailed 389 } 390 // Load the next block. 391 v := i.index.Value() 392 bhp, err := decodeBlockHandleWithProperties(v) 393 if i.dataBH == bhp.BlockHandle && i.data.valid() { 394 // We're already at the data block we want to load. Reset bounds in case 395 // they changed since the last seek, but don't reload the block from cache 396 // or disk. 397 // 398 // It's safe to leave i.data in its original state here, as all callers to 399 // loadBlock make an absolute positioning call (i.e. a seek, first, or last) 400 // to `i.data` right after loadBlock returns loadBlockOK. 401 i.initBounds() 402 return loadBlockOK 403 } 404 // Ensure the data block iterator is invalidated even if loading of the block 405 // fails. 
	i.data.invalidate()
	i.dataBH = bhp.BlockHandle
	if err != nil {
		i.err = errCorruptIndexEntry
		return loadBlockFailed
	}
	if i.bpfs != nil {
		intersects, err := i.bpfs.intersects(bhp.Props)
		if err != nil {
			i.err = errCorruptIndexEntry
			return loadBlockFailed
		}
		if intersects == blockMaybeExcluded {
			intersects = i.resolveMaybeExcluded(dir)
		}
		if intersects == blockExcluded {
			i.maybeFilteredKeysSingleLevel = true
			return loadBlockIrrelevant
		}
		// blockIntersects
	}
	block, err := i.readBlockWithStats(i.dataBH, &i.dataRS)
	if err != nil {
		i.err = err
		return loadBlockFailed
	}
	i.err = i.data.initHandle(i.cmp, block, i.reader.Properties.GlobalSeqNum)
	if i.err != nil {
		// The block is partially loaded, and we don't want it to appear valid.
		i.data.invalidate()
		return loadBlockFailed
	}
	i.initBounds()
	return loadBlockOK
}

// resolveMaybeExcluded is invoked when the block-property filterer has found
// that a block is excluded according to its properties but only if its bounds
// fall within the filter's current bounds. This function consults the
// appropriate bound, depending on the iteration direction, and returns either
// `blockIntersects` or `blockExcluded`.
func (i *singleLevelIterator) resolveMaybeExcluded(dir int8) intersectsResult {
	// TODO(jackson): We could first try comparing to top-level index block's
	// key, and if within bounds avoid per-data block key comparisons.

	// This iterator is configured with a bound-limited block property
	// filter. The bpf determined this block could be excluded from
	// iteration based on the property encoded in the block handle.
	// However, we still need to determine if the block is wholly
	// contained within the filter's key bounds.
	//
	// External guarantees ensure all the block's keys are ≥ the
	// filter's lower bound during forward iteration, and that all the
	// block's keys are < the filter's upper bound during backward
	// iteration. We only need to determine if the opposite bound is
	// also met.
	//
	// The index separator in index.Key() provides an inclusive
	// upper-bound for the data block's keys, guaranteeing that all its
	// keys are ≤ index.Key(). For forward iteration, this is all we
	// need.
	if dir > 0 {
		// Forward iteration.
		if i.bpfs.boundLimitedFilter.KeyIsWithinUpperBound(i.index.Key()) {
			return blockExcluded
		}
		return blockIntersects
	}

	// Reverse iteration.
	//
	// Because we're iterating in the reverse direction, we don't yet have
	// enough context available to determine if the block is wholly contained
	// within its bounds. This case arises only during backward iteration,
	// because of the way the index is structured.
	//
	// Consider a bound-limited bpf limited to the bounds [b,d), loading the
	// block with separator `c`. During reverse iteration, the guarantee that
	// all the block's keys are < `d` is externally provided, but no guarantee
	// is made on the bpf's lower bound. The separator `c` only provides an
	// inclusive upper bound on the block's keys, indicating that the
	// corresponding block handle points to a block containing only keys ≤ `c`.
488 // 489 // To establish a lower bound, we step the index backwards to read the 490 // previous block's separator, which provides an inclusive lower bound on 491 // the original block's keys. Afterwards, we step forward to restore our 492 // index position. 493 if peekKey, _ := i.index.Prev(); peekKey == nil { 494 // The original block points to the first block of this index block. If 495 // there's a two-level index, it could potentially provide a lower 496 // bound, but the code refactoring necessary to read it doesn't seem 497 // worth the payoff. We fall through to loading the block. 498 } else if i.bpfs.boundLimitedFilter.KeyIsWithinLowerBound(peekKey) { 499 // The lower-bound on the original block falls within the filter's 500 // bounds, and we can skip the block (after restoring our current index 501 // position). 502 _, _ = i.index.Next() 503 return blockExcluded 504 } 505 _, _ = i.index.Next() 506 return blockIntersects 507 } 508 509 func (i *singleLevelIterator) readBlockWithStats( 510 bh BlockHandle, raState *readaheadState, 511 ) (cache.Handle, error) { 512 block, cacheHit, err := i.reader.readBlock(bh, nil /* transform */, raState) 513 if err == nil && i.stats != nil { 514 n := bh.Length 515 i.stats.BlockBytes += n 516 if cacheHit { 517 i.stats.BlockBytesInCache += n 518 } 519 } 520 return block, err 521 } 522 523 func (i *singleLevelIterator) initBoundsForAlreadyLoadedBlock() { 524 if i.data.firstKey.UserKey == nil { 525 panic("initBoundsForAlreadyLoadedBlock must not be called on empty or corrupted block") 526 } 527 i.blockLower = i.lower 528 if i.blockLower != nil { 529 if i.data.firstKey.UserKey != nil && i.cmp(i.blockLower, i.data.firstKey.UserKey) < 0 { 530 // The lower-bound is less than the first key in the block. No need 531 // to check the lower-bound again for this block. 532 i.blockLower = nil 533 } 534 } 535 i.blockUpper = i.upper 536 if i.blockUpper != nil && i.cmp(i.blockUpper, i.index.Key().UserKey) > 0 { 537 // The upper-bound is greater than the index key which itself is greater 538 // than or equal to every key in the block. No need to check the 539 // upper-bound again for this block. 540 i.blockUpper = nil 541 } 542 } 543 544 // The number of times to call Next/Prev in a block before giving up and seeking. 545 // The value of 4 is arbitrary. 546 // TODO(sumeer): experiment with dynamic adjustment based on the history of 547 // seeks for a particular iterator. 
548 const numStepsBeforeSeek = 4 549 550 func (i *singleLevelIterator) trySeekGEUsingNextWithinBlock( 551 key []byte, 552 ) (k *InternalKey, v []byte, done bool) { 553 k, v = i.data.Key(), i.data.Value() 554 for j := 0; j < numStepsBeforeSeek; j++ { 555 curKeyCmp := i.cmp(k.UserKey, key) 556 if curKeyCmp >= 0 { 557 if i.blockUpper != nil && i.cmp(k.UserKey, i.blockUpper) >= 0 { 558 i.exhaustedBounds = +1 559 return nil, nil, true 560 } 561 return k, v, true 562 } 563 k, v = i.data.Next() 564 if k == nil { 565 break 566 } 567 } 568 return k, v, false 569 } 570 571 func (i *singleLevelIterator) trySeekLTUsingPrevWithinBlock( 572 key []byte, 573 ) (k *InternalKey, v []byte, done bool) { 574 k, v = i.data.Key(), i.data.Value() 575 for j := 0; j < numStepsBeforeSeek; j++ { 576 curKeyCmp := i.cmp(k.UserKey, key) 577 if curKeyCmp < 0 { 578 if i.blockLower != nil && i.cmp(k.UserKey, i.blockLower) < 0 { 579 i.exhaustedBounds = -1 580 return nil, nil, true 581 } 582 return k, v, true 583 } 584 k, v = i.data.Prev() 585 if k == nil { 586 break 587 } 588 } 589 return k, v, false 590 } 591 592 func (i *singleLevelIterator) recordOffset() uint64 { 593 offset := i.dataBH.Offset 594 if i.data.valid() { 595 // - i.dataBH.Length/len(i.data.data) is the compression ratio. If 596 // uncompressed, this is 1. 597 // - i.data.nextOffset is the uncompressed position of the current record 598 // in the block. 599 // - i.dataBH.Offset is the offset of the block in the sstable before 600 // decompression. 601 offset += (uint64(i.data.nextOffset) * i.dataBH.Length) / uint64(len(i.data.data)) 602 } else { 603 // Last entry in the block must increment bytes iterated by the size of the block trailer 604 // and restart points. 605 offset += i.dataBH.Length + blockTrailerLen 606 } 607 return offset 608 } 609 610 // SeekGE implements internalIterator.SeekGE, as documented in the bitalostable 611 // package. Note that SeekGE only checks the upper bound. It is up to the 612 // caller to ensure that key is greater than or equal to the lower bound. 613 func (i *singleLevelIterator) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, []byte) { 614 // The i.exhaustedBounds comparison indicates that the upper bound was 615 // reached. The i.data.isDataInvalidated() indicates that the sstable was 616 // exhausted. 617 if flags.TrySeekUsingNext() && (i.exhaustedBounds == +1 || i.data.isDataInvalidated()) { 618 // Already exhausted, so return nil. 619 return nil, nil 620 } 621 622 i.exhaustedBounds = 0 623 i.err = nil 624 boundsCmp := i.boundsCmp 625 // Seek optimization only applies until iterator is first positioned after SetBounds. 626 i.boundsCmp = 0 627 i.positionedUsingLatestBounds = true 628 return i.seekGEHelper(key, boundsCmp, flags) 629 } 630 631 // seekGEHelper contains the common functionality for SeekGE and SeekPrefixGE. 632 func (i *singleLevelIterator) seekGEHelper( 633 key []byte, boundsCmp int, flags base.SeekGEFlags, 634 ) (*InternalKey, []byte) { 635 // Invariant: trySeekUsingNext => !i.data.isDataInvalidated() && i.exhaustedBounds != +1 636 637 // SeekGE performs various step-instead-of-seeking optimizations: eg enabled 638 // by trySeekUsingNext, or by monotonically increasing bounds (i.boundsCmp). 639 // Care must be taken to ensure that when performing these optimizations and 640 // the iterator becomes exhausted, i.maybeFilteredKeys is set appropriately. 641 // Consider a previous SeekGE that filtered keys from k until the current 642 // iterator position. 
643 // 644 // If the previous SeekGE exhausted the iterator, it's possible keys greater 645 // than or equal to the current search key were filtered. We must not reuse 646 // the current iterator position without remembering the previous value of 647 // maybeFilteredKeys. 648 649 var dontSeekWithinBlock bool 650 if !i.data.isDataInvalidated() && !i.index.isDataInvalidated() && i.data.valid() && i.index.valid() && 651 boundsCmp > 0 && i.cmp(key, i.index.Key().UserKey) <= 0 { 652 // Fast-path: The bounds have moved forward and this SeekGE is 653 // respecting the lower bound (guaranteed by Iterator). We know that 654 // the iterator must already be positioned within or just outside the 655 // previous bounds. Therefore it cannot be positioned at a block (or 656 // the position within that block) that is ahead of the seek position. 657 // However it can be positioned at an earlier block. This fast-path to 658 // use Next() on the block is only applied when we are already at the 659 // block that the slow-path (the else-clause) would load -- this is 660 // the motivation for the i.cmp(key, i.index.Key().UserKey) <= 0 661 // predicate. 662 i.initBoundsForAlreadyLoadedBlock() 663 ikey, val, done := i.trySeekGEUsingNextWithinBlock(key) 664 if done { 665 return ikey, val 666 } 667 if ikey == nil { 668 // Done with this block. 669 dontSeekWithinBlock = true 670 } 671 } else { 672 // Cannot use bounds monotonicity. But may be able to optimize if 673 // caller claimed externally known invariant represented by 674 // flags.TrySeekUsingNext(). 675 if flags.TrySeekUsingNext() { 676 // seekPrefixGE or SeekGE has already ensured 677 // !i.data.isDataInvalidated() && i.exhaustedBounds != +1 678 currKey := i.data.Key() 679 value := i.data.Value() 680 less := i.cmp(currKey.UserKey, key) < 0 681 // We could be more sophisticated and confirm that the seek 682 // position is within the current block before applying this 683 // optimization. But there may be some benefit even if it is in 684 // the next block, since we can avoid seeking i.index. 685 for j := 0; less && j < numStepsBeforeSeek; j++ { 686 currKey, value = i.Next() 687 if currKey == nil { 688 return nil, nil 689 } 690 less = i.cmp(currKey.UserKey, key) < 0 691 } 692 if !less { 693 if i.blockUpper != nil && i.cmp(currKey.UserKey, i.blockUpper) >= 0 { 694 i.exhaustedBounds = +1 695 return nil, nil 696 } 697 return currKey, value 698 } 699 } 700 701 // Slow-path. 702 703 // Since we're re-seeking the iterator, the previous value of 704 // maybeFilteredKeysSingleLevel is irrelevant. If we filter out blocks 705 // during seeking, loadBlock will set it to true. 706 i.maybeFilteredKeysSingleLevel = false 707 708 var ikey *InternalKey 709 if ikey, _ = i.index.SeekGE(key, flags.DisableTrySeekUsingNext()); ikey == nil { 710 // The target key is greater than any key in the index block. 711 // Invalidate the block iterator so that a subsequent call to Prev() 712 // will return the last key in the table. 713 i.data.invalidate() 714 return nil, nil 715 } 716 result := i.loadBlock(+1) 717 if result == loadBlockFailed { 718 return nil, nil 719 } 720 if result == loadBlockIrrelevant { 721 // Enforce the upper bound here since don't want to bother moving 722 // to the next block if upper bound is already exceeded. Note that 723 // the next block starts with keys >= ikey.UserKey since even 724 // though this is the block separator, the same user key can span 725 // multiple blocks. Since upper is exclusive we use >= below. 
726 if i.upper != nil && i.cmp(ikey.UserKey, i.upper) >= 0 { 727 i.exhaustedBounds = +1 728 return nil, nil 729 } 730 // Want to skip to the next block. 731 dontSeekWithinBlock = true 732 } 733 } 734 if !dontSeekWithinBlock { 735 if ikey, val := i.data.SeekGE(key, flags.DisableTrySeekUsingNext()); ikey != nil { 736 if i.blockUpper != nil && i.cmp(ikey.UserKey, i.blockUpper) >= 0 { 737 i.exhaustedBounds = +1 738 return nil, nil 739 } 740 return ikey, val 741 } 742 } 743 return i.skipForward() 744 } 745 746 // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the 747 // bitalostable package. Note that SeekPrefixGE only checks the upper bound. It is up 748 // to the caller to ensure that key is greater than or equal to the lower bound. 749 func (i *singleLevelIterator) SeekPrefixGE( 750 prefix, key []byte, flags base.SeekGEFlags, 751 ) (*base.InternalKey, []byte) { 752 k, v := i.seekPrefixGE(prefix, key, flags, i.useFilter) 753 return k, v 754 } 755 756 func (i *singleLevelIterator) seekPrefixGE( 757 prefix, key []byte, flags base.SeekGEFlags, checkFilter bool, 758 ) (k *InternalKey, value []byte) { 759 i.err = nil 760 if checkFilter && i.reader.tableFilter != nil { 761 if !i.lastBloomFilterMatched { 762 // Iterator is not positioned based on last seek. 763 flags = flags.DisableTrySeekUsingNext() 764 } 765 i.lastBloomFilterMatched = false 766 // Check prefix bloom filter. 767 var dataH cache.Handle 768 dataH, i.err = i.reader.readFilter() 769 if i.err != nil { 770 i.data.invalidate() 771 return nil, nil 772 } 773 mayContain := i.reader.tableFilter.mayContain(dataH.Get(), prefix) 774 dataH.Release() 775 if !mayContain { 776 // This invalidation may not be necessary for correctness, and may 777 // be a place to optimize later by reusing the already loaded 778 // block. It was necessary in earlier versions of the code since 779 // the caller was allowed to call Next when SeekPrefixGE returned 780 // nil. This is no longer allowed. 781 i.data.invalidate() 782 return nil, nil 783 } 784 i.lastBloomFilterMatched = true 785 } 786 // The i.exhaustedBounds comparison indicates that the upper bound was 787 // reached. The i.data.isDataInvalidated() indicates that the sstable was 788 // exhausted. 789 if flags.TrySeekUsingNext() && (i.exhaustedBounds == +1 || i.data.isDataInvalidated()) { 790 // Already exhausted, so return nil. 791 return nil, nil 792 } 793 // Bloom filter matches, or skipped, so this method will position the 794 // iterator. 795 i.exhaustedBounds = 0 796 boundsCmp := i.boundsCmp 797 // Seek optimization only applies until iterator is first positioned after SetBounds. 798 i.boundsCmp = 0 799 i.positionedUsingLatestBounds = true 800 k, value = i.seekGEHelper(key, boundsCmp, flags) 801 return k, value 802 } 803 804 // SeekLT implements internalIterator.SeekLT, as documented in the bitalostable 805 // package. Note that SeekLT only checks the lower bound. It is up to the 806 // caller to ensure that key is less than the upper bound. 807 func (i *singleLevelIterator) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, []byte) { 808 i.exhaustedBounds = 0 809 i.err = nil 810 boundsCmp := i.boundsCmp 811 // Seek optimization only applies until iterator is first positioned after SetBounds. 812 i.boundsCmp = 0 813 814 // Seeking operations perform various step-instead-of-seeking optimizations: 815 // eg by considering monotonically increasing bounds (i.boundsCmp). 
	// Care must be taken to ensure that when performing these optimizations and
	// the iterator becomes exhausted, i.maybeFilteredKeysSingleLevel is set
	// appropriately. Consider a previous SeekLT that filtered keys from k
	// until the current iterator position.
	//
	// If the previous SeekLT exhausted the iterator, it's possible keys
	// less than the current search key were filtered. We must not reuse the
	// current iterator position without remembering the previous value of
	// maybeFilteredKeysSingleLevel.

	i.positionedUsingLatestBounds = true

	var dontSeekWithinBlock bool
	if !i.data.isDataInvalidated() && !i.index.isDataInvalidated() && i.data.valid() && i.index.valid() &&
		boundsCmp < 0 && i.cmp(i.data.firstKey.UserKey, key) < 0 {
		// Fast-path: The bounds have moved backward, and this SeekLT is
		// respecting the upper bound (guaranteed by Iterator). We know that
		// the iterator must already be positioned within or just outside the
		// previous bounds. Therefore it cannot be positioned at a block (or
		// the position within that block) that is behind the seek position.
		// However it can be positioned at a later block. This fast-path to
		// use Prev() on the block is only applied when we are already at the
		// block that can satisfy this seek -- this is the motivation for
		// the i.cmp(i.data.firstKey.UserKey, key) < 0 predicate.
		i.initBoundsForAlreadyLoadedBlock()
		ikey, val, done := i.trySeekLTUsingPrevWithinBlock(key)
		if done {
			return ikey, val
		}
		if ikey == nil {
			// Done with this block.
			dontSeekWithinBlock = true
		}
	} else {
		// Slow-path.
		i.maybeFilteredKeysSingleLevel = false
		var ikey *InternalKey

		// NB: If a bound-limited block property filter is configured, it's
		// externally ensured that the filter is disabled (through returning
		// Intersects=false irrespective of the block props provided) during
		// seeks.
		if ikey, _ = i.index.SeekGE(key, base.SeekGEFlagsNone); ikey == nil {
			ikey, _ = i.index.Last()
			if ikey == nil {
				return nil, nil
			}
		}
		// INVARIANT: ikey != nil.
		result := i.loadBlock(-1)
		if result == loadBlockFailed {
			return nil, nil
		}
		if result == loadBlockIrrelevant {
			// Enforce the lower bound here since we don't want to bother moving
			// to the previous block if lower bound is already exceeded. Note
			// that the previous block starts with keys <= ikey.UserKey since
			// even though this is the current block's separator, the same
			// user key can span multiple blocks.
			if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 {
				i.exhaustedBounds = -1
				return nil, nil
			}
			// Want to skip to the previous block.
			dontSeekWithinBlock = true
		}
	}
	if !dontSeekWithinBlock {
		if ikey, val := i.data.SeekLT(key, flags); ikey != nil {
			if i.blockLower != nil && i.cmp(ikey.UserKey, i.blockLower) < 0 {
				i.exhaustedBounds = -1
				return nil, nil
			}
			return ikey, val
		}
	}
	// The index contains separator keys which may lie between
	// user-keys. Consider the user-keys:
	//
	//   complete
	// ---- new block ---
	//   complexion
	//
	// If these two keys end one block and start the next, the index key may
	// be chosen as "compleu". The SeekGE in the index block will then point
	// us to the block containing "complexion". If this happens, we want the
	// last key from the previous data block.
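	// As a concrete, illustrative instance of the above: for SeekLT("complex"),
	// index.SeekGE("complex") skips the separator "compleu" (which sorts before
	// "complex") and lands on the entry for the block starting at "complexion".
	// That block holds no key < "complex", so the SeekLT within it finds
	// nothing, and skipBackward steps the index back one entry to load the
	// previous block and return its last key, "complete".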
903 return i.skipBackward() 904 } 905 906 // First implements internalIterator.First, as documented in the bitalostable 907 // package. Note that First only checks the upper bound. It is up to the caller 908 // to ensure that key is greater than or equal to the lower bound (e.g. via a 909 // call to SeekGE(lower)). 910 func (i *singleLevelIterator) First() (*InternalKey, []byte) { 911 if i.lower != nil { 912 panic("singleLevelIterator.First() used despite lower bound") 913 } 914 i.positionedUsingLatestBounds = true 915 i.maybeFilteredKeysSingleLevel = false 916 return i.firstInternal() 917 } 918 919 // firstInternal is a helper used for absolute positioning in a single-level 920 // index file, or for positioning in the second-level index in a two-level 921 // index file. For the latter, one cannot make any claims about absolute 922 // positioning. 923 func (i *singleLevelIterator) firstInternal() (*InternalKey, []byte) { 924 i.exhaustedBounds = 0 925 i.err = nil 926 // Seek optimization only applies until iterator is first positioned after SetBounds. 927 i.boundsCmp = 0 928 929 var ikey *InternalKey 930 if ikey, _ = i.index.First(); ikey == nil { 931 i.data.invalidate() 932 return nil, nil 933 } 934 result := i.loadBlock(+1) 935 if result == loadBlockFailed { 936 return nil, nil 937 } 938 if result == loadBlockOK { 939 if ikey, val := i.data.First(); ikey != nil { 940 if i.blockUpper != nil && i.cmp(ikey.UserKey, i.blockUpper) >= 0 { 941 i.exhaustedBounds = +1 942 return nil, nil 943 } 944 return ikey, val 945 } 946 // Else fall through to skipForward. 947 } else { 948 // result == loadBlockIrrelevant. Enforce the upper bound here since 949 // don't want to bother moving to the next block if upper bound is 950 // already exceeded. Note that the next block starts with keys >= 951 // ikey.UserKey since even though this is the block separator, the 952 // same user key can span multiple blocks. Since upper is exclusive we 953 // use >= below. 954 if i.upper != nil && i.cmp(ikey.UserKey, i.upper) >= 0 { 955 i.exhaustedBounds = +1 956 return nil, nil 957 } 958 // Else fall through to skipForward. 959 } 960 961 return i.skipForward() 962 } 963 964 // Last implements internalIterator.Last, as documented in the bitalostable 965 // package. Note that Last only checks the lower bound. It is up to the caller 966 // to ensure that key is less than the upper bound (e.g. via a call to 967 // SeekLT(upper)) 968 func (i *singleLevelIterator) Last() (*InternalKey, []byte) { 969 if i.upper != nil { 970 panic("singleLevelIterator.Last() used despite upper bound") 971 } 972 i.positionedUsingLatestBounds = true 973 i.maybeFilteredKeysSingleLevel = false 974 return i.lastInternal() 975 } 976 977 // lastInternal is a helper used for absolute positioning in a single-level 978 // index file, or for positioning in the second-level index in a two-level 979 // index file. For the latter, one cannot make any claims about absolute 980 // positioning. 981 func (i *singleLevelIterator) lastInternal() (*InternalKey, []byte) { 982 i.exhaustedBounds = 0 983 i.err = nil 984 // Seek optimization only applies until iterator is first positioned after SetBounds. 
985 i.boundsCmp = 0 986 987 var ikey *InternalKey 988 if ikey, _ = i.index.Last(); ikey == nil { 989 i.data.invalidate() 990 return nil, nil 991 } 992 result := i.loadBlock(-1) 993 if result == loadBlockFailed { 994 return nil, nil 995 } 996 if result == loadBlockOK { 997 if ikey, val := i.data.Last(); ikey != nil { 998 if i.blockLower != nil && i.cmp(ikey.UserKey, i.blockLower) < 0 { 999 i.exhaustedBounds = -1 1000 return nil, nil 1001 } 1002 return ikey, val 1003 } 1004 // Else fall through to skipBackward. 1005 } else { 1006 // result == loadBlockIrrelevant. Enforce the lower bound here since 1007 // don't want to bother moving to the previous block if lower bound is 1008 // already exceeded. Note that the previous block starts with keys <= 1009 // key.UserKey since even though this is the current block's 1010 // separator, the same user key can span multiple blocks. 1011 if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 { 1012 i.exhaustedBounds = -1 1013 return nil, nil 1014 } 1015 } 1016 1017 return i.skipBackward() 1018 } 1019 1020 // Next implements internalIterator.Next, as documented in the bitalostable 1021 // package. 1022 // Note: compactionIterator.Next mirrors the implementation of Iterator.Next 1023 // due to performance. Keep the two in sync. 1024 func (i *singleLevelIterator) Next() (*InternalKey, []byte) { 1025 if i.exhaustedBounds == +1 { 1026 panic("Next called even though exhausted upper bound") 1027 } 1028 i.exhaustedBounds = 0 1029 i.maybeFilteredKeysSingleLevel = false 1030 // Seek optimization only applies until iterator is first positioned after SetBounds. 1031 i.boundsCmp = 0 1032 1033 if i.err != nil { 1034 return nil, nil 1035 } 1036 if key, val := i.data.Next(); key != nil { 1037 if i.blockUpper != nil && i.cmp(key.UserKey, i.blockUpper) >= 0 { 1038 i.exhaustedBounds = +1 1039 return nil, nil 1040 } 1041 return key, val 1042 } 1043 return i.skipForward() 1044 } 1045 1046 // Prev implements internalIterator.Prev, as documented in the bitalostable 1047 // package. 1048 func (i *singleLevelIterator) Prev() (*InternalKey, []byte) { 1049 if i.exhaustedBounds == -1 { 1050 panic("Prev called even though exhausted lower bound") 1051 } 1052 i.exhaustedBounds = 0 1053 i.maybeFilteredKeysSingleLevel = false 1054 // Seek optimization only applies until iterator is first positioned after SetBounds. 1055 i.boundsCmp = 0 1056 1057 if i.err != nil { 1058 return nil, nil 1059 } 1060 if key, val := i.data.Prev(); key != nil { 1061 if i.blockLower != nil && i.cmp(key.UserKey, i.blockLower) < 0 { 1062 i.exhaustedBounds = -1 1063 return nil, nil 1064 } 1065 return key, val 1066 } 1067 return i.skipBackward() 1068 } 1069 1070 func (i *singleLevelIterator) skipForward() (*InternalKey, []byte) { 1071 for { 1072 var key *InternalKey 1073 if key, _ = i.index.Next(); key == nil { 1074 i.data.invalidate() 1075 break 1076 } 1077 result := i.loadBlock(+1) 1078 if result != loadBlockOK { 1079 if i.err != nil { 1080 break 1081 } 1082 if result == loadBlockFailed { 1083 // We checked that i.index was at a valid entry, so 1084 // loadBlockFailed could not have happened due to to i.index 1085 // being exhausted, and must be due to an error. 1086 panic("loadBlock should not have failed with no error") 1087 } 1088 // result == loadBlockIrrelevant. Enforce the upper bound here 1089 // since don't want to bother moving to the next block if upper 1090 // bound is already exceeded. 
Note that the next block starts with 1091 // keys >= key.UserKey since even though this is the block 1092 // separator, the same user key can span multiple blocks. Since 1093 // upper is exclusive we use >= below. 1094 if i.upper != nil && i.cmp(key.UserKey, i.upper) >= 0 { 1095 i.exhaustedBounds = +1 1096 return nil, nil 1097 } 1098 continue 1099 } 1100 if key, val := i.data.First(); key != nil { 1101 if i.blockUpper != nil && i.cmp(key.UserKey, i.blockUpper) >= 0 { 1102 i.exhaustedBounds = +1 1103 return nil, nil 1104 } 1105 return key, val 1106 } 1107 } 1108 return nil, nil 1109 } 1110 1111 func (i *singleLevelIterator) skipBackward() (*InternalKey, []byte) { 1112 for { 1113 var key *InternalKey 1114 if key, _ = i.index.Prev(); key == nil { 1115 i.data.invalidate() 1116 break 1117 } 1118 result := i.loadBlock(-1) 1119 if result != loadBlockOK { 1120 if i.err != nil { 1121 break 1122 } 1123 if result == loadBlockFailed { 1124 // We checked that i.index was at a valid entry, so 1125 // loadBlockFailed could not have happened due to to i.index 1126 // being exhausted, and must be due to an error. 1127 panic("loadBlock should not have failed with no error") 1128 } 1129 // result == loadBlockIrrelevant. Enforce the lower bound here 1130 // since don't want to bother moving to the previous block if lower 1131 // bound is already exceeded. Note that the previous block starts with 1132 // keys <= key.UserKey since even though this is the current block's 1133 // separator, the same user key can span multiple blocks. 1134 if i.lower != nil && i.cmp(key.UserKey, i.lower) < 0 { 1135 i.exhaustedBounds = -1 1136 return nil, nil 1137 } 1138 continue 1139 } 1140 key, val := i.data.Last() 1141 if key == nil { 1142 return nil, nil 1143 } 1144 if i.blockLower != nil && i.cmp(key.UserKey, i.blockLower) < 0 { 1145 i.exhaustedBounds = -1 1146 return nil, nil 1147 } 1148 return key, val 1149 } 1150 return nil, nil 1151 } 1152 1153 // Error implements internalIterator.Error, as documented in the bitalostable 1154 // package. 1155 func (i *singleLevelIterator) Error() error { 1156 if err := i.data.Error(); err != nil { 1157 return err 1158 } 1159 return i.err 1160 } 1161 1162 // MaybeFilteredKeys may be called when an iterator is exhausted to indicate 1163 // whether or not the last positioning method may have skipped any keys due to 1164 // block-property filters. 1165 func (i *singleLevelIterator) MaybeFilteredKeys() bool { 1166 return i.maybeFilteredKeysSingleLevel 1167 } 1168 1169 // SetCloseHook sets a function that will be called when the iterator is 1170 // closed. 1171 func (i *singleLevelIterator) SetCloseHook(fn func(i Iterator) error) { 1172 i.closeHook = fn 1173 } 1174 1175 func firstError(err0, err1 error) error { 1176 if err0 != nil { 1177 return err0 1178 } 1179 return err1 1180 } 1181 1182 // Close implements internalIterator.Close, as documented in the bitalostable 1183 // package. 
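//
// Close also returns the iterator to its sync.Pool (see the resetForReuse and
// singleLevelIterPool.Put calls below), so the iterator must not be used after
// Close returns. A minimal usage sketch, assuming a Reader.NewIter(lower,
// upper) constructor as in upstream Pebble:
//
//	iter, err := r.NewIter(lower, upper)
//	if err != nil {
//		return err
//	}
//	defer iter.Close()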
1184 func (i *singleLevelIterator) Close() error { 1185 var err error 1186 if i.closeHook != nil { 1187 err = firstError(err, i.closeHook(i)) 1188 } 1189 err = firstError(err, i.data.Close()) 1190 err = firstError(err, i.index.Close()) 1191 if i.dataRS.sequentialFile != nil { 1192 err = firstError(err, i.dataRS.sequentialFile.Close()) 1193 i.dataRS.sequentialFile = nil 1194 } 1195 err = firstError(err, i.err) 1196 if i.bpfs != nil { 1197 releaseBlockPropertiesFilterer(i.bpfs) 1198 } 1199 *i = i.resetForReuse() 1200 singleLevelIterPool.Put(i) 1201 return err 1202 } 1203 1204 func (i *singleLevelIterator) String() string { 1205 return i.reader.fileNum.String() 1206 } 1207 1208 // Deterministic disabling of the bounds-based optimization that avoids seeking. 1209 // Uses the iterator pointer, since we want diversity in iterator behavior for 1210 // the same SetBounds call. Used for tests. 1211 func disableBoundsOpt(bound []byte, ptr uintptr) bool { 1212 // Fibonacci hash https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ 1213 simpleHash := (11400714819323198485 * uint64(ptr)) >> 63 1214 return bound[len(bound)-1]&byte(1) == 0 && simpleHash == 0 1215 } 1216 1217 // SetBounds implements internalIterator.SetBounds, as documented in the bitalostable 1218 // package. 1219 func (i *singleLevelIterator) SetBounds(lower, upper []byte) { 1220 i.boundsCmp = 0 1221 if i.positionedUsingLatestBounds { 1222 if i.upper != nil && lower != nil && i.cmp(i.upper, lower) <= 0 { 1223 i.boundsCmp = +1 1224 if invariants.Enabled && disableBoundsOpt(lower, uintptr(unsafe.Pointer(i))) { 1225 i.boundsCmp = 0 1226 } 1227 } else if i.lower != nil && upper != nil && i.cmp(upper, i.lower) <= 0 { 1228 i.boundsCmp = -1 1229 if invariants.Enabled && disableBoundsOpt(upper, uintptr(unsafe.Pointer(i))) { 1230 i.boundsCmp = 0 1231 } 1232 } 1233 i.positionedUsingLatestBounds = false 1234 } 1235 i.lower = lower 1236 i.upper = upper 1237 i.blockLower = nil 1238 i.blockUpper = nil 1239 } 1240 1241 var _ base.InternalIterator = &singleLevelIterator{} 1242 var _ base.InternalIterator = &twoLevelIterator{} 1243 1244 // compactionIterator is similar to Iterator but it increments the number of 1245 // bytes that have been iterated through. 1246 type compactionIterator struct { 1247 *singleLevelIterator 1248 bytesIterated *uint64 1249 prevOffset uint64 1250 } 1251 1252 // compactionIterator implements the base.InternalIterator interface. 
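//
// An illustrative sketch of how the byte accounting is consumed. Construction
// is normally performed by the Reader (e.g. a NewCompactionIter-style helper,
// which is an assumption here); the literal struct initialization below is
// shown only for illustration:
//
//	var bytesIterated uint64
//	ci := &compactionIterator{singleLevelIterator: it, bytesIterated: &bytesIterated}
//	for k, _ := ci.First(); k != nil; k, _ = ci.Next() {
//		// bytesIterated now approximates the compressed table bytes consumed so far.
//	}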
var _ base.InternalIterator = (*compactionIterator)(nil)

func (i *compactionIterator) String() string {
	return i.reader.fileNum.String()
}

func (i *compactionIterator) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, []byte) {
	panic("bitalostable: SeekGE unimplemented")
}

func (i *compactionIterator) SeekPrefixGE(
	prefix, key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, []byte) {
	panic("bitalostable: SeekPrefixGE unimplemented")
}

func (i *compactionIterator) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, []byte) {
	panic("bitalostable: SeekLT unimplemented")
}

func (i *compactionIterator) First() (*InternalKey, []byte) {
	i.err = nil
	return i.skipForward(i.singleLevelIterator.First())
}

func (i *compactionIterator) Last() (*InternalKey, []byte) {
	panic("bitalostable: Last unimplemented")
}

// Note: compactionIterator.Next mirrors the implementation of Iterator.Next
// due to performance. Keep the two in sync.
func (i *compactionIterator) Next() (*InternalKey, []byte) {
	if i.err != nil {
		return nil, nil
	}
	return i.skipForward(i.data.Next())
}

func (i *compactionIterator) Prev() (*InternalKey, []byte) {
	panic("bitalostable: Prev unimplemented")
}

func (i *compactionIterator) skipForward(key *InternalKey, val []byte) (*InternalKey, []byte) {
	if key == nil {
		for {
			if key, _ := i.index.Next(); key == nil {
				break
			}
			result := i.loadBlock(+1)
			if result != loadBlockOK {
				if i.err != nil {
					break
				}
				switch result {
				case loadBlockFailed:
					// We checked that i.index was at a valid entry, so
					// loadBlockFailed could not have happened due to i.index
					// being exhausted, and must be due to an error.
					panic("loadBlock should not have failed with no error")
				case loadBlockIrrelevant:
					panic("compactionIter should not be using block intervals for skipping")
				default:
					panic(fmt.Sprintf("unexpected case %d", result))
				}
			}
			// result == loadBlockOK
			if key, val = i.data.First(); key != nil {
				break
			}
		}
	}

	curOffset := i.recordOffset()
	*i.bytesIterated += uint64(curOffset - i.prevOffset)
	i.prevOffset = curOffset
	return key, val
}

type twoLevelIterator struct {
	singleLevelIterator
	// maybeFilteredKeysTwoLevel indicates whether the last iterator
	// positioning operation may have skipped any index blocks due to
	// block-property filters when positioning the top-level-index.
	maybeFilteredKeysTwoLevel bool
	topLevelIndex             blockIter
}

// twoLevelIterator implements the base.InternalIterator interface.
var _ base.InternalIterator = (*twoLevelIterator)(nil)

// loadIndex loads the index block at the current top level index position and
// leaves i.index unpositioned. If unsuccessful, it sets i.err to any error
// encountered, which may be nil if we have simply exhausted the entire table.
// This is used for two level indexes.
func (i *twoLevelIterator) loadIndex(dir int8) loadBlockResult {
	// Ensure the data block iterator is invalidated even if loading of the
	// index fails.
	i.data.invalidate()
	if !i.topLevelIndex.valid() {
		i.index.offset = 0
		i.index.restarts = 0
		return loadBlockFailed
	}
	bhp, err := decodeBlockHandleWithProperties(i.topLevelIndex.Value())
	if err != nil {
		i.err = base.CorruptionErrorf("bitalostable/table: corrupt top level index entry")
		return loadBlockFailed
	}
	if i.bpfs != nil {
		intersects, err := i.bpfs.intersects(bhp.Props)
		if err != nil {
			i.err = errCorruptIndexEntry
			return loadBlockFailed
		}
		if intersects == blockMaybeExcluded {
			intersects = i.resolveMaybeExcluded(dir)
		}
		if intersects == blockExcluded {
			i.maybeFilteredKeysTwoLevel = true
			return loadBlockIrrelevant
		}
		// blockIntersects
	}
	indexBlock, err := i.readBlockWithStats(bhp.BlockHandle, nil /* readaheadState */)
	if err != nil {
		i.err = err
		return loadBlockFailed
	}
	if i.err = i.index.initHandle(
		i.cmp, indexBlock, i.reader.Properties.GlobalSeqNum); i.err == nil {
		return loadBlockOK
	}
	return loadBlockFailed
}

// resolveMaybeExcluded is invoked when the block-property filterer has found
// that an index block is excluded according to its properties but only if its
// bounds fall within the filter's current bounds. This function consults the
// appropriate bound, depending on the iteration direction, and returns either
// `blockIntersects` or `blockExcluded`.
func (i *twoLevelIterator) resolveMaybeExcluded(dir int8) intersectsResult {
	// This iterator is configured with a bound-limited block property filter.
	// The bpf determined this entire index block could be excluded from
	// iteration based on the property encoded in the block handle. However, we
	// still need to determine if the index block is wholly contained within the
	// filter's key bounds.
	//
	// External guarantees ensure all its data blocks' keys are ≥ the filter's
	// lower bound during forward iteration, and that all its data blocks' keys
	// are < the filter's upper bound during backward iteration. We only need to
	// determine if the opposite bound is also met.
	//
	// The index separator in topLevelIndex.Key() provides an inclusive
	// upper-bound for the index block's keys, guaranteeing that all its keys
	// are ≤ topLevelIndex.Key(). For forward iteration, this is all we need.
	if dir > 0 {
		// Forward iteration.
		if i.bpfs.boundLimitedFilter.KeyIsWithinUpperBound(i.topLevelIndex.Key()) {
			return blockExcluded
		}
		return blockIntersects
	}

	// Reverse iteration.
	//
	// Because we're iterating in the reverse direction, we don't yet have
	// enough context available to determine if the block is wholly contained
	// within its bounds. This case arises only during backward iteration,
	// because of the way the index is structured.
	//
	// Consider a bound-limited bpf limited to the bounds [b,d), loading the
	// block with separator `c`. During reverse iteration, the guarantee that
	// all the block's keys are < `d` is externally provided, but no guarantee
	// is made on the bpf's lower bound. The separator `c` only provides an
	// inclusive upper bound on the block's keys, indicating that the
	// corresponding block handle points to a block containing only keys ≤ `c`.
1430 // 1431 // To establish a lower bound, we step the top-level index backwards to read 1432 // the previous block's separator, which provides an inclusive lower bound 1433 // on the original index block's keys. Afterwards, we step forward to 1434 // restore our top-level index position. 1435 if peekKey, _ := i.topLevelIndex.Prev(); peekKey == nil { 1436 // The original block points to the first index block of this table. If 1437 // we knew the lower bound for the entire table, it could provide a 1438 // lower bound, but the code refactoring necessary to read it doesn't 1439 // seem worth the payoff. We fall through to loading the block. 1440 } else if i.bpfs.boundLimitedFilter.KeyIsWithinLowerBound(peekKey) { 1441 // The lower-bound on the original index block falls within the filter's 1442 // bounds, and we can skip the block (after restoring our current 1443 // top-level index position). 1444 _, _ = i.topLevelIndex.Next() 1445 return blockExcluded 1446 } 1447 _, _ = i.topLevelIndex.Next() 1448 return blockIntersects 1449 } 1450 1451 func (i *twoLevelIterator) init( 1452 r *Reader, 1453 lower, upper []byte, 1454 filterer *BlockPropertiesFilterer, 1455 useFilter bool, 1456 stats *base.InternalIteratorStats, 1457 ) error { 1458 if r.err != nil { 1459 return r.err 1460 } 1461 topLevelIndexH, err := r.readIndex() 1462 if err != nil { 1463 return err 1464 } 1465 1466 i.lower = lower 1467 i.upper = upper 1468 i.bpfs = filterer 1469 i.useFilter = useFilter 1470 i.reader = r 1471 i.cmp = r.Compare 1472 i.stats = stats 1473 err = i.topLevelIndex.initHandle(i.cmp, topLevelIndexH, r.Properties.GlobalSeqNum) 1474 if err != nil { 1475 // blockIter.Close releases topLevelIndexH and always returns a nil error 1476 _ = i.topLevelIndex.Close() 1477 return err 1478 } 1479 return nil 1480 } 1481 1482 func (i *twoLevelIterator) String() string { 1483 return i.reader.fileNum.String() 1484 } 1485 1486 // MaybeFilteredKeys may be called when an iterator is exhausted to indicate 1487 // whether or not the last positioning method may have skipped any keys due to 1488 // block-property filters. 1489 func (i *twoLevelIterator) MaybeFilteredKeys() bool { 1490 // While reading sstables with two-level indexes, knowledge of whether we've 1491 // filtered keys is tracked separately for each index level. The 1492 // seek-using-next optimizations have different criteria. We can only reset 1493 // maybeFilteredKeys back to false during a seek when NOT using the 1494 // fast-path that uses the current iterator position. 1495 // 1496 // If either level might have filtered keys to arrive at the current 1497 // iterator position, return MaybeFilteredKeys=true. 1498 return i.maybeFilteredKeysTwoLevel || i.maybeFilteredKeysSingleLevel 1499 } 1500 1501 // SeekGE implements internalIterator.SeekGE, as documented in the bitalostable 1502 // package. Note that SeekGE only checks the upper bound. It is up to the 1503 // caller to ensure that key is greater than or equal to the lower bound. 1504 func (i *twoLevelIterator) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, []byte) { 1505 i.exhaustedBounds = 0 1506 i.err = nil 1507 1508 // SeekGE performs various step-instead-of-seeking optimizations: eg enabled 1509 // by trySeekUsingNext, or by monotonically increasing bounds (i.boundsCmp). 1510 // Care must be taken to ensure that when performing these optimizations and 1511 // the iterator becomes exhausted, i.maybeFilteredKeys is set appropriately. 
1512 // Consider a previous SeekGE that filtered keys from k until the current 1513 // iterator position. 1514 // 1515 // If the previous SeekGE exhausted the iterator while seeking within the 1516 // two-level index, it's possible keys greater than or equal to the current 1517 // search key were filtered through skipped index blocks. We must not reuse 1518 // the position of the two-level index iterator without remembering the 1519 // previous value of maybeFilteredKeys. 1520 1521 var dontSeekWithinSingleLevelIter bool 1522 if i.topLevelIndex.isDataInvalidated() || !i.topLevelIndex.valid() || 1523 (i.boundsCmp <= 0 && !flags.TrySeekUsingNext()) || i.cmp(key, i.topLevelIndex.Key().UserKey) > 0 { 1524 // Slow-path: need to position the topLevelIndex. 1525 i.maybeFilteredKeysTwoLevel = false 1526 flags = flags.DisableTrySeekUsingNext() 1527 var ikey *InternalKey 1528 if ikey, _ = i.topLevelIndex.SeekGE(key, flags); ikey == nil { 1529 i.data.invalidate() 1530 i.index.invalidate() 1531 return nil, nil 1532 } 1533 1534 result := i.loadIndex(+1) 1535 if result == loadBlockFailed { 1536 return nil, nil 1537 } 1538 if result == loadBlockIrrelevant { 1539 // Enforce the upper bound here since don't want to bother moving 1540 // to the next entry in the top level index if upper bound is 1541 // already exceeded. Note that the next entry starts with keys >= 1542 // ikey.UserKey since even though this is the block separator, the 1543 // same user key can span multiple index blocks. Since upper is 1544 // exclusive we use >= below. 1545 if i.upper != nil && i.cmp(ikey.UserKey, i.upper) >= 0 { 1546 i.exhaustedBounds = +1 1547 } 1548 // Fall through to skipForward. 1549 dontSeekWithinSingleLevelIter = true 1550 } 1551 } 1552 // Else fast-path: There are two possible cases, from 1553 // (i.boundsCmp > 0 || flags.TrySeekUsingNext()): 1554 // 1555 // 1) The bounds have moved forward (i.boundsCmp > 0) and this SeekGE is 1556 // respecting the lower bound (guaranteed by Iterator). We know that 1557 // the iterator must already be positioned within or just outside the 1558 // previous bounds. Therefore the topLevelIndex iter cannot be 1559 // positioned at an entry ahead of the seek position (though it can be 1560 // positioned behind). The !i.cmp(key, i.topLevelIndex.Key().UserKey) > 0 1561 // confirms that it is not behind. Since it is not ahead and not behind 1562 // it must be at the right position. 1563 // 1564 // 2) This SeekGE will land on a key that is greater than the key we are 1565 // currently at (guaranteed by trySeekUsingNext), but since 1566 // i.cmp(key, i.topLevelIndex.Key().UserKey) <= 0, we are at the correct 1567 // lower level index block. No need to reset the state of singleLevelIterator. 1568 1569 if !dontSeekWithinSingleLevelIter { 1570 // Note that while trySeekUsingNext could be false here, singleLevelIterator 1571 // could do its own boundsCmp-based optimization to seek using next. 1572 if ikey, val := i.singleLevelIterator.SeekGE(key, flags); ikey != nil { 1573 return ikey, val 1574 } 1575 } 1576 return i.skipForward() 1577 } 1578 1579 // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the 1580 // bitalostable package. Note that SeekPrefixGE only checks the upper bound. It is up 1581 // to the caller to ensure that key is greater than or equal to the lower bound. 1582 func (i *twoLevelIterator) SeekPrefixGE( 1583 prefix, key []byte, flags base.SeekGEFlags, 1584 ) (*base.InternalKey, []byte) { 1585 i.err = nil 1586 1587 // Check prefix bloom filter. 
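// The filter block, when present, stores a bloom filter built over key
// prefixes. A negative mayContain answer below lets SeekPrefixGE return nil
// without consulting the index or data blocks, and lastBloomFilterMatched
// gates the TrySeekUsingNext optimization so that it is only honored when the
// iterator's current position was itself established by a positive filter
// match.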
1588 if i.reader.tableFilter != nil && i.useFilter { 1589 if !i.lastBloomFilterMatched { 1590 // Iterator is not positioned based on last seek. 1591 flags = flags.DisableTrySeekUsingNext() 1592 } 1593 i.lastBloomFilterMatched = false 1594 var dataH cache.Handle 1595 dataH, i.err = i.reader.readFilter() 1596 if i.err != nil { 1597 i.data.invalidate() 1598 return nil, nil 1599 } 1600 mayContain := i.reader.tableFilter.mayContain(dataH.Get(), prefix) 1601 dataH.Release() 1602 if !mayContain { 1603 // This invalidation may not be necessary for correctness, and may 1604 // be a place to optimize later by reusing the already loaded 1605 // block. It was necessary in earlier versions of the code since 1606 // the caller was allowed to call Next when SeekPrefixGE returned 1607 // nil. This is no longer allowed. 1608 i.data.invalidate() 1609 return nil, nil 1610 } 1611 i.lastBloomFilterMatched = true 1612 } 1613 1614 // Bloom filter matches. 1615 i.exhaustedBounds = 0 1616 1617 // SeekPrefixGE performs various step-instead-of-seeking optimizations: eg 1618 // enabled by trySeekUsingNext, or by monotonically increasing bounds 1619 // (i.boundsCmp). Care must be taken to ensure that when performing these 1620 // optimizations and the iterator becomes exhausted, 1621 // i.maybeFilteredKeysTwoLevel is set appropriately. Consider a previous 1622 // SeekPrefixGE that filtered keys from k until the current iterator 1623 // position. 1624 // 1625 // If the previous SeekPrefixGE exhausted the iterator while seeking within 1626 // the two-level index, it's possible keys greater than or equal to the 1627 // current search key were filtered through skipped index blocks. We must 1628 // not reuse the position of the two-level index iterator without 1629 // remembering the previous value of maybeFilteredKeysTwoLevel. 1630 1631 var dontSeekWithinSingleLevelIter bool 1632 if i.topLevelIndex.isDataInvalidated() || !i.topLevelIndex.valid() || 1633 i.boundsCmp <= 0 || i.cmp(key, i.topLevelIndex.Key().UserKey) > 0 { 1634 // Slow-path: need to position the topLevelIndex. 1635 // 1636 // TODO(sumeer): improve this slow-path to be able to use Next, when 1637 // flags.TrySeekUsingNext() is true, since the fast path never applies 1638 // for practical uses of SeekPrefixGE in CockroachDB (they never set 1639 // monotonic bounds). To apply it here, we would need to confirm that 1640 // the topLevelIndex can continue using the same second level index 1641 // block, and in that case we don't need to invalidate and reload the 1642 // singleLevelIterator state. 1643 i.maybeFilteredKeysTwoLevel = false 1644 flags = flags.DisableTrySeekUsingNext() 1645 var ikey *InternalKey 1646 if ikey, _ = i.topLevelIndex.SeekGE(key, flags); ikey == nil { 1647 i.data.invalidate() 1648 i.index.invalidate() 1649 return nil, nil 1650 } 1651 1652 result := i.loadIndex(+1) 1653 if result == loadBlockFailed { 1654 return nil, nil 1655 } 1656 if result == loadBlockIrrelevant { 1657 // Enforce the upper bound here since don't want to bother moving 1658 // to the next entry in the top level index if upper bound is 1659 // already exceeded. Note that the next entry starts with keys >= 1660 // ikey.UserKey since even though this is the block separator, the 1661 // same user key can span multiple index blocks. Since upper is 1662 // exclusive we use >= below. 1663 if i.upper != nil && i.cmp(ikey.UserKey, i.upper) >= 0 { 1664 i.exhaustedBounds = +1 1665 } 1666 // Fall through to skipForward. 
1667 dontSeekWithinSingleLevelIter = true 1668 } 1669 } 1670 // Else fast-path: The bounds have moved forward and this SeekGE is 1671 // respecting the lower bound (guaranteed by Iterator). We know that 1672 // the iterator must already be positioned within or just outside the 1673 // previous bounds. Therefore the topLevelIndex iter cannot be 1674 // positioned at an entry ahead of the seek position (though it can be 1675 // positioned behind). The !i.cmp(key, i.topLevelIndex.Key().UserKey) > 0 1676 // confirms that it is not behind. Since it is not ahead and not behind 1677 // it must be at the right position. 1678 1679 if !dontSeekWithinSingleLevelIter { 1680 if ikey, val := i.singleLevelIterator.seekPrefixGE( 1681 prefix, key, flags, false /* checkFilter */); ikey != nil { 1682 return ikey, val 1683 } 1684 } 1685 // NB: skipForward checks whether exhaustedBounds is already +1. 1686 return i.skipForward() 1687 } 1688 1689 // SeekLT implements internalIterator.SeekLT, as documented in the bitalostable 1690 // package. Note that SeekLT only checks the lower bound. It is up to the 1691 // caller to ensure that key is less than the upper bound. 1692 func (i *twoLevelIterator) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, []byte) { 1693 i.exhaustedBounds = 0 1694 i.err = nil 1695 // Seek optimization only applies until iterator is first positioned after SetBounds. 1696 i.boundsCmp = 0 1697 1698 var result loadBlockResult 1699 var ikey *InternalKey 1700 // NB: Unlike SeekGE, we don't have a fast-path here since we don't know 1701 // whether the topLevelIndex is positioned after the position that would 1702 // be returned by doing i.topLevelIndex.SeekGE(). To know this we would 1703 // need to know the index key preceding the current one. 1704 // NB: If a bound-limited block property filter is configured, it's 1705 // externally ensured that the filter is disabled (through returning 1706 // Intersects=false irrespective of the block props provided) during seeks. 1707 i.maybeFilteredKeysTwoLevel = false 1708 if ikey, _ = i.topLevelIndex.SeekGE(key, base.SeekGEFlagsNone); ikey == nil { 1709 if ikey, _ = i.topLevelIndex.Last(); ikey == nil { 1710 i.data.invalidate() 1711 i.index.invalidate() 1712 return nil, nil 1713 } 1714 1715 result = i.loadIndex(-1) 1716 if result == loadBlockFailed { 1717 return nil, nil 1718 } 1719 if result == loadBlockOK { 1720 if ikey, val := i.singleLevelIterator.lastInternal(); ikey != nil { 1721 return ikey, val 1722 } 1723 // Fall through to skipBackward since the singleLevelIterator did 1724 // not have any blocks that satisfy the block interval 1725 // constraints, or the lower bound was reached. 1726 } 1727 // Else loadBlockIrrelevant, so fall through. 1728 } else { 1729 result = i.loadIndex(-1) 1730 if result == loadBlockFailed { 1731 return nil, nil 1732 } 1733 if result == loadBlockOK { 1734 if ikey, val := i.singleLevelIterator.SeekLT(key, flags); ikey != nil { 1735 return ikey, val 1736 } 1737 // Fall through to skipBackward since the singleLevelIterator did 1738 // not have any blocks that satisfy the block interval 1739 // constraint, or the lower bound was reached. 1740 } 1741 // Else loadBlockIrrelevant, so fall through. 1742 } 1743 if result == loadBlockIrrelevant { 1744 // Enforce the lower bound here since don't want to bother moving to 1745 // the previous entry in the top level index if lower bound is already 1746 // exceeded. 
Note that the previous entry starts with keys <= 1747 // ikey.UserKey since even though this is the current block's 1748 // separator, the same user key can span multiple index blocks. 1749 if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 { 1750 i.exhaustedBounds = -1 1751 } 1752 } 1753 // NB: skipBackward checks whether exhaustedBounds is already -1. 1754 return i.skipBackward() 1755 } 1756 1757 // First implements internalIterator.First, as documented in the bitalostable 1758 // package. Note that First only checks the upper bound. It is up to the caller 1759 // to ensure that key is greater than or equal to the lower bound (e.g. via a 1760 // call to SeekGE(lower)). 1761 func (i *twoLevelIterator) First() (*InternalKey, []byte) { 1762 if i.lower != nil { 1763 panic("twoLevelIterator.First() used despite lower bound") 1764 } 1765 i.exhaustedBounds = 0 1766 i.maybeFilteredKeysTwoLevel = false 1767 i.err = nil 1768 // Seek optimization only applies until iterator is first positioned after SetBounds. 1769 i.boundsCmp = 0 1770 1771 var ikey *InternalKey 1772 if ikey, _ = i.topLevelIndex.First(); ikey == nil { 1773 return nil, nil 1774 } 1775 1776 result := i.loadIndex(+1) 1777 if result == loadBlockFailed { 1778 return nil, nil 1779 } 1780 if result == loadBlockOK { 1781 if ikey, val := i.singleLevelIterator.First(); ikey != nil { 1782 return ikey, val 1783 } 1784 // Else fall through to skipForward. 1785 } else { 1786 // result == loadBlockIrrelevant. Enforce the upper bound here since 1787 // don't want to bother moving to the next entry in the top level 1788 // index if upper bound is already exceeded. Note that the next entry 1789 // starts with keys >= ikey.UserKey since even though this is the 1790 // block separator, the same user key can span multiple index blocks. 1791 // Since upper is exclusive we use >= below. 1792 if i.upper != nil && i.cmp(ikey.UserKey, i.upper) >= 0 { 1793 i.exhaustedBounds = +1 1794 } 1795 } 1796 // NB: skipForward checks whether exhaustedBounds is already +1. 1797 return i.skipForward() 1798 } 1799 1800 // Last implements internalIterator.Last, as documented in the bitalostable 1801 // package. Note that Last only checks the lower bound. It is up to the caller 1802 // to ensure that key is less than the upper bound (e.g. via a call to 1803 // SeekLT(upper)) 1804 func (i *twoLevelIterator) Last() (*InternalKey, []byte) { 1805 if i.upper != nil { 1806 panic("twoLevelIterator.Last() used despite upper bound") 1807 } 1808 i.exhaustedBounds = 0 1809 i.maybeFilteredKeysTwoLevel = false 1810 i.err = nil 1811 // Seek optimization only applies until iterator is first positioned after SetBounds. 1812 i.boundsCmp = 0 1813 1814 var ikey *InternalKey 1815 if ikey, _ = i.topLevelIndex.Last(); ikey == nil { 1816 return nil, nil 1817 } 1818 1819 result := i.loadIndex(-1) 1820 if result == loadBlockFailed { 1821 return nil, nil 1822 } 1823 if result == loadBlockOK { 1824 if ikey, val := i.singleLevelIterator.Last(); ikey != nil { 1825 return ikey, val 1826 } 1827 // Else fall through to skipBackward. 1828 } else { 1829 // result == loadBlockIrrelevant. Enforce the lower bound here 1830 // since don't want to bother moving to the previous entry in the 1831 // top level index if lower bound is already exceeded. Note that 1832 // the previous entry starts with keys <= ikey.UserKey since even 1833 // though this is the current block's separator, the same user key 1834 // can span multiple index blocks. 
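// Since lower is inclusive we use < below.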
1835 if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 { 1836 i.exhaustedBounds = -1 1837 } 1838 } 1839 // NB: skipBackward checks whether exhaustedBounds is already -1. 1840 return i.skipBackward() 1841 } 1842 1843 // Next implements internalIterator.Next, as documented in the bitalostable 1844 // package. 1845 // Note: twoLevelCompactionIterator.Next mirrors the implementation of 1846 // twoLevelIterator.Next due to performance. Keep the two in sync. 1847 func (i *twoLevelIterator) Next() (*InternalKey, []byte) { 1848 // Seek optimization only applies until iterator is first positioned after SetBounds. 1849 i.boundsCmp = 0 1850 i.maybeFilteredKeysTwoLevel = false 1851 if i.err != nil { 1852 return nil, nil 1853 } 1854 if key, val := i.singleLevelIterator.Next(); key != nil { 1855 return key, val 1856 } 1857 return i.skipForward() 1858 } 1859 1860 // Prev implements internalIterator.Prev, as documented in the bitalostable 1861 // package. 1862 func (i *twoLevelIterator) Prev() (*InternalKey, []byte) { 1863 // Seek optimization only applies until iterator is first positioned after SetBounds. 1864 i.boundsCmp = 0 1865 i.maybeFilteredKeysTwoLevel = false 1866 if i.err != nil { 1867 return nil, nil 1868 } 1869 if key, val := i.singleLevelIterator.Prev(); key != nil { 1870 return key, val 1871 } 1872 return i.skipBackward() 1873 } 1874 1875 func (i *twoLevelIterator) skipForward() (*InternalKey, []byte) { 1876 for { 1877 if i.err != nil || i.exhaustedBounds > 0 { 1878 return nil, nil 1879 } 1880 i.exhaustedBounds = 0 1881 var ikey *InternalKey 1882 if ikey, _ = i.topLevelIndex.Next(); ikey == nil { 1883 i.data.invalidate() 1884 i.index.invalidate() 1885 return nil, nil 1886 } 1887 result := i.loadIndex(+1) 1888 if result == loadBlockFailed { 1889 return nil, nil 1890 } 1891 if result == loadBlockOK { 1892 if ikey, val := i.singleLevelIterator.firstInternal(); ikey != nil { 1893 return ikey, val 1894 } 1895 // Next iteration will return if singleLevelIterator set 1896 // exhaustedBounds = +1. 1897 } else { 1898 // result == loadBlockIrrelevant. Enforce the upper bound here 1899 // since don't want to bother moving to the next entry in the top 1900 // level index if upper bound is already exceeded. Note that the 1901 // next entry starts with keys >= ikey.UserKey since even though 1902 // this is the block separator, the same user key can span 1903 // multiple index blocks. Since upper is exclusive we use >= 1904 // below. 1905 if i.upper != nil && i.cmp(ikey.UserKey, i.upper) >= 0 { 1906 i.exhaustedBounds = +1 1907 // Next iteration will return. 1908 } 1909 } 1910 } 1911 } 1912 1913 func (i *twoLevelIterator) skipBackward() (*InternalKey, []byte) { 1914 for { 1915 if i.err != nil || i.exhaustedBounds < 0 { 1916 return nil, nil 1917 } 1918 i.exhaustedBounds = 0 1919 var ikey *InternalKey 1920 if ikey, _ = i.topLevelIndex.Prev(); ikey == nil { 1921 i.data.invalidate() 1922 i.index.invalidate() 1923 return nil, nil 1924 } 1925 result := i.loadIndex(-1) 1926 if result == loadBlockFailed { 1927 return nil, nil 1928 } 1929 if result == loadBlockOK { 1930 if ikey, val := i.singleLevelIterator.lastInternal(); ikey != nil { 1931 return ikey, val 1932 } 1933 // Next iteration will return if singleLevelIterator set 1934 // exhaustedBounds = -1. 1935 } else { 1936 // result == loadBlockIrrelevant. Enforce the lower bound here 1937 // since don't want to bother moving to the previous entry in the 1938 // top level index if lower bound is already exceeded. 
Note that 1939 // the previous entry starts with keys <= ikey.UserKey since even 1940 // though this is the current block's separator, the same user key 1941 // can span multiple index blocks. 1942 if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 { 1943 i.exhaustedBounds = -1 1944 // Next iteration will return. 1945 } 1946 } 1947 } 1948 } 1949 1950 // Close implements internalIterator.Close, as documented in the bitalostable 1951 // package. 1952 func (i *twoLevelIterator) Close() error { 1953 var err error 1954 if i.closeHook != nil { 1955 err = firstError(err, i.closeHook(i)) 1956 } 1957 err = firstError(err, i.data.Close()) 1958 err = firstError(err, i.index.Close()) 1959 err = firstError(err, i.topLevelIndex.Close()) 1960 if i.dataRS.sequentialFile != nil { 1961 err = firstError(err, i.dataRS.sequentialFile.Close()) 1962 i.dataRS.sequentialFile = nil 1963 } 1964 err = firstError(err, i.err) 1965 if i.bpfs != nil { 1966 releaseBlockPropertiesFilterer(i.bpfs) 1967 } 1968 *i = twoLevelIterator{ 1969 singleLevelIterator: i.singleLevelIterator.resetForReuse(), 1970 topLevelIndex: i.topLevelIndex.resetForReuse(), 1971 } 1972 twoLevelIterPool.Put(i) 1973 return err 1974 } 1975 1976 // Note: twoLevelCompactionIterator and compactionIterator are very similar but 1977 // were separated due to performance. 1978 type twoLevelCompactionIterator struct { 1979 *twoLevelIterator 1980 bytesIterated *uint64 1981 prevOffset uint64 1982 } 1983 1984 // twoLevelCompactionIterator implements the base.InternalIterator interface. 1985 var _ base.InternalIterator = (*twoLevelCompactionIterator)(nil) 1986 1987 func (i *twoLevelCompactionIterator) Close() error { 1988 return i.twoLevelIterator.Close() 1989 } 1990 1991 func (i *twoLevelCompactionIterator) SeekGE( 1992 key []byte, flags base.SeekGEFlags, 1993 ) (*InternalKey, []byte) { 1994 panic("bitalostable: SeekGE unimplemented") 1995 } 1996 1997 func (i *twoLevelCompactionIterator) SeekPrefixGE( 1998 prefix, key []byte, flags base.SeekGEFlags, 1999 ) (*base.InternalKey, []byte) { 2000 panic("bitalostable: SeekPrefixGE unimplemented") 2001 } 2002 2003 func (i *twoLevelCompactionIterator) SeekLT( 2004 key []byte, flags base.SeekLTFlags, 2005 ) (*InternalKey, []byte) { 2006 panic("bitalostable: SeekLT unimplemented") 2007 } 2008 2009 func (i *twoLevelCompactionIterator) First() (*InternalKey, []byte) { 2010 i.err = nil 2011 return i.skipForward(i.twoLevelIterator.First()) 2012 } 2013 2014 func (i *twoLevelCompactionIterator) Last() (*InternalKey, []byte) { 2015 panic("bitalostable: Last unimplemented") 2016 } 2017 2018 // Note: twoLevelCompactionIterator.Next mirrors the implementation of 2019 // twoLevelIterator.Next due to performance. Keep the two in sync. 
2020 func (i *twoLevelCompactionIterator) Next() (*InternalKey, []byte) { 2021 if i.err != nil { 2022 return nil, nil 2023 } 2024 return i.skipForward(i.singleLevelIterator.Next()) 2025 } 2026 2027 func (i *twoLevelCompactionIterator) Prev() (*InternalKey, []byte) { 2028 panic("bitalostable: Prev unimplemented") 2029 } 2030 2031 func (i *twoLevelCompactionIterator) String() string { 2032 return i.reader.fileNum.String() 2033 } 2034 2035 func (i *twoLevelCompactionIterator) skipForward( 2036 key *InternalKey, val []byte, 2037 ) (*InternalKey, []byte) { 2038 if key == nil { 2039 for { 2040 if key, _ := i.topLevelIndex.Next(); key == nil { 2041 break 2042 } 2043 result := i.loadIndex(+1) 2044 if result != loadBlockOK { 2045 if i.err != nil { 2046 break 2047 } 2048 switch result { 2049 case loadBlockFailed: 2050 // We checked that i.index was at a valid entry, so 2051 // loadBlockFailed could not have happened due to i.index 2052 // being exhausted, and must be due to an error. 2053 panic("loadBlock should not have failed with no error") 2054 case loadBlockIrrelevant: 2055 panic("compactionIter should not be using block intervals for skipping") 2056 default: 2057 panic(fmt.Sprintf("unexpected case %d", result)) 2058 } 2059 } 2060 // result == loadBlockOK 2061 if key, val = i.singleLevelIterator.First(); key != nil { 2062 break 2063 } 2064 } 2065 } 2066 2067 curOffset := i.recordOffset() 2068 *i.bytesIterated += uint64(curOffset - i.prevOffset) 2069 i.prevOffset = curOffset 2070 return key, val 2071 } 2072 2073 type blockTransform func([]byte) ([]byte, error) 2074 2075 // readaheadState contains state variables related to readahead. Updated on 2076 // file reads. 2077 type readaheadState struct { 2078 // Number of sequential reads. 2079 numReads int64 2080 // Size issued to the next call to Prefetch. Starts at or above 2081 // initialReadaheadSize and grows exponentially until maxReadaheadSize. 2082 size int64 2083 // prevSize is the size used in the last Prefetch call. 2084 prevSize int64 2085 // The byte offset up to which the OS has been asked to read ahead / cached. 2086 // When reading ahead, reads up to this limit should not incur an IO 2087 // operation. Reads after this limit can benefit from a new call to 2088 // Prefetch. 2089 limit int64 2090 // sequentialFile holds a file descriptor to the same underlying File, 2091 // except with fadvise(FADV_SEQUENTIAL) called on it to take advantage of 2092 // OS-level readahead. Initialized when the iterator has been consistently 2093 // reading blocks in a sequential access pattern. Once this is non-nil, 2094 // the other variables in readaheadState don't matter much as we defer 2095 // to OS-level readahead. 2096 sequentialFile vfs.File 2097 } 2098 2099 func (rs *readaheadState) recordCacheHit(offset, blockLength int64) { 2100 currentReadEnd := offset + blockLength 2101 if rs.sequentialFile != nil { 2102 // Using OS-level readahead instead, so do nothing. 2103 return 2104 } 2105 if rs.numReads >= minFileReadsForReadahead { 2106 if currentReadEnd >= rs.limit && offset <= rs.limit+maxReadaheadSize { 2107 // This is a read that would have resulted in a readahead, had it 2108 // not been a cache hit. 2109 rs.limit = currentReadEnd 2110 return 2111 } 2112 if currentReadEnd < rs.limit-rs.prevSize || offset > rs.limit+maxReadaheadSize { 2113 // We read too far away from rs.limit to benefit from readahead in 2114 // any scenario. Reset all variables.
2115 rs.numReads = 1 2116 rs.limit = currentReadEnd 2117 rs.size = initialReadaheadSize 2118 rs.prevSize = 0 2119 return 2120 } 2121 // Reads in the range [rs.limit - rs.prevSize, rs.limit] end up 2122 // here. This is a read that is potentially benefitting from a past 2123 // readahead. 2124 return 2125 } 2126 if currentReadEnd >= rs.limit && offset <= rs.limit+maxReadaheadSize { 2127 // Blocks are being read sequentially and would benefit from readahead 2128 // down the line. 2129 rs.numReads++ 2130 return 2131 } 2132 // We read too far ahead of the last read, or before it. This indicates 2133 // a random read, where readahead is not desirable. Reset all variables. 2134 rs.numReads = 1 2135 rs.limit = currentReadEnd 2136 rs.size = initialReadaheadSize 2137 rs.prevSize = 0 2138 } 2139 2140 // maybeReadahead updates state and determines whether to issue a readahead / 2141 // prefetch call for a block read at offset for blockLength bytes. 2142 // Returns a size value (greater than 0) that should be prefetched if readahead 2143 // would be beneficial. 2144 func (rs *readaheadState) maybeReadahead(offset, blockLength int64) int64 { 2145 currentReadEnd := offset + blockLength 2146 if rs.sequentialFile != nil { 2147 // Using OS-level readahead instead, so do nothing. 2148 return 0 2149 } 2150 if rs.numReads >= minFileReadsForReadahead { 2151 // The minimum threshold of sequential reads to justify reading ahead 2152 // has been reached. 2153 // There are two intervals: the interval being read: 2154 // [offset, currentReadEnd] 2155 // as well as the interval where a read would benefit from read ahead: 2156 // [rs.limit, rs.limit + rs.size] 2157 // We increase the latter interval to 2158 // [rs.limit, rs.limit + maxReadaheadSize] to account for cases where 2159 // readahead may not be beneficial with a small readahead size, but over 2160 // time the readahead size would increase exponentially to make it 2161 // beneficial. 2162 if currentReadEnd >= rs.limit && offset <= rs.limit+maxReadaheadSize { 2163 // We are doing a read in the interval ahead of 2164 // the last readahead range. In the diagrams below, ++++ is the last 2165 // readahead range, ==== is the range represented by 2166 // [rs.limit, rs.limit + maxReadaheadSize], and ---- is the range 2167 // being read. 2168 // 2169 // rs.limit rs.limit + maxReadaheadSize 2170 // ++++++++++|===========================| 2171 // 2172 // |-------------| 2173 // offset currentReadEnd 2174 // 2175 // This case is also possible, as are all cases with an overlap 2176 // between [rs.limit, rs.limit + maxReadaheadSize] and [offset, 2177 // currentReadEnd]: 2178 // 2179 // rs.limit rs.limit + maxReadaheadSize 2180 // ++++++++++|===========================| 2181 // 2182 // |-------------| 2183 // offset currentReadEnd 2184 // 2185 // 2186 rs.numReads++ 2187 rs.limit = offset + rs.size 2188 rs.prevSize = rs.size 2189 // Increase rs.size for the next read. 2190 rs.size *= 2 2191 if rs.size > maxReadaheadSize { 2192 rs.size = maxReadaheadSize 2193 } 2194 return rs.prevSize 2195 } 2196 if currentReadEnd < rs.limit-rs.prevSize || offset > rs.limit+maxReadaheadSize { 2197 // The above conditional has rs.limit > rs.prevSize to confirm that 2198 // rs.limit - rs.prevSize would not underflow. 2199 // We read too far away from rs.limit to benefit from readahead in 2200 // any scenario. Reset all variables. 
2201 // The case where we read too far ahead: 2202 // 2203 // (rs.limit - rs.prevSize) (rs.limit) (rs.limit + maxReadaheadSize) 2204 // |+++++++++++++|=============| 2205 // 2206 // |-------------| 2207 // offset currentReadEnd 2208 // 2209 // Or too far behind: 2210 // 2211 // (rs.limit - rs.prevSize) (rs.limit) (rs.limit + maxReadaheadSize) 2212 // |+++++++++++++|=============| 2213 // 2214 // |-------------| 2215 // offset currentReadEnd 2216 // 2217 rs.numReads = 1 2218 rs.limit = currentReadEnd 2219 rs.size = initialReadaheadSize 2220 rs.prevSize = 0 2221 return 0 2222 } 2223 // Reads in the range [rs.limit - rs.prevSize, rs.limit] end up 2224 // here. This is a read that is potentially benefitting from a past 2225 // readahead, but there's no reason to issue a readahead call at the 2226 // moment. 2227 // 2228 // (rs.limit - rs.prevSize) (rs.limit + maxReadaheadSize) 2229 // |+++++++++++++|===============| 2230 // (rs.limit) 2231 // 2232 // |-------| 2233 // offset currentReadEnd 2234 // 2235 rs.numReads++ 2236 return 0 2237 } 2238 if currentReadEnd >= rs.limit && offset <= rs.limit+maxReadaheadSize { 2239 // Blocks are being read sequentially and would benefit from readahead 2240 // down the line. 2241 // 2242 // (rs.limit) (rs.limit + maxReadaheadSize) 2243 // |=============| 2244 // 2245 // |-------| 2246 // offset currentReadEnd 2247 // 2248 rs.numReads++ 2249 return 0 2250 } 2251 // We read too far ahead of the last read, or before it. This indicates 2252 // a random read, where readahead is not desirable. Reset all variables. 2253 // 2254 // (rs.limit - maxReadaheadSize) (rs.limit) (rs.limit + maxReadaheadSize) 2255 // |+++++++++++++|=============| 2256 // 2257 // |-------| 2258 // offset currentReadEnd 2259 // 2260 rs.numReads = 1 2261 rs.limit = currentReadEnd 2262 rs.size = initialReadaheadSize 2263 rs.prevSize = 0 2264 return 0 2265 } 2266 2267 // ReaderOption provides an interface to do work on Reader while it is being 2268 // opened. 2269 type ReaderOption interface { 2270 // readerApply is called on the reader during opening in order to set internal 2271 // parameters. 2272 readerApply(*Reader) 2273 } 2274 2275 // Comparers is a map from comparer name to comparer. It is used for debugging 2276 // tools which may be used on multiple databases configured with different 2277 // comparers. Comparers implements the ReaderOption interface and can be passed 2278 // as a parameter to NewReader. 2279 type Comparers map[string]*Comparer 2280 2281 func (c Comparers) readerApply(r *Reader) { 2282 if r.Compare != nil || r.Properties.ComparerName == "" { 2283 return 2284 } 2285 if comparer, ok := c[r.Properties.ComparerName]; ok { 2286 r.Compare = comparer.Compare 2287 r.FormatKey = comparer.FormatKey 2288 r.Split = comparer.Split 2289 } 2290 } 2291 2292 // Mergers is a map from merger name to merger. It is used for debugging tools 2293 // which may be used on multiple databases configured with different 2294 // mergers. Mergers implements the ReaderOption interface and can be passed as 2295 // a parameter to NewReader. 2296 type Mergers map[string]*Merger 2297 2298 func (m Mergers) readerApply(r *Reader) { 2299 if r.mergerOK || r.Properties.MergerName == "" { 2300 return 2301 } 2302 _, r.mergerOK = m[r.Properties.MergerName] 2303 } 2304 2305 // cacheOpts is a Reader open option for specifying the cache ID and sstable file 2306 // number. If not specified, a unique cache ID will be used.
2307 type cacheOpts struct { 2308 cacheID uint64 2309 fileNum base.FileNum 2310 } 2311 2312 // Marker function to indicate the option should be applied before reading the 2313 // sstable properties and, in the write path, before writing the default 2314 // sstable properties. 2315 func (c *cacheOpts) preApply() {} 2316 2317 func (c *cacheOpts) readerApply(r *Reader) { 2318 if r.cacheID == 0 { 2319 r.cacheID = c.cacheID 2320 } 2321 if r.fileNum == 0 { 2322 r.fileNum = c.fileNum 2323 } 2324 } 2325 2326 func (c *cacheOpts) writerApply(w *Writer) { 2327 if w.cacheID == 0 { 2328 w.cacheID = c.cacheID 2329 } 2330 if w.fileNum == 0 { 2331 w.fileNum = c.fileNum 2332 } 2333 } 2334 2335 // FileReopenOpt is specified if this reader is allowed to reopen additional 2336 // file descriptors for this file. Used to take advantage of OS-level readahead. 2337 type FileReopenOpt struct { 2338 FS vfs.FS 2339 Filename string 2340 } 2341 2342 func (f FileReopenOpt) readerApply(r *Reader) { 2343 if r.fs == nil { 2344 r.fs = f.FS 2345 r.filename = f.Filename 2346 } 2347 } 2348 2349 // rawTombstonesOpt is a Reader open option for specifying that range 2350 // tombstones returned by Reader.NewRangeDelIter() should not be 2351 // fragmented. Used by debug tools to get a raw view of the tombstones 2352 // contained in an sstable. 2353 type rawTombstonesOpt struct{} 2354 2355 func (rawTombstonesOpt) preApply() {} 2356 2357 func (rawTombstonesOpt) readerApply(r *Reader) { 2358 r.rawTombstones = true 2359 } 2360 2361 func init() { 2362 private.SSTableCacheOpts = func(cacheID uint64, fileNum base.FileNum) interface{} { 2363 return &cacheOpts{cacheID, fileNum} 2364 } 2365 private.SSTableRawTombstonesOpt = rawTombstonesOpt{} 2366 } 2367 2368 // Reader is a table reader. 2369 type Reader struct { 2370 file ReadableFile 2371 fs vfs.FS 2372 filename string 2373 cacheID uint64 2374 fileNum base.FileNum 2375 rawTombstones bool 2376 err error 2377 indexBH BlockHandle 2378 filterBH BlockHandle 2379 rangeDelBH BlockHandle 2380 rangeKeyBH BlockHandle 2381 rangeDelTransform blockTransform 2382 propertiesBH BlockHandle 2383 metaIndexBH BlockHandle 2384 footerBH BlockHandle 2385 opts ReaderOptions 2386 Compare Compare 2387 FormatKey base.FormatKey 2388 Split Split 2389 mergerOK bool 2390 checksumType ChecksumType 2391 tableFilter *tableFilterReader 2392 tableFormat TableFormat 2393 Properties Properties 2394 } 2395 2396 // Close implements DB.Close, as documented in the bitalostable package. 2397 func (r *Reader) Close() error { 2398 r.opts.Cache.Unref() 2399 2400 if r.err != nil { 2401 if r.file != nil { 2402 r.file.Close() 2403 r.file = nil 2404 } 2405 return r.err 2406 } 2407 if r.file != nil { 2408 r.err = r.file.Close() 2409 r.file = nil 2410 if r.err != nil { 2411 return r.err 2412 } 2413 } 2414 // Make any future calls to Get, NewIter or Close return an error. 2415 r.err = errReaderClosed 2416 return nil 2417 } 2418 2419 // NewIterWithBlockPropertyFilters returns an iterator for the contents of the 2420 // table. If an error occurs, NewIterWithBlockPropertyFilters cleans up after 2421 // itself and returns a nil iterator. 
2422 func (r *Reader) NewIterWithBlockPropertyFilters( 2423 lower, upper []byte, 2424 filterer *BlockPropertiesFilterer, 2425 useFilterBlock bool, 2426 stats *base.InternalIteratorStats, 2427 ) (Iterator, error) { 2428 // NB: bitalostable.tableCache wraps the returned iterator with one which performs 2429 // reference counting on the Reader, preventing the Reader from being closed 2430 // until the final iterator closes. 2431 if r.Properties.IndexType == twoLevelIndex { 2432 i := twoLevelIterPool.Get().(*twoLevelIterator) 2433 err := i.init(r, lower, upper, filterer, useFilterBlock, stats) 2434 if err != nil { 2435 return nil, err 2436 } 2437 return i, nil 2438 } 2439 2440 i := singleLevelIterPool.Get().(*singleLevelIterator) 2441 err := i.init(r, lower, upper, filterer, useFilterBlock, stats) 2442 if err != nil { 2443 return nil, err 2444 } 2445 return i, nil 2446 } 2447 2448 // NewIter returns an iterator for the contents of the table. If an error 2449 // occurs, NewIter cleans up after itself and returns a nil iterator. 2450 func (r *Reader) NewIter(lower, upper []byte) (Iterator, error) { 2451 return r.NewIterWithBlockPropertyFilters(lower, upper, nil, true /* useFilterBlock */, nil /* stats */) 2452 } 2453 2454 // NewCompactionIter returns an iterator similar to NewIter but it also increments 2455 // the number of bytes iterated. If an error occurs, NewCompactionIter cleans up 2456 // after itself and returns a nil iterator. 2457 func (r *Reader) NewCompactionIter(bytesIterated *uint64) (Iterator, error) { 2458 if r.Properties.IndexType == twoLevelIndex { 2459 i := twoLevelIterPool.Get().(*twoLevelIterator) 2460 err := i.init(r, nil /* lower */, nil /* upper */, nil, false /* useFilter */, nil /* stats */) 2461 if err != nil { 2462 return nil, err 2463 } 2464 i.setupForCompaction() 2465 return &twoLevelCompactionIterator{ 2466 twoLevelIterator: i, 2467 bytesIterated: bytesIterated, 2468 }, nil 2469 } 2470 i := singleLevelIterPool.Get().(*singleLevelIterator) 2471 err := i.init(r, nil /* lower */, nil /* upper */, nil, false /* useFilter */, nil /* stats */) 2472 if err != nil { 2473 return nil, err 2474 } 2475 i.setupForCompaction() 2476 return &compactionIterator{ 2477 singleLevelIterator: i, 2478 bytesIterated: bytesIterated, 2479 }, nil 2480 } 2481 2482 // NewRawRangeDelIter returns an internal iterator for the contents of the 2483 // range-del block for the table. Returns nil if the table does not contain 2484 // any range deletions. 2485 func (r *Reader) NewRawRangeDelIter() (keyspan.FragmentIterator, error) { 2486 if r.rangeDelBH.Length == 0 { 2487 return nil, nil 2488 } 2489 h, err := r.readRangeDel() 2490 if err != nil { 2491 return nil, err 2492 } 2493 i := &fragmentBlockIter{} 2494 if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum); err != nil { 2495 return nil, err 2496 } 2497 return i, nil 2498 } 2499 2500 // NewRawRangeKeyIter returns an internal iterator for the contents of the 2501 // range-key block for the table. Returns nil if the table does not contain any 2502 // range keys. 
2503 func (r *Reader) NewRawRangeKeyIter() (keyspan.FragmentIterator, error) { 2504 if r.rangeKeyBH.Length == 0 { 2505 return nil, nil 2506 } 2507 h, err := r.readRangeKey() 2508 if err != nil { 2509 return nil, err 2510 } 2511 i := rangeKeyFragmentBlockIterPool.Get().(*rangeKeyFragmentBlockIter) 2512 if err := i.blockIter.initHandle(r.Compare, h, r.Properties.GlobalSeqNum); err != nil { 2513 return nil, err 2514 } 2515 return i, nil 2516 } 2517 2518 type rangeKeyFragmentBlockIter struct { 2519 fragmentBlockIter 2520 } 2521 2522 func (i *rangeKeyFragmentBlockIter) Close() error { 2523 err := i.fragmentBlockIter.Close() 2524 i.fragmentBlockIter = i.fragmentBlockIter.resetForReuse() 2525 rangeKeyFragmentBlockIterPool.Put(i) 2526 return err 2527 } 2528 2529 func (r *Reader) readIndex() (cache.Handle, error) { 2530 h, _, err := 2531 r.readBlock(r.indexBH, nil /* transform */, nil /* readaheadState */) 2532 return h, err 2533 } 2534 2535 func (r *Reader) readFilter() (cache.Handle, error) { 2536 h, _, err := 2537 r.readBlock(r.filterBH, nil /* transform */, nil /* readaheadState */) 2538 return h, err 2539 } 2540 2541 func (r *Reader) readRangeDel() (cache.Handle, error) { 2542 h, _, err := 2543 r.readBlock(r.rangeDelBH, r.rangeDelTransform, nil /* readaheadState */) 2544 return h, err 2545 } 2546 2547 func (r *Reader) readRangeKey() (cache.Handle, error) { 2548 h, _, err := 2549 r.readBlock(r.rangeKeyBH, nil /* transform */, nil /* readaheadState */) 2550 return h, err 2551 } 2552 2553 func checkChecksum( 2554 checksumType ChecksumType, b []byte, bh BlockHandle, fileNum base.FileNum, 2555 ) error { 2556 expectedChecksum := binary.LittleEndian.Uint32(b[bh.Length+1:]) 2557 var computedChecksum uint32 2558 switch checksumType { 2559 case ChecksumTypeCRC32c: 2560 computedChecksum = crc.New(b[:bh.Length+1]).Value() 2561 case ChecksumTypeXXHash64: 2562 computedChecksum = uint32(xxhash.Sum64(b[:bh.Length+1])) 2563 default: 2564 return errors.Errorf("unsupported checksum type: %d", checksumType) 2565 } 2566 2567 if expectedChecksum != computedChecksum { 2568 return base.CorruptionErrorf( 2569 "bitalostable/table: invalid table %s (checksum mismatch at %d/%d)", 2570 errors.Safe(fileNum), errors.Safe(bh.Offset), errors.Safe(bh.Length)) 2571 } 2572 return nil 2573 } 2574 2575 // readBlock reads and decompresses a block from disk into memory. 2576 func (r *Reader) readBlock( 2577 bh BlockHandle, transform blockTransform, raState *readaheadState, 2578 ) (_ cache.Handle, cacheHit bool, _ error) { 2579 if h := r.opts.Cache.Get(r.cacheID, r.fileNum, bh.Offset); h.Get() != nil { 2580 if raState != nil { 2581 raState.recordCacheHit(int64(bh.Offset), int64(bh.Length+blockTrailerLen)) 2582 } 2583 return h, true, nil 2584 } 2585 file := r.file 2586 2587 if raState != nil { 2588 if raState.sequentialFile != nil { 2589 file = raState.sequentialFile 2590 } else if readaheadSize := raState.maybeReadahead(int64(bh.Offset), int64(bh.Length+blockTrailerLen)); readaheadSize > 0 { 2591 if readaheadSize >= maxReadaheadSize { 2592 // We've reached the maximum readahead size. Beyond this 2593 // point, rely on OS-level readahead. Note that we can only 2594 // reopen a new file handle with this optimization if 2595 // r.fs != nil. This reader must have been created with the 2596 // FileReopenOpt for this field to be set. 2597 if r.fs != nil { 2598 f, err := r.fs.Open(r.filename, vfs.SequentialReadsOption) 2599 if err == nil { 2600 // Use this new file handle for all sequential reads by 2601 // this iterator going forward. 
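// Once sequentialFile is non-nil, recordCacheHit and maybeReadahead become
// no-ops and all further readahead is delegated to the OS.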
2602 raState.sequentialFile = f 2603 file = f 2604 } 2605 2606 // If we tried to load a table that doesn't exist, panic 2607 // immediately. Something is seriously wrong if a table 2608 // doesn't exist. 2609 // See cockroachdb/cockroach#56490. 2610 base.MustExist(r.fs, r.filename, panicFataler{}, err) 2611 } 2612 } 2613 if raState.sequentialFile == nil { 2614 type fd interface { 2615 Fd() uintptr 2616 } 2617 if f, ok := r.file.(fd); ok { 2618 _ = vfs.Prefetch(f.Fd(), bh.Offset, uint64(readaheadSize)) 2619 } 2620 } 2621 } 2622 } 2623 2624 v := r.opts.Cache.Alloc(int(bh.Length + blockTrailerLen)) 2625 b := v.Buf() 2626 if _, err := file.ReadAt(b, int64(bh.Offset)); err != nil { 2627 r.opts.Cache.Free(v) 2628 return cache.Handle{}, false, err 2629 } 2630 2631 if err := checkChecksum(r.checksumType, b, bh, r.fileNum); err != nil { 2632 r.opts.Cache.Free(v) 2633 return cache.Handle{}, false, err 2634 } 2635 2636 typ := blockType(b[bh.Length]) 2637 b = b[:bh.Length] 2638 v.Truncate(len(b)) 2639 2640 decoded, err := decompressBlock(r.opts.Cache, typ, b) 2641 if decoded != nil { 2642 r.opts.Cache.Free(v) 2643 v = decoded 2644 b = v.Buf() 2645 } else if err != nil { 2646 r.opts.Cache.Free(v) 2647 return cache.Handle{}, false, err 2648 } 2649 2650 if transform != nil { 2651 // Transforming blocks is rare, so the extra copy of the transformed data 2652 // is not problematic. 2653 var err error 2654 b, err = transform(b) 2655 if err != nil { 2656 r.opts.Cache.Free(v) 2657 return cache.Handle{}, false, err 2658 } 2659 newV := r.opts.Cache.Alloc(len(b)) 2660 copy(newV.Buf(), b) 2661 r.opts.Cache.Free(v) 2662 v = newV 2663 } 2664 2665 h := r.opts.Cache.Set(r.cacheID, r.fileNum, bh.Offset, v) 2666 return h, false, nil 2667 } 2668 2669 func (r *Reader) transformRangeDelV1(b []byte) ([]byte, error) { 2670 // Convert v1 (RocksDB format) range-del blocks to v2 blocks on the fly. The 2671 // v1 format range-del blocks have unfragmented and unsorted range 2672 // tombstones. We need properly fragmented and sorted range tombstones in 2673 // order to serve from them directly. 2674 iter := &blockIter{} 2675 if err := iter.init(r.Compare, b, r.Properties.GlobalSeqNum); err != nil { 2676 return nil, err 2677 } 2678 var tombstones []keyspan.Span 2679 for key, value := iter.First(); key != nil; key, value = iter.Next() { 2680 t := keyspan.Span{ 2681 Start: key.UserKey, 2682 End: value, 2683 Keys: []keyspan.Key{{Trailer: key.Trailer}}, 2684 } 2685 tombstones = append(tombstones, t) 2686 } 2687 keyspan.Sort(r.Compare, tombstones) 2688 2689 // Fragment the tombstones, outputting them directly to a block writer. 2690 rangeDelBlock := blockWriter{ 2691 restartInterval: 1, 2692 } 2693 frag := keyspan.Fragmenter{ 2694 Cmp: r.Compare, 2695 Format: r.FormatKey, 2696 Emit: func(s keyspan.Span) { 2697 for _, k := range s.Keys { 2698 startIK := InternalKey{UserKey: s.Start, Trailer: k.Trailer} 2699 rangeDelBlock.add(startIK, s.End) 2700 } 2701 }, 2702 } 2703 for i := range tombstones { 2704 frag.Add(tombstones[i]) 2705 } 2706 frag.Finish() 2707 2708 // Return the contents of the constructed v2 format range-del block. 
2709 return rangeDelBlock.finish(), nil 2710 } 2711 2712 func (r *Reader) readMetaindex(metaindexBH BlockHandle) error { 2713 b, _, err := r.readBlock(metaindexBH, nil /* transform */, nil /* readaheadState */) 2714 if err != nil { 2715 return err 2716 } 2717 data := b.Get() 2718 defer b.Release() 2719 2720 if uint64(len(data)) != metaindexBH.Length { 2721 return base.CorruptionErrorf("bitalostable/table: unexpected metaindex block size: %d vs %d", 2722 errors.Safe(len(data)), errors.Safe(metaindexBH.Length)) 2723 } 2724 2725 i, err := newRawBlockIter(bytes.Compare, data) 2726 if err != nil { 2727 return err 2728 } 2729 2730 meta := map[string]BlockHandle{} 2731 for valid := i.First(); valid; valid = i.Next() { 2732 bh, n := decodeBlockHandle(i.Value()) 2733 if n == 0 { 2734 return base.CorruptionErrorf("bitalostable/table: invalid table (bad filter block handle)") 2735 } 2736 meta[string(i.Key().UserKey)] = bh 2737 } 2738 if err := i.Close(); err != nil { 2739 return err 2740 } 2741 2742 if bh, ok := meta[metaPropertiesName]; ok { 2743 b, _, err = r.readBlock(bh, nil /* transform */, nil /* readaheadState */) 2744 if err != nil { 2745 return err 2746 } 2747 r.propertiesBH = bh 2748 err := r.Properties.load(b.Get(), bh.Offset) 2749 b.Release() 2750 if err != nil { 2751 return err 2752 } 2753 } 2754 2755 if bh, ok := meta[metaRangeDelV2Name]; ok { 2756 r.rangeDelBH = bh 2757 } else if bh, ok := meta[metaRangeDelName]; ok { 2758 r.rangeDelBH = bh 2759 if !r.rawTombstones { 2760 r.rangeDelTransform = r.transformRangeDelV1 2761 } 2762 } 2763 2764 if bh, ok := meta[metaRangeKeyName]; ok { 2765 r.rangeKeyBH = bh 2766 } 2767 2768 for name, fp := range r.opts.Filters { 2769 types := []struct { 2770 ftype FilterType 2771 prefix string 2772 }{ 2773 {TableFilter, "fullfilter."}, 2774 } 2775 var done bool 2776 for _, t := range types { 2777 if bh, ok := meta[t.prefix+name]; ok { 2778 r.filterBH = bh 2779 2780 switch t.ftype { 2781 case TableFilter: 2782 r.tableFilter = newTableFilterReader(fp) 2783 default: 2784 return base.CorruptionErrorf("unknown filter type: %v", errors.Safe(t.ftype)) 2785 } 2786 2787 done = true 2788 break 2789 } 2790 } 2791 if done { 2792 break 2793 } 2794 } 2795 return nil 2796 } 2797 2798 // Layout returns the layout (block organization) for an sstable. 
2799 func (r *Reader) Layout() (*Layout, error) { 2800 if r.err != nil { 2801 return nil, r.err 2802 } 2803 2804 l := &Layout{ 2805 Data: make([]BlockHandleWithProperties, 0, r.Properties.NumDataBlocks), 2806 Filter: r.filterBH, 2807 RangeDel: r.rangeDelBH, 2808 RangeKey: r.rangeKeyBH, 2809 Properties: r.propertiesBH, 2810 MetaIndex: r.metaIndexBH, 2811 Footer: r.footerBH, 2812 } 2813 2814 indexH, err := r.readIndex() 2815 if err != nil { 2816 return nil, err 2817 } 2818 defer indexH.Release() 2819 2820 var alloc []byte 2821 2822 if r.Properties.IndexPartitions == 0 { 2823 l.Index = append(l.Index, r.indexBH) 2824 iter, _ := newBlockIter(r.Compare, indexH.Get()) 2825 for key, value := iter.First(); key != nil; key, value = iter.Next() { 2826 dataBH, err := decodeBlockHandleWithProperties(value) 2827 if err != nil { 2828 return nil, errCorruptIndexEntry 2829 } 2830 if len(dataBH.Props) > 0 { 2831 if len(alloc) < len(dataBH.Props) { 2832 alloc = make([]byte, 256<<10) 2833 } 2834 n := copy(alloc, dataBH.Props) 2835 dataBH.Props = alloc[:n:n] 2836 alloc = alloc[n:] 2837 } 2838 l.Data = append(l.Data, dataBH) 2839 } 2840 } else { 2841 l.TopIndex = r.indexBH 2842 topIter, _ := newBlockIter(r.Compare, indexH.Get()) 2843 iter := &blockIter{} 2844 for key, value := topIter.First(); key != nil; key, value = topIter.Next() { 2845 indexBH, err := decodeBlockHandleWithProperties(value) 2846 if err != nil { 2847 return nil, errCorruptIndexEntry 2848 } 2849 l.Index = append(l.Index, indexBH.BlockHandle) 2850 2851 subIndex, _, err := r.readBlock( 2852 indexBH.BlockHandle, nil /* transform */, nil /* readaheadState */) 2853 if err != nil { 2854 return nil, err 2855 } 2856 if err := iter.init(r.Compare, subIndex.Get(), 0 /* globalSeqNum */); err != nil { 2857 return nil, err 2858 } 2859 for key, value := iter.First(); key != nil; key, value = iter.Next() { 2860 dataBH, err := decodeBlockHandleWithProperties(value) 2861 if len(dataBH.Props) > 0 { 2862 if len(alloc) < len(dataBH.Props) { 2863 alloc = make([]byte, 256<<10) 2864 } 2865 n := copy(alloc, dataBH.Props) 2866 dataBH.Props = alloc[:n:n] 2867 alloc = alloc[n:] 2868 } 2869 if err != nil { 2870 return nil, errCorruptIndexEntry 2871 } 2872 l.Data = append(l.Data, dataBH) 2873 } 2874 subIndex.Release() 2875 *iter = iter.resetForReuse() 2876 } 2877 } 2878 2879 return l, nil 2880 } 2881 2882 // ValidateBlockChecksums validates the checksums for each block in the SSTable. 2883 func (r *Reader) ValidateBlockChecksums() error { 2884 // Pre-compute the BlockHandles for the underlying file. 2885 l, err := r.Layout() 2886 if err != nil { 2887 return err 2888 } 2889 2890 // Construct the set of blocks to check. Note that the footer is not checked 2891 // as it is not a block with a checksum. 2892 blocks := make([]BlockHandle, len(l.Data)) 2893 for i := range l.Data { 2894 blocks[i] = l.Data[i].BlockHandle 2895 } 2896 blocks = append(blocks, l.Index...) 2897 blocks = append(blocks, l.TopIndex, l.Filter, l.RangeDel, l.RangeKey, l.Properties, l.MetaIndex) 2898 2899 // Sorting by offset ensures we are performing a sequential scan of the 2900 // file. 2901 sort.Slice(blocks, func(i, j int) bool { 2902 return blocks[i].Offset < blocks[j].Offset 2903 }) 2904 2905 // Check all blocks sequentially. Make use of read-ahead, given we are 2906 // scanning the entire file from start to end. 2907 blockRS := &readaheadState{ 2908 size: initialReadaheadSize, 2909 } 2910 for _, bh := range blocks { 2911 // Certain blocks may not be present, in which case we skip them. 
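// A zero-length BlockHandle indicates an absent block (e.g. a table written
// without a filter, range-del, or range-key block).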
2912 if bh.Length == 0 { 2913 continue 2914 } 2915 2916 // Read the block, which validates the checksum. 2917 h, _, err := r.readBlock(bh, nil /* transform */, blockRS) 2918 if err != nil { 2919 return err 2920 } 2921 h.Release() 2922 } 2923 2924 return nil 2925 } 2926 2927 // EstimateDiskUsage returns the total size of data blocks overlapping the range 2928 // `[start, end]`. Even if a data block partially overlaps, or we cannot 2929 // determine overlap due to abbreviated index keys, the full data block size is 2930 // included in the estimation. 2931 // 2932 // This function does not account for any metablock space usage. Assumes there 2933 // is at least partial overlap, i.e., `[start, end]` falls neither completely 2934 // before nor completely after the file's range. 2935 // 2936 // Only blocks containing point keys are considered. Range deletion and range 2937 // key blocks are not considered. 2938 // 2939 // TODO(ajkr): account for metablock space usage. Perhaps look at the fraction of 2940 // data blocks overlapped and add that same fraction of the metadata blocks to the 2941 // estimate. 2942 func (r *Reader) EstimateDiskUsage(start, end []byte) (uint64, error) { 2943 if r.err != nil { 2944 return 0, r.err 2945 } 2946 2947 indexH, err := r.readIndex() 2948 if err != nil { 2949 return 0, err 2950 } 2951 defer indexH.Release() 2952 2953 // Iterators over the bottom-level index blocks containing start and end. 2954 // These may be different in case of partitioned index but will both point 2955 // to the same blockIter over the single index in the unpartitioned case. 2956 var startIdxIter, endIdxIter *blockIter 2957 if r.Properties.IndexPartitions == 0 { 2958 iter, err := newBlockIter(r.Compare, indexH.Get()) 2959 if err != nil { 2960 return 0, err 2961 } 2962 startIdxIter = iter 2963 endIdxIter = iter 2964 } else { 2965 topIter, err := newBlockIter(r.Compare, indexH.Get()) 2966 if err != nil { 2967 return 0, err 2968 } 2969 2970 key, val := topIter.SeekGE(start, base.SeekGEFlagsNone) 2971 if key == nil { 2972 // The range falls completely after this file, or an error occurred. 2973 return 0, topIter.Error() 2974 } 2975 startIdxBH, err := decodeBlockHandleWithProperties(val) 2976 if err != nil { 2977 return 0, errCorruptIndexEntry 2978 } 2979 startIdxBlock, _, err := r.readBlock( 2980 startIdxBH.BlockHandle, nil /* transform */, nil /* readaheadState */) 2981 if err != nil { 2982 return 0, err 2983 } 2984 defer startIdxBlock.Release() 2985 startIdxIter, err = newBlockIter(r.Compare, startIdxBlock.Get()) 2986 if err != nil { 2987 return 0, err 2988 } 2989 2990 key, val = topIter.SeekGE(end, base.SeekGEFlagsNone) 2991 if key == nil { 2992 if err := topIter.Error(); err != nil { 2993 return 0, err 2994 } 2995 } else { 2996 endIdxBH, err := decodeBlockHandleWithProperties(val) 2997 if err != nil { 2998 return 0, errCorruptIndexEntry 2999 } 3000 endIdxBlock, _, err := r.readBlock( 3001 endIdxBH.BlockHandle, nil /* transform */, nil /* readaheadState */) 3002 if err != nil { 3003 return 0, err 3004 } 3005 defer endIdxBlock.Release() 3006 endIdxIter, err = newBlockIter(r.Compare, endIdxBlock.Get()) 3007 if err != nil { 3008 return 0, err 3009 } 3010 } 3011 } 3012 // startIdxIter should not be nil at this point, while endIdxIter can be if the 3013 // range spans past the end of the file. 3014 3015 key, val := startIdxIter.SeekGE(start, base.SeekGEFlagsNone) 3016 if key == nil { 3017 // The range falls completely after this file, or an error occurred. 
3018 return 0, startIdxIter.Error() 3019 } 3020 startBH, err := decodeBlockHandleWithProperties(val) 3021 if err != nil { 3022 return 0, errCorruptIndexEntry 3023 } 3024 3025 if endIdxIter == nil { 3026 // The range spans beyond this file. Include data blocks through the last. 3027 return r.Properties.DataSize - startBH.Offset, nil 3028 } 3029 key, val = endIdxIter.SeekGE(end, base.SeekGEFlagsNone) 3030 if key == nil { 3031 if err := endIdxIter.Error(); err != nil { 3032 return 0, err 3033 } 3034 // The range spans beyond this file. Include data blocks through the last. 3035 return r.Properties.DataSize - startBH.Offset, nil 3036 } 3037 endBH, err := decodeBlockHandleWithProperties(val) 3038 if err != nil { 3039 return 0, errCorruptIndexEntry 3040 } 3041 return endBH.Offset + endBH.Length + blockTrailerLen - startBH.Offset, nil 3042 } 3043 3044 // TableFormat returns the format version for the table. 3045 func (r *Reader) TableFormat() (TableFormat, error) { 3046 if r.err != nil { 3047 return TableFormatUnspecified, r.err 3048 } 3049 return r.tableFormat, nil 3050 } 3051 3052 // ReadableFile describes subset of vfs.File required for reading SSTs. 3053 type ReadableFile interface { 3054 io.ReaderAt 3055 io.Closer 3056 Stat() (os.FileInfo, error) 3057 } 3058 3059 // NewReader returns a new table reader for the file. Closing the reader will 3060 // close the file. 3061 func NewReader(f ReadableFile, o ReaderOptions, extraOpts ...ReaderOption) (*Reader, error) { 3062 o = o.ensureDefaults() 3063 r := &Reader{ 3064 file: f, 3065 opts: o, 3066 } 3067 if r.opts.Cache == nil { 3068 r.opts.Cache = cache.New(0) 3069 } else { 3070 r.opts.Cache.Ref() 3071 } 3072 3073 if f == nil { 3074 r.err = errors.New("bitalostable/table: nil file") 3075 return nil, r.Close() 3076 } 3077 3078 // Note that the extra options are applied twice. First here for pre-apply 3079 // options, and then below for post-apply options. Pre and post refer to 3080 // before and after reading the metaindex and properties. 3081 type preApply interface{ preApply() } 3082 for _, opt := range extraOpts { 3083 if _, ok := opt.(preApply); ok { 3084 opt.readerApply(r) 3085 } 3086 } 3087 if r.cacheID == 0 { 3088 r.cacheID = r.opts.Cache.NewID() 3089 } 3090 3091 footer, err := readFooter(f) 3092 if err != nil { 3093 r.err = err 3094 return nil, r.Close() 3095 } 3096 r.checksumType = footer.checksum 3097 r.tableFormat = footer.format 3098 // Read the metaindex. 3099 if err := r.readMetaindex(footer.metaindexBH); err != nil { 3100 r.err = err 3101 return nil, r.Close() 3102 } 3103 r.indexBH = footer.indexBH 3104 r.metaIndexBH = footer.metaindexBH 3105 r.footerBH = footer.footerBH 3106 3107 if r.Properties.ComparerName == "" || o.Comparer.Name == r.Properties.ComparerName { 3108 r.Compare = o.Comparer.Compare 3109 r.FormatKey = o.Comparer.FormatKey 3110 r.Split = o.Comparer.Split 3111 } 3112 3113 if o.MergerName == r.Properties.MergerName { 3114 r.mergerOK = true 3115 } 3116 3117 // Apply the extra options again now that the comparer and merger names are 3118 // known. 
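// Post-apply options such as Comparers and Mergers consult
// r.Properties.ComparerName and r.Properties.MergerName, which only become
// available once the properties block has been read above.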
	for _, opt := range extraOpts {
		if _, ok := opt.(preApply); !ok {
			opt.readerApply(r)
		}
	}

	if r.Compare == nil {
		r.err = errors.Errorf("bitalostable/table: %d: unknown comparer %s",
			errors.Safe(r.fileNum), errors.Safe(r.Properties.ComparerName))
	}
	if !r.mergerOK {
		if name := r.Properties.MergerName; name != "" && name != "nullptr" {
			r.err = errors.Errorf("bitalostable/table: %d: unknown merger %s",
				errors.Safe(r.fileNum), errors.Safe(r.Properties.MergerName))
		}
	}
	if r.err != nil {
		return nil, r.Close()
	}
	return r, nil
}

// Layout describes the block organization of an sstable.
type Layout struct {
	// NOTE: changes to fields in this struct should also be reflected in
	// ValidateBlockChecksums, which validates a static list of BlockHandles
	// referenced in this struct.

	Data       []BlockHandleWithProperties
	Index      []BlockHandle
	TopIndex   BlockHandle
	Filter     BlockHandle
	RangeDel   BlockHandle
	RangeKey   BlockHandle
	Properties BlockHandle
	MetaIndex  BlockHandle
	Footer     BlockHandle
}

// Describe returns a description of the layout. If the verbose parameter is
// true, details of the structure of each block are returned as well.
func (l *Layout) Describe(
	w io.Writer, verbose bool, r *Reader, fmtRecord func(key *base.InternalKey, value []byte),
) {
	type block struct {
		BlockHandle
		name string
	}
	var blocks []block

	for i := range l.Data {
		blocks = append(blocks, block{l.Data[i].BlockHandle, "data"})
	}
	for i := range l.Index {
		blocks = append(blocks, block{l.Index[i], "index"})
	}
	if l.TopIndex.Length != 0 {
		blocks = append(blocks, block{l.TopIndex, "top-index"})
	}
	if l.Filter.Length != 0 {
		blocks = append(blocks, block{l.Filter, "filter"})
	}
	if l.RangeDel.Length != 0 {
		blocks = append(blocks, block{l.RangeDel, "range-del"})
	}
	if l.RangeKey.Length != 0 {
		blocks = append(blocks, block{l.RangeKey, "range-key"})
	}
	if l.Properties.Length != 0 {
		blocks = append(blocks, block{l.Properties, "properties"})
	}
	if l.MetaIndex.Length != 0 {
		blocks = append(blocks, block{l.MetaIndex, "meta-index"})
	}
	if l.Footer.Length != 0 {
		if l.Footer.Length == levelDBFooterLen {
			blocks = append(blocks, block{l.Footer, "leveldb-footer"})
		} else {
			blocks = append(blocks, block{l.Footer, "footer"})
		}
	}

	sort.Slice(blocks, func(i, j int) bool {
		return blocks[i].Offset < blocks[j].Offset
	})

	for i := range blocks {
		b := &blocks[i]
		fmt.Fprintf(w, "%10d %s (%d)\n", b.Offset, b.name, b.Length)

		if !verbose {
			continue
		}
		if b.name == "filter" {
			continue
		}

		if b.name == "footer" || b.name == "leveldb-footer" {
			trailer, offset := make([]byte, b.Length), b.Offset
			_, _ = r.file.ReadAt(trailer, int64(offset))

			if b.name == "footer" {
				checksumType := ChecksumType(trailer[0])
				fmt.Fprintf(w, "%10d checksum type: %s\n", offset, checksumType)
				trailer, offset = trailer[1:], offset+1
			}

			metaHandle, n := binary.Uvarint(trailer)
			metaLen, m := binary.Uvarint(trailer[n:])
			fmt.Fprintf(w, "%10d meta: offset=%d, length=%d\n", offset, metaHandle, metaLen)
			trailer, offset = trailer[n+m:], offset+uint64(n+m)

			indexHandle, n := binary.Uvarint(trailer)
			indexLen, m := binary.Uvarint(trailer[n:])
			fmt.Fprintf(w, "%10d index: offset=%d, length=%d\n", offset, indexHandle, indexLen)
			trailer, offset = trailer[n+m:], offset+uint64(n+m)

			fmt.Fprintf(w, "%10d [padding]\n", offset)

			trailing := 12
			if b.name == "leveldb-footer" {
				trailing = 8
			}

			offset += uint64(len(trailer) - trailing)
			trailer = trailer[len(trailer)-trailing:]

			if b.name == "footer" {
				version := trailer[:4]
				fmt.Fprintf(w, "%10d version: %d\n", offset, binary.LittleEndian.Uint32(version))
				trailer, offset = trailer[4:], offset+4
			}

			magicNumber := trailer
			fmt.Fprintf(w, "%10d magic number: 0x%x\n", offset, magicNumber)

			continue
		}

		h, _, err := r.readBlock(b.BlockHandle, nil /* transform */, nil /* readaheadState */)
		if err != nil {
			fmt.Fprintf(w, " [err: %s]\n", err)
			continue
		}

		getRestart := func(data []byte, restarts, i int32) int32 {
			return int32(binary.LittleEndian.Uint32(data[restarts+4*i:]))
		}

		formatIsRestart := func(data []byte, restarts, numRestarts, offset int32) {
			i := sort.Search(int(numRestarts), func(i int) bool {
				return getRestart(data, restarts, int32(i)) >= offset
			})
			if i < int(numRestarts) && getRestart(data, restarts, int32(i)) == offset {
				fmt.Fprintf(w, " [restart]\n")
			} else {
				fmt.Fprintf(w, "\n")
			}
		}

		formatRestarts := func(data []byte, restarts, numRestarts int32) {
			for i := int32(0); i < numRestarts; i++ {
				offset := getRestart(data, restarts, i)
				fmt.Fprintf(w, "%10d [restart %d]\n",
					b.Offset+uint64(restarts+4*i), b.Offset+uint64(offset))
			}
		}

		formatTrailer := func() {
			trailer := make([]byte, blockTrailerLen)
			offset := int64(b.Offset + b.Length)
			_, _ = r.file.ReadAt(trailer, offset)
			bt := blockType(trailer[0])
			checksum := binary.LittleEndian.Uint32(trailer[1:])
			fmt.Fprintf(w, "%10d [trailer compression=%s checksum=0x%04x]\n", offset, bt, checksum)
		}

		var lastKey InternalKey
		switch b.name {
		case "data", "range-del", "range-key":
			iter, _ := newBlockIter(r.Compare, h.Get())
			for key, value := iter.First(); key != nil; key, value = iter.Next() {
				ptr := unsafe.Pointer(uintptr(iter.ptr) + uintptr(iter.offset))
				shared, ptr := decodeVarint(ptr)
				unshared, ptr := decodeVarint(ptr)
				value2, _ := decodeVarint(ptr)

				total := iter.nextOffset - iter.offset
				// The format of the numbers in the record line is:
				//
				//   (<total> = <length> [<shared>] + <unshared> + <value>)
				//
				// <total> is the total number of bytes for the record.
				// <length> is the size of the 3 varint encoded integers for <shared>,
				// <unshared>, and <value>.
				// <shared> is the number of key bytes shared with the previous key.
				// <unshared> is the number of unshared key bytes.
				// <value> is the number of value bytes.
				fmt.Fprintf(w, "%10d record (%d = %d [%d] + %d + %d)",
					b.Offset+uint64(iter.offset), total,
					total-int32(unshared+value2), shared, unshared, value2)
				formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
				if fmtRecord != nil {
					fmt.Fprintf(w, " ")
					fmtRecord(key, value)
				}

				if base.InternalCompare(r.Compare, lastKey, *key) >= 0 {
					fmt.Fprintf(w, " WARNING: OUT OF ORDER KEYS!\n")
				}
				lastKey.Trailer = key.Trailer
				lastKey.UserKey = append(lastKey.UserKey[:0], key.UserKey...)
			}
			formatRestarts(iter.data, iter.restarts, iter.numRestarts)
			formatTrailer()
		case "index", "top-index":
			iter, _ := newBlockIter(r.Compare, h.Get())
			for key, value := iter.First(); key != nil; key, value = iter.Next() {
				bh, err := decodeBlockHandleWithProperties(value)
				if err != nil {
					fmt.Fprintf(w, "%10d [err: %s]\n", b.Offset+uint64(iter.offset), err)
					continue
				}
				fmt.Fprintf(w, "%10d block:%d/%d",
					b.Offset+uint64(iter.offset), bh.Offset, bh.Length)
				formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
			}
			formatRestarts(iter.data, iter.restarts, iter.numRestarts)
			formatTrailer()
		case "properties":
			iter, _ := newRawBlockIter(r.Compare, h.Get())
			for valid := iter.First(); valid; valid = iter.Next() {
				fmt.Fprintf(w, "%10d %s (%d)",
					b.Offset+uint64(iter.offset), iter.Key().UserKey, iter.nextOffset-iter.offset)
				formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
			}
			formatRestarts(iter.data, iter.restarts, iter.numRestarts)
			formatTrailer()
		case "meta-index":
			iter, _ := newRawBlockIter(r.Compare, h.Get())
			for valid := iter.First(); valid; valid = iter.Next() {
				value := iter.Value()
				bh, n := decodeBlockHandle(value)
				if n == 0 || n != len(value) {
					// The block handle must consume the entire value; otherwise
					// this meta-index entry is corrupt.
					fmt.Fprintf(w, "%10d [err: invalid block handle]\n", b.Offset+uint64(iter.offset))
					continue
				}

				fmt.Fprintf(w, "%10d %s block:%d/%d",
					b.Offset+uint64(iter.offset), iter.Key().UserKey,
					bh.Offset, bh.Length)
				formatIsRestart(iter.data, iter.restarts, iter.numRestarts, iter.offset)
			}
			formatRestarts(iter.data, iter.restarts, iter.numRestarts)
			formatTrailer()
		}

		h.Release()
	}

	last := blocks[len(blocks)-1]
	fmt.Fprintf(w, "%10d EOF\n", last.Offset+last.Length)
}

type panicFataler struct{}

func (panicFataler) Fatalf(format string, args ...interface{}) {
	panic(errors.Errorf(format, args...))
}
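
// The helper below is a minimal usage sketch, added here only to illustrate
// how Layout.Describe can be driven to dump an sstable's block organization.
// It assumes the Reader exposes a Layout() method returning (*Layout, error),
// as in upstream Pebble; the name describeTable and the choice of os.Stdout
// are illustrative rather than part of this package's API.
func describeTable(r *Reader) error {
	// Assumed: r.Layout() walks the index and meta-index blocks and returns
	// the block handles referenced by this table.
	l, err := r.Layout()
	if err != nil {
		return err
	}
	// verbose=true prints per-record details, restart points, and block
	// trailers; a nil fmtRecord skips per-record key/value formatting.
	l.Describe(os.Stdout, true /* verbose */, r, nil /* fmtRecord */)
	return nil
}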