github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/level_iter.go

github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/level_iter.go (about)

     1  // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package pebble
     6  
     7  import (
     8  	"context"
     9  	"fmt"
    10  	"runtime/debug"
    11  
    12  	"github.com/cockroachdb/pebble/internal/base"
    13  	"github.com/cockroachdb/pebble/internal/invariants"
    14  	"github.com/cockroachdb/pebble/internal/keyspan"
    15  	"github.com/cockroachdb/pebble/internal/manifest"
    16  	"github.com/cockroachdb/pebble/sstable"
    17  )
    18  
// tableNewIters creates a new point iterator and a new range-del iterator for
// the given file.
//
// opts may be nil (tableNewRangeDelIter passes nil). internalOpts carries the
// pebble-internal iterator options.
//
// On success, the internalIterator is non-nil and must be closed; the
// FragmentIterator can be nil.
// TODO(radu): always return a non-nil FragmentIterator.
//
// On error, the iterators are nil.
//
// The only (non-test) implementation of tableNewIters is tableCacheContainer.newIters().
type tableNewIters func(
	ctx context.Context,
	file *manifest.FileMetadata,
	opts *IterOptions,
	internalOpts internalIterOpts,
) (internalIterator, keyspan.FragmentIterator, error)
    35  
    36  // tableNewRangeDelIter takes a tableNewIters and returns a TableNewSpanIter
    37  // for the rangedel iterator returned by tableNewIters.
    38  func tableNewRangeDelIter(ctx context.Context, newIters tableNewIters) keyspan.TableNewSpanIter {
    39  	return func(file *manifest.FileMetadata, iterOptions keyspan.SpanIterOptions) (keyspan.FragmentIterator, error) {
    40  		iter, rangeDelIter, err := newIters(ctx, file, nil, internalIterOpts{})
    41  		if iter != nil {
    42  			_ = iter.Close()
    43  		}
    44  		if rangeDelIter == nil {
    45  			rangeDelIter = emptyKeyspanIter
    46  		}
    47  		return rangeDelIter, err
    48  	}
    49  }
    50  
// internalIterOpts holds iterator options that are internal to pebble. A value
// of this type is handed to tableNewIters whenever the iterators for a file
// are opened.
//
// NOTE(review): the individual fields are not documented in this file. Judging
// by their types, bytesIterated is presumably a running byte counter, and
// bufferPool, stats and boundLimitedFilter are presumably forwarded to the
// sstable iterator — confirm against tableCacheContainer.newIters.
type internalIterOpts struct {
	bytesIterated      *uint64
	bufferPool         *sstable.BufferPool
	stats              *base.InternalIteratorStats
	boundLimitedFilter sstable.BoundLimitedBlockPropertyFilter
}
    57  
// levelIter provides a merged view of the sstables in a level.
//
// levelIter is used during compaction and as part of the Iterator
// implementation. When used as part of the Iterator implementation, level
// iteration needs to "pause" at sstable boundaries if a range deletion
// tombstone is the source of that boundary. We know if a range tombstone is
// the smallest or largest key in a file because the kind will be
// InternalKeyKindRangeDeletion. If the boundary key is a range deletion
// tombstone, we materialize a fake entry to return from levelIter. This
// prevents mergingIter from advancing past the sstable until the sstable
// contains the smallest (or largest for reverse iteration) key in the merged
// heap. Note that mergingIter treats a range deletion tombstone returned by
// the point iterator as a no-op.
//
// SeekPrefixGE presents the need for a second type of pausing. If an sstable
// iterator returns "not found" for a SeekPrefixGE operation, we don't want to
// advance to the next sstable as the "not found" does not indicate that all of
// the keys in the sstable are less than the search key. Advancing to the next
// sstable would cause us to skip over range tombstones, violating
// correctness. Instead, SeekPrefixGE creates a synthetic boundary key with the
// kind InternalKeyKindRangeDeletion which will be used to pause the levelIter
// at the sstable until the mergingIter is ready to advance past it.
type levelIter struct {
	// The context is stored here since (a) iterators are expected to be
	// short-lived (since they pin sstables), (b) plumbing a context into every
	// method is very painful, (c) they do not (yet) respect context
	// cancellation and are only used for tracing.
	//
	// logger, comparer, cmp and split are all set by init: logger comes from
	// the iterator options, while cmp and split are comparer's Compare and
	// Split functions, cached for direct access.
	ctx      context.Context
	logger   Logger
	comparer *Comparer
	cmp      Compare
	split    Split
	// The lower/upper bounds for iteration as specified at creation or the most
	// recent call to SetBounds.
	lower []byte
	upper []byte
	// The iterator options for the currently open table. If
	// tableOpts.{Lower,Upper}Bound are nil, the corresponding iteration boundary
	// does not lie within the table bounds.
	tableOpts IterOptions
	// The LSM level this levelIter is initialized for.
	level manifest.Level
	// The keys to return when iterating past an sstable boundary and that
	// boundary is a range deletion tombstone. The boundary could be smallest
	// (i.e. arrived at with Prev), or largest (arrived at with Next).
	smallestBoundary *InternalKey
	largestBoundary  *InternalKey
	// combinedIterState may be set when a levelIter is used during user
	// iteration. Although levelIter only iterates over point keys, it's also
	// responsible for lazily constructing the combined range & point iterator
	// when it observes a file containing range keys. If the combined iter
	// state's initialized field is true, the iterator is already using combined
	// iterator, OR the iterator is not configured to use combined iteration. If
	// it's false, the levelIter must set the `triggered` and `key` fields when
	// the levelIter passes over a file containing range keys. See the
	// lazyCombinedIter for more details.
	combinedIterState *combinedIterState
	// A synthetic boundary key to return when SeekPrefixGE finds an sstable
	// which doesn't contain the search key, but which does contain range
	// tombstones.
	syntheticBoundary InternalKey
	// The iter for the current file. It is nil under any of the following conditions:
	// - files.Current() == nil
	// - err != nil
	// - some other constraint, like the bounds in opts, caused the file at index to not
	//   be relevant to the iteration.
	iter internalIterator
	// iterFile holds the current file. It is always equal to l.files.Current().
	iterFile *fileMetadata
	// filteredIter is an optional interface that may be implemented by internal
	// iterators that perform filtering of keys. When a new file's iterator is
	// opened, it's tested to see if it implements filteredIter. If it does,
	// it's stored here to allow the level iterator to recognize when keys were
	// omitted from iteration results due to filtering. This is important when a
	// file contains range deletions that may delete keys from other files. The
	// levelIter must not advance to the next file until the mergingIter has
	// advanced beyond the file's bounds. See
	// levelIterBoundaryContext.isIgnorableBoundaryKey.
	//
	// newIters is the constructor used to open the point and range-del
	// iterators for a file.
	filteredIter filteredIter
	newIters     tableNewIters
	// When rangeDelIterPtr != nil, the caller requires that *rangeDelIterPtr must
	// point to a range del iterator corresponding to the current file. When this
	// iterator returns nil, *rangeDelIterPtr should also be set to nil. Whenever
	// a non-nil internalIterator is placed in rangeDelIterPtr, a copy is placed
	// in rangeDelIterCopy. This is done for the following special case:
	// when this iterator returns nil because of exceeding the bounds, we don't
	// close iter and *rangeDelIterPtr since we could reuse it in the next seek. But
	// we need to set *rangeDelIterPtr to nil because of the aforementioned contract.
	// This copy is used to revive the *rangeDelIterPtr in the case of reuse.
	//
	// files is the iterator over this level's file metadata, and err is the
	// error state of the levelIter (iter is nil whenever err != nil).
	rangeDelIterPtr  *keyspan.FragmentIterator
	rangeDelIterCopy keyspan.FragmentIterator
	files            manifest.LevelIterator
	err              error

	// Pointer into this level's entry in `mergingIterLevel::levelIterBoundaryContext`.
	// We populate it with the corresponding bounds for the currently opened file. It is used for
	// two purposes (described for forward iteration. The explanation for backward iteration is
	// similar.)
	// - To limit the optimization that seeks lower-level iterators past keys shadowed by a range
	//   tombstone. Limiting this seek to the file largestUserKey is necessary since
	//   range tombstones are stored untruncated, while they only apply to keys within their
	//   containing file's boundaries. For a detailed example, see comment above `mergingIter`.
	// - To constrain the tombstone to act-within the bounds of the sstable when checking
	//   containment. For forward iteration we need the smallestUserKey.
	//
	// An example is sstable bounds [c#8, g#12] containing a tombstone [b, i)#7.
	// - When doing a SeekGE to user key X, the levelIter is at this sstable because X is either within
	//   the sstable bounds or earlier than the start of the sstable (and there is no sstable in
	//   between at this level). If X >= smallestUserKey, and the tombstone [b, i) contains X,
	//   it is correct to SeekGE the sstables at lower levels to min(g, i) (i.e., min of
	//   largestUserKey, tombstone.End) since any user key preceding min(g, i) must be covered by this
	//   tombstone (since it cannot have a version younger than this tombstone as it is at a lower
	//   level). And even if X = smallestUserKey or equal to the start user key of the tombstone,
	//   if the above conditions are satisfied we know that the internal keys corresponding to X at
	//   lower levels must have a version smaller than that in this file (again because of the level
	//   argument). So we don't need to use sequence numbers for this comparison.
	// - When checking whether this tombstone deletes internal key X we know that the levelIter is at this
	//   sstable so (repeating the above) X.UserKey is either within the sstable bounds or earlier than the
	//   start of the sstable (and there is no sstable in between at this level).
	//   - X is at a lower level. If X.UserKey >= smallestUserKey, and the tombstone contains
	//     X.UserKey, we know X is deleted. This argument also works when X is a user key (we use
	//     it when seeking to test whether a user key is deleted).
	//   - X is at the same level. X must be within the sstable bounds of the tombstone so the
	//     X.UserKey >= smallestUserKey comparison is trivially true. In addition to the tombstone containing
	//     X we need to compare the sequence number of X and the tombstone (we don't need to look
	//     at how this tombstone is truncated to act-within the file bounds, which are InternalKeys,
	//     since X and the tombstone are from the same file).
	//
	// Iterating backwards has one more complication when checking whether a tombstone deletes
	// internal key X at a lower level (the construction we do here also works for a user key X).
	// Consider sstable bounds [c#8, g#InternalRangeDelSentinel] containing a tombstone [b, i)#7.
	// If we are positioned at key g#10 at a lower sstable, the tombstone we will see is [b, i)#7,
	// since the higher sstable is positioned at a key <= g#10. We should not use this tombstone
	// to delete g#10. This requires knowing that the largestUserKey is a range delete sentinel,
	// which we set in a separate bool below.
	//
	// These fields differ from the `*Boundary` fields in a few ways:
	// - `*Boundary` is only populated when the iterator is positioned exactly on the sentinel key.
	// - `*Boundary` can hold either the lower- or upper-bound, depending on the iterator direction.
	// - `*Boundary` is not exposed to the next higher-level iterator, i.e., `mergingIter`.
	boundaryContext *levelIterBoundaryContext

	// internalOpts holds the internal iterator options to pass to the table
	// cache when constructing new table iterators.
	internalOpts internalIterOpts

	// Scratch space for the obsolete keys filter, when there are no other block
	// property filters specified. See the performance note where
	// IterOptions.PointKeyFilters is declared.
	filtersBuf [1]BlockPropertyFilter

	// Disable invariant checks even if they are otherwise enabled. Used by tests
	// which construct "impossible" situations (e.g. seeking to a key before the
	// lower bound).
	disableInvariants bool
}
   214  
// filteredIter is an additional interface implemented by iterators that may
// skip over point keys during iteration. The sstable.Iterator implements this
// interface.
type filteredIter interface {
	// MaybeFilteredKeys may be called when an iterator is exhausted. It
	// reports whether the iterator's last positioning method may have skipped
	// any keys due to low-level filters.
	//
	// When an iterator is configured to use block-property filters, the
	// low-level iterator may skip over blocks or whole sstables of keys.
	// Implementations that implement skipping must implement this interface.
	// Higher-level iterators require it to preserve invariants (e.g., a
	// levelIter used in a mergingIter must keep the file's range-del iterator
	// open until the mergingIter has moved past the file's bounds, even if all
	// of the file's point keys were filtered).
	//
	// MaybeFilteredKeys may always return false positives, that is, it may
	// return true when no keys were filtered. It should only be called when the
	// iterator is exhausted. It must never return false negatives when the
	// iterator is exhausted.
	MaybeFilteredKeys() bool
}
   237  
// Compile-time assertion that *levelIter implements the base.InternalIterator
// interface.
var _ base.InternalIterator = (*levelIter)(nil)
   240  
   241  // newLevelIter returns a levelIter. It is permissible to pass a nil split
   242  // parameter if the caller is never going to call SeekPrefixGE.
   243  func newLevelIter(
   244  	ctx context.Context,
   245  	opts IterOptions,
   246  	comparer *Comparer,
   247  	newIters tableNewIters,
   248  	files manifest.LevelIterator,
   249  	level manifest.Level,
   250  	internalOpts internalIterOpts,
   251  ) *levelIter {
   252  	l := &levelIter{}
   253  	l.init(ctx, opts, comparer, newIters, files, level, internalOpts)
   254  	return l
   255  }
   256  
   257  func (l *levelIter) init(
   258  	ctx context.Context,
   259  	opts IterOptions,
   260  	comparer *Comparer,
   261  	newIters tableNewIters,
   262  	files manifest.LevelIterator,
   263  	level manifest.Level,
   264  	internalOpts internalIterOpts,
   265  ) {
   266  	l.ctx = ctx
   267  	l.err = nil
   268  	l.level = level
   269  	l.logger = opts.getLogger()
   270  	l.lower = opts.LowerBound
   271  	l.upper = opts.UpperBound
   272  	l.tableOpts.TableFilter = opts.TableFilter
   273  	l.tableOpts.PointKeyFilters = opts.PointKeyFilters
   274  	if len(opts.PointKeyFilters) == 0 {
   275  		l.tableOpts.PointKeyFilters = l.filtersBuf[:0:1]
   276  	}
   277  	l.tableOpts.UseL6Filters = opts.UseL6Filters
   278  	l.tableOpts.CategoryAndQoS = opts.CategoryAndQoS
   279  	l.tableOpts.level = l.level
   280  	l.tableOpts.snapshotForHideObsoletePoints = opts.snapshotForHideObsoletePoints
   281  	l.comparer = comparer
   282  	l.cmp = comparer.Compare
   283  	l.split = comparer.Split
   284  	l.iterFile = nil
   285  	l.newIters = newIters
   286  	l.files = files
   287  	l.internalOpts = internalOpts
   288  }
   289  
// initRangeDel registers the location through which the levelIter exposes the
// range-del iterator of its current file. Once set to a non-nil pointer, the
// levelIter is responsible for keeping *rangeDelIter in sync with the current
// file (see the rangeDelIterPtr field).
func (l *levelIter) initRangeDel(rangeDelIter *keyspan.FragmentIterator) {
	l.rangeDelIterPtr = rangeDelIter
}
   293  
// initBoundaryContext registers the levelIterBoundaryContext that the
// levelIter updates as it moves between files (see the boundaryContext field).
// Note that the parameter name shadows the imported context package inside
// this function.
func (l *levelIter) initBoundaryContext(context *levelIterBoundaryContext) {
	l.boundaryContext = context
}
   297  
// initCombinedIterState registers the shared state used to lazily trigger the
// switch to combined point and range-key iteration (see the combinedIterState
// field and maybeTriggerCombinedIteration).
func (l *levelIter) initCombinedIterState(state *combinedIterState) {
	l.combinedIterState = state
}
   301  
   302  func (l *levelIter) maybeTriggerCombinedIteration(file *fileMetadata, dir int) {
   303  	// If we encounter a file that contains range keys, we may need to
   304  	// trigger a switch to combined range-key and point-key iteration,
   305  	// if the *pebble.Iterator is configured for it. This switch is done
   306  	// lazily because range keys are intended to be rare, and
   307  	// constructing the range-key iterator substantially adds to the
   308  	// cost of iterator construction and seeking.
   309  	//
   310  	// If l.combinedIterState.initialized is already true, either the
   311  	// iterator is already using combined iteration or the iterator is not
   312  	// configured to observe range keys. Either way, there's nothing to do.
   313  	// If false, trigger the switch to combined iteration, using the the
   314  	// file's bounds to seek the range-key iterator appropriately.
   315  	//
   316  	// We only need to trigger combined iteration if the file contains
   317  	// RangeKeySets: if there are only Unsets and Dels, the user will observe no
   318  	// range keys regardless. If this file has table stats available, they'll
   319  	// tell us whether the file has any RangeKeySets. Otherwise, we must
   320  	// fallback to assuming it does if HasRangeKeys=true.
   321  	if file != nil && file.HasRangeKeys && l.combinedIterState != nil && !l.combinedIterState.initialized &&
   322  		(l.upper == nil || l.cmp(file.SmallestRangeKey.UserKey, l.upper) < 0) &&
   323  		(l.lower == nil || l.cmp(file.LargestRangeKey.UserKey, l.lower) > 0) &&
   324  		(!file.StatsValid() || file.Stats.NumRangeKeySets > 0) {
   325  		// The file contains range keys, and we're not using combined iteration yet.
   326  		// Trigger a switch to combined iteration. It's possible that a switch has
   327  		// already been triggered if multiple levels encounter files containing
   328  		// range keys while executing a single mergingIter operation. In this case,
   329  		// we need to compare the existing key recorded to l.combinedIterState.key,
   330  		// adjusting it if our key is smaller (forward iteration) or larger
   331  		// (backward iteration) than the existing key.
   332  		//
   333  		// These key comparisons are only required during a single high-level
   334  		// iterator operation. When the high-level iter op completes,
   335  		// iinitialized will be true, and future calls to this function will be
   336  		// no-ops.
   337  		switch dir {
   338  		case +1:
   339  			if !l.combinedIterState.triggered {
   340  				l.combinedIterState.triggered = true
   341  				l.combinedIterState.key = file.SmallestRangeKey.UserKey
   342  			} else if l.cmp(l.combinedIterState.key, file.SmallestRangeKey.UserKey) > 0 {
   343  				l.combinedIterState.key = file.SmallestRangeKey.UserKey
   344  			}
   345  		case -1:
   346  			if !l.combinedIterState.triggered {
   347  				l.combinedIterState.triggered = true
   348  				l.combinedIterState.key = file.LargestRangeKey.UserKey
   349  			} else if l.cmp(l.combinedIterState.key, file.LargestRangeKey.UserKey) < 0 {
   350  				l.combinedIterState.key = file.LargestRangeKey.UserKey
   351  			}
   352  		}
   353  	}
   354  }
   355  
   356  func (l *levelIter) findFileGE(key []byte, flags base.SeekGEFlags) *fileMetadata {
   357  	// Find the earliest file whose largest key is >= key.
   358  
   359  	// NB: if flags.TrySeekUsingNext()=true, the levelIter must respect it. If
   360  	// the levelIter is positioned at the key P, it must return a key ≥ P. If
   361  	// used within a merging iterator, the merging iterator will depend on the
   362  	// levelIter only moving forward to maintain heap invariants.
   363  
   364  	// Ordinarily we seek the LevelIterator using SeekGE. In some instances, we
   365  	// Next instead. In other instances, we try Next-ing first, falling back to
   366  	// seek:
   367  	//   a) flags.TrySeekUsingNext(): The top-level Iterator knows we're seeking
   368  	//      to a key later than the current iterator position. We don't know how
   369  	//      much later the seek key is, so it's possible there are many sstables
   370  	//      between the current position and the seek key. However in most real-
   371  	//      world use cases, the seek key is likely to be nearby. Rather than
   372  	//      performing a log(N) seek through the file metadata, we next a few
   373  	//      times from from our existing location. If we don't find a file whose
   374  	//      largest is >= key within a few nexts, we fall back to seeking.
   375  	//
   376  	//      Note that in this case, the file returned by findFileGE may be
   377  	//      different than the file returned by a raw binary search (eg, when
   378  	//      TrySeekUsingNext=false). This is possible because the most recent
   379  	//      positioning operation may have already determined that previous
   380  	//      files' keys that are ≥ key are all deleted. This information is
   381  	//      encoded within the iterator's current iterator position and is
   382  	//      unavailable to a fresh binary search.
   383  	//
   384  	//   b) flags.RelativeSeek(): The merging iterator decided to re-seek this
   385  	//      level according to a range tombstone. When lazy combined iteration
   386  	//      is enabled, the level iterator is responsible for watching for
   387  	//      files containing range keys and triggering the switch to combined
   388  	//      iteration when such a file is observed. If a range deletion was
   389  	//      observed in a higher level causing the merging iterator to seek the
   390  	//      level to the range deletion's end key, we need to check whether all
   391  	//      of the files between the old position and the new position contain
   392  	//      any range keys.
   393  	//
   394  	//      In this scenario, we don't seek the LevelIterator and instead we
   395  	//      Next it, one file at a time, checking each for range keys. The
   396  	//      merging iterator sets this flag to inform us that we're moving
   397  	//      forward relative to the existing position and that we must examine
   398  	//      each intermediate sstable's metadata for lazy-combined iteration.
   399  	//      In this case, we only Next and never Seek. We set nextsUntilSeek=-1
   400  	//      to signal this intention.
   401  	//
   402  	// NB: At most one of flags.RelativeSeek() and flags.TrySeekUsingNext() may
   403  	// be set, because the merging iterator re-seeks relative seeks with
   404  	// explicitly only the RelativeSeek flag set.
   405  	var nextsUntilSeek int
   406  	var nextInsteadOfSeek bool
   407  	if flags.TrySeekUsingNext() {
   408  		nextInsteadOfSeek = true
   409  		nextsUntilSeek = 4 // arbitrary
   410  	}
   411  	if flags.RelativeSeek() && l.combinedIterState != nil && !l.combinedIterState.initialized {
   412  		nextInsteadOfSeek = true
   413  		nextsUntilSeek = -1
   414  	}
   415  
   416  	var m *fileMetadata
   417  	if nextInsteadOfSeek {
   418  		m = l.iterFile
   419  	} else {
   420  		m = l.files.SeekGE(l.cmp, key)
   421  	}
   422  	// The below loop has a bit of an unusual organization. There are several
   423  	// conditions under which we need to Next to a later file. If none of those
   424  	// conditions are met, the file in `m` is okay to return. The loop body is
   425  	// structured with a series of if statements, each of which may continue the
   426  	// loop to the next file. If none of the statements are met, the end of the
   427  	// loop body is a break.
   428  	for m != nil {
   429  		if m.HasRangeKeys {
   430  			l.maybeTriggerCombinedIteration(m, +1)
   431  
   432  			// Some files may only contain range keys, which we can skip.
   433  			// NB: HasPointKeys=true if the file contains any points or range
   434  			// deletions (which delete points).
   435  			if !m.HasPointKeys {
   436  				m = l.files.Next()
   437  				continue
   438  			}
   439  		}
   440  
   441  		// This file has point keys.
   442  		//
   443  		// However, there are a couple reasons why `m` may not be positioned ≥
   444  		// `key` yet:
   445  		//
   446  		// 1. If SeekGE(key) landed on a file containing range keys, the file
   447  		//    may contain range keys ≥ `key` but no point keys ≥ `key`.
   448  		// 2. When nexting instead of seeking, we must check to see whether
   449  		//    we've nexted sufficiently far, or we need to next again.
   450  		//
   451  		// If the file does not contain point keys ≥ `key`, next to continue
   452  		// looking for a file that does.
   453  		if (m.HasRangeKeys || nextInsteadOfSeek) && l.cmp(m.LargestPointKey.UserKey, key) < 0 {
   454  			// If nextInsteadOfSeek is set and nextsUntilSeek is non-negative,
   455  			// the iterator has been nexting hoping to discover the relevant
   456  			// file without seeking. It's exhausted the allotted nextsUntilSeek
   457  			// and should seek to the sought key.
   458  			if nextInsteadOfSeek && nextsUntilSeek == 0 {
   459  				nextInsteadOfSeek = false
   460  				m = l.files.SeekGE(l.cmp, key)
   461  				continue
   462  			} else if nextsUntilSeek > 0 {
   463  				nextsUntilSeek--
   464  			}
   465  			m = l.files.Next()
   466  			continue
   467  		}
   468  
   469  		// This file has a point key bound ≥ `key`. But the largest point key
   470  		// bound may still be a range deletion sentinel, which is exclusive.  In
   471  		// this case, the file doesn't actually contain any point keys equal to
   472  		// `key`. We next to keep searching for a file that actually contains
   473  		// point keys ≥ key.
   474  		//
   475  		// Additionally, this prevents loading untruncated range deletions from
   476  		// a table which can't possibly contain the target key and is required
   477  		// for correctness by mergingIter.SeekGE (see the comment in that
   478  		// function).
   479  		if m.LargestPointKey.IsExclusiveSentinel() && l.cmp(m.LargestPointKey.UserKey, key) == 0 {
   480  			m = l.files.Next()
   481  			continue
   482  		}
   483  
   484  		// This file contains point keys ≥ `key`. Break and return it.
   485  		break
   486  	}
   487  	return m
   488  }
   489  
   490  func (l *levelIter) findFileLT(key []byte, flags base.SeekLTFlags) *fileMetadata {
   491  	// Find the last file whose smallest key is < ikey.
   492  
   493  	// Ordinarily we seek the LevelIterator using SeekLT.
   494  	//
   495  	// When lazy combined iteration is enabled, there's a complication. The
   496  	// level iterator is responsible for watching for files containing range
   497  	// keys and triggering the switch to combined iteration when such a file is
   498  	// observed. If a range deletion was observed in a higher level causing the
   499  	// merging iterator to seek the level to the range deletion's start key, we
   500  	// need to check whether all of the files between the old position and the
   501  	// new position contain any range keys.
   502  	//
   503  	// In this scenario, we don't seek the LevelIterator and instead we Prev it,
   504  	// one file at a time, checking each for range keys.
   505  	prevInsteadOfSeek := flags.RelativeSeek() && l.combinedIterState != nil && !l.combinedIterState.initialized
   506  
   507  	var m *fileMetadata
   508  	if prevInsteadOfSeek {
   509  		m = l.iterFile
   510  	} else {
   511  		m = l.files.SeekLT(l.cmp, key)
   512  	}
   513  	// The below loop has a bit of an unusual organization. There are several
   514  	// conditions under which we need to Prev to a previous file. If none of
   515  	// those conditions are met, the file in `m` is okay to return. The loop
   516  	// body is structured with a series of if statements, each of which may
   517  	// continue the loop to the previous file. If none of the statements are
   518  	// met, the end of the loop body is a break.
   519  	for m != nil {
   520  		if m.HasRangeKeys {
   521  			l.maybeTriggerCombinedIteration(m, -1)
   522  
   523  			// Some files may only contain range keys, which we can skip.
   524  			// NB: HasPointKeys=true if the file contains any points or range
   525  			// deletions (which delete points).
   526  			if !m.HasPointKeys {
   527  				m = l.files.Prev()
   528  				continue
   529  			}
   530  		}
   531  
   532  		// This file has point keys.
   533  		//
   534  		// However, there are a couple reasons why `m` may not be positioned <
   535  		// `key` yet:
   536  		//
   537  		// 1. If SeekLT(key) landed on a file containing range keys, the file
   538  		//    may contain range keys < `key` but no point keys < `key`.
   539  		// 2. When preving instead of seeking, we must check to see whether
   540  		//    we've preved sufficiently far, or we need to prev again.
   541  		//
   542  		// If the file does not contain point keys < `key`, prev to continue
   543  		// looking for a file that does.
   544  		if (m.HasRangeKeys || prevInsteadOfSeek) && l.cmp(m.SmallestPointKey.UserKey, key) >= 0 {
   545  			m = l.files.Prev()
   546  			continue
   547  		}
   548  
   549  		// This file contains point keys < `key`. Break and return it.
   550  		break
   551  	}
   552  	return m
   553  }
   554  
   555  // Init the iteration bounds for the current table. Returns -1 if the table
   556  // lies fully before the lower bound, +1 if the table lies fully after the
   557  // upper bound, and 0 if the table overlaps the iteration bounds.
   558  func (l *levelIter) initTableBounds(f *fileMetadata) int {
   559  	l.tableOpts.LowerBound = l.lower
   560  	if l.tableOpts.LowerBound != nil {
   561  		if l.cmp(f.LargestPointKey.UserKey, l.tableOpts.LowerBound) < 0 {
   562  			// The largest key in the sstable is smaller than the lower bound.
   563  			return -1
   564  		}
   565  		if l.cmp(l.tableOpts.LowerBound, f.SmallestPointKey.UserKey) <= 0 {
   566  			// The lower bound is smaller or equal to the smallest key in the
   567  			// table. Iteration within the table does not need to check the lower
   568  			// bound.
   569  			l.tableOpts.LowerBound = nil
   570  		}
   571  	}
   572  	l.tableOpts.UpperBound = l.upper
   573  	if l.tableOpts.UpperBound != nil {
   574  		if l.cmp(f.SmallestPointKey.UserKey, l.tableOpts.UpperBound) >= 0 {
   575  			// The smallest key in the sstable is greater than or equal to the upper
   576  			// bound.
   577  			return 1
   578  		}
   579  		if l.cmp(l.tableOpts.UpperBound, f.LargestPointKey.UserKey) > 0 {
   580  			// The upper bound is greater than the largest key in the
   581  			// table. Iteration within the table does not need to check the upper
   582  			// bound. NB: tableOpts.UpperBound is exclusive and f.LargestPointKey is
   583  			// inclusive.
   584  			l.tableOpts.UpperBound = nil
   585  		}
   586  	}
   587  	return 0
   588  }
   589  
   590  type loadFileReturnIndicator int8
   591  
   592  const (
   593  	noFileLoaded loadFileReturnIndicator = iota
   594  	fileAlreadyLoaded
   595  	newFileLoaded
   596  )
   597  
   598  func (l *levelIter) loadFile(file *fileMetadata, dir int) loadFileReturnIndicator {
   599  	l.smallestBoundary = nil
   600  	l.largestBoundary = nil
   601  	if l.boundaryContext != nil {
   602  		l.boundaryContext.isSyntheticIterBoundsKey = false
   603  		l.boundaryContext.isIgnorableBoundaryKey = false
   604  	}
   605  	if l.iterFile == file {
   606  		if l.err != nil {
   607  			return noFileLoaded
   608  		}
   609  		if l.iter != nil {
   610  			// We don't bother comparing the file bounds with the iteration bounds when we have
   611  			// an already open iterator. It is possible that the iter may not be relevant given the
   612  			// current iteration bounds, but it knows those bounds, so it will enforce them.
   613  			if l.rangeDelIterPtr != nil {
   614  				*l.rangeDelIterPtr = l.rangeDelIterCopy
   615  			}
   616  
   617  			// There are a few reasons we might not have triggered combined
   618  			// iteration yet, even though we already had `file` open.
   619  			// 1. If the bounds changed, we might have previously avoided
   620  			//    switching to combined iteration because the bounds excluded
   621  			//    the range keys contained in this file.
   622  			// 2. If an existing iterator was reconfigured to iterate over range
   623  			//    keys (eg, using SetOptions), then we wouldn't have triggered
   624  			//    the switch to combined iteration yet.
   625  			l.maybeTriggerCombinedIteration(file, dir)
   626  			return fileAlreadyLoaded
   627  		}
   628  		// We were already at file, but don't have an iterator, probably because the file was
   629  		// beyond the iteration bounds. It may still be, but it is also possible that the bounds
   630  		// have changed. We handle that below.
   631  	}
   632  
   633  	// Close both iter and rangeDelIterPtr. While mergingIter knows about
   634  	// rangeDelIterPtr, it can't call Close() on it because it does not know
   635  	// when the levelIter will switch it. Note that levelIter.Close() can be
   636  	// called multiple times.
   637  	if err := l.Close(); err != nil {
   638  		return noFileLoaded
   639  	}
   640  
   641  	for {
   642  		l.iterFile = file
   643  		if file == nil {
   644  			return noFileLoaded
   645  		}
   646  
   647  		l.maybeTriggerCombinedIteration(file, dir)
   648  		if !file.HasPointKeys {
   649  			switch dir {
   650  			case +1:
   651  				file = l.files.Next()
   652  				continue
   653  			case -1:
   654  				file = l.files.Prev()
   655  				continue
   656  			}
   657  		}
   658  
   659  		switch l.initTableBounds(file) {
   660  		case -1:
   661  			// The largest key in the sstable is smaller than the lower bound.
   662  			if dir < 0 {
   663  				return noFileLoaded
   664  			}
   665  			file = l.files.Next()
   666  			continue
   667  		case +1:
   668  			// The smallest key in the sstable is greater than or equal to the upper
   669  			// bound.
   670  			if dir > 0 {
   671  				return noFileLoaded
   672  			}
   673  			file = l.files.Prev()
   674  			continue
   675  		}
   676  
   677  		var rangeDelIter keyspan.FragmentIterator
   678  		var iter internalIterator
   679  		iter, rangeDelIter, l.err = l.newIters(l.ctx, l.iterFile, &l.tableOpts, l.internalOpts)
   680  		l.iter = iter
   681  		if l.err != nil {
   682  			return noFileLoaded
   683  		}
   684  		if rangeDelIter != nil {
   685  			if fi, ok := iter.(filteredIter); ok {
   686  				l.filteredIter = fi
   687  			} else {
   688  				l.filteredIter = nil
   689  			}
   690  		} else {
   691  			l.filteredIter = nil
   692  		}
   693  		if l.rangeDelIterPtr != nil {
   694  			*l.rangeDelIterPtr = rangeDelIter
   695  			l.rangeDelIterCopy = rangeDelIter
   696  		} else if rangeDelIter != nil {
   697  			rangeDelIter.Close()
   698  		}
   699  		if l.boundaryContext != nil {
   700  			l.boundaryContext.smallestUserKey = file.Smallest.UserKey
   701  			l.boundaryContext.largestUserKey = file.Largest.UserKey
   702  			l.boundaryContext.isLargestUserKeyExclusive = file.Largest.IsExclusiveSentinel()
   703  		}
   704  		return newFileLoaded
   705  	}
   706  }
   707  
   708  // In race builds we verify that the keys returned by levelIter lie within
   709  // [lower,upper).
   710  func (l *levelIter) verify(key *InternalKey, val base.LazyValue) (*InternalKey, base.LazyValue) {
   711  	// Note that invariants.Enabled is a compile time constant, which means the
   712  	// block of code will be compiled out of normal builds making this method
   713  	// eligible for inlining. Do not change this to use a variable.
   714  	if invariants.Enabled && !l.disableInvariants && key != nil {
   715  		// We allow returning a boundary key that is outside of the lower/upper
   716  		// bounds as such keys are always range tombstones which will be skipped by
   717  		// the Iterator.
   718  		if l.lower != nil && key != l.smallestBoundary && l.cmp(key.UserKey, l.lower) < 0 {
   719  			l.logger.Fatalf("levelIter %s: lower bound violation: %s < %s\n%s", l.level, key, l.lower, debug.Stack())
   720  		}
   721  		if l.upper != nil && key != l.largestBoundary && l.cmp(key.UserKey, l.upper) > 0 {
   722  			l.logger.Fatalf("levelIter %s: upper bound violation: %s > %s\n%s", l.level, key, l.upper, debug.Stack())
   723  		}
   724  	}
   725  	return key, val
   726  }
   727  
   728  func (l *levelIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) {
   729  	l.err = nil // clear cached iteration error
   730  	if l.boundaryContext != nil {
   731  		l.boundaryContext.isSyntheticIterBoundsKey = false
   732  		l.boundaryContext.isIgnorableBoundaryKey = false
   733  	}
   734  	// NB: the top-level Iterator has already adjusted key based on
   735  	// IterOptions.LowerBound.
   736  	loadFileIndicator := l.loadFile(l.findFileGE(key, flags), +1)
   737  	if loadFileIndicator == noFileLoaded {
   738  		return nil, base.LazyValue{}
   739  	}
   740  	if loadFileIndicator == newFileLoaded {
   741  		// File changed, so l.iter has changed, and that iterator is not
   742  		// positioned appropriately.
   743  		flags = flags.DisableTrySeekUsingNext()
   744  	}
   745  	if ikey, val := l.iter.SeekGE(key, flags); ikey != nil {
   746  		return l.verify(ikey, val)
   747  	}
   748  	return l.verify(l.skipEmptyFileForward())
   749  }
   750  
   751  func (l *levelIter) SeekPrefixGE(
   752  	prefix, key []byte, flags base.SeekGEFlags,
   753  ) (*base.InternalKey, base.LazyValue) {
   754  	l.err = nil // clear cached iteration error
   755  	if l.boundaryContext != nil {
   756  		l.boundaryContext.isSyntheticIterBoundsKey = false
   757  		l.boundaryContext.isIgnorableBoundaryKey = false
   758  	}
   759  
   760  	// NB: the top-level Iterator has already adjusted key based on
   761  	// IterOptions.LowerBound.
   762  	loadFileIndicator := l.loadFile(l.findFileGE(key, flags), +1)
   763  	if loadFileIndicator == noFileLoaded {
   764  		return nil, base.LazyValue{}
   765  	}
   766  	if loadFileIndicator == newFileLoaded {
   767  		// File changed, so l.iter has changed, and that iterator is not
   768  		// positioned appropriately.
   769  		flags = flags.DisableTrySeekUsingNext()
   770  	}
   771  	if key, val := l.iter.SeekPrefixGE(prefix, key, flags); key != nil {
   772  		return l.verify(key, val)
   773  	}
   774  	// When SeekPrefixGE returns nil, we have not necessarily reached the end of
   775  	// the sstable. All we know is that a key with prefix does not exist in the
   776  	// current sstable. We do know that the key lies within the bounds of the
   777  	// table as findFileGE found the table where key <= meta.Largest. We return
   778  	// the table's bound with isIgnorableBoundaryKey set.
   779  	if l.rangeDelIterPtr != nil && *l.rangeDelIterPtr != nil {
   780  		if l.tableOpts.UpperBound != nil {
   781  			l.syntheticBoundary.UserKey = l.tableOpts.UpperBound
   782  			l.syntheticBoundary.Trailer = InternalKeyRangeDeleteSentinel
   783  			l.largestBoundary = &l.syntheticBoundary
   784  			if l.boundaryContext != nil {
   785  				l.boundaryContext.isSyntheticIterBoundsKey = true
   786  				l.boundaryContext.isIgnorableBoundaryKey = false
   787  			}
   788  			return l.verify(l.largestBoundary, base.LazyValue{})
   789  		}
   790  		// Return the file's largest bound, ensuring this file stays open until
   791  		// the mergingIter advances beyond the file's bounds. We set
   792  		// isIgnorableBoundaryKey to signal that the actual key returned should
   793  		// be ignored, and does not represent a real key in the database.
   794  		l.largestBoundary = &l.iterFile.LargestPointKey
   795  		if l.boundaryContext != nil {
   796  			l.boundaryContext.isSyntheticIterBoundsKey = false
   797  			l.boundaryContext.isIgnorableBoundaryKey = true
   798  		}
   799  		return l.verify(l.largestBoundary, base.LazyValue{})
   800  	}
   801  	// It is possible that we are here because bloom filter matching failed. In
   802  	// that case it is likely that all keys matching the prefix are wholly
   803  	// within the current file and cannot be in the subsequent file. In that
   804  	// case we don't want to go to the next file, since loading and seeking in
   805  	// there has some cost. Additionally, for sparse key spaces, loading the
   806  	// next file will defeat the optimization for the next SeekPrefixGE that is
   807  	// called with flags.TrySeekUsingNext(), since for sparse key spaces it is
   808  	// likely that the next key will also be contained in the current file.
   809  	var n int
   810  	if l.split != nil {
   811  		// If the split function is specified, calculate the prefix length accordingly.
   812  		n = l.split(l.iterFile.LargestPointKey.UserKey)
   813  	} else {
   814  		// If the split function is not specified, the entire key is used as the
   815  		// prefix. This case can occur when getIter uses SeekPrefixGE.
   816  		n = len(l.iterFile.LargestPointKey.UserKey)
   817  	}
   818  	if l.cmp(prefix, l.iterFile.LargestPointKey.UserKey[:n]) < 0 {
   819  		return nil, base.LazyValue{}
   820  	}
   821  	return l.verify(l.skipEmptyFileForward())
   822  }
   823  
   824  func (l *levelIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) {
   825  	l.err = nil // clear cached iteration error
   826  	if l.boundaryContext != nil {
   827  		l.boundaryContext.isSyntheticIterBoundsKey = false
   828  		l.boundaryContext.isIgnorableBoundaryKey = false
   829  	}
   830  
   831  	// NB: the top-level Iterator has already adjusted key based on
   832  	// IterOptions.UpperBound.
   833  	if l.loadFile(l.findFileLT(key, flags), -1) == noFileLoaded {
   834  		return nil, base.LazyValue{}
   835  	}
   836  	if key, val := l.iter.SeekLT(key, flags); key != nil {
   837  		return l.verify(key, val)
   838  	}
   839  	return l.verify(l.skipEmptyFileBackward())
   840  }
   841  
   842  func (l *levelIter) First() (*InternalKey, base.LazyValue) {
   843  	l.err = nil // clear cached iteration error
   844  	if l.boundaryContext != nil {
   845  		l.boundaryContext.isSyntheticIterBoundsKey = false
   846  		l.boundaryContext.isIgnorableBoundaryKey = false
   847  	}
   848  
   849  	// NB: the top-level Iterator will call SeekGE if IterOptions.LowerBound is
   850  	// set.
   851  	if l.loadFile(l.files.First(), +1) == noFileLoaded {
   852  		return nil, base.LazyValue{}
   853  	}
   854  	if key, val := l.iter.First(); key != nil {
   855  		return l.verify(key, val)
   856  	}
   857  	return l.verify(l.skipEmptyFileForward())
   858  }
   859  
   860  func (l *levelIter) Last() (*InternalKey, base.LazyValue) {
   861  	l.err = nil // clear cached iteration error
   862  	if l.boundaryContext != nil {
   863  		l.boundaryContext.isSyntheticIterBoundsKey = false
   864  		l.boundaryContext.isIgnorableBoundaryKey = false
   865  	}
   866  
   867  	// NB: the top-level Iterator will call SeekLT if IterOptions.UpperBound is
   868  	// set.
   869  	if l.loadFile(l.files.Last(), -1) == noFileLoaded {
   870  		return nil, base.LazyValue{}
   871  	}
   872  	if key, val := l.iter.Last(); key != nil {
   873  		return l.verify(key, val)
   874  	}
   875  	return l.verify(l.skipEmptyFileBackward())
   876  }
   877  
   878  func (l *levelIter) Next() (*InternalKey, base.LazyValue) {
   879  	if l.err != nil || l.iter == nil {
   880  		return nil, base.LazyValue{}
   881  	}
   882  	if l.boundaryContext != nil {
   883  		l.boundaryContext.isSyntheticIterBoundsKey = false
   884  		l.boundaryContext.isIgnorableBoundaryKey = false
   885  	}
   886  
   887  	switch {
   888  	case l.largestBoundary != nil:
   889  		if l.tableOpts.UpperBound != nil {
   890  			// The UpperBound was within this file, so don't load the next
   891  			// file. We leave the largestBoundary unchanged so that subsequent
   892  			// calls to Next() stay at this file. If a Seek/First/Last call is
   893  			// made and this file continues to be relevant, loadFile() will
   894  			// set the largestBoundary to nil.
   895  			if l.rangeDelIterPtr != nil {
   896  				*l.rangeDelIterPtr = nil
   897  			}
   898  			return nil, base.LazyValue{}
   899  		}
   900  		// We're stepping past the boundary key, so now we can load the next file.
   901  		if l.loadFile(l.files.Next(), +1) != noFileLoaded {
   902  			if key, val := l.iter.First(); key != nil {
   903  				return l.verify(key, val)
   904  			}
   905  			return l.verify(l.skipEmptyFileForward())
   906  		}
   907  		return nil, base.LazyValue{}
   908  
   909  	default:
   910  		// Reset the smallest boundary since we're moving away from it.
   911  		l.smallestBoundary = nil
   912  		if key, val := l.iter.Next(); key != nil {
   913  			return l.verify(key, val)
   914  		}
   915  	}
   916  	return l.verify(l.skipEmptyFileForward())
   917  }
   918  
   919  func (l *levelIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) {
   920  	if l.err != nil || l.iter == nil {
   921  		return nil, base.LazyValue{}
   922  	}
   923  	if l.boundaryContext != nil {
   924  		l.boundaryContext.isSyntheticIterBoundsKey = false
   925  		l.boundaryContext.isIgnorableBoundaryKey = false
   926  	}
   927  
   928  	switch {
   929  	case l.largestBoundary != nil:
   930  		if l.tableOpts.UpperBound != nil {
   931  			// The UpperBound was within this file, so don't load the next
   932  			// file. We leave the largestBoundary unchanged so that subsequent
   933  			// calls to Next() stay at this file. If a Seek/First/Last call is
   934  			// made and this file continues to be relevant, loadFile() will
   935  			// set the largestBoundary to nil.
   936  			if l.rangeDelIterPtr != nil {
   937  				*l.rangeDelIterPtr = nil
   938  			}
   939  			return nil, base.LazyValue{}
   940  		}
   941  		// We're stepping past the boundary key, so we need to load a later
   942  		// file.
   943  
   944  	default:
   945  		// Reset the smallest boundary since we're moving away from it.
   946  		l.smallestBoundary = nil
   947  
   948  		if key, val := l.iter.NextPrefix(succKey); key != nil {
   949  			return l.verify(key, val)
   950  		}
   951  		// Fall through to seeking.
   952  	}
   953  
   954  	// Seek the manifest level iterator using TrySeekUsingNext=true and
   955  	// RelativeSeek=true so that we take advantage of the knowledge that
   956  	// `succKey` can only be contained in later files.
   957  	metadataSeekFlags := base.SeekGEFlagsNone.EnableTrySeekUsingNext().EnableRelativeSeek()
   958  	if l.loadFile(l.findFileGE(succKey, metadataSeekFlags), +1) != noFileLoaded {
   959  		// NB: The SeekGE on the file's iterator must not set TrySeekUsingNext,
   960  		// because l.iter is unpositioned.
   961  		if key, val := l.iter.SeekGE(succKey, base.SeekGEFlagsNone); key != nil {
   962  			return l.verify(key, val)
   963  		}
   964  		return l.verify(l.skipEmptyFileForward())
   965  	}
   966  	return nil, base.LazyValue{}
   967  }
   968  
   969  func (l *levelIter) Prev() (*InternalKey, base.LazyValue) {
   970  	if l.err != nil || l.iter == nil {
   971  		return nil, base.LazyValue{}
   972  	}
   973  	if l.boundaryContext != nil {
   974  		l.boundaryContext.isSyntheticIterBoundsKey = false
   975  		l.boundaryContext.isIgnorableBoundaryKey = false
   976  	}
   977  
   978  	switch {
   979  	case l.smallestBoundary != nil:
   980  		if l.tableOpts.LowerBound != nil {
   981  			// The LowerBound was within this file, so don't load the previous
   982  			// file. We leave the smallestBoundary unchanged so that
   983  			// subsequent calls to Prev() stay at this file. If a
   984  			// Seek/First/Last call is made and this file continues to be
   985  			// relevant, loadFile() will set the smallestBoundary to nil.
   986  			if l.rangeDelIterPtr != nil {
   987  				*l.rangeDelIterPtr = nil
   988  			}
   989  			return nil, base.LazyValue{}
   990  		}
   991  		// We're stepping past the boundary key, so now we can load the prev file.
   992  		if l.loadFile(l.files.Prev(), -1) != noFileLoaded {
   993  			if key, val := l.iter.Last(); key != nil {
   994  				return l.verify(key, val)
   995  			}
   996  			return l.verify(l.skipEmptyFileBackward())
   997  		}
   998  		return nil, base.LazyValue{}
   999  
  1000  	default:
  1001  		// Reset the largest boundary since we're moving away from it.
  1002  		l.largestBoundary = nil
  1003  		if key, val := l.iter.Prev(); key != nil {
  1004  			return l.verify(key, val)
  1005  		}
  1006  	}
  1007  	return l.verify(l.skipEmptyFileBackward())
  1008  }
  1009  
  1010  func (l *levelIter) skipEmptyFileForward() (*InternalKey, base.LazyValue) {
  1011  	var key *InternalKey
  1012  	var val base.LazyValue
  1013  	// The first iteration of this loop starts with an already exhausted
  1014  	// l.iter. The reason for the exhaustion is either that we iterated to the
  1015  	// end of the sstable, or our iteration was terminated early due to the
  1016  	// presence of an upper-bound or the use of SeekPrefixGE. If
  1017  	// l.rangeDelIterPtr is non-nil, we may need to pretend the iterator is
  1018  	// not exhausted to allow for the merging to finish consuming the
  1019  	// l.rangeDelIterPtr before levelIter switches the rangeDelIter from
  1020  	// under it. This pretense is done by either generating a synthetic
  1021  	// boundary key or returning the largest key of the file, depending on the
  1022  	// exhaustion reason.
  1023  
  1024  	// Subsequent iterations will examine consecutive files such that the first
  1025  	// file that does not have an exhausted iterator causes the code to return
  1026  	// that key, else the behavior described above if there is a corresponding
  1027  	// rangeDelIterPtr.
  1028  	for ; key == nil; key, val = l.iter.First() {
  1029  		if l.rangeDelIterPtr != nil {
  1030  			// We're being used as part of a mergingIter and we've exhausted the
  1031  			// current sstable. If an upper bound is present and the upper bound lies
  1032  			// within the current sstable, then we will have reached the upper bound
  1033  			// rather than the end of the sstable. We need to return a synthetic
  1034  			// boundary key so that mergingIter can use the range tombstone iterator
  1035  			// until the other levels have reached this boundary.
  1036  			//
  1037  			// It is safe to set the boundary key to the UpperBound user key
  1038  			// with the RANGEDEL sentinel since it is the smallest InternalKey
  1039  			// that matches the exclusive upper bound, and does not represent
  1040  			// a real key.
  1041  			if l.tableOpts.UpperBound != nil {
  1042  				if *l.rangeDelIterPtr != nil {
  1043  					l.syntheticBoundary.UserKey = l.tableOpts.UpperBound
  1044  					l.syntheticBoundary.Trailer = InternalKeyRangeDeleteSentinel
  1045  					l.largestBoundary = &l.syntheticBoundary
  1046  					if l.boundaryContext != nil {
  1047  						l.boundaryContext.isSyntheticIterBoundsKey = true
  1048  					}
  1049  					return l.largestBoundary, base.LazyValue{}
  1050  				}
  1051  				// Else there are no range deletions in this sstable. This
  1052  				// helps with performance when many levels are populated with
  1053  				// sstables and most don't have any actual keys within the
  1054  				// bounds.
  1055  				return nil, base.LazyValue{}
  1056  			}
  1057  			// If the boundary is a range deletion tombstone, return that key.
  1058  			if l.iterFile.LargestPointKey.Kind() == InternalKeyKindRangeDelete {
  1059  				l.largestBoundary = &l.iterFile.LargestPointKey
  1060  				if l.boundaryContext != nil {
  1061  					l.boundaryContext.isIgnorableBoundaryKey = true
  1062  				}
  1063  				return l.largestBoundary, base.LazyValue{}
  1064  			}
  1065  			// If the last point iterator positioning op might've skipped keys,
  1066  			// it's possible the file's range deletions are still relevant to
  1067  			// other levels. Return the largest boundary as a special ignorable
  1068  			// marker to avoid advancing to the next file.
  1069  			//
  1070  			// The sstable iterator cannot guarantee that keys were skipped. A
  1071  			// SeekGE that lands on a index separator k only knows that the
  1072  			// block at the index entry contains keys ≤ k. We can't know whether
  1073  			// there were actually keys between the seek key and the index
  1074  			// separator key. If the block is then excluded due to block
  1075  			// property filters, the iterator does not know whether keys were
  1076  			// actually skipped by the block's exclusion.
  1077  			//
  1078  			// Since MaybeFilteredKeys cannot guarantee that keys were skipped,
  1079  			// it's possible l.iterFile.Largest was already returned. Returning
  1080  			// l.iterFile.Largest again is a violation of the strict
  1081  			// monotonicity normally provided. The mergingIter's heap can
  1082  			// tolerate this repeat key and in this case will keep the level at
  1083  			// the top of the heap and immediately skip the entry, advancing to
  1084  			// the next file.
  1085  			if *l.rangeDelIterPtr != nil && l.filteredIter != nil &&
  1086  				l.filteredIter.MaybeFilteredKeys() {
  1087  				l.largestBoundary = &l.iterFile.Largest
  1088  				if l.boundaryContext != nil {
  1089  					l.boundaryContext.isIgnorableBoundaryKey = true
  1090  				}
  1091  				return l.largestBoundary, base.LazyValue{}
  1092  			}
  1093  		}
  1094  
  1095  		// Current file was exhausted. Move to the next file.
  1096  		if l.loadFile(l.files.Next(), +1) == noFileLoaded {
  1097  			return nil, base.LazyValue{}
  1098  		}
  1099  	}
  1100  	return key, val
  1101  }
  1102  
  1103  func (l *levelIter) skipEmptyFileBackward() (*InternalKey, base.LazyValue) {
  1104  	var key *InternalKey
  1105  	var val base.LazyValue
  1106  	// The first iteration of this loop starts with an already exhausted
  1107  	// l.iter. The reason for the exhaustion is either that we iterated to the
  1108  	// end of the sstable, or our iteration was terminated early due to the
  1109  	// presence of a lower-bound. If l.rangeDelIterPtr is non-nil, we may need
  1110  	// to pretend the iterator is not exhausted to allow for the merging to
  1111  	// finish consuming the l.rangeDelIterPtr before levelIter switches the
  1112  	// rangeDelIter from under it. This pretense is done by either generating
  1113  	// a synthetic boundary key or returning the smallest key of the file,
  1114  	// depending on the exhaustion reason.
  1115  
  1116  	// Subsequent iterations will examine consecutive files such that the first
  1117  	// file that does not have an exhausted iterator causes the code to return
  1118  	// that key, else the behavior described above if there is a corresponding
  1119  	// rangeDelIterPtr.
  1120  	for ; key == nil; key, val = l.iter.Last() {
  1121  		if l.rangeDelIterPtr != nil {
  1122  			// We're being used as part of a mergingIter and we've exhausted the
  1123  			// current sstable. If a lower bound is present and the lower bound lies
  1124  			// within the current sstable, then we will have reached the lower bound
  1125  			// rather than the beginning of the sstable. We need to return a
  1126  			// synthetic boundary key so that mergingIter can use the range tombstone
  1127  			// iterator until the other levels have reached this boundary.
  1128  			//
  1129  			// It is safe to set the boundary key to the LowerBound user key
  1130  			// with the RANGEDEL sentinel since it is the smallest InternalKey
  1131  			// that is within the inclusive lower bound, and does not
  1132  			// represent a real key.
  1133  			if l.tableOpts.LowerBound != nil {
  1134  				if *l.rangeDelIterPtr != nil {
  1135  					l.syntheticBoundary.UserKey = l.tableOpts.LowerBound
  1136  					l.syntheticBoundary.Trailer = InternalKeyRangeDeleteSentinel
  1137  					l.smallestBoundary = &l.syntheticBoundary
  1138  					if l.boundaryContext != nil {
  1139  						l.boundaryContext.isSyntheticIterBoundsKey = true
  1140  					}
  1141  					return l.smallestBoundary, base.LazyValue{}
  1142  				}
  1143  				// Else there are no range deletions in this sstable. This
  1144  				// helps with performance when many levels are populated with
  1145  				// sstables and most don't have any actual keys within the
  1146  				// bounds.
  1147  				return nil, base.LazyValue{}
  1148  			}
  1149  			// If the boundary is a range deletion tombstone, return that key.
  1150  			if l.iterFile.SmallestPointKey.Kind() == InternalKeyKindRangeDelete {
  1151  				l.smallestBoundary = &l.iterFile.SmallestPointKey
  1152  				if l.boundaryContext != nil {
  1153  					l.boundaryContext.isIgnorableBoundaryKey = true
  1154  				}
  1155  				return l.smallestBoundary, base.LazyValue{}
  1156  			}
  1157  			// If the last point iterator positioning op skipped keys, it's
  1158  			// possible the file's range deletions are still relevant to other
  1159  			// levels. Return the smallest boundary as a special ignorable key
  1160  			// to avoid advancing to the next file.
  1161  			//
  1162  			// The sstable iterator cannot guarantee that keys were skipped.  A
  1163  			// SeekGE that lands on a index separator k only knows that the
  1164  			// block at the index entry contains keys ≤ k. We can't know whether
  1165  			// there were actually keys between the seek key and the index
  1166  			// separator key. If the block is then excluded due to block
  1167  			// property filters, the iterator does not know whether keys were
  1168  			// actually skipped by the block's exclusion.
  1169  			//
  1170  			// Since MaybeFilteredKeys cannot guarantee that keys were skipped,
  1171  			// it's possible l.iterFile.Smallest was already returned. Returning
  1172  			// l.iterFile.Smallest again is a violation of the strict
  1173  			// monotonicity normally provided. The mergingIter's heap can
  1174  			// tolerate this repeat key and in this case will keep the level at
  1175  			// the top of the heap and immediately skip the entry, advancing to
  1176  			// the next file.
  1177  			if *l.rangeDelIterPtr != nil && l.filteredIter != nil && l.filteredIter.MaybeFilteredKeys() {
  1178  				l.smallestBoundary = &l.iterFile.Smallest
  1179  				if l.boundaryContext != nil {
  1180  					l.boundaryContext.isIgnorableBoundaryKey = true
  1181  				}
  1182  				return l.smallestBoundary, base.LazyValue{}
  1183  			}
  1184  		}
  1185  
  1186  		// Current file was exhausted. Move to the previous file.
  1187  		if l.loadFile(l.files.Prev(), -1) == noFileLoaded {
  1188  			return nil, base.LazyValue{}
  1189  		}
  1190  	}
  1191  	return key, val
  1192  }
  1193  
  1194  func (l *levelIter) Error() error {
  1195  	if l.err != nil || l.iter == nil {
  1196  		return l.err
  1197  	}
  1198  	return l.iter.Error()
  1199  }
  1200  
  1201  func (l *levelIter) Close() error {
  1202  	if l.iter != nil {
  1203  		l.err = l.iter.Close()
  1204  		l.iter = nil
  1205  	}
  1206  	if l.rangeDelIterPtr != nil {
  1207  		if t := l.rangeDelIterCopy; t != nil {
  1208  			l.err = firstError(l.err, t.Close())
  1209  		}
  1210  		*l.rangeDelIterPtr = nil
  1211  		l.rangeDelIterCopy = nil
  1212  	}
  1213  	return l.err
  1214  }
  1215  
  1216  func (l *levelIter) SetBounds(lower, upper []byte) {
  1217  	l.lower = lower
  1218  	l.upper = upper
  1219  
  1220  	if l.iter == nil {
  1221  		return
  1222  	}
  1223  
  1224  	// Update tableOpts.{Lower,Upper}Bound in case the new boundaries fall within
  1225  	// the boundaries of the current table.
  1226  	if l.initTableBounds(l.iterFile) != 0 {
  1227  		// The table does not overlap the bounds. Close() will set levelIter.err if
  1228  		// an error occurs.
  1229  		_ = l.Close()
  1230  		return
  1231  	}
  1232  
  1233  	l.iter.SetBounds(l.tableOpts.LowerBound, l.tableOpts.UpperBound)
  1234  }
  1235  
  1236  func (l *levelIter) SetContext(ctx context.Context) {
  1237  	l.ctx = ctx
  1238  	if l.iter != nil {
  1239  		// TODO(sumeer): this is losing the ctx = objiotracing.WithLevel(ctx,
  1240  		// manifest.LevelToInt(opts.level)) that happens in table_cache.go.
  1241  		l.iter.SetContext(ctx)
  1242  	}
  1243  }
  1244  
  1245  func (l *levelIter) String() string {
  1246  	if l.iterFile != nil {
  1247  		return fmt.Sprintf("%s: fileNum=%s", l.level, l.iter.String())
  1248  	}
  1249  	return fmt.Sprintf("%s: fileNum=<nil>", l.level)
  1250  }
  1251  
  1252  var _ internalIterator = &levelIter{}