github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/iterator.go (about)

     1  // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package pebble
     6  
     7  import (
     8  	"bytes"
     9  	"context"
    10  	"io"
    11  	"sync"
    12  	"unsafe"
    13  
    14  	"github.com/cockroachdb/errors"
    15  	"github.com/cockroachdb/pebble/internal/base"
    16  	"github.com/cockroachdb/pebble/internal/bytealloc"
    17  	"github.com/cockroachdb/pebble/internal/fastrand"
    18  	"github.com/cockroachdb/pebble/internal/humanize"
    19  	"github.com/cockroachdb/pebble/internal/invariants"
    20  	"github.com/cockroachdb/pebble/internal/keyspan"
    21  	"github.com/cockroachdb/pebble/internal/manifest"
    22  	"github.com/cockroachdb/pebble/internal/rangekey"
    23  	"github.com/cockroachdb/pebble/sstable"
    24  	"github.com/cockroachdb/redact"
    25  )
    26  
    27  // iterPos describes the state of the internal iterator, in terms of whether it
    28  // is at the position returned to the user (cur), one ahead of the position
    29  // returned (next for forward iteration and prev for reverse iteration). The cur
    30  // position is split into two states, for forward and reverse iteration, since
    31  // we need to differentiate for switching directions.
    32  //
    33  // There is subtlety in what is considered the current position of the Iterator.
    34  // The internal iterator exposes a sequence of internal keys. There is not
    35  // always a single internalIterator position corresponding to the position
    36  // returned to the user. Consider the example:
    37  //
    38  //	a.MERGE.9 a.MERGE.8 a.MERGE.7 a.SET.6 b.DELETE.9 b.DELETE.5 b.SET.4
    39  //	\                                   /
    40  //	  \       Iterator.Key() = 'a'    /
    41  //
    42  // The Iterator exposes one valid position at user key 'a' and the two exhausted
    43  // positions at the beginning and end of iteration. The underlying
    44  // internalIterator contains 7 valid positions and 2 exhausted positions.
    45  //
    46  // Iterator positioning methods must set iterPos to iterPosCur{Forward,Reverse}
    47  // iff the user key at the current internalIterator position equals the
    48  // Iterator.Key returned to the user. This guarantees that a call to nextUserKey
    49  // or prevUserKey will advance to the next or previous iterator position.
    50  // iterPosCur{Forward,Reverse} does not make any guarantee about the internal
    51  // iterator position among internal keys with matching user keys, and it will
    52  // vary subtly depending on the particular key kinds encountered. In the above
    53  // example, the iterator returning 'a' may set iterPosCurForward if the internal
    54  // iterator is positioned at any of a.MERGE.9, a.MERGE.8, a.MERGE.7 or a.SET.6.
    55  //
    56  // When setting iterPos to iterPosNext or iterPosPrev, the internal iterator
    57  // must be advanced to the first internalIterator position at a user key greater
    58  // (iterPosNext) or less (iterPosPrev) than the key returned to the user. An
    59  // internalIterator position that's !Valid() must also be considered greater or
    60  // less—depending on the direction of iteration—than the last valid Iterator
    61  // position.
    62  type iterPos int8
    63  
    64  const (
        	// The four unpaused states. iterPosCurForward and iterPosCurReverse are
        	// the two "cur" states (the internal iterator is at the position returned
        	// to the user, for forward and reverse iteration respectively).
        	// iterPosNext and iterPosPrev mean the internal iterator is one position
        	// ahead of the returned position, in the forward and reverse direction
        	// respectively.
    65  	iterPosCurForward iterPos = 0
    66  	iterPosNext       iterPos = 1
    67  	iterPosPrev       iterPos = -1
    68  	iterPosCurReverse iterPos = -2
    69  
    70  	// For limited iteration. When the iterator is at iterPosCurForwardPaused
    71  	// - Next*() call should behave as if the internal iterator is already
    72  	//   at next (akin to iterPosNext).
    73  	// - Prev*() call should behave as if the internal iterator is at the
    74  	//   current key (akin to iterPosCurForward).
    75  	//
    76  	// Similar semantics apply to iterPosCurReversePaused.
    77  	iterPosCurForwardPaused iterPos = 2
    78  	iterPosCurReversePaused iterPos = -3
    79  )
    80  
    81  // readBytesPeriod is the approximate gap in bytes between samples of data read
    82  // during iteration. It is multiplied with a default ReadSamplingMultiplier of
    83  // 1 << 4 to yield 1 << 20 (1MB). The 1MB factor comes from:
    84  // https://github.com/cockroachdb/pebble/issues/29#issuecomment-494477985
    85  const readBytesPeriod uint64 = 1 << 16
    86  
        // errReversePrefixIteration is the error value stating that reverse prefix
        // iteration is unsupported.
    87  var errReversePrefixIteration = errors.New("pebble: unsupported reverse prefix iteration")
    88  
    89  // IteratorMetrics holds per-iterator metrics. These do not change over the
    90  // lifetime of the iterator.
    91  type IteratorMetrics struct {
    92  	// ReadAmp is the read amplification experienced by this iterator. This is
    93  	// the sum of the memtables, the L0 sublevels and the non-empty Ln levels.
    94  	// Higher read amplification generally results in slower reads, though
    95  	// allowing higher read amplification can also result in faster writes.
    96  	ReadAmp int
    97  }
    98  
    99  // IteratorStatsKind describes the two kinds of iterator stats.
   100  type IteratorStatsKind int8
   101  
   102  const (
   103  	// InterfaceCall represents calls to Iterator.
   104  	InterfaceCall IteratorStatsKind = iota
   105  	// InternalIterCall represents calls by Iterator to its internalIterator.
   106  	InternalIterCall
   107  	// NumStatsKind is the number of kinds, and is used for array sizing.
   108  	NumStatsKind
   109  )
   110  
   111  // IteratorStats contains iteration stats. Each of the count arrays is sized by
        // NumStatsKind, with one slot per IteratorStatsKind.
   112  type IteratorStats struct {
   113  	// ForwardSeekCount includes SeekGE, SeekPrefixGE, First.
   114  	ForwardSeekCount [NumStatsKind]int
   115  	// ReverseSeekCount includes SeekLT, Last.
   116  	ReverseSeekCount [NumStatsKind]int
   117  	// ForwardStepCount includes Next.
   118  	ForwardStepCount [NumStatsKind]int
   119  	// ReverseStepCount includes Prev.
   120  	ReverseStepCount [NumStatsKind]int
        	// InternalStats holds the miscellaneous stats produced by internal
        	// iterators; RangeKeyStats holds the stats about range keys encountered
        	// by the iterator.
   121  	InternalStats    InternalIteratorStats
   122  	RangeKeyStats    RangeKeyIteratorStats
   123  }
   124  
        // Compile-time assertion that *IteratorStats implements redact.SafeFormatter.
   125  var _ redact.SafeFormatter = &IteratorStats{}
   126  
   127  // InternalIteratorStats contains miscellaneous stats produced by internal
   128  // iterators. It is an alias for base.InternalIteratorStats.
   129  type InternalIteratorStats = base.InternalIteratorStats
   130  
   131  // RangeKeyIteratorStats contains miscellaneous stats about range keys
   132  // encountered by the iterator.
   133  type RangeKeyIteratorStats struct {
   134  	// Count records the number of range keys encountered during
   135  	// iteration. Range keys may be counted multiple times if the iterator
   136  	// leaves a range key's bounds and then returns.
   137  	Count int
   138  	// ContainedPoints records the number of point keys encountered within the
   139  	// bounds of a range key. Note that this includes point keys with suffixes
   140  	// that sort both above and below the covering range key's suffix.
   141  	ContainedPoints int
   142  	// SkippedPoints records the count of the subset of ContainedPoints point
   143  	// keys that were skipped during iteration due to range-key masking. It does
   144  	// not include point keys that were never loaded because a
   145  	// RangeKeyMasking.Filter excluded the entire containing block.
   146  	SkippedPoints int
   147  }
   148  
   149  // Merge adds all of the argument's statistics to the receiver. It may be used
   150  // to accumulate stats across multiple iterators.
   151  func (s *RangeKeyIteratorStats) Merge(o RangeKeyIteratorStats) {
   152  	s.Count += o.Count
   153  	s.ContainedPoints += o.ContainedPoints
   154  	s.SkippedPoints += o.SkippedPoints
   155  }
   156  
   157  // LazyValue is a lazy value. It is an alias for base.LazyValue; see the long
        // comment in base.LazyValue.
   158  type LazyValue = base.LazyValue
   159  
   160  // Iterator iterates over a DB's key/value pairs in key order.
   161  //
   162  // An iterator must be closed after use, but it is not necessary to read an
   163  // iterator until exhaustion.
   164  //
   165  // An iterator is not goroutine-safe, but it is safe to use multiple iterators
   166  // concurrently, with each in a dedicated goroutine.
   167  //
   168  // It is also safe to use an iterator concurrently with modifying its
   169  // underlying DB, if that DB permits modification. However, the resultant
   170  // key/value pairs are not guaranteed to be a consistent snapshot of that DB
   171  // at a particular point in time.
   172  //
   173  // If an iterator encounters an error during any operation, it is stored by
   174  // the Iterator and surfaced through the Error method. All absolute
   175  // positioning methods (eg, SeekLT, SeekGE, First, Last, etc) reset any
   176  // accumulated error before positioning. All relative positioning methods (eg,
   177  // Next, Prev) return without advancing if the iterator has an accumulated
   178  // error.
   179  type Iterator struct {
   180  	// The context is stored here since (a) Iterators are expected to be
   181  	// short-lived (since they pin memtables and sstables), (b) plumbing a
   182  	// context into every method is very painful, (c) they do not (yet) respect
   183  	// context cancellation and are only used for tracing.
   184  	ctx       context.Context
   185  	opts      IterOptions
   186  	merge     Merge
        	// comparer provides the Compare, Split and Equal functions that the cmp,
        	// split and equal helper methods delegate to.
   187  	comparer  base.Comparer
        	// iter is the internal iterator that the positioning code steps (eg,
        	// findNextEntry advances it with iter.Next).
   188  	iter      internalIterator
   189  	pointIter internalIterator
   190  	// Either readState or version is set, but not both.
   191  	readState *readState
   192  	version   *version
   193  	// rangeKey holds iteration state specific to iteration over range keys.
   194  	// The range key field may be nil if the Iterator has never been configured
   195  	// to iterate over range keys. Its non-nilness cannot be used to determine
   196  	// if the Iterator is currently iterating over range keys: For that, consult
   197  	// the IterOptions using opts.rangeKeys(). If non-nil, its rangeKeyIter
   198  	// field is guaranteed to be non-nil too.
   199  	rangeKey *iteratorRangeKeyState
   200  	// rangeKeyMasking holds state for range-key masking of point keys.
   201  	rangeKeyMasking rangeKeyMasking
        	// err is the accumulated error, surfaced through the Error method.
   202  	err             error
   203  	// When iterValidityState=IterValid, key represents the current key, which
   204  	// is backed by keyBuf.
   205  	key    []byte
   206  	keyBuf []byte
   207  	value  LazyValue
   208  	// For use in LazyValue.Clone.
   209  	valueBuf []byte
   210  	fetcher  base.LazyFetcher
   211  	// For use in LazyValue.Value.
   212  	lazyValueBuf []byte
   213  	valueCloser  io.Closer
   214  	// boundsBuf holds two buffers used to store the lower and upper bounds.
   215  	// Whenever the Iterator's bounds change, the new bounds are copied into
   216  	// boundsBuf[boundsBufIdx]. The two bounds share a slice to reduce
   217  	// allocations. opts.LowerBound and opts.UpperBound point into this slice.
   218  	boundsBuf    [2][]byte
   219  	boundsBufIdx int
   220  	// iterKey, iterValue reflect the latest position of iter, except when
   221  	// SetBounds is called. In that case, these are explicitly set to nil.
   222  	iterKey             *InternalKey
   223  	iterValue           LazyValue
   224  	alloc               *iterAlloc
   225  	getIterAlloc        *getIterAlloc
   226  	prefixOrFullSeekKey []byte
   227  	readSampling        readSampling
   228  	stats               IteratorStats
   229  	externalReaders     [][]*sstable.Reader
   230  
   231  	// Following fields used when constructing an iterator stack, eg, in Clone
   232  	// and SetOptions or when re-fragmenting a batch's range keys/range dels.
   233  	// Non-nil if this Iterator includes a Batch.
   234  	batch            *Batch
   235  	newIters         tableNewIters
   236  	newIterRangeKey  keyspan.TableNewSpanIter
   237  	lazyCombinedIter lazyCombinedIter
   238  	seqNum           uint64
   239  	// batchSeqNum is used by Iterators over indexed batches to detect when the
   240  	// underlying batch has been mutated. The batch beneath an indexed batch may
   241  	// be mutated while the Iterator is open, but new keys are not surfaced
   242  	// until the next call to SetOptions.
   243  	batchSeqNum uint64
   244  	// batch{PointIter,RangeDelIter,RangeKeyIter} are used when the Iterator is
   245  	// configured to read through an indexed batch. If a batch is set, these
   246  	// iterators will be included within the iterator stack regardless of
   247  	// whether the batch currently contains any keys of their kind. These
   248  	// pointers are used during a call to SetOptions to refresh the Iterator's
   249  	// view of its indexed batch.
   250  	batchPointIter    batchIter
   251  	batchRangeDelIter keyspan.Iter
   252  	batchRangeKeyIter keyspan.Iter
   253  	// merging is a pointer to this iterator's point merging iterator. It
   254  	// appears here because key visibility is handled by the merging iterator.
   255  	// During SetOptions on an iterator over an indexed batch, this field is
   256  	// used to update the merging iterator's batch snapshot.
   257  	merging *mergingIter
   258  
   259  	// Keeping the bools here after all the 8 byte aligned fields shrinks the
   260  	// sizeof this struct by 24 bytes.
   261  
   262  	// INVARIANT:
   263  	// iterValidityState==IterAtLimit <=>
   264  	//  pos==iterPosCurForwardPaused || pos==iterPosCurReversePaused
   265  	iterValidityState IterValidityState
   266  	// Set to true by SetBounds, SetOptions. Causes the Iterator to appear
   267  	// exhausted externally, while preserving the correct iterValidityState for
   268  	// the iterator's internal state. Preserving the correct internal validity
   269  	// is used for SeekPrefixGE(..., trySeekUsingNext), and SeekGE/SeekLT
   270  	// optimizations after "no-op" calls to SetBounds and SetOptions.
   271  	requiresReposition bool
   272  	// The position of iter. When this is iterPos{Prev,Next} the iter has been
   273  	// moved past the current key-value, which can only happen if
   274  	// iterValidityState=IterValid, i.e., there is something to return to the
   275  	// client for the current position.
   276  	pos iterPos
   277  	// Relates to the prefixOrFullSeekKey field above.
   278  	hasPrefix bool
   279  	// Used for deriving the value of SeekPrefixGE(..., trySeekUsingNext),
   280  	// and SeekGE/SeekLT optimizations
   281  	lastPositioningOp lastPositioningOpKind
   282  	// Used for determining when it's safe to perform SeekGE optimizations that
   283  	// reuse the iterator state to avoid the cost of a full seek if the iterator
   284  	// is already positioned in the correct place. If the iterator's view of its
   285  	// indexed batch was just refreshed, some optimizations cannot be applied on
   286  	// the first seek after the refresh:
   287  	// - SeekGE has a no-op optimization that does not seek on the internal
   288  	//   iterator at all if the iterator is already in the correct place.
   289  	//   This optimization cannot be performed if the internal iterator was
   290  	//   last positioned when the iterator had a different view of an
   291  	//   underlying batch.
   292  	// - Seek[Prefix]GE set flags.TrySeekUsingNext()=true when the seek key is
   293  	//   greater than the previous operation's seek key, under the expectation
   294  	//   that the various internal iterators can use their current position to
   295  	//   avoid a full expensive re-seek. This applies to the batchIter as well.
   296  	//   However, if the view of the batch was just refreshed, the batchIter's
   297  	//   position is not useful because it may already be beyond new keys less
   298  	//   than the seek key. To prevent the use of this optimization in
   299  	//   batchIter, Seek[Prefix]GE set flags.BatchJustRefreshed()=true if this
   300  	//   bit is enabled.
   301  	batchJustRefreshed bool
   302  	// Used for an optimization in external iterators to reduce the number of
   303  	// merging levels.
   304  	forwardOnly bool
   305  	// batchOnlyIter is set to true for Batch.NewBatchOnlyIter.
   306  	batchOnlyIter bool
   307  	// closePointIterOnce is set to true if this point iter can only be Close()d
   308  	// once, _and_ closing i.iter and then i.pointIter would close i.pointIter
   309  	// twice. This is necessary to track if the point iter is an internal iterator
   310  	// that could release its resources to a pool on Close(), making it harder for
   311  	// that iterator to make its own closes idempotent.
   312  	//
   313  	// TODO(bilal): Update SetOptions to always close out point key iterators when
   314  	// they won't be used, so that Close() doesn't need to default to closing
   315  	// point iterators twice.
   316  	closePointIterOnce bool
   317  	// Used in some tests to disable the random disabling of seek optimizations.
   318  	forceEnableSeekOpt bool
   319  	// Set to true if NextPrefix is not currently permitted. Defaults to false
   320  	// in case an iterator never had any bounds.
   321  	nextPrefixNotPermittedByUpperBound bool
   322  }
   323  
   324  // cmp is a convenience shorthand for the i.comparer.Compare function.
   325  func (i *Iterator) cmp(a, b []byte) int {
   326  	return i.comparer.Compare(a, b)
   327  }
   328  
   329  // split is a convenience shorthand for the i.comparer.Split function.
   330  func (i *Iterator) split(a []byte) int {
   331  	return i.comparer.Split(a)
   332  }
   333  
   334  // equal is a convenience shorthand for the i.comparer.Equal function.
   335  func (i *Iterator) equal(a, b []byte) bool {
   336  	return i.comparer.Equal(a, b)
   337  }
   338  
   339  // iteratorRangeKeyState holds an iterator's range key iteration state.
   340  type iteratorRangeKeyState struct {
   341  	opts  *IterOptions
   342  	cmp   base.Compare
   343  	split base.Split
   344  	// rangeKeyIter holds the range key iterator stack that iterates over the
   345  	// merged spans across the entirety of the LSM.
   346  	rangeKeyIter keyspan.FragmentIterator
   347  	iiter        keyspan.InterleavingIter
   348  	// stale is set to true when the range key state recorded here (in start,
   349  	// end and keys) may not be in sync with the current range key at the
   350  	// interleaving iterator's current position.
   351  	//
   352  	// When the interleaving iterator passes over a new span, it invokes the
   353  	// SpanChanged hook defined on the `rangeKeyMasking` type, which sets stale
   354  	// to true if the span is non-nil.
   355  	//
   356  	// The parent iterator may not be positioned over the interleaving
   357  	// iterator's current position (eg, i.pos = iterPos{Next,Prev}), so
   358  	// {keys,start,end} are only updated to the new range key during a call to
   359  	// Iterator.saveRangeKey.
   360  	stale bool
   361  	// updated is used to signal to the Iterator client whether the state of
   362  	// range keys has changed since the previous iterator position through the
   363  	// `RangeKeyChanged` method. It's set to true during an Iterator positioning
   364  	// operation that changes the state of the current range key. Each Iterator
   365  	// positioning operation sets it back to false before executing.
   366  	//
   367  	// TODO(jackson): The lifecycle of {stale,updated,prevPosHadRangeKey} is
   368  	// intricate and confusing. Try to refactor to reduce complexity.
   369  	updated bool
   370  	// prevPosHadRangeKey records whether the previous Iterator position had a
   371  	// range key (HasPointAndRange() = (_, true)). It's updated at the beginning
   372  	// of each new Iterator positioning operation. It's required by saveRangeKey
   373  	// to set `updated` appropriately: Without this record of the previous iterator
   374  	// state, it's ambiguous whether an iterator only temporarily stepped onto a
   375  	// position without a range key.
   376  	prevPosHadRangeKey bool
   377  	// rangeKeyOnly is set to true if at the current iterator position there is
   378  	// no point key, only a range key start boundary.
   379  	rangeKeyOnly bool
   380  	// hasRangeKey is true when the current iterator position has a covering
   381  	// range key (eg, a range key with bounds [<lower>,<upper>) such that
   382  	// <lower> ≤ Key() < <upper>).
   383  	hasRangeKey bool
   384  	// start and end are the [start, end) boundaries of the current range keys.
   385  	start []byte
   386  	end   []byte
   387  
   388  	rangeKeyBuffers
   389  
   390  	// iterConfig holds fields that are used for the construction of the
   391  	// iterator stack, but do not need to be directly accessed during iteration.
   392  	// This struct is bundled within the iteratorRangeKeyState struct to reduce
   393  	// allocations.
   394  	iterConfig rangekey.UserIteratorConfig
   395  }
   396  
   397  type rangeKeyBuffers struct {
   398  	// keys is sorted by Suffix ascending.
   399  	keys []RangeKeyData
   400  	// buf is used to save range-key data before moving the range-key iterator.
   401  	// Start and end boundaries, suffixes and values are all copied into buf.
   402  	buf bytealloc.A
   403  	// internal holds buffers used by the range key internal iterators.
   404  	internal rangekey.Buffers
   405  }
   406  
   407  func (b *rangeKeyBuffers) PrepareForReuse() {
   408  	const maxKeysReuse = 100
   409  	if len(b.keys) > maxKeysReuse {
   410  		b.keys = nil
   411  	}
   412  	// Avoid caching the key buf if it is overly large. The constant is
   413  	// fairly arbitrary.
   414  	if cap(b.buf) >= maxKeyBufCacheSize {
   415  		b.buf = nil
   416  	} else {
   417  		b.buf = b.buf[:0]
   418  	}
   419  	b.internal.PrepareForReuse()
   420  }
   421  
   422  func (i *iteratorRangeKeyState) init(cmp base.Compare, split base.Split, opts *IterOptions) {
   423  	i.cmp = cmp
   424  	i.split = split
   425  	i.opts = opts
   426  }
   427  
   428  var iterRangeKeyStateAllocPool = sync.Pool{
   429  	New: func() interface{} {
   430  		return &iteratorRangeKeyState{}
   431  	},
   432  }
   433  
   434  // isEphemeralPosition returns true iff the current iterator position is
   435  // ephemeral, and won't be visited during subsequent relative positioning
   436  // operations.
   437  //
   438  // The iterator position resulting from a SeekGE or SeekPrefixGE that lands on a
   439  // straddling range key without a coincident point key is such a position.
   440  func (i *Iterator) isEphemeralPosition() bool {
   441  	return i.opts.rangeKeys() && i.rangeKey != nil && i.rangeKey.rangeKeyOnly &&
   442  		!i.equal(i.rangeKey.start, i.key)
   443  }
   444  
        // lastPositioningOpKind enumerates the kinds of positioning operation that an
        // Iterator records in its lastPositioningOp field, which is used for seek
        // optimizations.
   445  type lastPositioningOpKind int8
   446  
   447  const (
   448  	unknownLastPositionOp lastPositioningOpKind = iota
   449  	seekPrefixGELastPositioningOp
   450  	seekGELastPositioningOp
   451  	seekLTLastPositioningOp
   452  	// internalNextOp is a special internal iterator positioning operation used
   453  	// by CanDeterministicallySingleDelete. It exists for enforcing requirements
   454  	// around calling CanDeterministicallySingleDelete at most once per external
   455  	// iterator position.
   456  	internalNextOp
   457  	// invalidatedLastPositionOp is similar to unknownLastPositionOp and the
   458  	// only reason to distinguish this is for the wider set of SeekGE
   459  	// optimizations we permit for the external iterator Iterator.forwardOnly
   460  	// case. Most code predicates should be doing equality comparisons with one
   461  	// of the seek* enum values, so this duplication should not result in code
   462  	// of the form:
   463  	//  if unknownLastPositionOp || invalidatedLastPositionOp
   464  	invalidatedLastPositionOp
   465  )
   466  
   467  // Limited iteration mode. Not for use with prefix iteration.
   468  //
   469  // SeekGE, SeekLT, Prev, Next have WithLimit variants, that pause the iterator
   470  // at the limit in a best-effort manner. The client should behave correctly
   471  // even if the limits are ignored. These limits are not "deep", in that they
   472  // are not passed down to the underlying collection of internalIterators. This
   473  // is because the limits are transient, and apply only until the next
   474  // iteration call. They serve mainly as a way to bound the amount of work when
   475  // two (or more) Iterators are being coordinated at a higher level.
   476  //
   477  // In limited iteration mode:
   478  // - Avoid using Iterator.Valid if the last call was to a *WithLimit() method.
   479  //   The return value from the *WithLimit() method provides a more precise
   480  //   disposition.
   481  // - The limit is exclusive for forward and inclusive for reverse.
   482  //
   483  //
   484  // Limited iteration mode & range keys
   485  //
   486  // Limited iteration interacts with range-key iteration. When range key
   487  // iteration is enabled, range keys are interleaved at their start boundaries.
   488  // Limited iteration must ensure that if a range key exists within the limit,
   489  // the iterator visits the range key.
   490  //
   491  // During forward limited iteration, this is trivial: An overlapping range key
   492  // must have a start boundary less than the limit, and the range key's start
   493  // boundary will be interleaved and found to be within the limit.
   494  //
   495  // During reverse limited iteration, the tail of the range key may fall within
   496  // the limit. The range key must be surfaced even if the range key's start
   497  // boundary is less than the limit, and if there are no point keys between the
   498  // current iterator position and the limit. To provide this guarantee, reverse
   499  // limited iteration ignores the limit as long as there is a range key
   500  // overlapping the iteration position.
   501  
   502  // IterValidityState captures the state of the Iterator.
   503  type IterValidityState int8
   504  
   505  const (
   506  	// IterExhausted represents an Iterator that is exhausted.
   507  	IterExhausted IterValidityState = iota
   508  	// IterValid represents an Iterator that is valid.
   509  	IterValid
   510  	// IterAtLimit represents an Iterator that has a non-exhausted
   511  	// internalIterator, but has reached a limit without any key for the
   512  	// caller. During forward iteration, findNextEntry sets this state
        	// together with pos = iterPosCurForwardPaused when the internal
        	// iterator's user key is at or beyond the limit.
   513  	IterAtLimit
   514  )
   515  
   516  // readSampling stores variables used to sample a read to trigger a read
   517  // compaction.
   518  type readSampling struct {
   519  	bytesUntilReadSampling uint64
   520  	initialSamplePassed    bool
   521  	pendingCompactions     readCompactionQueue
   522  	// forceReadSampling is used for testing purposes to force a read sample on every
   523  	// call to Iterator.maybeSampleRead().
   524  	forceReadSampling bool
   525  }
   526  
        // findNextEntry scans forward from the internal iterator's current position
        // (i.iterKey) for the next entry to surface to the caller, and records the
        // outcome in i.iterValidityState:
        //   - IterValid: an entry was found. i.key holds a copy of its user key and
        //     i.value its value (the zero LazyValue when only a range key was found).
        //   - IterAtLimit: limit is non-nil and the internal iterator reached a user
        //     key >= limit; i.pos is set to iterPosCurForwardPaused.
        //   - IterExhausted: the internal iterator ran out of keys, the key prefix no
        //     longer matches during prefix iteration (i.hasPrefix), closing the
        //     previous value's closer failed, or an error was recorded in i.err.
        //
        // Delete-kind keys, keys rejected by opts.SkipPoint and merges that do not
        // yield a valid key are passed over. A nil limit disables the limit check.
   527  func (i *Iterator) findNextEntry(limit []byte) {
   528  	i.iterValidityState = IterExhausted
   529  	i.pos = iterPosCurForward
   530  	if i.opts.rangeKeys() && i.rangeKey != nil {
   531  		i.rangeKey.rangeKeyOnly = false
   532  	}
   533  
   534  	// Close the closer for the current value if one was open.
   535  	if i.closeValueCloser() != nil {
   536  		return
   537  	}
   538  
   539  	for i.iterKey != nil {
   540  		key := *i.iterKey
   541  
   542  		if i.hasPrefix {
        			// Prefix iteration: stop (leaving the iterator exhausted) as soon as
        			// the key's prefix differs from the seek prefix.
   543  			if n := i.split(key.UserKey); !i.equal(i.prefixOrFullSeekKey, key.UserKey[:n]) {
   544  				return
   545  			}
   546  		}
   547  		// Compare with limit every time we start at a different user key.
   548  		// Note that given the best-effort contract of limit, we could avoid a
   549  		// comparison in the common case by doing this only after
   550  		// i.nextUserKey is called for the deletes below. However that makes
   551  		// the behavior non-deterministic (since the behavior will vary based
   552  		// on what has been compacted), which makes it hard to test with the
   553  		// metamorphic test. So we forego that performance optimization.
   554  		if limit != nil && i.cmp(limit, i.iterKey.UserKey) <= 0 {
   555  			i.iterValidityState = IterAtLimit
   556  			i.pos = iterPosCurForwardPaused
   557  			return
   558  		}
   559  
   560  		// If the user has configured a SkipPoint function, invoke it to see
   561  		// whether we should skip over the current user key.
   562  		if i.opts.SkipPoint != nil && key.Kind() != InternalKeyKindRangeKeySet && i.opts.SkipPoint(i.iterKey.UserKey) {
   563  			// NB: We could call nextUserKey, but in some cases the SkipPoint
   564  			// predicate function might be cheaper than nextUserKey's key copy
   565  			// and key comparison. This should be the case for MVCC suffix
   566  			// comparisons, for example. In the future, we could expand the
   567  			// SkipPoint interface to give the implementor more control over
   568  			// whether we skip over just the internal key, the user key, or even
   569  			// the key prefix.
   570  			i.stats.ForwardStepCount[InternalIterCall]++
   571  			i.iterKey, i.iterValue = i.iter.Next()
   572  			continue
   573  		}
   574  
   575  		switch key.Kind() {
   576  		case InternalKeyKindRangeKeySet:
   577  			// Save the current key.
   578  			i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
   579  			i.key = i.keyBuf
   580  			i.value = LazyValue{}
   581  			// There may also be a live point key at this userkey that we have
   582  			// not yet read. We need to find the next entry with this user key
   583  			// to find it. Save the range key so we don't lose it when we Next
   584  			// the underlying iterator.
   585  			i.saveRangeKey()
   586  			pointKeyExists := i.nextPointCurrentUserKey()
   587  			if i.err != nil {
   588  				i.iterValidityState = IterExhausted
   589  				return
   590  			}
   591  			i.rangeKey.rangeKeyOnly = !pointKeyExists
   592  			i.iterValidityState = IterValid
   593  			return
   594  
   595  		case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
   596  			// NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not
   597  			// only simpler, but is also necessary for correctness due to
   598  			// InternalKeyKindSSTableInternalObsoleteBit.
   599  			i.nextUserKey()
   600  			continue
   601  
   602  		case InternalKeyKindSet, InternalKeyKindSetWithDelete:
   603  			i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
   604  			i.key = i.keyBuf
   605  			i.value = i.iterValue
   606  			i.iterValidityState = IterValid
   607  			i.saveRangeKey()
   608  			return
   609  
   610  		case InternalKeyKindMerge:
   611  			// Resolving the merge may advance us to the next point key, which
   612  			// may be covered by a different set of range keys. Save the range
   613  			// key state so we don't lose it.
   614  			i.saveRangeKey()
   615  			if i.mergeForward(key) {
   616  				i.iterValidityState = IterValid
   617  				return
   618  			}
   619  
   620  			// The merge didn't yield a valid key, either because the value
   621  			// merger indicated it should be deleted, or because an error was
   622  			// encountered.
   623  			i.iterValidityState = IterExhausted
   624  			if i.err != nil {
   625  				return
   626  			}
   627  			if i.pos != iterPosNext {
   628  				i.nextUserKey()
   629  			}
   630  			if i.closeValueCloser() != nil {
   631  				return
   632  			}
   633  			i.pos = iterPosCurForward
        			// No return or continue here: control leaves the switch and the
        			// loop resumes the search from the new internal iterator position.
   634  
   635  		default:
   636  			i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind()))
   637  			i.iterValidityState = IterExhausted
   638  			return
   639  		}
   640  	}
   641  }
   642  
        // nextPointCurrentUserKey steps the internal iterator forward once and
        // reports whether it landed on a point key that shares the user key held in
        // i.key and yields a value. findNextEntry calls it after saving a range key to
        // look for a coincident point key.
        //
        // It returns true for a SET or SETWITHDEL key (storing the value in i.value)
        // and whatever mergeForward returns for a MERGE key. It returns false if
        // SkipPoint rejects i.key (in which case the internal iterator is not
        // stepped), if the internal iterator is exhausted or has moved on to a
        // different user key (i.pos is then set to iterPosNext), if the key is a
        // delete kind, or if the key kind is unexpected (i.err is then set to a
        // corruption error).
   643  func (i *Iterator) nextPointCurrentUserKey() bool {
   644  	// If the user has configured a SkipPoint function and the current user key
   645  	// would be skipped by it, there's no need to step forward looking for a
   646  	// point key. If we were to find one, it should be skipped anyways.
   647  	if i.opts.SkipPoint != nil && i.opts.SkipPoint(i.key) {
   648  		return false
   649  	}
   650  
   651  	i.pos = iterPosCurForward
   652  
   653  	i.iterKey, i.iterValue = i.iter.Next()
   654  	i.stats.ForwardStepCount[InternalIterCall]++
   655  	if i.iterKey == nil || !i.equal(i.key, i.iterKey.UserKey) {
        		// The internal iterator is now past every key with the current user
        		// key, which is what iterPosNext denotes.
   656  		i.pos = iterPosNext
   657  		return false
   658  	}
   659  
   660  	key := *i.iterKey
   661  	switch key.Kind() {
   662  	case InternalKeyKindRangeKeySet:
   663  		// RangeKeySets must always be interleaved as the first internal key
   664  		// for a user key.
   665  		i.err = base.CorruptionErrorf("pebble: unexpected range key set mid-user key")
   666  		return false
   667  
   668  	case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
   669  		// NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not
   670  		// only simpler, but is also necessary for correctness due to
   671  		// InternalKeyKindSSTableInternalObsoleteBit.
   672  		return false
   673  
   674  	case InternalKeyKindSet, InternalKeyKindSetWithDelete:
   675  		i.value = i.iterValue
   676  		return true
   677  
   678  	case InternalKeyKindMerge:
   679  		return i.mergeForward(key)
   680  
   681  	default:
   682  		i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind()))
   683  		return false
   684  	}
   685  }
   686  
   687  // mergeForward resolves a MERGE key, advancing the underlying iterator forward
   688  // to merge with subsequent keys with the same userkey. mergeForward returns a
   689  // boolean indicating whether or not the merge yielded a valid key. A merge may
   690  // not yield a valid key if an error occurred, in which case i.err is non-nil,
   691  // or the user's value merger specified the key to be deleted.
   692  //
   693  // mergeForward does not update iterValidityState.
   694  func (i *Iterator) mergeForward(key base.InternalKey) (valid bool) {
   695  	var iterValue []byte
   696  	iterValue, _, i.err = i.iterValue.Value(nil)
   697  	if i.err != nil {
   698  		return false
   699  	}
   700  	var valueMerger ValueMerger
   701  	valueMerger, i.err = i.merge(key.UserKey, iterValue)
   702  	if i.err != nil {
   703  		return false
   704  	}
   705  
   706  	i.mergeNext(key, valueMerger)
   707  	if i.err != nil {
   708  		return false
   709  	}
   710  
   711  	var needDelete bool
   712  	var value []byte
   713  	value, needDelete, i.valueCloser, i.err = finishValueMerger(
   714  		valueMerger, true /* includesBase */)
   715  	i.value = base.MakeInPlaceValue(value)
   716  	if i.err != nil {
   717  		return false
   718  	}
   719  	if needDelete {
   720  		_ = i.closeValueCloser()
   721  		return false
   722  	}
   723  	return true
   724  }
   725  
   726  func (i *Iterator) closeValueCloser() error {
   727  	if i.valueCloser != nil {
   728  		i.err = i.valueCloser.Close()
   729  		i.valueCloser = nil
   730  	}
   731  	return i.err
   732  }
   733  
   734  func (i *Iterator) nextUserKey() {
   735  	if i.iterKey == nil {
   736  		return
   737  	}
   738  	trailer := i.iterKey.Trailer
   739  	done := i.iterKey.Trailer <= base.InternalKeyZeroSeqnumMaxTrailer
   740  	if i.iterValidityState != IterValid {
   741  		i.keyBuf = append(i.keyBuf[:0], i.iterKey.UserKey...)
   742  		i.key = i.keyBuf
   743  	}
   744  	for {
   745  		i.iterKey, i.iterValue = i.iter.Next()
   746  		i.stats.ForwardStepCount[InternalIterCall]++
   747  		// NB: We're guaranteed to be on the next user key if the previous key
   748  		// had a zero sequence number (`done`), or the new key has a trailer
   749  		// greater or equal to the previous key's trailer. This is true because
   750  		// internal keys with the same user key are sorted by Trailer in
   751  		// strictly monotonically descending order. We expect the trailer
   752  		// optimization to trigger around 50% of the time with randomly
   753  		// distributed writes. We expect it to trigger very frequently when
   754  		// iterating through ingested sstables, which contain keys that all have
   755  		// the same sequence number.
   756  		if done || i.iterKey == nil || i.iterKey.Trailer >= trailer {
   757  			break
   758  		}
   759  		if !i.equal(i.key, i.iterKey.UserKey) {
   760  			break
   761  		}
   762  		done = i.iterKey.Trailer <= base.InternalKeyZeroSeqnumMaxTrailer
   763  		trailer = i.iterKey.Trailer
   764  	}
   765  }
   766  
   767  func (i *Iterator) maybeSampleRead() {
   768  	// This method is only called when a public method of Iterator is
   769  	// returning, and below we exclude the case were the iterator is paused at
   770  	// a limit. The effect of these choices is that keys that are deleted, but
   771  	// are encountered during iteration, are not accounted for in the read
   772  	// sampling and will not cause read driven compactions, even though we are
   773  	// incurring cost in iterating over them. And this issue is not limited to
   774  	// Iterator, which does not see the effect of range deletes, which may be
   775  	// causing iteration work in mergingIter. It is not clear at this time
   776  	// whether this is a deficiency worth addressing.
   777  	if i.iterValidityState != IterValid {
   778  		return
   779  	}
   780  	if i.readState == nil {
   781  		return
   782  	}
   783  	if i.readSampling.forceReadSampling {
   784  		i.sampleRead()
   785  		return
   786  	}
   787  	samplingPeriod := int32(int64(readBytesPeriod) * i.readState.db.opts.Experimental.ReadSamplingMultiplier)
   788  	if samplingPeriod <= 0 {
   789  		return
   790  	}
   791  	bytesRead := uint64(len(i.key) + i.value.Len())
   792  	for i.readSampling.bytesUntilReadSampling < bytesRead {
   793  		i.readSampling.bytesUntilReadSampling += uint64(fastrand.Uint32n(2 * uint32(samplingPeriod)))
   794  		// The block below tries to adjust for the case where this is the
   795  		// first read in a newly-opened iterator. As bytesUntilReadSampling
   796  		// starts off at zero, we don't want to sample the first read of
   797  		// every newly-opened iterator, but we do want to sample some of them.
   798  		if !i.readSampling.initialSamplePassed {
   799  			i.readSampling.initialSamplePassed = true
   800  			if fastrand.Uint32n(uint32(i.readSampling.bytesUntilReadSampling)) > uint32(bytesRead) {
   801  				continue
   802  			}
   803  		}
   804  		i.sampleRead()
   805  	}
   806  	i.readSampling.bytesUntilReadSampling -= bytesRead
   807  }
   808  
   809  func (i *Iterator) sampleRead() {
   810  	var topFile *manifest.FileMetadata
   811  	topLevel, numOverlappingLevels := numLevels, 0
   812  	mi := i.merging
   813  	if mi == nil {
   814  		return
   815  	}
   816  	if len(mi.levels) > 1 {
   817  		mi.ForEachLevelIter(func(li *levelIter) bool {
   818  			l := manifest.LevelToInt(li.level)
   819  			if f := li.iterFile; f != nil {
   820  				var containsKey bool
   821  				if i.pos == iterPosNext || i.pos == iterPosCurForward ||
   822  					i.pos == iterPosCurForwardPaused {
   823  					containsKey = i.cmp(f.SmallestPointKey.UserKey, i.key) <= 0
   824  				} else if i.pos == iterPosPrev || i.pos == iterPosCurReverse ||
   825  					i.pos == iterPosCurReversePaused {
   826  					containsKey = i.cmp(f.LargestPointKey.UserKey, i.key) >= 0
   827  				}
   828  				// Do nothing if the current key is not contained in f's
   829  				// bounds. We could seek the LevelIterator at this level
   830  				// to find the right file, but the performance impacts of
   831  				// doing that are significant enough to negate the benefits
   832  				// of read sampling in the first place. See the discussion
   833  				// at:
   834  				// https://github.com/cockroachdb/pebble/pull/1041#issuecomment-763226492
   835  				if containsKey {
   836  					numOverlappingLevels++
   837  					if numOverlappingLevels >= 2 {
   838  						// Terminate the loop early if at least 2 overlapping levels are found.
   839  						return true
   840  					}
   841  					topLevel = l
   842  					topFile = f
   843  				}
   844  			}
   845  			return false
   846  		})
   847  	}
   848  	if topFile == nil || topLevel >= numLevels {
   849  		return
   850  	}
   851  	if numOverlappingLevels >= 2 {
   852  		allowedSeeks := topFile.AllowedSeeks.Add(-1)
   853  		if allowedSeeks == 0 {
   854  
   855  			// Since the compaction queue can handle duplicates, we can keep
   856  			// adding to the queue even once allowedSeeks hits 0.
   857  			// In fact, we NEED to keep adding to the queue, because the queue
   858  			// is small and evicts older and possibly useful compactions.
   859  			topFile.AllowedSeeks.Add(topFile.InitAllowedSeeks)
   860  
   861  			read := readCompaction{
   862  				start:   topFile.SmallestPointKey.UserKey,
   863  				end:     topFile.LargestPointKey.UserKey,
   864  				level:   topLevel,
   865  				fileNum: topFile.FileNum,
   866  			}
   867  			i.readSampling.pendingCompactions.add(&read, i.cmp)
   868  		}
   869  	}
   870  }
   871  
   872  func (i *Iterator) findPrevEntry(limit []byte) {
   873  	i.iterValidityState = IterExhausted
   874  	i.pos = iterPosCurReverse
   875  	if i.opts.rangeKeys() && i.rangeKey != nil {
   876  		i.rangeKey.rangeKeyOnly = false
   877  	}
   878  
   879  	// Close the closer for the current value if one was open.
   880  	if i.valueCloser != nil {
   881  		i.err = i.valueCloser.Close()
   882  		i.valueCloser = nil
   883  		if i.err != nil {
   884  			i.iterValidityState = IterExhausted
   885  			return
   886  		}
   887  	}
   888  
   889  	var valueMerger ValueMerger
   890  	firstLoopIter := true
   891  	rangeKeyBoundary := false
   892  	// The code below compares with limit in multiple places. As documented in
   893  	// findNextEntry, this is being done to make the behavior of limit
   894  	// deterministic to allow for metamorphic testing. It is not required by
   895  	// the best-effort contract of limit.
   896  	for i.iterKey != nil {
   897  		key := *i.iterKey
   898  
   899  		// NB: We cannot pause if the current key is covered by a range key.
   900  		// Otherwise, the user might not ever learn of a range key that covers
   901  		// the key space being iterated over in which there are no point keys.
   902  		// Since limits are best effort, ignoring the limit in this case is
   903  		// allowed by the contract of limit.
   904  		if firstLoopIter && limit != nil && i.cmp(limit, i.iterKey.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) {
   905  			i.iterValidityState = IterAtLimit
   906  			i.pos = iterPosCurReversePaused
   907  			return
   908  		}
   909  		firstLoopIter = false
   910  
   911  		if i.iterValidityState == IterValid {
   912  			if !i.equal(key.UserKey, i.key) {
   913  				// We've iterated to the previous user key.
   914  				i.pos = iterPosPrev
   915  				if valueMerger != nil {
   916  					var needDelete bool
   917  					var value []byte
   918  					value, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, true /* includesBase */)
   919  					i.value = base.MakeInPlaceValue(value)
   920  					if i.err == nil && needDelete {
   921  						// The point key at this key is deleted. If we also have
   922  						// a range key boundary at this key, we still want to
   923  						// return. Otherwise, we need to continue looking for
   924  						// a live key.
   925  						i.value = LazyValue{}
   926  						if rangeKeyBoundary {
   927  							i.rangeKey.rangeKeyOnly = true
   928  						} else {
   929  							i.iterValidityState = IterExhausted
   930  							if i.closeValueCloser() == nil {
   931  								continue
   932  							}
   933  						}
   934  					}
   935  				}
   936  				if i.err != nil {
   937  					i.iterValidityState = IterExhausted
   938  				}
   939  				return
   940  			}
   941  		}
   942  
   943  		// If the user has configured a SkipPoint function, invoke it to see
   944  		// whether we should skip over the current user key.
   945  		if i.opts.SkipPoint != nil && key.Kind() != InternalKeyKindRangeKeySet && i.opts.SkipPoint(key.UserKey) {
   946  			// NB: We could call prevUserKey, but in some cases the SkipPoint
   947  			// predicate function might be cheaper than prevUserKey's key copy
   948  			// and key comparison. This should be the case for MVCC suffix
   949  			// comparisons, for example. In the future, we could expand the
   950  			// SkipPoint interface to give the implementor more control over
   951  			// whether we skip over just the internal key, the user key, or even
   952  			// the key prefix.
   953  			i.stats.ReverseStepCount[InternalIterCall]++
   954  			i.iterKey, i.iterValue = i.iter.Prev()
   955  			if limit != nil && i.iterKey != nil && i.cmp(limit, i.iterKey.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) {
   956  				i.iterValidityState = IterAtLimit
   957  				i.pos = iterPosCurReversePaused
   958  				return
   959  			}
   960  			continue
   961  		}
   962  
   963  		switch key.Kind() {
   964  		case InternalKeyKindRangeKeySet:
   965  			// Range key start boundary markers are interleaved with the maximum
   966  			// sequence number, so if there's a point key also at this key, we
   967  			// must've already iterated over it.
   968  			// This is the final entry at this user key, so we may return
   969  			i.rangeKey.rangeKeyOnly = i.iterValidityState != IterValid
   970  			i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
   971  			i.key = i.keyBuf
   972  			i.iterValidityState = IterValid
   973  			i.saveRangeKey()
   974  			// In all other cases, previous iteration requires advancing to
   975  			// iterPosPrev in order to determine if the key is live and
   976  			// unshadowed by another key at the same user key. In this case,
   977  			// because range key start boundary markers are always interleaved
   978  			// at the maximum sequence number, we know that there aren't any
   979  			// additional keys with the same user key in the backward direction.
   980  			//
   981  			// We Prev the underlying iterator once anyways for consistency, so
   982  			// that we can maintain the invariant during backward iteration that
   983  			// i.iterPos = iterPosPrev.
   984  			i.stats.ReverseStepCount[InternalIterCall]++
   985  			i.iterKey, i.iterValue = i.iter.Prev()
   986  
   987  			// Set rangeKeyBoundary so that on the next iteration, we know to
   988  			// return the key even if the MERGE point key is deleted.
   989  			rangeKeyBoundary = true
   990  
   991  		case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
   992  			i.value = LazyValue{}
   993  			i.iterValidityState = IterExhausted
   994  			valueMerger = nil
   995  			i.iterKey, i.iterValue = i.iter.Prev()
   996  			i.stats.ReverseStepCount[InternalIterCall]++
   997  			// Compare with the limit. We could optimize by only checking when
   998  			// we step to the previous user key, but detecting that requires a
   999  			// comparison too. Note that this position may already passed a
  1000  			// number of versions of this user key, but they are all deleted, so
  1001  			// the fact that a subsequent Prev*() call will not see them is
  1002  			// harmless. Also note that this is the only place in the loop,
  1003  			// other than the firstLoopIter and SkipPoint cases above, where we
  1004  			// could step to a different user key and start processing it for
  1005  			// returning to the caller.
  1006  			if limit != nil && i.iterKey != nil && i.cmp(limit, i.iterKey.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) {
  1007  				i.iterValidityState = IterAtLimit
  1008  				i.pos = iterPosCurReversePaused
  1009  				return
  1010  			}
  1011  			continue
  1012  
  1013  		case InternalKeyKindSet, InternalKeyKindSetWithDelete:
  1014  			i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
  1015  			i.key = i.keyBuf
  1016  			// iterValue is owned by i.iter and could change after the Prev()
  1017  			// call, so use valueBuf instead. Note that valueBuf is only used
  1018  			// in this one instance; everywhere else (eg. in findNextEntry),
  1019  			// we just point i.value to the unsafe i.iter-owned value buffer.
  1020  			i.value, i.valueBuf = i.iterValue.Clone(i.valueBuf[:0], &i.fetcher)
  1021  			i.saveRangeKey()
  1022  			i.iterValidityState = IterValid
  1023  			i.iterKey, i.iterValue = i.iter.Prev()
  1024  			i.stats.ReverseStepCount[InternalIterCall]++
  1025  			valueMerger = nil
  1026  			continue
  1027  
  1028  		case InternalKeyKindMerge:
  1029  			if i.iterValidityState == IterExhausted {
  1030  				i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
  1031  				i.key = i.keyBuf
  1032  				i.saveRangeKey()
  1033  				var iterValue []byte
  1034  				iterValue, _, i.err = i.iterValue.Value(nil)
  1035  				if i.err != nil {
  1036  					return
  1037  				}
  1038  				valueMerger, i.err = i.merge(i.key, iterValue)
  1039  				if i.err != nil {
  1040  					return
  1041  				}
  1042  				i.iterValidityState = IterValid
  1043  			} else if valueMerger == nil {
  1044  				// Extract value before iterValue since we use value before iterValue
  1045  				// and the underlying iterator is not required to provide backing
  1046  				// memory for both simultaneously.
  1047  				var value []byte
  1048  				var callerOwned bool
  1049  				value, callerOwned, i.err = i.value.Value(i.lazyValueBuf)
  1050  				if callerOwned {
  1051  					i.lazyValueBuf = value[:0]
  1052  				}
  1053  				if i.err != nil {
  1054  					return
  1055  				}
  1056  				valueMerger, i.err = i.merge(i.key, value)
  1057  				var iterValue []byte
  1058  				iterValue, _, i.err = i.iterValue.Value(nil)
  1059  				if i.err != nil {
  1060  					return
  1061  				}
  1062  				if i.err == nil {
  1063  					i.err = valueMerger.MergeNewer(iterValue)
  1064  				}
  1065  				if i.err != nil {
  1066  					i.iterValidityState = IterExhausted
  1067  					return
  1068  				}
  1069  			} else {
  1070  				var iterValue []byte
  1071  				iterValue, _, i.err = i.iterValue.Value(nil)
  1072  				if i.err != nil {
  1073  					return
  1074  				}
  1075  				i.err = valueMerger.MergeNewer(iterValue)
  1076  				if i.err != nil {
  1077  					i.iterValidityState = IterExhausted
  1078  					return
  1079  				}
  1080  			}
  1081  			i.iterKey, i.iterValue = i.iter.Prev()
  1082  			i.stats.ReverseStepCount[InternalIterCall]++
  1083  			continue
  1084  
  1085  		default:
  1086  			i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind()))
  1087  			i.iterValidityState = IterExhausted
  1088  			return
  1089  		}
  1090  	}
  1091  
  1092  	// i.iterKey == nil, so broke out of the preceding loop.
  1093  	if i.iterValidityState == IterValid {
  1094  		i.pos = iterPosPrev
  1095  		if valueMerger != nil {
  1096  			var needDelete bool
  1097  			var value []byte
  1098  			value, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, true /* includesBase */)
  1099  			i.value = base.MakeInPlaceValue(value)
  1100  			if i.err == nil && needDelete {
  1101  				i.key = nil
  1102  				i.value = LazyValue{}
  1103  				i.iterValidityState = IterExhausted
  1104  			}
  1105  		}
  1106  		if i.err != nil {
  1107  			i.iterValidityState = IterExhausted
  1108  		}
  1109  	}
  1110  }
  1111  
  1112  func (i *Iterator) prevUserKey() {
  1113  	if i.iterKey == nil {
  1114  		return
  1115  	}
  1116  	if i.iterValidityState != IterValid {
  1117  		// If we're going to compare against the prev key, we need to save the
  1118  		// current key.
  1119  		i.keyBuf = append(i.keyBuf[:0], i.iterKey.UserKey...)
  1120  		i.key = i.keyBuf
  1121  	}
  1122  	for {
  1123  		i.iterKey, i.iterValue = i.iter.Prev()
  1124  		i.stats.ReverseStepCount[InternalIterCall]++
  1125  		if i.iterKey == nil {
  1126  			break
  1127  		}
  1128  		if !i.equal(i.key, i.iterKey.UserKey) {
  1129  			break
  1130  		}
  1131  	}
  1132  }
  1133  
  1134  func (i *Iterator) mergeNext(key InternalKey, valueMerger ValueMerger) {
  1135  	// Save the current key.
  1136  	i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
  1137  	i.key = i.keyBuf
  1138  
  1139  	// Loop looking for older values for this key and merging them.
  1140  	for {
  1141  		i.iterKey, i.iterValue = i.iter.Next()
  1142  		i.stats.ForwardStepCount[InternalIterCall]++
  1143  		if i.iterKey == nil {
  1144  			i.pos = iterPosNext
  1145  			return
  1146  		}
  1147  		key = *i.iterKey
  1148  		if !i.equal(i.key, key.UserKey) {
  1149  			// We've advanced to the next key.
  1150  			i.pos = iterPosNext
  1151  			return
  1152  		}
  1153  		switch key.Kind() {
  1154  		case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
  1155  			// We've hit a deletion tombstone. Return everything up to this
  1156  			// point.
  1157  			//
  1158  			// NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not
  1159  			// only simpler, but is also necessary for correctness due to
  1160  			// InternalKeyKindSSTableInternalObsoleteBit.
  1161  			return
  1162  
  1163  		case InternalKeyKindSet, InternalKeyKindSetWithDelete:
  1164  			// We've hit a Set value. Merge with the existing value and return.
  1165  			var iterValue []byte
  1166  			iterValue, _, i.err = i.iterValue.Value(nil)
  1167  			if i.err != nil {
  1168  				return
  1169  			}
  1170  			i.err = valueMerger.MergeOlder(iterValue)
  1171  			return
  1172  
  1173  		case InternalKeyKindMerge:
  1174  			// We've hit another Merge value. Merge with the existing value and
  1175  			// continue looping.
  1176  			var iterValue []byte
  1177  			iterValue, _, i.err = i.iterValue.Value(nil)
  1178  			if i.err != nil {
  1179  				return
  1180  			}
  1181  			i.err = valueMerger.MergeOlder(iterValue)
  1182  			if i.err != nil {
  1183  				return
  1184  			}
  1185  			continue
  1186  
  1187  		case InternalKeyKindRangeKeySet:
  1188  			// The RANGEKEYSET marker must sort before a MERGE at the same user key.
  1189  			i.err = base.CorruptionErrorf("pebble: out of order range key marker")
  1190  			return
  1191  
  1192  		default:
  1193  			i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind()))
  1194  			return
  1195  		}
  1196  	}
  1197  }
  1198  
  1199  // SeekGE moves the iterator to the first key/value pair whose key is greater
  1200  // than or equal to the given key. Returns true if the iterator is pointing at
  1201  // a valid entry and false otherwise.
  1202  func (i *Iterator) SeekGE(key []byte) bool {
  1203  	return i.SeekGEWithLimit(key, nil) == IterValid
  1204  }
  1205  
  1206  // SeekGEWithLimit moves the iterator to the first key/value pair whose key is
  1207  // greater than or equal to the given key.
  1208  //
  1209  // If limit is provided, it serves as a best-effort exclusive limit. If the
  1210  // first key greater than or equal to the given search key is also greater than
  1211  // or equal to limit, the Iterator may pause and return IterAtLimit. Because
  1212  // limits are best-effort, SeekGEWithLimit may return a key beyond limit.
  1213  //
  1214  // If the Iterator is configured to iterate over range keys, SeekGEWithLimit
  1215  // guarantees it will surface any range keys with bounds overlapping the
  1216  // keyspace [key, limit).
  1217  func (i *Iterator) SeekGEWithLimit(key []byte, limit []byte) IterValidityState {
  1218  	if i.rangeKey != nil {
  1219  		// NB: Check Valid() before clearing requiresReposition.
  1220  		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
  1221  		// If we have a range key but did not expose it at the previous iterator
  1222  		// position (because the iterator was not at a valid position), updated
  1223  		// must be true. This ensures that after an iterator op sequence like:
  1224  		//   - Next()             → (IterValid, RangeBounds() = [a,b))
  1225  		//   - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
  1226  		//   - SeekGE(...)        → (IterValid, RangeBounds() = [a,b))
  1227  		// the iterator returns RangeKeyChanged()=true.
  1228  		//
  1229  		// The remainder of this function will only update i.rangeKey.updated if
  1230  		// the iterator moves into a new range key, or out of the current range
  1231  		// key.
  1232  		i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
  1233  	}
  1234  	lastPositioningOp := i.lastPositioningOp
  1235  	hasPrefix := i.hasPrefix
  1236  	// Set it to unknown, since this operation may not succeed, in which case
  1237  	// the SeekGE following this should not make any assumption about iterator
  1238  	// position.
  1239  	i.lastPositioningOp = unknownLastPositionOp
  1240  	i.requiresReposition = false
  1241  	i.err = nil // clear cached iteration error
  1242  	i.hasPrefix = false
  1243  	i.stats.ForwardSeekCount[InterfaceCall]++
  1244  	if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 {
  1245  		key = lowerBound
  1246  	} else if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 {
  1247  		key = upperBound
  1248  	}
  1249  	seekInternalIter := true
  1250  
  1251  	var flags base.SeekGEFlags
  1252  	if i.batchJustRefreshed {
  1253  		i.batchJustRefreshed = false
  1254  		flags = flags.EnableBatchJustRefreshed()
  1255  	}
  1256  	if lastPositioningOp == seekGELastPositioningOp {
  1257  		cmp := i.cmp(i.prefixOrFullSeekKey, key)
  1258  		// If this seek is to the same or later key, and the iterator is
  1259  		// already positioned there, this is a noop. This can be helpful for
  1260  		// sparse key spaces that have many deleted keys, where one can avoid
  1261  		// the overhead of iterating past them again and again.
  1262  		if cmp <= 0 {
  1263  			if !flags.BatchJustRefreshed() &&
  1264  				(i.iterValidityState == IterExhausted ||
  1265  					(i.iterValidityState == IterValid && i.cmp(key, i.key) <= 0 &&
  1266  						(limit == nil || i.cmp(i.key, limit) < 0))) {
  1267  				// Noop
  1268  				if !invariants.Enabled || !disableSeekOpt(key, uintptr(unsafe.Pointer(i))) || i.forceEnableSeekOpt {
  1269  					i.lastPositioningOp = seekGELastPositioningOp
  1270  					return i.iterValidityState
  1271  				}
  1272  			}
  1273  			// cmp == 0 is not safe to optimize since
  1274  			// - i.pos could be at iterPosNext, due to a merge.
  1275  			// - Even if i.pos were at iterPosCurForward, we could have a DELETE,
  1276  			//   SET pair for a key, and the iterator would have moved past DELETE
  1277  			//   but stayed at iterPosCurForward. A similar situation occurs for a
  1278  			//   MERGE, SET pair where the MERGE is consumed and the iterator is
  1279  			//   at the SET.
  1280  			// We also leverage the IterAtLimit <=> i.pos invariant defined in the
  1281  			// comment on iterValidityState, to exclude any cases where i.pos
  1282  			// is iterPosCur{Forward,Reverse}Paused. This avoids the need to
  1283  			// special-case those iterator positions and their interactions with
  1284  			// TrySeekUsingNext, as the main uses for TrySeekUsingNext in CockroachDB
  1285  			// do not use limited Seeks in the first place.
  1286  			if cmp < 0 && i.iterValidityState != IterAtLimit && limit == nil {
  1287  				flags = flags.EnableTrySeekUsingNext()
  1288  			}
  1289  			if invariants.Enabled && flags.TrySeekUsingNext() && !i.forceEnableSeekOpt && disableSeekOpt(key, uintptr(unsafe.Pointer(i))) {
  1290  				flags = flags.DisableTrySeekUsingNext()
  1291  			}
  1292  			if !flags.BatchJustRefreshed() && i.pos == iterPosCurForwardPaused && i.cmp(key, i.iterKey.UserKey) <= 0 {
  1293  				// Have some work to do, but don't need to seek, and we can
  1294  				// start doing findNextEntry from i.iterKey.
  1295  				seekInternalIter = false
  1296  			}
  1297  		}
  1298  	}
  1299  	// Check for another TrySeekUsingNext optimization opportunity, currently
  1300  	// specifically tailored to external iterators. This case is intended to
  1301  	// trigger in instances of Seek-ing with monotonically increasing keys with
  1302  	// Nexts interspersed. At the time of writing, this is the case for
  1303  	// CockroachDB scans. This optimization is important for external iterators
  1304  	// to avoid re-seeking within an already-exhausted sstable. It is not always
  1305  	// a performance win more generally, so we restrict it to external iterators
  1306  	// that are configured to only use forward positioning operations.
  1307  	//
  1308  	// TODO(jackson): This optimization should be obsolete once we introduce and
  1309  	// use the NextPrefix iterator positioning operation.
  1310  	if seekInternalIter && i.forwardOnly && lastPositioningOp != invalidatedLastPositionOp &&
  1311  		i.pos == iterPosCurForward && !hasPrefix && i.iterValidityState == IterValid &&
  1312  		i.cmp(key, i.iterKey.UserKey) > 0 {
  1313  		flags = flags.EnableTrySeekUsingNext()
  1314  		if invariants.Enabled && flags.TrySeekUsingNext() && !i.forceEnableSeekOpt && disableSeekOpt(key, uintptr(unsafe.Pointer(i))) {
  1315  			flags = flags.DisableTrySeekUsingNext()
  1316  		}
  1317  	}
  1318  	if seekInternalIter {
  1319  		i.iterKey, i.iterValue = i.iter.SeekGE(key, flags)
  1320  		i.stats.ForwardSeekCount[InternalIterCall]++
  1321  	}
  1322  	i.findNextEntry(limit)
  1323  	i.maybeSampleRead()
  1324  	if i.Error() == nil {
  1325  		// Prepare state for a future noop optimization.
  1326  		i.prefixOrFullSeekKey = append(i.prefixOrFullSeekKey[:0], key...)
  1327  		i.lastPositioningOp = seekGELastPositioningOp
  1328  	}
  1329  	return i.iterValidityState
  1330  }
  1331  
  1332  // SeekPrefixGE moves the iterator to the first key/value pair whose key is
  1333  // greater than or equal to the given key and which has the same "prefix" as
  1334  // the given key. The prefix for a key is determined by the user-defined
  1335  // Comparer.Split function. The iterator will not observe keys not matching the
  1336  // "prefix" of the search key. Calling SeekPrefixGE puts the iterator in prefix
  1337  // iteration mode. The iterator remains in prefix iteration until a subsequent
  1338  // call to another absolute positioning method (SeekGE, SeekLT, First,
  1339  // Last). Reverse iteration (Prev) is not supported when an iterator is in
  1340  // prefix iteration mode. Returns true if the iterator is pointing at a valid
  1341  // entry and false otherwise.
  1342  //
  1343  // The semantics of SeekPrefixGE are slightly unusual and designed for
  1344  // iteration to be able to take advantage of bloom filters that have been
  1345  // created on the "prefix". If you're not using bloom filters, there is no
  1346  // reason to use SeekPrefixGE.
  1347  //
  1348  // An example Split function may separate a timestamp suffix from the prefix of
  1349  // the key.
  1350  //
  1351  //	Split(<key>@<timestamp>) -> <key>
  1352  //
  1353  // Consider the keys "a@1", "a@2", "aa@3", "aa@4". The prefixes for these keys
  1354  // are "a", and "aa". Note that despite "a" and "aa" sharing a prefix by the
  1355  // usual definition, those prefixes differ by the definition of the Split
  1356  // function. To see how this works, consider the following set of calls on this
  1357  // data set:
  1358  //
  1359  //	SeekPrefixGE("a@0") -> "a@1"
  1360  //	Next()              -> "a@2"
  1361  //	Next()              -> EOF
  1362  //
  1363  // If you're just looking to iterate over keys with a shared prefix, as
  1364  // defined by the configured comparer, set iterator bounds instead:
  1365  //
  1366  //	iter := db.NewIter(&pebble.IterOptions{
  1367  //	  LowerBound: []byte("prefix"),
  1368  //	  UpperBound: []byte("prefiy"),
  1369  //	})
  1370  //	for iter.First(); iter.Valid(); iter.Next() {
  1371  //	  // Only keys beginning with "prefix" will be visited.
  1372  //	}
  1373  //
  1374  // See ExampleIterator_SeekPrefixGE for a working example.
  1375  //
  1376  // When iterating with range keys enabled, all range keys encountered are
  1377  // truncated to the seek key's prefix's bounds. The truncation of the upper
  1378  // bound requires that the database's Comparer is configured with a
  1379  // ImmediateSuccessor method. For example, a SeekPrefixGE("a@9") call with the
  1380  // prefix "a" will truncate range key bounds to [a,ImmediateSuccessor(a)].
  1381  func (i *Iterator) SeekPrefixGE(key []byte) bool {
  1382  	if i.rangeKey != nil {
  1383  		// NB: Check Valid() before clearing requiresReposition.
  1384  		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
  1385  		// If we have a range key but did not expose it at the previous iterator
  1386  		// position (because the iterator was not at a valid position), updated
  1387  		// must be true. This ensures that after an iterator op sequence like:
  1388  		//   - Next()             → (IterValid, RangeBounds() = [a,b))
  1389  		//   - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
  1390  		//   - SeekPrefixGE(...)  → (IterValid, RangeBounds() = [a,b))
  1391  		// the iterator returns RangeKeyChanged()=true.
  1392  		//
  1393  		// The remainder of this function will only update i.rangeKey.updated if
  1394  		// the iterator moves into a new range key, or out of the current range
  1395  		// key.
  1396  		i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
  1397  	}
  1398  	lastPositioningOp := i.lastPositioningOp
  1399  	// Set it to unknown, since this operation may not succeed, in which case
  1400  	// the SeekPrefixGE following this should not make any assumption about
  1401  	// iterator position.
  1402  	i.lastPositioningOp = unknownLastPositionOp
  1403  	i.requiresReposition = false
  1404  	i.err = nil // clear cached iteration error
  1405  	i.stats.ForwardSeekCount[InterfaceCall]++
  1406  	if i.comparer.Split == nil {
  1407  		panic("pebble: split must be provided for SeekPrefixGE")
  1408  	}
  1409  	if i.comparer.ImmediateSuccessor == nil && i.opts.KeyTypes != IterKeyTypePointsOnly {
  1410  		panic("pebble: ImmediateSuccessor must be provided for SeekPrefixGE with range keys")
  1411  	}
  1412  	prefixLen := i.split(key)
  1413  	keyPrefix := key[:prefixLen]
  1414  	var flags base.SeekGEFlags
  1415  	if i.batchJustRefreshed {
  1416  		flags = flags.EnableBatchJustRefreshed()
  1417  		i.batchJustRefreshed = false
  1418  	}
  1419  	if lastPositioningOp == seekPrefixGELastPositioningOp {
  1420  		if !i.hasPrefix {
  1421  			panic("lastPositioningOpsIsSeekPrefixGE is true, but hasPrefix is false")
  1422  		}
  1423  		// The iterator has not been repositioned after the last SeekPrefixGE.
  1424  		// See if we are seeking to a larger key, since then we can optimize
  1425  		// the seek by using next. Note that we could also optimize if Next
  1426  		// has been called, if the iterator is not exhausted and the current
  1427  		// position is <= the seek key. We are keeping this limited for now
  1428  		// since such optimizations require care for correctness, and to not
  1429  		// become de-optimizations (if one usually has to do all the next
  1430  		// calls and then the seek). This SeekPrefixGE optimization
  1431  		// specifically benefits CockroachDB.
  1432  		cmp := i.cmp(i.prefixOrFullSeekKey, keyPrefix)
  1433  		// cmp == 0 is not safe to optimize since
  1434  		// - i.pos could be at iterPosNext, due to a merge.
  1435  		// - Even if i.pos were at iterPosCurForward, we could have a DELETE,
  1436  		//   SET pair for a key, and the iterator would have moved past DELETE
  1437  		//   but stayed at iterPosCurForward. A similar situation occurs for a
  1438  		//   MERGE, SET pair where the MERGE is consumed and the iterator is
  1439  		//   at the SET.
  1440  		// In general some versions of i.prefix could have been consumed by
  1441  		// the iterator, so we only optimize for cmp < 0.
  1442  		if cmp < 0 {
  1443  			flags = flags.EnableTrySeekUsingNext()
  1444  		}
  1445  		if invariants.Enabled && flags.TrySeekUsingNext() && !i.forceEnableSeekOpt && disableSeekOpt(key, uintptr(unsafe.Pointer(i))) {
  1446  			flags = flags.DisableTrySeekUsingNext()
  1447  		}
  1448  	}
  1449  	// Make a copy of the prefix so that modifications to the key after
  1450  	// SeekPrefixGE returns does not affect the stored prefix.
  1451  	if cap(i.prefixOrFullSeekKey) < prefixLen {
  1452  		i.prefixOrFullSeekKey = make([]byte, prefixLen)
  1453  	} else {
  1454  		i.prefixOrFullSeekKey = i.prefixOrFullSeekKey[:prefixLen]
  1455  	}
  1456  	i.hasPrefix = true
  1457  	copy(i.prefixOrFullSeekKey, keyPrefix)
  1458  
  1459  	if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 {
  1460  		if n := i.split(lowerBound); !bytes.Equal(i.prefixOrFullSeekKey, lowerBound[:n]) {
  1461  			i.err = errors.New("pebble: SeekPrefixGE supplied with key outside of lower bound")
  1462  			i.iterValidityState = IterExhausted
  1463  			return false
  1464  		}
  1465  		key = lowerBound
  1466  	} else if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 {
  1467  		if n := i.split(upperBound); !bytes.Equal(i.prefixOrFullSeekKey, upperBound[:n]) {
  1468  			i.err = errors.New("pebble: SeekPrefixGE supplied with key outside of upper bound")
  1469  			i.iterValidityState = IterExhausted
  1470  			return false
  1471  		}
  1472  		key = upperBound
  1473  	}
  1474  	i.iterKey, i.iterValue = i.iter.SeekPrefixGE(i.prefixOrFullSeekKey, key, flags)
  1475  	i.stats.ForwardSeekCount[InternalIterCall]++
  1476  	i.findNextEntry(nil)
  1477  	i.maybeSampleRead()
  1478  	if i.Error() == nil {
  1479  		i.lastPositioningOp = seekPrefixGELastPositioningOp
  1480  	}
  1481  	return i.iterValidityState == IterValid
  1482  }
  1483  
  1484  // Deterministic disabling of the seek optimizations. It uses the iterator
  1485  // pointer, since we want diversity in iterator behavior for the same key.  Used
  1486  // for tests.
  1487  func disableSeekOpt(key []byte, ptr uintptr) bool {
  1488  	// Fibonacci hash https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
  1489  	simpleHash := (11400714819323198485 * uint64(ptr)) >> 63
  1490  	return key != nil && key[0]&byte(1) == 0 && simpleHash == 0
  1491  }
  1492  
  1493  // SeekLT moves the iterator to the last key/value pair whose key is less than
  1494  // the given key. Returns true if the iterator is pointing at a valid entry and
  1495  // false otherwise.
  1496  func (i *Iterator) SeekLT(key []byte) bool {
  1497  	return i.SeekLTWithLimit(key, nil) == IterValid
  1498  }
  1499  
  1500  // SeekLTWithLimit moves the iterator to the last key/value pair whose key is
  1501  // less than the given key.
  1502  //
  1503  // If limit is provided, it serves as a best-effort inclusive limit. If the last
  1504  // key less than the given search key is also less than limit, the Iterator may
  1505  // pause and return IterAtLimit. Because limits are best-effort, SeekLTWithLimit
  1506  // may return a key beyond limit.
  1507  //
  1508  // If the Iterator is configured to iterate over range keys, SeekLTWithLimit
  1509  // guarantees it will surface any range keys with bounds overlapping the
  1510  // keyspace up to limit.
  1511  func (i *Iterator) SeekLTWithLimit(key []byte, limit []byte) IterValidityState {
  1512  	if i.rangeKey != nil {
  1513  		// NB: Check Valid() before clearing requiresReposition.
  1514  		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
  1515  		// If we have a range key but did not expose it at the previous iterator
  1516  		// position (because the iterator was not at a valid position), updated
  1517  		// must be true. This ensures that after an iterator op sequence like:
  1518  		//   - Next()               → (IterValid, RangeBounds() = [a,b))
  1519  		//   - NextWithLimit(...)   → (IterAtLimit, RangeBounds() = -)
  1520  		//   - SeekLTWithLimit(...) → (IterValid, RangeBounds() = [a,b))
  1521  		// the iterator returns RangeKeyChanged()=true.
  1522  		//
  1523  		// The remainder of this function will only update i.rangeKey.updated if
  1524  		// the iterator moves into a new range key, or out of the current range
  1525  		// key.
  1526  		i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
  1527  	}
  1528  	lastPositioningOp := i.lastPositioningOp
  1529  	// Set it to unknown, since this operation may not succeed, in which case
  1530  	// the SeekLT following this should not make any assumption about iterator
  1531  	// position.
  1532  	i.lastPositioningOp = unknownLastPositionOp
  1533  	i.batchJustRefreshed = false
  1534  	i.requiresReposition = false
  1535  	i.err = nil // clear cached iteration error
  1536  	i.stats.ReverseSeekCount[InterfaceCall]++
  1537  	if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 {
  1538  		key = upperBound
  1539  	} else if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 {
  1540  		key = lowerBound
  1541  	}
  1542  	i.hasPrefix = false
  1543  	seekInternalIter := true
  1544  	// The following noop optimization only applies when i.batch == nil, since
  1545  	// an iterator over a batch is iterating over mutable data, that may have
  1546  	// changed since the last seek.
  1547  	if lastPositioningOp == seekLTLastPositioningOp && i.batch == nil {
  1548  		cmp := i.cmp(key, i.prefixOrFullSeekKey)
  1549  		// If this seek is to the same or earlier key, and the iterator is
  1550  		// already positioned there, this is a noop. This can be helpful for
  1551  		// sparse key spaces that have many deleted keys, where one can avoid
  1552  		// the overhead of iterating past them again and again.
  1553  		if cmp <= 0 {
  1554  			// NB: when pos != iterPosCurReversePaused, the invariant
  1555  			// documented earlier implies that iterValidityState !=
  1556  			// IterAtLimit.
  1557  			if i.iterValidityState == IterExhausted ||
  1558  				(i.iterValidityState == IterValid && i.cmp(i.key, key) < 0 &&
  1559  					(limit == nil || i.cmp(limit, i.key) <= 0)) {
  1560  				if !invariants.Enabled || !disableSeekOpt(key, uintptr(unsafe.Pointer(i))) {
  1561  					i.lastPositioningOp = seekLTLastPositioningOp
  1562  					return i.iterValidityState
  1563  				}
  1564  			}
  1565  			if i.pos == iterPosCurReversePaused && i.cmp(i.iterKey.UserKey, key) < 0 {
  1566  				// Have some work to do, but don't need to seek, and we can
  1567  				// start doing findPrevEntry from i.iterKey.
  1568  				seekInternalIter = false
  1569  			}
  1570  		}
  1571  	}
  1572  	if seekInternalIter {
  1573  		i.iterKey, i.iterValue = i.iter.SeekLT(key, base.SeekLTFlagsNone)
  1574  		i.stats.ReverseSeekCount[InternalIterCall]++
  1575  	}
  1576  	i.findPrevEntry(limit)
  1577  	i.maybeSampleRead()
  1578  	if i.Error() == nil && i.batch == nil {
  1579  		// Prepare state for a future noop optimization.
  1580  		i.prefixOrFullSeekKey = append(i.prefixOrFullSeekKey[:0], key...)
  1581  		i.lastPositioningOp = seekLTLastPositioningOp
  1582  	}
  1583  	return i.iterValidityState
  1584  }
  1585  
  1586  // First moves the iterator the the first key/value pair. Returns true if the
  1587  // iterator is pointing at a valid entry and false otherwise.
  1588  func (i *Iterator) First() bool {
  1589  	if i.rangeKey != nil {
  1590  		// NB: Check Valid() before clearing requiresReposition.
  1591  		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
  1592  		// If we have a range key but did not expose it at the previous iterator
  1593  		// position (because the iterator was not at a valid position), updated
  1594  		// must be true. This ensures that after an iterator op sequence like:
  1595  		//   - Next()             → (IterValid, RangeBounds() = [a,b))
  1596  		//   - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
  1597  		//   - First(...)         → (IterValid, RangeBounds() = [a,b))
  1598  		// the iterator returns RangeKeyChanged()=true.
  1599  		//
  1600  		// The remainder of this function will only update i.rangeKey.updated if
  1601  		// the iterator moves into a new range key, or out of the current range
  1602  		// key.
  1603  		i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
  1604  	}
  1605  	i.err = nil // clear cached iteration error
  1606  	i.hasPrefix = false
  1607  	i.batchJustRefreshed = false
  1608  	i.lastPositioningOp = unknownLastPositionOp
  1609  	i.requiresReposition = false
  1610  	i.stats.ForwardSeekCount[InterfaceCall]++
  1611  
  1612  	i.iterFirstWithinBounds()
  1613  	i.findNextEntry(nil)
  1614  	i.maybeSampleRead()
  1615  	return i.iterValidityState == IterValid
  1616  }
  1617  
  1618  // Last moves the iterator the the last key/value pair. Returns true if the
  1619  // iterator is pointing at a valid entry and false otherwise.
  1620  func (i *Iterator) Last() bool {
  1621  	if i.rangeKey != nil {
  1622  		// NB: Check Valid() before clearing requiresReposition.
  1623  		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
  1624  		// If we have a range key but did not expose it at the previous iterator
  1625  		// position (because the iterator was not at a valid position), updated
  1626  		// must be true. This ensures that after an iterator op sequence like:
  1627  		//   - Next()             → (IterValid, RangeBounds() = [a,b))
  1628  		//   - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
  1629  		//   - Last(...)          → (IterValid, RangeBounds() = [a,b))
  1630  		// the iterator returns RangeKeyChanged()=true.
  1631  		//
  1632  		// The remainder of this function will only update i.rangeKey.updated if
  1633  		// the iterator moves into a new range key, or out of the current range
  1634  		// key.
  1635  		i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
  1636  	}
  1637  	i.err = nil // clear cached iteration error
  1638  	i.hasPrefix = false
  1639  	i.batchJustRefreshed = false
  1640  	i.lastPositioningOp = unknownLastPositionOp
  1641  	i.requiresReposition = false
  1642  	i.stats.ReverseSeekCount[InterfaceCall]++
  1643  
  1644  	i.iterLastWithinBounds()
  1645  	i.findPrevEntry(nil)
  1646  	i.maybeSampleRead()
  1647  	return i.iterValidityState == IterValid
  1648  }
  1649  
  1650  // Next moves the iterator to the next key/value pair. Returns true if the
  1651  // iterator is pointing at a valid entry and false otherwise.
  1652  func (i *Iterator) Next() bool {
  1653  	return i.nextWithLimit(nil) == IterValid
  1654  }
  1655  
  1656  // NextWithLimit moves the iterator to the next key/value pair.
  1657  //
  1658  // If limit is provided, it serves as a best-effort exclusive limit. If the next
  1659  // key  is greater than or equal to limit, the Iterator may pause and return
  1660  // IterAtLimit. Because limits are best-effort, NextWithLimit may return a key
  1661  // beyond limit.
  1662  //
  1663  // If the Iterator is configured to iterate over range keys, NextWithLimit
  1664  // guarantees it will surface any range keys with bounds overlapping the
  1665  // keyspace up to limit.
  1666  func (i *Iterator) NextWithLimit(limit []byte) IterValidityState {
  1667  	return i.nextWithLimit(limit)
  1668  }
  1669  
  1670  // NextPrefix moves the iterator to the next key/value pair with a key
  1671  // containing a different prefix than the current key. Prefixes are determined
  1672  // by Comparer.Split. Exhausts the iterator if invoked while in prefix-iteration
  1673  // mode.
  1674  //
  1675  // It is not permitted to invoke NextPrefix while at a IterAtLimit position.
  1676  // When called in this condition, NextPrefix has non-deterministic behavior.
  1677  //
  1678  // It is not permitted to invoke NextPrefix when the Iterator has an
  1679  // upper-bound that is a versioned MVCC key (see the comment for
  1680  // Comparer.Split). It returns an error in this case.
  1681  func (i *Iterator) NextPrefix() bool {
  1682  	if i.nextPrefixNotPermittedByUpperBound {
  1683  		i.lastPositioningOp = unknownLastPositionOp
  1684  		i.requiresReposition = false
  1685  		i.err = errors.Errorf("NextPrefix not permitted with upper bound %s",
  1686  			i.comparer.FormatKey(i.opts.UpperBound))
  1687  		i.iterValidityState = IterExhausted
  1688  		return false
  1689  	}
  1690  	if i.hasPrefix {
  1691  		i.iterValidityState = IterExhausted
  1692  		return false
  1693  	}
  1694  	return i.nextPrefix() == IterValid
  1695  }
  1696  
  1697  func (i *Iterator) nextPrefix() IterValidityState {
  1698  	if i.rangeKey != nil {
  1699  		// NB: Check Valid() before clearing requiresReposition.
  1700  		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
  1701  		// If we have a range key but did not expose it at the previous iterator
  1702  		// position (because the iterator was not at a valid position), updated
  1703  		// must be true. This ensures that after an iterator op sequence like:
  1704  		//   - Next()             → (IterValid, RangeBounds() = [a,b))
  1705  		//   - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
  1706  		//   - NextWithLimit(...) → (IterValid, RangeBounds() = [a,b))
  1707  		// the iterator returns RangeKeyChanged()=true.
  1708  		//
  1709  		// The remainder of this function will only update i.rangeKey.updated if
  1710  		// the iterator moves into a new range key, or out of the current range
  1711  		// key.
  1712  		i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
  1713  	}
  1714  
  1715  	// Although NextPrefix documents that behavior at IterAtLimit is undefined,
  1716  	// this function handles these cases as a simple prefix-agnostic Next. This
  1717  	// is done for deterministic behavior in the metamorphic tests.
  1718  	//
  1719  	// TODO(jackson): If the metamorphic test operation generator is adjusted to
  1720  	// make generation of some operations conditional on the previous
  1721  	// operations, then we can remove this behavior and explicitly error.
  1722  
  1723  	i.lastPositioningOp = unknownLastPositionOp
  1724  	i.requiresReposition = false
  1725  	switch i.pos {
  1726  	case iterPosCurForward:
  1727  		// Positioned on the current key. Advance to the next prefix.
  1728  		i.internalNextPrefix(i.split(i.key))
  1729  	case iterPosCurForwardPaused:
  1730  		// Positioned at a limit. Implement as a prefix-agnostic Next. See TODO
  1731  		// up above. The iterator is already positioned at the next key.
  1732  	case iterPosCurReverse:
  1733  		// Switching directions.
  1734  		// Unless the iterator was exhausted, reverse iteration needs to
  1735  		// position the iterator at iterPosPrev.
  1736  		if i.iterKey != nil {
  1737  			i.err = errors.New("switching from reverse to forward but iter is not at prev")
  1738  			i.iterValidityState = IterExhausted
  1739  			return i.iterValidityState
  1740  		}
  1741  		// The Iterator is exhausted and i.iter is positioned before the first
  1742  		// key. Reposition to point to the first internal key.
  1743  		i.iterFirstWithinBounds()
  1744  	case iterPosCurReversePaused:
  1745  		// Positioned at a limit. Implement as a prefix-agnostic Next. See TODO
  1746  		// up above.
  1747  		//
  1748  		// Switching directions; The iterator must not be exhausted since it
  1749  		// paused.
  1750  		if i.iterKey == nil {
  1751  			i.err = errors.New("switching paused from reverse to forward but iter is exhausted")
  1752  			i.iterValidityState = IterExhausted
  1753  			return i.iterValidityState
  1754  		}
  1755  		i.nextUserKey()
  1756  	case iterPosPrev:
  1757  		// The underlying iterator is pointed to the previous key (this can
  1758  		// only happen when switching iteration directions).
  1759  		if i.iterKey == nil {
  1760  			// We're positioned before the first key. Need to reposition to point to
  1761  			// the first key.
  1762  			i.iterFirstWithinBounds()
  1763  		} else {
  1764  			// Move the internal iterator back onto the user key stored in
  1765  			// i.key. iterPosPrev guarantees that it's positioned at the last
  1766  			// key with the user key less than i.key, so we're guaranteed to
  1767  			// land on the correct key with a single Next.
  1768  			i.iterKey, i.iterValue = i.iter.Next()
  1769  			if invariants.Enabled && !i.equal(i.iterKey.UserKey, i.key) {
  1770  				i.opts.logger.Fatalf("pebble: invariant violation: Nexting internal iterator from iterPosPrev landed on %q, not %q",
  1771  					i.iterKey.UserKey, i.key)
  1772  			}
  1773  		}
  1774  		// The internal iterator is now positioned at i.key. Advance to the next
  1775  		// prefix.
  1776  		i.internalNextPrefix(i.split(i.key))
  1777  	case iterPosNext:
  1778  		// Already positioned on the next key. Only call nextPrefixKey if the
  1779  		// next key shares the same prefix.
  1780  		if i.iterKey != nil {
  1781  			currKeyPrefixLen := i.split(i.key)
  1782  			iterKeyPrefixLen := i.split(i.iterKey.UserKey)
  1783  			if bytes.Equal(i.iterKey.UserKey[:iterKeyPrefixLen], i.key[:currKeyPrefixLen]) {
  1784  				i.internalNextPrefix(currKeyPrefixLen)
  1785  			}
  1786  		}
  1787  	}
  1788  
  1789  	i.stats.ForwardStepCount[InterfaceCall]++
  1790  	i.findNextEntry(nil /* limit */)
  1791  	i.maybeSampleRead()
  1792  	return i.iterValidityState
  1793  }
  1794  
  1795  func (i *Iterator) internalNextPrefix(currKeyPrefixLen int) {
  1796  	if i.iterKey == nil {
  1797  		return
  1798  	}
  1799  	// The Next "fast-path" is not really a fast-path when there is more than
  1800  	// one version. However, even with TableFormatPebblev3, there is a small
  1801  	// slowdown (~10%) for one version if we remove it and only call NextPrefix.
  1802  	// When there are two versions, only calling NextPrefix is ~30% faster.
  1803  	i.stats.ForwardStepCount[InternalIterCall]++
  1804  	if i.iterKey, i.iterValue = i.iter.Next(); i.iterKey == nil {
  1805  		return
  1806  	}
  1807  	iterKeyPrefixLen := i.split(i.iterKey.UserKey)
  1808  	if !bytes.Equal(i.iterKey.UserKey[:iterKeyPrefixLen], i.key[:currKeyPrefixLen]) {
  1809  		return
  1810  	}
  1811  	i.stats.ForwardStepCount[InternalIterCall]++
  1812  	i.prefixOrFullSeekKey = i.comparer.ImmediateSuccessor(i.prefixOrFullSeekKey[:0], i.key[:currKeyPrefixLen])
  1813  	i.iterKey, i.iterValue = i.iter.NextPrefix(i.prefixOrFullSeekKey)
  1814  	if invariants.Enabled && i.iterKey != nil {
  1815  		if iterKeyPrefixLen := i.split(i.iterKey.UserKey); i.cmp(i.iterKey.UserKey[:iterKeyPrefixLen], i.prefixOrFullSeekKey) < 0 {
  1816  			panic(errors.AssertionFailedf("pebble: iter.NextPrefix did not advance beyond the current prefix: now at %q; expected to be geq %q",
  1817  				i.iterKey, i.prefixOrFullSeekKey))
  1818  		}
  1819  	}
  1820  }
  1821  
  1822  func (i *Iterator) nextWithLimit(limit []byte) IterValidityState {
  1823  	i.stats.ForwardStepCount[InterfaceCall]++
  1824  	if i.hasPrefix {
  1825  		if limit != nil {
  1826  			i.err = errors.New("cannot use limit with prefix iteration")
  1827  			i.iterValidityState = IterExhausted
  1828  			return i.iterValidityState
  1829  		} else if i.iterValidityState == IterExhausted {
  1830  			// No-op, already exhasuted. We avoid executing the Next because it
  1831  			// can break invariants: Specifically, a file that fails the bloom
  1832  			// filter test may result in its level being removed from the
  1833  			// merging iterator. The level's removal can cause a lazy combined
  1834  			// iterator to miss range keys and trigger a switch to combined
  1835  			// iteration at a larger key, breaking keyspan invariants.
  1836  			return i.iterValidityState
  1837  		}
  1838  	}
  1839  	if i.err != nil {
  1840  		return i.iterValidityState
  1841  	}
  1842  	if i.rangeKey != nil {
  1843  		// NB: Check Valid() before clearing requiresReposition.
  1844  		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
  1845  		// If we have a range key but did not expose it at the previous iterator
  1846  		// position (because the iterator was not at a valid position), updated
  1847  		// must be true. This ensures that after an iterator op sequence like:
  1848  		//   - Next()             → (IterValid, RangeBounds() = [a,b))
  1849  		//   - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
  1850  		//   - NextWithLimit(...) → (IterValid, RangeBounds() = [a,b))
  1851  		// the iterator returns RangeKeyChanged()=true.
  1852  		//
  1853  		// The remainder of this function will only update i.rangeKey.updated if
  1854  		// the iterator moves into a new range key, or out of the current range
  1855  		// key.
  1856  		i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
  1857  	}
  1858  	i.lastPositioningOp = unknownLastPositionOp
  1859  	i.requiresReposition = false
  1860  	switch i.pos {
  1861  	case iterPosCurForward:
  1862  		i.nextUserKey()
  1863  	case iterPosCurForwardPaused:
  1864  		// Already at the right place.
  1865  	case iterPosCurReverse:
  1866  		// Switching directions.
  1867  		// Unless the iterator was exhausted, reverse iteration needs to
  1868  		// position the iterator at iterPosPrev.
  1869  		if i.iterKey != nil {
  1870  			i.err = errors.New("switching from reverse to forward but iter is not at prev")
  1871  			i.iterValidityState = IterExhausted
  1872  			return i.iterValidityState
  1873  		}
  1874  		// We're positioned before the first key. Need to reposition to point to
  1875  		// the first key.
  1876  		i.iterFirstWithinBounds()
  1877  	case iterPosCurReversePaused:
  1878  		// Switching directions.
  1879  		// The iterator must not be exhausted since it paused.
  1880  		if i.iterKey == nil {
  1881  			i.err = errors.New("switching paused from reverse to forward but iter is exhausted")
  1882  			i.iterValidityState = IterExhausted
  1883  			return i.iterValidityState
  1884  		}
  1885  		i.nextUserKey()
  1886  	case iterPosPrev:
  1887  		// The underlying iterator is pointed to the previous key (this can
  1888  		// only happen when switching iteration directions). We set
  1889  		// i.iterValidityState to IterExhausted here to force the calls to
  1890  		// nextUserKey to save the current key i.iter is pointing at in order
  1891  		// to determine when the next user-key is reached.
  1892  		i.iterValidityState = IterExhausted
  1893  		if i.iterKey == nil {
  1894  			// We're positioned before the first key. Need to reposition to point to
  1895  			// the first key.
  1896  			i.iterFirstWithinBounds()
  1897  		} else {
  1898  			i.nextUserKey()
  1899  		}
  1900  		i.nextUserKey()
  1901  	case iterPosNext:
  1902  		// Already at the right place.
  1903  	}
  1904  	i.findNextEntry(limit)
  1905  	i.maybeSampleRead()
  1906  	return i.iterValidityState
  1907  }
  1908  
  1909  // Prev moves the iterator to the previous key/value pair. Returns true if the
  1910  // iterator is pointing at a valid entry and false otherwise.
  1911  func (i *Iterator) Prev() bool {
  1912  	return i.PrevWithLimit(nil) == IterValid
  1913  }
  1914  
  1915  // PrevWithLimit moves the iterator to the previous key/value pair.
  1916  //
  1917  // If limit is provided, it serves as a best-effort inclusive limit. If the
  1918  // previous key is less than limit, the Iterator may pause and return
  1919  // IterAtLimit. Because limits are best-effort, PrevWithLimit may return a key
  1920  // beyond limit.
  1921  //
  1922  // If the Iterator is configured to iterate over range keys, PrevWithLimit
  1923  // guarantees it will surface any range keys with bounds overlapping the
  1924  // keyspace up to limit.
  1925  func (i *Iterator) PrevWithLimit(limit []byte) IterValidityState {
  1926  	i.stats.ReverseStepCount[InterfaceCall]++
  1927  	if i.err != nil {
  1928  		return i.iterValidityState
  1929  	}
  1930  	if i.rangeKey != nil {
  1931  		// NB: Check Valid() before clearing requiresReposition.
  1932  		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
  1933  		// If we have a range key but did not expose it at the previous iterator
  1934  		// position (because the iterator was not at a valid position), updated
  1935  		// must be true. This ensures that after an iterator op sequence like:
  1936  		//   - Next()             → (IterValid, RangeBounds() = [a,b))
  1937  		//   - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
  1938  		//   - PrevWithLimit(...) → (IterValid, RangeBounds() = [a,b))
  1939  		// the iterator returns RangeKeyChanged()=true.
  1940  		//
  1941  		// The remainder of this function will only update i.rangeKey.updated if
  1942  		// the iterator moves into a new range key, or out of the current range
  1943  		// key.
  1944  		i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
  1945  	}
  1946  	i.lastPositioningOp = unknownLastPositionOp
  1947  	i.requiresReposition = false
  1948  	if i.hasPrefix {
  1949  		i.err = errReversePrefixIteration
  1950  		i.iterValidityState = IterExhausted
  1951  		return i.iterValidityState
  1952  	}
  1953  	switch i.pos {
  1954  	case iterPosCurForward:
  1955  		// Switching directions, and will handle this below.
  1956  	case iterPosCurForwardPaused:
  1957  		// Switching directions, and will handle this below.
  1958  	case iterPosCurReverse:
  1959  		i.prevUserKey()
  1960  	case iterPosCurReversePaused:
  1961  		// Already at the right place.
  1962  	case iterPosNext:
  1963  		// The underlying iterator is pointed to the next key (this can only happen
  1964  		// when switching iteration directions). We will handle this below.
  1965  	case iterPosPrev:
  1966  		// Already at the right place.
  1967  	}
  1968  	if i.pos == iterPosCurForward || i.pos == iterPosNext || i.pos == iterPosCurForwardPaused {
  1969  		// Switching direction.
  1970  		stepAgain := i.pos == iterPosNext
  1971  
  1972  		// Synthetic range key markers are a special case. Consider SeekGE(b)
  1973  		// which finds a range key [a, c). To ensure the user observes the range
  1974  		// key, the Iterator pauses at Key() = b. The iterator must advance the
  1975  		// internal iterator to see if there's also a coincident point key at
  1976  		// 'b', leaving the iterator at iterPosNext if there's not.
  1977  		//
  1978  		// This is a problem: Synthetic range key markers are only interleaved
  1979  		// during the original seek. A subsequent Prev() of i.iter will not move
  1980  		// back onto the synthetic range key marker. In this case where the
  1981  		// previous iterator position was a synthetic range key start boundary,
  1982  		// we must not step a second time.
  1983  		if i.isEphemeralPosition() {
  1984  			stepAgain = false
  1985  		}
  1986  
  1987  		// We set i.iterValidityState to IterExhausted here to force the calls
  1988  		// to prevUserKey to save the current key i.iter is pointing at in
  1989  		// order to determine when the prev user-key is reached.
  1990  		i.iterValidityState = IterExhausted
  1991  		if i.iterKey == nil {
  1992  			// We're positioned after the last key. Need to reposition to point to
  1993  			// the last key.
  1994  			i.iterLastWithinBounds()
  1995  		} else {
  1996  			i.prevUserKey()
  1997  		}
  1998  		if stepAgain {
  1999  			i.prevUserKey()
  2000  		}
  2001  	}
  2002  	i.findPrevEntry(limit)
  2003  	i.maybeSampleRead()
  2004  	return i.iterValidityState
  2005  }
  2006  
  2007  // iterFirstWithinBounds moves the internal iterator to the first key,
  2008  // respecting bounds.
  2009  func (i *Iterator) iterFirstWithinBounds() {
  2010  	i.stats.ForwardSeekCount[InternalIterCall]++
  2011  	if lowerBound := i.opts.GetLowerBound(); lowerBound != nil {
  2012  		i.iterKey, i.iterValue = i.iter.SeekGE(lowerBound, base.SeekGEFlagsNone)
  2013  	} else {
  2014  		i.iterKey, i.iterValue = i.iter.First()
  2015  	}
  2016  }
  2017  
  2018  // iterLastWithinBounds moves the internal iterator to the last key, respecting
  2019  // bounds.
  2020  func (i *Iterator) iterLastWithinBounds() {
  2021  	i.stats.ReverseSeekCount[InternalIterCall]++
  2022  	if upperBound := i.opts.GetUpperBound(); upperBound != nil {
  2023  		i.iterKey, i.iterValue = i.iter.SeekLT(upperBound, base.SeekLTFlagsNone)
  2024  	} else {
  2025  		i.iterKey, i.iterValue = i.iter.Last()
  2026  	}
  2027  }
  2028  
// RangeKeyData describes a range key's data, set through RangeKeySet. The key
// boundaries of the range key are provided by Iterator.RangeBounds.
type RangeKeyData struct {
	// Suffix is the suffix of the range key.
	Suffix []byte
	// Value is the value stored for the range key at Suffix.
	Value []byte
}
  2035  
  2036  // rangeKeyWithinLimit is called during limited reverse iteration when
  2037  // positioned over a key beyond the limit. If there exists a range key that lies
  2038  // within the limit, the iterator must not pause in order to ensure the user has
  2039  // an opportunity to observe the range key within limit.
  2040  //
  2041  // It would be valid to ignore the limit whenever there's a range key covering
  2042  // the key, but that would introduce nondeterminism. To preserve determinism for
  2043  // testing, the iterator ignores the limit only if the covering range key does
  2044  // cover the keyspace within the limit.
  2045  //
  2046  // This awkwardness exists because range keys are interleaved at their inclusive
  2047  // start positions. Note that limit is inclusive.
  2048  func (i *Iterator) rangeKeyWithinLimit(limit []byte) bool {
  2049  	if i.rangeKey == nil || !i.opts.rangeKeys() {
  2050  		return false
  2051  	}
  2052  	s := i.rangeKey.iiter.Span()
  2053  	// If the range key ends beyond the limit, then the range key does not cover
  2054  	// any portion of the keyspace within the limit and it is safe to pause.
  2055  	return s != nil && i.cmp(s.End, limit) > 0
  2056  }
  2057  
  2058  // saveRangeKey saves the current range key to the underlying iterator's current
  2059  // range key state. If the range key has not changed, saveRangeKey is a no-op.
  2060  // If there is a new range key, saveRangeKey copies all of the key, value and
  2061  // suffixes into Iterator-managed buffers.
  2062  func (i *Iterator) saveRangeKey() {
  2063  	if i.rangeKey == nil || i.opts.KeyTypes == IterKeyTypePointsOnly {
  2064  		return
  2065  	}
  2066  
  2067  	s := i.rangeKey.iiter.Span()
  2068  	if s == nil {
  2069  		i.rangeKey.hasRangeKey = false
  2070  		i.rangeKey.updated = i.rangeKey.prevPosHadRangeKey
  2071  		return
  2072  	} else if !i.rangeKey.stale {
  2073  		// The range key `s` is identical to the one currently saved. No-op.
  2074  		return
  2075  	}
  2076  
  2077  	if s.KeysOrder != keyspan.BySuffixAsc {
  2078  		panic("pebble: range key span's keys unexpectedly not in ascending suffix order")
  2079  	}
  2080  
  2081  	// Although `i.rangeKey.stale` is true, the span s may still be identical
  2082  	// to the currently saved span. This is possible when seeking the iterator,
  2083  	// which may land back on the same range key. If we previously had a range
  2084  	// key and the new one has an identical start key, then it must be the same
  2085  	// range key and we can avoid copying and keep `i.rangeKey.updated=false`.
  2086  	//
  2087  	// TODO(jackson): These key comparisons could be avoidable during relative
  2088  	// positioning operations continuing in the same direction, because these
  2089  	// ops will never encounter the previous position's range key while
  2090  	// stale=true. However, threading whether the current op is a seek or step
  2091  	// maybe isn't worth it. This key comparison is only necessary once when we
  2092  	// step onto a new range key, which should be relatively rare.
  2093  	if i.rangeKey.prevPosHadRangeKey && i.equal(i.rangeKey.start, s.Start) &&
  2094  		i.equal(i.rangeKey.end, s.End) {
  2095  		i.rangeKey.updated = false
  2096  		i.rangeKey.stale = false
  2097  		i.rangeKey.hasRangeKey = true
  2098  		return
  2099  	}
  2100  	i.stats.RangeKeyStats.Count += len(s.Keys)
  2101  	i.rangeKey.buf.Reset()
  2102  	i.rangeKey.hasRangeKey = true
  2103  	i.rangeKey.updated = true
  2104  	i.rangeKey.stale = false
  2105  	i.rangeKey.buf, i.rangeKey.start = i.rangeKey.buf.Copy(s.Start)
  2106  	i.rangeKey.buf, i.rangeKey.end = i.rangeKey.buf.Copy(s.End)
  2107  	i.rangeKey.keys = i.rangeKey.keys[:0]
  2108  	for j := 0; j < len(s.Keys); j++ {
  2109  		if invariants.Enabled {
  2110  			if s.Keys[j].Kind() != base.InternalKeyKindRangeKeySet {
  2111  				panic("pebble: user iteration encountered non-RangeKeySet key kind")
  2112  			} else if j > 0 && i.cmp(s.Keys[j].Suffix, s.Keys[j-1].Suffix) < 0 {
  2113  				panic("pebble: user iteration encountered range keys not in suffix order")
  2114  			}
  2115  		}
  2116  		var rkd RangeKeyData
  2117  		i.rangeKey.buf, rkd.Suffix = i.rangeKey.buf.Copy(s.Keys[j].Suffix)
  2118  		i.rangeKey.buf, rkd.Value = i.rangeKey.buf.Copy(s.Keys[j].Value)
  2119  		i.rangeKey.keys = append(i.rangeKey.keys, rkd)
  2120  	}
  2121  }
  2122  
  2123  // RangeKeyChanged indicates whether the most recent iterator positioning
  2124  // operation resulted in the iterator stepping into or out of a new range key.
  2125  // If true, previously returned range key bounds and data has been invalidated.
  2126  // If false, previously obtained range key bounds, suffix and value slices are
  2127  // still valid and may continue to be read.
  2128  //
  2129  // Invalid iterator positions are considered to not hold range keys, meaning
  2130  // that if an iterator steps from an IterExhausted or IterAtLimit position onto
  2131  // a position with a range key, RangeKeyChanged will yield true.
  2132  func (i *Iterator) RangeKeyChanged() bool {
  2133  	return i.iterValidityState == IterValid && i.rangeKey != nil && i.rangeKey.updated
  2134  }
  2135  
  2136  // HasPointAndRange indicates whether there exists a point key, a range key or
  2137  // both at the current iterator position.
  2138  func (i *Iterator) HasPointAndRange() (hasPoint, hasRange bool) {
  2139  	if i.iterValidityState != IterValid || i.requiresReposition {
  2140  		return false, false
  2141  	}
  2142  	if i.opts.KeyTypes == IterKeyTypePointsOnly {
  2143  		return true, false
  2144  	}
  2145  	return i.rangeKey == nil || !i.rangeKey.rangeKeyOnly, i.rangeKey != nil && i.rangeKey.hasRangeKey
  2146  }
  2147  
  2148  // RangeBounds returns the start (inclusive) and end (exclusive) bounds of the
  2149  // range key covering the current iterator position. RangeBounds returns nil
  2150  // bounds if there is no range key covering the current iterator position, or
  2151  // the iterator is not configured to surface range keys.
  2152  //
  2153  // If valid, the returned start bound is less than or equal to Key() and the
  2154  // returned end bound is greater than Key().
  2155  func (i *Iterator) RangeBounds() (start, end []byte) {
  2156  	if i.rangeKey == nil || !i.opts.rangeKeys() || !i.rangeKey.hasRangeKey {
  2157  		return nil, nil
  2158  	}
  2159  	return i.rangeKey.start, i.rangeKey.end
  2160  }
  2161  
// Key returns the key of the current key/value pair, or nil if done. The
// caller should not modify the contents of the returned slice, and its
// contents may change on the next call to Next.
//
// If positioned at an iterator position that only holds a range key, Key()
// always returns the start bound of the range key. Otherwise, it returns the
// point key's key.
func (i *Iterator) Key() []byte {
	// The slice is returned as-is, without copying.
	return i.key
}
  2172  
// Value returns the value of the current key/value pair, or nil if done. The
// caller should not modify the contents of the returned slice, and its
// contents may change on the next call to Next.
//
// Only valid if HasPointAndRange() returns true for hasPoint.
//
// Deprecated: use ValueAndErr instead.
func (i *Iterator) Value() []byte {
	// The error is intentionally dropped from the return value; ValueAndErr
	// has already recorded it in i.err, which Error() reports.
	val, _ := i.ValueAndErr()
	return val
}
  2183  
  2184  // ValueAndErr returns the value, and any error encountered in extracting the value.
  2185  // REQUIRES: i.Error()==nil and HasPointAndRange() returns true for hasPoint.
  2186  //
  2187  // The caller should not modify the contents of the returned slice, and its
  2188  // contents may change on the next call to Next.
  2189  func (i *Iterator) ValueAndErr() ([]byte, error) {
  2190  	val, callerOwned, err := i.value.Value(i.lazyValueBuf)
  2191  	if err != nil {
  2192  		i.err = err
  2193  	}
  2194  	if callerOwned {
  2195  		i.lazyValueBuf = val[:0]
  2196  	}
  2197  	return val, err
  2198  }
  2199  
// LazyValue returns the LazyValue held for the current position, without
// extracting its bytes. Only for advanced use cases.
// REQUIRES: i.Error()==nil and HasPointAndRange() returns true for hasPoint.
func (i *Iterator) LazyValue() LazyValue {
	return i.value
}
  2205  
  2206  // RangeKeys returns the range key values and their suffixes covering the
  2207  // current iterator position. The range bounds may be retrieved separately
  2208  // through Iterator.RangeBounds().
  2209  func (i *Iterator) RangeKeys() []RangeKeyData {
  2210  	if i.rangeKey == nil || !i.opts.rangeKeys() || !i.rangeKey.hasRangeKey {
  2211  		return nil
  2212  	}
  2213  	return i.rangeKey.keys
  2214  }
  2215  
  2216  // Valid returns true if the iterator is positioned at a valid key/value pair
  2217  // and false otherwise.
  2218  func (i *Iterator) Valid() bool {
  2219  	valid := i.iterValidityState == IterValid && !i.requiresReposition
  2220  	if invariants.Enabled {
  2221  		if err := i.Error(); valid && err != nil {
  2222  			panic(errors.WithSecondaryError(errors.AssertionFailedf("pebble: iterator is valid with non-nil Error"), err))
  2223  		}
  2224  	}
  2225  	return valid
  2226  }
  2227  
  2228  // Error returns any accumulated error.
  2229  func (i *Iterator) Error() error {
  2230  	if i.iter != nil {
  2231  		return firstError(i.err, i.iter.Error())
  2232  	}
  2233  	return i.err
  2234  }
  2235  
// maxKeyBufCacheSize is the capacity at or above which Close drops a key or
// bounds buffer instead of handing it back to the pooled allocation.
const maxKeyBufCacheSize = 4 << 10 // 4 KB
  2237  
// Close closes the iterator and returns any accumulated error. Exhausting
// all the key/value pairs in a table is not considered to be an error.
// It is not valid to call any method, including Close, after the iterator
// has been closed.
//
// Close proceeds in this order: it closes the internal iterators, hands any
// pending read compactions to the DB and releases the read state, unrefs the
// version, closes external readers and the open value closer, and finally
// returns the range key state and the iterator allocation to their pools.
func (i *Iterator) Close() error {
	// Close the child iterator before releasing the readState because when the
	// readState is released sstables referenced by the readState may be deleted
	// which will fail on Windows if the sstables are still open by the child
	// iterator.
	if i.iter != nil {
		i.err = firstError(i.err, i.iter.Close())

		// Closing i.iter did not necessarily close the point and range key
		// iterators. Calls to SetOptions may have 'disconnected' either one
		// from i.iter if iteration key types were changed. Both point and range
		// key iterators are preserved in case the iterator needs to switch key
		// types again. We explicitly close both of these iterators here.
		//
		// NB: If the iterators were still connected to i.iter, they may be
		// closed, but calling Close on a closed internal iterator or fragment
		// iterator is allowed.
		//
		// The point iterator is skipped when closePointIterOnce is set.
		if i.pointIter != nil && !i.closePointIterOnce {
			i.err = firstError(i.err, i.pointIter.Close())
		}
		if i.rangeKey != nil && i.rangeKey.rangeKeyIter != nil {
			i.err = firstError(i.err, i.rangeKey.rangeKeyIter.Close())
		}
	}
	// Errors from the remaining cleanup steps accumulate in err rather than
	// i.err.
	err := i.err

	if i.readState != nil {
		if i.readSampling.pendingCompactions.size > 0 {
			// Copy pending read compactions using db.mu.Lock()
			i.readState.db.mu.Lock()
			i.readState.db.mu.compact.readCompactions.combine(&i.readSampling.pendingCompactions, i.cmp)
			reschedule := i.readState.db.mu.compact.rescheduleReadCompaction
			i.readState.db.mu.compact.rescheduleReadCompaction = false
			concurrentCompactions := i.readState.db.mu.compact.compactingCount
			i.readState.db.mu.Unlock()

			if reschedule && concurrentCompactions == 0 {
				// In a read heavy workload, flushes may not happen frequently enough to
				// schedule compactions.
				i.readState.db.compactionSchedulers.Add(1)
				go i.readState.db.maybeScheduleCompactionAsync()
			}
		}

		i.readState.unref()
		i.readState = nil
	}

	if i.version != nil {
		i.version.Unref()
	}

	for _, readers := range i.externalReaders {
		for _, r := range readers {
			err = firstError(err, r.Close())
		}
	}

	// Close the closer for the current value if one was open.
	if i.valueCloser != nil {
		err = firstError(err, i.valueCloser.Close())
		i.valueCloser = nil
	}

	if i.rangeKey != nil {

		// Reset the range key state to its zero value, keeping only the
		// reusable buffers, before returning it to the pool.
		i.rangeKey.rangeKeyBuffers.PrepareForReuse()
		*i.rangeKey = iteratorRangeKeyState{
			rangeKeyBuffers: i.rangeKey.rangeKeyBuffers,
		}
		iterRangeKeyStateAllocPool.Put(i.rangeKey)
		i.rangeKey = nil
	}
	if alloc := i.alloc; alloc != nil {
		// Avoid caching the key buf if it is overly large. The constant is fairly
		// arbitrary.
		if cap(i.keyBuf) >= maxKeyBufCacheSize {
			alloc.keyBuf = nil
		} else {
			alloc.keyBuf = i.keyBuf
		}
		if cap(i.prefixOrFullSeekKey) >= maxKeyBufCacheSize {
			alloc.prefixOrFullSeekKey = nil
		} else {
			alloc.prefixOrFullSeekKey = i.prefixOrFullSeekKey
		}
		for j := range i.boundsBuf {
			if cap(i.boundsBuf[j]) >= maxKeyBufCacheSize {
				alloc.boundsBuf[j] = nil
			} else {
				alloc.boundsBuf[j] = i.boundsBuf[j]
			}
		}
		// Zero everything in the allocation except the retained buffers, then
		// return it to the pool.
		*alloc = iterAlloc{
			keyBuf:              alloc.keyBuf,
			boundsBuf:           alloc.boundsBuf,
			prefixOrFullSeekKey: alloc.prefixOrFullSeekKey,
		}
		iterAllocPool.Put(alloc)
	} else if alloc := i.getIterAlloc; alloc != nil {
		// Same buffer-retention policy for the getIterAlloc-backed iterator,
		// which only caches the key buffer.
		if cap(i.keyBuf) >= maxKeyBufCacheSize {
			alloc.keyBuf = nil
		} else {
			alloc.keyBuf = i.keyBuf
		}
		*alloc = getIterAlloc{
			keyBuf: alloc.keyBuf,
		}
		getIterAllocPool.Put(alloc)
	}
	return err
}
  2354  
  2355  // SetBounds sets the lower and upper bounds for the iterator. Once SetBounds
  2356  // returns, the caller is free to mutate the provided slices.
  2357  //
  2358  // The iterator will always be invalidated and must be repositioned with a call
  2359  // to SeekGE, SeekPrefixGE, SeekLT, First, or Last.
  2360  func (i *Iterator) SetBounds(lower, upper []byte) {
  2361  	// Ensure that the Iterator appears exhausted, regardless of whether we
  2362  	// actually have to invalidate the internal iterator. Optimizations that
  2363  	// avoid exhaustion are an internal implementation detail that shouldn't
  2364  	// leak through the interface. The caller should still call an absolute
  2365  	// positioning method to reposition the iterator.
  2366  	i.requiresReposition = true
  2367  
  2368  	if ((i.opts.LowerBound == nil) == (lower == nil)) &&
  2369  		((i.opts.UpperBound == nil) == (upper == nil)) &&
  2370  		i.equal(i.opts.LowerBound, lower) &&
  2371  		i.equal(i.opts.UpperBound, upper) {
  2372  		// Unchanged, noop.
  2373  		return
  2374  	}
  2375  
  2376  	// Copy the user-provided bounds into an Iterator-owned buffer, and set them
  2377  	// on i.opts.{Lower,Upper}Bound.
  2378  	i.processBounds(lower, upper)
  2379  
  2380  	i.iter.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
  2381  	// If the iterator has an open point iterator that's not currently being
  2382  	// used, propagate the new bounds to it.
  2383  	if i.pointIter != nil && !i.opts.pointKeys() {
  2384  		i.pointIter.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
  2385  	}
  2386  	// If the iterator has a range key iterator, propagate bounds to it. The
  2387  	// top-level SetBounds on the interleaving iterator (i.iter) won't propagate
  2388  	// bounds to the range key iterator stack, because the FragmentIterator
  2389  	// interface doesn't define a SetBounds method. We need to directly inform
  2390  	// the iterConfig stack.
  2391  	if i.rangeKey != nil {
  2392  		i.rangeKey.iterConfig.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
  2393  	}
  2394  
  2395  	// Even though this is not a positioning operation, the alteration of the
  2396  	// bounds means we cannot optimize Seeks by using Next.
  2397  	i.invalidate()
  2398  }
  2399  
  2400  // SetContext replaces the context provided at iterator creation, or the last
  2401  // one provided by SetContext. Even though iterators are expected to be
  2402  // short-lived, there are some cases where either (a) iterators are used far
  2403  // from the code that created them, (b) iterators are reused (while being
  2404  // short-lived) for processing different requests. For such scenarios, we
  2405  // allow the caller to replace the context.
  2406  func (i *Iterator) SetContext(ctx context.Context) {
  2407  	i.ctx = ctx
  2408  	i.iter.SetContext(ctx)
  2409  	// If the iterator has an open point iterator that's not currently being
  2410  	// used, propagate the new context to it.
  2411  	if i.pointIter != nil && !i.opts.pointKeys() {
  2412  		i.pointIter.SetContext(i.ctx)
  2413  	}
  2414  }
  2415  
  2416  // Initialization and changing of the bounds must call processBounds.
  2417  // processBounds saves the bounds and computes derived state from those
  2418  // bounds.
  2419  func (i *Iterator) processBounds(lower, upper []byte) {
  2420  	// Copy the user-provided bounds into an Iterator-owned buffer. We can't
  2421  	// overwrite the current bounds, because some internal iterators compare old
  2422  	// and new bounds for optimizations.
  2423  
  2424  	buf := i.boundsBuf[i.boundsBufIdx][:0]
  2425  	if lower != nil {
  2426  		buf = append(buf, lower...)
  2427  		i.opts.LowerBound = buf
  2428  	} else {
  2429  		i.opts.LowerBound = nil
  2430  	}
  2431  	i.nextPrefixNotPermittedByUpperBound = false
  2432  	if upper != nil {
  2433  		buf = append(buf, upper...)
  2434  		i.opts.UpperBound = buf[len(buf)-len(upper):]
  2435  		if i.comparer.Split != nil {
  2436  			if i.comparer.Split(i.opts.UpperBound) != len(i.opts.UpperBound) {
  2437  				// Setting an upper bound that is a versioned MVCC key. This means
  2438  				// that a key can have some MVCC versions before the upper bound and
  2439  				// some after. This causes significant complications for NextPrefix,
  2440  				// so we bar the user of NextPrefix.
  2441  				i.nextPrefixNotPermittedByUpperBound = true
  2442  			}
  2443  		}
  2444  	} else {
  2445  		i.opts.UpperBound = nil
  2446  	}
  2447  	i.boundsBuf[i.boundsBufIdx] = buf
  2448  	i.boundsBufIdx = 1 - i.boundsBufIdx
  2449  }
  2450  
// SetOptions sets new iterator options for the iterator. Note that the lower
// and upper bounds applied here will supersede any bounds set by previous calls
// to SetBounds.
//
// Note that the slices provided in this SetOptions must not be changed by the
// caller until the iterator is closed, or a subsequent SetBounds or SetOptions
// has returned. This is because comparisons between the existing and new bounds
// are sometimes used to optimize seeking. See the extended commentary on
// SetBounds.
//
// If the iterator was created over an indexed mutable batch, the iterator's
// view of the mutable batch is refreshed.
//
// The iterator will always be invalidated and must be repositioned with a call
// to SeekGE, SeekPrefixGE, SeekLT, First, or Last.
//
// If only lower and upper bounds need to be modified, prefer SetBounds.
//
// SetOptions panics if the iterator has external readers and o fails
// validateExternalIterOpts.
func (i *Iterator) SetOptions(o *IterOptions) {
	if i.externalReaders != nil {
		if err := validateExternalIterOpts(o); err != nil {
			panic(err)
		}
	}

	// Ensure that the Iterator appears exhausted, regardless of whether we
	// actually have to invalidate the internal iterator. Optimizations that
	// avoid exhaustion are an internal implementation detail that shouldn't
	// leak through the interface. The caller should still call an absolute
	// positioning method to reposition the iterator.
	i.requiresReposition = true

	// Check if global state requires we close all internal iterators.
	//
	// If the Iterator is in an error state, invalidate the existing iterators
	// so that we reconstruct an iterator state from scratch.
	//
	// If OnlyReadGuaranteedDurable changed, the iterator stacks are incorrect,
	// improperly including or excluding memtables. Invalidate them so that
	// finishInitializingIter will reconstruct them.
	//
	// If either the original options or the new options specify a table filter,
	// we need to reconstruct the iterator stacks. If they both supply a table
	// filter, we can't be certain that it's the same filter since we have no
	// mechanism to compare the filter closures.
	closeBoth := i.err != nil ||
		o.OnlyReadGuaranteedDurable != i.opts.OnlyReadGuaranteedDurable ||
		o.TableFilter != nil || i.opts.TableFilter != nil

	// If either options specify block property filters for an iterator stack,
	// reconstruct it.
	if i.pointIter != nil && (closeBoth || len(o.PointKeyFilters) > 0 || len(i.opts.PointKeyFilters) > 0 ||
		o.RangeKeyMasking.Filter != nil || i.opts.RangeKeyMasking.Filter != nil || o.SkipPoint != nil ||
		i.opts.SkipPoint != nil) {
		i.err = firstError(i.err, i.pointIter.Close())
		i.pointIter = nil
	}
	if i.rangeKey != nil {
		if closeBoth || len(o.RangeKeyFilters) > 0 || len(i.opts.RangeKeyFilters) > 0 {
			i.err = firstError(i.err, i.rangeKey.rangeKeyIter.Close())
			i.rangeKey = nil
		} else {
			// If there's still a range key iterator stack, invalidate the
			// iterator. This ensures RangeKeyChanged() returns true if a
			// subsequent positioning operation discovers a range key. It also
			// prevents seek no-op optimizations.
			i.invalidate()
		}
	}

	// If the iterator is backed by a batch that's been mutated, refresh its
	// existing point and range-key iterators, and invalidate the iterator to
	// prevent seek-using-next optimizations. If we don't yet have a point-key
	// iterator or range-key iterator but we require one, it'll be created in
	// the slow path that reconstructs the iterator in finishInitializingIter.
	if i.batch != nil {
		nextBatchSeqNum := (uint64(len(i.batch.data)) | base.InternalKeySeqNumBatch)
		if nextBatchSeqNum != i.batchSeqNum {
			i.batchSeqNum = nextBatchSeqNum
			if i.merging != nil {
				i.merging.batchSnapshot = nextBatchSeqNum
			}
			// Prevent a no-op seek optimization on the next seek. We won't be
			// able to reuse the top-level Iterator state, because it may be
			// incorrect after the inclusion of new batch mutations.
			i.batchJustRefreshed = true
			if i.pointIter != nil && i.batch.countRangeDels > 0 {
				if i.batchRangeDelIter.Count() == 0 {
					// When we constructed this iterator, there were no
					// rangedels in the batch. Iterator construction will
					// have excluded the batch rangedel iterator from the
					// point iterator stack. We need to reconstruct the
					// point iterator to add i.batchRangeDelIter into the
					// iterator stack.
					i.err = firstError(i.err, i.pointIter.Close())
					i.pointIter = nil
				} else {
					// There are range deletions in the batch and we already
					// have a batch rangedel iterator. We can update the
					// batch rangedel iterator in place.
					//
					// NB: There may or may not be new range deletions. We
					// can't tell based on i.batchRangeDelIter.Count(),
					// which is the count of fragmented range deletions, NOT
					// the number of range deletions written to the batch
					// [i.batch.countRangeDels].
					i.batch.initRangeDelIter(&i.opts, &i.batchRangeDelIter, nextBatchSeqNum)
				}
			}
			if i.rangeKey != nil && i.batch.countRangeKeys > 0 {
				if i.batchRangeKeyIter.Count() == 0 {
					// When we constructed this iterator, there were no range
					// keys in the batch. Iterator construction will have
					// excluded the batch rangekey iterator from the range key
					// iterator stack. We need to reconstruct the range key
					// iterator to add i.batchRangeKeyIter into the iterator
					// stack.
					i.err = firstError(i.err, i.rangeKey.rangeKeyIter.Close())
					i.rangeKey = nil
				} else {
					// There are range keys in the batch and we already
					// have a batch rangekey iterator. We can update the batch
					// rangekey iterator in place.
					//
					// NB: There may or may not be new range keys. We can't
					// tell based on i.batchRangeKeyIter.Count(), which is the
					// count of fragmented range keys, NOT the number of
					// range keys written to the batch [i.batch.countRangeKeys].
					i.batch.initRangeKeyIter(&i.opts, &i.batchRangeKeyIter, nextBatchSeqNum)
					i.invalidate()
				}
			}
		}
	}

	// Reset combinedIterState.initialized in case the iterator key types
	// changed. If there's already a range key iterator stack, the combined
	// iterator is already initialized.  Additionally, if the iterator is not
	// configured to include range keys, mark it as initialized to signal that
	// lower level iterators should not trigger a switch to combined iteration.
	i.lazyCombinedIter.combinedIterState = combinedIterState{
		initialized: i.rangeKey != nil || !i.opts.rangeKeys(),
	}

	// Bounds are equal when both sides agree on nil-ness and compare equal
	// under i.equal.
	boundsEqual := ((i.opts.LowerBound == nil) == (o.LowerBound == nil)) &&
		((i.opts.UpperBound == nil) == (o.UpperBound == nil)) &&
		i.equal(i.opts.LowerBound, o.LowerBound) &&
		i.equal(i.opts.UpperBound, o.UpperBound)

	if boundsEqual && o.KeyTypes == i.opts.KeyTypes &&
		(i.pointIter != nil || !i.opts.pointKeys()) &&
		(i.rangeKey != nil || !i.opts.rangeKeys() || i.opts.KeyTypes == IterKeyTypePointsAndRanges) &&
		i.equal(o.RangeKeyMasking.Suffix, i.opts.RangeKeyMasking.Suffix) &&
		o.UseL6Filters == i.opts.UseL6Filters {
		// The options are identical, so we can likely use the fast path. In
		// addition to all the above constraints, we cannot use the fast path if
		// configured to perform lazy combined iteration but an indexed batch
		// used by the iterator now contains range keys. Lazy combined iteration
		// is not compatible with batch range keys because we always need to
		// merge the batch's range keys into iteration.
		if i.rangeKey != nil || !i.opts.rangeKeys() || i.batch == nil || i.batch.countRangeKeys == 0 {
			// Fast path. This preserves the Seek-using-Next optimizations as
			// long as the iterator wasn't already invalidated up above.
			return
		}
	}
	// Slow path.

	// The options changed. Save the new ones to i.opts.
	if boundsEqual {
		// Copying the options into i.opts will overwrite LowerBound and
		// UpperBound fields with the user-provided slices. We need to hold on
		// to the Pebble-owned slices, so save them and re-set them after the
		// copy.
		lower, upper := i.opts.LowerBound, i.opts.UpperBound
		i.opts = *o
		i.opts.LowerBound, i.opts.UpperBound = lower, upper
	} else {
		i.opts = *o
		i.processBounds(o.LowerBound, o.UpperBound)
		// Propagate the changed bounds to the existing point iterator.
		// NB: We propagate i.opts.{Lower,Upper}Bound, not o.{Lower,Upper}Bound
		// because i.opts now point to buffers owned by Pebble.
		if i.pointIter != nil {
			i.pointIter.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
		}
		if i.rangeKey != nil {
			i.rangeKey.iterConfig.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
		}
	}

	// Even though this is not a positioning operation, the invalidation of the
	// iterator stack means we cannot optimize Seeks by using Next.
	i.invalidate()

	// Iterators created through NewExternalIter have a different iterator
	// initialization process.
	if i.externalReaders != nil {
		finishInitializingExternal(i.ctx, i)
		return
	}
	finishInitializingIter(i.ctx, i.alloc)
}
  2653  
  2654  func (i *Iterator) invalidate() {
  2655  	i.lastPositioningOp = invalidatedLastPositionOp
  2656  	i.hasPrefix = false
  2657  	i.iterKey = nil
  2658  	i.iterValue = LazyValue{}
  2659  	i.err = nil
  2660  	// This switch statement isn't necessary for correctness since callers
  2661  	// should call a repositioning method. We could have arbitrarily set i.pos
  2662  	// to one of the values. But it results in more intuitive behavior in
  2663  	// tests, which do not always reposition.
  2664  	switch i.pos {
  2665  	case iterPosCurForward, iterPosNext, iterPosCurForwardPaused:
  2666  		i.pos = iterPosCurForward
  2667  	case iterPosCurReverse, iterPosPrev, iterPosCurReversePaused:
  2668  		i.pos = iterPosCurReverse
  2669  	}
  2670  	i.iterValidityState = IterExhausted
  2671  	if i.rangeKey != nil {
  2672  		i.rangeKey.iiter.Invalidate()
  2673  		i.rangeKey.prevPosHadRangeKey = false
  2674  	}
  2675  }
  2676  
  2677  // Metrics returns per-iterator metrics.
  2678  func (i *Iterator) Metrics() IteratorMetrics {
  2679  	m := IteratorMetrics{
  2680  		ReadAmp: 1,
  2681  	}
  2682  	if mi, ok := i.iter.(*mergingIter); ok {
  2683  		m.ReadAmp = len(mi.levels)
  2684  	}
  2685  	return m
  2686  }
  2687  
  2688  // ResetStats resets the stats to 0.
  2689  func (i *Iterator) ResetStats() {
  2690  	i.stats = IteratorStats{}
  2691  }
  2692  
  2693  // Stats returns the current stats.
  2694  func (i *Iterator) Stats() IteratorStats {
  2695  	return i.stats
  2696  }
  2697  
// CloneOptions configures an iterator constructed through Iterator.Clone or
// Iterator.CloneWithContext.
type CloneOptions struct {
	// IterOptions, if non-nil, define the iterator options to configure a
	// cloned iterator. If nil, the clone adopts the same IterOptions as the
	// iterator being cloned.
	IterOptions *IterOptions
	// RefreshBatchView may be set to true when cloning an Iterator over an
	// indexed batch. When false, the clone adopts the same (possibly stale)
	// view of the indexed batch as the cloned Iterator. When true, the clone is
	// constructed with a refreshed view of the batch, observing all of the
	// batch's mutations at the time of the Clone. If the cloned iterator was
	// not constructed to read over an indexed batch, RefreshBatchView has no
	// effect.
	RefreshBatchView bool
}
  2713  
  2714  // Clone creates a new Iterator over the same underlying data, i.e., over the
  2715  // same {batch, memtables, sstables}). The resulting iterator is not positioned.
  2716  // It starts with the same IterOptions, unless opts.IterOptions is set.
  2717  //
  2718  // When called on an Iterator over an indexed batch, the clone's visibility of
  2719  // the indexed batch is determined by CloneOptions.RefreshBatchView. If false,
  2720  // the clone inherits the iterator's current (possibly stale) view of the batch,
  2721  // and callers may call SetOptions to subsequently refresh the clone's view to
  2722  // include all batch mutations. If true, the clone is constructed with a
  2723  // complete view of the indexed batch's mutations at the time of the Clone.
  2724  //
  2725  // Callers can use Clone if they need multiple iterators that need to see
  2726  // exactly the same underlying state of the DB. This should not be used to
  2727  // extend the lifetime of the data backing the original Iterator since that
  2728  // will cause an increase in memory and disk usage (use NewSnapshot for that
  2729  // purpose).
  2730  func (i *Iterator) Clone(opts CloneOptions) (*Iterator, error) {
  2731  	return i.CloneWithContext(context.Background(), opts)
  2732  }
  2733  
  2734  // CloneWithContext is like Clone, and additionally accepts a context for
  2735  // tracing.
  2736  func (i *Iterator) CloneWithContext(ctx context.Context, opts CloneOptions) (*Iterator, error) {
  2737  	if opts.IterOptions == nil {
  2738  		opts.IterOptions = &i.opts
  2739  	}
  2740  	if i.batchOnlyIter {
  2741  		return nil, errors.Errorf("cannot Clone a batch-only Iterator")
  2742  	}
  2743  	readState := i.readState
  2744  	vers := i.version
  2745  	if readState == nil && vers == nil {
  2746  		return nil, errors.Errorf("cannot Clone a closed Iterator")
  2747  	}
  2748  	// i is already holding a ref, so there is no race with unref here.
  2749  	//
  2750  	// TODO(bilal): If the underlying iterator was created on a snapshot, we could
  2751  	// grab a reference to the current readState instead of reffing the original
  2752  	// readState. This allows us to release references to some zombie sstables
  2753  	// and memtables.
  2754  	if readState != nil {
  2755  		readState.ref()
  2756  	}
  2757  	if vers != nil {
  2758  		vers.Ref()
  2759  	}
  2760  	// Bundle various structures under a single umbrella in order to allocate
  2761  	// them together.
  2762  	buf := iterAllocPool.Get().(*iterAlloc)
  2763  	dbi := &buf.dbi
  2764  	*dbi = Iterator{
  2765  		ctx:                 ctx,
  2766  		opts:                *opts.IterOptions,
  2767  		alloc:               buf,
  2768  		merge:               i.merge,
  2769  		comparer:            i.comparer,
  2770  		readState:           readState,
  2771  		version:             vers,
  2772  		keyBuf:              buf.keyBuf,
  2773  		prefixOrFullSeekKey: buf.prefixOrFullSeekKey,
  2774  		boundsBuf:           buf.boundsBuf,
  2775  		batch:               i.batch,
  2776  		batchSeqNum:         i.batchSeqNum,
  2777  		newIters:            i.newIters,
  2778  		newIterRangeKey:     i.newIterRangeKey,
  2779  		seqNum:              i.seqNum,
  2780  	}
  2781  	dbi.processBounds(dbi.opts.LowerBound, dbi.opts.UpperBound)
  2782  
  2783  	// If the caller requested the clone have a current view of the indexed
  2784  	// batch, set the clone's batch sequence number appropriately.
  2785  	if i.batch != nil && opts.RefreshBatchView {
  2786  		dbi.batchSeqNum = (uint64(len(i.batch.data)) | base.InternalKeySeqNumBatch)
  2787  	}
  2788  
  2789  	return finishInitializingIter(ctx, buf), nil
  2790  }
  2791  
  2792  // Merge adds all of the argument's statistics to the receiver. It may be used
  2793  // to accumulate stats across multiple iterators.
  2794  func (stats *IteratorStats) Merge(o IteratorStats) {
  2795  	for i := InterfaceCall; i < NumStatsKind; i++ {
  2796  		stats.ForwardSeekCount[i] += o.ForwardSeekCount[i]
  2797  		stats.ReverseSeekCount[i] += o.ReverseSeekCount[i]
  2798  		stats.ForwardStepCount[i] += o.ForwardStepCount[i]
  2799  		stats.ReverseStepCount[i] += o.ReverseStepCount[i]
  2800  	}
  2801  	stats.InternalStats.Merge(o.InternalStats)
  2802  	stats.RangeKeyStats.Merge(o.RangeKeyStats)
  2803  }
  2804  
  2805  func (stats *IteratorStats) String() string {
  2806  	return redact.StringWithoutMarkers(stats)
  2807  }
  2808  
  2809  // SafeFormat implements the redact.SafeFormatter interface.
  2810  func (stats *IteratorStats) SafeFormat(s redact.SafePrinter, verb rune) {
  2811  	for i := range stats.ForwardStepCount {
  2812  		switch IteratorStatsKind(i) {
  2813  		case InterfaceCall:
  2814  			s.SafeString("(interface (dir, seek, step): ")
  2815  		case InternalIterCall:
  2816  			s.SafeString(", (internal (dir, seek, step): ")
  2817  		}
  2818  		s.Printf("(fwd, %d, %d), (rev, %d, %d))",
  2819  			redact.Safe(stats.ForwardSeekCount[i]), redact.Safe(stats.ForwardStepCount[i]),
  2820  			redact.Safe(stats.ReverseSeekCount[i]), redact.Safe(stats.ReverseStepCount[i]))
  2821  	}
  2822  	if stats.InternalStats != (InternalIteratorStats{}) {
  2823  		s.SafeString(",\n(internal-stats: ")
  2824  		s.Printf("(block-bytes: (total %s, cached %s, read-time %s)), "+
  2825  			"(points: (count %s, key-bytes %s, value-bytes %s, tombstoned %s))",
  2826  			humanize.Bytes.Uint64(stats.InternalStats.BlockBytes),
  2827  			humanize.Bytes.Uint64(stats.InternalStats.BlockBytesInCache),
  2828  			humanize.FormattedString(stats.InternalStats.BlockReadDuration.String()),
  2829  			humanize.Count.Uint64(stats.InternalStats.PointCount),
  2830  			humanize.Bytes.Uint64(stats.InternalStats.KeyBytes),
  2831  			humanize.Bytes.Uint64(stats.InternalStats.ValueBytes),
  2832  			humanize.Count.Uint64(stats.InternalStats.PointsCoveredByRangeTombstones),
  2833  		)
  2834  		if stats.InternalStats.SeparatedPointValue.Count != 0 {
  2835  			s.Printf(", (separated: (count %s, bytes %s, fetched %s)))",
  2836  				humanize.Count.Uint64(stats.InternalStats.SeparatedPointValue.Count),
  2837  				humanize.Bytes.Uint64(stats.InternalStats.SeparatedPointValue.ValueBytes),
  2838  				humanize.Bytes.Uint64(stats.InternalStats.SeparatedPointValue.ValueBytesFetched))
  2839  		} else {
  2840  			s.Printf(")")
  2841  		}
  2842  	}
  2843  	if stats.RangeKeyStats != (RangeKeyIteratorStats{}) {
  2844  		s.SafeString(",\n(range-key-stats: ")
  2845  		s.Printf("(count %d), (contained points: (count %d, skipped %d)))",
  2846  			stats.RangeKeyStats.Count,
  2847  			stats.RangeKeyStats.ContainedPoints,
  2848  			stats.RangeKeyStats.SkippedPoints)
  2849  	}
  2850  }
  2851  
  2852  // CanDeterministicallySingleDelete takes a valid iterator and examines internal
  2853  // state to determine if a SingleDelete deleting Iterator.Key() would
  2854  // deterministically delete the key. CanDeterministicallySingleDelete requires
  2855  // the iterator to be oriented in the forward direction (eg, the last
  2856  // positioning operation must've been a First, a Seek[Prefix]GE, or a
  2857  // Next[Prefix][WithLimit]).
  2858  //
  2859  // This function does not change the external position of the iterator, and all
  2860  // positioning methods should behave the same as if it was never called. This
  2861  // function will only return a meaningful result the first time it's invoked at
  2862  // an iterator position. This function invalidates the iterator Value's memory,
  2863  // and the caller must not rely on the memory safety of the previous Iterator
  2864  // position.
  2865  //
  2866  // If CanDeterministicallySingleDelete returns true AND the key at the iterator
  2867  // position is not modified between the creation of the Iterator and the commit
  2868  // of a batch containing a SingleDelete over the key, then the caller can be
  2869  // assured that SingleDelete is equivalent to Delete on the local engine, but it
  2870  // may not be true on another engine that received the same writes and with
  2871  // logically equivalent state since this engine may have collapsed multiple SETs
  2872  // into one.
  2873  func CanDeterministicallySingleDelete(it *Iterator) (bool, error) {
  2874  	// This function may only be called once per external iterator position. We
  2875  	// can validate this by checking the last positioning operation.
  2876  	if it.lastPositioningOp == internalNextOp {
  2877  		return false, errors.New("pebble: CanDeterministicallySingleDelete called twice")
  2878  	}
  2879  	validity, kind := it.internalNext()
  2880  	var shadowedBySingleDelete bool
  2881  	for validity == internalNextValid {
  2882  		switch kind {
  2883  		case InternalKeyKindDelete, InternalKeyKindDeleteSized:
  2884  			// A DEL or DELSIZED tombstone is okay. An internal key
  2885  			// sequence like SINGLEDEL; SET; DEL; SET can be handled
  2886  			// deterministically. If there are SETs further down, we
  2887  			// don't care about them.
  2888  			return true, nil
  2889  		case InternalKeyKindSingleDelete:
  2890  			// A SingleDelete is okay as long as when that SingleDelete was
  2891  			// written, it was written deterministically (eg, with its own
  2892  			// CanDeterministicallySingleDelete check). Validate that it was
  2893  			// written deterministically. We'll allow one set to appear after
  2894  			// the SingleDelete.
  2895  			shadowedBySingleDelete = true
  2896  			validity, kind = it.internalNext()
  2897  			continue
  2898  		case InternalKeyKindSet, InternalKeyKindSetWithDelete, InternalKeyKindMerge:
  2899  			// If we observed a single delete, it's allowed to delete 1 key.
  2900  			// We'll keep looping to validate that the internal keys beneath the
  2901  			// already-written single delete are copacetic.
  2902  			if shadowedBySingleDelete {
  2903  				shadowedBySingleDelete = false
  2904  				validity, kind = it.internalNext()
  2905  				continue
  2906  			}
  2907  			// We encountered a shadowed SET, SETWITHDEL, MERGE. A SINGLEDEL
  2908  			// that deleted the KV at the original iterator position could
  2909  			// result in this key becoming visible.
  2910  			return false, nil
  2911  		case InternalKeyKindRangeDelete:
  2912  			// RangeDeletes are handled by the merging iterator and should never
  2913  			// be observed by the top-level Iterator.
  2914  			panic(errors.AssertionFailedf("pebble: unexpected range delete"))
  2915  		case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
  2916  			// Range keys are interleaved at the maximal sequence number and
  2917  			// should never be observed within a user key.
  2918  			panic(errors.AssertionFailedf("pebble: unexpected range key"))
  2919  		default:
  2920  			panic(errors.AssertionFailedf("pebble: unexpected key kind: %s", errors.Safe(kind)))
  2921  		}
  2922  	}
  2923  	if validity == internalNextError {
  2924  		return false, it.Error()
  2925  	}
  2926  	return true, nil
  2927  }
  2928  
// internalNextValidity enumerates the potential outcomes of a call to
// internalNext.
type internalNextValidity int8

const (
	// internalNextError is returned by internalNext when an error occurred and
	// the caller is responsible for checking iter.Error(). This covers an error
	// already recorded on the Iterator, an error surfaced by the underlying
	// iterator while stepping, and an attempt to call internalNext while the
	// Iterator is positioned in the reverse direction.
	internalNextError internalNextValidity = iota
	// internalNextExhausted is returned by internalNext when there is no
	// further internal key with a user key equal to Iterator.Key(): the
	// Iterator is not at a valid position, the underlying iterator has no more
	// keys, the next internal key has a different user key, or the internal
	// iterator had already moved past the current user key.
	internalNextExhausted
	// internalNextValid is returned by internalNext when the internal next
	// found a shadowed internal key with a user key equal to Iterator.Key().
	internalNextValid
)
  2944  
  2945  // internalNext advances internal Iterator state forward to expose the
  2946  // InternalKeyKind of the next internal key with a user key equal to Key().
  2947  //
  2948  // internalNext is a highly specialized operation and is unlikely to be
  2949  // generally useful. See Iterator.Next for how to reposition the iterator to the
  2950  // next key. internalNext requires the Iterator to be at a valid position in the
  2951  // forward direction (the last positioning operation must've been a First, a
  2952  // Seek[Prefix]GE, or a Next[Prefix][WithLimit] and Valid() must return true).
  2953  //
  2954  // internalNext, unlike all other Iterator methods, exposes internal LSM state.
  2955  // internalNext advances the Iterator's internal iterator to the next shadowed
  2956  // key with a user key equal to Key(). When a key is overwritten or deleted, its
  2957  // removal from the LSM occurs lazily as a part of compactions. internalNext
  2958  // allows the caller to see whether an obsolete internal key exists with the
  2959  // current Key(), and what it's key kind is. Note that the existence of an
  2960  // internal key is nondeterministic and dependent on internal LSM state. These
  2961  // semantics are unlikely to be applicable to almost all use cases.
  2962  //
  2963  // If internalNext finds a key that shares the same user key as Key(), it
  2964  // returns internalNextValid and the internal key's kind. If internalNext
  2965  // encounters an error, it returns internalNextError and the caller is expected
  2966  // to call Iterator.Error() to retrieve it. In all other circumstances,
  2967  // internalNext returns internalNextExhausted, indicating that there are no more
  2968  // additional internal keys with the user key Key().
  2969  //
  2970  // internalNext does not change the external position of the iterator, and a
  2971  // Next operation should behave the same as if internalNext was never called.
  2972  // internalNext does invalidate the iterator Value's memory, and the caller must
  2973  // not rely on the memory safety of the previous Iterator position.
  2974  func (i *Iterator) internalNext() (internalNextValidity, base.InternalKeyKind) {
  2975  	i.stats.ForwardStepCount[InterfaceCall]++
  2976  	if i.err != nil {
  2977  		return internalNextError, base.InternalKeyKindInvalid
  2978  	} else if i.iterValidityState != IterValid {
  2979  		return internalNextExhausted, base.InternalKeyKindInvalid
  2980  	}
  2981  	i.lastPositioningOp = internalNextOp
  2982  
  2983  	switch i.pos {
  2984  	case iterPosCurForward:
  2985  		i.iterKey, i.iterValue = i.iter.Next()
  2986  		if i.iterKey == nil {
  2987  			// We check i.iter.Error() here and return an internalNextError enum
  2988  			// variant so that the caller does not need to check i.iter.Error()
  2989  			// in the common case that the next internal key has a new user key.
  2990  			if i.err = i.iter.Error(); i.err != nil {
  2991  				return internalNextError, base.InternalKeyKindInvalid
  2992  			}
  2993  			i.pos = iterPosNext
  2994  			return internalNextExhausted, base.InternalKeyKindInvalid
  2995  		} else if i.comparer.Equal(i.iterKey.UserKey, i.key) {
  2996  			return internalNextValid, i.iterKey.Kind()
  2997  		}
  2998  		i.pos = iterPosNext
  2999  		return internalNextExhausted, base.InternalKeyKindInvalid
  3000  	case iterPosCurReverse, iterPosCurReversePaused, iterPosPrev:
  3001  		i.err = errors.New("switching from reverse to forward via internalNext is prohibited")
  3002  		i.iterValidityState = IterExhausted
  3003  		return internalNextError, base.InternalKeyKindInvalid
  3004  	case iterPosNext, iterPosCurForwardPaused:
  3005  		// The previous method already moved onto the next user key. This is
  3006  		// only possible if
  3007  		//   - the last positioning method was a call to internalNext, and we
  3008  		//     advanced to a new user key.
  3009  		//   - the previous non-internalNext iterator operation encountered a
  3010  		//     range key or merge, forcing an internal Next that found a new
  3011  		//     user key that's not equal to i.Iterator.Key().
  3012  		return internalNextExhausted, base.InternalKeyKindInvalid
  3013  	default:
  3014  		panic("unreachable")
  3015  	}
  3016  }