github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/iterator.go

     1  // Copyright 2011 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package bitalostable
     6  
     7  import (
     8  	"bytes"
     9  	"io"
    10  	"sync"
    11  	"sync/atomic"
    12  	"unsafe"
    13  
    14  	"github.com/cockroachdb/errors"
    15  	"github.com/cockroachdb/redact"
    16  	"github.com/zuoyebang/bitalostable/internal/base"
    17  	"github.com/zuoyebang/bitalostable/internal/fastrand"
    18  	"github.com/zuoyebang/bitalostable/internal/humanize"
    19  	"github.com/zuoyebang/bitalostable/internal/invariants"
    20  	"github.com/zuoyebang/bitalostable/internal/keyspan"
    21  	"github.com/zuoyebang/bitalostable/internal/manifest"
    22  	"github.com/zuoyebang/bitalostable/internal/rangekey"
    23  	"github.com/zuoyebang/bitalostable/sstable"
    24  )
    25  
    26  // iterPos describes the state of the internal iterator, in terms of whether it
    27  // is at the position returned to the user (cur), one ahead of the position
    28  // returned (next for forward iteration and prev for reverse iteration). The cur
    29  // position is split into two states, for forward and reverse iteration, since
    30  // we need to differentiate for switching directions.
    31  //
    32  // There is subtlety in what is considered the current position of the Iterator.
    33  // The internal iterator exposes a sequence of internal keys. There is not
    34  // always a single internalIterator position corresponding to the position
    35  // returned to the user. Consider the example:
    36  //
    37  //	a.MERGE.9 a.MERGE.8 a.MERGE.7 a.SET.6 b.DELETE.9 b.DELETE.5 b.SET.4
    38  //	\                                   /
    39  //	  \       Iterator.Key() = 'a'    /
    40  //
    41  // The Iterator exposes one valid position at user key 'a' and the two exhausted
    42  // positions at the beginning and end of iteration. The underlying
    43  // internalIterator contains 7 valid positions and 2 exhausted positions.
    44  //
     45  // Iterator positioning methods must set iterPos to iterPosCur{Forward,Reverse}
    46  // iff the user key at the current internalIterator position equals the
    47  // Iterator.Key returned to the user. This guarantees that a call to nextUserKey
    48  // or prevUserKey will advance to the next or previous iterator position.
     49  // iterPosCur{Forward,Reverse} does not make any guarantee about the internal
    50  // iterator position among internal keys with matching user keys, and it will
    51  // vary subtly depending on the particular key kinds encountered. In the above
    52  // example, the iterator returning 'a' may set iterPosCurForward if the internal
    53  // iterator is positioned at any of a.MERGE.9, a.MERGE.8, a.MERGE.7 or a.SET.6.
    54  //
    55  // When setting iterPos to iterPosNext or iterPosPrev, the internal iterator
    56  // must be advanced to the first internalIterator position at a user key greater
    57  // (iterPosNext) or less (iterPosPrev) than the key returned to the user. An
    58  // internalIterator position that's !Valid() must also be considered greater or
    59  // less—depending on the direction of iteration—than the last valid Iterator
    60  // position.
    61  type iterPos int8
    62  
    63  const (
    64  	iterPosCurForward iterPos = 0
    65  	iterPosNext       iterPos = 1
    66  	iterPosPrev       iterPos = -1
    67  	iterPosCurReverse iterPos = -2
    68  
    69  	// For limited iteration. When the iterator is at iterPosCurForwardPaused
    70  	// - Next*() call should behave as if the internal iterator is already
    71  	//   at next (akin to iterPosNext).
    72  	// - Prev*() call should behave as if the internal iterator is at the
    73  	//   current key (akin to iterPosCurForward).
    74  	//
    75  	// Similar semantics apply to CurReversePaused.
    76  	iterPosCurForwardPaused iterPos = 2
    77  	iterPosCurReversePaused iterPos = -3
    78  )
    79  
    80  // Approximate gap in bytes between samples of data read during iteration.
     81  // This is multiplied by the default ReadSamplingMultiplier of 1 << 4 to yield
    82  // 1 << 20 (1MB). The 1MB factor comes from:
    83  // https://github.com/zuoyebang/bitalostable/issues/29#issuecomment-494477985
    84  const readBytesPeriod uint64 = 1 << 16
    85  
    86  var errReversePrefixIteration = errors.New("bitalostable: unsupported reverse prefix iteration")
    87  
    88  // IteratorMetrics holds per-iterator metrics. These do not change over the
    89  // lifetime of the iterator.
    90  type IteratorMetrics struct {
    91  	// The read amplification experienced by this iterator. This is the sum of
    92  	// the memtables, the L0 sublevels and the non-empty Ln levels. Higher read
    93  	// amplification generally results in slower reads, though allowing higher
    94  	// read amplification can also result in faster writes.
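         	// For example (illustrative), 2 memtables + 3 L0 sublevels + 4 non-empty
         	// Ln levels yield ReadAmp = 9.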
    95  	ReadAmp int
    96  }
    97  
     98  // IteratorStatsKind describes the two kinds of iterator stats.
    99  type IteratorStatsKind int8
   100  
   101  const (
   102  	// InterfaceCall represents calls to Iterator.
   103  	InterfaceCall IteratorStatsKind = iota
   104  	// InternalIterCall represents calls by Iterator to its internalIterator.
   105  	InternalIterCall
   106  	// NumStatsKind is the number of kinds, and is used for array sizing.
   107  	NumStatsKind
   108  )
   109  
   110  // IteratorStats contains iteration stats.
   111  type IteratorStats struct {
   112  	// ForwardSeekCount includes SeekGE, SeekPrefixGE, First.
   113  	ForwardSeekCount [NumStatsKind]int
    114  	// ReverseSeekCount includes SeekLT, Last.
   115  	ReverseSeekCount [NumStatsKind]int
   116  	// ForwardStepCount includes Next.
   117  	ForwardStepCount [NumStatsKind]int
   118  	// ReverseStepCount includes Prev.
   119  	ReverseStepCount [NumStatsKind]int
   120  	InternalStats    InternalIteratorStats
   121  }
   122  
   123  var _ redact.SafeFormatter = &IteratorStats{}
   124  
   125  // InternalIteratorStats contains miscellaneous stats produced by internal
   126  // iterators.
   127  type InternalIteratorStats = base.InternalIteratorStats
   128  
   129  // Iterator iterates over a DB's key/value pairs in key order.
   130  //
   131  // An iterator must be closed after use, but it is not necessary to read an
   132  // iterator until exhaustion.
   133  //
   134  // An iterator is not goroutine-safe, but it is safe to use multiple iterators
   135  // concurrently, with each in a dedicated goroutine.
   136  //
   137  // It is also safe to use an iterator concurrently with modifying its
   138  // underlying DB, if that DB permits modification. However, the resultant
   139  // key/value pairs are not guaranteed to be a consistent snapshot of that DB
   140  // at a particular point in time.
   141  //
   142  // If an iterator encounters an error during any operation, it is stored by
   143  // the Iterator and surfaced through the Error method. All absolute
    144  // positioning methods (eg, SeekGE, SeekLT, First, Last, etc) reset any
   145  // accumulated error before positioning. All relative positioning methods (eg,
   146  // Next, Prev) return without advancing if the iterator has an accumulated
   147  // error.
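         //
         // A typical usage sketch (illustrative, not prescriptive; d is a *DB and
         // process is a hypothetical consumer):
         //
         //	iter := d.NewIter(nil)
         //	for iter.First(); iter.Valid(); iter.Next() {
         //		// Key and Value are only valid until the next positioning call.
         //		process(iter.Key(), iter.Value())
         //	}
         //	if err := iter.Error(); err != nil {
         //		// Handle any error accumulated during iteration.
         //	}
         //	if err := iter.Close(); err != nil {
         //		// Close must always be called; it may also return an error.
         //	}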
   148  type Iterator struct {
   149  	opts      IterOptions
   150  	merge     Merge
   151  	comparer  base.Comparer
   152  	iter      internalIterator
   153  	pointIter internalIterator
   154  	readState *readState
   155  	// rangeKey holds iteration state specific to iteration over range keys.
   156  	// The range key field may be nil if the Iterator has never been configured
   157  	// to iterate over range keys. Its non-nilness cannot be used to determine
   158  	// if the Iterator is currently iterating over range keys: For that, consult
   159  	// the IterOptions using opts.rangeKeys(). If non-nil, its rangeKeyIter
   160  	// field is guaranteed to be non-nil too.
   161  	rangeKey *iteratorRangeKeyState
   162  	// rangeKeyMasking holds state for range-key masking of point keys.
   163  	rangeKeyMasking rangeKeyMasking
   164  	err             error
   165  	// When iterValidityState=IterValid, key represents the current key, which
   166  	// is backed by keyBuf.
   167  	key         []byte
   168  	keyBuf      []byte
   169  	value       []byte
   170  	valueBuf    []byte
   171  	valueCloser io.Closer
   172  	// boundsBuf holds two buffers used to store the lower and upper bounds.
   173  	// Whenever the Iterator's bounds change, the new bounds are copied into
   174  	// boundsBuf[boundsBufIdx]. The two bounds share a slice to reduce
   175  	// allocations. opts.LowerBound and opts.UpperBound point into this slice.
   176  	boundsBuf    [2][]byte
   177  	boundsBufIdx int
   178  	// iterKey, iterValue reflect the latest position of iter, except when
   179  	// SetBounds is called. In that case, these are explicitly set to nil.
   180  	iterKey             *InternalKey
   181  	iterValue           []byte
   182  	alloc               *iterAlloc
   183  	getIterAlloc        *getIterAlloc
   184  	prefixOrFullSeekKey []byte
   185  	readSampling        readSampling
   186  	stats               IteratorStats
   187  	externalReaders     [][]*sstable.Reader
   188  
    189  	// The following fields are used when constructing an iterator stack, eg, in
    190  	// Clone and SetOptions or when re-fragmenting a batch's range keys/range dels.
    191  	// batch is non-nil if this Iterator includes a Batch.
   192  	batch            *Batch
   193  	newIters         tableNewIters
   194  	newIterRangeKey  keyspan.TableNewSpanIter
   195  	lazyCombinedIter lazyCombinedIter
   196  	seqNum           uint64
   197  	// batchSeqNum is used by Iterators over indexed batches to detect when the
   198  	// underlying batch has been mutated. The batch beneath an indexed batch may
   199  	// be mutated while the Iterator is open, but new keys are not surfaced
   200  	// until the next call to SetOptions.
   201  	batchSeqNum uint64
   202  	// batch{PointIter,RangeDelIter,RangeKeyIter} are used when the Iterator is
   203  	// configured to read through an indexed batch. If a batch is set, these
   204  	// iterators will be included within the iterator stack regardless of
   205  	// whether the batch currently contains any keys of their kind. These
   206  	// pointers are used during a call to SetOptions to refresh the Iterator's
   207  	// view of its indexed batch.
   208  	batchPointIter    batchIter
   209  	batchRangeDelIter keyspan.Iter
   210  	batchRangeKeyIter keyspan.Iter
   211  
    212  	// Keeping the bools here after all the 8-byte aligned fields shrinks the
    213  	// sizeof this struct by 24 bytes.
   214  
   215  	// INVARIANT:
   216  	// iterValidityState==IterAtLimit <=>
   217  	//  pos==iterPosCurForwardPaused || pos==iterPosCurReversePaused
   218  	iterValidityState IterValidityState
   219  	// Set to true by SetBounds, SetOptions. Causes the Iterator to appear
   220  	// exhausted externally, while preserving the correct iterValidityState for
   221  	// the iterator's internal state. Preserving the correct internal validity
   222  	// is used for SeekPrefixGE(..., trySeekUsingNext), and SeekGE/SeekLT
   223  	// optimizations after "no-op" calls to SetBounds and SetOptions.
   224  	requiresReposition bool
   225  	// The position of iter. When this is iterPos{Prev,Next} the iter has been
   226  	// moved past the current key-value, which can only happen if
   227  	// iterValidityState=IterValid, i.e., there is something to return to the
   228  	// client for the current position.
   229  	pos iterPos
   230  	// Relates to the prefixOrFullSeekKey field above.
   231  	hasPrefix bool
   232  	// Used for deriving the value of SeekPrefixGE(..., trySeekUsingNext),
   233  	// and SeekGE/SeekLT optimizations
   234  	lastPositioningOp lastPositioningOpKind
   235  	// Used for an optimization in external iterators to reduce the number of
   236  	// merging levels.
   237  	forwardOnly bool
   238  	// closePointIterOnce is set to true if this point iter can only be Close()d
   239  	// once, _and_ closing i.iter and then i.pointIter would close i.pointIter
   240  	// twice. This is necessary to track if the point iter is an internal iterator
   241  	// that could release its resources to a pool on Close(), making it harder for
   242  	// that iterator to make its own closes idempotent.
   243  	//
   244  	// TODO(bilal): Update SetOptions to always close out point key iterators when
   245  	// they won't be used, so that Close() doesn't need to default to closing
   246  	// point iterators twice.
   247  	closePointIterOnce bool
   248  	// Used in some tests to disable the random disabling of seek optimizations.
   249  	forceEnableSeekOpt bool
   250  }
   251  
   252  // cmp is a convenience shorthand for the i.comparer.Compare function.
   253  func (i *Iterator) cmp(a, b []byte) int {
   254  	return i.comparer.Compare(a, b)
   255  }
   256  
   257  // split is a convenience shorthand for the i.comparer.Split function.
   258  func (i *Iterator) split(a []byte) int {
   259  	return i.comparer.Split(a)
   260  }
   261  
   262  // equal is a convenience shorthand for the i.comparer.Equal function.
   263  func (i *Iterator) equal(a, b []byte) bool {
   264  	return i.comparer.Equal(a, b)
   265  }
   266  
   267  // iteratorRangeKeyState holds an iterator's range key iteration state.
   268  type iteratorRangeKeyState struct {
   269  	opts  *IterOptions
   270  	cmp   base.Compare
   271  	split base.Split
   272  	// rangeKeyIter holds the range key iterator stack that iterates over the
   273  	// merged spans across the entirety of the LSM.
   274  	rangeKeyIter keyspan.FragmentIterator
   275  	iiter        keyspan.InterleavingIter
   276  	// stale is set to true when the range key state recorded here (in start,
   277  	// end and keys) may not be in sync with the current range key at the
   278  	// interleaving iterator's current position.
   279  	//
    280  	// When the interleaving iterator passes over a new span, it invokes the
    281  	// SpanChanged hook defined on the `rangeKeyMasking` type, which sets stale
   282  	// to true if the span is non-nil.
   283  	//
   284  	// The parent iterator may not be positioned over the interleaving
   285  	// iterator's current position (eg, i.iterPos = iterPos{Next,Prev}), so
   286  	// {keys,start,end} are only updated to the new range key during a call to
   287  	// Iterator.saveRangeKey.
   288  	stale bool
   289  	// updated is used to signal to the Iterator client whether the state of
   290  	// range keys has changed since the previous iterator position through the
   291  	// `RangeKeyChanged` method. It's set to true during an Iterator positioning
   292  	// operation that changes the state of the current range key. Each Iterator
   293  	// positioning operation sets it back to false before executing.
   294  	updated bool
   295  	// prevPosHadRangeKey records whether the previous Iterator position had a
    296  	// range key (HasPointAndRange() = (_, true)). It's updated at the beginning
    297  	// of each new Iterator positioning operation. It's required by saveRangeKey
    298  	// to set `updated` appropriately: Without this record of the previous iterator
   299  	// state, it's ambiguous whether an iterator only temporarily stepped onto a
   300  	// position without a range key.
   301  	prevPosHadRangeKey bool
   302  	// rangeKeyOnly is set to true if at the current iterator position there is
   303  	// no point key, only a range key start boundary.
   304  	rangeKeyOnly bool
   305  	// hasRangeKey is true when the current iterator position has a covering
   306  	// range key (eg, a range key with bounds [<lower>,<upper>) such that
   307  	// <lower> ≤ Key() < <upper>).
   308  	hasRangeKey bool
   309  	// start and end are the [start, end) boundaries of the current range keys.
   310  	start []byte
   311  	end   []byte
   312  	// keys is sorted by Suffix ascending.
   313  	keys []RangeKeyData
   314  	// buf is used to save range-key data before moving the range-key iterator.
   315  	// Start and end boundaries, suffixes and values are all copied into buf.
   316  	buf []byte
   317  
   318  	// iterConfig holds fields that are used for the construction of the
   319  	// iterator stack, but do not need to be directly accessed during iteration.
   320  	// This struct is bundled within the iteratorRangeKeyState struct to reduce
   321  	// allocations.
   322  	iterConfig rangekey.UserIteratorConfig
   323  }
   324  
   325  func (i *iteratorRangeKeyState) init(cmp base.Compare, split base.Split, opts *IterOptions) {
   326  	i.cmp = cmp
   327  	i.split = split
   328  	i.opts = opts
   329  }
   330  
   331  var iterRangeKeyStateAllocPool = sync.Pool{
   332  	New: func() interface{} {
   333  		return &iteratorRangeKeyState{}
   334  	},
   335  }
   336  
   337  // isEphemeralPosition returns true iff the current iterator position is
   338  // ephemeral, and won't be visited during subsequent relative positioning
   339  // operations.
   340  //
   341  // The iterator position resulting from a SeekGE or SeekPrefixGE that lands on a
   342  // straddling range key without a coincident point key is such a position.
   343  func (i *Iterator) isEphemeralPosition() bool {
   344  	return i.opts.rangeKeys() && i.rangeKey != nil && i.rangeKey.rangeKeyOnly &&
   345  		!i.equal(i.rangeKey.start, i.key)
   346  }
   347  
   348  type lastPositioningOpKind int8
   349  
   350  const (
   351  	unknownLastPositionOp lastPositioningOpKind = iota
   352  	seekPrefixGELastPositioningOp
   353  	seekGELastPositioningOp
   354  	seekLTLastPositioningOp
   355  )
   356  
   357  // Limited iteration mode. Not for use with prefix iteration.
   358  //
    359  // SeekGE, SeekLT, Prev, Next have WithLimit variants that pause the iterator
   360  // at the limit in a best-effort manner. The client should behave correctly
   361  // even if the limits are ignored. These limits are not "deep", in that they
   362  // are not passed down to the underlying collection of internalIterators. This
   363  // is because the limits are transient, and apply only until the next
   364  // iteration call. They serve mainly as a way to bound the amount of work when
   365  // two (or more) Iterators are being coordinated at a higher level.
   366  //
   367  // In limited iteration mode:
   368  // - Avoid using Iterator.Valid if the last call was to a *WithLimit() method.
   369  //   The return value from the *WithLimit() method provides a more precise
   370  //   disposition.
   371  // - The limit is exclusive for forward and inclusive for reverse.
   372  //
   373  //
   374  // Limited iteration mode & range keys
   375  //
   376  // Limited iteration interacts with range-key iteration. When range key
   377  // iteration is enabled, range keys are interleaved at their start boundaries.
   378  // Limited iteration must ensure that if a range key exists within the limit,
   379  // the iterator visits the range key.
   380  //
   381  // During forward limited iteration, this is trivial: An overlapping range key
   382  // must have a start boundary less than the limit, and the range key's start
   383  // boundary will be interleaved and found to be within the limit.
   384  //
   385  // During reverse limited iteration, the tail of the range key may fall within
   386  // the limit. The range key must be surfaced even if the range key's start
   387  // boundary is less than the limit, and if there are no point keys between the
   388  // current iterator position and the limit. To provide this guarantee, reverse
   389  // limited iteration ignores the limit as long as there is a range key
   390  // overlapping the iteration position.
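         //
         // A coordination sketch (illustrative; start and limit are hypothetical
         // bounds chosen by the higher level):
         //
         //	validity := iter.SeekGEWithLimit(start, limit)
         //	for validity == IterValid {
         //		// Consume iter.Key()/iter.Value(), then step forward, again
         //		// bounded by limit.
         //		validity = iter.NextWithLimit(limit)
         //	}
         //	if validity == IterAtLimit {
         //		// The iterator paused at limit without surfacing a key. The
         //		// caller may raise the limit (or hand off to another iterator)
         //		// and resume.
         //	}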
   391  
   392  // IterValidityState captures the state of the Iterator.
   393  type IterValidityState int8
   394  
   395  const (
   396  	// IterExhausted represents an Iterator that is exhausted.
   397  	IterExhausted IterValidityState = iota
   398  	// IterValid represents an Iterator that is valid.
   399  	IterValid
   400  	// IterAtLimit represents an Iterator that has a non-exhausted
   401  	// internalIterator, but has reached a limit without any key for the
   402  	// caller.
   403  	IterAtLimit
   404  )
   405  
   406  // readSampling stores variables used to sample a read to trigger a read
    407  // compaction.
   408  type readSampling struct {
   409  	bytesUntilReadSampling uint64
   410  	initialSamplePassed    bool
   411  	pendingCompactions     readCompactionQueue
   412  	// forceReadSampling is used for testing purposes to force a read sample on every
    413  	// call to Iterator.maybeSampleRead().
   414  	forceReadSampling bool
   415  }
   416  
   417  func (i *Iterator) findNextEntry(limit []byte) {
   418  	i.iterValidityState = IterExhausted
   419  	i.pos = iterPosCurForward
   420  	if i.opts.rangeKeys() && i.rangeKey != nil {
   421  		i.rangeKey.rangeKeyOnly = false
   422  	}
   423  
   424  	// Close the closer for the current value if one was open.
   425  	if i.closeValueCloser() != nil {
   426  		return
   427  	}
   428  
   429  	for i.iterKey != nil {
   430  		key := *i.iterKey
   431  
   432  		if i.hasPrefix {
   433  			if n := i.split(key.UserKey); !i.equal(i.prefixOrFullSeekKey, key.UserKey[:n]) {
   434  				return
   435  			}
   436  		}
   437  		// Compare with limit every time we start at a different user key.
   438  		// Note that given the best-effort contract of limit, we could avoid a
   439  		// comparison in the common case by doing this only after
   440  		// i.nextUserKey is called for the deletes below. However that makes
   441  		// the behavior non-deterministic (since the behavior will vary based
   442  		// on what has been compacted), which makes it hard to test with the
   443  		// metamorphic test. So we forego that performance optimization.
   444  		if limit != nil && i.cmp(limit, i.iterKey.UserKey) <= 0 {
   445  			i.iterValidityState = IterAtLimit
   446  			i.pos = iterPosCurForwardPaused
   447  			return
   448  		}
   449  
   450  		switch key.Kind() {
   451  		case InternalKeyKindRangeKeySet:
   452  			// Save the current key.
   453  			i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
   454  			i.key = i.keyBuf
   455  			i.value = nil
   456  			// There may also be a live point key at this userkey that we have
   457  			// not yet read. We need to find the next entry with this user key
   458  			// to find it. Save the range key so we don't lose it when we Next
   459  			// the underlying iterator.
   460  			i.saveRangeKey()
   461  			pointKeyExists := i.nextPointCurrentUserKey()
   462  			if i.err != nil {
   463  				i.iterValidityState = IterExhausted
   464  				return
   465  			}
   466  			i.rangeKey.rangeKeyOnly = !pointKeyExists
   467  			i.iterValidityState = IterValid
   468  			return
   469  
   470  		case InternalKeyKindDelete, InternalKeyKindSingleDelete:
   471  			i.nextUserKey()
   472  			continue
   473  
   474  		case InternalKeyKindSet, InternalKeyKindSetWithDelete:
   475  			i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
   476  			i.key = i.keyBuf
   477  			i.value = i.iterValue
   478  			i.iterValidityState = IterValid
   479  			i.saveRangeKey()
   480  			return
   481  
   482  		case InternalKeyKindMerge:
   483  			// Resolving the merge may advance us to the next point key, which
   484  			// may be covered by a different set of range keys. Save the range
   485  			// key state so we don't lose it.
   486  			i.saveRangeKey()
   487  			if i.mergeForward(key) {
   488  				i.iterValidityState = IterValid
   489  				return
   490  			}
   491  
   492  			// The merge didn't yield a valid key, either because the value
   493  			// merger indicated it should be deleted, or because an error was
   494  			// encountered.
   495  			i.iterValidityState = IterExhausted
   496  			if i.err != nil {
   497  				return
   498  			}
   499  			if i.pos != iterPosNext {
   500  				i.nextUserKey()
   501  			}
   502  			if i.closeValueCloser() != nil {
   503  				return
   504  			}
   505  			i.pos = iterPosCurForward
   506  
   507  		default:
   508  			i.err = base.CorruptionErrorf("bitalostable: invalid internal key kind: %d", errors.Safe(key.Kind()))
   509  			i.iterValidityState = IterExhausted
   510  			return
   511  		}
   512  	}
   513  }
   514  
   515  func (i *Iterator) nextPointCurrentUserKey() bool {
   516  	i.pos = iterPosCurForward
   517  
   518  	i.iterKey, i.iterValue = i.iter.Next()
   519  	i.stats.ForwardStepCount[InternalIterCall]++
   520  	if i.iterKey == nil || !i.equal(i.key, i.iterKey.UserKey) {
   521  		i.pos = iterPosNext
   522  		return false
   523  	}
   524  
   525  	key := *i.iterKey
   526  	switch key.Kind() {
   527  	case InternalKeyKindRangeKeySet:
   528  		// RangeKeySets must always be interleaved as the first internal key
   529  		// for a user key.
   530  		i.err = base.CorruptionErrorf("bitalostable: unexpected range key set mid-user key")
   531  		return false
   532  
   533  	case InternalKeyKindDelete, InternalKeyKindSingleDelete:
   534  		return false
   535  
   536  	case InternalKeyKindSet, InternalKeyKindSetWithDelete:
   537  		i.value = i.iterValue
   538  		return true
   539  
   540  	case InternalKeyKindMerge:
   541  		return i.mergeForward(key)
   542  
   543  	default:
   544  		i.err = base.CorruptionErrorf("bitalostable: invalid internal key kind: %d", errors.Safe(key.Kind()))
   545  		return false
   546  	}
   547  }
   548  
   549  // mergeForward resolves a MERGE key, advancing the underlying iterator forward
   550  // to merge with subsequent keys with the same userkey. mergeForward returns a
   551  // boolean indicating whether or not the merge yielded a valid key. A merge may
   552  // not yield a valid key if an error occurred, in which case i.err is non-nil,
   553  // or the user's value merger specified the key to be deleted.
   554  //
   555  // mergeForward does not update iterValidityState.
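         //
         // For example (a sketch, assuming a concatenating merge operator): given the
         // internal keys a.MERGE.3="z", a.MERGE.2="y", a.SET.1="x", a call to
         // mergeForward at a.MERGE.3 merges the older entries down to the SET base
         // and yields the value "xyz".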
   556  func (i *Iterator) mergeForward(key base.InternalKey) (valid bool) {
   557  	var valueMerger ValueMerger
   558  	valueMerger, i.err = i.merge(key.UserKey, i.iterValue)
   559  	if i.err != nil {
   560  		return false
   561  	}
   562  
   563  	i.mergeNext(key, valueMerger)
   564  	if i.err != nil {
   565  		return false
   566  	}
   567  
   568  	var needDelete bool
   569  	i.value, needDelete, i.valueCloser, i.err = finishValueMerger(
   570  		valueMerger, true /* includesBase */)
   571  	if i.err != nil {
   572  		return false
   573  	}
   574  	if needDelete {
   575  		_ = i.closeValueCloser()
   576  		return false
   577  	}
   578  	return true
   579  }
   580  
   581  func (i *Iterator) closeValueCloser() error {
   582  	if i.valueCloser != nil {
   583  		i.err = i.valueCloser.Close()
   584  		i.valueCloser = nil
   585  	}
   586  	return i.err
   587  }
   588  
   589  func (i *Iterator) nextUserKey() {
   590  	if i.iterKey == nil {
   591  		return
   592  	}
   593  	trailer := i.iterKey.Trailer
   594  	done := i.iterKey.Trailer <= base.InternalKeyZeroSeqnumMaxTrailer
   595  	if i.iterValidityState != IterValid {
   596  		i.keyBuf = append(i.keyBuf[:0], i.iterKey.UserKey...)
   597  		i.key = i.keyBuf
   598  	}
   599  	for {
   600  		i.iterKey, i.iterValue = i.iter.Next()
   601  		i.stats.ForwardStepCount[InternalIterCall]++
   602  		// NB: We're guaranteed to be on the next user key if the previous key
   603  		// had a zero sequence number (`done`), or the new key has a trailer
   604  		// greater or equal to the previous key's trailer. This is true because
   605  		// internal keys with the same user key are sorted by Trailer in
   606  		// strictly monotonically descending order. We expect the trailer
   607  		// optimization to trigger around 50% of the time with randomly
   608  		// distributed writes. We expect it to trigger very frequently when
   609  		// iterating through ingested sstables, which contain keys that all have
   610  		// the same sequence number.
   611  		if done || i.iterKey == nil || i.iterKey.Trailer >= trailer {
   612  			break
   613  		}
   614  		if !i.equal(i.key, i.iterKey.UserKey) {
   615  			break
   616  		}
   617  		done = i.iterKey.Trailer <= base.InternalKeyZeroSeqnumMaxTrailer
   618  		trailer = i.iterKey.Trailer
   619  	}
   620  }
   621  
   622  func (i *Iterator) maybeSampleRead() {
   623  	// This method is only called when a public method of Iterator is
    624  // returning, and below we exclude the case where the iterator is paused at
   625  	// a limit. The effect of these choices is that keys that are deleted, but
   626  	// are encountered during iteration, are not accounted for in the read
   627  	// sampling and will not cause read driven compactions, even though we are
    628  // incurring cost in iterating over them. This issue is not limited to
    629  // Iterator: it does not see the effect of range deletes, which may be
    630  // causing iteration work in mergingIter. It is not clear at this time
   631  	// whether this is a deficiency worth addressing.
   632  	if i.iterValidityState != IterValid {
   633  		return
   634  	}
   635  	if i.readState == nil {
   636  		return
   637  	}
   638  	if i.readSampling.forceReadSampling {
   639  		i.sampleRead()
   640  		return
   641  	}
   642  	samplingPeriod := int32(int64(readBytesPeriod) * i.readState.db.opts.Experimental.ReadSamplingMultiplier)
   643  	if samplingPeriod <= 0 {
   644  		return
   645  	}
   646  	bytesRead := uint64(len(i.key) + len(i.value))
   647  	for i.readSampling.bytesUntilReadSampling < bytesRead {
   648  		i.readSampling.bytesUntilReadSampling += uint64(fastrand.Uint32n(2 * uint32(samplingPeriod)))
   649  		// The block below tries to adjust for the case where this is the
   650  		// first read in a newly-opened iterator. As bytesUntilReadSampling
   651  		// starts off at zero, we don't want to sample the first read of
   652  		// every newly-opened iterator, but we do want to sample some of them.
   653  		if !i.readSampling.initialSamplePassed {
   654  			i.readSampling.initialSamplePassed = true
   655  			if fastrand.Uint32n(uint32(i.readSampling.bytesUntilReadSampling)) > uint32(bytesRead) {
   656  				continue
   657  			}
   658  		}
   659  		i.sampleRead()
   660  	}
   661  	i.readSampling.bytesUntilReadSampling -= bytesRead
   662  }
   663  
   664  func (i *Iterator) sampleRead() {
   665  	var topFile *manifest.FileMetadata
   666  	topLevel, numOverlappingLevels := numLevels, 0
   667  	if mi, ok := i.iter.(*mergingIter); ok {
   668  		if len(mi.levels) > 1 {
   669  			mi.ForEachLevelIter(func(li *levelIter) bool {
   670  				l := manifest.LevelToInt(li.level)
   671  				if file := li.files.Current(); file != nil {
   672  					var containsKey bool
   673  					if i.pos == iterPosNext || i.pos == iterPosCurForward ||
   674  						i.pos == iterPosCurForwardPaused {
   675  						containsKey = i.cmp(file.SmallestPointKey.UserKey, i.key) <= 0
   676  					} else if i.pos == iterPosPrev || i.pos == iterPosCurReverse ||
   677  						i.pos == iterPosCurReversePaused {
   678  						containsKey = i.cmp(file.LargestPointKey.UserKey, i.key) >= 0
   679  					}
   680  					// Do nothing if the current key is not contained in file's
   681  					// bounds. We could seek the LevelIterator at this level
   682  					// to find the right file, but the performance impacts of
   683  					// doing that are significant enough to negate the benefits
   684  					// of read sampling in the first place. See the discussion
   685  					// at:
   686  					// https://github.com/zuoyebang/bitalostable/pull/1041#issuecomment-763226492
   687  					if containsKey {
   688  						numOverlappingLevels++
   689  						if numOverlappingLevels >= 2 {
   690  							// Terminate the loop early if at least 2 overlapping levels are found.
   691  							return true
   692  						}
   693  						topLevel = l
   694  						topFile = file
   695  					}
   696  				}
   697  				return false
   698  			})
   699  		}
   700  	}
   701  	if topFile == nil || topLevel >= numLevels {
   702  		return
   703  	}
   704  	if numOverlappingLevels >= 2 {
   705  		allowedSeeks := atomic.AddInt64(&topFile.Atomic.AllowedSeeks, -1)
   706  		if allowedSeeks == 0 {
   707  
   708  			// Since the compaction queue can handle duplicates, we can keep
   709  			// adding to the queue even once allowedSeeks hits 0.
   710  			// In fact, we NEED to keep adding to the queue, because the queue
   711  			// is small and evicts older and possibly useful compactions.
   712  			atomic.AddInt64(&topFile.Atomic.AllowedSeeks, topFile.InitAllowedSeeks)
   713  
   714  			read := readCompaction{
   715  				start:   topFile.SmallestPointKey.UserKey,
   716  				end:     topFile.LargestPointKey.UserKey,
   717  				level:   topLevel,
   718  				fileNum: topFile.FileNum,
   719  			}
   720  			i.readSampling.pendingCompactions.add(&read, i.cmp)
   721  		}
   722  	}
   723  }
   724  
   725  func (i *Iterator) findPrevEntry(limit []byte) {
   726  	i.iterValidityState = IterExhausted
   727  	i.pos = iterPosCurReverse
   728  	if i.opts.rangeKeys() && i.rangeKey != nil {
   729  		i.rangeKey.rangeKeyOnly = false
   730  	}
   731  
   732  	// Close the closer for the current value if one was open.
   733  	if i.valueCloser != nil {
   734  		i.err = i.valueCloser.Close()
   735  		i.valueCloser = nil
   736  		if i.err != nil {
   737  			i.iterValidityState = IterExhausted
   738  			return
   739  		}
   740  	}
   741  
   742  	var valueMerger ValueMerger
   743  	firstLoopIter := true
   744  	rangeKeyBoundary := false
   745  	// The code below compares with limit in multiple places. As documented in
   746  	// findNextEntry, this is being done to make the behavior of limit
   747  	// deterministic to allow for metamorphic testing. It is not required by
   748  	// the best-effort contract of limit.
   749  	for i.iterKey != nil {
   750  		key := *i.iterKey
   751  
   752  		// NB: We cannot pause if the current key is covered by a range key.
   753  		// Otherwise, the user might not ever learn of a range key that covers
   754  		// the key space being iterated over in which there are no point keys.
   755  		// Since limits are best effort, ignoring the limit in this case is
   756  		// allowed by the contract of limit.
   757  		if firstLoopIter && limit != nil && i.cmp(limit, i.iterKey.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) {
   758  			i.iterValidityState = IterAtLimit
   759  			i.pos = iterPosCurReversePaused
   760  			return
   761  		}
   762  		firstLoopIter = false
   763  
   764  		if i.iterValidityState == IterValid {
   765  			if !i.equal(key.UserKey, i.key) {
   766  				// We've iterated to the previous user key.
   767  				i.pos = iterPosPrev
   768  				if valueMerger != nil {
   769  					var needDelete bool
   770  					i.value, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, true /* includesBase */)
   771  					if i.err == nil && needDelete {
   772  						// The point key at this key is deleted. If we also have
   773  						// a range key boundary at this key, we still want to
   774  						// return. Otherwise, we need to continue looking for
   775  						// a live key.
   776  						i.value = nil
   777  						if rangeKeyBoundary {
   778  							i.rangeKey.rangeKeyOnly = true
   779  						} else {
   780  							i.iterValidityState = IterExhausted
   781  							if i.closeValueCloser() == nil {
   782  								continue
   783  							}
   784  						}
   785  					}
   786  				}
   787  				if i.err != nil {
   788  					i.iterValidityState = IterExhausted
   789  				}
   790  				return
   791  			}
   792  		}
   793  
   794  		switch key.Kind() {
   795  		case InternalKeyKindRangeKeySet:
   796  			// Range key start boundary markers are interleaved with the maximum
   797  			// sequence number, so if there's a point key also at this key, we
   798  			// must've already iterated over it.
    799  			// This is the final entry at this user key, so we may return.
   800  			i.rangeKey.rangeKeyOnly = i.iterValidityState != IterValid
   801  			i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
   802  			i.key = i.keyBuf
   803  			i.iterValidityState = IterValid
   804  			i.saveRangeKey()
   805  			// In all other cases, previous iteration requires advancing to
   806  			// iterPosPrev in order to determine if the key is live and
   807  			// unshadowed by another key at the same user key. In this case,
   808  			// because range key start boundary markers are always interleaved
   809  			// at the maximum sequence number, we know that there aren't any
   810  			// additional keys with the same user key in the backward direction.
   811  			//
   812  			// We Prev the underlying iterator once anyways for consistency, so
   813  			// that we can maintain the invariant during backward iteration that
   814  			// i.iterPos = iterPosPrev.
   815  			i.stats.ReverseStepCount[InternalIterCall]++
   816  			i.iterKey, i.iterValue = i.iter.Prev()
   817  
   818  			// Set rangeKeyBoundary so that on the next iteration, we know to
   819  			// return the key even if the MERGE point key is deleted.
   820  			rangeKeyBoundary = true
   821  
   822  		case InternalKeyKindDelete, InternalKeyKindSingleDelete:
   823  			i.value = nil
   824  			i.iterValidityState = IterExhausted
   825  			valueMerger = nil
   826  			i.iterKey, i.iterValue = i.iter.Prev()
   827  			i.stats.ReverseStepCount[InternalIterCall]++
   828  			// Compare with the limit. We could optimize by only checking when
   829  			// we step to the previous user key, but detecting that requires a
    830  			// comparison too. Note that this position may have already passed a
   831  			// number of versions of this user key, but they are all deleted,
   832  			// so the fact that a subsequent Prev*() call will not see them is
   833  			// harmless. Also note that this is the only place in the loop,
   834  			// other than the firstLoopIter case above, where we could step
   835  			// to a different user key and start processing it for returning
   836  			// to the caller.
   837  			if limit != nil && i.iterKey != nil && i.cmp(limit, i.iterKey.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) {
   838  				i.iterValidityState = IterAtLimit
   839  				i.pos = iterPosCurReversePaused
   840  				return
   841  			}
   842  			continue
   843  
   844  		case InternalKeyKindSet, InternalKeyKindSetWithDelete:
   845  			i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
   846  			i.key = i.keyBuf
   847  			// iterValue is owned by i.iter and could change after the Prev()
   848  			// call, so use valueBuf instead. Note that valueBuf is only used
   849  			// in this one instance; everywhere else (eg. in findNextEntry),
   850  			// we just point i.value to the unsafe i.iter-owned value buffer.
   851  			i.valueBuf = append(i.valueBuf[:0], i.iterValue...)
   852  			i.value = i.valueBuf
   853  			i.saveRangeKey()
   854  			i.iterValidityState = IterValid
   855  			i.iterKey, i.iterValue = i.iter.Prev()
   856  			i.stats.ReverseStepCount[InternalIterCall]++
   857  			valueMerger = nil
   858  			continue
   859  
   860  		case InternalKeyKindMerge:
   861  			if i.iterValidityState == IterExhausted {
   862  				i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
   863  				i.key = i.keyBuf
   864  				i.saveRangeKey()
   865  				valueMerger, i.err = i.merge(i.key, i.iterValue)
   866  				if i.err != nil {
   867  					return
   868  				}
   869  				i.iterValidityState = IterValid
   870  			} else if valueMerger == nil {
   871  				valueMerger, i.err = i.merge(i.key, i.value)
   872  				if i.err == nil {
   873  					i.err = valueMerger.MergeNewer(i.iterValue)
   874  				}
   875  				if i.err != nil {
   876  					i.iterValidityState = IterExhausted
   877  					return
   878  				}
   879  			} else {
   880  				i.err = valueMerger.MergeNewer(i.iterValue)
   881  				if i.err != nil {
   882  					i.iterValidityState = IterExhausted
   883  					return
   884  				}
   885  			}
   886  			i.iterKey, i.iterValue = i.iter.Prev()
   887  			i.stats.ReverseStepCount[InternalIterCall]++
   888  			continue
   889  
   890  		default:
   891  			i.err = base.CorruptionErrorf("bitalostable: invalid internal key kind: %d", errors.Safe(key.Kind()))
   892  			i.iterValidityState = IterExhausted
   893  			return
   894  		}
   895  	}
   896  
    897  	// i.iterKey == nil, so we broke out of the preceding loop.
   898  	if i.iterValidityState == IterValid {
   899  		i.pos = iterPosPrev
   900  		if valueMerger != nil {
   901  			var needDelete bool
   902  			i.value, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, true /* includesBase */)
   903  			if i.err == nil && needDelete {
   904  				i.key = nil
   905  				i.value = nil
   906  				i.iterValidityState = IterExhausted
   907  			}
   908  		}
   909  		if i.err != nil {
   910  			i.iterValidityState = IterExhausted
   911  		}
   912  	}
   913  }
   914  
   915  func (i *Iterator) prevUserKey() {
   916  	if i.iterKey == nil {
   917  		return
   918  	}
   919  	if i.iterValidityState != IterValid {
   920  		// If we're going to compare against the prev key, we need to save the
   921  		// current key.
   922  		i.keyBuf = append(i.keyBuf[:0], i.iterKey.UserKey...)
   923  		i.key = i.keyBuf
   924  	}
   925  	for {
   926  		i.iterKey, i.iterValue = i.iter.Prev()
   927  		i.stats.ReverseStepCount[InternalIterCall]++
   928  		if i.iterKey == nil {
   929  			break
   930  		}
   931  		if !i.equal(i.key, i.iterKey.UserKey) {
   932  			break
   933  		}
   934  	}
   935  }
   936  
   937  func (i *Iterator) mergeNext(key InternalKey, valueMerger ValueMerger) {
   938  	// Save the current key.
   939  	i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
   940  	i.key = i.keyBuf
   941  
   942  	// Loop looking for older values for this key and merging them.
   943  	for {
   944  		i.iterKey, i.iterValue = i.iter.Next()
   945  		i.stats.ForwardStepCount[InternalIterCall]++
   946  		if i.iterKey == nil {
   947  			i.pos = iterPosNext
   948  			return
   949  		}
   950  		key = *i.iterKey
   951  		if !i.equal(i.key, key.UserKey) {
   952  			// We've advanced to the next key.
   953  			i.pos = iterPosNext
   954  			return
   955  		}
   956  		switch key.Kind() {
   957  		case InternalKeyKindDelete, InternalKeyKindSingleDelete:
   958  			// We've hit a deletion tombstone. Return everything up to this
   959  			// point.
   960  			return
   961  
   962  		case InternalKeyKindSet, InternalKeyKindSetWithDelete:
   963  			// We've hit a Set value. Merge with the existing value and return.
   964  			i.err = valueMerger.MergeOlder(i.iterValue)
   965  			return
   966  
   967  		case InternalKeyKindMerge:
   968  			// We've hit another Merge value. Merge with the existing value and
   969  			// continue looping.
   970  			i.err = valueMerger.MergeOlder(i.iterValue)
   971  			if i.err != nil {
   972  				return
   973  			}
   974  			continue
   975  
   976  		case InternalKeyKindRangeKeySet:
   977  			// The RANGEKEYSET marker must sort before a MERGE at the same user key.
   978  			i.err = base.CorruptionErrorf("bitalostable: out of order range key marker")
   979  			return
   980  
   981  		default:
   982  			i.err = base.CorruptionErrorf("bitalostable: invalid internal key kind: %d", errors.Safe(key.Kind()))
   983  			return
   984  		}
   985  	}
   986  }
   987  
   988  // SeekGE moves the iterator to the first key/value pair whose key is greater
   989  // than or equal to the given key. Returns true if the iterator is pointing at
   990  // a valid entry and false otherwise.
   991  func (i *Iterator) SeekGE(key []byte) bool {
   992  	return i.SeekGEWithLimit(key, nil) == IterValid
   993  }
   994  
   995  // SeekGEWithLimit moves the iterator to the first key/value pair whose key is
   996  // greater than or equal to the given key.
   997  //
   998  // If limit is provided, it serves as a best-effort exclusive limit. If the
   999  // first key greater than or equal to the given search key is also greater than
  1000  // or equal to limit, the Iterator may pause and return IterAtLimit. Because
  1001  // limits are best-effort, SeekGEWithLimit may return a key beyond limit.
  1002  //
  1003  // If the Iterator is configured to iterate over range keys, SeekGEWithLimit
  1004  // guarantees it will surface any range keys with bounds overlapping the
  1005  // keyspace [key, limit).
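         //
         // For example (a sketch over a database containing only the point keys
         // "a" and "c"):
         //
         //	i.SeekGEWithLimit([]byte("a"), []byte("b")) // IterValid, positioned at "a"
         //	i.SeekGEWithLimit([]byte("b"), []byte("c")) // may return IterAtLimit, since
         //	                                            // the first key >= "b" is "c"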
  1006  func (i *Iterator) SeekGEWithLimit(key []byte, limit []byte) IterValidityState {
  1007  	lastPositioningOp := i.lastPositioningOp
  1008  	// Set it to unknown, since this operation may not succeed, in which case
  1009  	// the SeekGE following this should not make any assumption about iterator
  1010  	// position.
  1011  	i.lastPositioningOp = unknownLastPositionOp
  1012  	i.requiresReposition = false
  1013  	i.err = nil
  1014  	i.hasPrefix = false
  1015  	i.stats.ForwardSeekCount[InterfaceCall]++
  1016  	if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 {
  1017  		key = lowerBound
  1018  	} else if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 {
  1019  		key = upperBound
  1020  	}
  1021  	if i.rangeKey != nil {
  1022  		i.rangeKey.updated = false
  1023  		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
  1024  	}
  1025  	seekInternalIter := true
  1026  	var flags base.SeekGEFlags
  1027  	// The following noop optimization only applies when i.batch == nil, since
   1028  	// an iterator over a batch is iterating over mutable data that may have
  1029  	// changed since the last seek.
  1030  	if lastPositioningOp == seekGELastPositioningOp && i.batch == nil {
  1031  		cmp := i.cmp(i.prefixOrFullSeekKey, key)
  1032  		// If this seek is to the same or later key, and the iterator is
  1033  		// already positioned there, this is a noop. This can be helpful for
  1034  		// sparse key spaces that have many deleted keys, where one can avoid
  1035  		// the overhead of iterating past them again and again.
  1036  		if cmp <= 0 {
  1037  			if i.iterValidityState == IterExhausted ||
  1038  				(i.iterValidityState == IterValid && i.cmp(key, i.key) <= 0 &&
  1039  					(limit == nil || i.cmp(i.key, limit) < 0)) {
  1040  				// Noop
  1041  				if !invariants.Enabled || !disableSeekOpt(key, uintptr(unsafe.Pointer(i))) || i.forceEnableSeekOpt {
  1042  					i.lastPositioningOp = seekGELastPositioningOp
  1043  					return i.iterValidityState
  1044  				}
  1045  			}
  1046  			// cmp == 0 is not safe to optimize since
  1047  			// - i.pos could be at iterPosNext, due to a merge.
  1048  			// - Even if i.pos were at iterPosCurForward, we could have a DELETE,
  1049  			//   SET pair for a key, and the iterator would have moved past DELETE
  1050  			//   but stayed at iterPosCurForward. A similar situation occurs for a
  1051  			//   MERGE, SET pair where the MERGE is consumed and the iterator is
  1052  			//   at the SET.
  1053  			// We also leverage the IterAtLimit <=> i.pos invariant defined in the
  1054  			// comment on iterValidityState, to exclude any cases where i.pos
  1055  			// is iterPosCur{Forward,Reverse}Paused. This avoids the need to
  1056  			// special-case those iterator positions and their interactions with
  1057  			// TrySeekUsingNext, as the main uses for TrySeekUsingNext in CockroachDB
  1058  			// do not use limited Seeks in the first place.
  1059  			if cmp < 0 && i.iterValidityState != IterAtLimit && limit == nil {
  1060  				flags = flags.EnableTrySeekUsingNext()
  1061  			}
  1062  			if invariants.Enabled && flags.TrySeekUsingNext() && !i.forceEnableSeekOpt && disableSeekOpt(key, uintptr(unsafe.Pointer(i))) {
  1063  				flags = flags.DisableTrySeekUsingNext()
  1064  			}
  1065  			if i.pos == iterPosCurForwardPaused && i.cmp(key, i.iterKey.UserKey) <= 0 {
  1066  				// Have some work to do, but don't need to seek, and we can
  1067  				// start doing findNextEntry from i.iterKey.
  1068  				seekInternalIter = false
  1069  			}
  1070  		}
  1071  	}
  1072  	if seekInternalIter {
  1073  		i.iterKey, i.iterValue = i.iter.SeekGE(key, flags)
  1074  		i.stats.ForwardSeekCount[InternalIterCall]++
  1075  	}
  1076  	i.findNextEntry(limit)
  1077  	i.maybeSampleRead()
  1078  	if i.Error() == nil && i.batch == nil {
  1079  		// Prepare state for a future noop optimization.
  1080  		i.prefixOrFullSeekKey = append(i.prefixOrFullSeekKey[:0], key...)
  1081  		i.lastPositioningOp = seekGELastPositioningOp
  1082  	}
  1083  	return i.iterValidityState
  1084  }
  1085  
  1086  // SeekPrefixGE moves the iterator to the first key/value pair whose key is
  1087  // greater than or equal to the given key and which has the same "prefix" as
  1088  // the given key. The prefix for a key is determined by the user-defined
  1089  // Comparer.Split function. The iterator will not observe keys not matching the
  1090  // "prefix" of the search key. Calling SeekPrefixGE puts the iterator in prefix
  1091  // iteration mode. The iterator remains in prefix iteration until a subsequent
  1092  // call to another absolute positioning method (SeekGE, SeekLT, First,
  1093  // Last). Reverse iteration (Prev) is not supported when an iterator is in
  1094  // prefix iteration mode. Returns true if the iterator is pointing at a valid
  1095  // entry and false otherwise.
  1096  //
  1097  // The semantics of SeekPrefixGE are slightly unusual and designed for
  1098  // iteration to be able to take advantage of bloom filters that have been
  1099  // created on the "prefix". If you're not using bloom filters, there is no
  1100  // reason to use SeekPrefixGE.
  1101  //
  1102  // An example Split function may separate a timestamp suffix from the prefix of
  1103  // the key.
  1104  //
  1105  //	Split(<key>@<timestamp>) -> <key>
  1106  //
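         // A minimal sketch of such a Split (hypothetical; assumes keys take the form
         // <key>@<timestamp> and '@' never appears within the prefix):
         //
         //	func split(key []byte) int {
         //		if idx := bytes.LastIndexByte(key, '@'); idx >= 0 {
         //			return idx // the prefix is everything before '@'
         //		}
         //		return len(key) // no suffix; the whole key is the prefix
         //	}
         //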
  1107  // Consider the keys "a@1", "a@2", "aa@3", "aa@4". The prefixes for these keys
   1108  // are "a" and "aa". Note that despite "a" and "aa" sharing a prefix by the
  1109  // usual definition, those prefixes differ by the definition of the Split
  1110  // function. To see how this works, consider the following set of calls on this
  1111  // data set:
  1112  //
  1113  //	SeekPrefixGE("a@0") -> "a@1"
  1114  //	Next()              -> "a@2"
  1115  //	Next()              -> EOF
  1116  //
  1117  // If you're just looking to iterate over keys with a shared prefix, as
  1118  // defined by the configured comparer, set iterator bounds instead:
  1119  //
  1120  //	iter := db.NewIter(&bitalostable.IterOptions{
  1121  //	  LowerBound: []byte("prefix"),
  1122  //	  UpperBound: []byte("prefiy"),
  1123  //	})
  1124  //	for iter.First(); iter.Valid(); iter.Next() {
  1125  //	  // Only keys beginning with "prefix" will be visited.
  1126  //	}
  1127  //
  1128  // See ExampleIterator_SeekPrefixGE for a working example.
  1129  //
  1130  // When iterating with range keys enabled, all range keys encountered are
  1131  // truncated to the seek key's prefix's bounds. The truncation of the upper
   1132  // bound requires that the database's Comparer is configured with an
  1133  // ImmediateSuccessor method. For example, a SeekPrefixGE("a@9") call with the
  1134  // prefix "a" will truncate range key bounds to [a,ImmediateSuccessor(a)].
  1135  func (i *Iterator) SeekPrefixGE(key []byte) bool {
  1136  	lastPositioningOp := i.lastPositioningOp
  1137  	// Set it to unknown, since this operation may not succeed, in which case
  1138  	// the SeekPrefixGE following this should not make any assumption about
  1139  	// iterator position.
  1140  	i.lastPositioningOp = unknownLastPositionOp
  1141  	i.requiresReposition = false
  1142  	i.err = nil
  1143  	i.stats.ForwardSeekCount[InterfaceCall]++
  1144  	if i.rangeKey != nil {
  1145  		i.rangeKey.updated = false
  1146  		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
  1147  	}
  1148  	if i.comparer.Split == nil {
  1149  		panic("bitalostable: split must be provided for SeekPrefixGE")
  1150  	}
  1151  	if i.comparer.ImmediateSuccessor == nil && i.opts.KeyTypes != IterKeyTypePointsOnly {
  1152  		panic("bitalostable: ImmediateSuccessor must be provided for SeekPrefixGE with range keys")
  1153  	}
  1154  	prefixLen := i.split(key)
  1155  	keyPrefix := key[:prefixLen]
  1156  	var flags base.SeekGEFlags
  1157  	if lastPositioningOp == seekPrefixGELastPositioningOp {
  1158  		if !i.hasPrefix {
  1159  			panic("lastPositioningOpsIsSeekPrefixGE is true, but hasPrefix is false")
  1160  		}
  1161  		// The iterator has not been repositioned after the last SeekPrefixGE.
  1162  		// See if we are seeking to a larger key, since then we can optimize
  1163  		// the seek by using next. Note that we could also optimize if Next
  1164  		// has been called, if the iterator is not exhausted and the current
  1165  		// position is <= the seek key. We are keeping this limited for now
  1166  		// since such optimizations require care for correctness, and to not
  1167  		// become de-optimizations (if one usually has to do all the next
  1168  		// calls and then the seek). This SeekPrefixGE optimization
  1169  		// specifically benefits CockroachDB.
  1170  		cmp := i.cmp(i.prefixOrFullSeekKey, keyPrefix)
  1171  		// cmp == 0 is not safe to optimize since
  1172  		// - i.pos could be at iterPosNext, due to a merge.
  1173  		// - Even if i.pos were at iterPosCurForward, we could have a DELETE,
  1174  		//   SET pair for a key, and the iterator would have moved past DELETE
  1175  		//   but stayed at iterPosCurForward. A similar situation occurs for a
  1176  		//   MERGE, SET pair where the MERGE is consumed and the iterator is
  1177  		//   at the SET.
  1178  		// In general some versions of i.prefix could have been consumed by
  1179  		// the iterator, so we only optimize for cmp < 0.
  1180  		if cmp < 0 {
  1181  			flags = flags.EnableTrySeekUsingNext()
  1182  		}
  1183  		if invariants.Enabled && flags.TrySeekUsingNext() && !i.forceEnableSeekOpt && disableSeekOpt(key, uintptr(unsafe.Pointer(i))) {
  1184  			flags = flags.DisableTrySeekUsingNext()
  1185  		}
  1186  	}
  1187  	// Make a copy of the prefix so that modifications to the key after
   1188  	// SeekPrefixGE returns do not affect the stored prefix.
  1189  	if cap(i.prefixOrFullSeekKey) < prefixLen {
  1190  		i.prefixOrFullSeekKey = make([]byte, prefixLen)
  1191  	} else {
  1192  		i.prefixOrFullSeekKey = i.prefixOrFullSeekKey[:prefixLen]
  1193  	}
  1194  	i.hasPrefix = true
  1195  	copy(i.prefixOrFullSeekKey, keyPrefix)
  1196  
  1197  	if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 {
  1198  		if n := i.split(lowerBound); !bytes.Equal(i.prefixOrFullSeekKey, lowerBound[:n]) {
  1199  			i.err = errors.New("bitalostable: SeekPrefixGE supplied with key outside of lower bound")
  1200  			i.iterValidityState = IterExhausted
  1201  			return false
  1202  		}
  1203  		key = lowerBound
  1204  	} else if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 {
  1205  		if n := i.split(upperBound); !bytes.Equal(i.prefixOrFullSeekKey, upperBound[:n]) {
  1206  			i.err = errors.New("bitalostable: SeekPrefixGE supplied with key outside of upper bound")
  1207  			i.iterValidityState = IterExhausted
  1208  			return false
  1209  		}
  1210  		key = upperBound
  1211  	}
  1212  	i.iterKey, i.iterValue = i.iter.SeekPrefixGE(i.prefixOrFullSeekKey, key, flags)
  1213  	i.stats.ForwardSeekCount[InternalIterCall]++
  1214  	i.findNextEntry(nil)
  1215  	i.maybeSampleRead()
  1216  	if i.Error() == nil {
  1217  		i.lastPositioningOp = seekPrefixGELastPositioningOp
  1218  	}
  1219  	return i.iterValidityState == IterValid
  1220  }
  1221  
  1222  // disableSeekOpt deterministically disables the seek optimization. It uses
  1223  // the iterator pointer, since we want diversity in iterator behavior for the
  1224  // same key. Used for tests.
  1225  func disableSeekOpt(key []byte, ptr uintptr) bool {
  1226  	// Fibonacci hash https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
  1227  	simpleHash := (11400714819323198485 * uint64(ptr)) >> 63
  1228  	return len(key) > 0 && key[0]&byte(1) == 0 && simpleHash == 0
  1229  }
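
        // NB: the Fibonacci multiplication above keeps only the top bit of the
        // 64-bit product (the shift by 63), so simpleHash is 0 for roughly half of
        // all iterator pointers. Combined with the even-first-byte test on the key,
        // the optimization is disabled deterministically for a given (key, iterator)
        // pair while still varying across iterators, which provides the diversity
        // described above.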
  1230  
  1231  // SeekLT moves the iterator to the last key/value pair whose key is less than
  1232  // the given key. Returns true if the iterator is pointing at a valid entry and
  1233  // false otherwise.
  1234  func (i *Iterator) SeekLT(key []byte) bool {
  1235  	return i.SeekLTWithLimit(key, nil) == IterValid
  1236  }
  1237  
  1238  // SeekLTWithLimit moves the iterator to the last key/value pair whose key is
  1239  // less than the given key.
  1240  //
  1241  // If limit is provided, it serves as a best-effort inclusive limit. If the last
  1242  // key less than the given search key is also less than limit, the Iterator may
  1243  // pause and return IterAtLimit. Because limits are best-effort, SeekLTWithLimit
  1244  // may return a key beyond limit.
  1245  //
  1246  // If the Iterator is configured to iterate over range keys, SeekLTWithLimit
  1247  // guarantees it will surface any range keys with bounds overlapping the
  1248  // keyspace up to limit.
  1249  func (i *Iterator) SeekLTWithLimit(key []byte, limit []byte) IterValidityState {
  1250  	lastPositioningOp := i.lastPositioningOp
  1251  	// Set it to unknown, since this operation may not succeed, in which case
  1252  	// the SeekLT following this should not make any assumption about iterator
  1253  	// position.
  1254  	i.lastPositioningOp = unknownLastPositionOp
  1255  	i.requiresReposition = false
  1256  	i.err = nil
  1257  	i.stats.ReverseSeekCount[InterfaceCall]++
  1258  	if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 {
  1259  		key = upperBound
  1260  	} else if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 {
  1261  		key = lowerBound
  1262  	}
  1263  	if i.rangeKey != nil {
  1264  		i.rangeKey.updated = false
  1265  		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
  1266  	}
  1267  	i.hasPrefix = false
  1268  	seekInternalIter := true
  1269  	// The following noop optimization only applies when i.batch == nil, since
  1270  	// an iterator over a batch is iterating over mutable data that may have
  1271  	// changed since the last seek.
  1272  	if lastPositioningOp == seekLTLastPositioningOp && i.batch == nil {
  1273  		cmp := i.cmp(key, i.prefixOrFullSeekKey)
  1274  		// If this seek is to the same or earlier key, and the iterator is
  1275  		// already positioned there, this is a noop. This can be helpful for
  1276  		// sparse key spaces that have many deleted keys, where one can avoid
  1277  		// the overhead of iterating past them again and again.
  1278  		if cmp <= 0 {
  1279  			// NB: when pos != iterPosCurReversePaused, the invariant
  1280  			// documented earlier implies that iterValidityState !=
  1281  			// IterAtLimit.
  1282  			if i.iterValidityState == IterExhausted ||
  1283  				(i.iterValidityState == IterValid && i.cmp(i.key, key) < 0 &&
  1284  					(limit == nil || i.cmp(limit, i.key) <= 0)) {
  1285  				if !invariants.Enabled || !disableSeekOpt(key, uintptr(unsafe.Pointer(i))) {
  1286  					i.lastPositioningOp = seekLTLastPositioningOp
  1287  					return i.iterValidityState
  1288  				}
  1289  			}
  1290  			if i.pos == iterPosCurReversePaused && i.cmp(i.iterKey.UserKey, key) < 0 {
  1291  				// Have some work to do, but don't need to seek, and we can
  1292  				// start doing findPrevEntry from i.iterKey.
  1293  				seekInternalIter = false
  1294  			}
  1295  		}
  1296  	}
  1297  	if seekInternalIter {
  1298  		i.iterKey, i.iterValue = i.iter.SeekLT(key, base.SeekLTFlagsNone)
  1299  		i.stats.ReverseSeekCount[InternalIterCall]++
  1300  	}
  1301  	i.findPrevEntry(limit)
  1302  	i.maybeSampleRead()
  1303  	if i.Error() == nil && i.batch == nil {
  1304  		// Prepare state for a future noop optimization.
  1305  		i.prefixOrFullSeekKey = append(i.prefixOrFullSeekKey[:0], key...)
  1306  		i.lastPositioningOp = seekLTLastPositioningOp
  1307  	}
  1308  	return i.iterValidityState
  1309  }
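
        // A limited reverse-seek sketch (illustrative only; it and the keys are
        // hypothetical). IterAtLimit means the iterator paused at the best-effort
        // inclusive limit instead of landing on a key:
        //
        //	switch it.SeekLTWithLimit([]byte("m"), []byte("g")) {
        //	case IterValid:
        //		_ = it.Key() // a key less than "m", typically at or above "g"
        //	case IterAtLimit:
        //		// Paused at the limit; continue with PrevWithLimit if more of
        //		// the keyspace below "g" is needed.
        //	case IterExhausted:
        //		// No key less than "m" within bounds.
        //	}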
  1310  
  1311  // First moves the iterator to the first key/value pair. Returns true if the
  1312  // iterator is pointing at a valid entry and false otherwise.
  1313  func (i *Iterator) First() bool {
  1314  	i.err = nil
  1315  	i.hasPrefix = false
  1316  	i.lastPositioningOp = unknownLastPositionOp
  1317  	i.requiresReposition = false
  1318  	i.stats.ForwardSeekCount[InterfaceCall]++
  1319  	if i.rangeKey != nil {
  1320  		i.rangeKey.updated = false
  1321  		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
  1322  	}
  1323  
  1324  	if lowerBound := i.opts.GetLowerBound(); lowerBound != nil {
  1325  		i.iterKey, i.iterValue = i.iter.SeekGE(lowerBound, base.SeekGEFlagsNone)
  1326  		i.stats.ForwardSeekCount[InternalIterCall]++
  1327  	} else {
  1328  		i.iterKey, i.iterValue = i.iter.First()
  1329  		i.stats.ForwardSeekCount[InternalIterCall]++
  1330  	}
  1331  	i.findNextEntry(nil)
  1332  	i.maybeSampleRead()
  1333  	return i.iterValidityState == IterValid
  1334  }
  1335  
  1336  // Last moves the iterator to the last key/value pair. Returns true if the
  1337  // iterator is pointing at a valid entry and false otherwise.
  1338  func (i *Iterator) Last() bool {
  1339  	i.err = nil
  1340  	i.hasPrefix = false
  1341  	i.lastPositioningOp = unknownLastPositionOp
  1342  	i.requiresReposition = false
  1343  	i.stats.ReverseSeekCount[InterfaceCall]++
  1344  	if i.rangeKey != nil {
  1345  		i.rangeKey.updated = false
  1346  		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
  1347  	}
  1348  
  1349  	if upperBound := i.opts.GetUpperBound(); upperBound != nil {
  1350  		i.iterKey, i.iterValue = i.iter.SeekLT(upperBound, base.SeekLTFlagsNone)
  1351  		i.stats.ReverseSeekCount[InternalIterCall]++
  1352  	} else {
  1353  		i.iterKey, i.iterValue = i.iter.Last()
  1354  		i.stats.ReverseSeekCount[InternalIterCall]++
  1355  	}
  1356  	i.findPrevEntry(nil)
  1357  	i.maybeSampleRead()
  1358  	return i.iterValidityState == IterValid
  1359  }
  1360  
  1361  // Next moves the iterator to the next key/value pair. Returns true if the
  1362  // iterator is pointing at a valid entry and false otherwise.
  1363  func (i *Iterator) Next() bool {
  1364  	return i.NextWithLimit(nil) == IterValid
  1365  }
  1366  
  1367  // NextWithLimit moves the iterator to the next key/value pair.
  1368  //
  1369  // If limit is provided, it serves as a best-effort exclusive limit. If the next
  1370  // key is greater than or equal to limit, the Iterator may pause and return
  1371  // IterAtLimit. Because limits are best-effort, NextWithLimit may return a key
  1372  // beyond limit.
  1373  //
  1374  // If the Iterator is configured to iterate over range keys, NextWithLimit
  1375  // guarantees it will surface any range keys with bounds overlapping the
  1376  // keyspace up to limit.
  1377  func (i *Iterator) NextWithLimit(limit []byte) IterValidityState {
  1378  	i.stats.ForwardStepCount[InterfaceCall]++
  1379  	if i.hasPrefix {
  1380  		if limit != nil {
  1381  			i.err = errors.New("cannot use limit with prefix iteration")
  1382  			i.iterValidityState = IterExhausted
  1383  			return i.iterValidityState
  1384  		} else if i.iterValidityState == IterExhausted {
  1385  			// No-op, already exhausted. We avoid executing the Next because it
  1386  			// can break invariants: Specifically, a file that fails the bloom
  1387  			// filter test may result in its level being removed from the
  1388  			// merging iterator. The level's removal can cause a lazy combined
  1389  			// iterator to miss range keys and trigger a switch to combined
  1390  			// iteration at a larger key, breaking keyspan invariants.
  1391  			return i.iterValidityState
  1392  		}
  1393  	}
  1394  	if i.err != nil {
  1395  		return i.iterValidityState
  1396  	}
  1397  	i.lastPositioningOp = unknownLastPositionOp
  1398  	i.requiresReposition = false
  1399  	if i.rangeKey != nil {
  1400  		i.rangeKey.updated = false
  1401  		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
  1402  	}
  1403  	switch i.pos {
  1404  	case iterPosCurForward:
  1405  		i.nextUserKey()
  1406  	case iterPosCurForwardPaused:
  1407  		// Already at the right place.
  1408  	case iterPosCurReverse:
  1409  		// Switching directions.
  1410  		// Unless the iterator was exhausted, reverse iteration needs to
  1411  		// position the iterator at iterPosPrev.
  1412  		if i.iterKey != nil {
  1413  			i.err = errors.New("switching from reverse to forward but iter is not at prev")
  1414  			i.iterValidityState = IterExhausted
  1415  			return i.iterValidityState
  1416  		}
  1417  		// We're positioned before the first key. Need to reposition to point to
  1418  		// the first key.
  1419  		if lowerBound := i.opts.GetLowerBound(); lowerBound != nil {
  1420  			i.iterKey, i.iterValue = i.iter.SeekGE(lowerBound, base.SeekGEFlagsNone)
  1421  			i.stats.ForwardSeekCount[InternalIterCall]++
  1422  		} else {
  1423  			i.iterKey, i.iterValue = i.iter.First()
  1424  			i.stats.ForwardSeekCount[InternalIterCall]++
  1425  		}
  1426  	case iterPosCurReversePaused:
  1427  		// Switching directions.
  1428  		// The iterator must not be exhausted since it paused.
  1429  		if i.iterKey == nil {
  1430  			i.err = errors.New("switching paused from reverse to forward but iter is exhausted")
  1431  			i.iterValidityState = IterExhausted
  1432  			return i.iterValidityState
  1433  		}
  1434  		i.nextUserKey()
  1435  	case iterPosPrev:
  1436  		// The underlying iterator is pointed to the previous key (this can
  1437  		// only happen when switching iteration directions). We set
  1438  		// i.iterValidityState to IterExhausted here to force the calls to
  1439  		// nextUserKey to save the current key i.iter is pointing at in order
  1440  		// to determine when the next user-key is reached.
  1441  		i.iterValidityState = IterExhausted
  1442  		if i.iterKey == nil {
  1443  			// We're positioned before the first key. Need to reposition to point to
  1444  			// the first key.
  1445  			if lowerBound := i.opts.GetLowerBound(); lowerBound != nil {
  1446  				i.iterKey, i.iterValue = i.iter.SeekGE(lowerBound, base.SeekGEFlagsNone)
  1447  				i.stats.ForwardSeekCount[InternalIterCall]++
  1448  			} else {
  1449  				i.iterKey, i.iterValue = i.iter.First()
  1450  				i.stats.ForwardSeekCount[InternalIterCall]++
  1451  			}
  1452  		} else {
  1453  			i.nextUserKey()
  1454  		}
  1455  		i.nextUserKey()
  1456  	case iterPosNext:
  1457  		// Already at the right place.
  1458  	}
  1459  	i.findNextEntry(limit)
  1460  	i.maybeSampleRead()
  1461  	return i.iterValidityState
  1462  }
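
        // A chunked forward-scan sketch (illustrative only; SeekGEWithLimit is
        // assumed to be the forward analogue of SeekLTWithLimit defined earlier in
        // this file, and start/segmentEnd/process are hypothetical). The exclusive
        // limit lets a caller process the keyspace in segments without giving up
        // the iterator position between segments:
        //
        //	validity := it.SeekGEWithLimit(start, segmentEnd)
        //	for validity == IterValid {
        //		process(it.Key())
        //		validity = it.NextWithLimit(segmentEnd)
        //	}
        //	// validity == IterAtLimit: paused at segmentEnd; resume the scan
        //	// later by calling NextWithLimit with the next segment's end.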
  1463  
  1464  // Prev moves the iterator to the previous key/value pair. Returns true if the
  1465  // iterator is pointing at a valid entry and false otherwise.
  1466  func (i *Iterator) Prev() bool {
  1467  	return i.PrevWithLimit(nil) == IterValid
  1468  }
  1469  
  1470  // PrevWithLimit moves the iterator to the previous key/value pair.
  1471  //
  1472  // If limit is provided, it serves as a best-effort inclusive limit. If the
  1473  // previous key is less than limit, the Iterator may pause and return
  1474  // IterAtLimit. Because limits are best-effort, PrevWithLimit may return a key
  1475  // beyond limit.
  1476  //
  1477  // If the Iterator is configured to iterate over range keys, PrevWithLimit
  1478  // guarantees it will surface any range keys with bounds overlapping the
  1479  // keyspace up to limit.
  1480  func (i *Iterator) PrevWithLimit(limit []byte) IterValidityState {
  1481  	i.stats.ReverseStepCount[InterfaceCall]++
  1482  	if i.err != nil {
  1483  		return i.iterValidityState
  1484  	}
  1485  	i.lastPositioningOp = unknownLastPositionOp
  1486  	i.requiresReposition = false
  1487  	if i.rangeKey != nil {
  1488  		i.rangeKey.updated = false
  1489  		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
  1490  	}
  1491  	if i.hasPrefix {
  1492  		i.err = errReversePrefixIteration
  1493  		i.iterValidityState = IterExhausted
  1494  		return i.iterValidityState
  1495  	}
  1496  	switch i.pos {
  1497  	case iterPosCurForward:
  1498  		// Switching directions, and will handle this below.
  1499  	case iterPosCurForwardPaused:
  1500  		// Switching directions, and will handle this below.
  1501  	case iterPosCurReverse:
  1502  		i.prevUserKey()
  1503  	case iterPosCurReversePaused:
  1504  		// Already at the right place.
  1505  	case iterPosNext:
  1506  		// The underlying iterator is pointed to the next key (this can only happen
  1507  		// when switching iteration directions). We will handle this below.
  1508  	case iterPosPrev:
  1509  		// Already at the right place.
  1510  	}
  1511  	if i.pos == iterPosCurForward || i.pos == iterPosNext || i.pos == iterPosCurForwardPaused {
  1512  		// Switching direction.
  1513  		stepAgain := i.pos == iterPosNext
  1514  
  1515  		// Synthetic range key markers are a special case. Consider SeekGE(b)
  1516  		// which finds a range key [a, c). To ensure the user observes the range
  1517  		// key, the Iterator pauses at Key() = b. The iterator must advance the
  1518  		// internal iterator to see if there's also a coincident point key at
  1519  		// 'b', leaving the iterator at iterPosNext if there's not.
  1520  		//
  1521  		// This is a problem: Synthetic range key markers are only interleaved
  1522  		// during the original seek. A subsequent Prev() of i.iter will not move
  1523  		// back onto the synthetic range key marker. In this case where the
  1524  		// previous iterator position was a synthetic range key start boundary,
  1525  		// we must not step a second time.
  1526  		if i.isEphemeralPosition() {
  1527  			stepAgain = false
  1528  		}
  1529  
  1530  		// We set i.iterValidityState to IterExhausted here to force the calls
  1531  		// to prevUserKey to save the current key i.iter is pointing at in
  1532  		// order to determine when the prev user-key is reached.
  1533  		i.iterValidityState = IterExhausted
  1534  		if i.iterKey == nil {
  1535  			// We're positioned after the last key. Need to reposition to point to
  1536  			// the last key.
  1537  			if upperBound := i.opts.GetUpperBound(); upperBound != nil {
  1538  				i.iterKey, i.iterValue = i.iter.SeekLT(upperBound, base.SeekLTFlagsNone)
  1539  				i.stats.ReverseSeekCount[InternalIterCall]++
  1540  			} else {
  1541  				i.iterKey, i.iterValue = i.iter.Last()
  1542  				i.stats.ReverseSeekCount[InternalIterCall]++
  1543  			}
  1544  		} else {
  1545  			i.prevUserKey()
  1546  		}
  1547  		if stepAgain {
  1548  			i.prevUserKey()
  1549  		}
  1550  	}
  1551  	i.findPrevEntry(limit)
  1552  	i.maybeSampleRead()
  1553  	return i.iterValidityState
  1554  }
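
        // A direction-switch sketch (illustrative only, assuming the database holds
        // at least two keys). PrevWithLimit transparently performs the repositioning
        // described in the switch statement above:
        //
        //	it.First() // position at the first key
        //	it.Next()  // advance; internally i.pos may now be iterPosNext
        //	it.Prev()  // switch direction; returns to the first key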
  1555  
  1556  // RangeKeyData describes a range key's data, set through RangeKeySet. The key
  1557  // boundaries of the range key are provided by Iterator.RangeBounds.
  1558  type RangeKeyData struct {
  1559  	Suffix []byte
  1560  	Value  []byte
  1561  }
  1562  
  1563  // rangeKeyWithinLimit is called during limited reverse iteration when
  1564  // positioned over a key beyond the limit. If there exists a range key that lies
  1565  // within the limit, the iterator must not pause in order to ensure the user has
  1566  // an opportunity to observe the range key within limit.
  1567  //
  1568  // It would be valid to ignore the limit whenever there's a range key covering
  1569  // the key, but that would introduce nondeterminism. To preserve determinism for
  1570  // testing, the iterator ignores the limit only if the covering range key does
  1571  // cover the keyspace within the limit.
  1572  //
  1573  // This awkwardness exists because range keys are interleaved at their inclusive
  1574  // start positions. Note that limit is inclusive.
  1575  func (i *Iterator) rangeKeyWithinLimit(limit []byte) bool {
  1576  	if i.rangeKey == nil || !i.opts.rangeKeys() {
  1577  		return false
  1578  	}
  1579  	s := i.rangeKey.iiter.Span()
  1580  	// If s.End is at or before the inclusive limit, the range key covers no
  1581  	// portion of the keyspace within the limit and it is safe to pause.
  1582  	return s != nil && i.cmp(s.End, limit) > 0
  1583  }
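
        // For example, with a saved span [a, c) and limit b: i.cmp("c", "b") > 0, so
        // the span still covers [b, c) and the iterator must not pause at b. With
        // limit d, i.cmp("c", "d") <= 0 and pausing is safe, because the span covers
        // no key at or above d.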
  1584  
  1585  // saveRangeKey saves the current range key to the underlying iterator's current
  1586  // range key state. If the range key has not changed, saveRangeKey is a no-op.
  1587  // If there is a new range key, saveRangeKey copies the range key's bounds and
  1588  // all of its suffixes and values into Iterator-managed buffers.
  1589  func (i *Iterator) saveRangeKey() {
  1590  	if i.rangeKey == nil || i.opts.KeyTypes == IterKeyTypePointsOnly {
  1591  		return
  1592  	}
  1593  
  1594  	s := i.rangeKey.iiter.Span()
  1595  	if s == nil {
  1596  		i.rangeKey.hasRangeKey = false
  1597  		i.rangeKey.updated = i.rangeKey.prevPosHadRangeKey
  1598  		return
  1599  	} else if !i.rangeKey.stale {
  1600  		// The range key `s` is identical to the one currently saved. No-op.
  1601  		return
  1602  	}
  1603  
  1604  	if s.KeysOrder != keyspan.BySuffixAsc {
  1605  		panic("bitalostable: range key span's keys unexpectedly not in ascending suffix order")
  1606  	}
  1607  
  1608  	// Although `i.rangeKey.stale` is true, the span s may still be identical
  1609  	// to the currently saved span. This is possible when seeking the iterator,
  1610  	// which may land back on the same range key. If we previously had a range
  1611  	// key and the new one has an identical start key, then it must be the same
  1612  	// range key and we can avoid copying and keep `i.rangeKey.updated=false`.
  1613  	//
  1614  	// TODO(jackson): These key comparisons could be avoidable during relative
  1615  	// positioning operations continuing in the same direction, because these
  1616  	// ops will never encounter the previous position's range key while
  1617  	// stale=true. However, threading whether the current op is a seek or step
  1618  	// maybe isn't worth it. This key comparison is only necessary once when we
  1619  	// step onto a new range key, which should be relatively rare.
  1620  	if i.rangeKey.prevPosHadRangeKey && i.equal(i.rangeKey.start, s.Start) &&
  1621  		i.equal(i.rangeKey.end, s.End) {
  1622  		i.rangeKey.updated = false
  1623  		i.rangeKey.stale = false
  1624  		i.rangeKey.hasRangeKey = true
  1625  		return
  1626  	}
  1627  
  1628  	i.rangeKey.hasRangeKey = true
  1629  	i.rangeKey.updated = true
  1630  	i.rangeKey.stale = false
  1631  	i.rangeKey.buf = append(i.rangeKey.buf[:0], s.Start...)
  1632  	i.rangeKey.start = i.rangeKey.buf
  1633  	i.rangeKey.buf = append(i.rangeKey.buf, s.End...)
  1634  	i.rangeKey.end = i.rangeKey.buf[len(i.rangeKey.buf)-len(s.End):]
  1635  	i.rangeKey.keys = i.rangeKey.keys[:0]
  1636  	for j := 0; j < len(s.Keys); j++ {
  1637  		if invariants.Enabled {
  1638  			if s.Keys[j].Kind() != base.InternalKeyKindRangeKeySet {
  1639  				panic("bitalostable: user iteration encountered non-RangeKeySet key kind")
  1640  			} else if j > 0 && i.cmp(s.Keys[j].Suffix, s.Keys[j-1].Suffix) < 0 {
  1641  				panic("bitalostable: user iteration encountered range keys not in suffix order")
  1642  			}
  1643  		}
  1644  		i.rangeKey.buf = append(i.rangeKey.buf, s.Keys[j].Suffix...)
  1645  		suffix := i.rangeKey.buf[len(i.rangeKey.buf)-len(s.Keys[j].Suffix):]
  1646  		i.rangeKey.buf = append(i.rangeKey.buf, s.Keys[j].Value...)
  1647  		value := i.rangeKey.buf[len(i.rangeKey.buf)-len(s.Keys[j].Value):]
  1648  		i.rangeKey.keys = append(i.rangeKey.keys, RangeKeyData{
  1649  			Suffix: suffix,
  1650  			Value:  value,
  1651  		})
  1652  	}
  1653  }
  1654  
  1655  // RangeKeyChanged indicates whether the most recent iterator positioning
  1656  // operation resulted in the iterator stepping into or out of a new range key.
  1657  // If true, previously returned range key bounds and data have been invalidated.
  1658  // If false, previously obtained range key bounds, suffix and value slices are
  1659  // still valid and may continue to be read.
  1660  //
  1661  // Invalid iterator positions are considered to not hold range keys, meaning
  1662  // that if an iterator steps from an IterExhausted or IterAtLimit position onto
  1663  // a position with a range key, RangeKeyChanged will yield true.
  1664  func (i *Iterator) RangeKeyChanged() bool {
  1665  	return i.iterValidityState == IterValid && i.rangeKey != nil && i.rangeKey.updated
  1666  }
  1667  
  1668  // HasPointAndRange indicates whether there exists a point key, a range key or
  1669  // both at the current iterator position.
  1670  func (i *Iterator) HasPointAndRange() (hasPoint, hasRange bool) {
  1671  	if i.iterValidityState != IterValid || i.requiresReposition {
  1672  		return false, false
  1673  	}
  1674  	if i.opts.KeyTypes == IterKeyTypePointsOnly {
  1675  		return true, false
  1676  	}
  1677  	return i.rangeKey == nil || !i.rangeKey.rangeKeyOnly, i.rangeKey != nil && i.rangeKey.hasRangeKey
  1678  }
  1679  
  1680  // RangeBounds returns the start (inclusive) and end (exclusive) bounds of the
  1681  // range key covering the current iterator position. RangeBounds returns nil
  1682  // bounds if there is no range key covering the current iterator position, or
  1683  // the iterator is not configured to surface range keys.
  1684  func (i *Iterator) RangeBounds() (start, end []byte) {
  1685  	if i.rangeKey == nil || !i.opts.rangeKeys() || !i.rangeKey.hasRangeKey {
  1686  		return nil, nil
  1687  	}
  1688  	return i.rangeKey.start, i.rangeKey.end
  1689  }
  1690  
  1691  // Key returns the key of the current key/value pair, or nil if done. The
  1692  // caller should not modify the contents of the returned slice, and its
  1693  // contents may change on the next call to Next.
  1694  func (i *Iterator) Key() []byte {
  1695  	return i.key
  1696  }
  1697  
  1698  // Value returns the value of the current key/value pair, or nil if done. The
  1699  // caller should not modify the contents of the returned slice, and its
  1700  // contents may change on the next call to Next.
  1701  //
  1702  // Only valid if HasPointAndRange() returns true for hasPoint.
  1703  func (i *Iterator) Value() []byte {
  1704  	return i.value
  1705  }
  1706  
  1707  // RangeKeys returns the range key values and their suffixes covering the
  1708  // current iterator position. The range bounds may be retrieved separately
  1709  // through Iterator.RangeBounds().
  1710  func (i *Iterator) RangeKeys() []RangeKeyData {
  1711  	if i.rangeKey == nil || !i.opts.rangeKeys() || !i.rangeKey.hasRangeKey {
  1712  		return nil
  1713  	}
  1714  	return i.rangeKey.keys
  1715  }
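
        // A range-key reading sketch (illustrative only; assumes an iterator opened
        // with KeyTypes set to IterKeyTypePointsAndRanges, and use is a hypothetical
        // callback):
        //
        //	if _, hasRange := it.HasPointAndRange(); hasRange {
        //		start, end := it.RangeBounds()
        //		for _, rk := range it.RangeKeys() {
        //			use(start, end, rk.Suffix, rk.Value)
        //		}
        //	}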
  1716  
  1717  // Valid returns true if the iterator is positioned at a valid key/value pair
  1718  // and false otherwise.
  1719  func (i *Iterator) Valid() bool {
  1720  	return i.iterValidityState == IterValid && !i.requiresReposition
  1721  }
  1722  
  1723  // Error returns any accumulated error.
  1724  func (i *Iterator) Error() error {
  1725  	if i.iter != nil {
  1726  		return firstError(i.err, i.iter.Error())
  1727  	}
  1728  	return i.err
  1729  }
  1730  
  1731  // Close closes the iterator and returns any accumulated error. Exhausting
  1732  // all the key/value pairs in a table is not considered to be an error.
  1733  // It is not valid to call any method, including Close, after the iterator
  1734  // has been closed.
  1735  func (i *Iterator) Close() error {
  1736  	// Close the child iterator before releasing the readState, because once
  1737  	// the readState is released, sstables referenced by it may be deleted,
  1738  	// which will fail on Windows if the sstables are still held open by the
  1739  	// child iterator.
  1740  	if i.iter != nil {
  1741  		i.err = firstError(i.err, i.iter.Close())
  1742  
  1743  		// Closing i.iter did not necessarily close the point and range key
  1744  		// iterators. Calls to SetOptions may have 'disconnected' either one
  1745  		// from i.iter if iteration key types were changed. Both point and range
  1746  		// key iterators are preserved in case the iterator needs to switch key
  1747  		// types again. We explicitly close both of these iterators here.
  1748  		//
  1749  		// NB: If the iterators were still connected to i.iter, they may be
  1750  		// closed, but calling Close on a closed internal iterator or fragment
  1751  		// iterator is allowed.
  1752  		if i.pointIter != nil && !i.closePointIterOnce {
  1753  			i.err = firstError(i.err, i.pointIter.Close())
  1754  		}
  1755  		if i.rangeKey != nil && i.rangeKey.rangeKeyIter != nil {
  1756  			i.err = firstError(i.err, i.rangeKey.rangeKeyIter.Close())
  1757  		}
  1758  	}
  1759  	err := i.err
  1760  
  1761  	if i.readState != nil {
  1762  		if i.readSampling.pendingCompactions.size > 0 {
  1763  			// Merge pending read compactions into the DB's queue while holding db.mu.
  1764  			i.readState.db.mu.Lock()
  1765  			i.readState.db.mu.compact.readCompactions.combine(&i.readSampling.pendingCompactions, i.cmp)
  1766  			reschedule := i.readState.db.mu.compact.rescheduleReadCompaction
  1767  			i.readState.db.mu.compact.rescheduleReadCompaction = false
  1768  			concurrentCompactions := i.readState.db.mu.compact.compactingCount
  1769  			i.readState.db.mu.Unlock()
  1770  
  1771  			if reschedule && concurrentCompactions == 0 {
  1772  				// In a read heavy workload, flushes may not happen frequently enough to
  1773  				// schedule compactions.
  1774  				i.readState.db.compactionSchedulers.Add(1)
  1775  				go i.readState.db.maybeScheduleCompactionAsync()
  1776  			}
  1777  		}
  1778  
  1779  		i.readState.unref()
  1780  		i.readState = nil
  1781  	}
  1782  
  1783  	for _, readers := range i.externalReaders {
  1784  		for _, r := range readers {
  1785  			err = firstError(err, r.Close())
  1786  		}
  1787  	}
  1788  
  1789  	// Close the closer for the current value if one was open.
  1790  	if i.valueCloser != nil {
  1791  		err = firstError(err, i.valueCloser.Close())
  1792  		i.valueCloser = nil
  1793  	}
  1794  
  1795  	const maxKeyBufCacheSize = 4 << 10 // 4 KB
  1796  
  1797  	if i.rangeKey != nil {
  1798  		// Avoid caching the key buf if it is overly large. The constant is
  1799  		// fairly arbitrary.
  1800  		if cap(i.rangeKey.buf) >= maxKeyBufCacheSize {
  1801  			i.rangeKey.buf = nil
  1802  		}
  1803  		*i.rangeKey = iteratorRangeKeyState{buf: i.rangeKey.buf}
  1804  		iterRangeKeyStateAllocPool.Put(i.rangeKey)
  1805  		i.rangeKey = nil
  1806  	}
  1807  	if alloc := i.alloc; alloc != nil {
  1808  		// Avoid caching the key buf if it is overly large. The constant is fairly
  1809  		// arbitrary.
  1810  		if cap(i.keyBuf) >= maxKeyBufCacheSize {
  1811  			alloc.keyBuf = nil
  1812  		} else {
  1813  			alloc.keyBuf = i.keyBuf
  1814  		}
  1815  		if cap(i.prefixOrFullSeekKey) >= maxKeyBufCacheSize {
  1816  			alloc.prefixOrFullSeekKey = nil
  1817  		} else {
  1818  			alloc.prefixOrFullSeekKey = i.prefixOrFullSeekKey
  1819  		}
  1820  		for j := range i.boundsBuf {
  1821  			if cap(i.boundsBuf[j]) >= maxKeyBufCacheSize {
  1822  				alloc.boundsBuf[j] = nil
  1823  			} else {
  1824  				alloc.boundsBuf[j] = i.boundsBuf[j]
  1825  			}
  1826  		}
  1827  		*alloc = iterAlloc{
  1828  			keyBuf:              alloc.keyBuf,
  1829  			boundsBuf:           alloc.boundsBuf,
  1830  			prefixOrFullSeekKey: alloc.prefixOrFullSeekKey,
  1831  		}
  1832  		iterAllocPool.Put(alloc)
  1833  	} else if alloc := i.getIterAlloc; alloc != nil {
  1834  		if cap(i.keyBuf) >= maxKeyBufCacheSize {
  1835  			alloc.keyBuf = nil
  1836  		} else {
  1837  			alloc.keyBuf = i.keyBuf
  1838  		}
  1839  		*alloc = getIterAlloc{
  1840  			keyBuf: alloc.keyBuf,
  1841  		}
  1842  		getIterAllocPool.Put(alloc)
  1843  	}
  1844  	return err
  1845  }
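
        // A typical lifecycle sketch (illustrative only; db is hypothetical). Close
        // must be called exactly once, and it surfaces any error accumulated during
        // iteration:
        //
        //	it := db.NewIter(nil)
        //	for it.First(); it.Valid(); it.Next() {
        //		_ = it.Key()
        //	}
        //	if err := it.Close(); err != nil {
        //		// handle iteration or close error
        //	}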
  1846  
  1847  // SetBounds sets the lower and upper bounds for the iterator. Once SetBounds
  1848  // returns, the caller is free to mutate the provided slices.
  1849  //
  1850  // The iterator will always be invalidated and must be repositioned with a call
  1851  // to SeekGE, SeekPrefixGE, SeekLT, First, or Last.
  1852  func (i *Iterator) SetBounds(lower, upper []byte) {
  1853  	// Ensure that the Iterator appears exhausted, regardless of whether we
  1854  	// actually have to invalidate the internal iterator. Optimizations that
  1855  	// avoid exhaustion are an internal implementation detail that shouldn't
  1856  	// leak through the interface. The caller should still call an absolute
  1857  	// positioning method to reposition the iterator.
  1858  	i.requiresReposition = true
  1859  
  1860  	if ((i.opts.LowerBound == nil) == (lower == nil)) &&
  1861  		((i.opts.UpperBound == nil) == (upper == nil)) &&
  1862  		i.equal(i.opts.LowerBound, lower) &&
  1863  		i.equal(i.opts.UpperBound, upper) {
  1864  		// Unchanged, noop.
  1865  		return
  1866  	}
  1867  
  1868  	// Copy the user-provided bounds into an Iterator-owned buffer, and set them
  1869  	// on i.opts.{Lower,Upper}Bound.
  1870  	i.saveBounds(lower, upper)
  1871  
  1872  	i.iter.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
  1873  	// If the iterator has an open point iterator that's not currently being
  1874  	// used, propagate the new bounds to it.
  1875  	if i.pointIter != nil && !i.opts.pointKeys() {
  1876  		i.pointIter.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
  1877  	}
  1878  	// If the iterator has a range key iterator, propagate bounds to it. The
  1879  	// top-level SetBounds on the interleaving iterator (i.iter) won't propagate
  1880  	// bounds to the range key iterator stack, because the FragmentIterator
  1881  	// interface doesn't define a SetBounds method. We need to directly inform
  1882  	// the iterConfig stack.
  1883  	if i.rangeKey != nil {
  1884  		i.rangeKey.iterConfig.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
  1885  	}
  1886  
  1887  	// Even though this is not a positioning operation, the alteration of the
  1888  	// bounds means we cannot optimize Seeks by using Next.
  1889  	i.invalidate()
  1890  }
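
        // A bounds-change sketch (illustrative only; the keys are hypothetical).
        // SetBounds invalidates the position, so an absolute positioning call must
        // follow:
        //
        //	it.SetBounds([]byte("b"), []byte("f"))
        //	for it.First(); it.Valid(); it.Next() {
        //		_ = it.Key() // keys are now confined to [b, f)
        //	}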
  1891  
  1892  func (i *Iterator) saveBounds(lower, upper []byte) {
  1893  	// Copy the user-provided bounds into an Iterator-owned buffer. We can't
  1894  	// overwrite the current bounds, because some internal iterators compare old
  1895  	// and new bounds for optimizations.
  1896  
  1897  	buf := i.boundsBuf[i.boundsBufIdx][:0]
  1898  	if lower != nil {
  1899  		buf = append(buf, lower...)
  1900  		i.opts.LowerBound = buf
  1901  	} else {
  1902  		i.opts.LowerBound = nil
  1903  	}
  1904  	if upper != nil {
  1905  		buf = append(buf, upper...)
  1906  		i.opts.UpperBound = buf[len(buf)-len(upper):]
  1907  	} else {
  1908  		i.opts.UpperBound = nil
  1909  	}
  1910  	i.boundsBuf[i.boundsBufIdx] = buf
  1911  	i.boundsBufIdx = 1 - i.boundsBufIdx
  1912  }
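
        // NB: boundsBuf is a two-element array and boundsBufIdx alternates between
        // 0 and 1 on each call, so the bounds installed by the previous
        // SetBounds/SetOptions call stay intact in the other buffer while the new
        // ones are written. This is what allows internal iterators to compare old
        // and new bounds, per the comment above.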
  1913  
  1914  // SetOptions sets new iterator options for the iterator. Note that the lower
  1915  // and upper bounds applied here will supersede any bounds set by previous calls
  1916  // to SetBounds.
  1917  //
  1918  // Note that the slices provided in this SetOptions must not be changed by the
  1919  // caller until the iterator is closed, or a subsequent SetBounds or SetOptions
  1920  // has returned. This is because comparisons between the existing and new bounds
  1921  // are sometimes used to optimize seeking. See the extended commentary on
  1922  // SetBounds.
  1923  //
  1924  // If the iterator was created over an indexed mutable batch, the iterator's
  1925  // view of the mutable batch is refreshed.
  1926  //
  1927  // The iterator will always be invalidated and must be repositioned with a call
  1928  // to SeekGE, SeekPrefixGE, SeekLT, First, or Last.
  1929  //
  1930  // If only lower and upper bounds need to be modified, prefer SetBounds.
  1931  func (i *Iterator) SetOptions(o *IterOptions) {
  1932  	if i.externalReaders != nil {
  1933  		if err := validateExternalIterOpts(o); err != nil {
  1934  			panic(err)
  1935  		}
  1936  	}
  1937  
  1938  	// Ensure that the Iterator appears exhausted, regardless of whether we
  1939  	// actually have to invalidate the internal iterator. Optimizations that
  1940  	// avoid exhaustion are an internal implementation detail that shouldn't
  1941  	// leak through the interface. The caller should still call an absolute
  1942  	// positioning method to reposition the iterator.
  1943  	i.requiresReposition = true
  1944  
  1945  	// Check if global state requires we close all internal iterators.
  1946  	//
  1947  	// If the Iterator is in an error state, invalidate the existing iterators
  1948  	// so that we reconstruct an iterator state from scratch.
  1949  	//
  1950  	// If OnlyReadGuaranteedDurable changed, the iterator stacks are incorrect,
  1951  	// improperly including or excluding memtables. Invalidate them so that
  1952  	// finishInitializingIter will reconstruct them.
  1953  	//
  1954  	// If either the original options or the new options specify a table filter,
  1955  	// we need to reconstruct the iterator stacks. If they both supply a table
  1956  	// filter, we can't be certain that it's the same filter since we have no
  1957  	// mechanism to compare the filter closures.
  1958  	closeBoth := i.err != nil ||
  1959  		o.OnlyReadGuaranteedDurable != i.opts.OnlyReadGuaranteedDurable ||
  1960  		o.TableFilter != nil || i.opts.TableFilter != nil
  1961  
  1962  	// If either options specify block property filters for an iterator stack,
  1963  	// reconstruct it.
  1964  	if i.pointIter != nil && (closeBoth || len(o.PointKeyFilters) > 0 || len(i.opts.PointKeyFilters) > 0 ||
  1965  		o.RangeKeyMasking.Filter != nil || i.opts.RangeKeyMasking.Filter != nil) {
  1966  		i.err = firstError(i.err, i.pointIter.Close())
  1967  		i.pointIter = nil
  1968  	}
  1969  	if i.rangeKey != nil {
  1970  		if closeBoth || len(o.RangeKeyFilters) > 0 || len(i.opts.RangeKeyFilters) > 0 {
  1971  			i.err = firstError(i.err, i.rangeKey.rangeKeyIter.Close())
  1972  			i.rangeKey = nil
  1973  		} else {
  1974  			// If there's still a range key iterator stack, invalidate the
  1975  			// iterator. This ensures RangeKeyChanged() returns true if a
  1976  			// subsequent positioning operation discovers a range key. It also
  1977  			// prevents seek no-op optimizations.
  1978  			i.invalidate()
  1979  		}
  1980  	}
  1981  
  1982  	// If the iterator is backed by a batch that's been mutated, refresh its
  1983  	// existing point and range-key iterators, and invalidate the iterator to
  1984  	// prevent seek-using-next optimizations. If we don't yet have a point-key
  1985  	// iterator or range-key iterator but we require one, it'll be created in
  1986  	// the slow path that reconstructs the iterator in finishInitializingIter.
  1987  	if i.batch != nil {
  1988  		nextBatchSeqNum := (uint64(len(i.batch.data)) | base.InternalKeySeqNumBatch)
  1989  		if nextBatchSeqNum != i.batchSeqNum {
  1990  			i.batchSeqNum = nextBatchSeqNum
  1991  			if i.pointIter != nil {
  1992  				if i.batch.countRangeDels == 0 {
  1993  					// No range deletions exist in the batch. We only need to
  1994  					// update the batchIter's snapshot.
  1995  					i.batchPointIter.snapshot = nextBatchSeqNum
  1996  					i.invalidate()
  1997  				} else if i.batchRangeDelIter.Count() == 0 {
  1998  					// When we constructed this iterator, there were no
  1999  					// rangedels in the batch. Iterator construction will have
  2000  					// excluded the batch rangedel iterator from the point
  2001  					// iterator stack. We need to reconstruct the point iterator
  2002  					// to add i.batchRangeDelIter into the iterator stack.
  2003  					i.err = firstError(i.err, i.pointIter.Close())
  2004  					i.pointIter = nil
  2005  				} else {
  2006  					// There are range deletions in the batch and we already
  2007  					// have a batch rangedel iterator. We can update the batch
  2008  					// rangedel iterator in place.
  2009  					//
  2010  					// NB: There may or may not be new range deletions. We can't
  2011  					// tell based on i.batchRangeDelIter.Count(), which is the
  2012  					// count of fragmented range deletions, NOT the number of
  2013  					// range deletions written to the batch
  2014  					// [i.batch.countRangeDels].
  2015  					i.batchPointIter.snapshot = nextBatchSeqNum
  2016  					i.batch.initRangeDelIter(&i.opts, &i.batchRangeDelIter, nextBatchSeqNum)
  2017  					i.invalidate()
  2018  				}
  2019  			}
  2020  			if i.rangeKey != nil && i.batch.countRangeKeys > 0 {
  2021  				if i.batchRangeKeyIter.Count() == 0 {
  2022  					// When we constructed this iterator, there were no range
  2023  					// keys in the batch. Iterator construction will have
  2024  					// excluded the batch rangekey iterator from the range key
  2025  					// iterator stack. We need to reconstruct the range key
  2026  					// iterator to add i.batchRangeKeyIter into the iterator
  2027  					// stack.
  2028  					i.err = firstError(i.err, i.rangeKey.rangeKeyIter.Close())
  2029  					i.rangeKey = nil
  2030  				} else {
  2031  					// There are range keys in the batch and we already
  2032  					// have a batch rangekey iterator. We can update the batch
  2033  					// rangekey iterator in place.
  2034  					//
  2035  					// NB: There may or may not be new range keys. We can't
  2036  					// tell based on i.batchRangeKeyIter.Count(), which is the
  2037  					// count of fragmented range keys, NOT the number of
  2038  					// range keys written to the batch [i.batch.countRangeKeys].
  2039  					i.batch.initRangeKeyIter(&i.opts, &i.batchRangeKeyIter, nextBatchSeqNum)
  2040  					i.invalidate()
  2041  				}
  2042  			}
  2043  		}
  2044  	}
  2045  
  2046  	// Reset combinedIterState.initialized in case the iterator key types
  2047  	// changed. If there's already a range key iterator stack, the combined
  2048  	// iterator is already initialized. Additionally, if the iterator is not
  2049  	// configured to include range keys, mark it as initialized to signal that
  2050  	// lower level iterators should not trigger a switch to combined iteration.
  2051  	i.lazyCombinedIter.combinedIterState = combinedIterState{
  2052  		initialized: i.rangeKey != nil || !i.opts.rangeKeys(),
  2053  	}
  2054  
  2055  	boundsEqual := ((i.opts.LowerBound == nil) == (o.LowerBound == nil)) &&
  2056  		((i.opts.UpperBound == nil) == (o.UpperBound == nil)) &&
  2057  		i.equal(i.opts.LowerBound, o.LowerBound) &&
  2058  		i.equal(i.opts.UpperBound, o.UpperBound)
  2059  
  2060  	if boundsEqual && o.KeyTypes == i.opts.KeyTypes &&
  2061  		(i.pointIter != nil || !i.opts.pointKeys()) &&
  2062  		(i.rangeKey != nil || !i.opts.rangeKeys() || i.opts.KeyTypes == IterKeyTypePointsAndRanges) &&
  2063  		i.equal(o.RangeKeyMasking.Suffix, i.opts.RangeKeyMasking.Suffix) &&
  2064  		o.UseL6Filters == i.opts.UseL6Filters {
  2065  		// The options are identical, so we can likely use the fast path. In
  2066  		// addition to all the above constraints, we cannot use the fast path if
  2067  		// configured to perform lazy combined iteration but an indexed batch
  2068  		// used by the iterator now contains range keys. Lazy combined iteration
  2069  		// is not compatible with batch range keys because we always need to
  2070  		// merge the batch's range keys into iteration.
  2071  		if i.rangeKey != nil || !i.opts.rangeKeys() || i.batch == nil || i.batch.countRangeKeys == 0 {
  2072  			// Fast path. This preserves the Seek-using-Next optimizations as
  2073  			// long as the iterator wasn't already invalidated up above.
  2074  			return
  2075  		}
  2076  	}
  2077  	// Slow path.
  2078  
  2079  	// The options changed. Save the new ones to i.opts.
  2080  	if boundsEqual {
  2081  		// Copying the options into i.opts will overwrite LowerBound and
  2082  		// UpperBound fields with the user-provided slices. We need to hold on
  2083  		// to the Pebble-owned slices, so save them and re-set them after the
  2084  		// copy.
  2085  		lower, upper := i.opts.LowerBound, i.opts.UpperBound
  2086  		i.opts = *o
  2087  		i.opts.LowerBound, i.opts.UpperBound = lower, upper
  2088  	} else {
  2089  		i.opts = *o
  2090  		i.saveBounds(o.LowerBound, o.UpperBound)
  2091  		// Propagate the changed bounds to the existing point iterator.
  2092  		// NB: We propagate i.opts.{Lower,Upper}Bound, not o.{Lower,Upper}Bound
  2093  		// because i.opts.{Lower,Upper}Bound now point to Pebble-owned buffers.
  2094  		if i.pointIter != nil {
  2095  			i.pointIter.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
  2096  		}
  2097  		if i.rangeKey != nil {
  2098  			i.rangeKey.iterConfig.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
  2099  		}
  2100  	}
  2101  
  2102  	// Even though this is not a positioning operation, the invalidation of the
  2103  	// iterator stack means we cannot optimize Seeks by using Next.
  2104  	i.invalidate()
  2105  
  2106  	// Iterators created through NewExternalIter have a different iterator
  2107  	// initialization process.
  2108  	if i.externalReaders != nil {
  2109  		finishInitializingExternal(i)
  2110  		return
  2111  	}
  2112  	finishInitializingIter(i.alloc)
  2113  }
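
        // An options-change sketch (illustrative only): enabling combined point and
        // range key iteration on a live iterator, followed by the required
        // repositioning:
        //
        //	opts := IterOptions{KeyTypes: IterKeyTypePointsAndRanges}
        //	it.SetOptions(&opts)
        //	it.First() // SetOptions always invalidates; must reposition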
  2114  
  2115  func (i *Iterator) invalidate() {
  2116  	i.lastPositioningOp = unknownLastPositionOp
  2117  	i.hasPrefix = false
  2118  	i.iterKey = nil
  2119  	i.iterValue = nil
  2120  	i.err = nil
  2121  	// This switch statement isn't necessary for correctness since callers
  2122  	// should call a repositioning method. We could have set i.pos to an
  2123  	// arbitrary value, but preserving the iteration direction results in more
  2124  	// intuitive behavior in tests, which do not always reposition.
  2125  	switch i.pos {
  2126  	case iterPosCurForward, iterPosNext, iterPosCurForwardPaused:
  2127  		i.pos = iterPosCurForward
  2128  	case iterPosCurReverse, iterPosPrev, iterPosCurReversePaused:
  2129  		i.pos = iterPosCurReverse
  2130  	}
  2131  	i.iterValidityState = IterExhausted
  2132  	if i.rangeKey != nil {
  2133  		i.rangeKey.iiter.Invalidate()
  2134  	}
  2135  }
  2136  
  2137  // Metrics returns per-iterator metrics.
  2138  func (i *Iterator) Metrics() IteratorMetrics {
  2139  	m := IteratorMetrics{
  2140  		ReadAmp: 1,
  2141  	}
  2142  	if mi, ok := i.iter.(*mergingIter); ok {
  2143  		m.ReadAmp = len(mi.levels)
  2144  	}
  2145  	return m
  2146  }
  2147  
  2148  // ResetStats resets the stats to 0.
  2149  func (i *Iterator) ResetStats() {
  2150  	i.stats = IteratorStats{}
  2151  }
  2152  
  2153  // Stats returns the current stats.
  2154  func (i *Iterator) Stats() IteratorStats {
  2155  	return i.stats
  2156  }
  2157  
  2158  // CloneOptions configures an iterator constructed through Iterator.Clone.
  2159  type CloneOptions struct {
  2160  	// IterOptions, if non-nil, defines the iterator options to configure a
  2161  	// cloned iterator. If nil, the clone adopts the same IterOptions as the
  2162  	// iterator being cloned.
  2163  	IterOptions *IterOptions
  2164  	// RefreshBatchView may be set to true when cloning an Iterator over an
  2165  	// indexed batch. When false, the clone adopts the same (possibly stale)
  2166  	// view of the indexed batch as the cloned Iterator. When true, the clone is
  2167  	// constructed with a refreshed view of the batch, observing all of the
  2168  	// batch's mutations at the time of the Clone. If the cloned iterator was
  2169  	// not constructed to read over an indexed batch, RefreshVatchView has no
  2170  	// not constructed to read over an indexed batch, RefreshBatchView has no
  2171  	RefreshBatchView bool
  2172  }
  2173  
  2174  // Clone creates a new Iterator over the same underlying data, i.e., over the
  2175  // same {batch, memtables, sstables}. The resulting iterator is not positioned.
  2176  // It starts with the same IterOptions, unless opts.IterOptions is set.
  2177  //
  2178  // When called on an Iterator over an indexed batch, the clone's visibility of
  2179  // the indexed batch is determined by CloneOptions.RefreshBatchView. If false,
  2180  // the clone inherits the iterator's current (possibly stale) view of the batch,
  2181  // and callers may call SetOptions to subsequently refresh the clone's view to
  2182  // include all batch mutations. If true, the clone is constructed with a
  2183  // complete view of the indexed batch's mutations at the time of the Clone.
  2184  //
  2185  // Callers can use Clone if they need multiple iterators that need to see
  2186  // exactly the same underlying state of the DB. This should not be used to
  2187  // extend the lifetime of the data backing the original Iterator since that
  2188  // will cause an increase in memory and disk usage (use NewSnapshot for that
  2189  // purpose).
  2190  func (i *Iterator) Clone(opts CloneOptions) (*Iterator, error) {
  2191  	if opts.IterOptions == nil {
  2192  		opts.IterOptions = &i.opts
  2193  	}
  2194  
  2195  	readState := i.readState
  2196  	if readState == nil {
  2197  		return nil, errors.Errorf("cannot Clone a closed Iterator")
  2198  	}
  2199  	// i is already holding a ref, so there is no race with unref here.
  2200  	readState.ref()
  2201  	// Bundle various structures under a single umbrella in order to allocate
  2202  	// them together.
  2203  	buf := iterAllocPool.Get().(*iterAlloc)
  2204  	dbi := &buf.dbi
  2205  	*dbi = Iterator{
  2206  		opts:                *opts.IterOptions,
  2207  		alloc:               buf,
  2208  		merge:               i.merge,
  2209  		comparer:            i.comparer,
  2210  		readState:           readState,
  2211  		keyBuf:              buf.keyBuf,
  2212  		prefixOrFullSeekKey: buf.prefixOrFullSeekKey,
  2213  		boundsBuf:           buf.boundsBuf,
  2214  		batch:               i.batch,
  2215  		batchSeqNum:         i.batchSeqNum,
  2216  		newIters:            i.newIters,
  2217  		newIterRangeKey:     i.newIterRangeKey,
  2218  		seqNum:              i.seqNum,
  2219  	}
  2220  	dbi.saveBounds(dbi.opts.LowerBound, dbi.opts.UpperBound)
  2221  
  2222  	// If the caller requested the clone have a current view of the indexed
  2223  	// batch, set the clone's batch sequence number appropriately.
  2224  	if i.batch != nil && opts.RefreshBatchView {
  2225  		dbi.batchSeqNum = (uint64(len(i.batch.data)) | base.InternalKeySeqNumBatch)
  2226  	}
  2227  
  2228  	return finishInitializingIter(buf), nil
  2229  }
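
        // A clone sketch (illustrative only; SeekGE is the absolute positioning
        // method assumed to be defined earlier in this file). The clone reads the
        // same DB state as the original and must be closed independently:
        //
        //	clone, err := it.Clone(CloneOptions{})
        //	if err != nil {
        //		return err // the original iterator was already closed
        //	}
        //	defer clone.Close()
        //	clone.SeekGE([]byte("k")) // the clone starts unpositioned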
  2230  
  2231  func (stats *IteratorStats) String() string {
  2232  	return redact.StringWithoutMarkers(stats)
  2233  }
  2234  
  2235  // SafeFormat implements the redact.SafeFormatter interface.
  2236  func (stats *IteratorStats) SafeFormat(s redact.SafePrinter, verb rune) {
  2237  	for i := range stats.ForwardStepCount {
  2238  		switch IteratorStatsKind(i) {
  2239  		case InterfaceCall:
  2240  			s.SafeString("(interface (dir, seek, step): ")
  2241  		case InternalIterCall:
  2242  			s.SafeString(", (internal (dir, seek, step): ")
  2243  		}
  2244  		s.Printf("(fwd, %d, %d), (rev, %d, %d))",
  2245  			redact.Safe(stats.ForwardSeekCount[i]), redact.Safe(stats.ForwardStepCount[i]),
  2246  			redact.Safe(stats.ReverseSeekCount[i]), redact.Safe(stats.ReverseStepCount[i]))
  2247  	}
  2248  	if stats.InternalStats != (InternalIteratorStats{}) {
  2249  		s.SafeString(",\n(internal-stats: ")
  2250  		s.Printf("(block-bytes: (total %s, cached %s)), "+
  2251  			"(points: (count %s, key-bytes %s, value-bytes %s, tombstoned: %s))",
  2252  			humanize.IEC.Uint64(stats.InternalStats.BlockBytes),
  2253  			humanize.IEC.Uint64(stats.InternalStats.BlockBytesInCache),
  2254  			humanize.SI.Uint64(stats.InternalStats.PointCount),
  2255  			humanize.SI.Uint64(stats.InternalStats.KeyBytes),
  2256  			humanize.SI.Uint64(stats.InternalStats.ValueBytes),
  2257  			humanize.SI.Uint64(stats.InternalStats.PointsCoveredByRangeTombstones),
  2258  		)
  2259  	}
  2260  }