github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/range_keys.go

     1  // Copyright 2021 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package bitalostable
     6  
     7  import (
     8  	"github.com/zuoyebang/bitalostable/internal/base"
     9  	"github.com/zuoyebang/bitalostable/internal/invariants"
    10  	"github.com/zuoyebang/bitalostable/internal/keyspan"
    11  	"github.com/zuoyebang/bitalostable/internal/manifest"
    12  	"github.com/zuoyebang/bitalostable/sstable"
    13  )
    14  
    15  // constructRangeKeyIter constructs the range-key iterator stack, populating
    16  // i.rangeKey.rangeKeyIter with the resulting iterator.
    17  func (i *Iterator) constructRangeKeyIter() {
    18  	i.rangeKey.rangeKeyIter = i.rangeKey.iterConfig.Init(
    19  		&i.comparer, i.seqNum, i.opts.LowerBound, i.opts.UpperBound,
    20  		&i.hasPrefix, &i.prefixOrFullSeekKey)
    21  
    22  	// If there's an indexed batch with range keys, include it.
    23  	if i.batch != nil {
    24  		if i.batch.index == nil {
    25  			i.rangeKey.iterConfig.AddLevel(newErrorKeyspanIter(ErrNotIndexed))
    26  		} else {
    27  			// Only include the batch's range key iterator if it has any keys.
    28  			// NB: This can force reconstruction of the rangekey iterator stack
    29  			// in SetOptions if range keys are subsequently added. See
    30  			// SetOptions.
    31  			if i.batch.countRangeKeys > 0 {
    32  				i.batch.initRangeKeyIter(&i.opts, &i.batchRangeKeyIter, i.batchSeqNum)
    33  				i.rangeKey.iterConfig.AddLevel(&i.batchRangeKeyIter)
    34  			}
    35  		}
    36  	}
    37  
    38  	// Next are the flushables: memtables and large batches.
    39  	for j := len(i.readState.memtables) - 1; j >= 0; j-- {
    40  		mem := i.readState.memtables[j]
    41  		// We only need to read from memtables which contain sequence numbers older
    42  		// than seqNum.
    43  		if logSeqNum := mem.logSeqNum; logSeqNum >= i.seqNum {
    44  			continue
    45  		}
    46  		if rki := mem.newRangeKeyIter(&i.opts); rki != nil {
    47  			i.rangeKey.iterConfig.AddLevel(rki)
    48  		}
    49  	}
    50  
    51  	current := i.readState.current
    52  	// Next are the file levels: L0 sub-levels followed by lower levels.
    53  	//
    54  	// Add file-specific iterators for L0 files containing range keys. This is less
    55  	// efficient than using levelIters for sublevels of L0 files containing
    56  	// range keys, but range keys are expected to be sparse anyway, reducing the
    57  	// cost benefit of maintaining a separate L0Sublevels instance for range key
    58  	// files and then using it here.
    59  	//
    60  	// NB: We iterate L0's files in reverse order. They're sorted by
    61  	// LargestSeqNum ascending, and we need to add them to the merging iterator
    62  	// in LargestSeqNum descending to preserve the merging iterator's invariants
    63  	// around Key Trailer order.
    64  	iter := current.RangeKeyLevels[0].Iter()
    65  	for f := iter.Last(); f != nil; f = iter.Prev() {
    66  		spanIterOpts := &keyspan.SpanIterOptions{RangeKeyFilters: i.opts.RangeKeyFilters}
    67  		spanIter, err := i.newIterRangeKey(f, spanIterOpts)
    68  		if err != nil {
    69  			i.rangeKey.iterConfig.AddLevel(&errorKeyspanIter{err: err})
    70  			continue
    71  		}
    72  		i.rangeKey.iterConfig.AddLevel(spanIter)
    73  	}
    74  
    75  	// Add level iterators for the non-empty non-L0 levels.
    76  	for level := 1; level < len(current.RangeKeyLevels); level++ {
    77  		if current.RangeKeyLevels[level].Empty() {
    78  			continue
    79  		}
    80  		li := i.rangeKey.iterConfig.NewLevelIter()
    81  		spanIterOpts := keyspan.SpanIterOptions{RangeKeyFilters: i.opts.RangeKeyFilters}
    82  		li.Init(spanIterOpts, i.cmp, i.newIterRangeKey, current.RangeKeyLevels[level].Iter(),
    83  			manifest.Level(level), i.opts.logger, manifest.KeyTypeRange)
    84  		i.rangeKey.iterConfig.AddLevel(li)
    85  	}
    86  }
    87  
    88  // Range key masking
    89  //
    90  // Pebble iterators may be configured such that range keys with suffixes mask
    91  // point keys with lower suffixes. The intended use is implementing an MVCC
    92  // delete range operation using range keys, when suffixes are MVCC timestamps.
    93  //
    94  // To enable masking, the user populates the IterOptions's RangeKeyMasking
    95  // field. The Suffix field configures which range keys act as masks. The
    96  // intended use is to hold an MVCC read timestamp. When implementing an MVCC
    97  // delete range operation, only range keys that are visible at the read
    98  // timestamp should be visible. If a range key's timestamp suffix is ≤ the
    99  // read timestamp held in RangeKeyMasking.Suffix, it acts as a mask.
   100  //
   101  // Range key masking is facilitated by the keyspan.InterleavingIter. The
   102  // interleaving iterator interleaves range keys and point keys during combined
   103  // iteration. During user iteration, the interleaving iterator is configured
   104  // with a keyspan.SpanMask, implemented by the rangeKeyMasking struct below.
   105  // The SpanMask interface defines two methods: SpanChanged and SkipPoint.
   106  //
   107  // SpanChanged is used to keep the current mask up-to-date. Whenever the point
   108  // iterator has stepped into or out of the bounds of a range key, the
   109  // interleaving iterator invokes SpanChanged passing the current covering range
   110  // key. The below rangeKeyMasking implementation scans the range keys looking
   111  // for the range key with the largest (newest) timestamp suffix that's still ≤
   112  // the read timestamp in IterOptions.RangeKeyMasking.Suffix. If it finds a
   113  // range key that meets the condition, the range key should act as a mask. The
   114  // span and the relevant range key's suffix are saved.
   115  //
   116  // The above ensures that `rangeKeyMasking.maskActiveSuffix` always contains the
   117  // current masking suffix such that any point keys with lower suffixes should be
   118  // skipped.
   119  //
   120  // There are two ways in which masked point keys are skipped.
   121  //
   122  //   1. Interleaving iterator SkipPoint
   123  //
   124  // Whenever the interleaving iterator encounters a point key that falls within
   125  // the bounds of a range key, it invokes SkipPoint. The interleaving iterator
   126  // guarantees that the SpanChanged method described above has already been
   127  // invoked with the covering range key. The below rangeKeyMasking implementation
   128  // of SkipPoint splits the key into prefix and suffix, compares the suffix to
   129  // the `maskActiveSuffix` updated by SpanChanged and returns true if
   130  // suffix(point) < maskActiveSuffix.
   131  //
   132  // The SkipPoint logic is sufficient to ensure that the Pebble iterator filters
   133  // out all masked point keys. However, it requires the iterator read each masked
   134  // point key. For broad range keys that mask many points, this may be expensive.
   135  //
   136  //   2. Block property filter
   137  //
   138  // For more efficient handling of broad range keys that mask many points, the
   139  // IterOptions.RangeKeyMasking field has an optional Filter option. This Filter
   140  // field takes a superset of the block-property filter interface, adding a
   141  // method to dynamically configure the filter's filtering criteria.
   142  //
   143  // To make use of the Filter option, the user is required to define and
   144  // configure a block-property collector that collects a property containing at
   145  // least the maximum suffix of a key within a block.
   146  //
   147  // When the SpanChanged method described above is invoked, rangeKeyMasking also
   148  // reconfigures the user-provided filter. It invokes a SetSuffix method,
   149  // providing the `maskActiveSuffix`, requesting that from now on the
   150  // block-property filter return Intersects()=false for any properties indicating
   151  // that a block contains exclusively keys with suffixes greater than the
   152  // provided suffix.
   153  //
   154  // Note that unlike other block-property filters, the filter used for masking
   155  // must not apply across the entire keyspace. It must only filter blocks that
   156  // lie within the bounds of the range key that set the mask suffix. To
   157  // accommodate this, rangeKeyMasking implements a special interface:
   158  // sstable.BoundLimitedBlockPropertyFilter. This interface extends the block
   159  // property filter interface with two new methods: KeyIsWithinLowerBound and
   160  // KeyIsWithinUpperBound. The rangeKeyMasking type wraps the user-provided block
   161  // property filter, implementing these two methods and overriding Intersects to
   162  // always return true if there is no active mask.
   163  //
   164  // The logic to ensure that a mask block-property filter is only applied within
   165  // the bounds of the masking range key is subtle. The interleaving iterator
   166  // guarantees that it never invokes SpanChanged until the point iterator is
   167  // positioned within the range key. During forward iteration, this guarantees
   168  // that any block that a sstable reader might attempt to load contains only keys
   169  // greater than or equal to the range key's lower bound. During backward
   170  // iteration, it provides the analogous guarantee on the range key's upper
   171  // bound.
   172  //
   173  // The above ensures that an sstable reader only needs to verify that a block
   174  // that it skips meets the opposite bound. This is where the
   175  // KeyIsWithinLowerBound and KeyIsWithinUpperBound methods are used. When an
   176  // sstable iterator is configured with a BoundLimitedBlockPropertyFilter, it
   177  // checks for intersection with the block-property filter before every block
   178  // load, like ordinary block-property filters. However, if the bound-limited
   179  // block property filter indicates that it does NOT intersect, the filter's
   180  // relevant KeyIsWithin{Lower,Upper}Bound method is queried, using a block
   181  // index separator as the bound. If the method indicates that the provided index
   182  // separator does not fall within the range key bounds, the no-intersection
   183  // result is ignored, and the block is read.
   184  
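        // Illustrative sketch (not part of the original file): how a caller might
        // enable range-key masking for MVCC-style reads. Only the
        // RangeKeyMasking.Suffix and RangeKeyMasking.Filter fields are taken from
        // the documentation above; the RangeKeyMasking literal's type name, the
        // db.NewIter call, and the encodeReadTimestamp / newMaskingFilter helpers
        // are assumptions made for the example.
        //
        //	opts := &bitalostable.IterOptions{
        //		RangeKeyMasking: bitalostable.RangeKeyMasking{
        //			// The "read timestamp": range keys visible at this timestamp
        //			// act as masks over older point keys beneath them.
        //			Suffix: encodeReadTimestamp(7), // hypothetical helper
        //			// Optional: constructs a block-property filter mask so whole
        //			// blocks of masked point keys can be skipped.
        //			Filter: newMaskingFilter, // hypothetical constructor
        //		},
        //	}
        //	it := db.NewIter(opts) // assumes a Pebble-style NewIter; the iterator
        //	defer it.Close()       // must also be configured to surface range keys
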
   185  type rangeKeyMasking struct {
   186  	cmp    base.Compare
   187  	split  base.Split
   188  	filter BlockPropertyFilterMask
   189  	// maskActiveSuffix holds the suffix of a range key currently acting as a
   190  	// mask, hiding point keys with suffixes greater than it. maskActiveSuffix
   191  	// is only ever non-nil if IterOptions.RangeKeyMasking.Suffix is non-nil.
   192  	// maskActiveSuffix is updated whenever the iterator passes over a new range
   193  	// key. The maskActiveSuffix should only be used if maskSpan is non-nil.
   194  	//
   195  	// See SpanChanged.
   196  	maskActiveSuffix []byte
   197  	// maskSpan holds the span from which the active mask suffix was extracted.
   198  	// The span is used for bounds comparisons, to ensure that a range-key mask
   199  	// is not applied beyond the bounds of the range key.
   200  	maskSpan *keyspan.Span
   201  	parent   *Iterator
   202  }
   203  
   204  func (m *rangeKeyMasking) init(parent *Iterator, cmp base.Compare, split base.Split) {
   205  	m.cmp = cmp
   206  	m.split = split
   207  	if parent.opts.RangeKeyMasking.Filter != nil {
   208  		m.filter = parent.opts.RangeKeyMasking.Filter()
   209  	}
   210  	m.parent = parent
   211  }
   212  
   213  // SpanChanged implements the keyspan.SpanMask interface, used during range key
   214  // iteration.
   215  func (m *rangeKeyMasking) SpanChanged(s *keyspan.Span) {
   216  	if s == nil && m.maskSpan == nil {
   217  		return
   218  	}
   219  	m.maskSpan = nil
   220  	m.maskActiveSuffix = m.maskActiveSuffix[:0]
   221  
   222  	// Find the smallest suffix of a range key contained within the Span,
   223  	// excluding suffixes less than m.parent.opts.RangeKeyMasking.Suffix.
   224  	if s != nil {
   225  		m.parent.rangeKey.stale = true
   226  		if m.parent.opts.RangeKeyMasking.Suffix != nil {
   227  			for j := range s.Keys {
   228  				if s.Keys[j].Suffix == nil {
   229  					continue
   230  				}
   231  				if m.cmp(s.Keys[j].Suffix, m.parent.opts.RangeKeyMasking.Suffix) < 0 {
   232  					continue
   233  				}
   234  				if len(m.maskActiveSuffix) == 0 || m.cmp(m.maskActiveSuffix, s.Keys[j].Suffix) > 0 {
   235  					m.maskSpan = s
   236  					m.maskActiveSuffix = append(m.maskActiveSuffix[:0], s.Keys[j].Suffix...)
   237  				}
   238  			}
   239  		}
   240  	}
   241  
   242  	if m.maskSpan != nil && m.parent.opts.RangeKeyMasking.Filter != nil {
   243  		// Update the block-property filter to filter point keys with suffixes
   244  		// greater than m.maskActiveSuffix.
   245  		err := m.filter.SetSuffix(m.maskActiveSuffix)
   246  		if err != nil {
   247  			m.parent.err = err
   248  		}
   249  	}
   250  	// If no span is active, we leave the inner block-property filter configured
   251  	// with its existing suffix. That's okay, because Intersects calls are first
   252  	// evaluated by rangeKeyMasking.Intersects, which considers all blocks
   253  	// as intersecting if there's no active mask.
   254  }
   255  
   256  // SkipPoint implements the keyspan.SpanMask interface, used during range key
   257  // iteration. Whenever a point key is covered by a non-empty Span, the
   258  // interleaving iterator invokes SkipPoint. This function is responsible for
   259  // performing range key masking.
   260  //
   261  // If a non-nil IterOptions.RangeKeyMasking.Suffix is set, range key masking is
   262  // enabled. Masking hides point keys, transparently skipping over the keys.
   263  // Whether or not a point key is masked is determined by comparing the point
   264  // key's suffix, the overlapping span's keys' suffixes, and the user-configured
   265  // IterOption's RangeKeyMasking.Suffix. When configured with a masking threshold
   266  // _t_, and there exists a span with suffix _r_ covering a point key with suffix
   267  // _p_, and
   268  //
   269  //	_t_ ≤ _r_ < _p_
   270  //
   271  // then the point key is elided. Consider the following rendering, where
   272  // integer suffixes are used and suffixes with higher integers sort before
   273  // suffixes with lower integers (for example @7 ≤ @6 < @5):
   274  //
   275  //	     ^
   276  //	  @9 |        •―――――――――――――――○ [e,m)@9
   277  //	s  8 |                      • l@8
   278  //	u  7 |------------------------------------ @7 RangeKeyMasking.Suffix
   279  //	f  6 |      [h,q)@6 •―――――――――――――――――○            (threshold)
   280  //	f  5 |              • h@5
   281  //	f  4 |                          • n@4
   282  //	i  3 |          •―――――――――――○ [f,l)@3
   283  //	x  2 |  • b@2
   284  //	   1 |
   285  //	   0 |___________________________________
   286  //	      a b c d e f g h i j k l m n o p q
   287  //
   288  // An iterator scanning the entire keyspace with the masking threshold set to @7
   289  // will observe point keys b@2 and l@8. The span keys [h,q)@6 and [f,l)@3 serve
   290  // as masks, because cmp(@6,@7) ≥ 0 and cmp(@3,@7) ≥ 0. The span key [e,m)@9
   291  // does not serve as a mask, because cmp(@9,@7) < 0.
   292  //
   293  // Although point l@8 falls within the user key bounds of [e,m)@9, [e,m)@9 is
   294  // non-masking due to its suffix. The point key l@8 also falls within the user
   295  // key bounds of [h,q)@6, but since cmp(@6,@8) ≥ 0, l@8 is unmasked.
   296  //
   297  // Invariant: The userKey is within the user key bounds of the span most
   298  // recently provided to `SpanChanged`.
   299  func (m *rangeKeyMasking) SkipPoint(userKey []byte) bool {
   300  	if m.maskSpan == nil {
   301  		// No range key is currently acting as a mask, so don't skip.
   302  		return false
   303  	}
   304  	// Range key masking is enabled and the current span includes a range key
   305  	// that is being used as a mask. (NB: SpanChanged already verified that the
   306  	// range key's suffix is ≥ RangeKeyMasking.Suffix).
   307  	//
   308  	// This point key falls within the bounds of the range key (guaranteed by
   309  	// the InterleavingIter). Skip the point key if the range key's suffix is
   310  	// greater than the point key's suffix.
   311  	pointSuffix := userKey[m.split(userKey):]
   312  	return len(pointSuffix) > 0 && m.cmp(m.maskActiveSuffix, pointSuffix) < 0
   313  }
   314  
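        // A minimal, self-contained sketch (not part of the package) of the skip
        // decision documented above, using plain integers for the "@n" timestamp
        // suffixes. The cmpSuffix ordering mirrors the convention in the diagram:
        // higher timestamps sort before lower ones, so cmpSuffix(8, 6) < 0.
        //
        //	// cmpSuffix orders timestamp suffixes: newer (larger) timestamps sort first.
        //	func cmpSuffix(a, b int) int { return b - a }
        //
        //	// skipPoint mirrors rangeKeyMasking.SkipPoint: a point is masked when its
        //	// suffix sorts after the active mask suffix (i.e. it is older).
        //	func skipPoint(maskSuffix, pointSuffix int) bool {
        //		return cmpSuffix(maskSuffix, pointSuffix) < 0
        //	}
        //
        // With the read timestamp @7 and a covering span carrying keys @9, @6 and @3,
        // SpanChanged selects @6 as the mask (@9 is not yet visible at @7). Then
        // skipPoint(6, 5) == true (h@5 is hidden) and skipPoint(6, 8) == false (l@8
        // is surfaced), matching the rendering above.
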
   315  // The rangeKeyMasking type implements the sstable package's
   316  // BoundLimitedBlockPropertyFilter interface in order to use block property
   317  // filters for range key masking. The rangeKeyMasking implementation wraps
   318  // the block-property filter provided in IterOptions.RangeKeyMasking.Filter.
   319  //
   320  // Using a block-property filter for range-key masking requires limiting the
   321  // filter's effect to the bounds of the range key currently acting as a mask.
   322  // Consider the range key [a,m)@10, and an iterator positioned just before the
   323  // below block, bounded by index separators `c` and `z`:
   324  //
   325  //	          c                          z
   326  //	   x      |  c@9 c@5 c@1 d@7 e@4 y@4 | ...
   327  //	iter pos
   328  //
   329  // The next block cannot be skipped, even though the range key's suffix @10 is
   330  // greater than all of the block's keys' suffixes, because the block contains a
   331  // key (y@4) outside the bounds of the range key.
   332  //
   333  // This extended BoundLimitedBlockPropertyFilter interface adds two new methods,
   334  // KeyIsWithinLowerBound and KeyIsWithinUpperBound, for testing whether a
   335  // particular block is within bounds.
   336  //
   337  // The rangeKeyMasking type implements these new methods by first checking if
   338  // the iterator is currently positioned within a range key. If not, the provided
   339  // key is considered out-of-bounds. If the iterator is positioned within a range
   340  // key, it compares the corresponding range key bound.
   341  var _ sstable.BoundLimitedBlockPropertyFilter = (*rangeKeyMasking)(nil)
   342  
   343  // Name implements the limitedBlockPropertyFilter interface defined in the
   344  // sstable package by passing through to the user-defined block property filter.
   345  func (m *rangeKeyMasking) Name() string {
   346  	return m.filter.Name()
   347  }
   348  
   349  // Intersects implements the limitedBlockPropertyFilter interface defined in the
   350  // sstable package by passing the intersection decision to the user-provided
   351  // block property filter only if a range key is covering the current iterator
   352  // position.
   353  func (m *rangeKeyMasking) Intersects(prop []byte) (bool, error) {
   354  	if m.maskSpan == nil {
   355  		// No span is actively masking.
   356  		return true, nil
   357  	}
   358  	return m.filter.Intersects(prop)
   359  }
   360  
   361  // KeyIsWithinLowerBound implements the limitedBlockPropertyFilter interface
   362  // defined in the sstable package. It's used to restrict the masking block
   363  // property filter to only applying within the bounds of the active range key.
   364  func (m *rangeKeyMasking) KeyIsWithinLowerBound(ik *InternalKey) bool {
   365  	// Invariant: m.maskSpan != nil
   366  	//
   367  	// The provided `ik` is an inclusive lower bound of the block we're
   368  	// considering skipping.
   369  	return m.cmp(m.maskSpan.Start, ik.UserKey) <= 0
   370  }
   371  
   372  // KeyIsWithinUpperBound implements the limitedBlockPropertyFilter interface
   373  // defined in the sstable package. It's used to restrict the masking block
   374  // property filter to only applying within the bounds of the active range key.
   375  func (m *rangeKeyMasking) KeyIsWithinUpperBound(ik *InternalKey) bool {
   376  	// Invariant: m.maskSpan != nil
   377  	//
   378  	// The provided `ik` is an *inclusive* upper bound of the block we're
   379  	// considering skipping, so the range key's end must be strictly greater
   380  	// than the block bound for the block to be within bounds.
   381  	return m.cmp(m.maskSpan.End, ik.UserKey) > 0
   382  }
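
        // An illustrative sketch (not the sstable package's actual code) of how a
        // reader might consult a BoundLimitedBlockPropertyFilter such as
        // rangeKeyMasking before skipping a block during forward iteration. The
        // maySkipBlock name and its parameters are hypothetical; only the Intersects
        // and KeyIsWithinUpperBound calls come from the interface described above.
        //
        //	// maySkipBlock reports whether a data block with encoded property `prop`
        //	// and inclusive upper index separator `sep` may be skipped.
        //	func maySkipBlock(f sstable.BoundLimitedBlockPropertyFilter, prop []byte, sep *base.InternalKey) bool {
        //		intersects, err := f.Intersects(prop)
        //		if err != nil || intersects {
        //			return false // must read the block (a real reader would surface err)
        //		}
        //		// The filter's verdict may only be trusted inside the masking range
        //		// key's bounds. During forward iteration the lower bound is already
        //		// guaranteed, so verify the block's upper separator against the range
        //		// key's exclusive end before skipping.
        //		return f.KeyIsWithinUpperBound(sep)
        //	}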
   383  
   384  // lazyCombinedIter implements the internalIterator interface, wrapping a
   385  // pointIter. It requires that the pointIter's levelIters be configured with
   386  // pointers to its combinedIterState. When the levelIter observes a file
   387  // containing a range key, the lazyCombinedIter constructs the combined
   388  // range+point key iterator stack and switches to it.
   389  type lazyCombinedIter struct {
   390  	// parent holds a pointer to the root *bitalostable.Iterator containing this
   391  	// iterator. It's used to mutate the internalIterator in use when switching
   392  	// to combined iteration.
   393  	parent            *Iterator
   394  	pointIter         internalIterator
   395  	combinedIterState combinedIterState
   396  }
   397  
   398  // combinedIterState encapsulates the current state of combined iteration.
   399  // Various low-level iterators (mergingIter, levelIter) hold pointers to the
   400  // *bitalostable.Iterator's combinedIterState. This allows them to check whether
   401  // or not they must monitor for files containing range keys (!initialized).
   402  //
   403  // When !initialized, low-level iterators watch for files containing range keys.
   404  // When one is discovered, they set triggered=true and key to the smallest
   405  // (forward direction) or largest (reverse direction) range key that's been
   406  // observed.
   407  type combinedIterState struct {
   408  	// key holds the smallest (forward direction) or largest (backward
   409  	// direction) user key from a range key bound discovered during the iterator
   410  	// operation that triggered the switch to combined iteration.
   411  	//
   412  	// Slices stored here must be stable. This is possible because callers pass
   413  	// a Smallest/Largest bound from a fileMetadata, which are immutable. A key
   414  	// slice's bytes must not be overwritten.
   415  	key         []byte
   416  	triggered   bool
   417  	initialized bool
   418  }
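
        // A minimal sketch (not the actual levelIter code) of how a component holding
        // a *combinedIterState might record that combined iteration is needed after
        // encountering a file containing range keys during forward iteration. The
        // function name and parameters are hypothetical.
        //
        //	func triggerCombinedIteration(cs *combinedIterState, smallestRangeKeyBound []byte, cmp base.Compare) {
        //		if cs.initialized {
        //			return // the combined range+point iterator stack is already in use
        //		}
        //		// Track the smallest range-key bound observed so far. The slice must be
        //		// stable (e.g. a fileMetadata bound), because it is retained.
        //		if !cs.triggered || cmp(smallestRangeKeyBound, cs.key) < 0 {
        //			cs.key = smallestRangeKeyBound
        //		}
        //		cs.triggered = true
        //	}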
   419  
   420  // Assert that *lazyCombinedIter implements internalIterator.
   421  var _ internalIterator = (*lazyCombinedIter)(nil)
   422  
   423  // initCombinedIteration is invoked after a pointIter positioning operation
   424  // resulted in i.combinedIterState.triggered=true.
   425  //
   426  // The `dir` parameter is `+1` or `-1` indicating forward iteration or backward
   427  // iteration respectively.
   428  //
   429  // The `pointKey` and `pointValue` parameters provide the new point key-value
   430  // pair that the iterator was just positioned to. The combined iterator should
   431  // be seeded with this point key-value pair and return the smaller (forward
   432  // iteration) or largest (backward iteration) of the two.
   433  // iteration) or larger (backward iteration) of the two.
   434  // The `seekKey` parameter is non-nil only if the iterator operation that
   435  // triggered the switch to combined iteration was a SeekGE, SeekPrefixGE or
   436  // SeekLT. It provides the seek key supplied and is used to seek the range-key
   437  // iterator using the same key. This is necessary for SeekGE/SeekPrefixGE
   438  // operations that land in the middle of a range key and must truncate to the
   439  // user-provided seek key.
   440  func (i *lazyCombinedIter) initCombinedIteration(
   441  	dir int8, pointKey *InternalKey, pointValue []byte, seekKey []byte,
   442  ) (*InternalKey, []byte) {
   443  	// Invariant: i.parent.rangeKey is nil.
   444  	// Invariant: !i.combinedIterState.initialized.
   445  	if invariants.Enabled {
   446  		if i.combinedIterState.initialized {
   447  			panic("bitalostable: combined iterator already initialized")
   448  		}
   449  		if i.parent.rangeKey != nil {
   450  			panic("bitalostable: iterator already has a range-key iterator stack")
   451  		}
   452  	}
   453  
   454  	// We need to determine the key to seek the range key iterator to. If
   455  	// seekKey is not nil, the user-initiated operation that triggered the
   456  	// switch to combined iteration was itself a seek, and we can use that key.
   457  	// Otherwise, a First/Last or relative positioning operation triggered the
   458  	// switch to combined iteration.
   459  	//
   460  	// The levelIter that observed a file containing range keys populated
   461  	// combinedIterState.key with the smallest (forward) or largest (backward)
   462  	// range key it observed. If multiple levelIters observed files with range
   463  	// keys during the same operation on the mergingIter, combinedIterState.key
   464  	// is the smallest [during forward iteration; largest in reverse iteration]
   465  	// such key.
   466  	if seekKey == nil {
   467  		// Use the levelIter-populated key.
   468  		seekKey = i.combinedIterState.key
   469  
   470  		// We may need to adjust the levelIter-populated seek key to the
   471  		// surfaced point key. If the key observed is beyond [in the iteration
   472  		// direction] the current point key, there may still exist a range key
   473  		// at an earlier key. Consider the following example:
   474  		//
   475  		//   L5:  000003:[bar.DEL.5, foo.RANGEKEYSET.9]
   476  		//   L6:  000001:[bar.SET.2] 000002:[bax.RANGEKEYSET.8]
   477  		//
   478  		// A call to First() seeks the levels to files L5.000003 and L6.000001.
   479  		// The L5 levelIter observes that L5.000003 contains the range key with
   480  		// start key `foo`, and triggers a switch to combined iteration, setting
   481  		// `combinedIterState.key` = `foo`.
   482  		//
   483  		// The L6 levelIter did not observe the true first range key
   484  		// (bax.RANGEKEYSET.8), because it appears in a later sstable. When the
   485  		// combined iterator is initialized, the range key iterator must be
   486  		// seeked to a key that will find `bax`. To accomplish this, we seek the
   487  		// key instead to `bar`. It is guaranteed that no range key exists
   488  		// earlier than `bar`, otherwise a levelIter would've observed it and
   489  		// set `combinedIterState.key` to its start key.
   490  		if pointKey != nil {
   491  			if dir == +1 && i.parent.cmp(i.combinedIterState.key, pointKey.UserKey) > 0 {
   492  				seekKey = pointKey.UserKey
   493  			} else if dir == -1 && i.parent.cmp(seekKey, pointKey.UserKey) < 0 {
   494  				seekKey = pointKey.UserKey
   495  			}
   496  		}
   497  	}
   498  
   499  	if i.parent.hasPrefix {
   500  		si := i.parent.comparer.Split(seekKey)
   501  		if i.parent.cmp(seekKey[:si], i.parent.prefixOrFullSeekKey) > 0 {
   502  			// The earliest possible range key has a start key with a prefix
   503  			// greater than the current iteration prefix. There's no need to
   504  			// switch to combined iteration, because there are not any range
   505  			// keys within the bounds of the prefix. Additionally, using a seek
   506  			// key that is outside the scope of the prefix can violate
   507  			// invariants within the range key iterator stack. Optimizations
   508  			// that exit early due to exhausting the prefix may result in
   509  			// `seekKey` being larger than the next range key's start key.
   510  			//
   511  			// See the testdata/rangekeys test case associated with #1893.
   512  			i.combinedIterState = combinedIterState{initialized: false}
   513  			return pointKey, pointValue
   514  		}
   515  	}
   516  
   517  	// An operation on the point iterator observed a file containing range keys,
   518  	// so we must switch to combined interleaving iteration. First, construct
   519  	// the range key iterator stack. It must not exist, otherwise we'd already
   520  	// be performing combined iteration.
   521  	i.parent.rangeKey = iterRangeKeyStateAllocPool.Get().(*iteratorRangeKeyState)
   522  	i.parent.rangeKey.init(i.parent.comparer.Compare, i.parent.comparer.Split, &i.parent.opts)
   523  	i.parent.constructRangeKeyIter()
   524  
   525  	// Initialize the Iterator's interleaving iterator.
   526  	i.parent.rangeKey.iiter.Init(
   527  		&i.parent.comparer, i.parent.pointIter, i.parent.rangeKey.rangeKeyIter,
   528  		&i.parent.rangeKeyMasking, i.parent.opts.LowerBound, i.parent.opts.UpperBound)
   529  
   530  	// Set the parent's primary iterator to point to the combined, interleaving
   531  	// iterator that's now initialized with our current state.
   532  	i.parent.iter = &i.parent.rangeKey.iiter
   533  	i.combinedIterState.initialized = true
   534  	i.combinedIterState.key = nil
   535  
   536  	// All future iterator operations will go directly through the combined
   537  	// iterator.
   538  	//
   539  	// Initialize the interleaving iterator. We pass the point key-value pair so
   540  	// that the interleaving iterator knows where the point iterator is
   541  	// positioned. Additionally, we pass the seek key to which the range-key
   542  	// iterator should be seeked in order to initialize its position.
   543  	//
   544  	// In the forward direction (invert for backwards), the seek key is a key
   545  	// guaranteed to find the smallest range key that's greater than the last
   546  	// key the iterator returned. The range key may be less than pointKey, in
   547  	// which case the range key will be interleaved next instead of the point
   548  	// key.
   549  	if dir == +1 {
   550  		var prefix []byte
   551  		if i.parent.hasPrefix {
   552  			prefix = i.parent.prefixOrFullSeekKey
   553  		}
   554  		return i.parent.rangeKey.iiter.InitSeekGE(prefix, seekKey, pointKey, pointValue)
   555  	}
   556  	return i.parent.rangeKey.iiter.InitSeekLT(seekKey, pointKey, pointValue)
   557  }
   558  
   559  func (i *lazyCombinedIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, []byte) {
   560  	if i.combinedIterState.initialized {
   561  		return i.parent.rangeKey.iiter.SeekGE(key, flags)
   562  	}
   563  	k, v := i.pointIter.SeekGE(key, flags)
   564  	if i.combinedIterState.triggered {
   565  		return i.initCombinedIteration(+1, k, v, key)
   566  	}
   567  	return k, v
   568  }
   569  
   570  func (i *lazyCombinedIter) SeekPrefixGE(
   571  	prefix, key []byte, flags base.SeekGEFlags,
   572  ) (*InternalKey, []byte) {
   573  	if i.combinedIterState.initialized {
   574  		return i.parent.rangeKey.iiter.SeekPrefixGE(prefix, key, flags)
   575  	}
   576  	k, v := i.pointIter.SeekPrefixGE(prefix, key, flags)
   577  	if i.combinedIterState.triggered {
   578  		return i.initCombinedIteration(+1, k, v, key)
   579  	}
   580  	return k, v
   581  }
   582  
   583  func (i *lazyCombinedIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, []byte) {
   584  	if i.combinedIterState.initialized {
   585  		return i.parent.rangeKey.iiter.SeekLT(key, flags)
   586  	}
   587  	k, v := i.pointIter.SeekLT(key, flags)
   588  	if i.combinedIterState.triggered {
   589  		return i.initCombinedIteration(-1, k, v, key)
   590  	}
   591  	return k, v
   592  }
   593  
   594  func (i *lazyCombinedIter) First() (*InternalKey, []byte) {
   595  	if i.combinedIterState.initialized {
   596  		return i.parent.rangeKey.iiter.First()
   597  	}
   598  	k, v := i.pointIter.First()
   599  	if i.combinedIterState.triggered {
   600  		return i.initCombinedIteration(+1, k, v, nil)
   601  	}
   602  	return k, v
   603  }
   604  
   605  func (i *lazyCombinedIter) Last() (*InternalKey, []byte) {
   606  	if i.combinedIterState.initialized {
   607  		return i.parent.rangeKey.iiter.Last()
   608  	}
   609  	k, v := i.pointIter.Last()
   610  	if i.combinedIterState.triggered {
   611  		return i.initCombinedIteration(-1, k, v, nil)
   612  	}
   613  	return k, v
   614  }
   615  
   616  func (i *lazyCombinedIter) Next() (*InternalKey, []byte) {
   617  	if i.combinedIterState.initialized {
   618  		return i.parent.rangeKey.iiter.Next()
   619  	}
   620  	k, v := i.pointIter.Next()
   621  	if i.combinedIterState.triggered {
   622  		return i.initCombinedIteration(+1, k, v, nil)
   623  	}
   624  	return k, v
   625  }
   626  
   627  func (i *lazyCombinedIter) Prev() (*InternalKey, []byte) {
   628  	if i.combinedIterState.initialized {
   629  		return i.parent.rangeKey.iiter.Prev()
   630  	}
   631  	k, v := i.pointIter.Prev()
   632  	if i.combinedIterState.triggered {
   633  		return i.initCombinedIteration(-1, k, v, nil)
   634  	}
   635  	return k, v
   636  }
   637  
   638  func (i *lazyCombinedIter) Error() error {
   639  	if i.combinedIterState.initialized {
   640  		return i.parent.rangeKey.iiter.Error()
   641  	}
   642  	return i.pointIter.Error()
   643  }
   644  
   645  func (i *lazyCombinedIter) Close() error {
   646  	if i.combinedIterState.initialized {
   647  		return i.parent.rangeKey.iiter.Close()
   648  	}
   649  	return i.pointIter.Close()
   650  }
   651  
   652  func (i *lazyCombinedIter) SetBounds(lower, upper []byte) {
   653  	if i.combinedIterState.initialized {
   654  		i.parent.rangeKey.iiter.SetBounds(lower, upper)
   655  		return
   656  	}
   657  	i.pointIter.SetBounds(lower, upper)
   658  }
   659  
   660  func (i *lazyCombinedIter) String() string {
   661  	if i.combinedIterState.initialized {
   662  		return i.parent.rangeKey.iiter.String()
   663  	}
   664  	return i.pointIter.String()
   665  }