github.com/cockroachdb/pebble@v1.1.2/range_keys.go

github.com/cockroachdb/pebble@v1.1.2/range_keys.go (about)

     1  // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package pebble
     6  
     7  import (
     8  	"github.com/cockroachdb/pebble/internal/base"
     9  	"github.com/cockroachdb/pebble/internal/invariants"
    10  	"github.com/cockroachdb/pebble/internal/keyspan"
    11  	"github.com/cockroachdb/pebble/internal/manifest"
    12  	"github.com/cockroachdb/pebble/sstable"
    13  )
    14  
    15  // constructRangeKeyIter constructs the range-key iterator stack, populating
    16  // i.rangeKey.rangeKeyIter with the resulting iterator.
    17  func (i *Iterator) constructRangeKeyIter() {
    18  	i.rangeKey.rangeKeyIter = i.rangeKey.iterConfig.Init(
    19  		&i.comparer, i.seqNum, i.opts.LowerBound, i.opts.UpperBound,
    20  		&i.hasPrefix, &i.prefixOrFullSeekKey, false /* internalKeys */, &i.rangeKey.rangeKeyBuffers.internal)
    21  
    22  	// If there's an indexed batch with range keys, include it.
    23  	if i.batch != nil {
    24  		if i.batch.index == nil {
    25  			i.rangeKey.iterConfig.AddLevel(newErrorKeyspanIter(ErrNotIndexed))
    26  		} else {
    27  			// Only include the batch's range key iterator if it has any keys.
    28  			// NB: This can force reconstruction of the rangekey iterator stack
    29  			// in SetOptions if subsequently range keys are added. See
    30  			// SetOptions.
    31  			if i.batch.countRangeKeys > 0 {
    32  				i.batch.initRangeKeyIter(&i.opts, &i.batchRangeKeyIter, i.batchSeqNum)
    33  				i.rangeKey.iterConfig.AddLevel(&i.batchRangeKeyIter)
    34  			}
    35  		}
    36  	}
    37  
    38  	// Next are the flushables: memtables and large batches.
    39  	if i.readState != nil {
    40  		for j := len(i.readState.memtables) - 1; j >= 0; j-- {
    41  			mem := i.readState.memtables[j]
    42  			// We only need to read from memtables which contain sequence numbers older
    43  			// than seqNum.
    44  			if logSeqNum := mem.logSeqNum; logSeqNum >= i.seqNum {
    45  				continue
    46  			}
    47  			if rki := mem.newRangeKeyIter(&i.opts); rki != nil {
    48  				i.rangeKey.iterConfig.AddLevel(rki)
    49  			}
    50  		}
    51  	}
    52  
    53  	current := i.version
    54  	if current == nil {
    55  		current = i.readState.current
    56  	}
    57  	// Next are the file levels: L0 sub-levels followed by lower levels.
    58  	//
    59  	// Add file-specific iterators for L0 files containing range keys. This is less
    60  	// efficient than using levelIters for sublevels of L0 files containing
    61  	// range keys, but range keys are expected to be sparse anyway, reducing the
    62  	// cost benefit of maintaining a separate L0Sublevels instance for range key
    63  	// files and then using it here.
    64  	//
    65  	// NB: We iterate L0's files in reverse order. They're sorted by
    66  	// LargestSeqNum ascending, and we need to add them to the merging iterator
    67  	// in LargestSeqNum descending to preserve the merging iterator's invariants
    68  	// around Key Trailer order.
    69  	iter := current.RangeKeyLevels[0].Iter()
    70  	for f := iter.Last(); f != nil; f = iter.Prev() {
    71  		spanIter, err := i.newIterRangeKey(f, i.opts.SpanIterOptions())
    72  		if err != nil {
    73  			i.rangeKey.iterConfig.AddLevel(&errorKeyspanIter{err: err})
    74  			continue
    75  		}
    76  		i.rangeKey.iterConfig.AddLevel(spanIter)
    77  	}
    78  
    79  	// Add level iterators for the non-empty non-L0 levels.
    80  	for level := 1; level < len(current.RangeKeyLevels); level++ {
    81  		if current.RangeKeyLevels[level].Empty() {
    82  			continue
    83  		}
    84  		li := i.rangeKey.iterConfig.NewLevelIter()
    85  		spanIterOpts := i.opts.SpanIterOptions()
    86  		li.Init(spanIterOpts, i.cmp, i.newIterRangeKey, current.RangeKeyLevels[level].Iter(),
    87  			manifest.Level(level), manifest.KeyTypeRange)
    88  		i.rangeKey.iterConfig.AddLevel(li)
    89  	}
    90  }
    91  
    92  // Range key masking
    93  //
    94  // Pebble iterators may be configured such that range keys with suffixes mask
    95  // point keys with lower suffixes. The intended use is implementing a MVCC
    96  // delete range operation using range keys, when suffixes are MVCC timestamps.
    97  //
    98  // To enable masking, the user populates the IterOptions's RangeKeyMasking
    99  // field. The Suffix field configures which range keys act as masks. The
   100  // intended use is to hold a MVCC read timestamp. When implementing a MVCC
   101  // delete range operation, only range keys that are visible at the read
   102  // timestamp should be visible. If a range key has a suffix ≤
   103  // RangeKeyMasking.Suffix, it acts as a mask.
   104  //
   105  // Range key masking is facilitated by the keyspan.InterleavingIter. The
   106  // interleaving iterator interleaves range keys and point keys during combined
   107  // iteration. During user iteration, the interleaving iterator is configured
   108  // with a keyspan.SpanMask, implemented by the rangeKeyMasking struct below.
   109  // The SpanMask interface defines two methods: SpanChanged and SkipPoint.
   110  //
   111  // SpanChanged is used to keep the current mask up-to-date. Whenever the point
   112  // iterator has stepped into or out of the bounds of a range key, the
   113  // interleaving iterator invokes SpanChanged passing the current covering range
   114  // key. The below rangeKeyMasking implementation scans the range keys looking
   115  // for the range key with the largest suffix that's still ≤ the suffix supplied
   116  // to IterOptions.RangeKeyMasking.Suffix (the "read timestamp"). If it finds a
   117  // range key that meets the condition, the range key should act as a mask. The
   118  // span and the relevant range key's suffix are saved.
   119  //
   120  // The above ensures that `rangeKeyMasking.maskActiveSuffix` always contains the
   121  // current masking suffix such that any point keys with lower suffixes should be
   122  // skipped.
   123  //
   124  // There are two ways in which masked point keys are skipped.
   125  //
   126  //   1. Interleaving iterator SkipPoint
   127  //
   128  // Whenever the interleaving iterator encounters a point key that falls within
   129  // the bounds of a range key, it invokes SkipPoint. The interleaving iterator
   130  // guarantees that the SpanChanged method described above has already been
   131  // invoked with the covering range key. The below rangeKeyMasking implementation
   132  // of SkipPoint splits the key into prefix and suffix, compares the suffix to
   133  // the `maskActiveSuffix` updated by SpanChanged and returns true if
   134  // suffix(point) < maskActiveSuffix.
   135  //
   136  // The SkipPoint logic is sufficient to ensure that the Pebble iterator filters
   137  // out all masked point keys. However, it requires the iterator read each masked
   138  // point key. For broad range keys that mask many points, this may be expensive.
   139  //
   140  //   2. Block property filter
   141  //
   142  // For more efficient handling of braad range keys that mask many points, the
   143  // IterOptions.RangeKeyMasking field has an optional Filter option. This Filter
   144  // field takes a superset of the block-property filter interface, adding a
   145  // method to dynamically configure the filter's filtering criteria.
   146  //
   147  // To make use of the Filter option, the user is required to define and
   148  // configure a block-property collector that collects a property containing at
   149  // least the maximum suffix of a key within a block.
   150  //
   151  // When the SpanChanged method described above is invoked, rangeKeyMasking also
   152  // reconfigures the user-provided filter. It invokes a SetSuffix method,
   153  // providing the `maskActiveSuffix`, requesting that from now on the
   154  // block-property filter return Intersects()=false for any properties indicating
   155  // that a block contains exclusively keys with suffixes greater than the
   156  // provided suffix.
   157  //
   158  // Note that unlike other block-property filters, the filter used for masking
   159  // must not apply across the entire keyspace. It must only filter blocks that
   160  // lie within the bounds of the range key that set the mask suffix. To
   161  // accommodate this, rangeKeyMasking implements a special interface:
   162  // sstable.BoundLimitedBlockPropertyFilter. This interface extends the block
   163  // property filter interface with two new methods: KeyIsWithinLowerBound and
   164  // KeyIsWithinUpperBound. The rangeKeyMasking type wraps the user-provided block
   165  // property filter, implementing these two methods and overriding Intersects to
   166  // always return true if there is no active mask.
   167  //
   168  // The logic to ensure that a mask block-property filter is only applied within
   169  // the bounds of the masking range key is subtle. The interleaving iterator
   170  // guarantees that it never invokes SpanChanged until the point iterator is
   171  // positioned within the range key. During forward iteration, this guarantees
   172  // that any block that a sstable reader might attempt to load contains only keys
   173  // greater than or equal to the range key's lower bound. During backward
   174  // iteration, it provides the analagous guarantee on the range key's upper
   175  // bound.
   176  //
   177  // The above ensures that an sstable reader only needs to verify that a block
   178  // that it skips meets the opposite bound. This is where the
   179  // KeyIsWithinLowerBound and KeyIsWithinUpperBound methods are used. When an
   180  // sstable iterator is configured with a BoundLimitedBlockPropertyFilter, it
   181  // checks for intersection with the block-property filter before every block
   182  // load, like ordinary block-property filters. However, if the bound-limited
   183  // block property filter indicates that it does NOT intersect, the filter's
   184  // relevant KeyIsWithin{Lower,Upper}Bound method is queried, using a block
   185  // index separator as the bound. If the method indicates that the provided index
   186  // separator does not fall within the range key bounds, the no-intersection
   187  // result is ignored, and the block is read.
   188  
   189  type rangeKeyMasking struct {
   190  	cmp    base.Compare
   191  	split  base.Split
   192  	filter BlockPropertyFilterMask
   193  	// maskActiveSuffix holds the suffix of a range key currently acting as a
   194  	// mask, hiding point keys with suffixes greater than it. maskActiveSuffix
   195  	// is only ever non-nil if IterOptions.RangeKeyMasking.Suffix is non-nil.
   196  	// maskActiveSuffix is updated whenever the iterator passes over a new range
   197  	// key. The maskActiveSuffix should only be used if maskSpan is non-nil.
   198  	//
   199  	// See SpanChanged.
   200  	maskActiveSuffix []byte
   201  	// maskSpan holds the span from which the active mask suffix was extracted.
   202  	// The span is used for bounds comparisons, to ensure that a range-key mask
   203  	// is not applied beyond the bounds of the range key.
   204  	maskSpan *keyspan.Span
   205  	parent   *Iterator
   206  }
   207  
   208  func (m *rangeKeyMasking) init(parent *Iterator, cmp base.Compare, split base.Split) {
   209  	m.cmp = cmp
   210  	m.split = split
   211  	if parent.opts.RangeKeyMasking.Filter != nil {
   212  		m.filter = parent.opts.RangeKeyMasking.Filter()
   213  	}
   214  	m.parent = parent
   215  }
   216  
   217  // SpanChanged implements the keyspan.SpanMask interface, used during range key
   218  // iteration.
   219  func (m *rangeKeyMasking) SpanChanged(s *keyspan.Span) {
   220  	if s == nil && m.maskSpan == nil {
   221  		return
   222  	}
   223  	m.maskSpan = nil
   224  	m.maskActiveSuffix = m.maskActiveSuffix[:0]
   225  
   226  	// Find the smallest suffix of a range key contained within the Span,
   227  	// excluding suffixes less than m.opts.RangeKeyMasking.Suffix.
   228  	if s != nil {
   229  		m.parent.rangeKey.stale = true
   230  		if m.parent.opts.RangeKeyMasking.Suffix != nil {
   231  			for j := range s.Keys {
   232  				if s.Keys[j].Suffix == nil {
   233  					continue
   234  				}
   235  				if m.cmp(s.Keys[j].Suffix, m.parent.opts.RangeKeyMasking.Suffix) < 0 {
   236  					continue
   237  				}
   238  				if len(m.maskActiveSuffix) == 0 || m.cmp(m.maskActiveSuffix, s.Keys[j].Suffix) > 0 {
   239  					m.maskSpan = s
   240  					m.maskActiveSuffix = append(m.maskActiveSuffix[:0], s.Keys[j].Suffix...)
   241  				}
   242  			}
   243  		}
   244  	}
   245  
   246  	if m.maskSpan != nil && m.parent.opts.RangeKeyMasking.Filter != nil {
   247  		// Update the  block-property filter to filter point keys with suffixes
   248  		// greater than m.maskActiveSuffix.
   249  		err := m.filter.SetSuffix(m.maskActiveSuffix)
   250  		if err != nil {
   251  			m.parent.err = err
   252  		}
   253  	}
   254  	// If no span is active, we leave the inner block-property filter configured
   255  	// with its existing suffix. That's okay, because Intersects calls are first
   256  	// evaluated by iteratorRangeKeyState.Intersects, which considers all blocks
   257  	// as intersecting if there's no active mask.
   258  }
   259  
   260  // SkipPoint implements the keyspan.SpanMask interface, used during range key
   261  // iteration. Whenever a point key is covered by a non-empty Span, the
   262  // interleaving iterator invokes SkipPoint. This function is responsible for
   263  // performing range key masking.
   264  //
   265  // If a non-nil IterOptions.RangeKeyMasking.Suffix is set, range key masking is
   266  // enabled. Masking hides point keys, transparently skipping over the keys.
   267  // Whether or not a point key is masked is determined by comparing the point
   268  // key's suffix, the overlapping span's keys' suffixes, and the user-configured
   269  // IterOption's RangeKeyMasking.Suffix. When configured with a masking threshold
   270  // _t_, and there exists a span with suffix _r_ covering a point key with suffix
   271  // _p_, and
   272  //
   273  //	_t_ ≤ _r_ < _p_
   274  //
   275  // then the point key is elided. Consider the following rendering, where using
   276  // integer suffixes with higher integers sort before suffixes with lower
   277  // integers, (for example @7 ≤ @6 < @5):
   278  //
   279  //	     ^
   280  //	  @9 |        •―――――――――――――――○ [e,m)@9
   281  //	s  8 |                      • l@8
   282  //	u  7 |------------------------------------ @7 RangeKeyMasking.Suffix
   283  //	f  6 |      [h,q)@6 •―――――――――――――――――○            (threshold)
   284  //	f  5 |              • h@5
   285  //	f  4 |                          • n@4
   286  //	i  3 |          •―――――――――――○ [f,l)@3
   287  //	x  2 |  • b@2
   288  //	   1 |
   289  //	   0 |___________________________________
   290  //	      a b c d e f g h i j k l m n o p q
   291  //
   292  // An iterator scanning the entire keyspace with the masking threshold set to @7
   293  // will observe point keys b@2 and l@8. The span keys [h,q)@6 and [f,l)@3 serve
   294  // as masks, because cmp(@6,@7) ≥ 0 and cmp(@3,@7) ≥ 0. The span key [e,m)@9
   295  // does not serve as a mask, because cmp(@9,@7) < 0.
   296  //
   297  // Although point l@8 falls within the user key bounds of [e,m)@9, [e,m)@9 is
   298  // non-masking due to its suffix. The point key l@8 also falls within the user
   299  // key bounds of [h,q)@6, but since cmp(@6,@8) ≥ 0, l@8 is unmasked.
   300  //
   301  // Invariant: The userKey is within the user key bounds of the span most
   302  // recently provided to `SpanChanged`.
   303  func (m *rangeKeyMasking) SkipPoint(userKey []byte) bool {
   304  	m.parent.stats.RangeKeyStats.ContainedPoints++
   305  	if m.maskSpan == nil {
   306  		// No range key is currently acting as a mask, so don't skip.
   307  		return false
   308  	}
   309  	// Range key masking is enabled and the current span includes a range key
   310  	// that is being used as a mask. (NB: SpanChanged already verified that the
   311  	// range key's suffix is ≥ RangeKeyMasking.Suffix).
   312  	//
   313  	// This point key falls within the bounds of the range key (guaranteed by
   314  	// the InterleavingIter). Skip the point key if the range key's suffix is
   315  	// greater than the point key's suffix.
   316  	pointSuffix := userKey[m.split(userKey):]
   317  	if len(pointSuffix) > 0 && m.cmp(m.maskActiveSuffix, pointSuffix) < 0 {
   318  		m.parent.stats.RangeKeyStats.SkippedPoints++
   319  		return true
   320  	}
   321  	return false
   322  }
   323  
   324  // The iteratorRangeKeyState type implements the sstable package's
   325  // BoundLimitedBlockPropertyFilter interface in order to use block property
   326  // filters for range key masking. The iteratorRangeKeyState implementation wraps
   327  // the block-property filter provided in Options.RangeKeyMasking.Filter.
   328  //
   329  // Using a block-property filter for range-key masking requires limiting the
   330  // filter's effect to the bounds of the range key currently acting as a mask.
   331  // Consider the range key [a,m)@10, and an iterator positioned just before the
   332  // below block, bounded by index separators `c` and `z`:
   333  //
   334  //	          c                          z
   335  //	   x      |  c@9 c@5 c@1 d@7 e@4 y@4 | ...
   336  //	iter pos
   337  //
   338  // The next block cannot be skipped, despite the range key suffix @10 is greater
   339  // than all the block's keys' suffixes, because it contains a key (y@4) outside
   340  // the bounds of the range key.
   341  //
   342  // This extended BoundLimitedBlockPropertyFilter interface adds two new methods,
   343  // KeyIsWithinLowerBound and KeyIsWithinUpperBound, for testing whether a
   344  // particular block is within bounds.
   345  //
   346  // The iteratorRangeKeyState implements these new methods by first checking if
   347  // the iterator is currently positioned within a range key. If not, the provided
   348  // key is considered out-of-bounds. If the iterator is positioned within a range
   349  // key, it compares the corresponding range key bound.
   350  var _ sstable.BoundLimitedBlockPropertyFilter = (*rangeKeyMasking)(nil)
   351  
   352  // Name implements the limitedBlockPropertyFilter interface defined in the
   353  // sstable package by passing through to the user-defined block property filter.
   354  func (m *rangeKeyMasking) Name() string {
   355  	return m.filter.Name()
   356  }
   357  
   358  // Intersects implements the limitedBlockPropertyFilter interface defined in the
   359  // sstable package by passing the intersection decision to the user-provided
   360  // block property filter only if a range key is covering the current iterator
   361  // position.
   362  func (m *rangeKeyMasking) Intersects(prop []byte) (bool, error) {
   363  	if m.maskSpan == nil {
   364  		// No span is actively masking.
   365  		return true, nil
   366  	}
   367  	return m.filter.Intersects(prop)
   368  }
   369  
   370  // KeyIsWithinLowerBound implements the limitedBlockPropertyFilter interface
   371  // defined in the sstable package. It's used to restrict the masking block
   372  // property filter to only applying within the bounds of the active range key.
   373  func (m *rangeKeyMasking) KeyIsWithinLowerBound(key []byte) bool {
   374  	// Invariant: m.maskSpan != nil
   375  	//
   376  	// The provided `key` is an inclusive lower bound of the block we're
   377  	// considering skipping.
   378  	return m.cmp(m.maskSpan.Start, key) <= 0
   379  }
   380  
   381  // KeyIsWithinUpperBound implements the limitedBlockPropertyFilter interface
   382  // defined in the sstable package. It's used to restrict the masking block
   383  // property filter to only applying within the bounds of the active range key.
   384  func (m *rangeKeyMasking) KeyIsWithinUpperBound(key []byte) bool {
   385  	// Invariant: m.maskSpan != nil
   386  	//
   387  	// The provided `key` is an *inclusive* upper bound of the block we're
   388  	// considering skipping, so the range key's end must be strictly greater
   389  	// than the block bound for the block to be within bounds.
   390  	return m.cmp(m.maskSpan.End, key) > 0
   391  }
   392  
   393  // lazyCombinedIter implements the internalIterator interface, wrapping a
   394  // pointIter. It requires the pointIter's the levelIters be configured with
   395  // pointers to its combinedIterState. When the levelIter observes a file
   396  // containing a range key, the lazyCombinedIter constructs the combined
   397  // range+point key iterator stack and switches to it.
   398  type lazyCombinedIter struct {
   399  	// parent holds a pointer to the root *pebble.Iterator containing this
   400  	// iterator. It's used to mutate the internalIterator in use when switching
   401  	// to combined iteration.
   402  	parent            *Iterator
   403  	pointIter         internalIterator
   404  	combinedIterState combinedIterState
   405  }
   406  
   407  // combinedIterState encapsulates the current state of combined iteration.
   408  // Various low-level iterators (mergingIter, leveliter) hold pointers to the
   409  // *pebble.Iterator's combinedIterState. This allows them to check whether or
   410  // not they must monitor for files containing range keys (!initialized), or not.
   411  //
   412  // When !initialized, low-level iterators watch for files containing range keys.
   413  // When one is discovered, they set triggered=true and key to the smallest
   414  // (forward direction) or largest (reverse direction) range key that's been
   415  // observed.
   416  type combinedIterState struct {
   417  	// key holds the smallest (forward direction) or largest (backward
   418  	// direction) user key from a range key bound discovered during the iterator
   419  	// operation that triggered the switch to combined iteration.
   420  	//
   421  	// Slices stored here must be stable. This is possible because callers pass
   422  	// a Smallest/Largest bound from a fileMetadata, which are immutable. A key
   423  	// slice's bytes must not be overwritten.
   424  	key         []byte
   425  	triggered   bool
   426  	initialized bool
   427  }
   428  
   429  // Assert that *lazyCombinedIter implements internalIterator.
   430  var _ internalIterator = (*lazyCombinedIter)(nil)
   431  
   432  // initCombinedIteration is invoked after a pointIter positioning operation
   433  // resulted in i.combinedIterState.triggered=true.
   434  //
   435  // The `dir` parameter is `+1` or `-1` indicating forward iteration or backward
   436  // iteration respectively.
   437  //
   438  // The `pointKey` and `pointValue` parameters provide the new point key-value
   439  // pair that the iterator was just positioned to. The combined iterator should
   440  // be seeded with this point key-value pair and return the smaller (forward
   441  // iteration) or largest (backward iteration) of the two.
   442  //
   443  // The `seekKey` parameter is non-nil only if the iterator operation that
   444  // triggered the switch to combined iteration was a SeekGE, SeekPrefixGE or
   445  // SeekLT. It provides the seek key supplied and is used to seek the range-key
   446  // iterator using the same key. This is necessary for SeekGE/SeekPrefixGE
   447  // operations that land in the middle of a range key and must truncate to the
   448  // user-provided seek key.
   449  func (i *lazyCombinedIter) initCombinedIteration(
   450  	dir int8, pointKey *InternalKey, pointValue base.LazyValue, seekKey []byte,
   451  ) (*InternalKey, base.LazyValue) {
   452  	// Invariant: i.parent.rangeKey is nil.
   453  	// Invariant: !i.combinedIterState.initialized.
   454  	if invariants.Enabled {
   455  		if i.combinedIterState.initialized {
   456  			panic("pebble: combined iterator already initialized")
   457  		}
   458  		if i.parent.rangeKey != nil {
   459  			panic("pebble: iterator already has a range-key iterator stack")
   460  		}
   461  	}
   462  
   463  	// We need to determine the key to seek the range key iterator to. If
   464  	// seekKey is not nil, the user-initiated operation that triggered the
   465  	// switch to combined iteration was itself a seek, and we can use that key.
   466  	// Otherwise, a First/Last or relative positioning operation triggered the
   467  	// switch to combined iteration.
   468  	//
   469  	// The levelIter that observed a file containing range keys populated
   470  	// combinedIterState.key with the smallest (forward) or largest (backward)
   471  	// range key it observed. If multiple levelIters observed files with range
   472  	// keys during the same operation on the mergingIter, combinedIterState.key
   473  	// is the smallest [during forward iteration; largest in reverse iteration]
   474  	// such key.
   475  	if seekKey == nil {
   476  		// Use the levelIter-populated key.
   477  		seekKey = i.combinedIterState.key
   478  
   479  		// We may need to adjust the levelIter-populated seek key to the
   480  		// surfaced point key. If the key observed is beyond [in the iteration
   481  		// direction] the current point key, there may still exist a range key
   482  		// at an earlier key. Consider the following example:
   483  		//
   484  		//   L5:  000003:[bar.DEL.5, foo.RANGEKEYSET.9]
   485  		//   L6:  000001:[bar.SET.2] 000002:[bax.RANGEKEYSET.8]
   486  		//
   487  		// A call to First() seeks the levels to files L5.000003 and L6.000001.
   488  		// The L5 levelIter observes that L5.000003 contains the range key with
   489  		// start key `foo`, and triggers a switch to combined iteration, setting
   490  		// `combinedIterState.key` = `foo`.
   491  		//
   492  		// The L6 levelIter did not observe the true first range key
   493  		// (bax.RANGEKEYSET.8), because it appears in a later sstable. When the
   494  		// combined iterator is initialized, the range key iterator must be
   495  		// seeked to a key that will find `bax`. To accomplish this, we seek the
   496  		// key instead to `bar`. It is guaranteed that no range key exists
   497  		// earlier than `bar`, otherwise a levelIter would've observed it and
   498  		// set `combinedIterState.key` to its start key.
   499  		if pointKey != nil {
   500  			if dir == +1 && i.parent.cmp(i.combinedIterState.key, pointKey.UserKey) > 0 {
   501  				seekKey = pointKey.UserKey
   502  			} else if dir == -1 && i.parent.cmp(seekKey, pointKey.UserKey) < 0 {
   503  				seekKey = pointKey.UserKey
   504  			}
   505  		}
   506  	}
   507  
   508  	// An operation on the point iterator observed a file containing range keys,
   509  	// so we must switch to combined interleaving iteration. First, construct
   510  	// the range key iterator stack. It must not exist, otherwise we'd already
   511  	// be performing combined iteration.
   512  	i.parent.rangeKey = iterRangeKeyStateAllocPool.Get().(*iteratorRangeKeyState)
   513  	i.parent.rangeKey.init(i.parent.comparer.Compare, i.parent.comparer.Split, &i.parent.opts)
   514  	i.parent.constructRangeKeyIter()
   515  
   516  	// Initialize the Iterator's interleaving iterator.
   517  	i.parent.rangeKey.iiter.Init(
   518  		&i.parent.comparer, i.parent.pointIter, i.parent.rangeKey.rangeKeyIter,
   519  		keyspan.InterleavingIterOpts{
   520  			Mask:       &i.parent.rangeKeyMasking,
   521  			LowerBound: i.parent.opts.LowerBound,
   522  			UpperBound: i.parent.opts.UpperBound,
   523  		})
   524  
   525  	// Set the parent's primary iterator to point to the combined, interleaving
   526  	// iterator that's now initialized with our current state.
   527  	i.parent.iter = &i.parent.rangeKey.iiter
   528  	i.combinedIterState.initialized = true
   529  	i.combinedIterState.key = nil
   530  
   531  	// All future iterator operations will go directly through the combined
   532  	// iterator.
   533  	//
   534  	// Initialize the interleaving iterator. We pass the point key-value pair so
   535  	// that the interleaving iterator knows where the point iterator is
   536  	// positioned. Additionally, we pass the seek key to which the range-key
   537  	// iterator should be seeked in order to initialize its position.
   538  	//
   539  	// In the forward direction (invert for backwards), the seek key is a key
   540  	// guaranteed to find the smallest range key that's greater than the last
   541  	// key the iterator returned. The range key may be less than pointKey, in
   542  	// which case the range key will be interleaved next instead of the point
   543  	// key.
   544  	if dir == +1 {
   545  		var prefix []byte
   546  		if i.parent.hasPrefix {
   547  			prefix = i.parent.prefixOrFullSeekKey
   548  		}
   549  		return i.parent.rangeKey.iiter.InitSeekGE(prefix, seekKey, pointKey, pointValue)
   550  	}
   551  	return i.parent.rangeKey.iiter.InitSeekLT(seekKey, pointKey, pointValue)
   552  }
   553  
   554  func (i *lazyCombinedIter) SeekGE(
   555  	key []byte, flags base.SeekGEFlags,
   556  ) (*InternalKey, base.LazyValue) {
   557  	if i.combinedIterState.initialized {
   558  		return i.parent.rangeKey.iiter.SeekGE(key, flags)
   559  	}
   560  	k, v := i.pointIter.SeekGE(key, flags)
   561  	if i.combinedIterState.triggered {
   562  		return i.initCombinedIteration(+1, k, v, key)
   563  	}
   564  	return k, v
   565  }
   566  
   567  func (i *lazyCombinedIter) SeekPrefixGE(
   568  	prefix, key []byte, flags base.SeekGEFlags,
   569  ) (*InternalKey, base.LazyValue) {
   570  	if i.combinedIterState.initialized {
   571  		return i.parent.rangeKey.iiter.SeekPrefixGE(prefix, key, flags)
   572  	}
   573  	k, v := i.pointIter.SeekPrefixGE(prefix, key, flags)
   574  	if i.combinedIterState.triggered {
   575  		return i.initCombinedIteration(+1, k, v, key)
   576  	}
   577  	return k, v
   578  }
   579  
   580  func (i *lazyCombinedIter) SeekLT(
   581  	key []byte, flags base.SeekLTFlags,
   582  ) (*InternalKey, base.LazyValue) {
   583  	if i.combinedIterState.initialized {
   584  		return i.parent.rangeKey.iiter.SeekLT(key, flags)
   585  	}
   586  	k, v := i.pointIter.SeekLT(key, flags)
   587  	if i.combinedIterState.triggered {
   588  		return i.initCombinedIteration(-1, k, v, key)
   589  	}
   590  	return k, v
   591  }
   592  
   593  func (i *lazyCombinedIter) First() (*InternalKey, base.LazyValue) {
   594  	if i.combinedIterState.initialized {
   595  		return i.parent.rangeKey.iiter.First()
   596  	}
   597  	k, v := i.pointIter.First()
   598  	if i.combinedIterState.triggered {
   599  		return i.initCombinedIteration(+1, k, v, nil)
   600  	}
   601  	return k, v
   602  }
   603  
   604  func (i *lazyCombinedIter) Last() (*InternalKey, base.LazyValue) {
   605  	if i.combinedIterState.initialized {
   606  		return i.parent.rangeKey.iiter.Last()
   607  	}
   608  	k, v := i.pointIter.Last()
   609  	if i.combinedIterState.triggered {
   610  		return i.initCombinedIteration(-1, k, v, nil)
   611  	}
   612  	return k, v
   613  }
   614  
   615  func (i *lazyCombinedIter) Next() (*InternalKey, base.LazyValue) {
   616  	if i.combinedIterState.initialized {
   617  		return i.parent.rangeKey.iiter.Next()
   618  	}
   619  	k, v := i.pointIter.Next()
   620  	if i.combinedIterState.triggered {
   621  		return i.initCombinedIteration(+1, k, v, nil)
   622  	}
   623  	return k, v
   624  }
   625  
   626  func (i *lazyCombinedIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) {
   627  	if i.combinedIterState.initialized {
   628  		return i.parent.rangeKey.iiter.NextPrefix(succKey)
   629  	}
   630  	k, v := i.pointIter.NextPrefix(succKey)
   631  	if i.combinedIterState.triggered {
   632  		return i.initCombinedIteration(+1, k, v, nil)
   633  	}
   634  	return k, v
   635  }
   636  
   637  func (i *lazyCombinedIter) Prev() (*InternalKey, base.LazyValue) {
   638  	if i.combinedIterState.initialized {
   639  		return i.parent.rangeKey.iiter.Prev()
   640  	}
   641  	k, v := i.pointIter.Prev()
   642  	if i.combinedIterState.triggered {
   643  		return i.initCombinedIteration(-1, k, v, nil)
   644  	}
   645  	return k, v
   646  }
   647  
   648  func (i *lazyCombinedIter) Error() error {
   649  	if i.combinedIterState.initialized {
   650  		return i.parent.rangeKey.iiter.Error()
   651  	}
   652  	return i.pointIter.Error()
   653  }
   654  
   655  func (i *lazyCombinedIter) Close() error {
   656  	if i.combinedIterState.initialized {
   657  		return i.parent.rangeKey.iiter.Close()
   658  	}
   659  	return i.pointIter.Close()
   660  }
   661  
   662  func (i *lazyCombinedIter) SetBounds(lower, upper []byte) {
   663  	if i.combinedIterState.initialized {
   664  		i.parent.rangeKey.iiter.SetBounds(lower, upper)
   665  		return
   666  	}
   667  	i.pointIter.SetBounds(lower, upper)
   668  }
   669  
   670  func (i *lazyCombinedIter) String() string {
   671  	if i.combinedIterState.initialized {
   672  		return i.parent.rangeKey.iiter.String()
   673  	}
   674  	return i.pointIter.String()
   675  }