github.com/cockroachdb/pebble@v1.1.2/merging_iter.go

// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"bytes"
	"fmt"
	"runtime/debug"
	"unsafe"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/invariants"
	"github.com/cockroachdb/pebble/internal/keyspan"
)

type mergingIterLevel struct {
	index int
	iter  internalIterator
	// rangeDelIter is set to the range-deletion iterator for the level. When
	// configured with a levelIter, this pointer changes as sstable boundaries
	// are crossed. See levelIter.initRangeDel and the Range Deletions comment
	// below.
	rangeDelIter keyspan.FragmentIterator
	// iterKey and iterValue cache the current key and value iter are pointed at.
	iterKey   *InternalKey
	iterValue base.LazyValue
	// levelIter is non-nil if this level's iter is ultimately backed by a
	// *levelIter. The handle in iter may have wrapped the levelIter with
	// intermediary internalIterator implementations.
	levelIter *levelIter

	// levelIterBoundaryContext's fields are set when using levelIter, in order
	// to surface sstable boundary keys and file-level context. See the levelIter
	// comment and the Range Deletions comment below.
	levelIterBoundaryContext

	// tombstone caches the tombstone rangeDelIter is currently pointed at. If
	// tombstone is nil, there are no further tombstones within the
	// current sstable in the current iterator direction. The cached tombstone is
	// only valid for the levels in the range [0,heap[0].index]. This avoids
	// positioning tombstones at lower levels which cannot possibly shadow the
	// current key.
	tombstone *keyspan.Span
}

type levelIterBoundaryContext struct {
	// smallestUserKey and largestUserKey are populated with the smallest and
	// largest boundaries of the current file.
	smallestUserKey, largestUserKey []byte
	// isLargestUserKeyExclusive is set to true when a file's largest boundary
	// is an exclusive key (e.g., a range deletion sentinel). If true, the file
	// does not contain any keys with the provided user key, and the
	// largestUserKey bound is exclusive.
	isLargestUserKeyExclusive bool
	// isSyntheticIterBoundsKey is set to true iff the key returned by the level
	// iterator is a synthetic key derived from the iterator bounds. This is used
	// to prevent the mergingIter from being stuck at such a synthetic key if it
	// becomes the top element of the heap. When used with a user-facing Iterator,
	// the only range deletions exposed by this mergingIter should be those with
	// `isSyntheticIterBoundsKey || isIgnorableBoundaryKey`.
	isSyntheticIterBoundsKey bool
	// isIgnorableBoundaryKey is set to true iff the key returned by the level
	// iterator is a file boundary key that should be ignored when returning to
	// the parent iterator. File boundary keys are used by the level iter to
	// keep a levelIter file's range deletion iterator open as long as other
	// levels within the merging iterator require it. When used with a user-facing
	// Iterator, the only range deletions exposed by this mergingIter should be
	// those with `isSyntheticIterBoundsKey || isIgnorableBoundaryKey`.
	isIgnorableBoundaryKey bool
}

// mergingIter provides a merged view of multiple iterators from different
// levels of the LSM.
//
// The core of a mergingIter is a heap of internalIterators (see
// mergingIterHeap). The heap can operate as either a min-heap, used during
// forward iteration (First, SeekGE, Next) or a max-heap, used during reverse
// iteration (Last, SeekLT, Prev). The heap is initialized in calls to First,
// Last, SeekGE, and SeekLT. A call to Next or Prev takes the current top
// element on the heap, advances its iterator, and then "fixes" the heap
// property. When one of the child iterators is exhausted during Next/Prev
// iteration, it is removed from the heap.
//
// # Range Deletions
//
// A mergingIter can optionally be configured with a slice of range deletion
// iterators. The range deletion iterator slice must exactly parallel the point
// iterators and the range deletion iterator must correspond to the same level
// in the LSM as the point iterator. Note that each memtable and each table in
// L0 is a different "level" from the mergingIter perspective. So level 0 below
// does not correspond to L0 in the LSM.
//
// A range deletion iterator iterates over fragmented range tombstones. Range
// tombstones are fragmented by splitting them at any overlapping points. This
// fragmentation guarantees that within an sstable tombstones will either be
// distinct or will have identical start and end user keys. While range
// tombstones are fragmented within an sstable, the start and end keys are not
// truncated to sstable boundaries. This is necessary because the tombstone end
// key is exclusive and does not have a sequence number. Consider an sstable
// containing the range tombstone [a,c)#9 and the key "b#8". The tombstone must
// delete "b#8", yet older versions of "b" might spill over to the next
// sstable. So the boundary key for this sstable must be "b#8". Adjusting the
// end key of tombstones to be optionally inclusive, or giving the end key a
// sequence number, would be possible solutions, but both have potentially
// serious issues: tombstones have exclusive end keys since an inclusive
// deletion end can be converted to an exclusive one while the reverse
// transformation is not possible; and the semantics of a sequence number for
// the end key of a range tombstone are murky.
//
// The approach taken here performs an implicit truncation of the tombstone to
// the sstable boundaries.
//
// During initialization of a mergingIter, the range deletion iterators for
// batches, memtables, and L0 tables are populated up front. Note that batches
// and memtables index unfragmented tombstones. Batch.newRangeDelIter() and
// memTable.newRangeDelIter() fragment and cache the tombstones on demand. The
// L1-L6 range deletion iterators are populated by levelIter. When configured
// to load range deletion iterators, whenever a levelIter loads a table it
// loads both the point iterator and the range deletion iterator.
// levelIter.rangeDelIter is configured to point to the right entry in
// mergingIter.levels. The effect of this setup is that
// mergingIter.levels[i].rangeDelIter always contains the fragmented range
// tombstone for the current table in level i that the levelIter has open.
//
// Another crucial mechanism of levelIter is that it materializes fake point
// entries for the table boundaries if the boundary is a range deletion
// key. Consider a table that contains only a range tombstone [a-e)#10. The
// sstable boundaries for this table will be a#10,15 and
// e#72057594037927935,15. During forward iteration levelIter will return
// e#72057594037927935,15 as a key. During reverse iteration levelIter will
// return a#10,15 as a key. These sentinel keys act as bookends to point
// iteration and allow mergingIter to keep a table and its associated range
// tombstones loaded as long as there are keys at lower levels that are within
// the bounds of the table.
//
// The final piece to the range deletion puzzle is the LSM invariant that for a
// given key K newer versions of K can only exist earlier in the level, or at
// higher levels of the tree. For example, if K#4 exists in L3, K#5 can only
// exist earlier in L3 or in L0, L1, L2 or a memtable. Get very explicitly
// uses this invariant to find the value for a key by walking the LSM level by
// level. For range deletions, this invariant means that a range deletion at
// level N will necessarily shadow any keys within its bounds in level Y where
// Y > N. One wrinkle to this statement is that it only applies to keys that
// lie within the sstable bounds as well, but we get that guarantee due to the
// way the range deletion iterator and point iterator are bound together by a
// levelIter.
//
// Tying the above all together, we get a picture where each level (index in
// mergingIter.levels) is composed of both point operations (pX) and range
// deletions (rX). The range deletions for level X shadow both the point
// operations and range deletions for level Y where Y > X, allowing mergingIter
// to skip processing entries in that shadow. For example, consider the
// scenario:
//
//	r0: a---e
//	r1:    d---h
//	r2:       g---k
//	r3:          j---n
//	r4:             m---q
//
// This is showing 5 levels of range deletions. Consider what happens upon
// SeekGE("b"). We first seek the point iterator for level 0 (the point values
// are not shown above) and we then seek the range deletion iterator. That
// returns the tombstone [a,e). This tombstone tells us that all keys in the
// range [a,e) in lower levels are deleted so we can skip them. So we can
// adjust the seek key to "e", the tombstone end key. For level 1 we seek to
// "e" and find the range tombstone [d,h) and similar logic holds. By the time
// we get to level 4 we're seeking to "n".
//
// One consequence of not truncating tombstone end keys to sstable boundaries
// is that the seeking process described above cannot always seek to the
// tombstone end key in the older level. For example, imagine in the above
// example r3 is a partitioned level (i.e., L1+ in our LSM), and the sstable
// containing [j, n) has "k" as its upper boundary. In this situation,
// compactions involving keys at or after "k" can output those keys to r4+,
// even if they're newer than our tombstone [j, n). So instead of seeking to
// "n" in r4 we can only seek to "k". To achieve this, the instance variable
// `largestUserKey` maintains the upper bounds of the current sstables in the
// partitioned levels.
// In this example, `levels[3].largestUserKey` holds "k", telling us to limit
// the seek triggered by a tombstone in r3 to "k".
//
// During actual iteration levels can contain both point operations and range
// deletions. Within a level, when a range deletion contains a point operation
// the sequence numbers must be checked to determine if the point operation is
// newer or older than the range deletion tombstone. The mergingIter maintains
// the invariant that the range deletion iterators for all levels newer than
// the current iteration key (L < m.heap.items[0].index) are positioned at the
// next (or previous during reverse iteration) range deletion tombstone. We
// know those levels don't contain a range deletion tombstone that covers the
// current key because if they did the current key would be deleted. The range
// deletion iterator for the current key's level is positioned at a range
// tombstone covering or past the current key. The position of all other
// range deletion iterators is unspecified. Whenever a key from those levels
// becomes the current key, their range deletion iterators need to be
// positioned. This lazy positioning avoids seeking the range deletion
// iterators for keys that are never considered. (A similar bit of lazy
// evaluation can be done for the point iterators, but is still TBD.)
//
// For a full example, consider the following setup:
//
//	p0:               o
//	r0:             m---q
//
//	p1:              n p
//	r1:       g---k
//
//	p2:  b d    i
//	r2: a---e           q----v
//
//	p3:     e
//	r3:
//
// If we start iterating from the beginning, the first key we encounter is "b"
// in p2. When the mergingIter is pointing at a valid entry, the range deletion
// iterators for all of the levels < m.heap.items[0].index are positioned at
// the next range tombstone past the current key. So r0 will point at [m,q) and
// r1 at [g,k). When the key "b" is encountered, we check to see if the current
// tombstone for r0 or r1 contains it, and whether the tombstone for r2, [a,e),
// contains and is newer than "b".
//
// Advancing the iterator finds the next key at "d". This is in the same level
// as the previous key "b" so we don't have to reposition any of the range
// deletion iterators, but merely check whether "d" is now contained by any of
// the range tombstones at higher levels or has stepped past the range
// tombstone in its own level or higher levels. In this case, there is nothing
// to be done.
//
// Advancing the iterator again finds "e". Since "e" comes from p3, we have to
// position the r3 range deletion iterator, which is empty. "e" is past the r2
// tombstone of [a,e) so we need to advance the r2 range deletion iterator to
// [q,v).
//
// The next key is "i". Because this key is in p2, a level above "e", we don't
// have to reposition any range deletion iterators and instead see that "i" is
// covered by the range tombstone [g,k). The iterator is immediately advanced
// to "n" which is covered by the range tombstone [m,q) causing the iterator to
// advance to "o" which is visible.
//
// TODO(peter,rangedel): For testing, advance the iterator through various
// scenarios and have each step display the current state (i.e. the current
// heap and range-del iterator positioning).
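//
// As a rough usage sketch (illustrative only, not part of the original file;
// logger, stats, cmp, split, a and b are assumed to be supplied by the caller,
// with a and b satisfying internalIterator), a caller constructs a mergingIter
// with newMergingIter below and walks it like any other internal iterator:
//
//	m := newMergingIter(logger, &stats, cmp, split, a, b)
//	for key, val := m.First(); key != nil; key, val = m.Next() {
//		// key/val is the merged view of a and b, with range deletions
//		// (when configured) already applied and shadowed keys skipped.
//		_ = val
//	}
//	if err := m.Close(); err != nil {
//		// handle the aggregated iterator error
//	}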
type mergingIter struct {
	logger        Logger
	split         Split
	dir           int
	snapshot      uint64
	batchSnapshot uint64
	levels        []mergingIterLevel
	heap          mergingIterHeap
	err           error
	prefix        []byte
	lower         []byte
	upper         []byte
	stats         *InternalIteratorStats

	// levelsPositioned, if non-nil, is a slice of the same length as levels.
	// It's used by NextPrefix to record which levels have already been
	// repositioned. It's created lazily by the first call to NextPrefix.
	levelsPositioned []bool

	combinedIterState *combinedIterState

	// Used in some tests to disable the random disabling of seek optimizations.
	forceEnableSeekOpt bool
}

// mergingIter implements the base.InternalIterator interface.
var _ base.InternalIterator = (*mergingIter)(nil)

// newMergingIter returns an iterator that merges its input. Walking the
// resultant iterator will return all key/value pairs of all input iterators
// in strictly increasing key order, as defined by cmp. It is permissible to
// pass a nil split parameter if the caller is never going to call
// SeekPrefixGE.
//
// The input's key ranges may overlap, but there are assumed to be no duplicate
// keys: if iters[i] contains a key k then iters[j] will not contain that key k.
//
// None of the iters may be nil.
func newMergingIter(
	logger Logger,
	stats *base.InternalIteratorStats,
	cmp Compare,
	split Split,
	iters ...internalIterator,
) *mergingIter {
	m := &mergingIter{}
	levels := make([]mergingIterLevel, len(iters))
	for i := range levels {
		levels[i].iter = iters[i]
	}
	m.init(&IterOptions{logger: logger}, stats, cmp, split, levels...)
	return m
}

func (m *mergingIter) init(
	opts *IterOptions,
	stats *base.InternalIteratorStats,
	cmp Compare,
	split Split,
	levels ...mergingIterLevel,
) {
	m.err = nil // clear cached iteration error
	m.logger = opts.getLogger()
	if opts != nil {
		m.lower = opts.LowerBound
		m.upper = opts.UpperBound
	}
	m.snapshot = InternalKeySeqNumMax
	m.batchSnapshot = InternalKeySeqNumMax
	m.levels = levels
	m.heap.cmp = cmp
	m.split = split
	m.stats = stats
	if cap(m.heap.items) < len(levels) {
		m.heap.items = make([]*mergingIterLevel, 0, len(levels))
	} else {
		m.heap.items = m.heap.items[:0]
	}
	for l := range m.levels {
		m.levels[l].index = l
	}
}

func (m *mergingIter) initHeap() {
	m.heap.items = m.heap.items[:0]
	for i := range m.levels {
		if l := &m.levels[i]; l.iterKey != nil {
			m.heap.items = append(m.heap.items, l)
		} else {
			m.err = firstError(m.err, l.iter.Error())
			if m.err != nil {
				return
			}
		}
	}
	m.heap.init()
}

func (m *mergingIter) initMinHeap() {
	m.dir = 1
	m.heap.reverse = false
	m.initHeap()
	m.initMinRangeDelIters(-1)
}

// The level of the previous top element was oldTopLevel. Note that all range
// delete iterators < oldTopLevel are positioned past the key of the previous
// top element and the range delete iterator == oldTopLevel is positioned at or
// past the key of the previous top element. We need to position the range
// delete iterators from oldTopLevel + 1 to the level of the current top
// element.
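//
// For example (an illustrative trace, not from the original comment): with
// oldTopLevel == 2 and the new top element at level 5 with key "k", the loop
// below performs
//
//	levels[3].tombstone = levels[3].rangeDelIter.SeekGE("k")
//	levels[4].tombstone = levels[4].rangeDelIter.SeekGE("k")
//	levels[5].tombstone = levels[5].rangeDelIter.SeekGE("k")
//
// and leaves levels 0-2 untouched; their cached tombstones are already valid.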
func (m *mergingIter) initMinRangeDelIters(oldTopLevel int) {
	if m.heap.len() == 0 {
		return
	}

	// Position the range-del iterators at levels <= m.heap.items[0].index.
	item := m.heap.items[0]
	for level := oldTopLevel + 1; level <= item.index; level++ {
		l := &m.levels[level]
		if l.rangeDelIter == nil {
			continue
		}
		l.tombstone = l.rangeDelIter.SeekGE(item.iterKey.UserKey)
	}
}

func (m *mergingIter) initMaxHeap() {
	m.dir = -1
	m.heap.reverse = true
	m.initHeap()
	m.initMaxRangeDelIters(-1)
}

// The level of the previous top element was oldTopLevel. Note that all range
// delete iterators < oldTopLevel are positioned before the key of the previous
// top element and the range delete iterator == oldTopLevel is positioned at or
// before the key of the previous top element. We need to position the range
// delete iterators from oldTopLevel + 1 to the level of the current top
// element.
func (m *mergingIter) initMaxRangeDelIters(oldTopLevel int) {
	if m.heap.len() == 0 {
		return
	}
	// Position the range-del iterators at levels <= m.heap.items[0].index.
	item := m.heap.items[0]
	for level := oldTopLevel + 1; level <= item.index; level++ {
		l := &m.levels[level]
		if l.rangeDelIter == nil {
			continue
		}
		l.tombstone = keyspan.SeekLE(m.heap.cmp, l.rangeDelIter, item.iterKey.UserKey)
	}
}

func (m *mergingIter) switchToMinHeap() {
	if m.heap.len() == 0 {
		if m.lower != nil {
			m.SeekGE(m.lower, base.SeekGEFlagsNone)
		} else {
			m.First()
		}
		return
	}

	// We're switching from using a max heap to a min heap. We need to advance
	// any iterator that is less than or equal to the current key. Consider the
	// scenario where we have 2 iterators being merged (user-key:seq-num):
	//
	//	i1: *a:2      b:2
	//	i2:      a:1       b:1
	//
	// The current key is a:2 and i2 is pointed at a:1. When we switch to forward
	// iteration, we want to return a key that is greater than a:2.

	key := m.heap.items[0].iterKey
	cur := m.heap.items[0]

	for i := range m.levels {
		l := &m.levels[i]
		if l == cur {
			continue
		}

		// If the iterator is exhausted, it may be out of bounds if range
		// deletions modified our search key as we descended. We need to
		// reposition it within the search bounds. If the current key is a
		// range tombstone, the iterator might still be exhausted but at an
		// sstable boundary sentinel. It would be okay to reposition an
		// iterator like this only through successive Next calls, except that
		// it would violate the levelIter's invariants by causing it to return
		// a key before the lower bound.
		//
		//	bounds = [ f, _ )
		//	L0:   [ b ]          [ f*          z ]
		//	L1:   [ a               |----|   k    y ]
		//	L2:    [ c (d) ]   [ e    g    m ]
		//	L3:             [ x ]
		//
		//	* - current key   [] - table bounds   () - heap item
		//
		// In the above diagram, the L2 iterator is positioned at an sstable
		// boundary (d) outside the lower bound (f). It arrived here from a
		// seek whose seek-key was modified by a range tombstone. If we called
		// Next on the L2 iterator, it would return e, violating its lower
		// bound. Instead, we seek it to >= f and Next from there.

		if l.iterKey == nil || (m.lower != nil && l.isSyntheticIterBoundsKey &&
			l.iterKey.IsExclusiveSentinel() &&
			m.heap.cmp(l.iterKey.UserKey, m.lower) <= 0) {
			if m.lower != nil {
				l.iterKey, l.iterValue = l.iter.SeekGE(m.lower, base.SeekGEFlagsNone)
			} else {
				l.iterKey, l.iterValue = l.iter.First()
			}
		}
		for ; l.iterKey != nil; l.iterKey, l.iterValue = l.iter.Next() {
			if base.InternalCompare(m.heap.cmp, *key, *l.iterKey) < 0 {
				// key < iter-key
				break
			}
			// key >= iter-key
		}
	}

	// Special handling for the current iterator because we were using its key
	// above. The iterator cur.iter may still be exhausted at an sstable boundary
	// sentinel. Similar to the logic applied to the other levels, in these
	// cases we seek the iterator to the first key in order to avoid violating
	// levelIter's invariants. See the example in the for loop above.
	if m.lower != nil && cur.isSyntheticIterBoundsKey && cur.iterKey.IsExclusiveSentinel() &&
		m.heap.cmp(cur.iterKey.UserKey, m.lower) <= 0 {
		cur.iterKey, cur.iterValue = cur.iter.SeekGE(m.lower, base.SeekGEFlagsNone)
	} else {
		cur.iterKey, cur.iterValue = cur.iter.Next()
	}
	m.initMinHeap()
}

func (m *mergingIter) switchToMaxHeap() {
	if m.heap.len() == 0 {
		if m.upper != nil {
			m.SeekLT(m.upper, base.SeekLTFlagsNone)
		} else {
			m.Last()
		}
		return
	}

	// We're switching from using a min heap to a max heap. We need to back up
	// any iterator that is greater than or equal to the current key. Consider
	// the scenario where we have 2 iterators being merged (user-key:seq-num):
	//
	//	i1: a:2      *b:2
	//	i2:      a:1       b:1
	//
	// The current key is b:2 and i2 is pointing at b:1. When we switch to
	// reverse iteration, we want to return a key that is less than b:2.
	key := m.heap.items[0].iterKey
	cur := m.heap.items[0]

	for i := range m.levels {
		l := &m.levels[i]
		if l == cur {
			continue
		}

		// If the iterator is exhausted, it may be out of bounds if range
		// deletions modified our search key as we descended. We need to
		// reposition it within the search bounds. If the current key is a
		// range tombstone, the iterator might still be exhausted but at an
		// sstable boundary sentinel. It would be okay to reposition an
		// iterator like this only through successive Prev calls, except that
		// it would violate the levelIter's invariants by causing it to return
		// a key beyond the upper bound.
		//
		//	bounds = [ _, g )
		//	L0:   [ b ]          [ f*          z ]
		//	L1:   [ a    |-------|   k         y ]
		//	L2:    [ c  d ]    h   [(i)   m ]
		//	L3:    [ e              x ]
		//
		//	* - current key   [] - table bounds   () - heap item
		//
		// In the above diagram, the L2 iterator is positioned at an sstable
		// boundary (i) outside the upper bound (g). It arrived here from a
		// seek whose seek-key was modified by a range tombstone. If we called
		// Prev on the L2 iterator, it would return h, violating its upper
		// bound. Instead, we seek it to < g, and Prev from there.

		if l.iterKey == nil || (m.upper != nil && l.isSyntheticIterBoundsKey &&
			l.iterKey.IsExclusiveSentinel() && m.heap.cmp(l.iterKey.UserKey, m.upper) >= 0) {
			if m.upper != nil {
				l.iterKey, l.iterValue = l.iter.SeekLT(m.upper, base.SeekLTFlagsNone)
			} else {
				l.iterKey, l.iterValue = l.iter.Last()
			}
		}
		for ; l.iterKey != nil; l.iterKey, l.iterValue = l.iter.Prev() {
			if base.InternalCompare(m.heap.cmp, *key, *l.iterKey) > 0 {
				// key > iter-key
				break
			}
			// key <= iter-key
		}
	}

	// Special handling for the current iterator because we were using its key
	// above. The iterator cur.iter may still be exhausted at an sstable boundary
	// sentinel. Similar to the logic applied to the other levels, in these
	// cases we seek the iterator in order to avoid violating levelIter's
	// invariants by Prev-ing through files. See the example in the for loop
	// above.
	if m.upper != nil && cur.isSyntheticIterBoundsKey && cur.iterKey.IsExclusiveSentinel() &&
		m.heap.cmp(cur.iterKey.UserKey, m.upper) >= 0 {
		cur.iterKey, cur.iterValue = cur.iter.SeekLT(m.upper, base.SeekLTFlagsNone)
	} else {
		cur.iterKey, cur.iterValue = cur.iter.Prev()
	}
	m.initMaxHeap()
}

// maybeNextEntryWithinPrefix steps to the next entry, as long as the iteration
// prefix has not already been exceeded. If it has, it exhausts the iterator by
// resetting the heap to empty.
func (m *mergingIter) maybeNextEntryWithinPrefix(l *mergingIterLevel) {
	if s := m.split(l.iterKey.UserKey); !bytes.Equal(m.prefix, l.iterKey.UserKey[:s]) {
		// The item at the root of the heap already exceeds the iteration
		// prefix. We should not advance any more. Clear the heap to reflect
		// that the iterator is now exhausted (within this prefix, at
		// least).
		m.heap.items = m.heap.items[:0]
		return
	}
	m.nextEntry(l, nil /* succKey */)
}

// nextEntry unconditionally steps to the next entry. l is the current top
// item in the heap.
//
// nextEntry should be called directly when not in prefix-iteration mode, or by
// Next. During prefix iteration mode, all other callers should use
// maybeNextEntryWithinPrefix which will avoid advancing the iterator if the
// current iteration prefix has been exhausted. See the comment within
// nextEntry's body for an explanation of why other callers should call
// maybeNextEntryWithinPrefix, which will ensure the documented invariant is
// preserved.
func (m *mergingIter) nextEntry(l *mergingIterLevel, succKey []byte) {
	// INVARIANT: If in prefix iteration mode, l.iterKey must have a prefix equal
	// to m.prefix. This invariant is important for ensuring TrySeekUsingNext
	// optimizations behave correctly.
	//
	// During prefix iteration, the iterator does not have a full view of the
	// LSM. Some level iterators may omit keys that are known to fall outside
	// the seek prefix (eg, due to sstable bloom filter exclusion). It's
	// important that in such cases we don't position any iterators beyond
	// m.prefix, because doing so may interfere with future seeks.
	//
	// Let prefixes P1 < P2 < P3. Imagine a SeekPrefixGE to prefix P1, followed
	// by a SeekPrefixGE to prefix P2. Imagine there exist live keys at prefix
	// P2, but they're not visible to the SeekPrefixGE(P1) (because of
	// bloom-filter exclusion or a range tombstone that deletes prefix P1 but
	// not P2).
	// If the SeekPrefixGE(P1) is allowed to move any level iterators
	// to P3, the SeekPrefixGE(P2, TrySeekUsingNext=true) may mistakenly think
	// the level contains no point keys or range tombstones within the prefix
	// P2. Care is taken to avoid ever advancing the iterator beyond the current
	// prefix. If nextEntry is ever invoked while we're already beyond the
	// current prefix, we're violating the invariant.
	if invariants.Enabled && m.prefix != nil {
		if s := m.split(l.iterKey.UserKey); !bytes.Equal(m.prefix, l.iterKey.UserKey[:s]) {
			m.logger.Fatalf("mergingIter: prefix violation: nexting beyond prefix %q; existing heap root %q\n%s",
				m.prefix, l.iterKey, debug.Stack())
		}
	}

	oldTopLevel := l.index
	oldRangeDelIter := l.rangeDelIter

	if succKey == nil {
		l.iterKey, l.iterValue = l.iter.Next()
	} else {
		l.iterKey, l.iterValue = l.iter.NextPrefix(succKey)
	}

	if l.iterKey != nil {
		if m.heap.len() > 1 {
			m.heap.fix(0)
		}
		if l.rangeDelIter != oldRangeDelIter {
			// The rangeDelIter changed which indicates that the l.iter moved to the
			// next sstable. We have to update the tombstone for oldTopLevel as well.
			oldTopLevel--
		}
	} else {
		m.err = l.iter.Error()
		if m.err == nil {
			m.heap.pop()
		}
	}

	// The cached tombstones are only valid for the levels
	// [0,oldTopLevel]. Update the cached tombstones for any levels in the range
	// [oldTopLevel+1,heap[0].index].
	m.initMinRangeDelIters(oldTopLevel)
}

// isNextEntryDeleted starts from the current entry (as the next entry) and if
// it is deleted, moves the iterators forward as needed and returns true, else
// it returns false. item is the top item in the heap.
//
// During prefix iteration mode, isNextEntryDeleted will exhaust the iterator
// by clearing the heap if the deleted key(s) extend beyond the iteration
// prefix.
func (m *mergingIter) isNextEntryDeleted(item *mergingIterLevel) bool {
	// Look for a range deletion tombstone containing item.iterKey at higher
	// levels (level < item.index). If we find such a range tombstone we know
	// it deletes the key in the current level. Also look for a range
	// deletion at the current level (level == item.index). If we find such a
	// range deletion we need to check whether it is newer than the current
	// entry.
	for level := 0; level <= item.index; level++ {
		l := &m.levels[level]
		if l.rangeDelIter == nil || l.tombstone == nil {
			// If l.tombstone is nil, there are no further tombstones
			// in the current sstable in the current (forward) iteration
			// direction.
			continue
		}
		if m.heap.cmp(l.tombstone.End, item.iterKey.UserKey) <= 0 {
			// The current key is at or past the tombstone end key.
			//
			// NB: for the case that this l.rangeDelIter is provided by a levelIter we know that
			// the levelIter must be positioned at a key >= item.iterKey. So it is sufficient to seek the
			// current l.rangeDelIter (since any range del iterators that will be provided by the
			// levelIter in the future cannot contain item.iterKey). Also, it is possible that we
			// will encounter parts of the range delete that should be ignored -- we handle that
			// below.
			l.tombstone = l.rangeDelIter.SeekGE(item.iterKey.UserKey)
		}
		if l.tombstone == nil {
			continue
		}

		// Reasoning for correctness of untruncated tombstone handling when the untruncated
		// tombstone is at a higher level:
		// The iterator corresponding to this tombstone is still in the heap so it must be
		// positioned >= item.iterKey. Which means the Largest key bound of the sstable containing this
		// tombstone is >= item.iterKey. So the upper limit of this tombstone cannot be file-bounds-constrained
		// to < item.iterKey. But it is possible that item.key < smallestUserKey, in which
		// case this tombstone should be ignored.
		//
		// Example 1:
		// sstable bounds [c#8, g#12] containing a tombstone [b, i)#7, and key is c#6. The
		// smallestUserKey is c, so we know the key is within the file bounds and the tombstone
		// [b, i) covers it.
		//
		// Example 2:
		// Same sstable bounds but key is b#10. The smallestUserKey is c, so the tombstone [b, i)
		// does not cover this key.
		//
		// For a tombstone at the same level as the key, the file bounds are trivially satisfied.
		if (l.smallestUserKey == nil || m.heap.cmp(l.smallestUserKey, item.iterKey.UserKey) <= 0) &&
			l.tombstone.VisibleAt(m.snapshot) && l.tombstone.Contains(m.heap.cmp, item.iterKey.UserKey) {
			if level < item.index {
				// We could also do m.seekGE(..., level + 1). The levels from
				// [level + 1, item.index) are already after item.iterKey so seeking them may be
				// wasteful.

				// We can seek up to the min of largestUserKey and tombstone.End.
				//
				// Using example 1 above, we can seek to the smaller of g and i, which is g.
				//
				// Another example, where the sstable bounds are [c#8, i#InternalRangeDelSentinel],
				// and the tombstone is [b, i)#8. Seeking to i is correct since it is seeking up to
				// the exclusive bound of the tombstone. We do not need to look at
				// isLargestKeyRangeDelSentinel.
				//
				// Progress argument: Since this file is at a higher level than item.iterKey we know
				// that the iterator in this file must be positioned within its bounds and at a key
				// X > item.iterKey (otherwise it would be the min of the heap). It is not
				// possible for X.UserKey == item.iterKey.UserKey, since it is incompatible with
				// X > item.iterKey (a lower version cannot be in a higher sstable), so it must be that
				// X.UserKey > item.iterKey.UserKey. Which means l.largestUserKey > item.key.UserKey.
				// We also know that l.tombstone.End > item.iterKey.UserKey. So the min of these,
				// seekKey, computed below, is > item.iterKey.UserKey, so the call to seekGE() will
				// make forward progress.
				seekKey := l.tombstone.End
				if l.largestUserKey != nil && m.heap.cmp(l.largestUserKey, seekKey) < 0 {
					seekKey = l.largestUserKey
				}
				// This seek is not directly due to a SeekGE call, so we don't know
				// enough about the underlying iterator positions, and so we keep the
				// try-seek-using-next optimization disabled. Additionally, if we're in
				// prefix-seek mode and a re-seek would have moved us past the original
				// prefix, we can remove all merging iter levels below the rangedel
				// tombstone's level and return immediately instead of re-seeking. This
				// is correct since those levels cannot provide a key that both matches
				// the prefix and is visible.
				// Additionally, this is important to make
				// subsequent `TrySeekUsingNext` work correctly, as a re-seek on a
				// different prefix could have resulted in this iterator skipping visible
				// keys at prefixes between m.prefix and seekKey that are currently
				// not in the heap due to a bloom filter mismatch.
				//
				// Additionally, we set the relative-seek flag. This is
				// important when iterating with lazy combined iteration. If
				// there's a range key between this level's current file and the
				// file the seek will land on, we need to detect it in order to
				// trigger construction of the combined iterator.
				if m.prefix != nil {
					if n := m.split(seekKey); !bytes.Equal(m.prefix, seekKey[:n]) {
						for i := item.index; i < len(m.levels); i++ {
							// Remove this level from the heap. Setting iterKey and iterValue
							// to their zero values should be sufficient for initMinHeap to not
							// re-initialize the heap with them in it. Other fields in
							// mergingIterLevel can remain as-is; the iter/rangeDelIter needs
							// to stay intact for future trySeekUsingNexts to work, the level
							// iter boundary context is owned by the levelIter which is not
							// being repositioned, and any tombstones in these levels will be
							// irrelevant for us anyway.
							m.levels[i].iterKey = nil
							m.levels[i].iterValue = base.LazyValue{}
						}
						// TODO(bilal): Consider a more efficient way of removing levels from
						// the heap without reinitializing all of it. This would likely
						// necessitate tracking the heap positions of each mergingIterHeap
						// item in the mergingIterLevel, and then swapping that item in the
						// heap with the last-positioned heap item, and shrinking the heap by
						// one.
						m.initMinHeap()
						return true
					}
				}
				m.seekGE(seekKey, item.index, base.SeekGEFlagsNone.EnableRelativeSeek())
				return true
			}
			if l.tombstone.CoversAt(m.snapshot, item.iterKey.SeqNum()) {
				if m.prefix == nil {
					m.nextEntry(item, nil /* succKey */)
				} else {
					m.maybeNextEntryWithinPrefix(item)
				}
				return true
			}
		}
	}
	return false
}

// Starting from the current entry, finds the first (next) entry that can be returned.
func (m *mergingIter) findNextEntry() (*InternalKey, base.LazyValue) {
	for m.heap.len() > 0 && m.err == nil {
		item := m.heap.items[0]
		if m.levels[item.index].isSyntheticIterBoundsKey {
			break
		}

		m.addItemStats(item)

		// Skip ignorable boundary keys. These are not real keys and exist to
		// keep sstables open until we've surpassed their end boundaries so that
		// their range deletions are visible.
		if m.levels[item.index].isIgnorableBoundaryKey {
			if m.prefix == nil {
				m.nextEntry(item, nil /* succKey */)
			} else {
				m.maybeNextEntryWithinPrefix(item)
			}
			continue
		}

		// Check if the heap root key is deleted by a range tombstone in a
		// higher level. If it is, isNextEntryDeleted will advance the iterator
		// to a later key (through seeking or nexting).
		if m.isNextEntryDeleted(item) {
			m.stats.PointsCoveredByRangeTombstones++
			continue
		}

		// Check if the key is visible at the iterator sequence numbers.
		if !item.iterKey.Visible(m.snapshot, m.batchSnapshot) {
			if m.prefix == nil {
				m.nextEntry(item, nil /* succKey */)
			} else {
				m.maybeNextEntryWithinPrefix(item)
			}
			continue
		}

		// The heap root is visible and not deleted by any range tombstones.
		// Return it.
		return item.iterKey, item.iterValue
	}
	return nil, base.LazyValue{}
}

// prevEntry steps to the prev entry. l is the current top item in the heap.
func (m *mergingIter) prevEntry(l *mergingIterLevel) {
	oldTopLevel := l.index
	oldRangeDelIter := l.rangeDelIter
	if l.iterKey, l.iterValue = l.iter.Prev(); l.iterKey != nil {
		if m.heap.len() > 1 {
			m.heap.fix(0)
		}
		if l.rangeDelIter != oldRangeDelIter && l.rangeDelIter != nil {
			// The rangeDelIter changed which indicates that the l.iter moved to the
			// previous sstable. We have to update the tombstone for oldTopLevel as
			// well.
			oldTopLevel--
		}
	} else {
		m.err = l.iter.Error()
		if m.err == nil {
			m.heap.pop()
		}
	}

	// The cached tombstones are only valid for the levels
	// [0,oldTopLevel]. Update the cached tombstones for any levels in the range
	// [oldTopLevel+1,heap[0].index].
	m.initMaxRangeDelIters(oldTopLevel)
}

// isPrevEntryDeleted starts from the current entry (as the prev entry) and if
// it is deleted, moves the iterators backward as needed and returns true, else
// it returns false. item is the top item in the heap.
func (m *mergingIter) isPrevEntryDeleted(item *mergingIterLevel) bool {
	// Look for a range deletion tombstone containing item.iterKey at higher
	// levels (level < item.index). If we find such a range tombstone we know
	// it deletes the key in the current level. Also look for a range
	// deletion at the current level (level == item.index). If we find such a
	// range deletion we need to check whether it is newer than the current
	// entry.
	for level := 0; level <= item.index; level++ {
		l := &m.levels[level]
		if l.rangeDelIter == nil || l.tombstone == nil {
			// If l.tombstone is nil, there are no further tombstones
			// in the current sstable in the current (reverse) iteration
			// direction.
			continue
		}
		if m.heap.cmp(item.iterKey.UserKey, l.tombstone.Start) < 0 {
			// The current key is before the tombstone start key.
			//
			// NB: for the case that this l.rangeDelIter is provided by a levelIter we know that
			// the levelIter must be positioned at a key < item.iterKey. So it is sufficient to seek the
			// current l.rangeDelIter (since any range del iterators that will be provided by the
			// levelIter in the future cannot contain item.iterKey). Also, it is possible that we
			// will encounter parts of the range delete that should be ignored -- we handle that
			// below.
			l.tombstone = keyspan.SeekLE(m.heap.cmp, l.rangeDelIter, item.iterKey.UserKey)
		}
		if l.tombstone == nil {
			continue
		}

		// Reasoning for correctness of untruncated tombstone handling when the untruncated
		// tombstone is at a higher level:
		//
		// The iterator corresponding to this tombstone is still in the heap so it must be
		// positioned <= item.iterKey. Which means the Smallest key bound of the sstable containing this
		// tombstone is <= item.iterKey. So the lower limit of this tombstone cannot have been
		// file-bounds-constrained to > item.iterKey. But it is possible that item.key >= Largest
		// key bound of this sstable, in which case this tombstone should be ignored.
		//
		// Example 1:
		// sstable bounds [c#8, g#12] containing a tombstone [b, i)#7, and key is f#6. The
		// largestUserKey is g, so we know the key is within the file bounds and the tombstone
		// [b, i) covers it.
		//
		// Example 2:
		// Same sstable but the key is g#6. This cannot happen since the [b, i)#7 untruncated
		// tombstone was involved in a compaction which must have had a file to the right of this
		// sstable that is part of the same atomic compaction group for future compactions. That
		// file must have bounds that cover g#6 and this levelIter must be at that file.
		//
		// Example 3:
		// sstable bounds [c#8, g#RangeDelSentinel] containing [b, i)#7 and the key is g#10.
		// This key is not deleted by this tombstone. We need to look at
		// isLargestUserKeyExclusive.
		//
		// For a tombstone at the same level as the key, the file bounds are trivially satisfied.

		// Default to within bounds.
		withinLargestSSTableBound := true
		if l.largestUserKey != nil {
			cmpResult := m.heap.cmp(l.largestUserKey, item.iterKey.UserKey)
			withinLargestSSTableBound = cmpResult > 0 || (cmpResult == 0 && !l.isLargestUserKeyExclusive)
		}
		if withinLargestSSTableBound && l.tombstone.Contains(m.heap.cmp, item.iterKey.UserKey) && l.tombstone.VisibleAt(m.snapshot) {
			if level < item.index {
				// We could also do m.seekLT(..., level + 1). The levels from
				// [level + 1, item.index) are already before item.iterKey so seeking them may be
				// wasteful.

				// We can seek up to the max of smallestUserKey and tombstone.Start.
				//
				// Using example 1 above, we can seek to the larger of c and b, which is c.
				//
				// Progress argument: We know that the iterator in this file is positioned within
				// its bounds and at a key X < item.iterKey (otherwise it would be the max of the heap).
				// So smallestUserKey <= item.iterKey.UserKey and we already know that
				// l.tombstone.Start <= item.iterKey.UserKey. So the seekKey computed below
				// is <= item.iterKey.UserKey, and since we do a seekLT() we will make backwards
				// progress.
				seekKey := l.tombstone.Start
				if l.smallestUserKey != nil && m.heap.cmp(l.smallestUserKey, seekKey) > 0 {
					seekKey = l.smallestUserKey
				}
				// We set the relative-seek flag. This is important when
				// iterating with lazy combined iteration. If there's a range
				// key between this level's current file and the file the seek
				// will land on, we need to detect it in order to trigger
				// construction of the combined iterator.
				m.seekLT(seekKey, item.index, base.SeekLTFlagsNone.EnableRelativeSeek())
				return true
			}
			if l.tombstone.CoversAt(m.snapshot, item.iterKey.SeqNum()) {
				m.prevEntry(item)
				return true
			}
		}
	}
	return false
}

// Starting from the current entry, finds the first (prev) entry that can be returned.
func (m *mergingIter) findPrevEntry() (*InternalKey, base.LazyValue) {
	for m.heap.len() > 0 && m.err == nil {
		item := m.heap.items[0]
		if m.levels[item.index].isSyntheticIterBoundsKey {
			break
		}
		m.addItemStats(item)
		if m.isPrevEntryDeleted(item) {
			m.stats.PointsCoveredByRangeTombstones++
			continue
		}
		if item.iterKey.Visible(m.snapshot, m.batchSnapshot) &&
			(!m.levels[item.index].isIgnorableBoundaryKey) {
			return item.iterKey, item.iterValue
		}
		m.prevEntry(item)
	}
	return nil, base.LazyValue{}
}

// Seeks levels >= level to >= key. Additionally uses range tombstones to
// extend the seeks.
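//
// In rough pseudocode (an illustrative sketch of the loop in the body below,
// not a separate API), the per-level seek-key adjustment looks like:
//
//	for each level l from `level` downward:
//	    position l.iter at the first entry >= key
//	    if l has a visible range tombstone [s, e) containing key
//	    (and key is within l's file bounds):
//	        key = min(e, l's largestUserKey)  // lower levels may skip ahead
//	initMinHeap()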
func (m *mergingIter) seekGE(key []byte, level int, flags base.SeekGEFlags) {
	// When seeking, we can use tombstones to adjust the key we seek to on each
	// level. Consider the series of range tombstones:
	//
	//	1: a---e
	//	2:    d---h
	//	3:       g---k
	//	4:          j---n
	//	5:             m---q
	//
	// If we SeekGE("b") we also find the tombstone that "b" resides within in
	// the first level, which is [a,e). Regardless of whether this tombstone
	// deletes "b" in that level, we know it deletes "b" in all lower levels, so
	// we adjust the search key in the next level to the tombstone end key "e".
	// We then SeekGE("e") in the second level and find the corresponding
	// tombstone [d,h). This process continues and we end up seeking for "h" in
	// the 3rd level, "k" in the 4th level and "n" in the last level.
	//
	// TODO(peter,rangedel): In addition to the above we can delay seeking a
	// level (and any lower levels) when the current iterator position is
	// contained within a range tombstone at a higher level.

	// Deterministically disable the TrySeekUsingNext optimizations sometimes in
	// invariant builds to encourage the metamorphic tests to surface bugs. Note
	// that we cannot disable the optimization within individual levels. It must
	// be disabled for all levels or none. If one lower-level iterator performs
	// a fresh seek whereas another takes advantage of its current iterator
	// position, the heap can become inconsistent. Consider the following
	// example:
	//
	//	L5:  [ [b-c) ]  [ d ]*
	//	L6:  [  b ]         [e]*
	//
	// Imagine a SeekGE(a). The [b-c) range tombstone deletes the L6 point key
	// 'b', resulting in the iterator positioned at d with the heap:
	//
	//	{L5: d, L6: e}
	//
	// A subsequent SeekGE(b) is seeking to a larger key, so the caller may set
	// TrySeekUsingNext()=true. If the L5 iterator used the TrySeekUsingNext
	// optimization but the L6 iterator did not, the iterator would have the
	// heap:
	//
	//	{L6: b, L5: d}
	//
	// Because the L5 iterator has already advanced to the next sstable, the
	// merging iterator cannot observe the [b-c) range tombstone and will
	// mistakenly return L6's deleted point key 'b'.
	if invariants.Enabled && flags.TrySeekUsingNext() && !m.forceEnableSeekOpt &&
		disableSeekOpt(key, uintptr(unsafe.Pointer(m))) {
		flags = flags.DisableTrySeekUsingNext()
	}

	for ; level < len(m.levels); level++ {
		if invariants.Enabled && m.lower != nil && m.heap.cmp(key, m.lower) < 0 {
			m.logger.Fatalf("mergingIter: lower bound violation: %s < %s\n%s", key, m.lower, debug.Stack())
		}

		l := &m.levels[level]
		if m.prefix != nil {
			l.iterKey, l.iterValue = l.iter.SeekPrefixGE(m.prefix, key, flags)
		} else {
			l.iterKey, l.iterValue = l.iter.SeekGE(key, flags)
		}

		// If this level contains overlapping range tombstones, alter the seek
		// key accordingly. Caveat: If we're performing lazy-combined iteration,
		// we cannot alter the seek key: Range tombstones don't delete range
		// keys, and there might exist live range keys within the range
		// tombstone's span that need to be observed to trigger a switch to
		// combined iteration.
		if rangeDelIter := l.rangeDelIter; rangeDelIter != nil &&
			(m.combinedIterState == nil || m.combinedIterState.initialized) {
			// The level has a range-del iterator.
			// Find the tombstone containing the search key.
			//
			// For untruncated tombstones that are possibly file-bounds-constrained, we are using a
			// levelIter which will set smallestUserKey and largestUserKey. Since the levelIter
			// is at this file we know that largestUserKey >= key, so we know that the
			// tombstone we find cannot be file-bounds-constrained in its upper bound to something < key.
			// We do need to compare with smallestUserKey to ensure that the tombstone is not
			// file-bounds-constrained in its lower bound.
			//
			// See the detailed comments in isNextEntryDeleted() on why similar containment and
			// seeking logic is correct. The subtle difference here is that key is a user key,
			// so we can have an sstable with bounds [c#8, i#InternalRangeDelSentinel], and the
			// tombstone is [b, k)#8 and the seek key is i: levelIter.SeekGE(i) will move past
			// this sstable since it realizes the largest key is an InternalRangeDelSentinel.
			l.tombstone = rangeDelIter.SeekGE(key)
			if l.tombstone != nil && l.tombstone.VisibleAt(m.snapshot) && l.tombstone.Contains(m.heap.cmp, key) &&
				(l.smallestUserKey == nil || m.heap.cmp(l.smallestUserKey, key) <= 0) {
				// NB: Based on the comment above l.largestUserKey >= key, and based on the
				// containment condition tombstone.End > key, so the assignment to key results
				// in a monotonically non-decreasing key across iterations of this loop.
				//
				// The adjustment of key here can only move it to a larger key. Since
				// the caller of seekGE guaranteed that the original key was greater
				// than or equal to m.lower, the new key will continue to be greater
				// than or equal to m.lower.
				if l.largestUserKey != nil &&
					m.heap.cmp(l.largestUserKey, l.tombstone.End) < 0 {
					// Truncate the tombstone for seeking purposes. Note that this can over-truncate
					// but that is harmless for this seek optimization.
					key = l.largestUserKey
				} else {
					key = l.tombstone.End
				}
			}
		}
	}

	m.initMinHeap()
}

func (m *mergingIter) String() string {
	return "merging"
}

// SeekGE implements base.InternalIterator.SeekGE. Note that SeekGE only checks
// the upper bound. It is up to the caller to ensure that key is greater than
// or equal to the lower bound.
func (m *mergingIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) {
	m.err = nil // clear cached iteration error
	m.prefix = nil
	m.seekGE(key, 0 /* start level */, flags)
	return m.findNextEntry()
}

// SeekPrefixGE implements base.InternalIterator.SeekPrefixGE. Note that
// SeekPrefixGE only checks the upper bound. It is up to the caller to ensure
// that key is greater than or equal to the lower bound.
func (m *mergingIter) SeekPrefixGE(
	prefix, key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, base.LazyValue) {
	m.err = nil // clear cached iteration error
	m.prefix = prefix
	m.seekGE(key, 0 /* start level */, flags)
	return m.findNextEntry()
}

// Seeks levels >= level to < key. Additionally uses range tombstones to extend
// the seeks.
func (m *mergingIter) seekLT(key []byte, level int, flags base.SeekLTFlags) {
	// See the comment in seekGE regarding using tombstones to adjust the seek
	// target per level.
	m.prefix = nil
	for ; level < len(m.levels); level++ {
		if invariants.Enabled && m.upper != nil && m.heap.cmp(key, m.upper) > 0 {
			m.logger.Fatalf("mergingIter: upper bound violation: %s > %s\n%s", key, m.upper, debug.Stack())
		}

		l := &m.levels[level]
		l.iterKey, l.iterValue = l.iter.SeekLT(key, flags)

		// If this level contains overlapping range tombstones, alter the seek
		// key accordingly. Caveat: If we're performing lazy-combined iteration,
		// we cannot alter the seek key: Range tombstones don't delete range
		// keys, and there might exist live range keys within the range
		// tombstone's span that need to be observed to trigger a switch to
		// combined iteration.
		if rangeDelIter := l.rangeDelIter; rangeDelIter != nil &&
			(m.combinedIterState == nil || m.combinedIterState.initialized) {
			// The level has a range-del iterator. Find the tombstone containing
			// the search key.
			//
			// For untruncated tombstones that are possibly file-bounds-constrained we are using a
			// levelIter which will set smallestUserKey and largestUserKey. Since the levelIter
			// is at this file we know that smallestUserKey <= key, so we know that the
			// tombstone we find cannot be file-bounds-constrained in its lower bound to something > key.
			// We do need to compare with largestUserKey to ensure that the tombstone is not
			// file-bounds-constrained in its upper bound.
			//
			// See the detailed comments in isPrevEntryDeleted() on why similar containment and
			// seeking logic is correct.

			// Default to within bounds.
			withinLargestSSTableBound := true
			if l.largestUserKey != nil {
				cmpResult := m.heap.cmp(l.largestUserKey, key)
				withinLargestSSTableBound = cmpResult > 0 || (cmpResult == 0 && !l.isLargestUserKeyExclusive)
			}

			l.tombstone = keyspan.SeekLE(m.heap.cmp, rangeDelIter, key)
			if l.tombstone != nil && l.tombstone.VisibleAt(m.snapshot) &&
				l.tombstone.Contains(m.heap.cmp, key) && withinLargestSSTableBound {
				// NB: Based on the comment above l.smallestUserKey <= key, and based
				// on the containment condition tombstone.Start <= key, so the
				// assignment to key results in a monotonically non-increasing key
				// across iterations of this loop.
				//
				// The adjustment of key here can only move it to a smaller key. Since
				// the caller of seekLT guaranteed that the original key was less than
				// or equal to m.upper, the new key will continue to be less than or
				// equal to m.upper.
				if l.smallestUserKey != nil &&
					m.heap.cmp(l.smallestUserKey, l.tombstone.Start) >= 0 {
					// Truncate the tombstone for seeking purposes. Note that this can over-truncate
					// but that is harmless for this seek optimization.
					key = l.smallestUserKey
				} else {
					key = l.tombstone.Start
				}
			}
		}
	}

	m.initMaxHeap()
}

// SeekLT implements base.InternalIterator.SeekLT. Note that SeekLT only checks
// the lower bound. It is up to the caller to ensure that key is less than the
// upper bound.
func (m *mergingIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) {
	m.err = nil // clear cached iteration error
	m.prefix = nil
	m.seekLT(key, 0 /* start level */, flags)
	return m.findPrevEntry()
}

// First implements base.InternalIterator.First. Note that First only checks
// the upper bound.
// It is up to the caller to ensure that key is greater than
// or equal to the lower bound (e.g. via a call to SeekGE(lower)).
func (m *mergingIter) First() (*InternalKey, base.LazyValue) {
	m.err = nil // clear cached iteration error
	m.prefix = nil
	m.heap.items = m.heap.items[:0]
	for i := range m.levels {
		l := &m.levels[i]
		l.iterKey, l.iterValue = l.iter.First()
	}
	m.initMinHeap()
	return m.findNextEntry()
}

// Last implements base.InternalIterator.Last. Note that Last only checks the
// lower bound. It is up to the caller to ensure that key is less than the
// upper bound (e.g. via a call to SeekLT(upper)).
func (m *mergingIter) Last() (*InternalKey, base.LazyValue) {
	m.err = nil // clear cached iteration error
	m.prefix = nil
	for i := range m.levels {
		l := &m.levels[i]
		l.iterKey, l.iterValue = l.iter.Last()
	}
	m.initMaxHeap()
	return m.findPrevEntry()
}

func (m *mergingIter) Next() (*InternalKey, base.LazyValue) {
	if m.err != nil {
		return nil, base.LazyValue{}
	}

	if m.dir != 1 {
		m.switchToMinHeap()
		return m.findNextEntry()
	}

	if m.heap.len() == 0 {
		return nil, base.LazyValue{}
	}

	// NB: It's okay to call nextEntry directly even during prefix iteration
	// mode (as opposed to indirectly through maybeNextEntryWithinPrefix).
	// During prefix iteration mode, we rely on the caller to not call Next if
	// the iterator has already advanced beyond the iteration prefix. See the
	// comment above the base.InternalIterator interface.
	m.nextEntry(m.heap.items[0], nil /* succKey */)
	return m.findNextEntry()
}

func (m *mergingIter) NextPrefix(succKey []byte) (*InternalKey, LazyValue) {
	if m.dir != 1 {
		panic("pebble: cannot switch directions with NextPrefix")
	}
	if m.err != nil || m.heap.len() == 0 {
		return nil, LazyValue{}
	}
	if m.levelsPositioned == nil {
		m.levelsPositioned = make([]bool, len(m.levels))
	} else {
		for i := range m.levelsPositioned {
			m.levelsPositioned[i] = false
		}
	}

	// The heap root necessarily must be positioned at a key < succKey, because
	// NextPrefix was invoked.
	root := &m.heap.items[0]
	m.levelsPositioned[(*root).index] = true
	if invariants.Enabled && m.heap.cmp((*root).iterKey.UserKey, succKey) >= 0 {
		m.logger.Fatalf("pebble: invariant violation: NextPrefix(%q) called on merging iterator already positioned at %q",
			succKey, (*root).iterKey)
	}
	m.nextEntry(*root, succKey)
	// NB: root is a pointer to the heap root. nextEntry may have changed
	// the heap root, so we must not expect root to still point to the same
	// level (or to even be valid, if the heap is now exhausted).

	for m.heap.len() > 0 {
		if m.levelsPositioned[(*root).index] {
			// A level we've previously positioned is at the top of the heap, so
			// there are no other levels positioned at keys < succKey. We've
			// advanced as far as we need to.
			break
		}
		// Since this level was not the original heap root when NextPrefix was
		// called, we don't know whether this level's current key has the
		// previous prefix or a new one.
		if m.heap.cmp((*root).iterKey.UserKey, succKey) >= 0 {
			break
		}
		m.levelsPositioned[(*root).index] = true
		m.nextEntry(*root, succKey)
	}
	return m.findNextEntry()
}

func (m *mergingIter) Prev() (*InternalKey, base.LazyValue) {
	if m.err != nil {
		return nil, base.LazyValue{}
	}

	if m.dir != -1 {
		if m.prefix != nil {
			m.err = errors.New("pebble: unsupported reverse prefix iteration")
			return nil, base.LazyValue{}
		}
		m.switchToMaxHeap()
		return m.findPrevEntry()
	}

	if m.heap.len() == 0 {
		return nil, base.LazyValue{}
	}

	m.prevEntry(m.heap.items[0])
	return m.findPrevEntry()
}

func (m *mergingIter) Error() error {
	if m.heap.len() == 0 || m.err != nil {
		return m.err
	}
	return m.levels[m.heap.items[0].index].iter.Error()
}

func (m *mergingIter) Close() error {
	for i := range m.levels {
		iter := m.levels[i].iter
		if err := iter.Close(); err != nil && m.err == nil {
			m.err = err
		}
		if rangeDelIter := m.levels[i].rangeDelIter; rangeDelIter != nil {
			if err := rangeDelIter.Close(); err != nil && m.err == nil {
				m.err = err
			}
		}
	}
	m.levels = nil
	m.heap.items = m.heap.items[:0]
	return m.err
}

func (m *mergingIter) SetBounds(lower, upper []byte) {
	m.prefix = nil
	m.lower = lower
	m.upper = upper
	for i := range m.levels {
		m.levels[i].iter.SetBounds(lower, upper)
	}
	m.heap.clear()
}

func (m *mergingIter) DebugString() string {
	var buf bytes.Buffer
	sep := ""
	for m.heap.len() > 0 {
		item := m.heap.pop()
		fmt.Fprintf(&buf, "%s%s", sep, item.iterKey)
		sep = " "
	}
	if m.dir == 1 {
		m.initMinHeap()
	} else {
		m.initMaxHeap()
	}
	return buf.String()
}

func (m *mergingIter) ForEachLevelIter(fn func(li *levelIter) bool) {
	for _, ml := range m.levels {
		if ml.levelIter != nil {
			if done := fn(ml.levelIter); done {
				break
			}
		}
	}
}

func (m *mergingIter) addItemStats(l *mergingIterLevel) {
	m.stats.PointCount++
	m.stats.KeyBytes += uint64(len(l.iterKey.UserKey))
	m.stats.ValueBytes += uint64(len(l.iterValue.ValueOrHandle))
}

var _ internalIterator = &mergingIter{}
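
// As a final illustrative sketch (not part of the original file): the merging
// iterator supports switching directions mid-iteration, which exercises
// switchToMinHeap/switchToMaxHeap above. Assuming m is a *mergingIter built
// with newMergingIter:
//
//	if key, _ := m.SeekGE([]byte("d"), base.SeekGEFlagsNone); key != nil {
//		// Stepping backward from here re-populates the heap as a max-heap
//		// and returns the largest visible key smaller than the current one.
//		key, _ = m.Prev()
//		_ = key
//	}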