github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/merging_iter.go

// Copyright 2018 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package bitalostable

import (
	"bytes"
	"fmt"
	"runtime/debug"

	"github.com/cockroachdb/errors"
	"github.com/zuoyebang/bitalostable/internal/base"
	"github.com/zuoyebang/bitalostable/internal/invariants"
	"github.com/zuoyebang/bitalostable/internal/keyspan"
)
    17  
    18  type mergingIterLevel struct {
    19  	iter internalIterator
    20  	// rangeDelIter is set to the range-deletion iterator for the level. When
    21  	// configured with a levelIter, this pointer changes as sstable boundaries
    22  	// are crossed. See levelIter.initRangeDel and the Range Deletions comment
    23  	// below.
    24  	rangeDelIter keyspan.FragmentIterator
    25  	// iterKey and iterValue cache the current key and value iter are pointed at.
    26  	iterKey   *InternalKey
    27  	iterValue []byte
    28  
    29  	// levelIterBoundaryContext's fields are set when using levelIter, in order
    30  	// to surface sstable boundary keys and file-level context. See levelIter
    31  	// comment and the Range Deletions comment below.
    32  	levelIterBoundaryContext
    33  
    34  	// tombstone caches the tombstone rangeDelIter is currently pointed at. If
    35  	// tombstone is nil, there are no further tombstones within the
    36  	// current sstable in the current iterator direction. The cached tombstone is
    37  	// only valid for the levels in the range [0,heap[0].index]. This avoids
    38  	// positioning tombstones at lower levels which cannot possibly shadow the
    39  	// current key.
    40  	tombstone *keyspan.Span
    41  }
    42  
    43  type levelIterBoundaryContext struct {
    44  	// smallestUserKey and largestUserKey are populated with the smallest and
    45  	// largest boundaries of the current file.
    46  	smallestUserKey, largestUserKey []byte
    47  	// isLargestUserKeyRangeDelSentinel is set to true when a file's largest
    48  	// boundary is an exclusive range deletion sentinel. If true, the file does
    49  	// not contain any keys with the provided user key, and the largestUserKey
    50  	// bound is exclusive.
    51  	isLargestUserKeyRangeDelSentinel bool
    52  	// isSyntheticIterBoundsKey is set to true iff the key returned by the level
    53  	// iterator is a synthetic key derived from the iterator bounds. This is
    54  	// used to prevent the mergingIter from being stuck at such a synthetic key
    55  	// if it becomes the top element of the heap.
    56  	isSyntheticIterBoundsKey bool
    57  	// isIgnorableBoundaryKey is set to true iff the key returned by the level
    58  	// iterator is a file boundary key that should be ignored. This is used to
    59  	// keep a levelIter file's range deletion iterator open as long as other
    60  	// levels within the merging iterator require it.
    61  	isIgnorableBoundaryKey bool
    62  }

// mergingIter provides a merged view of multiple iterators from different
// levels of the LSM.
//
// The core of a mergingIter is a heap of internalIterators (see
// mergingIterHeap). The heap can operate as either a min-heap, used during
// forward iteration (First, SeekGE, Next) or a max-heap, used during reverse
// iteration (Last, SeekLT, Prev). The heap is initialized in calls to First,
// Last, SeekGE, and SeekLT. A call to Next or Prev takes the current top
// element on the heap, advances its iterator, and then "fixes" the heap
// property. When one of the child iterators is exhausted during Next/Prev
// iteration, it is removed from the heap.
//
// # Range Deletions
//
// A mergingIter can optionally be configured with a slice of range deletion
// iterators. The range deletion iterator slice must exactly parallel the point
// iterators and the range deletion iterator must correspond to the same level
// in the LSM as the point iterator. Note that each memtable and each table in
// L0 is a different "level" from the mergingIter perspective. So level 0 below
// does not correspond to L0 in the LSM.
// A range deletion iterator iterates over fragmented range tombstones. Range
// tombstones are fragmented by splitting them at any overlapping points. This
// fragmentation guarantees that within an sstable tombstones will either be
// distinct or will have identical start and end user keys. While range
// tombstones are fragmented within an sstable, the start and end keys are not
// truncated to sstable boundaries. This is necessary because the tombstone end
// key is exclusive and does not have a sequence number. Consider an sstable
// containing the range tombstone [a,c)#9 and the key "b#8". The tombstone must
// delete "b#8", yet older versions of "b" might spill over to the next
// sstable. So the boundary key for this sstable must be "b#8". Making the end
// key of tombstones optionally inclusive, or giving it a sequence number,
// would be possible solutions, but both have potentially serious issues:
// tombstones have exclusive end keys since an inclusive deletion end can be
// converted to an exclusive one while the reverse transformation is not
// possible; and the semantics of a sequence number for the end key of a range
// tombstone are murky.
//
// The approach taken here instead performs an implicit truncation of the
// tombstone to the sstable boundaries.
//
// During initialization of a mergingIter, the range deletion iterators for
// batches, memtables, and L0 tables are populated up front. Note that batches
// and memtables index unfragmented tombstones. Batch.newRangeDelIter() and
// memTable.newRangeDelIter() fragment and cache the tombstones on demand. The
// L1-L6 range deletion iterators are populated by levelIter. When configured
// to load range deletion iterators, whenever a levelIter loads a table it
// loads both the point iterator and the range deletion
// iterator. levelIter.rangeDelIter is configured to point to the right entry
// in mergingIter.levels. The effect of this setup is that
// mergingIter.levels[i].rangeDelIter always contains the fragmented range
// tombstone for the current table in level i that the levelIter has open.
//
// Another crucial mechanism of levelIter is that it materializes fake point
// entries for the table boundaries if the boundary is a range deletion
// key. Consider a table that contains only a range tombstone [a-e)#10. The
// sstable boundaries for this table will be a#10,15 and
// e#72057594037927935,15. During forward iteration levelIter will return
// e#72057594037927935,15 as a key. During reverse iteration levelIter will
// return a#10,15 as a key. These sentinel keys act as bookends to point
// iteration and allow mergingIter to keep a table and its associated range
// tombstones loaded as long as there are keys at lower levels that are within
// the bounds of the table.
//
// The final piece to the range deletion puzzle is the LSM invariant that for a
// given key K newer versions of K can only exist earlier in the level, or at
// higher levels of the tree. For example, if K#4 exists in L3, K#5 can only
// exist earlier in L3 or in L0, L1, L2 or a memtable. Get very explicitly
// uses this invariant to find the value for a key by walking the LSM level by
// level. For range deletions, this invariant means that a range deletion at
// level N will necessarily shadow any keys within its bounds in level Y where
// Y > N. One wrinkle to this statement is that it only applies to keys that
// lie within the sstable bounds as well, but we get that guarantee due to the
// way the range deletion iterator and point iterator are bound together by a
// levelIter.
//
// Tying the above all together, we get a picture where each level (index in
// mergingIter.levels) is composed of both point operations (pX) and range
// deletions (rX). The range deletions for level X shadow both the point
// operations and range deletions for level Y where Y > X allowing mergingIter
// to skip processing entries in that shadow. For example, consider the
// scenario:
//
//	r0: a---e
//	r1:    d---h
//	r2:       g---k
//	r3:          j---n
//	r4:             m---q
//
// This is showing 5 levels of range deletions. Consider what happens upon
// SeekGE("b"). We first seek the point iterator for level 0 (the point values
// are not shown above) and we then seek the range deletion iterator. That
// returns the tombstone [a,e). This tombstone tells us that all keys in the
// range [a,e) in lower levels are deleted so we can skip them. So we can
// adjust the seek key to "e", the tombstone end key. For level 1 we seek to
// "e" and find the range tombstone [d,h) and similar logic holds. By the time
// we get to level 4 we're seeking to "n".
//
// One consequence of not truncating tombstone end keys to sstable boundaries
// is that the seeking process described above cannot always seek to the
// tombstone end key in the older level. For example, imagine in the above
// example r3 is a partitioned level (i.e., L1+ in our LSM), and the sstable
// containing [j, n) has "k" as its upper boundary. In this situation,
// compactions involving keys at or after "k" can output those keys to r4+,
// even if they're newer than our tombstone [j, n). So instead of seeking to
// "n" in r4 we can only seek to "k". To achieve this, the instance variable
// `largestUserKey` maintains the upper bounds of the current sstables in the
// partitioned levels. In this example, `levels[3].largestUserKey` holds "k",
// telling us to limit the seek triggered by a tombstone in r3 to "k".
//
// During actual iteration levels can contain both point operations and range
// deletions. Within a level, when a range deletion contains a point operation
// the sequence numbers must be checked to determine if the point operation is
// newer or older than the range deletion tombstone. The mergingIter maintains
// the invariant that the range deletion iterators for all levels newer than
// the current iteration key (L < m.heap.items[0].index) are positioned at the
// next (or previous during reverse iteration) range deletion tombstone. We
// know those levels don't contain a range deletion tombstone that covers the
// current key because if they did the current key would be deleted. The range
// deletion iterator for the current key's level is positioned at a range
// tombstone covering or past the current key. The position of all other
// range deletion iterators is unspecified. Whenever a key from those levels
// becomes the current key, their range deletion iterators need to be
// positioned. This lazy positioning avoids seeking the range deletion
// iterators for keys that are never considered. (A similar bit of lazy
// evaluation can be done for the point iterators, but is still TBD).
//
// For a full example, consider the following setup:
//
//	p0:               o
//	r0:             m---q
//
//	p1:              n p
//	r1:       g---k
//
//	p2:  b d    i
//	r2: a---e           q----v
//
//	p3:     e
//	r3:
//
// If we start iterating from the beginning, the first key we encounter is "b"
// in p2. When the mergingIter is pointing at a valid entry, the range deletion
// iterators for all of the levels < m.heap.items[0].index are positioned at
// the next range tombstone past the current key. So r0 will point at [m,q) and
// r1 at [g,k). When the key "b" is encountered, we check to see if the current
// tombstone for r0 or r1 contains it, and whether the tombstone for r2, [a,e),
// contains and is newer than "b".
//
// Advancing the iterator finds the next key at "d". This is in the same level
// as the previous key "b" so we don't have to reposition any of the range
// deletion iterators, but merely check whether "d" is now contained by any of
// the range tombstones at higher levels or has stepped past the range
// tombstone in its own level or higher levels. In this case, there is nothing
// to be done.
//
// Advancing the iterator again finds "e". Since "e" comes from p3, we have to
// position the r3 range deletion iterator, which is empty. "e" is past the r2
// tombstone of [a,e) so we need to advance the r2 range deletion iterator to
// [q,v).
//
// The next key is "i". Because this key is in p2, a level above "e", we don't
// have to reposition any range deletion iterators and instead see that "i" is
// covered by the range tombstone [g,k). The iterator is immediately advanced
// to "n" which is covered by the range tombstone [m,q) causing the iterator to
// advance to "o" which is visible.
//
// TODO(peter,rangedel): For testing, advance the iterator through various
// scenarios and have each step display the current state (i.e. the current
// heap and range-del iterator positioning).
type mergingIter struct {
	logger   Logger
	split    Split
	dir      int
	snapshot uint64
	levels   []mergingIterLevel
	heap     mergingIterHeap
	err      error
	prefix   []byte
	lower    []byte
	upper    []byte
	stats    *InternalIteratorStats

	combinedIterState *combinedIterState

	// Elide range tombstones from being returned during iteration. Set to true
	// when mergingIter is a child of Iterator and the mergingIter is processing
	// range tombstones.
	elideRangeTombstones bool
}
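
// The cascading seek described above can be sketched in isolation. This is an
// illustrative fragment, not part of the original source: tombstoneContaining
// is a hypothetical helper standing in for the tombstone lookup that seekGE
// (below) performs via keyspan.SeekGE.
//
//	seekKey := searchKey
//	for i := range levels {
//		levels[i].iterKey, levels[i].iterValue = levels[i].iter.SeekGE(seekKey, flags)
//		if t := tombstoneContaining(levels[i].rangeDelIter, seekKey); t != nil {
//			// Keys in [t.Start, t.End) are deleted in all lower levels, so
//			// the next level can be sought directly at the tombstone end.
//			seekKey = t.End
//		}
//	}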

// mergingIter implements the base.InternalIterator interface.
var _ base.InternalIterator = (*mergingIter)(nil)

// newMergingIter returns an iterator that merges its input. Walking the
// resultant iterator will return all key/value pairs of all input iterators
// in strictly increasing key order, as defined by cmp. It is permissible to
// pass a nil split parameter if the caller is never going to call
// SeekPrefixGE.
//
// The input's key ranges may overlap, but there are assumed to be no duplicate
// keys: if iters[i] contains a key k then iters[j] will not contain that key k.
//
// None of the iters may be nil.
func newMergingIter(
	logger Logger,
	stats *base.InternalIteratorStats,
	cmp Compare,
	split Split,
	iters ...internalIterator,
) *mergingIter {
	m := &mergingIter{}
	levels := make([]mergingIterLevel, len(iters))
	for i := range levels {
		levels[i].iter = iters[i]
	}
	m.init(&IterOptions{logger: logger}, stats, cmp, split, levels...)
	return m
}
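
// Example usage (an illustrative sketch, not part of the original source):
// merging two arbitrary internalIterators a and b, then walking the merged
// view in forward order. DefaultLogger and DefaultComparer.Compare are
// assumed to match how the inputs were built.
//
//	var stats base.InternalIteratorStats
//	m := newMergingIter(DefaultLogger, &stats, DefaultComparer.Compare, nil /* split */, a, b)
//	for key, val := m.First(); key != nil; key, val = m.Next() {
//		_ = val // key/value pairs arrive in strictly increasing internal-key order
//	}
//	_ = m.Close()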

func (m *mergingIter) init(
	opts *IterOptions,
	stats *base.InternalIteratorStats,
	cmp Compare,
	split Split,
	levels ...mergingIterLevel,
) {
	m.err = nil
	m.logger = opts.getLogger()
	if opts != nil {
		m.lower = opts.LowerBound
		m.upper = opts.UpperBound
	}
	m.snapshot = InternalKeySeqNumMax
	m.levels = levels
	m.heap.cmp = cmp
	m.split = split
	m.stats = stats
	if cap(m.heap.items) < len(levels) {
		m.heap.items = make([]mergingIterItem, 0, len(levels))
	} else {
		m.heap.items = m.heap.items[:0]
	}
}

func (m *mergingIter) initHeap() {
	m.heap.items = m.heap.items[:0]
	for i := range m.levels {
		if l := &m.levels[i]; l.iterKey != nil {
			m.heap.items = append(m.heap.items, mergingIterItem{
				index: i,
				key:   *l.iterKey,
				value: l.iterValue,
			})
		} else {
			m.err = firstError(m.err, l.iter.Error())
			if m.err != nil {
				return
			}
		}
	}
	m.heap.init()
}
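
// For reference, the heap orders items by internal-key comparison. A minimal
// sketch of the "less" relation (assuming mergingIterHeap compares items with
// base.InternalCompare, negated when reverse is set for the max-heap):
//
//	less := base.InternalCompare(m.heap.cmp, a.key, b.key) < 0
//
// i.e. user keys ascending and, for equal user keys, sequence numbers
// descending, so newer entries surface before older ones.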

func (m *mergingIter) initMinHeap() {
	m.dir = 1
	m.heap.reverse = false
	m.initHeap()
	m.initMinRangeDelIters(-1)
}

// The level of the previous top element was oldTopLevel. Note that all range delete
// iterators < oldTopLevel are positioned past the key of the previous top element and
// the range delete iterator at level oldTopLevel is positioned at or past the key of the
// previous top element. We need to position the range delete iterators from oldTopLevel + 1
// to the level of the current top element.
func (m *mergingIter) initMinRangeDelIters(oldTopLevel int) {
	if m.heap.len() == 0 {
		return
	}

	// Position the range-del iterators at levels <= m.heap.items[0].index.
	item := &m.heap.items[0]
	for level := oldTopLevel + 1; level <= item.index; level++ {
		l := &m.levels[level]
		if l.rangeDelIter == nil {
			continue
		}
		l.tombstone = keyspan.SeekGE(m.heap.cmp, l.rangeDelIter, item.key.UserKey)
	}
}

func (m *mergingIter) initMaxHeap() {
	m.dir = -1
	m.heap.reverse = true
	m.initHeap()
	m.initMaxRangeDelIters(-1)
}

// The level of the previous top element was oldTopLevel. Note that all range delete
// iterators < oldTopLevel are positioned before the key of the previous top element and
// the range delete iterator at level oldTopLevel is positioned at or before the key of the
// previous top element. We need to position the range delete iterators from oldTopLevel + 1
// to the level of the current top element.
func (m *mergingIter) initMaxRangeDelIters(oldTopLevel int) {
	if m.heap.len() == 0 {
		return
	}
	// Position the range-del iterators at levels <= m.heap.items[0].index.
	item := &m.heap.items[0]
	for level := oldTopLevel + 1; level <= item.index; level++ {
		l := &m.levels[level]
		if l.rangeDelIter == nil {
			continue
		}
		l.tombstone = keyspan.SeekLE(m.heap.cmp, l.rangeDelIter, item.key.UserKey)
	}
}

func (m *mergingIter) switchToMinHeap() {
	if m.heap.len() == 0 {
		if m.lower != nil {
			m.SeekGE(m.lower, base.SeekGEFlagsNone)
		} else {
			m.First()
		}
		return
	}

	// We're switching from using a max heap to a min heap. We need to advance
	// any iterator that is less than or equal to the current key. Consider the
	// scenario where we have 2 iterators being merged (user-key:seq-num):
	//
	// i1:     *a:2     b:2
	// i2: a:1      b:1
	//
	// The current key is a:2 and i2 is pointed at a:1. When we switch to forward
	// iteration, we want to return a key that is greater than a:2.

	key := m.heap.items[0].key
	cur := &m.levels[m.heap.items[0].index]

	for i := range m.levels {
		l := &m.levels[i]
		if l == cur {
			continue
		}

		// If the iterator is exhausted, it may be out of bounds if range
		// deletions modified our search key as we descended. We need to
		// reposition it within the search bounds. If the current key is a
		// range tombstone, the iterator might still be exhausted but at a
		// sstable boundary sentinel. It would be okay to reposition an
		// iterator like this only through successive Next calls, except that
		// it would violate the levelIter's invariants by causing it to return
		// a key before the lower bound.
		//
		//           bounds = [ f, _ )
		// L0:   [ b ]          [ f*                   z ]
		// L1: [ a           |----|        k        y ]
		// L2:    [  c  (d) ] [ e      g     m ]
		// L3:             [                    x ]
		//
		// * - current key   [] - table bounds () - heap item
		//
		// In the above diagram, the L2 iterator is positioned at a sstable
		// boundary (d) outside the lower bound (f). It arrived here from a
		// seek whose seek-key was modified by a range tombstone. If we called
		// Next on the L2 iterator, it would return e, violating its lower
		// bound. Instead, we seek it to >= f and Next from there.

		if l.iterKey == nil || (m.lower != nil && l.isSyntheticIterBoundsKey &&
			l.iterKey.IsExclusiveSentinel() &&
			m.heap.cmp(l.iterKey.UserKey, m.lower) <= 0) {
			if m.lower != nil {
				l.iterKey, l.iterValue = l.iter.SeekGE(m.lower, base.SeekGEFlagsNone)
			} else {
				l.iterKey, l.iterValue = l.iter.First()
			}
		}
		for ; l.iterKey != nil; l.iterKey, l.iterValue = l.iter.Next() {
			if base.InternalCompare(m.heap.cmp, key, *l.iterKey) < 0 {
				// key < iter-key
				break
			}
			// key >= iter-key
		}
	}

	// Special handling for the current iterator because we were using its key
	// above. The iterator cur.iter may still be exhausted at a sstable boundary
	// sentinel. Similar to the logic applied to the other levels, in these
	// cases we seek the iterator to the first key in order to avoid violating
	// levelIter's invariants. See the example in the for loop above.
	if m.lower != nil && cur.isSyntheticIterBoundsKey && cur.iterKey.IsExclusiveSentinel() &&
		m.heap.cmp(cur.iterKey.UserKey, m.lower) <= 0 {
		cur.iterKey, cur.iterValue = cur.iter.SeekGE(m.lower, base.SeekGEFlagsNone)
	} else {
		cur.iterKey, cur.iterValue = cur.iter.Next()
	}
	m.initMinHeap()
}
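
// From a caller's perspective the direction switch is implicit in Next/Prev.
// An illustrative sketch (not part of the original source):
//
//	key, _ := m.SeekGE([]byte("d"), base.SeekGEFlagsNone) // builds the min heap, dir=1
//	key, _ = m.Prev()                                     // triggers switchToMaxHeap
//	_ = key // the largest entry strictly less than the SeekGE result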

func (m *mergingIter) switchToMaxHeap() {
	if m.heap.len() == 0 {
		if m.upper != nil {
			m.SeekLT(m.upper, base.SeekLTFlagsNone)
		} else {
			m.Last()
		}
		return
	}

	// We're switching from using a min heap to a max heap. We need to backup any
	// iterator that is greater than or equal to the current key. Consider the
	// scenario where we have 2 iterators being merged (user-key:seq-num):
	//
	// i1: a:2     *b:2
	// i2:     a:1      b:1
	//
	// The current key is b:2 and i2 is pointing at b:1. When we switch to
	// reverse iteration, we want to return a key that is less than b:2.
	key := m.heap.items[0].key
	cur := &m.levels[m.heap.items[0].index]

	for i := range m.levels {
		l := &m.levels[i]
		if l == cur {
			continue
		}

		// If the iterator is exhausted, it may be out of bounds if range
		// deletions modified our search key as we descended. We need to
		// reposition it within the search bounds. If the current key is a
		// range tombstone, the iterator might still be exhausted but at a
		// sstable boundary sentinel. It would be okay to reposition an
		// iterator like this only through successive Prev calls, except that
		// it would violate the levelIter's invariants by causing it to return
		// a key beyond the upper bound.
		//
		//           bounds = [ _, g )
		// L0:   [ b ]          [ f*                   z ]
		// L1: [ a                |-------| k       y ]
		// L2:    [  c   d  ]        h [(i)    m ]
		// L3:             [  e                  x ]
		//
		// * - current key   [] - table bounds () - heap item
		//
		// In the above diagram, the L2 iterator is positioned at a sstable
		// boundary (i) outside the upper bound (g). It arrived here from a
		// seek whose seek-key was modified by a range tombstone. If we called
		// Prev on the L2 iterator, it would return h, violating its upper
		// bound. Instead, we seek it to < g, and Prev from there.

		if l.iterKey == nil || (m.upper != nil && l.isSyntheticIterBoundsKey &&
			l.iterKey.IsExclusiveSentinel() && m.heap.cmp(l.iterKey.UserKey, m.upper) >= 0) {
			if m.upper != nil {
				l.iterKey, l.iterValue = l.iter.SeekLT(m.upper, base.SeekLTFlagsNone)
			} else {
				l.iterKey, l.iterValue = l.iter.Last()
			}
		}
		for ; l.iterKey != nil; l.iterKey, l.iterValue = l.iter.Prev() {
			if base.InternalCompare(m.heap.cmp, key, *l.iterKey) > 0 {
				// key > iter-key
				break
			}
			// key <= iter-key
		}
	}

	// Special handling for the current iterator because we were using its key
	// above. The iterator cur.iter may still be exhausted at a sstable boundary
	// sentinel. Similar to the logic applied to the other levels, in these
	// cases we seek the iterator to a key < m.upper in order to avoid violating
	// levelIter's invariants by Prev-ing through files. See the example in the
	// for loop above.
	if m.upper != nil && cur.isSyntheticIterBoundsKey && cur.iterKey.IsExclusiveSentinel() &&
		m.heap.cmp(cur.iterKey.UserKey, m.upper) >= 0 {
		cur.iterKey, cur.iterValue = cur.iter.SeekLT(m.upper, base.SeekLTFlagsNone)
	} else {
		cur.iterKey, cur.iterValue = cur.iter.Prev()
	}
	m.initMaxHeap()
}

// Steps to the next entry. item is the current top item in the heap.
func (m *mergingIter) nextEntry(item *mergingIterItem) {
	l := &m.levels[item.index]
	oldTopLevel := item.index
	oldRangeDelIter := l.rangeDelIter
	if l.iterKey, l.iterValue = l.iter.Next(); l.iterKey != nil {
		item.key, item.value = *l.iterKey, l.iterValue
		if m.heap.len() > 1 {
			m.heap.fix(0)
		}
		if l.rangeDelIter != oldRangeDelIter {
			// The rangeDelIter changed which indicates that the l.iter moved to the
			// next sstable. We have to update the tombstone for oldTopLevel as well.
			oldTopLevel--
		}
	} else {
		m.err = l.iter.Error()
		if m.err == nil {
			m.heap.pop()
		}
	}

	// The cached tombstones are only valid for the levels
	// [0,oldTopLevel]. Update the cached tombstones for any levels in the range
	// [oldTopLevel+1,heap[0].index].
	m.initMinRangeDelIters(oldTopLevel)
}

// isNextEntryDeleted() starts from the current entry (as the next entry); if
// it is deleted, it moves the iterators forward as needed and returns true,
// else it returns false. item is the top item in the heap.
func (m *mergingIter) isNextEntryDeleted(item *mergingIterItem) bool {
	// Look for a range deletion tombstone containing item.key at higher
	// levels (level < item.index). If we find such a range tombstone we know
	// it deletes the key in the current level. Also look for a range
	// deletion at the current level (level == item.index). If we find such a
	// range deletion we need to check whether it is newer than the current
	// entry.
	for level := 0; level <= item.index; level++ {
		l := &m.levels[level]
		if l.rangeDelIter == nil || l.tombstone == nil {
			// If l.tombstone is nil, there are no further tombstones
			// in the current sstable in the current (forward) iteration
			// direction.
			continue
		}
		if m.heap.cmp(l.tombstone.End, item.key.UserKey) <= 0 {
			// The current key is at or past the tombstone end key.
			//
			// NB: for the case that this l.rangeDelIter is provided by a levelIter we know that
			// the levelIter must be positioned at a key >= item.key. So it is sufficient to seek the
			// current l.rangeDelIter (since any range del iterators that will be provided by the
			// levelIter in the future cannot contain item.key). Also, it is possible that we
			// will encounter parts of the range delete that should be ignored -- we handle that
			// below.
			l.tombstone = keyspan.SeekGE(m.heap.cmp, l.rangeDelIter, item.key.UserKey)
		}
		if l.tombstone == nil {
			continue
		}

		// Reasoning for correctness of untruncated tombstone handling when the untruncated
		// tombstone is at a higher level:
		// The iterator corresponding to this tombstone is still in the heap so it must be
		// positioned >= item.key. Which means the Largest key bound of the sstable containing this
		// tombstone is >= item.key. So the upper limit of this tombstone cannot be file-bounds-constrained
		// to < item.key. But it is possible that item.key < smallestUserKey, in which
		// case this tombstone should be ignored.
		//
		// Example 1:
		// sstable bounds [c#8, g#12] containing a tombstone [b, i)#7, and key is c#6. The
		// smallestUserKey is c, so we know the key is within the file bounds and the tombstone
		// [b, i) covers it.
		//
		// Example 2:
		// Same sstable bounds but key is b#10. The smallestUserKey is c, so the tombstone [b, i)
		// does not cover this key.
		//
		// For a tombstone at the same level as the key, the file bounds are trivially satisfied.
		if (l.smallestUserKey == nil || m.heap.cmp(l.smallestUserKey, item.key.UserKey) <= 0) &&
			l.tombstone.VisibleAt(m.snapshot) && l.tombstone.Contains(m.heap.cmp, item.key.UserKey) {
			if level < item.index {
				// We could also do m.seekGE(..., level + 1). The levels from
				// [level + 1, item.index) are already after item.key so seeking them may be
				// wasteful.

				// We can seek up to the min of largestUserKey and tombstone.End.
				//
				// Using example 1 above, we can seek to the smaller of g and i, which is g.
				//
				// Another example, where the sstable bounds are [c#8, i#InternalRangeDelSentinel],
				// and the tombstone is [b, i)#8. Seeking to i is correct since it is seeking up to
				// the exclusive bound of the tombstone. We do not need to look at
				// isLargestUserKeyRangeDelSentinel.
				//
				// Progress argument: Since this file is at a higher level than item.key we know
				// that the iterator in this file must be positioned within its bounds and at a key
				// X > item.key (otherwise it would be the min of the heap). It is not
				// possible for X.UserKey == item.key.UserKey, since it is incompatible with
				// X > item.key (a lower version cannot be in a higher sstable), so it must be that
				// X.UserKey > item.key.UserKey. Which means l.largestUserKey > item.key.UserKey.
				// We also know that l.tombstone.End > item.key.UserKey. So the min of these,
				// seekKey, computed below, is > item.key.UserKey, so the call to seekGE() will
				// make forward progress.
				seekKey := l.tombstone.End
				if l.largestUserKey != nil && m.heap.cmp(l.largestUserKey, seekKey) < 0 {
					seekKey = l.largestUserKey
				}
				// This seek is not directly due to a SeekGE call, so we don't
				// know enough about the underlying iterator positions, and so
				// we keep the try-seek-using-next optimization disabled.
				//
				// Additionally, we set the relative-seek flag. This is
				// important when iterating with lazy combined iteration. If
				// there's a range key between this level's current file and the
				// file the seek will land on, we need to detect it in order to
				// trigger construction of the combined iterator.
				m.seekGE(seekKey, item.index, base.SeekGEFlagsNone.EnableRelativeSeek())
				return true
			}
			if l.tombstone.CoversAt(m.snapshot, item.key.SeqNum()) {
				m.nextEntry(item)
				return true
			}
		}
	}
	return false
}
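
// The seek-key clamp used above can be read in isolation. A minimal sketch
// (illustrative only; clampSeekKey is a hypothetical helper, not part of this
// file):
//
//	// clampSeekKey returns the key to seek lower levels to when a visible
//	// tombstone t at a higher level covers the current key: the tombstone
//	// end, clamped to the file's largest user key when that is smaller.
//	func clampSeekKey(cmp Compare, t *keyspan.Span, largestUserKey []byte) []byte {
//		seekKey := t.End
//		if largestUserKey != nil && cmp(largestUserKey, seekKey) < 0 {
//			seekKey = largestUserKey
//		}
//		return seekKey
//	}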

// Starting from the current entry, finds the first (next) entry that can be returned.
func (m *mergingIter) findNextEntry() (*InternalKey, []byte) {
	var reseeked bool
	for m.heap.len() > 0 && m.err == nil {
		item := &m.heap.items[0]
		if m.levels[item.index].isSyntheticIterBoundsKey {
			break
		}
		// For prefix iteration, stop if we already seeked the iterator due to a
		// range tombstone and are now past the prefix. We could amortize the
		// cost of this comparison, by doing it only after we have iterated in
		// this for loop a few times. But unless we find a performance benefit
		// to that, we do the simple thing and compare each time. Note that
		// isNextEntryDeleted already did at least 4 key comparisons in order to
		// return true, and additionally at least one heap comparison to step to
		// the next entry.
		//
		// Note that we cannot move this comparison into the isNextEntryDeleted
		// branch. Once isNextEntryDeleted determines a key is deleted and seeks
		// the level's iterator, item.key's memory is potentially invalid. If
		// the iterator is now exhausted, item.key may be garbage.
		if m.prefix != nil && reseeked {
			if n := m.split(item.key.UserKey); !bytes.Equal(m.prefix, item.key.UserKey[:n]) {
				return nil, nil
			}
		}

		m.addItemStats(item)
		if m.isNextEntryDeleted(item) {
			m.stats.PointsCoveredByRangeTombstones++
			reseeked = true
			continue
		}
		if item.key.Visible1(m.snapshot) &&
			(!m.levels[item.index].isIgnorableBoundaryKey) &&
			(item.key.Kind() != InternalKeyKindRangeDelete || !m.elideRangeTombstones) {
			return &item.key, item.value
		}
		m.nextEntry(item)
	}
	return nil, nil
}
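
// The prefix check above relies on Split to extract the prefix of the heap
// root's user key. A standalone sketch (illustrative only; samePrefix is a
// hypothetical helper with the same semantics as the inline check):
//
//	func samePrefix(split Split, prefix, userKey []byte) bool {
//		n := split(userKey)
//		return bytes.Equal(prefix, userKey[:n])
//	}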

// Steps to the prev entry. item is the current top item in the heap.
func (m *mergingIter) prevEntry(item *mergingIterItem) {
	l := &m.levels[item.index]
	oldTopLevel := item.index
	oldRangeDelIter := l.rangeDelIter
	if l.iterKey, l.iterValue = l.iter.Prev(); l.iterKey != nil {
		item.key, item.value = *l.iterKey, l.iterValue
		if m.heap.len() > 1 {
			m.heap.fix(0)
		}
		if l.rangeDelIter != oldRangeDelIter && l.rangeDelIter != nil {
			// The rangeDelIter changed which indicates that the l.iter moved to the
			// previous sstable. We have to update the tombstone for oldTopLevel as
			// well.
			oldTopLevel--
		}
	} else {
		m.err = l.iter.Error()
		if m.err == nil {
			m.heap.pop()
		}
	}

	// The cached tombstones are only valid for the levels
	// [0,oldTopLevel]. Update the cached tombstones for any levels in the range
	// [oldTopLevel+1,heap[0].index].
	m.initMaxRangeDelIters(oldTopLevel)
}

// isPrevEntryDeleted() starts from the current entry (as the prev entry); if
// it is deleted, it moves the iterators backward as needed and returns true,
// else it returns false. item is the top item in the heap.
func (m *mergingIter) isPrevEntryDeleted(item *mergingIterItem) bool {
	// Look for a range deletion tombstone containing item.key at higher
	// levels (level < item.index). If we find such a range tombstone we know
	// it deletes the key in the current level. Also look for a range
	// deletion at the current level (level == item.index). If we find such a
	// range deletion we need to check whether it is newer than the current
	// entry.
	for level := 0; level <= item.index; level++ {
		l := &m.levels[level]
		if l.rangeDelIter == nil || l.tombstone == nil {
			// If l.tombstone is nil, there are no further tombstones
			// in the current sstable in the current (reverse) iteration
			// direction.
			continue
		}
		if m.heap.cmp(item.key.UserKey, l.tombstone.Start) < 0 {
			// The current key is before the tombstone start key.
			//
			// NB: for the case that this l.rangeDelIter is provided by a levelIter we know that
			// the levelIter must be positioned at a key < item.key. So it is sufficient to seek the
			// current l.rangeDelIter (since any range del iterators that will be provided by the
			// levelIter in the future cannot contain item.key). Also, it is possible that we
			// will encounter parts of the range delete that should be ignored -- we handle that
			// below.
			l.tombstone = keyspan.SeekLE(m.heap.cmp, l.rangeDelIter, item.key.UserKey)
		}
		if l.tombstone == nil {
			continue
		}

		// Reasoning for correctness of untruncated tombstone handling when the untruncated
		// tombstone is at a higher level:
		//
		// The iterator corresponding to this tombstone is still in the heap so it must be
		// positioned <= item.key. Which means the Smallest key bound of the sstable containing this
		// tombstone is <= item.key. So the lower limit of this tombstone cannot have been
		// file-bounds-constrained to > item.key. But it is possible that item.key >= Largest
		// key bound of this sstable, in which case this tombstone should be ignored.
		//
		// Example 1:
		// sstable bounds [c#8, g#12] containing a tombstone [b, i)#7, and key is f#6. The
		// largestUserKey is g, so we know the key is within the file bounds and the tombstone
		// [b, i) covers it.
		//
		// Example 2:
		// Same sstable but the key is g#6. This cannot happen since the [b, i)#7 untruncated
		// tombstone was involved in a compaction which must have had a file to the right of this
		// sstable that is part of the same atomic compaction group for future compactions. That
		// file must have bounds that cover g#6 and this levelIter must be at that file.
		//
		// Example 3:
		// sstable bounds [c#8, g#RangeDelSentinel] containing [b, i)#7 and the key is g#10.
		// This key is not deleted by this tombstone. We need to look at
		// isLargestUserKeyRangeDelSentinel.
		//
		// For a tombstone at the same level as the key, the file bounds are trivially satisfied.

		// Default to within bounds.
		withinLargestSSTableBound := true
		if l.largestUserKey != nil {
			cmpResult := m.heap.cmp(l.largestUserKey, item.key.UserKey)
			withinLargestSSTableBound = cmpResult > 0 || (cmpResult == 0 && !l.isLargestUserKeyRangeDelSentinel)
		}
		if withinLargestSSTableBound && l.tombstone.Contains(m.heap.cmp, item.key.UserKey) && l.tombstone.VisibleAt(m.snapshot) {
			if level < item.index {
				// We could also do m.seekLT(..., level + 1). The levels from
				// [level + 1, item.index) are already before item.key so seeking them may be
				// wasteful.

				// We can seek up to the max of smallestUserKey and tombstone.Start.
				//
				// Using example 1 above, we can seek to the larger of c and b, which is c.
				//
				// Progress argument: We know that the iterator in this file is positioned within
				// its bounds and at a key X < item.key (otherwise it would be the max of the heap).
				// So smallestUserKey <= item.key.UserKey and we already know that
				// l.tombstone.Start <= item.key.UserKey. So the seekKey computed below
				// is <= item.key.UserKey, and since we do a seekLT() we will make backwards
				// progress.
				seekKey := l.tombstone.Start
				if l.smallestUserKey != nil && m.heap.cmp(l.smallestUserKey, seekKey) > 0 {
					seekKey = l.smallestUserKey
				}
				// We set the relative-seek flag. This is important when
				// iterating with lazy combined iteration. If there's a range
				// key between this level's current file and the file the seek
				// will land on, we need to detect it in order to trigger
				// construction of the combined iterator.
				m.seekLT(seekKey, item.index, base.SeekLTFlagsNone.EnableRelativeSeek())
				return true
			}
			if l.tombstone.CoversAt(m.snapshot, item.key.SeqNum()) {
				m.prevEntry(item)
				return true
			}
		}
	}
	return false
}
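
// Symmetrically to the forward case, the reverse seek key is the tombstone
// start, clamped upward to the file's smallest user key. A minimal sketch
// (illustrative only; clampSeekKeyRev is a hypothetical helper):
//
//	func clampSeekKeyRev(cmp Compare, t *keyspan.Span, smallestUserKey []byte) []byte {
//		seekKey := t.Start
//		if smallestUserKey != nil && cmp(smallestUserKey, seekKey) > 0 {
//			seekKey = smallestUserKey
//		}
//		return seekKey
//	}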

// Starting from the current entry, finds the first (prev) entry that can be returned.
func (m *mergingIter) findPrevEntry() (*InternalKey, []byte) {
	for m.heap.len() > 0 && m.err == nil {
		item := &m.heap.items[0]
		if m.levels[item.index].isSyntheticIterBoundsKey {
			break
		}
		m.addItemStats(item)
		if m.isPrevEntryDeleted(item) {
			m.stats.PointsCoveredByRangeTombstones++
			continue
		}
		if item.key.Visible1(m.snapshot) &&
			(!m.levels[item.index].isIgnorableBoundaryKey) &&
			(item.key.Kind() != InternalKeyKindRangeDelete || !m.elideRangeTombstones) {
			return &item.key, item.value
		}
		m.prevEntry(item)
	}
	return nil, nil
}

// Seeks levels >= level to >= key. Additionally uses range tombstones to extend the seeks.
func (m *mergingIter) seekGE(key []byte, level int, flags base.SeekGEFlags) {
	// When seeking, we can use tombstones to adjust the key we seek to on each
	// level. Consider the series of range tombstones:
	//
	//   1: a---e
	//   2:    d---h
	//   3:       g---k
	//   4:          j---n
	//   5:             m---q
	//
	// If we SeekGE("b") we also find the tombstone "b" resides within in the
	// first level which is [a,e). Regardless of whether this tombstone deletes
	// "b" in that level, we know it deletes "b" in all lower levels, so we
	// adjust the search key in the next level to the tombstone end key "e". We
	// then SeekGE("e") in the second level and find the corresponding tombstone
	// [d,h). This process continues and we end up seeking for "h" in the 3rd
	// level, "k" in the 4th level and "n" in the last level.
	//
	// TODO(peter,rangedel): In addition to the above we can delay seeking a
	// level (and any lower levels) when the current iterator position is
	// contained within a range tombstone at a higher level.

	for ; level < len(m.levels); level++ {
		if invariants.Enabled && m.lower != nil && m.heap.cmp(key, m.lower) < 0 {
			m.logger.Fatalf("mergingIter: lower bound violation: %s < %s\n%s", key, m.lower, debug.Stack())
		}

		l := &m.levels[level]
		if m.prefix != nil {
			l.iterKey, l.iterValue = l.iter.SeekPrefixGE(m.prefix, key, flags)
		} else {
			l.iterKey, l.iterValue = l.iter.SeekGE(key, flags)
		}

		// If this level contains overlapping range tombstones, alter the seek
		// key accordingly. Caveat: If we're performing lazy-combined iteration,
		// we cannot alter the seek key: Range tombstones don't delete range
		// keys, and there might exist live range keys within the range
		// tombstone's span that need to be observed to trigger a switch to
		// combined iteration.
		if rangeDelIter := l.rangeDelIter; rangeDelIter != nil &&
			(m.combinedIterState == nil || m.combinedIterState.initialized) {
			// The level has a range-del iterator. Find the tombstone containing
			// the search key.
			//
			// For untruncated tombstones that are possibly file-bounds-constrained, we are using a
			// levelIter which will set smallestUserKey and largestUserKey. Since the levelIter
			// is at this file we know that largestUserKey >= key, so we know that the
			// tombstone we find cannot be file-bounds-constrained in its upper bound to something < key.
			// We do need to compare with smallestUserKey to ensure that the tombstone is not
			// file-bounds-constrained in its lower bound.
			//
			// See the detailed comments in isNextEntryDeleted() on why similar containment and
			// seeking logic is correct. The subtle difference here is that key is a user key,
			// so we can have a sstable with bounds [c#8, i#InternalRangeDelSentinel], and the
			// tombstone is [b, k)#8 and the seek key is i: levelIter.SeekGE(i) will move past
			// this sstable since it realizes the largest key is a InternalRangeDelSentinel.
			l.tombstone = keyspan.SeekGE(m.heap.cmp, rangeDelIter, key)
			if l.tombstone != nil && l.tombstone.VisibleAt(m.snapshot) && l.tombstone.Contains(m.heap.cmp, key) &&
				(l.smallestUserKey == nil || m.heap.cmp(l.smallestUserKey, key) <= 0) {
				// NB: Based on the comment above l.largestUserKey >= key, and based on the
				// containment condition tombstone.End > key, so the assignment to key results
				// in a monotonically non-decreasing key across iterations of this loop.
				//
				// The adjustment of key here can only move it to a larger key. Since
				// the caller of seekGE guaranteed that the original key was greater
				// than or equal to m.lower, the new key will continue to be greater
				// than or equal to m.lower.
				if l.largestUserKey != nil &&
					m.heap.cmp(l.largestUserKey, l.tombstone.End) < 0 {
					// Truncate the tombstone for seeking purposes. Note that this can over-truncate
					// but that is harmless for this seek optimization.
					key = l.largestUserKey
				} else {
					key = l.tombstone.End
				}
			}
		}
	}

	m.initMinHeap()
}
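
// Tracing the example in the comment above concretely: SeekGE("b") with
// tombstones [a,e), [d,h), [g,k), [j,n), [m,q) on levels 1-5 adjusts the
// per-level seek key as follows (illustrative; assumes no file-bound
// truncation via largestUserKey):
//
//	level 1: SeekGE("b"), tombstone [a,e) contains "b" -> key = "e"
//	level 2: SeekGE("e"), tombstone [d,h) contains "e" -> key = "h"
//	level 3: SeekGE("h"), tombstone [g,k) contains "h" -> key = "k"
//	level 4: SeekGE("k"), tombstone [j,n) contains "k" -> key = "n"
//	level 5: SeekGE("n")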

func (m *mergingIter) String() string {
	return "merging"
}

// SeekGE implements base.InternalIterator.SeekGE. Note that SeekGE only checks
// the upper bound. It is up to the caller to ensure that key is greater than
// or equal to the lower bound.
func (m *mergingIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, []byte) {
	m.err = nil
	m.prefix = nil
	m.seekGE(key, 0 /* start level */, flags)
	return m.findNextEntry()
}

// SeekPrefixGE implements base.InternalIterator.SeekPrefixGE. Note that
// SeekPrefixGE only checks the upper bound. It is up to the caller to ensure
// that key is greater than or equal to the lower bound.
func (m *mergingIter) SeekPrefixGE(
	prefix, key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, []byte) {
	m.err = nil
	m.prefix = prefix
	m.seekGE(key, 0 /* start level */, flags)
	return m.findNextEntry()
}

// Seeks levels >= level to < key. Additionally uses range tombstones to extend the seeks.
func (m *mergingIter) seekLT(key []byte, level int, flags base.SeekLTFlags) {
	// See the comment in seekGE regarding using tombstones to adjust the seek
	// target per level.
	m.prefix = nil
	for ; level < len(m.levels); level++ {
		if invariants.Enabled && m.upper != nil && m.heap.cmp(key, m.upper) > 0 {
			m.logger.Fatalf("mergingIter: upper bound violation: %s > %s\n%s", key, m.upper, debug.Stack())
		}

		l := &m.levels[level]
		l.iterKey, l.iterValue = l.iter.SeekLT(key, flags)

		// If this level contains overlapping range tombstones, alter the seek
		// key accordingly. Caveat: If we're performing lazy-combined iteration,
		// we cannot alter the seek key: Range tombstones don't delete range
		// keys, and there might exist live range keys within the range
		// tombstone's span that need to be observed to trigger a switch to
		// combined iteration.
		if rangeDelIter := l.rangeDelIter; rangeDelIter != nil &&
			(m.combinedIterState == nil || m.combinedIterState.initialized) {
			// The level has a range-del iterator. Find the tombstone containing
			// the search key.
			//
			// For untruncated tombstones that are possibly file-bounds-constrained we are using a
			// levelIter which will set smallestUserKey and largestUserKey. Since the levelIter
			// is at this file we know that smallestUserKey <= key, so we know that the
			// tombstone we find cannot be file-bounds-constrained in its lower bound to something > key.
			// We do need to compare with largestUserKey to ensure that the tombstone is not
			// file-bounds-constrained in its upper bound.
			//
			// See the detailed comments in isPrevEntryDeleted() on why similar containment and
			// seeking logic is correct.

			// Default to within bounds.
			withinLargestSSTableBound := true
			if l.largestUserKey != nil {
				cmpResult := m.heap.cmp(l.largestUserKey, key)
				withinLargestSSTableBound = cmpResult > 0 || (cmpResult == 0 && !l.isLargestUserKeyRangeDelSentinel)
			}

			l.tombstone = keyspan.SeekLE(m.heap.cmp, rangeDelIter, key)
			if l.tombstone != nil && l.tombstone.VisibleAt(m.snapshot) &&
				l.tombstone.Contains(m.heap.cmp, key) && withinLargestSSTableBound {
				// NB: Based on the comment above l.smallestUserKey <= key, and based
				// on the containment condition tombstone.Start <= key, so the
				// assignment to key results in a monotonically non-increasing key
				// across iterations of this loop.
				//
				// The adjustment of key here can only move it to a smaller key. Since
				// the caller of seekLT guaranteed that the original key was less than
				// or equal to m.upper, the new key will continue to be less than or
				// equal to m.upper.
				if l.smallestUserKey != nil &&
					m.heap.cmp(l.smallestUserKey, l.tombstone.Start) >= 0 {
					// Truncate the tombstone for seeking purposes. Note that this can over-truncate
					// but that is harmless for this seek optimization.
					key = l.smallestUserKey
				} else {
					key = l.tombstone.Start
				}
			}
		}
	}

	m.initMaxHeap()
}

// SeekLT implements base.InternalIterator.SeekLT. Note that SeekLT only checks
// the lower bound. It is up to the caller to ensure that key is less than the
// upper bound.
func (m *mergingIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, []byte) {
	m.err = nil
	m.prefix = nil
	m.seekLT(key, 0 /* start level */, flags)
	return m.findPrevEntry()
}

// First implements base.InternalIterator.First. Note that First only checks
// the upper bound. It is up to the caller to ensure that key is greater than
// or equal to the lower bound (e.g. via a call to SeekGE(lower)).
func (m *mergingIter) First() (*InternalKey, []byte) {
	m.err = nil
	m.prefix = nil
	m.heap.items = m.heap.items[:0]
	for i := range m.levels {
		l := &m.levels[i]
		l.iterKey, l.iterValue = l.iter.First()
	}
	m.initMinHeap()
	return m.findNextEntry()
}

// Last implements base.InternalIterator.Last. Note that Last only checks the
// lower bound. It is up to the caller to ensure that key is less than the
// upper bound (e.g. via a call to SeekLT(upper)).
func (m *mergingIter) Last() (*InternalKey, []byte) {
	m.err = nil
	m.prefix = nil
	for i := range m.levels {
		l := &m.levels[i]
		l.iterKey, l.iterValue = l.iter.Last()
	}
	m.initMaxHeap()
	return m.findPrevEntry()
}

func (m *mergingIter) Next() (*InternalKey, []byte) {
	if m.err != nil {
		return nil, nil
	}

	if m.dir != 1 {
		m.switchToMinHeap()
		return m.findNextEntry()
	}

	if m.heap.len() == 0 {
		return nil, nil
	}

	m.nextEntry(&m.heap.items[0])
	return m.findNextEntry()
}

func (m *mergingIter) Prev() (*InternalKey, []byte) {
	if m.err != nil {
		return nil, nil
	}

	if m.dir != -1 {
		if m.prefix != nil {
			m.err = errors.New("bitalostable: unsupported reverse prefix iteration")
			return nil, nil
		}
		m.switchToMaxHeap()
		return m.findPrevEntry()
	}

	if m.heap.len() == 0 {
		return nil, nil
	}

	m.prevEntry(&m.heap.items[0])
	return m.findPrevEntry()
}

func (m *mergingIter) Error() error {
	if m.heap.len() == 0 || m.err != nil {
		return m.err
	}
	return m.levels[m.heap.items[0].index].iter.Error()
}

func (m *mergingIter) Close() error {
	for i := range m.levels {
		iter := m.levels[i].iter
		if err := iter.Close(); err != nil && m.err == nil {
			m.err = err
		}
		if rangeDelIter := m.levels[i].rangeDelIter; rangeDelIter != nil {
			if err := rangeDelIter.Close(); err != nil && m.err == nil {
				m.err = err
			}
		}
	}
	m.levels = nil
	m.heap.items = m.heap.items[:0]
	return m.err
}

func (m *mergingIter) SetBounds(lower, upper []byte) {
	m.prefix = nil
	m.lower = lower
	m.upper = upper
	for i := range m.levels {
		m.levels[i].iter.SetBounds(lower, upper)
	}
	m.heap.clear()
}
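
// SetBounds clears the heap, so the iterator must be repositioned before it
// is used again. An illustrative sketch (not part of the original source):
//
//	m.SetBounds([]byte("c"), []byte("p"))
//	for key, _ := m.First(); key != nil; key, _ = m.Next() {
//		// only keys in [c, p) are surfaced by the child iterators
//	}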

func (m *mergingIter) DebugString() string {
	var buf bytes.Buffer
	sep := ""
	for m.heap.len() > 0 {
		item := m.heap.pop()
		fmt.Fprintf(&buf, "%s%s", sep, item.key)
		sep = " "
	}
	if m.dir == 1 {
		m.initMinHeap()
	} else {
		m.initMaxHeap()
	}
	return buf.String()
}

func (m *mergingIter) ForEachLevelIter(fn func(li *levelIter) bool) {
	for _, iter := range m.levels {
		if li, ok := iter.iter.(*levelIter); ok {
			if done := fn(li); done {
				break
			}
		}
	}
}

func (m *mergingIter) addItemStats(item *mergingIterItem) {
	m.stats.PointCount++
	m.stats.KeyBytes += uint64(len(item.key.UserKey))
	m.stats.ValueBytes += uint64(len(item.value))
}

var _ internalIterator = &mergingIter{}