github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/compaction_iter.go

     1  // Copyright 2018 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package bitalostable
     6  
     7  import (
     8  	"fmt"
     9  	"io"
    10  	"sort"
    11  	"strconv"
    12  
    13  	"github.com/cockroachdb/errors"
    14  	"github.com/zuoyebang/bitalostable/internal/base"
    15  	"github.com/zuoyebang/bitalostable/internal/bytealloc"
    16  	"github.com/zuoyebang/bitalostable/internal/invariants"
    17  	"github.com/zuoyebang/bitalostable/internal/keyspan"
    18  	"github.com/zuoyebang/bitalostable/internal/rangekey"
    19  )
    20  
    21  // compactionIter provides a forward-only iterator that encapsulates the logic
    22  // for collapsing entries during compaction. It wraps an internal iterator and
    23  // collapses entries that are no longer necessary because they are shadowed by
    24  // newer entries. The simplest example of this is when the internal iterator
    25  // contains two keys: a.PUT.2 and a.PUT.1. Instead of returning both entries,
    26  // compactionIter collapses the second entry because it is no longer
    27  // necessary. The high-level structure for compactionIter is to iterate over
    28  // its internal iterator and output 1 entry for every user-key. There are four
    29  // complications to this story.
    30  //
    31  // 1. Eliding Deletion Tombstones
    32  //
    33  // Consider the entries a.DEL.2 and a.PUT.1. These entries collapse to
    34  // a.DEL.2. Do we have to output the entry a.DEL.2? Only if a.DEL.2 possibly
    35  // shadows an entry at a lower level. If we're compacting to the base-level in
    36  // the LSM tree then a.DEL.2 is definitely not shadowing an entry at a lower
    37  // level and can be elided.
    38  //
    39  // We can do slightly better than only eliding deletion tombstones at the base
    40  // level by observing that we can elide a deletion tombstone if there are no
    41  // sstables that contain the entry's key. This check is performed by
    42  // elideTombstone.
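//
// For illustration only, the combined check reduces to something like the
// sketch below, where keyExistsBelow is a hypothetical stand-in for the
// information the elideTombstone callback encapsulates (whether any sstable
// below the compaction could still contain the key):
//
//	// canElideTombstone reports whether a point tombstone may be dropped:
//	// only in the bottommost snapshot stripe, and only if no lower sstable
//	// could contain the key.
//	func canElideTombstone(curSnapshotIdx int, key []byte, keyExistsBelow func([]byte) bool) bool {
//		return curSnapshotIdx == 0 && !keyExistsBelow(key)
//	}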
    43  //
    44  // 2. Merges
    45  //
    46  // The MERGE operation merges the value for an entry with the existing value
    47  // for that entry. The logical value of an entry can be composed of a series of
    48  // merge operations. When compactionIter sees a MERGE, it scans forward in its
    49  // internal iterator collapsing MERGE operations for the same key until it
    50  // encounters a SET or DELETE operation. For example, the keys a.MERGE.4,
    51  // a.MERGE.3, a.MERGE.2 will be collapsed to a.MERGE.4 and the values will be
    52  // merged using the specified Merger.
    53  //
    54  // An interesting case here occurs when MERGE is combined with SET. Consider
    55  // the entries a.MERGE.3 and a.SET.2. The collapsed key will be a.SET.3. The
    56  // reason that the kind is changed to SET is because the SET operation acts as
    57  // a barrier preventing further merging. This can be seen better in the
    58  // scenario a.MERGE.3, a.SET.2, a.MERGE.1. The entry a.MERGE.1 may be at lower
    59  // (older) level and not involved in the compaction. If the compaction of
    60  // a.MERGE.3 and a.SET.2 produced a.MERGE.3, a subsequent compaction with
    61  // a.MERGE.1 would merge the values together incorrectly.
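//
// As a standalone illustration (not the production code path, which runs
// through mergeNext and finishValueMerger below), the collapsing rule for a
// single user key can be sketched as follows, using a hypothetical merger
// that simply joins operand strings:
//
//	package main
//
//	import (
//		"fmt"
//		"strings"
//	)
//
//	// collapseMerge folds a newest-to-oldest run of entries for one user
//	// key: MERGE operands accumulate until a SET or DEL is reached, and
//	// the result kind becomes SET whenever a SET or DEL was consumed, so
//	// that nothing older can be merged into it by a later compaction.
//	func collapseMerge(kinds, values []string) (string, string) {
//		var operands []string
//		for j, k := range kinds {
//			switch k {
//			case "MERGE":
//				operands = append(operands, values[j])
//			case "SET":
//				operands = append(operands, values[j])
//				return "SET", strings.Join(operands, "+")
//			case "DEL":
//				return "SET", strings.Join(operands, "+")
//			}
//		}
//		return "MERGE", strings.Join(operands, "+")
//	}
//
//	func main() {
//		// a.MERGE.3, a.SET.2 collapse to a.SET.3: the SET acts as a barrier.
//		fmt.Println(collapseMerge([]string{"MERGE", "SET"}, []string{"m3", "s2"}))
//		// a.MERGE.4, a.MERGE.3, a.MERGE.2 collapse to a.MERGE.4.
//		fmt.Println(collapseMerge([]string{"MERGE", "MERGE", "MERGE"}, []string{"m4", "m3", "m2"}))
//	}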
    62  //
    63  // 3. Snapshots
    64  //
    65  // Snapshots are lightweight point-in-time views of the DB state. At its core,
    66  // a snapshot is a sequence number along with a guarantee from Pebble that it
    67  // will maintain the view of the database at that sequence number. Part of this
    68  // guarantee is relatively straightforward to achieve. When reading from the
    69  // database Pebble will ignore sequence numbers that are larger than the
    70  // snapshot sequence number. The primary complexity with snapshots occurs
    71  // during compaction: the collapsing of entries that are shadowed by newer
    72  // entries is at odds with the guarantee that Pebble will maintain the view of
    73  // the database at the snapshot sequence number. Rather than collapsing entries
    74  // up to the next user key, compactionIter can only collapse entries up to the
    75  // next snapshot boundary. That is, every snapshot boundary potentially causes
    76  // another entry for the same user-key to be emitted. Another way to view this
    77  // is that snapshots define stripes and entries are collapsed within stripes,
    78  // but not across stripes. Consider the following scenario:
    79  //
    80  //	a.PUT.9
    81  //	a.DEL.8
    82  //	a.PUT.7
    83  //	a.DEL.6
    84  //	a.PUT.5
    85  //
    86  // In the absence of snapshots these entries would be collapsed to
    87  // a.PUT.9. What if there is a snapshot at sequence number 7? The entries can
    88  // be divided into two stripes and collapsed within the stripes:
    89  //
    90  //	a.PUT.9        a.PUT.9
    91  //	a.DEL.8  --->
    92  //	a.PUT.7
    93  //	--             --
    94  //	a.DEL.6  --->  a.DEL.6
    95  //	a.PUT.5
    96  //
    97  // All of the rules described earlier still apply, but they are confined to
    98  // operate within a snapshot stripe. Snapshots only affect compaction when the
    99  // snapshot sequence number lies within the range of sequence numbers being
   100  // compacted. In the above example, a snapshot at sequence number 10 or at
   101  // sequence number 5 would not have any effect.
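//
// To make the stripe assignment concrete, the following sketch (mirroring
// the snapshotIndex helper defined later in this file) maps each of the
// example sequence numbers to the index of the oldest snapshot that can
// still see it:
//
//	package main
//
//	import (
//		"fmt"
//		"sort"
//	)
//
//	// snapshotIndex returns the index of the first snapshot sequence
//	// number strictly greater than seq, i.e. the stripe the key falls in.
//	func snapshotIndex(seq uint64, snapshots []uint64) int {
//		return sort.Search(len(snapshots), func(i int) bool {
//			return snapshots[i] > seq
//		})
//	}
//
//	func main() {
//		snapshots := []uint64{7} // a snapshot at sequence number 7
//		for _, seq := range []uint64{9, 8, 7, 6, 5} {
//			fmt.Printf("a.#%d -> stripe %d\n", seq, snapshotIndex(seq, snapshots))
//		}
//		// Prints stripe 1 for 9, 8 and 7, and stripe 0 for 6 and 5,
//		// matching the division drawn above: a key at exactly the snapshot
//		// seqnum is not visible to that snapshot, so it may collapse with
//		// newer keys.
//	}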
   102  //
   103  // 4. Range Deletions
   104  //
   105  // Range deletions provide the ability to delete all of the keys (and values)
   106  // in a contiguous range. Range deletions are stored indexed by their start
   107  // key. The end key of the range is stored in the value. In order to support
   108  // lookup of the range deletions which overlap with a particular key, the range
   109  // deletion tombstones need to be fragmented whenever they overlap. This
   110  // fragmentation is performed by keyspan.Fragmenter. The fragments are then
   111  // subject to the rules for snapshots. For example, consider the two range
   112  // tombstones [a,e)#1 and [c,g)#2:
   113  //
   114  //	2:     c-------g
   115  //	1: a-------e
   116  //
   117  // These tombstones will be fragmented into:
   118  //
   119  //	2:     c---e---g
   120  //	1: a---c---e
   121  //
   122  // Do we output the fragment [c,e)#1? Since it is covered by [c,e)#2 the answer
   123  // depends on whether it is in a new snapshot stripe.
   124  //
   125  // In addition to the fragmentation of range tombstones, compaction also needs
   126  // to take the range tombstones into consideration when outputting normal
   127  // keys. Just as with point deletions, a range deletion covering an entry can
   128  // cause the entry to be elided.
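//
// The fragmentation step itself is mechanical: each tombstone is split at
// every other tombstone's boundary that falls inside it, so that overlapping
// tombstones end up sharing identical fragment boundaries. A self-contained
// sketch (independent of keyspan.Fragmenter) applied to the example above:
//
//	package main
//
//	import (
//		"fmt"
//		"sort"
//	)
//
//	type span struct {
//		start, end string
//		seq        int
//	}
//
//	// fragment splits each span at every boundary that falls strictly
//	// inside it.
//	func fragment(spans []span) []span {
//		var bounds []string
//		for _, s := range spans {
//			bounds = append(bounds, s.start, s.end)
//		}
//		sort.Strings(bounds)
//		var out []span
//		for _, s := range spans {
//			cur := s.start
//			for _, b := range bounds {
//				if b > cur && b < s.end {
//					out = append(out, span{cur, b, s.seq})
//					cur = b
//				}
//			}
//			out = append(out, span{cur, s.end, s.seq})
//		}
//		return out
//	}
//
//	func main() {
//		for _, f := range fragment([]span{{"c", "g", 2}, {"a", "e", 1}}) {
//			fmt.Printf("[%s,%s)#%d ", f.start, f.end, f.seq)
//		}
//		// Output: [c,e)#2 [e,g)#2 [a,c)#1 [c,e)#1
//	}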
   129  //
   130  // A note on the stability of keys and values.
   131  //
   132  // The stability guarantees of keys and values returned by the iterator tree
   133  // that backs a compactionIter are nuanced and care must be taken when
   134  // referencing any returned items.
   135  //
   136  // Keys and values returned by exported functions (i.e. First, Next, etc.) have
   137  // lifetimes that fall into two categories:
   138  //
   139  // Lifetime valid for duration of compaction. Range deletion keys and values are
   140  // stable for the duration of the compaction, due to the way in which a
   141  // compactionIter is typically constructed (i.e. via (*compaction).newInputIter,
   142  // which wraps the iterator over the range deletion block in a noCloseIter,
   143  // preventing the release of the backing memory until the compaction is
   144  // finished).
   145  //
   146  // Lifetime limited to duration of sstable block liveness. Point keys (SET, DEL,
   147  // etc.) and values must be cloned / copied following the return from the
   148  // exported function, and before a subsequent call to Next advances the iterator
   149  // and mutates the contents of the returned key and value.
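//
// In practice this means a consumer of the iterator must copy point keys and
// values before advancing again. A hedged sketch of such a consumer follows
// (the real driver lives in compaction.go; what is done with each entry is
// elided here):
//
//	func consume(iter *compactionIter) {
//		for key, val := iter.First(); key != nil; key, val = iter.Next() {
//			if key.Kind() == InternalKeyKindRangeDelete || rangekey.IsRangeKey(key.Kind()) {
//				// Spans are stable for the duration of the compaction; no
//				// copy is required before handing them to a fragmenter.
//				continue
//			}
//			// Point keys and values must be copied before the next call
//			// to Next, which may reuse the backing buffers.
//			keyCopy := append([]byte(nil), key.UserKey...)
//			valCopy := append([]byte(nil), val...)
//			_, _ = keyCopy, valCopy // e.g. hand them to the sstable writer
//		}
//	}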
   150  type compactionIter struct {
   151  	equal Equal
   152  	merge Merge
   153  	iter  internalIterator
   154  	err   error
   155  	// `key.UserKey` is set to `keyBuf` (a saved copy of `i.iterKey.UserKey`)
   156  	// and `key.Trailer` is set to `i.iterKey.Trailer`. This is the
   157  	// case on return from all public methods -- these methods return `key`.
   158  	// Additionally, it is the internal state when the code is moving to the
   159  	// next key so it can determine whether the user key has changed from
   160  	// the previous key.
   161  	key InternalKey
   162  	// keyTrailer is updated when `i.key` is updated and holds the key's
   163  	// original trailer (e.g., before any sequence-number zeroing or changes to
   164  	// key kind).
   165  	keyTrailer  uint64
   166  	value       []byte
   167  	valueCloser io.Closer
   168  	// Temporary buffer used for storing the previous user key in order to
   169  	// determine when iteration has advanced to a new user key and thus a new
   170  	// snapshot stripe.
   171  	keyBuf []byte
   172  	// Temporary buffer used for storing the previous value, which may be an
   173  	// unsafe, i.iter-owned slice that could be altered when the iterator is
   174  	// advanced.
   175  	valueBuf []byte
   176  	// Is the current entry valid?
   177  	valid     bool
   178  	iterKey   *InternalKey
   179  	iterValue []byte
   180  	// `skip` indicates whether the remaining skippable entries in the current
   181  	// snapshot stripe should be skipped or processed. An example of a non-
   182  	// skippable entry is a range tombstone as we need to return it from the
   183  	// `compactionIter`, even if a key covering its start key has already been
   184  	// seen in the same stripe. `skip` has no effect when `pos == iterPosNext`.
   185  	skip bool
   186  	// `pos` indicates the iterator position at the top of `Next()`. Its type's
   187  	// (`iterPos`) values take on the following meanings in the context of
   188  	// `compactionIter`.
   189  	//
   190  	// - `iterPosCurForward`: the iterator is at the last key returned.
   191  	// - `iterPosNext`: the iterator has already been advanced to the next
   192  	//   candidate key. For example, this happens when processing merge operands,
   193  	//   where we advance the iterator all the way into the next stripe or next
   194  	//   user key to ensure we've seen all mergeable operands.
   195  	// - `iterPosPrev`: this is invalid as compactionIter is forward-only.
   196  	pos iterPos
   197  	// The index of the snapshot for the current key within the snapshots slice.
   198  	curSnapshotIdx    int
   199  	curSnapshotSeqNum uint64
   200  	// The snapshot sequence numbers that need to be maintained. These sequence
   201  	// numbers define the snapshot stripes (see the Snapshots description
   202  	// above). The sequence numbers are in ascending order.
   203  	snapshots []uint64
   204  	// Reference to the range deletion tombstone fragmenter (e.g.,
   205  	// `compaction.rangeDelFrag`).
   206  	rangeDelFrag *keyspan.Fragmenter
   207  	rangeKeyFrag *keyspan.Fragmenter
   208  	// The fragmented tombstones.
   209  	tombstones []keyspan.Span
   210  	// The fragmented range keys.
   211  	rangeKeys []keyspan.Span
   212  	// Byte allocator for the tombstone keys.
   213  	alloc               bytealloc.A
   214  	allowZeroSeqNum     bool
   215  	elideTombstone      func(key []byte) bool
   216  	elideRangeTombstone func(start, end []byte) bool
   217  	// The on-disk format major version. This informs the types of keys that
   218  	// may be written to disk during a compaction.
   219  	formatVersion FormatMajorVersion
   220  }
   221  
   222  func newCompactionIter(
   223  	cmp Compare,
   224  	equal Equal,
   225  	formatKey base.FormatKey,
   226  	merge Merge,
   227  	iter internalIterator,
   228  	snapshots []uint64,
   229  	rangeDelFrag *keyspan.Fragmenter,
   230  	rangeKeyFrag *keyspan.Fragmenter,
   231  	allowZeroSeqNum bool,
   232  	elideTombstone func(key []byte) bool,
   233  	elideRangeTombstone func(start, end []byte) bool,
   234  	formatVersion FormatMajorVersion,
   235  ) *compactionIter {
   236  	i := &compactionIter{
   237  		equal:               equal,
   238  		merge:               merge,
   239  		iter:                iter,
   240  		snapshots:           snapshots,
   241  		rangeDelFrag:        rangeDelFrag,
   242  		rangeKeyFrag:        rangeKeyFrag,
   243  		allowZeroSeqNum:     allowZeroSeqNum,
   244  		elideTombstone:      elideTombstone,
   245  		elideRangeTombstone: elideRangeTombstone,
   246  		formatVersion:       formatVersion,
   247  	}
   248  	i.rangeDelFrag.Cmp = cmp
   249  	i.rangeDelFrag.Format = formatKey
   250  	i.rangeDelFrag.Emit = i.emitRangeDelChunk
   251  	i.rangeKeyFrag.Cmp = cmp
   252  	i.rangeKeyFrag.Format = formatKey
   253  	i.rangeKeyFrag.Emit = i.emitRangeKeyChunk
   254  	return i
   255  }
   256  
   257  func (i *compactionIter) First() (*InternalKey, []byte) {
   258  	if i.err != nil {
   259  		return nil, nil
   260  	}
   261  	i.iterKey, i.iterValue = i.iter.First()
   262  	if i.iterKey != nil {
   263  		i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(i.iterKey.SeqNum(), i.snapshots)
   264  	}
   265  	i.pos = iterPosNext
   266  	return i.Next()
   267  }
   268  
   269  func (i *compactionIter) Next() (*InternalKey, []byte) {
   270  	if i.err != nil {
   271  		return nil, nil
   272  	}
   273  
   274  	// Close the closer for the current value if one was open.
   275  	if i.closeValueCloser() != nil {
   276  		return nil, nil
   277  	}
   278  
   279  	// Prior to this call to `Next()` we are in one of three situations with
   280  	// respect to `iterKey` and related state:
   281  	//
   282  	// - `!skip && pos == iterPosNext`: `iterKey` is already at the next key.
   283  	// - `!skip && pos == iterPosCurForward`: We are at the key that has been returned.
   284  	//   To move forward we advance by one key, even if that lands us in the same
   285  	//   snapshot stripe.
   286  	// - `skip && pos == iterPosCurForward`: We are at the key that has been returned.
   287  	//   To move forward we skip skippable entries in the stripe.
   288  	if i.pos == iterPosCurForward {
   289  		if i.skip {
   290  			i.skipInStripe()
   291  		} else {
   292  			i.nextInStripe()
   293  		}
   294  	}
   295  
   296  	i.pos = iterPosCurForward
   297  	i.valid = false
   298  	for i.iterKey != nil {
   299  		if i.iterKey.Kind() == InternalKeyKindRangeDelete || rangekey.IsRangeKey(i.iterKey.Kind()) {
   300  			// Return the span so the compaction can use it for file truncation and add
   301  			// it to the relevant fragmenter. We do not set `skip` to true before
   302  			// returning as there may be a forthcoming point key with the same user key
   303  			// and sequence number. Such a point key must be visible (i.e., not skipped
   304  			// over) since we promise point keys are not deleted by range tombstones at
   305  			// the same sequence number.
   306  			//
   307  			// Although, note that `skip` may already be true before reaching here
   308  			// due to an earlier key in the stripe. Then it is fine to leave it set
   309  			// to true, as the earlier key must have had a higher sequence number.
   310  			//
   311  			// NOTE: there is a subtle invariant violation here in that calling
   312  			// saveKey and returning a reference to the temporary slice violates
   313  			// the stability guarantee for range deletion keys. A potential
   314  			// mitigation could return the original iterKey and iterValue
   315  			// directly, as the backing memory is guaranteed to be stable until
   316  			// the compaction completes. The violation here is only minor in
   317  			// that the caller immediately clones the range deletion InternalKey
   318  			// when passing the key to the deletion fragmenter (see the
   319  			// call-site in compaction.go).
   320  			// TODO(travers): address this violation by removing the call to
   321  			// saveKey and instead return the original iterKey and iterValue.
   322  			// This goes against the comment on i.key in the struct, and
   323  			// therefore warrants some investigation.
   324  			i.saveKey()
   325  			i.value = i.iterValue
   326  			i.valid = true
   327  			return &i.key, i.value
   328  		}
   329  
   330  		if i.rangeDelFrag.Covers(*i.iterKey, i.curSnapshotSeqNum) {
   331  			i.saveKey()
   332  			i.skipInStripe()
   333  			continue
   334  		}
   335  
   336  		switch i.iterKey.Kind() {
   337  		case InternalKeyKindDelete, InternalKeyKindSingleDelete:
   338  			// If we're at the last snapshot stripe and the tombstone can be elided,
   339  			// skip skippable keys in the same stripe.
   340  			if i.curSnapshotIdx == 0 && i.elideTombstone(i.iterKey.UserKey) {
   341  				i.saveKey()
   342  				i.skipInStripe()
   343  				continue
   344  			}
   345  
   346  			switch i.iterKey.Kind() {
   347  			case InternalKeyKindDelete:
   348  				i.saveKey()
   349  				i.value = i.iterValue
   350  				i.valid = true
   351  				i.skip = true
   352  				return &i.key, i.value
   353  
   354  			case InternalKeyKindSingleDelete:
   355  				if i.singleDeleteNext() {
   356  					return &i.key, i.value
   357  				}
   358  
   359  				continue
   360  			}
   361  
   362  		case InternalKeyKindSet, InternalKeyKindSetWithDelete:
   363  			// The key we emit for this entry is a function of the current key
   364  			// kind, and whether this entry is followed by a DEL/SINGLEDEL
   365  			// entry. setNext() does the work to move the iterator forward,
   366  			// preserving the original value, and potentially mutating the key
   367  			// kind.
   368  			i.setNext()
   369  			return &i.key, i.value
   370  
   371  		case InternalKeyKindMerge:
   372  			// Record the snapshot index before mergeNext as merging
   373  			// advances the iterator, adjusting curSnapshotIdx.
   374  			origSnapshotIdx := i.curSnapshotIdx
   375  			var valueMerger ValueMerger
   376  			valueMerger, i.err = i.merge(i.iterKey.UserKey, i.iterValue)
   377  			var change stripeChangeType
   378  			if i.err == nil {
   379  				change = i.mergeNext(valueMerger)
   380  			}
   381  			var needDelete bool
   382  			if i.err == nil {
   383  				// includesBase is true whenever we've transformed the MERGE record
   384  				// into a SET.
   385  				includesBase := i.key.Kind() == InternalKeyKindSet
   386  				i.value, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, includesBase)
   387  			}
   388  			if i.err == nil {
   389  				if needDelete {
   390  					i.valid = false
   391  					if i.closeValueCloser() != nil {
   392  						return nil, nil
   393  					}
   394  					continue
   395  				}
   396  				// A non-skippable entry does not necessarily cover later merge
   397  				// operands, so we must not zero the current merge result's seqnum.
   398  				//
   399  				// For example, suppose the forthcoming two keys are a range
   400  				// tombstone, `[a, b)#3`, and a merge operand, `a#3`. Recall that
   401  				// range tombstones do not cover point keys at the same seqnum, so
   402  				// `a#3` is not deleted. The range tombstone will be seen first due
   403  				// to its larger value type. Since it is a non-skippable key, the
   404  				// current merge will not include `a#3`. If we zeroed the current
   405  				// merge result's seqnum, then it would conflict with the upcoming
   406  				// merge including `a#3`, whose seqnum will also be zeroed.
   407  				if change != sameStripeNonSkippable {
   408  					i.maybeZeroSeqnum(origSnapshotIdx)
   409  				}
   410  				return &i.key, i.value
   411  			}
   412  			if i.err != nil {
   413  				i.valid = false
   414  				i.err = base.MarkCorruptionError(i.err)
   415  			}
   416  			return nil, nil
   417  
   418  		default:
   419  			i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind()))
   420  			i.valid = false
   421  			return nil, nil
   422  		}
   423  	}
   424  
   425  	return nil, nil
   426  }
   427  
   428  func (i *compactionIter) closeValueCloser() error {
   429  	if i.valueCloser == nil {
   430  		return nil
   431  	}
   432  
   433  	i.err = i.valueCloser.Close()
   434  	i.valueCloser = nil
   435  	if i.err != nil {
   436  		i.valid = false
   437  	}
   438  	return i.err
   439  }
   440  
   441  // snapshotIndex returns the index of the first sequence number in snapshots
   442  // which is strictly greater than seq.
   443  func snapshotIndex(seq uint64, snapshots []uint64) (int, uint64) {
   444  	index := sort.Search(len(snapshots), func(i int) bool {
   445  		return snapshots[i] > seq
   446  	})
   447  	if index >= len(snapshots) {
   448  		return index, InternalKeySeqNumMax
   449  	}
   450  	return index, snapshots[index]
   451  }
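
// For example, with snapshots {5, 10} (ascending), a hypothetical caller
// would observe:
//
//	idx, seq := snapshotIndex(3, []uint64{5, 10})  // idx == 0, seq == 5
//	idx, seq = snapshotIndex(5, []uint64{5, 10})   // idx == 1: a key at exactly a snapshot's seqnum belongs to the newer stripe
//	idx, seq = snapshotIndex(7, []uint64{5, 10})   // idx == 1, seq == 10
//	idx, seq = snapshotIndex(12, []uint64{5, 10})  // idx == 2, seq == InternalKeySeqNumMax
//	_, _ = idx, seq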
   452  
   453  // skipInStripe skips over skippable keys in the same stripe and user key.
   454  func (i *compactionIter) skipInStripe() {
   455  	i.skip = true
   456  	var change stripeChangeType
   457  	for {
   458  		change = i.nextInStripe()
   459  		if change == sameStripeNonSkippable || change == newStripe {
   460  			break
   461  		}
   462  	}
   463  	// Reset skip if we landed outside the original stripe. Otherwise, we landed
   464  	// in the same stripe on a non-skippable key. In that case we should preserve
   465  	// `i.skip == true` such that later keys in the stripe will continue to be
   466  	// skipped.
   467  	if change == newStripe {
   468  		i.skip = false
   469  	}
   470  }
   471  
   472  func (i *compactionIter) iterNext() bool {
   473  	i.iterKey, i.iterValue = i.iter.Next()
   474  	return i.iterKey != nil
   475  }
   476  
   477  // stripeChangeType indicates how the snapshot stripe changed relative to the previous
   478  // key. If no change, it also indicates whether the current entry is skippable.
   479  type stripeChangeType int
   480  
   481  const (
   482  	newStripe stripeChangeType = iota
   483  	sameStripeSkippable
   484  	sameStripeNonSkippable
   485  )
   486  
   487  // nextInStripe advances the iterator and returns a stripeChangeType value
   488  // indicating how its state changed.
   489  //
   490  // Calls to nextInStripe must be preceded by a call to saveKey so that a
   491  // stable copy of the current key is retained and forward iteration can
   492  // proceed with a reference to the original key. Care should be taken to avoid
   493  // overwriting or mutating the saved key or value before they have been returned
   494  // to the caller of the exported function (i.e. the caller of Next, First, etc.).
   495  func (i *compactionIter) nextInStripe() stripeChangeType {
   496  	if !i.iterNext() {
   497  		return newStripe
   498  	}
   499  	key := i.iterKey
   500  
   501  	// NB: The below conditional is an optimization to avoid a user key
   502  	// comparison in many cases. Internal keys with the same user key are
   503  	// ordered in (strictly) descending order by trailer. If the new key has a
   504  	// greater or equal trailer, or the previous key had a zero sequence number,
   505  	// the new key must have a new user key.
   506  	//
   507  	// A couple things make these cases common:
   508  	// - Sequence-number zeroing ensures ~all of the keys in L6 have a zero
   509  	//   sequence number.
   510  	// - Ingested sstables' keys all adopt the same sequence number.
   511  	if i.keyTrailer <= base.InternalKeyZeroSeqnumMaxTrailer || key.Trailer >= i.keyTrailer {
   512  		if invariants.Enabled && i.equal(i.key.UserKey, key.UserKey) {
   513  			prevKey := i.key
   514  			prevKey.Trailer = i.keyTrailer
   515  			panic(fmt.Sprintf("bitalostable: invariant violation: %s and %s out of order", key, prevKey))
   516  		}
   517  		i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(key.SeqNum(), i.snapshots)
   518  		return newStripe
   519  	} else if !i.equal(i.key.UserKey, key.UserKey) {
   520  		i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(key.SeqNum(), i.snapshots)
   521  		return newStripe
   522  	}
   523  	origSnapshotIdx := i.curSnapshotIdx
   524  	i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(key.SeqNum(), i.snapshots)
   525  	switch key.Kind() {
   526  	case InternalKeyKindRangeDelete:
   527  		// Range tombstones need to be exposed by the compactionIter to the upper level
   528  		// `compaction` object, so return them regardless of whether they are in the same
   529  		// snapshot stripe.
   530  		if i.curSnapshotIdx == origSnapshotIdx {
   531  			return sameStripeNonSkippable
   532  		}
   533  		return newStripe
   534  	case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
   535  		// Range keys are interleaved at the max sequence number for a given user
   536  		// key, so we should not see any more range keys in this stripe.
   537  		panic("unreachable")
   538  	case InternalKeyKindInvalid:
   539  		if i.curSnapshotIdx == origSnapshotIdx {
   540  			return sameStripeNonSkippable
   541  		}
   542  		return newStripe
   543  	}
   544  	if i.curSnapshotIdx == origSnapshotIdx {
   545  		return sameStripeSkippable
   546  	}
   547  	return newStripe
   548  }
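
// The trailer comparisons in nextInStripe rely on the standard InternalKey
// encoding used throughout this codebase: the sequence number occupies the
// high 56 bits of the trailer and the kind the low 8 bits, so for a single
// user key newer entries have strictly larger trailers. A sketch of that
// packing:
//
//	func makeTrailer(seqNum uint64, kind base.InternalKeyKind) uint64 {
//		return (seqNum << 8) | uint64(kind)
//	}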
   549  
   550  func (i *compactionIter) setNext() {
   551  	// Save the current key.
   552  	i.saveKey()
   553  	i.value = i.iterValue
   554  	i.valid = true
   555  	i.maybeZeroSeqnum(i.curSnapshotIdx)
   556  
   557  	// There are two cases where we can early return and skip the remaining
   558  	// records in the stripe:
   559  	// - If the DB does not support SETWITHDEL (the format major version predates FormatSetWithDelete).
   560  	// - If this key is already a SETWITHDEL.
   561  	if i.formatVersion < FormatSetWithDelete ||
   562  		i.iterKey.Kind() == InternalKeyKindSetWithDelete {
   563  		i.skip = true
   564  		return
   565  	}
   566  
   567  	// We are iterating forward. Save the current value.
   568  	i.valueBuf = append(i.valueBuf[:0], i.iterValue...)
   569  	i.value = i.valueBuf
   570  
   571  	// Otherwise, we continue to loop through entries in the stripe looking for a
   572  	// DEL. Note that we may stop *before* encountering a DEL, if one exists.
   573  	for {
   574  		switch t := i.nextInStripe(); t {
   575  		case newStripe, sameStripeNonSkippable:
   576  			i.pos = iterPosNext
   577  			if t == sameStripeNonSkippable {
   578  				// We iterated onto a key that we cannot skip. We can
   579  				// conservatively transform the original SET into a SETWITHDEL
   580  				// as an indication that there *may* still be a DEL/SINGLEDEL
   581  				// under this SET, even if we did not actually encounter one.
   582  				//
   583  				// This is safe to do, as:
   584  				//
   585  				// - in the case that there *is not* actually a DEL/SINGLEDEL
   586  				// under this entry, any SINGLEDEL above this now-transformed
   587  				// SETWITHDEL will become a DEL when the two encounter in a
   588  				// compaction. The DEL will eventually be elided in a
   589  				// subsequent compaction. The cost for ensuring correctness is
   590  				// that this entry is kept around for an additional compaction
   591  				// cycle(s).
   592  				//
   593  				// - in the case there *is* indeed a DEL/SINGLEDEL under us
   594  				// (but in a different stripe or sstable), then we will have
   595  				// already done the work to transform the SET into a
   596  				// SETWITHDEL, and we will skip any additional iteration when
   597  				// this entry is encountered again in a subsequent compaction.
   598  				//
   599  				// Ideally, this codepath would be smart enough to handle the
   600  				// case of SET <- RANGEDEL <- ... <- DEL/SINGLEDEL <- ....
   601  				// This requires preserving any RANGEDEL entries we encounter
   602  				// along the way, then emitting the original (possibly
   603  				// transformed) key, followed by the RANGEDELs. This requires
   604  				// a sizable refactoring of the existing code, as nextInStripe
   605  				// currently returns a sameStripeNonSkippable when it
   606  				// encounters a RANGEDEL.
   607  				// TODO(travers): optimize to handle the RANGEDEL case if it
   608  				// turns out to be a performance problem.
   609  				i.key.SetKind(InternalKeyKindSetWithDelete)
   610  
   611  				// By setting i.skip=true, we are saying that after the
   612  				// non-skippable key is emitted (which is likely a RANGEDEL),
   613  				// the remaining point keys that share the same user key as this
   614  				// saved key should be skipped.
   615  				i.skip = true
   616  			}
   617  			return
   618  		case sameStripeSkippable:
   619  			// We're still in the same stripe. If this is a DEL/SINGLEDEL, we
   620  			// stop looking and emit a SETWITHDEL. Subsequent keys are
   621  			// eligible for skipping.
   622  			if i.iterKey.Kind() == InternalKeyKindDelete ||
   623  				i.iterKey.Kind() == InternalKeyKindSingleDelete {
   624  				i.key.SetKind(InternalKeyKindSetWithDelete)
   625  				i.skip = true
   626  				return
   627  			}
   628  		default:
   629  			panic("bitalostable: unexpected stripeChangeType: " + strconv.Itoa(int(t)))
   630  		}
   631  	}
   632  }
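
// A condensed, illustrative summary of setNext's possible outcomes for a
// saved SET (not the production logic; kinds are spelled out as strings and
// only the entries following the SET within the same stripe are considered):
//
//	func resolveSet(olderInStripe []string, supportsSetWithDelete bool) string {
//		if !supportsSetWithDelete {
//			return "SET" // pre-FormatSetWithDelete: emit the SET unchanged
//		}
//		for _, k := range olderInStripe {
//			switch k {
//			case "DEL", "SINGLEDEL":
//				return "SETWITHDEL" // a deletion is known to sit underneath
//			case "RANGEDEL":
//				// Non-skippable entry: conservatively assume a deletion may
//				// still be hiding below, and stop looking.
//				return "SETWITHDEL"
//			}
//			// Older SET/SETWITHDEL/MERGE entries are skippable; keep looking.
//		}
//		return "SET" // the stripe ended without any sign of a deletion
//	}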
   633  
   634  func (i *compactionIter) mergeNext(valueMerger ValueMerger) stripeChangeType {
   635  	// Save the current key.
   636  	i.saveKey()
   637  	i.valid = true
   638  
   639  	// Loop looking for older values in the current snapshot stripe and merge
   640  	// them.
   641  	for {
   642  		if change := i.nextInStripe(); change == sameStripeNonSkippable || change == newStripe {
   643  			i.pos = iterPosNext
   644  			return change
   645  		}
   646  		key := i.iterKey
   647  		switch key.Kind() {
   648  		case InternalKeyKindDelete, InternalKeyKindSingleDelete:
   649  			// We've hit a deletion tombstone. Return everything up to this point and
   650  			// then skip entries until the next snapshot stripe. We change the kind
   651  			// of the result key to a Set so that it shadows keys in lower
   652  			// levels. That is, MERGE+DEL -> SET.
   653  			// We do the same for SingleDelete since SingleDelete is only
   654  			// permitted (with deterministic behavior) for keys that have been
   655  			// set once since the last SingleDelete/Delete, so everything
   656  			// older is acceptable to shadow. Note that this is slightly
   657  			// different from singleDeleteNext() which implements stricter
   658  			// semantics in terms of applying the SingleDelete to the single
   659  			// next Set. But those stricter semantics are not observable to
   660  			// the end-user since Iterator interprets SingleDelete as Delete.
   661  			// We could do something more complicated here and consume only a
   662  			// single Set, and then merge in any following Sets, but that is
   663  			// complicated wrt code and unnecessary given the narrow permitted
   664  			// use of SingleDelete.
   665  			i.key.SetKind(InternalKeyKindSet)
   666  			i.skip = true
   667  			return sameStripeSkippable
   668  
   669  		case InternalKeyKindSet, InternalKeyKindSetWithDelete:
   670  			if i.rangeDelFrag.Covers(*key, i.curSnapshotSeqNum) {
   671  				// We change the kind of the result key to a Set so that it shadows
   672  				// keys in lower levels. That is, MERGE+RANGEDEL -> SET. This isn't
   673  				// strictly necessary, but provides consistency with the behavior of
   674  				// MERGE+DEL.
   675  				i.key.SetKind(InternalKeyKindSet)
   676  				i.skip = true
   677  				return sameStripeSkippable
   678  			}
   679  
   680  			// We've hit a Set or SetWithDel value. Merge with the existing
   681  			// value and return. We change the kind of the resulting key to a
   682  			// Set so that it shadows keys in lower levels. That is:
   683  			// MERGE + (SET*) -> SET.
   684  			i.err = valueMerger.MergeOlder(i.iterValue)
   685  			if i.err != nil {
   686  				i.valid = false
   687  				return sameStripeSkippable
   688  			}
   689  			i.key.SetKind(InternalKeyKindSet)
   690  			i.skip = true
   691  			return sameStripeSkippable
   692  
   693  		case InternalKeyKindMerge:
   694  			if i.rangeDelFrag.Covers(*key, i.curSnapshotSeqNum) {
   695  				// We change the kind of the result key to a Set so that it shadows
   696  				// keys in lower levels. That is, MERGE+RANGEDEL -> SET. This isn't
   697  				// strictly necessary, but provides consistency with the behavior of
   698  				// MERGE+DEL.
   699  				i.key.SetKind(InternalKeyKindSet)
   700  				i.skip = true
   701  				return sameStripeSkippable
   702  			}
   703  
   704  			// We've hit another Merge value. Merge with the existing value and
   705  			// continue looping.
   706  			i.err = valueMerger.MergeOlder(i.iterValue)
   707  			if i.err != nil {
   708  				i.valid = false
   709  				return sameStripeSkippable
   710  			}
   711  
   712  		default:
   713  			i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind()))
   714  			i.valid = false
   715  			return sameStripeSkippable
   716  		}
   717  	}
   718  }
   719  
   720  func (i *compactionIter) singleDeleteNext() bool {
   721  	// Save the current key.
   722  	i.saveKey()
   723  	i.value = i.iterValue
   724  	i.valid = true
   725  
   726  	// Loop until we find a key to be passed to the next level.
   727  	for {
   728  		if change := i.nextInStripe(); change == sameStripeNonSkippable || change == newStripe {
   729  			i.pos = iterPosNext
   730  			return true
   731  		}
   732  
   733  		key := i.iterKey
   734  		switch key.Kind() {
   735  		case InternalKeyKindDelete, InternalKeyKindMerge, InternalKeyKindSetWithDelete:
   736  			// We've hit a Delete, Merge or SetWithDelete; transform the
   737  			// SingleDelete into a full Delete.
   738  			i.key.SetKind(InternalKeyKindDelete)
   739  			i.skip = true
   740  			return true
   741  
   742  		case InternalKeyKindSet:
   743  			i.nextInStripe()
   744  			i.valid = false
   745  			return false
   746  
   747  		case InternalKeyKindSingleDelete:
   748  			continue
   749  
   750  		default:
   751  			i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind()))
   752  			i.valid = false
   753  			return false
   754  		}
   755  	}
   756  }
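
// An illustrative reduction of singleDeleteNext's outcomes (not the
// production code path; kinds are spelled out as strings, ordered newest to
// oldest, and only entries in the same stripe and user key are considered):
//
//	func resolveSingleDelete(olderInStripe []string) string {
//		for _, k := range olderInStripe {
//			switch k {
//			case "SET":
//				return "drop both" // the SINGLEDEL consumed exactly one SET
//			case "DEL", "MERGE", "SETWITHDEL":
//				return "DEL" // hardened into a regular deletion
//			case "SINGLEDEL":
//				continue // consecutive SINGLEDELs collapse; keep looking
//			}
//		}
//		return "SINGLEDEL" // nothing older in the stripe; emit as-is
//	}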
   757  
   758  func (i *compactionIter) saveKey() {
   759  	i.keyBuf = append(i.keyBuf[:0], i.iterKey.UserKey...)
   760  	i.key.UserKey = i.keyBuf
   761  	i.key.Trailer = i.iterKey.Trailer
   762  	i.keyTrailer = i.iterKey.Trailer
   763  }
   764  
   765  func (i *compactionIter) cloneKey(key []byte) []byte {
   766  	i.alloc, key = i.alloc.Copy(key)
   767  	return key
   768  }
   769  
   770  func (i *compactionIter) Key() InternalKey {
   771  	return i.key
   772  }
   773  
   774  func (i *compactionIter) Value() []byte {
   775  	return i.value
   776  }
   777  
   778  func (i *compactionIter) Valid() bool {
   779  	return i.valid
   780  }
   781  
   782  func (i *compactionIter) Error() error {
   783  	return i.err
   784  }
   785  
   786  func (i *compactionIter) Close() error {
   787  	err := i.iter.Close()
   788  	if i.err == nil {
   789  		i.err = err
   790  	}
   791  
   792  	// Close the closer for the current value if one was open.
   793  	if i.valueCloser != nil {
   794  		i.err = firstError(i.err, i.valueCloser.Close())
   795  		i.valueCloser = nil
   796  	}
   797  
   798  	return i.err
   799  }
   800  
   801  // Tombstones returns a list of pending range tombstones in the fragmenter
   802  // up to the specified key, or all pending range tombstones if key = nil.
   803  func (i *compactionIter) Tombstones(key []byte) []keyspan.Span {
   804  	if key == nil {
   805  		i.rangeDelFrag.Finish()
   806  	} else {
   807  		// The specified end key is exclusive; no versions of the specified
   808  		// user key (including range tombstones covering that key) should
   809  		// be flushed yet.
   810  		i.rangeDelFrag.TruncateAndFlushTo(key)
   811  	}
   812  	tombstones := i.tombstones
   813  	i.tombstones = nil
   814  	return tombstones
   815  }
   816  
   817  // RangeKeys returns a list of pending fragmented range keys up to the specified
   818  // key, or all pending range keys if key = nil.
   819  func (i *compactionIter) RangeKeys(key []byte) []keyspan.Span {
   820  	if key == nil {
   821  		i.rangeKeyFrag.Finish()
   822  	} else {
   823  		// The specified end key is exclusive; no versions of the specified
   824  		// user key (including range keys covering that key) should
   825  		// be flushed yet.
   826  		i.rangeKeyFrag.TruncateAndFlushTo(key)
   827  	}
   828  	rangeKeys := i.rangeKeys
   829  	i.rangeKeys = nil
   830  	return rangeKeys
   831  }
   832  
   833  func (i *compactionIter) emitRangeDelChunk(fragmented keyspan.Span) {
   834  	// Apply the snapshot stripe rules, keeping only the latest tombstone for
   835  	// each snapshot stripe.
   836  	currentIdx := -1
   837  	keys := fragmented.Keys[:0]
   838  	for _, k := range fragmented.Keys {
   839  		idx, _ := snapshotIndex(k.SeqNum(), i.snapshots)
   840  		if currentIdx == idx {
   841  			continue
   842  		}
   843  		if idx == 0 && i.elideRangeTombstone(fragmented.Start, fragmented.End) {
   844  			// This is the last snapshot stripe and the range tombstone
   845  			// can be elided.
   846  			break
   847  		}
   848  
   849  		keys = append(keys, k)
   850  		if idx == 0 {
   851  			// This is the last snapshot stripe.
   852  			break
   853  		}
   854  		currentIdx = idx
   855  	}
   856  	if len(keys) > 0 {
   857  		i.tombstones = append(i.tombstones, keyspan.Span{
   858  			Start: fragmented.Start,
   859  			End:   fragmented.End,
   860  			Keys:  keys,
   861  		})
   862  	}
   863  }
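
// As a worked example of the loop above (assuming snapshots = {7} and no
// elision), a fragment whose keys have seqnums 9, 8, 6 and 5 keeps only 9
// (newest in the upper stripe) and 6 (newest in the bottom stripe):
//
//	package main
//
//	import (
//		"fmt"
//		"sort"
//	)
//
//	func main() {
//		snapshots := []uint64{7}
//		currentIdx := -1
//		var kept []uint64
//		for _, seq := range []uint64{9, 8, 6, 5} { // newest to oldest
//			idx := sort.Search(len(snapshots), func(i int) bool { return snapshots[i] > seq })
//			if idx == currentIdx {
//				continue // not the newest key in its stripe
//			}
//			kept = append(kept, seq)
//			if idx == 0 {
//				break // bottom stripe reached; older keys are shadowed
//			}
//			currentIdx = idx
//		}
//		fmt.Println(kept) // [9 6]
//	}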
   864  
   865  func (i *compactionIter) emitRangeKeyChunk(fragmented keyspan.Span) {
   866  	// Elision of snapshot stripes happens in rangeKeyCompactionTransform, so no need to
   867  	// do that here.
   868  	if len(fragmented.Keys) > 0 {
   869  		i.rangeKeys = append(i.rangeKeys, fragmented)
   870  	}
   871  }
   872  
   873  // maybeZeroSeqnum attempts to set the seqnum for the current key to 0. Doing
   874  // so improves compression and enables an optimization during forward iteration
   875  // to skip some key comparisons. The seqnum for an entry can be zeroed if the
   876  // entry is on the bottom snapshot stripe and on the bottom level of the LSM.
   877  func (i *compactionIter) maybeZeroSeqnum(snapshotIdx int) {
   878  	if !i.allowZeroSeqNum {
   879  		// TODO(peter): allowZeroSeqNum applies to the entire compaction. We could
   880  		// make the determination on a key by key basis, similar to what is done
   881  		// for elideTombstone. Need to add a benchmark for compactionIter to verify
   882  		// that isn't too expensive.
   883  		return
   884  	}
   885  	if snapshotIdx > 0 {
   886  		// This is not the last snapshot stripe, so the seqnum must be preserved.
   887  		return
   888  	}
   889  	i.key.SetSeqNum(0)
   890  }