github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/compaction_iter.go

// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"fmt"
	"sort"

	"github.com/petermattis/pebble/internal/bytealloc"
	"github.com/petermattis/pebble/internal/rangedel"
)

// compactionIter provides a forward-only iterator that encapsulates the logic
// for collapsing entries during compaction. It wraps an internal iterator and
// collapses entries that are no longer necessary because they are shadowed by
// newer entries. The simplest example of this is when the internal iterator
// contains two keys: a.PUT.2 and a.PUT.1. Instead of returning both entries,
// compactionIter collapses the second entry because it is no longer
// necessary. The high-level structure of compactionIter is to iterate over
// its internal iterator and output one entry for every user key. There are
// four complications to this story.
//
// 1. Eliding Deletion Tombstones
//
// Consider the entries a.DEL.2 and a.PUT.1. These entries collapse to
// a.DEL.2. Do we have to output the entry a.DEL.2? Only if a.DEL.2 possibly
// shadows an entry at a lower level. If we're compacting to the base-level in
// the LSM tree then a.DEL.2 is definitely not shadowing an entry at a lower
// level and can be elided.
//
// We can do slightly better than only eliding deletion tombstones at the base
// level by observing that we can elide a deletion tombstone if there are no
// sstables that contain the entry's key. This check is performed by
// elideTombstone.
//
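// As a minimal sketch (not the actual implementation), an elideTombstone
// callback for a compaction writing to output level n could check the levels
// below the output level, where n, numLevels, and overlapsLevel are
// hypothetical stand-ins and overlapsLevel reports whether any sstable in the
// given level overlaps key:
//
//   elideTombstone := func(key []byte) bool {
//     for level := n + 1; level < numLevels; level++ {
//       if overlapsLevel(level, key) {
//         return false
//       }
//     }
//     return true
//   }
//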
// 2. Merges
//
// The MERGE operation merges the value for an entry with the existing value
// for that key. The logical value of an entry can be composed of a series of
// merge operations. When compactionIter sees a MERGE, it scans forward in its
// internal iterator collapsing MERGE operations for the same key until it
// encounters a SET or DELETE operation. For example, the keys a.MERGE.4,
// a.MERGE.3, a.MERGE.2 will be collapsed to a.MERGE.4 and the values will be
// merged using the specified Merger.
//
// An interesting case here occurs when MERGE is combined with SET. Consider
// the entries a.MERGE.3 and a.SET.2. The collapsed key will be a.SET.3. The
// reason the kind is changed to SET is that the SET operation acts as a
// barrier preventing further merging. This can be seen better in the scenario
// a.MERGE.3, a.SET.2, a.MERGE.1. The entry a.MERGE.1 may be at a lower
// (older) level and not involved in the compaction. If the compaction of
// a.MERGE.3 and a.SET.2 produced a.MERGE.3, a subsequent compaction with
// a.MERGE.1 would merge the values together incorrectly.
//
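// As a purely illustrative sketch, a concatenating merge function matching
// the four-argument form used by mergeNext below (key, the value accumulated
// so far, the next older value encountered, and an optional reuse buffer)
// could be written as:
//
//   var concat Merge = func(key, accum, next, buf []byte) []byte {
//     buf = append(buf, accum...)
//     return append(buf, next...)
//   }
//
// With this merger, the entries a.MERGE.4 ("x"), a.MERGE.3 ("y"), a.MERGE.2
// ("z") collapse to a.MERGE.4 with the value "xyz".
//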
// 3. Snapshots
//
// Snapshots are lightweight point-in-time views of the DB state. At its core,
// a snapshot is a sequence number along with a guarantee from Pebble that it
// will maintain the view of the database at that sequence number. Part of this
// guarantee is relatively straightforward to achieve. When reading from the
// database Pebble will ignore sequence numbers that are larger than the
// snapshot sequence number. The primary complexity with snapshots occurs
// during compaction: the collapsing of entries that are shadowed by newer
// entries is at odds with the guarantee that Pebble will maintain the view of
// the database at the snapshot sequence number. Rather than collapsing entries
// up to the next user key, compactionIter can only collapse entries up to the
// next snapshot boundary. That is, every snapshot boundary potentially causes
// another entry for the same user-key to be emitted. Another way to view this
// is that snapshots define stripes and entries are collapsed within stripes,
// but not across stripes. Consider the following scenario:
//
//   a.PUT.9
//   a.DEL.8
//   a.PUT.7
//   a.DEL.6
//   a.PUT.5
//
// In the absence of snapshots these entries would be collapsed to
// a.PUT.9. What if there is a snapshot at sequence number 7? The entries can
// be divided into two stripes and collapsed within the stripes:
//
//   a.PUT.9        a.PUT.9
//   a.DEL.8  --->
//   a.PUT.7
//   --             --
//   a.DEL.6  --->  a.DEL.6
//   a.PUT.5
//
// All of the rules described earlier still apply, but they are confined to
// operate within a snapshot stripe. Snapshots only affect compaction when the
// snapshot sequence number lies within the range of sequence numbers being
// compacted. In the above example, a snapshot at sequence number 10 or at
// sequence number 5 would not have any effect.
//
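// The snapshot stripe for an entry is computed by snapshotIndex below. For
// the example above, with snapshots = []uint64{7}:
//
//   snapshotIndex(9, snapshots) == (1, InternalKeySeqNumMax)  // upper stripe
//   snapshotIndex(7, snapshots) == (1, InternalKeySeqNumMax)  // upper stripe
//   snapshotIndex(6, snapshots) == (0, 7)                     // lower stripe
//
// Two entries for the same user key are in the same stripe exactly when
// snapshotIndex returns the same index for their sequence numbers.
//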
// 4. Range Deletions
//
// Range deletions provide the ability to delete all of the keys (and values)
// in a contiguous range. Range deletions are stored indexed by their start
// key. The end key of the range is stored in the value. In order to support
// lookup of the range deletions which overlap with a particular key, the range
// deletion tombstones need to be fragmented whenever they overlap. This
// fragmentation is performed by rangedel.Fragmenter. The fragments are then
// subject to the rules for snapshots. For example, consider the two range
// tombstones [a,e)#1 and [c,g)#2:
//
//   2:     c-------g
//   1: a-------e
//
// These tombstones will be fragmented into:
//
//   2:     c---e---g
//   1: a---c---e
//
// Do we output the fragment [c,e)#1? Since it is covered by [c,e)#2, the
// answer depends on whether it is in a new snapshot stripe.
//
// In addition to the fragmentation of range tombstones, compaction also needs
// to take the range tombstones into consideration when outputting normal
// keys. Just as with point deletions, a range deletion covering an entry can
// cause the entry to be elided.
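//
// As a hedged sketch of the wiring (mirroring newCompactionIter below), the
// fragmenter is configured with a comparator and an emit callback, fed the
// tombstones in order of start key, and flushed at the end. Here
// makeRangeDelKey stands in for constructing an InternalKey with the given
// user key, sequence number, and kind InternalKeyKindRangeDelete:
//
//   var frag rangedel.Fragmenter
//   frag.Cmp = cmp
//   frag.Emit = func(fragmented []rangedel.Tombstone) { /* collect */ }
//   frag.Add(makeRangeDelKey("a", 1), []byte("e")) // [a,e)#1
//   frag.Add(makeRangeDelKey("c", 2), []byte("g")) // [c,g)#2
//   frag.Finish() // emits the fragments [a,c)#1, [c,e)#2, [c,e)#1, [e,g)#2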
type compactionIter struct {
	cmp   Compare
	merge Merge
	iter  internalIterator
	err   error
	key   InternalKey
	value []byte
	// Temporary buffer used for storing the previous user key in order to
	// determine when iteration has advanced to a new user key and thus a new
	// snapshot stripe.
	keyBuf []byte
	// Temporary buffer used for aggregating merge operations.
	valueBuf []byte
	// Is the current entry valid?
	valid     bool
	iterKey   *InternalKey
	iterValue []byte
	// skip indicates whether the remaining entries in the current snapshot
	// stripe should be skipped or processed. skip is set to true when an entry
	// from the stripe is returned to the caller, and reset to false once the
	// remainder of the stripe has been skipped over.
	skip bool
	// The index of the snapshot for the current key within the snapshots slice.
	curSnapshotIdx    int
	curSnapshotSeqNum uint64
	// The snapshot sequence numbers that need to be maintained. These sequence
	// numbers define the snapshot stripes (see the Snapshots description
	// above). The sequence numbers are in ascending order.
	snapshots []uint64
	// The range deletion tombstone fragmenter.
	rangeDelFrag rangedel.Fragmenter
	// The fragmented tombstones.
	tombstones []rangedel.Tombstone
	// Byte allocator for the tombstone keys.
	alloc               bytealloc.A
	allowZeroSeqNum     bool
	elideTombstone      func(key []byte) bool
	elideRangeTombstone func(start, end []byte) bool
}

func newCompactionIter(
	cmp Compare,
	merge Merge,
	iter internalIterator,
	snapshots []uint64,
	allowZeroSeqNum bool,
	elideTombstone func(key []byte) bool,
	elideRangeTombstone func(start, end []byte) bool,
) *compactionIter {
	i := &compactionIter{
		cmp:                 cmp,
		merge:               merge,
		iter:                iter,
		snapshots:           snapshots,
		allowZeroSeqNum:     allowZeroSeqNum,
		elideTombstone:      elideTombstone,
		elideRangeTombstone: elideRangeTombstone,
	}
	i.rangeDelFrag.Cmp = cmp
	i.rangeDelFrag.Emit = i.emitRangeDelChunk
	return i
}

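// exampleDrainCompactionIter is a hedged sketch, not part of the original
// API, showing how a compactionIter is typically driven: iterate from First
// to exhaustion, then drain the fragmented range tombstones. The emitPoint
// and emitTombstone callbacks are hypothetical stand-ins for writing to an
// output sstable.
func exampleDrainCompactionIter(
	i *compactionIter,
	emitPoint func(key InternalKey, value []byte),
	emitTombstone func(t rangedel.Tombstone),
) error {
	for key, val := i.First(); key != nil; key, val = i.Next() {
		emitPoint(*key, val)
	}
	// A nil key finishes fragmentation and flushes all remaining tombstones.
	for _, t := range i.Tombstones(nil) {
		emitTombstone(t)
	}
	return i.Close()
}
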
func (i *compactionIter) First() (*InternalKey, []byte) {
	if i.err != nil {
		return nil, nil
	}
	i.iterKey, i.iterValue = i.iter.First()
	if i.iterKey != nil {
		i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(i.iterKey.SeqNum(), i.snapshots)
	}
	return i.Next()
}

func (i *compactionIter) Next() (*InternalKey, []byte) {
	if i.err != nil {
		return nil, nil
	}

	if i.skip {
		i.skip = false
		i.skipStripe()
	}

	i.valid = false
	for i.iterKey != nil {
		i.key = *i.iterKey
		switch i.key.Kind() {
		case InternalKeyKindDelete:
			// If we're at the last snapshot stripe and the tombstone can be elided,
			// skip to the next stripe (which will be the next user key).
			if i.curSnapshotIdx == 0 && i.elideTombstone(i.key.UserKey) {
				i.saveKey()
				i.skipStripe()
				continue
			}

			i.saveKey()
			i.value = i.iterValue
			i.valid = true
			i.skip = true
			return &i.key, i.value

		case InternalKeyKindRangeDelete:
			i.key = i.cloneKey(i.key)
			i.rangeDelFrag.Add(i.key, i.iterValue)
			i.nextInStripe()
			continue

		case InternalKeyKindSet:
			if i.rangeDelFrag.Deleted(i.key, i.curSnapshotSeqNum) {
				i.saveKey()
				i.skipStripe()
				continue
			}

			i.saveKey()
			i.value = i.iterValue
			i.valid = true
			i.skip = true
			i.maybeZeroSeqnum()
			return &i.key, i.value

		case InternalKeyKindMerge:
			if i.rangeDelFrag.Deleted(i.key, i.curSnapshotSeqNum) {
				i.saveKey()
				i.skipStripe()
				continue
			}

			// NB: it is important to call maybeZeroSeqnum before mergeNext as
			// merging advances the iterator, adjusting curSnapshotIdx and thus
			// invalidating the state that maybeZeroSeqnum uses to make its
			// determination.
			i.maybeZeroSeqnum()
			return i.mergeNext()

		case InternalKeyKindInvalid:
			// NB: Invalid keys occur when there is some error parsing the key. Pass
			// them through unmodified.
			i.saveKey()
			i.saveValue()
			i.iterKey, i.iterValue = i.iter.Next()
			i.valid = true
			return &i.key, i.value

		default:
			i.err = fmt.Errorf("invalid internal key kind: %d", i.key.Kind())
			return nil, nil
		}
	}

	return nil, nil
}

// snapshotIndex returns the index of the first sequence number in snapshots
// which is strictly greater than seq, along with that sequence number. If seq
// is at or above every snapshot, it returns len(snapshots) and
// InternalKeySeqNumMax.
func snapshotIndex(seq uint64, snapshots []uint64) (int, uint64) {
	index := sort.Search(len(snapshots), func(i int) bool {
		return snapshots[i] > seq
	})
	if index >= len(snapshots) {
		return index, InternalKeySeqNumMax
	}
	return index, snapshots[index]
}

// skipStripe skips the remaining entries in the current snapshot stripe.
func (i *compactionIter) skipStripe() {
	for i.nextInStripe() {
	}
}

// nextInStripe advances the internal iterator and returns true if the
// resulting entry belongs to the same user key and snapshot stripe as the
// current entry, and false otherwise. Range tombstones are handed to the
// fragmenter and treated as part of the current stripe.
func (i *compactionIter) nextInStripe() bool {
	i.iterKey, i.iterValue = i.iter.Next()
	if i.iterKey == nil {
		return false
	}
	key := i.iterKey
	if i.cmp(i.key.UserKey, key.UserKey) != 0 {
		// A new user key always begins a new snapshot stripe.
		i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(key.SeqNum(), i.snapshots)
		return false
	}
	switch key.Kind() {
	case InternalKeyKindRangeDelete:
		// Range tombstones are always added to the fragmenter. They are processed
		// into stripes after fragmentation.
		i.rangeDelFrag.Add(i.cloneKey(*key), i.iterValue)
		return true
	case InternalKeyKindInvalid:
		i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(key.SeqNum(), i.snapshots)
		return false
	}
	if len(i.snapshots) == 0 {
		// No snapshots: the entire history of the user key is a single stripe.
		return true
	}
	idx, seqNum := snapshotIndex(key.SeqNum(), i.snapshots)
	if i.curSnapshotIdx == idx {
		// The snapshot index is unchanged, so the entry is in the same stripe.
		return true
	}
	i.curSnapshotIdx = idx
	i.curSnapshotSeqNum = seqNum
	return false
}

func (i *compactionIter) mergeNext() (*InternalKey, []byte) {
	// Save the current key and value.
	i.saveKey()
	i.saveValue()
	i.valid = true

	// Loop over the older values in the current snapshot stripe, merging them
	// in.
	for {
		if !i.nextInStripe() {
			i.skip = false
			return &i.key, i.value
		}
		key := i.iterKey
		switch key.Kind() {
		case InternalKeyKindDelete:
			// We've hit a deletion tombstone. Return everything up to this point and
			// then skip entries until the next snapshot stripe.
			i.valueBuf = i.value[:0]
			i.skip = true
			return &i.key, i.value

		case InternalKeyKindRangeDelete:
			// We've hit a range deletion tombstone. Return everything up to this
			// point and then skip entries until the next snapshot stripe.
			i.skip = true
			return &i.key, i.value

		case InternalKeyKindSet:
			if i.rangeDelFrag.Deleted(*key, i.curSnapshotSeqNum) {
				i.skip = true
				return &i.key, i.value
			}

			// We've hit a Set value. Merge with the existing value and return. We
			// change the kind of the resulting key to a Set so that it shadows keys
			// in lower levels. That is, MERGE+MERGE+SET -> SET.
			i.value = i.merge(i.key.UserKey, i.value, i.iterValue, nil)
			i.valueBuf = i.value[:0]
			i.key.SetKind(InternalKeyKindSet)
			i.skip = true
			return &i.key, i.value

		case InternalKeyKindMerge:
			if i.rangeDelFrag.Deleted(*key, i.curSnapshotSeqNum) {
				i.skip = true
				return &i.key, i.value
			}

			// We've hit another Merge value. Merge with the existing value and
			// continue looping.
			i.value = i.merge(i.key.UserKey, i.value, i.iterValue, nil)
			i.valueBuf = i.value[:0]

		default:
			i.err = fmt.Errorf("invalid internal key kind: %d", i.iterKey.Kind())
			return nil, nil
		}
	}
}

func (i *compactionIter) saveKey() {
	i.keyBuf = append(i.keyBuf[:0], i.iterKey.UserKey...)
	i.key.UserKey = i.keyBuf
}

func (i *compactionIter) saveValue() {
	i.valueBuf = append(i.valueBuf[:0], i.iterValue...)
	i.value = i.valueBuf
}

func (i *compactionIter) cloneKey(key InternalKey) InternalKey {
	i.alloc, key.UserKey = i.alloc.Copy(key.UserKey)
	return key
}

func (i *compactionIter) Key() InternalKey {
	return i.key
}

func (i *compactionIter) Value() []byte {
	return i.value
}

func (i *compactionIter) Valid() bool {
	return i.valid
}

func (i *compactionIter) Error() error {
	return i.err
}

func (i *compactionIter) Close() error {
	err := i.iter.Close()
	if i.err == nil {
		i.err = err
	}
	return i.err
}

// Tombstones returns the fragmented range tombstones produced so far. A nil
// key finishes fragmentation and returns all remaining tombstones; a non-nil
// key flushes the fragments up to key.
func (i *compactionIter) Tombstones(key []byte) []rangedel.Tombstone {
	if key == nil {
		i.rangeDelFrag.Finish()
	} else {
		i.rangeDelFrag.FlushTo(key)
	}
	tombstones := i.tombstones
	i.tombstones = nil
	return tombstones
}

func (i *compactionIter) emitRangeDelChunk(fragmented []rangedel.Tombstone) {
	// Apply the snapshot stripe rules, keeping only the latest tombstone for
	// each snapshot stripe.
	currentIdx := -1
	for _, v := range fragmented {
		idx, _ := snapshotIndex(v.Start.SeqNum(), i.snapshots)
		if currentIdx == idx {
			continue
		}
		if idx == 0 && i.elideRangeTombstone(v.Start.UserKey, v.End) {
			// This is the last snapshot stripe and the range tombstone can be
			// elided.
			break
		}

		i.tombstones = append(i.tombstones, v)
		if idx == 0 {
			// This is the last snapshot stripe.
			break
		}
		currentIdx = idx
	}
}

// maybeZeroSeqnum attempts to set the seqnum for the current key to 0. Doing
// so improves compression and enables an optimization during forward iteration
// to skip some key comparisons. The seqnum for an entry can be zeroed if the
// entry is in the bottom snapshot stripe and on the bottom level of the LSM.
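//
// For example, if the compaction is writing to the bottom level of the LSM
// and no snapshot has a sequence number at or below 5, the entry a.SET.5 can
// be rewritten as a.SET.0.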
func (i *compactionIter) maybeZeroSeqnum() {
	if !i.allowZeroSeqNum {
		// TODO(peter): allowZeroSeqNum applies to the entire compaction. We could
		// make the determination on a key by key basis, similar to what is done
		// for elideTombstone. Need to add a benchmark for compactionIter to verify
		// that isn't too expensive.
		return
	}
	if i.curSnapshotIdx > 0 {
		// This is not the last snapshot stripe.
		return
	}
	i.key.SetSeqNum(0)
}