github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/internal/rangekey/coalesce.go (about)

     1  // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package rangekey
     6  
     7  import (
     8  	"bytes"
     9  	"math"
    10  	"sort"
    11  
    12  	"github.com/cockroachdb/pebble/internal/base"
    13  	"github.com/cockroachdb/pebble/internal/invariants"
    14  	"github.com/cockroachdb/pebble/internal/keyspan"
    15  	"github.com/cockroachdb/pebble/internal/manifest"
    16  )
    17  
// UserIteratorConfig holds state for constructing the range key iterator stack
// for user iteration. The range key iterator must merge range key spans across
// the levels of the LSM. This merging is performed by a keyspan.MergingIter
// on-the-fly. The UserIteratorConfig implements keyspan.Transformer, evaluating
// range-key semantics and shadowing, so the spans returned by a MergingIter are
// fully resolved.
//
// The MergingIter is wrapped by a BoundedIter, which elides spans that are
// outside the iterator bounds (or the current prefix's bounds, during prefix
// iteration mode).
//
// To provide determinism during iteration, the BoundedIter is wrapped by a
// DefragmentingIter that defragments abutting spans with identical
// user-observable state.
//
// At the top-level an InterleavingIter interleaves range keys with point keys
// and performs truncation to iterator bounds.
//
// Below is an abbreviated diagram illustrating the mechanics of a SeekGE.
//
//	               InterleavingIter.SeekGE
//	                       │
//	            DefragmentingIter.SeekGE
//	                       │
//	               BoundedIter.SeekGE
//	                       │
//	      ╭────────────────┴───────────────╮
//	      │                                ├── defragmentBwd*
//	MergingIter.SeekGE                     │
//	      │                                ╰── defragmentFwd
//	      ╰─╶╶ per level╶╶ ─╮
//	                        │
//	                        │
//	                        ├── <?>.SeekLT
//	                        │
//	                        ╰── <?>.Next
type UserIteratorConfig struct {
	snapshot     uint64                                // sequence number at which Transform resolves visibility
	comparer     *base.Comparer                        // comparer supplied to Init
	miter        keyspan.MergingIter                   // merges the input iterators; uses ui as its Transformer
	biter        keyspan.BoundedIter                   // wraps miter; receives the bounds and prefix state
	diter        keyspan.DefragmentingIter             // wraps biter; this is the iterator Init returns
	liters       [manifest.NumLevels]keyspan.LevelIter // embedded storage handed out by NewLevelIter
	litersUsed   int                                   // count of liters entries already handed out
	internalKeys bool                                  // if true, Transform keeps all kinds, sorted by trailer
	bufs         *Buffers                              // scratch buffers supplied to Init
}
    65  
// Buffers holds various buffers used for range key iteration. They're exposed
// so that they may be pooled and reused between iterators.
type Buffers struct {
	merging       keyspan.MergingBuffers       // handed to the MergingIter by UserIteratorConfig.Init
	defragmenting keyspan.DefragmentingBuffers // handed to the DefragmentingIter by UserIteratorConfig.Init
	sortBuf       keyspan.KeysBySuffix         // scratch key slice reused across Transform calls
}
    73  
// PrepareForReuse discards any excessively large buffers. It delegates to the
// PrepareForReuse methods of the merging and defragmenting buffers; the
// sortBuf scratch slice is not touched here.
func (bufs *Buffers) PrepareForReuse() {
	bufs.merging.PrepareForReuse()
	bufs.defragmenting.PrepareForReuse()
}
    79  
    80  // Init initializes the range key iterator stack for user iteration. The
    81  // resulting fragment iterator applies range key semantics, defragments spans
    82  // according to their user-observable state and, if !internalKeys, removes all
    83  // Keys other than RangeKeySets describing the current state of range keys. The
    84  // resulting spans contain Keys sorted by suffix (unless internalKeys is true,
    85  // in which case they remain sorted by trailer descending).
    86  //
    87  // The snapshot sequence number parameter determines which keys are visible. Any
    88  // keys not visible at the provided snapshot are ignored.
    89  func (ui *UserIteratorConfig) Init(
    90  	comparer *base.Comparer,
    91  	snapshot uint64,
    92  	lower, upper []byte,
    93  	hasPrefix *bool,
    94  	prefix *[]byte,
    95  	internalKeys bool,
    96  	bufs *Buffers,
    97  	iters ...keyspan.FragmentIterator,
    98  ) keyspan.FragmentIterator {
    99  	ui.snapshot = snapshot
   100  	ui.comparer = comparer
   101  	ui.internalKeys = internalKeys
   102  	ui.miter.Init(comparer.Compare, ui, &bufs.merging, iters...)
   103  	ui.biter.Init(comparer.Compare, comparer.Split, &ui.miter, lower, upper, hasPrefix, prefix)
   104  	if internalKeys {
   105  		ui.diter.Init(comparer, &ui.biter, keyspan.DefragmentInternal, keyspan.StaticDefragmentReducer, &bufs.defragmenting)
   106  	} else {
   107  		ui.diter.Init(comparer, &ui.biter, ui, keyspan.StaticDefragmentReducer, &bufs.defragmenting)
   108  	}
   109  	ui.litersUsed = 0
   110  	ui.bufs = bufs
   111  	return &ui.diter
   112  }
   113  
// AddLevel adds a new level to the bottom of the iterator stack. AddLevel
// must be called after Init and before any other method on the iterator.
// The provided iterator is forwarded to the underlying MergingIter.
func (ui *UserIteratorConfig) AddLevel(iter keyspan.FragmentIterator) {
	ui.miter.AddLevel(iter)
}
   119  
   120  // NewLevelIter returns a pointer to a newly allocated or reused
   121  // keyspan.LevelIter. The caller is responsible for calling Init() on this
   122  // instance.
   123  func (ui *UserIteratorConfig) NewLevelIter() *keyspan.LevelIter {
   124  	if ui.litersUsed >= len(ui.liters) {
   125  		return &keyspan.LevelIter{}
   126  	}
   127  	ui.litersUsed++
   128  	return &ui.liters[ui.litersUsed-1]
   129  }
   130  
// SetBounds propagates bounds to the iterator stack. The fragment iterator
// interface ordinarily doesn't enforce bounds, so this is exposed as an
// explicit method on the user iterator config. The bounds are forwarded to
// the BoundedIter in the stack.
func (ui *UserIteratorConfig) SetBounds(lower, upper []byte) {
	ui.biter.SetBounds(lower, upper)
}
   137  
   138  // Transform implements the keyspan.Transformer interface for use with a
   139  // keyspan.MergingIter. It transforms spans by resolving range keys at the
   140  // provided snapshot sequence number. Shadowing of keys is resolved (eg, removal
   141  // of unset keys, removal of keys overwritten by a set at the same suffix, etc)
   142  // and then non-RangeKeySet keys are removed. The resulting transformed spans
   143  // only contain RangeKeySets describing the state visible at the provided
   144  // sequence number, and hold their Keys sorted by Suffix (except if internalKeys
   145  // is true, then keys remain sorted by trailer.
   146  func (ui *UserIteratorConfig) Transform(cmp base.Compare, s keyspan.Span, dst *keyspan.Span) error {
   147  	// Apply shadowing of keys.
   148  	dst.Start = s.Start
   149  	dst.End = s.End
   150  	ui.bufs.sortBuf = keyspan.KeysBySuffix{
   151  		Cmp:  cmp,
   152  		Keys: ui.bufs.sortBuf.Keys[:0],
   153  	}
   154  	if err := coalesce(ui.comparer.Equal, &ui.bufs.sortBuf, ui.snapshot, s.Keys); err != nil {
   155  		return err
   156  	}
   157  	if ui.internalKeys {
   158  		if s.KeysOrder != keyspan.ByTrailerDesc {
   159  			panic("unexpected key ordering in UserIteratorTransform with internalKeys = true")
   160  		}
   161  		dst.Keys = ui.bufs.sortBuf.Keys
   162  		keyspan.SortKeysByTrailer(&dst.Keys)
   163  		return nil
   164  	}
   165  	// During user iteration over range keys, unsets and deletes don't matter. This
   166  	// step helps logical defragmentation during iteration.
   167  	keys := ui.bufs.sortBuf.Keys
   168  	dst.Keys = dst.Keys[:0]
   169  	for i := range keys {
   170  		switch keys[i].Kind() {
   171  		case base.InternalKeyKindRangeKeySet:
   172  			if invariants.Enabled && len(dst.Keys) > 0 && cmp(dst.Keys[len(dst.Keys)-1].Suffix, keys[i].Suffix) > 0 {
   173  				panic("pebble: keys unexpectedly not in ascending suffix order")
   174  			}
   175  			dst.Keys = append(dst.Keys, keys[i])
   176  		case base.InternalKeyKindRangeKeyUnset:
   177  			if invariants.Enabled && len(dst.Keys) > 0 && cmp(dst.Keys[len(dst.Keys)-1].Suffix, keys[i].Suffix) > 0 {
   178  				panic("pebble: keys unexpectedly not in ascending suffix order")
   179  			}
   180  			// Skip.
   181  			continue
   182  		case base.InternalKeyKindRangeKeyDelete:
   183  			// Skip.
   184  			continue
   185  		default:
   186  			return base.CorruptionErrorf("pebble: unrecognized range key kind %s", keys[i].Kind())
   187  		}
   188  	}
   189  	// coalesce results in dst.Keys being sorted by Suffix.
   190  	dst.KeysOrder = keyspan.BySuffixAsc
   191  	return nil
   192  }
   193  
   194  // ShouldDefragment implements the DefragmentMethod interface and configures a
   195  // DefragmentingIter to defragment spans of range keys if their user-visible
   196  // state is identical. This defragmenting method assumes the provided spans have
   197  // already been transformed through (UserIterationConfig).Transform, so all
   198  // RangeKeySets are user-visible sets and are already in Suffix order. This
   199  // defragmenter checks for equality between set suffixes and values (ignoring
   200  // sequence numbers). It's intended for use during user iteration, when the
   201  // wrapped keyspan iterator is merging spans across all levels of the LSM.
   202  func (ui *UserIteratorConfig) ShouldDefragment(equal base.Equal, a, b *keyspan.Span) bool {
   203  	// This method is not called with internalKeys = true.
   204  	if ui.internalKeys {
   205  		panic("unexpected call to ShouldDefragment with internalKeys = true")
   206  	}
   207  	// This implementation must only be used on spans that have transformed by
   208  	// ui.Transform. The transform applies shadowing, removes all keys besides
   209  	// the resulting Sets and sorts the keys by suffix. Since shadowing has been
   210  	// applied, each Set must set a unique suffix. If the two spans are
   211  	// equivalent, they must have the same number of range key sets.
   212  	if len(a.Keys) != len(b.Keys) || len(a.Keys) == 0 {
   213  		return false
   214  	}
   215  	if a.KeysOrder != keyspan.BySuffixAsc || b.KeysOrder != keyspan.BySuffixAsc {
   216  		panic("pebble: range key span's keys unexpectedly not in ascending suffix order")
   217  	}
   218  
   219  	ret := true
   220  	for i := range a.Keys {
   221  		if invariants.Enabled {
   222  			if a.Keys[i].Kind() != base.InternalKeyKindRangeKeySet ||
   223  				b.Keys[i].Kind() != base.InternalKeyKindRangeKeySet {
   224  				panic("pebble: unexpected non-RangeKeySet during defragmentation")
   225  			}
   226  			if i > 0 && (ui.comparer.Compare(a.Keys[i].Suffix, a.Keys[i-1].Suffix) < 0 ||
   227  				ui.comparer.Compare(b.Keys[i].Suffix, b.Keys[i-1].Suffix) < 0) {
   228  				panic("pebble: range keys not ordered by suffix during defragmentation")
   229  			}
   230  		}
   231  		if !equal(a.Keys[i].Suffix, b.Keys[i].Suffix) {
   232  			ret = false
   233  			break
   234  		}
   235  		if !bytes.Equal(a.Keys[i].Value, b.Keys[i].Value) {
   236  			ret = false
   237  			break
   238  		}
   239  	}
   240  	return ret
   241  }
   242  
   243  // Coalesce imposes range key semantics and coalesces range keys with the same
   244  // bounds. Coalesce drops any keys shadowed by more recent sets, unsets or
   245  // deletes. Coalesce modifies the provided span's Keys slice, reslicing the
   246  // slice to remove dropped keys.
   247  //
   248  // Coalescence has subtle behavior with respect to sequence numbers. Coalesce
   249  // depends on a keyspan.Span's Keys being sorted in sequence number descending
   250  // order. The first key has the largest sequence number. The returned coalesced
   251  // span includes only the largest sequence number. All other sequence numbers
   252  // are forgotten. When a compaction constructs output range keys from a
   253  // coalesced span, it produces at most one RANGEKEYSET, one RANGEKEYUNSET and
   254  // one RANGEKEYDEL. Each one of these keys adopt the largest sequence number.
   255  //
   256  // This has the potentially surprising effect of 'promoting' a key to a higher
   257  // sequence number. This is okay, because:
   258  //   - There are no other overlapping keys within the coalesced span of
   259  //     sequence numbers (otherwise they would be in the compaction, due to
   260  //     the LSM invariant).
   261  //   - Range key sequence numbers are never compared to point key sequence
   262  //     numbers. Range keys and point keys have parallel existences.
   263  //   - Compactions only coalesce within snapshot stripes.
   264  //
   265  // Additionally, internal range keys at the same sequence number have subtle
   266  // mechanics:
   267  //   - RANGEKEYSETs shadow RANGEKEYUNSETs of the same suffix.
   268  //   - RANGEKEYDELs only apply to keys at lower sequence numbers.
   269  //
   270  // This is required for ingestion. Ingested sstables are assigned a single
   271  // sequence number for the file, at which all of the file's keys are visible.
   272  // The RANGEKEYSET, RANGEKEYUNSET and RANGEKEYDEL key kinds are ordered such
   273  // that among keys with equal sequence numbers (thus ordered by their kinds) the
   274  // keys do not affect one another. Ingested sstables are expected to be
   275  // consistent with respect to the set/unset suffixes: A given suffix should be
   276  // set or unset but not both.
   277  //
   278  // The resulting dst Keys slice is sorted by Trailer.
   279  func Coalesce(cmp base.Compare, eq base.Equal, keys []keyspan.Key, dst *[]keyspan.Key) error {
   280  	// TODO(jackson): Currently, Coalesce doesn't actually perform the sequence
   281  	// number promotion described in the comment above.
   282  	keysBySuffix := keyspan.KeysBySuffix{
   283  		Cmp:  cmp,
   284  		Keys: (*dst)[:0],
   285  	}
   286  	if err := coalesce(eq, &keysBySuffix, math.MaxUint64, keys); err != nil {
   287  		return err
   288  	}
   289  	// Update the span with the (potentially reduced) keys slice. coalesce left
   290  	// the keys in *dst sorted by suffix. Re-sort them by trailer.
   291  	*dst = keysBySuffix.Keys
   292  	keyspan.SortKeysByTrailer(dst)
   293  	return nil
   294  }
   295  
// coalesce resolves visibility and shadowing among keys and accumulates the
// surviving keys in keysBySuffix.Keys.
//
// keys must be ordered by trailer descending (checked when invariants are
// enabled). Keys not visible at snapshot are skipped, and only keys preceding
// the first visible RangeKeyDelete are considered. The remaining keys are
// appended to keysBySuffix.Keys, stably sorted via keysBySuffix, and reduced
// so that only the first key (the one with the largest trailer) for each
// suffix, as determined by equal, survives. If a RangeKeyDelete was
// encountered it is appended last.
//
// The function currently always returns nil.
func coalesce(
	equal base.Equal, keysBySuffix *keyspan.KeysBySuffix, snapshot uint64, keys []keyspan.Key,
) error {
	// First, enforce visibility and RangeKeyDelete mechanics. We only need to
	// consider the prefix of keys before and including the first
	// RangeKeyDelete. We also must skip any keys that aren't visible at the
	// provided snapshot sequence number.
	//
	// NB: Within a given sequence number, keys are ordered as:
	//   RangeKeySet > RangeKeyUnset > RangeKeyDelete
	// This is significant, because this ensures that a Set or Unset sharing a
	// sequence number with a Delete do not shadow each other.
	deleteIdx := -1
	for i := range keys {
		if invariants.Enabled && i > 0 && keys[i].Trailer > keys[i-1].Trailer {
			panic("pebble: invariant violation: span keys unordered")
		}
		if !keys[i].VisibleAt(snapshot) {
			continue
		}
		// Once a RangeKeyDelete is observed, we know it shadows all subsequent
		// keys and we can break early. We don't add the RangeKeyDelete key to
		// keysBySuffix.Keys yet, because we don't want a suffix-less key
		// that appeared earlier in the slice to elide it. It'll be added back
		// in at the end.
		if keys[i].Kind() == base.InternalKeyKindRangeKeyDelete {
			deleteIdx = i
			break
		}
		keysBySuffix.Keys = append(keysBySuffix.Keys, keys[i])
	}

	// Sort the accumulated keys by suffix. There may be duplicates within a
	// suffix, in which case the one with a larger trailer survives.
	//
	// We use a stable sort so that the first key with a given suffix is the
	// one with the highest Trailer (because the input `keys` was sorted by
	// trailer descending).
	sort.Stable(keysBySuffix)

	// Grab a handle of the full sorted slice, before reslicing
	// keysBySuffix.Keys to accumulate the final coalesced keys. Both slices
	// share the same backing array.
	sorted := keysBySuffix.Keys
	keysBySuffix.Keys = keysBySuffix.Keys[:0]

	var (
		// prevSuffix is updated on each iteration of the below loop, and
		// compared by the subsequent iteration to determine whether adjacent
		// keys are defined at the same suffix.
		prevSuffix []byte
		// shadowing is set to true once any Key is shadowed by another key.
		// When it's set to true—or after the loop if no keys are shadowed—the
		// keysBySuffix.Keys slice is resliced to contain the prefix of
		// unshadowed keys. This avoids copying them incrementally in the common
		// case of no shadowing.
		shadowing bool
	)
	for i := range sorted {
		if i > 0 && equal(prevSuffix, sorted[i].Suffix) {
			// Skip; this key is shadowed by the predecessor that had a larger
			// Trailer. If this is the first shadowed key, set shadowing=true
			// and reslice keysBySuffix.Keys to hold the entire unshadowed
			// prefix sorted[:i].
			if !shadowing {
				keysBySuffix.Keys = keysBySuffix.Keys[:i]
				shadowing = true
			}
			continue
		}
		prevSuffix = sorted[i].Suffix
		// Once a gap has opened up, surviving keys must be copied down. The
		// write index never exceeds i, so unread elements of sorted are not
		// overwritten.
		if shadowing {
			keysBySuffix.Keys = append(keysBySuffix.Keys, sorted[i])
		}
	}
	// If there was no shadowing, keysBySuffix.Keys is untouched. We can simply
	// set it to the existing `sorted` slice (also backed by keysBySuffix.Keys).
	if !shadowing {
		keysBySuffix.Keys = sorted
	}
	// If the original input `keys` slice contained a visible RangeKeyDelete,
	// add it after the suffix-ordered keys.
	if deleteIdx >= 0 {
		keysBySuffix.Keys = append(keysBySuffix.Keys, keys[deleteIdx])
	}
	return nil
}
   380  }