github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/internal/rangekey/coalesce.go (about) 1 // Copyright 2021 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package rangekey 6 7 import ( 8 "bytes" 9 "math" 10 "sort" 11 12 "github.com/cockroachdb/pebble/internal/base" 13 "github.com/cockroachdb/pebble/internal/invariants" 14 "github.com/cockroachdb/pebble/internal/keyspan" 15 "github.com/cockroachdb/pebble/internal/manifest" 16 ) 17 18 // UserIteratorConfig holds state for constructing the range key iterator stack 19 // for user iteration. The range key iterator must merge range key spans across 20 // the levels of the LSM. This merging is performed by a keyspan.MergingIter 21 // on-the-fly. The UserIteratorConfig implements keyspan.Transformer, evaluating 22 // range-key semantics and shadowing, so the spans returned by a MergingIter are 23 // fully resolved. 24 // 25 // The MergingIter is wrapped by a BoundedIter, which elides spans that are 26 // outside the iterator bounds (or the current prefix's bounds, during prefix 27 // iteration mode). 28 // 29 // To provide determinisim during iteration, the BoundedIter is wrapped by a 30 // DefragmentingIter that defragments abutting spans with identical 31 // user-observable state. 32 // 33 // At the top-level an InterleavingIter interleaves range keys with point keys 34 // and performs truncation to iterator bounds. 35 // 36 // Below is an abbreviated diagram illustrating the mechanics of a SeekGE. 37 // 38 // InterleavingIter.SeekGE 39 // │ 40 // DefragmentingIter.SeekGE 41 // │ 42 // BoundedIter.SeekGE 43 // │ 44 // ╭────────────────┴───────────────╮ 45 // │ ├── defragmentBwd* 46 // MergingIter.SeekGE │ 47 // │ ╰── defragmentFwd 48 // ╰─╶╶ per level╶╶ ─╮ 49 // │ 50 // │ 51 // ├── <?>.SeekLT 52 // │ 53 // ╰── <?>.Next 54 type UserIteratorConfig struct { 55 snapshot uint64 56 comparer *base.Comparer 57 miter keyspan.MergingIter 58 biter keyspan.BoundedIter 59 diter keyspan.DefragmentingIter 60 liters [manifest.NumLevels]keyspan.LevelIter 61 litersUsed int 62 internalKeys bool 63 bufs *Buffers 64 } 65 66 // Buffers holds various buffers used for range key iteration. They're exposed 67 // so that they may be pooled and reused between iterators. 68 type Buffers struct { 69 merging keyspan.MergingBuffers 70 defragmenting keyspan.DefragmentingBuffers 71 sortBuf keyspan.KeysBySuffix 72 } 73 74 // PrepareForReuse discards any excessively large buffers. 75 func (bufs *Buffers) PrepareForReuse() { 76 bufs.merging.PrepareForReuse() 77 bufs.defragmenting.PrepareForReuse() 78 } 79 80 // Init initializes the range key iterator stack for user iteration. The 81 // resulting fragment iterator applies range key semantics, defragments spans 82 // according to their user-observable state and, if !internalKeys, removes all 83 // Keys other than RangeKeySets describing the current state of range keys. The 84 // resulting spans contain Keys sorted by suffix (unless internalKeys is true, 85 // in which case they remain sorted by trailer descending). 86 // 87 // The snapshot sequence number parameter determines which keys are visible. Any 88 // keys not visible at the provided snapshot are ignored. 89 func (ui *UserIteratorConfig) Init( 90 comparer *base.Comparer, 91 snapshot uint64, 92 lower, upper []byte, 93 hasPrefix *bool, 94 prefix *[]byte, 95 internalKeys bool, 96 bufs *Buffers, 97 iters ...keyspan.FragmentIterator, 98 ) keyspan.FragmentIterator { 99 ui.snapshot = snapshot 100 ui.comparer = comparer 101 ui.internalKeys = internalKeys 102 ui.miter.Init(comparer.Compare, ui, &bufs.merging, iters...) 103 ui.biter.Init(comparer.Compare, comparer.Split, &ui.miter, lower, upper, hasPrefix, prefix) 104 if internalKeys { 105 ui.diter.Init(comparer, &ui.biter, keyspan.DefragmentInternal, keyspan.StaticDefragmentReducer, &bufs.defragmenting) 106 } else { 107 ui.diter.Init(comparer, &ui.biter, ui, keyspan.StaticDefragmentReducer, &bufs.defragmenting) 108 } 109 ui.litersUsed = 0 110 ui.bufs = bufs 111 return &ui.diter 112 } 113 114 // AddLevel adds a new level to the bottom of the iterator stack. AddLevel 115 // must be called after Init and before any other method on the iterator. 116 func (ui *UserIteratorConfig) AddLevel(iter keyspan.FragmentIterator) { 117 ui.miter.AddLevel(iter) 118 } 119 120 // NewLevelIter returns a pointer to a newly allocated or reused 121 // keyspan.LevelIter. The caller is responsible for calling Init() on this 122 // instance. 123 func (ui *UserIteratorConfig) NewLevelIter() *keyspan.LevelIter { 124 if ui.litersUsed >= len(ui.liters) { 125 return &keyspan.LevelIter{} 126 } 127 ui.litersUsed++ 128 return &ui.liters[ui.litersUsed-1] 129 } 130 131 // SetBounds propagates bounds to the iterator stack. The fragment iterator 132 // interface ordinarily doesn't enforce bounds, so this is exposed as an 133 // explicit method on the user iterator config. 134 func (ui *UserIteratorConfig) SetBounds(lower, upper []byte) { 135 ui.biter.SetBounds(lower, upper) 136 } 137 138 // Transform implements the keyspan.Transformer interface for use with a 139 // keyspan.MergingIter. It transforms spans by resolving range keys at the 140 // provided snapshot sequence number. Shadowing of keys is resolved (eg, removal 141 // of unset keys, removal of keys overwritten by a set at the same suffix, etc) 142 // and then non-RangeKeySet keys are removed. The resulting transformed spans 143 // only contain RangeKeySets describing the state visible at the provided 144 // sequence number, and hold their Keys sorted by Suffix (except if internalKeys 145 // is true, then keys remain sorted by trailer. 146 func (ui *UserIteratorConfig) Transform(cmp base.Compare, s keyspan.Span, dst *keyspan.Span) error { 147 // Apply shadowing of keys. 148 dst.Start = s.Start 149 dst.End = s.End 150 ui.bufs.sortBuf = keyspan.KeysBySuffix{ 151 Cmp: cmp, 152 Keys: ui.bufs.sortBuf.Keys[:0], 153 } 154 if err := coalesce(ui.comparer.Equal, &ui.bufs.sortBuf, ui.snapshot, s.Keys); err != nil { 155 return err 156 } 157 if ui.internalKeys { 158 if s.KeysOrder != keyspan.ByTrailerDesc { 159 panic("unexpected key ordering in UserIteratorTransform with internalKeys = true") 160 } 161 dst.Keys = ui.bufs.sortBuf.Keys 162 keyspan.SortKeysByTrailer(&dst.Keys) 163 return nil 164 } 165 // During user iteration over range keys, unsets and deletes don't matter. This 166 // step helps logical defragmentation during iteration. 167 keys := ui.bufs.sortBuf.Keys 168 dst.Keys = dst.Keys[:0] 169 for i := range keys { 170 switch keys[i].Kind() { 171 case base.InternalKeyKindRangeKeySet: 172 if invariants.Enabled && len(dst.Keys) > 0 && cmp(dst.Keys[len(dst.Keys)-1].Suffix, keys[i].Suffix) > 0 { 173 panic("pebble: keys unexpectedly not in ascending suffix order") 174 } 175 dst.Keys = append(dst.Keys, keys[i]) 176 case base.InternalKeyKindRangeKeyUnset: 177 if invariants.Enabled && len(dst.Keys) > 0 && cmp(dst.Keys[len(dst.Keys)-1].Suffix, keys[i].Suffix) > 0 { 178 panic("pebble: keys unexpectedly not in ascending suffix order") 179 } 180 // Skip. 181 continue 182 case base.InternalKeyKindRangeKeyDelete: 183 // Skip. 184 continue 185 default: 186 return base.CorruptionErrorf("pebble: unrecognized range key kind %s", keys[i].Kind()) 187 } 188 } 189 // coalesce results in dst.Keys being sorted by Suffix. 190 dst.KeysOrder = keyspan.BySuffixAsc 191 return nil 192 } 193 194 // ShouldDefragment implements the DefragmentMethod interface and configures a 195 // DefragmentingIter to defragment spans of range keys if their user-visible 196 // state is identical. This defragmenting method assumes the provided spans have 197 // already been transformed through (UserIterationConfig).Transform, so all 198 // RangeKeySets are user-visible sets and are already in Suffix order. This 199 // defragmenter checks for equality between set suffixes and values (ignoring 200 // sequence numbers). It's intended for use during user iteration, when the 201 // wrapped keyspan iterator is merging spans across all levels of the LSM. 202 func (ui *UserIteratorConfig) ShouldDefragment(equal base.Equal, a, b *keyspan.Span) bool { 203 // This method is not called with internalKeys = true. 204 if ui.internalKeys { 205 panic("unexpected call to ShouldDefragment with internalKeys = true") 206 } 207 // This implementation must only be used on spans that have transformed by 208 // ui.Transform. The transform applies shadowing, removes all keys besides 209 // the resulting Sets and sorts the keys by suffix. Since shadowing has been 210 // applied, each Set must set a unique suffix. If the two spans are 211 // equivalent, they must have the same number of range key sets. 212 if len(a.Keys) != len(b.Keys) || len(a.Keys) == 0 { 213 return false 214 } 215 if a.KeysOrder != keyspan.BySuffixAsc || b.KeysOrder != keyspan.BySuffixAsc { 216 panic("pebble: range key span's keys unexpectedly not in ascending suffix order") 217 } 218 219 ret := true 220 for i := range a.Keys { 221 if invariants.Enabled { 222 if a.Keys[i].Kind() != base.InternalKeyKindRangeKeySet || 223 b.Keys[i].Kind() != base.InternalKeyKindRangeKeySet { 224 panic("pebble: unexpected non-RangeKeySet during defragmentation") 225 } 226 if i > 0 && (ui.comparer.Compare(a.Keys[i].Suffix, a.Keys[i-1].Suffix) < 0 || 227 ui.comparer.Compare(b.Keys[i].Suffix, b.Keys[i-1].Suffix) < 0) { 228 panic("pebble: range keys not ordered by suffix during defragmentation") 229 } 230 } 231 if !equal(a.Keys[i].Suffix, b.Keys[i].Suffix) { 232 ret = false 233 break 234 } 235 if !bytes.Equal(a.Keys[i].Value, b.Keys[i].Value) { 236 ret = false 237 break 238 } 239 } 240 return ret 241 } 242 243 // Coalesce imposes range key semantics and coalesces range keys with the same 244 // bounds. Coalesce drops any keys shadowed by more recent sets, unsets or 245 // deletes. Coalesce modifies the provided span's Keys slice, reslicing the 246 // slice to remove dropped keys. 247 // 248 // Coalescence has subtle behavior with respect to sequence numbers. Coalesce 249 // depends on a keyspan.Span's Keys being sorted in sequence number descending 250 // order. The first key has the largest sequence number. The returned coalesced 251 // span includes only the largest sequence number. All other sequence numbers 252 // are forgotten. When a compaction constructs output range keys from a 253 // coalesced span, it produces at most one RANGEKEYSET, one RANGEKEYUNSET and 254 // one RANGEKEYDEL. Each one of these keys adopt the largest sequence number. 255 // 256 // This has the potentially surprising effect of 'promoting' a key to a higher 257 // sequence number. This is okay, because: 258 // - There are no other overlapping keys within the coalesced span of 259 // sequence numbers (otherwise they would be in the compaction, due to 260 // the LSM invariant). 261 // - Range key sequence numbers are never compared to point key sequence 262 // numbers. Range keys and point keys have parallel existences. 263 // - Compactions only coalesce within snapshot stripes. 264 // 265 // Additionally, internal range keys at the same sequence number have subtle 266 // mechanics: 267 // - RANGEKEYSETs shadow RANGEKEYUNSETs of the same suffix. 268 // - RANGEKEYDELs only apply to keys at lower sequence numbers. 269 // 270 // This is required for ingestion. Ingested sstables are assigned a single 271 // sequence number for the file, at which all of the file's keys are visible. 272 // The RANGEKEYSET, RANGEKEYUNSET and RANGEKEYDEL key kinds are ordered such 273 // that among keys with equal sequence numbers (thus ordered by their kinds) the 274 // keys do not affect one another. Ingested sstables are expected to be 275 // consistent with respect to the set/unset suffixes: A given suffix should be 276 // set or unset but not both. 277 // 278 // The resulting dst Keys slice is sorted by Trailer. 279 func Coalesce(cmp base.Compare, eq base.Equal, keys []keyspan.Key, dst *[]keyspan.Key) error { 280 // TODO(jackson): Currently, Coalesce doesn't actually perform the sequence 281 // number promotion described in the comment above. 282 keysBySuffix := keyspan.KeysBySuffix{ 283 Cmp: cmp, 284 Keys: (*dst)[:0], 285 } 286 if err := coalesce(eq, &keysBySuffix, math.MaxUint64, keys); err != nil { 287 return err 288 } 289 // Update the span with the (potentially reduced) keys slice. coalesce left 290 // the keys in *dst sorted by suffix. Re-sort them by trailer. 291 *dst = keysBySuffix.Keys 292 keyspan.SortKeysByTrailer(dst) 293 return nil 294 } 295 296 func coalesce( 297 equal base.Equal, keysBySuffix *keyspan.KeysBySuffix, snapshot uint64, keys []keyspan.Key, 298 ) error { 299 // First, enforce visibility and RangeKeyDelete mechanics. We only need to 300 // consider the prefix of keys before and including the first 301 // RangeKeyDelete. We also must skip any keys that aren't visible at the 302 // provided snapshot sequence number. 303 // 304 // NB: Within a given sequence number, keys are ordered as: 305 // RangeKeySet > RangeKeyUnset > RangeKeyDelete 306 // This is significant, because this ensures that a Set or Unset sharing a 307 // sequence number with a Delete do not shadow each other. 308 deleteIdx := -1 309 for i := range keys { 310 if invariants.Enabled && i > 0 && keys[i].Trailer > keys[i-1].Trailer { 311 panic("pebble: invariant violation: span keys unordered") 312 } 313 if !keys[i].VisibleAt(snapshot) { 314 continue 315 } 316 // Once a RangeKeyDelete is observed, we know it shadows all subsequent 317 // keys and we can break early. We don't add the RangeKeyDelete key to 318 // keysBySuffix.keys yet, because we don't want a suffix-less key 319 // that appeared earlier in the slice to elide it. It'll be added back 320 // in at the end. 321 if keys[i].Kind() == base.InternalKeyKindRangeKeyDelete { 322 deleteIdx = i 323 break 324 } 325 keysBySuffix.Keys = append(keysBySuffix.Keys, keys[i]) 326 } 327 328 // Sort the accumulated keys by suffix. There may be duplicates within a 329 // suffix, in which case the one with a larger trailer survives. 330 // 331 // We use a stable sort so that the first key with a given suffix is the one 332 // that with the highest Trailer (because the input `keys` was sorted by 333 // trailer descending). 334 sort.Stable(keysBySuffix) 335 336 // Grab a handle of the full sorted slice, before reslicing 337 // keysBySuffix.keys to accumulate the final coalesced keys. 338 sorted := keysBySuffix.Keys 339 keysBySuffix.Keys = keysBySuffix.Keys[:0] 340 341 var ( 342 // prevSuffix is updated on each iteration of the below loop, and 343 // compared by the subsequent iteration to determine whether adjacent 344 // keys are defined at the same suffix. 345 prevSuffix []byte 346 // shadowing is set to true once any Key is shadowed by another key. 347 // When it's set to true—or after the loop if no keys are shadowed—the 348 // keysBySuffix.keys slice is resliced to contain the prefix of 349 // unshadowed keys. This avoids copying them incrementally in the common 350 // case of no shadowing. 351 shadowing bool 352 ) 353 for i := range sorted { 354 if i > 0 && equal(prevSuffix, sorted[i].Suffix) { 355 // Skip; this key is shadowed by the predecessor that had a larger 356 // Trailer. If this is the first shadowed key, set shadowing=true 357 // and reslice keysBySuffix.keys to hold the entire unshadowed 358 // prefix. 359 if !shadowing { 360 keysBySuffix.Keys = keysBySuffix.Keys[:i] 361 shadowing = true 362 } 363 continue 364 } 365 prevSuffix = sorted[i].Suffix 366 if shadowing { 367 keysBySuffix.Keys = append(keysBySuffix.Keys, sorted[i]) 368 } 369 } 370 // If there was no shadowing, keysBySuffix.keys is untouched. We can simply 371 // set it to the existing `sorted` slice (also backed by keysBySuffix.keys). 372 if !shadowing { 373 keysBySuffix.Keys = sorted 374 } 375 // If the original input `keys` slice contained a RangeKeyDelete, add it. 376 if deleteIdx >= 0 { 377 keysBySuffix.Keys = append(keysBySuffix.Keys, keys[deleteIdx]) 378 } 379 return nil 380 }