github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/compaction_iter.go

// Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
	"sort"
	"strconv"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/bytealloc"
	"github.com/cockroachdb/pebble/internal/keyspan"
	"github.com/cockroachdb/pebble/internal/rangekey"
	"github.com/cockroachdb/redact"
)

// compactionIter provides a forward-only iterator that encapsulates the logic
// for collapsing entries during compaction. It wraps an internal iterator and
// collapses entries that are no longer necessary because they are shadowed by
// newer entries. The simplest example of this is when the internal iterator
// contains two keys: a.PUT.2 and a.PUT.1. Instead of returning both entries,
// compactionIter collapses the second entry because it is no longer
// necessary. The high-level structure for compactionIter is to iterate over
// its internal iterator and output 1 entry for every user-key. There are four
// complications to this story.
//
// 1. Eliding Deletion Tombstones
//
// Consider the entries a.DEL.2 and a.PUT.1. These entries collapse to
// a.DEL.2. Do we have to output the entry a.DEL.2? Only if a.DEL.2 possibly
// shadows an entry at a lower level. If we're compacting to the base-level in
// the LSM tree then a.DEL.2 is definitely not shadowing an entry at a lower
// level and can be elided.
//
// We can do slightly better than only eliding deletion tombstones at the base
// level by observing that we can elide a deletion tombstone if there are no
// sstables that contain the entry's key. This check is performed by
// elideTombstone.
//
// 2. Merges
//
// The MERGE operation merges the value for an entry with the existing value
// for an entry. The logical value of an entry can be composed of a series of
// merge operations. When compactionIter sees a MERGE, it scans forward in its
// internal iterator collapsing MERGE operations for the same key until it
// encounters a SET or DELETE operation. For example, the keys a.MERGE.4,
// a.MERGE.3, a.MERGE.2 will be collapsed to a.MERGE.4 and the values will be
// merged using the specified Merger.
//
// An interesting case here occurs when MERGE is combined with SET. Consider
// the entries a.MERGE.3 and a.SET.2. The collapsed key will be a.SET.3. The
// reason that the kind is changed to SET is because the SET operation acts as
// a barrier preventing further merging. This can be seen better in the
// scenario a.MERGE.3, a.SET.2, a.MERGE.1. The entry a.MERGE.1 may be at a
// lower (older) level and not involved in the compaction. If the compaction
// of a.MERGE.3 and a.SET.2 produced a.MERGE.3, a subsequent compaction with
// a.MERGE.1 would merge the values together incorrectly.
//
// 3. Snapshots
//
// Snapshots are lightweight point-in-time views of the DB state. At its core,
// a snapshot is a sequence number along with a guarantee from Pebble that it
// will maintain the view of the database at that sequence number. Part of this
// guarantee is relatively straightforward to achieve.
// When reading from the
// database Pebble will ignore sequence numbers that are larger than the
// snapshot sequence number. The primary complexity with snapshots occurs
// during compaction: the collapsing of entries that are shadowed by newer
// entries is at odds with the guarantee that Pebble will maintain the view of
// the database at the snapshot sequence number. Rather than collapsing entries
// up to the next user key, compactionIter can only collapse entries up to the
// next snapshot boundary. That is, every snapshot boundary potentially causes
// another entry for the same user-key to be emitted. Another way to view this
// is that snapshots define stripes and entries are collapsed within stripes,
// but not across stripes. Consider the following scenario:
//
//	a.PUT.9
//	a.DEL.8
//	a.PUT.7
//	a.DEL.6
//	a.PUT.5
//
// In the absence of snapshots these entries would be collapsed to
// a.PUT.9. What if there is a snapshot at sequence number 7? The entries can
// be divided into two stripes and collapsed within the stripes:
//
//	a.PUT.9        a.PUT.9
//	a.DEL.8  --->
//	a.PUT.7
//	--             --
//	a.DEL.6  --->  a.DEL.6
//	a.PUT.5
//
// All of the rules described earlier still apply, but they are confined to
// operate within a snapshot stripe. Snapshots only affect compaction when the
// snapshot sequence number lies within the range of sequence numbers being
// compacted. In the above example, a snapshot at sequence number 10 or at
// sequence number 5 would not have any effect.
//
// 4. Range Deletions
//
// Range deletions provide the ability to delete all of the keys (and values)
// in a contiguous range. Range deletions are stored indexed by their start
// key. The end key of the range is stored in the value. In order to support
// lookup of the range deletions which overlap with a particular key, the range
// deletion tombstones need to be fragmented whenever they overlap. This
// fragmentation is performed by keyspan.Fragmenter. The fragments are then
// subject to the rules for snapshots. For example, consider the two range
// tombstones [a,e)#1 and [c,g)#2:
//
//	2:     c-------g
//	1: a-------e
//
// These tombstones will be fragmented into:
//
//	2:     c---e---g
//	1: a---c---e
//
// Do we output the fragment [c,e)#1? Since it is covered by [c,e)#2 the answer
// depends on whether it is in a new snapshot stripe.
//
// In addition to the fragmentation of range tombstones, compaction also needs
// to take the range tombstones into consideration when outputting normal
// keys. Just as with point deletions, a range deletion covering an entry can
// cause the entry to be elided.
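//
// To make the stripe rule concrete, suppose the only open snapshot has
// sequence number 2. Then (using snapshotIndex, defined below) the two
// fragments at [c,e) land in different stripes, so both must be kept:
//
//	snapshotIndex(1, []uint64{2}) // == 0: [c,e)#1 is in the last stripe
//	snapshotIndex(2, []uint64{2}) // == 1: [c,e)#2 is in a newer stripe
//
// With no open snapshots, both fragments share a stripe and [c,e)#1 is
// dropped in favor of [c,e)#2.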
//
// A note on the stability of keys and values.
//
// The stability guarantees of keys and values returned by the iterator tree
// that backs a compactionIter are nuanced and care must be taken when
// referencing any returned items.
//
// Keys and values returned by exported functions (i.e. First, Next, etc.) have
// lifetimes that fall into two categories:
//
// Lifetime valid for duration of compaction. Range deletion keys and values
// are stable for the duration of the compaction, due to the way in which a
// compactionIter is typically constructed (i.e. via (*compaction).newInputIter,
// which wraps the iterator over the range deletion block in a noCloseIter,
// preventing the release of the backing memory until the compaction is
// finished).
//
// Lifetime limited to duration of sstable block liveness. Point keys (SET, DEL,
// etc.) and values must be cloned / copied following the return from the
// exported function, and before a subsequent call to Next advances the iterator
// and mutates the contents of the returned key and value.
type compactionIter struct {
	equal Equal
	merge Merge
	iter  internalIterator
	err   error
	// `key.UserKey` is set to `keyBuf` (which is populated by saving
	// `i.iterKey.UserKey`) and `key.Trailer` is set to `i.iterKey.Trailer`.
	// This is the case on return from all public methods -- these methods
	// return `key`. Additionally, it is the internal state when the code is
	// moving to the next key so it can determine whether the user key has
	// changed from the previous key.
	key InternalKey
	// keyTrailer is updated when `i.key` is updated and holds the key's
	// original trailer (e.g., before any sequence-number zeroing or changes
	// to key kind).
	keyTrailer  uint64
	value       []byte
	valueCloser io.Closer
	// Temporary buffer used for storing the previous user key in order to
	// determine when iteration has advanced to a new user key and thus a new
	// snapshot stripe.
	keyBuf []byte
	// Temporary buffer used for storing the previous value, which may be an
	// unsafe, i.iter-owned slice that could be altered when the iterator is
	// advanced.
	valueBuf []byte
	// Is the current entry valid?
	valid            bool
	iterKey          *InternalKey
	iterValue        []byte
	iterStripeChange stripeChangeType
	// `skip` indicates whether the remaining skippable entries in the current
	// snapshot stripe should be skipped or processed. An example of a non-
	// skippable entry is a range tombstone as we need to return it from the
	// `compactionIter`, even if a key covering its start key has already been
	// seen in the same stripe. `skip` has no effect when `pos == iterPosNext`.
	//
	// TODO(jackson): If we use keyspan.InterleavingIter for range deletions,
	// like we do for range keys, the only remaining 'non-skippable' key is
	// the invalid key. We should be able to simplify this logic and remove
	// this field.
	skip bool
	// `pos` indicates the iterator position at the top of `Next()`. Its type's
	// (`iterPos`) values take on the following meanings in the context of
	// `compactionIter`.
	//
	// - `iterPosCur`: the iterator is at the last key returned.
	// - `iterPosNext`: the iterator has already been advanced to the next
	//   candidate key. For example, this happens when processing merge
	//   operands, where we advance the iterator all the way into the next
	//   stripe or next user key to ensure we've seen all mergeable operands.
	// - `iterPosPrev`: this is invalid as compactionIter is forward-only.
	pos iterPos
	// `snapshotPinned` indicates whether the last point key returned by the
	// compaction iterator was only returned because an open snapshot prevents
	// its elision. This field only applies to point keys, and not to range
	// deletions or range keys.
	//
	// For MERGE, it is possible that doing the merge is interrupted even when
	// the next point key is in the same stripe.
	// This can happen if the loop in
	// mergeNext gets interrupted by sameStripeNonSkippable.
	// sameStripeNonSkippable occurs due to RANGEDELs that sort before
	// SET/MERGE/DEL with the same seqnum, so the RANGEDEL does not necessarily
	// delete the subsequent SET/MERGE/DEL keys.
	snapshotPinned bool
	// forceObsoleteDueToRangeDel is set to true in a subset of the cases that
	// snapshotPinned is true. This value is true when the point is obsolete due
	// to a RANGEDEL but could not be deleted due to a snapshot.
	//
	// NB: it may seem that the additional cases that snapshotPinned captures
	// are harmless in that they can also be used to mark a point as obsolete
	// (it is merely a duplication of some logic that happens in
	// Writer.AddWithForceObsolete), but that is not quite accurate as of this
	// writing -- snapshotPinned originated in stats collection and for a
	// sequence MERGE, SET, where the MERGE cannot merge with the (older) SET
	// due to a snapshot, the snapshotPinned value for the SET is true.
	//
	// TODO(sumeer,jackson): improve the logic of snapshotPinned and reconsider
	// whether we need forceObsoleteDueToRangeDel.
	forceObsoleteDueToRangeDel bool
	// The index of the snapshot for the current key within the snapshots slice.
	curSnapshotIdx    int
	curSnapshotSeqNum uint64
	// The snapshot sequence numbers that need to be maintained. These sequence
	// numbers define the snapshot stripes (see the Snapshots description
	// above). The sequence numbers are in ascending order.
	snapshots []uint64
	// frontiers holds a heap of user keys that affect compaction behavior when
	// they're exceeded. Before a new key is returned, the compaction iterator
	// advances the frontier, notifying any code that subscribed to be notified
	// when a key was reached. The primary use today is within the
	// implementation of compactionOutputSplitters in compaction.go. Many of
	// these splitters wait for the compaction iterator to call Advance(k) when
	// it's returning a new key. If the key that they're waiting for is
	// surpassed, these splitters update internal state recording that they
	// should request a compaction split next time they're asked in
	// [shouldSplitBefore].
	frontiers frontiers
	// Reference to the range deletion tombstone fragmenter (e.g.,
	// `compaction.rangeDelFrag`).
	rangeDelFrag *keyspan.Fragmenter
	rangeKeyFrag *keyspan.Fragmenter
	// The fragmented tombstones.
	tombstones []keyspan.Span
	// The fragmented range keys.
	rangeKeys []keyspan.Span
	// Byte allocator for the tombstone keys.
	alloc               bytealloc.A
	allowZeroSeqNum     bool
	elideTombstone      func(key []byte) bool
	elideRangeTombstone func(start, end []byte) bool
	ineffectualSingleDeleteCallback        func(userKey []byte)
	singleDeleteInvariantViolationCallback func(userKey []byte)
	// The on-disk format major version. This informs the types of keys that
	// may be written to disk during a compaction.
	formatVersion FormatMajorVersion
	stats         struct {
		// count of DELSIZED keys that were missized.
		countMissizedDels uint64
	}
}
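
// A minimal sketch of the call pattern (the real wiring lives in
// compaction.go; the argument names below are placeholders, not values from
// this package):
//
//	iter := newCompactionIter(cmp, equal, formatKey, merge, pointIter,
//		snapshots, &rangeDelFrag, &rangeKeyFrag, allowZeroSeqNum,
//		elideTombstone, elideRangeTombstone, nil, nil, formatVersion)
//	for key, val := iter.First(); key != nil; key, val = iter.Next() {
//		// Per the stability note above, point keys and values must be
//		// copied before the next call to Next.
//	}
//	if err := iter.Close(); err != nil {
//		// Close surfaces any error encountered during iteration.
//	}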

func newCompactionIter(
	cmp Compare,
	equal Equal,
	formatKey base.FormatKey,
	merge Merge,
	iter internalIterator,
	snapshots []uint64,
	rangeDelFrag *keyspan.Fragmenter,
	rangeKeyFrag *keyspan.Fragmenter,
	allowZeroSeqNum bool,
	elideTombstone func(key []byte) bool,
	elideRangeTombstone func(start, end []byte) bool,
	ineffectualSingleDeleteCallback func(userKey []byte),
	singleDeleteInvariantViolationCallback func(userKey []byte),
	formatVersion FormatMajorVersion,
) *compactionIter {
	i := &compactionIter{
		equal:               equal,
		merge:               merge,
		iter:                iter,
		snapshots:           snapshots,
		frontiers:           frontiers{cmp: cmp},
		rangeDelFrag:        rangeDelFrag,
		rangeKeyFrag:        rangeKeyFrag,
		allowZeroSeqNum:     allowZeroSeqNum,
		elideTombstone:      elideTombstone,
		elideRangeTombstone: elideRangeTombstone,
		ineffectualSingleDeleteCallback:        ineffectualSingleDeleteCallback,
		singleDeleteInvariantViolationCallback: singleDeleteInvariantViolationCallback,
		formatVersion:                          formatVersion,
	}
	i.rangeDelFrag.Cmp = cmp
	i.rangeDelFrag.Format = formatKey
	i.rangeDelFrag.Emit = i.emitRangeDelChunk
	i.rangeKeyFrag.Cmp = cmp
	i.rangeKeyFrag.Format = formatKey
	i.rangeKeyFrag.Emit = i.emitRangeKeyChunk
	return i
}

func (i *compactionIter) First() (*InternalKey, []byte) {
	if i.err != nil {
		return nil, nil
	}
	var iterValue LazyValue
	i.iterKey, iterValue = i.iter.First()
	i.iterValue, _, i.err = iterValue.Value(nil)
	if i.err != nil {
		return nil, nil
	}
	if i.iterKey != nil {
		i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(i.iterKey.SeqNum(), i.snapshots)
	}
	i.pos = iterPosNext
	i.iterStripeChange = newStripeNewKey
	return i.Next()
}

func (i *compactionIter) Next() (*InternalKey, []byte) {
	if i.err != nil {
		return nil, nil
	}

	// Close the closer for the current value if one was open.
	if i.closeValueCloser() != nil {
		return nil, nil
	}

	// Prior to this call to `Next()` we are in one of four situations with
	// respect to `iterKey` and related state:
	//
	// - `!skip && pos == iterPosNext`: `iterKey` is already at the next key.
	// - `!skip && pos == iterPosCurForward`: We are at the key that has been
	//   returned. To move forward we advance by one key, even if that lands us
	//   in the same snapshot stripe.
	// - `skip && pos == iterPosCurForward`: We are at the key that has been
	//   returned. To move forward we skip skippable entries in the stripe.
	// - `skip && pos == iterPosNext && i.iterStripeChange == sameStripeNonSkippable`:
	//   This case may occur when skipping within a snapshot stripe and we
	//   encounter either:
	//   a) an invalid key kind: the previous call will have returned
	//      whatever key it was processing and deferred handling of the
	//      invalid key to this invocation of Next(). We're responsible for
	//      ignoring skip=true and falling into the invalid key kind case
	//      down below.
	//   b) an interleaved range delete: this is a wart of the current code
	//      structure. While skipping within a snapshot stripe, a range
	//      delete interleaved at its start key and sequence number
	//      interrupts the sequence of point keys. After we return the range
	//      delete to the caller, we need to pick up skipping at where we
	//      left off, so we preserve skip=true.
	//   TODO(jackson): This last case is confusing and can be removed if we
	//   interleave range deletions at the maximal sequence number using the
	//   keyspan interleaving iterator. This is the treatment given to range
	//   keys today.
	if i.pos == iterPosCurForward {
		if i.skip {
			i.skipInStripe()
		} else {
			i.nextInStripe()
		}
	} else if i.skip {
		if i.iterStripeChange != sameStripeNonSkippable {
			panic(errors.AssertionFailedf("compaction iterator has skip=true, but iterator is at iterPosNext"))
		}
	}

	i.pos = iterPosCurForward
	i.valid = false

	for i.iterKey != nil {
		// If we entered a new snapshot stripe with the same key, any key we
		// return on this iteration is only returned because the open snapshot
		// prevented it from being elided or merged with the key returned for
		// the previous stripe. Mark it as pinned so that the compaction loop
		// can correctly populate output tables' pinned statistics. We might
		// also set snapshotPinned=true down below if we observe that the key is
		// deleted by a range deletion in a higher stripe or that this key is a
		// tombstone that could be elided if only it were in the last snapshot
		// stripe.
		i.snapshotPinned = i.iterStripeChange == newStripeSameKey

		if i.iterKey.Kind() == InternalKeyKindRangeDelete || rangekey.IsRangeKey(i.iterKey.Kind()) {
			// Return the span so the compaction can use it for file truncation and add
			// it to the relevant fragmenter. We do not set `skip` to true before
			// returning as there may be a forthcoming point key with the same user key
			// and sequence number. Such a point key must be visible (i.e., not skipped
			// over) since we promise point keys are not deleted by range tombstones at
			// the same sequence number.
			//
			// Note, though, that `skip` may already be true before reaching here
			// due to an earlier key in the stripe. Then it is fine to leave it set
			// to true, as the earlier key must have had a higher sequence number.
			//
			// NOTE: there is a subtle invariant violation here in that calling
			// saveKey and returning a reference to the temporary slice violates
			// the stability guarantee for range deletion keys. A potential
			// mitigation could return the original iterKey and iterValue
			// directly, as the backing memory is guaranteed to be stable until
			// the compaction completes. The violation here is only minor in
			// that the caller immediately clones the range deletion InternalKey
			// when passing the key to the deletion fragmenter (see the
			// call-site in compaction.go).
			// TODO(travers): address this violation by removing the call to
			// saveKey and instead return the original iterKey and iterValue.
			// This goes against the comment on i.key in the struct, and
			// therefore warrants some investigation.
			i.saveKey()
			// TODO(jackson): Handle tracking pinned statistics for range keys
			// and range deletions. This would require updating
			// emitRangeDelChunk and rangeKeyCompactionTransform to update
			// statistics when they apply their own snapshot striping logic.
			i.snapshotPinned = false
			i.value = i.iterValue
			i.valid = true
			return &i.key, i.value
		}

		// TODO(sumeer): we could avoid calling Covers if i.iterStripeChange ==
		// sameStripeSkippable since that check has already been done in
		// nextInStripeHelper. However, we also need to handle the case of
		// CoversInvisibly below.
		if cover := i.rangeDelFrag.Covers(*i.iterKey, i.curSnapshotSeqNum); cover == keyspan.CoversVisibly {
			// A pending range deletion deletes this key. Skip it.
			i.saveKey()
			i.skipInStripe()
			continue
		} else if cover == keyspan.CoversInvisibly {
			// i.iterKey would be deleted by a range deletion if there weren't
			// any open snapshots. Mark it as pinned.
			//
			// NB: there are multiple places in this file where we call
			// i.rangeDelFrag.Covers and this is the only one where we are writing
			// to i.snapshotPinned. Those other cases occur in mergeNext where the
			// caller is deciding whether the value should be merged or not, and the
			// key is in the same snapshot stripe. Hence, snapshotPinned is by
			// definition false in those cases.
			i.snapshotPinned = true
			i.forceObsoleteDueToRangeDel = true
		} else {
			i.forceObsoleteDueToRangeDel = false
		}

		switch i.iterKey.Kind() {
		case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
			if i.elideTombstone(i.iterKey.UserKey) {
				if i.curSnapshotIdx == 0 {
					// If we're at the last snapshot stripe and the tombstone
					// can be elided, skip skippable keys in the same stripe.
					i.saveKey()
					if i.key.Kind() == InternalKeyKindSingleDelete {
						i.skipDueToSingleDeleteElision()
					} else {
						i.skipInStripe()
						if !i.skip && i.iterStripeChange != newStripeNewKey {
							panic(errors.AssertionFailedf("pebble: skipInStripe in last stripe disabled skip without advancing to new key"))
						}
					}
					if i.iterStripeChange == newStripeSameKey {
						panic(errors.AssertionFailedf("pebble: skipInStripe in last stripe found a new stripe within the same key"))
					}
					continue
				} else {
					// We're not at the last snapshot stripe, so the tombstone
					// can NOT yet be elided. Mark it as pinned, so that it's
					// included in table statistics appropriately.
					i.snapshotPinned = true
				}
			}

			switch i.iterKey.Kind() {
			case InternalKeyKindDelete:
				i.saveKey()
				i.value = i.iterValue
				i.valid = true
				i.skip = true
				return &i.key, i.value

			case InternalKeyKindDeleteSized:
				// We may skip subsequent keys because of this tombstone. Scan
				// ahead to see just how much data this tombstone drops and if
				// the tombstone's value should be updated accordingly.
				return i.deleteSizedNext()

			case InternalKeyKindSingleDelete:
				if i.singleDeleteNext() {
					return &i.key, i.value
				} else if i.err != nil {
					return nil, nil
				}
				continue

			default:
				panic(errors.AssertionFailedf(
					"unexpected kind %s", redact.SafeString(i.iterKey.Kind().String())))
			}

		case InternalKeyKindSet, InternalKeyKindSetWithDelete:
			// The key we emit for this entry is a function of the current key
			// kind, and whether this entry is followed by a DEL/SINGLEDEL
			// entry. setNext() does the work to move the iterator forward,
			// preserving the original value, and potentially mutating the key
			// kind.
			i.setNext()
			if i.err != nil {
				return nil, nil
			}
			return &i.key, i.value

		case InternalKeyKindMerge:
			// Record the snapshot index before mergeNext as merging
			// advances the iterator, adjusting curSnapshotIdx.
			origSnapshotIdx := i.curSnapshotIdx
			var valueMerger ValueMerger
			valueMerger, i.err = i.merge(i.iterKey.UserKey, i.iterValue)
			var change stripeChangeType
			if i.err == nil {
				change = i.mergeNext(valueMerger)
			}
			var needDelete bool
			if i.err == nil {
				// includesBase is true whenever we've transformed the MERGE record
				// into a SET.
				var includesBase bool
				switch i.key.Kind() {
				case InternalKeyKindSet, InternalKeyKindSetWithDelete:
					includesBase = true
				case InternalKeyKindMerge:
				default:
					panic(errors.AssertionFailedf(
						"unexpected kind %s", redact.SafeString(i.key.Kind().String())))
				}
				i.value, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, includesBase)
			}
			if i.err == nil {
				if needDelete {
					i.valid = false
					if i.closeValueCloser() != nil {
						return nil, nil
					}
					continue
				}
				// A non-skippable entry does not necessarily cover later merge
				// operands, so we must not zero the current merge result's seqnum.
				//
				// For example, suppose the forthcoming two keys are a range
				// tombstone, `[a, b)#3`, and a merge operand, `a#3`. Recall that
				// range tombstones do not cover point keys at the same seqnum, so
				// `a#3` is not deleted. The range tombstone will be seen first due
				// to its larger value type. Since it is a non-skippable key, the
				// current merge will not include `a#3`. If we zeroed the current
				// merge result's seqnum, then it would conflict with the upcoming
				// merge including `a#3`, whose seqnum will also be zeroed.
				if change != sameStripeNonSkippable {
					i.maybeZeroSeqnum(origSnapshotIdx)
				}
				return &i.key, i.value
			}
			if i.err != nil {
				i.valid = false
				// TODO(sumeer): why is MarkCorruptionError only being called for
				// MERGE?
				i.err = base.MarkCorruptionError(i.err)
			}
			return nil, nil

		default:
			i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind()))
			i.valid = false
			return nil, nil
		}
	}

	return nil, nil
}

func (i *compactionIter) closeValueCloser() error {
	if i.valueCloser == nil {
		return nil
	}

	i.err = i.valueCloser.Close()
	i.valueCloser = nil
	if i.err != nil {
		i.valid = false
	}
	return i.err
}

// snapshotIndex returns the index of the first sequence number in snapshots
// which is strictly greater than seq, along with that sequence number (or
// InternalKeySeqNumMax if no such snapshot exists).
func snapshotIndex(seq uint64, snapshots []uint64) (int, uint64) {
	index := sort.Search(len(snapshots), func(i int) bool {
		return snapshots[i] > seq
	})
	if index >= len(snapshots) {
		return index, InternalKeySeqNumMax
	}
	return index, snapshots[index]
}
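
// For example, with open snapshots at sequence numbers 5 and 10 (ascending,
// as required), a key's stripe is determined as follows:
//
//	snapshotIndex(7, []uint64{5, 10})  // returns (1, 10): pinned by snapshot 10
//	snapshotIndex(5, []uint64{5, 10})  // returns (1, 10): a key at a snapshot's
//	                                   // seqnum is not protected by that snapshot
//	snapshotIndex(12, []uint64{5, 10}) // returns (2, InternalKeySeqNumMax)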

// skipInStripe skips over skippable keys in the same stripe and user key. It
// may set i.err, in which case i.iterKey will be nil.
func (i *compactionIter) skipInStripe() {
	i.skip = true
	// TODO(sumeer): we can avoid the overhead of calling i.rangeDelFrag.Covers,
	// in this case of nextInStripe, since we are skipping all of them anyway.
	for i.nextInStripe() == sameStripeSkippable {
		if i.err != nil {
			panic(i.err)
		}
	}
	// Reset skip if we landed outside the original stripe. Otherwise, we landed
	// in the same stripe on a non-skippable key. In that case we should preserve
	// `i.skip == true` such that later keys in the stripe will continue to be
	// skipped.
	if i.iterStripeChange == newStripeNewKey || i.iterStripeChange == newStripeSameKey {
		i.skip = false
	}
}

func (i *compactionIter) iterNext() bool {
	var iterValue LazyValue
	i.iterKey, iterValue = i.iter.Next()
	i.iterValue, _, i.err = iterValue.Value(nil)
	if i.err != nil {
		i.iterKey = nil
	}
	return i.iterKey != nil
}

// stripeChangeType indicates how the snapshot stripe changed relative to the
// previous key. If no change, it also indicates whether the current entry is
// skippable. If the snapshot stripe changed, it also indicates whether the new
// stripe was entered because the iterator progressed onto an entirely new key
// or entered a new stripe within the same key.
type stripeChangeType int

const (
	newStripeNewKey stripeChangeType = iota
	newStripeSameKey
	sameStripeSkippable
	sameStripeNonSkippable
)

// nextInStripe advances the iterator and returns one of the above const ints
// indicating how its state changed.
//
// All sameStripeSkippable keys that are covered by a RANGEDEL will be skipped
// and not returned.
//
// Calls to nextInStripe must be preceded by a call to saveKey to retain a
// temporary reference to the original key, so that forward iteration can
// proceed with a reference to the original key. Care should be taken to avoid
// overwriting or mutating the saved key or value before they have been returned
// to the caller of the exported function (i.e. the caller of Next, First, etc.)
//
// nextInStripe may set i.err, in which case the return value will be
// newStripeNewKey, and i.iterKey will be nil.
func (i *compactionIter) nextInStripe() stripeChangeType {
	i.iterStripeChange = i.nextInStripeHelper()
	return i.iterStripeChange
}
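
// For example, with a single open snapshot at seqnum 10 and the input keys
// a.SET.12, a.SET.8, b.SET.3: advancing from a.SET.12 to a.SET.8 crosses the
// snapshot boundary and yields newStripeSameKey, while advancing from a.SET.8
// to b.SET.3 yields newStripeNewKey:
//
//	snapshotIndex(12, []uint64{10}) // == 1
//	snapshotIndex(8, []uint64{10})  // == 0: new stripe, same user key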

// nextInStripeHelper is an internal helper for nextInStripe; callers should use
// nextInStripe and not call nextInStripeHelper.
func (i *compactionIter) nextInStripeHelper() stripeChangeType {
	origSnapshotIdx := i.curSnapshotIdx
	for {
		if !i.iterNext() {
			return newStripeNewKey
		}
		key := i.iterKey

		if !i.equal(i.key.UserKey, key.UserKey) {
			i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(key.SeqNum(), i.snapshots)
			return newStripeNewKey
		}

		// If i.key and key have the same user key, then
		//  1. i.key must not have had a zero sequence number (or it would've
		//     been the last key with its user key).
		//  2. i.key must have a strictly larger sequence number.
		// There's an exception in that either key may be a range delete. Range
		// deletes may share a sequence number with a point key if the keys were
		// ingested together. Range keys may also share the sequence number if they
		// were ingested, but range keys are interleaved into the compaction
		// iterator's input iterator at the maximal sequence number so their
		// original sequence number will not be observed here.
		if prevSeqNum := base.SeqNumFromTrailer(i.keyTrailer); (prevSeqNum == 0 || prevSeqNum <= key.SeqNum()) &&
			i.key.Kind() != InternalKeyKindRangeDelete && key.Kind() != InternalKeyKindRangeDelete {
			prevKey := i.key
			prevKey.Trailer = i.keyTrailer
			panic(errors.AssertionFailedf("pebble: invariant violation: %s and %s out of order", prevKey, key))
		}

		i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(key.SeqNum(), i.snapshots)
		switch key.Kind() {
		case InternalKeyKindRangeDelete:
			// Range tombstones need to be exposed by the compactionIter to the upper level
			// `compaction` object, so return them regardless of whether they are in the same
			// snapshot stripe.
			if i.curSnapshotIdx == origSnapshotIdx {
				return sameStripeNonSkippable
			}
			return newStripeSameKey
		case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
			// Range keys are interleaved at the max sequence number for a given user
			// key, so we should not see any more range keys in this stripe.
			panic("unreachable")
		case InternalKeyKindInvalid:
			if i.curSnapshotIdx == origSnapshotIdx {
				return sameStripeNonSkippable
			}
			return newStripeSameKey
		case InternalKeyKindDelete, InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindSingleDelete,
			InternalKeyKindSetWithDelete, InternalKeyKindDeleteSized:
			// Fall through.
		default:
			// NB: capture the kind before clearing i.iterKey; dereferencing
			// i.iterKey after setting it to nil would panic.
			kind := key.Kind()
			i.iterKey = nil
			i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(kind))
			i.valid = false
			return newStripeNewKey
		}
		if i.curSnapshotIdx == origSnapshotIdx {
			// Same snapshot.
			if i.rangeDelFrag.Covers(*i.iterKey, i.curSnapshotSeqNum) == keyspan.CoversVisibly {
				continue
			}
			return sameStripeSkippable
		}
		return newStripeSameKey
	}
}
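
// For example, setNext (below) performs the following transformation when a
// DEL is found beneath a SET in the same stripe:
//
//	input (one snapshot stripe):  a.SET.5  a.DEL.3  a.SET.1
//	output:                       a.SETWITHDEL.5
//
// The SETWITHDEL kind records that a deletion may still exist beneath the SET
// in older sstables.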

func (i *compactionIter) setNext() {
	// Save the current key.
	i.saveKey()
	i.value = i.iterValue
	i.valid = true
	i.maybeZeroSeqnum(i.curSnapshotIdx)

	// There are two cases where we can early return and skip the remaining
	// records in the stripe:
	// - If the DB format does not yet support SETWITHDEL.
	// - If this key is already a SETWITHDEL.
	if i.formatVersion < FormatSetWithDelete ||
		i.iterKey.Kind() == InternalKeyKindSetWithDelete {
		i.skip = true
		return
	}

	// We are iterating forward. Save the current value.
	i.valueBuf = append(i.valueBuf[:0], i.iterValue...)
	i.value = i.valueBuf

	// Else, we continue to loop through entries in the stripe looking for a
	// DEL. Note that we may stop *before* encountering a DEL, if one exists.
	//
	// NB: nextInStripe will skip sameStripeSkippable keys that are visibly
	// covered by a RANGEDEL. This can include DELs -- this is fine since such
	// DELs don't need to be combined with SET to make SETWITHDEL.
	for {
		switch i.nextInStripe() {
		case newStripeNewKey, newStripeSameKey:
			i.pos = iterPosNext
			return
		case sameStripeNonSkippable:
			i.pos = iterPosNext
			// We iterated onto a key that we cannot skip. We can
			// conservatively transform the original SET into a SETWITHDEL
			// as an indication that there *may* still be a DEL/SINGLEDEL
			// under this SET, even if we did not actually encounter one.
			//
			// This is safe to do, as:
			//
			// - in the case that there *is not* actually a DEL/SINGLEDEL
			//   under this entry, any SINGLEDEL above this now-transformed
			//   SETWITHDEL will become a DEL when the two meet in a
			//   compaction. The DEL will eventually be elided in a
			//   subsequent compaction. The cost for ensuring correctness is
			//   that this entry is kept around for additional compaction
			//   cycles.
			//
			// - in the case there *is* indeed a DEL/SINGLEDEL under us
			//   (but in a different stripe or sstable), then we will have
			//   already done the work to transform the SET into a
			//   SETWITHDEL, and we will skip any additional iteration when
			//   this entry is encountered again in a subsequent compaction.
			//
			// Ideally, this codepath would be smart enough to handle the
			// case of SET <- RANGEDEL <- ... <- DEL/SINGLEDEL <- ....
			// This requires preserving any RANGEDEL entries we encounter
			// along the way, then emitting the original (possibly
			// transformed) key, followed by the RANGEDELs. This requires
			// a sizable refactoring of the existing code, as nextInStripe
			// currently returns a sameStripeNonSkippable when it
			// encounters a RANGEDEL.
			// TODO(travers): optimize to handle the RANGEDEL case if it
			// turns out to be a performance problem.
			i.key.SetKind(InternalKeyKindSetWithDelete)

			// By setting i.skip=true, we are saying that after the
			// non-skippable key is emitted (which is likely a RANGEDEL),
			// the remaining point keys that share the same user key as this
			// saved key should be skipped.
			i.skip = true
			return
		case sameStripeSkippable:
			// We're still in the same stripe. If this is a
			// DEL/SINGLEDEL/DELSIZED, we stop looking and emit a SETWITHDEL.
			// Subsequent keys are eligible for skipping.
			switch i.iterKey.Kind() {
			case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
				i.key.SetKind(InternalKeyKindSetWithDelete)
				i.skip = true
				return
			case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindSetWithDelete:
				// Do nothing.
			default:
				i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind()))
				i.valid = false
			}
		default:
			panic("pebble: unexpected stripeChangeType: " + strconv.Itoa(int(i.iterStripeChange)))
		}
	}
}
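
// A worked example of merge folding (assuming a hypothetical merger that
// concatenates operand values, with older operands prepended): given
// a.MERGE.4("c"), a.MERGE.3("b"), a.SET.2("a") in one stripe, mergeNext
// (below) visits the operands newest to oldest, calling MergeOlder("b") and
// then MergeOlder("a"), and the iterator emits a.SET.4("abc"); the SET kind
// marks the value as complete so nothing older can be merged into it.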

func (i *compactionIter) mergeNext(valueMerger ValueMerger) stripeChangeType {
	// Save the current key.
	i.saveKey()
	i.valid = true

	// Loop looking for older values in the current snapshot stripe and merge
	// them.
	for {
		if i.nextInStripe() != sameStripeSkippable {
			i.pos = iterPosNext
			return i.iterStripeChange
		}
		if i.err != nil {
			panic(i.err)
		}
		// NB: MERGE#10+RANGEDEL#9 stays a MERGE, since nextInStripe skips
		// sameStripeSkippable keys that are visibly covered by a RANGEDEL. There
		// may be a MERGE#7 that is invisibly covered and will be preserved, but
		// there is no risk that MERGE#10 and MERGE#7 will get merged in the
		// future as the RANGEDEL still exists and will be used in user-facing
		// reads that see MERGE#10, and will also eventually cause MERGE#7 to be
		// deleted in a compaction.
		key := i.iterKey
		switch key.Kind() {
		case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
			// We've hit a deletion tombstone. Return everything up to this point and
			// then skip entries until the next snapshot stripe. We change the kind
			// of the result key to a SetWithDelete so that it shadows keys in lower
			// levels. That is, MERGE+DEL -> SETWITHDEL.
			//
			// We do the same for SingleDelete since SingleDelete is only
			// permitted (with deterministic behavior) for keys that have been
			// set once since the last SingleDelete/Delete, so everything
			// older is acceptable to shadow. Note that this is slightly
			// different from singleDeleteNext() which implements stricter
			// semantics in terms of applying the SingleDelete to the single
			// next Set. But those stricter semantics are not observable to
			// the end-user since Iterator interprets SingleDelete as Delete.
			// We could do something more complicated here and consume only a
			// single Set, and then merge in any following Sets, but that is
			// complicated wrt code and unnecessary given the narrow permitted
			// use of SingleDelete.
			i.key.SetKind(InternalKeyKindSetWithDelete)
			i.skip = true
			return sameStripeSkippable

		case InternalKeyKindSet, InternalKeyKindSetWithDelete:
			// We've hit a Set or SetWithDel value. Merge with the existing
			// value and return. We change the kind of the resulting key to a
			// Set so that it shadows keys in lower levels. That is:
			// MERGE + (SET*) -> SET.
			i.err = valueMerger.MergeOlder(i.iterValue)
			if i.err != nil {
				i.valid = false
				return sameStripeSkippable
			}
			i.key.SetKind(InternalKeyKindSet)
			i.skip = true
			return sameStripeSkippable

		case InternalKeyKindMerge:
			// We've hit another Merge value. Merge with the existing value and
			// continue looping.
			i.err = valueMerger.MergeOlder(i.iterValue)
			if i.err != nil {
				i.valid = false
				return sameStripeSkippable
			}

		default:
			i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind()))
			i.valid = false
			return sameStripeSkippable
		}
	}
}

// singleDeleteNext processes a SingleDelete point tombstone. A SingleDelete, or
// SINGLEDEL, is unique in that it deletes exactly 1 internal key. It's a
// performance optimization when the client knows a user key has not been
// overwritten, allowing the elision of the tombstone earlier, avoiding write
// amplification.
//
// singleDeleteNext returns a boolean indicating whether or not the caller
// should yield the SingleDelete key to the consumer of the compactionIter. If
// singleDeleteNext returns false, the caller may consume/elide the
// SingleDelete.
func (i *compactionIter) singleDeleteNext() bool {
	// Save the current key.
	i.saveKey()
	i.value = i.iterValue
	i.valid = true

	// Loop until we find a key to be passed to the next level.
	for {
		// If we find a key that can't be skipped, return true so that the
		// caller yields the SingleDelete to its consumer.
		if i.nextInStripe() != sameStripeSkippable {
			// This defers additional error checking regarding single delete
			// invariants to the compaction where the keys with the same user key as
			// the single delete are in the same stripe.
			i.pos = iterPosNext
			return i.err == nil
		}
		if i.err != nil {
			panic(i.err)
		}
		// INVARIANT: sameStripeSkippable.
		key := i.iterKey
		kind := key.Kind()
		switch kind {
		case InternalKeyKindDelete, InternalKeyKindSetWithDelete, InternalKeyKindDeleteSized:
			if (kind == InternalKeyKindDelete || kind == InternalKeyKindDeleteSized) &&
				i.ineffectualSingleDeleteCallback != nil {
				i.ineffectualSingleDeleteCallback(i.key.UserKey)
			}
			// We've hit a Delete, DeleteSized, or SetWithDelete; transform
			// the SingleDelete into a full Delete.
			i.key.SetKind(InternalKeyKindDelete)
			i.skip = true
			return true

		case InternalKeyKindSet, InternalKeyKindMerge:
			// This SingleDelete deletes the Set/Merge, and we can now elide the
			// SingleDel as well. We advance past the Set and return false to
			// indicate to the main compaction loop that we should NOT yield the
			// current SingleDel key to the compaction loop.
			//
			// NB: singleDeleteNext was called with i.pos == iterPosCurForward, and
			// after the call to nextInStripe, we are still at iterPosCurForward,
			// since we are at the key after the Set/Merge that was single deleted.
			change := i.nextInStripe()
			switch change {
			case sameStripeSkippable, newStripeSameKey:
				// On the same user key.
				nextKind := i.iterKey.Kind()
				switch nextKind {
				case InternalKeyKindSet, InternalKeyKindSetWithDelete, InternalKeyKindMerge:
					if i.singleDeleteInvariantViolationCallback != nil {
						// sameStripeSkippable keys returned by nextInStripe() are already
						// known to not be covered by a RANGEDEL, so it is an invariant
						// violation. The rare case is newStripeSameKey, where it is a
						// violation if not covered by a RANGEDEL.
						if change == sameStripeSkippable ||
							i.rangeDelFrag.Covers(*i.iterKey, i.curSnapshotSeqNum) == keyspan.NoCover {
							i.singleDeleteInvariantViolationCallback(i.key.UserKey)
						}
					}
				case InternalKeyKindDelete, InternalKeyKindDeleteSized, InternalKeyKindSingleDelete,
					InternalKeyKindRangeDelete:
				default:
					panic(errors.AssertionFailedf(
						"unexpected internal key kind: %d", errors.Safe(i.iterKey.Kind())))
				}
			case sameStripeNonSkippable:
				// No ability to check whether there is another Set/Merge below with
				// the same user key.
				//
				// TODO(sumeer): once range deletions are interleaved at the maximal
				// sequence number, this case will go away.
			case newStripeNewKey:
			default:
				panic("unreachable")
			}
			i.valid = false
			return false

		case InternalKeyKindSingleDelete:
			// Two single deletes met in a compaction. The first single delete is
			// ineffectual.
			if i.ineffectualSingleDeleteCallback != nil {
				i.ineffectualSingleDeleteCallback(i.key.UserKey)
			}
			// Continue to apply the second single delete.
			continue

		default:
			i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind()))
			i.valid = false
			return false
		}
	}
}
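
// Two illustrative sequences within one stripe:
//
//	a.SINGLEDEL.9  a.SET.5           -> both elided (the intended use)
//	a.SINGLEDEL.9  a.SET.5  a.SET.3  -> a.SINGLEDEL.9 and a.SET.5 are
//	                                    consumed, but a.SET.3 resurfaces, so
//	                                    the invariant violation callback
//	                                    fires for `a`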
My first attempt at combining them into 1036 // a single method was hard to comprehend. Try again. 1037 func (i *compactionIter) skipDueToSingleDeleteElision() { 1038 for { 1039 stripeChange := i.nextInStripe() 1040 if i.err != nil { 1041 panic(i.err) 1042 } 1043 switch stripeChange { 1044 case newStripeNewKey: 1045 // The single delete is only now being elided, meaning it did not elide 1046 // any keys earlier in its descent down the LSM. We stepped onto a new 1047 // user key, meaning that even now at its moment of elision, it still 1048 // hasn't elided any other keys. The single delete was ineffectual (a 1049 // no-op). 1050 if i.ineffectualSingleDeleteCallback != nil { 1051 i.ineffectualSingleDeleteCallback(i.key.UserKey) 1052 } 1053 i.skip = false 1054 return 1055 case newStripeSameKey: 1056 // This should be impossible. If we're eliding a single delete, we 1057 // determined that the tombstone is in the final snapshot stripe, but we 1058 // stepped into a new stripe of the same key. 1059 panic(errors.AssertionFailedf("eliding single delete followed by same key in new stripe")) 1060 case sameStripeNonSkippable: 1061 // There's a key that we cannot skip. There are two possible cases: 1062 // a. The key is invalid. This is an error. 1063 // b. The key is a range deletion. 1064 // The second case may also be an ineffectual single delete. However, it 1065 // is possible that there is a SET that is at the same seqnum as the 1066 // RANGEDEL, and so is not deleted by that RANGEDEL, and will be deleted 1067 // by this single delete. So we cannot be certain that this is an 1068 // ineffectual single delete. 1069 // 1070 // TODO(sumeer): the existing todo to interleave range deletions at the 1071 // maximal sequence number will allow us to address this ambiguity. 1072 // 1073 // TODO(sumeer): by setting skip to true, the compactionIter is making a 1074 // single delete stronger (like a del), which will hide bugs in the use of 1075 // single delete. 1076 i.skip = true 1077 return 1078 case sameStripeSkippable: 1079 kind := i.iterKey.Kind() 1080 switch kind { 1081 case InternalKeyKindDelete, InternalKeyKindDeleteSized, InternalKeyKindSingleDelete: 1082 if i.ineffectualSingleDeleteCallback != nil { 1083 i.ineffectualSingleDeleteCallback(i.key.UserKey) 1084 } 1085 switch kind { 1086 case InternalKeyKindDelete, InternalKeyKindDeleteSized: 1087 i.skipInStripe() 1088 return 1089 case InternalKeyKindSingleDelete: 1090 // Repeat the same with this SingleDelete. We don't want to simply 1091 // call skipInStripe(), since it increases the strength of the 1092 // SingleDel, which hides bugs in the use of single delete. 1093 continue 1094 default: 1095 panic(errors.AssertionFailedf( 1096 "unexpected internal key kind: %d", errors.Safe(i.iterKey.Kind()))) 1097 } 1098 case InternalKeyKindSetWithDelete: 1099 // The SingleDelete should behave like a Delete. 1100 i.skipInStripe() 1101 return 1102 case InternalKeyKindSet, InternalKeyKindMerge: 1103 // This SingleDelete deletes the Set/Merge, and we are eliding the 1104 // SingleDel as well. Step to the next key (this is not deleted by the 1105 // SingleDelete). 1106 // 1107 // NB: skipDueToSingleDeleteElision was called with i.pos == 1108 // iterPosCurForward, and after the call to nextInStripe, we are still 1109 // at iterPosCurForward, since we are at the key after the Set/Merge 1110 // that was single deleted. 
				change := i.nextInStripe()
				if i.err != nil {
					panic(i.err)
				}
				switch change {
				case newStripeSameKey:
					panic(errors.AssertionFailedf("eliding single delete followed by same key in new stripe"))
				case newStripeNewKey:
				case sameStripeSkippable:
					// On the same key.
					nextKind := i.iterKey.Kind()
					switch nextKind {
					case InternalKeyKindSet, InternalKeyKindSetWithDelete, InternalKeyKindMerge:
						if i.singleDeleteInvariantViolationCallback != nil {
							i.singleDeleteInvariantViolationCallback(i.key.UserKey)
						}
					case InternalKeyKindDelete, InternalKeyKindDeleteSized, InternalKeyKindSingleDelete,
						InternalKeyKindRangeDelete:
					default:
						panic(errors.AssertionFailedf(
							"unexpected internal key kind: %d", errors.Safe(i.iterKey.Kind())))
					}
				case sameStripeNonSkippable:
					// No ability to check whether there is another Set/Merge below with
					// the same user key.
					//
					// TODO(sumeer): once range deletions are interleaved at the maximal
					// sequence number, this case will go away.
				default:
					panic("unreachable")
				}
				// Whether in same stripe or new stripe, this key is not consumed by
				// the SingleDelete.
				i.skip = false
				return
			default:
				panic(errors.AssertionFailedf(
					"unexpected internal key kind: %d", errors.Safe(i.iterKey.Kind())))
			}
		default:
			panic("unreachable")
		}
	}
}
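
// The DELSIZED value format is a single uvarint holding the size the
// tombstone expects to reclaim, len(key)+len(value) per the comment below.
// For example, a caller deleting key "a" whose extant value is 10 bytes long
// would carry this value (a sketch of the encoding, not a write-path API from
// this package):
//
//	var buf []byte
//	buf = binary.AppendUvarint(buf, uint64(len("a")+10)) // value of the DELSIZED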

// deleteSizedNext processes a DELSIZED point tombstone. Unlike ordinary DELs,
// these tombstones carry a value that's a varint indicating the size of the
// entry (len(key)+len(value)) that the tombstone is expected to delete.
//
// When a DELSIZED is encountered, we skip ahead to see which keys, if
// any, are elided as a result of the tombstone.
func (i *compactionIter) deleteSizedNext() (*base.InternalKey, []byte) {
	i.saveKey()
	i.valid = true
	i.skip = true

	// The DELSIZED tombstone may have no value at all. This happens when the
	// tombstone has already deleted the key that the user originally predicted.
	// In this case, we still peek forward in case there's another DELSIZED key
	// with a lower sequence number, in which case we'll adopt its value.
	if len(i.iterValue) == 0 {
		i.value = i.valueBuf[:0]
	} else {
		i.valueBuf = append(i.valueBuf[:0], i.iterValue...)
		i.value = i.valueBuf
	}

	// Loop through all the keys within this stripe that are skippable.
	i.pos = iterPosNext
	for i.nextInStripe() == sameStripeSkippable {
		if i.err != nil {
			panic(i.err)
		}
		switch i.iterKey.Kind() {
		case InternalKeyKindDelete, InternalKeyKindDeleteSized, InternalKeyKindSingleDelete:
			// We encountered a tombstone (DEL, SINGLEDEL, or DELSIZED) that's
			// deleted by the original DELSIZED tombstone. This can happen in
			// two cases:
			//
			// (1) These tombstones were intended to delete two distinct values,
			//     and this DELSIZED has already dropped the relevant key. For
			//     example:
			//
			//	a.DELSIZED.9   a.SET.7   a.DELSIZED.5   a.SET.4
			//
			//     If a.DELSIZED.9 has already deleted a.SET.7, its size has
			//     already been zeroed out. In this case, we want to adopt the
			//     value of the DELSIZED with the lower sequence number, in
			//     case the a.SET.4 key has not yet been elided.
			//
			// (2) This DELSIZED was missized. The user thought they were
			//     deleting a key with this user key, but this user key had
			//     already been deleted.
			//
			// We can differentiate these two cases by examining the length of
			// the DELSIZED's value. A DELSIZED's value holds the size of both
			// the user key and value that it intends to delete. For any user
			// key with a length > 0, a DELSIZED that has not deleted a key must
			// have a value with a length > 0.
			//
			// We treat both cases the same functionally, adopting the identity
			// of the lower-sequence numbered tombstone. However in the second
			// case, we also increment the stat counting missized tombstones.
			if len(i.value) > 0 {
				// The original DELSIZED key was missized. The key that the user
				// thought they were deleting does not exist.
				i.stats.countMissizedDels++
			}
			i.valueBuf = append(i.valueBuf[:0], i.iterValue...)
			i.value = i.valueBuf
			if i.iterKey.Kind() != InternalKeyKindDeleteSized {
				// Convert the DELSIZED to a DEL. The DEL/SINGLEDEL we're eliding
				// may not have deleted the key(s) it was intended to yet. The
				// ordinary DEL compaction heuristics are better suited to that,
				// plus we don't want to count it as a missized DEL. We early
				// exit in this case, after skipping the remainder of the
				// snapshot stripe.
				i.key.SetKind(InternalKeyKindDelete)
				// NB: We skipInStripe now, rather than leaving i.skip=true and
				// returning early, because Next() requires that i.skip=true
				// only if i.pos == iterPosCurForward.
				//
				// Ignore any error caused by skipInStripe since it does not affect
				// the key/value being returned here, and the next call to Next() will
				// expose it.
				i.skipInStripe()
				return &i.key, i.value
			}
			// Continue, in case we uncover another DELSIZED or a key this
			// DELSIZED deletes.

		case InternalKeyKindSet, InternalKeyKindMerge, InternalKeyKindSetWithDelete:
			// If the DELSIZED is value-less, it already deleted the key that it
			// was intended to delete. This is possible with a sequence like:
			//
			//	DELSIZED.8   SET.7   SET.3
			//
			// The DELSIZED only describes the size of the SET.7, which in this
			// case has already been elided. We don't count it as a missizing,
			// instead converting the DELSIZED to a DEL. Skip the remainder of
			// the snapshot stripe and return.
			if len(i.value) == 0 {
				i.key.SetKind(InternalKeyKindDelete)
				// NB: We skipInStripe now, rather than leaving i.skip=true and
				// returning early, because Next() requires that i.skip=true
				// only if i.pos == iterPosCurForward.
				//
				// Ignore any error caused by skipInStripe since it does not affect
				// the key/value being returned here, and the next call to Next() will
				// expose it.
				i.skipInStripe()
				return &i.key, i.value
			}
			// The key being deleted is not a DEL/SINGLEDEL/DELSIZED, and the
			// DELSIZED in i.key has a positive size.
			expectedSize, n := binary.Uvarint(i.value)
			if n != len(i.value) {
				i.err = base.CorruptionErrorf("DELSIZED holds invalid value: %x", errors.Safe(i.value))
				i.valid = false
				return nil, nil
			}
			elidedSize := uint64(len(i.iterKey.UserKey)) + uint64(len(i.iterValue))
			if elidedSize != expectedSize {
				// The original DELSIZED key was missized. It's unclear what to
				// do.
				// The user-provided size was wrong, so it's unlikely to be
				// accurate or meaningful. We could:
				//
				//   1. return the DELSIZED with the original user-provided size
				//      unmodified.
				//   2. return the DELSIZED with a zeroed size to reflect that a key
				//      was elided, even if it wasn't the anticipated size.
				//   3. subtract the elided size from the estimate and re-encode.
				//   4. convert the DELSIZED into a value-less DEL, so that
				//      ordinary DEL heuristics apply.
				//
				// We opt for (4) under the rationale that we can't rely on the
				// user-provided size for accuracy, so ordinary DEL heuristics
				// are safer.
				i.stats.countMissizedDels++
				i.key.SetKind(InternalKeyKindDelete)
				i.value = i.valueBuf[:0]
				// NB: We skipInStripe now, rather than leaving i.skip=true and
				// returning early, because Next() requires that i.skip=true
				// only if i.pos == iterPosCurForward.
				//
				// Ignore any error caused by skipInStripe since it does not affect
				// the key/value being returned here, and the next call to Next() will
				// expose it.
				i.skipInStripe()
				return &i.key, i.value
			}
			// NB: We remove the value regardless of whether the key was sized
			// appropriately. The size encoded is 'consumed' the first time it
			// meets a key that it deletes.
			i.value = i.valueBuf[:0]

		default:
			i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind()))
			i.valid = false
			return nil, nil
		}
	}
	// Reset skip if we landed outside the original stripe. Otherwise, we landed
	// in the same stripe on a non-skippable key. In that case we should preserve
	// `i.skip == true` such that later keys in the stripe will continue to be
	// skipped.
	if i.iterStripeChange == newStripeNewKey || i.iterStripeChange == newStripeSameKey {
		i.skip = false
	}
	if i.err != nil {
		return nil, nil
	}
	return &i.key, i.value
}

func (i *compactionIter) saveKey() {
	i.keyBuf = append(i.keyBuf[:0], i.iterKey.UserKey...)
	i.key.UserKey = i.keyBuf
	i.key.Trailer = i.iterKey.Trailer
	i.keyTrailer = i.iterKey.Trailer
	i.frontiers.Advance(i.key.UserKey)
}

func (i *compactionIter) cloneKey(key []byte) []byte {
	i.alloc, key = i.alloc.Copy(key)
	return key
}

func (i *compactionIter) Key() InternalKey {
	return i.key
}

func (i *compactionIter) Value() []byte {
	return i.value
}

func (i *compactionIter) Valid() bool {
	return i.valid
}

func (i *compactionIter) Error() error {
	return i.err
}

func (i *compactionIter) Close() error {
	err := i.iter.Close()
	if i.err == nil {
		i.err = err
	}

	// Close the closer for the current value if one was open.
	if i.valueCloser != nil {
		i.err = firstError(i.err, i.valueCloser.Close())
		i.valueCloser = nil
	}

	return i.err
}

// Tombstones returns a list of pending range tombstones in the fragmenter
// up to the specified key, or all pending range tombstones if key = nil.
func (i *compactionIter) Tombstones(key []byte) []keyspan.Span {
	if key == nil {
		i.rangeDelFrag.Finish()
	} else {
		// The specified end key is exclusive; no versions of the specified
		// user key (including range tombstones covering that key) should
		// be flushed yet.
// RangeKeys returns a list of pending fragmented range keys up to the
// specified key, or all pending range keys if key = nil.
func (i *compactionIter) RangeKeys(key []byte) []keyspan.Span {
	if key == nil {
		i.rangeKeyFrag.Finish()
	} else {
		// The specified end key is exclusive; no versions of the specified
		// user key (including range keys covering that key) should be
		// flushed yet.
		i.rangeKeyFrag.TruncateAndFlushTo(key)
	}
	rangeKeys := i.rangeKeys
	i.rangeKeys = nil
	return rangeKeys
}

func (i *compactionIter) emitRangeDelChunk(fragmented keyspan.Span) {
	// Apply the snapshot stripe rules, keeping only the latest tombstone for
	// each snapshot stripe.
	currentIdx := -1
	keys := fragmented.Keys[:0]
	for _, k := range fragmented.Keys {
		idx, _ := snapshotIndex(k.SeqNum(), i.snapshots)
		if currentIdx == idx {
			continue
		}
		if idx == 0 && i.elideRangeTombstone(fragmented.Start, fragmented.End) {
			// This is the last snapshot stripe and the range tombstone
			// can be elided.
			break
		}

		keys = append(keys, k)
		if idx == 0 {
			// This is the last snapshot stripe.
			break
		}
		currentIdx = idx
	}
	if len(keys) > 0 {
		i.tombstones = append(i.tombstones, keyspan.Span{
			Start: fragmented.Start,
			End:   fragmented.End,
			Keys:  keys,
		})
	}
}

func (i *compactionIter) emitRangeKeyChunk(fragmented keyspan.Span) {
	// Elision of snapshot stripes happens in rangeKeyCompactionTransform, so
	// no need to do that here.
	if len(fragmented.Keys) > 0 {
		i.rangeKeys = append(i.rangeKeys, fragmented)
	}
}

// maybeZeroSeqnum attempts to set the seqnum for the current key to 0. Doing
// so improves compression and enables an optimization during forward iteration
// to skip some key comparisons. The seqnum for an entry can be zeroed if the
// entry is on the bottom snapshot stripe and on the bottom level of the LSM.
func (i *compactionIter) maybeZeroSeqnum(snapshotIdx int) {
	if !i.allowZeroSeqNum {
		// TODO(peter): allowZeroSeqNum applies to the entire compaction. We
		// could make the determination on a key by key basis, similar to what
		// is done for elideTombstone. Need to add a benchmark for
		// compactionIter to verify that isn't too expensive.
		return
	}
	if snapshotIdx > 0 {
		// This is not the last snapshot stripe.
		return
	}
	i.key.SetSeqNum(base.SeqNumZero)
}

// A frontier is used to monitor a compaction's progression across the user
// keyspace.
//
// A frontier holds a user key boundary that it's concerned with in its `key`
// field. If/when the compaction iterator returns an InternalKey with a user
// key _k_ such that k ≥ frontier.key, the compaction iterator invokes the
// frontier's `reached` function, passing _k_ as its argument.
//
// The `reached` function returns a new value to use as the key. If `reached`
// returns nil, the frontier is forgotten and its `reached` method will not be
// invoked again, unless the user calls [Update] to set a new key.
//
// A frontier's key may be updated outside the context of a `reached`
// invocation at any time, through its Update method.
type frontier struct {
	// container points to the containing *frontiers that was passed to Init
	// when the frontier was initialized.
	container *frontiers

	// key holds the frontier's current key. If nil, this frontier is inactive
	// and its reached func will not be invoked. The value of this key may
	// only be updated by the `frontiers` type, or the Update method.
	key []byte

	// reached is invoked to inform a frontier that its key has been reached.
	// It's invoked with the user key that reached the limit. The `key`
	// argument is guaranteed to be ≥ the frontier's key.
	//
	// After reached is invoked, the frontier's key is updated to the return
	// value of `reached`. Nota bene, the frontier is permitted to update its
	// key to a user key ≤ the argument `key`.
	//
	// If a frontier is set to key k1, and reached(k2) is invoked (k2 ≥ k1),
	// the frontier will receive reached(k2) calls until it returns nil or a
	// key `k3` such that k2 < k3. This property is useful for frontiers that
	// use `reached` invocations to drive iteration through collections of
	// keys that may contain multiple keys that are both < k2 and ≥ k1.
	reached func(key []byte) (next []byte)
}
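
// An illustrative sketch, not part of the package: the re-invocation property
// described above lets a reached func walk a sorted collection of boundary
// keys without an explicit loop. Each invocation consumes one key; if the
// next key is still ≤ the reached key, the frontiers machinery immediately
// invokes reached again. keys, idx and fs are hypothetical:
//
//	keys := [][]byte{[]byte("b"), []byte("d"), []byte("f")} // sorted
//	idx := 0
//	var f frontier
//	f.Init(&fs, keys[0], func(reachedKey []byte) (next []byte) {
//		idx++
//		if idx >= len(keys) {
//			return nil // no boundaries remain; deactivate the frontier
//		}
//		return keys[idx] // may be ≤ reachedKey; reached fires again if so
//	})
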
// Init initializes the frontier with the provided key and reached callback.
// The frontier is attached to the provided *frontiers and the provided reached
// func will be invoked when the *frontiers is advanced to a key ≥ this
// frontier's key.
func (f *frontier) Init(
	frontiers *frontiers, initialKey []byte, reached func(key []byte) (next []byte),
) {
	*f = frontier{
		container: frontiers,
		key:       initialKey,
		reached:   reached,
	}
	if initialKey != nil {
		f.container.push(f)
	}
}

// String implements fmt.Stringer.
func (f *frontier) String() string {
	return string(f.key)
}

// Update replaces the existing frontier's key with the provided key. The
// frontier's reached func will be invoked when the new key is reached.
func (f *frontier) Update(key []byte) {
	c := f.container
	prevKeyIsNil := f.key == nil
	f.key = key
	if prevKeyIsNil {
		if key != nil {
			c.push(f)
		}
		return
	}

	// Find the frontier within the heap (it must exist within the heap
	// because f.key was != nil). If the frontier key is now nil, remove it
	// from the heap. Otherwise, fix up its position.
	for i := 0; i < len(c.items); i++ {
		if c.items[i] == f {
			if key != nil {
				c.fix(i)
			} else {
				n := c.len() - 1
				c.swap(i, n)
				c.down(i, n)
				c.items = c.items[:n]
			}
			return
		}
	}
	panic("unreachable")
}

// frontiers is used to track progression of a task (eg, compaction) across
// the keyspace. Clients that want to be informed when the task advances to a
// key ≥ some frontier may register a frontier, providing a callback. The task
// calls `Advance(k)` with each user key encountered, which invokes the
// `reached` func on all tracked frontiers with `key`s ≤ k.
//
// Internally, frontiers is implemented as a simple heap.
type frontiers struct {
	cmp   Compare
	items []*frontier
}
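
// An illustrative sketch, not part of the package: registering a single
// frontier and driving it with Advance. bytes.Compare stands in for the
// compaction's comparer:
//
//	fs := frontiers{cmp: bytes.Compare}
//	var f frontier
//	f.Init(&fs, []byte("c"), func(reachedKey []byte) (next []byte) {
//		fmt.Printf("reached %q\n", reachedKey)
//		return nil // one-shot: deactivate after the first notification
//	})
//	fs.Advance([]byte("a")) // no effect: "a" < "c"
//	fs.Advance([]byte("d")) // prints: reached "d"
//	f.Update([]byte("x"))   // re-arms the frontier at a new boundary
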
// String implements fmt.Stringer.
func (f *frontiers) String() string {
	var buf bytes.Buffer
	for i := 0; i < len(f.items); i++ {
		if i > 0 {
			fmt.Fprint(&buf, ", ")
		}
		fmt.Fprintf(&buf, "%s: %q", f.items[i], f.items[i].key)
	}
	return buf.String()
}

// Advance notifies all member frontiers with keys ≤ k.
func (f *frontiers) Advance(k []byte) {
	for len(f.items) > 0 && f.cmp(k, f.items[0].key) >= 0 {
		// This frontier has been reached. Invoke the closure and update with
		// the next frontier.
		f.items[0].key = f.items[0].reached(k)
		if f.items[0].key == nil {
			// This was the final frontier that this user was concerned with.
			// Remove it from the heap.
			f.pop()
		} else {
			// Fix up the heap root.
			f.fix(0)
		}
	}
}

func (f *frontiers) len() int {
	return len(f.items)
}

func (f *frontiers) less(i, j int) bool {
	return f.cmp(f.items[i].key, f.items[j].key) < 0
}

func (f *frontiers) swap(i, j int) {
	f.items[i], f.items[j] = f.items[j], f.items[i]
}

// fix, up and down are copied from the go stdlib.

func (f *frontiers) fix(i int) {
	if !f.down(i, f.len()) {
		f.up(i)
	}
}

func (f *frontiers) push(ff *frontier) {
	n := len(f.items)
	f.items = append(f.items, ff)
	f.up(n)
}

func (f *frontiers) pop() *frontier {
	n := f.len() - 1
	f.swap(0, n)
	f.down(0, n)
	item := f.items[n]
	f.items = f.items[:n]
	return item
}

func (f *frontiers) up(j int) {
	for {
		i := (j - 1) / 2 // parent
		if i == j || !f.less(j, i) {
			break
		}
		f.swap(i, j)
		j = i
	}
}

func (f *frontiers) down(i0, n int) bool {
	i := i0
	for {
		j1 := 2*i + 1
		if j1 >= n || j1 < 0 { // j1 < 0 after int overflow
			break
		}
		j := j1 // left child
		if j2 := j1 + 1; j2 < n && f.less(j2, j1) {
			j = j2 // = 2*i + 2 // right child
		}
		if !f.less(j, i) {
			break
		}
		f.swap(i, j)
		i = j
	}
	return i > i0
}
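
// An illustrative sketch, not part of the package: the heap keeps the
// frontier with the smallest key at items[0], so Advance only ever inspects
// the root. With three one-shot frontiers registered at "c", "m" and "t":
//
//	fs := frontiers{cmp: bytes.Compare}
//	for _, k := range []string{"m", "c", "t"} {
//		k := k // capture a per-iteration copy for the closure
//		f := &frontier{}
//		f.Init(&fs, []byte(k), func(reachedKey []byte) (next []byte) {
//			fmt.Printf("%s reached at %q\n", k, reachedKey)
//			return nil
//		})
//	}
//	// The heap root is "c". Advancing to "p" fires "c" then "m" (each is
//	// popped when its reached func returns nil) and stops at root "t" > "p".
//	fs.Advance([]byte("p"))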