github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/compaction_iter.go (about) 1 // Copyright 2018 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package bitalostable 6 7 import ( 8 "fmt" 9 "io" 10 "sort" 11 "strconv" 12 13 "github.com/cockroachdb/errors" 14 "github.com/zuoyebang/bitalostable/internal/base" 15 "github.com/zuoyebang/bitalostable/internal/bytealloc" 16 "github.com/zuoyebang/bitalostable/internal/invariants" 17 "github.com/zuoyebang/bitalostable/internal/keyspan" 18 "github.com/zuoyebang/bitalostable/internal/rangekey" 19 ) 20 21 // compactionIter provides a forward-only iterator that encapsulates the logic 22 // for collapsing entries during compaction. It wraps an internal iterator and 23 // collapses entries that are no longer necessary because they are shadowed by 24 // newer entries. The simplest example of this is when the internal iterator 25 // contains two keys: a.PUT.2 and a.PUT.1. Instead of returning both entries, 26 // compactionIter collapses the second entry because it is no longer 27 // necessary. The high-level structure for compactionIter is to iterate over 28 // its internal iterator and output 1 entry for every user-key. There are four 29 // complications to this story. 30 // 31 // 1. Eliding Deletion Tombstones 32 // 33 // Consider the entries a.DEL.2 and a.PUT.1. These entries collapse to 34 // a.DEL.2. Do we have to output the entry a.DEL.2? Only if a.DEL.2 possibly 35 // shadows an entry at a lower level. If we're compacting to the base-level in 36 // the LSM tree then a.DEL.2 is definitely not shadowing an entry at a lower 37 // level and can be elided. 38 // 39 // We can do slightly better than only eliding deletion tombstones at the base 40 // level by observing that we can elide a deletion tombstone if there are no 41 // sstables that contain the entry's key. 
This check is performed by 42 // elideTombstone. 43 // 44 // 2. Merges 45 // 46 // The MERGE operation merges the value for an entry with the existing value 47 // for an entry. The logical value of an entry can be composed of a series of 48 // merge operations. When compactionIter sees a MERGE, it scans forward in its 49 // internal iterator collapsing MERGE operations for the same key until it 50 // encounters a SET or DELETE operation. For example, the keys a.MERGE.4, 51 // a.MERGE.3, a.MERGE.2 will be collapsed to a.MERGE.4 and the values will be 52 // merged using the specified Merger. 53 // 54 // An interesting case here occurs when MERGE is combined with SET. Consider 55 // the entries a.MERGE.3 and a.SET.2. The collapsed key will be a.SET.3. The 56 // reason that the kind is changed to SET is because the SET operation acts as 57 // a barrier preventing further merging. This can be seen better in the 58 // scenario a.MERGE.3, a.SET.2, a.MERGE.1. The entry a.MERGE.1 may be at lower 59 // (older) level and not involved in the compaction. If the compaction of 60 // a.MERGE.3 and a.SET.2 produced a.MERGE.3, a subsequent compaction with 61 // a.MERGE.1 would merge the values together incorrectly. 62 // 63 // 3. Snapshots 64 // 65 // Snapshots are lightweight point-in-time views of the DB state. At its core, 66 // a snapshot is a sequence number along with a guarantee from Pebble that it 67 // will maintain the view of the database at that sequence number. Part of this 68 // guarantee is relatively straightforward to achieve. When reading from the 69 // database Pebble will ignore sequence numbers that are larger than the 70 // snapshot sequence number. The primary complexity with snapshots occurs 71 // during compaction: the collapsing of entries that are shadowed by newer 72 // entries is at odds with the guarantee that Pebble will maintain the view of 73 // the database at the snapshot sequence number. 
Rather than collapsing entries 74 // up to the next user key, compactionIter can only collapse entries up to the 75 // next snapshot boundary. That is, every snapshot boundary potentially causes 76 // another entry for the same user-key to be emitted. Another way to view this 77 // is that snapshots define stripes and entries are collapsed within stripes, 78 // but not across stripes. Consider the following scenario: 79 // 80 // a.PUT.9 81 // a.DEL.8 82 // a.PUT.7 83 // a.DEL.6 84 // a.PUT.5 85 // 86 // In the absence of snapshots these entries would be collapsed to 87 // a.PUT.9. What if there is a snapshot at sequence number 7? The entries can 88 // be divided into two stripes and collapsed within the stripes: 89 // 90 // a.PUT.9 a.PUT.9 91 // a.DEL.8 ---> 92 // a.PUT.7 93 // -- -- 94 // a.DEL.6 ---> a.DEL.6 95 // a.PUT.5 96 // 97 // All of the rules described earlier still apply, but they are confined to 98 // operate within a snapshot stripe. Snapshots only affect compaction when the 99 // snapshot sequence number lies within the range of sequence numbers being 100 // compacted. In the above example, a snapshot at sequence number 10 or at 101 // sequence number 5 would not have any effect. 102 // 103 // 4. Range Deletions 104 // 105 // Range deletions provide the ability to delete all of the keys (and values) 106 // in a contiguous range. Range deletions are stored indexed by their start 107 // key. The end key of the range is stored in the value. In order to support 108 // lookup of the range deletions which overlap with a particular key, the range 109 // deletion tombstones need to be fragmented whenever they overlap. This 110 // fragmentation is performed by keyspan.Fragmenter. The fragments are then 111 // subject to the rules for snapshots. 
For example, consider the two range 112 // tombstones [a,e)#1 and [c,g)#2: 113 // 114 // 2: c-------g 115 // 1: a-------e 116 // 117 // These tombstones will be fragmented into: 118 // 119 // 2: c---e---g 120 // 1: a---c---e 121 // 122 // Do we output the fragment [c,e)#1? Since it is covered by [c-e]#2 the answer 123 // depends on whether it is in a new snapshot stripe. 124 // 125 // In addition to the fragmentation of range tombstones, compaction also needs 126 // to take the range tombstones into consideration when outputting normal 127 // keys. Just as with point deletions, a range deletion covering an entry can 128 // cause the entry to be elided. 129 // 130 // A note on the stability of keys and values. 131 // 132 // The stability guarantees of keys and values returned by the iterator tree 133 // that backs a compactionIter is nuanced and care must be taken when 134 // referencing any returned items. 135 // 136 // Keys and values returned by exported functions (i.e. First, Next, etc.) have 137 // lifetimes that fall into two categories: 138 // 139 // Lifetime valid for duration of compaction. Range deletion keys and values are 140 // stable for the duration of the compaction, due to way in which a 141 // compactionIter is typically constructed (i.e. via (*compaction).newInputIter, 142 // which wraps the iterator over the range deletion block in a noCloseIter, 143 // preventing the release of the backing memory until the compaction is 144 // finished). 145 // 146 // Lifetime limited to duration of sstable block liveness. Point keys (SET, DEL, 147 // etc.) and values must be cloned / copied following the return from the 148 // exported function, and before a subsequent call to Next advances the iterator 149 // and mutates the contents of the returned key and value. 
type compactionIter struct {
	equal Equal
	merge Merge
	iter  internalIterator
	err   error
	// `key.UserKey` is set to `keyBuf` caused by saving `i.iterKey.UserKey`
	// and `key.Trailer` is set to `i.iterKey.Trailer`. This is the
	// case on return from all public methods -- these methods return `key`.
	// Additionally, it is the internal state when the code is moving to the
	// next key so it can determine whether the user key has changed from
	// the previous key.
	key InternalKey
	// keyTrailer is updated when `i.key` is updated and holds the key's
	// original trailer (eg, before any sequence-number zeroing or changes to
	// key kind).
	keyTrailer  uint64
	value       []byte
	valueCloser io.Closer
	// Temporary buffer used for storing the previous user key in order to
	// determine when iteration has advanced to a new user key and thus a new
	// snapshot stripe.
	keyBuf []byte
	// Temporary buffer used for storing the previous value, which may be an
	// unsafe, i.iter-owned slice that could be altered when the iterator is
	// advanced.
	valueBuf []byte
	// Is the current entry valid?
	valid     bool
	iterKey   *InternalKey
	iterValue []byte
	// `skip` indicates whether the remaining skippable entries in the current
	// snapshot stripe should be skipped or processed. An example of a non-
	// skippable entry is a range tombstone as we need to return it from the
	// `compactionIter`, even if a key covering its start key has already been
	// seen in the same stripe. `skip` has no effect when `pos == iterPosNext`.
	skip bool
	// `pos` indicates the iterator position at the top of `Next()`. Its type's
	// (`iterPos`) values take on the following meanings in the context of
	// `compactionIter`.
	//
	// - `iterPosCur`: the iterator is at the last key returned.
	// - `iterPosNext`: the iterator has already been advanced to the next
	//   candidate key. For example, this happens when processing merge operands,
	//   where we advance the iterator all the way into the next stripe or next
	//   user key to ensure we've seen all mergeable operands.
	// - `iterPosPrev`: this is invalid as compactionIter is forward-only.
	pos iterPos
	// The index of the snapshot for the current key within the snapshots slice.
	curSnapshotIdx    int
	curSnapshotSeqNum uint64
	// The snapshot sequence numbers that need to be maintained. These sequence
	// numbers define the snapshot stripes (see the Snapshots description
	// above). The sequence numbers are in ascending order.
	snapshots []uint64
	// Reference to the range deletion tombstone fragmenter (e.g.,
	// `compaction.rangeDelFrag`).
	rangeDelFrag *keyspan.Fragmenter
	rangeKeyFrag *keyspan.Fragmenter
	// The fragmented tombstones.
	tombstones []keyspan.Span
	// The fragmented range keys.
	rangeKeys []keyspan.Span
	// Byte allocator for the tombstone keys.
	alloc               bytealloc.A
	allowZeroSeqNum     bool
	elideTombstone      func(key []byte) bool
	elideRangeTombstone func(start, end []byte) bool
	// The on-disk format major version. This informs the types of keys that
	// may be written to disk during a compaction.
	formatVersion FormatMajorVersion
}

// newCompactionIter constructs a compactionIter over the provided merged
// input iterator, installing the comparator, key formatter and emit hooks
// on both the range-deletion and range-key fragmenters.
func newCompactionIter(
	cmp Compare,
	equal Equal,
	formatKey base.FormatKey,
	merge Merge,
	iter internalIterator,
	snapshots []uint64,
	rangeDelFrag *keyspan.Fragmenter,
	rangeKeyFrag *keyspan.Fragmenter,
	allowZeroSeqNum bool,
	elideTombstone func(key []byte) bool,
	elideRangeTombstone func(start, end []byte) bool,
	formatVersion FormatMajorVersion,
) *compactionIter {
	i := &compactionIter{
		equal:               equal,
		merge:               merge,
		iter:                iter,
		snapshots:           snapshots,
		rangeDelFrag:        rangeDelFrag,
		rangeKeyFrag:        rangeKeyFrag,
		allowZeroSeqNum:     allowZeroSeqNum,
		elideTombstone:      elideTombstone,
		elideRangeTombstone: elideRangeTombstone,
		formatVersion:       formatVersion,
	}
	i.rangeDelFrag.Cmp = cmp
	i.rangeDelFrag.Format = formatKey
	i.rangeDelFrag.Emit = i.emitRangeDelChunk
	i.rangeKeyFrag.Cmp = cmp
	i.rangeKeyFrag.Format = formatKey
	i.rangeKeyFrag.Emit = i.emitRangeKeyChunk
	return i
}

// First positions the underlying iterator at its first entry, computes the
// snapshot stripe for that entry, and delegates to Next to produce the
// first output entry.
func (i *compactionIter) First() (*InternalKey, []byte) {
	if i.err != nil {
		return nil, nil
	}
	i.iterKey, i.iterValue = i.iter.First()
	if i.iterKey != nil {
		i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(i.iterKey.SeqNum(), i.snapshots)
	}
	i.pos = iterPosNext
	return i.Next()
}

// Next returns the next entry to output for the compaction, collapsing
// shadowed entries within the current snapshot stripe per the rules
// described in the type-level comment. It returns (nil, nil) on exhaustion
// or error; consult Error after iteration completes.
func (i *compactionIter) Next() (*InternalKey, []byte) {
	if i.err != nil {
		return nil, nil
	}

	// Close the closer for the current value if one was open.
	if i.closeValueCloser() != nil {
		return nil, nil
	}

	// Prior to this call to `Next()` we are in one of three situations with
	// respect to `iterKey` and related state:
	//
	// - `!skip && pos == iterPosNext`: `iterKey` is already at the next key.
	// - `!skip && pos == iterPosCur`: We are at the key that has been returned.
	//   To move forward we advance by one key, even if that lands us in the same
	//   snapshot stripe.
	// - `skip && pos == iterPosCur`: We are at the key that has been returned.
	//   To move forward we skip skippable entries in the stripe.
	if i.pos == iterPosCurForward {
		if i.skip {
			i.skipInStripe()
		} else {
			i.nextInStripe()
		}
	}

	i.pos = iterPosCurForward
	i.valid = false
	for i.iterKey != nil {
		if i.iterKey.Kind() == InternalKeyKindRangeDelete || rangekey.IsRangeKey(i.iterKey.Kind()) {
			// Return the span so the compaction can use it for file truncation and add
			// it to the relevant fragmenter. We do not set `skip` to true before
			// returning as there may be a forthcoming point key with the same user key
			// and sequence number. Such a point key must be visible (i.e., not skipped
			// over) since we promise point keys are not deleted by range tombstones at
			// the same sequence number.
			//
			// Although, note that `skip` may already be true before reaching here
			// due to an earlier key in the stripe. Then it is fine to leave it set
			// to true, as the earlier key must have had a higher sequence number.
			//
			// NOTE: there is a subtle invariant violation here in that calling
			// saveKey and returning a reference to the temporary slice violates
			// the stability guarantee for range deletion keys. A potential
			// mediation could return the original iterKey and iterValue
			// directly, as the backing memory is guaranteed to be stable until
			// the compaction completes. The violation here is only minor in
			// that the caller immediately clones the range deletion InternalKey
			// when passing the key to the deletion fragmenter (see the
			// call-site in compaction.go).
			// TODO(travers): address this violation by removing the call to
			// saveKey and instead return the original iterKey and iterValue.
			// This goes against the comment on i.key in the struct, and
			// therefore warrants some investigation.
			i.saveKey()
			i.value = i.iterValue
			i.valid = true
			return &i.key, i.value
		}

		if i.rangeDelFrag.Covers(*i.iterKey, i.curSnapshotSeqNum) {
			// The current entry is covered by a range tombstone visible in
			// this snapshot stripe; elide it and everything it shadows.
			i.saveKey()
			i.skipInStripe()
			continue
		}

		switch i.iterKey.Kind() {
		case InternalKeyKindDelete, InternalKeyKindSingleDelete:
			// If we're at the last snapshot stripe and the tombstone can be elided
			// skip skippable keys in the same stripe.
			if i.curSnapshotIdx == 0 && i.elideTombstone(i.iterKey.UserKey) {
				i.saveKey()
				i.skipInStripe()
				continue
			}

			switch i.iterKey.Kind() {
			case InternalKeyKindDelete:
				i.saveKey()
				i.value = i.iterValue
				i.valid = true
				i.skip = true
				return &i.key, i.value

			case InternalKeyKindSingleDelete:
				if i.singleDeleteNext() {
					return &i.key, i.value
				}

				continue
			}

		case InternalKeyKindSet, InternalKeyKindSetWithDelete:
			// The key we emit for this entry is a function of the current key
			// kind, and whether this entry is followed by a DEL/SINGLEDEL
			// entry. setNext() does the work to move the iterator forward,
			// preserving the original value, and potentially mutating the key
			// kind.
			i.setNext()
			return &i.key, i.value

		case InternalKeyKindMerge:
			// Record the snapshot index before mergeNext as merging
			// advances the iterator, adjusting curSnapshotIdx.
			origSnapshotIdx := i.curSnapshotIdx
			var valueMerger ValueMerger
			valueMerger, i.err = i.merge(i.iterKey.UserKey, i.iterValue)
			var change stripeChangeType
			if i.err == nil {
				change = i.mergeNext(valueMerger)
			}
			var needDelete bool
			if i.err == nil {
				// includesBase is true whenever we've transformed the MERGE record
				// into a SET.
				includesBase := i.key.Kind() == InternalKeyKindSet
				i.value, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, includesBase)
			}
			if i.err == nil {
				if needDelete {
					i.valid = false
					if i.closeValueCloser() != nil {
						return nil, nil
					}
					continue
				}
				// A non-skippable entry does not necessarily cover later merge
				// operands, so we must not zero the current merge result's seqnum.
				//
				// For example, suppose the forthcoming two keys are a range
				// tombstone, `[a, b)#3`, and a merge operand, `a#3`. Recall that
				// range tombstones do not cover point keys at the same seqnum, so
				// `a#3` is not deleted. The range tombstone will be seen first due
				// to its larger value type. Since it is a non-skippable key, the
				// current merge will not include `a#3`. If we zeroed the current
				// merge result's seqnum, then it would conflict with the upcoming
				// merge including `a#3`, whose seqnum will also be zeroed.
				if change != sameStripeNonSkippable {
					i.maybeZeroSeqnum(origSnapshotIdx)
				}
				return &i.key, i.value
			}
			if i.err != nil {
				i.valid = false
				i.err = base.MarkCorruptionError(i.err)
			}
			return nil, nil

		default:
			i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind()))
			i.valid = false
			return nil, nil
		}
	}

	return nil, nil
}

// closeValueCloser closes the closer associated with the current value, if
// any, recording any error on i.err and invalidating the iterator on
// failure. It returns the resulting error (nil when no closer was open).
func (i *compactionIter) closeValueCloser() error {
	if i.valueCloser == nil {
		return nil
	}

	i.err = i.valueCloser.Close()
	i.valueCloser = nil
	if i.err != nil {
		i.valid = false
	}
	return i.err
}

// snapshotIndex returns the index of the first sequence number in snapshots
// which is greater than or equal to seq.
func snapshotIndex(seq uint64, snapshots []uint64) (int, uint64) {
	index := sort.Search(len(snapshots), func(i int) bool {
		return snapshots[i] > seq
	})
	if index >= len(snapshots) {
		return index, InternalKeySeqNumMax
	}
	return index, snapshots[index]
}

// skipInStripe skips over skippable keys in the same stripe and user key.
func (i *compactionIter) skipInStripe() {
	i.skip = true
	var change stripeChangeType
	for {
		change = i.nextInStripe()
		if change == sameStripeNonSkippable || change == newStripe {
			break
		}
	}
	// Reset skip if we landed outside the original stripe. Otherwise, we landed
	// in the same stripe on a non-skippable key. In that case we should preserve
	// `i.skip == true` such that later keys in the stripe will continue to be
	// skipped.
	if change == newStripe {
		i.skip = false
	}
}

// iterNext advances the underlying iterator by one entry and reports
// whether a key was found.
func (i *compactionIter) iterNext() bool {
	i.iterKey, i.iterValue = i.iter.Next()
	return i.iterKey != nil
}

// stripeChangeType indicates how the snapshot stripe changed relative to the previous
// key. If no change, it also indicates whether the current entry is skippable.
type stripeChangeType int

const (
	newStripe stripeChangeType = iota
	sameStripeSkippable
	sameStripeNonSkippable
)

// nextInStripe advances the iterator and returns one of the above const ints
// indicating how its state changed.
//
// Calls to nextInStripe must be preceded by a call to saveKey to retain a
// temporary reference to the original key, so that forward iteration can
// proceed with a reference to the original key. Care should be taken to avoid
// overwriting or mutating the saved key or value before they have been returned
// to the caller of the exported function (i.e. the caller of Next, First, etc.)
func (i *compactionIter) nextInStripe() stripeChangeType {
	if !i.iterNext() {
		// Iterator exhausted; treated the same as landing in a new stripe.
		return newStripe
	}
	key := i.iterKey

	// NB: The below conditional is an optimization to avoid a user key
	// comparison in many cases. Internal keys with the same user key are
	// ordered in (strictly) descending order by trailer. If the new key has a
	// greater or equal trailer, or the previous key had a zero sequence number,
	// the new key must have a new user key.
	//
	// A couple things make these cases common:
	// - Sequence-number zeroing ensures ~all of the keys in L6 have a zero
	//   sequence number.
	// - Ingested sstables' keys all adopt the same sequence number.
	if i.keyTrailer <= base.InternalKeyZeroSeqnumMaxTrailer || key.Trailer >= i.keyTrailer {
		if invariants.Enabled && i.equal(i.key.UserKey, key.UserKey) {
			prevKey := i.key
			prevKey.Trailer = i.keyTrailer
			panic(fmt.Sprintf("bitalostable: invariant violation: %s and %s out of order", key, prevKey))
		}
		i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(key.SeqNum(), i.snapshots)
		return newStripe
	} else if !i.equal(i.key.UserKey, key.UserKey) {
		i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(key.SeqNum(), i.snapshots)
		return newStripe
	}
	origSnapshotIdx := i.curSnapshotIdx
	i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(key.SeqNum(), i.snapshots)
	switch key.Kind() {
	case InternalKeyKindRangeDelete:
		// Range tombstones need to be exposed by the compactionIter to the upper level
		// `compaction` object, so return them regardless of whether they are in the same
		// snapshot stripe.
		if i.curSnapshotIdx == origSnapshotIdx {
			return sameStripeNonSkippable
		}
		return newStripe
	case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
		// Range keys are interleaved at the max sequence number for a given user
		// key, so we should not see any more range keys in this stripe.
		panic("unreachable")
	case InternalKeyKindInvalid:
		if i.curSnapshotIdx == origSnapshotIdx {
			return sameStripeNonSkippable
		}
		return newStripe
	}
	if i.curSnapshotIdx == origSnapshotIdx {
		return sameStripeSkippable
	}
	return newStripe
}

// setNext saves the current SET/SETWITHDEL entry as the key to return and,
// when the format supports SETWITHDEL, scans forward within the stripe to
// determine whether the kind must be upgraded to SETWITHDEL because a
// DEL/SINGLEDEL (or a key that may hide one) follows.
func (i *compactionIter) setNext() {
	// Save the current key.
	i.saveKey()
	i.value = i.iterValue
	i.valid = true
	i.maybeZeroSeqnum(i.curSnapshotIdx)

	// There are two cases where we can early return and skip the remaining
	// records in the stripe:
	// - If the DB format does not support SETWITHDEL.
	// - If this key is already a SETWITHDEL.
	if i.formatVersion < FormatSetWithDelete ||
		i.iterKey.Kind() == InternalKeyKindSetWithDelete {
		i.skip = true
		return
	}

	// We are iterating forward. Save the current value.
	i.valueBuf = append(i.valueBuf[:0], i.iterValue...)
	i.value = i.valueBuf

	// Else, we continue to loop through entries in the stripe looking for a
	// DEL. Note that we may stop *before* encountering a DEL, if one exists.
	for {
		switch t := i.nextInStripe(); t {
		case newStripe, sameStripeNonSkippable:
			i.pos = iterPosNext
			if t == sameStripeNonSkippable {
				// We iterated onto a key that we cannot skip. We can
				// conservatively transform the original SET into a SETWITHDEL
				// as an indication that there *may* still be a DEL/SINGLEDEL
				// under this SET, even if we did not actually encounter one.
				//
				// This is safe to do, as:
				//
				// - in the case that there *is not* actually a DEL/SINGLEDEL
				//   under this entry, any SINGLEDEL above this now-transformed
				//   SETWITHDEL will become a DEL when the two encounter in a
				//   compaction. The DEL will eventually be elided in a
				//   subsequent compaction. The cost for ensuring correctness is
				//   that this entry is kept around for an additional compaction
				//   cycle(s).
				//
				// - in the case there *is* indeed a DEL/SINGLEDEL under us
				//   (but in a different stripe or sstable), then we will have
				//   already done the work to transform the SET into a
				//   SETWITHDEL, and we will skip any additional iteration when
				//   this entry is encountered again in a subsequent compaction.
				//
				// Ideally, this codepath would be smart enough to handle the
				// case of SET <- RANGEDEL <- ... <- DEL/SINGLEDEL <- ....
				// This requires preserving any RANGEDEL entries we encounter
				// along the way, then emitting the original (possibly
				// transformed) key, followed by the RANGEDELs. This requires
				// a sizable refactoring of the existing code, as nextInStripe
				// currently returns a sameStripeNonSkippable when it
				// encounters a RANGEDEL.
				// TODO(travers): optimize to handle the RANGEDEL case if it
				// turns out to be a performance problem.
				i.key.SetKind(InternalKeyKindSetWithDelete)

				// By setting i.skip=true, we are saying that after the
				// non-skippable key is emitted (which is likely a RANGEDEL),
				// the remaining point keys that share the same user key as this
				// saved key should be skipped.
				i.skip = true
			}
			return
		case sameStripeSkippable:
			// We're still in the same stripe. If this is a DEL/SINGLEDEL, we
			// stop looking and emit a SETWITHDEL. Subsequent keys are
			// eligible for skipping.
			if i.iterKey.Kind() == InternalKeyKindDelete ||
				i.iterKey.Kind() == InternalKeyKindSingleDelete {
				i.key.SetKind(InternalKeyKindSetWithDelete)
				i.skip = true
				return
			}
		default:
			panic("bitalostable: unexpected stripeChangeType: " + strconv.Itoa(int(t)))
		}
	}
}

// mergeNext scans forward within the current snapshot stripe, folding older
// values into valueMerger until it hits a SET/DEL (which turns the result
// into a SET), leaves the stripe, or encounters a non-skippable key. It
// returns how the stripe changed at the stopping point.
func (i *compactionIter) mergeNext(valueMerger ValueMerger) stripeChangeType {
	// Save the current key.
	i.saveKey()
	i.valid = true

	// Loop looking for older values in the current snapshot stripe and merge
	// them.
	for {
		if change := i.nextInStripe(); change == sameStripeNonSkippable || change == newStripe {
			i.pos = iterPosNext
			return change
		}
		key := i.iterKey
		switch key.Kind() {
		case InternalKeyKindDelete, InternalKeyKindSingleDelete:
			// We've hit a deletion tombstone. Return everything up to this point and
			// then skip entries until the next snapshot stripe. We change the kind
			// of the result key to a Set so that it shadows keys in lower
			// levels. That is, MERGE+DEL -> SET.
			// We do the same for SingleDelete since SingleDelete is only
			// permitted (with deterministic behavior) for keys that have been
			// set once since the last SingleDelete/Delete, so everything
			// older is acceptable to shadow. Note that this is slightly
			// different from singleDeleteNext() which implements stricter
			// semantics in terms of applying the SingleDelete to the single
			// next Set. But those stricter semantics are not observable to
			// the end-user since Iterator interprets SingleDelete as Delete.
			// We could do something more complicated here and consume only a
			// single Set, and then merge in any following Sets, but that is
			// complicated wrt code and unnecessary given the narrow permitted
			// use of SingleDelete.
			i.key.SetKind(InternalKeyKindSet)
			i.skip = true
			return sameStripeSkippable

		case InternalKeyKindSet, InternalKeyKindSetWithDelete:
			if i.rangeDelFrag.Covers(*key, i.curSnapshotSeqNum) {
				// We change the kind of the result key to a Set so that it shadows
				// keys in lower levels. That is, MERGE+RANGEDEL -> SET. This isn't
				// strictly necessary, but provides consistency with the behavior of
				// MERGE+DEL.
				i.key.SetKind(InternalKeyKindSet)
				i.skip = true
				return sameStripeSkippable
			}

			// We've hit a Set or SetWithDel value. Merge with the existing
			// value and return. We change the kind of the resulting key to a
			// Set so that it shadows keys in lower levels. That is:
			// MERGE + (SET*) -> SET.
			i.err = valueMerger.MergeOlder(i.iterValue)
			if i.err != nil {
				i.valid = false
				return sameStripeSkippable
			}
			i.key.SetKind(InternalKeyKindSet)
			i.skip = true
			return sameStripeSkippable

		case InternalKeyKindMerge:
			if i.rangeDelFrag.Covers(*key, i.curSnapshotSeqNum) {
				// We change the kind of the result key to a Set so that it shadows
				// keys in lower levels. That is, MERGE+RANGEDEL -> SET. This isn't
				// strictly necessary, but provides consistency with the behavior of
				// MERGE+DEL.
				i.key.SetKind(InternalKeyKindSet)
				i.skip = true
				return sameStripeSkippable
			}

			// We've hit another Merge value. Merge with the existing value and
			// continue looping.
			i.err = valueMerger.MergeOlder(i.iterValue)
			if i.err != nil {
				i.valid = false
				return sameStripeSkippable
			}

		default:
			i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind()))
			i.valid = false
			return sameStripeSkippable
		}
	}
}

// singleDeleteNext processes a SINGLEDEL entry, scanning forward within the
// stripe to decide its fate: it reports true when the saved key (possibly
// transformed into a DEL) should be emitted, and false when the SINGLEDEL
// consumed exactly one SET and both can be dropped.
func (i *compactionIter) singleDeleteNext() bool {
	// Save the current key.
	i.saveKey()
	i.value = i.iterValue
	i.valid = true

	// Loop until we find a key to be passed to the next level.
	for {
		if change := i.nextInStripe(); change == sameStripeNonSkippable || change == newStripe {
			i.pos = iterPosNext
			return true
		}

		key := i.iterKey
		switch key.Kind() {
		case InternalKeyKindDelete, InternalKeyKindMerge, InternalKeyKindSetWithDelete:
			// We've hit a Delete, Merge or SetWithDelete, transform the
			// SingleDelete into a full Delete.
			i.key.SetKind(InternalKeyKindDelete)
			i.skip = true
			return true

		case InternalKeyKindSet:
			// The SingleDelete annihilates this single Set: consume it and
			// report that neither key should be emitted.
			i.nextInStripe()
			i.valid = false
			return false

		case InternalKeyKindSingleDelete:
			continue

		default:
			i.err = base.CorruptionErrorf("invalid internal key kind: %d", errors.Safe(i.iterKey.Kind()))
			i.valid = false
			return false
		}
	}
}

// saveKey copies the current iterator user key into keyBuf and records its
// trailer, establishing i.key/i.keyTrailer as the saved key state described
// on the struct fields.
func (i *compactionIter) saveKey() {
	i.keyBuf = append(i.keyBuf[:0], i.iterKey.UserKey...)
	i.key.UserKey = i.keyBuf
	i.key.Trailer = i.iterKey.Trailer
	i.keyTrailer = i.iterKey.Trailer
}

// cloneKey returns a copy of key allocated from the iterator's byte
// allocator (used for tombstone keys that must outlive the input iterator).
func (i *compactionIter) cloneKey(key []byte) []byte {
	i.alloc, key = i.alloc.Copy(key)
	return key
}

// Key returns the current saved internal key.
func (i *compactionIter) Key() InternalKey {
	return i.key
}

// Value returns the value associated with the current key.
func (i *compactionIter) Value() []byte {
	return i.value
}

// Valid reports whether the iterator is positioned at a valid entry.
func (i *compactionIter) Valid() bool {
	return i.valid
}

// Error returns any error accumulated during iteration.
func (i *compactionIter) Error() error {
	return i.err
}

// Close closes the underlying iterator and any open value closer, returning
// the first error encountered.
func (i *compactionIter) Close() error {
	err := i.iter.Close()
	if i.err == nil {
		i.err = err
	}

	// Close the closer for the current value if one was open.
	if i.valueCloser != nil {
		i.err = firstError(i.err, i.valueCloser.Close())
		i.valueCloser = nil
	}

	return i.err
}

// Tombstones returns a list of pending range tombstones in the fragmenter
// up to the specified key, or all pending range tombstones if key = nil.
func (i *compactionIter) Tombstones(key []byte) []keyspan.Span {
	if key == nil {
		i.rangeDelFrag.Finish()
	} else {
		// The specified end key is exclusive; no versions of the specified
		// user key (including range tombstones covering that key) should
		// be flushed yet.
		i.rangeDelFrag.TruncateAndFlushTo(key)
	}
	tombstones := i.tombstones
	i.tombstones = nil
	return tombstones
}

// RangeKeys returns a list of pending fragmented range keys up to the specified
// key, or all pending range keys if key = nil.
func (i *compactionIter) RangeKeys(key []byte) []keyspan.Span {
	if key == nil {
		i.rangeKeyFrag.Finish()
	} else {
		// The specified end key is exclusive; no versions of the specified
		// user key (including range tombstones covering that key) should
		// be flushed yet.
		i.rangeKeyFrag.TruncateAndFlushTo(key)
	}
	rangeKeys := i.rangeKeys
	i.rangeKeys = nil
	return rangeKeys
}

// emitRangeDelChunk is the rangeDelFrag.Emit hook: it receives a fully
// fragmented span and appends it to i.tombstones after applying
// snapshot-stripe filtering and elision.
func (i *compactionIter) emitRangeDelChunk(fragmented keyspan.Span) {
	// Apply the snapshot stripe rules, keeping only the latest tombstone for
	// each snapshot stripe.
	currentIdx := -1
	keys := fragmented.Keys[:0]
	for _, k := range fragmented.Keys {
		idx, _ := snapshotIndex(k.SeqNum(), i.snapshots)
		if currentIdx == idx {
			continue
		}
		if idx == 0 && i.elideRangeTombstone(fragmented.Start, fragmented.End) {
			// This is the last snapshot stripe and the range tombstone
			// can be elided.
			break
		}

		keys = append(keys, k)
		if idx == 0 {
			// This is the last snapshot stripe.
			break
		}
		currentIdx = idx
	}
	if len(keys) > 0 {
		i.tombstones = append(i.tombstones, keyspan.Span{
			Start: fragmented.Start,
			End:   fragmented.End,
			Keys:  keys,
		})
	}
}

// emitRangeKeyChunk is the rangeKeyFrag.Emit hook: it appends each
// non-empty fragmented range-key span to i.rangeKeys.
func (i *compactionIter) emitRangeKeyChunk(fragmented keyspan.Span) {
	// Elision of snapshot stripes happens in rangeKeyCompactionTransform, so no need to
	// do that here.
	if len(fragmented.Keys) > 0 {
		i.rangeKeys = append(i.rangeKeys, fragmented)
	}
}

// maybeZeroSeqnum attempts to set the seqnum for the current key to 0. Doing
// so improves compression and enables an optimization during forward iteration
// to skip some key comparisons. The seqnum for an entry can be zeroed if the
// entry is on the bottom snapshot stripe and on the bottom level of the LSM.
func (i *compactionIter) maybeZeroSeqnum(snapshotIdx int) {
	if !i.allowZeroSeqNum {
		// TODO(peter): allowZeroSeqNum applies to the entire compaction. We could
		// make the determination on a key by key basis, similar to what is done
		// for elideTombstone. Need to add a benchmark for compactionIter to verify
		// that isn't too expensive.
		return
	}
	if snapshotIdx > 0 {
		// This is not the last snapshot
		return
	}
	i.key.SetSeqNum(0)
}