github.com/petermattis/pebble@v0.0.0-20190905164901-ab51a2166067/compaction_iter.go (about) 1 // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package pebble 6 7 import ( 8 "fmt" 9 "sort" 10 11 "github.com/petermattis/pebble/internal/bytealloc" 12 "github.com/petermattis/pebble/internal/rangedel" 13 ) 14 15 // compactionIter provides a forward-only iterator that encapsulates the logic 16 // for collapsing entries during compaction. It wraps an internal iterator and 17 // collapses entries that are no longer necessary because they are shadowed by 18 // newer entries. The simplest example of this is when the internal iterator 19 // contains two keys: a.PUT.2 and a.PUT.1. Instead of returning both entries, 20 // compactionIter collapses the second entry because it is no longer 21 // necessary. The high-level structure for compactionIter is to iterate over 22 // its internal iterator and output 1 entry for every user-key. There are four 23 // complications to this story. 24 // 25 // 1. Eliding Deletion Tombstones 26 // 27 // Consider the entries a.DEL.2 and a.PUT.1. These entries collapse to 28 // a.DEL.2. Do we have to output the entry a.DEL.2? Only if a.DEL.2 possibly 29 // shadows an entry at a lower level. If we're compacting to the base-level in 30 // the LSM tree then a.DEL.2 is definitely not shadowing an entry at a lower 31 // level and can be elided. 32 // 33 // We can do slightly better than only eliding deletion tombstones at the base 34 // level by observing that we can elide a deletion tombstone if there are no 35 // sstables that contain the entry's key. This check is performed by 36 // elideTombstone. 37 // 38 // 2. Merges 39 // 40 // The MERGE operation merges the value for an entry with the existing value 41 // for an entry. The logical value of an entry can be composed of a series of 42 // merge operations. When compactionIter sees a MERGE, it scans forward in its 43 // internal iterator collapsing MERGE operations for the same key until it 44 // encounters a SET or DELETE operation. For example, the keys a.MERGE.4, 45 // a.MERGE.3, a.MERGE.2 will be collapsed to a.MERGE.4 and the values will be 46 // merged using the specified Merger. 47 // 48 // An interesting case here occurs when MERGE is combined with SET. Consider 49 // the entries a.MERGE.3 and a.SET.2. The collapsed key will be a.SET.3. The 50 // reason that the kind is changed to SET is because the SET operation acts as 51 // a barrier preventing further merging. This can be seen better in the 52 // scenario a.MERGE.3, a.SET.2, a.MERGE.1. The entry a.MERGE.1 may be at lower 53 // (older) level and not involved in the compaction. If the compaction of 54 // a.MERGE.3 and a.SET.2 produced a.MERGE.3, a subsequent compaction with 55 // a.MERGE.1 would merge the values together incorrectly. 56 // 57 // 3. Snapshots 58 // 59 // Snapshots are lightweight point-in-time views of the DB state. At its core, 60 // a snapshot is a sequence number along with a guarantee from Pebble that it 61 // will maintain the view of the database at that sequence number. Part of this 62 // guarantee is relatively straightforward to achieve. When reading from the 63 // database Pebble will ignore sequence numbers that are larger than the 64 // snapshot sequence number. The primary complexity with snapshots occurs 65 // during compaction: the collapsing of entries that are shadowed by newer 66 // entries is at odds with the guarantee that Pebble will maintain the view of 67 // the database at the snapshot sequence number. Rather than collapsing entries 68 // up to the next user key, compactionIter can only collapse entries up to the 69 // next snapshot boundary. That is, every snapshot boundary potentially causes 70 // another entry for the same user-key to be emitted. Another way to view this 71 // is that snapshots define stripes and entries are collapsed within stripes, 72 // but not across stripes. Consider the following scenario: 73 // 74 // a.PUT.9 75 // a.DEL.8 76 // a.PUT.7 77 // a.DEL.6 78 // a.PUT.5 79 // 80 // In the absence of snapshots these entries would be collapsed to 81 // a.PUT.9. What if there is a snapshot at sequence number 7? The entries can 82 // be divided into two stripes and collapsed within the stripes: 83 // 84 // a.PUT.9 a.PUT.9 85 // a.DEL.8 ---> 86 // a.PUT.7 87 // -- -- 88 // a.DEL.6 ---> a.DEL.6 89 // a.PUT.5 90 // 91 // All of the rules described earlier still apply, but they are confined to 92 // operate within a snapshot stripe. Snapshots only affect compaction when the 93 // snapshot sequence number lies within the range of sequence numbers being 94 // compacted. In the above example, a snapshot at sequence number 10 or at 95 // sequence number 5 would not have any effect. 96 // 97 // 4. Range Deletions 98 // 99 // Range deletions provide the ability to delete all of the keys (and values) 100 // in a contiguous range. Range deletions are stored indexed by their start 101 // key. The end key of the range is stored in the value. In order to support 102 // lookup of the range deletions which overlap with a particular key, the range 103 // deletion tombstones need to be fragmented whenever they overlap. This 104 // fragmentation is performed by rangedel.Fragmenter. The fragments are then 105 // subject to the rules for snapshots. For example, consider the two range 106 // tombstones [a,e)#1 and [c,g)#2: 107 // 108 // 2: c-------g 109 // 1: a-------e 110 // 111 // These tombstones will be fragmented into: 112 // 113 // 2: c---e---g 114 // 1: a---c---e 115 // 116 // Do we output the fragment [c,e)#1? Since it is covered by [c-e]#2 the answer 117 // depends on whether it is in a new snapshot stripe. 118 // 119 // In addition to the fragmentation of range tombstones, compaction also needs 120 // to take the range tombstones into consideration when outputting normal 121 // keys. Just as with point deletions, a range deletion covering an entry can 122 // cause the entry to be elided. 123 type compactionIter struct { 124 cmp Compare 125 merge Merge 126 iter internalIterator 127 err error 128 key InternalKey 129 value []byte 130 // Temporary buffer used for storing the previous user key in order to 131 // determine when iteration has advanced to a new user key and thus a new 132 // snapshot stripe. 133 keyBuf []byte 134 // Temporary buffer used for aggregating merge operations. 135 valueBuf []byte 136 // Is the current entry valid? 137 valid bool 138 iterKey *InternalKey 139 iterValue []byte 140 // Skip indicates whether the remaining entries in the current snapshot 141 // stripe should be skipped or processed. Skipped is true at the start of a 142 // stripe and set to false afterwards. 143 skip bool 144 // The index of the snapshot for the current key within the snapshots slice. 145 curSnapshotIdx int 146 curSnapshotSeqNum uint64 147 // The snapshot sequence numbers that need to be maintained. These sequence 148 // numbers define the snapshot stripes (see the Snapshots description 149 // above). The sequence numbers are in ascending order. 150 snapshots []uint64 151 // The range deletion tombstone fragmenter. 152 rangeDelFrag rangedel.Fragmenter 153 // The fragmented tombstones. 154 tombstones []rangedel.Tombstone 155 // Byte allocator for the tombstone keys. 156 alloc bytealloc.A 157 allowZeroSeqNum bool 158 elideTombstone func(key []byte) bool 159 elideRangeTombstone func(start, end []byte) bool 160 } 161 162 func newCompactionIter( 163 cmp Compare, 164 merge Merge, 165 iter internalIterator, 166 snapshots []uint64, 167 allowZeroSeqNum bool, 168 elideTombstone func(key []byte) bool, 169 elideRangeTombstone func(start, end []byte) bool, 170 ) *compactionIter { 171 i := &compactionIter{ 172 cmp: cmp, 173 merge: merge, 174 iter: iter, 175 snapshots: snapshots, 176 allowZeroSeqNum: allowZeroSeqNum, 177 elideTombstone: elideTombstone, 178 elideRangeTombstone: elideRangeTombstone, 179 } 180 i.rangeDelFrag.Cmp = cmp 181 i.rangeDelFrag.Emit = i.emitRangeDelChunk 182 return i 183 } 184 185 func (i *compactionIter) First() (*InternalKey, []byte) { 186 if i.err != nil { 187 return nil, nil 188 } 189 i.iterKey, i.iterValue = i.iter.First() 190 if i.iterKey != nil { 191 i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(i.iterKey.SeqNum(), i.snapshots) 192 } 193 return i.Next() 194 } 195 196 func (i *compactionIter) Next() (*InternalKey, []byte) { 197 if i.err != nil { 198 return nil, nil 199 } 200 201 if i.skip { 202 i.skip = false 203 i.skipStripe() 204 } 205 206 i.valid = false 207 for i.iterKey != nil { 208 i.key = *i.iterKey 209 switch i.key.Kind() { 210 case InternalKeyKindDelete: 211 // If we're at the last snapshot stripe and the tombstone can be elided 212 // skip to the next stripe (which will be the next user key). 213 if i.curSnapshotIdx == 0 && i.elideTombstone(i.key.UserKey) { 214 i.saveKey() 215 i.skipStripe() 216 continue 217 } 218 219 i.saveKey() 220 i.value = i.iterValue 221 i.valid = true 222 i.skip = true 223 return &i.key, i.value 224 225 case InternalKeyKindRangeDelete: 226 i.key = i.cloneKey(i.key) 227 i.rangeDelFrag.Add(i.key, i.iterValue) 228 i.nextInStripe() 229 continue 230 231 case InternalKeyKindSet: 232 if i.rangeDelFrag.Deleted(i.key, i.curSnapshotSeqNum) { 233 i.saveKey() 234 i.skipStripe() 235 continue 236 } 237 238 i.saveKey() 239 i.value = i.iterValue 240 i.valid = true 241 i.skip = true 242 i.maybeZeroSeqnum() 243 return &i.key, i.value 244 245 case InternalKeyKindMerge: 246 if i.rangeDelFrag.Deleted(i.key, i.curSnapshotSeqNum) { 247 i.saveKey() 248 i.skipStripe() 249 continue 250 } 251 252 // NB: it is important to call maybeZeroSeqnum before mergeNext as 253 // merging advances the iterator, adjusting curSnapshotIdx and thus 254 // invalidating the state that maybeZeroSeqnum uses to make its 255 // determination. 256 i.maybeZeroSeqnum() 257 return i.mergeNext() 258 259 case InternalKeyKindInvalid: 260 // NB: Invalid keys occur when there is some error parsing the key. Pass 261 // them through unmodified. 262 i.saveKey() 263 i.saveValue() 264 i.iterKey, i.iterValue = i.iter.Next() 265 i.valid = true 266 return &i.key, i.value 267 268 default: 269 i.err = fmt.Errorf("invalid internal key kind: %d", i.key.Kind()) 270 return nil, nil 271 } 272 } 273 274 return nil, nil 275 } 276 277 // snapshotIndex returns the index of the first sequence number in snapshots 278 // which is greater than or equal to seq. 279 func snapshotIndex(seq uint64, snapshots []uint64) (int, uint64) { 280 index := sort.Search(len(snapshots), func(i int) bool { 281 return snapshots[i] > seq 282 }) 283 if index >= len(snapshots) { 284 return index, InternalKeySeqNumMax 285 } 286 return index, snapshots[index] 287 } 288 289 func (i *compactionIter) skipStripe() { 290 for i.nextInStripe() { 291 } 292 } 293 294 func (i *compactionIter) nextInStripe() bool { 295 i.iterKey, i.iterValue = i.iter.Next() 296 if i.iterKey == nil { 297 return false 298 } 299 key := i.iterKey 300 if i.cmp(i.key.UserKey, key.UserKey) != 0 { 301 i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(key.SeqNum(), i.snapshots) 302 return false 303 } 304 switch key.Kind() { 305 case InternalKeyKindRangeDelete: 306 // Range tombstones are always added to the fragmenter. They are processed 307 // into stripes after fragmentation. 308 i.rangeDelFrag.Add(i.cloneKey(*key), i.iterValue) 309 return true 310 case InternalKeyKindInvalid: 311 i.curSnapshotIdx, i.curSnapshotSeqNum = snapshotIndex(key.SeqNum(), i.snapshots) 312 return false 313 } 314 if len(i.snapshots) == 0 { 315 return true 316 } 317 idx, seqNum := snapshotIndex(key.SeqNum(), i.snapshots) 318 if i.curSnapshotIdx == idx { 319 return true 320 } 321 i.curSnapshotIdx = idx 322 i.curSnapshotSeqNum = seqNum 323 return false 324 } 325 326 func (i *compactionIter) mergeNext() (*InternalKey, []byte) { 327 // Save the current key and value. 328 i.saveKey() 329 i.saveValue() 330 i.valid = true 331 332 // Loop looking for older values in the current snapshot stripe and merge 333 // them. 334 for { 335 if !i.nextInStripe() { 336 i.skip = false 337 return &i.key, i.value 338 } 339 key := i.iterKey 340 switch key.Kind() { 341 case InternalKeyKindDelete: 342 // We've hit a deletion tombstone. Return everything up to this point and 343 // then skip entries until the next snapshot stripe. 344 i.valueBuf = i.value[:0] 345 i.skip = true 346 return &i.key, i.value 347 348 case InternalKeyKindRangeDelete: 349 // We've hit a range deletion tombstone. Return everything up to this 350 // point and then skip entries until the next snapshot stripe. 351 i.skip = true 352 return &i.key, i.value 353 354 case InternalKeyKindSet: 355 if i.rangeDelFrag.Deleted(*key, i.curSnapshotSeqNum) { 356 i.skip = true 357 return &i.key, i.value 358 } 359 360 // We've hit a Set value. Merge with the existing value and return. We 361 // change the kind of the resulting key to a Set so that it shadows keys 362 // in lower levels. That is, MERGE+MERGE+SET -> SET. 363 i.value = i.merge(i.key.UserKey, i.value, i.iterValue, nil) 364 i.valueBuf = i.value[:0] 365 i.key.SetKind(InternalKeyKindSet) 366 i.skip = true 367 return &i.key, i.value 368 369 case InternalKeyKindMerge: 370 if i.rangeDelFrag.Deleted(*key, i.curSnapshotSeqNum) { 371 i.skip = true 372 return &i.key, i.value 373 } 374 375 // We've hit another Merge value. Merge with the existing value and 376 // continue looping. 377 i.value = i.merge(i.key.UserKey, i.value, i.iterValue, nil) 378 i.valueBuf = i.value[:0] 379 380 default: 381 i.err = fmt.Errorf("invalid internal key kind: %d", i.iterKey.Kind()) 382 return nil, nil 383 } 384 } 385 } 386 387 func (i *compactionIter) saveKey() { 388 i.keyBuf = append(i.keyBuf[:0], i.iterKey.UserKey...) 389 i.key.UserKey = i.keyBuf 390 } 391 392 func (i *compactionIter) saveValue() { 393 i.valueBuf = append(i.valueBuf[:0], i.iterValue...) 394 i.value = i.valueBuf 395 } 396 397 func (i *compactionIter) cloneKey(key InternalKey) InternalKey { 398 i.alloc, key.UserKey = i.alloc.Copy(key.UserKey) 399 return key 400 } 401 402 func (i *compactionIter) Key() InternalKey { 403 return i.key 404 } 405 406 func (i *compactionIter) Value() []byte { 407 return i.value 408 } 409 410 func (i *compactionIter) Valid() bool { 411 return i.valid 412 } 413 414 func (i *compactionIter) Error() error { 415 return i.err 416 } 417 418 func (i *compactionIter) Close() error { 419 err := i.iter.Close() 420 if i.err == nil { 421 i.err = err 422 } 423 return i.err 424 } 425 426 func (i *compactionIter) Tombstones(key []byte) []rangedel.Tombstone { 427 if key == nil { 428 i.rangeDelFrag.Finish() 429 } else { 430 i.rangeDelFrag.FlushTo(key) 431 } 432 tombstones := i.tombstones 433 i.tombstones = nil 434 return tombstones 435 } 436 437 func (i *compactionIter) emitRangeDelChunk(fragmented []rangedel.Tombstone) { 438 // Apply the snapshot stripe rules, keeping only the latest tombstone for 439 // each snapshot stripe. 440 currentIdx := -1 441 for _, v := range fragmented { 442 idx, _ := snapshotIndex(v.Start.SeqNum(), i.snapshots) 443 if currentIdx == idx { 444 continue 445 } 446 if idx == 0 && i.elideRangeTombstone(v.Start.UserKey, v.End) { 447 // This is the last snapshot stripe and the range tombstone can be 448 // elided. 449 break 450 } 451 452 i.tombstones = append(i.tombstones, v) 453 if idx == 0 { 454 // This is the last snapshot stripe. 455 break 456 } 457 currentIdx = idx 458 } 459 } 460 461 // maybeZeroSeqnum attempts to set the seqnum for the current key to 0. Doing 462 // so improves compression and enables an optimization during forward iteration 463 // to skip some key comparisons. The seqnum for an entry can be zeroed if the 464 // entry is on the bottom snapshot stripe and on the bottom level of the LSM. 465 func (i *compactionIter) maybeZeroSeqnum() { 466 if !i.allowZeroSeqNum { 467 // TODO(peter): allowZeroSeqNum applies to the entire compaction. We could 468 // make the determination on a key by key basis, similar to what is done 469 // for elideTombstone. Need to add a benchmark for compactionIter to verify 470 // that isn't too expensive. 471 return 472 } 473 if i.curSnapshotIdx > 0 { 474 // This is not the last snapshot 475 return 476 } 477 i.key.SetSeqNum(0) 478 }