github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/merging_iter.go (about) 1 // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package pebble 6 7 import ( 8 "bytes" 9 "context" 10 "fmt" 11 "runtime/debug" 12 "unsafe" 13 14 "github.com/cockroachdb/errors" 15 "github.com/cockroachdb/pebble/internal/base" 16 "github.com/cockroachdb/pebble/internal/invariants" 17 "github.com/cockroachdb/pebble/internal/keyspan" 18 ) 19 20 type mergingIterLevel struct { 21 index int 22 iter internalIterator 23 // rangeDelIter is set to the range-deletion iterator for the level. When 24 // configured with a levelIter, this pointer changes as sstable boundaries 25 // are crossed. See levelIter.initRangeDel and the Range Deletions comment 26 // below. 27 rangeDelIter keyspan.FragmentIterator 28 // iterKey and iterValue cache the current key and value iter are pointed at. 29 iterKey *InternalKey 30 iterValue base.LazyValue 31 // levelIter is non-nil if this level's iter is ultimately backed by a 32 // *levelIter. The handle in iter may have wrapped the levelIter with 33 // intermediary internalIterator implementations. 34 levelIter *levelIter 35 36 // levelIterBoundaryContext's fields are set when using levelIter, in order 37 // to surface sstable boundary keys and file-level context. See levelIter 38 // comment and the Range Deletions comment below. 39 levelIterBoundaryContext 40 41 // tombstone caches the tombstone rangeDelIter is currently pointed at. If 42 // tombstone is nil, there are no further tombstones within the 43 // current sstable in the current iterator direction. The cached tombstone is 44 // only valid for the levels in the range [0,heap[0].index]. This avoids 45 // positioning tombstones at lower levels which cannot possibly shadow the 46 // current key. 47 tombstone *keyspan.Span 48 } 49 50 type levelIterBoundaryContext struct { 51 // smallestUserKey and largestUserKey are populated with the smallest and 52 // largest boundaries of the current file. 53 smallestUserKey, largestUserKey []byte 54 // isLargestUserKeyExclusive is set to true when a file's largest boundary 55 // is an exclusive key, (eg, a range deletion sentinel). If true, the file 56 // does not contain any keys with the provided user key, and the 57 // largestUserKey bound is exclusive. 58 isLargestUserKeyExclusive bool 59 // isSyntheticIterBoundsKey is set to true iff the key returned by the level 60 // iterator is a synthetic key derived from the iterator bounds. This is used 61 // to prevent the mergingIter from being stuck at such a synthetic key if it 62 // becomes the top element of the heap. When used with a user-facing Iterator, 63 // the only range deletions exposed by this mergingIter should be those with 64 // `isSyntheticIterBoundsKey || isIgnorableBoundaryKey`. 65 isSyntheticIterBoundsKey bool 66 // isIgnorableBoundaryKey is set to true iff the key returned by the level 67 // iterator is a file boundary key that should be ignored when returning to 68 // the parent iterator. File boundary keys are used by the level iter to 69 // keep a levelIter file's range deletion iterator open as long as other 70 // levels within the merging iterator require it. When used with a user-facing 71 // Iterator, the only range deletions exposed by this mergingIter should be 72 // those with `isSyntheticIterBoundsKey || isIgnorableBoundaryKey`. 
73 isIgnorableBoundaryKey bool 74 } 75 76 // mergingIter provides a merged view of multiple iterators from different 77 // levels of the LSM. 78 // 79 // The core of a mergingIter is a heap of internalIterators (see 80 // mergingIterHeap). The heap can operate as either a min-heap, used during 81 // forward iteration (First, SeekGE, Next) or a max-heap, used during reverse 82 // iteration (Last, SeekLT, Prev). The heap is initialized in calls to First, 83 // Last, SeekGE, and SeekLT. A call to Next or Prev takes the current top 84 // element on the heap, advances its iterator, and then "fixes" the heap 85 // property. When one of the child iterators is exhausted during Next/Prev 86 // iteration, it is removed from the heap. 87 // 88 // # Range Deletions 89 // 90 // A mergingIter can optionally be configured with a slice of range deletion 91 // iterators. The range deletion iterator slice must exactly parallel the point 92 // iterators and the range deletion iterator must correspond to the same level 93 // in the LSM as the point iterator. Note that each memtable and each table in 94 // L0 is a different "level" from the mergingIter perspective. So level 0 below 95 // does not correspond to L0 in the LSM. 96 // 97 // A range deletion iterator iterates over fragmented range tombstones. Range 98 // tombstones are fragmented by splitting them at any overlapping points. This 99 // fragmentation guarantees that within an sstable tombstones will either be 100 // distinct or will have identical start and end user keys. While range 101 // tombstones are fragmented within an sstable, the start and end keys are not truncated 102 // to sstable boundaries. This is necessary because the tombstone end key is 103 // exclusive and does not have a sequence number. Consider an sstable 104 // containing the range tombstone [a,c)#9 and the key "b#8". The tombstone must 105 // delete "b#8", yet older versions of "b" might spill over to the next 106 // sstable. So the boundary key for this sstable must be "b#8". Adjusting the 107 // end key of tombstones to be optionally inclusive or contain a sequence 108 // number would be possible solutions (such solutions have potentially serious 109 // issues: tombstones have exclusive end keys since an inclusive deletion end can 110 // be converted to an exclusive one while the reverse transformation is not possible; 111 // the semantics of a sequence number for the end key of a range tombstone are murky). 112 // 113 // The approach taken here performs an 114 // implicit truncation of the tombstone to the sstable boundaries. 115 // 116 // During initialization of a mergingIter, the range deletion iterators for 117 // batches, memtables, and L0 tables are populated up front. Note that Batches 118 // and memtables index unfragmented tombstones. Batch.newRangeDelIter() and 119 // memTable.newRangeDelIter() fragment and cache the tombstones on demand. The 120 // L1-L6 range deletion iterators are populated by levelIter. When configured 121 // to load range deletion iterators, whenever a levelIter loads a table it 122 // loads both the point iterator and the range deletion 123 // iterator. levelIter.rangeDelIter is configured to point to the right entry 124 // in mergingIter.levels. The effect of this setup is that 125 // mergingIter.levels[i].rangeDelIter always contains the fragmented range 126 // tombstone for the current table in level i that the levelIter has open. 
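//
// As an aside, the fragmentation described above can be illustrated with a small
// standalone sketch. This is not the package's fragmentation code (that work is
// done by keyspan.Fragmenter and operates on internal keys); the string-keyed
// types below are hypothetical and exist only to show the shape of the
// transformation:
//
//	type tomb struct {
//		start, end string
//		seq        uint64
//	}
//
//	// fragment splits two overlapping tombstones at their overlap points so
//	// that the resulting fragments either share identical start/end user keys
//	// or are disjoint, mirroring the within-sstable guarantee above.
//	func fragment(a, b tomb) []tomb {
//		bounds := []string{a.start, a.end, b.start, b.end}
//		sort.Strings(bounds)
//		var out []tomb
//		for i := 0; i+1 < len(bounds); i++ {
//			lo, hi := bounds[i], bounds[i+1]
//			if lo == hi {
//				continue
//			}
//			for _, t := range []tomb{a, b} {
//				if t.start <= lo && hi <= t.end {
//					out = append(out, tomb{lo, hi, t.seq})
//				}
//			}
//		}
//		return out
//	}
//
//	// fragment(tomb{"a", "e", 9}, tomb{"c", "g", 7}) yields
//	// [a,c)#9, [c,e)#9, [c,e)#7 and [e,g)#7.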
127 // 128 // Another crucial mechanism of levelIter is that it materializes fake point 129 // entries for the table boundaries if the boundary is range deletion 130 // key. Consider a table that contains only a range tombstone [a-e)#10. The 131 // sstable boundaries for this table will be a#10,15 and 132 // e#72057594037927935,15. During forward iteration levelIter will return 133 // e#72057594037927935,15 as a key. During reverse iteration levelIter will 134 // return a#10,15 as a key. These sentinel keys act as bookends to point 135 // iteration and allow mergingIter to keep a table and its associated range 136 // tombstones loaded as long as there are keys at lower levels that are within 137 // the bounds of the table. 138 // 139 // The final piece to the range deletion puzzle is the LSM invariant that for a 140 // given key K newer versions of K can only exist earlier in the level, or at 141 // higher levels of the tree. For example, if K#4 exists in L3, k#5 can only 142 // exist earlier in the L3 or in L0, L1, L2 or a memtable. Get very explicitly 143 // uses this invariant to find the value for a key by walking the LSM level by 144 // level. For range deletions, this invariant means that a range deletion at 145 // level N will necessarily shadow any keys within its bounds in level Y where 146 // Y > N. One wrinkle to this statement is that it only applies to keys that 147 // lie within the sstable bounds as well, but we get that guarantee due to the 148 // way the range deletion iterator and point iterator are bound together by a 149 // levelIter. 150 // 151 // Tying the above all together, we get a picture where each level (index in 152 // mergingIter.levels) is composed of both point operations (pX) and range 153 // deletions (rX). The range deletions for level X shadow both the point 154 // operations and range deletions for level Y where Y > X allowing mergingIter 155 // to skip processing entries in that shadow. For example, consider the 156 // scenario: 157 // 158 // r0: a---e 159 // r1: d---h 160 // r2: g---k 161 // r3: j---n 162 // r4: m---q 163 // 164 // This is showing 5 levels of range deletions. Consider what happens upon 165 // SeekGE("b"). We first seek the point iterator for level 0 (the point values 166 // are not shown above) and we then seek the range deletion iterator. That 167 // returns the tombstone [a,e). This tombstone tells us that all keys in the 168 // range [a,e) in lower levels are deleted so we can skip them. So we can 169 // adjust the seek key to "e", the tombstone end key. For level 1 we seek to 170 // "e" and find the range tombstone [d,h) and similar logic holds. By the time 171 // we get to level 4 we're seeking to "n". 172 // 173 // One consequence of not truncating tombstone end keys to sstable boundaries 174 // is the seeking process described above cannot always seek to the tombstone 175 // end key in the older level. For example, imagine in the above example r3 is 176 // a partitioned level (i.e., L1+ in our LSM), and the sstable containing [j, 177 // n) has "k" as its upper boundary. In this situation, compactions involving 178 // keys at or after "k" can output those keys to r4+, even if they're newer 179 // than our tombstone [j, n). So instead of seeking to "n" in r4 we can only 180 // seek to "k". To achieve this, the instance variable `largestUserKey.` 181 // maintains the upper bounds of the current sstables in the partitioned 182 // levels. 
In this example, `levels[3].largestUserKey` holds "k", telling us to 183 // limit the seek triggered by a tombstone in r3 to "k". 184 // 185 // During actual iteration levels can contain both point operations and range 186 // deletions. Within a level, when a range deletion contains a point operation 187 // the sequence numbers must be checked to determine if the point operation is 188 // newer or older than the range deletion tombstone. The mergingIter maintains 189 // the invariant that the range deletion iterators for all levels newer than 190 // the current iteration key (L < m.heap.items[0].index) are positioned at the 191 // next (or previous during reverse iteration) range deletion tombstone. We 192 // know those levels don't contain a range deletion tombstone that covers the 193 // current key because if they did the current key would be deleted. The range 194 // deletion iterator for the current key's level is positioned at a range 195 // tombstone covering or past the current key. The position of all of the other 196 // range deletion iterators is unspecified. Whenever a key from those levels 197 // becomes the current key, their range deletion iterators need to be 198 // positioned. This lazy positioning avoids seeking the range deletion 199 // iterators for keys that are never considered. (A similar bit of lazy 200 // evaluation can be done for the point iterators, but is still TBD). 201 // 202 // For a full example, consider the following setup: 203 // 204 // p0: o 205 // r0: m---q 206 // 207 // p1: n p 208 // r1: g---k 209 // 210 // p2: b d i 211 // r2: a---e q----v 212 // 213 // p3: e 214 // r3: 215 // 216 // If we start iterating from the beginning, the first key we encounter is "b" 217 // in p2. When the mergingIter is pointing at a valid entry, the range deletion 218 // iterators for all of the levels < m.heap.items[0].index are positioned at 219 // the next range tombstone past the current key. So r0 will point at [m,q) and 220 // r1 at [g,k). When the key "b" is encountered, we check to see if the current 221 // tombstone for r0 or r1 contains it, and whether the tombstone for r2, [a,e), 222 // contains and is newer than "b". 223 // 224 // Advancing the iterator finds the next key at "d". This is in the same level 225 // as the previous key "b" so we don't have to reposition any of the range 226 // deletion iterators, but merely check whether "d" is now contained by any of 227 // the range tombstones at higher levels or has stepped past the range 228 // tombstone in its own level or higher levels. In this case, there is nothing to be done. 229 // 230 // Advancing the iterator again finds "e". Since "e" comes from p3, we have to 231 // position the r3 range deletion iterator, which is empty. "e" is past the r2 232 // tombstone of [a,e) so we need to advance the r2 range deletion iterator to 233 // [q,v). 234 // 235 // The next key is "i". Because this key is in p2, a level above "e", we don't 236 // have to reposition any range deletion iterators and instead see that "i" is 237 // covered by the range tombstone [g,k). The iterator is immediately advanced 238 // to "n" which is covered by the range tombstone [m,q) causing the iterator to 239 // advance to "o" which is visible. 240 // 241 // TODO(peter,rangedel): For testing, advance the iterator through various 242 // scenarios and have each step display the current state (i.e. the current 243 // heap and range-del iterator positioning).
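//
// The cascading adjustment of the seek key walked through in the SeekGE("b")
// example above can also be sketched in isolation. The snippet below is a
// standalone simplification (string keys, no sequence numbers, no visibility or
// file-boundary truncation, all of which the real seekGE below must handle);
// the span type and tombs values are hypothetical:
//
//	type span struct{ start, end string }
//	// The tombstone, if any, that each level's range-del iterator finds for
//	// the current seek key (r0 through r4 in the figure above).
//	tombs := []span{{"a", "e"}, {"d", "h"}, {"g", "k"}, {"j", "n"}, {"m", "q"}}
//	key := "b"
//	for i, t := range tombs {
//		fmt.Printf("level %d seeks to %q\n", i, key) // b, e, h, k, n
//		if t.start <= key && key < t.end {
//			// Everything in [t.start, t.end) is deleted at all older levels,
//			// so older levels can be sought directly to the tombstone's end.
//			key = t.end
//		}
//	}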
244 type mergingIter struct { 245 logger Logger 246 split Split 247 dir int 248 snapshot uint64 249 batchSnapshot uint64 250 levels []mergingIterLevel 251 heap mergingIterHeap 252 err error 253 prefix []byte 254 lower []byte 255 upper []byte 256 stats *InternalIteratorStats 257 258 // levelsPositioned, if non-nil, is a slice of the same length as levels. 259 // It's used by NextPrefix to record which levels have already been 260 // repositioned. It's created lazily by the first call to NextPrefix. 261 levelsPositioned []bool 262 263 combinedIterState *combinedIterState 264 265 // Used in some tests to disable the random disabling of seek optimizations. 266 forceEnableSeekOpt bool 267 } 268 269 // mergingIter implements the base.InternalIterator interface. 270 var _ base.InternalIterator = (*mergingIter)(nil) 271 272 // newMergingIter returns an iterator that merges its input. Walking the 273 // resultant iterator will return all key/value pairs of all input iterators 274 // in strictly increasing key order, as defined by cmp. It is permissible to 275 // pass a nil split parameter if the caller is never going to call 276 // SeekPrefixGE. 277 // 278 // The input's key ranges may overlap, but there are assumed to be no duplicate 279 // keys: if iters[i] contains a key k then iters[j] will not contain that key k. 280 // 281 // None of the iters may be nil. 282 func newMergingIter( 283 logger Logger, 284 stats *base.InternalIteratorStats, 285 cmp Compare, 286 split Split, 287 iters ...internalIterator, 288 ) *mergingIter { 289 m := &mergingIter{} 290 levels := make([]mergingIterLevel, len(iters)) 291 for i := range levels { 292 levels[i].iter = iters[i] 293 } 294 m.init(&IterOptions{logger: logger}, stats, cmp, split, levels...) 295 return m 296 } 297 298 func (m *mergingIter) init( 299 opts *IterOptions, 300 stats *base.InternalIteratorStats, 301 cmp Compare, 302 split Split, 303 levels ...mergingIterLevel, 304 ) { 305 m.err = nil // clear cached iteration error 306 m.logger = opts.getLogger() 307 if opts != nil { 308 m.lower = opts.LowerBound 309 m.upper = opts.UpperBound 310 } 311 m.snapshot = InternalKeySeqNumMax 312 m.batchSnapshot = InternalKeySeqNumMax 313 m.levels = levels 314 m.heap.cmp = cmp 315 m.split = split 316 m.stats = stats 317 if cap(m.heap.items) < len(levels) { 318 m.heap.items = make([]*mergingIterLevel, 0, len(levels)) 319 } else { 320 m.heap.items = m.heap.items[:0] 321 } 322 for l := range m.levels { 323 m.levels[l].index = l 324 } 325 } 326 327 func (m *mergingIter) initHeap() { 328 m.heap.items = m.heap.items[:0] 329 for i := range m.levels { 330 if l := &m.levels[i]; l.iterKey != nil { 331 m.heap.items = append(m.heap.items, l) 332 } else { 333 m.err = firstError(m.err, l.iter.Error()) 334 if m.err != nil { 335 return 336 } 337 } 338 } 339 m.heap.init() 340 } 341 342 func (m *mergingIter) initMinHeap() { 343 m.dir = 1 344 m.heap.reverse = false 345 m.initHeap() 346 m.initMinRangeDelIters(-1) 347 } 348 349 // The level of the previous top element was oldTopLevel. Note that all range delete 350 // iterators < oldTopLevel are positioned past the key of the previous top element and 351 // the range delete iterator == oldTopLevel is positioned at or past the key of the 352 // previous top element. We need to position the range delete iterators from oldTopLevel + 1 353 // to the level of the current top element. 
354 func (m *mergingIter) initMinRangeDelIters(oldTopLevel int) { 355 if m.heap.len() == 0 { 356 return 357 } 358 359 // Position the range-del iterators at levels <= m.heap.items[0].index. 360 item := m.heap.items[0] 361 for level := oldTopLevel + 1; level <= item.index; level++ { 362 l := &m.levels[level] 363 if l.rangeDelIter == nil { 364 continue 365 } 366 l.tombstone = l.rangeDelIter.SeekGE(item.iterKey.UserKey) 367 } 368 } 369 370 func (m *mergingIter) initMaxHeap() { 371 m.dir = -1 372 m.heap.reverse = true 373 m.initHeap() 374 m.initMaxRangeDelIters(-1) 375 } 376 377 // The level of the previous top element was oldTopLevel. Note that all range delete 378 // iterators < oldTopLevel are positioned before the key of the previous top element and 379 // the range delete iterator == oldTopLevel is positioned at or before the key of the 380 // previous top element. We need to position the range delete iterators from oldTopLevel + 1 381 // to the level of the current top element. 382 func (m *mergingIter) initMaxRangeDelIters(oldTopLevel int) { 383 if m.heap.len() == 0 { 384 return 385 } 386 // Position the range-del iterators at levels <= m.heap.items[0].index. 387 item := m.heap.items[0] 388 for level := oldTopLevel + 1; level <= item.index; level++ { 389 l := &m.levels[level] 390 if l.rangeDelIter == nil { 391 continue 392 } 393 l.tombstone = keyspan.SeekLE(m.heap.cmp, l.rangeDelIter, item.iterKey.UserKey) 394 } 395 } 396 397 func (m *mergingIter) switchToMinHeap() { 398 if m.heap.len() == 0 { 399 if m.lower != nil { 400 m.SeekGE(m.lower, base.SeekGEFlagsNone) 401 } else { 402 m.First() 403 } 404 return 405 } 406 407 // We're switching from using a max heap to a min heap. We need to advance 408 // any iterator that is less than or equal to the current key. Consider the 409 // scenario where we have 2 iterators being merged (user-key:seq-num): 410 // 411 // i1: *a:2 b:2 412 // i2: a:1 b:1 413 // 414 // The current key is a:2 and i2 is pointed at a:1. When we switch to forward 415 // iteration, we want to return a key that is greater than a:2. 416 417 key := m.heap.items[0].iterKey 418 cur := m.heap.items[0] 419 420 for i := range m.levels { 421 l := &m.levels[i] 422 if l == cur { 423 continue 424 } 425 426 // If the iterator is exhausted, it may be out of bounds if range 427 // deletions modified our search key as we descended. We need to 428 // reposition it within the search bounds. If the current key is a 429 // range tombstone, the iterator might still be exhausted but at a 430 // sstable boundary sentinel. It would be okay to reposition an 431 // iterator like this only through successive Next calls, except that 432 // it would violate the levelIter's invariants by causing it to return 433 // a key before the lower bound. 434 // 435 // bounds = [ f, _ ) 436 // L0: [ b ] [ f* z ] 437 // L1: [ a |----| k y ] 438 // L2: [ c (d) ] [ e g m ] 439 // L3: [ x ] 440 // 441 // * - current key [] - table bounds () - heap item 442 // 443 // In the above diagram, the L2 iterator is positioned at a sstable 444 // boundary (d) outside the lower bound (f). It arrived here from a 445 // seek whose seek-key was modified by a range tombstone. If we called 446 // Next on the L2 iterator, it would return e, violating its lower 447 // bound. Instead, we seek it to >= f and Next from there.
448 449 if l.iterKey == nil || (m.lower != nil && l.isSyntheticIterBoundsKey && 450 l.iterKey.IsExclusiveSentinel() && 451 m.heap.cmp(l.iterKey.UserKey, m.lower) <= 0) { 452 if m.lower != nil { 453 l.iterKey, l.iterValue = l.iter.SeekGE(m.lower, base.SeekGEFlagsNone) 454 } else { 455 l.iterKey, l.iterValue = l.iter.First() 456 } 457 } 458 for ; l.iterKey != nil; l.iterKey, l.iterValue = l.iter.Next() { 459 if base.InternalCompare(m.heap.cmp, *key, *l.iterKey) < 0 { 460 // key < iter-key 461 break 462 } 463 // key >= iter-key 464 } 465 } 466 467 // Special handling for the current iterator because we were using its key 468 // above. The iterator cur.iter may still be exhausted at a sstable boundary 469 // sentinel. Similar to the logic applied to the other levels, in these 470 // cases we seek the iterator to the first key in order to avoid violating 471 // levelIter's invariants. See the example in the for loop above. 472 if m.lower != nil && cur.isSyntheticIterBoundsKey && cur.iterKey.IsExclusiveSentinel() && 473 m.heap.cmp(cur.iterKey.UserKey, m.lower) <= 0 { 474 cur.iterKey, cur.iterValue = cur.iter.SeekGE(m.lower, base.SeekGEFlagsNone) 475 } else { 476 cur.iterKey, cur.iterValue = cur.iter.Next() 477 } 478 m.initMinHeap() 479 } 480 481 func (m *mergingIter) switchToMaxHeap() { 482 if m.heap.len() == 0 { 483 if m.upper != nil { 484 m.SeekLT(m.upper, base.SeekLTFlagsNone) 485 } else { 486 m.Last() 487 } 488 return 489 } 490 491 // We're switching from using a min heap to a max heap. We need to back up any 492 // iterator that is greater than or equal to the current key. Consider the 493 // scenario where we have 2 iterators being merged (user-key:seq-num): 494 // 495 // i1: a:2 *b:2 496 // i2: a:1 b:1 497 // 498 // The current key is b:2 and i2 is pointing at b:1. When we switch to 499 // reverse iteration, we want to return a key that is less than b:2. 500 key := m.heap.items[0].iterKey 501 cur := m.heap.items[0] 502 503 for i := range m.levels { 504 l := &m.levels[i] 505 if l == cur { 506 continue 507 } 508 509 // If the iterator is exhausted, it may be out of bounds if range 510 // deletions modified our search key as we descended. We need to 511 // reposition it within the search bounds. If the current key is a 512 // range tombstone, the iterator might still be exhausted but at a 513 // sstable boundary sentinel. It would be okay to reposition an 514 // iterator like this only through successive Prev calls, except that 515 // it would violate the levelIter's invariants by causing it to return 516 // a key beyond the upper bound. 517 // 518 // bounds = [ _, g ) 519 // L0: [ b ] [ f* z ] 520 // L1: [ a |-------| k y ] 521 // L2: [ c d ] h [(i) m ] 522 // L3: [ e x ] 523 // 524 // * - current key [] - table bounds () - heap item 525 // 526 // In the above diagram, the L2 iterator is positioned at a sstable 527 // boundary (i) outside the upper bound (g). It arrived here from a 528 // seek whose seek-key was modified by a range tombstone. If we called 529 // Prev on the L2 iterator, it would return h, violating its upper 530 // bound. Instead, we seek it to < g, and Prev from there.
531 532 if l.iterKey == nil || (m.upper != nil && l.isSyntheticIterBoundsKey && 533 l.iterKey.IsExclusiveSentinel() && m.heap.cmp(l.iterKey.UserKey, m.upper) >= 0) { 534 if m.upper != nil { 535 l.iterKey, l.iterValue = l.iter.SeekLT(m.upper, base.SeekLTFlagsNone) 536 } else { 537 l.iterKey, l.iterValue = l.iter.Last() 538 } 539 } 540 for ; l.iterKey != nil; l.iterKey, l.iterValue = l.iter.Prev() { 541 if base.InternalCompare(m.heap.cmp, *key, *l.iterKey) > 0 { 542 // key > iter-key 543 break 544 } 545 // key <= iter-key 546 } 547 } 548 549 // Special handling for the current iterator because we were using its key 550 // above. The iterator cur.iter may still be exhausted at a sstable boundary 551 // sentinel. Similar to the logic applied to the other levels, in these 552 // cases we seek the iterator to the last key in order to avoid violating 553 // levelIter's invariants by Prev-ing through files. See the example in the for loop 554 // above. 555 if m.upper != nil && cur.isSyntheticIterBoundsKey && cur.iterKey.IsExclusiveSentinel() && 556 m.heap.cmp(cur.iterKey.UserKey, m.upper) >= 0 { 557 cur.iterKey, cur.iterValue = cur.iter.SeekLT(m.upper, base.SeekLTFlagsNone) 558 } else { 559 cur.iterKey, cur.iterValue = cur.iter.Prev() 560 } 561 m.initMaxHeap() 562 } 563 564 // maybeNextEntryWithinPrefix steps to the next entry, as long as the iteration 565 // prefix has not already been exceeded. If it has, it exhausts the iterator by 566 // resetting the heap to empty. 567 func (m *mergingIter) maybeNextEntryWithinPrefix(l *mergingIterLevel) { 568 if s := m.split(l.iterKey.UserKey); !bytes.Equal(m.prefix, l.iterKey.UserKey[:s]) { 569 // The item at the root of the heap already exceeds the iteration 570 // prefix. We should not advance any more. Clear the heap to reflect 571 // that the iterator is now exhausted (within this prefix, at 572 // least). 573 m.heap.items = m.heap.items[:0] 574 return 575 } 576 m.nextEntry(l, nil /* succKey */) 577 } 578 579 // nextEntry unconditionally steps to the next entry. l is the current top 580 // item in the heap. 581 // 582 // nextEntry should be called directly when not in prefix-iteration mode, or by 583 // Next. During prefix iteration mode, all other callers should use 584 // maybeNextEntryWithinPrefix which will avoid advancing the iterator if the 585 // current iteration prefix has been exhausted. See the comment within 586 // nextEntry's body for an explanation of why other callers should call 587 // maybeNextEntryWithinPrefix, which will ensure the documented invariant is 588 // preserved. 589 func (m *mergingIter) nextEntry(l *mergingIterLevel, succKey []byte) { 590 // INVARIANT: If in prefix iteration mode, l.iterKey must have a prefix equal 591 // to m.prefix. This invariant is important for ensuring TrySeekUsingNext 592 // optimizations behave correctly. 593 // 594 // During prefix iteration, the iterator does not have a full view of the 595 // LSM. Some level iterators may omit keys that are known to fall outside 596 // the seek prefix (eg, due to sstable bloom filter exclusion). It's 597 // important that in such cases we don't position any iterators beyond 598 // m.prefix, because doing so may interfere with future seeks. 599 // 600 // Let prefixes P1 < P2 < P3. Imagine a SeekPrefixGE to prefix P1, followed 601 // by a SeekPrefixGE to prefix P2. Imagine there exist live keys at prefix 602 // P2, but they're not visible to the SeekPrefixGE(P1) (because of 603 // bloom-filter exclusion or a range tombstone that deletes prefix P1 but 604 // not P2).
If the SeekPrefixGE(P1) is allowed to move any level iterators 605 // to P3, the SeekPrefixGE(P2, TrySeekUsingNext=true) may mistakenly think 606 // the level contains no point keys or range tombstones within the prefix 607 // P2. Care is taken to avoid ever advancing the iterator beyond the current 608 // prefix. If nextEntry is ever invoked while we're already beyond the 609 // current prefix, we're violating the invariant. 610 if invariants.Enabled && m.prefix != nil { 611 if s := m.split(l.iterKey.UserKey); !bytes.Equal(m.prefix, l.iterKey.UserKey[:s]) { 612 m.logger.Fatalf("mergingIter: prefix violation: nexting beyond prefix %q; existing heap root %q\n%s", 613 m.prefix, l.iterKey, debug.Stack()) 614 } 615 } 616 617 oldTopLevel := l.index 618 oldRangeDelIter := l.rangeDelIter 619 620 if succKey == nil { 621 l.iterKey, l.iterValue = l.iter.Next() 622 } else { 623 l.iterKey, l.iterValue = l.iter.NextPrefix(succKey) 624 } 625 626 if l.iterKey != nil { 627 if m.heap.len() > 1 { 628 m.heap.fix(0) 629 } 630 if l.rangeDelIter != oldRangeDelIter { 631 // The rangeDelIter changed which indicates that the l.iter moved to the 632 // next sstable. We have to update the tombstone for oldTopLevel as well. 633 oldTopLevel-- 634 } 635 } else { 636 m.err = l.iter.Error() 637 if m.err == nil { 638 m.heap.pop() 639 } 640 } 641 642 // The cached tombstones are only valid for the levels 643 // [0,oldTopLevel]. Update the cached tombstones for any levels in the range 644 // [oldTopLevel+1,heap[0].index]. 645 m.initMinRangeDelIters(oldTopLevel) 646 } 647 648 // isNextEntryDeleted starts from the current entry (as the next entry) and if 649 // it is deleted, moves the iterators forward as needed and returns true, else 650 // it returns false. item is the top item in the heap. 651 // 652 // During prefix iteration mode, isNextEntryDeleted will exhaust the iterator by 653 // clearing the heap if the deleted key(s) extend beyond the iteration 654 // prefix. 655 func (m *mergingIter) isNextEntryDeleted(item *mergingIterLevel) bool { 656 // Look for a range deletion tombstone containing item.iterKey at higher 657 // levels (level < item.index). If we find such a range tombstone we know 658 // it deletes the key in the current level. Also look for a range 659 // deletion at the current level (level == item.index). If we find such a 660 // range deletion we need to check whether it is newer than the current 661 // entry. 662 for level := 0; level <= item.index; level++ { 663 l := &m.levels[level] 664 if l.rangeDelIter == nil || l.tombstone == nil { 665 // If l.tombstone is nil, there are no further tombstones 666 // in the current sstable in the current (forward) iteration 667 // direction. 668 continue 669 } 670 if m.heap.cmp(l.tombstone.End, item.iterKey.UserKey) <= 0 { 671 // The current key is at or past the tombstone end key. 672 // 673 // NB: for the case that this l.rangeDelIter is provided by a levelIter we know that 674 // the levelIter must be positioned at a key >= item.iterKey. So it is sufficient to seek the 675 // current l.rangeDelIter (since any range del iterators that will be provided by the 676 // levelIter in the future cannot contain item.iterKey). Also, it is possible that we 677 // will encounter parts of the range delete that should be ignored -- we handle that 678 // below.
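//
// Schematically, the test applied below to decide whether a visible tombstone
// from this level deletes the heap-top key can be condensed as follows. This is
// a simplified restatement for orientation, not a separate implementation: it
// folds together the level < item.index and level == item.index cases, which
// the real code keeps apart because a covering tombstone from a strictly higher
// level never needs the sequence-number comparison:
//
//	func deletes(l *mergingIterLevel, key *base.InternalKey, snapshot uint64, cmp base.Compare) bool {
//		t := l.tombstone
//		// For untruncated tombstones surfaced through a levelIter, the key must
//		// also lie within the file's smallest bound for the tombstone to apply.
//		inFileBounds := l.smallestUserKey == nil || cmp(l.smallestUserKey, key.UserKey) <= 0
//		return t != nil && inFileBounds && t.VisibleAt(snapshot) &&
//			t.Contains(cmp, key.UserKey) && t.CoversAt(snapshot, key.SeqNum())
//	}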
679 l.tombstone = l.rangeDelIter.SeekGE(item.iterKey.UserKey) 680 } 681 if l.tombstone == nil { 682 continue 683 } 684 685 // Reasoning for correctness of untruncated tombstone handling when the untruncated 686 // tombstone is at a higher level: 687 // The iterator corresponding to this tombstone is still in the heap so it must be 688 // positioned >= item.iterKey. Which means the Largest key bound of the sstable containing this 689 // tombstone is >= item.iterKey. So the upper limit of this tombstone cannot be file-bounds-constrained 690 // to < item.iterKey. But it is possible that item.key < smallestUserKey, in which 691 // case this tombstone should be ignored. 692 // 693 // Example 1: 694 // sstable bounds [c#8, g#12] containing a tombstone [b, i)#7, and key is c#6. The 695 // smallestUserKey is c, so we know the key is within the file bounds and the tombstone 696 // [b, i) covers it. 697 // 698 // Example 2: 699 // Same sstable bounds but key is b#10. The smallestUserKey is c, so the tombstone [b, i) 700 // does not cover this key. 701 // 702 // For a tombstone at the same level as the key, the file bounds are trivially satisfied. 703 if (l.smallestUserKey == nil || m.heap.cmp(l.smallestUserKey, item.iterKey.UserKey) <= 0) && 704 l.tombstone.VisibleAt(m.snapshot) && l.tombstone.Contains(m.heap.cmp, item.iterKey.UserKey) { 705 if level < item.index { 706 // We could also do m.seekGE(..., level + 1). The levels from 707 // [level + 1, item.index) are already after item.iterKey so seeking them may be 708 // wasteful. 709 710 // We can seek up to the min of largestUserKey and tombstone.End. 711 // 712 // Using example 1 above, we can seek to the smaller of g and i, which is g. 713 // 714 // Another example, where the sstable bounds are [c#8, i#InternalRangeDelSentinel], 715 // and the tombstone is [b, i)#8. Seeking to i is correct since it is seeking up to 716 // the exclusive bound of the tombstone. We do not need to look at 717 // isLargestKeyRangeDelSentinel. 718 // 719 // Progress argument: Since this file is at a higher level than item.iterKey we know 720 // that the iterator in this file must be positioned within its bounds and at a key 721 // X > item.iterKey (otherwise it would be the min of the heap). It is not 722 // possible for X.UserKey == item.iterKey.UserKey, since it is incompatible with 723 // X > item.iterKey (a lower version cannot be in a higher sstable), so it must be that 724 // X.UserKey > item.iterKey.UserKey. Which means l.largestUserKey > item.key.UserKey. 725 // We also know that l.tombstone.End > item.iterKey.UserKey. So the min of these, 726 // seekKey, computed below, is > item.iterKey.UserKey, so the call to seekGE() will 727 // make forward progress. 728 seekKey := l.tombstone.End 729 if l.largestUserKey != nil && m.heap.cmp(l.largestUserKey, seekKey) < 0 { 730 seekKey = l.largestUserKey 731 } 732 // This seek is not directly due to a SeekGE call, so we don't know 733 // enough about the underlying iterator positions, and so we keep the 734 // try-seek-using-next optimization disabled. Additionally, if we're in 735 // prefix-seek mode and a re-seek would have moved us past the original 736 // prefix, we can remove all merging iter levels below the rangedel 737 // tombstone's level and return immediately instead of re-seeking. This 738 // is correct since those levels cannot provide a key that matches the 739 // prefix, and is also visible. 
Additionally, this is important to make 740 // subsequent `TrySeekUsingNext` work correctly, as a re-seek on a 741 // different prefix could have resulted in this iterator skipping visible 742 // keys at prefixes in between m.prefix and seekKey, that are currently 743 // not in the heap due to a bloom filter mismatch. 744 // 745 // Additionally, we set the relative-seek flag. This is 746 // important when iterating with lazy combined iteration. If 747 // there's a range key between this level's current file and the 748 // file the seek will land on, we need to detect it in order to 749 // trigger construction of the combined iterator. 750 if m.prefix != nil { 751 if n := m.split(seekKey); !bytes.Equal(m.prefix, seekKey[:n]) { 752 for i := item.index; i < len(m.levels); i++ { 753 // Remove this level from the heap. Setting iterKey and iterValue 754 // to their zero values should be sufficient for initMinHeap to not 755 // re-initialize the heap with them in it. Other fields in 756 // mergingIterLevel can remain as-is; the iter/rangeDelIter needs 757 // to stay intact for future trySeekUsingNexts to work, the level 758 // iter boundary context is owned by the levelIter which is not 759 // being repositioned, and any tombstones in these levels will be 760 // irrelevant for us anyway. 761 m.levels[i].iterKey = nil 762 m.levels[i].iterValue = base.LazyValue{} 763 } 764 // TODO(bilal): Consider a more efficient way of removing levels from 765 // the heap without reinitializing all of it. This would likely 766 // necessitate tracking the heap positions of each mergingIterHeap 767 // item in the mergingIterLevel, and then swapping that item in the 768 // heap with the last-positioned heap item, and shrinking the heap by 769 // one. 770 m.initMinHeap() 771 return true 772 } 773 } 774 m.seekGE(seekKey, item.index, base.SeekGEFlagsNone.EnableRelativeSeek()) 775 return true 776 } 777 if l.tombstone.CoversAt(m.snapshot, item.iterKey.SeqNum()) { 778 if m.prefix == nil { 779 m.nextEntry(item, nil /* succKey */) 780 } else { 781 m.maybeNextEntryWithinPrefix(item) 782 } 783 return true 784 } 785 } 786 } 787 return false 788 } 789 790 // Starting from the current entry, finds the first (next) entry that can be returned. 791 func (m *mergingIter) findNextEntry() (*InternalKey, base.LazyValue) { 792 for m.heap.len() > 0 && m.err == nil { 793 item := m.heap.items[0] 794 if m.levels[item.index].isSyntheticIterBoundsKey { 795 break 796 } 797 798 m.addItemStats(item) 799 800 // Skip ignorable boundary keys. These are not real keys and exist to 801 // keep sstables open until we've surpassed their end boundaries so that 802 // their range deletions are visible. 803 if m.levels[item.index].isIgnorableBoundaryKey { 804 if m.prefix == nil { 805 m.nextEntry(item, nil /* succKey */) 806 } else { 807 m.maybeNextEntryWithinPrefix(item) 808 } 809 continue 810 } 811 812 // Check if the heap root key is deleted by a range tombstone in a 813 // higher level. If it is, isNextEntryDeleted will advance the iterator 814 // to a later key (through seeking or nexting). 815 if m.isNextEntryDeleted(item) { 816 m.stats.PointsCoveredByRangeTombstones++ 817 continue 818 } 819 820 // Check if the key is visible at the iterator sequence numbers. 821 if !item.iterKey.Visible(m.snapshot, m.batchSnapshot) { 822 if m.prefix == nil { 823 m.nextEntry(item, nil /* succKey */) 824 } else { 825 m.maybeNextEntryWithinPrefix(item) 826 } 827 continue 828 } 829 830 // The heap root is visible and not deleted by any range tombstones. 831 // Return it. 
832 return item.iterKey, item.iterValue 833 } 834 return nil, base.LazyValue{} 835 } 836 837 // Steps to the prev entry. l is the current top item in the heap. 838 func (m *mergingIter) prevEntry(l *mergingIterLevel) { 839 oldTopLevel := l.index 840 oldRangeDelIter := l.rangeDelIter 841 if l.iterKey, l.iterValue = l.iter.Prev(); l.iterKey != nil { 842 if m.heap.len() > 1 { 843 m.heap.fix(0) 844 } 845 if l.rangeDelIter != oldRangeDelIter && l.rangeDelIter != nil { 846 // The rangeDelIter changed which indicates that the l.iter moved to the 847 // previous sstable. We have to update the tombstone for oldTopLevel as 848 // well. 849 oldTopLevel-- 850 } 851 } else { 852 m.err = l.iter.Error() 853 if m.err == nil { 854 m.heap.pop() 855 } 856 } 857 858 // The cached tombstones are only valid for the levels 859 // [0,oldTopLevel]. Update the cached tombstones for any levels in the range 860 // [oldTopLevel+1,heap[0].index]. 861 m.initMaxRangeDelIters(oldTopLevel) 862 } 863 864 // isPrevEntryDeleted() starts from the current entry (as the prev entry) and if it is deleted, 865 // moves the iterators backward as needed and returns true, else it returns false. item is the top 866 // item in the heap. 867 func (m *mergingIter) isPrevEntryDeleted(item *mergingIterLevel) bool { 868 // Look for a range deletion tombstone containing item.iterKey at higher 869 // levels (level < item.index). If we find such a range tombstone we know 870 // it deletes the key in the current level. Also look for a range 871 // deletion at the current level (level == item.index). If we find such a 872 // range deletion we need to check whether it is newer than the current 873 // entry. 874 for level := 0; level <= item.index; level++ { 875 l := &m.levels[level] 876 if l.rangeDelIter == nil || l.tombstone == nil { 877 // If l.tombstone is nil, there are no further tombstones 878 // in the current sstable in the current (reverse) iteration 879 // direction. 880 continue 881 } 882 if m.heap.cmp(item.iterKey.UserKey, l.tombstone.Start) < 0 { 883 // The current key is before the tombstone start key. 884 // 885 // NB: for the case that this l.rangeDelIter is provided by a levelIter we know that 886 // the levelIter must be positioned at a key < item.iterKey. So it is sufficient to seek the 887 // current l.rangeDelIter (since any range del iterators that will be provided by the 888 // levelIter in the future cannot contain item.iterKey). Also, it is possible that we 889 // will encounter parts of the range delete that should be ignored -- we handle that 890 // below. 891 l.tombstone = keyspan.SeekLE(m.heap.cmp, l.rangeDelIter, item.iterKey.UserKey) 892 } 893 if l.tombstone == nil { 894 continue 895 } 896 897 // Reasoning for correctness of untruncated tombstone handling when the untruncated 898 // tombstone is at a higher level: 899 // 900 // The iterator corresponding to this tombstone is still in the heap so it must be 901 // positioned <= item.iterKey. Which means the Smallest key bound of the sstable containing this 902 // tombstone is <= item.iterKey. So the lower limit of this tombstone cannot have been 903 // file-bounds-constrained to > item.iterKey. But it is possible that item.key >= Largest 904 // key bound of this sstable, in which case this tombstone should be ignored. 905 // 906 // Example 1: 907 // sstable bounds [c#8, g#12] containing a tombstone [b, i)#7, and key is f#6. The 908 // largestUserKey is g, so we know the key is within the file bounds and the tombstone 909 // [b, i) covers it.
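//
// Rendering Example 1 with the concrete checks performed further below
// (hypothetical values; the actual withinLargestSSTableBound computation
// appears after Examples 2 and 3):
//
//	l.largestUserKey = []byte("g")          // file bound g#12
//	l.isLargestUserKeyExclusive = false
//	key := []byte("f")                      // item.iterKey.UserKey for f#6
//	c := m.heap.cmp(l.largestUserKey, key)  // c > 0
//	within := c > 0 || (c == 0 && !l.isLargestUserKeyExclusive) // true
//	// Since the tombstone [b, i)#7 also Contains("f"), is VisibleAt the
//	// snapshot, and CoversAt sequence number 6, f#6 is treated as deleted.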
910 // 911 // Example 2: 912 // Same sstable but the key is g#6. This cannot happen since the [b, i)#7 untruncated 913 // tombstone was involved in a compaction which must have had a file to the right of this 914 // sstable that is part of the same atomic compaction group for future compactions. That 915 // file must have bounds that cover g#6 and this levelIter must be at that file. 916 // 917 // Example 3: 918 // sstable bounds [c#8, g#RangeDelSentinel] containing [b, i)#7 and the key is g#10. 919 // This key is not deleted by this tombstone. We need to look at 920 // isLargestUserKeyExclusive. 921 // 922 // For a tombstone at the same level as the key, the file bounds are trivially satisfied. 923 924 // Default to within bounds. 925 withinLargestSSTableBound := true 926 if l.largestUserKey != nil { 927 cmpResult := m.heap.cmp(l.largestUserKey, item.iterKey.UserKey) 928 withinLargestSSTableBound = cmpResult > 0 || (cmpResult == 0 && !l.isLargestUserKeyExclusive) 929 } 930 if withinLargestSSTableBound && l.tombstone.Contains(m.heap.cmp, item.iterKey.UserKey) && l.tombstone.VisibleAt(m.snapshot) { 931 if level < item.index { 932 // We could also do m.seekLT(..., level + 1). The levels from 933 // [level + 1, item.index) are already before item.iterKey so seeking them may be 934 // wasteful. 935 936 // We can seek up to the max of smallestUserKey and tombstone.Start.UserKey. 937 // 938 // Using example 1 above, we can seek to the larger of c and b, which is c. 939 // 940 // Progress argument: We know that the iterator in this file is positioned within 941 // its bounds and at a key X < item.iterKey (otherwise it would be the max of the heap). 942 // So smallestUserKey <= item.iterKey.UserKey and we already know that 943 // l.tombstone.Start.UserKey <= item.iterKey.UserKey. So the seekKey computed below 944 // is <= item.iterKey.UserKey, and since we do a seekLT() we will make backwards 945 // progress. 946 seekKey := l.tombstone.Start 947 if l.smallestUserKey != nil && m.heap.cmp(l.smallestUserKey, seekKey) > 0 { 948 seekKey = l.smallestUserKey 949 } 950 // We set the relative-seek flag. This is important when 951 // iterating with lazy combined iteration. If there's a range 952 // key between this level's current file and the file the seek 953 // will land on, we need to detect it in order to trigger 954 // construction of the combined iterator. 955 m.seekLT(seekKey, item.index, base.SeekLTFlagsNone.EnableRelativeSeek()) 956 return true 957 } 958 if l.tombstone.CoversAt(m.snapshot, item.iterKey.SeqNum()) { 959 m.prevEntry(item) 960 return true 961 } 962 } 963 } 964 return false 965 } 966 967 // Starting from the current entry, finds the first (prev) entry that can be returned. 968 func (m *mergingIter) findPrevEntry() (*InternalKey, base.LazyValue) { 969 for m.heap.len() > 0 && m.err == nil { 970 item := m.heap.items[0] 971 if m.levels[item.index].isSyntheticIterBoundsKey { 972 break 973 } 974 m.addItemStats(item) 975 if m.isPrevEntryDeleted(item) { 976 m.stats.PointsCoveredByRangeTombstones++ 977 continue 978 } 979 if item.iterKey.Visible(m.snapshot, m.batchSnapshot) && 980 (!m.levels[item.index].isIgnorableBoundaryKey) { 981 return item.iterKey, item.iterValue 982 } 983 m.prevEntry(item) 984 } 985 return nil, base.LazyValue{} 986 } 987 988 // Seeks levels >= level to >= key. Additionally uses range tombstones to extend the seeks. 
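//
// The per-level adjustment performed inside seekGE can be summarized as follows
// (a condensed restatement of the loop body below, not a separate
// implementation):
//
//	l.tombstone = rangeDelIter.SeekGE(key)
//	if t := l.tombstone; t != nil && t.VisibleAt(m.snapshot) && t.Contains(m.heap.cmp, key) &&
//		(l.smallestUserKey == nil || m.heap.cmp(l.smallestUserKey, key) <= 0) {
//		// Skip everything the tombstone deletes at older levels, but never
//		// past the current file's largest user key, which bounds what this
//		// level's untruncated tombstone is allowed to delete.
//		key = t.End
//		if l.largestUserKey != nil && m.heap.cmp(l.largestUserKey, t.End) < 0 {
//			key = l.largestUserKey
//		}
//	}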
989 func (m *mergingIter) seekGE(key []byte, level int, flags base.SeekGEFlags) { 990 // When seeking, we can use tombstones to adjust the key we seek to on each 991 // level. Consider the series of range tombstones: 992 // 993 // 1: a---e 994 // 2: d---h 995 // 3: g---k 996 // 4: j---n 997 // 5: m---q 998 // 999 // If we SeekGE("b") we also find the tombstone "b" resides within in the 1000 // first level which is [a,e). Regardless of whether this tombstone deletes 1001 // "b" in that level, we know it deletes "b" in all lower levels, so we 1002 // adjust the search key in the next level to the tombstone end key "e". We 1003 // then SeekGE("e") in the second level and find the corresponding tombstone 1004 // [d,h). This process continues and we end up seeking for "h" in the 3rd 1005 // level, "k" in the 4th level and "n" in the last level. 1006 // 1007 // TODO(peter,rangedel): In addition to the above we can delay seeking a 1008 // level (and any lower levels) when the current iterator position is 1009 // contained within a range tombstone at a higher level. 1010 1011 // Deterministically disable the TrySeekUsingNext optimizations sometimes in 1012 // invariant builds to encourage the metamorphic tests to surface bugs. Note 1013 // that we cannot disable the optimization within individual levels. It must 1014 // be disabled for all levels or none. If one lower-level iterator performs 1015 // a fresh seek whereas another takes advantage of its current iterator 1016 // position, the heap can become inconsistent. Consider the following 1017 // example: 1018 // 1019 // L5: [ [b-c) ] [ d ]* 1020 // L6: [ b ] [e]* 1021 // 1022 // Imagine a SeekGE(a). The [b-c) range tombstone deletes the L6 point key 1023 // 'b', resulting in the iterator positioned at d with the heap: 1024 // 1025 // {L5: d, L6: e} 1026 // 1027 // A subsequent SeekGE(b) is seeking to a larger key, so the caller may set 1028 // TrySeekUsingNext()=true. If the L5 iterator used the TrySeekUsingNext 1029 // optimization but the L6 iterator did not, the iterator would have the 1030 // heap: 1031 // 1032 // {L6: b, L5: d} 1033 // 1034 // Because the L5 iterator has already advanced to the next sstable, the 1035 // merging iterator cannot observe the [b-c) range tombstone and will 1036 // mistakenly return L6's deleted point key 'b'. 1037 if invariants.Enabled && flags.TrySeekUsingNext() && !m.forceEnableSeekOpt && 1038 disableSeekOpt(key, uintptr(unsafe.Pointer(m))) { 1039 flags = flags.DisableTrySeekUsingNext() 1040 } 1041 1042 for ; level < len(m.levels); level++ { 1043 if invariants.Enabled && m.lower != nil && m.heap.cmp(key, m.lower) < 0 { 1044 m.logger.Fatalf("mergingIter: lower bound violation: %s < %s\n%s", key, m.lower, debug.Stack()) 1045 } 1046 1047 l := &m.levels[level] 1048 if m.prefix != nil { 1049 l.iterKey, l.iterValue = l.iter.SeekPrefixGE(m.prefix, key, flags) 1050 } else { 1051 l.iterKey, l.iterValue = l.iter.SeekGE(key, flags) 1052 } 1053 1054 // If this level contains overlapping range tombstones, alter the seek 1055 // key accordingly. Caveat: If we're performing lazy-combined iteration, 1056 // we cannot alter the seek key: Range tombstones don't delete range 1057 // keys, and there might exist live range keys within the range 1058 // tombstone's span that need to be observed to trigger a switch to 1059 // combined iteration. 1060 if rangeDelIter := l.rangeDelIter; rangeDelIter != nil && 1061 (m.combinedIterState == nil || m.combinedIterState.initialized) { 1062 // The level has a range-del iterator. 
Find the tombstone containing 1063 // the search key. 1064 // 1065 // For untruncated tombstones that are possibly file-bounds-constrained, we are using a 1066 // levelIter which will set smallestUserKey and largestUserKey. Since the levelIter 1067 // is at this file we know that largestUserKey >= key, so we know that the 1068 // tombstone we find cannot be file-bounds-constrained in its upper bound to something < key. 1069 // We do need to compare with smallestUserKey to ensure that the tombstone is not 1070 // file-bounds-constrained in its lower bound. 1071 // 1072 // See the detailed comments in isNextEntryDeleted() on why similar containment and 1073 // seeking logic is correct. The subtle difference here is that key is a user key, 1074 // so we can have a sstable with bounds [c#8, i#InternalRangeDelSentinel], and the 1075 // tombstone is [b, k)#8 and the seek key is i: levelIter.SeekGE(i) will move past 1076 // this sstable since it realizes the largest key is a InternalRangeDelSentinel. 1077 l.tombstone = rangeDelIter.SeekGE(key) 1078 if l.tombstone != nil && l.tombstone.VisibleAt(m.snapshot) && l.tombstone.Contains(m.heap.cmp, key) && 1079 (l.smallestUserKey == nil || m.heap.cmp(l.smallestUserKey, key) <= 0) { 1080 // NB: Based on the comment above l.largestUserKey >= key, and based on the 1081 // containment condition tombstone.End > key, so the assignment to key results 1082 // in a monotonically non-decreasing key across iterations of this loop. 1083 // 1084 // The adjustment of key here can only move it to a larger key. Since 1085 // the caller of seekGE guaranteed that the original key was greater 1086 // than or equal to m.lower, the new key will continue to be greater 1087 // than or equal to m.lower. 1088 if l.largestUserKey != nil && 1089 m.heap.cmp(l.largestUserKey, l.tombstone.End) < 0 { 1090 // Truncate the tombstone for seeking purposes. Note that this can over-truncate 1091 // but that is harmless for this seek optimization. 1092 key = l.largestUserKey 1093 } else { 1094 key = l.tombstone.End 1095 } 1096 } 1097 } 1098 } 1099 1100 m.initMinHeap() 1101 } 1102 1103 func (m *mergingIter) String() string { 1104 return "merging" 1105 } 1106 1107 // SeekGE implements base.InternalIterator.SeekGE. Note that SeekGE only checks 1108 // the upper bound. It is up to the caller to ensure that key is greater than 1109 // or equal to the lower bound. 1110 func (m *mergingIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) { 1111 m.err = nil // clear cached iteration error 1112 m.prefix = nil 1113 m.seekGE(key, 0 /* start level */, flags) 1114 return m.findNextEntry() 1115 } 1116 1117 // SeekPrefixGE implements base.InternalIterator.SeekPrefixGE. Note that 1118 // SeekPrefixGE only checks the upper bound. It is up to the caller to ensure 1119 // that key is greater than or equal to the lower bound. 1120 func (m *mergingIter) SeekPrefixGE( 1121 prefix, key []byte, flags base.SeekGEFlags, 1122 ) (*base.InternalKey, base.LazyValue) { 1123 m.err = nil // clear cached iteration error 1124 m.prefix = prefix 1125 m.seekGE(key, 0 /* start level */, flags) 1126 return m.findNextEntry() 1127 } 1128 1129 // Seeks levels >= level to < key. Additionally uses range tombstones to extend the seeks. 1130 func (m *mergingIter) seekLT(key []byte, level int, flags base.SeekLTFlags) { 1131 // See the comment in seekGE regarding using tombstones to adjust the seek 1132 // target per level. 
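//
// The reverse-direction counterpart of the seekGE adjustment, condensed from
// the loop below (again a restatement for orientation, not a separate
// implementation):
//
//	l.tombstone = keyspan.SeekLE(m.heap.cmp, rangeDelIter, key)
//	if t := l.tombstone; t != nil && t.VisibleAt(m.snapshot) &&
//		t.Contains(m.heap.cmp, key) && withinLargestSSTableBound {
//		// Skip backwards over everything the tombstone deletes at older
//		// levels, but never before the current file's smallest user key,
//		// which bounds what this level's untruncated tombstone may delete.
//		key = t.Start
//		if l.smallestUserKey != nil && m.heap.cmp(l.smallestUserKey, t.Start) >= 0 {
//			key = l.smallestUserKey
//		}
//	}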
1133 m.prefix = nil 1134 for ; level < len(m.levels); level++ { 1135 if invariants.Enabled && m.upper != nil && m.heap.cmp(key, m.upper) > 0 { 1136 m.logger.Fatalf("mergingIter: upper bound violation: %s > %s\n%s", key, m.upper, debug.Stack()) 1137 } 1138 1139 l := &m.levels[level] 1140 l.iterKey, l.iterValue = l.iter.SeekLT(key, flags) 1141 1142 // If this level contains overlapping range tombstones, alter the seek 1143 // key accordingly. Caveat: If we're performing lazy-combined iteration, 1144 // we cannot alter the seek key: Range tombstones don't delete range 1145 // keys, and there might exist live range keys within the range 1146 // tombstone's span that need to be observed to trigger a switch to 1147 // combined iteration. 1148 if rangeDelIter := l.rangeDelIter; rangeDelIter != nil && 1149 (m.combinedIterState == nil || m.combinedIterState.initialized) { 1150 // The level has a range-del iterator. Find the tombstone containing 1151 // the search key. 1152 // 1153 // For untruncated tombstones that are possibly file-bounds-constrained we are using a 1154 // levelIter which will set smallestUserKey and largestUserKey. Since the levelIter 1155 // is at this file we know that smallestUserKey <= key, so we know that the 1156 // tombstone we find cannot be file-bounds-constrained in its lower bound to something > key. 1157 // We do need to compare with largestUserKey to ensure that the tombstone is not 1158 // file-bounds-constrained in its upper bound. 1159 // 1160 // See the detailed comments in isPrevEntryDeleted() on why similar containment and 1161 // seeking logic is correct. 1162 1163 // Default to within bounds. 1164 withinLargestSSTableBound := true 1165 if l.largestUserKey != nil { 1166 cmpResult := m.heap.cmp(l.largestUserKey, key) 1167 withinLargestSSTableBound = cmpResult > 0 || (cmpResult == 0 && !l.isLargestUserKeyExclusive) 1168 } 1169 1170 l.tombstone = keyspan.SeekLE(m.heap.cmp, rangeDelIter, key) 1171 if l.tombstone != nil && l.tombstone.VisibleAt(m.snapshot) && 1172 l.tombstone.Contains(m.heap.cmp, key) && withinLargestSSTableBound { 1173 // NB: Based on the comment above l.smallestUserKey <= key, and based 1174 // on the containment condition tombstone.Start.UserKey <= key, so the 1175 // assignment to key results in a monotonically non-increasing key 1176 // across iterations of this loop. 1177 // 1178 // The adjustment of key here can only move it to a smaller key. Since 1179 // the caller of seekLT guaranteed that the original key was less than 1180 // or equal to m.upper, the new key will continue to be less than or 1181 // equal to m.upper. 1182 if l.smallestUserKey != nil && 1183 m.heap.cmp(l.smallestUserKey, l.tombstone.Start) >= 0 { 1184 // Truncate the tombstone for seeking purposes. Note that this can over-truncate 1185 // but that is harmless for this seek optimization. 1186 key = l.smallestUserKey 1187 } else { 1188 key = l.tombstone.Start 1189 } 1190 } 1191 } 1192 } 1193 1194 m.initMaxHeap() 1195 } 1196 1197 // SeekLT implements base.InternalIterator.SeekLT. Note that SeekLT only checks 1198 // the lower bound. It is up to the caller to ensure that key is less than the 1199 // upper bound. 1200 func (m *mergingIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) { 1201 m.err = nil // clear cached iteration error 1202 m.prefix = nil 1203 m.seekLT(key, 0 /* start level */, flags) 1204 return m.findPrevEntry() 1205 } 1206 1207 // First implements base.InternalIterator.First. Note that First only checks 1208 // the upper bound. 
It is up to the caller to ensure that key is greater than 1209 // or equal to the lower bound (e.g. via a call to SeekGE(lower)). 1210 func (m *mergingIter) First() (*InternalKey, base.LazyValue) { 1211 m.err = nil // clear cached iteration error 1212 m.prefix = nil 1213 m.heap.items = m.heap.items[:0] 1214 for i := range m.levels { 1215 l := &m.levels[i] 1216 l.iterKey, l.iterValue = l.iter.First() 1217 } 1218 m.initMinHeap() 1219 return m.findNextEntry() 1220 } 1221 1222 // Last implements base.InternalIterator.Last. Note that Last only checks the 1223 // lower bound. It is up to the caller to ensure that key is less than the 1224 // upper bound (e.g. via a call to SeekLT(upper)) 1225 func (m *mergingIter) Last() (*InternalKey, base.LazyValue) { 1226 m.err = nil // clear cached iteration error 1227 m.prefix = nil 1228 for i := range m.levels { 1229 l := &m.levels[i] 1230 l.iterKey, l.iterValue = l.iter.Last() 1231 } 1232 m.initMaxHeap() 1233 return m.findPrevEntry() 1234 } 1235 1236 func (m *mergingIter) Next() (*InternalKey, base.LazyValue) { 1237 if m.err != nil { 1238 return nil, base.LazyValue{} 1239 } 1240 1241 if m.dir != 1 { 1242 m.switchToMinHeap() 1243 return m.findNextEntry() 1244 } 1245 1246 if m.heap.len() == 0 { 1247 return nil, base.LazyValue{} 1248 } 1249 1250 // NB: It's okay to call nextEntry directly even during prefix iteration 1251 // mode (as opposed to indirectly through maybeNextEntryWithinPrefix). 1252 // During prefix iteration mode, we rely on the caller to not call Next if 1253 // the iterator has already advanced beyond the iteration prefix. See the 1254 // comment above the base.InternalIterator interface. 1255 m.nextEntry(m.heap.items[0], nil /* succKey */) 1256 return m.findNextEntry() 1257 } 1258 1259 func (m *mergingIter) NextPrefix(succKey []byte) (*InternalKey, LazyValue) { 1260 if m.dir != 1 { 1261 panic("pebble: cannot switch directions with NextPrefix") 1262 } 1263 if m.err != nil || m.heap.len() == 0 { 1264 return nil, LazyValue{} 1265 } 1266 if m.levelsPositioned == nil { 1267 m.levelsPositioned = make([]bool, len(m.levels)) 1268 } else { 1269 for i := range m.levelsPositioned { 1270 m.levelsPositioned[i] = false 1271 } 1272 } 1273 1274 // The heap root necessarily must be positioned at a key < succKey, because 1275 // NextPrefix was invoked. 1276 root := &m.heap.items[0] 1277 m.levelsPositioned[(*root).index] = true 1278 if invariants.Enabled && m.heap.cmp((*root).iterKey.UserKey, succKey) >= 0 { 1279 m.logger.Fatalf("pebble: invariant violation: NextPrefix(%q) called on merging iterator already positioned at %q", 1280 succKey, (*root).iterKey) 1281 } 1282 m.nextEntry(*root, succKey) 1283 // NB: root is a pointer to the heap root. nextEntry may have changed 1284 // the heap root, so we must not expect root to still point to the same 1285 // level (or to even be valid, if the heap is now exhaused). 1286 1287 for m.heap.len() > 0 { 1288 if m.levelsPositioned[(*root).index] { 1289 // A level we've previously positioned is at the top of the heap, so 1290 // there are no other levels positioned at keys < succKey. We've 1291 // advanced as far as we need to. 1292 break 1293 } 1294 // Since this level was not the original heap root when NextPrefix was 1295 // called, we don't know whether this level's current key has the 1296 // previous prefix or a new one. 
1297 if m.heap.cmp((*root).iterKey.UserKey, succKey) >= 0 { 1298 break 1299 } 1300 m.levelsPositioned[(*root).index] = true 1301 m.nextEntry(*root, succKey) 1302 } 1303 return m.findNextEntry() 1304 } 1305 1306 func (m *mergingIter) Prev() (*InternalKey, base.LazyValue) { 1307 if m.err != nil { 1308 return nil, base.LazyValue{} 1309 } 1310 1311 if m.dir != -1 { 1312 if m.prefix != nil { 1313 m.err = errors.New("pebble: unsupported reverse prefix iteration") 1314 return nil, base.LazyValue{} 1315 } 1316 m.switchToMaxHeap() 1317 return m.findPrevEntry() 1318 } 1319 1320 if m.heap.len() == 0 { 1321 return nil, base.LazyValue{} 1322 } 1323 1324 m.prevEntry(m.heap.items[0]) 1325 return m.findPrevEntry() 1326 } 1327 1328 func (m *mergingIter) Error() error { 1329 if m.heap.len() == 0 || m.err != nil { 1330 return m.err 1331 } 1332 return m.levels[m.heap.items[0].index].iter.Error() 1333 } 1334 1335 func (m *mergingIter) Close() error { 1336 for i := range m.levels { 1337 iter := m.levels[i].iter 1338 if err := iter.Close(); err != nil && m.err == nil { 1339 m.err = err 1340 } 1341 if rangeDelIter := m.levels[i].rangeDelIter; rangeDelIter != nil { 1342 if err := rangeDelIter.Close(); err != nil && m.err == nil { 1343 m.err = err 1344 } 1345 } 1346 } 1347 m.levels = nil 1348 m.heap.items = m.heap.items[:0] 1349 return m.err 1350 } 1351 1352 func (m *mergingIter) SetBounds(lower, upper []byte) { 1353 m.prefix = nil 1354 m.lower = lower 1355 m.upper = upper 1356 for i := range m.levels { 1357 m.levels[i].iter.SetBounds(lower, upper) 1358 } 1359 m.heap.clear() 1360 } 1361 1362 func (m *mergingIter) SetContext(ctx context.Context) { 1363 for i := range m.levels { 1364 m.levels[i].iter.SetContext(ctx) 1365 } 1366 } 1367 1368 func (m *mergingIter) DebugString() string { 1369 var buf bytes.Buffer 1370 sep := "" 1371 for m.heap.len() > 0 { 1372 item := m.heap.pop() 1373 fmt.Fprintf(&buf, "%s%s", sep, item.iterKey) 1374 sep = " " 1375 } 1376 if m.dir == 1 { 1377 m.initMinHeap() 1378 } else { 1379 m.initMaxHeap() 1380 } 1381 return buf.String() 1382 } 1383 1384 func (m *mergingIter) ForEachLevelIter(fn func(li *levelIter) bool) { 1385 for _, ml := range m.levels { 1386 if ml.levelIter != nil { 1387 if done := fn(ml.levelIter); done { 1388 break 1389 } 1390 } 1391 } 1392 } 1393 1394 func (m *mergingIter) addItemStats(l *mergingIterLevel) { 1395 m.stats.PointCount++ 1396 m.stats.KeyBytes += uint64(len(l.iterKey.UserKey)) 1397 m.stats.ValueBytes += uint64(len(l.iterValue.ValueOrHandle)) 1398 } 1399 1400 var _ internalIterator = &mergingIter{}
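// For reference, a minimal way to drive a mergingIter from within the package
// looks like the following. This is a hypothetical sketch: it0 and it1 stand
// for point internalIterators obtained elsewhere (memtable or sstable
// iterators), base.DefaultLogger and base.DefaultComparer are stand-ins for the
// caller's logger and comparer, and error handling is abbreviated:
//
//	var stats base.InternalIteratorStats
//	mi := newMergingIter(base.DefaultLogger, &stats,
//		base.DefaultComparer.Compare, base.DefaultComparer.Split, it0, it1)
//	defer mi.Close()
//	for k, v := mi.First(); k != nil; k, v = mi.Next() {
//		// Keys surface in increasing InternalKey order; values are
//		// base.LazyValue handles that may still need fetching.
//		_, _ = k, v
//	}
//	if err := mi.Error(); err != nil {
//		// Handle the iteration error.
//	}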