github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/merging_iter.go (about) 1 // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package pebble 6 7 import ( 8 "bytes" 9 "fmt" 10 "runtime/debug" 11 "unsafe" 12 13 "github.com/cockroachdb/errors" 14 "github.com/cockroachdb/pebble/internal/base" 15 "github.com/cockroachdb/pebble/internal/invariants" 16 "github.com/cockroachdb/pebble/internal/keyspan" 17 ) 18 19 type mergingIterLevel struct { 20 index int 21 iter internalIterator 22 // rangeDelIter is set to the range-deletion iterator for the level. When 23 // configured with a levelIter, this pointer changes as sstable boundaries 24 // are crossed. See levelIter.initRangeDel and the Range Deletions comment 25 // below. 26 rangeDelIter keyspan.FragmentIterator 27 // iterKey and iterValue cache the current key and value iter are pointed at. 28 iterKey *InternalKey 29 iterValue base.LazyValue 30 // levelIter is non-nil if this level's iter is ultimately backed by a 31 // *levelIter. The handle in iter may have wrapped the levelIter with 32 // intermediary internalIterator implementations. 33 levelIter *levelIter 34 35 // levelIterBoundaryContext's fields are set when using levelIter, in order 36 // to surface sstable boundary keys and file-level context. See levelIter 37 // comment and the Range Deletions comment below. 38 levelIterBoundaryContext 39 40 // tombstone caches the tombstone rangeDelIter is currently pointed at. If 41 // tombstone is nil, there are no further tombstones within the 42 // current sstable in the current iterator direction. The cached tombstone is 43 // only valid for the levels in the range [0,heap[0].index]. This avoids 44 // positioning tombstones at lower levels which cannot possibly shadow the 45 // current key. 46 tombstone *keyspan.Span 47 } 48 49 type levelIterBoundaryContext struct { 50 // smallestUserKey and largestUserKey are populated with the smallest and 51 // largest boundaries of the current file. 52 smallestUserKey, largestUserKey []byte 53 // isLargestUserKeyExclusive is set to true when a file's largest boundary 54 // is an exclusive key, (eg, a range deletion sentinel). If true, the file 55 // does not contain any keys with the provided user key, and the 56 // largestUserKey bound is exclusive. 57 isLargestUserKeyExclusive bool 58 // isSyntheticIterBoundsKey is set to true iff the key returned by the level 59 // iterator is a synthetic key derived from the iterator bounds. This is used 60 // to prevent the mergingIter from being stuck at such a synthetic key if it 61 // becomes the top element of the heap. When used with a user-facing Iterator, 62 // the only range deletions exposed by this mergingIter should be those with 63 // `isSyntheticIterBoundsKey || isIgnorableBoundaryKey`. 64 isSyntheticIterBoundsKey bool 65 // isIgnorableBoundaryKey is set to true iff the key returned by the level 66 // iterator is a file boundary key that should be ignored when returning to 67 // the parent iterator. File boundary keys are used by the level iter to 68 // keep a levelIter file's range deletion iterator open as long as other 69 // levels within the merging iterator require it. When used with a user-facing 70 // Iterator, the only range deletions exposed by this mergingIter should be 71 // those with `isSyntheticIterBoundsKey || isIgnorableBoundaryKey`. 
72 isIgnorableBoundaryKey bool 73 } 74 75 // mergingIter provides a merged view of multiple iterators from different 76 // levels of the LSM. 77 // 78 // The core of a mergingIter is a heap of internalIterators (see 79 // mergingIterHeap). The heap can operate as either a min-heap, used during 80 // forward iteration (First, SeekGE, Next) or a max-heap, used during reverse 81 // iteration (Last, SeekLT, Prev). The heap is initialized in calls to First, 82 // Last, SeekGE, and SeekLT. A call to Next or Prev takes the current top 83 // element on the heap, advances its iterator, and then "fixes" the heap 84 // property. When one of the child iterators is exhausted during Next/Prev 85 // iteration, it is removed from the heap. 86 // 87 // # Range Deletions 88 // 89 // A mergingIter can optionally be configured with a slice of range deletion 90 // iterators. The range deletion iterator slice must exactly parallel the point 91 // iterators and the range deletion iterator must correspond to the same level 92 // in the LSM as the point iterator. Note that each memtable and each table in 93 // L0 is a different "level" from the mergingIter perspective. So level 0 below 94 // does not correspond to L0 in the LSM. 95 // 96 // A range deletion iterator iterates over fragmented range tombstones. Range 97 // tombstones are fragmented by splitting them at any overlapping points. This 98 // fragmentation guarantees that within an sstable tombstones will either be 99 // distinct or will have identical start and end user keys. While range 100 // tombstones are fragmented within an sstable, the start and end keys are not truncated 101 // to sstable boundaries. This is necessary because the tombstone end key is 102 // exclusive and does not have a sequence number. Consider an sstable 103 // containing the range tombstone [a,c)#9 and the key "b#8". The tombstone must 104 // delete "b#8", yet older versions of "b" might spill over to the next 105 // sstable. So the boundary key for this sstable must be "b#8". Adjusting the 106 // end key of tombstones to be optionally inclusive or contain a sequence 107 // number would be possible solutions (such solutions have potentially serious 108 // issues: tombstones have exclusive end keys since an inclusive deletion end can 109 // be converted to an exclusive one while the reverse transformation is not possible; 110 // the semantics of a sequence number for the end key of a range tombstone are murky). 111 // 112 // The approach taken here performs an 113 // implicit truncation of the tombstone to the sstable boundaries. 114 // 115 // During initialization of a mergingIter, the range deletion iterators for 116 // batches, memtables, and L0 tables are populated up front. Note that Batches 117 // and memtables index unfragmented tombstones. Batch.newRangeDelIter() and 118 // memTable.newRangeDelIter() fragment and cache the tombstones on demand. The 119 // L1-L6 range deletion iterators are populated by levelIter. When configured 120 // to load range deletion iterators, whenever a levelIter loads a table it 121 // loads both the point iterator and the range deletion 122 // iterator. levelIter.rangeDelIter is configured to point to the right entry 123 // in mergingIter.levels. The effect of this setup is that 124 // mergingIter.levels[i].rangeDelIter always contains the fragmented range 125 // tombstone for the current table in level i that the levelIter has open. 
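//
// To make the fragmentation guarantee concrete, the following is a minimal,
// self-contained sketch of splitting overlapping tombstones at their boundary
// keys. It is illustrative only and is not pebble's keyspan.Fragmenter; the
// simpleSpan type and fragment function are hypothetical names, and the code
// assumes the standard library sort package.
//
//	// simpleSpan is a range tombstone [start, end) at a sequence number.
//	type simpleSpan struct {
//		start, end string
//		seqNum     uint64
//	}
//
//	// fragment splits spans at every start/end boundary so that any two
//	// resulting fragments either have identical start and end keys or do not
//	// overlap at all.
//	func fragment(spans []simpleSpan) []simpleSpan {
//		seen := map[string]bool{}
//		var cuts []string
//		for _, s := range spans {
//			for _, k := range []string{s.start, s.end} {
//				if !seen[k] {
//					seen[k] = true
//					cuts = append(cuts, k)
//				}
//			}
//		}
//		sort.Strings(cuts)
//		var out []simpleSpan
//		for _, s := range spans {
//			for i := 0; i+1 < len(cuts); i++ {
//				if cuts[i] >= s.start && cuts[i+1] <= s.end {
//					out = append(out, simpleSpan{start: cuts[i], end: cuts[i+1], seqNum: s.seqNum})
//				}
//			}
//		}
//		return out
//	}
//
// For example, fragmenting the overlapping tombstones [a,e)#9 and [c,g)#7
// yields [a,c)#9, [c,e)#9, [c,e)#7 and [e,g)#7: the two fragments over [c,e)
// have identical start and end user keys, exactly the property described
// above.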
126 // 127 // Another crucial mechanism of levelIter is that it materializes fake point 128 // entries for the table boundaries if the boundary is range deletion 129 // key. Consider a table that contains only a range tombstone [a-e)#10. The 130 // sstable boundaries for this table will be a#10,15 and 131 // e#72057594037927935,15. During forward iteration levelIter will return 132 // e#72057594037927935,15 as a key. During reverse iteration levelIter will 133 // return a#10,15 as a key. These sentinel keys act as bookends to point 134 // iteration and allow mergingIter to keep a table and its associated range 135 // tombstones loaded as long as there are keys at lower levels that are within 136 // the bounds of the table. 137 // 138 // The final piece to the range deletion puzzle is the LSM invariant that for a 139 // given key K newer versions of K can only exist earlier in the level, or at 140 // higher levels of the tree. For example, if K#4 exists in L3, k#5 can only 141 // exist earlier in the L3 or in L0, L1, L2 or a memtable. Get very explicitly 142 // uses this invariant to find the value for a key by walking the LSM level by 143 // level. For range deletions, this invariant means that a range deletion at 144 // level N will necessarily shadow any keys within its bounds in level Y where 145 // Y > N. One wrinkle to this statement is that it only applies to keys that 146 // lie within the sstable bounds as well, but we get that guarantee due to the 147 // way the range deletion iterator and point iterator are bound together by a 148 // levelIter. 149 // 150 // Tying the above all together, we get a picture where each level (index in 151 // mergingIter.levels) is composed of both point operations (pX) and range 152 // deletions (rX). The range deletions for level X shadow both the point 153 // operations and range deletions for level Y where Y > X allowing mergingIter 154 // to skip processing entries in that shadow. For example, consider the 155 // scenario: 156 // 157 // r0: a---e 158 // r1: d---h 159 // r2: g---k 160 // r3: j---n 161 // r4: m---q 162 // 163 // This is showing 5 levels of range deletions. Consider what happens upon 164 // SeekGE("b"). We first seek the point iterator for level 0 (the point values 165 // are not shown above) and we then seek the range deletion iterator. That 166 // returns the tombstone [a,e). This tombstone tells us that all keys in the 167 // range [a,e) in lower levels are deleted so we can skip them. So we can 168 // adjust the seek key to "e", the tombstone end key. For level 1 we seek to 169 // "e" and find the range tombstone [d,h) and similar logic holds. By the time 170 // we get to level 4 we're seeking to "n". 171 // 172 // One consequence of not truncating tombstone end keys to sstable boundaries 173 // is the seeking process described above cannot always seek to the tombstone 174 // end key in the older level. For example, imagine in the above example r3 is 175 // a partitioned level (i.e., L1+ in our LSM), and the sstable containing [j, 176 // n) has "k" as its upper boundary. In this situation, compactions involving 177 // keys at or after "k" can output those keys to r4+, even if they're newer 178 // than our tombstone [j, n). So instead of seeking to "n" in r4 we can only 179 // seek to "k". To achieve this, the instance variable `largestUserKey.` 180 // maintains the upper bounds of the current sstables in the partitioned 181 // levels. 
In this example, `levels[3].largestUserKey` holds "k", telling us to 182 // limit the seek triggered by a tombstone in r3 to "k". 183 // 184 // During actual iteration levels can contain both point operations and range 185 // deletions. Within a level, when a range deletion contains a point operation 186 // the sequence numbers must be checked to determine if the point operation is 187 // newer or older than the range deletion tombstone. The mergingIter maintains 188 // the invariant that the range deletion iterators for all levels newer that 189 // the current iteration key (L < m.heap.items[0].index) are positioned at the 190 // next (or previous during reverse iteration) range deletion tombstone. We 191 // know those levels don't contain a range deletion tombstone that covers the 192 // current key because if they did the current key would be deleted. The range 193 // deletion iterator for the current key's level is positioned at a range 194 // tombstone covering or past the current key. The position of all of other 195 // range deletion iterators is unspecified. Whenever a key from those levels 196 // becomes the current key, their range deletion iterators need to be 197 // positioned. This lazy positioning avoids seeking the range deletion 198 // iterators for keys that are never considered. (A similar bit of lazy 199 // evaluation can be done for the point iterators, but is still TBD). 200 // 201 // For a full example, consider the following setup: 202 // 203 // p0: o 204 // r0: m---q 205 // 206 // p1: n p 207 // r1: g---k 208 // 209 // p2: b d i 210 // r2: a---e q----v 211 // 212 // p3: e 213 // r3: 214 // 215 // If we start iterating from the beginning, the first key we encounter is "b" 216 // in p2. When the mergingIter is pointing at a valid entry, the range deletion 217 // iterators for all of the levels < m.heap.items[0].index are positioned at 218 // the next range tombstone past the current key. So r0 will point at [m,q) and 219 // r1 at [g,k). When the key "b" is encountered, we check to see if the current 220 // tombstone for r0 or r1 contains it, and whether the tombstone for r2, [a,e), 221 // contains and is newer than "b". 222 // 223 // Advancing the iterator finds the next key at "d". This is in the same level 224 // as the previous key "b" so we don't have to reposition any of the range 225 // deletion iterators, but merely check whether "d" is now contained by any of 226 // the range tombstones at higher levels or has stepped past the range 227 // tombstone in its own level or higher levels. In this case, there is nothing to be done. 228 // 229 // Advancing the iterator again finds "e". Since "e" comes from p3, we have to 230 // position the r3 range deletion iterator, which is empty. "e" is past the r2 231 // tombstone of [a,e) so we need to advance the r2 range deletion iterator to 232 // [q,v). 233 // 234 // The next key is "i". Because this key is in p2, a level above "e", we don't 235 // have to reposition any range deletion iterators and instead see that "i" is 236 // covered by the range tombstone [g,k). The iterator is immediately advanced 237 // to "n" which is covered by the range tombstone [m,q) causing the iterator to 238 // advance to "o" which is visible. 239 // 240 // TODO(peter,rangedel): For testing, advance the iterator through various 241 // scenarios and have each step display the current state (i.e. the current 242 // heap and range-del iterator positioning). 
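//
// To make the cascading seek-key adjustment above concrete, the following is a
// simplified, self-contained sketch. It is not the real seekGE (which also
// handles snapshots, prefix iteration, and lazy combined iteration); the
// levelState type and seekKeysPerLevel function are hypothetical names used
// only for this illustration.
//
//	// levelState describes one level: the visible tombstone in that level
//	// that could cover the seek key (an empty tombEnd means no tombstone),
//	// and the largest user key of the sstable the tombstone came from (empty
//	// means unbounded).
//	type levelState struct {
//		tombStart, tombEnd string
//		largestUserKey     string
//	}
//
//	// seekKeysPerLevel returns the key used to seek each level, starting with
//	// key at the newest level and bumping it past any covering tombstone
//	// (clamped to that level's file bound) before descending to older levels.
//	func seekKeysPerLevel(levels []levelState, key string) []string {
//		keys := make([]string, len(levels))
//		for i, l := range levels {
//			keys[i] = key
//			if l.tombEnd == "" || key < l.tombStart || key >= l.tombEnd {
//				continue // no covering tombstone in this level
//			}
//			next := l.tombEnd
//			if l.largestUserKey != "" && l.largestUserKey < next {
//				next = l.largestUserKey // do not trust the tombstone past the file bound
//			}
//			key = next
//		}
//		return keys
//	}
//
// With the r0..r4 tombstones from the SeekGE("b") example above and no file
// bounds, seekKeysPerLevel yields [b e h k n]. If r3's sstable has "k" as its
// largest user key, the last entry becomes "k" instead of "n", matching the
// implicit truncation described above.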
243 type mergingIter struct { 244 logger Logger 245 split Split 246 dir int 247 snapshot uint64 248 batchSnapshot uint64 249 levels []mergingIterLevel 250 heap mergingIterHeap 251 err error 252 prefix []byte 253 lower []byte 254 upper []byte 255 stats *InternalIteratorStats 256 257 // levelsPositioned, if non-nil, is a slice of the same length as levels. 258 // It's used by NextPrefix to record which levels have already been 259 // repositioned. It's created lazily by the first call to NextPrefix. 260 levelsPositioned []bool 261 262 combinedIterState *combinedIterState 263 264 // Used in some tests to disable the random disabling of seek optimizations. 265 forceEnableSeekOpt bool 266 } 267 268 // mergingIter implements the base.InternalIterator interface. 269 var _ base.InternalIterator = (*mergingIter)(nil) 270 271 // newMergingIter returns an iterator that merges its input. Walking the 272 // resultant iterator will return all key/value pairs of all input iterators 273 // in strictly increasing key order, as defined by cmp. It is permissible to 274 // pass a nil split parameter if the caller is never going to call 275 // SeekPrefixGE. 276 // 277 // The input's key ranges may overlap, but there are assumed to be no duplicate 278 // keys: if iters[i] contains a key k then iters[j] will not contain that key k. 279 // 280 // None of the iters may be nil. 281 func newMergingIter( 282 logger Logger, 283 stats *base.InternalIteratorStats, 284 cmp Compare, 285 split Split, 286 iters ...internalIterator, 287 ) *mergingIter { 288 m := &mergingIter{} 289 levels := make([]mergingIterLevel, len(iters)) 290 for i := range levels { 291 levels[i].iter = iters[i] 292 } 293 m.init(&IterOptions{logger: logger}, stats, cmp, split, levels...) 294 return m 295 } 296 297 func (m *mergingIter) init( 298 opts *IterOptions, 299 stats *base.InternalIteratorStats, 300 cmp Compare, 301 split Split, 302 levels ...mergingIterLevel, 303 ) { 304 m.err = nil // clear cached iteration error 305 m.logger = opts.getLogger() 306 if opts != nil { 307 m.lower = opts.LowerBound 308 m.upper = opts.UpperBound 309 } 310 m.snapshot = InternalKeySeqNumMax 311 m.batchSnapshot = InternalKeySeqNumMax 312 m.levels = levels 313 m.heap.cmp = cmp 314 m.split = split 315 m.stats = stats 316 if cap(m.heap.items) < len(levels) { 317 m.heap.items = make([]*mergingIterLevel, 0, len(levels)) 318 } else { 319 m.heap.items = m.heap.items[:0] 320 } 321 for l := range m.levels { 322 m.levels[l].index = l 323 } 324 } 325 326 func (m *mergingIter) initHeap() { 327 m.heap.items = m.heap.items[:0] 328 for i := range m.levels { 329 if l := &m.levels[i]; l.iterKey != nil { 330 m.heap.items = append(m.heap.items, l) 331 } else { 332 m.err = firstError(m.err, l.iter.Error()) 333 if m.err != nil { 334 return 335 } 336 } 337 } 338 m.heap.init() 339 } 340 341 func (m *mergingIter) initMinHeap() { 342 m.dir = 1 343 m.heap.reverse = false 344 m.initHeap() 345 m.initMinRangeDelIters(-1) 346 } 347 348 // The level of the previous top element was oldTopLevel. Note that all range delete 349 // iterators < oldTopLevel are positioned past the key of the previous top element and 350 // the range delete iterator == oldTopLevel is positioned at or past the key of the 351 // previous top element. We need to position the range delete iterators from oldTopLevel + 1 352 // to the level of the current top element. 
353 func (m *mergingIter) initMinRangeDelIters(oldTopLevel int) { 354 if m.heap.len() == 0 { 355 return 356 } 357 358 // Position the range-del iterators at levels <= m.heap.items[0].index. 359 item := m.heap.items[0] 360 for level := oldTopLevel + 1; level <= item.index; level++ { 361 l := &m.levels[level] 362 if l.rangeDelIter == nil { 363 continue 364 } 365 l.tombstone = l.rangeDelIter.SeekGE(item.iterKey.UserKey) 366 } 367 } 368 369 func (m *mergingIter) initMaxHeap() { 370 m.dir = -1 371 m.heap.reverse = true 372 m.initHeap() 373 m.initMaxRangeDelIters(-1) 374 } 375 376 // The level of the previous top element was oldTopLevel. Note that all range delete 377 // iterators < oldTopLevel are positioned before the key of the previous top element and 378 // the range delete iterator == oldTopLevel is positioned at or before the key of the 379 // previous top element. We need to position the range delete iterators from oldTopLevel + 1 380 // to the level of the current top element. 381 func (m *mergingIter) initMaxRangeDelIters(oldTopLevel int) { 382 if m.heap.len() == 0 { 383 return 384 } 385 // Position the range-del iterators at levels <= m.heap.items[0].index. 386 item := m.heap.items[0] 387 for level := oldTopLevel + 1; level <= item.index; level++ { 388 l := &m.levels[level] 389 if l.rangeDelIter == nil { 390 continue 391 } 392 l.tombstone = keyspan.SeekLE(m.heap.cmp, l.rangeDelIter, item.iterKey.UserKey) 393 } 394 } 395 396 func (m *mergingIter) switchToMinHeap() { 397 if m.heap.len() == 0 { 398 if m.lower != nil { 399 m.SeekGE(m.lower, base.SeekGEFlagsNone) 400 } else { 401 m.First() 402 } 403 return 404 } 405 406 // We're switching from using a max heap to a min heap. We need to advance 407 // any iterator that is less than or equal to the current key. Consider the 408 // scenario where we have 2 iterators being merged (user-key:seq-num): 409 // 410 // i1: *a:2 b:2 411 // i2: a:1 b:1 412 // 413 // The current key is a:2 and i2 is pointed at a:1. When we switch to forward 414 // iteration, we want to return a key that is greater than a:2. 415 416 key := m.heap.items[0].iterKey 417 cur := m.heap.items[0] 418 419 for i := range m.levels { 420 l := &m.levels[i] 421 if l == cur { 422 continue 423 } 424 425 // If the iterator is exhausted, it may be out of bounds if range 426 // deletions modified our search key as we descended. we need to 427 // reposition it within the search bounds. If the current key is a 428 // range tombstone, the iterator might still be exhausted but at a 429 // sstable boundary sentinel. It would be okay to reposition an 430 // interator like this only through successive Next calls, except that 431 // it would violate the levelIter's invariants by causing it to return 432 // a key before the lower bound. 433 // 434 // bounds = [ f, _ ) 435 // L0: [ b ] [ f* z ] 436 // L1: [ a |----| k y ] 437 // L2: [ c (d) ] [ e g m ] 438 // L3: [ x ] 439 // 440 // * - current key [] - table bounds () - heap item 441 // 442 // In the above diagram, the L2 iterator is positioned at a sstable 443 // boundary (d) outside the lower bound (f). It arrived here from a 444 // seek whose seek-key was modified by a range tombstone. If we called 445 // Next on the L2 iterator, it would return e, violating its lower 446 // bound. Instead, we seek it to >= f and Next from there. 
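// To make the direction switch concrete, here is a tiny, self-contained
// sketch of the rule applied below, ignoring bounds and boundary sentinels.
// The ikey, internalLess, and advancePast names are hypothetical; internal
// keys order equal user keys by descending sequence number, which is why i2
// in the example above does not move.
//
//	type ikey struct {
//		user string
//		seq  uint64
//	}
//
//	// internalLess orders internal keys by user key, then by descending
//	// sequence number (newer versions sort first).
//	func internalLess(a, b ikey) bool {
//		if a.user != b.user {
//			return a.user < b.user
//		}
//		return a.seq > b.seq
//	}
//
//	// advancePast returns the first position in keys whose internal key
//	// sorts strictly after cur; each non-root level must reach such a
//	// position before the min-heap is rebuilt.
//	func advancePast(keys []ikey, pos int, cur ikey) int {
//		for pos < len(keys) && !internalLess(cur, keys[pos]) {
//			pos++
//		}
//		return pos
//	}
//
// With cur = a:2 and i2 = [a:1 b:1] positioned at a:1, internalLess(a:2, a:1)
// is true, so i2 stays put: a:1 sorts after a:2 and is the correct next entry
// once the min-heap is rebuilt. Had i2 been positioned at a:3, it would have
// been advanced to b:1.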
447 448 if l.iterKey == nil || (m.lower != nil && l.isSyntheticIterBoundsKey && 449 l.iterKey.IsExclusiveSentinel() && 450 m.heap.cmp(l.iterKey.UserKey, m.lower) <= 0) { 451 if m.lower != nil { 452 l.iterKey, l.iterValue = l.iter.SeekGE(m.lower, base.SeekGEFlagsNone) 453 } else { 454 l.iterKey, l.iterValue = l.iter.First() 455 } 456 } 457 for ; l.iterKey != nil; l.iterKey, l.iterValue = l.iter.Next() { 458 if base.InternalCompare(m.heap.cmp, *key, *l.iterKey) < 0 { 459 // key < iter-key 460 break 461 } 462 // key >= iter-key 463 } 464 } 465 466 // Special handling for the current iterator because we were using its key 467 // above. The iterator cur.iter may still be exhausted at a sstable boundary 468 // sentinel. Similar to the logic applied to the other levels, in these 469 // cases we seek the iterator to the first key in order to avoid violating 470 // levelIter's invariants. See the example in the for loop above. 471 if m.lower != nil && cur.isSyntheticIterBoundsKey && cur.iterKey.IsExclusiveSentinel() && 472 m.heap.cmp(cur.iterKey.UserKey, m.lower) <= 0 { 473 cur.iterKey, cur.iterValue = cur.iter.SeekGE(m.lower, base.SeekGEFlagsNone) 474 } else { 475 cur.iterKey, cur.iterValue = cur.iter.Next() 476 } 477 m.initMinHeap() 478 } 479 480 func (m *mergingIter) switchToMaxHeap() { 481 if m.heap.len() == 0 { 482 if m.upper != nil { 483 m.SeekLT(m.upper, base.SeekLTFlagsNone) 484 } else { 485 m.Last() 486 } 487 return 488 } 489 490 // We're switching from using a min heap to a max heap. We need to backup any 491 // iterator that is greater than or equal to the current key. Consider the 492 // scenario where we have 2 iterators being merged (user-key:seq-num): 493 // 494 // i1: a:2 *b:2 495 // i2: a:1 b:1 496 // 497 // The current key is b:2 and i2 is pointing at b:1. When we switch to 498 // reverse iteration, we want to return a key that is less than b:2. 499 key := m.heap.items[0].iterKey 500 cur := m.heap.items[0] 501 502 for i := range m.levels { 503 l := &m.levels[i] 504 if l == cur { 505 continue 506 } 507 508 // If the iterator is exhausted, it may be out of bounds if range 509 // deletions modified our search key as we descended. we need to 510 // reposition it within the search bounds. If the current key is a 511 // range tombstone, the iterator might still be exhausted but at a 512 // sstable boundary sentinel. It would be okay to reposition an 513 // interator like this only through successive Prev calls, except that 514 // it would violate the levelIter's invariants by causing it to return 515 // a key beyond the upper bound. 516 // 517 // bounds = [ _, g ) 518 // L0: [ b ] [ f* z ] 519 // L1: [ a |-------| k y ] 520 // L2: [ c d ] h [(i) m ] 521 // L3: [ e x ] 522 // 523 // * - current key [] - table bounds () - heap item 524 // 525 // In the above diagram, the L2 iterator is positioned at a sstable 526 // boundary (i) outside the upper bound (g). It arrived here from a 527 // seek whose seek-key was modified by a range tombstone. If we called 528 // Prev on the L2 iterator, it would return h, violating its upper 529 // bound. Instead, we seek it to < g, and Prev from there. 
530 531 if l.iterKey == nil || (m.upper != nil && l.isSyntheticIterBoundsKey && 532 l.iterKey.IsExclusiveSentinel() && m.heap.cmp(l.iterKey.UserKey, m.upper) >= 0) { 533 if m.upper != nil { 534 l.iterKey, l.iterValue = l.iter.SeekLT(m.upper, base.SeekLTFlagsNone) 535 } else { 536 l.iterKey, l.iterValue = l.iter.Last() 537 } 538 } 539 for ; l.iterKey != nil; l.iterKey, l.iterValue = l.iter.Prev() { 540 if base.InternalCompare(m.heap.cmp, *key, *l.iterKey) > 0 { 541 // key > iter-key 542 break 543 } 544 // key <= iter-key 545 } 546 } 547 548 // Special handling for the current iterator because we were using its key 549 // above. The iterator cur.iter may still be exhausted at a sstable boundary 550 // sentinel. Similar to the logic applied to the other levels, in these 551 // cases we seek the iterator in order to avoid violating levelIter's 552 // invariants by Prev-ing through files. See the example in the for loop 553 // above. 554 if m.upper != nil && cur.isSyntheticIterBoundsKey && cur.iterKey.IsExclusiveSentinel() && 555 m.heap.cmp(cur.iterKey.UserKey, m.upper) >= 0 { 556 cur.iterKey, cur.iterValue = cur.iter.SeekLT(m.upper, base.SeekLTFlagsNone) 557 } else { 558 cur.iterKey, cur.iterValue = cur.iter.Prev() 559 } 560 m.initMaxHeap() 561 } 562 563 // maybeNextEntryWithinPrefix steps to the next entry, as long as the iteration 564 // prefix has not already been exceeded. If it has, it exhausts the iterator by 565 // resetting the heap to empty. 566 func (m *mergingIter) maybeNextEntryWithinPrefix(l *mergingIterLevel) { 567 if s := m.split(l.iterKey.UserKey); !bytes.Equal(m.prefix, l.iterKey.UserKey[:s]) { 568 // The item at the root of the heap already exceeds the iteration 569 // prefix. We should not advance any more. Clear the heap to reflect 570 // that the iterator is now exhausted (within this prefix, at 571 // least). 572 m.heap.items = m.heap.items[:0] 573 return 574 } 575 m.nextEntry(l, nil /* succKey */) 576 } 577 578 // nextEntry unconditionally steps to the next entry. item is the current top 579 // item in the heap. 580 // 581 // nextEntry should be called directly when not in prefix-iteration mode, or by 582 // Next. During prefix iteration mode, all other callers should use 583 // maybeNextEntryWithinPrefix which will avoid advancing the iterator if the 584 // current iteration prefix has been exhausted. See the comment within 585 // nextEntry's body for an explanation of why other callers should call 586 // maybeNextEntryWithinPrefix, which will ensure the documented invariant is 587 // preserved. 588 func (m *mergingIter) nextEntry(l *mergingIterLevel, succKey []byte) { 589 // INVARIANT: If in prefix iteration mode, item.iterKey must have a prefix equal 590 // to m.prefix. This invariant is important for ensuring TrySeekUsingNext 591 // optimizations behave correctly. 592 // 593 // During prefix iteration, the iterator does not have a full view of the 594 // LSM. Some level iterators may omit keys that are known to fall outside 595 // the seek prefix (eg, due to sstable bloom filter exclusion). It's 596 // important that in such cases we don't position any iterators beyond 597 // m.prefix, because doing so may interfere with future seeks. 598 // 599 // Let prefixes P1 < P2 < P3. Imagine a SeekPrefixGE to prefix P1, followed 600 // by a SeekPrefixGE to prefix P2. Imagine there exist live keys at prefix 601 // P2, but they're not visible to the SeekPrefixGE(P1) (because of 602 // bloom-filter exclusion or a range tombstone that deletes prefix P1 but 603 // not P2.
If the SeekPrefixGE(P1) is allowed to move any level iterators 604 // to P3, the SeekPrefixGE(P2, TrySeekUsingNext=true) may mistakenly think 605 // the level contains no point keys or range tombstones within the prefix 606 // P2. Care is taken to avoid ever advancing the iterator beyond the current 607 // prefix. If nextEntry is ever invoked while we're already beyond the 608 // current prefix, we're violating the invariant. 609 if invariants.Enabled && m.prefix != nil { 610 if s := m.split(l.iterKey.UserKey); !bytes.Equal(m.prefix, l.iterKey.UserKey[:s]) { 611 m.logger.Fatalf("mergingIter: prefix violation: nexting beyond prefix %q; existing heap root %q\n%s", 612 m.prefix, l.iterKey, debug.Stack()) 613 } 614 } 615 616 oldTopLevel := l.index 617 oldRangeDelIter := l.rangeDelIter 618 619 if succKey == nil { 620 l.iterKey, l.iterValue = l.iter.Next() 621 } else { 622 l.iterKey, l.iterValue = l.iter.NextPrefix(succKey) 623 } 624 625 if l.iterKey != nil { 626 if m.heap.len() > 1 { 627 m.heap.fix(0) 628 } 629 if l.rangeDelIter != oldRangeDelIter { 630 // The rangeDelIter changed which indicates that the l.iter moved to the 631 // next sstable. We have to update the tombstone for oldTopLevel as well. 632 oldTopLevel-- 633 } 634 } else { 635 m.err = l.iter.Error() 636 if m.err == nil { 637 m.heap.pop() 638 } 639 } 640 641 // The cached tombstones are only valid for the levels 642 // [0,oldTopLevel]. Updated the cached tombstones for any levels in the range 643 // [oldTopLevel+1,heap[0].index]. 644 m.initMinRangeDelIters(oldTopLevel) 645 } 646 647 // isNextEntryDeleted starts from the current entry (as the next entry) and if 648 // it is deleted, moves the iterators forward as needed and returns true, else 649 // it returns false. item is the top item in the heap. 650 // 651 // During prefix iteration mode, isNextEntryDeleted will exhaust the iterator by 652 // clearing the heap if the deleted key(s) extend beyond the iteration prefix 653 // during prefix-iteration mode. 654 func (m *mergingIter) isNextEntryDeleted(item *mergingIterLevel) bool { 655 // Look for a range deletion tombstone containing item.iterKey at higher 656 // levels (level < item.index). If we find such a range tombstone we know 657 // it deletes the key in the current level. Also look for a range 658 // deletion at the current level (level == item.index). If we find such a 659 // range deletion we need to check whether it is newer than the current 660 // entry. 661 for level := 0; level <= item.index; level++ { 662 l := &m.levels[level] 663 if l.rangeDelIter == nil || l.tombstone == nil { 664 // If l.tombstone is nil, there are no further tombstones 665 // in the current sstable in the current (forward) iteration 666 // direction. 667 continue 668 } 669 if m.heap.cmp(l.tombstone.End, item.iterKey.UserKey) <= 0 { 670 // The current key is at or past the tombstone end key. 671 // 672 // NB: for the case that this l.rangeDelIter is provided by a levelIter we know that 673 // the levelIter must be positioned at a key >= item.iterKey. So it is sufficient to seek the 674 // current l.rangeDelIter (since any range del iterators that will be provided by the 675 // levelIter in the future cannot contain item.iterKey). Also, it is possible that we 676 // will encounter parts of the range delete that should be ignored -- we handle that 677 // below. 
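// To make the checks below concrete, here is a simplified model of what
// Contains, VisibleAt, and CoversAt amount to for a single fragment. It is an
// approximation: the real keyspan.Span methods operate on spans carrying
// multiple keys and also handle batch sequence numbers. The tomb type and
// covers function are invented for this example.
//
//	type tomb struct {
//		start, end string // deletes user keys in [start, end)
//		seqNum     uint64
//	}
//
//	// covers reports whether t deletes the point key (userKey, keySeqNum)
//	// when reading at the given snapshot sequence number.
//	func covers(t tomb, userKey string, keySeqNum, snapshot uint64) bool {
//		contains := t.start <= userKey && userKey < t.end // Contains
//		visible := t.seqNum < snapshot                    // VisibleAt
//		newer := t.seqNum > keySeqNum                     // CoversAt
//		return contains && visible && newer
//	}
//
// For example, at snapshot 10 the tombstone [b,i)#7 covers c#6 (it contains
// "c", 7 < 10, and 7 > 6) but not c#8, and a tombstone written at #12 is
// ignored entirely because it is not yet visible at snapshot 10.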
678 l.tombstone = l.rangeDelIter.SeekGE(item.iterKey.UserKey) 679 } 680 if l.tombstone == nil { 681 continue 682 } 683 684 // Reasoning for correctness of untruncated tombstone handling when the untruncated 685 // tombstone is at a higher level: 686 // The iterator corresponding to this tombstone is still in the heap so it must be 687 // positioned >= item.iterKey. Which means the Largest key bound of the sstable containing this 688 // tombstone is >= item.iterKey. So the upper limit of this tombstone cannot be file-bounds-constrained 689 // to < item.iterKey. But it is possible that item.key < smallestUserKey, in which 690 // case this tombstone should be ignored. 691 // 692 // Example 1: 693 // sstable bounds [c#8, g#12] containing a tombstone [b, i)#7, and key is c#6. The 694 // smallestUserKey is c, so we know the key is within the file bounds and the tombstone 695 // [b, i) covers it. 696 // 697 // Example 2: 698 // Same sstable bounds but key is b#10. The smallestUserKey is c, so the tombstone [b, i) 699 // does not cover this key. 700 // 701 // For a tombstone at the same level as the key, the file bounds are trivially satisfied. 702 if (l.smallestUserKey == nil || m.heap.cmp(l.smallestUserKey, item.iterKey.UserKey) <= 0) && 703 l.tombstone.VisibleAt(m.snapshot) && l.tombstone.Contains(m.heap.cmp, item.iterKey.UserKey) { 704 if level < item.index { 705 // We could also do m.seekGE(..., level + 1). The levels from 706 // [level + 1, item.index) are already after item.iterKey so seeking them may be 707 // wasteful. 708 709 // We can seek up to the min of largestUserKey and tombstone.End. 710 // 711 // Using example 1 above, we can seek to the smaller of g and i, which is g. 712 // 713 // Another example, where the sstable bounds are [c#8, i#InternalRangeDelSentinel], 714 // and the tombstone is [b, i)#8. Seeking to i is correct since it is seeking up to 715 // the exclusive bound of the tombstone. We do not need to look at 716 // isLargestKeyRangeDelSentinel. 717 // 718 // Progress argument: Since this file is at a higher level than item.iterKey we know 719 // that the iterator in this file must be positioned within its bounds and at a key 720 // X > item.iterKey (otherwise it would be the min of the heap). It is not 721 // possible for X.UserKey == item.iterKey.UserKey, since it is incompatible with 722 // X > item.iterKey (a lower version cannot be in a higher sstable), so it must be that 723 // X.UserKey > item.iterKey.UserKey. Which means l.largestUserKey > item.key.UserKey. 724 // We also know that l.tombstone.End > item.iterKey.UserKey. So the min of these, 725 // seekKey, computed below, is > item.iterKey.UserKey, so the call to seekGE() will 726 // make forward progress. 727 seekKey := l.tombstone.End 728 if l.largestUserKey != nil && m.heap.cmp(l.largestUserKey, seekKey) < 0 { 729 seekKey = l.largestUserKey 730 } 731 // This seek is not directly due to a SeekGE call, so we don't know 732 // enough about the underlying iterator positions, and so we keep the 733 // try-seek-using-next optimization disabled. Additionally, if we're in 734 // prefix-seek mode and a re-seek would have moved us past the original 735 // prefix, we can remove all merging iter levels below the rangedel 736 // tombstone's level and return immediately instead of re-seeking. This 737 // is correct since those levels cannot provide a key that matches the 738 // prefix, and is also visible. 
Additionally, this is important to make 739 // subsequent `TrySeekUsingNext` work correctly, as a re-seek on a 740 // different prefix could have resulted in this iterator skipping visible 741 // keys at prefixes in between m.prefix and seekKey, that are currently 742 // not in the heap due to a bloom filter mismatch. 743 // 744 // Additionally, we set the relative-seek flag. This is 745 // important when iterating with lazy combined iteration. If 746 // there's a range key between this level's current file and the 747 // file the seek will land on, we need to detect it in order to 748 // trigger construction of the combined iterator. 749 if m.prefix != nil { 750 if n := m.split(seekKey); !bytes.Equal(m.prefix, seekKey[:n]) { 751 for i := item.index; i < len(m.levels); i++ { 752 // Remove this level from the heap. Setting iterKey and iterValue 753 // to their zero values should be sufficient for initMinHeap to not 754 // re-initialize the heap with them in it. Other fields in 755 // mergingIterLevel can remain as-is; the iter/rangeDelIter needs 756 // to stay intact for future trySeekUsingNexts to work, the level 757 // iter boundary context is owned by the levelIter which is not 758 // being repositioned, and any tombstones in these levels will be 759 // irrelevant for us anyway. 760 m.levels[i].iterKey = nil 761 m.levels[i].iterValue = base.LazyValue{} 762 } 763 // TODO(bilal): Consider a more efficient way of removing levels from 764 // the heap without reinitializing all of it. This would likely 765 // necessitate tracking the heap positions of each mergingIterHeap 766 // item in the mergingIterLevel, and then swapping that item in the 767 // heap with the last-positioned heap item, and shrinking the heap by 768 // one. 769 m.initMinHeap() 770 return true 771 } 772 } 773 m.seekGE(seekKey, item.index, base.SeekGEFlagsNone.EnableRelativeSeek()) 774 return true 775 } 776 if l.tombstone.CoversAt(m.snapshot, item.iterKey.SeqNum()) { 777 if m.prefix == nil { 778 m.nextEntry(item, nil /* succKey */) 779 } else { 780 m.maybeNextEntryWithinPrefix(item) 781 } 782 return true 783 } 784 } 785 } 786 return false 787 } 788 789 // Starting from the current entry, finds the first (next) entry that can be returned. 790 func (m *mergingIter) findNextEntry() (*InternalKey, base.LazyValue) { 791 for m.heap.len() > 0 && m.err == nil { 792 item := m.heap.items[0] 793 if m.levels[item.index].isSyntheticIterBoundsKey { 794 break 795 } 796 797 m.addItemStats(item) 798 799 // Skip ignorable boundary keys. These are not real keys and exist to 800 // keep sstables open until we've surpassed their end boundaries so that 801 // their range deletions are visible. 802 if m.levels[item.index].isIgnorableBoundaryKey { 803 if m.prefix == nil { 804 m.nextEntry(item, nil /* succKey */) 805 } else { 806 m.maybeNextEntryWithinPrefix(item) 807 } 808 continue 809 } 810 811 // Check if the heap root key is deleted by a range tombstone in a 812 // higher level. If it is, isNextEntryDeleted will advance the iterator 813 // to a later key (through seeking or nexting). 814 if m.isNextEntryDeleted(item) { 815 m.stats.PointsCoveredByRangeTombstones++ 816 continue 817 } 818 819 // Check if the key is visible at the iterator sequence numbers. 820 if !item.iterKey.Visible(m.snapshot, m.batchSnapshot) { 821 if m.prefix == nil { 822 m.nextEntry(item, nil /* succKey */) 823 } else { 824 m.maybeNextEntryWithinPrefix(item) 825 } 826 continue 827 } 828 829 // The heap root is visible and not deleted by any range tombstones. 830 // Return it. 
831 return item.iterKey, item.iterValue 832 } 833 return nil, base.LazyValue{} 834 } 835 836 // Steps to the prev entry. item is the current top item in the heap. 837 func (m *mergingIter) prevEntry(l *mergingIterLevel) { 838 oldTopLevel := l.index 839 oldRangeDelIter := l.rangeDelIter 840 if l.iterKey, l.iterValue = l.iter.Prev(); l.iterKey != nil { 841 if m.heap.len() > 1 { 842 m.heap.fix(0) 843 } 844 if l.rangeDelIter != oldRangeDelIter && l.rangeDelIter != nil { 845 // The rangeDelIter changed which indicates that the l.iter moved to the 846 // previous sstable. We have to update the tombstone for oldTopLevel as 847 // well. 848 oldTopLevel-- 849 } 850 } else { 851 m.err = l.iter.Error() 852 if m.err == nil { 853 m.heap.pop() 854 } 855 } 856 857 // The cached tombstones are only valid for the levels 858 // [0,oldTopLevel]. Update the cached tombstones for any levels in the range 859 // [oldTopLevel+1,heap[0].index]. 860 m.initMaxRangeDelIters(oldTopLevel) 861 } 862 863 // isPrevEntryDeleted() starts from the current entry (as the prev entry) and if it is deleted, 864 // moves the iterators backward as needed and returns true, else it returns false. item is the top 865 // item in the heap. 866 func (m *mergingIter) isPrevEntryDeleted(item *mergingIterLevel) bool { 867 // Look for a range deletion tombstone containing item.iterKey at higher 868 // levels (level < item.index). If we find such a range tombstone we know 869 // it deletes the key in the current level. Also look for a range 870 // deletion at the current level (level == item.index). If we find such a 871 // range deletion we need to check whether it is newer than the current 872 // entry. 873 for level := 0; level <= item.index; level++ { 874 l := &m.levels[level] 875 if l.rangeDelIter == nil || l.tombstone == nil { 876 // If l.tombstone is nil, there are no further tombstones 877 // in the current sstable in the current (reverse) iteration 878 // direction. 879 continue 880 } 881 if m.heap.cmp(item.iterKey.UserKey, l.tombstone.Start) < 0 { 882 // The current key is before the tombstone start key. 883 // 884 // NB: for the case that this l.rangeDelIter is provided by a levelIter we know that 885 // the levelIter must be positioned at a key < item.iterKey. So it is sufficient to seek the 886 // current l.rangeDelIter (since any range del iterators that will be provided by the 887 // levelIter in the future cannot contain item.iterKey). Also, it is possible that we 888 // will encounter parts of the range delete that should be ignored -- we handle that 889 // below. 890 l.tombstone = keyspan.SeekLE(m.heap.cmp, l.rangeDelIter, item.iterKey.UserKey) 891 } 892 if l.tombstone == nil { 893 continue 894 } 895 896 // Reasoning for correctness of untruncated tombstone handling when the untruncated 897 // tombstone is at a higher level: 898 // 899 // The iterator corresponding to this tombstone is still in the heap so it must be 900 // positioned <= item.iterKey. Which means the Smallest key bound of the sstable containing this 901 // tombstone is <= item.iterKey. So the lower limit of this tombstone cannot have been 902 // file-bounds-constrained to > item.iterKey. But it is possible that item.key >= Largest 903 // key bound of this sstable, in which case this tombstone should be ignored. 904 // 905 // Example 1: 906 // sstable bounds [c#8, g#12] containing a tombstone [b, i)#7, and key is f#6. The 907 // largestUserKey is g, so we know the key is within the file bounds and the tombstone 908 // [b, i) covers it.
909 // 910 // Example 2: 911 // Same sstable but the key is g#6. This cannot happen since the [b, i)#7 untruncated 912 // tombstone was involved in a compaction which must have had a file to the right of this 913 // sstable that is part of the same atomic compaction group for future compactions. That 914 // file must have bounds that cover g#6 and this levelIter must be at that file. 915 // 916 // Example 3: 917 // sstable bounds [c#8, g#RangeDelSentinel] containing [b, i)#7 and the key is g#10. 918 // This key is not deleted by this tombstone. We need to look at 919 // isLargestUserKeyExclusive. 920 // 921 // For a tombstone at the same level as the key, the file bounds are trivially satisfied. 922 923 // Default to within bounds. 924 withinLargestSSTableBound := true 925 if l.largestUserKey != nil { 926 cmpResult := m.heap.cmp(l.largestUserKey, item.iterKey.UserKey) 927 withinLargestSSTableBound = cmpResult > 0 || (cmpResult == 0 && !l.isLargestUserKeyExclusive) 928 } 929 if withinLargestSSTableBound && l.tombstone.Contains(m.heap.cmp, item.iterKey.UserKey) && l.tombstone.VisibleAt(m.snapshot) { 930 if level < item.index { 931 // We could also do m.seekLT(..., level + 1). The levels from 932 // [level + 1, item.index) are already before item.iterKey so seeking them may be 933 // wasteful. 934 935 // We can seek up to the max of smallestUserKey and tombstone.Start.UserKey. 936 // 937 // Using example 1 above, we can seek to the larger of c and b, which is c. 938 // 939 // Progress argument: We know that the iterator in this file is positioned within 940 // its bounds and at a key X < item.iterKey (otherwise it would be the max of the heap). 941 // So smallestUserKey <= item.iterKey.UserKey and we already know that 942 // l.tombstone.Start.UserKey <= item.iterKey.UserKey. So the seekKey computed below 943 // is <= item.iterKey.UserKey, and since we do a seekLT() we will make backwards 944 // progress. 945 seekKey := l.tombstone.Start 946 if l.smallestUserKey != nil && m.heap.cmp(l.smallestUserKey, seekKey) > 0 { 947 seekKey = l.smallestUserKey 948 } 949 // We set the relative-seek flag. This is important when 950 // iterating with lazy combined iteration. If there's a range 951 // key between this level's current file and the file the seek 952 // will land on, we need to detect it in order to trigger 953 // construction of the combined iterator. 954 m.seekLT(seekKey, item.index, base.SeekLTFlagsNone.EnableRelativeSeek()) 955 return true 956 } 957 if l.tombstone.CoversAt(m.snapshot, item.iterKey.SeqNum()) { 958 m.prevEntry(item) 959 return true 960 } 961 } 962 } 963 return false 964 } 965 966 // Starting from the current entry, finds the first (prev) entry that can be returned. 967 func (m *mergingIter) findPrevEntry() (*InternalKey, base.LazyValue) { 968 for m.heap.len() > 0 && m.err == nil { 969 item := m.heap.items[0] 970 if m.levels[item.index].isSyntheticIterBoundsKey { 971 break 972 } 973 m.addItemStats(item) 974 if m.isPrevEntryDeleted(item) { 975 m.stats.PointsCoveredByRangeTombstones++ 976 continue 977 } 978 if item.iterKey.Visible(m.snapshot, m.batchSnapshot) && 979 (!m.levels[item.index].isIgnorableBoundaryKey) { 980 return item.iterKey, item.iterValue 981 } 982 m.prevEntry(item) 983 } 984 return nil, base.LazyValue{} 985 } 986 987 // Seeks levels >= level to >= key. Additionally uses range tombstones to extend the seeks. 
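//
// The forward (isNextEntryDeleted) and reverse (isPrevEntryDeleted) cases
// above clamp the deletion-driven seek target to the current file's bounds in
// a symmetric way, and seekGE below applies the same forward clamping. A
// compact sketch of that rule follows; clampedSeekTargets is a hypothetical
// helper, not a real pebble function.
//
//	// clampedSeekTargets returns the key a forward seek may jump to and the
//	// key a reverse seek may jump back to, given a covering tombstone
//	// [tombStart, tombEnd) from a file with user-key bounds
//	// [smallest, largest]. The tombstone cannot be trusted outside its
//	// file's bounds, so the jump is limited to them.
//	func clampedSeekTargets(tombStart, tombEnd, smallest, largest string) (fwd, rev string) {
//		fwd = tombEnd
//		if largest < fwd {
//			fwd = largest
//		}
//		rev = tombStart
//		if smallest > rev {
//			rev = smallest
//		}
//		return fwd, rev
//	}
//
// Using Example 1 above (file bounds [c, g], tombstone [b, i)), a forward
// seek may jump to g (the smaller of i and g) and a reverse seek may jump
// back to c (the larger of b and c).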
988 func (m *mergingIter) seekGE(key []byte, level int, flags base.SeekGEFlags) { 989 // When seeking, we can use tombstones to adjust the key we seek to on each 990 // level. Consider the series of range tombstones: 991 // 992 // 1: a---e 993 // 2: d---h 994 // 3: g---k 995 // 4: j---n 996 // 5: m---q 997 // 998 // If we SeekGE("b") we also find the tombstone "b" resides within in the 999 // first level which is [a,e). Regardless of whether this tombstone deletes 1000 // "b" in that level, we know it deletes "b" in all lower levels, so we 1001 // adjust the search key in the next level to the tombstone end key "e". We 1002 // then SeekGE("e") in the second level and find the corresponding tombstone 1003 // [d,h). This process continues and we end up seeking for "h" in the 3rd 1004 // level, "k" in the 4th level and "n" in the last level. 1005 // 1006 // TODO(peter,rangedel): In addition to the above we can delay seeking a 1007 // level (and any lower levels) when the current iterator position is 1008 // contained within a range tombstone at a higher level. 1009 1010 // Deterministically disable the TrySeekUsingNext optimizations sometimes in 1011 // invariant builds to encourage the metamorphic tests to surface bugs. Note 1012 // that we cannot disable the optimization within individual levels. It must 1013 // be disabled for all levels or none. If one lower-level iterator performs 1014 // a fresh seek whereas another takes advantage of its current iterator 1015 // position, the heap can become inconsistent. Consider the following 1016 // example: 1017 // 1018 // L5: [ [b-c) ] [ d ]* 1019 // L6: [ b ] [e]* 1020 // 1021 // Imagine a SeekGE(a). The [b-c) range tombstone deletes the L6 point key 1022 // 'b', resulting in the iterator positioned at d with the heap: 1023 // 1024 // {L5: d, L6: e} 1025 // 1026 // A subsequent SeekGE(b) is seeking to a larger key, so the caller may set 1027 // TrySeekUsingNext()=true. If the L5 iterator used the TrySeekUsingNext 1028 // optimization but the L6 iterator did not, the iterator would have the 1029 // heap: 1030 // 1031 // {L6: b, L5: d} 1032 // 1033 // Because the L5 iterator has already advanced to the next sstable, the 1034 // merging iterator cannot observe the [b-c) range tombstone and will 1035 // mistakenly return L6's deleted point key 'b'. 1036 if invariants.Enabled && flags.TrySeekUsingNext() && !m.forceEnableSeekOpt && 1037 disableSeekOpt(key, uintptr(unsafe.Pointer(m))) { 1038 flags = flags.DisableTrySeekUsingNext() 1039 } 1040 1041 for ; level < len(m.levels); level++ { 1042 if invariants.Enabled && m.lower != nil && m.heap.cmp(key, m.lower) < 0 { 1043 m.logger.Fatalf("mergingIter: lower bound violation: %s < %s\n%s", key, m.lower, debug.Stack()) 1044 } 1045 1046 l := &m.levels[level] 1047 if m.prefix != nil { 1048 l.iterKey, l.iterValue = l.iter.SeekPrefixGE(m.prefix, key, flags) 1049 } else { 1050 l.iterKey, l.iterValue = l.iter.SeekGE(key, flags) 1051 } 1052 1053 // If this level contains overlapping range tombstones, alter the seek 1054 // key accordingly. Caveat: If we're performing lazy-combined iteration, 1055 // we cannot alter the seek key: Range tombstones don't delete range 1056 // keys, and there might exist live range keys within the range 1057 // tombstone's span that need to be observed to trigger a switch to 1058 // combined iteration. 1059 if rangeDelIter := l.rangeDelIter; rangeDelIter != nil && 1060 (m.combinedIterState == nil || m.combinedIterState.initialized) { 1061 // The level has a range-del iterator. 
Find the tombstone containing 1062 // the search key. 1063 // 1064 // For untruncated tombstones that are possibly file-bounds-constrained, we are using a 1065 // levelIter which will set smallestUserKey and largestUserKey. Since the levelIter 1066 // is at this file we know that largestUserKey >= key, so we know that the 1067 // tombstone we find cannot be file-bounds-constrained in its upper bound to something < key. 1068 // We do need to compare with smallestUserKey to ensure that the tombstone is not 1069 // file-bounds-constrained in its lower bound. 1070 // 1071 // See the detailed comments in isNextEntryDeleted() on why similar containment and 1072 // seeking logic is correct. The subtle difference here is that key is a user key, 1073 // so we can have a sstable with bounds [c#8, i#InternalRangeDelSentinel], and the 1074 // tombstone is [b, k)#8 and the seek key is i: levelIter.SeekGE(i) will move past 1075 // this sstable since it realizes the largest key is a InternalRangeDelSentinel. 1076 l.tombstone = rangeDelIter.SeekGE(key) 1077 if l.tombstone != nil && l.tombstone.VisibleAt(m.snapshot) && l.tombstone.Contains(m.heap.cmp, key) && 1078 (l.smallestUserKey == nil || m.heap.cmp(l.smallestUserKey, key) <= 0) { 1079 // NB: Based on the comment above l.largestUserKey >= key, and based on the 1080 // containment condition tombstone.End > key, so the assignment to key results 1081 // in a monotonically non-decreasing key across iterations of this loop. 1082 // 1083 // The adjustment of key here can only move it to a larger key. Since 1084 // the caller of seekGE guaranteed that the original key was greater 1085 // than or equal to m.lower, the new key will continue to be greater 1086 // than or equal to m.lower. 1087 if l.largestUserKey != nil && 1088 m.heap.cmp(l.largestUserKey, l.tombstone.End) < 0 { 1089 // Truncate the tombstone for seeking purposes. Note that this can over-truncate 1090 // but that is harmless for this seek optimization. 1091 key = l.largestUserKey 1092 } else { 1093 key = l.tombstone.End 1094 } 1095 } 1096 } 1097 } 1098 1099 m.initMinHeap() 1100 } 1101 1102 func (m *mergingIter) String() string { 1103 return "merging" 1104 } 1105 1106 // SeekGE implements base.InternalIterator.SeekGE. Note that SeekGE only checks 1107 // the upper bound. It is up to the caller to ensure that key is greater than 1108 // or equal to the lower bound. 1109 func (m *mergingIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) { 1110 m.err = nil // clear cached iteration error 1111 m.prefix = nil 1112 m.seekGE(key, 0 /* start level */, flags) 1113 return m.findNextEntry() 1114 } 1115 1116 // SeekPrefixGE implements base.InternalIterator.SeekPrefixGE. Note that 1117 // SeekPrefixGE only checks the upper bound. It is up to the caller to ensure 1118 // that key is greater than or equal to the lower bound. 1119 func (m *mergingIter) SeekPrefixGE( 1120 prefix, key []byte, flags base.SeekGEFlags, 1121 ) (*base.InternalKey, base.LazyValue) { 1122 m.err = nil // clear cached iteration error 1123 m.prefix = prefix 1124 m.seekGE(key, 0 /* start level */, flags) 1125 return m.findNextEntry() 1126 } 1127 1128 // Seeks levels >= level to < key. Additionally uses range tombstones to extend the seeks. 1129 func (m *mergingIter) seekLT(key []byte, level int, flags base.SeekLTFlags) { 1130 // See the comment in seekGE regarding using tombstones to adjust the seek 1131 // target per level. 
1132 m.prefix = nil 1133 for ; level < len(m.levels); level++ { 1134 if invariants.Enabled && m.upper != nil && m.heap.cmp(key, m.upper) > 0 { 1135 m.logger.Fatalf("mergingIter: upper bound violation: %s > %s\n%s", key, m.upper, debug.Stack()) 1136 } 1137 1138 l := &m.levels[level] 1139 l.iterKey, l.iterValue = l.iter.SeekLT(key, flags) 1140 1141 // If this level contains overlapping range tombstones, alter the seek 1142 // key accordingly. Caveat: If we're performing lazy-combined iteration, 1143 // we cannot alter the seek key: Range tombstones don't delete range 1144 // keys, and there might exist live range keys within the range 1145 // tombstone's span that need to be observed to trigger a switch to 1146 // combined iteration. 1147 if rangeDelIter := l.rangeDelIter; rangeDelIter != nil && 1148 (m.combinedIterState == nil || m.combinedIterState.initialized) { 1149 // The level has a range-del iterator. Find the tombstone containing 1150 // the search key. 1151 // 1152 // For untruncated tombstones that are possibly file-bounds-constrained we are using a 1153 // levelIter which will set smallestUserKey and largestUserKey. Since the levelIter 1154 // is at this file we know that smallestUserKey <= key, so we know that the 1155 // tombstone we find cannot be file-bounds-constrained in its lower bound to something > key. 1156 // We do need to compare with largestUserKey to ensure that the tombstone is not 1157 // file-bounds-constrained in its upper bound. 1158 // 1159 // See the detailed comments in isPrevEntryDeleted() on why similar containment and 1160 // seeking logic is correct. 1161 1162 // Default to within bounds. 1163 withinLargestSSTableBound := true 1164 if l.largestUserKey != nil { 1165 cmpResult := m.heap.cmp(l.largestUserKey, key) 1166 withinLargestSSTableBound = cmpResult > 0 || (cmpResult == 0 && !l.isLargestUserKeyExclusive) 1167 } 1168 1169 l.tombstone = keyspan.SeekLE(m.heap.cmp, rangeDelIter, key) 1170 if l.tombstone != nil && l.tombstone.VisibleAt(m.snapshot) && 1171 l.tombstone.Contains(m.heap.cmp, key) && withinLargestSSTableBound { 1172 // NB: Based on the comment above l.smallestUserKey <= key, and based 1173 // on the containment condition tombstone.Start.UserKey <= key, so the 1174 // assignment to key results in a monotonically non-increasing key 1175 // across iterations of this loop. 1176 // 1177 // The adjustment of key here can only move it to a smaller key. Since 1178 // the caller of seekLT guaranteed that the original key was less than 1179 // or equal to m.upper, the new key will continue to be less than or 1180 // equal to m.upper. 1181 if l.smallestUserKey != nil && 1182 m.heap.cmp(l.smallestUserKey, l.tombstone.Start) >= 0 { 1183 // Truncate the tombstone for seeking purposes. Note that this can over-truncate 1184 // but that is harmless for this seek optimization. 1185 key = l.smallestUserKey 1186 } else { 1187 key = l.tombstone.Start 1188 } 1189 } 1190 } 1191 } 1192 1193 m.initMaxHeap() 1194 } 1195 1196 // SeekLT implements base.InternalIterator.SeekLT. Note that SeekLT only checks 1197 // the lower bound. It is up to the caller to ensure that key is less than the 1198 // upper bound. 1199 func (m *mergingIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) { 1200 m.err = nil // clear cached iteration error 1201 m.prefix = nil 1202 m.seekLT(key, 0 /* start level */, flags) 1203 return m.findPrevEntry() 1204 } 1205 1206 // First implements base.InternalIterator.First. Note that First only checks 1207 // the upper bound. 
It is up to the caller to ensure that key is greater than 1208 // or equal to the lower bound (e.g. via a call to SeekGE(lower)). 1209 func (m *mergingIter) First() (*InternalKey, base.LazyValue) { 1210 m.err = nil // clear cached iteration error 1211 m.prefix = nil 1212 m.heap.items = m.heap.items[:0] 1213 for i := range m.levels { 1214 l := &m.levels[i] 1215 l.iterKey, l.iterValue = l.iter.First() 1216 } 1217 m.initMinHeap() 1218 return m.findNextEntry() 1219 } 1220 1221 // Last implements base.InternalIterator.Last. Note that Last only checks the 1222 // lower bound. It is up to the caller to ensure that key is less than the 1223 // upper bound (e.g. via a call to SeekLT(upper)) 1224 func (m *mergingIter) Last() (*InternalKey, base.LazyValue) { 1225 m.err = nil // clear cached iteration error 1226 m.prefix = nil 1227 for i := range m.levels { 1228 l := &m.levels[i] 1229 l.iterKey, l.iterValue = l.iter.Last() 1230 } 1231 m.initMaxHeap() 1232 return m.findPrevEntry() 1233 } 1234 1235 func (m *mergingIter) Next() (*InternalKey, base.LazyValue) { 1236 if m.err != nil { 1237 return nil, base.LazyValue{} 1238 } 1239 1240 if m.dir != 1 { 1241 m.switchToMinHeap() 1242 return m.findNextEntry() 1243 } 1244 1245 if m.heap.len() == 0 { 1246 return nil, base.LazyValue{} 1247 } 1248 1249 // NB: It's okay to call nextEntry directly even during prefix iteration 1250 // mode (as opposed to indirectly through maybeNextEntryWithinPrefix). 1251 // During prefix iteration mode, we rely on the caller to not call Next if 1252 // the iterator has already advanced beyond the iteration prefix. See the 1253 // comment above the base.InternalIterator interface. 1254 m.nextEntry(m.heap.items[0], nil /* succKey */) 1255 return m.findNextEntry() 1256 } 1257 1258 func (m *mergingIter) NextPrefix(succKey []byte) (*InternalKey, LazyValue) { 1259 if m.dir != 1 { 1260 panic("pebble: cannot switch directions with NextPrefix") 1261 } 1262 if m.err != nil || m.heap.len() == 0 { 1263 return nil, LazyValue{} 1264 } 1265 if m.levelsPositioned == nil { 1266 m.levelsPositioned = make([]bool, len(m.levels)) 1267 } else { 1268 for i := range m.levelsPositioned { 1269 m.levelsPositioned[i] = false 1270 } 1271 } 1272 1273 // The heap root necessarily must be positioned at a key < succKey, because 1274 // NextPrefix was invoked. 1275 root := &m.heap.items[0] 1276 m.levelsPositioned[(*root).index] = true 1277 if invariants.Enabled && m.heap.cmp((*root).iterKey.UserKey, succKey) >= 0 { 1278 m.logger.Fatalf("pebble: invariant violation: NextPrefix(%q) called on merging iterator already positioned at %q", 1279 succKey, (*root).iterKey) 1280 } 1281 m.nextEntry(*root, succKey) 1282 // NB: root is a pointer to the heap root. nextEntry may have changed 1283 // the heap root, so we must not expect root to still point to the same 1284 // level (or to even be valid, if the heap is now exhaused). 1285 1286 for m.heap.len() > 0 { 1287 if m.levelsPositioned[(*root).index] { 1288 // A level we've previously positioned is at the top of the heap, so 1289 // there are no other levels positioned at keys < succKey. We've 1290 // advanced as far as we need to. 1291 break 1292 } 1293 // Since this level was not the original heap root when NextPrefix was 1294 // called, we don't know whether this level's current key has the 1295 // previous prefix or a new one. 
1296 if m.heap.cmp((*root).iterKey.UserKey, succKey) >= 0 { 1297 break 1298 } 1299 m.levelsPositioned[(*root).index] = true 1300 m.nextEntry(*root, succKey) 1301 } 1302 return m.findNextEntry() 1303 } 1304 1305 func (m *mergingIter) Prev() (*InternalKey, base.LazyValue) { 1306 if m.err != nil { 1307 return nil, base.LazyValue{} 1308 } 1309 1310 if m.dir != -1 { 1311 if m.prefix != nil { 1312 m.err = errors.New("pebble: unsupported reverse prefix iteration") 1313 return nil, base.LazyValue{} 1314 } 1315 m.switchToMaxHeap() 1316 return m.findPrevEntry() 1317 } 1318 1319 if m.heap.len() == 0 { 1320 return nil, base.LazyValue{} 1321 } 1322 1323 m.prevEntry(m.heap.items[0]) 1324 return m.findPrevEntry() 1325 } 1326 1327 func (m *mergingIter) Error() error { 1328 if m.heap.len() == 0 || m.err != nil { 1329 return m.err 1330 } 1331 return m.levels[m.heap.items[0].index].iter.Error() 1332 } 1333 1334 func (m *mergingIter) Close() error { 1335 for i := range m.levels { 1336 iter := m.levels[i].iter 1337 if err := iter.Close(); err != nil && m.err == nil { 1338 m.err = err 1339 } 1340 if rangeDelIter := m.levels[i].rangeDelIter; rangeDelIter != nil { 1341 if err := rangeDelIter.Close(); err != nil && m.err == nil { 1342 m.err = err 1343 } 1344 } 1345 } 1346 m.levels = nil 1347 m.heap.items = m.heap.items[:0] 1348 return m.err 1349 } 1350 1351 func (m *mergingIter) SetBounds(lower, upper []byte) { 1352 m.prefix = nil 1353 m.lower = lower 1354 m.upper = upper 1355 for i := range m.levels { 1356 m.levels[i].iter.SetBounds(lower, upper) 1357 } 1358 m.heap.clear() 1359 } 1360 1361 func (m *mergingIter) DebugString() string { 1362 var buf bytes.Buffer 1363 sep := "" 1364 for m.heap.len() > 0 { 1365 item := m.heap.pop() 1366 fmt.Fprintf(&buf, "%s%s", sep, item.iterKey) 1367 sep = " " 1368 } 1369 if m.dir == 1 { 1370 m.initMinHeap() 1371 } else { 1372 m.initMaxHeap() 1373 } 1374 return buf.String() 1375 } 1376 1377 func (m *mergingIter) ForEachLevelIter(fn func(li *levelIter) bool) { 1378 for _, ml := range m.levels { 1379 if ml.levelIter != nil { 1380 if done := fn(ml.levelIter); done { 1381 break 1382 } 1383 } 1384 } 1385 } 1386 1387 func (m *mergingIter) addItemStats(l *mergingIterLevel) { 1388 m.stats.PointCount++ 1389 m.stats.KeyBytes += uint64(len(l.iterKey.UserKey)) 1390 m.stats.ValueBytes += uint64(len(l.iterValue.ValueOrHandle)) 1391 } 1392 1393 var _ internalIterator = &mergingIter{}
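// To see the heap-based merging in isolation, the following is a minimal,
// self-contained sketch of a k-way merge over sorted string slices using
// container/heap. It mirrors the First/Next structure of mergingIter
// (initialize every level, push the non-exhausted ones, repeatedly pop the
// root, emit it, advance it, and fix the heap) but omits sequence numbers,
// range deletions, bounds, and prefix iteration. The mergeHeap and kwayMerge
// names are illustrative, not pebble APIs.
//
//	package main
//
//	import (
//		"container/heap"
//		"fmt"
//	)
//
//	// cursor is one sorted input and the position of its current key.
//	type cursor struct {
//		keys []string
//		pos  int
//	}
//
//	type mergeHeap []*cursor
//
//	func (h mergeHeap) Len() int           { return len(h) }
//	func (h mergeHeap) Less(i, j int) bool { return h[i].keys[h[i].pos] < h[j].keys[h[j].pos] }
//	func (h mergeHeap) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }
//	func (h *mergeHeap) Push(x any)        { *h = append(*h, x.(*cursor)) }
//	func (h *mergeHeap) Pop() any {
//		old := *h
//		c := old[len(old)-1]
//		*h = old[:len(old)-1]
//		return c
//	}
//
//	// kwayMerge returns all keys from the sorted inputs in ascending order.
//	func kwayMerge(inputs ...[]string) []string {
//		h := mergeHeap{}
//		for _, in := range inputs {
//			if len(in) > 0 {
//				h = append(h, &cursor{keys: in})
//			}
//		}
//		heap.Init(&h)
//		var out []string
//		for h.Len() > 0 {
//			top := h[0] // the heap root holds the smallest current key
//			out = append(out, top.keys[top.pos])
//			top.pos++
//			if top.pos == len(top.keys) {
//				heap.Pop(&h) // this level is exhausted; drop it from the heap
//			} else {
//				heap.Fix(&h, 0) // advance in place and restore the heap property
//			}
//		}
//		return out
//	}
//
//	func main() {
//		fmt.Println(kwayMerge(
//			[]string{"b", "n", "o"},
//			[]string{"a", "g", "q"},
//			[]string{"d", "i"},
//		))
//		// Output: [a b d g i n o q]
//	}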