github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/internal/keyspan/merging_iter.go (about) 1 // Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package keyspan 6 7 import ( 8 "bytes" 9 "fmt" 10 "sort" 11 12 "github.com/cockroachdb/pebble/internal/base" 13 "github.com/cockroachdb/pebble/internal/invariants" 14 "github.com/cockroachdb/pebble/internal/manifest" 15 ) 16 17 // TODO(jackson): Consider implementing an optimization to seek lower levels 18 // past higher levels' RANGEKEYDELs. This would be analaogous to the 19 // optimization pebble.mergingIter performs for RANGEDELs during point key 20 // seeks. It may not be worth it, because range keys are rare and cascading 21 // seeks would require introducing key comparisons to switchTo{Min,Max}Heap 22 // where there currently are none. 23 24 // TODO(jackson): There are several opportunities to use base.Equal in the 25 // MergingIter implementation, but will require a bit of plumbing to thread the 26 // Equal function. 27 28 // MergingIter merges spans across levels of the LSM, exposing an iterator over 29 // spans that yields sets of spans fragmented at unique user key boundaries. 30 // 31 // A MergingIter is initialized with an arbitrary number of child iterators over 32 // fragmented spans. Each child iterator exposes fragmented key spans, such that 33 // overlapping keys are surfaced in a single Span. Key spans from one child 34 // iterator may overlap key spans from another child iterator arbitrarily. 35 // 36 // The spans combined by MergingIter will return spans with keys sorted by 37 // trailer descending. If the MergingIter is configured with a Transformer, it's 38 // permitted to modify the ordering of the spans' keys returned by MergingIter. 39 // 40 // # Algorithm 41 // 42 // The merging iterator wraps child iterators, merging and fragmenting spans 43 // across levels. The high-level algorithm is: 44 // 45 // 1. Initialize the heap with bound keys from child iterators' spans. 46 // 2. Find the next [or previous] two unique user keys' from bounds. 47 // 3. Consider the span formed between the two unique user keys a candidate 48 // span. 49 // 4. Determine if any of the child iterators' spans overlap the candidate 50 // span. 51 // 4a. If any of the child iterator's current bounds are end keys 52 // (during forward iteration) or start keys (during reverse 53 // iteration), then all the spans with that bound overlap the 54 // candidate span. 55 // 4b. Apply the configured transform, which may remove keys. 56 // 4c. If no spans overlap, forget the smallest (forward iteration) 57 // or largest (reverse iteration) unique user key and advance 58 // the iterators to the next unique user key. Start again from 3. 59 // 60 // # Detailed algorithm 61 // 62 // Each level (i0, i1, ...) has a user-provided input FragmentIterator. The 63 // merging iterator steps through individual boundaries of the underlying 64 // spans separately. If the underlying FragmentIterator has fragments 65 // [a,b){#2,#1} [b,c){#1} the mergingIterLevel.{next,prev} step through: 66 // 67 // (a, start), (b, end), (b, start), (c, end) 68 // 69 // Note that (a, start) and (b, end) are observed ONCE each, despite two keys 70 // sharing those bounds. Also note that (b, end) and (b, start) are two distinct 71 // iterator positions of a mergingIterLevel. 72 // 73 // The merging iterator maintains a heap (min during forward iteration, max 74 // during reverse iteration) containing the boundKeys. Each boundKey is a 75 // 3-tuple holding the bound user key, whether the bound is a start or end key 76 // and the set of keys from that level that have that bound. The heap orders 77 // based on the boundKey's user key only. 78 // 79 // The merging iterator is responsible for merging spans across levels to 80 // determine which span is next, but it's also responsible for fragmenting 81 // overlapping spans. Consider the example: 82 // 83 // i0: b---d e-----h 84 // i1: a---c h-----k 85 // i2: a------------------------------p 86 // 87 // fragments: a-b-c-d-e-----h-----k----------p 88 // 89 // None of the individual child iterators contain a span with the exact bounds 90 // [c,d), but the merging iterator must produce a span [c,d). To accomplish 91 // this, the merging iterator visits every span between unique boundary user 92 // keys. In the above example, this is: 93 // 94 // [a,b), [b,c), [c,d), [d,e), [e, h), [h, k), [k, p) 95 // 96 // The merging iterator first initializes the heap to prepare for iteration. 97 // The description below discusses the mechanics of forward iteration after a 98 // call to First, but the mechanics are similar for reverse iteration and 99 // other positioning methods. 100 // 101 // During a call to First, the heap is initialized by seeking every 102 // mergingIterLevel to the first bound of the first fragment. In the above 103 // example, this seeks the child iterators to: 104 // 105 // i0: (b, boundKindFragmentStart, [ [b,d) ]) 106 // i1: (a, boundKindFragmentStart, [ [a,c) ]) 107 // i2: (a, boundKindFragmentStart, [ [a,p) ]) 108 // 109 // After fixing up the heap, the root of the heap is a boundKey with the 110 // smallest user key ('a' in the example). Once the heap is setup for iteration 111 // in the appropriate direction and location, the merging iterator uses 112 // find{Next,Prev}FragmentSet to find the next/previous span bounds. 113 // 114 // During forward iteration, the root of the heap's user key is the start key 115 // key of next merged span. findNextFragmentSet sets m.start to this user 116 // key. The heap may contain other boundKeys with the same user key if another 117 // level has a fragment starting or ending at the same key, so the 118 // findNextFragmentSet method pulls from the heap until it finds the first key 119 // greater than m.start. This key is used as the end key. 120 // 121 // In the above example, this results in m.start = 'a', m.end = 'b' and child 122 // iterators in the following positions: 123 // 124 // i0: (b, boundKindFragmentStart, [ [b,d) ]) 125 // i1: (c, boundKindFragmentEnd, [ [a,c) ]) 126 // i2: (p, boundKindFragmentEnd, [ [a,p) ]) 127 // 128 // With the user key bounds of the next merged span established, 129 // findNextFragmentSet must determine which, if any, fragments overlap the span. 130 // During forward iteration any child iterator that is now positioned at an end 131 // boundary has an overlapping span. (Justification: The child iterator's end 132 // boundary is ≥ m.end. The corresponding start boundary must be ≤ m.start since 133 // there were no other user keys between m.start and m.end. So the fragments 134 // associated with the iterator's current end boundary have start and end bounds 135 // such that start ≤ m.start < m.end ≤ end). 136 // 137 // findNextFragmentSet iterates over the levels, collecting keys from any child 138 // iterators positioned at end boundaries. In the above example, i1 and i2 are 139 // positioned at end boundaries, so findNextFragmentSet collects the keys of 140 // [a,c) and [a,p). These spans contain the merging iterator's [m.start, m.end) 141 // span, but they may also extend beyond the m.start and m.end. The merging 142 // iterator returns the keys with the merging iter's m.start and m.end bounds, 143 // preserving the underlying keys' sequence numbers, key kinds and values. 144 // 145 // A MergingIter is configured with a Transform that's applied to the span 146 // before surfacing it to the iterator user. A Transform may remove keys 147 // arbitrarily, but it may not modify the values themselves. 148 // 149 // It may be the case that findNextFragmentSet finds no levels positioned at end 150 // boundaries, or that there are no spans remaining after applying a transform, 151 // in which case the span [m.start, m.end) overlaps with nothing. In this case 152 // findNextFragmentSet loops, repeating the above process again until it finds a 153 // span that does contain keys. 154 // 155 // # Memory safety 156 // 157 // The FragmentIterator interface only guarantees stability of a Span and its 158 // associated slices until the next positioning method is called. Adjacent Spans 159 // may be contained in different sstables, requring the FragmentIterator 160 // implementation to close one sstable, releasing its memory, before opening the 161 // next. Most of the state used by the MergingIter is derived from spans at 162 // current child iterator positions only, ensuring state is stable. The one 163 // exception is the start bound during forward iteration and the end bound 164 // during reverse iteration. 165 // 166 // If the heap root originates from an end boundary when findNextFragmentSet 167 // begins, a Next on the heap root level may invalidate the end boundary. To 168 // accommodate this, find{Next,Prev}FragmentSet copy the initial boundary if the 169 // subsequent Next/Prev would move to the next span. 170 type MergingIter struct { 171 *MergingBuffers 172 // start and end hold the bounds for the span currently under the 173 // iterator position. 174 // 175 // Invariant: None of the levels' iterators contain spans with a bound 176 // between start and end. For all bounds b, b ≤ start || b ≥ end. 177 start, end []byte 178 179 // transformer defines a transformation to be applied to a span before it's 180 // yielded to the user. Transforming may filter individual keys contained 181 // within the span. 182 transformer Transformer 183 // span holds the iterator's current span. This span is used as the 184 // destination for transforms. Every tranformed span overwrites the 185 // previous. 186 span Span 187 err error 188 dir int8 189 190 // alloc preallocates mergingIterLevel and mergingIterItems for use by the 191 // merging iterator. As long as the merging iterator is used with 192 // manifest.NumLevels+3 and fewer fragment iterators, the merging iterator 193 // will not need to allocate upon initialization. The value NumLevels+3 194 // mirrors the preallocated levels in iterAlloc used for point iterators. 195 // Invariant: cap(levels) == cap(items) 196 alloc struct { 197 levels [manifest.NumLevels + 3]mergingIterLevel 198 items [manifest.NumLevels + 3]mergingIterItem 199 } 200 } 201 202 // MergingBuffers holds buffers used while merging keyspans. 203 type MergingBuffers struct { 204 // keys holds all of the keys across all levels that overlap the key span 205 // [start, end), sorted by Trailer descending. This slice is reconstituted 206 // in synthesizeKeys from each mergingIterLevel's keys every time the 207 // [start, end) bounds change. 208 // 209 // Each element points into a child iterator's memory, so the keys may not 210 // be directly modified. 211 keys keysBySeqNumKind 212 // levels holds levels allocated by MergingIter.init. The MergingIter will 213 // prefer use of its `manifest.NumLevels+3` array, so this slice will be 214 // longer if set. 215 levels []mergingIterLevel 216 // heap holds a slice for the merging iterator heap allocated by 217 // MergingIter.init. The MergingIter will prefer use of its 218 // `manifest.NumLevels+3` items array, so this slice will be longer if set. 219 heap mergingIterHeap 220 // buf is a buffer used to save [start, end) boundary keys. 221 buf []byte 222 } 223 224 // PrepareForReuse discards any excessively large buffers. 225 func (bufs *MergingBuffers) PrepareForReuse() { 226 if cap(bufs.buf) > bufferReuseMaxCapacity { 227 bufs.buf = nil 228 } 229 } 230 231 // MergingIter implements the FragmentIterator interface. 232 var _ FragmentIterator = (*MergingIter)(nil) 233 234 type mergingIterLevel struct { 235 iter FragmentIterator 236 237 // heapKey holds the current key at this level for use within the heap. 238 heapKey boundKey 239 } 240 241 func (l *mergingIterLevel) next() { 242 if l.heapKey.kind == boundKindFragmentStart { 243 l.heapKey = boundKey{ 244 kind: boundKindFragmentEnd, 245 key: l.heapKey.span.End, 246 span: l.heapKey.span, 247 } 248 return 249 } 250 if s := l.iter.Next(); s == nil { 251 l.heapKey = boundKey{kind: boundKindInvalid} 252 } else { 253 l.heapKey = boundKey{ 254 kind: boundKindFragmentStart, 255 key: s.Start, 256 span: s, 257 } 258 } 259 } 260 261 func (l *mergingIterLevel) prev() { 262 if l.heapKey.kind == boundKindFragmentEnd { 263 l.heapKey = boundKey{ 264 kind: boundKindFragmentStart, 265 key: l.heapKey.span.Start, 266 span: l.heapKey.span, 267 } 268 return 269 } 270 if s := l.iter.Prev(); s == nil { 271 l.heapKey = boundKey{kind: boundKindInvalid} 272 } else { 273 l.heapKey = boundKey{ 274 kind: boundKindFragmentEnd, 275 key: s.End, 276 span: s, 277 } 278 } 279 } 280 281 // Init initializes the merging iterator with the provided fragment iterators. 282 func (m *MergingIter) Init( 283 cmp base.Compare, transformer Transformer, bufs *MergingBuffers, iters ...FragmentIterator, 284 ) { 285 *m = MergingIter{ 286 MergingBuffers: bufs, 287 transformer: transformer, 288 } 289 m.heap.cmp = cmp 290 levels, items := m.levels, m.heap.items 291 292 // Invariant: cap(levels) >= cap(items) 293 // Invariant: cap(alloc.levels) == cap(alloc.items) 294 if len(iters) <= len(m.alloc.levels) { 295 // The slices allocated on the MergingIter struct are large enough. 296 m.levels = m.alloc.levels[:len(iters)] 297 m.heap.items = m.alloc.items[:0] 298 } else if len(iters) <= cap(levels) { 299 // The existing heap-allocated slices are large enough, so reuse them. 300 m.levels = levels[:len(iters)] 301 m.heap.items = items[:0] 302 } else { 303 // Heap allocate new slices. 304 m.levels = make([]mergingIterLevel, len(iters)) 305 m.heap.items = make([]mergingIterItem, 0, len(iters)) 306 } 307 for i := range m.levels { 308 m.levels[i] = mergingIterLevel{iter: iters[i]} 309 } 310 } 311 312 // AddLevel adds a new level to the bottom of the merging iterator. AddLevel 313 // must be called after Init and before any other method. 314 func (m *MergingIter) AddLevel(iter FragmentIterator) { 315 m.levels = append(m.levels, mergingIterLevel{iter: iter}) 316 } 317 318 // SeekGE moves the iterator to the first span covering a key greater than 319 // or equal to the given key. This is equivalent to seeking to the first 320 // span with an end key greater than the given key. 321 func (m *MergingIter) SeekGE(key []byte) *Span { 322 m.invalidate() // clear state about current position 323 324 // SeekGE(k) seeks to the first span with an end key greater than the given 325 // key. The merged span M that we're searching for might straddle the seek 326 // `key`. In this case, the M.Start may be a key ≤ the seek key. 327 // 328 // Consider a SeekGE(dog) in the following example. 329 // 330 // i0: b---d e-----h 331 // i1: a---c h-----k 332 // i2: a------------------------------p 333 // merged: a-b-c-d-e-----h-----k----------p 334 // 335 // The merged span M containing 'dog' is [d,e). The 'd' of the merged span 336 // comes from i0's [b,d)'s end boundary. The [b,d) span does not cover any 337 // key >= dog, so we cannot find the span by positioning the child iterators 338 // using a SeekGE(dog). 339 // 340 // Instead, if we take all the child iterators' spans bounds: 341 // a b c d e h k p 342 // We want to partition them into keys ≤ `key` and keys > `key`. 343 // dog 344 // │ 345 // a b c d│e h k p 346 // │ 347 // The largest key on the left of the partition forms the merged span's 348 // start key, and the smallest key on the right of the partition forms the 349 // merged span's end key. Recharacterized: 350 // 351 // M.Start: the largest boundary ≤ k of any child span 352 // M.End: the smallest boundary > k of any child span 353 // 354 // The FragmentIterator interface doesn't implement seeking by all bounds, 355 // it implements seeking by containment. A SeekGE(k) will ensure we observe 356 // all start boundaries ≥ k and all end boundaries > k but does not ensure 357 // we observe end boundaries = k or any boundaries < k. A SeekLT(k) will 358 // ensure we observe all start boundaries < k and all end boundaries ≤ k but 359 // does not ensure we observe any start boundaries = k or any boundaries > 360 // k. This forces us to seek in one direction and step in the other. 361 // 362 // In a SeekGE, we want to end up oriented in the forward direction when 363 // complete, so we begin with searching for M.Start by SeekLT-ing every 364 // child iterator to `k`. For every child span found, we determine the 365 // largest bound ≤ `k` and use it to initialize our max heap. The resulting 366 // root of the max heap is a preliminary value for `M.Start`. 367 for i := range m.levels { 368 l := &m.levels[i] 369 s := l.iter.SeekLT(key) 370 if s == nil { 371 l.heapKey = boundKey{kind: boundKindInvalid} 372 } else if m.cmp(s.End, key) <= 0 { 373 l.heapKey = boundKey{ 374 kind: boundKindFragmentEnd, 375 key: s.End, 376 span: s, 377 } 378 } else { 379 // s.End > key && s.Start < key 380 // We need to use this span's start bound, since that's the largest 381 // bound ≤ key. 382 l.heapKey = boundKey{ 383 kind: boundKindFragmentStart, 384 key: s.Start, 385 span: s, 386 } 387 } 388 } 389 m.initMaxHeap() 390 if m.err != nil { 391 return nil 392 } else if len(m.heap.items) == 0 { 393 // There are no spans covering any key < `key`. There is no span that 394 // straddles the seek key. Reorient the heap into a min heap and return 395 // the first span we find in the forward direction. 396 m.switchToMinHeap() 397 return m.findNextFragmentSet() 398 } 399 400 // The heap root is now the largest boundary key b such that: 401 // 1. b < k 402 // 2. b = k, and b is an end boundary 403 // There's a third case that we will need to consider later, after we've 404 // switched to a min heap: 405 // 3. there exists a start boundary key b such that b = k. 406 // A start boundary key equal to k would not be surfaced when we seeked all 407 // the levels using SeekLT(k), since no key <k would be covered within a 408 // span within an inclusive `k` start boundary. 409 // 410 // Assume that the tightest boundary ≤ k is the current heap root (cases 1 & 411 // 2). After we switch to a min heap, we'll check for the third case and 412 // adjust the start boundary if necessary. 413 m.start = m.heap.items[0].boundKey.key 414 415 // Before switching the direction of the heap, save a copy of the start 416 // boundary if it's the end boundary of some child span. Next-ing the child 417 // iterator might switch files and invalidate the memory of the bound. 418 if m.heap.items[0].boundKey.kind == boundKindFragmentEnd { 419 m.buf = append(m.buf[:0], m.start...) 420 m.start = m.buf 421 } 422 423 // Switch to a min heap. This will move each level to the next bound in 424 // every level, and then establish a min heap. This allows us to obtain the 425 // smallest boundary key > `key`, which will serve as our candidate end 426 // bound. 427 m.switchToMinHeap() 428 if m.err != nil { 429 return nil 430 } else if len(m.heap.items) == 0 { 431 return nil 432 } 433 434 // Check for the case 3 described above. It's possible that when we switch 435 // heap directions, we discover a start boundary of some child span that is 436 // equal to the seek key `key`. In this case, we want this key to be our 437 // start boundary. 438 if m.heap.items[0].boundKey.kind == boundKindFragmentStart && 439 m.cmp(m.heap.items[0].boundKey.key, key) == 0 { 440 // Call findNextFragmentSet, which will set m.start to the heap root and 441 // proceed forward. 442 return m.findNextFragmentSet() 443 } 444 445 m.end = m.heap.items[0].boundKey.key 446 if found, s := m.synthesizeKeys(+1); found && s != nil { 447 return s 448 } 449 return m.findNextFragmentSet() 450 451 } 452 453 // SeekLT moves the iterator to the last span covering a key less than the 454 // given key. This is equivalent to seeking to the last span with a start 455 // key less than the given key. 456 func (m *MergingIter) SeekLT(key []byte) *Span { 457 m.invalidate() // clear state about current position 458 459 // SeekLT(k) seeks to the last span with a start key less than the given 460 // key. The merged span M that we're searching for might straddle the seek 461 // `key`. In this case, the M.End may be a key ≥ the seek key. 462 // 463 // Consider a SeekLT(dog) in the following example. 464 // 465 // i0: b---d e-----h 466 // i1: a---c h-----k 467 // i2: a------------------------------p 468 // merged: a-b-c-d-e-----h-----k----------p 469 // 470 // The merged span M containing the largest key <'dog' is [d,e). The 'e' of 471 // the merged span comes from i0's [e,h)'s start boundary. The [e,h) span 472 // does not cover any key < dog, so we cannot find the span by positioning 473 // the child iterators using a SeekLT(dog). 474 // 475 // Instead, if we take all the child iterators' spans bounds: 476 // a b c d e h k p 477 // We want to partition them into keys < `key` and keys ≥ `key`. 478 // dog 479 // │ 480 // a b c d│e h k p 481 // │ 482 // The largest key on the left of the partition forms the merged span's 483 // start key, and the smallest key on the right of the partition forms the 484 // merged span's end key. Recharacterized: 485 // 486 // M.Start: the largest boundary < k of any child span 487 // M.End: the smallest boundary ≥ k of any child span 488 // 489 // The FragmentIterator interface doesn't implement seeking by all bounds, 490 // it implements seeking by containment. A SeekGE(k) will ensure we observe 491 // all start boundaries ≥ k and all end boundaries > k but does not ensure 492 // we observe end boundaries = k or any boundaries < k. A SeekLT(k) will 493 // ensure we observe all start boundaries < k and all end boundaries ≤ k but 494 // does not ensure we observe any start boundaries = k or any boundaries > 495 // k. This forces us to seek in one direction and step in the other. 496 // 497 // In a SeekLT, we want to end up oriented in the backward direction when 498 // complete, so we begin with searching for M.End by SeekGE-ing every 499 // child iterator to `k`. For every child span found, we determine the 500 // smallest bound ≥ `k` and use it to initialize our min heap. The resulting 501 // root of the min heap is a preliminary value for `M.End`. 502 for i := range m.levels { 503 l := &m.levels[i] 504 s := l.iter.SeekGE(key) 505 if s == nil { 506 l.heapKey = boundKey{kind: boundKindInvalid} 507 } else if m.cmp(s.Start, key) >= 0 { 508 l.heapKey = boundKey{ 509 kind: boundKindFragmentStart, 510 key: s.Start, 511 span: s, 512 } 513 } else { 514 // s.Start < key 515 // We need to use this span's end bound, since that's the smallest 516 // bound > key. 517 l.heapKey = boundKey{ 518 kind: boundKindFragmentEnd, 519 key: s.End, 520 span: s, 521 } 522 } 523 } 524 m.initMinHeap() 525 if m.err != nil { 526 return nil 527 } else if len(m.heap.items) == 0 { 528 // There are no spans covering any key ≥ `key`. There is no span that 529 // straddles the seek key. Reorient the heap into a max heap and return 530 // the first span we find in the reverse direction. 531 m.switchToMaxHeap() 532 return m.findPrevFragmentSet() 533 } 534 535 // The heap root is now the smallest boundary key b such that: 536 // 1. b > k 537 // 2. b = k, and b is a start boundary 538 // There's a third case that we will need to consider later, after we've 539 // switched to a max heap: 540 // 3. there exists an end boundary key b such that b = k. 541 // An end boundary key equal to k would not be surfaced when we seeked all 542 // the levels using SeekGE(k), since k would not be contained within the 543 // exclusive end boundary. 544 // 545 // Assume that the tightest boundary ≥ k is the current heap root (cases 1 & 546 // 2). After we switch to a max heap, we'll check for the third case and 547 // adjust the end boundary if necessary. 548 m.end = m.heap.items[0].boundKey.key 549 550 // Before switching the direction of the heap, save a copy of the end 551 // boundary if it's the start boundary of some child span. Prev-ing the 552 // child iterator might switch files and invalidate the memory of the bound. 553 if m.heap.items[0].boundKey.kind == boundKindFragmentStart { 554 m.buf = append(m.buf[:0], m.end...) 555 m.end = m.buf 556 } 557 558 // Switch to a max heap. This will move each level to the previous bound in 559 // every level, and then establish a max heap. This allows us to obtain the 560 // largest boundary key < `key`, which will serve as our candidate start 561 // bound. 562 m.switchToMaxHeap() 563 if m.err != nil { 564 return nil 565 } else if len(m.heap.items) == 0 { 566 return nil 567 } 568 // Check for the case 3 described above. It's possible that when we switch 569 // heap directions, we discover an end boundary of some child span that is 570 // equal to the seek key `key`. In this case, we want this key to be our end 571 // boundary. 572 if m.heap.items[0].boundKey.kind == boundKindFragmentEnd && 573 m.cmp(m.heap.items[0].boundKey.key, key) == 0 { 574 // Call findPrevFragmentSet, which will set m.end to the heap root and 575 // proceed backwards. 576 return m.findPrevFragmentSet() 577 } 578 579 m.start = m.heap.items[0].boundKey.key 580 if found, s := m.synthesizeKeys(-1); found && s != nil { 581 return s 582 } 583 return m.findPrevFragmentSet() 584 } 585 586 // First seeks the iterator to the first span. 587 func (m *MergingIter) First() *Span { 588 m.invalidate() // clear state about current position 589 for i := range m.levels { 590 if s := m.levels[i].iter.First(); s == nil { 591 m.levels[i].heapKey = boundKey{kind: boundKindInvalid} 592 } else { 593 m.levels[i].heapKey = boundKey{ 594 kind: boundKindFragmentStart, 595 key: s.Start, 596 span: s, 597 } 598 } 599 } 600 m.initMinHeap() 601 return m.findNextFragmentSet() 602 } 603 604 // Last seeks the iterator to the last span. 605 func (m *MergingIter) Last() *Span { 606 m.invalidate() // clear state about current position 607 for i := range m.levels { 608 if s := m.levels[i].iter.Last(); s == nil { 609 m.levels[i].heapKey = boundKey{kind: boundKindInvalid} 610 } else { 611 m.levels[i].heapKey = boundKey{ 612 kind: boundKindFragmentEnd, 613 key: s.End, 614 span: s, 615 } 616 } 617 } 618 m.initMaxHeap() 619 return m.findPrevFragmentSet() 620 } 621 622 // Next advances the iterator to the next span. 623 func (m *MergingIter) Next() *Span { 624 if m.err != nil { 625 return nil 626 } 627 if m.dir == +1 && (m.end == nil || m.start == nil) { 628 return nil 629 } 630 if m.dir != +1 { 631 m.switchToMinHeap() 632 } 633 return m.findNextFragmentSet() 634 } 635 636 // Prev advances the iterator to the previous span. 637 func (m *MergingIter) Prev() *Span { 638 if m.err != nil { 639 return nil 640 } 641 if m.dir == -1 && (m.end == nil || m.start == nil) { 642 return nil 643 } 644 if m.dir != -1 { 645 m.switchToMaxHeap() 646 } 647 return m.findPrevFragmentSet() 648 } 649 650 // Error returns any accumulated error. 651 func (m *MergingIter) Error() error { 652 if m.heap.len() == 0 || m.err != nil { 653 return m.err 654 } 655 return m.levels[m.heap.items[0].index].iter.Error() 656 } 657 658 // Close closes the iterator, releasing all acquired resources. 659 func (m *MergingIter) Close() error { 660 for i := range m.levels { 661 if err := m.levels[i].iter.Close(); err != nil && m.err == nil { 662 m.err = err 663 } 664 } 665 m.levels = nil 666 m.heap.items = m.heap.items[:0] 667 return m.err 668 } 669 670 // String implements fmt.Stringer. 671 func (m *MergingIter) String() string { 672 return "merging-keyspan" 673 } 674 675 func (m *MergingIter) initMinHeap() { 676 m.dir = +1 677 m.heap.reverse = false 678 m.initHeap() 679 } 680 681 func (m *MergingIter) initMaxHeap() { 682 m.dir = -1 683 m.heap.reverse = true 684 m.initHeap() 685 } 686 687 func (m *MergingIter) initHeap() { 688 m.heap.items = m.heap.items[:0] 689 for i := range m.levels { 690 if l := &m.levels[i]; l.heapKey.kind != boundKindInvalid { 691 m.heap.items = append(m.heap.items, mergingIterItem{ 692 index: i, 693 boundKey: &l.heapKey, 694 }) 695 } else { 696 m.err = firstError(m.err, l.iter.Error()) 697 if m.err != nil { 698 return 699 } 700 } 701 } 702 m.heap.init() 703 } 704 705 func (m *MergingIter) switchToMinHeap() { 706 // switchToMinHeap reorients the heap for forward iteration, without moving 707 // the current MergingIter position. 708 709 // The iterator is currently positioned at the span [m.start, m.end), 710 // oriented in the reverse direction, so each level's iterator is positioned 711 // to the largest key ≤ m.start. To reorient in the forward direction, we 712 // must advance each level's iterator to the smallest key ≥ m.end. Consider 713 // this three-level example. 714 // 715 // i0: b---d e-----h 716 // i1: a---c h-----k 717 // i2: a------------------------------p 718 // 719 // merged: a-b-c-d-e-----h-----k----------p 720 // 721 // If currently positioned at the merged span [c,d), then the level 722 // iterators' heap keys are: 723 // 724 // i0: (b, [b, d)) i1: (c, [a,c)) i2: (a, [a,p)) 725 // 726 // Reversing the heap should not move the merging iterator and should not 727 // change the current [m.start, m.end) bounds. It should only prepare for 728 // forward iteration by updating the child iterators' heap keys to: 729 // 730 // i0: (d, [b, d)) i1: (h, [h,k)) i2: (p, [a,p)) 731 // 732 // In every level the first key ≥ m.end is the next in the iterator. 733 // Justification: Suppose not and a level iterator's next key was some key k 734 // such that k < m.end. The max-heap invariant dictates that the current 735 // iterator position is the largest entry with a user key ≥ m.start. This 736 // means k > m.start. We started with the assumption that k < m.end, so 737 // m.start < k < m.end. But then k is between our current span bounds, 738 // and reverse iteration would have constructed the current interval to be 739 // [k, m.end) not [m.start, m.end). 740 741 if invariants.Enabled { 742 for i := range m.levels { 743 l := &m.levels[i] 744 if l.heapKey.kind != boundKindInvalid && m.cmp(l.heapKey.key, m.start) > 0 { 745 panic("pebble: invariant violation: max-heap key > m.start") 746 } 747 } 748 } 749 750 for i := range m.levels { 751 m.levels[i].next() 752 } 753 m.initMinHeap() 754 } 755 756 func (m *MergingIter) switchToMaxHeap() { 757 // switchToMaxHeap reorients the heap for reverse iteration, without moving 758 // the current MergingIter position. 759 760 // The iterator is currently positioned at the span [m.start, m.end), 761 // oriented in the forward direction. Each level's iterator is positioned at 762 // the smallest bound ≥ m.end. To reorient in the reverse direction, we must 763 // move each level's iterator to the largest key ≤ m.start. Consider this 764 // three-level example. 765 // 766 // i0: b---d e-----h 767 // i1: a---c h-----k 768 // i2: a------------------------------p 769 // 770 // merged: a-b-c-d-e-----h-----k----------p 771 // 772 // If currently positioned at the merged span [c,d), then the level 773 // iterators' heap keys are: 774 // 775 // i0: (d, [b, d)) i1: (h, [h,k)) i2: (p, [a,p)) 776 // 777 // Reversing the heap should not move the merging iterator and should not 778 // change the current [m.start, m.end) bounds. It should only prepare for 779 // reverse iteration by updating the child iterators' heap keys to: 780 // 781 // i0: (b, [b, d)) i1: (c, [a,c)) i2: (a, [a,p)) 782 // 783 // In every level the largest key ≤ m.start is the prev in the iterator. 784 // Justification: Suppose not and a level iterator's prev key was some key k 785 // such that k > m.start. The min-heap invariant dictates that the current 786 // iterator position is the smallest entry with a user key ≥ m.end. This 787 // means k < m.end, otherwise the iterator would be positioned at k. We 788 // started with the assumption that k > m.start, so m.start < k < m.end. But 789 // then k is between our current span bounds, and reverse iteration 790 // would have constructed the current interval to be [m.start, k) not 791 // [m.start, m.end). 792 793 if invariants.Enabled { 794 for i := range m.levels { 795 l := &m.levels[i] 796 if l.heapKey.kind != boundKindInvalid && m.cmp(l.heapKey.key, m.end) < 0 { 797 panic("pebble: invariant violation: min-heap key < m.end") 798 } 799 } 800 } 801 802 for i := range m.levels { 803 m.levels[i].prev() 804 } 805 m.initMaxHeap() 806 } 807 808 func (m *MergingIter) cmp(a, b []byte) int { 809 return m.heap.cmp(a, b) 810 } 811 812 func (m *MergingIter) findNextFragmentSet() *Span { 813 // Each iteration of this loop considers a new merged span between unique 814 // user keys. An iteration may find that there exists no overlap for a given 815 // span, (eg, if the spans [a,b), [d, e) exist within level iterators, the 816 // below loop will still consider [b,d) before continuing to [d, e)). It 817 // returns when it finds a span that is covered by at least one key. 818 819 for m.heap.len() > 0 && m.err == nil { 820 // Initialize the next span's start bound. SeekGE and First prepare the 821 // heap without advancing. Next leaves the heap in a state such that the 822 // root is the smallest bound key equal to the returned span's end key, 823 // so the heap is already positioned at the next merged span's start key. 824 825 // NB: m.heapRoot() might be either an end boundary OR a start boundary 826 // of a level's span. Both end and start boundaries may still be a start 827 // key of a span in the set of fragmented spans returned by MergingIter. 828 // Consider the scenario: 829 // a----------l #1 830 // b-----------m #2 831 // 832 // The merged, fully-fragmented spans that MergingIter exposes to the caller 833 // have bounds: 834 // a-b #1 835 // b--------l #1 836 // b--------l #2 837 // l-m #2 838 // 839 // When advancing to l-m#2, we must set m.start to 'l', which originated 840 // from [a,l)#1's end boundary. 841 m.start = m.heap.items[0].boundKey.key 842 843 // Before calling nextEntry, consider whether it might invalidate our 844 // start boundary. If the start boundary key originated from an end 845 // boundary, then we need to copy the start key before advancing the 846 // underlying iterator to the next Span. 847 if m.heap.items[0].boundKey.kind == boundKindFragmentEnd { 848 m.buf = append(m.buf[:0], m.start...) 849 m.start = m.buf 850 } 851 852 // There may be many entries all with the same user key. Spans in other 853 // levels may also start or end at this same user key. For eg: 854 // L1: [a, c) [c, d) 855 // L2: [c, e) 856 // If we're positioned at L1's end(c) end boundary, we want to advance 857 // to the first bound > c. 858 m.nextEntry() 859 for len(m.heap.items) > 0 && m.err == nil && m.cmp(m.heapRoot(), m.start) == 0 { 860 m.nextEntry() 861 } 862 if len(m.heap.items) == 0 || m.err != nil { 863 break 864 } 865 866 // The current entry at the top of the heap is the first key > m.start. 867 // It must become the end bound for the span we will return to the user. 868 // In the above example, the root of the heap is L1's end(d). 869 m.end = m.heap.items[0].boundKey.key 870 871 // Each level within m.levels may have a span that overlaps the 872 // fragmented key span [m.start, m.end). Update m.keys to point to them 873 // and sort them by kind, sequence number. There may not be any keys 874 // defined over [m.start, m.end) if we're between the end of one span 875 // and the start of the next, OR if the configured transform filters any 876 // keys out. We allow empty spans that were emitted by child iterators, but 877 // we elide empty spans created by the mergingIter itself that don't overlap 878 // with any child iterator returned spans (i.e. empty spans that bridge two 879 // distinct child-iterator-defined spans). 880 if found, s := m.synthesizeKeys(+1); found && s != nil { 881 return s 882 } 883 } 884 // Exhausted. 885 m.clear() 886 return nil 887 } 888 889 func (m *MergingIter) findPrevFragmentSet() *Span { 890 // Each iteration of this loop considers a new merged span between unique 891 // user keys. An iteration may find that there exists no overlap for a given 892 // span, (eg, if the spans [a,b), [d, e) exist within level iterators, the 893 // below loop will still consider [b,d) before continuing to [a, b)). It 894 // returns when it finds a span that is covered by at least one key. 895 896 for m.heap.len() > 0 && m.err == nil { 897 // Initialize the next span's end bound. SeekLT and Last prepare the 898 // heap without advancing. Prev leaves the heap in a state such that the 899 // root is the largest bound key equal to the returned span's start key, 900 // so the heap is already positioned at the next merged span's end key. 901 902 // NB: m.heapRoot() might be either an end boundary OR a start boundary 903 // of a level's span. Both end and start boundaries may still be a start 904 // key of a span returned by MergingIter. Consider the scenario: 905 // a----------l #2 906 // b-----------m #1 907 // 908 // The merged, fully-fragmented spans that MergingIter exposes to the caller 909 // have bounds: 910 // a-b #2 911 // b--------l #2 912 // b--------l #1 913 // l-m #1 914 // 915 // When Preving to a-b#2, we must set m.end to 'b', which originated 916 // from [b,m)#1's start boundary. 917 m.end = m.heap.items[0].boundKey.key 918 919 // Before calling prevEntry, consider whether it might invalidate our 920 // end boundary. If the end boundary key originated from a start 921 // boundary, then we need to copy the end key before advancing the 922 // underlying iterator to the previous Span. 923 if m.heap.items[0].boundKey.kind == boundKindFragmentStart { 924 m.buf = append(m.buf[:0], m.end...) 925 m.end = m.buf 926 } 927 928 // There may be many entries all with the same user key. Spans in other 929 // levels may also start or end at this same user key. For eg: 930 // L1: [a, c) [c, d) 931 // L2: [c, e) 932 // If we're positioned at L1's start(c) start boundary, we want to prev 933 // to move to the first bound < c. 934 m.prevEntry() 935 for len(m.heap.items) > 0 && m.err == nil && m.cmp(m.heapRoot(), m.end) == 0 { 936 m.prevEntry() 937 } 938 if len(m.heap.items) == 0 || m.err != nil { 939 break 940 } 941 942 // The current entry at the top of the heap is the first key < m.end. 943 // It must become the start bound for the span we will return to the 944 // user. In the above example, the root of the heap is L1's start(a). 945 m.start = m.heap.items[0].boundKey.key 946 947 // Each level within m.levels may have a set of keys that overlap the 948 // fragmented key span [m.start, m.end). Update m.keys to point to them 949 // and sort them by kind, sequence number. There may not be any keys 950 // spanning [m.start, m.end) if we're between the end of one span and 951 // the start of the next, OR if the configured transform filters any 952 // keys out. We allow empty spans that were emitted by child iterators, but 953 // we elide empty spans created by the mergingIter itself that don't overlap 954 // with any child iterator returned spans (i.e. empty spans that bridge two 955 // distinct child-iterator-defined spans). 956 if found, s := m.synthesizeKeys(-1); found && s != nil { 957 return s 958 } 959 } 960 // Exhausted. 961 m.clear() 962 return nil 963 } 964 965 func (m *MergingIter) heapRoot() []byte { 966 return m.heap.items[0].boundKey.key 967 } 968 969 // synthesizeKeys is called by find{Next,Prev}FragmentSet to populate and 970 // sort the set of keys overlapping [m.start, m.end). 971 // 972 // During forward iteration, if the current heap item is a fragment end, 973 // then the fragment's start must be ≤ m.start and the fragment overlaps the 974 // current iterator position of [m.start, m.end). 975 // 976 // During reverse iteration, if the current heap item is a fragment start, 977 // then the fragment's end must be ≥ m.end and the fragment overlaps the 978 // current iteration position of [m.start, m.end). 979 // 980 // The boolean return value, `found`, is true if the returned span overlaps 981 // with a span returned by a child iterator. 982 func (m *MergingIter) synthesizeKeys(dir int8) (bool, *Span) { 983 if invariants.Enabled { 984 if m.cmp(m.start, m.end) >= 0 { 985 panic(fmt.Sprintf("pebble: invariant violation: span start ≥ end: %s >= %s", m.start, m.end)) 986 } 987 } 988 989 m.keys = m.keys[:0] 990 found := false 991 for i := range m.levels { 992 if dir == +1 && m.levels[i].heapKey.kind == boundKindFragmentEnd || 993 dir == -1 && m.levels[i].heapKey.kind == boundKindFragmentStart { 994 m.keys = append(m.keys, m.levels[i].heapKey.span.Keys...) 995 found = true 996 } 997 } 998 // TODO(jackson): We should be able to remove this sort and instead 999 // guarantee that we'll return keys in the order of the levels they're from. 1000 // With careful iterator construction, this would guarantee that they're 1001 // sorted by trailer descending for the range key iteration use case. 1002 sort.Sort(&m.keys) 1003 1004 // Apply the configured transform. See VisibleTransform. 1005 m.span = Span{ 1006 Start: m.start, 1007 End: m.end, 1008 Keys: m.keys, 1009 KeysOrder: ByTrailerDesc, 1010 } 1011 // NB: m.heap.cmp is a base.Compare, whereas m.cmp is a method on 1012 // MergingIter. 1013 if err := m.transformer.Transform(m.heap.cmp, m.span, &m.span); err != nil { 1014 m.err = err 1015 return false, nil 1016 } 1017 return found, &m.span 1018 } 1019 1020 func (m *MergingIter) invalidate() { 1021 m.err = nil 1022 } 1023 1024 func (m *MergingIter) clear() { 1025 for fi := range m.keys { 1026 m.keys[fi] = Key{} 1027 } 1028 m.keys = m.keys[:0] 1029 } 1030 1031 // nextEntry steps to the next entry. 1032 func (m *MergingIter) nextEntry() { 1033 l := &m.levels[m.heap.items[0].index] 1034 l.next() 1035 if !l.heapKey.valid() { 1036 // l.iter is exhausted. 1037 m.err = l.iter.Error() 1038 if m.err == nil { 1039 m.heap.pop() 1040 } 1041 return 1042 } 1043 1044 if m.heap.len() > 1 { 1045 m.heap.fix(0) 1046 } 1047 } 1048 1049 // prevEntry steps to the previous entry. 1050 func (m *MergingIter) prevEntry() { 1051 l := &m.levels[m.heap.items[0].index] 1052 l.prev() 1053 if !l.heapKey.valid() { 1054 // l.iter is exhausted. 1055 m.err = l.iter.Error() 1056 if m.err == nil { 1057 m.heap.pop() 1058 } 1059 return 1060 } 1061 1062 if m.heap.len() > 1 { 1063 m.heap.fix(0) 1064 } 1065 } 1066 1067 // DebugString returns a string representing the current internal state of the 1068 // merging iterator and its heap for debugging purposes. 1069 func (m *MergingIter) DebugString() string { 1070 var buf bytes.Buffer 1071 fmt.Fprintf(&buf, "Current bounds: [%q, %q)\n", m.start, m.end) 1072 for i := range m.levels { 1073 fmt.Fprintf(&buf, "%d: heap key %s\n", i, m.levels[i].heapKey) 1074 } 1075 return buf.String() 1076 } 1077 1078 type mergingIterItem struct { 1079 // boundKey points to the corresponding mergingIterLevel's `iterKey`. 1080 *boundKey 1081 // index is the index of this level within the MergingIter's levels field. 1082 index int 1083 } 1084 1085 // mergingIterHeap is copied from mergingIterHeap defined in the root pebble 1086 // package for use with point keys. 1087 1088 type mergingIterHeap struct { 1089 cmp base.Compare 1090 reverse bool 1091 items []mergingIterItem 1092 } 1093 1094 func (h *mergingIterHeap) len() int { 1095 return len(h.items) 1096 } 1097 1098 func (h *mergingIterHeap) less(i, j int) bool { 1099 // This key comparison only uses the user key and not the boundKind. Bound 1100 // kind doesn't matter because when stepping over a user key, 1101 // findNextFragmentSet and findPrevFragmentSet skip past all heap items with 1102 // that user key, and makes no assumptions on ordering. All other heap 1103 // examinations only consider the user key. 1104 ik, jk := h.items[i].key, h.items[j].key 1105 c := h.cmp(ik, jk) 1106 if h.reverse { 1107 return c > 0 1108 } 1109 return c < 0 1110 } 1111 1112 func (h *mergingIterHeap) swap(i, j int) { 1113 h.items[i], h.items[j] = h.items[j], h.items[i] 1114 } 1115 1116 // init, fix, up and down are copied from the go stdlib. 1117 func (h *mergingIterHeap) init() { 1118 // heapify 1119 n := h.len() 1120 for i := n/2 - 1; i >= 0; i-- { 1121 h.down(i, n) 1122 } 1123 } 1124 1125 func (h *mergingIterHeap) fix(i int) { 1126 if !h.down(i, h.len()) { 1127 h.up(i) 1128 } 1129 } 1130 1131 func (h *mergingIterHeap) pop() *mergingIterItem { 1132 n := h.len() - 1 1133 h.swap(0, n) 1134 h.down(0, n) 1135 item := &h.items[n] 1136 h.items = h.items[:n] 1137 return item 1138 } 1139 1140 func (h *mergingIterHeap) up(j int) { 1141 for { 1142 i := (j - 1) / 2 // parent 1143 if i == j || !h.less(j, i) { 1144 break 1145 } 1146 h.swap(i, j) 1147 j = i 1148 } 1149 } 1150 1151 func (h *mergingIterHeap) down(i0, n int) bool { 1152 i := i0 1153 for { 1154 j1 := 2*i + 1 1155 if j1 >= n || j1 < 0 { // j1 < 0 after int overflow 1156 break 1157 } 1158 j := j1 // left child 1159 if j2 := j1 + 1; j2 < n && h.less(j2, j1) { 1160 j = j2 // = 2*i + 2 // right child 1161 } 1162 if !h.less(j, i) { 1163 break 1164 } 1165 h.swap(i, j) 1166 i = j 1167 } 1168 return i > i0 1169 } 1170 1171 type boundKind int8 1172 1173 const ( 1174 boundKindInvalid boundKind = iota 1175 boundKindFragmentStart 1176 boundKindFragmentEnd 1177 ) 1178 1179 type boundKey struct { 1180 kind boundKind 1181 key []byte 1182 // span holds the span the bound key comes from. 1183 // 1184 // If kind is boundKindFragmentStart, then key is span.Start. If kind is 1185 // boundKindFragmentEnd, then key is span.End. 1186 span *Span 1187 } 1188 1189 func (k boundKey) valid() bool { 1190 return k.kind != boundKindInvalid 1191 } 1192 1193 func (k boundKey) String() string { 1194 var buf bytes.Buffer 1195 switch k.kind { 1196 case boundKindInvalid: 1197 fmt.Fprint(&buf, "invalid") 1198 case boundKindFragmentStart: 1199 fmt.Fprint(&buf, "fragment-start") 1200 case boundKindFragmentEnd: 1201 fmt.Fprint(&buf, "fragment-end ") 1202 default: 1203 fmt.Fprintf(&buf, "unknown-kind(%d)", k.kind) 1204 } 1205 fmt.Fprintf(&buf, " %s [", k.key) 1206 fmt.Fprintf(&buf, "%s", k.span) 1207 fmt.Fprint(&buf, "]") 1208 return buf.String() 1209 }