github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/level_checker.go (about) 1 // Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package pebble 6 7 import ( 8 "context" 9 "fmt" 10 "io" 11 "sort" 12 13 "github.com/cockroachdb/errors" 14 "github.com/cockroachdb/pebble/internal/base" 15 "github.com/cockroachdb/pebble/internal/keyspan" 16 "github.com/cockroachdb/pebble/internal/manifest" 17 ) 18 19 // This file implements DB.CheckLevels() which checks that every entry in the 20 // DB is consistent with respect to the level invariant: any point (or the 21 // infinite number of points in a range tombstone) has a seqnum such that a 22 // point with the same UserKey at a lower level has a lower seqnum. This is an 23 // expensive check since it involves iterating over all the entries in the DB, 24 // hence only intended for tests or tools. 25 // 26 // If we ignore range tombstones, the consistency checking of points can be 27 // done with a simplified version of mergingIter. simpleMergingIter is that 28 // simplified version of mergingIter that only needs to step through points 29 // (analogous to only doing Next()). It can also easily accommodate 30 // consistency checking of points relative to range tombstones. 31 // simpleMergingIter does not do any seek optimizations present in mergingIter 32 // (it minimally needs to seek the range delete iterators to position them at 33 // or past the current point) since it does not want to miss points for 34 // purposes of consistency checking. 35 // 36 // Mutual consistency of range tombstones is non-trivial to check. One needs 37 // to detect inversions of the form [a, c)#8 at higher level and [b, c)#10 at 38 // a lower level. The start key of the former is not contained in the latter 39 // and we can't use the exclusive end key, c, for a containment check since it 40 // is the sentinel key. We observe that if these tombstones were fragmented 41 // wrt each other we would have [a, b)#8 and [b, c)#8 at the higher level and 42 // [b, c)#10 at the lower level and then it is is trivial to compare the two 43 // [b, c) tombstones. Note that this fragmentation needs to take into account 44 // that tombstones in a file may be untruncated and need to act within the 45 // bounds of the file. This checking is performed by checkRangeTombstones() 46 // and its helper functions. 47 48 // The per-level structure used by simpleMergingIter. 49 type simpleMergingIterLevel struct { 50 iter internalIterator 51 rangeDelIter keyspan.FragmentIterator 52 levelIterBoundaryContext 53 54 iterKey *InternalKey 55 iterValue base.LazyValue 56 tombstone *keyspan.Span 57 } 58 59 type simpleMergingIter struct { 60 levels []simpleMergingIterLevel 61 snapshot uint64 62 heap simpleMergingIterHeap 63 // The last point's key and level. For validation. 64 lastKey InternalKey 65 lastLevel int 66 lastIterMsg string 67 // A non-nil valueMerger means MERGE record processing is ongoing. 68 valueMerger base.ValueMerger 69 // The first error will cause step() to return false. 70 err error 71 numPoints int64 72 merge Merge 73 formatKey base.FormatKey 74 } 75 76 func (m *simpleMergingIter) init( 77 merge Merge, 78 cmp Compare, 79 snapshot uint64, 80 formatKey base.FormatKey, 81 levels ...simpleMergingIterLevel, 82 ) { 83 m.levels = levels 84 m.formatKey = formatKey 85 m.merge = merge 86 m.snapshot = snapshot 87 m.lastLevel = -1 88 m.heap.cmp = cmp 89 m.heap.items = make([]simpleMergingIterItem, 0, len(levels)) 90 for i := range m.levels { 91 l := &m.levels[i] 92 l.iterKey, l.iterValue = l.iter.First() 93 if l.iterKey != nil { 94 item := simpleMergingIterItem{ 95 index: i, 96 value: l.iterValue, 97 } 98 item.key.Trailer = l.iterKey.Trailer 99 item.key.UserKey = append(item.key.UserKey[:0], l.iterKey.UserKey...) 100 m.heap.items = append(m.heap.items, item) 101 } 102 } 103 m.heap.init() 104 105 if m.heap.len() == 0 { 106 return 107 } 108 m.positionRangeDels() 109 } 110 111 // Positions all the rangedel iterators at or past the current top of the 112 // heap, using SeekGE(). 113 func (m *simpleMergingIter) positionRangeDels() { 114 item := &m.heap.items[0] 115 for i := range m.levels { 116 l := &m.levels[i] 117 if l.rangeDelIter == nil { 118 continue 119 } 120 l.tombstone = l.rangeDelIter.SeekGE(item.key.UserKey) 121 } 122 } 123 124 // Returns true if not yet done. 125 func (m *simpleMergingIter) step() bool { 126 if m.heap.len() == 0 || m.err != nil { 127 return false 128 } 129 item := &m.heap.items[0] 130 l := &m.levels[item.index] 131 // Sentinels are not relevant for this point checking. 132 if !item.key.IsExclusiveSentinel() && item.key.Visible(m.snapshot, base.InternalKeySeqNumMax) { 133 m.numPoints++ 134 keyChanged := m.heap.cmp(item.key.UserKey, m.lastKey.UserKey) != 0 135 if !keyChanged { 136 // At the same user key. We will see them in decreasing seqnum 137 // order so the lastLevel must not be lower. 138 if m.lastLevel > item.index { 139 m.err = errors.Errorf("found InternalKey %s in %s and InternalKey %s in %s", 140 item.key.Pretty(m.formatKey), l.iter, m.lastKey.Pretty(m.formatKey), 141 m.lastIterMsg) 142 return false 143 } 144 m.lastLevel = item.index 145 } else { 146 // The user key has changed. 147 m.lastKey.Trailer = item.key.Trailer 148 m.lastKey.UserKey = append(m.lastKey.UserKey[:0], item.key.UserKey...) 149 m.lastLevel = item.index 150 } 151 // Ongoing series of MERGE records ends with a MERGE record. 152 if keyChanged && m.valueMerger != nil { 153 var closer io.Closer 154 _, closer, m.err = m.valueMerger.Finish(true /* includesBase */) 155 if m.err == nil && closer != nil { 156 m.err = closer.Close() 157 } 158 m.valueMerger = nil 159 } 160 itemValue, _, err := item.value.Value(nil) 161 if err != nil { 162 m.err = err 163 return false 164 } 165 if m.valueMerger != nil { 166 // Ongoing series of MERGE records. 167 switch item.key.Kind() { 168 case InternalKeyKindSingleDelete, InternalKeyKindDelete, InternalKeyKindDeleteSized: 169 var closer io.Closer 170 _, closer, m.err = m.valueMerger.Finish(true /* includesBase */) 171 if m.err == nil && closer != nil { 172 m.err = closer.Close() 173 } 174 m.valueMerger = nil 175 case InternalKeyKindSet, InternalKeyKindSetWithDelete: 176 m.err = m.valueMerger.MergeOlder(itemValue) 177 if m.err == nil { 178 var closer io.Closer 179 _, closer, m.err = m.valueMerger.Finish(true /* includesBase */) 180 if m.err == nil && closer != nil { 181 m.err = closer.Close() 182 } 183 } 184 m.valueMerger = nil 185 case InternalKeyKindMerge: 186 m.err = m.valueMerger.MergeOlder(itemValue) 187 default: 188 m.err = errors.Errorf("pebble: invalid internal key kind %s in %s", 189 item.key.Pretty(m.formatKey), 190 l.iter) 191 return false 192 } 193 } else if item.key.Kind() == InternalKeyKindMerge && m.err == nil { 194 // New series of MERGE records. 195 m.valueMerger, m.err = m.merge(item.key.UserKey, itemValue) 196 } 197 if m.err != nil { 198 m.err = errors.Wrapf(m.err, "merge processing error on key %s in %s", 199 item.key.Pretty(m.formatKey), l.iter) 200 return false 201 } 202 // Is this point covered by a tombstone at a lower level? Note that all these 203 // iterators must be positioned at a key > item.key. So the Largest key bound 204 // of the sstable containing the tombstone >= item.key. So the upper limit of 205 // the tombstone cannot be file-bounds-constrained to < item.key. But it is 206 // possible that item.key < smallest key bound of the sstable, in which case 207 // this tombstone should be ignored. 208 for level := item.index + 1; level < len(m.levels); level++ { 209 lvl := &m.levels[level] 210 if lvl.rangeDelIter == nil || lvl.tombstone.Empty() { 211 continue 212 } 213 if (lvl.smallestUserKey == nil || m.heap.cmp(lvl.smallestUserKey, item.key.UserKey) <= 0) && 214 lvl.tombstone.Contains(m.heap.cmp, item.key.UserKey) { 215 if lvl.tombstone.CoversAt(m.snapshot, item.key.SeqNum()) { 216 m.err = errors.Errorf("tombstone %s in %s deletes key %s in %s", 217 lvl.tombstone.Pretty(m.formatKey), lvl.iter, item.key.Pretty(m.formatKey), 218 l.iter) 219 return false 220 } 221 } 222 } 223 } 224 225 // The iterator for the current level may be closed in the following call to 226 // Next(). We save its debug string for potential use after it is closed - 227 // either in this current step() invocation or on the next invocation. 228 m.lastIterMsg = l.iter.String() 229 230 // Step to the next point. 231 if l.iterKey, l.iterValue = l.iter.Next(); l.iterKey != nil { 232 // Check point keys in an sstable are ordered. Although not required, we check 233 // for memtables as well. A subtle check here is that successive sstables of 234 // L1 and higher levels are ordered. This happens when levelIter moves to the 235 // next sstable in the level, in which case item.key is previous sstable's 236 // last point key. 237 if base.InternalCompare(m.heap.cmp, item.key, *l.iterKey) >= 0 { 238 m.err = errors.Errorf("out of order keys %s >= %s in %s", 239 item.key.Pretty(m.formatKey), l.iterKey.Pretty(m.formatKey), l.iter) 240 return false 241 } 242 item.key.Trailer = l.iterKey.Trailer 243 item.key.UserKey = append(item.key.UserKey[:0], l.iterKey.UserKey...) 244 item.value = l.iterValue 245 if m.heap.len() > 1 { 246 m.heap.fix(0) 247 } 248 } else { 249 m.err = l.iter.Close() 250 l.iter = nil 251 m.heap.pop() 252 } 253 if m.err != nil { 254 return false 255 } 256 if m.heap.len() == 0 { 257 // Last record was a MERGE record. 258 if m.valueMerger != nil { 259 var closer io.Closer 260 _, closer, m.err = m.valueMerger.Finish(true /* includesBase */) 261 if m.err == nil && closer != nil { 262 m.err = closer.Close() 263 } 264 if m.err != nil { 265 m.err = errors.Wrapf(m.err, "merge processing error on key %s in %s", 266 item.key.Pretty(m.formatKey), m.lastIterMsg) 267 } 268 m.valueMerger = nil 269 } 270 return false 271 } 272 m.positionRangeDels() 273 return true 274 } 275 276 // Checking that range tombstones are mutually consistent is performed by checkRangeTombstones(). 277 // See the overview comment at the top of the file. 278 // 279 // We do this check as follows: 280 // - For each level that can have untruncated tombstones, compute the atomic compaction 281 // bounds (getAtomicUnitBounds()) and use them to truncate tombstones. 282 // - Now that we have a set of truncated tombstones for each level, put them into one 283 // pool of tombstones along with their level information (addTombstonesFromIter()). 284 // - Collect the start and end user keys from all these tombstones (collectAllUserKey()) and use 285 // them to fragment all the tombstones (fragmentUsingUserKey()). 286 // - Sort tombstones by start key and decreasing seqnum (tombstonesByStartKeyAndSeqnum) -- all 287 // tombstones that have the same start key will have the same end key because they have been 288 // fragmented. 289 // - Iterate and check (iterateAndCheckTombstones()). 290 // Note that this simple approach requires holding all the tombstones across all levels in-memory. 291 // A more sophisticated incremental approach could be devised, if necessary. 292 293 // A tombstone and the corresponding level it was found in. 294 type tombstoneWithLevel struct { 295 keyspan.Span 296 level int 297 // The level in LSM. A -1 means it's a memtable. 298 lsmLevel int 299 fileNum FileNum 300 } 301 302 // For sorting tombstoneWithLevels in increasing order of start UserKey and 303 // for the same start UserKey in decreasing order of seqnum. 304 type tombstonesByStartKeyAndSeqnum struct { 305 cmp Compare 306 buf []tombstoneWithLevel 307 } 308 309 func (v *tombstonesByStartKeyAndSeqnum) Len() int { return len(v.buf) } 310 func (v *tombstonesByStartKeyAndSeqnum) Less(i, j int) bool { 311 less := v.cmp(v.buf[i].Start, v.buf[j].Start) 312 if less == 0 { 313 return v.buf[i].LargestSeqNum() > v.buf[j].LargestSeqNum() 314 } 315 return less < 0 316 } 317 func (v *tombstonesByStartKeyAndSeqnum) Swap(i, j int) { 318 v.buf[i], v.buf[j] = v.buf[j], v.buf[i] 319 } 320 321 func iterateAndCheckTombstones( 322 cmp Compare, formatKey base.FormatKey, tombstones []tombstoneWithLevel, 323 ) error { 324 sortBuf := tombstonesByStartKeyAndSeqnum{ 325 cmp: cmp, 326 buf: tombstones, 327 } 328 sort.Sort(&sortBuf) 329 330 // For a sequence of tombstones that share the same start UserKey, we will 331 // encounter them in non-increasing seqnum order and so should encounter them 332 // in non-decreasing level order. 333 lastTombstone := tombstoneWithLevel{} 334 for _, t := range tombstones { 335 if cmp(lastTombstone.Start, t.Start) == 0 && lastTombstone.level > t.level { 336 return errors.Errorf("encountered tombstone %s in %s"+ 337 " that has a lower seqnum than the same tombstone in %s", 338 t.Span.Pretty(formatKey), levelOrMemtable(t.lsmLevel, t.fileNum), 339 levelOrMemtable(lastTombstone.lsmLevel, lastTombstone.fileNum)) 340 } 341 lastTombstone = t 342 } 343 return nil 344 } 345 346 type checkConfig struct { 347 logger Logger 348 comparer *Comparer 349 readState *readState 350 newIters tableNewIters 351 seqNum uint64 352 stats *CheckLevelsStats 353 merge Merge 354 formatKey base.FormatKey 355 } 356 357 // cmp is shorthand for comparer.Compare. 358 func (c *checkConfig) cmp(a, b []byte) int { return c.comparer.Compare(a, b) } 359 360 func checkRangeTombstones(c *checkConfig) error { 361 var level int 362 var tombstones []tombstoneWithLevel 363 var err error 364 365 memtables := c.readState.memtables 366 for i := len(memtables) - 1; i >= 0; i-- { 367 iter := memtables[i].newRangeDelIter(nil) 368 if iter == nil { 369 continue 370 } 371 if tombstones, err = addTombstonesFromIter(iter, level, -1, 0, tombstones, 372 c.seqNum, c.cmp, c.formatKey, nil); err != nil { 373 return err 374 } 375 level++ 376 } 377 378 current := c.readState.current 379 addTombstonesFromLevel := func(files manifest.LevelIterator, lsmLevel int) error { 380 for f := files.First(); f != nil; f = files.Next() { 381 lf := files.Take() 382 atomicUnit, _ := expandToAtomicUnit(c.cmp, lf.Slice(), true /* disableIsCompacting */) 383 lower, upper := manifest.KeyRange(c.cmp, atomicUnit.Iter()) 384 iterToClose, iter, err := c.newIters( 385 context.Background(), lf.FileMetadata, &IterOptions{level: manifest.Level(lsmLevel)}, internalIterOpts{}) 386 if err != nil { 387 return err 388 } 389 iterToClose.Close() 390 if iter == nil { 391 continue 392 } 393 truncate := func(t keyspan.Span) keyspan.Span { 394 // Same checks as in keyspan.Truncate. 395 if c.cmp(t.Start, lower.UserKey) < 0 { 396 t.Start = lower.UserKey 397 } 398 if c.cmp(t.End, upper.UserKey) > 0 { 399 t.End = upper.UserKey 400 } 401 if c.cmp(t.Start, t.End) >= 0 { 402 // Remove the keys. 403 t.Keys = t.Keys[:0] 404 } 405 return t 406 } 407 if tombstones, err = addTombstonesFromIter(iter, level, lsmLevel, f.FileNum, 408 tombstones, c.seqNum, c.cmp, c.formatKey, truncate); err != nil { 409 return err 410 } 411 } 412 return nil 413 } 414 // Now the levels with untruncated tombsones. 415 for i := len(current.L0SublevelFiles) - 1; i >= 0; i-- { 416 if current.L0SublevelFiles[i].Empty() { 417 continue 418 } 419 err := addTombstonesFromLevel(current.L0SublevelFiles[i].Iter(), 0) 420 if err != nil { 421 return err 422 } 423 level++ 424 } 425 for i := 1; i < len(current.Levels); i++ { 426 if err := addTombstonesFromLevel(current.Levels[i].Iter(), i); err != nil { 427 return err 428 } 429 level++ 430 } 431 if c.stats != nil { 432 c.stats.NumTombstones = len(tombstones) 433 } 434 // We now have truncated tombstones. 435 // Fragment them all. 436 userKeys := collectAllUserKeys(c.cmp, tombstones) 437 tombstones = fragmentUsingUserKeys(c.cmp, tombstones, userKeys) 438 return iterateAndCheckTombstones(c.cmp, c.formatKey, tombstones) 439 } 440 441 func levelOrMemtable(lsmLevel int, fileNum FileNum) string { 442 if lsmLevel == -1 { 443 return "memtable" 444 } 445 return fmt.Sprintf("L%d: fileNum=%s", lsmLevel, fileNum) 446 } 447 448 func addTombstonesFromIter( 449 iter keyspan.FragmentIterator, 450 level int, 451 lsmLevel int, 452 fileNum FileNum, 453 tombstones []tombstoneWithLevel, 454 seqNum uint64, 455 cmp Compare, 456 formatKey base.FormatKey, 457 truncate func(tombstone keyspan.Span) keyspan.Span, 458 ) (_ []tombstoneWithLevel, err error) { 459 defer func() { 460 err = firstError(err, iter.Close()) 461 }() 462 463 var prevTombstone keyspan.Span 464 for tomb := iter.First(); tomb != nil; tomb = iter.Next() { 465 t := tomb.Visible(seqNum) 466 if t.Empty() { 467 continue 468 } 469 t = t.DeepClone() 470 // This is mainly a test for rangeDelV2 formatted blocks which are expected to 471 // be ordered and fragmented on disk. But we anyways check for memtables, 472 // rangeDelV1 as well. 473 if cmp(prevTombstone.End, t.Start) > 0 { 474 return nil, errors.Errorf("unordered or unfragmented range delete tombstones %s, %s in %s", 475 prevTombstone.Pretty(formatKey), t.Pretty(formatKey), levelOrMemtable(lsmLevel, fileNum)) 476 } 477 prevTombstone = t 478 479 // Truncation of a tombstone must happen after checking its ordering, 480 // fragmentation wrt previous tombstone. Since it is possible that after 481 // truncation the tombstone is ordered, fragmented when it originally wasn't. 482 if truncate != nil { 483 t = truncate(t) 484 } 485 if !t.Empty() { 486 tombstones = append(tombstones, tombstoneWithLevel{ 487 Span: t, 488 level: level, 489 lsmLevel: lsmLevel, 490 fileNum: fileNum, 491 }) 492 } 493 } 494 return tombstones, nil 495 } 496 497 type userKeysSort struct { 498 cmp Compare 499 buf [][]byte 500 } 501 502 func (v *userKeysSort) Len() int { return len(v.buf) } 503 func (v *userKeysSort) Less(i, j int) bool { 504 return v.cmp(v.buf[i], v.buf[j]) < 0 505 } 506 func (v *userKeysSort) Swap(i, j int) { 507 v.buf[i], v.buf[j] = v.buf[j], v.buf[i] 508 } 509 func collectAllUserKeys(cmp Compare, tombstones []tombstoneWithLevel) [][]byte { 510 keys := make([][]byte, 0, len(tombstones)*2) 511 for _, t := range tombstones { 512 keys = append(keys, t.Start) 513 keys = append(keys, t.End) 514 } 515 sorter := userKeysSort{ 516 cmp: cmp, 517 buf: keys, 518 } 519 sort.Sort(&sorter) 520 var last, curr int 521 for last, curr = -1, 0; curr < len(keys); curr++ { 522 if last < 0 || cmp(keys[last], keys[curr]) != 0 { 523 last++ 524 keys[last] = keys[curr] 525 } 526 } 527 keys = keys[:last+1] 528 return keys 529 } 530 531 func fragmentUsingUserKeys( 532 cmp Compare, tombstones []tombstoneWithLevel, userKeys [][]byte, 533 ) []tombstoneWithLevel { 534 var buf []tombstoneWithLevel 535 for _, t := range tombstones { 536 // Find the first position with tombstone start < user key 537 i := sort.Search(len(userKeys), func(i int) bool { 538 return cmp(t.Start, userKeys[i]) < 0 539 }) 540 for ; i < len(userKeys); i++ { 541 if cmp(userKeys[i], t.End) >= 0 { 542 break 543 } 544 tPartial := t 545 tPartial.End = userKeys[i] 546 buf = append(buf, tPartial) 547 t.Start = userKeys[i] 548 } 549 buf = append(buf, t) 550 } 551 return buf 552 } 553 554 // CheckLevelsStats provides basic stats on points and tombstones encountered. 555 type CheckLevelsStats struct { 556 NumPoints int64 557 NumTombstones int 558 } 559 560 // CheckLevels checks: 561 // - Every entry in the DB is consistent with the level invariant. See the 562 // comment at the top of the file. 563 // - Point keys in sstables are ordered. 564 // - Range delete tombstones in sstables are ordered and fragmented. 565 // - Successful processing of all MERGE records. 566 func (d *DB) CheckLevels(stats *CheckLevelsStats) error { 567 // Grab and reference the current readState. 568 readState := d.loadReadState() 569 defer readState.unref() 570 571 // Determine the seqnum to read at after grabbing the read state (current and 572 // memtables) above. 573 seqNum := d.mu.versions.visibleSeqNum.Load() 574 575 checkConfig := &checkConfig{ 576 logger: d.opts.Logger, 577 comparer: d.opts.Comparer, 578 readState: readState, 579 newIters: d.newIters, 580 seqNum: seqNum, 581 stats: stats, 582 merge: d.merge, 583 formatKey: d.opts.Comparer.FormatKey, 584 } 585 return checkLevelsInternal(checkConfig) 586 } 587 588 func checkLevelsInternal(c *checkConfig) (err error) { 589 // Phase 1: Use a simpleMergingIter to step through all the points and ensure 590 // that points with the same user key at different levels are not inverted 591 // wrt sequence numbers and the same holds for tombstones that cover points. 592 // To do this, one needs to construct a simpleMergingIter which is similar to 593 // how one constructs a mergingIter. 594 595 // Add mem tables from newest to oldest. 596 var mlevels []simpleMergingIterLevel 597 defer func() { 598 for i := range mlevels { 599 l := &mlevels[i] 600 if l.iter != nil { 601 err = firstError(err, l.iter.Close()) 602 l.iter = nil 603 } 604 if l.rangeDelIter != nil { 605 err = firstError(err, l.rangeDelIter.Close()) 606 l.rangeDelIter = nil 607 } 608 } 609 }() 610 611 memtables := c.readState.memtables 612 for i := len(memtables) - 1; i >= 0; i-- { 613 mem := memtables[i] 614 mlevels = append(mlevels, simpleMergingIterLevel{ 615 iter: mem.newIter(nil), 616 rangeDelIter: mem.newRangeDelIter(nil), 617 }) 618 } 619 620 current := c.readState.current 621 // Determine the final size for mlevels so that there are no more 622 // reallocations. levelIter will hold a pointer to elements in mlevels. 623 start := len(mlevels) 624 for sublevel := len(current.L0SublevelFiles) - 1; sublevel >= 0; sublevel-- { 625 if current.L0SublevelFiles[sublevel].Empty() { 626 continue 627 } 628 mlevels = append(mlevels, simpleMergingIterLevel{}) 629 } 630 for level := 1; level < len(current.Levels); level++ { 631 if current.Levels[level].Empty() { 632 continue 633 } 634 mlevels = append(mlevels, simpleMergingIterLevel{}) 635 } 636 mlevelAlloc := mlevels[start:] 637 // Add L0 files by sublevel. 638 for sublevel := len(current.L0SublevelFiles) - 1; sublevel >= 0; sublevel-- { 639 if current.L0SublevelFiles[sublevel].Empty() { 640 continue 641 } 642 manifestIter := current.L0SublevelFiles[sublevel].Iter() 643 iterOpts := IterOptions{logger: c.logger} 644 li := &levelIter{} 645 li.init(context.Background(), iterOpts, c.comparer, c.newIters, manifestIter, 646 manifest.L0Sublevel(sublevel), internalIterOpts{}) 647 li.initRangeDel(&mlevelAlloc[0].rangeDelIter) 648 li.initBoundaryContext(&mlevelAlloc[0].levelIterBoundaryContext) 649 mlevelAlloc[0].iter = li 650 mlevelAlloc = mlevelAlloc[1:] 651 } 652 for level := 1; level < len(current.Levels); level++ { 653 if current.Levels[level].Empty() { 654 continue 655 } 656 657 iterOpts := IterOptions{logger: c.logger} 658 li := &levelIter{} 659 li.init(context.Background(), iterOpts, c.comparer, c.newIters, 660 current.Levels[level].Iter(), manifest.Level(level), internalIterOpts{}) 661 li.initRangeDel(&mlevelAlloc[0].rangeDelIter) 662 li.initBoundaryContext(&mlevelAlloc[0].levelIterBoundaryContext) 663 mlevelAlloc[0].iter = li 664 mlevelAlloc = mlevelAlloc[1:] 665 } 666 667 mergingIter := &simpleMergingIter{} 668 mergingIter.init(c.merge, c.cmp, c.seqNum, c.formatKey, mlevels...) 669 for cont := mergingIter.step(); cont; cont = mergingIter.step() { 670 } 671 if err := mergingIter.err; err != nil { 672 return err 673 } 674 if c.stats != nil { 675 c.stats.NumPoints = mergingIter.numPoints 676 } 677 678 // Phase 2: Check that the tombstones are mutually consistent. 679 return checkRangeTombstones(c) 680 } 681 682 type simpleMergingIterItem struct { 683 index int 684 key InternalKey 685 value base.LazyValue 686 } 687 688 type simpleMergingIterHeap struct { 689 cmp Compare 690 reverse bool 691 items []simpleMergingIterItem 692 } 693 694 func (h *simpleMergingIterHeap) len() int { 695 return len(h.items) 696 } 697 698 func (h *simpleMergingIterHeap) less(i, j int) bool { 699 ikey, jkey := h.items[i].key, h.items[j].key 700 if c := h.cmp(ikey.UserKey, jkey.UserKey); c != 0 { 701 if h.reverse { 702 return c > 0 703 } 704 return c < 0 705 } 706 if h.reverse { 707 return ikey.Trailer < jkey.Trailer 708 } 709 return ikey.Trailer > jkey.Trailer 710 } 711 712 func (h *simpleMergingIterHeap) swap(i, j int) { 713 h.items[i], h.items[j] = h.items[j], h.items[i] 714 } 715 716 // init, fix, up and down are copied from the go stdlib. 717 func (h *simpleMergingIterHeap) init() { 718 // heapify 719 n := h.len() 720 for i := n/2 - 1; i >= 0; i-- { 721 h.down(i, n) 722 } 723 } 724 725 func (h *simpleMergingIterHeap) fix(i int) { 726 if !h.down(i, h.len()) { 727 h.up(i) 728 } 729 } 730 731 func (h *simpleMergingIterHeap) pop() *simpleMergingIterItem { 732 n := h.len() - 1 733 h.swap(0, n) 734 h.down(0, n) 735 item := &h.items[n] 736 h.items = h.items[:n] 737 return item 738 } 739 740 func (h *simpleMergingIterHeap) up(j int) { 741 for { 742 i := (j - 1) / 2 // parent 743 if i == j || !h.less(j, i) { 744 break 745 } 746 h.swap(i, j) 747 j = i 748 } 749 } 750 751 func (h *simpleMergingIterHeap) down(i0, n int) bool { 752 i := i0 753 for { 754 j1 := 2*i + 1 755 if j1 >= n || j1 < 0 { // j1 < 0 after int overflow 756 break 757 } 758 j := j1 // left child 759 if j2 := j1 + 1; j2 < n && h.less(j2, j1) { 760 j = j2 // = 2*i + 2 // right child 761 } 762 if !h.less(j, i) { 763 break 764 } 765 h.swap(i, j) 766 i = j 767 } 768 return i > i0 769 }