// Copyright 2018 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package bitalostable

import (
	"fmt"
	"runtime/debug"

	"github.com/zuoyebang/bitalostable/internal/base"
	"github.com/zuoyebang/bitalostable/internal/invariants"
	"github.com/zuoyebang/bitalostable/internal/keyspan"
	"github.com/zuoyebang/bitalostable/internal/manifest"
	"github.com/zuoyebang/bitalostable/sstable"
)

// tableNewIters creates a new point and range-del iterator for the given file
// number. If bytesIterated is specified, it is incremented as the given file is
// iterated through.
type tableNewIters func(
	file *manifest.FileMetadata,
	opts *IterOptions,
	internalOpts internalIterOpts,
) (internalIterator, keyspan.FragmentIterator, error)

// internalIterOpts holds the internal iterator options that are passed to
// tableNewIters, alongside the IterOptions, when a table iterator is
// constructed.
type internalIterOpts struct {
	bytesIterated      *uint64
	stats              *base.InternalIteratorStats
	boundLimitedFilter sstable.BoundLimitedBlockPropertyFilter
}

// levelIter provides a merged view of the sstables in a level.
//
// levelIter is used during compaction and as part of the Iterator
// implementation. When used as part of the Iterator implementation, level
// iteration needs to "pause" at sstable boundaries if a range deletion
// tombstone is the source of that boundary. We know if a range tombstone is
// the smallest or largest key in a file because the kind will be
// InternalKeyKindRangeDeletion. If the boundary key is a range deletion
// tombstone, we materialize a fake entry to return from levelIter. This
// prevents mergingIter from advancing past the sstable until the sstable
// contains the smallest (or largest for reverse iteration) key in the merged
// heap. Note that mergingIter treats a range deletion tombstone returned by
// the point iterator as a no-op.
//
// SeekPrefixGE presents the need for a second type of pausing. If an sstable
// iterator returns "not found" for a SeekPrefixGE operation, we don't want to
// advance to the next sstable as the "not found" does not indicate that all of
// the keys in the sstable are less than the search key. Advancing to the next
// sstable would cause us to skip over range tombstones, violating
// correctness. Instead, SeekPrefixGE creates a synthetic boundary key with the
// kind InternalKeyKindRangeDeletion which will be used to pause the levelIter
// at the sstable until the mergingIter is ready to advance past it.
type levelIter struct {
	logger Logger
	cmp    Compare
	split  Split
	// The lower/upper bounds for iteration as specified at creation or the most
	// recent call to SetBounds.
	lower []byte
	upper []byte
	// The iterator options for the currently open table. If
	// tableOpts.{Lower,Upper}Bound are nil, the corresponding iteration boundary
	// does not lie within the table bounds.
	tableOpts IterOptions
	// The LSM level this levelIter is initialized for.
	level manifest.Level
	// The keys to return when iterating past an sstable boundary and that
	// boundary is a range deletion tombstone. The boundary could be smallest
	// (i.e. arrived at with Prev), or largest (arrived at with Next).
	smallestBoundary *InternalKey
	largestBoundary  *InternalKey
	// combinedIterState may be set when a levelIter is used during user
	// iteration. Although levelIter only iterates over point keys, it's also
	// responsible for lazily constructing the combined range & point iterator
	// when it observes a file containing range keys. If the combined iter
	// state's initialized field is true, the iterator is already using combined
	// iterator, OR the iterator is not configured to use combined iteration. If
	// it's false, the levelIter must set the `triggered` and `key` fields when
	// the levelIter passes over a file containing range keys. See the
	// lazyCombinedIter for more details.
	combinedIterState *combinedIterState
	// A synthetic boundary key to return when SeekPrefixGE finds an sstable
	// which doesn't contain the search key, but which does contain range
	// tombstones.
	syntheticBoundary InternalKey
	// The iter for the current file. It is nil under any of the following conditions:
	// - files.Current() == nil
	// - err != nil
	// - some other constraint, like the bounds in opts, caused the file at index to not
	//   be relevant to the iteration.
	iter     internalIterator
	iterFile *fileMetadata
	// filteredIter is an optional interface that may be implemented by internal
	// iterators that perform filtering of keys. When a new file's iterator is
	// opened, it's tested to see if it implements filteredIter. If it does,
	// it's stored here to allow the level iterator to recognize when keys were
	// omitted from iteration results due to filtering. This is important when a
	// file contains range deletions that may delete keys from other files. The
	// levelIter must not advance to the next file until the mergingIter has
	// advanced beyond the file's bounds. See
	// levelIterBoundaryContext.isIgnorableBoundaryKey.
	filteredIter filteredIter
	newIters     tableNewIters
	// When rangeDelIterPtr != nil, the caller requires that *rangeDelIterPtr must
	// point to a range del iterator corresponding to the current file. When this
	// iterator returns nil, *rangeDelIterPtr should also be set to nil. Whenever
	// a non-nil internalIterator is placed in rangeDelIterPtr, a copy is placed
	// in rangeDelIterCopy. This is done for the following special case:
	// when this iterator returns nil because of exceeding the bounds, we don't
	// close iter and *rangeDelIterPtr since we could reuse it in the next seek. But
	// we need to set *rangeDelIterPtr to nil because of the aforementioned contract.
	// This copy is used to revive the *rangeDelIterPtr in the case of reuse.
	rangeDelIterPtr  *keyspan.FragmentIterator
	rangeDelIterCopy keyspan.FragmentIterator
	files            manifest.LevelIterator
	err              error

	// Pointer into this level's entry in `mergingIterLevel::levelIterBoundaryContext`.
	// We populate it with the corresponding bounds for the currently opened file. It is used for
	// two purposes (described for forward iteration. The explanation for backward iteration is
	// similar.)
	// - To limit the optimization that seeks lower-level iterators past keys shadowed by a range
	//   tombstone. Limiting this seek to the file largestUserKey is necessary since
	//   range tombstones are stored untruncated, while they only apply to keys within their
	//   containing file's boundaries. For a detailed example, see comment above `mergingIter`.
	// - To constrain the tombstone to act-within the bounds of the sstable when checking
	//   containment. For forward iteration we need the smallestUserKey.
	//
	// An example is sstable bounds [c#8, g#12] containing a tombstone [b, i)#7.
	// - When doing a SeekGE to user key X, the levelIter is at this sstable because X is either within
	//   the sstable bounds or earlier than the start of the sstable (and there is no sstable in
	//   between at this level). If X >= smallestUserKey, and the tombstone [b, i) contains X,
	//   it is correct to SeekGE the sstables at lower levels to min(g, i) (i.e., min of
	//   largestUserKey, tombstone.End) since any user key preceding min(g, i) must be covered by this
	//   tombstone (since it cannot have a version younger than this tombstone as it is at a lower
	//   level). And even if X = smallestUserKey or equal to the start user key of the tombstone,
	//   if the above conditions are satisfied we know that the internal keys corresponding to X at
	//   lower levels must have a version smaller than that in this file (again because of the level
	//   argument). So we don't need to use sequence numbers for this comparison.
	// - When checking whether this tombstone deletes internal key X we know that the levelIter is at this
	//   sstable so (repeating the above) X.UserKey is either within the sstable bounds or earlier than the
	//   start of the sstable (and there is no sstable in between at this level).
	//   - X is at a lower level. If X.UserKey >= smallestUserKey, and the tombstone contains
	//     X.UserKey, we know X is deleted. This argument also works when X is a user key (we use
	//     it when seeking to test whether a user key is deleted).
	//   - X is at the same level. X must be within the sstable bounds of the tombstone so the
	//     X.UserKey >= smallestUserKey comparison is trivially true. In addition to the tombstone containing
	//     X we need to compare the sequence number of X and the tombstone (we don't need to look
	//     at how this tombstone is truncated to act-within the file bounds, which are InternalKeys,
	//     since X and the tombstone are from the same file).
	//
	// Iterating backwards has one more complication when checking whether a tombstone deletes
	// internal key X at a lower level (the construction we do here also works for a user key X).
	// Consider sstable bounds [c#8, g#InternalRangeDelSentinel] containing a tombstone [b, i)#7.
	// If we are positioned at key g#10 at a lower sstable, the tombstone we will see is [b, i)#7,
	// since the higher sstable is positioned at a key <= g#10. We should not use this tombstone
	// to delete g#10. This requires knowing that the largestUserKey is a range delete sentinel,
	// which we set in a separate bool below.
	//
	// These fields differ from the `*Boundary` fields in a few ways:
	// - `*Boundary` is only populated when the iterator is positioned exactly on the sentinel key.
	// - `*Boundary` can hold either the lower- or upper-bound, depending on the iterator direction.
	// - `*Boundary` is not exposed to the next higher-level iterator, i.e., `mergingIter`.
	boundaryContext *levelIterBoundaryContext

	// internalOpts holds the internal iterator options to pass to the table
	// cache when constructing new table iterators.
	internalOpts internalIterOpts

	// Disable invariant checks even if they are otherwise enabled. Used by tests
	// which construct "impossible" situations (e.g. seeking to a key before the
	// lower bound).
	disableInvariants bool
}

// filteredIter is an additional interface implemented by iterators that may
// skip over point keys during iteration. The sstable.Iterator implements this
// interface.
type filteredIter interface {
	// MaybeFilteredKeys may be called when an iterator is exhausted, indicating
	// whether or not the iterator's last positioning method may have skipped
	// any keys due to low-level filters.
	//
	// When an iterator is configured to use block-property filters, the
	// low-level iterator may skip over blocks or whole sstables of keys.
	// Implementations that implement skipping must implement this interface.
	// Higher-level iterators require it to preserve invariants (eg, a levelIter
	// used in a mergingIter must keep the file's range-del iterator open until
	// the mergingIter has moved past the file's bounds, even if all of the
	// file's point keys were filtered).
	//
	// MaybeFilteredKeys may always return false positives, that is it may
	// return true when no keys were filtered. It should only be called when the
	// iterator is exhausted. It must never return false negatives when the
	// iterator is exhausted.
	MaybeFilteredKeys() bool
}

// levelIter implements the base.InternalIterator interface.
var _ base.InternalIterator = (*levelIter)(nil)

// newLevelIter returns a levelIter. It is permissible to pass a nil split
// parameter if the caller is never going to call SeekPrefixGE.
func newLevelIter(
	opts IterOptions,
	cmp Compare,
	split Split,
	newIters tableNewIters,
	files manifest.LevelIterator,
	level manifest.Level,
	bytesIterated *uint64,
) *levelIter {
	l := &levelIter{}
	l.init(opts, cmp, split, newIters, files, level, internalIterOpts{bytesIterated: bytesIterated})
	return l
}

// init configures the levelIter from opts and the supplied arguments. It
// copies the logger, bounds, and the table-level filter settings out of opts,
// records the comparer, split function, iterator constructor, file iterator,
// level, and internal options, and clears any recorded error and the
// current-file pointer. It does not close or reset an already-open l.iter.
func (l *levelIter) init(
	opts IterOptions,
	cmp Compare,
	split Split,
	newIters tableNewIters,
	files manifest.LevelIterator,
	level manifest.Level,
	internalOpts internalIterOpts,
) {
	l.err = nil
	l.level = level
	l.logger = opts.getLogger()
	l.lower = opts.LowerBound
	l.upper = opts.UpperBound
	// Only the filter-related options are carried over into tableOpts; the
	// per-table Lower/UpperBound are computed for each file by initTableBounds.
	l.tableOpts.TableFilter = opts.TableFilter
	l.tableOpts.PointKeyFilters = opts.PointKeyFilters
	l.tableOpts.UseL6Filters = opts.UseL6Filters
	l.tableOpts.level = l.level
	l.cmp = cmp
	l.split = split
	l.iterFile = nil
	l.newIters = newIters
	l.files = files
	l.internalOpts = internalOpts
}

// initRangeDel registers the location through which the levelIter publishes
// the range-del iterator of the currently loaded file. See the comment on
// levelIter.rangeDelIterPtr for the contract.
func (l *levelIter) initRangeDel(rangeDelIter *keyspan.FragmentIterator) {
	l.rangeDelIterPtr = rangeDelIter
}

// initBoundaryContext registers the levelIterBoundaryContext that the
// levelIter populates with the bounds of the currently opened file. See the
// comment on levelIter.boundaryContext.
func (l *levelIter) initBoundaryContext(context *levelIterBoundaryContext) {
	l.boundaryContext = context
}

// initCombinedIterState registers the combinedIterState used to lazily trigger
// the switch to combined point and range-key iteration. See the comment on
// levelIter.combinedIterState.
func (l *levelIter) initCombinedIterState(state *combinedIterState) {
	l.combinedIterState = state
}

// maybeTriggerCombinedIteration records on l.combinedIterState that a switch
// to combined iteration is required if file contains range keys that overlap
// the iteration bounds and combined iteration has not yet been initialized.
// dir is +1 when iterating forward and -1 when iterating backward; it selects
// which of the file's range-key bounds is recorded as the trigger key. The
// call is a no-op if file is nil, if no combinedIterState is configured, or
// for any other dir value.
func (l *levelIter) maybeTriggerCombinedIteration(file *fileMetadata, dir int) {
	// If we encounter a file that contains range keys, we may need to
	// trigger a switch to combined range-key and point-key iteration,
	// if the *bitalostable.Iterator is configured for it. This switch is done
	// lazily because range keys are intended to be rare, and
	// constructing the range-key iterator substantially adds to the
	// cost of iterator construction and seeking.
	//
	// If l.combinedIterState.initialized is already true, either the
	// iterator is already using combined iteration or the iterator is not
	// configured to observe range keys. Either way, there's nothing to do.
	// If false, trigger the switch to combined iteration, using the
	// file's bounds to seek the range-key iterator appropriately.
	//
	// We only need to trigger combined iteration if the file contains
	// RangeKeySets: if there are only Unsets and Dels, the user will observe no
	// range keys regardless. If this file has table stats available, they'll
	// tell us whether the file has any RangeKeySets. Otherwise, we must
	// fallback to assuming it does if HasRangeKeys=true.
	if file != nil && file.HasRangeKeys && l.combinedIterState != nil && !l.combinedIterState.initialized &&
		(l.upper == nil || l.cmp(file.SmallestRangeKey.UserKey, l.upper) < 0) &&
		(l.lower == nil || l.cmp(file.LargestRangeKey.UserKey, l.lower) > 0) &&
		(!file.StatsValid() || file.Stats.NumRangeKeySets > 0) {
		// The file contains range keys, and we're not using combined iteration yet.
		// Trigger a switch to combined iteration. It's possible that a switch has
		// already been triggered if multiple levels encounter files containing
		// range keys while executing a single mergingIter operation. In this case,
		// we need to compare the existing key recorded to l.combinedIterState.key,
		// adjusting it if our key is smaller (forward iteration) or larger
		// (backward iteration) than the existing key.
		//
		// These key comparisons are only required during a single high-level
		// iterator operation. When the high-level iter op completes,
		// initialized will be true, and future calls to this function will be
		// no-ops.
		switch dir {
		case +1:
			if !l.combinedIterState.triggered {
				l.combinedIterState.triggered = true
				l.combinedIterState.key = file.SmallestRangeKey.UserKey
			} else if l.cmp(l.combinedIterState.key, file.SmallestRangeKey.UserKey) > 0 {
				l.combinedIterState.key = file.SmallestRangeKey.UserKey
			}
		case -1:
			if !l.combinedIterState.triggered {
				l.combinedIterState.triggered = true
				l.combinedIterState.key = file.LargestRangeKey.UserKey
			} else if l.cmp(l.combinedIterState.key, file.LargestRangeKey.UserKey) < 0 {
				l.combinedIterState.key = file.LargestRangeKey.UserKey
			}
		}
	}
}

// findFileGE returns the file in which a forward seek to key should continue,
// or nil if the level iterator is exhausted. Files that hold only range keys,
// files whose largest point key is below key, and files whose largest point
// key is an exclusive sentinel equal to key are stepped over. As a side
// effect it advances l.files and may trigger combined iteration for every
// range-key file it passes.
//
// isRelativeSeek indicates that the seek is relative to the current position;
// together with a not-yet-initialized combinedIterState it makes the search
// start from l.iterFile and step with Next instead of seeking.
func (l *levelIter) findFileGE(key []byte, isRelativeSeek bool) *fileMetadata {
	// Find the earliest file whose largest key is >= key.

	// Ordinarily we seek the LevelIterator using SeekGE.
	//
	// When lazy combined iteration is enabled, there's a complication. The
	// level iterator is responsible for watching for files containing range
	// keys and triggering the switch to combined iteration when such a file is
	// observed. If a range deletion was observed in a higher level causing the
	// merging iterator to seek the level to the range deletion's end key, we
	// need to check whether any of the files between the old position and the
	// new position contain range keys.
	//
	// In this scenario, we don't seek the LevelIterator and instead we Next it,
	// one file at a time, checking each for range keys.
	nextInsteadOfSeek := isRelativeSeek && l.combinedIterState != nil && !l.combinedIterState.initialized

	var m *fileMetadata
	if nextInsteadOfSeek {
		m = l.iterFile
	} else {
		m = l.files.SeekGE(l.cmp, key)
	}
	// The below loop has a bit of an unusual organization. There are several
	// conditions under which we need to Next to a later file. If none of those
	// conditions are met, the file in `m` is okay to return. The loop body is
	// structured with a series of if statements, each of which may continue the
	// loop to the next file. If none of the statements are met, the end of the
	// loop body is a break.
	for m != nil {
		if m.HasRangeKeys {
			l.maybeTriggerCombinedIteration(m, +1)

			// Some files may only contain range keys, which we can skip.
			// NB: HasPointKeys=true if the file contains any points or range
			// deletions (which delete points).
			if !m.HasPointKeys {
				m = l.files.Next()
				continue
			}
		}

		// This file has point keys.
		//
		// However, there are a couple reasons why `m` may not be positioned ≥
		// `key` yet:
		//
		// 1. If SeekGE(key) landed on a file containing range keys, the file
		//    may contain range keys ≥ `key` but no point keys ≥ `key`.
		// 2. When nexting instead of seeking, we must check to see whether
		//    we've nexted sufficiently far, or we need to next again.
		//
		// If the file does not contain point keys ≥ `key`, next to continue
		// looking for a file that does.
		if (m.HasRangeKeys || nextInsteadOfSeek) && l.cmp(m.LargestPointKey.UserKey, key) < 0 {
			m = l.files.Next()
			continue
		}

		// This file has point key bound ≥ `key`. But the largest point key
		// bound may still be a range deletion sentinel, which is exclusive. In
		// this case, the file doesn't actually contain any point keys equal to
		// `key`. We next to keep searching for a file that actually contains
		// point keys ≥ key.
		//
		// Additionally, this prevents loading untruncated range deletions from
		// a table which can't possibly contain the target key and is required
		// for correctness by mergingIter.SeekGE (see the comment in that
		// function).
		if m.LargestPointKey.IsExclusiveSentinel() && l.cmp(m.LargestPointKey.UserKey, key) == 0 {
			m = l.files.Next()
			continue
		}

		// This file contains point keys ≥ `key`. Break and return it.
		break
	}
	return m
}

// findFileLT is the reverse-direction counterpart of findFileGE: it returns
// the file in which a backward seek to key should continue, or nil if the
// level iterator is exhausted. Files that hold only range keys and files whose
// smallest point key is >= key are stepped over. As a side effect it moves
// l.files backward and may trigger combined iteration for every range-key file
// it passes.
//
// isRelativeSeek indicates that the seek is relative to the current position;
// together with a not-yet-initialized combinedIterState it makes the search
// start from l.iterFile and step with Prev instead of seeking.
func (l *levelIter) findFileLT(key []byte, isRelativeSeek bool) *fileMetadata {
	// Find the last file whose smallest key is < key.

	// Ordinarily we seek the LevelIterator using SeekLT.
	//
	// When lazy combined iteration is enabled, there's a complication. The
	// level iterator is responsible for watching for files containing range
	// keys and triggering the switch to combined iteration when such a file is
	// observed. If a range deletion was observed in a higher level causing the
	// merging iterator to seek the level to the range deletion's start key, we
	// need to check whether any of the files between the old position and the
	// new position contain range keys.
	//
	// In this scenario, we don't seek the LevelIterator and instead we Prev it,
	// one file at a time, checking each for range keys.
	prevInsteadOfSeek := isRelativeSeek && l.combinedIterState != nil && !l.combinedIterState.initialized

	var m *fileMetadata
	if prevInsteadOfSeek {
		m = l.iterFile
	} else {
		m = l.files.SeekLT(l.cmp, key)
	}
	// The below loop has a bit of an unusual organization. There are several
	// conditions under which we need to Prev to a previous file. If none of
	// those conditions are met, the file in `m` is okay to return. The loop
	// body is structured with a series of if statements, each of which may
	// continue the loop to the previous file. If none of the statements are
	// met, the end of the loop body is a break.
	for m != nil {
		if m.HasRangeKeys {
			l.maybeTriggerCombinedIteration(m, -1)

			// Some files may only contain range keys, which we can skip.
			// NB: HasPointKeys=true if the file contains any points or range
			// deletions (which delete points).
			if !m.HasPointKeys {
				m = l.files.Prev()
				continue
			}
		}

		// This file has point keys.
		//
		// However, there are a couple reasons why `m` may not be positioned <
		// `key` yet:
		//
		// 1. If SeekLT(key) landed on a file containing range keys, the file
		//    may contain range keys < `key` but no point keys < `key`.
		// 2. When preving instead of seeking, we must check to see whether
		//    we've preved sufficiently far, or we need to prev again.
		//
		// If the file does not contain point keys < `key`, prev to continue
		// looking for a file that does.
		if (m.HasRangeKeys || prevInsteadOfSeek) && l.cmp(m.SmallestPointKey.UserKey, key) >= 0 {
			m = l.files.Prev()
			continue
		}

		// This file contains point keys < `key`. Break and return it.
		break
	}
	return m
}

// Init the iteration bounds for the current table. Returns -1 if the table
// lies fully before the lower bound, +1 if the table lies fully after the
// upper bound, and 0 if the table overlaps the iteration bounds.
//
// On return, l.tableOpts.LowerBound and l.tableOpts.UpperBound hold the
// iteration bounds only when they fall inside the table's point-key range;
// otherwise they are nil so that iteration within the table can skip the
// corresponding bound check.
func (l *levelIter) initTableBounds(f *fileMetadata) int {
	l.tableOpts.LowerBound = l.lower
	if l.tableOpts.LowerBound != nil {
		if l.cmp(f.LargestPointKey.UserKey, l.tableOpts.LowerBound) < 0 {
			// The largest key in the sstable is smaller than the lower bound.
			return -1
		}
		if l.cmp(l.tableOpts.LowerBound, f.SmallestPointKey.UserKey) <= 0 {
			// The lower bound is smaller or equal to the smallest key in the
			// table. Iteration within the table does not need to check the lower
			// bound.
			l.tableOpts.LowerBound = nil
		}
	}
	l.tableOpts.UpperBound = l.upper
	if l.tableOpts.UpperBound != nil {
		if l.cmp(f.SmallestPointKey.UserKey, l.tableOpts.UpperBound) >= 0 {
			// The smallest key in the sstable is greater than or equal to the upper
			// bound.
			return 1
		}
		if l.cmp(l.tableOpts.UpperBound, f.LargestPointKey.UserKey) > 0 {
			// The upper bound is greater than the largest key in the
			// table. Iteration within the table does not need to check the upper
			// bound. NB: tableOpts.UpperBound is exclusive and f.LargestPointKey is
			// inclusive.
			l.tableOpts.UpperBound = nil
		}
	}
	return 0
}

// loadFileReturnIndicator describes the outcome of a call to loadFile.
type loadFileReturnIndicator int8

const (
	// noFileLoaded: there is no usable iterator. The requested file was nil,
	// an error occurred, or every candidate file lay outside the iteration
	// bounds.
	noFileLoaded loadFileReturnIndicator = iota
	// fileAlreadyLoaded: the requested file was already the current file and
	// its open iterator is being reused.
	fileAlreadyLoaded
	// newFileLoaded: a new iterator was opened and stored in l.iter; it has
	// not been positioned yet.
	newFileLoaded
)

// loadFile makes file the current file for iteration in direction dir (+1 for
// forward, -1 for backward) and ensures l.iter iterates over it.
//
// It always clears the smallest/largest boundary keys and the boundary-context
// flags first. If file is already the current file and its iterator is open,
// that iterator is reused (re-publishing the saved range-del iterator) and
// fileAlreadyLoaded is returned. Otherwise the previous iterators are closed
// and candidate files are examined, starting with file and stepping l.files in
// direction dir past files without point keys and files outside the iteration
// bounds, until one can be opened with l.newIters. On success the range-del
// iterator and boundary context are published and newFileLoaded is returned.
// noFileLoaded is returned when no file qualifies or an error occurs; an error
// from newIters is stored in l.err.
func (l *levelIter) loadFile(file *fileMetadata, dir int) loadFileReturnIndicator {
	l.smallestBoundary = nil
	l.largestBoundary = nil
	if l.boundaryContext != nil {
		l.boundaryContext.isSyntheticIterBoundsKey = false
		l.boundaryContext.isIgnorableBoundaryKey = false
	}
	if l.iterFile == file {
		if l.err != nil {
			return noFileLoaded
		}
		if l.iter != nil {
			// We don't bother comparing the file bounds with the iteration bounds when we have
			// an already open iterator. It is possible that the iter may not be relevant given the
			// current iteration bounds, but it knows those bounds, so it will enforce them.
			if l.rangeDelIterPtr != nil {
				*l.rangeDelIterPtr = l.rangeDelIterCopy
			}

			// There are a few reasons we might not have triggered combined
			// iteration yet, even though we already had `file` open.
			// 1. If the bounds changed, we might have previously avoided
			//    switching to combined iteration because the bounds excluded
			//    the range keys contained in this file.
			// 2. If an existing iterator was reconfigured to iterate over range
			//    keys (eg, using SetOptions), then we wouldn't have triggered
			//    the switch to combined iteration yet.
			l.maybeTriggerCombinedIteration(file, dir)
			return fileAlreadyLoaded
		}
		// We were already at file, but don't have an iterator, probably because the file was
		// beyond the iteration bounds. It may still be, but it is also possible that the bounds
		// have changed. We handle that below.
	}

	// Close both iter and rangeDelIterPtr. While mergingIter knows about
	// rangeDelIterPtr, it can't call Close() on it because it does not know
	// when the levelIter will switch it. Note that levelIter.Close() can be
	// called multiple times.
	if err := l.Close(); err != nil {
		return noFileLoaded
	}

	for {
		l.iterFile = file
		if file == nil {
			return noFileLoaded
		}

		l.maybeTriggerCombinedIteration(file, dir)
		if !file.HasPointKeys {
			// The file holds no point keys; step to the neighbouring file in
			// the direction of iteration. Any dir other than +1/-1 falls
			// through to the bounds check below.
			switch dir {
			case +1:
				file = l.files.Next()
				continue
			case -1:
				file = l.files.Prev()
				continue
			}
		}

		switch l.initTableBounds(file) {
		case -1:
			// The largest key in the sstable is smaller than the lower bound.
			if dir < 0 {
				return noFileLoaded
			}
			file = l.files.Next()
			continue
		case +1:
			// The smallest key in the sstable is greater than or equal to the upper
			// bound.
			if dir > 0 {
				return noFileLoaded
			}
			file = l.files.Prev()
			continue
		}

		var rangeDelIter keyspan.FragmentIterator
		var iter internalIterator
		// NOTE(review): newIters is given l.files.Current() rather than file;
		// this presumably relies on callers having obtained file from l.files
		// so that the two agree — confirm.
		iter, rangeDelIter, l.err = l.newIters(l.files.Current(), &l.tableOpts, l.internalOpts)
		l.iter = iter
		if l.err != nil {
			return noFileLoaded
		}
		// filteredIter is only tracked when the file also has a range-del
		// iterator (see the comment on levelIter.filteredIter).
		if rangeDelIter != nil {
			if fi, ok := iter.(filteredIter); ok {
				l.filteredIter = fi
			} else {
				l.filteredIter = nil
			}
		} else {
			l.filteredIter = nil
		}
		if l.rangeDelIterPtr != nil {
			*l.rangeDelIterPtr = rangeDelIter
			l.rangeDelIterCopy = rangeDelIter
		} else if rangeDelIter != nil {
			// Nobody asked for the range-del iterator, so release it
			// immediately. The value returned by Close is not checked here.
			rangeDelIter.Close()
		}
		if l.boundaryContext != nil {
			l.boundaryContext.smallestUserKey = file.Smallest.UserKey
			l.boundaryContext.largestUserKey = file.Largest.UserKey
			l.boundaryContext.isLargestUserKeyRangeDelSentinel = file.Largest.IsExclusiveSentinel()
		}
		return newFileLoaded
	}
}

// In race builds we verify that the keys returned by levelIter lie within
// [lower,upper).
//
// verify returns key and val unchanged. The check itself is gated on
// invariants.Enabled and on l.disableInvariants being false, and reports a
// violation through l.logger.Fatalf.
//
// NOTE(review): the upper-bound check uses `> 0`, so a key whose user key
// equals l.upper is not reported even though the interval above is written as
// half-open — confirm whether that is intended.
func (l *levelIter) verify(key *InternalKey, val []byte) (*InternalKey, []byte) {
	// Note that invariants.Enabled is a compile time constant, which means the
	// block of code will be compiled out of normal builds making this method
	// eligible for inlining. Do not change this to use a variable.
	if invariants.Enabled && !l.disableInvariants && key != nil {
		// We allow returning a boundary key that is outside of the lower/upper
		// bounds as such keys are always range tombstones which will be skipped by
		// the Iterator.
		if l.lower != nil && key != l.smallestBoundary && l.cmp(key.UserKey, l.lower) < 0 {
			l.logger.Fatalf("levelIter %s: lower bound violation: %s < %s\n%s", l.level, key, l.lower, debug.Stack())
		}
		if l.upper != nil && key != l.largestBoundary && l.cmp(key.UserKey, l.upper) > 0 {
			l.logger.Fatalf("levelIter %s: upper bound violation: %s > %s\n%s", l.level, key, l.upper, debug.Stack())
		}
	}
	return key, val
}

// SeekGE loads the file selected by findFileGE for key and forwards the seek
// to that file's iterator. It clears l.err and the boundary-context flags
// first. If no file can be loaded it returns nil, nil. If the file's iterator
// yields no key, the result of skipEmptyFileForward is returned instead.
func (l *levelIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, []byte) {
	l.err = nil
	if l.boundaryContext != nil {
		l.boundaryContext.isSyntheticIterBoundsKey = false
		l.boundaryContext.isIgnorableBoundaryKey = false
	}

	// NB: the top-level Iterator has already adjusted key based on
	// IterOptions.LowerBound.
	loadFileIndicator := l.loadFile(l.findFileGE(key, flags.RelativeSeek()), +1)
	if loadFileIndicator == noFileLoaded {
		return nil, nil
	}
	if loadFileIndicator == newFileLoaded {
		// File changed, so l.iter has changed, and that iterator is not
		// positioned appropriately.
		flags = flags.DisableTrySeekUsingNext()
	}
	if ikey, val := l.iter.SeekGE(key, flags); ikey != nil {
		return l.verify(ikey, val)
	}
	return l.verify(l.skipEmptyFileForward())
}

// SeekPrefixGE loads the file selected by findFileGE for key and forwards the
// prefix seek to that file's iterator. It clears l.err and the
// boundary-context flags first and returns nil, nil if no file can be loaded.
//
// When the file's iterator finds nothing there are three outcomes:
//   - If a range-del iterator is published for the file, a boundary key is
//     returned so that the file stays open: a synthetic key at the table's
//     upper bound when that bound lies within the file, otherwise the file's
//     largest point key flagged as ignorable.
//   - Otherwise, if prefix sorts before the prefix of the file's largest point
//     key, nil, nil is returned without moving to the next file.
//   - Otherwise the result of skipEmptyFileForward is returned.
func (l *levelIter) SeekPrefixGE(
	prefix, key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, []byte) {
	l.err = nil
	if l.boundaryContext != nil {
		l.boundaryContext.isSyntheticIterBoundsKey = false
		l.boundaryContext.isIgnorableBoundaryKey = false
	}

	// NB: the top-level Iterator has already adjusted key based on
	// IterOptions.LowerBound.
	loadFileIndicator := l.loadFile(l.findFileGE(key, flags.RelativeSeek()), +1)
	if loadFileIndicator == noFileLoaded {
		return nil, nil
	}
	if loadFileIndicator == newFileLoaded {
		// File changed, so l.iter has changed, and that iterator is not
		// positioned appropriately.
		flags = flags.DisableTrySeekUsingNext()
	}
	if key, val := l.iter.SeekPrefixGE(prefix, key, flags); key != nil {
		return l.verify(key, val)
	}
	// When SeekPrefixGE returns nil, we have not necessarily reached the end of
	// the sstable. All we know is that a key with prefix does not exist in the
	// current sstable. We do know that the key lies within the bounds of the
	// table as findFileGE found the table where key <= meta.Largest. We return
	// the table's bound with isIgnorableBoundaryKey set.
	if l.rangeDelIterPtr != nil && *l.rangeDelIterPtr != nil {
		if l.tableOpts.UpperBound != nil {
			l.syntheticBoundary.UserKey = l.tableOpts.UpperBound
			l.syntheticBoundary.Trailer = InternalKeyRangeDeleteSentinel
			l.largestBoundary = &l.syntheticBoundary
			if l.boundaryContext != nil {
				l.boundaryContext.isSyntheticIterBoundsKey = true
				l.boundaryContext.isIgnorableBoundaryKey = false
			}
			return l.verify(l.largestBoundary, nil)
		}
		// Return the file's largest bound, ensuring this file stays open until
		// the mergingIter advances beyond the file's bounds. We set
		// isIgnorableBoundaryKey to signal that the actual key returned should
		// be ignored, and does not represent a real key in the database.
		l.largestBoundary = &l.iterFile.LargestPointKey
		if l.boundaryContext != nil {
			l.boundaryContext.isSyntheticIterBoundsKey = false
			l.boundaryContext.isIgnorableBoundaryKey = true
		}
		return l.verify(l.largestBoundary, nil)
	}
	// It is possible that we are here because bloom filter matching failed. In
	// that case it is likely that all keys matching the prefix are wholly
	// within the current file and cannot be in the subsequent file. In that
	// case we don't want to go to the next file, since loading and seeking in
	// there has some cost. Additionally, for sparse key spaces, loading the
	// next file will defeat the optimization for the next SeekPrefixGE that is
	// called with flags.TrySeekUsingNext(), since for sparse key spaces it is
	// likely that the next key will also be contained in the current file.
	if n := l.split(l.iterFile.LargestPointKey.UserKey); l.cmp(prefix, l.iterFile.LargestPointKey.UserKey[:n]) < 0 {
		return nil, nil
	}
	return l.verify(l.skipEmptyFileForward())
}

// SeekLT loads the file selected by findFileLT for key and forwards the seek
// to that file's iterator. It clears l.err and the boundary-context flags
// first. If no file can be loaded it returns nil, nil. If the file's iterator
// yields no key, the result of skipEmptyFileBackward is returned instead.
func (l *levelIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, []byte) {
	l.err = nil
	if l.boundaryContext != nil {
		l.boundaryContext.isSyntheticIterBoundsKey = false
		l.boundaryContext.isIgnorableBoundaryKey = false
	}

	// NB: the top-level Iterator has already adjusted key based on
	// IterOptions.UpperBound.
	if l.loadFile(l.findFileLT(key, flags.RelativeSeek()), -1) == noFileLoaded {
		return nil, nil
	}
	if key, val := l.iter.SeekLT(key, flags); key != nil {
		return l.verify(key, val)
	}
	return l.verify(l.skipEmptyFileBackward())
}

// First loads the first file of the level and calls First on that file's
// iterator. It clears l.err and the boundary-context flags first. If no file
// can be loaded it returns nil, nil. If the file's iterator yields no key, the
// result of skipEmptyFileForward is returned instead.
func (l *levelIter) First() (*InternalKey, []byte) {
	l.err = nil
	if l.boundaryContext != nil {
		l.boundaryContext.isSyntheticIterBoundsKey = false
		l.boundaryContext.isIgnorableBoundaryKey = false
	}

	// NB: the top-level Iterator will call SeekGE if IterOptions.LowerBound is
	// set.
	if l.loadFile(l.files.First(), +1) == noFileLoaded {
		return nil, nil
	}
	if key, val := l.iter.First(); key != nil {
		return l.verify(key, val)
	}
	return l.verify(l.skipEmptyFileForward())
}

// Last loads the last file of the level and calls Last on that file's
// iterator. It clears l.err and the boundary-context flags first. If no file
// can be loaded it returns nil, nil. If the file's iterator yields no key, the
// result of skipEmptyFileBackward is returned instead.
func (l *levelIter) Last() (*InternalKey, []byte) {
	l.err = nil
	if l.boundaryContext != nil {
		l.boundaryContext.isSyntheticIterBoundsKey = false
		l.boundaryContext.isIgnorableBoundaryKey = false
	}

	// NB: the top-level Iterator will call SeekLT if IterOptions.UpperBound is
	// set.
	if l.loadFile(l.files.Last(), -1) == noFileLoaded {
		return nil, nil
	}
	if key, val := l.iter.Last(); key != nil {
		return l.verify(key, val)
	}
	return l.verify(l.skipEmptyFileBackward())
}

// Next advances the iterator in the forward direction. It returns nil, nil
// without doing anything if an error is recorded or no file iterator is open.
//
// If the iterator is currently paused on a largest-boundary key, Next either
// stays put (returning nil, nil and clearing the published range-del
// iterator) when the upper bound lies within the current file, or loads the
// next file and returns its first key. Otherwise it clears any smallest
// boundary and steps the current file's iterator. Whenever a file's iterator
// yields no key, the result of skipEmptyFileForward is returned.
func (l *levelIter) Next() (*InternalKey, []byte) {
	if l.err != nil || l.iter == nil {
		return nil, nil
	}
	if l.boundaryContext != nil {
		l.boundaryContext.isSyntheticIterBoundsKey = false
		l.boundaryContext.isIgnorableBoundaryKey = false
	}

	switch {
	case l.largestBoundary != nil:
		if l.tableOpts.UpperBound != nil {
			// The UpperBound was within this file, so don't load the next
			// file. We leave the largestBoundary unchanged so that subsequent
			// calls to Next() stay at this file. If a Seek/First/Last call is
			// made and this file continues to be relevant, loadFile() will
			// set the largestBoundary to nil.
			if l.rangeDelIterPtr != nil {
				*l.rangeDelIterPtr = nil
			}
			return nil, nil
		}
		// We're stepping past the boundary key, so now we can load the next file.
		if l.loadFile(l.files.Next(), +1) != noFileLoaded {
			if key, val := l.iter.First(); key != nil {
				return l.verify(key, val)
			}
			return l.verify(l.skipEmptyFileForward())
		}
		return nil, nil

	default:
		// Reset the smallest boundary since we're moving away from it.
		l.smallestBoundary = nil
		if key, val := l.iter.Next(); key != nil {
			return l.verify(key, val)
		}
	}
	return l.verify(l.skipEmptyFileForward())
}

func (l *levelIter) Prev() (*InternalKey, []byte) {
	if l.err != nil || l.iter == nil {
		return nil, nil
	}
	if l.boundaryContext != nil {
		l.boundaryContext.isSyntheticIterBoundsKey = false
		l.boundaryContext.isIgnorableBoundaryKey = false
	}

	switch {
	case l.smallestBoundary != nil:
		if l.tableOpts.LowerBound != nil {
			// The LowerBound was within this file, so don't load the previous
			// file.
We leave the smallestBoundary unchanged so that 827 // subsequent calls to Prev() stay at this file. If a 828 // Seek/First/Last call is made and this file continues to be 829 // relevant, loadFile() will set the smallestBoundary to nil. 830 if l.rangeDelIterPtr != nil { 831 *l.rangeDelIterPtr = nil 832 } 833 return nil, nil 834 } 835 // We're stepping past the boundary key, so now we can load the prev file. 836 if l.loadFile(l.files.Prev(), -1) != noFileLoaded { 837 if key, val := l.iter.Last(); key != nil { 838 return l.verify(key, val) 839 } 840 return l.verify(l.skipEmptyFileBackward()) 841 } 842 return nil, nil 843 844 default: 845 // Reset the largest boundary since we're moving away from it. 846 l.largestBoundary = nil 847 if key, val := l.iter.Prev(); key != nil { 848 return l.verify(key, val) 849 } 850 } 851 return l.verify(l.skipEmptyFileBackward()) 852 } 853 854 func (l *levelIter) skipEmptyFileForward() (*InternalKey, []byte) { 855 var key *InternalKey 856 var val []byte 857 // The first iteration of this loop starts with an already exhausted 858 // l.iter. The reason for the exhaustion is either that we iterated to the 859 // end of the sstable, or our iteration was terminated early due to the 860 // presence of an upper-bound or the use of SeekPrefixGE. If 861 // l.rangeDelIterPtr is non-nil, we may need to pretend the iterator is 862 // not exhausted to allow for the merging to finish consuming the 863 // l.rangeDelIterPtr before levelIter switches the rangeDelIter from 864 // under it. This pretense is done by either generating a synthetic 865 // boundary key or returning the largest key of the file, depending on the 866 // exhaustion reason. 867 868 // Subsequent iterations will examine consecutive files such that the first 869 // file that does not have an exhausted iterator causes the code to return 870 // that key, else the behavior described above if there is a corresponding 871 // rangeDelIterPtr. 
872 for ; key == nil; key, val = l.iter.First() { 873 if l.rangeDelIterPtr != nil { 874 // We're being used as part of a mergingIter and we've exhausted the 875 // current sstable. If an upper bound is present and the upper bound lies 876 // within the current sstable, then we will have reached the upper bound 877 // rather than the end of the sstable. We need to return a synthetic 878 // boundary key so that mergingIter can use the range tombstone iterator 879 // until the other levels have reached this boundary. 880 // 881 // It is safe to set the boundary key to the UpperBound user key 882 // with the RANGEDEL sentinel since it is the smallest InternalKey 883 // that matches the exclusive upper bound, and does not represent 884 // a real key. 885 if l.tableOpts.UpperBound != nil { 886 if *l.rangeDelIterPtr != nil { 887 l.syntheticBoundary.UserKey = l.tableOpts.UpperBound 888 l.syntheticBoundary.Trailer = InternalKeyRangeDeleteSentinel 889 l.largestBoundary = &l.syntheticBoundary 890 if l.boundaryContext != nil { 891 l.boundaryContext.isSyntheticIterBoundsKey = true 892 } 893 return l.largestBoundary, nil 894 } 895 // Else there are no range deletions in this sstable. This 896 // helps with performance when many levels are populated with 897 // sstables and most don't have any actual keys within the 898 // bounds. 899 return nil, nil 900 } 901 // If the boundary is a range deletion tombstone, return that key. 902 if l.iterFile.LargestPointKey.Kind() == InternalKeyKindRangeDelete { 903 l.largestBoundary = &l.iterFile.LargestPointKey 904 return l.largestBoundary, nil 905 } 906 // If the last point iterator positioning op might've skipped keys, 907 // it's possible the file's range deletions are still relevant to 908 // other levels. Return the largest boundary as a special ignorable 909 // marker to avoid advancing to the next file. 910 // 911 // The sstable iterator cannot guarantee that keys were skipped. 
A 912 // SeekGE that lands on a index separator k only knows that the 913 // block at the index entry contains keys ≤ k. We can't know whether 914 // there were actually keys between the seek key and the index 915 // separator key. If the block is then excluded due to block 916 // property filters, the iterator does not know whether keys were 917 // actually skipped by the block's exclusion. 918 // 919 // Since MaybeFilteredKeys cannot guarantee that keys were skipped, 920 // it's possible l.iterFile.Largest was already returned. Returning 921 // l.iterFile.Largest again is a violation of the strict 922 // monotonicity normally provided. The mergingIter's heap can 923 // tolerate this repeat key and in this case will keep the level at 924 // the top of the heap and immediately skip the entry, advancing to 925 // the next file. 926 if *l.rangeDelIterPtr != nil && l.filteredIter != nil && 927 l.filteredIter.MaybeFilteredKeys() { 928 l.largestBoundary = &l.iterFile.Largest 929 l.boundaryContext.isIgnorableBoundaryKey = true 930 return l.largestBoundary, nil 931 } 932 } 933 934 // Current file was exhausted. Move to the next file. 935 if l.loadFile(l.files.Next(), +1) == noFileLoaded { 936 return nil, nil 937 } 938 } 939 return key, val 940 } 941 942 func (l *levelIter) skipEmptyFileBackward() (*InternalKey, []byte) { 943 var key *InternalKey 944 var val []byte 945 // The first iteration of this loop starts with an already exhausted 946 // l.iter. The reason for the exhaustion is either that we iterated to the 947 // end of the sstable, or our iteration was terminated early due to the 948 // presence of a lower-bound. If l.rangeDelIterPtr is non-nil, we may need 949 // to pretend the iterator is not exhausted to allow for the merging to 950 // finish consuming the l.rangeDelIterPtr before levelIter switches the 951 // rangeDelIter from under it. 
This pretense is done by either generating 952 // a synthetic boundary key or returning the smallest key of the file, 953 // depending on the exhaustion reason. 954 955 // Subsequent iterations will examine consecutive files such that the first 956 // file that does not have an exhausted iterator causes the code to return 957 // that key, else the behavior described above if there is a corresponding 958 // rangeDelIterPtr. 959 for ; key == nil; key, val = l.iter.Last() { 960 if l.rangeDelIterPtr != nil { 961 // We're being used as part of a mergingIter and we've exhausted the 962 // current sstable. If a lower bound is present and the lower bound lies 963 // within the current sstable, then we will have reached the lower bound 964 // rather than the beginning of the sstable. We need to return a 965 // synthetic boundary key so that mergingIter can use the range tombstone 966 // iterator until the other levels have reached this boundary. 967 // 968 // It is safe to set the boundary key to the LowerBound user key 969 // with the RANGEDEL sentinel since it is the smallest InternalKey 970 // that is within the inclusive lower bound, and does not 971 // represent a real key. 972 if l.tableOpts.LowerBound != nil { 973 if *l.rangeDelIterPtr != nil { 974 l.syntheticBoundary.UserKey = l.tableOpts.LowerBound 975 l.syntheticBoundary.Trailer = InternalKeyRangeDeleteSentinel 976 l.smallestBoundary = &l.syntheticBoundary 977 if l.boundaryContext != nil { 978 l.boundaryContext.isSyntheticIterBoundsKey = true 979 } 980 return l.smallestBoundary, nil 981 } 982 // Else there are no range deletions in this sstable. This 983 // helps with performance when many levels are populated with 984 // sstables and most don't have any actual keys within the 985 // bounds. 986 return nil, nil 987 } 988 // If the boundary is a range deletion tombstone, return that key. 
989 if l.iterFile.SmallestPointKey.Kind() == InternalKeyKindRangeDelete { 990 l.smallestBoundary = &l.iterFile.SmallestPointKey 991 return l.smallestBoundary, nil 992 } 993 // If the last point iterator positioning op skipped keys, it's 994 // possible the file's range deletions are still relevant to other 995 // levels. Return the smallest boundary as a special ignorable key 996 // to avoid advancing to the next file. 997 // 998 // The sstable iterator cannot guarantee that keys were skipped. A 999 // SeekGE that lands on a index separator k only knows that the 1000 // block at the index entry contains keys ≤ k. We can't know whether 1001 // there were actually keys between the seek key and the index 1002 // separator key. If the block is then excluded due to block 1003 // property filters, the iterator does not know whether keys were 1004 // actually skipped by the block's exclusion. 1005 // 1006 // Since MaybeFilteredKeys cannot guarantee that keys were skipped, 1007 // it's possible l.iterFile.Smallest was already returned. Returning 1008 // l.iterFile.Smallest again is a violation of the strict 1009 // monotonicity normally provided. The mergingIter's heap can 1010 // tolerate this repeat key and in this case will keep the level at 1011 // the top of the heap and immediately skip the entry, advancing to 1012 // the next file. 1013 if *l.rangeDelIterPtr != nil && l.filteredIter != nil && l.filteredIter.MaybeFilteredKeys() { 1014 l.smallestBoundary = &l.iterFile.Smallest 1015 l.boundaryContext.isIgnorableBoundaryKey = true 1016 return l.smallestBoundary, nil 1017 } 1018 } 1019 1020 // Current file was exhausted. Move to the previous file. 
1021 if l.loadFile(l.files.Prev(), -1) == noFileLoaded { 1022 return nil, nil 1023 } 1024 } 1025 return key, val 1026 } 1027 1028 func (l *levelIter) Error() error { 1029 if l.err != nil || l.iter == nil { 1030 return l.err 1031 } 1032 return l.iter.Error() 1033 } 1034 1035 func (l *levelIter) Close() error { 1036 if l.iter != nil { 1037 l.err = l.iter.Close() 1038 l.iter = nil 1039 } 1040 if l.rangeDelIterPtr != nil { 1041 if t := l.rangeDelIterCopy; t != nil { 1042 l.err = firstError(l.err, t.Close()) 1043 } 1044 *l.rangeDelIterPtr = nil 1045 l.rangeDelIterCopy = nil 1046 } 1047 return l.err 1048 } 1049 1050 func (l *levelIter) SetBounds(lower, upper []byte) { 1051 l.lower = lower 1052 l.upper = upper 1053 1054 if l.iter == nil { 1055 return 1056 } 1057 1058 // Update tableOpts.{Lower,Upper}Bound in case the new boundaries fall within 1059 // the boundaries of the current table. 1060 if l.initTableBounds(l.iterFile) != 0 { 1061 // The table does not overlap the bounds. Close() will set levelIter.err if 1062 // an error occurs. 1063 _ = l.Close() 1064 return 1065 } 1066 1067 l.iter.SetBounds(l.tableOpts.LowerBound, l.tableOpts.UpperBound) 1068 } 1069 1070 func (l *levelIter) String() string { 1071 if l.iterFile != nil { 1072 return fmt.Sprintf("%s: fileNum=%s", l.level, l.iter.String()) 1073 } 1074 return fmt.Sprintf("%s: fileNum=<nil>", l.level) 1075 } 1076 1077 var _ internalIterator = &levelIter{}