github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/level_iter.go (about) 1 // Copyright 2018 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package pebble 6 7 import ( 8 "context" 9 "fmt" 10 "runtime/debug" 11 12 "github.com/cockroachdb/pebble/internal/base" 13 "github.com/cockroachdb/pebble/internal/invariants" 14 "github.com/cockroachdb/pebble/internal/keyspan" 15 "github.com/cockroachdb/pebble/internal/manifest" 16 "github.com/cockroachdb/pebble/sstable" 17 ) 18 19 // tableNewIters creates a new point and range-del iterator for the given file 20 // number. 21 // 22 // On success, the internalIterator is not-nil and must be closed; the 23 // FragmentIterator can be nil. 24 // TODO(radu): always return a non-nil FragmentIterator. 25 // 26 // On error, the iterators are nil. 27 // 28 // The only (non-test) implementation of tableNewIters is tableCacheContainer.newIters(). 29 type tableNewIters func( 30 ctx context.Context, 31 file *manifest.FileMetadata, 32 opts *IterOptions, 33 internalOpts internalIterOpts, 34 ) (internalIterator, keyspan.FragmentIterator, error) 35 36 // tableNewRangeDelIter takes a tableNewIters and returns a TableNewSpanIter 37 // for the rangedel iterator returned by tableNewIters. 38 func tableNewRangeDelIter(ctx context.Context, newIters tableNewIters) keyspan.TableNewSpanIter { 39 return func(file *manifest.FileMetadata, iterOptions keyspan.SpanIterOptions) (keyspan.FragmentIterator, error) { 40 iter, rangeDelIter, err := newIters(ctx, file, nil, internalIterOpts{}) 41 if iter != nil { 42 _ = iter.Close() 43 } 44 if rangeDelIter == nil { 45 rangeDelIter = emptyKeyspanIter 46 } 47 return rangeDelIter, err 48 } 49 } 50 51 type internalIterOpts struct { 52 bytesIterated *uint64 53 bufferPool *sstable.BufferPool 54 stats *base.InternalIteratorStats 55 boundLimitedFilter sstable.BoundLimitedBlockPropertyFilter 56 } 57 58 // levelIter provides a merged view of the sstables in a level. 59 // 60 // levelIter is used during compaction and as part of the Iterator 61 // implementation. When used as part of the Iterator implementation, level 62 // iteration needs to "pause" at sstable boundaries if a range deletion 63 // tombstone is the source of that boundary. We know if a range tombstone is 64 // the smallest or largest key in a file because the kind will be 65 // InternalKeyKindRangeDeletion. If the boundary key is a range deletion 66 // tombstone, we materialize a fake entry to return from levelIter. This 67 // prevents mergingIter from advancing past the sstable until the sstable 68 // contains the smallest (or largest for reverse iteration) key in the merged 69 // heap. Note that mergingIter treats a range deletion tombstone returned by 70 // the point iterator as a no-op. 71 // 72 // SeekPrefixGE presents the need for a second type of pausing. If an sstable 73 // iterator returns "not found" for a SeekPrefixGE operation, we don't want to 74 // advance to the next sstable as the "not found" does not indicate that all of 75 // the keys in the sstable are less than the search key. Advancing to the next 76 // sstable would cause us to skip over range tombstones, violating 77 // correctness. Instead, SeekPrefixGE creates a synthetic boundary key with the 78 // kind InternalKeyKindRangeDeletion which will be used to pause the levelIter 79 // at the sstable until the mergingIter is ready to advance past it. 80 type levelIter struct { 81 // The context is stored here since (a) iterators are expected to be 82 // short-lived (since they pin sstables), (b) plumbing a context into every 83 // method is very painful, (c) they do not (yet) respect context 84 // cancellation and are only used for tracing. 85 ctx context.Context 86 logger Logger 87 comparer *Comparer 88 cmp Compare 89 split Split 90 // The lower/upper bounds for iteration as specified at creation or the most 91 // recent call to SetBounds. 92 lower []byte 93 upper []byte 94 // The iterator options for the currently open table. If 95 // tableOpts.{Lower,Upper}Bound are nil, the corresponding iteration boundary 96 // does not lie within the table bounds. 97 tableOpts IterOptions 98 // The LSM level this levelIter is initialized for. 99 level manifest.Level 100 // The keys to return when iterating past an sstable boundary and that 101 // boundary is a range deletion tombstone. The boundary could be smallest 102 // (i.e. arrived at with Prev), or largest (arrived at with Next). 103 smallestBoundary *InternalKey 104 largestBoundary *InternalKey 105 // combinedIterState may be set when a levelIter is used during user 106 // iteration. Although levelIter only iterates over point keys, it's also 107 // responsible for lazily constructing the combined range & point iterator 108 // when it observes a file containing range keys. If the combined iter 109 // state's initialized field is true, the iterator is already using combined 110 // iterator, OR the iterator is not configured to use combined iteration. If 111 // it's false, the levelIter must set the `triggered` and `key` fields when 112 // the levelIter passes over a file containing range keys. See the 113 // lazyCombinedIter for more details. 114 combinedIterState *combinedIterState 115 // A synthetic boundary key to return when SeekPrefixGE finds an sstable 116 // which doesn't contain the search key, but which does contain range 117 // tombstones. 118 syntheticBoundary InternalKey 119 // The iter for the current file. It is nil under any of the following conditions: 120 // - files.Current() == nil 121 // - err != nil 122 // - some other constraint, like the bounds in opts, caused the file at index to not 123 // be relevant to the iteration. 124 iter internalIterator 125 // iterFile holds the current file. It is always equal to l.files.Current(). 126 iterFile *fileMetadata 127 // filteredIter is an optional interface that may be implemented by internal 128 // iterators that perform filtering of keys. When a new file's iterator is 129 // opened, it's tested to see if it implements filteredIter. If it does, 130 // it's stored here to allow the level iterator to recognize when keys were 131 // omitted from iteration results due to filtering. This is important when a 132 // file contains range deletions that may delete keys from other files. The 133 // levelIter must not advance to the next file until the mergingIter has 134 // advanced beyond the file's bounds. See 135 // levelIterBoundaryContext.isIgnorableBoundaryKey. 136 filteredIter filteredIter 137 newIters tableNewIters 138 // When rangeDelIterPtr != nil, the caller requires that *rangeDelIterPtr must 139 // point to a range del iterator corresponding to the current file. When this 140 // iterator returns nil, *rangeDelIterPtr should also be set to nil. Whenever 141 // a non-nil internalIterator is placed in rangeDelIterPtr, a copy is placed 142 // in rangeDelIterCopy. This is done for the following special case: 143 // when this iterator returns nil because of exceeding the bounds, we don't 144 // close iter and *rangeDelIterPtr since we could reuse it in the next seek. But 145 // we need to set *rangeDelIterPtr to nil because of the aforementioned contract. 146 // This copy is used to revive the *rangeDelIterPtr in the case of reuse. 147 rangeDelIterPtr *keyspan.FragmentIterator 148 rangeDelIterCopy keyspan.FragmentIterator 149 files manifest.LevelIterator 150 err error 151 152 // Pointer into this level's entry in `mergingIterLevel::levelIterBoundaryContext`. 153 // We populate it with the corresponding bounds for the currently opened file. It is used for 154 // two purposes (described for forward iteration. The explanation for backward iteration is 155 // similar.) 156 // - To limit the optimization that seeks lower-level iterators past keys shadowed by a range 157 // tombstone. Limiting this seek to the file largestUserKey is necessary since 158 // range tombstones are stored untruncated, while they only apply to keys within their 159 // containing file's boundaries. For a detailed example, see comment above `mergingIter`. 160 // - To constrain the tombstone to act-within the bounds of the sstable when checking 161 // containment. For forward iteration we need the smallestUserKey. 162 // 163 // An example is sstable bounds [c#8, g#12] containing a tombstone [b, i)#7. 164 // - When doing a SeekGE to user key X, the levelIter is at this sstable because X is either within 165 // the sstable bounds or earlier than the start of the sstable (and there is no sstable in 166 // between at this level). If X >= smallestUserKey, and the tombstone [b, i) contains X, 167 // it is correct to SeekGE the sstables at lower levels to min(g, i) (i.e., min of 168 // largestUserKey, tombstone.End) since any user key preceding min(g, i) must be covered by this 169 // tombstone (since it cannot have a version younger than this tombstone as it is at a lower 170 // level). And even if X = smallestUserKey or equal to the start user key of the tombstone, 171 // if the above conditions are satisfied we know that the internal keys corresponding to X at 172 // lower levels must have a version smaller than that in this file (again because of the level 173 // argument). So we don't need to use sequence numbers for this comparison. 174 // - When checking whether this tombstone deletes internal key X we know that the levelIter is at this 175 // sstable so (repeating the above) X.UserKey is either within the sstable bounds or earlier than the 176 // start of the sstable (and there is no sstable in between at this level). 177 // - X is at at a lower level. If X.UserKey >= smallestUserKey, and the tombstone contains 178 // X.UserKey, we know X is deleted. This argument also works when X is a user key (we use 179 // it when seeking to test whether a user key is deleted). 180 // - X is at the same level. X must be within the sstable bounds of the tombstone so the 181 // X.UserKey >= smallestUserKey comparison is trivially true. In addition to the tombstone containing 182 // X we need to compare the sequence number of X and the tombstone (we don't need to look 183 // at how this tombstone is truncated to act-within the file bounds, which are InternalKeys, 184 // since X and the tombstone are from the same file). 185 // 186 // Iterating backwards has one more complication when checking whether a tombstone deletes 187 // internal key X at a lower level (the construction we do here also works for a user key X). 188 // Consider sstable bounds [c#8, g#InternalRangeDelSentinel] containing a tombstone [b, i)#7. 189 // If we are positioned at key g#10 at a lower sstable, the tombstone we will see is [b, i)#7, 190 // since the higher sstable is positioned at a key <= g#10. We should not use this tombstone 191 // to delete g#10. This requires knowing that the largestUserKey is a range delete sentinel, 192 // which we set in a separate bool below. 193 // 194 // These fields differs from the `*Boundary` fields in a few ways: 195 // - `*Boundary` is only populated when the iterator is positioned exactly on the sentinel key. 196 // - `*Boundary` can hold either the lower- or upper-bound, depending on the iterator direction. 197 // - `*Boundary` is not exposed to the next higher-level iterator, i.e., `mergingIter`. 198 boundaryContext *levelIterBoundaryContext 199 200 // internalOpts holds the internal iterator options to pass to the table 201 // cache when constructing new table iterators. 202 internalOpts internalIterOpts 203 204 // Scratch space for the obsolete keys filter, when there are no other block 205 // property filters specified. See the performance note where 206 // IterOptions.PointKeyFilters is declared. 207 filtersBuf [1]BlockPropertyFilter 208 209 // Disable invariant checks even if they are otherwise enabled. Used by tests 210 // which construct "impossible" situations (e.g. seeking to a key before the 211 // lower bound). 212 disableInvariants bool 213 } 214 215 // filteredIter is an additional interface implemented by iterators that may 216 // skip over point keys during iteration. The sstable.Iterator implements this 217 // interface. 218 type filteredIter interface { 219 // MaybeFilteredKeys may be called when an iterator is exhausted, indicating 220 // whether or not the iterator's last positioning method may have skipped 221 // any keys due to low-level filters. 222 // 223 // When an iterator is configured to use block-property filters, the 224 // low-level iterator may skip over blocks or whole sstables of keys. 225 // Implementations that implement skipping must implement this interface. 226 // Higher-level iterators require it to preserve invariants (eg, a levelIter 227 // used in a mergingIter must keep the file's range-del iterator open until 228 // the mergingIter has moved past the file's bounds, even if all of the 229 // file's point keys were filtered). 230 // 231 // MaybeFilteredKeys may always return false positives, that is it may 232 // return true when no keys were filtered. It should only be called when the 233 // iterator is exhausted. It must never return false negatives when the 234 // iterator is exhausted. 235 MaybeFilteredKeys() bool 236 } 237 238 // levelIter implements the base.InternalIterator interface. 239 var _ base.InternalIterator = (*levelIter)(nil) 240 241 // newLevelIter returns a levelIter. It is permissible to pass a nil split 242 // parameter if the caller is never going to call SeekPrefixGE. 243 func newLevelIter( 244 ctx context.Context, 245 opts IterOptions, 246 comparer *Comparer, 247 newIters tableNewIters, 248 files manifest.LevelIterator, 249 level manifest.Level, 250 internalOpts internalIterOpts, 251 ) *levelIter { 252 l := &levelIter{} 253 l.init(ctx, opts, comparer, newIters, files, level, internalOpts) 254 return l 255 } 256 257 func (l *levelIter) init( 258 ctx context.Context, 259 opts IterOptions, 260 comparer *Comparer, 261 newIters tableNewIters, 262 files manifest.LevelIterator, 263 level manifest.Level, 264 internalOpts internalIterOpts, 265 ) { 266 l.ctx = ctx 267 l.err = nil 268 l.level = level 269 l.logger = opts.getLogger() 270 l.lower = opts.LowerBound 271 l.upper = opts.UpperBound 272 l.tableOpts.TableFilter = opts.TableFilter 273 l.tableOpts.PointKeyFilters = opts.PointKeyFilters 274 if len(opts.PointKeyFilters) == 0 { 275 l.tableOpts.PointKeyFilters = l.filtersBuf[:0:1] 276 } 277 l.tableOpts.UseL6Filters = opts.UseL6Filters 278 l.tableOpts.CategoryAndQoS = opts.CategoryAndQoS 279 l.tableOpts.level = l.level 280 l.tableOpts.snapshotForHideObsoletePoints = opts.snapshotForHideObsoletePoints 281 l.comparer = comparer 282 l.cmp = comparer.Compare 283 l.split = comparer.Split 284 l.iterFile = nil 285 l.newIters = newIters 286 l.files = files 287 l.internalOpts = internalOpts 288 } 289 290 func (l *levelIter) initRangeDel(rangeDelIter *keyspan.FragmentIterator) { 291 l.rangeDelIterPtr = rangeDelIter 292 } 293 294 func (l *levelIter) initBoundaryContext(context *levelIterBoundaryContext) { 295 l.boundaryContext = context 296 } 297 298 func (l *levelIter) initCombinedIterState(state *combinedIterState) { 299 l.combinedIterState = state 300 } 301 302 func (l *levelIter) maybeTriggerCombinedIteration(file *fileMetadata, dir int) { 303 // If we encounter a file that contains range keys, we may need to 304 // trigger a switch to combined range-key and point-key iteration, 305 // if the *pebble.Iterator is configured for it. This switch is done 306 // lazily because range keys are intended to be rare, and 307 // constructing the range-key iterator substantially adds to the 308 // cost of iterator construction and seeking. 309 // 310 // If l.combinedIterState.initialized is already true, either the 311 // iterator is already using combined iteration or the iterator is not 312 // configured to observe range keys. Either way, there's nothing to do. 313 // If false, trigger the switch to combined iteration, using the the 314 // file's bounds to seek the range-key iterator appropriately. 315 // 316 // We only need to trigger combined iteration if the file contains 317 // RangeKeySets: if there are only Unsets and Dels, the user will observe no 318 // range keys regardless. If this file has table stats available, they'll 319 // tell us whether the file has any RangeKeySets. Otherwise, we must 320 // fallback to assuming it does if HasRangeKeys=true. 321 if file != nil && file.HasRangeKeys && l.combinedIterState != nil && !l.combinedIterState.initialized && 322 (l.upper == nil || l.cmp(file.SmallestRangeKey.UserKey, l.upper) < 0) && 323 (l.lower == nil || l.cmp(file.LargestRangeKey.UserKey, l.lower) > 0) && 324 (!file.StatsValid() || file.Stats.NumRangeKeySets > 0) { 325 // The file contains range keys, and we're not using combined iteration yet. 326 // Trigger a switch to combined iteration. It's possible that a switch has 327 // already been triggered if multiple levels encounter files containing 328 // range keys while executing a single mergingIter operation. In this case, 329 // we need to compare the existing key recorded to l.combinedIterState.key, 330 // adjusting it if our key is smaller (forward iteration) or larger 331 // (backward iteration) than the existing key. 332 // 333 // These key comparisons are only required during a single high-level 334 // iterator operation. When the high-level iter op completes, 335 // iinitialized will be true, and future calls to this function will be 336 // no-ops. 337 switch dir { 338 case +1: 339 if !l.combinedIterState.triggered { 340 l.combinedIterState.triggered = true 341 l.combinedIterState.key = file.SmallestRangeKey.UserKey 342 } else if l.cmp(l.combinedIterState.key, file.SmallestRangeKey.UserKey) > 0 { 343 l.combinedIterState.key = file.SmallestRangeKey.UserKey 344 } 345 case -1: 346 if !l.combinedIterState.triggered { 347 l.combinedIterState.triggered = true 348 l.combinedIterState.key = file.LargestRangeKey.UserKey 349 } else if l.cmp(l.combinedIterState.key, file.LargestRangeKey.UserKey) < 0 { 350 l.combinedIterState.key = file.LargestRangeKey.UserKey 351 } 352 } 353 } 354 } 355 356 func (l *levelIter) findFileGE(key []byte, flags base.SeekGEFlags) *fileMetadata { 357 // Find the earliest file whose largest key is >= key. 358 359 // NB: if flags.TrySeekUsingNext()=true, the levelIter must respect it. If 360 // the levelIter is positioned at the key P, it must return a key ≥ P. If 361 // used within a merging iterator, the merging iterator will depend on the 362 // levelIter only moving forward to maintain heap invariants. 363 364 // Ordinarily we seek the LevelIterator using SeekGE. In some instances, we 365 // Next instead. In other instances, we try Next-ing first, falling back to 366 // seek: 367 // a) flags.TrySeekUsingNext(): The top-level Iterator knows we're seeking 368 // to a key later than the current iterator position. We don't know how 369 // much later the seek key is, so it's possible there are many sstables 370 // between the current position and the seek key. However in most real- 371 // world use cases, the seek key is likely to be nearby. Rather than 372 // performing a log(N) seek through the file metadata, we next a few 373 // times from from our existing location. If we don't find a file whose 374 // largest is >= key within a few nexts, we fall back to seeking. 375 // 376 // Note that in this case, the file returned by findFileGE may be 377 // different than the file returned by a raw binary search (eg, when 378 // TrySeekUsingNext=false). This is possible because the most recent 379 // positioning operation may have already determined that previous 380 // files' keys that are ≥ key are all deleted. This information is 381 // encoded within the iterator's current iterator position and is 382 // unavailable to a fresh binary search. 383 // 384 // b) flags.RelativeSeek(): The merging iterator decided to re-seek this 385 // level according to a range tombstone. When lazy combined iteration 386 // is enabled, the level iterator is responsible for watching for 387 // files containing range keys and triggering the switch to combined 388 // iteration when such a file is observed. If a range deletion was 389 // observed in a higher level causing the merging iterator to seek the 390 // level to the range deletion's end key, we need to check whether all 391 // of the files between the old position and the new position contain 392 // any range keys. 393 // 394 // In this scenario, we don't seek the LevelIterator and instead we 395 // Next it, one file at a time, checking each for range keys. The 396 // merging iterator sets this flag to inform us that we're moving 397 // forward relative to the existing position and that we must examine 398 // each intermediate sstable's metadata for lazy-combined iteration. 399 // In this case, we only Next and never Seek. We set nextsUntilSeek=-1 400 // to signal this intention. 401 // 402 // NB: At most one of flags.RelativeSeek() and flags.TrySeekUsingNext() may 403 // be set, because the merging iterator re-seeks relative seeks with 404 // explicitly only the RelativeSeek flag set. 405 var nextsUntilSeek int 406 var nextInsteadOfSeek bool 407 if flags.TrySeekUsingNext() { 408 nextInsteadOfSeek = true 409 nextsUntilSeek = 4 // arbitrary 410 } 411 if flags.RelativeSeek() && l.combinedIterState != nil && !l.combinedIterState.initialized { 412 nextInsteadOfSeek = true 413 nextsUntilSeek = -1 414 } 415 416 var m *fileMetadata 417 if nextInsteadOfSeek { 418 m = l.iterFile 419 } else { 420 m = l.files.SeekGE(l.cmp, key) 421 } 422 // The below loop has a bit of an unusual organization. There are several 423 // conditions under which we need to Next to a later file. If none of those 424 // conditions are met, the file in `m` is okay to return. The loop body is 425 // structured with a series of if statements, each of which may continue the 426 // loop to the next file. If none of the statements are met, the end of the 427 // loop body is a break. 428 for m != nil { 429 if m.HasRangeKeys { 430 l.maybeTriggerCombinedIteration(m, +1) 431 432 // Some files may only contain range keys, which we can skip. 433 // NB: HasPointKeys=true if the file contains any points or range 434 // deletions (which delete points). 435 if !m.HasPointKeys { 436 m = l.files.Next() 437 continue 438 } 439 } 440 441 // This file has point keys. 442 // 443 // However, there are a couple reasons why `m` may not be positioned ≥ 444 // `key` yet: 445 // 446 // 1. If SeekGE(key) landed on a file containing range keys, the file 447 // may contain range keys ≥ `key` but no point keys ≥ `key`. 448 // 2. When nexting instead of seeking, we must check to see whether 449 // we've nexted sufficiently far, or we need to next again. 450 // 451 // If the file does not contain point keys ≥ `key`, next to continue 452 // looking for a file that does. 453 if (m.HasRangeKeys || nextInsteadOfSeek) && l.cmp(m.LargestPointKey.UserKey, key) < 0 { 454 // If nextInsteadOfSeek is set and nextsUntilSeek is non-negative, 455 // the iterator has been nexting hoping to discover the relevant 456 // file without seeking. It's exhausted the allotted nextsUntilSeek 457 // and should seek to the sought key. 458 if nextInsteadOfSeek && nextsUntilSeek == 0 { 459 nextInsteadOfSeek = false 460 m = l.files.SeekGE(l.cmp, key) 461 continue 462 } else if nextsUntilSeek > 0 { 463 nextsUntilSeek-- 464 } 465 m = l.files.Next() 466 continue 467 } 468 469 // This file has a point key bound ≥ `key`. But the largest point key 470 // bound may still be a range deletion sentinel, which is exclusive. In 471 // this case, the file doesn't actually contain any point keys equal to 472 // `key`. We next to keep searching for a file that actually contains 473 // point keys ≥ key. 474 // 475 // Additionally, this prevents loading untruncated range deletions from 476 // a table which can't possibly contain the target key and is required 477 // for correctness by mergingIter.SeekGE (see the comment in that 478 // function). 479 if m.LargestPointKey.IsExclusiveSentinel() && l.cmp(m.LargestPointKey.UserKey, key) == 0 { 480 m = l.files.Next() 481 continue 482 } 483 484 // This file contains point keys ≥ `key`. Break and return it. 485 break 486 } 487 return m 488 } 489 490 func (l *levelIter) findFileLT(key []byte, flags base.SeekLTFlags) *fileMetadata { 491 // Find the last file whose smallest key is < ikey. 492 493 // Ordinarily we seek the LevelIterator using SeekLT. 494 // 495 // When lazy combined iteration is enabled, there's a complication. The 496 // level iterator is responsible for watching for files containing range 497 // keys and triggering the switch to combined iteration when such a file is 498 // observed. If a range deletion was observed in a higher level causing the 499 // merging iterator to seek the level to the range deletion's start key, we 500 // need to check whether all of the files between the old position and the 501 // new position contain any range keys. 502 // 503 // In this scenario, we don't seek the LevelIterator and instead we Prev it, 504 // one file at a time, checking each for range keys. 505 prevInsteadOfSeek := flags.RelativeSeek() && l.combinedIterState != nil && !l.combinedIterState.initialized 506 507 var m *fileMetadata 508 if prevInsteadOfSeek { 509 m = l.iterFile 510 } else { 511 m = l.files.SeekLT(l.cmp, key) 512 } 513 // The below loop has a bit of an unusual organization. There are several 514 // conditions under which we need to Prev to a previous file. If none of 515 // those conditions are met, the file in `m` is okay to return. The loop 516 // body is structured with a series of if statements, each of which may 517 // continue the loop to the previous file. If none of the statements are 518 // met, the end of the loop body is a break. 519 for m != nil { 520 if m.HasRangeKeys { 521 l.maybeTriggerCombinedIteration(m, -1) 522 523 // Some files may only contain range keys, which we can skip. 524 // NB: HasPointKeys=true if the file contains any points or range 525 // deletions (which delete points). 526 if !m.HasPointKeys { 527 m = l.files.Prev() 528 continue 529 } 530 } 531 532 // This file has point keys. 533 // 534 // However, there are a couple reasons why `m` may not be positioned < 535 // `key` yet: 536 // 537 // 1. If SeekLT(key) landed on a file containing range keys, the file 538 // may contain range keys < `key` but no point keys < `key`. 539 // 2. When preving instead of seeking, we must check to see whether 540 // we've preved sufficiently far, or we need to prev again. 541 // 542 // If the file does not contain point keys < `key`, prev to continue 543 // looking for a file that does. 544 if (m.HasRangeKeys || prevInsteadOfSeek) && l.cmp(m.SmallestPointKey.UserKey, key) >= 0 { 545 m = l.files.Prev() 546 continue 547 } 548 549 // This file contains point keys < `key`. Break and return it. 550 break 551 } 552 return m 553 } 554 555 // Init the iteration bounds for the current table. Returns -1 if the table 556 // lies fully before the lower bound, +1 if the table lies fully after the 557 // upper bound, and 0 if the table overlaps the iteration bounds. 558 func (l *levelIter) initTableBounds(f *fileMetadata) int { 559 l.tableOpts.LowerBound = l.lower 560 if l.tableOpts.LowerBound != nil { 561 if l.cmp(f.LargestPointKey.UserKey, l.tableOpts.LowerBound) < 0 { 562 // The largest key in the sstable is smaller than the lower bound. 563 return -1 564 } 565 if l.cmp(l.tableOpts.LowerBound, f.SmallestPointKey.UserKey) <= 0 { 566 // The lower bound is smaller or equal to the smallest key in the 567 // table. Iteration within the table does not need to check the lower 568 // bound. 569 l.tableOpts.LowerBound = nil 570 } 571 } 572 l.tableOpts.UpperBound = l.upper 573 if l.tableOpts.UpperBound != nil { 574 if l.cmp(f.SmallestPointKey.UserKey, l.tableOpts.UpperBound) >= 0 { 575 // The smallest key in the sstable is greater than or equal to the upper 576 // bound. 577 return 1 578 } 579 if l.cmp(l.tableOpts.UpperBound, f.LargestPointKey.UserKey) > 0 { 580 // The upper bound is greater than the largest key in the 581 // table. Iteration within the table does not need to check the upper 582 // bound. NB: tableOpts.UpperBound is exclusive and f.LargestPointKey is 583 // inclusive. 584 l.tableOpts.UpperBound = nil 585 } 586 } 587 return 0 588 } 589 590 type loadFileReturnIndicator int8 591 592 const ( 593 noFileLoaded loadFileReturnIndicator = iota 594 fileAlreadyLoaded 595 newFileLoaded 596 ) 597 598 func (l *levelIter) loadFile(file *fileMetadata, dir int) loadFileReturnIndicator { 599 l.smallestBoundary = nil 600 l.largestBoundary = nil 601 if l.boundaryContext != nil { 602 l.boundaryContext.isSyntheticIterBoundsKey = false 603 l.boundaryContext.isIgnorableBoundaryKey = false 604 } 605 if l.iterFile == file { 606 if l.err != nil { 607 return noFileLoaded 608 } 609 if l.iter != nil { 610 // We don't bother comparing the file bounds with the iteration bounds when we have 611 // an already open iterator. It is possible that the iter may not be relevant given the 612 // current iteration bounds, but it knows those bounds, so it will enforce them. 613 if l.rangeDelIterPtr != nil { 614 *l.rangeDelIterPtr = l.rangeDelIterCopy 615 } 616 617 // There are a few reasons we might not have triggered combined 618 // iteration yet, even though we already had `file` open. 619 // 1. If the bounds changed, we might have previously avoided 620 // switching to combined iteration because the bounds excluded 621 // the range keys contained in this file. 622 // 2. If an existing iterator was reconfigured to iterate over range 623 // keys (eg, using SetOptions), then we wouldn't have triggered 624 // the switch to combined iteration yet. 625 l.maybeTriggerCombinedIteration(file, dir) 626 return fileAlreadyLoaded 627 } 628 // We were already at file, but don't have an iterator, probably because the file was 629 // beyond the iteration bounds. It may still be, but it is also possible that the bounds 630 // have changed. We handle that below. 631 } 632 633 // Close both iter and rangeDelIterPtr. While mergingIter knows about 634 // rangeDelIterPtr, it can't call Close() on it because it does not know 635 // when the levelIter will switch it. Note that levelIter.Close() can be 636 // called multiple times. 637 if err := l.Close(); err != nil { 638 return noFileLoaded 639 } 640 641 for { 642 l.iterFile = file 643 if file == nil { 644 return noFileLoaded 645 } 646 647 l.maybeTriggerCombinedIteration(file, dir) 648 if !file.HasPointKeys { 649 switch dir { 650 case +1: 651 file = l.files.Next() 652 continue 653 case -1: 654 file = l.files.Prev() 655 continue 656 } 657 } 658 659 switch l.initTableBounds(file) { 660 case -1: 661 // The largest key in the sstable is smaller than the lower bound. 662 if dir < 0 { 663 return noFileLoaded 664 } 665 file = l.files.Next() 666 continue 667 case +1: 668 // The smallest key in the sstable is greater than or equal to the upper 669 // bound. 670 if dir > 0 { 671 return noFileLoaded 672 } 673 file = l.files.Prev() 674 continue 675 } 676 677 var rangeDelIter keyspan.FragmentIterator 678 var iter internalIterator 679 iter, rangeDelIter, l.err = l.newIters(l.ctx, l.iterFile, &l.tableOpts, l.internalOpts) 680 l.iter = iter 681 if l.err != nil { 682 return noFileLoaded 683 } 684 if rangeDelIter != nil { 685 if fi, ok := iter.(filteredIter); ok { 686 l.filteredIter = fi 687 } else { 688 l.filteredIter = nil 689 } 690 } else { 691 l.filteredIter = nil 692 } 693 if l.rangeDelIterPtr != nil { 694 *l.rangeDelIterPtr = rangeDelIter 695 l.rangeDelIterCopy = rangeDelIter 696 } else if rangeDelIter != nil { 697 rangeDelIter.Close() 698 } 699 if l.boundaryContext != nil { 700 l.boundaryContext.smallestUserKey = file.Smallest.UserKey 701 l.boundaryContext.largestUserKey = file.Largest.UserKey 702 l.boundaryContext.isLargestUserKeyExclusive = file.Largest.IsExclusiveSentinel() 703 } 704 return newFileLoaded 705 } 706 } 707 708 // In race builds we verify that the keys returned by levelIter lie within 709 // [lower,upper). 710 func (l *levelIter) verify(key *InternalKey, val base.LazyValue) (*InternalKey, base.LazyValue) { 711 // Note that invariants.Enabled is a compile time constant, which means the 712 // block of code will be compiled out of normal builds making this method 713 // eligible for inlining. Do not change this to use a variable. 714 if invariants.Enabled && !l.disableInvariants && key != nil { 715 // We allow returning a boundary key that is outside of the lower/upper 716 // bounds as such keys are always range tombstones which will be skipped by 717 // the Iterator. 718 if l.lower != nil && key != l.smallestBoundary && l.cmp(key.UserKey, l.lower) < 0 { 719 l.logger.Fatalf("levelIter %s: lower bound violation: %s < %s\n%s", l.level, key, l.lower, debug.Stack()) 720 } 721 if l.upper != nil && key != l.largestBoundary && l.cmp(key.UserKey, l.upper) > 0 { 722 l.logger.Fatalf("levelIter %s: upper bound violation: %s > %s\n%s", l.level, key, l.upper, debug.Stack()) 723 } 724 } 725 return key, val 726 } 727 728 func (l *levelIter) SeekGE(key []byte, flags base.SeekGEFlags) (*InternalKey, base.LazyValue) { 729 l.err = nil // clear cached iteration error 730 if l.boundaryContext != nil { 731 l.boundaryContext.isSyntheticIterBoundsKey = false 732 l.boundaryContext.isIgnorableBoundaryKey = false 733 } 734 // NB: the top-level Iterator has already adjusted key based on 735 // IterOptions.LowerBound. 736 loadFileIndicator := l.loadFile(l.findFileGE(key, flags), +1) 737 if loadFileIndicator == noFileLoaded { 738 return nil, base.LazyValue{} 739 } 740 if loadFileIndicator == newFileLoaded { 741 // File changed, so l.iter has changed, and that iterator is not 742 // positioned appropriately. 743 flags = flags.DisableTrySeekUsingNext() 744 } 745 if ikey, val := l.iter.SeekGE(key, flags); ikey != nil { 746 return l.verify(ikey, val) 747 } 748 return l.verify(l.skipEmptyFileForward()) 749 } 750 751 func (l *levelIter) SeekPrefixGE( 752 prefix, key []byte, flags base.SeekGEFlags, 753 ) (*base.InternalKey, base.LazyValue) { 754 l.err = nil // clear cached iteration error 755 if l.boundaryContext != nil { 756 l.boundaryContext.isSyntheticIterBoundsKey = false 757 l.boundaryContext.isIgnorableBoundaryKey = false 758 } 759 760 // NB: the top-level Iterator has already adjusted key based on 761 // IterOptions.LowerBound. 762 loadFileIndicator := l.loadFile(l.findFileGE(key, flags), +1) 763 if loadFileIndicator == noFileLoaded { 764 return nil, base.LazyValue{} 765 } 766 if loadFileIndicator == newFileLoaded { 767 // File changed, so l.iter has changed, and that iterator is not 768 // positioned appropriately. 769 flags = flags.DisableTrySeekUsingNext() 770 } 771 if key, val := l.iter.SeekPrefixGE(prefix, key, flags); key != nil { 772 return l.verify(key, val) 773 } 774 // When SeekPrefixGE returns nil, we have not necessarily reached the end of 775 // the sstable. All we know is that a key with prefix does not exist in the 776 // current sstable. We do know that the key lies within the bounds of the 777 // table as findFileGE found the table where key <= meta.Largest. We return 778 // the table's bound with isIgnorableBoundaryKey set. 779 if l.rangeDelIterPtr != nil && *l.rangeDelIterPtr != nil { 780 if l.tableOpts.UpperBound != nil { 781 l.syntheticBoundary.UserKey = l.tableOpts.UpperBound 782 l.syntheticBoundary.Trailer = InternalKeyRangeDeleteSentinel 783 l.largestBoundary = &l.syntheticBoundary 784 if l.boundaryContext != nil { 785 l.boundaryContext.isSyntheticIterBoundsKey = true 786 l.boundaryContext.isIgnorableBoundaryKey = false 787 } 788 return l.verify(l.largestBoundary, base.LazyValue{}) 789 } 790 // Return the file's largest bound, ensuring this file stays open until 791 // the mergingIter advances beyond the file's bounds. We set 792 // isIgnorableBoundaryKey to signal that the actual key returned should 793 // be ignored, and does not represent a real key in the database. 794 l.largestBoundary = &l.iterFile.LargestPointKey 795 if l.boundaryContext != nil { 796 l.boundaryContext.isSyntheticIterBoundsKey = false 797 l.boundaryContext.isIgnorableBoundaryKey = true 798 } 799 return l.verify(l.largestBoundary, base.LazyValue{}) 800 } 801 // It is possible that we are here because bloom filter matching failed. In 802 // that case it is likely that all keys matching the prefix are wholly 803 // within the current file and cannot be in the subsequent file. In that 804 // case we don't want to go to the next file, since loading and seeking in 805 // there has some cost. Additionally, for sparse key spaces, loading the 806 // next file will defeat the optimization for the next SeekPrefixGE that is 807 // called with flags.TrySeekUsingNext(), since for sparse key spaces it is 808 // likely that the next key will also be contained in the current file. 809 var n int 810 if l.split != nil { 811 // If the split function is specified, calculate the prefix length accordingly. 812 n = l.split(l.iterFile.LargestPointKey.UserKey) 813 } else { 814 // If the split function is not specified, the entire key is used as the 815 // prefix. This case can occur when getIter uses SeekPrefixGE. 816 n = len(l.iterFile.LargestPointKey.UserKey) 817 } 818 if l.cmp(prefix, l.iterFile.LargestPointKey.UserKey[:n]) < 0 { 819 return nil, base.LazyValue{} 820 } 821 return l.verify(l.skipEmptyFileForward()) 822 } 823 824 func (l *levelIter) SeekLT(key []byte, flags base.SeekLTFlags) (*InternalKey, base.LazyValue) { 825 l.err = nil // clear cached iteration error 826 if l.boundaryContext != nil { 827 l.boundaryContext.isSyntheticIterBoundsKey = false 828 l.boundaryContext.isIgnorableBoundaryKey = false 829 } 830 831 // NB: the top-level Iterator has already adjusted key based on 832 // IterOptions.UpperBound. 833 if l.loadFile(l.findFileLT(key, flags), -1) == noFileLoaded { 834 return nil, base.LazyValue{} 835 } 836 if key, val := l.iter.SeekLT(key, flags); key != nil { 837 return l.verify(key, val) 838 } 839 return l.verify(l.skipEmptyFileBackward()) 840 } 841 842 func (l *levelIter) First() (*InternalKey, base.LazyValue) { 843 l.err = nil // clear cached iteration error 844 if l.boundaryContext != nil { 845 l.boundaryContext.isSyntheticIterBoundsKey = false 846 l.boundaryContext.isIgnorableBoundaryKey = false 847 } 848 849 // NB: the top-level Iterator will call SeekGE if IterOptions.LowerBound is 850 // set. 851 if l.loadFile(l.files.First(), +1) == noFileLoaded { 852 return nil, base.LazyValue{} 853 } 854 if key, val := l.iter.First(); key != nil { 855 return l.verify(key, val) 856 } 857 return l.verify(l.skipEmptyFileForward()) 858 } 859 860 func (l *levelIter) Last() (*InternalKey, base.LazyValue) { 861 l.err = nil // clear cached iteration error 862 if l.boundaryContext != nil { 863 l.boundaryContext.isSyntheticIterBoundsKey = false 864 l.boundaryContext.isIgnorableBoundaryKey = false 865 } 866 867 // NB: the top-level Iterator will call SeekLT if IterOptions.UpperBound is 868 // set. 869 if l.loadFile(l.files.Last(), -1) == noFileLoaded { 870 return nil, base.LazyValue{} 871 } 872 if key, val := l.iter.Last(); key != nil { 873 return l.verify(key, val) 874 } 875 return l.verify(l.skipEmptyFileBackward()) 876 } 877 878 func (l *levelIter) Next() (*InternalKey, base.LazyValue) { 879 if l.err != nil || l.iter == nil { 880 return nil, base.LazyValue{} 881 } 882 if l.boundaryContext != nil { 883 l.boundaryContext.isSyntheticIterBoundsKey = false 884 l.boundaryContext.isIgnorableBoundaryKey = false 885 } 886 887 switch { 888 case l.largestBoundary != nil: 889 if l.tableOpts.UpperBound != nil { 890 // The UpperBound was within this file, so don't load the next 891 // file. We leave the largestBoundary unchanged so that subsequent 892 // calls to Next() stay at this file. If a Seek/First/Last call is 893 // made and this file continues to be relevant, loadFile() will 894 // set the largestBoundary to nil. 895 if l.rangeDelIterPtr != nil { 896 *l.rangeDelIterPtr = nil 897 } 898 return nil, base.LazyValue{} 899 } 900 // We're stepping past the boundary key, so now we can load the next file. 901 if l.loadFile(l.files.Next(), +1) != noFileLoaded { 902 if key, val := l.iter.First(); key != nil { 903 return l.verify(key, val) 904 } 905 return l.verify(l.skipEmptyFileForward()) 906 } 907 return nil, base.LazyValue{} 908 909 default: 910 // Reset the smallest boundary since we're moving away from it. 911 l.smallestBoundary = nil 912 if key, val := l.iter.Next(); key != nil { 913 return l.verify(key, val) 914 } 915 } 916 return l.verify(l.skipEmptyFileForward()) 917 } 918 919 func (l *levelIter) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) { 920 if l.err != nil || l.iter == nil { 921 return nil, base.LazyValue{} 922 } 923 if l.boundaryContext != nil { 924 l.boundaryContext.isSyntheticIterBoundsKey = false 925 l.boundaryContext.isIgnorableBoundaryKey = false 926 } 927 928 switch { 929 case l.largestBoundary != nil: 930 if l.tableOpts.UpperBound != nil { 931 // The UpperBound was within this file, so don't load the next 932 // file. We leave the largestBoundary unchanged so that subsequent 933 // calls to Next() stay at this file. If a Seek/First/Last call is 934 // made and this file continues to be relevant, loadFile() will 935 // set the largestBoundary to nil. 936 if l.rangeDelIterPtr != nil { 937 *l.rangeDelIterPtr = nil 938 } 939 return nil, base.LazyValue{} 940 } 941 // We're stepping past the boundary key, so we need to load a later 942 // file. 943 944 default: 945 // Reset the smallest boundary since we're moving away from it. 946 l.smallestBoundary = nil 947 948 if key, val := l.iter.NextPrefix(succKey); key != nil { 949 return l.verify(key, val) 950 } 951 // Fall through to seeking. 952 } 953 954 // Seek the manifest level iterator using TrySeekUsingNext=true and 955 // RelativeSeek=true so that we take advantage of the knowledge that 956 // `succKey` can only be contained in later files. 957 metadataSeekFlags := base.SeekGEFlagsNone.EnableTrySeekUsingNext().EnableRelativeSeek() 958 if l.loadFile(l.findFileGE(succKey, metadataSeekFlags), +1) != noFileLoaded { 959 // NB: The SeekGE on the file's iterator must not set TrySeekUsingNext, 960 // because l.iter is unpositioned. 961 if key, val := l.iter.SeekGE(succKey, base.SeekGEFlagsNone); key != nil { 962 return l.verify(key, val) 963 } 964 return l.verify(l.skipEmptyFileForward()) 965 } 966 return nil, base.LazyValue{} 967 } 968 969 func (l *levelIter) Prev() (*InternalKey, base.LazyValue) { 970 if l.err != nil || l.iter == nil { 971 return nil, base.LazyValue{} 972 } 973 if l.boundaryContext != nil { 974 l.boundaryContext.isSyntheticIterBoundsKey = false 975 l.boundaryContext.isIgnorableBoundaryKey = false 976 } 977 978 switch { 979 case l.smallestBoundary != nil: 980 if l.tableOpts.LowerBound != nil { 981 // The LowerBound was within this file, so don't load the previous 982 // file. We leave the smallestBoundary unchanged so that 983 // subsequent calls to Prev() stay at this file. If a 984 // Seek/First/Last call is made and this file continues to be 985 // relevant, loadFile() will set the smallestBoundary to nil. 986 if l.rangeDelIterPtr != nil { 987 *l.rangeDelIterPtr = nil 988 } 989 return nil, base.LazyValue{} 990 } 991 // We're stepping past the boundary key, so now we can load the prev file. 992 if l.loadFile(l.files.Prev(), -1) != noFileLoaded { 993 if key, val := l.iter.Last(); key != nil { 994 return l.verify(key, val) 995 } 996 return l.verify(l.skipEmptyFileBackward()) 997 } 998 return nil, base.LazyValue{} 999 1000 default: 1001 // Reset the largest boundary since we're moving away from it. 1002 l.largestBoundary = nil 1003 if key, val := l.iter.Prev(); key != nil { 1004 return l.verify(key, val) 1005 } 1006 } 1007 return l.verify(l.skipEmptyFileBackward()) 1008 } 1009 1010 func (l *levelIter) skipEmptyFileForward() (*InternalKey, base.LazyValue) { 1011 var key *InternalKey 1012 var val base.LazyValue 1013 // The first iteration of this loop starts with an already exhausted 1014 // l.iter. The reason for the exhaustion is either that we iterated to the 1015 // end of the sstable, or our iteration was terminated early due to the 1016 // presence of an upper-bound or the use of SeekPrefixGE. If 1017 // l.rangeDelIterPtr is non-nil, we may need to pretend the iterator is 1018 // not exhausted to allow for the merging to finish consuming the 1019 // l.rangeDelIterPtr before levelIter switches the rangeDelIter from 1020 // under it. This pretense is done by either generating a synthetic 1021 // boundary key or returning the largest key of the file, depending on the 1022 // exhaustion reason. 1023 1024 // Subsequent iterations will examine consecutive files such that the first 1025 // file that does not have an exhausted iterator causes the code to return 1026 // that key, else the behavior described above if there is a corresponding 1027 // rangeDelIterPtr. 1028 for ; key == nil; key, val = l.iter.First() { 1029 if l.rangeDelIterPtr != nil { 1030 // We're being used as part of a mergingIter and we've exhausted the 1031 // current sstable. If an upper bound is present and the upper bound lies 1032 // within the current sstable, then we will have reached the upper bound 1033 // rather than the end of the sstable. We need to return a synthetic 1034 // boundary key so that mergingIter can use the range tombstone iterator 1035 // until the other levels have reached this boundary. 1036 // 1037 // It is safe to set the boundary key to the UpperBound user key 1038 // with the RANGEDEL sentinel since it is the smallest InternalKey 1039 // that matches the exclusive upper bound, and does not represent 1040 // a real key. 1041 if l.tableOpts.UpperBound != nil { 1042 if *l.rangeDelIterPtr != nil { 1043 l.syntheticBoundary.UserKey = l.tableOpts.UpperBound 1044 l.syntheticBoundary.Trailer = InternalKeyRangeDeleteSentinel 1045 l.largestBoundary = &l.syntheticBoundary 1046 if l.boundaryContext != nil { 1047 l.boundaryContext.isSyntheticIterBoundsKey = true 1048 } 1049 return l.largestBoundary, base.LazyValue{} 1050 } 1051 // Else there are no range deletions in this sstable. This 1052 // helps with performance when many levels are populated with 1053 // sstables and most don't have any actual keys within the 1054 // bounds. 1055 return nil, base.LazyValue{} 1056 } 1057 // If the boundary is a range deletion tombstone, return that key. 1058 if l.iterFile.LargestPointKey.Kind() == InternalKeyKindRangeDelete { 1059 l.largestBoundary = &l.iterFile.LargestPointKey 1060 if l.boundaryContext != nil { 1061 l.boundaryContext.isIgnorableBoundaryKey = true 1062 } 1063 return l.largestBoundary, base.LazyValue{} 1064 } 1065 // If the last point iterator positioning op might've skipped keys, 1066 // it's possible the file's range deletions are still relevant to 1067 // other levels. Return the largest boundary as a special ignorable 1068 // marker to avoid advancing to the next file. 1069 // 1070 // The sstable iterator cannot guarantee that keys were skipped. A 1071 // SeekGE that lands on a index separator k only knows that the 1072 // block at the index entry contains keys ≤ k. We can't know whether 1073 // there were actually keys between the seek key and the index 1074 // separator key. If the block is then excluded due to block 1075 // property filters, the iterator does not know whether keys were 1076 // actually skipped by the block's exclusion. 1077 // 1078 // Since MaybeFilteredKeys cannot guarantee that keys were skipped, 1079 // it's possible l.iterFile.Largest was already returned. Returning 1080 // l.iterFile.Largest again is a violation of the strict 1081 // monotonicity normally provided. The mergingIter's heap can 1082 // tolerate this repeat key and in this case will keep the level at 1083 // the top of the heap and immediately skip the entry, advancing to 1084 // the next file. 1085 if *l.rangeDelIterPtr != nil && l.filteredIter != nil && 1086 l.filteredIter.MaybeFilteredKeys() { 1087 l.largestBoundary = &l.iterFile.Largest 1088 if l.boundaryContext != nil { 1089 l.boundaryContext.isIgnorableBoundaryKey = true 1090 } 1091 return l.largestBoundary, base.LazyValue{} 1092 } 1093 } 1094 1095 // Current file was exhausted. Move to the next file. 1096 if l.loadFile(l.files.Next(), +1) == noFileLoaded { 1097 return nil, base.LazyValue{} 1098 } 1099 } 1100 return key, val 1101 } 1102 1103 func (l *levelIter) skipEmptyFileBackward() (*InternalKey, base.LazyValue) { 1104 var key *InternalKey 1105 var val base.LazyValue 1106 // The first iteration of this loop starts with an already exhausted 1107 // l.iter. The reason for the exhaustion is either that we iterated to the 1108 // end of the sstable, or our iteration was terminated early due to the 1109 // presence of a lower-bound. If l.rangeDelIterPtr is non-nil, we may need 1110 // to pretend the iterator is not exhausted to allow for the merging to 1111 // finish consuming the l.rangeDelIterPtr before levelIter switches the 1112 // rangeDelIter from under it. This pretense is done by either generating 1113 // a synthetic boundary key or returning the smallest key of the file, 1114 // depending on the exhaustion reason. 1115 1116 // Subsequent iterations will examine consecutive files such that the first 1117 // file that does not have an exhausted iterator causes the code to return 1118 // that key, else the behavior described above if there is a corresponding 1119 // rangeDelIterPtr. 1120 for ; key == nil; key, val = l.iter.Last() { 1121 if l.rangeDelIterPtr != nil { 1122 // We're being used as part of a mergingIter and we've exhausted the 1123 // current sstable. If a lower bound is present and the lower bound lies 1124 // within the current sstable, then we will have reached the lower bound 1125 // rather than the beginning of the sstable. We need to return a 1126 // synthetic boundary key so that mergingIter can use the range tombstone 1127 // iterator until the other levels have reached this boundary. 1128 // 1129 // It is safe to set the boundary key to the LowerBound user key 1130 // with the RANGEDEL sentinel since it is the smallest InternalKey 1131 // that is within the inclusive lower bound, and does not 1132 // represent a real key. 1133 if l.tableOpts.LowerBound != nil { 1134 if *l.rangeDelIterPtr != nil { 1135 l.syntheticBoundary.UserKey = l.tableOpts.LowerBound 1136 l.syntheticBoundary.Trailer = InternalKeyRangeDeleteSentinel 1137 l.smallestBoundary = &l.syntheticBoundary 1138 if l.boundaryContext != nil { 1139 l.boundaryContext.isSyntheticIterBoundsKey = true 1140 } 1141 return l.smallestBoundary, base.LazyValue{} 1142 } 1143 // Else there are no range deletions in this sstable. This 1144 // helps with performance when many levels are populated with 1145 // sstables and most don't have any actual keys within the 1146 // bounds. 1147 return nil, base.LazyValue{} 1148 } 1149 // If the boundary is a range deletion tombstone, return that key. 1150 if l.iterFile.SmallestPointKey.Kind() == InternalKeyKindRangeDelete { 1151 l.smallestBoundary = &l.iterFile.SmallestPointKey 1152 if l.boundaryContext != nil { 1153 l.boundaryContext.isIgnorableBoundaryKey = true 1154 } 1155 return l.smallestBoundary, base.LazyValue{} 1156 } 1157 // If the last point iterator positioning op skipped keys, it's 1158 // possible the file's range deletions are still relevant to other 1159 // levels. Return the smallest boundary as a special ignorable key 1160 // to avoid advancing to the next file. 1161 // 1162 // The sstable iterator cannot guarantee that keys were skipped. A 1163 // SeekGE that lands on a index separator k only knows that the 1164 // block at the index entry contains keys ≤ k. We can't know whether 1165 // there were actually keys between the seek key and the index 1166 // separator key. If the block is then excluded due to block 1167 // property filters, the iterator does not know whether keys were 1168 // actually skipped by the block's exclusion. 1169 // 1170 // Since MaybeFilteredKeys cannot guarantee that keys were skipped, 1171 // it's possible l.iterFile.Smallest was already returned. Returning 1172 // l.iterFile.Smallest again is a violation of the strict 1173 // monotonicity normally provided. The mergingIter's heap can 1174 // tolerate this repeat key and in this case will keep the level at 1175 // the top of the heap and immediately skip the entry, advancing to 1176 // the next file. 1177 if *l.rangeDelIterPtr != nil && l.filteredIter != nil && l.filteredIter.MaybeFilteredKeys() { 1178 l.smallestBoundary = &l.iterFile.Smallest 1179 if l.boundaryContext != nil { 1180 l.boundaryContext.isIgnorableBoundaryKey = true 1181 } 1182 return l.smallestBoundary, base.LazyValue{} 1183 } 1184 } 1185 1186 // Current file was exhausted. Move to the previous file. 1187 if l.loadFile(l.files.Prev(), -1) == noFileLoaded { 1188 return nil, base.LazyValue{} 1189 } 1190 } 1191 return key, val 1192 } 1193 1194 func (l *levelIter) Error() error { 1195 if l.err != nil || l.iter == nil { 1196 return l.err 1197 } 1198 return l.iter.Error() 1199 } 1200 1201 func (l *levelIter) Close() error { 1202 if l.iter != nil { 1203 l.err = l.iter.Close() 1204 l.iter = nil 1205 } 1206 if l.rangeDelIterPtr != nil { 1207 if t := l.rangeDelIterCopy; t != nil { 1208 l.err = firstError(l.err, t.Close()) 1209 } 1210 *l.rangeDelIterPtr = nil 1211 l.rangeDelIterCopy = nil 1212 } 1213 return l.err 1214 } 1215 1216 func (l *levelIter) SetBounds(lower, upper []byte) { 1217 l.lower = lower 1218 l.upper = upper 1219 1220 if l.iter == nil { 1221 return 1222 } 1223 1224 // Update tableOpts.{Lower,Upper}Bound in case the new boundaries fall within 1225 // the boundaries of the current table. 1226 if l.initTableBounds(l.iterFile) != 0 { 1227 // The table does not overlap the bounds. Close() will set levelIter.err if 1228 // an error occurs. 1229 _ = l.Close() 1230 return 1231 } 1232 1233 l.iter.SetBounds(l.tableOpts.LowerBound, l.tableOpts.UpperBound) 1234 } 1235 1236 func (l *levelIter) SetContext(ctx context.Context) { 1237 l.ctx = ctx 1238 if l.iter != nil { 1239 // TODO(sumeer): this is losing the ctx = objiotracing.WithLevel(ctx, 1240 // manifest.LevelToInt(opts.level)) that happens in table_cache.go. 1241 l.iter.SetContext(ctx) 1242 } 1243 } 1244 1245 func (l *levelIter) String() string { 1246 if l.iterFile != nil { 1247 return fmt.Sprintf("%s: fileNum=%s", l.level, l.iter.String()) 1248 } 1249 return fmt.Sprintf("%s: fileNum=<nil>", l.level) 1250 } 1251 1252 var _ internalIterator = &levelIter{}