github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/sstable/reader_iter_two_lvl.go (about) 1 // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package sstable 6 7 import ( 8 "context" 9 "fmt" 10 11 "github.com/cockroachdb/pebble/internal/base" 12 "github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing" 13 ) 14 15 type twoLevelIterator struct { 16 singleLevelIterator 17 // maybeFilteredKeysSingleLevel indicates whether the last iterator 18 // positioning operation may have skipped any index blocks due to 19 // block-property filters when positioning the top-level-index. 20 maybeFilteredKeysTwoLevel bool 21 topLevelIndex blockIter 22 } 23 24 // twoLevelIterator implements the base.InternalIterator interface. 25 var _ base.InternalIterator = (*twoLevelIterator)(nil) 26 27 // loadIndex loads the index block at the current top level index position and 28 // leaves i.index unpositioned. If unsuccessful, it gets i.err to any error 29 // encountered, which may be nil if we have simply exhausted the entire table. 30 // This is used for two level indexes. 31 func (i *twoLevelIterator) loadIndex(dir int8) loadBlockResult { 32 // Ensure the index data block iterators are invalidated even if loading of 33 // the index fails. 34 i.data.invalidate() 35 i.index.invalidate() 36 if !i.topLevelIndex.valid() { 37 i.index.offset = 0 38 i.index.restarts = 0 39 return loadBlockFailed 40 } 41 v := i.topLevelIndex.value() 42 bhp, err := decodeBlockHandleWithProperties(v.InPlaceValue()) 43 if err != nil { 44 i.err = base.CorruptionErrorf("pebble/table: corrupt top level index entry") 45 return loadBlockFailed 46 } 47 if i.bpfs != nil { 48 intersects, err := i.bpfs.intersects(bhp.Props) 49 if err != nil { 50 i.err = errCorruptIndexEntry 51 return loadBlockFailed 52 } 53 if intersects == blockMaybeExcluded { 54 intersects = i.resolveMaybeExcluded(dir) 55 } 56 if intersects == blockExcluded { 57 i.maybeFilteredKeysTwoLevel = true 58 return loadBlockIrrelevant 59 } 60 // blockIntersects 61 } 62 ctx := objiotracing.WithBlockType(i.ctx, objiotracing.MetadataBlock) 63 indexBlock, err := i.reader.readBlock( 64 ctx, bhp.BlockHandle, nil /* transform */, nil /* readHandle */, i.stats, &i.iterStats, i.bufferPool) 65 if err != nil { 66 i.err = err 67 return loadBlockFailed 68 } 69 if i.err = i.index.initHandle(i.cmp, indexBlock, i.reader.Properties.GlobalSeqNum, false); i.err == nil { 70 return loadBlockOK 71 } 72 return loadBlockFailed 73 } 74 75 // resolveMaybeExcluded is invoked when the block-property filterer has found 76 // that an index block is excluded according to its properties but only if its 77 // bounds fall within the filter's current bounds. This function consults the 78 // apprioriate bound, depending on the iteration direction, and returns either 79 // `blockIntersects` or 80 // `blockMaybeExcluded`. 81 func (i *twoLevelIterator) resolveMaybeExcluded(dir int8) intersectsResult { 82 // This iterator is configured with a bound-limited block property filter. 83 // The bpf determined this entire index block could be excluded from 84 // iteration based on the property encoded in the block handle. However, we 85 // still need to determine if the index block is wholly contained within the 86 // filter's key bounds. 87 // 88 // External guarantees ensure all its data blocks' keys are ≥ the filter's 89 // lower bound during forward iteration, and that all its data blocks' keys 90 // are < the filter's upper bound during backward iteration. We only need to 91 // determine if the opposite bound is also met. 92 // 93 // The index separator in topLevelIndex.Key() provides an inclusive 94 // upper-bound for the index block's keys, guaranteeing that all its keys 95 // are ≤ topLevelIndex.Key(). For forward iteration, this is all we need. 96 if dir > 0 { 97 // Forward iteration. 98 if i.bpfs.boundLimitedFilter.KeyIsWithinUpperBound(i.topLevelIndex.Key().UserKey) { 99 return blockExcluded 100 } 101 return blockIntersects 102 } 103 104 // Reverse iteration. 105 // 106 // Because we're iterating in the reverse direction, we don't yet have 107 // enough context available to determine if the block is wholly contained 108 // within its bounds. This case arises only during backward iteration, 109 // because of the way the index is structured. 110 // 111 // Consider a bound-limited bpf limited to the bounds [b,d), loading the 112 // block with separator `c`. During reverse iteration, the guarantee that 113 // all the block's keys are < `d` is externally provided, but no guarantee 114 // is made on the bpf's lower bound. The separator `c` only provides an 115 // inclusive upper bound on the block's keys, indicating that the 116 // corresponding block handle points to a block containing only keys ≤ `c`. 117 // 118 // To establish a lower bound, we step the top-level index backwards to read 119 // the previous block's separator, which provides an inclusive lower bound 120 // on the original index block's keys. Afterwards, we step forward to 121 // restore our top-level index position. 122 if peekKey, _ := i.topLevelIndex.Prev(); peekKey == nil { 123 // The original block points to the first index block of this table. If 124 // we knew the lower bound for the entire table, it could provide a 125 // lower bound, but the code refactoring necessary to read it doesn't 126 // seem worth the payoff. We fall through to loading the block. 127 } else if i.bpfs.boundLimitedFilter.KeyIsWithinLowerBound(peekKey.UserKey) { 128 // The lower-bound on the original index block falls within the filter's 129 // bounds, and we can skip the block (after restoring our current 130 // top-level index position). 131 _, _ = i.topLevelIndex.Next() 132 return blockExcluded 133 } 134 _, _ = i.topLevelIndex.Next() 135 return blockIntersects 136 } 137 138 // Note that lower, upper passed into init has nothing to do with virtual sstable 139 // bounds. If the virtualState passed in is not nil, then virtual sstable bounds 140 // will be enforced. 141 func (i *twoLevelIterator) init( 142 ctx context.Context, 143 r *Reader, 144 v *virtualState, 145 lower, upper []byte, 146 filterer *BlockPropertiesFilterer, 147 useFilter, hideObsoletePoints bool, 148 stats *base.InternalIteratorStats, 149 categoryAndQoS CategoryAndQoS, 150 statsCollector *CategoryStatsCollector, 151 rp ReaderProvider, 152 bufferPool *BufferPool, 153 ) error { 154 if r.err != nil { 155 return r.err 156 } 157 i.iterStats.init(categoryAndQoS, statsCollector) 158 topLevelIndexH, err := r.readIndex(ctx, stats, &i.iterStats) 159 if err != nil { 160 return err 161 } 162 if v != nil { 163 i.vState = v 164 // Note that upper is exclusive here. 165 i.endKeyInclusive, lower, upper = v.constrainBounds(lower, upper, false /* endInclusive */) 166 } 167 168 i.ctx = ctx 169 i.lower = lower 170 i.upper = upper 171 i.bpfs = filterer 172 i.useFilter = useFilter 173 i.reader = r 174 i.cmp = r.Compare 175 i.stats = stats 176 i.hideObsoletePoints = hideObsoletePoints 177 i.bufferPool = bufferPool 178 err = i.topLevelIndex.initHandle(i.cmp, topLevelIndexH, r.Properties.GlobalSeqNum, false) 179 if err != nil { 180 // blockIter.Close releases topLevelIndexH and always returns a nil error 181 _ = i.topLevelIndex.Close() 182 return err 183 } 184 i.dataRH = r.readable.NewReadHandle(ctx) 185 if r.tableFormat >= TableFormatPebblev3 { 186 if r.Properties.NumValueBlocks > 0 { 187 i.vbReader = &valueBlockReader{ 188 bpOpen: i, 189 rp: rp, 190 vbih: r.valueBIH, 191 stats: stats, 192 } 193 i.data.lazyValueHandling.vbr = i.vbReader 194 i.vbRH = r.readable.NewReadHandle(ctx) 195 } 196 i.data.lazyValueHandling.hasValuePrefix = true 197 } 198 return nil 199 } 200 201 func (i *twoLevelIterator) String() string { 202 if i.vState != nil { 203 return i.vState.fileNum.String() 204 } 205 return i.reader.fileNum.String() 206 } 207 208 // MaybeFilteredKeys may be called when an iterator is exhausted to indicate 209 // whether or not the last positioning method may have skipped any keys due to 210 // block-property filters. 211 func (i *twoLevelIterator) MaybeFilteredKeys() bool { 212 // While reading sstables with two-level indexes, knowledge of whether we've 213 // filtered keys is tracked separately for each index level. The 214 // seek-using-next optimizations have different criteria. We can only reset 215 // maybeFilteredKeys back to false during a seek when NOT using the 216 // fast-path that uses the current iterator position. 217 // 218 // If either level might have filtered keys to arrive at the current 219 // iterator position, return MaybeFilteredKeys=true. 220 return i.maybeFilteredKeysTwoLevel || i.maybeFilteredKeysSingleLevel 221 } 222 223 // SeekGE implements internalIterator.SeekGE, as documented in the pebble 224 // package. Note that SeekGE only checks the upper bound. It is up to the 225 // caller to ensure that key is greater than or equal to the lower bound. 226 func (i *twoLevelIterator) SeekGE( 227 key []byte, flags base.SeekGEFlags, 228 ) (*InternalKey, base.LazyValue) { 229 if i.vState != nil { 230 // Callers of SeekGE don't know about virtual sstable bounds, so we may 231 // have to internally restrict the bounds. 232 // 233 // TODO(bananabrick): We can optimize away this check for the level iter 234 // if necessary. 235 if i.cmp(key, i.lower) < 0 { 236 key = i.lower 237 } 238 } 239 240 err := i.err 241 i.err = nil // clear cached iteration error 242 243 // The twoLevelIterator could be already exhausted. Utilize that when 244 // trySeekUsingNext is true. See the comment about data-exhausted, PGDE, and 245 // bounds-exhausted near the top of the file. 246 if flags.TrySeekUsingNext() && 247 (i.exhaustedBounds == +1 || (i.data.isDataInvalidated() && i.index.isDataInvalidated())) && 248 err == nil { 249 // Already exhausted, so return nil. 250 return nil, base.LazyValue{} 251 } 252 253 // SeekGE performs various step-instead-of-seeking optimizations: eg enabled 254 // by trySeekUsingNext, or by monotonically increasing bounds (i.boundsCmp). 255 // Care must be taken to ensure that when performing these optimizations and 256 // the iterator becomes exhausted, i.maybeFilteredKeys is set appropriately. 257 // Consider a previous SeekGE that filtered keys from k until the current 258 // iterator position. 259 // 260 // If the previous SeekGE exhausted the iterator while seeking within the 261 // two-level index, it's possible keys greater than or equal to the current 262 // search key were filtered through skipped index blocks. We must not reuse 263 // the position of the two-level index iterator without remembering the 264 // previous value of maybeFilteredKeys. 265 266 // We fall into the slow path if i.index.isDataInvalidated() even if the 267 // top-level iterator is already positioned correctly and all other 268 // conditions are met. An alternative structure could reuse topLevelIndex's 269 // current position and reload the index block to which it points. Arguably, 270 // an index block load is expensive and the index block may still be earlier 271 // than the index block containing the sought key, resulting in a wasteful 272 // block load. 273 274 var dontSeekWithinSingleLevelIter bool 275 if i.topLevelIndex.isDataInvalidated() || !i.topLevelIndex.valid() || i.index.isDataInvalidated() || err != nil || 276 (i.boundsCmp <= 0 && !flags.TrySeekUsingNext()) || i.cmp(key, i.topLevelIndex.Key().UserKey) > 0 { 277 // Slow-path: need to position the topLevelIndex. 278 279 // The previous exhausted state of singleLevelIterator is no longer 280 // relevant, since we may be moving to a different index block. 281 i.exhaustedBounds = 0 282 i.maybeFilteredKeysTwoLevel = false 283 flags = flags.DisableTrySeekUsingNext() 284 var ikey *InternalKey 285 if ikey, _ = i.topLevelIndex.SeekGE(key, flags); ikey == nil { 286 i.data.invalidate() 287 i.index.invalidate() 288 return nil, base.LazyValue{} 289 } 290 291 result := i.loadIndex(+1) 292 if result == loadBlockFailed { 293 i.boundsCmp = 0 294 return nil, base.LazyValue{} 295 } 296 if result == loadBlockIrrelevant { 297 // Enforce the upper bound here since don't want to bother moving 298 // to the next entry in the top level index if upper bound is 299 // already exceeded. Note that the next entry starts with keys >= 300 // ikey.UserKey since even though this is the block separator, the 301 // same user key can span multiple index blocks. If upper is 302 // exclusive we use >= below, else we use >. 303 if i.upper != nil { 304 cmp := i.cmp(ikey.UserKey, i.upper) 305 if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { 306 i.exhaustedBounds = +1 307 } 308 } 309 // Fall through to skipForward. 310 dontSeekWithinSingleLevelIter = true 311 // Clear boundsCmp. 312 // 313 // In the typical cases where dontSeekWithinSingleLevelIter=false, 314 // the singleLevelIterator.SeekGE call will clear boundsCmp. 315 // However, in this case where dontSeekWithinSingleLevelIter=true, 316 // we never seek on the single-level iterator. This call will fall 317 // through to skipForward, which may improperly leave boundsCmp=+1 318 // unless we clear it here. 319 i.boundsCmp = 0 320 } 321 } else { 322 // INVARIANT: err == nil. 323 // 324 // Else fast-path: There are two possible cases, from 325 // (i.boundsCmp > 0 || flags.TrySeekUsingNext()): 326 // 327 // 1) The bounds have moved forward (i.boundsCmp > 0) and this SeekGE is 328 // respecting the lower bound (guaranteed by Iterator). We know that the 329 // iterator must already be positioned within or just outside the previous 330 // bounds. Therefore, the topLevelIndex iter cannot be positioned at an 331 // entry ahead of the seek position (though it can be positioned behind). 332 // The !i.cmp(key, i.topLevelIndex.Key().UserKey) > 0 confirms that it is 333 // not behind. Since it is not ahead and not behind it must be at the 334 // right position. 335 // 336 // 2) This SeekGE will land on a key that is greater than the key we are 337 // currently at (guaranteed by trySeekUsingNext), but since i.cmp(key, 338 // i.topLevelIndex.Key().UserKey) <= 0, we are at the correct lower level 339 // index block. No need to reset the state of singleLevelIterator. 340 // 341 // Note that cases 1 and 2 never overlap, and one of them must be true, 342 // but we have some test code (TestIterRandomizedMaybeFilteredKeys) that 343 // sets both to true, so we fix things here and then do an invariant 344 // check. 345 // 346 // This invariant checking is important enough that we do not gate it 347 // behind invariants.Enabled. 348 if i.boundsCmp > 0 { 349 // TODO(sumeer): fix TestIterRandomizedMaybeFilteredKeys so as to not 350 // need this behavior. 351 flags = flags.DisableTrySeekUsingNext() 352 } 353 if i.boundsCmp > 0 == flags.TrySeekUsingNext() { 354 panic(fmt.Sprintf("inconsistency in optimization case 1 %t and case 2 %t", 355 i.boundsCmp > 0, flags.TrySeekUsingNext())) 356 } 357 358 if !flags.TrySeekUsingNext() { 359 // Case 1. Bounds have changed so the previous exhausted bounds state is 360 // irrelevant. 361 // WARNING-data-exhausted: this is safe to do only because the monotonic 362 // bounds optimizations only work when !data-exhausted. If they also 363 // worked with data-exhausted, we have made it unclear whether 364 // data-exhausted is actually true. See the comment at the top of the 365 // file. 366 i.exhaustedBounds = 0 367 } 368 // Else flags.TrySeekUsingNext(). The i.exhaustedBounds is important to 369 // preserve for singleLevelIterator, and twoLevelIterator.skipForward. See 370 // bug https://github.com/cockroachdb/pebble/issues/2036. 371 } 372 373 if !dontSeekWithinSingleLevelIter { 374 // Note that while trySeekUsingNext could be false here, singleLevelIterator 375 // could do its own boundsCmp-based optimization to seek using next. 376 if ikey, val := i.singleLevelIterator.SeekGE(key, flags); ikey != nil { 377 return ikey, val 378 } 379 } 380 return i.skipForward() 381 } 382 383 // SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the 384 // pebble package. Note that SeekPrefixGE only checks the upper bound. It is up 385 // to the caller to ensure that key is greater than or equal to the lower bound. 386 func (i *twoLevelIterator) SeekPrefixGE( 387 prefix, key []byte, flags base.SeekGEFlags, 388 ) (*base.InternalKey, base.LazyValue) { 389 if i.vState != nil { 390 // Callers of SeekGE don't know about virtual sstable bounds, so we may 391 // have to internally restrict the bounds. 392 // 393 // TODO(bananabrick): We can optimize away this check for the level iter 394 // if necessary. 395 if i.cmp(key, i.lower) < 0 { 396 key = i.lower 397 } 398 } 399 400 // NOTE: prefix is only used for bloom filter checking and not later work in 401 // this method. Hence, we can use the existing iterator position if the last 402 // SeekPrefixGE did not fail bloom filter matching. 403 404 err := i.err 405 i.err = nil // clear cached iteration error 406 407 // The twoLevelIterator could be already exhausted. Utilize that when 408 // trySeekUsingNext is true. See the comment about data-exhausted, PGDE, and 409 // bounds-exhausted near the top of the file. 410 filterUsedAndDidNotMatch := 411 i.reader.tableFilter != nil && i.useFilter && !i.lastBloomFilterMatched 412 if flags.TrySeekUsingNext() && !filterUsedAndDidNotMatch && 413 (i.exhaustedBounds == +1 || (i.data.isDataInvalidated() && i.index.isDataInvalidated())) && 414 err == nil { 415 // Already exhausted, so return nil. 416 return nil, base.LazyValue{} 417 } 418 419 // Check prefix bloom filter. 420 if i.reader.tableFilter != nil && i.useFilter { 421 if !i.lastBloomFilterMatched { 422 // Iterator is not positioned based on last seek. 423 flags = flags.DisableTrySeekUsingNext() 424 } 425 i.lastBloomFilterMatched = false 426 var dataH bufferHandle 427 dataH, i.err = i.reader.readFilter(i.ctx, i.stats, &i.iterStats) 428 if i.err != nil { 429 i.data.invalidate() 430 return nil, base.LazyValue{} 431 } 432 mayContain := i.reader.tableFilter.mayContain(dataH.Get(), prefix) 433 dataH.Release() 434 if !mayContain { 435 // This invalidation may not be necessary for correctness, and may 436 // be a place to optimize later by reusing the already loaded 437 // block. It was necessary in earlier versions of the code since 438 // the caller was allowed to call Next when SeekPrefixGE returned 439 // nil. This is no longer allowed. 440 i.data.invalidate() 441 return nil, base.LazyValue{} 442 } 443 i.lastBloomFilterMatched = true 444 } 445 446 // Bloom filter matches. 447 448 // SeekPrefixGE performs various step-instead-of-seeking optimizations: eg 449 // enabled by trySeekUsingNext, or by monotonically increasing bounds 450 // (i.boundsCmp). Care must be taken to ensure that when performing these 451 // optimizations and the iterator becomes exhausted, 452 // i.maybeFilteredKeysTwoLevel is set appropriately. Consider a previous 453 // SeekPrefixGE that filtered keys from k until the current iterator 454 // position. 455 // 456 // If the previous SeekPrefixGE exhausted the iterator while seeking within 457 // the two-level index, it's possible keys greater than or equal to the 458 // current search key were filtered through skipped index blocks. We must 459 // not reuse the position of the two-level index iterator without 460 // remembering the previous value of maybeFilteredKeysTwoLevel. 461 462 // We fall into the slow path if i.index.isDataInvalidated() even if the 463 // top-level iterator is already positioned correctly and all other 464 // conditions are met. An alternative structure could reuse topLevelIndex's 465 // current position and reload the index block to which it points. Arguably, 466 // an index block load is expensive and the index block may still be earlier 467 // than the index block containing the sought key, resulting in a wasteful 468 // block load. 469 470 var dontSeekWithinSingleLevelIter bool 471 if i.topLevelIndex.isDataInvalidated() || !i.topLevelIndex.valid() || i.index.isDataInvalidated() || err != nil || 472 (i.boundsCmp <= 0 && !flags.TrySeekUsingNext()) || i.cmp(key, i.topLevelIndex.Key().UserKey) > 0 { 473 // Slow-path: need to position the topLevelIndex. 474 475 // The previous exhausted state of singleLevelIterator is no longer 476 // relevant, since we may be moving to a different index block. 477 i.exhaustedBounds = 0 478 i.maybeFilteredKeysTwoLevel = false 479 flags = flags.DisableTrySeekUsingNext() 480 var ikey *InternalKey 481 if ikey, _ = i.topLevelIndex.SeekGE(key, flags); ikey == nil { 482 i.data.invalidate() 483 i.index.invalidate() 484 return nil, base.LazyValue{} 485 } 486 487 result := i.loadIndex(+1) 488 if result == loadBlockFailed { 489 i.boundsCmp = 0 490 return nil, base.LazyValue{} 491 } 492 if result == loadBlockIrrelevant { 493 // Enforce the upper bound here since don't want to bother moving 494 // to the next entry in the top level index if upper bound is 495 // already exceeded. Note that the next entry starts with keys >= 496 // ikey.UserKey since even though this is the block separator, the 497 // same user key can span multiple index blocks. If upper is 498 // exclusive we use >= below, else we use >. 499 if i.upper != nil { 500 cmp := i.cmp(ikey.UserKey, i.upper) 501 if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { 502 i.exhaustedBounds = +1 503 } 504 } 505 // Fall through to skipForward. 506 dontSeekWithinSingleLevelIter = true 507 // Clear boundsCmp. 508 // 509 // In the typical cases where dontSeekWithinSingleLevelIter=false, 510 // the singleLevelIterator.SeekPrefixGE call will clear boundsCmp. 511 // However, in this case where dontSeekWithinSingleLevelIter=true, 512 // we never seek on the single-level iterator. This call will fall 513 // through to skipForward, which may improperly leave boundsCmp=+1 514 // unless we clear it here. 515 i.boundsCmp = 0 516 } 517 } else { 518 // INVARIANT: err == nil. 519 // 520 // Else fast-path: There are two possible cases, from 521 // (i.boundsCmp > 0 || flags.TrySeekUsingNext()): 522 // 523 // 1) The bounds have moved forward (i.boundsCmp > 0) and this 524 // SeekPrefixGE is respecting the lower bound (guaranteed by Iterator). We 525 // know that the iterator must already be positioned within or just 526 // outside the previous bounds. Therefore, the topLevelIndex iter cannot 527 // be positioned at an entry ahead of the seek position (though it can be 528 // positioned behind). The !i.cmp(key, i.topLevelIndex.Key().UserKey) > 0 529 // confirms that it is not behind. Since it is not ahead and not behind it 530 // must be at the right position. 531 // 532 // 2) This SeekPrefixGE will land on a key that is greater than the key we 533 // are currently at (guaranteed by trySeekUsingNext), but since i.cmp(key, 534 // i.topLevelIndex.Key().UserKey) <= 0, we are at the correct lower level 535 // index block. No need to reset the state of singleLevelIterator. 536 // 537 // Note that cases 1 and 2 never overlap, and one of them must be true. 538 // This invariant checking is important enough that we do not gate it 539 // behind invariants.Enabled. 540 if i.boundsCmp > 0 == flags.TrySeekUsingNext() { 541 panic(fmt.Sprintf("inconsistency in optimization case 1 %t and case 2 %t", 542 i.boundsCmp > 0, flags.TrySeekUsingNext())) 543 } 544 545 if !flags.TrySeekUsingNext() { 546 // Case 1. Bounds have changed so the previous exhausted bounds state is 547 // irrelevant. 548 // WARNING-data-exhausted: this is safe to do only because the monotonic 549 // bounds optimizations only work when !data-exhausted. If they also 550 // worked with data-exhausted, we have made it unclear whether 551 // data-exhausted is actually true. See the comment at the top of the 552 // file. 553 i.exhaustedBounds = 0 554 } 555 // Else flags.TrySeekUsingNext(). The i.exhaustedBounds is important to 556 // preserve for singleLevelIterator, and twoLevelIterator.skipForward. See 557 // bug https://github.com/cockroachdb/pebble/issues/2036. 558 } 559 560 if !dontSeekWithinSingleLevelIter { 561 if ikey, val := i.singleLevelIterator.seekPrefixGE( 562 prefix, key, flags, false /* checkFilter */); ikey != nil { 563 return ikey, val 564 } 565 } 566 // NB: skipForward checks whether exhaustedBounds is already +1. 567 return i.skipForward() 568 } 569 570 // virtualLast should only be called if i.vReader != nil and i.endKeyInclusive 571 // is true. 572 func (i *twoLevelIterator) virtualLast() (*InternalKey, base.LazyValue) { 573 if i.vState == nil { 574 panic("pebble: invalid call to virtualLast") 575 } 576 577 // Seek to the first internal key. 578 ikey, _ := i.SeekGE(i.upper, base.SeekGEFlagsNone) 579 if i.endKeyInclusive { 580 // Let's say the virtual sstable upper bound is c#1, with the keys c#3, c#2, 581 // c#1, d, e, ... in the sstable. So, the last key in the virtual sstable is 582 // c#1. We can perform SeekGE(i.upper) and then keep nexting until we find 583 // the last key with userkey == i.upper. 584 // 585 // TODO(bananabrick): Think about how to improve this. If many internal keys 586 // with the same user key at the upper bound then this could be slow, but 587 // maybe the odds of having many internal keys with the same user key at the 588 // upper bound are low. 589 for ikey != nil && i.cmp(ikey.UserKey, i.upper) == 0 { 590 ikey, _ = i.Next() 591 } 592 return i.Prev() 593 } 594 // We seeked to the first key >= i.upper. 595 return i.Prev() 596 } 597 598 // SeekLT implements internalIterator.SeekLT, as documented in the pebble 599 // package. Note that SeekLT only checks the lower bound. It is up to the 600 // caller to ensure that key is less than the upper bound. 601 func (i *twoLevelIterator) SeekLT( 602 key []byte, flags base.SeekLTFlags, 603 ) (*InternalKey, base.LazyValue) { 604 if i.vState != nil { 605 // Might have to fix upper bound since virtual sstable bounds are not 606 // known to callers of SeekLT. 607 // 608 // TODO(bananabrick): We can optimize away this check for the level iter 609 // if necessary. 610 cmp := i.cmp(key, i.upper) 611 // key == i.upper is fine. We'll do the right thing and return the 612 // first internal key with user key < key. 613 if cmp > 0 { 614 return i.virtualLast() 615 } 616 } 617 618 i.exhaustedBounds = 0 619 i.err = nil // clear cached iteration error 620 // Seek optimization only applies until iterator is first positioned after SetBounds. 621 i.boundsCmp = 0 622 623 var result loadBlockResult 624 var ikey *InternalKey 625 // NB: Unlike SeekGE, we don't have a fast-path here since we don't know 626 // whether the topLevelIndex is positioned after the position that would 627 // be returned by doing i.topLevelIndex.SeekGE(). To know this we would 628 // need to know the index key preceding the current one. 629 // NB: If a bound-limited block property filter is configured, it's 630 // externally ensured that the filter is disabled (through returning 631 // Intersects=false irrespective of the block props provided) during seeks. 632 i.maybeFilteredKeysTwoLevel = false 633 if ikey, _ = i.topLevelIndex.SeekGE(key, base.SeekGEFlagsNone); ikey == nil { 634 if ikey, _ = i.topLevelIndex.Last(); ikey == nil { 635 i.data.invalidate() 636 i.index.invalidate() 637 return nil, base.LazyValue{} 638 } 639 640 result = i.loadIndex(-1) 641 if result == loadBlockFailed { 642 return nil, base.LazyValue{} 643 } 644 if result == loadBlockOK { 645 if ikey, val := i.singleLevelIterator.lastInternal(); ikey != nil { 646 return i.maybeVerifyKey(ikey, val) 647 } 648 // Fall through to skipBackward since the singleLevelIterator did 649 // not have any blocks that satisfy the block interval 650 // constraints, or the lower bound was reached. 651 } 652 // Else loadBlockIrrelevant, so fall through. 653 } else { 654 result = i.loadIndex(-1) 655 if result == loadBlockFailed { 656 return nil, base.LazyValue{} 657 } 658 if result == loadBlockOK { 659 if ikey, val := i.singleLevelIterator.SeekLT(key, flags); ikey != nil { 660 return i.maybeVerifyKey(ikey, val) 661 } 662 // Fall through to skipBackward since the singleLevelIterator did 663 // not have any blocks that satisfy the block interval 664 // constraint, or the lower bound was reached. 665 } 666 // Else loadBlockIrrelevant, so fall through. 667 } 668 if result == loadBlockIrrelevant { 669 // Enforce the lower bound here since don't want to bother moving to 670 // the previous entry in the top level index if lower bound is already 671 // exceeded. Note that the previous entry starts with keys <= 672 // ikey.UserKey since even though this is the current block's 673 // separator, the same user key can span multiple index blocks. 674 if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 { 675 i.exhaustedBounds = -1 676 } 677 } 678 // NB: skipBackward checks whether exhaustedBounds is already -1. 679 return i.skipBackward() 680 } 681 682 // First implements internalIterator.First, as documented in the pebble 683 // package. Note that First only checks the upper bound. It is up to the caller 684 // to ensure that key is greater than or equal to the lower bound (e.g. via a 685 // call to SeekGE(lower)). 686 func (i *twoLevelIterator) First() (*InternalKey, base.LazyValue) { 687 // If the iterator was created on a virtual sstable, we will SeekGE to the 688 // lower bound instead of using First, because First does not respect 689 // bounds. 690 if i.vState != nil { 691 return i.SeekGE(i.lower, base.SeekGEFlagsNone) 692 } 693 694 if i.lower != nil { 695 panic("twoLevelIterator.First() used despite lower bound") 696 } 697 i.exhaustedBounds = 0 698 i.maybeFilteredKeysTwoLevel = false 699 i.err = nil // clear cached iteration error 700 // Seek optimization only applies until iterator is first positioned after SetBounds. 701 i.boundsCmp = 0 702 703 var ikey *InternalKey 704 if ikey, _ = i.topLevelIndex.First(); ikey == nil { 705 return nil, base.LazyValue{} 706 } 707 708 result := i.loadIndex(+1) 709 if result == loadBlockFailed { 710 return nil, base.LazyValue{} 711 } 712 if result == loadBlockOK { 713 if ikey, val := i.singleLevelIterator.First(); ikey != nil { 714 return ikey, val 715 } 716 // Else fall through to skipForward. 717 } else { 718 // result == loadBlockIrrelevant. Enforce the upper bound here since 719 // don't want to bother moving to the next entry in the top level 720 // index if upper bound is already exceeded. Note that the next entry 721 // starts with keys >= ikey.UserKey since even though this is the 722 // block separator, the same user key can span multiple index blocks. 723 // If upper is exclusive we use >= below, else we use >. 724 if i.upper != nil { 725 cmp := i.cmp(ikey.UserKey, i.upper) 726 if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { 727 i.exhaustedBounds = +1 728 } 729 } 730 } 731 // NB: skipForward checks whether exhaustedBounds is already +1. 732 return i.skipForward() 733 } 734 735 // Last implements internalIterator.Last, as documented in the pebble 736 // package. Note that Last only checks the lower bound. It is up to the caller 737 // to ensure that key is less than the upper bound (e.g. via a call to 738 // SeekLT(upper)) 739 func (i *twoLevelIterator) Last() (*InternalKey, base.LazyValue) { 740 if i.vState != nil { 741 if i.endKeyInclusive { 742 return i.virtualLast() 743 } 744 return i.SeekLT(i.upper, base.SeekLTFlagsNone) 745 } 746 747 if i.upper != nil { 748 panic("twoLevelIterator.Last() used despite upper bound") 749 } 750 i.exhaustedBounds = 0 751 i.maybeFilteredKeysTwoLevel = false 752 i.err = nil // clear cached iteration error 753 // Seek optimization only applies until iterator is first positioned after SetBounds. 754 i.boundsCmp = 0 755 756 var ikey *InternalKey 757 if ikey, _ = i.topLevelIndex.Last(); ikey == nil { 758 return nil, base.LazyValue{} 759 } 760 761 result := i.loadIndex(-1) 762 if result == loadBlockFailed { 763 return nil, base.LazyValue{} 764 } 765 if result == loadBlockOK { 766 if ikey, val := i.singleLevelIterator.Last(); ikey != nil { 767 return ikey, val 768 } 769 // Else fall through to skipBackward. 770 } else { 771 // result == loadBlockIrrelevant. Enforce the lower bound here 772 // since don't want to bother moving to the previous entry in the 773 // top level index if lower bound is already exceeded. Note that 774 // the previous entry starts with keys <= ikey.UserKey since even 775 // though this is the current block's separator, the same user key 776 // can span multiple index blocks. 777 if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 { 778 i.exhaustedBounds = -1 779 } 780 } 781 // NB: skipBackward checks whether exhaustedBounds is already -1. 782 return i.skipBackward() 783 } 784 785 // Next implements internalIterator.Next, as documented in the pebble 786 // package. 787 // Note: twoLevelCompactionIterator.Next mirrors the implementation of 788 // twoLevelIterator.Next due to performance. Keep the two in sync. 789 func (i *twoLevelIterator) Next() (*InternalKey, base.LazyValue) { 790 // Seek optimization only applies until iterator is first positioned after SetBounds. 791 i.boundsCmp = 0 792 i.maybeFilteredKeysTwoLevel = false 793 if i.err != nil { 794 // TODO(jackson): Can this case be turned into a panic? Once an error is 795 // encountered, the iterator must be re-seeked. 796 return nil, base.LazyValue{} 797 } 798 if key, val := i.singleLevelIterator.Next(); key != nil { 799 return key, val 800 } 801 return i.skipForward() 802 } 803 804 // NextPrefix implements (base.InternalIterator).NextPrefix. 805 func (i *twoLevelIterator) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) { 806 if i.exhaustedBounds == +1 { 807 panic("Next called even though exhausted upper bound") 808 } 809 // Seek optimization only applies until iterator is first positioned after SetBounds. 810 i.boundsCmp = 0 811 i.maybeFilteredKeysTwoLevel = false 812 if i.err != nil { 813 // TODO(jackson): Can this case be turned into a panic? Once an error is 814 // encountered, the iterator must be re-seeked. 815 return nil, base.LazyValue{} 816 } 817 if key, val := i.singleLevelIterator.NextPrefix(succKey); key != nil { 818 return key, val 819 } 820 // key == nil 821 if i.err != nil { 822 return nil, base.LazyValue{} 823 } 824 825 // Did not find prefix in the existing second-level index block. This is the 826 // slow-path where we seek the iterator. 827 var ikey *InternalKey 828 if ikey, _ = i.topLevelIndex.SeekGE(succKey, base.SeekGEFlagsNone); ikey == nil { 829 i.data.invalidate() 830 i.index.invalidate() 831 return nil, base.LazyValue{} 832 } 833 result := i.loadIndex(+1) 834 if result == loadBlockFailed { 835 return nil, base.LazyValue{} 836 } 837 if result == loadBlockIrrelevant { 838 // Enforce the upper bound here since don't want to bother moving to the 839 // next entry in the top level index if upper bound is already exceeded. 840 // Note that the next entry starts with keys >= ikey.UserKey since even 841 // though this is the block separator, the same user key can span multiple 842 // index blocks. If upper is exclusive we use >= below, else we use >. 843 if i.upper != nil { 844 cmp := i.cmp(ikey.UserKey, i.upper) 845 if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { 846 i.exhaustedBounds = +1 847 } 848 } 849 } else if key, val := i.singleLevelIterator.SeekGE(succKey, base.SeekGEFlagsNone); key != nil { 850 return i.maybeVerifyKey(key, val) 851 } 852 return i.skipForward() 853 } 854 855 // Prev implements internalIterator.Prev, as documented in the pebble 856 // package. 857 func (i *twoLevelIterator) Prev() (*InternalKey, base.LazyValue) { 858 // Seek optimization only applies until iterator is first positioned after SetBounds. 859 i.boundsCmp = 0 860 i.maybeFilteredKeysTwoLevel = false 861 if i.err != nil { 862 return nil, base.LazyValue{} 863 } 864 if key, val := i.singleLevelIterator.Prev(); key != nil { 865 return key, val 866 } 867 return i.skipBackward() 868 } 869 870 func (i *twoLevelIterator) skipForward() (*InternalKey, base.LazyValue) { 871 for { 872 if i.err != nil || i.exhaustedBounds > 0 { 873 return nil, base.LazyValue{} 874 } 875 i.exhaustedBounds = 0 876 var ikey *InternalKey 877 if ikey, _ = i.topLevelIndex.Next(); ikey == nil { 878 i.data.invalidate() 879 i.index.invalidate() 880 return nil, base.LazyValue{} 881 } 882 result := i.loadIndex(+1) 883 if result == loadBlockFailed { 884 return nil, base.LazyValue{} 885 } 886 if result == loadBlockOK { 887 if ikey, val := i.singleLevelIterator.firstInternal(); ikey != nil { 888 return i.maybeVerifyKey(ikey, val) 889 } 890 // Next iteration will return if singleLevelIterator set 891 // exhaustedBounds = +1. 892 } else { 893 // result == loadBlockIrrelevant. Enforce the upper bound here 894 // since don't want to bother moving to the next entry in the top 895 // level index if upper bound is already exceeded. Note that the 896 // next entry starts with keys >= ikey.UserKey since even though 897 // this is the block separator, the same user key can span 898 // multiple index blocks. If upper is exclusive we use >= 899 // below, else we use >. 900 if i.upper != nil { 901 cmp := i.cmp(ikey.UserKey, i.upper) 902 if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 { 903 i.exhaustedBounds = +1 904 // Next iteration will return. 905 } 906 } 907 } 908 } 909 } 910 911 func (i *twoLevelIterator) skipBackward() (*InternalKey, base.LazyValue) { 912 for { 913 if i.err != nil || i.exhaustedBounds < 0 { 914 return nil, base.LazyValue{} 915 } 916 i.exhaustedBounds = 0 917 var ikey *InternalKey 918 if ikey, _ = i.topLevelIndex.Prev(); ikey == nil { 919 i.data.invalidate() 920 i.index.invalidate() 921 return nil, base.LazyValue{} 922 } 923 result := i.loadIndex(-1) 924 if result == loadBlockFailed { 925 return nil, base.LazyValue{} 926 } 927 if result == loadBlockOK { 928 if ikey, val := i.singleLevelIterator.lastInternal(); ikey != nil { 929 return i.maybeVerifyKey(ikey, val) 930 } 931 // Next iteration will return if singleLevelIterator set 932 // exhaustedBounds = -1. 933 } else { 934 // result == loadBlockIrrelevant. Enforce the lower bound here 935 // since don't want to bother moving to the previous entry in the 936 // top level index if lower bound is already exceeded. Note that 937 // the previous entry starts with keys <= ikey.UserKey since even 938 // though this is the current block's separator, the same user key 939 // can span multiple index blocks. 940 if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 { 941 i.exhaustedBounds = -1 942 // Next iteration will return. 943 } 944 } 945 } 946 } 947 948 // Close implements internalIterator.Close, as documented in the pebble 949 // package. 950 func (i *twoLevelIterator) Close() error { 951 i.iterStats.close() 952 var err error 953 if i.closeHook != nil { 954 err = firstError(err, i.closeHook(i)) 955 } 956 err = firstError(err, i.data.Close()) 957 err = firstError(err, i.index.Close()) 958 err = firstError(err, i.topLevelIndex.Close()) 959 if i.dataRH != nil { 960 err = firstError(err, i.dataRH.Close()) 961 i.dataRH = nil 962 } 963 err = firstError(err, i.err) 964 if i.bpfs != nil { 965 releaseBlockPropertiesFilterer(i.bpfs) 966 } 967 if i.vbReader != nil { 968 i.vbReader.close() 969 } 970 if i.vbRH != nil { 971 err = firstError(err, i.vbRH.Close()) 972 i.vbRH = nil 973 } 974 *i = twoLevelIterator{ 975 singleLevelIterator: i.singleLevelIterator.resetForReuse(), 976 topLevelIndex: i.topLevelIndex.resetForReuse(), 977 } 978 twoLevelIterPool.Put(i) 979 return err 980 } 981 982 // Note: twoLevelCompactionIterator and compactionIterator are very similar but 983 // were separated due to performance. 984 type twoLevelCompactionIterator struct { 985 *twoLevelIterator 986 bytesIterated *uint64 987 prevOffset uint64 988 } 989 990 // twoLevelCompactionIterator implements the base.InternalIterator interface. 991 var _ base.InternalIterator = (*twoLevelCompactionIterator)(nil) 992 993 func (i *twoLevelCompactionIterator) Close() error { 994 return i.twoLevelIterator.Close() 995 } 996 997 func (i *twoLevelCompactionIterator) SeekGE( 998 key []byte, flags base.SeekGEFlags, 999 ) (*InternalKey, base.LazyValue) { 1000 panic("pebble: SeekGE unimplemented") 1001 } 1002 1003 func (i *twoLevelCompactionIterator) SeekPrefixGE( 1004 prefix, key []byte, flags base.SeekGEFlags, 1005 ) (*base.InternalKey, base.LazyValue) { 1006 panic("pebble: SeekPrefixGE unimplemented") 1007 } 1008 1009 func (i *twoLevelCompactionIterator) SeekLT( 1010 key []byte, flags base.SeekLTFlags, 1011 ) (*InternalKey, base.LazyValue) { 1012 panic("pebble: SeekLT unimplemented") 1013 } 1014 1015 func (i *twoLevelCompactionIterator) First() (*InternalKey, base.LazyValue) { 1016 i.err = nil // clear cached iteration error 1017 return i.skipForward(i.twoLevelIterator.First()) 1018 } 1019 1020 func (i *twoLevelCompactionIterator) Last() (*InternalKey, base.LazyValue) { 1021 panic("pebble: Last unimplemented") 1022 } 1023 1024 // Note: twoLevelCompactionIterator.Next mirrors the implementation of 1025 // twoLevelIterator.Next due to performance. Keep the two in sync. 1026 func (i *twoLevelCompactionIterator) Next() (*InternalKey, base.LazyValue) { 1027 if i.err != nil { 1028 return nil, base.LazyValue{} 1029 } 1030 return i.skipForward(i.singleLevelIterator.Next()) 1031 } 1032 1033 func (i *twoLevelCompactionIterator) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) { 1034 panic("pebble: NextPrefix unimplemented") 1035 } 1036 1037 func (i *twoLevelCompactionIterator) Prev() (*InternalKey, base.LazyValue) { 1038 panic("pebble: Prev unimplemented") 1039 } 1040 1041 func (i *twoLevelCompactionIterator) String() string { 1042 if i.vState != nil { 1043 return i.vState.fileNum.String() 1044 } 1045 return i.reader.fileNum.String() 1046 } 1047 1048 func (i *twoLevelCompactionIterator) skipForward( 1049 key *InternalKey, val base.LazyValue, 1050 ) (*InternalKey, base.LazyValue) { 1051 if key == nil { 1052 for { 1053 if key, _ := i.topLevelIndex.Next(); key == nil { 1054 break 1055 } 1056 result := i.loadIndex(+1) 1057 if result != loadBlockOK { 1058 if i.err != nil { 1059 break 1060 } 1061 switch result { 1062 case loadBlockFailed: 1063 // We checked that i.index was at a valid entry, so 1064 // loadBlockFailed could not have happened due to to i.index 1065 // being exhausted, and must be due to an error. 1066 panic("loadBlock should not have failed with no error") 1067 case loadBlockIrrelevant: 1068 panic("compactionIter should not be using block intervals for skipping") 1069 default: 1070 panic(fmt.Sprintf("unexpected case %d", result)) 1071 } 1072 } 1073 // result == loadBlockOK 1074 if key, val = i.singleLevelIterator.First(); key != nil { 1075 break 1076 } 1077 } 1078 } 1079 1080 curOffset := i.recordOffset() 1081 *i.bytesIterated += uint64(curOffset - i.prevOffset) 1082 i.prevOffset = curOffset 1083 1084 if i.vState != nil && key != nil { 1085 cmp := i.cmp(key.UserKey, i.vState.upper.UserKey) 1086 if cmp > 0 || (i.vState.upper.IsExclusiveSentinel() && cmp == 0) { 1087 return nil, base.LazyValue{} 1088 } 1089 } 1090 1091 return key, val 1092 }