github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/sstable/reader_iter_two_lvl.go

// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package sstable

import (
	"context"
	"fmt"

	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing"
)

type twoLevelIterator struct {
	singleLevelIterator
	// maybeFilteredKeysTwoLevel indicates whether the last iterator
	// positioning operation may have skipped any index blocks due to
	// block-property filters when positioning the top-level-index.
	maybeFilteredKeysTwoLevel bool
	topLevelIndex             blockIter
}

// twoLevelIterator implements the base.InternalIterator interface.
var _ base.InternalIterator = (*twoLevelIterator)(nil)

// loadIndex loads the index block at the current top level index position and
// leaves i.index unpositioned. If unsuccessful, it sets i.err to any error
// encountered, which may be nil if we have simply exhausted the entire table.
// This is used for two level indexes.
func (i *twoLevelIterator) loadIndex(dir int8) loadBlockResult {
	// Ensure the index data block iterators are invalidated even if loading of
	// the index fails.
	i.data.invalidate()
	i.index.invalidate()
	if !i.topLevelIndex.valid() {
		i.index.offset = 0
		i.index.restarts = 0
		return loadBlockFailed
	}
	v := i.topLevelIndex.value()
	bhp, err := decodeBlockHandleWithProperties(v.InPlaceValue())
	if err != nil {
		i.err = base.CorruptionErrorf("pebble/table: corrupt top level index entry")
		return loadBlockFailed
	}
	if i.bpfs != nil {
		intersects, err := i.bpfs.intersects(bhp.Props)
		if err != nil {
			i.err = errCorruptIndexEntry
			return loadBlockFailed
		}
		if intersects == blockMaybeExcluded {
			intersects = i.resolveMaybeExcluded(dir)
		}
		if intersects == blockExcluded {
			i.maybeFilteredKeysTwoLevel = true
			return loadBlockIrrelevant
		}
		// blockIntersects
	}
	ctx := objiotracing.WithBlockType(i.ctx, objiotracing.MetadataBlock)
	indexBlock, err := i.reader.readBlock(ctx, bhp.BlockHandle, nil /* transform */, nil /* readHandle */, i.stats, i.bufferPool)
	if err != nil {
		i.err = err
		return loadBlockFailed
	}
	if i.err = i.index.initHandle(i.cmp, indexBlock, i.reader.Properties.GlobalSeqNum, false); i.err == nil {
		return loadBlockOK
	}
	return loadBlockFailed
}
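// The structure that loadIndex navigates can be pictured with plain slices.
// The following sketch is purely illustrative and hypothetical (it is not
// used anywhere in the package): each top-level entry carries an inclusive
// upper-bound separator and points at a second-level index block, which in
// turn holds separators for data blocks. Lookup picks the first entry whose
// separator is >= the sought key at each level; simple bytewise string
// ordering stands in for i.cmp.
func sketchTwoLevelLookup(topSeps []string, secondLevel [][]string, key string) (topIdx, blockIdx int) {
	for ti, sep := range topSeps {
		if key <= sep { // first top-level separator >= key
			for bi, bsep := range secondLevel[ti] {
				if key <= bsep { // first second-level separator >= key
					return ti, bi
				}
			}
			return ti, -1 // past the last data block of this index block
		}
	}
	return -1, -1 // past the last separator: table exhausted
}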
// resolveMaybeExcluded is invoked when the block-property filterer has found
// that an index block is excluded according to its properties but only if its
// bounds fall within the filter's current bounds. This function consults the
// appropriate bound, depending on the iteration direction, and returns either
// `blockIntersects` or `blockExcluded`.
func (i *twoLevelIterator) resolveMaybeExcluded(dir int8) intersectsResult {
	// This iterator is configured with a bound-limited block property filter.
	// The bpf determined this entire index block could be excluded from
	// iteration based on the property encoded in the block handle. However, we
	// still need to determine if the index block is wholly contained within the
	// filter's key bounds.
	//
	// External guarantees ensure all its data blocks' keys are ≥ the filter's
	// lower bound during forward iteration, and that all its data blocks' keys
	// are < the filter's upper bound during backward iteration. We only need to
	// determine if the opposite bound is also met.
	//
	// The index separator in topLevelIndex.Key() provides an inclusive
	// upper-bound for the index block's keys, guaranteeing that all its keys
	// are ≤ topLevelIndex.Key(). For forward iteration, this is all we need.
	if dir > 0 {
		// Forward iteration.
		if i.bpfs.boundLimitedFilter.KeyIsWithinUpperBound(i.topLevelIndex.Key().UserKey) {
			return blockExcluded
		}
		return blockIntersects
	}

	// Reverse iteration.
	//
	// Because we're iterating in the reverse direction, we don't yet have
	// enough context available to determine if the block is wholly contained
	// within its bounds. This case arises only during backward iteration,
	// because of the way the index is structured.
	//
	// Consider a bound-limited bpf limited to the bounds [b,d), loading the
	// block with separator `c`. During reverse iteration, the guarantee that
	// all the block's keys are < `d` is externally provided, but no guarantee
	// is made on the bpf's lower bound. The separator `c` only provides an
	// inclusive upper bound on the block's keys, indicating that the
	// corresponding block handle points to a block containing only keys ≤ `c`.
	//
	// To establish a lower bound, we step the top-level index backwards to read
	// the previous block's separator, which provides an inclusive lower bound
	// on the original index block's keys. Afterwards, we step forward to
	// restore our top-level index position.
	if peekKey, _ := i.topLevelIndex.Prev(); peekKey == nil {
		// The original block points to the first index block of this table. If
		// we knew the lower bound for the entire table, it could provide a
		// lower bound, but the code refactoring necessary to read it doesn't
		// seem worth the payoff. We fall through to loading the block.
	} else if i.bpfs.boundLimitedFilter.KeyIsWithinLowerBound(peekKey.UserKey) {
		// The lower-bound on the original index block falls within the filter's
		// bounds, and we can skip the block (after restoring our current
		// top-level index position).
		_, _ = i.topLevelIndex.Next()
		return blockExcluded
	}
	_, _ = i.topLevelIndex.Next()
	return blockIntersects
}
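// sketchReverseExclusion is a hypothetical distillation of the reverse
// direction above, phrased over plain separators instead of the iterator
// machinery; it is not used by the real code. prevSep is the previous
// top-level separator (when hasPrev is true), which is an inclusive upper
// bound on the preceding index block and hence a lower bound on the current
// block's keys. With the upper bound externally guaranteed during backward
// iteration, the block may be skipped iff that lower bound is within the
// filter's lower bound; with no previous separator we conservatively load it.
func sketchReverseExclusion(
	hasPrev bool, prevSep []byte, keyIsWithinLowerBound func([]byte) bool,
) intersectsResult {
	if hasPrev && keyIsWithinLowerBound(prevSep) {
		return blockExcluded
	}
	return blockIntersects
}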
// Note that lower, upper passed into init have nothing to do with virtual
// sstable bounds. If the virtualState passed in is not nil, then virtual
// sstable bounds will be enforced.
func (i *twoLevelIterator) init(
	ctx context.Context,
	r *Reader,
	v *virtualState,
	lower, upper []byte,
	filterer *BlockPropertiesFilterer,
	useFilter, hideObsoletePoints bool,
	stats *base.InternalIteratorStats,
	rp ReaderProvider,
	bufferPool *BufferPool,
) error {
	if r.err != nil {
		return r.err
	}
	topLevelIndexH, err := r.readIndex(ctx, stats)
	if err != nil {
		return err
	}
	if v != nil {
		i.vState = v
		// Note that upper is exclusive here.
		i.endKeyInclusive, lower, upper = v.constrainBounds(lower, upper, false /* endInclusive */)
	}

	i.ctx = ctx
	i.lower = lower
	i.upper = upper
	i.bpfs = filterer
	i.useFilter = useFilter
	i.reader = r
	i.cmp = r.Compare
	i.stats = stats
	i.hideObsoletePoints = hideObsoletePoints
	i.bufferPool = bufferPool
	err = i.topLevelIndex.initHandle(i.cmp, topLevelIndexH, r.Properties.GlobalSeqNum, false)
	if err != nil {
		// blockIter.Close releases topLevelIndexH and always returns a nil error
		_ = i.topLevelIndex.Close()
		return err
	}
	i.dataRH = r.readable.NewReadHandle(ctx)
	if r.tableFormat >= TableFormatPebblev3 {
		if r.Properties.NumValueBlocks > 0 {
			i.vbReader = &valueBlockReader{
				ctx:    ctx,
				bpOpen: i,
				rp:     rp,
				vbih:   r.valueBIH,
				stats:  stats,
			}
			i.data.lazyValueHandling.vbr = i.vbReader
			i.vbRH = r.readable.NewReadHandle(ctx)
		}
		i.data.lazyValueHandling.hasValuePrefix = true
	}
	return nil
}
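// The clamping performed by v.constrainBounds above can be sketched as a
// plain interval intersection. This is a hypothetical illustration of the
// intent, not the actual implementation: the iterator bounds are narrowed to
// the virtual bounds, and the end becomes inclusive only when the (possibly
// inclusive) virtual upper bound is the binding constraint. Bytewise string
// ordering stands in for the comparer, and hasUpper models a nil upper bound.
func sketchConstrainBounds(
	vLower, vUpper string, vUpperInclusive bool,
	lower, upper string, hasUpper bool,
) (endInclusive bool, lo, hi string) {
	lo, hi = lower, upper
	if lo < vLower {
		lo = vLower
	}
	if !hasUpper || hi > vUpper {
		// Clamp to the virtual upper bound and inherit its inclusivity.
		hi, endInclusive = vUpper, vUpperInclusive
	}
	return endInclusive, lo, hi
}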
func (i *twoLevelIterator) String() string {
	if i.vState != nil {
		return i.vState.fileNum.String()
	}
	return i.reader.fileNum.String()
}

// MaybeFilteredKeys may be called when an iterator is exhausted to indicate
// whether or not the last positioning method may have skipped any keys due to
// block-property filters.
func (i *twoLevelIterator) MaybeFilteredKeys() bool {
	// While reading sstables with two-level indexes, knowledge of whether we've
	// filtered keys is tracked separately for each index level. The
	// seek-using-next optimizations have different criteria. We can only reset
	// maybeFilteredKeys back to false during a seek when NOT using the
	// fast-path that uses the current iterator position.
	//
	// If either level might have filtered keys to arrive at the current
	// iterator position, return MaybeFilteredKeys=true.
	return i.maybeFilteredKeysTwoLevel || i.maybeFilteredKeysSingleLevel
}

// SeekGE implements internalIterator.SeekGE, as documented in the pebble
// package. Note that SeekGE only checks the upper bound. It is up to the
// caller to ensure that key is greater than or equal to the lower bound.
func (i *twoLevelIterator) SeekGE(
	key []byte, flags base.SeekGEFlags,
) (*InternalKey, base.LazyValue) {
	if i.vState != nil {
		// Callers of SeekGE don't know about virtual sstable bounds, so we may
		// have to internally restrict the bounds.
		//
		// TODO(bananabrick): We can optimize away this check for the level iter
		// if necessary.
		if i.cmp(key, i.lower) < 0 {
			key = i.lower
		}
	}

	err := i.err
	i.err = nil // clear cached iteration error

	// The twoLevelIterator could be already exhausted. Utilize that when
	// trySeekUsingNext is true. See the comment about data-exhausted, PGDE, and
	// bounds-exhausted near the top of the file.
	if flags.TrySeekUsingNext() &&
		(i.exhaustedBounds == +1 || (i.data.isDataInvalidated() && i.index.isDataInvalidated())) &&
		err == nil {
		// Already exhausted, so return nil.
		return nil, base.LazyValue{}
	}

	// SeekGE performs various step-instead-of-seeking optimizations: e.g.
	// enabled by trySeekUsingNext, or by monotonically increasing bounds
	// (i.boundsCmp). Care must be taken to ensure that when performing these
	// optimizations and the iterator becomes exhausted,
	// i.maybeFilteredKeysTwoLevel is set appropriately. Consider a previous
	// SeekGE that filtered keys from k until the current iterator position.
	//
	// If the previous SeekGE exhausted the iterator while seeking within the
	// two-level index, it's possible keys greater than or equal to the current
	// search key were filtered through skipped index blocks. We must not reuse
	// the position of the two-level index iterator without remembering the
	// previous value of maybeFilteredKeysTwoLevel.

	// We fall into the slow path if i.index.isDataInvalidated() even if the
	// top-level iterator is already positioned correctly and all other
	// conditions are met. An alternative structure could reuse topLevelIndex's
	// current position and reload the index block to which it points. Arguably,
	// an index block load is expensive and the index block may still be earlier
	// than the index block containing the sought key, resulting in a wasteful
	// block load.

	var dontSeekWithinSingleLevelIter bool
	if i.topLevelIndex.isDataInvalidated() || !i.topLevelIndex.valid() || i.index.isDataInvalidated() || err != nil ||
		(i.boundsCmp <= 0 && !flags.TrySeekUsingNext()) || i.cmp(key, i.topLevelIndex.Key().UserKey) > 0 {
		// Slow-path: need to position the topLevelIndex.

		// The previous exhausted state of singleLevelIterator is no longer
		// relevant, since we may be moving to a different index block.
		i.exhaustedBounds = 0
		i.maybeFilteredKeysTwoLevel = false
		flags = flags.DisableTrySeekUsingNext()
		var ikey *InternalKey
		if ikey, _ = i.topLevelIndex.SeekGE(key, flags); ikey == nil {
			i.data.invalidate()
			i.index.invalidate()
			return nil, base.LazyValue{}
		}

		result := i.loadIndex(+1)
		if result == loadBlockFailed {
			i.boundsCmp = 0
			return nil, base.LazyValue{}
		}
		if result == loadBlockIrrelevant {
			// Enforce the upper bound here since we don't want to bother moving
			// to the next entry in the top level index if upper bound is
			// already exceeded. Note that the next entry starts with keys >=
			// ikey.UserKey since even though this is the block separator, the
			// same user key can span multiple index blocks. If upper is
			// exclusive we use >= below, else we use >.
			if i.upper != nil {
				cmp := i.cmp(ikey.UserKey, i.upper)
				if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
					i.exhaustedBounds = +1
				}
			}
			// Fall through to skipForward.
			dontSeekWithinSingleLevelIter = true
			// Clear boundsCmp.
			//
			// In the typical cases where dontSeekWithinSingleLevelIter=false,
			// the singleLevelIterator.SeekGE call will clear boundsCmp.
			// However, in this case where dontSeekWithinSingleLevelIter=true,
			// we never seek on the single-level iterator. This call will fall
			// through to skipForward, which may improperly leave boundsCmp=+1
			// unless we clear it here.
			i.boundsCmp = 0
		}
	} else {
		// INVARIANT: err == nil.
		//
		// Else fast-path: There are two possible cases, from
		// (i.boundsCmp > 0 || flags.TrySeekUsingNext()):
		//
		// 1) The bounds have moved forward (i.boundsCmp > 0) and this SeekGE is
		// respecting the lower bound (guaranteed by Iterator). We know that the
		// iterator must already be positioned within or just outside the previous
		// bounds. Therefore, the topLevelIndex iter cannot be positioned at an
		// entry ahead of the seek position (though it can be positioned behind).
		// The check i.cmp(key, i.topLevelIndex.Key().UserKey) <= 0 confirms that
		// it is not behind. Since it is not ahead and not behind it must be at
		// the right position.
		//
		// 2) This SeekGE will land on a key that is greater than the key we are
		// currently at (guaranteed by trySeekUsingNext), but since i.cmp(key,
		// i.topLevelIndex.Key().UserKey) <= 0, we are at the correct lower level
		// index block. No need to reset the state of singleLevelIterator.
		//
		// Note that cases 1 and 2 never overlap, and one of them must be true,
		// but we have some test code (TestIterRandomizedMaybeFilteredKeys) that
		// sets both to true, so we fix things here and then do an invariant
		// check.
		//
		// This invariant checking is important enough that we do not gate it
		// behind invariants.Enabled.
		if i.boundsCmp > 0 {
			// TODO(sumeer): fix TestIterRandomizedMaybeFilteredKeys so as to not
			// need this behavior.
			flags = flags.DisableTrySeekUsingNext()
		}
		if i.boundsCmp > 0 == flags.TrySeekUsingNext() {
			panic(fmt.Sprintf("inconsistency in optimization case 1 %t and case 2 %t",
				i.boundsCmp > 0, flags.TrySeekUsingNext()))
		}

		if !flags.TrySeekUsingNext() {
			// Case 1. Bounds have changed so the previous exhausted bounds state is
			// irrelevant.
			// WARNING-data-exhausted: this is safe to do only because the monotonic
			// bounds optimizations only work when !data-exhausted. If they also
			// worked with data-exhausted, we have made it unclear whether
			// data-exhausted is actually true. See the comment at the top of the
			// file.
			i.exhaustedBounds = 0
		}
		// Else flags.TrySeekUsingNext(). The i.exhaustedBounds is important to
		// preserve for singleLevelIterator, and twoLevelIterator.skipForward. See
		// bug https://github.com/cockroachdb/pebble/issues/2036.
	}

	if !dontSeekWithinSingleLevelIter {
		// Note that while trySeekUsingNext could be false here, singleLevelIterator
		// could do its own boundsCmp-based optimization to seek using next.
		if ikey, val := i.singleLevelIterator.SeekGE(key, flags); ikey != nil {
			return ikey, val
		}
	}
	return i.skipForward()
}
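// sketchSeekGEUsesTopLevelFastPath is a hypothetical distillation of the
// condition above that decides whether SeekGE must re-seek the top-level
// index (the slow path) or may reuse its current position. It is illustrative
// only: the real check additionally consults error state and block
// invalidation.
func sketchSeekGEUsesTopLevelFastPath(
	boundsMovedForward, trySeekUsingNext, keyBeyondCurrentSeparator bool,
) bool {
	// Reuse is possible only when the bounds moved monotonically forward or
	// the caller guaranteed the seek key is at or ahead of the current
	// position, and the sought key does not lie past the inclusive separator
	// of the index block the iterator is already positioned on.
	return (boundsMovedForward || trySeekUsingNext) && !keyBeyondCurrentSeparator
}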
// SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
// pebble package. Note that SeekPrefixGE only checks the upper bound. It is up
// to the caller to ensure that key is greater than or equal to the lower bound.
func (i *twoLevelIterator) SeekPrefixGE(
	prefix, key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, base.LazyValue) {
	if i.vState != nil {
		// Callers of SeekPrefixGE don't know about virtual sstable bounds, so
		// we may have to internally restrict the bounds.
		//
		// TODO(bananabrick): We can optimize away this check for the level iter
		// if necessary.
		if i.cmp(key, i.lower) < 0 {
			key = i.lower
		}
	}

	// NOTE: prefix is only used for bloom filter checking and not later work in
	// this method. Hence, we can use the existing iterator position if the last
	// SeekPrefixGE did not fail bloom filter matching.

	err := i.err
	i.err = nil // clear cached iteration error

	// The twoLevelIterator could be already exhausted. Utilize that when
	// trySeekUsingNext is true. See the comment about data-exhausted, PGDE, and
	// bounds-exhausted near the top of the file.
	filterUsedAndDidNotMatch :=
		i.reader.tableFilter != nil && i.useFilter && !i.lastBloomFilterMatched
	if flags.TrySeekUsingNext() && !filterUsedAndDidNotMatch &&
		(i.exhaustedBounds == +1 || (i.data.isDataInvalidated() && i.index.isDataInvalidated())) &&
		err == nil {
		// Already exhausted, so return nil.
		return nil, base.LazyValue{}
	}

	// Check prefix bloom filter.
	if i.reader.tableFilter != nil && i.useFilter {
		if !i.lastBloomFilterMatched {
			// Iterator is not positioned based on last seek.
			flags = flags.DisableTrySeekUsingNext()
		}
		i.lastBloomFilterMatched = false
		var dataH bufferHandle
		dataH, i.err = i.reader.readFilter(i.ctx, i.stats)
		if i.err != nil {
			i.data.invalidate()
			return nil, base.LazyValue{}
		}
		mayContain := i.reader.tableFilter.mayContain(dataH.Get(), prefix)
		dataH.Release()
		if !mayContain {
			// This invalidation may not be necessary for correctness, and may
			// be a place to optimize later by reusing the already loaded
			// block. It was necessary in earlier versions of the code since
			// the caller was allowed to call Next when SeekPrefixGE returned
			// nil. This is no longer allowed.
			i.data.invalidate()
			return nil, base.LazyValue{}
		}
		i.lastBloomFilterMatched = true
	}

	// Bloom filter matches.

	// SeekPrefixGE performs various step-instead-of-seeking optimizations: e.g.
	// enabled by trySeekUsingNext, or by monotonically increasing bounds
	// (i.boundsCmp). Care must be taken to ensure that when performing these
	// optimizations and the iterator becomes exhausted,
	// i.maybeFilteredKeysTwoLevel is set appropriately. Consider a previous
	// SeekPrefixGE that filtered keys from k until the current iterator
	// position.
	//
	// If the previous SeekPrefixGE exhausted the iterator while seeking within
	// the two-level index, it's possible keys greater than or equal to the
	// current search key were filtered through skipped index blocks. We must
	// not reuse the position of the two-level index iterator without
	// remembering the previous value of maybeFilteredKeysTwoLevel.

	// We fall into the slow path if i.index.isDataInvalidated() even if the
	// top-level iterator is already positioned correctly and all other
	// conditions are met. An alternative structure could reuse topLevelIndex's
	// current position and reload the index block to which it points. Arguably,
	// an index block load is expensive and the index block may still be earlier
	// than the index block containing the sought key, resulting in a wasteful
	// block load.

	var dontSeekWithinSingleLevelIter bool
	if i.topLevelIndex.isDataInvalidated() || !i.topLevelIndex.valid() || i.index.isDataInvalidated() || err != nil ||
		(i.boundsCmp <= 0 && !flags.TrySeekUsingNext()) || i.cmp(key, i.topLevelIndex.Key().UserKey) > 0 {
		// Slow-path: need to position the topLevelIndex.

		// The previous exhausted state of singleLevelIterator is no longer
		// relevant, since we may be moving to a different index block.
		i.exhaustedBounds = 0
		i.maybeFilteredKeysTwoLevel = false
		flags = flags.DisableTrySeekUsingNext()
		var ikey *InternalKey
		if ikey, _ = i.topLevelIndex.SeekGE(key, flags); ikey == nil {
			i.data.invalidate()
			i.index.invalidate()
			return nil, base.LazyValue{}
		}

		result := i.loadIndex(+1)
		if result == loadBlockFailed {
			i.boundsCmp = 0
			return nil, base.LazyValue{}
		}
		if result == loadBlockIrrelevant {
			// Enforce the upper bound here since we don't want to bother moving
			// to the next entry in the top level index if upper bound is
			// already exceeded. Note that the next entry starts with keys >=
			// ikey.UserKey since even though this is the block separator, the
			// same user key can span multiple index blocks. If upper is
			// exclusive we use >= below, else we use >.
			if i.upper != nil {
				cmp := i.cmp(ikey.UserKey, i.upper)
				if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
					i.exhaustedBounds = +1
				}
			}
			// Fall through to skipForward.
			dontSeekWithinSingleLevelIter = true
			// Clear boundsCmp.
			//
			// In the typical cases where dontSeekWithinSingleLevelIter=false,
			// the singleLevelIterator.SeekPrefixGE call will clear boundsCmp.
			// However, in this case where dontSeekWithinSingleLevelIter=true,
			// we never seek on the single-level iterator. This call will fall
			// through to skipForward, which may improperly leave boundsCmp=+1
			// unless we clear it here.
			i.boundsCmp = 0
		}
	} else {
		// INVARIANT: err == nil.
		//
		// Else fast-path: There are two possible cases, from
		// (i.boundsCmp > 0 || flags.TrySeekUsingNext()):
		//
		// 1) The bounds have moved forward (i.boundsCmp > 0) and this
		// SeekPrefixGE is respecting the lower bound (guaranteed by Iterator). We
		// know that the iterator must already be positioned within or just
		// outside the previous bounds. Therefore, the topLevelIndex iter cannot
		// be positioned at an entry ahead of the seek position (though it can be
		// positioned behind). The check i.cmp(key, i.topLevelIndex.Key().UserKey)
		// <= 0 confirms that it is not behind. Since it is not ahead and not
		// behind it must be at the right position.
		//
		// 2) This SeekPrefixGE will land on a key that is greater than the key we
		// are currently at (guaranteed by trySeekUsingNext), but since i.cmp(key,
		// i.topLevelIndex.Key().UserKey) <= 0, we are at the correct lower level
		// index block. No need to reset the state of singleLevelIterator.
		//
		// Note that cases 1 and 2 never overlap, and one of them must be true.
		// This invariant checking is important enough that we do not gate it
		// behind invariants.Enabled.
		if i.boundsCmp > 0 == flags.TrySeekUsingNext() {
			panic(fmt.Sprintf("inconsistency in optimization case 1 %t and case 2 %t",
				i.boundsCmp > 0, flags.TrySeekUsingNext()))
		}

		if !flags.TrySeekUsingNext() {
			// Case 1. Bounds have changed so the previous exhausted bounds state is
			// irrelevant.
			// WARNING-data-exhausted: this is safe to do only because the monotonic
			// bounds optimizations only work when !data-exhausted. If they also
			// worked with data-exhausted, we have made it unclear whether
			// data-exhausted is actually true. See the comment at the top of the
			// file.
			i.exhaustedBounds = 0
		}
		// Else flags.TrySeekUsingNext(). The i.exhaustedBounds is important to
		// preserve for singleLevelIterator, and twoLevelIterator.skipForward. See
		// bug https://github.com/cockroachdb/pebble/issues/2036.
	}

	if !dontSeekWithinSingleLevelIter {
		if ikey, val := i.singleLevelIterator.seekPrefixGE(
			prefix, key, flags, false /* checkFilter */); ikey != nil {
			return ikey, val
		}
	}
	// NB: skipForward checks whether exhaustedBounds is already +1.
	return i.skipForward()
}

// virtualLast should only be called if i.vState != nil.
func (i *twoLevelIterator) virtualLast() (*InternalKey, base.LazyValue) {
	if i.vState == nil {
		panic("pebble: invalid call to virtualLast")
	}

	// Seek to the first internal key.
	ikey, _ := i.SeekGE(i.upper, base.SeekGEFlagsNone)
	if i.endKeyInclusive {
		// Let's say the virtual sstable upper bound is c#1, with the keys c#3, c#2,
		// c#1, d, e, ... in the sstable. So, the last key in the virtual sstable is
		// c#1. We can perform SeekGE(i.upper) and then keep nexting until we find
		// the last key with userkey == i.upper.
		//
		// TODO(bananabrick): Think about how to improve this. If there are many
		// internal keys with the same user key at the upper bound, then this could
		// be slow, but maybe the odds of having many internal keys with the same
		// user key at the upper bound are low.
		for ikey != nil && i.cmp(ikey.UserKey, i.upper) == 0 {
			ikey, _ = i.Next()
		}
		return i.Prev()
	}
	// We seeked to the first key >= i.upper.
	return i.Prev()
}
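// sketchVirtualLast is a hypothetical rendering of the strategy above on a
// sorted slice of user keys: position at the first key >= upper as if by
// SeekGE(upper), step past any keys equal to an inclusive upper bound as if
// by Next, then step back once as if by Prev. It returns the index of the
// last in-bounds key, or -1 when there is none; the real code performs the
// same dance over internal keys.
func sketchVirtualLast(keys []string, upper string, endInclusive bool) int {
	i := 0
	for i < len(keys) && keys[i] < upper { // SeekGE(upper)
		i++
	}
	if endInclusive {
		for i < len(keys) && keys[i] == upper { // Next past keys == upper
			i++
		}
	}
	return i - 1 // Prev
}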
// SeekLT implements internalIterator.SeekLT, as documented in the pebble
// package. Note that SeekLT only checks the lower bound. It is up to the
// caller to ensure that key is less than the upper bound.
func (i *twoLevelIterator) SeekLT(
	key []byte, flags base.SeekLTFlags,
) (*InternalKey, base.LazyValue) {
	if i.vState != nil {
		// Might have to fix upper bound since virtual sstable bounds are not
		// known to callers of SeekLT.
		//
		// TODO(bananabrick): We can optimize away this check for the level iter
		// if necessary.
		cmp := i.cmp(key, i.upper)
		// key == i.upper is fine. We'll do the right thing and return the
		// first internal key with user key < key.
		if cmp > 0 {
			return i.virtualLast()
		}
	}

	i.exhaustedBounds = 0
	i.err = nil // clear cached iteration error
	// Seek optimization only applies until iterator is first positioned after SetBounds.
	i.boundsCmp = 0

	var result loadBlockResult
	var ikey *InternalKey
	// NB: Unlike SeekGE, we don't have a fast-path here since we don't know
	// whether the topLevelIndex is positioned after the position that would
	// be returned by doing i.topLevelIndex.SeekGE(). To know this we would
	// need to know the index key preceding the current one.
	// NB: If a bound-limited block property filter is configured, it's
	// externally ensured that the filter is disabled (through returning
	// Intersects=false irrespective of the block props provided) during seeks.
	i.maybeFilteredKeysTwoLevel = false
	if ikey, _ = i.topLevelIndex.SeekGE(key, base.SeekGEFlagsNone); ikey == nil {
		if ikey, _ = i.topLevelIndex.Last(); ikey == nil {
			i.data.invalidate()
			i.index.invalidate()
			return nil, base.LazyValue{}
		}

		result = i.loadIndex(-1)
		if result == loadBlockFailed {
			return nil, base.LazyValue{}
		}
		if result == loadBlockOK {
			if ikey, val := i.singleLevelIterator.lastInternal(); ikey != nil {
				return i.maybeVerifyKey(ikey, val)
			}
			// Fall through to skipBackward since the singleLevelIterator did
			// not have any blocks that satisfy the block interval
			// constraints, or the lower bound was reached.
		}
		// Else loadBlockIrrelevant, so fall through.
	} else {
		result = i.loadIndex(-1)
		if result == loadBlockFailed {
			return nil, base.LazyValue{}
		}
		if result == loadBlockOK {
			if ikey, val := i.singleLevelIterator.SeekLT(key, flags); ikey != nil {
				return i.maybeVerifyKey(ikey, val)
			}
			// Fall through to skipBackward since the singleLevelIterator did
			// not have any blocks that satisfy the block interval
			// constraint, or the lower bound was reached.
		}
		// Else loadBlockIrrelevant, so fall through.
	}
	if result == loadBlockIrrelevant {
		// Enforce the lower bound here since we don't want to bother moving to
		// the previous entry in the top level index if lower bound is already
		// exceeded. Note that the previous entry starts with keys <=
		// ikey.UserKey since even though this is the current block's
		// separator, the same user key can span multiple index blocks.
		if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 {
			i.exhaustedBounds = -1
		}
	}
	// NB: skipBackward checks whether exhaustedBounds is already -1.
	return i.skipBackward()
}
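// sketchSeekLT is a hypothetical illustration of the structure above on a
// sorted slice: position as if by topLevelIndex.SeekGE(key), falling back to
// the last entry when the key is past the end, and take the entry before the
// first key >= key. In the real code the answer may lie in an earlier index
// block, which skipBackward handles.
func sketchSeekLT(keys []string, key string) int {
	i := 0
	for i < len(keys) && keys[i] < key { // topLevelIndex.SeekGE analogue
		i++
	}
	return i - 1 // index of the last key < key, or -1 if none
}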
// First implements internalIterator.First, as documented in the pebble
// package. Note that First only checks the upper bound. It is up to the caller
// to ensure that key is greater than or equal to the lower bound (e.g. via a
// call to SeekGE(lower)).
func (i *twoLevelIterator) First() (*InternalKey, base.LazyValue) {
	// If the iterator was created on a virtual sstable, we will SeekGE to the
	// lower bound instead of using First, because First does not respect
	// bounds.
	if i.vState != nil {
		return i.SeekGE(i.lower, base.SeekGEFlagsNone)
	}

	if i.lower != nil {
		panic("twoLevelIterator.First() used despite lower bound")
	}
	i.exhaustedBounds = 0
	i.maybeFilteredKeysTwoLevel = false
	i.err = nil // clear cached iteration error
	// Seek optimization only applies until iterator is first positioned after SetBounds.
	i.boundsCmp = 0

	var ikey *InternalKey
	if ikey, _ = i.topLevelIndex.First(); ikey == nil {
		return nil, base.LazyValue{}
	}

	result := i.loadIndex(+1)
	if result == loadBlockFailed {
		return nil, base.LazyValue{}
	}
	if result == loadBlockOK {
		if ikey, val := i.singleLevelIterator.First(); ikey != nil {
			return ikey, val
		}
		// Else fall through to skipForward.
	} else {
		// result == loadBlockIrrelevant. Enforce the upper bound here since
		// we don't want to bother moving to the next entry in the top level
		// index if upper bound is already exceeded. Note that the next entry
		// starts with keys >= ikey.UserKey since even though this is the
		// block separator, the same user key can span multiple index blocks.
		// If upper is exclusive we use >= below, else we use >.
		if i.upper != nil {
			cmp := i.cmp(ikey.UserKey, i.upper)
			if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
				i.exhaustedBounds = +1
			}
		}
	}
	// NB: skipForward checks whether exhaustedBounds is already +1.
	return i.skipForward()
}

// Last implements internalIterator.Last, as documented in the pebble
// package. Note that Last only checks the lower bound. It is up to the caller
// to ensure that key is less than the upper bound (e.g. via a call to
// SeekLT(upper)).
func (i *twoLevelIterator) Last() (*InternalKey, base.LazyValue) {
	if i.vState != nil {
		if i.endKeyInclusive {
			return i.virtualLast()
		}
		return i.SeekLT(i.upper, base.SeekLTFlagsNone)
	}

	if i.upper != nil {
		panic("twoLevelIterator.Last() used despite upper bound")
	}
	i.exhaustedBounds = 0
	i.maybeFilteredKeysTwoLevel = false
	i.err = nil // clear cached iteration error
	// Seek optimization only applies until iterator is first positioned after SetBounds.
	i.boundsCmp = 0

	var ikey *InternalKey
	if ikey, _ = i.topLevelIndex.Last(); ikey == nil {
		return nil, base.LazyValue{}
	}

	result := i.loadIndex(-1)
	if result == loadBlockFailed {
		return nil, base.LazyValue{}
	}
	if result == loadBlockOK {
		if ikey, val := i.singleLevelIterator.Last(); ikey != nil {
			return ikey, val
		}
		// Else fall through to skipBackward.
	} else {
		// result == loadBlockIrrelevant. Enforce the lower bound here
		// since we don't want to bother moving to the previous entry in the
		// top level index if lower bound is already exceeded. Note that
		// the previous entry starts with keys <= ikey.UserKey since even
		// though this is the current block's separator, the same user key
		// can span multiple index blocks.
		if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 {
			i.exhaustedBounds = -1
		}
	}
	// NB: skipBackward checks whether exhaustedBounds is already -1.
	return i.skipBackward()
}

// Next implements internalIterator.Next, as documented in the pebble
// package.
// Note: twoLevelCompactionIterator.Next mirrors the implementation of
// twoLevelIterator.Next due to performance. Keep the two in sync.
func (i *twoLevelIterator) Next() (*InternalKey, base.LazyValue) {
	// Seek optimization only applies until iterator is first positioned after SetBounds.
	i.boundsCmp = 0
	i.maybeFilteredKeysTwoLevel = false
	if i.err != nil {
		return nil, base.LazyValue{}
	}
	if key, val := i.singleLevelIterator.Next(); key != nil {
		return key, val
	}
	return i.skipForward()
}

// NextPrefix implements (base.InternalIterator).NextPrefix.
func (i *twoLevelIterator) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) {
	if i.exhaustedBounds == +1 {
		panic("Next called even though exhausted upper bound")
	}
	// Seek optimization only applies until iterator is first positioned after SetBounds.
	i.boundsCmp = 0
	i.maybeFilteredKeysTwoLevel = false
	if i.err != nil {
		return nil, base.LazyValue{}
	}
	if key, val := i.singleLevelIterator.NextPrefix(succKey); key != nil {
		return key, val
	}
	// key == nil
	if i.err != nil {
		return nil, base.LazyValue{}
	}

	// Did not find prefix in the existing second-level index block. This is the
	// slow-path where we seek the iterator.
	var ikey *InternalKey
	if ikey, _ = i.topLevelIndex.SeekGE(succKey, base.SeekGEFlagsNone); ikey == nil {
		i.data.invalidate()
		i.index.invalidate()
		return nil, base.LazyValue{}
	}
	result := i.loadIndex(+1)
	if result == loadBlockFailed {
		return nil, base.LazyValue{}
	}
	if result == loadBlockIrrelevant {
		// Enforce the upper bound here since we don't want to bother moving to
		// the next entry in the top level index if upper bound is already
		// exceeded. Note that the next entry starts with keys >= ikey.UserKey
		// since even though this is the block separator, the same user key can
		// span multiple index blocks. If upper is exclusive we use >= below,
		// else we use >.
		if i.upper != nil {
			cmp := i.cmp(ikey.UserKey, i.upper)
			if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
				i.exhaustedBounds = +1
			}
		}
	} else if key, val := i.singleLevelIterator.SeekGE(succKey, base.SeekGEFlagsNone); key != nil {
		return i.maybeVerifyKey(key, val)
	}
	return i.skipForward()
}

// Prev implements internalIterator.Prev, as documented in the pebble
// package.
func (i *twoLevelIterator) Prev() (*InternalKey, base.LazyValue) {
	// Seek optimization only applies until iterator is first positioned after SetBounds.
	i.boundsCmp = 0
	i.maybeFilteredKeysTwoLevel = false
	if i.err != nil {
		return nil, base.LazyValue{}
	}
	if key, val := i.singleLevelIterator.Prev(); key != nil {
		return key, val
	}
	return i.skipBackward()
}

func (i *twoLevelIterator) skipForward() (*InternalKey, base.LazyValue) {
	for {
		if i.err != nil || i.exhaustedBounds > 0 {
			return nil, base.LazyValue{}
		}
		i.exhaustedBounds = 0
		var ikey *InternalKey
		if ikey, _ = i.topLevelIndex.Next(); ikey == nil {
			i.data.invalidate()
			i.index.invalidate()
			return nil, base.LazyValue{}
		}
		result := i.loadIndex(+1)
		if result == loadBlockFailed {
			return nil, base.LazyValue{}
		}
		if result == loadBlockOK {
			if ikey, val := i.singleLevelIterator.firstInternal(); ikey != nil {
				return i.maybeVerifyKey(ikey, val)
			}
			// Next iteration will return if singleLevelIterator set
			// exhaustedBounds = +1.
		} else {
			// result == loadBlockIrrelevant. Enforce the upper bound here
			// since we don't want to bother moving to the next entry in the
			// top level index if upper bound is already exceeded. Note that
			// the next entry starts with keys >= ikey.UserKey since even
			// though this is the block separator, the same user key can span
			// multiple index blocks. If upper is exclusive we use >= below,
			// else we use >.
			if i.upper != nil {
				cmp := i.cmp(ikey.UserKey, i.upper)
				if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
					i.exhaustedBounds = +1
					// Next iteration will return.
				}
			}
		}
	}
}

func (i *twoLevelIterator) skipBackward() (*InternalKey, base.LazyValue) {
	for {
		if i.err != nil || i.exhaustedBounds < 0 {
			return nil, base.LazyValue{}
		}
		i.exhaustedBounds = 0
		var ikey *InternalKey
		if ikey, _ = i.topLevelIndex.Prev(); ikey == nil {
			i.data.invalidate()
			i.index.invalidate()
			return nil, base.LazyValue{}
		}
		result := i.loadIndex(-1)
		if result == loadBlockFailed {
			return nil, base.LazyValue{}
		}
		if result == loadBlockOK {
			if ikey, val := i.singleLevelIterator.lastInternal(); ikey != nil {
				return i.maybeVerifyKey(ikey, val)
			}
			// Next iteration will return if singleLevelIterator set
			// exhaustedBounds = -1.
		} else {
			// result == loadBlockIrrelevant. Enforce the lower bound here
			// since we don't want to bother moving to the previous entry in
			// the top level index if lower bound is already exceeded. Note
			// that the previous entry starts with keys <= ikey.UserKey since
			// even though this is the current block's separator, the same
			// user key can span multiple index blocks.
			if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 {
				i.exhaustedBounds = -1
				// Next iteration will return.
			}
		}
	}
}
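// sketchSkipForward is a hypothetical distillation of the skipForward loop
// over a slice-of-slices stand-in for index blocks: advance one index block
// at a time and stop at the first block that yields a key, at the upper
// bound, or when the top level is exhausted. Empty inner slices model blocks
// skipped as irrelevant; an exclusive upper bound is assumed, and hasUpper
// models a nil upper bound.
func sketchSkipForward(blocks [][]string, start int, upper string, hasUpper bool) (string, bool) {
	for b := start; b < len(blocks); b++ {
		if len(blocks[b]) == 0 {
			// Block yielded nothing (e.g. wholly filtered): keep skipping.
			continue
		}
		first := blocks[b][0]
		if hasUpper && first >= upper {
			return "", false // exhausted the upper bound
		}
		return first, true // first key of the first non-empty block
	}
	return "", false // top level exhausted
}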
// Close implements internalIterator.Close, as documented in the pebble
// package.
func (i *twoLevelIterator) Close() error {
	var err error
	if i.closeHook != nil {
		err = firstError(err, i.closeHook(i))
	}
	err = firstError(err, i.data.Close())
	err = firstError(err, i.index.Close())
	err = firstError(err, i.topLevelIndex.Close())
	if i.dataRH != nil {
		err = firstError(err, i.dataRH.Close())
		i.dataRH = nil
	}
	err = firstError(err, i.err)
	if i.bpfs != nil {
		releaseBlockPropertiesFilterer(i.bpfs)
	}
	if i.vbReader != nil {
		i.vbReader.close()
	}
	if i.vbRH != nil {
		err = firstError(err, i.vbRH.Close())
		i.vbRH = nil
	}
	*i = twoLevelIterator{
		singleLevelIterator: i.singleLevelIterator.resetForReuse(),
		topLevelIndex:       i.topLevelIndex.resetForReuse(),
	}
	twoLevelIterPool.Put(i)
	return err
}

// Note: twoLevelCompactionIterator and compactionIterator are very similar but
// were separated due to performance.
type twoLevelCompactionIterator struct {
	*twoLevelIterator
	bytesIterated *uint64
	prevOffset    uint64
}

// twoLevelCompactionIterator implements the base.InternalIterator interface.
var _ base.InternalIterator = (*twoLevelCompactionIterator)(nil)

func (i *twoLevelCompactionIterator) Close() error {
	return i.twoLevelIterator.Close()
}

func (i *twoLevelCompactionIterator) SeekGE(
	key []byte, flags base.SeekGEFlags,
) (*InternalKey, base.LazyValue) {
	panic("pebble: SeekGE unimplemented")
}

func (i *twoLevelCompactionIterator) SeekPrefixGE(
	prefix, key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, base.LazyValue) {
	panic("pebble: SeekPrefixGE unimplemented")
}

func (i *twoLevelCompactionIterator) SeekLT(
	key []byte, flags base.SeekLTFlags,
) (*InternalKey, base.LazyValue) {
	panic("pebble: SeekLT unimplemented")
}

func (i *twoLevelCompactionIterator) First() (*InternalKey, base.LazyValue) {
	i.err = nil // clear cached iteration error
	return i.skipForward(i.twoLevelIterator.First())
}

func (i *twoLevelCompactionIterator) Last() (*InternalKey, base.LazyValue) {
	panic("pebble: Last unimplemented")
}

// Note: twoLevelCompactionIterator.Next mirrors the implementation of
// twoLevelIterator.Next due to performance. Keep the two in sync.
func (i *twoLevelCompactionIterator) Next() (*InternalKey, base.LazyValue) {
	if i.err != nil {
		return nil, base.LazyValue{}
	}
	return i.skipForward(i.singleLevelIterator.Next())
}

func (i *twoLevelCompactionIterator) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) {
	panic("pebble: NextPrefix unimplemented")
}

func (i *twoLevelCompactionIterator) Prev() (*InternalKey, base.LazyValue) {
	panic("pebble: Prev unimplemented")
}

func (i *twoLevelCompactionIterator) String() string {
	if i.vState != nil {
		return i.vState.fileNum.String()
	}
	return i.reader.fileNum.String()
}

func (i *twoLevelCompactionIterator) skipForward(
	key *InternalKey, val base.LazyValue,
) (*InternalKey, base.LazyValue) {
	if key == nil {
		for {
			if key, _ := i.topLevelIndex.Next(); key == nil {
				break
			}
			result := i.loadIndex(+1)
			if result != loadBlockOK {
				if i.err != nil {
					break
				}
				switch result {
				case loadBlockFailed:
					// We checked that i.index was at a valid entry, so
					// loadBlockFailed could not have happened due to i.index
					// being exhausted, and must be due to an error.
					panic("loadBlock should not have failed with no error")
				case loadBlockIrrelevant:
					panic("compactionIter should not be using block intervals for skipping")
				default:
					panic(fmt.Sprintf("unexpected case %d", result))
				}
			}
			// result == loadBlockOK
			if key, val = i.singleLevelIterator.First(); key != nil {
				break
			}
		}
	}

	curOffset := i.recordOffset()
	*i.bytesIterated += uint64(curOffset - i.prevOffset)
	i.prevOffset = curOffset

	if i.vState != nil && key != nil {
		cmp := i.cmp(key.UserKey, i.vState.upper.UserKey)
		if cmp > 0 || (i.vState.upper.IsExclusiveSentinel() && cmp == 0) {
			return nil, base.LazyValue{}
		}
	}

	return key, val
}
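// The compaction iterator's skipForward additionally charges the distance
// between successive record offsets to the bytesIterated counter. A
// hypothetical distillation of that bookkeeping, assuming monotonically
// non-decreasing offsets:
func sketchChargeBytesIterated(bytesIterated *uint64, prevOffset, curOffset uint64) uint64 {
	*bytesIterated += curOffset - prevOffset
	return curOffset // the caller stores this as the new prevOffset
}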