github.com/cockroachdb/pebble@v1.1.2/sstable/reader_iter_two_lvl.go

// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package sstable

import (
	"context"
	"fmt"

	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/objstorage/objstorageprovider"
	"github.com/cockroachdb/pebble/objstorage/objstorageprovider/objiotracing"
)

type twoLevelIterator struct {
	singleLevelIterator
	// maybeFilteredKeysTwoLevel indicates whether the last iterator
	// positioning operation may have skipped any index blocks due to
	// block-property filters when positioning the top-level-index.
	maybeFilteredKeysTwoLevel bool

	topLevelIndex blockIter
}

// twoLevelIterator implements the base.InternalIterator interface.
var _ base.InternalIterator = (*twoLevelIterator)(nil)

// loadIndex loads the index block at the current top level index position and
// leaves i.index unpositioned. If unsuccessful, it sets i.err to any error
// encountered, which may be nil if we have simply exhausted the entire table.
// This is used for two level indexes.
func (i *twoLevelIterator) loadIndex(dir int8) loadBlockResult {
	// Ensure the index data block iterators are invalidated even if loading of
	// the index fails.
	i.data.invalidate()
	i.index.invalidate()
	if !i.topLevelIndex.valid() {
		i.index.offset = 0
		i.index.restarts = 0
		return loadBlockFailed
	}
	v := i.topLevelIndex.value()
	bhp, err := decodeBlockHandleWithProperties(v.InPlaceValue())
	if err != nil {
		i.err = base.CorruptionErrorf("pebble/table: corrupt top level index entry")
		return loadBlockFailed
	}
	if i.bpfs != nil {
		intersects, err := i.bpfs.intersects(bhp.Props)
		if err != nil {
			i.err = errCorruptIndexEntry
			return loadBlockFailed
		}
		if intersects == blockMaybeExcluded {
			intersects = i.resolveMaybeExcluded(dir)
		}
		if intersects == blockExcluded {
			i.maybeFilteredKeysTwoLevel = true
			return loadBlockIrrelevant
		}
		// blockIntersects
	}
	ctx := objiotracing.WithBlockType(i.ctx, objiotracing.MetadataBlock)
	indexBlock, err := i.reader.readBlock(
		ctx, bhp.BlockHandle, nil /* transform */, nil /* readHandle */, i.stats, i.bufferPool)
	if err != nil {
		i.err = err
		return loadBlockFailed
	}
	if i.err = i.index.initHandle(i.cmp, indexBlock, i.reader.Properties.GlobalSeqNum, false); i.err == nil {
		return loadBlockOK
	}
	return loadBlockFailed
}
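
// Every positioning method in this file dispatches on loadIndex's result in
// the same way. A sketch of that shared pattern (illustrative pseudocode for
// a hypothetical forward-positioning caller, not a real method):
//
//	switch i.loadIndex(+1) {
//	case loadBlockFailed:
//		// i.err records the failure (possibly nil if the table is simply
//		// exhausted); surface nil to the caller.
//	case loadBlockIrrelevant:
//		// A block-property filter excluded the block. Check the separator
//		// against the upper bound, then fall through to skipForward.
//	case loadBlockOK:
//		// Position the embedded singleLevelIterator within the newly
//		// loaded second-level index block.
//	}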

// resolveMaybeExcluded is invoked when the block-property filterer has found
// that an index block is excluded according to its properties but only if its
// bounds fall within the filter's current bounds. This function consults the
// appropriate bound, depending on the iteration direction, and returns either
// `blockIntersects` or `blockMaybeExcluded`.
func (i *twoLevelIterator) resolveMaybeExcluded(dir int8) intersectsResult {
	// This iterator is configured with a bound-limited block property filter.
	// The bpf determined this entire index block could be excluded from
	// iteration based on the property encoded in the block handle. However, we
	// still need to determine if the index block is wholly contained within the
	// filter's key bounds.
	//
	// External guarantees ensure all its data blocks' keys are ≥ the filter's
	// lower bound during forward iteration, and that all its data blocks' keys
	// are < the filter's upper bound during backward iteration. We only need to
	// determine if the opposite bound is also met.
	//
	// The index separator in topLevelIndex.Key() provides an inclusive
	// upper-bound for the index block's keys, guaranteeing that all its keys
	// are ≤ topLevelIndex.Key(). For forward iteration, this is all we need.
	if dir > 0 {
		// Forward iteration.
		if i.bpfs.boundLimitedFilter.KeyIsWithinUpperBound(i.topLevelIndex.Key().UserKey) {
			return blockExcluded
		}
		return blockIntersects
	}

	// Reverse iteration.
	//
	// Because we're iterating in the reverse direction, we don't yet have
	// enough context available to determine if the block is wholly contained
	// within its bounds. This case arises only during backward iteration,
	// because of the way the index is structured.
	//
	// Consider a bound-limited bpf limited to the bounds [b,d), loading the
	// block with separator `c`. During reverse iteration, the guarantee that
	// all the block's keys are < `d` is externally provided, but no guarantee
	// is made on the bpf's lower bound. The separator `c` only provides an
	// inclusive upper bound on the block's keys, indicating that the
	// corresponding block handle points to a block containing only keys ≤ `c`.
	//
	// To establish a lower bound, we step the top-level index backwards to read
	// the previous block's separator, which provides an inclusive lower bound
	// on the original index block's keys. Afterwards, we step forward to
	// restore our top-level index position.
	if peekKey, _ := i.topLevelIndex.Prev(); peekKey == nil {
		// The original block points to the first index block of this table. If
		// we knew the lower bound for the entire table, it could provide a
		// lower bound, but the code refactoring necessary to read it doesn't
		// seem worth the payoff. We fall through to loading the block.
	} else if i.bpfs.boundLimitedFilter.KeyIsWithinLowerBound(peekKey.UserKey) {
		// The lower-bound on the original index block falls within the filter's
		// bounds, and we can skip the block (after restoring our current
		// top-level index position).
		_, _ = i.topLevelIndex.Next()
		return blockExcluded
	}
	_, _ = i.topLevelIndex.Next()
	return blockIntersects
}
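
// A standalone restatement of the reverse-iteration decision above
// (illustrative only; resolveMaybeExcluded performs it against the live
// top-level index and must restore the iterator's position). prevSeparator
// is the preceding index block's separator, or nil when the candidate block
// is the table's first. Reusing the [b,d) example: with separators a, c and
// filter bounds [b,d), prevSeparator=a fails KeyIsWithinLowerBound, so the
// block with separator c must still be loaded.
func exampleResolveBackward(
	keyIsWithinLowerBound func(key []byte) bool, prevSeparator []byte,
) intersectsResult {
	if prevSeparator == nil {
		// First index block in the table: no separator-derived lower bound is
		// available, so conservatively treat the block as intersecting.
		return blockIntersects
	}
	if keyIsWithinLowerBound(prevSeparator) {
		// Every key in the candidate block is > prevSeparator, which the
		// filter's lower bound admits, so the whole block can be excluded.
		return blockExcluded
	}
	return blockIntersects
}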

// Note that lower, upper passed into init have nothing to do with virtual
// sstable bounds. If the virtualState passed in is not nil, then virtual
// sstable bounds will be enforced.
func (i *twoLevelIterator) init(
	ctx context.Context,
	r *Reader,
	v *virtualState,
	lower, upper []byte,
	filterer *BlockPropertiesFilterer,
	useFilter, hideObsoletePoints bool,
	stats *base.InternalIteratorStats,
	rp ReaderProvider,
	bufferPool *BufferPool,
) error {
	if r.err != nil {
		return r.err
	}
	topLevelIndexH, err := r.readIndex(ctx, stats)
	if err != nil {
		return err
	}
	if v != nil {
		i.vState = v
		// Note that upper is exclusive here.
		i.endKeyInclusive, lower, upper = v.constrainBounds(lower, upper, false /* endInclusive */)
	}

	i.ctx = ctx
	i.lower = lower
	i.upper = upper
	i.bpfs = filterer
	i.useFilter = useFilter
	i.reader = r
	i.cmp = r.Compare
	i.stats = stats
	i.hideObsoletePoints = hideObsoletePoints
	i.bufferPool = bufferPool
	err = i.topLevelIndex.initHandle(i.cmp, topLevelIndexH, r.Properties.GlobalSeqNum, false)
	if err != nil {
		// blockIter.Close releases topLevelIndexH and always returns a nil error
		_ = i.topLevelIndex.Close()
		return err
	}
	i.dataRH = objstorageprovider.UsePreallocatedReadHandle(ctx, r.readable, &i.dataRHPrealloc)
	if r.tableFormat >= TableFormatPebblev3 {
		if r.Properties.NumValueBlocks > 0 {
			i.vbReader = &valueBlockReader{
				ctx:    ctx,
				bpOpen: i,
				rp:     rp,
				vbih:   r.valueBIH,
				stats:  stats,
			}
			i.data.lazyValueHandling.vbr = i.vbReader
			i.vbRH = r.readable.NewReadHandle(ctx)
		}
		i.data.lazyValueHandling.hasValuePrefix = true
	}
	return nil
}
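
// A sketch of the lifecycle the package follows when handing out these
// iterators (hypothetical caller; the Reader's iterator constructors perform
// the equivalent wiring):
//
//	it := twoLevelIterPool.Get().(*twoLevelIterator)
//	if err := it.init(ctx, r, nil /* virtualState */, lower, upper,
//		filterer, true /* useFilter */, false /* hideObsoletePoints */,
//		stats, rp, nil /* bufferPool */); err != nil {
//		return nil, err
//	}
//	// ... position and iterate ...
//	err := it.Close() // resets the iterator and returns it to the pool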

func (i *twoLevelIterator) String() string {
	if i.vState != nil {
		return i.vState.fileNum.String()
	}
	return i.reader.fileNum.String()
}

// MaybeFilteredKeys may be called when an iterator is exhausted to indicate
// whether or not the last positioning method may have skipped any keys due to
// block-property filters.
func (i *twoLevelIterator) MaybeFilteredKeys() bool {
	// While reading sstables with two-level indexes, knowledge of whether we've
	// filtered keys is tracked separately for each index level. The
	// seek-using-next optimizations have different criteria. We can only reset
	// maybeFilteredKeys back to false during a seek when NOT using the
	// fast-path that uses the current iterator position.
	//
	// If either level might have filtered keys to arrive at the current
	// iterator position, return MaybeFilteredKeys=true.
	return i.maybeFilteredKeysTwoLevel || i.maybeFilteredKeysSingleLevel
}

// SeekGE implements internalIterator.SeekGE, as documented in the pebble
// package. Note that SeekGE only checks the upper bound. It is up to the
// caller to ensure that key is greater than or equal to the lower bound.
func (i *twoLevelIterator) SeekGE(
	key []byte, flags base.SeekGEFlags,
) (*InternalKey, base.LazyValue) {
	if i.vState != nil {
		// Callers of SeekGE don't know about virtual sstable bounds, so we may
		// have to internally restrict the bounds.
		//
		// TODO(bananabrick): We can optimize away this check for the level iter
		// if necessary.
		if i.cmp(key, i.lower) < 0 {
			key = i.lower
		}
	}

	err := i.err
	i.err = nil // clear cached iteration error

	// The twoLevelIterator could be already exhausted. Utilize that when
	// trySeekUsingNext is true. See the comment about data-exhausted, PGDE, and
	// bounds-exhausted near the top of the file.
	if flags.TrySeekUsingNext() &&
		(i.exhaustedBounds == +1 || (i.data.isDataInvalidated() && i.index.isDataInvalidated())) &&
		err == nil {
		// Already exhausted, so return nil.
		return nil, base.LazyValue{}
	}

	// SeekGE performs various step-instead-of-seeking optimizations: e.g.
	// enabled by trySeekUsingNext, or by monotonically increasing bounds
	// (i.boundsCmp). Care must be taken to ensure that when performing these
	// optimizations and the iterator becomes exhausted,
	// i.maybeFilteredKeysTwoLevel is set appropriately. Consider a previous
	// SeekGE that filtered keys from k until the current iterator position.
	//
	// If the previous SeekGE exhausted the iterator while seeking within the
	// two-level index, it's possible keys greater than or equal to the current
	// search key were filtered through skipped index blocks. We must not reuse
	// the position of the two-level index iterator without remembering the
	// previous value of maybeFilteredKeysTwoLevel.

	// We fall into the slow path if i.index.isDataInvalidated() even if the
	// top-level iterator is already positioned correctly and all other
	// conditions are met. An alternative structure could reuse topLevelIndex's
	// current position and reload the index block to which it points. Arguably,
	// an index block load is expensive and the index block may still be earlier
	// than the index block containing the sought key, resulting in a wasteful
	// block load.

	var dontSeekWithinSingleLevelIter bool
	if i.topLevelIndex.isDataInvalidated() || !i.topLevelIndex.valid() || i.index.isDataInvalidated() || err != nil ||
		(i.boundsCmp <= 0 && !flags.TrySeekUsingNext()) || i.cmp(key, i.topLevelIndex.Key().UserKey) > 0 {
		// Slow-path: need to position the topLevelIndex.

		// The previous exhausted state of singleLevelIterator is no longer
		// relevant, since we may be moving to a different index block.
		i.exhaustedBounds = 0
		i.maybeFilteredKeysTwoLevel = false
		flags = flags.DisableTrySeekUsingNext()
		var ikey *InternalKey
		if ikey, _ = i.topLevelIndex.SeekGE(key, flags); ikey == nil {
			i.data.invalidate()
			i.index.invalidate()
			return nil, base.LazyValue{}
		}

		result := i.loadIndex(+1)
		if result == loadBlockFailed {
			i.boundsCmp = 0
			return nil, base.LazyValue{}
		}
		if result == loadBlockIrrelevant {
			// Enforce the upper bound here since we don't want to bother moving
			// to the next entry in the top level index if the upper bound is
			// already exceeded. Note that the next entry starts with keys >=
			// ikey.UserKey since even though this is the block separator, the
			// same user key can span multiple index blocks. If upper is
			// exclusive we use >= below, else we use >.
			if i.upper != nil {
				cmp := i.cmp(ikey.UserKey, i.upper)
				if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
					i.exhaustedBounds = +1
				}
			}
			// Fall through to skipForward.
			dontSeekWithinSingleLevelIter = true
			// Clear boundsCmp.
			//
			// In the typical cases where dontSeekWithinSingleLevelIter=false,
			// the singleLevelIterator.SeekGE call will clear boundsCmp.
			// However, in this case where dontSeekWithinSingleLevelIter=true,
			// we never seek on the single-level iterator. This call will fall
			// through to skipForward, which may improperly leave boundsCmp=+1
			// unless we clear it here.
			i.boundsCmp = 0
		}
	} else {
		// INVARIANT: err == nil.
		//
		// Else fast-path: There are two possible cases, from
		// (i.boundsCmp > 0 || flags.TrySeekUsingNext()):
		//
		// 1) The bounds have moved forward (i.boundsCmp > 0) and this SeekGE is
		// respecting the lower bound (guaranteed by Iterator). We know that the
		// iterator must already be positioned within or just outside the
		// previous bounds. Therefore, the topLevelIndex iter cannot be
		// positioned at an entry ahead of the seek position (though it can be
		// positioned behind). The !(i.cmp(key, i.topLevelIndex.Key().UserKey)
		// > 0) confirms that it is not behind. Since it is not ahead and not
		// behind it must be at the right position.
		//
		// 2) This SeekGE will land on a key that is greater than the key we are
		// currently at (guaranteed by trySeekUsingNext), but since i.cmp(key,
		// i.topLevelIndex.Key().UserKey) <= 0, we are at the correct lower
		// level index block. No need to reset the state of
		// singleLevelIterator.
		//
		// Note that cases 1 and 2 never overlap, and one of them must be true,
		// but we have some test code (TestIterRandomizedMaybeFilteredKeys) that
		// sets both to true, so we fix things here and then do an invariant
		// check.
		//
		// This invariant checking is important enough that we do not gate it
		// behind invariants.Enabled.
		if i.boundsCmp > 0 {
			// TODO(sumeer): fix TestIterRandomizedMaybeFilteredKeys so as to not
			// need this behavior.
			flags = flags.DisableTrySeekUsingNext()
		}
		if i.boundsCmp > 0 == flags.TrySeekUsingNext() {
			panic(fmt.Sprintf("inconsistency in optimization case 1 %t and case 2 %t",
				i.boundsCmp > 0, flags.TrySeekUsingNext()))
		}

		if !flags.TrySeekUsingNext() {
			// Case 1. Bounds have changed so the previous exhausted bounds state is
			// irrelevant.
			// WARNING-data-exhausted: this is safe to do only because the monotonic
			// bounds optimizations only work when !data-exhausted. If they also
			// worked with data-exhausted, we have made it unclear whether
			// data-exhausted is actually true. See the comment at the top of the
			// file.
			i.exhaustedBounds = 0
		}
		// Else flags.TrySeekUsingNext(). The i.exhaustedBounds is important to
		// preserve for singleLevelIterator, and twoLevelIterator.skipForward. See
		// bug https://github.com/cockroachdb/pebble/issues/2036.
	}

	if !dontSeekWithinSingleLevelIter {
		// Note that while trySeekUsingNext could be false here, singleLevelIterator
		// could do its own boundsCmp-based optimization to seek using next.
		if ikey, val := i.singleLevelIterator.SeekGE(key, flags); ikey != nil {
			return ikey, val
		}
	}
	return i.skipForward()
}
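
// The bound check used by the positioning methods in this file treats a
// separator equal to the upper bound as exhausting the iterator only when
// the bound is exclusive. A standalone restatement of that predicate
// (illustrative only; the methods above inline it):
func exampleUpperBoundExceeded(cmp base.Compare, separator, upper []byte, endKeyInclusive bool) bool {
	c := cmp(separator, upper)
	// An exclusive bound is exceeded at equality; an inclusive bound only by
	// strictly greater keys.
	return (!endKeyInclusive && c >= 0) || c > 0
}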

// SeekPrefixGE implements internalIterator.SeekPrefixGE, as documented in the
// pebble package. Note that SeekPrefixGE only checks the upper bound. It is up
// to the caller to ensure that key is greater than or equal to the lower bound.
func (i *twoLevelIterator) SeekPrefixGE(
	prefix, key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, base.LazyValue) {
	if i.vState != nil {
		// Callers of SeekPrefixGE don't know about virtual sstable bounds, so
		// we may have to internally restrict the bounds.
		//
		// TODO(bananabrick): We can optimize away this check for the level iter
		// if necessary.
		if i.cmp(key, i.lower) < 0 {
			key = i.lower
		}
	}

	// NOTE: prefix is only used for bloom filter checking and not later work in
	// this method. Hence, we can use the existing iterator position if the last
	// SeekPrefixGE did not fail bloom filter matching.

	err := i.err
	i.err = nil // clear cached iteration error

	// The twoLevelIterator could be already exhausted. Utilize that when
	// trySeekUsingNext is true. See the comment about data-exhausted, PGDE, and
	// bounds-exhausted near the top of the file.
	filterUsedAndDidNotMatch :=
		i.reader.tableFilter != nil && i.useFilter && !i.lastBloomFilterMatched
	if flags.TrySeekUsingNext() && !filterUsedAndDidNotMatch &&
		(i.exhaustedBounds == +1 || (i.data.isDataInvalidated() && i.index.isDataInvalidated())) &&
		err == nil {
		// Already exhausted, so return nil.
		return nil, base.LazyValue{}
	}

	// Check prefix bloom filter.
	if i.reader.tableFilter != nil && i.useFilter {
		if !i.lastBloomFilterMatched {
			// Iterator is not positioned based on last seek.
			flags = flags.DisableTrySeekUsingNext()
		}
		i.lastBloomFilterMatched = false
		var dataH bufferHandle
		dataH, i.err = i.reader.readFilter(i.ctx, i.stats)
		if i.err != nil {
			i.data.invalidate()
			return nil, base.LazyValue{}
		}
		mayContain := i.reader.tableFilter.mayContain(dataH.Get(), prefix)
		dataH.Release()
		if !mayContain {
			// This invalidation may not be necessary for correctness, and may
			// be a place to optimize later by reusing the already loaded
			// block. It was necessary in earlier versions of the code since
			// the caller was allowed to call Next when SeekPrefixGE returned
			// nil. This is no longer allowed.
			i.data.invalidate()
			return nil, base.LazyValue{}
		}
		i.lastBloomFilterMatched = true
	}

	// Bloom filter matches.

	// SeekPrefixGE performs various step-instead-of-seeking optimizations: e.g.
	// enabled by trySeekUsingNext, or by monotonically increasing bounds
	// (i.boundsCmp). Care must be taken to ensure that when performing these
	// optimizations and the iterator becomes exhausted,
	// i.maybeFilteredKeysTwoLevel is set appropriately. Consider a previous
	// SeekPrefixGE that filtered keys from k until the current iterator
	// position.
	//
	// If the previous SeekPrefixGE exhausted the iterator while seeking within
	// the two-level index, it's possible keys greater than or equal to the
	// current search key were filtered through skipped index blocks. We must
	// not reuse the position of the two-level index iterator without
	// remembering the previous value of maybeFilteredKeysTwoLevel.

	// We fall into the slow path if i.index.isDataInvalidated() even if the
	// top-level iterator is already positioned correctly and all other
	// conditions are met. An alternative structure could reuse topLevelIndex's
	// current position and reload the index block to which it points. Arguably,
	// an index block load is expensive and the index block may still be earlier
	// than the index block containing the sought key, resulting in a wasteful
	// block load.

	var dontSeekWithinSingleLevelIter bool
	if i.topLevelIndex.isDataInvalidated() || !i.topLevelIndex.valid() || i.index.isDataInvalidated() || err != nil ||
		(i.boundsCmp <= 0 && !flags.TrySeekUsingNext()) || i.cmp(key, i.topLevelIndex.Key().UserKey) > 0 {
		// Slow-path: need to position the topLevelIndex.

		// The previous exhausted state of singleLevelIterator is no longer
		// relevant, since we may be moving to a different index block.
		i.exhaustedBounds = 0
		i.maybeFilteredKeysTwoLevel = false
		flags = flags.DisableTrySeekUsingNext()
		var ikey *InternalKey
		if ikey, _ = i.topLevelIndex.SeekGE(key, flags); ikey == nil {
			i.data.invalidate()
			i.index.invalidate()
			return nil, base.LazyValue{}
		}

		result := i.loadIndex(+1)
		if result == loadBlockFailed {
			i.boundsCmp = 0
			return nil, base.LazyValue{}
		}
		if result == loadBlockIrrelevant {
			// Enforce the upper bound here since we don't want to bother moving
			// to the next entry in the top level index if the upper bound is
			// already exceeded. Note that the next entry starts with keys >=
			// ikey.UserKey since even though this is the block separator, the
			// same user key can span multiple index blocks. If upper is
			// exclusive we use >= below, else we use >.
			if i.upper != nil {
				cmp := i.cmp(ikey.UserKey, i.upper)
				if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
					i.exhaustedBounds = +1
				}
			}
			// Fall through to skipForward.
			dontSeekWithinSingleLevelIter = true
			// Clear boundsCmp.
			//
			// In the typical cases where dontSeekWithinSingleLevelIter=false,
			// the singleLevelIterator.SeekPrefixGE call will clear boundsCmp.
			// However, in this case where dontSeekWithinSingleLevelIter=true,
			// we never seek on the single-level iterator. This call will fall
			// through to skipForward, which may improperly leave boundsCmp=+1
			// unless we clear it here.
			i.boundsCmp = 0
		}
	} else {
		// INVARIANT: err == nil.
		//
		// Else fast-path: There are two possible cases, from
		// (i.boundsCmp > 0 || flags.TrySeekUsingNext()):
		//
		// 1) The bounds have moved forward (i.boundsCmp > 0) and this
		// SeekPrefixGE is respecting the lower bound (guaranteed by Iterator).
		// We know that the iterator must already be positioned within or just
		// outside the previous bounds. Therefore, the topLevelIndex iter cannot
		// be positioned at an entry ahead of the seek position (though it can be
		// positioned behind). The !(i.cmp(key, i.topLevelIndex.Key().UserKey)
		// > 0) confirms that it is not behind. Since it is not ahead and not
		// behind it must be at the right position.
		//
		// 2) This SeekPrefixGE will land on a key that is greater than the key we
		// are currently at (guaranteed by trySeekUsingNext), but since i.cmp(key,
		// i.topLevelIndex.Key().UserKey) <= 0, we are at the correct lower level
		// index block. No need to reset the state of singleLevelIterator.
		//
		// Note that cases 1 and 2 never overlap, and one of them must be true.
		// This invariant checking is important enough that we do not gate it
		// behind invariants.Enabled.
		if i.boundsCmp > 0 == flags.TrySeekUsingNext() {
			panic(fmt.Sprintf("inconsistency in optimization case 1 %t and case 2 %t",
				i.boundsCmp > 0, flags.TrySeekUsingNext()))
		}

		if !flags.TrySeekUsingNext() {
			// Case 1. Bounds have changed so the previous exhausted bounds state is
			// irrelevant.
			// WARNING-data-exhausted: this is safe to do only because the monotonic
			// bounds optimizations only work when !data-exhausted. If they also
			// worked with data-exhausted, we have made it unclear whether
			// data-exhausted is actually true. See the comment at the top of the
			// file.
			i.exhaustedBounds = 0
		}
		// Else flags.TrySeekUsingNext(). The i.exhaustedBounds is important to
		// preserve for singleLevelIterator, and twoLevelIterator.skipForward. See
		// bug https://github.com/cockroachdb/pebble/issues/2036.
	}

	if !dontSeekWithinSingleLevelIter {
		if ikey, val := i.singleLevelIterator.seekPrefixGE(
			prefix, key, flags, false /* checkFilter */); ikey != nil {
			return ikey, val
		}
	}
	// NB: skipForward checks whether exhaustedBounds is already +1.
	return i.skipForward()
}
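
// The bloom-filter gate in SeekPrefixGE relies on the filter's one-sided
// error (a sketch of the contract, not additional behavior):
//
//	mayContain == false => no key with prefix exists in this table; it is
//	                       safe to return nil without seeking either index.
//	mayContain == true  => the prefix may be present (false positives are
//	                       possible); fall through to the normal seek path.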

// virtualLast should only be called if i.vState != nil and i.endKeyInclusive
// is true.
func (i *twoLevelIterator) virtualLast() (*InternalKey, base.LazyValue) {
	if i.vState == nil {
		panic("pebble: invalid call to virtualLast")
	}

	// Seek to the first internal key >= i.upper.
	ikey, _ := i.SeekGE(i.upper, base.SeekGEFlagsNone)
	if i.endKeyInclusive {
		// Let's say the virtual sstable upper bound is c#1, with the keys c#3, c#2,
		// c#1, d, e, ... in the sstable. So, the last key in the virtual sstable is
		// c#1. We can perform SeekGE(i.upper) and then keep nexting until we find
		// the last key with userkey == i.upper.
		//
		// TODO(bananabrick): Think about how to improve this. If there are many
		// internal keys with the same user key at the upper bound then this could
		// be slow, but maybe the odds of having many internal keys with the same
		// user key at the upper bound are low.
		for ikey != nil && i.cmp(ikey.UserKey, i.upper) == 0 {
			ikey, _ = i.Next()
		}
		return i.Prev()
	}
	// We sought to the first key >= i.upper.
	return i.Prev()
}
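
// Worked example for the inclusive-bound path above: suppose the sstable
// holds c#3, c#2, c#1, d#9, e#4 and the virtual bounds end at c inclusive
// (i.upper = c, i.endKeyInclusive = true). Then:
//
//	SeekGE(c) -> c#3   // first key >= i.upper
//	Next()    -> c#2
//	Next()    -> c#1
//	Next()    -> d#9   // user key != c, loop exits
//	Prev()    -> c#1   // the virtual sstable's last key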

// SeekLT implements internalIterator.SeekLT, as documented in the pebble
// package. Note that SeekLT only checks the lower bound. It is up to the
// caller to ensure that key is less than the upper bound.
func (i *twoLevelIterator) SeekLT(
	key []byte, flags base.SeekLTFlags,
) (*InternalKey, base.LazyValue) {
	if i.vState != nil {
		// Might have to fix upper bound since virtual sstable bounds are not
		// known to callers of SeekLT.
		//
		// TODO(bananabrick): We can optimize away this check for the level iter
		// if necessary.
		cmp := i.cmp(key, i.upper)
		// key == i.upper is fine. We'll do the right thing and return the
		// first internal key with user key < key.
		if cmp > 0 {
			return i.virtualLast()
		}
	}

	i.exhaustedBounds = 0
	i.err = nil // clear cached iteration error
	// Seek optimization only applies until iterator is first positioned after SetBounds.
	i.boundsCmp = 0

	var result loadBlockResult
	var ikey *InternalKey
	// NB: Unlike SeekGE, we don't have a fast-path here since we don't know
	// whether the topLevelIndex is positioned after the position that would
	// be returned by doing i.topLevelIndex.SeekGE(). To know this we would
	// need to know the index key preceding the current one.
	// NB: If a bound-limited block property filter is configured, it's
	// externally ensured that the filter is disabled (through returning
	// Intersects=false irrespective of the block props provided) during seeks.
	i.maybeFilteredKeysTwoLevel = false
	if ikey, _ = i.topLevelIndex.SeekGE(key, base.SeekGEFlagsNone); ikey == nil {
		if ikey, _ = i.topLevelIndex.Last(); ikey == nil {
			i.data.invalidate()
			i.index.invalidate()
			return nil, base.LazyValue{}
		}

		result = i.loadIndex(-1)
		if result == loadBlockFailed {
			return nil, base.LazyValue{}
		}
		if result == loadBlockOK {
			if ikey, val := i.singleLevelIterator.lastInternal(); ikey != nil {
				return i.maybeVerifyKey(ikey, val)
			}
			// Fall through to skipBackward since the singleLevelIterator did
			// not have any blocks that satisfy the block interval
			// constraints, or the lower bound was reached.
		}
		// Else loadBlockIrrelevant, so fall through.
	} else {
		result = i.loadIndex(-1)
		if result == loadBlockFailed {
			return nil, base.LazyValue{}
		}
		if result == loadBlockOK {
			if ikey, val := i.singleLevelIterator.SeekLT(key, flags); ikey != nil {
				return i.maybeVerifyKey(ikey, val)
			}
			// Fall through to skipBackward since the singleLevelIterator did
			// not have any blocks that satisfy the block interval
			// constraint, or the lower bound was reached.
		}
		// Else loadBlockIrrelevant, so fall through.
	}
	if result == loadBlockIrrelevant {
		// Enforce the lower bound here since we don't want to bother moving to
		// the previous entry in the top level index if the lower bound is
		// already exceeded. Note that the previous entry starts with keys <=
		// ikey.UserKey since even though this is the current block's
		// separator, the same user key can span multiple index blocks.
		if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 {
			i.exhaustedBounds = -1
		}
	}
	// NB: skipBackward checks whether exhaustedBounds is already -1.
	return i.skipBackward()
}
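
// The top-index positioning in SeekLT above has the following shape (sketch
// of the reasoning, not additional behavior): separators are inclusive upper
// bounds for their blocks, so the first separator >= key identifies the only
// block that can contain both keys < key and keys >= key.
//
//	topLevelIndex.SeekGE(key) != nil -> load that block and SeekLT within
//	                                    it; if empty, skipBackward.
//	topLevelIndex.SeekGE(key) == nil -> every key in the table is < key, so
//	                                    start from the last block (Last).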

// First implements internalIterator.First, as documented in the pebble
// package. Note that First only checks the upper bound. It is up to the caller
// to ensure that key is greater than or equal to the lower bound (e.g. via a
// call to SeekGE(lower)).
func (i *twoLevelIterator) First() (*InternalKey, base.LazyValue) {
	// If the iterator was created on a virtual sstable, we will SeekGE to the
	// lower bound instead of using First, because First does not respect
	// bounds.
	if i.vState != nil {
		return i.SeekGE(i.lower, base.SeekGEFlagsNone)
	}

	if i.lower != nil {
		panic("twoLevelIterator.First() used despite lower bound")
	}
	i.exhaustedBounds = 0
	i.maybeFilteredKeysTwoLevel = false
	i.err = nil // clear cached iteration error
	// Seek optimization only applies until iterator is first positioned after SetBounds.
	i.boundsCmp = 0

	var ikey *InternalKey
	if ikey, _ = i.topLevelIndex.First(); ikey == nil {
		return nil, base.LazyValue{}
	}

	result := i.loadIndex(+1)
	if result == loadBlockFailed {
		return nil, base.LazyValue{}
	}
	if result == loadBlockOK {
		if ikey, val := i.singleLevelIterator.First(); ikey != nil {
			return ikey, val
		}
		// Else fall through to skipForward.
	} else {
		// result == loadBlockIrrelevant. Enforce the upper bound here since we
		// don't want to bother moving to the next entry in the top level index
		// if the upper bound is already exceeded. Note that the next entry
		// starts with keys >= ikey.UserKey since even though this is the block
		// separator, the same user key can span multiple index blocks. If
		// upper is exclusive we use >= below, else we use >.
		if i.upper != nil {
			cmp := i.cmp(ikey.UserKey, i.upper)
			if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
				i.exhaustedBounds = +1
			}
		}
	}
	// NB: skipForward checks whether exhaustedBounds is already +1.
	return i.skipForward()
}

// Last implements internalIterator.Last, as documented in the pebble
// package. Note that Last only checks the lower bound. It is up to the caller
// to ensure that key is less than the upper bound (e.g. via a call to
// SeekLT(upper)).
func (i *twoLevelIterator) Last() (*InternalKey, base.LazyValue) {
	if i.vState != nil {
		if i.endKeyInclusive {
			return i.virtualLast()
		}
		return i.SeekLT(i.upper, base.SeekLTFlagsNone)
	}

	if i.upper != nil {
		panic("twoLevelIterator.Last() used despite upper bound")
	}
	i.exhaustedBounds = 0
	i.maybeFilteredKeysTwoLevel = false
	i.err = nil // clear cached iteration error
	// Seek optimization only applies until iterator is first positioned after SetBounds.
	i.boundsCmp = 0

	var ikey *InternalKey
	if ikey, _ = i.topLevelIndex.Last(); ikey == nil {
		return nil, base.LazyValue{}
	}

	result := i.loadIndex(-1)
	if result == loadBlockFailed {
		return nil, base.LazyValue{}
	}
	if result == loadBlockOK {
		if ikey, val := i.singleLevelIterator.Last(); ikey != nil {
			return ikey, val
		}
		// Else fall through to skipBackward.
	} else {
		// result == loadBlockIrrelevant. Enforce the lower bound here since we
		// don't want to bother moving to the previous entry in the top level
		// index if the lower bound is already exceeded. Note that the previous
		// entry starts with keys <= ikey.UserKey since even though this is the
		// current block's separator, the same user key can span multiple index
		// blocks.
		if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 {
			i.exhaustedBounds = -1
		}
	}
	// NB: skipBackward checks whether exhaustedBounds is already -1.
	return i.skipBackward()
}

// Next implements internalIterator.Next, as documented in the pebble
// package.
// Note: twoLevelCompactionIterator.Next mirrors the implementation of
// twoLevelIterator.Next due to performance. Keep the two in sync.
func (i *twoLevelIterator) Next() (*InternalKey, base.LazyValue) {
	// Seek optimization only applies until iterator is first positioned after SetBounds.
	i.boundsCmp = 0
	i.maybeFilteredKeysTwoLevel = false
	if i.err != nil {
		return nil, base.LazyValue{}
	}
	if key, val := i.singleLevelIterator.Next(); key != nil {
		return key, val
	}
	return i.skipForward()
}

// NextPrefix implements (base.InternalIterator).NextPrefix.
func (i *twoLevelIterator) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) {
	if i.exhaustedBounds == +1 {
		panic("NextPrefix called even though exhausted upper bound")
	}
	// Seek optimization only applies until iterator is first positioned after SetBounds.
	i.boundsCmp = 0
	i.maybeFilteredKeysTwoLevel = false
	if i.err != nil {
		return nil, base.LazyValue{}
	}
	if key, val := i.singleLevelIterator.NextPrefix(succKey); key != nil {
		return key, val
	}
	// key == nil
	if i.err != nil {
		return nil, base.LazyValue{}
	}

	// Did not find prefix in the existing second-level index block. This is the
	// slow-path where we seek the iterator.
	var ikey *InternalKey
	if ikey, _ = i.topLevelIndex.SeekGE(succKey, base.SeekGEFlagsNone); ikey == nil {
		i.data.invalidate()
		i.index.invalidate()
		return nil, base.LazyValue{}
	}
	result := i.loadIndex(+1)
	if result == loadBlockFailed {
		return nil, base.LazyValue{}
	}
	if result == loadBlockIrrelevant {
		// Enforce the upper bound here since we don't want to bother moving to
		// the next entry in the top level index if the upper bound is already
		// exceeded. Note that the next entry starts with keys >= ikey.UserKey
		// since even though this is the block separator, the same user key can
		// span multiple index blocks. If upper is exclusive we use >= below,
		// else we use >.
		if i.upper != nil {
			cmp := i.cmp(ikey.UserKey, i.upper)
			if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
				i.exhaustedBounds = +1
			}
		}
	} else if key, val := i.singleLevelIterator.SeekGE(succKey, base.SeekGEFlagsNone); key != nil {
		return i.maybeVerifyKey(key, val)
	}
	return i.skipForward()
}

// Prev implements internalIterator.Prev, as documented in the pebble
// package.
func (i *twoLevelIterator) Prev() (*InternalKey, base.LazyValue) {
	// Seek optimization only applies until iterator is first positioned after SetBounds.
	i.boundsCmp = 0
	i.maybeFilteredKeysTwoLevel = false
	if i.err != nil {
		return nil, base.LazyValue{}
	}
	if key, val := i.singleLevelIterator.Prev(); key != nil {
		return key, val
	}
	return i.skipBackward()
}
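
// Sketch of a hypothetical NextPrefix caller (assuming the
// base.InternalIterator contract that succKey is the immediate successor of
// the current key's prefix; with the default comparer that is prefix +
// "\x00"):
//
//	// Iterator positioned at a@7; skip every remaining "a" key in one call:
//	key, val := iter.NextPrefix(succKey)
//	// The fast path advances within the current second-level index block;
//	// otherwise the slow path above re-seeks the top-level index with
//	// SeekGE(succKey).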

func (i *twoLevelIterator) skipForward() (*InternalKey, base.LazyValue) {
	for {
		if i.err != nil || i.exhaustedBounds > 0 {
			return nil, base.LazyValue{}
		}
		i.exhaustedBounds = 0
		var ikey *InternalKey
		if ikey, _ = i.topLevelIndex.Next(); ikey == nil {
			i.data.invalidate()
			i.index.invalidate()
			return nil, base.LazyValue{}
		}
		result := i.loadIndex(+1)
		if result == loadBlockFailed {
			return nil, base.LazyValue{}
		}
		if result == loadBlockOK {
			if ikey, val := i.singleLevelIterator.firstInternal(); ikey != nil {
				return i.maybeVerifyKey(ikey, val)
			}
			// Next iteration will return if singleLevelIterator set
			// exhaustedBounds = +1.
		} else {
			// result == loadBlockIrrelevant. Enforce the upper bound here
			// since we don't want to bother moving to the next entry in the
			// top level index if the upper bound is already exceeded. Note
			// that the next entry starts with keys >= ikey.UserKey since even
			// though this is the block separator, the same user key can span
			// multiple index blocks. If upper is exclusive we use >= below,
			// else we use >.
			if i.upper != nil {
				cmp := i.cmp(ikey.UserKey, i.upper)
				if (!i.endKeyInclusive && cmp >= 0) || cmp > 0 {
					i.exhaustedBounds = +1
					// Next iteration will return.
				}
			}
		}
	}
}

func (i *twoLevelIterator) skipBackward() (*InternalKey, base.LazyValue) {
	for {
		if i.err != nil || i.exhaustedBounds < 0 {
			return nil, base.LazyValue{}
		}
		i.exhaustedBounds = 0
		var ikey *InternalKey
		if ikey, _ = i.topLevelIndex.Prev(); ikey == nil {
			i.data.invalidate()
			i.index.invalidate()
			return nil, base.LazyValue{}
		}
		result := i.loadIndex(-1)
		if result == loadBlockFailed {
			return nil, base.LazyValue{}
		}
		if result == loadBlockOK {
			if ikey, val := i.singleLevelIterator.lastInternal(); ikey != nil {
				return i.maybeVerifyKey(ikey, val)
			}
			// Next iteration will return if singleLevelIterator set
			// exhaustedBounds = -1.
		} else {
			// result == loadBlockIrrelevant. Enforce the lower bound here
			// since we don't want to bother moving to the previous entry in
			// the top level index if the lower bound is already exceeded.
			// Note that the previous entry starts with keys <= ikey.UserKey
			// since even though this is the current block's separator, the
			// same user key can span multiple index blocks.
			if i.lower != nil && i.cmp(ikey.UserKey, i.lower) < 0 {
				i.exhaustedBounds = -1
				// Next iteration will return.
			}
		}
	}
}
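
// The bound re-check at the top of each loop above matters because the
// single-level calls can themselves exhaust a bound. Sketch of the forward
// case:
//
//	skipForward: topLevelIndex.Next() -> loadIndex(+1) == loadBlockOK
//	             firstInternal() == nil because it set i.exhaustedBounds = +1
//	             the next loop iteration observes +1 and returns nil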

// Close implements internalIterator.Close, as documented in the pebble
// package.
func (i *twoLevelIterator) Close() error {
	var err error
	if i.closeHook != nil {
		err = firstError(err, i.closeHook(i))
	}
	err = firstError(err, i.data.Close())
	err = firstError(err, i.index.Close())
	err = firstError(err, i.topLevelIndex.Close())
	if i.dataRH != nil {
		err = firstError(err, i.dataRH.Close())
		i.dataRH = nil
	}
	err = firstError(err, i.err)
	if i.bpfs != nil {
		releaseBlockPropertiesFilterer(i.bpfs)
	}
	if i.vbReader != nil {
		i.vbReader.close()
	}
	if i.vbRH != nil {
		err = firstError(err, i.vbRH.Close())
		i.vbRH = nil
	}
	*i = twoLevelIterator{
		singleLevelIterator: i.singleLevelIterator.resetForReuse(),
		topLevelIndex:       i.topLevelIndex.resetForReuse(),
	}
	twoLevelIterPool.Put(i)
	return err
}

// Note: twoLevelCompactionIterator and compactionIterator are very similar but
// were separated due to performance.
type twoLevelCompactionIterator struct {
	*twoLevelIterator
	bytesIterated *uint64
	prevOffset    uint64
}

// twoLevelCompactionIterator implements the base.InternalIterator interface.
var _ base.InternalIterator = (*twoLevelCompactionIterator)(nil)

func (i *twoLevelCompactionIterator) Close() error {
	return i.twoLevelIterator.Close()
}

func (i *twoLevelCompactionIterator) SeekGE(
	key []byte, flags base.SeekGEFlags,
) (*InternalKey, base.LazyValue) {
	panic("pebble: SeekGE unimplemented")
}

func (i *twoLevelCompactionIterator) SeekPrefixGE(
	prefix, key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, base.LazyValue) {
	panic("pebble: SeekPrefixGE unimplemented")
}

func (i *twoLevelCompactionIterator) SeekLT(
	key []byte, flags base.SeekLTFlags,
) (*InternalKey, base.LazyValue) {
	panic("pebble: SeekLT unimplemented")
}

func (i *twoLevelCompactionIterator) First() (*InternalKey, base.LazyValue) {
	i.err = nil // clear cached iteration error
	return i.skipForward(i.twoLevelIterator.First())
}

func (i *twoLevelCompactionIterator) Last() (*InternalKey, base.LazyValue) {
	panic("pebble: Last unimplemented")
}

// Note: twoLevelCompactionIterator.Next mirrors the implementation of
// twoLevelIterator.Next due to performance. Keep the two in sync.
func (i *twoLevelCompactionIterator) Next() (*InternalKey, base.LazyValue) {
	if i.err != nil {
		return nil, base.LazyValue{}
	}
	return i.skipForward(i.singleLevelIterator.Next())
}

func (i *twoLevelCompactionIterator) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) {
	panic("pebble: NextPrefix unimplemented")
}

func (i *twoLevelCompactionIterator) Prev() (*InternalKey, base.LazyValue) {
	panic("pebble: Prev unimplemented")
}

func (i *twoLevelCompactionIterator) String() string {
	if i.vState != nil {
		return i.vState.fileNum.String()
	}
	return i.reader.fileNum.String()
}

func (i *twoLevelCompactionIterator) skipForward(
	key *InternalKey, val base.LazyValue,
) (*InternalKey, base.LazyValue) {
	if key == nil {
		for {
			if key, _ := i.topLevelIndex.Next(); key == nil {
				break
			}
			result := i.loadIndex(+1)
			if result != loadBlockOK {
				if i.err != nil {
					break
				}
				switch result {
				case loadBlockFailed:
					// We checked that i.topLevelIndex was at a valid entry, so
					// loadBlockFailed could not have happened due to i.index
					// being exhausted, and must be due to an error.
					panic("loadBlock should not have failed with no error")
				case loadBlockIrrelevant:
					panic("compactionIter should not be using block intervals for skipping")
				default:
					panic(fmt.Sprintf("unexpected case %d", result))
				}
			}
			// result == loadBlockOK
			if key, val = i.singleLevelIterator.First(); key != nil {
				break
			}
		}
	}

	curOffset := i.recordOffset()
	*i.bytesIterated += uint64(curOffset - i.prevOffset)
	i.prevOffset = curOffset

	if i.vState != nil && key != nil {
		cmp := i.cmp(key.UserKey, i.vState.upper.UserKey)
		if cmp > 0 || (i.vState.upper.IsExclusiveSentinel() && cmp == 0) {
			return nil, base.LazyValue{}
		}
	}

	return key, val
}
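
// A sketch of how a compaction might drive this iterator (hypothetical
// driver loop; the bytesIterated plumbing above is the part this file
// actually owns):
//
//	var bytesIterated uint64
//	iter := &twoLevelCompactionIterator{
//		twoLevelIterator: tli, // an initialized *twoLevelIterator
//		bytesIterated:    &bytesIterated,
//	}
//	for key, val := iter.First(); key != nil; key, val = iter.Next() {
//		_ = val // emit key/value to the compaction output
//		// bytesIterated tracks the sstable offset consumed so far, which
//		// callers can use as a cheap progress/pacing signal.
//	}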