// Source: github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/iterator.go

// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"bytes"
	"context"
	"io"
	"sync"
	"unsafe"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/bytealloc"
	"github.com/cockroachdb/pebble/internal/fastrand"
	"github.com/cockroachdb/pebble/internal/humanize"
	"github.com/cockroachdb/pebble/internal/invariants"
	"github.com/cockroachdb/pebble/internal/keyspan"
	"github.com/cockroachdb/pebble/internal/manifest"
	"github.com/cockroachdb/pebble/internal/rangekey"
	"github.com/cockroachdb/pebble/sstable"
	"github.com/cockroachdb/redact"
)

// iterPos describes the state of the internal iterator, in terms of whether it
// is at the position returned to the user (cur), one ahead of the position
// returned (next for forward iteration and prev for reverse iteration). The cur
// position is split into two states, for forward and reverse iteration, since
// we need to differentiate for switching directions.
//
// There is subtlety in what is considered the current position of the Iterator.
// The internal iterator exposes a sequence of internal keys. There is not
// always a single internalIterator position corresponding to the position
// returned to the user. Consider the example:
//
//	a.MERGE.9 a.MERGE.8 a.MERGE.7 a.SET.6 b.DELETE.9 b.DELETE.5 b.SET.4
//	\                                   /
//	  \       Iterator.Key() = 'a'    /
//
// The Iterator exposes one valid position at user key 'a' and the two exhausted
// positions at the beginning and end of iteration. The underlying
// internalIterator contains 7 valid positions and 2 exhausted positions.
//
// Iterator positioning methods must set iterPos to iterPosCur{Forward,Reverse}
// iff the user key at the current internalIterator position equals the
// Iterator.Key returned to the user. This guarantees that a call to nextUserKey
// or prevUserKey will advance to the next or previous iterator position.
// iterPosCur{Forward,Reverse} does not make any guarantee about the internal
// iterator position among internal keys with matching user keys, and it will
// vary subtly depending on the particular key kinds encountered. In the above
// example, the iterator returning 'a' may set iterPosCurForward if the internal
// iterator is positioned at any of a.MERGE.9, a.MERGE.8, a.MERGE.7 or a.SET.6.
//
// When setting iterPos to iterPosNext or iterPosPrev, the internal iterator
// must be advanced to the first internalIterator position at a user key greater
// (iterPosNext) or less (iterPosPrev) than the key returned to the user. An
// internalIterator position that's !Valid() must also be considered greater or
// less—depending on the direction of iteration—than the last valid Iterator
// position.
type iterPos int8

const (
	iterPosCurForward iterPos = 0
	iterPosNext       iterPos = 1
	iterPosPrev       iterPos = -1
	iterPosCurReverse iterPos = -2

	// For limited iteration. When the iterator is at iterPosCurForwardPaused
	// - Next*() call should behave as if the internal iterator is already
	//   at next (akin to iterPosNext).
	// - Prev*() call should behave as if the internal iterator is at the
	//   current key (akin to iterPosCurForward).
	//
	// Similar semantics apply to CurReversePaused.
	iterPosCurForwardPaused iterPos = 2
	iterPosCurReversePaused iterPos = -3
)

// Approximate gap in bytes between samples of data read during iteration.
// This is multiplied with a default ReadSamplingMultiplier of 1 << 4 to yield
// 1 << 20 (1MB). The 1MB factor comes from:
// https://github.com/cockroachdb/pebble/issues/29#issuecomment-494477985
const readBytesPeriod uint64 = 1 << 16

// errReversePrefixIteration is the sentinel error for reverse positioning
// attempted while in prefix iteration mode, which is unsupported.
var errReversePrefixIteration = errors.New("pebble: unsupported reverse prefix iteration")

// IteratorMetrics holds per-iterator metrics. These do not change over the
// lifetime of the iterator.
type IteratorMetrics struct {
	// The read amplification experienced by this iterator. This is the sum of
	// the memtables, the L0 sublevels and the non-empty Ln levels. Higher read
	// amplification generally results in slower reads, though allowing higher
	// read amplification can also result in faster writes.
	ReadAmp int
}

// IteratorStatsKind describes the two kinds of iterator stats.
type IteratorStatsKind int8

const (
	// InterfaceCall represents calls to Iterator.
	InterfaceCall IteratorStatsKind = iota
	// InternalIterCall represents calls by Iterator to its internalIterator.
	InternalIterCall
	// NumStatsKind is the number of kinds, and is used for array sizing.
	NumStatsKind
)

// IteratorStats contains iteration stats. Each counter array is indexed by
// IteratorStatsKind.
type IteratorStats struct {
	// ForwardSeekCount includes SeekGE, SeekPrefixGE, First.
	ForwardSeekCount [NumStatsKind]int
	// ReverseSeekCount includes SeekLT, Last.
	ReverseSeekCount [NumStatsKind]int
	// ForwardStepCount includes Next.
	ForwardStepCount [NumStatsKind]int
	// ReverseStepCount includes Prev.
	ReverseStepCount [NumStatsKind]int
	// InternalStats holds the miscellaneous stats produced by the internal
	// iterators.
	InternalStats InternalIteratorStats
	// RangeKeyStats holds stats about range keys encountered by the iterator.
	RangeKeyStats RangeKeyIteratorStats
}

// Compile-time check that *IteratorStats implements redact.SafeFormatter.
var _ redact.SafeFormatter = &IteratorStats{}

// InternalIteratorStats contains miscellaneous stats produced by internal
// iterators.
type InternalIteratorStats = base.InternalIteratorStats

// RangeKeyIteratorStats contains miscellaneous stats about range keys
// encountered by the iterator.
type RangeKeyIteratorStats struct {
	// Count records the number of range keys encountered during
	// iteration. Range keys may be counted multiple times if the iterator
	// leaves a range key's bounds and then returns.
	Count int
	// ContainedPoints records the number of point keys encountered within the
	// bounds of a range key. Note that this includes point keys with suffixes
	// that sort both above and below the covering range key's suffix.
	ContainedPoints int
	// SkippedPoints records the count of the subset of ContainedPoints point
	// keys that were skipped during iteration due to range-key masking. It does
	// not include point keys that were never loaded because a
	// RangeKeyMasking.Filter excluded the entire containing block.
	SkippedPoints int
}

// Merge adds all of the argument's statistics to the receiver. It may be used
// to accumulate stats across multiple iterators.
func (s *RangeKeyIteratorStats) Merge(o RangeKeyIteratorStats) {
	s.Count += o.Count
	s.ContainedPoints += o.ContainedPoints
	s.SkippedPoints += o.SkippedPoints
}

// LazyValue is a lazy value. See the long comment in base.LazyValue.
type LazyValue = base.LazyValue

// Iterator iterates over a DB's key/value pairs in key order.
//
// An iterator must be closed after use, but it is not necessary to read an
// iterator until exhaustion.
//
// An iterator is not goroutine-safe, but it is safe to use multiple iterators
// concurrently, with each in a dedicated goroutine.
//
// It is also safe to use an iterator concurrently with modifying its
// underlying DB, if that DB permits modification. However, the resultant
// key/value pairs are not guaranteed to be a consistent snapshot of that DB
// at a particular point in time.
//
// If an iterator encounters an error during any operation, it is stored by
// the Iterator and surfaced through the Error method. All absolute
// positioning methods (eg, SeekLT, SeekGE, First, Last, etc) reset any
// accumulated error before positioning. All relative positioning methods (eg,
// Next, Prev) return without advancing if the iterator has an accumulated
// error.
type Iterator struct {
	// The context is stored here since (a) Iterators are expected to be
	// short-lived (since they pin memtables and sstables), (b) plumbing a
	// context into every method is very painful, (c) they do not (yet) respect
	// context cancellation and are only used for tracing.
	ctx       context.Context
	opts      IterOptions
	merge     Merge
	comparer  base.Comparer
	iter      internalIterator
	pointIter internalIterator
	// Either readState or version is set, but not both.
	readState *readState
	version   *version
	// rangeKey holds iteration state specific to iteration over range keys.
	// The range key field may be nil if the Iterator has never been configured
	// to iterate over range keys. Its non-nilness cannot be used to determine
	// if the Iterator is currently iterating over range keys: For that, consult
	// the IterOptions using opts.rangeKeys(). If non-nil, its rangeKeyIter
	// field is guaranteed to be non-nil too.
	rangeKey *iteratorRangeKeyState
	// rangeKeyMasking holds state for range-key masking of point keys.
	rangeKeyMasking rangeKeyMasking
	err             error
	// When iterValidityState=IterValid, key represents the current key, which
	// is backed by keyBuf.
	key    []byte
	keyBuf []byte
	value  LazyValue
	// For use in LazyValue.Clone.
	valueBuf []byte
	fetcher  base.LazyFetcher
	// For use in LazyValue.Value.
	lazyValueBuf []byte
	valueCloser  io.Closer
	// boundsBuf holds two buffers used to store the lower and upper bounds.
	// Whenever the Iterator's bounds change, the new bounds are copied into
	// boundsBuf[boundsBufIdx]. The two bounds share a slice to reduce
	// allocations. opts.LowerBound and opts.UpperBound point into this slice.
	boundsBuf    [2][]byte
	boundsBufIdx int
	// iterKey, iterValue reflect the latest position of iter, except when
	// SetBounds is called. In that case, these are explicitly set to nil.
	iterKey             *InternalKey
	iterValue           LazyValue
	alloc               *iterAlloc
	getIterAlloc        *getIterAlloc
	prefixOrFullSeekKey []byte
	readSampling        readSampling
	stats               IteratorStats
	externalReaders     [][]*sstable.Reader

	// Following fields used when constructing an iterator stack, eg, in Clone
	// and SetOptions or when re-fragmenting a batch's range keys/range dels.
	// Non-nil if this Iterator includes a Batch.
	batch            *Batch
	newIters         tableNewIters
	newIterRangeKey  keyspan.TableNewSpanIter
	lazyCombinedIter lazyCombinedIter
	seqNum           uint64
	// batchSeqNum is used by Iterators over indexed batches to detect when the
	// underlying batch has been mutated. The batch beneath an indexed batch may
	// be mutated while the Iterator is open, but new keys are not surfaced
	// until the next call to SetOptions.
	batchSeqNum uint64
	// batch{PointIter,RangeDelIter,RangeKeyIter} are used when the Iterator is
	// configured to read through an indexed batch. If a batch is set, these
	// iterators will be included within the iterator stack regardless of
	// whether the batch currently contains any keys of their kind. These
	// pointers are used during a call to SetOptions to refresh the Iterator's
	// view of its indexed batch.
	batchPointIter    batchIter
	batchRangeDelIter keyspan.Iter
	batchRangeKeyIter keyspan.Iter
	// merging is a pointer to this iterator's point merging iterator. It
	// appears here because key visibility is handled by the merging iterator.
	// During SetOptions on an iterator over an indexed batch, this field is
	// used to update the merging iterator's batch snapshot.
	merging *mergingIter

	// Keeping the bools here after all the 8 byte aligned fields shrinks the
	// sizeof this struct by 24 bytes.

	// INVARIANT:
	// iterValidityState==IterAtLimit <=>
	//  pos==iterPosCurForwardPaused || pos==iterPosCurReversePaused
	iterValidityState IterValidityState
	// Set to true by SetBounds, SetOptions. Causes the Iterator to appear
	// exhausted externally, while preserving the correct iterValidityState for
	// the iterator's internal state. Preserving the correct internal validity
	// is used for SeekPrefixGE(..., trySeekUsingNext), and SeekGE/SeekLT
	// optimizations after "no-op" calls to SetBounds and SetOptions.
	requiresReposition bool
	// The position of iter. When this is iterPos{Prev,Next} the iter has been
	// moved past the current key-value, which can only happen if
	// iterValidityState=IterValid, i.e., there is something to return to the
	// client for the current position.
	pos iterPos
	// Relates to the prefixOrFullSeekKey field above.
	hasPrefix bool
	// Used for deriving the value of SeekPrefixGE(..., trySeekUsingNext),
	// and SeekGE/SeekLT optimizations
	lastPositioningOp lastPositioningOpKind
	// Used for determining when it's safe to perform SeekGE optimizations that
	// reuse the iterator state to avoid the cost of a full seek if the iterator
	// is already positioned in the correct place. If the iterator's view of its
	// indexed batch was just refreshed, some optimizations cannot be applied on
	// the first seek after the refresh:
	// - SeekGE has a no-op optimization that does not seek on the internal
	//   iterator at all if the iterator is already in the correct place.
	//   This optimization cannot be performed if the internal iterator was
	//   last positioned when the iterator had a different view of an
	//   underlying batch.
	// - Seek[Prefix]GE set flags.TrySeekUsingNext()=true when the seek key is
	//   greater than the previous operation's seek key, under the expectation
	//   that the various internal iterators can use their current position to
	//   avoid a full expensive re-seek. This applies to the batchIter as well.
	//   However, if the view of the batch was just refreshed, the batchIter's
	//   position is not useful because it may already be beyond new keys less
	//   than the seek key. To prevent the use of this optimization in
	//   batchIter, Seek[Prefix]GE set flags.BatchJustRefreshed()=true if this
	//   bit is enabled.
	batchJustRefreshed bool
	// Used for an optimization in external iterators to reduce the number of
	// merging levels.
	forwardOnly bool
	// closePointIterOnce is set to true if this point iter can only be Close()d
	// once, _and_ closing i.iter and then i.pointIter would close i.pointIter
	// twice. This is necessary to track if the point iter is an internal iterator
	// that could release its resources to a pool on Close(), making it harder for
	// that iterator to make its own closes idempotent.
	//
	// TODO(bilal): Update SetOptions to always close out point key iterators when
	// they won't be used, so that Close() doesn't need to default to closing
	// point iterators twice.
	closePointIterOnce bool
	// Used in some tests to disable the random disabling of seek optimizations.
	forceEnableSeekOpt bool
	// Set to true if NextPrefix is not currently permitted. Defaults to false
	// in case an iterator never had any bounds.
	nextPrefixNotPermittedByUpperBound bool
}

// cmp is a convenience shorthand for the i.comparer.Compare function.
func (i *Iterator) cmp(a, b []byte) int {
	return i.comparer.Compare(a, b)
}

// split is a convenience shorthand for the i.comparer.Split function.
func (i *Iterator) split(a []byte) int {
	return i.comparer.Split(a)
}

// equal is a convenience shorthand for the i.comparer.Equal function.
func (i *Iterator) equal(a, b []byte) bool {
	return i.comparer.Equal(a, b)
}

// iteratorRangeKeyState holds an iterator's range key iteration state.
type iteratorRangeKeyState struct {
	opts  *IterOptions
	cmp   base.Compare
	split base.Split
	// rangeKeyIter holds the range key iterator stack that iterates over the
	// merged spans across the entirety of the LSM.
	rangeKeyIter keyspan.FragmentIterator
	iiter        keyspan.InterleavingIter
	// stale is set to true when the range key state recorded here (in start,
	// end and keys) may not be in sync with the current range key at the
	// interleaving iterator's current position.
	//
	// When the interleaving iterator passes over a new span, it invokes the
	// SpanChanged hook defined on the `rangeKeyMasking` type, which sets stale
	// to true if the span is non-nil.
	//
	// The parent iterator may not be positioned over the interleaving
	// iterator's current position (eg, i.pos = iterPos{Next,Prev}), so
	// {keys,start,end} are only updated to the new range key during a call to
	// Iterator.saveRangeKey.
	stale bool
	// updated is used to signal to the Iterator client whether the state of
	// range keys has changed since the previous iterator position through the
	// `RangeKeyChanged` method. It's set to true during an Iterator positioning
	// operation that changes the state of the current range key. Each Iterator
	// positioning operation sets it back to false before executing.
	//
	// TODO(jackson): The lifecycle of {stale,updated,prevPosHadRangeKey} is
	// intricate and confusing. Try to refactor to reduce complexity.
	updated bool
	// prevPosHadRangeKey records whether the previous Iterator position had a
	// range key (HasPointAndRange() = (_, true)). It's updated at the beginning
	// of each new Iterator positioning operation. It's required by saveRangeKey
	// to set `updated` appropriately: Without this record of the previous iterator
	// state, it's ambiguous whether an iterator only temporarily stepped onto a
	// position without a range key.
	prevPosHadRangeKey bool
	// rangeKeyOnly is set to true if at the current iterator position there is
	// no point key, only a range key start boundary.
	rangeKeyOnly bool
	// hasRangeKey is true when the current iterator position has a covering
	// range key (eg, a range key with bounds [<lower>,<upper>) such that
	// <lower> ≤ Key() < <upper>).
	hasRangeKey bool
	// start and end are the [start, end) boundaries of the current range keys.
	start []byte
	end   []byte

	rangeKeyBuffers

	// iterConfig holds fields that are used for the construction of the
	// iterator stack, but do not need to be directly accessed during iteration.
	// This struct is bundled within the iteratorRangeKeyState struct to reduce
	// allocations.
	iterConfig rangekey.UserIteratorConfig
}

// rangeKeyBuffers bundles the reusable buffers backing range-key state. It is
// embedded in iteratorRangeKeyState.
type rangeKeyBuffers struct {
	// keys is sorted by Suffix ascending.
	keys []RangeKeyData
	// buf is used to save range-key data before moving the range-key iterator.
	// Start and end boundaries, suffixes and values are all copied into buf.
	buf bytealloc.A
	// internal holds buffers used by the range key internal iterators.
	internal rangekey.Buffers
}

// PrepareForReuse readies the buffers for reuse. The keys slice is released
// if it holds more than maxKeysReuse entries; buf is released if its capacity
// has reached maxKeyBufCacheSize and is otherwise truncated to zero length
// while retaining its capacity. The internal buffers are prepared through
// their own PrepareForReuse.
func (b *rangeKeyBuffers) PrepareForReuse() {
	const maxKeysReuse = 100
	if len(b.keys) > maxKeysReuse {
		b.keys = nil
	}
	// Avoid caching the key buf if it is overly large. The constant is
	// fairly arbitrary.
	if cap(b.buf) >= maxKeyBufCacheSize {
		b.buf = nil
	} else {
		b.buf = b.buf[:0]
	}
	b.internal.PrepareForReuse()
}

// init records the comparison and split functions and the iterator options
// on the range key state. It does not touch any other field.
func (i *iteratorRangeKeyState) init(cmp base.Compare, split base.Split, opts *IterOptions) {
	i.cmp = cmp
	i.split = split
	i.opts = opts
}

// iterRangeKeyStateAllocPool is a pool of iteratorRangeKeyState structs. New
// returns a pointer to a zero-valued iteratorRangeKeyState.
var iterRangeKeyStateAllocPool = sync.Pool{
	New: func() interface{} {
		return &iteratorRangeKeyState{}
	},
}

// isEphemeralPosition returns true iff the current iterator position is
// ephemeral, and won't be visited during subsequent relative positioning
// operations.
//
// The iterator position resulting from a SeekGE or SeekPrefixGE that lands on a
// straddling range key without a coincident point key is such a position.
func (i *Iterator) isEphemeralPosition() bool {
	return i.opts.rangeKeys() && i.rangeKey != nil && i.rangeKey.rangeKeyOnly &&
		!i.equal(i.rangeKey.start, i.key)
}

// lastPositioningOpKind identifies the kind of the most recent positioning
// operation. It is the type of the Iterator.lastPositioningOp field.
type lastPositioningOpKind int8

const (
	unknownLastPositionOp lastPositioningOpKind = iota
	seekPrefixGELastPositioningOp
	seekGELastPositioningOp
	seekLTLastPositioningOp
	// internalNextOp is a special internal iterator positioning operation used
	// by CanDeterministicallySingleDelete. It exists for enforcing requirements
	// around calling CanDeterministicallySingleDelete at most once per external
	// iterator position.
	internalNextOp
	// invalidatedLastPositionOp is similar to unknownLastPositionOp and the
	// only reason to distinguish this is for the wider set of SeekGE
	// optimizations we permit for the external iterator Iterator.forwardOnly
	// case. Most code predicates should be doing equality comparisons with one
	// of the seek* enum values, so this duplication should not result in code
	// of the form:
	//  if unknownLastPositionOp || invalidatedLastPositionOp
	invalidatedLastPositionOp
)

// Limited iteration mode. Not for use with prefix iteration.
//
// SeekGE, SeekLT, Prev, Next have WithLimit variants, that pause the iterator
// at the limit in a best-effort manner. The client should behave correctly
// even if the limits are ignored. These limits are not "deep", in that they
// are not passed down to the underlying collection of internalIterators. This
// is because the limits are transient, and apply only until the next
// iteration call. They serve mainly as a way to bound the amount of work when
// two (or more) Iterators are being coordinated at a higher level.
//
// In limited iteration mode:
// - Avoid using Iterator.Valid if the last call was to a *WithLimit() method.
//   The return value from the *WithLimit() method provides a more precise
//   disposition.
// - The limit is exclusive for forward and inclusive for reverse.
//
//
// Limited iteration mode & range keys
//
// Limited iteration interacts with range-key iteration. When range key
// iteration is enabled, range keys are interleaved at their start boundaries.
// Limited iteration must ensure that if a range key exists within the limit,
// the iterator visits the range key.
//
// During forward limited iteration, this is trivial: An overlapping range key
// must have a start boundary less than the limit, and the range key's start
// boundary will be interleaved and found to be within the limit.
//
// During reverse limited iteration, the tail of the range key may fall within
// the limit. The range key must be surfaced even if the range key's start
// boundary is less than the limit, and if there are no point keys between the
// current iterator position and the limit. To provide this guarantee, reverse
// limited iteration ignores the limit as long as there is a range key
// overlapping the iteration position.

// IterValidityState captures the state of the Iterator.
type IterValidityState int8

const (
	// IterExhausted represents an Iterator that is exhausted.
	IterExhausted IterValidityState = iota
	// IterValid represents an Iterator that is valid.
	IterValid
	// IterAtLimit represents an Iterator that has a non-exhausted
	// internalIterator, but has reached a limit without any key for the
	// caller.
	IterAtLimit
)

// readSampling stores variables used to sample a read to trigger a read
// compaction
type readSampling struct {
	bytesUntilReadSampling uint64
	initialSamplePassed    bool
	pendingCompactions     readCompactionQueue
	// forceReadSampling is used for testing purposes to force a read sample on every
	// call to Iterator.maybeSampleRead()
	forceReadSampling bool
}

// findNextEntry scans forward from the internal iterator's current position
// (i.iterKey) for the next entry to surface to the user.
//
// On entry it resets iterValidityState to IterExhausted and pos to
// iterPosCurForward, clears rangeKeyOnly when range keys are enabled, and
// closes any open value closer (returning early if that close fails). It then
// loops over internal keys until one of the following happens:
//   - the internal iterator is exhausted (state stays IterExhausted);
//   - in prefix mode, the key's prefix differs from prefixOrFullSeekKey
//     (state stays IterExhausted);
//   - limit is non-nil and the current user key is >= limit, in which case the
//     state becomes IterAtLimit and pos becomes iterPosCurForwardPaused (the
//     limit is exclusive);
//   - a range key start, a SET/SETWITHDELETE, or a MERGE that resolves to a
//     value is found: i.key and i.value are populated and the state becomes
//     IterValid;
//   - an error is recorded in i.err (state IterExhausted).
//
// Deletion tombstones, keys rejected by opts.SkipPoint, and merges whose
// merger requests deletion are stepped over.
func (i *Iterator) findNextEntry(limit []byte) {
	i.iterValidityState = IterExhausted
	i.pos = iterPosCurForward
	if i.opts.rangeKeys() && i.rangeKey != nil {
		i.rangeKey.rangeKeyOnly = false
	}

	// Close the closer for the current value if one was open.
	if i.closeValueCloser() != nil {
		return
	}

	for i.iterKey != nil {
		// Copy the internal key by value: the case arms below advance the
		// internal iterator, which reassigns i.iterKey.
		key := *i.iterKey

		if i.hasPrefix {
			if n := i.split(key.UserKey); !i.equal(i.prefixOrFullSeekKey, key.UserKey[:n]) {
				return
			}
		}
		// Compare with limit every time we start at a different user key.
		// Note that given the best-effort contract of limit, we could avoid a
		// comparison in the common case by doing this only after
		// i.nextUserKey is called for the deletes below. However that makes
		// the behavior non-deterministic (since the behavior will vary based
		// on what has been compacted), which makes it hard to test with the
		// metamorphic test. So we forego that performance optimization.
		if limit != nil && i.cmp(limit, i.iterKey.UserKey) <= 0 {
			i.iterValidityState = IterAtLimit
			i.pos = iterPosCurForwardPaused
			return
		}

		// If the user has configured a SkipPoint function, invoke it to see
		// whether we should skip over the current user key.
		if i.opts.SkipPoint != nil && key.Kind() != InternalKeyKindRangeKeySet && i.opts.SkipPoint(i.iterKey.UserKey) {
			// NB: We could call nextUserKey, but in some cases the SkipPoint
			// predicate function might be cheaper than nextUserKey's key copy
			// and key comparison. This should be the case for MVCC suffix
			// comparisons, for example. In the future, we could expand the
			// SkipPoint interface to give the implementor more control over
			// whether we skip over just the internal key, the user key, or even
			// the key prefix.
			i.stats.ForwardStepCount[InternalIterCall]++
			i.iterKey, i.iterValue = i.iter.Next()
			continue
		}

		switch key.Kind() {
		case InternalKeyKindRangeKeySet:
			// Save the current key.
			i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
			i.key = i.keyBuf
			i.value = LazyValue{}
			// There may also be a live point key at this userkey that we have
			// not yet read. We need to find the next entry with this user key
			// to find it. Save the range key so we don't lose it when we Next
			// the underlying iterator.
			i.saveRangeKey()
			pointKeyExists := i.nextPointCurrentUserKey()
			if i.err != nil {
				i.iterValidityState = IterExhausted
				return
			}
			i.rangeKey.rangeKeyOnly = !pointKeyExists
			i.iterValidityState = IterValid
			return

		case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
			// NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not
			// only simpler, but is also necessary for correctness due to
			// InternalKeyKindSSTableInternalObsoleteBit.
			i.nextUserKey()
			continue

		case InternalKeyKindSet, InternalKeyKindSetWithDelete:
			i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
			i.key = i.keyBuf
			i.value = i.iterValue
			i.iterValidityState = IterValid
			i.saveRangeKey()
			return

		case InternalKeyKindMerge:
			// Resolving the merge may advance us to the next point key, which
			// may be covered by a different set of range keys. Save the range
			// key state so we don't lose it.
			i.saveRangeKey()
			if i.mergeForward(key) {
				i.iterValidityState = IterValid
				return
			}

			// The merge didn't yield a valid key, either because the value
			// merger indicated it should be deleted, or because an error was
			// encountered.
			i.iterValidityState = IterExhausted
			if i.err != nil {
				return
			}
			// If pos is already iterPosNext, the internal iterator is past
			// this user key and no further stepping is needed.
			if i.pos != iterPosNext {
				i.nextUserKey()
			}
			if i.closeValueCloser() != nil {
				return
			}
			i.pos = iterPosCurForward

		default:
			i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind()))
			i.iterValidityState = IterExhausted
			return
		}
	}
}

// nextPointCurrentUserKey steps the internal iterator forward once, looking
// for a point key at the user key currently saved in i.key. It reports whether
// a live point key exists there.
//
// It returns true, with i.value set, when the next internal key has the same
// user key and is a SET/SETWITHDELETE or a MERGE that mergeForward resolves to
// a value. It returns false:
//   - without stepping, if opts.SkipPoint is configured and rejects i.key;
//   - with pos set to iterPosNext, if the internal iterator is exhausted or
//     has moved to a different user key;
//   - if the next key at this user key is a deletion tombstone;
//   - with i.err set, if the next key is a range key set or has an unknown
//     kind (both reported as corruption), or if the merge fails.
//
// Callers must check i.err to distinguish "no point key" from an error.
func (i *Iterator) nextPointCurrentUserKey() bool {
	// If the user has configured a SkipPoint function and the current user key
	// would be skipped by it, there's no need to step forward looking for a
	// point key. If we were to find one, it should be skipped anyways.
	if i.opts.SkipPoint != nil && i.opts.SkipPoint(i.key) {
		return false
	}

	i.pos = iterPosCurForward

	i.iterKey, i.iterValue = i.iter.Next()
	i.stats.ForwardStepCount[InternalIterCall]++
	if i.iterKey == nil || !i.equal(i.key, i.iterKey.UserKey) {
		i.pos = iterPosNext
		return false
	}

	key := *i.iterKey
	switch key.Kind() {
	case InternalKeyKindRangeKeySet:
		// RangeKeySets must always be interleaved as the first internal key
		// for a user key.
		i.err = base.CorruptionErrorf("pebble: unexpected range key set mid-user key")
		return false

	case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
		// NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not
		// only simpler, but is also necessary for correctness due to
		// InternalKeyKindSSTableInternalObsoleteBit.
		return false

	case InternalKeyKindSet, InternalKeyKindSetWithDelete:
		i.value = i.iterValue
		return true

	case InternalKeyKindMerge:
		return i.mergeForward(key)

	default:
		i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind()))
		return false
	}
}

// mergeForward resolves a MERGE key, advancing the underlying iterator forward
// to merge with subsequent keys with the same userkey. mergeForward returns a
// boolean indicating whether or not the merge yielded a valid key. A merge may
// not yield a valid key if an error occurred, in which case i.err is non-nil,
// or the user's value merger specified the key to be deleted.
//
// mergeForward does not update iterValidityState.
692 func (i *Iterator) mergeForward(key base.InternalKey) (valid bool) { 693 var iterValue []byte 694 iterValue, _, i.err = i.iterValue.Value(nil) 695 if i.err != nil { 696 return false 697 } 698 var valueMerger ValueMerger 699 valueMerger, i.err = i.merge(key.UserKey, iterValue) 700 if i.err != nil { 701 return false 702 } 703 704 i.mergeNext(key, valueMerger) 705 if i.err != nil { 706 return false 707 } 708 709 var needDelete bool 710 var value []byte 711 value, needDelete, i.valueCloser, i.err = finishValueMerger( 712 valueMerger, true /* includesBase */) 713 i.value = base.MakeInPlaceValue(value) 714 if i.err != nil { 715 return false 716 } 717 if needDelete { 718 _ = i.closeValueCloser() 719 return false 720 } 721 return true 722 } 723 724 func (i *Iterator) closeValueCloser() error { 725 if i.valueCloser != nil { 726 i.err = i.valueCloser.Close() 727 i.valueCloser = nil 728 } 729 return i.err 730 } 731 732 func (i *Iterator) nextUserKey() { 733 if i.iterKey == nil { 734 return 735 } 736 trailer := i.iterKey.Trailer 737 done := i.iterKey.Trailer <= base.InternalKeyZeroSeqnumMaxTrailer 738 if i.iterValidityState != IterValid { 739 i.keyBuf = append(i.keyBuf[:0], i.iterKey.UserKey...) 740 i.key = i.keyBuf 741 } 742 for { 743 i.iterKey, i.iterValue = i.iter.Next() 744 i.stats.ForwardStepCount[InternalIterCall]++ 745 // NB: We're guaranteed to be on the next user key if the previous key 746 // had a zero sequence number (`done`), or the new key has a trailer 747 // greater or equal to the previous key's trailer. This is true because 748 // internal keys with the same user key are sorted by Trailer in 749 // strictly monotonically descending order. We expect the trailer 750 // optimization to trigger around 50% of the time with randomly 751 // distributed writes. We expect it to trigger very frequently when 752 // iterating through ingested sstables, which contain keys that all have 753 // the same sequence number. 
754 if done || i.iterKey == nil || i.iterKey.Trailer >= trailer { 755 break 756 } 757 if !i.equal(i.key, i.iterKey.UserKey) { 758 break 759 } 760 done = i.iterKey.Trailer <= base.InternalKeyZeroSeqnumMaxTrailer 761 trailer = i.iterKey.Trailer 762 } 763 } 764 765 func (i *Iterator) maybeSampleRead() { 766 // This method is only called when a public method of Iterator is 767 // returning, and below we exclude the case were the iterator is paused at 768 // a limit. The effect of these choices is that keys that are deleted, but 769 // are encountered during iteration, are not accounted for in the read 770 // sampling and will not cause read driven compactions, even though we are 771 // incurring cost in iterating over them. And this issue is not limited to 772 // Iterator, which does not see the effect of range deletes, which may be 773 // causing iteration work in mergingIter. It is not clear at this time 774 // whether this is a deficiency worth addressing. 775 if i.iterValidityState != IterValid { 776 return 777 } 778 if i.readState == nil { 779 return 780 } 781 if i.readSampling.forceReadSampling { 782 i.sampleRead() 783 return 784 } 785 samplingPeriod := int32(int64(readBytesPeriod) * i.readState.db.opts.Experimental.ReadSamplingMultiplier) 786 if samplingPeriod <= 0 { 787 return 788 } 789 bytesRead := uint64(len(i.key) + i.value.Len()) 790 for i.readSampling.bytesUntilReadSampling < bytesRead { 791 i.readSampling.bytesUntilReadSampling += uint64(fastrand.Uint32n(2 * uint32(samplingPeriod))) 792 // The block below tries to adjust for the case where this is the 793 // first read in a newly-opened iterator. As bytesUntilReadSampling 794 // starts off at zero, we don't want to sample the first read of 795 // every newly-opened iterator, but we do want to sample some of them. 
796 if !i.readSampling.initialSamplePassed { 797 i.readSampling.initialSamplePassed = true 798 if fastrand.Uint32n(uint32(i.readSampling.bytesUntilReadSampling)) > uint32(bytesRead) { 799 continue 800 } 801 } 802 i.sampleRead() 803 } 804 i.readSampling.bytesUntilReadSampling -= bytesRead 805 } 806 807 func (i *Iterator) sampleRead() { 808 var topFile *manifest.FileMetadata 809 topLevel, numOverlappingLevels := numLevels, 0 810 mi := i.merging 811 if mi == nil { 812 return 813 } 814 if len(mi.levels) > 1 { 815 mi.ForEachLevelIter(func(li *levelIter) bool { 816 l := manifest.LevelToInt(li.level) 817 if f := li.iterFile; f != nil { 818 var containsKey bool 819 if i.pos == iterPosNext || i.pos == iterPosCurForward || 820 i.pos == iterPosCurForwardPaused { 821 containsKey = i.cmp(f.SmallestPointKey.UserKey, i.key) <= 0 822 } else if i.pos == iterPosPrev || i.pos == iterPosCurReverse || 823 i.pos == iterPosCurReversePaused { 824 containsKey = i.cmp(f.LargestPointKey.UserKey, i.key) >= 0 825 } 826 // Do nothing if the current key is not contained in f's 827 // bounds. We could seek the LevelIterator at this level 828 // to find the right file, but the performance impacts of 829 // doing that are significant enough to negate the benefits 830 // of read sampling in the first place. See the discussion 831 // at: 832 // https://github.com/cockroachdb/pebble/pull/1041#issuecomment-763226492 833 if containsKey { 834 numOverlappingLevels++ 835 if numOverlappingLevels >= 2 { 836 // Terminate the loop early if at least 2 overlapping levels are found. 837 return true 838 } 839 topLevel = l 840 topFile = f 841 } 842 } 843 return false 844 }) 845 } 846 if topFile == nil || topLevel >= numLevels { 847 return 848 } 849 if numOverlappingLevels >= 2 { 850 allowedSeeks := topFile.AllowedSeeks.Add(-1) 851 if allowedSeeks == 0 { 852 853 // Since the compaction queue can handle duplicates, we can keep 854 // adding to the queue even once allowedSeeks hits 0. 
855 // In fact, we NEED to keep adding to the queue, because the queue 856 // is small and evicts older and possibly useful compactions. 857 topFile.AllowedSeeks.Add(topFile.InitAllowedSeeks) 858 859 read := readCompaction{ 860 start: topFile.SmallestPointKey.UserKey, 861 end: topFile.LargestPointKey.UserKey, 862 level: topLevel, 863 fileNum: topFile.FileNum, 864 } 865 i.readSampling.pendingCompactions.add(&read, i.cmp) 866 } 867 } 868 } 869 870 func (i *Iterator) findPrevEntry(limit []byte) { 871 i.iterValidityState = IterExhausted 872 i.pos = iterPosCurReverse 873 if i.opts.rangeKeys() && i.rangeKey != nil { 874 i.rangeKey.rangeKeyOnly = false 875 } 876 877 // Close the closer for the current value if one was open. 878 if i.valueCloser != nil { 879 i.err = i.valueCloser.Close() 880 i.valueCloser = nil 881 if i.err != nil { 882 i.iterValidityState = IterExhausted 883 return 884 } 885 } 886 887 var valueMerger ValueMerger 888 firstLoopIter := true 889 rangeKeyBoundary := false 890 // The code below compares with limit in multiple places. As documented in 891 // findNextEntry, this is being done to make the behavior of limit 892 // deterministic to allow for metamorphic testing. It is not required by 893 // the best-effort contract of limit. 894 for i.iterKey != nil { 895 key := *i.iterKey 896 897 // NB: We cannot pause if the current key is covered by a range key. 898 // Otherwise, the user might not ever learn of a range key that covers 899 // the key space being iterated over in which there are no point keys. 900 // Since limits are best effort, ignoring the limit in this case is 901 // allowed by the contract of limit. 
902 if firstLoopIter && limit != nil && i.cmp(limit, i.iterKey.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) { 903 i.iterValidityState = IterAtLimit 904 i.pos = iterPosCurReversePaused 905 return 906 } 907 firstLoopIter = false 908 909 if i.iterValidityState == IterValid { 910 if !i.equal(key.UserKey, i.key) { 911 // We've iterated to the previous user key. 912 i.pos = iterPosPrev 913 if valueMerger != nil { 914 var needDelete bool 915 var value []byte 916 value, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, true /* includesBase */) 917 i.value = base.MakeInPlaceValue(value) 918 if i.err == nil && needDelete { 919 // The point key at this key is deleted. If we also have 920 // a range key boundary at this key, we still want to 921 // return. Otherwise, we need to continue looking for 922 // a live key. 923 i.value = LazyValue{} 924 if rangeKeyBoundary { 925 i.rangeKey.rangeKeyOnly = true 926 } else { 927 i.iterValidityState = IterExhausted 928 if i.closeValueCloser() == nil { 929 continue 930 } 931 } 932 } 933 } 934 if i.err != nil { 935 i.iterValidityState = IterExhausted 936 } 937 return 938 } 939 } 940 941 // If the user has configured a SkipPoint function, invoke it to see 942 // whether we should skip over the current user key. 943 if i.opts.SkipPoint != nil && key.Kind() != InternalKeyKindRangeKeySet && i.opts.SkipPoint(key.UserKey) { 944 // NB: We could call prevUserKey, but in some cases the SkipPoint 945 // predicate function might be cheaper than prevUserKey's key copy 946 // and key comparison. This should be the case for MVCC suffix 947 // comparisons, for example. In the future, we could expand the 948 // SkipPoint interface to give the implementor more control over 949 // whether we skip over just the internal key, the user key, or even 950 // the key prefix. 
951 i.stats.ReverseStepCount[InternalIterCall]++ 952 i.iterKey, i.iterValue = i.iter.Prev() 953 if limit != nil && i.iterKey != nil && i.cmp(limit, i.iterKey.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) { 954 i.iterValidityState = IterAtLimit 955 i.pos = iterPosCurReversePaused 956 return 957 } 958 continue 959 } 960 961 switch key.Kind() { 962 case InternalKeyKindRangeKeySet: 963 // Range key start boundary markers are interleaved with the maximum 964 // sequence number, so if there's a point key also at this key, we 965 // must've already iterated over it. 966 // This is the final entry at this user key, so we may return 967 i.rangeKey.rangeKeyOnly = i.iterValidityState != IterValid 968 i.keyBuf = append(i.keyBuf[:0], key.UserKey...) 969 i.key = i.keyBuf 970 i.iterValidityState = IterValid 971 i.saveRangeKey() 972 // In all other cases, previous iteration requires advancing to 973 // iterPosPrev in order to determine if the key is live and 974 // unshadowed by another key at the same user key. In this case, 975 // because range key start boundary markers are always interleaved 976 // at the maximum sequence number, we know that there aren't any 977 // additional keys with the same user key in the backward direction. 978 // 979 // We Prev the underlying iterator once anyways for consistency, so 980 // that we can maintain the invariant during backward iteration that 981 // i.iterPos = iterPosPrev. 982 i.stats.ReverseStepCount[InternalIterCall]++ 983 i.iterKey, i.iterValue = i.iter.Prev() 984 985 // Set rangeKeyBoundary so that on the next iteration, we know to 986 // return the key even if the MERGE point key is deleted. 987 rangeKeyBoundary = true 988 989 case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: 990 i.value = LazyValue{} 991 i.iterValidityState = IterExhausted 992 valueMerger = nil 993 i.iterKey, i.iterValue = i.iter.Prev() 994 i.stats.ReverseStepCount[InternalIterCall]++ 995 // Compare with the limit. 
We could optimize by only checking when 996 // we step to the previous user key, but detecting that requires a 997 // comparison too. Note that this position may already passed a 998 // number of versions of this user key, but they are all deleted, so 999 // the fact that a subsequent Prev*() call will not see them is 1000 // harmless. Also note that this is the only place in the loop, 1001 // other than the firstLoopIter and SkipPoint cases above, where we 1002 // could step to a different user key and start processing it for 1003 // returning to the caller. 1004 if limit != nil && i.iterKey != nil && i.cmp(limit, i.iterKey.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) { 1005 i.iterValidityState = IterAtLimit 1006 i.pos = iterPosCurReversePaused 1007 return 1008 } 1009 continue 1010 1011 case InternalKeyKindSet, InternalKeyKindSetWithDelete: 1012 i.keyBuf = append(i.keyBuf[:0], key.UserKey...) 1013 i.key = i.keyBuf 1014 // iterValue is owned by i.iter and could change after the Prev() 1015 // call, so use valueBuf instead. Note that valueBuf is only used 1016 // in this one instance; everywhere else (eg. in findNextEntry), 1017 // we just point i.value to the unsafe i.iter-owned value buffer. 1018 i.value, i.valueBuf = i.iterValue.Clone(i.valueBuf[:0], &i.fetcher) 1019 i.saveRangeKey() 1020 i.iterValidityState = IterValid 1021 i.iterKey, i.iterValue = i.iter.Prev() 1022 i.stats.ReverseStepCount[InternalIterCall]++ 1023 valueMerger = nil 1024 continue 1025 1026 case InternalKeyKindMerge: 1027 if i.iterValidityState == IterExhausted { 1028 i.keyBuf = append(i.keyBuf[:0], key.UserKey...) 
1029 i.key = i.keyBuf 1030 i.saveRangeKey() 1031 var iterValue []byte 1032 iterValue, _, i.err = i.iterValue.Value(nil) 1033 if i.err != nil { 1034 return 1035 } 1036 valueMerger, i.err = i.merge(i.key, iterValue) 1037 if i.err != nil { 1038 return 1039 } 1040 i.iterValidityState = IterValid 1041 } else if valueMerger == nil { 1042 // Extract value before iterValue since we use value before iterValue 1043 // and the underlying iterator is not required to provide backing 1044 // memory for both simultaneously. 1045 var value []byte 1046 var callerOwned bool 1047 value, callerOwned, i.err = i.value.Value(i.lazyValueBuf) 1048 if callerOwned { 1049 i.lazyValueBuf = value[:0] 1050 } 1051 if i.err != nil { 1052 return 1053 } 1054 valueMerger, i.err = i.merge(i.key, value) 1055 var iterValue []byte 1056 iterValue, _, i.err = i.iterValue.Value(nil) 1057 if i.err != nil { 1058 return 1059 } 1060 if i.err == nil { 1061 i.err = valueMerger.MergeNewer(iterValue) 1062 } 1063 if i.err != nil { 1064 i.iterValidityState = IterExhausted 1065 return 1066 } 1067 } else { 1068 var iterValue []byte 1069 iterValue, _, i.err = i.iterValue.Value(nil) 1070 if i.err != nil { 1071 return 1072 } 1073 i.err = valueMerger.MergeNewer(iterValue) 1074 if i.err != nil { 1075 i.iterValidityState = IterExhausted 1076 return 1077 } 1078 } 1079 i.iterKey, i.iterValue = i.iter.Prev() 1080 i.stats.ReverseStepCount[InternalIterCall]++ 1081 continue 1082 1083 default: 1084 i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind())) 1085 i.iterValidityState = IterExhausted 1086 return 1087 } 1088 } 1089 1090 // i.iterKey == nil, so broke out of the preceding loop. 
1091 if i.iterValidityState == IterValid { 1092 i.pos = iterPosPrev 1093 if valueMerger != nil { 1094 var needDelete bool 1095 var value []byte 1096 value, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, true /* includesBase */) 1097 i.value = base.MakeInPlaceValue(value) 1098 if i.err == nil && needDelete { 1099 i.key = nil 1100 i.value = LazyValue{} 1101 i.iterValidityState = IterExhausted 1102 } 1103 } 1104 if i.err != nil { 1105 i.iterValidityState = IterExhausted 1106 } 1107 } 1108 } 1109 1110 func (i *Iterator) prevUserKey() { 1111 if i.iterKey == nil { 1112 return 1113 } 1114 if i.iterValidityState != IterValid { 1115 // If we're going to compare against the prev key, we need to save the 1116 // current key. 1117 i.keyBuf = append(i.keyBuf[:0], i.iterKey.UserKey...) 1118 i.key = i.keyBuf 1119 } 1120 for { 1121 i.iterKey, i.iterValue = i.iter.Prev() 1122 i.stats.ReverseStepCount[InternalIterCall]++ 1123 if i.iterKey == nil { 1124 break 1125 } 1126 if !i.equal(i.key, i.iterKey.UserKey) { 1127 break 1128 } 1129 } 1130 } 1131 1132 func (i *Iterator) mergeNext(key InternalKey, valueMerger ValueMerger) { 1133 // Save the current key. 1134 i.keyBuf = append(i.keyBuf[:0], key.UserKey...) 1135 i.key = i.keyBuf 1136 1137 // Loop looking for older values for this key and merging them. 1138 for { 1139 i.iterKey, i.iterValue = i.iter.Next() 1140 i.stats.ForwardStepCount[InternalIterCall]++ 1141 if i.iterKey == nil { 1142 i.pos = iterPosNext 1143 return 1144 } 1145 key = *i.iterKey 1146 if !i.equal(i.key, key.UserKey) { 1147 // We've advanced to the next key. 1148 i.pos = iterPosNext 1149 return 1150 } 1151 switch key.Kind() { 1152 case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: 1153 // We've hit a deletion tombstone. Return everything up to this 1154 // point. 
1155 // 1156 // NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not 1157 // only simpler, but is also necessary for correctness due to 1158 // InternalKeyKindSSTableInternalObsoleteBit. 1159 return 1160 1161 case InternalKeyKindSet, InternalKeyKindSetWithDelete: 1162 // We've hit a Set value. Merge with the existing value and return. 1163 var iterValue []byte 1164 iterValue, _, i.err = i.iterValue.Value(nil) 1165 if i.err != nil { 1166 return 1167 } 1168 i.err = valueMerger.MergeOlder(iterValue) 1169 return 1170 1171 case InternalKeyKindMerge: 1172 // We've hit another Merge value. Merge with the existing value and 1173 // continue looping. 1174 var iterValue []byte 1175 iterValue, _, i.err = i.iterValue.Value(nil) 1176 if i.err != nil { 1177 return 1178 } 1179 i.err = valueMerger.MergeOlder(iterValue) 1180 if i.err != nil { 1181 return 1182 } 1183 continue 1184 1185 case InternalKeyKindRangeKeySet: 1186 // The RANGEKEYSET marker must sort before a MERGE at the same user key. 1187 i.err = base.CorruptionErrorf("pebble: out of order range key marker") 1188 return 1189 1190 default: 1191 i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind())) 1192 return 1193 } 1194 } 1195 } 1196 1197 // SeekGE moves the iterator to the first key/value pair whose key is greater 1198 // than or equal to the given key. Returns true if the iterator is pointing at 1199 // a valid entry and false otherwise. 1200 func (i *Iterator) SeekGE(key []byte) bool { 1201 return i.SeekGEWithLimit(key, nil) == IterValid 1202 } 1203 1204 // SeekGEWithLimit moves the iterator to the first key/value pair whose key is 1205 // greater than or equal to the given key. 1206 // 1207 // If limit is provided, it serves as a best-effort exclusive limit. If the 1208 // first key greater than or equal to the given search key is also greater than 1209 // or equal to limit, the Iterator may pause and return IterAtLimit. 
// Because limits are best-effort, SeekGEWithLimit may return a key beyond
// limit.
//
// If the Iterator is configured to iterate over range keys, SeekGEWithLimit
// guarantees it will surface any range keys with bounds overlapping the
// keyspace [key, limit).
func (i *Iterator) SeekGEWithLimit(key []byte, limit []byte) IterValidityState {
	if i.rangeKey != nil {
		// NB: Check Valid() before clearing requiresReposition.
		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
		// If we have a range key but did not expose it at the previous iterator
		// position (because the iterator was not at a valid position), updated
		// must be true. This ensures that after an iterator op sequence like:
		//   - Next()             → (IterValid, RangeBounds() = [a,b))
		//   - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
		//   - SeekGE(...)        → (IterValid, RangeBounds() = [a,b))
		// the iterator returns RangeKeyChanged()=true.
		//
		// The remainder of this function will only update i.rangeKey.updated if
		// the iterator moves into a new range key, or out of the current range
		// key.
		i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
	}
	// Capture the state left by the previous positioning operation before it
	// is reset below; the optimizations further down consult these copies.
	lastPositioningOp := i.lastPositioningOp
	hasPrefix := i.hasPrefix
	// Set it to unknown, since this operation may not succeed, in which case
	// the SeekGE following this should not make any assumption about iterator
	// position.
	i.lastPositioningOp = unknownLastPositionOp
	i.requiresReposition = false
	i.err = nil // clear cached iteration error
	i.hasPrefix = false
	i.stats.ForwardSeekCount[InterfaceCall]++
	// Clamp the seek key to the configured bounds: a key below the lower
	// bound is replaced by the lower bound, a key above the upper bound by
	// the upper bound.
	if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 {
		key = lowerBound
	} else if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 {
		key = upperBound
	}
	// seekInternalIter is cleared below when findNextEntry can resume from
	// the internal iterator's current position instead of seeking.
	seekInternalIter := true

	var flags base.SeekGEFlags
	if i.batchJustRefreshed {
		i.batchJustRefreshed = false
		flags = flags.EnableBatchJustRefreshed()
	}
	if lastPositioningOp == seekGELastPositioningOp {
		cmp := i.cmp(i.prefixOrFullSeekKey, key)
		// If this seek is to the same or later key, and the iterator is
		// already positioned there, this is a noop. This can be helpful for
		// sparse key spaces that have many deleted keys, where one can avoid
		// the overhead of iterating past them again and again.
		if cmp <= 0 {
			if !flags.BatchJustRefreshed() &&
				(i.iterValidityState == IterExhausted ||
					(i.iterValidityState == IterValid && i.cmp(key, i.key) <= 0 &&
						(limit == nil || i.cmp(i.key, limit) < 0))) {
				// Noop
				//
				// With invariants enabled, disableSeekOpt may veto the noop
				// (unless forceEnableSeekOpt is set), in which case execution
				// falls through to the regular path below.
				if !invariants.Enabled || !disableSeekOpt(key, uintptr(unsafe.Pointer(i))) || i.forceEnableSeekOpt {
					i.lastPositioningOp = seekGELastPositioningOp
					return i.iterValidityState
				}
			}
			// cmp == 0 is not safe to optimize since
			// - i.pos could be at iterPosNext, due to a merge.
			// - Even if i.pos were at iterPosCurForward, we could have a DELETE,
			//   SET pair for a key, and the iterator would have moved past DELETE
			//   but stayed at iterPosCurForward. A similar situation occurs for a
			//   MERGE, SET pair where the MERGE is consumed and the iterator is
			//   at the SET.
			// We also leverage the IterAtLimit <=> i.pos invariant defined in the
			// comment on iterValidityState, to exclude any cases where i.pos
			// is iterPosCur{Forward,Reverse}Paused. This avoids the need to
			// special-case those iterator positions and their interactions with
			// TrySeekUsingNext, as the main uses for TrySeekUsingNext in CockroachDB
			// do not use limited Seeks in the first place.
			if cmp < 0 && i.iterValidityState != IterAtLimit && limit == nil {
				flags = flags.EnableTrySeekUsingNext()
			}
			if invariants.Enabled && flags.TrySeekUsingNext() && !i.forceEnableSeekOpt && disableSeekOpt(key, uintptr(unsafe.Pointer(i))) {
				flags = flags.DisableTrySeekUsingNext()
			}
			if !flags.BatchJustRefreshed() && i.pos == iterPosCurForwardPaused && i.cmp(key, i.iterKey.UserKey) <= 0 {
				// Have some work to do, but don't need to seek, and we can
				// start doing findNextEntry from i.iterKey.
				seekInternalIter = false
			}
		}
	}
	// Check for another TrySeekUsingNext optimization opportunity, currently
	// specifically tailored to external iterators. This case is intended to
	// trigger in instances of Seek-ing with monotonically increasing keys with
	// Nexts interspersed. At the time of writing, this is the case for
	// CockroachDB scans. This optimization is important for external iterators
	// to avoid re-seeking within an already-exhausted sstable. It is not always
	// a performance win more generally, so we restrict it to external iterators
	// that are configured to only use forward positioning operations.
	//
	// TODO(jackson): This optimization should be obsolete once we introduce and
	// use the NextPrefix iterator positioning operation.
	if seekInternalIter && i.forwardOnly && lastPositioningOp != invalidatedLastPositionOp &&
		i.pos == iterPosCurForward && !hasPrefix && i.iterValidityState == IterValid &&
		i.cmp(key, i.iterKey.UserKey) > 0 {
		flags = flags.EnableTrySeekUsingNext()
		if invariants.Enabled && flags.TrySeekUsingNext() && !i.forceEnableSeekOpt && disableSeekOpt(key, uintptr(unsafe.Pointer(i))) {
			flags = flags.DisableTrySeekUsingNext()
		}
	}
	if seekInternalIter {
		i.iterKey, i.iterValue = i.iter.SeekGE(key, flags)
		i.stats.ForwardSeekCount[InternalIterCall]++
	}
	i.findNextEntry(limit)
	i.maybeSampleRead()
	if i.Error() == nil {
		// Prepare state for a future noop optimization.
		i.prefixOrFullSeekKey = append(i.prefixOrFullSeekKey[:0], key...)
		i.lastPositioningOp = seekGELastPositioningOp
	}
	return i.iterValidityState
}

// SeekPrefixGE moves the iterator to the first key/value pair whose key is
// greater than or equal to the given key and which has the same "prefix" as
// the given key. The prefix for a key is determined by the user-defined
// Comparer.Split function. The iterator will not observe keys not matching the
// "prefix" of the search key. Calling SeekPrefixGE puts the iterator in prefix
// iteration mode. The iterator remains in prefix iteration until a subsequent
// call to another absolute positioning method (SeekGE, SeekLT, First,
// Last). Reverse iteration (Prev) is not supported when an iterator is in
// prefix iteration mode. Returns true if the iterator is pointing at a valid
// entry and false otherwise.
//
// The semantics of SeekPrefixGE are slightly unusual and designed for
// iteration to be able to take advantage of bloom filters that have been
// created on the "prefix". If you're not using bloom filters, there is no
// reason to use SeekPrefixGE.
1345 // 1346 // An example Split function may separate a timestamp suffix from the prefix of 1347 // the key. 1348 // 1349 // Split(<key>@<timestamp>) -> <key> 1350 // 1351 // Consider the keys "a@1", "a@2", "aa@3", "aa@4". The prefixes for these keys 1352 // are "a", and "aa". Note that despite "a" and "aa" sharing a prefix by the 1353 // usual definition, those prefixes differ by the definition of the Split 1354 // function. To see how this works, consider the following set of calls on this 1355 // data set: 1356 // 1357 // SeekPrefixGE("a@0") -> "a@1" 1358 // Next() -> "a@2" 1359 // Next() -> EOF 1360 // 1361 // If you're just looking to iterate over keys with a shared prefix, as 1362 // defined by the configured comparer, set iterator bounds instead: 1363 // 1364 // iter := db.NewIter(&pebble.IterOptions{ 1365 // LowerBound: []byte("prefix"), 1366 // UpperBound: []byte("prefiy"), 1367 // }) 1368 // for iter.First(); iter.Valid(); iter.Next() { 1369 // // Only keys beginning with "prefix" will be visited. 1370 // } 1371 // 1372 // See ExampleIterator_SeekPrefixGE for a working example. 1373 // 1374 // When iterating with range keys enabled, all range keys encountered are 1375 // truncated to the seek key's prefix's bounds. The truncation of the upper 1376 // bound requires that the database's Comparer is configured with a 1377 // ImmediateSuccessor method. For example, a SeekPrefixGE("a@9") call with the 1378 // prefix "a" will truncate range key bounds to [a,ImmediateSuccessor(a)]. 1379 func (i *Iterator) SeekPrefixGE(key []byte) bool { 1380 if i.rangeKey != nil { 1381 // NB: Check Valid() before clearing requiresReposition. 1382 i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid() 1383 // If we have a range key but did not expose it at the previous iterator 1384 // position (because the iterator was not at a valid position), updated 1385 // must be true. 
This ensures that after an iterator op sequence like: 1386 // - Next() → (IterValid, RangeBounds() = [a,b)) 1387 // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -) 1388 // - SeekPrefixGE(...) → (IterValid, RangeBounds() = [a,b)) 1389 // the iterator returns RangeKeyChanged()=true. 1390 // 1391 // The remainder of this function will only update i.rangeKey.updated if 1392 // the iterator moves into a new range key, or out of the current range 1393 // key. 1394 i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys() 1395 } 1396 lastPositioningOp := i.lastPositioningOp 1397 // Set it to unknown, since this operation may not succeed, in which case 1398 // the SeekPrefixGE following this should not make any assumption about 1399 // iterator position. 1400 i.lastPositioningOp = unknownLastPositionOp 1401 i.requiresReposition = false 1402 i.err = nil // clear cached iteration error 1403 i.stats.ForwardSeekCount[InterfaceCall]++ 1404 if i.comparer.Split == nil { 1405 panic("pebble: split must be provided for SeekPrefixGE") 1406 } 1407 if i.comparer.ImmediateSuccessor == nil && i.opts.KeyTypes != IterKeyTypePointsOnly { 1408 panic("pebble: ImmediateSuccessor must be provided for SeekPrefixGE with range keys") 1409 } 1410 prefixLen := i.split(key) 1411 keyPrefix := key[:prefixLen] 1412 var flags base.SeekGEFlags 1413 if i.batchJustRefreshed { 1414 flags = flags.EnableBatchJustRefreshed() 1415 i.batchJustRefreshed = false 1416 } 1417 if lastPositioningOp == seekPrefixGELastPositioningOp { 1418 if !i.hasPrefix { 1419 panic("lastPositioningOpsIsSeekPrefixGE is true, but hasPrefix is false") 1420 } 1421 // The iterator has not been repositioned after the last SeekPrefixGE. 1422 // See if we are seeking to a larger key, since then we can optimize 1423 // the seek by using next. Note that we could also optimize if Next 1424 // has been called, if the iterator is not exhausted and the current 1425 // position is <= the seek key. 
We are keeping this limited for now 1426 // since such optimizations require care for correctness, and to not 1427 // become de-optimizations (if one usually has to do all the next 1428 // calls and then the seek). This SeekPrefixGE optimization 1429 // specifically benefits CockroachDB. 1430 cmp := i.cmp(i.prefixOrFullSeekKey, keyPrefix) 1431 // cmp == 0 is not safe to optimize since 1432 // - i.pos could be at iterPosNext, due to a merge. 1433 // - Even if i.pos were at iterPosCurForward, we could have a DELETE, 1434 // SET pair for a key, and the iterator would have moved past DELETE 1435 // but stayed at iterPosCurForward. A similar situation occurs for a 1436 // MERGE, SET pair where the MERGE is consumed and the iterator is 1437 // at the SET. 1438 // In general some versions of i.prefix could have been consumed by 1439 // the iterator, so we only optimize for cmp < 0. 1440 if cmp < 0 { 1441 flags = flags.EnableTrySeekUsingNext() 1442 } 1443 if invariants.Enabled && flags.TrySeekUsingNext() && !i.forceEnableSeekOpt && disableSeekOpt(key, uintptr(unsafe.Pointer(i))) { 1444 flags = flags.DisableTrySeekUsingNext() 1445 } 1446 } 1447 // Make a copy of the prefix so that modifications to the key after 1448 // SeekPrefixGE returns does not affect the stored prefix. 
1449 if cap(i.prefixOrFullSeekKey) < prefixLen { 1450 i.prefixOrFullSeekKey = make([]byte, prefixLen) 1451 } else { 1452 i.prefixOrFullSeekKey = i.prefixOrFullSeekKey[:prefixLen] 1453 } 1454 i.hasPrefix = true 1455 copy(i.prefixOrFullSeekKey, keyPrefix) 1456 1457 if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 { 1458 if n := i.split(lowerBound); !bytes.Equal(i.prefixOrFullSeekKey, lowerBound[:n]) { 1459 i.err = errors.New("pebble: SeekPrefixGE supplied with key outside of lower bound") 1460 i.iterValidityState = IterExhausted 1461 return false 1462 } 1463 key = lowerBound 1464 } else if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 { 1465 if n := i.split(upperBound); !bytes.Equal(i.prefixOrFullSeekKey, upperBound[:n]) { 1466 i.err = errors.New("pebble: SeekPrefixGE supplied with key outside of upper bound") 1467 i.iterValidityState = IterExhausted 1468 return false 1469 } 1470 key = upperBound 1471 } 1472 i.iterKey, i.iterValue = i.iter.SeekPrefixGE(i.prefixOrFullSeekKey, key, flags) 1473 i.stats.ForwardSeekCount[InternalIterCall]++ 1474 i.findNextEntry(nil) 1475 i.maybeSampleRead() 1476 if i.Error() == nil { 1477 i.lastPositioningOp = seekPrefixGELastPositioningOp 1478 } 1479 return i.iterValidityState == IterValid 1480 } 1481 1482 // Deterministic disabling of the seek optimizations. It uses the iterator 1483 // pointer, since we want diversity in iterator behavior for the same key. Used 1484 // for tests. 1485 func disableSeekOpt(key []byte, ptr uintptr) bool { 1486 // Fibonacci hash https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ 1487 simpleHash := (11400714819323198485 * uint64(ptr)) >> 63 1488 return key != nil && key[0]&byte(1) == 0 && simpleHash == 0 1489 } 1490 1491 // SeekLT moves the iterator to the last key/value pair whose key is less than 1492 // the given key. 
// Returns true if the iterator is pointing at a valid entry and
// false otherwise.
func (i *Iterator) SeekLT(key []byte) bool {
	return i.SeekLTWithLimit(key, nil) == IterValid
}

// SeekLTWithLimit moves the iterator to the last key/value pair whose key is
// less than the given key.
//
// If limit is provided, it serves as a best-effort inclusive limit. If the last
// key less than the given search key is also less than limit, the Iterator may
// pause and return IterAtLimit. Because limits are best-effort, SeekLTWithLimit
// may return a key beyond limit.
//
// If the Iterator is configured to iterate over range keys, SeekLTWithLimit
// guarantees it will surface any range keys with bounds overlapping the
// keyspace up to limit.
func (i *Iterator) SeekLTWithLimit(key []byte, limit []byte) IterValidityState {
	if i.rangeKey != nil {
		// NB: Check Valid() before clearing requiresReposition.
		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
		// If we have a range key but did not expose it at the previous iterator
		// position (because the iterator was not at a valid position), updated
		// must be true. This ensures that after an iterator op sequence like:
		//   - Next()               → (IterValid, RangeBounds() = [a,b))
		//   - NextWithLimit(...)   → (IterAtLimit, RangeBounds() = -)
		//   - SeekLTWithLimit(...) → (IterValid, RangeBounds() = [a,b))
		// the iterator returns RangeKeyChanged()=true.
		//
		// The remainder of this function will only update i.rangeKey.updated if
		// the iterator moves into a new range key, or out of the current range
		// key.
		i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
	}
	// Capture the previous positioning op before it is reset below; the noop
	// optimization further down consults this copy.
	lastPositioningOp := i.lastPositioningOp
	// Set it to unknown, since this operation may not succeed, in which case
	// the SeekLT following this should not make any assumption about iterator
	// position.
	i.lastPositioningOp = unknownLastPositionOp
	i.batchJustRefreshed = false
	i.requiresReposition = false
	i.err = nil // clear cached iteration error
	i.stats.ReverseSeekCount[InterfaceCall]++
	// Clamp the seek key to the configured bounds: a key above the upper
	// bound is replaced by the upper bound, a key below the lower bound by
	// the lower bound.
	if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 {
		key = upperBound
	} else if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 {
		key = lowerBound
	}
	i.hasPrefix = false
	// seekInternalIter is cleared below when findPrevEntry can resume from
	// the internal iterator's current position instead of seeking.
	seekInternalIter := true
	// The following noop optimization only applies when i.batch == nil, since
	// an iterator over a batch is iterating over mutable data, that may have
	// changed since the last seek.
	if lastPositioningOp == seekLTLastPositioningOp && i.batch == nil {
		cmp := i.cmp(key, i.prefixOrFullSeekKey)
		// If this seek is to the same or earlier key, and the iterator is
		// already positioned there, this is a noop. This can be helpful for
		// sparse key spaces that have many deleted keys, where one can avoid
		// the overhead of iterating past them again and again.
		if cmp <= 0 {
			// NB: when pos != iterPosCurReversePaused, the invariant
			// documented earlier implies that iterValidityState !=
			// IterAtLimit.
			if i.iterValidityState == IterExhausted ||
				(i.iterValidityState == IterValid && i.cmp(i.key, key) < 0 &&
					(limit == nil || i.cmp(limit, i.key) <= 0)) {
				// With invariants enabled, disableSeekOpt may veto the noop,
				// in which case execution falls through to the regular path.
				if !invariants.Enabled || !disableSeekOpt(key, uintptr(unsafe.Pointer(i))) {
					i.lastPositioningOp = seekLTLastPositioningOp
					return i.iterValidityState
				}
			}
			if i.pos == iterPosCurReversePaused && i.cmp(i.iterKey.UserKey, key) < 0 {
				// Have some work to do, but don't need to seek, and we can
				// start doing findPrevEntry from i.iterKey.
				seekInternalIter = false
			}
		}
	}
	if seekInternalIter {
		i.iterKey, i.iterValue = i.iter.SeekLT(key, base.SeekLTFlagsNone)
		i.stats.ReverseSeekCount[InternalIterCall]++
	}
	i.findPrevEntry(limit)
	i.maybeSampleRead()
	if i.Error() == nil && i.batch == nil {
		// Prepare state for a future noop optimization.
		i.prefixOrFullSeekKey = append(i.prefixOrFullSeekKey[:0], key...)
		i.lastPositioningOp = seekLTLastPositioningOp
	}
	return i.iterValidityState
}

// First moves the iterator to the first key/value pair. Returns true if the
// iterator is pointing at a valid entry and false otherwise.
func (i *Iterator) First() bool {
	if i.rangeKey != nil {
		// NB: Check Valid() before clearing requiresReposition.
		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
		// If we have a range key but did not expose it at the previous iterator
		// position (because the iterator was not at a valid position), updated
		// must be true. This ensures that after an iterator op sequence like:
		//   - Next()             → (IterValid, RangeBounds() = [a,b))
		//   - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
		//   - First(...)         → (IterValid, RangeBounds() = [a,b))
		// the iterator returns RangeKeyChanged()=true.
1555 if i.iterValidityState == IterExhausted || 1556 (i.iterValidityState == IterValid && i.cmp(i.key, key) < 0 && 1557 (limit == nil || i.cmp(limit, i.key) <= 0)) { 1558 if !invariants.Enabled || !disableSeekOpt(key, uintptr(unsafe.Pointer(i))) { 1559 i.lastPositioningOp = seekLTLastPositioningOp 1560 return i.iterValidityState 1561 } 1562 } 1563 if i.pos == iterPosCurReversePaused && i.cmp(i.iterKey.UserKey, key) < 0 { 1564 // Have some work to do, but don't need to seek, and we can 1565 // start doing findPrevEntry from i.iterKey. 1566 seekInternalIter = false 1567 } 1568 } 1569 } 1570 if seekInternalIter { 1571 i.iterKey, i.iterValue = i.iter.SeekLT(key, base.SeekLTFlagsNone) 1572 i.stats.ReverseSeekCount[InternalIterCall]++ 1573 } 1574 i.findPrevEntry(limit) 1575 i.maybeSampleRead() 1576 if i.Error() == nil && i.batch == nil { 1577 // Prepare state for a future noop optimization. 1578 i.prefixOrFullSeekKey = append(i.prefixOrFullSeekKey[:0], key...) 1579 i.lastPositioningOp = seekLTLastPositioningOp 1580 } 1581 return i.iterValidityState 1582 } 1583 1584 // First moves the iterator the the first key/value pair. Returns true if the 1585 // iterator is pointing at a valid entry and false otherwise. 1586 func (i *Iterator) First() bool { 1587 if i.rangeKey != nil { 1588 // NB: Check Valid() before clearing requiresReposition. 1589 i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid() 1590 // If we have a range key but did not expose it at the previous iterator 1591 // position (because the iterator was not at a valid position), updated 1592 // must be true. This ensures that after an iterator op sequence like: 1593 // - Next() → (IterValid, RangeBounds() = [a,b)) 1594 // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -) 1595 // - First(...) → (IterValid, RangeBounds() = [a,b)) 1596 // the iterator returns RangeKeyChanged()=true. 
1597 // 1598 // The remainder of this function will only update i.rangeKey.updated if 1599 // the iterator moves into a new range key, or out of the current range 1600 // key. 1601 i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys() 1602 } 1603 i.err = nil // clear cached iteration error 1604 i.hasPrefix = false 1605 i.batchJustRefreshed = false 1606 i.lastPositioningOp = unknownLastPositionOp 1607 i.requiresReposition = false 1608 i.stats.ForwardSeekCount[InterfaceCall]++ 1609 1610 i.iterFirstWithinBounds() 1611 i.findNextEntry(nil) 1612 i.maybeSampleRead() 1613 return i.iterValidityState == IterValid 1614 } 1615 1616 // Last moves the iterator the the last key/value pair. Returns true if the 1617 // iterator is pointing at a valid entry and false otherwise. 1618 func (i *Iterator) Last() bool { 1619 if i.rangeKey != nil { 1620 // NB: Check Valid() before clearing requiresReposition. 1621 i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid() 1622 // If we have a range key but did not expose it at the previous iterator 1623 // position (because the iterator was not at a valid position), updated 1624 // must be true. This ensures that after an iterator op sequence like: 1625 // - Next() → (IterValid, RangeBounds() = [a,b)) 1626 // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -) 1627 // - Last(...) → (IterValid, RangeBounds() = [a,b)) 1628 // the iterator returns RangeKeyChanged()=true. 1629 // 1630 // The remainder of this function will only update i.rangeKey.updated if 1631 // the iterator moves into a new range key, or out of the current range 1632 // key. 
1633 i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys() 1634 } 1635 i.err = nil // clear cached iteration error 1636 i.hasPrefix = false 1637 i.batchJustRefreshed = false 1638 i.lastPositioningOp = unknownLastPositionOp 1639 i.requiresReposition = false 1640 i.stats.ReverseSeekCount[InterfaceCall]++ 1641 1642 i.iterLastWithinBounds() 1643 i.findPrevEntry(nil) 1644 i.maybeSampleRead() 1645 return i.iterValidityState == IterValid 1646 } 1647 1648 // Next moves the iterator to the next key/value pair. Returns true if the 1649 // iterator is pointing at a valid entry and false otherwise. 1650 func (i *Iterator) Next() bool { 1651 return i.nextWithLimit(nil) == IterValid 1652 } 1653 1654 // NextWithLimit moves the iterator to the next key/value pair. 1655 // 1656 // If limit is provided, it serves as a best-effort exclusive limit. If the next 1657 // key is greater than or equal to limit, the Iterator may pause and return 1658 // IterAtLimit. Because limits are best-effort, NextWithLimit may return a key 1659 // beyond limit. 1660 // 1661 // If the Iterator is configured to iterate over range keys, NextWithLimit 1662 // guarantees it will surface any range keys with bounds overlapping the 1663 // keyspace up to limit. 1664 func (i *Iterator) NextWithLimit(limit []byte) IterValidityState { 1665 return i.nextWithLimit(limit) 1666 } 1667 1668 // NextPrefix moves the iterator to the next key/value pair with a key 1669 // containing a different prefix than the current key. Prefixes are determined 1670 // by Comparer.Split. Exhausts the iterator if invoked while in prefix-iteration 1671 // mode. 1672 // 1673 // It is not permitted to invoke NextPrefix while at a IterAtLimit position. 1674 // When called in this condition, NextPrefix has non-deterministic behavior. 1675 // 1676 // It is not permitted to invoke NextPrefix when the Iterator has an 1677 // upper-bound that is a versioned MVCC key (see the comment for 1678 // Comparer.Split). 
// It returns an error in this case.
func (i *Iterator) NextPrefix() bool {
	if i.nextPrefixNotPermittedByUpperBound {
		i.lastPositioningOp = unknownLastPositionOp
		i.requiresReposition = false
		i.err = errors.Errorf("NextPrefix not permitted with upper bound %s",
			i.comparer.FormatKey(i.opts.UpperBound))
		i.iterValidityState = IterExhausted
		return false
	}
	// In prefix-iteration mode there is no "next prefix": exhaust.
	if i.hasPrefix {
		i.iterValidityState = IterExhausted
		return false
	}
	return i.nextPrefix() == IterValid
}

// nextPrefix implements NextPrefix once NextPrefix has ruled out the
// disallowed-upper-bound and prefix-iteration cases. Depending on i.pos it
// moves the internal iterator beyond the prefix of i.key (or, at a paused
// position, behaves as a prefix-agnostic Next), then calls findNextEntry with
// no limit and returns the resulting validity state.
func (i *Iterator) nextPrefix() IterValidityState {
	if i.rangeKey != nil {
		// NB: Check Valid() before clearing requiresReposition.
		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
		// If we have a range key but did not expose it at the previous iterator
		// position (because the iterator was not at a valid position), updated
		// must be true. This ensures that after an iterator op sequence like:
		//   - Next()             → (IterValid, RangeBounds() = [a,b))
		//   - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
		//   - NextPrefix()       → (IterValid, RangeBounds() = [a,b))
		// the iterator returns RangeKeyChanged()=true.
		//
		// The remainder of this function will only update i.rangeKey.updated if
		// the iterator moves into a new range key, or out of the current range
		// key.
		i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
	}

	// Although NextPrefix documents that behavior at IterAtLimit is undefined,
	// this function handles these cases as a simple prefix-agnostic Next. This
	// is done for deterministic behavior in the metamorphic tests.
	//
	// TODO(jackson): If the metamorphic test operation generator is adjusted to
	// make generation of some operations conditional on the previous
	// operations, then we can remove this behavior and explicitly error.

	i.lastPositioningOp = unknownLastPositionOp
	i.requiresReposition = false
	switch i.pos {
	case iterPosCurForward:
		// Positioned on the current key. Advance to the next prefix.
		i.internalNextPrefix(i.split(i.key))
	case iterPosCurForwardPaused:
		// Positioned at a limit. Implement as a prefix-agnostic Next. See TODO
		// up above. The iterator is already positioned at the next key.
	case iterPosCurReverse:
		// Switching directions.
		// Unless the iterator was exhausted, reverse iteration needs to
		// position the iterator at iterPosPrev.
		if i.iterKey != nil {
			i.err = errors.New("switching from reverse to forward but iter is not at prev")
			i.iterValidityState = IterExhausted
			return i.iterValidityState
		}
		// The Iterator is exhausted and i.iter is positioned before the first
		// key. Reposition to point to the first internal key.
		i.iterFirstWithinBounds()
	case iterPosCurReversePaused:
		// Positioned at a limit. Implement as a prefix-agnostic Next. See TODO
		// up above.
		//
		// Switching directions; The iterator must not be exhausted since it
		// paused.
		if i.iterKey == nil {
			i.err = errors.New("switching paused from reverse to forward but iter is exhausted")
			i.iterValidityState = IterExhausted
			return i.iterValidityState
		}
		i.nextUserKey()
	case iterPosPrev:
		// The underlying iterator is pointed to the previous key (this can
		// only happen when switching iteration directions).
		if i.iterKey == nil {
			// We're positioned before the first key. Need to reposition to point to
			// the first key.
			i.iterFirstWithinBounds()
		} else {
			// Move the internal iterator back onto the user key stored in
			// i.key. iterPosPrev guarantees that it's positioned at the last
			// key with the user key less than i.key, so we're guaranteed to
			// land on the correct key with a single Next.
			i.iterKey, i.iterValue = i.iter.Next()
			if invariants.Enabled && !i.equal(i.iterKey.UserKey, i.key) {
				i.opts.logger.Fatalf("pebble: invariant violation: Nexting internal iterator from iterPosPrev landed on %q, not %q",
					i.iterKey.UserKey, i.key)
			}
		}
		// The internal iterator is now positioned at i.key. Advance to the next
		// prefix.
		i.internalNextPrefix(i.split(i.key))
	case iterPosNext:
		// Already positioned on the next key. Only call internalNextPrefix if
		// the next key shares the same prefix.
		if i.iterKey != nil {
			currKeyPrefixLen := i.split(i.key)
			iterKeyPrefixLen := i.split(i.iterKey.UserKey)
			if bytes.Equal(i.iterKey.UserKey[:iterKeyPrefixLen], i.key[:currKeyPrefixLen]) {
				i.internalNextPrefix(currKeyPrefixLen)
			}
		}
	}

	// NB: the interface-level step counter is only bumped when the switch
	// above did not return early with an error.
	i.stats.ForwardStepCount[InterfaceCall]++
	i.findNextEntry(nil /* limit */)
	i.maybeSampleRead()
	return i.iterValidityState
}

// internalNextPrefix advances the internal iterator beyond keys whose prefix
// equals i.key[:currKeyPrefixLen]. It is a no-op if the internal iterator is
// already exhausted (i.iterKey == nil). It first tries a single Next; only if
// the resulting key still carries the same prefix does it call the internal
// iterator's NextPrefix with the prefix's immediate successor, which is built
// in i.prefixOrFullSeekKey.
func (i *Iterator) internalNextPrefix(currKeyPrefixLen int) {
	if i.iterKey == nil {
		return
	}
	// The Next "fast-path" is not really a fast-path when there is more than
	// one version. However, even with TableFormatPebblev3, there is a small
	// slowdown (~10%) for one version if we remove it and only call NextPrefix.
	// When there are two versions, only calling NextPrefix is ~30% faster.
	i.stats.ForwardStepCount[InternalIterCall]++
	if i.iterKey, i.iterValue = i.iter.Next(); i.iterKey == nil {
		return
	}
	iterKeyPrefixLen := i.split(i.iterKey.UserKey)
	if !bytes.Equal(i.iterKey.UserKey[:iterKeyPrefixLen], i.key[:currKeyPrefixLen]) {
		// The single Next already left the prefix.
		return
	}
	i.stats.ForwardStepCount[InternalIterCall]++
	i.prefixOrFullSeekKey = i.comparer.ImmediateSuccessor(i.prefixOrFullSeekKey[:0], i.key[:currKeyPrefixLen])
	i.iterKey, i.iterValue = i.iter.NextPrefix(i.prefixOrFullSeekKey)
	if invariants.Enabled && i.iterKey != nil {
		if iterKeyPrefixLen := i.split(i.iterKey.UserKey); i.cmp(i.iterKey.UserKey[:iterKeyPrefixLen], i.prefixOrFullSeekKey) < 0 {
			panic(errors.AssertionFailedf("pebble: iter.NextPrefix did not advance beyond the current prefix: now at %q; expected to be geq %q",
				i.iterKey, i.prefixOrFullSeekKey))
		}
	}
}

// nextWithLimit implements Next (limit == nil) and NextWithLimit. Depending on
// i.pos it first moves the internal iterator to the first internal key beyond
// the current user key, then calls findNextEntry(limit) and returns the
// resulting validity state. If an error is already cached in i.err, it returns
// the current validity state without moving.
func (i *Iterator) nextWithLimit(limit []byte) IterValidityState {
	i.stats.ForwardStepCount[InterfaceCall]++
	if i.hasPrefix {
		if limit != nil {
			i.err = errors.New("cannot use limit with prefix iteration")
			i.iterValidityState = IterExhausted
			return i.iterValidityState
		} else if i.iterValidityState == IterExhausted {
			// No-op, already exhausted. We avoid executing the Next because it
			// can break invariants: Specifically, a file that fails the bloom
			// filter test may result in its level being removed from the
			// merging iterator. The level's removal can cause a lazy combined
			// iterator to miss range keys and trigger a switch to combined
			// iteration at a larger key, breaking keyspan invariants.
			return i.iterValidityState
		}
	}
	if i.err != nil {
		return i.iterValidityState
	}
	if i.rangeKey != nil {
		// NB: Check Valid() before clearing requiresReposition.
		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
		// If we have a range key but did not expose it at the previous iterator
		// position (because the iterator was not at a valid position), updated
		// must be true. This ensures that after an iterator op sequence like:
		//   - Next()             → (IterValid, RangeBounds() = [a,b))
		//   - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
		//   - NextWithLimit(...) → (IterValid, RangeBounds() = [a,b))
		// the iterator returns RangeKeyChanged()=true.
		//
		// The remainder of this function will only update i.rangeKey.updated if
		// the iterator moves into a new range key, or out of the current range
		// key.
		i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
	}
	i.lastPositioningOp = unknownLastPositionOp
	i.requiresReposition = false
	switch i.pos {
	case iterPosCurForward:
		i.nextUserKey()
	case iterPosCurForwardPaused:
		// Already at the right place.
	case iterPosCurReverse:
		// Switching directions.
		// Unless the iterator was exhausted, reverse iteration needs to
		// position the iterator at iterPosPrev.
		if i.iterKey != nil {
			i.err = errors.New("switching from reverse to forward but iter is not at prev")
			i.iterValidityState = IterExhausted
			return i.iterValidityState
		}
		// We're positioned before the first key. Need to reposition to point to
		// the first key.
		i.iterFirstWithinBounds()
	case iterPosCurReversePaused:
		// Switching directions.
		// The iterator must not be exhausted since it paused.
		if i.iterKey == nil {
			i.err = errors.New("switching paused from reverse to forward but iter is exhausted")
			i.iterValidityState = IterExhausted
			return i.iterValidityState
		}
		i.nextUserKey()
	case iterPosPrev:
		// The underlying iterator is pointed to the previous key (this can
		// only happen when switching iteration directions). We set
		// i.iterValidityState to IterExhausted here to force the calls to
		// nextUserKey to save the current key i.iter is pointing at in order
		// to determine when the next user-key is reached.
		i.iterValidityState = IterExhausted
		if i.iterKey == nil {
			// We're positioned before the first key. Need to reposition to point to
			// the first key.
			i.iterFirstWithinBounds()
		} else {
			i.nextUserKey()
		}
		// Step a second time, beyond the user key that the step above landed
		// on.
		i.nextUserKey()
	case iterPosNext:
		// Already at the right place.
	}
	i.findNextEntry(limit)
	i.maybeSampleRead()
	return i.iterValidityState
}

// Prev moves the iterator to the previous key/value pair. Returns true if the
// iterator is pointing at a valid entry and false otherwise.
func (i *Iterator) Prev() bool {
	return i.PrevWithLimit(nil) == IterValid
}

// PrevWithLimit moves the iterator to the previous key/value pair.
//
// If limit is provided, it serves as a best-effort inclusive limit. If the
// previous key is less than limit, the Iterator may pause and return
// IterAtLimit. Because limits are best-effort, PrevWithLimit may return a key
// beyond limit.
//
// If the Iterator is configured to iterate over range keys, PrevWithLimit
// guarantees it will surface any range keys with bounds overlapping the
// keyspace up to limit.
func (i *Iterator) PrevWithLimit(limit []byte) IterValidityState {
	i.stats.ReverseStepCount[InterfaceCall]++
	// A cached error leaves the iterator where it is.
	if i.err != nil {
		return i.iterValidityState
	}
	if i.rangeKey != nil {
		// NB: Check Valid() before clearing requiresReposition.
		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
		// If we have a range key but did not expose it at the previous iterator
		// position (because the iterator was not at a valid position), updated
		// must be true. This ensures that after an iterator op sequence like:
		//   - Next()             → (IterValid, RangeBounds() = [a,b))
		//   - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
		//   - PrevWithLimit(...) → (IterValid, RangeBounds() = [a,b))
		// the iterator returns RangeKeyChanged()=true.
		//
		// The remainder of this function will only update i.rangeKey.updated if
		// the iterator moves into a new range key, or out of the current range
		// key.
		i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
	}
	i.lastPositioningOp = unknownLastPositionOp
	i.requiresReposition = false
	// Reverse iteration is rejected while in prefix-iteration mode.
	if i.hasPrefix {
		i.err = errReversePrefixIteration
		i.iterValidityState = IterExhausted
		return i.iterValidityState
	}
	switch i.pos {
	case iterPosCurForward:
		// Switching directions, and will handle this below.
	case iterPosCurForwardPaused:
		// Switching directions, and will handle this below.
	case iterPosCurReverse:
		i.prevUserKey()
	case iterPosCurReversePaused:
		// Already at the right place.
	case iterPosNext:
		// The underlying iterator is pointed to the next key (this can only happen
		// when switching iteration directions). We will handle this below.
	case iterPosPrev:
		// Already at the right place.
	}
	if i.pos == iterPosCurForward || i.pos == iterPosNext || i.pos == iterPosCurForwardPaused {
		// Switching direction.
		stepAgain := i.pos == iterPosNext

		// Synthetic range key markers are a special case. Consider SeekGE(b)
		// which finds a range key [a, c). To ensure the user observes the range
		// key, the Iterator pauses at Key() = b. The iterator must advance the
		// internal iterator to see if there's also a coincident point key at
		// 'b', leaving the iterator at iterPosNext if there's not.
		//
		// This is a problem: Synthetic range key markers are only interleaved
		// during the original seek. A subsequent Prev() of i.iter will not move
		// back onto the synthetic range key marker. In this case where the
		// previous iterator position was a synthetic range key start boundary,
		// we must not step a second time.
		if i.isEphemeralPosition() {
			stepAgain = false
		}

		// We set i.iterValidityState to IterExhausted here to force the calls
		// to prevUserKey to save the current key i.iter is pointing at in
		// order to determine when the prev user-key is reached.
		i.iterValidityState = IterExhausted
		if i.iterKey == nil {
			// We're positioned after the last key. Need to reposition to point to
			// the last key.
			i.iterLastWithinBounds()
		} else {
			i.prevUserKey()
		}
		if stepAgain {
			i.prevUserKey()
		}
	}
	i.findPrevEntry(limit)
	i.maybeSampleRead()
	return i.iterValidityState
}

// iterFirstWithinBounds moves the internal iterator to the first key,
// respecting bounds: with a lower bound set it seeks to the first key >= the
// lower bound, otherwise it calls First on the internal iterator. The result
// is stored in i.iterKey and i.iterValue, and the internal forward-seek
// counter is incremented.
func (i *Iterator) iterFirstWithinBounds() {
	i.stats.ForwardSeekCount[InternalIterCall]++
	if lowerBound := i.opts.GetLowerBound(); lowerBound != nil {
		i.iterKey, i.iterValue = i.iter.SeekGE(lowerBound, base.SeekGEFlagsNone)
	} else {
		i.iterKey, i.iterValue = i.iter.First()
	}
}

// iterLastWithinBounds moves the internal iterator to the last key, respecting
// bounds.
2018 func (i *Iterator) iterLastWithinBounds() { 2019 i.stats.ReverseSeekCount[InternalIterCall]++ 2020 if upperBound := i.opts.GetUpperBound(); upperBound != nil { 2021 i.iterKey, i.iterValue = i.iter.SeekLT(upperBound, base.SeekLTFlagsNone) 2022 } else { 2023 i.iterKey, i.iterValue = i.iter.Last() 2024 } 2025 } 2026 2027 // RangeKeyData describes a range key's data, set through RangeKeySet. The key 2028 // boundaries of the range key is provided by Iterator.RangeBounds. 2029 type RangeKeyData struct { 2030 Suffix []byte 2031 Value []byte 2032 } 2033 2034 // rangeKeyWithinLimit is called during limited reverse iteration when 2035 // positioned over a key beyond the limit. If there exists a range key that lies 2036 // within the limit, the iterator must not pause in order to ensure the user has 2037 // an opportunity to observe the range key within limit. 2038 // 2039 // It would be valid to ignore the limit whenever there's a range key covering 2040 // the key, but that would introduce nondeterminism. To preserve determinism for 2041 // testing, the iterator ignores the limit only if the covering range key does 2042 // cover the keyspace within the limit. 2043 // 2044 // This awkwardness exists because range keys are interleaved at their inclusive 2045 // start positions. Note that limit is inclusive. 2046 func (i *Iterator) rangeKeyWithinLimit(limit []byte) bool { 2047 if i.rangeKey == nil || !i.opts.rangeKeys() { 2048 return false 2049 } 2050 s := i.rangeKey.iiter.Span() 2051 // If the range key ends beyond the limit, then the range key does not cover 2052 // any portion of the keyspace within the limit and it is safe to pause. 2053 return s != nil && i.cmp(s.End, limit) > 0 2054 } 2055 2056 // saveRangeKey saves the current range key to the underlying iterator's current 2057 // range key state. If the range key has not changed, saveRangeKey is a no-op. 
// If there is a new range key, saveRangeKey copies all of the key, value and
// suffixes into Iterator-managed buffers.
func (i *Iterator) saveRangeKey() {
	if i.rangeKey == nil || i.opts.KeyTypes == IterKeyTypePointsOnly {
		return
	}

	s := i.rangeKey.iiter.Span()
	if s == nil {
		// No range key at this position. This counts as a change only if the
		// previous position exposed one.
		i.rangeKey.hasRangeKey = false
		i.rangeKey.updated = i.rangeKey.prevPosHadRangeKey
		return
	} else if !i.rangeKey.stale {
		// The range key `s` is identical to the one currently saved. No-op.
		return
	}

	if s.KeysOrder != keyspan.BySuffixAsc {
		panic("pebble: range key span's keys unexpectedly not in ascending suffix order")
	}

	// Although `i.rangeKey.stale` is true, the span s may still be identical
	// to the currently saved span. This is possible when seeking the iterator,
	// which may land back on the same range key. If we previously had a range
	// key and the new one has identical start and end bounds, then it is
	// treated as the same range key and we can avoid copying and keep
	// `i.rangeKey.updated=false`.
	//
	// TODO(jackson): These key comparisons could be avoidable during relative
	// positioning operations continuing in the same direction, because these
	// ops will never encounter the previous position's range key while
	// stale=true. However, threading whether the current op is a seek or step
	// maybe isn't worth it. This key comparison is only necessary once when we
	// step onto a new range key, which should be relatively rare.
	if i.rangeKey.prevPosHadRangeKey && i.equal(i.rangeKey.start, s.Start) &&
		i.equal(i.rangeKey.end, s.End) {
		i.rangeKey.updated = false
		i.rangeKey.stale = false
		i.rangeKey.hasRangeKey = true
		return
	}
	// A genuinely new range key: account for it in the stats and copy its
	// bounds and keys into i.rangeKey.buf.
	i.stats.RangeKeyStats.Count += len(s.Keys)
	i.rangeKey.buf.Reset()
	i.rangeKey.hasRangeKey = true
	i.rangeKey.updated = true
	i.rangeKey.stale = false
	i.rangeKey.buf, i.rangeKey.start = i.rangeKey.buf.Copy(s.Start)
	i.rangeKey.buf, i.rangeKey.end = i.rangeKey.buf.Copy(s.End)
	i.rangeKey.keys = i.rangeKey.keys[:0]
	for j := 0; j < len(s.Keys); j++ {
		if invariants.Enabled {
			if s.Keys[j].Kind() != base.InternalKeyKindRangeKeySet {
				panic("pebble: user iteration encountered non-RangeKeySet key kind")
			} else if j > 0 && i.cmp(s.Keys[j].Suffix, s.Keys[j-1].Suffix) < 0 {
				panic("pebble: user iteration encountered range keys not in suffix order")
			}
		}
		var rkd RangeKeyData
		i.rangeKey.buf, rkd.Suffix = i.rangeKey.buf.Copy(s.Keys[j].Suffix)
		i.rangeKey.buf, rkd.Value = i.rangeKey.buf.Copy(s.Keys[j].Value)
		i.rangeKey.keys = append(i.rangeKey.keys, rkd)
	}
}

// RangeKeyChanged indicates whether the most recent iterator positioning
// operation resulted in the iterator stepping into or out of a new range key.
// If true, previously returned range key bounds and data has been invalidated.
// If false, previously obtained range key bounds, suffix and value slices are
// still valid and may continue to be read.
//
// Invalid iterator positions are considered to not hold range keys, meaning
// that if an iterator steps from an IterExhausted or IterAtLimit position onto
// a position with a range key, RangeKeyChanged will yield true.
2130 func (i *Iterator) RangeKeyChanged() bool { 2131 return i.iterValidityState == IterValid && i.rangeKey != nil && i.rangeKey.updated 2132 } 2133 2134 // HasPointAndRange indicates whether there exists a point key, a range key or 2135 // both at the current iterator position. 2136 func (i *Iterator) HasPointAndRange() (hasPoint, hasRange bool) { 2137 if i.iterValidityState != IterValid || i.requiresReposition { 2138 return false, false 2139 } 2140 if i.opts.KeyTypes == IterKeyTypePointsOnly { 2141 return true, false 2142 } 2143 return i.rangeKey == nil || !i.rangeKey.rangeKeyOnly, i.rangeKey != nil && i.rangeKey.hasRangeKey 2144 } 2145 2146 // RangeBounds returns the start (inclusive) and end (exclusive) bounds of the 2147 // range key covering the current iterator position. RangeBounds returns nil 2148 // bounds if there is no range key covering the current iterator position, or 2149 // the iterator is not configured to surface range keys. 2150 // 2151 // If valid, the returned start bound is less than or equal to Key() and the 2152 // returned end bound is greater than Key(). 2153 func (i *Iterator) RangeBounds() (start, end []byte) { 2154 if i.rangeKey == nil || !i.opts.rangeKeys() || !i.rangeKey.hasRangeKey { 2155 return nil, nil 2156 } 2157 return i.rangeKey.start, i.rangeKey.end 2158 } 2159 2160 // Key returns the key of the current key/value pair, or nil if done. The 2161 // caller should not modify the contents of the returned slice, and its 2162 // contents may change on the next call to Next. 2163 // 2164 // If positioned at an iterator position that only holds a range key, Key() 2165 // always returns the start bound of the range key. Otherwise, it returns the 2166 // point key's key. 2167 func (i *Iterator) Key() []byte { 2168 return i.key 2169 } 2170 2171 // Value returns the value of the current key/value pair, or nil if done. 
The 2172 // caller should not modify the contents of the returned slice, and its 2173 // contents may change on the next call to Next. 2174 // 2175 // Only valid if HasPointAndRange() returns true for hasPoint. 2176 // Deprecated: use ValueAndErr instead. 2177 func (i *Iterator) Value() []byte { 2178 val, _ := i.ValueAndErr() 2179 return val 2180 } 2181 2182 // ValueAndErr returns the value, and any error encountered in extracting the value. 2183 // REQUIRES: i.Error()==nil and HasPointAndRange() returns true for hasPoint. 2184 // 2185 // The caller should not modify the contents of the returned slice, and its 2186 // contents may change on the next call to Next. 2187 func (i *Iterator) ValueAndErr() ([]byte, error) { 2188 val, callerOwned, err := i.value.Value(i.lazyValueBuf) 2189 if err != nil { 2190 i.err = err 2191 } 2192 if callerOwned { 2193 i.lazyValueBuf = val[:0] 2194 } 2195 return val, err 2196 } 2197 2198 // LazyValue returns the LazyValue. Only for advanced use cases. 2199 // REQUIRES: i.Error()==nil and HasPointAndRange() returns true for hasPoint. 2200 func (i *Iterator) LazyValue() LazyValue { 2201 return i.value 2202 } 2203 2204 // RangeKeys returns the range key values and their suffixes covering the 2205 // current iterator position. The range bounds may be retrieved separately 2206 // through Iterator.RangeBounds(). 2207 func (i *Iterator) RangeKeys() []RangeKeyData { 2208 if i.rangeKey == nil || !i.opts.rangeKeys() || !i.rangeKey.hasRangeKey { 2209 return nil 2210 } 2211 return i.rangeKey.keys 2212 } 2213 2214 // Valid returns true if the iterator is positioned at a valid key/value pair 2215 // and false otherwise. 
func (i *Iterator) Valid() bool {
	// requiresReposition is set by SetBounds and SetOptions; while it is set
	// the iterator reports itself as not valid regardless of its internal
	// validity state.
	valid := i.iterValidityState == IterValid && !i.requiresReposition
	if invariants.Enabled {
		// Invariant check: a valid iterator must never carry an error.
		if err := i.Error(); valid && err != nil {
			panic(errors.WithSecondaryError(errors.AssertionFailedf("pebble: iterator is valid with non-nil Error"), err))
		}
	}
	return valid
}

// Error returns any accumulated error. The iterator's own error takes
// precedence over the error of the underlying internal iterator, if any.
func (i *Iterator) Error() error {
	if i.iter != nil {
		return firstError(i.err, i.iter.Error())
	}
	return i.err
}

// maxKeyBufCacheSize is the capacity threshold used by Close: a key, seek-key
// or bounds buffer whose capacity is at or above this value is dropped rather
// than retained in the pooled allocation.
const maxKeyBufCacheSize = 4 << 10 // 4 KB

// Close closes the iterator and returns any accumulated error. Exhausting
// all the key/value pairs in a table is not considered to be an error.
// It is not valid to call any method, including Close, after the iterator
// has been closed.
func (i *Iterator) Close() error {
	// Close the child iterator before releasing the readState because when the
	// readState is released sstables referenced by the readState may be deleted
	// which will fail on Windows if the sstables are still open by the child
	// iterator.
	if i.iter != nil {
		i.err = firstError(i.err, i.iter.Close())

		// Closing i.iter did not necessarily close the point and range key
		// iterators. Calls to SetOptions may have 'disconnected' either one
		// from i.iter if iteration key types were changed. Both point and range
		// key iterators are preserved in case the iterator needs to switch key
		// types again. We explicitly close both of these iterators here.
		//
		// NB: If the iterators were still connected to i.iter, they may be
		// closed, but calling Close on a closed internal iterator or fragment
		// iterator is allowed.
		if i.pointIter != nil && !i.closePointIterOnce {
			i.err = firstError(i.err, i.pointIter.Close())
		}
		if i.rangeKey != nil && i.rangeKey.rangeKeyIter != nil {
			i.err = firstError(i.err, i.rangeKey.rangeKeyIter.Close())
		}
	}
	// Capture the error now: errors from the remaining cleanup steps are
	// folded into the local err rather than into i.err.
	err := i.err

	if i.readState != nil {
		if i.readSampling.pendingCompactions.size > 0 {
			// Copy pending read compactions using db.mu.Lock()
			i.readState.db.mu.Lock()
			i.readState.db.mu.compact.readCompactions.combine(&i.readSampling.pendingCompactions, i.cmp)
			reschedule := i.readState.db.mu.compact.rescheduleReadCompaction
			i.readState.db.mu.compact.rescheduleReadCompaction = false
			concurrentCompactions := i.readState.db.mu.compact.compactingCount
			i.readState.db.mu.Unlock()

			if reschedule && concurrentCompactions == 0 {
				// In a read heavy workload, flushes may not happen frequently enough to
				// schedule compactions.
				i.readState.db.compactionSchedulers.Add(1)
				go i.readState.db.maybeScheduleCompactionAsync()
			}
		}

		i.readState.unref()
		i.readState = nil
	}

	if i.version != nil {
		i.version.Unref()
	}

	for _, readers := range i.externalReaders {
		for _, r := range readers {
			err = firstError(err, r.Close())
		}
	}

	// Close the closer for the current value if one was open.
	if i.valueCloser != nil {
		err = firstError(err, i.valueCloser.Close())
		i.valueCloser = nil
	}

	if i.rangeKey != nil {
		// Reset the range key state, retaining only its buffers, and return
		// it to the pool.
		i.rangeKey.rangeKeyBuffers.PrepareForReuse()
		*i.rangeKey = iteratorRangeKeyState{
			rangeKeyBuffers: i.rangeKey.rangeKeyBuffers,
		}
		iterRangeKeyStateAllocPool.Put(i.rangeKey)
		i.rangeKey = nil
	}
	if alloc := i.alloc; alloc != nil {
		// Avoid caching the key buf if it is overly large. The constant is fairly
		// arbitrary.
		if cap(i.keyBuf) >= maxKeyBufCacheSize {
			alloc.keyBuf = nil
		} else {
			alloc.keyBuf = i.keyBuf
		}
		if cap(i.prefixOrFullSeekKey) >= maxKeyBufCacheSize {
			alloc.prefixOrFullSeekKey = nil
		} else {
			alloc.prefixOrFullSeekKey = i.prefixOrFullSeekKey
		}
		for j := range i.boundsBuf {
			if cap(i.boundsBuf[j]) >= maxKeyBufCacheSize {
				alloc.boundsBuf[j] = nil
			} else {
				alloc.boundsBuf[j] = i.boundsBuf[j]
			}
		}
		// Zero everything in the allocation except the retained buffers
		// before handing it back to the pool.
		*alloc = iterAlloc{
			keyBuf:              alloc.keyBuf,
			boundsBuf:           alloc.boundsBuf,
			prefixOrFullSeekKey: alloc.prefixOrFullSeekKey,
		}
		iterAllocPool.Put(alloc)
	} else if alloc := i.getIterAlloc; alloc != nil {
		// Same buffer-retention policy for the getIterAlloc-backed iterator,
		// which only carries a key buffer.
		if cap(i.keyBuf) >= maxKeyBufCacheSize {
			alloc.keyBuf = nil
		} else {
			alloc.keyBuf = i.keyBuf
		}
		*alloc = getIterAlloc{
			keyBuf: alloc.keyBuf,
		}
		getIterAllocPool.Put(alloc)
	}
	return err
}

// SetBounds sets the lower and upper bounds for the iterator. Once SetBounds
// returns, the caller is free to mutate the provided slices.
//
// The iterator will always be invalidated and must be repositioned with a call
// to SeekGE, SeekPrefixGE, SeekLT, First, or Last.
func (i *Iterator) SetBounds(lower, upper []byte) {
	// Ensure that the Iterator appears exhausted, regardless of whether we
	// actually have to invalidate the internal iterator. Optimizations that
	// avoid exhaustion are an internal implementation detail that shouldn't
	// leak through the interface. The caller should still call an absolute
	// positioning method to reposition the iterator.
	i.requiresReposition = true

	// The nil-ness comparisons distinguish a nil bound from an empty non-nil
	// bound before comparing contents.
	if ((i.opts.LowerBound == nil) == (lower == nil)) &&
		((i.opts.UpperBound == nil) == (upper == nil)) &&
		i.equal(i.opts.LowerBound, lower) &&
		i.equal(i.opts.UpperBound, upper) {
		// Unchanged, noop.
		return
	}

	// Copy the user-provided bounds into an Iterator-owned buffer, and set them
	// on i.opts.{Lower,Upper}Bound.
	i.processBounds(lower, upper)

	i.iter.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
	// If the iterator has an open point iterator that's not currently being
	// used, propagate the new bounds to it.
	if i.pointIter != nil && !i.opts.pointKeys() {
		i.pointIter.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
	}
	// If the iterator has a range key iterator, propagate bounds to it. The
	// top-level SetBounds on the interleaving iterator (i.iter) won't propagate
	// bounds to the range key iterator stack, because the FragmentIterator
	// interface doesn't define a SetBounds method. We need to directly inform
	// the iterConfig stack.
	if i.rangeKey != nil {
		i.rangeKey.iterConfig.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
	}

	// Even though this is not a positioning operation, the alteration of the
	// bounds means we cannot optimize Seeks by using Next.
	i.invalidate()
}

// processBounds must be called on initialization and whenever the bounds
// change. It copies the bounds into an Iterator-owned buffer, stores them in
// i.opts.{Lower,Upper}Bound, and recomputes the state derived from them
// (nextPrefixNotPermittedByUpperBound).
func (i *Iterator) processBounds(lower, upper []byte) {
	// Copy the user-provided bounds into an Iterator-owned buffer. We can't
	// overwrite the current bounds, because some internal iterators compare old
	// and new bounds for optimizations.

	buf := i.boundsBuf[i.boundsBufIdx][:0]
	if lower != nil {
		buf = append(buf, lower...)
		i.opts.LowerBound = buf
	} else {
		i.opts.LowerBound = nil
	}
	i.nextPrefixNotPermittedByUpperBound = false
	if upper != nil {
		buf = append(buf, upper...)
		// upper was appended after lower in the same buffer; slice out just
		// the trailing upper-bound portion.
		i.opts.UpperBound = buf[len(buf)-len(upper):]
		if i.comparer.Split != nil {
			if i.comparer.Split(i.opts.UpperBound) != len(i.opts.UpperBound) {
				// Setting an upper bound that is a versioned MVCC key. This means
				// that a key can have some MVCC versions before the upper bound and
				// some after. This causes significant complications for NextPrefix,
				// so we bar the use of NextPrefix.
				i.nextPrefixNotPermittedByUpperBound = true
			}
		}
	} else {
		i.opts.UpperBound = nil
	}
	// Store the (possibly grown) buffer back and flip to the other buffer, so
	// that the next call does not overwrite the bounds installed by this one.
	i.boundsBuf[i.boundsBufIdx] = buf
	i.boundsBufIdx = 1 - i.boundsBufIdx
}

// SetOptions sets new iterator options for the iterator. Note that the lower
// and upper bounds applied here will supersede any bounds set by previous calls
// to SetBounds.
//
// Note that the slices provided in this SetOptions must not be changed by the
// caller until the iterator is closed, or a subsequent SetBounds or SetOptions
// has returned. This is because comparisons between the existing and new bounds
// are sometimes used to optimize seeking. See the extended commentary on
// SetBounds.
//
// If the iterator was created over an indexed mutable batch, the iterator's
// view of the mutable batch is refreshed.
//
// The iterator will always be invalidated and must be repositioned with a call
// to SeekGE, SeekPrefixGE, SeekLT, First, or Last.
//
// If only lower and upper bounds need to be modified, prefer SetBounds.
//
// SetOptions panics if the iterator has external readers and the new options
// fail validateExternalIterOpts.
func (i *Iterator) SetOptions(o *IterOptions) {
	if i.externalReaders != nil {
		if err := validateExternalIterOpts(o); err != nil {
			panic(err)
		}
	}

	// Ensure that the Iterator appears exhausted, regardless of whether we
	// actually have to invalidate the internal iterator. Optimizations that
	// avoid exhaustion are an internal implementation detail that shouldn't
	// leak through the interface. The caller should still call an absolute
	// positioning method to reposition the iterator.
	i.requiresReposition = true

	// Check if global state requires we close all internal iterators.
	//
	// If the Iterator is in an error state, invalidate the existing iterators
	// so that we reconstruct an iterator state from scratch.
	//
	// If OnlyReadGuaranteedDurable changed, the iterator stacks are incorrect,
	// improperly including or excluding memtables. Invalidate them so that
	// finishInitializingIter will reconstruct them.
	//
	// If either the original options or the new options specify a table filter,
	// we need to reconstruct the iterator stacks. If they both supply a table
	// filter, we can't be certain that it's the same filter since we have no
	// mechanism to compare the filter closures.
	closeBoth := i.err != nil ||
		o.OnlyReadGuaranteedDurable != i.opts.OnlyReadGuaranteedDurable ||
		o.TableFilter != nil || i.opts.TableFilter != nil

	// If either options specify block property filters for an iterator stack,
	// reconstruct it.
	if i.pointIter != nil && (closeBoth || len(o.PointKeyFilters) > 0 || len(i.opts.PointKeyFilters) > 0 ||
		o.RangeKeyMasking.Filter != nil || i.opts.RangeKeyMasking.Filter != nil || o.SkipPoint != nil ||
		i.opts.SkipPoint != nil) {
		i.err = firstError(i.err, i.pointIter.Close())
		i.pointIter = nil
	}
	if i.rangeKey != nil {
		if closeBoth || len(o.RangeKeyFilters) > 0 || len(i.opts.RangeKeyFilters) > 0 {
			i.err = firstError(i.err, i.rangeKey.rangeKeyIter.Close())
			i.rangeKey = nil
		} else {
			// If there's still a range key iterator stack, invalidate the
			// iterator. This ensures RangeKeyChanged() returns true if a
			// subsequent positioning operation discovers a range key. It also
			// prevents seek no-op optimizations.
			i.invalidate()
		}
	}

	// If the iterator is backed by a batch that's been mutated, refresh its
	// existing point and range-key iterators, and invalidate the iterator to
	// prevent seek-using-next optimizations. If we don't yet have a point-key
	// iterator or range-key iterator but we require one, it'll be created in
	// the slow path that reconstructs the iterator in finishInitializingIter.
	if i.batch != nil {
		nextBatchSeqNum := (uint64(len(i.batch.data)) | base.InternalKeySeqNumBatch)
		if nextBatchSeqNum != i.batchSeqNum {
			i.batchSeqNum = nextBatchSeqNum
			if i.merging != nil {
				i.merging.batchSnapshot = nextBatchSeqNum
			}
			// Prevent a no-op seek optimization on the next seek. We won't be
			// able to reuse the top-level Iterator state, because it may be
			// incorrect after the inclusion of new batch mutations.
			i.batchJustRefreshed = true
			if i.pointIter != nil && i.batch.countRangeDels > 0 {
				if i.batchRangeDelIter.Count() == 0 {
					// When we constructed this iterator, there were no
					// rangedels in the batch. Iterator construction will
					// have excluded the batch rangedel iterator from the
					// point iterator stack. We need to reconstruct the
					// point iterator to add i.batchRangeDelIter into the
					// iterator stack.
					i.err = firstError(i.err, i.pointIter.Close())
					i.pointIter = nil
				} else {
					// There are range deletions in the batch and we already
					// have a batch rangedel iterator. We can update the
					// batch rangedel iterator in place.
					//
					// NB: There may or may not be new range deletions. We
					// can't tell based on i.batchRangeDelIter.Count(),
					// which is the count of fragmented range deletions, NOT
					// the number of range deletions written to the batch
					// [i.batch.countRangeDels].
					i.batch.initRangeDelIter(&i.opts, &i.batchRangeDelIter, nextBatchSeqNum)
				}
			}
			if i.rangeKey != nil && i.batch.countRangeKeys > 0 {
				if i.batchRangeKeyIter.Count() == 0 {
					// When we constructed this iterator, there were no range
					// keys in the batch. Iterator construction will have
					// excluded the batch rangekey iterator from the range key
					// iterator stack. We need to reconstruct the range key
					// iterator to add i.batchRangeKeyIter into the iterator
					// stack.
					i.err = firstError(i.err, i.rangeKey.rangeKeyIter.Close())
					i.rangeKey = nil
				} else {
					// There are range keys in the batch and we already
					// have a batch rangekey iterator. We can update the batch
					// rangekey iterator in place.
					//
					// NB: There may or may not be new range keys. We can't
					// tell based on i.batchRangeKeyIter.Count(), which is the
					// count of fragmented range keys, NOT the number of
					// range keys written to the batch [i.batch.countRangeKeys].
					i.batch.initRangeKeyIter(&i.opts, &i.batchRangeKeyIter, nextBatchSeqNum)
					i.invalidate()
				}
			}
		}
	}

	// Reset combinedIterState.initialized in case the iterator key types
	// changed. If there's already a range key iterator stack, the combined
	// iterator is already initialized. Additionally, if the iterator is not
	// configured to include range keys, mark it as initialized to signal that
	// lower level iterators should not trigger a switch to combined iteration.
	i.lazyCombinedIter.combinedIterState = combinedIterState{
		initialized: i.rangeKey != nil || !i.opts.rangeKeys(),
	}

	// Same comparison as in SetBounds: nil-ness first, then contents.
	boundsEqual := ((i.opts.LowerBound == nil) == (o.LowerBound == nil)) &&
		((i.opts.UpperBound == nil) == (o.UpperBound == nil)) &&
		i.equal(i.opts.LowerBound, o.LowerBound) &&
		i.equal(i.opts.UpperBound, o.UpperBound)

	if boundsEqual && o.KeyTypes == i.opts.KeyTypes &&
		(i.pointIter != nil || !i.opts.pointKeys()) &&
		(i.rangeKey != nil || !i.opts.rangeKeys() || i.opts.KeyTypes == IterKeyTypePointsAndRanges) &&
		i.equal(o.RangeKeyMasking.Suffix, i.opts.RangeKeyMasking.Suffix) &&
		o.UseL6Filters == i.opts.UseL6Filters {
		// The options are identical, so we can likely use the fast path. In
		// addition to all the above constraints, we cannot use the fast path if
		// configured to perform lazy combined iteration but an indexed batch
		// used by the iterator now contains range keys. Lazy combined iteration
		// is not compatible with batch range keys because we always need to
		// merge the batch's range keys into iteration.
		if i.rangeKey != nil || !i.opts.rangeKeys() || i.batch == nil || i.batch.countRangeKeys == 0 {
			// Fast path. This preserves the Seek-using-Next optimizations as
			// long as the iterator wasn't already invalidated up above.
			return
		}
	}
	// Slow path.

	// The options changed. Save the new ones to i.opts.
	if boundsEqual {
		// Copying the options into i.opts will overwrite LowerBound and
		// UpperBound fields with the user-provided slices. We need to hold on
		// to the Pebble-owned slices, so save them and re-set them after the
		// copy.
		lower, upper := i.opts.LowerBound, i.opts.UpperBound
		i.opts = *o
		i.opts.LowerBound, i.opts.UpperBound = lower, upper
	} else {
		i.opts = *o
		i.processBounds(o.LowerBound, o.UpperBound)
		// Propagate the changed bounds to the existing point iterator.
		// NB: We propagate i.opts.{Lower,Upper}Bound, not o.{Lower,Upper}Bound
		// because i.opts now point to buffers owned by Pebble.
		if i.pointIter != nil {
			i.pointIter.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
		}
		if i.rangeKey != nil {
			i.rangeKey.iterConfig.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
		}
	}

	// Even though this is not a positioning operation, the invalidation of the
	// iterator stack means we cannot optimize Seeks by using Next.
	i.invalidate()

	// Iterators created through NewExternalIter have a different iterator
	// initialization process.
	if i.externalReaders != nil {
		finishInitializingExternal(i.ctx, i)
		return
	}
	finishInitializingIter(i.ctx, i.alloc)
}

// invalidate resets the iterator's positioning state. It records the last
// positioning operation as invalidated, clears the prefix flag, the current
// internal key/value and any accumulated error, marks the iterator as
// exhausted, and, if range key state exists, invalidates its iiter and
// clears prevPosHadRangeKey.
func (i *Iterator) invalidate() {
	i.lastPositioningOp = invalidatedLastPositionOp
	i.hasPrefix = false
	i.iterKey = nil
	i.iterValue = LazyValue{}
	i.err = nil
	// This switch statement isn't necessary for correctness since callers
	// should call a repositioning method. We could have arbitrarily set i.pos
	// to one of the values. But it results in more intuitive behavior in
	// tests, which do not always reposition.
	switch i.pos {
	case iterPosCurForward, iterPosNext, iterPosCurForwardPaused:
		i.pos = iterPosCurForward
	case iterPosCurReverse, iterPosPrev, iterPosCurReversePaused:
		i.pos = iterPosCurReverse
	}
	i.iterValidityState = IterExhausted
	if i.rangeKey != nil {
		i.rangeKey.iiter.Invalidate()
		i.rangeKey.prevPosHadRangeKey = false
	}
}

// Metrics returns per-iterator metrics.
2660 func (i *Iterator) Metrics() IteratorMetrics { 2661 m := IteratorMetrics{ 2662 ReadAmp: 1, 2663 } 2664 if mi, ok := i.iter.(*mergingIter); ok { 2665 m.ReadAmp = len(mi.levels) 2666 } 2667 return m 2668 } 2669 2670 // ResetStats resets the stats to 0. 2671 func (i *Iterator) ResetStats() { 2672 i.stats = IteratorStats{} 2673 } 2674 2675 // Stats returns the current stats. 2676 func (i *Iterator) Stats() IteratorStats { 2677 return i.stats 2678 } 2679 2680 // CloneOptions configures an iterator constructed through Iterator.Clone. 2681 type CloneOptions struct { 2682 // IterOptions, if non-nil, define the iterator options to configure a 2683 // cloned iterator. If nil, the clone adopts the same IterOptions as the 2684 // iterator being cloned. 2685 IterOptions *IterOptions 2686 // RefreshBatchView may be set to true when cloning an Iterator over an 2687 // indexed batch. When false, the clone adopts the same (possibly stale) 2688 // view of the indexed batch as the cloned Iterator. When true, the clone is 2689 // constructed with a refreshed view of the batch, observing all of the 2690 // batch's mutations at the time of the Clone. If the cloned iterator was 2691 // not constructed to read over an indexed batch, RefreshVatchView has no 2692 // effect. 2693 RefreshBatchView bool 2694 } 2695 2696 // Clone creates a new Iterator over the same underlying data, i.e., over the 2697 // same {batch, memtables, sstables}). The resulting iterator is not positioned. 2698 // It starts with the same IterOptions, unless opts.IterOptions is set. 2699 // 2700 // When called on an Iterator over an indexed batch, the clone's visibility of 2701 // the indexed batch is determined by CloneOptions.RefreshBatchView. If false, 2702 // the clone inherits the iterator's current (possibly stale) view of the batch, 2703 // and callers may call SetOptions to subsequently refresh the clone's view to 2704 // include all batch mutations. 
If true, the clone is constructed with a 2705 // complete view of the indexed batch's mutations at the time of the Clone. 2706 // 2707 // Callers can use Clone if they need multiple iterators that need to see 2708 // exactly the same underlying state of the DB. This should not be used to 2709 // extend the lifetime of the data backing the original Iterator since that 2710 // will cause an increase in memory and disk usage (use NewSnapshot for that 2711 // purpose). 2712 func (i *Iterator) Clone(opts CloneOptions) (*Iterator, error) { 2713 return i.CloneWithContext(context.Background(), opts) 2714 } 2715 2716 // CloneWithContext is like Clone, and additionally accepts a context for 2717 // tracing. 2718 func (i *Iterator) CloneWithContext(ctx context.Context, opts CloneOptions) (*Iterator, error) { 2719 if opts.IterOptions == nil { 2720 opts.IterOptions = &i.opts 2721 } 2722 2723 readState := i.readState 2724 vers := i.version 2725 if readState == nil && vers == nil { 2726 return nil, errors.Errorf("cannot Clone a closed Iterator") 2727 } 2728 // i is already holding a ref, so there is no race with unref here. 2729 // 2730 // TODO(bilal): If the underlying iterator was created on a snapshot, we could 2731 // grab a reference to the current readState instead of reffing the original 2732 // readState. This allows us to release references to some zombie sstables 2733 // and memtables. 2734 if readState != nil { 2735 readState.ref() 2736 } 2737 if vers != nil { 2738 vers.Ref() 2739 } 2740 // Bundle various structures under a single umbrella in order to allocate 2741 // them together. 
2742 buf := iterAllocPool.Get().(*iterAlloc) 2743 dbi := &buf.dbi 2744 *dbi = Iterator{ 2745 ctx: ctx, 2746 opts: *opts.IterOptions, 2747 alloc: buf, 2748 merge: i.merge, 2749 comparer: i.comparer, 2750 readState: readState, 2751 version: vers, 2752 keyBuf: buf.keyBuf, 2753 prefixOrFullSeekKey: buf.prefixOrFullSeekKey, 2754 boundsBuf: buf.boundsBuf, 2755 batch: i.batch, 2756 batchSeqNum: i.batchSeqNum, 2757 newIters: i.newIters, 2758 newIterRangeKey: i.newIterRangeKey, 2759 seqNum: i.seqNum, 2760 } 2761 dbi.processBounds(dbi.opts.LowerBound, dbi.opts.UpperBound) 2762 2763 // If the caller requested the clone have a current view of the indexed 2764 // batch, set the clone's batch sequence number appropriately. 2765 if i.batch != nil && opts.RefreshBatchView { 2766 dbi.batchSeqNum = (uint64(len(i.batch.data)) | base.InternalKeySeqNumBatch) 2767 } 2768 2769 return finishInitializingIter(ctx, buf), nil 2770 } 2771 2772 // Merge adds all of the argument's statistics to the receiver. It may be used 2773 // to accumulate stats across multiple iterators. 2774 func (stats *IteratorStats) Merge(o IteratorStats) { 2775 for i := InterfaceCall; i < NumStatsKind; i++ { 2776 stats.ForwardSeekCount[i] += o.ForwardSeekCount[i] 2777 stats.ReverseSeekCount[i] += o.ReverseSeekCount[i] 2778 stats.ForwardStepCount[i] += o.ForwardStepCount[i] 2779 stats.ReverseStepCount[i] += o.ReverseStepCount[i] 2780 } 2781 stats.InternalStats.Merge(o.InternalStats) 2782 stats.RangeKeyStats.Merge(o.RangeKeyStats) 2783 } 2784 2785 func (stats *IteratorStats) String() string { 2786 return redact.StringWithoutMarkers(stats) 2787 } 2788 2789 // SafeFormat implements the redact.SafeFormatter interface. 
func (stats *IteratorStats) SafeFormat(s redact.SafePrinter, verb rune) {
	// One "(<kind> (dir, seek, step): ...)" group is emitted per stats kind;
	// the group opened by SafeString is closed by the trailing ')' of the
	// Printf format below.
	for i := range stats.ForwardStepCount {
		switch IteratorStatsKind(i) {
		case InterfaceCall:
			s.SafeString("(interface (dir, seek, step): ")
		case InternalIterCall:
			s.SafeString(", (internal (dir, seek, step): ")
		}
		s.Printf("(fwd, %d, %d), (rev, %d, %d))",
			redact.Safe(stats.ForwardSeekCount[i]), redact.Safe(stats.ForwardStepCount[i]),
			redact.Safe(stats.ReverseSeekCount[i]), redact.Safe(stats.ReverseStepCount[i]))
	}
	// Internal stats are only printed when at least one field is non-zero.
	if stats.InternalStats != (InternalIteratorStats{}) {
		s.SafeString(",\n(internal-stats: ")
		s.Printf("(block-bytes: (total %s, cached %s, read-time %s)), "+
			"(points: (count %s, key-bytes %s, value-bytes %s, tombstoned %s))",
			humanize.Bytes.Uint64(stats.InternalStats.BlockBytes),
			humanize.Bytes.Uint64(stats.InternalStats.BlockBytesInCache),
			humanize.FormattedString(stats.InternalStats.BlockReadDuration.String()),
			humanize.Count.Uint64(stats.InternalStats.PointCount),
			humanize.Bytes.Uint64(stats.InternalStats.KeyBytes),
			humanize.Bytes.Uint64(stats.InternalStats.ValueBytes),
			humanize.Count.Uint64(stats.InternalStats.PointsCoveredByRangeTombstones),
		)
		// Either branch emits the ')' that closes the "(internal-stats: "
		// group.
		if stats.InternalStats.SeparatedPointValue.Count != 0 {
			s.Printf(", (separated: (count %s, bytes %s, fetched %s)))",
				humanize.Count.Uint64(stats.InternalStats.SeparatedPointValue.Count),
				humanize.Bytes.Uint64(stats.InternalStats.SeparatedPointValue.ValueBytes),
				humanize.Bytes.Uint64(stats.InternalStats.SeparatedPointValue.ValueBytesFetched))
		} else {
			s.Printf(")")
		}
	}
	// Range key stats are likewise only printed when non-zero.
	if stats.RangeKeyStats != (RangeKeyIteratorStats{}) {
		s.SafeString(",\n(range-key-stats: ")
		s.Printf("(count %d), (contained points: (count %d, skipped %d)))",
			stats.RangeKeyStats.Count,
			stats.RangeKeyStats.ContainedPoints,
			stats.RangeKeyStats.SkippedPoints)
	}
}

// CanDeterministicallySingleDelete takes a valid iterator and examines internal
// state to determine if a SingleDelete deleting Iterator.Key() would
// deterministically delete the key. CanDeterministicallySingleDelete requires
// the iterator to be oriented in the forward direction (eg, the last
// positioning operation must've been a First, a Seek[Prefix]GE, or a
// Next[Prefix][WithLimit]).
//
// This function does not change the external position of the iterator, and all
// positioning methods should behave the same as if it was never called. This
// function will only return a meaningful result the first time it's invoked at
// an iterator position; a second invocation at the same position returns an
// error. This function invalidates the iterator Value's memory, and the caller
// must not rely on the memory safety of the previous Iterator position.
//
// If CanDeterministicallySingleDelete returns true AND the key at the iterator
// position is not modified between the creation of the Iterator and the commit
// of a batch containing a SingleDelete over the key, then the caller can be
// assured that SingleDelete is equivalent to Delete on the local engine, but it
// may not be true on another engine that received the same writes and with
// logically equivalent state since this engine may have collapsed multiple SETs
// into one.
func CanDeterministicallySingleDelete(it *Iterator) (bool, error) {
	// This function may only be called once per external iterator position. We
	// can validate this by checking the last positioning operation.
	if it.lastPositioningOp == internalNextOp {
		return false, errors.New("pebble: CanDeterministicallySingleDelete called twice")
	}
	validity, kind := it.internalNext()
	// shadowedBySingleDelete is true while the most recently observed
	// SINGLEDEL has not yet been matched with the one key it may delete.
	var shadowedBySingleDelete bool
	for validity == internalNextValid {
		switch kind {
		case InternalKeyKindDelete, InternalKeyKindDeleteSized:
			// A DEL or DELSIZED tombstone is okay. An internal key
			// sequence like SINGLEDEL; SET; DEL; SET can be handled
			// deterministically. If there are SETs further down, we
			// don't care about them.
			return true, nil
		case InternalKeyKindSingleDelete:
			// A SingleDelete is okay as long as when that SingleDelete was
			// written, it was written deterministically (eg, with its own
			// CanDeterministicallySingleDelete check). Validate that it was
			// written deterministically. We'll allow one set to appear after
			// the SingleDelete.
			shadowedBySingleDelete = true
			validity, kind = it.internalNext()
			continue
		case InternalKeyKindSet, InternalKeyKindSetWithDelete, InternalKeyKindMerge:
			// If we observed a single delete, it's allowed to delete 1 key.
			// We'll keep looping to validate that the internal keys beneath the
			// already-written single delete are copacetic.
			if shadowedBySingleDelete {
				shadowedBySingleDelete = false
				validity, kind = it.internalNext()
				continue
			}
			// We encountered a shadowed SET, SETWITHDEL, MERGE. A SINGLEDEL
			// that deleted the KV at the original iterator position could
			// result in this key becoming visible.
			return false, nil
		case InternalKeyKindRangeDelete:
			// RangeDeletes are handled by the merging iterator and should never
			// be observed by the top-level Iterator.
			panic(errors.AssertionFailedf("pebble: unexpected range delete"))
		case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete:
			// Range keys are interleaved at the maximal sequence number and
			// should never be observed within a user key.
			panic(errors.AssertionFailedf("pebble: unexpected range key"))
		default:
			panic(errors.AssertionFailedf("pebble: unexpected key kind: %s", errors.Safe(kind)))
		}
	}
	// The loop ended without a verdict: internalNext either failed or ran out
	// of internal keys sharing the current user key.
	if validity == internalNextError {
		return false, it.Error()
	}
	return true, nil
}

// internalNextValidity enumerates the potential outcomes of a call to
// internalNext.
type internalNextValidity int8

const (
	// internalNextError is returned by internalNext when an error occurred and
	// the caller is responsible for checking iter.Error().
	internalNextError internalNextValidity = iota
	// internalNextExhausted is returned by internalNext when there is no
	// further internal key with a user key equal to Iterator.Key(): the next
	// internal key has a different user key, or there is no next internal key
	// at all.
	internalNextExhausted
	// internalNextValid is returned by internalNext when the internal next
	// found a shadowed internal key with a user key equal to Iterator.Key().
	internalNextValid
)

// internalNext advances internal Iterator state forward to expose the
// InternalKeyKind of the next internal key with a user key equal to Key().
//
// internalNext is a highly specialized operation and is unlikely to be
// generally useful. See Iterator.Next for how to reposition the iterator to the
// next key. internalNext requires the Iterator to be at a valid position in the
// forward direction (the last positioning operation must've been a First, a
// Seek[Prefix]GE, or a Next[Prefix][WithLimit] and Valid() must return true).
//
// internalNext, unlike all other Iterator methods, exposes internal LSM state.
// internalNext advances the Iterator's internal iterator to the next shadowed
// key with a user key equal to Key(). When a key is overwritten or deleted, its
// removal from the LSM occurs lazily as a part of compactions. internalNext
// allows the caller to see whether an obsolete internal key exists with the
// current Key(), and what its key kind is. Note that the existence of an
// internal key is nondeterministic and dependent on internal LSM state. These
// semantics are unlikely to be useful for the vast majority of use cases.
//
// If internalNext finds a key that shares the same user key as Key(), it
// returns internalNextValid and the internal key's kind. If internalNext
// encounters an error, it returns internalNextError and the caller is expected
// to call Iterator.Error() to retrieve it. In all other circumstances,
// internalNext returns internalNextExhausted, indicating that there are no
// additional internal keys with the user key Key().
//
// internalNext does not change the external position of the iterator, and a
// Next operation should behave the same as if internalNext was never called.
// internalNext does invalidate the iterator Value's memory, and the caller must
// not rely on the memory safety of the previous Iterator position.
2954 func (i *Iterator) internalNext() (internalNextValidity, base.InternalKeyKind) { 2955 i.stats.ForwardStepCount[InterfaceCall]++ 2956 if i.err != nil { 2957 return internalNextError, base.InternalKeyKindInvalid 2958 } else if i.iterValidityState != IterValid { 2959 return internalNextExhausted, base.InternalKeyKindInvalid 2960 } 2961 i.lastPositioningOp = internalNextOp 2962 2963 switch i.pos { 2964 case iterPosCurForward: 2965 i.iterKey, i.iterValue = i.iter.Next() 2966 if i.iterKey == nil { 2967 // We check i.iter.Error() here and return an internalNextError enum 2968 // variant so that the caller does not need to check i.iter.Error() 2969 // in the common case that the next internal key has a new user key. 2970 if i.err = i.iter.Error(); i.err != nil { 2971 return internalNextError, base.InternalKeyKindInvalid 2972 } 2973 i.pos = iterPosNext 2974 return internalNextExhausted, base.InternalKeyKindInvalid 2975 } else if i.comparer.Equal(i.iterKey.UserKey, i.key) { 2976 return internalNextValid, i.iterKey.Kind() 2977 } 2978 i.pos = iterPosNext 2979 return internalNextExhausted, base.InternalKeyKindInvalid 2980 case iterPosCurReverse, iterPosCurReversePaused, iterPosPrev: 2981 i.err = errors.New("switching from reverse to forward via internalNext is prohibited") 2982 i.iterValidityState = IterExhausted 2983 return internalNextError, base.InternalKeyKindInvalid 2984 case iterPosNext, iterPosCurForwardPaused: 2985 // The previous method already moved onto the next user key. This is 2986 // only possible if 2987 // - the last positioning method was a call to internalNext, and we 2988 // advanced to a new user key. 2989 // - the previous non-internalNext iterator operation encountered a 2990 // range key or merge, forcing an internal Next that found a new 2991 // user key that's not equal to i.Iterator.Key(). 2992 return internalNextExhausted, base.InternalKeyKindInvalid 2993 default: 2994 panic("unreachable") 2995 } 2996 }