github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/iterator.go (about) 1 // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package pebble 6 7 import ( 8 "bytes" 9 "context" 10 "io" 11 "sync" 12 "unsafe" 13 14 "github.com/cockroachdb/errors" 15 "github.com/cockroachdb/pebble/internal/base" 16 "github.com/cockroachdb/pebble/internal/bytealloc" 17 "github.com/cockroachdb/pebble/internal/fastrand" 18 "github.com/cockroachdb/pebble/internal/humanize" 19 "github.com/cockroachdb/pebble/internal/invariants" 20 "github.com/cockroachdb/pebble/internal/keyspan" 21 "github.com/cockroachdb/pebble/internal/manifest" 22 "github.com/cockroachdb/pebble/internal/rangekey" 23 "github.com/cockroachdb/pebble/sstable" 24 "github.com/cockroachdb/redact" 25 ) 26 27 // iterPos describes the state of the internal iterator, in terms of whether it 28 // is at the position returned to the user (cur), one ahead of the position 29 // returned (next for forward iteration and prev for reverse iteration). The cur 30 // position is split into two states, for forward and reverse iteration, since 31 // we need to differentiate for switching directions. 32 // 33 // There is subtlety in what is considered the current position of the Iterator. 34 // The internal iterator exposes a sequence of internal keys. There is not 35 // always a single internalIterator position corresponding to the position 36 // returned to the user. Consider the example: 37 // 38 // a.MERGE.9 a.MERGE.8 a.MERGE.7 a.SET.6 b.DELETE.9 b.DELETE.5 b.SET.4 39 // \ / 40 // \ Iterator.Key() = 'a' / 41 // 42 // The Iterator exposes one valid position at user key 'a' and the two exhausted 43 // positions at the beginning and end of iteration. The underlying 44 // internalIterator contains 7 valid positions and 2 exhausted positions. 
//
// Iterator positioning methods must set iterPos to iterPosCur{Forward,Reverse}
// iff the user key at the current internalIterator position equals the
// Iterator.Key returned to the user. This guarantees that a call to nextUserKey
// or prevUserKey will advance to the next or previous iterator position.
// iterPosCur{Forward,Reverse} does not make any guarantee about the internal
// iterator position among internal keys with matching user keys, and it will
// vary subtly depending on the particular key kinds encountered. In the above
// example, the iterator returning 'a' may set iterPosCurForward if the internal
// iterator is positioned at any of a.MERGE.9, a.MERGE.8, a.MERGE.7 or a.SET.6.
//
// When setting iterPos to iterPosNext or iterPosPrev, the internal iterator
// must be advanced to the first internalIterator position at a user key greater
// (iterPosNext) or less (iterPosPrev) than the key returned to the user. An
// internalIterator position that's !Valid() must also be considered greater or
// less—depending on the direction of iteration—than the last valid Iterator
// position.
type iterPos int8

const (
	iterPosCurForward iterPos = 0
	iterPosNext       iterPos = 1
	iterPosPrev       iterPos = -1
	iterPosCurReverse iterPos = -2

	// For limited iteration. When the iterator is at iterPosCurForwardPaused
	// - Next*() call should behave as if the internal iterator is already
	//   at next (akin to iterPosNext).
	// - Prev*() call should behave as if the internal iterator is at the
	//   current key (akin to iterPosCurForward).
	//
	// Similar semantics apply to iterPosCurReversePaused.
	iterPosCurForwardPaused iterPos = 2
	iterPosCurReversePaused iterPos = -3
)

// Approximate gap in bytes between samples of data read during iteration.
// This is multiplied with a default ReadSamplingMultiplier of 1 << 4 to yield
// 1 << 20 (1MB).
// The 1MB factor comes from:
// https://github.com/cockroachdb/pebble/issues/29#issuecomment-494477985
const readBytesPeriod uint64 = 1 << 16

// errReversePrefixIteration is the sentinel error for reverse iteration
// attempted while in prefix iteration mode.
var errReversePrefixIteration = errors.New("pebble: unsupported reverse prefix iteration")

// IteratorMetrics holds per-iterator metrics. These do not change over the
// lifetime of the iterator.
type IteratorMetrics struct {
	// The read amplification experienced by this iterator. This is the sum of
	// the memtables, the L0 sublevels and the non-empty Ln levels. Higher read
	// amplification generally results in slower reads, though allowing higher
	// read amplification can also result in faster writes.
	ReadAmp int
}

// IteratorStatsKind describes the two kinds of iterator stats.
type IteratorStatsKind int8

const (
	// InterfaceCall represents calls to Iterator.
	InterfaceCall IteratorStatsKind = iota
	// InternalIterCall represents calls by Iterator to its internalIterator.
	InternalIterCall
	// NumStatsKind is the number of kinds, and is used for array sizing.
	NumStatsKind
)

// IteratorStats contains iteration stats. Each counter array is indexed by
// IteratorStatsKind.
type IteratorStats struct {
	// ForwardSeekCount includes SeekGE, SeekPrefixGE, First.
	ForwardSeekCount [NumStatsKind]int
	// ReverseSeekCount includes SeekLT, Last.
	ReverseSeekCount [NumStatsKind]int
	// ForwardStepCount includes Next.
	ForwardStepCount [NumStatsKind]int
	// ReverseStepCount includes Prev.
	ReverseStepCount [NumStatsKind]int
	InternalStats    InternalIteratorStats
	RangeKeyStats    RangeKeyIteratorStats
}

// Compile-time check that *IteratorStats implements redact.SafeFormatter.
var _ redact.SafeFormatter = &IteratorStats{}

// InternalIteratorStats contains miscellaneous stats produced by internal
// iterators.
type InternalIteratorStats = base.InternalIteratorStats

// RangeKeyIteratorStats contains miscellaneous stats about range keys
// encountered by the iterator.
133 type RangeKeyIteratorStats struct { 134 // Count records the number of range keys encountered during 135 // iteration. Range keys may be counted multiple times if the iterator 136 // leaves a range key's bounds and then returns. 137 Count int 138 // ContainedPoints records the number of point keys encountered within the 139 // bounds of a range key. Note that this includes point keys with suffixes 140 // that sort both above and below the covering range key's suffix. 141 ContainedPoints int 142 // SkippedPoints records the count of the subset of ContainedPoints point 143 // keys that were skipped during iteration due to range-key masking. It does 144 // not include point keys that were never loaded because a 145 // RangeKeyMasking.Filter excluded the entire containing block. 146 SkippedPoints int 147 } 148 149 // Merge adds all of the argument's statistics to the receiver. It may be used 150 // to accumulate stats across multiple iterators. 151 func (s *RangeKeyIteratorStats) Merge(o RangeKeyIteratorStats) { 152 s.Count += o.Count 153 s.ContainedPoints += o.ContainedPoints 154 s.SkippedPoints += o.SkippedPoints 155 } 156 157 // LazyValue is a lazy value. See the long comment in base.LazyValue. 158 type LazyValue = base.LazyValue 159 160 // Iterator iterates over a DB's key/value pairs in key order. 161 // 162 // An iterator must be closed after use, but it is not necessary to read an 163 // iterator until exhaustion. 164 // 165 // An iterator is not goroutine-safe, but it is safe to use multiple iterators 166 // concurrently, with each in a dedicated goroutine. 167 // 168 // It is also safe to use an iterator concurrently with modifying its 169 // underlying DB, if that DB permits modification. However, the resultant 170 // key/value pairs are not guaranteed to be a consistent snapshot of that DB 171 // at a particular point in time. 
//
// If an iterator encounters an error during any operation, it is stored by
// the Iterator and surfaced through the Error method. All absolute
// positioning methods (eg, SeekLT, SeekGE, First, Last, etc) reset any
// accumulated error before positioning. All relative positioning methods (eg,
// Next, Prev) return without advancing if the iterator has an accumulated
// error.
type Iterator struct {
	// The context is stored here since (a) Iterators are expected to be
	// short-lived (since they pin memtables and sstables), (b) plumbing a
	// context into every method is very painful, (c) they do not (yet) respect
	// context cancellation and are only used for tracing.
	ctx       context.Context
	opts      IterOptions
	merge     Merge
	comparer  base.Comparer
	iter      internalIterator
	pointIter internalIterator
	// Either readState or version is set, but not both.
	readState *readState
	version   *version
	// rangeKey holds iteration state specific to iteration over range keys.
	// The range key field may be nil if the Iterator has never been configured
	// to iterate over range keys. Its non-nilness cannot be used to determine
	// if the Iterator is currently iterating over range keys: For that, consult
	// the IterOptions using opts.rangeKeys(). If non-nil, its rangeKeyIter
	// field is guaranteed to be non-nil too.
	rangeKey *iteratorRangeKeyState
	// rangeKeyMasking holds state for range-key masking of point keys.
	rangeKeyMasking rangeKeyMasking
	err             error
	// When iterValidityState=IterValid, key represents the current key, which
	// is backed by keyBuf.
	key    []byte
	keyBuf []byte
	value  LazyValue
	// For use in LazyValue.Clone.
	valueBuf []byte
	fetcher  base.LazyFetcher
	// For use in LazyValue.Value.
	lazyValueBuf []byte
	valueCloser  io.Closer
	// boundsBuf holds two buffers used to store the lower and upper bounds.
	// Whenever the Iterator's bounds change, the new bounds are copied into
	// boundsBuf[boundsBufIdx]. The two bounds share a slice to reduce
	// allocations. opts.LowerBound and opts.UpperBound point into this slice.
	boundsBuf    [2][]byte
	boundsBufIdx int
	// iterKey, iterValue reflect the latest position of iter, except when
	// SetBounds is called. In that case, these are explicitly set to nil.
	iterKey             *InternalKey
	iterValue           LazyValue
	alloc               *iterAlloc
	getIterAlloc        *getIterAlloc
	prefixOrFullSeekKey []byte
	readSampling        readSampling
	stats               IteratorStats
	externalReaders     [][]*sstable.Reader

	// Following fields used when constructing an iterator stack, eg, in Clone
	// and SetOptions or when re-fragmenting a batch's range keys/range dels.
	// Non-nil if this Iterator includes a Batch.
	batch            *Batch
	newIters         tableNewIters
	newIterRangeKey  keyspan.TableNewSpanIter
	lazyCombinedIter lazyCombinedIter
	seqNum           uint64
	// batchSeqNum is used by Iterators over indexed batches to detect when the
	// underlying batch has been mutated. The batch beneath an indexed batch may
	// be mutated while the Iterator is open, but new keys are not surfaced
	// until the next call to SetOptions.
	batchSeqNum uint64
	// batch{PointIter,RangeDelIter,RangeKeyIter} are used when the Iterator is
	// configured to read through an indexed batch. If a batch is set, these
	// iterators will be included within the iterator stack regardless of
	// whether the batch currently contains any keys of their kind. These
	// pointers are used during a call to SetOptions to refresh the Iterator's
	// view of its indexed batch.
	batchPointIter    batchIter
	batchRangeDelIter keyspan.Iter
	batchRangeKeyIter keyspan.Iter
	// merging is a pointer to this iterator's point merging iterator. It
	// appears here because key visibility is handled by the merging iterator.
	// During SetOptions on an iterator over an indexed batch, this field is
	// used to update the merging iterator's batch snapshot.
	merging *mergingIter

	// Keeping the bools here after all the 8 byte aligned fields shrinks the
	// sizeof this struct by 24 bytes.

	// INVARIANT:
	// iterValidityState==IterAtLimit <=>
	//  pos==iterPosCurForwardPaused || pos==iterPosCurReversePaused
	iterValidityState IterValidityState
	// Set to true by SetBounds, SetOptions. Causes the Iterator to appear
	// exhausted externally, while preserving the correct iterValidityState for
	// the iterator's internal state. Preserving the correct internal validity
	// is used for SeekPrefixGE(..., trySeekUsingNext), and SeekGE/SeekLT
	// optimizations after "no-op" calls to SetBounds and SetOptions.
	requiresReposition bool
	// The position of iter. When this is iterPos{Prev,Next} the iter has been
	// moved past the current key-value, which can only happen if
	// iterValidityState=IterValid, i.e., there is something to return to the
	// client for the current position.
	pos iterPos
	// Relates to the prefixOrFullSeekKey field above.
	hasPrefix bool
	// Used for deriving the value of SeekPrefixGE(..., trySeekUsingNext),
	// and SeekGE/SeekLT optimizations
	lastPositioningOp lastPositioningOpKind
	// Used for determining when it's safe to perform SeekGE optimizations that
	// reuse the iterator state to avoid the cost of a full seek if the iterator
	// is already positioned in the correct place. If the iterator's view of its
	// indexed batch was just refreshed, some optimizations cannot be applied on
	// the first seek after the refresh:
	// - SeekGE has a no-op optimization that does not seek on the internal
	//   iterator at all if the iterator is already in the correct place.
	//   This optimization cannot be performed if the internal iterator was
	//   last positioned when the iterator had a different view of an
	//   underlying batch.
	// - Seek[Prefix]GE set flags.TrySeekUsingNext()=true when the seek key is
	//   greater than the previous operation's seek key, under the expectation
	//   that the various internal iterators can use their current position to
	//   avoid a full expensive re-seek. This applies to the batchIter as well.
	//   However, if the view of the batch was just refreshed, the batchIter's
	//   position is not useful because it may already be beyond new keys less
	//   than the seek key. To prevent the use of this optimization in
	//   batchIter, Seek[Prefix]GE set flags.BatchJustRefreshed()=true if this
	//   bit is enabled.
	batchJustRefreshed bool
	// Used for an optimization in external iterators to reduce the number of
	// merging levels.
	forwardOnly bool
	// batchOnlyIter is set to true for Batch.NewBatchOnlyIter.
	batchOnlyIter bool
	// closePointIterOnce is set to true if this point iter can only be Close()d
	// once, _and_ closing i.iter and then i.pointIter would close i.pointIter
	// twice. This is necessary to track if the point iter is an internal iterator
	// that could release its resources to a pool on Close(), making it harder for
	// that iterator to make its own closes idempotent.
	//
	// TODO(bilal): Update SetOptions to always close out point key iterators when
	// they won't be used, so that Close() doesn't need to default to closing
	// point iterators twice.
	closePointIterOnce bool
	// Used in some tests to disable the random disabling of seek optimizations.
	forceEnableSeekOpt bool
	// Set to true if NextPrefix is not currently permitted. Defaults to false
	// in case an iterator never had any bounds.
	nextPrefixNotPermittedByUpperBound bool
}

// cmp is a convenience shorthand for the i.comparer.Compare function.
func (i *Iterator) cmp(a, b []byte) int {
	return i.comparer.Compare(a, b)
}

// split is a convenience shorthand for the i.comparer.Split function.
func (i *Iterator) split(a []byte) int {
	return i.comparer.Split(a)
}

// equal is a convenience shorthand for the i.comparer.Equal function.
func (i *Iterator) equal(a, b []byte) bool {
	return i.comparer.Equal(a, b)
}

// iteratorRangeKeyState holds an iterator's range key iteration state.
type iteratorRangeKeyState struct {
	opts  *IterOptions
	cmp   base.Compare
	split base.Split
	// rangeKeyIter holds the range key iterator stack that iterates over the
	// merged spans across the entirety of the LSM.
	rangeKeyIter keyspan.FragmentIterator
	iiter        keyspan.InterleavingIter
	// stale is set to true when the range key state recorded here (in start,
	// end and keys) may not be in sync with the current range key at the
	// interleaving iterator's current position.
	//
	// When the interleaving iterator passes over a new span, it invokes the
	// SpanChanged hook defined on the `rangeKeyMasking` type, which sets stale
	// to true if the span is non-nil.
	//
	// The parent iterator may not be positioned over the interleaving
	// iterator's current position (eg, i.pos = iterPos{Next,Prev}), so
	// {keys,start,end} are only updated to the new range key during a call to
	// Iterator.saveRangeKey.
	stale bool
	// updated is used to signal to the Iterator client whether the state of
	// range keys has changed since the previous iterator position through the
	// `RangeKeyChanged` method. It's set to true during an Iterator positioning
	// operation that changes the state of the current range key. Each Iterator
	// positioning operation sets it back to false before executing.
	//
	// TODO(jackson): The lifecycle of {stale,updated,prevPosHadRangeKey} is
	// intricate and confusing. Try to refactor to reduce complexity.
	updated bool
	// prevPosHadRangeKey records whether the previous Iterator position had a
	// range key (HasPointAndRange() = (_, true)). It's updated at the beginning
	// of each new Iterator positioning operation. It's required by saveRangeKey
	// to set `updated` appropriately: Without this record of the previous iterator
	// state, it's ambiguous whether an iterator only temporarily stepped onto a
	// position without a range key.
	prevPosHadRangeKey bool
	// rangeKeyOnly is set to true if at the current iterator position there is
	// no point key, only a range key start boundary.
	rangeKeyOnly bool
	// hasRangeKey is true when the current iterator position has a covering
	// range key (eg, a range key with bounds [<lower>,<upper>) such that
	// <lower> ≤ Key() < <upper>).
	hasRangeKey bool
	// start and end are the [start, end) boundaries of the current range keys.
	start []byte
	end   []byte

	rangeKeyBuffers

	// iterConfig holds fields that are used for the construction of the
	// iterator stack, but do not need to be directly accessed during iteration.
	// This struct is bundled within the iteratorRangeKeyState struct to reduce
	// allocations.
	iterConfig rangekey.UserIteratorConfig
}

// rangeKeyBuffers groups the reusable buffers that back range-key state. It
// is embedded in iteratorRangeKeyState.
type rangeKeyBuffers struct {
	// keys is sorted by Suffix ascending.
	keys []RangeKeyData
	// buf is used to save range-key data before moving the range-key iterator.
	// Start and end boundaries, suffixes and values are all copied into buf.
	buf bytealloc.A
	// internal holds buffers used by the range key internal iterators.
	internal rangekey.Buffers
}

// PrepareForReuse readies the buffers for reuse. The keys slice is dropped
// when it holds more than maxKeysReuse entries, buf is either dropped (when
// its capacity has reached maxKeyBufCacheSize) or truncated to zero length,
// and the internal buffers are asked to prepare themselves for reuse.
func (b *rangeKeyBuffers) PrepareForReuse() {
	const maxKeysReuse = 100
	if len(b.keys) > maxKeysReuse {
		b.keys = nil
	}
	// Avoid caching the key buf if it is overly large. The constant is
	// fairly arbitrary.
	if cap(b.buf) >= maxKeyBufCacheSize {
		b.buf = nil
	} else {
		b.buf = b.buf[:0]
	}
	b.internal.PrepareForReuse()
}

// init records the comparison function, the split function and the iterator
// options on the range key state. No other field is touched.
func (i *iteratorRangeKeyState) init(cmp base.Compare, split base.Split, opts *IterOptions) {
	i.cmp = cmp
	i.split = split
	i.opts = opts
}

// iterRangeKeyStateAllocPool is a sync.Pool of *iteratorRangeKeyState values;
// its New function returns a zero-valued state.
var iterRangeKeyStateAllocPool = sync.Pool{
	New: func() interface{} {
		return &iteratorRangeKeyState{}
	},
}

// isEphemeralPosition returns true iff the current iterator position is
// ephemeral, and won't be visited during subsequent relative positioning
// operations.
//
// The iterator position resulting from a SeekGE or SeekPrefixGE that lands on a
// straddling range key without a coincident point key is such a position.
func (i *Iterator) isEphemeralPosition() bool {
	return i.opts.rangeKeys() && i.rangeKey != nil && i.rangeKey.rangeKeyOnly &&
		!i.equal(i.rangeKey.start, i.key)
}

// lastPositioningOpKind enumerates the values stored in
// Iterator.lastPositioningOp.
type lastPositioningOpKind int8

const (
	unknownLastPositionOp lastPositioningOpKind = iota
	seekPrefixGELastPositioningOp
	seekGELastPositioningOp
	seekLTLastPositioningOp
	// internalNextOp is a special internal iterator positioning operation used
	// by CanDeterministicallySingleDelete. It exists for enforcing requirements
	// around calling CanDeterministicallySingleDelete at most once per external
	// iterator position.
	internalNextOp
	// invalidatedLastPositionOp is similar to unknownLastPositionOp and the
	// only reason to distinguish this is for the wider set of SeekGE
	// optimizations we permit for the external iterator Iterator.forwardOnly
	// case. Most code predicates should be doing equality comparisons with one
	// of the seek* enum values, so this duplication should not result in code
	// of the form:
	//  if unknownLastPositionOp || invalidatedLastPositionOp
	invalidatedLastPositionOp
)

// Limited iteration mode. Not for use with prefix iteration.
//
// SeekGE, SeekLT, Prev, Next have WithLimit variants, that pause the iterator
// at the limit in a best-effort manner. The client should behave correctly
// even if the limits are ignored. These limits are not "deep", in that they
// are not passed down to the underlying collection of internalIterators. This
// is because the limits are transient, and apply only until the next
// iteration call. They serve mainly as a way to bound the amount of work when
// two (or more) Iterators are being coordinated at a higher level.
//
// In limited iteration mode:
// - Avoid using Iterator.Valid if the last call was to a *WithLimit() method.
//   The return value from the *WithLimit() method provides a more precise
//   disposition.
// - The limit is exclusive for forward and inclusive for reverse.
//
//
// Limited iteration mode & range keys
//
// Limited iteration interacts with range-key iteration. When range key
// iteration is enabled, range keys are interleaved at their start boundaries.
// Limited iteration must ensure that if a range key exists within the limit,
// the iterator visits the range key.
//
// During forward limited iteration, this is trivial: An overlapping range key
// must have a start boundary less than the limit, and the range key's start
// boundary will be interleaved and found to be within the limit.
//
// During reverse limited iteration, the tail of the range key may fall within
// the limit.
// The range key must be surfaced even if the range key's start
// boundary is less than the limit, and if there are no point keys between the
// current iterator position and the limit. To provide this guarantee, reverse
// limited iteration ignores the limit as long as there is a range key
// overlapping the iteration position.

// IterValidityState captures the state of the Iterator.
type IterValidityState int8

const (
	// IterExhausted represents an Iterator that is exhausted.
	IterExhausted IterValidityState = iota
	// IterValid represents an Iterator that is valid.
	IterValid
	// IterAtLimit represents an Iterator that has a non-exhausted
	// internalIterator, but has reached a limit without any key for the
	// caller.
	IterAtLimit
)

// readSampling stores variables used to sample a read to trigger a read
// compaction
type readSampling struct {
	bytesUntilReadSampling uint64
	initialSamplePassed    bool
	pendingCompactions     readCompactionQueue
	// forceReadSampling is used for testing purposes to force a read sample on every
	// call to Iterator.maybeSampleRead()
	forceReadSampling bool
}

// findNextEntry scans forward from the internal iterator's current position
// (i.iterKey) for the next entry to surface to the user.
//
// On entry it resets iterValidityState to IterExhausted and pos to
// iterPosCurForward, clears rangeKeyOnly when range keys are in use, and
// closes any open value closer (returning immediately if that leaves an
// error). It then walks internal keys until one of the following happens:
//   - the internal iterator is exhausted (state stays IterExhausted);
//   - hasPrefix is set and the key's prefix differs from prefixOrFullSeekKey;
//   - limit is non-nil and limit <= the key's user key, in which case the
//     iterator pauses with IterAtLimit / iterPosCurForwardPaused;
//   - a range key set, a SET/SETWITHDEL, or a MERGE that resolves to a value
//     is found, in which case the state becomes IterValid;
//   - an error is recorded in i.err.
//
// Deletion tombstones, keys rejected by opts.SkipPoint, and merges whose
// result is a deletion are skipped.
func (i *Iterator) findNextEntry(limit []byte) {
	i.iterValidityState = IterExhausted
	i.pos = iterPosCurForward
	if i.opts.rangeKeys() && i.rangeKey != nil {
		i.rangeKey.rangeKeyOnly = false
	}

	// Close the closer for the current value if one was open.
	if i.closeValueCloser() != nil {
		return
	}

	for i.iterKey != nil {
		key := *i.iterKey

		if i.hasPrefix {
			if n := i.split(key.UserKey); !i.equal(i.prefixOrFullSeekKey, key.UserKey[:n]) {
				return
			}
		}
		// Compare with limit every time we start at a different user key.
		// Note that given the best-effort contract of limit, we could avoid a
		// comparison in the common case by doing this only after
		// i.nextUserKey is called for the deletes below. However that makes
		// the behavior non-deterministic (since the behavior will vary based
		// on what has been compacted), which makes it hard to test with the
		// metamorphic test. So we forego that performance optimization.
		if limit != nil && i.cmp(limit, i.iterKey.UserKey) <= 0 {
			i.iterValidityState = IterAtLimit
			i.pos = iterPosCurForwardPaused
			return
		}

		// If the user has configured a SkipPoint function, invoke it to see
		// whether we should skip over the current user key.
		if i.opts.SkipPoint != nil && key.Kind() != InternalKeyKindRangeKeySet && i.opts.SkipPoint(i.iterKey.UserKey) {
			// NB: We could call nextUserKey, but in some cases the SkipPoint
			// predicate function might be cheaper than nextUserKey's key copy
			// and key comparison. This should be the case for MVCC suffix
			// comparisons, for example. In the future, we could expand the
			// SkipPoint interface to give the implementor more control over
			// whether we skip over just the internal key, the user key, or even
			// the key prefix.
			i.stats.ForwardStepCount[InternalIterCall]++
			i.iterKey, i.iterValue = i.iter.Next()
			continue
		}

		switch key.Kind() {
		case InternalKeyKindRangeKeySet:
			// Save the current key.
			i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
			i.key = i.keyBuf
			i.value = LazyValue{}
			// There may also be a live point key at this userkey that we have
			// not yet read. We need to find the next entry with this user key
			// to find it. Save the range key so we don't lose it when we Next
			// the underlying iterator.
			i.saveRangeKey()
			pointKeyExists := i.nextPointCurrentUserKey()
			if i.err != nil {
				i.iterValidityState = IterExhausted
				return
			}
			i.rangeKey.rangeKeyOnly = !pointKeyExists
			i.iterValidityState = IterValid
			return

		case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
			// NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not
			// only simpler, but is also necessary for correctness due to
			// InternalKeyKindSSTableInternalObsoleteBit.
			i.nextUserKey()
			continue

		case InternalKeyKindSet, InternalKeyKindSetWithDelete:
			i.keyBuf = append(i.keyBuf[:0], key.UserKey...)
			i.key = i.keyBuf
			i.value = i.iterValue
			i.iterValidityState = IterValid
			i.saveRangeKey()
			return

		case InternalKeyKindMerge:
			// Resolving the merge may advance us to the next point key, which
			// may be covered by a different set of range keys. Save the range
			// key state so we don't lose it.
			i.saveRangeKey()
			if i.mergeForward(key) {
				i.iterValidityState = IterValid
				return
			}

			// The merge didn't yield a valid key, either because the value
			// merger indicated it should be deleted, or because an error was
			// encountered.
			i.iterValidityState = IterExhausted
			if i.err != nil {
				return
			}
			if i.pos != iterPosNext {
				i.nextUserKey()
			}
			if i.closeValueCloser() != nil {
				return
			}
			i.pos = iterPosCurForward

		default:
			i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind()))
			i.iterValidityState = IterExhausted
			return
		}
	}
}

// nextPointCurrentUserKey steps the internal iterator forward once, looking
// for a point key at the same user key as i.key. findNextEntry calls it after
// surfacing a range key set.
//
// It returns true when a live point key exists at i.key: a SET/SETWITHDEL
// (i.value is set to the internal value) or a MERGE that mergeForward
// resolves to a valid value. It returns false when:
//   - opts.SkipPoint rejects i.key (the internal iterator is not stepped);
//   - the next internal key is nil or has a different user key (pos is set
//     to iterPosNext);
//   - the next internal key is a deletion tombstone;
//   - an error occurs, which is recorded in i.err.
func (i *Iterator) nextPointCurrentUserKey() bool {
	// If the user has configured a SkipPoint function and the current user key
	// would be skipped by it, there's no need to step forward looking for a
	// point key. If we were to find one, it should be skipped anyways.
	if i.opts.SkipPoint != nil && i.opts.SkipPoint(i.key) {
		return false
	}

	i.pos = iterPosCurForward

	i.iterKey, i.iterValue = i.iter.Next()
	i.stats.ForwardStepCount[InternalIterCall]++
	if i.iterKey == nil || !i.equal(i.key, i.iterKey.UserKey) {
		i.pos = iterPosNext
		return false
	}

	key := *i.iterKey
	switch key.Kind() {
	case InternalKeyKindRangeKeySet:
		// RangeKeySets must always be interleaved as the first internal key
		// for a user key.
		i.err = base.CorruptionErrorf("pebble: unexpected range key set mid-user key")
		return false

	case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized:
		// NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not
		// only simpler, but is also necessary for correctness due to
		// InternalKeyKindSSTableInternalObsoleteBit.
		return false

	case InternalKeyKindSet, InternalKeyKindSetWithDelete:
		i.value = i.iterValue
		return true

	case InternalKeyKindMerge:
		return i.mergeForward(key)

	default:
		i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind()))
		return false
	}
}

// mergeForward resolves a MERGE key, advancing the underlying iterator forward
// to merge with subsequent keys with the same userkey. mergeForward returns a
// boolean indicating whether or not the merge yielded a valid key. A merge may
// not yield a valid key if an error occurred, in which case i.err is non-nil,
// or the user's value merger specified the key to be deleted.
//
// mergeForward does not update iterValidityState.
func (i *Iterator) mergeForward(key base.InternalKey) (valid bool) {
	// Materialize the value of the MERGE key the internal iterator is on.
	var iterValue []byte
	iterValue, _, i.err = i.iterValue.Value(nil)
	if i.err != nil {
		return false
	}
	var valueMerger ValueMerger
	valueMerger, i.err = i.merge(key.UserKey, iterValue)
	if i.err != nil {
		return false
	}

	i.mergeNext(key, valueMerger)
	if i.err != nil {
		return false
	}

	var needDelete bool
	var value []byte
	value, needDelete, i.valueCloser, i.err = finishValueMerger(
		valueMerger, true /* includesBase */)
	i.value = base.MakeInPlaceValue(value)
	if i.err != nil {
		return false
	}
	if needDelete {
		// The merged result is a deletion; release the closer right away.
		// The Close result is still recorded in i.err by closeValueCloser.
		_ = i.closeValueCloser()
		return false
	}
	return true
}

// closeValueCloser closes i.valueCloser if one is set, stores the result of
// Close in i.err and clears the closer. It returns i.err; when no closer was
// open this is whatever error the iterator had already accumulated.
func (i *Iterator) closeValueCloser() error {
	if i.valueCloser != nil {
		i.err = i.valueCloser.Close()
		i.valueCloser = nil
	}
	return i.err
}

// nextUserKey advances the internal iterator until it is exhausted or is
// positioned on a user key different from the one it started on. It is a
// no-op when the internal iterator is already exhausted. When the Iterator is
// not currently valid, the starting user key is first copied into i.keyBuf /
// i.key so there is a stable key to compare against.
func (i *Iterator) nextUserKey() {
	if i.iterKey == nil {
		return
	}
	trailer := i.iterKey.Trailer
	done := i.iterKey.Trailer <= base.InternalKeyZeroSeqnumMaxTrailer
	if i.iterValidityState != IterValid {
		i.keyBuf = append(i.keyBuf[:0], i.iterKey.UserKey...)
		i.key = i.keyBuf
	}
	for {
		i.iterKey, i.iterValue = i.iter.Next()
		i.stats.ForwardStepCount[InternalIterCall]++
		// NB: We're guaranteed to be on the next user key if the previous key
		// had a zero sequence number (`done`), or the new key has a trailer
		// greater or equal to the previous key's trailer. This is true because
		// internal keys with the same user key are sorted by Trailer in
		// strictly monotonically descending order. We expect the trailer
		// optimization to trigger around 50% of the time with randomly
		// distributed writes. We expect it to trigger very frequently when
		// iterating through ingested sstables, which contain keys that all have
		// the same sequence number.
		if done || i.iterKey == nil || i.iterKey.Trailer >= trailer {
			break
		}
		if !i.equal(i.key, i.iterKey.UserKey) {
			break
		}
		done = i.iterKey.Trailer <= base.InternalKeyZeroSeqnumMaxTrailer
		trailer = i.iterKey.Trailer
	}
}

// maybeSampleRead charges the size of the current key and value against the
// read-sampling budget (readSampling.bytesUntilReadSampling) and calls
// sampleRead each time the budget runs out. It does nothing unless the
// iterator is valid and has a readState, and it samples unconditionally when
// readSampling.forceReadSampling is set.
func (i *Iterator) maybeSampleRead() {
	// This method is only called when a public method of Iterator is
	// returning, and below we exclude the case where the iterator is paused at
	// a limit. The effect of these choices is that keys that are deleted, but
	// are encountered during iteration, are not accounted for in the read
	// sampling and will not cause read driven compactions, even though we are
	// incurring cost in iterating over them. And this issue is not limited to
	// Iterator, which does not see the effect of range deletes, which may be
	// causing iteration work in mergingIter. It is not clear at this time
	// whether this is a deficiency worth addressing.
	if i.iterValidityState != IterValid {
		return
	}
	if i.readState == nil {
		return
	}
	if i.readSampling.forceReadSampling {
		i.sampleRead()
		return
	}
	// NOTE(review): the int64 product is narrowed to int32 here; confirm that
	// a very large ReadSamplingMultiplier cannot wrap to an unintended value.
	// A non-positive period disables sampling.
	samplingPeriod := int32(int64(readBytesPeriod) * i.readState.db.opts.Experimental.ReadSamplingMultiplier)
	if samplingPeriod <= 0 {
		return
	}
	bytesRead := uint64(len(i.key) + i.value.Len())
	for i.readSampling.bytesUntilReadSampling < bytesRead {
		i.readSampling.bytesUntilReadSampling += uint64(fastrand.Uint32n(2 * uint32(samplingPeriod)))
		// The block below tries to adjust for the case where this is the
		// first read in a newly-opened iterator. As bytesUntilReadSampling
		// starts off at zero, we don't want to sample the first read of
		// every newly-opened iterator, but we do want to sample some of them.
		if !i.readSampling.initialSamplePassed {
			i.readSampling.initialSamplePassed = true
			if fastrand.Uint32n(uint32(i.readSampling.bytesUntilReadSampling)) > uint32(bytesRead) {
				continue
			}
		}
		i.sampleRead()
	}
	i.readSampling.bytesUntilReadSampling -= bytesRead
}

func (i *Iterator) sampleRead() {
	var topFile *manifest.FileMetadata
	topLevel, numOverlappingLevels := numLevels, 0
	mi := i.merging
	if mi == nil {
		return
	}
	if len(mi.levels) > 1 {
		mi.ForEachLevelIter(func(li *levelIter) bool {
			l := manifest.LevelToInt(li.level)
			if f := li.iterFile; f != nil {
				var containsKey bool
				if i.pos == iterPosNext || i.pos == iterPosCurForward ||
					i.pos == iterPosCurForwardPaused {
					containsKey = i.cmp(f.SmallestPointKey.UserKey, i.key) <= 0
				} else if i.pos == iterPosPrev || i.pos == iterPosCurReverse ||
					i.pos == iterPosCurReversePaused {
					containsKey = i.cmp(f.LargestPointKey.UserKey, i.key) >= 0
				}
				// Do nothing if the current key is not contained in f's
				// bounds. We could seek the LevelIterator at this level
				// to find the right file, but the performance impacts of
				// doing that are significant enough to negate the benefits
				// of read sampling in the first place. See the discussion
				// at:
				// https://github.com/cockroachdb/pebble/pull/1041#issuecomment-763226492
				if containsKey {
					numOverlappingLevels++
					if numOverlappingLevels >= 2 {
						// Terminate the loop early if at least 2 overlapping levels are found.
						return true
					}
					topLevel = l
					topFile = f
				}
			}
			return false
		})
	}
	if topFile == nil || topLevel >= numLevels {
		return
	}
	if numOverlappingLevels >= 2 {
		allowedSeeks := topFile.AllowedSeeks.Add(-1)
		if allowedSeeks == 0 {

			// Since the compaction queue can handle duplicates, we can keep
			// adding to the queue even once allowedSeeks hits 0.
857 // In fact, we NEED to keep adding to the queue, because the queue 858 // is small and evicts older and possibly useful compactions. 859 topFile.AllowedSeeks.Add(topFile.InitAllowedSeeks) 860 861 read := readCompaction{ 862 start: topFile.SmallestPointKey.UserKey, 863 end: topFile.LargestPointKey.UserKey, 864 level: topLevel, 865 fileNum: topFile.FileNum, 866 } 867 i.readSampling.pendingCompactions.add(&read, i.cmp) 868 } 869 } 870 } 871 872 func (i *Iterator) findPrevEntry(limit []byte) { 873 i.iterValidityState = IterExhausted 874 i.pos = iterPosCurReverse 875 if i.opts.rangeKeys() && i.rangeKey != nil { 876 i.rangeKey.rangeKeyOnly = false 877 } 878 879 // Close the closer for the current value if one was open. 880 if i.valueCloser != nil { 881 i.err = i.valueCloser.Close() 882 i.valueCloser = nil 883 if i.err != nil { 884 i.iterValidityState = IterExhausted 885 return 886 } 887 } 888 889 var valueMerger ValueMerger 890 firstLoopIter := true 891 rangeKeyBoundary := false 892 // The code below compares with limit in multiple places. As documented in 893 // findNextEntry, this is being done to make the behavior of limit 894 // deterministic to allow for metamorphic testing. It is not required by 895 // the best-effort contract of limit. 896 for i.iterKey != nil { 897 key := *i.iterKey 898 899 // NB: We cannot pause if the current key is covered by a range key. 900 // Otherwise, the user might not ever learn of a range key that covers 901 // the key space being iterated over in which there are no point keys. 902 // Since limits are best effort, ignoring the limit in this case is 903 // allowed by the contract of limit. 
904 if firstLoopIter && limit != nil && i.cmp(limit, i.iterKey.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) { 905 i.iterValidityState = IterAtLimit 906 i.pos = iterPosCurReversePaused 907 return 908 } 909 firstLoopIter = false 910 911 if i.iterValidityState == IterValid { 912 if !i.equal(key.UserKey, i.key) { 913 // We've iterated to the previous user key. 914 i.pos = iterPosPrev 915 if valueMerger != nil { 916 var needDelete bool 917 var value []byte 918 value, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, true /* includesBase */) 919 i.value = base.MakeInPlaceValue(value) 920 if i.err == nil && needDelete { 921 // The point key at this key is deleted. If we also have 922 // a range key boundary at this key, we still want to 923 // return. Otherwise, we need to continue looking for 924 // a live key. 925 i.value = LazyValue{} 926 if rangeKeyBoundary { 927 i.rangeKey.rangeKeyOnly = true 928 } else { 929 i.iterValidityState = IterExhausted 930 if i.closeValueCloser() == nil { 931 continue 932 } 933 } 934 } 935 } 936 if i.err != nil { 937 i.iterValidityState = IterExhausted 938 } 939 return 940 } 941 } 942 943 // If the user has configured a SkipPoint function, invoke it to see 944 // whether we should skip over the current user key. 945 if i.opts.SkipPoint != nil && key.Kind() != InternalKeyKindRangeKeySet && i.opts.SkipPoint(key.UserKey) { 946 // NB: We could call prevUserKey, but in some cases the SkipPoint 947 // predicate function might be cheaper than prevUserKey's key copy 948 // and key comparison. This should be the case for MVCC suffix 949 // comparisons, for example. In the future, we could expand the 950 // SkipPoint interface to give the implementor more control over 951 // whether we skip over just the internal key, the user key, or even 952 // the key prefix. 
953 i.stats.ReverseStepCount[InternalIterCall]++ 954 i.iterKey, i.iterValue = i.iter.Prev() 955 if limit != nil && i.iterKey != nil && i.cmp(limit, i.iterKey.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) { 956 i.iterValidityState = IterAtLimit 957 i.pos = iterPosCurReversePaused 958 return 959 } 960 continue 961 } 962 963 switch key.Kind() { 964 case InternalKeyKindRangeKeySet: 965 // Range key start boundary markers are interleaved with the maximum 966 // sequence number, so if there's a point key also at this key, we 967 // must've already iterated over it. 968 // This is the final entry at this user key, so we may return 969 i.rangeKey.rangeKeyOnly = i.iterValidityState != IterValid 970 i.keyBuf = append(i.keyBuf[:0], key.UserKey...) 971 i.key = i.keyBuf 972 i.iterValidityState = IterValid 973 i.saveRangeKey() 974 // In all other cases, previous iteration requires advancing to 975 // iterPosPrev in order to determine if the key is live and 976 // unshadowed by another key at the same user key. In this case, 977 // because range key start boundary markers are always interleaved 978 // at the maximum sequence number, we know that there aren't any 979 // additional keys with the same user key in the backward direction. 980 // 981 // We Prev the underlying iterator once anyways for consistency, so 982 // that we can maintain the invariant during backward iteration that 983 // i.iterPos = iterPosPrev. 984 i.stats.ReverseStepCount[InternalIterCall]++ 985 i.iterKey, i.iterValue = i.iter.Prev() 986 987 // Set rangeKeyBoundary so that on the next iteration, we know to 988 // return the key even if the MERGE point key is deleted. 989 rangeKeyBoundary = true 990 991 case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: 992 i.value = LazyValue{} 993 i.iterValidityState = IterExhausted 994 valueMerger = nil 995 i.iterKey, i.iterValue = i.iter.Prev() 996 i.stats.ReverseStepCount[InternalIterCall]++ 997 // Compare with the limit. 
We could optimize by only checking when 998 // we step to the previous user key, but detecting that requires a 999 // comparison too. Note that this position may already passed a 1000 // number of versions of this user key, but they are all deleted, so 1001 // the fact that a subsequent Prev*() call will not see them is 1002 // harmless. Also note that this is the only place in the loop, 1003 // other than the firstLoopIter and SkipPoint cases above, where we 1004 // could step to a different user key and start processing it for 1005 // returning to the caller. 1006 if limit != nil && i.iterKey != nil && i.cmp(limit, i.iterKey.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) { 1007 i.iterValidityState = IterAtLimit 1008 i.pos = iterPosCurReversePaused 1009 return 1010 } 1011 continue 1012 1013 case InternalKeyKindSet, InternalKeyKindSetWithDelete: 1014 i.keyBuf = append(i.keyBuf[:0], key.UserKey...) 1015 i.key = i.keyBuf 1016 // iterValue is owned by i.iter and could change after the Prev() 1017 // call, so use valueBuf instead. Note that valueBuf is only used 1018 // in this one instance; everywhere else (eg. in findNextEntry), 1019 // we just point i.value to the unsafe i.iter-owned value buffer. 1020 i.value, i.valueBuf = i.iterValue.Clone(i.valueBuf[:0], &i.fetcher) 1021 i.saveRangeKey() 1022 i.iterValidityState = IterValid 1023 i.iterKey, i.iterValue = i.iter.Prev() 1024 i.stats.ReverseStepCount[InternalIterCall]++ 1025 valueMerger = nil 1026 continue 1027 1028 case InternalKeyKindMerge: 1029 if i.iterValidityState == IterExhausted { 1030 i.keyBuf = append(i.keyBuf[:0], key.UserKey...) 
1031 i.key = i.keyBuf 1032 i.saveRangeKey() 1033 var iterValue []byte 1034 iterValue, _, i.err = i.iterValue.Value(nil) 1035 if i.err != nil { 1036 return 1037 } 1038 valueMerger, i.err = i.merge(i.key, iterValue) 1039 if i.err != nil { 1040 return 1041 } 1042 i.iterValidityState = IterValid 1043 } else if valueMerger == nil { 1044 // Extract value before iterValue since we use value before iterValue 1045 // and the underlying iterator is not required to provide backing 1046 // memory for both simultaneously. 1047 var value []byte 1048 var callerOwned bool 1049 value, callerOwned, i.err = i.value.Value(i.lazyValueBuf) 1050 if callerOwned { 1051 i.lazyValueBuf = value[:0] 1052 } 1053 if i.err != nil { 1054 return 1055 } 1056 valueMerger, i.err = i.merge(i.key, value) 1057 var iterValue []byte 1058 iterValue, _, i.err = i.iterValue.Value(nil) 1059 if i.err != nil { 1060 return 1061 } 1062 if i.err == nil { 1063 i.err = valueMerger.MergeNewer(iterValue) 1064 } 1065 if i.err != nil { 1066 i.iterValidityState = IterExhausted 1067 return 1068 } 1069 } else { 1070 var iterValue []byte 1071 iterValue, _, i.err = i.iterValue.Value(nil) 1072 if i.err != nil { 1073 return 1074 } 1075 i.err = valueMerger.MergeNewer(iterValue) 1076 if i.err != nil { 1077 i.iterValidityState = IterExhausted 1078 return 1079 } 1080 } 1081 i.iterKey, i.iterValue = i.iter.Prev() 1082 i.stats.ReverseStepCount[InternalIterCall]++ 1083 continue 1084 1085 default: 1086 i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind())) 1087 i.iterValidityState = IterExhausted 1088 return 1089 } 1090 } 1091 1092 // i.iterKey == nil, so broke out of the preceding loop. 
1093 if i.iterValidityState == IterValid { 1094 i.pos = iterPosPrev 1095 if valueMerger != nil { 1096 var needDelete bool 1097 var value []byte 1098 value, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, true /* includesBase */) 1099 i.value = base.MakeInPlaceValue(value) 1100 if i.err == nil && needDelete { 1101 i.key = nil 1102 i.value = LazyValue{} 1103 i.iterValidityState = IterExhausted 1104 } 1105 } 1106 if i.err != nil { 1107 i.iterValidityState = IterExhausted 1108 } 1109 } 1110 } 1111 1112 func (i *Iterator) prevUserKey() { 1113 if i.iterKey == nil { 1114 return 1115 } 1116 if i.iterValidityState != IterValid { 1117 // If we're going to compare against the prev key, we need to save the 1118 // current key. 1119 i.keyBuf = append(i.keyBuf[:0], i.iterKey.UserKey...) 1120 i.key = i.keyBuf 1121 } 1122 for { 1123 i.iterKey, i.iterValue = i.iter.Prev() 1124 i.stats.ReverseStepCount[InternalIterCall]++ 1125 if i.iterKey == nil { 1126 break 1127 } 1128 if !i.equal(i.key, i.iterKey.UserKey) { 1129 break 1130 } 1131 } 1132 } 1133 1134 func (i *Iterator) mergeNext(key InternalKey, valueMerger ValueMerger) { 1135 // Save the current key. 1136 i.keyBuf = append(i.keyBuf[:0], key.UserKey...) 1137 i.key = i.keyBuf 1138 1139 // Loop looking for older values for this key and merging them. 1140 for { 1141 i.iterKey, i.iterValue = i.iter.Next() 1142 i.stats.ForwardStepCount[InternalIterCall]++ 1143 if i.iterKey == nil { 1144 i.pos = iterPosNext 1145 return 1146 } 1147 key = *i.iterKey 1148 if !i.equal(i.key, key.UserKey) { 1149 // We've advanced to the next key. 1150 i.pos = iterPosNext 1151 return 1152 } 1153 switch key.Kind() { 1154 case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: 1155 // We've hit a deletion tombstone. Return everything up to this 1156 // point. 
1157 // 1158 // NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not 1159 // only simpler, but is also necessary for correctness due to 1160 // InternalKeyKindSSTableInternalObsoleteBit. 1161 return 1162 1163 case InternalKeyKindSet, InternalKeyKindSetWithDelete: 1164 // We've hit a Set value. Merge with the existing value and return. 1165 var iterValue []byte 1166 iterValue, _, i.err = i.iterValue.Value(nil) 1167 if i.err != nil { 1168 return 1169 } 1170 i.err = valueMerger.MergeOlder(iterValue) 1171 return 1172 1173 case InternalKeyKindMerge: 1174 // We've hit another Merge value. Merge with the existing value and 1175 // continue looping. 1176 var iterValue []byte 1177 iterValue, _, i.err = i.iterValue.Value(nil) 1178 if i.err != nil { 1179 return 1180 } 1181 i.err = valueMerger.MergeOlder(iterValue) 1182 if i.err != nil { 1183 return 1184 } 1185 continue 1186 1187 case InternalKeyKindRangeKeySet: 1188 // The RANGEKEYSET marker must sort before a MERGE at the same user key. 1189 i.err = base.CorruptionErrorf("pebble: out of order range key marker") 1190 return 1191 1192 default: 1193 i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind())) 1194 return 1195 } 1196 } 1197 } 1198 1199 // SeekGE moves the iterator to the first key/value pair whose key is greater 1200 // than or equal to the given key. Returns true if the iterator is pointing at 1201 // a valid entry and false otherwise. 1202 func (i *Iterator) SeekGE(key []byte) bool { 1203 return i.SeekGEWithLimit(key, nil) == IterValid 1204 } 1205 1206 // SeekGEWithLimit moves the iterator to the first key/value pair whose key is 1207 // greater than or equal to the given key. 1208 // 1209 // If limit is provided, it serves as a best-effort exclusive limit. If the 1210 // first key greater than or equal to the given search key is also greater than 1211 // or equal to limit, the Iterator may pause and return IterAtLimit. 
Because 1212 // limits are best-effort, SeekGEWithLimit may return a key beyond limit. 1213 // 1214 // If the Iterator is configured to iterate over range keys, SeekGEWithLimit 1215 // guarantees it will surface any range keys with bounds overlapping the 1216 // keyspace [key, limit). 1217 func (i *Iterator) SeekGEWithLimit(key []byte, limit []byte) IterValidityState { 1218 if i.rangeKey != nil { 1219 // NB: Check Valid() before clearing requiresReposition. 1220 i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid() 1221 // If we have a range key but did not expose it at the previous iterator 1222 // position (because the iterator was not at a valid position), updated 1223 // must be true. This ensures that after an iterator op sequence like: 1224 // - Next() → (IterValid, RangeBounds() = [a,b)) 1225 // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -) 1226 // - SeekGE(...) → (IterValid, RangeBounds() = [a,b)) 1227 // the iterator returns RangeKeyChanged()=true. 1228 // 1229 // The remainder of this function will only update i.rangeKey.updated if 1230 // the iterator moves into a new range key, or out of the current range 1231 // key. 1232 i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys() 1233 } 1234 lastPositioningOp := i.lastPositioningOp 1235 hasPrefix := i.hasPrefix 1236 // Set it to unknown, since this operation may not succeed, in which case 1237 // the SeekGE following this should not make any assumption about iterator 1238 // position. 
1239 i.lastPositioningOp = unknownLastPositionOp 1240 i.requiresReposition = false 1241 i.err = nil // clear cached iteration error 1242 i.hasPrefix = false 1243 i.stats.ForwardSeekCount[InterfaceCall]++ 1244 if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 { 1245 key = lowerBound 1246 } else if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 { 1247 key = upperBound 1248 } 1249 seekInternalIter := true 1250 1251 var flags base.SeekGEFlags 1252 if i.batchJustRefreshed { 1253 i.batchJustRefreshed = false 1254 flags = flags.EnableBatchJustRefreshed() 1255 } 1256 if lastPositioningOp == seekGELastPositioningOp { 1257 cmp := i.cmp(i.prefixOrFullSeekKey, key) 1258 // If this seek is to the same or later key, and the iterator is 1259 // already positioned there, this is a noop. This can be helpful for 1260 // sparse key spaces that have many deleted keys, where one can avoid 1261 // the overhead of iterating past them again and again. 1262 if cmp <= 0 { 1263 if !flags.BatchJustRefreshed() && 1264 (i.iterValidityState == IterExhausted || 1265 (i.iterValidityState == IterValid && i.cmp(key, i.key) <= 0 && 1266 (limit == nil || i.cmp(i.key, limit) < 0))) { 1267 // Noop 1268 if !invariants.Enabled || !disableSeekOpt(key, uintptr(unsafe.Pointer(i))) || i.forceEnableSeekOpt { 1269 i.lastPositioningOp = seekGELastPositioningOp 1270 return i.iterValidityState 1271 } 1272 } 1273 // cmp == 0 is not safe to optimize since 1274 // - i.pos could be at iterPosNext, due to a merge. 1275 // - Even if i.pos were at iterPosCurForward, we could have a DELETE, 1276 // SET pair for a key, and the iterator would have moved past DELETE 1277 // but stayed at iterPosCurForward. A similar situation occurs for a 1278 // MERGE, SET pair where the MERGE is consumed and the iterator is 1279 // at the SET. 
1280 // We also leverage the IterAtLimit <=> i.pos invariant defined in the 1281 // comment on iterValidityState, to exclude any cases where i.pos 1282 // is iterPosCur{Forward,Reverse}Paused. This avoids the need to 1283 // special-case those iterator positions and their interactions with 1284 // TrySeekUsingNext, as the main uses for TrySeekUsingNext in CockroachDB 1285 // do not use limited Seeks in the first place. 1286 if cmp < 0 && i.iterValidityState != IterAtLimit && limit == nil { 1287 flags = flags.EnableTrySeekUsingNext() 1288 } 1289 if invariants.Enabled && flags.TrySeekUsingNext() && !i.forceEnableSeekOpt && disableSeekOpt(key, uintptr(unsafe.Pointer(i))) { 1290 flags = flags.DisableTrySeekUsingNext() 1291 } 1292 if !flags.BatchJustRefreshed() && i.pos == iterPosCurForwardPaused && i.cmp(key, i.iterKey.UserKey) <= 0 { 1293 // Have some work to do, but don't need to seek, and we can 1294 // start doing findNextEntry from i.iterKey. 1295 seekInternalIter = false 1296 } 1297 } 1298 } 1299 // Check for another TrySeekUsingNext optimization opportunity, currently 1300 // specifically tailored to external iterators. This case is intended to 1301 // trigger in instances of Seek-ing with monotonically increasing keys with 1302 // Nexts interspersed. At the time of writing, this is the case for 1303 // CockroachDB scans. This optimization is important for external iterators 1304 // to avoid re-seeking within an already-exhausted sstable. It is not always 1305 // a performance win more generally, so we restrict it to external iterators 1306 // that are configured to only use forward positioning operations. 1307 // 1308 // TODO(jackson): This optimization should be obsolete once we introduce and 1309 // use the NextPrefix iterator positioning operation. 
1310 if seekInternalIter && i.forwardOnly && lastPositioningOp != invalidatedLastPositionOp && 1311 i.pos == iterPosCurForward && !hasPrefix && i.iterValidityState == IterValid && 1312 i.cmp(key, i.iterKey.UserKey) > 0 { 1313 flags = flags.EnableTrySeekUsingNext() 1314 if invariants.Enabled && flags.TrySeekUsingNext() && !i.forceEnableSeekOpt && disableSeekOpt(key, uintptr(unsafe.Pointer(i))) { 1315 flags = flags.DisableTrySeekUsingNext() 1316 } 1317 } 1318 if seekInternalIter { 1319 i.iterKey, i.iterValue = i.iter.SeekGE(key, flags) 1320 i.stats.ForwardSeekCount[InternalIterCall]++ 1321 } 1322 i.findNextEntry(limit) 1323 i.maybeSampleRead() 1324 if i.Error() == nil { 1325 // Prepare state for a future noop optimization. 1326 i.prefixOrFullSeekKey = append(i.prefixOrFullSeekKey[:0], key...) 1327 i.lastPositioningOp = seekGELastPositioningOp 1328 } 1329 return i.iterValidityState 1330 } 1331 1332 // SeekPrefixGE moves the iterator to the first key/value pair whose key is 1333 // greater than or equal to the given key and which has the same "prefix" as 1334 // the given key. The prefix for a key is determined by the user-defined 1335 // Comparer.Split function. The iterator will not observe keys not matching the 1336 // "prefix" of the search key. Calling SeekPrefixGE puts the iterator in prefix 1337 // iteration mode. The iterator remains in prefix iteration until a subsequent 1338 // call to another absolute positioning method (SeekGE, SeekLT, First, 1339 // Last). Reverse iteration (Prev) is not supported when an iterator is in 1340 // prefix iteration mode. Returns true if the iterator is pointing at a valid 1341 // entry and false otherwise. 1342 // 1343 // The semantics of SeekPrefixGE are slightly unusual and designed for 1344 // iteration to be able to take advantage of bloom filters that have been 1345 // created on the "prefix". If you're not using bloom filters, there is no 1346 // reason to use SeekPrefixGE. 
1347 // 1348 // An example Split function may separate a timestamp suffix from the prefix of 1349 // the key. 1350 // 1351 // Split(<key>@<timestamp>) -> <key> 1352 // 1353 // Consider the keys "a@1", "a@2", "aa@3", "aa@4". The prefixes for these keys 1354 // are "a", and "aa". Note that despite "a" and "aa" sharing a prefix by the 1355 // usual definition, those prefixes differ by the definition of the Split 1356 // function. To see how this works, consider the following set of calls on this 1357 // data set: 1358 // 1359 // SeekPrefixGE("a@0") -> "a@1" 1360 // Next() -> "a@2" 1361 // Next() -> EOF 1362 // 1363 // If you're just looking to iterate over keys with a shared prefix, as 1364 // defined by the configured comparer, set iterator bounds instead: 1365 // 1366 // iter := db.NewIter(&pebble.IterOptions{ 1367 // LowerBound: []byte("prefix"), 1368 // UpperBound: []byte("prefiy"), 1369 // }) 1370 // for iter.First(); iter.Valid(); iter.Next() { 1371 // // Only keys beginning with "prefix" will be visited. 1372 // } 1373 // 1374 // See ExampleIterator_SeekPrefixGE for a working example. 1375 // 1376 // When iterating with range keys enabled, all range keys encountered are 1377 // truncated to the seek key's prefix's bounds. The truncation of the upper 1378 // bound requires that the database's Comparer is configured with a 1379 // ImmediateSuccessor method. For example, a SeekPrefixGE("a@9") call with the 1380 // prefix "a" will truncate range key bounds to [a,ImmediateSuccessor(a)]. 1381 func (i *Iterator) SeekPrefixGE(key []byte) bool { 1382 if i.rangeKey != nil { 1383 // NB: Check Valid() before clearing requiresReposition. 1384 i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid() 1385 // If we have a range key but did not expose it at the previous iterator 1386 // position (because the iterator was not at a valid position), updated 1387 // must be true. 
This ensures that after an iterator op sequence like: 1388 // - Next() → (IterValid, RangeBounds() = [a,b)) 1389 // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -) 1390 // - SeekPrefixGE(...) → (IterValid, RangeBounds() = [a,b)) 1391 // the iterator returns RangeKeyChanged()=true. 1392 // 1393 // The remainder of this function will only update i.rangeKey.updated if 1394 // the iterator moves into a new range key, or out of the current range 1395 // key. 1396 i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys() 1397 } 1398 lastPositioningOp := i.lastPositioningOp 1399 // Set it to unknown, since this operation may not succeed, in which case 1400 // the SeekPrefixGE following this should not make any assumption about 1401 // iterator position. 1402 i.lastPositioningOp = unknownLastPositionOp 1403 i.requiresReposition = false 1404 i.err = nil // clear cached iteration error 1405 i.stats.ForwardSeekCount[InterfaceCall]++ 1406 if i.comparer.Split == nil { 1407 panic("pebble: split must be provided for SeekPrefixGE") 1408 } 1409 if i.comparer.ImmediateSuccessor == nil && i.opts.KeyTypes != IterKeyTypePointsOnly { 1410 panic("pebble: ImmediateSuccessor must be provided for SeekPrefixGE with range keys") 1411 } 1412 prefixLen := i.split(key) 1413 keyPrefix := key[:prefixLen] 1414 var flags base.SeekGEFlags 1415 if i.batchJustRefreshed { 1416 flags = flags.EnableBatchJustRefreshed() 1417 i.batchJustRefreshed = false 1418 } 1419 if lastPositioningOp == seekPrefixGELastPositioningOp { 1420 if !i.hasPrefix { 1421 panic("lastPositioningOpsIsSeekPrefixGE is true, but hasPrefix is false") 1422 } 1423 // The iterator has not been repositioned after the last SeekPrefixGE. 1424 // See if we are seeking to a larger key, since then we can optimize 1425 // the seek by using next. Note that we could also optimize if Next 1426 // has been called, if the iterator is not exhausted and the current 1427 // position is <= the seek key. 
We are keeping this limited for now 1428 // since such optimizations require care for correctness, and to not 1429 // become de-optimizations (if one usually has to do all the next 1430 // calls and then the seek). This SeekPrefixGE optimization 1431 // specifically benefits CockroachDB. 1432 cmp := i.cmp(i.prefixOrFullSeekKey, keyPrefix) 1433 // cmp == 0 is not safe to optimize since 1434 // - i.pos could be at iterPosNext, due to a merge. 1435 // - Even if i.pos were at iterPosCurForward, we could have a DELETE, 1436 // SET pair for a key, and the iterator would have moved past DELETE 1437 // but stayed at iterPosCurForward. A similar situation occurs for a 1438 // MERGE, SET pair where the MERGE is consumed and the iterator is 1439 // at the SET. 1440 // In general some versions of i.prefix could have been consumed by 1441 // the iterator, so we only optimize for cmp < 0. 1442 if cmp < 0 { 1443 flags = flags.EnableTrySeekUsingNext() 1444 } 1445 if invariants.Enabled && flags.TrySeekUsingNext() && !i.forceEnableSeekOpt && disableSeekOpt(key, uintptr(unsafe.Pointer(i))) { 1446 flags = flags.DisableTrySeekUsingNext() 1447 } 1448 } 1449 // Make a copy of the prefix so that modifications to the key after 1450 // SeekPrefixGE returns does not affect the stored prefix. 
1451 if cap(i.prefixOrFullSeekKey) < prefixLen { 1452 i.prefixOrFullSeekKey = make([]byte, prefixLen) 1453 } else { 1454 i.prefixOrFullSeekKey = i.prefixOrFullSeekKey[:prefixLen] 1455 } 1456 i.hasPrefix = true 1457 copy(i.prefixOrFullSeekKey, keyPrefix) 1458 1459 if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 { 1460 if n := i.split(lowerBound); !bytes.Equal(i.prefixOrFullSeekKey, lowerBound[:n]) { 1461 i.err = errors.New("pebble: SeekPrefixGE supplied with key outside of lower bound") 1462 i.iterValidityState = IterExhausted 1463 return false 1464 } 1465 key = lowerBound 1466 } else if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 { 1467 if n := i.split(upperBound); !bytes.Equal(i.prefixOrFullSeekKey, upperBound[:n]) { 1468 i.err = errors.New("pebble: SeekPrefixGE supplied with key outside of upper bound") 1469 i.iterValidityState = IterExhausted 1470 return false 1471 } 1472 key = upperBound 1473 } 1474 i.iterKey, i.iterValue = i.iter.SeekPrefixGE(i.prefixOrFullSeekKey, key, flags) 1475 i.stats.ForwardSeekCount[InternalIterCall]++ 1476 i.findNextEntry(nil) 1477 i.maybeSampleRead() 1478 if i.Error() == nil { 1479 i.lastPositioningOp = seekPrefixGELastPositioningOp 1480 } 1481 return i.iterValidityState == IterValid 1482 } 1483 1484 // Deterministic disabling of the seek optimizations. It uses the iterator 1485 // pointer, since we want diversity in iterator behavior for the same key. Used 1486 // for tests. 1487 func disableSeekOpt(key []byte, ptr uintptr) bool { 1488 // Fibonacci hash https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/ 1489 simpleHash := (11400714819323198485 * uint64(ptr)) >> 63 1490 return key != nil && key[0]&byte(1) == 0 && simpleHash == 0 1491 } 1492 1493 // SeekLT moves the iterator to the last key/value pair whose key is less than 1494 // the given key. 
// Returns true if the iterator is pointing at a valid entry and
// false otherwise.
func (i *Iterator) SeekLT(key []byte) bool {
	// Delegates to SeekLTWithLimit with no limit.
	return i.SeekLTWithLimit(key, nil) == IterValid
}

// SeekLTWithLimit moves the iterator to the last key/value pair whose key is
// less than the given key.
//
// If limit is provided, it serves as a best-effort inclusive limit. If the last
// key less than the given search key is also less than limit, the Iterator may
// pause and return IterAtLimit. Because limits are best-effort, SeekLTWithLimit
// may return a key beyond limit.
//
// If the Iterator is configured to iterate over range keys, SeekLTWithLimit
// guarantees it will surface any range keys with bounds overlapping the
// keyspace up to limit.
//
// The search key is clamped to the iterator's configured bounds before
// seeking: a key above the upper bound is replaced by the upper bound, and a
// key below the lower bound is replaced by the lower bound.
func (i *Iterator) SeekLTWithLimit(key []byte, limit []byte) IterValidityState {
	if i.rangeKey != nil {
		// NB: Check Valid() before clearing requiresReposition.
		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
		// If we have a range key but did not expose it at the previous iterator
		// position (because the iterator was not at a valid position), updated
		// must be true. This ensures that after an iterator op sequence like:
		//   - Next()               → (IterValid, RangeBounds() = [a,b))
		//   - NextWithLimit(...)   → (IterAtLimit, RangeBounds() = -)
		//   - SeekLTWithLimit(...) → (IterValid, RangeBounds() = [a,b))
		// the iterator returns RangeKeyChanged()=true.
		//
		// The remainder of this function will only update i.rangeKey.updated if
		// the iterator moves into a new range key, or out of the current range
		// key.
		i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
	}
	// Capture the previous positioning op before it is reset below; the noop
	// optimization further down depends on it.
	lastPositioningOp := i.lastPositioningOp
	// Set it to unknown, since this operation may not succeed, in which case
	// the SeekLT following this should not make any assumption about iterator
	// position.
	i.lastPositioningOp = unknownLastPositionOp
	i.batchJustRefreshed = false
	i.requiresReposition = false
	i.err = nil // clear cached iteration error
	i.stats.ReverseSeekCount[InterfaceCall]++
	// Clamp the search key to the configured bounds.
	if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 {
		key = upperBound
	} else if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 {
		key = lowerBound
	}
	i.hasPrefix = false
	seekInternalIter := true
	// The following noop optimization only applies when i.batch == nil, since
	// an iterator over a batch is iterating over mutable data, that may have
	// changed since the last seek.
	if lastPositioningOp == seekLTLastPositioningOp && i.batch == nil {
		// i.prefixOrFullSeekKey holds the (clamped) key of the previous
		// successful SeekLT; it is saved at the end of this function.
		cmp := i.cmp(key, i.prefixOrFullSeekKey)
		// If this seek is to the same or earlier key, and the iterator is
		// already positioned there, this is a noop. This can be helpful for
		// sparse key spaces that have many deleted keys, where one can avoid
		// the overhead of iterating past them again and again.
		if cmp <= 0 {
			// NB: when pos != iterPosCurReversePaused, the invariant
			// documented earlier implies that iterValidityState !=
			// IterAtLimit.
			if i.iterValidityState == IterExhausted ||
				(i.iterValidityState == IterValid && i.cmp(i.key, key) < 0 &&
					(limit == nil || i.cmp(limit, i.key) <= 0)) {
				// Under invariants builds the optimization is skipped for
				// the keys/iterators selected by disableSeekOpt, so that
				// both paths get exercised.
				if !invariants.Enabled || !disableSeekOpt(key, uintptr(unsafe.Pointer(i))) {
					i.lastPositioningOp = seekLTLastPositioningOp
					return i.iterValidityState
				}
			}
			if i.pos == iterPosCurReversePaused && i.cmp(i.iterKey.UserKey, key) < 0 {
				// Have some work to do, but don't need to seek, and we can
				// start doing findPrevEntry from i.iterKey.
				seekInternalIter = false
			}
		}
	}
	if seekInternalIter {
		i.iterKey, i.iterValue = i.iter.SeekLT(key, base.SeekLTFlagsNone)
		i.stats.ReverseSeekCount[InternalIterCall]++
	}
	i.findPrevEntry(limit)
	i.maybeSampleRead()
	if i.Error() == nil && i.batch == nil {
		// Prepare state for a future noop optimization.
		i.prefixOrFullSeekKey = append(i.prefixOrFullSeekKey[:0], key...)
		i.lastPositioningOp = seekLTLastPositioningOp
	}
	return i.iterValidityState
}

// First moves the iterator to the first key/value pair. Returns true if the
// iterator is pointing at a valid entry and false otherwise.
func (i *Iterator) First() bool {
	if i.rangeKey != nil {
		// NB: Check Valid() before clearing requiresReposition.
		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
		// If we have a range key but did not expose it at the previous iterator
		// position (because the iterator was not at a valid position), updated
		// must be true. This ensures that after an iterator op sequence like:
		//   - Next()             → (IterValid, RangeBounds() = [a,b))
		//   - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
		//   - First(...)         → (IterValid, RangeBounds() = [a,b))
		// the iterator returns RangeKeyChanged()=true.
		//
		// The remainder of this function will only update i.rangeKey.updated if
		// the iterator moves into a new range key, or out of the current range
		// key.
		i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
	}
	i.err = nil // clear cached iteration error
	i.hasPrefix = false
	i.batchJustRefreshed = false
	i.lastPositioningOp = unknownLastPositionOp
	i.requiresReposition = false
	i.stats.ForwardSeekCount[InterfaceCall]++

	// Position the internal iterator at the first key within bounds, then
	// advance to the first entry to surface.
	i.iterFirstWithinBounds()
	i.findNextEntry(nil)
	i.maybeSampleRead()
	return i.iterValidityState == IterValid
}

// Last moves the iterator to the last key/value pair. Returns true if the
// iterator is pointing at a valid entry and false otherwise.
func (i *Iterator) Last() bool {
	if i.rangeKey != nil {
		// NB: Check Valid() before clearing requiresReposition.
		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
		// If we have a range key but did not expose it at the previous iterator
		// position (because the iterator was not at a valid position), updated
		// must be true. This ensures that after an iterator op sequence like:
		//   - Next()             → (IterValid, RangeBounds() = [a,b))
		//   - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
		//   - Last(...)          → (IterValid, RangeBounds() = [a,b))
		// the iterator returns RangeKeyChanged()=true.
		//
		// The remainder of this function will only update i.rangeKey.updated if
		// the iterator moves into a new range key, or out of the current range
		// key.
		i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
	}
	i.err = nil // clear cached iteration error
	i.hasPrefix = false
	i.batchJustRefreshed = false
	i.lastPositioningOp = unknownLastPositionOp
	i.requiresReposition = false
	i.stats.ReverseSeekCount[InterfaceCall]++

	// Position the internal iterator at the last key within bounds, then
	// step back to the last entry to surface.
	i.iterLastWithinBounds()
	i.findPrevEntry(nil)
	i.maybeSampleRead()
	return i.iterValidityState == IterValid
}

// Next moves the iterator to the next key/value pair. Returns true if the
// iterator is pointing at a valid entry and false otherwise.
func (i *Iterator) Next() bool {
	return i.nextWithLimit(nil) == IterValid
}

// NextWithLimit moves the iterator to the next key/value pair.
//
// If limit is provided, it serves as a best-effort exclusive limit. If the next
// key is greater than or equal to limit, the Iterator may pause and return
// IterAtLimit. Because limits are best-effort, NextWithLimit may return a key
// beyond limit.
//
// If the Iterator is configured to iterate over range keys, NextWithLimit
// guarantees it will surface any range keys with bounds overlapping the
// keyspace up to limit.
func (i *Iterator) NextWithLimit(limit []byte) IterValidityState {
	return i.nextWithLimit(limit)
}

// NextPrefix moves the iterator to the next key/value pair with a key
// containing a different prefix than the current key. Prefixes are determined
// by Comparer.Split. Exhausts the iterator if invoked while in prefix-iteration
// mode.
//
// It is not permitted to invoke NextPrefix while at a IterAtLimit position.
// When called in this condition, NextPrefix has non-deterministic behavior.
//
// It is not permitted to invoke NextPrefix when the Iterator has an
// upper-bound that is a versioned MVCC key (see the comment for
// Comparer.Split).
// It returns an error in this case.
func (i *Iterator) NextPrefix() bool {
	if i.nextPrefixNotPermittedByUpperBound {
		i.lastPositioningOp = unknownLastPositionOp
		i.requiresReposition = false
		i.err = errors.Errorf("NextPrefix not permitted with upper bound %s",
			i.comparer.FormatKey(i.opts.UpperBound))
		i.iterValidityState = IterExhausted
		return false
	}
	if i.hasPrefix {
		// Prefix-iteration mode: NextPrefix exhausts the iterator.
		i.iterValidityState = IterExhausted
		return false
	}
	// NOTE(review): unlike nextWithLimit and PrevWithLimit, this path does not
	// return early when a cached i.err is set — confirm whether that is
	// intended.
	return i.nextPrefix() == IterValid
}

// nextPrefix implements NextPrefix once its precondition checks have passed.
// It positions the internal iterator according to i.pos, then calls
// findNextEntry with no limit and returns the resulting validity state.
func (i *Iterator) nextPrefix() IterValidityState {
	if i.rangeKey != nil {
		// NB: Check Valid() before clearing requiresReposition.
		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
		// If we have a range key but did not expose it at the previous iterator
		// position (because the iterator was not at a valid position), updated
		// must be true. This ensures that after an iterator op sequence like:
		//   - Next()             → (IterValid, RangeBounds() = [a,b))
		//   - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
		//   - NextWithLimit(...) → (IterValid, RangeBounds() = [a,b))
		// the iterator returns RangeKeyChanged()=true.
		//
		// The remainder of this function will only update i.rangeKey.updated if
		// the iterator moves into a new range key, or out of the current range
		// key.
		i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
	}

	// Although NextPrefix documents that behavior at IterAtLimit is undefined,
	// this function handles these cases as a simple prefix-agnostic Next. This
	// is done for deterministic behavior in the metamorphic tests.
	//
	// TODO(jackson): If the metamorphic test operation generator is adjusted to
	// make generation of some operations conditional on the previous
	// operations, then we can remove this behavior and explicitly error.

	i.lastPositioningOp = unknownLastPositionOp
	i.requiresReposition = false
	switch i.pos {
	case iterPosCurForward:
		// Positioned on the current key. Advance to the next prefix.
		i.internalNextPrefix(i.split(i.key))
	case iterPosCurForwardPaused:
		// Positioned at a limit. Implement as a prefix-agnostic Next. See TODO
		// up above. The iterator is already positioned at the next key.
	case iterPosCurReverse:
		// Switching directions.
		// Unless the iterator was exhausted, reverse iteration needs to
		// position the iterator at iterPosPrev.
		if i.iterKey != nil {
			i.err = errors.New("switching from reverse to forward but iter is not at prev")
			i.iterValidityState = IterExhausted
			return i.iterValidityState
		}
		// The Iterator is exhausted and i.iter is positioned before the first
		// key. Reposition to point to the first internal key.
		i.iterFirstWithinBounds()
	case iterPosCurReversePaused:
		// Positioned at a limit. Implement as a prefix-agnostic Next. See TODO
		// up above.
		//
		// Switching directions; The iterator must not be exhausted since it
		// paused.
		if i.iterKey == nil {
			i.err = errors.New("switching paused from reverse to forward but iter is exhausted")
			i.iterValidityState = IterExhausted
			return i.iterValidityState
		}
		i.nextUserKey()
	case iterPosPrev:
		// The underlying iterator is pointed to the previous key (this can
		// only happen when switching iteration directions).
		if i.iterKey == nil {
			// We're positioned before the first key. Need to reposition to point to
			// the first key.
			i.iterFirstWithinBounds()
		} else {
			// Move the internal iterator back onto the user key stored in
			// i.key. iterPosPrev guarantees that it's positioned at the last
			// key with the user key less than i.key, so we're guaranteed to
			// land on the correct key with a single Next.
			i.iterKey, i.iterValue = i.iter.Next()
			if invariants.Enabled && !i.equal(i.iterKey.UserKey, i.key) {
				i.opts.logger.Fatalf("pebble: invariant violation: Nexting internal iterator from iterPosPrev landed on %q, not %q",
					i.iterKey.UserKey, i.key)
			}
		}
		// The internal iterator is now positioned at i.key. Advance to the next
		// prefix.
		i.internalNextPrefix(i.split(i.key))
	case iterPosNext:
		// Already positioned on the next key. Only call nextPrefixKey if the
		// next key shares the same prefix.
		if i.iterKey != nil {
			currKeyPrefixLen := i.split(i.key)
			iterKeyPrefixLen := i.split(i.iterKey.UserKey)
			if bytes.Equal(i.iterKey.UserKey[:iterKeyPrefixLen], i.key[:currKeyPrefixLen]) {
				i.internalNextPrefix(currKeyPrefixLen)
			}
		}
	}

	i.stats.ForwardStepCount[InterfaceCall]++
	i.findNextEntry(nil /* limit */)
	i.maybeSampleRead()
	return i.iterValidityState
}

// internalNextPrefix advances the internal iterator beyond keys sharing the
// prefix i.key[:currKeyPrefixLen]. It first tries a single Next; if that lands
// on a key that still has the same prefix, it calls i.iter.NextPrefix with the
// prefix's immediate successor, which is built in i.prefixOrFullSeekKey. It is
// a no-op if the internal iterator is already exhausted (i.iterKey == nil).
func (i *Iterator) internalNextPrefix(currKeyPrefixLen int) {
	if i.iterKey == nil {
		return
	}
	// The Next "fast-path" is not really a fast-path when there is more than
	// one version. However, even with TableFormatPebblev3, there is a small
	// slowdown (~10%) for one version if we remove it and only call NextPrefix.
	// When there are two versions, only calling NextPrefix is ~30% faster.
	i.stats.ForwardStepCount[InternalIterCall]++
	if i.iterKey, i.iterValue = i.iter.Next(); i.iterKey == nil {
		return
	}
	iterKeyPrefixLen := i.split(i.iterKey.UserKey)
	if !bytes.Equal(i.iterKey.UserKey[:iterKeyPrefixLen], i.key[:currKeyPrefixLen]) {
		// The single Next already moved to a different prefix.
		return
	}
	i.stats.ForwardStepCount[InternalIterCall]++
	i.prefixOrFullSeekKey = i.comparer.ImmediateSuccessor(i.prefixOrFullSeekKey[:0], i.key[:currKeyPrefixLen])
	i.iterKey, i.iterValue = i.iter.NextPrefix(i.prefixOrFullSeekKey)
	if invariants.Enabled && i.iterKey != nil {
		if iterKeyPrefixLen := i.split(i.iterKey.UserKey); i.cmp(i.iterKey.UserKey[:iterKeyPrefixLen], i.prefixOrFullSeekKey) < 0 {
			panic(errors.AssertionFailedf("pebble: iter.NextPrefix did not advance beyond the current prefix: now at %q; expected to be geq %q",
				i.iterKey, i.prefixOrFullSeekKey))
		}
	}
}

// nextWithLimit is the shared implementation of Next and NextWithLimit. It
// positions the internal iterator according to i.pos and then calls
// findNextEntry with the supplied limit, returning the resulting validity
// state. A non-nil limit is rejected with an error in prefix-iteration mode.
func (i *Iterator) nextWithLimit(limit []byte) IterValidityState {
	i.stats.ForwardStepCount[InterfaceCall]++
	if i.hasPrefix {
		if limit != nil {
			i.err = errors.New("cannot use limit with prefix iteration")
			i.iterValidityState = IterExhausted
			return i.iterValidityState
		} else if i.iterValidityState == IterExhausted {
			// No-op, already exhausted. We avoid executing the Next because it
			// can break invariants: Specifically, a file that fails the bloom
			// filter test may result in its level being removed from the
			// merging iterator. The level's removal can cause a lazy combined
			// iterator to miss range keys and trigger a switch to combined
			// iteration at a larger key, breaking keyspan invariants.
			return i.iterValidityState
		}
	}
	// A cached error short-circuits the step.
	if i.err != nil {
		return i.iterValidityState
	}
	if i.rangeKey != nil {
		// NB: Check Valid() before clearing requiresReposition.
		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
		// If we have a range key but did not expose it at the previous iterator
		// position (because the iterator was not at a valid position), updated
		// must be true. This ensures that after an iterator op sequence like:
		//   - Next()             → (IterValid, RangeBounds() = [a,b))
		//   - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
		//   - NextWithLimit(...) → (IterValid, RangeBounds() = [a,b))
		// the iterator returns RangeKeyChanged()=true.
		//
		// The remainder of this function will only update i.rangeKey.updated if
		// the iterator moves into a new range key, or out of the current range
		// key.
		i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
	}
	i.lastPositioningOp = unknownLastPositionOp
	i.requiresReposition = false
	switch i.pos {
	case iterPosCurForward:
		i.nextUserKey()
	case iterPosCurForwardPaused:
		// Already at the right place.
	case iterPosCurReverse:
		// Switching directions.
		// Unless the iterator was exhausted, reverse iteration needs to
		// position the iterator at iterPosPrev.
		if i.iterKey != nil {
			i.err = errors.New("switching from reverse to forward but iter is not at prev")
			i.iterValidityState = IterExhausted
			return i.iterValidityState
		}
		// We're positioned before the first key. Need to reposition to point to
		// the first key.
		i.iterFirstWithinBounds()
	case iterPosCurReversePaused:
		// Switching directions.
		// The iterator must not be exhausted since it paused.
		if i.iterKey == nil {
			i.err = errors.New("switching paused from reverse to forward but iter is exhausted")
			i.iterValidityState = IterExhausted
			return i.iterValidityState
		}
		i.nextUserKey()
	case iterPosPrev:
		// The underlying iterator is pointed to the previous key (this can
		// only happen when switching iteration directions). We set
		// i.iterValidityState to IterExhausted here to force the calls to
		// nextUserKey to save the current key i.iter is pointing at in order
		// to determine when the next user-key is reached.
		i.iterValidityState = IterExhausted
		if i.iterKey == nil {
			// We're positioned before the first key. Need to reposition to point to
			// the first key.
			i.iterFirstWithinBounds()
		} else {
			i.nextUserKey()
		}
		i.nextUserKey()
	case iterPosNext:
		// Already at the right place.
	}
	i.findNextEntry(limit)
	i.maybeSampleRead()
	return i.iterValidityState
}

// Prev moves the iterator to the previous key/value pair. Returns true if the
// iterator is pointing at a valid entry and false otherwise.
func (i *Iterator) Prev() bool {
	return i.PrevWithLimit(nil) == IterValid
}

// PrevWithLimit moves the iterator to the previous key/value pair.
//
// If limit is provided, it serves as a best-effort inclusive limit. If the
// previous key is less than limit, the Iterator may pause and return
// IterAtLimit. Because limits are best-effort, PrevWithLimit may return a key
// beyond limit.
//
// If the Iterator is configured to iterate over range keys, PrevWithLimit
// guarantees it will surface any range keys with bounds overlapping the
// keyspace up to limit.
func (i *Iterator) PrevWithLimit(limit []byte) IterValidityState {
	i.stats.ReverseStepCount[InterfaceCall]++
	// A cached error short-circuits the step.
	if i.err != nil {
		return i.iterValidityState
	}
	if i.rangeKey != nil {
		// NB: Check Valid() before clearing requiresReposition.
		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
		// If we have a range key but did not expose it at the previous iterator
		// position (because the iterator was not at a valid position), updated
		// must be true. This ensures that after an iterator op sequence like:
		//   - Next()             → (IterValid, RangeBounds() = [a,b))
		//   - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
		//   - PrevWithLimit(...) → (IterValid, RangeBounds() = [a,b))
		// the iterator returns RangeKeyChanged()=true.
		//
		// The remainder of this function will only update i.rangeKey.updated if
		// the iterator moves into a new range key, or out of the current range
		// key.
		i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
	}
	i.lastPositioningOp = unknownLastPositionOp
	i.requiresReposition = false
	// Reverse iteration is rejected in prefix-iteration mode.
	if i.hasPrefix {
		i.err = errReversePrefixIteration
		i.iterValidityState = IterExhausted
		return i.iterValidityState
	}
	switch i.pos {
	case iterPosCurForward:
		// Switching directions, and will handle this below.
	case iterPosCurForwardPaused:
		// Switching directions, and will handle this below.
	case iterPosCurReverse:
		i.prevUserKey()
	case iterPosCurReversePaused:
		// Already at the right place.
	case iterPosNext:
		// The underlying iterator is pointed to the next key (this can only happen
		// when switching iteration directions). We will handle this below.
	case iterPosPrev:
		// Already at the right place.
	}
	if i.pos == iterPosCurForward || i.pos == iterPosNext || i.pos == iterPosCurForwardPaused {
		// Switching direction.
		stepAgain := i.pos == iterPosNext

		// Synthetic range key markers are a special case. Consider SeekGE(b)
		// which finds a range key [a, c). To ensure the user observes the range
		// key, the Iterator pauses at Key() = b. The iterator must advance the
		// internal iterator to see if there's also a coincident point key at
		// 'b', leaving the iterator at iterPosNext if there's not.
		//
		// This is a problem: Synthetic range key markers are only interleaved
		// during the original seek. A subsequent Prev() of i.iter will not move
		// back onto the synthetic range key marker. In this case where the
		// previous iterator position was a synthetic range key start boundary,
		// we must not step a second time.
		if i.isEphemeralPosition() {
			stepAgain = false
		}

		// We set i.iterValidityState to IterExhausted here to force the calls
		// to prevUserKey to save the current key i.iter is pointing at in
		// order to determine when the prev user-key is reached.
		i.iterValidityState = IterExhausted
		if i.iterKey == nil {
			// We're positioned after the last key. Need to reposition to point to
			// the last key.
			i.iterLastWithinBounds()
		} else {
			i.prevUserKey()
		}
		if stepAgain {
			i.prevUserKey()
		}
	}
	i.findPrevEntry(limit)
	i.maybeSampleRead()
	return i.iterValidityState
}

// iterFirstWithinBounds moves the internal iterator to the first key,
// respecting bounds: it seeks to the lower bound when one is configured, and
// otherwise calls First on the internal iterator. The result is stored in
// i.iterKey and i.iterValue.
func (i *Iterator) iterFirstWithinBounds() {
	i.stats.ForwardSeekCount[InternalIterCall]++
	if lowerBound := i.opts.GetLowerBound(); lowerBound != nil {
		i.iterKey, i.iterValue = i.iter.SeekGE(lowerBound, base.SeekGEFlagsNone)
	} else {
		i.iterKey, i.iterValue = i.iter.First()
	}
}

// iterLastWithinBounds moves the internal iterator to the last key, respecting
// bounds.
2020 func (i *Iterator) iterLastWithinBounds() { 2021 i.stats.ReverseSeekCount[InternalIterCall]++ 2022 if upperBound := i.opts.GetUpperBound(); upperBound != nil { 2023 i.iterKey, i.iterValue = i.iter.SeekLT(upperBound, base.SeekLTFlagsNone) 2024 } else { 2025 i.iterKey, i.iterValue = i.iter.Last() 2026 } 2027 } 2028 2029 // RangeKeyData describes a range key's data, set through RangeKeySet. The key 2030 // boundaries of the range key is provided by Iterator.RangeBounds. 2031 type RangeKeyData struct { 2032 Suffix []byte 2033 Value []byte 2034 } 2035 2036 // rangeKeyWithinLimit is called during limited reverse iteration when 2037 // positioned over a key beyond the limit. If there exists a range key that lies 2038 // within the limit, the iterator must not pause in order to ensure the user has 2039 // an opportunity to observe the range key within limit. 2040 // 2041 // It would be valid to ignore the limit whenever there's a range key covering 2042 // the key, but that would introduce nondeterminism. To preserve determinism for 2043 // testing, the iterator ignores the limit only if the covering range key does 2044 // cover the keyspace within the limit. 2045 // 2046 // This awkwardness exists because range keys are interleaved at their inclusive 2047 // start positions. Note that limit is inclusive. 2048 func (i *Iterator) rangeKeyWithinLimit(limit []byte) bool { 2049 if i.rangeKey == nil || !i.opts.rangeKeys() { 2050 return false 2051 } 2052 s := i.rangeKey.iiter.Span() 2053 // If the range key ends beyond the limit, then the range key does not cover 2054 // any portion of the keyspace within the limit and it is safe to pause. 2055 return s != nil && i.cmp(s.End, limit) > 0 2056 } 2057 2058 // saveRangeKey saves the current range key to the underlying iterator's current 2059 // range key state. If the range key has not changed, saveRangeKey is a no-op. 
// If there is a new range key, saveRangeKey copies all of the key, value and
// suffixes into Iterator-managed buffers.
func (i *Iterator) saveRangeKey() {
	if i.rangeKey == nil || i.opts.KeyTypes == IterKeyTypePointsOnly {
		return
	}

	s := i.rangeKey.iiter.Span()
	if s == nil {
		// No span at this position: record that there is no range key, and
		// report a change only if the previous position exposed one.
		i.rangeKey.hasRangeKey = false
		i.rangeKey.updated = i.rangeKey.prevPosHadRangeKey
		return
	} else if !i.rangeKey.stale {
		// The range key `s` is identical to the one currently saved. No-op.
		return
	}

	if s.KeysOrder != keyspan.BySuffixAsc {
		panic("pebble: range key span's keys unexpectedly not in ascending suffix order")
	}

	// Although `i.rangeKey.stale` is true, the span s may still be identical
	// to the currently saved span. This is possible when seeking the iterator,
	// which may land back on the same range key. If we previously had a range
	// key and the new one has an identical start key, then it must be the same
	// range key and we can avoid copying and keep `i.rangeKey.updated=false`.
	//
	// TODO(jackson): These key comparisons could be avoidable during relative
	// positioning operations continuing in the same direction, because these
	// ops will never encounter the previous position's range key while
	// stale=true. However, threading whether the current op is a seek or step
	// maybe isn't worth it. This key comparison is only necessary once when we
	// step onto a new range key, which should be relatively rare.
	if i.rangeKey.prevPosHadRangeKey && i.equal(i.rangeKey.start, s.Start) &&
		i.equal(i.rangeKey.end, s.End) {
		i.rangeKey.updated = false
		i.rangeKey.stale = false
		i.rangeKey.hasRangeKey = true
		return
	}
	// A new range key: count its keys and copy the bounds, suffixes and
	// values into i.rangeKey.buf, which is reset first.
	i.stats.RangeKeyStats.Count += len(s.Keys)
	i.rangeKey.buf.Reset()
	i.rangeKey.hasRangeKey = true
	i.rangeKey.updated = true
	i.rangeKey.stale = false
	i.rangeKey.buf, i.rangeKey.start = i.rangeKey.buf.Copy(s.Start)
	i.rangeKey.buf, i.rangeKey.end = i.rangeKey.buf.Copy(s.End)
	i.rangeKey.keys = i.rangeKey.keys[:0]
	for j := 0; j < len(s.Keys); j++ {
		if invariants.Enabled {
			if s.Keys[j].Kind() != base.InternalKeyKindRangeKeySet {
				panic("pebble: user iteration encountered non-RangeKeySet key kind")
			} else if j > 0 && i.cmp(s.Keys[j].Suffix, s.Keys[j-1].Suffix) < 0 {
				panic("pebble: user iteration encountered range keys not in suffix order")
			}
		}
		var rkd RangeKeyData
		i.rangeKey.buf, rkd.Suffix = i.rangeKey.buf.Copy(s.Keys[j].Suffix)
		i.rangeKey.buf, rkd.Value = i.rangeKey.buf.Copy(s.Keys[j].Value)
		i.rangeKey.keys = append(i.rangeKey.keys, rkd)
	}
}

// RangeKeyChanged indicates whether the most recent iterator positioning
// operation resulted in the iterator stepping into or out of a new range key.
// If true, previously returned range key bounds and data has been invalidated.
// If false, previously obtained range key bounds, suffix and value slices are
// still valid and may continue to be read.
//
// Invalid iterator positions are considered to not hold range keys, meaning
// that if an iterator steps from an IterExhausted or IterAtLimit position onto
// a position with a range key, RangeKeyChanged will yield true.
2132 func (i *Iterator) RangeKeyChanged() bool { 2133 return i.iterValidityState == IterValid && i.rangeKey != nil && i.rangeKey.updated 2134 } 2135 2136 // HasPointAndRange indicates whether there exists a point key, a range key or 2137 // both at the current iterator position. 2138 func (i *Iterator) HasPointAndRange() (hasPoint, hasRange bool) { 2139 if i.iterValidityState != IterValid || i.requiresReposition { 2140 return false, false 2141 } 2142 if i.opts.KeyTypes == IterKeyTypePointsOnly { 2143 return true, false 2144 } 2145 return i.rangeKey == nil || !i.rangeKey.rangeKeyOnly, i.rangeKey != nil && i.rangeKey.hasRangeKey 2146 } 2147 2148 // RangeBounds returns the start (inclusive) and end (exclusive) bounds of the 2149 // range key covering the current iterator position. RangeBounds returns nil 2150 // bounds if there is no range key covering the current iterator position, or 2151 // the iterator is not configured to surface range keys. 2152 // 2153 // If valid, the returned start bound is less than or equal to Key() and the 2154 // returned end bound is greater than Key(). 2155 func (i *Iterator) RangeBounds() (start, end []byte) { 2156 if i.rangeKey == nil || !i.opts.rangeKeys() || !i.rangeKey.hasRangeKey { 2157 return nil, nil 2158 } 2159 return i.rangeKey.start, i.rangeKey.end 2160 } 2161 2162 // Key returns the key of the current key/value pair, or nil if done. The 2163 // caller should not modify the contents of the returned slice, and its 2164 // contents may change on the next call to Next. 2165 // 2166 // If positioned at an iterator position that only holds a range key, Key() 2167 // always returns the start bound of the range key. Otherwise, it returns the 2168 // point key's key. 2169 func (i *Iterator) Key() []byte { 2170 return i.key 2171 } 2172 2173 // Value returns the value of the current key/value pair, or nil if done. 
The 2174 // caller should not modify the contents of the returned slice, and its 2175 // contents may change on the next call to Next. 2176 // 2177 // Only valid if HasPointAndRange() returns true for hasPoint. 2178 // Deprecated: use ValueAndErr instead. 2179 func (i *Iterator) Value() []byte { 2180 val, _ := i.ValueAndErr() 2181 return val 2182 } 2183 2184 // ValueAndErr returns the value, and any error encountered in extracting the value. 2185 // REQUIRES: i.Error()==nil and HasPointAndRange() returns true for hasPoint. 2186 // 2187 // The caller should not modify the contents of the returned slice, and its 2188 // contents may change on the next call to Next. 2189 func (i *Iterator) ValueAndErr() ([]byte, error) { 2190 val, callerOwned, err := i.value.Value(i.lazyValueBuf) 2191 if err != nil { 2192 i.err = err 2193 } 2194 if callerOwned { 2195 i.lazyValueBuf = val[:0] 2196 } 2197 return val, err 2198 } 2199 2200 // LazyValue returns the LazyValue. Only for advanced use cases. 2201 // REQUIRES: i.Error()==nil and HasPointAndRange() returns true for hasPoint. 2202 func (i *Iterator) LazyValue() LazyValue { 2203 return i.value 2204 } 2205 2206 // RangeKeys returns the range key values and their suffixes covering the 2207 // current iterator position. The range bounds may be retrieved separately 2208 // through Iterator.RangeBounds(). 2209 func (i *Iterator) RangeKeys() []RangeKeyData { 2210 if i.rangeKey == nil || !i.opts.rangeKeys() || !i.rangeKey.hasRangeKey { 2211 return nil 2212 } 2213 return i.rangeKey.keys 2214 } 2215 2216 // Valid returns true if the iterator is positioned at a valid key/value pair 2217 // and false otherwise. 
func (i *Iterator) Valid() bool {
	// An iterator whose bounds or options were changed (requiresReposition)
	// reports itself as not valid until it is repositioned, regardless of the
	// recorded validity state.
	valid := i.iterValidityState == IterValid && !i.requiresReposition
	if invariants.Enabled {
		// Invariant builds only: a valid iterator must never carry an error.
		if err := i.Error(); valid && err != nil {
			panic(errors.WithSecondaryError(errors.AssertionFailedf("pebble: iterator is valid with non-nil Error"), err))
		}
	}
	return valid
}

// Error returns any accumulated error.
//
// When an internal iterator is present, its error is combined with the
// iterator's own error through firstError (presumably the first non-nil
// argument wins — confirm against firstError).
func (i *Iterator) Error() error {
	if i.iter != nil {
		return firstError(i.err, i.iter.Error())
	}
	return i.err
}

// maxKeyBufCacheSize is the capacity at or above which Close drops a key or
// bounds buffer instead of retaining it in the pooled allocation.
const maxKeyBufCacheSize = 4 << 10 // 4 KB

// Close closes the iterator and returns any accumulated error. Exhausting
// all the key/value pairs in a table is not considered to be an error.
// It is not valid to call any method, including Close, after the iterator
// has been closed.
func (i *Iterator) Close() error {
	// Close the child iterator before releasing the readState because when the
	// readState is released sstables referenced by the readState may be deleted
	// which will fail on Windows if the sstables are still open by the child
	// iterator.
	if i.iter != nil {
		i.err = firstError(i.err, i.iter.Close())

		// Closing i.iter did not necessarily close the point and range key
		// iterators. Calls to SetOptions may have 'disconnected' either one
		// from i.iter if iteration key types were changed. Both point and range
		// key iterators are preserved in case the iterator needs to switch key
		// types again. We explicitly close both of these iterators here.
		//
		// NB: If the iterators were still connected to i.iter, they may be
		// closed, but calling Close on a closed internal iterator or fragment
		// iterator is allowed.
		if i.pointIter != nil && !i.closePointIterOnce {
			i.err = firstError(i.err, i.pointIter.Close())
		}
		if i.rangeKey != nil && i.rangeKey.rangeKeyIter != nil {
			i.err = firstError(i.err, i.rangeKey.rangeKeyIter.Close())
		}
	}
	// From here on errors are accumulated in the local err rather than in
	// i.err; err is what Close ultimately returns.
	err := i.err

	if i.readState != nil {
		if i.readSampling.pendingCompactions.size > 0 {
			// Copy pending read compactions using db.mu.Lock()
			i.readState.db.mu.Lock()
			i.readState.db.mu.compact.readCompactions.combine(&i.readSampling.pendingCompactions, i.cmp)
			reschedule := i.readState.db.mu.compact.rescheduleReadCompaction
			i.readState.db.mu.compact.rescheduleReadCompaction = false
			concurrentCompactions := i.readState.db.mu.compact.compactingCount
			i.readState.db.mu.Unlock()

			if reschedule && concurrentCompactions == 0 {
				// In a read heavy workload, flushes may not happen frequently enough to
				// schedule compactions.
				i.readState.db.compactionSchedulers.Add(1)
				go i.readState.db.maybeScheduleCompactionAsync()
			}
		}

		i.readState.unref()
		i.readState = nil
	}

	if i.version != nil {
		i.version.Unref()
	}

	for _, readers := range i.externalReaders {
		for _, r := range readers {
			err = firstError(err, r.Close())
		}
	}

	// Close the closer for the current value if one was open.
	if i.valueCloser != nil {
		err = firstError(err, i.valueCloser.Close())
		i.valueCloser = nil
	}

	if i.rangeKey != nil {
		// Reset the range key state, keeping only its reusable buffers, and
		// hand it back to the pool.
		i.rangeKey.rangeKeyBuffers.PrepareForReuse()
		*i.rangeKey = iteratorRangeKeyState{
			rangeKeyBuffers: i.rangeKey.rangeKeyBuffers,
		}
		iterRangeKeyStateAllocPool.Put(i.rangeKey)
		i.rangeKey = nil
	}
	if alloc := i.alloc; alloc != nil {
		// Avoid caching the key buf if it is overly large. The constant is fairly
		// arbitrary.
		if cap(i.keyBuf) >= maxKeyBufCacheSize {
			alloc.keyBuf = nil
		} else {
			alloc.keyBuf = i.keyBuf
		}
		if cap(i.prefixOrFullSeekKey) >= maxKeyBufCacheSize {
			alloc.prefixOrFullSeekKey = nil
		} else {
			alloc.prefixOrFullSeekKey = i.prefixOrFullSeekKey
		}
		for j := range i.boundsBuf {
			if cap(i.boundsBuf[j]) >= maxKeyBufCacheSize {
				alloc.boundsBuf[j] = nil
			} else {
				alloc.boundsBuf[j] = i.boundsBuf[j]
			}
		}
		// Zero the whole allocation, retaining only the buffers selected
		// above, before returning it to the pool.
		// NOTE(review): the Iterator itself appears to live inside this
		// allocation (CloneWithContext uses &buf.dbi), so this would also
		// zero *i — confirm; err is a local and is unaffected either way.
		*alloc = iterAlloc{
			keyBuf:              alloc.keyBuf,
			boundsBuf:           alloc.boundsBuf,
			prefixOrFullSeekKey: alloc.prefixOrFullSeekKey,
		}
		iterAllocPool.Put(alloc)
	} else if alloc := i.getIterAlloc; alloc != nil {
		// Same treatment for the get-iterator allocation, which only caches
		// the key buffer.
		if cap(i.keyBuf) >= maxKeyBufCacheSize {
			alloc.keyBuf = nil
		} else {
			alloc.keyBuf = i.keyBuf
		}
		*alloc = getIterAlloc{
			keyBuf: alloc.keyBuf,
		}
		getIterAllocPool.Put(alloc)
	}
	return err
}

// SetBounds sets the lower and upper bounds for the iterator. Once SetBounds
// returns, the caller is free to mutate the provided slices.
//
// The iterator will always be invalidated and must be repositioned with a call
// to SeekGE, SeekPrefixGE, SeekLT, First, or Last.
func (i *Iterator) SetBounds(lower, upper []byte) {
	// Ensure that the Iterator appears exhausted, regardless of whether we
	// actually have to invalidate the internal iterator. Optimizations that
	// avoid exhaustion are an internal implementation detail that shouldn't
	// leak through the interface. The caller should still call an absolute
	// positioning method to reposition the iterator.
	i.requiresReposition = true

	// Bounds are considered unchanged only if nil-ness matches on both sides
	// and the contents compare equal.
	if ((i.opts.LowerBound == nil) == (lower == nil)) &&
		((i.opts.UpperBound == nil) == (upper == nil)) &&
		i.equal(i.opts.LowerBound, lower) &&
		i.equal(i.opts.UpperBound, upper) {
		// Unchanged, noop.
		return
	}

	// Copy the user-provided bounds into an Iterator-owned buffer, and set them
	// on i.opts.{Lower,Upper}Bound.
	i.processBounds(lower, upper)

	i.iter.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
	// If the iterator has an open point iterator that's not currently being
	// used, propagate the new bounds to it.
	if i.pointIter != nil && !i.opts.pointKeys() {
		i.pointIter.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
	}
	// If the iterator has a range key iterator, propagate bounds to it. The
	// top-level SetBounds on the interleaving iterator (i.iter) won't propagate
	// bounds to the range key iterator stack, because the FragmentIterator
	// interface doesn't define a SetBounds method. We need to directly inform
	// the iterConfig stack.
	if i.rangeKey != nil {
		i.rangeKey.iterConfig.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
	}

	// Even though this is not a positioning operation, the alteration of the
	// bounds means we cannot optimize Seeks by using Next.
	i.invalidate()
}

// SetContext replaces the context provided at iterator creation, or the last
// one provided by SetContext. Even though iterators are expected to be
// short-lived, there are some cases where either (a) iterators are used far
// from the code that created them, (b) iterators are reused (while being
// short-lived) for processing different requests. For such scenarios, we
// allow the caller to replace the context.
func (i *Iterator) SetContext(ctx context.Context) {
	i.ctx = ctx
	i.iter.SetContext(ctx)
	// If the iterator has an open point iterator that's not currently being
	// used, propagate the new context to it.
	if i.pointIter != nil && !i.opts.pointKeys() {
		i.pointIter.SetContext(i.ctx)
	}
}

// Initialization and changing of the bounds must call processBounds.
2417 // processBounds saves the bounds and computes derived state from those 2418 // bounds. 2419 func (i *Iterator) processBounds(lower, upper []byte) { 2420 // Copy the user-provided bounds into an Iterator-owned buffer. We can't 2421 // overwrite the current bounds, because some internal iterators compare old 2422 // and new bounds for optimizations. 2423 2424 buf := i.boundsBuf[i.boundsBufIdx][:0] 2425 if lower != nil { 2426 buf = append(buf, lower...) 2427 i.opts.LowerBound = buf 2428 } else { 2429 i.opts.LowerBound = nil 2430 } 2431 i.nextPrefixNotPermittedByUpperBound = false 2432 if upper != nil { 2433 buf = append(buf, upper...) 2434 i.opts.UpperBound = buf[len(buf)-len(upper):] 2435 if i.comparer.Split != nil { 2436 if i.comparer.Split(i.opts.UpperBound) != len(i.opts.UpperBound) { 2437 // Setting an upper bound that is a versioned MVCC key. This means 2438 // that a key can have some MVCC versions before the upper bound and 2439 // some after. This causes significant complications for NextPrefix, 2440 // so we bar the user of NextPrefix. 2441 i.nextPrefixNotPermittedByUpperBound = true 2442 } 2443 } 2444 } else { 2445 i.opts.UpperBound = nil 2446 } 2447 i.boundsBuf[i.boundsBufIdx] = buf 2448 i.boundsBufIdx = 1 - i.boundsBufIdx 2449 } 2450 2451 // SetOptions sets new iterator options for the iterator. Note that the lower 2452 // and upper bounds applied here will supersede any bounds set by previous calls 2453 // to SetBounds. 2454 // 2455 // Note that the slices provided in this SetOptions must not be changed by the 2456 // caller until the iterator is closed, or a subsequent SetBounds or SetOptions 2457 // has returned. This is because comparisons between the existing and new bounds 2458 // are sometimes used to optimize seeking. See the extended commentary on 2459 // SetBounds. 2460 // 2461 // If the iterator was created over an indexed mutable batch, the iterator's 2462 // view of the mutable batch is refreshed. 
//
// The iterator will always be invalidated and must be repositioned with a call
// to SeekGE, SeekPrefixGE, SeekLT, First, or Last.
//
// If only lower and upper bounds need to be modified, prefer SetBounds.
func (i *Iterator) SetOptions(o *IterOptions) {
	// Iterators over external readers only accept options that pass
	// validateExternalIterOpts; anything else is treated as a programming
	// error.
	if i.externalReaders != nil {
		if err := validateExternalIterOpts(o); err != nil {
			panic(err)
		}
	}

	// Ensure that the Iterator appears exhausted, regardless of whether we
	// actually have to invalidate the internal iterator. Optimizations that
	// avoid exhaustion are an internal implementation detail that shouldn't
	// leak through the interface. The caller should still call an absolute
	// positioning method to reposition the iterator.
	i.requiresReposition = true

	// Check if global state requires we close all internal iterators.
	//
	// If the Iterator is in an error state, invalidate the existing iterators
	// so that we reconstruct an iterator state from scratch.
	//
	// If OnlyReadGuaranteedDurable changed, the iterator stacks are incorrect,
	// improperly including or excluding memtables. Invalidate them so that
	// finishInitializingIter will reconstruct them.
	//
	// If either the original options or the new options specify a table filter,
	// we need to reconstruct the iterator stacks. If they both supply a table
	// filter, we can't be certain that it's the same filter since we have no
	// mechanism to compare the filter closures.
	closeBoth := i.err != nil ||
		o.OnlyReadGuaranteedDurable != i.opts.OnlyReadGuaranteedDurable ||
		o.TableFilter != nil || i.opts.TableFilter != nil

	// If either options specify block property filters for an iterator stack,
	// reconstruct it.
	if i.pointIter != nil && (closeBoth || len(o.PointKeyFilters) > 0 || len(i.opts.PointKeyFilters) > 0 ||
		o.RangeKeyMasking.Filter != nil || i.opts.RangeKeyMasking.Filter != nil || o.SkipPoint != nil ||
		i.opts.SkipPoint != nil) {
		i.err = firstError(i.err, i.pointIter.Close())
		i.pointIter = nil
	}
	if i.rangeKey != nil {
		if closeBoth || len(o.RangeKeyFilters) > 0 || len(i.opts.RangeKeyFilters) > 0 {
			i.err = firstError(i.err, i.rangeKey.rangeKeyIter.Close())
			i.rangeKey = nil
		} else {
			// If there's still a range key iterator stack, invalidate the
			// iterator. This ensures RangeKeyChanged() returns true if a
			// subsequent positioning operation discovers a range key. It also
			// prevents seek no-op optimizations.
			i.invalidate()
		}
	}

	// If the iterator is backed by a batch that's been mutated, refresh its
	// existing point and range-key iterators, and invalidate the iterator to
	// prevent seek-using-next optimizations. If we don't yet have a point-key
	// iterator or range-key iterator but we require one, it'll be created in
	// the slow path that reconstructs the iterator in finishInitializingIter.
	if i.batch != nil {
		// The batch's current data length, tagged with the batch bit, serves
		// as the sequence number up to which batch mutations are visible.
		nextBatchSeqNum := (uint64(len(i.batch.data)) | base.InternalKeySeqNumBatch)
		if nextBatchSeqNum != i.batchSeqNum {
			i.batchSeqNum = nextBatchSeqNum
			if i.merging != nil {
				i.merging.batchSnapshot = nextBatchSeqNum
			}
			// Prevent a no-op seek optimization on the next seek. We won't be
			// able to reuse the top-level Iterator state, because it may be
			// incorrect after the inclusion of new batch mutations.
			i.batchJustRefreshed = true
			if i.pointIter != nil && i.batch.countRangeDels > 0 {
				if i.batchRangeDelIter.Count() == 0 {
					// When we constructed this iterator, there were no
					// rangedels in the batch. Iterator construction will
					// have excluded the batch rangedel iterator from the
					// point iterator stack. We need to reconstruct the
					// point iterator to add i.batchRangeDelIter into the
					// iterator stack.
					i.err = firstError(i.err, i.pointIter.Close())
					i.pointIter = nil
				} else {
					// There are range deletions in the batch and we already
					// have a batch rangedel iterator. We can update the
					// batch rangedel iterator in place.
					//
					// NB: There may or may not be new range deletions. We
					// can't tell based on i.batchRangeDelIter.Count(),
					// which is the count of fragmented range deletions, NOT
					// the number of range deletions written to the batch
					// [i.batch.countRangeDels].
					i.batch.initRangeDelIter(&i.opts, &i.batchRangeDelIter, nextBatchSeqNum)
				}
			}
			if i.rangeKey != nil && i.batch.countRangeKeys > 0 {
				if i.batchRangeKeyIter.Count() == 0 {
					// When we constructed this iterator, there were no range
					// keys in the batch. Iterator construction will have
					// excluded the batch rangekey iterator from the range key
					// iterator stack. We need to reconstruct the range key
					// iterator to add i.batchRangeKeyIter into the iterator
					// stack.
					i.err = firstError(i.err, i.rangeKey.rangeKeyIter.Close())
					i.rangeKey = nil
				} else {
					// There are range keys in the batch and we already
					// have a batch rangekey iterator. We can update the batch
					// rangekey iterator in place.
					//
					// NB: There may or may not be new range keys. We can't
					// tell based on i.batchRangeKeyIter.Count(), which is the
					// count of fragmented range keys, NOT the number of
					// range keys written to the batch [i.batch.countRangeKeys].
					i.batch.initRangeKeyIter(&i.opts, &i.batchRangeKeyIter, nextBatchSeqNum)
					i.invalidate()
				}
			}
		}
	}

	// Reset combinedIterState.initialized in case the iterator key types
	// changed. If there's already a range key iterator stack, the combined
	// iterator is already initialized. Additionally, if the iterator is not
	// configured to include range keys, mark it as initialized to signal that
	// lower level iterators should not trigger a switch to combined iteration.
	i.lazyCombinedIter.combinedIterState = combinedIterState{
		initialized: i.rangeKey != nil || !i.opts.rangeKeys(),
	}

	// Bounds are equal only if nil-ness matches on both sides and the
	// contents compare equal.
	boundsEqual := ((i.opts.LowerBound == nil) == (o.LowerBound == nil)) &&
		((i.opts.UpperBound == nil) == (o.UpperBound == nil)) &&
		i.equal(i.opts.LowerBound, o.LowerBound) &&
		i.equal(i.opts.UpperBound, o.UpperBound)

	if boundsEqual && o.KeyTypes == i.opts.KeyTypes &&
		(i.pointIter != nil || !i.opts.pointKeys()) &&
		(i.rangeKey != nil || !i.opts.rangeKeys() || i.opts.KeyTypes == IterKeyTypePointsAndRanges) &&
		i.equal(o.RangeKeyMasking.Suffix, i.opts.RangeKeyMasking.Suffix) &&
		o.UseL6Filters == i.opts.UseL6Filters {
		// The options are identical, so we can likely use the fast path. In
		// addition to all the above constraints, we cannot use the fast path if
		// configured to perform lazy combined iteration but an indexed batch
		// used by the iterator now contains range keys. Lazy combined iteration
		// is not compatible with batch range keys because we always need to
		// merge the batch's range keys into iteration.
		if i.rangeKey != nil || !i.opts.rangeKeys() || i.batch == nil || i.batch.countRangeKeys == 0 {
			// Fast path. This preserves the Seek-using-Next optimizations as
			// long as the iterator wasn't already invalidated up above.
			return
		}
	}
	// Slow path.

	// The options changed. Save the new ones to i.opts.
	if boundsEqual {
		// Copying the options into i.opts will overwrite LowerBound and
		// UpperBound fields with the user-provided slices. We need to hold on
		// to the Pebble-owned slices, so save them and re-set them after the
		// copy.
		lower, upper := i.opts.LowerBound, i.opts.UpperBound
		i.opts = *o
		i.opts.LowerBound, i.opts.UpperBound = lower, upper
	} else {
		i.opts = *o
		i.processBounds(o.LowerBound, o.UpperBound)
		// Propagate the changed bounds to the existing point iterator.
		// NB: We propagate i.opts.{Lower,Upper}Bound, not o.{Lower,Upper}Bound
		// because i.opts now point to buffers owned by Pebble.
		if i.pointIter != nil {
			i.pointIter.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
		}
		if i.rangeKey != nil {
			i.rangeKey.iterConfig.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
		}
	}

	// Even though this is not a positioning operation, the invalidation of the
	// iterator stack means we cannot optimize Seeks by using Next.
	i.invalidate()

	// Iterators created through NewExternalIter have a different iterator
	// initialization process.
	if i.externalReaders != nil {
		finishInitializingExternal(i.ctx, i)
		return
	}
	finishInitializingIter(i.ctx, i.alloc)
}

// invalidate resets the iterator's positioning state: it clears the current
// key, value and error, marks the iterator exhausted, and records that the
// last positioning operation was invalidated. If range key state exists, its
// interleaving iterator is invalidated too.
func (i *Iterator) invalidate() {
	i.lastPositioningOp = invalidatedLastPositionOp
	i.hasPrefix = false
	i.iterKey = nil
	i.iterValue = LazyValue{}
	i.err = nil
	// This switch statement isn't necessary for correctness since callers
	// should call a repositioning method. We could have arbitrarily set i.pos
	// to one of the values. But it results in more intuitive behavior in
	// tests, which do not always reposition.
	switch i.pos {
	case iterPosCurForward, iterPosNext, iterPosCurForwardPaused:
		i.pos = iterPosCurForward
	case iterPosCurReverse, iterPosPrev, iterPosCurReversePaused:
		i.pos = iterPosCurReverse
	}
	i.iterValidityState = IterExhausted
	if i.rangeKey != nil {
		i.rangeKey.iiter.Invalidate()
		i.rangeKey.prevPosHadRangeKey = false
	}
}

// Metrics returns per-iterator metrics.
//
// ReadAmp is the number of levels of the underlying merging iterator when
// i.iter is a *mergingIter, and 1 otherwise.
func (i *Iterator) Metrics() IteratorMetrics {
	m := IteratorMetrics{
		ReadAmp: 1,
	}
	if mi, ok := i.iter.(*mergingIter); ok {
		m.ReadAmp = len(mi.levels)
	}
	return m
}

// ResetStats resets the stats to 0.
func (i *Iterator) ResetStats() {
	i.stats = IteratorStats{}
}

// Stats returns the current stats.
func (i *Iterator) Stats() IteratorStats {
	return i.stats
}

// CloneOptions configures an iterator constructed through Iterator.Clone.
type CloneOptions struct {
	// IterOptions, if non-nil, define the iterator options to configure a
	// cloned iterator. If nil, the clone adopts the same IterOptions as the
	// iterator being cloned.
	IterOptions *IterOptions
	// RefreshBatchView may be set to true when cloning an Iterator over an
	// indexed batch. When false, the clone adopts the same (possibly stale)
	// view of the indexed batch as the cloned Iterator. When true, the clone is
	// constructed with a refreshed view of the batch, observing all of the
	// batch's mutations at the time of the Clone. If the cloned iterator was
	// not constructed to read over an indexed batch, RefreshBatchView has no
	// effect.
	RefreshBatchView bool
}

// Clone creates a new Iterator over the same underlying data, i.e., over the
// same {batch, memtables, sstables}. The resulting iterator is not positioned.
// It starts with the same IterOptions, unless opts.IterOptions is set.
2717 // 2718 // When called on an Iterator over an indexed batch, the clone's visibility of 2719 // the indexed batch is determined by CloneOptions.RefreshBatchView. If false, 2720 // the clone inherits the iterator's current (possibly stale) view of the batch, 2721 // and callers may call SetOptions to subsequently refresh the clone's view to 2722 // include all batch mutations. If true, the clone is constructed with a 2723 // complete view of the indexed batch's mutations at the time of the Clone. 2724 // 2725 // Callers can use Clone if they need multiple iterators that need to see 2726 // exactly the same underlying state of the DB. This should not be used to 2727 // extend the lifetime of the data backing the original Iterator since that 2728 // will cause an increase in memory and disk usage (use NewSnapshot for that 2729 // purpose). 2730 func (i *Iterator) Clone(opts CloneOptions) (*Iterator, error) { 2731 return i.CloneWithContext(context.Background(), opts) 2732 } 2733 2734 // CloneWithContext is like Clone, and additionally accepts a context for 2735 // tracing. 2736 func (i *Iterator) CloneWithContext(ctx context.Context, opts CloneOptions) (*Iterator, error) { 2737 if opts.IterOptions == nil { 2738 opts.IterOptions = &i.opts 2739 } 2740 if i.batchOnlyIter { 2741 return nil, errors.Errorf("cannot Clone a batch-only Iterator") 2742 } 2743 readState := i.readState 2744 vers := i.version 2745 if readState == nil && vers == nil { 2746 return nil, errors.Errorf("cannot Clone a closed Iterator") 2747 } 2748 // i is already holding a ref, so there is no race with unref here. 2749 // 2750 // TODO(bilal): If the underlying iterator was created on a snapshot, we could 2751 // grab a reference to the current readState instead of reffing the original 2752 // readState. This allows us to release references to some zombie sstables 2753 // and memtables. 
2754 if readState != nil { 2755 readState.ref() 2756 } 2757 if vers != nil { 2758 vers.Ref() 2759 } 2760 // Bundle various structures under a single umbrella in order to allocate 2761 // them together. 2762 buf := iterAllocPool.Get().(*iterAlloc) 2763 dbi := &buf.dbi 2764 *dbi = Iterator{ 2765 ctx: ctx, 2766 opts: *opts.IterOptions, 2767 alloc: buf, 2768 merge: i.merge, 2769 comparer: i.comparer, 2770 readState: readState, 2771 version: vers, 2772 keyBuf: buf.keyBuf, 2773 prefixOrFullSeekKey: buf.prefixOrFullSeekKey, 2774 boundsBuf: buf.boundsBuf, 2775 batch: i.batch, 2776 batchSeqNum: i.batchSeqNum, 2777 newIters: i.newIters, 2778 newIterRangeKey: i.newIterRangeKey, 2779 seqNum: i.seqNum, 2780 } 2781 dbi.processBounds(dbi.opts.LowerBound, dbi.opts.UpperBound) 2782 2783 // If the caller requested the clone have a current view of the indexed 2784 // batch, set the clone's batch sequence number appropriately. 2785 if i.batch != nil && opts.RefreshBatchView { 2786 dbi.batchSeqNum = (uint64(len(i.batch.data)) | base.InternalKeySeqNumBatch) 2787 } 2788 2789 return finishInitializingIter(ctx, buf), nil 2790 } 2791 2792 // Merge adds all of the argument's statistics to the receiver. It may be used 2793 // to accumulate stats across multiple iterators. 2794 func (stats *IteratorStats) Merge(o IteratorStats) { 2795 for i := InterfaceCall; i < NumStatsKind; i++ { 2796 stats.ForwardSeekCount[i] += o.ForwardSeekCount[i] 2797 stats.ReverseSeekCount[i] += o.ReverseSeekCount[i] 2798 stats.ForwardStepCount[i] += o.ForwardStepCount[i] 2799 stats.ReverseStepCount[i] += o.ReverseStepCount[i] 2800 } 2801 stats.InternalStats.Merge(o.InternalStats) 2802 stats.RangeKeyStats.Merge(o.RangeKeyStats) 2803 } 2804 2805 func (stats *IteratorStats) String() string { 2806 return redact.StringWithoutMarkers(stats) 2807 } 2808 2809 // SafeFormat implements the redact.SafeFormatter interface. 
2810 func (stats *IteratorStats) SafeFormat(s redact.SafePrinter, verb rune) { 2811 for i := range stats.ForwardStepCount { 2812 switch IteratorStatsKind(i) { 2813 case InterfaceCall: 2814 s.SafeString("(interface (dir, seek, step): ") 2815 case InternalIterCall: 2816 s.SafeString(", (internal (dir, seek, step): ") 2817 } 2818 s.Printf("(fwd, %d, %d), (rev, %d, %d))", 2819 redact.Safe(stats.ForwardSeekCount[i]), redact.Safe(stats.ForwardStepCount[i]), 2820 redact.Safe(stats.ReverseSeekCount[i]), redact.Safe(stats.ReverseStepCount[i])) 2821 } 2822 if stats.InternalStats != (InternalIteratorStats{}) { 2823 s.SafeString(",\n(internal-stats: ") 2824 s.Printf("(block-bytes: (total %s, cached %s, read-time %s)), "+ 2825 "(points: (count %s, key-bytes %s, value-bytes %s, tombstoned %s))", 2826 humanize.Bytes.Uint64(stats.InternalStats.BlockBytes), 2827 humanize.Bytes.Uint64(stats.InternalStats.BlockBytesInCache), 2828 humanize.FormattedString(stats.InternalStats.BlockReadDuration.String()), 2829 humanize.Count.Uint64(stats.InternalStats.PointCount), 2830 humanize.Bytes.Uint64(stats.InternalStats.KeyBytes), 2831 humanize.Bytes.Uint64(stats.InternalStats.ValueBytes), 2832 humanize.Count.Uint64(stats.InternalStats.PointsCoveredByRangeTombstones), 2833 ) 2834 if stats.InternalStats.SeparatedPointValue.Count != 0 { 2835 s.Printf(", (separated: (count %s, bytes %s, fetched %s)))", 2836 humanize.Count.Uint64(stats.InternalStats.SeparatedPointValue.Count), 2837 humanize.Bytes.Uint64(stats.InternalStats.SeparatedPointValue.ValueBytes), 2838 humanize.Bytes.Uint64(stats.InternalStats.SeparatedPointValue.ValueBytesFetched)) 2839 } else { 2840 s.Printf(")") 2841 } 2842 } 2843 if stats.RangeKeyStats != (RangeKeyIteratorStats{}) { 2844 s.SafeString(",\n(range-key-stats: ") 2845 s.Printf("(count %d), (contained points: (count %d, skipped %d)))", 2846 stats.RangeKeyStats.Count, 2847 stats.RangeKeyStats.ContainedPoints, 2848 stats.RangeKeyStats.SkippedPoints) 2849 } 2850 } 2851 2852 // 
CanDeterministicallySingleDelete takes a valid iterator and examines internal 2853 // state to determine if a SingleDelete deleting Iterator.Key() would 2854 // deterministically delete the key. CanDeterministicallySingleDelete requires 2855 // the iterator to be oriented in the forward direction (eg, the last 2856 // positioning operation must've been a First, a Seek[Prefix]GE, or a 2857 // Next[Prefix][WithLimit]). 2858 // 2859 // This function does not change the external position of the iterator, and all 2860 // positioning methods should behave the same as if it was never called. This 2861 // function will only return a meaningful result the first time it's invoked at 2862 // an iterator position. This function invalidates the iterator Value's memory, 2863 // and the caller must not rely on the memory safety of the previous Iterator 2864 // position. 2865 // 2866 // If CanDeterministicallySingleDelete returns true AND the key at the iterator 2867 // position is not modified between the creation of the Iterator and the commit 2868 // of a batch containing a SingleDelete over the key, then the caller can be 2869 // assured that SingleDelete is equivalent to Delete on the local engine, but it 2870 // may not be true on another engine that received the same writes and with 2871 // logically equivalent state since this engine may have collapsed multiple SETs 2872 // into one. 2873 func CanDeterministicallySingleDelete(it *Iterator) (bool, error) { 2874 // This function may only be called once per external iterator position. We 2875 // can validate this by checking the last positioning operation. 
2876 if it.lastPositioningOp == internalNextOp { 2877 return false, errors.New("pebble: CanDeterministicallySingleDelete called twice") 2878 } 2879 validity, kind := it.internalNext() 2880 var shadowedBySingleDelete bool 2881 for validity == internalNextValid { 2882 switch kind { 2883 case InternalKeyKindDelete, InternalKeyKindDeleteSized: 2884 // A DEL or DELSIZED tombstone is okay. An internal key 2885 // sequence like SINGLEDEL; SET; DEL; SET can be handled 2886 // deterministically. If there are SETs further down, we 2887 // don't care about them. 2888 return true, nil 2889 case InternalKeyKindSingleDelete: 2890 // A SingleDelete is okay as long as when that SingleDelete was 2891 // written, it was written deterministically (eg, with its own 2892 // CanDeterministicallySingleDelete check). Validate that it was 2893 // written deterministically. We'll allow one set to appear after 2894 // the SingleDelete. 2895 shadowedBySingleDelete = true 2896 validity, kind = it.internalNext() 2897 continue 2898 case InternalKeyKindSet, InternalKeyKindSetWithDelete, InternalKeyKindMerge: 2899 // If we observed a single delete, it's allowed to delete 1 key. 2900 // We'll keep looping to validate that the internal keys beneath the 2901 // already-written single delete are copacetic. 2902 if shadowedBySingleDelete { 2903 shadowedBySingleDelete = false 2904 validity, kind = it.internalNext() 2905 continue 2906 } 2907 // We encountered a shadowed SET, SETWITHDEL, MERGE. A SINGLEDEL 2908 // that deleted the KV at the original iterator position could 2909 // result in this key becoming visible. 2910 return false, nil 2911 case InternalKeyKindRangeDelete: 2912 // RangeDeletes are handled by the merging iterator and should never 2913 // be observed by the top-level Iterator. 
2914 panic(errors.AssertionFailedf("pebble: unexpected range delete")) 2915 case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: 2916 // Range keys are interleaved at the maximal sequence number and 2917 // should never be observed within a user key. 2918 panic(errors.AssertionFailedf("pebble: unexpected range key")) 2919 default: 2920 panic(errors.AssertionFailedf("pebble: unexpected key kind: %s", errors.Safe(kind))) 2921 } 2922 } 2923 if validity == internalNextError { 2924 return false, it.Error() 2925 } 2926 return true, nil 2927 } 2928 2929 // internalNextValidity enumerates the potential outcomes of a call to 2930 // internalNext. 2931 type internalNextValidity int8 2932 2933 const ( 2934 // internalNextError is returned by internalNext when an error occurred and 2935 // the caller is responsible for checking iter.Error(). 2936 internalNextError internalNextValidity = iota 2937 // internalNextExhausted is returned by internalNext when the next internal 2938 // key is an internal key with a different user key than Iterator.Key(). 2939 internalNextExhausted 2940 // internalNextValid is returned by internalNext when the internal next 2941 // found a shadowed internal key with a user key equal to Iterator.Key(). 2942 internalNextValid 2943 ) 2944 2945 // internalNext advances internal Iterator state forward to expose the 2946 // InternalKeyKind of the next internal key with a user key equal to Key(). 2947 // 2948 // internalNext is a highly specialized operation and is unlikely to be 2949 // generally useful. See Iterator.Next for how to reposition the iterator to the 2950 // next key. internalNext requires the Iterator to be at a valid position in the 2951 // forward direction (the last positioning operation must've been a First, a 2952 // Seek[Prefix]GE, or a Next[Prefix][WithLimit] and Valid() must return true). 2953 // 2954 // internalNext, unlike all other Iterator methods, exposes internal LSM state. 
// internalNext advances the Iterator's internal iterator to the next shadowed
// key with a user key equal to Key(). When a key is overwritten or deleted, its
// removal from the LSM occurs lazily as a part of compactions. internalNext
// allows the caller to see whether an obsolete internal key exists with the
// current Key(), and what its key kind is. Note that the existence of an
// internal key is nondeterministic and dependent on internal LSM state. These
// semantics are unlikely to be applicable to most use cases.
//
// If internalNext finds a key that shares the same user key as Key(), it
// returns internalNextValid and the internal key's kind. If internalNext
// encounters an error, it returns internalNextError and the caller is expected
// to call Iterator.Error() to retrieve it. In all other circumstances,
// internalNext returns internalNextExhausted, indicating that there are no more
// additional internal keys with the user key Key().
//
// internalNext does not change the external position of the iterator, and a
// Next operation should behave the same as if internalNext was never called.
// internalNext does invalidate the iterator Value's memory, and the caller must
// not rely on the memory safety of the previous Iterator position.
2974 func (i *Iterator) internalNext() (internalNextValidity, base.InternalKeyKind) { 2975 i.stats.ForwardStepCount[InterfaceCall]++ 2976 if i.err != nil { 2977 return internalNextError, base.InternalKeyKindInvalid 2978 } else if i.iterValidityState != IterValid { 2979 return internalNextExhausted, base.InternalKeyKindInvalid 2980 } 2981 i.lastPositioningOp = internalNextOp 2982 2983 switch i.pos { 2984 case iterPosCurForward: 2985 i.iterKey, i.iterValue = i.iter.Next() 2986 if i.iterKey == nil { 2987 // We check i.iter.Error() here and return an internalNextError enum 2988 // variant so that the caller does not need to check i.iter.Error() 2989 // in the common case that the next internal key has a new user key. 2990 if i.err = i.iter.Error(); i.err != nil { 2991 return internalNextError, base.InternalKeyKindInvalid 2992 } 2993 i.pos = iterPosNext 2994 return internalNextExhausted, base.InternalKeyKindInvalid 2995 } else if i.comparer.Equal(i.iterKey.UserKey, i.key) { 2996 return internalNextValid, i.iterKey.Kind() 2997 } 2998 i.pos = iterPosNext 2999 return internalNextExhausted, base.InternalKeyKindInvalid 3000 case iterPosCurReverse, iterPosCurReversePaused, iterPosPrev: 3001 i.err = errors.New("switching from reverse to forward via internalNext is prohibited") 3002 i.iterValidityState = IterExhausted 3003 return internalNextError, base.InternalKeyKindInvalid 3004 case iterPosNext, iterPosCurForwardPaused: 3005 // The previous method already moved onto the next user key. This is 3006 // only possible if 3007 // - the last positioning method was a call to internalNext, and we 3008 // advanced to a new user key. 3009 // - the previous non-internalNext iterator operation encountered a 3010 // range key or merge, forcing an internal Next that found a new 3011 // user key that's not equal to i.Iterator.Key(). 3012 return internalNextExhausted, base.InternalKeyKindInvalid 3013 default: 3014 panic("unreachable") 3015 } 3016 }