github.com/cockroachdb/pebble@v1.1.2/iterator.go 1 // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use 2 // of this source code is governed by a BSD-style license that can be found in 3 // the LICENSE file. 4 5 package pebble 6 7 import ( 8 "bytes" 9 "context" 10 "io" 11 "sync" 12 "unsafe" 13 14 "github.com/cockroachdb/errors" 15 "github.com/cockroachdb/pebble/internal/base" 16 "github.com/cockroachdb/pebble/internal/bytealloc" 17 "github.com/cockroachdb/pebble/internal/fastrand" 18 "github.com/cockroachdb/pebble/internal/humanize" 19 "github.com/cockroachdb/pebble/internal/invariants" 20 "github.com/cockroachdb/pebble/internal/keyspan" 21 "github.com/cockroachdb/pebble/internal/manifest" 22 "github.com/cockroachdb/pebble/internal/rangekey" 23 "github.com/cockroachdb/pebble/sstable" 24 "github.com/cockroachdb/redact" 25 ) 26 27 // iterPos describes the state of the internal iterator, in terms of whether it 28 // is at the position returned to the user (cur), one ahead of the position 29 // returned (next for forward iteration and prev for reverse iteration). The cur 30 // position is split into two states, for forward and reverse iteration, since 31 // we need to differentiate for switching directions. 32 // 33 // There is subtlety in what is considered the current position of the Iterator. 34 // The internal iterator exposes a sequence of internal keys. There is not 35 // always a single internalIterator position corresponding to the position 36 // returned to the user. Consider the example: 37 // 38 // a.MERGE.9 a.MERGE.8 a.MERGE.7 a.SET.6 b.DELETE.9 b.DELETE.5 b.SET.4 39 // \ / 40 // \ Iterator.Key() = 'a' / 41 // 42 // The Iterator exposes one valid position at user key 'a' and the two exhausted 43 // positions at the beginning and end of iteration. The underlying 44 // internalIterator contains 7 valid positions and 2 exhausted positions. 45 // 46 // Iterator positioning methods must set iterPos to iterPosCur{Forward,Backward} 47 // iff the user key at the current internalIterator position equals the 48 // Iterator.Key returned to the user. This guarantees that a call to nextUserKey 49 // or prevUserKey will advance to the next or previous iterator position. 50 // iterPosCur{Forward,Backward} does not make any guarantee about the internal 51 // iterator position among internal keys with matching user keys, and it will 52 // vary subtly depending on the particular key kinds encountered. In the above 53 // example, the iterator returning 'a' may set iterPosCurForward if the internal 54 // iterator is positioned at any of a.MERGE.9, a.MERGE.8, a.MERGE.7 or a.SET.6. 55 // 56 // When setting iterPos to iterPosNext or iterPosPrev, the internal iterator 57 // must be advanced to the first internalIterator position at a user key greater 58 // (iterPosNext) or less (iterPosPrev) than the key returned to the user. An 59 // internalIterator position that's !Valid() must also be considered greater or 60 // less—depending on the direction of iteration—than the last valid Iterator 61 // position. 62 type iterPos int8 63 64 const ( 65 iterPosCurForward iterPos = 0 66 iterPosNext iterPos = 1 67 iterPosPrev iterPos = -1 68 iterPosCurReverse iterPos = -2 69 70 // For limited iteration. When the iterator is at iterPosCurForwardPaused 71 // - Next*() call should behave as if the internal iterator is already 72 // at next (akin to iterPosNext). 73 // - Prev*() call should behave as if the internal iterator is at the 74 // current key (akin to iterPosCurForward).
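// Concretely (a hypothetical sketch, not from the source): suppose a
// NextWithLimit(limit) call paused and left the internal iterator on key
// "k", the first key at or past limit. A later Next*() resumes by
// processing "k" without stepping first (as if at iterPosNext), while a
// later Prev*() treats "k" as the current key and steps backward from it
// (as if at iterPosCurForward).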
75 // 76 // Similar semantics apply to CurReversePaused. 77 iterPosCurForwardPaused iterPos = 2 78 iterPosCurReversePaused iterPos = -3 79 ) 80 81 // Approximate gap in bytes between samples of data read during iteration. 82 // This is multiplied with a default ReadSamplingMultiplier of 1 << 4 to yield 83 // 1 << 20 (1MB). The 1MB factor comes from: 84 // https://github.com/cockroachdb/pebble/issues/29#issuecomment-494477985 85 const readBytesPeriod uint64 = 1 << 16 86 87 var errReversePrefixIteration = errors.New("pebble: unsupported reverse prefix iteration") 88 89 // IteratorMetrics holds per-iterator metrics. These do not change over the 90 // lifetime of the iterator. 91 type IteratorMetrics struct { 92 // The read amplification experienced by this iterator. This is the sum of 93 // the memtables, the L0 sublevels and the non-empty Ln levels. Higher read 94 // amplification generally results in slower reads, though allowing higher 95 // read amplification can also result in faster writes. 96 ReadAmp int 97 } 98 99 // IteratorStatsKind describes the two kinds of iterator stats. 100 type IteratorStatsKind int8 101 102 const ( 103 // InterfaceCall represents calls to Iterator. 104 InterfaceCall IteratorStatsKind = iota 105 // InternalIterCall represents calls by Iterator to its internalIterator. 106 InternalIterCall 107 // NumStatsKind is the number of kinds, and is used for array sizing. 108 NumStatsKind 109 ) 110 111 // IteratorStats contains iteration stats. 112 type IteratorStats struct { 113 // ForwardSeekCount includes SeekGE, SeekPrefixGE, First. 114 ForwardSeekCount [NumStatsKind]int 115 // ReverseSeekCount includes SeekLT, Last. 116 ReverseSeekCount [NumStatsKind]int 117 // ForwardStepCount includes Next. 118 ForwardStepCount [NumStatsKind]int 119 // ReverseStepCount includes Prev. 120 ReverseStepCount [NumStatsKind]int 121 InternalStats InternalIteratorStats 122 RangeKeyStats RangeKeyIteratorStats 123 } 124 125 var _ redact.SafeFormatter = &IteratorStats{} 126 127 // InternalIteratorStats contains miscellaneous stats produced by internal 128 // iterators. 129 type InternalIteratorStats = base.InternalIteratorStats 130 131 // RangeKeyIteratorStats contains miscellaneous stats about range keys 132 // encountered by the iterator. 133 type RangeKeyIteratorStats struct { 134 // Count records the number of range keys encountered during 135 // iteration. Range keys may be counted multiple times if the iterator 136 // leaves a range key's bounds and then returns. 137 Count int 138 // ContainedPoints records the number of point keys encountered within the 139 // bounds of a range key. Note that this includes point keys with suffixes 140 // that sort both above and below the covering range key's suffix. 141 ContainedPoints int 142 // SkippedPoints records the count of the subset of ContainedPoints point 143 // keys that were skipped during iteration due to range-key masking. It does 144 // not include point keys that were never loaded because a 145 // RangeKeyMasking.Filter excluded the entire containing block. 146 SkippedPoints int 147 } 148 149 // Merge adds all of the argument's statistics to the receiver. It may be used 150 // to accumulate stats across multiple iterators.
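//
// A minimal usage sketch (the iterators and loop variable are hypothetical,
// not from this package):
//
//	var total RangeKeyIteratorStats
//	for _, it := range iters {
//		stats := it.Stats() // pebble.IteratorStats
//		total.Merge(stats.RangeKeyStats)
//	}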
151 func (s *RangeKeyIteratorStats) Merge(o RangeKeyIteratorStats) { 152 s.Count += o.Count 153 s.ContainedPoints += o.ContainedPoints 154 s.SkippedPoints += o.SkippedPoints 155 } 156 157 func (s *RangeKeyIteratorStats) String() string { 158 return redact.StringWithoutMarkers(s) 159 } 160 161 // SafeFormat implements the redact.SafeFormatter interface. 162 func (s *RangeKeyIteratorStats) SafeFormat(p redact.SafePrinter, verb rune) { 163 p.Printf("range keys: %s, contained points: %s (%s skipped)", 164 humanize.Count.Uint64(uint64(s.Count)), 165 humanize.Count.Uint64(uint64(s.ContainedPoints)), 166 humanize.Count.Uint64(uint64(s.SkippedPoints))) 167 } 168 169 // LazyValue is a lazy value. See the long comment in base.LazyValue. 170 type LazyValue = base.LazyValue 171 172 // Iterator iterates over a DB's key/value pairs in key order. 173 // 174 // An iterator must be closed after use, but it is not necessary to read an 175 // iterator until exhaustion. 176 // 177 // An iterator is not goroutine-safe, but it is safe to use multiple iterators 178 // concurrently, with each in a dedicated goroutine. 179 // 180 // It is also safe to use an iterator concurrently with modifying its 181 // underlying DB, if that DB permits modification. However, the resultant 182 // key/value pairs are not guaranteed to be a consistent snapshot of that DB 183 // at a particular point in time. 184 // 185 // If an iterator encounters an error during any operation, it is stored by 186 // the Iterator and surfaced through the Error method. All absolute 187 // positioning methods (eg, SeekLT, SeekGE, First, Last, etc) reset any 188 // accumulated error before positioning. All relative positioning methods (eg, 189 // Next, Prev) return without advancing if the iterator has an accumulated 190 // error. 191 type Iterator struct { 192 // The context is stored here since (a) Iterators are expected to be 193 // short-lived (since they pin memtables and sstables), (b) plumbing a 194 // context into every method is very painful, (c) they do not (yet) respect 195 // context cancellation and are only used for tracing. 196 ctx context.Context 197 opts IterOptions 198 merge Merge 199 comparer base.Comparer 200 iter internalIterator 201 pointIter internalIterator 202 // Either readState or version is set, but not both. 203 readState *readState 204 version *version 205 // rangeKey holds iteration state specific to iteration over range keys. 206 // The range key field may be nil if the Iterator has never been configured 207 // to iterate over range keys. Its non-nilness cannot be used to determine 208 // if the Iterator is currently iterating over range keys: For that, consult 209 // the IterOptions using opts.rangeKeys(). If non-nil, its rangeKeyIter 210 // field is guaranteed to be non-nil too. 211 rangeKey *iteratorRangeKeyState 212 // rangeKeyMasking holds state for range-key masking of point keys. 213 rangeKeyMasking rangeKeyMasking 214 err error 215 // When iterValidityState=IterValid, key represents the current key, which 216 // is backed by keyBuf. 217 key []byte 218 keyBuf []byte 219 value LazyValue 220 // For use in LazyValue.Clone. 221 valueBuf []byte 222 fetcher base.LazyFetcher 223 // For use in LazyValue.Value. 224 lazyValueBuf []byte 225 valueCloser io.Closer 226 // boundsBuf holds two buffers used to store the lower and upper bounds. 227 // Whenever the Iterator's bounds change, the new bounds are copied into 228 // boundsBuf[boundsBufIdx]. The two bounds share a slice to reduce 229 // allocations.
opts.LowerBound and opts.UpperBound point into this slice. 230 boundsBuf [2][]byte 231 boundsBufIdx int 232 // iterKey, iterValue reflect the latest position of iter, except when 233 // SetBounds is called. In that case, these are explicitly set to nil. 234 iterKey *InternalKey 235 iterValue LazyValue 236 alloc *iterAlloc 237 getIterAlloc *getIterAlloc 238 prefixOrFullSeekKey []byte 239 readSampling readSampling 240 stats IteratorStats 241 externalReaders [][]*sstable.Reader 242 243 // Following fields used when constructing an iterator stack, eg, in Clone 244 // and SetOptions or when re-fragmenting a batch's range keys/range dels. 245 // Non-nil if this Iterator includes a Batch. 246 batch *Batch 247 newIters tableNewIters 248 newIterRangeKey keyspan.TableNewSpanIter 249 lazyCombinedIter lazyCombinedIter 250 seqNum uint64 251 // batchSeqNum is used by Iterators over indexed batches to detect when the 252 // underlying batch has been mutated. The batch beneath an indexed batch may 253 // be mutated while the Iterator is open, but new keys are not surfaced 254 // until the next call to SetOptions. 255 batchSeqNum uint64 256 // batch{PointIter,RangeDelIter,RangeKeyIter} are used when the Iterator is 257 // configured to read through an indexed batch. If a batch is set, these 258 // iterators will be included within the iterator stack regardless of 259 // whether the batch currently contains any keys of their kind. These 260 // pointers are used during a call to SetOptions to refresh the Iterator's 261 // view of its indexed batch. 262 batchPointIter batchIter 263 batchRangeDelIter keyspan.Iter 264 batchRangeKeyIter keyspan.Iter 265 // merging is a pointer to this iterator's point merging iterator. It 266 // appears here because key visibility is handled by the merging iterator. 267 // During SetOptions on an iterator over an indexed batch, this field is 268 // used to update the merging iterator's batch snapshot. 269 merging *mergingIter 270 271 // Keeping the bools here after all the 8 byte aligned fields shrinks the 272 // sizeof this struct by 24 bytes. 273 274 // INVARIANT: 275 // iterValidityState==IterAtLimit <=> 276 // pos==iterPosCurForwardPaused || pos==iterPosCurReversePaused 277 iterValidityState IterValidityState 278 // Set to true by SetBounds, SetOptions. Causes the Iterator to appear 279 // exhausted externally, while preserving the correct iterValidityState for 280 // the iterator's internal state. Preserving the correct internal validity 281 // is used for SeekPrefixGE(..., trySeekUsingNext), and SeekGE/SeekLT 282 // optimizations after "no-op" calls to SetBounds and SetOptions. 283 requiresReposition bool 284 // The position of iter. When this is iterPos{Prev,Next} the iter has been 285 // moved past the current key-value, which can only happen if 286 // iterValidityState=IterValid, i.e., there is something to return to the 287 // client for the current position. 288 pos iterPos 289 // Relates to the prefixOrFullSeekKey field above. 290 hasPrefix bool 291 // Used for deriving the value of SeekPrefixGE(..., trySeekUsingNext), 292 // and SeekGE/SeekLT optimizations 293 lastPositioningOp lastPositioningOpKind 294 // Used for determining when it's safe to perform SeekGE optimizations that 295 // reuse the iterator state to avoid the cost of a full seek if the iterator 296 // is already positioned in the correct place. 
If the iterator's view of its 297 // indexed batch was just refreshed, some optimizations cannot be applied on 298 // the first seek after the refresh: 299 // - SeekGE has a no-op optimization that does not seek on the internal 300 // iterator at all if the iterator is already in the correct place. 301 // This optimization cannot be performed if the internal iterator was 302 // last positioned when the iterator had a different view of an 303 // underlying batch. 304 // - Seek[Prefix]GE sets flags.TrySeekUsingNext()=true when the seek key is 305 // greater than the previous operation's seek key, under the expectation 306 // that the various internal iterators can use their current position to 307 // avoid a full expensive re-seek. This applies to the batchIter as well. 308 // However, if the view of the batch was just refreshed, the batchIter's 309 // position is not useful because it may already be beyond new keys less 310 // than the seek key. To prevent the use of this optimization in 311 // batchIter, Seek[Prefix]GE sets flags.BatchJustRefreshed()=true if this 312 // bit is enabled. 313 batchJustRefreshed bool 314 // Used for an optimization in external iterators to reduce the number of 315 // merging levels. 316 forwardOnly bool 317 // closePointIterOnce is set to true if this point iter can only be Close()d 318 // once, _and_ closing i.iter and then i.pointIter would close i.pointIter 319 // twice. This is necessary to track if the point iter is an internal iterator 320 // that could release its resources to a pool on Close(), making it harder for 321 // that iterator to make its own closes idempotent. 322 // 323 // TODO(bilal): Update SetOptions to always close out point key iterators when 324 // they won't be used, so that Close() doesn't need to default to closing 325 // point iterators twice. 326 closePointIterOnce bool 327 // Used in some tests to disable the random disabling of seek optimizations. 328 forceEnableSeekOpt bool 329 // Set to true if NextPrefix is not currently permitted. Defaults to false 330 // in case an iterator never had any bounds. 331 nextPrefixNotPermittedByUpperBound bool 332 } 333 334 // cmp is a convenience shorthand for the i.comparer.Compare function. 335 func (i *Iterator) cmp(a, b []byte) int { 336 return i.comparer.Compare(a, b) 337 } 338 339 // split is a convenience shorthand for the i.comparer.Split function. 340 func (i *Iterator) split(a []byte) int { 341 return i.comparer.Split(a) 342 } 343 344 // equal is a convenience shorthand for the i.comparer.Equal function. 345 func (i *Iterator) equal(a, b []byte) bool { 346 return i.comparer.Equal(a, b) 347 } 348 349 // iteratorRangeKeyState holds an iterator's range key iteration state. 350 type iteratorRangeKeyState struct { 351 opts *IterOptions 352 cmp base.Compare 353 split base.Split 354 // rangeKeyIter holds the range key iterator stack that iterates over the 355 // merged spans across the entirety of the LSM. 356 rangeKeyIter keyspan.FragmentIterator 357 iiter keyspan.InterleavingIter 358 // stale is set to true when the range key state recorded here (in start, 359 // end and keys) may not be in sync with the current range key at the 360 // interleaving iterator's current position. 361 // 362 // When the interleaving iterator passes over a new span, it invokes the 363 // SpanChanged hook defined on the `rangeKeyMasking` type, which sets stale 364 // to true if the span is non-nil.
365 // 366 // The parent iterator may not be positioned over the interleaving 367 // iterator's current position (eg, i.iterPos = iterPos{Next,Prev}), so 368 // {keys,start,end} are only updated to the new range key during a call to 369 // Iterator.saveRangeKey. 370 stale bool 371 // updated is used to signal to the Iterator client whether the state of 372 // range keys has changed since the previous iterator position through the 373 // `RangeKeyChanged` method. It's set to true during an Iterator positioning 374 // operation that changes the state of the current range key. Each Iterator 375 // positioning operation sets it back to false before executing. 376 // 377 // TODO(jackson): The lifecycle of {stale,updated,prevPosHadRangeKey} is 378 // intricate and confusing. Try to refactor to reduce complexity. 379 updated bool 380 // prevPosHadRangeKey records whether the previous Iterator position had a 381 // range key (HasPointAndRange() = (_, true)). It's updated at the beginning 382 // of each new Iterator positioning operation. It's required by saveRangeKey 383 // to set `updated` appropriately: Without this record of the previous iterator 384 // state, it's ambiguous whether an iterator only temporarily stepped onto a 385 // position without a range key. 386 prevPosHadRangeKey bool 387 // rangeKeyOnly is set to true if at the current iterator position there is 388 // no point key, only a range key start boundary. 389 rangeKeyOnly bool 390 // hasRangeKey is true when the current iterator position has a covering 391 // range key (eg, a range key with bounds [<lower>,<upper>) such that 392 // <lower> ≤ Key() < <upper>). 393 hasRangeKey bool 394 // start and end are the [start, end) boundaries of the current range keys. 395 start []byte 396 end []byte 397 398 rangeKeyBuffers 399 400 // iterConfig holds fields that are used for the construction of the 401 // iterator stack, but do not need to be directly accessed during iteration. 402 // This struct is bundled within the iteratorRangeKeyState struct to reduce 403 // allocations. 404 iterConfig rangekey.UserIteratorConfig 405 } 406 407 type rangeKeyBuffers struct { 408 // keys is sorted by Suffix ascending. 409 keys []RangeKeyData 410 // buf is used to save range-key data before moving the range-key iterator. 411 // Start and end boundaries, suffixes and values are all copied into buf. 412 buf bytealloc.A 413 // internal holds buffers used by the range key internal iterators. 414 internal rangekey.Buffers 415 } 416 417 func (b *rangeKeyBuffers) PrepareForReuse() { 418 const maxKeysReuse = 100 419 if len(b.keys) > maxKeysReuse { 420 b.keys = nil 421 } 422 // Avoid caching the key buf if it is overly large. The constant is 423 // fairly arbitrary. 424 if cap(b.buf) >= maxKeyBufCacheSize { 425 b.buf = nil 426 } else { 427 b.buf = b.buf[:0] 428 } 429 b.internal.PrepareForReuse() 430 } 431 432 func (i *iteratorRangeKeyState) init(cmp base.Compare, split base.Split, opts *IterOptions) { 433 i.cmp = cmp 434 i.split = split 435 i.opts = opts 436 } 437 438 var iterRangeKeyStateAllocPool = sync.Pool{ 439 New: func() interface{} { 440 return &iteratorRangeKeyState{} 441 }, 442 } 443 444 // isEphemeralPosition returns true iff the current iterator position is 445 // ephemeral, and won't be visited during subsequent relative positioning 446 // operations. 447 // 448 // The iterator position resulting from a SeekGE or SeekPrefixGE that lands on a 449 // straddling range key without a coincident point key is such a position.
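//
// A hypothetical illustration: given a range key spanning [a, z), no point
// keys, and a SeekGE("m") call, the Iterator reports Key() == "m" (the seek
// key rather than the range key's start bound "a"). That position is
// ephemeral: a subsequent Next or Prev will never land on "m" again.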
450 func (i *Iterator) isEphemeralPosition() bool { 451 return i.opts.rangeKeys() && i.rangeKey != nil && i.rangeKey.rangeKeyOnly && 452 !i.equal(i.rangeKey.start, i.key) 453 } 454 455 type lastPositioningOpKind int8 456 457 const ( 458 unknownLastPositionOp lastPositioningOpKind = iota 459 seekPrefixGELastPositioningOp 460 seekGELastPositioningOp 461 seekLTLastPositioningOp 462 // internalNextOp is a special internal iterator positioning operation used 463 // by CanDeterministicallySingleDelete. It exists for enforcing requirements 464 // around calling CanDeterministicallySingleDelete at most once per external 465 // iterator position. 466 internalNextOp 467 // invalidatedLastPositionOp is similar to unknownLastPositionOp and the 468 // only reason to distinguish this is for the wider set of SeekGE 469 // optimizations we permit for the external iterator Iterator.forwardOnly 470 // case. Most code predicates should be doing equality comparisons with one 471 // of the seek* enum values, so this duplication should not result in code 472 // of the form: 473 // if unknownLastPositionOp || invalidLastPositionOp 474 invalidatedLastPositionOp 475 ) 476 477 // Limited iteration mode. Not for use with prefix iteration. 478 // 479 // SeekGE, SeekLT, Prev, Next have WithLimit variants, that pause the iterator 480 // at the limit in a best-effort manner. The client should behave correctly 481 // even if the limits are ignored. These limits are not "deep", in that they 482 // are not passed down to the underlying collection of internalIterators. This 483 // is because the limits are transient, and apply only until the next 484 // iteration call. They serve mainly as a way to bound the amount of work when 485 // two (or more) Iterators are being coordinated at a higher level. 486 // 487 // In limited iteration mode: 488 // - Avoid using Iterator.Valid if the last call was to a *WithLimit() method. 489 // The return value from the *WithLimit() method provides a more precise 490 // disposition. 491 // - The limit is exclusive for forward and inclusive for reverse. 492 // 493 // 494 // Limited iteration mode & range keys 495 // 496 // Limited iteration interacts with range-key iteration. When range key 497 // iteration is enabled, range keys are interleaved at their start boundaries. 498 // Limited iteration must ensure that if a range key exists within the limit, 499 // the iterator visits the range key. 500 // 501 // During forward limited iteration, this is trivial: An overlapping range key 502 // must have a start boundary less than the limit, and the range key's start 503 // boundary will be interleaved and found to be within the limit. 504 // 505 // During reverse limited iteration, the tail of the range key may fall within 506 // the limit. The range key must be surfaced even if the range key's start 507 // boundary is less than the limit, and if there are no point keys between the 508 // current iterator position and the limit. To provide this guarantee, reverse 509 // limited iteration ignores the limit as long as there is a range key 510 // overlapping the iteration position. 511 512 // IterValidityState captures the state of the Iterator. 513 type IterValidityState int8 514 515 const ( 516 // IterExhausted represents an Iterator that is exhausted. 517 IterExhausted IterValidityState = iota 518 // IterValid represents an Iterator that is valid. 
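// In limited iteration, callers should switch on the returned
// IterValidityState rather than calling Valid(). A hypothetical sketch
// (see the limited iteration comment above):
//
//	switch it.NextWithLimit(limit) {
//	case pebble.IterValid: // positioned at a key before the limit
//	case pebble.IterAtLimit: // paused at the limit; iteration may resume
//	case pebble.IterExhausted: // no keys remain
//	}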
519 IterValid 520 // IterAtLimit represents an Iterator that has a non-exhausted 521 // internalIterator, but has reached a limit without any key for the 522 // caller. 523 IterAtLimit 524 ) 525 526 // readSampling stores variables used to sample a read to trigger a read 527 // compaction 528 type readSampling struct { 529 bytesUntilReadSampling uint64 530 initialSamplePassed bool 531 pendingCompactions readCompactionQueue 532 // forceReadSampling is used for testing purposes to force a read sample on every 533 // call to Iterator.maybeSampleRead() 534 forceReadSampling bool 535 } 536 537 func (i *Iterator) findNextEntry(limit []byte) { 538 i.iterValidityState = IterExhausted 539 i.pos = iterPosCurForward 540 if i.opts.rangeKeys() && i.rangeKey != nil { 541 i.rangeKey.rangeKeyOnly = false 542 } 543 544 // Close the closer for the current value if one was open. 545 if i.closeValueCloser() != nil { 546 return 547 } 548 549 for i.iterKey != nil { 550 key := *i.iterKey 551 552 if i.hasPrefix { 553 if n := i.split(key.UserKey); !i.equal(i.prefixOrFullSeekKey, key.UserKey[:n]) { 554 return 555 } 556 } 557 // Compare with limit every time we start at a different user key. 558 // Note that given the best-effort contract of limit, we could avoid a 559 // comparison in the common case by doing this only after 560 // i.nextUserKey is called for the deletes below. However that makes 561 // the behavior non-deterministic (since the behavior will vary based 562 // on what has been compacted), which makes it hard to test with the 563 // metamorphic test. So we forego that performance optimization. 564 if limit != nil && i.cmp(limit, i.iterKey.UserKey) <= 0 { 565 i.iterValidityState = IterAtLimit 566 i.pos = iterPosCurForwardPaused 567 return 568 } 569 570 // If the user has configured a SkipPoint function, invoke it to see 571 // whether we should skip over the current user key. 572 if i.opts.SkipPoint != nil && key.Kind() != InternalKeyKindRangeKeySet && i.opts.SkipPoint(i.iterKey.UserKey) { 573 // NB: We could call nextUserKey, but in some cases the SkipPoint 574 // predicate function might be cheaper than nextUserKey's key copy 575 // and key comparison. This should be the case for MVCC suffix 576 // comparisons, for example. In the future, we could expand the 577 // SkipPoint interface to give the implementor more control over 578 // whether we skip over just the internal key, the user key, or even 579 // the key prefix. 580 i.stats.ForwardStepCount[InternalIterCall]++ 581 i.iterKey, i.iterValue = i.iter.Next() 582 continue 583 } 584 585 switch key.Kind() { 586 case InternalKeyKindRangeKeySet: 587 // Save the current key. 588 i.keyBuf = append(i.keyBuf[:0], key.UserKey...) 589 i.key = i.keyBuf 590 i.value = LazyValue{} 591 // There may also be a live point key at this userkey that we have 592 // not yet read. We need to find the next entry with this user key 593 // to find it. Save the range key so we don't lose it when we Next 594 // the underlying iterator. 595 i.saveRangeKey() 596 pointKeyExists := i.nextPointCurrentUserKey() 597 if i.err != nil { 598 i.iterValidityState = IterExhausted 599 return 600 } 601 i.rangeKey.rangeKeyOnly = !pointKeyExists 602 i.iterValidityState = IterValid 603 return 604 605 case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: 606 // NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not 607 // only simpler, but is also necessary for correctness due to 608 // InternalKeyKindSSTableInternalObsoleteBit. 
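// A tombstone shadows all older internal keys with the same user key,
// so skip the remaining versions by advancing straight to the next
// user key.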
609 i.nextUserKey() 610 continue 611 612 case InternalKeyKindSet, InternalKeyKindSetWithDelete: 613 i.keyBuf = append(i.keyBuf[:0], key.UserKey...) 614 i.key = i.keyBuf 615 i.value = i.iterValue 616 i.iterValidityState = IterValid 617 i.saveRangeKey() 618 return 619 620 case InternalKeyKindMerge: 621 // Resolving the merge may advance us to the next point key, which 622 // may be covered by a different set of range keys. Save the range 623 // key state so we don't lose it. 624 i.saveRangeKey() 625 if i.mergeForward(key) { 626 i.iterValidityState = IterValid 627 return 628 } 629 630 // The merge didn't yield a valid key, either because the value 631 // merger indicated it should be deleted, or because an error was 632 // encountered. 633 i.iterValidityState = IterExhausted 634 if i.err != nil { 635 return 636 } 637 if i.pos != iterPosNext { 638 i.nextUserKey() 639 } 640 if i.closeValueCloser() != nil { 641 return 642 } 643 i.pos = iterPosCurForward 644 645 default: 646 i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind())) 647 i.iterValidityState = IterExhausted 648 return 649 } 650 } 651 } 652 653 func (i *Iterator) nextPointCurrentUserKey() bool { 654 // If the user has configured a SkipPoint function and the current user key 655 // would be skipped by it, there's no need to step forward looking for a 656 // point key. If we were to find one, it should be skipped anyways. 657 if i.opts.SkipPoint != nil && i.opts.SkipPoint(i.key) { 658 return false 659 } 660 661 i.pos = iterPosCurForward 662 663 i.iterKey, i.iterValue = i.iter.Next() 664 i.stats.ForwardStepCount[InternalIterCall]++ 665 if i.iterKey == nil || !i.equal(i.key, i.iterKey.UserKey) { 666 i.pos = iterPosNext 667 return false 668 } 669 670 key := *i.iterKey 671 switch key.Kind() { 672 case InternalKeyKindRangeKeySet: 673 // RangeKeySets must always be interleaved as the first internal key 674 // for a user key. 675 i.err = base.CorruptionErrorf("pebble: unexpected range key set mid-user key") 676 return false 677 678 case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: 679 // NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not 680 // only simpler, but is also necessary for correctness due to 681 // InternalKeyKindSSTableInternalObsoleteBit. 682 return false 683 684 case InternalKeyKindSet, InternalKeyKindSetWithDelete: 685 i.value = i.iterValue 686 return true 687 688 case InternalKeyKindMerge: 689 return i.mergeForward(key) 690 691 default: 692 i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind())) 693 return false 694 } 695 } 696 697 // mergeForward resolves a MERGE key, advancing the underlying iterator forward 698 // to merge with subsequent keys with the same userkey. mergeForward returns a 699 // boolean indicating whether or not the merge yielded a valid key. A merge may 700 // not yield a valid key if an error occurred, in which case i.err is non-nil, 701 // or the user's value merger specified the key to be deleted. 702 // 703 // mergeForward does not update iterValidityState. 
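//
// A worked sketch (hypothetical values): given the internal keys
// a.MERGE.3=v3, a.MERGE.2=v2 and a.SET.1=v1, mergeForward seeds the
// ValueMerger with v3 (the newest value) and mergeNext then calls
// MergeOlder(v2) followed by MergeOlder(v1), stopping at the SET because
// it is a base value.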
704 func (i *Iterator) mergeForward(key base.InternalKey) (valid bool) { 705 var iterValue []byte 706 iterValue, _, i.err = i.iterValue.Value(nil) 707 if i.err != nil { 708 return false 709 } 710 var valueMerger ValueMerger 711 valueMerger, i.err = i.merge(key.UserKey, iterValue) 712 if i.err != nil { 713 return false 714 } 715 716 i.mergeNext(key, valueMerger) 717 if i.err != nil { 718 return false 719 } 720 721 var needDelete bool 722 var value []byte 723 value, needDelete, i.valueCloser, i.err = finishValueMerger( 724 valueMerger, true /* includesBase */) 725 i.value = base.MakeInPlaceValue(value) 726 if i.err != nil { 727 return false 728 } 729 if needDelete { 730 _ = i.closeValueCloser() 731 return false 732 } 733 return true 734 } 735 736 func (i *Iterator) closeValueCloser() error { 737 if i.valueCloser != nil { 738 i.err = i.valueCloser.Close() 739 i.valueCloser = nil 740 } 741 return i.err 742 } 743 744 func (i *Iterator) nextUserKey() { 745 if i.iterKey == nil { 746 return 747 } 748 trailer := i.iterKey.Trailer 749 done := i.iterKey.Trailer <= base.InternalKeyZeroSeqnumMaxTrailer 750 if i.iterValidityState != IterValid { 751 i.keyBuf = append(i.keyBuf[:0], i.iterKey.UserKey...) 752 i.key = i.keyBuf 753 } 754 for { 755 i.iterKey, i.iterValue = i.iter.Next() 756 i.stats.ForwardStepCount[InternalIterCall]++ 757 // NB: We're guaranteed to be on the next user key if the previous key 758 // had a zero sequence number (`done`), or the new key has a trailer 759 // greater or equal to the previous key's trailer. This is true because 760 // internal keys with the same user key are sorted by Trailer in 761 // strictly monotonically descending order. We expect the trailer 762 // optimization to trigger around 50% of the time with randomly 763 // distributed writes. We expect it to trigger very frequently when 764 // iterating through ingested sstables, which contain keys that all have 765 // the same sequence number. 766 if done || i.iterKey == nil || i.iterKey.Trailer >= trailer { 767 break 768 } 769 if !i.equal(i.key, i.iterKey.UserKey) { 770 break 771 } 772 done = i.iterKey.Trailer <= base.InternalKeyZeroSeqnumMaxTrailer 773 trailer = i.iterKey.Trailer 774 } 775 } 776 777 func (i *Iterator) maybeSampleRead() { 778 // This method is only called when a public method of Iterator is 779 // returning, and below we exclude the case where the iterator is paused at 780 // a limit. The effect of these choices is that keys that are deleted, but 781 // are encountered during iteration, are not accounted for in the read 782 // sampling and will not cause read driven compactions, even though we are 783 // incurring cost in iterating over them. And this issue is not limited to 784 // point keys: the Iterator does not even see range deletes, which may be 785 // causing iteration work in mergingIter. It is not clear at this time 786 // whether this is a deficiency worth addressing.
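// Sampling sketch: each sample is scheduled a uniformly random number of
// bytes, drawn from [0, 2*samplingPeriod), after the previous one, so
// reads are sampled on average once per samplingPeriod bytes without the
// sampling being periodic.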
787 if i.iterValidityState != IterValid { 788 return 789 } 790 if i.readState == nil { 791 return 792 } 793 if i.readSampling.forceReadSampling { 794 i.sampleRead() 795 return 796 } 797 samplingPeriod := int32(int64(readBytesPeriod) * i.readState.db.opts.Experimental.ReadSamplingMultiplier) 798 if samplingPeriod <= 0 { 799 return 800 } 801 bytesRead := uint64(len(i.key) + i.value.Len()) 802 for i.readSampling.bytesUntilReadSampling < bytesRead { 803 i.readSampling.bytesUntilReadSampling += uint64(fastrand.Uint32n(2 * uint32(samplingPeriod))) 804 // The block below tries to adjust for the case where this is the 805 // first read in a newly-opened iterator. As bytesUntilReadSampling 806 // starts off at zero, we don't want to sample the first read of 807 // every newly-opened iterator, but we do want to sample some of them. 808 if !i.readSampling.initialSamplePassed { 809 i.readSampling.initialSamplePassed = true 810 if fastrand.Uint32n(uint32(i.readSampling.bytesUntilReadSampling)) > uint32(bytesRead) { 811 continue 812 } 813 } 814 i.sampleRead() 815 } 816 i.readSampling.bytesUntilReadSampling -= bytesRead 817 } 818 819 func (i *Iterator) sampleRead() { 820 var topFile *manifest.FileMetadata 821 topLevel, numOverlappingLevels := numLevels, 0 822 mi := i.merging 823 if mi == nil { 824 return 825 } 826 if len(mi.levels) > 1 { 827 mi.ForEachLevelIter(func(li *levelIter) bool { 828 l := manifest.LevelToInt(li.level) 829 if f := li.iterFile; f != nil { 830 var containsKey bool 831 if i.pos == iterPosNext || i.pos == iterPosCurForward || 832 i.pos == iterPosCurForwardPaused { 833 containsKey = i.cmp(f.SmallestPointKey.UserKey, i.key) <= 0 834 } else if i.pos == iterPosPrev || i.pos == iterPosCurReverse || 835 i.pos == iterPosCurReversePaused { 836 containsKey = i.cmp(f.LargestPointKey.UserKey, i.key) >= 0 837 } 838 // Do nothing if the current key is not contained in f's 839 // bounds. We could seek the LevelIterator at this level 840 // to find the right file, but the performance impacts of 841 // doing that are significant enough to negate the benefits 842 // of read sampling in the first place. See the discussion 843 // at: 844 // https://github.com/cockroachdb/pebble/pull/1041#issuecomment-763226492 845 if containsKey { 846 numOverlappingLevels++ 847 if numOverlappingLevels >= 2 { 848 // Terminate the loop early if at least 2 overlapping levels are found. 849 return true 850 } 851 topLevel = l 852 topFile = f 853 } 854 } 855 return false 856 }) 857 } 858 if topFile == nil || topLevel >= numLevels { 859 return 860 } 861 if numOverlappingLevels >= 2 { 862 allowedSeeks := topFile.AllowedSeeks.Add(-1) 863 if allowedSeeks == 0 { 864 865 // Since the compaction queue can handle duplicates, we can keep 866 // adding to the queue even once allowedSeeks hits 0. 867 // In fact, we NEED to keep adding to the queue, because the queue 868 // is small and evicts older and possibly useful compactions. 869 topFile.AllowedSeeks.Add(topFile.InitAllowedSeeks) 870 871 read := readCompaction{ 872 start: topFile.SmallestPointKey.UserKey, 873 end: topFile.LargestPointKey.UserKey, 874 level: topLevel, 875 fileNum: topFile.FileNum, 876 } 877 i.readSampling.pendingCompactions.add(&read, i.cmp) 878 } 879 } 880 } 881 882 func (i *Iterator) findPrevEntry(limit []byte) { 883 i.iterValidityState = IterExhausted 884 i.pos = iterPosCurReverse 885 if i.opts.rangeKeys() && i.rangeKey != nil { 886 i.rangeKey.rangeKeyOnly = false 887 } 888 889 // Close the closer for the current value if one was open. 
890 if i.valueCloser != nil { 891 i.err = i.valueCloser.Close() 892 i.valueCloser = nil 893 if i.err != nil { 894 i.iterValidityState = IterExhausted 895 return 896 } 897 } 898 899 var valueMerger ValueMerger 900 firstLoopIter := true 901 rangeKeyBoundary := false 902 // The code below compares with limit in multiple places. As documented in 903 // findNextEntry, this is being done to make the behavior of limit 904 // deterministic to allow for metamorphic testing. It is not required by 905 // the best-effort contract of limit. 906 for i.iterKey != nil { 907 key := *i.iterKey 908 909 // NB: We cannot pause if the current key is covered by a range key. 910 // Otherwise, the user might not ever learn of a range key that covers 911 // the key space being iterated over in which there are no point keys. 912 // Since limits are best effort, ignoring the limit in this case is 913 // allowed by the contract of limit. 914 if firstLoopIter && limit != nil && i.cmp(limit, i.iterKey.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) { 915 i.iterValidityState = IterAtLimit 916 i.pos = iterPosCurReversePaused 917 return 918 } 919 firstLoopIter = false 920 921 if i.iterValidityState == IterValid { 922 if !i.equal(key.UserKey, i.key) { 923 // We've iterated to the previous user key. 924 i.pos = iterPosPrev 925 if valueMerger != nil { 926 var needDelete bool 927 var value []byte 928 value, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, true /* includesBase */) 929 i.value = base.MakeInPlaceValue(value) 930 if i.err == nil && needDelete { 931 // The point key at this key is deleted. If we also have 932 // a range key boundary at this key, we still want to 933 // return. Otherwise, we need to continue looking for 934 // a live key. 935 i.value = LazyValue{} 936 if rangeKeyBoundary { 937 i.rangeKey.rangeKeyOnly = true 938 } else { 939 i.iterValidityState = IterExhausted 940 if i.closeValueCloser() == nil { 941 continue 942 } 943 } 944 } 945 } 946 if i.err != nil { 947 i.iterValidityState = IterExhausted 948 } 949 return 950 } 951 } 952 953 // If the user has configured a SkipPoint function, invoke it to see 954 // whether we should skip over the current user key. 955 if i.opts.SkipPoint != nil && key.Kind() != InternalKeyKindRangeKeySet && i.opts.SkipPoint(key.UserKey) { 956 // NB: We could call prevUserKey, but in some cases the SkipPoint 957 // predicate function might be cheaper than prevUserKey's key copy 958 // and key comparison. This should be the case for MVCC suffix 959 // comparisons, for example. In the future, we could expand the 960 // SkipPoint interface to give the implementor more control over 961 // whether we skip over just the internal key, the user key, or even 962 // the key prefix. 963 i.stats.ReverseStepCount[InternalIterCall]++ 964 i.iterKey, i.iterValue = i.iter.Prev() 965 if limit != nil && i.iterKey != nil && i.cmp(limit, i.iterKey.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) { 966 i.iterValidityState = IterAtLimit 967 i.pos = iterPosCurReversePaused 968 return 969 } 970 continue 971 } 972 973 switch key.Kind() { 974 case InternalKeyKindRangeKeySet: 975 // Range key start boundary markers are interleaved with the maximum 976 // sequence number, so if there's a point key also at this key, we 977 // must've already iterated over it. 978 // This is the final entry at this user key, so we may return 979 i.rangeKey.rangeKeyOnly = i.iterValidityState != IterValid 980 i.keyBuf = append(i.keyBuf[:0], key.UserKey...) 
981 i.key = i.keyBuf 982 i.iterValidityState = IterValid 983 i.saveRangeKey() 984 // In all other cases, previous iteration requires advancing to 985 // iterPosPrev in order to determine if the key is live and 986 // unshadowed by another key at the same user key. In this case, 987 // because range key start boundary markers are always interleaved 988 // at the maximum sequence number, we know that there aren't any 989 // additional keys with the same user key in the backward direction. 990 // 991 // We Prev the underlying iterator once anyway for consistency, so 992 // that we can maintain the invariant during backward iteration that 993 // i.iterPos = iterPosPrev. 994 i.stats.ReverseStepCount[InternalIterCall]++ 995 i.iterKey, i.iterValue = i.iter.Prev() 996 997 // Set rangeKeyBoundary so that on the next iteration, we know to 998 // return the key even if the MERGE point key is deleted. 999 rangeKeyBoundary = true 1000 1001 case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: 1002 i.value = LazyValue{} 1003 i.iterValidityState = IterExhausted 1004 valueMerger = nil 1005 i.iterKey, i.iterValue = i.iter.Prev() 1006 i.stats.ReverseStepCount[InternalIterCall]++ 1007 // Compare with the limit. We could optimize by only checking when 1008 // we step to the previous user key, but detecting that requires a 1009 // comparison too. Note that this position may have already passed a 1010 // number of versions of this user key, but they are all deleted, so 1011 // the fact that a subsequent Prev*() call will not see them is 1012 // harmless. Also note that this is the only place in the loop, 1013 // other than the firstLoopIter and SkipPoint cases above, where we 1014 // could step to a different user key and start processing it for 1015 // returning to the caller. 1016 if limit != nil && i.iterKey != nil && i.cmp(limit, i.iterKey.UserKey) > 0 && !i.rangeKeyWithinLimit(limit) { 1017 i.iterValidityState = IterAtLimit 1018 i.pos = iterPosCurReversePaused 1019 return 1020 } 1021 continue 1022 1023 case InternalKeyKindSet, InternalKeyKindSetWithDelete: 1024 i.keyBuf = append(i.keyBuf[:0], key.UserKey...) 1025 i.key = i.keyBuf 1026 // iterValue is owned by i.iter and could change after the Prev() 1027 // call, so use valueBuf instead. Note that valueBuf is only used 1028 // in this one instance; everywhere else (eg. in findNextEntry), 1029 // we just point i.value to the unsafe i.iter-owned value buffer. 1030 i.value, i.valueBuf = i.iterValue.Clone(i.valueBuf[:0], &i.fetcher) 1031 i.saveRangeKey() 1032 i.iterValidityState = IterValid 1033 i.iterKey, i.iterValue = i.iter.Prev() 1034 i.stats.ReverseStepCount[InternalIterCall]++ 1035 valueMerger = nil 1036 continue 1037 1038 case InternalKeyKindMerge: 1039 if i.iterValidityState == IterExhausted { 1040 i.keyBuf = append(i.keyBuf[:0], key.UserKey...) 1041 i.key = i.keyBuf 1042 i.saveRangeKey() 1043 var iterValue []byte 1044 iterValue, _, i.err = i.iterValue.Value(nil) 1045 if i.err != nil { 1046 return 1047 } 1048 valueMerger, i.err = i.merge(i.key, iterValue) 1049 if i.err != nil { 1050 return 1051 } 1052 i.iterValidityState = IterValid 1053 } else if valueMerger == nil { 1054 // Extract value before iterValue since we use value before iterValue 1055 // and the underlying iterator is not required to provide backing 1056 // memory for both simultaneously.
1057 var value []byte 1058 var callerOwned bool 1059 value, callerOwned, i.err = i.value.Value(i.lazyValueBuf) 1060 if callerOwned { 1061 i.lazyValueBuf = value[:0] 1062 } 1063 if i.err != nil { 1064 return 1065 } 1066 valueMerger, i.err = i.merge(i.key, value) 1067 var iterValue []byte 1068 iterValue, _, i.err = i.iterValue.Value(nil) 1069 if i.err != nil { 1070 return 1071 } 1072 if i.err == nil { 1073 i.err = valueMerger.MergeNewer(iterValue) 1074 } 1075 if i.err != nil { 1076 i.iterValidityState = IterExhausted 1077 return 1078 } 1079 } else { 1080 var iterValue []byte 1081 iterValue, _, i.err = i.iterValue.Value(nil) 1082 if i.err != nil { 1083 return 1084 } 1085 i.err = valueMerger.MergeNewer(iterValue) 1086 if i.err != nil { 1087 i.iterValidityState = IterExhausted 1088 return 1089 } 1090 } 1091 i.iterKey, i.iterValue = i.iter.Prev() 1092 i.stats.ReverseStepCount[InternalIterCall]++ 1093 continue 1094 1095 default: 1096 i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind())) 1097 i.iterValidityState = IterExhausted 1098 return 1099 } 1100 } 1101 1102 // i.iterKey == nil, so broke out of the preceding loop. 1103 if i.iterValidityState == IterValid { 1104 i.pos = iterPosPrev 1105 if valueMerger != nil { 1106 var needDelete bool 1107 var value []byte 1108 value, needDelete, i.valueCloser, i.err = finishValueMerger(valueMerger, true /* includesBase */) 1109 i.value = base.MakeInPlaceValue(value) 1110 if i.err == nil && needDelete { 1111 i.key = nil 1112 i.value = LazyValue{} 1113 i.iterValidityState = IterExhausted 1114 } 1115 } 1116 if i.err != nil { 1117 i.iterValidityState = IterExhausted 1118 } 1119 } 1120 } 1121 1122 func (i *Iterator) prevUserKey() { 1123 if i.iterKey == nil { 1124 return 1125 } 1126 if i.iterValidityState != IterValid { 1127 // If we're going to compare against the prev key, we need to save the 1128 // current key. 1129 i.keyBuf = append(i.keyBuf[:0], i.iterKey.UserKey...) 1130 i.key = i.keyBuf 1131 } 1132 for { 1133 i.iterKey, i.iterValue = i.iter.Prev() 1134 i.stats.ReverseStepCount[InternalIterCall]++ 1135 if i.iterKey == nil { 1136 break 1137 } 1138 if !i.equal(i.key, i.iterKey.UserKey) { 1139 break 1140 } 1141 } 1142 } 1143 1144 func (i *Iterator) mergeNext(key InternalKey, valueMerger ValueMerger) { 1145 // Save the current key. 1146 i.keyBuf = append(i.keyBuf[:0], key.UserKey...) 1147 i.key = i.keyBuf 1148 1149 // Loop looking for older values for this key and merging them. 1150 for { 1151 i.iterKey, i.iterValue = i.iter.Next() 1152 i.stats.ForwardStepCount[InternalIterCall]++ 1153 if i.iterKey == nil { 1154 i.pos = iterPosNext 1155 return 1156 } 1157 key = *i.iterKey 1158 if !i.equal(i.key, key.UserKey) { 1159 // We've advanced to the next key. 1160 i.pos = iterPosNext 1161 return 1162 } 1163 switch key.Kind() { 1164 case InternalKeyKindDelete, InternalKeyKindSingleDelete, InternalKeyKindDeleteSized: 1165 // We've hit a deletion tombstone. Return everything up to this 1166 // point. 1167 // 1168 // NB: treating InternalKeyKindSingleDelete as equivalent to DEL is not 1169 // only simpler, but is also necessary for correctness due to 1170 // InternalKeyKindSSTableInternalObsoleteBit. 1171 return 1172 1173 case InternalKeyKindSet, InternalKeyKindSetWithDelete: 1174 // We've hit a Set value. Merge with the existing value and return. 
1175 var iterValue []byte 1176 iterValue, _, i.err = i.iterValue.Value(nil) 1177 if i.err != nil { 1178 return 1179 } 1180 i.err = valueMerger.MergeOlder(iterValue) 1181 return 1182 1183 case InternalKeyKindMerge: 1184 // We've hit another Merge value. Merge with the existing value and 1185 // continue looping. 1186 var iterValue []byte 1187 iterValue, _, i.err = i.iterValue.Value(nil) 1188 if i.err != nil { 1189 return 1190 } 1191 i.err = valueMerger.MergeOlder(iterValue) 1192 if i.err != nil { 1193 return 1194 } 1195 continue 1196 1197 case InternalKeyKindRangeKeySet: 1198 // The RANGEKEYSET marker must sort before a MERGE at the same user key. 1199 i.err = base.CorruptionErrorf("pebble: out of order range key marker") 1200 return 1201 1202 default: 1203 i.err = base.CorruptionErrorf("pebble: invalid internal key kind: %d", errors.Safe(key.Kind())) 1204 return 1205 } 1206 } 1207 } 1208 1209 // SeekGE moves the iterator to the first key/value pair whose key is greater 1210 // than or equal to the given key. Returns true if the iterator is pointing at 1211 // a valid entry and false otherwise. 1212 func (i *Iterator) SeekGE(key []byte) bool { 1213 return i.SeekGEWithLimit(key, nil) == IterValid 1214 } 1215 1216 // SeekGEWithLimit moves the iterator to the first key/value pair whose key is 1217 // greater than or equal to the given key. 1218 // 1219 // If limit is provided, it serves as a best-effort exclusive limit. If the 1220 // first key greater than or equal to the given search key is also greater than 1221 // or equal to limit, the Iterator may pause and return IterAtLimit. Because 1222 // limits are best-effort, SeekGEWithLimit may return a key beyond limit. 1223 // 1224 // If the Iterator is configured to iterate over range keys, SeekGEWithLimit 1225 // guarantees it will surface any range keys with bounds overlapping the 1226 // keyspace [key, limit). 1227 func (i *Iterator) SeekGEWithLimit(key []byte, limit []byte) IterValidityState { 1228 if i.rangeKey != nil { 1229 // NB: Check Valid() before clearing requiresReposition. 1230 i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid() 1231 // If we have a range key but did not expose it at the previous iterator 1232 // position (because the iterator was not at a valid position), updated 1233 // must be true. This ensures that after an iterator op sequence like: 1234 // - Next() → (IterValid, RangeBounds() = [a,b)) 1235 // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -) 1236 // - SeekGE(...) → (IterValid, RangeBounds() = [a,b)) 1237 // the iterator returns RangeKeyChanged()=true. 1238 // 1239 // The remainder of this function will only update i.rangeKey.updated if 1240 // the iterator moves into a new range key, or out of the current range 1241 // key. 1242 i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys() 1243 } 1244 lastPositioningOp := i.lastPositioningOp 1245 hasPrefix := i.hasPrefix 1246 // Set it to unknown, since this operation may not succeed, in which case 1247 // the SeekGE following this should not make any assumption about iterator 1248 // position. 
1249 i.lastPositioningOp = unknownLastPositionOp 1250 i.requiresReposition = false 1251 i.err = nil // clear cached iteration error 1252 i.hasPrefix = false 1253 i.stats.ForwardSeekCount[InterfaceCall]++ 1254 if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 { 1255 key = lowerBound 1256 } else if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 { 1257 key = upperBound 1258 } 1259 seekInternalIter := true 1260 1261 var flags base.SeekGEFlags 1262 if i.batchJustRefreshed { 1263 i.batchJustRefreshed = false 1264 flags = flags.EnableBatchJustRefreshed() 1265 } 1266 if lastPositioningOp == seekGELastPositioningOp { 1267 cmp := i.cmp(i.prefixOrFullSeekKey, key) 1268 // If this seek is to the same or later key, and the iterator is 1269 // already positioned there, this is a noop. This can be helpful for 1270 // sparse key spaces that have many deleted keys, where one can avoid 1271 // the overhead of iterating past them again and again. 1272 if cmp <= 0 { 1273 if !flags.BatchJustRefreshed() && 1274 (i.iterValidityState == IterExhausted || 1275 (i.iterValidityState == IterValid && i.cmp(key, i.key) <= 0 && 1276 (limit == nil || i.cmp(i.key, limit) < 0))) { 1277 // Noop 1278 if !invariants.Enabled || !disableSeekOpt(key, uintptr(unsafe.Pointer(i))) || i.forceEnableSeekOpt { 1279 i.lastPositioningOp = seekGELastPositioningOp 1280 return i.iterValidityState 1281 } 1282 } 1283 // cmp == 0 is not safe to optimize since 1284 // - i.pos could be at iterPosNext, due to a merge. 1285 // - Even if i.pos were at iterPosCurForward, we could have a DELETE, 1286 // SET pair for a key, and the iterator would have moved past DELETE 1287 // but stayed at iterPosCurForward. A similar situation occurs for a 1288 // MERGE, SET pair where the MERGE is consumed and the iterator is 1289 // at the SET. 1290 // We also leverage the IterAtLimit <=> i.pos invariant defined in the 1291 // comment on iterValidityState, to exclude any cases where i.pos 1292 // is iterPosCur{Forward,Reverse}Paused. This avoids the need to 1293 // special-case those iterator positions and their interactions with 1294 // TrySeekUsingNext, as the main uses for TrySeekUsingNext in CockroachDB 1295 // do not use limited Seeks in the first place. 1296 if cmp < 0 && i.iterValidityState != IterAtLimit && limit == nil { 1297 flags = flags.EnableTrySeekUsingNext() 1298 } 1299 if invariants.Enabled && flags.TrySeekUsingNext() && !i.forceEnableSeekOpt && disableSeekOpt(key, uintptr(unsafe.Pointer(i))) { 1300 flags = flags.DisableTrySeekUsingNext() 1301 } 1302 if !flags.BatchJustRefreshed() && i.pos == iterPosCurForwardPaused && i.cmp(key, i.iterKey.UserKey) <= 0 { 1303 // Have some work to do, but don't need to seek, and we can 1304 // start doing findNextEntry from i.iterKey. 1305 seekInternalIter = false 1306 } 1307 } 1308 } 1309 // Check for another TrySeekUsingNext optimization opportunity, currently 1310 // specifically tailored to external iterators. This case is intended to 1311 // trigger in instances of Seek-ing with monotonically increasing keys with 1312 // Nexts interspersed. At the time of writing, this is the case for 1313 // CockroachDB scans. This optimization is important for external iterators 1314 // to avoid re-seeking within an already-exhausted sstable. It is not always 1315 // a performance win more generally, so we restrict it to external iterators 1316 // that are configured to only use forward positioning operations. 
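// (A hypothetical access pattern that benefits: SeekGE("a"), Next(),
// SeekGE("c"), Next(), SeekGE("f"), ... over monotonically increasing
// seek keys.)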
1317 // 1318 // TODO(jackson): This optimization should be obsolete once we introduce and 1319 // use the NextPrefix iterator positioning operation. 1320 if seekInternalIter && i.forwardOnly && lastPositioningOp != invalidatedLastPositionOp && 1321 i.pos == iterPosCurForward && !hasPrefix && i.iterValidityState == IterValid && 1322 i.cmp(key, i.iterKey.UserKey) > 0 { 1323 flags = flags.EnableTrySeekUsingNext() 1324 if invariants.Enabled && flags.TrySeekUsingNext() && !i.forceEnableSeekOpt && disableSeekOpt(key, uintptr(unsafe.Pointer(i))) { 1325 flags = flags.DisableTrySeekUsingNext() 1326 } 1327 } 1328 if seekInternalIter { 1329 i.iterKey, i.iterValue = i.iter.SeekGE(key, flags) 1330 i.stats.ForwardSeekCount[InternalIterCall]++ 1331 } 1332 i.findNextEntry(limit) 1333 i.maybeSampleRead() 1334 if i.Error() == nil { 1335 // Prepare state for a future noop optimization. 1336 i.prefixOrFullSeekKey = append(i.prefixOrFullSeekKey[:0], key...) 1337 i.lastPositioningOp = seekGELastPositioningOp 1338 } 1339 return i.iterValidityState 1340 } 1341 1342 // SeekPrefixGE moves the iterator to the first key/value pair whose key is 1343 // greater than or equal to the given key and which has the same "prefix" as 1344 // the given key. The prefix for a key is determined by the user-defined 1345 // Comparer.Split function. The iterator will not observe keys not matching the 1346 // "prefix" of the search key. Calling SeekPrefixGE puts the iterator in prefix 1347 // iteration mode. The iterator remains in prefix iteration until a subsequent 1348 // call to another absolute positioning method (SeekGE, SeekLT, First, 1349 // Last). Reverse iteration (Prev) is not supported when an iterator is in 1350 // prefix iteration mode. Returns true if the iterator is pointing at a valid 1351 // entry and false otherwise. 1352 // 1353 // The semantics of SeekPrefixGE are slightly unusual and designed for 1354 // iteration to be able to take advantage of bloom filters that have been 1355 // created on the "prefix". If you're not using bloom filters, there is no 1356 // reason to use SeekPrefixGE. 1357 // 1358 // An example Split function may separate a timestamp suffix from the prefix of 1359 // the key. 1360 // 1361 // Split(<key>@<timestamp>) -> <key> 1362 // 1363 // Consider the keys "a@1", "a@2", "aa@3", "aa@4". The prefixes for these keys 1364 // are "a", and "aa". Note that despite "a" and "aa" sharing a prefix by the 1365 // usual definition, those prefixes differ by the definition of the Split 1366 // function. To see how this works, consider the following set of calls on this 1367 // data set: 1368 // 1369 // SeekPrefixGE("a@0") -> "a@1" 1370 // Next() -> "a@2" 1371 // Next() -> EOF 1372 // 1373 // If you're just looking to iterate over keys with a shared prefix, as 1374 // defined by the configured comparer, set iterator bounds instead: 1375 // 1376 // iter := db.NewIter(&pebble.IterOptions{ 1377 // LowerBound: []byte("prefix"), 1378 // UpperBound: []byte("prefiy"), 1379 // }) 1380 // for iter.First(); iter.Valid(); iter.Next() { 1381 // // Only keys beginning with "prefix" will be visited. 1382 // } 1383 // 1384 // See ExampleIterator_SeekPrefixGE for a working example. 1385 // 1386 // When iterating with range keys enabled, all range keys encountered are 1387 // truncated to the seek key's prefix's bounds. The truncation of the upper 1388 // bound requires that the database's Comparer is configured with a 1389 // ImmediateSuccessor method. 
For example, a SeekPrefixGE("a@9") call with the 1390 // prefix "a" will truncate range key bounds to [a,ImmediateSuccessor(a)]. 1391 func (i *Iterator) SeekPrefixGE(key []byte) bool { 1392 if i.rangeKey != nil { 1393 // NB: Check Valid() before clearing requiresReposition. 1394 i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid() 1395 // If we have a range key but did not expose it at the previous iterator 1396 // position (because the iterator was not at a valid position), updated 1397 // must be true. This ensures that after an iterator op sequence like: 1398 // - Next() → (IterValid, RangeBounds() = [a,b)) 1399 // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -) 1400 // - SeekPrefixGE(...) → (IterValid, RangeBounds() = [a,b)) 1401 // the iterator returns RangeKeyChanged()=true. 1402 // 1403 // The remainder of this function will only update i.rangeKey.updated if 1404 // the iterator moves into a new range key, or out of the current range 1405 // key. 1406 i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys() 1407 } 1408 lastPositioningOp := i.lastPositioningOp 1409 // Set it to unknown, since this operation may not succeed, in which case 1410 // the SeekPrefixGE following this should not make any assumption about 1411 // iterator position. 1412 i.lastPositioningOp = unknownLastPositionOp 1413 i.requiresReposition = false 1414 i.err = nil // clear cached iteration error 1415 i.stats.ForwardSeekCount[InterfaceCall]++ 1416 if i.comparer.Split == nil { 1417 panic("pebble: split must be provided for SeekPrefixGE") 1418 } 1419 if i.comparer.ImmediateSuccessor == nil && i.opts.KeyTypes != IterKeyTypePointsOnly { 1420 panic("pebble: ImmediateSuccessor must be provided for SeekPrefixGE with range keys") 1421 } 1422 prefixLen := i.split(key) 1423 keyPrefix := key[:prefixLen] 1424 var flags base.SeekGEFlags 1425 if i.batchJustRefreshed { 1426 flags = flags.EnableBatchJustRefreshed() 1427 i.batchJustRefreshed = false 1428 } 1429 if lastPositioningOp == seekPrefixGELastPositioningOp { 1430 if !i.hasPrefix { 1431 panic("lastPositioningOpsIsSeekPrefixGE is true, but hasPrefix is false") 1432 } 1433 // The iterator has not been repositioned after the last SeekPrefixGE. 1434 // See if we are seeking to a larger key, since then we can optimize 1435 // the seek by using next. Note that we could also optimize if Next 1436 // has been called, if the iterator is not exhausted and the current 1437 // position is <= the seek key. We are keeping this limited for now 1438 // since such optimizations require care for correctness, and to not 1439 // become de-optimizations (if one usually has to do all the next 1440 // calls and then the seek). This SeekPrefixGE optimization 1441 // specifically benefits CockroachDB. 1442 cmp := i.cmp(i.prefixOrFullSeekKey, keyPrefix) 1443 // cmp == 0 is not safe to optimize since 1444 // - i.pos could be at iterPosNext, due to a merge. 1445 // - Even if i.pos were at iterPosCurForward, we could have a DELETE, 1446 // SET pair for a key, and the iterator would have moved past DELETE 1447 // but stayed at iterPosCurForward. A similar situation occurs for a 1448 // MERGE, SET pair where the MERGE is consumed and the iterator is 1449 // at the SET. 1450 // In general some versions of i.prefix could have been consumed by 1451 // the iterator, so we only optimize for cmp < 0. 
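// Concretely (editor's illustration, not part of the original source): if the
// previous SeekPrefixGE used prefix "a" and this call uses prefix "b", then
// cmp < 0 and the seek below may ride TrySeekUsingNext forward from the
// current position. If both calls use prefix "b", cmp == 0 and the
// optimization is skipped for the reasons above.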
1452 		if cmp < 0 {
1453 			flags = flags.EnableTrySeekUsingNext()
1454 		}
1455 		if invariants.Enabled && flags.TrySeekUsingNext() && !i.forceEnableSeekOpt && disableSeekOpt(key, uintptr(unsafe.Pointer(i))) {
1456 			flags = flags.DisableTrySeekUsingNext()
1457 		}
1458 	}
1459 	// Make a copy of the prefix so that modifications to the key after
1460 	// SeekPrefixGE returns do not affect the stored prefix.
1461 	if cap(i.prefixOrFullSeekKey) < prefixLen {
1462 		i.prefixOrFullSeekKey = make([]byte, prefixLen)
1463 	} else {
1464 		i.prefixOrFullSeekKey = i.prefixOrFullSeekKey[:prefixLen]
1465 	}
1466 	i.hasPrefix = true
1467 	copy(i.prefixOrFullSeekKey, keyPrefix)
1468 
1469 	if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 {
1470 		if n := i.split(lowerBound); !bytes.Equal(i.prefixOrFullSeekKey, lowerBound[:n]) {
1471 			i.err = errors.New("pebble: SeekPrefixGE supplied with key outside of lower bound")
1472 			i.iterValidityState = IterExhausted
1473 			return false
1474 		}
1475 		key = lowerBound
1476 	} else if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 {
1477 		if n := i.split(upperBound); !bytes.Equal(i.prefixOrFullSeekKey, upperBound[:n]) {
1478 			i.err = errors.New("pebble: SeekPrefixGE supplied with key outside of upper bound")
1479 			i.iterValidityState = IterExhausted
1480 			return false
1481 		}
1482 		key = upperBound
1483 	}
1484 	i.iterKey, i.iterValue = i.iter.SeekPrefixGE(i.prefixOrFullSeekKey, key, flags)
1485 	i.stats.ForwardSeekCount[InternalIterCall]++
1486 	i.findNextEntry(nil)
1487 	i.maybeSampleRead()
1488 	if i.Error() == nil {
1489 		i.lastPositioningOp = seekPrefixGELastPositioningOp
1490 	}
1491 	return i.iterValidityState == IterValid
1492 }
1493 
1494 // Deterministic disabling of the seek optimizations. It uses the iterator
1495 // pointer, since we want diversity in iterator behavior for the same key. Used
1496 // for tests.
1497 func disableSeekOpt(key []byte, ptr uintptr) bool {
1498 	// Fibonacci hash https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
1499 	simpleHash := (11400714819323198485 * uint64(ptr)) >> 63
1500 	return key != nil && key[0]&byte(1) == 0 && simpleHash == 0
1501 }
1502 
1503 // SeekLT moves the iterator to the last key/value pair whose key is less than
1504 // the given key. Returns true if the iterator is pointing at a valid entry and
1505 // false otherwise.
1506 func (i *Iterator) SeekLT(key []byte) bool {
1507 	return i.SeekLTWithLimit(key, nil) == IterValid
1508 }
1509 
1510 // SeekLTWithLimit moves the iterator to the last key/value pair whose key is
1511 // less than the given key.
1512 //
1513 // If limit is provided, it serves as a best-effort inclusive limit. If the last
1514 // key less than the given search key is also less than limit, the Iterator may
1515 // pause and return IterAtLimit. Because limits are best-effort, SeekLTWithLimit
1516 // may return a key beyond limit.
1517 //
1518 // If the Iterator is configured to iterate over range keys, SeekLTWithLimit
1519 // guarantees it will surface any range keys with bounds overlapping the
1520 // keyspace up to limit.
1521 func (i *Iterator) SeekLTWithLimit(key []byte, limit []byte) IterValidityState {
1522 	if i.rangeKey != nil {
1523 		// NB: Check Valid() before clearing requiresReposition.
1524 		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1525 		// If we have a range key but did not expose it at the previous iterator
1526 		// position (because the iterator was not at a valid position), updated
1527 		// must be true. This ensures that after an iterator op sequence like:
1528 		// - Next() → (IterValid, RangeBounds() = [a,b))
1529 		// - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1530 		// - SeekLTWithLimit(...) → (IterValid, RangeBounds() = [a,b))
1531 		// the iterator returns RangeKeyChanged()=true.
1532 		//
1533 		// The remainder of this function will only update i.rangeKey.updated if
1534 		// the iterator moves into a new range key, or out of the current range
1535 		// key.
1536 		i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1537 	}
1538 	lastPositioningOp := i.lastPositioningOp
1539 	// Set it to unknown, since this operation may not succeed, in which case
1540 	// the SeekLT following this should not make any assumption about iterator
1541 	// position.
1542 	i.lastPositioningOp = unknownLastPositionOp
1543 	i.batchJustRefreshed = false
1544 	i.requiresReposition = false
1545 	i.err = nil // clear cached iteration error
1546 	i.stats.ReverseSeekCount[InterfaceCall]++
1547 	if upperBound := i.opts.GetUpperBound(); upperBound != nil && i.cmp(key, upperBound) > 0 {
1548 		key = upperBound
1549 	} else if lowerBound := i.opts.GetLowerBound(); lowerBound != nil && i.cmp(key, lowerBound) < 0 {
1550 		key = lowerBound
1551 	}
1552 	i.hasPrefix = false
1553 	seekInternalIter := true
1554 	// The following noop optimization only applies when i.batch == nil, since
1555 	// an iterator over a batch is iterating over mutable data that may have
1556 	// changed since the last seek.
1557 	if lastPositioningOp == seekLTLastPositioningOp && i.batch == nil {
1558 		cmp := i.cmp(key, i.prefixOrFullSeekKey)
1559 		// If this seek is to the same or earlier key, and the iterator is
1560 		// already positioned there, this is a noop. This can be helpful for
1561 		// sparse key spaces that have many deleted keys, where one can avoid
1562 		// the overhead of iterating past them again and again.
1563 		if cmp <= 0 {
1564 			// NB: when pos != iterPosCurReversePaused, the invariant
1565 			// documented earlier implies that iterValidityState !=
1566 			// IterAtLimit.
1567 			if i.iterValidityState == IterExhausted ||
1568 				(i.iterValidityState == IterValid && i.cmp(i.key, key) < 0 &&
1569 					(limit == nil || i.cmp(limit, i.key) <= 0)) {
1570 				if !invariants.Enabled || !disableSeekOpt(key, uintptr(unsafe.Pointer(i))) {
1571 					i.lastPositioningOp = seekLTLastPositioningOp
1572 					return i.iterValidityState
1573 				}
1574 			}
1575 			if i.pos == iterPosCurReversePaused && i.cmp(i.iterKey.UserKey, key) < 0 {
1576 				// Have some work to do, but don't need to seek, and we can
1577 				// start doing findPrevEntry from i.iterKey.
1578 				seekInternalIter = false
1579 			}
1580 		}
1581 	}
1582 	if seekInternalIter {
1583 		i.iterKey, i.iterValue = i.iter.SeekLT(key, base.SeekLTFlagsNone)
1584 		i.stats.ReverseSeekCount[InternalIterCall]++
1585 	}
1586 	i.findPrevEntry(limit)
1587 	i.maybeSampleRead()
1588 	if i.Error() == nil && i.batch == nil {
1589 		// Prepare state for a future noop optimization.
1590 		i.prefixOrFullSeekKey = append(i.prefixOrFullSeekKey[:0], key...)
1591 		i.lastPositioningOp = seekLTLastPositioningOp
1592 	}
1593 	return i.iterValidityState
1594 }
1595 
1596 // First moves the iterator to the first key/value pair. Returns true if the
1597 // iterator is pointing at a valid entry and false otherwise.
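// A minimal full scan using First and Next (editor's sketch, not part of the
// original source; iter is an assumed open *pebble.Iterator):
//
//	for valid := iter.First(); valid; valid = iter.Next() {
//		fmt.Printf("%s\n", iter.Key())
//	}
//	if err := iter.Error(); err != nil {
//		return err
//	}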
1598 func (i *Iterator) First() bool {
1599 	if i.rangeKey != nil {
1600 		// NB: Check Valid() before clearing requiresReposition.
1601 		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1602 		// If we have a range key but did not expose it at the previous iterator
1603 		// position (because the iterator was not at a valid position), updated
1604 		// must be true. This ensures that after an iterator op sequence like:
1605 		// - Next() → (IterValid, RangeBounds() = [a,b))
1606 		// - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1607 		// - First(...) → (IterValid, RangeBounds() = [a,b))
1608 		// the iterator returns RangeKeyChanged()=true.
1609 		//
1610 		// The remainder of this function will only update i.rangeKey.updated if
1611 		// the iterator moves into a new range key, or out of the current range
1612 		// key.
1613 		i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1614 	}
1615 	i.err = nil // clear cached iteration error
1616 	i.hasPrefix = false
1617 	i.batchJustRefreshed = false
1618 	i.lastPositioningOp = unknownLastPositionOp
1619 	i.requiresReposition = false
1620 	i.stats.ForwardSeekCount[InterfaceCall]++
1621 
1622 	i.iterFirstWithinBounds()
1623 	i.findNextEntry(nil)
1624 	i.maybeSampleRead()
1625 	return i.iterValidityState == IterValid
1626 }
1627 
1628 // Last moves the iterator to the last key/value pair. Returns true if the
1629 // iterator is pointing at a valid entry and false otherwise.
1630 func (i *Iterator) Last() bool {
1631 	if i.rangeKey != nil {
1632 		// NB: Check Valid() before clearing requiresReposition.
1633 		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1634 		// If we have a range key but did not expose it at the previous iterator
1635 		// position (because the iterator was not at a valid position), updated
1636 		// must be true. This ensures that after an iterator op sequence like:
1637 		// - Next() → (IterValid, RangeBounds() = [a,b))
1638 		// - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1639 		// - Last(...) → (IterValid, RangeBounds() = [a,b))
1640 		// the iterator returns RangeKeyChanged()=true.
1641 		//
1642 		// The remainder of this function will only update i.rangeKey.updated if
1643 		// the iterator moves into a new range key, or out of the current range
1644 		// key.
1645 		i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1646 	}
1647 	i.err = nil // clear cached iteration error
1648 	i.hasPrefix = false
1649 	i.batchJustRefreshed = false
1650 	i.lastPositioningOp = unknownLastPositionOp
1651 	i.requiresReposition = false
1652 	i.stats.ReverseSeekCount[InterfaceCall]++
1653 
1654 	i.iterLastWithinBounds()
1655 	i.findPrevEntry(nil)
1656 	i.maybeSampleRead()
1657 	return i.iterValidityState == IterValid
1658 }
1659 
1660 // Next moves the iterator to the next key/value pair. Returns true if the
1661 // iterator is pointing at a valid entry and false otherwise.
1662 func (i *Iterator) Next() bool {
1663 	return i.nextWithLimit(nil) == IterValid
1664 }
1665 
1666 // NextWithLimit moves the iterator to the next key/value pair.
1667 //
1668 // If limit is provided, it serves as a best-effort exclusive limit. If the next
1669 // key is greater than or equal to limit, the Iterator may pause and return
1670 // IterAtLimit. Because limits are best-effort, NextWithLimit may return a key
1671 // beyond limit.
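// For example (editor's sketch, not part of the original source), a caller
// processing keys up to an exclusive limit and distinguishing a pause from
// exhaustion might write the following, where start and limit are assumed
// caller-provided keys:
//
//	v := iter.SeekGEWithLimit(start, limit)
//	for ; v == pebble.IterValid; v = iter.NextWithLimit(limit) {
//		process(iter.Key()) // process is a hypothetical callback
//	}
//	if v == pebble.IterAtLimit {
//		// Paused at limit; iteration may be resumed later with NextWithLimit.
//	}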
1672 // 1673 // If the Iterator is configured to iterate over range keys, NextWithLimit 1674 // guarantees it will surface any range keys with bounds overlapping the 1675 // keyspace up to limit. 1676 func (i *Iterator) NextWithLimit(limit []byte) IterValidityState { 1677 return i.nextWithLimit(limit) 1678 } 1679 1680 // NextPrefix moves the iterator to the next key/value pair with a key 1681 // containing a different prefix than the current key. Prefixes are determined 1682 // by Comparer.Split. Exhausts the iterator if invoked while in prefix-iteration 1683 // mode. 1684 // 1685 // It is not permitted to invoke NextPrefix while at a IterAtLimit position. 1686 // When called in this condition, NextPrefix has non-deterministic behavior. 1687 // 1688 // It is not permitted to invoke NextPrefix when the Iterator has an 1689 // upper-bound that is a versioned MVCC key (see the comment for 1690 // Comparer.Split). It returns an error in this case. 1691 func (i *Iterator) NextPrefix() bool { 1692 if i.nextPrefixNotPermittedByUpperBound { 1693 i.lastPositioningOp = unknownLastPositionOp 1694 i.requiresReposition = false 1695 i.err = errors.Errorf("NextPrefix not permitted with upper bound %s", 1696 i.comparer.FormatKey(i.opts.UpperBound)) 1697 i.iterValidityState = IterExhausted 1698 return false 1699 } 1700 if i.hasPrefix { 1701 i.iterValidityState = IterExhausted 1702 return false 1703 } 1704 return i.nextPrefix() == IterValid 1705 } 1706 1707 func (i *Iterator) nextPrefix() IterValidityState { 1708 if i.rangeKey != nil { 1709 // NB: Check Valid() before clearing requiresReposition. 1710 i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid() 1711 // If we have a range key but did not expose it at the previous iterator 1712 // position (because the iterator was not at a valid position), updated 1713 // must be true. This ensures that after an iterator op sequence like: 1714 // - Next() → (IterValid, RangeBounds() = [a,b)) 1715 // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -) 1716 // - NextWithLimit(...) → (IterValid, RangeBounds() = [a,b)) 1717 // the iterator returns RangeKeyChanged()=true. 1718 // 1719 // The remainder of this function will only update i.rangeKey.updated if 1720 // the iterator moves into a new range key, or out of the current range 1721 // key. 1722 i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys() 1723 } 1724 1725 // Although NextPrefix documents that behavior at IterAtLimit is undefined, 1726 // this function handles these cases as a simple prefix-agnostic Next. This 1727 // is done for deterministic behavior in the metamorphic tests. 1728 // 1729 // TODO(jackson): If the metamorphic test operation generator is adjusted to 1730 // make generation of some operations conditional on the previous 1731 // operations, then we can remove this behavior and explicitly error. 1732 1733 i.lastPositioningOp = unknownLastPositionOp 1734 i.requiresReposition = false 1735 switch i.pos { 1736 case iterPosCurForward: 1737 // Positioned on the current key. Advance to the next prefix. 1738 i.internalNextPrefix(i.split(i.key)) 1739 case iterPosCurForwardPaused: 1740 // Positioned at a limit. Implement as a prefix-agnostic Next. See TODO 1741 // up above. The iterator is already positioned at the next key. 1742 case iterPosCurReverse: 1743 // Switching directions. 1744 // Unless the iterator was exhausted, reverse iteration needs to 1745 // position the iterator at iterPosPrev. 
1746 		if i.iterKey != nil {
1747 			i.err = errors.New("switching from reverse to forward but iter is not at prev")
1748 			i.iterValidityState = IterExhausted
1749 			return i.iterValidityState
1750 		}
1751 		// The Iterator is exhausted and i.iter is positioned before the first
1752 		// key. Reposition to point to the first internal key.
1753 		i.iterFirstWithinBounds()
1754 	case iterPosCurReversePaused:
1755 		// Positioned at a limit. Implement as a prefix-agnostic Next. See TODO
1756 		// up above.
1757 		//
1758 		// Switching directions; the iterator must not be exhausted since it
1759 		// paused.
1760 		if i.iterKey == nil {
1761 			i.err = errors.New("switching paused from reverse to forward but iter is exhausted")
1762 			i.iterValidityState = IterExhausted
1763 			return i.iterValidityState
1764 		}
1765 		i.nextUserKey()
1766 	case iterPosPrev:
1767 		// The underlying iterator is pointed to the previous key (this can
1768 		// only happen when switching iteration directions).
1769 		if i.iterKey == nil {
1770 			// We're positioned before the first key. Need to reposition to point to
1771 			// the first key.
1772 			i.iterFirstWithinBounds()
1773 		} else {
1774 			// Move the internal iterator back onto the user key stored in
1775 			// i.key. iterPosPrev guarantees that it's positioned at the last
1776 			// key with the user key less than i.key, so we're guaranteed to
1777 			// land on the correct key with a single Next.
1778 			i.iterKey, i.iterValue = i.iter.Next()
1779 			if invariants.Enabled && !i.equal(i.iterKey.UserKey, i.key) {
1780 				i.opts.logger.Fatalf("pebble: invariant violation: Nexting internal iterator from iterPosPrev landed on %q, not %q",
1781 					i.iterKey.UserKey, i.key)
1782 			}
1783 		}
1784 		// The internal iterator is now positioned at i.key. Advance to the next
1785 		// prefix.
1786 		i.internalNextPrefix(i.split(i.key))
1787 	case iterPosNext:
1788 		// Already positioned on the next key. Only call internalNextPrefix if
1789 		// the next key shares the same prefix.
1790 		if i.iterKey != nil {
1791 			currKeyPrefixLen := i.split(i.key)
1792 			iterKeyPrefixLen := i.split(i.iterKey.UserKey)
1793 			if bytes.Equal(i.iterKey.UserKey[:iterKeyPrefixLen], i.key[:currKeyPrefixLen]) {
1794 				i.internalNextPrefix(currKeyPrefixLen)
1795 			}
1796 		}
1797 	}
1798 
1799 	i.stats.ForwardStepCount[InterfaceCall]++
1800 	i.findNextEntry(nil /* limit */)
1801 	i.maybeSampleRead()
1802 	return i.iterValidityState
1803 }
1804 
1805 func (i *Iterator) internalNextPrefix(currKeyPrefixLen int) {
1806 	if i.iterKey == nil {
1807 		return
1808 	}
1809 	// The Next "fast-path" is not really a fast-path when there is more than
1810 	// one version. However, even with TableFormatPebblev3, there is a small
1811 	// slowdown (~10%) for one version if we remove it and only call NextPrefix.
1812 	// When there are two versions, only calling NextPrefix is ~30% faster.
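// Illustration (added commentary, not part of the original source): given
// internal keys a@3, a@2, a@1, b@9 with the iterator at a@3, the single
// Next below lands on a@2, which still has prefix "a", so the code falls
// through to iter.NextPrefix(ImmediateSuccessor("a")), skipping a@1 and
// landing on b@9. With one version per prefix, the Next alone typically
// reaches the next prefix and the NextPrefix call is avoided.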
1813 	i.stats.ForwardStepCount[InternalIterCall]++
1814 	if i.iterKey, i.iterValue = i.iter.Next(); i.iterKey == nil {
1815 		return
1816 	}
1817 	iterKeyPrefixLen := i.split(i.iterKey.UserKey)
1818 	if !bytes.Equal(i.iterKey.UserKey[:iterKeyPrefixLen], i.key[:currKeyPrefixLen]) {
1819 		return
1820 	}
1821 	i.stats.ForwardStepCount[InternalIterCall]++
1822 	i.prefixOrFullSeekKey = i.comparer.ImmediateSuccessor(i.prefixOrFullSeekKey[:0], i.key[:currKeyPrefixLen])
1823 	i.iterKey, i.iterValue = i.iter.NextPrefix(i.prefixOrFullSeekKey)
1824 	if invariants.Enabled && i.iterKey != nil {
1825 		if iterKeyPrefixLen := i.split(i.iterKey.UserKey); i.cmp(i.iterKey.UserKey[:iterKeyPrefixLen], i.prefixOrFullSeekKey) < 0 {
1826 			panic(errors.AssertionFailedf("pebble: iter.NextPrefix did not advance beyond the current prefix: now at %q; expected to be geq %q",
1827 				i.iterKey, i.prefixOrFullSeekKey))
1828 		}
1829 	}
1830 }
1831 
1832 func (i *Iterator) nextWithLimit(limit []byte) IterValidityState {
1833 	i.stats.ForwardStepCount[InterfaceCall]++
1834 	if i.hasPrefix {
1835 		if limit != nil {
1836 			i.err = errors.New("cannot use limit with prefix iteration")
1837 			i.iterValidityState = IterExhausted
1838 			return i.iterValidityState
1839 		} else if i.iterValidityState == IterExhausted {
1840 			// No-op, already exhausted. We avoid executing the Next because it
1841 			// can break invariants: Specifically, a file that fails the bloom
1842 			// filter test may result in its level being removed from the
1843 			// merging iterator. The level's removal can cause a lazy combined
1844 			// iterator to miss range keys and trigger a switch to combined
1845 			// iteration at a larger key, breaking keyspan invariants.
1846 			return i.iterValidityState
1847 		}
1848 	}
1849 	if i.err != nil {
1850 		return i.iterValidityState
1851 	}
1852 	if i.rangeKey != nil {
1853 		// NB: Check Valid() before clearing requiresReposition.
1854 		i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid()
1855 		// If we have a range key but did not expose it at the previous iterator
1856 		// position (because the iterator was not at a valid position), updated
1857 		// must be true. This ensures that after an iterator op sequence like:
1858 		// - Next() → (IterValid, RangeBounds() = [a,b))
1859 		// - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -)
1860 		// - NextWithLimit(...) → (IterValid, RangeBounds() = [a,b))
1861 		// the iterator returns RangeKeyChanged()=true.
1862 		//
1863 		// The remainder of this function will only update i.rangeKey.updated if
1864 		// the iterator moves into a new range key, or out of the current range
1865 		// key.
1866 		i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys()
1867 	}
1868 	i.lastPositioningOp = unknownLastPositionOp
1869 	i.requiresReposition = false
1870 	switch i.pos {
1871 	case iterPosCurForward:
1872 		i.nextUserKey()
1873 	case iterPosCurForwardPaused:
1874 		// Already at the right place.
1875 	case iterPosCurReverse:
1876 		// Switching directions.
1877 		// Unless the iterator was exhausted, reverse iteration needs to
1878 		// position the iterator at iterPosPrev.
1879 		if i.iterKey != nil {
1880 			i.err = errors.New("switching from reverse to forward but iter is not at prev")
1881 			i.iterValidityState = IterExhausted
1882 			return i.iterValidityState
1883 		}
1884 		// We're positioned before the first key. Need to reposition to point to
1885 		// the first key.
1886 		i.iterFirstWithinBounds()
1887 	case iterPosCurReversePaused:
1888 		// Switching directions.
1889 		// The iterator must not be exhausted since it paused.
1890 if i.iterKey == nil { 1891 i.err = errors.New("switching paused from reverse to forward but iter is exhausted") 1892 i.iterValidityState = IterExhausted 1893 return i.iterValidityState 1894 } 1895 i.nextUserKey() 1896 case iterPosPrev: 1897 // The underlying iterator is pointed to the previous key (this can 1898 // only happen when switching iteration directions). We set 1899 // i.iterValidityState to IterExhausted here to force the calls to 1900 // nextUserKey to save the current key i.iter is pointing at in order 1901 // to determine when the next user-key is reached. 1902 i.iterValidityState = IterExhausted 1903 if i.iterKey == nil { 1904 // We're positioned before the first key. Need to reposition to point to 1905 // the first key. 1906 i.iterFirstWithinBounds() 1907 } else { 1908 i.nextUserKey() 1909 } 1910 i.nextUserKey() 1911 case iterPosNext: 1912 // Already at the right place. 1913 } 1914 i.findNextEntry(limit) 1915 i.maybeSampleRead() 1916 return i.iterValidityState 1917 } 1918 1919 // Prev moves the iterator to the previous key/value pair. Returns true if the 1920 // iterator is pointing at a valid entry and false otherwise. 1921 func (i *Iterator) Prev() bool { 1922 return i.PrevWithLimit(nil) == IterValid 1923 } 1924 1925 // PrevWithLimit moves the iterator to the previous key/value pair. 1926 // 1927 // If limit is provided, it serves as a best-effort inclusive limit. If the 1928 // previous key is less than limit, the Iterator may pause and return 1929 // IterAtLimit. Because limits are best-effort, PrevWithLimit may return a key 1930 // beyond limit. 1931 // 1932 // If the Iterator is configured to iterate over range keys, PrevWithLimit 1933 // guarantees it will surface any range keys with bounds overlapping the 1934 // keyspace up to limit. 1935 func (i *Iterator) PrevWithLimit(limit []byte) IterValidityState { 1936 i.stats.ReverseStepCount[InterfaceCall]++ 1937 if i.err != nil { 1938 return i.iterValidityState 1939 } 1940 if i.rangeKey != nil { 1941 // NB: Check Valid() before clearing requiresReposition. 1942 i.rangeKey.prevPosHadRangeKey = i.rangeKey.hasRangeKey && i.Valid() 1943 // If we have a range key but did not expose it at the previous iterator 1944 // position (because the iterator was not at a valid position), updated 1945 // must be true. This ensures that after an iterator op sequence like: 1946 // - Next() → (IterValid, RangeBounds() = [a,b)) 1947 // - NextWithLimit(...) → (IterAtLimit, RangeBounds() = -) 1948 // - PrevWithLimit(...) → (IterValid, RangeBounds() = [a,b)) 1949 // the iterator returns RangeKeyChanged()=true. 1950 // 1951 // The remainder of this function will only update i.rangeKey.updated if 1952 // the iterator moves into a new range key, or out of the current range 1953 // key. 1954 i.rangeKey.updated = i.rangeKey.hasRangeKey && !i.Valid() && i.opts.rangeKeys() 1955 } 1956 i.lastPositioningOp = unknownLastPositionOp 1957 i.requiresReposition = false 1958 if i.hasPrefix { 1959 i.err = errReversePrefixIteration 1960 i.iterValidityState = IterExhausted 1961 return i.iterValidityState 1962 } 1963 switch i.pos { 1964 case iterPosCurForward: 1965 // Switching directions, and will handle this below. 1966 case iterPosCurForwardPaused: 1967 // Switching directions, and will handle this below. 1968 case iterPosCurReverse: 1969 i.prevUserKey() 1970 case iterPosCurReversePaused: 1971 // Already at the right place. 
1972 	case iterPosNext:
1973 		// The underlying iterator is pointed to the next key (this can only happen
1974 		// when switching iteration directions). We will handle this below.
1975 	case iterPosPrev:
1976 		// Already at the right place.
1977 	}
1978 	if i.pos == iterPosCurForward || i.pos == iterPosNext || i.pos == iterPosCurForwardPaused {
1979 		// Switching direction.
1980 		stepAgain := i.pos == iterPosNext
1981 
1982 		// Synthetic range key markers are a special case. Consider SeekGE(b)
1983 		// which finds a range key [a, c). To ensure the user observes the range
1984 		// key, the Iterator pauses at Key() = b. The iterator must advance the
1985 		// internal iterator to see if there's also a coincident point key at
1986 		// 'b', leaving the iterator at iterPosNext if there's not.
1987 		//
1988 		// This is a problem: Synthetic range key markers are only interleaved
1989 		// during the original seek. A subsequent Prev() of i.iter will not move
1990 		// back onto the synthetic range key marker. In this case where the
1991 		// previous iterator position was a synthetic range key start boundary,
1992 		// we must not step a second time.
1993 		if i.isEphemeralPosition() {
1994 			stepAgain = false
1995 		}
1996 
1997 		// We set i.iterValidityState to IterExhausted here to force the calls
1998 		// to prevUserKey to save the current key i.iter is pointing at in
1999 		// order to determine when the prev user-key is reached.
2000 		i.iterValidityState = IterExhausted
2001 		if i.iterKey == nil {
2002 			// We're positioned after the last key. Need to reposition to point to
2003 			// the last key.
2004 			i.iterLastWithinBounds()
2005 		} else {
2006 			i.prevUserKey()
2007 		}
2008 		if stepAgain {
2009 			i.prevUserKey()
2010 		}
2011 	}
2012 	i.findPrevEntry(limit)
2013 	i.maybeSampleRead()
2014 	return i.iterValidityState
2015 }
2016 
2017 // iterFirstWithinBounds moves the internal iterator to the first key,
2018 // respecting bounds.
2019 func (i *Iterator) iterFirstWithinBounds() {
2020 	i.stats.ForwardSeekCount[InternalIterCall]++
2021 	if lowerBound := i.opts.GetLowerBound(); lowerBound != nil {
2022 		i.iterKey, i.iterValue = i.iter.SeekGE(lowerBound, base.SeekGEFlagsNone)
2023 	} else {
2024 		i.iterKey, i.iterValue = i.iter.First()
2025 	}
2026 }
2027 
2028 // iterLastWithinBounds moves the internal iterator to the last key, respecting
2029 // bounds.
2030 func (i *Iterator) iterLastWithinBounds() {
2031 	i.stats.ReverseSeekCount[InternalIterCall]++
2032 	if upperBound := i.opts.GetUpperBound(); upperBound != nil {
2033 		i.iterKey, i.iterValue = i.iter.SeekLT(upperBound, base.SeekLTFlagsNone)
2034 	} else {
2035 		i.iterKey, i.iterValue = i.iter.Last()
2036 	}
2037 }
2038 
2039 // RangeKeyData describes a range key's data, set through RangeKeySet. The key
2040 // boundaries of the range key are provided by Iterator.RangeBounds.
2041 type RangeKeyData struct {
2042 	Suffix []byte
2043 	Value  []byte
2044 }
2045 
2046 // rangeKeyWithinLimit is called during limited reverse iteration when
2047 // positioned over a key beyond the limit. If there exists a range key that lies
2048 // within the limit, the iterator must not pause in order to ensure the user has
2049 // an opportunity to observe the range key within limit.
2050 //
2051 // It would be valid to ignore the limit whenever there's a range key covering
2052 // the key, but that would introduce nondeterminism. To preserve determinism for
2053 // testing, the iterator ignores the limit only if the covering range key does
2054 // cover the keyspace within the limit.
2055 // 2056 // This awkwardness exists because range keys are interleaved at their inclusive 2057 // start positions. Note that limit is inclusive. 2058 func (i *Iterator) rangeKeyWithinLimit(limit []byte) bool { 2059 if i.rangeKey == nil || !i.opts.rangeKeys() { 2060 return false 2061 } 2062 s := i.rangeKey.iiter.Span() 2063 // If the range key ends beyond the limit, then the range key does not cover 2064 // any portion of the keyspace within the limit and it is safe to pause. 2065 return s != nil && i.cmp(s.End, limit) > 0 2066 } 2067 2068 // saveRangeKey saves the current range key to the underlying iterator's current 2069 // range key state. If the range key has not changed, saveRangeKey is a no-op. 2070 // If there is a new range key, saveRangeKey copies all of the key, value and 2071 // suffixes into Iterator-managed buffers. 2072 func (i *Iterator) saveRangeKey() { 2073 if i.rangeKey == nil || i.opts.KeyTypes == IterKeyTypePointsOnly { 2074 return 2075 } 2076 2077 s := i.rangeKey.iiter.Span() 2078 if s == nil { 2079 i.rangeKey.hasRangeKey = false 2080 i.rangeKey.updated = i.rangeKey.prevPosHadRangeKey 2081 return 2082 } else if !i.rangeKey.stale { 2083 // The range key `s` is identical to the one currently saved. No-op. 2084 return 2085 } 2086 2087 if s.KeysOrder != keyspan.BySuffixAsc { 2088 panic("pebble: range key span's keys unexpectedly not in ascending suffix order") 2089 } 2090 2091 // Although `i.rangeKey.stale` is true, the span s may still be identical 2092 // to the currently saved span. This is possible when seeking the iterator, 2093 // which may land back on the same range key. If we previously had a range 2094 // key and the new one has an identical start key, then it must be the same 2095 // range key and we can avoid copying and keep `i.rangeKey.updated=false`. 2096 // 2097 // TODO(jackson): These key comparisons could be avoidable during relative 2098 // positioning operations continuing in the same direction, because these 2099 // ops will never encounter the previous position's range key while 2100 // stale=true. However, threading whether the current op is a seek or step 2101 // maybe isn't worth it. This key comparison is only necessary once when we 2102 // step onto a new range key, which should be relatively rare. 
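// Illustration (added commentary, not part of the original source): a seek
// that lands back within the same range key [a,c) that is already saved
// reaches the check below with matching start and end keys, so the saved
// buffers are reused and RangeKeyChanged() will report false.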
2103 if i.rangeKey.prevPosHadRangeKey && i.equal(i.rangeKey.start, s.Start) && 2104 i.equal(i.rangeKey.end, s.End) { 2105 i.rangeKey.updated = false 2106 i.rangeKey.stale = false 2107 i.rangeKey.hasRangeKey = true 2108 return 2109 } 2110 i.stats.RangeKeyStats.Count += len(s.Keys) 2111 i.rangeKey.buf.Reset() 2112 i.rangeKey.hasRangeKey = true 2113 i.rangeKey.updated = true 2114 i.rangeKey.stale = false 2115 i.rangeKey.buf, i.rangeKey.start = i.rangeKey.buf.Copy(s.Start) 2116 i.rangeKey.buf, i.rangeKey.end = i.rangeKey.buf.Copy(s.End) 2117 i.rangeKey.keys = i.rangeKey.keys[:0] 2118 for j := 0; j < len(s.Keys); j++ { 2119 if invariants.Enabled { 2120 if s.Keys[j].Kind() != base.InternalKeyKindRangeKeySet { 2121 panic("pebble: user iteration encountered non-RangeKeySet key kind") 2122 } else if j > 0 && i.cmp(s.Keys[j].Suffix, s.Keys[j-1].Suffix) < 0 { 2123 panic("pebble: user iteration encountered range keys not in suffix order") 2124 } 2125 } 2126 var rkd RangeKeyData 2127 i.rangeKey.buf, rkd.Suffix = i.rangeKey.buf.Copy(s.Keys[j].Suffix) 2128 i.rangeKey.buf, rkd.Value = i.rangeKey.buf.Copy(s.Keys[j].Value) 2129 i.rangeKey.keys = append(i.rangeKey.keys, rkd) 2130 } 2131 } 2132 2133 // RangeKeyChanged indicates whether the most recent iterator positioning 2134 // operation resulted in the iterator stepping into or out of a new range key. 2135 // If true, previously returned range key bounds and data has been invalidated. 2136 // If false, previously obtained range key bounds, suffix and value slices are 2137 // still valid and may continue to be read. 2138 // 2139 // Invalid iterator positions are considered to not hold range keys, meaning 2140 // that if an iterator steps from an IterExhausted or IterAtLimit position onto 2141 // a position with a range key, RangeKeyChanged will yield true. 2142 func (i *Iterator) RangeKeyChanged() bool { 2143 return i.iterValidityState == IterValid && i.rangeKey != nil && i.rangeKey.updated 2144 } 2145 2146 // HasPointAndRange indicates whether there exists a point key, a range key or 2147 // both at the current iterator position. 2148 func (i *Iterator) HasPointAndRange() (hasPoint, hasRange bool) { 2149 if i.iterValidityState != IterValid || i.requiresReposition { 2150 return false, false 2151 } 2152 if i.opts.KeyTypes == IterKeyTypePointsOnly { 2153 return true, false 2154 } 2155 return i.rangeKey == nil || !i.rangeKey.rangeKeyOnly, i.rangeKey != nil && i.rangeKey.hasRangeKey 2156 } 2157 2158 // RangeBounds returns the start (inclusive) and end (exclusive) bounds of the 2159 // range key covering the current iterator position. RangeBounds returns nil 2160 // bounds if there is no range key covering the current iterator position, or 2161 // the iterator is not configured to surface range keys. 2162 // 2163 // If valid, the returned start bound is less than or equal to Key() and the 2164 // returned end bound is greater than Key(). 2165 func (i *Iterator) RangeBounds() (start, end []byte) { 2166 if i.rangeKey == nil || !i.opts.rangeKeys() || !i.rangeKey.hasRangeKey { 2167 return nil, nil 2168 } 2169 return i.rangeKey.start, i.rangeKey.end 2170 } 2171 2172 // Key returns the key of the current key/value pair, or nil if done. The 2173 // caller should not modify the contents of the returned slice, and its 2174 // contents may change on the next call to Next. 2175 // 2176 // If positioned at an iterator position that only holds a range key, Key() 2177 // always returns the start bound of the range key. Otherwise, it returns the 2178 // point key's key. 
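// Inspecting a position that may hold point and/or range keys (editor's
// sketch, not part of the original source; iter is an assumed open
// *pebble.Iterator configured with IterKeyTypePointsAndRanges):
//
//	hasPoint, hasRange := iter.HasPointAndRange()
//	if hasRange {
//		start, end := iter.RangeBounds()
//		for _, rk := range iter.RangeKeys() {
//			// One range key with suffix rk.Suffix and value rk.Value
//			// covers the keyspace [start, end).
//			_ = rk
//		}
//		_, _ = start, end
//	}
//	if hasPoint {
//		_ = iter.Key() // the point key's key
//	}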
2179 func (i *Iterator) Key() []byte { 2180 return i.key 2181 } 2182 2183 // Value returns the value of the current key/value pair, or nil if done. The 2184 // caller should not modify the contents of the returned slice, and its 2185 // contents may change on the next call to Next. 2186 // 2187 // Only valid if HasPointAndRange() returns true for hasPoint. 2188 // Deprecated: use ValueAndErr instead. 2189 func (i *Iterator) Value() []byte { 2190 val, _ := i.ValueAndErr() 2191 return val 2192 } 2193 2194 // ValueAndErr returns the value, and any error encountered in extracting the value. 2195 // REQUIRES: i.Error()==nil and HasPointAndRange() returns true for hasPoint. 2196 // 2197 // The caller should not modify the contents of the returned slice, and its 2198 // contents may change on the next call to Next. 2199 func (i *Iterator) ValueAndErr() ([]byte, error) { 2200 val, callerOwned, err := i.value.Value(i.lazyValueBuf) 2201 if err != nil { 2202 i.err = err 2203 } 2204 if callerOwned { 2205 i.lazyValueBuf = val[:0] 2206 } 2207 return val, err 2208 } 2209 2210 // LazyValue returns the LazyValue. Only for advanced use cases. 2211 // REQUIRES: i.Error()==nil and HasPointAndRange() returns true for hasPoint. 2212 func (i *Iterator) LazyValue() LazyValue { 2213 return i.value 2214 } 2215 2216 // RangeKeys returns the range key values and their suffixes covering the 2217 // current iterator position. The range bounds may be retrieved separately 2218 // through Iterator.RangeBounds(). 2219 func (i *Iterator) RangeKeys() []RangeKeyData { 2220 if i.rangeKey == nil || !i.opts.rangeKeys() || !i.rangeKey.hasRangeKey { 2221 return nil 2222 } 2223 return i.rangeKey.keys 2224 } 2225 2226 // Valid returns true if the iterator is positioned at a valid key/value pair 2227 // and false otherwise. 2228 func (i *Iterator) Valid() bool { 2229 valid := i.iterValidityState == IterValid && !i.requiresReposition 2230 if invariants.Enabled { 2231 if err := i.Error(); valid && err != nil { 2232 panic(errors.WithSecondaryError(errors.AssertionFailedf("pebble: iterator is valid with non-nil Error"), err)) 2233 } 2234 } 2235 return valid 2236 } 2237 2238 // Error returns any accumulated error. 2239 func (i *Iterator) Error() error { 2240 if i.iter != nil { 2241 return firstError(i.err, i.iter.Error()) 2242 } 2243 return i.err 2244 } 2245 2246 const maxKeyBufCacheSize = 4 << 10 // 4 KB 2247 2248 // Close closes the iterator and returns any accumulated error. Exhausting 2249 // all the key/value pairs in a table is not considered to be an error. 2250 // It is not valid to call any method, including Close, after the iterator 2251 // has been closed. 2252 func (i *Iterator) Close() error { 2253 // Close the child iterator before releasing the readState because when the 2254 // readState is released sstables referenced by the readState may be deleted 2255 // which will fail on Windows if the sstables are still open by the child 2256 // iterator. 2257 if i.iter != nil { 2258 i.err = firstError(i.err, i.iter.Close()) 2259 2260 // Closing i.iter did not necessarily close the point and range key 2261 // iterators. Calls to SetOptions may have 'disconnected' either one 2262 // from i.iter if iteration key types were changed. Both point and range 2263 // key iterators are preserved in case the iterator needs to switch key 2264 // types again. We explicitly close both of these iterators here. 
2265 // 2266 // NB: If the iterators were still connected to i.iter, they may be 2267 // closed, but calling Close on a closed internal iterator or fragment 2268 // iterator is allowed. 2269 if i.pointIter != nil && !i.closePointIterOnce { 2270 i.err = firstError(i.err, i.pointIter.Close()) 2271 } 2272 if i.rangeKey != nil && i.rangeKey.rangeKeyIter != nil { 2273 i.err = firstError(i.err, i.rangeKey.rangeKeyIter.Close()) 2274 } 2275 } 2276 err := i.err 2277 2278 if i.readState != nil { 2279 if i.readSampling.pendingCompactions.size > 0 { 2280 // Copy pending read compactions using db.mu.Lock() 2281 i.readState.db.mu.Lock() 2282 i.readState.db.mu.compact.readCompactions.combine(&i.readSampling.pendingCompactions, i.cmp) 2283 reschedule := i.readState.db.mu.compact.rescheduleReadCompaction 2284 i.readState.db.mu.compact.rescheduleReadCompaction = false 2285 concurrentCompactions := i.readState.db.mu.compact.compactingCount 2286 i.readState.db.mu.Unlock() 2287 2288 if reschedule && concurrentCompactions == 0 { 2289 // In a read heavy workload, flushes may not happen frequently enough to 2290 // schedule compactions. 2291 i.readState.db.compactionSchedulers.Add(1) 2292 go i.readState.db.maybeScheduleCompactionAsync() 2293 } 2294 } 2295 2296 i.readState.unref() 2297 i.readState = nil 2298 } 2299 2300 if i.version != nil { 2301 i.version.Unref() 2302 } 2303 2304 for _, readers := range i.externalReaders { 2305 for _, r := range readers { 2306 err = firstError(err, r.Close()) 2307 } 2308 } 2309 2310 // Close the closer for the current value if one was open. 2311 if i.valueCloser != nil { 2312 err = firstError(err, i.valueCloser.Close()) 2313 i.valueCloser = nil 2314 } 2315 2316 if i.rangeKey != nil { 2317 2318 i.rangeKey.rangeKeyBuffers.PrepareForReuse() 2319 *i.rangeKey = iteratorRangeKeyState{ 2320 rangeKeyBuffers: i.rangeKey.rangeKeyBuffers, 2321 } 2322 iterRangeKeyStateAllocPool.Put(i.rangeKey) 2323 i.rangeKey = nil 2324 } 2325 if alloc := i.alloc; alloc != nil { 2326 // Avoid caching the key buf if it is overly large. The constant is fairly 2327 // arbitrary. 2328 if cap(i.keyBuf) >= maxKeyBufCacheSize { 2329 alloc.keyBuf = nil 2330 } else { 2331 alloc.keyBuf = i.keyBuf 2332 } 2333 if cap(i.prefixOrFullSeekKey) >= maxKeyBufCacheSize { 2334 alloc.prefixOrFullSeekKey = nil 2335 } else { 2336 alloc.prefixOrFullSeekKey = i.prefixOrFullSeekKey 2337 } 2338 for j := range i.boundsBuf { 2339 if cap(i.boundsBuf[j]) >= maxKeyBufCacheSize { 2340 alloc.boundsBuf[j] = nil 2341 } else { 2342 alloc.boundsBuf[j] = i.boundsBuf[j] 2343 } 2344 } 2345 *alloc = iterAlloc{ 2346 keyBuf: alloc.keyBuf, 2347 boundsBuf: alloc.boundsBuf, 2348 prefixOrFullSeekKey: alloc.prefixOrFullSeekKey, 2349 } 2350 iterAllocPool.Put(alloc) 2351 } else if alloc := i.getIterAlloc; alloc != nil { 2352 if cap(i.keyBuf) >= maxKeyBufCacheSize { 2353 alloc.keyBuf = nil 2354 } else { 2355 alloc.keyBuf = i.keyBuf 2356 } 2357 *alloc = getIterAlloc{ 2358 keyBuf: alloc.keyBuf, 2359 } 2360 getIterAllocPool.Put(alloc) 2361 } 2362 return err 2363 } 2364 2365 // SetBounds sets the lower and upper bounds for the iterator. Once SetBounds 2366 // returns, the caller is free to mutate the provided slices. 2367 // 2368 // The iterator will always be invalidated and must be repositioned with a call 2369 // to SeekGE, SeekPrefixGE, SeekLT, First, or Last. 2370 func (i *Iterator) SetBounds(lower, upper []byte) { 2371 // Ensure that the Iterator appears exhausted, regardless of whether we 2372 // actually have to invalidate the internal iterator. 
Optimizations that
2373 	// avoid exhaustion are an internal implementation detail that shouldn't
2374 	// leak through the interface. The caller should still call an absolute
2375 	// positioning method to reposition the iterator.
2376 	i.requiresReposition = true
2377 
2378 	if ((i.opts.LowerBound == nil) == (lower == nil)) &&
2379 		((i.opts.UpperBound == nil) == (upper == nil)) &&
2380 		i.equal(i.opts.LowerBound, lower) &&
2381 		i.equal(i.opts.UpperBound, upper) {
2382 		// Unchanged, noop.
2383 		return
2384 	}
2385 
2386 	// Copy the user-provided bounds into an Iterator-owned buffer, and set them
2387 	// on i.opts.{Lower,Upper}Bound.
2388 	i.processBounds(lower, upper)
2389 
2390 	i.iter.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
2391 	// If the iterator has an open point iterator that's not currently being
2392 	// used, propagate the new bounds to it.
2393 	if i.pointIter != nil && !i.opts.pointKeys() {
2394 		i.pointIter.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
2395 	}
2396 	// If the iterator has a range key iterator, propagate bounds to it. The
2397 	// top-level SetBounds on the interleaving iterator (i.iter) won't propagate
2398 	// bounds to the range key iterator stack, because the FragmentIterator
2399 	// interface doesn't define a SetBounds method. We need to directly inform
2400 	// the iterConfig stack.
2401 	if i.rangeKey != nil {
2402 		i.rangeKey.iterConfig.SetBounds(i.opts.LowerBound, i.opts.UpperBound)
2403 	}
2404 
2405 	// Even though this is not a positioning operation, the alteration of the
2406 	// bounds means we cannot optimize Seeks by using Next.
2407 	i.invalidate()
2408 }
2409 
2410 // Initialization and changing of the bounds must call processBounds.
2411 // processBounds saves the bounds and computes derived state from those
2412 // bounds.
2413 func (i *Iterator) processBounds(lower, upper []byte) {
2414 	// Copy the user-provided bounds into an Iterator-owned buffer. We can't
2415 	// overwrite the current bounds, because some internal iterators compare old
2416 	// and new bounds for optimizations.
2417 
2418 	buf := i.boundsBuf[i.boundsBufIdx][:0]
2419 	if lower != nil {
2420 		buf = append(buf, lower...)
2421 		i.opts.LowerBound = buf
2422 	} else {
2423 		i.opts.LowerBound = nil
2424 	}
2425 	i.nextPrefixNotPermittedByUpperBound = false
2426 	if upper != nil {
2427 		buf = append(buf, upper...)
2428 		i.opts.UpperBound = buf[len(buf)-len(upper):]
2429 		if i.comparer.Split != nil {
2430 			if i.comparer.Split(i.opts.UpperBound) != len(i.opts.UpperBound) {
2431 				// Setting an upper bound that is a versioned MVCC key. This means
2432 				// that a key can have some MVCC versions before the upper bound and
2433 				// some after. This causes significant complications for NextPrefix,
2434 				// so we bar the use of NextPrefix.
2435 				i.nextPrefixNotPermittedByUpperBound = true
2436 			}
2437 		}
2438 	} else {
2439 		i.opts.UpperBound = nil
2440 	}
2441 	i.boundsBuf[i.boundsBufIdx] = buf
2442 	i.boundsBufIdx = 1 - i.boundsBufIdx
2443 }
2444 
2445 // SetOptions sets new iterator options for the iterator. Note that the lower
2446 // and upper bounds applied here will supersede any bounds set by previous calls
2447 // to SetBounds.
2448 //
2449 // Note that the slices provided in this SetOptions must not be changed by the
2450 // caller until the iterator is closed, or a subsequent SetBounds or SetOptions
2451 // has returned. This is because comparisons between the existing and new bounds
2452 // are sometimes used to optimize seeking. See the extended commentary on
2453 // SetBounds.
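// For example (editor's sketch, not part of the original source), reusing a
// single iterator across several scans rather than constructing a new
// iterator per scan; lo and hi are assumed caller-provided bounds:
//
//	iter.SetOptions(&pebble.IterOptions{
//		LowerBound: lo,
//		UpperBound: hi,
//	})
//	for valid := iter.First(); valid; valid = iter.Next() {
//		// ...
//	}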
2454 // 2455 // If the iterator was created over an indexed mutable batch, the iterator's 2456 // view of the mutable batch is refreshed. 2457 // 2458 // The iterator will always be invalidated and must be repositioned with a call 2459 // to SeekGE, SeekPrefixGE, SeekLT, First, or Last. 2460 // 2461 // If only lower and upper bounds need to be modified, prefer SetBounds. 2462 func (i *Iterator) SetOptions(o *IterOptions) { 2463 if i.externalReaders != nil { 2464 if err := validateExternalIterOpts(o); err != nil { 2465 panic(err) 2466 } 2467 } 2468 2469 // Ensure that the Iterator appears exhausted, regardless of whether we 2470 // actually have to invalidate the internal iterator. Optimizations that 2471 // avoid exhaustion are an internal implementation detail that shouldn't 2472 // leak through the interface. The caller should still call an absolute 2473 // positioning method to reposition the iterator. 2474 i.requiresReposition = true 2475 2476 // Check if global state requires we close all internal iterators. 2477 // 2478 // If the Iterator is in an error state, invalidate the existing iterators 2479 // so that we reconstruct an iterator state from scratch. 2480 // 2481 // If OnlyReadGuaranteedDurable changed, the iterator stacks are incorrect, 2482 // improperly including or excluding memtables. Invalidate them so that 2483 // finishInitializingIter will reconstruct them. 2484 // 2485 // If either the original options or the new options specify a table filter, 2486 // we need to reconstruct the iterator stacks. If they both supply a table 2487 // filter, we can't be certain that it's the same filter since we have no 2488 // mechanism to compare the filter closures. 2489 closeBoth := i.err != nil || 2490 o.OnlyReadGuaranteedDurable != i.opts.OnlyReadGuaranteedDurable || 2491 o.TableFilter != nil || i.opts.TableFilter != nil 2492 2493 // If either options specify block property filters for an iterator stack, 2494 // reconstruct it. 2495 if i.pointIter != nil && (closeBoth || len(o.PointKeyFilters) > 0 || len(i.opts.PointKeyFilters) > 0 || 2496 o.RangeKeyMasking.Filter != nil || i.opts.RangeKeyMasking.Filter != nil || o.SkipPoint != nil || 2497 i.opts.SkipPoint != nil) { 2498 i.err = firstError(i.err, i.pointIter.Close()) 2499 i.pointIter = nil 2500 } 2501 if i.rangeKey != nil { 2502 if closeBoth || len(o.RangeKeyFilters) > 0 || len(i.opts.RangeKeyFilters) > 0 { 2503 i.err = firstError(i.err, i.rangeKey.rangeKeyIter.Close()) 2504 i.rangeKey = nil 2505 } else { 2506 // If there's still a range key iterator stack, invalidate the 2507 // iterator. This ensures RangeKeyChanged() returns true if a 2508 // subsequent positioning operation discovers a range key. It also 2509 // prevents seek no-op optimizations. 2510 i.invalidate() 2511 } 2512 } 2513 2514 // If the iterator is backed by a batch that's been mutated, refresh its 2515 // existing point and range-key iterators, and invalidate the iterator to 2516 // prevent seek-using-next optimizations. If we don't yet have a point-key 2517 // iterator or range-key iterator but we require one, it'll be created in 2518 // the slow path that reconstructs the iterator in finishInitializingIter. 2519 if i.batch != nil { 2520 nextBatchSeqNum := (uint64(len(i.batch.data)) | base.InternalKeySeqNumBatch) 2521 if nextBatchSeqNum != i.batchSeqNum { 2522 i.batchSeqNum = nextBatchSeqNum 2523 if i.merging != nil { 2524 i.merging.batchSnapshot = nextBatchSeqNum 2525 } 2526 // Prevent a no-op seek optimization on the next seek. 
We won't be 2527 // able to reuse the top-level Iterator state, because it may be 2528 // incorrect after the inclusion of new batch mutations. 2529 i.batchJustRefreshed = true 2530 if i.pointIter != nil && i.batch.countRangeDels > 0 { 2531 if i.batchRangeDelIter.Count() == 0 { 2532 // When we constructed this iterator, there were no 2533 // rangedels in the batch. Iterator construction will 2534 // have excluded the batch rangedel iterator from the 2535 // point iterator stack. We need to reconstruct the 2536 // point iterator to add i.batchRangeDelIter into the 2537 // iterator stack. 2538 i.err = firstError(i.err, i.pointIter.Close()) 2539 i.pointIter = nil 2540 } else { 2541 // There are range deletions in the batch and we already 2542 // have a batch rangedel iterator. We can update the 2543 // batch rangedel iterator in place. 2544 // 2545 // NB: There may or may not be new range deletions. We 2546 // can't tell based on i.batchRangeDelIter.Count(), 2547 // which is the count of fragmented range deletions, NOT 2548 // the number of range deletions written to the batch 2549 // [i.batch.countRangeDels]. 2550 i.batch.initRangeDelIter(&i.opts, &i.batchRangeDelIter, nextBatchSeqNum) 2551 } 2552 } 2553 if i.rangeKey != nil && i.batch.countRangeKeys > 0 { 2554 if i.batchRangeKeyIter.Count() == 0 { 2555 // When we constructed this iterator, there were no range 2556 // keys in the batch. Iterator construction will have 2557 // excluded the batch rangekey iterator from the range key 2558 // iterator stack. We need to reconstruct the range key 2559 // iterator to add i.batchRangeKeyIter into the iterator 2560 // stack. 2561 i.err = firstError(i.err, i.rangeKey.rangeKeyIter.Close()) 2562 i.rangeKey = nil 2563 } else { 2564 // There are range keys in the batch and we already 2565 // have a batch rangekey iterator. We can update the batch 2566 // rangekey iterator in place. 2567 // 2568 // NB: There may or may not be new range keys. We can't 2569 // tell based on i.batchRangeKeyIter.Count(), which is the 2570 // count of fragmented range keys, NOT the number of 2571 // range keys written to the batch [i.batch.countRangeKeys]. 2572 i.batch.initRangeKeyIter(&i.opts, &i.batchRangeKeyIter, nextBatchSeqNum) 2573 i.invalidate() 2574 } 2575 } 2576 } 2577 } 2578 2579 // Reset combinedIterState.initialized in case the iterator key types 2580 // changed. If there's already a range key iterator stack, the combined 2581 // iterator is already initialized. Additionally, if the iterator is not 2582 // configured to include range keys, mark it as initialized to signal that 2583 // lower level iterators should not trigger a switch to combined iteration. 2584 i.lazyCombinedIter.combinedIterState = combinedIterState{ 2585 initialized: i.rangeKey != nil || !i.opts.rangeKeys(), 2586 } 2587 2588 boundsEqual := ((i.opts.LowerBound == nil) == (o.LowerBound == nil)) && 2589 ((i.opts.UpperBound == nil) == (o.UpperBound == nil)) && 2590 i.equal(i.opts.LowerBound, o.LowerBound) && 2591 i.equal(i.opts.UpperBound, o.UpperBound) 2592 2593 if boundsEqual && o.KeyTypes == i.opts.KeyTypes && 2594 (i.pointIter != nil || !i.opts.pointKeys()) && 2595 (i.rangeKey != nil || !i.opts.rangeKeys() || i.opts.KeyTypes == IterKeyTypePointsAndRanges) && 2596 i.equal(o.RangeKeyMasking.Suffix, i.opts.RangeKeyMasking.Suffix) && 2597 o.UseL6Filters == i.opts.UseL6Filters { 2598 // The options are identical, so we can likely use the fast path. 
In 2599 // addition to all the above constraints, we cannot use the fast path if 2600 // configured to perform lazy combined iteration but an indexed batch 2601 // used by the iterator now contains range keys. Lazy combined iteration 2602 // is not compatible with batch range keys because we always need to 2603 // merge the batch's range keys into iteration. 2604 if i.rangeKey != nil || !i.opts.rangeKeys() || i.batch == nil || i.batch.countRangeKeys == 0 { 2605 // Fast path. This preserves the Seek-using-Next optimizations as 2606 // long as the iterator wasn't already invalidated up above. 2607 return 2608 } 2609 } 2610 // Slow path. 2611 2612 // The options changed. Save the new ones to i.opts. 2613 if boundsEqual { 2614 // Copying the options into i.opts will overwrite LowerBound and 2615 // UpperBound fields with the user-provided slices. We need to hold on 2616 // to the Pebble-owned slices, so save them and re-set them after the 2617 // copy. 2618 lower, upper := i.opts.LowerBound, i.opts.UpperBound 2619 i.opts = *o 2620 i.opts.LowerBound, i.opts.UpperBound = lower, upper 2621 } else { 2622 i.opts = *o 2623 i.processBounds(o.LowerBound, o.UpperBound) 2624 // Propagate the changed bounds to the existing point iterator. 2625 // NB: We propagate i.opts.{Lower,Upper}Bound, not o.{Lower,Upper}Bound 2626 // because i.opts now point to buffers owned by Pebble. 2627 if i.pointIter != nil { 2628 i.pointIter.SetBounds(i.opts.LowerBound, i.opts.UpperBound) 2629 } 2630 if i.rangeKey != nil { 2631 i.rangeKey.iterConfig.SetBounds(i.opts.LowerBound, i.opts.UpperBound) 2632 } 2633 } 2634 2635 // Even though this is not a positioning operation, the invalidation of the 2636 // iterator stack means we cannot optimize Seeks by using Next. 2637 i.invalidate() 2638 2639 // Iterators created through NewExternalIter have a different iterator 2640 // initialization process. 2641 if i.externalReaders != nil { 2642 finishInitializingExternal(i.ctx, i) 2643 return 2644 } 2645 finishInitializingIter(i.ctx, i.alloc) 2646 } 2647 2648 func (i *Iterator) invalidate() { 2649 i.lastPositioningOp = invalidatedLastPositionOp 2650 i.hasPrefix = false 2651 i.iterKey = nil 2652 i.iterValue = LazyValue{} 2653 i.err = nil 2654 // This switch statement isn't necessary for correctness since callers 2655 // should call a repositioning method. We could have arbitrarily set i.pos 2656 // to one of the values. But it results in more intuitive behavior in 2657 // tests, which do not always reposition. 2658 switch i.pos { 2659 case iterPosCurForward, iterPosNext, iterPosCurForwardPaused: 2660 i.pos = iterPosCurForward 2661 case iterPosCurReverse, iterPosPrev, iterPosCurReversePaused: 2662 i.pos = iterPosCurReverse 2663 } 2664 i.iterValidityState = IterExhausted 2665 if i.rangeKey != nil { 2666 i.rangeKey.iiter.Invalidate() 2667 i.rangeKey.prevPosHadRangeKey = false 2668 } 2669 } 2670 2671 // Metrics returns per-iterator metrics. 2672 func (i *Iterator) Metrics() IteratorMetrics { 2673 m := IteratorMetrics{ 2674 ReadAmp: 1, 2675 } 2676 if mi, ok := i.iter.(*mergingIter); ok { 2677 m.ReadAmp = len(mi.levels) 2678 } 2679 return m 2680 } 2681 2682 // ResetStats resets the stats to 0. 2683 func (i *Iterator) ResetStats() { 2684 i.stats = IteratorStats{} 2685 } 2686 2687 // Stats returns the current stats. 2688 func (i *Iterator) Stats() IteratorStats { 2689 return i.stats 2690 } 2691 2692 // CloneOptions configures an iterator constructed through Iterator.Clone. 
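// For example (editor's sketch, not part of the original source), cloning an
// iterator so that two readers observe exactly the same view of the DB:
//
//	clone, err := iter.Clone(pebble.CloneOptions{})
//	if err != nil {
//		return err
//	}
//	defer clone.Close()
//	// iter and clone now position independently over identical state.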
2693 type CloneOptions struct { 2694 // IterOptions, if non-nil, define the iterator options to configure a 2695 // cloned iterator. If nil, the clone adopts the same IterOptions as the 2696 // iterator being cloned. 2697 IterOptions *IterOptions 2698 // RefreshBatchView may be set to true when cloning an Iterator over an 2699 // indexed batch. When false, the clone adopts the same (possibly stale) 2700 // view of the indexed batch as the cloned Iterator. When true, the clone is 2701 // constructed with a refreshed view of the batch, observing all of the 2702 // batch's mutations at the time of the Clone. If the cloned iterator was 2703 // not constructed to read over an indexed batch, RefreshBatchView has no 2704 // effect. 2705 RefreshBatchView bool 2706 } 2707 2708 // Clone creates a new Iterator over the same underlying data, i.e., over the 2709 // same {batch, memtables, sstables}. The resulting iterator is not positioned. 2710 // It starts with the same IterOptions, unless opts.IterOptions is set. 2711 // 2712 // When called on an Iterator over an indexed batch, the clone's visibility of 2713 // the indexed batch is determined by CloneOptions.RefreshBatchView. If false, 2714 // the clone inherits the iterator's current (possibly stale) view of the batch, 2715 // and callers may call SetOptions to subsequently refresh the clone's view to 2716 // include all batch mutations. If true, the clone is constructed with a 2717 // complete view of the indexed batch's mutations at the time of the Clone. 2718 // 2719 // Callers can use Clone when they need multiple iterators that see exactly 2720 // the same underlying state of the DB. Clone should not be used to extend 2721 // the lifetime of the data backing the original Iterator since that 2722 // will cause an increase in memory and disk usage (use NewSnapshot for that 2723 // purpose). 2724 func (i *Iterator) Clone(opts CloneOptions) (*Iterator, error) { 2725 return i.CloneWithContext(context.Background(), opts) 2726 } 2727 2728 // CloneWithContext is like Clone, and additionally accepts a context for 2729 // tracing. 2730 func (i *Iterator) CloneWithContext(ctx context.Context, opts CloneOptions) (*Iterator, error) { 2731 if opts.IterOptions == nil { 2732 opts.IterOptions = &i.opts 2733 } 2734 2735 readState := i.readState 2736 vers := i.version 2737 if readState == nil && vers == nil { 2738 return nil, errors.Errorf("cannot Clone a closed Iterator") 2739 } 2740 // i is already holding a ref, so there is no race with unref here. 2741 // 2742 // TODO(bilal): If the underlying iterator was created on a snapshot, we could 2743 // grab a reference to the current readState instead of reffing the original 2744 // readState. This allows us to release references to some zombie sstables 2745 // and memtables. 2746 if readState != nil { 2747 readState.ref() 2748 } 2749 if vers != nil { 2750 vers.Ref() 2751 } 2752 // Bundle various structures under a single umbrella in order to allocate 2753 // them together.
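	// iterAllocPool is a sync.Pool of iterAlloc values: Get below typically
	// returns a recycled allocation rather than a fresh one, and
	// Iterator.Close returns the allocation to the pool for reuse.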
2754 buf := iterAllocPool.Get().(*iterAlloc) 2755 dbi := &buf.dbi 2756 *dbi = Iterator{ 2757 ctx: ctx, 2758 opts: *opts.IterOptions, 2759 alloc: buf, 2760 merge: i.merge, 2761 comparer: i.comparer, 2762 readState: readState, 2763 version: vers, 2764 keyBuf: buf.keyBuf, 2765 prefixOrFullSeekKey: buf.prefixOrFullSeekKey, 2766 boundsBuf: buf.boundsBuf, 2767 batch: i.batch, 2768 batchSeqNum: i.batchSeqNum, 2769 newIters: i.newIters, 2770 newIterRangeKey: i.newIterRangeKey, 2771 seqNum: i.seqNum, 2772 } 2773 dbi.processBounds(dbi.opts.LowerBound, dbi.opts.UpperBound) 2774 2775 // If the caller requested the clone have a current view of the indexed 2776 // batch, set the clone's batch sequence number appropriately. 2777 if i.batch != nil && opts.RefreshBatchView { 2778 dbi.batchSeqNum = (uint64(len(i.batch.data)) | base.InternalKeySeqNumBatch) 2779 } 2780 2781 return finishInitializingIter(ctx, buf), nil 2782 } 2783 2784 // Merge adds all of the argument's statistics to the receiver. It may be used 2785 // to accumulate stats across multiple iterators. 2786 func (stats *IteratorStats) Merge(o IteratorStats) { 2787 for i := InterfaceCall; i < NumStatsKind; i++ { 2788 stats.ForwardSeekCount[i] += o.ForwardSeekCount[i] 2789 stats.ReverseSeekCount[i] += o.ReverseSeekCount[i] 2790 stats.ForwardStepCount[i] += o.ForwardStepCount[i] 2791 stats.ReverseStepCount[i] += o.ReverseStepCount[i] 2792 } 2793 stats.InternalStats.Merge(o.InternalStats) 2794 stats.RangeKeyStats.Merge(o.RangeKeyStats) 2795 } 2796 2797 func (stats *IteratorStats) String() string { 2798 return redact.StringWithoutMarkers(stats) 2799 } 2800 2801 // SafeFormat implements the redact.SafeFormatter interface. 2802 func (stats *IteratorStats) SafeFormat(s redact.SafePrinter, verb rune) { 2803 if stats.ReverseSeekCount[InterfaceCall] == 0 && stats.ReverseSeekCount[InternalIterCall] == 0 { 2804 s.Printf("seeked %s times (%s internal)", 2805 humanize.Count.Uint64(uint64(stats.ForwardSeekCount[InterfaceCall])), 2806 humanize.Count.Uint64(uint64(stats.ForwardSeekCount[InternalIterCall])), 2807 ) 2808 } else { 2809 s.Printf("seeked %s times (%s fwd/%s rev, internal: %s fwd/%s rev)", 2810 humanize.Count.Uint64(uint64(stats.ForwardSeekCount[InterfaceCall]+stats.ReverseSeekCount[InterfaceCall])), 2811 humanize.Count.Uint64(uint64(stats.ForwardSeekCount[InterfaceCall])), 2812 humanize.Count.Uint64(uint64(stats.ReverseSeekCount[InterfaceCall])), 2813 humanize.Count.Uint64(uint64(stats.ForwardSeekCount[InternalIterCall])), 2814 humanize.Count.Uint64(uint64(stats.ReverseSeekCount[InternalIterCall])), 2815 ) 2816 } 2817 s.SafeString("; ") 2818 2819 if stats.ReverseStepCount[InterfaceCall] == 0 && stats.ReverseStepCount[InternalIterCall] == 0 { 2820 s.Printf("stepped %s times (%s internal)", 2821 humanize.Count.Uint64(uint64(stats.ForwardStepCount[InterfaceCall])), 2822 humanize.Count.Uint64(uint64(stats.ForwardStepCount[InternalIterCall])), 2823 ) 2824 } else { 2825 s.Printf("stepped %s times (%s fwd/%s rev, internal: %s fwd/%s rev)", 2826 humanize.Count.Uint64(uint64(stats.ForwardStepCount[InterfaceCall]+stats.ReverseStepCount[InterfaceCall])), 2827 humanize.Count.Uint64(uint64(stats.ForwardStepCount[InterfaceCall])), 2828 humanize.Count.Uint64(uint64(stats.ReverseStepCount[InterfaceCall])), 2829 humanize.Count.Uint64(uint64(stats.ForwardStepCount[InternalIterCall])), 2830 humanize.Count.Uint64(uint64(stats.ReverseStepCount[InternalIterCall])), 2831 ) 2832 } 2833 2834 if stats.InternalStats != (InternalIteratorStats{}) { 2835 s.SafeString("; ") 2836 
stats.InternalStats.SafeFormat(s, verb) 2837 } 2838 if stats.RangeKeyStats != (RangeKeyIteratorStats{}) { 2839 s.SafeString(", ") 2840 stats.RangeKeyStats.SafeFormat(s, verb) 2841 } 2842 } 2843 2844 // CanDeterministicallySingleDelete takes a valid iterator and examines internal 2845 // state to determine if a SingleDelete deleting Iterator.Key() would 2846 // deterministically delete the key. CanDeterministicallySingleDelete requires 2847 // the iterator to be oriented in the forward direction (eg, the last 2848 // positioning operation must've been a First, a Seek[Prefix]GE, or a 2849 // Next[Prefix][WithLimit]). 2850 // 2851 // This function does not change the external position of the iterator, and all 2852 // positioning methods should behave the same as if it were never called. This 2853 // function will only return a meaningful result the first time it's invoked at 2854 // an iterator position. This function invalidates the iterator Value's memory, 2855 // and the caller must not rely on the memory safety of the previous Iterator 2856 // position. 2857 // 2858 // If CanDeterministicallySingleDelete returns true AND the key at the iterator 2859 // position is not modified between the creation of the Iterator and the commit 2860 // of a batch containing a SingleDelete over the key, then the caller can be 2861 // assured that SingleDelete is equivalent to Delete on the local engine. The 2862 // same may not hold on another engine that received the same writes and has 2863 // logically equivalent state, since this engine may have collapsed multiple 2864 // SETs into one. 2865 func CanDeterministicallySingleDelete(it *Iterator) (bool, error) { 2866 // This function may only be called once per external iterator position. We 2867 // can validate this by checking the last positioning operation. 2868 if it.lastPositioningOp == internalNextOp { 2869 return false, errors.New("pebble: CanDeterministicallySingleDelete called twice") 2870 } 2871 validity, kind := it.internalNext() 2872 var shadowedBySingleDelete bool 2873 for validity == internalNextValid { 2874 switch kind { 2875 case InternalKeyKindDelete, InternalKeyKindDeleteSized: 2876 // A DEL or DELSIZED tombstone is okay. An internal key 2877 // sequence like SINGLEDEL; SET; DEL; SET can be handled 2878 // deterministically. If there are SETs further down, we 2879 // don't care about them. 2880 return true, nil 2881 case InternalKeyKindSingleDelete: 2882 // A SingleDelete is okay as long as that SingleDelete was itself 2883 // written deterministically (eg, with its own 2884 // CanDeterministicallySingleDelete check). To validate that, 2885 // we'll allow one SET to appear after 2886 // the SingleDelete. 2887 shadowedBySingleDelete = true 2888 validity, kind = it.internalNext() 2889 continue 2890 case InternalKeyKindSet, InternalKeyKindSetWithDelete, InternalKeyKindMerge: 2891 // If we observed a single delete, it's allowed to delete one key. 2892 // We'll keep looping to validate that the internal keys beneath the 2893 // already-written single delete are copacetic. 2894 if shadowedBySingleDelete { 2895 shadowedBySingleDelete = false 2896 validity, kind = it.internalNext() 2897 continue 2898 } 2899 // We encountered a shadowed SET, SETWITHDEL, MERGE. A SINGLEDEL 2900 // that deleted the KV at the original iterator position could 2901 // result in this key becoming visible.
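			// For example, given the internal keys a.SET.9 a.SET.8, writing a
			// new a.SINGLEDEL.10 would elide only a.SET.9, nondeterministically
			// exposing the older a.SET.8 to subsequent reads.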
2902 return false, nil 2903 case InternalKeyKindRangeDelete: 2904 // RangeDeletes are handled by the merging iterator and should never 2905 // be observed by the top-level Iterator. 2906 panic(errors.AssertionFailedf("pebble: unexpected range delete")) 2907 case InternalKeyKindRangeKeySet, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeyDelete: 2908 // Range keys are interleaved at the maximal sequence number and 2909 // should never be observed within a user key. 2910 panic(errors.AssertionFailedf("pebble: unexpected range key")) 2911 default: 2912 panic(errors.AssertionFailedf("pebble: unexpected key kind: %s", errors.Safe(kind))) 2913 } 2914 } 2915 if validity == internalNextError { 2916 return false, it.Error() 2917 } 2918 return true, nil 2919 } 2920 2921 // internalNextValidity enumerates the potential outcomes of a call to 2922 // internalNext. 2923 type internalNextValidity int8 2924 2925 const ( 2926 // internalNextError is returned by internalNext when an error occurred and 2927 // the caller is responsible for checking iter.Error(). 2928 internalNextError internalNextValidity = iota 2929 // internalNextExhausted is returned by internalNext when the next internal 2930 // key is an internal key with a different user key than Iterator.Key(). 2931 internalNextExhausted 2932 // internalNextValid is returned by internalNext when the internal next 2933 // found a shadowed internal key with a user key equal to Iterator.Key(). 2934 internalNextValid 2935 ) 2936 2937 // internalNext advances internal Iterator state forward to expose the 2938 // InternalKeyKind of the next internal key with a user key equal to Key(). 2939 // 2940 // internalNext is a highly specialized operation and is unlikely to be 2941 // generally useful. See Iterator.Next for how to reposition the iterator to the 2942 // next key. internalNext requires the Iterator to be at a valid position in the 2943 // forward direction (the last positioning operation must've been a First, a 2944 // Seek[Prefix]GE, or a Next[Prefix][WithLimit] and Valid() must return true). 2945 // 2946 // internalNext, unlike all other Iterator methods, exposes internal LSM state. 2947 // internalNext advances the Iterator's internal iterator to the next shadowed 2948 // key with a user key equal to Key(). When a key is overwritten or deleted, its 2949 // removal from the LSM occurs lazily as a part of compactions. internalNext 2950 // allows the caller to see whether an obsolete internal key exists with the 2951 // current Key(), and what its key kind is. Note that the existence of an 2952 // internal key is nondeterministic and dependent on internal LSM state. These 2953 // semantics are inapplicable to almost all use cases. 2954 // 2955 // If internalNext finds a key that shares the same user key as Key(), it 2956 // returns internalNextValid and the internal key's kind. If internalNext 2957 // encounters an error, it returns internalNextError and the caller is expected 2958 // to call Iterator.Error() to retrieve it. In all other circumstances, 2959 // internalNext returns internalNextExhausted, indicating that there are no 2960 // additional internal keys with the user key Key(). 2961 // 2962 // internalNext does not change the external position of the iterator, and a 2963 // Next operation should behave the same as if internalNext were never called. 2964 // internalNext does invalidate the iterator Value's memory, and the caller must 2965 // not rely on the memory safety of the previous Iterator position.
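//
// A sketch of the expected call pattern, mirroring the loop in
// CanDeterministicallySingleDelete above:
//
//	validity, kind := i.internalNext()
//	for validity == internalNextValid {
//		// Inspect kind, then step to the next shadowed internal key.
//		validity, kind = i.internalNext()
//	}
//	if validity == internalNextError {
//		// Surface i.Error() to the caller.
//	}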
2966 func (i *Iterator) internalNext() (internalNextValidity, base.InternalKeyKind) { 2967 i.stats.ForwardStepCount[InterfaceCall]++ 2968 if i.err != nil { 2969 return internalNextError, base.InternalKeyKindInvalid 2970 } else if i.iterValidityState != IterValid { 2971 return internalNextExhausted, base.InternalKeyKindInvalid 2972 } 2973 i.lastPositioningOp = internalNextOp 2974 2975 switch i.pos { 2976 case iterPosCurForward: 2977 i.iterKey, i.iterValue = i.iter.Next() 2978 if i.iterKey == nil { 2979 // We check i.iter.Error() here and return an internalNextError enum 2980 // variant so that the caller does not need to check i.iter.Error() 2981 // in the common case that the next internal key has a new user key. 2982 if i.err = i.iter.Error(); i.err != nil { 2983 return internalNextError, base.InternalKeyKindInvalid 2984 } 2985 i.pos = iterPosNext 2986 return internalNextExhausted, base.InternalKeyKindInvalid 2987 } else if i.comparer.Equal(i.iterKey.UserKey, i.key) { 2988 return internalNextValid, i.iterKey.Kind() 2989 } 2990 i.pos = iterPosNext 2991 return internalNextExhausted, base.InternalKeyKindInvalid 2992 case iterPosCurReverse, iterPosCurReversePaused, iterPosPrev: 2993 i.err = errors.New("switching from reverse to forward via internalNext is prohibited") 2994 i.iterValidityState = IterExhausted 2995 return internalNextError, base.InternalKeyKindInvalid 2996 case iterPosNext, iterPosCurForwardPaused: 2997 // The previous method already moved onto the next user key. This is 2998 // only possible if 2999 // - the last positioning method was a call to internalNext, and we 3000 // advanced to a new user key. 3001 // - the previous non-internalNext iterator operation encountered a 3002 // range key or merge, forcing an internal Next that found a new 3003 // user key that's not equal to i.Iterator.Key(). 3004 return internalNextExhausted, base.InternalKeyKindInvalid 3005 default: 3006 panic("unreachable") 3007 } 3008 }
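
// singleDeleteIfDeterministic is an illustrative sketch, and not part of the
// original file: the helper's name, its fallback-to-Delete policy, and the
// use of the Sync write option are assumptions. It shows the intended use of
// CanDeterministicallySingleDelete: position an Iterator on the key in the
// forward direction, ask whether a SingleDelete would be deterministic on
// this engine, and only then write a SINGLEDEL (falling back to an ordinary
// DEL otherwise). The caller must separately guarantee that the key is not
// modified between iterator creation and batch commit.
func singleDeleteIfDeterministic(d *DB, key []byte) error {
	it, err := d.NewIter(nil)
	if err != nil {
		return err
	}
	defer it.Close()
	if !it.SeekGE(key) || !bytes.Equal(it.Key(), key) {
		// The key is not present; there is nothing to delete.
		return it.Error()
	}
	ok, err := CanDeterministicallySingleDelete(it)
	if err != nil {
		return err
	}
	b := d.NewBatch()
	defer b.Close()
	if ok {
		err = b.SingleDelete(key, nil)
	} else {
		err = b.Delete(key, nil)
	}
	if err != nil {
		return err
	}
	return b.Commit(Sync)
}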