github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/storage/pebble_mvcc_scanner.go (about) 1 // Copyright 2019 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package storage 12 13 import ( 14 "bytes" 15 "encoding/binary" 16 "sort" 17 "sync" 18 19 "github.com/cockroachdb/cockroach/pkg/roachpb" 20 "github.com/cockroachdb/cockroach/pkg/storage/enginepb" 21 "github.com/cockroachdb/cockroach/pkg/util/hlc" 22 "github.com/cockroachdb/cockroach/pkg/util/protoutil" 23 "github.com/cockroachdb/errors" 24 "github.com/cockroachdb/pebble" 25 ) 26 27 const ( 28 maxItersBeforeSeek = 10 29 ) 30 31 // Struct to store MVCCScan / MVCCGet in the same binary format as that 32 // expected by MVCCScanDecodeKeyValue. 33 type pebbleResults struct { 34 count int64 35 bytes int64 36 repr []byte 37 bufs [][]byte 38 } 39 40 func (p *pebbleResults) clear() { 41 *p = pebbleResults{} 42 } 43 44 // The repr that MVCCScan / MVCCGet expects to provide as output goes: 45 // <valueLen:Uint32><keyLen:Uint32><Key><Value> 46 // This function adds to repr in that format. 47 func (p *pebbleResults) put(key MVCCKey, value []byte) { 48 // Key value lengths take up 8 bytes (2 x Uint32). 49 const kvLenSize = 8 50 const minSize = 16 51 const maxSize = 128 << 20 // 128 MB 52 53 // We maintain a list of buffers, always encoding into the last one (a.k.a. 54 // pebbleResults.repr). The size of the buffers is exponentially increasing, 55 // capped at maxSize. 56 lenKey := key.Len() 57 lenToAdd := kvLenSize + lenKey + len(value) 58 if len(p.repr)+lenToAdd > cap(p.repr) { 59 newSize := 2 * cap(p.repr) 60 if newSize == 0 { 61 newSize = minSize 62 } 63 for newSize < lenToAdd && newSize < maxSize { 64 newSize *= 2 65 } 66 if len(p.repr) > 0 { 67 p.bufs = append(p.bufs, p.repr) 68 } 69 p.repr = nonZeroingMakeByteSlice(newSize)[:0] 70 } 71 72 startIdx := len(p.repr) 73 p.repr = p.repr[:startIdx+lenToAdd] 74 binary.LittleEndian.PutUint32(p.repr[startIdx:], uint32(len(value))) 75 binary.LittleEndian.PutUint32(p.repr[startIdx+4:], uint32(lenKey)) 76 encodeKeyToBuf(p.repr[startIdx+kvLenSize:startIdx+kvLenSize+lenKey], key, lenKey) 77 copy(p.repr[startIdx+kvLenSize+lenKey:], value) 78 p.count++ 79 p.bytes += int64(lenToAdd) 80 } 81 82 func (p *pebbleResults) finish() [][]byte { 83 if len(p.repr) > 0 { 84 p.bufs = append(p.bufs, p.repr) 85 p.repr = nil 86 } 87 return p.bufs 88 } 89 90 // Go port of mvccScanner in libroach/mvcc.h. Stores all variables relating to 91 // one MVCCGet / MVCCScan call. 92 type pebbleMVCCScanner struct { 93 parent Iterator 94 reverse bool 95 peeked bool 96 // Iteration bounds. Does not contain MVCC timestamp. 97 start, end roachpb.Key 98 // Timestamp with which MVCCScan/MVCCGet was called. 99 ts hlc.Timestamp 100 // Max number of keys to return. Note that targetBytes below is implemented 101 // by mutating maxKeys. (In particular, one must not assume that if maxKeys 102 // is zero initially it will always be zero). 103 maxKeys int64 104 // Stop adding keys once p.result.bytes matches or exceeds this threshold, 105 // if nonzero. 106 targetBytes int64 107 // Transaction epoch and sequence number. 108 txn *roachpb.Transaction 109 txnEpoch enginepb.TxnEpoch 110 txnSequence enginepb.TxnSeq 111 txnIgnoredSeqNums []enginepb.IgnoredSeqNumRange 112 // Metadata object for unmarshalling intents. 113 meta enginepb.MVCCMetadata 114 // Bools copied over from MVCC{Scan,Get}Options. See the comment on the 115 // package level MVCCScan for what these mean. 116 inconsistent, tombstones bool 117 failOnMoreRecent bool 118 checkUncertainty bool 119 isGet bool 120 keyBuf []byte 121 savedBuf []byte 122 // cur* variables store the "current" record we're pointing to. Updated in 123 // updateCurrent. 124 curKey MVCCKey 125 curValue []byte 126 results pebbleResults 127 intents pebble.Batch 128 // Stores any error returned. If non-nil, iteration short circuits. 129 err error 130 // Number of iterations to try before we do a Seek/SeekReverse. Stays within 131 // [1, maxItersBeforeSeek] and defaults to maxItersBeforeSeek/2 . 132 itersBeforeSeek int 133 } 134 135 // Pool for allocating pebble MVCC Scanners. 136 var pebbleMVCCScannerPool = sync.Pool{ 137 New: func() interface{} { 138 return &pebbleMVCCScanner{} 139 }, 140 } 141 142 // init sets bounds on the underlying pebble iterator, and initializes other 143 // fields not set by the calling method. 144 func (p *pebbleMVCCScanner) init(txn *roachpb.Transaction) { 145 p.itersBeforeSeek = maxItersBeforeSeek / 2 146 147 if txn != nil { 148 p.txn = txn 149 p.txnEpoch = txn.Epoch 150 p.txnSequence = txn.Sequence 151 p.txnIgnoredSeqNums = txn.IgnoredSeqNums 152 p.checkUncertainty = p.ts.Less(txn.MaxTimestamp) 153 } 154 } 155 156 // get iterates exactly once and adds one KV to the result set. 157 func (p *pebbleMVCCScanner) get() { 158 p.isGet = true 159 p.parent.SeekGE(MVCCKey{Key: p.start}) 160 if !p.updateCurrent() { 161 return 162 } 163 p.getAndAdvance() 164 } 165 166 // scan iterates until maxKeys records are in results, or the underlying 167 // iterator is exhausted, or an error is encountered. 168 func (p *pebbleMVCCScanner) scan() (*roachpb.Span, error) { 169 p.isGet = false 170 if p.reverse { 171 if !p.iterSeekReverse(MVCCKey{Key: p.end}) { 172 return nil, p.err 173 } 174 } else { 175 if !p.iterSeek(MVCCKey{Key: p.start}) { 176 return nil, p.err 177 } 178 } 179 180 for p.getAndAdvance() { 181 } 182 183 var resume *roachpb.Span 184 if p.maxKeys > 0 && p.results.count == p.maxKeys && p.advanceKey() { 185 if p.reverse { 186 // curKey was not added to results, so it needs to be included in the 187 // resume span. 188 // 189 // NB: this is equivalent to: 190 // append(roachpb.Key(nil), p.curKey.Key...).Next() 191 // but with half the allocations. 192 curKey := p.curKey.Key 193 curKeyCopy := make(roachpb.Key, len(curKey), len(curKey)+1) 194 copy(curKeyCopy, curKey) 195 resume = &roachpb.Span{ 196 Key: p.start, 197 EndKey: curKeyCopy.Next(), 198 } 199 } else { 200 resume = &roachpb.Span{ 201 Key: append(roachpb.Key(nil), p.curKey.Key...), 202 EndKey: p.end, 203 } 204 } 205 } 206 return resume, p.err 207 } 208 209 // Increments itersBeforeSeek while ensuring it stays <= maxItersBeforeSeek 210 func (p *pebbleMVCCScanner) incrementItersBeforeSeek() { 211 p.itersBeforeSeek++ 212 if p.itersBeforeSeek > maxItersBeforeSeek { 213 p.itersBeforeSeek = maxItersBeforeSeek 214 } 215 } 216 217 // Decrements itersBeforeSeek while ensuring it stays positive. 218 func (p *pebbleMVCCScanner) decrementItersBeforeSeek() { 219 p.itersBeforeSeek-- 220 if p.itersBeforeSeek < 1 { 221 p.itersBeforeSeek = 1 222 } 223 } 224 225 // Try to read from the current value's intent history. Assumes p.meta has been 226 // unmarshalled already. Returns found = true if a value was found and returned. 227 func (p *pebbleMVCCScanner) getFromIntentHistory() (value []byte, found bool) { 228 intentHistory := p.meta.IntentHistory 229 // upIdx is the index of the first intent in intentHistory with a sequence 230 // number greater than our transaction's sequence number. Subtract 1 from it 231 // to get the index of the intent with the highest sequence number that is 232 // still less than or equal to p.txnSeq. 233 upIdx := sort.Search(len(intentHistory), func(i int) bool { 234 return intentHistory[i].Sequence > p.txnSequence 235 }) 236 // If the candidate intent has a sequence number that is ignored by this txn, 237 // iterate backward along the sorted intent history until we come across an 238 // intent which isn't ignored. 239 // 240 // TODO(itsbilal): Explore if this iteration can be improved through binary 241 // search. 242 for upIdx > 0 && enginepb.TxnSeqIsIgnored(p.meta.IntentHistory[upIdx-1].Sequence, p.txnIgnoredSeqNums) { 243 upIdx-- 244 } 245 if upIdx == 0 { 246 // It is possible that no intent exists such that the sequence is less 247 // than the read sequence, and is not ignored by this transaction. 248 // In this case, we cannot read a value from the intent history. 249 return nil, false 250 } 251 intent := &p.meta.IntentHistory[upIdx-1] 252 return intent.Value, true 253 } 254 255 // Returns a write too old error with the specified timestamp. 256 func (p *pebbleMVCCScanner) writeTooOldError(ts hlc.Timestamp) bool { 257 // The txn can't write at the existing timestamp, so we provide the error 258 // with the timestamp immediately after it. 259 p.err = roachpb.NewWriteTooOldError(p.ts, ts.Next()) 260 p.results.clear() 261 p.intents.Reset() 262 return false 263 } 264 265 // Returns an uncertainty error with the specified timestamp and p.txn. 266 func (p *pebbleMVCCScanner) uncertaintyError(ts hlc.Timestamp) bool { 267 p.err = roachpb.NewReadWithinUncertaintyIntervalError(p.ts, ts, p.txn) 268 p.results.clear() 269 p.intents.Reset() 270 return false 271 } 272 273 // Emit a tuple and return true if we have reason to believe iteration can 274 // continue. 275 func (p *pebbleMVCCScanner) getAndAdvance() bool { 276 if p.curKey.Timestamp != (hlc.Timestamp{}) { 277 if p.curKey.Timestamp.LessEq(p.ts) { 278 // 1. Fast path: there is no intent and our read timestamp is newer than 279 // the most recent version's timestamp. 280 return p.addAndAdvance(p.curValue) 281 } 282 283 if p.failOnMoreRecent { 284 // 2. Our txn's read timestamp is less than the most recent 285 // version's timestamp and the scanner has been configured 286 // to throw a write too old error on more recent versions. 287 return p.writeTooOldError(p.curKey.Timestamp) 288 } 289 290 if p.checkUncertainty { 291 // 3. Our txn's read timestamp is less than the max timestamp 292 // seen by the txn. We need to check for clock uncertainty 293 // errors. 294 if p.curKey.Timestamp.LessEq(p.txn.MaxTimestamp) { 295 return p.uncertaintyError(p.curKey.Timestamp) 296 } 297 298 return p.seekVersion(p.txn.MaxTimestamp, true) 299 } 300 301 // 4. Our txn's read timestamp is greater than or equal to the 302 // max timestamp seen by the txn so clock uncertainty checks are 303 // unnecessary. We need to seek to the desired version of the 304 // value (i.e. one with a timestamp earlier than our read 305 // timestamp). 306 return p.seekVersion(p.ts, false) 307 } 308 309 if len(p.curValue) == 0 { 310 p.err = errors.Errorf("zero-length mvcc metadata") 311 return false 312 } 313 err := protoutil.Unmarshal(p.curValue, &p.meta) 314 if err != nil { 315 p.err = errors.Errorf("unable to decode MVCCMetadata: %s", err) 316 return false 317 } 318 if len(p.meta.RawBytes) != 0 { 319 // 5. Emit immediately if the value is inline. 320 return p.addAndAdvance(p.meta.RawBytes) 321 } 322 323 if p.meta.Txn == nil { 324 p.err = errors.Errorf("intent without transaction") 325 return false 326 } 327 metaTS := hlc.Timestamp(p.meta.Timestamp) 328 329 // metaTS is the timestamp of an intent value, which we may or may 330 // not end up ignoring, depending on factors codified below. If we do ignore 331 // the intent then we want to read at a lower timestamp that's strictly 332 // below the intent timestamp (to skip the intent), but also does not exceed 333 // our read timestamp (to avoid erroneously picking up future committed 334 // values); this timestamp is prevTS. 335 prevTS := p.ts 336 if metaTS.LessEq(p.ts) { 337 prevTS = metaTS.Prev() 338 } 339 340 ownIntent := p.txn != nil && p.meta.Txn.ID.Equal(p.txn.ID) 341 maxVisibleTS := p.ts 342 if p.checkUncertainty { 343 maxVisibleTS = p.txn.MaxTimestamp 344 } 345 otherIntentVisible := metaTS.LessEq(maxVisibleTS) || p.failOnMoreRecent 346 347 if !ownIntent && !otherIntentVisible { 348 // 6. The key contains an intent, but we're reading before the 349 // intent. Seek to the desired version. Note that if we own the 350 // intent (i.e. we're reading transactionally) we want to read 351 // the intent regardless of our read timestamp and fall into 352 // case 8 below. 353 return p.seekVersion(p.ts, false) 354 } 355 356 if p.inconsistent { 357 // 7. The key contains an intent and we're doing an inconsistent 358 // read at a timestamp newer than the intent. We ignore the 359 // intent by insisting that the timestamp we're reading at is a 360 // historical timestamp < the intent timestamp. However, we 361 // return the intent separately; the caller may want to resolve 362 // it. 363 if p.maxKeys > 0 && p.results.count == p.maxKeys { 364 // We've already retrieved the desired number of keys and now 365 // we're adding the resume key. We don't want to add the 366 // intent here as the intents should only correspond to KVs 367 // that lie before the resume key. 368 return false 369 } 370 p.keyBuf = EncodeKeyToBuf(p.keyBuf[:0], p.curKey) 371 p.err = p.intents.Set(p.keyBuf, p.curValue, nil) 372 if p.err != nil { 373 return false 374 } 375 376 return p.seekVersion(prevTS, false) 377 } 378 379 if !ownIntent { 380 // 8. The key contains an intent which was not written by our 381 // transaction and either: 382 // - our read timestamp is equal to or newer than that of the 383 // intent 384 // - our read timestamp is older than that of the intent but 385 // the intent is in our transaction's uncertainty interval 386 // - our read timestamp is older than that of the intent but 387 // we want to fail on more recent writes 388 // Note that this will trigger an error higher up the stack. We 389 // continue scanning so that we can return all of the intents 390 // in the scan range. 391 p.keyBuf = EncodeKeyToBuf(p.keyBuf[:0], p.curKey) 392 p.err = p.intents.Set(p.keyBuf, p.curValue, nil) 393 if p.err != nil { 394 return false 395 } 396 return p.advanceKey() 397 } 398 399 if p.txnEpoch == p.meta.Txn.Epoch { 400 if p.txnSequence >= p.meta.Txn.Sequence && !enginepb.TxnSeqIsIgnored(p.meta.Txn.Sequence, p.txnIgnoredSeqNums) { 401 // 9. We're reading our own txn's intent at an equal or higher sequence. 402 // Note that we read at the intent timestamp, not at our read timestamp 403 // as the intent timestamp may have been pushed forward by another 404 // transaction. Txn's always need to read their own writes. 405 return p.seekVersion(metaTS, false) 406 } 407 408 // 10. We're reading our own txn's intent at a lower sequence than is 409 // currently present in the intent. This means the intent we're seeing 410 // was written at a higher sequence than the read and that there may or 411 // may not be earlier versions of the intent (with lower sequence 412 // numbers) that we should read. If there exists a value in the intent 413 // history that has a sequence number equal to or less than the read 414 // sequence, read that value. 415 if value, found := p.getFromIntentHistory(); found { 416 return p.addAndAdvance(value) 417 } 418 // 11. If no value in the intent history has a sequence number equal to 419 // or less than the read, we must ignore the intents laid down by the 420 // transaction all together. We ignore the intent by insisting that the 421 // timestamp we're reading at is a historical timestamp < the intent 422 // timestamp. 423 return p.seekVersion(prevTS, false) 424 } 425 426 if p.txnEpoch < p.meta.Txn.Epoch { 427 // 12. We're reading our own txn's intent but the current txn has 428 // an earlier epoch than the intent. Return an error so that the 429 // earlier incarnation of our transaction aborts (presumably 430 // this is some operation that was retried). 431 p.err = errors.Errorf("failed to read with epoch %d due to a write intent with epoch %d", 432 p.txnEpoch, p.meta.Txn.Epoch) 433 return false 434 } 435 436 // 13. We're reading our own txn's intent but the current txn has a 437 // later epoch than the intent. This can happen if the txn was 438 // restarted and an earlier iteration wrote the value we're now 439 // reading. In this case, we ignore the intent and read the 440 // previous value as if the transaction were starting fresh. 441 return p.seekVersion(prevTS, false) 442 } 443 444 // nextKey advances to the next user key. 445 func (p *pebbleMVCCScanner) nextKey() bool { 446 p.keyBuf = append(p.keyBuf[:0], p.curKey.Key...) 447 448 for i := 0; i < p.itersBeforeSeek; i++ { 449 if !p.iterNext() { 450 return false 451 } 452 if !bytes.Equal(p.curKey.Key, p.keyBuf) { 453 p.incrementItersBeforeSeek() 454 return true 455 } 456 } 457 458 p.decrementItersBeforeSeek() 459 // We're pointed at a different version of the same key. Fall back to 460 // seeking to the next key. We append a NUL to account for the "next-key". 461 p.keyBuf = append(p.keyBuf, 0) 462 return p.iterSeek(MVCCKey{Key: p.keyBuf}) 463 } 464 465 // backwardLatestVersion backs up the iterator to the latest version for the 466 // specified key. The parameter i is used to maintain iteration count between 467 // the loop here and the caller (usually prevKey). Returns false if the 468 // iterator was exhausted. Assumes that the iterator is currently positioned at 469 // the oldest version of key. 470 func (p *pebbleMVCCScanner) backwardLatestVersion(key []byte, i int) bool { 471 p.keyBuf = append(p.keyBuf[:0], key...) 472 473 for ; i < p.itersBeforeSeek; i++ { 474 peekedKey, ok := p.iterPeekPrev() 475 if !ok { 476 // No previous entry exists, so we're at the latest version of key. 477 return true 478 } 479 if !bytes.Equal(peekedKey, p.keyBuf) { 480 p.incrementItersBeforeSeek() 481 return true 482 } 483 if !p.iterPrev() { 484 return false 485 } 486 } 487 488 p.decrementItersBeforeSeek() 489 return p.iterSeek(MVCCKey{Key: p.keyBuf}) 490 } 491 492 // prevKey advances to the newest version of the user key preceding the 493 // specified key. Assumes that the iterator is currently positioned at 494 // key or 1 record after key. 495 func (p *pebbleMVCCScanner) prevKey(key []byte) bool { 496 p.keyBuf = append(p.keyBuf[:0], key...) 497 498 for i := 0; i < p.itersBeforeSeek; i++ { 499 peekedKey, ok := p.iterPeekPrev() 500 if !ok { 501 return false 502 } 503 if !bytes.Equal(peekedKey, p.keyBuf) { 504 return p.backwardLatestVersion(peekedKey, i+1) 505 } 506 if !p.iterPrev() { 507 return false 508 } 509 } 510 511 p.decrementItersBeforeSeek() 512 return p.iterSeekReverse(MVCCKey{Key: p.keyBuf}) 513 } 514 515 // advanceKey advances to the next key in the iterator's direction. 516 func (p *pebbleMVCCScanner) advanceKey() bool { 517 if p.isGet { 518 return false 519 } 520 if p.reverse { 521 return p.prevKey(p.curKey.Key) 522 } 523 return p.nextKey() 524 } 525 526 // advanceKeyAtEnd advances to the next key when the iterator's end has been 527 // reached. 528 func (p *pebbleMVCCScanner) advanceKeyAtEnd() bool { 529 if p.reverse { 530 // Iterating to the next key might have caused the iterator to reach the 531 // end of the key space. If that happens, back up to the very last key. 532 p.peeked = false 533 p.parent.SeekLT(MVCCKey{Key: p.end}) 534 if !p.updateCurrent() { 535 return false 536 } 537 return p.advanceKey() 538 } 539 // We've reached the end of the iterator and there is nothing left to do. 540 return false 541 } 542 543 // advanceKeyAtNewKey advances to the key after the specified key, assuming we 544 // have just reached the specified key. 545 func (p *pebbleMVCCScanner) advanceKeyAtNewKey(key []byte) bool { 546 if p.reverse { 547 // We've advanced to the next key but need to move back to the previous 548 // key. 549 return p.prevKey(key) 550 } 551 // We're already at the new key so there is nothing to do. 552 return true 553 } 554 555 // Adds the specified value to the result set, excluding tombstones unless 556 // p.tombstones is true. Advances to the next key unless we've reached the max 557 // results limit. 558 func (p *pebbleMVCCScanner) addAndAdvance(val []byte) bool { 559 // Don't include deleted versions len(val) == 0, unless we've been instructed 560 // to include tombstones in the results. 561 if len(val) > 0 || p.tombstones { 562 p.results.put(p.curKey, val) 563 if p.targetBytes > 0 && p.results.bytes >= p.targetBytes { 564 // When the target bytes are met or exceeded, stop producing more 565 // keys. We implement this by reducing maxKeys to the current 566 // number of keys. 567 // 568 // TODO(bilal): see if this can be implemented more transparently. 569 p.maxKeys = p.results.count 570 } 571 if p.maxKeys > 0 && p.results.count == p.maxKeys { 572 return false 573 } 574 } 575 return p.advanceKey() 576 } 577 578 // Seeks to the latest revision of the current key that's still less than or 579 // equal to the specified timestamp, adds it to the result set, then moves onto 580 // the next user key. 581 func (p *pebbleMVCCScanner) seekVersion(ts hlc.Timestamp, uncertaintyCheck bool) bool { 582 key := MVCCKey{Key: p.curKey.Key, Timestamp: ts} 583 p.keyBuf = EncodeKeyToBuf(p.keyBuf[:0], key) 584 origKey := p.keyBuf[:len(p.curKey.Key)] 585 586 for i := 0; i < p.itersBeforeSeek; i++ { 587 if !p.iterNext() { 588 return p.advanceKeyAtEnd() 589 } 590 if !bytes.Equal(p.curKey.Key, origKey) { 591 p.incrementItersBeforeSeek() 592 return p.advanceKeyAtNewKey(origKey) 593 } 594 if p.curKey.Timestamp.LessEq(ts) { 595 p.incrementItersBeforeSeek() 596 if uncertaintyCheck && p.ts.Less(p.curKey.Timestamp) { 597 return p.uncertaintyError(p.curKey.Timestamp) 598 } 599 return p.addAndAdvance(p.curValue) 600 } 601 } 602 603 p.decrementItersBeforeSeek() 604 if !p.iterSeek(key) { 605 return p.advanceKeyAtEnd() 606 } 607 if !bytes.Equal(p.curKey.Key, origKey) { 608 return p.advanceKeyAtNewKey(origKey) 609 } 610 if p.curKey.Timestamp.LessEq(ts) { 611 if uncertaintyCheck && p.ts.Less(p.curKey.Timestamp) { 612 return p.uncertaintyError(p.curKey.Timestamp) 613 } 614 return p.addAndAdvance(p.curValue) 615 } 616 return p.advanceKey() 617 } 618 619 // Updates cur{RawKey, Key, TS} to match record the iterator is pointing to. 620 func (p *pebbleMVCCScanner) updateCurrent() bool { 621 if !p.iterValid() { 622 return false 623 } 624 625 p.curKey = p.parent.UnsafeKey() 626 p.curValue = p.parent.UnsafeValue() 627 return true 628 } 629 630 func (p *pebbleMVCCScanner) iterValid() bool { 631 if valid, err := p.parent.Valid(); !valid { 632 p.err = err 633 return false 634 } 635 return true 636 } 637 638 // iterSeek seeks to the latest revision of the specified key (or a greater key). 639 func (p *pebbleMVCCScanner) iterSeek(key MVCCKey) bool { 640 p.clearPeeked() 641 p.parent.SeekGE(key) 642 return p.updateCurrent() 643 } 644 645 // iterSeekReverse seeks to the latest revision of the key before the specified key. 646 func (p *pebbleMVCCScanner) iterSeekReverse(key MVCCKey) bool { 647 p.clearPeeked() 648 p.parent.SeekLT(key) 649 if !p.updateCurrent() { 650 // We have seeked to before the start key. Return. 651 return false 652 } 653 654 if p.curKey.Timestamp == (hlc.Timestamp{}) { 655 // We landed on an intent or inline value. 656 return true 657 } 658 // We landed on a versioned value, we need to back up to find the 659 // latest version. 660 return p.backwardLatestVersion(p.curKey.Key, 0) 661 } 662 663 // iterNext advances to the next MVCC key. 664 func (p *pebbleMVCCScanner) iterNext() bool { 665 if p.reverse && p.peeked { 666 // If we have peeked at the previous entry, we need to advance the iterator 667 // twice. 668 p.peeked = false 669 if !p.iterValid() { 670 // We were peeked off the beginning of iteration. Seek to the first 671 // entry, and then advance one step. 672 p.parent.SeekGE(MVCCKey{Key: p.start}) 673 if !p.iterValid() { 674 return false 675 } 676 p.parent.Next() 677 return p.updateCurrent() 678 } 679 p.parent.Next() 680 if !p.iterValid() { 681 return false 682 } 683 } 684 p.parent.Next() 685 return p.updateCurrent() 686 } 687 688 // iterPrev advances to the previous MVCC Key. 689 func (p *pebbleMVCCScanner) iterPrev() bool { 690 if p.peeked { 691 p.peeked = false 692 return p.updateCurrent() 693 } 694 p.parent.Prev() 695 return p.updateCurrent() 696 } 697 698 // Peek the previous key and store the result in peekedKey. Note that this 699 // moves the iterator backward, while leaving p.cur{key,value,rawKey} untouched 700 // and therefore out of sync. iterPrev and iterNext take this into account. 701 func (p *pebbleMVCCScanner) iterPeekPrev() ([]byte, bool) { 702 if !p.peeked { 703 p.peeked = true 704 // We need to save a copy of the current iterator key and value and adjust 705 // curRawKey, curKey and curValue to point to this saved data. We use a 706 // single buffer for this purpose: savedBuf. 707 p.savedBuf = append(p.savedBuf[:0], p.curKey.Key...) 708 p.savedBuf = append(p.savedBuf, p.curValue...) 709 p.curKey.Key = p.savedBuf[:len(p.curKey.Key)] 710 p.curValue = p.savedBuf[len(p.curKey.Key):] 711 712 // With the current iterator state saved we can move the iterator to the 713 // previous entry. 714 p.parent.Prev() 715 if !p.iterValid() { 716 // The iterator is now invalid, but note that this case is handled in 717 // both iterNext and iterPrev. In the former case, we'll position the 718 // iterator at the first entry, and in the latter iteration will be done. 719 return nil, false 720 } 721 } else if !p.iterValid() { 722 return nil, false 723 } 724 725 peekedKey := p.parent.UnsafeKey() 726 return peekedKey.Key, true 727 } 728 729 // Clear the peeked flag. Call this before any iterator operations. 730 func (p *pebbleMVCCScanner) clearPeeked() { 731 if p.reverse { 732 p.peeked = false 733 } 734 } 735 736 func (p *pebbleMVCCScanner) intentsRepr() []byte { 737 if p.intents.Count() == 0 { 738 return nil 739 } 740 return p.intents.Repr() 741 }