github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/storage/pebble_mvcc_scanner.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package storage
    12  
    13  import (
    14  	"bytes"
    15  	"encoding/binary"
    16  	"sort"
    17  	"sync"
    18  
    19  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    20  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    21  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    22  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    23  	"github.com/cockroachdb/errors"
    24  	"github.com/cockroachdb/pebble"
    25  )
    26  
    27  const (
    28  	maxItersBeforeSeek = 10
    29  )
    30  
    31  // Struct to store MVCCScan / MVCCGet in the same binary format as that
    32  // expected by MVCCScanDecodeKeyValue.
    33  type pebbleResults struct {
    34  	count int64
    35  	bytes int64
    36  	repr  []byte
    37  	bufs  [][]byte
    38  }
    39  
    40  func (p *pebbleResults) clear() {
    41  	*p = pebbleResults{}
    42  }
    43  
    44  // The repr that MVCCScan / MVCCGet expects to provide as output goes:
    45  // <valueLen:Uint32><keyLen:Uint32><Key><Value>
    46  // This function adds to repr in that format.
    47  func (p *pebbleResults) put(key MVCCKey, value []byte) {
    48  	// Key value lengths take up 8 bytes (2 x Uint32).
    49  	const kvLenSize = 8
    50  	const minSize = 16
    51  	const maxSize = 128 << 20 // 128 MB
    52  
    53  	// We maintain a list of buffers, always encoding into the last one (a.k.a.
    54  	// pebbleResults.repr). The size of the buffers is exponentially increasing,
    55  	// capped at maxSize.
    56  	lenKey := key.Len()
    57  	lenToAdd := kvLenSize + lenKey + len(value)
    58  	if len(p.repr)+lenToAdd > cap(p.repr) {
    59  		newSize := 2 * cap(p.repr)
    60  		if newSize == 0 {
    61  			newSize = minSize
    62  		}
    63  		for newSize < lenToAdd && newSize < maxSize {
    64  			newSize *= 2
    65  		}
    66  		if len(p.repr) > 0 {
    67  			p.bufs = append(p.bufs, p.repr)
    68  		}
    69  		p.repr = nonZeroingMakeByteSlice(newSize)[:0]
    70  	}
    71  
    72  	startIdx := len(p.repr)
    73  	p.repr = p.repr[:startIdx+lenToAdd]
    74  	binary.LittleEndian.PutUint32(p.repr[startIdx:], uint32(len(value)))
    75  	binary.LittleEndian.PutUint32(p.repr[startIdx+4:], uint32(lenKey))
    76  	encodeKeyToBuf(p.repr[startIdx+kvLenSize:startIdx+kvLenSize+lenKey], key, lenKey)
    77  	copy(p.repr[startIdx+kvLenSize+lenKey:], value)
    78  	p.count++
    79  	p.bytes += int64(lenToAdd)
    80  }
    81  
    82  func (p *pebbleResults) finish() [][]byte {
    83  	if len(p.repr) > 0 {
    84  		p.bufs = append(p.bufs, p.repr)
    85  		p.repr = nil
    86  	}
    87  	return p.bufs
    88  }
    89  
    90  // Go port of mvccScanner in libroach/mvcc.h. Stores all variables relating to
    91  // one MVCCGet / MVCCScan call.
    92  type pebbleMVCCScanner struct {
    93  	parent  Iterator
    94  	reverse bool
    95  	peeked  bool
    96  	// Iteration bounds. Does not contain MVCC timestamp.
    97  	start, end roachpb.Key
    98  	// Timestamp with which MVCCScan/MVCCGet was called.
    99  	ts hlc.Timestamp
   100  	// Max number of keys to return. Note that targetBytes below is implemented
   101  	// by mutating maxKeys. (In particular, one must not assume that if maxKeys
   102  	// is zero initially it will always be zero).
   103  	maxKeys int64
   104  	// Stop adding keys once p.result.bytes matches or exceeds this threshold,
   105  	// if nonzero.
   106  	targetBytes int64
   107  	// Transaction epoch and sequence number.
   108  	txn               *roachpb.Transaction
   109  	txnEpoch          enginepb.TxnEpoch
   110  	txnSequence       enginepb.TxnSeq
   111  	txnIgnoredSeqNums []enginepb.IgnoredSeqNumRange
   112  	// Metadata object for unmarshalling intents.
   113  	meta enginepb.MVCCMetadata
   114  	// Bools copied over from MVCC{Scan,Get}Options. See the comment on the
   115  	// package level MVCCScan for what these mean.
   116  	inconsistent, tombstones bool
   117  	failOnMoreRecent         bool
   118  	checkUncertainty         bool
   119  	isGet                    bool
   120  	keyBuf                   []byte
   121  	savedBuf                 []byte
   122  	// cur* variables store the "current" record we're pointing to. Updated in
   123  	// updateCurrent.
   124  	curKey   MVCCKey
   125  	curValue []byte
   126  	results  pebbleResults
   127  	intents  pebble.Batch
   128  	// Stores any error returned. If non-nil, iteration short circuits.
   129  	err error
   130  	// Number of iterations to try before we do a Seek/SeekReverse. Stays within
   131  	// [1, maxItersBeforeSeek] and defaults to maxItersBeforeSeek/2 .
   132  	itersBeforeSeek int
   133  }
   134  
   135  // Pool for allocating pebble MVCC Scanners.
   136  var pebbleMVCCScannerPool = sync.Pool{
   137  	New: func() interface{} {
   138  		return &pebbleMVCCScanner{}
   139  	},
   140  }
   141  
   142  // init sets bounds on the underlying pebble iterator, and initializes other
   143  // fields not set by the calling method.
   144  func (p *pebbleMVCCScanner) init(txn *roachpb.Transaction) {
   145  	p.itersBeforeSeek = maxItersBeforeSeek / 2
   146  
   147  	if txn != nil {
   148  		p.txn = txn
   149  		p.txnEpoch = txn.Epoch
   150  		p.txnSequence = txn.Sequence
   151  		p.txnIgnoredSeqNums = txn.IgnoredSeqNums
   152  		p.checkUncertainty = p.ts.Less(txn.MaxTimestamp)
   153  	}
   154  }
   155  
   156  // get iterates exactly once and adds one KV to the result set.
   157  func (p *pebbleMVCCScanner) get() {
   158  	p.isGet = true
   159  	p.parent.SeekGE(MVCCKey{Key: p.start})
   160  	if !p.updateCurrent() {
   161  		return
   162  	}
   163  	p.getAndAdvance()
   164  }
   165  
   166  // scan iterates until maxKeys records are in results, or the underlying
   167  // iterator is exhausted, or an error is encountered.
   168  func (p *pebbleMVCCScanner) scan() (*roachpb.Span, error) {
   169  	p.isGet = false
   170  	if p.reverse {
   171  		if !p.iterSeekReverse(MVCCKey{Key: p.end}) {
   172  			return nil, p.err
   173  		}
   174  	} else {
   175  		if !p.iterSeek(MVCCKey{Key: p.start}) {
   176  			return nil, p.err
   177  		}
   178  	}
   179  
   180  	for p.getAndAdvance() {
   181  	}
   182  
   183  	var resume *roachpb.Span
   184  	if p.maxKeys > 0 && p.results.count == p.maxKeys && p.advanceKey() {
   185  		if p.reverse {
   186  			// curKey was not added to results, so it needs to be included in the
   187  			// resume span.
   188  			//
   189  			// NB: this is equivalent to:
   190  			//  append(roachpb.Key(nil), p.curKey.Key...).Next()
   191  			// but with half the allocations.
   192  			curKey := p.curKey.Key
   193  			curKeyCopy := make(roachpb.Key, len(curKey), len(curKey)+1)
   194  			copy(curKeyCopy, curKey)
   195  			resume = &roachpb.Span{
   196  				Key:    p.start,
   197  				EndKey: curKeyCopy.Next(),
   198  			}
   199  		} else {
   200  			resume = &roachpb.Span{
   201  				Key:    append(roachpb.Key(nil), p.curKey.Key...),
   202  				EndKey: p.end,
   203  			}
   204  		}
   205  	}
   206  	return resume, p.err
   207  }
   208  
   209  // Increments itersBeforeSeek while ensuring it stays <= maxItersBeforeSeek
   210  func (p *pebbleMVCCScanner) incrementItersBeforeSeek() {
   211  	p.itersBeforeSeek++
   212  	if p.itersBeforeSeek > maxItersBeforeSeek {
   213  		p.itersBeforeSeek = maxItersBeforeSeek
   214  	}
   215  }
   216  
   217  // Decrements itersBeforeSeek while ensuring it stays positive.
   218  func (p *pebbleMVCCScanner) decrementItersBeforeSeek() {
   219  	p.itersBeforeSeek--
   220  	if p.itersBeforeSeek < 1 {
   221  		p.itersBeforeSeek = 1
   222  	}
   223  }
   224  
   225  // Try to read from the current value's intent history. Assumes p.meta has been
   226  // unmarshalled already. Returns found = true if a value was found and returned.
   227  func (p *pebbleMVCCScanner) getFromIntentHistory() (value []byte, found bool) {
   228  	intentHistory := p.meta.IntentHistory
   229  	// upIdx is the index of the first intent in intentHistory with a sequence
   230  	// number greater than our transaction's sequence number. Subtract 1 from it
   231  	// to get the index of the intent with the highest sequence number that is
   232  	// still less than or equal to p.txnSeq.
   233  	upIdx := sort.Search(len(intentHistory), func(i int) bool {
   234  		return intentHistory[i].Sequence > p.txnSequence
   235  	})
   236  	// If the candidate intent has a sequence number that is ignored by this txn,
   237  	// iterate backward along the sorted intent history until we come across an
   238  	// intent which isn't ignored.
   239  	//
   240  	// TODO(itsbilal): Explore if this iteration can be improved through binary
   241  	// search.
   242  	for upIdx > 0 && enginepb.TxnSeqIsIgnored(p.meta.IntentHistory[upIdx-1].Sequence, p.txnIgnoredSeqNums) {
   243  		upIdx--
   244  	}
   245  	if upIdx == 0 {
   246  		// It is possible that no intent exists such that the sequence is less
   247  		// than the read sequence, and is not ignored by this transaction.
   248  		// In this case, we cannot read a value from the intent history.
   249  		return nil, false
   250  	}
   251  	intent := &p.meta.IntentHistory[upIdx-1]
   252  	return intent.Value, true
   253  }
   254  
   255  // Returns a write too old error with the specified timestamp.
   256  func (p *pebbleMVCCScanner) writeTooOldError(ts hlc.Timestamp) bool {
   257  	// The txn can't write at the existing timestamp, so we provide the error
   258  	// with the timestamp immediately after it.
   259  	p.err = roachpb.NewWriteTooOldError(p.ts, ts.Next())
   260  	p.results.clear()
   261  	p.intents.Reset()
   262  	return false
   263  }
   264  
   265  // Returns an uncertainty error with the specified timestamp and p.txn.
   266  func (p *pebbleMVCCScanner) uncertaintyError(ts hlc.Timestamp) bool {
   267  	p.err = roachpb.NewReadWithinUncertaintyIntervalError(p.ts, ts, p.txn)
   268  	p.results.clear()
   269  	p.intents.Reset()
   270  	return false
   271  }
   272  
   273  // Emit a tuple and return true if we have reason to believe iteration can
   274  // continue.
   275  func (p *pebbleMVCCScanner) getAndAdvance() bool {
   276  	if p.curKey.Timestamp != (hlc.Timestamp{}) {
   277  		if p.curKey.Timestamp.LessEq(p.ts) {
   278  			// 1. Fast path: there is no intent and our read timestamp is newer than
   279  			// the most recent version's timestamp.
   280  			return p.addAndAdvance(p.curValue)
   281  		}
   282  
   283  		if p.failOnMoreRecent {
   284  			// 2. Our txn's read timestamp is less than the most recent
   285  			// version's timestamp and the scanner has been configured
   286  			// to throw a write too old error on more recent versions.
   287  			return p.writeTooOldError(p.curKey.Timestamp)
   288  		}
   289  
   290  		if p.checkUncertainty {
   291  			// 3. Our txn's read timestamp is less than the max timestamp
   292  			// seen by the txn. We need to check for clock uncertainty
   293  			// errors.
   294  			if p.curKey.Timestamp.LessEq(p.txn.MaxTimestamp) {
   295  				return p.uncertaintyError(p.curKey.Timestamp)
   296  			}
   297  
   298  			return p.seekVersion(p.txn.MaxTimestamp, true)
   299  		}
   300  
   301  		// 4. Our txn's read timestamp is greater than or equal to the
   302  		// max timestamp seen by the txn so clock uncertainty checks are
   303  		// unnecessary. We need to seek to the desired version of the
   304  		// value (i.e. one with a timestamp earlier than our read
   305  		// timestamp).
   306  		return p.seekVersion(p.ts, false)
   307  	}
   308  
   309  	if len(p.curValue) == 0 {
   310  		p.err = errors.Errorf("zero-length mvcc metadata")
   311  		return false
   312  	}
   313  	err := protoutil.Unmarshal(p.curValue, &p.meta)
   314  	if err != nil {
   315  		p.err = errors.Errorf("unable to decode MVCCMetadata: %s", err)
   316  		return false
   317  	}
   318  	if len(p.meta.RawBytes) != 0 {
   319  		// 5. Emit immediately if the value is inline.
   320  		return p.addAndAdvance(p.meta.RawBytes)
   321  	}
   322  
   323  	if p.meta.Txn == nil {
   324  		p.err = errors.Errorf("intent without transaction")
   325  		return false
   326  	}
   327  	metaTS := hlc.Timestamp(p.meta.Timestamp)
   328  
   329  	// metaTS is the timestamp of an intent value, which we may or may
   330  	// not end up ignoring, depending on factors codified below. If we do ignore
   331  	// the intent then we want to read at a lower timestamp that's strictly
   332  	// below the intent timestamp (to skip the intent), but also does not exceed
   333  	// our read timestamp (to avoid erroneously picking up future committed
   334  	// values); this timestamp is prevTS.
   335  	prevTS := p.ts
   336  	if metaTS.LessEq(p.ts) {
   337  		prevTS = metaTS.Prev()
   338  	}
   339  
   340  	ownIntent := p.txn != nil && p.meta.Txn.ID.Equal(p.txn.ID)
   341  	maxVisibleTS := p.ts
   342  	if p.checkUncertainty {
   343  		maxVisibleTS = p.txn.MaxTimestamp
   344  	}
   345  	otherIntentVisible := metaTS.LessEq(maxVisibleTS) || p.failOnMoreRecent
   346  
   347  	if !ownIntent && !otherIntentVisible {
   348  		// 6. The key contains an intent, but we're reading before the
   349  		// intent. Seek to the desired version. Note that if we own the
   350  		// intent (i.e. we're reading transactionally) we want to read
   351  		// the intent regardless of our read timestamp and fall into
   352  		// case 8 below.
   353  		return p.seekVersion(p.ts, false)
   354  	}
   355  
   356  	if p.inconsistent {
   357  		// 7. The key contains an intent and we're doing an inconsistent
   358  		// read at a timestamp newer than the intent. We ignore the
   359  		// intent by insisting that the timestamp we're reading at is a
   360  		// historical timestamp < the intent timestamp. However, we
   361  		// return the intent separately; the caller may want to resolve
   362  		// it.
   363  		if p.maxKeys > 0 && p.results.count == p.maxKeys {
   364  			// We've already retrieved the desired number of keys and now
   365  			// we're adding the resume key. We don't want to add the
   366  			// intent here as the intents should only correspond to KVs
   367  			// that lie before the resume key.
   368  			return false
   369  		}
   370  		p.keyBuf = EncodeKeyToBuf(p.keyBuf[:0], p.curKey)
   371  		p.err = p.intents.Set(p.keyBuf, p.curValue, nil)
   372  		if p.err != nil {
   373  			return false
   374  		}
   375  
   376  		return p.seekVersion(prevTS, false)
   377  	}
   378  
   379  	if !ownIntent {
   380  		// 8. The key contains an intent which was not written by our
   381  		// transaction and either:
   382  		// - our read timestamp is equal to or newer than that of the
   383  		//   intent
   384  		// - our read timestamp is older than that of the intent but
   385  		//   the intent is in our transaction's uncertainty interval
   386  		// - our read timestamp is older than that of the intent but
   387  		//   we want to fail on more recent writes
   388  		// Note that this will trigger an error higher up the stack. We
   389  		// continue scanning so that we can return all of the intents
   390  		// in the scan range.
   391  		p.keyBuf = EncodeKeyToBuf(p.keyBuf[:0], p.curKey)
   392  		p.err = p.intents.Set(p.keyBuf, p.curValue, nil)
   393  		if p.err != nil {
   394  			return false
   395  		}
   396  		return p.advanceKey()
   397  	}
   398  
   399  	if p.txnEpoch == p.meta.Txn.Epoch {
   400  		if p.txnSequence >= p.meta.Txn.Sequence && !enginepb.TxnSeqIsIgnored(p.meta.Txn.Sequence, p.txnIgnoredSeqNums) {
   401  			// 9. We're reading our own txn's intent at an equal or higher sequence.
   402  			// Note that we read at the intent timestamp, not at our read timestamp
   403  			// as the intent timestamp may have been pushed forward by another
   404  			// transaction. Txn's always need to read their own writes.
   405  			return p.seekVersion(metaTS, false)
   406  		}
   407  
   408  		// 10. We're reading our own txn's intent at a lower sequence than is
   409  		// currently present in the intent. This means the intent we're seeing
   410  		// was written at a higher sequence than the read and that there may or
   411  		// may not be earlier versions of the intent (with lower sequence
   412  		// numbers) that we should read. If there exists a value in the intent
   413  		// history that has a sequence number equal to or less than the read
   414  		// sequence, read that value.
   415  		if value, found := p.getFromIntentHistory(); found {
   416  			return p.addAndAdvance(value)
   417  		}
   418  		// 11. If no value in the intent history has a sequence number equal to
   419  		// or less than the read, we must ignore the intents laid down by the
   420  		// transaction all together. We ignore the intent by insisting that the
   421  		// timestamp we're reading at is a historical timestamp < the intent
   422  		// timestamp.
   423  		return p.seekVersion(prevTS, false)
   424  	}
   425  
   426  	if p.txnEpoch < p.meta.Txn.Epoch {
   427  		// 12. We're reading our own txn's intent but the current txn has
   428  		// an earlier epoch than the intent. Return an error so that the
   429  		// earlier incarnation of our transaction aborts (presumably
   430  		// this is some operation that was retried).
   431  		p.err = errors.Errorf("failed to read with epoch %d due to a write intent with epoch %d",
   432  			p.txnEpoch, p.meta.Txn.Epoch)
   433  		return false
   434  	}
   435  
   436  	// 13. We're reading our own txn's intent but the current txn has a
   437  	// later epoch than the intent. This can happen if the txn was
   438  	// restarted and an earlier iteration wrote the value we're now
   439  	// reading. In this case, we ignore the intent and read the
   440  	// previous value as if the transaction were starting fresh.
   441  	return p.seekVersion(prevTS, false)
   442  }
   443  
   444  // nextKey advances to the next user key.
   445  func (p *pebbleMVCCScanner) nextKey() bool {
   446  	p.keyBuf = append(p.keyBuf[:0], p.curKey.Key...)
   447  
   448  	for i := 0; i < p.itersBeforeSeek; i++ {
   449  		if !p.iterNext() {
   450  			return false
   451  		}
   452  		if !bytes.Equal(p.curKey.Key, p.keyBuf) {
   453  			p.incrementItersBeforeSeek()
   454  			return true
   455  		}
   456  	}
   457  
   458  	p.decrementItersBeforeSeek()
   459  	// We're pointed at a different version of the same key. Fall back to
   460  	// seeking to the next key. We append a NUL to account for the "next-key".
   461  	p.keyBuf = append(p.keyBuf, 0)
   462  	return p.iterSeek(MVCCKey{Key: p.keyBuf})
   463  }
   464  
   465  // backwardLatestVersion backs up the iterator to the latest version for the
   466  // specified key. The parameter i is used to maintain iteration count between
   467  // the loop here and the caller (usually prevKey). Returns false if the
   468  // iterator was exhausted. Assumes that the iterator is currently positioned at
   469  // the oldest version of key.
   470  func (p *pebbleMVCCScanner) backwardLatestVersion(key []byte, i int) bool {
   471  	p.keyBuf = append(p.keyBuf[:0], key...)
   472  
   473  	for ; i < p.itersBeforeSeek; i++ {
   474  		peekedKey, ok := p.iterPeekPrev()
   475  		if !ok {
   476  			// No previous entry exists, so we're at the latest version of key.
   477  			return true
   478  		}
   479  		if !bytes.Equal(peekedKey, p.keyBuf) {
   480  			p.incrementItersBeforeSeek()
   481  			return true
   482  		}
   483  		if !p.iterPrev() {
   484  			return false
   485  		}
   486  	}
   487  
   488  	p.decrementItersBeforeSeek()
   489  	return p.iterSeek(MVCCKey{Key: p.keyBuf})
   490  }
   491  
   492  // prevKey advances to the newest version of the user key preceding the
   493  // specified key. Assumes that the iterator is currently positioned at
   494  // key or 1 record after key.
   495  func (p *pebbleMVCCScanner) prevKey(key []byte) bool {
   496  	p.keyBuf = append(p.keyBuf[:0], key...)
   497  
   498  	for i := 0; i < p.itersBeforeSeek; i++ {
   499  		peekedKey, ok := p.iterPeekPrev()
   500  		if !ok {
   501  			return false
   502  		}
   503  		if !bytes.Equal(peekedKey, p.keyBuf) {
   504  			return p.backwardLatestVersion(peekedKey, i+1)
   505  		}
   506  		if !p.iterPrev() {
   507  			return false
   508  		}
   509  	}
   510  
   511  	p.decrementItersBeforeSeek()
   512  	return p.iterSeekReverse(MVCCKey{Key: p.keyBuf})
   513  }
   514  
   515  // advanceKey advances to the next key in the iterator's direction.
   516  func (p *pebbleMVCCScanner) advanceKey() bool {
   517  	if p.isGet {
   518  		return false
   519  	}
   520  	if p.reverse {
   521  		return p.prevKey(p.curKey.Key)
   522  	}
   523  	return p.nextKey()
   524  }
   525  
   526  // advanceKeyAtEnd advances to the next key when the iterator's end has been
   527  // reached.
   528  func (p *pebbleMVCCScanner) advanceKeyAtEnd() bool {
   529  	if p.reverse {
   530  		// Iterating to the next key might have caused the iterator to reach the
   531  		// end of the key space. If that happens, back up to the very last key.
   532  		p.peeked = false
   533  		p.parent.SeekLT(MVCCKey{Key: p.end})
   534  		if !p.updateCurrent() {
   535  			return false
   536  		}
   537  		return p.advanceKey()
   538  	}
   539  	// We've reached the end of the iterator and there is nothing left to do.
   540  	return false
   541  }
   542  
   543  // advanceKeyAtNewKey advances to the key after the specified key, assuming we
   544  // have just reached the specified key.
   545  func (p *pebbleMVCCScanner) advanceKeyAtNewKey(key []byte) bool {
   546  	if p.reverse {
   547  		// We've advanced to the next key but need to move back to the previous
   548  		// key.
   549  		return p.prevKey(key)
   550  	}
   551  	// We're already at the new key so there is nothing to do.
   552  	return true
   553  }
   554  
   555  // Adds the specified value to the result set, excluding tombstones unless
   556  // p.tombstones is true. Advances to the next key unless we've reached the max
   557  // results limit.
   558  func (p *pebbleMVCCScanner) addAndAdvance(val []byte) bool {
   559  	// Don't include deleted versions len(val) == 0, unless we've been instructed
   560  	// to include tombstones in the results.
   561  	if len(val) > 0 || p.tombstones {
   562  		p.results.put(p.curKey, val)
   563  		if p.targetBytes > 0 && p.results.bytes >= p.targetBytes {
   564  			// When the target bytes are met or exceeded, stop producing more
   565  			// keys. We implement this by reducing maxKeys to the current
   566  			// number of keys.
   567  			//
   568  			// TODO(bilal): see if this can be implemented more transparently.
   569  			p.maxKeys = p.results.count
   570  		}
   571  		if p.maxKeys > 0 && p.results.count == p.maxKeys {
   572  			return false
   573  		}
   574  	}
   575  	return p.advanceKey()
   576  }
   577  
   578  // Seeks to the latest revision of the current key that's still less than or
   579  // equal to the specified timestamp, adds it to the result set, then moves onto
   580  // the next user key.
   581  func (p *pebbleMVCCScanner) seekVersion(ts hlc.Timestamp, uncertaintyCheck bool) bool {
   582  	key := MVCCKey{Key: p.curKey.Key, Timestamp: ts}
   583  	p.keyBuf = EncodeKeyToBuf(p.keyBuf[:0], key)
   584  	origKey := p.keyBuf[:len(p.curKey.Key)]
   585  
   586  	for i := 0; i < p.itersBeforeSeek; i++ {
   587  		if !p.iterNext() {
   588  			return p.advanceKeyAtEnd()
   589  		}
   590  		if !bytes.Equal(p.curKey.Key, origKey) {
   591  			p.incrementItersBeforeSeek()
   592  			return p.advanceKeyAtNewKey(origKey)
   593  		}
   594  		if p.curKey.Timestamp.LessEq(ts) {
   595  			p.incrementItersBeforeSeek()
   596  			if uncertaintyCheck && p.ts.Less(p.curKey.Timestamp) {
   597  				return p.uncertaintyError(p.curKey.Timestamp)
   598  			}
   599  			return p.addAndAdvance(p.curValue)
   600  		}
   601  	}
   602  
   603  	p.decrementItersBeforeSeek()
   604  	if !p.iterSeek(key) {
   605  		return p.advanceKeyAtEnd()
   606  	}
   607  	if !bytes.Equal(p.curKey.Key, origKey) {
   608  		return p.advanceKeyAtNewKey(origKey)
   609  	}
   610  	if p.curKey.Timestamp.LessEq(ts) {
   611  		if uncertaintyCheck && p.ts.Less(p.curKey.Timestamp) {
   612  			return p.uncertaintyError(p.curKey.Timestamp)
   613  		}
   614  		return p.addAndAdvance(p.curValue)
   615  	}
   616  	return p.advanceKey()
   617  }
   618  
   619  // Updates cur{RawKey, Key, TS} to match record the iterator is pointing to.
   620  func (p *pebbleMVCCScanner) updateCurrent() bool {
   621  	if !p.iterValid() {
   622  		return false
   623  	}
   624  
   625  	p.curKey = p.parent.UnsafeKey()
   626  	p.curValue = p.parent.UnsafeValue()
   627  	return true
   628  }
   629  
   630  func (p *pebbleMVCCScanner) iterValid() bool {
   631  	if valid, err := p.parent.Valid(); !valid {
   632  		p.err = err
   633  		return false
   634  	}
   635  	return true
   636  }
   637  
   638  // iterSeek seeks to the latest revision of the specified key (or a greater key).
   639  func (p *pebbleMVCCScanner) iterSeek(key MVCCKey) bool {
   640  	p.clearPeeked()
   641  	p.parent.SeekGE(key)
   642  	return p.updateCurrent()
   643  }
   644  
   645  // iterSeekReverse seeks to the latest revision of the key before the specified key.
   646  func (p *pebbleMVCCScanner) iterSeekReverse(key MVCCKey) bool {
   647  	p.clearPeeked()
   648  	p.parent.SeekLT(key)
   649  	if !p.updateCurrent() {
   650  		// We have seeked to before the start key. Return.
   651  		return false
   652  	}
   653  
   654  	if p.curKey.Timestamp == (hlc.Timestamp{}) {
   655  		// We landed on an intent or inline value.
   656  		return true
   657  	}
   658  	// We landed on a versioned value, we need to back up to find the
   659  	// latest version.
   660  	return p.backwardLatestVersion(p.curKey.Key, 0)
   661  }
   662  
   663  // iterNext advances to the next MVCC key.
   664  func (p *pebbleMVCCScanner) iterNext() bool {
   665  	if p.reverse && p.peeked {
   666  		// If we have peeked at the previous entry, we need to advance the iterator
   667  		// twice.
   668  		p.peeked = false
   669  		if !p.iterValid() {
   670  			// We were peeked off the beginning of iteration. Seek to the first
   671  			// entry, and then advance one step.
   672  			p.parent.SeekGE(MVCCKey{Key: p.start})
   673  			if !p.iterValid() {
   674  				return false
   675  			}
   676  			p.parent.Next()
   677  			return p.updateCurrent()
   678  		}
   679  		p.parent.Next()
   680  		if !p.iterValid() {
   681  			return false
   682  		}
   683  	}
   684  	p.parent.Next()
   685  	return p.updateCurrent()
   686  }
   687  
   688  // iterPrev advances to the previous MVCC Key.
   689  func (p *pebbleMVCCScanner) iterPrev() bool {
   690  	if p.peeked {
   691  		p.peeked = false
   692  		return p.updateCurrent()
   693  	}
   694  	p.parent.Prev()
   695  	return p.updateCurrent()
   696  }
   697  
   698  // Peek the previous key and store the result in peekedKey. Note that this
   699  // moves the iterator backward, while leaving p.cur{key,value,rawKey} untouched
   700  // and therefore out of sync. iterPrev and iterNext take this into account.
   701  func (p *pebbleMVCCScanner) iterPeekPrev() ([]byte, bool) {
   702  	if !p.peeked {
   703  		p.peeked = true
   704  		// We need to save a copy of the current iterator key and value and adjust
   705  		// curRawKey, curKey and curValue to point to this saved data. We use a
   706  		// single buffer for this purpose: savedBuf.
   707  		p.savedBuf = append(p.savedBuf[:0], p.curKey.Key...)
   708  		p.savedBuf = append(p.savedBuf, p.curValue...)
   709  		p.curKey.Key = p.savedBuf[:len(p.curKey.Key)]
   710  		p.curValue = p.savedBuf[len(p.curKey.Key):]
   711  
   712  		// With the current iterator state saved we can move the iterator to the
   713  		// previous entry.
   714  		p.parent.Prev()
   715  		if !p.iterValid() {
   716  			// The iterator is now invalid, but note that this case is handled in
   717  			// both iterNext and iterPrev. In the former case, we'll position the
   718  			// iterator at the first entry, and in the latter iteration will be done.
   719  			return nil, false
   720  		}
   721  	} else if !p.iterValid() {
   722  		return nil, false
   723  	}
   724  
   725  	peekedKey := p.parent.UnsafeKey()
   726  	return peekedKey.Key, true
   727  }
   728  
   729  // Clear the peeked flag. Call this before any iterator operations.
   730  func (p *pebbleMVCCScanner) clearPeeked() {
   731  	if p.reverse {
   732  		p.peeked = false
   733  	}
   734  }
   735  
   736  func (p *pebbleMVCCScanner) intentsRepr() []byte {
   737  	if p.intents.Count() == 0 {
   738  		return nil
   739  	}
   740  	return p.intents.Repr()
   741  }