github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/storage/pebble_iterator.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package storage
    12  
    13  import (
    14  	"bytes"
    15  	"math"
    16  	"sync"
    17  
    18  	"github.com/cockroachdb/cockroach/pkg/keys"
    19  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    20  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    21  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    22  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    23  	"github.com/cockroachdb/pebble"
    24  )
    25  
// pebbleIterator is a wrapper around a pebble.Iterator that implements the
// Iterator interface.
//
// Instances are handed out by newPebbleIterator from pebbleIterPool and are
// returned to that pool by Close() unless the iterator is marked reusable.
// The three byte buffers below are carried across init() and destroy() so
// that their backing arrays can be reused.
type pebbleIterator struct {
	// Underlying iterator for the DB. Created by init() via handle.NewIter and
	// closed by destroy(). The options field holds the pebble.IterOptions that
	// were passed to it; its LowerBound/UpperBound alias lowerBoundBuf and
	// upperBoundBuf when bounds are set.
	iter    *pebble.Iterator
	options pebble.IterOptions
	// Reusable buffer for MVCC key encoding. Overwritten by SeekGE, SeekLT and
	// NextKey.
	keyBuf []byte
	// Buffers for copying iterator bounds to. Note that the underlying memory
	// is not GCed upon Close(), to reduce the number of overall allocations.
	lowerBoundBuf []byte
	upperBoundBuf []byte
	// Set to true to govern whether to call SeekPrefixGE or SeekGE. Skips
	// SSTables based on MVCC key when true.
	prefix bool
	// If reusable is true, Close() does not actually close the underlying
	// iterator, but simply marks it as not inuse. Used by pebbleReadOnly.
	reusable bool
	// inuse is set by init() and cleared by Close(). Close() panics if it is
	// false and destroy() panics if it is true.
	inuse    bool
	// Stat tracking the number of sstables encountered during time-bound
	// iteration. Only incremented by the table filter installed in init() when
	// IterOptions.WithStats is set; reported through Stats().
	timeBoundNumSSTables int
}
    49  
    50  var _ Iterator = &pebbleIterator{}
    51  
    52  var pebbleIterPool = sync.Pool{
    53  	New: func() interface{} {
    54  		return &pebbleIterator{}
    55  	},
    56  }
    57  
    58  // Instantiates a new Pebble iterator, or gets one from the pool.
    59  func newPebbleIterator(handle pebble.Reader, opts IterOptions) Iterator {
    60  	iter := pebbleIterPool.Get().(*pebbleIterator)
    61  	iter.init(handle, opts)
    62  	return iter
    63  }
    64  
    65  // init resets this pebbleIterator for use with the specified arguments. The
    66  // current instance could either be a cached iterator (eg. in pebbleBatch), or
    67  // a newly-instantiated one through newPebbleIterator.
    68  func (p *pebbleIterator) init(handle pebble.Reader, opts IterOptions) {
    69  	*p = pebbleIterator{
    70  		keyBuf:        p.keyBuf,
    71  		lowerBoundBuf: p.lowerBoundBuf,
    72  		upperBoundBuf: p.upperBoundBuf,
    73  		prefix:        opts.Prefix,
    74  		reusable:      p.reusable,
    75  	}
    76  
    77  	if !opts.Prefix && len(opts.UpperBound) == 0 && len(opts.LowerBound) == 0 {
    78  		panic("iterator must set prefix or upper bound or lower bound")
    79  	}
    80  
    81  	if opts.LowerBound != nil {
    82  		// This is the same as
    83  		// p.options.LowerBound = EncodeKeyToBuf(p.lowerBoundBuf[:0], MVCCKey{Key: opts.LowerBound}) .
    84  		// Since we are encoding zero-timestamp MVCC Keys anyway, we can just append
    85  		// the NUL byte instead of calling EncodeKey which will do the same thing.
    86  		p.lowerBoundBuf = append(p.lowerBoundBuf[:0], opts.LowerBound...)
    87  		p.lowerBoundBuf = append(p.lowerBoundBuf, 0x00)
    88  		p.options.LowerBound = p.lowerBoundBuf
    89  	}
    90  	if opts.UpperBound != nil {
    91  		// Same as above.
    92  		p.upperBoundBuf = append(p.upperBoundBuf[:0], opts.UpperBound...)
    93  		p.upperBoundBuf = append(p.upperBoundBuf, 0x00)
    94  		p.options.UpperBound = p.upperBoundBuf
    95  	}
    96  
    97  	if opts.MaxTimestampHint != (hlc.Timestamp{}) {
    98  		encodedMinTS := string(encodeTimestamp(opts.MinTimestampHint))
    99  		encodedMaxTS := string(encodeTimestamp(opts.MaxTimestampHint))
   100  		p.options.TableFilter = func(userProps map[string]string) bool {
   101  			tableMinTS := userProps["crdb.ts.min"]
   102  			if len(tableMinTS) == 0 {
   103  				if opts.WithStats {
   104  					p.timeBoundNumSSTables++
   105  				}
   106  				return true
   107  			}
   108  			tableMaxTS := userProps["crdb.ts.max"]
   109  			if len(tableMaxTS) == 0 {
   110  				if opts.WithStats {
   111  					p.timeBoundNumSSTables++
   112  				}
   113  				return true
   114  			}
   115  			used := encodedMaxTS >= tableMinTS && encodedMinTS <= tableMaxTS
   116  			if used && opts.WithStats {
   117  				p.timeBoundNumSSTables++
   118  			}
   119  			return used
   120  		}
   121  	} else if opts.MinTimestampHint != (hlc.Timestamp{}) {
   122  		panic("min timestamp hint set without max timestamp hint")
   123  	}
   124  
   125  	p.iter = handle.NewIter(&p.options)
   126  	if p.iter == nil {
   127  		panic("unable to create iterator")
   128  	}
   129  
   130  	p.inuse = true
   131  }
   132  
   133  func (p *pebbleIterator) setOptions(opts IterOptions) {
   134  	// Overwrite any stale options from last time.
   135  	p.options = pebble.IterOptions{}
   136  
   137  	if opts.MinTimestampHint != (hlc.Timestamp{}) || opts.MaxTimestampHint != (hlc.Timestamp{}) {
   138  		panic("iterator with timestamp hints cannot be reused")
   139  	}
   140  	if !opts.Prefix && len(opts.UpperBound) == 0 && len(opts.LowerBound) == 0 {
   141  		panic("iterator must set prefix or upper bound or lower bound")
   142  	}
   143  
   144  	p.prefix = opts.Prefix
   145  	if opts.LowerBound != nil {
   146  		// This is the same as
   147  		// p.options.LowerBound = EncodeKeyToBuf(p.lowerBoundBuf[:0], MVCCKey{Key: opts.LowerBound}) .
   148  		// Since we are encoding zero-timestamp MVCC Keys anyway, we can just append
   149  		// the NUL byte instead of calling EncodeKey which will do the same thing.
   150  		p.lowerBoundBuf = append(p.lowerBoundBuf[:0], opts.LowerBound...)
   151  		p.lowerBoundBuf = append(p.lowerBoundBuf, 0x00)
   152  		p.options.LowerBound = p.lowerBoundBuf
   153  	}
   154  	if opts.UpperBound != nil {
   155  		// Same as above.
   156  		p.upperBoundBuf = append(p.upperBoundBuf[:0], opts.UpperBound...)
   157  		p.upperBoundBuf = append(p.upperBoundBuf, 0x00)
   158  		p.options.UpperBound = p.upperBoundBuf
   159  	}
   160  	p.iter.SetBounds(p.options.LowerBound, p.options.UpperBound)
   161  }
   162  
   163  // Close implements the Iterator interface.
   164  func (p *pebbleIterator) Close() {
   165  	if !p.inuse {
   166  		panic("closing idle iterator")
   167  	}
   168  	p.inuse = false
   169  
   170  	if p.reusable {
   171  		return
   172  	}
   173  
   174  	p.destroy()
   175  
   176  	pebbleIterPool.Put(p)
   177  }
   178  
   179  // SeekGE implements the Iterator interface.
   180  func (p *pebbleIterator) SeekGE(key MVCCKey) {
   181  	p.keyBuf = EncodeKeyToBuf(p.keyBuf[:0], key)
   182  	if p.prefix {
   183  		p.iter.SeekPrefixGE(p.keyBuf)
   184  	} else {
   185  		p.iter.SeekGE(p.keyBuf)
   186  	}
   187  }
   188  
   189  // Valid implements the Iterator interface.
   190  func (p *pebbleIterator) Valid() (bool, error) {
   191  	// NB: A Pebble Iterator always returns Valid()==false when an error is
   192  	// present. If Valid() is true, there is no error.
   193  	if ok := p.iter.Valid(); ok {
   194  		return ok, nil
   195  	}
   196  	return false, p.iter.Error()
   197  }
   198  
// Next implements the Iterator interface. It advances the underlying Pebble
// iterator by one entry and discards that call's result; use Valid() to learn
// whether the iterator is positioned on an entry afterwards.
func (p *pebbleIterator) Next() {
	p.iter.Next()
}
   203  
   204  // NextKey implements the Iterator interface.
   205  func (p *pebbleIterator) NextKey() {
   206  	if valid, err := p.Valid(); err != nil || !valid {
   207  		return
   208  	}
   209  	p.keyBuf = append(p.keyBuf[:0], p.UnsafeKey().Key...)
   210  	if !p.iter.Next() {
   211  		return
   212  	}
   213  	if bytes.Equal(p.keyBuf, p.UnsafeKey().Key) {
   214  		// This is equivalent to:
   215  		// p.iter.SeekGE(EncodeKey(MVCCKey{p.UnsafeKey().Key.Next(), hlc.Timestamp{}}))
   216  		p.iter.SeekGE(append(p.keyBuf, 0, 0))
   217  	}
   218  }
   219  
   220  // UnsafeKey implements the Iterator interface.
   221  func (p *pebbleIterator) UnsafeKey() MVCCKey {
   222  	if valid, err := p.Valid(); err != nil || !valid {
   223  		return MVCCKey{}
   224  	}
   225  
   226  	mvccKey, err := DecodeMVCCKey(p.iter.Key())
   227  	if err != nil {
   228  		return MVCCKey{}
   229  	}
   230  
   231  	return mvccKey
   232  }
   233  
// unsafeRawKey returns the raw key from the underlying pebble.Iterator. The
// slice is returned as-is, without decoding or copying, and without first
// checking that the iterator is valid.
func (p *pebbleIterator) unsafeRawKey() []byte {
	return p.iter.Key()
}
   238  
   239  // UnsafeValue implements the Iterator interface.
   240  func (p *pebbleIterator) UnsafeValue() []byte {
   241  	if valid, err := p.Valid(); err != nil || !valid {
   242  		return nil
   243  	}
   244  	return p.iter.Value()
   245  }
   246  
   247  // SeekLT implements the Iterator interface.
   248  func (p *pebbleIterator) SeekLT(key MVCCKey) {
   249  	p.keyBuf = EncodeKeyToBuf(p.keyBuf[:0], key)
   250  	p.iter.SeekLT(p.keyBuf)
   251  }
   252  
// Prev implements the Iterator interface. It moves the underlying Pebble
// iterator back by one entry and discards that call's result; use Valid() to
// learn whether the iterator is positioned on an entry afterwards.
func (p *pebbleIterator) Prev() {
	p.iter.Prev()
}
   257  
   258  // Key implements the Iterator interface.
   259  func (p *pebbleIterator) Key() MVCCKey {
   260  	key := p.UnsafeKey()
   261  	keyCopy := make([]byte, len(key.Key))
   262  	copy(keyCopy, key.Key)
   263  	key.Key = keyCopy
   264  	return key
   265  }
   266  
   267  // Value implements the Iterator interface.
   268  func (p *pebbleIterator) Value() []byte {
   269  	value := p.UnsafeValue()
   270  	valueCopy := make([]byte, len(value))
   271  	copy(valueCopy, value)
   272  	return valueCopy
   273  }
   274  
   275  // ValueProto implements the Iterator interface.
   276  func (p *pebbleIterator) ValueProto(msg protoutil.Message) error {
   277  	value := p.UnsafeValue()
   278  
   279  	return protoutil.Unmarshal(value, msg)
   280  }
   281  
// ComputeStats implements the Iterator interface. It delegates to
// ComputeStatsGo, passing this iterator along with the given key span
// [start, end) and nowNanos unchanged.
// NOTE(review): the half-open interpretation of the span is assumed from
// convention; confirm against ComputeStatsGo.
func (p *pebbleIterator) ComputeStats(
	start, end roachpb.Key, nowNanos int64,
) (enginepb.MVCCStats, error) {
	return ComputeStatsGo(p, start, end, nowNanos)
}
   288  
   289  // Go-only version of IsValidSplitKey. Checks if the specified key is in
   290  // NoSplitSpans.
   291  func isValidSplitKey(key roachpb.Key, noSplitSpans []roachpb.Span) bool {
   292  	for i := range noSplitSpans {
   293  		if noSplitSpans[i].ContainsKey(key) {
   294  			return false
   295  		}
   296  	}
   297  	return true
   298  }
   299  
   300  // FindSplitKey implements the Iterator interface.
   301  func (p *pebbleIterator) FindSplitKey(
   302  	start, end, minSplitKey roachpb.Key, targetSize int64,
   303  ) (MVCCKey, error) {
   304  	const timestampLen = 12
   305  
   306  	sizeSoFar := int64(0)
   307  	bestDiff := int64(math.MaxInt64)
   308  	bestSplitKey := MVCCKey{}
   309  	// found indicates that we have found a valid split key that is the best
   310  	// known so far. If bestSplitKey is empty => that split key
   311  	// is in prevKey, else it is in bestSplitKey.
   312  	found := false
   313  	prevKey := MVCCKey{}
   314  
   315  	// We only have to consider no-split spans if our minimum split key possibly
   316  	// lies before them. Note that the no-split spans are ordered by end-key.
   317  	noSplitSpans := keys.NoSplitSpans
   318  	for i := range noSplitSpans {
   319  		if minSplitKey.Compare(noSplitSpans[i].EndKey) <= 0 {
   320  			noSplitSpans = noSplitSpans[i:]
   321  			break
   322  		}
   323  	}
   324  
   325  	// Note that it is unnecessary to compare against "end" to decide to
   326  	// terminate iteration because the iterator's upper bound has already been
   327  	// set to end.
   328  	mvccMinSplitKey := MakeMVCCMetadataKey(minSplitKey)
   329  	p.SeekGE(MakeMVCCMetadataKey(start))
   330  	for ; p.iter.Valid(); p.iter.Next() {
   331  		mvccKey, err := DecodeMVCCKey(p.iter.Key())
   332  		if err != nil {
   333  			return MVCCKey{}, err
   334  		}
   335  
   336  		diff := targetSize - sizeSoFar
   337  		if diff < 0 {
   338  			diff = -diff
   339  		}
   340  		if diff > bestDiff {
   341  			// diff will keep increasing past this point. And we must have had a valid
   342  			// candidate in the past since we can't be worse than MaxInt64.
   343  			break
   344  		}
   345  
   346  		if mvccMinSplitKey.Key != nil && !mvccKey.Less(mvccMinSplitKey) {
   347  			// mvccKey is >= mvccMinSplitKey. Set the minSplitKey to nil so we do
   348  			// not have to make any more checks going forward.
   349  			mvccMinSplitKey.Key = nil
   350  		}
   351  
   352  		if mvccMinSplitKey.Key == nil && diff < bestDiff &&
   353  			(len(noSplitSpans) == 0 || isValidSplitKey(mvccKey.Key, noSplitSpans)) {
   354  			// This is a valid candidate for a split key.
   355  			//
   356  			// Instead of copying bestSplitKey just yet, flip the found flag. In the
   357  			// most common case where the actual best split key is followed by a key
   358  			// that has diff > bestDiff (see the if statement with that predicate
   359  			// above), this lets us save a copy by reusing prevCandidateKey as the
   360  			// best split key.
   361  			bestDiff = diff
   362  			found = true
   363  			// Set length of bestSplitKey to 0, which the rest of this method relies
   364  			// on to check if the last key encountered was the best split key.
   365  			bestSplitKey.Key = bestSplitKey.Key[:0]
   366  		} else if found && len(bestSplitKey.Key) == 0 {
   367  			// We were just at a valid split key candidate, but then we came across
   368  			// a key that cannot be a split key (i.e. is in noSplitSpans), or was not
   369  			// an improvement over bestDiff. Copy the previous key as the
   370  			// bestSplitKey.
   371  			bestSplitKey.Timestamp = prevKey.Timestamp
   372  			bestSplitKey.Key = append(bestSplitKey.Key[:0], prevKey.Key...)
   373  		}
   374  
   375  		sizeSoFar += int64(len(p.iter.Value()))
   376  		if mvccKey.IsValue() && bytes.Equal(prevKey.Key, mvccKey.Key) {
   377  			// We only advanced timestamps, but not new mvcc keys.
   378  			sizeSoFar += timestampLen
   379  		} else {
   380  			sizeSoFar += int64(len(mvccKey.Key) + 1)
   381  			if mvccKey.IsValue() {
   382  				sizeSoFar += timestampLen
   383  			}
   384  		}
   385  
   386  		prevKey.Key = append(prevKey.Key[:0], mvccKey.Key...)
   387  		prevKey.Timestamp = mvccKey.Timestamp
   388  	}
   389  
   390  	// There are three distinct types of cases possible here:
   391  	//
   392  	// 1. No valid split key was found (found == false), in which case we return
   393  	//    bestSplitKey (which should be MVCCKey{}).
   394  	// 2. The best candidate seen for a split key so far was encountered in the
   395  	//    last iteration of the above loop. We broke out of the loop either due
   396  	//    to iterator exhaustion (!p.iter.Valid()), or an increasing diff. Return
   397  	//    prevKey as the best split key.
   398  	// 3. The best split key was seen multiple iterations ago, and was copied into
   399  	//    bestSplitKey at some point (found == true, len(bestSplitKey.Key) > 0).
   400  	//    Keys encountered after that point were invalid for being in noSplitSpans
   401  	//    so return the bestSplitKey that had been copied.
   402  	//
   403  	// This if statement checks for case 2.
   404  	if found && len(bestSplitKey.Key) == 0 {
   405  		// Use the last key found as the best split key, since we broke out of the
   406  		// loop (due to iterator exhaustion or increasing diff) right after we saw
   407  		// the best split key. prevKey has to be a valid split key since the only
   408  		// way we'd have both found && len(bestSplitKey.Key) == 0 is when we've
   409  		// already checked prevKey for validity.
   410  		return prevKey, nil
   411  	}
   412  	return bestSplitKey, nil
   413  }
   414  
   415  // SetUpperBound implements the Iterator interface.
   416  func (p *pebbleIterator) SetUpperBound(upperBound roachpb.Key) {
   417  	p.upperBoundBuf = append(p.upperBoundBuf[:0], upperBound...)
   418  	p.upperBoundBuf = append(p.upperBoundBuf, 0x00)
   419  	p.options.UpperBound = p.upperBoundBuf
   420  	p.iter.SetBounds(p.options.LowerBound, p.options.UpperBound)
   421  }
   422  
   423  // Stats implements the Iterator interface.
   424  func (p *pebbleIterator) Stats() IteratorStats {
   425  	return IteratorStats{
   426  		TimeBoundNumSSTs: p.timeBoundNumSSTables,
   427  	}
   428  }
   429  
// CheckForKeyCollisions indicates if the provided SST data collides with this
// iterator in the specified range. It delegates to checkForKeyCollisionsGo,
// passing this iterator, sstData, start and end unchanged, and returns that
// function's stats and error as-is.
func (p *pebbleIterator) CheckForKeyCollisions(
	sstData []byte, start, end roachpb.Key,
) (enginepb.MVCCStats, error) {
	return checkForKeyCollisionsGo(p, sstData, start, end)
}
   437  
   438  func (p *pebbleIterator) destroy() {
   439  	if p.inuse {
   440  		panic("iterator still in use")
   441  	}
   442  	if p.iter != nil {
   443  		err := p.iter.Close()
   444  		if err != nil {
   445  			panic(err)
   446  		}
   447  		p.iter = nil
   448  	}
   449  	// Reset all fields except for the key and lower/upper bound buffers. Holding
   450  	// onto their underlying memory is more efficient to prevent extra
   451  	// allocations down the line.
   452  	*p = pebbleIterator{
   453  		keyBuf:        p.keyBuf,
   454  		lowerBoundBuf: p.lowerBoundBuf,
   455  		upperBoundBuf: p.upperBoundBuf,
   456  		reusable:      p.reusable,
   457  	}
   458  }