github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/scan_internal.go

// Copyright 2023 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.

package pebble

import (
	"context"
	"fmt"

	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/pebble/internal/base"
	"github.com/cockroachdb/pebble/internal/invariants"
	"github.com/cockroachdb/pebble/internal/keyspan"
	"github.com/cockroachdb/pebble/internal/manifest"
	"github.com/cockroachdb/pebble/objstorage"
	"github.com/cockroachdb/pebble/objstorage/remote"
	"github.com/cockroachdb/pebble/sstable"
)

const (
	// In skip-shared iteration mode, keys in levels sharedLevelsStart and greater
	// (i.e. lower in the LSM) are skipped.
	sharedLevelsStart = remote.SharedLevelsStart
)

// ErrInvalidSkipSharedIteration is returned by ScanInternal if it was called
// with a shared file visitor function, and a file in a shareable level (i.e.
// level >= sharedLevelsStart) was found to not be in shared storage according
// to objstorage.Provider, or not shareable for another reason, such as
// containing keys newer than the snapshot sequence number.
var ErrInvalidSkipSharedIteration = errors.New("pebble: cannot use skip-shared iteration due to non-shareable files in lower levels")
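
// scanWithSharedFallback is an illustrative sketch, not part of Pebble's API,
// of how a caller might handle this error: detect it with errors.Is (which
// the cockroachdb/errors package supports) and retry the scan without
// skip-shared iteration. Both scan closures here are hypothetical.
func scanWithSharedFallback(skipSharedScan, regularScan func() error) error {
	if err := skipSharedScan(); err != nil {
		if errors.Is(err, ErrInvalidSkipSharedIteration) {
			// A file in a shareable level wasn't actually shareable; fall back
			// to surfacing every key through the regular visitor callbacks.
			return regularScan()
		}
		return err
	}
	return nil
}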

// SharedSSTMeta represents an sstable on shared storage that can be ingested
// by another pebble instance. This struct must contain all fields that are
// required for a Pebble instance to ingest a foreign sstable on shared storage,
// including constructing any relevant objstorage.Provider / remoteobjcat.Catalog
// data structures, as well as creating virtual FileMetadatas.
//
// Note that the Pebble instance creating and returning a SharedSSTMeta might
// not be the one that created the underlying sstable on shared storage to begin
// with; it's possible for a Pebble instance to reshare an sstable that was
// shared to it.
type SharedSSTMeta struct {
	// Backing is the shared object underlying this SST. Can be attached to an
	// objstorage.Provider.
	Backing objstorage.RemoteObjectBackingHandle

	// Smallest and Largest internal keys for the overall bounds. The kind and
	// SeqNum of these will reflect what is physically present on the source Pebble
	// instance's view of the sstable; it's up to the ingesting instance to set the
	// sequence number in the trailer to match the read-time sequence numbers
	// reserved for the level this SST is being ingested into. The Kind is expected
	// to remain unchanged by the ingesting instance.
	//
	// Note that these bounds could be narrower than the bounds of the underlying
	// sstable; ScanInternal is expected to truncate sstable bounds to the user key
	// bounds passed into that method.
	Smallest, Largest InternalKey

	// SmallestRangeKey and LargestRangeKey are internal keys that denote the
	// range key bounds of this sstable. Must lie within [Smallest, Largest].
	SmallestRangeKey, LargestRangeKey InternalKey

	// SmallestPointKey and LargestPointKey are internal keys that denote the
	// point key bounds of this sstable. Must lie within [Smallest, Largest].
	SmallestPointKey, LargestPointKey InternalKey

	// Level denotes the level at which this file was present at read time.
	// For files visited by ScanInternal, this value will only be 5 or 6.
	Level uint8

	// Size contains an estimate of the size of this sstable.
	Size uint64

	// fileNum at time of creation in the creator instance. Only used for
	// debugging/tests.
	fileNum base.FileNum
}

func (s *SharedSSTMeta) cloneFromFileMeta(f *fileMetadata) {
	*s = SharedSSTMeta{
		Smallest:         f.Smallest.Clone(),
		Largest:          f.Largest.Clone(),
		SmallestRangeKey: f.SmallestRangeKey.Clone(),
		LargestRangeKey:  f.LargestRangeKey.Clone(),
		SmallestPointKey: f.SmallestPointKey.Clone(),
		LargestPointKey:  f.LargestPointKey.Clone(),
		Size:             f.Size,
		fileNum:          f.FileNum,
	}
}
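
// checkSharedSSTBounds is an illustrative sketch, not used by Pebble; it
// encodes the invariants documented on SharedSSTMeta above: the point and
// range key bounds, when set, must lie within [Smallest, Largest]. An unset
// bound is represented by an empty user key.
func checkSharedSSTBounds(cmp Compare, s *SharedSSTMeta) error {
	within := func(k InternalKey) bool {
		return base.InternalCompare(cmp, s.Smallest, k) <= 0 &&
			base.InternalCompare(cmp, k, s.Largest) <= 0
	}
	for _, k := range []InternalKey{
		s.SmallestPointKey, s.LargestPointKey, s.SmallestRangeKey, s.LargestRangeKey,
	} {
		if len(k.UserKey) > 0 && !within(k) {
			return errors.Newf("bound %s outside of [%s, %s]", k, s.Smallest, s.Largest)
		}
	}
	return nil
}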

type sharedByLevel []SharedSSTMeta

func (s sharedByLevel) Len() int           { return len(s) }
func (s sharedByLevel) Swap(i, j int)      { s[i], s[j] = s[j], s[i] }
func (s sharedByLevel) Less(i, j int) bool { return s[i].Level < s[j].Level }
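
// sharedByLevel implements sort.Interface. A minimal usage sketch, assuming
// the standard library "sort" package is imported:
//
//	metas := []SharedSSTMeta{{Level: 6}, {Level: 5}}
//	sort.Sort(sharedByLevel(metas))
//	// metas is now ordered by ascending level: L5 before L6.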

type pcIterPos int

const (
	pcIterPosCur pcIterPos = iota
	pcIterPosNext
)

// pointCollapsingIterator is an internalIterator that collapses point keys and
// returns at most one point internal key for each user key. Merges and
// SingleDels are not supported and result in a panic if encountered. Point keys
// deleted by rangedels are considered shadowed and not exposed.
//
// Only used in ScanInternal to return at most one internal key per user key.
type pointCollapsingIterator struct {
	iter     keyspan.InterleavingIter
	pos      pcIterPos
	comparer *base.Comparer
	merge    base.Merge
	err      error
	seqNum   uint64
	// The current position of `iter`. Always owned by the underlying iter.
	iterKey *InternalKey
	// The last saved key. findNextEntry and similar methods are expected to save
	// the current value of iterKey to savedKey if they're iterating away from the
	// current key but still need to retain it. See comments in findNextEntry on
	// how this field is used.
	//
	// At the end of a positioning call:
	//  - if pos == pcIterPosNext, iterKey is pointing to the next user key owned
	//    by `iter` while savedKey is holding a copy of our current key.
	//  - If pos == pcIterPosCur, iterKey is pointing to an `iter`-owned current
	//    key, and savedKey is either undefined or pointing to a version of the
	//    current key owned by this iterator (i.e. backed by savedKeyBuf).
	savedKey    InternalKey
	savedKeyBuf []byte
	// Value at the current iterator position, at iterKey.
	iterValue base.LazyValue
	// If fixedSeqNum is non-zero, all emitted points are verified to have this
	// fixed sequence number.
	fixedSeqNum uint64
}
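
// collapsePoints is a toy sketch, not used by Pebble, of the collapsing rule
// pointCollapsingIterator implements: given internal keys ordered by user key
// ascending and sequence number descending, only the newest internal key for
// each user key is surfaced. The real iterator additionally interleaves range
// deletions and hides the points they shadow.
func collapsePoints(equal base.Equal, keys []InternalKey) []InternalKey {
	var out []InternalKey
	for _, k := range keys {
		// The first key seen for a user key is its newest version.
		if len(out) == 0 || !equal(out[len(out)-1].UserKey, k.UserKey) {
			out = append(out, k)
		}
	}
	return out
}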

func (p *pointCollapsingIterator) Span() *keyspan.Span {
	return p.iter.Span()
}

// SeekPrefixGE implements the InternalIterator interface.
func (p *pointCollapsingIterator) SeekPrefixGE(
	prefix, key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, base.LazyValue) {
	p.resetKey()
	p.iterKey, p.iterValue = p.iter.SeekPrefixGE(prefix, key, flags)
	p.pos = pcIterPosCur
	if p.iterKey == nil {
		return nil, base.LazyValue{}
	}
	return p.findNextEntry()
}

// SeekGE implements the InternalIterator interface.
func (p *pointCollapsingIterator) SeekGE(
	key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, base.LazyValue) {
	p.resetKey()
	p.iterKey, p.iterValue = p.iter.SeekGE(key, flags)
	p.pos = pcIterPosCur
	if p.iterKey == nil {
		return nil, base.LazyValue{}
	}
	return p.findNextEntry()
}

// SeekLT implements the InternalIterator interface.
func (p *pointCollapsingIterator) SeekLT(
	key []byte, flags base.SeekLTFlags,
) (*base.InternalKey, base.LazyValue) {
	panic("unimplemented")
}

func (p *pointCollapsingIterator) resetKey() {
	p.savedKey.UserKey = p.savedKeyBuf[:0]
	p.savedKey.Trailer = 0
	p.iterKey = nil
	p.pos = pcIterPosCur
}

func (p *pointCollapsingIterator) verifySeqNum(key *base.InternalKey) *base.InternalKey {
	if !invariants.Enabled {
		return key
	}
	if p.fixedSeqNum == 0 || key == nil || key.Kind() == InternalKeyKindRangeDelete {
		return key
	}
	if key.SeqNum() != p.fixedSeqNum {
		panic(fmt.Sprintf("expected foreign point key to have seqnum %d, got %d", p.fixedSeqNum, key.SeqNum()))
	}
	return key
}

// findNextEntry is called to return the next key. p.iter must be positioned at the
// start of the first user key we are interested in.
func (p *pointCollapsingIterator) findNextEntry() (*base.InternalKey, base.LazyValue) {
	p.saveKey()
	// Saves a comparison in the fast path
	firstIteration := true
	for p.iterKey != nil {
		// NB: p.savedKey is either the current key (iff p.iterKey == firstKey),
		// or the previous key.
		if !firstIteration && !p.comparer.Equal(p.iterKey.UserKey, p.savedKey.UserKey) {
			p.saveKey()
			continue
		}
		firstIteration = false
		if s := p.iter.Span(); s != nil && s.CoversAt(p.seqNum, p.iterKey.SeqNum()) {
			// All future keys for this user key must be deleted.
			if p.savedKey.Kind() == InternalKeyKindSingleDelete {
				panic("cannot process singledel key in point collapsing iterator")
			}
			// Fast forward to the next user key.
			p.saveKey()
			p.iterKey, p.iterValue = p.iter.Next()
			for p.iterKey != nil && p.savedKey.SeqNum() >= p.iterKey.SeqNum() && p.comparer.Equal(p.iterKey.UserKey, p.savedKey.UserKey) {
				p.iterKey, p.iterValue = p.iter.Next()
			}
			continue
		}
		switch p.savedKey.Kind() {
		case InternalKeyKindSet, InternalKeyKindDelete, InternalKeyKindSetWithDelete, InternalKeyKindDeleteSized:
			// Note that we return SETs directly, even if they would otherwise get
			// compacted together with a Del and turned into a SetWithDelete. This
			// is a fast path optimization that can break SINGLEDEL determinism. To
			// lead to consistent SINGLEDEL behaviour, this iterator should *not* be
			// used for a keyspace where SINGLEDELs could be in use. If this iterator
			// observes a SINGLEDEL as the first internal key for a user key, it will
			// panic.
			//
			// As p.iterValue is a lazy value owned by the child iterator, we can
			// thread it through without loading it into a buffer owned by this
			// iterator.
			//
			// TODO(bilal): We can even avoid saving the key in this fast path if
			// we are in a block where setHasSamePrefix = false in a v3 sstable,
			// guaranteeing that there's only one internal key for each user key.
			// Thread this logic through the sstable iterators and/or consider
			// collapsing (ha) this logic into the sstable iterators that are aware
			// of blocks and can determine user key changes without doing key saves
			// or comparisons.
			p.pos = pcIterPosCur
			return p.verifySeqNum(p.iterKey), p.iterValue
		case InternalKeyKindSingleDelete:
			// Panic, as this iterator is not expected to observe single deletes.
			panic("cannot process singledel key in point collapsing iterator")
		case InternalKeyKindMerge:
			// Panic, as this iterator is not expected to observe merges.
			panic("cannot process merge key in point collapsing iterator")
		case InternalKeyKindRangeDelete:
			// These are interleaved by the interleaving iterator ahead of all points.
			// We should pass them as-is, but also account for any points ahead of
			// them.
			p.pos = pcIterPosCur
			return p.verifySeqNum(p.iterKey), p.iterValue
		default:
			panic(fmt.Sprintf("unexpected kind: %d", p.savedKey.Kind()))
		}
	}
	p.resetKey()
	return nil, base.LazyValue{}
}

// First implements the InternalIterator interface.
func (p *pointCollapsingIterator) First() (*base.InternalKey, base.LazyValue) {
	p.resetKey()
	p.iterKey, p.iterValue = p.iter.First()
	p.pos = pcIterPosCur
	if p.iterKey == nil {
		return nil, base.LazyValue{}
	}
	return p.findNextEntry()
}

// Last implements the InternalIterator interface.
func (p *pointCollapsingIterator) Last() (*base.InternalKey, base.LazyValue) {
	panic("unimplemented")
}

func (p *pointCollapsingIterator) saveKey() {
	if p.iterKey == nil {
		p.savedKey = InternalKey{UserKey: p.savedKeyBuf[:0]}
		return
	}
	p.savedKeyBuf = append(p.savedKeyBuf[:0], p.iterKey.UserKey...)
	p.savedKey = InternalKey{UserKey: p.savedKeyBuf, Trailer: p.iterKey.Trailer}
}

// Next implements the InternalIterator interface.
func (p *pointCollapsingIterator) Next() (*base.InternalKey, base.LazyValue) {
	switch p.pos {
	case pcIterPosCur:
		p.saveKey()
		if p.iterKey != nil && p.iterKey.Kind() == InternalKeyKindRangeDelete {
			// Step over the interleaved range delete and process the very next
			// internal key, even if it's at the same user key. This is because a
			// point for that user key has not been returned yet.
			p.iterKey, p.iterValue = p.iter.Next()
			break
		}
		// Fast forward to the next user key.
		key, val := p.iter.Next()
		// p.savedKey.SeqNum() >= key.SeqNum() is an optimization that allows us to
		// use p.savedKey.SeqNum() < key.SeqNum() as a sign that the user key has
		// changed, without needing to do the full key comparison.
		for key != nil && p.savedKey.SeqNum() >= key.SeqNum() &&
			p.comparer.Equal(p.savedKey.UserKey, key.UserKey) {
			key, val = p.iter.Next()
		}
		if key == nil {
			// There are no keys to return.
			p.resetKey()
			return nil, base.LazyValue{}
		}
		p.iterKey, p.iterValue = key, val
	case pcIterPosNext:
		p.pos = pcIterPosCur
	}
	if p.iterKey == nil {
		p.resetKey()
		return nil, base.LazyValue{}
	}
	return p.findNextEntry()
}
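
// sameUserKeyFast is a sketch, not used by Pebble, of the optimization noted
// in Next above: within a single user key, internal keys are surfaced in
// strictly descending sequence number order, so a sequence number larger than
// the saved key's implies the user key changed and the (more expensive) user
// key comparison can be short-circuited.
func sameUserKeyFast(equal base.Equal, saved, next InternalKey) bool {
	// The seqnum check runs first; equal is only consulted when the seqnums
	// are consistent with both keys belonging to the same user key.
	return saved.SeqNum() >= next.SeqNum() && equal(saved.UserKey, next.UserKey)
}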

// NextPrefix implements the InternalIterator interface.
func (p *pointCollapsingIterator) NextPrefix(succKey []byte) (*base.InternalKey, base.LazyValue) {
	panic("unimplemented")
}

// Prev implements the InternalIterator interface.
func (p *pointCollapsingIterator) Prev() (*base.InternalKey, base.LazyValue) {
	panic("unimplemented")
}

// Error implements the InternalIterator interface.
func (p *pointCollapsingIterator) Error() error {
	if p.err != nil {
		return p.err
	}
	return p.iter.Error()
}

// Close implements the InternalIterator interface.
func (p *pointCollapsingIterator) Close() error {
	return p.iter.Close()
}

// SetBounds implements the InternalIterator interface.
func (p *pointCollapsingIterator) SetBounds(lower, upper []byte) {
	p.resetKey()
	p.iter.SetBounds(lower, upper)
}

func (p *pointCollapsingIterator) SetContext(ctx context.Context) {
	p.iter.SetContext(ctx)
}

// String implements the InternalIterator interface.
func (p *pointCollapsingIterator) String() string {
	return p.iter.String()
}

var _ internalIterator = &pointCollapsingIterator{}

// IteratorLevelKind is used to denote whether the current ScanInternal iterator
// is unknown, belongs to a flushable, or belongs to an LSM level type.
type IteratorLevelKind int8

const (
	// IteratorLevelUnknown indicates an unknown LSM level.
	IteratorLevelUnknown IteratorLevelKind = iota
	// IteratorLevelLSM indicates an LSM level.
	IteratorLevelLSM
	// IteratorLevelFlushable indicates a flushable (i.e. memtable).
	IteratorLevelFlushable
)

// IteratorLevel is used with scanInternalIterator to surface additional
// iterator-specific info where possible. Note: this struct is only provided
// for point keys.
type IteratorLevel struct {
	Kind IteratorLevelKind
	// FlushableIndex indicates the position within the flushable queue of this level.
	// Only valid if kind == IteratorLevelFlushable.
	FlushableIndex int
	// The level within the LSM. Only valid if Kind == IteratorLevelLSM.
	Level int
	// Sublevel is only valid if Kind == IteratorLevelLSM and Level == 0.
	Sublevel int
}
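
// describeIteratorLevel is an illustrative helper, not part of Pebble's API,
// showing how a visitPointKey callback might render the IteratorLevel it is
// handed for logging or debugging.
func describeIteratorLevel(info IteratorLevel) string {
	switch info.Kind {
	case IteratorLevelFlushable:
		return fmt.Sprintf("flushable[%d]", info.FlushableIndex)
	case IteratorLevelLSM:
		if info.Level == 0 {
			// Sublevel is only meaningful within L0.
			return fmt.Sprintf("L0.%d", info.Sublevel)
		}
		return fmt.Sprintf("L%d", info.Level)
	default:
		return "unknown"
	}
}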

// scanInternalIterator is an iterator that returns all internal keys, including
// tombstones. For instance, an InternalKeyKindDelete would be returned as an
// InternalKeyKindDelete instead of the iterator skipping over to the next key.
// Internal keys within a user key are collapsed, e.g. if there are two SETs, the
// one with the higher sequence number is returned. Useful if an external user of
// Pebble needs to observe and rebuild Pebble's history of internal keys, such as
// in node-to-node replication. For use with {db,snapshot}.ScanInternal().
//
// scanInternalIterator is expected to ignore point keys deleted by range
// deletions, and range keys shadowed by a range key unset or delete, however it
// *must* return the range delete as well as the range key unset/delete that did
// the shadowing.
type scanInternalIterator struct {
	ctx             context.Context
	db              *DB
	opts            scanInternalOptions
	comparer        *base.Comparer
	merge           Merge
	iter            internalIterator
	readState       *readState
	version         *version
	rangeKey        *iteratorRangeKeyState
	pointKeyIter    internalIterator
	iterKey         *InternalKey
	iterValue       LazyValue
	alloc           *iterAlloc
	newIters        tableNewIters
	newIterRangeKey keyspan.TableNewSpanIter
	seqNum          uint64
	iterLevels      []IteratorLevel
	mergingIter     *mergingIter

	// boundsBuf holds two buffers used to store the lower and upper bounds.
	// Whenever the InternalIterator's bounds change, the new bounds are copied
	// into boundsBuf[boundsBufIdx]. The two bounds share a slice to reduce
	// allocations. opts.LowerBound and opts.UpperBound point into this slice.
	boundsBuf    [2][]byte
	boundsBufIdx int
}

// truncateSharedFile truncates a shared file's [Smallest, Largest] fields to
// [lower, upper), potentially opening iterators on the file to find keys within
// the requested bounds. A SharedSSTMeta is produced that is suitable for
// external consumption by other Pebble instances. If shouldSkip is true, this
// file does not contain any keys in [lower, upper) and can be skipped.
//
// TODO(bilal): If opening iterators and doing reads in this method is too
// inefficient, consider producing non-tight file bounds instead.
func (d *DB) truncateSharedFile(
	ctx context.Context,
	lower, upper []byte,
	level int,
	file *fileMetadata,
	objMeta objstorage.ObjectMetadata,
) (sst *SharedSSTMeta, shouldSkip bool, err error) {
	cmp := d.cmp
	sst = &SharedSSTMeta{}
	sst.cloneFromFileMeta(file)
	sst.Level = uint8(level)
	sst.Backing, err = d.objProvider.RemoteObjectBacking(&objMeta)
	if err != nil {
		return nil, false, err
	}
	needsLowerTruncate := cmp(lower, file.Smallest.UserKey) > 0
	needsUpperTruncate := cmp(upper, file.Largest.UserKey) < 0 || (cmp(upper, file.Largest.UserKey) == 0 && !file.Largest.IsExclusiveSentinel())
	// Fast path: file is entirely within [lower, upper).
	if !needsLowerTruncate && !needsUpperTruncate {
		return sst, false, nil
	}

	// We will need to truncate file bounds in at least one direction. Open all
	// relevant iterators.
	iter, rangeDelIter, err := d.newIters(ctx, file, &IterOptions{
		LowerBound: lower,
		UpperBound: upper,
		level:      manifest.Level(level),
	}, internalIterOpts{})
	if err != nil {
		return nil, false, err
	}
	defer iter.Close()
	if rangeDelIter != nil {
		rangeDelIter = keyspan.Truncate(
			cmp, rangeDelIter, lower, upper, nil, nil,
			false, /* panicOnUpperTruncate */
		)
		defer rangeDelIter.Close()
	}
	rangeKeyIter, err := d.tableNewRangeKeyIter(file, keyspan.SpanIterOptions{})
	if err != nil {
		return nil, false, err
	}
	if rangeKeyIter != nil {
		rangeKeyIter = keyspan.Truncate(
			cmp, rangeKeyIter, lower, upper, nil, nil,
			false, /* panicOnUpperTruncate */
		)
		defer rangeKeyIter.Close()
	}
	// Check if we need to truncate on the left side. This means finding a new
	// SmallestPointKey and SmallestRangeKey that are >= lower.
	if needsLowerTruncate {
		sst.SmallestPointKey.UserKey = sst.SmallestPointKey.UserKey[:0]
		sst.SmallestPointKey.Trailer = 0
		key, _ := iter.SeekGE(lower, base.SeekGEFlagsNone)
		foundPointKey := key != nil
		if key != nil {
			sst.SmallestPointKey.CopyFrom(*key)
		}
		if rangeDelIter != nil {
			span := rangeDelIter.SeekGE(lower)
			if span != nil && (len(sst.SmallestPointKey.UserKey) == 0 || base.InternalCompare(cmp, span.SmallestKey(), sst.SmallestPointKey) < 0) {
				sst.SmallestPointKey.CopyFrom(span.SmallestKey())
				foundPointKey = true
			}
		}
		if !foundPointKey {
			// There are no point keys in the span we're interested in.
			sst.SmallestPointKey = InternalKey{}
			sst.LargestPointKey = InternalKey{}
		}
		sst.SmallestRangeKey.UserKey = sst.SmallestRangeKey.UserKey[:0]
		sst.SmallestRangeKey.Trailer = 0
		if rangeKeyIter != nil {
			span := rangeKeyIter.SeekGE(lower)
			if span != nil {
				sst.SmallestRangeKey.CopyFrom(span.SmallestKey())
			} else {
				// There are no range keys in the span we're interested in.
				sst.SmallestRangeKey = InternalKey{}
				sst.LargestRangeKey = InternalKey{}
			}
		}
	}
	// Check if we need to truncate on the right side. This means finding a new
	// LargestPointKey and LargestRangeKey that are < upper.
	if needsUpperTruncate {
		sst.LargestPointKey.UserKey = sst.LargestPointKey.UserKey[:0]
		sst.LargestPointKey.Trailer = 0
		key, _ := iter.SeekLT(upper, base.SeekLTFlagsNone)
		foundPointKey := key != nil
		if key != nil {
			sst.LargestPointKey.CopyFrom(*key)
		}
		if rangeDelIter != nil {
			span := rangeDelIter.SeekLT(upper)
			if span != nil && (len(sst.LargestPointKey.UserKey) == 0 || base.InternalCompare(cmp, span.LargestKey(), sst.LargestPointKey) > 0) {
				sst.LargestPointKey.CopyFrom(span.LargestKey())
				foundPointKey = true
			}
		}
		if !foundPointKey {
			// There are no point keys in the span we're interested in.
			sst.SmallestPointKey = InternalKey{}
			sst.LargestPointKey = InternalKey{}
		}
		sst.LargestRangeKey.UserKey = sst.LargestRangeKey.UserKey[:0]
		sst.LargestRangeKey.Trailer = 0
		if rangeKeyIter != nil {
			span := rangeKeyIter.SeekLT(upper)
			if span != nil {
				sst.LargestRangeKey.CopyFrom(span.LargestKey())
			} else {
				// There are no range keys in the span we're interested in.
				sst.SmallestRangeKey = InternalKey{}
				sst.LargestRangeKey = InternalKey{}
			}
		}
	}
	// Set overall bounds based on {Smallest,Largest}{Point,Range}Key.
	switch {
	case len(sst.SmallestRangeKey.UserKey) == 0:
		sst.Smallest = sst.SmallestPointKey
	case len(sst.SmallestPointKey.UserKey) == 0:
		sst.Smallest = sst.SmallestRangeKey
	default:
		sst.Smallest = sst.SmallestPointKey
		if base.InternalCompare(cmp, sst.SmallestRangeKey, sst.SmallestPointKey) < 0 {
			sst.Smallest = sst.SmallestRangeKey
		}
	}
	switch {
	case len(sst.LargestRangeKey.UserKey) == 0:
		sst.Largest = sst.LargestPointKey
	case len(sst.LargestPointKey.UserKey) == 0:
		sst.Largest = sst.LargestRangeKey
	default:
		sst.Largest = sst.LargestPointKey
		if base.InternalCompare(cmp, sst.LargestRangeKey, sst.LargestPointKey) > 0 {
			sst.Largest = sst.LargestRangeKey
		}
	}
	// On rare occasion, a file might overlap with [lower, upper) but not actually
	// have any keys within those bounds. Skip such files.
	if len(sst.Smallest.UserKey) == 0 {
		return nil, true, nil
	}
	sst.Size, err = d.tableCache.estimateSize(file, sst.Smallest.UserKey, sst.Largest.UserKey)
	if err != nil {
		return nil, false, err
	}
	// On occasion, estimateSize gives us a low estimate, i.e. a 0 file size. This
	// can cause panics in places where we divide by file sizes. Correct for it
	// here.
	if sst.Size == 0 {
		sst.Size = 1
	}
	return sst, false, nil
}
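
// smallestOf is an illustrative sketch, not used by Pebble, of the
// bound-combining rule in the switches above: an unset bound is represented
// by an empty user key, and when both the point and range key bounds are set,
// the smaller of the two becomes the overall bound.
func smallestOf(cmp Compare, point, rangeBound InternalKey) InternalKey {
	switch {
	case len(rangeBound.UserKey) == 0:
		return point
	case len(point.UserKey) == 0:
		return rangeBound
	case base.InternalCompare(cmp, rangeBound, point) < 0:
		return rangeBound
	default:
		return point
	}
}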

func scanInternalImpl(
	ctx context.Context, lower, upper []byte, iter *scanInternalIterator, opts *scanInternalOptions,
) error {
	if opts.visitSharedFile != nil && (lower == nil || upper == nil) {
		panic("lower and upper bounds must be specified in skip-shared iteration mode")
	}
	// Before starting iteration, check if any files in levels sharedLevelsStart
	// and below are *not* shared. Error out if that is the case, as skip-shared
	// iteration will not produce a consistent point-in-time view of this range
	// of keys. For files that are shared, call visitSharedFile with a truncated
	// version of that file.
	cmp := iter.comparer.Compare
	provider := iter.db.ObjProvider()
	seqNum := iter.seqNum
	current := iter.version
	if current == nil {
		current = iter.readState.current
	}
	if opts.visitSharedFile != nil {
		if provider == nil {
			panic("expected non-nil Provider in skip-shared iteration mode")
		}
		for level := sharedLevelsStart; level < numLevels; level++ {
			files := current.Levels[level].Iter()
			for f := files.SeekGE(cmp, lower); f != nil && cmp(f.Smallest.UserKey, upper) < 0; f = files.Next() {
				var objMeta objstorage.ObjectMetadata
				var err error
				objMeta, err = provider.Lookup(fileTypeTable, f.FileBacking.DiskFileNum)
				if err != nil {
					return err
				}
				if !objMeta.IsShared() {
					return errors.Wrapf(ErrInvalidSkipSharedIteration, "file %s is not shared", objMeta.DiskFileNum)
				}
				if !base.Visible(f.LargestSeqNum, seqNum, base.InternalKeySeqNumMax) {
					return errors.Wrapf(ErrInvalidSkipSharedIteration, "file %s contains keys newer than snapshot", objMeta.DiskFileNum)
				}
				var sst *SharedSSTMeta
				var skip bool
				sst, skip, err = iter.db.truncateSharedFile(ctx, lower, upper, level, f, objMeta)
				if err != nil {
					return err
				}
				if skip {
					continue
				}
				if err = opts.visitSharedFile(sst); err != nil {
					return err
				}
			}
		}
	}

	for valid := iter.seekGE(lower); valid && iter.error() == nil; valid = iter.next() {
		key := iter.unsafeKey()

		if opts.rateLimitFunc != nil {
			if err := opts.rateLimitFunc(key, iter.lazyValue()); err != nil {
				return err
			}
		}

		switch key.Kind() {
		case InternalKeyKindRangeKeyDelete, InternalKeyKindRangeKeyUnset, InternalKeyKindRangeKeySet:
			if opts.visitRangeKey != nil {
				span := iter.unsafeSpan()
				// NB: The caller isn't interested in the sequence numbers of these
				// range keys. Rather, the caller wants them to be in trailer order
				// _after_ zeroing of sequence numbers. Copy span.Keys, sort it, and then
				// call visitRangeKey.
				keysCopy := make([]keyspan.Key, len(span.Keys))
				for i := range span.Keys {
					keysCopy[i] = span.Keys[i]
					keysCopy[i].Trailer = base.MakeTrailer(0, span.Keys[i].Kind())
				}
				keyspan.SortKeysByTrailer(&keysCopy)
				if err := opts.visitRangeKey(span.Start, span.End, keysCopy); err != nil {
					return err
				}
			}
		case InternalKeyKindRangeDelete:
			if opts.visitRangeDel != nil {
				rangeDel := iter.unsafeRangeDel()
				if err := opts.visitRangeDel(rangeDel.Start, rangeDel.End, rangeDel.LargestSeqNum()); err != nil {
					return err
				}
			}
		default:
			if opts.visitPointKey != nil {
				var info IteratorLevel
				if len(iter.mergingIter.heap.items) > 0 {
					mergingIterIdx := iter.mergingIter.heap.items[0].index
					info = iter.iterLevels[mergingIterIdx]
				} else {
					info = IteratorLevel{Kind: IteratorLevelUnknown}
				}
				val := iter.lazyValue()
				if err := opts.visitPointKey(key, val, info); err != nil {
					return err
				}
			}
		}
	}

	return nil
}
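
// zeroAndSortRangeKeys is a sketch, not used by Pebble, isolating the
// preparation scanInternalImpl performs before invoking visitRangeKey above:
// sequence numbers are zeroed, since the caller isn't interested in them, and
// the copied keys are then put into trailer order as of that zeroing.
func zeroAndSortRangeKeys(keys []keyspan.Key) []keyspan.Key {
	keysCopy := make([]keyspan.Key, len(keys))
	for i := range keys {
		keysCopy[i] = keys[i]
		// Zero the seqnum while preserving the key kind.
		keysCopy[i].Trailer = base.MakeTrailer(0, keys[i].Kind())
	}
	keyspan.SortKeysByTrailer(&keysCopy)
	return keysCopy
}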

// constructPointIter constructs a merging iterator and sets i.iter to it.
func (i *scanInternalIterator) constructPointIter(
	categoryAndQoS sstable.CategoryAndQoS, memtables flushableList, buf *iterAlloc,
) {
	// Merging levels and levels from iterAlloc.
	mlevels := buf.mlevels[:0]
	levels := buf.levels[:0]

	// We compute the number of levels needed ahead of time and reallocate a slice if
	// the array from the iterAlloc isn't large enough. Doing this allocation once
	// should improve performance.
	numMergingLevels := len(memtables)
	numLevelIters := 0

	current := i.version
	if current == nil {
		current = i.readState.current
	}
	numMergingLevels += len(current.L0SublevelFiles)
	numLevelIters += len(current.L0SublevelFiles)

	for level := 1; level < len(current.Levels); level++ {
		if current.Levels[level].Empty() {
			continue
		}
		if i.opts.skipSharedLevels && level >= sharedLevelsStart {
			continue
		}
		numMergingLevels++
		numLevelIters++
	}

	if numMergingLevels > cap(mlevels) {
		mlevels = make([]mergingIterLevel, 0, numMergingLevels)
	}
	if numLevelIters > cap(levels) {
		levels = make([]levelIter, 0, numLevelIters)
	}
	// TODO(bilal): Push these into the iterAlloc buf.
	var rangeDelMiter keyspan.MergingIter
	rangeDelIters := make([]keyspan.FragmentIterator, 0, numMergingLevels)
	rangeDelLevels := make([]keyspan.LevelIter, 0, numLevelIters)

	i.iterLevels = make([]IteratorLevel, numMergingLevels)
	mlevelsIndex := 0

	// First, add the memtables.
	for j := len(memtables) - 1; j >= 0; j-- {
		mem := memtables[j]
		mlevels = append(mlevels, mergingIterLevel{
			iter: mem.newIter(&i.opts.IterOptions),
		})
		i.iterLevels[mlevelsIndex] = IteratorLevel{
			Kind:           IteratorLevelFlushable,
			FlushableIndex: j,
		}
		mlevelsIndex++
		if rdi := mem.newRangeDelIter(&i.opts.IterOptions); rdi != nil {
			rangeDelIters = append(rangeDelIters, rdi)
		}
	}

	// Next are the file levels: L0 sub-levels followed by lower levels.
	levelsIndex := len(levels)
	mlevels = mlevels[:numMergingLevels]
	levels = levels[:numLevelIters]
	rangeDelLevels = rangeDelLevels[:numLevelIters]
	i.opts.IterOptions.snapshotForHideObsoletePoints = i.seqNum
	i.opts.IterOptions.CategoryAndQoS = categoryAndQoS
	addLevelIterForFiles := func(files manifest.LevelIterator, level manifest.Level) {
		li := &levels[levelsIndex]
		rli := &rangeDelLevels[levelsIndex]

		li.init(
			i.ctx, i.opts.IterOptions, i.comparer, i.newIters, files, level,
			internalIterOpts{})
		li.initBoundaryContext(&mlevels[mlevelsIndex].levelIterBoundaryContext)
		mlevels[mlevelsIndex].iter = li
		rli.Init(keyspan.SpanIterOptions{RangeKeyFilters: i.opts.RangeKeyFilters},
			i.comparer.Compare, tableNewRangeDelIter(i.ctx, i.newIters), files, level,
			manifest.KeyTypePoint)
		rangeDelIters = append(rangeDelIters, rli)

		levelsIndex++
		mlevelsIndex++
	}

	for j := len(current.L0SublevelFiles) - 1; j >= 0; j-- {
		i.iterLevels[mlevelsIndex] = IteratorLevel{
			Kind:     IteratorLevelLSM,
			Level:    0,
			Sublevel: j,
		}
		addLevelIterForFiles(current.L0SublevelFiles[j].Iter(), manifest.L0Sublevel(j))
	}
	// Add level iterators for the non-empty non-L0 levels.
	for level := 1; level < numLevels; level++ {
		if current.Levels[level].Empty() {
			continue
		}
		if i.opts.skipSharedLevels && level >= sharedLevelsStart {
			continue
		}
		i.iterLevels[mlevelsIndex] = IteratorLevel{Kind: IteratorLevelLSM, Level: level}
		addLevelIterForFiles(current.Levels[level].Iter(), manifest.Level(level))
	}

	buf.merging.init(&i.opts.IterOptions, &InternalIteratorStats{}, i.comparer.Compare, i.comparer.Split, mlevels...)
	buf.merging.snapshot = i.seqNum
	rangeDelMiter.Init(i.comparer.Compare, keyspan.VisibleTransform(i.seqNum), new(keyspan.MergingBuffers), rangeDelIters...)

	if i.opts.includeObsoleteKeys {
		iiter := &keyspan.InterleavingIter{}
		iiter.Init(i.comparer, &buf.merging, &rangeDelMiter,
			keyspan.InterleavingIterOpts{
				LowerBound: i.opts.LowerBound,
				UpperBound: i.opts.UpperBound,
			})
		i.pointKeyIter = iiter
	} else {
		pcIter := &pointCollapsingIterator{
			comparer: i.comparer,
			merge:    i.merge,
			seqNum:   i.seqNum,
		}
		pcIter.iter.Init(i.comparer, &buf.merging, &rangeDelMiter, keyspan.InterleavingIterOpts{
			LowerBound: i.opts.LowerBound,
			UpperBound: i.opts.UpperBound,
		})
		i.pointKeyIter = pcIter
	}
	i.iter = i.pointKeyIter
}
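
// ensureMergingLevelCapacity is a sketch, not part of Pebble, of the
// pre-sizing pattern constructPointIter uses: the number of merging levels is
// counted up front so that, when the iterAlloc-provided array is too small, a
// single fresh allocation of the right capacity is made instead of repeated
// growth during appends.
func ensureMergingLevelCapacity(buf []mergingIterLevel, needed int) []mergingIterLevel {
	if needed > cap(buf) {
		return make([]mergingIterLevel, 0, needed)
	}
	return buf[:0]
}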

// constructRangeKeyIter constructs the range-key iterator stack, populating
// i.rangeKey.rangeKeyIter with the resulting iterator. This is similar to
// Iterator.constructRangeKeyIter, except it doesn't handle batches and ensures
// iterConfig does *not* elide unsets/deletes.
func (i *scanInternalIterator) constructRangeKeyIter() error {
	// We want the bounded iter from iterConfig, but not the collapsing of
	// RangeKeyUnsets and RangeKeyDels.
	i.rangeKey.rangeKeyIter = i.rangeKey.iterConfig.Init(
		i.comparer, i.seqNum, i.opts.LowerBound, i.opts.UpperBound,
		nil /* hasPrefix */, nil /* prefix */, true, /* internalKeys */
		&i.rangeKey.rangeKeyBuffers.internal)

	// Next are the flushables: memtables and large batches.
	if i.readState != nil {
		for j := len(i.readState.memtables) - 1; j >= 0; j-- {
			mem := i.readState.memtables[j]
			// We only need to read from memtables which contain sequence numbers older
			// than seqNum.
			if logSeqNum := mem.logSeqNum; logSeqNum >= i.seqNum {
				continue
			}
			if rki := mem.newRangeKeyIter(&i.opts.IterOptions); rki != nil {
				i.rangeKey.iterConfig.AddLevel(rki)
			}
		}
	}

	current := i.version
	if current == nil {
		current = i.readState.current
	}
	// Next are the file levels: L0 sub-levels followed by lower levels.
	//
	// Add file-specific iterators for L0 files containing range keys. This is less
	// efficient than using levelIters for sublevels of L0 files containing
	// range keys, but range keys are expected to be sparse anyway, reducing the
	// cost benefit of maintaining a separate L0Sublevels instance for range key
	// files and then using it here.
	//
	// NB: We iterate L0's files in reverse order. They're sorted by
	// LargestSeqNum ascending, and we need to add them to the merging iterator
	// in LargestSeqNum descending to preserve the merging iterator's invariants
	// around Key Trailer order.
	iter := current.RangeKeyLevels[0].Iter()
	for f := iter.Last(); f != nil; f = iter.Prev() {
		spanIter, err := i.newIterRangeKey(f, i.opts.SpanIterOptions())
		if err != nil {
			return err
		}
		i.rangeKey.iterConfig.AddLevel(spanIter)
	}

	// Add level iterators for the non-empty non-L0 levels.
	for level := 1; level < len(current.RangeKeyLevels); level++ {
		if current.RangeKeyLevels[level].Empty() {
			continue
		}
		if i.opts.skipSharedLevels && level >= sharedLevelsStart {
			continue
		}
		li := i.rangeKey.iterConfig.NewLevelIter()
		spanIterOpts := i.opts.SpanIterOptions()
		li.Init(spanIterOpts, i.comparer.Compare, i.newIterRangeKey, current.RangeKeyLevels[level].Iter(),
			manifest.Level(level), manifest.KeyTypeRange)
		i.rangeKey.iterConfig.AddLevel(li)
	}
	return nil
}

// seekGE seeks this iterator to the first key that's greater than or equal
// to the specified user key.
func (i *scanInternalIterator) seekGE(key []byte) bool {
	i.iterKey, i.iterValue = i.iter.SeekGE(key, base.SeekGEFlagsNone)
	return i.iterKey != nil
}

// unsafeKey returns the unsafe InternalKey at the current position. The value
// is nil if the iterator is invalid or exhausted.
func (i *scanInternalIterator) unsafeKey() *InternalKey {
	return i.iterKey
}

// lazyValue returns a value pointer to the value at the current iterator
// position. Behaviour is undefined if unsafeKey() returns a rangekey or
// rangedel kind key.
func (i *scanInternalIterator) lazyValue() LazyValue {
	return i.iterValue
}

// unsafeRangeDel returns a range deletion span. Behaviour is undefined if
// unsafeKey() returns a non-rangedel kind.
func (i *scanInternalIterator) unsafeRangeDel() *keyspan.Span {
	type spanInternalIterator interface {
		Span() *keyspan.Span
	}
	return i.pointKeyIter.(spanInternalIterator).Span()
}

// unsafeSpan returns a range key span. Behaviour is undefined if unsafeKey()
// returns a non-rangekey kind.
func (i *scanInternalIterator) unsafeSpan() *keyspan.Span {
	return i.rangeKey.iiter.Span()
}

// next advances the iterator in the forward direction, and returns the
// iterator's new validity state.
func (i *scanInternalIterator) next() bool {
	i.iterKey, i.iterValue = i.iter.Next()
	return i.iterKey != nil
}

// error returns an error from the internal iterator, if there's any.
func (i *scanInternalIterator) error() error {
	return i.iter.Error()
}

// close closes this iterator, and releases any pooled objects.
func (i *scanInternalIterator) close() error {
	if err := i.iter.Close(); err != nil {
		return err
	}
	if i.readState != nil {
		i.readState.unref()
	}
	if i.version != nil {
		i.version.Unref()
	}
	if i.rangeKey != nil {
		i.rangeKey.PrepareForReuse()
		*i.rangeKey = iteratorRangeKeyState{
			rangeKeyBuffers: i.rangeKey.rangeKeyBuffers,
		}
		iterRangeKeyStateAllocPool.Put(i.rangeKey)
		i.rangeKey = nil
	}
	if alloc := i.alloc; alloc != nil {
		for j := range i.boundsBuf {
			if cap(i.boundsBuf[j]) >= maxKeyBufCacheSize {
				alloc.boundsBuf[j] = nil
			} else {
				alloc.boundsBuf[j] = i.boundsBuf[j]
			}
		}
		*alloc = iterAlloc{
			keyBuf:              alloc.keyBuf[:0],
			boundsBuf:           alloc.boundsBuf,
			prefixOrFullSeekKey: alloc.prefixOrFullSeekKey[:0],
		}
		iterAllocPool.Put(alloc)
		i.alloc = nil
	}
	return nil
}

func (i *scanInternalIterator) initializeBoundBufs(lower, upper []byte) {
	buf := i.boundsBuf[i.boundsBufIdx][:0]
	if lower != nil {
		buf = append(buf, lower...)
		i.opts.LowerBound = buf
	} else {
		i.opts.LowerBound = nil
	}
	if upper != nil {
		buf = append(buf, upper...)
		i.opts.UpperBound = buf[len(buf)-len(upper):]
	} else {
		i.opts.UpperBound = nil
	}
	i.boundsBuf[i.boundsBufIdx] = buf
	i.boundsBufIdx = 1 - i.boundsBufIdx
}
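
// packBounds is a sketch, not part of Pebble, of the buffer-sharing trick in
// initializeBoundBufs above: both bounds are appended to a single buffer so
// one allocation backs both, with the upper bound re-sliced off the tail of
// the combined buffer. Unlike the real code, this sketch doesn't preserve nil
// for unset bounds.
func packBounds(buf, lower, upper []byte) (combined, lo, up []byte) {
	buf = append(buf[:0], lower...)
	buf = append(buf, upper...)
	// lower occupies the front of the buffer; upper occupies the tail.
	return buf, buf[:len(lower)], buf[len(buf)-len(upper):]
}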