github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/external_iterator.go (about)

     1  // Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package pebble
     6  
     7  import (
     8  	"context"
     9  	"fmt"
    10  	"sort"
    11  
    12  	"github.com/cockroachdb/errors"
    13  	"github.com/cockroachdb/pebble/internal/base"
    14  	"github.com/cockroachdb/pebble/internal/keyspan"
    15  	"github.com/cockroachdb/pebble/internal/manifest"
    16  	"github.com/cockroachdb/pebble/sstable"
    17  )
    18  
// ExternalIterOption provides an interface to specify open-time options to
// NewExternalIter.
//
// Both methods are unexported, so the interface can only be implemented by
// types declared in this package.
type ExternalIterOption interface {
	// iterApply is called on the iterator during opening in order to set internal
	// parameters.
	iterApply(*Iterator)
	// readerOptions returns any reader options added by this iter option.
	readerOptions() []sstable.ReaderOption
}
    28  
// externalIterReaderOptions is the ExternalIterOption implementation returned
// by ExternalIterReaderOptions. It only carries sstable reader options and
// does not modify the Iterator itself.
type externalIterReaderOptions struct {
	// opts are handed back verbatim by readerOptions.
	opts []sstable.ReaderOption
}
    32  
    33  func (e *externalIterReaderOptions) iterApply(iterator *Iterator) {
    34  	// Do nothing.
    35  }
    36  
// readerOptions implements ExternalIterOption by returning the reader options
// this value was constructed with.
func (e *externalIterReaderOptions) readerOptions() []sstable.ReaderOption {
	return e.opts
}
    40  
    41  // ExternalIterReaderOptions returns an ExternalIterOption that specifies
    42  // sstable.ReaderOptions to be applied on sstable readers in NewExternalIter.
    43  func ExternalIterReaderOptions(opts ...sstable.ReaderOption) ExternalIterOption {
    44  	return &externalIterReaderOptions{opts: opts}
    45  }
    46  
// ExternalIterForwardOnly is an ExternalIterOption that specifies this iterator
// will only be used for forward positioning operations (First, SeekGE, Next).
// This could enable optimizations that take advantage of this invariant.
// Behaviour when a reverse positioning operation is done on an iterator
// opened with this option is unpredictable: when the forward-only
// simpleLevelIter is part of the iterator stack, its reverse positioning
// methods (SeekLT, Last, Prev) panic with "unimplemented".
//
// NOTE(review): the previous wording of the last sentence was cut off
// ("though in most cases it should."); confirm the intended guarantee.
type ExternalIterForwardOnly struct{}
    53  
// iterApply implements ExternalIterOption by marking the iterator as
// forward-only. createExternalPointIter consults this flag when deciding
// whether sstable iterators can be grouped into a simpleLevelIter.
func (e ExternalIterForwardOnly) iterApply(iter *Iterator) {
	iter.forwardOnly = true
}
    57  
// readerOptions implements ExternalIterOption. This option contributes no
// sstable reader options.
func (e ExternalIterForwardOnly) readerOptions() []sstable.ReaderOption {
	return nil
}
    61  
    62  // NewExternalIter takes an input 2d array of sstable files which may overlap
    63  // across subarrays but not within a subarray (at least as far as points are
    64  // concerned; range keys are allowed to overlap arbitrarily even within a
    65  // subarray), and returns an Iterator over the merged contents of the sstables.
    66  // Input sstables may contain point keys, range keys, range deletions, etc. The
    67  // input files slice must be sorted in reverse chronological ordering. A key in a
    68  // file at a lower index subarray will shadow a key with an identical user key
    69  // contained within a file at a higher index subarray. Each subarray must be
    70  // sorted in internal key order, where lower index files contain keys that sort
    71  // left of files with higher indexes.
    72  //
    73  // Input sstables must only contain keys with the zero sequence number.
    74  //
    75  // Iterators constructed through NewExternalIter do not support all iterator
    76  // options, including block-property and table filters. NewExternalIter errors
    77  // if an incompatible option is set.
    78  func NewExternalIter(
    79  	o *Options,
    80  	iterOpts *IterOptions,
    81  	files [][]sstable.ReadableFile,
    82  	extraOpts ...ExternalIterOption,
    83  ) (it *Iterator, err error) {
    84  	return NewExternalIterWithContext(context.Background(), o, iterOpts, files, extraOpts...)
    85  }
    86  
    87  // NewExternalIterWithContext is like NewExternalIter, and additionally
    88  // accepts a context for tracing.
    89  func NewExternalIterWithContext(
    90  	ctx context.Context,
    91  	o *Options,
    92  	iterOpts *IterOptions,
    93  	files [][]sstable.ReadableFile,
    94  	extraOpts ...ExternalIterOption,
    95  ) (it *Iterator, err error) {
    96  	if iterOpts != nil {
    97  		if err := validateExternalIterOpts(iterOpts); err != nil {
    98  			return nil, err
    99  		}
   100  	}
   101  
   102  	var readers [][]*sstable.Reader
   103  
   104  	// Ensure we close all the opened readers if we error out.
   105  	defer func() {
   106  		if err != nil {
   107  			for i := range readers {
   108  				for j := range readers[i] {
   109  					_ = readers[i][j].Close()
   110  				}
   111  			}
   112  		}
   113  	}()
   114  	seqNumOffset := 0
   115  	var extraReaderOpts []sstable.ReaderOption
   116  	for i := range extraOpts {
   117  		extraReaderOpts = append(extraReaderOpts, extraOpts[i].readerOptions()...)
   118  	}
   119  	for _, levelFiles := range files {
   120  		seqNumOffset += len(levelFiles)
   121  	}
   122  	for _, levelFiles := range files {
   123  		var subReaders []*sstable.Reader
   124  		seqNumOffset -= len(levelFiles)
   125  		subReaders, err = openExternalTables(o, levelFiles, seqNumOffset, o.MakeReaderOptions(), extraReaderOpts...)
   126  		readers = append(readers, subReaders)
   127  	}
   128  	if err != nil {
   129  		return nil, err
   130  	}
   131  
   132  	buf := iterAllocPool.Get().(*iterAlloc)
   133  	dbi := &buf.dbi
   134  	*dbi = Iterator{
   135  		ctx:                 ctx,
   136  		alloc:               buf,
   137  		merge:               o.Merger.Merge,
   138  		comparer:            *o.Comparer,
   139  		readState:           nil,
   140  		keyBuf:              buf.keyBuf,
   141  		prefixOrFullSeekKey: buf.prefixOrFullSeekKey,
   142  		boundsBuf:           buf.boundsBuf,
   143  		batch:               nil,
   144  		// Add the readers to the Iterator so that Close closes them, and
   145  		// SetOptions can re-construct iterators from them.
   146  		externalReaders: readers,
   147  		newIters: func(
   148  			ctx context.Context, f *manifest.FileMetadata, opts *IterOptions,
   149  			internalOpts internalIterOpts) (internalIterator, keyspan.FragmentIterator, error) {
   150  			// NB: External iterators are currently constructed without any
   151  			// `levelIters`. newIters should never be called. When we support
   152  			// organizing multiple non-overlapping files into a single level
   153  			// (see TODO below), we'll need to adjust this tableNewIters
   154  			// implementation to open iterators by looking up f in a map
   155  			// of readers indexed by *fileMetadata.
   156  			panic("unreachable")
   157  		},
   158  		seqNum: base.InternalKeySeqNumMax,
   159  	}
   160  	if iterOpts != nil {
   161  		dbi.opts = *iterOpts
   162  		dbi.processBounds(iterOpts.LowerBound, iterOpts.UpperBound)
   163  	}
   164  	for i := range extraOpts {
   165  		extraOpts[i].iterApply(dbi)
   166  	}
   167  	finishInitializingExternal(ctx, dbi)
   168  	return dbi, nil
   169  }
   170  
   171  func validateExternalIterOpts(iterOpts *IterOptions) error {
   172  	switch {
   173  	case iterOpts.TableFilter != nil:
   174  		return errors.Errorf("pebble: external iterator: TableFilter unsupported")
   175  	case iterOpts.PointKeyFilters != nil:
   176  		return errors.Errorf("pebble: external iterator: PointKeyFilters unsupported")
   177  	case iterOpts.RangeKeyFilters != nil:
   178  		return errors.Errorf("pebble: external iterator: RangeKeyFilters unsupported")
   179  	case iterOpts.OnlyReadGuaranteedDurable:
   180  		return errors.Errorf("pebble: external iterator: OnlyReadGuaranteedDurable unsupported")
   181  	case iterOpts.UseL6Filters:
   182  		return errors.Errorf("pebble: external iterator: UseL6Filters unsupported")
   183  	}
   184  	return nil
   185  }
   186  
   187  func createExternalPointIter(ctx context.Context, it *Iterator) (internalIterator, error) {
   188  	// TODO(jackson): In some instances we could generate fewer levels by using
   189  	// L0Sublevels code to organize nonoverlapping files into the same level.
   190  	// This would allow us to use levelIters and keep a smaller set of data and
   191  	// files in-memory. However, it would also require us to identify the bounds
   192  	// of all the files upfront.
   193  
   194  	if !it.opts.pointKeys() {
   195  		return emptyIter, nil
   196  	} else if it.pointIter != nil {
   197  		return it.pointIter, nil
   198  	}
   199  	mlevels := it.alloc.mlevels[:0]
   200  
   201  	if len(it.externalReaders) > cap(mlevels) {
   202  		mlevels = make([]mergingIterLevel, 0, len(it.externalReaders))
   203  	}
   204  	for _, readers := range it.externalReaders {
   205  		var combinedIters []internalIterator
   206  		for _, r := range readers {
   207  			var (
   208  				rangeDelIter keyspan.FragmentIterator
   209  				pointIter    internalIterator
   210  				err          error
   211  			)
   212  			// We could set hideObsoletePoints=true, since we are reading at
   213  			// InternalKeySeqNumMax, but we don't bother since these sstables should
   214  			// not have obsolete points (so the performance optimization is
   215  			// unnecessary), and we don't want to bother constructing a
   216  			// BlockPropertiesFilterer that includes obsoleteKeyBlockPropertyFilter.
   217  			pointIter, err = r.NewIterWithBlockPropertyFiltersAndContextEtc(
   218  				ctx, it.opts.LowerBound, it.opts.UpperBound, nil, /* BlockPropertiesFilterer */
   219  				false /* hideObsoletePoints */, false, /* useFilterBlock */
   220  				&it.stats.InternalStats, sstable.TrivialReaderProvider{Reader: r})
   221  			if err != nil {
   222  				return nil, err
   223  			}
   224  			rangeDelIter, err = r.NewRawRangeDelIter()
   225  			if err != nil {
   226  				return nil, err
   227  			}
   228  			if rangeDelIter == nil && pointIter != nil && it.forwardOnly {
   229  				// TODO(bilal): Consider implementing range key pausing in
   230  				// simpleLevelIter so we can reduce mergingIterLevels even more by
   231  				// sending all sstable iterators to combinedIters, not just those
   232  				// corresponding to sstables without range deletes.
   233  				combinedIters = append(combinedIters, pointIter)
   234  				continue
   235  			}
   236  			mlevels = append(mlevels, mergingIterLevel{
   237  				iter:         pointIter,
   238  				rangeDelIter: rangeDelIter,
   239  			})
   240  		}
   241  		if len(combinedIters) == 1 {
   242  			mlevels = append(mlevels, mergingIterLevel{
   243  				iter: combinedIters[0],
   244  			})
   245  		} else if len(combinedIters) > 1 {
   246  			sli := &simpleLevelIter{
   247  				cmp:   it.cmp,
   248  				iters: combinedIters,
   249  			}
   250  			sli.init(it.opts)
   251  			mlevels = append(mlevels, mergingIterLevel{
   252  				iter:         sli,
   253  				rangeDelIter: nil,
   254  			})
   255  		}
   256  	}
   257  	if len(mlevels) == 1 && mlevels[0].rangeDelIter == nil {
   258  		// Set closePointIterOnce to true. This is because we're bypassing the
   259  		// merging iter, which turns Close()s on it idempotent for any child
   260  		// iterators. The outer Iterator could call Close() on a point iter twice,
   261  		// which sstable iterators do not support (as they release themselves to
   262  		// a pool).
   263  		it.closePointIterOnce = true
   264  		return mlevels[0].iter, nil
   265  	}
   266  
   267  	it.alloc.merging.init(&it.opts, &it.stats.InternalStats, it.comparer.Compare, it.comparer.Split, mlevels...)
   268  	it.alloc.merging.snapshot = base.InternalKeySeqNumMax
   269  	if len(mlevels) <= cap(it.alloc.levelsPositioned) {
   270  		it.alloc.merging.levelsPositioned = it.alloc.levelsPositioned[:len(mlevels)]
   271  	}
   272  	return &it.alloc.merging, nil
   273  }
   274  
   275  func finishInitializingExternal(ctx context.Context, it *Iterator) {
   276  	pointIter, err := createExternalPointIter(ctx, it)
   277  	if err != nil {
   278  		it.pointIter = &errorIter{err: err}
   279  	} else {
   280  		it.pointIter = pointIter
   281  	}
   282  	it.iter = it.pointIter
   283  
   284  	if it.opts.rangeKeys() {
   285  		it.rangeKeyMasking.init(it, it.comparer.Compare, it.comparer.Split)
   286  		var rangeKeyIters []keyspan.FragmentIterator
   287  		if it.rangeKey == nil {
   288  			// We could take advantage of the lack of overlaps in range keys within
   289  			// each slice in it.externalReaders, and generate keyspan.LevelIters
   290  			// out of those. However, since range keys are expected to be sparse to
   291  			// begin with, the performance gain might not be significant enough to
   292  			// warrant it.
   293  			//
   294  			// TODO(bilal): Explore adding a simpleRangeKeyLevelIter that does not
   295  			// operate on FileMetadatas (similar to simpleLevelIter), and implements
   296  			// this optimization.
   297  			for _, readers := range it.externalReaders {
   298  				for _, r := range readers {
   299  					if rki, err := r.NewRawRangeKeyIter(); err != nil {
   300  						rangeKeyIters = append(rangeKeyIters, &errorKeyspanIter{err: err})
   301  					} else if rki != nil {
   302  						rangeKeyIters = append(rangeKeyIters, rki)
   303  					}
   304  				}
   305  			}
   306  			if len(rangeKeyIters) > 0 {
   307  				it.rangeKey = iterRangeKeyStateAllocPool.Get().(*iteratorRangeKeyState)
   308  				it.rangeKey.init(it.comparer.Compare, it.comparer.Split, &it.opts)
   309  				it.rangeKey.rangeKeyIter = it.rangeKey.iterConfig.Init(
   310  					&it.comparer,
   311  					base.InternalKeySeqNumMax,
   312  					it.opts.LowerBound, it.opts.UpperBound,
   313  					&it.hasPrefix, &it.prefixOrFullSeekKey,
   314  					false /* internalKeys */, &it.rangeKey.internal,
   315  				)
   316  				for i := range rangeKeyIters {
   317  					it.rangeKey.iterConfig.AddLevel(rangeKeyIters[i])
   318  				}
   319  			}
   320  		}
   321  		if it.rangeKey != nil {
   322  			it.rangeKey.iiter.Init(&it.comparer, it.iter, it.rangeKey.rangeKeyIter,
   323  				keyspan.InterleavingIterOpts{
   324  					Mask:       &it.rangeKeyMasking,
   325  					LowerBound: it.opts.LowerBound,
   326  					UpperBound: it.opts.UpperBound,
   327  				})
   328  			it.iter = &it.rangeKey.iiter
   329  		}
   330  	}
   331  }
   332  
   333  func openExternalTables(
   334  	o *Options,
   335  	files []sstable.ReadableFile,
   336  	seqNumOffset int,
   337  	readerOpts sstable.ReaderOptions,
   338  	extraReaderOpts ...sstable.ReaderOption,
   339  ) (readers []*sstable.Reader, err error) {
   340  	readers = make([]*sstable.Reader, 0, len(files))
   341  	for i := range files {
   342  		readable, err := sstable.NewSimpleReadable(files[i])
   343  		if err != nil {
   344  			return readers, err
   345  		}
   346  		r, err := sstable.NewReader(readable, readerOpts, extraReaderOpts...)
   347  		if err != nil {
   348  			return readers, err
   349  		}
   350  		// Use the index of the file in files as the sequence number for all of
   351  		// its keys.
   352  		r.Properties.GlobalSeqNum = uint64(len(files) - i + seqNumOffset)
   353  		readers = append(readers, r)
   354  	}
   355  	return readers, err
   356  }
   357  
// simpleLevelIter is similar to a levelIter in that it merges the points
// from multiple point iterators that are non-overlapping in the key ranges
// they return. It is only expected to support forward iteration and forward
// regular seeking; reverse iteration and prefix seeking is not supported.
// Intended to be a low-overhead, non-FileMetadata dependent option for
// NewExternalIter. To optimize seeking and forward iteration, it maintains
// two slices of child iterators; one of all iterators, and a subset of it that
// contains just the iterators that contain point keys within the current
// bounds.
//
// Note that this levelIter does not support pausing at file boundaries
// in case of range tombstones in this file that could apply to points outside
// of this file (and outside of this level). This is sufficient for optimizing
// the main use cases of NewExternalIter, however for completeness it would make
// sense to build this pausing functionality in.
type simpleLevelIter struct {
	// cmp orders user keys; SeekGE uses it to binary-search firstKeys.
	cmp Compare
	// err is a sticky error: while set, positioning methods return nil. It is
	// cleared by resetFilteredIters.
	err error
	// lowerBound is the current lower bound, or nil when there is none.
	lowerBound []byte
	// iters holds every child iterator; Close and SetBounds operate on all of
	// them.
	iters []internalIterator
	// filtered is the subset of iters that yielded a key when positioned at
	// lowerBound (or at their first key), in the same order as iters.
	filtered []internalIterator
	// firstKeys[i] is a copy of the user key filtered[i] yielded in
	// resetFilteredIters.
	firstKeys [][]byte
	// firstKeysBuf is the backing storage that firstKeys slices into.
	firstKeysBuf []byte
	// currentIdx indexes into filtered. It is -1 after SetBounds and may equal
	// len(filtered) once iteration has run past the last child.
	currentIdx int
}
   383  
   384  var _ internalIterator = &simpleLevelIter{}
   385  
   386  // init initializes this simpleLevelIter.
   387  func (s *simpleLevelIter) init(opts IterOptions) {
   388  	s.currentIdx = 0
   389  	s.lowerBound = opts.LowerBound
   390  	s.resetFilteredIters()
   391  }
   392  
   393  func (s *simpleLevelIter) resetFilteredIters() {
   394  	s.filtered = s.filtered[:0]
   395  	s.firstKeys = s.firstKeys[:0]
   396  	s.firstKeysBuf = s.firstKeysBuf[:0]
   397  	s.err = nil
   398  	for i := range s.iters {
   399  		var iterKey *base.InternalKey
   400  		if s.lowerBound != nil {
   401  			iterKey, _ = s.iters[i].SeekGE(s.lowerBound, base.SeekGEFlagsNone)
   402  		} else {
   403  			iterKey, _ = s.iters[i].First()
   404  		}
   405  		if iterKey != nil {
   406  			s.filtered = append(s.filtered, s.iters[i])
   407  			bufStart := len(s.firstKeysBuf)
   408  			s.firstKeysBuf = append(s.firstKeysBuf, iterKey.UserKey...)
   409  			s.firstKeys = append(s.firstKeys, s.firstKeysBuf[bufStart:bufStart+len(iterKey.UserKey)])
   410  		} else if err := s.iters[i].Error(); err != nil {
   411  			s.err = err
   412  		}
   413  	}
   414  }
   415  
   416  func (s *simpleLevelIter) SeekGE(
   417  	key []byte, flags base.SeekGEFlags,
   418  ) (*base.InternalKey, base.LazyValue) {
   419  	if s.err != nil {
   420  		return nil, base.LazyValue{}
   421  	}
   422  	// Find the first file that is entirely >= key. The file before that could
   423  	// contain the key we're looking for.
   424  	n := sort.Search(len(s.firstKeys), func(i int) bool {
   425  		return s.cmp(key, s.firstKeys[i]) <= 0
   426  	})
   427  	if n > 0 {
   428  		s.currentIdx = n - 1
   429  	} else {
   430  		s.currentIdx = n
   431  	}
   432  	if s.currentIdx < len(s.filtered) {
   433  		if iterKey, val := s.filtered[s.currentIdx].SeekGE(key, flags); iterKey != nil {
   434  			return iterKey, val
   435  		}
   436  		if err := s.filtered[s.currentIdx].Error(); err != nil {
   437  			s.err = err
   438  		}
   439  		s.currentIdx++
   440  	}
   441  	return s.skipEmptyFileForward(key, flags)
   442  }
   443  
   444  func (s *simpleLevelIter) skipEmptyFileForward(
   445  	seekKey []byte, flags base.SeekGEFlags,
   446  ) (*base.InternalKey, base.LazyValue) {
   447  	var iterKey *base.InternalKey
   448  	var val base.LazyValue
   449  	for s.currentIdx >= 0 && s.currentIdx < len(s.filtered) && s.err == nil {
   450  		if seekKey != nil {
   451  			iterKey, val = s.filtered[s.currentIdx].SeekGE(seekKey, flags)
   452  		} else if s.lowerBound != nil {
   453  			iterKey, val = s.filtered[s.currentIdx].SeekGE(s.lowerBound, flags)
   454  		} else {
   455  			iterKey, val = s.filtered[s.currentIdx].First()
   456  		}
   457  		if iterKey != nil {
   458  			return iterKey, val
   459  		}
   460  		if err := s.filtered[s.currentIdx].Error(); err != nil {
   461  			s.err = err
   462  		}
   463  		s.currentIdx++
   464  	}
   465  	return nil, base.LazyValue{}
   466  }
   467  
// SeekPrefixGE is not supported by simpleLevelIter (prefix seeking is outside
// its scope) and always panics.
func (s *simpleLevelIter) SeekPrefixGE(
	prefix, key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, base.LazyValue) {
	panic("unimplemented")
}
   473  
// SeekLT is not supported by simpleLevelIter (it is forward-only) and always
// panics.
func (s *simpleLevelIter) SeekLT(
	key []byte, flags base.SeekLTFlags,
) (*base.InternalKey, base.LazyValue) {
	panic("unimplemented")
}
   479  
   480  func (s *simpleLevelIter) First() (*base.InternalKey, base.LazyValue) {
   481  	if s.err != nil {
   482  		return nil, base.LazyValue{}
   483  	}
   484  	s.currentIdx = 0
   485  	return s.skipEmptyFileForward(nil /* seekKey */, base.SeekGEFlagsNone)
   486  }
   487  
// Last is not supported by simpleLevelIter (it is forward-only) and always
// panics.
func (s *simpleLevelIter) Last() (*base.InternalKey, base.LazyValue) {
	panic("unimplemented")
}
   491  
   492  func (s *simpleLevelIter) Next() (*base.InternalKey, base.LazyValue) {
   493  	if s.err != nil {
   494  		return nil, base.LazyValue{}
   495  	}
   496  	if s.currentIdx < 0 || s.currentIdx >= len(s.filtered) {
   497  		return nil, base.LazyValue{}
   498  	}
   499  	if iterKey, val := s.filtered[s.currentIdx].Next(); iterKey != nil {
   500  		return iterKey, val
   501  	}
   502  	s.currentIdx++
   503  	return s.skipEmptyFileForward(nil /* seekKey */, base.SeekGEFlagsNone)
   504  }
   505  
   506  func (s *simpleLevelIter) NextPrefix(succKey []byte) (*base.InternalKey, base.LazyValue) {
   507  	if s.err != nil {
   508  		return nil, base.LazyValue{}
   509  	}
   510  	if s.currentIdx < 0 || s.currentIdx >= len(s.filtered) {
   511  		return nil, base.LazyValue{}
   512  	}
   513  	if iterKey, val := s.filtered[s.currentIdx].NextPrefix(succKey); iterKey != nil {
   514  		return iterKey, val
   515  	}
   516  	s.currentIdx++
   517  	return s.skipEmptyFileForward(succKey /* seekKey */, base.SeekGEFlagsNone)
   518  }
   519  
// Prev is not supported by simpleLevelIter (it is forward-only) and always
// panics.
func (s *simpleLevelIter) Prev() (*base.InternalKey, base.LazyValue) {
	panic("unimplemented")
}
   523  
   524  func (s *simpleLevelIter) Error() error {
   525  	if s.currentIdx >= 0 && s.currentIdx < len(s.filtered) {
   526  		s.err = firstError(s.err, s.filtered[s.currentIdx].Error())
   527  	}
   528  	return s.err
   529  }
   530  
   531  func (s *simpleLevelIter) Close() error {
   532  	var err error
   533  	for i := range s.iters {
   534  		err = firstError(err, s.iters[i].Close())
   535  	}
   536  	return err
   537  }
   538  
   539  func (s *simpleLevelIter) SetBounds(lower, upper []byte) {
   540  	s.currentIdx = -1
   541  	s.lowerBound = lower
   542  	for i := range s.iters {
   543  		s.iters[i].SetBounds(lower, upper)
   544  	}
   545  	s.resetFilteredIters()
   546  }
   547  
   548  func (s *simpleLevelIter) String() string {
   549  	if s.currentIdx < 0 || s.currentIdx >= len(s.filtered) {
   550  		return "simpleLevelIter: current=<nil>"
   551  	}
   552  	return fmt.Sprintf("simpleLevelIter: current=%s", s.filtered[s.currentIdx])
   553  }
   554  
// Compile-time check that *simpleLevelIter satisfies internalIterator.
// NOTE(review): this repeats the identical assertion that follows the
// simpleLevelIter type declaration; one of the two could be removed.
var _ internalIterator = &simpleLevelIter{}