github.com/cockroachdb/pebble@v0.0.0-20231214172447-ab4952c5f87b/external_iterator.go (about)

     1  // Copyright 2022 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package pebble
     6  
     7  import (
     8  	"context"
     9  	"fmt"
    10  	"sort"
    11  
    12  	"github.com/cockroachdb/errors"
    13  	"github.com/cockroachdb/pebble/internal/base"
    14  	"github.com/cockroachdb/pebble/internal/keyspan"
    15  	"github.com/cockroachdb/pebble/internal/manifest"
    16  	"github.com/cockroachdb/pebble/sstable"
    17  )
    18  
// ExternalIterOption provides an interface to specify open-time options to
// NewExternalIter. Both methods are unexported, so implementations live in
// this package.
type ExternalIterOption interface {
	// iterApply is called on the iterator during opening in order to set internal
	// parameters. NewExternalIterWithContext invokes it after the Iterator
	// struct has been populated and before the internal iterators are built.
	iterApply(*Iterator)
	// readerOptions returns any reader options added by this iter option.
	// The returned options are passed to every sstable reader opened for the
	// external iterator. It may return nil.
	readerOptions() []sstable.ReaderOption
}
    28  
    29  type externalIterReaderOptions struct {
    30  	opts []sstable.ReaderOption
    31  }
    32  
    33  func (e *externalIterReaderOptions) iterApply(iterator *Iterator) {
    34  	// Do nothing.
    35  }
    36  
    37  func (e *externalIterReaderOptions) readerOptions() []sstable.ReaderOption {
    38  	return e.opts
    39  }
    40  
    41  // ExternalIterReaderOptions returns an ExternalIterOption that specifies
    42  // sstable.ReaderOptions to be applied on sstable readers in NewExternalIter.
    43  func ExternalIterReaderOptions(opts ...sstable.ReaderOption) ExternalIterOption {
    44  	return &externalIterReaderOptions{opts: opts}
    45  }
    46  
    47  // ExternalIterForwardOnly is an ExternalIterOption that specifies this iterator
    48  // will only be used for forward positioning operations (First, SeekGE, Next).
    49  // This could enable optimizations that take advantage of this invariant.
    50  // Behaviour when a reverse positioning operation is done on an iterator
    51  // opened with this option is unpredictable, though in most cases it should.
    52  type ExternalIterForwardOnly struct{}
    53  
    54  func (e ExternalIterForwardOnly) iterApply(iter *Iterator) {
    55  	iter.forwardOnly = true
    56  }
    57  
    58  func (e ExternalIterForwardOnly) readerOptions() []sstable.ReaderOption {
    59  	return nil
    60  }
    61  
// NewExternalIter takes an input 2d array of sstable files which may overlap
// across subarrays but not within a subarray (at least as far as points are
// concerned; range keys are allowed to overlap arbitrarily even within a
// subarray), and returns an Iterator over the merged contents of the sstables.
// Input sstables may contain point keys, range keys, range deletions, etc. The
// input files slice must be sorted in reverse chronological ordering. A key in a
// file at a lower index subarray will shadow a key with an identical user key
// contained within a file at a higher index subarray. Each subarray must be
// sorted in internal key order, where lower index files contain keys that sort
// left of files with higher indexes.
//
// Input sstables must only contain keys with the zero sequence number.
//
// Iterators constructed through NewExternalIter do not support all iterator
// options, including block-property and table filters. NewExternalIter errors
// if an incompatible option is set.
//
// iterOpts may be nil. extraOpts can supply additional sstable reader options
// or iterator tweaks (see ExternalIterOption).
//
// NewExternalIter is a convenience wrapper that calls
// NewExternalIterWithContext with context.Background().
func NewExternalIter(
	o *Options,
	iterOpts *IterOptions,
	files [][]sstable.ReadableFile,
	extraOpts ...ExternalIterOption,
) (it *Iterator, err error) {
	return NewExternalIterWithContext(context.Background(), o, iterOpts, files, extraOpts...)
}
    86  
    87  // NewExternalIterWithContext is like NewExternalIter, and additionally
    88  // accepts a context for tracing.
    89  func NewExternalIterWithContext(
    90  	ctx context.Context,
    91  	o *Options,
    92  	iterOpts *IterOptions,
    93  	files [][]sstable.ReadableFile,
    94  	extraOpts ...ExternalIterOption,
    95  ) (it *Iterator, err error) {
    96  	if iterOpts != nil {
    97  		if err := validateExternalIterOpts(iterOpts); err != nil {
    98  			return nil, err
    99  		}
   100  	}
   101  
   102  	var readers [][]*sstable.Reader
   103  
   104  	// Ensure we close all the opened readers if we error out.
   105  	defer func() {
   106  		if err != nil {
   107  			for i := range readers {
   108  				for j := range readers[i] {
   109  					_ = readers[i][j].Close()
   110  				}
   111  			}
   112  		}
   113  	}()
   114  	seqNumOffset := 0
   115  	var extraReaderOpts []sstable.ReaderOption
   116  	for i := range extraOpts {
   117  		extraReaderOpts = append(extraReaderOpts, extraOpts[i].readerOptions()...)
   118  	}
   119  	for _, levelFiles := range files {
   120  		seqNumOffset += len(levelFiles)
   121  	}
   122  	for _, levelFiles := range files {
   123  		var subReaders []*sstable.Reader
   124  		seqNumOffset -= len(levelFiles)
   125  		subReaders, err = openExternalTables(o, levelFiles, seqNumOffset, o.MakeReaderOptions(), extraReaderOpts...)
   126  		readers = append(readers, subReaders)
   127  	}
   128  	if err != nil {
   129  		return nil, err
   130  	}
   131  
   132  	buf := iterAllocPool.Get().(*iterAlloc)
   133  	dbi := &buf.dbi
   134  	*dbi = Iterator{
   135  		ctx:                 ctx,
   136  		alloc:               buf,
   137  		merge:               o.Merger.Merge,
   138  		comparer:            *o.Comparer,
   139  		readState:           nil,
   140  		keyBuf:              buf.keyBuf,
   141  		prefixOrFullSeekKey: buf.prefixOrFullSeekKey,
   142  		boundsBuf:           buf.boundsBuf,
   143  		batch:               nil,
   144  		// Add the readers to the Iterator so that Close closes them, and
   145  		// SetOptions can re-construct iterators from them.
   146  		externalReaders: readers,
   147  		newIters: func(
   148  			ctx context.Context, f *manifest.FileMetadata, opts *IterOptions,
   149  			internalOpts internalIterOpts) (internalIterator, keyspan.FragmentIterator, error) {
   150  			// NB: External iterators are currently constructed without any
   151  			// `levelIters`. newIters should never be called. When we support
   152  			// organizing multiple non-overlapping files into a single level
   153  			// (see TODO below), we'll need to adjust this tableNewIters
   154  			// implementation to open iterators by looking up f in a map
   155  			// of readers indexed by *fileMetadata.
   156  			panic("unreachable")
   157  		},
   158  		seqNum: base.InternalKeySeqNumMax,
   159  	}
   160  	if iterOpts != nil {
   161  		dbi.opts = *iterOpts
   162  		dbi.processBounds(iterOpts.LowerBound, iterOpts.UpperBound)
   163  	}
   164  	for i := range extraOpts {
   165  		extraOpts[i].iterApply(dbi)
   166  	}
   167  	if err := finishInitializingExternal(ctx, dbi); err != nil {
   168  		dbi.Close()
   169  		return nil, err
   170  	}
   171  	return dbi, nil
   172  }
   173  
   174  func validateExternalIterOpts(iterOpts *IterOptions) error {
   175  	switch {
   176  	case iterOpts.TableFilter != nil:
   177  		return errors.Errorf("pebble: external iterator: TableFilter unsupported")
   178  	case iterOpts.PointKeyFilters != nil:
   179  		return errors.Errorf("pebble: external iterator: PointKeyFilters unsupported")
   180  	case iterOpts.RangeKeyFilters != nil:
   181  		return errors.Errorf("pebble: external iterator: RangeKeyFilters unsupported")
   182  	case iterOpts.OnlyReadGuaranteedDurable:
   183  		return errors.Errorf("pebble: external iterator: OnlyReadGuaranteedDurable unsupported")
   184  	case iterOpts.UseL6Filters:
   185  		return errors.Errorf("pebble: external iterator: UseL6Filters unsupported")
   186  	}
   187  	return nil
   188  }
   189  
   190  func createExternalPointIter(ctx context.Context, it *Iterator) (internalIterator, error) {
   191  	// TODO(jackson): In some instances we could generate fewer levels by using
   192  	// L0Sublevels code to organize nonoverlapping files into the same level.
   193  	// This would allow us to use levelIters and keep a smaller set of data and
   194  	// files in-memory. However, it would also require us to identify the bounds
   195  	// of all the files upfront.
   196  
   197  	if !it.opts.pointKeys() {
   198  		return emptyIter, nil
   199  	} else if it.pointIter != nil {
   200  		return it.pointIter, nil
   201  	}
   202  	mlevels := it.alloc.mlevels[:0]
   203  
   204  	if len(it.externalReaders) > cap(mlevels) {
   205  		mlevels = make([]mergingIterLevel, 0, len(it.externalReaders))
   206  	}
   207  	for _, readers := range it.externalReaders {
   208  		var combinedIters []internalIterator
   209  		for _, r := range readers {
   210  			var (
   211  				rangeDelIter keyspan.FragmentIterator
   212  				pointIter    internalIterator
   213  				err          error
   214  			)
   215  			// We could set hideObsoletePoints=true, since we are reading at
   216  			// InternalKeySeqNumMax, but we don't bother since these sstables should
   217  			// not have obsolete points (so the performance optimization is
   218  			// unnecessary), and we don't want to bother constructing a
   219  			// BlockPropertiesFilterer that includes obsoleteKeyBlockPropertyFilter.
   220  			pointIter, err = r.NewIterWithBlockPropertyFiltersAndContextEtc(
   221  				ctx, it.opts.LowerBound, it.opts.UpperBound, nil, /* BlockPropertiesFilterer */
   222  				false /* hideObsoletePoints */, false, /* useFilterBlock */
   223  				&it.stats.InternalStats, it.opts.CategoryAndQoS, nil,
   224  				sstable.TrivialReaderProvider{Reader: r})
   225  			if err != nil {
   226  				return nil, err
   227  			}
   228  			rangeDelIter, err = r.NewRawRangeDelIter()
   229  			if err != nil {
   230  				return nil, err
   231  			}
   232  			if rangeDelIter == nil && pointIter != nil && it.forwardOnly {
   233  				// TODO(bilal): Consider implementing range key pausing in
   234  				// simpleLevelIter so we can reduce mergingIterLevels even more by
   235  				// sending all sstable iterators to combinedIters, not just those
   236  				// corresponding to sstables without range deletes.
   237  				combinedIters = append(combinedIters, pointIter)
   238  				continue
   239  			}
   240  			mlevels = append(mlevels, mergingIterLevel{
   241  				iter:         pointIter,
   242  				rangeDelIter: rangeDelIter,
   243  			})
   244  		}
   245  		if len(combinedIters) == 1 {
   246  			mlevels = append(mlevels, mergingIterLevel{
   247  				iter: combinedIters[0],
   248  			})
   249  		} else if len(combinedIters) > 1 {
   250  			sli := &simpleLevelIter{
   251  				cmp:   it.cmp,
   252  				iters: combinedIters,
   253  			}
   254  			sli.init(it.opts)
   255  			mlevels = append(mlevels, mergingIterLevel{
   256  				iter:         sli,
   257  				rangeDelIter: nil,
   258  			})
   259  		}
   260  	}
   261  	if len(mlevels) == 1 && mlevels[0].rangeDelIter == nil {
   262  		// Set closePointIterOnce to true. This is because we're bypassing the
   263  		// merging iter, which turns Close()s on it idempotent for any child
   264  		// iterators. The outer Iterator could call Close() on a point iter twice,
   265  		// which sstable iterators do not support (as they release themselves to
   266  		// a pool).
   267  		it.closePointIterOnce = true
   268  		return mlevels[0].iter, nil
   269  	}
   270  
   271  	it.alloc.merging.init(&it.opts, &it.stats.InternalStats, it.comparer.Compare, it.comparer.Split, mlevels...)
   272  	it.alloc.merging.snapshot = base.InternalKeySeqNumMax
   273  	if len(mlevels) <= cap(it.alloc.levelsPositioned) {
   274  		it.alloc.merging.levelsPositioned = it.alloc.levelsPositioned[:len(mlevels)]
   275  	}
   276  	return &it.alloc.merging, nil
   277  }
   278  
   279  func finishInitializingExternal(ctx context.Context, it *Iterator) error {
   280  	pointIter, err := createExternalPointIter(ctx, it)
   281  	if err != nil {
   282  		return err
   283  	}
   284  	it.pointIter = pointIter
   285  	it.iter = it.pointIter
   286  
   287  	if it.opts.rangeKeys() {
   288  		it.rangeKeyMasking.init(it, it.comparer.Compare, it.comparer.Split)
   289  		var rangeKeyIters []keyspan.FragmentIterator
   290  		if it.rangeKey == nil {
   291  			// We could take advantage of the lack of overlaps in range keys within
   292  			// each slice in it.externalReaders, and generate keyspan.LevelIters
   293  			// out of those. However, since range keys are expected to be sparse to
   294  			// begin with, the performance gain might not be significant enough to
   295  			// warrant it.
   296  			//
   297  			// TODO(bilal): Explore adding a simpleRangeKeyLevelIter that does not
   298  			// operate on FileMetadatas (similar to simpleLevelIter), and implements
   299  			// this optimization.
   300  			for _, readers := range it.externalReaders {
   301  				for _, r := range readers {
   302  					if rki, err := r.NewRawRangeKeyIter(); err != nil {
   303  						return err
   304  					} else if rki != nil {
   305  						rangeKeyIters = append(rangeKeyIters, rki)
   306  					}
   307  				}
   308  			}
   309  			if len(rangeKeyIters) > 0 {
   310  				it.rangeKey = iterRangeKeyStateAllocPool.Get().(*iteratorRangeKeyState)
   311  				it.rangeKey.init(it.comparer.Compare, it.comparer.Split, &it.opts)
   312  				it.rangeKey.rangeKeyIter = it.rangeKey.iterConfig.Init(
   313  					&it.comparer,
   314  					base.InternalKeySeqNumMax,
   315  					it.opts.LowerBound, it.opts.UpperBound,
   316  					&it.hasPrefix, &it.prefixOrFullSeekKey,
   317  					false /* internalKeys */, &it.rangeKey.internal,
   318  				)
   319  				for i := range rangeKeyIters {
   320  					it.rangeKey.iterConfig.AddLevel(rangeKeyIters[i])
   321  				}
   322  			}
   323  		}
   324  		if it.rangeKey != nil {
   325  			it.rangeKey.iiter.Init(&it.comparer, it.iter, it.rangeKey.rangeKeyIter,
   326  				keyspan.InterleavingIterOpts{
   327  					Mask:       &it.rangeKeyMasking,
   328  					LowerBound: it.opts.LowerBound,
   329  					UpperBound: it.opts.UpperBound,
   330  				})
   331  			it.iter = &it.rangeKey.iiter
   332  		}
   333  	}
   334  	return nil
   335  }
   336  
   337  func openExternalTables(
   338  	o *Options,
   339  	files []sstable.ReadableFile,
   340  	seqNumOffset int,
   341  	readerOpts sstable.ReaderOptions,
   342  	extraReaderOpts ...sstable.ReaderOption,
   343  ) (readers []*sstable.Reader, err error) {
   344  	readers = make([]*sstable.Reader, 0, len(files))
   345  	for i := range files {
   346  		readable, err := sstable.NewSimpleReadable(files[i])
   347  		if err != nil {
   348  			return readers, err
   349  		}
   350  		r, err := sstable.NewReader(readable, readerOpts, extraReaderOpts...)
   351  		if err != nil {
   352  			return readers, err
   353  		}
   354  		// Use the index of the file in files as the sequence number for all of
   355  		// its keys.
   356  		r.Properties.GlobalSeqNum = uint64(len(files) - i + seqNumOffset)
   357  		readers = append(readers, r)
   358  	}
   359  	return readers, err
   360  }
   361  
// simpleLevelIter is similar to a levelIter in that it merges the points
// from multiple point iterators that are non-overlapping in the key ranges
// they return. It is only expected to support forward iteration and forward
// regular seeking; reverse iteration and prefix seeking is not supported.
// Intended to be a low-overhead, non-FileMetadata dependent option for
// NewExternalIter. To optimize seeking and forward iteration, it maintains
// two slices of child iterators; one of all iterators, and a subset of it that
// contains just the iterators that contain point keys within the current
// bounds.
//
// Note that this levelIter does not support pausing at file boundaries
// in case of range tombstones in this file that could apply to points outside
// of this file (and outside of this level). This is sufficient for optimizing
// the main use cases of NewExternalIter, however for completeness it would make
// sense to build this pausing functionality in.
type simpleLevelIter struct {
	// cmp compares user keys; SeekGE uses it to binary-search firstKeys.
	cmp Compare
	// err is the sticky error. Once set, positioning methods return nil
	// until resetFilteredIters clears it.
	err error
	// lowerBound is the current lower bound, or nil for none. It is used to
	// position child iterators when no explicit seek key is available.
	lowerBound []byte
	// iters holds every child iterator. Close and SetBounds act on all of
	// them.
	iters []internalIterator
	// filtered is the subset of iters that yielded a key when positioned at
	// the start of the current bounds (see resetFilteredIters).
	filtered []internalIterator
	// firstKeys[i] is a copy of the user key of the first key returned by
	// filtered[i]; the two slices are index-aligned.
	firstKeys [][]byte
	// firstKeysBuf is the shared buffer that firstKeys entries are carved
	// out of.
	firstKeysBuf []byte
	// currentIdx is the index into filtered of the child currently being
	// iterated. It is out of range (-1 or len(filtered)) when unpositioned
	// or exhausted.
	currentIdx int
}

var _ internalIterator = &simpleLevelIter{}
   389  
   390  // init initializes this simpleLevelIter.
   391  func (s *simpleLevelIter) init(opts IterOptions) {
   392  	s.currentIdx = 0
   393  	s.lowerBound = opts.LowerBound
   394  	s.resetFilteredIters()
   395  }
   396  
   397  func (s *simpleLevelIter) resetFilteredIters() {
   398  	s.filtered = s.filtered[:0]
   399  	s.firstKeys = s.firstKeys[:0]
   400  	s.firstKeysBuf = s.firstKeysBuf[:0]
   401  	s.err = nil
   402  	for i := range s.iters {
   403  		var iterKey *base.InternalKey
   404  		if s.lowerBound != nil {
   405  			iterKey, _ = s.iters[i].SeekGE(s.lowerBound, base.SeekGEFlagsNone)
   406  		} else {
   407  			iterKey, _ = s.iters[i].First()
   408  		}
   409  		if iterKey != nil {
   410  			s.filtered = append(s.filtered, s.iters[i])
   411  			bufStart := len(s.firstKeysBuf)
   412  			s.firstKeysBuf = append(s.firstKeysBuf, iterKey.UserKey...)
   413  			s.firstKeys = append(s.firstKeys, s.firstKeysBuf[bufStart:bufStart+len(iterKey.UserKey)])
   414  		} else if err := s.iters[i].Error(); err != nil {
   415  			s.err = err
   416  		}
   417  	}
   418  }
   419  
   420  func (s *simpleLevelIter) SeekGE(
   421  	key []byte, flags base.SeekGEFlags,
   422  ) (*base.InternalKey, base.LazyValue) {
   423  	if s.err != nil {
   424  		return nil, base.LazyValue{}
   425  	}
   426  	// Find the first file that is entirely >= key. The file before that could
   427  	// contain the key we're looking for.
   428  	n := sort.Search(len(s.firstKeys), func(i int) bool {
   429  		return s.cmp(key, s.firstKeys[i]) <= 0
   430  	})
   431  	if n > 0 {
   432  		s.currentIdx = n - 1
   433  	} else {
   434  		s.currentIdx = n
   435  	}
   436  	if s.currentIdx < len(s.filtered) {
   437  		if iterKey, val := s.filtered[s.currentIdx].SeekGE(key, flags); iterKey != nil {
   438  			return iterKey, val
   439  		}
   440  		if err := s.filtered[s.currentIdx].Error(); err != nil {
   441  			s.err = err
   442  		}
   443  		s.currentIdx++
   444  	}
   445  	return s.skipEmptyFileForward(key, flags)
   446  }
   447  
   448  func (s *simpleLevelIter) skipEmptyFileForward(
   449  	seekKey []byte, flags base.SeekGEFlags,
   450  ) (*base.InternalKey, base.LazyValue) {
   451  	var iterKey *base.InternalKey
   452  	var val base.LazyValue
   453  	for s.currentIdx >= 0 && s.currentIdx < len(s.filtered) && s.err == nil {
   454  		if seekKey != nil {
   455  			iterKey, val = s.filtered[s.currentIdx].SeekGE(seekKey, flags)
   456  		} else if s.lowerBound != nil {
   457  			iterKey, val = s.filtered[s.currentIdx].SeekGE(s.lowerBound, flags)
   458  		} else {
   459  			iterKey, val = s.filtered[s.currentIdx].First()
   460  		}
   461  		if iterKey != nil {
   462  			return iterKey, val
   463  		}
   464  		if err := s.filtered[s.currentIdx].Error(); err != nil {
   465  			s.err = err
   466  		}
   467  		s.currentIdx++
   468  	}
   469  	return nil, base.LazyValue{}
   470  }
   471  
// SeekPrefixGE is not supported by simpleLevelIter, which only handles
// forward iteration and regular forward seeks. It always panics.
func (s *simpleLevelIter) SeekPrefixGE(
	prefix, key []byte, flags base.SeekGEFlags,
) (*base.InternalKey, base.LazyValue) {
	panic("unimplemented")
}
   477  
// SeekLT is a reverse positioning operation, which simpleLevelIter does not
// support. It always panics.
func (s *simpleLevelIter) SeekLT(
	key []byte, flags base.SeekLTFlags,
) (*base.InternalKey, base.LazyValue) {
	panic("unimplemented")
}
   483  
   484  func (s *simpleLevelIter) First() (*base.InternalKey, base.LazyValue) {
   485  	if s.err != nil {
   486  		return nil, base.LazyValue{}
   487  	}
   488  	s.currentIdx = 0
   489  	return s.skipEmptyFileForward(nil /* seekKey */, base.SeekGEFlagsNone)
   490  }
   491  
// Last is a reverse positioning operation, which simpleLevelIter does not
// support. It always panics.
func (s *simpleLevelIter) Last() (*base.InternalKey, base.LazyValue) {
	panic("unimplemented")
}
   495  
   496  func (s *simpleLevelIter) Next() (*base.InternalKey, base.LazyValue) {
   497  	if s.err != nil {
   498  		return nil, base.LazyValue{}
   499  	}
   500  	if s.currentIdx < 0 || s.currentIdx >= len(s.filtered) {
   501  		return nil, base.LazyValue{}
   502  	}
   503  	if iterKey, val := s.filtered[s.currentIdx].Next(); iterKey != nil {
   504  		return iterKey, val
   505  	}
   506  	s.currentIdx++
   507  	return s.skipEmptyFileForward(nil /* seekKey */, base.SeekGEFlagsNone)
   508  }
   509  
   510  func (s *simpleLevelIter) NextPrefix(succKey []byte) (*base.InternalKey, base.LazyValue) {
   511  	if s.err != nil {
   512  		return nil, base.LazyValue{}
   513  	}
   514  	if s.currentIdx < 0 || s.currentIdx >= len(s.filtered) {
   515  		return nil, base.LazyValue{}
   516  	}
   517  	if iterKey, val := s.filtered[s.currentIdx].NextPrefix(succKey); iterKey != nil {
   518  		return iterKey, val
   519  	}
   520  	s.currentIdx++
   521  	return s.skipEmptyFileForward(succKey /* seekKey */, base.SeekGEFlagsNone)
   522  }
   523  
// Prev is a reverse positioning operation, which simpleLevelIter does not
// support. It always panics.
func (s *simpleLevelIter) Prev() (*base.InternalKey, base.LazyValue) {
	panic("unimplemented")
}
   527  
   528  func (s *simpleLevelIter) Error() error {
   529  	if s.currentIdx >= 0 && s.currentIdx < len(s.filtered) {
   530  		s.err = firstError(s.err, s.filtered[s.currentIdx].Error())
   531  	}
   532  	return s.err
   533  }
   534  
   535  func (s *simpleLevelIter) Close() error {
   536  	var err error
   537  	for i := range s.iters {
   538  		err = firstError(err, s.iters[i].Close())
   539  	}
   540  	return err
   541  }
   542  
   543  func (s *simpleLevelIter) SetBounds(lower, upper []byte) {
   544  	s.currentIdx = -1
   545  	s.lowerBound = lower
   546  	for i := range s.iters {
   547  		s.iters[i].SetBounds(lower, upper)
   548  	}
   549  	s.resetFilteredIters()
   550  }
   551  
// SetContext is a no-op: the supplied context is ignored and is not forwarded
// to the child iterators.
func (s *simpleLevelIter) SetContext(_ context.Context) {}
   553  
   554  func (s *simpleLevelIter) String() string {
   555  	if s.currentIdx < 0 || s.currentIdx >= len(s.filtered) {
   556  		return "simpleLevelIter: current=<nil>"
   557  	}
   558  	return fmt.Sprintf("simpleLevelIter: current=%s", s.filtered[s.currentIdx])
   559  }
   560  
   561  var _ internalIterator = &simpleLevelIter{}