github.com/zuoyebang/bitalostable@v1.0.1-0.20240229032404-e3b99a834294/external_iterator.go (about)

     1  // Copyright 2022 The LevelDB-Go and Pebble and Bitalostored Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package bitalostable
     6  
     7  import (
     8  	"fmt"
     9  	"sort"
    10  
    11  	"github.com/cockroachdb/errors"
    12  	"github.com/zuoyebang/bitalostable/internal/base"
    13  	"github.com/zuoyebang/bitalostable/internal/keyspan"
    14  	"github.com/zuoyebang/bitalostable/internal/manifest"
    15  	"github.com/zuoyebang/bitalostable/sstable"
    16  )
    17  
    18  // ExternalIterOption provide an interface to specify open-time options to
    19  // NewExternalIter.
    20  type ExternalIterOption interface {
    21  	// iterApply is called on the iterator during opening in order to set internal
    22  	// parameters.
    23  	iterApply(*Iterator)
    24  	// readerOptions returns any reader options added by this iter option.
    25  	readerOptions() []sstable.ReaderOption
    26  }
    27  
    28  type externalIterReaderOptions struct {
    29  	opts []sstable.ReaderOption
    30  }
    31  
    32  func (e *externalIterReaderOptions) iterApply(iterator *Iterator) {
    33  	// Do nothing.
    34  }
    35  
    36  func (e *externalIterReaderOptions) readerOptions() []sstable.ReaderOption {
    37  	return e.opts
    38  }
    39  
    40  // ExternalIterReaderOptions returns an ExternalIterOption that specifies
    41  // sstable.ReaderOptions to be applied on sstable readers in NewExternalIter.
    42  func ExternalIterReaderOptions(opts ...sstable.ReaderOption) ExternalIterOption {
    43  	return &externalIterReaderOptions{opts: opts}
    44  }
    45  
    46  // ExternalIterForwardOnly is an ExternalIterOption that specifies this iterator
    47  // will only be used for forward positioning operations (First, SeekGE, Next).
    48  // This could enable optimizations that take advantage of this invariant.
    49  // Behaviour when a reverse positioning operation is done on an iterator
    50  // opened with this option is unpredictable, though in most cases it should.
    51  type ExternalIterForwardOnly struct{}
    52  
    53  func (e ExternalIterForwardOnly) iterApply(iter *Iterator) {
    54  	iter.forwardOnly = true
    55  }
    56  
    57  func (e ExternalIterForwardOnly) readerOptions() []sstable.ReaderOption {
    58  	return nil
    59  }
    60  
    61  // NewExternalIter takes an input 2d array of sstable files which may overlap
    62  // across subarrays but not within a subarray (at least as far as points are
    63  // concerned; range keys are allowed to overlap arbitrarily even within a
    64  // subarray), and returns an Iterator over the merged contents of the sstables.
    65  // Input sstables may contain point keys, range keys, range deletions, etc. The
    66  // input files slice must be sorted in reverse chronological ordering. A key in a
    67  // file at a lower index subarray will shadow a key with an identical user key
    68  // contained within a file at a higher index subarray. Each subarray must be
    69  // sorted in internal key order, where lower index files contain keys that sort
    70  // left of files with higher indexes.
    71  //
    72  // Input sstables must only contain keys with the zero sequence number.
    73  //
    74  // Iterators constructed through NewExternalIter do not support all iterator
    75  // options, including block-property and table filters. NewExternalIter errors
    76  // if an incompatible option is set.
    77  func NewExternalIter(
    78  	o *Options,
    79  	iterOpts *IterOptions,
    80  	files [][]sstable.ReadableFile,
    81  	extraOpts ...ExternalIterOption,
    82  ) (it *Iterator, err error) {
    83  	if iterOpts != nil {
    84  		if err := validateExternalIterOpts(iterOpts); err != nil {
    85  			return nil, err
    86  		}
    87  	}
    88  
    89  	var readers [][]*sstable.Reader
    90  
    91  	// Ensure we close all the opened readers if we error out.
    92  	defer func() {
    93  		if err != nil {
    94  			for i := range readers {
    95  				for j := range readers[i] {
    96  					_ = readers[i][j].Close()
    97  				}
    98  			}
    99  		}
   100  	}()
   101  	seqNumOffset := 0
   102  	var extraReaderOpts []sstable.ReaderOption
   103  	for i := range extraOpts {
   104  		extraReaderOpts = append(extraReaderOpts, extraOpts[i].readerOptions()...)
   105  	}
   106  	for _, levelFiles := range files {
   107  		seqNumOffset += len(levelFiles)
   108  	}
   109  	for _, levelFiles := range files {
   110  		var subReaders []*sstable.Reader
   111  		seqNumOffset -= len(levelFiles)
   112  		subReaders, err = openExternalTables(o, levelFiles, seqNumOffset, o.MakeReaderOptions(), extraReaderOpts...)
   113  		readers = append(readers, subReaders)
   114  	}
   115  	if err != nil {
   116  		return nil, err
   117  	}
   118  
   119  	buf := iterAllocPool.Get().(*iterAlloc)
   120  	dbi := &buf.dbi
   121  	*dbi = Iterator{
   122  		alloc:               buf,
   123  		merge:               o.Merger.Merge,
   124  		comparer:            *o.Comparer,
   125  		readState:           nil,
   126  		keyBuf:              buf.keyBuf,
   127  		prefixOrFullSeekKey: buf.prefixOrFullSeekKey,
   128  		boundsBuf:           buf.boundsBuf,
   129  		batch:               nil,
   130  		// Add the readers to the Iterator so that Close closes them, and
   131  		// SetOptions can re-construct iterators from them.
   132  		externalReaders: readers,
   133  		newIters: func(f *manifest.FileMetadata, opts *IterOptions, internalOpts internalIterOpts) (internalIterator, keyspan.FragmentIterator, error) {
   134  			// NB: External iterators are currently constructed without any
   135  			// `levelIters`. newIters should never be called. When we support
   136  			// organizing multiple non-overlapping files into a single level
   137  			// (see TODO below), we'll need to adjust this tableNewIters
   138  			// implementation to open iterators by looking up f in a map
   139  			// of readers indexed by *fileMetadata.
   140  			panic("unreachable")
   141  		},
   142  		seqNum: base.InternalKeySeqNumMax,
   143  	}
   144  	if iterOpts != nil {
   145  		dbi.opts = *iterOpts
   146  		dbi.saveBounds(iterOpts.LowerBound, iterOpts.UpperBound)
   147  	}
   148  	for i := range extraOpts {
   149  		extraOpts[i].iterApply(dbi)
   150  	}
   151  	finishInitializingExternal(dbi)
   152  	return dbi, nil
   153  }
   154  
   155  func validateExternalIterOpts(iterOpts *IterOptions) error {
   156  	switch {
   157  	case iterOpts.TableFilter != nil:
   158  		return errors.Errorf("bitalostable: external iterator: TableFilter unsupported")
   159  	case iterOpts.PointKeyFilters != nil:
   160  		return errors.Errorf("bitalostable: external iterator: PointKeyFilters unsupported")
   161  	case iterOpts.RangeKeyFilters != nil:
   162  		return errors.Errorf("bitalostable: external iterator: RangeKeyFilters unsupported")
   163  	case iterOpts.OnlyReadGuaranteedDurable:
   164  		return errors.Errorf("bitalostable: external iterator: OnlyReadGuaranteedDurable unsupported")
   165  	case iterOpts.UseL6Filters:
   166  		return errors.Errorf("bitalostable: external iterator: UseL6Filters unsupported")
   167  	}
   168  	return nil
   169  }
   170  
   171  func createExternalPointIter(it *Iterator) (internalIterator, error) {
   172  	// TODO(jackson): In some instances we could generate fewer levels by using
   173  	// L0Sublevels code to organize nonoverlapping files into the same level.
   174  	// This would allow us to use levelIters and keep a smaller set of data and
   175  	// files in-memory. However, it would also require us to identify the bounds
   176  	// of all the files upfront.
   177  
   178  	if !it.opts.pointKeys() {
   179  		return emptyIter, nil
   180  	} else if it.pointIter != nil {
   181  		return it.pointIter, nil
   182  	}
   183  	mlevels := it.alloc.mlevels[:0]
   184  
   185  	if len(it.externalReaders) > cap(mlevels) {
   186  		mlevels = make([]mergingIterLevel, 0, len(it.externalReaders))
   187  	}
   188  	for _, readers := range it.externalReaders {
   189  		var combinedIters []internalIterator
   190  		for _, r := range readers {
   191  			var (
   192  				rangeDelIter keyspan.FragmentIterator
   193  				pointIter    internalIterator
   194  				err          error
   195  			)
   196  			pointIter, err = r.NewIter(it.opts.LowerBound, it.opts.UpperBound)
   197  			if err != nil {
   198  				return nil, err
   199  			}
   200  			rangeDelIter, err = r.NewRawRangeDelIter()
   201  			if err != nil {
   202  				return nil, err
   203  			}
   204  			if rangeDelIter == nil && pointIter != nil && it.forwardOnly {
   205  				// TODO(bilal): Consider implementing range key pausing in
   206  				// simpleLevelIter so we can reduce mergingIterLevels even more by
   207  				// sending all sstable iterators to combinedIters, not just those
   208  				// corresponding to sstables without range deletes.
   209  				combinedIters = append(combinedIters, pointIter)
   210  				continue
   211  			}
   212  			mlevels = append(mlevels, mergingIterLevel{
   213  				iter:         pointIter,
   214  				rangeDelIter: rangeDelIter,
   215  			})
   216  		}
   217  		if len(combinedIters) == 1 {
   218  			mlevels = append(mlevels, mergingIterLevel{
   219  				iter: combinedIters[0],
   220  			})
   221  		} else if len(combinedIters) > 1 {
   222  			sli := &simpleLevelIter{
   223  				cmp:   it.cmp,
   224  				iters: combinedIters,
   225  			}
   226  			sli.init(it.opts)
   227  			mlevels = append(mlevels, mergingIterLevel{
   228  				iter:         sli,
   229  				rangeDelIter: nil,
   230  			})
   231  		}
   232  	}
   233  	if len(mlevels) == 1 && mlevels[0].rangeDelIter == nil {
   234  		// Set closePointIterOnce to true. This is because we're bypassing the
   235  		// merging iter, which turns Close()s on it idempotent for any child
   236  		// iterators. The outer Iterator could call Close() on a point iter twice,
   237  		// which sstable iterators do not support (as they release themselves to
   238  		// a pool).
   239  		it.closePointIterOnce = true
   240  		return mlevels[0].iter, nil
   241  	}
   242  
   243  	it.alloc.merging.init(&it.opts, &it.stats.InternalStats, it.comparer.Compare, it.comparer.Split, mlevels...)
   244  	it.alloc.merging.snapshot = base.InternalKeySeqNumMax
   245  	it.alloc.merging.elideRangeTombstones = true
   246  	return &it.alloc.merging, nil
   247  }
   248  
   249  func finishInitializingExternal(it *Iterator) {
   250  	pointIter, err := createExternalPointIter(it)
   251  	if err != nil {
   252  		it.pointIter = &errorIter{err: err}
   253  	} else {
   254  		it.pointIter = pointIter
   255  	}
   256  	it.iter = it.pointIter
   257  
   258  	if it.opts.rangeKeys() {
   259  		it.rangeKeyMasking.init(it, it.comparer.Compare, it.comparer.Split)
   260  		var rangeKeyIters []keyspan.FragmentIterator
   261  		if it.rangeKey == nil {
   262  			// We could take advantage of the lack of overlaps in range keys within
   263  			// each slice in it.externalReaders, and generate keyspan.LevelIters
   264  			// out of those. However, since range keys are expected to be sparse to
   265  			// begin with, the performance gain might not be significant enough to
   266  			// warrant it.
   267  			//
   268  			// TODO(bilal): Explore adding a simpleRangeKeyLevelIter that does not
   269  			// operate on FileMetadatas (similar to simpleLevelIter), and implements
   270  			// this optimization.
   271  			for _, readers := range it.externalReaders {
   272  				for _, r := range readers {
   273  					if rki, err := r.NewRawRangeKeyIter(); err != nil {
   274  						rangeKeyIters = append(rangeKeyIters, &errorKeyspanIter{err: err})
   275  					} else if rki != nil {
   276  						rangeKeyIters = append(rangeKeyIters, rki)
   277  					}
   278  				}
   279  			}
   280  			if len(rangeKeyIters) > 0 {
   281  				it.rangeKey = iterRangeKeyStateAllocPool.Get().(*iteratorRangeKeyState)
   282  				it.rangeKey.init(it.comparer.Compare, it.comparer.Split, &it.opts)
   283  				it.rangeKey.rangeKeyIter = it.rangeKey.iterConfig.Init(
   284  					&it.comparer,
   285  					base.InternalKeySeqNumMax,
   286  					it.opts.LowerBound, it.opts.UpperBound,
   287  					&it.hasPrefix, &it.prefixOrFullSeekKey,
   288  				)
   289  				for i := range rangeKeyIters {
   290  					it.rangeKey.iterConfig.AddLevel(rangeKeyIters[i])
   291  				}
   292  			}
   293  		}
   294  		if it.rangeKey != nil {
   295  			it.rangeKey.iiter.Init(&it.comparer, it.iter, it.rangeKey.rangeKeyIter, &it.rangeKeyMasking,
   296  				it.opts.LowerBound, it.opts.UpperBound)
   297  			it.iter = &it.rangeKey.iiter
   298  		}
   299  	}
   300  }
   301  
   302  func openExternalTables(
   303  	o *Options,
   304  	files []sstable.ReadableFile,
   305  	seqNumOffset int,
   306  	readerOpts sstable.ReaderOptions,
   307  	extraReaderOpts ...sstable.ReaderOption,
   308  ) (readers []*sstable.Reader, err error) {
   309  	readers = make([]*sstable.Reader, 0, len(files))
   310  	for i := range files {
   311  		r, err := sstable.NewReader(files[i], readerOpts, extraReaderOpts...)
   312  		if err != nil {
   313  			return readers, err
   314  		}
   315  		// Use the index of the file in files as the sequence number for all of
   316  		// its keys.
   317  		r.Properties.GlobalSeqNum = uint64(len(files) - i + seqNumOffset)
   318  		readers = append(readers, r)
   319  	}
   320  	return readers, err
   321  }
   322  
   323  // simpleLevelIter is similar to a levelIter in that it merges the points
   324  // from multiple point iterators that are non-overlapping in the key ranges
   325  // they return. It is only expected to support forward iteration and forward
   326  // regular seeking; reverse iteration and prefix seeking is not supported.
   327  // Intended to be a low-overhead, non-FileMetadata dependent option for
   328  // NewExternalIter. To optimize seeking and forward iteration, it maintains
   329  // two slices of child iterators; one of all iterators, and a subset of it that
   330  // contains just the iterators that contain point keys within the current
   331  // bounds.
   332  //
   333  // Note that this levelIter does not support pausing at file boundaries
   334  // in case of range tombstones in this file that could apply to points outside
   335  // of this file (and outside of this level). This is sufficient for optimizing
   336  // the main use cases of NewExternalIter, however for completeness it would make
   337  // sense to build this pausing functionality in.
   338  type simpleLevelIter struct {
   339  	cmp          Compare
   340  	err          error
   341  	lowerBound   []byte
   342  	iters        []internalIterator
   343  	filtered     []internalIterator
   344  	firstKeys    [][]byte
   345  	firstKeysBuf []byte
   346  	currentIdx   int
   347  }
   348  
   349  // init initializes this simpleLevelIter.
   350  func (s *simpleLevelIter) init(opts IterOptions) {
   351  	s.currentIdx = 0
   352  	s.lowerBound = opts.LowerBound
   353  	s.resetFilteredIters()
   354  }
   355  
   356  func (s *simpleLevelIter) resetFilteredIters() {
   357  	s.filtered = s.filtered[:0]
   358  	s.firstKeys = s.firstKeys[:0]
   359  	s.firstKeysBuf = s.firstKeysBuf[:0]
   360  	s.err = nil
   361  	for i := range s.iters {
   362  		var iterKey *base.InternalKey
   363  		if s.lowerBound != nil {
   364  			iterKey, _ = s.iters[i].SeekGE(s.lowerBound, base.SeekGEFlagsNone)
   365  		} else {
   366  			iterKey, _ = s.iters[i].First()
   367  		}
   368  		if iterKey != nil {
   369  			s.filtered = append(s.filtered, s.iters[i])
   370  			bufStart := len(s.firstKeysBuf)
   371  			s.firstKeysBuf = append(s.firstKeysBuf, iterKey.UserKey...)
   372  			s.firstKeys = append(s.firstKeys, s.firstKeysBuf[bufStart:bufStart+len(iterKey.UserKey)])
   373  		} else if err := s.iters[i].Error(); err != nil {
   374  			s.err = err
   375  		}
   376  	}
   377  }
   378  
   379  func (s *simpleLevelIter) SeekGE(key []byte, flags base.SeekGEFlags) (*base.InternalKey, []byte) {
   380  	if s.err != nil {
   381  		return nil, nil
   382  	}
   383  	// Find the first file that is entirely >= key. The file before that could
   384  	// contain the key we're looking for.
   385  	n := sort.Search(len(s.firstKeys), func(i int) bool {
   386  		return s.cmp(key, s.firstKeys[i]) <= 0
   387  	})
   388  	if n > 0 {
   389  		s.currentIdx = n - 1
   390  	} else {
   391  		s.currentIdx = n
   392  	}
   393  	if s.currentIdx < len(s.filtered) {
   394  		if iterKey, val := s.filtered[s.currentIdx].SeekGE(key, flags); iterKey != nil {
   395  			return iterKey, val
   396  		}
   397  		if err := s.filtered[s.currentIdx].Error(); err != nil {
   398  			s.err = err
   399  		}
   400  		s.currentIdx++
   401  	}
   402  	return s.skipEmptyFileForward(key, flags)
   403  }
   404  
   405  func (s *simpleLevelIter) skipEmptyFileForward(
   406  	seekKey []byte, flags base.SeekGEFlags,
   407  ) (*base.InternalKey, []byte) {
   408  	var iterKey *base.InternalKey
   409  	var val []byte
   410  	for s.currentIdx >= 0 && s.currentIdx < len(s.filtered) && s.err == nil {
   411  		if seekKey != nil {
   412  			iterKey, val = s.filtered[s.currentIdx].SeekGE(seekKey, flags)
   413  		} else if s.lowerBound != nil {
   414  			iterKey, val = s.filtered[s.currentIdx].SeekGE(s.lowerBound, flags)
   415  		} else {
   416  			iterKey, val = s.filtered[s.currentIdx].First()
   417  		}
   418  		if iterKey != nil {
   419  			return iterKey, val
   420  		}
   421  		if err := s.filtered[s.currentIdx].Error(); err != nil {
   422  			s.err = err
   423  		}
   424  		s.currentIdx++
   425  	}
   426  	return nil, nil
   427  }
   428  
   429  func (s *simpleLevelIter) SeekPrefixGE(
   430  	prefix, key []byte, flags base.SeekGEFlags,
   431  ) (*base.InternalKey, []byte) {
   432  	panic("unimplemented")
   433  }
   434  
   435  func (s *simpleLevelIter) SeekLT(key []byte, flags base.SeekLTFlags) (*base.InternalKey, []byte) {
   436  	panic("unimplemented")
   437  }
   438  
   439  func (s *simpleLevelIter) First() (*base.InternalKey, []byte) {
   440  	if s.err != nil {
   441  		return nil, nil
   442  	}
   443  	s.currentIdx = 0
   444  	return s.skipEmptyFileForward(nil /* seekKey */, base.SeekGEFlagsNone)
   445  }
   446  
   447  func (s *simpleLevelIter) Last() (*base.InternalKey, []byte) {
   448  	panic("unimplemented")
   449  }
   450  
   451  func (s *simpleLevelIter) Next() (*base.InternalKey, []byte) {
   452  	if s.err != nil {
   453  		return nil, nil
   454  	}
   455  	if s.currentIdx < 0 || s.currentIdx >= len(s.filtered) {
   456  		return nil, nil
   457  	}
   458  	if iterKey, val := s.filtered[s.currentIdx].Next(); iterKey != nil {
   459  		return iterKey, val
   460  	}
   461  	s.currentIdx++
   462  	return s.skipEmptyFileForward(nil /* seekKey */, base.SeekGEFlagsNone)
   463  }
   464  
   465  func (s *simpleLevelIter) Prev() (*base.InternalKey, []byte) {
   466  	panic("unimplemented")
   467  }
   468  
   469  func (s *simpleLevelIter) Error() error {
   470  	if s.currentIdx >= 0 && s.currentIdx < len(s.filtered) {
   471  		s.err = firstError(s.err, s.filtered[s.currentIdx].Error())
   472  	}
   473  	return s.err
   474  }
   475  
   476  func (s *simpleLevelIter) Close() error {
   477  	var err error
   478  	for i := range s.iters {
   479  		err = firstError(err, s.iters[i].Close())
   480  	}
   481  	return err
   482  }
   483  
   484  func (s *simpleLevelIter) SetBounds(lower, upper []byte) {
   485  	s.currentIdx = -1
   486  	s.lowerBound = lower
   487  	for i := range s.iters {
   488  		s.iters[i].SetBounds(lower, upper)
   489  	}
   490  	s.resetFilteredIters()
   491  }
   492  
   493  func (s *simpleLevelIter) String() string {
   494  	if s.currentIdx < 0 || s.currentIdx >= len(s.filtered) {
   495  		return "simpleLevelIter: current=<nil>"
   496  	}
   497  	return fmt.Sprintf("simpleLevelIter: current=%s", s.filtered[s.currentIdx])
   498  }
   499  
   500  var _ internalIterator = &simpleLevelIter{}