github.com/cockroachdb/pebble@v1.1.1-0.20240513155919-3622ade60459/sstable/reader_iter.go (about)

     1  // Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package sstable
     6  
     7  import (
     8  	"fmt"
     9  	"os"
    10  	"sync"
    11  
    12  	"github.com/cockroachdb/pebble/internal/base"
    13  	"github.com/cockroachdb/pebble/internal/invariants"
    14  )
    15  
    16  // Iterator iterates over an entire table of data.
    17  type Iterator interface {
    18  	base.InternalIterator
    19  
    20  	// NextPrefix implements (base.InternalIterator).NextPrefix.
    21  	NextPrefix(succKey []byte) (*InternalKey, base.LazyValue)
    22  
    23  	// MaybeFilteredKeys may be called when an iterator is exhausted to indicate
    24  	// whether or not the last positioning method may have skipped any keys due
    25  	// to block-property filters. This is used by the Pebble levelIter to
    26  	// control when an iterator steps to the next sstable.
    27  	//
    28  	// MaybeFilteredKeys may always return false positives, that is it may
    29  	// return true when no keys were filtered. It should only be called when the
    30  	// iterator is exhausted. It must never return false negatives when the
    31  	// iterator is exhausted.
    32  	MaybeFilteredKeys() bool
    33  
    34  	SetCloseHook(fn func(i Iterator) error)
    35  }
    36  
    37  // Iterator positioning optimizations and singleLevelIterator and
    38  // twoLevelIterator:
    39  //
    40  // An iterator is absolute positioned using one of the Seek or First or Last
    41  // calls. After absolute positioning, there can be relative positioning done
    42  // by stepping using Prev or Next.
    43  //
    44  // We implement optimizations below where an absolute positioning call can in
    45  // some cases use the current position to do less work. To understand these,
    46  // we first define some terms. An iterator is bounds-exhausted if the bounds
    47  // (upper or lower) have been reached. An iterator is data-exhausted if it has
    48  // reached the end of the data (forward or reverse) in the sstable. A
    49  // singleLevelIterator only knows a local-data-exhausted property since when
    50  // it is used as part of a twoLevelIterator, the twoLevelIterator can step to
    51  // the next lower-level index block.
    52  //
    53  // The bounds-exhausted property is tracked by
    54  // singleLevelIterator.exhaustedBounds being +1 (upper bound reached) or -1
    55  // (lower bound reached). The same field is reused by twoLevelIterator. Either
    56  // may notice the exhaustion of the bound and set it. Note that if
    57  // singleLevelIterator sets this property, it is not a local property (since
    58  // the bound has been reached regardless of whether this is in the context of
    59  // the twoLevelIterator or not).
    60  //
    61  // The data-exhausted property is tracked in a more subtle manner. We define
    62  // two predicates:
    63  // - partial-local-data-exhausted (PLDE):
    64  //   i.data.isDataInvalidated() || !i.data.valid()
    65  // - partial-global-data-exhausted (PGDE):
    66  //   i.index.isDataInvalidated() || !i.index.valid() || i.data.isDataInvalidated() ||
    67  //   !i.data.valid()
    68  //
    69  // PLDE is defined for a singleLevelIterator. PGDE is defined for a
    70  // twoLevelIterator. Oddly, in our code below the singleLevelIterator does not
    71  // know when it is part of a twoLevelIterator so it does not know when its
    72  // property is local or global.
    73  //
    74  // Now to define data-exhausted:
    75  // - Prerequisite: we must know that the iterator has been positioned and
    76  //   i.err is nil.
    77  // - bounds-exhausted must not be true:
    78  //   If bounds-exhausted is true, we have incomplete knowledge of
    79  //   data-exhausted since PLDE or PGDE could be true because we could have
    80  //   chosen not to load index block or data block and figured out that the
    81  //   bound is exhausted (due to block property filters filtering out index and
    82  //   data blocks and going past the bound on the top level index block). Note
    83  //   that if we tried to separate out the BPF case from others we could
    84  //   develop more knowledge here.
    85  // - PGDE is true for twoLevelIterator. PLDE is true if it is a standalone
    86  //   singleLevelIterator. !PLDE or !PGDE of course imply that data-exhausted
    87  //   is not true.
    88  //
    89  // An implication of the above is that if we are going to somehow utilize
    90  // knowledge of data-exhausted in an optimization, we must not forget the
    91  // existing value of bounds-exhausted since by forgetting the latter we can
    92  // erroneously think that data-exhausted is true. Bug #2036 was due to this
    93  // forgetting.
    94  //
    95  // Now to the two categories of optimizations we currently have:
    96  // - Monotonic bounds optimizations that reuse prior iterator position when
    97  //   doing seek: These only work with !data-exhausted. We could choose to make
    98  //   these work with data-exhausted but have not bothered because in the
    99  //   context of a DB if data-exhausted were true, the DB would move to the
   100  //   next file in the level. Note that this behavior of moving to the next
   101  //   file is not necessarily true for L0 files, so there could be some benefit
   102  //   in the future in this optimization. See the WARNING-data-exhausted
   103  //   comments if trying to optimize this in the future.
   104  // - TrySeekUsingNext optimizations: these work regardless of exhaustion
   105  //   state.
   106  //
   107  // Implementation detail: In the code PLDE only checks that
   108  // i.data.isDataInvalidated(). This narrower check is safe, since this is a
   109  // subset of the set expressed by the OR expression. Also, it is not a
   110  // de-optimization since whenever we exhaust the iterator we explicitly call
   111  // i.data.invalidate(). PGDE checks i.index.isDataInvalidated() &&
   112  // i.data.isDataInvalidated(). Again, this narrower check is safe, and not a
   113  // de-optimization since whenever we exhaust the iterator we explicitly call
   114  // i.index.invalidate() and i.data.invalidate(). The && is questionable -- for
   115  // now this is a bit of defensive code. We should seriously consider removing
   116  // it, since defensive code suggests we are not confident about our invariants
   117  // (and if we are not confident, we need more invariant assertions, not
   118  // defensive code).
   119  //
   120  // TODO(sumeer): remove the aforementioned defensive code.
   121  
   122  var singleLevelIterPool = sync.Pool{
   123  	New: func() interface{} {
   124  		i := &singleLevelIterator{}
   125  		// Note: this is a no-op if invariants are disabled or race is enabled.
   126  		invariants.SetFinalizer(i, checkSingleLevelIterator)
   127  		return i
   128  	},
   129  }
   130  
   131  var twoLevelIterPool = sync.Pool{
   132  	New: func() interface{} {
   133  		i := &twoLevelIterator{}
   134  		// Note: this is a no-op if invariants are disabled or race is enabled.
   135  		invariants.SetFinalizer(i, checkTwoLevelIterator)
   136  		return i
   137  	},
   138  }
   139  
   140  // TODO(jackson): rangedel fragmentBlockIters can't be pooled because of some
   141  // code paths that double Close the iters. Fix the double close and pool the
   142  // *fragmentBlockIter type directly.
   143  
   144  var rangeKeyFragmentBlockIterPool = sync.Pool{
   145  	New: func() interface{} {
   146  		i := &rangeKeyFragmentBlockIter{}
   147  		// Note: this is a no-op if invariants are disabled or race is enabled.
   148  		invariants.SetFinalizer(i, checkRangeKeyFragmentBlockIterator)
   149  		return i
   150  	},
   151  }
   152  
   153  func checkSingleLevelIterator(obj interface{}) {
   154  	i := obj.(*singleLevelIterator)
   155  	if p := i.data.handle.Get(); p != nil {
   156  		fmt.Fprintf(os.Stderr, "singleLevelIterator.data.handle is not nil: %p\n", p)
   157  		os.Exit(1)
   158  	}
   159  	if p := i.index.handle.Get(); p != nil {
   160  		fmt.Fprintf(os.Stderr, "singleLevelIterator.index.handle is not nil: %p\n", p)
   161  		os.Exit(1)
   162  	}
   163  }
   164  
   165  func checkTwoLevelIterator(obj interface{}) {
   166  	i := obj.(*twoLevelIterator)
   167  	if p := i.data.handle.Get(); p != nil {
   168  		fmt.Fprintf(os.Stderr, "singleLevelIterator.data.handle is not nil: %p\n", p)
   169  		os.Exit(1)
   170  	}
   171  	if p := i.index.handle.Get(); p != nil {
   172  		fmt.Fprintf(os.Stderr, "singleLevelIterator.index.handle is not nil: %p\n", p)
   173  		os.Exit(1)
   174  	}
   175  }
   176  
   177  func checkRangeKeyFragmentBlockIterator(obj interface{}) {
   178  	i := obj.(*rangeKeyFragmentBlockIter)
   179  	if p := i.blockIter.handle.Get(); p != nil {
   180  		fmt.Fprintf(os.Stderr, "fragmentBlockIter.blockIter.handle is not nil: %p\n", p)
   181  		os.Exit(1)
   182  	}
   183  }
   184  
   185  // compactionIterator wraps a singleLevelIterator and, on every First/Next step,
   186  // adds the change in recordOffset() to *bytesIterated; other positioning calls panic.
   187  type compactionIterator struct {
   188  	*singleLevelIterator
   189  	bytesIterated *uint64 // counter advanced by skipForward
   190  	prevOffset    uint64  // recordOffset() seen by the previous skipForward call
   191  }
   192  
   193  // Compile-time assertion that *compactionIterator implements base.InternalIterator.
   194  var _ base.InternalIterator = (*compactionIterator)(nil)
   195  
   196  func (i *compactionIterator) String() string {
   197  	if i.vState != nil {
   198  		return i.vState.fileNum.String()
   199  	}
   200  	return i.reader.fileNum.String()
   201  }
   202  
   203  func (i *compactionIterator) SeekGE(
   204  	key []byte, flags base.SeekGEFlags,
   205  ) (*InternalKey, base.LazyValue) {
   206  	panic("pebble: SeekGE unimplemented")
   207  }
   208  
   209  func (i *compactionIterator) SeekPrefixGE(
   210  	prefix, key []byte, flags base.SeekGEFlags,
   211  ) (*base.InternalKey, base.LazyValue) {
   212  	panic("pebble: SeekPrefixGE unimplemented")
   213  }
   214  
   215  func (i *compactionIterator) SeekLT(
   216  	key []byte, flags base.SeekLTFlags,
   217  ) (*InternalKey, base.LazyValue) {
   218  	panic("pebble: SeekLT unimplemented")
   219  }
   220  
   221  func (i *compactionIterator) First() (*InternalKey, base.LazyValue) {
   222  	i.err = nil // clear cached iteration error
   223  	return i.skipForward(i.singleLevelIterator.First())
   224  }
   225  
   226  func (i *compactionIterator) Last() (*InternalKey, base.LazyValue) {
   227  	panic("pebble: Last unimplemented")
   228  }
   229  
   230  // Note: compactionIterator.Next mirrors the implementation of Iterator.Next
   231  // due to performance. Keep the two in sync.
   232  func (i *compactionIterator) Next() (*InternalKey, base.LazyValue) {
   233  	if i.err != nil {
   234  		return nil, base.LazyValue{}
   235  	}
   236  	return i.skipForward(i.data.Next())
   237  }
   238  
   239  func (i *compactionIterator) NextPrefix(succKey []byte) (*InternalKey, base.LazyValue) {
   240  	panic("pebble: NextPrefix unimplemented")
   241  }
   242  
   243  func (i *compactionIterator) Prev() (*InternalKey, base.LazyValue) {
   244  	panic("pebble: Prev unimplemented")
   245  }
   246  
   247  func (i *compactionIterator) skipForward(
   248  	key *InternalKey, val base.LazyValue,
   249  ) (*InternalKey, base.LazyValue) {
   250  	if key == nil {
   251  		for {
   252  			if key, _ := i.index.Next(); key == nil {
   253  				break
   254  			}
   255  			result := i.loadBlock(+1)
   256  			if result != loadBlockOK {
   257  				if i.err != nil {
   258  					break
   259  				}
   260  				switch result {
   261  				case loadBlockFailed:
   262  					// We checked that i.index was at a valid entry, so
   263  					// loadBlockFailed could not have happened due to to i.index
   264  					// being exhausted, and must be due to an error.
   265  					panic("loadBlock should not have failed with no error")
   266  				case loadBlockIrrelevant:
   267  					panic("compactionIter should not be using block intervals for skipping")
   268  				default:
   269  					panic(fmt.Sprintf("unexpected case %d", result))
   270  				}
   271  			}
   272  			// result == loadBlockOK
   273  			if key, val = i.data.First(); key != nil {
   274  				break
   275  			}
   276  		}
   277  	}
   278  
   279  	curOffset := i.recordOffset()
   280  	*i.bytesIterated += uint64(curOffset - i.prevOffset)
   281  	i.prevOffset = curOffset
   282  
   283  	if i.vState != nil && key != nil {
   284  		cmp := i.cmp(key.UserKey, i.vState.upper.UserKey)
   285  		if cmp > 0 || (i.vState.upper.IsExclusiveSentinel() && cmp == 0) {
   286  			return nil, base.LazyValue{}
   287  		}
   288  	}
   289  
   290  	return key, val
   291  }