github.com/cockroachdb/pebble@v1.1.2/internal/base/iterator.go (about)

     1  // Copyright 2019 The LevelDB-Go and Pebble Authors. All rights reserved. Use
     2  // of this source code is governed by a BSD-style license that can be found in
     3  // the LICENSE file.
     4  
     5  package base
     6  
     7  import (
     8  	"fmt"
     9  	"time"
    10  
    11  	"github.com/cockroachdb/pebble/internal/humanize"
    12  	"github.com/cockroachdb/redact"
    13  )
    14  
    15  // InternalIterator iterates over a DB's key/value pairs in key order. Unlike
    16  // the Iterator interface, the returned keys are InternalKeys composed of the
    17  // user-key, a sequence number and a key kind. In forward iteration, key/value
    18  // pairs for identical user-keys are returned in descending sequence order. In
    19  // reverse iteration, key/value pairs for identical user-keys are returned in
    20  // ascending sequence order.
    21  //
    22  // InternalIterators provide 5 absolute positioning methods and 2 relative
    23  // positioning methods. The absolute positioning methods are:
    24  //
    25  // - SeekGE
    26  // - SeekPrefixGE
    27  // - SeekLT
    28  // - First
    29  // - Last
    30  //
    31  // The relative positioning methods are:
    32  //
    33  // - Next
    34  // - Prev
    35  //
    36  // The relative positioning methods can be used in conjunction with any of the
    37  // absolute positioning methods with one exception: SeekPrefixGE does not
    38  // support reverse iteration via Prev. It is undefined to call relative
    39  // positioning methods without ever calling an absolute positioning method.
    40  //
    41  // InternalIterators can optionally implement a prefix iteration mode. This
    42  // mode is entered by calling SeekPrefixGE and exited by any other absolute
    43  // positioning method (SeekGE, SeekLT, First, Last). When in prefix iteration
    44  // mode, a call to Next will advance to the next key which has the same
    45  // "prefix" as the one supplied to SeekPrefixGE. Note that "prefix" in this
    46  // context is not a strict byte prefix, but defined by byte equality for the
    47  // result of the Comparer.Split method. An InternalIterator is not required to
    48  // support prefix iteration mode, and can implement SeekPrefixGE by forwarding
    49  // to SeekGE. When the iteration prefix is exhausted, it is not valid to call
    50  // Next on an internal iterator that's already returned (nil,nilv) or a key
    51  // beyond the prefix.
    52  //
    53  // Bounds, [lower, upper), can be set on iterators, either using the SetBounds()
    54  // function in the interface, or in implementation specific ways during iterator
    55  // creation. The forward positioning routines (SeekGE, First, and Next) only
    56  // check the upper bound. The reverse positioning routines (SeekLT, Last, and
    57  // Prev) only check the lower bound. It is up to the caller to ensure that the
    58  // forward positioning routines respect the lower bound and the reverse
    59  // positioning routines respect the upper bound (i.e. calling SeekGE instead of
    60  // First if there is a lower bound, and SeekLT instead of Last if there is an
    61  // upper bound). This imposition is done in order to elevate that enforcement to
    62  // the caller (generally pebble.Iterator or pebble.mergingIter) rather than
    63  // having it duplicated in every InternalIterator implementation.
    64  //
    65  // Additionally, the caller needs to ensure that SeekGE/SeekPrefixGE are not
    66  // called with a key > the upper bound, and SeekLT is not called with a key <
    67  // the lower bound. InternalIterator implementations are required to respect
    68  // the iterator bounds, never returning records outside of the bounds with one
    69  // exception: an iterator may generate synthetic RANGEDEL marker records. See
    70  // levelIter.syntheticBoundary for the sole existing example of this behavior.
    71  // Specifically, levelIter can return synthetic keys whose user key is equal to
    72  // the lower/upper bound.
    73  //
    74  // The bounds provided to an internal iterator must remain valid until a
    75  // subsequent call to SetBounds has returned. This requirement exists so that
    76  // iterator implementations may compare old and new bounds to apply low-level
    77  // optimizations. The pebble.Iterator satisfies this requirement by maintaining
    78  // two bound buffers and switching between them.
    79  //
    80  // An iterator must be closed after use, but it is not necessary to read an
    81  // iterator until exhaustion.
    82  //
    83  // An iterator is not goroutine-safe, but it is safe to use multiple iterators
    84  // concurrently, either in separate goroutines or switching between the
    85  // iterators in a single goroutine.
    86  //
    87  // It is also safe to use an iterator concurrently with modifying its
    88  // underlying DB, if that DB permits modification. However, the resultant
    89  // key/value pairs are not guaranteed to be a consistent snapshot of that DB
    90  // at a particular point in time.
    91  //
    92  // InternalIterators accumulate errors encountered during operation, exposing
    93  // them through the Error method. All of the absolute positioning methods
    94  // reset any accumulated error before positioning. Relative positioning
    95  // methods return without advancing if the iterator has accumulated an error.
    96  //
    97  // nilv == shorthand for LazyValue{}, which represents a nil value.
    98  type InternalIterator interface {
    99  	// SeekGE moves the iterator to the first key/value pair whose key is greater
   100  	// than or equal to the given key. Returns the key and value if the iterator
   101  	// is pointing at a valid entry, and (nil, nilv) otherwise. Note that SeekGE
   102  	// only checks the upper bound. It is up to the caller to ensure that key
   103  	// is greater than or equal to the lower bound.
   104  	SeekGE(key []byte, flags SeekGEFlags) (*InternalKey, LazyValue)
   105  
   106  	// SeekPrefixGE moves the iterator to the first key/value pair whose key is
   107  	// greater than or equal to the given key. Returns the key and value if the
   108  	// iterator is pointing at a valid entry, and (nil, nilv) otherwise. Note that
   109  	// SeekPrefixGE only checks the upper bound. It is up to the caller to ensure
   110  	// that key is greater than or equal to the lower bound.
   111  	//
   112  	// The prefix argument is used by some InternalIterator implementations (e.g.
   113  	// sstable.Reader) to avoid expensive operations. A user-defined Split
   114  	// function must be supplied to the Comparer for the DB. The supplied prefix
   115  	// will be the prefix of the given key returned by that Split function. If
   116  	// the iterator is able to determine that no key with the prefix exists, it
   117  	// can return (nil, nilv). Unlike SeekGE, this is not an indication that
   118  	// iteration is exhausted.
   119  	//
   120  	// Note that the iterator may return keys not matching the prefix. It is up
   121  	// to the caller to check if the prefix matches.
   122  	//
   123  	// Calling SeekPrefixGE places the receiver into prefix iteration mode. Once
   124  	// in this mode, reverse iteration may not be supported and will return an
   125  	// error. Note that pebble/Iterator.SeekPrefixGE has this same restriction on
   126  	// not supporting reverse iteration in prefix iteration mode until a
   127  	// different positioning routine (SeekGE, SeekLT, First or Last) switches the
   128  	// iterator out of prefix iteration.
   129  	SeekPrefixGE(prefix, key []byte, flags SeekGEFlags) (*InternalKey, LazyValue)
   130  
   131  	// SeekLT moves the iterator to the last key/value pair whose key is less
   132  	// than the given key. Returns the key and value if the iterator is pointing
   133  	// at a valid entry, and (nil, nilv) otherwise. Note that SeekLT only checks
   134  	// the lower bound. It is up to the caller to ensure that key is less than
   135  	// the upper bound.
   136  	SeekLT(key []byte, flags SeekLTFlags) (*InternalKey, LazyValue)
   137  
   138  	// First moves the iterator to the first key/value pair. Returns the key and
   139  	// value if the iterator is pointing at a valid entry, and (nil, nilv)
   140  	// otherwise. Note that First only checks the upper bound. It is up to the
   141  	// caller to ensure that First() is not called when there is a lower bound,
   142  	// and instead call SeekGE(lower).
   143  	First() (*InternalKey, LazyValue)
   144  
   145  	// Last moves the iterator to the last key/value pair. Returns the key and
   146  	// value if the iterator is pointing at a valid entry, and (nil, nilv)
   147  	// otherwise. Note that Last only checks the lower bound. It is up to the
   148  	// caller to ensure that Last() is not called when there is an upper bound,
   149  	// and instead call SeekLT(upper).
   150  	Last() (*InternalKey, LazyValue)
   151  
   152  	// Next moves the iterator to the next key/value pair. Returns the key and
   153  	// value if the iterator is pointing at a valid entry, and (nil, nilv)
   154  	// otherwise. Note that Next only checks the upper bound. It is up to the
   155  	// caller to ensure that key is greater than or equal to the lower bound.
   156  	//
   157  	// It is valid to call Next when the iterator is positioned before the first
   158  	// key/value pair due to either a prior call to SeekLT or Prev which returned
   159  	// (nil, nilv). It is not allowed to call Next when the previous call to SeekGE,
   160  	// SeekPrefixGE or Next returned (nil, nilv).
   161  	Next() (*InternalKey, LazyValue)
   162  
   163  	// NextPrefix moves the iterator to the next key/value pair with a different
   164  	// prefix than the key at the current iterator position. Returns the key and
   165  	// value if the iterator is pointing at a valid entry, and (nil, nilv)
   166  	// otherwise. Note that NextPrefix only checks the upper bound. It is up to
   167  	// the caller to ensure that key is greater than or equal to the lower
   168  	// bound.
   169  	//
   170  	// NextPrefix is passed the immediate successor to the current prefix key. A
   171  	// valid implementation of NextPrefix is to call SeekGE with succKey.
   172  	//
   173  	// It is not allowed to call NextPrefix when the previous call was a reverse
   174  	// positioning operation or a call to a forward positioning method that
   175  	// returned (nil, nilv). It is also not allowed to call NextPrefix when the
   176  	// iterator is in prefix iteration mode.
   177  	NextPrefix(succKey []byte) (*InternalKey, LazyValue)
   178  
   179  	// Prev moves the iterator to the previous key/value pair. Returns the key
   180  	// and value if the iterator is pointing at a valid entry, and (nil, nilv)
   181  	// otherwise. Note that Prev only checks the lower bound. It is up to the
   182  	// caller to ensure that key is less than the upper bound.
   183  	//
   184  	// It is valid to call Prev when the iterator is positioned after the last
   185  	// key/value pair due to either a prior call to SeekGE or Next which returned
   186  	// (nil, nilv). It is not allowed to call Prev when the previous call to SeekLT
   187  	// or Prev returned (nil, nilv).
   188  	Prev() (*InternalKey, LazyValue)
   189  
   190  	// Error returns any accumulated error. It may not include errors returned
   191  	// to the client when calling LazyValue.Value().
   192  	Error() error
   193  
   194  	// Close closes the iterator and returns any accumulated error. Exhausting
   195  	// all the key/value pairs in a table is not considered to be an error.
   196  	// It is valid to call Close multiple times. Other methods should not be
   197  	// called after the iterator has been closed.
   198  	Close() error
   199  
   200  	// SetBounds sets the lower and upper bounds for the iterator. Note that the
   201  	// result of Next and Prev will be undefined until the iterator has been
   202  	// repositioned with SeekGE, SeekPrefixGE, SeekLT, First, or Last.
   203  	//
   204  	// The bounds provided must remain valid until a subsequent call to
   205  	// SetBounds has returned. This requirement exists so that iterator
   206  	// implementations may compare old and new bounds to apply low-level
   207  	// optimizations.
   208  	SetBounds(lower, upper []byte)
   209  
   210  	fmt.Stringer
   211  }
   212  
   213  // SeekGEFlags holds flags that may configure the behavior of a forward seek.
   214  // Not all flags are relevant to all iterators.
   215  type SeekGEFlags uint8
   216  
   217  const (
   218  	seekGEFlagTrySeekUsingNext uint8 = iota
   219  	seekGEFlagRelativeSeek
   220  	seekGEFlagBatchJustRefreshed
   221  )
   222  
   223  // SeekGEFlagsNone is the default value of SeekGEFlags, with all flags disabled.
   224  const SeekGEFlagsNone = SeekGEFlags(0)
   225  
   226  // TrySeekUsingNext indicates whether a performance optimization was enabled
   227  // by a caller, indicating the caller has not done any action to move this
   228  // iterator beyond the first key that would be found if this iterator were to
   229  // honestly do the intended seek. For example, say the caller did a
   230  // SeekGE(k1...), followed by SeekGE(k2...) where k1 <= k2, without any
   231  // intermediate positioning calls. The caller can safely specify true for this
   232  // parameter in the second call. As another example, say the caller did do one
   233  // call to Next between the two Seek calls, and k1 < k2. Again, the caller can
   234  // safely specify a true value for this parameter. Note that a false value is
   235  // always safe. The callee is free to ignore the true value if its
   236  // implementation does not permit this optimization.
   237  //
   238  // We make the caller do this determination since a string comparison of k1, k2
   239  // is not necessarily cheap, and there may be many iterators in the iterator
   240  // stack. Doing it once at the root of the iterator stack is cheaper.
   241  //
   242  // This optimization could also be applied to SeekLT (where it would be
   243  // trySeekUsingPrev). We currently only do it for SeekPrefixGE and SeekGE
   244  // because this is where this optimization helps the performance of CockroachDB.
   245  // The SeekLT cases in CockroachDB are typically accompanied with bounds that
   246  // change between seek calls, and is optimized inside certain iterator
   247  // implementations, like singleLevelIterator, without any extra parameter
   248  // passing (though the same amortization of string comparisons could be done to
   249  // improve that optimization, by making the root of the iterator stack do it).
   250  func (s SeekGEFlags) TrySeekUsingNext() bool { return (s & (1 << seekGEFlagTrySeekUsingNext)) != 0 }
   251  
   252  // RelativeSeek is set when in the course of a forward positioning operation, a
   253  // higher-level iterator seeks a lower-level iterator to a larger key than the
   254  // one at the current iterator position.
   255  //
   256  // Concretely, this occurs when the merging iterator observes a range deletion
   257  // covering the key at a level's current position, and the merging iterator
   258  // seeks the level to the range deletion's end key. During lazy-combined
   259  // iteration, this flag signals to the level iterator that the seek is NOT an
   260  // absolute-positioning operation from the perspective of the pebble.Iterator,
   261  // and the level iterator must look for range keys in tables between the current
   262  // iterator position and the new seeked position.
   263  func (s SeekGEFlags) RelativeSeek() bool { return (s & (1 << seekGEFlagRelativeSeek)) != 0 }
   264  
   265  // BatchJustRefreshed is set by Seek[Prefix]GE when an iterator's view of an
   266  // indexed batch was just refreshed. It serves as a signal to the batch iterator
   267  // to ignore the TrySeekUsingNext optimization, because the external knowledge
   268  // imparted by the TrySeekUsingNext flag does not apply to the batch iterator's
   269  // position. See (pebble.Iterator).batchJustRefreshed.
   270  func (s SeekGEFlags) BatchJustRefreshed() bool { return (s & (1 << seekGEFlagBatchJustRefreshed)) != 0 }
   271  
   272  // EnableTrySeekUsingNext returns the provided flags with the
   273  // try-seek-using-next optimization enabled. See TrySeekUsingNext for an
   274  // explanation of this optimization.
   275  func (s SeekGEFlags) EnableTrySeekUsingNext() SeekGEFlags {
   276  	return s | (1 << seekGEFlagTrySeekUsingNext)
   277  }
   278  
   279  // DisableTrySeekUsingNext returns the provided flags with the
   280  // try-seek-using-next optimization disabled.
   281  func (s SeekGEFlags) DisableTrySeekUsingNext() SeekGEFlags {
   282  	return s &^ (1 << seekGEFlagTrySeekUsingNext)
   283  }
   284  
   285  // EnableRelativeSeek returns the provided flags with the relative-seek flag
   286  // enabled. See RelativeSeek for an explanation of this flag's use.
   287  func (s SeekGEFlags) EnableRelativeSeek() SeekGEFlags {
   288  	return s | (1 << seekGEFlagRelativeSeek)
   289  }
   290  
   291  // DisableRelativeSeek returns the provided flags with the relative-seek flag
   292  // disabled.
   293  func (s SeekGEFlags) DisableRelativeSeek() SeekGEFlags {
   294  	return s &^ (1 << seekGEFlagRelativeSeek)
   295  }
   296  
   297  // EnableBatchJustRefreshed returns the provided flags with the
   298  // batch-just-refreshed bit set. See BatchJustRefreshed for an explanation of
   299  // this flag.
   300  func (s SeekGEFlags) EnableBatchJustRefreshed() SeekGEFlags {
   301  	return s | (1 << seekGEFlagBatchJustRefreshed)
   302  }
   303  
   304  // DisableBatchJustRefreshed returns the provided flags with the
   305  // batch-just-refreshed bit unset.
   306  func (s SeekGEFlags) DisableBatchJustRefreshed() SeekGEFlags {
   307  	return s &^ (1 << seekGEFlagBatchJustRefreshed)
   308  }
   309  
   310  // SeekLTFlags holds flags that may configure the behavior of a reverse seek.
   311  // Not all flags are relevant to all iterators.
   312  type SeekLTFlags uint8
   313  
   314  const (
   315  	seekLTFlagRelativeSeek uint8 = iota
   316  )
   317  
   318  // SeekLTFlagsNone is the default value of SeekLTFlags, with all flags disabled.
   319  const SeekLTFlagsNone = SeekLTFlags(0)
   320  
   321  // RelativeSeek is set when in the course of a reverse positioning operation, a
   322  // higher-level iterator seeks a lower-level iterator to a smaller key than the
   323  // one at the current iterator position.
   324  //
   325  // Concretely, this occurs when the merging iterator observes a range deletion
   326  // covering the key at a level's current position, and the merging iterator
   327  // seeks the level to the range deletion's start key. During lazy-combined
   328  // iteration, this flag signals to the level iterator that the seek is NOT an
   329  // absolute-positioning operation from the perspective of the pebble.Iterator,
   330  // and the level iterator must look for range keys in tables between the current
   331  // iterator position and the new seeked position.
   332  func (s SeekLTFlags) RelativeSeek() bool { return s&(1<<seekLTFlagRelativeSeek) != 0 }
   333  
   334  // EnableRelativeSeek returns the provided flags with the relative-seek flag
   335  // enabled. See RelativeSeek for an explanation of this flag's use.
   336  func (s SeekLTFlags) EnableRelativeSeek() SeekLTFlags {
   337  	return s | (1 << seekLTFlagRelativeSeek)
   338  }
   339  
   340  // DisableRelativeSeek returns the provided flags with the relative-seek flag
   341  // disabled.
   342  func (s SeekLTFlags) DisableRelativeSeek() SeekLTFlags {
   343  	return s &^ (1 << seekLTFlagRelativeSeek)
   344  }
   345  
   346  // InternalIteratorStats contains miscellaneous stats produced by
   347  // InternalIterators that are part of the InternalIterator tree. Not every
   348  // field is relevant for an InternalIterator implementation. The field values
   349  // are aggregated as one goes up the InternalIterator tree.
   350  type InternalIteratorStats struct {
   351  	// Bytes in the loaded blocks. If the block was compressed, this is the
   352  	// compressed bytes. Currently, only the index blocks, data blocks
   353  	// containing points, and filter blocks are included.
   354  	BlockBytes uint64
   355  	// Subset of BlockBytes that were in the block cache.
   356  	BlockBytesInCache uint64
   357  	// BlockReadDuration accumulates the duration spent fetching blocks
   358  	// due to block cache misses.
   359  	// TODO(sumeer): this currently excludes the time spent in Reader creation,
   360  	// and in reading the rangedel and rangekey blocks. Fix that.
   361  	BlockReadDuration time.Duration
   362  	// The following can repeatedly count the same points if they are iterated
   363  	// over multiple times. Additionally, they may count a point twice when
   364  	// switching directions. The latter could be improved if needed.
   365  
   366  	// Bytes in keys that were iterated over. Currently, only point keys are
   367  	// included.
   368  	KeyBytes uint64
   369  	// Bytes in values that were iterated over. Currently, only point values are
   370  	// included. For separated values, this is the size of the handle.
   371  	ValueBytes uint64
   372  	// The count of points iterated over.
   373  	PointCount uint64
   374  	// Points that were iterated over that were covered by range tombstones. It
   375  	// can be useful for discovering instances of
   376  	// https://github.com/cockroachdb/pebble/issues/1070.
   377  	PointsCoveredByRangeTombstones uint64
   378  
   379  	// Stats related to points in value blocks encountered during iteration.
   380  	// These are useful to understand outliers, since typical user facing
   381  	// iteration should tend to only look at the latest point, and hence have
   382  	// the following stats close to 0.
   383  	SeparatedPointValue struct {
   384  		// Count is a count of points that were in value blocks. This is not a
   385  		// subset of PointCount: PointCount is produced by mergingIter and if
   386  		// positioned once, and successful in returning a point, will have a
   387  		// PointCount of 1, regardless of how many sstables (and memtables etc.)
   388  		// in the heap got positioned. The count here includes every sstable
   389  		// iterator that got positioned in the heap.
   390  		Count uint64
   391  		// ValueBytes represent the total byte length of the values (in value
   392  		// blocks) of the points corresponding to Count.
   393  		ValueBytes uint64
   394  		// ValueBytesFetched is the total byte length of the values (in value
   395  		// blocks) that were retrieved.
   396  		ValueBytesFetched uint64
   397  	}
   398  }
   399  
   400  // Merge merges the stats in from into the given stats.
   401  func (s *InternalIteratorStats) Merge(from InternalIteratorStats) {
   402  	s.BlockBytes += from.BlockBytes
   403  	s.BlockBytesInCache += from.BlockBytesInCache
   404  	s.BlockReadDuration += from.BlockReadDuration
   405  	s.KeyBytes += from.KeyBytes
   406  	s.ValueBytes += from.ValueBytes
   407  	s.PointCount += from.PointCount
   408  	s.PointsCoveredByRangeTombstones += from.PointsCoveredByRangeTombstones
   409  	s.SeparatedPointValue.Count += from.SeparatedPointValue.Count
   410  	s.SeparatedPointValue.ValueBytes += from.SeparatedPointValue.ValueBytes
   411  	s.SeparatedPointValue.ValueBytesFetched += from.SeparatedPointValue.ValueBytesFetched
   412  }
   413  
   414  func (s *InternalIteratorStats) String() string {
   415  	return redact.StringWithoutMarkers(s)
   416  }
   417  
   418  // SafeFormat implements the redact.SafeFormatter interface.
   419  func (s *InternalIteratorStats) SafeFormat(p redact.SafePrinter, verb rune) {
   420  	p.Printf("blocks: %s cached",
   421  		humanize.Bytes.Uint64(s.BlockBytesInCache),
   422  	)
   423  	if s.BlockBytes != s.BlockBytesInCache || s.BlockReadDuration != 0 {
   424  		p.Printf(", %s not cached (read time: %s)",
   425  			humanize.Bytes.Uint64(s.BlockBytes-s.BlockBytesInCache),
   426  			humanize.FormattedString(s.BlockReadDuration.String()),
   427  		)
   428  	}
   429  	p.Printf("; points: %s", humanize.Count.Uint64(s.PointCount))
   430  
   431  	if s.PointsCoveredByRangeTombstones != 0 {
   432  		p.Printf("(%s tombstoned)", humanize.Count.Uint64(s.PointsCoveredByRangeTombstones))
   433  	}
   434  	p.Printf(" (%s keys, %s values)",
   435  		humanize.Bytes.Uint64(s.KeyBytes),
   436  		humanize.Bytes.Uint64(s.ValueBytes),
   437  	)
   438  	if s.SeparatedPointValue.Count != 0 {
   439  		p.Printf("; separated: %s (%s, %s fetched)",
   440  			humanize.Count.Uint64(s.SeparatedPointValue.Count),
   441  			humanize.Bytes.Uint64(s.SeparatedPointValue.ValueBytes),
   442  			humanize.Bytes.Uint64(s.SeparatedPointValue.ValueBytesFetched))
   443  	}
   444  }