github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/storage/mvcc_incremental_iterator.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package storage
    12  
    13  import (
    14  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    15  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    16  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    17  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    18  	"github.com/cockroachdb/errors"
    19  )
    20  
// MVCCIncrementalIterator iterates over the diff of the key range
// [startKey,endKey) and time range (startTime,endTime]. If a key was added or
// modified between startTime and endTime, the iterator will position at the
// most recent version (before or at endTime) of that key. If the key was most
// recently deleted, this is signaled with an empty value.
//
// MVCCIncrementalIterator will return an error if either of the following are
// encountered:
//   1. An inline value (non-user data)
//   2. An intent whose timestamp lies within the time bounds
//
// Note: The endTime is inclusive to be consistent with the non-incremental
// iterator, where reads at a given timestamp return writes at that
// timestamp. The startTime is then made exclusive so that iterating time 1 to
// 2 and then 2 to 3 will only return values with time 2 once. An exclusive
// start time would normally make it difficult to scan timestamp 0, but
// CockroachDB uses that as a sentinel for key metadata anyway.
//
// Expected usage:
//    iter := NewMVCCIncrementalIterator(e, MVCCIncrementalIterOptions{
//        IterOptions: IterOptions{UpperBound: endKey},
//        StartTime:   startTime,
//        EndTime:     endTime,
//    })
//    defer iter.Close()
//    for iter.SeekGE(startKey); ; iter.Next() {
//        ok, err := iter.Valid()
//        if err != nil { ... }
//        if !ok { break }
//        [code using iter.Key() and iter.Value()]
//    }
//
// Errors encountered while iterating are reported through Valid(); there is
// no separate error accessor on this type.
//
// Note regarding the correctness of the time-bound iterator optimization:
//
// When using (t_s, t_e], say there is a version (committed or provisional)
// k@t where t is in that interval, that is visible to iter. All sstables
// containing k@t will be included in timeBoundIter. Note that there may be
// multiple sequence numbers for the key k@t at the storage layer, say k@t#n1,
// k@t#n2, where n1 > n2, some of which may be deleted, but the latest
// sequence number will be visible using iter (since not being visible would be
// a contradiction of the initial assumption that k@t is visible to iter).
// Since there is no delete across all sstables that deletes k@t#n1, there is
// no delete in the subset of sstables used by timeBoundIter that deletes
// k@t#n1, so the timeBoundIter will see k@t.
//
// NOTE: This is not used by CockroachDB and has been preserved to serve as an
// oracle to prove the correctness of the new export logic.
type MVCCIncrementalIterator struct {
	// iter is the main iterator. Every key and value exposed by the
	// MVCCIncrementalIterator is read from it.
	iter Iterator

	// A time-bound iterator cannot be used by itself due to a bug in the time-
	// bound iterator (#28358). This was historically augmented with an iterator
	// without the time-bound optimization to act as a sanity iterator, but
	// issues remained (#43799), so now the iterator above is the main iterator
	// and the timeBoundIter is only used to check if any keys can be skipped by
	// the main iterator. It is nil when no timestamp hints were supplied.
	timeBoundIter Iterator

	// startTime (exclusive) and endTime (inclusive) delimit the versions that
	// are surfaced. err and valid hold the state reported by Valid().
	startTime hlc.Timestamp
	endTime   hlc.Timestamp
	err       error
	valid     bool

	// For allocation avoidance, meta is used to store the timestamp of keys
	// regardless if they are metakeys.
	meta enginepb.MVCCMetadata
}
    90  
    91  var _ SimpleIterator = &MVCCIncrementalIterator{}
    92  
// MVCCIncrementalIterOptions bundles options for NewMVCCIncrementalIterator.
type MVCCIncrementalIterOptions struct {
	// IterOptions is handed to Reader.NewIterator. When both MinTimestampHint
	// and MaxTimestampHint are set, NewMVCCIncrementalIterator uses these
	// options for the time-bound iterator and creates the main iterator with
	// only the UpperBound carried over.
	IterOptions IterOptions
	// Keys visible by the MVCCIncrementalIterator must be within (StartTime,
	// EndTime]. Note that if {Min,Max}TimestampHints are specified in
	// IterOptions, the timestamp hints interval should include the start and end
	// time.
	StartTime hlc.Timestamp
	EndTime   hlc.Timestamp
}
   103  
   104  // NewMVCCIncrementalIterator creates an MVCCIncrementalIterator with the
   105  // specified reader and options. The timestamp hint range should not be more
   106  // restrictive than the start and end time range.
   107  // TODO(pbardea): Add validation here and in C++ implementation that the
   108  //  timestamp hints are not more restrictive than incremental iterator's
   109  //  (startTime, endTime] interval.
   110  func NewMVCCIncrementalIterator(
   111  	reader Reader, opts MVCCIncrementalIterOptions,
   112  ) *MVCCIncrementalIterator {
   113  	var iter Iterator
   114  	var timeBoundIter Iterator
   115  	if !opts.IterOptions.MinTimestampHint.IsEmpty() && !opts.IterOptions.MaxTimestampHint.IsEmpty() {
   116  		// An iterator without the timestamp hints is created to ensure that the
   117  		// iterator visits every required version of every key that has changed.
   118  		iter = reader.NewIterator(IterOptions{
   119  			UpperBound: opts.IterOptions.UpperBound,
   120  		})
   121  		timeBoundIter = reader.NewIterator(opts.IterOptions)
   122  	} else {
   123  		iter = reader.NewIterator(opts.IterOptions)
   124  	}
   125  
   126  	return &MVCCIncrementalIterator{
   127  		iter:          iter,
   128  		startTime:     opts.StartTime,
   129  		endTime:       opts.EndTime,
   130  		timeBoundIter: timeBoundIter,
   131  	}
   132  }
   133  
   134  // SeekGE advances the iterator to the first key in the engine which is >= the
   135  // provided key. startKey should be a metadata key to ensure that the iterator
   136  // has a chance to observe any intents on the key if they are there.
   137  func (i *MVCCIncrementalIterator) SeekGE(startKey MVCCKey) {
   138  	if i.timeBoundIter != nil {
   139  		// Check which is the first key seen by the TBI.
   140  		i.timeBoundIter.SeekGE(startKey)
   141  		if ok, err := i.timeBoundIter.Valid(); !ok {
   142  			i.err = err
   143  			i.valid = false
   144  			return
   145  		}
   146  		tbiKey := i.timeBoundIter.Key().Key
   147  		if tbiKey.Compare(startKey.Key) > 0 {
   148  			// If the first key that the TBI sees is ahead of the given startKey, we
   149  			// can seek directly to the first version of the key.
   150  			startKey = MakeMVCCMetadataKey(tbiKey)
   151  		}
   152  	}
   153  	i.iter.SeekGE(startKey)
   154  	if ok, err := i.iter.Valid(); !ok {
   155  		i.err = err
   156  		i.valid = false
   157  		return
   158  	}
   159  	i.err = nil
   160  	i.valid = true
   161  	i.advance()
   162  }
   163  
   164  // Close frees up resources held by the iterator.
   165  func (i *MVCCIncrementalIterator) Close() {
   166  	i.iter.Close()
   167  	if i.timeBoundIter != nil {
   168  		i.timeBoundIter.Close()
   169  	}
   170  }
   171  
   172  // Next advances the iterator to the next key/value in the iteration. After this
   173  // call, Valid() will be true if the iterator was not positioned at the last
   174  // key.
   175  func (i *MVCCIncrementalIterator) Next() {
   176  	i.iter.Next()
   177  	if ok, err := i.iter.Valid(); !ok {
   178  		i.err = err
   179  		i.valid = false
   180  		return
   181  	}
   182  	i.advance()
   183  }
   184  
   185  // NextKey advances the iterator to the next key. This operation is distinct
   186  // from Next which advances to the next version of the current key or the next
   187  // key if the iterator is currently located at the last version for a key.
   188  func (i *MVCCIncrementalIterator) NextKey() {
   189  	i.iter.NextKey()
   190  	if ok, err := i.iter.Valid(); !ok {
   191  		i.err = err
   192  		i.valid = false
   193  		return
   194  	}
   195  	i.advance()
   196  }
   197  
   198  // maybeSkipKeys checks if any keys can be skipped by using a time-bound
   199  // iterator. If keys can be skipped, it will update the main iterator to point
   200  // to the earliest version of the next candidate key.
   201  // It is expected that TBI is at a key <= main iterator key when calling
   202  // maybeSkipKeys().
   203  func (i *MVCCIncrementalIterator) maybeSkipKeys() {
   204  	if i.timeBoundIter == nil {
   205  		// If there is no time bound iterator, we cannot skip any keys.
   206  		return
   207  	}
   208  	tbiKey := i.timeBoundIter.Key().Key
   209  	iterKey := i.iter.Key().Key
   210  	if iterKey.Compare(tbiKey) > 0 {
   211  		// If the iterKey got ahead of the TBI key, advance the TBI Key.
   212  		//
   213  		// The case where iterKey == tbiKey, after this call, is the fast-path is
   214  		// when the TBI and the main iterator are in lockstep. In this case, the
   215  		// main iterator was referencing the next key that would be visited by the
   216  		// TBI. This means that for the incremental iterator to perform a Next or
   217  		// NextKey will require only 1 extra NextKey invocation while they remain in
   218  		// lockstep. This could be common if most keys are modified or the
   219  		// modifications are clustered in keyspace.
   220  		//
   221  		// NB: The Seek() below is expensive, so we aim to avoid it if both
   222  		// iterators remain in lockstep as described above.
   223  		i.timeBoundIter.NextKey()
   224  		if ok, err := i.timeBoundIter.Valid(); !ok {
   225  			i.err = err
   226  			i.valid = false
   227  			return
   228  		}
   229  		tbiKey = i.timeBoundIter.Key().Key
   230  
   231  		cmp := iterKey.Compare(tbiKey)
   232  
   233  		if cmp > 0 {
   234  			// If the tbiKey is still behind the iterKey, the TBI key may be seeing
   235  			// phantom MVCCKey.Keys. These keys may not be seen by the main iterator
   236  			// due to aborted transactions and keys which have been subsumed due to
   237  			// range tombstones. In this case we can SeekGE() the TBI to the main iterator.
   238  			seekKey := MakeMVCCMetadataKey(iterKey)
   239  			i.timeBoundIter.SeekGE(seekKey)
   240  			if ok, err := i.timeBoundIter.Valid(); !ok {
   241  				i.err = err
   242  				i.valid = false
   243  				return
   244  			}
   245  			tbiKey = i.timeBoundIter.Key().Key
   246  			cmp = iterKey.Compare(tbiKey)
   247  		}
   248  
   249  		if cmp < 0 {
   250  			// In the case that the next MVCC key that the TBI observes is not the
   251  			// same as the main iterator, we may be able to skip over a large group
   252  			// of keys. The main iterator is seeked to the TBI in hopes that many
   253  			// keys were skipped. Note that a Seek is an order of magnitude more
   254  			// expensive than a Next call.
   255  			seekKey := MakeMVCCMetadataKey(tbiKey)
   256  			i.iter.SeekGE(seekKey)
   257  			if ok, err := i.iter.Valid(); !ok {
   258  				i.err = err
   259  				i.valid = false
   260  				return
   261  			}
   262  		}
   263  	}
   264  }
   265  
   266  // advance advances the main iterator until it is referencing a key within
   267  // (start_time, end_time].
   268  // It populates i.err with an error if either of the following was encountered:
   269  // a) an inline value
   270  // b) an intent with a timestamp within the incremental iterator's bounds
   271  func (i *MVCCIncrementalIterator) advance() {
   272  	for {
   273  		i.maybeSkipKeys()
   274  		if !i.valid {
   275  			return
   276  		}
   277  
   278  		unsafeMetaKey := i.iter.UnsafeKey()
   279  		if unsafeMetaKey.IsValue() {
   280  			// They key is an MVCC value and note an intent.
   281  			// Intents are handled next.
   282  			i.meta.Reset()
   283  			i.meta.Timestamp = hlc.LegacyTimestamp(unsafeMetaKey.Timestamp)
   284  		} else {
   285  			// The key is a metakey (an intent), this is used later to see if the
   286  			// timestamp of this intent is within the incremental iterator's time
   287  			// bounds.
   288  			if i.err = protoutil.Unmarshal(i.iter.UnsafeValue(), &i.meta); i.err != nil {
   289  				i.valid = false
   290  				return
   291  			}
   292  		}
   293  
   294  		if i.meta.IsInline() {
   295  			// Inline values are only used in non-user data. They're not needed
   296  			// for backup, so they're not handled by this method. If one shows
   297  			// up, throw an error so it's obvious something is wrong.
   298  			i.valid = false
   299  			i.err = errors.Errorf("inline values are unsupported by MVCCIncrementalIterator: %s",
   300  				unsafeMetaKey.Key)
   301  			return
   302  		}
   303  
   304  		metaTimestamp := hlc.Timestamp(i.meta.Timestamp)
   305  		if i.meta.Txn != nil {
   306  			if i.startTime.Less(metaTimestamp) && metaTimestamp.LessEq(i.endTime) {
   307  				i.err = &roachpb.WriteIntentError{
   308  					Intents: []roachpb.Intent{
   309  						roachpb.MakeIntent(i.meta.Txn, i.iter.Key().Key),
   310  					},
   311  				}
   312  				i.valid = false
   313  				return
   314  			}
   315  			i.iter.Next()
   316  			if ok, err := i.iter.Valid(); !ok {
   317  				i.err = err
   318  				i.valid = false
   319  				return
   320  			}
   321  			continue
   322  		}
   323  
   324  		// Note that MVCC keys are sorted by key, then by _descending_ timestamp
   325  		// order with the exception of the metakey (timestamp 0) being sorted
   326  		// first. See mvcc.h for more information.
   327  		if i.endTime.Less(metaTimestamp) {
   328  			i.iter.Next()
   329  		} else if metaTimestamp.LessEq(i.startTime) {
   330  			i.iter.NextKey()
   331  		} else {
   332  			// The current key is a valid user key and within the time bounds. We are
   333  			// done.
   334  			break
   335  		}
   336  
   337  		if ok, err := i.iter.Valid(); !ok {
   338  			i.err = err
   339  			i.valid = false
   340  			return
   341  		}
   342  	}
   343  }
   344  
// Valid must be called after any call to SeekGE(), Next(), or similar methods.
// It returns (true, nil) if the iterator points to a valid key (it is undefined
// to call Key(), Value(), or similar methods unless Valid() has returned (true,
// nil)). It returns (false, nil) if the iterator has moved past the end of the
// valid range, or (false, err) if an error has occurred. Valid() will never
// return true with a non-nil error.
//
// The returned values are the state recorded by the most recent positioning
// call; Valid itself does not touch the underlying iterators.
func (i *MVCCIncrementalIterator) Valid() (bool, error) {
	return i.valid, i.err
}
   354  
// Key returns the current key, as reported by the main iterator's Key().
func (i *MVCCIncrementalIterator) Key() MVCCKey {
	return i.iter.Key()
}
   359  
// Value returns the current value as a byte slice, as reported by the main
// iterator's Value().
func (i *MVCCIncrementalIterator) Value() []byte {
	return i.iter.Value()
}
   364  
// UnsafeKey returns the same key as Key, but the memory is invalidated on the
// next call that repositions or closes the iterator ({SeekGE,Next,NextKey,
// Close}). It delegates to the main iterator's UnsafeKey().
func (i *MVCCIncrementalIterator) UnsafeKey() MVCCKey {
	return i.iter.UnsafeKey()
}
   370  
// UnsafeValue returns the same value as Value, but the memory is invalidated on
// the next call that repositions or closes the iterator ({SeekGE,Next,NextKey,
// Close}). It delegates to the main iterator's UnsafeValue().
func (i *MVCCIncrementalIterator) UnsafeValue() []byte {
	return i.iter.UnsafeValue()
}