github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/c-deps/libroach/incremental_iterator.cc (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  //
     4  // Use of this software is governed by the Business Source License included
     5  // in the file licenses/BSL.txt.
     6  //
     7  // As of the Change Date specified in that file, in accordance with the Business
     8  // Source License, use of this software will be governed by the Apache License,
     9  // Version 2.0, included in the file licenses/APL.txt.
    10  
    11  #include "incremental_iterator.h"
    12  #include "comparator.h"
    13  #include "encoding.h"
    14  #include "protos/roachpb/errors.pb.h"
    15  
    16  using namespace cockroach;
    17  
    18  DBIncrementalIterator::DBIncrementalIterator(DBEngine* engine, DBIterOptions opts, DBKey start,
    19                                               DBKey end, DBString* write_intent)
    20      : engine(engine),
    21        opts(opts),
    22        valid(true),
    23        status(kSuccess),
    24        start(start),
    25        end(end),
    26        write_intent(write_intent) {
    27  
    28    start_time.set_wall_time(start.wall_time);
    29    start_time.set_logical(start.logical);
    30    end_time.set_wall_time(end.wall_time);
    31    end_time.set_logical(end.logical);
    32  
    33    DBIterOptions iter_opts = opts;
    34    if (!EmptyTimestamp(opts.min_timestamp_hint) || !EmptyTimestamp(opts.max_timestamp_hint)) {
    35      assert(!EmptyTimestamp(opts.max_timestamp_hint));
    36      DBIterOptions nontimebound_opts = DBIterOptions();
    37      nontimebound_opts.upper_bound = opts.upper_bound;
    38      iter_opts = nontimebound_opts;
    39      time_bound_iter.reset(DBNewIter(engine, opts));
    40    }
    41    iter.reset(DBNewIter(engine, iter_opts));
    42  }
    43  
    44  DBIncrementalIterator::~DBIncrementalIterator() {}
    45  
    46  // legacyTimestampIsLess compares the timestamps t1 and t2, and returns a
    47  // boolean indicating whether t1 is less than t2.
    48  bool DBIncrementalIterator::legacyTimestampIsLess(const cockroach::util::hlc::LegacyTimestamp& t1,
    49                                                    const cockroach::util::hlc::LegacyTimestamp& t2) {
    50    return t1.wall_time() < t2.wall_time() ||
    51           (t1.wall_time() == t2.wall_time() && t1.logical() < t2.logical());
    52  }
    53  
    54  // extractKey extracts the key portion of the mvcc_key and put it in key. It
    55  // returns a validity indicator.
    56  WARN_UNUSED_RESULT bool DBIncrementalIterator::extractKey(rocksdb::Slice mvcc_key,
    57                                                            rocksdb::Slice* key) {
    58    rocksdb::Slice ts;
    59    if (!SplitKey(mvcc_key, key, &ts)) {
    60      valid = false;
    61      status = FmtStatus("failed to split mvcc key");
    62      return false;
    63    }
    64    return true;
    65  }
    66  
    67  // maybeSkipKeys checks if any keys can be skipped by using a time-bound
    68  // iterator. If keys can be skipped, it will update the main iterator to point
    69  // to the earliest version of the next candidate key.
    70  // It is expected that TBI is at a key <= main iterator key when calling
    71  // maybeSkipKeys().
    72  void DBIncrementalIterator::maybeSkipKeys() {
    73    if (time_bound_iter == nullptr) {
    74      // We don't have a TBI, so we can't skip any keys.
    75      return;
    76    }
    77  
    78    rocksdb::Slice tbi_key;
    79    if (!extractKey(time_bound_iter->rep->key(), &tbi_key)) {
    80      return;
    81    }
    82    rocksdb::Slice iter_key;
    83    if (!extractKey(iter->rep->key(), &iter_key)) {
    84      return;
    85    }
    86  
    87    if (iter_key.compare(tbi_key) > 0) {
    88      // If the iterKey got ahead of the TBI key, advance the TBI Key.
    89      //
    90      // The case where iterKey == tbiKey, after this call, is the fast-path is
    91      // when the TBI and the main iterator are in lockstep. In this case, the
    92      // main iterator was referencing the next key that would be visited by the
    93      // TBI. This means that for the incremental iterator to perform a Next or
    94      // NextKey will require only 1 extra NextKey invocation while they remain in
    95      // lockstep. This could be common if most keys are modified or the
    96      // modifications are clustered in keyspace.
    97      //
    98      // NB: The Seek() below is expensive, so we aim to avoid it if both
    99      // iterators remain in lockstep as described above.
   100      auto state = DBIterNext(time_bound_iter.get(), true /* skip_current_key_versions */);
   101      if (!state.valid) {
   102        status = state.status;
   103        valid = false;
   104        return;
   105      }
   106      if (!extractKey(time_bound_iter->rep->key(), &tbi_key)) {
   107        return;
   108      }
   109  
   110      auto cmp = iter_key.compare(tbi_key);
   111      if (cmp > 0) {
   112        // If the tbiKey is still behind the iterKey, the TBI key may be seeing
   113        // phantom MVCCKey.Keys. These keys may not be seen by the main iterator
   114        // due to aborted transactions and keys which have been subsumed due to
   115        // range tombstones. In this case we can SeekGE() the TBI to the main iterator.
   116        DBKey seek_to = {};
   117        // NB: We don't ToDBKey as iter_key is already split.
   118        seek_to.key = ToDBSlice(iter_key);
   119        state = DBIterSeek(time_bound_iter.get(), seek_to);
   120        if (!state.valid) {
   121          status = state.status;
   122          valid = false;
   123          return;
   124        }
   125        if (!extractKey(time_bound_iter->rep->key(), &tbi_key)) {
   126          return;
   127        }
   128        cmp = iter_key.compare(tbi_key);
   129      }
   130  
   131      if (cmp < 0) {
   132        // In the case that the next MVCC key that the TBI observes is not the
   133        // same as the main iterator, we may be able to skip over a large group of
   134        // keys. The main iterator is seeked to the TBI in hopes that many keys
   135        // were skipped. Note that a Seek() is an order of magnitude more
   136        // expensive than a Next().
   137        DBKey seek_to = {};
   138        // NB: We don't ToDBKey as iter_key is already split.
   139        seek_to.key = ToDBSlice(tbi_key);
   140        state = DBIterSeek(iter.get(), seek_to);
   141        if (!state.valid) {
   142          status = state.status;
   143          valid = false;
   144          return;
   145        }
   146      }
   147    }
   148  }
   149  
   150  // advanceKey advances the main iterator until it is referencing a key within
   151  // (start_time, end_time].
   152  // It populates i.err with an error if either of the following was encountered:
   153  // a) an inline value
   154  // b) an intent with a timestamp within the incremental iterator's bounds
   155  void DBIncrementalIterator::advanceKey() {
   156    for (;;) {
   157      maybeSkipKeys();
   158      if (!valid) {
   159        return;
   160      }
   161  
   162      rocksdb::Slice key;
   163      int64_t wall_time = 0;
   164      int32_t logical = 0;
   165      if (!DecodeKey(iter.get()->rep->key(), &key, &wall_time, &logical)) {
   166        status = ToDBString("unable to decode key");
   167        valid = false;
   168        return;
   169      }
   170  
   171      cockroach::storage::enginepb::MVCCMetadata meta;
   172      if (wall_time != 0 || logical != 0) {
   173        meta.mutable_timestamp()->set_wall_time(wall_time);
   174        meta.mutable_timestamp()->set_logical(logical);
   175      } else {
   176        const auto value = iter->rep->value();
   177        if (!meta.ParseFromArray(value.data(), value.size())) {
   178          status = ToDBString("failed to parse meta");
   179          valid = false;
   180          return;
   181        }
   182      }
   183  
   184      // Check for an inline value, as these are only used in non-user data.
   185      // They're not needed for backup, so they're not handled by this method.
   186      // If one shows up, throw an error so it's obvious something is wrong.
   187      if (meta.has_raw_bytes()) {
   188        valid = false;
   189        status = ToDBString("Inline values are unsupported by the IncrementalIterator");
   190        return;
   191      }
   192  
   193      if (meta.has_txn()) {
   194        if (legacyTimestampIsLess(start_time, meta.timestamp()) &&
   195            !legacyTimestampIsLess(end_time, meta.timestamp())) {
   196          cockroach::roachpb::WriteIntentError err;
   197          cockroach::roachpb::Intent* intent = err.add_intents();
   198          intent->mutable_single_key_span()->set_key(key.data(), key.size());
   199          intent->mutable_txn()->CopyFrom(meta.txn());
   200  
   201          status = ToDBString("WriteIntentError");
   202          valid = false;
   203          *write_intent = ToDBString(err.SerializeAsString());
   204  
   205          return;
   206        }
   207      }
   208  
   209      DBIterState state;
   210      if (legacyTimestampIsLess(end_time, meta.timestamp())) {
   211        state = DBIterNext(iter.get(), false);
   212      } else if (!legacyTimestampIsLess(start_time, meta.timestamp())) {
   213        state = DBIterNext(iter.get(), true);
   214      } else {
   215        // We have found a key within the time bounds, break.
   216        break;
   217      }
   218  
   219      if (!state.valid) {
   220        status = state.status;
   221        valid = false;
   222        return;
   223      }
   224    }
   225  }
   226  
   227  DBIterState DBIncrementalIterator::getState() {
   228    DBIterState state = {};
   229    state.valid = valid;
   230    state.status = status;
   231  
   232    if (state.valid) {
   233      rocksdb::Slice key;
   234      state.valid = DecodeKey(iter.get()->rep->key(), &key, &state.key.wall_time, &state.key.logical);
   235      if (state.valid) {
   236        state.key.key = ToDBSlice(key);
   237        state.value = ToDBSlice(iter.get()->rep->value());
   238      }
   239    }
   240  
   241    return state;
   242  }
   243  
   244  // seek advances the iterator to the first key in the engine which is >= the
   245  // provided key. key should be a metadata key to ensure that the iterator has a
   246  // chance to observe any intents on the key if they are there.
   247  DBIterState DBIncrementalIterator::seek(DBKey key) {
   248    if (time_bound_iter != nullptr) {
   249      // Check which is the first key seen by the TBI.
   250      auto state = DBIterSeek(time_bound_iter.get(), key);
   251      if (!state.valid) {
   252        status = state.status;
   253        valid = false;
   254        return getState();
   255      }
   256      const rocksdb::Slice tbi_key(time_bound_iter->rep->key());
   257      // NB: iter_key needs to be constructed with ToSlice to ensure that an empty
   258      // rocksdb::Slice is properly created in the common case that key is an
   259      // empty key (the first key).
   260      const rocksdb::Slice iter_key(ToSlice(key.key));
   261      if (tbi_key.compare(iter_key) > 0) {
   262        // If the first key that the TBI sees is ahead of the given startKey, we
   263        // can seek directly to the first version of the key.
   264        key = ToDBKey(tbi_key);
   265      }
   266    }
   267    auto state = DBIterSeek(iter.get(), key);
   268    if (!state.valid) {
   269      status = state.status;
   270      valid = false;
   271      return getState();
   272    }
   273    advanceKey();
   274    return getState();
   275  }
   276  
   277  DBIterState DBIncrementalIterator::next(bool skip_current_key_versions) {
   278    auto state = DBIterNext(iter.get(), skip_current_key_versions);
   279    if (!state.valid) {
   280      status = state.status;
   281      valid = false;
   282      return getState();
   283    }
   284    advanceKey();
   285    return getState();
   286  }
   287  
   288  const rocksdb::Slice DBIncrementalIterator::key() { return iter.get()->rep->key(); }
   289  
   290  const rocksdb::Slice DBIncrementalIterator::value() { return iter.get()->rep->value(); }