github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/c-deps/libroach/mvcc.cc (about)

     1  // Copyright 2018 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  #include "mvcc.h"
    12  #include "comparator.h"
    13  #include "encoding.h"
    14  #include "keys.h"
    15  
    16  using namespace cockroach;
    17  
    18  namespace cockroach {
    19  
    20  namespace {
    21  
    22  bool IsValidSplitKey(const rocksdb::Slice& key) {
    23    if (key == kMeta2KeyMax) {
    24      // We do not allow splits at Meta2KeyMax. The reason for this is that range
    25      // decriptors are stored at RangeMetaKey(range.EndKey), so the new range
    26      // that ends at Meta2KeyMax would naturally store its decriptor at
    27      // RangeMetaKey(Meta2KeyMax) = Meta1KeyMax. However, Meta1KeyMax already
    28      // serves a different role of holding a second copy of the descriptor for
    29      // the range that spans the meta2/userspace boundary (see case 3a in
    30      // rangeAddressing). If we allowed splits at Meta2KeyMax, the two roles
    31      // would overlap. See #1206.
    32      return false;
    33    }
    34    for (auto span : kSortedNoSplitSpans) {
    35      // kSortedNoSplitSpans is both reverse sorted (largest to smallest) on the
    36      // span end key which allows us to early exit if our key to check is above
    37      // the end of the last no-split span.
    38      if (key.compare(span.second) >= 0) {
    39        return true;
    40      }
    41      if (key.compare(span.first) > 0) {
    42        return false;
    43      }
    44    }
    45    return true;
    46  }
    47  
    48  const int64_t kNanosecondPerSecond = 1e9;
    49  
    50  inline int64_t age_factor(int64_t fromNS, int64_t toNS) {
    51    // Careful about implicit conversions here.
    52    // toNS/1e9 - fromNS/1e9 is not the same since
    53    // "1e9" is a double.
    54    return toNS / kNanosecondPerSecond - fromNS / kNanosecondPerSecond;
    55  }
    56  
    57  }  // namespace
    58  
// Scans the entries of iter_rep from the encoded `start` key up to (but not
// including) the encoded `end` key and accumulates the byte, count and age
// totals of an MVCCStatsResult. Ages are computed relative to now_nanos via
// age_factor (whole-second differences).
//
// On a decoding failure or a metadata/value size mismatch, the function stops
// immediately and returns the partially accumulated stats with stats.status
// set to a formatted error. On success, stats.last_update_nanos is set to
// now_nanos.
//
// TODO(tschottdorf): it's unfortunate that this method duplicates the logic
// in (*MVCCStats).AgeTo. Passing now_nanos in is semantically tricky if there
// is a chance that we run into values ahead of now_nanos. Instead, now_nanos
// should be taken as a hint but determined by the max timestamp encountered.
//
// This implementation must match engine.ComputeStatsGo.
MVCCStatsResult MVCCComputeStatsInternal(::rocksdb::Iterator* const iter_rep, DBKey start,
                                         DBKey end, int64_t now_nanos) {
  MVCCStatsResult stats;
  memset(&stats, 0, sizeof(stats));

  iter_rep->Seek(EncodeKey(start));
  const std::string end_key = EncodeKey(end);

  // `meta` holds the metadata for the key currently being processed: either
  // parsed from an explicit metadata entry or synthesized for a versioned
  // entry that has none (see implicitMeta below).
  cockroach::storage::enginepb::MVCCMetadata meta;
  // Decoded key of the previously visited entry; used to detect the start of
  // a new key.
  std::string prev_key;
  // Set to true whenever a metadata record (explicit or implicit) has been
  // processed; consumed by the next versioned entry that is accounted for
  // outside the sys keyspace.
  bool first = false;
  // NB: making this uninitialized triggers compiler warnings
  // with `-Werror=maybe-uninitialized`. This warning seems like
  // a false positive (changing the above line to `first=true`
  // which results in equivalent code does not remove it either).
  // An assertion has been placed where the compiler would warn.
  int64_t accrue_gc_age_nanos = 0;

  for (; iter_rep->Valid() && kComparator.Compare(iter_rep->key(), end_key) < 0; iter_rep->Next()) {
    const rocksdb::Slice key = iter_rep->key();
    const rocksdb::Slice value = iter_rep->value();

    rocksdb::Slice decoded_key;
    int64_t wall_time = 0;
    int32_t logical = 0;
    if (!DecodeKey(key, &decoded_key, &wall_time, &logical)) {
      stats.status = FmtStatus("unable to decode key");
      return stats;
    }

    // Check for ignored keys.
    if (decoded_key.starts_with(kLocalRangeIDPrefix)) {
      // RangeID-local key.
      int64_t range_id = 0;
      rocksdb::Slice infix, suffix, detail;
      if (!DecodeRangeIDKey(decoded_key, &range_id, &infix, &suffix, &detail)) {
        stats.status = FmtStatus("unable to decode rangeID key");
        return stats;
      }

      if (infix.compare(kLocalRangeIDReplicatedInfix) == 0) {
        // Replicated RangeID-local key.
        if (suffix.compare(kLocalRangeAppliedStateSuffix) == 0) {
          // RangeAppliedState key. Ignore.
          continue;
        }
      }
    }

    // Keys ordered before kLocalMax are only counted in sys_bytes/sys_count.
    const bool isSys = (rocksdb::Slice(decoded_key).compare(kLocalMax) < 0);
    // An entry with a non-zero timestamp component is a versioned value;
    // otherwise it is treated as a metadata entry.
    const bool isValue = (wall_time != 0 || logical != 0);
    // A versioned value whose decoded key differs from the previous entry's
    // key was not preceded by a metadata entry for that key.
    const bool implicitMeta = isValue && decoded_key != prev_key;
    prev_key.assign(decoded_key.data(), decoded_key.size());

    if (implicitMeta) {
      // No MVCCMetadata entry for this series of keys.
      // Synthesize one from this entry: an empty value marks it as deleted.
      meta.Clear();
      meta.set_key_bytes(kMVCCVersionTimestampSize);
      meta.set_val_bytes(value.size());
      meta.set_deleted(value.size() == 0);
      meta.mutable_timestamp()->set_wall_time(wall_time);
    }

    if (!isValue || implicitMeta) {
      // Account for the metadata record itself. For an implicit record there
      // is no stored metadata value, so only the key contributes.
      const int64_t meta_key_size = decoded_key.size() + 1;
      const int64_t meta_val_size = implicitMeta ? 0 : value.size();
      const int64_t total_bytes = meta_key_size + meta_val_size;
      first = true;

      if (!implicitMeta && !meta.ParseFromArray(value.data(), value.size())) {
        stats.status = FmtStatus("unable to decode MVCCMetadata");
        return stats;
      }

      if (isSys) {
        stats.sys_bytes += total_bytes;
        stats.sys_count++;
      } else {
        if (!meta.deleted()) {
          stats.live_bytes += total_bytes;
          stats.live_count++;
        } else {
          stats.gc_bytes_age += total_bytes * age_factor(meta.timestamp().wall_time(), now_nanos);
        }
        stats.key_bytes += meta_key_size;
        stats.val_bytes += meta_val_size;
        stats.key_count++;
        if (meta.has_raw_bytes()) {
          stats.val_count++;
        }
      }
      // An explicit metadata entry is fully handled here. An implicit one
      // falls through so the versioned value itself is accounted for below.
      if (!implicitMeta) {
        continue;
      }
    }

    // Account for a versioned value: its stored value plus the timestamp
    // suffix of its key.
    const int64_t total_bytes = value.size() + kMVCCVersionTimestampSize;
    if (isSys) {
      stats.sys_bytes += total_bytes;
    } else {
      if (first) {
        // First versioned entry following a metadata record: its liveness,
        // intent status and expected sizes are described by `meta`.
        first = false;
        if (!meta.deleted()) {
          stats.live_bytes += total_bytes;
        } else {
          stats.gc_bytes_age += total_bytes * age_factor(meta.timestamp().wall_time(), now_nanos);
        }
        if (meta.has_txn()) {
          stats.intent_bytes += total_bytes;
          stats.intent_count++;
          stats.intent_age += age_factor(meta.timestamp().wall_time(), now_nanos);
        }
        // Cross-check the sizes recorded in the metadata against this entry.
        if (meta.key_bytes() != kMVCCVersionTimestampSize) {
          stats.status = FmtStatus("expected mvcc metadata key bytes to equal %d; got %d",
                                   kMVCCVersionTimestampSize, int(meta.key_bytes()));
          return stats;
        }
        if (meta.val_bytes() != value.size()) {
          stats.status = FmtStatus("expected mvcc metadata val bytes to equal %d; got %d",
                                   int(value.size()), int(meta.val_bytes()));
          return stats;
        }
        // Later versions of this key that are not tombstones age from this
        // timestamp.
        accrue_gc_age_nanos = meta.timestamp().wall_time();
      } else {
        // A subsequent version of the same key. A tombstone (empty value)
        // ages from its own timestamp; any other value ages from the
        // timestamp of the version processed just before it.
        bool is_tombstone = value.size() == 0;
        if (is_tombstone) {
          stats.gc_bytes_age += total_bytes * age_factor(wall_time, now_nanos);
        } else {
          assert(accrue_gc_age_nanos > 0);
          stats.gc_bytes_age += total_bytes * age_factor(accrue_gc_age_nanos, now_nanos);
        }
        accrue_gc_age_nanos = wall_time;
      }
      stats.key_bytes += kMVCCVersionTimestampSize;
      stats.val_bytes += value.size();
      stats.val_count++;
    }
  }

  stats.last_update_nanos = now_nanos;
  return stats;
}
   207  
   208  }  // namespace cockroach
   209  
   210  MVCCStatsResult MVCCComputeStats(DBIterator* iter, DBKey start, DBKey end, int64_t now_nanos) {
   211    return MVCCComputeStatsInternal(iter->rep.get(), start, end, now_nanos);
   212  }
   213  
   214  bool MVCCIsValidSplitKey(DBSlice key) { return IsValidSplitKey(ToSlice(key)); }
   215  
   216  DBStatus MVCCFindSplitKey(DBIterator* iter, DBKey start, DBKey min_split, int64_t target_size,
   217                            DBString* split_key) {
   218    auto iter_rep = iter->rep.get();
   219    const std::string start_key = EncodeKey(start);
   220    iter_rep->Seek(start_key);
   221    const rocksdb::Slice min_split_key = ToSlice(min_split.key);
   222  
   223    int64_t size_so_far = 0;
   224    std::string best_split_key = start_key;
   225    int64_t best_split_diff = std::numeric_limits<int64_t>::max();
   226    std::string prev_key;
   227  
   228    for (; iter_rep->Valid(); iter_rep->Next()) {
   229      const rocksdb::Slice key = iter_rep->key();
   230      rocksdb::Slice decoded_key;
   231      int64_t wall_time = 0;
   232      int32_t logical = 0;
   233      if (!DecodeKey(key, &decoded_key, &wall_time, &logical)) {
   234        return FmtStatus("unable to decode key");
   235      }
   236  
   237      const bool valid = IsValidSplitKey(decoded_key) && decoded_key.compare(min_split_key) >= 0;
   238      int64_t diff = target_size - size_so_far;
   239      if (diff < 0) {
   240        diff = -diff;
   241      }
   242      if (valid && diff < best_split_diff) {
   243        best_split_key = decoded_key.ToString();
   244        best_split_diff = diff;
   245      }
   246      // If diff is increasing, that means we've passed the ideal split point and
   247      // should return the first key that we can. Note that best_split_key may
   248      // still be empty if we haven't reached min_split_key yet.
   249      if (diff > best_split_diff && !best_split_key.empty()) {
   250        break;
   251      }
   252  
   253      const bool is_value = (wall_time != 0 || logical != 0);
   254      if (is_value && decoded_key == prev_key) {
   255        size_so_far += kMVCCVersionTimestampSize + iter_rep->value().size();
   256      } else {
   257        size_so_far += decoded_key.size() + 1 + iter_rep->value().size();
   258        if (is_value) {
   259          size_so_far += kMVCCVersionTimestampSize;
   260        }
   261      }
   262      prev_key.assign(decoded_key.data(), decoded_key.size());
   263    }
   264    if (best_split_key == start_key) {
   265      return kSuccess;
   266    }
   267    *split_key = ToDBString(best_split_key);
   268    return kSuccess;
   269  }
   270  
   271  DBScanResults MVCCGet(DBIterator* iter, DBSlice key, DBTimestamp timestamp, DBTxn txn,
   272                        bool inconsistent, bool tombstones, bool fail_on_more_recent) {
   273    // Get is implemented as a scan where we retrieve a single key. We specify an
   274    // empty key for the end key which will ensure we don't retrieve a key
   275    // different than the start key. This is a bit of a hack.
   276    const DBSlice end = {0, 0};
   277    ScopedStats scoped_iter(iter);
   278    mvccForwardScanner scanner(iter, key, end, timestamp, 1 /* max_keys */, 0 /* target_bytes */, txn,
   279                               inconsistent, tombstones, fail_on_more_recent);
   280    return scanner.get();
   281  }
   282  
   283  DBScanResults MVCCScan(DBIterator* iter, DBSlice start, DBSlice end, DBTimestamp timestamp,
   284                         int64_t max_keys, int64_t target_bytes, DBTxn txn, bool inconsistent,
   285                         bool reverse, bool tombstones, bool fail_on_more_recent) {
   286    ScopedStats scoped_iter(iter);
   287    if (reverse) {
   288      mvccReverseScanner scanner(iter, end, start, timestamp, max_keys, target_bytes, txn,
   289                                 inconsistent, tombstones, fail_on_more_recent);
   290      return scanner.scan();
   291    } else {
   292      mvccForwardScanner scanner(iter, start, end, timestamp, max_keys, target_bytes, txn, inconsistent, tombstones,
   293                                 fail_on_more_recent);
   294      return scanner.scan();
   295    }
   296  }