github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/c-deps/libroach/mvcc.cc (about) 1 // Copyright 2018 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 #include "mvcc.h" 12 #include "comparator.h" 13 #include "encoding.h" 14 #include "keys.h" 15 16 using namespace cockroach; 17 18 namespace cockroach { 19 20 namespace { 21 22 bool IsValidSplitKey(const rocksdb::Slice& key) { 23 if (key == kMeta2KeyMax) { 24 // We do not allow splits at Meta2KeyMax. The reason for this is that range 25 // decriptors are stored at RangeMetaKey(range.EndKey), so the new range 26 // that ends at Meta2KeyMax would naturally store its decriptor at 27 // RangeMetaKey(Meta2KeyMax) = Meta1KeyMax. However, Meta1KeyMax already 28 // serves a different role of holding a second copy of the descriptor for 29 // the range that spans the meta2/userspace boundary (see case 3a in 30 // rangeAddressing). If we allowed splits at Meta2KeyMax, the two roles 31 // would overlap. See #1206. 32 return false; 33 } 34 for (auto span : kSortedNoSplitSpans) { 35 // kSortedNoSplitSpans is both reverse sorted (largest to smallest) on the 36 // span end key which allows us to early exit if our key to check is above 37 // the end of the last no-split span. 38 if (key.compare(span.second) >= 0) { 39 return true; 40 } 41 if (key.compare(span.first) > 0) { 42 return false; 43 } 44 } 45 return true; 46 } 47 48 const int64_t kNanosecondPerSecond = 1e9; 49 50 inline int64_t age_factor(int64_t fromNS, int64_t toNS) { 51 // Careful about implicit conversions here. 52 // toNS/1e9 - fromNS/1e9 is not the same since 53 // "1e9" is a double. 54 return toNS / kNanosecondPerSecond - fromNS / kNanosecondPerSecond; 55 } 56 57 } // namespace 58 59 // TODO(tschottdorf): it's unfortunate that this method duplicates the logic 60 // in (*MVCCStats).AgeTo. Passing now_nanos in is semantically tricky if there 61 // is a chance that we run into values ahead of now_nanos. Instead, now_nanos 62 // should be taken as a hint but determined by the max timestamp encountered. 63 // 64 // This implementation must match engine.ComputeStatsGo. 65 MVCCStatsResult MVCCComputeStatsInternal(::rocksdb::Iterator* const iter_rep, DBKey start, 66 DBKey end, int64_t now_nanos) { 67 MVCCStatsResult stats; 68 memset(&stats, 0, sizeof(stats)); 69 70 iter_rep->Seek(EncodeKey(start)); 71 const std::string end_key = EncodeKey(end); 72 73 cockroach::storage::enginepb::MVCCMetadata meta; 74 std::string prev_key; 75 bool first = false; 76 // NB: making this uninitialized triggers compiler warnings 77 // with `-Werror=maybe-uninitialized`. This warning seems like 78 // a false positive (changing the above line to `first=true` 79 // which results in equivalent code does not remove it either). 80 // An assertion has been placed where the compiler would warn. 81 int64_t accrue_gc_age_nanos = 0; 82 83 for (; iter_rep->Valid() && kComparator.Compare(iter_rep->key(), end_key) < 0; iter_rep->Next()) { 84 const rocksdb::Slice key = iter_rep->key(); 85 const rocksdb::Slice value = iter_rep->value(); 86 87 rocksdb::Slice decoded_key; 88 int64_t wall_time = 0; 89 int32_t logical = 0; 90 if (!DecodeKey(key, &decoded_key, &wall_time, &logical)) { 91 stats.status = FmtStatus("unable to decode key"); 92 return stats; 93 } 94 95 // Check for ignored keys. 96 if (decoded_key.starts_with(kLocalRangeIDPrefix)) { 97 // RangeID-local key. 98 int64_t range_id = 0; 99 rocksdb::Slice infix, suffix, detail; 100 if (!DecodeRangeIDKey(decoded_key, &range_id, &infix, &suffix, &detail)) { 101 stats.status = FmtStatus("unable to decode rangeID key"); 102 return stats; 103 } 104 105 if (infix.compare(kLocalRangeIDReplicatedInfix) == 0) { 106 // Replicated RangeID-local key. 107 if (suffix.compare(kLocalRangeAppliedStateSuffix) == 0) { 108 // RangeAppliedState key. Ignore. 109 continue; 110 } 111 } 112 } 113 114 const bool isSys = (rocksdb::Slice(decoded_key).compare(kLocalMax) < 0); 115 const bool isValue = (wall_time != 0 || logical != 0); 116 const bool implicitMeta = isValue && decoded_key != prev_key; 117 prev_key.assign(decoded_key.data(), decoded_key.size()); 118 119 if (implicitMeta) { 120 // No MVCCMetadata entry for this series of keys. 121 meta.Clear(); 122 meta.set_key_bytes(kMVCCVersionTimestampSize); 123 meta.set_val_bytes(value.size()); 124 meta.set_deleted(value.size() == 0); 125 meta.mutable_timestamp()->set_wall_time(wall_time); 126 } 127 128 if (!isValue || implicitMeta) { 129 const int64_t meta_key_size = decoded_key.size() + 1; 130 const int64_t meta_val_size = implicitMeta ? 0 : value.size(); 131 const int64_t total_bytes = meta_key_size + meta_val_size; 132 first = true; 133 134 if (!implicitMeta && !meta.ParseFromArray(value.data(), value.size())) { 135 stats.status = FmtStatus("unable to decode MVCCMetadata"); 136 return stats; 137 } 138 139 if (isSys) { 140 stats.sys_bytes += total_bytes; 141 stats.sys_count++; 142 } else { 143 if (!meta.deleted()) { 144 stats.live_bytes += total_bytes; 145 stats.live_count++; 146 } else { 147 stats.gc_bytes_age += total_bytes * age_factor(meta.timestamp().wall_time(), now_nanos); 148 } 149 stats.key_bytes += meta_key_size; 150 stats.val_bytes += meta_val_size; 151 stats.key_count++; 152 if (meta.has_raw_bytes()) { 153 stats.val_count++; 154 } 155 } 156 if (!implicitMeta) { 157 continue; 158 } 159 } 160 161 const int64_t total_bytes = value.size() + kMVCCVersionTimestampSize; 162 if (isSys) { 163 stats.sys_bytes += total_bytes; 164 } else { 165 if (first) { 166 first = false; 167 if (!meta.deleted()) { 168 stats.live_bytes += total_bytes; 169 } else { 170 stats.gc_bytes_age += total_bytes * age_factor(meta.timestamp().wall_time(), now_nanos); 171 } 172 if (meta.has_txn()) { 173 stats.intent_bytes += total_bytes; 174 stats.intent_count++; 175 stats.intent_age += age_factor(meta.timestamp().wall_time(), now_nanos); 176 } 177 if (meta.key_bytes() != kMVCCVersionTimestampSize) { 178 stats.status = FmtStatus("expected mvcc metadata key bytes to equal %d; got %d", 179 kMVCCVersionTimestampSize, int(meta.key_bytes())); 180 return stats; 181 } 182 if (meta.val_bytes() != value.size()) { 183 stats.status = FmtStatus("expected mvcc metadata val bytes to equal %d; got %d", 184 int(value.size()), int(meta.val_bytes())); 185 return stats; 186 } 187 accrue_gc_age_nanos = meta.timestamp().wall_time(); 188 } else { 189 bool is_tombstone = value.size() == 0; 190 if (is_tombstone) { 191 stats.gc_bytes_age += total_bytes * age_factor(wall_time, now_nanos); 192 } else { 193 assert(accrue_gc_age_nanos > 0); 194 stats.gc_bytes_age += total_bytes * age_factor(accrue_gc_age_nanos, now_nanos); 195 } 196 accrue_gc_age_nanos = wall_time; 197 } 198 stats.key_bytes += kMVCCVersionTimestampSize; 199 stats.val_bytes += value.size(); 200 stats.val_count++; 201 } 202 } 203 204 stats.last_update_nanos = now_nanos; 205 return stats; 206 } 207 208 } // namespace cockroach 209 210 MVCCStatsResult MVCCComputeStats(DBIterator* iter, DBKey start, DBKey end, int64_t now_nanos) { 211 return MVCCComputeStatsInternal(iter->rep.get(), start, end, now_nanos); 212 } 213 214 bool MVCCIsValidSplitKey(DBSlice key) { return IsValidSplitKey(ToSlice(key)); } 215 216 DBStatus MVCCFindSplitKey(DBIterator* iter, DBKey start, DBKey min_split, int64_t target_size, 217 DBString* split_key) { 218 auto iter_rep = iter->rep.get(); 219 const std::string start_key = EncodeKey(start); 220 iter_rep->Seek(start_key); 221 const rocksdb::Slice min_split_key = ToSlice(min_split.key); 222 223 int64_t size_so_far = 0; 224 std::string best_split_key = start_key; 225 int64_t best_split_diff = std::numeric_limits<int64_t>::max(); 226 std::string prev_key; 227 228 for (; iter_rep->Valid(); iter_rep->Next()) { 229 const rocksdb::Slice key = iter_rep->key(); 230 rocksdb::Slice decoded_key; 231 int64_t wall_time = 0; 232 int32_t logical = 0; 233 if (!DecodeKey(key, &decoded_key, &wall_time, &logical)) { 234 return FmtStatus("unable to decode key"); 235 } 236 237 const bool valid = IsValidSplitKey(decoded_key) && decoded_key.compare(min_split_key) >= 0; 238 int64_t diff = target_size - size_so_far; 239 if (diff < 0) { 240 diff = -diff; 241 } 242 if (valid && diff < best_split_diff) { 243 best_split_key = decoded_key.ToString(); 244 best_split_diff = diff; 245 } 246 // If diff is increasing, that means we've passed the ideal split point and 247 // should return the first key that we can. Note that best_split_key may 248 // still be empty if we haven't reached min_split_key yet. 249 if (diff > best_split_diff && !best_split_key.empty()) { 250 break; 251 } 252 253 const bool is_value = (wall_time != 0 || logical != 0); 254 if (is_value && decoded_key == prev_key) { 255 size_so_far += kMVCCVersionTimestampSize + iter_rep->value().size(); 256 } else { 257 size_so_far += decoded_key.size() + 1 + iter_rep->value().size(); 258 if (is_value) { 259 size_so_far += kMVCCVersionTimestampSize; 260 } 261 } 262 prev_key.assign(decoded_key.data(), decoded_key.size()); 263 } 264 if (best_split_key == start_key) { 265 return kSuccess; 266 } 267 *split_key = ToDBString(best_split_key); 268 return kSuccess; 269 } 270 271 DBScanResults MVCCGet(DBIterator* iter, DBSlice key, DBTimestamp timestamp, DBTxn txn, 272 bool inconsistent, bool tombstones, bool fail_on_more_recent) { 273 // Get is implemented as a scan where we retrieve a single key. We specify an 274 // empty key for the end key which will ensure we don't retrieve a key 275 // different than the start key. This is a bit of a hack. 276 const DBSlice end = {0, 0}; 277 ScopedStats scoped_iter(iter); 278 mvccForwardScanner scanner(iter, key, end, timestamp, 1 /* max_keys */, 0 /* target_bytes */, txn, 279 inconsistent, tombstones, fail_on_more_recent); 280 return scanner.get(); 281 } 282 283 DBScanResults MVCCScan(DBIterator* iter, DBSlice start, DBSlice end, DBTimestamp timestamp, 284 int64_t max_keys, int64_t target_bytes, DBTxn txn, bool inconsistent, 285 bool reverse, bool tombstones, bool fail_on_more_recent) { 286 ScopedStats scoped_iter(iter); 287 if (reverse) { 288 mvccReverseScanner scanner(iter, end, start, timestamp, max_keys, target_bytes, txn, 289 inconsistent, tombstones, fail_on_more_recent); 290 return scanner.scan(); 291 } else { 292 mvccForwardScanner scanner(iter, start, end, timestamp, max_keys, target_bytes, txn, inconsistent, tombstones, 293 fail_on_more_recent); 294 return scanner.scan(); 295 } 296 }