github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/c-deps/libroach/incremental_iterator.cc (about) 1 // Copyright 2019 The Cockroach Authors. 2 // 3 // 4 // Use of this software is governed by the Business Source License included 5 // in the file licenses/BSL.txt. 6 // 7 // As of the Change Date specified in that file, in accordance with the Business 8 // Source License, use of this software will be governed by the Apache License, 9 // Version 2.0, included in the file licenses/APL.txt. 10 11 #include "incremental_iterator.h" 12 #include "comparator.h" 13 #include "encoding.h" 14 #include "protos/roachpb/errors.pb.h" 15 16 using namespace cockroach; 17 18 DBIncrementalIterator::DBIncrementalIterator(DBEngine* engine, DBIterOptions opts, DBKey start, 19 DBKey end, DBString* write_intent) 20 : engine(engine), 21 opts(opts), 22 valid(true), 23 status(kSuccess), 24 start(start), 25 end(end), 26 write_intent(write_intent) { 27 28 start_time.set_wall_time(start.wall_time); 29 start_time.set_logical(start.logical); 30 end_time.set_wall_time(end.wall_time); 31 end_time.set_logical(end.logical); 32 33 DBIterOptions iter_opts = opts; 34 if (!EmptyTimestamp(opts.min_timestamp_hint) || !EmptyTimestamp(opts.max_timestamp_hint)) { 35 assert(!EmptyTimestamp(opts.max_timestamp_hint)); 36 DBIterOptions nontimebound_opts = DBIterOptions(); 37 nontimebound_opts.upper_bound = opts.upper_bound; 38 iter_opts = nontimebound_opts; 39 time_bound_iter.reset(DBNewIter(engine, opts)); 40 } 41 iter.reset(DBNewIter(engine, iter_opts)); 42 } 43 44 DBIncrementalIterator::~DBIncrementalIterator() {} 45 46 // legacyTimestampIsLess compares the timestamps t1 and t2, and returns a 47 // boolean indicating whether t1 is less than t2. 48 bool DBIncrementalIterator::legacyTimestampIsLess(const cockroach::util::hlc::LegacyTimestamp& t1, 49 const cockroach::util::hlc::LegacyTimestamp& t2) { 50 return t1.wall_time() < t2.wall_time() || 51 (t1.wall_time() == t2.wall_time() && t1.logical() < t2.logical()); 52 } 53 54 // extractKey extracts the key portion of the mvcc_key and put it in key. It 55 // returns a validity indicator. 56 WARN_UNUSED_RESULT bool DBIncrementalIterator::extractKey(rocksdb::Slice mvcc_key, 57 rocksdb::Slice* key) { 58 rocksdb::Slice ts; 59 if (!SplitKey(mvcc_key, key, &ts)) { 60 valid = false; 61 status = FmtStatus("failed to split mvcc key"); 62 return false; 63 } 64 return true; 65 } 66 67 // maybeSkipKeys checks if any keys can be skipped by using a time-bound 68 // iterator. If keys can be skipped, it will update the main iterator to point 69 // to the earliest version of the next candidate key. 70 // It is expected that TBI is at a key <= main iterator key when calling 71 // maybeSkipKeys(). 72 void DBIncrementalIterator::maybeSkipKeys() { 73 if (time_bound_iter == nullptr) { 74 // We don't have a TBI, so we can't skip any keys. 75 return; 76 } 77 78 rocksdb::Slice tbi_key; 79 if (!extractKey(time_bound_iter->rep->key(), &tbi_key)) { 80 return; 81 } 82 rocksdb::Slice iter_key; 83 if (!extractKey(iter->rep->key(), &iter_key)) { 84 return; 85 } 86 87 if (iter_key.compare(tbi_key) > 0) { 88 // If the iterKey got ahead of the TBI key, advance the TBI Key. 89 // 90 // The case where iterKey == tbiKey, after this call, is the fast-path is 91 // when the TBI and the main iterator are in lockstep. In this case, the 92 // main iterator was referencing the next key that would be visited by the 93 // TBI. This means that for the incremental iterator to perform a Next or 94 // NextKey will require only 1 extra NextKey invocation while they remain in 95 // lockstep. This could be common if most keys are modified or the 96 // modifications are clustered in keyspace. 97 // 98 // NB: The Seek() below is expensive, so we aim to avoid it if both 99 // iterators remain in lockstep as described above. 100 auto state = DBIterNext(time_bound_iter.get(), true /* skip_current_key_versions */); 101 if (!state.valid) { 102 status = state.status; 103 valid = false; 104 return; 105 } 106 if (!extractKey(time_bound_iter->rep->key(), &tbi_key)) { 107 return; 108 } 109 110 auto cmp = iter_key.compare(tbi_key); 111 if (cmp > 0) { 112 // If the tbiKey is still behind the iterKey, the TBI key may be seeing 113 // phantom MVCCKey.Keys. These keys may not be seen by the main iterator 114 // due to aborted transactions and keys which have been subsumed due to 115 // range tombstones. In this case we can SeekGE() the TBI to the main iterator. 116 DBKey seek_to = {}; 117 // NB: We don't ToDBKey as iter_key is already split. 118 seek_to.key = ToDBSlice(iter_key); 119 state = DBIterSeek(time_bound_iter.get(), seek_to); 120 if (!state.valid) { 121 status = state.status; 122 valid = false; 123 return; 124 } 125 if (!extractKey(time_bound_iter->rep->key(), &tbi_key)) { 126 return; 127 } 128 cmp = iter_key.compare(tbi_key); 129 } 130 131 if (cmp < 0) { 132 // In the case that the next MVCC key that the TBI observes is not the 133 // same as the main iterator, we may be able to skip over a large group of 134 // keys. The main iterator is seeked to the TBI in hopes that many keys 135 // were skipped. Note that a Seek() is an order of magnitude more 136 // expensive than a Next(). 137 DBKey seek_to = {}; 138 // NB: We don't ToDBKey as iter_key is already split. 139 seek_to.key = ToDBSlice(tbi_key); 140 state = DBIterSeek(iter.get(), seek_to); 141 if (!state.valid) { 142 status = state.status; 143 valid = false; 144 return; 145 } 146 } 147 } 148 } 149 150 // advanceKey advances the main iterator until it is referencing a key within 151 // (start_time, end_time]. 152 // It populates i.err with an error if either of the following was encountered: 153 // a) an inline value 154 // b) an intent with a timestamp within the incremental iterator's bounds 155 void DBIncrementalIterator::advanceKey() { 156 for (;;) { 157 maybeSkipKeys(); 158 if (!valid) { 159 return; 160 } 161 162 rocksdb::Slice key; 163 int64_t wall_time = 0; 164 int32_t logical = 0; 165 if (!DecodeKey(iter.get()->rep->key(), &key, &wall_time, &logical)) { 166 status = ToDBString("unable to decode key"); 167 valid = false; 168 return; 169 } 170 171 cockroach::storage::enginepb::MVCCMetadata meta; 172 if (wall_time != 0 || logical != 0) { 173 meta.mutable_timestamp()->set_wall_time(wall_time); 174 meta.mutable_timestamp()->set_logical(logical); 175 } else { 176 const auto value = iter->rep->value(); 177 if (!meta.ParseFromArray(value.data(), value.size())) { 178 status = ToDBString("failed to parse meta"); 179 valid = false; 180 return; 181 } 182 } 183 184 // Check for an inline value, as these are only used in non-user data. 185 // They're not needed for backup, so they're not handled by this method. 186 // If one shows up, throw an error so it's obvious something is wrong. 187 if (meta.has_raw_bytes()) { 188 valid = false; 189 status = ToDBString("Inline values are unsupported by the IncrementalIterator"); 190 return; 191 } 192 193 if (meta.has_txn()) { 194 if (legacyTimestampIsLess(start_time, meta.timestamp()) && 195 !legacyTimestampIsLess(end_time, meta.timestamp())) { 196 cockroach::roachpb::WriteIntentError err; 197 cockroach::roachpb::Intent* intent = err.add_intents(); 198 intent->mutable_single_key_span()->set_key(key.data(), key.size()); 199 intent->mutable_txn()->CopyFrom(meta.txn()); 200 201 status = ToDBString("WriteIntentError"); 202 valid = false; 203 *write_intent = ToDBString(err.SerializeAsString()); 204 205 return; 206 } 207 } 208 209 DBIterState state; 210 if (legacyTimestampIsLess(end_time, meta.timestamp())) { 211 state = DBIterNext(iter.get(), false); 212 } else if (!legacyTimestampIsLess(start_time, meta.timestamp())) { 213 state = DBIterNext(iter.get(), true); 214 } else { 215 // We have found a key within the time bounds, break. 216 break; 217 } 218 219 if (!state.valid) { 220 status = state.status; 221 valid = false; 222 return; 223 } 224 } 225 } 226 227 DBIterState DBIncrementalIterator::getState() { 228 DBIterState state = {}; 229 state.valid = valid; 230 state.status = status; 231 232 if (state.valid) { 233 rocksdb::Slice key; 234 state.valid = DecodeKey(iter.get()->rep->key(), &key, &state.key.wall_time, &state.key.logical); 235 if (state.valid) { 236 state.key.key = ToDBSlice(key); 237 state.value = ToDBSlice(iter.get()->rep->value()); 238 } 239 } 240 241 return state; 242 } 243 244 // seek advances the iterator to the first key in the engine which is >= the 245 // provided key. key should be a metadata key to ensure that the iterator has a 246 // chance to observe any intents on the key if they are there. 247 DBIterState DBIncrementalIterator::seek(DBKey key) { 248 if (time_bound_iter != nullptr) { 249 // Check which is the first key seen by the TBI. 250 auto state = DBIterSeek(time_bound_iter.get(), key); 251 if (!state.valid) { 252 status = state.status; 253 valid = false; 254 return getState(); 255 } 256 const rocksdb::Slice tbi_key(time_bound_iter->rep->key()); 257 // NB: iter_key needs to be constructed with ToSlice to ensure that an empty 258 // rocksdb::Slice is properly created in the common case that key is an 259 // empty key (the first key). 260 const rocksdb::Slice iter_key(ToSlice(key.key)); 261 if (tbi_key.compare(iter_key) > 0) { 262 // If the first key that the TBI sees is ahead of the given startKey, we 263 // can seek directly to the first version of the key. 264 key = ToDBKey(tbi_key); 265 } 266 } 267 auto state = DBIterSeek(iter.get(), key); 268 if (!state.valid) { 269 status = state.status; 270 valid = false; 271 return getState(); 272 } 273 advanceKey(); 274 return getState(); 275 } 276 277 DBIterState DBIncrementalIterator::next(bool skip_current_key_versions) { 278 auto state = DBIterNext(iter.get(), skip_current_key_versions); 279 if (!state.valid) { 280 status = state.status; 281 valid = false; 282 return getState(); 283 } 284 advanceKey(); 285 return getState(); 286 } 287 288 const rocksdb::Slice DBIncrementalIterator::key() { return iter.get()->rep->key(); } 289 290 const rocksdb::Slice DBIncrementalIterator::value() { return iter.get()->rep->value(); }