// Source: github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/storage/mvcc_incremental_iterator.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package storage

import (
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/errors"
)

// MVCCIncrementalIterator iterates over the diff of the key range
// [startKey,endKey) and time range (startTime,endTime]. If a key was added or
// modified between startTime and endTime, the iterator will position at the
// most recent version (before or at endTime) of that key. If the key was most
// recently deleted, this is signaled with an empty value.
//
// MVCCIncrementalIterator will return an error if either of the following are
// encountered:
//   1. An inline value (non-user data)
//   2. An intent whose timestamp lies within the time bounds
//
// Note: The endTime is inclusive to be consistent with the non-incremental
// iterator, where reads at a given timestamp return writes at that
// timestamp. The startTime is then made exclusive so that iterating time 1 to
// 2 and then 2 to 3 will only return values with time 2 once. An exclusive
// start time would normally make it difficult to scan timestamp 0, but
// CockroachDB uses that as a sentinel for key metadata anyway.
38 // 39 // Expected usage: 40 // iter := NewMVCCIncrementalIterator(e, IterOptions{ 41 // StartTime: startTime, 42 // EndTime: endTime, 43 // UpperBound: endKey, 44 // }) 45 // defer iter.Close() 46 // for iter.SeekGE(startKey); ; iter.Next() { 47 // ok, err := iter.Valid() 48 // if !ok { ... } 49 // [code using iter.Key() and iter.Value()] 50 // } 51 // if err := iter.Error(); err != nil { 52 // ... 53 // } 54 // 55 // Note regarding the correctness of the time-bound iterator optimization: 56 // 57 // When using (t_s, t_e], say there is a version (committed or provisional) 58 // k@t where t is in that interval, that is visible to iter. All sstables 59 // containing k@t will be included in timeBoundIter. Note that there may be 60 // multiple sequence numbers for the key k@t at the storage layer, say k@t#n1, 61 // k@t#n2, where n1 > n2, some of which may be deleted, but the latest 62 // sequence number will be visible using iter (since not being visible would be 63 // a contradiction of the initial assumption that k@t is visible to iter). 64 // Since there is no delete across all sstables that deletes k@t#n1, there is 65 // no delete in the subset of sstables used by timeBoundIter that deletes 66 // k@t#n1, so the timeBoundIter will see k@t. 67 // 68 // NOTE: This is not used by CockroachDB and has been preserved to serve as an 69 // oracle to prove the correctness of the new export logic. 70 type MVCCIncrementalIterator struct { 71 iter Iterator 72 73 // A time-bound iterator cannot be used by itself due to a bug in the time- 74 // bound iterator (#28358). This was historically augmented with an iterator 75 // without the time-bound optimization to act as a sanity iterator, but 76 // issues remained (#43799), so now the iterator above is the main iterator 77 // the timeBoundIter is used to check if any keys can be skipped by the main 78 // iterator. 
79 timeBoundIter Iterator 80 81 startTime hlc.Timestamp 82 endTime hlc.Timestamp 83 err error 84 valid bool 85 86 // For allocation avoidance, meta is used to store the timestamp of keys 87 // regardless if they are metakeys. 88 meta enginepb.MVCCMetadata 89 } 90 91 var _ SimpleIterator = &MVCCIncrementalIterator{} 92 93 // MVCCIncrementalIterOptions bundles options for NewMVCCIncrementalIterator. 94 type MVCCIncrementalIterOptions struct { 95 IterOptions IterOptions 96 // Keys visible by the MVCCIncrementalIterator must be within (StartTime, 97 // EndTime]. Note that if {Min,Max}TimestampHints are specified in 98 // IterOptions, the timestamp hints interval should include the start and end 99 // time. 100 StartTime hlc.Timestamp 101 EndTime hlc.Timestamp 102 } 103 104 // NewMVCCIncrementalIterator creates an MVCCIncrementalIterator with the 105 // specified reader and options. The timestamp hint range should not be more 106 // restrictive than the start and end time range. 107 // TODO(pbardea): Add validation here and in C++ implementation that the 108 // timestamp hints are not more restrictive than incremental iterator's 109 // (startTime, endTime] interval. 110 func NewMVCCIncrementalIterator( 111 reader Reader, opts MVCCIncrementalIterOptions, 112 ) *MVCCIncrementalIterator { 113 var iter Iterator 114 var timeBoundIter Iterator 115 if !opts.IterOptions.MinTimestampHint.IsEmpty() && !opts.IterOptions.MaxTimestampHint.IsEmpty() { 116 // An iterator without the timestamp hints is created to ensure that the 117 // iterator visits every required version of every key that has changed. 
118 iter = reader.NewIterator(IterOptions{ 119 UpperBound: opts.IterOptions.UpperBound, 120 }) 121 timeBoundIter = reader.NewIterator(opts.IterOptions) 122 } else { 123 iter = reader.NewIterator(opts.IterOptions) 124 } 125 126 return &MVCCIncrementalIterator{ 127 iter: iter, 128 startTime: opts.StartTime, 129 endTime: opts.EndTime, 130 timeBoundIter: timeBoundIter, 131 } 132 } 133 134 // SeekGE advances the iterator to the first key in the engine which is >= the 135 // provided key. startKey should be a metadata key to ensure that the iterator 136 // has a chance to observe any intents on the key if they are there. 137 func (i *MVCCIncrementalIterator) SeekGE(startKey MVCCKey) { 138 if i.timeBoundIter != nil { 139 // Check which is the first key seen by the TBI. 140 i.timeBoundIter.SeekGE(startKey) 141 if ok, err := i.timeBoundIter.Valid(); !ok { 142 i.err = err 143 i.valid = false 144 return 145 } 146 tbiKey := i.timeBoundIter.Key().Key 147 if tbiKey.Compare(startKey.Key) > 0 { 148 // If the first key that the TBI sees is ahead of the given startKey, we 149 // can seek directly to the first version of the key. 150 startKey = MakeMVCCMetadataKey(tbiKey) 151 } 152 } 153 i.iter.SeekGE(startKey) 154 if ok, err := i.iter.Valid(); !ok { 155 i.err = err 156 i.valid = false 157 return 158 } 159 i.err = nil 160 i.valid = true 161 i.advance() 162 } 163 164 // Close frees up resources held by the iterator. 165 func (i *MVCCIncrementalIterator) Close() { 166 i.iter.Close() 167 if i.timeBoundIter != nil { 168 i.timeBoundIter.Close() 169 } 170 } 171 172 // Next advances the iterator to the next key/value in the iteration. After this 173 // call, Valid() will be true if the iterator was not positioned at the last 174 // key. 175 func (i *MVCCIncrementalIterator) Next() { 176 i.iter.Next() 177 if ok, err := i.iter.Valid(); !ok { 178 i.err = err 179 i.valid = false 180 return 181 } 182 i.advance() 183 } 184 185 // NextKey advances the iterator to the next key. 
// This operation is distinct from Next, which advances to the next version of
// the current key, or to the next key if the iterator is currently located at
// the last version for a key.
func (i *MVCCIncrementalIterator) NextKey() {
	i.iter.NextKey()
	if ok, err := i.iter.Valid(); !ok {
		i.err = err
		i.valid = false
		return
	}
	i.advance()
}

// maybeSkipKeys checks if any keys can be skipped by using a time-bound
// iterator. If keys can be skipped, it will update the main iterator to point
// to the earliest version of the next candidate key.
// It is expected that TBI is at a key <= main iterator key when calling
// maybeSkipKeys().
//
// If either iterator becomes invalid while being repositioned, i.valid is set
// to false and i.err to the error (possibly nil) reported by that iterator.
func (i *MVCCIncrementalIterator) maybeSkipKeys() {
	if i.timeBoundIter == nil {
		// If there is no time bound iterator, we cannot skip any keys.
		return
	}
	tbiKey := i.timeBoundIter.Key().Key
	iterKey := i.iter.Key().Key
	if iterKey.Compare(tbiKey) > 0 {
		// If the iterKey got ahead of the TBI key, advance the TBI key.
		//
		// The fast path is the case where iterKey == tbiKey after this call,
		// i.e. the TBI and the main iterator are in lockstep. In this case, the
		// main iterator was referencing the next key that would be visited by
		// the TBI. This means that for the incremental iterator to perform a
		// Next or NextKey will require only 1 extra NextKey invocation while
		// they remain in lockstep. This could be common if most keys are
		// modified or the modifications are clustered in keyspace.
		//
		// NB: The Seek() below is expensive, so we aim to avoid it if both
		// iterators remain in lockstep as described above.
		i.timeBoundIter.NextKey()
		if ok, err := i.timeBoundIter.Valid(); !ok {
			i.err = err
			i.valid = false
			return
		}
		tbiKey = i.timeBoundIter.Key().Key

		cmp := iterKey.Compare(tbiKey)

		if cmp > 0 {
			// If the tbiKey is still behind the iterKey, the TBI key may be
			// seeing phantom MVCCKey.Keys. These keys may not be seen by the
			// main iterator due to aborted transactions and keys which have
			// been subsumed due to range tombstones. In this case we can
			// SeekGE() the TBI to the main iterator.
			seekKey := MakeMVCCMetadataKey(iterKey)
			i.timeBoundIter.SeekGE(seekKey)
			if ok, err := i.timeBoundIter.Valid(); !ok {
				i.err = err
				i.valid = false
				return
			}
			tbiKey = i.timeBoundIter.Key().Key
			cmp = iterKey.Compare(tbiKey)
		}

		if cmp < 0 {
			// In the case that the next MVCC key that the TBI observes is not
			// the same as the main iterator, we may be able to skip over a
			// large group of keys. The main iterator is seeked to the TBI in
			// hopes that many keys were skipped. Note that a Seek is an order
			// of magnitude more expensive than a Next call.
			seekKey := MakeMVCCMetadataKey(tbiKey)
			i.iter.SeekGE(seekKey)
			if ok, err := i.iter.Valid(); !ok {
				i.err = err
				i.valid = false
				return
			}
		}
	}
}

// advance advances the main iterator until it is referencing a key within
// (start_time, end_time].
// It populates i.err with an error if either of the following was encountered:
//   a) an inline value
//   b) an intent with a timestamp within the incremental iterator's bounds
//
// Whenever the loop stops without finding such a key (an error, or an
// iterator running out of keys), i.valid is set to false.
func (i *MVCCIncrementalIterator) advance() {
	for {
		// Give the time-bound iterator (if any) a chance to move the main
		// iterator forward before the current key is inspected.
		i.maybeSkipKeys()
		if !i.valid {
			return
		}

		unsafeMetaKey := i.iter.UnsafeKey()
		if unsafeMetaKey.IsValue() {
			// The key is an MVCC value and not an intent.
			// Intents are handled next.
			i.meta.Reset()
			i.meta.Timestamp = hlc.LegacyTimestamp(unsafeMetaKey.Timestamp)
		} else {
			// The key is a metakey (an intent), this is used later to see if the
			// timestamp of this intent is within the incremental iterator's time
			// bounds.
			if i.err = protoutil.Unmarshal(i.iter.UnsafeValue(), &i.meta); i.err != nil {
				i.valid = false
				return
			}
		}

		if i.meta.IsInline() {
			// Inline values are only used in non-user data. They're not needed
			// for backup, so they're not handled by this method. If one shows
			// up, throw an error so it's obvious something is wrong.
			i.valid = false
			i.err = errors.Errorf("inline values are unsupported by MVCCIncrementalIterator: %s",
				unsafeMetaKey.Key)
			return
		}

		metaTimestamp := hlc.Timestamp(i.meta.Timestamp)
		if i.meta.Txn != nil {
			// An intent whose timestamp is in (startTime, endTime] is reported
			// to the caller as a WriteIntentError.
			if i.startTime.Less(metaTimestamp) && metaTimestamp.LessEq(i.endTime) {
				i.err = &roachpb.WriteIntentError{
					Intents: []roachpb.Intent{
						roachpb.MakeIntent(i.meta.Txn, i.iter.Key().Key),
					},
				}
				i.valid = false
				return
			}
			// The intent is outside the time bounds: step the main iterator
			// forward and re-evaluate from the top of the loop.
			i.iter.Next()
			if ok, err := i.iter.Valid(); !ok {
				i.err = err
				i.valid = false
				return
			}
			continue
		}

		// Note that MVCC keys are sorted by key, then by _descending_ timestamp
		// order with the exception of the metakey (timestamp 0) being sorted
		// first. See mvcc.h for more information.
		if i.endTime.Less(metaTimestamp) {
			// This version is newer than endTime; an older version of the same
			// key may still be in range, so move forward by a single entry.
			i.iter.Next()
		} else if metaTimestamp.LessEq(i.startTime) {
			// This version is at or before startTime, and any remaining
			// versions of the key are older still, so move to the next key.
			i.iter.NextKey()
		} else {
			// The current key is a valid user key and within the time bounds. We are
			// done.
			break
		}

		if ok, err := i.iter.Valid(); !ok {
			i.err = err
			i.valid = false
			return
		}
	}
}

// Valid must be called after any call to SeekGE(), Next(), or similar methods.
// It returns (true, nil) if the iterator points to a valid key (it is undefined
// to call Key(), Value(), or similar methods unless Valid() has returned (true,
// nil)). It returns (false, nil) if the iterator has moved past the end of the
// valid range, or (false, err) if an error has occurred. Valid() will never
// return true with a non-nil error.
func (i *MVCCIncrementalIterator) Valid() (bool, error) {
	return i.valid, i.err
}

// Key returns the current key, as reported by the main iterator.
func (i *MVCCIncrementalIterator) Key() MVCCKey {
	return i.iter.Key()
}

// Value returns the current value as a byte slice, as reported by the main
// iterator.
func (i *MVCCIncrementalIterator) Value() []byte {
	return i.iter.Value()
}

// UnsafeKey returns the same key as Key, but the memory is invalidated on the
// next call to {Next,Reset,Close}.
func (i *MVCCIncrementalIterator) UnsafeKey() MVCCKey {
	return i.iter.UnsafeKey()
}

// UnsafeValue returns the same value as Value, but the memory is invalidated on
// the next call to {Next,Reset,Close}.
func (i *MVCCIncrementalIterator) UnsafeValue() []byte {
	return i.iter.UnsafeValue()
}