github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/gc/gc.go

// Copyright 2020 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

// Package gc contains the logic to scan a range for garbage and issue
// GC requests to remove that garbage.
//
// The Run function is the primary entry point and is called underneath the
// gcQueue in the storage package. It can also be run for debugging.
package gc

import (
    "context"
    "fmt"
    "time"

    "github.com/cockroachdb/cockroach/pkg/base"
    "github.com/cockroachdb/cockroach/pkg/config/zonepb"
    "github.com/cockroachdb/cockroach/pkg/keys"
    "github.com/cockroachdb/cockroach/pkg/kv/kvserver/abortspan"
    "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
    "github.com/cockroachdb/cockroach/pkg/roachpb"
    "github.com/cockroachdb/cockroach/pkg/storage"
    "github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    "github.com/cockroachdb/cockroach/pkg/util/bufalloc"
    "github.com/cockroachdb/cockroach/pkg/util/hlc"
    "github.com/cockroachdb/cockroach/pkg/util/log"
    "github.com/cockroachdb/cockroach/pkg/util/protoutil"
    "github.com/cockroachdb/cockroach/pkg/util/uuid"
    "github.com/cockroachdb/errors"
)

const (
    // IntentAgeThreshold is the threshold after which an extant intent
    // will be resolved.
    IntentAgeThreshold = 2 * time.Hour // 2 hours

    // KeyVersionChunkBytes is the threshold size for splitting
    // GCRequests into multiple batches. The goal is that the evaluated
    // Raft command for each GCRequest does not significantly exceed
    // this threshold.
    KeyVersionChunkBytes = base.ChunkRaftCommandThresholdBytes
)

// CalculateThreshold calculates the GC threshold given the policy and the
// current view of time.
func CalculateThreshold(now hlc.Timestamp, policy zonepb.GCPolicy) (threshold hlc.Timestamp) {
    ttlNanos := int64(policy.TTLSeconds) * time.Second.Nanoseconds()
    return now.Add(-ttlNanos, 0)
}

// TimestampForThreshold inverts CalculateThreshold. It returns the timestamp
// which should be used for now to arrive at the passed threshold.
func TimestampForThreshold(threshold hlc.Timestamp, policy zonepb.GCPolicy) (ts hlc.Timestamp) {
    ttlNanos := int64(policy.TTLSeconds) * time.Second.Nanoseconds()
    return threshold.Add(ttlNanos, 0)
}
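
// Illustrative sketch (not part of the original file): the two helpers above
// are inverses of each other. Assuming a zone config with a 25-hour TTL, the
// threshold sits 25 hours before now, and converting it back yields now again.
func exampleThresholdRoundTrip(now hlc.Timestamp) hlc.Timestamp {
    policy := zonepb.GCPolicy{TTLSeconds: 25 * 60 * 60}
    threshold := CalculateThreshold(now, policy)    // now - 25h
    return TimestampForThreshold(threshold, policy) // == now
}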

// Thresholder is part of the GCer interface.
type Thresholder interface {
    SetGCThreshold(context.Context, Threshold) error
}

// PureGCer is part of the GCer interface.
type PureGCer interface {
    GC(context.Context, []roachpb.GCRequest_GCKey) error
}

// A GCer is an abstraction used by the GC queue to carry out chunked deletions.
type GCer interface {
    Thresholder
    PureGCer
}

// NoopGCer implements GCer by doing nothing.
type NoopGCer struct{}

var _ GCer = NoopGCer{}

// SetGCThreshold implements storage.GCer.
func (NoopGCer) SetGCThreshold(context.Context, Threshold) error { return nil }

// GC implements storage.GCer.
func (NoopGCer) GC(context.Context, []roachpb.GCRequest_GCKey) error { return nil }
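
// Illustrative sketch (not part of the original file): a GCer implementation
// that simply records the threshold and keys it is handed, e.g. as a test
// double in place of NoopGCer. The name recordingGCer is hypothetical.
type recordingGCer struct {
    threshold Threshold
    keys      []roachpb.GCRequest_GCKey
}

var _ GCer = (*recordingGCer)(nil)

// SetGCThreshold implements Thresholder by remembering the last threshold.
func (r *recordingGCer) SetGCThreshold(_ context.Context, t Threshold) error {
    r.threshold = t
    return nil
}

// GC implements PureGCer by accumulating the requested keys.
func (r *recordingGCer) GC(_ context.Context, keys []roachpb.GCRequest_GCKey) error {
    r.keys = append(r.keys, keys...)
    return nil
}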

// Threshold holds the key and txn span GC thresholds, respectively.
type Threshold struct {
    Key hlc.Timestamp
    Txn hlc.Timestamp
}

// Info contains statistics and insights from a GC run.
type Info struct {
    // Now is the timestamp used for age computations.
    Now hlc.Timestamp
    // Policy is the policy used for this garbage collection cycle.
    Policy zonepb.GCPolicy
    // Stats about the userspace key-values considered, namely the number of
    // keys with GC'able data, the number of "old" intents and the number of
    // associated distinct transactions.
    NumKeysAffected, IntentsConsidered, IntentTxns int
    // TransactionSpanTotal is the total number of entries in the transaction span.
    TransactionSpanTotal int
    // Summary of transactions which were found GCable (assuming that
    // potentially necessary intent resolutions did not fail).
    TransactionSpanGCAborted, TransactionSpanGCCommitted int
    TransactionSpanGCStaging, TransactionSpanGCPending   int
    // AbortSpanTotal is the total number of transactions present in the AbortSpan.
    AbortSpanTotal int
    // AbortSpanConsidered is the number of AbortSpan entries old enough to be
    // considered for removal. An "entry" corresponds to one transaction;
    // more than one key-value pair may be associated with it.
    AbortSpanConsidered int
    // AbortSpanGCNum is the number of AbortSpan entries fit for removal (due
    // to their transactions having terminated).
    AbortSpanGCNum int
    // PushTxn is the total number of pushes attempted in this cycle.
    PushTxn int
    // ResolveTotal is the total number of attempted intent resolutions in
    // this cycle.
    ResolveTotal int
    // Threshold is the computed expiration timestamp. Equal to `Now - Policy`.
    Threshold hlc.Timestamp
    // AffectedVersionsKeyBytes is the number of (fully encoded) bytes deleted from keys in the storage engine.
    // Note that this does not account for compression that the storage engine uses to store data on disk. Real
    // space savings tends to be smaller due to this compression, and space may be released only at a later point
    // in time.
    AffectedVersionsKeyBytes int64
    // AffectedVersionsValBytes is the number of (fully encoded) bytes deleted from values in the storage engine.
    // See AffectedVersionsKeyBytes for caveats.
    AffectedVersionsValBytes int64
}

// CleanupIntentsFunc synchronously resolves the supplied intents
// (which may be PENDING, in which case they are first pushed) while
// taking care of proper batching.
type CleanupIntentsFunc func(context.Context, []roachpb.Intent) error

// CleanupTxnIntentsAsyncFunc asynchronously cleans up intents from a
// transaction record, pushing the transaction first if it is
// PENDING. Once all intents are resolved successfully, removes the
// transaction record.
type CleanupTxnIntentsAsyncFunc func(context.Context, *roachpb.Transaction, []roachpb.LockUpdate) error

// Run runs garbage collection for the specified descriptor on the
// provided Engine (which is not mutated). It uses the provided GCer
// to run garbage collection once on all implicated spans,
// cleanupIntentsFn to resolve intents synchronously, and
// cleanupTxnIntentsAsyncFn to asynchronously clean up intents and the
// associated transaction record on success.
func Run(
    ctx context.Context,
    desc *roachpb.RangeDescriptor,
    snap storage.Reader,
    now, newThreshold hlc.Timestamp,
    policy zonepb.GCPolicy,
    gcer GCer,
    cleanupIntentsFn CleanupIntentsFunc,
    cleanupTxnIntentsAsyncFn CleanupTxnIntentsAsyncFunc,
) (Info, error) {

    txnExp := now.Add(-kvserverbase.TxnCleanupThreshold.Nanoseconds(), 0)
    if err := gcer.SetGCThreshold(ctx, Threshold{
        Key: newThreshold,
        Txn: txnExp,
    }); err != nil {
        return Info{}, errors.Wrap(err, "failed to set GC thresholds")
    }

    info := Info{
        Policy:    policy,
        Now:       now,
        Threshold: newThreshold,
    }

    // Maps from txn ID to txn and intent key slice.
    txnMap := map[uuid.UUID]*roachpb.Transaction{}
    intentKeyMap := map[uuid.UUID][]roachpb.Key{}
    err := processReplicatedKeyRange(ctx, desc, snap, now, newThreshold, gcer, txnMap, intentKeyMap, &info)
    if err != nil {
        return Info{}, err
    }

    // From now on, all keys processed are range-local and inline (zero timestamp).

    // Process local range key entries (txn records, queue last processed times).
    if err := processLocalKeyRange(ctx, snap, desc, txnExp, &info, cleanupTxnIntentsAsyncFn, gcer); err != nil {
        log.Warningf(ctx, "while gc'ing local key range: %s", err)
    }

    // Clean up the AbortSpan.
    log.Event(ctx, "processing AbortSpan")
    processAbortSpan(ctx, snap, desc.RangeID, txnExp, &info, gcer)

    log.Eventf(ctx, "GC'ed keys; stats %+v", info)

    // Push transactions (if pending) and resolve intents.
    var intents []roachpb.Intent
    for txnID, txn := range txnMap {
        intents = append(intents, roachpb.AsIntents(&txn.TxnMeta, intentKeyMap[txnID])...)
    }
    info.ResolveTotal += len(intents)
    log.Eventf(ctx, "cleanup of %d intents", len(intents))
    if err := cleanupIntentsFn(ctx, intents); err != nil {
        return Info{}, err
    }

    return info, nil
}
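
// Illustrative sketch (not part of the original file): a minimal invocation
// of Run that only gathers Info statistics, e.g. for debugging. It uses
// NoopGCer and no-op cleanup callbacks, so nothing is deleted or resolved.
func exampleRunForStats(
    ctx context.Context,
    desc *roachpb.RangeDescriptor,
    snap storage.Reader,
    now hlc.Timestamp,
    policy zonepb.GCPolicy,
) (Info, error) {
    noopCleanupIntents := func(context.Context, []roachpb.Intent) error { return nil }
    noopCleanupTxnIntents := func(context.Context, *roachpb.Transaction, []roachpb.LockUpdate) error { return nil }
    return Run(ctx, desc, snap, now, CalculateThreshold(now, policy), policy,
        NoopGCer{}, noopCleanupIntents, noopCleanupTxnIntents)
}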

// processReplicatedKeyRange identifies garbage and sends GC requests to
// remove it.
//
// The logic iterates all versions of all keys in the range from oldest to
// newest. Expired intents are written into the txnMap and intentKeyMap.
func processReplicatedKeyRange(
    ctx context.Context,
    desc *roachpb.RangeDescriptor,
    snap storage.Reader,
    now hlc.Timestamp,
    threshold hlc.Timestamp,
    gcer GCer,
    txnMap map[uuid.UUID]*roachpb.Transaction,
    intentKeyMap map[uuid.UUID][]roachpb.Key,
    info *Info,
) error {
    var alloc bufalloc.ByteAllocator
    // Compute intent expiration (intent age at which we attempt to resolve).
    intentExp := now.Add(-IntentAgeThreshold.Nanoseconds(), 0)
    handleIntent := func(md *storage.MVCCKeyValue) {
        meta := &enginepb.MVCCMetadata{}
        if err := protoutil.Unmarshal(md.Value, meta); err != nil {
            log.Errorf(ctx, "unable to unmarshal MVCC metadata for key %q: %+v", md.Key, err)
            return
        }
        if meta.Txn != nil {
            // Keep track of intent to resolve if older than the intent
            // expiration threshold.
            if hlc.Timestamp(meta.Timestamp).Less(intentExp) {
                txnID := meta.Txn.ID
                if _, ok := txnMap[txnID]; !ok {
                    txnMap[txnID] = &roachpb.Transaction{
                        TxnMeta: *meta.Txn,
                    }
                    // IntentTxns and PushTxn will be equal here, since
                    // pushes to transactions whose record lies in this
                    // range (but which are not associated to a remaining
                    // intent on it) happen asynchronously and are accounted
                    // for separately. Thus higher up in the stack, we
                    // expect PushTxn > IntentTxns.
                    info.IntentTxns++
                    // All transactions in txnMap may be PENDING and
                    // cleanupIntentsFn will push them to finalize them.
                    info.PushTxn++
                }
                info.IntentsConsidered++
                alloc, md.Key.Key = alloc.Copy(md.Key.Key, 0)
                intentKeyMap[txnID] = append(intentKeyMap[txnID], md.Key.Key)
            }
        }
    }

    // Iterate all versions of all keys from oldest to newest. If a version is an
    // intent it will have the highest timestamp of any versions and will be
    // followed by a metadata entry. The loop will determine whether a given key
    // has garbage and, if so, will determine the timestamp of the latest version
    // which is garbage to be added to the current batch. If the current version
    // pushes the size of keys to be removed above the limit, the current key will
    // be added with that version and the batch will be sent. When the newest
    // version for a key has been reached, if haveGarbageForThisKey, we'll add the
    // current key to the batch with the gcTimestampForThisKey.
    var (
        batchGCKeys           []roachpb.GCRequest_GCKey
        batchGCKeysBytes      int64
        haveGarbageForThisKey bool
        gcTimestampForThisKey hlc.Timestamp
        sentBatchForThisKey   bool
    )
    it := makeGCIterator(desc, snap)
    defer it.close()
    for ; ; it.step() {
        s, ok := it.state()
        if !ok {
            if it.err != nil {
                return it.err
            }
            break
        }
        if s.curIsNotValue() { // Step over metadata or other system keys
            continue
        }
        if s.curIsIntent() {
            handleIntent(s.next)
            continue
        }
        isNewest := s.curIsNewest()
        if isGarbage(threshold, s.cur, s.next, isNewest) {
            keyBytes := int64(s.cur.Key.EncodedSize())
            batchGCKeysBytes += keyBytes
            haveGarbageForThisKey = true
            gcTimestampForThisKey = s.cur.Key.Timestamp
            info.AffectedVersionsKeyBytes += keyBytes
            info.AffectedVersionsValBytes += int64(len(s.cur.Value))
        }
        if affected := isNewest && (sentBatchForThisKey || haveGarbageForThisKey); affected {
            info.NumKeysAffected++
        }
        shouldSendBatch := batchGCKeysBytes >= KeyVersionChunkBytes
        if shouldSendBatch || isNewest && haveGarbageForThisKey {
            alloc, s.cur.Key.Key = alloc.Copy(s.cur.Key.Key, 0)
            batchGCKeys = append(batchGCKeys, roachpb.GCRequest_GCKey{
                Key:       s.cur.Key.Key,
                Timestamp: gcTimestampForThisKey,
            })
            haveGarbageForThisKey = false
            gcTimestampForThisKey = hlc.Timestamp{}

            // Mark that we sent a batch for this key so we know that we had garbage
            // even if it turns out that there's no more garbage for this key.
            // We want to count a key as affected once even if we paginate the
            // deletion of its versions.
            sentBatchForThisKey = shouldSendBatch && !isNewest
        }
        if shouldSendBatch {
            if err := gcer.GC(ctx, batchGCKeys); err != nil {
                // Even though we are batching the GC process, it's
                // safe to continue because we bumped the GC
                // thresholds. We may leave some inconsistent history
                // behind, but nobody can read it.
                log.Warningf(ctx, "failed to GC a batch of keys: %v", err)
            }
            batchGCKeys = nil
            batchGCKeysBytes = 0
            alloc = nil
        }
    }
    if len(batchGCKeys) > 0 {
        if err := gcer.GC(ctx, batchGCKeys); err != nil {
            return err
        }
    }
    return nil
}

// isGarbage makes a determination whether a key ('cur') is garbage. If 'next'
// is non-nil, it should be the chronologically newer version of the same key
// (or the metadata KV if cur is an intent). If isNewest is false, next must be
// non-nil. isNewest implies that this is the highest timestamp committed
// version for this key. If isNewest is true and next is non-nil, it is an
// intent. Conservatively we have to assume that the intent will get aborted,
// so we will be able to GC just the values that we could remove if there
// weren't an intent. Hence this definition of isNewest.
//
// We keep all values (including deletes) above the expiration time, plus
// the first value before or at the expiration time. This allows reads at or
// above the expiration time to be served correctly. However if this were the
// only rule, then if the most recent write was a delete, it would never be
// removed. Thus, when a deleted value is the most recent before expiration,
// it can be deleted.
func isGarbage(threshold hlc.Timestamp, cur, next *storage.MVCCKeyValue, isNewest bool) bool {
    // If the value is not at or below the threshold then it's not garbage.
    if belowThreshold := cur.Key.Timestamp.LessEq(threshold); !belowThreshold {
        return false
    }
    isDelete := len(cur.Value) == 0
    if isNewest && !isDelete {
        return false
    }
    // If this value is not a delete, then we need to make sure that the next
    // value is also at or below the threshold.
    // NB: This doesn't need to check whether next is nil because we know
    // isNewest is false when evaluating rhs of the or below.
    if !isDelete && next == nil {
        panic("huh")
    }
    return isDelete || next.Key.Timestamp.LessEq(threshold)
}
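
// Illustrative sketch (not part of the original file): a worked example of
// the rule above. With a GC threshold at wall time 10 and three versions of
// key "a" at wall times 12 (live), 8, and 5, only the version at 5 is
// garbage: 12 is above the threshold, and 8 is the version a read at the
// threshold still needs to see.
func exampleIsGarbage() (olderIsGarbage, newerIsGarbage bool) {
    threshold := hlc.Timestamp{WallTime: 10}
    v5 := storage.MVCCKeyValue{
        Key:   storage.MVCCKey{Key: roachpb.Key("a"), Timestamp: hlc.Timestamp{WallTime: 5}},
        Value: []byte("v5"),
    }
    v8 := storage.MVCCKeyValue{
        Key:   storage.MVCCKey{Key: roachpb.Key("a"), Timestamp: hlc.Timestamp{WallTime: 8}},
        Value: []byte("v8"),
    }
    v12 := storage.MVCCKeyValue{
        Key:   storage.MVCCKey{Key: roachpb.Key("a"), Timestamp: hlc.Timestamp{WallTime: 12}},
        Value: []byte("v12"),
    }
    olderIsGarbage = isGarbage(threshold, &v5, &v8, false /* isNewest */)  // true: next version is also below the threshold
    newerIsGarbage = isGarbage(threshold, &v8, &v12, false /* isNewest */) // false: next version is above the threshold
    return olderIsGarbage, newerIsGarbage
}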

// processLocalKeyRange scans the local range key entries, consisting of
// transaction records, queue last processed timestamps, and range descriptors.
//
// - Transaction entries:
//   - For expired transactions, schedule the intents for
//     asynchronous resolution. The actual transaction spans are not
//     returned for GC in this pass, but are separately GC'ed after
//     successful resolution of all intents. The exception is if there
//     are no intents on the txn record, in which case it's returned for
//     immediate GC.
//
// - Queue last processed times: clean up any entries which don't match
//   this range's start key. This can happen on range merges.
func processLocalKeyRange(
    ctx context.Context,
    snap storage.Reader,
    desc *roachpb.RangeDescriptor,
    cutoff hlc.Timestamp,
    info *Info,
    cleanupTxnIntentsAsyncFn CleanupTxnIntentsAsyncFunc,
    gcer PureGCer,
) error {
    b := makeBatchingInlineGCer(gcer, func(err error) {
        log.Warningf(ctx, "failed to GC from local key range: %s", err)
    })
    defer b.Flush(ctx)

    handleTxnIntents := func(key roachpb.Key, txn *roachpb.Transaction) error {
        // If the transaction needs to be pushed or there are intents to
        // resolve, invoke the cleanup function.
        if !txn.Status.IsFinalized() || len(txn.LockSpans) > 0 {
            return cleanupTxnIntentsAsyncFn(ctx, txn, roachpb.AsLockUpdates(txn, txn.LockSpans))
        }
        b.FlushingAdd(ctx, key)
        return nil
    }

    handleOneTransaction := func(kv roachpb.KeyValue) error {
        var txn roachpb.Transaction
        if err := kv.Value.GetProto(&txn); err != nil {
            return err
        }
        info.TransactionSpanTotal++
        if cutoff.LessEq(txn.LastActive()) {
            return nil
        }

        // The transaction record should be considered for removal.
        switch txn.Status {
        case roachpb.PENDING:
            info.TransactionSpanGCPending++
        case roachpb.STAGING:
            info.TransactionSpanGCStaging++
        case roachpb.ABORTED:
            info.TransactionSpanGCAborted++
        case roachpb.COMMITTED:
            info.TransactionSpanGCCommitted++
        default:
            panic(fmt.Sprintf("invalid transaction state: %s", txn))
        }
        return handleTxnIntents(kv.Key, &txn)
    }

    handleOneQueueLastProcessed := func(kv roachpb.KeyValue, rangeKey roachpb.RKey) error {
        if !rangeKey.Equal(desc.StartKey) {
            // Garbage collect the last processed timestamp if it doesn't match start key.
            b.FlushingAdd(ctx, kv.Key)
        }
        return nil
    }

    handleOne := func(kv roachpb.KeyValue) error {
        rangeKey, suffix, _, err := keys.DecodeRangeKey(kv.Key)
        if err != nil {
            return err
        }
        if suffix.Equal(keys.LocalTransactionSuffix.AsRawKey()) {
            if err := handleOneTransaction(kv); err != nil {
                return err
            }
        } else if suffix.Equal(keys.LocalQueueLastProcessedSuffix.AsRawKey()) {
            if err := handleOneQueueLastProcessed(kv, roachpb.RKey(rangeKey)); err != nil {
                return err
            }
        }
        return nil
    }

    startKey := keys.MakeRangeKeyPrefix(desc.StartKey)
    endKey := keys.MakeRangeKeyPrefix(desc.EndKey)

    _, err := storage.MVCCIterate(ctx, snap, startKey, endKey, hlc.Timestamp{}, storage.MVCCScanOptions{},
        func(kv roachpb.KeyValue) (bool, error) {
            return false, handleOne(kv)
        })
    return err
}

// processAbortSpan iterates through the local AbortSpan entries
// and collects entries which indicate that a client which was running
// this transaction must have realized that it has been aborted (due to
// heartbeating having failed). The given threshold is typically derived
// by subtracting a duration that is a multiple of the heartbeat timeout
// used by the coordinator from the current time.
func processAbortSpan(
    ctx context.Context,
    snap storage.Reader,
    rangeID roachpb.RangeID,
    threshold hlc.Timestamp,
    info *Info,
    gcer PureGCer,
) {
    b := makeBatchingInlineGCer(gcer, func(err error) {
        log.Warningf(ctx, "unable to GC from abort span: %s", err)
    })
    defer b.Flush(ctx)
    abortSpan := abortspan.New(rangeID)
    err := abortSpan.Iterate(ctx, snap, func(key roachpb.Key, v roachpb.AbortSpanEntry) error {
        info.AbortSpanTotal++
        if v.Timestamp.Less(threshold) {
            info.AbortSpanGCNum++
            b.FlushingAdd(ctx, key)
        }
        return nil
    })
    if err != nil {
        log.Warningf(ctx, "%v", err)
    }
}
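
// Illustrative sketch (not part of the original file): running the two local
// GC passes above outside of Run, using the same transaction expiration that
// Run derives from kvserverbase.TxnCleanupThreshold. The no-op cleanup
// callback and NoopGCer mean nothing is actually mutated.
func exampleLocalGCPasses(
    ctx context.Context,
    snap storage.Reader,
    desc *roachpb.RangeDescriptor,
    now hlc.Timestamp,
    info *Info,
) error {
    txnExp := now.Add(-kvserverbase.TxnCleanupThreshold.Nanoseconds(), 0)
    noopCleanupTxnIntents := func(context.Context, *roachpb.Transaction, []roachpb.LockUpdate) error { return nil }
    if err := processLocalKeyRange(ctx, snap, desc, txnExp, info, noopCleanupTxnIntents, NoopGCer{}); err != nil {
        return err
    }
    processAbortSpan(ctx, snap, desc.RangeID, txnExp, info, NoopGCer{})
    return nil
}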

// batchingInlineGCer is a helper to paginate the GC of inline (i.e. zero
// timestamp) keys. After creation, keys are added via FlushingAdd(). A
// final call to Flush() empties out the buffer after all keys have been added.
type batchingInlineGCer struct {
    gcer  PureGCer
    onErr func(error)

    size   int
    max    int
    gcKeys []roachpb.GCRequest_GCKey
}

func makeBatchingInlineGCer(gcer PureGCer, onErr func(error)) batchingInlineGCer {
    return batchingInlineGCer{gcer: gcer, onErr: onErr, max: KeyVersionChunkBytes}
}

func (b *batchingInlineGCer) FlushingAdd(ctx context.Context, key roachpb.Key) {
    b.gcKeys = append(b.gcKeys, roachpb.GCRequest_GCKey{Key: key})
    b.size += len(key)
    if b.size < b.max {
        return
    }
    b.Flush(ctx)
}

func (b *batchingInlineGCer) Flush(ctx context.Context) {
    err := b.gcer.GC(ctx, b.gcKeys)
    b.gcKeys = nil
    b.size = 0
    if err != nil {
        b.onErr(err)
    }
}
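
// Illustrative sketch (not part of the original file): typical use of
// batchingInlineGCer, mirroring processLocalKeyRange and processAbortSpan
// above. FlushingAdd sends a partial batch whenever the accumulated key bytes
// reach KeyVersionChunkBytes; the deferred Flush sends whatever remains.
func exampleBatchingInlineGC(ctx context.Context, gcer PureGCer, gcKeys []roachpb.Key) {
    b := makeBatchingInlineGCer(gcer, func(err error) {
        log.Warningf(ctx, "failed to GC a batch of inline keys: %s", err)
    })
    defer b.Flush(ctx)
    for _, key := range gcKeys {
        b.FlushingAdd(ctx, key)
    }
}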