github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/gc_queue.go

// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"fmt"
	"math"
	"math/rand"
	"sync/atomic"
	"time"

	"github.com/cockroachdb/cockroach/pkg/config"
	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/gc"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/humanizeutil"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/errors"
)

const (
	// gcQueueTimerDuration is the duration between GCs of queued replicas.
	gcQueueTimerDuration = 1 * time.Second
	// intentAgeNormalization is the average age of outstanding intents
	// which amount to a score of "1" added to total replica priority.
	intentAgeNormalization = 24 * time.Hour // 1 day

	// Thresholds used to decide whether to queue for GC based
	// on keys and intents.
	gcKeyScoreThreshold    = 2
	gcIntentScoreThreshold = 10

	probablyLargeAbortSpanSysCountThreshold = 10000
	probablyLargeAbortSpanSysBytesThreshold = 16 * (1 << 20) // 16mb
)

func probablyLargeAbortSpan(ms enginepb.MVCCStats) bool {
	// If there is "a lot" of data in Sys{Bytes,Count}, then we are likely
	// experiencing a large abort span. The abort span is not supposed to
	// become that large, but it does happen and causes stability fallout,
	// usually due to a combination of shortcomings:
	//
	// 1. there's no trigger for GC based on abort span size alone (before
	//    this code block here was written)
	// 2. transaction aborts tended to create unnecessary abort span entries,
	//    fixed (and 19.2-backported) in:
	//    https://github.com/cockroachdb/cockroach/pull/42765
	// 3. aborting transactions in a busy loop:
	//    https://github.com/cockroachdb/cockroach/issues/38088
	//    (and we suspect this also happens in user apps occasionally)
	// 4. large snapshots would never complete due to the queue time limits
	//    (addressed in https://github.com/cockroachdb/cockroach/pull/44952).
	//
	// In an ideal world, we would factor in the abort span into this method
	// directly, but until then the condition guarding this block will do.
	return ms.SysCount >= probablyLargeAbortSpanSysCountThreshold &&
		ms.SysBytes >= probablyLargeAbortSpanSysBytesThreshold
}
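
// The following is an illustrative sketch added for exposition and is not part
// of the original file: it simply demonstrates that the heuristic above needs
// both the count and the byte threshold to be exceeded before a large abort
// span is assumed. The concrete numbers are hypothetical.
func exampleProbablyLargeAbortSpan() (countOnlyHit, bothHit bool) {
	countOnly := enginepb.MVCCStats{
		SysCount: 50000,   // above the 10k count threshold
		SysBytes: 1 << 20, // well below the 16mb byte threshold
	}
	both := enginepb.MVCCStats{
		SysCount: 50000,          // above the 10k count threshold
		SysBytes: 32 * (1 << 20), // above the 16mb byte threshold
	}
	// probablyLargeAbortSpan(countOnly) == false, probablyLargeAbortSpan(both) == true.
	return probablyLargeAbortSpan(countOnly), probablyLargeAbortSpan(both)
}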

// gcQueue manages a queue of replicas slated to be scanned in their
// entirety using the MVCC versions iterator. The gc queue manages the
// following tasks:
//
//  - GC of version data via TTL expiration (and more complex schemes
//    as implemented going forward).
//  - Resolve extant write intents (pushing their transactions).
//  - GC of old transaction and AbortSpan entries. This should include
//    most committed and aborted entries almost immediately and, after a
//    threshold on inactivity, all others.
//
// The shouldQueue function combines the need for the above tasks into a
// single priority. If any task is overdue, shouldQueue returns true.
type gcQueue struct {
	*baseQueue
}

// newGCQueue returns a new instance of gcQueue.
func newGCQueue(store *Store, gossip *gossip.Gossip) *gcQueue {
	gcq := &gcQueue{}
	gcq.baseQueue = newBaseQueue(
		"gc", gcq, store, gossip,
		queueConfig{
			maxSize:              defaultQueueMaxSize,
			needsLease:           true,
			needsSystemConfig:    true,
			acceptsUnsplitRanges: false,
			successes:            store.metrics.GCQueueSuccesses,
			failures:             store.metrics.GCQueueFailures,
			pending:              store.metrics.GCQueuePending,
			processingNanos:      store.metrics.GCQueueProcessingNanos,
		},
	)
	return gcq
}

// gcQueueScore holds details about the score returned by makeGCQueueScoreImpl for
// testing and logging. The fields in this struct are documented in
// makeGCQueueScoreImpl.
type gcQueueScore struct {
	TTL                 time.Duration
	LikelyLastGC        time.Duration
	DeadFraction        float64
	ValuesScalableScore float64
	IntentScore         float64
	FuzzFactor          float64
	FinalScore          float64
	ShouldQueue         bool

	GCBytes                  int64
	GCByteAge                int64
	ExpMinGCByteAgeReduction int64
}

func (r gcQueueScore) String() string {
	if (r == gcQueueScore{}) {
		return "(empty)"
	}
	if r.ExpMinGCByteAgeReduction < 0 {
		r.ExpMinGCByteAgeReduction = 0
	}
	likelyLastGC := "never"
	if r.LikelyLastGC != 0 {
		likelyLastGC = fmt.Sprintf("%s ago", r.LikelyLastGC)
	}
	return fmt.Sprintf("queue=%t with %.2f/fuzz(%.2f)=%.2f=valScaleScore(%.2f)*deadFrac(%.2f)+intentScore(%.2f)\n"+
		"likely last GC: %s, %s non-live, curr. age %s*s, min exp. reduction: %s*s",
		r.ShouldQueue, r.FinalScore, r.FuzzFactor, r.FinalScore/r.FuzzFactor, r.ValuesScalableScore,
		r.DeadFraction, r.IntentScore, likelyLastGC, humanizeutil.IBytes(r.GCBytes),
		humanizeutil.IBytes(r.GCByteAge), humanizeutil.IBytes(r.ExpMinGCByteAgeReduction))
}

// shouldQueue determines whether a replica should be queued for garbage
// collection, and if so, at what priority. Returns true for shouldQ
// in the event that the cumulative ages of GC'able bytes or extant
// intents exceed thresholds.
func (gcq *gcQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, _ *config.SystemConfig,
) (bool, float64) {

	// Consult the protected timestamp state to determine whether we can GC and
	// the timestamp which can be used to calculate the score.
	_, zone := repl.DescAndZone()
	canGC, _, gcTimestamp, _ := repl.checkProtectedTimestampsForGC(ctx, *zone.GC)
	if !canGC {
		return false, 0
	}
	r := makeGCQueueScore(ctx, repl, gcTimestamp, *zone.GC)
	return r.ShouldQueue, r.FinalScore
}

func makeGCQueueScore(
	ctx context.Context, repl *Replica, now hlc.Timestamp, policy zonepb.GCPolicy,
) gcQueueScore {
	repl.mu.Lock()
	ms := *repl.mu.state.Stats
	gcThreshold := *repl.mu.state.GCThreshold
	repl.mu.Unlock()

	// Use desc.RangeID for fuzzing the final score, so that different ranges
	// have slightly different priorities and even symmetrical workloads don't
	// trigger GC at the same time.
	r := makeGCQueueScoreImpl(
		ctx, int64(repl.RangeID), now, ms, policy, gcThreshold,
	)
	return r
}

// makeGCQueueScoreImpl is used to compute when to trigger the GC Queue. It's
// important that we don't queue a replica before a relevant amount of data is
// actually deletable, or the queue might run in a tight loop. To this end, we
// use a base score with the right interplay between GCByteAge and TTL and
// additionally weigh it so that GC is delayed when a large proportion of the
// data in the replica is live. Additionally, returned scores are slightly
// perturbed to avoid groups of replicas becoming eligible for GC at the same
// time repeatedly.
//
// More details below.
//
// When a key of size `B` is deleted at timestamp `T` or superseded by a newer
// version, it henceforth is accounted for in the range's `GCBytesAge`. At time
// `S`, its contribution to age will be `B*seconds(S-T)`. The aggregate
// `GCBytesAge` of all deleted versions in the cluster is what the GC queue at
// the time of writing bases its `shouldQueue` method on.
//
// If a replica is queued to have its old values garbage collected, its contents
// are scanned. However, the values which are deleted follow a criterion that
// isn't immediately connected to `GCBytesAge`: We (basically) delete everything
// that's older than the Replica's `TTLSeconds`.
//
// Thus, it's not obvious that garbage collection has the effect of reducing the
// metric that we use to consider the replica for the next GC cycle, and it
// seems that we messed it up.
//
// The previous metric used for queueing: `GCBytesAge/(1<<20 * ttl)` does not
// have the right scaling. For example, consider that a value of size `1mb` is
// overwritten with a newer version. After `ttl` seconds, it contributes `1mb`
// to `GCBytesAge`, and so the replica has a score of `1`, i.e. (roughly) the
// range becomes interesting to the GC queue. When GC runs, it will delete
// values that are `ttl` old, which our value is. But a Replica is ~64mb, so
// picture that you have 64mb of key-value data all at the same timestamp, and
// they become superseded. Already after `ttl/64`, the metric becomes 1, but the
// keys won't be GC'able for another (63*ttl)/64. Thus, GC will run "all the
// time" long before it can actually have an effect.
//
// The metric with correct scaling must thus take into account the size of the
// range. What size exactly? Any data that isn't live (i.e. isn't readable by a
// scan from the far future). That's `ms.KeyBytes + ms.ValBytes - ms.LiveBytes`,
// which is also known as `GCBytes` in the code. Hence, the better metric is
// `GCBytesAge/(ttl*GCBytes)`.
//
// Using this metric guarantees that after truncation, `GCBytesAge` is at most
// `ttl*GCBytes` (where `GCBytes` has been updated), i.e. the new metric is at
// most 1.
//
// To visualize this, picture a rectangular frame of width `ttl` and height
// `GCBytes` (i.e. the horizontal dimension is time, the vertical one bytes),
// where the right boundary of the frame corresponds to age zero. Each non-live
// key is a domino aligned with the right side of the frame, its height equal to
// its size, and its width given by the duration (in seconds) it's been
// non-live.
//
// The combined surface of the dominos is then `GCBytesAge`, and the claim is
// that if the total sum of domino heights (i.e. sizes) is `GCBytes`, and the
// surface is larger than `ttl*GCBytes` by some positive `X`, then after
// removing the dominos that cross the line `x=-ttl` (i.e. `ttl` to the left
// from the right side of the frame), at least a surface area of `X` has been
// removed.
//
//     x=-ttl                GCBytes=1+4
//        |           3 (age)
//        |          +-------+
//        |          | keep  | 1 (bytes)
//        |          +-------+
//   +-----------------------+
//   |                       |
//   |        remove         | 3 (bytes)
//   |                       |
//   +-----------------------+
//        |   7 (age)
//
// This is true because
//
//   deletable area = total area - nondeletable area
//                  = X + ttl*GCBytes - nondeletable area
//                 >= X + ttl*GCBytes - ttl*(bytes in nondeletable area)
//                  = X + ttl*(GCBytes - bytes in nondeletable area)
//                 >= X.
//
// Or, in other words, you can only hope to put `ttl*GCBytes` of area in the
// "safe" rectangle. Once you've done that, everything else you put is going to
// be deleted.
//
// This means that running GC will always result in a `GCBytesAge` of `<=
// ttl*GCBytes`, and that a decent trigger for GC is a multiple of
// `ttl*GCBytes`.
func makeGCQueueScoreImpl(
	ctx context.Context,
	fuzzSeed int64,
	now hlc.Timestamp,
	ms enginepb.MVCCStats,
	policy zonepb.GCPolicy,
	gcThreshold hlc.Timestamp,
) gcQueueScore {
	ms.Forward(now.WallTime)
	var r gcQueueScore

	r.TTL = policy.TTL()

	// Treat a zero TTL as a one-second TTL, which avoids a priority of infinity
	// and otherwise behaves indistinguishably given that we can't possibly hope
	// to GC values faster than that.
	if r.TTL <= time.Second {
		r.TTL = time.Second
	}

	// LikelyLastGC adds the TTL back onto the GC threshold, so it is computed
	// only after r.TTL has been determined above.
	if (gcThreshold != hlc.Timestamp{}) {
		r.LikelyLastGC = time.Duration(now.WallTime - gcThreshold.Add(r.TTL.Nanoseconds(), 0).WallTime)
	}

	r.GCByteAge = ms.GCByteAge(now.WallTime)
	r.GCBytes = ms.GCBytes()

	// If we GC'ed now, we can expect to delete at least this much GCByteAge.
	// GCByteAge - TTL*GCBytes = ExpMinGCByteAgeReduction & algebra.
	//
	// Note that for ranges with ContainsEstimates > 0, the value here may not
	// reflect reality, and may even be nonsensical (though that's unlikely).
	r.ExpMinGCByteAgeReduction = r.GCByteAge - r.GCBytes*int64(r.TTL.Seconds())

	// DeadFraction is close to 1 when most values are dead, and close to zero
	// when most of the replica is live. For example, for a replica with no
	// superseded values, this should be (almost) zero. For one just hit
	// completely by a DeleteRange, it should be (almost) one.
	//
	// The algebra below is complicated by the fact that ranges may contain
	// stats that aren't exact (ContainsEstimates > 0).
	clamp := func(n int64) float64 {
		if n < 0 {
			return 0.0
		}
		return float64(n)
	}
	r.DeadFraction = math.Max(1-clamp(ms.LiveBytes)/(1+clamp(ms.ValBytes)+clamp(ms.KeyBytes)), 0)

	// The "raw" GC score is the total GC'able bytes age normalized by (non-live
	// size * the replica's TTL in seconds). This is a scale-invariant factor by
	// (at least) which GCByteAge reduces when deleting values older than the
	// TTL. The risk of an inaccurate GCBytes in the presence of estimated stats
	// is neglected as GCByteAge and GCBytes undercount in the same way and
	// estimation only happens for timeseries writes.
	denominator := r.TTL.Seconds() * (1.0 + clamp(r.GCBytes)) // +1 avoids NaN
	r.ValuesScalableScore = clamp(r.GCByteAge) / denominator
	// However, it doesn't take into account the size of the live data, which
	// also needs to be scanned in order to GC. We don't want to run this costly
	// scan unless we get a corresponding expected reduction in GCByteAge, so we
	// weigh it by the fraction of non-live data below.

	// Intent score. This computes the average age of outstanding intents and
	// normalizes. Note that at the time of writing this criterion hasn't
	// undergone a reality check yet.
	r.IntentScore = ms.AvgIntentAge(now.WallTime) / float64(intentAgeNormalization.Nanoseconds()/1e9)

	// Randomly skew the score down a bit to cause decoherence of replicas with
	// similar load. Note that we'll only ever reduce the score, never increase
	// it (for increasing it could lead to a fruitless run).
	r.FuzzFactor = 0.95 + 0.05*rand.New(rand.NewSource(fuzzSeed)).Float64()

	// Compute priority.
	valScore := r.DeadFraction * r.ValuesScalableScore
	r.ShouldQueue = r.FuzzFactor*valScore > gcKeyScoreThreshold || r.FuzzFactor*r.IntentScore > gcIntentScoreThreshold
	r.FinalScore = r.FuzzFactor * (valScore + r.IntentScore)

	if probablyLargeAbortSpan(ms) && !r.ShouldQueue &&
		(r.LikelyLastGC == 0 || r.LikelyLastGC > kvserverbase.TxnCleanupThreshold) {
		r.ShouldQueue = true
		r.FinalScore++
	}

	return r
}
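
// The following is an illustrative sketch added for exposition and is not part
// of the original file. It restates the core of the computation above for a
// hypothetical replica: the values score is GCByteAge normalized by
// TTL*GCBytes, weighed by the dead fraction, and the replica is queued once
// the (fuzzed) result clears gcKeyScoreThreshold. All numbers are made up.
func exampleGCQueueScore() (score float64, shouldQueue bool) {
	const (
		ttl          = 25 * time.Hour          // hypothetical zone GC TTL
		gcBytes      = int64(32 << 20)         // 32mb of non-live data
		gcByteAge    = gcBytes * 5 * 24 * 3600 // non-live for ~5 days on average
		deadFraction = 0.9                     // most of the replica is non-live
		fuzzFactor   = 0.95                    // worst-case downward fuzz
	)
	valuesScalableScore := float64(gcByteAge) / (ttl.Seconds() * (1.0 + float64(gcBytes)))
	score = fuzzFactor * deadFraction * valuesScalableScore
	// The replica is queued once the dead-weighted, fuzzed score exceeds the
	// key score threshold (2), analogous to r.ShouldQueue above.
	return score, score > gcKeyScoreThreshold
}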

type replicaGCer struct {
	repl  *Replica
	count int32 // update atomically
}

var _ gc.GCer = &replicaGCer{}

func (r *replicaGCer) template() roachpb.GCRequest {
	desc := r.repl.Desc()
	var template roachpb.GCRequest
	template.Key = desc.StartKey.AsRawKey()
	template.EndKey = desc.EndKey.AsRawKey()

	return template
}

func (r *replicaGCer) send(ctx context.Context, req roachpb.GCRequest) error {
	n := atomic.AddInt32(&r.count, 1)
	log.Eventf(ctx, "sending batch %d (%d keys)", n, len(req.Keys))

	var ba roachpb.BatchRequest

	// Technically not needed since we're talking directly to the Replica.
	ba.RangeID = r.repl.Desc().RangeID
	ba.Timestamp = r.repl.Clock().Now()
	ba.Add(&req)

	if _, pErr := r.repl.Send(ctx, ba); pErr != nil {
		log.VErrEventf(ctx, 2, "%v", pErr.String())
		return pErr.GoError()
	}
	return nil
}

func (r *replicaGCer) SetGCThreshold(ctx context.Context, thresh gc.Threshold) error {
	req := r.template()
	req.Threshold = thresh.Key
	return r.send(ctx, req)
}

func (r *replicaGCer) GC(ctx context.Context, keys []roachpb.GCRequest_GCKey) error {
	if len(keys) == 0 {
		return nil
	}
	req := r.template()
	req.Keys = keys
	return r.send(ctx, req)
}
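
// The following is an illustrative sketch added for exposition and is not part
// of the original file. It shows how the replicaGCer above can be driven: the
// threshold is bumped once, and collected keys are then sent in batches,
// exercising the two methods implemented above. Whether gc.Run (used in
// process below) sequences its calls exactly like this is an assumption, not
// something this file specifies.
func exampleDriveReplicaGCer(
	ctx context.Context, gcer *replicaGCer, thresh gc.Threshold, batches [][]roachpb.GCRequest_GCKey,
) error {
	// Record the new GC threshold (see SetGCThreshold above).
	if err := gcer.SetGCThreshold(ctx, thresh); err != nil {
		return err
	}
	// Send one GCRequest per batch of keys; empty batches are skipped by GC().
	for _, keys := range batches {
		if err := gcer.GC(ctx, keys); err != nil {
			return err
		}
	}
	return nil
}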

// process first determines whether the replica can run GC given its view of
// the protected timestamp subsystem and its current state. This check also
// determines the most recent time which can be used for the purposes of updating
// the GC threshold and running GC.
//
// If it is safe to GC, process iterates through all keys in a replica's range,
// calling the garbage collector for each key and associated set of
// values. GC'd keys are batched into GC calls. Extant intents are resolved if
// intents are older than intentAgeThreshold. The transaction and AbortSpan
// records are also scanned and old entries evicted. During normal operation,
// both of these records are cleaned up when their respective transaction
// finishes, so the amount of work done here is expected to be small.
//
// Some care needs to be taken to avoid cyclic recreation of entries during GC:
// * a Push initiated due to an intent may recreate a transaction entry
// * resolving an intent may write a new AbortSpan entry
// * obtaining the transaction for an AbortSpan entry requires a Push
//
// The following order is taken below:
// 1) collect all intents with sufficiently old txn record
// 2) collect these intents' transactions
// 3) scan the transaction table, collecting abandoned or completed txns
// 4) push all of these transactions (possibly recreating entries)
// 5) resolve all intents (unless the txn is not yet finalized), which
//    will recreate AbortSpan entries (but with the txn timestamp; i.e.
//    likely GC'able)
// 6) scan the AbortSpan table for old entries
// 7) push these transactions (again, recreating txn entries).
// 8) send a GCRequest.
func (gcq *gcQueue) process(ctx context.Context, repl *Replica, sysCfg *config.SystemConfig) error {
	// Look up the descriptor and GC policy for the zone containing this key range.
	desc, zone := repl.DescAndZone()
	// Consult the protected timestamp state to determine whether we can GC and
	// the timestamp which can be used to calculate the score and updated GC
	// threshold.
	canGC, cacheTimestamp, gcTimestamp, newThreshold := repl.checkProtectedTimestampsForGC(ctx, *zone.GC)
	if !canGC {
		return nil
	}
	r := makeGCQueueScore(ctx, repl, gcTimestamp, *zone.GC)
	log.VEventf(ctx, 2, "processing replica %s with score %s", repl.String(), r)
	// Synchronize the new GC threshold decision with concurrent
	// AdminVerifyProtectedTimestamp requests.
	if err := repl.markPendingGC(cacheTimestamp, newThreshold); err != nil {
		log.VEventf(ctx, 1, "not gc'ing replica %v due to pending protection: %v", repl, err)
		return nil
	}
	snap := repl.store.Engine().NewSnapshot()
	defer snap.Close()

	info, err := gc.Run(ctx, desc, snap, gcTimestamp, newThreshold, *zone.GC,
		&replicaGCer{repl: repl},
		func(ctx context.Context, intents []roachpb.Intent) error {
			intentCount, err := repl.store.intentResolver.
				CleanupIntents(ctx, intents, gcTimestamp, roachpb.PUSH_ABORT)
			if err == nil {
				gcq.store.metrics.GCResolveSuccess.Inc(int64(intentCount))
			}
			return err
		},
		func(ctx context.Context, txn *roachpb.Transaction, intents []roachpb.LockUpdate) error {
			err := repl.store.intentResolver.
				CleanupTxnIntentsOnGCAsync(ctx, repl.RangeID, txn, intents, gcTimestamp,
					func(pushed, succeeded bool) {
						if pushed {
							gcq.store.metrics.GCPushTxn.Inc(1)
						}
						if succeeded {
							gcq.store.metrics.GCResolveSuccess.Inc(int64(len(intents)))
						}
					})
			if errors.Is(err, stop.ErrThrottled) {
				log.Eventf(ctx, "processing txn %s: %s; skipping for future GC", txn.ID.Short(), err)
				return nil
			}
			return err
		})
	if err != nil {
		return err
	}

	log.Eventf(ctx, "MVCC stats after GC: %+v", repl.GetMVCCStats())
	log.Eventf(ctx, "GC score after GC: %s", makeGCQueueScore(ctx, repl, repl.store.Clock().Now(), *zone.GC))
	updateStoreMetricsWithGCInfo(gcq.store.metrics, info)
	return nil
}

func updateStoreMetricsWithGCInfo(metrics *StoreMetrics, info gc.Info) {
	metrics.GCNumKeysAffected.Inc(int64(info.NumKeysAffected))
	metrics.GCIntentsConsidered.Inc(int64(info.IntentsConsidered))
	metrics.GCIntentTxns.Inc(int64(info.IntentTxns))
	metrics.GCTransactionSpanScanned.Inc(int64(info.TransactionSpanTotal))
	metrics.GCTransactionSpanGCAborted.Inc(int64(info.TransactionSpanGCAborted))
	metrics.GCTransactionSpanGCCommitted.Inc(int64(info.TransactionSpanGCCommitted))
	metrics.GCTransactionSpanGCStaging.Inc(int64(info.TransactionSpanGCStaging))
	metrics.GCTransactionSpanGCPending.Inc(int64(info.TransactionSpanGCPending))
	metrics.GCAbortSpanScanned.Inc(int64(info.AbortSpanTotal))
	metrics.GCAbortSpanConsidered.Inc(int64(info.AbortSpanConsidered))
	metrics.GCAbortSpanGCNum.Inc(int64(info.AbortSpanGCNum))
	metrics.GCPushTxn.Inc(int64(info.PushTxn))
	metrics.GCResolveTotal.Inc(int64(info.ResolveTotal))
}

// timer returns a constant duration to space out GC processing
// for successive queued replicas.
func (*gcQueue) timer(_ time.Duration) time.Duration {
	return gcQueueTimerDuration
}

// purgatoryChan returns nil.
func (*gcQueue) purgatoryChan() <-chan time.Time {
	return nil
}