github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_gc_queue.go

// Copyright 2015 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"time"

	"github.com/cockroachdb/cockroach/pkg/config"
	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/metric"
	"github.com/cockroachdb/errors"
	"go.etcd.io/etcd/raft"
)

const (
	// replicaGCQueueTimerDuration is the duration between GCs of queued replicas.
	replicaGCQueueTimerDuration = 50 * time.Millisecond

	// ReplicaGCQueueInactivityThreshold is the inactivity duration after which
	// a range will be considered for garbage collection. Exported for testing.
	ReplicaGCQueueInactivityThreshold = 10 * 24 * time.Hour // 10 days
	// ReplicaGCQueueSuspectTimeout is the duration after which a Replica which
	// is suspected to be removed should be processed by the queue.
	// A Replica is suspected to have been removed if either it is in the
	// candidate Raft state (which is a typical sign of having been removed
	// from the group) or it is not in the VOTER_FULL state. Replicas which are
	// in the LEARNER state will never become candidates. It seems possible that
	// a range will quiesce and never tell a VOTER_OUTGOING that it was removed.
	// Cases where a replica gets stuck in VOTER_INCOMING seem far-fetched and
	// would require the replica to be removed from the range before it ever
	// learned about its promotion, but that state shouldn't last long, so we
	// also treat idle replicas in that state as suspect.
	ReplicaGCQueueSuspectTimeout = 1 * time.Second
)

// Priorities for the replica GC queue.
const (
	replicaGCPriorityDefault = 0.0

	// Replicas that have been removed from the range spend a lot of
	// time in the candidate state, so treat them as higher priority.
	// Learner replicas which have been removed never enter the candidate state,
	// but in the common case a replica should not be a learner for long, so
	// treat it the same as a candidate.
	replicaGCPrioritySuspect = 1.0

	// The highest priority is used when we have definite evidence
	// (external to replicaGCQueue) that the replica has been removed.
	replicaGCPriorityRemoved = 2.0
)
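
// isSuspectSketch is an illustrative sketch only, not part of the original
// file: the function name is hypothetical and nothing calls it. It condenses
// the suspect classification described in the comments above: a replica is
// treated as suspect, and hence considered for GC after
// ReplicaGCQueueSuspectTimeout rather than ReplicaGCQueueInactivityThreshold,
// if it is not a full voter or if its raft group reports it as a
// (pre-)candidate. The authoritative logic lives in
// (*replicaGCQueue).shouldQueue below.
func isSuspectSketch(typ roachpb.ReplicaType, raftState raft.StateType) bool {
	return typ != roachpb.VOTER_FULL ||
		raftState == raft.StateCandidate ||
		raftState == raft.StatePreCandidate
}
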

var (
	metaReplicaGCQueueRemoveReplicaCount = metric.Metadata{
		Name:        "queue.replicagc.removereplica",
		Help:        "Number of replica removals attempted by the replica gc queue",
		Measurement: "Replica Removals",
		Unit:        metric.Unit_COUNT,
	}
)

// ReplicaGCQueueMetrics is the set of metrics for the replica GC queue.
type ReplicaGCQueueMetrics struct {
	RemoveReplicaCount *metric.Counter
}

func makeReplicaGCQueueMetrics() ReplicaGCQueueMetrics {
	return ReplicaGCQueueMetrics{
		RemoveReplicaCount: metric.NewCounter(metaReplicaGCQueueRemoveReplicaCount),
	}
}

// replicaGCQueue manages a queue of replicas to be considered for garbage
// collection. The GC process asynchronously removes local data for
// ranges that have been rebalanced away from this store.
type replicaGCQueue struct {
	*baseQueue
	metrics ReplicaGCQueueMetrics
	db      *kv.DB
}

// newReplicaGCQueue returns a new instance of replicaGCQueue.
func newReplicaGCQueue(store *Store, db *kv.DB, gossip *gossip.Gossip) *replicaGCQueue {
	rgcq := &replicaGCQueue{
		metrics: makeReplicaGCQueueMetrics(),
		db:      db,
	}
	store.metrics.registry.AddMetricStruct(&rgcq.metrics)
	rgcq.baseQueue = newBaseQueue(
		"replicaGC", rgcq, store, gossip,
		queueConfig{
			maxSize:                  defaultQueueMaxSize,
			needsLease:               false,
			needsRaftInitialized:     true,
			needsSystemConfig:        false,
			acceptsUnsplitRanges:     true,
			processDestroyedReplicas: true,
			successes:                store.metrics.ReplicaGCQueueSuccesses,
			failures:                 store.metrics.ReplicaGCQueueFailures,
			pending:                  store.metrics.ReplicaGCQueuePending,
			processingNanos:          store.metrics.ReplicaGCQueueProcessingNanos,
		},
	)
	return rgcq
}

// shouldQueue determines whether a replica should be queued for GC,
// and if so at what priority. To be considered for possible GC, a
// replica's range lease must not have been active for longer than
// ReplicaGCQueueInactivityThreshold. Further, the last replica GC
// check must have occurred more than ReplicaGCQueueInactivityThreshold
// in the past.
func (rgcq *replicaGCQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, _ *config.SystemConfig,
) (shouldQ bool, prio float64) {

	lastCheck, err := repl.GetLastReplicaGCTimestamp(ctx)
	if err != nil {
		log.Errorf(ctx, "could not read last replica GC timestamp: %+v", err)
		return false, 0
	}
	replDesc, currentMember := repl.Desc().GetReplicaDescriptor(repl.store.StoreID())
	if !currentMember {
		return true, replicaGCPriorityRemoved
	}

	lastActivity := hlc.Timestamp{
		WallTime: repl.store.startedAt,
	}

	if lease, _ := repl.GetLease(); lease.ProposedTS != nil {
		lastActivity.Forward(*lease.ProposedTS)
	}

	// It is critical to think of the replica as suspect if it is a learner,
	// as it both shouldn't be a learner for long and will never become a
	// candidate. It is less critical to consider joint configuration members
	// as suspect, but in cases where a replica is removed and only ever hears
	// about the command which sets it to VOTER_OUTGOING, we would otherwise
	// conservatively wait 10 days before removing it. Finally, we consider
	// replicas which are VOTER_INCOMING as suspect because no replica should
	// stay in that state for too long, and being conservative here doesn't
	// seem worthwhile.
	isSuspect := replDesc.GetType() != roachpb.VOTER_FULL
	if raftStatus := repl.RaftStatus(); raftStatus != nil {
		isSuspect = isSuspect ||
			(raftStatus.SoftState.RaftState == raft.StateCandidate ||
				raftStatus.SoftState.RaftState == raft.StatePreCandidate)
	} else {
		// If a replica doesn't have an active raft group, we should check whether
		// we're decommissioning. If so, we should process the replica because it
		// has probably already been removed from its raft group but doesn't know it.
		// Without this, node decommissioning can stall on such dormant ranges.
		// Make sure NodeLiveness isn't nil, as it can be in tests and benchmarks.
		if repl.store.cfg.NodeLiveness != nil {
			if liveness, err := repl.store.cfg.NodeLiveness.Self(); err == nil && liveness.Decommissioning {
				return true, replicaGCPriorityDefault
			}
		}
	}
	return replicaGCShouldQueueImpl(now, lastCheck, lastActivity, isSuspect)
}

func replicaGCShouldQueueImpl(
	now, lastCheck, lastActivity hlc.Timestamp, isSuspect bool,
) (bool, float64) {
	timeout := ReplicaGCQueueInactivityThreshold
	priority := replicaGCPriorityDefault

	if isSuspect {
		// If the range is suspect (which happens if its former replica set
		// ignores it), let it expire much earlier.
		timeout = ReplicaGCQueueSuspectTimeout
		priority = replicaGCPrioritySuspect
	} else if now.Less(lastCheck.Add(ReplicaGCQueueInactivityThreshold.Nanoseconds(), 0)) {
		// Return false immediately if the previous check was less than the
		// check interval in the past. Note that we don't do this if the
		// replica is in candidate state, in which case we want to be more
		// aggressive - a failed rebalance attempt could have checked this
		// range, and candidate state suggests that a retry succeeded. See
		// #7489.
		return false, 0
	}

	shouldQ := lastActivity.Add(timeout.Nanoseconds(), 0).Less(now)

	if !shouldQ {
		return false, 0
	}

	return shouldQ, priority
}

// process performs a consistent lookup on the range descriptor to see if we are
// still a member of the range.
func (rgcq *replicaGCQueue) process(
	ctx context.Context, repl *Replica, _ *config.SystemConfig,
) error {
	// Note that the Replicas field of desc is probably out of date, so
	// we should only use `desc` for its static fields like RangeID and
	// StartKey (and avoid rng.GetReplica() for the same reason).
	desc := repl.Desc()

	// Now get an updated descriptor for the range. Note that this may
	// not be _our_ range but instead some earlier range if our range has
	// been merged. See below.

	// Calls to RangeLookup typically use inconsistent reads, but we
	// want to do a consistent read here. This is important when we are
	// considering one of the metadata ranges: we must not do an inconsistent
	// lookup in our own copy of the range.
	rs, _, err := kv.RangeLookup(ctx, rgcq.db.NonTransactionalSender(), desc.StartKey.AsRawKey(),
		roachpb.CONSISTENT, 0 /* prefetchNum */, false /* reverse */)
	if err != nil {
		return err
	}
	if len(rs) != 1 {
		// Regardless of whether ranges were merged, we're guaranteed one answer.
		//
		// TODO(knz): we should really have a separate type for assertion
		// errors that trigger telemetry, like
		// errors.AssertionFailedf() does.
		return errors.Errorf("expected 1 range descriptor, got %d", len(rs))
	}
	replyDesc := rs[0]

	// Now check whether the replica is meant to still exist.
	// Maybe it was deleted "under us" by being moved.
	currentDesc, currentMember := replyDesc.GetReplicaDescriptor(repl.store.StoreID())
	sameRange := desc.RangeID == replyDesc.RangeID
	if sameRange && currentMember {
		// This replica is a current member of the raft group. Set the last replica
		// GC check time to avoid re-processing for another check interval.
		//
		// TODO(tschottdorf): should keep stats in particular on this outcome
		// but also on how good a job the queue does at inspecting every
		// Replica (see #8111) when inactive ones can be starved by
		// event-driven additions.
		log.VEventf(ctx, 1, "not gc'able, replica is still in range descriptor: %v", currentDesc)
		if err := repl.setLastReplicaGCTimestamp(ctx, repl.store.Clock().Now()); err != nil {
			return err
		}

		// Note that we do not check the replicaID at this point. If our
		// local replica ID is behind the one in the meta descriptor, we
		// could safely delete our local copy, but this would just force
		// the use of a snapshot when catching up to the new replica ID.
		// We don't normally expect to have a *higher* local replica ID
		// than the one in the meta descriptor, but it's possible after
		// recovering with unsafe-remove-dead-replicas.
	} else if sameRange {
		// We are no longer a member of this range, but the range still exists.
		// Clean up our local data.

		if replyDesc.EndKey.Less(desc.EndKey) {
			// The meta records indicate that the range has split but that this
			// replica hasn't processed the split trigger yet. By removing this
			// replica, we're also wiping out the data of what would become the
			// right hand side of the split (which may or may not still have a
			// replica on this store), and will need a Raft snapshot. Even worse,
			// the mechanism introduced in #31875 will artificially delay this
			// snapshot by seconds, during which time the RHS may see more splits
			// and incur more snapshots.
			//
			// TODO(tschottdorf): we can look up the range descriptor for the
			// RHS of the split (by querying with replyDesc.EndKey) and fetch
			// the local replica (which will be uninitialized, i.e. we have to
			// look it up by RangeID) to disable the mechanism in #31875 for it.
			// We should be able to use prefetching unconditionally to have this
			// desc ready whenever we need it.
			//
			// NB: there's solid evidence that this phenomenon can actually lead
			// to a large spike in Raft snapshots early in the life of a cluster
			// (in particular when combined with a restore operation) when the
			// removed replica has many pending splits and thus incurs a Raft
			// snapshot for *each* of them. This typically happens for the last
			// range:
			// [n1,replicaGC,s1,r33/1:/{Table/53/1/3…-Max}] removing replica [...]
			log.Infof(ctx, "removing replica with pending split; will incur Raft snapshot for right hand side")
		}

		rgcq.metrics.RemoveReplicaCount.Inc(1)
		log.VEventf(ctx, 1, "destroying local data")

		nextReplicaID := replyDesc.NextReplicaID
		// Note that this seems racy - we didn't hold any locks between reading
		// the range descriptor above and deciding to remove the replica - but
		// we pass in the NextReplicaID to detect situations in which the
		// replica became "non-gc'able" in the meantime by checking (with raftMu
		// held throughout) whether the replicaID is still smaller than the
		// NextReplicaID. Given non-zero replica IDs don't change, this is only
		// possible if we currently think we're processing a pre-emptive snapshot
		// but discover in RemoveReplica that this range has since been added and
		// knows that.
		if err := repl.store.RemoveReplica(ctx, repl, nextReplicaID, RemoveOptions{
			DestroyData: true,
		}); err != nil {
			return err
		}
	} else {
		// This case is tricky. This range has been merged away, so it is likely
		// that we can GC this replica, but we need to be careful. If this store has
		// a replica of the subsuming range that has not yet applied the merge
		// trigger, we must not GC this replica.
		//
		// We can't just ask our local left neighbor whether it has an unapplied
		// merge, as if it's a slow follower it might not have learned about the
		// merge yet! What we can do, though, is check whether the generation of our
		// local left neighbor matches the generation of its meta2 descriptor. If it
		// is generationally up-to-date, it has applied all splits and merges, and
		// it is thus safe to remove this replica.
		leftRepl := repl.store.lookupPrecedingReplica(desc.StartKey)
		if leftRepl != nil {
			leftDesc := leftRepl.Desc()
			rs, _, err := kv.RangeLookup(ctx, rgcq.db.NonTransactionalSender(), leftDesc.StartKey.AsRawKey(),
				roachpb.CONSISTENT, 0 /* prefetchNum */, false /* reverse */)
			if err != nil {
				return err
			}
			if len(rs) != 1 {
				return errors.Errorf("expected 1 range descriptor, got %d", len(rs))
			}
			if leftReplyDesc := &rs[0]; !leftDesc.Equal(*leftReplyDesc) {
				log.VEventf(ctx, 1, "left neighbor %s not up-to-date with meta descriptor %s; cannot safely GC range yet",
					leftDesc, leftReplyDesc)
				// Chances are that the left replica needs to be GC'd. Since we don't
				// have definitive proof, queue it with a low priority.
				rgcq.AddAsync(ctx, leftRepl, replicaGCPriorityDefault)
				return nil
			}
		}

		// A tombstone is written with a value of mergedTombstoneReplicaID because
		// we know the range to have been merged. See the Merge case of
		// runPreApplyTriggers() for details.
		if err := repl.store.RemoveReplica(ctx, repl, mergedTombstoneReplicaID, RemoveOptions{
			DestroyData: true,
		}); err != nil {
			return err
		}
	}
	return nil
}

func (*replicaGCQueue) timer(_ time.Duration) time.Duration {
	return replicaGCQueueTimerDuration
}

// purgatoryChan returns nil.
func (*replicaGCQueue) purgatoryChan() <-chan time.Time {
	return nil
}
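
// replicaGCShouldQueueSketch is an illustrative sketch, not part of the
// original file: the function name is hypothetical and nothing calls it. It
// shows how replicaGCShouldQueueImpl treats a suspect replica (queued after
// ReplicaGCQueueSuspectTimeout of inactivity, at replicaGCPrioritySuspect)
// versus a healthy voter (ignored until both the last GC check and the last
// activity are more than ReplicaGCQueueInactivityThreshold in the past).
func replicaGCShouldQueueSketch(now hlc.Timestamp) (suspectQueued, healthyQueued bool) {
	// Last GC check one hour ago, last lease activity one minute ago.
	lastCheck := now.Add(-time.Hour.Nanoseconds(), 0)
	lastActivity := now.Add(-time.Minute.Nanoseconds(), 0)

	// Suspect (e.g. a LEARNER or a raft candidate): one minute of inactivity
	// exceeds ReplicaGCQueueSuspectTimeout, so the replica queues.
	suspectQueued, _ = replicaGCShouldQueueImpl(now, lastCheck, lastActivity, true /* isSuspect */)

	// Healthy voter: the last check falls within
	// ReplicaGCQueueInactivityThreshold, so the replica is not queued at all.
	healthyQueued, _ = replicaGCShouldQueueImpl(now, lastCheck, lastActivity, false /* isSuspect */)

	return suspectQueued, healthyQueued // true, false
}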