github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_gc_queue.go

// Copyright 2015 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"time"

	"github.com/cockroachdb/cockroach/pkg/config"
	"github.com/cockroachdb/cockroach/pkg/gossip"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/metric"
	"github.com/cockroachdb/errors"
	"go.etcd.io/etcd/raft"
)

const (
	// replicaGCQueueTimerDuration is the duration between GCs of queued replicas.
	replicaGCQueueTimerDuration = 50 * time.Millisecond

	// ReplicaGCQueueInactivityThreshold is the inactivity duration after which
	// a range will be considered for garbage collection. Exported for testing.
	ReplicaGCQueueInactivityThreshold = 10 * 24 * time.Hour // 10 days
	// ReplicaGCQueueSuspectTimeout is the duration after which a Replica which
	// is suspected to be removed should be processed by the queue.
	// A Replica is suspected to have been removed if either it is in the
	// candidate Raft state (which is a typical sign of having been removed
	// from the group) or it is not in the VOTER_FULL state. Replicas which are
	// in the LEARNER state will never become candidates. It seems possible that
	// a range will quiesce and never tell a VOTER_OUTGOING that it was removed.
	// Cases where a replica gets stuck in VOTER_INCOMING seem far-fetched and
	// would require the replica to be removed from the range before it ever
	// learned about its promotion, but that state shouldn't last long, so we
	// also treat idle replicas in that state as suspect.
	ReplicaGCQueueSuspectTimeout = 1 * time.Second
)

// Priorities for the replica GC queue.
const (
	replicaGCPriorityDefault = 0.0

	// Replicas that have been removed from the range spend a lot of
	// time in the candidate state, so treat them as higher priority.
	// Learner replicas which have been removed never enter the candidate state,
	// but in the common case a replica should not be a learner for long, so
	// treat it the same as a candidate.
	replicaGCPrioritySuspect = 1.0

	// The highest priority is used when we have definite evidence
	// (external to replicaGCQueue) that the replica has been removed.
	replicaGCPriorityRemoved = 2.0
)
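
// isSuspectSketch is an illustrative sketch only, not part of the original
// file: the function name is hypothetical and nothing calls it. It condenses
// the suspect classification described in the comments above: a replica is
// treated as suspect, and hence considered for GC after
// ReplicaGCQueueSuspectTimeout rather than ReplicaGCQueueInactivityThreshold,
// if it is not a full voter or if its raft group reports it as a
// (pre-)candidate. The authoritative logic lives in
// (*replicaGCQueue).shouldQueue below.
func isSuspectSketch(typ roachpb.ReplicaType, raftState raft.StateType) bool {
	return typ != roachpb.VOTER_FULL ||
		raftState == raft.StateCandidate ||
		raftState == raft.StatePreCandidate
}
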

var (
	metaReplicaGCQueueRemoveReplicaCount = metric.Metadata{
		Name:        "queue.replicagc.removereplica",
		Help:        "Number of replica removals attempted by the replica gc queue",
		Measurement: "Replica Removals",
		Unit:        metric.Unit_COUNT,
	}
)

// ReplicaGCQueueMetrics is the set of metrics for the replica GC queue.
type ReplicaGCQueueMetrics struct {
	RemoveReplicaCount *metric.Counter
}

func makeReplicaGCQueueMetrics() ReplicaGCQueueMetrics {
	return ReplicaGCQueueMetrics{
		RemoveReplicaCount: metric.NewCounter(metaReplicaGCQueueRemoveReplicaCount),
	}
}

// replicaGCQueue manages a queue of replicas to be considered for garbage
// collection. The GC process asynchronously removes local data for
// ranges that have been rebalanced away from this store.
type replicaGCQueue struct {
	*baseQueue
	metrics ReplicaGCQueueMetrics
	db      *kv.DB
}

// newReplicaGCQueue returns a new instance of replicaGCQueue.
func newReplicaGCQueue(store *Store, db *kv.DB, gossip *gossip.Gossip) *replicaGCQueue {
	rgcq := &replicaGCQueue{
		metrics: makeReplicaGCQueueMetrics(),
		db:      db,
	}
	store.metrics.registry.AddMetricStruct(&rgcq.metrics)
	rgcq.baseQueue = newBaseQueue(
		"replicaGC", rgcq, store, gossip,
		queueConfig{
			maxSize:                  defaultQueueMaxSize,
			needsLease:               false,
			needsRaftInitialized:     true,
			needsSystemConfig:        false,
			acceptsUnsplitRanges:     true,
			processDestroyedReplicas: true,
			successes:                store.metrics.ReplicaGCQueueSuccesses,
			failures:                 store.metrics.ReplicaGCQueueFailures,
			pending:                  store.metrics.ReplicaGCQueuePending,
			processingNanos:          store.metrics.ReplicaGCQueueProcessingNanos,
		},
	)
	return rgcq
}

// shouldQueue determines whether a replica should be queued for GC,
// and if so at what priority. To be considered for possible GC, a
// replica's range lease must not have been active for longer than
// ReplicaGCQueueInactivityThreshold. Further, the last replica GC
// check must have occurred more than ReplicaGCQueueInactivityThreshold
// in the past.
func (rgcq *replicaGCQueue) shouldQueue(
	ctx context.Context, now hlc.Timestamp, repl *Replica, _ *config.SystemConfig,
) (shouldQ bool, prio float64) {

	lastCheck, err := repl.GetLastReplicaGCTimestamp(ctx)
	if err != nil {
		log.Errorf(ctx, "could not read last replica GC timestamp: %+v", err)
		return false, 0
	}
	replDesc, currentMember := repl.Desc().GetReplicaDescriptor(repl.store.StoreID())
	if !currentMember {
		return true, replicaGCPriorityRemoved
	}

	lastActivity := hlc.Timestamp{
		WallTime: repl.store.startedAt,
	}

	if lease, _ := repl.GetLease(); lease.ProposedTS != nil {
		lastActivity.Forward(*lease.ProposedTS)
	}

	// It is critical to think of the replica as suspect if it is a learner,
	// as it both shouldn't be a learner for long and will never become a
	// candidate. It is less critical to consider joint configuration members
	// as suspect, but in cases where a replica is removed and only ever hears
	// about the command which sets it to VOTER_OUTGOING, we would otherwise
	// conservatively wait 10 days before removing it. Finally, we consider
	// replicas which are VOTER_INCOMING as suspect because no replica should
	// stay in that state for too long, and being conservative here doesn't
	// seem worthwhile.
	isSuspect := replDesc.GetType() != roachpb.VOTER_FULL
	if raftStatus := repl.RaftStatus(); raftStatus != nil {
		isSuspect = isSuspect ||
			(raftStatus.SoftState.RaftState == raft.StateCandidate ||
				raftStatus.SoftState.RaftState == raft.StatePreCandidate)
	} else {
		// If a replica doesn't have an active raft group, we should check whether
		// we're decommissioning. If so, we should process the replica because it
		// has probably already been removed from its raft group but doesn't know it.
		// Without this, node decommissioning can stall on such dormant ranges.
		// Make sure NodeLiveness isn't nil, as it can be in tests and benchmarks.
		if repl.store.cfg.NodeLiveness != nil {
			if liveness, err := repl.store.cfg.NodeLiveness.Self(); err == nil && liveness.Decommissioning {
				return true, replicaGCPriorityDefault
			}
		}
	}
	return replicaGCShouldQueueImpl(now, lastCheck, lastActivity, isSuspect)
}

func replicaGCShouldQueueImpl(
	now, lastCheck, lastActivity hlc.Timestamp, isSuspect bool,
) (bool, float64) {
	timeout := ReplicaGCQueueInactivityThreshold
	priority := replicaGCPriorityDefault

	if isSuspect {
		// If the range is suspect (which happens if its former replica set
		// ignores it), let it expire much earlier.
		timeout = ReplicaGCQueueSuspectTimeout
		priority = replicaGCPrioritySuspect
	} else if now.Less(lastCheck.Add(ReplicaGCQueueInactivityThreshold.Nanoseconds(), 0)) {
		// Return false immediately if the previous check was less than the
		// check interval in the past. Note that we don't do this if the
		// replica is in candidate state, in which case we want to be more
		// aggressive - a failed rebalance attempt could have checked this
		// range, and candidate state suggests that a retry succeeded. See
		// #7489.
		return false, 0
	}

	shouldQ := lastActivity.Add(timeout.Nanoseconds(), 0).Less(now)

	if !shouldQ {
		return false, 0
	}

	return shouldQ, priority
}

// process performs a consistent lookup on the range descriptor to see if we are
// still a member of the range.
func (rgcq *replicaGCQueue) process(
	ctx context.Context, repl *Replica, _ *config.SystemConfig,
) error {
	// Note that the Replicas field of desc is probably out of date, so
	// we should only use `desc` for its static fields like RangeID and
	// StartKey (and avoid rng.GetReplica() for the same reason).
	desc := repl.Desc()

	// Now get an updated descriptor for the range. Note that this may
	// not be _our_ range but instead some earlier range if our range has
	// been merged. See below.

	// Calls to RangeLookup typically use inconsistent reads, but we
	// want to do a consistent read here. This is important when we are
	// considering one of the metadata ranges: we must not do an inconsistent
	// lookup in our own copy of the range.
	rs, _, err := kv.RangeLookup(ctx, rgcq.db.NonTransactionalSender(), desc.StartKey.AsRawKey(),
		roachpb.CONSISTENT, 0 /* prefetchNum */, false /* reverse */)
	if err != nil {
		return err
	}
	if len(rs) != 1 {
		// Regardless of whether ranges were merged, we're guaranteed one answer.
		//
		// TODO(knz): we should really have a separate type for assertion
		// errors that trigger telemetry, like
		// errors.AssertionFailedf() does.
		return errors.Errorf("expected 1 range descriptor, got %d", len(rs))
	}
	replyDesc := rs[0]

	// Now check whether the replica is meant to still exist.
	// Maybe it was deleted "under us" by being moved.
	currentDesc, currentMember := replyDesc.GetReplicaDescriptor(repl.store.StoreID())
	sameRange := desc.RangeID == replyDesc.RangeID
	if sameRange && currentMember {
		// This replica is a current member of the raft group. Set the last replica
		// GC check time to avoid re-processing for another check interval.
		//
		// TODO(tschottdorf): should keep stats in particular on this outcome
		// but also on how good a job the queue does at inspecting every
		// Replica (see #8111) when inactive ones can be starved by
		// event-driven additions.
		log.VEventf(ctx, 1, "not gc'able, replica is still in range descriptor: %v", currentDesc)
		if err := repl.setLastReplicaGCTimestamp(ctx, repl.store.Clock().Now()); err != nil {
			return err
		}

		// Note that we do not check the replicaID at this point. If our
		// local replica ID is behind the one in the meta descriptor, we
		// could safely delete our local copy, but this would just force
		// the use of a snapshot when catching up to the new replica ID.
		// We don't normally expect to have a *higher* local replica ID
		// than the one in the meta descriptor, but it's possible after
		// recovering with unsafe-remove-dead-replicas.
	} else if sameRange {
		// We are no longer a member of this range, but the range still exists.
		// Clean up our local data.

		if replyDesc.EndKey.Less(desc.EndKey) {
			// The meta records indicate that the range has split but that this
			// replica hasn't processed the split trigger yet. By removing this
			// replica, we're also wiping out the data of what would become the
			// right hand side of the split (which may or may not still have a
			// replica on this store), and will need a Raft snapshot. Even worse,
			// the mechanism introduced in #31875 will artificially delay this
			// snapshot by seconds, during which time the RHS may see more splits
			// and incur more snapshots.
			//
			// TODO(tschottdorf): we can look up the range descriptor for the
			// RHS of the split (by querying with replyDesc.EndKey) and fetch
			// the local replica (which will be uninitialized, i.e. we have to
			// look it up by RangeID) to disable the mechanism in #31875 for it.
			// We should be able to use prefetching unconditionally to have this
			// desc ready whenever we need it.
			//
			// NB: there's solid evidence that this phenomenon can actually lead
			// to a large spike in Raft snapshots early in the life of a cluster
			// (in particular when combined with a restore operation) when the
			// removed replica has many pending splits and thus incurs a Raft
			// snapshot for *each* of them. This typically happens for the last
			// range:
			// [n1,replicaGC,s1,r33/1:/{Table/53/1/3…-Max}] removing replica [...]
			log.Infof(ctx, "removing replica with pending split; will incur Raft snapshot for right hand side")
		}

		rgcq.metrics.RemoveReplicaCount.Inc(1)
		log.VEventf(ctx, 1, "destroying local data")

		nextReplicaID := replyDesc.NextReplicaID
		// Note that this seems racy - we didn't hold any locks between reading
		// the range descriptor above and deciding to remove the replica - but
		// we pass in the NextReplicaID to detect situations in which the
		// replica became "non-gc'able" in the meantime by checking (with raftMu
		// held throughout) whether the replicaID is still smaller than the
		// NextReplicaID. Given non-zero replica IDs don't change, this is only
		// possible if we currently think we're processing a pre-emptive snapshot
		// but discover in RemoveReplica that this range has since been added and
		// knows that.
		if err := repl.store.RemoveReplica(ctx, repl, nextReplicaID, RemoveOptions{
			DestroyData: true,
		}); err != nil {
			return err
		}
	} else {
		// This case is tricky. This range has been merged away, so it is likely
		// that we can GC this replica, but we need to be careful. If this store has
		// a replica of the subsuming range that has not yet applied the merge
		// trigger, we must not GC this replica.
		//
		// We can't just ask our local left neighbor whether it has an unapplied
		// merge, as if it's a slow follower it might not have learned about the
		// merge yet! What we can do, though, is check whether the generation of our
		// local left neighbor matches the generation of its meta2 descriptor. If it
		// is generationally up-to-date, it has applied all splits and merges, and
		// it is thus safe to remove this replica.
		leftRepl := repl.store.lookupPrecedingReplica(desc.StartKey)
		if leftRepl != nil {
			leftDesc := leftRepl.Desc()
			rs, _, err := kv.RangeLookup(ctx, rgcq.db.NonTransactionalSender(), leftDesc.StartKey.AsRawKey(),
				roachpb.CONSISTENT, 0 /* prefetchNum */, false /* reverse */)
			if err != nil {
				return err
			}
			if len(rs) != 1 {
				return errors.Errorf("expected 1 range descriptor, got %d", len(rs))
			}
			if leftReplyDesc := &rs[0]; !leftDesc.Equal(*leftReplyDesc) {
				log.VEventf(ctx, 1, "left neighbor %s not up-to-date with meta descriptor %s; cannot safely GC range yet",
					leftDesc, leftReplyDesc)
				// Chances are that the left replica needs to be GC'd. Since we don't
				// have definitive proof, queue it with a low priority.
				rgcq.AddAsync(ctx, leftRepl, replicaGCPriorityDefault)
				return nil
			}
		}

		// A tombstone is written with a value of mergedTombstoneReplicaID because
		// we know the range to have been merged. See the Merge case of
		// runPreApplyTriggers() for details.
		if err := repl.store.RemoveReplica(ctx, repl, mergedTombstoneReplicaID, RemoveOptions{
			DestroyData: true,
		}); err != nil {
			return err
		}
	}
	return nil
}

func (*replicaGCQueue) timer(_ time.Duration) time.Duration {
	return replicaGCQueueTimerDuration
}

// purgatoryChan returns nil.
func (*replicaGCQueue) purgatoryChan() <-chan time.Time {
	return nil
}
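
// replicaGCShouldQueueSketch is an illustrative sketch, not part of the
// original file: the function name is hypothetical and nothing calls it. It
// shows how replicaGCShouldQueueImpl treats a suspect replica (queued after
// ReplicaGCQueueSuspectTimeout of inactivity, at replicaGCPrioritySuspect)
// versus a healthy voter (ignored until both the last GC check and the last
// activity are more than ReplicaGCQueueInactivityThreshold in the past).
func replicaGCShouldQueueSketch(now hlc.Timestamp) (suspectQueued, healthyQueued bool) {
	// Last GC check one hour ago, last lease activity one minute ago.
	lastCheck := now.Add(-time.Hour.Nanoseconds(), 0)
	lastActivity := now.Add(-time.Minute.Nanoseconds(), 0)

	// Suspect (e.g. a LEARNER or a raft candidate): one minute of inactivity
	// exceeds ReplicaGCQueueSuspectTimeout, so the replica queues.
	suspectQueued, _ = replicaGCShouldQueueImpl(now, lastCheck, lastActivity, true /* isSuspect */)

	// Healthy voter: the last check falls within
	// ReplicaGCQueueInactivityThreshold, so the replica is not queued at all.
	healthyQueued, _ = replicaGCShouldQueueImpl(now, lastCheck, lastActivity, false /* isSuspect */)

	return suspectQueued, healthyQueued // true, false
}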