github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replicate_queue.go

     1  // Copyright 2015 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"fmt"
    17  	"sync/atomic"
    18  	"time"
    19  
    20  	"github.com/cockroachdb/cockroach/pkg/base"
    21  	"github.com/cockroachdb/cockroach/pkg/clusterversion"
    22  	"github.com/cockroachdb/cockroach/pkg/config"
    23  	"github.com/cockroachdb/cockroach/pkg/config/zonepb"
    24  	"github.com/cockroachdb/cockroach/pkg/gossip"
    25  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
    26  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    27  	"github.com/cockroachdb/cockroach/pkg/settings"
    28  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    29  	"github.com/cockroachdb/cockroach/pkg/util/log"
    30  	"github.com/cockroachdb/cockroach/pkg/util/metric"
    31  	"github.com/cockroachdb/cockroach/pkg/util/retry"
    32  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    33  	"github.com/cockroachdb/errors"
    34  	"go.etcd.io/etcd/raft"
    35  )
    36  
    37  const (
    38  	// replicateQueueTimerDuration is the duration between replication of queued
    39  	// replicas.
    40  	replicateQueueTimerDuration = 0 // zero duration to process replication greedily
    41  
    42  	// newReplicaGracePeriod is the amount of time that we allow for a new
    43  	// replica's raft state to catch up to the leader's before we start
    44  	// considering it to be behind for the sake of rebalancing. We choose a
    45  	// large value here because snapshots of large replicas can take a while
    46  	// in high latency clusters, and not allowing enough of a cushion can
    47  	// make rebalance thrashing more likely (#17879).
    48  	newReplicaGracePeriod = 5 * time.Minute
    49  )
    50  
    51  // minLeaseTransferInterval controls how frequently leases can be transferred
    52  // for rebalancing. It does not prevent transferring leases in order to allow
    53  // a replica to be removed from a range.
    54  var minLeaseTransferInterval = settings.RegisterNonNegativeDurationSetting(
    55  	"kv.allocator.min_lease_transfer_interval",
    56  	"controls how frequently leases can be transferred for rebalancing. "+
    57  		"It does not prevent transferring leases in order to allow a "+
    58  		"replica to be removed from a range.",
    59  	1*time.Second,
    60  )
    61  
    62  var (
    63  	metaReplicateQueueAddReplicaCount = metric.Metadata{
    64  		Name:        "queue.replicate.addreplica",
    65  		Help:        "Number of replica additions attempted by the replicate queue",
    66  		Measurement: "Replica Additions",
    67  		Unit:        metric.Unit_COUNT,
    68  	}
    69  	metaReplicateQueueRemoveReplicaCount = metric.Metadata{
    70  		Name:        "queue.replicate.removereplica",
    71  		Help:        "Number of replica removals attempted by the replicate queue (typically in response to a rebalancer-initiated addition)",
    72  		Measurement: "Replica Removals",
    73  		Unit:        metric.Unit_COUNT,
    74  	}
    75  	metaReplicateQueueRemoveDeadReplicaCount = metric.Metadata{
    76  		Name:        "queue.replicate.removedeadreplica",
    77  		Help:        "Number of dead replica removals attempted by the replicate queue (typically in response to a node outage)",
    78  		Measurement: "Replica Removals",
    79  		Unit:        metric.Unit_COUNT,
    80  	}
    81  	metaReplicateQueueRemoveLearnerReplicaCount = metric.Metadata{
    82  		Name:        "queue.replicate.removelearnerreplica",
    83  		Help:        "Number of learner replica removals attempted by the replicate queue (typically due to internal race conditions)",
    84  		Measurement: "Replica Removals",
    85  		Unit:        metric.Unit_COUNT,
    86  	}
    87  	metaReplicateQueueRebalanceReplicaCount = metric.Metadata{
    88  		Name:        "queue.replicate.rebalancereplica",
    89  		Help:        "Number of replica rebalancer-initiated additions attempted by the replicate queue",
    90  		Measurement: "Replica Additions",
    91  		Unit:        metric.Unit_COUNT,
    92  	}
    93  	metaReplicateQueueTransferLeaseCount = metric.Metadata{
    94  		Name:        "queue.replicate.transferlease",
    95  		Help:        "Number of range lease transfers attempted by the replicate queue",
    96  		Measurement: "Lease Transfers",
    97  		Unit:        metric.Unit_COUNT,
    98  	}
    99  )
   100  
   101  // quorumError indicates a retryable error condition which sends replicas being
   102  // processed through the replicate queue into purgatory so that they can be
   103  // retried quickly as soon as nodes come online.
   104  type quorumError struct {
   105  	msg string
   106  }
   107  
   108  func newQuorumError(f string, args ...interface{}) *quorumError {
   109  	return &quorumError{
   110  		msg: fmt.Sprintf(f, args...),
   111  	}
   112  }
   113  
   114  func (e *quorumError) Error() string {
   115  	return e.msg
   116  }
   117  
   118  func (*quorumError) purgatoryErrorMarker() {}
   119  
   120  // ReplicateQueueMetrics is the set of metrics for the replicate queue.
   121  type ReplicateQueueMetrics struct {
   122  	AddReplicaCount           *metric.Counter
   123  	RemoveReplicaCount        *metric.Counter
   124  	RemoveDeadReplicaCount    *metric.Counter
   125  	RemoveLearnerReplicaCount *metric.Counter
   126  	RebalanceReplicaCount     *metric.Counter
   127  	TransferLeaseCount        *metric.Counter
   128  }
   129  
   130  func makeReplicateQueueMetrics() ReplicateQueueMetrics {
   131  	return ReplicateQueueMetrics{
   132  		AddReplicaCount:           metric.NewCounter(metaReplicateQueueAddReplicaCount),
   133  		RemoveReplicaCount:        metric.NewCounter(metaReplicateQueueRemoveReplicaCount),
   134  		RemoveDeadReplicaCount:    metric.NewCounter(metaReplicateQueueRemoveDeadReplicaCount),
   135  		RemoveLearnerReplicaCount: metric.NewCounter(metaReplicateQueueRemoveLearnerReplicaCount),
   136  		RebalanceReplicaCount:     metric.NewCounter(metaReplicateQueueRebalanceReplicaCount),
   137  		TransferLeaseCount:        metric.NewCounter(metaReplicateQueueTransferLeaseCount),
   138  	}
   139  }
   140  
   141  // replicateQueue manages a queue of replicas whose ranges may need a
   142  // replication change (replica addition/removal, rebalance, or lease transfer).
   143  type replicateQueue struct {
   144  	*baseQueue
   145  	metrics           ReplicateQueueMetrics
   146  	allocator         Allocator
   147  	updateChan        chan time.Time
   148  	lastLeaseTransfer atomic.Value // read and written by scanner & queue goroutines
   149  }
   150  
   151  // newReplicateQueue returns a new instance of replicateQueue.
   152  func newReplicateQueue(store *Store, g *gossip.Gossip, allocator Allocator) *replicateQueue {
   153  	rq := &replicateQueue{
   154  		metrics:    makeReplicateQueueMetrics(),
   155  		allocator:  allocator,
   156  		updateChan: make(chan time.Time, 1),
   157  	}
   158  	store.metrics.registry.AddMetricStruct(&rq.metrics)
   159  	rq.baseQueue = newBaseQueue(
   160  		"replicate", rq, store, g,
   161  		queueConfig{
   162  			maxSize:              defaultQueueMaxSize,
   163  			needsLease:           true,
   164  			needsSystemConfig:    true,
   165  			acceptsUnsplitRanges: store.TestingKnobs().ReplicateQueueAcceptsUnsplit,
   166  			// The processing of the replicate queue often needs to send snapshots
   167  			// so we use the raftSnapshotQueueTimeoutFunc. This function sets a
   168  			// timeout based on the range size and the sending rate in addition
   169  			// to consulting the setting which controls the minimum timeout.
   170  			processTimeoutFunc: makeQueueSnapshotTimeoutFunc(rebalanceSnapshotRate),
   171  			successes:          store.metrics.ReplicateQueueSuccesses,
   172  			failures:           store.metrics.ReplicateQueueFailures,
   173  			pending:            store.metrics.ReplicateQueuePending,
   174  			processingNanos:    store.metrics.ReplicateQueueProcessingNanos,
   175  			purgatory:          store.metrics.ReplicateQueuePurgatory,
   176  		},
   177  	)
   178  
   179  	updateFn := func() {
   180  		select {
   181  		case rq.updateChan <- timeutil.Now():
   182  		default:
   183  		}
   184  	}
   185  
   186  	// Register gossip and node liveness callbacks to signal that
   187  	// replicas in purgatory might be retried.
   188  	if g != nil { // gossip is nil for some unittests
   189  		g.RegisterCallback(gossip.MakePrefixPattern(gossip.KeyStorePrefix), func(key string, _ roachpb.Value) {
   190  			if !rq.store.IsStarted() {
   191  				return
   192  			}
   193  			// Because updates to our store's own descriptor won't affect
   194  			// replicas in purgatory, skip updating the purgatory channel
   195  			// in this case.
   196  			if storeID, err := gossip.StoreIDFromKey(key); err == nil && storeID == rq.store.StoreID() {
   197  				return
   198  			}
   199  			updateFn()
   200  		})
   201  	}
   202  	if nl := store.cfg.NodeLiveness; nl != nil { // node liveness is nil for some unittests
   203  		nl.RegisterCallback(func(_ roachpb.NodeID) {
   204  			updateFn()
   205  		})
   206  	}
   207  
   208  	return rq
   209  }
   210  
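        // shouldQueue determines whether the range needs a replication change
        // and, if so, returns true along with the allocator's priority for that
        // change. It also enqueues (at priority zero) when a rebalance target
        // exists or when the lease should be transferred.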
   211  func (rq *replicateQueue) shouldQueue(
   212  	ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg *config.SystemConfig,
   213  ) (shouldQ bool, priority float64) {
   214  	desc, zone := repl.DescAndZone()
   215  	action, priority := rq.allocator.ComputeAction(ctx, zone, desc)
   216  
   217  	// For simplicity, the first thing the allocator does is remove learners, so
   218  	// that it can reason entirely in terms of voters. We do the same here so that
   219  	// the execution of the allocator's decisions can also be in terms of voters.
   220  	if action == AllocatorRemoveLearner {
   221  		return true, priority
   222  	}
   223  	voterReplicas := desc.Replicas().Voters()
   224  
   225  	if action == AllocatorNoop {
   226  		log.VEventf(ctx, 2, "no action to take")
   227  		return false, 0
   228  	} else if action != AllocatorConsiderRebalance {
   229  		log.VEventf(ctx, 2, "repair needed (%s), enqueuing", action)
   230  		return true, priority
   231  	}
   232  
   233  	if !rq.store.TestingKnobs().DisableReplicaRebalancing {
   234  		rangeUsageInfo := rangeUsageInfoForRepl(repl)
   235  		_, _, _, ok := rq.allocator.RebalanceTarget(
   236  			ctx, zone, repl.RaftStatus(), voterReplicas, rangeUsageInfo, storeFilterThrottled)
   237  		if ok {
   238  			log.VEventf(ctx, 2, "rebalance target found, enqueuing")
   239  			return true, 0
   240  		}
   241  		log.VEventf(ctx, 2, "no rebalance target found, not enqueuing")
   242  	}
   243  
   244  	// If the lease is valid, check to see if we should transfer it.
   245  	if lease, _ := repl.GetLease(); repl.IsLeaseValid(lease, now) {
   246  		if rq.canTransferLease() &&
   247  			rq.allocator.ShouldTransferLease(
   248  				ctx, zone, voterReplicas, lease.Replica.StoreID, repl.leaseholderStats) {
   249  			log.VEventf(ctx, 2, "lease transfer needed, enqueuing")
   250  			return true, 0
   251  		}
   252  	}
   253  
   254  	return false, 0
   255  }
   256  
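        // process repeatedly applies replication changes to the replica's range
        // until none are needed, backing off and retrying when a change fails
        // because a snapshot could not be sent.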
   257  func (rq *replicateQueue) process(
   258  	ctx context.Context, repl *Replica, sysCfg *config.SystemConfig,
   259  ) error {
   260  	retryOpts := retry.Options{
   261  		InitialBackoff: 50 * time.Millisecond,
   262  		MaxBackoff:     1 * time.Second,
   263  		Multiplier:     2,
   264  		MaxRetries:     5,
   265  	}
   266  
   267  	// Use a retry loop in order to back off in the case of snapshot errors,
   268  	// usually signaling that a rebalancing reservation could not be made with the
   269  	// selected target.
   270  	for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
   271  		for {
   272  			requeue, err := rq.processOneChange(ctx, repl, rq.canTransferLease, false /* dryRun */)
   273  			if IsSnapshotError(err) {
   274  				// If ChangeReplicas failed because the snapshot failed, we log the
   275  				// error but then return success indicating we should retry the
   276  				// operation. The most likely causes of the snapshot failing are a
   277  				// declined reservation or the remote node being unavailable. In either
   278  				// case we don't want to wait another scanner cycle before reconsidering
   279  				// the range.
   280  				log.Infof(ctx, "%v", err)
   281  				break
   282  			}
   283  
   284  			if err != nil {
   285  				return err
   286  			}
   287  
   288  			if testingAggressiveConsistencyChecks {
   289  				if err := rq.store.consistencyQueue.process(ctx, repl, sysCfg); err != nil {
   290  					log.Warningf(ctx, "%v", err)
   291  				}
   292  			}
   293  
   294  			if !requeue {
   295  				return nil
   296  			}
   297  
   298  			log.VEventf(ctx, 1, "re-processing")
   299  		}
   300  	}
   301  
   302  	return errors.Errorf("failed to replicate after %d retries", retryOpts.MaxRetries)
   303  }
   304  
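        // processOneChange computes and executes one replication change (addition,
        // removal, replacement, rebalance, or lease transfer) for the range, as
        // determined by the allocator. The returned requeue flag indicates whether
        // the range should be processed again to look for further work. If dryRun
        // is set, no change is actually carried out.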
   305  func (rq *replicateQueue) processOneChange(
   306  	ctx context.Context, repl *Replica, canTransferLease func() bool, dryRun bool,
   307  ) (requeue bool, _ error) {
   308  	// Check lease and destroy status here. The queue does this higher up already, but
   309  	// adminScatter (and potentially other future callers) also call this method and don't
   310  	// perform this check, which could lead to infinite loops.
   311  	if _, err := repl.IsDestroyed(); err != nil {
   312  		return false, err
   313  	}
   314  	if _, pErr := repl.redirectOnOrAcquireLease(ctx); pErr != nil {
   315  		return false, pErr.GoError()
   316  	}
   317  
   318  	desc, zone := repl.DescAndZone()
   319  
   320  	// Avoid taking action if the range has too many dead replicas to make
   321  	// quorum.
   322  	voterReplicas := desc.Replicas().Voters()
   323  	liveVoterReplicas, deadVoterReplicas := rq.allocator.storePool.liveAndDeadReplicas(voterReplicas)
   324  	{
   325  		unavailable := !desc.Replicas().CanMakeProgress(func(rDesc roachpb.ReplicaDescriptor) bool {
   326  			for _, inner := range liveVoterReplicas {
   327  				if inner.ReplicaID == rDesc.ReplicaID {
   328  					return true
   329  				}
   330  			}
   331  			return false
   332  		})
   333  		if unavailable {
   334  			return false, newQuorumError(
   335  				"range requires a replication change, but live replicas %v don't constitute a quorum for %v",
   336  				liveVoterReplicas,
   337  				desc.Replicas().All(),
   338  			)
   339  		}
   340  	}
   341  
   342  	action, _ := rq.allocator.ComputeAction(ctx, zone, desc)
   343  	log.VEventf(ctx, 1, "next replica action: %s", action)
   344  
   345  	// For simplicity, the first thing the allocator does is remove learners, so
   346  	// that it can reason entirely in terms of voters. We do the same here so that
   347  	// the execution of the allocator's decisions can also be in terms of voters.
   348  	if action == AllocatorRemoveLearner {
   349  		return rq.removeLearner(ctx, repl, dryRun)
   350  	}
   351  
   352  	switch action {
   353  	case AllocatorNoop, AllocatorRangeUnavailable:
   354  		// We're either missing liveness information or the range is known to have
   355  		// lost quorum. Either way, it's not a good idea to make changes right now.
   356  		// Let the scanner requeue it again later.
   357  		return false, nil
   358  	case AllocatorAdd:
   359  		return rq.addOrReplace(ctx, repl, voterReplicas, liveVoterReplicas, -1 /* removeIdx */, dryRun)
   360  	case AllocatorRemove:
   361  		return rq.remove(ctx, repl, voterReplicas, dryRun)
   362  	case AllocatorReplaceDead:
   363  		if len(deadVoterReplicas) == 0 {
   364  			// Nothing to do.
   365  			return false, nil
   366  		}
   367  		removeIdx := -1 // guaranteed to be changed below
   368  		for i, rDesc := range voterReplicas {
   369  			if rDesc.StoreID == deadVoterReplicas[0].StoreID {
   370  				removeIdx = i
   371  				break
   372  			}
   373  		}
   374  		if removeIdx < 0 {
   375  			return false, errors.AssertionFailedf(
   376  				"dead voter %v unexpectedly not found in %v",
   377  				deadVoterReplicas[0], voterReplicas)
   378  		}
   379  		return rq.addOrReplace(ctx, repl, voterReplicas, liveVoterReplicas, removeIdx, dryRun)
   380  	case AllocatorReplaceDecommissioning:
   381  		decommissioningReplicas := rq.allocator.storePool.decommissioningReplicas(voterReplicas)
   382  		if len(decommissioningReplicas) == 0 {
   383  			// Nothing to do.
   384  			return false, nil
   385  		}
   386  		removeIdx := -1 // guaranteed to be changed below
   387  		for i, rDesc := range voterReplicas {
   388  			if rDesc.StoreID == decommissioningReplicas[0].StoreID {
   389  				removeIdx = i
   390  				break
   391  			}
   392  		}
   393  		if removeIdx < 0 {
   394  			return false, errors.AssertionFailedf(
   395  				"decommissioning voter %v unexpectedly not found in %v",
   396  				decommissioningReplicas[0], voterReplicas)
   397  		}
   398  		return rq.addOrReplace(ctx, repl, voterReplicas, liveVoterReplicas, removeIdx, dryRun)
   399  	case AllocatorRemoveDecommissioning:
   400  		// NB: this path will only be hit when the range is over-replicated and
   401  		// has decommissioning replicas; in the common case we'll hit
   402  		// AllocatorReplaceDecommissioning above.
   403  		return rq.removeDecommissioning(ctx, repl, dryRun)
   404  	case AllocatorRemoveDead:
   405  		// NB: this path will only be hit when the range is over-replicated and
   406  		// has dead replicas; in the common case we'll hit AllocatorReplaceDead
   407  		// above.
   408  		return rq.removeDead(ctx, repl, deadVoterReplicas, dryRun)
   409  	case AllocatorRemoveLearner:
   410  		return rq.removeLearner(ctx, repl, dryRun)
   411  	case AllocatorConsiderRebalance:
   412  		return rq.considerRebalance(ctx, repl, voterReplicas, canTransferLease, dryRun)
   413  	case AllocatorFinalizeAtomicReplicationChange:
   414  		_, err := maybeLeaveAtomicChangeReplicasAndRemoveLearners(ctx, repl.store, repl.Desc())
   415  		// Requeue because either we failed to transition out of a joint state
   416  		// (bad) or we did and there might be more to do for that range.
   417  		return true, err
   418  	default:
   419  		return false, errors.Errorf("unknown allocator action %v", action)
   420  	}
   421  }
   422  
   423  // addOrReplace adds or replaces a replica. If removeIdx is -1, an addition is
   424  // carried out. Otherwise, removeIdx must be a valid index into existingReplicas
   425  // and specifies which replica to replace with a new one.
   426  //
   427  // The method preferably issues an atomic replica swap, but may not be able to
   428  // do this in all cases, such as when atomic replication changes are not
   429  // available, or when the range consists of a single replica. As a fallback,
   430  // only the addition is carried out; the removal is then a follow-up step for
   431  // the next scanner cycle.
   432  func (rq *replicateQueue) addOrReplace(
   433  	ctx context.Context,
   434  	repl *Replica,
   435  	existingReplicas []roachpb.ReplicaDescriptor,
   436  	liveVoterReplicas []roachpb.ReplicaDescriptor,
   437  	removeIdx int, // -1 for no removal
   438  	dryRun bool,
   439  ) (requeue bool, _ error) {
   440  	if len(existingReplicas) == 1 {
   441  		// If only one replica remains, that replica is the leaseholder and
   442  		// we won't be able to swap it out. Ignore the removal and simply add
   443  		// a replica.
   444  		removeIdx = -1
   445  	}
   446  	st := rq.store.cfg.Settings
   447  	if !st.Version.IsActive(ctx, clusterversion.VersionAtomicChangeReplicas) {
   448  		// If we can't swap yet, don't.
   449  		removeIdx = -1
   450  	}
   451  
   452  	remainingLiveReplicas := liveVoterReplicas
   453  	if removeIdx >= 0 {
   454  		replToRemove := existingReplicas[removeIdx]
   455  		for i, r := range liveVoterReplicas {
   456  			if r.ReplicaID == replToRemove.ReplicaID {
   457  				remainingLiveReplicas = append(liveVoterReplicas[:i:i], liveVoterReplicas[i+1:]...)
   458  				break
   459  			}
   460  		}
   461  		// See about transferring the lease away if we're about to remove the
   462  		// leaseholder.
   463  		done, err := rq.maybeTransferLeaseAway(ctx, repl, existingReplicas[removeIdx].StoreID, dryRun)
   464  		if err != nil {
   465  			return false, err
   466  		}
   467  		if done {
   468  			// Lease was transferred away. Next leaseholder is going to take over.
   469  			return false, nil
   470  		}
   471  	}
   472  
   473  	desc, zone := repl.DescAndZone()
   474  	// Allocate a target assuming that the replica we're replacing (if any) is
   475  	// already gone. The allocator should not try to re-add this replica since
   476  	// there is a reason we're removing it (i.e. dead or decommissioning). If we
   477  	// left the replica in the slice, the allocator would not be guaranteed to
   478  	// pick a replica that fills the gap left by the removed replica.
   479  	newStore, details, err := rq.allocator.AllocateTarget(
   480  		ctx,
   481  		zone,
   482  		remainingLiveReplicas,
   483  	)
   484  	if err != nil {
   485  		return false, err
   486  	}
   487  	if removeIdx >= 0 && newStore.StoreID == existingReplicas[removeIdx].StoreID {
   488  		return false, errors.AssertionFailedf("allocator suggested to replace replica on s%d with itself", newStore.StoreID)
   489  	}
   490  	newReplica := roachpb.ReplicationTarget{
   491  		NodeID:  newStore.Node.NodeID,
   492  		StoreID: newStore.StoreID,
   493  	}
   494  
   495  	clusterNodes := rq.allocator.storePool.ClusterNodeCount()
   496  	need := GetNeededReplicas(*zone.NumReplicas, clusterNodes)
   497  
   498  	// Only up-replicate if there are suitable allocation targets such that
   499  	// either the replication goal is met, or it is possible to get to the next
   500  	// odd number of replicas. A consensus group of size 2n has worse failure
   501  	// tolerance properties than a group of size 2n - 1 because it has a larger
   502  	// quorum. For example, up-replicating from 1 to 2 replicas only makes sense
   503  	// if it is then possible to go on to 3 replicas.
   504  	//
   505  	// NB: If willHave > need, then always allow up-replicating as that
   506  	// will be the case when up-replicating a range with a decommissioning
   507  	// replica.
   508  	//
   509  	// We skip this check if we're swapping a replica, since that does not
   510  	// change the quorum size.
   511  	if willHave := len(existingReplicas) + 1; removeIdx < 0 && willHave < need && willHave%2 == 0 {
   512  		// This means we are going to up-replicate to an even replica state.
   513  		// Check if it is possible to go to an odd replica state beyond it.
   514  		oldPlusNewReplicas := append([]roachpb.ReplicaDescriptor(nil), existingReplicas...)
   515  		oldPlusNewReplicas = append(oldPlusNewReplicas, roachpb.ReplicaDescriptor{
   516  			NodeID:  newStore.Node.NodeID,
   517  			StoreID: newStore.StoreID,
   518  		})
   519  		_, _, err := rq.allocator.AllocateTarget(
   520  			ctx,
   521  			zone,
   522  			oldPlusNewReplicas,
   523  		)
   524  		if err != nil {
   525  			// It does not seem possible to go to the next odd replica state. Note
   526  			// that AllocateTarget returns an allocatorError (a purgatoryError)
   527  			// when purgatory is requested.
   528  			return false, errors.Wrap(err, "avoid up-replicating to fragile quorum")
   529  		}
   530  	}
   531  	rq.metrics.AddReplicaCount.Inc(1)
   532  	ops := roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, newReplica)
   533  	if removeIdx < 0 {
   534  		log.VEventf(ctx, 1, "adding replica %+v: %s",
   535  			newReplica, rangeRaftProgress(repl.RaftStatus(), existingReplicas))
   536  	} else {
   537  		rq.metrics.RemoveReplicaCount.Inc(1)
   538  		removeReplica := existingReplicas[removeIdx]
   539  		log.VEventf(ctx, 1, "replacing replica %s with %+v: %s",
   540  			removeReplica, newReplica, rangeRaftProgress(repl.RaftStatus(), existingReplicas))
   541  		ops = append(ops,
   542  			roachpb.MakeReplicationChanges(roachpb.REMOVE_REPLICA, roachpb.ReplicationTarget{
   543  				StoreID: removeReplica.StoreID,
   544  				NodeID:  removeReplica.NodeID,
   545  			})...)
   546  	}
   547  
   548  	if err := rq.changeReplicas(
   549  		ctx,
   550  		repl,
   551  		ops,
   552  		desc,
   553  		SnapshotRequest_RECOVERY,
   554  		kvserverpb.ReasonRangeUnderReplicated,
   555  		details,
   556  		dryRun,
   557  	); err != nil {
   558  		return false, err
   559  	}
   560  	// Always requeue to see if more work needs to be done.
   561  	return true, nil
   562  }
   563  
   564  // findRemoveTarget takes a list of replicas and picks one to remove, making
   565  // sure to not remove a newly added replica or to violate the zone configs in
   566  // the process.
   567  func (rq *replicateQueue) findRemoveTarget(
   568  	ctx context.Context,
   569  	repl interface {
   570  		DescAndZone() (*roachpb.RangeDescriptor, *zonepb.ZoneConfig)
   571  		LastReplicaAdded() (roachpb.ReplicaID, time.Time)
   572  		RaftStatus() *raft.Status
   573  	},
   574  	existingReplicas []roachpb.ReplicaDescriptor,
   575  ) (roachpb.ReplicaDescriptor, string, error) {
   576  	_, zone := repl.DescAndZone()
   577  	// This retry loop involves quick operations on local state, so a
   578  	// small MaxBackoff is good (but those local variables change on
   579  	// network time scales as raft receives responses).
   580  	//
   581  	// TODO(bdarnell): There's another retry loop at process(). It
   582  	// would be nice to combine these, but I'm keeping them separate
   583  	// for now so we can tune the options separately.
   584  	retryOpts := retry.Options{
   585  		InitialBackoff: time.Millisecond,
   586  		MaxBackoff:     200 * time.Millisecond,
   587  		Multiplier:     2,
   588  	}
   589  
   590  	var candidates []roachpb.ReplicaDescriptor
   591  	deadline := timeutil.Now().Add(2 * base.NetworkTimeout)
   592  	for r := retry.StartWithCtx(ctx, retryOpts); r.Next() && timeutil.Now().Before(deadline); {
   593  		lastReplAdded, lastAddedTime := repl.LastReplicaAdded()
   594  		if timeutil.Since(lastAddedTime) > newReplicaGracePeriod {
   595  			lastReplAdded = 0
   596  		}
   597  		raftStatus := repl.RaftStatus()
   598  		if raftStatus == nil || raftStatus.RaftState != raft.StateLeader {
   599  			// If we've lost raft leadership, we're unlikely to regain it so give up immediately.
   600  			return roachpb.ReplicaDescriptor{}, "", &benignError{errors.Errorf("not raft leader while range needs removal")}
   601  		}
   602  		candidates = filterUnremovableReplicas(ctx, raftStatus, existingReplicas, lastReplAdded)
   603  		log.VEventf(ctx, 3, "filtered unremovable replicas from %v to get %v as candidates for removal: %s",
   604  			existingReplicas, candidates, rangeRaftProgress(raftStatus, existingReplicas))
   605  		if len(candidates) > 0 {
   606  			break
   607  		}
   608  		if len(raftStatus.Progress) <= 2 {
   609  			// HACK(bdarnell): Downreplicating to a single node from
   610  			// multiple nodes is not really supported. There are edge
   611  			// cases in which the two peers stop communicating with each
   612  			// other too soon and we don't reach a satisfactory
   613  			// resolution. However, some tests (notably
   614  			// TestRepartitioning) get into this state, and if the
   615  			// replication queue spends its entire timeout waiting for the
   616  			// downreplication to finish the test will time out. As a
   617  			// hack, just fail-fast when we're trying to go down to a
   618  			// single replica.
   619  			break
   620  		}
   621  		// After upreplication, the candidates for removal could still
   622  		// be catching up. The allocator determined that the range was
   623  		// over-replicated, and it's important to clear that state as
   624  		// quickly as we can (because over-replicated ranges may be
   625  		// under-diversified). If we return an error here, this range
   626  		// probably won't be processed again until the next scanner
   627  		// cycle, which is too long, so we retry here.
   628  	}
   629  	if len(candidates) == 0 {
   630  		// If we timed out and still don't have any valid candidates, give up.
   631  		return roachpb.ReplicaDescriptor{}, "", &benignError{errors.Errorf("no removable replicas from range that needs a removal: %s",
   632  			rangeRaftProgress(repl.RaftStatus(), existingReplicas))}
   633  	}
   634  
   635  	return rq.allocator.RemoveTarget(ctx, zone, candidates, existingReplicas)
   636  }
   637  
   638  // maybeTransferLeaseAway is called whenever a replica on a given store is
   639  // slated for removal. If the store corresponds to the store of the caller
   640  // (which is very likely to be the leaseholder), then this removal would fail.
   641  // Instead, this method will attempt to transfer the lease away, and returns
   642  // true to indicate to the caller that it should not pursue the current
   643  // replication change further because it is no longer the leaseholder. When the
   644  // returned bool is false, it should continue. On error, the caller should also
   645  // stop.
   646  func (rq *replicateQueue) maybeTransferLeaseAway(
   647  	ctx context.Context, repl *Replica, removeStoreID roachpb.StoreID, dryRun bool,
   648  ) (done bool, _ error) {
   649  	if removeStoreID != repl.store.StoreID() {
   650  		return false, nil
   651  	}
   652  	desc, zone := repl.DescAndZone()
   653  	// The local replica was selected as the removal target, but that replica
   654  	// is the leaseholder, so transfer the lease instead. We don't check that
   655  	// the current store has too many leases in this case under the
   656  	// assumption that replica balance is a greater concern. Also note that
   657  	// AllocatorRemove action takes preference over AllocatorConsiderRebalance
   658  	// (rebalancing) which is where lease transfer would otherwise occur. We
   659  	// need to be able to transfer leases in AllocatorRemove in order to get
   660  	// out of situations where this store is overfull and yet holds all the
   661  	// leases. The fullness checks need to be ignored for cases where
   662  	// a replica needs to be removed for constraint violations.
   663  	return rq.findTargetAndTransferLease(
   664  		ctx,
   665  		repl,
   666  		desc,
   667  		zone,
   668  		transferLeaseOptions{
   669  			dryRun: dryRun,
   670  		},
   671  	)
   672  }
   673  
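        // remove removes a replica from an over-replicated range. If the chosen
        // removal target is the local (leaseholder) store, the lease is
        // transferred away instead.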
   674  func (rq *replicateQueue) remove(
   675  	ctx context.Context, repl *Replica, existingReplicas []roachpb.ReplicaDescriptor, dryRun bool,
   676  ) (requeue bool, _ error) {
   677  	removeReplica, details, err := rq.findRemoveTarget(ctx, repl, existingReplicas)
   678  	if err != nil {
   679  		return false, err
   680  	}
   681  	done, err := rq.maybeTransferLeaseAway(ctx, repl, removeReplica.StoreID, dryRun)
   682  	if err != nil {
   683  		return false, err
   684  	}
   685  	if done {
   686  		// Lease is now elsewhere, so we're not in charge any more.
   687  		return false, nil
   688  	}
   689  
   690  	// Remove a replica.
   691  	rq.metrics.RemoveReplicaCount.Inc(1)
   692  	log.VEventf(ctx, 1, "removing replica %+v due to over-replication: %s",
   693  		removeReplica, rangeRaftProgress(repl.RaftStatus(), existingReplicas))
   694  	target := roachpb.ReplicationTarget{
   695  		NodeID:  removeReplica.NodeID,
   696  		StoreID: removeReplica.StoreID,
   697  	}
   698  	desc, _ := repl.DescAndZone()
   699  	if err := rq.changeReplicas(
   700  		ctx,
   701  		repl,
   702  		roachpb.MakeReplicationChanges(roachpb.REMOVE_REPLICA, target),
   703  		desc,
   704  		SnapshotRequest_UNKNOWN, // unused
   705  		kvserverpb.ReasonRangeOverReplicated,
   706  		details,
   707  		dryRun,
   708  	); err != nil {
   709  		return false, err
   710  	}
   711  	return true, nil
   712  }
   713  
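        // removeDecommissioning removes a replica that lives on a decommissioning
        // store, transferring the lease away first if that replica is the local
        // one.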
   714  func (rq *replicateQueue) removeDecommissioning(
   715  	ctx context.Context, repl *Replica, dryRun bool,
   716  ) (requeue bool, _ error) {
   717  	desc, _ := repl.DescAndZone()
   718  	decommissioningReplicas := rq.allocator.storePool.decommissioningReplicas(desc.Replicas().All())
   719  	if len(decommissioningReplicas) == 0 {
   720  		log.VEventf(ctx, 1, "range of replica %s was identified as having decommissioning replicas, "+
   721  			"but no decommissioning replicas were found", repl)
   722  		return true, nil
   723  	}
   724  	decommissioningReplica := decommissioningReplicas[0]
   725  	done, err := rq.maybeTransferLeaseAway(ctx, repl, decommissioningReplica.StoreID, dryRun)
   726  	if err != nil {
   727  		return false, err
   728  	}
   729  	if done {
   730  		// Not leaseholder any more.
   731  		return false, nil
   732  	}
   733  	// Remove the decommissioning replica.
   734  	rq.metrics.RemoveReplicaCount.Inc(1)
   735  	log.VEventf(ctx, 1, "removing decommissioning replica %+v from store", decommissioningReplica)
   736  	target := roachpb.ReplicationTarget{
   737  		NodeID:  decommissioningReplica.NodeID,
   738  		StoreID: decommissioningReplica.StoreID,
   739  	}
   740  	if err := rq.changeReplicas(
   741  		ctx,
   742  		repl,
   743  		roachpb.MakeReplicationChanges(roachpb.REMOVE_REPLICA, target),
   744  		desc,
   745  		SnapshotRequest_UNKNOWN, // unused
   746  		kvserverpb.ReasonStoreDecommissioning, "", dryRun,
   747  	); err != nil {
   748  		return false, err
   749  	}
   750  	// We removed a replica, so check if there's more to do.
   751  	return true, nil
   752  }
   753  
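        // removeDead removes the first of the given dead voter replicas from the
        // range.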
   754  func (rq *replicateQueue) removeDead(
   755  	ctx context.Context, repl *Replica, deadVoterReplicas []roachpb.ReplicaDescriptor, dryRun bool,
   756  ) (requeue bool, _ error) {
   757  	desc := repl.Desc()
   758  	if len(deadVoterReplicas) == 0 {
   759  		log.VEventf(ctx, 1, "range of replica %s was identified as having dead replicas, but no dead replicas were found", repl)
   760  		return true, nil
   761  	}
   762  	deadReplica := deadVoterReplicas[0]
   763  	rq.metrics.RemoveDeadReplicaCount.Inc(1)
   764  	log.VEventf(ctx, 1, "removing dead replica %+v from store", deadReplica)
   765  	target := roachpb.ReplicationTarget{
   766  		NodeID:  deadReplica.NodeID,
   767  		StoreID: deadReplica.StoreID,
   768  	}
   769  	// NB: we don't check whether to transfer the lease away because if the removal target
   770  	// is dead, it's not us (and if for some reason that happens, the removal is simply
   771  	// going to fail).
   772  	if err := rq.changeReplicas(
   773  		ctx,
   774  		repl,
   775  		roachpb.MakeReplicationChanges(roachpb.REMOVE_REPLICA, target),
   776  		desc,
   777  		SnapshotRequest_UNKNOWN, // unused
   778  		kvserverpb.ReasonStoreDead,
   779  		"",
   780  		dryRun,
   781  	); err != nil {
   782  		return false, err
   783  	}
   784  	return true, nil
   785  }
   786  
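        // removeLearner removes the first learner replica found on the range.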
   787  func (rq *replicateQueue) removeLearner(
   788  	ctx context.Context, repl *Replica, dryRun bool,
   789  ) (requeue bool, _ error) {
   790  	desc := repl.Desc()
   791  	learnerReplicas := desc.Replicas().Learners()
   792  	if len(learnerReplicas) == 0 {
   793  		log.VEventf(ctx, 1, "range of replica %s was identified as having learner replicas, "+
   794  			"but no learner replicas were found", repl)
   795  		return true, nil
   796  	}
   797  	learnerReplica := learnerReplicas[0]
   798  	rq.metrics.RemoveLearnerReplicaCount.Inc(1)
   799  	log.VEventf(ctx, 1, "removing learner replica %+v from store", learnerReplica)
   800  	target := roachpb.ReplicationTarget{
   801  		NodeID:  learnerReplica.NodeID,
   802  		StoreID: learnerReplica.StoreID,
   803  	}
   804  	// NB: we don't check whether to transfer the lease away because we're very unlikely
   805  	// to be the learner (and if so, we don't have the lease any more, so after the removal
   806  	// fails the situation will have rectified itself).
   807  	if err := rq.changeReplicas(
   808  		ctx,
   809  		repl,
   810  		roachpb.MakeReplicationChanges(roachpb.REMOVE_REPLICA, target),
   811  		desc,
   812  		SnapshotRequest_UNKNOWN,
   813  		kvserverpb.ReasonAbandonedLearner,
   814  		"",
   815  		dryRun,
   816  	); err != nil {
   817  		return false, err
   818  	}
   819  	return true, nil
   820  }
   821  
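        // considerRebalance looks for a rebalance opportunity for the range and
        // carries it out if one is found. Otherwise, it considers transferring
        // the range lease to a better-suited store. It returns requeue=true only
        // if a replication change was carried out.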
   822  func (rq *replicateQueue) considerRebalance(
   823  	ctx context.Context,
   824  	repl *Replica,
   825  	existingReplicas []roachpb.ReplicaDescriptor,
   826  	canTransferLease func() bool,
   827  	dryRun bool,
   828  ) (requeue bool, _ error) {
   829  	desc, zone := repl.DescAndZone()
   830  	// This replica was queued to check for a rebalance opportunity
   831  	// (AllocatorConsiderRebalance). Attempt to find a rebalancing target.
   832  	if !rq.store.TestingKnobs().DisableReplicaRebalancing {
   833  		rangeUsageInfo := rangeUsageInfoForRepl(repl)
   834  		addTarget, removeTarget, details, ok := rq.allocator.RebalanceTarget(
   835  			ctx, zone, repl.RaftStatus(), existingReplicas, rangeUsageInfo,
   836  			storeFilterThrottled)
   837  		if !ok {
   838  			log.VEventf(ctx, 1, "no suitable rebalance target")
   839  		} else if done, err := rq.maybeTransferLeaseAway(ctx, repl, removeTarget.StoreID, dryRun); err != nil {
   840  			log.VEventf(ctx, 1, "want to remove self, but failed to transfer lease away: %s", err)
   841  		} else if done {
   842  			// Lease is now elsewhere, so we're not in charge any more.
   843  			return false, nil
   844  		} else {
   845  			// We have a replica to remove and one we can add, so let's swap them
   846  			// out.
   847  			chgs := []roachpb.ReplicationChange{
   848  				// NB: we place the addition first because in the case of
   849  				// atomic replication changes being turned off, the changes
   850  				// will be executed individually in the order in which they
   851  				// appear.
   852  				{Target: addTarget, ChangeType: roachpb.ADD_REPLICA},
   853  				{Target: removeTarget, ChangeType: roachpb.REMOVE_REPLICA},
   854  			}
   855  
   856  			if len(existingReplicas) == 1 {
   857  				// If there's only one replica, the removal target is the
   858  				// leaseholder and this is unsupported and will fail. However,
   859  				// this is also the only way to rebalance in a single-replica
   860  				// range. If we try the atomic swap here, we'll fail doing
   861  				// nothing, and so we stay locked into the current distribution
   862  				// of replicas. (Note that maybeTransferLeaseAway above will not
   863  				// have found a target, and so will have returned (false, nil).)
   864  				//
   865  				// Do the best thing we can, which is carry out the addition
   866  				// only, which should succeed, and the next time we touch this
   867  				// range, we will have one more replica and hopefully it will
   868  				// take the lease and remove the current leaseholder.
   869  				//
   870  				// It's possible that "rebalancing deadlock" can occur in other
   871  				// scenarios; it's really impossible to tell from the code given
   872  				// the constraints we support. However, the lease transfer often
   873  				// does not happen spuriously, and we can't enter dangerous
   874  				// configurations sporadically, so this code path is only hit
   875  				// when we know it's necessary, picking the smaller of two evils.
   876  				//
   877  				// See https://github.com/cockroachdb/cockroach/issues/40333.
   878  				chgs = chgs[:1]
   879  				log.VEventf(ctx, 1, "can't swap replica due to lease; falling back to add")
   880  			}
   881  
   882  			rq.metrics.RebalanceReplicaCount.Inc(1)
   883  			log.VEventf(ctx, 1, "rebalancing %+v to %+v: %s",
   884  				removeTarget, addTarget, rangeRaftProgress(repl.RaftStatus(), existingReplicas))
   885  
   886  			if err := rq.changeReplicas(
   887  				ctx,
   888  				repl,
   889  				chgs,
   890  				desc,
   891  				SnapshotRequest_REBALANCE,
   892  				kvserverpb.ReasonRebalance,
   893  				details,
   894  				dryRun,
   895  			); err != nil {
   896  				return false, err
   897  			}
   898  			return true, nil
   899  		}
   900  	}
   901  
   902  	if canTransferLease() {
   903  		// We require the lease in order to process replicas, so
   904  		// repl.store.StoreID() corresponds to the lease-holder's store ID.
   905  		transferred, err := rq.findTargetAndTransferLease(
   906  			ctx,
   907  			repl,
   908  			desc,
   909  			zone,
   910  			transferLeaseOptions{
   911  				checkTransferLeaseSource: true,
   912  				checkCandidateFullness:   true,
   913  				dryRun:                   dryRun,
   914  			},
   915  		)
   916  		if err != nil {
   917  			return false, err
   918  		}
   919  		// Do not requeue as we transferred our lease away.
   920  		if transferred {
   921  			return false, nil
   922  		}
   923  	}
   924  
   925  	// No action was necessary and no rebalance target was found. Return
   926  	// without re-queuing this replica.
   927  	return false, nil
   928  }
   929  
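        // transferLeaseOptions controls the behavior of
        // findTargetAndTransferLease.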
   930  type transferLeaseOptions struct {
   931  	checkTransferLeaseSource bool
   932  	checkCandidateFullness   bool
   933  	dryRun                   bool
   934  }
   935  
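        // findTargetAndTransferLease asks the allocator for a suitable lease
        // transfer target among the range's voters and, if one is found and this
        // is not a dry run, transfers the lease to it. The returned bool indicates
        // whether the lease was transferred.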
   936  func (rq *replicateQueue) findTargetAndTransferLease(
   937  	ctx context.Context,
   938  	repl *Replica,
   939  	desc *roachpb.RangeDescriptor,
   940  	zone *zonepb.ZoneConfig,
   941  	opts transferLeaseOptions,
   942  ) (bool, error) {
   943  	// Learner replicas aren't allowed to become the leaseholder or raft leader,
   944  	// so only consider the `Voters` replicas.
   945  	target := rq.allocator.TransferLeaseTarget(
   946  		ctx,
   947  		zone,
   948  		desc.Replicas().Voters(),
   949  		repl.store.StoreID(),
   950  		repl.leaseholderStats,
   951  		opts.checkTransferLeaseSource,
   952  		opts.checkCandidateFullness,
   953  		false, /* alwaysAllowDecisionWithoutStats */
   954  	)
   955  	if target == (roachpb.ReplicaDescriptor{}) {
   956  		return false, nil
   957  	}
   958  
   959  	if opts.dryRun {
   960  		log.VEventf(ctx, 1, "transferring lease to s%d", target.StoreID)
   961  		return false, nil
   962  	}
   963  
   964  	avgQPS, qpsMeasurementDur := repl.leaseholderStats.avgQPS()
   965  	if qpsMeasurementDur < MinStatsDuration {
   966  		avgQPS = 0
   967  	}
   968  	err := rq.transferLease(ctx, repl, target, avgQPS)
   969  	return err == nil, err
   970  }
   971  
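        // transferLease transfers the range lease to the given target replica,
        // records the time of the transfer, and updates the local store pool to
        // reflect it.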
   972  func (rq *replicateQueue) transferLease(
   973  	ctx context.Context, repl *Replica, target roachpb.ReplicaDescriptor, rangeQPS float64,
   974  ) error {
   975  	rq.metrics.TransferLeaseCount.Inc(1)
   976  	log.VEventf(ctx, 1, "transferring lease to s%d", target.StoreID)
   977  	if err := repl.AdminTransferLease(ctx, target.StoreID); err != nil {
   978  		return errors.Wrapf(err, "%s: unable to transfer lease to s%d", repl, target.StoreID)
   979  	}
   980  	rq.lastLeaseTransfer.Store(timeutil.Now())
   981  	rq.allocator.storePool.updateLocalStoresAfterLeaseTransfer(
   982  		repl.store.StoreID(), target.StoreID, rangeQPS)
   983  	return nil
   984  }
   985  
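        // changeReplicas executes the given replication changes (unless dryRun is
        // set) and updates the local store pool to reflect them.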
   986  func (rq *replicateQueue) changeReplicas(
   987  	ctx context.Context,
   988  	repl *Replica,
   989  	chgs roachpb.ReplicationChanges,
   990  	desc *roachpb.RangeDescriptor,
   991  	priority SnapshotRequest_Priority,
   992  	reason kvserverpb.RangeLogEventReason,
   993  	details string,
   994  	dryRun bool,
   995  ) error {
   996  	if dryRun {
   997  		return nil
   998  	}
   999  	if _, err := repl.ChangeReplicas(ctx, desc, priority, reason, details, chgs); err != nil {
  1000  		return err
  1001  	}
  1002  	rangeUsageInfo := rangeUsageInfoForRepl(repl)
  1003  	for _, chg := range chgs {
  1004  		rq.allocator.storePool.updateLocalStoreAfterRebalance(
  1005  			chg.Target.StoreID, rangeUsageInfo, chg.ChangeType)
  1006  	}
  1007  	return nil
  1008  }
  1009  
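        // canTransferLease returns true if enough time has passed since this
        // queue's last lease transfer, as configured by
        // kv.allocator.min_lease_transfer_interval.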
  1010  func (rq *replicateQueue) canTransferLease() bool {
  1011  	if lastLeaseTransfer := rq.lastLeaseTransfer.Load(); lastLeaseTransfer != nil {
  1012  		minInterval := minLeaseTransferInterval.Get(&rq.store.cfg.Settings.SV)
  1013  		return timeutil.Since(lastLeaseTransfer.(time.Time)) > minInterval
  1014  	}
  1015  	return true
  1016  }
  1017  
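        // timer returns the duration to wait between processing successive queued
        // replicas; the replicate queue processes greedily.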
  1018  func (*replicateQueue) timer(_ time.Duration) time.Duration {
  1019  	return replicateQueueTimerDuration
  1020  }
  1021  
  1022  // purgatoryChan returns the replicate queue's store update channel.
  1023  func (rq *replicateQueue) purgatoryChan() <-chan time.Time {
  1024  	return rq.updateChan
  1025  }
  1026  
  1027  // rangeRaftProgress pretty-prints the Raft progress (i.e. Raft log position) of
  1028  // the replicas.
  1029  func rangeRaftProgress(raftStatus *raft.Status, replicas []roachpb.ReplicaDescriptor) string {
  1030  	if raftStatus == nil {
  1031  		return "[no raft status]"
  1032  	} else if len(raftStatus.Progress) == 0 {
  1033  		return "[no raft progress]"
  1034  	}
  1035  	var buf bytes.Buffer
  1036  	buf.WriteString("[")
  1037  	for i, r := range replicas {
  1038  		if i > 0 {
  1039  			buf.WriteString(", ")
  1040  		}
  1041  		fmt.Fprintf(&buf, "%d", r.ReplicaID)
  1042  		if uint64(r.ReplicaID) == raftStatus.Lead {
  1043  			buf.WriteString("*")
  1044  		}
  1045  		if progress, ok := raftStatus.Progress[uint64(r.ReplicaID)]; ok {
  1046  			fmt.Fprintf(&buf, ":%d", progress.Match)
  1047  		} else {
  1048  			buf.WriteString(":?")
  1049  		}
  1050  	}
  1051  	buf.WriteString("]")
  1052  	return buf.String()
  1053  }