github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/allocator.go (about) 1 // Copyright 2014 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package kvserver 12 13 import ( 14 "context" 15 "encoding/json" 16 "fmt" 17 "math" 18 "math/rand" 19 "strings" 20 "time" 21 22 "github.com/cockroachdb/cockroach/pkg/config/zonepb" 23 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/constraint" 24 "github.com/cockroachdb/cockroach/pkg/roachpb" 25 "github.com/cockroachdb/cockroach/pkg/settings" 26 "github.com/cockroachdb/cockroach/pkg/settings/cluster" 27 "github.com/cockroachdb/cockroach/pkg/util/log" 28 "github.com/cockroachdb/cockroach/pkg/util/syncutil" 29 "github.com/cockroachdb/errors" 30 "go.etcd.io/etcd/raft" 31 "go.etcd.io/etcd/raft/tracker" 32 ) 33 34 const ( 35 // leaseRebalanceThreshold is the minimum ratio of a store's lease surplus 36 // to the mean range/lease count that permits lease-transfers away from that 37 // store. 38 leaseRebalanceThreshold = 0.05 39 40 // baseLoadBasedLeaseRebalanceThreshold is the equivalent of 41 // leaseRebalanceThreshold for load-based lease rebalance decisions (i.e. 42 // "follow-the-workload"). It's the base threshold for decisions that get 43 // adjusted based on the load and latency of the involved ranges/nodes. 44 baseLoadBasedLeaseRebalanceThreshold = 2 * leaseRebalanceThreshold 45 46 // minReplicaWeight sets a floor for how low a replica weight can be. This is 47 // needed because a weight of zero doesn't work in the current lease scoring 48 // algorithm. 49 minReplicaWeight = 0.001 50 51 // Priorities for various repair operations. 52 finalizeAtomicReplicationChangePriority float64 = 12002 53 removeLearnerReplicaPriority float64 = 12001 54 addDeadReplacementPriority float64 = 12000 55 addMissingReplicaPriority float64 = 10000 56 addDecommissioningReplacementPriority float64 = 5000 57 removeDeadReplicaPriority float64 = 1000 58 removeDecommissioningReplicaPriority float64 = 200 59 removeExtraReplicaPriority float64 = 100 60 ) 61 62 // MinLeaseTransferStatsDuration configures the minimum amount of time a 63 // replica must wait for stats about request counts to accumulate before 64 // making decisions based on them. The higher this is, the less likely 65 // thrashing is (up to a point). 66 // Made configurable for the sake of testing. 67 var MinLeaseTransferStatsDuration = 30 * time.Second 68 69 // enableLoadBasedLeaseRebalancing controls whether lease rebalancing is done 70 // via the new heuristic based on request load and latency or via the simpler 71 // approach that purely seeks to balance the number of leases per node evenly. 72 var enableLoadBasedLeaseRebalancing = settings.RegisterPublicBoolSetting( 73 "kv.allocator.load_based_lease_rebalancing.enabled", 74 "set to enable rebalancing of range leases based on load and latency", 75 true, 76 ) 77 78 // leaseRebalancingAggressiveness enables users to tweak how aggressive their 79 // cluster is at moving leases towards the localities where the most requests 80 // are coming from. 
Settings lower than 1.0 will make the system less 81 // aggressive about moving leases toward requests than the default, while 82 // settings greater than 1.0 will cause more aggressive placement. 83 // 84 // Setting this to 0 effectively disables load-based lease rebalancing, and 85 // settings less than 0 are disallowed. 86 var leaseRebalancingAggressiveness = settings.RegisterNonNegativeFloatSetting( 87 "kv.allocator.lease_rebalancing_aggressiveness", 88 "set greater than 1.0 to rebalance leases toward load more aggressively, "+ 89 "or between 0 and 1.0 to be more conservative about rebalancing leases", 90 1.0, 91 ) 92 93 // AllocatorAction enumerates the various replication adjustments that may be 94 // recommended by the allocator. 95 type AllocatorAction int 96 97 // These are the possible allocator actions. 98 const ( 99 _ AllocatorAction = iota 100 AllocatorNoop 101 AllocatorRemove 102 AllocatorAdd 103 AllocatorReplaceDead 104 AllocatorRemoveDead 105 AllocatorReplaceDecommissioning 106 AllocatorRemoveDecommissioning 107 AllocatorRemoveLearner 108 AllocatorConsiderRebalance 109 AllocatorRangeUnavailable 110 AllocatorFinalizeAtomicReplicationChange 111 ) 112 113 var allocatorActionNames = map[AllocatorAction]string{ 114 AllocatorNoop: "noop", 115 AllocatorRemove: "remove", 116 AllocatorAdd: "add", 117 AllocatorReplaceDead: "replace dead", 118 AllocatorRemoveDead: "remove dead", 119 AllocatorReplaceDecommissioning: "replace decommissioning", 120 AllocatorRemoveDecommissioning: "remove decommissioning", 121 AllocatorRemoveLearner: "remove learner", 122 AllocatorConsiderRebalance: "consider rebalance", 123 AllocatorRangeUnavailable: "range unavailable", 124 AllocatorFinalizeAtomicReplicationChange: "finalize conf change", 125 } 126 127 func (a AllocatorAction) String() string { 128 return allocatorActionNames[a] 129 } 130 131 type transferDecision int 132 133 const ( 134 _ transferDecision = iota 135 shouldTransfer 136 shouldNotTransfer 137 decideWithoutStats 138 ) 139 140 // allocatorError indicates a retryable error condition which sends replicas 141 // being processed through the replicate_queue into purgatory so that they 142 // can be retried quickly as soon as new stores come online, or additional 143 // space frees up. 
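//
// As a rough illustration of the message format produced by Error() below
// (values assumed): with no constraints configured, no throttled stores, two
// live stores, and a replica already on both of them, the error reads
//
//   0 of 2 live stores are able to take a new replica for the range
//   (2 already have a replica); likely not enough nodes in cluster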
144 type allocatorError struct { 145 constraints []zonepb.ConstraintsConjunction 146 existingReplicas int 147 aliveStores int 148 throttledStores int 149 } 150 151 func (ae *allocatorError) Error() string { 152 var existingReplsStr string 153 if ae.existingReplicas == 1 { 154 existingReplsStr = "1 already has a replica" 155 } else { 156 existingReplsStr = fmt.Sprintf("%d already have a replica", ae.existingReplicas) 157 } 158 159 var baseMsg string 160 if ae.throttledStores != 0 { 161 baseMsg = fmt.Sprintf( 162 "0 of %d live stores are able to take a new replica for the range (%d throttled, %s)", 163 ae.aliveStores, ae.throttledStores, existingReplsStr) 164 } else { 165 baseMsg = fmt.Sprintf( 166 "0 of %d live stores are able to take a new replica for the range (%s)", 167 ae.aliveStores, existingReplsStr) 168 } 169 170 if len(ae.constraints) == 0 { 171 if ae.throttledStores > 0 { 172 return baseMsg 173 } 174 return baseMsg + "; likely not enough nodes in cluster" 175 } 176 var b strings.Builder 177 b.WriteString(baseMsg) 178 b.WriteString("; must match constraints [") 179 for i := range ae.constraints { 180 if i > 0 { 181 b.WriteByte(' ') 182 } 183 b.WriteByte('{') 184 b.WriteString(ae.constraints[i].String()) 185 b.WriteByte('}') 186 } 187 b.WriteString("]") 188 return b.String() 189 } 190 191 func (*allocatorError) purgatoryErrorMarker() {} 192 193 var _ purgatoryError = &allocatorError{} 194 195 // allocatorRand pairs a rand.Rand with a mutex. 196 // NOTE: Allocator is typically only accessed from a single thread (the 197 // replication queue), but this assumption is broken in tests which force 198 // replication scans. If those tests can be modified to suspend the normal 199 // replication queue during the forced scan, then this rand could be used 200 // without a mutex. 201 type allocatorRand struct { 202 *syncutil.Mutex 203 *rand.Rand 204 } 205 206 func makeAllocatorRand(source rand.Source) allocatorRand { 207 return allocatorRand{ 208 Mutex: &syncutil.Mutex{}, 209 Rand: rand.New(source), 210 } 211 } 212 213 // RangeUsageInfo contains usage information (sizes and traffic) needed by the 214 // allocator to make rebalancing decisions for a given range. 215 type RangeUsageInfo struct { 216 LogicalBytes int64 217 QueriesPerSecond float64 218 WritesPerSecond float64 219 } 220 221 func rangeUsageInfoForRepl(repl *Replica) RangeUsageInfo { 222 info := RangeUsageInfo{ 223 LogicalBytes: repl.GetMVCCStats().Total(), 224 } 225 if queriesPerSecond, dur := repl.leaseholderStats.avgQPS(); dur >= MinStatsDuration { 226 info.QueriesPerSecond = queriesPerSecond 227 } 228 if writesPerSecond, dur := repl.writeStats.avgQPS(); dur >= MinStatsDuration { 229 info.WritesPerSecond = writesPerSecond 230 } 231 return info 232 } 233 234 // Allocator tries to spread replicas as evenly as possible across the stores 235 // in the cluster. 236 type Allocator struct { 237 storePool *StorePool 238 nodeLatencyFn func(addr string) (time.Duration, bool) 239 randGen allocatorRand 240 } 241 242 // MakeAllocator creates a new allocator using the specified StorePool. 243 func MakeAllocator( 244 storePool *StorePool, nodeLatencyFn func(addr string) (time.Duration, bool), 245 ) Allocator { 246 var randSource rand.Source 247 // There are a number of test cases that make a test store but don't add 248 // gossip or a store pool. So we can't rely on the existence of the 249 // store pool in those cases.
250 if storePool != nil && storePool.deterministic { 251 randSource = rand.NewSource(777) 252 } else { 253 randSource = rand.NewSource(rand.Int63()) 254 } 255 return Allocator{ 256 storePool: storePool, 257 nodeLatencyFn: nodeLatencyFn, 258 randGen: makeAllocatorRand(randSource), 259 } 260 } 261 262 // GetNeededReplicas calculates the number of replicas a range should 263 // have given its zone config and the number of nodes available for 264 // up-replication (i.e. not dead and not decommissioning). 265 func GetNeededReplicas(zoneConfigReplicaCount int32, clusterNodes int) int { 266 numZoneReplicas := int(zoneConfigReplicaCount) 267 need := numZoneReplicas 268 269 // Adjust the replication factor for all ranges if there are fewer 270 // nodes than replicas specified in the zone config, so the cluster 271 // can still function. 272 if clusterNodes < need { 273 need = clusterNodes 274 } 275 276 // Ensure that we don't up- or down-replicate to an even number of replicas 277 // unless an even number of replicas was specifically requested by the user 278 // in the zone config. 279 // 280 // Note that in the case of 5 desired replicas and a decommissioning store, 281 // this prefers down-replicating from 5 to 3 rather than sticking with 4 282 // desired stores or blocking the decommissioning from completing. 283 if need == numZoneReplicas { 284 return need 285 } 286 if need%2 == 0 { 287 need = need - 1 288 } 289 if need < 3 { 290 need = 3 291 } 292 if need > numZoneReplicas { 293 need = numZoneReplicas 294 } 295 296 return need 297 } 298 299 // ComputeAction determines the exact operation needed to repair the 300 // supplied range, as governed by the supplied zone configuration. It 301 // returns the required action that should be taken and a priority. 302 func (a *Allocator) ComputeAction( 303 ctx context.Context, zone *zonepb.ZoneConfig, desc *roachpb.RangeDescriptor, 304 ) (AllocatorAction, float64) { 305 if a.storePool == nil { 306 // Do nothing if storePool is nil for some unittests. 307 return AllocatorNoop, 0 308 } 309 310 if desc.Replicas().InAtomicReplicationChange() { 311 // With a similar reasoning to the learner branch below, if we're in a 312 // joint configuration the top priority is to leave it before we can 313 // even think about doing anything else. 314 return AllocatorFinalizeAtomicReplicationChange, finalizeAtomicReplicationChangePriority 315 } 316 317 // Seeing a learner replica at this point is unexpected because learners are a 318 // short-lived (ish) transient state in a learner+snapshot+voter cycle, which 319 // is always done atomically. Only two places could have added a learner: the 320 // replicate queue or AdminChangeReplicas request. 321 // 322 // The replicate queue only operates on leaseholders, which means that only 323 // one node at a time is operating on a given range except in rare cases (old 324 // leaseholder could start the operation, and a new leaseholder steps up and 325 // also starts an overlapping operation). Combined with the above atomicity, 326 // this means that if the replicate queue sees a learner, either the node that 327 // was adding it crashed somewhere in the learner+snapshot+voter cycle and 328 // we're the new leaseholder or we caught a race. 329 // 330 // In the first case, we could assume the node that was adding it knew what it 331 // was doing and finish the addition. Or we could leave it and do higher 332 // priority operations first if there are any. 
However, this comes with code 333 // complexity and concept complexity (computing old vs new quorum sizes 334 // becomes ambiguous, the learner isn't in the quorum but it likely will be 335 // soon, so do you count it?). Instead, we do the simplest thing and remove it 336 // before doing any other operations to the range. We'll revisit this decision 337 // if and when the complexity becomes necessary. 338 // 339 // If we get the race where AdminChangeReplicas is adding a replica and the 340 // queue happens to run during the snapshot, this will remove the learner and 341 // AdminChangeReplicas will notice either during the snapshot transfer or when 342 // it tries to promote the learner to a voter. AdminChangeReplicas should 343 // retry. 344 // 345 // On the other hand if we get the race where a leaseholder starts adding a 346 // replica in the replicate queue and during this loses its lease, it should 347 // probably not retry. 348 if learners := desc.Replicas().Learners(); len(learners) > 0 { 349 // TODO(dan): Since this goes before anything else, the priority here should 350 // be influenced by whatever operations would happen right after the learner 351 // is removed. In the meantime, we don't want to block something important 352 // from happening (like addDeadReplacementPriority) by queueing this at a 353 // low priority so until this TODO is done, keep 354 // removeLearnerReplicaPriority as the highest priority. 355 return AllocatorRemoveLearner, removeLearnerReplicaPriority 356 } 357 // computeAction expects to operate only on voters. 358 return a.computeAction(ctx, zone, desc.Replicas().Voters()) 359 } 360 361 func (a *Allocator) computeAction( 362 ctx context.Context, zone *zonepb.ZoneConfig, voterReplicas []roachpb.ReplicaDescriptor, 363 ) (AllocatorAction, float64) { 364 // TODO(mrtracy): Handle non-homogeneous and mismatched attribute sets. 365 have := len(voterReplicas) 366 decommissioningReplicas := a.storePool.decommissioningReplicas(voterReplicas) 367 clusterNodes := a.storePool.ClusterNodeCount() 368 need := GetNeededReplicas(*zone.NumReplicas, clusterNodes) 369 desiredQuorum := computeQuorum(need) 370 quorum := computeQuorum(have) 371 372 if have < need { 373 // Range is under-replicated, and should add an additional replica. 374 // Priority is adjusted by the difference between the current replica 375 // count and the quorum of the desired replica count. 376 priority := addMissingReplicaPriority + float64(desiredQuorum-have) 377 action := AllocatorAdd 378 log.VEventf(ctx, 3, "%s - missing replica need=%d, have=%d, priority=%.2f", 379 action, need, have, priority) 380 return action, priority 381 } 382 383 liveVoterReplicas, deadVoterReplicas := a.storePool.liveAndDeadReplicas(voterReplicas) 384 385 if len(liveVoterReplicas) < quorum { 386 // Do not take any replacement/removal action if we do not have a quorum of live 387 // replicas. If we're correctly assessing the unavailable state of the range, we 388 // also won't be able to add replicas as we try above, but hope springs eternal. 389 log.VEventf(ctx, 1, "unable to take action - live replicas %v don't meet quorum of %d", 390 liveVoterReplicas, quorum) 391 return AllocatorRangeUnavailable, 0 392 } 393 394 if have == need && len(deadVoterReplicas) > 0 { 395 // Range has dead replica(s). We should up-replicate to add another 396 // before removing the dead one.
This can avoid permanent data loss in cases 397 // where the node is only temporarily dead, but we remove it from the range 398 // and lose a second node before we can up-replicate (#25392). 399 // The dead replica(s) will be down-replicated later. 400 priority := addDeadReplacementPriority 401 action := AllocatorReplaceDead 402 log.VEventf(ctx, 3, "%s - replacement for %d dead replicas priority=%.2f", 403 action, len(deadVoterReplicas), priority) 404 return action, priority 405 } 406 407 if have == need && len(decommissioningReplicas) > 0 { 408 // Range has decommissioning replica(s), which should be replaced. 409 priority := addDecommissioningReplacementPriority 410 action := AllocatorReplaceDecommissioning 411 log.VEventf(ctx, 3, "%s - replacement for %d decommissioning replicas priority=%.2f", 412 action, len(decommissioningReplicas), priority) 413 return action, priority 414 } 415 416 // Removal actions follow. 417 // TODO(a-robinson): There's an additional case related to dead replicas that 418 // we should handle above. If there are one or more dead replicas, have < 419 // need, and there are no available stores to up-replicate to, then we should 420 // try to remove the dead replica(s) to get down to an odd number of 421 // replicas. 422 if len(deadVoterReplicas) > 0 { 423 // The range has dead replicas, which should be removed immediately. 424 priority := removeDeadReplicaPriority + float64(quorum-len(liveVoterReplicas)) 425 action := AllocatorRemoveDead 426 log.VEventf(ctx, 3, "%s - dead=%d, live=%d, quorum=%d, priority=%.2f", 427 action, len(deadVoterReplicas), len(liveVoterReplicas), quorum, priority) 428 return action, priority 429 } 430 431 if len(decommissioningReplicas) > 0 { 432 // Range is over-replicated, and has a decommissioning replica which 433 // should be removed. 434 priority := removeDecommissioningReplicaPriority 435 action := AllocatorRemoveDecommissioning 436 log.VEventf(ctx, 3, 437 "%s - need=%d, have=%d, num_decommissioning=%d, priority=%.2f", 438 action, need, have, len(decommissioningReplicas), priority) 439 return action, priority 440 } 441 442 if have > need { 443 // Range is over-replicated, and should remove a replica. 444 // Ranges with an even number of replicas get extra priority because 445 // they have a more fragile quorum. 446 priority := removeExtraReplicaPriority - float64(have%2) 447 action := AllocatorRemove 448 log.VEventf(ctx, 3, "%s - need=%d, have=%d, priority=%.2f", action, need, have, priority) 449 return action, priority 450 } 451 452 // Nothing needs to be done, but we may want to rebalance. 453 return AllocatorConsiderRebalance, 0 454 } 455 456 type decisionDetails struct { 457 Target string 458 Existing string `json:",omitempty"` 459 } 460 461 // AllocateTarget returns a suitable store for a new allocation with the 462 // required attributes. Nodes already accommodating existing replicas are ruled 463 // out as targets. The range ID of the replica being allocated for is also 464 // passed in to ensure that we don't try to replace an existing dead replica on 465 // a store. 466 // 467 // TODO(tbg): AllocateReplacement? 
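//
// A rough usage sketch follows (illustrative only, not the actual replicate
// queue code; zone, desc, and the addReplicaOn helper are assumed): a caller
// typically consults ComputeAction first and only asks for an allocation
// target when an add is required.
//
//   if action, _ := a.ComputeAction(ctx, zone, desc); action == AllocatorAdd {
//     target, details, err := a.AllocateTarget(ctx, zone, desc.Replicas().Voters())
//     if err != nil {
//       return err // an *allocatorError here is retryable via purgatory
//     }
//     _ = details // recorded in the range log by the caller
//     addReplicaOn(target.StoreID) // hypothetical helper
//   }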
468 func (a *Allocator) AllocateTarget( 469 ctx context.Context, zone *zonepb.ZoneConfig, existingReplicas []roachpb.ReplicaDescriptor, 470 ) (*roachpb.StoreDescriptor, string, error) { 471 sl, aliveStoreCount, throttled := a.storePool.getStoreList(storeFilterThrottled) 472 473 target, details := a.allocateTargetFromList( 474 ctx, sl, zone, existingReplicas, a.scorerOptions()) 475 476 if target != nil { 477 return target, details, nil 478 } 479 480 // When there are throttled stores that do match, we shouldn't send 481 // the replica to purgatory. 482 if len(throttled) > 0 { 483 return nil, "", errors.Errorf( 484 "%d matching stores are currently throttled: %v", len(throttled), throttled, 485 ) 486 } 487 return nil, "", &allocatorError{ 488 constraints: zone.Constraints, 489 existingReplicas: len(existingReplicas), 490 aliveStores: aliveStoreCount, 491 throttledStores: len(throttled), 492 } 493 } 494 495 func (a *Allocator) allocateTargetFromList( 496 ctx context.Context, 497 sl StoreList, 498 zone *zonepb.ZoneConfig, 499 candidateReplicas []roachpb.ReplicaDescriptor, 500 options scorerOptions, 501 ) (*roachpb.StoreDescriptor, string) { 502 analyzedConstraints := constraint.AnalyzeConstraints( 503 ctx, a.storePool.getStoreDescriptor, candidateReplicas, zone) 504 candidates := allocateCandidates( 505 sl, analyzedConstraints, candidateReplicas, a.storePool.getLocalities(candidateReplicas), 506 options, 507 ) 508 log.VEventf(ctx, 3, "allocate candidates: %s", candidates) 509 if target := candidates.selectGood(a.randGen); target != nil { 510 log.VEventf(ctx, 3, "add target: %s", target) 511 details := decisionDetails{Target: target.compactString(options)} 512 detailsBytes, err := json.Marshal(details) 513 if err != nil { 514 log.Warningf(ctx, "failed to marshal details for choosing allocate target: %+v", err) 515 } 516 return &target.store, string(detailsBytes) 517 } 518 519 return nil, "" 520 } 521 522 func (a Allocator) simulateRemoveTarget( 523 ctx context.Context, 524 targetStore roachpb.StoreID, 525 zone *zonepb.ZoneConfig, 526 candidates []roachpb.ReplicaDescriptor, 527 existingReplicas []roachpb.ReplicaDescriptor, 528 rangeUsageInfo RangeUsageInfo, 529 ) (roachpb.ReplicaDescriptor, string, error) { 530 // Update statistics first 531 // TODO(a-robinson): This could theoretically interfere with decisions made by other goroutines, 532 // but as of October 2017 calls to the Allocator are mostly serialized by the ReplicateQueue 533 // (with the main exceptions being Scatter and the status server's allocator debug endpoint). 534 // Try to make this interfere less with other callers. 535 a.storePool.updateLocalStoreAfterRebalance(targetStore, rangeUsageInfo, roachpb.ADD_REPLICA) 536 defer func() { 537 a.storePool.updateLocalStoreAfterRebalance(targetStore, rangeUsageInfo, roachpb.REMOVE_REPLICA) 538 }() 539 log.VEventf(ctx, 3, "simulating which replica would be removed after adding s%d", targetStore) 540 return a.RemoveTarget(ctx, zone, candidates, existingReplicas) 541 } 542 543 // RemoveTarget returns a suitable replica to remove from the provided replica 544 // set. It first attempts to randomly select a target from the set of stores 545 // that have greater than the average number of replicas. Failing that, it 546 // falls back to selecting a random target from any of the existing 547 // replicas. 
548 func (a Allocator) RemoveTarget( 549 ctx context.Context, 550 zone *zonepb.ZoneConfig, 551 candidates []roachpb.ReplicaDescriptor, 552 existingReplicas []roachpb.ReplicaDescriptor, 553 ) (roachpb.ReplicaDescriptor, string, error) { 554 if len(candidates) == 0 { 555 return roachpb.ReplicaDescriptor{}, "", errors.Errorf("must supply at least one candidate replica to allocator.RemoveTarget()") 556 } 557 558 // Retrieve store descriptors for the provided candidates from the StorePool. 559 existingStoreIDs := make(roachpb.StoreIDSlice, len(candidates)) 560 for i, exist := range candidates { 561 existingStoreIDs[i] = exist.StoreID 562 } 563 sl, _, _ := a.storePool.getStoreListFromIDs(existingStoreIDs, storeFilterNone) 564 565 analyzedConstraints := constraint.AnalyzeConstraints( 566 ctx, a.storePool.getStoreDescriptor, existingReplicas, zone) 567 options := a.scorerOptions() 568 rankedCandidates := removeCandidates( 569 sl, 570 analyzedConstraints, 571 a.storePool.getLocalities(existingReplicas), 572 options, 573 ) 574 log.VEventf(ctx, 3, "remove candidates: %s", rankedCandidates) 575 if bad := rankedCandidates.selectBad(a.randGen); bad != nil { 576 for _, exist := range existingReplicas { 577 if exist.StoreID == bad.store.StoreID { 578 log.VEventf(ctx, 3, "remove target: %s", bad) 579 details := decisionDetails{Target: bad.compactString(options)} 580 detailsBytes, err := json.Marshal(details) 581 if err != nil { 582 log.Warningf(ctx, "failed to marshal details for choosing remove target: %+v", err) 583 } 584 return exist, string(detailsBytes), nil 585 } 586 } 587 } 588 589 return roachpb.ReplicaDescriptor{}, "", errors.New("could not select an appropriate replica to be removed") 590 } 591 592 // RebalanceTarget returns a suitable store for a rebalance target with 593 // required attributes. Rebalance targets are selected via the same mechanism 594 // as AllocateTarget(), except the chosen target must follow some additional 595 // criteria. Namely, if chosen, it must further the goal of balancing the 596 // cluster. 597 // 598 // The supplied parameters are the required attributes for the range and 599 // information about the range being considered for rebalancing. 600 // 601 // The existing replicas modulo any store with dead replicas are candidates for 602 // rebalancing. Note that rebalancing is accomplished by first adding a new 603 // replica to the range, then removing the most undesirable replica. 604 // 605 // Simply ignoring a rebalance opportunity in the event that the target chosen 606 // by AllocateTarget() doesn't fit balancing criteria is perfectly fine, as 607 // other stores in the cluster will also be doing their probabilistic best to 608 // rebalance. This helps prevent a stampeding herd targeting an abnormally 609 // under-utilized store. 610 // 611 // The return values are, in order: 612 // 613 // 1. The target on which to add a new replica, 614 // 2. An existing replica to remove, 615 // 3. a JSON string for use in the range log, and 616 // 4. a boolean indicating whether 1-3 were populated (i.e. whether a rebalance 617 // opportunity was found).
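//
// A rough calling sketch (illustrative only, not the actual replicate queue
// code; zone, raftStatus, existingReplicas, and rangeUsageInfo are assumed to
// be supplied by the caller):
//
//   add, remove, details, ok := a.RebalanceTarget(
//     ctx, zone, raftStatus, existingReplicas, rangeUsageInfo, storeFilterThrottled)
//   if ok {
//     // Add a replica on `add`, then remove the replica on `remove`, and
//     // persist `details` to system.rangelog.
//   }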
618 func (a Allocator) RebalanceTarget( 619 ctx context.Context, 620 zone *zonepb.ZoneConfig, 621 raftStatus *raft.Status, 622 existingReplicas []roachpb.ReplicaDescriptor, 623 rangeUsageInfo RangeUsageInfo, 624 filter storeFilter, 625 ) (add roachpb.ReplicationTarget, remove roachpb.ReplicationTarget, details string, ok bool) { 626 sl, _, _ := a.storePool.getStoreList(filter) 627 628 zero := roachpb.ReplicationTarget{} 629 630 // We're going to add another replica to the range which will change the 631 // quorum size. Verify that the number of existing live replicas is sufficient 632 // to meet the new quorum. For a range configured for 3 replicas, this will 633 // disable rebalancing if one of the replicas is on a down node. Instead, 634 // we'll have to wait for the down node to be declared dead and go through the 635 // dead-node removal dance: remove dead replica, add new replica. 636 // 637 // NB: The len(replicas) > 1 check allows rebalancing of ranges with only a 638 // single replica. This is a corner case which could happen in practice and 639 // also affects tests. 640 if len(existingReplicas) > 1 { 641 var numLiveReplicas int 642 for _, s := range sl.stores { 643 for _, repl := range existingReplicas { 644 if s.StoreID == repl.StoreID { 645 numLiveReplicas++ 646 break 647 } 648 } 649 } 650 newQuorum := computeQuorum(len(existingReplicas) + 1) 651 if numLiveReplicas < newQuorum { 652 // Don't rebalance as we won't be able to make quorum after the rebalance 653 // until the new replica has been caught up. 654 return zero, zero, "", false 655 } 656 } 657 658 analyzedConstraints := constraint.AnalyzeConstraints( 659 ctx, a.storePool.getStoreDescriptor, existingReplicas, zone) 660 options := a.scorerOptions() 661 results := rebalanceCandidates( 662 ctx, 663 sl, 664 analyzedConstraints, 665 existingReplicas, 666 a.storePool.getLocalities(existingReplicas), 667 a.storePool.getNodeLocalityString, 668 options, 669 ) 670 671 if len(results) == 0 { 672 return zero, zero, "", false 673 } 674 // Keep looping until we either run out of options or find a target that we're 675 // pretty sure we won't want to remove immediately after adding it. 676 // If we would, we don't want to actually rebalance to that target. 677 var target *candidate 678 var removeReplica roachpb.ReplicaDescriptor 679 var existingCandidates candidateList 680 for { 681 target, existingCandidates = bestRebalanceTarget(a.randGen, results) 682 if target == nil { 683 return zero, zero, "", false 684 } 685 686 // Add a fake new replica to our copy of the range descriptor so that we can 687 // simulate the removal logic. If we decide not to go with this target, note 688 // that this needs to be removed from desc before we try any other target. 689 newReplica := roachpb.ReplicaDescriptor{ 690 NodeID: target.store.Node.NodeID, 691 StoreID: target.store.StoreID, 692 ReplicaID: maxReplicaID(existingReplicas) + 1, 693 } 694 // Deep-copy the Replicas slice since we'll mutate it below. 695 existingPlusOneNew := append([]roachpb.ReplicaDescriptor(nil), existingReplicas...) 696 existingPlusOneNew = append(existingPlusOneNew, newReplica) 697 replicaCandidates := existingPlusOneNew 698 // If we can, filter replicas as we would if we were actually removing one. 699 // If we can't (e.g. because we're the leaseholder but not the raft leader), 700 // it's better to simulate the removal with the info that we do have than to 701 // assume that the rebalance is ok (#20241). 
702 if raftStatus != nil && raftStatus.Progress != nil { 703 replicaCandidates = simulateFilterUnremovableReplicas( 704 ctx, raftStatus, replicaCandidates, newReplica.ReplicaID) 705 } 706 if len(replicaCandidates) == 0 { 707 // No existing replicas are suitable to remove. 708 log.VEventf(ctx, 2, "not rebalancing to s%d because there are no existing "+ 709 "replicas that can be removed", target.store.StoreID) 710 return zero, zero, "", false 711 } 712 713 var removeDetails string 714 var err error 715 removeReplica, removeDetails, err = a.simulateRemoveTarget( 716 ctx, 717 target.store.StoreID, 718 zone, 719 replicaCandidates, 720 existingPlusOneNew, 721 rangeUsageInfo, 722 ) 723 if err != nil { 724 log.Warningf(ctx, "simulating RemoveTarget failed: %+v", err) 725 return zero, zero, "", false 726 } 727 if target.store.StoreID != removeReplica.StoreID { 728 // Successfully populated these variables 729 _, _ = target, removeReplica 730 break 731 } 732 733 log.VEventf(ctx, 2, "not rebalancing to s%d because we'd immediately remove it: %s", 734 target.store.StoreID, removeDetails) 735 } 736 737 // Compile the details entry that will be persisted into system.rangelog for 738 // debugging/auditability purposes. 739 dDetails := decisionDetails{ 740 Target: target.compactString(options), 741 Existing: existingCandidates.compactString(options), 742 } 743 detailsBytes, err := json.Marshal(dDetails) 744 if err != nil { 745 log.Warningf(ctx, "failed to marshal details for choosing rebalance target: %+v", err) 746 } 747 748 addTarget := roachpb.ReplicationTarget{ 749 NodeID: target.store.Node.NodeID, 750 StoreID: target.store.StoreID, 751 } 752 removeTarget := roachpb.ReplicationTarget{ 753 NodeID: removeReplica.NodeID, 754 StoreID: removeReplica.StoreID, 755 } 756 return addTarget, removeTarget, string(detailsBytes), true 757 } 758 759 func (a *Allocator) scorerOptions() scorerOptions { 760 return scorerOptions{ 761 deterministic: a.storePool.deterministic, 762 rangeRebalanceThreshold: rangeRebalanceThreshold.Get(&a.storePool.st.SV), 763 } 764 } 765 766 // TransferLeaseTarget returns a suitable replica to transfer the range lease 767 // to from the provided list. It excludes the current lease holder replica 768 // unless asked to do otherwise by the checkTransferLeaseSource parameter. 769 func (a *Allocator) TransferLeaseTarget( 770 ctx context.Context, 771 zone *zonepb.ZoneConfig, 772 existing []roachpb.ReplicaDescriptor, 773 leaseStoreID roachpb.StoreID, 774 stats *replicaStats, 775 checkTransferLeaseSource bool, 776 checkCandidateFullness bool, 777 alwaysAllowDecisionWithoutStats bool, 778 ) roachpb.ReplicaDescriptor { 779 sl, _, _ := a.storePool.getStoreList(storeFilterNone) 780 sl = sl.filter(zone.Constraints) 781 782 // Filter stores that are on nodes containing existing replicas, but leave 783 // the stores containing the existing replicas in place. This excludes stores 784 // that we can't rebalance to, avoiding an issue in a 3-node cluster where 785 // there are multiple stores per node. 786 // 787 // TODO(peter,bram): This will need adjustment with the new allocator. `sl` 788 // needs to contain only the possible rebalance candidates + the existing 789 // stores the replicas are on. 
790 filteredDescs := make([]roachpb.StoreDescriptor, 0, len(sl.stores)) 791 for _, s := range sl.stores { 792 var exclude bool 793 for _, r := range existing { 794 if r.NodeID == s.Node.NodeID && r.StoreID != s.StoreID { 795 exclude = true 796 break 797 } 798 } 799 if !exclude { 800 filteredDescs = append(filteredDescs, s) 801 } 802 } 803 sl = makeStoreList(filteredDescs) 804 805 source, ok := a.storePool.getStoreDescriptor(leaseStoreID) 806 if !ok { 807 return roachpb.ReplicaDescriptor{} 808 } 809 810 // Determine which store(s) is preferred based on user-specified preferences. 811 // If any stores match, only consider those stores as candidates. If only one 812 // store matches, it's where the lease should be (unless the preferred store 813 // is the current one and checkTransferLeaseSource is false). 814 var preferred []roachpb.ReplicaDescriptor 815 if checkTransferLeaseSource { 816 preferred = a.preferredLeaseholders(zone, existing) 817 } else { 818 // TODO(a-robinson): Should we just always remove the source store from 819 // existing when checkTransferLeaseSource is false? I'd do it now, but 820 // it's too big a change to make right before a major release. 821 var candidates []roachpb.ReplicaDescriptor 822 for _, repl := range existing { 823 if repl.StoreID != leaseStoreID { 824 candidates = append(candidates, repl) 825 } 826 } 827 preferred = a.preferredLeaseholders(zone, candidates) 828 } 829 if len(preferred) == 1 { 830 if preferred[0].StoreID == leaseStoreID { 831 return roachpb.ReplicaDescriptor{} 832 } 833 return preferred[0] 834 } else if len(preferred) > 1 { 835 // If the current leaseholder is not preferred, set checkTransferLeaseSource 836 // to false to motivate the below logic to transfer the lease. 837 existing = preferred 838 if !storeHasReplica(leaseStoreID, preferred) { 839 checkTransferLeaseSource = false 840 } 841 } 842 843 // Only consider live, non-draining replicas. 844 existing, _ = a.storePool.liveAndDeadReplicas(existing) 845 846 // Short-circuit if there are no valid targets out there. 847 if len(existing) == 0 || (len(existing) == 1 && existing[0].StoreID == leaseStoreID) { 848 log.VEventf(ctx, 2, "no lease transfer target found") 849 return roachpb.ReplicaDescriptor{} 850 } 851 852 // Try to pick a replica to transfer the lease to while also determining 853 // whether we actually should be transferring the lease. The transfer 854 // decision is only needed if we've been asked to check the source. 855 transferDec, repl := a.shouldTransferLeaseUsingStats( 856 ctx, sl, source, existing, stats, nil, 857 ) 858 if checkTransferLeaseSource { 859 switch transferDec { 860 case shouldNotTransfer: 861 if !alwaysAllowDecisionWithoutStats { 862 return roachpb.ReplicaDescriptor{} 863 } 864 fallthrough 865 case decideWithoutStats: 866 if !a.shouldTransferLeaseWithoutStats(ctx, sl, source, existing) { 867 return roachpb.ReplicaDescriptor{} 868 } 869 case shouldTransfer: 870 default: 871 log.Fatalf(ctx, "unexpected transfer decision %d with replica %+v", transferDec, repl) 872 } 873 } 874 875 if repl != (roachpb.ReplicaDescriptor{}) { 876 return repl 877 } 878 879 // Fall back to logic that doesn't take request counts and latency into 880 // account if the counts/latency-based logic couldn't pick a best replica. 
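	// As a rough illustration of the fallback below (numbers assumed): with
	// checkCandidateFullness set and a mean of 20 leases per candidate store,
	// any existing replica whose store holds fewer than 19.5 leases becomes a
	// candidate; if none qualifies, bestOption tracks the least-loaded of the
	// remaining stores and is returned only when checkTransferLeaseSource is
	// false.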
881 candidates := make([]roachpb.ReplicaDescriptor, 0, len(existing)) 882 var bestOption roachpb.ReplicaDescriptor 883 bestOptionLeaseCount := int32(math.MaxInt32) 884 for _, repl := range existing { 885 if leaseStoreID == repl.StoreID { 886 continue 887 } 888 storeDesc, ok := a.storePool.getStoreDescriptor(repl.StoreID) 889 if !ok { 890 continue 891 } 892 if !checkCandidateFullness || float64(storeDesc.Capacity.LeaseCount) < sl.candidateLeases.mean-0.5 { 893 candidates = append(candidates, repl) 894 } else if storeDesc.Capacity.LeaseCount < bestOptionLeaseCount { 895 bestOption = repl 896 bestOptionLeaseCount = storeDesc.Capacity.LeaseCount 897 } 898 } 899 if len(candidates) == 0 { 900 // If we aren't supposed to be considering the current leaseholder (e.g. 901 // because we need to remove this replica for some reason), return 902 // our best option if we otherwise wouldn't want to do anything. 903 if !checkTransferLeaseSource { 904 return bestOption 905 } 906 return roachpb.ReplicaDescriptor{} 907 } 908 a.randGen.Lock() 909 defer a.randGen.Unlock() 910 return candidates[a.randGen.Intn(len(candidates))] 911 } 912 913 // ShouldTransferLease returns true if the specified store is overfull in terms 914 // of leases with respect to the other stores matching the specified 915 // attributes. 916 func (a *Allocator) ShouldTransferLease( 917 ctx context.Context, 918 zone *zonepb.ZoneConfig, 919 existing []roachpb.ReplicaDescriptor, 920 leaseStoreID roachpb.StoreID, 921 stats *replicaStats, 922 ) bool { 923 source, ok := a.storePool.getStoreDescriptor(leaseStoreID) 924 if !ok { 925 return false 926 } 927 928 // Determine which store(s) is preferred based on user-specified preferences. 929 // If any stores match, only consider those stores as options. If only one 930 // store matches, it's where the lease should be. 931 preferred := a.preferredLeaseholders(zone, existing) 932 if len(preferred) == 1 { 933 return preferred[0].StoreID != leaseStoreID 934 } else if len(preferred) > 1 { 935 existing = preferred 936 // If the current leaseholder isn't one of the preferred stores, then we 937 // should try to transfer the lease. 938 if !storeHasReplica(leaseStoreID, existing) { 939 return true 940 } 941 } 942 943 sl, _, _ := a.storePool.getStoreList(storeFilterNone) 944 sl = sl.filter(zone.Constraints) 945 log.VEventf(ctx, 3, "ShouldTransferLease (lease-holder=%d):\n%s", leaseStoreID, sl) 946 947 // Only consider live, non-draining replicas. 948 existing, _ = a.storePool.liveAndDeadReplicas(existing) 949 950 // Short-circuit if there are no valid targets out there. 
951 if len(existing) == 0 || (len(existing) == 1 && existing[0].StoreID == source.StoreID) { 952 return false 953 } 954 955 transferDec, _ := a.shouldTransferLeaseUsingStats(ctx, sl, source, existing, stats, nil) 956 var result bool 957 switch transferDec { 958 case shouldNotTransfer: 959 result = false 960 case shouldTransfer: 961 result = true 962 case decideWithoutStats: 963 result = a.shouldTransferLeaseWithoutStats(ctx, sl, source, existing) 964 default: 965 log.Fatalf(ctx, "unexpected transfer decision %d", transferDec) 966 } 967 968 log.VEventf(ctx, 3, "ShouldTransferLease decision (lease-holder=%d): %t", leaseStoreID, result) 969 return result 970 } 971 972 func (a Allocator) followTheWorkloadPrefersLocal( 973 ctx context.Context, 974 sl StoreList, 975 source roachpb.StoreDescriptor, 976 candidate roachpb.StoreID, 977 existing []roachpb.ReplicaDescriptor, 978 stats *replicaStats, 979 ) bool { 980 adjustments := make(map[roachpb.StoreID]float64) 981 decision, _ := a.shouldTransferLeaseUsingStats(ctx, sl, source, existing, stats, adjustments) 982 if decision == decideWithoutStats { 983 return false 984 } 985 adjustment := adjustments[candidate] 986 if adjustment > baseLoadBasedLeaseRebalanceThreshold { 987 log.VEventf(ctx, 3, 988 "s%d is a better fit than s%d due to follow-the-workload (score: %.2f; threshold: %.2f)", 989 source.StoreID, candidate, adjustment, baseLoadBasedLeaseRebalanceThreshold) 990 return true 991 } 992 return false 993 } 994 995 func (a Allocator) shouldTransferLeaseUsingStats( 996 ctx context.Context, 997 sl StoreList, 998 source roachpb.StoreDescriptor, 999 existing []roachpb.ReplicaDescriptor, 1000 stats *replicaStats, 1001 rebalanceAdjustments map[roachpb.StoreID]float64, 1002 ) (transferDecision, roachpb.ReplicaDescriptor) { 1003 // Only use load-based rebalancing if it's enabled and we have both 1004 // stats and locality information to base our decision on. 1005 if stats == nil || !enableLoadBasedLeaseRebalancing.Get(&a.storePool.st.SV) { 1006 return decideWithoutStats, roachpb.ReplicaDescriptor{} 1007 } 1008 replicaLocalities := a.storePool.getLocalities(existing) 1009 for _, locality := range replicaLocalities { 1010 if len(locality.Tiers) == 0 { 1011 return decideWithoutStats, roachpb.ReplicaDescriptor{} 1012 } 1013 } 1014 1015 qpsStats, qpsStatsDur := stats.perLocalityDecayingQPS() 1016 1017 // If we haven't yet accumulated enough data, avoid transferring for now, 1018 // unless we've been explicitly asked otherwise. Do not fall back to the 1019 // algorithm that doesn't use stats, since it can easily start fighting with 1020 // the stats-based algorithm. This provides some amount of safety from lease 1021 // thrashing, since leases cannot transfer more frequently than this threshold 1022 // (because replica stats get reset upon lease transfer). 1023 if qpsStatsDur < MinLeaseTransferStatsDuration { 1024 return shouldNotTransfer, roachpb.ReplicaDescriptor{} 1025 } 1026 1027 // On the other hand, if we don't have any stats with associated localities, 1028 // then do fall back to the algorithm that doesn't use request stats. 
1029 delete(qpsStats, "") 1030 if len(qpsStats) == 0 { 1031 return decideWithoutStats, roachpb.ReplicaDescriptor{} 1032 } 1033 1034 replicaWeights := make(map[roachpb.NodeID]float64) 1035 for requestLocalityStr, qps := range qpsStats { 1036 var requestLocality roachpb.Locality 1037 if err := requestLocality.Set(requestLocalityStr); err != nil { 1038 log.Errorf(ctx, "unable to parse locality string %q: %+v", requestLocalityStr, err) 1039 continue 1040 } 1041 for nodeID, replicaLocality := range replicaLocalities { 1042 // Add weights to each replica based on the number of requests from 1043 // that replica's locality and neighboring localities. 1044 replicaWeights[nodeID] += (1 - replicaLocality.DiversityScore(requestLocality)) * qps 1045 } 1046 } 1047 1048 log.VEventf(ctx, 1, 1049 "shouldTransferLease qpsStats: %+v, replicaLocalities: %+v, replicaWeights: %+v", 1050 qpsStats, replicaLocalities, replicaWeights) 1051 sourceWeight := math.Max(minReplicaWeight, replicaWeights[source.Node.NodeID]) 1052 1053 // TODO(a-robinson): This may not have enough protection against all leases 1054 // ending up on a single node in extreme cases. Continue testing against 1055 // different situations. 1056 var bestRepl roachpb.ReplicaDescriptor 1057 bestReplScore := int32(math.MinInt32) 1058 for _, repl := range existing { 1059 if repl.NodeID == source.Node.NodeID { 1060 continue 1061 } 1062 storeDesc, ok := a.storePool.getStoreDescriptor(repl.StoreID) 1063 if !ok { 1064 continue 1065 } 1066 addr, err := a.storePool.gossip.GetNodeIDAddress(repl.NodeID) 1067 if err != nil { 1068 log.Errorf(ctx, "missing address for n%d: %+v", repl.NodeID, err) 1069 continue 1070 } 1071 remoteLatency, ok := a.nodeLatencyFn(addr.String()) 1072 if !ok { 1073 continue 1074 } 1075 1076 remoteWeight := math.Max(minReplicaWeight, replicaWeights[repl.NodeID]) 1077 replScore, rebalanceAdjustment := loadBasedLeaseRebalanceScore( 1078 ctx, a.storePool.st, remoteWeight, remoteLatency, storeDesc, sourceWeight, source, sl.candidateLeases.mean) 1079 if replScore > bestReplScore { 1080 bestReplScore = replScore 1081 bestRepl = repl 1082 } 1083 if rebalanceAdjustments != nil { 1084 rebalanceAdjustments[repl.StoreID] = rebalanceAdjustment 1085 } 1086 } 1087 1088 if bestReplScore > 0 { 1089 return shouldTransfer, bestRepl 1090 } 1091 1092 // Return the best replica even in cases where transferring is not advised in 1093 // order to support forced lease transfers, such as when removing a replica or 1094 // draining all leases before shutdown. 1095 return shouldNotTransfer, bestRepl 1096 } 1097 1098 // loadBasedLeaseRebalanceScore attempts to give a score to how desirable it 1099 // would be to transfer a range lease from the local store to a remote store. 1100 // It does so using a formula based on the latency between the stores and 1101 // a number that we call the "weight" of each replica, which represents how 1102 // many requests for the range have been coming from localities near the 1103 // replica. 1104 // 1105 // The overarching goal is to move leases towards where requests are coming 1106 // from when the latency between localities is high, because the leaseholder 1107 // being near the request gateway makes for lower request latencies. 1108 // This must be balanced against hurting throughput by putting too many leases 1109 // on just a few nodes, though, which is why we get progressively more 1110 // aggressive about moving the leases toward requests when latencies are high.
1111 // 1112 // The calculations below were determined via a bunch of manual testing (see 1113 // #13232 or the leaseholder_locality.md RFC for more details), but the general 1114 // logic behind each part of the formula is as follows: 1115 // 1116 // * LeaseRebalancingAggressiveness: Allow the aggressiveness to be tuned via 1117 // a cluster setting. 1118 // * 0.1: Constant factor to reduce aggressiveness by default 1119 // * math.Log10(remoteWeight/sourceWeight): Comparison of the remote replica's 1120 // weight to the local replica's weight. Taking the log of the ratio instead 1121 // of using the ratio directly makes things symmetric -- i.e. r1 comparing 1122 // itself to r2 will come to the same conclusion as r2 comparing itself to r1. 1123 // * math.Log1p(remoteLatencyMillis): This will be 0 if there's no latency, 1124 // removing the weight/latency factor from consideration. Otherwise, it grows 1125 // the aggressiveness for stores that are farther apart. Note that Log1p grows 1126 // faster than Log10 as its argument gets larger, which is intentional to 1127 // increase the importance of latency. 1128 // * overfullScore and underfullScore: rebalanceThreshold helps us get an idea 1129 // of the ideal number of leases on each store. We then calculate these to 1130 // compare how close each node is to its ideal state and use the differences 1131 // from the ideal state on each node to compute a final score. 1132 // 1133 // Returns a total score for the replica that takes into account the number of 1134 // leases already on each store. Also returns the raw "adjustment" value that's 1135 // purely based on replica weights and latency in order for the caller to 1136 // determine how large a role the user's workload played in the decision. The 1137 // adjustment value is positive if the remote store is preferred for load-based 1138 // reasons or negative if the local store is preferred. The magnitude depends 1139 // on the difference in load and the latency between the nodes. 1140 // 1141 // TODO(a-robinson): Should this be changed to avoid even thinking about lease 1142 // counts now that we try to spread leases and replicas based on QPS? As is it 1143 // may fight back a little bit against store-level QPS-based rebalancing. 1144 func loadBasedLeaseRebalanceScore( 1145 ctx context.Context, 1146 st *cluster.Settings, 1147 remoteWeight float64, 1148 remoteLatency time.Duration, 1149 remoteStore roachpb.StoreDescriptor, 1150 sourceWeight float64, 1151 source roachpb.StoreDescriptor, 1152 meanLeases float64, 1153 ) (int32, float64) { 1154 remoteLatencyMillis := float64(remoteLatency) / float64(time.Millisecond) 1155 rebalanceAdjustment := 1156 leaseRebalancingAggressiveness.Get(&st.SV) * 0.1 * math.Log10(remoteWeight/sourceWeight) * math.Log1p(remoteLatencyMillis) 1157 // Start with twice the base rebalance threshold in order to fight more 1158 // strongly against thrashing caused by small variances in the distribution 1159 // of request weights. 
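	// As a rough worked example (assumed numbers, default aggressiveness of
	// 1.0): with remoteWeight/sourceWeight = 10 and remoteLatencyMillis = 10,
	// rebalanceAdjustment = 0.1 * log10(10) * log1p(10), which is roughly 0.24,
	// so the threshold computed below becomes 0.10 - 0.24 = -0.14 and the
	// source is treated as overfull even when it holds slightly fewer leases
	// than the mean.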
1160 rebalanceThreshold := baseLoadBasedLeaseRebalanceThreshold - rebalanceAdjustment 1161 1162 overfullLeaseThreshold := int32(math.Ceil(meanLeases * (1 + rebalanceThreshold))) 1163 overfullScore := source.Capacity.LeaseCount - overfullLeaseThreshold 1164 underfullLeaseThreshold := int32(math.Floor(meanLeases * (1 - rebalanceThreshold))) 1165 underfullScore := underfullLeaseThreshold - remoteStore.Capacity.LeaseCount 1166 totalScore := overfullScore + underfullScore 1167 1168 log.VEventf(ctx, 1, 1169 "node: %d, sourceWeight: %.2f, remoteWeight: %.2f, remoteLatency: %v, "+ 1170 "rebalanceThreshold: %.2f, meanLeases: %.2f, sourceLeaseCount: %d, overfullThreshold: %d, "+ 1171 "remoteLeaseCount: %d, underfullThreshold: %d, totalScore: %d", 1172 remoteStore.Node.NodeID, sourceWeight, remoteWeight, remoteLatency, 1173 rebalanceThreshold, meanLeases, source.Capacity.LeaseCount, overfullLeaseThreshold, 1174 remoteStore.Capacity.LeaseCount, underfullLeaseThreshold, totalScore, 1175 ) 1176 return totalScore, rebalanceAdjustment 1177 } 1178 1179 func (a Allocator) shouldTransferLeaseWithoutStats( 1180 ctx context.Context, 1181 sl StoreList, 1182 source roachpb.StoreDescriptor, 1183 existing []roachpb.ReplicaDescriptor, 1184 ) bool { 1185 // TODO(a-robinson): Should we disable this behavior when load-based lease 1186 // rebalancing is enabled? In happy cases it's nice to keep this working 1187 // to even out the number of leases in addition to the number of replicas, 1188 // but it's certainly a blunt instrument that could undo what we want. 1189 1190 // Allow lease transfer if we're above the overfull threshold, which is 1191 // mean*(1+leaseRebalanceThreshold). 1192 overfullLeaseThreshold := int32(math.Ceil(sl.candidateLeases.mean * (1 + leaseRebalanceThreshold))) 1193 minOverfullThreshold := int32(math.Ceil(sl.candidateLeases.mean + 5)) 1194 if overfullLeaseThreshold < minOverfullThreshold { 1195 overfullLeaseThreshold = minOverfullThreshold 1196 } 1197 if source.Capacity.LeaseCount > overfullLeaseThreshold { 1198 return true 1199 } 1200 1201 if float64(source.Capacity.LeaseCount) > sl.candidateLeases.mean { 1202 underfullLeaseThreshold := int32(math.Ceil(sl.candidateLeases.mean * (1 - leaseRebalanceThreshold))) 1203 minUnderfullThreshold := int32(math.Ceil(sl.candidateLeases.mean - 5)) 1204 if underfullLeaseThreshold > minUnderfullThreshold { 1205 underfullLeaseThreshold = minUnderfullThreshold 1206 } 1207 1208 for _, repl := range existing { 1209 storeDesc, ok := a.storePool.getStoreDescriptor(repl.StoreID) 1210 if !ok { 1211 continue 1212 } 1213 if storeDesc.Capacity.LeaseCount < underfullLeaseThreshold { 1214 return true 1215 } 1216 } 1217 } 1218 return false 1219 } 1220 1221 func (a Allocator) preferredLeaseholders( 1222 zone *zonepb.ZoneConfig, existing []roachpb.ReplicaDescriptor, 1223 ) []roachpb.ReplicaDescriptor { 1224 // Go one preference at a time. As soon as we've found replicas that match a 1225 // preference, we don't need to look at the later preferences, because 1226 // they're meant to be ordered by priority. 1227 for _, preference := range zone.LeasePreferences { 1228 var preferred []roachpb.ReplicaDescriptor 1229 for _, repl := range existing { 1230 // TODO(a-robinson): Do all these lookups at once, up front? We could 1231 // easily be passing a slice of StoreDescriptors around all the Allocator 1232 // functions instead of ReplicaDescriptors. 
1233 storeDesc, ok := a.storePool.getStoreDescriptor(repl.StoreID) 1234 if !ok { 1235 continue 1236 } 1237 if constraint.ConjunctionsCheck(storeDesc, preference.Constraints) { 1238 preferred = append(preferred, repl) 1239 } 1240 } 1241 if len(preferred) > 0 { 1242 return preferred 1243 } 1244 } 1245 return nil 1246 } 1247 1248 // computeQuorum computes the quorum value for the given number of nodes. 1249 func computeQuorum(nodes int) int { 1250 return (nodes / 2) + 1 1251 } 1252 1253 // filterBehindReplicas removes any "behind" replicas from the supplied 1254 // slice. A "behind" replica is one which is not at or past the quorum commit 1255 // index. 1256 func filterBehindReplicas( 1257 ctx context.Context, raftStatus *raft.Status, replicas []roachpb.ReplicaDescriptor, 1258 ) []roachpb.ReplicaDescriptor { 1259 if raftStatus == nil || len(raftStatus.Progress) == 0 { 1260 // raftStatus.Progress is only populated on the Raft leader which means we 1261 // won't be able to rebalance a lease away if the lease holder is not the 1262 // Raft leader. This is rare enough not to matter. 1263 return nil 1264 } 1265 candidates := make([]roachpb.ReplicaDescriptor, 0, len(replicas)) 1266 for _, r := range replicas { 1267 if !replicaIsBehind(raftStatus, r.ReplicaID) { 1268 candidates = append(candidates, r) 1269 } 1270 } 1271 return candidates 1272 } 1273 1274 func replicaIsBehind(raftStatus *raft.Status, replicaID roachpb.ReplicaID) bool { 1275 if raftStatus == nil || len(raftStatus.Progress) == 0 { 1276 return true 1277 } 1278 // NB: We use raftStatus.Commit instead of getQuorumIndex() because the 1279 // latter can return a value that is less than the commit index. This is 1280 // useful for Raft log truncation which sometimes wishes to keep those 1281 // earlier indexes, but not appropriate for determining which nodes are 1282 // behind the actual commit index of the range. 1283 if progress, ok := raftStatus.Progress[uint64(replicaID)]; ok { 1284 if uint64(replicaID) == raftStatus.Lead || 1285 (progress.State == tracker.StateReplicate && 1286 progress.Match >= raftStatus.Commit) { 1287 return false 1288 } 1289 } 1290 return true 1291 } 1292 1293 // simulateFilterUnremovableReplicas removes any unremovable replicas from the 1294 // supplied slice. Unlike filterUnremovableReplicas, brandNewReplicaID is 1295 // considered up-to-date (and thus can participate in quorum), but is not 1296 // considered a candidate for removal. 1297 func simulateFilterUnremovableReplicas( 1298 ctx context.Context, 1299 raftStatus *raft.Status, 1300 replicas []roachpb.ReplicaDescriptor, 1301 brandNewReplicaID roachpb.ReplicaID, 1302 ) []roachpb.ReplicaDescriptor { 1303 status := *raftStatus 1304 status.Progress[uint64(brandNewReplicaID)] = tracker.Progress{ 1305 State: tracker.StateReplicate, 1306 Match: status.Commit, 1307 } 1308 return filterUnremovableReplicas(ctx, &status, replicas, brandNewReplicaID) 1309 } 1310 1311 // filterUnremovableReplicas removes any unremovable replicas from the supplied 1312 // slice. An unremovable replica is one which is a necessary part of the 1313 // quorum that will result from removing 1 replica. We forgive brandNewReplicaID 1314 // for being behind, since a new range can take a little while to catch up. 1315 // This is important when we've just added a replica in order to rebalance to 1316 // it (#17879). 
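//
// As a concrete illustration of the quorum arithmetic below, take a 3-replica
// range: oldQuorum = computeQuorum(3) = 2 and newQuorum = computeQuorum(2) = 2.
// With only one up-to-date replica nothing may be removed; with exactly two
// up-to-date replicas only replicas that are behind may be removed (and
// brandNewReplicaID remains protected); with all three caught up, any replica
// other than brandNewReplicaID may be removed.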
1317 func filterUnremovableReplicas( 1318 ctx context.Context, 1319 raftStatus *raft.Status, 1320 replicas []roachpb.ReplicaDescriptor, 1321 brandNewReplicaID roachpb.ReplicaID, 1322 ) []roachpb.ReplicaDescriptor { 1323 upToDateReplicas := filterBehindReplicas(ctx, raftStatus, replicas) 1324 oldQuorum := computeQuorum(len(replicas)) 1325 if len(upToDateReplicas) < oldQuorum { 1326 // The number of up-to-date replicas is less than the old quorum. No 1327 // replicas can be removed. A below quorum range won't be able to process a 1328 // replica removal in any case. The logic here prevents any attempt to even 1329 // try the removal. 1330 return nil 1331 } 1332 1333 newQuorum := computeQuorum(len(replicas) - 1) 1334 if len(upToDateReplicas) > newQuorum { 1335 // The number of up-to-date replicas is larger than the new quorum. Any 1336 // replica can be removed, though we want to filter out brandNewReplicaID. 1337 if brandNewReplicaID != 0 { 1338 candidates := make([]roachpb.ReplicaDescriptor, 0, len(replicas)-len(upToDateReplicas)) 1339 for _, r := range replicas { 1340 if r.ReplicaID != brandNewReplicaID { 1341 candidates = append(candidates, r) 1342 } 1343 } 1344 return candidates 1345 } 1346 return replicas 1347 } 1348 1349 // The number of up-to-date replicas is equal to the new quorum. Only allow 1350 // removal of behind replicas (except for brandNewReplicaID which is given a 1351 // free pass). 1352 candidates := make([]roachpb.ReplicaDescriptor, 0, len(replicas)-len(upToDateReplicas)) 1353 necessary := func(r roachpb.ReplicaDescriptor) bool { 1354 if r.ReplicaID == brandNewReplicaID { 1355 return true 1356 } 1357 for _, t := range upToDateReplicas { 1358 if t == r { 1359 return true 1360 } 1361 } 1362 return false 1363 } 1364 for _, r := range replicas { 1365 if !necessary(r) { 1366 candidates = append(candidates, r) 1367 } 1368 } 1369 return candidates 1370 } 1371 1372 func maxReplicaID(replicas []roachpb.ReplicaDescriptor) roachpb.ReplicaID { 1373 var max roachpb.ReplicaID 1374 for i := range replicas { 1375 if replicaID := replicas[i].ReplicaID; replicaID > max { 1376 max = replicaID 1377 } 1378 } 1379 return max 1380 }
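// logNeededReplicas is an illustrative helper, not part of the upstream
// allocator: the function name and log message are ours, and it exists only
// to demonstrate how GetNeededReplicas clamps the zone's replication factor.
// For example:
//
//   zone NumReplicas=5, clusterNodes=4  -> 3 (avoid an even replica count)
//   zone NumReplicas=5, clusterNodes=5  -> 5
//   zone NumReplicas=4, clusterNodes=10 -> 4 (even counts are kept when explicitly requested)
//   zone NumReplicas=3, clusterNodes=1  -> 3 (never below 3 unless fewer were requested)
func logNeededReplicas(ctx context.Context, zoneConfigReplicaCount int32, clusterNodes int) {
	need := GetNeededReplicas(zoneConfigReplicaCount, clusterNodes)
	log.VEventf(ctx, 3, "zone wants %d replica(s), %d usable node(s): allocator will aim for %d replica(s)",
		zoneConfigReplicaCount, clusterNodes, need)
}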