github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/store_rebalancer.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"math"
	"math/rand"
	"sort"
	"time"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/util/contextutil"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/metric"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"go.etcd.io/etcd/raft"
)

const (
	// storeRebalancerTimerDuration is how frequently to check the store-level
	// balance of the cluster.
	storeRebalancerTimerDuration = time.Minute

	// minQPSThresholdDifference is the minimum QPS difference from the cluster
	// mean that this system should care about. In other words, we won't worry
	// about rebalancing for QPS reasons if a store's QPS differs from the mean
	// by less than this amount even if the amount is greater than the percentage
	// threshold. This avoids too many lease transfers in lightly loaded clusters.
	minQPSThresholdDifference = 100
)
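
// Illustrative example (hypothetical numbers, not from the source): with a
// cluster mean of 200 qps and the default qpsRebalanceThreshold of 0.25, a
// purely fractional upper bound would be 200*1.25 = 250 qps, but
// minQPSThresholdDifference widens it to 200+100 = 300 qps, so small absolute
// imbalances on lightly loaded clusters don't trigger transfers.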
    44  
var (
	metaStoreRebalancerLeaseTransferCount = metric.Metadata{
		Name:        "rebalancing.lease.transfers",
		Help:        "Number of lease transfers motivated by store-level load imbalances",
		Measurement: "Lease Transfers",
		Unit:        metric.Unit_COUNT,
	}
	metaStoreRebalancerRangeRebalanceCount = metric.Metadata{
		Name:        "rebalancing.range.rebalances",
		Help:        "Number of range rebalance operations motivated by store-level load imbalances",
		Measurement: "Range Rebalances",
		Unit:        metric.Unit_COUNT,
	}
)

// StoreRebalancerMetrics is the set of metrics for the store-level rebalancer.
type StoreRebalancerMetrics struct {
	LeaseTransferCount  *metric.Counter
	RangeRebalanceCount *metric.Counter
}

func makeStoreRebalancerMetrics() StoreRebalancerMetrics {
	return StoreRebalancerMetrics{
		LeaseTransferCount:  metric.NewCounter(metaStoreRebalancerLeaseTransferCount),
		RangeRebalanceCount: metric.NewCounter(metaStoreRebalancerRangeRebalanceCount),
	}
}

// LoadBasedRebalancingMode controls whether range rebalancing takes
// additional variables such as write load and disk usage into account.
// If disabled, rebalancing is done purely based on replica count.
var LoadBasedRebalancingMode = settings.RegisterPublicEnumSetting(
	"kv.allocator.load_based_rebalancing",
	"whether to rebalance based on the distribution of QPS across stores",
	"leases and replicas",
	map[int64]string{
		int64(LBRebalancingOff):               "off",
		int64(LBRebalancingLeasesOnly):        "leases",
		int64(LBRebalancingLeasesAndReplicas): "leases and replicas",
	},
)

// qpsRebalanceThreshold is much like rangeRebalanceThreshold, but for
// QPS rather than range count. This should be set higher than
// rangeRebalanceThreshold because QPS can naturally vary over time as
// workloads change and clients come and go, so we need to be a little more
// forgiving to avoid thrashing.
var qpsRebalanceThreshold = func() *settings.FloatSetting {
	s := settings.RegisterNonNegativeFloatSetting(
		"kv.allocator.qps_rebalance_threshold",
		"minimum fraction away from the mean a store's QPS (queries per second) can be before it is considered overfull or underfull",
		0.25,
	)
	s.SetVisibility(settings.Public)
	return s
}()
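
// Because the setting is public, it can be tuned at runtime. A sketch of how
// an operator might loosen it from the 0.25 default:
//
//	SET CLUSTER SETTING kv.allocator.qps_rebalance_threshold = 0.3;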
   101  
// LBRebalancingMode controls if and when we do store-level rebalancing
// based on load.
type LBRebalancingMode int64

const (
	// LBRebalancingOff means that we do not do store-level rebalancing
	// based on load statistics.
	LBRebalancingOff LBRebalancingMode = iota
	// LBRebalancingLeasesOnly means that we rebalance leases based on
	// store-level QPS imbalances.
	LBRebalancingLeasesOnly
	// LBRebalancingLeasesAndReplicas means that we rebalance both leases and
	// replicas based on store-level QPS imbalances.
	LBRebalancingLeasesAndReplicas
)
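
// These values correspond to the strings registered for the
// kv.allocator.load_based_rebalancing setting above. For example (a sketch):
//
//	SET CLUSTER SETTING kv.allocator.load_based_rebalancing = 'leases';
//
// selects LBRebalancingLeasesOnly.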
   117  
// StoreRebalancer is responsible for examining how the associated store's load
// compares to the load on other stores in the cluster and transferring leases
// or replicas away if the local store is overloaded.
//
// This isn't implemented as a Queue because the Queues all operate on one
// replica at a time, making a local decision about each replica. Queues don't
// really know how the replica they're looking at compares to other replicas on
// the store. Our goal is balancing stores, though, so it's preferable to make
// decisions about each store and then carefully pick replicas to move that
// will best accomplish the store-level goals.
type StoreRebalancer struct {
	log.AmbientContext
	metrics         StoreRebalancerMetrics
	st              *cluster.Settings
	rq              *replicateQueue
	replRankings    *replicaRankings
	getRaftStatusFn func(replica *Replica) *raft.Status
}

// NewStoreRebalancer creates a StoreRebalancer to work in tandem with the
// provided replicateQueue.
func NewStoreRebalancer(
	ambientCtx log.AmbientContext,
	st *cluster.Settings,
	rq *replicateQueue,
	replRankings *replicaRankings,
) *StoreRebalancer {
	sr := &StoreRebalancer{
		AmbientContext: ambientCtx,
		metrics:        makeStoreRebalancerMetrics(),
		st:             st,
		rq:             rq,
		replRankings:   replRankings,
		getRaftStatusFn: func(replica *Replica) *raft.Status {
			return replica.RaftStatus()
		},
	}
	sr.AddLogTag("store-rebalancer", nil)
	sr.rq.store.metrics.registry.AddMetricStruct(&sr.metrics)
	return sr
}
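
// A sketch of the intended wiring (in the real server this happens during
// store startup):
//
//	sr := NewStoreRebalancer(ambientCtx, st, rq, replRankings)
//	sr.Start(ctx, stopper)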
   159  
// Start runs an infinite loop in a goroutine which regularly checks whether
// the store is overloaded along any important dimension (e.g. range count,
// QPS, disk usage), and if so attempts to correct that by moving leases or
// replicas elsewhere.
//
// This worker acts on store-level imbalances, whereas the replicate queue
// makes decisions based on the zone config constraints and diversity of
// individual ranges. This means that there are two different workers that
// could potentially be making decisions about a given range, so they have to
// be careful to avoid stepping on each others' toes.
//
// TODO(a-robinson): Expose metrics to make this understandable without having
// to dive into logspy.
func (sr *StoreRebalancer) Start(ctx context.Context, stopper *stop.Stopper) {
	ctx = sr.AnnotateCtx(ctx)

	// Start a goroutine that periodically checks the store-level load balance
	// and transfers leases or replicas away if this store is overloaded.
	stopper.RunWorker(ctx, func(ctx context.Context) {
		timer := timeutil.NewTimer()
		defer timer.Stop()
		timer.Reset(jitteredInterval(storeRebalancerTimerDuration))
		for {
			// Wait out the first tick before doing anything since the store is still
			// starting up and we might as well wait for some qps/wps stats to
			// accumulate.
			select {
			case <-stopper.ShouldQuiesce():
				return
			case <-timer.C:
				timer.Read = true
				timer.Reset(jitteredInterval(storeRebalancerTimerDuration))
			}

			mode := LBRebalancingMode(LoadBasedRebalancingMode.Get(&sr.st.SV))
			if mode == LBRebalancingOff {
				continue
			}

			storeList, _, _ := sr.rq.allocator.storePool.getStoreList(storeFilterNone)
			sr.rebalanceStore(ctx, mode, storeList)
		}
	})
}
   204  
func (sr *StoreRebalancer) rebalanceStore(
	ctx context.Context, mode LBRebalancingMode, storeList StoreList,
) {
	qpsThresholdFraction := qpsRebalanceThreshold.Get(&sr.st.SV)

	// First check if we should transfer leases away to better balance QPS.
	qpsMinThreshold := math.Min(storeList.candidateQueriesPerSecond.mean*(1-qpsThresholdFraction),
		storeList.candidateQueriesPerSecond.mean-minQPSThresholdDifference)
	qpsMaxThreshold := math.Max(storeList.candidateQueriesPerSecond.mean*(1+qpsThresholdFraction),
		storeList.candidateQueriesPerSecond.mean+minQPSThresholdDifference)
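
	// Worked example (hypothetical numbers): with mean=1000 qps and the 0.25
	// default threshold, qpsMinThreshold = min(750, 900) = 750 and
	// qpsMaxThreshold = max(1250, 1100) = 1250. With mean=200 qps, the
	// +/-minQPSThresholdDifference terms dominate and the window widens to
	// [100, 300].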
   215  
	var localDesc *roachpb.StoreDescriptor
	for i := range storeList.stores {
		if storeList.stores[i].StoreID == sr.rq.store.StoreID() {
			localDesc = &storeList.stores[i]
		}
	}
	if localDesc == nil {
		log.Warningf(ctx, "StorePool missing descriptor for local store")
		return
	}

	if !(localDesc.Capacity.QueriesPerSecond > qpsMaxThreshold) {
		log.VEventf(ctx, 1, "local QPS %.2f is below max threshold %.2f (mean=%.2f); no rebalancing needed",
			localDesc.Capacity.QueriesPerSecond, qpsMaxThreshold, storeList.candidateQueriesPerSecond.mean)
		return
	}

	var replicasToMaybeRebalance []replicaWithStats
	storeMap := storeListToMap(storeList)

	log.Infof(ctx,
		"considering load-based lease transfers for s%d with %.2f qps (mean=%.2f, upperThreshold=%.2f)",
		localDesc.StoreID, localDesc.Capacity.QueriesPerSecond, storeList.candidateQueriesPerSecond.mean, qpsMaxThreshold)

	hottestRanges := sr.replRankings.topQPS()
	for localDesc.Capacity.QueriesPerSecond > qpsMaxThreshold {
		replWithStats, target, considerForRebalance := sr.chooseLeaseToTransfer(
			ctx, &hottestRanges, localDesc, storeList, storeMap, qpsMinThreshold, qpsMaxThreshold)
		replicasToMaybeRebalance = append(replicasToMaybeRebalance, considerForRebalance...)
		if replWithStats.repl == nil {
			break
		}

		log.VEventf(ctx, 1, "transferring r%d (%.2f qps) to s%d to better balance load",
			replWithStats.repl.RangeID, replWithStats.qps, target.StoreID)
		timeout := sr.rq.processTimeoutFunc(sr.st, replWithStats.repl)
		if err := contextutil.RunWithTimeout(ctx, "transfer lease", timeout, func(ctx context.Context) error {
			return sr.rq.transferLease(ctx, replWithStats.repl, target, replWithStats.qps)
		}); err != nil {
			log.Errorf(ctx, "unable to transfer lease to s%d: %+v", target.StoreID, err)
			continue
		}
		sr.metrics.LeaseTransferCount.Inc(1)

		// Finally, update our local copies of the descriptors so that if
		// additional transfers are needed we'll be making the decisions with more
		// up-to-date info. The StorePool copies are updated by transferLease.
		localDesc.Capacity.LeaseCount--
		localDesc.Capacity.QueriesPerSecond -= replWithStats.qps
		if otherDesc := storeMap[target.StoreID]; otherDesc != nil {
			otherDesc.Capacity.LeaseCount++
			otherDesc.Capacity.QueriesPerSecond += replWithStats.qps
		}
	}

	if !(localDesc.Capacity.QueriesPerSecond > qpsMaxThreshold) {
		log.Infof(ctx,
			"load-based lease transfers successfully brought s%d down to %.2f qps (mean=%.2f, upperThreshold=%.2f)",
			localDesc.StoreID, localDesc.Capacity.QueriesPerSecond, storeList.candidateQueriesPerSecond.mean, qpsMaxThreshold)
		return
	}

	if mode != LBRebalancingLeasesAndReplicas {
		log.Infof(ctx,
			"ran out of leases worth transferring and qps (%.2f) is still above desired threshold (%.2f)",
			localDesc.Capacity.QueriesPerSecond, qpsMaxThreshold)
		return
	}
	log.Infof(ctx,
		"ran out of leases worth transferring and qps (%.2f) is still above desired threshold (%.2f); considering load-based replica rebalances",
		localDesc.Capacity.QueriesPerSecond, qpsMaxThreshold)

	// Re-combine replicasToMaybeRebalance with what remains of hottestRanges so
	// that we'll reconsider them for replica rebalancing.
	replicasToMaybeRebalance = append(replicasToMaybeRebalance, hottestRanges...)

	for localDesc.Capacity.QueriesPerSecond > qpsMaxThreshold {
		replWithStats, targets := sr.chooseReplicaToRebalance(
			ctx,
			&replicasToMaybeRebalance,
			localDesc,
			storeList,
			storeMap,
			qpsMinThreshold,
			qpsMaxThreshold)
		if replWithStats.repl == nil {
			log.Infof(ctx,
				"ran out of replicas worth transferring and qps (%.2f) is still above desired threshold (%.2f); will check again soon",
				localDesc.Capacity.QueriesPerSecond, qpsMaxThreshold)
			return
		}

		descBeforeRebalance := replWithStats.repl.Desc()
		log.VEventf(ctx, 1, "rebalancing r%d (%.2f qps) from %v to %v to better balance load",
			replWithStats.repl.RangeID, replWithStats.qps, descBeforeRebalance.Replicas(), targets)
		timeout := sr.rq.processTimeoutFunc(sr.st, replWithStats.repl)
		if err := contextutil.RunWithTimeout(ctx, "relocate range", timeout, func(ctx context.Context) error {
			return sr.rq.store.AdminRelocateRange(ctx, *descBeforeRebalance, targets)
		}); err != nil {
			log.Errorf(ctx, "unable to relocate range to %v: %+v", targets, err)
			continue
		}
		sr.metrics.RangeRebalanceCount.Inc(1)

		// Finally, update our local copies of the descriptors so that if
		// additional transfers are needed we'll be making the decisions with more
		// up-to-date info.
		//
		// TODO(a-robinson): This just updates the copies used locally by the
		// storeRebalancer. We may also want to update the copies in the StorePool
		// itself.
		replicasBeforeRebalance := descBeforeRebalance.Replicas().All()
		for i := range replicasBeforeRebalance {
			if storeDesc := storeMap[replicasBeforeRebalance[i].StoreID]; storeDesc != nil {
				storeDesc.Capacity.RangeCount--
			}
		}
		localDesc.Capacity.LeaseCount--
		localDesc.Capacity.QueriesPerSecond -= replWithStats.qps
		for i := range targets {
			if storeDesc := storeMap[targets[i].StoreID]; storeDesc != nil {
				storeDesc.Capacity.RangeCount++
				if i == 0 {
					storeDesc.Capacity.LeaseCount++
					storeDesc.Capacity.QueriesPerSecond += replWithStats.qps
				}
			}
		}
	}

	log.Infof(ctx,
		"load-based replica transfers successfully brought s%d down to %.2f qps (mean=%.2f, upperThreshold=%.2f)",
		localDesc.StoreID, localDesc.Capacity.QueriesPerSecond, storeList.candidateQueriesPerSecond.mean, qpsMaxThreshold)
}
   350  
// TODO(a-robinson): Should we take the number of leases on each store into
// account here or just continue to let that happen in allocator.go?
func (sr *StoreRebalancer) chooseLeaseToTransfer(
	ctx context.Context,
	hottestRanges *[]replicaWithStats,
	localDesc *roachpb.StoreDescriptor,
	storeList StoreList,
	storeMap map[roachpb.StoreID]*roachpb.StoreDescriptor,
	minQPS float64,
	maxQPS float64,
) (replicaWithStats, roachpb.ReplicaDescriptor, []replicaWithStats) {
	var considerForRebalance []replicaWithStats
	now := sr.rq.store.Clock().Now()
	for {
		if len(*hottestRanges) == 0 {
			return replicaWithStats{}, roachpb.ReplicaDescriptor{}, considerForRebalance
		}
		replWithStats := (*hottestRanges)[0]
		*hottestRanges = (*hottestRanges)[1:]

		// We're all out of replicas.
		if replWithStats.repl == nil {
			return replicaWithStats{}, roachpb.ReplicaDescriptor{}, considerForRebalance
		}

		if shouldNotMoveAway(ctx, replWithStats, localDesc, now, minQPS) {
			continue
		}

		// Don't bother moving leases whose QPS is below some small fraction of the
		// store's QPS (unless the store has extra leases to spare anyway). It's
		// just unnecessary churn with no benefit to move leases responsible for,
		// for example, 1 qps on a store with 5000 qps.
		const minQPSFraction = .001
		if replWithStats.qps < localDesc.Capacity.QueriesPerSecond*minQPSFraction &&
			float64(localDesc.Capacity.LeaseCount) <= storeList.candidateLeases.mean {
			log.VEventf(ctx, 5, "r%d's %.2f qps is too little to matter relative to s%d's %.2f total qps",
				replWithStats.repl.RangeID, replWithStats.qps, localDesc.StoreID, localDesc.Capacity.QueriesPerSecond)
			continue
		}

		desc, zone := replWithStats.repl.DescAndZone()
		log.VEventf(ctx, 3, "considering lease transfer for r%d with %.2f qps",
			desc.RangeID, replWithStats.qps)

		// Check all the other replicas in order of increasing qps. Learner replicas
		// aren't allowed to become the leaseholder or raft leader, so only consider
		// the `Voters` replicas.
		candidates := desc.Replicas().DeepCopy().Voters()
		sort.Slice(candidates, func(i, j int) bool {
			var iQPS, jQPS float64
			if desc := storeMap[candidates[i].StoreID]; desc != nil {
				iQPS = desc.Capacity.QueriesPerSecond
			}
			if desc := storeMap[candidates[j].StoreID]; desc != nil {
				jQPS = desc.Capacity.QueriesPerSecond
			}
			return iQPS < jQPS
		})

		var raftStatus *raft.Status

		preferred := sr.rq.allocator.preferredLeaseholders(zone, candidates)
		for _, candidate := range candidates {
			if candidate.StoreID == localDesc.StoreID {
				continue
			}

			meanQPS := storeList.candidateQueriesPerSecond.mean
			if shouldNotMoveTo(ctx, storeMap, replWithStats, candidate.StoreID, meanQPS, minQPS, maxQPS) {
				continue
			}

			if raftStatus == nil {
				raftStatus = sr.getRaftStatusFn(replWithStats.repl)
			}
			if replicaIsBehind(raftStatus, candidate.ReplicaID) {
				log.VEventf(ctx, 3, "%v is behind or this store isn't the raft leader for r%d; raftStatus: %v",
					candidate, desc.RangeID, raftStatus)
				continue
			}

			if len(preferred) > 0 && !storeHasReplica(candidate.StoreID, preferred) {
				log.VEventf(ctx, 3, "s%d not a preferred leaseholder for r%d; preferred: %v",
					candidate.StoreID, desc.RangeID, preferred)
				continue
			}

			filteredStoreList := storeList.filter(zone.Constraints)
			if sr.rq.allocator.followTheWorkloadPrefersLocal(
				ctx,
				filteredStoreList,
				*localDesc,
				candidate.StoreID,
				candidates,
				replWithStats.repl.leaseholderStats,
			) {
				log.VEventf(ctx, 3, "r%d is on s%d due to follow-the-workload; skipping",
					desc.RangeID, localDesc.StoreID)
				continue
			}

			return replWithStats, candidate, considerForRebalance
		}

		// If none of the other replicas are valid lease transfer targets, consider
		// this range for replica rebalancing.
		considerForRebalance = append(considerForRebalance, replWithStats)
	}
}
   461  
func (sr *StoreRebalancer) chooseReplicaToRebalance(
	ctx context.Context,
	hottestRanges *[]replicaWithStats,
	localDesc *roachpb.StoreDescriptor,
	storeList StoreList,
	storeMap map[roachpb.StoreID]*roachpb.StoreDescriptor,
	minQPS float64,
	maxQPS float64,
) (replicaWithStats, []roachpb.ReplicationTarget) {
	now := sr.rq.store.Clock().Now()
	for {
		if len(*hottestRanges) == 0 {
			return replicaWithStats{}, nil
		}
		replWithStats := (*hottestRanges)[0]
		*hottestRanges = (*hottestRanges)[1:]

		if replWithStats.repl == nil {
			return replicaWithStats{}, nil
		}

		if shouldNotMoveAway(ctx, replWithStats, localDesc, now, minQPS) {
			continue
		}

		// Don't bother moving ranges whose QPS is below some small fraction of the
		// store's QPS (unless the store has extra ranges to spare anyway). It's
		// just unnecessary churn with no benefit to move ranges responsible for,
		// for example, 1 qps on a store with 5000 qps.
		const minQPSFraction = .001
		if replWithStats.qps < localDesc.Capacity.QueriesPerSecond*minQPSFraction &&
			float64(localDesc.Capacity.RangeCount) <= storeList.candidateRanges.mean {
			log.VEventf(ctx, 5, "r%d's %.2f qps is too little to matter relative to s%d's %.2f total qps",
				replWithStats.repl.RangeID, replWithStats.qps, localDesc.StoreID, localDesc.Capacity.QueriesPerSecond)
			continue
		}

		desc, zone := replWithStats.repl.DescAndZone()
		log.VEventf(ctx, 3, "considering replica rebalance for r%d with %.2f qps",
			desc.RangeID, replWithStats.qps)

		clusterNodes := sr.rq.allocator.storePool.ClusterNodeCount()
		desiredReplicas := GetNeededReplicas(*zone.NumReplicas, clusterNodes)
		targets := make([]roachpb.ReplicationTarget, 0, desiredReplicas)
		targetReplicas := make([]roachpb.ReplicaDescriptor, 0, desiredReplicas)
		currentReplicas := desc.Replicas().All()

		// Check the range's existing diversity score, since we want to ensure we
		// don't hurt locality diversity just to improve QPS.
		curDiversity := rangeDiversityScore(
			sr.rq.allocator.storePool.getLocalities(currentReplicas))

		// Check the existing replicas, keeping around those that aren't overloaded.
		for i := range currentReplicas {
			if currentReplicas[i].StoreID == localDesc.StoreID {
				continue
			}
			// Keep the replica in the range if we don't know its QPS or if its QPS
			// is below the upper threshold. Punishing stores not in our store map
			// could cause mass evictions if the storePool gets out of sync.
			storeDesc, ok := storeMap[currentReplicas[i].StoreID]
			if !ok || storeDesc.Capacity.QueriesPerSecond < maxQPS {
				targets = append(targets, roachpb.ReplicationTarget{
					NodeID:  currentReplicas[i].NodeID,
					StoreID: currentReplicas[i].StoreID,
				})
				targetReplicas = append(targetReplicas, roachpb.ReplicaDescriptor{
					NodeID:  currentReplicas[i].NodeID,
					StoreID: currentReplicas[i].StoreID,
				})
			}
		}

		// Then pick out which new stores to add the remaining replicas to.
		options := sr.rq.allocator.scorerOptions()
		options.qpsRebalanceThreshold = qpsRebalanceThreshold.Get(&sr.st.SV)
		for len(targets) < desiredReplicas {
			// Use the preexisting AllocateTarget logic to ensure that considerations
			// such as zone constraints, locality diversity, and full disk come
			// into play.
			target, _ := sr.rq.allocator.allocateTargetFromList(
				ctx,
				storeList,
				zone,
				targetReplicas,
				options,
			)
			if target == nil {
				log.VEventf(ctx, 3, "no rebalance targets found to replace the current store for r%d",
					desc.RangeID)
				break
			}

			meanQPS := storeList.candidateQueriesPerSecond.mean
			if shouldNotMoveTo(ctx, storeMap, replWithStats, target.StoreID, meanQPS, minQPS, maxQPS) {
				break
			}

			targets = append(targets, roachpb.ReplicationTarget{
				NodeID:  target.Node.NodeID,
				StoreID: target.StoreID,
			})
			targetReplicas = append(targetReplicas, roachpb.ReplicaDescriptor{
				NodeID:  target.Node.NodeID,
				StoreID: target.StoreID,
			})
		}

		// If we couldn't find enough valid targets, forget about this range.
		//
		// TODO(a-robinson): Support more incremental improvements -- move what we
		// can if it makes things better even if it isn't great. For example,
		// moving one of the other existing replicas that's on a store with less
		// qps than the max threshold but above the mean would help in certain
		// locality configurations.
		if len(targets) < desiredReplicas {
			log.VEventf(ctx, 3, "couldn't find enough rebalance targets for r%d (%d/%d)",
				desc.RangeID, len(targets), desiredReplicas)
			continue
		}
		newDiversity := rangeDiversityScore(sr.rq.allocator.storePool.getLocalities(targetReplicas))
		if newDiversity < curDiversity {
			log.VEventf(ctx, 3,
				"new diversity %.2f for r%d worse than current diversity %.2f; not rebalancing",
				newDiversity, desc.RangeID, curDiversity)
			continue
		}

		// Pick the replica with the least QPS to be leaseholder;
		// RelocateRange transfers the lease to the first provided target.
		newLeaseIdx := 0
		newLeaseQPS := math.MaxFloat64
		var raftStatus *raft.Status
		for i := 0; i < len(targets); i++ {
			// Ensure we don't transfer the lease to an existing replica that is behind
			// in processing its raft log.
			if replica, ok := desc.GetReplicaDescriptor(targets[i].StoreID); ok {
				if raftStatus == nil {
					raftStatus = sr.getRaftStatusFn(replWithStats.repl)
				}
				if replicaIsBehind(raftStatus, replica.ReplicaID) {
					continue
				}
			}

			storeDesc, ok := storeMap[targets[i].StoreID]
			if ok && storeDesc.Capacity.QueriesPerSecond < newLeaseQPS {
				newLeaseIdx = i
				newLeaseQPS = storeDesc.Capacity.QueriesPerSecond
			}
		}
		targets[0], targets[newLeaseIdx] = targets[newLeaseIdx], targets[0]
		return replWithStats, targets
	}
}
   617  
func shouldNotMoveAway(
	ctx context.Context,
	replWithStats replicaWithStats,
	localDesc *roachpb.StoreDescriptor,
	now hlc.Timestamp,
	minQPS float64,
) bool {
	if !replWithStats.repl.OwnsValidLease(now) {
		log.VEventf(ctx, 3, "store doesn't own the lease for r%d", replWithStats.repl.RangeID)
		return true
	}
	if localDesc.Capacity.QueriesPerSecond-replWithStats.qps < minQPS {
		log.VEventf(ctx, 3, "moving r%d's %.2f qps would bring s%d below the min threshold (%.2f)",
			replWithStats.repl.RangeID, replWithStats.qps, localDesc.StoreID, minQPS)
		return true
	}
	return false
}
   636  
func shouldNotMoveTo(
	ctx context.Context,
	storeMap map[roachpb.StoreID]*roachpb.StoreDescriptor,
	replWithStats replicaWithStats,
	candidateStore roachpb.StoreID,
	meanQPS float64,
	minQPS float64,
	maxQPS float64,
) bool {
	storeDesc, ok := storeMap[candidateStore]
	if !ok {
		log.VEventf(ctx, 3, "missing store descriptor for s%d", candidateStore)
		return true
	}

	newCandidateQPS := storeDesc.Capacity.QueriesPerSecond + replWithStats.qps
	if storeDesc.Capacity.QueriesPerSecond < minQPS {
		if newCandidateQPS > maxQPS {
			log.VEventf(ctx, 3,
				"r%d's %.2f qps would push s%d over the max threshold (%.2f) with %.2f qps afterwards",
				replWithStats.repl.RangeID, replWithStats.qps, candidateStore, maxQPS, newCandidateQPS)
			return true
		}
	} else if newCandidateQPS > meanQPS {
		log.VEventf(ctx, 3,
			"r%d's %.2f qps would push s%d over the mean (%.2f) with %.2f qps afterwards",
			replWithStats.repl.RangeID, replWithStats.qps, candidateStore, meanQPS, newCandidateQPS)
		return true
	}

	return false
}
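
// To make the asymmetry in shouldNotMoveTo concrete (hypothetical numbers):
// with minQPS=750, meanQPS=1000, and maxQPS=1250, a candidate currently at
// 700 qps (below minQPS) may take on up to 550 qps before hitting maxQPS,
// while a candidate already at 900 qps may only take on 100 qps before
// exceeding the mean. Underfull stores are allowed to overshoot the mean;
// all other stores are not.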
   669  
func storeListToMap(sl StoreList) map[roachpb.StoreID]*roachpb.StoreDescriptor {
	storeMap := make(map[roachpb.StoreID]*roachpb.StoreDescriptor)
	for i := range sl.stores {
		storeMap[sl.stores[i].StoreID] = &sl.stores[i]
	}
	return storeMap
}
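
// Note that storeListToMap stores pointers into sl.stores rather than copies,
// so callers like rebalanceStore that adjust the returned descriptors (and
// localDesc, which aliases the same underlying slice) see a single consistent
// view as transfers are accounted for.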
   677  
// jitteredInterval returns a randomly jittered (+/-25%) duration
// from interval.
func jitteredInterval(interval time.Duration) time.Duration {
	return time.Duration(float64(interval) * (0.75 + 0.5*rand.Float64()))
}
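
// For example, with interval = time.Minute the result is uniformly
// distributed in [45s, 75s), since 0.75 + 0.5*rand.Float64() lies in
// [0.75, 1.25).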