github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/store_rebalancer.go

// Copyright 2018 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"math"
	"math/rand"
	"sort"
	"time"

	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/settings"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/util/contextutil"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/metric"
	"github.com/cockroachdb/cockroach/pkg/util/stop"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"go.etcd.io/etcd/raft"
)

const (
	// storeRebalancerTimerDuration is how frequently to check the store-level
	// balance of the cluster.
	storeRebalancerTimerDuration = time.Minute

	// minQPSThresholdDifference is the minimum QPS difference from the cluster
	// mean that this system should care about. In other words, we won't worry
	// about rebalancing for QPS reasons if a store's QPS differs from the mean
	// by less than this amount even if the amount is greater than the percentage
	// threshold. This avoids too many lease transfers in lightly loaded clusters.
	minQPSThresholdDifference = 100
)

var (
	metaStoreRebalancerLeaseTransferCount = metric.Metadata{
		Name:        "rebalancing.lease.transfers",
		Help:        "Number of lease transfers motivated by store-level load imbalances",
		Measurement: "Lease Transfers",
		Unit:        metric.Unit_COUNT,
	}
	metaStoreRebalancerRangeRebalanceCount = metric.Metadata{
		Name:        "rebalancing.range.rebalances",
		Help:        "Number of range rebalance operations motivated by store-level load imbalances",
		Measurement: "Range Rebalances",
		Unit:        metric.Unit_COUNT,
	}
)

// StoreRebalancerMetrics is the set of metrics for the store-level rebalancer.
type StoreRebalancerMetrics struct {
	LeaseTransferCount  *metric.Counter
	RangeRebalanceCount *metric.Counter
}

func makeStoreRebalancerMetrics() StoreRebalancerMetrics {
	return StoreRebalancerMetrics{
		LeaseTransferCount:  metric.NewCounter(metaStoreRebalancerLeaseTransferCount),
		RangeRebalanceCount: metric.NewCounter(metaStoreRebalancerRangeRebalanceCount),
	}
}

// LoadBasedRebalancingMode controls whether lease and replica rebalancing
// takes store-level load (QPS) into account. If disabled, rebalancing is done
// purely based on replica count.
var LoadBasedRebalancingMode = settings.RegisterPublicEnumSetting(
	"kv.allocator.load_based_rebalancing",
	"whether to rebalance based on the distribution of QPS across stores",
	"leases and replicas",
	map[int64]string{
		int64(LBRebalancingOff):               "off",
		int64(LBRebalancingLeasesOnly):        "leases",
		int64(LBRebalancingLeasesAndReplicas): "leases and replicas",
	},
)

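// The mode can be changed at runtime; for example, via SQL (shown purely for
// illustration):
//
//	SET CLUSTER SETTING kv.allocator.load_based_rebalancing = 'leases';
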
// qpsRebalanceThreshold is much like rangeRebalanceThreshold, but for
// QPS rather than range count. This should be set higher than
// rangeRebalanceThreshold because QPS can naturally vary over time as
// workloads change and clients come and go, so we need to be a little more
// forgiving to avoid thrashing.
var qpsRebalanceThreshold = func() *settings.FloatSetting {
	s := settings.RegisterNonNegativeFloatSetting(
		"kv.allocator.qps_rebalance_threshold",
		"minimum fraction away from the mean a store's QPS (queries per second) can be before it is considered overfull or underfull",
		0.25,
	)
	s.SetVisibility(settings.Public)
	return s
}()

// LBRebalancingMode controls if and when we do store-level rebalancing
// based on load.
type LBRebalancingMode int64

const (
	// LBRebalancingOff means that we do not do store-level rebalancing
	// based on load statistics.
	LBRebalancingOff LBRebalancingMode = iota
	// LBRebalancingLeasesOnly means that we rebalance leases based on
	// store-level QPS imbalances.
	LBRebalancingLeasesOnly
	// LBRebalancingLeasesAndReplicas means that we rebalance both leases and
	// replicas based on store-level QPS imbalances.
	LBRebalancingLeasesAndReplicas
)

// StoreRebalancer is responsible for examining how the associated store's load
// compares to the load on other stores in the cluster and transferring leases
// or replicas away if the local store is overloaded.
//
// This isn't implemented as a Queue because the Queues all operate on one
// replica at a time, making a local decision about each replica. Queues don't
// really know how the replica they're looking at compares to other replicas on
// the store. Our goal is balancing stores, though, so it's preferable to make
// decisions about each store and then carefully pick replicas to move that
// will best accomplish the store-level goals.
type StoreRebalancer struct {
	log.AmbientContext
	metrics         StoreRebalancerMetrics
	st              *cluster.Settings
	rq              *replicateQueue
	replRankings    *replicaRankings
	getRaftStatusFn func(replica *Replica) *raft.Status
}

// NewStoreRebalancer creates a StoreRebalancer to work in tandem with the
// provided replicateQueue.
func NewStoreRebalancer(
	ambientCtx log.AmbientContext,
	st *cluster.Settings,
	rq *replicateQueue,
	replRankings *replicaRankings,
) *StoreRebalancer {
	sr := &StoreRebalancer{
		AmbientContext: ambientCtx,
		metrics:        makeStoreRebalancerMetrics(),
		st:             st,
		rq:             rq,
		replRankings:   replRankings,
		getRaftStatusFn: func(replica *Replica) *raft.Status {
			return replica.RaftStatus()
		},
	}
	sr.AddLogTag("store-rebalancer", nil)
	sr.rq.store.metrics.registry.AddMetricStruct(&sr.metrics)
	return sr
}

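// A minimal sketch of how a caller might wire the rebalancer up at store
// startup, assuming an already-constructed replicateQueue and replicaRankings
// (the variable names below are illustrative only):
//
//	sr := NewStoreRebalancer(ambientCtx, st, rq, replRankings)
//	sr.Start(ctx, stopper)
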
// Start runs an infinite loop in a goroutine which regularly checks whether
// the store is overloaded along any important dimension (e.g. range count,
// QPS, disk usage), and if so attempts to correct that by moving leases or
// replicas elsewhere.
//
// This worker acts on store-level imbalances, whereas the replicate queue
// makes decisions based on the zone config constraints and diversity of
// individual ranges. This means that there are two different workers that
// could potentially be making decisions about a given range, so they have to
// be careful to avoid stepping on each other's toes.
//
// TODO(a-robinson): Expose metrics to make this understandable without having
// to dive into logspy.
func (sr *StoreRebalancer) Start(ctx context.Context, stopper *stop.Stopper) {
	ctx = sr.AnnotateCtx(ctx)

	// Start a goroutine that periodically checks the store-level balance of
	// the cluster and transfers leases or replicas away if the local store is
	// overloaded.
	stopper.RunWorker(ctx, func(ctx context.Context) {
		timer := timeutil.NewTimer()
		defer timer.Stop()
		timer.Reset(jitteredInterval(storeRebalancerTimerDuration))
		for {
			// Wait out the first tick before doing anything since the store is still
			// starting up and we might as well wait for some qps/wps stats to
			// accumulate.
			select {
			case <-stopper.ShouldQuiesce():
				return
			case <-timer.C:
				timer.Read = true
				timer.Reset(jitteredInterval(storeRebalancerTimerDuration))
			}

			mode := LBRebalancingMode(LoadBasedRebalancingMode.Get(&sr.st.SV))
			if mode == LBRebalancingOff {
				continue
			}

			storeList, _, _ := sr.rq.allocator.storePool.getStoreList(storeFilterNone)
			sr.rebalanceStore(ctx, mode, storeList)
		}
	})
}

func (sr *StoreRebalancer) rebalanceStore(
	ctx context.Context, mode LBRebalancingMode, storeList StoreList,
) {
	qpsThresholdFraction := qpsRebalanceThreshold.Get(&sr.st.SV)

	// First check if we should transfer leases away to better balance QPS.
	qpsMinThreshold := math.Min(storeList.candidateQueriesPerSecond.mean*(1-qpsThresholdFraction),
		storeList.candidateQueriesPerSecond.mean-minQPSThresholdDifference)
	qpsMaxThreshold := math.Max(storeList.candidateQueriesPerSecond.mean*(1+qpsThresholdFraction),
		storeList.candidateQueriesPerSecond.mean+minQPSThresholdDifference)

	var localDesc *roachpb.StoreDescriptor
	for i := range storeList.stores {
		if storeList.stores[i].StoreID == sr.rq.store.StoreID() {
			localDesc = &storeList.stores[i]
		}
	}
	if localDesc == nil {
		log.Warningf(ctx, "StorePool missing descriptor for local store")
		return
	}

	if !(localDesc.Capacity.QueriesPerSecond > qpsMaxThreshold) {
		log.VEventf(ctx, 1, "local QPS %.2f is below max threshold %.2f (mean=%.2f); no rebalancing needed",
			localDesc.Capacity.QueriesPerSecond, qpsMaxThreshold, storeList.candidateQueriesPerSecond.mean)
		return
	}

	var replicasToMaybeRebalance []replicaWithStats
	storeMap := storeListToMap(storeList)

	log.Infof(ctx,
		"considering load-based lease transfers for s%d with %.2f qps (mean=%.2f, upperThreshold=%.2f)",
		localDesc.StoreID, localDesc.Capacity.QueriesPerSecond, storeList.candidateQueriesPerSecond.mean, qpsMaxThreshold)

	hottestRanges := sr.replRankings.topQPS()
	for localDesc.Capacity.QueriesPerSecond > qpsMaxThreshold {
		replWithStats, target, considerForRebalance := sr.chooseLeaseToTransfer(
			ctx, &hottestRanges, localDesc, storeList, storeMap, qpsMinThreshold, qpsMaxThreshold)
		replicasToMaybeRebalance = append(replicasToMaybeRebalance, considerForRebalance...)
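		// A nil replica means chooseLeaseToTransfer ran out of leases worth
		// moving; fall through to (possibly) rebalancing whole replicas below.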
		if replWithStats.repl == nil {
			break
		}

		log.VEventf(ctx, 1, "transferring r%d (%.2f qps) to s%d to better balance load",
			replWithStats.repl.RangeID, replWithStats.qps, target.StoreID)
		timeout := sr.rq.processTimeoutFunc(sr.st, replWithStats.repl)
		if err := contextutil.RunWithTimeout(ctx, "transfer lease", timeout, func(ctx context.Context) error {
			return sr.rq.transferLease(ctx, replWithStats.repl, target, replWithStats.qps)
		}); err != nil {
			log.Errorf(ctx, "unable to transfer lease to s%d: %+v", target.StoreID, err)
			continue
		}
		sr.metrics.LeaseTransferCount.Inc(1)

		// Finally, update our local copies of the descriptors so that if
		// additional transfers are needed we'll be making the decisions with more
		// up-to-date info. The StorePool copies are updated by transferLease.
		localDesc.Capacity.LeaseCount--
		localDesc.Capacity.QueriesPerSecond -= replWithStats.qps
		if otherDesc := storeMap[target.StoreID]; otherDesc != nil {
			otherDesc.Capacity.LeaseCount++
			otherDesc.Capacity.QueriesPerSecond += replWithStats.qps
		}
	}

	if !(localDesc.Capacity.QueriesPerSecond > qpsMaxThreshold) {
		log.Infof(ctx,
			"load-based lease transfers successfully brought s%d down to %.2f qps (mean=%.2f, upperThreshold=%.2f)",
			localDesc.StoreID, localDesc.Capacity.QueriesPerSecond, storeList.candidateQueriesPerSecond.mean, qpsMaxThreshold)
		return
	}

	if mode != LBRebalancingLeasesAndReplicas {
		log.Infof(ctx,
			"ran out of leases worth transferring and qps (%.2f) is still above desired threshold (%.2f)",
			localDesc.Capacity.QueriesPerSecond, qpsMaxThreshold)
		return
	}
	log.Infof(ctx,
		"ran out of leases worth transferring and qps (%.2f) is still above desired threshold (%.2f); considering load-based replica rebalances",
		localDesc.Capacity.QueriesPerSecond, qpsMaxThreshold)

	// Re-combine replicasToMaybeRebalance with what remains of hottestRanges so
	// that we'll reconsider them for replica rebalancing.
	replicasToMaybeRebalance = append(replicasToMaybeRebalance, hottestRanges...)

	for localDesc.Capacity.QueriesPerSecond > qpsMaxThreshold {
		replWithStats, targets := sr.chooseReplicaToRebalance(
			ctx,
			&replicasToMaybeRebalance,
			localDesc,
			storeList,
			storeMap,
			qpsMinThreshold,
			qpsMaxThreshold)
		if replWithStats.repl == nil {
			log.Infof(ctx,
				"ran out of replicas worth transferring and qps (%.2f) is still above desired threshold (%.2f); will check again soon",
				localDesc.Capacity.QueriesPerSecond, qpsMaxThreshold)
			return
		}

		descBeforeRebalance := replWithStats.repl.Desc()
		log.VEventf(ctx, 1, "rebalancing r%d (%.2f qps) from %v to %v to better balance load",
			replWithStats.repl.RangeID, replWithStats.qps, descBeforeRebalance.Replicas(), targets)
		timeout := sr.rq.processTimeoutFunc(sr.st, replWithStats.repl)
		if err := contextutil.RunWithTimeout(ctx, "relocate range", timeout, func(ctx context.Context) error {
			return sr.rq.store.AdminRelocateRange(ctx, *descBeforeRebalance, targets)
		}); err != nil {
			log.Errorf(ctx, "unable to relocate range to %v: %+v", targets, err)
			continue
		}
		sr.metrics.RangeRebalanceCount.Inc(1)

		// Finally, update our local copies of the descriptors so that if
		// additional transfers are needed we'll be making the decisions with more
		// up-to-date info.
		//
		// TODO(a-robinson): This just updates the copies used locally by the
		// storeRebalancer. We may also want to update the copies in the StorePool
		// itself.
		replicasBeforeRebalance := descBeforeRebalance.Replicas().All()
		for i := range replicasBeforeRebalance {
			if storeDesc := storeMap[replicasBeforeRebalance[i].StoreID]; storeDesc != nil {
				storeDesc.Capacity.RangeCount--
			}
		}
		localDesc.Capacity.LeaseCount--
		localDesc.Capacity.QueriesPerSecond -= replWithStats.qps
		for i := range targets {
			if storeDesc := storeMap[targets[i].StoreID]; storeDesc != nil {
				storeDesc.Capacity.RangeCount++
				if i == 0 {
					storeDesc.Capacity.LeaseCount++
					storeDesc.Capacity.QueriesPerSecond += replWithStats.qps
				}
			}
		}
	}

	log.Infof(ctx,
		"load-based replica transfers successfully brought s%d down to %.2f qps (mean=%.2f, upperThreshold=%.2f)",
		localDesc.StoreID, localDesc.Capacity.QueriesPerSecond, storeList.candidateQueriesPerSecond.mean, qpsMaxThreshold)
}

// TODO(a-robinson): Should we take the number of leases on each store into
// account here or just continue to let that happen in allocator.go?
func (sr *StoreRebalancer) chooseLeaseToTransfer(
	ctx context.Context,
	hottestRanges *[]replicaWithStats,
	localDesc *roachpb.StoreDescriptor,
	storeList StoreList,
	storeMap map[roachpb.StoreID]*roachpb.StoreDescriptor,
	minQPS float64,
	maxQPS float64,
) (replicaWithStats, roachpb.ReplicaDescriptor, []replicaWithStats) {
	var considerForRebalance []replicaWithStats
	now := sr.rq.store.Clock().Now()
	for {
		if len(*hottestRanges) == 0 {
			return replicaWithStats{}, roachpb.ReplicaDescriptor{}, considerForRebalance
		}
		replWithStats := (*hottestRanges)[0]
		*hottestRanges = (*hottestRanges)[1:]

		// We're all out of replicas.
		if replWithStats.repl == nil {
			return replicaWithStats{}, roachpb.ReplicaDescriptor{}, considerForRebalance
		}

		if shouldNotMoveAway(ctx, replWithStats, localDesc, now, minQPS) {
			continue
		}

		// Don't bother moving leases whose QPS is below some small fraction of the
		// store's QPS (unless the store has extra leases to spare anyway). It's
		// just unnecessary churn with no benefit to move leases responsible for,
		// for example, 1 qps on a store with 5000 qps.
		const minQPSFraction = .001
		if replWithStats.qps < localDesc.Capacity.QueriesPerSecond*minQPSFraction &&
			float64(localDesc.Capacity.LeaseCount) <= storeList.candidateLeases.mean {
			log.VEventf(ctx, 5, "r%d's %.2f qps is too little to matter relative to s%d's %.2f total qps",
				replWithStats.repl.RangeID, replWithStats.qps, localDesc.StoreID, localDesc.Capacity.QueriesPerSecond)
			continue
		}

		desc, zone := replWithStats.repl.DescAndZone()
		log.VEventf(ctx, 3, "considering lease transfer for r%d with %.2f qps",
			desc.RangeID, replWithStats.qps)

		// Check all the other replicas in order of increasing qps. Learner replicas
		// aren't allowed to become the leaseholder or raft leader, so only consider
		// the `Voters` replicas.
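		// Stores that are missing from storeMap sort first below (their QPS
		// reads as zero here); shouldNotMoveTo rejects them later because their
		// descriptors are unknown.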
		candidates := desc.Replicas().DeepCopy().Voters()
		sort.Slice(candidates, func(i, j int) bool {
			var iQPS, jQPS float64
			if desc := storeMap[candidates[i].StoreID]; desc != nil {
				iQPS = desc.Capacity.QueriesPerSecond
			}
			if desc := storeMap[candidates[j].StoreID]; desc != nil {
				jQPS = desc.Capacity.QueriesPerSecond
			}
			return iQPS < jQPS
		})

		var raftStatus *raft.Status

		preferred := sr.rq.allocator.preferredLeaseholders(zone, candidates)
		for _, candidate := range candidates {
			if candidate.StoreID == localDesc.StoreID {
				continue
			}

			meanQPS := storeList.candidateQueriesPerSecond.mean
			if shouldNotMoveTo(ctx, storeMap, replWithStats, candidate.StoreID, meanQPS, minQPS, maxQPS) {
				continue
			}

			if raftStatus == nil {
				raftStatus = sr.getRaftStatusFn(replWithStats.repl)
			}
			if replicaIsBehind(raftStatus, candidate.ReplicaID) {
				log.VEventf(ctx, 3, "%v is behind or this store isn't the raft leader for r%d; raftStatus: %v",
					candidate, desc.RangeID, raftStatus)
				continue
			}

			if len(preferred) > 0 && !storeHasReplica(candidate.StoreID, preferred) {
				log.VEventf(ctx, 3, "s%d not a preferred leaseholder for r%d; preferred: %v",
					candidate.StoreID, desc.RangeID, preferred)
				continue
			}

			filteredStoreList := storeList.filter(zone.Constraints)
			if sr.rq.allocator.followTheWorkloadPrefersLocal(
				ctx,
				filteredStoreList,
				*localDesc,
				candidate.StoreID,
				candidates,
				replWithStats.repl.leaseholderStats,
			) {
				log.VEventf(ctx, 3, "r%d is on s%d due to follow-the-workload; skipping",
					desc.RangeID, localDesc.StoreID)
				continue
			}

			return replWithStats, candidate, considerForRebalance
		}

		// If none of the other replicas are valid lease transfer targets, consider
		// this range for replica rebalancing.
		considerForRebalance = append(considerForRebalance, replWithStats)
	}
}

func (sr *StoreRebalancer) chooseReplicaToRebalance(
	ctx context.Context,
	hottestRanges *[]replicaWithStats,
	localDesc *roachpb.StoreDescriptor,
	storeList StoreList,
	storeMap map[roachpb.StoreID]*roachpb.StoreDescriptor,
	minQPS float64,
	maxQPS float64,
) (replicaWithStats, []roachpb.ReplicationTarget) {
	now := sr.rq.store.Clock().Now()
	for {
		if len(*hottestRanges) == 0 {
			return replicaWithStats{}, nil
		}
		replWithStats := (*hottestRanges)[0]
		*hottestRanges = (*hottestRanges)[1:]

		if replWithStats.repl == nil {
			return replicaWithStats{}, nil
		}

		if shouldNotMoveAway(ctx, replWithStats, localDesc, now, minQPS) {
			continue
		}

		// Don't bother moving ranges whose QPS is below some small fraction of the
		// store's QPS (unless the store has extra ranges to spare anyway). It's
		// just unnecessary churn with no benefit to move ranges responsible for,
		// for example, 1 qps on a store with 5000 qps.
		const minQPSFraction = .001
		if replWithStats.qps < localDesc.Capacity.QueriesPerSecond*minQPSFraction &&
			float64(localDesc.Capacity.RangeCount) <= storeList.candidateRanges.mean {
			log.VEventf(ctx, 5, "r%d's %.2f qps is too little to matter relative to s%d's %.2f total qps",
				replWithStats.repl.RangeID, replWithStats.qps, localDesc.StoreID, localDesc.Capacity.QueriesPerSecond)
			continue
		}

		desc, zone := replWithStats.repl.DescAndZone()
		log.VEventf(ctx, 3, "considering replica rebalance for r%d with %.2f qps",
			desc.RangeID, replWithStats.qps)

		clusterNodes := sr.rq.allocator.storePool.ClusterNodeCount()
		desiredReplicas := GetNeededReplicas(*zone.NumReplicas, clusterNodes)
		targets := make([]roachpb.ReplicationTarget, 0, desiredReplicas)
		targetReplicas := make([]roachpb.ReplicaDescriptor, 0, desiredReplicas)
		currentReplicas := desc.Replicas().All()

		// Check the range's existing diversity score, since we want to ensure we
		// don't hurt locality diversity just to improve QPS.
		curDiversity := rangeDiversityScore(
			sr.rq.allocator.storePool.getLocalities(currentReplicas))

		// Check the existing replicas, keeping around those that aren't overloaded.
		for i := range currentReplicas {
			if currentReplicas[i].StoreID == localDesc.StoreID {
				continue
			}
			// Keep the replica in the range if we don't know its QPS or if its QPS
			// is below the upper threshold. Punishing stores not in our store map
			// could cause mass evictions if the storePool gets out of sync.
			storeDesc, ok := storeMap[currentReplicas[i].StoreID]
			if !ok || storeDesc.Capacity.QueriesPerSecond < maxQPS {
				targets = append(targets, roachpb.ReplicationTarget{
					NodeID:  currentReplicas[i].NodeID,
					StoreID: currentReplicas[i].StoreID,
				})
				targetReplicas = append(targetReplicas, roachpb.ReplicaDescriptor{
					NodeID:  currentReplicas[i].NodeID,
					StoreID: currentReplicas[i].StoreID,
				})
			}
		}

		// Then pick out which new stores to add the remaining replicas to.
		options := sr.rq.allocator.scorerOptions()
		options.qpsRebalanceThreshold = qpsRebalanceThreshold.Get(&sr.st.SV)
		for len(targets) < desiredReplicas {
			// Use the preexisting AllocateTarget logic to ensure that considerations
			// such as zone constraints, locality diversity, and full disk come
			// into play.
			target, _ := sr.rq.allocator.allocateTargetFromList(
				ctx,
				storeList,
				zone,
				targetReplicas,
				options,
			)
			if target == nil {
				log.VEventf(ctx, 3, "no rebalance targets found to replace the current store for r%d",
					desc.RangeID)
				break
			}

			meanQPS := storeList.candidateQueriesPerSecond.mean
			if shouldNotMoveTo(ctx, storeMap, replWithStats, target.StoreID, meanQPS, minQPS, maxQPS) {
				break
			}

			targets = append(targets, roachpb.ReplicationTarget{
				NodeID:  target.Node.NodeID,
				StoreID: target.StoreID,
			})
			targetReplicas = append(targetReplicas, roachpb.ReplicaDescriptor{
				NodeID:  target.Node.NodeID,
				StoreID: target.StoreID,
			})
		}

		// If we couldn't find enough valid targets, forget about this range.
		//
		// TODO(a-robinson): Support more incremental improvements -- move what we
		// can if it makes things better even if it isn't great.
		// For example, moving one of the other existing replicas that's on a
		// store with less qps than the max threshold but above the mean would
		// help in certain locality configurations.
		if len(targets) < desiredReplicas {
			log.VEventf(ctx, 3, "couldn't find enough rebalance targets for r%d (%d/%d)",
				desc.RangeID, len(targets), desiredReplicas)
			continue
		}
		newDiversity := rangeDiversityScore(sr.rq.allocator.storePool.getLocalities(targetReplicas))
		if newDiversity < curDiversity {
			log.VEventf(ctx, 3,
				"new diversity %.2f for r%d worse than current diversity %.2f; not rebalancing",
				newDiversity, desc.RangeID, curDiversity)
			continue
		}

		// Pick the replica with the least QPS to be leaseholder;
		// RelocateRange transfers the lease to the first provided target.
		newLeaseIdx := 0
		newLeaseQPS := math.MaxFloat64
		var raftStatus *raft.Status
		for i := 0; i < len(targets); i++ {
			// Ensure we don't transfer the lease to an existing replica that is
			// behind in processing its raft log.
			if replica, ok := desc.GetReplicaDescriptor(targets[i].StoreID); ok {
				if raftStatus == nil {
					raftStatus = sr.getRaftStatusFn(replWithStats.repl)
				}
				if replicaIsBehind(raftStatus, replica.ReplicaID) {
					continue
				}
			}

			storeDesc, ok := storeMap[targets[i].StoreID]
			if ok && storeDesc.Capacity.QueriesPerSecond < newLeaseQPS {
				newLeaseIdx = i
				newLeaseQPS = storeDesc.Capacity.QueriesPerSecond
			}
		}
		targets[0], targets[newLeaseIdx] = targets[newLeaseIdx], targets[0]
		return replWithStats, targets
	}
}

func shouldNotMoveAway(
	ctx context.Context,
	replWithStats replicaWithStats,
	localDesc *roachpb.StoreDescriptor,
	now hlc.Timestamp,
	minQPS float64,
) bool {
	if !replWithStats.repl.OwnsValidLease(now) {
		log.VEventf(ctx, 3, "store doesn't own the lease for r%d", replWithStats.repl.RangeID)
		return true
	}
	if localDesc.Capacity.QueriesPerSecond-replWithStats.qps < minQPS {
		log.VEventf(ctx, 3, "moving r%d's %.2f qps would bring s%d below the min threshold (%.2f)",
			replWithStats.repl.RangeID, replWithStats.qps, localDesc.StoreID, minQPS)
		return true
	}
	return false
}

func shouldNotMoveTo(
	ctx context.Context,
	storeMap map[roachpb.StoreID]*roachpb.StoreDescriptor,
	replWithStats replicaWithStats,
	candidateStore roachpb.StoreID,
	meanQPS float64,
	minQPS float64,
	maxQPS float64,
) bool {
	storeDesc, ok := storeMap[candidateStore]
	if !ok {
		log.VEventf(ctx, 3, "missing store descriptor for s%d", candidateStore)
		return true
	}

	newCandidateQPS := storeDesc.Capacity.QueriesPerSecond + replWithStats.qps
	if storeDesc.Capacity.QueriesPerSecond < minQPS {
		if newCandidateQPS > maxQPS {
			log.VEventf(ctx, 3,
				"r%d's %.2f qps would push s%d over the max threshold (%.2f) with %.2f qps afterwards",
				replWithStats.repl.RangeID, replWithStats.qps, candidateStore, maxQPS, newCandidateQPS)
			return true
		}
	} else if newCandidateQPS > meanQPS {
		log.VEventf(ctx, 3,
			"r%d's %.2f qps would push s%d over the mean (%.2f) with %.2f qps afterwards",
			replWithStats.repl.RangeID, replWithStats.qps, candidateStore, meanQPS, newCandidateQPS)
		return true
	}

	return false
}

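// To illustrate shouldNotMoveTo with, say, a cluster mean of 1000 qps and
// thresholds of minQPS=750 and maxQPS=1250 (values chosen only for this
// example): an underfull candidate at 700 qps may absorb load up to the max
// threshold, while a candidate already at 900 qps may only take on load that
// keeps it at or below the mean.
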
func storeListToMap(sl StoreList) map[roachpb.StoreID]*roachpb.StoreDescriptor {
	storeMap := make(map[roachpb.StoreID]*roachpb.StoreDescriptor)
	for i := range sl.stores {
		storeMap[sl.stores[i].StoreID] = &sl.stores[i]
	}
	return storeMap
}

// jitteredInterval returns a randomly jittered (+/-25%) duration
// from the given interval.
func jitteredInterval(interval time.Duration) time.Duration {
	return time.Duration(float64(interval) * (0.75 + 0.5*rand.Float64()))
}
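
// qpsThresholdsExample is a small illustrative sketch (hypothetical, not
// referenced elsewhere in this package) of how rebalanceStore derives its
// min/max QPS thresholds: the fractional kv.allocator.qps_rebalance_threshold
// is combined with the absolute minQPSThresholdDifference floor so that
// lightly loaded clusters don't rebalance over trivial differences.
func qpsThresholdsExample(meanQPS, thresholdFraction float64) (minQPS, maxQPS float64) {
	// With meanQPS=1000 and thresholdFraction=0.25 this yields (750, 1250);
	// with meanQPS=200 the absolute floor dominates and yields (100, 300).
	minQPS = math.Min(meanQPS*(1-thresholdFraction), meanQPS-minQPSThresholdDifference)
	maxQPS = math.Max(meanQPS*(1+thresholdFraction), meanQPS+minQPSThresholdDifference)
	return minQPS, maxQPS
}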