github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/allocator.go (about) 1 // Copyright 2014 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package kvserver 12 13 import ( 14 "context" 15 "encoding/json" 16 "fmt" 17 "math" 18 "math/rand" 19 "strings" 20 "time" 21 22 "github.com/cockroachdb/cockroach/pkg/config/zonepb" 23 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/constraint" 24 "github.com/cockroachdb/cockroach/pkg/roachpb" 25 "github.com/cockroachdb/cockroach/pkg/settings" 26 "github.com/cockroachdb/cockroach/pkg/settings/cluster" 27 "github.com/cockroachdb/cockroach/pkg/util/log" 28 "github.com/cockroachdb/cockroach/pkg/util/syncutil" 29 "github.com/cockroachdb/errors" 30 "go.etcd.io/etcd/raft" 31 "go.etcd.io/etcd/raft/tracker" 32 ) 33 34 const ( 35 // leaseRebalanceThreshold is the minimum ratio of a store's lease surplus 36 // to the mean range/lease count that permits lease-transfers away from that 37 // store. 38 leaseRebalanceThreshold = 0.05 39 40 // baseLoadBasedLeaseRebalanceThreshold is the equivalent of 41 // leaseRebalanceThreshold for load-based lease rebalance decisions (i.e. 42 // "follow-the-workload"). It's the base threshold for decisions that get 43 // adjusted based on the load and latency of the involved ranges/nodes. 44 baseLoadBasedLeaseRebalanceThreshold = 2 * leaseRebalanceThreshold 45 46 // minReplicaWeight sets a floor for how low a replica weight can be. This is 47 // needed because a weight of zero doesn't work in the current lease scoring 48 // algorithm. 49 minReplicaWeight = 0.001 50 51 // Priorities for various repair operations. 52 finalizeAtomicReplicationChangePriority float64 = 12002 53 removeLearnerReplicaPriority float64 = 12001 54 addDeadReplacementPriority float64 = 12000 55 addMissingReplicaPriority float64 = 10000 56 addDecommissioningReplacementPriority float64 = 5000 57 removeDeadReplicaPriority float64 = 1000 58 removeDecommissioningReplicaPriority float64 = 200 59 removeExtraReplicaPriority float64 = 100 60 ) 61 62 // MinLeaseTransferStatsDuration configures the minimum amount of time a 63 // replica must wait for stats about request counts to accumulate before 64 // making decisions based on them. The higher this is, the less likely 65 // thrashing is (up to a point). 66 // Made configurable for the sake of testing. 67 var MinLeaseTransferStatsDuration = 30 * time.Second 68 69 // enableLoadBasedLeaseRebalancing controls whether lease rebalancing is done 70 // via the new heuristic based on request load and latency or via the simpler 71 // approach that purely seeks to balance the number of leases per node evenly. 72 var enableLoadBasedLeaseRebalancing = settings.RegisterPublicBoolSetting( 73 "kv.allocator.load_based_lease_rebalancing.enabled", 74 "set to enable rebalancing of range leases based on load and latency", 75 true, 76 ) 77 78 // leaseRebalancingAggressiveness enables users to tweak how aggressive their 79 // cluster is at moving leases towards the localities where the most requests 80 // are coming from. 
Settings lower than 1.0 will make the system less 81 // aggressive about moving leases toward requests than the default, while 82 // settings greater than 1.0 will cause more aggressive placement. 83 // 84 // Setting this to 0 effectively disables load-based lease rebalancing, and 85 // settings less than 0 are disallowed. 86 var leaseRebalancingAggressiveness = settings.RegisterNonNegativeFloatSetting( 87 "kv.allocator.lease_rebalancing_aggressiveness", 88 "set greater than 1.0 to rebalance leases toward load more aggressively, "+ 89 "or between 0 and 1.0 to be more conservative about rebalancing leases", 90 1.0, 91 ) 92 93 // AllocatorAction enumerates the various replication adjustments that may be 94 // recommended by the allocator. 95 type AllocatorAction int 96 97 // These are the possible allocator actions. 98 const ( 99 _ AllocatorAction = iota 100 AllocatorNoop 101 AllocatorRemove 102 AllocatorAdd 103 AllocatorReplaceDead 104 AllocatorRemoveDead 105 AllocatorReplaceDecommissioning 106 AllocatorRemoveDecommissioning 107 AllocatorRemoveLearner 108 AllocatorConsiderRebalance 109 AllocatorRangeUnavailable 110 AllocatorFinalizeAtomicReplicationChange 111 ) 112 113 var allocatorActionNames = map[AllocatorAction]string{ 114 AllocatorNoop: "noop", 115 AllocatorRemove: "remove", 116 AllocatorAdd: "add", 117 AllocatorReplaceDead: "replace dead", 118 AllocatorRemoveDead: "remove dead", 119 AllocatorReplaceDecommissioning: "replace decommissioning", 120 AllocatorRemoveDecommissioning: "remove decommissioning", 121 AllocatorRemoveLearner: "remove learner", 122 AllocatorConsiderRebalance: "consider rebalance", 123 AllocatorRangeUnavailable: "range unavailable", 124 AllocatorFinalizeAtomicReplicationChange: "finalize conf change", 125 } 126 127 func (a AllocatorAction) String() string { 128 return allocatorActionNames[a] 129 } 130 131 type transferDecision int 132 133 const ( 134 _ transferDecision = iota 135 shouldTransfer 136 shouldNotTransfer 137 decideWithoutStats 138 ) 139 140 // allocatorError indicates a retryable error condition which sends replicas 141 // being processed through the replicate_queue into purgatory so that they 142 // can be retried quickly as soon as new stores come online, or additional 143 // space frees up. 
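//
// As a rough illustration of the message format produced by Error() below
// (values assumed): with no constraints configured, no throttled stores, two
// live stores, and a replica already on both of them, the error reads
//
//   0 of 2 live stores are able to take a new replica for the range
//   (2 already have a replica); likely not enough nodes in cluster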
144 type allocatorError struct { 145 constraints []zonepb.ConstraintsConjunction 146 existingReplicas int 147 aliveStores int 148 throttledStores int 149 } 150 151 func (ae *allocatorError) Error() string { 152 var existingReplsStr string 153 if ae.existingReplicas == 1 { 154 existingReplsStr = "1 already has a replica" 155 } else { 156 existingReplsStr = fmt.Sprintf("%d already have a replica", ae.existingReplicas) 157 } 158 159 var baseMsg string 160 if ae.throttledStores != 0 { 161 baseMsg = fmt.Sprintf( 162 "0 of %d live stores are able to take a new replica for the range (%d throttled, %s)", 163 ae.aliveStores, ae.throttledStores, existingReplsStr) 164 } else { 165 baseMsg = fmt.Sprintf( 166 "0 of %d live stores are able to take a new replica for the range (%s)", 167 ae.aliveStores, existingReplsStr) 168 } 169 170 if len(ae.constraints) == 0 { 171 if ae.throttledStores > 0 { 172 return baseMsg 173 } 174 return baseMsg + "; likely not enough nodes in cluster" 175 } 176 var b strings.Builder 177 b.WriteString(baseMsg) 178 b.WriteString("; must match constraints [") 179 for i := range ae.constraints { 180 if i > 0 { 181 b.WriteByte(' ') 182 } 183 b.WriteByte('{') 184 b.WriteString(ae.constraints[i].String()) 185 b.WriteByte('}') 186 } 187 b.WriteString("]") 188 return b.String() 189 } 190 191 func (*allocatorError) purgatoryErrorMarker() {} 192 193 var _ purgatoryError = &allocatorError{} 194 195 // allocatorRand pairs a rand.Rand with a mutex. 196 // NOTE: Allocator is typically only accessed from a single thread (the 197 // replication queue), but this assumption is broken in tests which force 198 // replication scans. If those tests can be modified to suspend the normal 199 // replication queue during the forced scan, then this rand could be used 200 // without a mutex. 201 type allocatorRand struct { 202 *syncutil.Mutex 203 *rand.Rand 204 } 205 206 func makeAllocatorRand(source rand.Source) allocatorRand { 207 return allocatorRand{ 208 Mutex: &syncutil.Mutex{}, 209 Rand: rand.New(source), 210 } 211 } 212 213 // RangeUsageInfo contains usage information (sizes and traffic) needed by the 214 // allocator to make rebalancing decisions for a given range. 215 type RangeUsageInfo struct { 216 LogicalBytes int64 217 QueriesPerSecond float64 218 WritesPerSecond float64 219 } 220 221 func rangeUsageInfoForRepl(repl *Replica) RangeUsageInfo { 222 info := RangeUsageInfo{ 223 LogicalBytes: repl.GetMVCCStats().Total(), 224 } 225 if queriesPerSecond, dur := repl.leaseholderStats.avgQPS(); dur >= MinStatsDuration { 226 info.QueriesPerSecond = queriesPerSecond 227 } 228 if writesPerSecond, dur := repl.writeStats.avgQPS(); dur >= MinStatsDuration { 229 info.WritesPerSecond = writesPerSecond 230 } 231 return info 232 } 233 234 // Allocator tries to spread replicas as evenly as possible across the stores 235 // in the cluster. 236 type Allocator struct { 237 storePool *StorePool 238 nodeLatencyFn func(addr string) (time.Duration, bool) 239 randGen allocatorRand 240 } 241 242 // MakeAllocator creates a new allocator using the specified StorePool. 243 func MakeAllocator( 244 storePool *StorePool, nodeLatencyFn func(addr string) (time.Duration, bool), 245 ) Allocator { 246 var randSource rand.Source 247 // There are a number of test cases that make a test store but don't add 248 // gossip or a store pool. So we can't rely on the existence of the 249 // store pool in those cases.
250 if storePool != nil && storePool.deterministic { 251 randSource = rand.NewSource(777) 252 } else { 253 randSource = rand.NewSource(rand.Int63()) 254 } 255 return Allocator{ 256 storePool: storePool, 257 nodeLatencyFn: nodeLatencyFn, 258 randGen: makeAllocatorRand(randSource), 259 } 260 } 261 262 // GetNeededReplicas calculates the number of replicas a range should 263 // have given its zone config and the number of nodes available for 264 // up-replication (i.e. not dead and not decommissioning). 265 func GetNeededReplicas(zoneConfigReplicaCount int32, clusterNodes int) int { 266 numZoneReplicas := int(zoneConfigReplicaCount) 267 need := numZoneReplicas 268 269 // Adjust the replication factor for all ranges if there are fewer 270 // nodes than replicas specified in the zone config, so the cluster 271 // can still function. 272 if clusterNodes < need { 273 need = clusterNodes 274 } 275 276 // Ensure that we don't up- or down-replicate to an even number of replicas 277 // unless an even number of replicas was specifically requested by the user 278 // in the zone config. 279 // 280 // Note that in the case of 5 desired replicas and a decommissioning store, 281 // this prefers down-replicating from 5 to 3 rather than sticking with 4 282 // desired stores or blocking the decommissioning from completing. 283 if need == numZoneReplicas { 284 return need 285 } 286 if need%2 == 0 { 287 need = need - 1 288 } 289 if need < 3 { 290 need = 3 291 } 292 if need > numZoneReplicas { 293 need = numZoneReplicas 294 } 295 296 return need 297 } 298 299 // ComputeAction determines the exact operation needed to repair the 300 // supplied range, as governed by the supplied zone configuration. It 301 // returns the required action that should be taken and a priority. 302 func (a *Allocator) ComputeAction( 303 ctx context.Context, zone *zonepb.ZoneConfig, desc *roachpb.RangeDescriptor, 304 ) (AllocatorAction, float64) { 305 if a.storePool == nil { 306 // Do nothing if storePool is nil for some unittests. 307 return AllocatorNoop, 0 308 } 309 310 if desc.Replicas().InAtomicReplicationChange() { 311 // With a similar reasoning to the learner branch below, if we're in a 312 // joint configuration the top priority is to leave it before we can 313 // even think about doing anything else. 314 return AllocatorFinalizeAtomicReplicationChange, finalizeAtomicReplicationChangePriority 315 } 316 317 // Seeing a learner replica at this point is unexpected because learners are a 318 // short-lived (ish) transient state in a learner+snapshot+voter cycle, which 319 // is always done atomically. Only two places could have added a learner: the 320 // replicate queue or AdminChangeReplicas request. 321 // 322 // The replicate queue only operates on leaseholders, which means that only 323 // one node at a time is operating on a given range except in rare cases (old 324 // leaseholder could start the operation, and a new leaseholder steps up and 325 // also starts an overlapping operation). Combined with the above atomicity, 326 // this means that if the replicate queue sees a learner, either the node that 327 // was adding it crashed somewhere in the learner+snapshot+voter cycle and 328 // we're the new leaseholder or we caught a race. 329 // 330 // In the first case, we could assume the node that was adding it knew what it 331 // was doing and finish the addition. Or we could leave it and do higher 332 // priority operations first if there are any. 
However, this comes with code 333 // complexity and concept complexity (computing old vs new quorum sizes 334 // becomes ambiguous, the learner isn't in the quorum but it likely will be 335 // soon, so do you count it?). Instead, we do the simplest thing and remove it 336 // before doing any other operations to the range. We'll revisit this decision 337 // if and when the complexity becomes necessary. 338 // 339 // If we get the race where AdminChangeReplicas is adding a replica and the 340 // queue happens to run during the snapshot, this will remove the learner and 341 // AdminChangeReplicas will notice either during the snapshot transfer or when 342 // it tries to promote the learner to a voter. AdminChangeReplicas should 343 // retry. 344 // 345 // On the other hand if we get the race where a leaseholder starts adding a 346 // replica in the replicate queue and during this loses its lease, it should 347 // probably not retry. 348 if learners := desc.Replicas().Learners(); len(learners) > 0 { 349 // TODO(dan): Since this goes before anything else, the priority here should 350 // be influenced by whatever operations would happen right after the learner 351 // is removed. In the meantime, we don't want to block something important 352 // from happening (like addDeadReplacementPriority) by queueing this at a 353 // low priority so until this TODO is done, keep 354 // removeLearnerReplicaPriority as the highest priority. 355 return AllocatorRemoveLearner, removeLearnerReplicaPriority 356 } 357 // computeAction expects to operate only on voters. 358 return a.computeAction(ctx, zone, desc.Replicas().Voters()) 359 } 360 361 func (a *Allocator) computeAction( 362 ctx context.Context, zone *zonepb.ZoneConfig, voterReplicas []roachpb.ReplicaDescriptor, 363 ) (AllocatorAction, float64) { 364 // TODO(mrtracy): Handle non-homogeneous and mismatched attribute sets. 365 have := len(voterReplicas) 366 decommissioningReplicas := a.storePool.decommissioningReplicas(voterReplicas) 367 clusterNodes := a.storePool.ClusterNodeCount() 368 need := GetNeededReplicas(*zone.NumReplicas, clusterNodes) 369 desiredQuorum := computeQuorum(need) 370 quorum := computeQuorum(have) 371 372 if have < need { 373 // Range is under-replicated, and should add an additional replica. 374 // Priority is adjusted by the difference between the current replica 375 // count and the quorum of the desired replica count. 376 priority := addMissingReplicaPriority + float64(desiredQuorum-have) 377 action := AllocatorAdd 378 log.VEventf(ctx, 3, "%s - missing replica need=%d, have=%d, priority=%.2f", 379 action, need, have, priority) 380 return action, priority 381 } 382 383 liveVoterReplicas, deadVoterReplicas := a.storePool.liveAndDeadReplicas(voterReplicas) 384 385 if len(liveVoterReplicas) < quorum { 386 // Do not take any replacement/removal action if we do not have a quorum of live 387 // replicas. If we're correctly assessing the unavailable state of the range, we 388 // also won't be able to add replicas as we try above, but hope springs eternal. 389 log.VEventf(ctx, 1, "unable to take action - live replicas %v don't meet quorum of %d", 390 liveVoterReplicas, quorum) 391 return AllocatorRangeUnavailable, 0 392 } 393 394 if have == need && len(deadVoterReplicas) > 0 { 395 // Range has dead replica(s). We should up-replicate to add another 396 // before removing the dead one.
This can avoid permanent data loss in cases 397 // where the node is only temporarily dead, but we remove it from the range 398 // and lose a second node before we can up-replicate (#25392). 399 // The dead replica(s) will be down-replicated later. 400 priority := addDeadReplacementPriority 401 action := AllocatorReplaceDead 402 log.VEventf(ctx, 3, "%s - replacement for %d dead replicas priority=%.2f", 403 action, len(deadVoterReplicas), priority) 404 return action, priority 405 } 406 407 if have == need && len(decommissioningReplicas) > 0 { 408 // Range has decommissioning replica(s), which should be replaced. 409 priority := addDecommissioningReplacementPriority 410 action := AllocatorReplaceDecommissioning 411 log.VEventf(ctx, 3, "%s - replacement for %d decommissioning replicas priority=%.2f", 412 action, len(decommissioningReplicas), priority) 413 return action, priority 414 } 415 416 // Removal actions follow. 417 // TODO(a-robinson): There's an additional case related to dead replicas that 418 // we should handle above. If there are one or more dead replicas, have < 419 // need, and there are no available stores to up-replicate to, then we should 420 // try to remove the dead replica(s) to get down to an odd number of 421 // replicas. 422 if len(deadVoterReplicas) > 0 { 423 // The range has dead replicas, which should be removed immediately. 424 priority := removeDeadReplicaPriority + float64(quorum-len(liveVoterReplicas)) 425 action := AllocatorRemoveDead 426 log.VEventf(ctx, 3, "%s - dead=%d, live=%d, quorum=%d, priority=%.2f", 427 action, len(deadVoterReplicas), len(liveVoterReplicas), quorum, priority) 428 return action, priority 429 } 430 431 if len(decommissioningReplicas) > 0 { 432 // Range is over-replicated, and has a decommissioning replica which 433 // should be removed. 434 priority := removeDecommissioningReplicaPriority 435 action := AllocatorRemoveDecommissioning 436 log.VEventf(ctx, 3, 437 "%s - need=%d, have=%d, num_decommissioning=%d, priority=%.2f", 438 action, need, have, len(decommissioningReplicas), priority) 439 return action, priority 440 } 441 442 if have > need { 443 // Range is over-replicated, and should remove a replica. 444 // Ranges with an even number of replicas get extra priority because 445 // they have a more fragile quorum. 446 priority := removeExtraReplicaPriority - float64(have%2) 447 action := AllocatorRemove 448 log.VEventf(ctx, 3, "%s - need=%d, have=%d, priority=%.2f", action, need, have, priority) 449 return action, priority 450 } 451 452 // Nothing needs to be done, but we may want to rebalance. 453 return AllocatorConsiderRebalance, 0 454 } 455 456 type decisionDetails struct { 457 Target string 458 Existing string `json:",omitempty"` 459 } 460 461 // AllocateTarget returns a suitable store for a new allocation with the 462 // required attributes. Nodes already accommodating existing replicas are ruled 463 // out as targets. The range ID of the replica being allocated for is also 464 // passed in to ensure that we don't try to replace an existing dead replica on 465 // a store. 466 // 467 // TODO(tbg): AllocateReplacement? 
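//
// A rough usage sketch follows (illustrative only, not the actual replicate
// queue code; zone, desc, and the addReplicaOn helper are assumed): a caller
// typically consults ComputeAction first and only asks for an allocation
// target when an add is required.
//
//   if action, _ := a.ComputeAction(ctx, zone, desc); action == AllocatorAdd {
//     target, details, err := a.AllocateTarget(ctx, zone, desc.Replicas().Voters())
//     if err != nil {
//       return err // an *allocatorError here is retryable via purgatory
//     }
//     _ = details // recorded in the range log by the caller
//     addReplicaOn(target.StoreID) // hypothetical helper
//   }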
468 func (a *Allocator) AllocateTarget( 469 ctx context.Context, zone *zonepb.ZoneConfig, existingReplicas []roachpb.ReplicaDescriptor, 470 ) (*roachpb.StoreDescriptor, string, error) { 471 sl, aliveStoreCount, throttled := a.storePool.getStoreList(storeFilterThrottled) 472 473 target, details := a.allocateTargetFromList( 474 ctx, sl, zone, existingReplicas, a.scorerOptions()) 475 476 if target != nil { 477 return target, details, nil 478 } 479 480 // When there are throttled stores that do match, we shouldn't send 481 // the replica to purgatory. 482 if len(throttled) > 0 { 483 return nil, "", errors.Errorf( 484 "%d matching stores are currently throttled: %v", len(throttled), throttled, 485 ) 486 } 487 return nil, "", &allocatorError{ 488 constraints: zone.Constraints, 489 existingReplicas: len(existingReplicas), 490 aliveStores: aliveStoreCount, 491 throttledStores: len(throttled), 492 } 493 } 494 495 func (a *Allocator) allocateTargetFromList( 496 ctx context.Context, 497 sl StoreList, 498 zone *zonepb.ZoneConfig, 499 candidateReplicas []roachpb.ReplicaDescriptor, 500 options scorerOptions, 501 ) (*roachpb.StoreDescriptor, string) { 502 analyzedConstraints := constraint.AnalyzeConstraints( 503 ctx, a.storePool.getStoreDescriptor, candidateReplicas, zone) 504 candidates := allocateCandidates( 505 sl, analyzedConstraints, candidateReplicas, a.storePool.getLocalities(candidateReplicas), 506 options, 507 ) 508 log.VEventf(ctx, 3, "allocate candidates: %s", candidates) 509 if target := candidates.selectGood(a.randGen); target != nil { 510 log.VEventf(ctx, 3, "add target: %s", target) 511 details := decisionDetails{Target: target.compactString(options)} 512 detailsBytes, err := json.Marshal(details) 513 if err != nil { 514 log.Warningf(ctx, "failed to marshal details for choosing allocate target: %+v", err) 515 } 516 return &target.store, string(detailsBytes) 517 } 518 519 return nil, "" 520 } 521 522 func (a Allocator) simulateRemoveTarget( 523 ctx context.Context, 524 targetStore roachpb.StoreID, 525 zone *zonepb.ZoneConfig, 526 candidates []roachpb.ReplicaDescriptor, 527 existingReplicas []roachpb.ReplicaDescriptor, 528 rangeUsageInfo RangeUsageInfo, 529 ) (roachpb.ReplicaDescriptor, string, error) { 530 // Update statistics first 531 // TODO(a-robinson): This could theoretically interfere with decisions made by other goroutines, 532 // but as of October 2017 calls to the Allocator are mostly serialized by the ReplicateQueue 533 // (with the main exceptions being Scatter and the status server's allocator debug endpoint). 534 // Try to make this interfere less with other callers. 535 a.storePool.updateLocalStoreAfterRebalance(targetStore, rangeUsageInfo, roachpb.ADD_REPLICA) 536 defer func() { 537 a.storePool.updateLocalStoreAfterRebalance(targetStore, rangeUsageInfo, roachpb.REMOVE_REPLICA) 538 }() 539 log.VEventf(ctx, 3, "simulating which replica would be removed after adding s%d", targetStore) 540 return a.RemoveTarget(ctx, zone, candidates, existingReplicas) 541 } 542 543 // RemoveTarget returns a suitable replica to remove from the provided replica 544 // set. It first attempts to randomly select a target from the set of stores 545 // that have greater than the average number of replicas. Failing that, it 546 // falls back to selecting a random target from any of the existing 547 // replicas. 
548 func (a Allocator) RemoveTarget( 549 ctx context.Context, 550 zone *zonepb.ZoneConfig, 551 candidates []roachpb.ReplicaDescriptor, 552 existingReplicas []roachpb.ReplicaDescriptor, 553 ) (roachpb.ReplicaDescriptor, string, error) { 554 if len(candidates) == 0 { 555 return roachpb.ReplicaDescriptor{}, "", errors.Errorf("must supply at least one candidate replica to allocator.RemoveTarget()") 556 } 557 558 // Retrieve store descriptors for the provided candidates from the StorePool. 559 existingStoreIDs := make(roachpb.StoreIDSlice, len(candidates)) 560 for i, exist := range candidates { 561 existingStoreIDs[i] = exist.StoreID 562 } 563 sl, _, _ := a.storePool.getStoreListFromIDs(existingStoreIDs, storeFilterNone) 564 565 analyzedConstraints := constraint.AnalyzeConstraints( 566 ctx, a.storePool.getStoreDescriptor, existingReplicas, zone) 567 options := a.scorerOptions() 568 rankedCandidates := removeCandidates( 569 sl, 570 analyzedConstraints, 571 a.storePool.getLocalities(existingReplicas), 572 options, 573 ) 574 log.VEventf(ctx, 3, "remove candidates: %s", rankedCandidates) 575 if bad := rankedCandidates.selectBad(a.randGen); bad != nil { 576 for _, exist := range existingReplicas { 577 if exist.StoreID == bad.store.StoreID { 578 log.VEventf(ctx, 3, "remove target: %s", bad) 579 details := decisionDetails{Target: bad.compactString(options)} 580 detailsBytes, err := json.Marshal(details) 581 if err != nil { 582 log.Warningf(ctx, "failed to marshal details for choosing remove target: %+v", err) 583 } 584 return exist, string(detailsBytes), nil 585 } 586 } 587 } 588 589 return roachpb.ReplicaDescriptor{}, "", errors.New("could not select an appropriate replica to be removed") 590 } 591 592 // RebalanceTarget returns a suitable store for a rebalance target with 593 // required attributes. Rebalance targets are selected via the same mechanism 594 // as AllocateTarget(), except the chosen target must follow some additional 595 // criteria. Namely, if chosen, it must further the goal of balancing the 596 // cluster. 597 // 598 // The supplied parameters are the required attributes for the range and 599 // information about the range being considered for rebalancing. 600 // 601 // The existing replicas modulo any store with dead replicas are candidates for 602 // rebalancing. Note that rebalancing is accomplished by first adding a new 603 // replica to the range, then removing the most undesirable replica. 604 // 605 // Simply ignoring a rebalance opportunity in the event that the target chosen 606 // by AllocateTarget() doesn't fit balancing criteria is perfectly fine, as 607 // other stores in the cluster will also be doing their probabilistic best to 608 // rebalance. This helps prevent a stampeding herd targeting an abnormally 609 // under-utilized store. 610 // 611 // The return values are, in order: 612 // 613 // 1. The target on which to add a new replica, 614 // 2. An existing replica to remove, 615 // 3. a JSON string for use in the range log, and 616 // 4. a boolean indicating whether 1-3 were populated (i.e. whether a rebalance 617 // opportunity was found).
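//
// A rough calling sketch (illustrative only, not the actual replicate queue
// code; zone, raftStatus, existingReplicas, and rangeUsageInfo are assumed to
// be supplied by the caller):
//
//   add, remove, details, ok := a.RebalanceTarget(
//     ctx, zone, raftStatus, existingReplicas, rangeUsageInfo, storeFilterThrottled)
//   if ok {
//     // Add a replica on `add`, then remove the replica on `remove`, and
//     // persist `details` to system.rangelog.
//   }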
618 func (a Allocator) RebalanceTarget( 619 ctx context.Context, 620 zone *zonepb.ZoneConfig, 621 raftStatus *raft.Status, 622 existingReplicas []roachpb.ReplicaDescriptor, 623 rangeUsageInfo RangeUsageInfo, 624 filter storeFilter, 625 ) (add roachpb.ReplicationTarget, remove roachpb.ReplicationTarget, details string, ok bool) { 626 sl, _, _ := a.storePool.getStoreList(filter) 627 628 zero := roachpb.ReplicationTarget{} 629 630 // We're going to add another replica to the range which will change the 631 // quorum size. Verify that the number of existing live replicas is sufficient 632 // to meet the new quorum. For a range configured for 3 replicas, this will 633 // disable rebalancing if one of the replicas is on a down node. Instead, 634 // we'll have to wait for the down node to be declared dead and go through the 635 // dead-node removal dance: remove dead replica, add new replica. 636 // 637 // NB: The len(replicas) > 1 check allows rebalancing of ranges with only a 638 // single replica. This is a corner case which could happen in practice and 639 // also affects tests. 640 if len(existingReplicas) > 1 { 641 var numLiveReplicas int 642 for _, s := range sl.stores { 643 for _, repl := range existingReplicas { 644 if s.StoreID == repl.StoreID { 645 numLiveReplicas++ 646 break 647 } 648 } 649 } 650 newQuorum := computeQuorum(len(existingReplicas) + 1) 651 if numLiveReplicas < newQuorum { 652 // Don't rebalance as we won't be able to make quorum after the rebalance 653 // until the new replica has been caught up. 654 return zero, zero, "", false 655 } 656 } 657 658 analyzedConstraints := constraint.AnalyzeConstraints( 659 ctx, a.storePool.getStoreDescriptor, existingReplicas, zone) 660 options := a.scorerOptions() 661 results := rebalanceCandidates( 662 ctx, 663 sl, 664 analyzedConstraints, 665 existingReplicas, 666 a.storePool.getLocalities(existingReplicas), 667 a.storePool.getNodeLocalityString, 668 options, 669 ) 670 671 if len(results) == 0 { 672 return zero, zero, "", false 673 } 674 // Keep looping until we either run out of options or find a target that we're 675 // pretty sure we won't want to remove immediately after adding it. 676 // If we would, we don't want to actually rebalance to that target. 677 var target *candidate 678 var removeReplica roachpb.ReplicaDescriptor 679 var existingCandidates candidateList 680 for { 681 target, existingCandidates = bestRebalanceTarget(a.randGen, results) 682 if target == nil { 683 return zero, zero, "", false 684 } 685 686 // Add a fake new replica to our copy of the range descriptor so that we can 687 // simulate the removal logic. If we decide not to go with this target, note 688 // that this needs to be removed from desc before we try any other target. 689 newReplica := roachpb.ReplicaDescriptor{ 690 NodeID: target.store.Node.NodeID, 691 StoreID: target.store.StoreID, 692 ReplicaID: maxReplicaID(existingReplicas) + 1, 693 } 694 // Deep-copy the Replicas slice since we'll mutate it below. 695 existingPlusOneNew := append([]roachpb.ReplicaDescriptor(nil), existingReplicas...) 696 existingPlusOneNew = append(existingPlusOneNew, newReplica) 697 replicaCandidates := existingPlusOneNew 698 // If we can, filter replicas as we would if we were actually removing one. 699 // If we can't (e.g. because we're the leaseholder but not the raft leader), 700 // it's better to simulate the removal with the info that we do have than to 701 // assume that the rebalance is ok (#20241). 
702 if raftStatus != nil && raftStatus.Progress != nil { 703 replicaCandidates = simulateFilterUnremovableReplicas( 704 ctx, raftStatus, replicaCandidates, newReplica.ReplicaID) 705 } 706 if len(replicaCandidates) == 0 { 707 // No existing replicas are suitable to remove. 708 log.VEventf(ctx, 2, "not rebalancing to s%d because there are no existing "+ 709 "replicas that can be removed", target.store.StoreID) 710 return zero, zero, "", false 711 } 712 713 var removeDetails string 714 var err error 715 removeReplica, removeDetails, err = a.simulateRemoveTarget( 716 ctx, 717 target.store.StoreID, 718 zone, 719 replicaCandidates, 720 existingPlusOneNew, 721 rangeUsageInfo, 722 ) 723 if err != nil { 724 log.Warningf(ctx, "simulating RemoveTarget failed: %+v", err) 725 return zero, zero, "", false 726 } 727 if target.store.StoreID != removeReplica.StoreID { 728 // Successfully populated these variables 729 _, _ = target, removeReplica 730 break 731 } 732 733 log.VEventf(ctx, 2, "not rebalancing to s%d because we'd immediately remove it: %s", 734 target.store.StoreID, removeDetails) 735 } 736 737 // Compile the details entry that will be persisted into system.rangelog for 738 // debugging/auditability purposes. 739 dDetails := decisionDetails{ 740 Target: target.compactString(options), 741 Existing: existingCandidates.compactString(options), 742 } 743 detailsBytes, err := json.Marshal(dDetails) 744 if err != nil { 745 log.Warningf(ctx, "failed to marshal details for choosing rebalance target: %+v", err) 746 } 747 748 addTarget := roachpb.ReplicationTarget{ 749 NodeID: target.store.Node.NodeID, 750 StoreID: target.store.StoreID, 751 } 752 removeTarget := roachpb.ReplicationTarget{ 753 NodeID: removeReplica.NodeID, 754 StoreID: removeReplica.StoreID, 755 } 756 return addTarget, removeTarget, string(detailsBytes), true 757 } 758 759 func (a *Allocator) scorerOptions() scorerOptions { 760 return scorerOptions{ 761 deterministic: a.storePool.deterministic, 762 rangeRebalanceThreshold: rangeRebalanceThreshold.Get(&a.storePool.st.SV), 763 } 764 } 765 766 // TransferLeaseTarget returns a suitable replica to transfer the range lease 767 // to from the provided list. It excludes the current lease holder replica 768 // unless asked to do otherwise by the checkTransferLeaseSource parameter. 769 func (a *Allocator) TransferLeaseTarget( 770 ctx context.Context, 771 zone *zonepb.ZoneConfig, 772 existing []roachpb.ReplicaDescriptor, 773 leaseStoreID roachpb.StoreID, 774 stats *replicaStats, 775 checkTransferLeaseSource bool, 776 checkCandidateFullness bool, 777 alwaysAllowDecisionWithoutStats bool, 778 ) roachpb.ReplicaDescriptor { 779 sl, _, _ := a.storePool.getStoreList(storeFilterNone) 780 sl = sl.filter(zone.Constraints) 781 782 // Filter stores that are on nodes containing existing replicas, but leave 783 // the stores containing the existing replicas in place. This excludes stores 784 // that we can't rebalance to, avoiding an issue in a 3-node cluster where 785 // there are multiple stores per node. 786 // 787 // TODO(peter,bram): This will need adjustment with the new allocator. `sl` 788 // needs to contain only the possible rebalance candidates + the existing 789 // stores the replicas are on. 
790 filteredDescs := make([]roachpb.StoreDescriptor, 0, len(sl.stores)) 791 for _, s := range sl.stores { 792 var exclude bool 793 for _, r := range existing { 794 if r.NodeID == s.Node.NodeID && r.StoreID != s.StoreID { 795 exclude = true 796 break 797 } 798 } 799 if !exclude { 800 filteredDescs = append(filteredDescs, s) 801 } 802 } 803 sl = makeStoreList(filteredDescs) 804 805 source, ok := a.storePool.getStoreDescriptor(leaseStoreID) 806 if !ok { 807 return roachpb.ReplicaDescriptor{} 808 } 809 810 // Determine which store(s) is preferred based on user-specified preferences. 811 // If any stores match, only consider those stores as candidates. If only one 812 // store matches, it's where the lease should be (unless the preferred store 813 // is the current one and checkTransferLeaseSource is false). 814 var preferred []roachpb.ReplicaDescriptor 815 if checkTransferLeaseSource { 816 preferred = a.preferredLeaseholders(zone, existing) 817 } else { 818 // TODO(a-robinson): Should we just always remove the source store from 819 // existing when checkTransferLeaseSource is false? I'd do it now, but 820 // it's too big a change to make right before a major release. 821 var candidates []roachpb.ReplicaDescriptor 822 for _, repl := range existing { 823 if repl.StoreID != leaseStoreID { 824 candidates = append(candidates, repl) 825 } 826 } 827 preferred = a.preferredLeaseholders(zone, candidates) 828 } 829 if len(preferred) == 1 { 830 if preferred[0].StoreID == leaseStoreID { 831 return roachpb.ReplicaDescriptor{} 832 } 833 return preferred[0] 834 } else if len(preferred) > 1 { 835 // If the current leaseholder is not preferred, set checkTransferLeaseSource 836 // to false to motivate the below logic to transfer the lease. 837 existing = preferred 838 if !storeHasReplica(leaseStoreID, preferred) { 839 checkTransferLeaseSource = false 840 } 841 } 842 843 // Only consider live, non-draining replicas. 844 existing, _ = a.storePool.liveAndDeadReplicas(existing) 845 846 // Short-circuit if there are no valid targets out there. 847 if len(existing) == 0 || (len(existing) == 1 && existing[0].StoreID == leaseStoreID) { 848 log.VEventf(ctx, 2, "no lease transfer target found") 849 return roachpb.ReplicaDescriptor{} 850 } 851 852 // Try to pick a replica to transfer the lease to while also determining 853 // whether we actually should be transferring the lease. The transfer 854 // decision is only needed if we've been asked to check the source. 855 transferDec, repl := a.shouldTransferLeaseUsingStats( 856 ctx, sl, source, existing, stats, nil, 857 ) 858 if checkTransferLeaseSource { 859 switch transferDec { 860 case shouldNotTransfer: 861 if !alwaysAllowDecisionWithoutStats { 862 return roachpb.ReplicaDescriptor{} 863 } 864 fallthrough 865 case decideWithoutStats: 866 if !a.shouldTransferLeaseWithoutStats(ctx, sl, source, existing) { 867 return roachpb.ReplicaDescriptor{} 868 } 869 case shouldTransfer: 870 default: 871 log.Fatalf(ctx, "unexpected transfer decision %d with replica %+v", transferDec, repl) 872 } 873 } 874 875 if repl != (roachpb.ReplicaDescriptor{}) { 876 return repl 877 } 878 879 // Fall back to logic that doesn't take request counts and latency into 880 // account if the counts/latency-based logic couldn't pick a best replica. 
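	// As a rough illustration of the fallback below (numbers assumed): with
	// checkCandidateFullness set and a mean of 20 leases per candidate store,
	// any existing replica whose store holds fewer than 19.5 leases becomes a
	// candidate; if none qualifies, bestOption tracks the least-loaded of the
	// remaining stores and is returned only when checkTransferLeaseSource is
	// false.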
881 candidates := make([]roachpb.ReplicaDescriptor, 0, len(existing)) 882 var bestOption roachpb.ReplicaDescriptor 883 bestOptionLeaseCount := int32(math.MaxInt32) 884 for _, repl := range existing { 885 if leaseStoreID == repl.StoreID { 886 continue 887 } 888 storeDesc, ok := a.storePool.getStoreDescriptor(repl.StoreID) 889 if !ok { 890 continue 891 } 892 if !checkCandidateFullness || float64(storeDesc.Capacity.LeaseCount) < sl.candidateLeases.mean-0.5 { 893 candidates = append(candidates, repl) 894 } else if storeDesc.Capacity.LeaseCount < bestOptionLeaseCount { 895 bestOption = repl 896 bestOptionLeaseCount = storeDesc.Capacity.LeaseCount 897 } 898 } 899 if len(candidates) == 0 { 900 // If we aren't supposed to be considering the current leaseholder (e.g. 901 // because we need to remove this replica for some reason), return 902 // our best option if we otherwise wouldn't want to do anything. 903 if !checkTransferLeaseSource { 904 return bestOption 905 } 906 return roachpb.ReplicaDescriptor{} 907 } 908 a.randGen.Lock() 909 defer a.randGen.Unlock() 910 return candidates[a.randGen.Intn(len(candidates))] 911 } 912 913 // ShouldTransferLease returns true if the specified store is overfull in terms 914 // of leases with respect to the other stores matching the specified 915 // attributes. 916 func (a *Allocator) ShouldTransferLease( 917 ctx context.Context, 918 zone *zonepb.ZoneConfig, 919 existing []roachpb.ReplicaDescriptor, 920 leaseStoreID roachpb.StoreID, 921 stats *replicaStats, 922 ) bool { 923 source, ok := a.storePool.getStoreDescriptor(leaseStoreID) 924 if !ok { 925 return false 926 } 927 928 // Determine which store(s) is preferred based on user-specified preferences. 929 // If any stores match, only consider those stores as options. If only one 930 // store matches, it's where the lease should be. 931 preferred := a.preferredLeaseholders(zone, existing) 932 if len(preferred) == 1 { 933 return preferred[0].StoreID != leaseStoreID 934 } else if len(preferred) > 1 { 935 existing = preferred 936 // If the current leaseholder isn't one of the preferred stores, then we 937 // should try to transfer the lease. 938 if !storeHasReplica(leaseStoreID, existing) { 939 return true 940 } 941 } 942 943 sl, _, _ := a.storePool.getStoreList(storeFilterNone) 944 sl = sl.filter(zone.Constraints) 945 log.VEventf(ctx, 3, "ShouldTransferLease (lease-holder=%d):\n%s", leaseStoreID, sl) 946 947 // Only consider live, non-draining replicas. 948 existing, _ = a.storePool.liveAndDeadReplicas(existing) 949 950 // Short-circuit if there are no valid targets out there. 
951 if len(existing) == 0 || (len(existing) == 1 && existing[0].StoreID == source.StoreID) { 952 return false 953 } 954 955 transferDec, _ := a.shouldTransferLeaseUsingStats(ctx, sl, source, existing, stats, nil) 956 var result bool 957 switch transferDec { 958 case shouldNotTransfer: 959 result = false 960 case shouldTransfer: 961 result = true 962 case decideWithoutStats: 963 result = a.shouldTransferLeaseWithoutStats(ctx, sl, source, existing) 964 default: 965 log.Fatalf(ctx, "unexpected transfer decision %d", transferDec) 966 } 967 968 log.VEventf(ctx, 3, "ShouldTransferLease decision (lease-holder=%d): %t", leaseStoreID, result) 969 return result 970 } 971 972 func (a Allocator) followTheWorkloadPrefersLocal( 973 ctx context.Context, 974 sl StoreList, 975 source roachpb.StoreDescriptor, 976 candidate roachpb.StoreID, 977 existing []roachpb.ReplicaDescriptor, 978 stats *replicaStats, 979 ) bool { 980 adjustments := make(map[roachpb.StoreID]float64) 981 decision, _ := a.shouldTransferLeaseUsingStats(ctx, sl, source, existing, stats, adjustments) 982 if decision == decideWithoutStats { 983 return false 984 } 985 adjustment := adjustments[candidate] 986 if adjustment > baseLoadBasedLeaseRebalanceThreshold { 987 log.VEventf(ctx, 3, 988 "s%d is a better fit than s%d due to follow-the-workload (score: %.2f; threshold: %.2f)", 989 source.StoreID, candidate, adjustment, baseLoadBasedLeaseRebalanceThreshold) 990 return true 991 } 992 return false 993 } 994 995 func (a Allocator) shouldTransferLeaseUsingStats( 996 ctx context.Context, 997 sl StoreList, 998 source roachpb.StoreDescriptor, 999 existing []roachpb.ReplicaDescriptor, 1000 stats *replicaStats, 1001 rebalanceAdjustments map[roachpb.StoreID]float64, 1002 ) (transferDecision, roachpb.ReplicaDescriptor) { 1003 // Only use load-based rebalancing if it's enabled and we have both 1004 // stats and locality information to base our decision on. 1005 if stats == nil || !enableLoadBasedLeaseRebalancing.Get(&a.storePool.st.SV) { 1006 return decideWithoutStats, roachpb.ReplicaDescriptor{} 1007 } 1008 replicaLocalities := a.storePool.getLocalities(existing) 1009 for _, locality := range replicaLocalities { 1010 if len(locality.Tiers) == 0 { 1011 return decideWithoutStats, roachpb.ReplicaDescriptor{} 1012 } 1013 } 1014 1015 qpsStats, qpsStatsDur := stats.perLocalityDecayingQPS() 1016 1017 // If we haven't yet accumulated enough data, avoid transferring for now, 1018 // unless we've been explicitly asked otherwise. Do not fall back to the 1019 // algorithm that doesn't use stats, since it can easily start fighting with 1020 // the stats-based algorithm. This provides some amount of safety from lease 1021 // thrashing, since leases cannot transfer more frequently than this threshold 1022 // (because replica stats get reset upon lease transfer). 1023 if qpsStatsDur < MinLeaseTransferStatsDuration { 1024 return shouldNotTransfer, roachpb.ReplicaDescriptor{} 1025 } 1026 1027 // On the other hand, if we don't have any stats with associated localities, 1028 // then do fall back to the algorithm that doesn't use request stats. 
1029 delete(qpsStats, "") 1030 if len(qpsStats) == 0 { 1031 return decideWithoutStats, roachpb.ReplicaDescriptor{} 1032 } 1033 1034 replicaWeights := make(map[roachpb.NodeID]float64) 1035 for requestLocalityStr, qps := range qpsStats { 1036 var requestLocality roachpb.Locality 1037 if err := requestLocality.Set(requestLocalityStr); err != nil { 1038 log.Errorf(ctx, "unable to parse locality string %q: %+v", requestLocalityStr, err) 1039 continue 1040 } 1041 for nodeID, replicaLocality := range replicaLocalities { 1042 // Add weights to each replica based on the number of requests from 1043 // that replica's locality and neighboring localities. 1044 replicaWeights[nodeID] += (1 - replicaLocality.DiversityScore(requestLocality)) * qps 1045 } 1046 } 1047 1048 log.VEventf(ctx, 1, 1049 "shouldTransferLease qpsStats: %+v, replicaLocalities: %+v, replicaWeights: %+v", 1050 qpsStats, replicaLocalities, replicaWeights) 1051 sourceWeight := math.Max(minReplicaWeight, replicaWeights[source.Node.NodeID]) 1052 1053 // TODO(a-robinson): This may not have enough protection against all leases 1054 // ending up on a single node in extreme cases. Continue testing against 1055 // different situations. 1056 var bestRepl roachpb.ReplicaDescriptor 1057 bestReplScore := int32(math.MinInt32) 1058 for _, repl := range existing { 1059 if repl.NodeID == source.Node.NodeID { 1060 continue 1061 } 1062 storeDesc, ok := a.storePool.getStoreDescriptor(repl.StoreID) 1063 if !ok { 1064 continue 1065 } 1066 addr, err := a.storePool.gossip.GetNodeIDAddress(repl.NodeID) 1067 if err != nil { 1068 log.Errorf(ctx, "missing address for n%d: %+v", repl.NodeID, err) 1069 continue 1070 } 1071 remoteLatency, ok := a.nodeLatencyFn(addr.String()) 1072 if !ok { 1073 continue 1074 } 1075 1076 remoteWeight := math.Max(minReplicaWeight, replicaWeights[repl.NodeID]) 1077 replScore, rebalanceAdjustment := loadBasedLeaseRebalanceScore( 1078 ctx, a.storePool.st, remoteWeight, remoteLatency, storeDesc, sourceWeight, source, sl.candidateLeases.mean) 1079 if replScore > bestReplScore { 1080 bestReplScore = replScore 1081 bestRepl = repl 1082 } 1083 if rebalanceAdjustments != nil { 1084 rebalanceAdjustments[repl.StoreID] = rebalanceAdjustment 1085 } 1086 } 1087 1088 if bestReplScore > 0 { 1089 return shouldTransfer, bestRepl 1090 } 1091 1092 // Return the best replica even in cases where transferring is not advised in 1093 // order to support forced lease transfers, such as when removing a replica or 1094 // draining all leases before shutdown. 1095 return shouldNotTransfer, bestRepl 1096 } 1097 1098 // loadBasedLeaseRebalanceScore attempts to give a score to how desirable it 1099 // would be to transfer a range lease from the local store to a remote store. 1100 // It does so using a formula based on the latency between the stores and 1101 // a number that we call the "weight" of each replica, which represents how 1102 // many requests for the range have been coming from localities near the 1103 // replica. 1104 // 1105 // The overarching goal is to move leases towards where requests are coming 1106 // from when the latency between localities is high, because the leaseholder 1107 // being near the request gateway makes for lower request latencies. 1108 // This must be balanced against hurting throughput by putting too many leases 1109 // on just a few nodes, though, which is why we get progressively more 1110 // aggressive about moving the leases toward requests when latencies are high.
1111 // 1112 // The calculations below were determined via a bunch of manual testing (see 1113 // #13232 or the leaseholder_locality.md RFC for more details), but the general 1114 // logic behind each part of the formula is as follows: 1115 // 1116 // * LeaseRebalancingAggressiveness: Allow the aggressiveness to be tuned via 1117 // a cluster setting. 1118 // * 0.1: Constant factor to reduce aggressiveness by default 1119 // * math.Log10(remoteWeight/sourceWeight): Comparison of the remote replica's 1120 // weight to the local replica's weight. Taking the log of the ratio instead 1121 // of using the ratio directly makes things symmetric -- i.e. r1 comparing 1122 // itself to r2 will come to the same conclusion as r2 comparing itself to r1. 1123 // * math.Log1p(remoteLatencyMillis): This will be 0 if there's no latency, 1124 // removing the weight/latency factor from consideration. Otherwise, it grows 1125 // the aggressiveness for stores that are farther apart. Note that Log1p grows 1126 // faster than Log10 as its argument gets larger, which is intentional to 1127 // increase the importance of latency. 1128 // * overfullScore and underfullScore: rebalanceThreshold helps us get an idea 1129 // of the ideal number of leases on each store. We then calculate these to 1130 // compare how close each node is to its ideal state and use the differences 1131 // from the ideal state on each node to compute a final score. 1132 // 1133 // Returns a total score for the replica that takes into account the number of 1134 // leases already on each store. Also returns the raw "adjustment" value that's 1135 // purely based on replica weights and latency in order for the caller to 1136 // determine how large a role the user's workload played in the decision. The 1137 // adjustment value is positive if the remote store is preferred for load-based 1138 // reasons or negative if the local store is preferred. The magnitude depends 1139 // on the difference in load and the latency between the nodes. 1140 // 1141 // TODO(a-robinson): Should this be changed to avoid even thinking about lease 1142 // counts now that we try to spread leases and replicas based on QPS? As is it 1143 // may fight back a little bit against store-level QPS-based rebalancing. 1144 func loadBasedLeaseRebalanceScore( 1145 ctx context.Context, 1146 st *cluster.Settings, 1147 remoteWeight float64, 1148 remoteLatency time.Duration, 1149 remoteStore roachpb.StoreDescriptor, 1150 sourceWeight float64, 1151 source roachpb.StoreDescriptor, 1152 meanLeases float64, 1153 ) (int32, float64) { 1154 remoteLatencyMillis := float64(remoteLatency) / float64(time.Millisecond) 1155 rebalanceAdjustment := 1156 leaseRebalancingAggressiveness.Get(&st.SV) * 0.1 * math.Log10(remoteWeight/sourceWeight) * math.Log1p(remoteLatencyMillis) 1157 // Start with twice the base rebalance threshold in order to fight more 1158 // strongly against thrashing caused by small variances in the distribution 1159 // of request weights. 
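	// As a rough worked example (assumed numbers, default aggressiveness of
	// 1.0): with remoteWeight/sourceWeight = 10 and remoteLatencyMillis = 10,
	// rebalanceAdjustment = 0.1 * log10(10) * log1p(10), which is roughly 0.24,
	// so the threshold computed below becomes 0.10 - 0.24 = -0.14 and the
	// source is treated as overfull even when it holds slightly fewer leases
	// than the mean.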
1160 rebalanceThreshold := baseLoadBasedLeaseRebalanceThreshold - rebalanceAdjustment 1161 1162 overfullLeaseThreshold := int32(math.Ceil(meanLeases * (1 + rebalanceThreshold))) 1163 overfullScore := source.Capacity.LeaseCount - overfullLeaseThreshold 1164 underfullLeaseThreshold := int32(math.Floor(meanLeases * (1 - rebalanceThreshold))) 1165 underfullScore := underfullLeaseThreshold - remoteStore.Capacity.LeaseCount 1166 totalScore := overfullScore + underfullScore 1167 1168 log.VEventf(ctx, 1, 1169 "node: %d, sourceWeight: %.2f, remoteWeight: %.2f, remoteLatency: %v, "+ 1170 "rebalanceThreshold: %.2f, meanLeases: %.2f, sourceLeaseCount: %d, overfullThreshold: %d, "+ 1171 "remoteLeaseCount: %d, underfullThreshold: %d, totalScore: %d", 1172 remoteStore.Node.NodeID, sourceWeight, remoteWeight, remoteLatency, 1173 rebalanceThreshold, meanLeases, source.Capacity.LeaseCount, overfullLeaseThreshold, 1174 remoteStore.Capacity.LeaseCount, underfullLeaseThreshold, totalScore, 1175 ) 1176 return totalScore, rebalanceAdjustment 1177 } 1178 1179 func (a Allocator) shouldTransferLeaseWithoutStats( 1180 ctx context.Context, 1181 sl StoreList, 1182 source roachpb.StoreDescriptor, 1183 existing []roachpb.ReplicaDescriptor, 1184 ) bool { 1185 // TODO(a-robinson): Should we disable this behavior when load-based lease 1186 // rebalancing is enabled? In happy cases it's nice to keep this working 1187 // to even out the number of leases in addition to the number of replicas, 1188 // but it's certainly a blunt instrument that could undo what we want. 1189 1190 // Allow lease transfer if we're above the overfull threshold, which is 1191 // mean*(1+leaseRebalanceThreshold). 1192 overfullLeaseThreshold := int32(math.Ceil(sl.candidateLeases.mean * (1 + leaseRebalanceThreshold))) 1193 minOverfullThreshold := int32(math.Ceil(sl.candidateLeases.mean + 5)) 1194 if overfullLeaseThreshold < minOverfullThreshold { 1195 overfullLeaseThreshold = minOverfullThreshold 1196 } 1197 if source.Capacity.LeaseCount > overfullLeaseThreshold { 1198 return true 1199 } 1200 1201 if float64(source.Capacity.LeaseCount) > sl.candidateLeases.mean { 1202 underfullLeaseThreshold := int32(math.Ceil(sl.candidateLeases.mean * (1 - leaseRebalanceThreshold))) 1203 minUnderfullThreshold := int32(math.Ceil(sl.candidateLeases.mean - 5)) 1204 if underfullLeaseThreshold > minUnderfullThreshold { 1205 underfullLeaseThreshold = minUnderfullThreshold 1206 } 1207 1208 for _, repl := range existing { 1209 storeDesc, ok := a.storePool.getStoreDescriptor(repl.StoreID) 1210 if !ok { 1211 continue 1212 } 1213 if storeDesc.Capacity.LeaseCount < underfullLeaseThreshold { 1214 return true 1215 } 1216 } 1217 } 1218 return false 1219 } 1220 1221 func (a Allocator) preferredLeaseholders( 1222 zone *zonepb.ZoneConfig, existing []roachpb.ReplicaDescriptor, 1223 ) []roachpb.ReplicaDescriptor { 1224 // Go one preference at a time. As soon as we've found replicas that match a 1225 // preference, we don't need to look at the later preferences, because 1226 // they're meant to be ordered by priority. 1227 for _, preference := range zone.LeasePreferences { 1228 var preferred []roachpb.ReplicaDescriptor 1229 for _, repl := range existing { 1230 // TODO(a-robinson): Do all these lookups at once, up front? We could 1231 // easily be passing a slice of StoreDescriptors around all the Allocator 1232 // functions instead of ReplicaDescriptors. 
1233 storeDesc, ok := a.storePool.getStoreDescriptor(repl.StoreID) 1234 if !ok { 1235 continue 1236 } 1237 if constraint.ConjunctionsCheck(storeDesc, preference.Constraints) { 1238 preferred = append(preferred, repl) 1239 } 1240 } 1241 if len(preferred) > 0 { 1242 return preferred 1243 } 1244 } 1245 return nil 1246 } 1247 1248 // computeQuorum computes the quorum value for the given number of nodes. 1249 func computeQuorum(nodes int) int { 1250 return (nodes / 2) + 1 1251 } 1252 1253 // filterBehindReplicas removes any "behind" replicas from the supplied 1254 // slice. A "behind" replica is one which is not at or past the quorum commit 1255 // index. 1256 func filterBehindReplicas( 1257 ctx context.Context, raftStatus *raft.Status, replicas []roachpb.ReplicaDescriptor, 1258 ) []roachpb.ReplicaDescriptor { 1259 if raftStatus == nil || len(raftStatus.Progress) == 0 { 1260 // raftStatus.Progress is only populated on the Raft leader which means we 1261 // won't be able to rebalance a lease away if the lease holder is not the 1262 // Raft leader. This is rare enough not to matter. 1263 return nil 1264 } 1265 candidates := make([]roachpb.ReplicaDescriptor, 0, len(replicas)) 1266 for _, r := range replicas { 1267 if !replicaIsBehind(raftStatus, r.ReplicaID) { 1268 candidates = append(candidates, r) 1269 } 1270 } 1271 return candidates 1272 } 1273 1274 func replicaIsBehind(raftStatus *raft.Status, replicaID roachpb.ReplicaID) bool { 1275 if raftStatus == nil || len(raftStatus.Progress) == 0 { 1276 return true 1277 } 1278 // NB: We use raftStatus.Commit instead of getQuorumIndex() because the 1279 // latter can return a value that is less than the commit index. This is 1280 // useful for Raft log truncation which sometimes wishes to keep those 1281 // earlier indexes, but not appropriate for determining which nodes are 1282 // behind the actual commit index of the range. 1283 if progress, ok := raftStatus.Progress[uint64(replicaID)]; ok { 1284 if uint64(replicaID) == raftStatus.Lead || 1285 (progress.State == tracker.StateReplicate && 1286 progress.Match >= raftStatus.Commit) { 1287 return false 1288 } 1289 } 1290 return true 1291 } 1292 1293 // simulateFilterUnremovableReplicas removes any unremovable replicas from the 1294 // supplied slice. Unlike filterUnremovableReplicas, brandNewReplicaID is 1295 // considered up-to-date (and thus can participate in quorum), but is not 1296 // considered a candidate for removal. 1297 func simulateFilterUnremovableReplicas( 1298 ctx context.Context, 1299 raftStatus *raft.Status, 1300 replicas []roachpb.ReplicaDescriptor, 1301 brandNewReplicaID roachpb.ReplicaID, 1302 ) []roachpb.ReplicaDescriptor { 1303 status := *raftStatus 1304 status.Progress[uint64(brandNewReplicaID)] = tracker.Progress{ 1305 State: tracker.StateReplicate, 1306 Match: status.Commit, 1307 } 1308 return filterUnremovableReplicas(ctx, &status, replicas, brandNewReplicaID) 1309 } 1310 1311 // filterUnremovableReplicas removes any unremovable replicas from the supplied 1312 // slice. An unremovable replica is one which is a necessary part of the 1313 // quorum that will result from removing 1 replica. We forgive brandNewReplicaID 1314 // for being behind, since a new range can take a little while to catch up. 1315 // This is important when we've just added a replica in order to rebalance to 1316 // it (#17879). 
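//
// As a concrete illustration of the quorum arithmetic below, take a 3-replica
// range: oldQuorum = computeQuorum(3) = 2 and newQuorum = computeQuorum(2) = 2.
// With only one up-to-date replica nothing may be removed; with exactly two
// up-to-date replicas only replicas that are behind may be removed (and
// brandNewReplicaID remains protected); with all three caught up, any replica
// other than brandNewReplicaID may be removed.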
1317 func filterUnremovableReplicas( 1318 ctx context.Context, 1319 raftStatus *raft.Status, 1320 replicas []roachpb.ReplicaDescriptor, 1321 brandNewReplicaID roachpb.ReplicaID, 1322 ) []roachpb.ReplicaDescriptor { 1323 upToDateReplicas := filterBehindReplicas(ctx, raftStatus, replicas) 1324 oldQuorum := computeQuorum(len(replicas)) 1325 if len(upToDateReplicas) < oldQuorum { 1326 // The number of up-to-date replicas is less than the old quorum. No 1327 // replicas can be removed. A below quorum range won't be able to process a 1328 // replica removal in any case. The logic here prevents any attempt to even 1329 // try the removal. 1330 return nil 1331 } 1332 1333 newQuorum := computeQuorum(len(replicas) - 1) 1334 if len(upToDateReplicas) > newQuorum { 1335 // The number of up-to-date replicas is larger than the new quorum. Any 1336 // replica can be removed, though we want to filter out brandNewReplicaID. 1337 if brandNewReplicaID != 0 { 1338 candidates := make([]roachpb.ReplicaDescriptor, 0, len(replicas)-len(upToDateReplicas)) 1339 for _, r := range replicas { 1340 if r.ReplicaID != brandNewReplicaID { 1341 candidates = append(candidates, r) 1342 } 1343 } 1344 return candidates 1345 } 1346 return replicas 1347 } 1348 1349 // The number of up-to-date replicas is equal to the new quorum. Only allow 1350 // removal of behind replicas (except for brandNewReplicaID which is given a 1351 // free pass). 1352 candidates := make([]roachpb.ReplicaDescriptor, 0, len(replicas)-len(upToDateReplicas)) 1353 necessary := func(r roachpb.ReplicaDescriptor) bool { 1354 if r.ReplicaID == brandNewReplicaID { 1355 return true 1356 } 1357 for _, t := range upToDateReplicas { 1358 if t == r { 1359 return true 1360 } 1361 } 1362 return false 1363 } 1364 for _, r := range replicas { 1365 if !necessary(r) { 1366 candidates = append(candidates, r) 1367 } 1368 } 1369 return candidates 1370 } 1371 1372 func maxReplicaID(replicas []roachpb.ReplicaDescriptor) roachpb.ReplicaID { 1373 var max roachpb.ReplicaID 1374 for i := range replicas { 1375 if replicaID := replicas[i].ReplicaID; replicaID > max { 1376 max = replicaID 1377 } 1378 } 1379 return max 1380 }
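// logNeededReplicas is an illustrative helper, not part of the upstream
// allocator: the function name and log message are ours, and it exists only
// to demonstrate how GetNeededReplicas clamps the zone's replication factor.
// For example:
//
//   zone NumReplicas=5, clusterNodes=4  -> 3 (avoid an even replica count)
//   zone NumReplicas=5, clusterNodes=5  -> 5
//   zone NumReplicas=4, clusterNodes=10 -> 4 (even counts are kept when explicitly requested)
//   zone NumReplicas=3, clusterNodes=1  -> 3 (never below 3 unless fewer were requested)
func logNeededReplicas(ctx context.Context, zoneConfigReplicaCount int32, clusterNodes int) {
	need := GetNeededReplicas(zoneConfigReplicaCount, clusterNodes)
	log.VEventf(ctx, 3, "zone wants %d replica(s), %d usable node(s): allocator will aim for %d replica(s)",
		zoneConfigReplicaCount, clusterNodes, need)
}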