github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replicate_queue.go (about) 1 // Copyright 2015 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package kvserver 12 13 import ( 14 "bytes" 15 "context" 16 "fmt" 17 "sync/atomic" 18 "time" 19 20 "github.com/cockroachdb/cockroach/pkg/base" 21 "github.com/cockroachdb/cockroach/pkg/clusterversion" 22 "github.com/cockroachdb/cockroach/pkg/config" 23 "github.com/cockroachdb/cockroach/pkg/config/zonepb" 24 "github.com/cockroachdb/cockroach/pkg/gossip" 25 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb" 26 "github.com/cockroachdb/cockroach/pkg/roachpb" 27 "github.com/cockroachdb/cockroach/pkg/settings" 28 "github.com/cockroachdb/cockroach/pkg/util/hlc" 29 "github.com/cockroachdb/cockroach/pkg/util/log" 30 "github.com/cockroachdb/cockroach/pkg/util/metric" 31 "github.com/cockroachdb/cockroach/pkg/util/retry" 32 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 33 "github.com/cockroachdb/errors" 34 "go.etcd.io/etcd/raft" 35 ) 36 37 const ( 38 // replicateQueueTimerDuration is the duration between replication of queued 39 // replicas. 40 replicateQueueTimerDuration = 0 // zero duration to process replication greedily 41 42 // newReplicaGracePeriod is the amount of time that we allow for a new 43 // replica's raft state to catch up to the leader's before we start 44 // considering it to be behind for the sake of rebalancing. We choose a 45 // large value here because snapshots of large replicas can take a while 46 // in high latency clusters, and not allowing enough of a cushion can 47 // make rebalance thrashing more likely (#17879). 48 newReplicaGracePeriod = 5 * time.Minute 49 ) 50 51 // minLeaseTransferInterval controls how frequently leases can be transferred 52 // for rebalancing. It does not prevent transferring leases in order to allow 53 // a replica to be removed from a range. 54 var minLeaseTransferInterval = settings.RegisterNonNegativeDurationSetting( 55 "kv.allocator.min_lease_transfer_interval", 56 "controls how frequently leases can be transferred for rebalancing. "+ 57 "It does not prevent transferring leases in order to allow a "+ 58 "replica to be removed from a range.", 59 1*time.Second, 60 ) 61 62 var ( 63 metaReplicateQueueAddReplicaCount = metric.Metadata{ 64 Name: "queue.replicate.addreplica", 65 Help: "Number of replica additions attempted by the replicate queue", 66 Measurement: "Replica Additions", 67 Unit: metric.Unit_COUNT, 68 } 69 metaReplicateQueueRemoveReplicaCount = metric.Metadata{ 70 Name: "queue.replicate.removereplica", 71 Help: "Number of replica removals attempted by the replicate queue (typically in response to a rebalancer-initiated addition)", 72 Measurement: "Replica Removals", 73 Unit: metric.Unit_COUNT, 74 } 75 metaReplicateQueueRemoveDeadReplicaCount = metric.Metadata{ 76 Name: "queue.replicate.removedeadreplica", 77 Help: "Number of dead replica removals attempted by the replicate queue (typically in response to a node outage)", 78 Measurement: "Replica Removals", 79 Unit: metric.Unit_COUNT, 80 } 81 metaReplicateQueueRemoveLearnerReplicaCount = metric.Metadata{ 82 Name: "queue.replicate.removelearnerreplica", 83 Help: "Number of learner replica removals attempted by the replicate queue (typically due to internal race conditions)", 84 Measurement: "Replica Removals", 85 Unit: metric.Unit_COUNT, 86 } 87 metaReplicateQueueRebalanceReplicaCount = metric.Metadata{ 88 Name: "queue.replicate.rebalancereplica", 89 Help: "Number of replica rebalancer-initiated additions attempted by the replicate queue", 90 Measurement: "Replica Additions", 91 Unit: metric.Unit_COUNT, 92 } 93 metaReplicateQueueTransferLeaseCount = metric.Metadata{ 94 Name: "queue.replicate.transferlease", 95 Help: "Number of range lease transfers attempted by the replicate queue", 96 Measurement: "Lease Transfers", 97 Unit: metric.Unit_COUNT, 98 } 99 ) 100 101 // quorumError indicates a retryable error condition which sends replicas being 102 // processed through the replicate queue into purgatory so that they can be 103 // retried quickly as soon as nodes come online. 104 type quorumError struct { 105 msg string 106 } 107 108 func newQuorumError(f string, args ...interface{}) *quorumError { 109 return &quorumError{ 110 msg: fmt.Sprintf(f, args...), 111 } 112 } 113 114 func (e *quorumError) Error() string { 115 return e.msg 116 } 117 118 func (*quorumError) purgatoryErrorMarker() {} 119 120 // ReplicateQueueMetrics is the set of metrics for the replicate queue. 121 type ReplicateQueueMetrics struct { 122 AddReplicaCount *metric.Counter 123 RemoveReplicaCount *metric.Counter 124 RemoveDeadReplicaCount *metric.Counter 125 RemoveLearnerReplicaCount *metric.Counter 126 RebalanceReplicaCount *metric.Counter 127 TransferLeaseCount *metric.Counter 128 } 129 130 func makeReplicateQueueMetrics() ReplicateQueueMetrics { 131 return ReplicateQueueMetrics{ 132 AddReplicaCount: metric.NewCounter(metaReplicateQueueAddReplicaCount), 133 RemoveReplicaCount: metric.NewCounter(metaReplicateQueueRemoveReplicaCount), 134 RemoveDeadReplicaCount: metric.NewCounter(metaReplicateQueueRemoveDeadReplicaCount), 135 RemoveLearnerReplicaCount: metric.NewCounter(metaReplicateQueueRemoveLearnerReplicaCount), 136 RebalanceReplicaCount: metric.NewCounter(metaReplicateQueueRebalanceReplicaCount), 137 TransferLeaseCount: metric.NewCounter(metaReplicateQueueTransferLeaseCount), 138 } 139 } 140 141 // replicateQueue manages a queue of replicas which may need to add an 142 // additional replica to their range. 143 type replicateQueue struct { 144 *baseQueue 145 metrics ReplicateQueueMetrics 146 allocator Allocator 147 updateChan chan time.Time 148 lastLeaseTransfer atomic.Value // read and written by scanner & queue goroutines 149 } 150 151 // newReplicateQueue returns a new instance of replicateQueue. 152 func newReplicateQueue(store *Store, g *gossip.Gossip, allocator Allocator) *replicateQueue { 153 rq := &replicateQueue{ 154 metrics: makeReplicateQueueMetrics(), 155 allocator: allocator, 156 updateChan: make(chan time.Time, 1), 157 } 158 store.metrics.registry.AddMetricStruct(&rq.metrics) 159 rq.baseQueue = newBaseQueue( 160 "replicate", rq, store, g, 161 queueConfig{ 162 maxSize: defaultQueueMaxSize, 163 needsLease: true, 164 needsSystemConfig: true, 165 acceptsUnsplitRanges: store.TestingKnobs().ReplicateQueueAcceptsUnsplit, 166 // The processing of the replicate queue often needs to send snapshots 167 // so we use the raftSnapshotQueueTimeoutFunc. This function sets a 168 // timeout based on the range size and the sending rate in addition 169 // to consulting the setting which controls the minimum timeout. 170 processTimeoutFunc: makeQueueSnapshotTimeoutFunc(rebalanceSnapshotRate), 171 successes: store.metrics.ReplicateQueueSuccesses, 172 failures: store.metrics.ReplicateQueueFailures, 173 pending: store.metrics.ReplicateQueuePending, 174 processingNanos: store.metrics.ReplicateQueueProcessingNanos, 175 purgatory: store.metrics.ReplicateQueuePurgatory, 176 }, 177 ) 178 179 updateFn := func() { 180 select { 181 case rq.updateChan <- timeutil.Now(): 182 default: 183 } 184 } 185 186 // Register gossip and node liveness callbacks to signal that 187 // replicas in purgatory might be retried. 188 if g != nil { // gossip is nil for some unittests 189 g.RegisterCallback(gossip.MakePrefixPattern(gossip.KeyStorePrefix), func(key string, _ roachpb.Value) { 190 if !rq.store.IsStarted() { 191 return 192 } 193 // Because updates to our store's own descriptor won't affect 194 // replicas in purgatory, skip updating the purgatory channel 195 // in this case. 196 if storeID, err := gossip.StoreIDFromKey(key); err == nil && storeID == rq.store.StoreID() { 197 return 198 } 199 updateFn() 200 }) 201 } 202 if nl := store.cfg.NodeLiveness; nl != nil { // node liveness is nil for some unittests 203 nl.RegisterCallback(func(_ roachpb.NodeID) { 204 updateFn() 205 }) 206 } 207 208 return rq 209 } 210 211 func (rq *replicateQueue) shouldQueue( 212 ctx context.Context, now hlc.Timestamp, repl *Replica, sysCfg *config.SystemConfig, 213 ) (shouldQ bool, priority float64) { 214 desc, zone := repl.DescAndZone() 215 action, priority := rq.allocator.ComputeAction(ctx, zone, desc) 216 217 // For simplicity, the first thing the allocator does is remove learners, so 218 // it can do all of its reasoning about only voters. We do the same here so 219 // the executions of the allocator's decisions can be in terms of voters. 220 if action == AllocatorRemoveLearner { 221 return true, priority 222 } 223 voterReplicas := desc.Replicas().Voters() 224 225 if action == AllocatorNoop { 226 log.VEventf(ctx, 2, "no action to take") 227 return false, 0 228 } else if action != AllocatorConsiderRebalance { 229 log.VEventf(ctx, 2, "repair needed (%s), enqueuing", action) 230 return true, priority 231 } 232 233 if !rq.store.TestingKnobs().DisableReplicaRebalancing { 234 rangeUsageInfo := rangeUsageInfoForRepl(repl) 235 _, _, _, ok := rq.allocator.RebalanceTarget( 236 ctx, zone, repl.RaftStatus(), voterReplicas, rangeUsageInfo, storeFilterThrottled) 237 if ok { 238 log.VEventf(ctx, 2, "rebalance target found, enqueuing") 239 return true, 0 240 } 241 log.VEventf(ctx, 2, "no rebalance target found, not enqueuing") 242 } 243 244 // If the lease is valid, check to see if we should transfer it. 245 if lease, _ := repl.GetLease(); repl.IsLeaseValid(lease, now) { 246 if rq.canTransferLease() && 247 rq.allocator.ShouldTransferLease( 248 ctx, zone, voterReplicas, lease.Replica.StoreID, repl.leaseholderStats) { 249 log.VEventf(ctx, 2, "lease transfer needed, enqueuing") 250 return true, 0 251 } 252 } 253 254 return false, 0 255 } 256 257 func (rq *replicateQueue) process( 258 ctx context.Context, repl *Replica, sysCfg *config.SystemConfig, 259 ) error { 260 retryOpts := retry.Options{ 261 InitialBackoff: 50 * time.Millisecond, 262 MaxBackoff: 1 * time.Second, 263 Multiplier: 2, 264 MaxRetries: 5, 265 } 266 267 // Use a retry loop in order to backoff in the case of snapshot errors, 268 // usually signaling that a rebalancing reservation could not be made with the 269 // selected target. 270 for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); { 271 for { 272 requeue, err := rq.processOneChange(ctx, repl, rq.canTransferLease, false /* dryRun */) 273 if IsSnapshotError(err) { 274 // If ChangeReplicas failed because the snapshot failed, we log the 275 // error but then return success indicating we should retry the 276 // operation. The most likely causes of the snapshot failing are a 277 // declined reservation or the remote node being unavailable. In either 278 // case we don't want to wait another scanner cycle before reconsidering 279 // the range. 280 log.Infof(ctx, "%v", err) 281 break 282 } 283 284 if err != nil { 285 return err 286 } 287 288 if testingAggressiveConsistencyChecks { 289 if err := rq.store.consistencyQueue.process(ctx, repl, sysCfg); err != nil { 290 log.Warningf(ctx, "%v", err) 291 } 292 } 293 294 if !requeue { 295 return nil 296 } 297 298 log.VEventf(ctx, 1, "re-processing") 299 } 300 } 301 302 return errors.Errorf("failed to replicate after %d retries", retryOpts.MaxRetries) 303 } 304 305 func (rq *replicateQueue) processOneChange( 306 ctx context.Context, repl *Replica, canTransferLease func() bool, dryRun bool, 307 ) (requeue bool, _ error) { 308 // Check lease and destroy status here. The queue does this higher up already, but 309 // adminScatter (and potential other future callers) also call this method and don't 310 // perform this check, which could lead to infinite loops. 311 if _, err := repl.IsDestroyed(); err != nil { 312 return false, err 313 } 314 if _, pErr := repl.redirectOnOrAcquireLease(ctx); pErr != nil { 315 return false, pErr.GoError() 316 } 317 318 desc, zone := repl.DescAndZone() 319 320 // Avoid taking action if the range has too many dead replicas to make 321 // quorum. 322 voterReplicas := desc.Replicas().Voters() 323 liveVoterReplicas, deadVoterReplicas := rq.allocator.storePool.liveAndDeadReplicas(voterReplicas) 324 { 325 unavailable := !desc.Replicas().CanMakeProgress(func(rDesc roachpb.ReplicaDescriptor) bool { 326 for _, inner := range liveVoterReplicas { 327 if inner.ReplicaID == rDesc.ReplicaID { 328 return true 329 } 330 } 331 return false 332 }) 333 if unavailable { 334 return false, newQuorumError( 335 "range requires a replication change, but live replicas %v don't constitute a quorum for %v:", 336 liveVoterReplicas, 337 desc.Replicas().All(), 338 ) 339 } 340 } 341 342 action, _ := rq.allocator.ComputeAction(ctx, zone, desc) 343 log.VEventf(ctx, 1, "next replica action: %s", action) 344 345 // For simplicity, the first thing the allocator does is remove learners, so 346 // it can do all of its reasoning about only voters. We do the same here so 347 // the executions of the allocator's decisions can be in terms of voters. 348 if action == AllocatorRemoveLearner { 349 return rq.removeLearner(ctx, repl, dryRun) 350 } 351 352 switch action { 353 case AllocatorNoop, AllocatorRangeUnavailable: 354 // We're either missing liveness information or the range is known to have 355 // lost quorum. Either way, it's not a good idea to make changes right now. 356 // Let the scanner requeue it again later. 357 return false, nil 358 case AllocatorAdd: 359 return rq.addOrReplace(ctx, repl, voterReplicas, liveVoterReplicas, -1 /* removeIdx */, dryRun) 360 case AllocatorRemove: 361 return rq.remove(ctx, repl, voterReplicas, dryRun) 362 case AllocatorReplaceDead: 363 if len(deadVoterReplicas) == 0 { 364 // Nothing to do. 365 return false, nil 366 } 367 removeIdx := -1 // guaranteed to be changed below 368 for i, rDesc := range voterReplicas { 369 if rDesc.StoreID == deadVoterReplicas[0].StoreID { 370 removeIdx = i 371 break 372 } 373 } 374 if removeIdx < 0 { 375 return false, errors.AssertionFailedf( 376 "dead voter %v unexpectedly not found in %v", 377 deadVoterReplicas[0], voterReplicas) 378 } 379 return rq.addOrReplace(ctx, repl, voterReplicas, liveVoterReplicas, removeIdx, dryRun) 380 case AllocatorReplaceDecommissioning: 381 decommissioningReplicas := rq.allocator.storePool.decommissioningReplicas(voterReplicas) 382 if len(decommissioningReplicas) == 0 { 383 // Nothing to do. 384 return false, nil 385 } 386 removeIdx := -1 // guaranteed to be changed below 387 for i, rDesc := range voterReplicas { 388 if rDesc.StoreID == decommissioningReplicas[0].StoreID { 389 removeIdx = i 390 break 391 } 392 } 393 if removeIdx < 0 { 394 return false, errors.AssertionFailedf( 395 "decommissioning voter %v unexpectedly not found in %v", 396 decommissioningReplicas[0], voterReplicas) 397 } 398 return rq.addOrReplace(ctx, repl, voterReplicas, liveVoterReplicas, removeIdx, dryRun) 399 case AllocatorRemoveDecommissioning: 400 // NB: this path will only be hit when the range is over-replicated and 401 // has decommissioning replicas; in the common case we'll hit 402 // AllocatorReplaceDecommissioning above. 403 return rq.removeDecommissioning(ctx, repl, dryRun) 404 case AllocatorRemoveDead: 405 // NB: this path will only be hit when the range is over-replicated and 406 // has dead replicas; in the common case we'll hit AllocatorReplaceDead 407 // above. 408 return rq.removeDead(ctx, repl, deadVoterReplicas, dryRun) 409 case AllocatorRemoveLearner: 410 return rq.removeLearner(ctx, repl, dryRun) 411 case AllocatorConsiderRebalance: 412 return rq.considerRebalance(ctx, repl, voterReplicas, canTransferLease, dryRun) 413 case AllocatorFinalizeAtomicReplicationChange: 414 _, err := maybeLeaveAtomicChangeReplicasAndRemoveLearners(ctx, repl.store, repl.Desc()) 415 // Requeue because either we failed to transition out of a joint state 416 // (bad) or we did and there might be more to do for that range. 417 return true, err 418 default: 419 return false, errors.Errorf("unknown allocator action %v", action) 420 } 421 } 422 423 // addOrReplace adds or replaces a replica. If removeIdx is -1, an addition is 424 // carried out. Otherwise, removeIdx must be a valid index into existingReplicas 425 // and specifies which replica to replace with a new one. 426 // 427 // The method preferably issues an atomic replica swap, but may not be able to 428 // do this in all cases, such as when atomic replication changes are not 429 // available, or when the range consists of a single replica. As a fall back, 430 // only the addition is carried out; the removal is then a follow-up step for 431 // the next scanner cycle. 432 func (rq *replicateQueue) addOrReplace( 433 ctx context.Context, 434 repl *Replica, 435 existingReplicas []roachpb.ReplicaDescriptor, 436 liveVoterReplicas []roachpb.ReplicaDescriptor, 437 removeIdx int, // -1 for no removal 438 dryRun bool, 439 ) (requeue bool, _ error) { 440 if len(existingReplicas) == 1 { 441 // If only one replica remains, that replica is the leaseholder and 442 // we won't be able to swap it out. Ignore the removal and simply add 443 // a replica. 444 removeIdx = -1 445 } 446 st := rq.store.cfg.Settings 447 if !st.Version.IsActive(ctx, clusterversion.VersionAtomicChangeReplicas) { 448 // If we can't swap yet, don't. 449 removeIdx = -1 450 } 451 452 remainingLiveReplicas := liveVoterReplicas 453 if removeIdx >= 0 { 454 replToRemove := existingReplicas[removeIdx] 455 for i, r := range liveVoterReplicas { 456 if r.ReplicaID == replToRemove.ReplicaID { 457 remainingLiveReplicas = append(liveVoterReplicas[:i:i], liveVoterReplicas[i+1:]...) 458 break 459 } 460 } 461 // See about transferring the lease away if we're about to remove the 462 // leaseholder. 463 done, err := rq.maybeTransferLeaseAway(ctx, repl, existingReplicas[removeIdx].StoreID, dryRun) 464 if err != nil { 465 return false, err 466 } 467 if done { 468 // Lease was transferred away. Next leaseholder is going to take over. 469 return false, nil 470 } 471 } 472 473 desc, zone := repl.DescAndZone() 474 // Allocate a target assuming that the replica we're replacing (if any) is 475 // already gone. The allocator should not try to re-add this replica since 476 // there is a reason we're removing it (i.e. dead or decommissioning). If we 477 // left the replica in the slice, the allocator would not be guaranteed to 478 // pick a replica that fills the gap removeRepl leaves once it's gone. 479 newStore, details, err := rq.allocator.AllocateTarget( 480 ctx, 481 zone, 482 remainingLiveReplicas, 483 ) 484 if err != nil { 485 return false, err 486 } 487 if removeIdx >= 0 && newStore.StoreID == existingReplicas[removeIdx].StoreID { 488 return false, errors.AssertionFailedf("allocator suggested to replace replica on s%d with itself", newStore.StoreID) 489 } 490 newReplica := roachpb.ReplicationTarget{ 491 NodeID: newStore.Node.NodeID, 492 StoreID: newStore.StoreID, 493 } 494 495 clusterNodes := rq.allocator.storePool.ClusterNodeCount() 496 need := GetNeededReplicas(*zone.NumReplicas, clusterNodes) 497 498 // Only up-replicate if there are suitable allocation targets such that, 499 // either the replication goal is met, or it is possible to get to the next 500 // odd number of replicas. A consensus group of size 2n has worse failure 501 // tolerance properties than a group of size 2n - 1 because it has a larger 502 // quorum. For example, up-replicating from 1 to 2 replicas only makes sense 503 // if it is possible to be able to go to 3 replicas. 504 // 505 // NB: If willHave > need, then always allow up-replicating as that 506 // will be the case when up-replicating a range with a decommissioning 507 // replica. 508 // 509 // We skip this check if we're swapping a replica, since that does not 510 // change the quorum size. 511 if willHave := len(existingReplicas) + 1; removeIdx < 0 && willHave < need && willHave%2 == 0 { 512 // This means we are going to up-replicate to an even replica state. 513 // Check if it is possible to go to an odd replica state beyond it. 514 oldPlusNewReplicas := append([]roachpb.ReplicaDescriptor(nil), existingReplicas...) 515 oldPlusNewReplicas = append(oldPlusNewReplicas, roachpb.ReplicaDescriptor{ 516 NodeID: newStore.Node.NodeID, 517 StoreID: newStore.StoreID, 518 }) 519 _, _, err := rq.allocator.AllocateTarget( 520 ctx, 521 zone, 522 oldPlusNewReplicas, 523 ) 524 if err != nil { 525 // It does not seem possible to go to the next odd replica state. Note 526 // that AllocateTarget returns an allocatorError (a purgatoryError) 527 // when purgatory is requested. 528 return false, errors.Wrap(err, "avoid up-replicating to fragile quorum") 529 } 530 } 531 rq.metrics.AddReplicaCount.Inc(1) 532 ops := roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, newReplica) 533 if removeIdx < 0 { 534 log.VEventf(ctx, 1, "adding replica %+v: %s", 535 newReplica, rangeRaftProgress(repl.RaftStatus(), existingReplicas)) 536 } else { 537 rq.metrics.RemoveReplicaCount.Inc(1) 538 removeReplica := existingReplicas[removeIdx] 539 log.VEventf(ctx, 1, "replacing replica %s with %+v: %s", 540 removeReplica, newReplica, rangeRaftProgress(repl.RaftStatus(), existingReplicas)) 541 ops = append(ops, 542 roachpb.MakeReplicationChanges(roachpb.REMOVE_REPLICA, roachpb.ReplicationTarget{ 543 StoreID: removeReplica.StoreID, 544 NodeID: removeReplica.NodeID, 545 })...) 546 } 547 548 if err := rq.changeReplicas( 549 ctx, 550 repl, 551 ops, 552 desc, 553 SnapshotRequest_RECOVERY, 554 kvserverpb.ReasonRangeUnderReplicated, 555 details, 556 dryRun, 557 ); err != nil { 558 return false, err 559 } 560 // Always requeue to see if more work needs to be done. 561 return true, nil 562 } 563 564 // findRemoveTarget takes a list of replicas and picks one to remove, making 565 // sure to not remove a newly added replica or to violate the zone configs in 566 // the progress. 567 func (rq *replicateQueue) findRemoveTarget( 568 ctx context.Context, 569 repl interface { 570 DescAndZone() (*roachpb.RangeDescriptor, *zonepb.ZoneConfig) 571 LastReplicaAdded() (roachpb.ReplicaID, time.Time) 572 RaftStatus() *raft.Status 573 }, 574 existingReplicas []roachpb.ReplicaDescriptor, 575 ) (roachpb.ReplicaDescriptor, string, error) { 576 _, zone := repl.DescAndZone() 577 // This retry loop involves quick operations on local state, so a 578 // small MaxBackoff is good (but those local variables change on 579 // network time scales as raft receives responses). 580 // 581 // TODO(bdarnell): There's another retry loop at process(). It 582 // would be nice to combine these, but I'm keeping them separate 583 // for now so we can tune the options separately. 584 retryOpts := retry.Options{ 585 InitialBackoff: time.Millisecond, 586 MaxBackoff: 200 * time.Millisecond, 587 Multiplier: 2, 588 } 589 590 var candidates []roachpb.ReplicaDescriptor 591 deadline := timeutil.Now().Add(2 * base.NetworkTimeout) 592 for r := retry.StartWithCtx(ctx, retryOpts); r.Next() && timeutil.Now().Before(deadline); { 593 lastReplAdded, lastAddedTime := repl.LastReplicaAdded() 594 if timeutil.Since(lastAddedTime) > newReplicaGracePeriod { 595 lastReplAdded = 0 596 } 597 raftStatus := repl.RaftStatus() 598 if raftStatus == nil || raftStatus.RaftState != raft.StateLeader { 599 // If we've lost raft leadership, we're unlikely to regain it so give up immediately. 600 return roachpb.ReplicaDescriptor{}, "", &benignError{errors.Errorf("not raft leader while range needs removal")} 601 } 602 candidates = filterUnremovableReplicas(ctx, raftStatus, existingReplicas, lastReplAdded) 603 log.VEventf(ctx, 3, "filtered unremovable replicas from %v to get %v as candidates for removal: %s", 604 existingReplicas, candidates, rangeRaftProgress(raftStatus, existingReplicas)) 605 if len(candidates) > 0 { 606 break 607 } 608 if len(raftStatus.Progress) <= 2 { 609 // HACK(bdarnell): Downreplicating to a single node from 610 // multiple nodes is not really supported. There are edge 611 // cases in which the two peers stop communicating with each 612 // other too soon and we don't reach a satisfactory 613 // resolution. However, some tests (notably 614 // TestRepartitioning) get into this state, and if the 615 // replication queue spends its entire timeout waiting for the 616 // downreplication to finish the test will time out. As a 617 // hack, just fail-fast when we're trying to go down to a 618 // single replica. 619 break 620 } 621 // After upreplication, the candidates for removal could still 622 // be catching up. The allocator determined that the range was 623 // over-replicated, and it's important to clear that state as 624 // quickly as we can (because over-replicated ranges may be 625 // under-diversified). If we return an error here, this range 626 // probably won't be processed again until the next scanner 627 // cycle, which is too long, so we retry here. 628 } 629 if len(candidates) == 0 { 630 // If we timed out and still don't have any valid candidates, give up. 631 return roachpb.ReplicaDescriptor{}, "", &benignError{errors.Errorf("no removable replicas from range that needs a removal: %s", 632 rangeRaftProgress(repl.RaftStatus(), existingReplicas))} 633 } 634 635 return rq.allocator.RemoveTarget(ctx, zone, candidates, existingReplicas) 636 } 637 638 // maybeTransferLeaseAway is called whenever a replica on a given store is 639 // slated for removal. If the store corresponds to the store of the caller 640 // (which is very likely to be the leaseholder), then this removal would fail. 641 // Instead, this method will attempt to transfer the lease away, and returns 642 // true to indicate to the caller that it should not pursue the current 643 // replication change further because it is no longer the leaseholder. When the 644 // returned bool is false, it should continue. On error, the caller should also 645 // stop. 646 func (rq *replicateQueue) maybeTransferLeaseAway( 647 ctx context.Context, repl *Replica, removeStoreID roachpb.StoreID, dryRun bool, 648 ) (done bool, _ error) { 649 if removeStoreID != repl.store.StoreID() { 650 return false, nil 651 } 652 desc, zone := repl.DescAndZone() 653 // The local replica was selected as the removal target, but that replica 654 // is the leaseholder, so transfer the lease instead. We don't check that 655 // the current store has too many leases in this case under the 656 // assumption that replica balance is a greater concern. Also note that 657 // AllocatorRemove action takes preference over AllocatorConsiderRebalance 658 // (rebalancing) which is where lease transfer would otherwise occur. We 659 // need to be able to transfer leases in AllocatorRemove in order to get 660 // out of situations where this store is overfull and yet holds all the 661 // leases. The fullness checks need to be ignored for cases where 662 // a replica needs to be removed for constraint violations. 663 return rq.findTargetAndTransferLease( 664 ctx, 665 repl, 666 desc, 667 zone, 668 transferLeaseOptions{ 669 dryRun: dryRun, 670 }, 671 ) 672 } 673 674 func (rq *replicateQueue) remove( 675 ctx context.Context, repl *Replica, existingReplicas []roachpb.ReplicaDescriptor, dryRun bool, 676 ) (requeue bool, _ error) { 677 removeReplica, details, err := rq.findRemoveTarget(ctx, repl, existingReplicas) 678 if err != nil { 679 return false, err 680 } 681 done, err := rq.maybeTransferLeaseAway(ctx, repl, removeReplica.StoreID, dryRun) 682 if err != nil { 683 return false, err 684 } 685 if done { 686 // Lease is now elsewhere, so we're not in charge any more. 687 return false, nil 688 } 689 690 // Remove a replica. 691 rq.metrics.RemoveReplicaCount.Inc(1) 692 log.VEventf(ctx, 1, "removing replica %+v due to over-replication: %s", 693 removeReplica, rangeRaftProgress(repl.RaftStatus(), existingReplicas)) 694 target := roachpb.ReplicationTarget{ 695 NodeID: removeReplica.NodeID, 696 StoreID: removeReplica.StoreID, 697 } 698 desc, _ := repl.DescAndZone() 699 if err := rq.changeReplicas( 700 ctx, 701 repl, 702 roachpb.MakeReplicationChanges(roachpb.REMOVE_REPLICA, target), 703 desc, 704 SnapshotRequest_UNKNOWN, // unused 705 kvserverpb.ReasonRangeOverReplicated, 706 details, 707 dryRun, 708 ); err != nil { 709 return false, err 710 } 711 return true, nil 712 } 713 714 func (rq *replicateQueue) removeDecommissioning( 715 ctx context.Context, repl *Replica, dryRun bool, 716 ) (requeue bool, _ error) { 717 desc, _ := repl.DescAndZone() 718 decommissioningReplicas := rq.allocator.storePool.decommissioningReplicas(desc.Replicas().All()) 719 if len(decommissioningReplicas) == 0 { 720 log.VEventf(ctx, 1, "range of replica %s was identified as having decommissioning replicas, "+ 721 "but no decommissioning replicas were found", repl) 722 return true, nil 723 } 724 decommissioningReplica := decommissioningReplicas[0] 725 done, err := rq.maybeTransferLeaseAway(ctx, repl, decommissioningReplica.StoreID, dryRun) 726 if err != nil { 727 return false, err 728 } 729 if done { 730 // Not leaseholder any more. 731 return false, nil 732 } 733 // Remove the decommissioning replica. 734 rq.metrics.RemoveReplicaCount.Inc(1) 735 log.VEventf(ctx, 1, "removing decommissioning replica %+v from store", decommissioningReplica) 736 target := roachpb.ReplicationTarget{ 737 NodeID: decommissioningReplica.NodeID, 738 StoreID: decommissioningReplica.StoreID, 739 } 740 if err := rq.changeReplicas( 741 ctx, 742 repl, 743 roachpb.MakeReplicationChanges(roachpb.REMOVE_REPLICA, target), 744 desc, 745 SnapshotRequest_UNKNOWN, // unused 746 kvserverpb.ReasonStoreDecommissioning, "", dryRun, 747 ); err != nil { 748 return false, err 749 } 750 // We removed a replica, so check if there's more to do. 751 return true, nil 752 } 753 754 func (rq *replicateQueue) removeDead( 755 ctx context.Context, repl *Replica, deadVoterReplicas []roachpb.ReplicaDescriptor, dryRun bool, 756 ) (requeue bool, _ error) { 757 desc := repl.Desc() 758 if len(deadVoterReplicas) == 0 { 759 log.VEventf(ctx, 1, "range of replica %s was identified as having dead replicas, but no dead replicas were found", repl) 760 return true, nil 761 } 762 deadReplica := deadVoterReplicas[0] 763 rq.metrics.RemoveDeadReplicaCount.Inc(1) 764 log.VEventf(ctx, 1, "removing dead replica %+v from store", deadReplica) 765 target := roachpb.ReplicationTarget{ 766 NodeID: deadReplica.NodeID, 767 StoreID: deadReplica.StoreID, 768 } 769 // NB: we don't check whether to transfer the lease away because if the removal target 770 // is dead, it's not us (and if for some reason that happens, the removal is simply 771 // going to fail). 772 if err := rq.changeReplicas( 773 ctx, 774 repl, 775 roachpb.MakeReplicationChanges(roachpb.REMOVE_REPLICA, target), 776 desc, 777 SnapshotRequest_UNKNOWN, // unused 778 kvserverpb.ReasonStoreDead, 779 "", 780 dryRun, 781 ); err != nil { 782 return false, err 783 } 784 return true, nil 785 } 786 787 func (rq *replicateQueue) removeLearner( 788 ctx context.Context, repl *Replica, dryRun bool, 789 ) (requeue bool, _ error) { 790 desc := repl.Desc() 791 learnerReplicas := desc.Replicas().Learners() 792 if len(learnerReplicas) == 0 { 793 log.VEventf(ctx, 1, "range of replica %s was identified as having learner replicas, "+ 794 "but no learner replicas were found", repl) 795 return true, nil 796 } 797 learnerReplica := learnerReplicas[0] 798 rq.metrics.RemoveLearnerReplicaCount.Inc(1) 799 log.VEventf(ctx, 1, "removing learner replica %+v from store", learnerReplica) 800 target := roachpb.ReplicationTarget{ 801 NodeID: learnerReplica.NodeID, 802 StoreID: learnerReplica.StoreID, 803 } 804 // NB: we don't check whether to transfer the lease away because we're very unlikely 805 // to be the learner (and if so, we don't have the lease any more, so after the removal 806 // fails the situation will have rectified itself). 807 if err := rq.changeReplicas( 808 ctx, 809 repl, 810 roachpb.MakeReplicationChanges(roachpb.REMOVE_REPLICA, target), 811 desc, 812 SnapshotRequest_UNKNOWN, 813 kvserverpb.ReasonAbandonedLearner, 814 "", 815 dryRun, 816 ); err != nil { 817 return false, err 818 } 819 return true, nil 820 } 821 822 func (rq *replicateQueue) considerRebalance( 823 ctx context.Context, 824 repl *Replica, 825 existingReplicas []roachpb.ReplicaDescriptor, 826 canTransferLease func() bool, 827 dryRun bool, 828 ) (requeue bool, _ error) { 829 desc, zone := repl.DescAndZone() 830 // The Noop case will result if this replica was queued in order to 831 // rebalance. Attempt to find a rebalancing target. 832 if !rq.store.TestingKnobs().DisableReplicaRebalancing { 833 rangeUsageInfo := rangeUsageInfoForRepl(repl) 834 addTarget, removeTarget, details, ok := rq.allocator.RebalanceTarget( 835 ctx, zone, repl.RaftStatus(), existingReplicas, rangeUsageInfo, 836 storeFilterThrottled) 837 if !ok { 838 log.VEventf(ctx, 1, "no suitable rebalance target") 839 } else if done, err := rq.maybeTransferLeaseAway(ctx, repl, removeTarget.StoreID, dryRun); err != nil { 840 log.VEventf(ctx, 1, "want to remove self, but failed to transfer lease away: %s", err) 841 } else if done { 842 // Lease is now elsewhere, so we're not in charge any more. 843 return false, nil 844 } else { 845 // We have a replica to remove and one we can add, so let's swap them 846 // out. 847 chgs := []roachpb.ReplicationChange{ 848 // NB: we place the addition first because in the case of 849 // atomic replication changes being turned off, the changes 850 // will be executed individually in the order in which they 851 // appear. 852 {Target: addTarget, ChangeType: roachpb.ADD_REPLICA}, 853 {Target: removeTarget, ChangeType: roachpb.REMOVE_REPLICA}, 854 } 855 856 if len(existingReplicas) == 1 { 857 // If there's only one replica, the removal target is the 858 // leaseholder and this is unsupported and will fail. However, 859 // this is also the only way to rebalance in a single-replica 860 // range. If we try the atomic swap here, we'll fail doing 861 // nothing, and so we stay locked into the current distribution 862 // of replicas. (Note that maybeTransferLeaseAway above will not 863 // have found a target, and so will have returned (false, nil). 864 // 865 // Do the best thing we can, which is carry out the addition 866 // only, which should succeed, and the next time we touch this 867 // range, we will have one more replica and hopefully it will 868 // take the lease and remove the current leaseholder. 869 // 870 // It's possible that "rebalancing deadlock" can occur in other 871 // scenarios, it's really impossible to tell from the code given 872 // the constraints we support. However, the lease transfer often 873 // does not happen spuriously, and we can't enter dangerous 874 // configurations sporadically, so this code path is only hit 875 // when we know it's necessary, picking the smaller of two evils. 876 // 877 // See https://github.com/cockroachdb/cockroach/issues/40333. 878 chgs = chgs[:1] 879 log.VEventf(ctx, 1, "can't swap replica due to lease; falling back to add") 880 } 881 882 rq.metrics.RebalanceReplicaCount.Inc(1) 883 log.VEventf(ctx, 1, "rebalancing %+v to %+v: %s", 884 removeTarget, addTarget, rangeRaftProgress(repl.RaftStatus(), existingReplicas)) 885 886 if err := rq.changeReplicas( 887 ctx, 888 repl, 889 chgs, 890 desc, 891 SnapshotRequest_REBALANCE, 892 kvserverpb.ReasonRebalance, 893 details, 894 dryRun, 895 ); err != nil { 896 return false, err 897 } 898 return true, nil 899 } 900 } 901 902 if canTransferLease() { 903 // We require the lease in order to process replicas, so 904 // repl.store.StoreID() corresponds to the lease-holder's store ID. 905 transferred, err := rq.findTargetAndTransferLease( 906 ctx, 907 repl, 908 desc, 909 zone, 910 transferLeaseOptions{ 911 checkTransferLeaseSource: true, 912 checkCandidateFullness: true, 913 dryRun: dryRun, 914 }, 915 ) 916 if err != nil { 917 return false, err 918 } 919 // Do not requeue as we transferred our lease away. 920 if transferred { 921 return false, nil 922 } 923 } 924 925 // No action was necessary and no rebalance target was found. Return 926 // without re-queuing this replica. 927 return false, nil 928 } 929 930 type transferLeaseOptions struct { 931 checkTransferLeaseSource bool 932 checkCandidateFullness bool 933 dryRun bool 934 } 935 936 func (rq *replicateQueue) findTargetAndTransferLease( 937 ctx context.Context, 938 repl *Replica, 939 desc *roachpb.RangeDescriptor, 940 zone *zonepb.ZoneConfig, 941 opts transferLeaseOptions, 942 ) (bool, error) { 943 // Learner replicas aren't allowed to become the leaseholder or raft leader, 944 // so only consider the `Voters` replicas. 945 target := rq.allocator.TransferLeaseTarget( 946 ctx, 947 zone, 948 desc.Replicas().Voters(), 949 repl.store.StoreID(), 950 repl.leaseholderStats, 951 opts.checkTransferLeaseSource, 952 opts.checkCandidateFullness, 953 false, /* alwaysAllowDecisionWithoutStats */ 954 ) 955 if target == (roachpb.ReplicaDescriptor{}) { 956 return false, nil 957 } 958 959 if opts.dryRun { 960 log.VEventf(ctx, 1, "transferring lease to s%d", target.StoreID) 961 return false, nil 962 } 963 964 avgQPS, qpsMeasurementDur := repl.leaseholderStats.avgQPS() 965 if qpsMeasurementDur < MinStatsDuration { 966 avgQPS = 0 967 } 968 err := rq.transferLease(ctx, repl, target, avgQPS) 969 return err == nil, err 970 } 971 972 func (rq *replicateQueue) transferLease( 973 ctx context.Context, repl *Replica, target roachpb.ReplicaDescriptor, rangeQPS float64, 974 ) error { 975 rq.metrics.TransferLeaseCount.Inc(1) 976 log.VEventf(ctx, 1, "transferring lease to s%d", target.StoreID) 977 if err := repl.AdminTransferLease(ctx, target.StoreID); err != nil { 978 return errors.Wrapf(err, "%s: unable to transfer lease to s%d", repl, target.StoreID) 979 } 980 rq.lastLeaseTransfer.Store(timeutil.Now()) 981 rq.allocator.storePool.updateLocalStoresAfterLeaseTransfer( 982 repl.store.StoreID(), target.StoreID, rangeQPS) 983 return nil 984 } 985 986 func (rq *replicateQueue) changeReplicas( 987 ctx context.Context, 988 repl *Replica, 989 chgs roachpb.ReplicationChanges, 990 desc *roachpb.RangeDescriptor, 991 priority SnapshotRequest_Priority, 992 reason kvserverpb.RangeLogEventReason, 993 details string, 994 dryRun bool, 995 ) error { 996 if dryRun { 997 return nil 998 } 999 if _, err := repl.ChangeReplicas(ctx, desc, priority, reason, details, chgs); err != nil { 1000 return err 1001 } 1002 rangeUsageInfo := rangeUsageInfoForRepl(repl) 1003 for _, chg := range chgs { 1004 rq.allocator.storePool.updateLocalStoreAfterRebalance( 1005 chg.Target.StoreID, rangeUsageInfo, chg.ChangeType) 1006 } 1007 return nil 1008 } 1009 1010 func (rq *replicateQueue) canTransferLease() bool { 1011 if lastLeaseTransfer := rq.lastLeaseTransfer.Load(); lastLeaseTransfer != nil { 1012 minInterval := minLeaseTransferInterval.Get(&rq.store.cfg.Settings.SV) 1013 return timeutil.Since(lastLeaseTransfer.(time.Time)) > minInterval 1014 } 1015 return true 1016 } 1017 1018 func (*replicateQueue) timer(_ time.Duration) time.Duration { 1019 return replicateQueueTimerDuration 1020 } 1021 1022 // purgatoryChan returns the replicate queue's store update channel. 1023 func (rq *replicateQueue) purgatoryChan() <-chan time.Time { 1024 return rq.updateChan 1025 } 1026 1027 // rangeRaftStatus pretty-prints the Raft progress (i.e. Raft log position) of 1028 // the replicas. 1029 func rangeRaftProgress(raftStatus *raft.Status, replicas []roachpb.ReplicaDescriptor) string { 1030 if raftStatus == nil { 1031 return "[no raft status]" 1032 } else if len(raftStatus.Progress) == 0 { 1033 return "[no raft progress]" 1034 } 1035 var buf bytes.Buffer 1036 buf.WriteString("[") 1037 for i, r := range replicas { 1038 if i > 0 { 1039 buf.WriteString(", ") 1040 } 1041 fmt.Fprintf(&buf, "%d", r.ReplicaID) 1042 if uint64(r.ReplicaID) == raftStatus.Lead { 1043 buf.WriteString("*") 1044 } 1045 if progress, ok := raftStatus.Progress[uint64(r.ReplicaID)]; ok { 1046 fmt.Fprintf(&buf, ":%d", progress.Match) 1047 } else { 1048 buf.WriteString(":?") 1049 } 1050 } 1051 buf.WriteString("]") 1052 return buf.String() 1053 }