github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_command.go

// Copyright 2014 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"bytes"
	"context"
	"fmt"
	"math/rand"
	"sort"
	"strings"
	"time"

	"github.com/cockroachdb/cockroach/pkg/base"
	"github.com/cockroachdb/cockroach/pkg/clusterversion"
	"github.com/cockroachdb/cockroach/pkg/keys"
	"github.com/cockroachdb/cockroach/pkg/kv"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/rpc"
	"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
	"github.com/cockroachdb/cockroach/pkg/storage"
	"github.com/cockroachdb/cockroach/pkg/util/contextutil"
	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
	"github.com/cockroachdb/cockroach/pkg/util/retry"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/cockroach/pkg/util/uuid"
	"github.com/cockroachdb/errors"
	"github.com/cockroachdb/logtags"
	"go.etcd.io/etcd/raft"
	"go.etcd.io/etcd/raft/raftpb"
	"go.etcd.io/etcd/raft/tracker"
)

// AdminSplit divides the range into two ranges using args.SplitKey.
func (r *Replica) AdminSplit(
	ctx context.Context, args roachpb.AdminSplitRequest, reason string,
) (reply roachpb.AdminSplitResponse, _ *roachpb.Error) {
	if len(args.SplitKey) == 0 {
		return roachpb.AdminSplitResponse{}, roachpb.NewErrorf("cannot split range with no key provided")
	}

	err := r.executeAdminCommandWithDescriptor(ctx, func(desc *roachpb.RangeDescriptor) error {
		var err error
		reply, err = r.adminSplitWithDescriptor(ctx, args, desc, true /* delayable */, reason)
		return err
	})
	return reply, err
}
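
// The admin commands in this file all share an optimistic-concurrency shape:
// read the current descriptor value inside the txn, then use it as the
// expected value of a CPut when writing the updated descriptor. A minimal
// sketch of that flow, recapping helpers defined elsewhere in this package
// (illustrative only, not additional logic):
//
//	_, dbDescValue, err := conditionalGetDescValueFromDB(
//		ctx, txn, desc.StartKey, checkDescsEqual(desc))
//	if err != nil {
//		return err // descriptor changed since the decision was made
//	}
//	b := txn.NewBatch()
//	// The CPut fails with ConditionFailedError if another txn won the race.
//	if err := updateRangeDescriptor(
//		b, keys.RangeDescriptorKey(desc.StartKey), dbDescValue, newDesc,
//	); err != nil {
//		return err
//	}
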
func maybeDescriptorChangedError(
	desc *roachpb.RangeDescriptor, err error,
) (ok bool, expectedDesc *roachpb.RangeDescriptor) {
	if detail := (*roachpb.ConditionFailedError)(nil); errors.As(err, &detail) {
		// Provide a better message in the common case that the range being changed
		// was already changed by a concurrent transaction.
		var actualDesc roachpb.RangeDescriptor
		if !detail.ActualValue.IsPresent() {
			return true, nil
		} else if err := detail.ActualValue.GetProto(&actualDesc); err == nil &&
			desc.RangeID == actualDesc.RangeID && !desc.Equal(actualDesc) {
			return true, &actualDesc
		}
	}
	return false, nil
}

const (
	descChangedRangeSubsumedErrorFmt = "descriptor changed: expected %s != [actual] nil (range subsumed)"
	descChangedErrorFmt              = "descriptor changed: [expected] %s != [actual] %s"
)

func newDescChangedError(desc, actualDesc *roachpb.RangeDescriptor) error {
	if actualDesc == nil {
		return errors.Newf(descChangedRangeSubsumedErrorFmt, desc)
	}
	return errors.Newf(descChangedErrorFmt, desc, actualDesc)
}

func wrapDescChangedError(err error, desc, actualDesc *roachpb.RangeDescriptor) error {
	if actualDesc == nil {
		return errors.Wrapf(err, descChangedRangeSubsumedErrorFmt, desc)
	}
	return errors.Wrapf(err, descChangedErrorFmt, desc, actualDesc)
}

func splitSnapshotWarningStr(rangeID roachpb.RangeID, status *raft.Status) string {
	var s string
	if status != nil && status.RaftState == raft.StateLeader {
		for replicaID, pr := range status.Progress {
			if replicaID == status.Lead {
				// TODO(tschottdorf): remove this line once we have picked up
				// https://github.com/etcd-io/etcd/pull/10279
				continue
			}
			if pr.State == tracker.StateReplicate {
				// This follower is in good working order.
				continue
			}
			s += fmt.Sprintf("; r%d/%d is ", rangeID, replicaID)
			switch pr.State {
			case tracker.StateSnapshot:
				// If the Raft snapshot queue is backed up, replicas can spend
				// minutes or worse until they are caught up.
				s += "waiting for a Raft snapshot"
			case tracker.StateProbe:
				// Assuming the split has already been delayed for a little bit,
				// seeing a follower that is probing hints at some problem with
				// Raft or Raft message delivery. (Of course it's possible that
				// the follower *just* entered probing state).
				s += "being probed (may or may not need a Raft snapshot)"
			default:
				// Future proofing.
				s += "in unknown state " + pr.State.String()
			}
		}
	}
	return s
}
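
// To make prepareSplitDescs below concrete, consider a hypothetical split of
// a range spanning [a, z) at split key m (RangeIDs, generations, and keys are
// all illustrative):
//
//	before: r10 = [a, z), generation 4
//	after:  left  = r10 [a, m), generation 5
//	        right = r11 [m, z), generation 5 (fresh RangeID, same replica set)
//
// Both sides carry the same, incremented generation; see the comment on the
// Generation field in roachpb for the rationale.
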
// prepareSplitDescs returns the left and right descriptor of the split whose
// right side is assigned rightRangeID and starts at splitKey. The supplied
// expiration is the "sticky bit" stored on the right descriptor.
func prepareSplitDescs(
	ctx context.Context,
	st *cluster.Settings,
	rightRangeID roachpb.RangeID,
	splitKey roachpb.RKey,
	expiration hlc.Timestamp,
	leftDesc *roachpb.RangeDescriptor,
) (*roachpb.RangeDescriptor, *roachpb.RangeDescriptor) {
	// Create right hand side range descriptor.
	rightDesc := roachpb.NewRangeDescriptor(rightRangeID, splitKey, leftDesc.EndKey, leftDesc.Replicas())

	// Init updated version of existing range descriptor.
	{
		tmp := *leftDesc
		leftDesc = &tmp
	}

	leftDesc.IncrementGeneration()
	leftDesc.EndKey = splitKey

	// Set the generation of the right hand side descriptor to match that of the
	// (updated) left hand side. See the comment on the field for an explanation
	// of why generations are useful.
	rightDesc.Generation = leftDesc.Generation

	setStickyBit(rightDesc, expiration)
	return leftDesc, rightDesc
}

func setStickyBit(desc *roachpb.RangeDescriptor, expiration hlc.Timestamp) {
	// TODO(jeffreyxiao): Remove this check in 20.1.
	// Note that the client API for splitting has expiration time as
	// non-nullable, but the internal representation of a sticky bit is nullable
	// for backwards compatibility. If expiration time is the zero timestamp, we
	// must be sure not to set the sticky bit to the zero timestamp because the
	// byte representation of setting the stickyBit to nil is different than
	// setting it to hlc.Timestamp{}. This check ensures that CPuts would not
	// fail on older versions.
	if (expiration != hlc.Timestamp{}) {
		desc.StickyBit = &expiration
	}
}

func splitTxnAttempt(
	ctx context.Context,
	store *Store,
	txn *kv.Txn,
	rightRangeID roachpb.RangeID,
	splitKey roachpb.RKey,
	expiration hlc.Timestamp,
	oldDesc *roachpb.RangeDescriptor,
) error {
	txn.SetDebugName(splitTxnName)

	_, dbDescValue, err := conditionalGetDescValueFromDB(ctx, txn, oldDesc.StartKey, checkDescsEqual(oldDesc))
	if err != nil {
		return err
	}
	// TODO(tbg): return desc from conditionalGetDescValueFromDB and don't pass
	// in oldDesc any more (just the start key).
	desc := oldDesc
	oldDesc = nil // prevent accidental use

	leftDesc, rightDesc := prepareSplitDescs(
		ctx, store.ClusterSettings(), rightRangeID, splitKey, expiration, desc)

	// Update existing range descriptor for left hand side of
	// split. Note that we mutate the descriptor for the left hand
	// side of the split first to locate the txn record there.
	{
		b := txn.NewBatch()
		leftDescKey := keys.RangeDescriptorKey(leftDesc.StartKey)
		if err := updateRangeDescriptor(b, leftDescKey, dbDescValue, leftDesc); err != nil {
			return err
		}
		// Commit this batch first to ensure that the transaction record
		// is created in the right place (split trigger relies on this).
		// Sending the batch containing only the first write guarantees
		// the transaction record is written first, preventing cases
		// where splits are aborted early due to conflicts with meta
		// intents (see #9265).
		log.Event(ctx, "updating LHS descriptor")
		if err := txn.Run(ctx, b); err != nil {
			return err
		}
	}

	// Log the split into the range event log.
	if err := store.logSplit(ctx, txn, *leftDesc, *rightDesc); err != nil {
		return err
	}

	b := txn.NewBatch()

	// Write range descriptor for right hand side of the split.
	rightDescKey := keys.RangeDescriptorKey(rightDesc.StartKey)
	if err := updateRangeDescriptor(b, rightDescKey, nil, rightDesc); err != nil {
		return err
	}

	// Update range descriptor addressing record(s).
	if err := splitRangeAddressing(b, rightDesc, leftDesc); err != nil {
		return err
	}

	// End the transaction manually, instead of letting RunTransaction
	// loop do it, in order to provide a split trigger.
	b.AddRawRequest(&roachpb.EndTxnRequest{
		Commit: true,
		InternalCommitTrigger: &roachpb.InternalCommitTrigger{
			SplitTrigger: &roachpb.SplitTrigger{
				LeftDesc:  *leftDesc,
				RightDesc: *rightDesc,
			},
		},
	})

	// Commit txn with final batch (RHS descriptor and meta).
	log.Event(ctx, "commit txn with batch containing RHS descriptor and meta records")
	return txn.Run(ctx, b)
}
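
// In outline, the split transaction above issues its writes in a deliberate
// order (illustrative recap, not additional logic):
//
//	b1: CPut(LHS descriptor)     // run alone first: anchors the txn record
//	                             // on the LHS range
//	b2: Put(RHS descriptor)      // plus meta addressing updates
//	    EndTxn{SplitTrigger}     // commit applies the split atomically
//
// If the LHS CPut fails, no split trigger is ever staged and the txn aborts
// cleanly with a ConditionFailedError for executeAdminCommandWithDescriptor
// to retry.
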
func splitTxnStickyUpdateAttempt(
	ctx context.Context, txn *kv.Txn, desc *roachpb.RangeDescriptor, expiration hlc.Timestamp,
) error {
	_, dbDescValue, err := conditionalGetDescValueFromDB(ctx, txn, desc.StartKey, checkDescsEqual(desc))
	if err != nil {
		return err
	}
	newDesc := *desc
	setStickyBit(&newDesc, expiration)

	b := txn.NewBatch()
	descKey := keys.RangeDescriptorKey(desc.StartKey)
	if err := updateRangeDescriptor(b, descKey, dbDescValue, &newDesc); err != nil {
		return err
	}
	if err := updateRangeAddressing(b, &newDesc); err != nil {
		return err
	}
	// End the transaction manually, instead of letting RunTransaction loop
	// do it, in order to provide a sticky bit trigger.
	b.AddRawRequest(&roachpb.EndTxnRequest{
		Commit: true,
		InternalCommitTrigger: &roachpb.InternalCommitTrigger{
			StickyBitTrigger: &roachpb.StickyBitTrigger{
				StickyBit: newDesc.GetStickyBit(),
			},
		},
	})
	return txn.Run(ctx, b)
}

// adminSplitWithDescriptor divides the range into two ranges, using
// either args.SplitKey (if provided) or an internally computed key that aims
// to roughly equipartition the range by size. The split is done inside of a
// distributed txn which writes updated left and new right hand side range
// descriptors, and updates the range addressing metadata. The handover of
// responsibility for the reassigned key range is carried out seamlessly
// through a split trigger carried out as part of the commit of that
// transaction.
//
// The supplied RangeDescriptor is used as a form of optimistic lock. An
// operation which might split a range should obtain a copy of the range's
// current descriptor before making the decision to split. If the decision is
// affirmative the descriptor is passed to AdminSplit, which performs a
// Conditional Put on the RangeDescriptor to ensure that no other operation
// has modified the range while the decision was being made.
// TODO(tschottdorf): should assert that split key is not a local key.
//
// See the comment on splitTrigger for details on the complexities.
func (r *Replica) adminSplitWithDescriptor(
	ctx context.Context,
	args roachpb.AdminSplitRequest,
	desc *roachpb.RangeDescriptor,
	delayable bool,
	reason string,
) (roachpb.AdminSplitResponse, error) {
	var err error
	// The split queue doesn't care about the set of replicas, so if we somehow
	// are being handed one that's in a joint state, finalize that before
	// continuing.
	desc, err = maybeLeaveAtomicChangeReplicas(ctx, r.store, desc)
	if err != nil {
		return roachpb.AdminSplitResponse{}, err
	}

	var reply roachpb.AdminSplitResponse

	// Determine split key if not provided with args. This scan is
	// allowed to be relatively slow because admin commands don't block
	// other commands.
	log.Event(ctx, "split begins")
	var splitKey roachpb.RKey
	{
		var foundSplitKey roachpb.Key
		if len(args.SplitKey) == 0 {
			// Find a key to split by size.
			var err error
			targetSize := r.GetMaxBytes() / 2
			foundSplitKey, err = storage.MVCCFindSplitKey(
				ctx, r.store.engine, desc.StartKey, desc.EndKey, targetSize)
			if err != nil {
				return reply, errors.Errorf("unable to determine split key: %s", err)
			}
			if foundSplitKey == nil {
				// No suitable split key could be found.
				return reply, unsplittableRangeError{}
			}
		} else {
			// If the key that routed this request to this range is now out of this
			// range's bounds, return an error for the client to try again on the
			// correct range.
			if !kvserverbase.ContainsKey(desc, args.Key) {
				return reply, roachpb.NewRangeKeyMismatchError(args.Key, args.Key, desc)
			}
			foundSplitKey = args.SplitKey
		}

		if !kvserverbase.ContainsKey(desc, foundSplitKey) {
			return reply, errors.Errorf("requested split key %s out of bounds of %s", args.SplitKey, r)
		}

		var err error
		splitKey, err = keys.Addr(foundSplitKey)
		if err != nil {
			return reply, err
		}
		if !splitKey.Equal(foundSplitKey) {
			return reply, errors.Errorf("cannot split range at range-local key %s", splitKey)
		}
		if !storage.IsValidSplitKey(foundSplitKey) {
			return reply, errors.Errorf("cannot split range at key %s", splitKey)
		}
	}

	// If the range starts at the splitKey, we treat the AdminSplit
	// as a no-op and return success instead of throwing an error.
	if desc.StartKey.Equal(splitKey) {
		if len(args.SplitKey) == 0 {
			log.Fatal(ctx, "MVCCFindSplitKey returned start key of range")
		}
		log.Event(ctx, "range already split")
		// Even if the range is already split, we should still update the sticky
		// bit if it has a later expiration time.
		if desc.GetStickyBit().Less(args.ExpirationTime) {
			err := r.store.DB().Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
				return splitTxnStickyUpdateAttempt(ctx, txn, desc, args.ExpirationTime)
			})
			// The ConditionFailedError can occur because the descriptors acting as
			// expected values in the CPuts used to update the range descriptor are
			// picked outside the transaction. Return ConditionFailedError in the
			// error detail so that the command can be retried.
			if ok, actualDesc := maybeDescriptorChangedError(desc, err); ok {
				// NB: we have to wrap the existing error here as consumers of this code
				// look at the root cause to sniff out the changed descriptor.
				err = &benignError{wrapDescChangedError(err, desc, actualDesc)}
			}
			return reply, err
		}
		return reply, nil
	}
	log.Event(ctx, "found split key")

	// Create right hand side range descriptor.
	rightRangeID, err := r.store.AllocateRangeID(ctx)
	if err != nil {
		return reply, errors.Wrap(err, "unable to allocate range id for right hand side")
	}

	var extra string
	if delayable {
		extra += maybeDelaySplitToAvoidSnapshot(ctx, (*splitDelayHelper)(r))
	}
	extra += splitSnapshotWarningStr(r.RangeID, r.RaftStatus())

	log.Infof(ctx, "initiating a split of this range at key %s [r%d] (%s)%s",
		splitKey.StringWithDirs(nil /* valDirs */, 50 /* maxLen */), rightRangeID, reason, extra)

	if err := r.store.DB().Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		return splitTxnAttempt(ctx, r.store, txn, rightRangeID, splitKey, args.ExpirationTime, desc)
	}); err != nil {
		// The ConditionFailedError can occur because the descriptors acting
		// as expected values in the CPuts used to update the left or right
		// range descriptors are picked outside the transaction. Return
		// ConditionFailedError in the error detail so that the command can be
		// retried.
		if ok, actualDesc := maybeDescriptorChangedError(desc, err); ok {
			// NB: we have to wrap the existing error here as consumers of this code
			// look at the root cause to sniff out the changed descriptor.
			err = &benignError{wrapDescChangedError(err, desc, actualDesc)}
		}
		return reply, errors.Wrapf(err, "split at key %s failed", splitKey)
	}
	return reply, nil
}

// AdminUnsplit removes the sticky bit of the range specified by the
// args.Key.
func (r *Replica) AdminUnsplit(
	ctx context.Context, args roachpb.AdminUnsplitRequest, reason string,
) (roachpb.AdminUnsplitResponse, *roachpb.Error) {
	var reply roachpb.AdminUnsplitResponse
	err := r.executeAdminCommandWithDescriptor(ctx, func(desc *roachpb.RangeDescriptor) error {
		var err error
		reply, err = r.adminUnsplitWithDescriptor(ctx, args, desc, reason)
		return err
	})
	return reply, err
}

func (r *Replica) adminUnsplitWithDescriptor(
	ctx context.Context,
	args roachpb.AdminUnsplitRequest,
	desc *roachpb.RangeDescriptor,
	reason string,
) (roachpb.AdminUnsplitResponse, error) {
	var reply roachpb.AdminUnsplitResponse
	if !bytes.Equal(desc.StartKey.AsRawKey(), args.Header().Key) {
		return reply, errors.Errorf("key %s is not the start of a range", args.Header().Key)
	}

	// If the range's sticky bit is already hlc.Timestamp{}, we treat the unsplit
	// command as a no-op and return success instead of throwing an error. On
	// mixed version clusters that don't support StickyBit, all range descriptor
	// sticky bits are guaranteed to be nil, so we can skip checking the cluster
	// version.
	if (desc.GetStickyBit() == hlc.Timestamp{}) {
		return reply, nil
	}

	if err := r.store.DB().Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
		_, dbDescValue, err := conditionalGetDescValueFromDB(ctx, txn, desc.StartKey, checkDescsEqual(desc))
		if err != nil {
			return err
		}

		newDesc := *desc
		// Use nil instead of &zero until 20.1; this field is new in 19.2. We
		// could use &zero here because the sticky bit will never be populated
		// before the cluster version reaches 19.2 and the early return above
		// already handles that case, but nothing is gained in doing so.
		newDesc.StickyBit = nil
		descKey := keys.RangeDescriptorKey(newDesc.StartKey)

		b := txn.NewBatch()
		if err := updateRangeDescriptor(b, descKey, dbDescValue, &newDesc); err != nil {
			return err
		}
		if err := updateRangeAddressing(b, &newDesc); err != nil {
			return err
		}
		// End the transaction manually in order to provide a sticky bit trigger.
		b.AddRawRequest(&roachpb.EndTxnRequest{
			Commit: true,
			InternalCommitTrigger: &roachpb.InternalCommitTrigger{
				StickyBitTrigger: &roachpb.StickyBitTrigger{
					// Setting StickyBit to the zero timestamp ensures that it is always
					// eligible for automatic merging.
					StickyBit: hlc.Timestamp{},
				},
			},
		})
		return txn.Run(ctx, b)
	}); err != nil {
		// The ConditionFailedError can occur because the descriptors acting as
		// expected values in the CPuts used to update the range descriptor are
		// picked outside the transaction. Return ConditionFailedError in the error
		// detail so that the command can be retried.
		if ok, actualDesc := maybeDescriptorChangedError(desc, err); ok {
			// NB: we have to wrap the existing error here as consumers of this code
			// look at the root cause to sniff out the changed descriptor.
			err = &benignError{wrapDescChangedError(err, desc, actualDesc)}
		}
		return reply, err
	}
	return reply, nil
}

// executeAdminCommandWithDescriptor wraps a read-modify-write operation for
// RangeDescriptors in a retry loop.
func (r *Replica) executeAdminCommandWithDescriptor(
	ctx context.Context, updateDesc func(*roachpb.RangeDescriptor) error,
) *roachpb.Error {
	// Retry forever as long as we see errors we know will resolve.
	retryOpts := base.DefaultRetryOptions()
	// Randomize quite a lot just in case someone else also interferes with us
	// in a retry loop. Note that this is speculative; there wasn't an incident
	// that suggested this.
	retryOpts.RandomizationFactor = 0.5
	lastErr := ctx.Err()
	for retryable := retry.StartWithCtx(ctx, retryOpts); retryable.Next(); {
		// The replica may have been destroyed since the start of the retry loop.
		// We need to explicitly check this condition. Having a valid lease, as we
		// verify below, does not imply that the range still exists: even after a
		// range has been merged into its left-hand neighbor, its final lease
		// (i.e., the lease we have in r.mu.state.Lease) can remain valid
		// indefinitely.
		if _, err := r.IsDestroyed(); err != nil {
			return roachpb.NewError(err)
		}

		// Admin commands always require the range lease to begin (see
		// executeAdminBatch), but we may have lost it while in this retry loop.
		// Without the lease, a replica's local descriptor can be arbitrarily
		// stale, which will result in a ConditionFailedError. To avoid this, we
		// make sure that we still have the lease before each attempt.
		if _, pErr := r.redirectOnOrAcquireLease(ctx); pErr != nil {
			return pErr
		}

		lastErr = updateDesc(r.Desc())
		// On seeing a ConditionFailedError or an AmbiguousResultError, retry the
		// command with the updated descriptor.
		if !errors.HasType(lastErr, (*roachpb.ConditionFailedError)(nil)) &&
			!errors.HasType(lastErr, (*roachpb.AmbiguousResultError)(nil)) {
			break
		}
	}
	return roachpb.NewError(lastErr)
}
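
// A typical caller threads its reply out of the closure, as AdminSplit and
// AdminUnsplit above do (illustrative recap):
//
//	var reply roachpb.AdminSplitResponse
//	pErr := r.executeAdminCommandWithDescriptor(ctx, func(desc *roachpb.RangeDescriptor) error {
//		var err error
//		reply, err = r.adminSplitWithDescriptor(ctx, args, desc, true /* delayable */, reason)
//		return err
//	})
//
// Each attempt re-reads the descriptor via r.Desc(), so a retry after a
// ConditionFailedError observes whatever concurrent change caused the CPut
// to fail.
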
// AdminMerge extends this range to subsume the range that comes next
// in the key space. The merge is performed inside of a distributed
// transaction which writes the left hand side range descriptor (the
// subsuming range) and deletes the range descriptor for the right
// hand side range (the subsumed range). It also updates the range
// addressing metadata. The handover of responsibility for the
// reassigned key range is carried out seamlessly through a merge
// trigger carried out as part of the commit of that transaction. A
// merge requires that the two ranges are collocated on the same set
// of replicas.
//
// The supplied RangeDescriptor is used as a form of optimistic lock. See the
// comment on AdminSplit for more information on this pattern.
func (r *Replica) AdminMerge(
	ctx context.Context, args roachpb.AdminMergeRequest, reason string,
) (roachpb.AdminMergeResponse, *roachpb.Error) {
	var reply roachpb.AdminMergeResponse

	runMergeTxn := func(txn *kv.Txn) error {
		log.Event(ctx, "merge txn begins")
		txn.SetDebugName(mergeTxnName)

		// Observe the commit timestamp to force a client-side retry. See the
		// comment on the retry loop after this closure for details.
		//
		// TODO(benesch): expose a proper API for preventing the fast path.
		_ = txn.CommitTimestamp()

		// Pipelining might send QueryIntent requests to the RHS after the RHS has
		// noticed the merge and started blocking all traffic. This causes the merge
		// transaction to deadlock. Just turn pipelining off; the structure of the
		// merge transaction means pipelining provides no performance benefit
		// anyway.
		if err := txn.DisablePipelining(); err != nil {
			return err
		}

		// NB: reads do NOT impact transaction record placement.

		origLeftDesc := r.Desc()
		if origLeftDesc.EndKey.Equal(roachpb.RKeyMax) {
			// Merging the final range doesn't make sense.
			return errors.New("cannot merge final range")
		}

		_, dbOrigLeftDescValue, err := conditionalGetDescValueFromDB(ctx, txn, origLeftDesc.StartKey, checkDescsEqual(origLeftDesc))
		if err != nil {
			return err
		}

		// Ensure that every current replica of the LHS has been initialized.
		// Otherwise there is a rare race where the replica GC queue can GC a
		// replica of the RHS too early. The comment on
		// TestStoreRangeMergeUninitializedLHSFollower explains the situation in full.
		if err := waitForReplicasInit(
			ctx, r.store.cfg.NodeDialer, origLeftDesc.RangeID, origLeftDesc.Replicas().All(),
		); err != nil {
			return errors.Wrap(err, "waiting for all left-hand replicas to initialize")
		}

		// Do a consistent read of the right hand side's range descriptor.
		var rightDesc roachpb.RangeDescriptor
		rightDescKey := keys.RangeDescriptorKey(origLeftDesc.EndKey)
		dbRightDescKV, err := txn.Get(ctx, rightDescKey)
		if err != nil {
			return err
		}
		if err := dbRightDescKV.ValueProto(&rightDesc); err != nil {
			return err
		}

		// Verify that the two ranges are mergeable.
		if !bytes.Equal(origLeftDesc.EndKey, rightDesc.StartKey) {
			// Should never happen, but just in case.
			return errors.Errorf("ranges are not adjacent; %s != %s", origLeftDesc.EndKey, rightDesc.StartKey)
		}
		// For simplicity, don't handle learner replicas or joint states; expect
		// the caller to resolve them first. (Defensively, we check that there
		// are no non-voter replicas, in case some third type is later added.)
		// This behavior can be changed later if the complexity becomes worth
		// it, but it's not right now.
		//
		// NB: the merge queue transitions out of any joint states and removes
		// any learners it sees. It's sort of silly that we don't do that here
		// instead; effectively any caller of AdminMerge that is not the merge
		// queue won't be able to recover from these cases (though the replicate
		// queues should fix things up quickly).
		lReplicas, rReplicas := origLeftDesc.Replicas(), rightDesc.Replicas()

		predFullVoter := func(rDesc roachpb.ReplicaDescriptor) bool {
			return rDesc.GetType() == roachpb.VOTER_FULL
		}
		if len(lReplicas.Filter(predFullVoter)) != len(lReplicas.All()) {
			return errors.Errorf("cannot merge range with non-voter replicas on lhs: %s", lReplicas)
		}
		if len(rReplicas.Filter(predFullVoter)) != len(rReplicas.All()) {
			return errors.Errorf("cannot merge range with non-voter replicas on rhs: %s", rReplicas)
		}
		if !replicaSetsEqual(lReplicas.All(), rReplicas.All()) {
			return errors.Errorf("ranges not collocated; %s != %s", lReplicas, rReplicas)
		}
		mergeReplicas := lReplicas.All()

		updatedLeftDesc := *origLeftDesc
		// lhs.Generation = max(rhs.Generation, lhs.Generation)+1.
		// See the comment on the Generation field for why generations are useful.
		if updatedLeftDesc.Generation < rightDesc.Generation {
			updatedLeftDesc.Generation = rightDesc.Generation
		}
		updatedLeftDesc.IncrementGeneration()
		updatedLeftDesc.EndKey = rightDesc.EndKey
		log.Infof(ctx, "initiating a merge of %s into this range (%s)", &rightDesc, reason)

		// Update the range descriptor for the receiving range. It is important
		// (for transaction record placement) that the first write inside the
		// transaction is this conditional put to change the left hand side's
		// descriptor end key.
		{
			b := txn.NewBatch()
			leftDescKey := keys.RangeDescriptorKey(updatedLeftDesc.StartKey)
			if err := updateRangeDescriptor(
				b, leftDescKey, dbOrigLeftDescValue, &updatedLeftDesc,
			); err != nil {
				return err
			}
			// Commit this batch on its own to ensure that the transaction record
			// is created in the right place (our triggers rely on this).
			log.Event(ctx, "updating LHS descriptor")
			if err := txn.Run(ctx, b); err != nil {
				return err
			}
		}

		// Log the merge into the range event log.
		// TODO(spencer): event logging API should accept a batch
		// instead of a transaction; there's no reason this logging
		// shouldn't be done in parallel via the batch with the updated
		// range addressing.
		if err := r.store.logMerge(ctx, txn, updatedLeftDesc, rightDesc); err != nil {
			return err
		}

		b := txn.NewBatch()

		// Update the meta addressing records.
		if err := mergeRangeAddressing(b, origLeftDesc, &updatedLeftDesc); err != nil {
			return err
		}

		// Remove the range descriptor for the deleted range.
		if err := updateRangeDescriptor(b, rightDescKey, dbRightDescKV.Value, nil); err != nil {
			return err
		}

		// Send off this batch, ensuring that intents are placed on both the local
		// copy and meta2's copy of the right-hand side range descriptor before we
		// send the Subsume request below. This is the precondition for sending a
		// Subsume request; see the godoc on batcheval.Subsume for details.
		if err := txn.Run(ctx, b); err != nil {
			return err
		}

		// Intents have been placed, so the merge is now in its critical phase. Get
		// a consistent view of the data from the right-hand range. If the merge
		// commits, we'll write this data to the left-hand range in the merge
		// trigger.
		br, pErr := kv.SendWrapped(ctx, r.store.DB().NonTransactionalSender(),
			&roachpb.SubsumeRequest{
				RequestHeader: roachpb.RequestHeader{Key: rightDesc.StartKey.AsRawKey()},
				LeftDesc:      *origLeftDesc,
				RightDesc:     rightDesc,
			})
		if pErr != nil {
			return pErr.GoError()
		}
		rhsSnapshotRes := br.(*roachpb.SubsumeResponse)

		err = waitForApplication(
			ctx, r.store.cfg.NodeDialer, rightDesc.RangeID, mergeReplicas,
			rhsSnapshotRes.LeaseAppliedIndex)
		if err != nil {
			return errors.Wrap(err, "waiting for all right-hand replicas to catch up")
		}

		// Successful subsume, so we're guaranteed that the right-hand range will
		// not serve another request unless this transaction aborts. End the
		// transaction manually in order to provide a merge trigger.
		b = txn.NewBatch()
		b.AddRawRequest(&roachpb.EndTxnRequest{
			Commit: true,
			InternalCommitTrigger: &roachpb.InternalCommitTrigger{
				MergeTrigger: &roachpb.MergeTrigger{
					LeftDesc:       updatedLeftDesc,
					RightDesc:      rightDesc,
					RightMVCCStats: rhsSnapshotRes.MVCCStats,
					FreezeStart:    rhsSnapshotRes.FreezeStart,
				},
			},
		})
		log.Event(ctx, "attempting commit")
		return txn.Run(ctx, b)
	}

	// If the merge transaction encounters an error, we need to trigger a full
	// abort and try again with a new transaction. Why? runMergeTxn has the side
	// effect of sending a Subsume request to the right-hand range, which blocks
	// the right-hand range from serving any traffic until the transaction commits
	// or aborts. If we retry using the same transaction (i.e., a "transaction
	// restart"), we'll send requests to the blocked right-hand range and
	// deadlock. The right-hand range will see that the transaction is still
	// pending and refuse to respond, but the transaction cannot commit until the
	// right-hand range responds. By instead marking the transaction as aborted,
	// we'll unlock the right-hand range, giving the next, fresh transaction a
	// chance to succeed.
	//
	// Note that client.DB.Txn performs retries using the same transaction, so we
	// have to use our own retry loop.
	for {
		txn := kv.NewTxn(ctx, r.store.DB(), r.NodeID())
		err := runMergeTxn(txn)
		if err != nil {
			txn.CleanupOnError(ctx, err)
		}
		if !errors.HasType(err, (*roachpb.TransactionRetryWithProtoRefreshError)(nil)) {
			if err != nil {
				return reply, roachpb.NewErrorf("merge failed: %s", err)
			}
			return reply, nil
		}
	}
}

func waitForApplication(
	ctx context.Context,
	dialer *nodedialer.Dialer,
	rangeID roachpb.RangeID,
	replicas []roachpb.ReplicaDescriptor,
	leaseIndex uint64,
) error {
	return contextutil.RunWithTimeout(ctx, "wait for application", 5*time.Second, func(ctx context.Context) error {
		g := ctxgroup.WithContext(ctx)
		for _, repl := range replicas {
			repl := repl // copy for goroutine
			g.GoCtx(func(ctx context.Context) error {
				conn, err := dialer.Dial(ctx, repl.NodeID, rpc.DefaultClass)
				if err != nil {
					return errors.Wrapf(err, "could not dial n%d", repl.NodeID)
				}
				_, err = NewPerReplicaClient(conn).WaitForApplication(ctx, &WaitForApplicationRequest{
					StoreRequestHeader: StoreRequestHeader{NodeID: repl.NodeID, StoreID: repl.StoreID},
					RangeID:            rangeID,
					LeaseIndex:         leaseIndex,
				})
				return err
			})
		}
		return g.Wait()
	})
}

// waitForReplicasInit blocks until it has proof that the replicas listed in
// desc are initialized on their respective stores. It may return a false
// negative, i.e., claim that a replica is uninitialized when it is, in fact,
// initialized, but it will never return a false positive.
func waitForReplicasInit(
	ctx context.Context,
	dialer *nodedialer.Dialer,
	rangeID roachpb.RangeID,
	replicas []roachpb.ReplicaDescriptor,
) error {
	return contextutil.RunWithTimeout(ctx, "wait for replicas init", 5*time.Second, func(ctx context.Context) error {
		g := ctxgroup.WithContext(ctx)
		for _, repl := range replicas {
			repl := repl // copy for goroutine
			g.GoCtx(func(ctx context.Context) error {
				conn, err := dialer.Dial(ctx, repl.NodeID, rpc.DefaultClass)
				if err != nil {
					return errors.Wrapf(err, "could not dial n%d", repl.NodeID)
				}
				_, err = NewPerReplicaClient(conn).WaitForReplicaInit(ctx, &WaitForReplicaInitRequest{
					StoreRequestHeader: StoreRequestHeader{NodeID: repl.NodeID, StoreID: repl.StoreID},
					RangeID:            rangeID,
				})
				return err
			})
		}
		return g.Wait()
	})
}

type snapshotError struct {
	// NB: don't implement Cause() on this type without also updating IsSnapshotError.
	cause error
}

func (s *snapshotError) Error() string {
	return fmt.Sprintf("snapshot failed: %s", s.cause.Error())
}

// IsSnapshotError returns true iff the error indicates a snapshot failed.
func IsSnapshotError(err error) bool {
	return errors.HasType(err, (*snapshotError)(nil))
}
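
// waitForApplication and waitForReplicasInit share a fan-out shape: bound the
// whole operation with a timeout, then issue one RPC per replica in parallel
// and fail fast on the first error. A minimal sketch of the pattern (doRPC is
// a hypothetical stand-in for the per-replica call; the rest matches the
// helpers above):
//
//	return contextutil.RunWithTimeout(ctx, "op", 5*time.Second, func(ctx context.Context) error {
//		g := ctxgroup.WithContext(ctx)
//		for _, repl := range replicas {
//			repl := repl // capture a per-iteration copy for the goroutine
//			g.GoCtx(func(ctx context.Context) error {
//				conn, err := dialer.Dial(ctx, repl.NodeID, rpc.DefaultClass)
//				if err != nil {
//					return err
//				}
//				return doRPC(ctx, conn, repl)
//			})
//		}
//		return g.Wait() // first error cancels the rest via the group ctx
//	})
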
// ChangeReplicas atomically changes the replicas that are members of a range.
// The change is performed in a distributed transaction and takes effect when
// that transaction is committed. This transaction confirms that the supplied
// RangeDescriptor is up to date and that the supplied slice of
// ReplicationChanges is a valid transition, meaning that replicas being added
// are not present, that replicas being removed are present, that no replica is
// altered more than once, and that no attempt is made at removing the
// leaseholder (which in particular implies that we can never remove all
// replicas).
//
// The returned RangeDescriptor is the new value of the range's descriptor
// following the successful commit of the transaction.
//
// In general, ChangeReplicas will carry out the following steps.
//
// 1. Run a distributed transaction that adds all new replicas as learner replicas.
// Learner replicas receive the log, but do not have voting rights. They are
// used to catch up these new replicas before turning them into voters, which
// is important for the continued availability of the range throughout the
// replication change. Learners are added (and removed) one by one due to a
// technicality (see https://github.com/cockroachdb/cockroach/pull/40268).
//
// The distributed transaction updates both copies of the range descriptor
// (the one on the range and that in the meta ranges) to that effect, and
// commits with a special trigger instructing Raft (via ProposeConfChange) to
// tie a corresponding replication configuration change which goes into
// effect (on each replica) when the transaction commit is applied to the
// state. Applying the command also updates each replica's local view of
// the state to reflect the new descriptor.
//
// If no replicas are being added, this first step is elided.
//
// 2. Send Raft snapshots to all learner replicas. This would happen
// automatically by the existing recovery mechanisms (raft snapshot queue), but
// it is done explicitly as a convenient way to ensure learners are caught up
// before the next step is entered. (We ensure that work is not duplicated
// between the snapshot queue and the explicit snapshot via the
// snapshotLogTruncationConstraints map). Snapshots are subject to both
// bandwidth rate limiting and throttling.
//
// If no replicas are being added, this step is similarly elided.
//
// 3. Carry out a distributed transaction similar to that which added the
// learner replicas, except this time it (atomically) changes all learners to
// voters and removes any replicas for which this was requested; voters are
// demoted before actually being removed to avoid a bug in etcd/raft:
// see https://github.com/cockroachdb/cockroach/pull/40268.
//
// If only one replica is being added, raft can choose the simple
// configuration change protocol; otherwise it has to use joint consensus. In
// this latter mechanism, a first configuration change is made which results
// in a configuration ("joint configuration") in which a quorum of both the
// old replicas and the new replica sets is required for decision making.
// Transitioning into this joint configuration, the RangeDescriptor (which is
// the source of truth of the replication configuration) is updated with
// corresponding replicas of type VOTER_INCOMING and VOTER_OUTGOING.
// Immediately after committing this change, a second transition updates the
// descriptor with the final configuration and activates it.
//
// Concretely, if the initial members of the range are s1/1, s2/2, and s3/3, and
// an atomic membership change were to add s4/4 and s5/5 while removing s1/1 and
// s2/2, the following range descriptors would form the overall transition:
//
// 1. s1/1 s2/2 s3/3 (VOTER_FULL is implied)
// 2. s1/1 s2/2 s3/3 s4/4LEARNER
// 3. s1/1 s2/2 s3/3 s4/4LEARNER s5/5LEARNER
// 4. s1/1VOTER_DEMOTING s2/2VOTER_DEMOTING s3/3 s4/4VOTER_INCOMING s5/5VOTER_INCOMING
// 5. s1/1LEARNER s2/2LEARNER s3/3 s4/4 s5/5
// 6. s2/2LEARNER s3/3 s4/4 s5/5
// 7. s3/3 s4/4 s5/5
//
// A replica that learns that it was removed will queue itself for replicaGC.
// Note that a removed replica may never apply the configuration change removing
// itself and thus this trigger may not fire. This is because said replica may
// not have been a part of the quorum that committed the configuration change;
// nodes that apply the change will stop sending messages to the removed
// replica. At that point, the removed replica will typically campaign (since it
// receives no more heartbeats from the leader) and its former peers respond via
// a RaftGroupDeletedError (from the Raft transport) as a signal to queue to
// replicaGC. This second mechanism fails if all peers have rapidly moved
// elsewhere as well; in that last and rare case, the replica GC queue will
// eventually discover the replica on its own; it has optimizations that handle
// "abandoned-looking" replicas more eagerly than healthy ones.
func (r *Replica) ChangeReplicas(
	ctx context.Context,
	desc *roachpb.RangeDescriptor,
	priority SnapshotRequest_Priority,
	reason kvserverpb.RangeLogEventReason,
	details string,
	chgs roachpb.ReplicationChanges,
) (updatedDesc *roachpb.RangeDescriptor, _ error) {
	if desc == nil {
		// TODO(tbg): is this check just FUD?
		return nil, errors.Errorf("%s: the current RangeDescriptor must not be nil", r)
	}

	// We execute the change serially if we're not allowed to run atomic
	// replication changes or if that was explicitly disabled.
	st := r.ClusterSettings()
	unroll := !st.Version.IsActive(ctx, clusterversion.VersionAtomicChangeReplicas) ||
		!UseAtomicReplicationChanges.Get(&st.SV)

	if unroll {
		// Legacy behavior.
		for i := range chgs {
			var err error
			desc, err = r.changeReplicasImpl(ctx, desc, priority, reason, details, chgs[i:i+1])
			if err != nil {
				return nil, err
			}
		}
		return desc, nil
	}
	// Atomic replication change.
	return r.changeReplicasImpl(ctx, desc, priority, reason, details, chgs)
}
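
// As a usage sketch (the target values shown are illustrative, not taken from
// a particular caller), the seven-step transition pictured above would be
// requested as a single ChangeReplicas call:
//
//	chgs := roachpb.ReplicationChanges{
//		{ChangeType: roachpb.ADD_REPLICA, Target: roachpb.ReplicationTarget{NodeID: 4, StoreID: 4}},
//		{ChangeType: roachpb.ADD_REPLICA, Target: roachpb.ReplicationTarget{NodeID: 5, StoreID: 5}},
//		{ChangeType: roachpb.REMOVE_REPLICA, Target: roachpb.ReplicationTarget{NodeID: 1, StoreID: 1}},
//		{ChangeType: roachpb.REMOVE_REPLICA, Target: roachpb.ReplicationTarget{NodeID: 2, StoreID: 2}},
//	}
//	newDesc, err := r.ChangeReplicas(ctx, r.Desc(), priority, reason, details, chgs)
//
// With atomic changes disabled, the same slice is unrolled into four
// single-change calls to changeReplicasImpl below.
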
func (r *Replica) changeReplicasImpl(
	ctx context.Context,
	desc *roachpb.RangeDescriptor,
	priority SnapshotRequest_Priority,
	reason kvserverpb.RangeLogEventReason,
	details string,
	chgs roachpb.ReplicationChanges,
) (updatedDesc *roachpb.RangeDescriptor, _ error) {
	var err error
	// If in a joint config, clean up. The assumption here is that the caller
	// of ChangeReplicas didn't even realize that they were holding on to a
	// joint descriptor and would rather not have to deal with that fact.
	desc, err = maybeLeaveAtomicChangeReplicas(ctx, r.store, desc)
	if err != nil {
		return nil, err
	}

	if err := validateReplicationChanges(desc, chgs); err != nil {
		return nil, err
	}

	if adds := chgs.Additions(); len(adds) > 0 {
		// Lock learner snapshots even before we run the ConfChange txn to add them
		// to prevent a race with the raft snapshot queue trying to send it first.
		// Note that this lock needs to cover sending the snapshots which happens in
		_ = r.atomicReplicationChange
		// which also has some more details on what's going on here.
		//
		// Also note that the lock only prevents the raft snapshot queue from
		// sending snapshots to learner replicas, it will still send them to voters.
		// There are more details about this locking in
		_ = (*raftSnapshotQueue)(nil).processRaftSnapshot
		// as well as a TODO about fixing all this to be less subtle and brittle.
		releaseSnapshotLockFn := r.lockLearnerSnapshot(ctx, adds)
		defer releaseSnapshotLockFn()

		// For all newly added nodes, first add raft learner replicas. They accept
		// raft traffic (so they can catch up) but don't get to vote (so they don't
		// affect quorum and thus don't introduce fragility into the system). For
		// details see:
		_ = roachpb.ReplicaDescriptors.Learners
		var err error
		desc, err = addLearnerReplicas(ctx, r.store, desc, reason, details, adds)
		if err != nil {
			return nil, err
		}
	}

	// Catch up any learners, then run the atomic replication change that adds
	// the final voters and removes any undesirable replicas.
	desc, err = r.atomicReplicationChange(ctx, desc, priority, reason, details, chgs)
	if err != nil {
		// If the error occurred while transitioning out of an atomic replication
		// change, try again here with a fresh descriptor; this is a noop otherwise.
		if _, err := maybeLeaveAtomicChangeReplicas(ctx, r.store, r.Desc()); err != nil {
			return nil, err
		}
		if fn := r.store.cfg.TestingKnobs.ReplicaAddSkipLearnerRollback; fn != nil && fn() {
			return nil, err
		}
		// Don't leave a learner replica lying around if we didn't succeed in
		// promoting it to a voter.
		if targets := chgs.Additions(); len(targets) > 0 {
			log.Infof(ctx, "could not promote %v to voter, rolling back: %v", targets, err)
			for _, target := range targets {
				r.tryRollBackLearnerReplica(ctx, r.Desc(), target, reason, details)
			}
		}
		return nil, err
	}
	return desc, err
}

// maybeLeaveAtomicChangeReplicas transitions out of the joint configuration if
// the descriptor indicates one. This involves running a distributed transaction
// updating said descriptor, the result of which will be returned. The
// descriptor returned from this method will contain replicas of type LEARNER
// and VOTER_FULL only.
func maybeLeaveAtomicChangeReplicas(
	ctx context.Context, store *Store, desc *roachpb.RangeDescriptor,
) (*roachpb.RangeDescriptor, error) {
	// We want execChangeReplicasTxn to be able to make sure it's only tasked
	// with leaving a joint state when it's in one, so make sure we don't call
	// it if we're not.
	if !desc.Replicas().InAtomicReplicationChange() {
		return desc, nil
	}
	// NB: this is matched on in TestMergeQueueSeesLearner.
	log.Eventf(ctx, "transitioning out of joint configuration %s", desc)

	// NB: reason and detail won't be used because no range log event will be
	// emitted.
	//
	// TODO(tbg): reconsider this.
	return execChangeReplicasTxn(
		ctx, store, desc, kvserverpb.ReasonUnknown /* unused */, "", nil, /* iChgs */
	)
}

// maybeLeaveAtomicChangeReplicasAndRemoveLearners transitions out of the joint
// config (if there is one), and then removes all learners. After this function
// returns, all remaining replicas will be of type VOTER_FULL.
func maybeLeaveAtomicChangeReplicasAndRemoveLearners(
	ctx context.Context, store *Store, desc *roachpb.RangeDescriptor,
) (*roachpb.RangeDescriptor, error) {
	desc, err := maybeLeaveAtomicChangeReplicas(ctx, store, desc)
	if err != nil {
		return nil, err
	}
	// Now the config isn't joint any more, but we may have demoted some voters
	// into learners. These learners should go as well.

	learners := desc.Replicas().Learners()
	if len(learners) == 0 {
		return desc, nil
	}
	targets := make([]roachpb.ReplicationTarget, len(learners))
	for i := range learners {
		targets[i].NodeID = learners[i].NodeID
		targets[i].StoreID = learners[i].StoreID
	}
	log.VEventf(ctx, 2, `removing learner replicas %v from %v`, targets, desc)
	// NB: unroll the removals because at the time of writing, we can't atomically
	// remove multiple learners. This will be fixed in:
	//
	// https://github.com/cockroachdb/cockroach/pull/40268
	origDesc := desc
	for _, target := range targets {
		var err error
		desc, err = execChangeReplicasTxn(
			ctx, store, desc, kvserverpb.ReasonAbandonedLearner, "",
			[]internalReplicationChange{{target: target, typ: internalChangeTypeRemove}},
		)
		if err != nil {
			return nil, errors.Wrapf(err, `removing learners from %s`, origDesc)
		}
	}
	return desc, nil
}

func validateReplicationChanges(
	desc *roachpb.RangeDescriptor, chgs roachpb.ReplicationChanges,
) error {
	// First make sure that the changes don't self-overlap (i.e. we're not adding
	// a replica twice, or removing and immediately re-adding it).
	byNodeID := make(map[roachpb.NodeID]roachpb.ReplicationChange, len(chgs))
	for _, chg := range chgs {
		if _, ok := byNodeID[chg.Target.NodeID]; ok {
			return fmt.Errorf("changes %+v refer to n%d twice", chgs, chg.Target.NodeID)
		}
		byNodeID[chg.Target.NodeID] = chg
	}

	// Then, check that we're not adding a second replica on nodes that already
	// have one, or "re-adding" an existing replica. We delete from byNodeID so
	// that after this loop, it contains only NodeIDs that we haven't seen in
	// desc.
	for _, rDesc := range desc.Replicas().All() {
		chg, ok := byNodeID[rDesc.NodeID]
		delete(byNodeID, rDesc.NodeID)
		if !ok || chg.ChangeType != roachpb.ADD_REPLICA {
			continue
		}
		// We're adding a replica that's already there. This isn't allowed, even
		// when the newly added one would be on a different store.
		if rDesc.StoreID != chg.Target.StoreID {
			return errors.Errorf("unable to add replica %v; node already has a replica in %s", chg.Target.StoreID, desc)
		}

		// Looks like we found a replica with the same store and node id.
		// If the replica is already a learner, then either some previous
		// leaseholder was trying to add it with the learner+snapshot+voter
		// cycle and got interrupted, or else we hit a race between the
		// replicate queue and AdminChangeReplicas.
		if rDesc.GetType() == roachpb.LEARNER {
			return errors.Errorf(
				"unable to add replica %v which is already present as a learner in %s", chg.Target, desc)
		}

		// Otherwise, we already had a full voter replica. Can't add another to
		// this store.
		return errors.Errorf("unable to add replica %v which is already present in %s", chg.Target, desc)
	}

	// Any removals left in the map now refer to nonexistent replicas, and we
	// refuse them.
	for _, chg := range byNodeID {
		if chg.ChangeType != roachpb.REMOVE_REPLICA {
			continue
		}
		return errors.Errorf("removing %v which is not in %s", chg.Target, desc)
	}
	return nil
}

// addLearnerReplicas adds learners to the given replication targets.
func addLearnerReplicas(
	ctx context.Context,
	store *Store,
	desc *roachpb.RangeDescriptor,
	reason kvserverpb.RangeLogEventReason,
	details string,
	targets []roachpb.ReplicationTarget,
) (*roachpb.RangeDescriptor, error) {
	// TODO(tbg): we could add all learners in one go, but then we'd need to
	// do it as an atomic replication change (raft doesn't know which config
	// to apply the delta to, so we might be demoting more than one voter).
	// This isn't crazy, we just need to transition out of the joint config
	// before returning from this method, and it's unclear that it's worth
	// doing.
	for _, target := range targets {
		iChgs := []internalReplicationChange{{target: target, typ: internalChangeTypeAddLearner}}
		var err error
		desc, err = execChangeReplicasTxn(
			ctx, store, desc, reason, details, iChgs,
		)
		if err != nil {
			return nil, err
		}
	}
	return desc, nil
}

// lockLearnerSnapshot stops the raft snapshot queue from sending snapshots to
// the soon-to-be added learner replicas to prevent duplicate snapshots from
// being sent. This lock is best effort because it times out and it is a node
// local lock while the raft snapshot queue might be running on a different
// node. An idempotent unlock function is returned.
func (r *Replica) lockLearnerSnapshot(
	ctx context.Context, additions []roachpb.ReplicationTarget,
) (unlock func()) {
	// TODO(dan): The way this works is hacky, but it was added at the last minute
	// in 19.2 to work around a commit in etcd/raft that made this race more
	// likely. It'd be nice if all learner snapshots could be sent from a single
	// place.
	var lockUUIDs []uuid.UUID
	for _, addition := range additions {
		lockUUID := uuid.MakeV4()
		lockUUIDs = append(lockUUIDs, lockUUID)
		r.addSnapshotLogTruncationConstraint(ctx, lockUUID, 1, addition.StoreID)
	}
	return func() {
		now := timeutil.Now()
		for _, lockUUID := range lockUUIDs {
			r.completeSnapshotLogTruncationConstraint(ctx, lockUUID, now)
		}
	}
}

// atomicReplicationChange carries out the atomic membership change that
// finalizes the addition and/or removal of replicas.
// Any voters in the process of being added (as reflected by the replication
// changes) must have been added as learners already and will be caught up
// before being promoted to voters. Cluster version permitting, voter removals
// (from the replication changes) will preferably be carried out by first
// demoting to a learner instead of outright removal (this avoids a [raft-bug]
// that can lead to unavailability). All of this occurs in one atomic raft
// membership change which is carried out across two phases. On error, it is
// possible that the range is in the intermediate ("joint") configuration in
// which a quorum of both the old and new sets of voters is required. If a
// range is encountered in this state, maybeLeaveAtomicChangeReplicas can fix
// this, but it is the caller's job to do this when necessary.
//
// The atomic membership change is carried out chiefly via the construction of
// a suitable ChangeReplicasTrigger; see prepareChangeReplicasTrigger for
// details.
//
// Contrary to the name, *all* membership changes go through this method, even
// those that add/remove only a single voter, though the simple protocol is
// used when this is opportune. Notably, demotions can never use the simple
// protocol, even if only a single voter is being demoted, due to a (liftable)
// limitation in etcd/raft.
//
// [raft-bug]: https://github.com/etcd-io/etcd/issues/11284
func (r *Replica) atomicReplicationChange(
	ctx context.Context,
	desc *roachpb.RangeDescriptor,
	priority SnapshotRequest_Priority,
	reason kvserverpb.RangeLogEventReason,
	details string,
	chgs roachpb.ReplicationChanges,
) (*roachpb.RangeDescriptor, error) {
	// TODO(dan): We allow ranges with learner replicas to split, so in theory
	// this may want to detect that and retry, sending a snapshot and promoting
	// both sides.

	iChgs := make([]internalReplicationChange, 0, len(chgs))

	for _, target := range chgs.Additions() {
		iChgs = append(iChgs, internalReplicationChange{target: target, typ: internalChangeTypePromoteLearner})
		// All adds must be present as learners right now, and we send them
		// snapshots in anticipation of promoting them to voters.
		rDesc, ok := desc.GetReplicaDescriptor(target.StoreID)
		if !ok {
			return nil, errors.Errorf("programming error: replica %v not found in %v", target, desc)
		}

		if rDesc.GetType() != roachpb.LEARNER {
			return nil, errors.Errorf("programming error: cannot promote replica of type %s", rDesc.Type)
		}

		if fn := r.store.cfg.TestingKnobs.ReplicaSkipLearnerSnapshot; fn != nil && fn() {
			continue
		}

		// Note that the raft snapshot queue will refuse to send a snapshot to a
		// learner replica if its store is already sending a snapshot to that
		// replica. That would race with this snapshot, except that we've put a
		// (best effort) lock on it before the conf change txn was run. This is
		// best effort because the lock can time out and the lock is local to this
		// node, while the raft leader could be on another node entirely (they're
		// usually co-located but this is not guaranteed).
		//
		// We originally tried always refusing to send snapshots from the raft
		// snapshot queue to learner replicas, but this turned out to be brittle.
		// First, if the snapshot failed, any attempt to use the learner's raft
		// group would hang until the replicate queue got around to cleaning up the
		// orphaned learner. Second, this tickled some bugs in etcd/raft around
		// switching between StateSnapshot and StateProbe. Even if we worked through
		// these, it would be susceptible to future similar issues.
		if err := r.sendSnapshot(ctx, rDesc, SnapshotRequest_LEARNER, priority); err != nil {
			return nil, err
		}
	}

	if adds := chgs.Additions(); len(adds) > 0 {
		if fn := r.store.cfg.TestingKnobs.ReplicaAddStopAfterLearnerSnapshot; fn != nil && fn(adds) {
			return desc, nil
		}
	}

	canUseDemotion := r.store.ClusterSettings().Version.IsActive(ctx, clusterversion.VersionChangeReplicasDemotion)
	for _, target := range chgs.Removals() {
		typ := internalChangeTypeRemove
		if rDesc, ok := desc.GetReplicaDescriptor(target.StoreID); ok && rDesc.GetType() == roachpb.VOTER_FULL && canUseDemotion {
			typ = internalChangeTypeDemote
		}
		iChgs = append(iChgs, internalReplicationChange{target: target, typ: typ})
	}

	var err error
	desc, err = execChangeReplicasTxn(ctx, r.store, desc, reason, details, iChgs)
	if err != nil {
		return nil, err
	}

	if fn := r.store.cfg.TestingKnobs.ReplicaAddStopAfterJointConfig; fn != nil && fn() {
		return desc, nil
	}

	// Leave the joint config if we entered one. Also, remove any learners we
	// might have picked up due to removal-via-demotion.
	return maybeLeaveAtomicChangeReplicasAndRemoveLearners(ctx, r.store, desc)
}

// tryRollBackLearnerReplica attempts to remove a learner specified by the
// target. If no such learner is found in the descriptor (including when it is
// a voter instead), no action is taken. Otherwise, a single time-limited
// best-effort attempt at removing the learner is made.
func (r *Replica) tryRollBackLearnerReplica(
	ctx context.Context,
	desc *roachpb.RangeDescriptor,
	target roachpb.ReplicationTarget,
	reason kvserverpb.RangeLogEventReason,
	details string,
) {
	repDesc, ok := desc.GetReplicaDescriptor(target.StoreID)
	if !ok || repDesc.GetType() != roachpb.LEARNER {
		// There's no learner to roll back.
		log.Event(ctx, "learner to roll back not found; skipping")
		return
	}

	// If (for example) the promotion failed because of a context deadline
	// exceeded, we do still want to clean up after ourselves, so always use a
	// new context (but with the old tags and with some timeout to save this
	// from blocking the caller indefinitely).
1353 const rollbackTimeout = 10 * time.Second 1354 1355 rollbackFn := func(ctx context.Context) error { 1356 _, err := execChangeReplicasTxn( 1357 ctx, r.store, desc, reason, details, 1358 []internalReplicationChange{{target: target, typ: internalChangeTypeRemove}}, 1359 ) 1360 return err 1361 } 1362 rollbackCtx := logtags.WithTags(context.Background(), logtags.FromContext(ctx)) 1363 if err := contextutil.RunWithTimeout( 1364 rollbackCtx, "learner rollback", rollbackTimeout, rollbackFn, 1365 ); err != nil { 1366 log.Infof(ctx, 1367 "failed to rollback learner %s, abandoning it for the replicate queue: %v", target, err) 1368 r.store.replicateQueue.MaybeAddAsync(ctx, r, r.store.Clock().Now()) 1369 } else { 1370 log.Infof(ctx, "rolled back learner %s in %s", target, desc) 1371 } 1372 } 1373 1374 type internalChangeType byte 1375 1376 const ( 1377 _ internalChangeType = iota + 1 1378 internalChangeTypeAddLearner 1379 internalChangeTypePromoteLearner 1380 // internalChangeTypeDemote changes a voter to a learner. This will 1381 // necessarily go through joint consensus since it requires two individual 1382 // changes (only one changes the quorum, so we could allow it in a simple 1383 // change too, with some work here and upstream). Demotions are treated like 1384 // removals throughout (i.e. they show up in `ChangeReplicasTrigger.Removed()`, 1385 // but not in `.Added()`). 1386 internalChangeTypeDemote 1387 // NB: can't remove multiple learners at once (need to remove at least one 1388 // voter with them), see: 1389 // https://github.com/cockroachdb/cockroach/pull/40268 1390 internalChangeTypeRemove 1391 ) 1392 1393 // internalReplicationChange is a replication target together with an internal 1394 // change type. The internal change type is needed to encode in which way the 1395 // replica is mutated (i.e. in a sense, what its predecessor looked like). We 1396 // need this to accurately transcribe the old into the updated range descriptor. 1397 type internalReplicationChange struct { 1398 target roachpb.ReplicationTarget 1399 typ internalChangeType 1400 } 1401 1402 type internalReplicationChanges []internalReplicationChange 1403 1404 func (c internalReplicationChanges) leaveJoint() bool { return len(c) == 0 } 1405 func (c internalReplicationChanges) useJoint() bool { 1406 // NB: demotions require joint consensus because of limitations in etcd/raft. 1407 // These could be lifted, but it doesn't seem worth it. 
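// For example (illustrative):
//
//	{addLearner(s4)}          -> simple change
//	{promote(s4)}             -> simple change
//	{promote(s4), demote(s1)} -> joint consensus (more than one change)
//	{demote(s1)}              -> joint consensus (demotion)
//
// NB: c is non-empty here; callers check leaveJoint() first.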
1408 return len(c) > 1 || c[0].typ == internalChangeTypeDemote
1409 }
1410 
1411 type storeSettings interface {
1412 ClusterSettings() *cluster.Settings
1413 TestingKnobs() *StoreTestingKnobs
1414 }
1415 
1416 func prepareChangeReplicasTrigger(
1417 ctx context.Context,
1418 store storeSettings,
1419 desc *roachpb.RangeDescriptor,
1420 chgs internalReplicationChanges,
1421 ) (*roachpb.ChangeReplicasTrigger, error) {
1422 updatedDesc := *desc
1423 updatedDesc.SetReplicas(desc.Replicas().DeepCopy())
1424 updatedDesc.IncrementGeneration()
1425 
1426 var added, removed []roachpb.ReplicaDescriptor
1427 if !chgs.leaveJoint() {
1428 if desc.Replicas().InAtomicReplicationChange() {
1429 return nil, errors.Errorf("must transition out of joint config first: %s", desc)
1430 }
1431 
1432 useJoint := chgs.useJoint()
1433 if fn := store.TestingKnobs().ReplicationAlwaysUseJointConfig; fn != nil && fn() {
1434 useJoint = true
1435 }
1436 for _, chg := range chgs {
1437 switch chg.typ {
1438 case internalChangeTypeAddLearner:
1439 added = append(added,
1440 updatedDesc.AddReplica(chg.target.NodeID, chg.target.StoreID, roachpb.LEARNER))
1441 case internalChangeTypePromoteLearner:
1442 typ := roachpb.VOTER_FULL
1443 if useJoint {
1444 typ = roachpb.VOTER_INCOMING
1445 }
1446 rDesc, prevTyp, ok := updatedDesc.SetReplicaType(chg.target.NodeID, chg.target.StoreID, typ)
1447 if !ok || prevTyp != roachpb.LEARNER {
1448 return nil, errors.Errorf("cannot promote target %v which is missing as Learner", chg.target)
1449 }
1450 added = append(added, rDesc)
1451 case internalChangeTypeRemove:
1452 rDesc, ok := updatedDesc.GetReplicaDescriptor(chg.target.StoreID)
1453 if !ok {
1454 return nil, errors.Errorf("target %s not found", chg.target)
1455 }
1456 prevTyp := rDesc.GetType()
1457 if !useJoint || prevTyp == roachpb.LEARNER {
1458 rDesc, _ = updatedDesc.RemoveReplica(chg.target.NodeID, chg.target.StoreID)
1459 } else if prevTyp != roachpb.VOTER_FULL {
1460 // NB: prevTyp is already known to be VOTER_FULL because of
1461 // !InAtomicReplicationChange() and the learner handling
1462 // above. We check it anyway.
1463 return nil, errors.Errorf("cannot transition from %s to VOTER_OUTGOING", prevTyp)
1464 } else {
1465 rDesc, _, _ = updatedDesc.SetReplicaType(chg.target.NodeID, chg.target.StoreID, roachpb.VOTER_OUTGOING)
1466 }
1467 removed = append(removed, rDesc)
1468 case internalChangeTypeDemote:
1469 // Demotion is similar to removal, except that a demotion
1470 // cannot apply to a learner, and that the resulting type is
1471 // different when entering a joint config.
1472 rDesc, ok := updatedDesc.GetReplicaDescriptor(chg.target.StoreID)
1473 if !ok {
1474 return nil, errors.Errorf("target %s not found", chg.target)
1475 }
1476 if !useJoint {
1477 // NB: this won't fire because chgs.useJoint() is always true when
1478 // there's a demotion. This is just a sanity check.
1479 return nil, errors.Errorf("demotions require joint consensus")
1480 }
1481 if prevTyp := rDesc.GetType(); prevTyp != roachpb.VOTER_FULL {
1482 return nil, errors.Errorf("cannot transition from %s to VOTER_DEMOTING", prevTyp)
1483 }
1484 rDesc, _, _ = updatedDesc.SetReplicaType(chg.target.NodeID, chg.target.StoreID, roachpb.VOTER_DEMOTING)
1485 removed = append(removed, rDesc)
1486 default:
1487 return nil, errors.Errorf("unsupported internal change type %d", chg.typ)
1488 }
1489 }
1490 } else {
1491 // Want to leave a joint config.
Note that we're not populating 'added' or 'removed'; this
1492 // is intentional, as leaving the joint config corresponds to an "empty" raft conf change.
1493 var isJoint bool
1494 // NB: the DeepCopy is needed or we'll skip over an entry every time we
1495 // call RemoveReplica below.
1496 for _, rDesc := range updatedDesc.Replicas().DeepCopy().All() {
1497 switch rDesc.GetType() {
1498 case roachpb.VOTER_INCOMING:
1499 updatedDesc.SetReplicaType(rDesc.NodeID, rDesc.StoreID, roachpb.VOTER_FULL)
1500 isJoint = true
1501 case roachpb.VOTER_OUTGOING:
1502 updatedDesc.RemoveReplica(rDesc.NodeID, rDesc.StoreID)
1503 isJoint = true
1504 case roachpb.VOTER_DEMOTING:
1505 updatedDesc.SetReplicaType(rDesc.NodeID, rDesc.StoreID, roachpb.LEARNER)
1506 isJoint = true
1507 default:
1508 }
1509 }
1510 if !isJoint {
1511 return nil, errors.Errorf("cannot leave a joint config; desc not joint: %s", &updatedDesc)
1512 }
1513 }
1514 
1515 if err := updatedDesc.Validate(); err != nil {
1516 return nil, errors.Wrapf(err, "validating updated descriptor %s", &updatedDesc)
1517 }
1518 
1519 var crt *roachpb.ChangeReplicasTrigger
1520 if !store.ClusterSettings().Version.IsActive(
1521 ctx, clusterversion.VersionAtomicChangeReplicasTrigger,
1522 ) {
1523 var deprecatedChangeType roachpb.ReplicaChangeType
1524 var deprecatedRepDesc roachpb.ReplicaDescriptor
1525 if len(added) > 0 {
1526 deprecatedChangeType = roachpb.ADD_REPLICA
1527 deprecatedRepDesc = added[0]
1528 } else {
1529 deprecatedChangeType = roachpb.REMOVE_REPLICA
1530 deprecatedRepDesc = removed[0]
1531 }
1532 crt = &roachpb.ChangeReplicasTrigger{
1533 // NB: populate Desc as well because locally we rely on it being
1534 // set.
1535 Desc: &updatedDesc,
1536 DeprecatedChangeType: deprecatedChangeType,
1537 DeprecatedReplica: deprecatedRepDesc,
1538 DeprecatedUpdatedReplicas: updatedDesc.Replicas().All(),
1539 DeprecatedNextReplicaID: updatedDesc.NextReplicaID,
1540 }
1541 } else {
1542 crt = &roachpb.ChangeReplicasTrigger{
1543 Desc: &updatedDesc,
1544 InternalAddedReplicas: added,
1545 InternalRemovedReplicas: removed,
1546 }
1547 }
1548 
1549 if _, err := crt.ConfChange(nil); err != nil {
1550 return nil, errors.Wrapf(err, "programming error: malformed trigger created from desc %s to %s", desc, &updatedDesc)
1551 }
1552 return crt, nil
1553 }
1554 
1555 func execChangeReplicasTxn(
1556 ctx context.Context,
1557 store *Store,
1558 referenceDesc *roachpb.RangeDescriptor,
1559 reason kvserverpb.RangeLogEventReason,
1560 details string,
1561 chgs internalReplicationChanges,
1562 ) (*roachpb.RangeDescriptor, error) {
1563 var returnDesc *roachpb.RangeDescriptor
1564 
1565 descKey := keys.RangeDescriptorKey(referenceDesc.StartKey)
1566 
1567 check := func(kvDesc *roachpb.RangeDescriptor) bool {
1568 // NB: We might fail to find the range if the range has been merged away,
1569 // in which case we definitely want to fail the check below.
1570 if kvDesc != nil && kvDesc.RangeID == referenceDesc.RangeID && chgs.leaveJoint() {
1571 // If there are no changes, we're trying to leave a joint config,
1572 // so that's all we care about. But since leaving a joint config
1573 // is done opportunistically whenever one is encountered, this is
1574 // more likely to race than other operations. So we verify literally
1575 // nothing about the descriptor, but once we get the descriptor out
1576 // from conditionalGetDescValueFromDB, we'll check if it's in a
1577 // joint config and if not, noop.
1578 return true
1579 }
1580 // Otherwise, check that the descriptors are equal.
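// In other words (illustrative): when leaving a joint config, any live
// descriptor for the same RangeID passes the check; for all other changes,
// the descriptor read from kv must be proto-equal to the one the caller
// based its decision on.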
1581 //
1582 // TODO(tbg): check that the replica sets are equal only. I was going to
1583 // do that but then discovered #40367. Try again in the 20.1 cycle.
1584 return checkDescsEqual(referenceDesc)(kvDesc)
1585 }
1586 
1587 if err := store.DB().Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
1588 log.Event(ctx, "attempting txn")
1589 txn.SetDebugName(replicaChangeTxnName)
1590 desc, dbDescValue, err := conditionalGetDescValueFromDB(ctx, txn, referenceDesc.StartKey, check)
1591 if err != nil {
1592 return err
1593 }
1594 if chgs.leaveJoint() && !desc.Replicas().InAtomicReplicationChange() {
1595 // Nothing to do. See comment in 'check' above for details.
1596 returnDesc = desc
1597 return nil
1598 }
1599 // Note that we are now using the descriptor from KV, not the one passed
1600 // into this method.
1601 crt, err := prepareChangeReplicasTrigger(ctx, store, desc, chgs)
1602 if err != nil {
1603 return err
1604 }
1605 log.Infof(ctx, "change replicas (add %v remove %v): existing descriptor %s", crt.Added(), crt.Removed(), desc)
1606 
1607 {
1608 b := txn.NewBatch()
1609 
1610 // Important: the range descriptor must be the first thing touched in the transaction
1611 // so the transaction record is co-located with the range being modified.
1612 if err := updateRangeDescriptor(b, descKey, dbDescValue, crt.Desc); err != nil {
1613 return err
1614 }
1615 
1616 // Run transaction up to this point to create txn record early (see #9265).
1617 if err := txn.Run(ctx, b); err != nil {
1618 return err
1619 }
1620 }
1621 
1622 // Log replica change into range event log.
1623 for _, tup := range []struct {
1624 typ roachpb.ReplicaChangeType
1625 repDescs []roachpb.ReplicaDescriptor
1626 }{
1627 {roachpb.ADD_REPLICA, crt.Added()},
1628 {roachpb.REMOVE_REPLICA, crt.Removed()},
1629 } {
1630 for _, repDesc := range tup.repDescs {
1631 if err := store.logChange(
1632 ctx, txn, tup.typ, repDesc, *crt.Desc, reason, details,
1633 ); err != nil {
1634 return err
1635 }
1636 }
1637 }
1638 
1639 // End the transaction manually instead of letting RunTransaction
1640 // loop do it, in order to provide a commit trigger.
1641 b := txn.NewBatch()
1642 
1643 // Update range descriptor addressing record(s).
1644 if err := updateRangeAddressing(b, crt.Desc); err != nil {
1645 return err
1646 }
1647 
1648 b.AddRawRequest(&roachpb.EndTxnRequest{
1649 Commit: true,
1650 InternalCommitTrigger: &roachpb.InternalCommitTrigger{
1651 ChangeReplicasTrigger: crt,
1652 },
1653 })
1654 if err := txn.Run(ctx, b); err != nil {
1655 log.Eventf(ctx, "%v", err)
1656 return err
1657 }
1658 
1659 returnDesc = crt.Desc
1660 return nil
1661 }); err != nil {
1662 log.Eventf(ctx, "%v", err)
1663 // NB: desc may not be the descriptor we actually compared against, but
1664 // either way this gives a good idea of what happened, which is all it's
1665 // supposed to do.
1666 if ok, actualDesc := maybeDescriptorChangedError(referenceDesc, err); ok {
1667 // We do not include the original error as cause in this case -
1668 // the caller should not observe the cause. We still include it
1669 // as "secondary payload", in case the error object makes its way
1670 // to logs or telemetry during a crash.
1671 err = errors.WithSecondaryError(newDescChangedError(referenceDesc, actualDesc), err)
1672 err = &benignError{err}
1673 }
1674 return nil, errors.Wrapf(err, "change replicas of r%d failed", referenceDesc.RangeID)
1675 }
1676 log.Event(ctx, "txn complete")
1677 return returnDesc, nil
1678 }
1679 
1680 // sendSnapshot sends a snapshot of the replica state to the specified replica.
1681 // Currently only invoked from replicateQueue and raftSnapshotQueue. Be careful
1682 // about adding additional calls as generating a snapshot is moderately
1683 // expensive.
1684 //
1685 // A snapshot is a bulk transfer of all data in a range. It consists of a
1686 // consistent view of all the state needed to run some replica of a range as of
1687 // some applied index (not as of some mvcc-time). Snapshots are used by Raft
1688 // when a follower is far enough behind the leader that it can no longer be
1689 // caught up using incremental diffs (because the leader has already garbage
1690 // collected the diffs, in this case because it truncated the Raft log past
1691 // where the follower is).
1692 //
1693 // We also proactively send a snapshot when adding a new replica to bootstrap it
1694 // (this is called a "learner" snapshot and is a special case of a Raft
1695 // snapshot; we just speed the process along). It's called a learner snapshot
1696 // because it's sent to what Raft terms a learner replica. As of 19.2, when we
1697 // add a new replica, it's first added as a learner using a Raft ConfChange,
1698 // which means it accepts Raft traffic but doesn't vote or affect quorum. Then
1699 // we immediately send it a snapshot to catch it up. After the snapshot
1700 // successfully applies, we turn it into a normal voting replica using another
1701 // ConfChange. It then uses the normal mechanisms to catch up with whatever got
1702 // committed to the Raft log during the snapshot transfer. In contrast to adding
1703 // the voting replica directly, this avoids a period of fragility when the
1704 // replica would be a full member, but very far behind.
1705 //
1706 // Snapshots are expensive and mostly unexpected (except learner snapshots
1707 // during rebalancing). The quota pool is responsible for keeping a leader from
1708 // getting too far ahead of any of the followers, so ideally they'd never be far
1709 // enough behind to need a snapshot.
1710 //
1711 // The snapshot process itself is broken into 3 parts: generating the snapshot,
1712 // transmitting it, and applying it.
1713 //
1714 // Generating the snapshot: The data contained in a snapshot is a full copy of
1715 // the replicated data plus everything the replica needs to be a healthy member
1716 // of a Raft group. The former is large, so we send it via streaming rpc
1717 // instead of keeping it all in memory at once. The `(Replica).GetSnapshot`
1718 // method does the necessary locking and gathers the various Raft state needed
1719 // to run a replica. It also creates an iterator for the range's data as it
1720 // looked under those locks (this is powered by a RocksDB snapshot, which is a
1721 // different thing but a similar idea). Notably, GetSnapshot does not do the
1722 // data iteration.
1723 //
1724 // Transmitting the snapshot: The transfer itself happens over the grpc
1725 // `RaftSnapshot` method, which is a bi-directional stream of `SnapshotRequest`s
1726 // and `SnapshotResponse`s. The two sides are orchestrated by the
1727 // `(RaftTransport).SendSnapshot` and `(Store).receiveSnapshot` methods.
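// In outline, the exchange looks like this (an illustrative sketch; the
// response statuses are assumed from this package's SnapshotResponse proto):
//
//	sender                                   recipient
//	  |--- SnapshotRequest{Header} ----------->| shouldAcceptSnapshotData
//	  |<-- SnapshotResponse{ACCEPTED} ---------|
//	  |--- SnapshotRequest{KV batch} * N ----->| builds SSTs
//	  |--- SnapshotRequest{final} ------------>| applies snapshot
//	  |<-- SnapshotResponse{APPLIED} ----------|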
1728 //
1729 // `SendSnapshot` starts up the streaming rpc and first sends a header message
1730 // with everything but the range data and then blocks, waiting on the first
1731 // streaming response from the recipient. This lets us short-circuit sending the
1732 // range data if the recipient can't be contacted or if it can't use the
1733 // snapshot (which is usually the result of a race). The recipient's grpc
1734 // handler for RaftSnapshot sanity checks a few things and ends up calling down
1735 // into `receiveSnapshot`, which does the bulk of the work. `receiveSnapshot`
1736 // starts by waiting for a reservation in the snapshot rate limiter. It then
1737 // reads the header message and hands it to `shouldAcceptSnapshotData` to
1738 // determine if it can use the snapshot [1]. `shouldAcceptSnapshotData` is
1739 // advisory and can return false positives. If `shouldAcceptSnapshotData`
1740 // returns true, this is communicated back to the sender, which then proceeds to
1741 // call `kvBatchSnapshotStrategy.Send`. This uses the iterator captured earlier
1742 // to send the data in chunks, each chunk a streaming grpc message. The sender
1743 // then sends a final message with an indication that it's done and blocks again,
1744 // waiting for a second and final response from the recipient, which indicates if
1745 // the snapshot was a success.
1746 //
1747 // `receiveSnapshot` takes the key-value pairs sent and incrementally creates
1748 // three SSTs from them for direct ingestion: one for the replicated range-ID
1749 // local keys, one for the range local keys, and one for the user keys. The
1750 // reason it creates three separate SSTs is to prevent overlaps with the
1751 // memtable and existing SSTs in RocksDB. Each of the SSTs also has a range
1752 // deletion tombstone to delete the existing data in the range.
1753 //
1754 // Applying the snapshot: After the recipient has received the message
1755 // indicating it has all the data, it hands it all to
1756 // `(Store).processRaftSnapshotRequest` to be applied. First, this re-checks
1757 // the same things as `shouldAcceptSnapshotData` to make sure nothing has
1758 // changed while the snapshot was being transferred. It then guarantees that
1759 // there is either an initialized[2] replica or a `ReplicaPlaceholder`[3] to
1760 // accept the snapshot by creating a placeholder if necessary. Finally, a *Raft
1761 // snapshot* message is manually handed to the replica's Raft node (by calling
1762 // `stepRaftGroup` + `handleRaftReadyRaftMuLocked`). During the application
1763 // process, several other SSTs may be created for direct ingestion. An SST for
1764 // the unreplicated range-ID local keys is created for the Raft entries, hard
1765 // state, and truncated state. An SST is created for deleting each subsumed
1766 // replica's range-ID local keys and at most two SSTs are created for deleting
1767 // the user keys and range local keys of all subsumed replicas. All in all, a
1768 // maximum of 6 + SR SSTs will be created for direct ingestion where SR is the
1769 // number of subsumed replicas. In the case where there are no subsumed
1770 // replicas, 4 SSTs will be created.
1771 //
1772 // [1]: The largest class of rejections here is if the store contains a replica
1773 // that overlaps the snapshot but has a different id (we maintain an invariant
1774 // that replicas on a store never overlap).
This usually happens when the
1775 // recipient has an old copy of a replica that is no longer part of a range and
1776 // the `replicaGCQueue` hasn't gotten around to collecting it yet. So if this
1777 // happens, `shouldAcceptSnapshotData` will queue it up for consideration.
1778 //
1779 // [2]: An uninitialized replica is created when a replica that's being added
1780 // gets traffic from its new peers before it gets a snapshot. It may be possible
1781 // to get rid of uninitialized replicas (by dropping all Raft traffic except
1782 // votes on the floor), but this is a cleanup that hasn't happened yet.
1783 //
1784 // [3]: The placeholder is essentially a snapshot lock, making any future
1785 // callers of `shouldAcceptSnapshotData` return an error so that we no longer
1786 // have to worry about racing with a second snapshot. See the comment on
1787 // ReplicaPlaceholder for details.
1788 func (r *Replica) sendSnapshot(
1789 ctx context.Context,
1790 recipient roachpb.ReplicaDescriptor,
1791 snapType SnapshotRequest_Type,
1792 priority SnapshotRequest_Priority,
1793 ) (retErr error) {
1794 defer func() {
1795 // Report the snapshot status to Raft, which expects us to do this once we
1796 // finish sending the snapshot.
1797 r.reportSnapshotStatus(ctx, recipient.ReplicaID, retErr)
1798 }()
1799 
1800 snap, err := r.GetSnapshot(ctx, snapType, recipient.StoreID)
1801 if err != nil {
1802 return errors.Wrapf(err, "%s: failed to generate %s snapshot", r, snapType)
1803 }
1804 defer snap.Close()
1805 log.Event(ctx, "generated snapshot")
1806 
1807 sender, err := r.GetReplicaDescriptor()
1808 if err != nil {
1809 return errors.Wrapf(err, "%s: change replicas failed", r)
1810 }
1811 
1812 status := r.RaftStatus()
1813 if status == nil {
1814 // This code path is sometimes hit during scatter for replicas that
1815 // haven't woken up yet.
1816 return &benignError{errors.New("raft status not initialized")}
1817 }
1818 
1819 usesReplicatedTruncatedState, err := storage.MVCCGetProto(
1820 ctx, snap.EngineSnap, keys.RaftTruncatedStateLegacyKey(r.RangeID), hlc.Timestamp{}, nil, storage.MVCCGetOptions{},
1821 )
1822 if err != nil {
1823 return errors.Wrap(err, "loading legacy truncated state")
1824 }
1825 
1826 canAvoidSendingLog := !usesReplicatedTruncatedState &&
1827 snap.State.TruncatedState.Index < snap.State.RaftAppliedIndex
1828 
1829 if canAvoidSendingLog {
1830 // If we're not using a legacy (replicated) truncated state, we avoid
1831 // sending the (past) Raft log in the snapshot in the first place and
1832 // send only those entries that are actually useful to the follower.
1833 // This is done by changing the truncated state, which we're allowed
1834 // to do since it is not a replicated key (and thus not subject to
1835 // matching across replicas). The actual sending happens here:
1836 _ = (*kvBatchSnapshotStrategy)(nil).Send
1837 // and results in no log entries being sent at all. Note that
1838 // Metadata.Index is really the applied index of the replica.
1839 snap.State.TruncatedState = &roachpb.RaftTruncatedState{
1840 Index: snap.RaftSnap.Metadata.Index,
1841 Term: snap.RaftSnap.Metadata.Term,
1842 }
1843 }
1844 
1845 req := SnapshotRequest_Header{
1846 State: snap.State,
1847 // Tell the recipient whether it needs to synthesize the new
1848 // unreplicated TruncatedState. It could tell by itself by peeking into
1849 // the data, but it uses a write-only batch for performance, which
1850 // doesn't support that; this is easier.
Notably, this is true if the
1851 // snap index itself is the one at which the migration happens.
1852 //
1853 // See VersionUnreplicatedRaftTruncatedState.
1854 UnreplicatedTruncatedState: !usesReplicatedTruncatedState,
1855 RaftMessageRequest: RaftMessageRequest{
1856 RangeID: r.RangeID,
1857 FromReplica: sender,
1858 ToReplica: recipient,
1859 Message: raftpb.Message{
1860 Type: raftpb.MsgSnap,
1861 To: uint64(recipient.ReplicaID),
1862 From: uint64(sender.ReplicaID),
1863 Term: status.Term,
1864 Snapshot: snap.RaftSnap,
1865 },
1866 },
1867 RangeSize: r.GetMVCCStats().Total(),
1868 // Recipients currently cannot choose to decline any snapshots.
1869 // In 19.2 and earlier versions pre-emptive snapshots could be declined.
1870 //
1871 // TODO(ajwerner): Consider removing the CanDecline flag.
1872 CanDecline: false,
1873 Priority: priority,
1874 Strategy: SnapshotRequest_KV_BATCH,
1875 Type: snapType,
1876 }
1877 sent := func() {
1878 r.store.metrics.RangeSnapshotsGenerated.Inc(1)
1879 }
1880 if err := r.store.cfg.Transport.SendSnapshot(
1881 ctx,
1882 &r.store.cfg.RaftConfig,
1883 r.store.allocator.storePool,
1884 req,
1885 snap,
1886 r.store.Engine().NewBatch,
1887 sent,
1888 ); err != nil {
1889 if errors.Is(err, errMalformedSnapshot) {
1890 tag := fmt.Sprintf("r%d_%s", r.RangeID, snap.SnapUUID.Short())
1891 if dir, err := r.store.checkpoint(ctx, tag); err != nil {
1892 log.Warningf(ctx, "unable to create checkpoint %s: %+v", dir, err)
1893 } else {
1894 log.Warningf(ctx, "created checkpoint %s", dir)
1895 }
1896 
1897 log.Fatal(ctx, "malformed snapshot generated")
1898 }
1899 return &snapshotError{err}
1900 }
1901 return nil
1902 }
1903 
1904 // replicaSetsEqual is used in AdminMerge to ensure that the ranges are
1905 // all collocated on the same set of replicas.
1906 func replicaSetsEqual(a, b []roachpb.ReplicaDescriptor) bool {
1907 if len(a) != len(b) {
1908 return false
1909 }
1910 
1911 set := make(map[roachpb.StoreID]int)
1912 for _, replica := range a {
1913 set[replica.StoreID]++
1914 }
1915 
1916 for _, replica := range b {
1917 set[replica.StoreID]--
1918 }
1919 
1920 for _, value := range set {
1921 if value != 0 {
1922 return false
1923 }
1924 }
1925 
1926 return true
1927 }
1928 
1929 func checkDescsEqual(desc *roachpb.RangeDescriptor) func(*roachpb.RangeDescriptor) bool {
1930 // TODO(jeffreyxiao): This hacky fix ensures that we don't fail the
1931 // conditional get because of the ordering of InternalReplicas. Calling
1932 // Replicas() will sort the list of InternalReplicas as a side-effect. The
1933 // invariant of having InternalReplicas sorted is not maintained in 19.1.
1934 // Additionally, in 19.2, it's possible for the in-memory copy of
1935 // RangeDescriptor to become sorted from a call to Replicas() without
1936 // updating the copy in kv. These two factors make it possible for the
1937 // in-memory copy to be out of sync with the copy in kv. The sorted invariant
1938 // of InternalReplicas is used by ReplicaDescriptors.Voters() and
1939 // ReplicaDescriptors.Learners().
1940 if desc != nil {
1941 desc.Replicas() // for sorting side-effect
1942 }
1943 return func(desc2 *roachpb.RangeDescriptor) bool {
1944 if desc2 != nil {
1945 desc2.Replicas() // for sorting side-effect
1946 }
1947 
1948 return desc.Equal(desc2)
1949 }
1950 }
1951 
1952 // conditionalGetDescValueFromDB fetches an encoded RangeDescriptor from kv,
1953 // checks that it matches the given expectation using proto Equals, and returns
1954 // the raw fetched roachpb.Value.
If the fetched value doesn't match the 1955 // expectation, a ConditionFailedError is returned. 1956 // 1957 // This ConditionFailedError is a historical artifact. We used to pass the 1958 // parsed RangeDescriptor directly as the expected value in a CPut, but proto 1959 // message encodings aren't stable so this was fragile. Calling this method and 1960 // then passing the returned *roachpb.Value as the expected value in a CPut does 1961 // the same thing, but also correctly handles proto equality. See #38308. 1962 func conditionalGetDescValueFromDB( 1963 ctx context.Context, 1964 txn *kv.Txn, 1965 startKey roachpb.RKey, 1966 check func(*roachpb.RangeDescriptor) bool, 1967 ) (*roachpb.RangeDescriptor, *roachpb.Value, error) { 1968 descKey := keys.RangeDescriptorKey(startKey) 1969 existingDescKV, err := txn.Get(ctx, descKey) 1970 if err != nil { 1971 return nil, nil, errors.Wrap(err, "fetching current range descriptor value") 1972 } 1973 var existingDesc *roachpb.RangeDescriptor 1974 if existingDescKV.Value != nil { 1975 existingDesc = &roachpb.RangeDescriptor{} 1976 if err := existingDescKV.Value.GetProto(existingDesc); err != nil { 1977 return nil, nil, errors.Wrap(err, "decoding current range descriptor value") 1978 } 1979 } 1980 1981 if !check(existingDesc) { 1982 return nil, nil, &roachpb.ConditionFailedError{ActualValue: existingDescKV.Value} 1983 } 1984 return existingDesc, existingDescKV.Value, nil 1985 } 1986 1987 // updateRangeDescriptor adds a ConditionalPut on the range descriptor. The 1988 // conditional put verifies that changes to the range descriptor are made in a 1989 // well-defined order, preventing a scenario where a wayward replica which is 1990 // no longer part of the original Raft group comes back online to form a 1991 // splinter group with a node which was also a former replica, and hijacks the 1992 // range descriptor. This is a last line of defense; other mechanisms should 1993 // prevent rogue replicas from getting this far (see #768). 1994 // 1995 // oldValue can be nil, meaning that the key is expected to not exist. 1996 // 1997 // Note that in addition to using this method to update the on-disk range 1998 // descriptor, a CommitTrigger must be used to update the in-memory 1999 // descriptor; it will not automatically be copied from newDesc. 2000 func updateRangeDescriptor( 2001 b *kv.Batch, descKey roachpb.Key, oldValue *roachpb.Value, newDesc *roachpb.RangeDescriptor, 2002 ) error { 2003 // This is subtle: []byte(nil) != interface{}(nil). A []byte(nil) refers to 2004 // an empty value. An interface{}(nil) refers to a non-existent value. So 2005 // we're careful to construct interface{}(nil)s when newDesc/oldDesc are nil. 2006 var newValue interface{} 2007 if newDesc != nil { 2008 if err := newDesc.Validate(); err != nil { 2009 return errors.Wrapf(err, "validating new descriptor %+v (old descriptor is %+v)", 2010 newDesc, oldValue) 2011 } 2012 newBytes, err := protoutil.Marshal(newDesc) 2013 if err != nil { 2014 return err 2015 } 2016 newValue = newBytes 2017 } 2018 b.CPut(descKey, newValue, oldValue) 2019 return nil 2020 } 2021 2022 // AdminRelocateRange relocates a given range to a given set of stores. The 2023 // first store in the slice becomes the new leaseholder. 2024 // 2025 // This is best-effort; it's possible that the replicate queue on the 2026 // leaseholder could take action at the same time, causing errors. 
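// For example, a caller that wants the range on s1, s2, and s3 with the lease
// on s1 might do the following (an illustrative sketch; the node/store IDs are
// hypothetical and the descriptor is assumed to be reasonably current):
//
//	targets := []roachpb.ReplicationTarget{
//		{NodeID: 1, StoreID: 1}, // intended leaseholder
//		{NodeID: 2, StoreID: 2},
//		{NodeID: 3, StoreID: 3},
//	}
//	err := store.AdminRelocateRange(ctx, rangeDesc, targets)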
2027 func (s *Store) AdminRelocateRange( 2028 ctx context.Context, rangeDesc roachpb.RangeDescriptor, targets []roachpb.ReplicationTarget, 2029 ) error { 2030 // Step 0: Remove everything that's not a full voter so we don't have to think 2031 // about them. 2032 newDesc, err := maybeLeaveAtomicChangeReplicasAndRemoveLearners(ctx, s, &rangeDesc) 2033 if err != nil { 2034 log.Warningf(ctx, "%v", err) 2035 return err 2036 } 2037 rangeDesc = *newDesc 2038 2039 canRetry := func(err error) bool { 2040 whitelist := []string{ 2041 snapshotApplySemBusyMsg, 2042 IntersectingSnapshotMsg, 2043 } 2044 errStr := err.Error() 2045 for _, substr := range whitelist { 2046 if strings.Contains(errStr, substr) { 2047 return true 2048 } 2049 } 2050 return false 2051 } 2052 2053 startKey := rangeDesc.StartKey.AsRawKey() 2054 transferLease := func(target roachpb.ReplicationTarget) { 2055 // TODO(tbg): we ignore errors here, but it seems that in practice these 2056 // transfers "always work". Some of them are essential (we can't remove 2057 // the leaseholder so we'll fail there later if this fails), so it 2058 // seems like a good idea to return any errors here to the caller (or 2059 // to retry some errors appropriately). 2060 if err := s.DB().AdminTransferLease( 2061 ctx, startKey, target.StoreID, 2062 ); err != nil { 2063 log.Warningf(ctx, "while transferring lease: %+v", err) 2064 } 2065 } 2066 2067 // Step 2: Repeatedly add and/or remove a replica until we reach the 2068 // desired state. In an "atomic replication changes" world, this is 2069 // conceptually easy: change from the old set of replicas to the new 2070 // one. But there are two reasons that complicate this: 2071 // 1. we can't remove the leaseholder, so if we ultimately want to do that 2072 // the lease has to be moved first. If we start out with *only* the 2073 // leaseholder, we will have to add a replica first. 2074 // 2. this code is rewritten late in the cycle and it is both safer and 2075 // closer to its previous incarnation to never issue atomic changes 2076 // other than simple swaps. 2077 // 2078 // The loop below repeatedly calls relocateOne, which gives us either one or 2079 // two ops that move the range towards the desired replication state. If 2080 // it's one op, then a single add or remove is carried out (and it's only 2081 // done when we can't swap instead). If it's two ops, then we're swapping 2082 // (though this code doesn't concern itself with the details); and it's 2083 // possible that we need to transfer the lease before we carry out the ops, 2084 // determined via the leaseTarget variable. 2085 // 2086 // Transient errors returned from relocateOne are retried until things work 2087 // out. 2088 every := log.Every(time.Minute) 2089 for { 2090 for re := retry.StartWithCtx(ctx, retry.Options{MaxBackoff: 5 * time.Second}); ; re.Next() { 2091 if err := ctx.Err(); err != nil { 2092 return err 2093 } 2094 2095 ops, leaseTarget, err := s.relocateOne(ctx, &rangeDesc, targets) 2096 if err != nil { 2097 return err 2098 } 2099 if leaseTarget != nil { 2100 // NB: we may need to transfer even if there are no ops, to make 2101 // sure the attempt is made to make the first target the final 2102 // leaseholder. 2103 transferLease(*leaseTarget) 2104 } 2105 if len(ops) == 0 { 2106 // Done. 
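// NB: ctx.Err() is nil here unless the context was canceled after the
// check at the top of the loop, in which case we surface the
// cancellation instead of reporting success.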
2107 return ctx.Err() 2108 } 2109 if fn := s.cfg.TestingKnobs.BeforeRelocateOne; fn != nil { 2110 fn(ops, leaseTarget, err) 2111 } 2112 2113 // Make sure we don't issue anything but singles and swaps before 2114 // this migration is gone (for it doesn't support anything else). 2115 if len(ops) > 2 { 2116 log.Fatalf(ctx, "received more than 2 ops: %+v", ops) 2117 } 2118 opss := [][]roachpb.ReplicationChange{ops} 2119 success := true 2120 for _, ops := range opss { 2121 newDesc, err := s.DB().AdminChangeReplicas(ctx, startKey, rangeDesc, ops) 2122 if err != nil { 2123 returnErr := errors.Wrapf(err, "while carrying out changes %v", ops) 2124 if !canRetry(err) { 2125 return returnErr 2126 } 2127 if every.ShouldLog() { 2128 log.Infof(ctx, "%v", returnErr) 2129 } 2130 success = false 2131 break 2132 } 2133 rangeDesc = *newDesc 2134 } 2135 if success { 2136 break 2137 } 2138 } 2139 } 2140 2141 } 2142 2143 func (s *Store) relocateOne( 2144 ctx context.Context, desc *roachpb.RangeDescriptor, targets []roachpb.ReplicationTarget, 2145 ) ([]roachpb.ReplicationChange, *roachpb.ReplicationTarget, error) { 2146 rangeReplicas := desc.Replicas().All() 2147 if len(rangeReplicas) != len(desc.Replicas().Voters()) { 2148 // The caller removed all the learners, so there shouldn't be anything but 2149 // voters. 2150 return nil, nil, errors.AssertionFailedf( 2151 `range %s had non-voter replicas: %v`, desc, desc.Replicas()) 2152 } 2153 2154 sysCfg := s.cfg.Gossip.GetSystemConfig() 2155 if sysCfg == nil { 2156 return nil, nil, fmt.Errorf("no system config available, unable to perform RelocateRange") 2157 } 2158 zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey) 2159 if err != nil { 2160 return nil, nil, err 2161 } 2162 2163 storeList, _, _ := s.allocator.storePool.getStoreList(storeFilterNone) 2164 storeMap := storeListToMap(storeList) 2165 2166 // Compute which replica to add and/or remove, respectively. We ask the allocator 2167 // about this because we want to respect the constraints. For example, it would be 2168 // unfortunate if we put two replicas into the same zone despite having a locality- 2169 // preserving option available. 2170 // 2171 // TODO(radu): we can't have multiple replicas on different stores on the 2172 // same node, and this code doesn't do anything to specifically avoid that 2173 // case (although the allocator will avoid even trying to send snapshots to 2174 // such stores), so it could cause some failures. 2175 2176 var addTargets []roachpb.ReplicaDescriptor 2177 for _, t := range targets { 2178 found := false 2179 for _, replicaDesc := range rangeReplicas { 2180 if replicaDesc.StoreID == t.StoreID && replicaDesc.NodeID == t.NodeID { 2181 found = true 2182 break 2183 } 2184 } 2185 if !found { 2186 addTargets = append(addTargets, roachpb.ReplicaDescriptor{ 2187 NodeID: t.NodeID, 2188 StoreID: t.StoreID, 2189 }) 2190 } 2191 } 2192 2193 var removeTargets []roachpb.ReplicaDescriptor 2194 for _, replicaDesc := range rangeReplicas { 2195 found := false 2196 for _, t := range targets { 2197 if replicaDesc.StoreID == t.StoreID && replicaDesc.NodeID == t.NodeID { 2198 found = true 2199 break 2200 } 2201 } 2202 if !found { 2203 removeTargets = append(removeTargets, roachpb.ReplicaDescriptor{ 2204 NodeID: replicaDesc.NodeID, 2205 StoreID: replicaDesc.StoreID, 2206 }) 2207 } 2208 } 2209 2210 var ops roachpb.ReplicationChanges 2211 2212 if len(addTargets) > 0 { 2213 // Each iteration, pick the most desirable replica to add. 
However, 2214 // prefer the first target because it's the one that should hold the 2215 // lease in the end; it helps to add it early so that the lease doesn't 2216 // have to move too much. 2217 candidateTargets := addTargets 2218 if storeHasReplica(targets[0].StoreID, candidateTargets) { 2219 candidateTargets = []roachpb.ReplicaDescriptor{ 2220 {NodeID: targets[0].NodeID, StoreID: targets[0].StoreID}, 2221 } 2222 } 2223 2224 // The storeList's list of stores is used to constrain which stores the 2225 // allocator considers putting a new replica on. We want it to only 2226 // consider the stores in candidateTargets. 2227 candidateDescs := make([]roachpb.StoreDescriptor, 0, len(candidateTargets)) 2228 for _, candidate := range candidateTargets { 2229 store, ok := storeMap[candidate.StoreID] 2230 if !ok { 2231 return nil, nil, fmt.Errorf("cannot up-replicate to s%d; missing gossiped StoreDescriptor", 2232 candidate.StoreID) 2233 } 2234 candidateDescs = append(candidateDescs, *store) 2235 } 2236 storeList = makeStoreList(candidateDescs) 2237 2238 targetStore, _ := s.allocator.allocateTargetFromList( 2239 ctx, 2240 storeList, 2241 zone, 2242 rangeReplicas, 2243 s.allocator.scorerOptions()) 2244 if targetStore == nil { 2245 return nil, nil, fmt.Errorf("none of the remaining targets %v are legal additions to %v", 2246 addTargets, desc.Replicas()) 2247 } 2248 2249 target := roachpb.ReplicationTarget{ 2250 NodeID: targetStore.Node.NodeID, 2251 StoreID: targetStore.StoreID, 2252 } 2253 ops = append(ops, roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, target)...) 2254 // Pretend the voter is already there so that the removal logic below will 2255 // take it into account when deciding which replica to remove. 2256 rangeReplicas = append(rangeReplicas, roachpb.ReplicaDescriptor{ 2257 NodeID: target.NodeID, 2258 StoreID: target.StoreID, 2259 ReplicaID: desc.NextReplicaID, 2260 Type: roachpb.ReplicaTypeVoterFull(), 2261 }) 2262 } 2263 2264 var transferTarget *roachpb.ReplicationTarget 2265 if len(removeTargets) > 0 { 2266 // Pick a replica to remove. Note that rangeReplicas may already reflect 2267 // a replica we're adding in the current round. This is the right thing 2268 // to do. For example, consider relocating from (s1,s2,s3) to (s1,s2,s4) 2269 // where addTargets will be (s4) and removeTargets is (s3). In this code, 2270 // we'll want the allocator to see if s3 can be removed from 2271 // (s1,s2,s3,s4) which is a reasonable request; that replica set is 2272 // overreplicated. If we asked it instead to remove s3 from (s1,s2,s3) 2273 // it may not want to do that due to constraints. 2274 targetStore, _, err := s.allocator.RemoveTarget(ctx, zone, removeTargets, rangeReplicas) 2275 if err != nil { 2276 return nil, nil, errors.Wrapf(err, "unable to select removal target from %v; current replicas %v", 2277 removeTargets, rangeReplicas) 2278 } 2279 removalTarget := roachpb.ReplicationTarget{ 2280 NodeID: targetStore.NodeID, 2281 StoreID: targetStore.StoreID, 2282 } 2283 // We can't remove the leaseholder, which really throws a wrench into 2284 // atomic replication changes. If we find that we're trying to do just 2285 // that, we need to first move the lease elsewhere. This is not possible 2286 // if there is no other replica available at that point, i.e. if the 2287 // existing descriptor is a single replica that's being replaced. 
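// For example (illustrative): relocating from (s1) to (s2) when s1 holds
// the lease first adds s2 above, and the lease must then be moved to s2
// before s1 can be removed. The lookup below determines the current
// leaseholder so that this case can be detected.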
2288 var b kv.Batch
2289 liReq := &roachpb.LeaseInfoRequest{}
2290 liReq.Key = desc.StartKey.AsRawKey()
2291 b.AddRawRequest(liReq)
2292 if err := s.DB().Run(ctx, &b); err != nil {
2293 return nil, nil, errors.Wrap(err, "looking up lease")
2294 }
2295 curLeaseholder := b.RawResponse().Responses[0].GetLeaseInfo().Lease.Replica
2296 ok := curLeaseholder.StoreID != removalTarget.StoreID
2297 if !ok {
2298 // Pick a replica that we can give the lease to. We sort the first
2299 // target to the beginning (if it's there) because that's where the
2300 // lease needs to be in the end. We also exclude the last replica if
2301 // it was added by the add branch above (in which case it doesn't
2302 // exist yet).
2303 sortedTargetReplicas := append([]roachpb.ReplicaDescriptor(nil), rangeReplicas[:len(rangeReplicas)-len(ops)]...)
2304 sort.Slice(sortedTargetReplicas, func(i, j int) bool {
2305 sl := sortedTargetReplicas
2306 // targets[0] goes to the front (if it's present).
2307 return sl[i].StoreID == targets[0].StoreID
2308 })
2309 for _, rDesc := range sortedTargetReplicas {
2310 if rDesc.StoreID != curLeaseholder.StoreID {
2311 transferTarget = &roachpb.ReplicationTarget{
2312 NodeID: rDesc.NodeID,
2313 StoreID: rDesc.StoreID,
2314 }
2315 ok = true
2316 break
2317 }
2318 }
2319 }
2320 
2321 // Carry out the removal only if there was no lease problem above. If
2322 // there was, we're not going to do a swap in this round but just do the
2323 // addition. (Note that !ok implies that ops is not empty, or we're
2324 // trying to remove the last replica left in the descriptor, which is
2325 // illegal).
2326 if ok {
2327 ops = append(ops, roachpb.MakeReplicationChanges(
2328 roachpb.REMOVE_REPLICA,
2329 removalTarget)...)
2330 }
2331 }
2332 
2333 if len(ops) == 0 {
2334 // Make sure that the first target is the final leaseholder, as
2335 // AdminRelocateRange specifies.
2336 transferTarget = &targets[0]
2337 }
2338 
2339 return ops, transferTarget, nil
2340 }
2341 
2342 // adminScatter moves replicas and leaseholders for a selection of ranges.
2343 func (r *Replica) adminScatter(
2344 ctx context.Context, args roachpb.AdminScatterRequest,
2345 ) (roachpb.AdminScatterResponse, error) {
2346 rq := r.store.replicateQueue
2347 retryOpts := retry.Options{
2348 InitialBackoff: 50 * time.Millisecond,
2349 MaxBackoff: 1 * time.Second,
2350 Multiplier: 2,
2351 MaxRetries: 5,
2352 }
2353 
2354 // Loop until the replicate queue decides there is nothing left to do for the
2355 // range. Note that we disable lease transfers until the final step as
2356 // transferring the lease prevents any further action on this node.
2357 var allowLeaseTransfer bool
2358 canTransferLease := func() bool { return allowLeaseTransfer }
2359 for re := retry.StartWithCtx(ctx, retryOpts); re.Next(); {
2360 requeue, err := rq.processOneChange(ctx, r, canTransferLease, false /* dryRun */)
2361 if err != nil {
2362 if IsSnapshotError(err) {
2363 continue
2364 }
2365 break
2366 }
2367 if !requeue {
2368 if allowLeaseTransfer {
2369 break
2370 }
2371 allowLeaseTransfer = true
2372 }
2373 re.Reset()
2374 }
2375 
2376 // If we've been asked to randomize the leases beyond what the replicate
2377 // queue would do on its own (#17341), do so after the replicate queue is
2378 // done by transferring the lease to any of the given N replicas with
2379 // probability 1/N of choosing each.
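// For instance (illustrative): with voters on s1, s2, and s3, each store
// ends up as the leaseholder with probability 1/3; if the local store is
// picked, no transfer needs to be issued.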
2380 if args.RandomizeLeases && r.OwnsValidLease(r.store.Clock().Now()) { 2381 desc := r.Desc() 2382 // Learner replicas aren't allowed to become the leaseholder or raft leader, 2383 // so only consider the `Voters` replicas. 2384 voterReplicas := desc.Replicas().Voters() 2385 newLeaseholderIdx := rand.Intn(len(voterReplicas)) 2386 targetStoreID := voterReplicas[newLeaseholderIdx].StoreID 2387 if targetStoreID != r.store.StoreID() { 2388 if err := r.AdminTransferLease(ctx, targetStoreID); err != nil { 2389 log.Warningf(ctx, "failed to scatter lease to s%d: %+v", targetStoreID, err) 2390 } 2391 } 2392 } 2393 2394 desc := r.Desc() 2395 return roachpb.AdminScatterResponse{ 2396 Ranges: []roachpb.AdminScatterResponse_Range{{ 2397 Span: roachpb.Span{ 2398 Key: desc.StartKey.AsRawKey(), 2399 EndKey: desc.EndKey.AsRawKey(), 2400 }, 2401 }}, 2402 }, nil 2403 } 2404 2405 func (r *Replica) adminVerifyProtectedTimestamp( 2406 ctx context.Context, args roachpb.AdminVerifyProtectedTimestampRequest, 2407 ) (resp roachpb.AdminVerifyProtectedTimestampResponse, err error) { 2408 resp.Verified, err = r.protectedTimestampRecordApplies(ctx, &args) 2409 if err == nil && !resp.Verified { 2410 resp.FailedRanges = append(resp.FailedRanges, *r.Desc()) 2411 } 2412 return resp, err 2413 }