github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_command.go (about)

     1  // Copyright 2014 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"fmt"
    17  	"math/rand"
    18  	"sort"
    19  	"strings"
    20  	"time"
    21  
    22  	"github.com/cockroachdb/cockroach/pkg/base"
    23  	"github.com/cockroachdb/cockroach/pkg/clusterversion"
    24  	"github.com/cockroachdb/cockroach/pkg/keys"
    25  	"github.com/cockroachdb/cockroach/pkg/kv"
    26  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
    27  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
    28  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    29  	"github.com/cockroachdb/cockroach/pkg/rpc"
    30  	"github.com/cockroachdb/cockroach/pkg/rpc/nodedialer"
    31  	"github.com/cockroachdb/cockroach/pkg/settings/cluster"
    32  	"github.com/cockroachdb/cockroach/pkg/storage"
    33  	"github.com/cockroachdb/cockroach/pkg/util/contextutil"
    34  	"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
    35  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    36  	"github.com/cockroachdb/cockroach/pkg/util/log"
    37  	"github.com/cockroachdb/cockroach/pkg/util/protoutil"
    38  	"github.com/cockroachdb/cockroach/pkg/util/retry"
    39  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    40  	"github.com/cockroachdb/cockroach/pkg/util/uuid"
    41  	"github.com/cockroachdb/errors"
    42  	"github.com/cockroachdb/logtags"
    43  	"go.etcd.io/etcd/raft"
    44  	"go.etcd.io/etcd/raft/raftpb"
    45  	"go.etcd.io/etcd/raft/tracker"
    46  )
    47  
     48  // AdminSplit divides the range into two ranges using args.SplitKey.
    49  func (r *Replica) AdminSplit(
    50  	ctx context.Context, args roachpb.AdminSplitRequest, reason string,
    51  ) (reply roachpb.AdminSplitResponse, _ *roachpb.Error) {
    52  	if len(args.SplitKey) == 0 {
    53  		return roachpb.AdminSplitResponse{}, roachpb.NewErrorf("cannot split range with no key provided")
    54  	}
    55  
    56  	err := r.executeAdminCommandWithDescriptor(ctx, func(desc *roachpb.RangeDescriptor) error {
    57  		var err error
    58  		reply, err = r.adminSplitWithDescriptor(ctx, args, desc, true /* delayable */, reason)
    59  		return err
    60  	})
    61  	return reply, err
    62  }
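
// An illustrative caller-side sketch (assumed, not taken from a call site;
// r and ctx are presumed in scope): the request header Key routes the
// request to this range, SplitKey names the split point, and a zero
// ExpirationTime means no sticky bit is set (see setStickyBit below).
//
//	args := roachpb.AdminSplitRequest{
//		RequestHeader:  roachpb.RequestHeader{Key: roachpb.Key("b")},
//		SplitKey:       roachpb.Key("b"),
//		ExpirationTime: hlc.Timestamp{}, // zero timestamp => no sticky bit
//	}
//	reply, pErr := r.AdminSplit(ctx, args, "manual split" /* reason */)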
    63  
    64  func maybeDescriptorChangedError(
    65  	desc *roachpb.RangeDescriptor, err error,
    66  ) (ok bool, expectedDesc *roachpb.RangeDescriptor) {
    67  	if detail := (*roachpb.ConditionFailedError)(nil); errors.As(err, &detail) {
    68  		// Provide a better message in the common case that the range being changed
    69  		// was already changed by a concurrent transaction.
    70  		var actualDesc roachpb.RangeDescriptor
    71  		if !detail.ActualValue.IsPresent() {
    72  			return true, nil
    73  		} else if err := detail.ActualValue.GetProto(&actualDesc); err == nil &&
    74  			desc.RangeID == actualDesc.RangeID && !desc.Equal(actualDesc) {
    75  			return true, &actualDesc
    76  		}
    77  	}
    78  	return false, nil
    79  }
    80  
    81  const (
     82  	descChangedRangeSubsumedErrorFmt = "descriptor changed: [expected] %s != [actual] nil (range subsumed)"
    83  	descChangedErrorFmt              = "descriptor changed: [expected] %s != [actual] %s"
    84  )
    85  
    86  func newDescChangedError(desc, actualDesc *roachpb.RangeDescriptor) error {
    87  	if actualDesc == nil {
    88  		return errors.Newf(descChangedRangeSubsumedErrorFmt, desc)
    89  	}
    90  	return errors.Newf(descChangedErrorFmt, desc, actualDesc)
    91  }
    92  
    93  func wrapDescChangedError(err error, desc, actualDesc *roachpb.RangeDescriptor) error {
    94  	if actualDesc == nil {
    95  		return errors.Wrapf(err, descChangedRangeSubsumedErrorFmt, desc)
    96  	}
    97  	return errors.Wrapf(err, descChangedErrorFmt, desc, actualDesc)
    98  }
    99  
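// splitSnapshotWarningStr returns an empty string when this replica is not
// the Raft leader or when every follower is replicating normally. Otherwise
// it returns a human-readable note for each lagging follower, e.g.
// "; r42/3 is waiting for a Raft snapshot", for inclusion in the split log
// message.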
   100  func splitSnapshotWarningStr(rangeID roachpb.RangeID, status *raft.Status) string {
   101  	var s string
   102  	if status != nil && status.RaftState == raft.StateLeader {
   103  		for replicaID, pr := range status.Progress {
   104  			if replicaID == status.Lead {
   105  				// TODO(tschottdorf): remove this line once we have picked up
   106  				// https://github.com/etcd-io/etcd/pull/10279
   107  				continue
   108  			}
   109  			if pr.State == tracker.StateReplicate {
   110  				// This follower is in good working order.
   111  				continue
   112  			}
   113  			s += fmt.Sprintf("; r%d/%d is ", rangeID, replicaID)
   114  			switch pr.State {
   115  			case tracker.StateSnapshot:
   116  				// If the Raft snapshot queue is backed up, replicas can spend
   117  				// minutes or worse until they are caught up.
   118  				s += "waiting for a Raft snapshot"
   119  			case tracker.StateProbe:
   120  				// Assuming the split has already been delayed for a little bit,
   121  				// seeing a follower that is probing hints at some problem with
   122  				// Raft or Raft message delivery. (Of course it's possible that
   123  				// the follower *just* entered probing state).
   124  				s += "being probed (may or may not need a Raft snapshot)"
   125  			default:
   126  				// Future proofing.
   127  				s += "in unknown state " + pr.State.String()
   128  			}
   129  		}
   130  	}
   131  	return s
   132  }
   133  
   134  // prepareSplitDescs returns the left and right descriptor of the split whose
   135  // right side is assigned rightRangeID and starts at splitKey. The supplied
   136  // expiration is the "sticky bit" stored on the right descriptor.
   137  func prepareSplitDescs(
   138  	ctx context.Context,
   139  	st *cluster.Settings,
   140  	rightRangeID roachpb.RangeID,
   141  	splitKey roachpb.RKey,
   142  	expiration hlc.Timestamp,
   143  	leftDesc *roachpb.RangeDescriptor,
   144  ) (*roachpb.RangeDescriptor, *roachpb.RangeDescriptor) {
   145  	// Create right hand side range descriptor.
   146  	rightDesc := roachpb.NewRangeDescriptor(rightRangeID, splitKey, leftDesc.EndKey, leftDesc.Replicas())
   147  
   148  	// Init updated version of existing range descriptor.
   149  	{
   150  		tmp := *leftDesc
   151  		leftDesc = &tmp
   152  	}
   153  
   154  	leftDesc.IncrementGeneration()
   155  	leftDesc.EndKey = splitKey
   156  
   157  	// Set the generation of the right hand side descriptor to match that of the
   158  	// (updated) left hand side. See the comment on the field for an explanation
   159  	// of why generations are useful.
   160  	rightDesc.Generation = leftDesc.Generation
   161  
   162  	setStickyBit(rightDesc, expiration)
   163  	return leftDesc, rightDesc
   164  }
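
// For instance (a hedged sketch; ctx, st, and desc are presumed in scope):
// splitting a descriptor spanning [a, z) at "m" yields a left side [a, m)
// and a right side [m, z), both carrying the incremented generation.
//
//	left, right := prepareSplitDescs(
//		ctx, st, 42 /* rightRangeID */, roachpb.RKey("m"), hlc.Timestamp{}, desc)
//	// left spans [a, m), right spans [m, z), and
//	// left.Generation == right.Generation.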
   165  
   166  func setStickyBit(desc *roachpb.RangeDescriptor, expiration hlc.Timestamp) {
   167  	// TODO(jeffreyxiao): Remove this check in 20.1.
   168  	// Note that the client API for splitting has expiration time as
   169  	// non-nullable, but the internal representation of a sticky bit is nullable
   170  	// for backwards compatibility. If expiration time is the zero timestamp, we
   171  	// must be sure not to set the sticky bit to the zero timestamp because the
   172  	// byte representation of setting the stickyBit to nil is different than
   173  	// setting it to hlc.Timestamp{}. This check ensures that CPuts would not
   174  	// fail on older versions.
   175  	if (expiration != hlc.Timestamp{}) {
   176  		desc.StickyBit = &expiration
   177  	}
   178  }
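
// A quick illustration of the guard above:
//
//	var d roachpb.RangeDescriptor
//	setStickyBit(&d, hlc.Timestamp{})            // no-op: d.StickyBit stays nil
//	setStickyBit(&d, hlc.Timestamp{WallTime: 1}) // d.StickyBit is now non-nil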
   179  
   180  func splitTxnAttempt(
   181  	ctx context.Context,
   182  	store *Store,
   183  	txn *kv.Txn,
   184  	rightRangeID roachpb.RangeID,
   185  	splitKey roachpb.RKey,
   186  	expiration hlc.Timestamp,
   187  	oldDesc *roachpb.RangeDescriptor,
   188  ) error {
   189  	txn.SetDebugName(splitTxnName)
   190  
   191  	_, dbDescValue, err := conditionalGetDescValueFromDB(ctx, txn, oldDesc.StartKey, checkDescsEqual(oldDesc))
   192  	if err != nil {
   193  		return err
   194  	}
   195  	// TODO(tbg): return desc from conditionalGetDescValueFromDB and don't pass
   196  	// in oldDesc any more (just the start key).
   197  	desc := oldDesc
   198  	oldDesc = nil // prevent accidental use
   199  
   200  	leftDesc, rightDesc := prepareSplitDescs(
   201  		ctx, store.ClusterSettings(), rightRangeID, splitKey, expiration, desc)
   202  
   203  	// Update existing range descriptor for left hand side of
   204  	// split. Note that we mutate the descriptor for the left hand
   205  	// side of the split first to locate the txn record there.
   206  	{
   207  		b := txn.NewBatch()
   208  		leftDescKey := keys.RangeDescriptorKey(leftDesc.StartKey)
   209  		if err := updateRangeDescriptor(b, leftDescKey, dbDescValue, leftDesc); err != nil {
   210  			return err
   211  		}
   212  		// Commit this batch first to ensure that the transaction record
   213  		// is created in the right place (split trigger relies on this).
   214  		// Sending the batch containing only the first write guarantees
   215  		// the transaction record is written first, preventing cases
   216  		// where splits are aborted early due to conflicts with meta
   217  		// intents (see #9265).
   218  		log.Event(ctx, "updating LHS descriptor")
   219  		if err := txn.Run(ctx, b); err != nil {
   220  			return err
   221  		}
   222  	}
   223  
   224  	// Log the split into the range event log.
   225  	if err := store.logSplit(ctx, txn, *leftDesc, *rightDesc); err != nil {
   226  		return err
   227  	}
   228  
   229  	b := txn.NewBatch()
   230  
   231  	// Write range descriptor for right hand side of the split.
   232  	rightDescKey := keys.RangeDescriptorKey(rightDesc.StartKey)
   233  	if err := updateRangeDescriptor(b, rightDescKey, nil, rightDesc); err != nil {
   234  		return err
   235  	}
   236  
   237  	// Update range descriptor addressing record(s).
   238  	if err := splitRangeAddressing(b, rightDesc, leftDesc); err != nil {
   239  		return err
   240  	}
   241  
   242  	// End the transaction manually, instead of letting RunTransaction
   243  	// loop do it, in order to provide a split trigger.
   244  	b.AddRawRequest(&roachpb.EndTxnRequest{
   245  		Commit: true,
   246  		InternalCommitTrigger: &roachpb.InternalCommitTrigger{
   247  			SplitTrigger: &roachpb.SplitTrigger{
   248  				LeftDesc:  *leftDesc,
   249  				RightDesc: *rightDesc,
   250  			},
   251  		},
   252  	})
   253  
   254  	// Commit txn with final batch (RHS descriptor and meta).
   255  	log.Event(ctx, "commit txn with batch containing RHS descriptor and meta records")
   256  	return txn.Run(ctx, b)
   257  }
   258  
   259  func splitTxnStickyUpdateAttempt(
   260  	ctx context.Context, txn *kv.Txn, desc *roachpb.RangeDescriptor, expiration hlc.Timestamp,
   261  ) error {
   262  	_, dbDescValue, err := conditionalGetDescValueFromDB(ctx, txn, desc.StartKey, checkDescsEqual(desc))
   263  	if err != nil {
   264  		return err
   265  	}
   266  	newDesc := *desc
   267  	setStickyBit(&newDesc, expiration)
   268  
   269  	b := txn.NewBatch()
   270  	descKey := keys.RangeDescriptorKey(desc.StartKey)
   271  	if err := updateRangeDescriptor(b, descKey, dbDescValue, &newDesc); err != nil {
   272  		return err
   273  	}
   274  	if err := updateRangeAddressing(b, &newDesc); err != nil {
   275  		return err
   276  	}
   277  	// End the transaction manually, instead of letting RunTransaction loop
   278  	// do it, in order to provide a sticky bit trigger.
   279  	b.AddRawRequest(&roachpb.EndTxnRequest{
   280  		Commit: true,
   281  		InternalCommitTrigger: &roachpb.InternalCommitTrigger{
   282  			StickyBitTrigger: &roachpb.StickyBitTrigger{
   283  				StickyBit: newDesc.GetStickyBit(),
   284  			},
   285  		},
   286  	})
   287  	return txn.Run(ctx, b)
   288  }
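
// This helper is driven from a retryable transaction, as in
// adminSplitWithDescriptor below:
//
//	err := r.store.DB().Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
//		return splitTxnStickyUpdateAttempt(ctx, txn, desc, args.ExpirationTime)
//	})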
   289  
    290  // adminSplitWithDescriptor divides the range into two ranges, using
   291  // either args.SplitKey (if provided) or an internally computed key that aims
   292  // to roughly equipartition the range by size. The split is done inside of a
   293  // distributed txn which writes updated left and new right hand side range
   294  // descriptors, and updates the range addressing metadata. The handover of
   295  // responsibility for the reassigned key range is carried out seamlessly
   296  // through a split trigger carried out as part of the commit of that
   297  // transaction.
   298  //
   299  // The supplied RangeDescriptor is used as a form of optimistic lock. An
   300  // operation which might split a range should obtain a copy of the range's
   301  // current descriptor before making the decision to split. If the decision is
   302  // affirmative the descriptor is passed to AdminSplit, which performs a
   303  // Conditional Put on the RangeDescriptor to ensure that no other operation has
   304  // modified the range in the time the decision was being made.
   305  // TODO(tschottdorf): should assert that split key is not a local key.
   306  //
   307  // See the comment on splitTrigger for details on the complexities.
   308  func (r *Replica) adminSplitWithDescriptor(
   309  	ctx context.Context,
   310  	args roachpb.AdminSplitRequest,
   311  	desc *roachpb.RangeDescriptor,
   312  	delayable bool,
   313  	reason string,
   314  ) (roachpb.AdminSplitResponse, error) {
   315  	var err error
   316  	// The split queue doesn't care about the set of replicas, so if we somehow
   317  	// are being handed one that's in a joint state, finalize that before
   318  	// continuing.
   319  	desc, err = maybeLeaveAtomicChangeReplicas(ctx, r.store, desc)
   320  	if err != nil {
   321  		return roachpb.AdminSplitResponse{}, err
   322  	}
   323  
   324  	var reply roachpb.AdminSplitResponse
   325  
   326  	// Determine split key if not provided with args. This scan is
   327  	// allowed to be relatively slow because admin commands don't block
   328  	// other commands.
   329  	log.Event(ctx, "split begins")
   330  	var splitKey roachpb.RKey
   331  	{
   332  		var foundSplitKey roachpb.Key
   333  		if len(args.SplitKey) == 0 {
   334  			// Find a key to split by size.
   335  			var err error
   336  			targetSize := r.GetMaxBytes() / 2
   337  			foundSplitKey, err = storage.MVCCFindSplitKey(
   338  				ctx, r.store.engine, desc.StartKey, desc.EndKey, targetSize)
   339  			if err != nil {
   340  				return reply, errors.Errorf("unable to determine split key: %s", err)
   341  			}
   342  			if foundSplitKey == nil {
   343  				// No suitable split key could be found.
   344  				return reply, unsplittableRangeError{}
   345  			}
   346  		} else {
   347  			// If the key that routed this request to this range is now out of this
   348  			// range's bounds, return an error for the client to try again on the
   349  			// correct range.
   350  			if !kvserverbase.ContainsKey(desc, args.Key) {
   351  				return reply, roachpb.NewRangeKeyMismatchError(args.Key, args.Key, desc)
   352  			}
   353  			foundSplitKey = args.SplitKey
   354  		}
   355  
   356  		if !kvserverbase.ContainsKey(desc, foundSplitKey) {
   357  			return reply, errors.Errorf("requested split key %s out of bounds of %s", args.SplitKey, r)
   358  		}
   359  
   360  		var err error
   361  		splitKey, err = keys.Addr(foundSplitKey)
   362  		if err != nil {
   363  			return reply, err
   364  		}
   365  		if !splitKey.Equal(foundSplitKey) {
   366  			return reply, errors.Errorf("cannot split range at range-local key %s", splitKey)
   367  		}
   368  		if !storage.IsValidSplitKey(foundSplitKey) {
   369  			return reply, errors.Errorf("cannot split range at key %s", splitKey)
   370  		}
   371  	}
   372  
   373  	// If the range starts at the splitKey, we treat the AdminSplit
   374  	// as a no-op and return success instead of throwing an error.
   375  	if desc.StartKey.Equal(splitKey) {
   376  		if len(args.SplitKey) == 0 {
   377  			log.Fatal(ctx, "MVCCFindSplitKey returned start key of range")
   378  		}
   379  		log.Event(ctx, "range already split")
   380  		// Even if the range is already split, we should still update the sticky
   381  		// bit if it has a later expiration time.
   382  		if desc.GetStickyBit().Less(args.ExpirationTime) {
   383  			err := r.store.DB().Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   384  				return splitTxnStickyUpdateAttempt(ctx, txn, desc, args.ExpirationTime)
   385  			})
   386  			// The ConditionFailedError can occur because the descriptors acting as
   387  			// expected values in the CPuts used to update the range descriptor are
   388  			// picked outside the transaction. Return ConditionFailedError in the
   389  			// error detail so that the command can be retried.
   390  			if ok, actualDesc := maybeDescriptorChangedError(desc, err); ok {
   391  				// NB: we have to wrap the existing error here as consumers of this code
   392  				// look at the root cause to sniff out the changed descriptor.
   393  				err = &benignError{wrapDescChangedError(err, desc, actualDesc)}
   394  			}
   395  			return reply, err
   396  		}
   397  		return reply, nil
   398  	}
   399  	log.Event(ctx, "found split key")
   400  
   401  	// Create right hand side range descriptor.
   402  	rightRangeID, err := r.store.AllocateRangeID(ctx)
   403  	if err != nil {
   404  		return reply, errors.Wrap(err, "unable to allocate range id for right hand side")
   405  	}
   406  
   407  	var extra string
   408  	if delayable {
   409  		extra += maybeDelaySplitToAvoidSnapshot(ctx, (*splitDelayHelper)(r))
   410  	}
   411  	extra += splitSnapshotWarningStr(r.RangeID, r.RaftStatus())
   412  
   413  	log.Infof(ctx, "initiating a split of this range at key %s [r%d] (%s)%s",
   414  		splitKey.StringWithDirs(nil /* valDirs */, 50 /* maxLen */), rightRangeID, reason, extra)
   415  
   416  	if err := r.store.DB().Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   417  		return splitTxnAttempt(ctx, r.store, txn, rightRangeID, splitKey, args.ExpirationTime, desc)
   418  	}); err != nil {
   419  		// The ConditionFailedError can occur because the descriptors acting
   420  		// as expected values in the CPuts used to update the left or right
   421  		// range descriptors are picked outside the transaction. Return
   422  		// ConditionFailedError in the error detail so that the command can be
   423  		// retried.
   424  		if ok, actualDesc := maybeDescriptorChangedError(desc, err); ok {
   425  			// NB: we have to wrap the existing error here as consumers of this code
   426  			// look at the root cause to sniff out the changed descriptor.
   427  			err = &benignError{wrapDescChangedError(err, desc, actualDesc)}
   428  		}
   429  		return reply, errors.Wrapf(err, "split at key %s failed", splitKey)
   430  	}
   431  	return reply, nil
   432  }
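
// As an example of the range-local key check above (a sketch; RKey("m")
// stands in for any ordinary key): addressing a range-local key such as a
// range descriptor key resolves to the global key it is anchored at, so the
// round trip through keys.Addr does not return the original bytes and the
// split is rejected.
//
//	addr, _ := keys.Addr(keys.RangeDescriptorKey(roachpb.RKey("m")))
//	// addr is "m", which differs from the range-local input key, so the
//	// splitKey.Equal(foundSplitKey) check above would fail.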
   433  
   434  // AdminUnsplit removes the sticky bit of the range specified by the
   435  // args.Key.
   436  func (r *Replica) AdminUnsplit(
   437  	ctx context.Context, args roachpb.AdminUnsplitRequest, reason string,
   438  ) (roachpb.AdminUnsplitResponse, *roachpb.Error) {
   439  	var reply roachpb.AdminUnsplitResponse
   440  	err := r.executeAdminCommandWithDescriptor(ctx, func(desc *roachpb.RangeDescriptor) error {
   441  		var err error
   442  		reply, err = r.adminUnsplitWithDescriptor(ctx, args, desc, reason)
   443  		return err
   444  	})
   445  	return reply, err
   446  }
   447  
   448  func (r *Replica) adminUnsplitWithDescriptor(
   449  	ctx context.Context,
   450  	args roachpb.AdminUnsplitRequest,
   451  	desc *roachpb.RangeDescriptor,
   452  	reason string,
   453  ) (roachpb.AdminUnsplitResponse, error) {
   454  	var reply roachpb.AdminUnsplitResponse
   455  	if !bytes.Equal(desc.StartKey.AsRawKey(), args.Header().Key) {
   456  		return reply, errors.Errorf("key %s is not the start of a range", args.Header().Key)
   457  	}
   458  
   459  	// If the range's sticky bit is already hlc.Timestamp{}, we treat the unsplit
   460  	// command as a no-op and return success instead of throwing an error. On
   461  	// mixed version clusters that don't support StickyBit, all range descriptor
   462  	// sticky bits are guaranteed to be nil, so we can skip checking the cluster
   463  	// version.
   464  	if (desc.GetStickyBit() == hlc.Timestamp{}) {
   465  		return reply, nil
   466  	}
   467  
   468  	if err := r.store.DB().Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
   469  		_, dbDescValue, err := conditionalGetDescValueFromDB(ctx, txn, desc.StartKey, checkDescsEqual(desc))
   470  		if err != nil {
   471  			return err
   472  		}
   473  
   474  		newDesc := *desc
   475  		// Use nil instead of &zero until 20.1; this field is new in 19.2. We
   476  		// could use &zero here because the sticky bit will never be populated
   477  		// before the cluster version reaches 19.2 and the early return above
    478  		// already handles that case, but nothing would be gained by doing so.
   479  		newDesc.StickyBit = nil
   480  		descKey := keys.RangeDescriptorKey(newDesc.StartKey)
   481  
   482  		b := txn.NewBatch()
   483  		if err := updateRangeDescriptor(b, descKey, dbDescValue, &newDesc); err != nil {
   484  			return err
   485  		}
   486  		if err := updateRangeAddressing(b, &newDesc); err != nil {
   487  			return err
   488  		}
   489  		// End the transaction manually in order to provide a sticky bit trigger.
   490  		b.AddRawRequest(&roachpb.EndTxnRequest{
   491  			Commit: true,
   492  			InternalCommitTrigger: &roachpb.InternalCommitTrigger{
   493  				StickyBitTrigger: &roachpb.StickyBitTrigger{
   494  					// Setting StickyBit to the zero timestamp ensures that it is always
   495  					// eligible for automatic merging.
   496  					StickyBit: hlc.Timestamp{},
   497  				},
   498  			},
   499  		})
   500  		return txn.Run(ctx, b)
   501  	}); err != nil {
   502  		// The ConditionFailedError can occur because the descriptors acting as
   503  		// expected values in the CPuts used to update the range descriptor are
   504  		// picked outside the transaction. Return ConditionFailedError in the error
   505  		// detail so that the command can be retried.
   506  		if ok, actualDesc := maybeDescriptorChangedError(desc, err); ok {
   507  			// NB: we have to wrap the existing error here as consumers of this code
   508  			// look at the root cause to sniff out the changed descriptor.
   509  			err = &benignError{wrapDescChangedError(err, desc, actualDesc)}
   510  		}
   511  		return reply, err
   512  	}
   513  	return reply, nil
   514  }
   515  
   516  // executeAdminCommandWithDescriptor wraps a read-modify-write operation for RangeDescriptors in a
   517  // retry loop.
   518  func (r *Replica) executeAdminCommandWithDescriptor(
   519  	ctx context.Context, updateDesc func(*roachpb.RangeDescriptor) error,
   520  ) *roachpb.Error {
   521  	// Retry forever as long as we see errors we know will resolve.
   522  	retryOpts := base.DefaultRetryOptions()
   523  	// Randomize quite a lot just in case someone else also interferes with us
   524  	// in a retry loop. Note that this is speculative; there wasn't an incident
   525  	// that suggested this.
   526  	retryOpts.RandomizationFactor = 0.5
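	// Seed lastErr with the context's error so that, if the context is already
	// canceled and the retry loop below never runs, the caller sees that error
	// rather than a nil error.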
   527  	lastErr := ctx.Err()
   528  	for retryable := retry.StartWithCtx(ctx, retryOpts); retryable.Next(); {
   529  		// The replica may have been destroyed since the start of the retry loop.
   530  		// We need to explicitly check this condition. Having a valid lease, as we
   531  		// verify below, does not imply that the range still exists: even after a
   532  		// range has been merged into its left-hand neighbor, its final lease
   533  		// (i.e., the lease we have in r.mu.state.Lease) can remain valid
   534  		// indefinitely.
   535  		if _, err := r.IsDestroyed(); err != nil {
   536  			return roachpb.NewError(err)
   537  		}
   538  
   539  		// Admin commands always require the range lease to begin (see
   540  		// executeAdminBatch), but we may have lost it while in this retry loop.
   541  		// Without the lease, a replica's local descriptor can be arbitrarily
   542  		// stale, which will result in a ConditionFailedError. To avoid this, we
   543  		// make sure that we still have the lease before each attempt.
   544  		if _, pErr := r.redirectOnOrAcquireLease(ctx); pErr != nil {
   545  			return pErr
   546  		}
   547  
   548  		lastErr = updateDesc(r.Desc())
   549  		// On seeing a ConditionFailedError or an AmbiguousResultError, retry the
   550  		// command with the updated descriptor.
   551  		if !errors.HasType(lastErr, (*roachpb.ConditionFailedError)(nil)) &&
   552  			!errors.HasType(lastErr, (*roachpb.AmbiguousResultError)(nil)) {
   553  			break
   554  		}
   555  	}
   556  	return roachpb.NewError(lastErr)
   557  }
   558  
   559  // AdminMerge extends this range to subsume the range that comes next
   560  // in the key space. The merge is performed inside of a distributed
   561  // transaction which writes the left hand side range descriptor (the
   562  // subsuming range) and deletes the range descriptor for the right
   563  // hand side range (the subsumed range). It also updates the range
   564  // addressing metadata. The handover of responsibility for the
   565  // reassigned key range is carried out seamlessly through a merge
   566  // trigger carried out as part of the commit of that transaction. A
   567  // merge requires that the two ranges are collocated on the same set
   568  // of replicas.
   569  //
   570  // The supplied RangeDescriptor is used as a form of optimistic lock. See the
    571  // comment on AdminSplit for more information on this pattern.
   572  func (r *Replica) AdminMerge(
   573  	ctx context.Context, args roachpb.AdminMergeRequest, reason string,
   574  ) (roachpb.AdminMergeResponse, *roachpb.Error) {
   575  	var reply roachpb.AdminMergeResponse
   576  
   577  	runMergeTxn := func(txn *kv.Txn) error {
   578  		log.Event(ctx, "merge txn begins")
   579  		txn.SetDebugName(mergeTxnName)
   580  
   581  		// Observe the commit timestamp to force a client-side retry. See the
   582  		// comment on the retry loop after this closure for details.
   583  		//
   584  		// TODO(benesch): expose a proper API for preventing the fast path.
   585  		_ = txn.CommitTimestamp()
   586  
   587  		// Pipelining might send QueryIntent requests to the RHS after the RHS has
   588  		// noticed the merge and started blocking all traffic. This causes the merge
   589  		// transaction to deadlock. Just turn pipelining off; the structure of the
   590  		// merge transaction means pipelining provides no performance benefit
   591  		// anyway.
   592  		if err := txn.DisablePipelining(); err != nil {
   593  			return err
   594  		}
   595  
   596  		// NB: reads do NOT impact transaction record placement.
   597  
   598  		origLeftDesc := r.Desc()
   599  		if origLeftDesc.EndKey.Equal(roachpb.RKeyMax) {
   600  			// Merging the final range doesn't make sense.
   601  			return errors.New("cannot merge final range")
   602  		}
   603  
   604  		_, dbOrigLeftDescValue, err := conditionalGetDescValueFromDB(ctx, txn, origLeftDesc.StartKey, checkDescsEqual(origLeftDesc))
   605  		if err != nil {
   606  			return err
   607  		}
   608  
   609  		// Ensure that every current replica of the LHS has been initialized.
   610  		// Otherwise there is a rare race where the replica GC queue can GC a
   611  		// replica of the RHS too early. The comment on
   612  		// TestStoreRangeMergeUninitializedLHSFollower explains the situation in full.
   613  		if err := waitForReplicasInit(
   614  			ctx, r.store.cfg.NodeDialer, origLeftDesc.RangeID, origLeftDesc.Replicas().All(),
   615  		); err != nil {
   616  			return errors.Wrap(err, "waiting for all left-hand replicas to initialize")
   617  		}
   618  
   619  		// Do a consistent read of the right hand side's range descriptor.
   620  		var rightDesc roachpb.RangeDescriptor
   621  		rightDescKey := keys.RangeDescriptorKey(origLeftDesc.EndKey)
   622  		dbRightDescKV, err := txn.Get(ctx, rightDescKey)
   623  		if err != nil {
   624  			return err
   625  		}
   626  		if err := dbRightDescKV.ValueProto(&rightDesc); err != nil {
   627  			return err
   628  		}
   629  
   630  		// Verify that the two ranges are mergeable.
   631  		if !bytes.Equal(origLeftDesc.EndKey, rightDesc.StartKey) {
   632  			// Should never happen, but just in case.
   633  			return errors.Errorf("ranges are not adjacent; %s != %s", origLeftDesc.EndKey, rightDesc.StartKey)
   634  		}
    635  		// For simplicity, don't handle learner replicas or joint states; expect
    636  		// the caller to resolve them first. (Defensively, we check that there
    637  		// are no non-voter replicas, in case some third type is later added.)
   638  		// This behavior can be changed later if the complexity becomes worth
   639  		// it, but it's not right now.
   640  		//
   641  		// NB: the merge queue transitions out of any joint states and removes
   642  		// any learners it sees. It's sort of silly that we don't do that here
   643  		// instead; effectively any caller of AdminMerge that is not the merge
   644  		// queue won't be able to recover from these cases (though the replicate
   645  		// queues should fix things up quickly).
   646  		lReplicas, rReplicas := origLeftDesc.Replicas(), rightDesc.Replicas()
   647  
   648  		predFullVoter := func(rDesc roachpb.ReplicaDescriptor) bool {
   649  			return rDesc.GetType() == roachpb.VOTER_FULL
   650  		}
   651  		if len(lReplicas.Filter(predFullVoter)) != len(lReplicas.All()) {
   652  			return errors.Errorf("cannot merge range with non-voter replicas on lhs: %s", lReplicas)
   653  		}
   654  		if len(rReplicas.Filter(predFullVoter)) != len(rReplicas.All()) {
   655  			return errors.Errorf("cannot merge range with non-voter replicas on rhs: %s", rReplicas)
   656  		}
   657  		if !replicaSetsEqual(lReplicas.All(), rReplicas.All()) {
   658  			return errors.Errorf("ranges not collocated; %s != %s", lReplicas, rReplicas)
   659  		}
   660  		mergeReplicas := lReplicas.All()
   661  
   662  		updatedLeftDesc := *origLeftDesc
   663  		// lhs.Generation = max(rhs.Generation, lhs.Generation)+1.
    664  		// See the comment on the Generation field for why generations are useful.
   665  		if updatedLeftDesc.Generation < rightDesc.Generation {
   666  			updatedLeftDesc.Generation = rightDesc.Generation
   667  		}
   668  		updatedLeftDesc.IncrementGeneration()
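		// For example, lhs.Generation == 3 and rhs.Generation == 5 produce a
		// merged Generation of 6 == max(3, 5)+1.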
   669  		updatedLeftDesc.EndKey = rightDesc.EndKey
   670  		log.Infof(ctx, "initiating a merge of %s into this range (%s)", &rightDesc, reason)
   671  
   672  		// Update the range descriptor for the receiving range. It is important
   673  		// (for transaction record placement) that the first write inside the
   674  		// transaction is this conditional put to change the left hand side's
   675  		// descriptor end key.
   676  		{
   677  			b := txn.NewBatch()
   678  			leftDescKey := keys.RangeDescriptorKey(updatedLeftDesc.StartKey)
   679  			if err := updateRangeDescriptor(
   680  				b, leftDescKey, dbOrigLeftDescValue, &updatedLeftDesc,
   681  			); err != nil {
   682  				return err
   683  			}
   684  			// Commit this batch on its own to ensure that the transaction record
   685  			// is created in the right place (our triggers rely on this).
   686  			log.Event(ctx, "updating LHS descriptor")
   687  			if err := txn.Run(ctx, b); err != nil {
   688  				return err
   689  			}
   690  		}
   691  
   692  		// Log the merge into the range event log.
   693  		// TODO(spencer): event logging API should accept a batch
   694  		// instead of a transaction; there's no reason this logging
   695  		// shouldn't be done in parallel via the batch with the updated
   696  		// range addressing.
   697  		if err := r.store.logMerge(ctx, txn, updatedLeftDesc, rightDesc); err != nil {
   698  			return err
   699  		}
   700  
   701  		b := txn.NewBatch()
   702  
   703  		// Update the meta addressing records.
   704  		if err := mergeRangeAddressing(b, origLeftDesc, &updatedLeftDesc); err != nil {
   705  			return err
   706  		}
   707  
   708  		// Remove the range descriptor for the deleted range.
   709  		if err := updateRangeDescriptor(b, rightDescKey, dbRightDescKV.Value, nil); err != nil {
   710  			return err
   711  		}
   712  
   713  		// Send off this batch, ensuring that intents are placed on both the local
   714  		// copy and meta2's copy of the right-hand side range descriptor before we
   715  		// send the Subsume request below. This is the precondition for sending a
   716  		// Subsume request; see the godoc on batcheval.Subsume for details.
   717  		if err := txn.Run(ctx, b); err != nil {
   718  			return err
   719  		}
   720  
   721  		// Intents have been placed, so the merge is now in its critical phase. Get
   722  		// a consistent view of the data from the right-hand range. If the merge
   723  		// commits, we'll write this data to the left-hand range in the merge
   724  		// trigger.
   725  		br, pErr := kv.SendWrapped(ctx, r.store.DB().NonTransactionalSender(),
   726  			&roachpb.SubsumeRequest{
   727  				RequestHeader: roachpb.RequestHeader{Key: rightDesc.StartKey.AsRawKey()},
   728  				LeftDesc:      *origLeftDesc,
   729  				RightDesc:     rightDesc,
   730  			})
   731  		if pErr != nil {
   732  			return pErr.GoError()
   733  		}
   734  		rhsSnapshotRes := br.(*roachpb.SubsumeResponse)
   735  
   736  		err = waitForApplication(
   737  			ctx, r.store.cfg.NodeDialer, rightDesc.RangeID, mergeReplicas,
   738  			rhsSnapshotRes.LeaseAppliedIndex)
   739  		if err != nil {
   740  			return errors.Wrap(err, "waiting for all right-hand replicas to catch up")
   741  		}
   742  
   743  		// Successful subsume, so we're guaranteed that the right-hand range will
   744  		// not serve another request unless this transaction aborts. End the
   745  		// transaction manually in order to provide a merge trigger.
   746  		b = txn.NewBatch()
   747  		b.AddRawRequest(&roachpb.EndTxnRequest{
   748  			Commit: true,
   749  			InternalCommitTrigger: &roachpb.InternalCommitTrigger{
   750  				MergeTrigger: &roachpb.MergeTrigger{
   751  					LeftDesc:       updatedLeftDesc,
   752  					RightDesc:      rightDesc,
   753  					RightMVCCStats: rhsSnapshotRes.MVCCStats,
   754  					FreezeStart:    rhsSnapshotRes.FreezeStart,
   755  				},
   756  			},
   757  		})
   758  		log.Event(ctx, "attempting commit")
   759  		return txn.Run(ctx, b)
   760  	}
   761  
   762  	// If the merge transaction encounters an error, we need to trigger a full
   763  	// abort and try again with a new transaction. Why? runMergeTxn has the side
   764  	// effect of sending a Subsume request to the right-hand range, which blocks
   765  	// the right-hand range from serving any traffic until the transaction commits
   766  	// or aborts. If we retry using the same transaction (i.e., a "transaction
   767  	// restart"), we'll send requests to the blocked right-hand range and
   768  	// deadlock. The right-hand range will see that the transaction is still
   769  	// pending and refuse to respond, but the transaction cannot commit until the
   770  	// right-hand range responds. By instead marking the transaction as aborted,
   771  	// we'll unlock the right-hand range, giving the next, fresh transaction a
   772  	// chance to succeed.
   773  	//
   774  	// Note that client.DB.Txn performs retries using the same transaction, so we
   775  	// have to use our own retry loop.
   776  	for {
   777  		txn := kv.NewTxn(ctx, r.store.DB(), r.NodeID())
   778  		err := runMergeTxn(txn)
   779  		if err != nil {
   780  			txn.CleanupOnError(ctx, err)
   781  		}
   782  		if !errors.HasType(err, (*roachpb.TransactionRetryWithProtoRefreshError)(nil)) {
   783  			if err != nil {
   784  				return reply, roachpb.NewErrorf("merge failed: %s", err)
   785  			}
   786  			return reply, nil
   787  		}
   788  	}
   789  }
   790  
   791  func waitForApplication(
   792  	ctx context.Context,
   793  	dialer *nodedialer.Dialer,
   794  	rangeID roachpb.RangeID,
   795  	replicas []roachpb.ReplicaDescriptor,
   796  	leaseIndex uint64,
   797  ) error {
   798  	return contextutil.RunWithTimeout(ctx, "wait for application", 5*time.Second, func(ctx context.Context) error {
   799  		g := ctxgroup.WithContext(ctx)
   800  		for _, repl := range replicas {
   801  			repl := repl // copy for goroutine
   802  			g.GoCtx(func(ctx context.Context) error {
   803  				conn, err := dialer.Dial(ctx, repl.NodeID, rpc.DefaultClass)
   804  				if err != nil {
   805  					return errors.Wrapf(err, "could not dial n%d", repl.NodeID)
   806  				}
   807  				_, err = NewPerReplicaClient(conn).WaitForApplication(ctx, &WaitForApplicationRequest{
   808  					StoreRequestHeader: StoreRequestHeader{NodeID: repl.NodeID, StoreID: repl.StoreID},
   809  					RangeID:            rangeID,
   810  					LeaseIndex:         leaseIndex,
   811  				})
   812  				return err
   813  			})
   814  		}
   815  		return g.Wait()
   816  	})
   817  }
   818  
   819  // waitForReplicasInit blocks until it has proof that the replicas listed in
   820  // desc are initialized on their respective stores. It may return a false
   821  // negative, i.e., claim that a replica is uninitialized when it is, in fact,
   822  // initialized, but it will never return a false positive.
   823  func waitForReplicasInit(
   824  	ctx context.Context,
   825  	dialer *nodedialer.Dialer,
   826  	rangeID roachpb.RangeID,
   827  	replicas []roachpb.ReplicaDescriptor,
   828  ) error {
   829  	return contextutil.RunWithTimeout(ctx, "wait for replicas init", 5*time.Second, func(ctx context.Context) error {
   830  		g := ctxgroup.WithContext(ctx)
   831  		for _, repl := range replicas {
   832  			repl := repl // copy for goroutine
   833  			g.GoCtx(func(ctx context.Context) error {
   834  				conn, err := dialer.Dial(ctx, repl.NodeID, rpc.DefaultClass)
   835  				if err != nil {
   836  					return errors.Wrapf(err, "could not dial n%d", repl.NodeID)
   837  				}
   838  				_, err = NewPerReplicaClient(conn).WaitForReplicaInit(ctx, &WaitForReplicaInitRequest{
   839  					StoreRequestHeader: StoreRequestHeader{NodeID: repl.NodeID, StoreID: repl.StoreID},
   840  					RangeID:            rangeID,
   841  				})
   842  				return err
   843  			})
   844  		}
   845  		return g.Wait()
   846  	})
   847  }
   848  
   849  type snapshotError struct {
   850  	// NB: don't implement Cause() on this type without also updating IsSnapshotError.
   851  	cause error
   852  }
   853  
   854  func (s *snapshotError) Error() string {
   855  	return fmt.Sprintf("snapshot failed: %s", s.cause.Error())
   856  }
   857  
   858  // IsSnapshotError returns true iff the error indicates a snapshot failed.
   859  func IsSnapshotError(err error) bool {
   860  	return errors.HasType(err, (*snapshotError)(nil))
   861  }
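
// For instance (illustrative only):
//
//	err := errors.Wrap(&snapshotError{cause: errors.New("refused")}, "adding learner")
//	_ = IsSnapshotError(err) // true: HasType unwraps to find *snapshotError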
   862  
   863  // ChangeReplicas atomically changes the replicas that are members of a range.
   864  // The change is performed in a distributed transaction and takes effect when
   865  // that transaction is committed. This transaction confirms that the supplied
   866  // RangeDescriptor is up to date and that the supplied slice of
   867  // ReplicationChanges is a valid transition, meaning that replicas being added
   868  // are not present, that replicas being removed are present, that no replica is
   869  // altered more than once, and that no attempt is made at removing the
   870  // leaseholder (which in particular implies that we can never remove all
   871  // replicas).
   872  //
   873  // The returned RangeDescriptor is the new value of the range's descriptor
   874  // following the successful commit of the transaction.
   875  //
   876  // In general, ChangeReplicas will carry out the following steps.
   877  //
   878  // 1. Run a distributed transaction that adds all new replicas as learner replicas.
   879  //    Learner replicas receive the log, but do not have voting rights. They are
   880  //    used to catch up these new replicas before turning them into voters, which
   881  //    is important for the continued availability of the range throughout the
   882  //    replication change. Learners are added (and removed) one by one due to a
   883  //    technicality (see https://github.com/cockroachdb/cockroach/pull/40268).
   884  //
   885  //    The distributed transaction updates both copies of the range descriptor
   886  //    (the one on the range and that in the meta ranges) to that effect, and
   887  //    commits with a special trigger instructing Raft (via ProposeConfChange) to
   888  //    tie a corresponding replication configuration change which goes into
   889  //    effect (on each replica) when the transaction commit is applied to the
   890  //    state. Applying the command also updates each replica's local view of
   891  //    the state to reflect the new descriptor.
   892  //
   893  //    If no replicas are being added, this first step is elided.
   894  //
   895  // 2. Send Raft snapshots to all learner replicas. This would happen
   896  //    automatically by the existing recovery mechanisms (raft snapshot queue), but
   897  //    it is done explicitly as a convenient way to ensure learners are caught up
   898  //    before the next step is entered. (We ensure that work is not duplicated
   899  //    between the snapshot queue and the explicit snapshot via the
   900  //    snapshotLogTruncationConstraints map). Snapshots are subject to both
   901  //    bandwidth rate limiting and throttling.
   902  //
   903  //    If no replicas are being added, this step is similarly elided.
   904  //
   905  // 3. Carry out a distributed transaction similar to that which added the
   906  //    learner replicas, except this time it (atomically) changes all learners to
   907  //    voters and removes any replicas for which this was requested; voters are
   908  //    demoted before actually being removed to avoid bug in etcd/raft:
   909  //    See https://github.com/cockroachdb/cockroach/pull/40268.
   910  //
    911  //    If only one replica is being added, raft can choose the simple
   912  //    configuration change protocol; otherwise it has to use joint consensus. In
   913  //    this latter mechanism, a first configuration change is made which results
   914  //    in a configuration ("joint configuration") in which a quorum of both the
   915  //    old replicas and the new replica sets is required for decision making.
   916  //    Transitioning into this joint configuration, the RangeDescriptor (which is
   917  //    the source of truth of the replication configuration) is updated with
   918  //    corresponding replicas of type VOTER_INCOMING and VOTER_OUTGOING.
   919  //    Immediately after committing this change, a second transition updates the
    920  //    descriptor with, and activates, the final configuration.
   921  //
   922  // Concretely, if the initial members of the range are s1/1, s2/2, and s3/3, and
    923  // an atomic membership change were to add s4/4 and s5/5 while removing s1/1 and
   924  // s2/2, the following range descriptors would form the overall transition:
   925  //
   926  // 1. s1/1 s2/2 s3/3 (VOTER_FULL is implied)
   927  // 2. s1/1 s2/2 s3/3 s4/4LEARNER
   928  // 3. s1/1 s2/2 s3/3 s4/4LEARNER s5/5LEARNER
   929  // 4. s1/1VOTER_DEMOTING s2/2VOTER_DEMOTING s3/3 s4/4VOTER_INCOMING s5/5VOTER_INCOMING
   930  // 5. s1/1LEARNER s2/2LEARNER s3/3 s4/4 s5/5
   931  // 6. s2/2LEARNER s3/3 s4/4 s5/5
   932  // 7. s3/3 s4/4 s5/5
   933  //
   934  // A replica that learns that it was removed will queue itself for replicaGC.
   935  // Note that a removed replica may never apply the configuration change removing
   936  // itself and thus this trigger may not fire. This is because said replica may
   937  // not have been a part of the quorum that committed the configuration change;
   938  // nodes that apply the change will stop sending messages to the removed
   939  // replica. At that point, the removed replica will typically campaign (since it
   940  // receives no more heartbeats from the leader) and its former peers respond via
   941  // a RaftGroupDeletedError (from the Raft transport) as a signal to queue to
   942  // replicaGC. This second mechanism fails if all peers have rapidly moved
    943  // elsewhere as well; in that last and rare case, the replica GC queue will
   944  // eventually discover the replica on its own; it has optimizations that handle
   945  // "abandoned-looking" replicas more eagerly than healthy ones.
   946  func (r *Replica) ChangeReplicas(
   947  	ctx context.Context,
   948  	desc *roachpb.RangeDescriptor,
   949  	priority SnapshotRequest_Priority,
   950  	reason kvserverpb.RangeLogEventReason,
   951  	details string,
   952  	chgs roachpb.ReplicationChanges,
   953  ) (updatedDesc *roachpb.RangeDescriptor, _ error) {
   954  	if desc == nil {
   955  		// TODO(tbg): is this check just FUD?
   956  		return nil, errors.Errorf("%s: the current RangeDescriptor must not be nil", r)
   957  	}
   958  
   959  	// We execute the change serially if we're not allowed to run atomic
   960  	// replication changes or if that was explicitly disabled.
   961  	st := r.ClusterSettings()
   962  	unroll := !st.Version.IsActive(ctx, clusterversion.VersionAtomicChangeReplicas) ||
   963  		!UseAtomicReplicationChanges.Get(&st.SV)
   964  
   965  	if unroll {
   966  		// Legacy behavior.
   967  		for i := range chgs {
   968  			var err error
   969  			desc, err = r.changeReplicasImpl(ctx, desc, priority, reason, details, chgs[i:i+1])
   970  			if err != nil {
   971  				return nil, err
   972  			}
   973  		}
   974  		return desc, nil
   975  	}
   976  	// Atomic replication change.
   977  	return r.changeReplicasImpl(ctx, desc, priority, reason, details, chgs)
   978  }
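
// A hedged sketch matching the s4/s5 example in the comment above; the
// priority and reason arguments are plausible illustrative values, not taken
// from a real call site.
//
//	chgs := roachpb.ReplicationChanges{
//		{ChangeType: roachpb.ADD_REPLICA, Target: roachpb.ReplicationTarget{NodeID: 4, StoreID: 4}},
//		{ChangeType: roachpb.ADD_REPLICA, Target: roachpb.ReplicationTarget{NodeID: 5, StoreID: 5}},
//		{ChangeType: roachpb.REMOVE_REPLICA, Target: roachpb.ReplicationTarget{NodeID: 1, StoreID: 1}},
//		{ChangeType: roachpb.REMOVE_REPLICA, Target: roachpb.ReplicationTarget{NodeID: 2, StoreID: 2}},
//	}
//	newDesc, err := r.ChangeReplicas(
//		ctx, r.Desc(), SnapshotRequest_REBALANCE, kvserverpb.ReasonRebalance, "", chgs)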
   979  
   980  func (r *Replica) changeReplicasImpl(
   981  	ctx context.Context,
   982  	desc *roachpb.RangeDescriptor,
   983  	priority SnapshotRequest_Priority,
   984  	reason kvserverpb.RangeLogEventReason,
   985  	details string,
   986  	chgs roachpb.ReplicationChanges,
   987  ) (updatedDesc *roachpb.RangeDescriptor, _ error) {
   988  	var err error
   989  	// If in a joint config, clean up. The assumption here is that the caller
   990  	// of ChangeReplicas didn't even realize that they were holding on to a
   991  	// joint descriptor and would rather not have to deal with that fact.
   992  	desc, err = maybeLeaveAtomicChangeReplicas(ctx, r.store, desc)
   993  	if err != nil {
   994  		return nil, err
   995  	}
   996  
   997  	if err := validateReplicationChanges(desc, chgs); err != nil {
   998  		return nil, err
   999  	}
  1000  
  1001  	if adds := chgs.Additions(); len(adds) > 0 {
  1002  		// Lock learner snapshots even before we run the ConfChange txn to add them
  1003  		// to prevent a race with the raft snapshot queue trying to send it first.
  1004  		// Note that this lock needs to cover sending the snapshots which happens in
  1005  		_ = r.atomicReplicationChange
  1006  		// which also has some more details on what's going on here.
  1007  		//
  1008  		// Also note that the lock only prevents the raft snapshot queue from
  1009  		// sending snapshots to learner replicas, it will still send them to voters.
  1010  		// There are more details about this locking in
  1011  		_ = (*raftSnapshotQueue)(nil).processRaftSnapshot
  1012  		// as well as a TODO about fixing all this to be less subtle and brittle.
  1013  		releaseSnapshotLockFn := r.lockLearnerSnapshot(ctx, adds)
  1014  		defer releaseSnapshotLockFn()
  1015  
  1016  		// For all newly added nodes, first add raft learner replicas. They accept raft traffic
  1017  		// (so they can catch up) but don't get to vote (so they don't affect quorum and thus
  1018  		// don't introduce fragility into the system). For details see:
  1019  		_ = roachpb.ReplicaDescriptors.Learners
  1020  		var err error
  1021  		desc, err = addLearnerReplicas(ctx, r.store, desc, reason, details, adds)
  1022  		if err != nil {
  1023  			return nil, err
  1024  		}
  1025  	}
  1026  
  1027  	// Catch up any learners, then run the atomic replication change that adds the
  1028  	// final voters and removes any undesirable replicas.
  1029  	desc, err = r.atomicReplicationChange(ctx, desc, priority, reason, details, chgs)
  1030  	if err != nil {
  1031  		// If the error occurred while transitioning out of an atomic replication change,
  1032  		// try again here with a fresh descriptor; this is a noop otherwise.
  1033  		if _, err := maybeLeaveAtomicChangeReplicas(ctx, r.store, r.Desc()); err != nil {
  1034  			return nil, err
  1035  		}
  1036  		if fn := r.store.cfg.TestingKnobs.ReplicaAddSkipLearnerRollback; fn != nil && fn() {
  1037  			return nil, err
  1038  		}
  1039  		// Don't leave a learner replica lying around if we didn't succeed in
  1040  		// promoting it to a voter.
  1041  		if targets := chgs.Additions(); len(targets) > 0 {
  1042  			log.Infof(ctx, "could not promote %v to voter, rolling back: %v", targets, err)
  1043  			for _, target := range targets {
  1044  				r.tryRollBackLearnerReplica(ctx, r.Desc(), target, reason, details)
  1045  			}
  1046  		}
  1047  		return nil, err
  1048  	}
  1049  	return desc, err
  1050  }
  1051  
  1052  // maybeLeaveAtomicChangeReplicas transitions out of the joint configuration if
  1053  // the descriptor indicates one. This involves running a distributed transaction
  1054  // updating said descriptor, the result of which will be returned. The
  1055  // descriptor returned from this method will contain replicas of type LEARNER
  1056  // and VOTER_FULL only.
  1057  func maybeLeaveAtomicChangeReplicas(
  1058  	ctx context.Context, store *Store, desc *roachpb.RangeDescriptor,
  1059  ) (*roachpb.RangeDescriptor, error) {
  1060  	// We want execChangeReplicasTxn to be able to make sure it's only tasked
  1061  	// with leaving a joint state when it's in one, so make sure we don't call
  1062  	// it if we're not.
  1063  	if !desc.Replicas().InAtomicReplicationChange() {
  1064  		return desc, nil
  1065  	}
  1066  	// NB: this is matched on in TestMergeQueueSeesLearner.
  1067  	log.Eventf(ctx, "transitioning out of joint configuration %s", desc)
  1068  
  1069  	// NB: reason and detail won't be used because no range log event will be
  1070  	// emitted.
  1071  	//
  1072  	// TODO(tbg): reconsider this.
  1073  	return execChangeReplicasTxn(
  1074  		ctx, store, desc, kvserverpb.ReasonUnknown /* unused */, "", nil, /* iChgs */
  1075  	)
  1076  }
  1077  
  1078  // maybeLeaveAtomicChangeReplicasAndRemoveLearners transitions out of the joint
  1079  // config (if there is one), and then removes all learners. After this function
  1080  // returns, all remaining replicas will be of type VOTER_FULL.
  1081  func maybeLeaveAtomicChangeReplicasAndRemoveLearners(
  1082  	ctx context.Context, store *Store, desc *roachpb.RangeDescriptor,
  1083  ) (*roachpb.RangeDescriptor, error) {
  1084  	desc, err := maybeLeaveAtomicChangeReplicas(ctx, store, desc)
  1085  	if err != nil {
  1086  		return nil, err
  1087  	}
  1088  	// Now the config isn't joint any more, but we may have demoted some voters
  1089  	// into learners. These learners should go as well.
  1090  
  1091  	learners := desc.Replicas().Learners()
  1092  	if len(learners) == 0 {
  1093  		return desc, nil
  1094  	}
  1095  	targets := make([]roachpb.ReplicationTarget, len(learners))
  1096  	for i := range learners {
  1097  		targets[i].NodeID = learners[i].NodeID
  1098  		targets[i].StoreID = learners[i].StoreID
  1099  	}
  1100  	log.VEventf(ctx, 2, `removing learner replicas %v from %v`, targets, desc)
  1101  	// NB: unroll the removals because at the time of writing, we can't atomically
  1102  	// remove multiple learners. This will be fixed in:
  1103  	//
  1104  	// https://github.com/cockroachdb/cockroach/pull/40268
  1105  	origDesc := desc
  1106  	for _, target := range targets {
  1107  		var err error
  1108  		desc, err = execChangeReplicasTxn(
  1109  			ctx, store, desc, kvserverpb.ReasonAbandonedLearner, "",
  1110  			[]internalReplicationChange{{target: target, typ: internalChangeTypeRemove}},
  1111  		)
  1112  		if err != nil {
  1113  			return nil, errors.Wrapf(err, `removing learners from %s`, origDesc)
  1114  		}
  1115  	}
  1116  	return desc, nil
  1117  }
  1118  
  1119  func validateReplicationChanges(
  1120  	desc *roachpb.RangeDescriptor, chgs roachpb.ReplicationChanges,
  1121  ) error {
  1122  	// First make sure that the changes don't self-overlap (i.e. we're not adding
  1123  	// a replica twice, or removing and immediately re-adding it).
  1124  	byNodeID := make(map[roachpb.NodeID]roachpb.ReplicationChange, len(chgs))
  1125  	for _, chg := range chgs {
  1126  		if _, ok := byNodeID[chg.Target.NodeID]; ok {
  1127  			return fmt.Errorf("changes %+v refer to n%d twice", chgs, chg.Target.NodeID)
  1128  		}
  1129  		byNodeID[chg.Target.NodeID] = chg
  1130  	}
  1131  
  1132  	// Then, check that we're not adding a second replica on nodes that already
  1133  	// have one, or "re-add" an existing replica. We delete from byNodeID so that
  1134  	// after this loop, it contains only StoreIDs that we haven't seen in desc.
  1135  	for _, rDesc := range desc.Replicas().All() {
  1136  		chg, ok := byNodeID[rDesc.NodeID]
  1137  		delete(byNodeID, rDesc.NodeID)
  1138  		if !ok || chg.ChangeType != roachpb.ADD_REPLICA {
  1139  			continue
  1140  		}
  1141  		// We're adding a replica that's already there. This isn't allowed, even
  1142  		// when the newly added one would be on a different store.
  1143  		if rDesc.StoreID != chg.Target.StoreID {
   1144  			return errors.Errorf("unable to add replica %v; node already has a replica in %s", chg.Target, desc)
  1145  		}
  1146  
  1147  		// Looks like we found a replica with the same store and node id. If the
  1148  		// replica is already a learner, then either some previous leaseholder was
  1149  		// trying to add it with the learner+snapshot+voter cycle and got
  1150  		// interrupted or else we hit a race between the replicate queue and
  1151  		// AdminChangeReplicas.
  1152  		if rDesc.GetType() == roachpb.LEARNER {
  1153  			return errors.Errorf(
  1154  				"unable to add replica %v which is already present as a learner in %s", chg.Target, desc)
  1155  		}
  1156  
  1157  		// Otherwise, we already had a full voter replica. Can't add another to
  1158  		// this store.
  1159  		return errors.Errorf("unable to add replica %v which is already present in %s", chg.Target, desc)
  1160  	}
  1161  
   1162  	// Any removals left in the map now refer to nonexistent replicas; we refuse them.
  1163  	for _, chg := range byNodeID {
  1164  		if chg.ChangeType != roachpb.REMOVE_REPLICA {
  1165  			continue
  1166  		}
  1167  		return errors.Errorf("removing %v which is not in %s", chg.Target, desc)
  1168  	}
  1169  	return nil
  1170  }
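
// For example, a change set that refers to the same node twice is rejected
// before any work is done (a sketch; desc is any descriptor):
//
//	chgs := roachpb.ReplicationChanges{
//		{ChangeType: roachpb.REMOVE_REPLICA, Target: roachpb.ReplicationTarget{NodeID: 1, StoreID: 1}},
//		{ChangeType: roachpb.ADD_REPLICA, Target: roachpb.ReplicationTarget{NodeID: 1, StoreID: 2}},
//	}
//	err := validateReplicationChanges(desc, chgs) // "changes ... refer to n1 twice"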
  1171  
  1172  // addLearnerReplicas adds learners to the given replication targets.
  1173  func addLearnerReplicas(
  1174  	ctx context.Context,
  1175  	store *Store,
  1176  	desc *roachpb.RangeDescriptor,
  1177  	reason kvserverpb.RangeLogEventReason,
  1178  	details string,
  1179  	targets []roachpb.ReplicationTarget,
  1180  ) (*roachpb.RangeDescriptor, error) {
  1181  	// TODO(tbg): we could add all learners in one go, but then we'd need to
  1182  	// do it as an atomic replication change (raft doesn't know which config
  1183  	// to apply the delta to, so we might be demoting more than one voter).
  1184  	// This isn't crazy, we just need to transition out of the joint config
  1185  	// before returning from this method, and it's unclear that it's worth
  1186  	// doing.
  1187  	for _, target := range targets {
  1188  		iChgs := []internalReplicationChange{{target: target, typ: internalChangeTypeAddLearner}}
  1189  		var err error
  1190  		desc, err = execChangeReplicasTxn(
  1191  			ctx, store, desc, reason, details, iChgs,
  1192  		)
  1193  		if err != nil {
  1194  			return nil, err
  1195  		}
  1196  	}
  1197  	return desc, nil
  1198  }
  1199  
  1200  // lockLearnerSnapshot stops the raft snapshot queue from sending snapshots to
  1201  // the soon-to-be added learner replicas to prevent duplicate snapshots from
  1202  // being sent. The lock is best effort only: it times out, and it is local to
  1203  // this node even though the raft snapshot queue may be running on a different
  1204  // node. An idempotent unlock function is returned.
  1205  func (r *Replica) lockLearnerSnapshot(
  1206  	ctx context.Context, additions []roachpb.ReplicationTarget,
  1207  ) (unlock func()) {
  1208  	// TODO(dan): The way this works is hacky, but it was added at the last minute
  1209  	// in 19.2 to work around a commit in etcd/raft that made this race more
  1210  	// likely. It'd be nice if all learner snapshots could be sent from a single
  1211  	// place.
  1212  	var lockUUIDs []uuid.UUID
  1213  	for _, addition := range additions {
  1214  		lockUUID := uuid.MakeV4()
  1215  		lockUUIDs = append(lockUUIDs, lockUUID)
  1216  		r.addSnapshotLogTruncationConstraint(ctx, lockUUID, 1, addition.StoreID)
  1217  	}
  1218  	return func() {
  1219  		now := timeutil.Now()
  1220  		for _, lockUUID := range lockUUIDs {
  1221  			r.completeSnapshotLogTruncationConstraint(ctx, lockUUID, now)
  1222  		}
  1223  	}
  1224  }
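
// A minimal sketch, not in the original file, of the intended pairing: take
// the best-effort lock before sending learner snapshots and release it when
// done. The caller shown here is hypothetical.
func exampleLockLearnerSnapshot(
	ctx context.Context, r *Replica, additions []roachpb.ReplicationTarget,
) {
	unlock := r.lockLearnerSnapshot(ctx, additions)
	// The returned unlock function is idempotent, so an unconditional defer
	// is safe even if the constraint has already timed out.
	defer unlock()
	// ... send snapshots to the new learners here ...
}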
  1225  
  1226  // atomicReplicationChange carries out the atomic membership change that
  1227  // finalizes the addition and/or removal of replicas. Any voters in the process
  1228  // of being added (as reflected by the replication changes) must have been added
  1229  // as learners already and will be caught up before being promoted to voters.
  1230  // Cluster version permitting, voter removals (from the replication changes)
  1231  // will preferably be carried out by first demoting to a learner instead of
  1232  // outright removal (this avoids a [raft-bug] that can lead to unavailability).
  1233  // All of this occurs in one atomic raft membership change which is carried out
  1234  // across two phases. On error, it is possible that the range is in the
  1235  // intermediate ("joint") configuration in which a quorum of both the old and
  1236  // new sets of voters is required. If a range is encountered in this state,
  1237  // maybeLeaveAtomicReplicationChange can fix this, but it is the caller's job to
  1238  // do this when necessary.
  1239  //
  1240  // The atomic membership change is carried out chiefly via the construction of a
  1241  // suitable ChangeReplicasTrigger, see prepareChangeReplicasTrigger for details.
  1242  //
  1243  // Contrary to the name, *all* membership changes go through this method, even
  1244  // those that add/remove only a single voter, though the simple protocol is used
  1245  // when this is opportune. Notably, demotions can never use the simple protocol,
  1246  // even if only a single voter is being demoted, due to a (liftable) limitation
  1247  // in etcd/raft.
  1248  //
  1249  // [raft-bug]: https://github.com/etcd-io/etcd/issues/11284
  1250  func (r *Replica) atomicReplicationChange(
  1251  	ctx context.Context,
  1252  	desc *roachpb.RangeDescriptor,
  1253  	priority SnapshotRequest_Priority,
  1254  	reason kvserverpb.RangeLogEventReason,
  1255  	details string,
  1256  	chgs roachpb.ReplicationChanges,
  1257  ) (*roachpb.RangeDescriptor, error) {
  1258  	// TODO(dan): We allow ranges with learner replicas to split, so in theory
  1259  	// this may want to detect that and retry, sending a snapshot and promoting
  1260  	// both sides.
  1261  
  1262  	iChgs := make([]internalReplicationChange, 0, len(chgs))
  1263  
  1264  	for _, target := range chgs.Additions() {
  1265  		iChgs = append(iChgs, internalReplicationChange{target: target, typ: internalChangeTypePromoteLearner})
  1266  		// All adds must be present as learners right now, and we send them
  1267  		// snapshots in anticipation of promoting them to voters.
  1268  		rDesc, ok := desc.GetReplicaDescriptor(target.StoreID)
  1269  		if !ok {
  1270  			return nil, errors.Errorf("programming error: replica %v not found in %v", target, desc)
  1271  		}
  1272  
  1273  		if rDesc.GetType() != roachpb.LEARNER {
  1274  			return nil, errors.Errorf("programming error: cannot promote replica of type %s", rDesc.Type)
  1275  		}
  1276  
  1277  		if fn := r.store.cfg.TestingKnobs.ReplicaSkipLearnerSnapshot; fn != nil && fn() {
  1278  			continue
  1279  		}
  1280  
  1281  		// Note that the raft snapshot queue will refuse to send a snapshot to a learner
  1282  		// replica if its store is already sending a snapshot to that replica. That
  1283  		// would race with this snapshot, except that we've put a (best effort) lock
  1284  		// on it before the conf change txn was run. This is best effort because the
  1285  		// lock can time out and the lock is local to this node, while the raft
  1286  		// leader could be on another node entirely (they're usually co-located but
  1287  		// this is not guaranteed).
  1288  		//
  1289  		// We originally tried always refusing to send snapshots from the raft
  1290  		// snapshot queue to learner replicas, but this turned out to be brittle.
  1291  		// First, if the snapshot failed, any attempt to use the learner's raft
  1292  		// group would hang until the replicate queue got around to cleaning up the
  1293  		// orphaned learner. Second, this tickled some bugs in etcd/raft around
  1294  		// switching between StateSnapshot and StateProbe. Even if we worked through
  1295  		// these, it would be susceptible to future similar issues.
  1296  		if err := r.sendSnapshot(ctx, rDesc, SnapshotRequest_LEARNER, priority); err != nil {
  1297  			return nil, err
  1298  		}
  1299  	}
  1300  
  1301  	if adds := chgs.Additions(); len(adds) > 0 {
  1302  		if fn := r.store.cfg.TestingKnobs.ReplicaAddStopAfterLearnerSnapshot; fn != nil && fn(adds) {
  1303  			return desc, nil
  1304  		}
  1305  	}
  1306  
  1307  	canUseDemotion := r.store.ClusterSettings().Version.IsActive(ctx, clusterversion.VersionChangeReplicasDemotion)
  1308  	for _, target := range chgs.Removals() {
  1309  		typ := internalChangeTypeRemove
  1310  		if rDesc, ok := desc.GetReplicaDescriptor(target.StoreID); ok && rDesc.GetType() == roachpb.VOTER_FULL && canUseDemotion {
  1311  			typ = internalChangeTypeDemote
  1312  		}
  1313  		iChgs = append(iChgs, internalReplicationChange{target: target, typ: typ})
  1314  	}
  1315  
  1316  	var err error
  1317  	desc, err = execChangeReplicasTxn(ctx, r.store, desc, reason, details, iChgs)
  1318  	if err != nil {
  1319  		return nil, err
  1320  	}
  1321  
  1322  	if fn := r.store.cfg.TestingKnobs.ReplicaAddStopAfterJointConfig; fn != nil && fn() {
  1323  		return desc, nil
  1324  	}
  1325  
  1326  	// Leave the joint config if we entered one. Also, remove any learners we
  1327  	// might have picked up due to removal-via-demotion.
  1328  	return maybeLeaveAtomicChangeReplicasAndRemoveLearners(ctx, r.store, desc)
  1329  }
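
// An illustrative trace, not in the original source, of the two conf changes
// a voter swap goes through under the scheme described above, written as
// replica-set states (the store numbers are hypothetical):
//
//	(s1,s2,s3 voters; s4 learner)
//	  --execChangeReplicasTxn-->
//	(s1,s2 voters; s3 VOTER_DEMOTING; s4 VOTER_INCOMING)  [joint config]
//	  --maybeLeaveAtomicChangeReplicasAndRemoveLearners-->
//	(s1,s2,s4 voters; s3 demoted to learner, then removed)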
  1330  
  1331  // tryRollBackLearnerReplica attempts to remove a learner specified by the
  1332  // target. If no such learner is found in the descriptor (including when it is a
  1333  // voter instead), no action is taken. Otherwise, a single time-limited
  1334  // best-effort attempt at removing the learner is made.
  1335  func (r *Replica) tryRollBackLearnerReplica(
  1336  	ctx context.Context,
  1337  	desc *roachpb.RangeDescriptor,
  1338  	target roachpb.ReplicationTarget,
  1339  	reason kvserverpb.RangeLogEventReason,
  1340  	details string,
  1341  ) {
  1342  	repDesc, ok := desc.GetReplicaDescriptor(target.StoreID)
  1343  	if !ok || repDesc.GetType() != roachpb.LEARNER {
  1344  		// There's no learner to roll back.
  1345  		log.Event(ctx, "learner to roll back not found; skipping")
  1346  		return
  1347  	}
  1348  
  1349  	// If (for example) the promotion failed because of a context deadline
  1350  	// exceeded, we do still want to clean up after ourselves, so always use a new
  1351  	// context (but with the old tags and with some timeout to save this from
  1352  	// blocking the caller indefinitely).
  1353  	const rollbackTimeout = 10 * time.Second
  1354  
  1355  	rollbackFn := func(ctx context.Context) error {
  1356  		_, err := execChangeReplicasTxn(
  1357  			ctx, r.store, desc, reason, details,
  1358  			[]internalReplicationChange{{target: target, typ: internalChangeTypeRemove}},
  1359  		)
  1360  		return err
  1361  	}
  1362  	rollbackCtx := logtags.WithTags(context.Background(), logtags.FromContext(ctx))
  1363  	if err := contextutil.RunWithTimeout(
  1364  		rollbackCtx, "learner rollback", rollbackTimeout, rollbackFn,
  1365  	); err != nil {
  1366  		log.Infof(ctx,
  1367  			"failed to roll back learner %s, abandoning it for the replicate queue: %v", target, err)
  1368  		r.store.replicateQueue.MaybeAddAsync(ctx, r, r.store.Clock().Now())
  1369  	} else {
  1370  		log.Infof(ctx, "rolled back learner %s in %s", target, desc)
  1371  	}
  1372  }
  1373  
  1374  type internalChangeType byte
  1375  
  1376  const (
  1377  	_ internalChangeType = iota + 1
  1378  	internalChangeTypeAddLearner
  1379  	internalChangeTypePromoteLearner
  1380  	// internalChangeTypeDemote changes a voter to a learner. This will
  1381  	// necessarily go through joint consensus since it requires two individual
  1382  	// changes (only one changes the quorum, so we could allow it in a simple
  1383  	// change too, with some work here and upstream). Demotions are treated like
  1384  	// removals throughout (i.e. they show up in `ChangeReplicasTrigger.Removed()`,
  1385  	// but not in `.Added()`).
  1386  	internalChangeTypeDemote
  1387  	// NB: can't remove multiple learners at once (need to remove at least one
  1388  	// voter with them), see:
  1389  	// https://github.com/cockroachdb/cockroach/pull/40268
  1390  	internalChangeTypeRemove
  1391  )
  1392  
  1393  // internalReplicationChange is a replication target together with an internal
  1394  // change type. The internal change type is needed to encode in which way the
  1395  // replica is mutated (i.e. in a sense, what its predecessor looked like). We
  1396  // need this to accurately transcribe the old into the updated range descriptor.
  1397  type internalReplicationChange struct {
  1398  	target roachpb.ReplicationTarget
  1399  	typ    internalChangeType
  1400  }
  1401  
  1402  type internalReplicationChanges []internalReplicationChange
  1403  
  1404  func (c internalReplicationChanges) leaveJoint() bool { return len(c) == 0 }
  1405  func (c internalReplicationChanges) useJoint() bool {
  1406  	// NB: demotions require joint consensus because of limitations in etcd/raft.
  1407  	// These could be lifted, but it doesn't seem worth it.
  1408  	return len(c) > 1 || c[0].typ == internalChangeTypeDemote
  1409  }
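
// Illustrative only, not in the original file: which change sets select the
// joint protocol. Note that useJoint is only consulted for non-empty change
// sets (leaveJoint() handles the empty case), so indexing c[0] is safe.
func exampleUseJoint() {
	single := internalReplicationChanges{{typ: internalChangeTypePromoteLearner}}
	_ = single.useJoint() // false: one non-demotion change uses the simple protocol
	demotion := internalReplicationChanges{{typ: internalChangeTypeDemote}}
	_ = demotion.useJoint() // true: demotions always require joint consensus
	swap := internalReplicationChanges{
		{typ: internalChangeTypePromoteLearner},
		{typ: internalChangeTypeDemote},
	}
	_ = swap.useJoint() // true: more than one change
}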
  1410  
  1411  type storeSettings interface {
  1412  	ClusterSettings() *cluster.Settings
  1413  	TestingKnobs() *StoreTestingKnobs
  1414  }
  1415  
  1416  func prepareChangeReplicasTrigger(
  1417  	ctx context.Context,
  1418  	store storeSettings,
  1419  	desc *roachpb.RangeDescriptor,
  1420  	chgs internalReplicationChanges,
  1421  ) (*roachpb.ChangeReplicasTrigger, error) {
  1422  	updatedDesc := *desc
  1423  	updatedDesc.SetReplicas(desc.Replicas().DeepCopy())
  1424  	updatedDesc.IncrementGeneration()
  1425  
  1426  	var added, removed []roachpb.ReplicaDescriptor
  1427  	if !chgs.leaveJoint() {
  1428  		if desc.Replicas().InAtomicReplicationChange() {
  1429  			return nil, errors.Errorf("must transition out of joint config first: %s", desc)
  1430  		}
  1431  
  1432  		useJoint := chgs.useJoint()
  1433  		if fn := store.TestingKnobs().ReplicationAlwaysUseJointConfig; fn != nil && fn() {
  1434  			useJoint = true
  1435  		}
  1436  		for _, chg := range chgs {
  1437  			switch chg.typ {
  1438  			case internalChangeTypeAddLearner:
  1439  				added = append(added,
  1440  					updatedDesc.AddReplica(chg.target.NodeID, chg.target.StoreID, roachpb.LEARNER))
  1441  			case internalChangeTypePromoteLearner:
  1442  				typ := roachpb.VOTER_FULL
  1443  				if useJoint {
  1444  					typ = roachpb.VOTER_INCOMING
  1445  				}
  1446  				rDesc, prevTyp, ok := updatedDesc.SetReplicaType(chg.target.NodeID, chg.target.StoreID, typ)
  1447  				if !ok || prevTyp != roachpb.LEARNER {
  1448  					return nil, errors.Errorf("cannot promote target %v which is not present as a learner", chg.target)
  1449  				}
  1450  				added = append(added, rDesc)
  1451  			case internalChangeTypeRemove:
  1452  				rDesc, ok := updatedDesc.GetReplicaDescriptor(chg.target.StoreID)
  1453  				if !ok {
  1454  					return nil, errors.Errorf("target %s not found", chg.target)
  1455  				}
  1456  				prevTyp := rDesc.GetType()
  1457  				if !useJoint || prevTyp == roachpb.LEARNER {
  1458  					rDesc, _ = updatedDesc.RemoveReplica(chg.target.NodeID, chg.target.StoreID)
  1459  				} else if prevTyp != roachpb.VOTER_FULL {
  1460  					// NB: prevTyp is already known to be VOTER_FULL because of
  1461  					// !InAtomicReplicationChange() and the learner handling
  1462  					// above. We check it anyway.
  1463  					return nil, errors.Errorf("cannot transition from %s to VOTER_OUTGOING", prevTyp)
  1464  				} else {
  1465  					rDesc, _, _ = updatedDesc.SetReplicaType(chg.target.NodeID, chg.target.StoreID, roachpb.VOTER_OUTGOING)
  1466  				}
  1467  				removed = append(removed, rDesc)
  1468  			case internalChangeTypeDemote:
  1469  				// Demotion is similar to removal, except that a demotion
  1470  				// cannot apply to a learner, and that the resulting type is
  1471  				// different when entering a joint config.
  1472  				rDesc, ok := updatedDesc.GetReplicaDescriptor(chg.target.StoreID)
  1473  				if !ok {
  1474  					return nil, errors.Errorf("target %s not found", chg.target)
  1475  				}
  1476  				if !useJoint {
  1477  					// NB: this won't fire because chgs.useJoint() is always true when
  1478  					// there's a demotion. This is just a sanity check.
  1479  					return nil, errors.Errorf("demotions require joint consensus")
  1480  				}
  1481  				if prevTyp := rDesc.GetType(); prevTyp != roachpb.VOTER_FULL {
  1482  					return nil, errors.Errorf("cannot transition from %s to VOTER_DEMOTING", prevTyp)
  1483  				}
  1484  				rDesc, _, _ = updatedDesc.SetReplicaType(chg.target.NodeID, chg.target.StoreID, roachpb.VOTER_DEMOTING)
  1485  				removed = append(removed, rDesc)
  1486  			default:
  1487  				return nil, errors.Errorf("unsupported internal change type %d", chg.typ)
  1488  			}
  1489  		}
  1490  	} else {
  1491  		// Want to leave a joint config. Note that we intentionally do not populate
  1492  		// 'added' or 'removed': leaving a joint config corresponds to an "empty" raft conf change.
  1493  		var isJoint bool
  1494  		// NB: the DeepCopy is needed or we'll skip over an entry every time we
  1495  		// call RemoveReplica below.
  1496  		for _, rDesc := range updatedDesc.Replicas().DeepCopy().All() {
  1497  			switch rDesc.GetType() {
  1498  			case roachpb.VOTER_INCOMING:
  1499  				updatedDesc.SetReplicaType(rDesc.NodeID, rDesc.StoreID, roachpb.VOTER_FULL)
  1500  				isJoint = true
  1501  			case roachpb.VOTER_OUTGOING:
  1502  				updatedDesc.RemoveReplica(rDesc.NodeID, rDesc.StoreID)
  1503  				isJoint = true
  1504  			case roachpb.VOTER_DEMOTING:
  1505  				updatedDesc.SetReplicaType(rDesc.NodeID, rDesc.StoreID, roachpb.LEARNER)
  1506  				isJoint = true
  1507  			default:
  1508  			}
  1509  		}
  1510  		if !isJoint {
  1511  			return nil, errors.Errorf("cannot leave a joint config; desc not joint: %s", &updatedDesc)
  1512  		}
  1513  	}
  1514  
  1515  	if err := updatedDesc.Validate(); err != nil {
  1516  		return nil, errors.Wrapf(err, "validating updated descriptor %s", &updatedDesc)
  1517  	}
  1518  
  1519  	var crt *roachpb.ChangeReplicasTrigger
  1520  	if !store.ClusterSettings().Version.IsActive(
  1521  		ctx, clusterversion.VersionAtomicChangeReplicasTrigger,
  1522  	) {
  1523  		var deprecatedChangeType roachpb.ReplicaChangeType
  1524  		var deprecatedRepDesc roachpb.ReplicaDescriptor
  1525  		if len(added) > 0 {
  1526  			deprecatedChangeType = roachpb.ADD_REPLICA
  1527  			deprecatedRepDesc = added[0]
  1528  		} else {
  1529  			deprecatedChangeType = roachpb.REMOVE_REPLICA
  1530  			deprecatedRepDesc = removed[0]
  1531  		}
  1532  		crt = &roachpb.ChangeReplicasTrigger{
  1533  			// NB: populate Desc as well because locally we rely on it being
  1534  			// set.
  1535  			Desc:                      &updatedDesc,
  1536  			DeprecatedChangeType:      deprecatedChangeType,
  1537  			DeprecatedReplica:         deprecatedRepDesc,
  1538  			DeprecatedUpdatedReplicas: updatedDesc.Replicas().All(),
  1539  			DeprecatedNextReplicaID:   updatedDesc.NextReplicaID,
  1540  		}
  1541  	} else {
  1542  		crt = &roachpb.ChangeReplicasTrigger{
  1543  			Desc:                    &updatedDesc,
  1544  			InternalAddedReplicas:   added,
  1545  			InternalRemovedReplicas: removed,
  1546  		}
  1547  	}
  1548  
  1549  	if _, err := crt.ConfChange(nil); err != nil {
  1550  		return nil, errors.Wrapf(err, "programming error: malformed trigger created from desc %s to %s", desc, &updatedDesc)
  1551  	}
  1552  	return crt, nil
  1553  }
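
// A sketch, with hypothetical targets, of the trigger produced for a swap:
// under the joint protocol the promoted learner enters as VOTER_INCOMING and
// the demoted voter as VOTER_DEMOTING; a later empty change set (see
// exampleLeaveJointConfig below) leaves the joint config.
func exampleSwapTrigger(
	ctx context.Context, store storeSettings, desc *roachpb.RangeDescriptor,
) (*roachpb.ChangeReplicasTrigger, error) {
	chgs := internalReplicationChanges{
		{target: roachpb.ReplicationTarget{NodeID: 4, StoreID: 4}, typ: internalChangeTypePromoteLearner},
		{target: roachpb.ReplicationTarget{NodeID: 3, StoreID: 3}, typ: internalChangeTypeDemote},
	}
	return prepareChangeReplicasTrigger(ctx, store, desc, chgs)
}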
  1554  
  1555  func execChangeReplicasTxn(
  1556  	ctx context.Context,
  1557  	store *Store,
  1558  	referenceDesc *roachpb.RangeDescriptor,
  1559  	reason kvserverpb.RangeLogEventReason,
  1560  	details string,
  1561  	chgs internalReplicationChanges,
  1562  ) (*roachpb.RangeDescriptor, error) {
  1563  	var returnDesc *roachpb.RangeDescriptor
  1564  
  1565  	descKey := keys.RangeDescriptorKey(referenceDesc.StartKey)
  1566  
  1567  	check := func(kvDesc *roachpb.RangeDescriptor) bool {
  1568  		// NB: We might fail to find the range if the range has been merged away,
  1569  		// in which case we definitely want to fail the check below.
  1570  		if kvDesc != nil && kvDesc.RangeID == referenceDesc.RangeID && chgs.leaveJoint() {
  1571  			// If there are no changes, we're trying to leave a joint config,
  1572  			// so that's all we care about. But since leaving a joint config
  1573  			// is done opportunistically whenever one is encountered, this is
  1574  			// more likely to race than other operations. So we verify literally
  1575  			// nothing about the descriptor, but once we get the descriptor out
  1576  			// from conditionalGetDescValueFromDB, we'll check if it's in a
  1577  			// joint config and if not, noop.
  1578  			return true
  1579  		}
  1580  		// Otherwise, check that the descriptors are equal.
  1581  		//
  1582  		// TODO(tbg): check that the replica sets are equal only. I was going to
  1583  		// do that but then discovered #40367. Try again in the 20.1 cycle.
  1584  		return checkDescsEqual(referenceDesc)(kvDesc)
  1585  	}
  1586  
  1587  	if err := store.DB().Txn(ctx, func(ctx context.Context, txn *kv.Txn) error {
  1588  		log.Event(ctx, "attempting txn")
  1589  		txn.SetDebugName(replicaChangeTxnName)
  1590  		desc, dbDescValue, err := conditionalGetDescValueFromDB(ctx, txn, referenceDesc.StartKey, check)
  1591  		if err != nil {
  1592  			return err
  1593  		}
  1594  		if chgs.leaveJoint() && !desc.Replicas().InAtomicReplicationChange() {
  1595  			// Nothing to do. See comment in 'check' above for details.
  1596  			returnDesc = desc
  1597  			return nil
  1598  		}
  1599  		// Note that we are now using the descriptor from KV, not the one passed
  1600  		// into this method.
  1601  		crt, err := prepareChangeReplicasTrigger(ctx, store, desc, chgs)
  1602  		if err != nil {
  1603  			return err
  1604  		}
  1605  		log.Infof(ctx, "change replicas (add %v remove %v): existing descriptor %s", crt.Added(), crt.Removed(), desc)
  1606  
  1607  		{
  1608  			b := txn.NewBatch()
  1609  
  1610  			// Important: the range descriptor must be the first thing touched in the transaction
  1611  			// so the transaction record is co-located with the range being modified.
  1612  			if err := updateRangeDescriptor(b, descKey, dbDescValue, crt.Desc); err != nil {
  1613  				return err
  1614  			}
  1615  
  1616  			// Run transaction up to this point to create txn record early (see #9265).
  1617  			if err := txn.Run(ctx, b); err != nil {
  1618  				return err
  1619  			}
  1620  		}
  1621  
  1622  		// Log replica change into range event log.
  1623  		for _, tup := range []struct {
  1624  			typ      roachpb.ReplicaChangeType
  1625  			repDescs []roachpb.ReplicaDescriptor
  1626  		}{
  1627  			{roachpb.ADD_REPLICA, crt.Added()},
  1628  			{roachpb.REMOVE_REPLICA, crt.Removed()},
  1629  		} {
  1630  			for _, repDesc := range tup.repDescs {
  1631  				if err := store.logChange(
  1632  					ctx, txn, tup.typ, repDesc, *crt.Desc, reason, details,
  1633  				); err != nil {
  1634  					return err
  1635  				}
  1636  			}
  1637  		}
  1638  
  1639  		// End the transaction manually instead of letting RunTransaction
  1640  		// loop do it, in order to provide a commit trigger.
  1641  		b := txn.NewBatch()
  1642  
  1643  		// Update range descriptor addressing record(s).
  1644  		if err := updateRangeAddressing(b, crt.Desc); err != nil {
  1645  			return err
  1646  		}
  1647  
  1648  		b.AddRawRequest(&roachpb.EndTxnRequest{
  1649  			Commit: true,
  1650  			InternalCommitTrigger: &roachpb.InternalCommitTrigger{
  1651  				ChangeReplicasTrigger: crt,
  1652  			},
  1653  		})
  1654  		if err := txn.Run(ctx, b); err != nil {
  1655  			log.Eventf(ctx, "%v", err)
  1656  			return err
  1657  		}
  1658  
  1659  		returnDesc = crt.Desc
  1660  		return nil
  1661  	}); err != nil {
  1662  		log.Eventf(ctx, "%v", err)
  1663  		// NB: desc may not be the descriptor we actually compared against, but
  1664  		// either way this gives a good idea of what happened, which is all it's
  1665  		// supposed to do.
  1666  		if ok, actualDesc := maybeDescriptorChangedError(referenceDesc, err); ok {
  1667  			// We do not include the original error as cause in this case -
  1668  			// the caller should not observe the cause. We still include it
  1669  			// as "secondary payload", in case the error object makes it way
  1670  			// to logs or telemetry during a crash.
  1671  			err = errors.WithSecondaryError(newDescChangedError(referenceDesc, actualDesc), err)
  1672  			err = &benignError{err}
  1673  		}
  1674  		return nil, errors.Wrapf(err, "change replicas of r%d failed", referenceDesc.RangeID)
  1675  	}
  1676  	log.Event(ctx, "txn complete")
  1677  	return returnDesc, nil
  1678  }
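
// A minimal sketch, not in the original file, of the "leave joint config"
// request: a nil change set makes leaveJoint() true, and the transaction
// above then commits an empty conf change only if the descriptor read from
// KV is still in a joint configuration (otherwise it no-ops).
func exampleLeaveJointConfig(
	ctx context.Context, store *Store, desc *roachpb.RangeDescriptor,
) (*roachpb.RangeDescriptor, error) {
	return execChangeReplicasTxn(ctx, store, desc, kvserverpb.ReasonUnknown, "", nil /* chgs */)
}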
  1679  
  1680  // sendSnapshot sends a snapshot of the replica state to the specified replica.
  1681  // Currently only invoked from replicateQueue and raftSnapshotQueue. Be careful
  1682  // about adding additional calls as generating a snapshot is moderately
  1683  // expensive.
  1684  //
  1685  // A snapshot is a bulk transfer of all data in a range. It consists of a
  1686  // consistent view of all the state needed to run some replica of a range as of
  1687  // some applied index (not as of some mvcc-time). Snapshots are used by Raft
  1688  // when a follower is far enough behind the leader that it can no longer be
  1689  // caught up using incremental diffs (because the leader has already garbage
  1690  // collected the diffs, in this case because it truncated the Raft log past
  1691  // where the follower is).
  1692  //
  1693  // We also proactively send a snapshot when adding a new replica to bootstrap it
  1694  // (this is called a "learner" snapshot and is a special case of a Raft
  1695  // snapshot, we just speed the process along). It's called a learner snapshot
  1696  // because it's sent to what Raft terms a learner replica. As of 19.2, when we
  1697  // add a new replica, it's first added as a learner using a Raft ConfChange,
  1698  // which means it accepts Raft traffic but doesn't vote or affect quorum. Then
  1699  // we immediately send it a snapshot to catch it up. After the snapshot
  1700  // successfully applies, we turn it into a normal voting replica using another
  1701  // ConfChange. It then uses the normal mechanisms to catch up with whatever got
  1702  // committed to the Raft log during the snapshot transfer. In contrast to adding
  1703  // the voting replica directly, this avoids a period of fragility when the
  1704  // replica would be a full member, but very far behind.
  1705  //
  1706  // Snapshots are expensive and mostly unexpected (except learner snapshots
  1707  // during rebalancing). The quota pool is responsible for keeping a leader from
  1708  // getting too far ahead of any of the followers, so ideally they'd never be far
  1709  // enough behind to need a snapshot.
  1710  //
  1711  // The snapshot process itself is broken into 3 parts: generating the snapshot,
  1712  // transmitting it, and applying it.
  1713  //
  1714  // Generating the snapshot: The data contained in a snapshot is a full copy of
  1715  // the replicated data plus everything the replica needs to be a healthy member
  1716  // of a Raft group. The former is large, so we send it via streaming rpc
  1717  // instead of keeping it all in memory at once. The `(Replica).GetSnapshot`
  1718  // method does the necessary locking and gathers the various Raft state needed
  1719  // to run a replica. It also creates an iterator for the range's data as it
  1720  // looked under those locks (this is powered by a RocksDB snapshot, which is a
  1721  // different thing but a similar idea). Notably, GetSnapshot does not do the
  1722  // data iteration.
  1723  //
  1724  // Transmitting the snapshot: The transfer itself happens over the grpc
  1725  // `RaftSnapshot` method, which is a bi-directional stream of `SnapshotRequest`s
  1726  // and `SnapshotResponse`s. The two sides are orchestrated by the
  1727  // `(RaftTransport).SendSnapshot` and `(Store).receiveSnapshot` methods.
  1728  //
  1729  // `SendSnapshot` starts up the streaming rpc and first sends a header message
  1730  // with everything but the range data and then blocks, waiting on the first
  1731  // streaming response from the recipient. This lets us short-circuit sending the
  1732  // range data if the recipient can't be contacted or if it can't use the
  1733  // snapshot (which is usually the result of a race). The recipient's grpc
  1734  // handler for RaftSnapshot sanity checks a few things and ends up calling down
  1735  // into `receiveSnapshot`, which does the bulk of the work. `receiveSnapshot`
  1736  // starts by waiting for a reservation in the snapshot rate limiter. It then
  1737  // reads the header message and hands it to `shouldAcceptSnapshotData` to
  1738  // determine if it can use the snapshot [1]. `shouldAcceptSnapshotData` is
  1739  // advisory and can return false positives. If `shouldAcceptSnapshotData`
  1740  // returns true, this is communicated back to the sender, which then proceeds to
  1741  // call `kvBatchSnapshotStrategy.Send`. This uses the iterator captured earlier
  1742  // to send the data in chunks, each chunk a streaming grpc message. The sender
  1743  // then sends a final message with an indication that it's done and blocks again,
  1744  // waiting for a second and final response from the recipient which indicates if
  1745  // the snapshot was a success.
  1746  //
  1747  // `receiveSnapshot` takes the key-value pairs sent and incrementally creates
  1748  // three SSTs from them for direct ingestion: one for the replicated range-ID
  1749  // local keys, one for the range local keys, and one for the user keys. The
  1750  // reason it creates three separate SSTs is to prevent overlaps with the
  1751  // memtable and existing SSTs in RocksDB. Each of the SSTs also has a range
  1752  // deletion tombstone to delete the existing data in the range.
  1753  //
  1754  // Applying the snapshot: After the recipient has received the message
  1755  // indicating it has all the data, it hands it all to
  1756  // `(Store).processRaftSnapshotRequest` to be applied. First, this re-checks
  1757  // the same things as `shouldAcceptSnapshotData` to make sure nothing has
  1758  // changed while the snapshot was being transferred. It then guarantees that
  1759  // there is either an initialized[2] replica or a `ReplicaPlaceholder`[3] to
  1760  // accept the snapshot by creating a placeholder if necessary. Finally, a *Raft
  1761  // snapshot* message is manually handed to the replica's Raft node (by calling
  1762  // `stepRaftGroup` + `handleRaftReadyRaftMuLocked`). During the application
  1763  // process, several other SSTs may be created for direct ingestion. An SST for
  1764  // the unreplicated range-ID local keys is created for the Raft entries, hard
  1765  // state, and truncated state. An SST is created for deleting each subsumed
  1766  // replica's range-ID local keys and at most two SSTs are created for deleting
  1767  // the user keys and range local keys of all subsumed replicas. All in all, a
  1768  // maximum of 6 + SR SSTs will be created for direct ingestion where SR is the
  1769  // number of subsumed replicas. In the case where there are no subsumed
  1770  // replicas, 4 SSTs will be created.
  1771  //
  1772  // [1]: The largest class of rejections here is if the store contains a replica
  1773  // that overlaps the snapshot but has a different id (we maintain an invariant
  1774  // that replicas on a store never overlap). This usually happens when the
  1775  // recipient has an old copy of a replica that is no longer part of a range and
  1776  // the `replicaGCQueue` hasn't gotten around to collecting it yet. So if this
  1777  // happens, `shouldAcceptSnapshotData` will queue it up for consideration.
  1778  //
  1779  // [2]: An uninitialized replica is created when a replica that's being added
  1780  // gets traffic from its new peers before it gets a snapshot. It may be possible
  1781  // to get rid of uninitialized replicas (by dropping all Raft traffic except
  1782  // votes on the floor), but this is a cleanup that hasn't happened yet.
  1783  //
  1784  // [3]: The placeholder is essentially a snapshot lock, making any future
  1785  // callers of `shouldAcceptSnapshotData` return an error so that we no longer
  1786  // have to worry about racing with a second snapshot. See the comment on
  1787  // ReplicaPlaceholder for details.
  1788  func (r *Replica) sendSnapshot(
  1789  	ctx context.Context,
  1790  	recipient roachpb.ReplicaDescriptor,
  1791  	snapType SnapshotRequest_Type,
  1792  	priority SnapshotRequest_Priority,
  1793  ) (retErr error) {
  1794  	defer func() {
  1795  		// Report the snapshot status to Raft, which expects us to do this once we
  1796  		// finish sending the snapshot.
  1797  		r.reportSnapshotStatus(ctx, recipient.ReplicaID, retErr)
  1798  	}()
  1799  
  1800  	snap, err := r.GetSnapshot(ctx, snapType, recipient.StoreID)
  1801  	if err != nil {
  1802  		return errors.Wrapf(err, "%s: failed to generate %s snapshot", r, snapType)
  1803  	}
  1804  	defer snap.Close()
  1805  	log.Event(ctx, "generated snapshot")
  1806  
  1807  	sender, err := r.GetReplicaDescriptor()
  1808  	if err != nil {
  1809  		return errors.Wrapf(err, "%s: change replicas failed", r)
  1810  	}
  1811  
  1812  	status := r.RaftStatus()
  1813  	if status == nil {
  1814  		// This code path is sometimes hit during scatter for replicas that
  1815  		// haven't woken up yet.
  1816  		return &benignError{errors.New("raft status not initialized")}
  1817  	}
  1818  
  1819  	usesReplicatedTruncatedState, err := storage.MVCCGetProto(
  1820  		ctx, snap.EngineSnap, keys.RaftTruncatedStateLegacyKey(r.RangeID), hlc.Timestamp{}, nil, storage.MVCCGetOptions{},
  1821  	)
  1822  	if err != nil {
  1823  		return errors.Wrap(err, "loading legacy truncated state")
  1824  	}
  1825  
  1826  	canAvoidSendingLog := !usesReplicatedTruncatedState &&
  1827  		snap.State.TruncatedState.Index < snap.State.RaftAppliedIndex
  1828  
  1829  	if canAvoidSendingLog {
  1830  		// If we're not using a legacy (replicated) truncated state, we avoid
  1831  		// sending the (past) Raft log in the snapshot in the first place and
  1832  		// send only those entries that are actually useful to the follower.
  1833  		// This is done by changing the truncated state, which we're allowed
  1834  		// to do since it is not a replicated key (and thus not subject to
  1835  		// matching across replicas). The actual sending happens here:
  1836  		_ = (*kvBatchSnapshotStrategy)(nil).Send
  1837  		// and results in no log entries being sent at all. Note that
  1838  		// Metadata.Index is really the applied index of the replica.
  1839  		snap.State.TruncatedState = &roachpb.RaftTruncatedState{
  1840  			Index: snap.RaftSnap.Metadata.Index,
  1841  			Term:  snap.RaftSnap.Metadata.Term,
  1842  		}
  1843  	}
  1844  
  1845  	req := SnapshotRequest_Header{
  1846  		State: snap.State,
  1847  		// Tell the recipient whether it needs to synthesize the new
  1848  		// unreplicated TruncatedState. It could tell by itself by peeking into
  1849  		// the data, but it uses a write only batch for performance which
  1850  		// doesn't support that; this is easier. Notably, this is true if the
  1851  		// snap index itself is the one at which the migration happens.
  1852  		//
  1853  		// See VersionUnreplicatedRaftTruncatedState.
  1854  		UnreplicatedTruncatedState: !usesReplicatedTruncatedState,
  1855  		RaftMessageRequest: RaftMessageRequest{
  1856  			RangeID:     r.RangeID,
  1857  			FromReplica: sender,
  1858  			ToReplica:   recipient,
  1859  			Message: raftpb.Message{
  1860  				Type:     raftpb.MsgSnap,
  1861  				To:       uint64(recipient.ReplicaID),
  1862  				From:     uint64(sender.ReplicaID),
  1863  				Term:     status.Term,
  1864  				Snapshot: snap.RaftSnap,
  1865  			},
  1866  		},
  1867  		RangeSize: r.GetMVCCStats().Total(),
  1868  		// Recipients currently cannot choose to decline any snapshots.
  1869  		// In 19.2 and earlier versions, preemptive snapshots could be declined.
  1870  		//
  1871  		// TODO(ajwerner): Consider removing the CanDecline flag.
  1872  		CanDecline: false,
  1873  		Priority:   priority,
  1874  		Strategy:   SnapshotRequest_KV_BATCH,
  1875  		Type:       snapType,
  1876  	}
  1877  	sent := func() {
  1878  		r.store.metrics.RangeSnapshotsGenerated.Inc(1)
  1879  	}
  1880  	if err := r.store.cfg.Transport.SendSnapshot(
  1881  		ctx,
  1882  		&r.store.cfg.RaftConfig,
  1883  		r.store.allocator.storePool,
  1884  		req,
  1885  		snap,
  1886  		r.store.Engine().NewBatch,
  1887  		sent,
  1888  	); err != nil {
  1889  		if errors.Is(err, errMalformedSnapshot) {
  1890  			tag := fmt.Sprintf("r%d_%s", r.RangeID, snap.SnapUUID.Short())
  1891  			if dir, err := r.store.checkpoint(ctx, tag); err != nil {
  1892  				log.Warningf(ctx, "unable to create checkpoint %s: %+v", dir, err)
  1893  			} else {
  1894  				log.Warningf(ctx, "created checkpoint %s", dir)
  1895  			}
  1896  
  1897  			log.Fatal(ctx, "malformed snapshot generated")
  1898  		}
  1899  		return &snapshotError{err}
  1900  	}
  1901  	return nil
  1902  }
  1903  
  1904  // replicaSetsEqual is used in AdminMerge to ensure that the ranges are
  1905  // all collocated on the same set of replicas.
  1906  func replicaSetsEqual(a, b []roachpb.ReplicaDescriptor) bool {
  1907  	if len(a) != len(b) {
  1908  		return false
  1909  	}
  1910  
  1911  	set := make(map[roachpb.StoreID]int)
  1912  	for _, replica := range a {
  1913  		set[replica.StoreID]++
  1914  	}
  1915  
  1916  	for _, replica := range b {
  1917  		set[replica.StoreID]--
  1918  	}
  1919  
  1920  	for _, value := range set {
  1921  		if value != 0 {
  1922  			return false
  1923  		}
  1924  	}
  1925  
  1926  	return true
  1927  }
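
// Illustrative only, not part of the original file: the comparison is over
// the multiset of StoreIDs, so ordering and replica IDs are ignored but
// duplicate counts matter.
func exampleReplicaSetsEqual() {
	a := []roachpb.ReplicaDescriptor{{StoreID: 1}, {StoreID: 2}}
	b := []roachpb.ReplicaDescriptor{{StoreID: 2}, {StoreID: 1}}
	_ = replicaSetsEqual(a, b) // true: same stores, different order
	c := []roachpb.ReplicaDescriptor{{StoreID: 1}, {StoreID: 1}}
	_ = replicaSetsEqual(a, c) // false: the multiset counts differ
}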
  1928  
  1929  func checkDescsEqual(desc *roachpb.RangeDescriptor) func(*roachpb.RangeDescriptor) bool {
  1930  	// TODO(jeffreyxiao): This hacky fix ensures that we don't fail the
  1931  	// conditional get because of the ordering of InternalReplicas. Calling
  1932  	// Replicas() will sort the list of InternalReplicas as a side-effect. The
  1933  	// invariant of having InternalReplicas sorted is not maintained in 19.1.
  1934  	// Additionally, in 19.2, it's possible for the in-memory copy of
  1935  	// RangeDescriptor to become sorted from a call to Replicas() without
  1936  // updating the copy in kv. These two factors make it possible for the
  1937  	// in-memory copy to be out of sync from the copy in kv. The sorted invariant
  1938  	// of InternalReplicas is used by ReplicaDescriptors.Voters() and
  1939  	// ReplicaDescriptors.Learners().
  1940  	if desc != nil {
  1941  		desc.Replicas() // for sorting side-effect
  1942  	}
  1943  	return func(desc2 *roachpb.RangeDescriptor) bool {
  1944  		if desc2 != nil {
  1945  			desc2.Replicas() // for sorting side-effect
  1946  		}
  1947  
  1948  		return desc.Equal(desc2)
  1949  	}
  1950  }
  1951  
  1952  // conditionalGetDescValueFromDB fetches an encoded RangeDescriptor from kv,
  1953  // checks that it matches the given expectation using proto Equals, and returns
  1954  // the raw fetched roachpb.Value. If the fetched value doesn't match the
  1955  // expectation, a ConditionFailedError is returned.
  1956  //
  1957  // This ConditionFailedError is a historical artifact. We used to pass the
  1958  // parsed RangeDescriptor directly as the expected value in a CPut, but proto
  1959  // message encodings aren't stable so this was fragile. Calling this method and
  1960  // then passing the returned *roachpb.Value as the expected value in a CPut does
  1961  // the same thing, but also correctly handles proto equality. See #38308.
  1962  func conditionalGetDescValueFromDB(
  1963  	ctx context.Context,
  1964  	txn *kv.Txn,
  1965  	startKey roachpb.RKey,
  1966  	check func(*roachpb.RangeDescriptor) bool,
  1967  ) (*roachpb.RangeDescriptor, *roachpb.Value, error) {
  1968  	descKey := keys.RangeDescriptorKey(startKey)
  1969  	existingDescKV, err := txn.Get(ctx, descKey)
  1970  	if err != nil {
  1971  		return nil, nil, errors.Wrap(err, "fetching current range descriptor value")
  1972  	}
  1973  	var existingDesc *roachpb.RangeDescriptor
  1974  	if existingDescKV.Value != nil {
  1975  		existingDesc = &roachpb.RangeDescriptor{}
  1976  		if err := existingDescKV.Value.GetProto(existingDesc); err != nil {
  1977  			return nil, nil, errors.Wrap(err, "decoding current range descriptor value")
  1978  		}
  1979  	}
  1980  
  1981  	if !check(existingDesc) {
  1982  		return nil, nil, &roachpb.ConditionFailedError{ActualValue: existingDescKV.Value}
  1983  	}
  1984  	return existingDesc, existingDescKV.Value, nil
  1985  }
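
// A sketch, assuming hypothetical oldDesc/newDesc arguments, of the pattern
// the comment above describes: read the descriptor together with its raw
// value, then hand that raw value to the CPut (via updateRangeDescriptor
// below) so that unstable proto encodings cannot break the comparison.
func exampleDescriptorCPut(
	ctx context.Context, txn *kv.Txn, oldDesc, newDesc *roachpb.RangeDescriptor,
) error {
	startKey := oldDesc.StartKey
	_, dbDescValue, err := conditionalGetDescValueFromDB(
		ctx, txn, startKey, checkDescsEqual(oldDesc))
	if err != nil {
		return err
	}
	b := txn.NewBatch()
	if err := updateRangeDescriptor(b, keys.RangeDescriptorKey(startKey), dbDescValue, newDesc); err != nil {
		return err
	}
	return txn.Run(ctx, b)
}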
  1986  
  1987  // updateRangeDescriptor adds a ConditionalPut on the range descriptor. The
  1988  // conditional put verifies that changes to the range descriptor are made in a
  1989  // well-defined order, preventing a scenario where a wayward replica which is
  1990  // no longer part of the original Raft group comes back online to form a
  1991  // splinter group with a node which was also a former replica, and hijacks the
  1992  // range descriptor. This is a last line of defense; other mechanisms should
  1993  // prevent rogue replicas from getting this far (see #768).
  1994  //
  1995  // oldValue can be nil, meaning that the key is expected to not exist.
  1996  //
  1997  // Note that in addition to using this method to update the on-disk range
  1998  // descriptor, a CommitTrigger must be used to update the in-memory
  1999  // descriptor; it will not automatically be copied from newDesc.
  2000  func updateRangeDescriptor(
  2001  	b *kv.Batch, descKey roachpb.Key, oldValue *roachpb.Value, newDesc *roachpb.RangeDescriptor,
  2002  ) error {
  2003  	// This is subtle: []byte(nil) != interface{}(nil). A []byte(nil) refers to
  2004  	// an empty value. An interface{}(nil) refers to a non-existent value. So
  2005  // we're careful to construct an interface{}(nil) when newDesc is nil.
  2006  	var newValue interface{}
  2007  	if newDesc != nil {
  2008  		if err := newDesc.Validate(); err != nil {
  2009  			return errors.Wrapf(err, "validating new descriptor %+v (old descriptor is %+v)",
  2010  				newDesc, oldValue)
  2011  		}
  2012  		newBytes, err := protoutil.Marshal(newDesc)
  2013  		if err != nil {
  2014  			return err
  2015  		}
  2016  		newValue = newBytes
  2017  	}
  2018  	b.CPut(descKey, newValue, oldValue)
  2019  	return nil
  2020  }
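
// A hypothetical caller, not in the original file: oldValue is the raw value
// previously read via conditionalGetDescValueFromDB, and passing a nil
// newDesc turns the CPut into a conditional deletion of the descriptor (as
// the merge path does for the right-hand side's descriptor).
func exampleDeleteRangeDescriptor(
	b *kv.Batch, desc *roachpb.RangeDescriptor, oldValue *roachpb.Value,
) error {
	descKey := keys.RangeDescriptorKey(desc.StartKey)
	return updateRangeDescriptor(b, descKey, oldValue, nil /* newDesc */)
}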
  2021  
  2022  // AdminRelocateRange relocates a given range to a given set of stores. The
  2023  // first store in the slice becomes the new leaseholder.
  2024  //
  2025  // This is best-effort; it's possible that the replicate queue on the
  2026  // leaseholder could take action at the same time, causing errors.
  2027  func (s *Store) AdminRelocateRange(
  2028  	ctx context.Context, rangeDesc roachpb.RangeDescriptor, targets []roachpb.ReplicationTarget,
  2029  ) error {
  2030  	// Step 0: Remove everything that's not a full voter so we don't have to think
  2031  	// about them.
  2032  	newDesc, err := maybeLeaveAtomicChangeReplicasAndRemoveLearners(ctx, s, &rangeDesc)
  2033  	if err != nil {
  2034  		log.Warningf(ctx, "%v", err)
  2035  		return err
  2036  	}
  2037  	rangeDesc = *newDesc
  2038  
  2039  	canRetry := func(err error) bool {
  2040  		whitelist := []string{
  2041  			snapshotApplySemBusyMsg,
  2042  			IntersectingSnapshotMsg,
  2043  		}
  2044  		errStr := err.Error()
  2045  		for _, substr := range whitelist {
  2046  			if strings.Contains(errStr, substr) {
  2047  				return true
  2048  			}
  2049  		}
  2050  		return false
  2051  	}
  2052  
  2053  	startKey := rangeDesc.StartKey.AsRawKey()
  2054  	transferLease := func(target roachpb.ReplicationTarget) {
  2055  		// TODO(tbg): we ignore errors here, but it seems that in practice these
  2056  		// transfers "always work". Some of them are essential (we can't remove
  2057  		// the leaseholder so we'll fail there later if this fails), so it
  2058  		// seems like a good idea to return any errors here to the caller (or
  2059  		// to retry some errors appropriately).
  2060  		if err := s.DB().AdminTransferLease(
  2061  			ctx, startKey, target.StoreID,
  2062  		); err != nil {
  2063  			log.Warningf(ctx, "while transferring lease: %+v", err)
  2064  		}
  2065  	}
  2066  
  2067  	// Step 1: Repeatedly add and/or remove a replica until we reach the
  2068  	// desired state. In an "atomic replication changes" world, this is
  2069  	// conceptually easy: change from the old set of replicas to the new
  2070  	// one. But there are two reasons that complicate this:
  2071  	// 1. we can't remove the leaseholder, so if we ultimately want to do that
  2072  	//    the lease has to be moved first. If we start out with *only* the
  2073  	//    leaseholder, we will have to add a replica first.
  2074  	// 2. this code is rewritten late in the cycle and it is both safer and
  2075  	//    closer to its previous incarnation to never issue atomic changes
  2076  	//    other than simple swaps.
  2077  	//
  2078  	// The loop below repeatedly calls relocateOne, which gives us either one or
  2079  	// two ops that move the range towards the desired replication state. If
  2080  	// it's one op, then a single add or remove is carried out (and it's only
  2081  	// done when we can't swap instead). If it's two ops, then we're swapping
  2082  	// (though this code doesn't concern itself with the details); and it's
  2083  	// possible that we need to transfer the lease before we carry out the ops,
  2084  	// determined via the leaseTarget variable.
  2085  	//
  2086  	// Transient errors returned from relocateOne are retried until things work
  2087  	// out.
  2088  	every := log.Every(time.Minute)
  2089  	for {
  2090  		for re := retry.StartWithCtx(ctx, retry.Options{MaxBackoff: 5 * time.Second}); ; re.Next() {
  2091  			if err := ctx.Err(); err != nil {
  2092  				return err
  2093  			}
  2094  
  2095  			ops, leaseTarget, err := s.relocateOne(ctx, &rangeDesc, targets)
  2096  			if err != nil {
  2097  				return err
  2098  			}
  2099  			if leaseTarget != nil {
  2100  				// NB: we may need to transfer the lease even if there are
  2101  				// no ops, to ensure that the first target ends up as the
  2102  				// final leaseholder.
  2103  				transferLease(*leaseTarget)
  2104  			}
  2105  			if len(ops) == 0 {
  2106  				// Done.
  2107  				return ctx.Err()
  2108  			}
  2109  			if fn := s.cfg.TestingKnobs.BeforeRelocateOne; fn != nil {
  2110  				fn(ops, leaseTarget, err)
  2111  			}
  2112  
  2113  			// Make sure we don't issue anything but singles and swaps before
  2114  			// this migration is gone (for it doesn't support anything else).
  2115  			if len(ops) > 2 {
  2116  				log.Fatalf(ctx, "received more than 2 ops: %+v", ops)
  2117  			}
  2118  			opss := [][]roachpb.ReplicationChange{ops}
  2119  			success := true
  2120  			for _, ops := range opss {
  2121  				newDesc, err := s.DB().AdminChangeReplicas(ctx, startKey, rangeDesc, ops)
  2122  				if err != nil {
  2123  					returnErr := errors.Wrapf(err, "while carrying out changes %v", ops)
  2124  					if !canRetry(err) {
  2125  						return returnErr
  2126  					}
  2127  					if every.ShouldLog() {
  2128  						log.Infof(ctx, "%v", returnErr)
  2129  					}
  2130  					success = false
  2131  					break
  2132  				}
  2133  				rangeDesc = *newDesc
  2134  			}
  2135  			if success {
  2136  				break
  2137  			}
  2138  		}
  2139  	}
  2141  }
  2142  
  2143  func (s *Store) relocateOne(
  2144  	ctx context.Context, desc *roachpb.RangeDescriptor, targets []roachpb.ReplicationTarget,
  2145  ) ([]roachpb.ReplicationChange, *roachpb.ReplicationTarget, error) {
  2146  	rangeReplicas := desc.Replicas().All()
  2147  	if len(rangeReplicas) != len(desc.Replicas().Voters()) {
  2148  		// The caller removed all the learners, so there shouldn't be anything but
  2149  		// voters.
  2150  		return nil, nil, errors.AssertionFailedf(
  2151  			`range %s had non-voter replicas: %v`, desc, desc.Replicas())
  2152  	}
  2153  
  2154  	sysCfg := s.cfg.Gossip.GetSystemConfig()
  2155  	if sysCfg == nil {
  2156  		return nil, nil, fmt.Errorf("no system config available, unable to perform RelocateRange")
  2157  	}
  2158  	zone, err := sysCfg.GetZoneConfigForKey(desc.StartKey)
  2159  	if err != nil {
  2160  		return nil, nil, err
  2161  	}
  2162  
  2163  	storeList, _, _ := s.allocator.storePool.getStoreList(storeFilterNone)
  2164  	storeMap := storeListToMap(storeList)
  2165  
  2166  	// Compute which replica to add and/or remove, respectively. We ask the allocator
  2167  	// about this because we want to respect the constraints. For example, it would be
  2168  	// unfortunate if we put two replicas into the same zone despite having a locality-
  2169  	// preserving option available.
  2170  	//
  2171  	// TODO(radu): we can't have multiple replicas on different stores on the
  2172  	// same node, and this code doesn't do anything to specifically avoid that
  2173  	// case (although the allocator will avoid even trying to send snapshots to
  2174  	// such stores), so it could cause some failures.
  2175  
  2176  	var addTargets []roachpb.ReplicaDescriptor
  2177  	for _, t := range targets {
  2178  		found := false
  2179  		for _, replicaDesc := range rangeReplicas {
  2180  			if replicaDesc.StoreID == t.StoreID && replicaDesc.NodeID == t.NodeID {
  2181  				found = true
  2182  				break
  2183  			}
  2184  		}
  2185  		if !found {
  2186  			addTargets = append(addTargets, roachpb.ReplicaDescriptor{
  2187  				NodeID:  t.NodeID,
  2188  				StoreID: t.StoreID,
  2189  			})
  2190  		}
  2191  	}
  2192  
  2193  	var removeTargets []roachpb.ReplicaDescriptor
  2194  	for _, replicaDesc := range rangeReplicas {
  2195  		found := false
  2196  		for _, t := range targets {
  2197  			if replicaDesc.StoreID == t.StoreID && replicaDesc.NodeID == t.NodeID {
  2198  				found = true
  2199  				break
  2200  			}
  2201  		}
  2202  		if !found {
  2203  			removeTargets = append(removeTargets, roachpb.ReplicaDescriptor{
  2204  				NodeID:  replicaDesc.NodeID,
  2205  				StoreID: replicaDesc.StoreID,
  2206  			})
  2207  		}
  2208  	}
  2209  
  2210  	var ops roachpb.ReplicationChanges
  2211  
  2212  	if len(addTargets) > 0 {
  2213  		// Each iteration, pick the most desirable replica to add. However,
  2214  		// prefer the first target because it's the one that should hold the
  2215  		// lease in the end; it helps to add it early so that the lease doesn't
  2216  		// have to move too much.
  2217  		candidateTargets := addTargets
  2218  		if storeHasReplica(targets[0].StoreID, candidateTargets) {
  2219  			candidateTargets = []roachpb.ReplicaDescriptor{
  2220  				{NodeID: targets[0].NodeID, StoreID: targets[0].StoreID},
  2221  			}
  2222  		}
  2223  
  2224  		// The storeList's list of stores is used to constrain which stores the
  2225  		// allocator considers putting a new replica on. We want it to only
  2226  		// consider the stores in candidateTargets.
  2227  		candidateDescs := make([]roachpb.StoreDescriptor, 0, len(candidateTargets))
  2228  		for _, candidate := range candidateTargets {
  2229  			store, ok := storeMap[candidate.StoreID]
  2230  			if !ok {
  2231  				return nil, nil, fmt.Errorf("cannot up-replicate to s%d; missing gossiped StoreDescriptor",
  2232  					candidate.StoreID)
  2233  			}
  2234  			candidateDescs = append(candidateDescs, *store)
  2235  		}
  2236  		storeList = makeStoreList(candidateDescs)
  2237  
  2238  		targetStore, _ := s.allocator.allocateTargetFromList(
  2239  			ctx,
  2240  			storeList,
  2241  			zone,
  2242  			rangeReplicas,
  2243  			s.allocator.scorerOptions())
  2244  		if targetStore == nil {
  2245  			return nil, nil, fmt.Errorf("none of the remaining targets %v are legal additions to %v",
  2246  				addTargets, desc.Replicas())
  2247  		}
  2248  
  2249  		target := roachpb.ReplicationTarget{
  2250  			NodeID:  targetStore.Node.NodeID,
  2251  			StoreID: targetStore.StoreID,
  2252  		}
  2253  		ops = append(ops, roachpb.MakeReplicationChanges(roachpb.ADD_REPLICA, target)...)
  2254  		// Pretend the voter is already there so that the removal logic below will
  2255  		// take it into account when deciding which replica to remove.
  2256  		rangeReplicas = append(rangeReplicas, roachpb.ReplicaDescriptor{
  2257  			NodeID:    target.NodeID,
  2258  			StoreID:   target.StoreID,
  2259  			ReplicaID: desc.NextReplicaID,
  2260  			Type:      roachpb.ReplicaTypeVoterFull(),
  2261  		})
  2262  	}
  2263  
  2264  	var transferTarget *roachpb.ReplicationTarget
  2265  	if len(removeTargets) > 0 {
  2266  		// Pick a replica to remove. Note that rangeReplicas may already reflect
  2267  		// a replica we're adding in the current round. This is the right thing
  2268  		// to do. For example, consider relocating from (s1,s2,s3) to (s1,s2,s4)
  2269  		// where addTargets will be (s4) and removeTargets is (s3). In this code,
  2270  		// we'll want the allocator to see if s3 can be removed from
  2271  		// (s1,s2,s3,s4) which is a reasonable request; that replica set is
  2272  		// overreplicated. If we asked it instead to remove s3 from (s1,s2,s3)
  2273  		// it may not want to do that due to constraints.
  2274  		targetStore, _, err := s.allocator.RemoveTarget(ctx, zone, removeTargets, rangeReplicas)
  2275  		if err != nil {
  2276  			return nil, nil, errors.Wrapf(err, "unable to select removal target from %v; current replicas %v",
  2277  				removeTargets, rangeReplicas)
  2278  		}
  2279  		removalTarget := roachpb.ReplicationTarget{
  2280  			NodeID:  targetStore.NodeID,
  2281  			StoreID: targetStore.StoreID,
  2282  		}
  2283  		// We can't remove the leaseholder, which really throws a wrench into
  2284  		// atomic replication changes. If we find that we're trying to do just
  2285  		// that, we need to first move the lease elsewhere. This is not possible
  2286  		// if there is no other replica available at that point, i.e. if the
  2287  		// existing descriptor is a single replica that's being replaced.
  2288  		var b kv.Batch
  2289  		liReq := &roachpb.LeaseInfoRequest{}
  2290  		liReq.Key = desc.StartKey.AsRawKey()
  2291  		b.AddRawRequest(liReq)
  2292  		if err := s.DB().Run(ctx, &b); err != nil {
  2293  			return nil, nil, errors.Wrap(err, "looking up lease")
  2294  		}
  2295  		curLeaseholder := b.RawResponse().Responses[0].GetLeaseInfo().Lease.Replica
  2296  		ok := curLeaseholder.StoreID != removalTarget.StoreID
  2297  		if !ok {
  2298  			// Pick a replica that we can give the lease to. We sort the first
  2299  			// target to the beginning (if it's there) because that's where the
  2300  			// lease needs to be in the end. We also exclude the last replica if
  2301  			// it was added by the add branch above (in which case it doesn't
  2302  			// exist yet).
  2303  			sortedTargetReplicas := append([]roachpb.ReplicaDescriptor(nil), rangeReplicas[:len(rangeReplicas)-len(ops)]...)
  2304  			sort.Slice(sortedTargetReplicas, func(i, j int) bool {
  2305  				sl := sortedTargetReplicas
  2306  				// targets[0] goes to the front (if it's present).
  2307  				return sl[i].StoreID == targets[0].StoreID
  2308  			})
  2309  			for _, rDesc := range sortedTargetReplicas {
  2310  				if rDesc.StoreID != curLeaseholder.StoreID {
  2311  					transferTarget = &roachpb.ReplicationTarget{
  2312  						NodeID:  rDesc.NodeID,
  2313  						StoreID: rDesc.StoreID,
  2314  					}
  2315  					ok = true
  2316  					break
  2317  				}
  2318  			}
  2319  		}
  2320  
  2321  		// Carry out the removal only if there was no lease problem above. If
  2322  		// there was, we're not going to do a swap in this round but just do the
  2323  		// addition. (Note that !ok implies that len(ops) is not empty, or we're
  2324  		// addition. (Note that !ok implies that ops is non-empty, or we're
  2325  		// trying to remove the last replica left in the descriptor, which is
  2326  		if ok {
  2327  			ops = append(ops, roachpb.MakeReplicationChanges(
  2328  				roachpb.REMOVE_REPLICA,
  2329  				removalTarget)...)
  2330  		}
  2331  	}
  2332  
  2333  	if len(ops) == 0 {
  2334  		// Make sure that the first target is the final leaseholder, as
  2335  		// AdminRelocateRange specifies.
  2336  		transferTarget = &targets[0]
  2337  	}
  2338  
  2339  	return ops, transferTarget, nil
  2340  }
  2341  
  2342  // adminScatter moves replicas and leaseholders for a selection of ranges.
  2343  func (r *Replica) adminScatter(
  2344  	ctx context.Context, args roachpb.AdminScatterRequest,
  2345  ) (roachpb.AdminScatterResponse, error) {
  2346  	rq := r.store.replicateQueue
  2347  	retryOpts := retry.Options{
  2348  		InitialBackoff: 50 * time.Millisecond,
  2349  		MaxBackoff:     1 * time.Second,
  2350  		Multiplier:     2,
  2351  		MaxRetries:     5,
  2352  	}
  2353  
  2354  	// Loop until the replicate queue decides there is nothing left to do for the
  2355  	// range. Note that we disable lease transfers until the final step as
  2356  	// transferring the lease prevents any further action on this node.
  2357  	var allowLeaseTransfer bool
  2358  	canTransferLease := func() bool { return allowLeaseTransfer }
  2359  	for re := retry.StartWithCtx(ctx, retryOpts); re.Next(); {
  2360  		requeue, err := rq.processOneChange(ctx, r, canTransferLease, false /* dryRun */)
  2361  		if err != nil {
  2362  			if IsSnapshotError(err) {
  2363  				continue
  2364  			}
  2365  			break
  2366  		}
  2367  		if !requeue {
  2368  			if allowLeaseTransfer {
  2369  				break
  2370  			}
  2371  			allowLeaseTransfer = true
  2372  		}
  2373  		re.Reset()
  2374  	}
  2375  
  2376  	// If we've been asked to randomize the leases beyond what the replicate
  2377  	// queue would do on its own (#17341), do so after the replicate queue is
  2378  	// done by transferring the lease to any of the given N replicas with
  2379  	// probability 1/N of choosing each.
  2380  	if args.RandomizeLeases && r.OwnsValidLease(r.store.Clock().Now()) {
  2381  		desc := r.Desc()
  2382  		// Learner replicas aren't allowed to become the leaseholder or raft leader,
  2383  		// so only consider the `Voters` replicas.
  2384  		voterReplicas := desc.Replicas().Voters()
  2385  		newLeaseholderIdx := rand.Intn(len(voterReplicas))
  2386  		targetStoreID := voterReplicas[newLeaseholderIdx].StoreID
  2387  		if targetStoreID != r.store.StoreID() {
  2388  			if err := r.AdminTransferLease(ctx, targetStoreID); err != nil {
  2389  				log.Warningf(ctx, "failed to scatter lease to s%d: %+v", targetStoreID, err)
  2390  			}
  2391  		}
  2392  	}
  2393  
  2394  	desc := r.Desc()
  2395  	return roachpb.AdminScatterResponse{
  2396  		Ranges: []roachpb.AdminScatterResponse_Range{{
  2397  			Span: roachpb.Span{
  2398  				Key:    desc.StartKey.AsRawKey(),
  2399  				EndKey: desc.EndKey.AsRawKey(),
  2400  			},
  2401  		}},
  2402  	}, nil
  2403  }
  2404  
  2405  func (r *Replica) adminVerifyProtectedTimestamp(
  2406  	ctx context.Context, args roachpb.AdminVerifyProtectedTimestampRequest,
  2407  ) (resp roachpb.AdminVerifyProtectedTimestampResponse, err error) {
  2408  	resp.Verified, err = r.protectedTimestampRecordApplies(ctx, &args)
  2409  	if err == nil && !resp.Verified {
  2410  		resp.FailedRanges = append(resp.FailedRanges, *r.Desc())
  2411  	}
  2412  	return resp, err
  2413  }