github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/batcheval/cmd_end_transaction.go

     1  // Copyright 2014 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package batcheval
    12  
    13  import (
    14  	"bytes"
    15  	"context"
    16  	"fmt"
    17  	"math"
    18  	"sync/atomic"
    19  
    20  	"github.com/cockroachdb/cockroach/pkg/clusterversion"
    21  	"github.com/cockroachdb/cockroach/pkg/keys"
    22  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/abortspan"
    23  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/batcheval/result"
    24  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
    25  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
    26  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/rditer"
    27  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/spanset"
    28  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader"
    29  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    30  	"github.com/cockroachdb/cockroach/pkg/storage"
    31  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    32  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    33  	"github.com/cockroachdb/cockroach/pkg/util/log"
    34  	"github.com/cockroachdb/cockroach/pkg/util/tracing"
    35  	"github.com/cockroachdb/errors"
    36  	"github.com/cockroachdb/logtags"
    37  )
    38  
    39  func init() {
    40  	RegisterReadWriteCommand(roachpb.EndTxn, declareKeysEndTxn, EndTxn)
    41  }
    42  
    43  // declareKeysWriteTransaction is the shared portion of
    44  // declareKeys{End,Heartbeat}Transaction.
    45  func declareKeysWriteTransaction(
    46  	_ *roachpb.RangeDescriptor,
    47  	header roachpb.Header,
    48  	req roachpb.Request,
    49  	latchSpans *spanset.SpanSet,
    50  ) {
    51  	if header.Txn != nil {
    52  		header.Txn.AssertInitialized(context.TODO())
    53  		latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{
    54  			Key: keys.TransactionKey(req.Header().Key, header.Txn.ID),
    55  		})
    56  	}
    57  }
    58  
    59  func declareKeysEndTxn(
    60  	desc *roachpb.RangeDescriptor,
    61  	header roachpb.Header,
    62  	req roachpb.Request,
    63  	latchSpans, _ *spanset.SpanSet,
    64  ) {
    65  	et := req.(*roachpb.EndTxnRequest)
    66  	declareKeysWriteTransaction(desc, header, req, latchSpans)
    67  	var minTxnTS hlc.Timestamp
    68  	if header.Txn != nil {
    69  		header.Txn.AssertInitialized(context.TODO())
    70  		minTxnTS = header.Txn.MinTimestamp
    71  		abortSpanAccess := spanset.SpanReadOnly
    72  		if !et.Commit {
    73  			// Rollback EndTxn requests may write to the abort span, either if
    74  			// their Poison flag is set, in which case they will add an abort
    75  			// span entry, or if their Poison flag is not set and an abort span
    76  			// entry already exists on this Range, in which case they will clear
    77  			// that entry.
    78  			abortSpanAccess = spanset.SpanReadWrite
    79  		}
    80  		latchSpans.AddNonMVCC(abortSpanAccess, roachpb.Span{
    81  			Key: keys.AbortSpanKey(header.RangeID, header.Txn.ID),
    82  		})
    83  	}
    84  
    85  	// If the request is intending to finalize the transaction record then it
    86  	// needs to declare a few extra keys.
    87  	if !et.IsParallelCommit() {
    88  		// All requests that intend on resolving local locks need to depend on
    89  		// the range descriptor because they need to determine which locks are
    90  		// within the local range.
    91  		latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{Key: keys.RangeDescriptorKey(desc.StartKey)})
    92  
    93  		// The spans may extend beyond this Range, but it's ok for the
    94  		// purpose of acquiring latches. The parts in our Range will
    95  		// be resolved eagerly.
    96  		for _, span := range et.LockSpans {
    97  			latchSpans.AddMVCC(spanset.SpanReadWrite, span, minTxnTS)
    98  		}
    99  
   100  		if et.InternalCommitTrigger != nil {
   101  			if st := et.InternalCommitTrigger.SplitTrigger; st != nil {
   102  				// Splits may read from the entire pre-split range (they read
   103  				// from the LHS in all cases, and the RHS only when the existing
   104  				// stats contain estimates). Splits declare non-MVCC read access
   105  				// across the entire LHS to block all concurrent writes to the
   106  				// LHS because their stat deltas will interfere with the
   107  				// non-delta stats computed as a part of the split. Splits
   108  				// declare non-MVCC write access across the entire RHS to block
   109  				// all concurrent reads and writes to the RHS because they will
   110  				// fail if applied after the split. (see
   111  				// https://github.com/cockroachdb/cockroach/issues/14881)
   112  				latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{
   113  					Key:    st.LeftDesc.StartKey.AsRawKey(),
   114  					EndKey: st.LeftDesc.EndKey.AsRawKey(),
   115  				})
   116  				latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{
   117  					Key:    st.RightDesc.StartKey.AsRawKey(),
   118  					EndKey: st.RightDesc.EndKey.AsRawKey(),
   119  				})
   120  				latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{
   121  					Key:    keys.MakeRangeKeyPrefix(st.LeftDesc.StartKey),
   122  					EndKey: keys.MakeRangeKeyPrefix(st.RightDesc.EndKey).PrefixEnd(),
   123  				})
   124  
   125  				leftRangeIDPrefix := keys.MakeRangeIDReplicatedPrefix(header.RangeID)
   126  				latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{
   127  					Key:    leftRangeIDPrefix,
   128  					EndKey: leftRangeIDPrefix.PrefixEnd(),
   129  				})
   130  				rightRangeIDPrefix := keys.MakeRangeIDReplicatedPrefix(st.RightDesc.RangeID)
   131  				latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{
   132  					Key:    rightRangeIDPrefix,
   133  					EndKey: rightRangeIDPrefix.PrefixEnd(),
   134  				})
   135  
   136  				rightRangeIDUnreplicatedPrefix := keys.MakeRangeIDUnreplicatedPrefix(st.RightDesc.RangeID)
   137  				latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{
   138  					Key:    rightRangeIDUnreplicatedPrefix,
   139  					EndKey: rightRangeIDUnreplicatedPrefix.PrefixEnd(),
   140  				})
   141  
   142  				latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{
   143  					Key: keys.RangeLastReplicaGCTimestampKey(st.LeftDesc.RangeID),
   144  				})
   145  				latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{
   146  					Key: keys.RangeLastReplicaGCTimestampKey(st.RightDesc.RangeID),
   147  				})
   148  
   149  				latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{
   150  					Key:    abortspan.MinKey(header.RangeID),
   151  					EndKey: abortspan.MaxKey(header.RangeID),
   152  				})
   153  			}
   154  			if mt := et.InternalCommitTrigger.MergeTrigger; mt != nil {
   155  				// Merges copy over the RHS abort span to the LHS, and compute
   156  				// replicated range ID stats over the RHS in the merge trigger.
   157  				latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{
   158  					Key:    abortspan.MinKey(mt.LeftDesc.RangeID),
   159  					EndKey: abortspan.MaxKey(mt.LeftDesc.RangeID).PrefixEnd(),
   160  				})
   161  				latchSpans.AddNonMVCC(spanset.SpanReadOnly, roachpb.Span{
   162  					Key:    keys.MakeRangeIDReplicatedPrefix(mt.RightDesc.RangeID),
   163  					EndKey: keys.MakeRangeIDReplicatedPrefix(mt.RightDesc.RangeID).PrefixEnd(),
   164  				})
   165  			}
   166  		}
   167  	}
   168  }
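
         // Illustrative sketch (not part of the original file): a minimal
         // declare-keys helper in the style of the functions above, using only
         // the SpanSet methods already exercised here. The helper name and its
         // exact choice of spans are hypothetical.
         func exampleDeclareTxnKeys(txn *roachpb.Transaction, latchSpans *spanset.SpanSet) {
         	// Non-MVCC (timestamp-less) latch on the transaction record key;
         	// transaction records are not versioned, so MVCC timestamps do not
         	// apply to them.
         	latchSpans.AddNonMVCC(spanset.SpanReadWrite, roachpb.Span{
         		Key: keys.TransactionKey(txn.Key, txn.ID),
         	})
         	// MVCC latch over a lock span, declared at the transaction's minimum
         	// timestamp so that it conflicts with any overlapping request at or
         	// above that timestamp.
         	latchSpans.AddMVCC(spanset.SpanReadWrite, roachpb.Span{Key: txn.Key}, txn.MinTimestamp)
         }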
   169  
   170  // EndTxn either commits or aborts (rolls back) an extant transaction according
   171  // to the args.Commit parameter. Rolling back an already rolled-back txn is ok.
   172  // TODO(nvanbenschoten): rename this file to cmd_end_txn.go once some of andrei's
   173  // recent PRs have landed.
   174  func EndTxn(
   175  	ctx context.Context, readWriter storage.ReadWriter, cArgs CommandArgs, resp roachpb.Response,
   176  ) (result.Result, error) {
   177  	args := cArgs.Args.(*roachpb.EndTxnRequest)
   178  	h := cArgs.Header
   179  	ms := cArgs.Stats
   180  	reply := resp.(*roachpb.EndTxnResponse)
   181  
   182  	if err := VerifyTransaction(h, args, roachpb.PENDING, roachpb.STAGING, roachpb.ABORTED); err != nil {
   183  		return result.Result{}, err
   184  	}
   185  	if args.Require1PC {
   186  		// If a 1PC txn was required and we're in EndTxn, we've failed to evaluate
   187  		// the batch as a 1PC. We're returning early instead of preferring a
   188  		// possible retriable error because we might want to leave locks behind in
   189  		// case of retriable errors - which Require1PC does not want.
   190  		return result.Result{}, roachpb.NewTransactionStatusError("could not commit in one phase as requested")
   191  	}
   192  	if args.Commit && args.Poison {
   193  		return result.Result{}, errors.Errorf("cannot poison during a committing EndTxn request")
   194  	}
   195  
   196  	key := keys.TransactionKey(h.Txn.Key, h.Txn.ID)
   197  
   198  	// Fetch existing transaction.
   199  	var existingTxn roachpb.Transaction
   200  	if ok, err := storage.MVCCGetProto(
   201  		ctx, readWriter, key, hlc.Timestamp{}, &existingTxn, storage.MVCCGetOptions{},
   202  	); err != nil {
   203  		return result.Result{}, err
   204  	} else if !ok {
   205  		// No existing transaction record was found - create one by writing it
   206  		// below in updateFinalizedTxn.
   207  		reply.Txn = h.Txn.Clone()
   208  
   209  		// Verify that it is safe to create the transaction record. We only need
   210  		// to perform this verification for commits. Rollbacks can always write
   211  		// an aborted txn record.
   212  		if args.Commit {
   213  			if err := CanCreateTxnRecord(cArgs.EvalCtx, reply.Txn); err != nil {
   214  				return result.Result{}, err
   215  			}
   216  		}
   217  	} else {
   218  		// We're using existingTxn on the reply, although it can be stale
   219  		// compared to the Transaction in the request (e.g. the Sequence,
   220  		// and various timestamps). We must be careful to update it with the
   221  		// supplied ba.Txn if we return it with an error which might be
   222  		// retried, as for example to avoid client-side serializable restart.
   223  		reply.Txn = &existingTxn
   224  
   225  		// Verify that we can either commit it or abort it (according
   226  		// to args.Commit), and also that the Timestamp and Epoch have
   227  		// not suffered regression.
   228  		switch reply.Txn.Status {
   229  		case roachpb.COMMITTED:
   230  			// This can happen if the coordinator had left the transaction in the
   231  			// implicitly committed state, and is now coming to clean it up. Someone
   232  			// else must have performed the STAGING->COMMITTED transition in the
   233  			// meantime. The TransactionStatusError is going to be handled by the
   234  			// txnCommitter interceptor.
   235  			log.VEventf(ctx, 2, "transaction found to be already committed")
   236  			return result.Result{}, roachpb.NewTransactionCommittedStatusError()
   237  
   238  		case roachpb.ABORTED:
   239  			if !args.Commit {
    240  				// The transaction has already been aborted by another
    241  				// request. Do not return a TransactionAbortedError since the
    242  				// client wanted to abort the transaction anyway.
   243  				desc := cArgs.EvalCtx.Desc()
   244  				resolvedLocks, externalLocks, err := resolveLocalLocks(ctx, desc, readWriter, ms, args, reply.Txn, cArgs.EvalCtx)
   245  				if err != nil {
   246  					return result.Result{}, err
   247  				}
   248  				if err := updateFinalizedTxn(
   249  					ctx, readWriter, ms, key, args, reply.Txn, externalLocks,
   250  				); err != nil {
   251  					return result.Result{}, err
   252  				}
   253  				// Use alwaysReturn==true because the transaction is definitely
   254  				// aborted, no matter what happens to this command.
   255  				res := result.FromEndTxn(reply.Txn, true /* alwaysReturn */, args.Poison)
   256  				res.Local.ResolvedLocks = resolvedLocks
   257  				return res, nil
   258  			}
   259  			// If the transaction was previously aborted by a concurrent writer's
   260  			// push, any intents written are still open. It's only now that we know
   261  			// them, so we return them all for asynchronous resolution (we're
   262  			// currently not able to write on error, but see #1989).
   263  			//
   264  			// Similarly to above, use alwaysReturn==true. The caller isn't trying
   265  			// to abort, but the transaction is definitely aborted and its locks
   266  			// can go.
   267  			reply.Txn.LockSpans = args.LockSpans
   268  			return result.FromEndTxn(reply.Txn, true /* alwaysReturn */, args.Poison),
   269  				roachpb.NewTransactionAbortedError(roachpb.ABORT_REASON_ABORTED_RECORD_FOUND)
   270  
   271  		case roachpb.PENDING, roachpb.STAGING:
   272  			if h.Txn.Epoch < reply.Txn.Epoch {
   273  				return result.Result{}, errors.AssertionFailedf(
   274  					"programming error: epoch regression: %d", h.Txn.Epoch)
   275  			}
   276  
   277  		default:
   278  			return result.Result{}, errors.AssertionFailedf("bad txn status: %s", reply.Txn)
   279  		}
   280  
   281  		// Update the existing txn with the supplied txn.
   282  		reply.Txn.Update(h.Txn)
   283  	}
   284  
   285  	// Attempt to commit or abort the transaction per the args.Commit parameter.
   286  	if args.Commit {
   287  		if retry, reason, extraMsg := IsEndTxnTriggeringRetryError(reply.Txn, args); retry {
   288  			return result.Result{}, roachpb.NewTransactionRetryError(reason, extraMsg)
   289  		}
   290  
   291  		// If the transaction needs to be staged as part of an implicit commit
   292  		// before being explicitly committed, write the staged transaction
   293  		// record and return without running commit triggers or resolving local
   294  		// locks.
   295  		if args.IsParallelCommit() {
   296  			// It's not clear how to combine transaction recovery with commit
   297  			// triggers, so for now we don't allow them to mix. This shouldn't
   298  			// cause any issues and the txn coordinator knows not to mix them.
   299  			if ct := args.InternalCommitTrigger; ct != nil {
   300  				err := errors.Errorf("cannot stage transaction with a commit trigger: %+v", ct)
   301  				return result.Result{}, err
   302  			}
   303  
   304  			reply.Txn.Status = roachpb.STAGING
   305  			reply.StagingTimestamp = reply.Txn.WriteTimestamp
   306  			if err := updateStagingTxn(ctx, readWriter, ms, key, args, reply.Txn); err != nil {
   307  				return result.Result{}, err
   308  			}
   309  			return result.Result{}, nil
   310  		}
   311  
   312  		// Else, the transaction can be explicitly committed.
   313  		reply.Txn.Status = roachpb.COMMITTED
   314  	} else {
   315  		reply.Txn.Status = roachpb.ABORTED
   316  	}
   317  
   318  	// Resolve locks on the local range synchronously so that their resolution
    319  	// ends up in the same Raft entry. There should always be at least one local lock
    320  	// because we position the transaction record at the key of the transaction's first write.
   321  	// This avoids the need for the intentResolver to have to return to this range
   322  	// to resolve locks for this transaction in the future.
   323  	desc := cArgs.EvalCtx.Desc()
   324  	resolvedLocks, externalLocks, err := resolveLocalLocks(ctx, desc, readWriter, ms, args, reply.Txn, cArgs.EvalCtx)
   325  	if err != nil {
   326  		return result.Result{}, err
   327  	}
   328  	if err := updateFinalizedTxn(ctx, readWriter, ms, key, args, reply.Txn, externalLocks); err != nil {
   329  		return result.Result{}, err
   330  	}
   331  
   332  	// Note: there's no need to clear the AbortSpan state if we've successfully
   333  	// finalized a transaction, as there's no way in which an abort cache entry
   334  	// could have been written (the txn would already have been in
   335  	// state=ABORTED).
   336  	//
   337  	// Summary of transaction replay protection after EndTxn: When a
   338  	// transactional write gets replayed over its own resolved intents, the
   339  	// write will succeed but only as an intent with a newer timestamp (with a
   340  	// WriteTooOldError). However, the replayed intent cannot be resolved by a
   341  	// subsequent replay of this EndTxn call because the txn timestamp will be
   342  	// too old. Replays of requests which attempt to create a new txn record
    343  	// (HeartbeatTxn or EndTxn) never succeed because EndTxn inserts an entry
    344  	// into the timestamp cache (in Replica's updateTimestampCache method),
    345  	// forcing the call to CanCreateTxnRecord to return false and resulting in
    346  	// a transaction retry error. If the replay didn't attempt to create a txn
    347  	// record, any push will immediately succeed, because pushing a missing txn
    348  	// record for which CanCreateTxnRecord returns false succeeds. In both
    349  	// cases, the txn will be GC'd on the slow path.
   350  	//
   351  	// We specify alwaysReturn==false because if the commit fails below Raft, we
   352  	// don't want the locks to be up for resolution. That should happen only if
   353  	// the commit actually happens; otherwise, we risk losing writes.
   354  	txnResult := result.FromEndTxn(reply.Txn, false /* alwaysReturn */, args.Poison)
   355  	txnResult.Local.UpdatedTxns = []*roachpb.Transaction{reply.Txn}
   356  	txnResult.Local.ResolvedLocks = resolvedLocks
   357  
   358  	// Run the rest of the commit triggers if successfully committed.
   359  	if reply.Txn.Status == roachpb.COMMITTED {
   360  		triggerResult, err := RunCommitTrigger(
   361  			ctx, cArgs.EvalCtx, readWriter.(storage.Batch), ms, args, reply.Txn,
   362  		)
   363  		if err != nil {
   364  			return result.Result{}, roachpb.NewReplicaCorruptionError(err)
   365  		}
   366  		if err := txnResult.MergeAndDestroy(triggerResult); err != nil {
   367  			return result.Result{}, err
   368  		}
   369  	} else if reply.Txn.Status == roachpb.ABORTED {
   370  		// If this is the system config span and we're aborted, add a trigger to
   371  		// potentially gossip now that we've removed an intent. This is important
    372  		// to deal with cases where previously committed values were not gossiped
   373  		// due to an outstanding intent.
   374  		if cArgs.EvalCtx.ContainsKey(keys.SystemConfigSpan.Key) {
   375  			txnResult.Local.MaybeGossipSystemConfigIfHaveFailure = true
   376  		}
   377  	}
   378  
   379  	return txnResult, nil
   380  }
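
         // Illustrative sketch (not part of the original file): the final status
         // that the EndTxn evaluation above assigns to the transaction record,
         // restated as a small hypothetical helper for readability.
         func exampleFinalStatus(args *roachpb.EndTxnRequest) roachpb.TransactionStatus {
         	if !args.Commit {
         		return roachpb.ABORTED
         	}
         	if args.IsParallelCommit() {
         		// Parallel commits first write a STAGING record; the transition to
         		// COMMITTED happens in a later EndTxn or via transaction recovery.
         		return roachpb.STAGING
         	}
         	return roachpb.COMMITTED
         }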
   381  
   382  // IsEndTxnExceedingDeadline returns true if the transaction exceeded its
   383  // deadline.
   384  func IsEndTxnExceedingDeadline(t hlc.Timestamp, args *roachpb.EndTxnRequest) bool {
   385  	return args.Deadline != nil && args.Deadline.LessEq(t)
   386  }
   387  
   388  // IsEndTxnTriggeringRetryError returns true if the EndTxnRequest cannot be
   389  // committed and needs to return a TransactionRetryError. It also returns the
   390  // reason and possibly an extra message to be used for the error.
   391  func IsEndTxnTriggeringRetryError(
   392  	txn *roachpb.Transaction, args *roachpb.EndTxnRequest,
   393  ) (retry bool, reason roachpb.TransactionRetryReason, extraMsg string) {
   394  	// If we saw any WriteTooOldErrors, we must restart to avoid lost
   395  	// update anomalies.
   396  	if txn.WriteTooOld {
   397  		retry, reason = true, roachpb.RETRY_WRITE_TOO_OLD
   398  	} else {
   399  		readTimestamp := txn.ReadTimestamp
   400  		isTxnPushed := txn.WriteTimestamp != readTimestamp
   401  
   402  		// Return a transaction retry error if the commit timestamp isn't equal to
   403  		// the txn timestamp.
   404  		if isTxnPushed {
   405  			retry, reason = true, roachpb.RETRY_SERIALIZABLE
   406  		}
   407  	}
   408  
   409  	// A transaction must obey its deadline, if set.
   410  	if !retry && IsEndTxnExceedingDeadline(txn.WriteTimestamp, args) {
   411  		exceededBy := txn.WriteTimestamp.GoTime().Sub(args.Deadline.GoTime())
   412  		extraMsg = fmt.Sprintf(
   413  			"txn timestamp pushed too much; deadline exceeded by %s (%s > %s)",
   414  			exceededBy, txn.WriteTimestamp, args.Deadline)
   415  		retry, reason = true, roachpb.RETRY_COMMIT_DEADLINE_EXCEEDED
   416  	}
   417  	return retry, reason, extraMsg
   418  }
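
         // Illustrative sketch (not part of the original file): a transaction whose
         // write timestamp was pushed above its read timestamp yields
         // RETRY_SERIALIZABLE from the function above. The values are made up.
         func exampleRetryOnPushedTimestamp() (bool, roachpb.TransactionRetryReason, string) {
         	var txn roachpb.Transaction
         	txn.ReadTimestamp = hlc.Timestamp{WallTime: 100}
         	txn.WriteTimestamp = hlc.Timestamp{WallTime: 200} // pushed by a conflicting read
         	// Returns (true, RETRY_SERIALIZABLE, "") because the txn was pushed and
         	// no deadline is set.
         	return IsEndTxnTriggeringRetryError(&txn, &roachpb.EndTxnRequest{Commit: true})
         }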
   419  
   420  // CanForwardCommitTimestampWithoutRefresh returns whether a txn can be
   421  // safely committed with a timestamp above its read timestamp without
   422  // requiring a read refresh (see txnSpanRefresher). This requires that
   423  // the transaction's timestamp has not leaked and that the transaction
   424  // has encountered no spans which require refreshing at the forwarded
    425  // timestamp. If either of those conditions is violated, a client-side
   426  // retry is required.
   427  //
   428  // Note that when deciding whether a transaction can be bumped to a particular
    429  // timestamp, the transaction's deadline must also be taken into account.
   430  func CanForwardCommitTimestampWithoutRefresh(
   431  	txn *roachpb.Transaction, args *roachpb.EndTxnRequest,
   432  ) bool {
   433  	return !txn.CommitTimestampFixed && args.CanCommitAtHigherTimestamp
   434  }
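
         // Illustrative sketch (not part of the original file): a caller could
         // combine the check above with the deadline check, per the note about the
         // deadline in the comment; the helper itself is hypothetical.
         func exampleCanCommitWithoutRefresh(txn *roachpb.Transaction, args *roachpb.EndTxnRequest) bool {
         	if !CanForwardCommitTimestampWithoutRefresh(txn, args) {
         		return false
         	}
         	// Even when the commit timestamp may be forwarded without a refresh,
         	// the deadline still applies at the forwarded (write) timestamp.
         	return !IsEndTxnExceedingDeadline(txn.WriteTimestamp, args)
         }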
   435  
   436  const lockResolutionBatchSize = 500
   437  
   438  // resolveLocalLocks synchronously resolves any locks that are local to this
   439  // range in the same batch and returns those lock spans. The remainder are
   440  // collected and returned so that they can be handed off to asynchronous
   441  // processing. Note that there is a maximum lock resolution allowance of
   442  // lockResolutionBatchSize meant to avoid creating a batch which is too large
   443  // for Raft. Any local locks which exceed the allowance are treated as
   444  // external and are resolved asynchronously with the external locks.
   445  func resolveLocalLocks(
   446  	ctx context.Context,
   447  	desc *roachpb.RangeDescriptor,
   448  	readWriter storage.ReadWriter,
   449  	ms *enginepb.MVCCStats,
   450  	args *roachpb.EndTxnRequest,
   451  	txn *roachpb.Transaction,
   452  	evalCtx EvalContext,
   453  ) (resolvedLocks []roachpb.LockUpdate, externalLocks []roachpb.Span, _ error) {
   454  	if mergeTrigger := args.InternalCommitTrigger.GetMergeTrigger(); mergeTrigger != nil {
   455  		// If this is a merge, then use the post-merge descriptor to determine
   456  		// which locks are local (note that for a split, we want to use the
   457  		// pre-split one instead because it's larger).
   458  		desc = &mergeTrigger.LeftDesc
   459  	}
   460  
   461  	iter := readWriter.NewIterator(storage.IterOptions{
   462  		UpperBound: desc.EndKey.AsRawKey(),
   463  	})
   464  	iterAndBuf := storage.GetBufUsingIter(iter)
   465  	defer iterAndBuf.Cleanup()
   466  
   467  	var resolveAllowance int64 = lockResolutionBatchSize
   468  	if args.InternalCommitTrigger != nil {
   469  		// If this is a system transaction (such as a split or merge), don't enforce the resolve allowance.
   470  		// These transactions rely on having their locks resolved synchronously.
   471  		resolveAllowance = math.MaxInt64
   472  	}
   473  	for _, span := range args.LockSpans {
   474  		if err := func() error {
   475  			if resolveAllowance == 0 {
   476  				externalLocks = append(externalLocks, span)
   477  				return nil
   478  			}
   479  			update := roachpb.MakeLockUpdate(txn, span)
   480  			if len(span.EndKey) == 0 {
   481  				// For single-key lock updates, do a KeyAddress-aware check of
   482  				// whether it's contained in our Range.
   483  				if !kvserverbase.ContainsKey(desc, span.Key) {
   484  					externalLocks = append(externalLocks, span)
   485  					return nil
   486  				}
   487  				resolveMS := ms
   488  				ok, err := storage.MVCCResolveWriteIntentUsingIter(ctx, readWriter, iterAndBuf, resolveMS, update)
   489  				if err != nil {
   490  					return err
   491  				}
   492  				if ok {
   493  					resolveAllowance--
   494  				}
   495  				resolvedLocks = append(resolvedLocks, update)
   496  				return nil
   497  			}
    498  			// For ranged lock updates, cut into the parts inside and outside our
    499  			// key range: resolve the part inside locally and delegate the rest.
    500  			// A ranged update for range-local data is correctly considered local.
   501  			inSpan, outSpans := kvserverbase.IntersectSpan(span, desc)
   502  			externalLocks = append(externalLocks, outSpans...)
   503  			if inSpan != nil {
   504  				update.Span = *inSpan
   505  				num, resumeSpan, err := storage.MVCCResolveWriteIntentRangeUsingIter(ctx, readWriter, iterAndBuf, ms, update, resolveAllowance)
   506  				if err != nil {
   507  					return err
   508  				}
   509  				if evalCtx.EvalKnobs().NumKeysEvaluatedForRangeIntentResolution != nil {
   510  					atomic.AddInt64(evalCtx.EvalKnobs().NumKeysEvaluatedForRangeIntentResolution, num)
   511  				}
   512  				resolveAllowance -= num
   513  				if resumeSpan != nil {
   514  					if resolveAllowance != 0 {
   515  						log.Fatalf(ctx, "expected resolve allowance to be exactly 0 resolving %s; got %d", update.Span, resolveAllowance)
   516  					}
   517  					update.EndKey = resumeSpan.Key
   518  					externalLocks = append(externalLocks, *resumeSpan)
   519  				}
   520  				resolvedLocks = append(resolvedLocks, update)
   521  				return nil
   522  			}
   523  			return nil
   524  		}(); err != nil {
   525  			return nil, nil, errors.Wrapf(err, "resolving lock at %s on end transaction [%s]", span, txn.Status)
   526  		}
   527  	}
   528  
   529  	removedAny := resolveAllowance != lockResolutionBatchSize
   530  	if WriteAbortSpanOnResolve(txn.Status, args.Poison, removedAny) {
   531  		if err := UpdateAbortSpan(ctx, evalCtx, readWriter, ms, txn.TxnMeta, args.Poison); err != nil {
   532  			return nil, nil, err
   533  		}
   534  	}
   535  	return resolvedLocks, externalLocks, nil
   536  }
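
         // Illustrative sketch (not part of the original file): the local/external
         // classification performed above for a single lock span, expressed as a
         // hypothetical standalone helper.
         func exampleSplitLockSpan(
         	desc *roachpb.RangeDescriptor, span roachpb.Span,
         ) (local *roachpb.Span, external []roachpb.Span) {
         	if len(span.EndKey) == 0 {
         		// Point span: it is either fully local or fully external.
         		if kvserverbase.ContainsKey(desc, span.Key) {
         			return &span, nil
         		}
         		return nil, []roachpb.Span{span}
         	}
         	// Ranged span: resolve the intersecting part locally, hand off the rest.
         	return kvserverbase.IntersectSpan(span, desc)
         }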
   537  
   538  // updateStagingTxn persists the STAGING transaction record with updated status
   539  // (and possibly timestamp). It persists the record with the EndTxn request's
   540  // declared in-flight writes along with all of the transaction's (local and
   541  // remote) locks.
   542  func updateStagingTxn(
   543  	ctx context.Context,
   544  	readWriter storage.ReadWriter,
   545  	ms *enginepb.MVCCStats,
   546  	key []byte,
   547  	args *roachpb.EndTxnRequest,
   548  	txn *roachpb.Transaction,
   549  ) error {
   550  	txn.LockSpans = args.LockSpans
   551  	txn.InFlightWrites = args.InFlightWrites
   552  	txnRecord := txn.AsRecord()
   553  	return storage.MVCCPutProto(ctx, readWriter, ms, key, hlc.Timestamp{}, nil /* txn */, &txnRecord)
   554  }
   555  
   556  // updateFinalizedTxn persists the COMMITTED or ABORTED transaction record with
   557  // updated status (and possibly timestamp). If we've already resolved all locks
   558  // locally, we actually delete the record right away - no use in keeping it
   559  // around.
   560  func updateFinalizedTxn(
   561  	ctx context.Context,
   562  	readWriter storage.ReadWriter,
   563  	ms *enginepb.MVCCStats,
   564  	key []byte,
   565  	args *roachpb.EndTxnRequest,
   566  	txn *roachpb.Transaction,
   567  	externalLocks []roachpb.Span,
   568  ) error {
   569  	if txnAutoGC && len(externalLocks) == 0 {
   570  		if log.V(2) {
   571  			log.Infof(ctx, "auto-gc'ed %s (%d locks)", txn.Short(), len(args.LockSpans))
   572  		}
   573  		return storage.MVCCDelete(ctx, readWriter, ms, key, hlc.Timestamp{}, nil /* txn */)
   574  	}
   575  	txn.LockSpans = externalLocks
   576  	txn.InFlightWrites = nil
   577  	txnRecord := txn.AsRecord()
   578  	return storage.MVCCPutProto(ctx, readWriter, ms, key, hlc.Timestamp{}, nil /* txn */, &txnRecord)
   579  }
   580  
   581  // RunCommitTrigger runs the commit trigger from an end transaction request.
   582  func RunCommitTrigger(
   583  	ctx context.Context,
   584  	rec EvalContext,
   585  	batch storage.Batch,
   586  	ms *enginepb.MVCCStats,
   587  	args *roachpb.EndTxnRequest,
   588  	txn *roachpb.Transaction,
   589  ) (result.Result, error) {
   590  	ct := args.InternalCommitTrigger
   591  	if ct == nil {
   592  		return result.Result{}, nil
   593  	}
   594  
   595  	if ct.GetSplitTrigger() != nil {
   596  		newMS, trigger, err := splitTrigger(
   597  			ctx, rec, batch, *ms, ct.SplitTrigger, txn.WriteTimestamp,
   598  		)
   599  		*ms = newMS
   600  		return trigger, err
   601  	}
   602  	if mt := ct.GetMergeTrigger(); mt != nil {
   603  		return mergeTrigger(ctx, rec, batch, ms, mt, txn.WriteTimestamp)
   604  	}
   605  	if crt := ct.GetChangeReplicasTrigger(); crt != nil {
   606  		// TODO(tbg): once we support atomic replication changes, check that
   607  		// crt.Added() and crt.Removed() don't intersect (including mentioning
   608  		// the same replica more than once individually) because it would be
   609  		// silly (though possible) to have to attach semantics to that.
   610  		return changeReplicasTrigger(ctx, rec, batch, crt), nil
   611  	}
   612  	if ct.GetModifiedSpanTrigger() != nil {
   613  		var pd result.Result
   614  		if ct.ModifiedSpanTrigger.SystemConfigSpan {
   615  			// Check if we need to gossip the system config.
   616  			// NOTE: System config gossiping can only execute correctly if
   617  			// the transaction record is located on the range that contains
   618  			// the system span. If a transaction is created which modifies
   619  			// both system *and* non-system data, it should be ensured that
   620  			// the transaction record itself is on the system span. This can
   621  			// be done by making sure a system key is the first key touched
   622  			// in the transaction.
   623  			if rec.ContainsKey(keys.SystemConfigSpan.Key) {
   624  				if err := pd.MergeAndDestroy(
   625  					result.Result{
   626  						Local: result.LocalResult{
   627  							MaybeGossipSystemConfig: true,
   628  						},
   629  					},
   630  				); err != nil {
   631  					return result.Result{}, err
   632  				}
   633  			} else {
   634  				log.Errorf(ctx, "System configuration span was modified, but the "+
   635  					"modification trigger is executing on a non-system range. "+
   636  					"Configuration changes will not be gossiped.")
   637  			}
   638  		}
   639  		if nlSpan := ct.ModifiedSpanTrigger.NodeLivenessSpan; nlSpan != nil {
   640  			if err := pd.MergeAndDestroy(
   641  				result.Result{
   642  					Local: result.LocalResult{
   643  						MaybeGossipNodeLiveness: nlSpan,
   644  					},
   645  				},
   646  			); err != nil {
   647  				return result.Result{}, err
   648  			}
   649  		}
   650  		return pd, nil
   651  	}
   652  	if sbt := ct.GetStickyBitTrigger(); sbt != nil {
   653  		newDesc := *rec.Desc()
   654  		if sbt.StickyBit != (hlc.Timestamp{}) {
   655  			newDesc.StickyBit = &sbt.StickyBit
   656  		} else {
   657  			newDesc.StickyBit = nil
   658  		}
   659  		var res result.Result
   660  		res.Replicated.State = &kvserverpb.ReplicaState{
   661  			Desc: &newDesc,
   662  		}
   663  		return res, nil
   664  	}
   665  
   666  	log.Fatalf(ctx, "unknown commit trigger: %+v", ct)
   667  	return result.Result{}, nil
   668  }
   669  
   670  // splitTrigger is called on a successful commit of a transaction
   671  // containing an AdminSplit operation. It copies the AbortSpan for
   672  // the new range and recomputes stats for both the existing, left hand
   673  // side (LHS) range and the right hand side (RHS) range. For
   674  // performance it only computes the stats for the original range (the
   675  // left hand side) and infers the RHS stats by subtracting from the
   676  // original stats. We compute the LHS stats because the split key
   677  // computation ensures that we do not create large LHS
   678  // ranges. However, this optimization is only possible if the stats
   679  // are fully accurate. If they contain estimates, stats for both the
   680  // LHS and RHS are computed.
   681  //
   682  // Splits are complicated. A split is initiated when a replica receives an
   683  // AdminSplit request. Note that this request (and other "admin" requests)
   684  // differs from normal requests in that it doesn't go through Raft but instead
   685  // allows the lease holder Replica to act as the orchestrator for the
   686  // distributed transaction that performs the split. As such, this request is
   687  // only executed on the lease holder replica and the request is redirected to
   688  // the lease holder if the recipient is a follower.
   689  //
   690  // Splits do not require the lease for correctness (which is good, because we
   691  // only check that the lease is held at the beginning of the operation, and
   692  // have no way to ensure that it is continually held until the end). Followers
   693  // could perform splits too, and the only downside would be that if two splits
   694  // were attempted concurrently (or a split and a ChangeReplicas), one would
   695  // fail. The lease is used to designate one replica for this role and avoid
   696  // wasting time on splits that may fail.
   697  //
   698  // The processing of splits is divided into two phases. The first phase occurs
   699  // in Replica.AdminSplit. In that phase, the split-point is computed, and a
   700  // transaction is started which updates both the LHS and RHS range descriptors
   701  // and the meta range addressing information. (If we're splitting a meta2 range
   702  // we'll be updating the meta1 addressing, otherwise we'll be updating the
   703  // meta2 addressing). That transaction includes a special SplitTrigger flag on
   704  // the EndTxn request. Like all transactions, the requests within the
   705  // transaction are replicated via Raft, including the EndTxn request.
   706  //
   707  // The second phase of split processing occurs when each replica for the range
   708  // encounters the SplitTrigger. Processing of the SplitTrigger happens below,
   709  // in Replica.splitTrigger. The processing of the SplitTrigger occurs in two
   710  // stages. The first stage operates within the context of an engine.Batch and
   711  // updates all of the on-disk state for the old and new ranges atomically. The
   712  // second stage is invoked when the batch commits and updates the in-memory
   713  // state, creating the new replica in memory and populating its timestamp cache
   714  // and registering it with the store.
   715  //
   716  // There is lots of subtlety here. The easy scenario is that all of the
   717  // replicas process the SplitTrigger before processing any Raft message for RHS
   718  // (right hand side) of the newly split range. Something like:
   719  //
   720  //         Node A             Node B             Node C
   721  //     ----------------------------------------------------
   722  // range 1   |                  |                  |
   723  //           |                  |                  |
   724  //      SplitTrigger            |                  |
   725  //           |             SplitTrigger            |
   726  //           |                  |             SplitTrigger
   727  //           |                  |                  |
   728  //     ----------------------------------------------------
   729  // split finished on A, B and C |                  |
   730  //           |                  |                  |
   731  // range 2   |                  |                  |
   732  //           | ---- MsgVote --> |                  |
   733  //           | ---------------------- MsgVote ---> |
   734  //
   735  // But that ideal ordering is not guaranteed. The split is "finished" when two
   736  // of the replicas have appended the end-txn request containing the
   737  // SplitTrigger to their Raft log. The following scenario is possible:
   738  //
   739  //         Node A             Node B             Node C
   740  //     ----------------------------------------------------
   741  // range 1   |                  |                  |
   742  //           |                  |                  |
   743  //      SplitTrigger            |                  |
   744  //           |             SplitTrigger            |
   745  //           |                  |                  |
   746  //     ----------------------------------------------------
   747  // split finished on A and B    |                  |
   748  //           |                  |                  |
   749  // range 2   |                  |                  |
   750  //           | ---- MsgVote --> |                  |
   751  //           | --------------------- MsgVote ---> ???
   752  //           |                  |                  |
   753  //           |                  |             SplitTrigger
   754  //
   755  // In this scenario, C will create range 2 upon reception of the MsgVote from
   756  // A, though locally that span of keys is still part of range 1. This is
   757  // possible because at the Raft level ranges are identified by integer IDs and
   758  // it isn't until C receives a snapshot of range 2 from the leader that it
   759  // discovers the span of keys it covers. In order to prevent C from fully
   760  // initializing range 2 in this instance, we prohibit applying a snapshot to a
   761  // range if the snapshot overlaps another range. See Store.canApplySnapshotLocked.
   762  //
   763  // But while a snapshot may not have been applied at C, an uninitialized
   764  // Replica was created. An uninitialized Replica is one which belongs to a Raft
   765  // group but for which the range descriptor has not been received. This Replica
   766  // will have participated in the Raft elections. When we're creating the new
   767  // Replica below we take control of this uninitialized Replica and stop it from
   768  // responding to Raft messages by marking it "destroyed". Note that we use the
   769  // Replica.mu.destroyed field for this, but we don't do everything that
   770  // Replica.Destroy does (so we should probably rename that field in light of
   771  // its new uses). In particular we don't touch any data on disk or leave a
   772  // tombstone. This is especially important because leaving a tombstone would
   773  // prevent the legitimate recreation of this replica.
   774  //
   775  // There is subtle synchronization here that is currently controlled by the
   776  // Store.processRaft goroutine. In particular, the serial execution of
   777  // Replica.handleRaftReady by Store.processRaft ensures that an uninitialized
   778  // RHS won't be concurrently executing in Replica.handleRaftReady because we're
   779  // currently running on that goroutine (i.e. Replica.splitTrigger is called on
   780  // the processRaft goroutine).
   781  //
   782  // TODO(peter): The above synchronization needs to be fixed. Using a single
   783  // goroutine for executing Replica.handleRaftReady is undesirable from a
   784  // performance perspective. Likely we will have to add a mutex to Replica to
   785  // protect handleRaftReady and to grab that mutex below when marking the
   786  // uninitialized Replica as "destroyed". Hopefully we'll also be able to remove
   787  // Store.processRaftMu.
   788  //
   789  // Note that in this more complex scenario, A (which performed the SplitTrigger
   790  // first) will create the associated Raft group for range 2 and start
   791  // campaigning immediately. It is possible for B to receive MsgVote requests
   792  // before it has applied the SplitTrigger as well. Both B and C will vote for A
   793  // (and preserve the records of that vote in their HardState). It is critically
   794  // important for Raft correctness that we do not lose the records of these
   795  // votes. After electing A the Raft leader for range 2, A will then attempt to
   796  // send a snapshot to B and C and we'll fall into the situation above where a
   797  // snapshot is received for a range before it has finished splitting from its
   798  // sibling and is thus rejected. An interesting subtlety here: A will send a
    799  // snapshot to B and C because when range 2 is initialized we were careful to
    800  // synthesize its HardState with its Raft log index set to 10. If we had instead
   801  // used log index 0, Raft would have believed the group to be empty, but the
   802  // RHS has something. Using a non-zero initial log index causes Raft to believe
   803  // that there is a discarded prefix to the log and will thus send a snapshot to
   804  // followers.
   805  //
   806  // A final point of clarification: when we split a range we're splitting the
   807  // data the range contains. But we're not forking or splitting the associated
   808  // Raft group. Instead, we're creating a new Raft group to control the RHS of
   809  // the split. That Raft group is starting from an empty Raft log (positioned at
   810  // log entry 10) and a snapshot of the RHS of the split range.
   811  //
   812  // After the split trigger returns, the on-disk state of the right-hand side
   813  // will be suitable for instantiating the right hand side Replica, and
   814  // a suitable trigger is returned, along with the updated stats which represent
   815  // the LHS delta caused by the split (i.e. all writes in the current batch
   816  // which went to the left-hand side, minus the kv pairs which moved to the
   817  // RHS).
   818  //
   819  // These stats are suitable for returning up the callstack like those for
   820  // regular commands; the corresponding delta for the RHS is part of the
   821  // returned trigger and is handled by the Store.
   822  func splitTrigger(
   823  	ctx context.Context,
   824  	rec EvalContext,
   825  	batch storage.Batch,
   826  	bothDeltaMS enginepb.MVCCStats,
   827  	split *roachpb.SplitTrigger,
   828  	ts hlc.Timestamp,
   829  ) (enginepb.MVCCStats, result.Result, error) {
   830  	// TODO(andrei): should this span be a child of the ctx's (if any)?
   831  	sp := rec.ClusterSettings().Tracer.StartRootSpan(
   832  		"split", logtags.FromContext(ctx), tracing.NonRecordableSpan,
   833  	)
   834  	defer sp.Finish()
   835  	desc := rec.Desc()
   836  	if !bytes.Equal(desc.StartKey, split.LeftDesc.StartKey) ||
   837  		!bytes.Equal(desc.EndKey, split.RightDesc.EndKey) {
   838  		return enginepb.MVCCStats{}, result.Result{}, errors.Errorf("range does not match splits: (%s-%s) + (%s-%s) != %s",
   839  			split.LeftDesc.StartKey, split.LeftDesc.EndKey,
   840  			split.RightDesc.StartKey, split.RightDesc.EndKey, desc)
   841  	}
   842  
   843  	// Compute the absolute stats for the (post-split) LHS. No more
   844  	// modifications to it are allowed after this line.
   845  
   846  	leftMS, err := rditer.ComputeStatsForRange(&split.LeftDesc, batch, ts.WallTime)
   847  	if err != nil {
   848  		return enginepb.MVCCStats{}, result.Result{}, errors.Wrap(err, "unable to compute stats for LHS range after split")
   849  	}
   850  	log.Event(ctx, "computed stats for left hand side range")
   851  
   852  	h := splitStatsHelperInput{
   853  		AbsPreSplitBothEstimated: rec.GetMVCCStats(),
   854  		DeltaBatchEstimated:      bothDeltaMS,
   855  		AbsPostSplitLeft:         leftMS,
   856  		AbsPostSplitRightFn: func() (enginepb.MVCCStats, error) {
   857  			rightMS, err := rditer.ComputeStatsForRange(
   858  				&split.RightDesc, batch, ts.WallTime,
   859  			)
   860  			return rightMS, errors.Wrap(
   861  				err,
   862  				"unable to compute stats for RHS range after split",
   863  			)
   864  		},
   865  	}
   866  	return splitTriggerHelper(ctx, rec, batch, h, split, ts)
   867  }
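
         // Illustrative sketch (not part of the original file): when the pre-split
         // stats contain no estimates, the RHS stats can be derived arithmetically
         // rather than recomputed, which is the fast path the split stats helper
         // aims for. The helper below is hypothetical; the names mirror
         // splitStatsHelperInput.
         func exampleDerivedRHSStats(
         	absPreSplitBoth, deltaBatch, absPostSplitLeft enginepb.MVCCStats,
         ) enginepb.MVCCStats {
         	rhs := absPreSplitBoth         // combined stats before the split
         	rhs.Add(deltaBatch)            // plus the writes in the split batch itself
         	rhs.Subtract(absPostSplitLeft) // minus everything attributed to the LHS
         	return rhs
         }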
   868  
   869  // splitTriggerHelper continues the work begun by splitTrigger, but has a
   870  // reduced scope that has all stats-related concerns bundled into a
   871  // splitStatsHelper.
   872  func splitTriggerHelper(
   873  	ctx context.Context,
   874  	rec EvalContext,
   875  	batch storage.Batch,
   876  	statsInput splitStatsHelperInput,
   877  	split *roachpb.SplitTrigger,
   878  	ts hlc.Timestamp,
   879  ) (enginepb.MVCCStats, result.Result, error) {
   880  	// TODO(d4l3k): we should check which side of the split is smaller
   881  	// and compute stats for it instead of having a constraint that the
   882  	// left hand side is smaller.
   883  
   884  	// NB: the replicated post-split left hand keyspace is frozen at this point.
   885  	// Only the RHS can be mutated (and we do so to seed its state).
   886  
   887  	// Copy the last replica GC timestamp. This value is unreplicated,
   888  	// which is why the MVCC stats are set to nil on calls to
   889  	// MVCCPutProto.
   890  	replicaGCTS, err := rec.GetLastReplicaGCTimestamp(ctx)
   891  	if err != nil {
   892  		return enginepb.MVCCStats{}, result.Result{}, errors.Wrap(err, "unable to fetch last replica GC timestamp")
   893  	}
   894  	if err := storage.MVCCPutProto(ctx, batch, nil, keys.RangeLastReplicaGCTimestampKey(split.RightDesc.RangeID), hlc.Timestamp{}, nil, &replicaGCTS); err != nil {
   895  		return enginepb.MVCCStats{}, result.Result{}, errors.Wrap(err, "unable to copy last replica GC timestamp")
   896  	}
   897  
   898  	h, err := makeSplitStatsHelper(statsInput)
   899  	if err != nil {
   900  		return enginepb.MVCCStats{}, result.Result{}, err
   901  	}
   902  
   903  	// Initialize the RHS range's AbortSpan by copying the LHS's.
   904  	if err := rec.AbortSpan().CopyTo(
   905  		ctx, batch, batch, h.AbsPostSplitRight(), ts, split.RightDesc.RangeID,
   906  	); err != nil {
   907  		return enginepb.MVCCStats{}, result.Result{}, err
   908  	}
   909  
   910  	// Note: we don't copy the queue last processed times. This means
   911  	// we'll process the RHS range in consistency and time series
   912  	// maintenance queues again possibly sooner than if we copied. The
    913  	// intent is to limit post-raft logic.
   914  
   915  	// Now that we've computed the stats for the RHS so far, we persist them.
   916  	// This looks a bit more complicated than it really is: updating the stats
   917  	// also changes the stats, and we write not only the stats but a complete
   918  	// initial state. Additionally, since bothDeltaMS is tracking writes to
   919  	// both sides, we need to update it as well.
   920  	{
    921  		// Various pieces of code rely on a replica's lease never being uninitialized,
   922  		// but it's more than that - it ensures that we properly initialize the
   923  		// timestamp cache, which is only populated on the lease holder, from that
   924  		// of the original Range.  We found out about a regression here the hard way
   925  		// in #7899. Prior to this block, the following could happen:
   926  		// - a client reads key 'd', leaving an entry in the timestamp cache on the
   927  		//   lease holder of [a,e) at the time, node one.
   928  		// - the range [a,e) splits at key 'c'. [c,e) starts out without a lease.
   929  		// - the replicas of [a,e) on nodes one and two both process the split
   930  		//   trigger and thus copy their timestamp caches to the new right-hand side
   931  		//   Replica. However, only node one's timestamp cache contains information
   932  		//   about the read of key 'd' in the first place.
   933  		// - node two becomes the lease holder for [c,e). Its timestamp cache does
   934  		//   not know about the read at 'd' which happened at the beginning.
   935  		// - node two can illegally propose a write to 'd' at a lower timestamp.
   936  		//
   937  		// TODO(tschottdorf): why would this use r.store.Engine() and not the
   938  		// batch?
   939  		leftLease, err := MakeStateLoader(rec).LoadLease(ctx, rec.Engine())
   940  		if err != nil {
   941  			return enginepb.MVCCStats{}, result.Result{}, errors.Wrap(err, "unable to load lease")
   942  		}
   943  		if (leftLease == roachpb.Lease{}) {
   944  			log.Fatalf(ctx, "LHS of split has no lease")
   945  		}
   946  
   947  		replica, found := split.RightDesc.GetReplicaDescriptor(leftLease.Replica.StoreID)
   948  		if !found {
   949  			return enginepb.MVCCStats{}, result.Result{}, errors.Errorf(
   950  				"pre-split lease holder %+v not found in post-split descriptor %+v",
   951  				leftLease.Replica, split.RightDesc,
   952  			)
   953  		}
   954  		rightLease := leftLease
   955  		rightLease.Replica = replica
   956  
   957  		gcThreshold, err := MakeStateLoader(rec).LoadGCThreshold(ctx, rec.Engine())
   958  		if err != nil {
   959  			return enginepb.MVCCStats{}, result.Result{}, errors.Wrap(err, "unable to load GCThreshold")
   960  		}
   961  		if (*gcThreshold == hlc.Timestamp{}) {
   962  			log.VEventf(ctx, 1, "LHS's GCThreshold of split is not set")
   963  		}
   964  
   965  		// We're about to write the initial state for the replica. We migrated
   966  		// the formerly replicated truncated state into unreplicated keyspace
   967  		// in 19.1, but this range may still be using the replicated version
   968  		// and we need to make a decision about what to use for the RHS that
   969  		// is consistent across the followers: do for the RHS what the LHS
   970  		// does: if the LHS has the legacy key, initialize the RHS with a
   971  		// legacy key as well.
   972  		//
   973  		// See VersionUnreplicatedRaftTruncatedState.
   974  		truncStateType := stateloader.TruncatedStateUnreplicated
   975  		if found, err := storage.MVCCGetProto(
   976  			ctx,
   977  			batch,
   978  			keys.RaftTruncatedStateLegacyKey(rec.GetRangeID()),
   979  			hlc.Timestamp{},
   980  			nil,
   981  			storage.MVCCGetOptions{},
   982  		); err != nil {
   983  			return enginepb.MVCCStats{}, result.Result{}, errors.Wrap(err, "unable to load legacy truncated state")
   984  		} else if found {
   985  			truncStateType = stateloader.TruncatedStateLegacyReplicated
   986  		}
   987  
   988  		// Writing the initial state is subtle since this also seeds the Raft
   989  		// group. It becomes more subtle due to proposer-evaluated Raft.
   990  		//
   991  		// We are writing to the right hand side's Raft group state in this
   992  		// batch so we need to synchronize with anything else that could be
   993  		// touching that replica's Raft state. Specifically, we want to prohibit
   994  		// an uninitialized Replica from receiving a message for the right hand
   995  		// side range and performing raft processing. This is achieved by
   996  		// serializing execution of uninitialized Replicas in Store.processRaft
   997  		// and ensuring that no uninitialized Replica is being processed while
   998  		// an initialized one (like the one currently being split) is being
   999  		// processed.
  1000  		//
  1001  		// Since the right hand side of the split's Raft group may already
  1002  		// exist, we must be prepared to absorb an existing HardState. The Raft
  1003  		// group may already exist because other nodes could already have
  1004  		// processed the split and started talking to our node, prompting the
  1005  		// creation of a Raft group that can vote and bump its term, but not
  1006  		// much else: it can't receive snapshots because those intersect the
  1007  		// pre-split range; it can't apply log commands because it needs a
  1008  		// snapshot first.
  1009  		//
  1010  		// However, we can't absorb the right-hand side's HardState here because
  1011  		// we only *evaluate* the proposal here, but by the time it is
  1012  		// *applied*, the HardState could have changed. We do this downstream of
  1013  		// Raft, in splitPostApply, where we write the last index and the
  1014  		// HardState via a call to synthesizeRaftState. Here, we only call
  1015  		// writeInitialReplicaState which essentially writes a ReplicaState
  1016  		// only.
  1017  
  1018  		*h.AbsPostSplitRight(), err = stateloader.WriteInitialReplicaState(
  1019  			ctx, batch, *h.AbsPostSplitRight(), split.RightDesc, rightLease,
  1020  			*gcThreshold, truncStateType,
  1021  		)
  1022  		if err != nil {
  1023  			return enginepb.MVCCStats{}, result.Result{}, errors.Wrap(err, "unable to write initial Replica state")
  1024  		}
  1025  	}
  1026  
  1027  	var pd result.Result
  1028  	pd.Replicated.Split = &kvserverpb.Split{
  1029  		SplitTrigger: *split,
  1030  		// NB: the RHSDelta is identical to the stats for the newly created right
  1031  		// hand side range (i.e. it goes from zero to its stats).
  1032  		RHSDelta: *h.AbsPostSplitRight(),
  1033  	}
  1034  
  1035  	deltaPostSplitLeft := h.DeltaPostSplitLeft()
  1036  	if !rec.ClusterSettings().Version.IsActive(ctx, clusterversion.VersionContainsEstimatesCounter) {
  1037  		deltaPostSplitLeft.ContainsEstimates = 0
  1038  	}
  1039  	return deltaPostSplitLeft, pd, nil
  1040  }
  1041  
  1042  // mergeTrigger is called on a successful commit of an AdminMerge transaction.
  1043  // It calculates stats for the LHS by merging in RHS stats, and copies over the
  1044  // abort span entries from the RHS.
  1045  func mergeTrigger(
  1046  	ctx context.Context,
  1047  	rec EvalContext,
  1048  	batch storage.Batch,
  1049  	ms *enginepb.MVCCStats,
  1050  	merge *roachpb.MergeTrigger,
  1051  	ts hlc.Timestamp,
  1052  ) (result.Result, error) {
  1053  	desc := rec.Desc()
  1054  	if !bytes.Equal(desc.StartKey, merge.LeftDesc.StartKey) {
  1055  		return result.Result{}, errors.Errorf("LHS range start keys do not match: %s != %s",
  1056  			desc.StartKey, merge.LeftDesc.StartKey)
  1057  	}
  1058  	if !desc.EndKey.Less(merge.LeftDesc.EndKey) {
  1059  		return result.Result{}, errors.Errorf("original LHS end key is not less than the post merge end key: %s >= %s",
  1060  			desc.EndKey, merge.LeftDesc.EndKey)
  1061  	}
  1062  
  1063  	if err := abortspan.New(merge.RightDesc.RangeID).CopyTo(
  1064  		ctx, batch, batch, ms, ts, merge.LeftDesc.RangeID,
  1065  	); err != nil {
  1066  		return result.Result{}, err
  1067  	}
  1068  
  1069  	// The stats for the merged range are the sum of the LHS and RHS stats, less
  1070  	// the RHS's replicated range ID stats. The only replicated range ID keys we
  1071  	// copy from the RHS are the keys in the abort span, and we've already
  1072  	// accounted for those stats above.
  1073  	ms.Add(merge.RightMVCCStats)
  1074  	{
  1075  		ridPrefix := keys.MakeRangeIDReplicatedPrefix(merge.RightDesc.RangeID)
  1076  		iter := batch.NewIterator(storage.IterOptions{UpperBound: ridPrefix.PrefixEnd()})
  1077  		defer iter.Close()
  1078  		sysMS, err := iter.ComputeStats(ridPrefix, ridPrefix.PrefixEnd(), 0 /* nowNanos */)
  1079  		if err != nil {
  1080  			return result.Result{}, err
  1081  		}
  1082  		ms.Subtract(sysMS)
  1083  	}
  1084  
  1085  	var pd result.Result
  1086  	pd.Replicated.Merge = &kvserverpb.Merge{
  1087  		MergeTrigger: *merge,
  1088  	}
  1089  	return pd, nil
  1090  }
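
         // Illustrative sketch (not part of the original file): the stats arithmetic
         // performed by the merge trigger above, expressed over plain MVCCStats
         // values. The helper and its arguments are hypothetical.
         func exampleMergedStats(lhs, rhsAll, rhsRangeIDReplicated enginepb.MVCCStats) enginepb.MVCCStats {
         	merged := lhs
         	// Fold in everything the RHS contributed...
         	merged.Add(rhsAll)
         	// ...except its replicated range-ID-local keys, which are not carried
         	// over by the merge (the abort span copy is accounted for separately
         	// when it is written).
         	merged.Subtract(rhsRangeIDReplicated)
         	return merged
         }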
  1091  
  1092  func changeReplicasTrigger(
  1093  	_ context.Context, rec EvalContext, _ storage.Batch, change *roachpb.ChangeReplicasTrigger,
  1094  ) result.Result {
  1095  	var pd result.Result
  1096  	// After a successful replica addition or removal check to see if the
  1097  	// range needs to be split. Splitting usually takes precedence over
  1098  	// replication via configuration of the split and replicate queues, but
  1099  	// if the split occurs concurrently with the replicas change the split
  1100  	// can fail and won't retry until the next scanner cycle. Re-queuing
  1101  	// the replica here removes that latency.
  1102  	pd.Local.MaybeAddToSplitQueue = true
  1103  
  1104  	// Gossip the first range whenever the range descriptor changes. We also
  1105  	// gossip the first range whenever the lease holder changes, but that might
  1106  	// not have occurred if a replica was being added or the non-lease-holder
  1107  	// replica was being removed. Note that we attempt the gossiping even from
  1108  	// the removed replica in case it was the lease-holder and it is still
  1109  	// holding the lease.
  1110  	pd.Local.GossipFirstRange = rec.IsFirstRange()
  1111  
  1112  	var desc roachpb.RangeDescriptor
  1113  	if change.Desc != nil {
  1114  		// Trigger proposed by a 19.2+ node (and we're a 19.2+ node as well).
  1115  		desc = *change.Desc
  1116  	} else {
  1117  		// Trigger proposed by a 19.1 node. Reconstruct descriptor from deprecated
  1118  		// fields.
  1119  		desc = *rec.Desc()
  1120  		desc.SetReplicas(roachpb.MakeReplicaDescriptors(change.DeprecatedUpdatedReplicas))
  1121  		desc.NextReplicaID = change.DeprecatedNextReplicaID
  1122  	}
  1123  
  1124  	pd.Replicated.State = &kvserverpb.ReplicaState{
  1125  		Desc: &desc,
  1126  	}
  1127  	pd.Replicated.ChangeReplicas = &kvserverpb.ChangeReplicas{
  1128  		ChangeReplicasTrigger: *change,
  1129  	}
  1130  
  1131  	return pd
  1132  }
  1133  
  1134  // txnAutoGC controls whether Transaction entries are automatically gc'ed upon
  1135  // EndTxn if they only have local locks (which can be resolved synchronously
  1136  // with EndTxn). Certain tests become simpler with this being turned off.
  1137  var txnAutoGC = true
  1138  
  1139  // TestingSetTxnAutoGC is used in tests to temporarily enable/disable
  1140  // txnAutoGC.
  1141  func TestingSetTxnAutoGC(to bool) func() {
  1142  	prev := txnAutoGC
  1143  	txnAutoGC = to
  1144  	return func() { txnAutoGC = prev }
  1145  }
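
         // Illustrative sketch (not part of the original file): typical test usage
         // of TestingSetTxnAutoGC, restoring the previous value when the test ends
         // so that finalized transaction records remain visible for inspection.
         //
         //	func TestSomethingWithTxnRecords(t *testing.T) {
         //		defer TestingSetTxnAutoGC(false)()
         //		// ... assert on the finalized transaction record here ...
         //	}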