github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_application_state_machine.go

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"context"
    15  	"fmt"
    16  	"time"
    17  
    18  	"github.com/cockroachdb/cockroach/pkg/clusterversion"
    19  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/apply"
    20  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
    21  	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
    22  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    23  	"github.com/cockroachdb/cockroach/pkg/storage"
    24  	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
    25  	"github.com/cockroachdb/cockroach/pkg/util/hlc"
    26  	"github.com/cockroachdb/cockroach/pkg/util/log"
    27  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    28  	"github.com/cockroachdb/errors"
    29  	"github.com/kr/pretty"
    30  	"go.etcd.io/etcd/raft"
    31  	"go.etcd.io/etcd/raft/raftpb"
    32  )
    33  
    34  // replica_application_*.go files provide concrete implementations of
    35  // the interfaces defined in the storage/apply package:
    36  //
    37  // replica_application_state_machine.go  ->  apply.StateMachine
    38  // replica_application_decoder.go        ->  apply.Decoder
    39  // replica_application_cmd.go            ->  apply.Command         (and variants)
    40  // replica_application_cmd_buf.go        ->  apply.CommandIterator (and variants)
    41  // replica_application_cmd_buf.go        ->  apply.CommandList     (and variants)
    42  //
    43  // These allow Replica to interface with the storage/apply package.
    44  
    45  // applyCommittedEntriesStats tracks stats about what happened during the
    46  // application of a set of raft entries.
    47  //
    48  // TODO(ajwerner): add metrics to go with these stats.
    49  type applyCommittedEntriesStats struct {
    50  	batchesProcessed     int
    51  	entriesProcessed     int
    52  	stateAssertions      int
    53  	numEmptyEntries      int
    54  	numConfChangeEntries int
    55  }
    56  
    57  // nonDeterministicFailure is an error type that indicates that a state machine
    58  // transition failed due to an unexpected error. Failure to perform a state
    59  // transition is a form of non-determinism, so it can't be permitted for any
    60  // reason during the application phase of state machine replication. The only
    61  // acceptable recourse is to signal that the replica has become corrupted.
    62  //
    63  // All errors returned by replicaDecoder and replicaStateMachine will be instances
    64  // of this type.
    65  type nonDeterministicFailure struct {
    66  	wrapped  error
    67  	safeExpl string
    68  }
    69  
    70  // The provided format string should be safe for reporting.
    71  func makeNonDeterministicFailure(format string, args ...interface{}) error {
    72  	err := errors.Newf(format, args...)
    73  	return &nonDeterministicFailure{
    74  		wrapped:  err,
    75  		safeExpl: err.Error(),
    76  	}
    77  }
    78  
    79  // The provided format string should be safe for reporting.
    80  func wrapWithNonDeterministicFailure(err error, format string, args ...interface{}) error {
    81  	return &nonDeterministicFailure{
    82  		wrapped:  errors.Wrapf(err, format, args...),
    83  		safeExpl: fmt.Sprintf(format, args...),
    84  	}
    85  }
    86  
    87  // Error implements the error interface.
    88  func (e *nonDeterministicFailure) Error() string {
    89  	return fmt.Sprintf("non-deterministic failure: %s", e.wrapped.Error())
    90  }
    91  
    92  // Cause implements the github.com/pkg/errors.causer interface.
    93  func (e *nonDeterministicFailure) Cause() error { return e.wrapped }
    94  
    95  // Unwrap implements the github.com/golang/xerrors.Wrapper interface, which is
    96  // planned to be moved to the stdlib in go 1.13.
    97  func (e *nonDeterministicFailure) Unwrap() error { return e.wrapped }
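
// exampleNonDeterministicFailureCause is a minimal illustrative sketch (the
// function and its sentinel error are hypothetical, not part of the original
// file) showing that wrapping preserves the cause chain: because
// nonDeterministicFailure implements Cause and Unwrap, errors.Is can still
// match the underlying error after it has been wrapped.
func exampleNonDeterministicFailureCause() bool {
	cause := errors.New("engine write failed") // assumed sentinel, for illustration only
	err := wrapWithNonDeterministicFailure(cause, "unable to commit Raft entry batch")
	return errors.Is(err, cause) // true: Unwrap exposes the wrapped cause
}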
    98  
    99  // replicaStateMachine implements the apply.StateMachine interface.
   100  //
   101  // The structure coordinates state transitions within the Replica state machine
   102  // due to the application of replicated commands decoded from committed raft
   103  // entries. Commands are applied to the state machine in a multi-stage process
   104  // whereby individual commands are prepared for application relative to the
   105  // current view of ReplicaState and staged in a replicaAppBatch, the batch is
   106  // committed to the Replica's storage engine atomically, and finally the
   107  // side-effects of each command are applied to the Replica's in-memory state.
   108  type replicaStateMachine struct {
   109  	r *Replica
   110  	// batch is returned from NewBatch(false /* ephemeral */).
   111  	batch replicaAppBatch
   112  	// ephemeralBatch is returned from NewBatch(true /* ephemeral */).
   113  	ephemeralBatch ephemeralReplicaAppBatch
   114  	// stats are updated during command application and reset by moveStats.
   115  	stats applyCommittedEntriesStats
   116  }
   117  
   118  // getStateMachine returns the Replica's apply.StateMachine. The Replica's
   119  // raftMu is held for the entire lifetime of the replicaStateMachine.
   120  func (r *Replica) getStateMachine() *replicaStateMachine {
   121  	sm := &r.raftMu.stateMachine
   122  	sm.r = r
   123  	return sm
   124  }
   125  
   126  // shouldApplyCommand determines whether or not a command should be applied to
   127  // the replicated state machine after it has been committed to the Raft log. It
   128  // then sets the provided command's leaseIndex, proposalRetry, and forcedErr
   129  // fields and returns whether the command should be applied or rejected.
   130  func (r *Replica) shouldApplyCommand(
   131  	ctx context.Context, cmd *replicatedCmd, replicaState *kvserverpb.ReplicaState,
   132  ) bool {
   133  	cmd.leaseIndex, cmd.proposalRetry, cmd.forcedErr = checkForcedErr(
   134  		ctx, cmd.idKey, &cmd.raftCmd, cmd.IsLocal(), replicaState,
   135  	)
   136  	if filter := r.store.cfg.TestingKnobs.TestingApplyFilter; cmd.forcedErr == nil && filter != nil {
   137  		var newPropRetry int
   138  		newPropRetry, cmd.forcedErr = filter(kvserverbase.ApplyFilterArgs{
   139  			CmdID:                cmd.idKey,
   140  			ReplicatedEvalResult: *cmd.replicatedResult(),
   141  			StoreID:              r.store.StoreID(),
   142  			RangeID:              r.RangeID,
   143  		})
   144  		if cmd.proposalRetry == 0 {
   145  			cmd.proposalRetry = proposalReevaluationReason(newPropRetry)
   146  		}
   147  	}
   148  	return cmd.forcedErr == nil
   149  }
   150  
   151  // checkForcedErr determines whether or not a command should be applied to the
   152  // replicated state machine after it has been committed to the Raft log. This
   153  // decision is deterministic on all replicas, such that a command that is
   154  // rejected "beneath raft" on one replica will be rejected "beneath raft" on
   155  // all replicas.
   156  //
   157  // The decision about whether or not to apply a command is a combination of
   158  // three checks:
   159  //  1. verify that the command was proposed under the current lease. This is
   160  //     determined using the proposal's ProposerLeaseSequence.
   161  //  2. verify that the command hasn't been re-ordered with other commands that
   162  //     were proposed after it and which already applied. This is determined
   163  //     using the proposal's MaxLeaseIndex.
   164  //  3. verify that the command isn't in violation of the Range's current
   165  //     garbage collection threshold. This is determined using the proposal's
   166  //     Timestamp.
   167  //
   168  // TODO(nvanbenschoten): Unit test this function now that it is stateless.
   169  func checkForcedErr(
   170  	ctx context.Context,
   171  	idKey kvserverbase.CmdIDKey,
   172  	raftCmd *kvserverpb.RaftCommand,
   173  	isLocal bool,
   174  	replicaState *kvserverpb.ReplicaState,
   175  ) (uint64, proposalReevaluationReason, *roachpb.Error) {
   176  	leaseIndex := replicaState.LeaseAppliedIndex
   177  	isLeaseRequest := raftCmd.ReplicatedEvalResult.IsLeaseRequest
   178  	var requestedLease roachpb.Lease
   179  	if isLeaseRequest {
   180  		requestedLease = *raftCmd.ReplicatedEvalResult.State.Lease
   181  	}
   182  	if idKey == "" {
   183  		// This is an empty Raft command (which is sent by Raft after elections
   184  		// to trigger reproposals or during concurrent configuration changes).
   185  		// Nothing to do here except making sure that the corresponding batch
   186  		// (which is bogus) doesn't get executed (for it is empty and so
   187  		// properties like key range are undefined).
   188  		return leaseIndex, proposalNoReevaluation, roachpb.NewErrorf("no-op on empty Raft entry")
   189  	}
   190  
   191  	// Verify the lease matches the proposer's expectation. We rely on
   192  	// the proposer's determination of whether the existing lease is
   193  	// held, and can be used, or is expired, and can be replaced.
   194  	// Verify checks that the lease has not been modified since proposal
   195  	// due to Raft delays / reorderings.
   196  	// To understand why this lease verification is necessary, see comments on the
   197  	// proposer_lease field in the proto.
   198  	leaseMismatch := false
   199  	if raftCmd.DeprecatedProposerLease != nil {
   200  		// VersionLeaseSequence must not have been active when this was proposed.
   201  		//
   202  		// This does not prevent the lease race condition described below. The
   203  		// reason we don't fix this here as well is because fixing the race
   204  		// requires a new cluster version which implies that we'll already be
   205  		// using lease sequence numbers and will fall into the case below.
   206  		leaseMismatch = !raftCmd.DeprecatedProposerLease.Equivalent(*replicaState.Lease)
   207  	} else {
   208  		leaseMismatch = raftCmd.ProposerLeaseSequence != replicaState.Lease.Sequence
   209  		if !leaseMismatch && isLeaseRequest {
   210  			// Lease sequence numbers are a reflection of lease equivalency
   211  			// between subsequent leases. However, Lease.Equivalent is not fully
   212  			// symmetric, meaning that two leases may be Equivalent to a third
   213  			// lease but not Equivalent to each other. If these leases are
   214  			// proposed under that same third lease, neither will be able to
   215  			// detect whether the other has applied just by looking at the
   216  			// current lease sequence number because neither will increment
   217  			// the sequence number.
   218  			//
   219  			// This can lead to inversions in lease expiration timestamps if
   220  			// we're not careful. To avoid this, if a lease request's proposer
   221  			// lease sequence matches the current lease sequence and the current
   222  			// lease sequence also matches the requested lease sequence, we make
   223  			// sure the requested lease is Equivalent to the current lease.
   224  			if replicaState.Lease.Sequence == requestedLease.Sequence {
   225  				// It is only possible for this to fail when expiration-based
   226  				// lease extensions are proposed concurrently.
   227  				leaseMismatch = !replicaState.Lease.Equivalent(requestedLease)
   228  			}
   229  
   230  			// This is a check to see if the lease we proposed this lease request against is the same
   231  			// lease that we're trying to update. We need to check proposal timestamps because
   232  			// extensions don't increment sequence numbers. Without this check a lease could
   233  			// be extended and then another lease proposed against the original lease would
   234  			// be applied over the extension.
   235  			if raftCmd.ReplicatedEvalResult.PrevLeaseProposal != nil &&
   236  				(*raftCmd.ReplicatedEvalResult.PrevLeaseProposal != *replicaState.Lease.ProposedTS) {
   237  				leaseMismatch = true
   238  			}
   239  		}
   240  	}
   241  	if leaseMismatch {
   242  		log.VEventf(
   243  			ctx, 1,
   244  			"command with lease #%d incompatible to %v",
   245  			raftCmd.ProposerLeaseSequence, *replicaState.Lease,
   246  		)
   247  		if isLeaseRequest {
   248  			// For lease requests we return a special error that
   249  			// redirectOnOrAcquireLease() understands. Note that these
   250  			// requests don't go through the DistSender.
   251  			return leaseIndex, proposalNoReevaluation, roachpb.NewError(&roachpb.LeaseRejectedError{
   252  				Existing:  *replicaState.Lease,
   253  				Requested: requestedLease,
   254  				Message:   "proposed under invalid lease",
   255  			})
   256  		}
   257  		// We return a NotLeaseHolderError so that the DistSender retries.
   258  		// NB: we set proposerStoreID to 0 because we don't know who proposed the
   259  		// Raft command. This is ok, as this is only used for debug information.
   260  		nlhe := newNotLeaseHolderError(replicaState.Lease, 0 /* proposerStoreID */, replicaState.Desc)
   261  		nlhe.CustomMsg = fmt.Sprintf(
   262  			"stale proposal: command was proposed under lease #%d but is being applied "+
   263  				"under lease: %s", raftCmd.ProposerLeaseSequence, replicaState.Lease)
   264  		return leaseIndex, proposalNoReevaluation, roachpb.NewError(nlhe)
   265  	}
   266  
   267  	if isLeaseRequest {
   268  		// Lease commands are ignored by the counter (and their MaxLeaseIndex is ignored). This
   269  		// makes sense since lease commands are proposed by anyone, so we can't expect a coherent
   270  		// MaxLeaseIndex. Also, lease proposals are often replayed, so not making them update the
   271  		// counter makes sense from a testing perspective.
   272  		//
   273  		// However, leases get special vetting to make sure we don't give one to a replica that was
   274  		// since removed (see #15385 and a comment in redirectOnOrAcquireLease).
   275  		if _, ok := replicaState.Desc.GetReplicaDescriptor(requestedLease.Replica.StoreID); !ok {
   276  			return leaseIndex, proposalNoReevaluation, roachpb.NewError(&roachpb.LeaseRejectedError{
   277  				Existing:  *replicaState.Lease,
   278  				Requested: requestedLease,
   279  				Message:   "replica not part of range",
   280  			})
   281  		}
   282  	} else if replicaState.LeaseAppliedIndex < raftCmd.MaxLeaseIndex {
   283  		// The happy case: the command is applying at or ahead of the minimal
   284  		// permissible index. It's ok if it skips a few slots (as can happen
   285  		// during rearrangement); this command will apply, but later ones which
   286  		// were proposed at lower indexes may not. Overall though, this is more
   287  		// stable and simpler than requiring commands to apply at their exact
   288  		// lease index: Handling the case in which MaxLeaseIndex > oldIndex+1
   289  		// is otherwise tricky since we can't tell the client to try again
   290  		// (reproposals could exist and may apply at the right index, leading
   291  		// to a replay), and assigning the required index would be tedious
   292  		// seeing that it would have to rewind sometimes.
   293  		leaseIndex = raftCmd.MaxLeaseIndex
   294  	} else {
   295  		// The command is trying to apply at a past log position. That's
   296  		// unfortunate and hopefully rare; the client on the proposer will try
   297  		// again. Note that in this situation, the leaseIndex does not advance.
   298  		retry := proposalNoReevaluation
   299  		if isLocal {
   300  			log.VEventf(
   301  				ctx, 1,
   302  				"retry proposal %x: applied at lease index %d, required < %d",
   303  				idKey, leaseIndex, raftCmd.MaxLeaseIndex,
   304  			)
   305  			retry = proposalIllegalLeaseIndex
   306  		}
   307  		return leaseIndex, retry, roachpb.NewErrorf(
   308  			"command observed at lease index %d, but required < %d", leaseIndex, raftCmd.MaxLeaseIndex,
   309  		)
   310  	}
   311  
   312  	// Verify that the batch timestamp is after the GC threshold. This is
   313  	// necessary because not all commands declare read access on the GC
   314  	// threshold key, even though they implicitly depend on it. This means
   315  	// that access to this state will not be serialized by latching,
   316  	// so we must perform this check upstream and downstream of raft.
   317  	// See #14833.
   318  	ts := raftCmd.ReplicatedEvalResult.Timestamp
   319  	if ts.LessEq(*replicaState.GCThreshold) {
   320  		return leaseIndex, proposalNoReevaluation, roachpb.NewError(&roachpb.BatchTimestampBeforeGCError{
   321  			Timestamp: ts,
   322  			Threshold: *replicaState.GCThreshold,
   323  		})
   324  	}
   325  	return leaseIndex, proposalNoReevaluation, nil
   326  }
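
// exampleCheckForcedErrGCThreshold is a minimal illustrative sketch in the
// spirit of the unit test requested in the TODO above; the function and all
// field values are hypothetical and chosen only to exercise one rejection
// path. With a matching lease sequence and a MaxLeaseIndex ahead of the
// applied lease index, the command still receives a forced error because its
// evaluation timestamp is at or below the range's GC threshold.
func exampleCheckForcedErrGCThreshold(ctx context.Context) bool {
	replicaState := &kvserverpb.ReplicaState{
		Lease:             &roachpb.Lease{Sequence: 5},
		LeaseAppliedIndex: 10,
		GCThreshold:       &hlc.Timestamp{WallTime: 100},
	}
	raftCmd := &kvserverpb.RaftCommand{
		ProposerLeaseSequence: 5,  // matches the current lease, so no lease mismatch
		MaxLeaseIndex:         11, // ahead of LeaseAppliedIndex, so no reordering error
		ReplicatedEvalResult: kvserverpb.ReplicatedEvalResult{
			Timestamp: hlc.Timestamp{WallTime: 99}, // at or below the GC threshold
		},
	}
	_, _, forcedErr := checkForcedErr(ctx, "example-cmd", raftCmd, false /* isLocal */, replicaState)
	return forcedErr != nil // expected: true (BatchTimestampBeforeGCError)
}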
   327  
   328  // NewBatch implements the apply.StateMachine interface.
   329  func (sm *replicaStateMachine) NewBatch(ephemeral bool) apply.Batch {
   330  	r := sm.r
   331  	if ephemeral {
   332  		mb := &sm.ephemeralBatch
   333  		mb.r = r
   334  		r.mu.RLock()
   335  		mb.state = r.mu.state
   336  		r.mu.RUnlock()
   337  		return mb
   338  	}
   339  	b := &sm.batch
   340  	b.r = r
   341  	b.sm = sm
   342  	b.batch = r.store.engine.NewBatch()
   343  	r.mu.RLock()
   344  	b.state = r.mu.state
   345  	b.state.Stats = &b.stats
   346  	*b.state.Stats = *r.mu.state.Stats
   347  	r.mu.RUnlock()
   348  	b.start = timeutil.Now()
   349  	return b
   350  }
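
// exampleEphemeralBatchUsage is a minimal illustrative sketch (hypothetical,
// not part of the original file) of the apply.Batch lifecycle for an ephemeral
// batch: a command is staged only to learn whether it would be rejected, and
// the batch is closed without ever being applied to the state machine.
func exampleEphemeralBatchUsage(sm *replicaStateMachine, cmd *replicatedCmd) (rejected bool, _ error) {
	b := sm.NewBatch(true /* ephemeral */)
	defer b.Close()
	checked, err := b.Stage(cmd)
	if err != nil {
		return false, err
	}
	// Rejected reports whether the command was assigned a forced error during staging.
	return checked.Rejected(), nil
}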
   351  
   352  // replicaAppBatch implements the apply.Batch interface.
   353  //
   354  // The structure accumulates state due to the application of raft commands.
   355  // Committed raft commands are applied to the state machine in a multi-stage
   356  // process whereby individual commands are prepared for application relative
   357  // to the current view of ReplicaState and staged in the batch. The batch is
   358  // committed to the state machine's storage engine atomically.
   359  type replicaAppBatch struct {
   360  	r  *Replica
   361  	sm *replicaStateMachine
   362  
   363  	// batch accumulates writes implied by the raft entries in this batch.
   364  	batch storage.Batch
   365  	// state is this batch's view of the replica's state. It is copied from
   366  	// under the Replica.mu when the batch is initialized and is updated in
   367  	// stageTrivialReplicatedEvalResult.
   368  	state kvserverpb.ReplicaState
   369  	// stats is stored on the application batch to avoid an allocation in
   370  	// tracking the batch's view of replicaState. All pointer fields in
   371  	// replicaState other than Stats are overwritten completely rather than
   372  	// updated in-place.
   373  	stats enginepb.MVCCStats
   374  	// maxTS is the maximum timestamp that any command that was staged in this
   375  	// batch was evaluated at.
   376  	maxTS hlc.Timestamp
   377  	// migrateToAppliedStateKey tracks whether any command in the batch
   378  	// triggered a migration to the replica applied state key. If so, this
   379  	// migration will be performed when the application batch is committed.
   380  	migrateToAppliedStateKey bool
   381  	// changeRemovesReplica tracks whether the command in the batch (there must
   382  	// be only one) removes this replica from the range.
   383  	changeRemovesReplica bool
   384  
   385  	// Statistics.
   386  	entries      int
   387  	emptyEntries int
   388  	mutations    int
   389  	start        time.Time
   390  }
   391  
   392  // Stage implements the apply.Batch interface. The method handles the first
   393  // phase of applying a command to the replica state machine.
   394  //
   395  // The first thing the method does is determine whether the command should be
   396  // applied at all or whether it should be rejected and replaced with an empty
   397  // entry. The determination is based on the following rules: the command's
   398  // MaxLeaseIndex must move the state machine's LeaseAppliedIndex forward, the
   399  // proposer's lease (or rather its sequence number) must match that of the state
   400  // machine, and lastly the GCThreshold must be below the timestamp that the
   401  // command evaluated at. If any of the checks fail, the proposal's content is
   402  // wiped and we apply an empty log entry instead. If a rejected command was
   403  // proposed locally, the error will eventually be communicated to the waiting
   404  // proposer. The two typical cases in which errors occur are lease mismatch (in
   405  // which case the caller tries to send the command to the actual leaseholder)
   406  // and violation of the LeaseAppliedIndex (in which case the proposal is retried
   407  // if it was proposed locally).
   408  //
   409  // Assuming all checks were passed, the command's write batch is applied to the
   410  // application batch. Its trivial ReplicatedState updates are then staged in
   411  // the batch. This allows the batch to make an accurate determination about
   412  // whether to accept or reject the next command that is staged without needing
   413  // to actually update the replica state machine in between.
   414  func (b *replicaAppBatch) Stage(cmdI apply.Command) (apply.CheckedCommand, error) {
   415  	cmd := cmdI.(*replicatedCmd)
   416  	ctx := cmd.ctx
   417  	if cmd.ent.Index == 0 {
   418  		return nil, makeNonDeterministicFailure("processRaftCommand requires a non-zero index")
   419  	}
   420  	if idx, applied := cmd.ent.Index, b.state.RaftAppliedIndex; idx != applied+1 {
   421  		// If we have an out of order index, there's corruption. No sense in
   422  		// trying to update anything or running the command. Simply return.
   423  		return nil, makeNonDeterministicFailure("applied index jumped from %d to %d", applied, idx)
   424  	}
   425  	if log.V(4) {
   426  		log.Infof(ctx, "processing command %x: maxLeaseIndex=%d", cmd.idKey, cmd.raftCmd.MaxLeaseIndex)
   427  	}
   428  
   429  	// Determine whether the command should be applied to the replicated state
   430  	// machine or whether it should be rejected (and replaced by an empty command).
   431  	// This check is deterministic on all replicas, so if one replica decides to
   432  	// reject a command, all will.
   433  	if !b.r.shouldApplyCommand(ctx, cmd, &b.state) {
   434  		log.VEventf(ctx, 1, "applying command with forced error: %s", cmd.forcedErr)
   435  
   436  		// Apply an empty command.
   437  		cmd.raftCmd.ReplicatedEvalResult = kvserverpb.ReplicatedEvalResult{}
   438  		cmd.raftCmd.WriteBatch = nil
   439  		cmd.raftCmd.LogicalOpLog = nil
   440  	} else {
   441  		log.Event(ctx, "applying command")
   442  	}
   443  
   444  	// Acquire the split or merge lock, if necessary. If a split or merge
   445  	// command was rejected with a below-Raft forced error then its replicated
   446  	// result was just cleared and this will be a no-op.
   447  	if splitMergeUnlock, err := b.r.maybeAcquireSplitMergeLock(ctx, cmd.raftCmd); err != nil {
   448  		// Wrap the lock-acquisition error so it surfaces as a non-deterministic failure.
   449  		if cmd.raftCmd.ReplicatedEvalResult.Split != nil {
   450  			err = wrapWithNonDeterministicFailure(err, "unable to acquire split lock")
   451  		} else {
   452  			err = wrapWithNonDeterministicFailure(err, "unable to acquire merge lock")
   453  		}
   454  		return nil, err
   455  	} else if splitMergeUnlock != nil {
   456  		// Set the splitMergeUnlock on the replicaAppBatch to be called
   457  		// after the batch has been applied (see replicaAppBatch.commit).
   458  		cmd.splitMergeUnlock = splitMergeUnlock
   459  	}
   460  
   461  	// Update the batch's max timestamp.
   462  	b.maxTS.Forward(cmd.replicatedResult().Timestamp)
   463  
   464  	// Normalize the command, accounting for past migrations.
   465  	b.migrateReplicatedResult(ctx, cmd)
   466  
   467  	// Run any triggers that should occur before the batch is applied
   468  	// and before the write batch is staged in the batch.
   469  	if err := b.runPreApplyTriggersBeforeStagingWriteBatch(ctx, cmd); err != nil {
   470  		return nil, err
   471  	}
   472  
   473  	// Stage the command's write batch in the application batch.
   474  	if err := b.stageWriteBatch(ctx, cmd); err != nil {
   475  		return nil, err
   476  	}
   477  
   478  	// Run any triggers that should occur before the batch is applied
   479  	// but after the write batch is staged in the batch.
   480  	if err := b.runPreApplyTriggersAfterStagingWriteBatch(ctx, cmd); err != nil {
   481  		return nil, err
   482  	}
   483  
   484  	// Stage the command's trivial ReplicatedState updates in the batch. Any
   485  	// non-trivial commands will be in their own batch, so delaying their
   486  	// non-trivial ReplicatedState updates until later (without ever staging
   487  	// them in the batch) is sufficient.
   488  	b.stageTrivialReplicatedEvalResult(ctx, cmd)
   489  	b.entries++
   490  	if len(cmd.ent.Data) == 0 {
   491  		b.emptyEntries++
   492  	}
   493  
   494  	// The command was checked by shouldApplyCommand, so it can be returned
   495  	// as an apply.CheckedCommand.
   496  	return cmd, nil
   497  }
   498  
   499  // migrateReplicatedResult performs any migrations necessary on the command to
   500  // normalize it before applying it to the batch. This may modify the command.
   501  func (b *replicaAppBatch) migrateReplicatedResult(ctx context.Context, cmd *replicatedCmd) {
   502  	// If the command was using the deprecated version of the MVCCStats proto,
   503  	// migrate it to the new version and clear out the field.
   504  	res := cmd.replicatedResult()
   505  	if deprecatedDelta := res.DeprecatedDelta; deprecatedDelta != nil {
   506  		if res.Delta != (enginepb.MVCCStatsDelta{}) {
   507  			log.Fatalf(ctx, "stats delta not empty but deprecated delta provided: %+v", cmd)
   508  		}
   509  		res.Delta = deprecatedDelta.ToStatsDelta()
   510  		res.DeprecatedDelta = nil
   511  	}
   512  }
   513  
   514  // stageWriteBatch applies the command's write batch to the application batch's
   515  // RocksDB batch. This batch is committed to RocksDB in replicaAppBatch.commit.
   516  func (b *replicaAppBatch) stageWriteBatch(ctx context.Context, cmd *replicatedCmd) error {
   517  	wb := cmd.raftCmd.WriteBatch
   518  	if wb == nil {
   519  		return nil
   520  	}
   521  	if mutations, err := storage.RocksDBBatchCount(wb.Data); err != nil {
   522  		log.Errorf(ctx, "unable to read header of committed WriteBatch: %+v", err)
   523  	} else {
   524  		b.mutations += mutations
   525  	}
   526  	if err := b.batch.ApplyBatchRepr(wb.Data, false); err != nil {
   527  		return wrapWithNonDeterministicFailure(err, "unable to apply WriteBatch")
   528  	}
   529  	return nil
   530  }
   531  
   532  // changeRemovesStore returns true if any of the removals in this change have storeID.
   533  func changeRemovesStore(
   534  	desc *roachpb.RangeDescriptor, change *kvserverpb.ChangeReplicas, storeID roachpb.StoreID,
   535  ) (removesStore bool) {
   536  	curReplica, existsInDesc := desc.GetReplicaDescriptor(storeID)
   537  	// NB: if we're catching up from a preemptive snapshot then we won't
   538  	// exist in the current descriptor and we can't be removed.
   539  	if !existsInDesc {
   540  		return false
   541  	}
   542  
   543  	// NB: We don't use change.Removed() because it will include replicas being
   544  	// transitioned to VOTER_OUTGOING.
   545  
   546  	// In 19.1 and before we used DeprecatedUpdatedReplicas instead of providing
   547  	// a new range descriptor. Check first whether this is a 19.1-or-earlier command,
   548  	// which uses DeprecatedChangeType and DeprecatedReplica.
   549  	if change.Desc == nil {
   550  		return change.DeprecatedChangeType == roachpb.REMOVE_REPLICA && change.DeprecatedReplica.ReplicaID == curReplica.ReplicaID
   551  	}
   552  	// In 19.2 and beyond we supply the new range descriptor in the change.
   553  	// We know we're removed if we do not appear in the new descriptor.
   554  	_, existsInChange := change.Desc.GetReplicaDescriptor(storeID)
   555  	return !existsInChange
   556  }
   557  
   558  // runPreApplyTriggersBeforeStagingWriteBatch runs any triggers that must fire
   559  // before a command is applied to the state machine and before the command's
   560  // write batch is staged in the replicaAppBatch. It may modify the command.
   561  func (b *replicaAppBatch) runPreApplyTriggersBeforeStagingWriteBatch(
   562  	ctx context.Context, cmd *replicatedCmd,
   563  ) error {
   564  	if ops := cmd.raftCmd.LogicalOpLog; ops != nil {
   565  		b.r.populatePrevValsInLogicalOpLogRaftMuLocked(ctx, ops, b.batch)
   566  	}
   567  	return nil
   568  }
   569  
   570  // runPreApplyTriggersAfterStagingWriteBatch runs any triggers that must fire
   571  // before a command is applied to the state machine but after the command's
   572  // write batch has been staged in the replicaAppBatch. It may modify the command.
   573  func (b *replicaAppBatch) runPreApplyTriggersAfterStagingWriteBatch(
   574  	ctx context.Context, cmd *replicatedCmd,
   575  ) error {
   576  	res := cmd.replicatedResult()
   577  
   578  	// AddSSTable ingestions run before the actual batch gets written to the
   579  	// storage engine. This makes sure that when the Raft command is applied,
   580  	// the ingestion has definitely succeeded. Note that we have taken
   581  	// precautions during command evaluation to avoid having mutations in the
   582  	// WriteBatch that affect the SSTable. Not doing so could result in order
   583  	// reversal (and missing values) here.
   584  	//
   585  	// NB: any command which has an AddSSTable is non-trivial and will be
   586  	// applied in its own batch so it's not possible that any other commands
   587  	// which precede this command can shadow writes from this SSTable.
   588  	if res.AddSSTable != nil {
   589  		copied := addSSTablePreApply(
   590  			ctx,
   591  			b.r.store.cfg.Settings,
   592  			b.r.store.engine,
   593  			b.r.raftMu.sideloaded,
   594  			cmd.ent.Term,
   595  			cmd.ent.Index,
   596  			*res.AddSSTable,
   597  			b.r.store.limiters.BulkIOWriteRate,
   598  		)
   599  		b.r.store.metrics.AddSSTableApplications.Inc(1)
   600  		if copied {
   601  			b.r.store.metrics.AddSSTableApplicationCopies.Inc(1)
   602  		}
   603  		if added := res.Delta.KeyCount; added > 0 {
   604  			b.r.writeStats.recordCount(float64(added), 0)
   605  		}
   606  		res.AddSSTable = nil
   607  	}
   608  
   609  	if res.Split != nil {
   610  		// Splits require a new HardState to be written to the new RHS
   611  		// range (and this needs to be atomic with the main batch). This
   612  		// cannot be constructed at evaluation time because it differs
   613  		// on each replica (votes may have already been cast on the
   614  		// uninitialized replica). Write this new hardstate to the batch too.
   615  		// See https://github.com/cockroachdb/cockroach/issues/20629.
   616  		//
   617  		// Alternatively if we discover that the RHS has already been removed
   618  		// from this store, clean up its data.
   619  		splitPreApply(ctx, b.batch, res.Split.SplitTrigger, b.r)
   620  
   621  		// The rangefeed processor will no longer be provided logical ops for
   622  		// its entire range, so it needs to be shut down and all registrations
   623  		// need to retry.
   624  		// TODO(nvanbenschoten): It should be possible to only reject registrations
   625  		// that overlap with the new range of the split and keep registrations that
   626  		// are only interested in keys that are still on the original range running.
   627  		b.r.disconnectRangefeedWithReason(
   628  			roachpb.RangeFeedRetryError_REASON_RANGE_SPLIT,
   629  		)
   630  	}
   631  
   632  	if merge := res.Merge; merge != nil {
   633  		// Merges require the subsumed range to be atomically deleted when the
   634  		// merge transaction commits.
   635  
   636  		// If our range currently has a non-zero replica ID then we know we're
   637  		// safe to commit this merge because of the invariants provided to us
   638  		// by the merge protocol. Namely, if this command committed then all of
   639  		// the replicas in the range descriptor are collocated when it commits.
   640  		// If we do not have a non-zero replica ID then the logic in Stage should
   641  		// detect that and destroy our preemptive snapshot, so we shouldn't ever
   642  		// get here.
   643  		rhsRepl, err := b.r.store.GetReplica(merge.RightDesc.RangeID)
   644  		if err != nil {
   645  			return wrapWithNonDeterministicFailure(err, "unable to get replica for merge")
   646  		}
   647  		// We should already have acquired the raftMu for the rhsRepl and now hold
   648  		// its unlock method in cmd.splitMergeUnlock.
   649  		rhsRepl.raftMu.AssertHeld()
   650  
   651  		// Use math.MaxInt32 (mergedTombstoneReplicaID) as the nextReplicaID as an
   652  		// extra safeguard against creating new replicas of the RHS. This isn't
   653  		// required for correctness, since the merge protocol should guarantee that
   654  		// no new replicas of the RHS can ever be created, but it doesn't hurt to
   655  		// be careful.
   656  		const clearRangeIDLocalOnly = true
   657  		const mustClearRange = false
   658  		if err := rhsRepl.preDestroyRaftMuLocked(
   659  			ctx, b.batch, b.batch, mergedTombstoneReplicaID, clearRangeIDLocalOnly, mustClearRange,
   660  		); err != nil {
   661  			return wrapWithNonDeterministicFailure(err, "unable to destroy replica before merge")
   662  		}
   663  
   664  		// Shut down rangefeed processors on either side of the merge.
   665  		//
   666  		// NB: It is critical to shut down a rangefeed processor on the surviving
   667  		// replica, primarily to deal with the possibility that there are logical ops
   668  		// for the RHS to resolve intents written by the merge transaction. In
   669  		// practice, the only such intents that exist are on the RangeEventTable,
   670  		// but it's good to be consistent here and allow the merge transaction to
   671  		// write to the RHS of a merge. See batcheval.resolveLocalLocks for details
   672  		// on why we resolve RHS intents when committing a merge transaction.
   673  		//
   674  		// TODO(nvanbenschoten): Alternatively we could just adjust the bounds of
   675  		// b.r.Processor to include the rhsRepl span.
   676  		//
   677  		// NB: removeInitializedReplicaRaftMuLocked also disconnects any initialized
   678  		// rangefeeds with REASON_REPLICA_REMOVED. That's ok because we will have
   679  		// already disconnected the rangefeed here.
   680  		b.r.disconnectRangefeedWithReason(
   681  			roachpb.RangeFeedRetryError_REASON_RANGE_MERGED,
   682  		)
   683  		rhsRepl.disconnectRangefeedWithReason(
   684  			roachpb.RangeFeedRetryError_REASON_RANGE_MERGED,
   685  		)
   686  	}
   687  
   688  	if res.State != nil && res.State.TruncatedState != nil {
   689  		if apply, err := handleTruncatedStateBelowRaft(
   690  			ctx, b.state.TruncatedState, res.State.TruncatedState, b.r.raftMu.stateLoader, b.batch,
   691  		); err != nil {
   692  			return wrapWithNonDeterministicFailure(err, "unable to handle truncated state")
   693  		} else if !apply {
   694  			// The truncated state was discarded, so make sure we don't apply
   695  			// it to our in-memory state.
   696  			res.State.TruncatedState = nil
   697  			res.RaftLogDelta = 0
   698  			// TODO(ajwerner): consider moving this code.
   699  			// We received a truncation that doesn't apply to us, so we know that
   700  			// there's a leaseholder out there with a log that has earlier entries
   701  			// than ours. That leader also guided our log size computations by
   702  			// giving us RaftLogDeltas for past truncations, and this was likely
   703  			// off. Mark our Raft log size is not trustworthy so that, assuming
   704  			// we step up as leader at some point in the future, we recompute
   705  			// our numbers.
   706  			b.r.mu.Lock()
   707  			b.r.mu.raftLogSizeTrusted = false
   708  			b.r.mu.Unlock()
   709  		}
   710  	}
   711  
   712  	// Detect if this command will remove us from the range.
   713  	// If so we stage the removal of all of our range data into this batch.
   714  	// We'll complete the removal when it commits. Later logic detects the
   715  	// removal by inspecting the destroy status.
   716  	//
   717  	// NB: This is the last step in the preApply which durably writes to the
   718  	// replica state so that if it removes the replica it removes everything.
   719  	if change := res.ChangeReplicas; change != nil &&
   720  		changeRemovesStore(b.state.Desc, change, b.r.store.StoreID()) &&
   721  		// Don't remove the data if the testing knobs ask us not to.
   722  		!b.r.store.TestingKnobs().DisableEagerReplicaRemoval {
   723  
   724  		// We mark the replica as destroyed so that new commands are not
   725  		// accepted. This destroy status will be detected after the batch commits
   726  		// by Replica.handleChangeReplicasTrigger() to finish the removal.
   727  		//
   728  		// NB: we must be holding the raftMu here because we're in the
   729  		// midst of application.
   730  		b.r.mu.Lock()
   731  		b.r.mu.destroyStatus.Set(
   732  			roachpb.NewRangeNotFoundError(b.r.RangeID, b.r.store.StoreID()),
   733  			destroyReasonRemoved)
   734  		b.r.mu.Unlock()
   735  		b.changeRemovesReplica = true
   736  
   737  		// Delete all of the local data. We're going to delete the hard state too.
   738  		// In order for this to be safe we need code above this to promise that we're
   739  		// never going to write hard state in response to a message for a later
   740  		// replica (with a different replica ID) of this range.
   741  		if err := b.r.preDestroyRaftMuLocked(
   742  			ctx,
   743  			b.batch,
   744  			b.batch,
   745  			change.NextReplicaID(),
   746  			false, /* clearRangeIDLocalOnly */
   747  			false, /* mustUseClearRange */
   748  		); err != nil {
   749  			return wrapWithNonDeterministicFailure(err, "unable to destroy replica before removal")
   750  		}
   751  	}
   752  
   753  	// Provide the command's corresponding logical operations to the Replica's
   754  	// rangefeed. Only do so if the WriteBatch is non-nil, in which case the
   755  	// rangefeed requires there to be a corresponding logical operation log or
   756  	// it will shut down with an error. If the WriteBatch is nil then we expect
   757  	// the logical operation log to also be nil. We don't want to trigger a
   758  	// shutdown of the rangefeed in that situation, so we don't pass anything to
   759  	// the rangefeed. If no rangefeed is running at all, this call will be a no-op.
   760  	if ops := cmd.raftCmd.LogicalOpLog; cmd.raftCmd.WriteBatch != nil {
   761  		b.r.handleLogicalOpLogRaftMuLocked(ctx, ops, b.batch)
   762  	} else if ops != nil {
   763  		log.Fatalf(ctx, "non-nil logical op log with nil write batch: %v", cmd.raftCmd)
   764  	}
   765  
   766  	return nil
   767  }
   768  
   769  // stageTrivialReplicatedEvalResult applies the trivial portions of the
   770  // command's ReplicatedEvalResult to the batch's ReplicaState. This function
   771  // modifies the receiver's ReplicaState but does not modify ReplicatedEvalResult
   772  // in order to give the TestingPostApplyFilter testing knob an opportunity to
   773  // inspect the command's ReplicatedEvalResult.
   774  func (b *replicaAppBatch) stageTrivialReplicatedEvalResult(
   775  	ctx context.Context, cmd *replicatedCmd,
   776  ) {
   777  	if raftAppliedIndex := cmd.ent.Index; raftAppliedIndex != 0 {
   778  		b.state.RaftAppliedIndex = raftAppliedIndex
   779  	}
   780  	if leaseAppliedIndex := cmd.leaseIndex; leaseAppliedIndex != 0 {
   781  		b.state.LeaseAppliedIndex = leaseAppliedIndex
   782  	}
   783  	res := cmd.replicatedResult()
   784  
   785  	// Detect whether the incoming stats contain estimates that resulted from the
   786  	// evaluation of a command under the 19.1 cluster version. These were either
   787  	// evaluated on a 19.1 node (where ContainsEstimates is a bool, which maps
   788  	// to 0 and 1 in 19.2+) or on a 19.2 node which hadn't yet had its cluster
   789  	// version bumped.
   790  	//
   791  	// 19.2 nodes will never emit a ContainsEstimates outside of 0 or 1 until
   792  	// the cluster version is active (during command evaluation). When the
   793  	// version is active, they will never emit odd positive numbers (1, 3, ...).
   794  	//
   795  	// As a result, we can pinpoint exactly when the proposer of this command
   796  	// has used the old cluster version: it's when the incoming
   797  	// ContainsEstimates is 1. If so, we need to assume that an old node is processing
   798  	// the same commands (as `true + true = true`), so make sure that `1 + 1 = 1` (see the sketch below).
   799  	_ = clusterversion.VersionContainsEstimatesCounter // see for info on ContainsEstimates migration
   800  	deltaStats := res.Delta.ToStats()
   801  	if deltaStats.ContainsEstimates == 1 && b.state.Stats.ContainsEstimates == 1 {
   802  		deltaStats.ContainsEstimates = 0
   803  	}
   804  
   805  	// Special-cased MVCC stats handling to exploit commutativity of stats delta
   806  	// upgrades. Thanks to commutativity, the spanlatch manager does not have to
   807  	// serialize on the stats key.
   808  	b.state.Stats.Add(deltaStats)
   809  	// Exploit the fact that a split will result in a full stats
   810  	// recomputation to reset the ContainsEstimates flag.
   811  	// If we were running the new VersionContainsEstimatesCounter cluster version,
   812  	// the consistency checker will be able to reset the stats itself, and splits
   813  	// will as a side effect also remove estimates from both the resulting left and right hand sides.
   814  	//
   815  	// TODO(tbg): this can be removed in v20.2 and not earlier.
   816  	// Consider the following scenario:
   817  	// - all nodes are running 19.2
   818  	// - all nodes rebooted into 20.1
   819  	// - cluster version bumped, but node1 doesn't receive the gossip update for that
   820  	// node1 runs a split that should emit ContainsEstimates=-1, but it clamps it to 0/1 because it
   821  	// doesn't know that 20.1 is active.
   822  	if res.Split != nil && deltaStats.ContainsEstimates == 0 {
   823  		b.state.Stats.ContainsEstimates = 0
   824  	}
   825  	if res.State != nil && res.State.UsingAppliedStateKey && !b.state.UsingAppliedStateKey {
   826  		b.migrateToAppliedStateKey = true
   827  	}
   828  }
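
// exampleContainsEstimatesClamp is an illustrative sketch (hypothetical, not
// part of the original file) of the ContainsEstimates migration arithmetic
// described in stageTrivialReplicatedEvalResult above: when both the incoming
// delta and the current stats carry the legacy value 1, the delta is zeroed so
// that 1 + 1 stays 1, mirroring the boolean `true + true = true` behavior of
// 19.1 proposers.
func exampleContainsEstimatesClamp(deltaContainsEstimates, currentContainsEstimates int64) int64 {
	if deltaContainsEstimates == 1 && currentContainsEstimates == 1 {
		deltaContainsEstimates = 0
	}
	return currentContainsEstimates + deltaContainsEstimates
}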
   829  
   830  // ApplyToStateMachine implements the apply.Batch interface. The method handles
   831  // the second phase of applying a command to the replica state machine. It
   832  // writes the application batch's accumulated RocksDB batch to the storage
   833  // engine. This encompasses the persistent state transition portion of entry
   834  // application.
   835  func (b *replicaAppBatch) ApplyToStateMachine(ctx context.Context) error {
   836  	if log.V(4) {
   837  		log.Infof(ctx, "flushing batch %v of %d entries", b.state, b.entries)
   838  	}
   839  
   840  	// Update the node clock with the maximum timestamp of all commands in the
   841  	// batch. This maintains a high water mark for all ops serviced, so that
   842  	// received ops without a timestamp specified are guaranteed one higher than
   843  	// any op already executed for overlapping keys.
   844  	r := b.r
   845  	r.store.Clock().Update(b.maxTS)
   846  
   847  	// Add the replica applied state key to the write batch if this change
   848  	// doesn't remove us.
   849  	if !b.changeRemovesReplica {
   850  		if err := b.addAppliedStateKeyToBatch(ctx); err != nil {
   851  			return err
   852  		}
   853  	}
   854  
   855  	// Apply the write batch to RocksDB. Entry application is done without
   856  	// syncing to disk. The atomicity guarantees of the batch and the fact that
   857  	// the applied state is stored in this batch ensure that if the batch ends
   858  	// up not being durably committed then the entries in this batch will be
   859  	// applied again upon startup. However, if we're removing the replica's data
   860  	// then we sync this batch as it is not safe to call postDestroyRaftMuLocked
   861  	// before ensuring that the replica's data has been synchronously removed.
   862  	// See handleChangeReplicasResult().
   863  	sync := b.changeRemovesReplica
   864  	if err := b.batch.Commit(sync); err != nil {
   865  		return wrapWithNonDeterministicFailure(err, "unable to commit Raft entry batch")
   866  	}
   867  	b.batch.Close()
   868  	b.batch = nil
   869  
   870  	// Update the replica's applied indexes and mvcc stats.
   871  	r.mu.Lock()
   872  	r.mu.state.RaftAppliedIndex = b.state.RaftAppliedIndex
   873  	r.mu.state.LeaseAppliedIndex = b.state.LeaseAppliedIndex
   874  	prevStats := *r.mu.state.Stats
   875  	*r.mu.state.Stats = *b.state.Stats
   876  
   877  	// If the range is now less than its RangeMaxBytes, clear the history of its
   878  	// largest previous max bytes.
   879  	if r.mu.largestPreviousMaxRangeSizeBytes > 0 && b.state.Stats.Total() < *r.mu.zone.RangeMaxBytes {
   880  		r.mu.largestPreviousMaxRangeSizeBytes = 0
   881  	}
   882  
   883  	// Check the queuing conditions while holding the lock.
   884  	needsSplitBySize := r.needsSplitBySizeRLocked()
   885  	needsMergeBySize := r.needsMergeBySizeRLocked()
   886  	r.mu.Unlock()
   887  
   888  	// Record the stats delta in the StoreMetrics.
   889  	deltaStats := *b.state.Stats
   890  	deltaStats.Subtract(prevStats)
   891  	r.store.metrics.addMVCCStats(deltaStats)
   892  
   893  	// Record the write activity, passing a 0 nodeID because replica.writeStats
   894  	// intentionally doesn't track the origin of the writes.
   895  	b.r.writeStats.recordCount(float64(b.mutations), 0 /* nodeID */)
   896  
   897  	// NB: the bootstrap store has a nil split queue.
   898  	// TODO(tbg): the above is probably a lie now.
   899  	now := timeutil.Now()
   900  	if r.store.splitQueue != nil && needsSplitBySize && r.splitQueueThrottle.ShouldProcess(now) {
   901  		r.store.splitQueue.MaybeAddAsync(ctx, r, r.store.Clock().Now())
   902  	}
   903  	// The bootstrap store has a nil merge queue.
   904  	// TODO(tbg): the above is probably a lie now.
   905  	if r.store.mergeQueue != nil && needsMergeBySize && r.mergeQueueThrottle.ShouldProcess(now) {
   906  		// TODO(tbg): for ranges which are small but protected from merges by
   907  		// other means (zone configs etc), this is called on every command, and
   908  		// fires off a goroutine each time. Make this trigger (and potentially
   909  		// the split one above, though it hasn't been observed to be as
   910  		// bothersome) less aggressive.
   911  		r.store.mergeQueue.MaybeAddAsync(ctx, r, r.store.Clock().Now())
   912  	}
   913  
   914  	b.recordStatsOnCommit()
   915  	return nil
   916  }
   917  
   918  // addAppliedStateKeyToBatch adds the applied state key to the application
   919  // batch's RocksDB batch. This records the highest raft and lease index that
   920  // have been applied as of this batch. It also records the Range's mvcc stats.
   921  func (b *replicaAppBatch) addAppliedStateKeyToBatch(ctx context.Context) error {
   922  	loader := &b.r.raftMu.stateLoader
   923  	if b.migrateToAppliedStateKey {
   924  		// A Raft command wants us to begin using the RangeAppliedState key
   925  		// and we haven't performed the migration yet. Delete the old keys
   926  		// that this new key is replacing.
   927  		//
   928  		// NB: entering this branch indicates that the batch contains only a
   929  		// single non-trivial command.
   930  		err := loader.MigrateToRangeAppliedStateKey(ctx, b.batch, b.state.Stats)
   931  		if err != nil {
   932  			return wrapWithNonDeterministicFailure(err, "unable to migrate to range applied state")
   933  		}
   934  		b.state.UsingAppliedStateKey = true
   935  	}
   936  	if b.state.UsingAppliedStateKey {
   937  		// Set the range applied state, which includes the last applied raft and
   938  		// lease index along with the mvcc stats, all in one key.
   939  		if err := loader.SetRangeAppliedState(
   940  			ctx, b.batch, b.state.RaftAppliedIndex, b.state.LeaseAppliedIndex, b.state.Stats,
   941  		); err != nil {
   942  			return wrapWithNonDeterministicFailure(err, "unable to set range applied state")
   943  		}
   944  	} else {
   945  		// Advance the last applied index. We use a blind write in order to avoid
   946  		// reading the previous applied index keys on every write operation. This
   947  		// requires a little additional work in order to maintain the MVCC stats.
   948  		var appliedIndexNewMS enginepb.MVCCStats
   949  		if err := loader.SetLegacyAppliedIndexBlind(
   950  			ctx, b.batch, &appliedIndexNewMS, b.state.RaftAppliedIndex, b.state.LeaseAppliedIndex,
   951  		); err != nil {
   952  			return wrapWithNonDeterministicFailure(err, "unable to set applied index")
   953  		}
   954  		b.state.Stats.SysBytes += appliedIndexNewMS.SysBytes -
   955  			loader.CalcAppliedIndexSysBytes(b.state.RaftAppliedIndex, b.state.LeaseAppliedIndex)
   956  
   957  		// Set the legacy MVCC stats key.
   958  		if err := loader.SetMVCCStats(ctx, b.batch, b.state.Stats); err != nil {
   959  			return wrapWithNonDeterministicFailure(err, "unable to update MVCCStats")
   960  		}
   961  	}
   962  	return nil
   963  }
   964  
   965  func (b *replicaAppBatch) recordStatsOnCommit() {
   966  	b.sm.stats.entriesProcessed += b.entries
   967  	b.sm.stats.numEmptyEntries += b.emptyEntries
   968  	b.sm.stats.batchesProcessed++
   969  
   970  	elapsed := timeutil.Since(b.start)
   971  	b.r.store.metrics.RaftCommandCommitLatency.RecordValue(elapsed.Nanoseconds())
   972  }
   973  
   974  // Close implements the apply.Batch interface.
   975  func (b *replicaAppBatch) Close() {
   976  	if b.batch != nil {
   977  		b.batch.Close()
   978  	}
   979  	*b = replicaAppBatch{}
   980  }
   981  
   982  // ephemeralReplicaAppBatch implements the apply.Batch interface.
   983  //
   984  // The batch performs the bare-minimum amount of work to be able to
   985  // determine whether a replicated command should be rejected or applied.
   986  type ephemeralReplicaAppBatch struct {
   987  	r     *Replica
   988  	state kvserverpb.ReplicaState
   989  }
   990  
   991  // Stage implements the apply.Batch interface.
   992  func (mb *ephemeralReplicaAppBatch) Stage(cmdI apply.Command) (apply.CheckedCommand, error) {
   993  	cmd := cmdI.(*replicatedCmd)
   994  	ctx := cmd.ctx
   995  
   996  	mb.r.shouldApplyCommand(ctx, cmd, &mb.state)
   997  	mb.state.LeaseAppliedIndex = cmd.leaseIndex
   998  	return cmd, nil
   999  }
  1000  
  1001  // ApplyToStateMachine implements the apply.Batch interface.
  1002  func (mb *ephemeralReplicaAppBatch) ApplyToStateMachine(ctx context.Context) error {
  1003  	panic("cannot apply ephemeralReplicaAppBatch to state machine")
  1004  }
  1005  
  1006  // Close implements the apply.Batch interface.
  1007  func (mb *ephemeralReplicaAppBatch) Close() {
  1008  	*mb = ephemeralReplicaAppBatch{}
  1009  }
  1010  
  1011  // ApplySideEffects implements the apply.StateMachine interface. The method
  1012  // handles the third phase of applying a command to the replica state machine.
  1013  //
  1014  // It is called with commands whose write batches have already been committed
  1015  // to the storage engine and whose trivial side-effects have been applied to
  1016  // the Replica's in-memory state. This method deals with applying non-trivial
  1017  // side effects of commands, such as finalizing splits/merges and informing
  1018  // raft about applied config changes.
  1019  func (sm *replicaStateMachine) ApplySideEffects(
  1020  	cmdI apply.CheckedCommand,
  1021  ) (apply.AppliedCommand, error) {
  1022  	cmd := cmdI.(*replicatedCmd)
  1023  	ctx := cmd.ctx
  1024  
  1025  	// Deal with locking during side-effect handling, which is sometimes
  1026  	// associated with complex commands such as splits and merges.
  1027  	if unlock := cmd.splitMergeUnlock; unlock != nil {
  1028  		defer unlock()
  1029  	}
  1030  
  1031  	// Set up the local result prior to handling the ReplicatedEvalResult to
  1032  	// give testing knobs an opportunity to inspect it. An injected corruption
  1033  	// error will lead to replica removal.
  1034  	sm.r.prepareLocalResult(ctx, cmd)
  1035  	if log.ExpensiveLogEnabled(ctx, 2) {
  1036  		log.VEventf(ctx, 2, "%v", cmd.localResult.String())
  1037  	}
  1038  
  1039  	// Handle the ReplicatedEvalResult, executing any side effects of the last
  1040  	// state machine transition.
  1041  	//
  1042  	// Note that this must happen after committing (the engine.Batch), but
  1043  	// before notifying a potentially waiting client.
  1044  	clearTrivialReplicatedEvalResultFields(cmd.replicatedResult())
  1045  	if !cmd.IsTrivial() {
  1046  		shouldAssert, isRemoved := sm.handleNonTrivialReplicatedEvalResult(ctx, *cmd.replicatedResult())
  1047  
  1048  		if isRemoved {
  1049  			return nil, apply.ErrRemoved
  1050  		}
  1051  		// NB: Perform state assertion before acknowledging the client.
  1052  		// Some tests (TestRangeStatsInit) assume that once the store has started
  1053  		// and the first range has a lease, there will not be a later hard-state.
  1054  		if shouldAssert {
  1055  			// Assert that the on-disk state doesn't diverge from the in-memory
  1056  			// state as a result of the side effects.
  1057  			sm.r.mu.Lock()
  1058  			sm.r.assertStateLocked(ctx, sm.r.store.Engine())
  1059  			sm.r.mu.Unlock()
  1060  			sm.stats.stateAssertions++
  1061  		}
  1062  	} else if res := cmd.replicatedResult(); !res.Equal(kvserverpb.ReplicatedEvalResult{}) {
  1063  		log.Fatalf(ctx, "failed to handle all side-effects of ReplicatedEvalResult: %v", res)
  1064  	}
  1065  
  1066  	if cmd.replicatedResult().RaftLogDelta == 0 {
  1067  		sm.r.handleNoRaftLogDeltaResult(ctx)
  1068  	}
  1069  	if cmd.localResult != nil {
  1070  		sm.r.handleReadWriteLocalEvalResult(ctx, *cmd.localResult)
  1071  	}
  1072  	if err := sm.maybeApplyConfChange(ctx, cmd); err != nil {
  1073  		return nil, wrapWithNonDeterministicFailure(err, "unable to apply conf change")
  1074  	}
  1075  
  1076  	// Mark the command as applied and return it as an apply.AppliedCommand.
  1077  	// NB: Commands which were reproposed at a higher MaxLeaseIndex will not be
  1078  	// considered local at this point as their proposal will have been detached
  1079  	// in prepareLocalResult().
  1080  	if cmd.IsLocal() {
  1081  		rejected := cmd.Rejected()
  1082  		higherReproposalsExist := cmd.raftCmd.MaxLeaseIndex != cmd.proposal.command.MaxLeaseIndex
  1083  		if !rejected && higherReproposalsExist {
  1084  			log.Fatalf(ctx, "finishing proposal with outstanding reproposal at a higher max lease index")
  1085  		}
  1086  		if !rejected && cmd.proposal.applied {
  1087  			// If the command already applied then we shouldn't be "finishing" its
  1088  			// application again because it should only be able to apply successfully
  1089  			// once. We expect that when any reproposal for the same command attempts
  1090  			// to apply it will be rejected by the below raft lease sequence or lease
  1091  			// index check in checkForcedErr.
  1092  			log.Fatalf(ctx, "command already applied: %+v; unexpected successful result", cmd)
  1093  		}
  1094  		// If any reproposals at a higher MaxLeaseIndex exist we know that they will
  1095  		// never successfully apply, remove them from the map to avoid future
  1096  		// reproposals. If there is no command referencing this proposal at a higher
  1097  		// MaxLeaseIndex then it will already have been removed (see
  1098  		// shouldRemove in replicaDecoder.retrieveLocalProposals()). It is possible
  1099  		// that a later command in this batch referred to this proposal but it must
  1100  		// have failed because it carried the same MaxLeaseIndex.
  1101  		if higherReproposalsExist {
  1102  			sm.r.mu.Lock()
  1103  			delete(sm.r.mu.proposals, cmd.idKey)
  1104  			sm.r.mu.Unlock()
  1105  		}
  1106  		cmd.proposal.applied = true
  1107  	}
  1108  	return cmd, nil
  1109  }
  1110  
  1111  // handleNonTrivialReplicatedEvalResult carries out the side-effects of
  1112  // non-trivial commands. It is run with the raftMu locked. It is illegal
  1113  // to pass a replicatedResult that does not imply any side-effects.
  1114  func (sm *replicaStateMachine) handleNonTrivialReplicatedEvalResult(
  1115  	ctx context.Context, rResult kvserverpb.ReplicatedEvalResult,
  1116  ) (shouldAssert, isRemoved bool) {
  1117  	// Assert that this replicatedResult implies at least one side-effect.
  1118  	if rResult.Equal(kvserverpb.ReplicatedEvalResult{}) {
  1119  		log.Fatalf(ctx, "zero-value ReplicatedEvalResult passed to handleNonTrivialReplicatedEvalResult")
  1120  	}
  1121  
  1122  	if rResult.State != nil {
  1123  		if rResult.State.TruncatedState != nil {
  1124  			rResult.RaftLogDelta += sm.r.handleTruncatedStateResult(ctx, rResult.State.TruncatedState)
  1125  			rResult.State.TruncatedState = nil
  1126  		}
  1127  
  1128  		if (*rResult.State == kvserverpb.ReplicaState{}) {
  1129  			rResult.State = nil
  1130  		}
  1131  	}
  1132  
  1133  	if rResult.RaftLogDelta != 0 {
  1134  		sm.r.handleRaftLogDeltaResult(ctx, rResult.RaftLogDelta)
  1135  		rResult.RaftLogDelta = 0
  1136  	}
  1137  
  1138  	if rResult.SuggestedCompactions != nil {
  1139  		sm.r.handleSuggestedCompactionsResult(ctx, rResult.SuggestedCompactions)
  1140  		rResult.SuggestedCompactions = nil
  1141  	}
  1142  
  1143  	// The rest of the actions are "nontrivial" and may have large effects on the
  1144  	// in-memory and on-disk ReplicaStates. If any of these actions are present,
  1145  	// we want to assert that these two states do not diverge.
  1146  	shouldAssert = !rResult.Equal(kvserverpb.ReplicatedEvalResult{})
  1147  	if !shouldAssert {
  1148  		return false, false
  1149  	}
  1150  
  1151  	if rResult.Split != nil {
  1152  		sm.r.handleSplitResult(ctx, rResult.Split)
  1153  		rResult.Split = nil
  1154  	}
  1155  
  1156  	if rResult.Merge != nil {
  1157  		sm.r.handleMergeResult(ctx, rResult.Merge)
  1158  		rResult.Merge = nil
  1159  	}
  1160  
  1161  	if rResult.State != nil {
  1162  		if newDesc := rResult.State.Desc; newDesc != nil {
  1163  			sm.r.handleDescResult(ctx, newDesc)
  1164  			rResult.State.Desc = nil
  1165  		}
  1166  
  1167  		if newLease := rResult.State.Lease; newLease != nil {
  1168  			sm.r.handleLeaseResult(ctx, newLease)
  1169  			rResult.State.Lease = nil
  1170  		}
  1171  
  1172  		if newThresh := rResult.State.GCThreshold; newThresh != nil {
  1173  			sm.r.handleGCThresholdResult(ctx, newThresh)
  1174  			rResult.State.GCThreshold = nil
  1175  		}
  1176  
  1177  		if rResult.State.UsingAppliedStateKey {
  1178  			sm.r.handleUsingAppliedStateKeyResult(ctx)
  1179  			rResult.State.UsingAppliedStateKey = false
  1180  		}
  1181  
  1182  		if (*rResult.State == kvserverpb.ReplicaState{}) {
  1183  			rResult.State = nil
  1184  		}
  1185  	}
  1186  
  1187  	if rResult.ChangeReplicas != nil {
  1188  		isRemoved = sm.r.handleChangeReplicasResult(ctx, rResult.ChangeReplicas)
  1189  		rResult.ChangeReplicas = nil
  1190  	}
  1191  
  1192  	if rResult.ComputeChecksum != nil {
  1193  		sm.r.handleComputeChecksumResult(ctx, rResult.ComputeChecksum)
  1194  		rResult.ComputeChecksum = nil
  1195  	}
  1196  
  1197  	if !rResult.Equal(kvserverpb.ReplicatedEvalResult{}) {
  1198  		log.Fatalf(ctx, "unhandled field in ReplicatedEvalResult: %s", pretty.Diff(rResult, kvserverpb.ReplicatedEvalResult{}))
  1199  	}
  1200  	return true, isRemoved
  1201  }
  1202  
  1203  func (sm *replicaStateMachine) maybeApplyConfChange(ctx context.Context, cmd *replicatedCmd) error {
  1204  	switch cmd.ent.Type {
  1205  	case raftpb.EntryNormal:
  1206  		if cmd.replicatedResult().ChangeReplicas != nil {
  1207  			log.Fatalf(ctx, "unexpected replication change from command %s", &cmd.raftCmd)
  1208  		}
  1209  		return nil
  1210  	case raftpb.EntryConfChange, raftpb.EntryConfChangeV2:
  1211  		sm.stats.numConfChangeEntries++
  1212  		if cmd.replicatedResult().ChangeReplicas == nil {
  1213  			// The command was rejected. There is no need to report a ConfChange
  1214  			// to raft.
  1215  			return nil
  1216  		}
  1217  		return sm.r.withRaftGroup(true, func(rn *raft.RawNode) (bool, error) {
  1218  			rn.ApplyConfChange(cmd.confChange.ConfChangeI)
  1219  			return true, nil
  1220  		})
  1221  	default:
  1222  		panic("unexpected")
  1223  	}
  1224  }
  1225  
  1226  func (sm *replicaStateMachine) moveStats() applyCommittedEntriesStats {
  1227  	stats := sm.stats
  1228  	sm.stats = applyCommittedEntriesStats{}
  1229  	return stats
  1230  }