github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_application_result.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/ctpb"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
)

// replica_application_*.go files provide concrete implementations of
// the interfaces defined in the storage/apply package:
//
// replica_application_state_machine.go  ->  apply.StateMachine
// replica_application_decoder.go        ->  apply.Decoder
// replica_application_cmd.go            ->  apply.Command         (and variants)
// replica_application_cmd_buf.go        ->  apply.CommandIterator (and variants)
// replica_application_cmd_buf.go        ->  apply.CommandList     (and variants)
//
// These allow Replica to interface with the storage/apply package.
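
// A rough sketch (not part of the original file) of how these pieces are wired
// together when a raft ready is processed. The helper names getStateMachine and
// getDecoder and the apply.Task methods are recalled from neighboring files and
// the apply package; treat the exact signatures as assumptions rather than a
// transcript of replica_raft.go:
//
//	task := apply.MakeTask(r.getStateMachine(), r.getDecoder())
//	defer task.Close()
//	if err := task.Decode(ctx, committedEntries); err != nil { ... }
//	if err := task.ApplyCommittedEntries(ctx); err != nil { ... }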

// isTrivial determines whether the side-effects of a ReplicatedEvalResult are
// "trivial". A result is fundamentally considered "trivial" if it does not have
// side effects which rely on the written state of the replica exactly matching
// the in-memory state of the replica at the corresponding log position.
// Non-trivial commands must be applied in their own batch so that after
// the batch is applied the replica's written and in-memory state correspond
// to that log index.
//
// At the time of writing it is possible that the current conditions are too
// strict but they are certainly sufficient.
func isTrivial(r *kvserverpb.ReplicatedEvalResult) bool {
	// Check if there are any non-trivial State updates.
	if r.State != nil {
		stateWhitelist := *r.State
		// ReplicaState.Stats was previously non-nullable which caused nodes to
		// send a zero-value MVCCStats structure. If the proposal was generated by
		// an old node, we'll have decoded that zero-value structure setting
		// ReplicaState.Stats to a non-nil value which would trigger the "unhandled
		// field in ReplicatedEvalResult" assertion to fire if we didn't clear it.
		// TODO(ajwerner): eliminate this case that likely can no longer occur as of
		// at least 19.1.
		if stateWhitelist.Stats != nil && (*stateWhitelist.Stats == enginepb.MVCCStats{}) {
			stateWhitelist.Stats = nil
		}
		if stateWhitelist != (kvserverpb.ReplicaState{}) {
			return false
		}
	}
	// Set whitelist to the value of r and clear the whitelisted fields.
	// If whitelist is zero-valued after clearing the whitelisted fields then
	// it is trivial.
	whitelist := *r
	whitelist.Delta = enginepb.MVCCStatsDelta{}
	whitelist.Timestamp = hlc.Timestamp{}
	whitelist.DeprecatedDelta = nil
	whitelist.PrevLeaseProposal = nil
	whitelist.State = nil
	return whitelist.Equal(kvserverpb.ReplicatedEvalResult{})
}
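
// exampleNeedsOwnBatch is an illustrative sketch, not part of the original
// file. It demonstrates the contract that the application pipeline relies on:
// a command whose ReplicatedEvalResult is not trivial must be staged and
// applied in its own batch. The real batching decision lives in
// replica_application_state_machine.go; this helper only shows how isTrivial
// is meant to be consumed.
func exampleNeedsOwnBatch(res *kvserverpb.ReplicatedEvalResult) bool {
	// Trivial results may share a batch with other trivial results; anything
	// else must be applied alone so that the written and in-memory state
	// correspond exactly at its log index.
	return !isTrivial(res)
}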

// clearTrivialReplicatedEvalResultFields is used to zero out the fields of a
// ReplicatedEvalResult that have already been consumed when staging the
// corresponding command and applying it to the current batch's view of the
// ReplicaState. This function is called after a batch has been written to the
// storage engine. For trivial commands this function should result in a
// zero-value replicatedResult.
func clearTrivialReplicatedEvalResultFields(r *kvserverpb.ReplicatedEvalResult) {
	// Fields for which no action is taken in this method are zeroed so that
	// they don't trigger an assertion at the end of the application process
	// (which checks that all fields were handled).
	r.IsLeaseRequest = false
	r.Timestamp = hlc.Timestamp{}
	r.PrevLeaseProposal = nil
	// The state fields cleared here were already applied to the in-memory view of
	// replica state for this batch.
	if haveState := r.State != nil; haveState {
		r.State.Stats = nil
		if *r.State == (kvserverpb.ReplicaState{}) {
			r.State = nil
		}
	}
	r.Delta = enginepb.MVCCStatsDelta{}
}
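
// exampleTrivialResultFullyConsumed is an illustrative sketch, not part of the
// original file. It spells out the invariant described above: for a command
// whose result is trivial, clearing the already-consumed fields should leave a
// zero-value ReplicatedEvalResult, which is what the assertion at the end of
// command application checks. The real check lives in
// replica_application_state_machine.go; like the real application path, this
// mutates its argument.
func exampleTrivialResultFullyConsumed(res *kvserverpb.ReplicatedEvalResult) bool {
	clearTrivialReplicatedEvalResultFields(res)
	return res.Equal(kvserverpb.ReplicatedEvalResult{})
}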

// prepareLocalResult is performed after the command has been committed to the
// engine but before its side-effects have been applied to the Replica's
// in-memory state. This method gives the command an opportunity to interact
// with testing knobs and to set up its local result if it was proposed
// locally. This is performed prior to handling the command's
// ReplicatedEvalResult because the process of handling the replicated eval
// result will zero out the struct to ensure that it has properly performed all
// of the implied side-effects.
func (r *Replica) prepareLocalResult(ctx context.Context, cmd *replicatedCmd) {
	if !cmd.IsLocal() {
		return
	}

	var pErr *roachpb.Error
	if filter := r.store.cfg.TestingKnobs.TestingPostApplyFilter; filter != nil {
		var newPropRetry int
		newPropRetry, pErr = filter(kvserverbase.ApplyFilterArgs{
			CmdID:                cmd.idKey,
			ReplicatedEvalResult: *cmd.replicatedResult(),
			StoreID:              r.store.StoreID(),
			RangeID:              r.RangeID,
		})
		if cmd.proposalRetry == 0 {
			cmd.proposalRetry = proposalReevaluationReason(newPropRetry)
		}
	}
	if pErr == nil {
		pErr = cmd.forcedErr
	}

	if cmd.proposalRetry != proposalNoReevaluation && pErr == nil {
		log.Fatalf(ctx, "proposal with nontrivial retry behavior, but no error: %+v", cmd.proposal)
	}
	if pErr != nil {
		// A forced error was set (i.e. we did not apply the proposal,
		// for instance due to its log position).
		switch cmd.proposalRetry {
		case proposalNoReevaluation:
			cmd.response.Err = pErr
		case proposalIllegalLeaseIndex:
			// If we failed to apply at the right lease index, try again with a
			// new one. This is important for pipelined writes, since they don't
			// have a client watching to retry, so a failure to eventually apply
			// the proposal would be a user-visible error.
			pErr = r.tryReproposeWithNewLeaseIndex(ctx, cmd)
			if pErr != nil {
				log.Warningf(ctx, "failed to repropose with new lease index: %s", pErr)
				cmd.response.Err = pErr
			} else {
				// Unbind the entry's local proposal because we just succeeded
				// in reproposing it and we don't want to acknowledge the client
				// yet.
				cmd.proposal = nil
				return
			}
		default:
			panic("unexpected")
		}
	} else if cmd.proposal.Local.Reply != nil {
		cmd.response.Reply = cmd.proposal.Local.Reply
	} else {
		log.Fatalf(ctx, "proposal must return either a reply or an error: %+v", cmd.proposal)
	}
	cmd.response.EncounteredIntents = cmd.proposal.Local.DetachEncounteredIntents()
	cmd.response.EndTxns = cmd.proposal.Local.DetachEndTxns(pErr != nil)
	if pErr == nil {
		cmd.localResult = cmd.proposal.Local
	} else if cmd.localResult != nil {
		log.Fatalf(ctx, "shouldn't have a local result if command processing failed. pErr: %s", pErr)
	}
}

// tryReproposeWithNewLeaseIndex is used by prepareLocalResult to repropose
// commands that have gotten an illegal lease index error, and that we know
// could not have applied while their lease index was valid (that is, we
// observed all applied entries between proposal and the lease index becoming
// invalid, as opposed to skipping some of them by applying a snapshot).
//
// It is not intended for use elsewhere and is only a top-level function so that
// it can avoid the below_raft_protos check. Returns a nil error if the command
// has already been successfully applied or has been reproposed here or by a
// different entry for the same proposal that hit an illegal lease index error.
func (r *Replica) tryReproposeWithNewLeaseIndex(
	ctx context.Context, cmd *replicatedCmd,
) *roachpb.Error {
	// Note that we don't need to validate anything about the proposal's
	// lease here - if we got this far, we know that everything but the
	// index is valid at this point in the log.
	p := cmd.proposal
	if p.applied || cmd.raftCmd.MaxLeaseIndex != p.command.MaxLeaseIndex {
		// If the command associated with this rejected raft entry already
		// applied then we don't want to repropose it. Doing so could lead
		// to duplicate application of the same proposal.
		//
		// Similarly, if the command associated with this rejected raft
		// entry has a different (larger) MaxLeaseIndex than the one we
		// decoded from the entry itself, the command must have already
		// been reproposed (this can happen if there are multiple copies
		// of the command in the logs; see TestReplicaRefreshMultiple).
		// We must not create multiple copies with multiple lease indexes,
		// so don't repropose it again. This ensures that at any time,
		// there is only up to a single lease index that has a chance of
		// succeeding in the Raft log for a given command.
		return nil
	}

	minTS, untrack := r.store.cfg.ClosedTimestamp.Tracker.Track(ctx)
	defer untrack(ctx, 0, 0, 0) // covers all error paths below
	// NB: p.Request.Timestamp reflects the action of ba.SetActiveTimestamp.
	if p.Request.Timestamp.Less(minTS) {
		// The tracker wants us to forward the request timestamp, but we can't
		// do that without re-evaluating, so give up. The error returned here
		// will go back to DistSender, so send something it can digest.
		lhErr := roachpb.NewError(newNotLeaseHolderError(
			r.mu.state.Lease,
			r.store.StoreID(),
			r.mu.state.Desc,
		))

		return lhErr
	}
	// Some tests check for this log message in the trace.
	log.VEventf(ctx, 2, "retry: proposalIllegalLeaseIndex")

	maxLeaseIndex, pErr := r.propose(ctx, p)
	if pErr != nil {
		return pErr
	}
	// NB: The caller already promises that the lease check succeeded, meaning
	// the sequence numbers match, implying that the lease epoch hasn't changed
	// from what it was under the proposal-time lease.
	untrack(ctx, ctpb.Epoch(r.mu.state.Lease.Epoch), r.RangeID, ctpb.LAI(maxLeaseIndex))
	log.VEventf(ctx, 2, "reproposed command %x at maxLeaseIndex=%d", cmd.idKey, maxLeaseIndex)
	return nil
}

// The following Replica.handleXYZResult methods are called when applying
// non-trivial side effects in replicaStateMachine.ApplySideEffects. As a
// general rule, there is a method for each of the non-trivial fields in
// ReplicatedEvalResult. Most methods are simple enough that they will be
// inlined.
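
// exampleApplyNonTrivialSideEffects is an illustrative sketch, not part of the
// original file. replicaStateMachine.ApplySideEffects inspects the non-trivial
// fields of the ReplicatedEvalResult and invokes the matching handler below;
// this simplified stand-in shows the shape of that dispatch for a few fields
// and is not a copy of the real method.
func exampleApplyNonTrivialSideEffects(
	ctx context.Context, r *Replica, res *kvserverpb.ReplicatedEvalResult,
) {
	if res.Split != nil {
		r.handleSplitResult(ctx, res.Split)
	}
	if res.Merge != nil {
		r.handleMergeResult(ctx, res.Merge)
	}
	if res.State != nil && res.State.Desc != nil {
		r.handleDescResult(ctx, res.State.Desc)
	}
	// ... and so on for the remaining non-trivial fields (lease, truncated
	// state, GC threshold, checksum, change replicas, raft log delta, and
	// suggested compactions).
}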

func (r *Replica) handleSplitResult(ctx context.Context, split *kvserverpb.Split) {
	splitPostApply(ctx, split.RHSDelta, &split.SplitTrigger, r)
}

func (r *Replica) handleMergeResult(ctx context.Context, merge *kvserverpb.Merge) {
	if err := r.store.MergeRange(
		ctx, r, merge.LeftDesc, merge.RightDesc, merge.FreezeStart,
	); err != nil {
		// Our in-memory state has diverged from the on-disk state.
		log.Fatalf(ctx, "failed to update store after merging range: %s", err)
	}
}

func (r *Replica) handleDescResult(ctx context.Context, desc *roachpb.RangeDescriptor) {
	r.setDescRaftMuLocked(ctx, desc)
}

func (r *Replica) handleLeaseResult(ctx context.Context, lease *roachpb.Lease) {
	r.leasePostApply(ctx, *lease, false /* permitJump */)
}

func (r *Replica) handleTruncatedStateResult(
	ctx context.Context, t *roachpb.RaftTruncatedState,
) (raftLogDelta int64) {
	r.mu.Lock()
	r.mu.state.TruncatedState = t
	r.mu.Unlock()

	// Clear any entries in the Raft log entry cache for this range up
	// to and including the most recently truncated index.
	r.store.raftEntryCache.Clear(r.RangeID, t.Index+1)

	// Truncate the sideloaded storage. Note that this is safe only if the new
	// truncated state is durably on disk (i.e. synced). This is true at the time
	// of writing but unfortunately could rot.
	log.Eventf(ctx, "truncating sideloaded storage up to (and including) index %d", t.Index)
	size, _, err := r.raftMu.sideloaded.TruncateTo(ctx, t.Index+1)
	if err != nil {
		// We don't *have* to remove these entries for correctness. Log a
		// loud error, but keep humming along.
		log.Errorf(ctx, "while removing sideloaded files during log truncation: %+v", err)
	}
	return -size
}

func (r *Replica) handleGCThresholdResult(ctx context.Context, thresh *hlc.Timestamp) {
	if thresh.IsEmpty() {
		return
	}
	r.mu.Lock()
	r.mu.state.GCThreshold = thresh
	r.mu.Unlock()
}

func (r *Replica) handleUsingAppliedStateKeyResult(ctx context.Context) {
	r.mu.Lock()
	r.mu.state.UsingAppliedStateKey = true
	r.mu.Unlock()
}

func (r *Replica) handleComputeChecksumResult(ctx context.Context, cc *kvserverpb.ComputeChecksum) {
	r.computeChecksumPostApply(ctx, *cc)
}

func (r *Replica) handleChangeReplicasResult(
	ctx context.Context, chng *kvserverpb.ChangeReplicas,
) (changeRemovedReplica bool) {
	// If this command removes us then we would have set the destroy status
	// to destroyReasonRemoved, which we detect here.
	//
	// Note that a replica's destroy status is only ever updated under the
	// raftMu and we validated that the replica was not RemovingOrRemoved
	// before processing this raft ready.
	if ds, _ := r.IsDestroyed(); ds != destroyReasonRemoved {
		return false // changeRemovedReplica
	}

	// If this command removes us then we need to go through the process of
	// removing our replica from the store. After this method returns, the code
	// should roughly return all the way up to whoever called handleRaftReady
	// and this Replica should never be heard from again. We can detect if this
	// change removed us by inspecting the replica's destroyStatus. We check the
	// destroy status before processing a raft ready so if we find ourselves with
	// removal pending at this point then we know that this command must be
	// responsible.
	if log.V(1) {
		log.Infof(ctx, "removing replica due to ChangeReplicasTrigger: %v", chng)
	}

	// NB: postDestroyRaftMuLocked requires that the batch which removed the data
	// be durably synced to disk, which we have.
	// See replicaAppBatch.ApplyToStateMachine().
	if err := r.postDestroyRaftMuLocked(ctx, r.GetMVCCStats()); err != nil {
		log.Fatalf(ctx, "failed to run Replica postDestroy: %v", err)
	}

	if err := r.store.removeInitializedReplicaRaftMuLocked(ctx, r, chng.NextReplicaID(), RemoveOptions{
		// We destroyed the data when the batch committed so don't destroy it again.
		DestroyData: false,
		// In order to detect the GC queue racing with other causes of replica
		// removal, the store will no-op when removing a replica which is already
		// marked as removed, unless we set ignoreDestroyStatus to true.
		ignoreDestroyStatus: true,
	}); err != nil {
		log.Fatalf(ctx, "failed to remove replica: %v", err)
	}
	return true
}

func (r *Replica) handleRaftLogDeltaResult(ctx context.Context, delta int64) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.mu.raftLogSize += delta
	r.mu.raftLogLastCheckSize += delta
	// Ensure raftLog{,LastCheck}Size is not negative since it isn't persisted
	// between server restarts.
	if r.mu.raftLogSize < 0 {
		r.mu.raftLogSize = 0
	}
	if r.mu.raftLogLastCheckSize < 0 {
		r.mu.raftLogLastCheckSize = 0
	}
}

func (r *Replica) handleNoRaftLogDeltaResult(ctx context.Context) {
	// Check for whether to queue the range for Raft log truncation if this is
	// not a Raft log truncation command itself. We don't want to check the
	// Raft log for truncation on every write operation or even every operation
	// which occurs after the Raft log exceeds RaftLogQueueStaleSize. The logic
	// below queues the replica for possible Raft log truncation whenever an
	// additional RaftLogQueueStaleSize bytes have been written to the Raft
	// log.
	r.mu.Lock()
	checkRaftLog := r.mu.raftLogSize-r.mu.raftLogLastCheckSize >= RaftLogQueueStaleSize
	if checkRaftLog {
		r.mu.raftLogLastCheckSize = r.mu.raftLogSize
	}
	r.mu.Unlock()
	if checkRaftLog {
		r.store.raftLogQueue.MaybeAddAsync(ctx, r, r.store.Clock().Now())
	}
}

func (r *Replica) handleSuggestedCompactionsResult(
	ctx context.Context, scs []kvserverpb.SuggestedCompaction,
) {
	// TODO(itsbilal): Remove this check once Pebble supports GetSSTables.
	if r.store.compactor == nil {
		return
	}
	for _, sc := range scs {
		r.store.compactor.Suggest(ctx, sc)
	}
}