github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_application_result.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"

	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts/ctpb"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
)

// replica_application_*.go files provide concrete implementations of
// the interfaces defined in the storage/apply package:
//
// replica_application_state_machine.go  ->  apply.StateMachine
// replica_application_decoder.go        ->  apply.Decoder
// replica_application_cmd.go            ->  apply.Command         (and variants)
// replica_application_cmd_buf.go        ->  apply.CommandIterator (and variants)
// replica_application_cmd_buf.go        ->  apply.CommandList     (and variants)
//
// These allow Replica to interface with the storage/apply package.

// isTrivial determines whether the side-effects of a ReplicatedEvalResult are
// "trivial". A result is fundamentally considered "trivial" if it does not have
// side effects which rely on the written state of the replica exactly matching
// the in-memory state of the replica at the corresponding log position.
// Non-trivial commands must be applied in their own batch so that after
// the batch is applied the replica's written and in-memory state correspond
// to that log index.
//
// At the time of writing it is possible that the current conditions are too
// strict but they are certainly sufficient.
func isTrivial(r *kvserverpb.ReplicatedEvalResult) bool {
	// Check if there are any non-trivial State updates.
	if r.State != nil {
		stateWhitelist := *r.State
		// ReplicaState.Stats was previously non-nullable which caused nodes to
		// send a zero-value MVCCStats structure. If the proposal was generated by
		// an old node, we'll have decoded that zero-value structure setting
		// ReplicaState.Stats to a non-nil value which would trigger the "unhandled
		// field in ReplicatedEvalResult" assertion to fire if we didn't clear it.
		// TODO(ajwerner): eliminate this case that likely can no longer occur as of
		// at least 19.1.
		if stateWhitelist.Stats != nil && (*stateWhitelist.Stats == enginepb.MVCCStats{}) {
			stateWhitelist.Stats = nil
		}
		if stateWhitelist != (kvserverpb.ReplicaState{}) {
			return false
		}
	}
	// Set whitelist to the value of r and clear the whitelisted fields.
	// If whitelist is zero-valued after clearing the whitelisted fields then
	// it is trivial.
	whitelist := *r
	whitelist.Delta = enginepb.MVCCStatsDelta{}
	whitelist.Timestamp = hlc.Timestamp{}
	whitelist.DeprecatedDelta = nil
	whitelist.PrevLeaseProposal = nil
	whitelist.State = nil
	return whitelist.Equal(kvserverpb.ReplicatedEvalResult{})
}
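
// The following is an illustrative sketch and not part of the original file.
// It shows how isTrivial classifies two hypothetical results, assuming only
// the fields referenced above and that enginepb.MVCCStatsDelta exposes a
// LiveBytes field. A result carrying nothing but a stats delta and a
// timestamp is trivial; the same result marked as a lease request is not,
// because IsLeaseRequest survives the whitelist-clearing step.
func exampleIsTrivial() (trivial, nonTrivial bool) {
	onlyStats := kvserverpb.ReplicatedEvalResult{
		Delta:     enginepb.MVCCStatsDelta{LiveBytes: 64},
		Timestamp: hlc.Timestamp{WallTime: 1},
	}
	leaseRequest := onlyStats
	leaseRequest.IsLeaseRequest = true
	return isTrivial(&onlyStats), isTrivial(&leaseRequest) // true, false
}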

// clearTrivialReplicatedEvalResultFields is used to zero out the fields of a
// ReplicatedEvalResult that have already been consumed when staging the
// corresponding command and applying it to the current batch's view of the
// ReplicaState. This function is called after a batch has been written to the
// storage engine. For trivial commands this function should result in a zero
// value replicatedResult.
func clearTrivialReplicatedEvalResultFields(r *kvserverpb.ReplicatedEvalResult) {
	// Fields for which no action is taken in this method are zeroed so that
	// they don't trigger an assertion at the end of the application process
	// (which checks that all fields were handled).
	r.IsLeaseRequest = false
	r.Timestamp = hlc.Timestamp{}
	r.PrevLeaseProposal = nil
	// The state fields cleared here were already applied to the in-memory view of
	// replica state for this batch.
	if haveState := r.State != nil; haveState {
		r.State.Stats = nil
		if *r.State == (kvserverpb.ReplicaState{}) {
			r.State = nil
		}
	}
	r.Delta = enginepb.MVCCStatsDelta{}
}

// prepareLocalResult is performed after the command has been committed to the
// engine but before its side-effects have been applied to the Replica's
// in-memory state. This method gives the command an opportunity to interact
// with testing knobs and to set up its local result if it was proposed
// locally. This is performed prior to handling the command's
// ReplicatedEvalResult because the process of handling the replicated eval
// result will zero-out the struct to ensure that it has properly performed all
// of the implied side-effects.
func (r *Replica) prepareLocalResult(ctx context.Context, cmd *replicatedCmd) {
	if !cmd.IsLocal() {
		return
	}

	var pErr *roachpb.Error
	if filter := r.store.cfg.TestingKnobs.TestingPostApplyFilter; filter != nil {
		var newPropRetry int
		newPropRetry, pErr = filter(kvserverbase.ApplyFilterArgs{
			CmdID:                cmd.idKey,
			ReplicatedEvalResult: *cmd.replicatedResult(),
			StoreID:              r.store.StoreID(),
			RangeID:              r.RangeID,
		})
		if cmd.proposalRetry == 0 {
			cmd.proposalRetry = proposalReevaluationReason(newPropRetry)
		}
	}
	if pErr == nil {
		pErr = cmd.forcedErr
	}

	if cmd.proposalRetry != proposalNoReevaluation && pErr == nil {
		log.Fatalf(ctx, "proposal with nontrivial retry behavior, but no error: %+v", cmd.proposal)
	}
	if pErr != nil {
		// A forced error was set (i.e. we did not apply the proposal,
		// for instance due to its log position).
		switch cmd.proposalRetry {
		case proposalNoReevaluation:
			cmd.response.Err = pErr
		case proposalIllegalLeaseIndex:
			// If we failed to apply at the right lease index, try again with a
			// new one. This is important for pipelined writes, since they don't
			// have a client watching to retry, so a failure to eventually apply
			// the proposal would be a user-visible error.
			pErr = r.tryReproposeWithNewLeaseIndex(ctx, cmd)
			if pErr != nil {
				log.Warningf(ctx, "failed to repropose with new lease index: %s", pErr)
				cmd.response.Err = pErr
			} else {
				// Unbind the entry's local proposal because we just succeeded
				// in reproposing it and we don't want to acknowledge the client
				// yet.
				cmd.proposal = nil
				return
			}
		default:
			panic("unexpected")
		}
	} else if cmd.proposal.Local.Reply != nil {
		cmd.response.Reply = cmd.proposal.Local.Reply
	} else {
		log.Fatalf(ctx, "proposal must return either a reply or an error: %+v", cmd.proposal)
	}
	cmd.response.EncounteredIntents = cmd.proposal.Local.DetachEncounteredIntents()
	cmd.response.EndTxns = cmd.proposal.Local.DetachEndTxns(pErr != nil)
	if pErr == nil {
		cmd.localResult = cmd.proposal.Local
	} else if cmd.localResult != nil {
		log.Fatalf(ctx, "shouldn't have a local result if command processing failed. pErr: %s", pErr)
	}
}

// tryReproposeWithNewLeaseIndex is used by prepareLocalResult to repropose
// commands that have gotten an illegal lease index error, and that we know
// could not have applied while their lease index was valid (that is, we
// observed all applied entries between proposal and the lease index becoming
// invalid, as opposed to skipping some of them by applying a snapshot).
//
// It is not intended for use elsewhere and is only a top-level function so that
// it can avoid the below_raft_protos check. Returns a nil error if the command
// has already been successfully applied or has been reproposed here or by a
// different entry for the same proposal that hit an illegal lease index error.
func (r *Replica) tryReproposeWithNewLeaseIndex(
	ctx context.Context, cmd *replicatedCmd,
) *roachpb.Error {
	// Note that we don't need to validate anything about the proposal's
	// lease here - if we got this far, we know that everything but the
	// index is valid at this point in the log.
	p := cmd.proposal
	if p.applied || cmd.raftCmd.MaxLeaseIndex != p.command.MaxLeaseIndex {
		// If the command associated with this rejected raft entry already
		// applied then we don't want to repropose it. Doing so could lead
		// to duplicate application of the same proposal.
		//
		// Similarly, if the command associated with this rejected raft
		// entry has a different (larger) MaxLeaseIndex than the one we
		// decoded from the entry itself, the command must have already
		// been reproposed (this can happen if there are multiple copies
		// of the command in the logs; see TestReplicaRefreshMultiple).
		// We must not create multiple copies with multiple lease indexes,
		// so don't repropose it again. This ensures that at any time,
		// there is only up to a single lease index that has a chance of
		// succeeding in the Raft log for a given command.
		return nil
	}

	minTS, untrack := r.store.cfg.ClosedTimestamp.Tracker.Track(ctx)
	defer untrack(ctx, 0, 0, 0) // covers all error paths below
	// NB: p.Request.Timestamp reflects the action of ba.SetActiveTimestamp.
	if p.Request.Timestamp.Less(minTS) {
		// The tracker wants us to forward the request timestamp, but we can't
		// do that without re-evaluating, so give up. The error returned here
		// will go back to DistSender, so send something it can digest.
		lhErr := roachpb.NewError(newNotLeaseHolderError(
			r.mu.state.Lease,
			r.store.StoreID(),
			r.mu.state.Desc,
		))

		return lhErr
	}
	// Some tests check for this log message in the trace.
	log.VEventf(ctx, 2, "retry: proposalIllegalLeaseIndex")

	maxLeaseIndex, pErr := r.propose(ctx, p)
	if pErr != nil {
		return pErr
	}
	// NB: The caller already promises that the lease check succeeded, meaning
	// the sequence numbers match, implying that the lease epoch hasn't changed
	// from what it was under the proposal-time lease.
	untrack(ctx, ctpb.Epoch(r.mu.state.Lease.Epoch), r.RangeID, ctpb.LAI(maxLeaseIndex))
	log.VEventf(ctx, 2, "reproposed command %x at maxLeaseIndex=%d", cmd.idKey, maxLeaseIndex)
	return nil
}
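
// The following is a hedged, illustrative sketch and not part of the original
// file. It restates the guard at the top of tryReproposeWithNewLeaseIndex on
// plain values: a rejected entry is only worth reproposing if its proposal has
// not applied yet and the MaxLeaseIndex decoded from the entry still matches
// the proposal's current MaxLeaseIndex, i.e. no other copy of the command has
// been reproposed in the meantime.
func shouldRepropose(applied bool, entryMaxLeaseIndex, proposalMaxLeaseIndex uint64) bool {
	return !applied && entryMaxLeaseIndex == proposalMaxLeaseIndex
}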

// The following Replica.handleXYZResult methods are called when applying
// non-trivial side effects in replicaStateMachine.ApplySideEffects. As a
// general rule, there is a method for each of the non-trivial fields in
// ReplicatedEvalResult. Most methods are simple enough that they will be
// inlined.

func (r *Replica) handleSplitResult(ctx context.Context, split *kvserverpb.Split) {
	splitPostApply(ctx, split.RHSDelta, &split.SplitTrigger, r)
}

func (r *Replica) handleMergeResult(ctx context.Context, merge *kvserverpb.Merge) {
	if err := r.store.MergeRange(
		ctx, r, merge.LeftDesc, merge.RightDesc, merge.FreezeStart,
	); err != nil {
		// Our in-memory state has diverged from the on-disk state.
		log.Fatalf(ctx, "failed to update store after merging range: %s", err)
	}
}

func (r *Replica) handleDescResult(ctx context.Context, desc *roachpb.RangeDescriptor) {
	r.setDescRaftMuLocked(ctx, desc)
}

func (r *Replica) handleLeaseResult(ctx context.Context, lease *roachpb.Lease) {
	r.leasePostApply(ctx, *lease, false /* permitJump */)
}

func (r *Replica) handleTruncatedStateResult(
	ctx context.Context, t *roachpb.RaftTruncatedState,
) (raftLogDelta int64) {
	r.mu.Lock()
	r.mu.state.TruncatedState = t
	r.mu.Unlock()

	// Clear any entries in the Raft log entry cache for this range up
	// to and including the most recently truncated index.
	r.store.raftEntryCache.Clear(r.RangeID, t.Index+1)

	// Truncate the sideloaded storage. Note that this is safe only if the new
	// truncated state is durably on disk (i.e. synced). This is true at the
	// time of writing but unfortunately could rot.
	log.Eventf(ctx, "truncating sideloaded storage up to (and including) index %d", t.Index)
	size, _, err := r.raftMu.sideloaded.TruncateTo(ctx, t.Index+1)
	if err != nil {
		// We don't *have* to remove these entries for correctness. Log a
		// loud error, but keep humming along.
		log.Errorf(ctx, "while removing sideloaded files during log truncation: %+v", err)
	}
	return -size
}

func (r *Replica) handleGCThresholdResult(ctx context.Context, thresh *hlc.Timestamp) {
	if thresh.IsEmpty() {
		return
	}
	r.mu.Lock()
	r.mu.state.GCThreshold = thresh
	r.mu.Unlock()
}

func (r *Replica) handleUsingAppliedStateKeyResult(ctx context.Context) {
	r.mu.Lock()
	r.mu.state.UsingAppliedStateKey = true
	r.mu.Unlock()
}

func (r *Replica) handleComputeChecksumResult(ctx context.Context, cc *kvserverpb.ComputeChecksum) {
	r.computeChecksumPostApply(ctx, *cc)
}

func (r *Replica) handleChangeReplicasResult(
	ctx context.Context, chng *kvserverpb.ChangeReplicas,
) (changeRemovedReplica bool) {
	// If this command removes us then we would have set the destroy status
	// to destroyReasonRemoved which we detect here.
	//
	// Note that a replica's destroy status is only ever updated under the
	// raftMu and we validated that the replica was not RemovingOrRemoved
	// before processing this raft ready.
	if ds, _ := r.IsDestroyed(); ds != destroyReasonRemoved {
		return false // changeRemovedReplica
	}

	// If this command removes us then we need to go through the process of
	// removing our replica from the store. After this method returns, the code
	// should roughly return all the way up to whoever called handleRaftReady
	// and this Replica should never be heard from again. We can detect if this
	// change removed us by inspecting the replica's destroyStatus. We check the
	// destroy status before processing a raft ready so if we find ourselves with
	// removal pending at this point then we know that this command must be
	// responsible.
	if log.V(1) {
		log.Infof(ctx, "removing replica due to ChangeReplicasTrigger: %v", chng)
	}

	// NB: postDestroyRaftMuLocked requires that the batch which removed the data
	// be durably synced to disk, which we have.
	// See replicaAppBatch.ApplyToStateMachine().
	if err := r.postDestroyRaftMuLocked(ctx, r.GetMVCCStats()); err != nil {
		log.Fatalf(ctx, "failed to run Replica postDestroy: %v", err)
	}

	if err := r.store.removeInitializedReplicaRaftMuLocked(ctx, r, chng.NextReplicaID(), RemoveOptions{
		// We destroyed the data when the batch committed so don't destroy it again.
		DestroyData: false,
		// In order to detect the GC queue racing with other causes of replica removal
		// the store will no-op when removing a replica which is already marked as removed
		// unless we set ignoreDestroyStatus to true.
		ignoreDestroyStatus: true,
	}); err != nil {
		log.Fatalf(ctx, "failed to remove replica: %v", err)
	}
	return true
}

func (r *Replica) handleRaftLogDeltaResult(ctx context.Context, delta int64) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.mu.raftLogSize += delta
	r.mu.raftLogLastCheckSize += delta
	// Ensure raftLog{,LastCheck}Size is not negative since it isn't persisted
	// between server restarts.
	if r.mu.raftLogSize < 0 {
		r.mu.raftLogSize = 0
	}
	if r.mu.raftLogLastCheckSize < 0 {
		r.mu.raftLogLastCheckSize = 0
	}
}
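
// A minimal illustrative sketch, not part of the original file: the clamping
// above restated on plain values. Because the in-memory log sizes are not
// persisted across restarts, a truncation delta can drive them below zero, and
// the handler clamps them back to zero rather than tracking a negative size.
func applyRaftLogDelta(size, delta int64) int64 {
	size += delta
	if size < 0 {
		return 0
	}
	return size
}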

func (r *Replica) handleNoRaftLogDeltaResult(ctx context.Context) {
	// Check whether to queue the range for Raft log truncation if this is
	// not a Raft log truncation command itself. We don't want to check the
	// Raft log for truncation on every write operation or even every operation
	// which occurs after the Raft log exceeds RaftLogQueueStaleSize. The logic
	// below queues the replica for possible Raft log truncation whenever an
	// additional RaftLogQueueStaleSize bytes have been written to the Raft
	// log.
	r.mu.Lock()
	checkRaftLog := r.mu.raftLogSize-r.mu.raftLogLastCheckSize >= RaftLogQueueStaleSize
	if checkRaftLog {
		r.mu.raftLogLastCheckSize = r.mu.raftLogSize
	}
	r.mu.Unlock()
	if checkRaftLog {
		r.store.raftLogQueue.MaybeAddAsync(ctx, r, r.store.Clock().Now())
	}
}

func (r *Replica) handleSuggestedCompactionsResult(
	ctx context.Context, scs []kvserverpb.SuggestedCompaction,
) {
	// TODO(itsbilal): Remove this check once Pebble supports GetSSTables.
	if r.store.compactor == nil {
		return
	}
	for _, sc := range scs {
		r.store.compactor.Suggest(ctx, sc)
	}
}
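
// A hedged, illustrative sketch, not part of the original file: the policy in
// handleNoRaftLogDeltaResult restated on plain values. The replica is only
// queued for a truncation check once another staleSize bytes (i.e.
// RaftLogQueueStaleSize in the code above) have accumulated since the last
// check, rather than on every write.
func needsTruncationCheck(raftLogSize, raftLogLastCheckSize, staleSize int64) bool {
	return raftLogSize-raftLogLastCheckSize >= staleSize
}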