github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_application_state_machine.go

// Copyright 2019 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package kvserver

import (
	"context"
	"fmt"
	"time"

	"github.com/cockroachdb/cockroach/pkg/clusterversion"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/apply"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase"
	"github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/storage"
	"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/log"
	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
	"github.com/cockroachdb/errors"
	"github.com/kr/pretty"
	"go.etcd.io/etcd/raft"
	"go.etcd.io/etcd/raft/raftpb"
)

// replica_application_*.go files provide concrete implementations of
// the interfaces defined in the storage/apply package:
//
// replica_application_state_machine.go -> apply.StateMachine
// replica_application_decoder.go       -> apply.Decoder
// replica_application_cmd.go           -> apply.Command         (and variants)
// replica_application_cmd_buf.go       -> apply.CommandIterator (and variants)
// replica_application_cmd_buf.go       -> apply.CommandList     (and variants)
//
// These allow Replica to interface with the storage/apply package.

// applyCommittedEntriesStats returns stats about what happened during the
// application of a set of raft entries.
//
// TODO(ajwerner): add metrics to go with these stats.
type applyCommittedEntriesStats struct {
	batchesProcessed     int
	entriesProcessed     int
	stateAssertions      int
	numEmptyEntries      int
	numConfChangeEntries int
}

// nonDeterministicFailure is an error type that indicates that a state machine
// transition failed due to an unexpected error. Failure to perform a state
// transition is a form of non-determinism, so it can't be permitted for any
// reason during the application phase of state machine replication. The only
// acceptable recourse is to signal that the replica has become corrupted.
//
// All errors returned by replicaDecoder and replicaStateMachine will be
// instances of this type.
type nonDeterministicFailure struct {
	wrapped  error
	safeExpl string
}

// The provided format string should be safe for reporting.
func makeNonDeterministicFailure(format string, args ...interface{}) error {
	err := errors.Newf(format, args...)
	return &nonDeterministicFailure{
		wrapped:  err,
		safeExpl: err.Error(),
	}
}

// The provided msg should be safe for reporting.
func wrapWithNonDeterministicFailure(err error, format string, args ...interface{}) error {
	return &nonDeterministicFailure{
		wrapped:  errors.Wrapf(err, format, args...),
		safeExpl: fmt.Sprintf(format, args...),
	}
}

// Error implements the error interface.
func (e *nonDeterministicFailure) Error() string {
	return fmt.Sprintf("non-deterministic failure: %s", e.wrapped.Error())
}

// Cause implements the github.com/pkg/errors.causer interface.
func (e *nonDeterministicFailure) Cause() error { return e.wrapped }

// Unwrap implements the github.com/golang/xerrors.Wrapper interface, which is
// planned to be moved to the stdlib in go 1.13.
func (e *nonDeterministicFailure) Unwrap() error { return e.wrapped }

// replicaStateMachine implements the apply.StateMachine interface.
//
// The structure coordinates state transitions within the Replica state machine
// due to the application of replicated commands decoded from committed raft
// entries. Commands are applied to the state machine in a multi-stage process
// whereby individual commands are prepared for application relative to the
// current view of ReplicaState and staged in a replicaAppBatch, the batch is
// committed to the Replica's storage engine atomically, and finally the
// side-effects of each command are applied to the Replica's in-memory state.
type replicaStateMachine struct {
	r *Replica
	// batch is returned from NewBatch(false /* ephemeral */).
	batch replicaAppBatch
	// ephemeralBatch is returned from NewBatch(true /* ephemeral */).
	ephemeralBatch ephemeralReplicaAppBatch
	// stats are updated during command application and reset by moveStats.
	stats applyCommittedEntriesStats
}

// getStateMachine returns the Replica's apply.StateMachine. The Replica's
// raftMu is held for the entire lifetime of the replicaStateMachine.
func (r *Replica) getStateMachine() *replicaStateMachine {
	sm := &r.raftMu.stateMachine
	sm.r = r
	return sm
}

// shouldApplyCommand determines whether or not a command should be applied to
// the replicated state machine after it has been committed to the Raft log. It
// then sets the provided command's leaseIndex, proposalRetry, and forcedErr
// fields and returns whether the command should be applied or rejected.
func (r *Replica) shouldApplyCommand(
	ctx context.Context, cmd *replicatedCmd, replicaState *kvserverpb.ReplicaState,
) bool {
	cmd.leaseIndex, cmd.proposalRetry, cmd.forcedErr = checkForcedErr(
		ctx, cmd.idKey, &cmd.raftCmd, cmd.IsLocal(), replicaState,
	)
	if filter := r.store.cfg.TestingKnobs.TestingApplyFilter; cmd.forcedErr == nil && filter != nil {
		var newPropRetry int
		newPropRetry, cmd.forcedErr = filter(kvserverbase.ApplyFilterArgs{
			CmdID:                cmd.idKey,
			ReplicatedEvalResult: *cmd.replicatedResult(),
			StoreID:              r.store.StoreID(),
			RangeID:              r.RangeID,
		})
		if cmd.proposalRetry == 0 {
			cmd.proposalRetry = proposalReevaluationReason(newPropRetry)
		}
	}
	return cmd.forcedErr == nil
}

// checkForcedErr determines whether or not a command should be applied to the
// replicated state machine after it has been committed to the Raft log. This
// decision is deterministic on all replicas, such that a command that is
// rejected "beneath raft" on one replica will be rejected "beneath raft" on
// all replicas.
//
// The decision about whether or not to apply a command is a combination of
// three checks:
//  1. verify that the command was proposed under the current lease. This is
//     determined using the proposal's ProposerLeaseSequence.
//  2. verify that the command hasn't been re-ordered with other commands that
//     were proposed after it and which already applied. This is determined
//     using the proposal's MaxLeaseIndex.
//  3. verify that the command isn't in violation of the Range's current
//     garbage collection threshold. This is determined using the proposal's
//     Timestamp.
//
// TODO(nvanbenschoten): Unit test this function now that it is stateless.
func checkForcedErr(
	ctx context.Context,
	idKey kvserverbase.CmdIDKey,
	raftCmd *kvserverpb.RaftCommand,
	isLocal bool,
	replicaState *kvserverpb.ReplicaState,
) (uint64, proposalReevaluationReason, *roachpb.Error) {
	leaseIndex := replicaState.LeaseAppliedIndex
	isLeaseRequest := raftCmd.ReplicatedEvalResult.IsLeaseRequest
	var requestedLease roachpb.Lease
	if isLeaseRequest {
		requestedLease = *raftCmd.ReplicatedEvalResult.State.Lease
	}
	if idKey == "" {
		// This is an empty Raft command (which is sent by Raft after elections
		// to trigger reproposals or during concurrent configuration changes).
		// Nothing to do here except making sure that the corresponding batch
		// (which is bogus) doesn't get executed (for it is empty and so
		// properties like key range are undefined).
		return leaseIndex, proposalNoReevaluation, roachpb.NewErrorf("no-op on empty Raft entry")
	}

	// Verify the lease matches the proposer's expectation. We rely on
	// the proposer's determination of whether the existing lease is
	// held, and can be used, or is expired, and can be replaced.
	// Verify checks that the lease has not been modified since proposal
	// due to Raft delays / reorderings.
	// To understand why this lease verification is necessary, see comments on
	// the proposer_lease field in the proto.
	leaseMismatch := false
	if raftCmd.DeprecatedProposerLease != nil {
		// VersionLeaseSequence must not have been active when this was proposed.
		//
		// This does not prevent the lease race condition described below. The
		// reason we don't fix this here as well is because fixing the race
		// requires a new cluster version which implies that we'll already be
		// using lease sequence numbers and will fall into the case below.
		leaseMismatch = !raftCmd.DeprecatedProposerLease.Equivalent(*replicaState.Lease)
	} else {
		leaseMismatch = raftCmd.ProposerLeaseSequence != replicaState.Lease.Sequence
		if !leaseMismatch && isLeaseRequest {
			// Lease sequence numbers are a reflection of lease equivalency
			// between subsequent leases. However, Lease.Equivalent is not fully
			// symmetric, meaning that two leases may be Equivalent to a third
			// lease but not Equivalent to each other. If these leases are
			// proposed under that same third lease, neither will be able to
			// detect whether the other has applied just by looking at the
			// current lease sequence number because neither will increment
			// the sequence number.
			//
			// This can lead to inversions in lease expiration timestamps if
			// we're not careful. To avoid this, if a lease request's proposer
			// lease sequence matches the current lease sequence and the current
			// lease sequence also matches the requested lease sequence, we make
			// sure the requested lease is Equivalent to the current lease.
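			//
			// As a purely illustrative (hypothetical) example: two expiration-based
			// extensions E1 and E2, both proposed under the same lease L, can each
			// be Equivalent to L without being Equivalent to each other, so neither
			// proposal bumps the sequence number; the check below is what prevents
			// the later expiration from being silently regressed by the earlier one.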
			if replicaState.Lease.Sequence == requestedLease.Sequence {
				// It is only possible for this to fail when expiration-based
				// lease extensions are proposed concurrently.
				leaseMismatch = !replicaState.Lease.Equivalent(requestedLease)
			}

			// This is a check to see if the lease we proposed this lease request
			// against is the same lease that we're trying to update. We need to
			// check proposal timestamps because extensions don't increment
			// sequence numbers. Without this check a lease could be extended and
			// then another lease proposed against the original lease would be
			// applied over the extension.
			if raftCmd.ReplicatedEvalResult.PrevLeaseProposal != nil &&
				(*raftCmd.ReplicatedEvalResult.PrevLeaseProposal != *replicaState.Lease.ProposedTS) {
				leaseMismatch = true
			}
		}
	}
	if leaseMismatch {
		log.VEventf(
			ctx, 1,
			"command with lease #%d incompatible to %v",
			raftCmd.ProposerLeaseSequence, *replicaState.Lease,
		)
		if isLeaseRequest {
			// For lease requests we return a special error that
			// redirectOnOrAcquireLease() understands. Note that these
			// requests don't go through the DistSender.
			return leaseIndex, proposalNoReevaluation, roachpb.NewError(&roachpb.LeaseRejectedError{
				Existing:  *replicaState.Lease,
				Requested: requestedLease,
				Message:   "proposed under invalid lease",
			})
		}
		// We return a NotLeaseHolderError so that the DistSender retries.
		// NB: we set proposerStoreID to 0 because we don't know who proposed the
		// Raft command. This is ok, as this is only used for debug information.
		nlhe := newNotLeaseHolderError(replicaState.Lease, 0 /* proposerStoreID */, replicaState.Desc)
		nlhe.CustomMsg = fmt.Sprintf(
			"stale proposal: command was proposed under lease #%d but is being applied "+
				"under lease: %s", raftCmd.ProposerLeaseSequence, replicaState.Lease)
		return leaseIndex, proposalNoReevaluation, roachpb.NewError(nlhe)
	}

	if isLeaseRequest {
		// Lease commands are ignored by the counter (and their MaxLeaseIndex is
		// ignored). This makes sense since lease commands are proposed by anyone,
		// so we can't expect a coherent MaxLeaseIndex. Also, lease proposals are
		// often replayed, so not making them update the counter makes sense from
		// a testing perspective.
		//
		// However, leases get special vetting to make sure we don't give one to
		// a replica that was since removed (see #15385 and a comment in
		// redirectOnOrAcquireLease).
		if _, ok := replicaState.Desc.GetReplicaDescriptor(requestedLease.Replica.StoreID); !ok {
			return leaseIndex, proposalNoReevaluation, roachpb.NewError(&roachpb.LeaseRejectedError{
				Existing:  *replicaState.Lease,
				Requested: requestedLease,
				Message:   "replica not part of range",
			})
		}
	} else if replicaState.LeaseAppliedIndex < raftCmd.MaxLeaseIndex {
		// The happy case: the command is applying at or ahead of the minimal
		// permissible index. It's ok if it skips a few slots (as can happen
		// during rearrangement); this command will apply, but later ones which
		// were proposed at lower indexes may not.
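		// (Illustrative, hypothetical numbers: with LeaseAppliedIndex at 10, a
		// reordered command carrying MaxLeaseIndex 12 applies first and advances
		// the index to 12; a command carrying MaxLeaseIndex 11 then falls into
		// the rejection branch below and must be reproposed.)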
		// Overall though, this is more stable and simpler than requiring
		// commands to apply at their exact lease index: Handling the case in
		// which MaxLeaseIndex > oldIndex+1 is otherwise tricky since we can't
		// tell the client to try again (reproposals could exist and may apply
		// at the right index, leading to a replay), and assigning the required
		// index would be tedious seeing that it would have to rewind sometimes.
		leaseIndex = raftCmd.MaxLeaseIndex
	} else {
		// The command is trying to apply at a past log position. That's
		// unfortunate and hopefully rare; the client on the proposer will try
		// again. Note that in this situation, the leaseIndex does not advance.
		retry := proposalNoReevaluation
		if isLocal {
			log.VEventf(
				ctx, 1,
				"retry proposal %x: applied at lease index %d, required < %d",
				idKey, leaseIndex, raftCmd.MaxLeaseIndex,
			)
			retry = proposalIllegalLeaseIndex
		}
		return leaseIndex, retry, roachpb.NewErrorf(
			"command observed at lease index %d, but required < %d", leaseIndex, raftCmd.MaxLeaseIndex,
		)
	}

	// Verify that the batch timestamp is after the GC threshold. This is
	// necessary because not all commands declare read access on the GC
	// threshold key, even though they implicitly depend on it. This means
	// that access to this state will not be serialized by latching,
	// so we must perform this check upstream and downstream of raft.
	// See #14833.
	ts := raftCmd.ReplicatedEvalResult.Timestamp
	if ts.LessEq(*replicaState.GCThreshold) {
		return leaseIndex, proposalNoReevaluation, roachpb.NewError(&roachpb.BatchTimestampBeforeGCError{
			Timestamp: ts,
			Threshold: *replicaState.GCThreshold,
		})
	}
	return leaseIndex, proposalNoReevaluation, nil
}

// NewBatch implements the apply.StateMachine interface.
func (sm *replicaStateMachine) NewBatch(ephemeral bool) apply.Batch {
	r := sm.r
	if ephemeral {
		mb := &sm.ephemeralBatch
		mb.r = r
		r.mu.RLock()
		mb.state = r.mu.state
		r.mu.RUnlock()
		return mb
	}
	b := &sm.batch
	b.r = r
	b.sm = sm
	b.batch = r.store.engine.NewBatch()
	r.mu.RLock()
	b.state = r.mu.state
	b.state.Stats = &b.stats
	*b.state.Stats = *r.mu.state.Stats
	r.mu.RUnlock()
	b.start = timeutil.Now()
	return b
}

// replicaAppBatch implements the apply.Batch interface.
//
// The structure accumulates state due to the application of raft commands.
// Committed raft commands are applied to the state machine in a multi-stage
// process whereby individual commands are prepared for application relative
// to the current view of ReplicaState and staged in the batch. The batch is
// committed to the state machine's storage engine atomically.
type replicaAppBatch struct {
	r  *Replica
	sm *replicaStateMachine

	// batch accumulates writes implied by the raft entries in this batch.
	batch storage.Batch
	// state is this batch's view of the replica's state. It is copied from
	// under the Replica.mu when the batch is initialized and is updated in
	// stageTrivialReplicatedEvalResult.
	state kvserverpb.ReplicaState
	// stats is stored on the application batch to avoid an allocation in
	// tracking the batch's view of replicaState. All pointer fields in
	// replicaState other than Stats are overwritten completely rather than
	// updated in-place.
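	// (NewBatch points b.state.Stats at this field and copies the replica's
	// current stats into it, so the batch mutates its own copy.)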
	stats enginepb.MVCCStats
	// maxTS is the maximum timestamp that any command that was staged in this
	// batch was evaluated at.
	maxTS hlc.Timestamp
	// migrateToAppliedStateKey tracks whether any command in the batch
	// triggered a migration to the replica applied state key. If so, this
	// migration will be performed when the application batch is committed.
	migrateToAppliedStateKey bool
	// changeRemovesReplica tracks whether the command in the batch (there must
	// be only one) removes this replica from the range.
	changeRemovesReplica bool

	// Statistics.
	entries      int
	emptyEntries int
	mutations    int
	start        time.Time
}

// Stage implements the apply.Batch interface. The method handles the first
// phase of applying a command to the replica state machine.
//
// The first thing the method does is determine whether the command should be
// applied at all or whether it should be rejected and replaced with an empty
// entry. The determination is based on the following rules: the command's
// MaxLeaseIndex must move the state machine's LeaseAppliedIndex forward, the
// proposer's lease (or rather its sequence number) must match that of the state
// machine, and lastly the GCThreshold must be below the timestamp that the
// command evaluated at. If any of the checks fail, the proposal's content is
// wiped and we apply an empty log entry instead. If a rejected command was
// proposed locally, the error will eventually be communicated to the waiting
// proposer. The two typical cases in which errors occur are lease mismatch (in
// which case the caller tries to send the command to the actual leaseholder)
// and violation of the LeaseAppliedIndex (in which case the proposal is retried
// if it was proposed locally).
//
// Assuming all checks were passed, the command's write batch is applied to the
// application batch. Its trivial ReplicatedState updates are then staged in
// the batch. This allows the batch to make an accurate determination about
// whether to accept or reject the next command that is staged without needing
// to actually update the replica state machine in between.
func (b *replicaAppBatch) Stage(cmdI apply.Command) (apply.CheckedCommand, error) {
	cmd := cmdI.(*replicatedCmd)
	ctx := cmd.ctx
	if cmd.ent.Index == 0 {
		return nil, makeNonDeterministicFailure("processRaftCommand requires a non-zero index")
	}
	if idx, applied := cmd.ent.Index, b.state.RaftAppliedIndex; idx != applied+1 {
		// If we have an out of order index, there's corruption. No sense in
		// trying to update anything or running the command. Simply return.
		return nil, makeNonDeterministicFailure("applied index jumped from %d to %d", applied, idx)
	}
	if log.V(4) {
		log.Infof(ctx, "processing command %x: maxLeaseIndex=%d", cmd.idKey, cmd.raftCmd.MaxLeaseIndex)
	}

	// Determine whether the command should be applied to the replicated state
	// machine or whether it should be rejected (and replaced by an empty command).
	// This check is deterministic on all replicas, so if one replica decides to
	// reject a command, all will.
	if !b.r.shouldApplyCommand(ctx, cmd, &b.state) {
		log.VEventf(ctx, 1, "applying command with forced error: %s", cmd.forcedErr)

		// Apply an empty command.
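		// (The entry itself still applies so that the Raft applied index
		// advances; only its effects are cleared out below.)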
		cmd.raftCmd.ReplicatedEvalResult = kvserverpb.ReplicatedEvalResult{}
		cmd.raftCmd.WriteBatch = nil
		cmd.raftCmd.LogicalOpLog = nil
	} else {
		log.Event(ctx, "applying command")
	}

	// Acquire the split or merge lock, if necessary. If a split or merge
	// command was rejected with a below-Raft forced error then its replicated
	// result was just cleared and this will be a no-op.
	if splitMergeUnlock, err := b.r.maybeAcquireSplitMergeLock(ctx, cmd.raftCmd); err != nil {
		if cmd.raftCmd.ReplicatedEvalResult.Split != nil {
			err = wrapWithNonDeterministicFailure(err, "unable to acquire split lock")
		} else {
			err = wrapWithNonDeterministicFailure(err, "unable to acquire merge lock")
		}
		return nil, err
	} else if splitMergeUnlock != nil {
		// Set the splitMergeUnlock on the replicaAppBatch to be called
		// after the batch has been applied (see replicaAppBatch.commit).
		cmd.splitMergeUnlock = splitMergeUnlock
	}

	// Update the batch's max timestamp.
	b.maxTS.Forward(cmd.replicatedResult().Timestamp)

	// Normalize the command, accounting for past migrations.
	b.migrateReplicatedResult(ctx, cmd)

	// Run any triggers that should occur before the batch is applied
	// and before the write batch is staged in the batch.
	if err := b.runPreApplyTriggersBeforeStagingWriteBatch(ctx, cmd); err != nil {
		return nil, err
	}

	// Stage the command's write batch in the application batch.
	if err := b.stageWriteBatch(ctx, cmd); err != nil {
		return nil, err
	}

	// Run any triggers that should occur before the batch is applied
	// but after the write batch is staged in the batch.
	if err := b.runPreApplyTriggersAfterStagingWriteBatch(ctx, cmd); err != nil {
		return nil, err
	}

	// Stage the command's trivial ReplicatedState updates in the batch. Any
	// non-trivial commands will be in their own batch, so delaying their
	// non-trivial ReplicatedState updates until later (without ever staging
	// them in the batch) is sufficient.
	b.stageTrivialReplicatedEvalResult(ctx, cmd)
	b.entries++
	if len(cmd.ent.Data) == 0 {
		b.emptyEntries++
	}

	// The command was checked by shouldApplyCommand, so it can be returned
	// as an apply.CheckedCommand.
	return cmd, nil
}

// migrateReplicatedResult performs any migrations necessary on the command to
// normalize it before applying it to the batch. This may modify the command.
func (b *replicaAppBatch) migrateReplicatedResult(ctx context.Context, cmd *replicatedCmd) {
	// If the command was using the deprecated version of the MVCCStats proto,
	// migrate it to the new version and clear out the field.
	res := cmd.replicatedResult()
	if deprecatedDelta := res.DeprecatedDelta; deprecatedDelta != nil {
		if res.Delta != (enginepb.MVCCStatsDelta{}) {
			log.Fatalf(ctx, "stats delta not empty but deprecated delta provided: %+v", cmd)
		}
		res.Delta = deprecatedDelta.ToStatsDelta()
		res.DeprecatedDelta = nil
	}
}

// stageWriteBatch applies the command's write batch to the application batch's
// RocksDB batch. This batch is committed to RocksDB in replicaAppBatch.commit.
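// If the command carries no write batch, staging is a no-op.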
func (b *replicaAppBatch) stageWriteBatch(ctx context.Context, cmd *replicatedCmd) error {
	wb := cmd.raftCmd.WriteBatch
	if wb == nil {
		return nil
	}
	if mutations, err := storage.RocksDBBatchCount(wb.Data); err != nil {
		log.Errorf(ctx, "unable to read header of committed WriteBatch: %+v", err)
	} else {
		b.mutations += mutations
	}
	if err := b.batch.ApplyBatchRepr(wb.Data, false); err != nil {
		return wrapWithNonDeterministicFailure(err, "unable to apply WriteBatch")
	}
	return nil
}

// changeRemovesStore returns true if any of the removals in this change have storeID.
func changeRemovesStore(
	desc *roachpb.RangeDescriptor, change *kvserverpb.ChangeReplicas, storeID roachpb.StoreID,
) (removesStore bool) {
	curReplica, existsInDesc := desc.GetReplicaDescriptor(storeID)
	// NB: if we're catching up from a preemptive snapshot then we won't
	// exist in the current descriptor and we can't be removed.
	if !existsInDesc {
		return false
	}

	// NB: We don't use change.Removed() because it will include replicas being
	// transitioned to VOTER_OUTGOING.

	// In 19.1 and before we used DeprecatedUpdatedReplicas instead of providing
	// a new range descriptor. Check first whether this is a 19.1-or-earlier
	// command, which uses DeprecatedChangeType and DeprecatedReplica.
	if change.Desc == nil {
		return change.DeprecatedChangeType == roachpb.REMOVE_REPLICA && change.DeprecatedReplica.ReplicaID == curReplica.ReplicaID
	}
	// In 19.2 and beyond we supply the new range descriptor in the change.
	// We know we're removed if we do not appear in the new descriptor.
	_, existsInChange := change.Desc.GetReplicaDescriptor(storeID)
	return !existsInChange
}

// runPreApplyTriggersBeforeStagingWriteBatch runs any triggers that must fire
// before a command is applied to the state machine and before the command's
// write batch is staged in the replicaAppBatch's write batch. It may modify
// the command.
func (b *replicaAppBatch) runPreApplyTriggersBeforeStagingWriteBatch(
	ctx context.Context, cmd *replicatedCmd,
) error {
	if ops := cmd.raftCmd.LogicalOpLog; ops != nil {
		b.r.populatePrevValsInLogicalOpLogRaftMuLocked(ctx, ops, b.batch)
	}
	return nil
}

// runPreApplyTriggersAfterStagingWriteBatch runs any triggers that must fire
// before a command is applied to the state machine but after the command is
// staged in the replicaAppBatch's write batch. It may modify the command.
func (b *replicaAppBatch) runPreApplyTriggersAfterStagingWriteBatch(
	ctx context.Context, cmd *replicatedCmd,
) error {
	res := cmd.replicatedResult()

	// AddSSTable ingestions run before the actual batch gets written to the
	// storage engine. This makes sure that when the Raft command is applied,
	// the ingestion has definitely succeeded. Note that we have taken
	// precautions during command evaluation to avoid having mutations in the
	// WriteBatch that affect the SSTable. Not doing so could result in order
	// reversal (and missing values) here.
	//
	// NB: any command which has an AddSSTable is non-trivial and will be
	// applied in its own batch so it's not possible that any other commands
	// which precede this command can shadow writes from this SSTable.
	if res.AddSSTable != nil {
		copied := addSSTablePreApply(
			ctx,
			b.r.store.cfg.Settings,
			b.r.store.engine,
			b.r.raftMu.sideloaded,
			cmd.ent.Term,
			cmd.ent.Index,
			*res.AddSSTable,
			b.r.store.limiters.BulkIOWriteRate,
		)
		b.r.store.metrics.AddSSTableApplications.Inc(1)
		if copied {
			b.r.store.metrics.AddSSTableApplicationCopies.Inc(1)
		}
		if added := res.Delta.KeyCount; added > 0 {
			b.r.writeStats.recordCount(float64(added), 0)
		}
		res.AddSSTable = nil
	}

	if res.Split != nil {
		// Splits require a new HardState to be written to the new RHS
		// range (and this needs to be atomic with the main batch). This
		// cannot be constructed at evaluation time because it differs
		// on each replica (votes may have already been cast on the
		// uninitialized replica). Write this new hardstate to the batch too.
		// See https://github.com/cockroachdb/cockroach/issues/20629.
		//
		// Alternatively if we discover that the RHS has already been removed
		// from this store, clean up its data.
		splitPreApply(ctx, b.batch, res.Split.SplitTrigger, b.r)

		// The rangefeed processor will no longer be provided logical ops for
		// its entire range, so it needs to be shut down and all registrations
		// need to retry.
		// TODO(nvanbenschoten): It should be possible to only reject registrations
		// that overlap with the new range of the split and keep registrations that
		// are only interested in keys that are still on the original range running.
		b.r.disconnectRangefeedWithReason(
			roachpb.RangeFeedRetryError_REASON_RANGE_SPLIT,
		)
	}

	if merge := res.Merge; merge != nil {
		// Merges require the subsumed range to be atomically deleted when the
		// merge transaction commits.

		// If our range currently has a non-zero replica ID then we know we're
		// safe to commit this merge because of the invariants provided to us
		// by the merge protocol. Namely, if this command committed then all of
		// the replicas in the range descriptor are collocated when this command
		// commits. If we do not have a non-zero replica ID then the logic in
		// Stage should detect that and destroy our preemptive snapshot so we
		// shouldn't ever get here.
		rhsRepl, err := b.r.store.GetReplica(merge.RightDesc.RangeID)
		if err != nil {
			return wrapWithNonDeterministicFailure(err, "unable to get replica for merge")
		}
		// We should already have acquired the raftMu for the rhsRepl and now hold
		// its unlock method in cmd.splitMergeUnlock.
		rhsRepl.raftMu.AssertHeld()

		// Use math.MaxInt32 (mergedTombstoneReplicaID) as the nextReplicaID as an
		// extra safeguard against creating new replicas of the RHS. This isn't
		// required for correctness, since the merge protocol should guarantee that
		// no new replicas of the RHS can ever be created, but it doesn't hurt to
		// be careful.
		const clearRangeIDLocalOnly = true
		const mustClearRange = false
		if err := rhsRepl.preDestroyRaftMuLocked(
			ctx, b.batch, b.batch, mergedTombstoneReplicaID, clearRangeIDLocalOnly, mustClearRange,
		); err != nil {
			return wrapWithNonDeterministicFailure(err, "unable to destroy replica before merge")
		}

		// Shut down rangefeed processors on either side of the merge.
		//
		// NB: It is critical to shut down a rangefeed processor on the surviving
		// replica primarily to deal with the possibility that there are logical ops
		// for the RHS to resolve intents written by the merge transaction. In
		// practice, the only such intents that exist are on the RangeEventTable,
		// but it's good to be consistent here and allow the merge transaction to
		// write to the RHS of a merge. See batcheval.resolveLocalLocks for details
		// on why we resolve RHS intents when committing a merge transaction.
		//
		// TODO(nvanbenschoten): Alternatively we could just adjust the bounds of
		// b.r.Processor to include the rhsRepl span.
		//
		// NB: removeInitializedReplicaRaftMuLocked also disconnects any initialized
		// rangefeeds with REASON_REPLICA_REMOVED. That's ok because we will have
		// already disconnected the rangefeed here.
		b.r.disconnectRangefeedWithReason(
			roachpb.RangeFeedRetryError_REASON_RANGE_MERGED,
		)
		rhsRepl.disconnectRangefeedWithReason(
			roachpb.RangeFeedRetryError_REASON_RANGE_MERGED,
		)
	}

	if res.State != nil && res.State.TruncatedState != nil {
		if apply, err := handleTruncatedStateBelowRaft(
			ctx, b.state.TruncatedState, res.State.TruncatedState, b.r.raftMu.stateLoader, b.batch,
		); err != nil {
			return wrapWithNonDeterministicFailure(err, "unable to handle truncated state")
		} else if !apply {
			// The truncated state was discarded, so make sure we don't apply
			// it to our in-memory state.
			res.State.TruncatedState = nil
			res.RaftLogDelta = 0
			// TODO(ajwerner): consider moving this code.
			// We received a truncation that doesn't apply to us, so we know that
			// there's a leaseholder out there with a log that has earlier entries
			// than ours. That leader also guided our log size computations by
			// giving us RaftLogDeltas for past truncations, and this was likely
			// off. Mark our Raft log size as not trustworthy so that, assuming
			// we step up as leader at some point in the future, we recompute
			// our numbers.
			b.r.mu.Lock()
			b.r.mu.raftLogSizeTrusted = false
			b.r.mu.Unlock()
		}
	}

	// Detect if this command will remove us from the range.
	// If so we stage the removal of all of our range data into this batch.
	// We'll complete the removal when it commits. Later logic detects the
	// removal by inspecting the destroy status.
	//
	// NB: This is the last step in the preApply which durably writes to the
	// replica state so that if it removes the replica it removes everything.
	if change := res.ChangeReplicas; change != nil &&
		changeRemovesStore(b.state.Desc, change, b.r.store.StoreID()) &&
		// Don't remove the data if the testing knobs ask us not to.
		!b.r.store.TestingKnobs().DisableEagerReplicaRemoval {

		// We mark the replica as destroyed so that new commands are not
		// accepted. This destroy status will be detected after the batch commits
		// by Replica.handleChangeReplicasTrigger() to finish the removal.
		//
		// NB: we must be holding the raftMu here because we're in the
		// midst of application.
		b.r.mu.Lock()
		b.r.mu.destroyStatus.Set(
			roachpb.NewRangeNotFoundError(b.r.RangeID, b.r.store.StoreID()),
			destroyReasonRemoved)
		b.r.mu.Unlock()
		b.changeRemovesReplica = true

		// Delete all of the local data. We're going to delete the hard state too.
		// In order for this to be safe we need code above this to promise that we're
		// never going to write hard state in response to a message for a later
		// replica (with a different replica ID) to this range state.
		if err := b.r.preDestroyRaftMuLocked(
			ctx,
			b.batch,
			b.batch,
			change.NextReplicaID(),
			false, /* clearRangeIDLocalOnly */
			false, /* mustUseClearRange */
		); err != nil {
			return wrapWithNonDeterministicFailure(err, "unable to destroy replica before removal")
		}
	}

	// Provide the command's corresponding logical operations to the Replica's
	// rangefeed. Only do so if the WriteBatch is non-nil, in which case the
	// rangefeed requires there to be a corresponding logical operation log or
	// it will shut down with an error. If the WriteBatch is nil then we expect
	// the logical operation log to also be nil. We don't want to trigger a
	// shutdown of the rangefeed in that situation, so we don't pass anything to
	// the rangefeed. If no rangefeed is running at all, this call will be a no-op.
	if ops := cmd.raftCmd.LogicalOpLog; cmd.raftCmd.WriteBatch != nil {
		b.r.handleLogicalOpLogRaftMuLocked(ctx, ops, b.batch)
	} else if ops != nil {
		log.Fatalf(ctx, "non-nil logical op log with nil write batch: %v", cmd.raftCmd)
	}

	return nil
}

// stageTrivialReplicatedEvalResult applies the trivial portions of the
// command's ReplicatedEvalResult to the batch's ReplicaState. This function
// modifies the receiver's ReplicaState but does not modify ReplicatedEvalResult
// in order to give the TestingPostApplyFilter testing knob an opportunity to
// inspect the command's ReplicatedEvalResult.
func (b *replicaAppBatch) stageTrivialReplicatedEvalResult(
	ctx context.Context, cmd *replicatedCmd,
) {
	if raftAppliedIndex := cmd.ent.Index; raftAppliedIndex != 0 {
		b.state.RaftAppliedIndex = raftAppliedIndex
	}
	if leaseAppliedIndex := cmd.leaseIndex; leaseAppliedIndex != 0 {
		b.state.LeaseAppliedIndex = leaseAppliedIndex
	}
	res := cmd.replicatedResult()

	// Detect whether the incoming stats contain estimates that resulted from the
	// evaluation of a command under the 19.1 cluster version. These were either
	// evaluated on a 19.1 node (where ContainsEstimates is a bool, which maps
	// to 0 and 1 in 19.2+) or on a 19.2 node which hadn't yet had its cluster
	// version bumped.
	//
	// 19.2 nodes will never emit a ContainsEstimates outside of 0 or 1 until
	// the cluster version is active (during command evaluation). When the
	// version is active, they will never emit odd positive numbers (1, 3, ...).
	//
	// As a result, we can pinpoint exactly when the proposer of this command
	// has used the old cluster version: it's when the incoming
	// ContainsEstimates is 1. If so, we need to assume that an old node is
	// processing the same commands (where `true + true = true`), so we make
	// sure that `1 + 1 = 1`.
	_ = clusterversion.VersionContainsEstimatesCounter // see for info on ContainsEstimates migration
	deltaStats := res.Delta.ToStats()
	if deltaStats.ContainsEstimates == 1 && b.state.Stats.ContainsEstimates == 1 {
		deltaStats.ContainsEstimates = 0
	}

	// Special-cased MVCC stats handling to exploit commutativity of stats delta
	// upgrades. Thanks to commutativity, the spanlatch manager does not have to
	// serialize on the stats key.
	b.state.Stats.Add(deltaStats)
	// Exploit the fact that a split will result in a full stats
	// recomputation to reset the ContainsEstimates flag.
	// If we were running the new VersionContainsEstimatesCounter cluster version,
	// the consistency checker will be able to reset the stats itself, and splits
	// will as a side effect also remove estimates from both the resulting left
	// and right hand sides.
	//
	// TODO(tbg): this can be removed in v20.2 and not earlier.
	// Consider the following scenario:
	// - all nodes are running 19.2
	// - all nodes rebooted into 20.1
	// - cluster version bumped, but node1 doesn't receive the gossip update for
	//   that
	// - node1 runs a split that should emit ContainsEstimates=-1, but it clamps
	//   it to 0/1 because it doesn't know that 20.1 is active
	if res.Split != nil && deltaStats.ContainsEstimates == 0 {
		b.state.Stats.ContainsEstimates = 0
	}
	if res.State != nil && res.State.UsingAppliedStateKey && !b.state.UsingAppliedStateKey {
		b.migrateToAppliedStateKey = true
	}
}

// ApplyToStateMachine implements the apply.Batch interface. The method handles
// the second phase of applying a command to the replica state machine. It
// writes the application batch's accumulated RocksDB batch to the storage
// engine. This encompasses the persistent state transition portion of entry
// application.
func (b *replicaAppBatch) ApplyToStateMachine(ctx context.Context) error {
	if log.V(4) {
		log.Infof(ctx, "flushing batch %v of %d entries", b.state, b.entries)
	}

	// Update the node clock with the maximum timestamp of all commands in the
	// batch. This maintains a high water mark for all ops serviced, so that
	// received ops without a timestamp specified are guaranteed one higher than
	// any op already executed for overlapping keys.
	r := b.r
	r.store.Clock().Update(b.maxTS)

	// Add the replica applied state key to the write batch if this change
	// doesn't remove us.
	if !b.changeRemovesReplica {
		if err := b.addAppliedStateKeyToBatch(ctx); err != nil {
			return err
		}
	}

	// Apply the write batch to RocksDB. Entry application is done without
	// syncing to disk. The atomicity guarantees of the batch and the fact that
	// the applied state is stored in this batch ensure that if the batch ends
	// up not being durably committed then the entries in this batch will be
	// applied again upon startup. However, if we're removing the replica's data
	// then we sync this batch as it is not safe to call postDestroyRaftMuLocked
	// before ensuring that the replica's data has been synchronously removed.
	// See handleChangeReplicasResult().
	sync := b.changeRemovesReplica
	if err := b.batch.Commit(sync); err != nil {
		return wrapWithNonDeterministicFailure(err, "unable to commit Raft entry batch")
	}
	b.batch.Close()
	b.batch = nil

	// Update the replica's applied indexes and mvcc stats.
	r.mu.Lock()
	r.mu.state.RaftAppliedIndex = b.state.RaftAppliedIndex
	r.mu.state.LeaseAppliedIndex = b.state.LeaseAppliedIndex
	prevStats := *r.mu.state.Stats
	*r.mu.state.Stats = *b.state.Stats

	// If the range is now less than its RangeMaxBytes, clear the history of its
	// largest previous max bytes.
	if r.mu.largestPreviousMaxRangeSizeBytes > 0 && b.state.Stats.Total() < *r.mu.zone.RangeMaxBytes {
		r.mu.largestPreviousMaxRangeSizeBytes = 0
	}

	// Check the queuing conditions while holding the lock.
	needsSplitBySize := r.needsSplitBySizeRLocked()
	needsMergeBySize := r.needsMergeBySizeRLocked()
	r.mu.Unlock()

	// Record the stats delta in the StoreMetrics.
	deltaStats := *b.state.Stats
	deltaStats.Subtract(prevStats)
	r.store.metrics.addMVCCStats(deltaStats)

	// Record the write activity, passing a 0 nodeID because replica.writeStats
	// intentionally doesn't track the origin of the writes.
	b.r.writeStats.recordCount(float64(b.mutations), 0 /* nodeID */)

	// NB: the bootstrap store has a nil split queue.
	// TODO(tbg): the above is probably a lie now.
	now := timeutil.Now()
	if r.store.splitQueue != nil && needsSplitBySize && r.splitQueueThrottle.ShouldProcess(now) {
		r.store.splitQueue.MaybeAddAsync(ctx, r, r.store.Clock().Now())
	}
	// The bootstrap store has a nil merge queue.
	// TODO(tbg): the above is probably a lie now.
	if r.store.mergeQueue != nil && needsMergeBySize && r.mergeQueueThrottle.ShouldProcess(now) {
		// TODO(tbg): for ranges which are small but protected from merges by
		// other means (zone configs etc), this is called on every command, and
		// fires off a goroutine each time. Make this trigger (and potentially
		// the split one above, though it hasn't been observed to be as
		// bothersome) less aggressive.
		r.store.mergeQueue.MaybeAddAsync(ctx, r, r.store.Clock().Now())
	}

	b.recordStatsOnCommit()
	return nil
}

// addAppliedStateKeyToBatch adds the applied state key to the application
// batch's RocksDB batch. This records the highest raft and lease index that
// have been applied as of this batch. It also records the Range's mvcc stats.
func (b *replicaAppBatch) addAppliedStateKeyToBatch(ctx context.Context) error {
	loader := &b.r.raftMu.stateLoader
	if b.migrateToAppliedStateKey {
		// A Raft command wants us to begin using the RangeAppliedState key
		// and we haven't performed the migration yet. Delete the old keys
		// that this new key is replacing.
		//
		// NB: entering this branch indicates that the batch contains only a
		// single non-trivial command.
		err := loader.MigrateToRangeAppliedStateKey(ctx, b.batch, b.state.Stats)
		if err != nil {
			return wrapWithNonDeterministicFailure(err, "unable to migrate to range applied state")
		}
		b.state.UsingAppliedStateKey = true
	}
	if b.state.UsingAppliedStateKey {
		// Set the range applied state, which includes the last applied raft and
		// lease index along with the mvcc stats, all in one key.
		if err := loader.SetRangeAppliedState(
			ctx, b.batch, b.state.RaftAppliedIndex, b.state.LeaseAppliedIndex, b.state.Stats,
		); err != nil {
			return wrapWithNonDeterministicFailure(err, "unable to set range applied state")
		}
	} else {
		// Advance the last applied index. We use a blind write in order to avoid
		// reading the previous applied index keys on every write operation. This
		// requires a little additional work in order to maintain the MVCC stats.
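		// (Because the blind write never reads the previous applied-index keys,
		// the SysBytes adjustment below is computed arithmetically instead of
		// from a read of the old values.)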
		var appliedIndexNewMS enginepb.MVCCStats
		if err := loader.SetLegacyAppliedIndexBlind(
			ctx, b.batch, &appliedIndexNewMS, b.state.RaftAppliedIndex, b.state.LeaseAppliedIndex,
		); err != nil {
			return wrapWithNonDeterministicFailure(err, "unable to set applied index")
		}
		b.state.Stats.SysBytes += appliedIndexNewMS.SysBytes -
			loader.CalcAppliedIndexSysBytes(b.state.RaftAppliedIndex, b.state.LeaseAppliedIndex)

		// Set the legacy MVCC stats key.
		if err := loader.SetMVCCStats(ctx, b.batch, b.state.Stats); err != nil {
			return wrapWithNonDeterministicFailure(err, "unable to update MVCCStats")
		}
	}
	return nil
}

func (b *replicaAppBatch) recordStatsOnCommit() {
	b.sm.stats.entriesProcessed += b.entries
	b.sm.stats.numEmptyEntries += b.emptyEntries
	b.sm.stats.batchesProcessed++

	elapsed := timeutil.Since(b.start)
	b.r.store.metrics.RaftCommandCommitLatency.RecordValue(elapsed.Nanoseconds())
}

// Close implements the apply.Batch interface.
func (b *replicaAppBatch) Close() {
	if b.batch != nil {
		b.batch.Close()
	}
	*b = replicaAppBatch{}
}

// ephemeralReplicaAppBatch implements the apply.Batch interface.
//
// The batch performs the bare-minimum amount of work to be able to
// determine whether a replicated command should be rejected or applied.
type ephemeralReplicaAppBatch struct {
	r     *Replica
	state kvserverpb.ReplicaState
}

// Stage implements the apply.Batch interface.
func (mb *ephemeralReplicaAppBatch) Stage(cmdI apply.Command) (apply.CheckedCommand, error) {
	cmd := cmdI.(*replicatedCmd)
	ctx := cmd.ctx

	mb.r.shouldApplyCommand(ctx, cmd, &mb.state)
	mb.state.LeaseAppliedIndex = cmd.leaseIndex
	return cmd, nil
}

// ApplyToStateMachine implements the apply.Batch interface.
func (mb *ephemeralReplicaAppBatch) ApplyToStateMachine(ctx context.Context) error {
	panic("cannot apply ephemeralReplicaAppBatch to state machine")
}

// Close implements the apply.Batch interface.
func (mb *ephemeralReplicaAppBatch) Close() {
	*mb = ephemeralReplicaAppBatch{}
}

// ApplySideEffects implements the apply.StateMachine interface. The method
// handles the third phase of applying a command to the replica state machine.
//
// It is called with commands whose write batches have already been committed
// to the storage engine and whose trivial side-effects have been applied to
// the Replica's in-memory state. This method deals with applying non-trivial
// side effects of commands, such as finalizing splits/merges and informing
// raft about applied config changes.
func (sm *replicaStateMachine) ApplySideEffects(
	cmdI apply.CheckedCommand,
) (apply.AppliedCommand, error) {
	cmd := cmdI.(*replicatedCmd)
	ctx := cmd.ctx

	// Deal with locking during side-effect handling, which is sometimes
	// associated with complex commands such as splits and merges.
	if unlock := cmd.splitMergeUnlock; unlock != nil {
		defer unlock()
	}

	// Set up the local result prior to handling the ReplicatedEvalResult to
	// give testing knobs an opportunity to inspect it. An injected corruption
	// error will lead to replica removal.
	sm.r.prepareLocalResult(ctx, cmd)
	if log.ExpensiveLogEnabled(ctx, 2) {
		log.VEventf(ctx, 2, "%v", cmd.localResult.String())
	}

	// Handle the ReplicatedEvalResult, executing any side effects of the last
	// state machine transition.
	//
	// Note that this must happen after committing (the engine.Batch), but
	// before notifying a potentially waiting client.
	clearTrivialReplicatedEvalResultFields(cmd.replicatedResult())
	if !cmd.IsTrivial() {
		shouldAssert, isRemoved := sm.handleNonTrivialReplicatedEvalResult(ctx, *cmd.replicatedResult())

		if isRemoved {
			return nil, apply.ErrRemoved
		}
		// NB: Perform state assertion before acknowledging the client.
		// Some tests (TestRangeStatsInit) assume that once the store has started
		// and the first range has a lease, there will not be a later hard-state.
		if shouldAssert {
			// Assert that the on-disk state doesn't diverge from the in-memory
			// state as a result of the side effects.
			sm.r.mu.Lock()
			sm.r.assertStateLocked(ctx, sm.r.store.Engine())
			sm.r.mu.Unlock()
			sm.stats.stateAssertions++
		}
	} else if res := cmd.replicatedResult(); !res.Equal(kvserverpb.ReplicatedEvalResult{}) {
		log.Fatalf(ctx, "failed to handle all side-effects of ReplicatedEvalResult: %v", res)
	}

	if cmd.replicatedResult().RaftLogDelta == 0 {
		sm.r.handleNoRaftLogDeltaResult(ctx)
	}
	if cmd.localResult != nil {
		sm.r.handleReadWriteLocalEvalResult(ctx, *cmd.localResult)
	}
	if err := sm.maybeApplyConfChange(ctx, cmd); err != nil {
		return nil, wrapWithNonDeterministicFailure(err, "unable to apply conf change")
	}

	// Mark the command as applied and return it as an apply.AppliedCommand.
	// NB: Commands which were reproposed at a higher MaxLeaseIndex will not be
	// considered local at this point as their proposal will have been detached
	// in prepareLocalResult().
	if cmd.IsLocal() {
		rejected := cmd.Rejected()
		higherReproposalsExist := cmd.raftCmd.MaxLeaseIndex != cmd.proposal.command.MaxLeaseIndex
		if !rejected && higherReproposalsExist {
			log.Fatalf(ctx, "finishing proposal with outstanding reproposal at a higher max lease index")
		}
		if !rejected && cmd.proposal.applied {
			// If the command already applied then we shouldn't be "finishing" its
			// application again because it should only be able to apply successfully
			// once. We expect that when any reproposal for the same command attempts
			// to apply it will be rejected by the below raft lease sequence or lease
			// index check in checkForcedErr.
			log.Fatalf(ctx, "command already applied: %+v; unexpected successful result", cmd)
		}
		// If any reproposals at a higher MaxLeaseIndex exist we know that they will
		// never successfully apply, remove them from the map to avoid future
		// reproposals. If there is no command referencing this proposal at a higher
		// MaxLeaseIndex then it will already have been removed (see
		// shouldRemove in replicaDecoder.retrieveLocalProposals()). It is possible
		// that a later command in this batch referred to this proposal but it must
		// have failed because it carried the same MaxLeaseIndex.
		if higherReproposalsExist {
			sm.r.mu.Lock()
			delete(sm.r.mu.proposals, cmd.idKey)
			sm.r.mu.Unlock()
		}
		cmd.proposal.applied = true
	}
	return cmd, nil
}

// handleNonTrivialReplicatedEvalResult carries out the side-effects of
// non-trivial commands. It is run with the raftMu locked. It is illegal
// to pass a replicatedResult that does not imply any side-effects.
func (sm *replicaStateMachine) handleNonTrivialReplicatedEvalResult(
	ctx context.Context, rResult kvserverpb.ReplicatedEvalResult,
) (shouldAssert, isRemoved bool) {
	// Assert that this replicatedResult implies at least one side-effect.
	if rResult.Equal(kvserverpb.ReplicatedEvalResult{}) {
		log.Fatalf(ctx, "zero-value ReplicatedEvalResult passed to handleNonTrivialReplicatedEvalResult")
	}

	if rResult.State != nil {
		if rResult.State.TruncatedState != nil {
			rResult.RaftLogDelta += sm.r.handleTruncatedStateResult(ctx, rResult.State.TruncatedState)
			rResult.State.TruncatedState = nil
		}

		if (*rResult.State == kvserverpb.ReplicaState{}) {
			rResult.State = nil
		}
	}

	if rResult.RaftLogDelta != 0 {
		sm.r.handleRaftLogDeltaResult(ctx, rResult.RaftLogDelta)
		rResult.RaftLogDelta = 0
	}

	if rResult.SuggestedCompactions != nil {
		sm.r.handleSuggestedCompactionsResult(ctx, rResult.SuggestedCompactions)
		rResult.SuggestedCompactions = nil
	}

	// The rest of the actions are "nontrivial" and may have large effects on the
	// in-memory and on-disk ReplicaStates. If any of these actions are present,
	// we want to assert that these two states do not diverge.
	shouldAssert = !rResult.Equal(kvserverpb.ReplicatedEvalResult{})
	if !shouldAssert {
		return false, false
	}

	if rResult.Split != nil {
		sm.r.handleSplitResult(ctx, rResult.Split)
		rResult.Split = nil
	}

	if rResult.Merge != nil {
		sm.r.handleMergeResult(ctx, rResult.Merge)
		rResult.Merge = nil
	}

	if rResult.State != nil {
		if newDesc := rResult.State.Desc; newDesc != nil {
			sm.r.handleDescResult(ctx, newDesc)
			rResult.State.Desc = nil
		}

		if newLease := rResult.State.Lease; newLease != nil {
			sm.r.handleLeaseResult(ctx, newLease)
			rResult.State.Lease = nil
		}

		if newThresh := rResult.State.GCThreshold; newThresh != nil {
			sm.r.handleGCThresholdResult(ctx, newThresh)
			rResult.State.GCThreshold = nil
		}

		if rResult.State.UsingAppliedStateKey {
			sm.r.handleUsingAppliedStateKeyResult(ctx)
			rResult.State.UsingAppliedStateKey = false
		}

		if (*rResult.State == kvserverpb.ReplicaState{}) {
			rResult.State = nil
		}
	}

	if rResult.ChangeReplicas != nil {
		isRemoved = sm.r.handleChangeReplicasResult(ctx, rResult.ChangeReplicas)
		rResult.ChangeReplicas = nil
	}

	if rResult.ComputeChecksum != nil {
		sm.r.handleComputeChecksumResult(ctx, rResult.ComputeChecksum)
		rResult.ComputeChecksum = nil
	}

	if !rResult.Equal(kvserverpb.ReplicatedEvalResult{}) {
		log.Fatalf(ctx, "unhandled field in ReplicatedEvalResult: %s", pretty.Diff(rResult, kvserverpb.ReplicatedEvalResult{}))
	}
	return true, isRemoved
}

func (sm *replicaStateMachine) maybeApplyConfChange(ctx context.Context, cmd *replicatedCmd) error {
	switch cmd.ent.Type {
	case raftpb.EntryNormal:
		if cmd.replicatedResult().ChangeReplicas != nil {
			log.Fatalf(ctx, "unexpected replication change from command %s", &cmd.raftCmd)
		}
		return nil
	case raftpb.EntryConfChange, raftpb.EntryConfChangeV2:
		sm.stats.numConfChangeEntries++
		if cmd.replicatedResult().ChangeReplicas == nil {
			// The command was rejected. There is no need to report a ConfChange
			// to raft.
			return nil
		}
		return sm.r.withRaftGroup(true, func(rn *raft.RawNode) (bool, error) {
			rn.ApplyConfChange(cmd.confChange.ConfChangeI)
			return true, nil
		})
	default:
		panic("unexpected")
	}
}

func (sm *replicaStateMachine) moveStats() applyCommittedEntriesStats {
	stats := sm.stats
	sm.stats = applyCommittedEntriesStats{}
	return stats
}