github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/apply/task.go (about) 1 // Copyright 2019 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package apply 12 13 import ( 14 "context" 15 16 "github.com/cockroachdb/errors" 17 "go.etcd.io/etcd/raft/raftpb" 18 ) 19 20 // StateMachine represents an instance of a replicated state machine being 21 // driven by a replication group. The state machine accepts Commands that 22 // have been committed to the replication group's log and applies them to 23 // advance to a new state. 24 // 25 // All state transitions performed by the state machine are expected to be 26 // deterministic, which ensures that if each instance is driven from the 27 // same consistent shared log, they will all stay in sync. 28 type StateMachine interface { 29 // NewBatch creates a new batch that is suitable for accumulating the 30 // effects that a group of Commands will have on the replicated state 31 // machine. Commands are staged in the batch one-by-one and then the 32 // entire batch is committed at once. 33 // 34 // Batch comes in two flavors - real batches and ephemeral batches. 35 // Real batches are capable of accumulating updates from commands and 36 // applying them to the state machine. Ephemeral batches are not able 37 // to make changes to the durable state machine, but can still be used 38 // for the purpose of checking commands to determine whether they will 39 // be rejected or not when staged in a real batch. The principal user 40 // of ephemeral batches is AckCommittedEntriesBeforeApplication. 41 NewBatch(ephemeral bool) Batch 42 // ApplySideEffects applies the in-memory side-effects of a Command to 43 // the replicated state machine. The method will be called in the order 44 // that the commands are committed to the state machine's log. Once the 45 // in-memory side-effects of the Command are applied, an AppliedCommand 46 // is returned so that it can be finished and acknowledged. 47 // 48 // The method will always be called with a Command that has been checked 49 // and whose persistent state transition has been applied to the state 50 // machine. Because this method is called after applying the persistent 51 // state transition for a Command, it may not be called in the case of 52 // an untimely crash. This means that applying these side-effects will 53 // typically update the in-memory representation of the state machine 54 // to the same state that it would be in if the process restarted. 55 ApplySideEffects(CheckedCommand) (AppliedCommand, error) 56 } 57 58 // ErrRemoved can be returned from ApplySideEffects which will stop the 59 // task from processing more commands and return immediately. 60 var ErrRemoved = errors.New("replica removed") 61 62 // Batch accumulates a series of updates from Commands and performs them 63 // all at once to its StateMachine when applied. Groups of Commands will be 64 // staged in the Batch such that one or more trivial Commands are staged or 65 // exactly one non-trivial Command is staged. 66 type Batch interface { 67 // Stage inserts a Command into the Batch. In doing so, the Command is 68 // checked for rejection and a CheckedCommand is returned. 69 Stage(Command) (CheckedCommand, error) 70 // ApplyToStateMachine applies the persistent state transitions staged 71 // in the Batch to the StateMachine, atomically. 72 ApplyToStateMachine(context.Context) error 73 // Close closes the batch and releases any resources that it holds. 74 Close() 75 } 76 77 // Decoder is capable of decoding a list of committed raft entries and 78 // binding any that were locally proposed to their local proposals. 79 type Decoder interface { 80 // DecodeAndBind decodes each of the provided raft entries into commands 81 // and binds any that were proposed locally to their local proposals. 82 // The method must only be called once per Decoder. It returns whether 83 // any of the commands were bound to local proposals waiting for 84 // acknowledgement. 85 DecodeAndBind(context.Context, []raftpb.Entry) (anyLocal bool, _ error) 86 // NewCommandIter creates an iterator over the replicated commands that 87 // were passed to DecodeAndBind. The method must not be called until 88 // after DecodeAndBind is called. 89 NewCommandIter() CommandIterator 90 // Reset resets the Decoder and releases any resources that it holds. 91 Reset() 92 } 93 94 // Task is an object capable of coordinating the application of commands to 95 // a replicated state machine after they have been durably committed to a 96 // raft log. 97 // 98 // Committed raft entries are provided to the task through its Decode 99 // method. The task will then apply these entries to the provided state 100 // machine when ApplyCommittedEntries is called. 101 type Task struct { 102 sm StateMachine 103 dec Decoder 104 105 // Have entries been decoded yet? 106 decoded bool 107 // Were any of the decoded commands locally proposed? 108 anyLocal bool 109 // The maximum number of commands that can be applied in a batch. 110 batchSize int32 111 } 112 113 // MakeTask creates a new task with the provided state machine and decoder. 114 func MakeTask(sm StateMachine, dec Decoder) Task { 115 return Task{sm: sm, dec: dec} 116 } 117 118 // Decode decodes the committed raft entries into commands and prepared for the 119 // commands to be applied to the replicated state machine. 120 func (t *Task) Decode(ctx context.Context, committedEntries []raftpb.Entry) error { 121 var err error 122 t.anyLocal, err = t.dec.DecodeAndBind(ctx, committedEntries) 123 t.decoded = true 124 return err 125 } 126 127 func (t *Task) assertDecoded() { 128 if !t.decoded { 129 panic("Task.Decode not called yet") 130 } 131 } 132 133 // AckCommittedEntriesBeforeApplication attempts to acknowledge the success of 134 // raft entries that have been durably committed to the raft log but have not 135 // yet been applied to the proposer replica's replicated state machine. 136 // 137 // This is safe because a proposal through raft can be known to have succeeded 138 // as soon as it is durably replicated to a quorum of replicas (i.e. has 139 // committed in the raft log). The proposal does not need to wait for the 140 // effects of the proposal to be applied in order to know whether its changes 141 // will succeed or fail. This is because the raft log is the provider of 142 // atomicity and durability for replicated writes, not (ignoring log 143 // truncation) the replicated state machine itself. 144 // 145 // However, there are a few complications to acknowledging the success of a 146 // proposal at this stage: 147 // 148 // 1. Committing an entry in the raft log and having the command in that entry 149 // succeed are similar but not equivalent concepts. Even if the entry succeeds 150 // in achieving durability by replicating to a quorum of replicas, its command 151 // may still be rejected "beneath raft". This means that a (deterministic) 152 // check after replication decides that the command will not be applied to the 153 // replicated state machine. In that case, the client waiting on the result of 154 // the command should not be informed of its success. Luckily, this check is 155 // cheap to perform so we can do it here and when applying the command. 156 // 157 // Determining whether the command will succeed or be rejected before applying 158 // it for real is accomplished using an ephemeral batch. Commands are staged in 159 // the ephemeral batch to acquire CheckedCommands, which can then be acknowledged 160 // immediately even though the ephemeral batch itself cannot be used to update 161 // the durable state machine. Once the rejection status of each command is 162 // determined, any successful commands that permit acknowledgement before 163 // application (see CanAckBeforeApplication) are acknowledged. The ephemeral 164 // batch is then thrown away. 165 // 166 // 2. Some commands perform non-trivial work such as updating Replica configuration 167 // state or performing Range splits. In those cases, it's likely that the client 168 // is interested in not only knowing whether it has succeeded in sequencing the 169 // change in the raft log, but also in knowing when the change has gone into 170 // effect. There's currently no exposed hook to ask for an acknowledgement only 171 // after a command has been applied, so for simplicity the current implementation 172 // only ever acks transactional writes before they have gone into effect. All 173 // other commands wait until they have been applied to ack their client. 174 // 175 // 3. Even though we can determine whether a command has succeeded without applying 176 // it, the effect of the command will not be visible to conflicting commands until 177 // it is applied. Because of this, the client can be informed of the success of 178 // a write at this point, but we cannot release that write's latches until the 179 // write has applied. See ProposalData.signalProposalResult/finishApplication. 180 // 181 // 4. etcd/raft may provided a series of CommittedEntries in a Ready struct that 182 // haven't actually been appended to our own log. This is most common in single 183 // node replication groups, but it is possible when a follower in a multi-node 184 // replication group is catching up after falling behind. In the first case, 185 // the entries are not yet committed so acknowledging them would be a lie. In 186 // the second case, the entries are committed so we could acknowledge them at 187 // this point, but doing so seems risky. To avoid complications in either case, 188 // the method takes a maxIndex parameter that limits the indexes that it will 189 // acknowledge. Typically, callers will supply the highest index that they have 190 // durably written to their raft log for this upper bound. 191 // 192 func (t *Task) AckCommittedEntriesBeforeApplication(ctx context.Context, maxIndex uint64) error { 193 t.assertDecoded() 194 if !t.anyLocal { 195 return nil // fast-path 196 } 197 198 // Create a new ephemeral application batch. All we're interested in is 199 // whether commands will be rejected or not when staged in a real batch. 200 batch := t.sm.NewBatch(true /* ephemeral */) 201 defer batch.Close() 202 203 iter := t.dec.NewCommandIter() 204 defer iter.Close() 205 206 // Collect a batch of trivial commands from the applier. Stop at the first 207 // non-trivial command or at the first command with an index above maxIndex. 208 batchIter := takeWhileCmdIter(iter, func(cmd Command) bool { 209 if cmd.Index() > maxIndex { 210 return false 211 } 212 return cmd.IsTrivial() 213 }) 214 215 // Stage the commands in the (ephemeral) batch. 216 stagedIter, err := mapCmdIter(batchIter, batch.Stage) 217 if err != nil { 218 return err 219 } 220 221 // Acknowledge any locally-proposed commands that succeeded in being staged 222 // in the batch and can be acknowledged before they are actually applied. 223 // Don't acknowledge rejected proposals early because the StateMachine may 224 // want to retry the command instead of returning the error to the client. 225 return forEachCheckedCmdIter(stagedIter, func(cmd CheckedCommand) error { 226 if !cmd.Rejected() && cmd.IsLocal() && cmd.CanAckBeforeApplication() { 227 return cmd.AckSuccess() 228 } 229 return nil 230 }) 231 } 232 233 // SetMaxBatchSize sets the maximum application batch size. If 0, no limit 234 // will be placed on the number of commands that can be applied in a batch. 235 func (t *Task) SetMaxBatchSize(size int) { 236 t.batchSize = int32(size) 237 } 238 239 // ApplyCommittedEntries applies raft entries that have been committed to the 240 // raft log but have not yet been applied to the replicated state machine. 241 func (t *Task) ApplyCommittedEntries(ctx context.Context) error { 242 t.assertDecoded() 243 244 iter := t.dec.NewCommandIter() 245 defer iter.Close() 246 for iter.Valid() { 247 if err := t.applyOneBatch(ctx, iter); err != nil { 248 return err 249 } 250 } 251 return nil 252 } 253 254 // applyOneBatch consumes a batch-worth of commands from the provided iter and 255 // applies them atomically to the StateMachine. A batch will contain either: 256 // a) one or more trivial commands 257 // b) exactly one non-trivial command 258 func (t *Task) applyOneBatch(ctx context.Context, iter CommandIterator) error { 259 // Create a new application batch. 260 batch := t.sm.NewBatch(false /* ephemeral */) 261 defer batch.Close() 262 263 // Consume a batch-worth of commands. 264 pol := trivialPolicy{maxCount: t.batchSize} 265 batchIter := takeWhileCmdIter(iter, func(cmd Command) bool { 266 return pol.maybeAdd(cmd.IsTrivial()) 267 }) 268 269 // Stage each command in the batch. 270 stagedIter, err := mapCmdIter(batchIter, batch.Stage) 271 if err != nil { 272 return err 273 } 274 275 // Apply the persistent state transitions to the state machine. 276 if err := batch.ApplyToStateMachine(ctx); err != nil { 277 return err 278 } 279 280 // Apply the side-effects of each command to the state machine. 281 appliedIter, err := mapCheckedCmdIter(stagedIter, t.sm.ApplySideEffects) 282 if err != nil { 283 return err 284 } 285 286 // Finish and acknowledge the outcome of each command. 287 return forEachAppliedCmdIter(ctx, appliedIter, AppliedCommand.FinishAndAckOutcome) 288 } 289 290 // trivialPolicy encodes a batching policy that allows a batch to consist of 291 // either one or more trivial commands or exactly one non-trivial command. 292 type trivialPolicy struct { 293 maxCount int32 294 295 trivialCount int32 296 nonTrivialCount int32 297 } 298 299 // maybeAdd returns whether a command with the specified triviality should be 300 // added to a batch given the batching policy. If the method returns true, the 301 // command is considered to have been added. 302 func (p *trivialPolicy) maybeAdd(trivial bool) bool { 303 if !trivial { 304 if p.trivialCount+p.nonTrivialCount > 0 { 305 return false 306 } 307 p.nonTrivialCount++ 308 return true 309 } 310 if p.nonTrivialCount > 0 { 311 return false 312 } 313 if p.maxCount > 0 && p.maxCount == p.trivialCount { 314 return false 315 } 316 p.trivialCount++ 317 return true 318 } 319 320 // Close ends the task, releasing any resources that it holds and resetting the 321 // Decoder. The Task cannot be used again after being closed. 322 func (t *Task) Close() { 323 t.dec.Reset() 324 *t = Task{} 325 }