github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/apply/task.go

github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/apply/task.go (about)

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package apply
    12  
    13  import (
    14  	"context"
    15  
    16  	"github.com/cockroachdb/errors"
    17  	"go.etcd.io/etcd/raft/raftpb"
    18  )
    19  
// StateMachine represents an instance of a replicated state machine being
// driven by a replication group. The state machine accepts Commands that
// have been committed to the replication group's log and applies them to
// advance to a new state.
//
// All state transitions performed by the state machine are expected to be
// deterministic, which ensures that if each instance is driven from the
// same consistent shared log, they will all stay in sync.
type StateMachine interface {
	// NewBatch creates a new batch that is suitable for accumulating the
	// effects that a group of Commands will have on the replicated state
	// machine. Commands are staged in the batch one-by-one and then the
	// entire batch is committed at once.
	//
	// Batch comes in two flavors - real batches and ephemeral batches.
	// Real batches are capable of accumulating updates from commands and
	// applying them to the state machine. Ephemeral batches are not able
	// to make changes to the durable state machine, but can still be used
	// for the purpose of checking commands to determine whether they will
	// be rejected or not when staged in a real batch. The principal user
	// of ephemeral batches is AckCommittedEntriesBeforeApplication.
	//
	// Task requests an ephemeral batch (ephemeral == true) only from
	// AckCommittedEntriesBeforeApplication; every batch created while
	// applying committed entries is a real batch (ephemeral == false).
	NewBatch(ephemeral bool) Batch
	// ApplySideEffects applies the in-memory side-effects of a Command to
	// the replicated state machine. The method will be called in the order
	// that the commands are committed to the state machine's log. Once the
	// in-memory side-effects of the Command are applied, an AppliedCommand
	// is returned so that it can be finished and acknowledged.
	//
	// The method will always be called with a Command that has been checked
	// and whose persistent state transition has been applied to the state
	// machine. Because this method is called after applying the persistent
	// state transition for a Command, it may not be called in the case of
	// an untimely crash. This means that applying these side-effects will
	// typically update the in-memory representation of the state machine
	// to the same state that it would be in if the process restarted.
	//
	// See ErrRemoved for an error that implementations can return to stop
	// the task from processing further commands.
	ApplySideEffects(CheckedCommand) (AppliedCommand, error)
}
    57  
// ErrRemoved is a sentinel error that can be returned from ApplySideEffects.
// Returning it stops the task from processing more commands and makes the
// task return immediately.
var ErrRemoved = errors.New("replica removed")
    61  
// Batch accumulates a series of updates from Commands and performs them
// all at once to its StateMachine when applied. Groups of Commands will be
// staged in the Batch such that one or more trivial Commands are staged or
// exactly one non-trivial Command is staged.
//
// Task defers Close right after obtaining a Batch from NewBatch, so Close
// is invoked on every path, including when staging or applying fails.
type Batch interface {
	// Stage inserts a Command into the Batch. In doing so, the Command is
	// checked for rejection and a CheckedCommand is returned.
	Stage(Command) (CheckedCommand, error)
	// ApplyToStateMachine applies the persistent state transitions staged
	// in the Batch to the StateMachine, atomically. Task calls it once per
	// real batch, after all of the batch's commands have been staged, and
	// never calls it on an ephemeral batch.
	ApplyToStateMachine(context.Context) error
	// Close closes the batch and releases any resources that it holds.
	Close()
}
    76  
// Decoder is capable of decoding a list of committed raft entries and
// binding any that were locally proposed to their local proposals.
type Decoder interface {
	// DecodeAndBind decodes each of the provided raft entries into commands
	// and binds any that were proposed locally to their local proposals.
	// The method must only be called once per Decoder. It returns whether
	// any of the commands were bound to local proposals waiting for
	// acknowledgement.
	//
	// Task.Decode is the caller of this method; it records the returned
	// anyLocal flag so that early acknowledgement can be skipped when no
	// command was proposed locally.
	DecodeAndBind(context.Context, []raftpb.Entry) (anyLocal bool, _ error)
	// NewCommandIter creates an iterator over the replicated commands that
	// were passed to DecodeAndBind. The method must not be called until
	// after DecodeAndBind is called.
	//
	// Task requests a fresh iterator from both
	// AckCommittedEntriesBeforeApplication and ApplyCommittedEntries and
	// closes each one before returning.
	NewCommandIter() CommandIterator
	// Reset resets the Decoder and releases any resources that it holds.
	// Task.Close calls it.
	Reset()
}
    93  
// Task is an object capable of coordinating the application of commands to
// a replicated state machine after they have been durably committed to a
// raft log.
//
// Committed raft entries are provided to the task through its Decode
// method. The task will then apply these entries to the provided state
// machine when ApplyCommittedEntries is called.
//
// Both AckCommittedEntriesBeforeApplication and ApplyCommittedEntries panic
// if they are called before Decode.
type Task struct {
	// sm is the state machine that batches are created from and that
	// side-effects are applied to.
	sm StateMachine
	// dec decodes the committed entries and hands out command iterators.
	dec Decoder

	// Have entries been decoded yet? Set by Decode and checked by
	// assertDecoded.
	decoded bool
	// Were any of the decoded commands locally proposed? When false,
	// AckCommittedEntriesBeforeApplication returns without doing any work.
	anyLocal bool
	// The maximum number of commands that can be applied in a batch.
	// A value of 0 places no limit on the batch size.
	batchSize int32
}
   112  
   113  // MakeTask creates a new task with the provided state machine and decoder.
   114  func MakeTask(sm StateMachine, dec Decoder) Task {
   115  	return Task{sm: sm, dec: dec}
   116  }
   117  
   118  // Decode decodes the committed raft entries into commands and prepared for the
   119  // commands to be applied to the replicated state machine.
   120  func (t *Task) Decode(ctx context.Context, committedEntries []raftpb.Entry) error {
   121  	var err error
   122  	t.anyLocal, err = t.dec.DecodeAndBind(ctx, committedEntries)
   123  	t.decoded = true
   124  	return err
   125  }
   126  
   127  func (t *Task) assertDecoded() {
   128  	if !t.decoded {
   129  		panic("Task.Decode not called yet")
   130  	}
   131  }
   132  
   133  // AckCommittedEntriesBeforeApplication attempts to acknowledge the success of
   134  // raft entries that have been durably committed to the raft log but have not
   135  // yet been applied to the proposer replica's replicated state machine.
   136  //
   137  // This is safe because a proposal through raft can be known to have succeeded
   138  // as soon as it is durably replicated to a quorum of replicas (i.e. has
   139  // committed in the raft log). The proposal does not need to wait for the
   140  // effects of the proposal to be applied in order to know whether its changes
   141  // will succeed or fail. This is because the raft log is the provider of
   142  // atomicity and durability for replicated writes, not (ignoring log
   143  // truncation) the replicated state machine itself.
   144  //
   145  // However, there are a few complications to acknowledging the success of a
   146  // proposal at this stage:
   147  //
   148  //  1. Committing an entry in the raft log and having the command in that entry
   149  //     succeed are similar but not equivalent concepts. Even if the entry succeeds
   150  //     in achieving durability by replicating to a quorum of replicas, its command
   151  //     may still be rejected "beneath raft". This means that a (deterministic)
   152  //     check after replication decides that the command will not be applied to the
   153  //     replicated state machine. In that case, the client waiting on the result of
   154  //     the command should not be informed of its success. Luckily, this check is
   155  //     cheap to perform so we can do it here and when applying the command.
   156  //
   157  //     Determining whether the command will succeed or be rejected before applying
   158  //     it for real is accomplished using an ephemeral batch. Commands are staged in
   159  //     the ephemeral batch to acquire CheckedCommands, which can then be acknowledged
   160  //     immediately even though the ephemeral batch itself cannot be used to update
   161  //     the durable state machine. Once the rejection status of each command is
   162  //     determined, any successful commands that permit acknowledgement before
   163  //     application (see CanAckBeforeApplication) are acknowledged. The ephemeral
   164  //     batch is then thrown away.
   165  //
   166  //  2. Some commands perform non-trivial work such as updating Replica configuration
   167  //     state or performing Range splits. In those cases, it's likely that the client
   168  //     is interested in not only knowing whether it has succeeded in sequencing the
   169  //     change in the raft log, but also in knowing when the change has gone into
   170  //     effect. There's currently no exposed hook to ask for an acknowledgement only
   171  //     after a command has been applied, so for simplicity the current implementation
   172  //     only ever acks transactional writes before they have gone into effect. All
   173  //     other commands wait until they have been applied to ack their client.
   174  //
   175  //  3. Even though we can determine whether a command has succeeded without applying
   176  //     it, the effect of the command will not be visible to conflicting commands until
   177  //     it is applied. Because of this, the client can be informed of the success of
   178  //     a write at this point, but we cannot release that write's latches until the
   179  //     write has applied. See ProposalData.signalProposalResult/finishApplication.
   180  //
   181  //  4. etcd/raft may provided a series of CommittedEntries in a Ready struct that
   182  //     haven't actually been appended to our own log. This is most common in single
   183  //     node replication groups, but it is possible when a follower in a multi-node
   184  //     replication group is catching up after falling behind. In the first case,
   185  //     the entries are not yet committed so acknowledging them would be a lie. In
   186  //     the second case, the entries are committed so we could acknowledge them at
   187  //     this point, but doing so seems risky. To avoid complications in either case,
   188  //     the method takes a maxIndex parameter that limits the indexes that it will
   189  //     acknowledge. Typically, callers will supply the highest index that they have
   190  //     durably written to their raft log for this upper bound.
   191  //
   192  func (t *Task) AckCommittedEntriesBeforeApplication(ctx context.Context, maxIndex uint64) error {
   193  	t.assertDecoded()
   194  	if !t.anyLocal {
   195  		return nil // fast-path
   196  	}
   197  
   198  	// Create a new ephemeral application batch. All we're interested in is
   199  	// whether commands will be rejected or not when staged in a real batch.
   200  	batch := t.sm.NewBatch(true /* ephemeral */)
   201  	defer batch.Close()
   202  
   203  	iter := t.dec.NewCommandIter()
   204  	defer iter.Close()
   205  
   206  	// Collect a batch of trivial commands from the applier. Stop at the first
   207  	// non-trivial command or at the first command with an index above maxIndex.
   208  	batchIter := takeWhileCmdIter(iter, func(cmd Command) bool {
   209  		if cmd.Index() > maxIndex {
   210  			return false
   211  		}
   212  		return cmd.IsTrivial()
   213  	})
   214  
   215  	// Stage the commands in the (ephemeral) batch.
   216  	stagedIter, err := mapCmdIter(batchIter, batch.Stage)
   217  	if err != nil {
   218  		return err
   219  	}
   220  
   221  	// Acknowledge any locally-proposed commands that succeeded in being staged
   222  	// in the batch and can be acknowledged before they are actually applied.
   223  	// Don't acknowledge rejected proposals early because the StateMachine may
   224  	// want to retry the command instead of returning the error to the client.
   225  	return forEachCheckedCmdIter(stagedIter, func(cmd CheckedCommand) error {
   226  		if !cmd.Rejected() && cmd.IsLocal() && cmd.CanAckBeforeApplication() {
   227  			return cmd.AckSuccess()
   228  		}
   229  		return nil
   230  	})
   231  }
   232  
   233  // SetMaxBatchSize sets the maximum application batch size. If 0, no limit
   234  // will be placed on the number of commands that can be applied in a batch.
   235  func (t *Task) SetMaxBatchSize(size int) {
   236  	t.batchSize = int32(size)
   237  }
   238  
   239  // ApplyCommittedEntries applies raft entries that have been committed to the
   240  // raft log but have not yet been applied to the replicated state machine.
   241  func (t *Task) ApplyCommittedEntries(ctx context.Context) error {
   242  	t.assertDecoded()
   243  
   244  	iter := t.dec.NewCommandIter()
   245  	defer iter.Close()
   246  	for iter.Valid() {
   247  		if err := t.applyOneBatch(ctx, iter); err != nil {
   248  			return err
   249  		}
   250  	}
   251  	return nil
   252  }
   253  
   254  // applyOneBatch consumes a batch-worth of commands from the provided iter and
   255  // applies them atomically to the StateMachine. A batch will contain either:
   256  // a) one or more trivial commands
   257  // b) exactly one non-trivial command
   258  func (t *Task) applyOneBatch(ctx context.Context, iter CommandIterator) error {
   259  	// Create a new application batch.
   260  	batch := t.sm.NewBatch(false /* ephemeral */)
   261  	defer batch.Close()
   262  
   263  	// Consume a batch-worth of commands.
   264  	pol := trivialPolicy{maxCount: t.batchSize}
   265  	batchIter := takeWhileCmdIter(iter, func(cmd Command) bool {
   266  		return pol.maybeAdd(cmd.IsTrivial())
   267  	})
   268  
   269  	// Stage each command in the batch.
   270  	stagedIter, err := mapCmdIter(batchIter, batch.Stage)
   271  	if err != nil {
   272  		return err
   273  	}
   274  
   275  	// Apply the persistent state transitions to the state machine.
   276  	if err := batch.ApplyToStateMachine(ctx); err != nil {
   277  		return err
   278  	}
   279  
   280  	// Apply the side-effects of each command to the state machine.
   281  	appliedIter, err := mapCheckedCmdIter(stagedIter, t.sm.ApplySideEffects)
   282  	if err != nil {
   283  		return err
   284  	}
   285  
   286  	// Finish and acknowledge the outcome of each command.
   287  	return forEachAppliedCmdIter(ctx, appliedIter, AppliedCommand.FinishAndAckOutcome)
   288  }
   289  
   290  // trivialPolicy encodes a batching policy that allows a batch to consist of
   291  // either one or more trivial commands or exactly one non-trivial command.
   292  type trivialPolicy struct {
   293  	maxCount int32
   294  
   295  	trivialCount    int32
   296  	nonTrivialCount int32
   297  }
   298  
   299  // maybeAdd returns whether a command with the specified triviality should be
   300  // added to a batch given the batching policy. If the method returns true, the
   301  // command is considered to have been added.
   302  func (p *trivialPolicy) maybeAdd(trivial bool) bool {
   303  	if !trivial {
   304  		if p.trivialCount+p.nonTrivialCount > 0 {
   305  			return false
   306  		}
   307  		p.nonTrivialCount++
   308  		return true
   309  	}
   310  	if p.nonTrivialCount > 0 {
   311  		return false
   312  	}
   313  	if p.maxCount > 0 && p.maxCount == p.trivialCount {
   314  		return false
   315  	}
   316  	p.trivialCount++
   317  	return true
   318  }
   319  
   320  // Close ends the task, releasing any resources that it holds and resetting the
   321  // Decoder. The Task cannot be used again after being closed.
   322  func (t *Task) Close() {
   323  	t.dec.Reset()
   324  	*t = Task{}
   325  }