github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/replica_raft.go (about) 1 // Copyright 2019 The Cockroach Authors. 2 // 3 // Use of this software is governed by the Business Source License 4 // included in the file licenses/BSL.txt. 5 // 6 // As of the Change Date specified in that file, in accordance with 7 // the Business Source License, use of this software will be governed 8 // by the Apache License, Version 2.0, included in the file 9 // licenses/APL.txt. 10 11 package kvserver 12 13 import ( 14 "context" 15 "fmt" 16 "math/rand" 17 "sort" 18 "time" 19 20 "github.com/cockroachdb/cockroach/pkg/keys" 21 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/apply" 22 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency" 23 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverbase" 24 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/kvserverpb" 25 "github.com/cockroachdb/cockroach/pkg/kv/kvserver/stateloader" 26 "github.com/cockroachdb/cockroach/pkg/roachpb" 27 "github.com/cockroachdb/cockroach/pkg/storage" 28 "github.com/cockroachdb/cockroach/pkg/util" 29 "github.com/cockroachdb/cockroach/pkg/util/encoding" 30 "github.com/cockroachdb/cockroach/pkg/util/hlc" 31 "github.com/cockroachdb/cockroach/pkg/util/humanizeutil" 32 "github.com/cockroachdb/cockroach/pkg/util/log" 33 "github.com/cockroachdb/cockroach/pkg/util/protoutil" 34 "github.com/cockroachdb/cockroach/pkg/util/timeutil" 35 "github.com/cockroachdb/cockroach/pkg/util/tracing" 36 "github.com/cockroachdb/cockroach/pkg/util/uuid" 37 "github.com/cockroachdb/errors" 38 "go.etcd.io/etcd/raft" 39 "go.etcd.io/etcd/raft/raftpb" 40 "go.etcd.io/etcd/raft/tracker" 41 ) 42 43 func makeIDKey() kvserverbase.CmdIDKey { 44 idKeyBuf := make([]byte, 0, raftCommandIDLen) 45 idKeyBuf = encoding.EncodeUint64Ascending(idKeyBuf, uint64(rand.Int63())) 46 return kvserverbase.CmdIDKey(idKeyBuf) 47 } 48 49 // evalAndPropose prepares the necessary pending command struct and initializes 50 // a client command ID if one hasn't been. A verified lease is supplied as a 51 // parameter if the command requires a lease; nil otherwise. It then evaluates 52 // the command and proposes it to Raft on success. 53 // 54 // The method accepts a concurrency guard, which it assumes responsibility for 55 // if it succeeds in proposing a command into Raft. If the method does not 56 // return an error, the guard is guaranteed to be eventually freed and the 57 // caller should relinquish all ownership of it. If it does return an error, the 58 // caller retains full ownership over the guard. 59 // 60 // Return values: 61 // - a channel which receives a response or error upon application 62 // - a closure used to attempt to abandon the command. When called, it unbinds 63 // the command's context from its Raft proposal. The client is then free to 64 // terminate execution, although it is given no guarantee that the proposal 65 // won't still go on to commit and apply at some later time. 66 // - the MaxLeaseIndex of the resulting proposal, if any. 67 // - any error obtained during the creation or proposal of the command, in 68 // which case the other returned values are zero. 
69 func (r *Replica) evalAndPropose(
70 ctx context.Context, ba *roachpb.BatchRequest, g *concurrency.Guard, lease *roachpb.Lease,
71 ) (chan proposalResult, func(), int64, *roachpb.Error) {
72 idKey := makeIDKey()
73 proposal, pErr := r.requestToProposal(ctx, idKey, ba, g.LatchSpans())
74 log.Event(proposal.ctx, "evaluated request")
75
76 // If the request hit a server-side concurrency retry error, immediately
77 // propagate the error. Don't assume ownership of the concurrency guard.
78 if isConcurrencyRetryError(pErr) {
79 return nil, nil, 0, pErr
80 }
81
82 // Attach the endCmds to the proposal and assume responsibility for
83 // releasing the concurrency guard if the proposal makes it to Raft.
84 proposal.ec = endCmds{repl: r, g: g}
85
86 // Pull out proposal channel to return. proposal.doneCh may be set to
87 // nil if it is signaled in this function.
88 proposalCh := proposal.doneCh
89
90 // There are two cases where request evaluation does not lead to a Raft
91 // proposal:
92 // 1. proposal.command == nil indicates that the evaluation was a no-op
93 // and that no Raft command needs to be proposed.
94 // 2. pErr != nil corresponds to a failed proposal - the command resulted
95 // in an error.
96 if proposal.command == nil {
97 intents := proposal.Local.DetachEncounteredIntents()
98 endTxns := proposal.Local.DetachEndTxns(pErr != nil /* alwaysOnly */)
99 r.handleReadWriteLocalEvalResult(ctx, *proposal.Local)
100
101 pr := proposalResult{
102 Reply: proposal.Local.Reply,
103 Err: pErr,
104 EncounteredIntents: intents,
105 EndTxns: endTxns,
106 }
107 proposal.finishApplication(ctx, pr)
108 return proposalCh, func() {}, 0, nil
109 }
110
111 // If the request requested that Raft consensus be performed asynchronously,
112 // return a proposal result immediately on the proposal's done channel.
113 // The channel's capacity will be large enough to accommodate this.
114 if ba.AsyncConsensus {
115 if ets := proposal.Local.DetachEndTxns(false /* alwaysOnly */); len(ets) != 0 {
116 // Disallow async consensus for commands with EndTxnIntents because
117 // any !Always EndTxnIntent can't be cleaned up until after the
118 // command succeeds.
119 return nil, nil, 0, roachpb.NewErrorf("cannot perform consensus asynchronously for "+
120 "proposal with EndTxnIntents=%v; %v", ets, ba)
121 }
122
123 // Fork the proposal's context span so that the proposal's context
124 // can outlive the original proposer's context.
125 proposal.ctx, proposal.sp = tracing.ForkCtxSpan(ctx, "async consensus")
126
127 // Signal the proposal's response channel immediately.
128 reply := *proposal.Local.Reply
129 reply.Responses = append([]roachpb.ResponseUnion(nil), reply.Responses...)
130 pr := proposalResult{
131 Reply: &reply,
132 EncounteredIntents: proposal.Local.DetachEncounteredIntents(),
133 }
134 proposal.signalProposalResult(pr)
135
136 // Continue with proposal...
137 }
138
139 // Attach information about the proposer to the command.
140 proposal.command.ProposerLeaseSequence = lease.Sequence
141
142 // Once a command is written to the raft log, it must be loaded into memory
143 // and replayed on all replicas. If a command is too big, stop it here. If
144 // the command is not too big, acquire an appropriate amount of quota from
145 // the replica's proposal quota pool.
146 //
147 // TODO(tschottdorf): blocking a proposal here will leave it dangling in the
148 // closed timestamp tracker for an extended period of time, which will in turn
149 // prevent the node-wide closed timestamp from making progress.
This is quite 150 // unfortunate; we should hoist the quota pool before the reference with the 151 // closed timestamp tracker is acquired. This is better anyway; right now many 152 // commands can evaluate but then be blocked on quota, which has worse memory 153 // behavior. 154 quotaSize := uint64(proposal.command.Size()) 155 if maxSize := uint64(MaxCommandSize.Get(&r.store.cfg.Settings.SV)); quotaSize > maxSize { 156 return nil, nil, 0, roachpb.NewError(errors.Errorf( 157 "command is too large: %d bytes (max: %d)", quotaSize, maxSize, 158 )) 159 } 160 var err error 161 proposal.quotaAlloc, err = r.maybeAcquireProposalQuota(ctx, quotaSize) 162 if err != nil { 163 return nil, nil, 0, roachpb.NewError(err) 164 } 165 // Make sure we clean up the proposal if we fail to insert it into the 166 // proposal buffer successfully. This ensures that we always release any 167 // quota that we acquire. 168 defer func() { 169 if pErr != nil { 170 proposal.releaseQuota() 171 } 172 }() 173 174 if filter := r.store.TestingKnobs().TestingProposalFilter; filter != nil { 175 filterArgs := kvserverbase.ProposalFilterArgs{ 176 Ctx: ctx, 177 Cmd: *proposal.command, 178 CmdID: idKey, 179 Req: *ba, 180 } 181 if pErr := filter(filterArgs); pErr != nil { 182 return nil, nil, 0, pErr 183 } 184 } 185 186 maxLeaseIndex, pErr := r.propose(ctx, proposal) 187 if pErr != nil { 188 return nil, nil, 0, pErr 189 } 190 // Abandoning a proposal unbinds its context so that the proposal's client 191 // is free to terminate execution. However, it does nothing to try to 192 // prevent the command from succeeding. In particular, endCmds will still be 193 // invoked when the command is applied. There are a handful of cases where 194 // the command may not be applied (or even processed): the process crashes 195 // or the local replica is removed from the range. 196 abandon := func() { 197 // The proposal may or may not be in the Replica's proposals map. 198 // Instead of trying to look it up, simply modify the captured object 199 // directly. The raftMu must be locked to modify the context of a 200 // proposal because as soon as we propose a command to Raft, ownership 201 // passes to the "below Raft" machinery. 202 r.raftMu.Lock() 203 defer r.raftMu.Unlock() 204 r.mu.Lock() 205 defer r.mu.Unlock() 206 // TODO(radu): Should this context be created via tracer.ForkCtxSpan? 207 // We'd need to make sure the span is finished eventually. 208 proposal.ctx = r.AnnotateCtx(context.TODO()) 209 } 210 return proposalCh, abandon, maxLeaseIndex, nil 211 } 212 213 // propose encodes a command, starts tracking it, and proposes it to raft. The 214 // method is also responsible for assigning the command its maximum lease index. 215 // 216 // The method hands ownership of the command over to the Raft machinery. After 217 // the method returns, all access to the command must be performed while holding 218 // Replica.mu and Replica.raftMu. If a non-nil error is returned the 219 // MaxLeaseIndex is not updated. 220 func (r *Replica) propose(ctx context.Context, p *ProposalData) (index int64, pErr *roachpb.Error) { 221 222 // If an error occurs reset the command's MaxLeaseIndex to its initial value. 223 // Failure to propose will propagate to the client. An invariant of this 224 // package is that proposals which are finished carry a raft command with a 225 // MaxLeaseIndex equal to the proposal command's max lease index. 
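// To make the restore-on-error pattern below concrete: a defer's arguments are
// evaluated at the point of the defer statement, so `prev` snapshots the
// MaxLeaseIndex the command entered propose with, not the value it has when the
// function returns. A self-contained sketch of the same Go idiom, with
// hypothetical names (not code from this package, and needing the errors
// import if extracted):
//
//	func sketchRestoreOnError() (err error) {
//		x := uint64(7)
//		defer func(prev uint64) { // prev is evaluated right here, so prev == 7
//			if err != nil {
//				x = prev // restore the captured value only on error
//			}
//		}(x)
//		x = 0 // mutated, just as propose zeroes MaxLeaseIndex
//		return errors.New("proposal failed") // deferred func sees err != nil and restores x to 7
//	}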
226 defer func(prev uint64) {
227 if pErr != nil {
228 p.command.MaxLeaseIndex = prev
229 }
230 }(p.command.MaxLeaseIndex)
231
232 // Make sure the maximum lease index is unset. This field will be set in
233 // propBuf.Insert and its encoded bytes will be appended to the encoding
234 // buffer as a RaftCommandFooter.
235 p.command.MaxLeaseIndex = 0
236
237 // Determine the encoding style for the Raft command.
238 prefix := true
239 version := raftVersionStandard
240 if crt := p.command.ReplicatedEvalResult.ChangeReplicas; crt != nil {
241 // EndTxnRequest with a ChangeReplicasTrigger is special because Raft
242 // needs to understand it; it cannot simply be an opaque command. To
243 // permit this, the command is proposed by the proposal buffer using
244 // ProposeConfChange. For that reason, we also don't need a Raft command
245 // prefix because the command ID is stored in a field in
246 // raft.ConfChange.
247 log.Infof(p.ctx, "proposing %s", crt)
248 prefix = false
249
250 // Ensure that we aren't trying to remove ourselves from the range without
251 // having previously given up our lease, since the range won't be able
252 // to make progress while the lease is owned by a removed replica (and
253 // leases can stay in such a state for a very long time when using epoch-
254 // based range leases). This shouldn't happen often, but has been seen
255 // before (#12591).
256 //
257 // Note that due to atomic replication changes, when a removal is initiated,
258 // the replica remains in the descriptor, but as VOTER_{OUTGOING,DEMOTING}.
259 // We want to block it from getting into that state in the first place,
260 // since there's no stopping the actual removal/demotion once it's there.
261 // The Removed() field contains these replicas when this first
262 // transition is initiated, so its use here is copacetic.
263 replID := r.ReplicaID()
264 for _, rDesc := range crt.Removed() {
265 if rDesc.ReplicaID == replID {
266 msg := fmt.Sprintf("received invalid ChangeReplicasTrigger %s to remove self (leaseholder)", crt)
267 log.Errorf(p.ctx, "%v", msg)
268 return 0, roachpb.NewErrorf("%s: %s", r, msg)
269 }
270 }
271
272 } else if p.command.ReplicatedEvalResult.AddSSTable != nil {
273 log.VEvent(p.ctx, 4, "sideloadable proposal detected")
274 version = raftVersionSideloaded
275 r.store.metrics.AddSSTableProposals.Inc(1)
276
277 if p.command.ReplicatedEvalResult.AddSSTable.Data == nil {
278 return 0, roachpb.NewErrorf("cannot sideload empty SSTable")
279 }
280 } else if log.V(4) {
281 log.Infof(p.ctx, "proposing command %x: %s", p.idKey, p.Request.Summary())
282 }
283
284 // Create encoding buffer.
285 preLen := 0
286 if prefix {
287 preLen = raftCommandPrefixLen
288 }
289 cmdLen := p.command.Size()
290 cap := preLen + cmdLen + kvserverpb.MaxRaftCommandFooterSize()
291 data := make([]byte, preLen, cap)
292 // Encode prefix with command ID, if necessary.
293 if prefix {
294 encodeRaftCommandPrefix(data, version, p.idKey)
295 }
296 // Encode body of command.
297 data = data[:preLen+cmdLen]
298 if _, err := protoutil.MarshalTo(p.command, data[preLen:]); err != nil {
299 return 0, roachpb.NewError(err)
300 }
301
302 // Too verbose even for verbose logging, so manually enable if you want to
303 // debug proposal sizes.
304 if false {
305 log.Infof(p.ctx, `%s: proposal: %d
306 RaftCommand.ReplicatedEvalResult: %d
307 RaftCommand.ReplicatedEvalResult.Delta: %d
308 RaftCommand.WriteBatch: %d
309 `, p.Request.Summary(), cmdLen,
310 p.command.ReplicatedEvalResult.Size(),
311 p.command.ReplicatedEvalResult.Delta.Size(),
312 p.command.WriteBatch.Size(),
313 )
314 }
315
316 // Log an event if this is a large proposal. These are more likely to cause
317 // blips or worse, and it's good to be able to pick them from traces.
318 //
319 // TODO(tschottdorf): can we mark them so lightstep can group them?
320 const largeProposalEventThresholdBytes = 2 << 19 // 1mb
321 if cmdLen > largeProposalEventThresholdBytes {
322 log.Eventf(p.ctx, "proposal is large: %s", humanizeutil.IBytes(int64(cmdLen)))
323 }
324
325 // Insert into the proposal buffer, which passes the command to Raft to be
326 // proposed. The proposal buffer assigns the command a maximum lease index
327 // when it sequences it.
328 //
329 // NB: we must not hold r.mu while using the proposal buffer, see comment
330 // on the field.
331 maxLeaseIndex, err := r.mu.proposalBuf.Insert(p, data)
332 if err != nil {
333 return 0, roachpb.NewError(err)
334 }
335 return int64(maxLeaseIndex), nil
336 }
337
338 func (r *Replica) numPendingProposalsRLocked() int {
339 return len(r.mu.proposals) + r.mu.proposalBuf.Len()
340 }
341
342 // hasPendingProposalsRLocked is part of the quiescer interface.
343 // It returns true if this node has any outstanding proposals. A client might be
344 // waiting for the outcome of these proposals, so we definitely don't want to
345 // quiesce while such proposals are in-flight.
346 //
347 // Note that this method says nothing about other nodes' outstanding proposals:
348 // if this node is the current leaseholder, previous leaseholders might have
349 // proposals on which they're waiting. If this node is not the current
350 // leaseholder, then obviously whoever is the current leaseholder might have
351 // pending proposals. This method is called in two places: on the current
352 // leaseholder when deciding whether the leaseholder should attempt to quiesce
353 // the range, and then on every follower to confirm that the range can indeed be
354 // quiesced.
355 func (r *Replica) hasPendingProposalsRLocked() bool {
356 return r.numPendingProposalsRLocked() > 0
357 }
358
359 // hasPendingProposalQuotaRLocked is part of the quiescer interface. It returns
360 // true if there are any commands that haven't completed replicating that are
361 // tracked by this node's quota pool (i.e. commands that haven't been acked by
362 // all live replicas).
363 // We can't quiesce while there's outstanding quota because the respective quota
364 // would not be released while quiesced, and it might prevent the range from
365 // unquiescing (leading to deadlock). See #46699.
366 func (r *Replica) hasPendingProposalQuotaRLocked() bool {
367 if r.mu.proposalQuota == nil {
368 return true
369 }
370 return !r.mu.proposalQuota.Full()
371 }
372
373 var errRemoved = errors.New("replica removed")
374
375 // stepRaftGroup calls Step on the replica's RawNode with the provided request's
376 // message. Before doing so, it ensures that the replica is unquiesced and ready
377 // to handle the request.
378 func (r *Replica) stepRaftGroup(req *RaftMessageRequest) error {
379 // We're processing an incoming raft message (from a batch that may
380 // include MsgVotes), so don't campaign if we wake up our raft
381 // group.
382 return r.withRaftGroup(false, func(raftGroup *raft.RawNode) (bool, error) {
383 // We're processing a message from another replica which means that the
384 // other replica is not quiesced, so we don't need to wake the leader.
385 // Note that we avoid campaigning when receiving raft messages, because
386 // we expect the originator to campaign instead.
387 r.unquiesceWithOptionsLocked(false /* campaignOnWake */)
388 r.mu.lastUpdateTimes.update(req.FromReplica.ReplicaID, timeutil.Now())
389 err := raftGroup.Step(req.Message)
390 if errors.Is(err, raft.ErrProposalDropped) {
391 // A proposal was forwarded to this replica but we couldn't propose it.
392 // Swallow the error since we don't have an effective way of signaling
393 // this to the sender.
394 // TODO(bdarnell): Handle ErrProposalDropped better.
395 // https://github.com/cockroachdb/cockroach/issues/21849
396 err = nil
397 }
398 return false /* unquiesceAndWakeLeader */, err
399 })
400 }
401
402 type handleRaftReadyStats struct {
403 applyCommittedEntriesStats
404 }
405
406 // noSnap can be passed to handleRaftReady when no snapshot should be processed.
407 var noSnap IncomingSnapshot
408
409 // handleRaftReady processes a raft.Ready containing entries and messages that
410 // are ready to read, be saved to stable storage, committed, or sent to other
411 // peers. It takes a non-empty IncomingSnapshot to indicate that it is
412 // about to process a snapshot.
413 //
414 // The returned string is non-empty whenever an error is returned to give a
415 // non-sensitive cue as to what happened.
416 func (r *Replica) handleRaftReady(
417 ctx context.Context, inSnap IncomingSnapshot,
418 ) (handleRaftReadyStats, string, error) {
419 defer func(start time.Time) {
420 elapsed := timeutil.Since(start)
421 r.store.metrics.RaftHandleReadyLatency.RecordValue(elapsed.Nanoseconds())
422 }(timeutil.Now())
423 r.raftMu.Lock()
424 defer r.raftMu.Unlock()
425 return r.handleRaftReadyRaftMuLocked(ctx, inSnap)
426 }
427
428 // handleRaftReadyRaftMuLocked is the same as handleRaftReady but requires that the
429 // replica's raftMu be held.
430 //
431 // The returned string is non-empty whenever an error is returned to give a
432 // non-sensitive cue as to what happened.
433 func (r *Replica) handleRaftReadyRaftMuLocked(
434 ctx context.Context, inSnap IncomingSnapshot,
435 ) (handleRaftReadyStats, string, error) {
436 var stats handleRaftReadyStats
437
438 var hasReady bool
439 var rd raft.Ready
440 r.mu.Lock()
441 lastIndex := r.mu.lastIndex // used for append below
442 lastTerm := r.mu.lastTerm
443 raftLogSize := r.mu.raftLogSize
444 leaderID := r.mu.leaderID
445 lastLeaderID := leaderID
446 err := r.withRaftGroupLocked(true, func(raftGroup *raft.RawNode) (bool, error) {
447 numFlushed, err := r.mu.proposalBuf.FlushLockedWithRaftGroup(raftGroup)
448 if err != nil {
449 return false, err
450 }
451 if hasReady = raftGroup.HasReady(); hasReady {
452 rd = raftGroup.Ready()
453 }
454 // We unquiesce if we have a Ready (= there's work to do). We also have
455 // to unquiesce if we just flushed some proposals but there isn't a
456 // Ready, which can happen if the proposals got dropped (raft does this
457 // if it doesn't know who the leader is). And, for extra defense in depth,
458 // we also unquiesce if there are outstanding proposals.
459 //
460 // NB: if we had the invariant that the group can only be in quiesced
461 // state if it knows the leader (state.Lead) AND we knew that raft would
462 // never give us an empty ready here (i.e.
the only reason to drop a 463 // proposal is not knowing the leader) then numFlushed would not be 464 // necessary. The latter is likely true but we don't want to rely on 465 // it. The former is maybe true, but there's no easy way to enforce it. 466 unquiesceAndWakeLeader := hasReady || numFlushed > 0 || len(r.mu.proposals) > 0 467 return unquiesceAndWakeLeader, nil 468 }) 469 r.mu.Unlock() 470 if errors.Is(err, errRemoved) { 471 // If we've been removed then just return. 472 return stats, "", nil 473 } else if err != nil { 474 const expl = "while checking raft group for Ready" 475 return stats, expl, errors.Wrap(err, expl) 476 } 477 if !hasReady { 478 // We must update the proposal quota even if we don't have a ready. 479 // Consider the case when our quota is of size 1 and two out of three 480 // replicas have committed one log entry while the third is lagging 481 // behind. When the third replica finally does catch up and sends 482 // along a MsgAppResp, since the entry is already committed on the 483 // leader replica, no Ready is emitted. But given that the third 484 // replica has caught up, we can release 485 // some quota back to the pool. 486 r.updateProposalQuotaRaftMuLocked(ctx, lastLeaderID) 487 return stats, "", nil 488 } 489 490 logRaftReady(ctx, rd) 491 492 refreshReason := noReason 493 if rd.SoftState != nil && leaderID != roachpb.ReplicaID(rd.SoftState.Lead) { 494 // Refresh pending commands if the Raft leader has changed. This is usually 495 // the first indication we have of a new leader on a restarted node. 496 // 497 // TODO(peter): Re-proposing commands when SoftState.Lead changes can lead 498 // to wasteful multiple-reproposals when we later see an empty Raft command 499 // indicating a newly elected leader or a conf change. Replay protection 500 // prevents any corruption, so the waste is only a performance issue. 501 if log.V(3) { 502 log.Infof(ctx, "raft leader changed: %d -> %d", leaderID, rd.SoftState.Lead) 503 } 504 if !r.store.TestingKnobs().DisableRefreshReasonNewLeader { 505 refreshReason = reasonNewLeader 506 } 507 leaderID = roachpb.ReplicaID(rd.SoftState.Lead) 508 } 509 510 if !raft.IsEmptySnap(rd.Snapshot) { 511 snapUUID, err := uuid.FromBytes(rd.Snapshot.Data) 512 if err != nil { 513 const expl = "invalid snapshot id" 514 return stats, expl, errors.Wrap(err, expl) 515 } 516 if inSnap.SnapUUID == (uuid.UUID{}) { 517 log.Fatalf(ctx, "programming error: a snapshot application was attempted outside of the streaming snapshot codepath") 518 } 519 if snapUUID != inSnap.SnapUUID { 520 log.Fatalf(ctx, "incoming snapshot id doesn't match raft snapshot id: %s != %s", snapUUID, inSnap.SnapUUID) 521 } 522 523 // Applying this snapshot may require us to subsume one or more of our right 524 // neighbors. This occurs if this replica is informed about the merges via a 525 // Raft snapshot instead of a MsgApp containing the merge commits, e.g., 526 // because it went offline before the merge commits applied and did not come 527 // back online until after the merge commits were truncated away. 
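// As a hypothetical illustration of the subsumption case above: suppose ranges
// [a,b) and [b,c) merged while this store's replica of [a,b) was partitioned
// away. The Raft snapshot it eventually receives describes the merged range
// [a,c), so the store's stale replica of [b,c) must be subsumed before the
// snapshot can be applied; that is what the merge lock acquired below
// coordinates.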
528 subsumedRepls, releaseMergeLock := r.maybeAcquireSnapshotMergeLock(ctx, inSnap) 529 defer releaseMergeLock() 530 531 if err := r.applySnapshot(ctx, inSnap, rd.Snapshot, rd.HardState, subsumedRepls); err != nil { 532 const expl = "while applying snapshot" 533 return stats, expl, errors.Wrap(err, expl) 534 } 535 536 // r.mu.lastIndex, r.mu.lastTerm and r.mu.raftLogSize were updated in 537 // applySnapshot, but we also want to make sure we reflect these changes in 538 // the local variables we're tracking here. 539 r.mu.RLock() 540 lastIndex = r.mu.lastIndex 541 lastTerm = r.mu.lastTerm 542 raftLogSize = r.mu.raftLogSize 543 r.mu.RUnlock() 544 545 // We refresh pending commands after applying a snapshot because this 546 // replica may have been temporarily partitioned from the Raft group and 547 // missed leadership changes that occurred. Suppose node A is the leader, 548 // and then node C gets partitioned away from the others. Leadership passes 549 // back and forth between A and B during the partition, but when the 550 // partition is healed node A is leader again. 551 if !r.store.TestingKnobs().DisableRefreshReasonSnapshotApplied && 552 refreshReason == noReason { 553 refreshReason = reasonSnapshotApplied 554 } 555 } 556 557 // If the ready struct includes entries that have been committed, these 558 // entries will be applied to the Replica's replicated state machine down 559 // below, after appending new entries to the raft log and sending messages 560 // to peers. However, the process of appending new entries to the raft log 561 // and then applying committed entries to the state machine can take some 562 // time - and these entries are already durably committed. If they have 563 // clients waiting on them, we'd like to acknowledge their success as soon 564 // as possible. To facilitate this, we take a quick pass over the committed 565 // entries and acknowledge as many as we can trivially prove will not be 566 // rejected beneath raft. 567 // 568 // Note that we only acknowledge up to the current last index in the Raft 569 // log. The CommittedEntries slice may contain entries that are also in the 570 // Entries slice (to be appended in this ready pass), and we don't want to 571 // acknowledge them until they are durably in our local Raft log. This is 572 // most common in single node replication groups, but it is possible when a 573 // follower in a multi-node replication group is catching up after falling 574 // behind. In the first case, the entries are not yet committed so 575 // acknowledging them would be a lie. In the second case, the entries are 576 // committed so we could acknowledge them at this point, but doing so seems 577 // risky. To avoid complications in either case, we pass lastIndex for the 578 // maxIndex argument to AckCommittedEntriesBeforeApplication. 
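// To make the lastIndex cap above concrete, consider a hypothetical state in
// which lastIndex == 10 from a previous, already-synced append, while this
// Ready's CommittedEntries cover indexes 9, 10 and 11 and index 11 is also
// present in rd.Entries (it still needs to be appended in this pass). Entries
// 9 and 10 can be acknowledged to waiting clients before application because
// they are already durable in the local log; entry 11 is not acknowledged
// early and has to wait until this Ready's append has been synced and the
// entry has been applied.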
579 sm := r.getStateMachine() 580 dec := r.getDecoder() 581 appTask := apply.MakeTask(sm, dec) 582 appTask.SetMaxBatchSize(r.store.TestingKnobs().MaxApplicationBatchSize) 583 defer appTask.Close() 584 if err := appTask.Decode(ctx, rd.CommittedEntries); err != nil { 585 return stats, getNonDeterministicFailureExplanation(err), err 586 } 587 if err := appTask.AckCommittedEntriesBeforeApplication(ctx, lastIndex); err != nil { 588 return stats, getNonDeterministicFailureExplanation(err), err 589 } 590 591 // Separate the MsgApp messages from all other Raft message types so that we 592 // can take advantage of the optimization discussed in the Raft thesis under 593 // the section: `10.2.1 Writing to the leader’s disk in parallel`. The 594 // optimization suggests that instead of a leader writing new log entries to 595 // disk before replicating them to its followers, the leader can instead 596 // write the entries to disk in parallel with replicating to its followers 597 // and them writing to their disks. 598 // 599 // Here, we invoke this optimization by: 600 // 1. sending all MsgApps. 601 // 2. syncing all entries and Raft state to disk. 602 // 3. sending all other messages. 603 // 604 // Since this is all handled in handleRaftReadyRaftMuLocked, we're assured 605 // that even though we may sync new entries to disk after sending them in 606 // MsgApps to followers, we'll always have them synced to disk before we 607 // process followers' MsgAppResps for the corresponding entries because 608 // Ready processing is sequential (and because a restart of the leader would 609 // prevent the MsgAppResp from being handled by it). This is important 610 // because it makes sure that the leader always has all of the entries in 611 // the log for its term, which is required in etcd/raft for technical 612 // reasons[1]. 613 // 614 // MsgApps are also used to inform followers of committed entries through 615 // the Commit index that they contain. Due to the optimization described 616 // above, a Commit index may be sent out to a follower before it is 617 // persisted on the leader. This is safe because the Commit index can be 618 // treated as volatile state, as is supported by raft.MustSync[2]. 619 // Additionally, the Commit index can never refer to entries from the 620 // current Ready (due to the MsgAppResp argument above) except in 621 // single-node groups, in which as a result we have to be careful to not 622 // persist a Commit index without the entries its commit index might refer 623 // to (see the HardState update below for details). 624 // 625 // [1]: the Raft thesis states that this can be made safe: 626 // 627 // > The leader may even commit an entry before it has been written to its 628 // > own disk, if a majority of followers have written it to their disks; 629 // > this is still safe. 630 // 631 // [2]: Raft thesis section: `3.8 Persisted state and server restarts`: 632 // 633 // > Other state variables are safe to lose on a restart, as they can all be 634 // > recreated. The most interesting example is the commit index, which can 635 // > safely be reinitialized to zero on a restart. 636 // 637 // Note that this will change when joint quorums are implemented, at which 638 // point we have to introduce coupling between the Commit index and 639 // persisted config changes, and also require some commit indexes to be 640 // durably synced. 
641 // See:
642 // https://github.com/etcd-io/etcd/issues/7625#issuecomment-489232411
643
644 msgApps, otherMsgs := splitMsgApps(rd.Messages)
645 r.traceMessageSends(msgApps, "sending msgApp")
646 r.sendRaftMessages(ctx, msgApps)
647
648 // Use a more efficient write-only batch because we don't need to do any
649 // reads from the batch. Any reads are performed via the "distinct" batch
650 // which passes the reads through to the underlying DB.
651 batch := r.store.Engine().NewWriteOnlyBatch()
652 defer batch.Close()
653
654 // We know that all of the writes from here forward will be to distinct keys.
655 writer := batch.Distinct()
656 prevLastIndex := lastIndex
657 if len(rd.Entries) > 0 {
658 // All of the entries are appended to distinct keys, returning a new
659 // last index.
660 thinEntries, sideLoadedEntriesSize, err := r.maybeSideloadEntriesRaftMuLocked(ctx, rd.Entries)
661 if err != nil {
662 const expl = "during sideloading"
663 return stats, expl, errors.Wrap(err, expl)
664 }
665 raftLogSize += sideLoadedEntriesSize
666 if lastIndex, lastTerm, raftLogSize, err = r.append(
667 ctx, writer, lastIndex, lastTerm, raftLogSize, thinEntries,
668 ); err != nil {
669 const expl = "during append"
670 return stats, expl, errors.Wrap(err, expl)
671 }
672 }
673 if !raft.IsEmptyHardState(rd.HardState) {
674 if !r.IsInitialized() && rd.HardState.Commit != 0 {
675 log.Fatalf(ctx, "setting non-zero HardState.Commit on uninitialized replica %s. HS=%+v", r, rd.HardState)
676 }
677 // NB: Note that without additional safeguards, it's incorrect to write
678 // the HardState before appending rd.Entries. When catching up, a follower
679 // will receive Entries that are immediately Committed in the same
680 // Ready. If we persist the HardState but happen to lose the Entries,
681 // assertions can be tripped.
682 //
683 // We have both in the same batch, so there's no problem. If that ever
684 // changes, we must write and sync the Entries before the HardState.
685 if err := r.raftMu.stateLoader.SetHardState(ctx, writer, rd.HardState); err != nil {
686 const expl = "during setHardState"
687 return stats, expl, errors.Wrap(err, expl)
688 }
689 }
690 writer.Close()
691 // Synchronously commit the batch with the Raft log entries and Raft hard
692 // state as we're promising not to lose this data.
693 //
694 // Note that the data is visible to other goroutines before it is synced to
695 // disk. This is fine. The important constraints are that these syncs happen
696 // before Raft messages are sent and before the call to RawNode.Advance. Our
697 // regular locking is sufficient for this and if other goroutines can see the
698 // data early, that's fine. In particular, snapshots are not a problem (I
699 // think they're the only thing that might access log entries or HardState
700 // from other goroutines). Snapshots do not include either the HardState or
701 // uncommitted log entries, and even if they did include log entries that
702 // were not persisted to disk, it wouldn't be a problem because raft does not
703 // infer that entries are persisted on the node that sends a snapshot.
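// To spell out the failure mode the HardState comment above is guarding
// against, consider a hypothetical follower whose log ends at index 4 and
// which receives a single Ready carrying Entries 5..10 together with
// HardState.Commit == 10. If the HardState were persisted on its own and the
// process crashed before the entries became durable, the replica would restart
// claiming Commit == 10 while its log still ends at index 4, which is exactly
// the kind of inconsistency that trips assertions. Writing both into the
// single batch committed below avoids that window.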
704 commitStart := timeutil.Now() 705 if err := batch.Commit(rd.MustSync && !disableSyncRaftLog.Get(&r.store.cfg.Settings.SV)); err != nil { 706 const expl = "while committing batch" 707 return stats, expl, errors.Wrap(err, expl) 708 } 709 if rd.MustSync { 710 elapsed := timeutil.Since(commitStart) 711 r.store.metrics.RaftLogCommitLatency.RecordValue(elapsed.Nanoseconds()) 712 } 713 714 if len(rd.Entries) > 0 { 715 // We may have just overwritten parts of the log which contain 716 // sideloaded SSTables from a previous term (and perhaps discarded some 717 // entries that we didn't overwrite). Remove any such leftover on-disk 718 // payloads (we can do that now because we've committed the deletion 719 // just above). 720 firstPurge := rd.Entries[0].Index // first new entry written 721 purgeTerm := rd.Entries[0].Term - 1 722 lastPurge := prevLastIndex // old end of the log, include in deletion 723 purgedSize, err := maybePurgeSideloaded(ctx, r.raftMu.sideloaded, firstPurge, lastPurge, purgeTerm) 724 if err != nil { 725 const expl = "while purging sideloaded storage" 726 return stats, expl, err 727 } 728 raftLogSize -= purgedSize 729 if raftLogSize < 0 { 730 // Might have gone negative if node was recently restarted. 731 raftLogSize = 0 732 } 733 } 734 735 // Update protected state - last index, last term, raft log size, and raft 736 // leader ID. 737 r.mu.Lock() 738 r.mu.lastIndex = lastIndex 739 r.mu.lastTerm = lastTerm 740 r.mu.raftLogSize = raftLogSize 741 var becameLeader bool 742 if r.mu.leaderID != leaderID { 743 r.mu.leaderID = leaderID 744 // Clear the remote proposal set. Would have been nil already if not 745 // previously the leader. 746 becameLeader = r.mu.leaderID == r.mu.replicaID 747 } 748 r.mu.Unlock() 749 750 // When becoming the leader, proactively add the replica to the replicate 751 // queue. We might have been handed leadership by a remote node which wanted 752 // to remove itself from the range. 753 if becameLeader && r.store.replicateQueue != nil { 754 r.store.replicateQueue.MaybeAddAsync(ctx, r, r.store.Clock().Now()) 755 } 756 757 // Update raft log entry cache. We clear any older, uncommitted log entries 758 // and cache the latest ones. 759 r.store.raftEntryCache.Add(r.RangeID, rd.Entries, true /* truncate */) 760 r.sendRaftMessages(ctx, otherMsgs) 761 r.traceEntries(rd.CommittedEntries, "committed, before applying any entries") 762 763 applicationStart := timeutil.Now() 764 if len(rd.CommittedEntries) > 0 { 765 err := appTask.ApplyCommittedEntries(ctx) 766 stats.applyCommittedEntriesStats = sm.moveStats() 767 if errors.Is(err, apply.ErrRemoved) { 768 // We know that our replica has been removed. All future calls to 769 // r.withRaftGroup() will return errRemoved so no future Ready objects 770 // will be processed by this Replica. 771 return stats, "", err 772 } else if err != nil { 773 return stats, getNonDeterministicFailureExplanation(err), err 774 } 775 776 // etcd raft occasionally adds a nil entry (our own commands are never 777 // empty). This happens in two situations: When a new leader is elected, and 778 // when a config change is dropped due to the "one at a time" rule. In both 779 // cases we may need to resubmit our pending proposals (In the former case 780 // we resubmit everything because we proposed them to a former leader that 781 // is no longer able to commit them. In the latter case we only need to 782 // resubmit pending config changes, but it's hard to distinguish so we 783 // resubmit everything anyway). 
We delay resubmission until after we have 784 // processed the entire batch of entries. 785 if stats.numEmptyEntries > 0 { 786 // Overwrite unconditionally since this is the most aggressive 787 // reproposal mode. 788 if !r.store.TestingKnobs().DisableRefreshReasonNewLeaderOrConfigChange { 789 refreshReason = reasonNewLeaderOrConfigChange 790 } 791 } 792 } 793 applicationElapsed := timeutil.Since(applicationStart).Nanoseconds() 794 r.store.metrics.RaftApplyCommittedLatency.RecordValue(applicationElapsed) 795 if r.store.TestingKnobs().EnableUnconditionalRefreshesInRaftReady { 796 refreshReason = reasonNewLeaderOrConfigChange 797 } 798 if refreshReason != noReason { 799 r.mu.Lock() 800 r.refreshProposalsLocked(ctx, 0 /* refreshAtDelta */, refreshReason) 801 r.mu.Unlock() 802 } 803 804 // NB: if we just processed a command which removed this replica from the 805 // raft group we will early return before this point. This, combined with 806 // the fact that we'll refuse to process messages intended for a higher 807 // replica ID ensures that our replica ID could not have changed. 808 const expl = "during advance" 809 810 r.mu.Lock() 811 err = r.withRaftGroupLocked(true, func(raftGroup *raft.RawNode) (bool, error) { 812 raftGroup.Advance(rd) 813 if stats.numConfChangeEntries > 0 { 814 // If the raft leader got removed, campaign the first remaining voter. 815 // 816 // NB: this must be called after Advance() above since campaigning is 817 // a no-op in the presence of unapplied conf changes. 818 maybeCampaignAfterConfChange(ctx, r.store.StoreID(), r.descRLocked(), raftGroup) 819 } 820 821 // If the Raft group still has more to process then we immediately 822 // re-enqueue it for another round of processing. This is possible if 823 // the group's committed entries were paginated due to size limitations 824 // and we didn't apply all of them in this pass. 825 if raftGroup.HasReady() { 826 r.store.enqueueRaftUpdateCheck(r.RangeID) 827 } 828 return true, nil 829 }) 830 r.mu.Unlock() 831 if err != nil { 832 return stats, expl, errors.Wrap(err, expl) 833 } 834 835 // NB: All early returns other than the one due to not having a ready 836 // which also makes the below call are due to fatal errors. 837 // We must also update the proposal quota when have a ready; consider the 838 // case where there are two replicas and we have a quota of size 1. We 839 // acquire the quota when the write gets proposed on the leader and expect it 840 // to be released when the follower commits it locally. In order to do so we 841 // need to have the entry 'come out of raft' and in the case of a two node 842 // raft group, this only happens if hasReady == true. If we don't release 843 // quota back at the end of handleRaftReadyRaftMuLocked, the next write will 844 // get blocked. 845 r.updateProposalQuotaRaftMuLocked(ctx, lastLeaderID) 846 return stats, "", nil 847 } 848 849 // splitMsgApps splits the Raft message slice into two slices, one containing 850 // MsgApps and one containing all other message types. Each slice retains the 851 // relative ordering between messages in the original slice. 852 func splitMsgApps(msgs []raftpb.Message) (msgApps, otherMsgs []raftpb.Message) { 853 splitIdx := 0 854 for i, msg := range msgs { 855 if msg.Type == raftpb.MsgApp { 856 msgs[i], msgs[splitIdx] = msgs[splitIdx], msgs[i] 857 splitIdx++ 858 } 859 } 860 return msgs[:splitIdx], msgs[splitIdx:] 861 } 862 863 // maybeFatalOnRaftReadyErr will fatal if err is neither nil nor 864 // apply.ErrRemoved. 
865 func maybeFatalOnRaftReadyErr(ctx context.Context, expl string, err error) (removed bool) { 866 switch { 867 case err == nil: 868 return false 869 case errors.Is(err, apply.ErrRemoved): 870 return true 871 default: 872 log.FatalfDepth(ctx, 1, "%s: %+v", log.Safe(expl), err) 873 panic("unreachable") 874 } 875 } 876 877 // tick the Raft group, returning true if the raft group exists and is 878 // unquiesced; false otherwise. 879 func (r *Replica) tick(livenessMap IsLiveMap) (bool, error) { 880 ctx := r.AnnotateCtx(context.TODO()) 881 882 r.unreachablesMu.Lock() 883 remotes := r.unreachablesMu.remotes 884 r.unreachablesMu.remotes = nil 885 r.unreachablesMu.Unlock() 886 887 r.raftMu.Lock() 888 defer r.raftMu.Unlock() 889 r.mu.Lock() 890 defer r.mu.Unlock() 891 892 // If the raft group is uninitialized, do not initialize on tick. 893 if r.mu.internalRaftGroup == nil { 894 return false, nil 895 } 896 897 for remoteReplica := range remotes { 898 r.mu.internalRaftGroup.ReportUnreachable(uint64(remoteReplica)) 899 } 900 901 if r.mu.quiescent { 902 return false, nil 903 } 904 if r.maybeQuiesceLocked(ctx, livenessMap) { 905 return false, nil 906 } 907 908 r.maybeTransferRaftLeadershipLocked(ctx) 909 910 // For followers, we update lastUpdateTimes when we step a message from them 911 // into the local Raft group. The leader won't hit that path, so we update 912 // it whenever it ticks. In effect, this makes sure it always sees itself as 913 // alive. 914 if r.mu.replicaID == r.mu.leaderID { 915 r.mu.lastUpdateTimes.update(r.mu.replicaID, timeutil.Now()) 916 } 917 918 r.mu.ticks++ 919 r.mu.internalRaftGroup.Tick() 920 921 refreshAtDelta := r.store.cfg.RaftElectionTimeoutTicks 922 if knob := r.store.TestingKnobs().RefreshReasonTicksPeriod; knob > 0 { 923 refreshAtDelta = knob 924 } 925 if !r.store.TestingKnobs().DisableRefreshReasonTicks && r.mu.ticks%refreshAtDelta == 0 { 926 // RaftElectionTimeoutTicks is a reasonable approximation of how long we 927 // should wait before deciding that our previous proposal didn't go 928 // through. Note that the combination of the above condition and passing 929 // RaftElectionTimeoutTicks to refreshProposalsLocked means that commands 930 // will be refreshed when they have been pending for 1 to 2 election 931 // cycles. 932 r.refreshProposalsLocked(ctx, refreshAtDelta, reasonTicks) 933 } 934 return true, nil 935 } 936 937 func (r *Replica) hasRaftReadyRLocked() bool { 938 return r.mu.internalRaftGroup.HasReady() 939 } 940 941 //go:generate stringer -type refreshRaftReason 942 type refreshRaftReason int 943 944 const ( 945 noReason refreshRaftReason = iota 946 reasonNewLeader 947 reasonNewLeaderOrConfigChange 948 // A snapshot was just applied and so it may have contained commands that we 949 // proposed whose proposal we still consider to be inflight. These commands 950 // will never receive a response through the regular channel. 951 reasonSnapshotApplied 952 reasonReplicaIDChanged 953 reasonTicks 954 ) 955 956 // refreshProposalsLocked goes through the pending proposals, notifying 957 // proposers whose proposals need to be retried, and resubmitting proposals 958 // which were likely dropped (but may still apply at a legal Lease index) - 959 // ensuring that the proposer will eventually get a reply on the channel it's 960 // waiting on. 961 // mu must be held. 
962 // 963 // refreshAtDelta only applies for reasonTicks and specifies how old (in ticks) 964 // a command must be for it to be inspected; the usual value is the number of 965 // ticks of an election timeout (affect only proposals that have had ample time 966 // to apply but didn't). 967 func (r *Replica) refreshProposalsLocked( 968 ctx context.Context, refreshAtDelta int, reason refreshRaftReason, 969 ) { 970 if refreshAtDelta != 0 && reason != reasonTicks { 971 log.Fatalf(ctx, "refreshAtDelta specified for reason %s != reasonTicks", reason) 972 } 973 974 var reproposals pendingCmdSlice 975 for _, p := range r.mu.proposals { 976 if p.command.MaxLeaseIndex == 0 { 977 // Commands without a MaxLeaseIndex cannot be reproposed, as they might 978 // apply twice. We also don't want to ask the proposer to retry these 979 // special commands. 980 r.cleanupFailedProposalLocked(p) 981 log.VEventf(p.ctx, 2, "refresh (reason: %s) returning AmbiguousResultError for command "+ 982 "without MaxLeaseIndex: %v", reason, p.command) 983 p.finishApplication(ctx, proposalResult{Err: roachpb.NewError( 984 roachpb.NewAmbiguousResultError( 985 fmt.Sprintf("unknown status for command without MaxLeaseIndex "+ 986 "at refreshProposalsLocked time (refresh reason: %s)", reason)))}) 987 continue 988 } 989 switch reason { 990 case reasonSnapshotApplied: 991 // If we applied a snapshot, check the MaxLeaseIndexes of all 992 // pending commands to see if any are now prevented from 993 // applying, and if so make them return an ambiguous error. We 994 // can't tell at this point (which should be rare) whether they 995 // were included in the snapshot we received or not. 996 if p.command.MaxLeaseIndex <= r.mu.state.LeaseAppliedIndex { 997 r.cleanupFailedProposalLocked(p) 998 log.Eventf(p.ctx, "retry proposal %x: %s", p.idKey, reason) 999 p.finishApplication(ctx, proposalResult{Err: roachpb.NewError( 1000 roachpb.NewAmbiguousResultError( 1001 fmt.Sprintf("unable to determine whether command was applied via snapshot")))}) 1002 } 1003 continue 1004 1005 case reasonTicks: 1006 if p.proposedAtTicks <= r.mu.ticks-refreshAtDelta { 1007 // The command was proposed a while ago and may have been dropped. Try it again. 1008 reproposals = append(reproposals, p) 1009 } 1010 1011 default: 1012 // We have reason to believe that all pending proposals were 1013 // dropped on the floor (e.g. because of a leader election), so 1014 // repropose everything. 1015 reproposals = append(reproposals, p) 1016 } 1017 } 1018 1019 if log.V(1) && len(reproposals) > 0 { 1020 log.Infof(ctx, 1021 "pending commands: reproposing %d (at %d.%d) %s", 1022 len(reproposals), r.mu.state.RaftAppliedIndex, 1023 r.mu.state.LeaseAppliedIndex, reason) 1024 } 1025 1026 // Reproposals are those commands which we weren't able to send back to the 1027 // client (since we're not sure that another copy of them could apply at 1028 // the "correct" index). For reproposals, it's generally pretty unlikely 1029 // that they can make it in the right place. Reproposing in order is 1030 // definitely required, however. 
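// As a hypothetical illustration of why ordering matters here: suppose three
// pending proposals carry MaxLeaseIndex 5, 6 and 7 while the range's
// LeaseAppliedIndex is still 4. Reinserted in ascending order, each can apply
// in turn and the LeaseAppliedIndex advances through 5, 6 and 7. If the
// MaxLeaseIndex=7 proposal were reproposed and applied first, the
// LeaseAppliedIndex would jump to 7 and the commands at 5 and 6 would then be
// rejected beneath raft as too old, forcing yet another round of reproposals.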
1031 sort.Sort(reproposals) 1032 for _, p := range reproposals { 1033 log.Eventf(p.ctx, "re-submitting command %x to Raft: %s", p.idKey, reason) 1034 if err := r.mu.proposalBuf.ReinsertLocked(p); err != nil { 1035 r.cleanupFailedProposalLocked(p) 1036 p.finishApplication(ctx, proposalResult{ 1037 Err: roachpb.NewError(roachpb.NewAmbiguousResultError(err.Error())), 1038 }) 1039 } 1040 } 1041 } 1042 1043 // maybeCoalesceHeartbeat returns true if the heartbeat was coalesced and added 1044 // to the appropriate queue. 1045 func (r *Replica) maybeCoalesceHeartbeat( 1046 ctx context.Context, 1047 msg raftpb.Message, 1048 toReplica, fromReplica roachpb.ReplicaDescriptor, 1049 quiesce bool, 1050 ) bool { 1051 var hbMap map[roachpb.StoreIdent][]RaftHeartbeat 1052 switch msg.Type { 1053 case raftpb.MsgHeartbeat: 1054 r.store.coalescedMu.Lock() 1055 hbMap = r.store.coalescedMu.heartbeats 1056 case raftpb.MsgHeartbeatResp: 1057 r.store.coalescedMu.Lock() 1058 hbMap = r.store.coalescedMu.heartbeatResponses 1059 default: 1060 return false 1061 } 1062 beat := RaftHeartbeat{ 1063 RangeID: r.RangeID, 1064 ToReplicaID: toReplica.ReplicaID, 1065 FromReplicaID: fromReplica.ReplicaID, 1066 Term: msg.Term, 1067 Commit: msg.Commit, 1068 Quiesce: quiesce, 1069 ToIsLearner: toReplica.GetType() == roachpb.LEARNER, 1070 } 1071 if log.V(4) { 1072 log.Infof(ctx, "coalescing beat: %+v", beat) 1073 } 1074 toStore := roachpb.StoreIdent{ 1075 StoreID: toReplica.StoreID, 1076 NodeID: toReplica.NodeID, 1077 } 1078 hbMap[toStore] = append(hbMap[toStore], beat) 1079 r.store.coalescedMu.Unlock() 1080 return true 1081 } 1082 1083 func (r *Replica) sendRaftMessages(ctx context.Context, messages []raftpb.Message) { 1084 var lastAppResp raftpb.Message 1085 for _, message := range messages { 1086 drop := false 1087 switch message.Type { 1088 case raftpb.MsgApp: 1089 if util.RaceEnabled { 1090 // Iterate over the entries to assert that all sideloaded commands 1091 // are already inlined. replicaRaftStorage.Entries already performs 1092 // the sideload inlining for stable entries and raft.unstable always 1093 // contain fat entries. Since these are the only two sources that 1094 // raft.sendAppend gathers entries from to populate MsgApps, we 1095 // should never see thin entries here. 1096 for j := range message.Entries { 1097 assertSideloadedRaftCommandInlined(ctx, &message.Entries[j]) 1098 } 1099 } 1100 1101 case raftpb.MsgAppResp: 1102 // A successful (non-reject) MsgAppResp contains one piece of 1103 // information: the highest log index. Raft currently queues up 1104 // one MsgAppResp per incoming MsgApp, and we may process 1105 // multiple messages in one handleRaftReady call (because 1106 // multiple messages may arrive while we're blocked syncing to 1107 // disk). If we get redundant MsgAppResps, drop all but the 1108 // last (we've seen that too many MsgAppResps can overflow 1109 // message queues on the receiving side). 1110 // 1111 // Note that this reorders the chosen MsgAppResp relative to 1112 // other messages (including any MsgAppResps with the Reject flag), 1113 // but raft is fine with this reordering. 1114 // 1115 // TODO(bdarnell): Consider pushing this optimization into etcd/raft. 1116 // Similar optimizations may be possible for other message types, 1117 // although MsgAppResp is the only one that has been seen as a 1118 // problem in practice. 
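// As a concrete (hypothetical) example: if one sendRaftMessages call sees
// non-rejecting MsgAppResps acknowledging indexes 8, 9 and 10, in that order,
// plus one MsgAppResp with the Reject flag set, the three acknowledgments
// collapse into just the Index=10 response (sent last, below), while the
// rejection is passed through unchanged since the leader needs it to adjust
// that follower's progress.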
1119 if !message.Reject && message.Index > lastAppResp.Index { 1120 lastAppResp = message 1121 drop = true 1122 } 1123 } 1124 1125 if !drop { 1126 r.sendRaftMessage(ctx, message) 1127 } 1128 } 1129 if lastAppResp.Index > 0 { 1130 r.sendRaftMessage(ctx, lastAppResp) 1131 } 1132 } 1133 1134 // sendRaftMessage sends a Raft message. 1135 func (r *Replica) sendRaftMessage(ctx context.Context, msg raftpb.Message) { 1136 r.mu.RLock() 1137 fromReplica, fromErr := r.getReplicaDescriptorByIDRLocked(roachpb.ReplicaID(msg.From), r.mu.lastToReplica) 1138 toReplica, toErr := r.getReplicaDescriptorByIDRLocked(roachpb.ReplicaID(msg.To), r.mu.lastFromReplica) 1139 var startKey roachpb.RKey 1140 if msg.Type == raftpb.MsgApp && r.mu.internalRaftGroup != nil { 1141 // When the follower is potentially an uninitialized replica waiting for 1142 // a split trigger, send the replica's StartKey along. See the method 1143 // below for more context: 1144 _ = maybeDropMsgApp 1145 // NB: this code is allocation free. 1146 r.mu.internalRaftGroup.WithProgress(func(id uint64, _ raft.ProgressType, pr tracker.Progress) { 1147 if id == msg.To && pr.State == tracker.StateProbe { 1148 // It is moderately expensive to attach a full key to the message, but note that 1149 // a probing follower will only be appended to once per heartbeat interval (i.e. 1150 // on the order of seconds). See: 1151 // 1152 // https://github.com/etcd-io/etcd/blob/7f450bf6967638673dd88fd4e730b01d1303d5ff/raft/progress.go#L41 1153 startKey = r.descRLocked().StartKey 1154 } 1155 }) 1156 } 1157 r.mu.RUnlock() 1158 1159 if fromErr != nil { 1160 log.Warningf(ctx, "failed to look up sender replica %d in r%d while sending %s: %s", 1161 msg.From, r.RangeID, msg.Type, fromErr) 1162 return 1163 } 1164 if toErr != nil { 1165 log.Warningf(ctx, "failed to look up recipient replica %d in r%d while sending %s: %s", 1166 msg.To, r.RangeID, msg.Type, toErr) 1167 return 1168 } 1169 1170 // Raft-initiated snapshots are handled by the Raft snapshot queue. 1171 if msg.Type == raftpb.MsgSnap { 1172 r.store.raftSnapshotQueue.AddAsync(ctx, r, raftSnapshotPriority) 1173 return 1174 } 1175 1176 if r.maybeCoalesceHeartbeat(ctx, msg, toReplica, fromReplica, false) { 1177 return 1178 } 1179 1180 req := newRaftMessageRequest() 1181 *req = RaftMessageRequest{ 1182 RangeID: r.RangeID, 1183 ToReplica: toReplica, 1184 FromReplica: fromReplica, 1185 Message: msg, 1186 RangeStartKey: startKey, // usually nil 1187 } 1188 if !r.sendRaftMessageRequest(ctx, req) { 1189 if err := r.withRaftGroup(true, func(raftGroup *raft.RawNode) (bool, error) { 1190 r.mu.droppedMessages++ 1191 raftGroup.ReportUnreachable(msg.To) 1192 return true, nil 1193 }); err != nil && !errors.Is(err, errRemoved) { 1194 log.Fatalf(ctx, "%v", err) 1195 } 1196 } 1197 } 1198 1199 // addUnreachableRemoteReplica adds the given remote ReplicaID to be reported 1200 // as unreachable on the next tick. 1201 func (r *Replica) addUnreachableRemoteReplica(remoteReplica roachpb.ReplicaID) { 1202 r.unreachablesMu.Lock() 1203 if r.unreachablesMu.remotes == nil { 1204 r.unreachablesMu.remotes = make(map[roachpb.ReplicaID]struct{}) 1205 } 1206 r.unreachablesMu.remotes[remoteReplica] = struct{}{} 1207 r.unreachablesMu.Unlock() 1208 } 1209 1210 // sendRaftMessageRequest sends a raft message, returning false if the message 1211 // was dropped. It is the caller's responsibility to call ReportUnreachable on 1212 // the Raft group. 
1213 func (r *Replica) sendRaftMessageRequest(ctx context.Context, req *RaftMessageRequest) bool { 1214 if log.V(4) { 1215 log.Infof(ctx, "sending raft request %+v", req) 1216 } 1217 ok := r.store.cfg.Transport.SendAsync(req, r.connectionClass.get()) 1218 // TODO(peter): Looping over all of the outgoing Raft message queues to 1219 // update this stat on every send is a bit expensive. 1220 r.store.metrics.RaftEnqueuedPending.Update(r.store.cfg.Transport.queuedMessageCount()) 1221 return ok 1222 } 1223 1224 func (r *Replica) reportSnapshotStatus(ctx context.Context, to roachpb.ReplicaID, snapErr error) { 1225 r.raftMu.Lock() 1226 defer r.raftMu.Unlock() 1227 1228 snapStatus := raft.SnapshotFinish 1229 if snapErr != nil { 1230 snapStatus = raft.SnapshotFailure 1231 } 1232 1233 if err := r.withRaftGroup(true, func(raftGroup *raft.RawNode) (bool, error) { 1234 raftGroup.ReportSnapshot(uint64(to), snapStatus) 1235 return true, nil 1236 }); err != nil && !errors.Is(err, errRemoved) { 1237 log.Fatalf(ctx, "%v", err) 1238 } 1239 } 1240 1241 type snapTruncationInfo struct { 1242 index uint64 1243 recipientStore roachpb.StoreID 1244 deadline time.Time 1245 } 1246 1247 func (r *Replica) addSnapshotLogTruncationConstraint( 1248 ctx context.Context, snapUUID uuid.UUID, index uint64, recipientStore roachpb.StoreID, 1249 ) { 1250 r.mu.Lock() 1251 defer r.mu.Unlock() 1252 r.addSnapshotLogTruncationConstraintLocked(ctx, snapUUID, index, recipientStore) 1253 } 1254 1255 func (r *Replica) addSnapshotLogTruncationConstraintLocked( 1256 ctx context.Context, snapUUID uuid.UUID, index uint64, recipientStore roachpb.StoreID, 1257 ) { 1258 if r.mu.snapshotLogTruncationConstraints == nil { 1259 r.mu.snapshotLogTruncationConstraints = make(map[uuid.UUID]snapTruncationInfo) 1260 } 1261 item, ok := r.mu.snapshotLogTruncationConstraints[snapUUID] 1262 if ok { 1263 // Uh-oh, there's either a programming error (resulting in the same snapshot 1264 // fed into this method twice) or a UUID collision. We discard the update 1265 // (which is benign) but log it loudly. If the index is the same, it's 1266 // likely the former, otherwise the latter. 1267 log.Warningf(ctx, "UUID collision at %s for %+v (index %d)", snapUUID, item, index) 1268 return 1269 } 1270 1271 r.mu.snapshotLogTruncationConstraints[snapUUID] = snapTruncationInfo{ 1272 index: index, 1273 recipientStore: recipientStore, 1274 } 1275 } 1276 1277 // completeSnapshotLogTruncationConstraint marks the given snapshot as finished, 1278 // releasing the lock on raft log truncation after a grace period. 1279 func (r *Replica) completeSnapshotLogTruncationConstraint( 1280 ctx context.Context, snapUUID uuid.UUID, now time.Time, 1281 ) { 1282 r.mu.Lock() 1283 defer r.mu.Unlock() 1284 1285 item, ok := r.mu.snapshotLogTruncationConstraints[snapUUID] 1286 if !ok { 1287 // UUID collision while adding the snapshot in originally. Nothing 1288 // else to do. 1289 return 1290 } 1291 1292 deadline := now.Add(raftLogQueuePendingSnapshotGracePeriod) 1293 item.deadline = deadline 1294 r.mu.snapshotLogTruncationConstraints[snapUUID] = item 1295 } 1296 1297 // getAndGCSnapshotLogTruncationConstraints returns the minimum index of any 1298 // currently outstanding snapshot being sent from this replica to the specified 1299 // recipient or 0 if there isn't one. Passing 0 for recipientStore means any 1300 // recipient. 
1301 func (r *Replica) getAndGCSnapshotLogTruncationConstraints( 1302 now time.Time, recipientStore roachpb.StoreID, 1303 ) (minSnapIndex uint64) { 1304 r.mu.Lock() 1305 defer r.mu.Unlock() 1306 return r.getAndGCSnapshotLogTruncationConstraintsLocked(now, recipientStore) 1307 } 1308 1309 func (r *Replica) getAndGCSnapshotLogTruncationConstraintsLocked( 1310 now time.Time, recipientStore roachpb.StoreID, 1311 ) (minSnapIndex uint64) { 1312 for snapUUID, item := range r.mu.snapshotLogTruncationConstraints { 1313 if item.deadline != (time.Time{}) && item.deadline.Before(now) { 1314 // The snapshot has finished and its grace period has passed. 1315 // Ignore it when making truncation decisions. 1316 delete(r.mu.snapshotLogTruncationConstraints, snapUUID) 1317 continue 1318 } 1319 if recipientStore != 0 && item.recipientStore != recipientStore { 1320 continue 1321 } 1322 if minSnapIndex == 0 || minSnapIndex > item.index { 1323 minSnapIndex = item.index 1324 } 1325 } 1326 if len(r.mu.snapshotLogTruncationConstraints) == 0 { 1327 // Save a little bit of memory. 1328 r.mu.snapshotLogTruncationConstraints = nil 1329 } 1330 return minSnapIndex 1331 } 1332 1333 func isRaftLeader(raftStatus *raft.Status) bool { 1334 return raftStatus != nil && raftStatus.SoftState.RaftState == raft.StateLeader 1335 } 1336 1337 // HasRaftLeader returns true if the raft group has a raft leader currently. 1338 func HasRaftLeader(raftStatus *raft.Status) bool { 1339 return raftStatus != nil && raftStatus.SoftState.Lead != 0 1340 } 1341 1342 // pendingCmdSlice sorts by increasing MaxLeaseIndex. 1343 type pendingCmdSlice []*ProposalData 1344 1345 func (s pendingCmdSlice) Len() int { return len(s) } 1346 func (s pendingCmdSlice) Swap(i, j int) { s[i], s[j] = s[j], s[i] } 1347 func (s pendingCmdSlice) Less(i, j int) bool { 1348 return s[i].command.MaxLeaseIndex < s[j].command.MaxLeaseIndex 1349 } 1350 1351 // withRaftGroupLocked calls the supplied function with the (lazily 1352 // initialized) Raft group. The supplied function should return true for the 1353 // unquiesceAndWakeLeader argument if the replica should be unquiesced (and the 1354 // leader awoken). See handleRaftReady for an instance of where this value 1355 // varies. 1356 // 1357 // Requires that Replica.mu is held. 1358 // 1359 // If this Replica is in the process of being removed this method will return 1360 // errRemoved. 1361 func (r *Replica) withRaftGroupLocked( 1362 mayCampaignOnWake bool, f func(r *raft.RawNode) (unquiesceAndWakeLeader bool, _ error), 1363 ) error { 1364 if r.mu.destroyStatus.Removed() { 1365 // Callers know to detect errRemoved as non-fatal. 1366 return errRemoved 1367 } 1368 1369 if r.mu.internalRaftGroup == nil { 1370 ctx := r.AnnotateCtx(context.TODO()) 1371 raftGroup, err := raft.NewRawNode(newRaftConfig( 1372 raft.Storage((*replicaRaftStorage)(r)), 1373 uint64(r.mu.replicaID), 1374 r.mu.state.RaftAppliedIndex, 1375 r.store.cfg, 1376 &raftLogger{ctx: ctx}, 1377 )) 1378 if err != nil { 1379 return err 1380 } 1381 r.mu.internalRaftGroup = raftGroup 1382 1383 if mayCampaignOnWake { 1384 r.maybeCampaignOnWakeLocked(ctx) 1385 } 1386 } 1387 1388 // This wrapper function is a hack to add range IDs to stack traces 1389 // using the same pattern as Replica.sendWithRangeID. 
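// The stack-trace trick referenced above boils down to calling through a
// closure that takes the interesting value as an explicit parameter, so that
// the value shows up as a function argument in goroutine dumps. A minimal,
// self-contained sketch of the pattern, with hypothetical names (not code from
// this package):
//
//	func runWithRangeID(rangeID int64, work func() error) error {
//		// rangeID is not used in the body; it exists only so the value is
//		// visible in this frame of a goroutine dump.
//		return work()
//	}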
1390 	unquiesce, err := func(rangeID roachpb.RangeID, raftGroup *raft.RawNode) (bool, error) {
1391 		return f(raftGroup)
1392 	}(r.RangeID, r.mu.internalRaftGroup)
1393 	if r.mu.internalRaftGroup.BasicStatus().Lead == 0 {
1394 		// If we don't know the leader, unquiesce unconditionally. As a
1395 		// follower, we can't wake up the leader if we don't know who that is,
1396 		// so we should find out now before someone needs us to unquiesce.
1397 		//
1398 		// This situation should occur rarely or never (ever since we got
1399 		// stricter about validating incoming Quiesce requests) but it's good
1400 		// defense-in-depth.
1401 		//
1402 		// Note that unquiesceAndWakeLeaderLocked won't manage to wake up the
1403 		// leader since it's unknown to this replica, and at the time of writing
1404 		// the heuristics for campaigning are defensive (won't campaign if there
1405 		// is a live leaseholder). But if we are trying to unquiesce because
1406 		// this follower was asked to propose something, then this means that a
1407 		// request is going to have to wait until the leader next contacts us,
1408 		// or, in the worst case, an election timeout. This is not ideal - if a
1409 		// node holds a live lease, we should direct the client to it
1410 		// immediately.
1411 		unquiesce = true
1412 	}
1413 	if unquiesce {
1414 		r.unquiesceAndWakeLeaderLocked()
1415 	}
1416 	return err
1417 }
1418 
1419 // withRaftGroup calls the supplied function with the (lazily initialized)
1420 // Raft group. It acquires and releases the Replica lock, so r.mu must not be
1421 // held (or acquired by the supplied function).
1422 //
1423 // If mayCampaignOnWake is true, the replica may initiate a raft
1424 // election if it was previously in a dormant state. Most callers
1425 // should set this to true, because the prevote feature minimizes the
1426 // disruption from unnecessary elections. The exception is that we
1427 // should not initiate an election while handling incoming raft
1428 // messages (which may include MsgVotes from an election in progress,
1429 // and this election would be disrupted if we started our own).
1430 //
1431 // If this Replica is in the process of being removed this method will return
1432 // errRemoved.
1433 func (r *Replica) withRaftGroup(
1434 	mayCampaignOnWake bool, f func(r *raft.RawNode) (unquiesceAndWakeLeader bool, _ error),
1435 ) error {
1436 	r.mu.Lock()
1437 	defer r.mu.Unlock()
1438 	return r.withRaftGroupLocked(mayCampaignOnWake, f)
1439 }
1440 
1441 func shouldCampaignOnWake(
1442 	leaseStatus kvserverpb.LeaseStatus,
1443 	lease roachpb.Lease,
1444 	storeID roachpb.StoreID,
1445 	raftStatus raft.Status,
1446 ) bool {
1447 	// When waking up a range, campaign unless we know that another
1448 	// node holds a valid lease (this is most important after a split,
1449 	// when all replicas create their raft groups at about the same
1450 	// time, with a lease pre-assigned to one of them). Note that
1451 	// thanks to PreVote, unnecessary campaigns are not disruptive so
1452 	// we should err on the side of campaigning here.
1453 	anotherOwnsLease := leaseStatus.State == kvserverpb.LeaseState_VALID && !lease.OwnedBy(storeID)
1454 
1455 	// If we're already campaigning or know who the leader is, don't
1456 	// start a new term.
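	// Condensed, the decision implemented below boils down to a small
	// predicate. A self-contained sketch with hypothetical names (not the
	// types used here):
	//
	//	type wakeState struct {
	//		leaseValidElsewhere bool // another store holds a valid lease
	//		isFollower          bool // raft reports us as a follower
	//		leaderKnown         bool // raft knows who the leader is
	//	}
	//
	//	func sketchShouldCampaign(s wakeState) bool {
	//		return !s.leaseValidElsewhere && s.isFollower && !s.leaderKnown
	//	}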
1457 	noLeader := raftStatus.RaftState == raft.StateFollower && raftStatus.Lead == 0
1458 	return !anotherOwnsLease && noLeader
1459 }
1460 
1461 // maybeCampaignOnWakeLocked is called when the range wakes from a
1462 // dormant state (either the initial "raftGroup == nil" state or after
1463 // being quiescent) and campaigns for raft leadership if appropriate.
1464 func (r *Replica) maybeCampaignOnWakeLocked(ctx context.Context) {
1465 	// Raft panics if a node that is not currently a member of the
1466 	// group tries to campaign. That happens primarily when we apply
1467 	// preemptive snapshots.
1468 	if _, currentMember := r.mu.state.Desc.GetReplicaDescriptorByID(r.mu.replicaID); !currentMember {
1469 		return
1470 	}
1471 
1472 	leaseStatus := r.leaseStatus(*r.mu.state.Lease, r.store.Clock().Now(), r.mu.minLeaseProposedTS)
1473 	raftStatus := r.mu.internalRaftGroup.Status()
1474 	if shouldCampaignOnWake(leaseStatus, *r.mu.state.Lease, r.store.StoreID(), raftStatus) {
1475 		log.VEventf(ctx, 3, "campaigning")
1476 		if err := r.mu.internalRaftGroup.Campaign(); err != nil {
1477 			log.VEventf(ctx, 1, "failed to campaign: %s", err)
1478 		}
1479 	}
1480 }
1481 
1482 // a lastUpdateTimesMap is maintained on the Raft leader to keep track of the
1483 // last communication received from followers, which in turn informs the quota
1484 // pool and log truncations.
1485 type lastUpdateTimesMap map[roachpb.ReplicaID]time.Time
1486 
1487 func (m lastUpdateTimesMap) update(replicaID roachpb.ReplicaID, now time.Time) {
1488 	if m == nil {
1489 		return
1490 	}
1491 	m[replicaID] = now
1492 }
1493 
1494 // updateOnUnquiesce is called when the leader unquiesces. In that case, we
1495 // don't want live followers to appear as dead before their next message reaches
1496 // us; to achieve that, we optimistically mark all followers that are in
1497 // ProgressStateReplicate (or rather, were in that state when the group
1498 // quiesced) as live as of `now`. We don't want to mark other followers as
1499 // live as they may be down and could artificially seem alive forever assuming
1500 // a suitable pattern of quiesce and unquiesce operations (and this in turn
1501 // can interfere with Raft log truncations).
1502 func (m lastUpdateTimesMap) updateOnUnquiesce(
1503 	descs []roachpb.ReplicaDescriptor, prs map[uint64]tracker.Progress, now time.Time,
1504 ) {
1505 	for _, desc := range descs {
1506 		if prs[uint64(desc.ReplicaID)].State == tracker.StateReplicate {
1507 			m.update(desc.ReplicaID, now)
1508 		}
1509 	}
1510 }
1511 
1512 // updateOnBecomeLeader is similar to updateOnUnquiesce, but is called when the
1513 // replica becomes the Raft leader. It updates all followers irrespective of
1514 // their Raft state, for the Raft state is not yet populated by the time this
1515 // callback is invoked. Raft leadership is usually stable, so there is no danger
1516 // of artificially keeping followers that are down alive, though if it started
1517 // flip-flopping at a <10s cadence there would be a risk of that happening.
1518 func (m lastUpdateTimesMap) updateOnBecomeLeader(descs []roachpb.ReplicaDescriptor, now time.Time) {
1519 	for _, desc := range descs {
1520 		m.update(desc.ReplicaID, now)
1521 	}
1522 }
1523 
1524 // isFollowerActiveSince returns whether the specified follower has
1525 // communicated with the leader recently (within the given threshold).
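// A condensed usage sketch of lastUpdateTimesMap as a whole (hypothetical
// values; assumes ctx, now, and a []roachpb.ReplicaDescriptor named descs are
// in scope; the real callers live in the raft processing and log truncation
// paths):
//
//	m := lastUpdateTimesMap{}
//	m.updateOnBecomeLeader(descs, now) // mark everyone live when gaining leadership
//	m.update(2, now)                   // replica 2 just sent us a message
//	active := m.isFollowerActiveSince(ctx, 2, now, 10*time.Second)
//	_ = active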
1526 func (m lastUpdateTimesMap) isFollowerActiveSince(
1527 	ctx context.Context, replicaID roachpb.ReplicaID, now time.Time, threshold time.Duration,
1528 ) bool {
1529 	lastUpdateTime, ok := m[replicaID]
1530 	if !ok {
1531 		// If the follower has no entry in lastUpdateTimes, it has not been
1532 		// updated since r became the leader (at which point all then-existing
1533 		// replicas were updated).
1534 		return false
1535 	}
1536 	return now.Sub(lastUpdateTime) <= threshold
1537 }
1538 
1539 // maybeAcquireSnapshotMergeLock checks whether the incoming snapshot subsumes
1540 // any replicas and, if so, locks them for subsumption. See acquireMergeLock
1541 // for details about the lock itself.
1542 func (r *Replica) maybeAcquireSnapshotMergeLock(
1543 	ctx context.Context, inSnap IncomingSnapshot,
1544 ) (subsumedRepls []*Replica, releaseMergeLock func()) {
1545 	// Any replicas that overlap with the bounds of the incoming snapshot are ours
1546 	// to subsume; further, the end of the last overlapping replica will exactly
1547 	// align with the end of the snapshot. How are we guaranteed this? Each merge
1548 	// could not have committed unless this store had an up-to-date replica of the
1549 	// RHS at the time of the merge. Nothing could have removed that RHS replica,
1550 	// as the replica GC queue cannot GC a replica unless it can prove its
1551 	// left-hand neighbor has no pending merges to apply. And that RHS replica
1552 	// could not have been further split or merged, as it never processes another
1553 	// command after the merge commits.
1554 	endKey := r.Desc().EndKey
1555 	if endKey == nil {
1556 		// The existing replica is uninitialized, in which case we've already
1557 		// installed a placeholder for the snapshot's keyspace. No merge lock needed.
1558 		return nil, func() {}
1559 	}
1560 	for endKey.Less(inSnap.State.Desc.EndKey) {
1561 		sRepl := r.store.LookupReplica(endKey)
1562 		if sRepl == nil || !endKey.Equal(sRepl.Desc().StartKey) {
1563 			log.Fatalf(ctx, "snapshot widens existing replica, but no replica exists for subsumed key %s", endKey)
1564 		}
1565 		sRepl.raftMu.Lock()
1566 		subsumedRepls = append(subsumedRepls, sRepl)
1567 		endKey = sRepl.Desc().EndKey
1568 	}
1569 	// TODO(benesch): we may be unnecessarily forcing another Raft snapshot here
1570 	// by subsuming too much. Consider the case where [a, b) and [c, e) first
1571 	// merged into [a, e), then split into [a, d) and [d, e), and we're applying a
1572 	// snapshot that spans this merge and split. The bounds of this snapshot will
1573 	// be [a, d), so we'll subsume [c, e). But we're still a member of [d, e)!
1574 	// We'll currently be forced to get a Raft snapshot to catch up. Ideally, we'd
1575 	// subsume only half of [c, e) and synthesize a new RHS [d, e), effectively
1576 	// applying both the split and merge during snapshot application. This isn't a
1577 	// huge deal, though: we're probably behind enough that the RHS would need to
1578 	// get caught up with a Raft snapshot anyway, even if we synthesized it
1579 	// properly.
1580 	return subsumedRepls, func() {
1581 		for _, sr := range subsumedRepls {
1582 			sr.raftMu.Unlock()
1583 		}
1584 	}
1585 }
1586 
1587 // maybeAcquireSplitMergeLock examines the given raftCmd (which need
1588 // not be applied yet) and acquires the split or merge lock if
1589 // necessary (in addition to other preparation). It returns a function
1590 // which will release any lock acquired (or nil).
1591 //
1592 // After this method returns successfully, the RHS of the split or merge
1593 // is guaranteed to be present in the Store and retrievable via GetReplica().
1594 func (r *Replica) maybeAcquireSplitMergeLock(
1595 	ctx context.Context, raftCmd kvserverpb.RaftCommand,
1596 ) (func(), error) {
1597 	if split := raftCmd.ReplicatedEvalResult.Split; split != nil {
1598 		return r.acquireSplitLock(ctx, &split.SplitTrigger)
1599 	} else if merge := raftCmd.ReplicatedEvalResult.Merge; merge != nil {
1600 		return r.acquireMergeLock(ctx, &merge.MergeTrigger)
1601 	}
1602 	return nil, nil
1603 }
1604 
1605 func (r *Replica) acquireSplitLock(
1606 	ctx context.Context, split *roachpb.SplitTrigger,
1607 ) (func(), error) {
1608 	rightReplDesc, _ := split.RightDesc.GetReplicaDescriptor(r.StoreID())
1609 	rightRepl, _, err := r.store.getOrCreateReplica(ctx, split.RightDesc.RangeID,
1610 		rightReplDesc.ReplicaID, nil, /* creatingReplica */
1611 		rightReplDesc.GetType() == roachpb.LEARNER)
1612 	// If getOrCreateReplica returns RaftGroupDeletedError we know that the RHS
1613 	// has already been removed. This case is handled properly in splitPostApply.
1614 	if errors.HasType(err, (*roachpb.RaftGroupDeletedError)(nil)) {
1615 		return func() {}, nil
1616 	}
1617 	if err != nil {
1618 		return nil, err
1619 	}
1620 	if rightRepl.IsInitialized() {
1621 		return nil, errors.Errorf("RHS of split %s / %s already initialized before split application",
1622 			&split.LeftDesc, &split.RightDesc)
1623 	}
1624 	return rightRepl.raftMu.Unlock, nil
1625 }
1626 
1627 func (r *Replica) acquireMergeLock(
1628 	ctx context.Context, merge *roachpb.MergeTrigger,
1629 ) (func(), error) {
1630 	// The merge lock is the right-hand replica's raftMu. The right-hand replica
1631 	// is required to exist on this store at the replica ID implied by the merge.
1632 	// Otherwise, an incoming snapshot could create the right-hand replica before
1633 	// the merge trigger has a chance to widen the left-hand replica's end key.
1634 	// The merge trigger would then fatal the node upon realizing the right-hand
1635 	// replica already exists. With a right-hand replica in place, any snapshots
1636 	// for the right-hand range will block on raftMu, waiting for the merge to
1637 	// complete, after which the replica will realize it has been destroyed and
1638 	// reject the snapshot.
1639 	//
1640 	// These guarantees would not hold if we were catching up from a preemptive
1641 	// snapshot and were not part of the range. That scenario, however, never
1642 	// arises: prior to 19.2 we would ensure that a preemptive snapshot had been
1643 	// applied before adding a store to the range, which would fail if the range
1644 	// had merged another range; and in 19.2 we detect when the raft messages
1645 	// we're processing are for a learner while our current state is due to a
1646 	// preemptive snapshot, and we remove the preemptive snapshot.
1647 	rightReplDesc, _ := merge.RightDesc.GetReplicaDescriptor(r.StoreID())
1648 	rightRepl, _, err := r.store.getOrCreateReplica(ctx, merge.RightDesc.RangeID,
1649 		rightReplDesc.ReplicaID, nil, /* creatingReplica */
1650 		rightReplDesc.GetType() == roachpb.LEARNER)
1651 	if err != nil {
1652 		return nil, err
1653 	}
1654 	rightDesc := rightRepl.Desc()
1655 	if !rightDesc.StartKey.Equal(merge.RightDesc.StartKey) || !rightDesc.EndKey.Equal(merge.RightDesc.EndKey) {
1656 		return nil, errors.Errorf("RHS of merge %s <- %s not present on store; found %s in place of the RHS",
1657 			&merge.LeftDesc, &merge.RightDesc, rightDesc)
1658 	}
1659 	return rightRepl.raftMu.Unlock, nil
1660 }
1661 
1662 // handleTruncatedStateBelowRaft is called when a Raft command updates the truncated
1663 // state. This isn't 100% trivial for two reasons:
1664 // - in 19.1 we're making the TruncatedState key unreplicated, so there's a migration
1665 // - we're making use of the above by not sending the Raft log in snapshots (the truncated
1666 //   state effectively determines the first index of the log, which requires it to be unreplicated).
1667 //   Updates to the HardState are sent out by a leaseholder truncating the log based on its local
1668 //   knowledge. For example, the leader might have a log 10..100 and truncate to 50, and will send
1669 //   out a TruncatedState with Index 50 to that effect. However, some replicas may not even have log
1670 //   entries that old, and must make sure to ignore this update to the truncated state, as it would
1671 //   otherwise clobber their "newer" truncated state.
1672 //
1673 // The returned boolean tells the caller whether to apply the truncated state's
1674 // side effects, which means replacing the in-memory TruncatedState and applying
1675 // the associated RaftLogDelta. It is usually expected to be true, but may not
1676 // be for the first truncation on a replica that recently received a
1677 // snapshot.
1678 func handleTruncatedStateBelowRaft(
1679 	ctx context.Context,
1680 	oldTruncatedState, newTruncatedState *roachpb.RaftTruncatedState,
1681 	loader stateloader.StateLoader,
1682 	readWriter storage.ReadWriter,
1683 ) (_apply bool, _ error) {
1684 	// If this is a log truncation, load the resulting unreplicated or legacy
1685 	// replicated truncated state (in that order). If the migration is happening
1686 	// in this command, the result will be an empty message. In steady state
1687 	// after the migration, it's the unreplicated truncated state not taking
1688 	// into account the current truncation (since the key is unreplicated).
1689 	// Either way, we'll update it below.
1690 	//
1691 	// See VersionUnreplicatedRaftTruncatedState for details.
1692 	truncStatePostApply, truncStateIsLegacy, err := loader.LoadRaftTruncatedState(ctx, readWriter)
1693 	if err != nil {
1694 		return false, errors.Wrap(err, "loading truncated state")
1695 	}
1696 
1697 	// Truncate the Raft log from the entry after the previous
1698 	// truncation index to the new truncation index. This is performed
1699 	// atomically with the raft command application so that the
1700 	// TruncatedState index is always consistent with the state of the
1701 	// Raft log itself. We can use the distinct writer because we know
1702 	// all writes will be to distinct keys.
1703 	//
1704 	// Intentionally don't use range deletion tombstones (ClearRange())
1705 	// due to performance concerns connected to having many range
1706 	// deletion tombstones. There is a chance that ClearRange will
1707 	// perform well here because the tombstones could be "collapsed",
1708 	// but it is hardly worth the risk at this point.
1709 	prefixBuf := &loader.RangeIDPrefixBuf
1710 	for idx := oldTruncatedState.Index + 1; idx <= newTruncatedState.Index; idx++ {
1711 		// NB: RangeIDPrefixBufs have sufficient capacity (32 bytes) to
1712 		// avoid allocating when constructing Raft log keys (16 bytes).
1713 		unsafeKey := prefixBuf.RaftLogKey(idx)
1714 		if err := readWriter.Clear(storage.MakeMVCCMetadataKey(unsafeKey)); err != nil {
1715 			return false, errors.Wrapf(err, "unable to clear truncated Raft entries for %+v", newTruncatedState)
1716 		}
1717 	}
1718 
1719 	if !truncStateIsLegacy {
1720 		if truncStatePostApply.Index < newTruncatedState.Index {
1721 			// There are two cases here (though handled just the same). In the
1722 			// first case, the Raft command has just deleted the legacy
1723 			// replicated truncated state key as part of the migration (so
1724 			// truncStateIsLegacy is now false for the first time and
1725 			// truncStatePostApply is zero) and we need to atomically write the
1726 			// new, unreplicated key. Or we've already migrated earlier, in
1727 			// which case truncStatePostApply equals the current value of the
1728 			// new key (which wasn't touched by the batch), and we need to
1729 			// overwrite it if this truncation "moves it forward".
1730 
1731 			if err := storage.MVCCPutProto(
1732 				ctx, readWriter, nil /* ms */, prefixBuf.RaftTruncatedStateKey(),
1733 				hlc.Timestamp{}, nil /* txn */, newTruncatedState,
1734 			); err != nil {
1735 				return false, errors.Wrap(err, "unable to migrate RaftTruncatedState")
1736 			}
1737 			// We have migrated and this new truncated state moves us forward.
1738 			// Tell the caller that we applied it and that they should do the same.
1739 			return true, nil
1740 		}
1741 		// We have migrated, but this truncated state moves the existing one
1742 		// backwards, so instruct the caller not to update the in-memory state.
1743 		return false, nil
1744 	}
1745 	// Haven't migrated yet, don't ever discard the update.
1746 	return true, nil
1747 }
1748 
1749 // ComputeRaftLogSize computes the size (in bytes) of the Raft log from the
1750 // storage engine. This will iterate over the Raft log and sideloaded files, so
1751 // depending on the size of these it can be mildly to extremely expensive and
1752 // thus should not be called frequently.
1753 //
1754 // The sideloaded storage may be nil, in which case it is treated as empty.
1755 func ComputeRaftLogSize(
1756 	ctx context.Context, rangeID roachpb.RangeID, reader storage.Reader, sideloaded SideloadStorage,
1757 ) (int64, error) {
1758 	prefix := keys.RaftLogPrefix(rangeID)
1759 	prefixEnd := prefix.PrefixEnd()
1760 	iter := reader.NewIterator(storage.IterOptions{
1761 		LowerBound: prefix,
1762 		UpperBound: prefixEnd,
1763 	})
1764 	defer iter.Close()
1765 	ms, err := iter.ComputeStats(prefix, prefixEnd, 0 /* nowNanos */)
1766 	if err != nil {
1767 		return 0, err
1768 	}
1769 	var totalSideloaded int64
1770 	if sideloaded != nil {
1771 		var err error
1772 		// Truncating all indexes strictly smaller than zero is a no-op but
1773 		// gives us the number of bytes in the storage back.
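		// (The call below ignores its first return value and keeps the
		// second, which, per the comment above, is the total number of bytes
		// currently retained in the sideloaded storage.)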
1774 		_, totalSideloaded, err = sideloaded.TruncateTo(ctx, 0)
1775 		if err != nil {
1776 			return 0, err
1777 		}
1778 	}
1779 	return ms.SysBytes + totalSideloaded, nil
1780 }
1781 
1782 func maybeCampaignAfterConfChange(
1783 	ctx context.Context,
1784 	storeID roachpb.StoreID,
1785 	desc *roachpb.RangeDescriptor,
1786 	raftGroup *raft.RawNode,
1787 ) {
1788 	// If a config change was carried out, it's possible that the Raft
1789 	// leader was removed. Verify that, and if so, campaign if we are
1790 	// the first remaining voter replica. Without this, the range will
1791 	// be leaderless (and thus unavailable) for a few seconds.
1792 	//
1793 	// We can't (or rather shouldn't) campaign on all remaining voters
1794 	// because that can lead to a stalemate. For example, three voters
1795 	// may all make it through PreVote and then reject each other.
1796 	st := raftGroup.BasicStatus()
1797 	if st.Lead == 0 {
1798 		// Leader unknown. This isn't what we expect in steady state, so we
1799 		// don't do anything.
1800 		return
1801 	}
1802 	if !desc.IsInitialized() {
1803 		// We don't have an initialized descriptor, so we can't figure out who
1804 		// is supposed to campaign. It's possible that it's us and we're waiting
1805 		// for the initial snapshot, but it's hard to tell. Don't do anything.
1806 		return
1807 	}
1808 	// If the leader is no longer in the descriptor but we are the first voter,
1809 	// campaign.
1810 	_, leaderStillThere := desc.GetReplicaDescriptorByID(roachpb.ReplicaID(st.Lead))
1811 	if !leaderStillThere && storeID == desc.Replicas().Voters()[0].StoreID {
1812 		log.VEventf(ctx, 3, "leader got removed by conf change; campaigning")
1813 		_ = raftGroup.Campaign()
1814 	}
1815 }
1816 
1817 func getNonDeterministicFailureExplanation(err error) string {
1818 	if nd := (*nonDeterministicFailure)(nil); errors.As(err, &nd) {
1819 		return nd.safeExpl
1820 	}
1821 	return "???"
1822 }
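// The errors.As dance above is a standard way to pull a concrete error type
// out of a wrapped error chain. A minimal, self-contained sketch with an
// illustrative error type (not part of this package):
//
//	type explainedErr struct{ expl string }
//
//	func (e *explainedErr) Error() string { return e.expl }
//
//	func explanationOf(err error) string {
//		var e *explainedErr
//		if errors.As(err, &e) {
//			return e.expl
//		}
//		return "???"
//	}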