github.com/cockroachdb/cockroach@v20.2.0-alpha.1+incompatible/pkg/kv/kvserver/store_raft.go

     1  // Copyright 2019 The Cockroach Authors.
     2  //
     3  // Use of this software is governed by the Business Source License
     4  // included in the file licenses/BSL.txt.
     5  //
     6  // As of the Change Date specified in that file, in accordance with
     7  // the Business Source License, use of this software will be governed
     8  // by the Apache License, Version 2.0, included in the file
     9  // licenses/APL.txt.
    10  
    11  package kvserver
    12  
    13  import (
    14  	"context"
    15  	"sync/atomic"
    16  	"time"
    17  	"unsafe"
    18  
    19  	"github.com/cockroachdb/cockroach/pkg/roachpb"
    20  	"github.com/cockroachdb/cockroach/pkg/rpc"
    21  	"github.com/cockroachdb/cockroach/pkg/util/log"
    22  	"github.com/cockroachdb/cockroach/pkg/util/stop"
    23  	"github.com/cockroachdb/cockroach/pkg/util/syncutil"
    24  	"github.com/cockroachdb/cockroach/pkg/util/timeutil"
    25  	"github.com/cockroachdb/errors"
    27  	"go.etcd.io/etcd/raft/raftpb"
    28  )
    29  
    30  type raftRequestInfo struct {
    31  	req        *RaftMessageRequest
    32  	respStream RaftMessageResponseStream
    33  }
    34  
    35  type raftRequestQueue struct {
    36  	syncutil.Mutex
    37  	infos []raftRequestInfo
    38  	// TODO(nvanbenschoten): consider recycling []raftRequestInfo slices. This
    39  	// could be done without any new mutex locking by storing two slices here
    40  	// and swapping them under lock in processRequestQueue.
    41  }
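
// The TODO above describes recycling []raftRequestInfo slices by keeping two
// slices and swapping them under the existing mutex. A minimal sketch of that
// idea follows; recyclingRequestQueue, drain, and recycle are hypothetical
// names used for illustration only and are not referenced elsewhere in this
// package.
type recyclingRequestQueue struct {
	syncutil.Mutex
	infos    []raftRequestInfo // requests waiting to be processed
	recycled []raftRequestInfo // emptied slice retained for reuse
}

// drain takes the queued requests and installs the recycled slice (if any) so
// that subsequent appends reuse its capacity instead of allocating.
func (q *recyclingRequestQueue) drain() []raftRequestInfo {
	q.Lock()
	defer q.Unlock()
	infos := q.infos
	q.infos = q.recycled[:0]
	q.recycled = nil
	return infos
}

// recycle hands a drained slice back once its requests have been processed.
func (q *recyclingRequestQueue) recycle(infos []raftRequestInfo) {
	for i := range infos {
		infos[i] = raftRequestInfo{} // drop references so the requests can be GC'd
	}
	q.Lock()
	defer q.Unlock()
	if q.recycled == nil {
		q.recycled = infos[:0]
	}
}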
    42  
    43  // HandleSnapshot reads an incoming streaming snapshot and applies it if
    44  // possible.
    45  func (s *Store) HandleSnapshot(
    46  	header *SnapshotRequest_Header, stream SnapshotResponseStream,
    47  ) error {
    48  	ctx := s.AnnotateCtx(stream.Context())
    49  	const name = "storage.Store: handle snapshot"
    50  	return s.stopper.RunTaskWithErr(ctx, name, func(ctx context.Context) error {
    51  		s.metrics.raftRcvdMessages[raftpb.MsgSnap].Inc(1)
    52  
    53  		if s.IsDraining() {
    54  			return stream.Send(&SnapshotResponse{
    55  				Status:  SnapshotResponse_DECLINED,
    56  				Message: storeDrainingMsg,
    57  			})
    58  		}
    59  
    60  		return s.receiveSnapshot(ctx, header, stream)
    61  	})
    62  }
    63  
    64  // learnerType exists to avoid allocating on every coalesced beat to a learner.
    65  var learnerType = roachpb.LEARNER
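
// For contrast, a hypothetical alternative that takes the address of a local
// variable instead: the local escapes to the heap and costs one allocation per
// learner beat, which is precisely what the package-level learnerType above
// avoids. setLearnerTypeAllocating is illustrative only and unused.
func setLearnerTypeAllocating(desc *roachpb.ReplicaDescriptor) {
	t := roachpb.LEARNER // escapes: its address is stored in a longer-lived struct
	desc.Type = &t
}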
    66  
    67  func (s *Store) uncoalesceBeats(
    68  	ctx context.Context,
    69  	beats []RaftHeartbeat,
    70  	fromReplica, toReplica roachpb.ReplicaDescriptor,
    71  	msgT raftpb.MessageType,
    72  	respStream RaftMessageResponseStream,
    73  ) {
    74  	if len(beats) == 0 {
    75  		return
    76  	}
    77  	if log.V(4) {
    78  		log.Infof(ctx, "uncoalescing %d beats of type %v: %+v", len(beats), msgT, beats)
    79  	}
    80  	beatReqs := make([]RaftMessageRequest, len(beats))
    81  	for i, beat := range beats {
    82  		msg := raftpb.Message{
    83  			Type:   msgT,
    84  			From:   uint64(beat.FromReplicaID),
    85  			To:     uint64(beat.ToReplicaID),
    86  			Term:   beat.Term,
    87  			Commit: beat.Commit,
    88  		}
    89  		beatReqs[i] = RaftMessageRequest{
    90  			RangeID: beat.RangeID,
    91  			FromReplica: roachpb.ReplicaDescriptor{
    92  				NodeID:    fromReplica.NodeID,
    93  				StoreID:   fromReplica.StoreID,
    94  				ReplicaID: beat.FromReplicaID,
    95  			},
    96  			ToReplica: roachpb.ReplicaDescriptor{
    97  				NodeID:    toReplica.NodeID,
    98  				StoreID:   toReplica.StoreID,
    99  				ReplicaID: beat.ToReplicaID,
   100  			},
   101  			Message: msg,
   102  			Quiesce: beat.Quiesce,
   103  		}
   104  		if beat.ToIsLearner {
   105  			beatReqs[i].ToReplica.Type = &learnerType
   106  		}
   107  		if log.V(4) {
   108  			log.Infof(ctx, "uncoalesced beat: %+v", beatReqs[i])
   109  		}
   110  
   111  		if err := s.HandleRaftUncoalescedRequest(ctx, &beatReqs[i], respStream); err != nil {
   112  			log.Errorf(ctx, "could not handle uncoalesced heartbeat: %s", err)
   113  		}
   114  	}
   115  }
   116  
   117  // HandleRaftRequest dispatches a raft message to the appropriate Replica. It
   118  // requires that s.mu is not held.
   119  func (s *Store) HandleRaftRequest(
   120  	ctx context.Context, req *RaftMessageRequest, respStream RaftMessageResponseStream,
   121  ) *roachpb.Error {
   122  	// NB: unlike the other two RaftMessageHandler methods implemented by Store,
   123  	// this one doesn't need to directly run through a Stopper task because it
   124  	// delegates all work through a raftScheduler, whose workers' lifetimes are
   125  	// already tied to the Store's Stopper.
   126  	if len(req.Heartbeats)+len(req.HeartbeatResps) > 0 {
   127  		if req.RangeID != 0 {
   128  			log.Fatalf(ctx, "coalesced heartbeats must have rangeID == 0")
   129  		}
   130  		s.uncoalesceBeats(ctx, req.Heartbeats, req.FromReplica, req.ToReplica, raftpb.MsgHeartbeat, respStream)
   131  		s.uncoalesceBeats(ctx, req.HeartbeatResps, req.FromReplica, req.ToReplica, raftpb.MsgHeartbeatResp, respStream)
   132  		return nil
   133  	}
   134  	return s.HandleRaftUncoalescedRequest(ctx, req, respStream)
   135  }
   136  
   137  // HandleRaftUncoalescedRequest dispatches a raft message to the appropriate
   138  // Replica. It requires that s.mu is not held.
   139  func (s *Store) HandleRaftUncoalescedRequest(
   140  	ctx context.Context, req *RaftMessageRequest, respStream RaftMessageResponseStream,
   141  ) *roachpb.Error {
   142  
   143  	if len(req.Heartbeats)+len(req.HeartbeatResps) > 0 {
   144  		log.Fatalf(ctx, "HandleRaftUncoalescedRequest cannot be given coalesced heartbeats or heartbeat responses, received %s", req)
   145  	}
   146  	// Raft message metrics are bumped here, on the uncoalesced path, so that
   147  	// each heartbeat and heartbeat response contained in a coalesced request
   148  	// is counted individually by message type.
   149  	s.metrics.raftRcvdMessages[req.Message.Type].Inc(1)
   150  
   151  	value, ok := s.replicaQueues.Load(int64(req.RangeID))
   152  	if !ok {
   153  		value, _ = s.replicaQueues.LoadOrStore(int64(req.RangeID), unsafe.Pointer(&raftRequestQueue{}))
   154  	}
   155  	q := (*raftRequestQueue)(value)
   156  	q.Lock()
   157  	if len(q.infos) >= replicaRequestQueueSize {
   158  		q.Unlock()
   159  		// TODO(peter): Return an error indicating the request was dropped. Note
   160  		// that dropping the request is safe. Raft will retry.
   161  		s.metrics.RaftRcvdMsgDropped.Inc(1)
   162  		return nil
   163  	}
   164  	q.infos = append(q.infos, raftRequestInfo{
   165  		req:        req,
   166  		respStream: respStream,
   167  	})
   168  	first := len(q.infos) == 1
   169  	q.Unlock()
   170  
   171  	// processRequestQueue will process all infos in the slice each time it
   172  	// runs, so we only need to schedule a Raft request event if we added the
   173  	// first info in the slice. Everyone else can rely on the request that added
   174  	// the first info already having scheduled a Raft request event.
   175  	if first {
   176  		s.scheduler.EnqueueRaftRequest(req.RangeID)
   177  	}
   178  	return nil
   179  }
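
// HandleRaftUncoalescedRequest (producer) and processRequestQueue (consumer)
// form a notify-once pair: the producer schedules a Raft request event only
// when the queue transitions from empty to non-empty, and the consumer drains
// the whole queue per event. A stripped-down sketch of that handoff, with a
// hypothetical helper name used purely for illustration:
func enqueueAndMaybeNotify(q *raftRequestQueue, info raftRequestInfo, notify func()) {
	q.Lock()
	q.infos = append(q.infos, info)
	first := len(q.infos) == 1
	q.Unlock()
	if first {
		// Later producers piggyback on this single notification; the consumer
		// observes their entries when it empties q.infos.
		notify()
	}
}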
   180  
   181  // withReplicaForRequest calls the supplied function with the (lazily
   182  // initialized) Replica specified in the request. The replica passed to
   183  // the function will have its Replica.raftMu locked.
   184  func (s *Store) withReplicaForRequest(
   185  	ctx context.Context, req *RaftMessageRequest, f func(context.Context, *Replica) *roachpb.Error,
   186  ) *roachpb.Error {
   187  	// Lazily create the replica.
   188  	r, _, err := s.getOrCreateReplica(
   189  		ctx,
   190  		req.RangeID,
   191  		req.ToReplica.ReplicaID,
   192  		&req.FromReplica,
   193  		req.ToReplica.GetType() == roachpb.LEARNER,
   194  	)
   195  	if err != nil {
   196  		return roachpb.NewError(err)
   197  	}
   198  	defer r.raftMu.Unlock()
   199  	ctx = r.AnnotateCtx(ctx)
   200  	r.setLastReplicaDescriptors(req)
   201  	return f(ctx, r)
   202  }
   203  
   204  // processRaftRequestWithReplica processes the (non-snapshot) Raft request on
   205  // the specified replica. Notably, it does not handle updates to the Raft Ready
   206  // state; callers are responsible for handling any resulting Ready processing.
   207  func (s *Store) processRaftRequestWithReplica(
   208  	ctx context.Context, r *Replica, req *RaftMessageRequest,
   209  ) *roachpb.Error {
   210  	if verboseRaftLoggingEnabled() {
   211  		log.Infof(ctx, "incoming raft message:\n%s", raftDescribeMessage(req.Message, raftEntryFormatter))
   212  	}
   213  
   214  	if req.Message.Type == raftpb.MsgSnap {
   215  		log.Fatalf(ctx, "unexpected snapshot: %+v", req)
   216  	}
   217  
   218  	if req.Quiesce {
   219  		if req.Message.Type != raftpb.MsgHeartbeat {
   220  			log.Fatalf(ctx, "unexpected quiesce: %+v", req)
   221  		}
   222  		// If another replica tells us to quiesce, we verify that according to
   223  		// it, we are fully caught up, and that we believe it to be the leader.
   224  		// If we didn't do this, this replica could only unquiesce by means of
   225  		// an election, which means that the request prompting the unquiesce
   226  		// would end up with latency on the order of an election timeout.
   227  		//
   228  		// There are additional checks in quiesceLocked() that prevent us from
   229  		// quiescing if there's outstanding work.
   230  		r.mu.Lock()
   231  		status := r.raftBasicStatusRLocked()
   232  		ok := status.Term == req.Message.Term &&
   233  			status.Commit == req.Message.Commit &&
   234  			status.Lead == req.Message.From &&
   235  			r.quiesceLocked()
   236  		r.mu.Unlock()
   237  		if ok {
   238  			return nil
   239  		}
   240  		if log.V(4) {
   241  			log.Infof(ctx, "not quiescing: local raft status is %+v, incoming quiesce message is %+v", status, req.Message)
   242  		}
   243  	}
   244  
   245  	if req.ToReplica.ReplicaID == 0 {
   246  		log.VEventf(ctx, 1, "refusing incoming Raft message %s from %+v to %+v",
   247  			req.Message.Type, req.FromReplica, req.ToReplica)
   248  		return roachpb.NewErrorf(
   249  			"cannot recreate replica that is not a member of its range (StoreID %s not found in r%d)",
   250  			r.store.StoreID(), req.RangeID,
   251  		)
   252  	}
   253  
   254  	drop := maybeDropMsgApp(ctx, (*replicaMsgAppDropper)(r), &req.Message, req.RangeStartKey)
   255  	if !drop {
   256  		if err := r.stepRaftGroup(req); err != nil {
   257  			return roachpb.NewError(err)
   258  		}
   259  	}
   260  	return nil
   261  }
   262  
   263  // processRaftSnapshotRequest processes the incoming non-preemptive snapshot
   264  // Raft request on the request's specified replica. The function makes sure to
   265  // handle any updated Raft Ready state. It also adds and later removes the
   266  // (potentially) necessary placeholder to protect against concurrent access to
   267  // the keyspace encompassed by the snapshot but not yet guarded by the replica.
   268  func (s *Store) processRaftSnapshotRequest(
   269  	ctx context.Context, snapHeader *SnapshotRequest_Header, inSnap IncomingSnapshot,
   270  ) *roachpb.Error {
   271  	if snapHeader.IsPreemptive() {
   272  		return roachpb.NewError(errors.AssertionFailedf(`expected a raft or learner snapshot`))
   273  	}
   274  
   275  	return s.withReplicaForRequest(ctx, &snapHeader.RaftMessageRequest, func(
   276  		ctx context.Context, r *Replica,
   277  	) (pErr *roachpb.Error) {
   278  		if snapHeader.RaftMessageRequest.Message.Type != raftpb.MsgSnap {
   279  			log.Fatalf(ctx, "expected snapshot: %+v", snapHeader.RaftMessageRequest)
   280  		}
   281  
   282  		// Check to see if a snapshot can be applied. Snapshots can always be applied
   283  		// to initialized replicas. Note that if we add a placeholder we need to
   284  		// already be holding Replica.raftMu in order to prevent concurrent
   285  		// raft-ready processing of uninitialized replicas.
   286  		var addedPlaceholder bool
   287  		var removePlaceholder bool
   288  		if err := func() error {
   289  			s.mu.Lock()
   290  			defer s.mu.Unlock()
   291  			placeholder, err := s.canApplySnapshotLocked(ctx, snapHeader)
   292  			if err != nil {
   293  				// If the storage cannot accept the snapshot, return an
   294  				// error before passing it to RawNode.Step, since our
   295  				// error handling options past that point are limited.
   296  				log.Infof(ctx, "cannot apply snapshot: %s", err)
   297  				return err
   298  			}
   299  
   300  			if placeholder != nil {
   301  				// NB: The placeholder added here is either removed below, if the
   302  				// snapshot ends up not being applied, or consumed by the next call
   303  				// to Replica.handleRaftReady. Note that we can only get here if the
   304  				// replica doesn't exist or is uninitialized.
   305  				if err := s.addPlaceholderLocked(placeholder); err != nil {
   306  					log.Fatalf(ctx, "could not add vetted placeholder %s: %+v", placeholder, err)
   307  				}
   308  				addedPlaceholder = true
   309  			}
   310  			return nil
   311  		}(); err != nil {
   312  			return roachpb.NewError(err)
   313  		}
   314  
   315  		if addedPlaceholder {
   316  			// If we added a placeholder remove it before we return unless some other
   317  			// part of the code takes ownership of the removal (indicated by setting
   318  			// removePlaceholder to false).
   319  			removePlaceholder = true
   320  			defer func() {
   321  				if removePlaceholder {
   322  					if s.removePlaceholder(ctx, snapHeader.RaftMessageRequest.RangeID) {
   323  						atomic.AddInt32(&s.counts.removedPlaceholders, 1)
   324  					}
   325  				}
   326  			}()
   327  		}
   328  		// NB: we cannot get errRemoved here because we're promised by
   329  		// withReplicaForRequest that this replica is not currently being removed
   330  		// and we've been holding the raftMu the entire time.
   331  		if err := r.stepRaftGroup(&snapHeader.RaftMessageRequest); err != nil {
   332  			return roachpb.NewError(err)
   333  		}
   334  		_, expl, err := r.handleRaftReadyRaftMuLocked(ctx, inSnap)
   335  		maybeFatalOnRaftReadyErr(ctx, expl, err)
   336  		removePlaceholder = false
   337  		return nil
   338  	})
   339  }
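
// The addedPlaceholder/removePlaceholder bookkeeping above is an instance of
// an "armed cleanup" pattern: arm a deferred cleanup as soon as the resource
// exists, and disarm it only once a later step takes over responsibility for
// it. A stripped-down sketch of just that shape (runWithArmedCleanup is a
// hypothetical helper, shown for illustration only):
func runWithArmedCleanup(step func() error, cleanup func()) error {
	armed := true
	defer func() {
		if armed {
			cleanup() // the step never took ownership, so clean up here
		}
	}()
	if err := step(); err != nil {
		return err
	}
	armed = false // ownership handed off; the deferred cleanup is skipped
	return nil
}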
   340  
   341  // HandleRaftResponse implements the RaftMessageHandler interface. Per the
   342  // interface specification, an error is returned if and only if the underlying
   343  // Raft connection should be closed.
   344  // It requires that s.mu is not held.
   345  func (s *Store) HandleRaftResponse(ctx context.Context, resp *RaftMessageResponse) error {
   346  	ctx = s.AnnotateCtx(ctx)
   347  	const name = "storage.Store: handle raft response"
   348  	return s.stopper.RunTaskWithErr(ctx, name, func(ctx context.Context) error {
   349  		repl, replErr := s.GetReplica(resp.RangeID)
   350  		if replErr == nil {
   351  			// Best-effort context annotation of replica.
   352  			ctx = repl.AnnotateCtx(ctx)
   353  		}
   354  		switch val := resp.Union.GetValue().(type) {
   355  		case *roachpb.Error:
   356  			switch tErr := val.GetDetail().(type) {
   357  			case *roachpb.ReplicaTooOldError:
   358  				if replErr != nil {
   359  					// RangeNotFoundErrors are expected here; nothing else is.
   360  					if !errors.HasType(replErr, (*roachpb.RangeNotFoundError)(nil)) {
   361  						log.Errorf(ctx, "%v", replErr)
   362  					}
   363  					return nil
   364  				}
   365  
   366  				// Grab the raftMu in addition to the replica mu because
   367  				// removeReplicaRaftMuLocked below requires it.
   368  				repl.raftMu.Lock()
   369  				defer repl.raftMu.Unlock()
   370  				repl.mu.Lock()
   371  
   372  				// If the replica ID in the error does not match then we know
   373  				// that the replica has been removed and re-added quickly. In
   374  				// that case, we don't want to add it to the replicaGCQueue.
   375  				// If the replica is not alive then we also should ignore this error.
   376  				if tErr.ReplicaID != repl.mu.replicaID ||
   377  					!repl.mu.destroyStatus.IsAlive() ||
   378  					// Ignore if we want to test the replicaGC queue.
   379  					s.TestingKnobs().DisableEagerReplicaRemoval {
   380  					repl.mu.Unlock()
   381  					return nil
   382  				}
   383  
   384  				// The replica will be garbage collected soon (we are sure
   385  				// since our replicaID is definitely too old), but in the meantime we
   386  				// already want to bounce all traffic from it. Note that the replica
   387  				// could be re-added with a higher replicaID, but we want to clear the
   388  				// replica's data before that happens.
   389  				if log.V(1) {
   390  					log.Infof(ctx, "setting local replica to destroyed due to ReplicaTooOld error")
   391  				}
   392  
   393  				repl.mu.Unlock()
   394  				nextReplicaID := tErr.ReplicaID + 1
   395  				return s.removeReplicaRaftMuLocked(ctx, repl, nextReplicaID, RemoveOptions{
   396  					DestroyData: true,
   397  				})
   398  			case *roachpb.RaftGroupDeletedError:
   399  				if replErr != nil {
   400  					// RangeNotFoundErrors are expected here; nothing else is.
   401  					if !errors.HasType(replErr, (*roachpb.RangeNotFoundError)(nil)) {
   402  						log.Errorf(ctx, "%v", replErr)
   403  					}
   404  					return nil
   405  				}
   406  
   407  				// If the replica is talking to a replica that's been deleted, it must be
   408  				// out of date. While this may just mean it's slightly behind, it can
   409  				// also mean that it is so far behind it no longer knows where any of the
   410  				// other replicas are (#23994). Add it to the replica GC queue to do a
   411  				// proper check.
   412  				s.replicaGCQueue.AddAsync(ctx, repl, replicaGCPriorityDefault)
   413  			case *roachpb.StoreNotFoundError:
   414  				log.Warningf(ctx, "raft error: node %d claims to not contain store %d for replica %s: %s",
   415  					resp.FromReplica.NodeID, resp.FromReplica.StoreID, resp.FromReplica, val)
   416  				return val.GetDetail() // close Raft connection
   417  			default:
   418  				log.Warningf(ctx, "got error from r%d, replica %s: %s",
   419  					resp.RangeID, resp.FromReplica, val)
   420  			}
   421  		default:
   422  			log.Warningf(ctx, "got unknown raft response type %T from replica %s: %s", val, resp.FromReplica, val)
   423  		}
   424  		return nil
   425  	})
   426  }
   427  
   428  // enqueueRaftUpdateCheck asynchronously registers the given range ID to be
   429  // checked for raft updates when the processRaft goroutine is idle.
   430  func (s *Store) enqueueRaftUpdateCheck(rangeID roachpb.RangeID) {
   431  	s.scheduler.EnqueueRaftReady(rangeID)
   432  }
   433  
   434  func (s *Store) processRequestQueue(ctx context.Context, rangeID roachpb.RangeID) bool {
   435  	value, ok := s.replicaQueues.Load(int64(rangeID))
   436  	if !ok {
   437  		return false
   438  	}
   439  	q := (*raftRequestQueue)(value)
   440  	q.Lock()
   441  	infos := q.infos
   442  	q.infos = nil
   443  	q.Unlock()
   444  	if len(infos) == 0 {
   445  		return false
   446  	}
   447  
   448  	var hadError bool
   449  	for i := range infos {
   450  		info := &infos[i]
   451  		if pErr := s.withReplicaForRequest(
   452  			ctx, info.req, func(ctx context.Context, r *Replica) *roachpb.Error {
   453  				return s.processRaftRequestWithReplica(ctx, r, info.req)
   454  			},
   455  		); pErr != nil {
   456  			hadError = true
   457  			if err := info.respStream.Send(newRaftMessageResponse(info.req, pErr)); err != nil {
   458  				// Seems excessive to log this on every occurrence as the other side
   459  				// might have closed.
   460  				log.VEventf(ctx, 1, "error sending error: %s", err)
   461  			}
   462  		}
   463  	}
   464  
   465  	if hadError {
   466  		// If we're unable to process a request, consider dropping the request queue
   467  		// to free up space in the map.
   468  		// This is relevant if requests failed because the target replica could not
   469  		// be created (for example due to the Raft tombstone). The particular code
   470  		// here takes into account that we don't want to drop the queue if there
   471  		// are other messages waiting on it, or if the target replica exists. Raft
   472  		// tolerates the occasional dropped message, but our unit tests are less
   473  		// forgiving.
   474  		//
   475  		// See https://github.com/cockroachdb/cockroach/issues/30951#issuecomment-428010411.
   476  		if _, exists := s.mu.replicas.Load(int64(rangeID)); !exists {
   477  			q.Lock()
   478  			if len(q.infos) == 0 {
   479  				s.replicaQueues.Delete(int64(rangeID))
   480  			}
   481  			q.Unlock()
   482  		}
   483  	}
   484  
   485  	// NB: Even if we had errors and the corresponding replica no longer
   486  	// exists, returning true here won't cause a new, uninitialized replica
   487  	// to be created in processReady().
   488  	return true // ready
   489  }
   490  
   491  func (s *Store) processReady(ctx context.Context, rangeID roachpb.RangeID) {
   492  	value, ok := s.mu.replicas.Load(int64(rangeID))
   493  	if !ok {
   494  		return
   495  	}
   496  
   497  	r := (*Replica)(value)
   498  	ctx = r.AnnotateCtx(ctx)
   499  	start := timeutil.Now()
   500  	stats, expl, err := r.handleRaftReady(ctx, noSnap)
   501  	removed := maybeFatalOnRaftReadyErr(ctx, expl, err)
   502  	elapsed := timeutil.Since(start)
   503  	s.metrics.RaftWorkingDurationNanos.Inc(elapsed.Nanoseconds())
   504  	// Warn if Raft processing took too long. We use the same duration as we
   505  	// use for warning about excessive raft mutex lock hold times. Long
   506  	// processing time means we'll have starved local replicas of ticks and
   507  	// remote replicas will likely start campaigning.
   508  	if elapsed >= defaultReplicaRaftMuWarnThreshold {
   509  		log.Warningf(ctx, "handle raft ready: %.1fs [applied=%d, batches=%d, state_assertions=%d]",
   510  			elapsed.Seconds(), stats.entriesProcessed, stats.batchesProcessed, stats.stateAssertions)
   511  	}
   512  	if !removed && !r.IsInitialized() {
   513  		// Only an uninitialized replica can have a placeholder since, by
   514  		// definition, an initialized replica will be present in the
   515  		// replicasByKey map. While the replica will usually consume the
   516  		// placeholder itself, that isn't guaranteed and so this invocation
   517  		// here is crucial (i.e. don't remove it).
   518  		//
   519  		// We need to hold raftMu here to prevent removing a placeholder that is
   520  		// actively being used by Store.processRaftRequest.
   521  		r.raftMu.Lock()
   522  		if s.removePlaceholder(ctx, r.RangeID) {
   523  			atomic.AddInt32(&s.counts.droppedPlaceholders, 1)
   524  		}
   525  		r.raftMu.Unlock()
   526  	}
   527  }
   528  
   529  func (s *Store) processTick(ctx context.Context, rangeID roachpb.RangeID) bool {
   530  	value, ok := s.mu.replicas.Load(int64(rangeID))
   531  	if !ok {
   532  		return false
   533  	}
   534  	livenessMap, _ := s.livenessMap.Load().(IsLiveMap)
   535  
   536  	start := timeutil.Now()
   537  	r := (*Replica)(value)
   538  	exists, err := r.tick(livenessMap)
   539  	if err != nil {
   540  		log.Errorf(ctx, "%v", err)
   541  	}
   542  	s.metrics.RaftTickingDurationNanos.Inc(timeutil.Since(start).Nanoseconds())
   543  	return exists // ready
   544  }
   545  
   546  // nodeIsLiveCallback is invoked when a node transitions from non-live
   547  // to live. Iterate through all replicas and find any which belong to
   548  // ranges containing the implicated node. Unquiesce if currently
   549  // quiesced. Note that this mechanism can race with concurrent
   550  // invocations of processTick, which may have a copy of the previous
   551  // livenessMap where the now-live node is down. Those instances should
   552  // be rare, however, and we expect the newly live node to eventually
   553  // unquiesce the range.
   554  func (s *Store) nodeIsLiveCallback(nodeID roachpb.NodeID) {
   555  	s.updateLivenessMap()
   556  
   557  	s.mu.replicas.Range(func(k int64, v unsafe.Pointer) bool {
   558  		r := (*Replica)(v)
   559  		for _, rep := range r.Desc().Replicas().All() {
   560  			if rep.NodeID == nodeID {
   561  				r.unquiesce()
   562  			}
   563  		}
   564  		return true
   565  	})
   566  }
   567  
   568  func (s *Store) processRaft(ctx context.Context) {
   569  	if s.cfg.TestingKnobs.DisableProcessRaft {
   570  		return
   571  	}
   572  
   573  	s.scheduler.Start(ctx, s.stopper)
   574  	// Wait for the scheduler worker goroutines to finish.
   575  	s.stopper.RunWorker(ctx, s.scheduler.Wait)
   576  
   577  	s.stopper.RunWorker(ctx, s.raftTickLoop)
   578  	s.stopper.RunWorker(ctx, s.coalescedHeartbeatsLoop)
   579  	s.stopper.AddCloser(stop.CloserFn(func() {
   580  		s.cfg.Transport.Stop(s.StoreID())
   581  	}))
   582  }
   583  
   584  func (s *Store) raftTickLoop(ctx context.Context) {
   585  	ticker := time.NewTicker(s.cfg.RaftTickInterval)
   586  	defer ticker.Stop()
   587  
   588  	var rangeIDs []roachpb.RangeID
   589  
   590  	for {
   591  		select {
   592  		case <-ticker.C:
   593  			rangeIDs = rangeIDs[:0]
   594  			// Update the liveness map.
   595  			if s.cfg.NodeLiveness != nil {
   596  				s.updateLivenessMap()
   597  			}
   598  
   599  			s.unquiescedReplicas.Lock()
   600  			// Why do we bother to ever queue a Replica on the Raft scheduler for
   601  			// tick processing? Couldn't we just call Replica.tick() here? Yes, but
   602  			// then a single bad/slow Replica can disrupt tick processing for every
   603  			// Replica on the store which cascades into Raft elections and more
   604  			// disruption.
   605  			for rangeID := range s.unquiescedReplicas.m {
   606  				rangeIDs = append(rangeIDs, rangeID)
   607  			}
   608  			s.unquiescedReplicas.Unlock()
   609  
   610  			s.scheduler.EnqueueRaftTick(rangeIDs...)
   611  			s.metrics.RaftTicks.Inc(1)
   612  
   613  		case <-s.stopper.ShouldStop():
   614  			return
   615  		}
   616  	}
   617  }
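
// As the comment inside raftTickLoop explains, the loop only enqueues range
// IDs and leaves the per-replica tick work to the scheduler's workers. A toy
// sketch of that separation (dispatchTicks, collectIDs, and the drop-on-full
// behavior are simplifications for illustration, not how raftScheduler
// behaves): the ticking goroutine never blocks on a slow consumer, so one
// misbehaving replica cannot stall ticks for every other replica on the store.
func dispatchTicks(
	tick <-chan time.Time, collectIDs func() []roachpb.RangeID, work chan<- roachpb.RangeID,
) {
	for range tick {
		for _, id := range collectIDs() {
			select {
			case work <- id:
				// A worker will tick this range.
			default:
				// In this toy version a tick is dropped rather than letting a
				// full worker queue delay the next tick for everyone.
			}
		}
	}
}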
   618  
   619  func (s *Store) updateLivenessMap() {
   620  	nextMap := s.cfg.NodeLiveness.GetIsLiveMap()
   621  	for nodeID, entry := range nextMap {
   622  		if entry.IsLive {
   623  			// Make sure we ask all live nodes for closed timestamp updates.
   624  			s.cfg.ClosedTimestamp.Clients.EnsureClient(nodeID)
   625  			continue
   626  		}
   627  		// Liveness claims that this node is down, but ConnHealth gets the last say
   628  		// because we'd rather quiesce a range too little than too often. Note
   629  		// that this policy is different from the one governing the releasing of
   630  		// proposal quota; see comments over there.
   631  		//
   632  		// NB: This has false negatives. If a node doesn't have a conn open to it
   633  		// when ConnHealth is called, then ConnHealth will return
   634  		// rpc.ErrNotHeartbeated regardless of whether the node is up or not. That
   635  		// said, for the nodes that matter, we're likely talking to them via the
   636  		// Raft transport, so ConnHealth should usually indicate a real problem if
   637  		// it gives us an error back. The check can also have false positives if the
   638  		// node goes down after populating the map, but that matters even less.
   639  		entry.IsLive = (s.cfg.NodeDialer.ConnHealth(nodeID, rpc.SystemClass) == nil)
   640  		nextMap[nodeID] = entry
   641  	}
   642  	s.livenessMap.Store(nextMap)
   643  }
   644  
   645  // Since coalescing heartbeats adds latency to heartbeat messages, it is
   646  // beneficial to have this loop run on a faster cycle than once per tick, so
   647  // that the delay does not impact latency-sensitive features such as quiescence.
   648  func (s *Store) coalescedHeartbeatsLoop(ctx context.Context) {
   649  	ticker := time.NewTicker(s.cfg.CoalescedHeartbeatsInterval)
   650  	defer ticker.Stop()
   651  
   652  	for {
   653  		select {
   654  		case <-ticker.C:
   655  			s.sendQueuedHeartbeats(ctx)
   656  		case <-s.stopper.ShouldStop():
   657  			return
   658  		}
   659  	}
   660  }
   661  
   662  // sendQueuedHeartbeatsToNode sends the given coalesced heartbeats or heartbeat
   663  // responses to the given store. It returns the number of heartbeats that were sent.
   664  func (s *Store) sendQueuedHeartbeatsToNode(
   665  	ctx context.Context, beats, resps []RaftHeartbeat, to roachpb.StoreIdent,
   666  ) int {
   667  	var msgType raftpb.MessageType
   668  
   669  	if len(beats) == 0 && len(resps) == 0 {
   670  		return 0
   671  	} else if len(resps) == 0 {
   672  		msgType = raftpb.MsgHeartbeat
   673  	} else if len(beats) == 0 {
   674  		msgType = raftpb.MsgHeartbeatResp
   675  	} else {
   676  		log.Fatal(ctx, "cannot coalesce both heartbeats and responses")
   677  	}
   678  
   679  	chReq := newRaftMessageRequest()
   680  	*chReq = RaftMessageRequest{
   681  		RangeID: 0,
   682  		ToReplica: roachpb.ReplicaDescriptor{
   683  			NodeID:    to.NodeID,
   684  			StoreID:   to.StoreID,
   685  			ReplicaID: 0,
   686  		},
   687  		FromReplica: roachpb.ReplicaDescriptor{
   688  			NodeID:  s.Ident.NodeID,
   689  			StoreID: s.Ident.StoreID,
   690  		},
   691  		Message: raftpb.Message{
   692  			Type: msgType,
   693  		},
   694  		Heartbeats:     beats,
   695  		HeartbeatResps: resps,
   696  	}
   697  
   698  	if log.V(4) {
   699  		log.Infof(ctx, "sending raft request (coalesced) %+v", chReq)
   700  	}
   701  
   702  	if !s.cfg.Transport.SendAsync(chReq, rpc.SystemClass) {
   703  		for _, beat := range beats {
   704  			if value, ok := s.mu.replicas.Load(int64(beat.RangeID)); ok {
   705  				(*Replica)(value).addUnreachableRemoteReplica(beat.ToReplicaID)
   706  			}
   707  		}
   708  		for _, resp := range resps {
   709  			if value, ok := s.mu.replicas.Load(int64(resp.RangeID)); ok {
   710  				(*Replica)(value).addUnreachableRemoteReplica(resp.ToReplicaID)
   711  			}
   712  		}
   713  		return 0
   714  	}
   715  	return len(beats) + len(resps)
   716  }
   717  
   718  func (s *Store) sendQueuedHeartbeats(ctx context.Context) {
   719  	s.coalescedMu.Lock()
   720  	heartbeats := s.coalescedMu.heartbeats
   721  	heartbeatResponses := s.coalescedMu.heartbeatResponses
   722  	s.coalescedMu.heartbeats = map[roachpb.StoreIdent][]RaftHeartbeat{}
   723  	s.coalescedMu.heartbeatResponses = map[roachpb.StoreIdent][]RaftHeartbeat{}
   724  	s.coalescedMu.Unlock()
   725  
   726  	var beatsSent int
   727  
   728  	for to, beats := range heartbeats {
   729  		beatsSent += s.sendQueuedHeartbeatsToNode(ctx, beats, nil, to)
   730  	}
   731  	for to, resps := range heartbeatResponses {
   732  		beatsSent += s.sendQueuedHeartbeatsToNode(ctx, nil, resps, to)
   733  	}
   734  	s.metrics.RaftCoalescedHeartbeatsPending.Update(int64(beatsSent))
   735  }
   736  
   737  func (s *Store) updateCapacityGauges() error {
   738  	desc, err := s.Descriptor(false /* useCached */)
   739  	if err != nil {
   740  		return err
   741  	}
   742  	s.metrics.Capacity.Update(desc.Capacity.Capacity)
   743  	s.metrics.Available.Update(desc.Capacity.Available)
   744  	s.metrics.Used.Update(desc.Capacity.Used)
   745  
   746  	return nil
   747  }