github.com/nats-io/nats-server/v2@v2.11.0-preview.2/server/raft.go (about) 1 // Copyright 2020-2023 The NATS Authors 2 // Licensed under the Apache License, Version 2.0 (the "License"); 3 // you may not use this file except in compliance with the License. 4 // You may obtain a copy of the License at 5 // 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package server 15 16 import ( 17 "bytes" 18 "crypto/sha256" 19 "encoding/binary" 20 "errors" 21 "fmt" 22 "hash" 23 "math" 24 "math/rand" 25 "net" 26 "os" 27 "path/filepath" 28 "runtime" 29 "strings" 30 "sync" 31 "sync/atomic" 32 "time" 33 34 "github.com/nats-io/nats-server/v2/internal/fastrand" 35 36 "github.com/minio/highwayhash" 37 ) 38 39 type RaftNode interface { 40 Propose(entry []byte) error 41 ProposeDirect(entries []*Entry) error 42 ForwardProposal(entry []byte) error 43 InstallSnapshot(snap []byte) error 44 SendSnapshot(snap []byte) error 45 NeedSnapshot() bool 46 Applied(index uint64) (entries uint64, bytes uint64) 47 State() RaftState 48 Size() (entries, bytes uint64) 49 Progress() (index, commit, applied uint64) 50 Leader() bool 51 Quorum() bool 52 Current() bool 53 Healthy() bool 54 Term() uint64 55 GroupLeader() string 56 HadPreviousLeader() bool 57 StepDown(preferred ...string) error 58 SetObserver(isObserver bool) 59 IsObserver() bool 60 Campaign() error 61 ID() string 62 Group() string 63 Peers() []*Peer 64 UpdateKnownPeers(knownPeers []string) 65 ProposeAddPeer(peer string) error 66 ProposeRemovePeer(peer string) error 67 AdjustClusterSize(csz int) error 68 AdjustBootClusterSize(csz int) error 69 ClusterSize() int 70 ApplyQ() *ipQueue[*CommittedEntry] 71 PauseApply() error 72 ResumeApply() 73 LeadChangeC() <-chan bool 74 QuitC() <-chan struct{} 75 Created() time.Time 76 Stop() 77 Delete() 78 Wipe() 79 } 80 81 type WAL interface { 82 Type() StorageType 83 StoreMsg(subj string, hdr, msg []byte) (uint64, int64, error) 84 LoadMsg(index uint64, sm *StoreMsg) (*StoreMsg, error) 85 RemoveMsg(index uint64) (bool, error) 86 Compact(index uint64) (uint64, error) 87 Purge() (uint64, error) 88 Truncate(seq uint64) error 89 State() StreamState 90 FastState(*StreamState) 91 Stop() error 92 Delete() error 93 } 94 95 type Peer struct { 96 ID string 97 Current bool 98 Last time.Time 99 Lag uint64 100 } 101 102 type RaftState uint8 103 104 // Allowable states for a NATS Consensus Group. 105 const ( 106 Follower RaftState = iota 107 Leader 108 Candidate 109 Closed 110 ) 111 112 func (state RaftState) String() string { 113 switch state { 114 case Follower: 115 return "FOLLOWER" 116 case Candidate: 117 return "CANDIDATE" 118 case Leader: 119 return "LEADER" 120 case Closed: 121 return "CLOSED" 122 } 123 return "UNKNOWN" 124 } 125 126 type raft struct { 127 sync.RWMutex 128 129 created time.Time // Time that the group was created 130 accName string // Account name of the asset this raft group is for 131 group string // Raft group 132 sd string // Store directory 133 id string // Node ID 134 135 wal WAL // WAL store (filestore or memstore) 136 wtype StorageType // WAL type, e.g. 
FileStorage or MemoryStorage
	track    bool        // Whether we check JetStream storage resource limits (see outOfResources)
	werr     error       // Last write error

	state    atomic.Int32 // RaftState
	hh       hash.Hash64  // Highwayhash, used for snapshots
	snapfile string       // Snapshot filename

	csz   int             // Cluster size
	qn    int             // Number of nodes needed to establish quorum
	peers map[string]*lps // Other peers in the Raft group

	removed map[string]struct{}            // Peers that were removed from the group
	acks    map[uint64]map[string]struct{} // Append entry responses/acks, map of entry index -> peer ID
	pae     map[uint64]*appendEntry        // Pending append entries

	elect  *time.Timer // Election timer, normally accessed via electTimer
	active time.Time   // Last activity time, i.e. for heartbeats
	llqrt  time.Time   // Last quorum lost time
	lsut   time.Time   // Last scale-up time

	term     uint64 // The current vote term
	pterm    uint64 // Previous term from the last snapshot
	pindex   uint64 // Previous index from the last snapshot
	commit   uint64 // Sequence number of the most recent commit
	applied  uint64 // Sequence number of the most recently applied commit
	hcbehind bool   // Were we falling behind at the last health check? (see: isCurrent)

	leader string // The ID of the leader
	vote   string // Our current vote state
	lxfer  bool   // Are we doing a leadership transfer?

	s  *Server    // Reference to top-level server
	c  *client    // Internal client for subscriptions
	js *jetStream // JetStream, if running, to see if we are out of resources

	dflag    bool           // Debug flag
	pleader  bool           // Has the group ever had a leader?
	observer bool           // The node is observing, i.e. not participating in voting
	extSt    extensionState // Extension state

	psubj  string // Proposals subject
	rpsubj string // Remove peers subject
	vsubj  string // Vote requests subject
	vreply string // Vote responses subject
	asubj  string // Append entries subject
	areply string // Append entries responses subject

	sq    *sendq        // Send queue for outbound RPC messages
	aesub *subscription // Subscription for handleAppendEntry callbacks

	wtv []byte // Term and vote to be written
	wps []byte // Peer state to be written

	catchup  *catchupState               // For when we need to catch up as a follower.
	progress map[string]*ipQueue[uint64] // For leader or server catching up a follower.

	paused    bool   // Whether or not applies are paused
	hcommit   uint64 // The commit at the time that applies were paused
	pobserver bool   // Whether we were an observer at the time that applies were paused

	prop     *ipQueue[*Entry]               // Proposals
	entry    *ipQueue[*appendEntry]         // Append entries
	resp     *ipQueue[*appendEntryResponse] // Append entries responses
	apply    *ipQueue[*CommittedEntry]      // Apply queue (committed entries to be passed to upper layer)
	reqs     *ipQueue[*voteRequest]         // Vote requests
	votes    *ipQueue[*voteResponse]        // Vote responses
	stepdown *ipQueue[string]               // Stepdown requests
	leadc    chan bool                      // Leader changes
	quit     chan struct{}                  // Raft group shutdown
}

// catchupState holds our subscription, the catchup term and index,
// as well as the starting term and index and how many updates we have seen.
type catchupState struct {
	sub    *subscription // Subscription that catchup messages will arrive on
	cterm  uint64        // Catchup term
	cindex uint64        // Catchup index
	pterm  uint64        // Starting term
	pindex uint64        // Starting index
	active time.Time     // Last time we received a message for this catchup
}

// lps holds peer state of last time and last index replicated.
type lps struct {
	ts int64  // Last timestamp
	li uint64 // Last index replicated
	kp bool   // Known peer
}

const (
	minElectionTimeoutDefault      = 4 * time.Second
	maxElectionTimeoutDefault      = 9 * time.Second
	minCampaignTimeoutDefault      = 100 * time.Millisecond
	maxCampaignTimeoutDefault      = 8 * minCampaignTimeoutDefault
	hbIntervalDefault              = 1 * time.Second
	lostQuorumIntervalDefault      = hbIntervalDefault * 10 // 10 seconds
	lostQuorumCheckIntervalDefault = hbIntervalDefault * 10 // 10 seconds
)

var (
	minElectionTimeout = minElectionTimeoutDefault
	maxElectionTimeout = maxElectionTimeoutDefault
	minCampaignTimeout = minCampaignTimeoutDefault
	maxCampaignTimeout = maxCampaignTimeoutDefault
	hbInterval         = hbIntervalDefault
	lostQuorumInterval = lostQuorumIntervalDefault
	lostQuorumCheck    = lostQuorumCheckIntervalDefault
)

type RaftConfig struct {
	Name     string
	Store    string
	Log      WAL
	Track    bool
	Observer bool
}

var (
	errNotLeader         = errors.New("raft: not leader")
	errAlreadyLeader     = errors.New("raft: already leader")
	errNilCfg            = errors.New("raft: no config given")
	errCorruptPeers      = errors.New("raft: corrupt peer state")
	errEntryLoadFailed   = errors.New("raft: could not load entry from WAL")
	errEntryStoreFailed  = errors.New("raft: could not store entry to WAL")
	errNodeClosed        = errors.New("raft: node is closed")
	errBadSnapName       = errors.New("raft: snapshot name could not be parsed")
	errNoSnapAvailable   = errors.New("raft: no snapshot available")
	errCatchupsRunning   = errors.New("raft: snapshot can not be installed while catchups running")
	errSnapshotCorrupt   = errors.New("raft: snapshot corrupt")
	errTooManyPrefs      = errors.New("raft: stepdown requires at most one preferred new leader")
	errNoPeerState       = errors.New("raft: no peerstate")
	errAdjustBootCluster = errors.New("raft: can not adjust boot peer size on established group")
	errLeaderLen         = fmt.Errorf("raft: leader should be exactly %d bytes", idLen)
	errTooManyEntries    = errors.New("raft: append entry can contain a max of 64k entries")
	errBadAppendEntry    = errors.New("raft: append entry corrupt")
)

// This will bootstrap a raftNode by writing its config into the store directory.
func (s *Server) bootstrapRaftNode(cfg *RaftConfig, knownPeers []string, allPeersKnown bool) error {
	if cfg == nil {
		return errNilCfg
	}
	// Check validity of peers if presented.
	for _, p := range knownPeers {
		if len(p) != idLen {
			return fmt.Errorf("raft: illegal peer: %q", p)
		}
	}
	expected := len(knownPeers)
	// We need to adjust this if not all peers are known.
	if !allPeersKnown {
		s.Debugf("Determining expected peer size for JetStream meta group")
		if expected < 2 {
			expected = 2
		}
		opts := s.getOpts()
		nrs := len(opts.Routes)

		cn := s.ClusterName()
		ngwps := 0
		for _, gw := range opts.Gateway.Gateways {
			// Ignore our own cluster if specified.
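			// -----------------------------------------------------------------
			// Aside (illustrative, not part of the original source; the numbers
			// are made up): with one known peer, allPeersKnown == false, four
			// configured routes and one remote gateway whose URL resolves to two
			// addresses, the estimate works out as:
			//
			//	expected = len(knownPeers)  // 1
			//	expected = 2                // floored to the bootstrap minimum
			//	nrs      = len(opts.Routes) // 4
			//	ngwps    = 2                // remote gateway addresses
			//	expected = nrs + ngwps      // 6, since 6 > 2
			//
			// The loop below accumulates ngwps, skipping our own cluster.
			// -----------------------------------------------------------------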
299 if gw.Name == cn { 300 continue 301 } 302 for _, u := range gw.URLs { 303 host := u.Hostname() 304 // If this is an IP just add one. 305 if net.ParseIP(host) != nil { 306 ngwps++ 307 } else { 308 addrs, _ := net.LookupHost(host) 309 ngwps += len(addrs) 310 } 311 } 312 } 313 314 if expected < nrs+ngwps { 315 expected = nrs + ngwps 316 s.Debugf("Adjusting expected peer set size to %d with %d known", expected, len(knownPeers)) 317 } 318 } 319 320 // Check the store directory. If we have a memory based WAL we need to make sure the directory is setup. 321 if stat, err := os.Stat(cfg.Store); os.IsNotExist(err) { 322 if err := os.MkdirAll(cfg.Store, 0750); err != nil { 323 return fmt.Errorf("raft: could not create storage directory - %v", err) 324 } 325 } else if stat == nil || !stat.IsDir() { 326 return fmt.Errorf("raft: storage directory is not a directory") 327 } 328 tmpfile, err := os.CreateTemp(cfg.Store, "_test_") 329 if err != nil { 330 return fmt.Errorf("raft: storage directory is not writable") 331 } 332 tmpfile.Close() 333 os.Remove(tmpfile.Name()) 334 335 return writePeerState(cfg.Store, &peerState{knownPeers, expected, extUndetermined}) 336 } 337 338 // startRaftNode will start the raft node. 339 func (s *Server) startRaftNode(accName string, cfg *RaftConfig, labels pprofLabels) (RaftNode, error) { 340 if cfg == nil { 341 return nil, errNilCfg 342 } 343 s.mu.RLock() 344 if s.sys == nil { 345 s.mu.RUnlock() 346 return nil, ErrNoSysAccount 347 } 348 sq := s.sys.sq 349 sacc := s.sys.account 350 hash := s.sys.shash 351 s.mu.RUnlock() 352 353 // Do this here to process error quicker. 354 ps, err := readPeerState(cfg.Store) 355 if err != nil { 356 return nil, err 357 } 358 if ps == nil { 359 return nil, errNoPeerState 360 } 361 362 qpfx := fmt.Sprintf("[ACC:%s] RAFT '%s' ", accName, cfg.Name) 363 n := &raft{ 364 created: time.Now(), 365 id: hash[:idLen], 366 group: cfg.Name, 367 sd: cfg.Store, 368 wal: cfg.Log, 369 wtype: cfg.Log.Type(), 370 track: cfg.Track, 371 csz: ps.clusterSize, 372 qn: ps.clusterSize/2 + 1, 373 peers: make(map[string]*lps), 374 acks: make(map[uint64]map[string]struct{}), 375 pae: make(map[uint64]*appendEntry), 376 s: s, 377 c: s.createInternalSystemClient(), 378 js: s.getJetStream(), 379 sq: sq, 380 quit: make(chan struct{}), 381 reqs: newIPQueue[*voteRequest](s, qpfx+"vreq"), 382 votes: newIPQueue[*voteResponse](s, qpfx+"vresp"), 383 prop: newIPQueue[*Entry](s, qpfx+"entry"), 384 entry: newIPQueue[*appendEntry](s, qpfx+"appendEntry"), 385 resp: newIPQueue[*appendEntryResponse](s, qpfx+"appendEntryResponse"), 386 apply: newIPQueue[*CommittedEntry](s, qpfx+"committedEntry"), 387 stepdown: newIPQueue[string](s, qpfx+"stepdown"), 388 accName: accName, 389 leadc: make(chan bool, 1), 390 observer: cfg.Observer, 391 extSt: ps.domainExt, 392 } 393 n.c.registerWithAccount(sacc) 394 395 if atomic.LoadInt32(&s.logging.debug) > 0 { 396 n.dflag = true 397 } 398 399 // Set up the highwayhash for the snapshots. 400 key := sha256.Sum256([]byte(n.group)) 401 n.hh, _ = highwayhash.New64(key[:]) 402 403 // If we have a term and vote file (tav.idx on the filesystem) then read in 404 // what we think the term and vote was. It's possible these are out of date 405 // so a catch-up may be required. 406 if term, vote, err := n.readTermVote(); err == nil && term > 0 { 407 n.term = term 408 n.vote = vote 409 } 410 411 // Make sure that the snapshots directory exists. 
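	// -----------------------------------------------------------------
	// Aside (illustrative, not part of the original source): by this point the
	// node's durable state consists of:
	//
	//	- the peer state written into cfg.Store by writePeerState at bootstrap
	//	  and read back via readPeerState above,
	//	- the term and vote state ("tav.idx") consulted by readTermVote above,
	//	- the WAL supplied as cfg.Log (filestore or memstore), and
	//	- the snapshots directory created just below, holding files named
	//	  "snap.<term>.<index>".
	//
	// -----------------------------------------------------------------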
412 if err := os.MkdirAll(filepath.Join(n.sd, snapshotsDir), 0750); err != nil { 413 return nil, fmt.Errorf("could not create snapshots directory - %v", err) 414 } 415 416 // Can't recover snapshots if memory based. 417 if _, ok := n.wal.(*memStore); ok { 418 os.Remove(filepath.Join(n.sd, snapshotsDir, "*")) 419 } else { 420 // See if we have any snapshots and if so load and process on startup. 421 n.setupLastSnapshot() 422 } 423 424 truncateAndErr := func(index uint64) { 425 if err := n.wal.Truncate(index); err != nil { 426 n.setWriteErr(err) 427 } 428 } 429 430 // Retrieve the stream state from the WAL. If there are pending append 431 // entries that were committed but not applied before we last shut down, 432 // we will try to replay them and process them here. 433 var state StreamState 434 n.wal.FastState(&state) 435 if state.Msgs > 0 { 436 n.debug("Replaying state of %d entries", state.Msgs) 437 if first, err := n.loadFirstEntry(); err == nil { 438 n.pterm, n.pindex = first.pterm, first.pindex 439 if first.commit > 0 && first.commit > n.commit { 440 n.commit = first.commit 441 } 442 } 443 444 // This process will queue up entries on our applied queue but prior to the upper 445 // state machine running. So we will monitor how much we have queued and if we 446 // reach a limit will pause the apply queue and resume inside of run() go routine. 447 const maxQsz = 32 * 1024 * 1024 // 32MB max 448 449 // It looks like there are entries we have committed but not applied 450 // yet. Replay them. 451 for index, qsz := state.FirstSeq, 0; index <= state.LastSeq; index++ { 452 ae, err := n.loadEntry(index) 453 if err != nil { 454 n.warn("Could not load %d from WAL [%+v]: %v", index, state, err) 455 truncateAndErr(index) 456 break 457 } 458 if ae.pindex != index-1 { 459 n.warn("Corrupt WAL, will truncate") 460 truncateAndErr(index) 461 break 462 } 463 n.processAppendEntry(ae, nil) 464 // Check how much we have queued up so far to determine if we should pause. 465 for _, e := range ae.entries { 466 qsz += len(e.Data) 467 if qsz > maxQsz && !n.paused { 468 n.PauseApply() 469 } 470 } 471 } 472 } 473 474 // Make sure to track ourselves. 475 n.peers[n.id] = &lps{time.Now().UnixNano(), 0, true} 476 477 // Track known peers 478 for _, peer := range ps.knownPeers { 479 if peer != n.id { 480 // Set these to 0 to start but mark as known peer. 481 n.peers[peer] = &lps{0, 0, true} 482 } 483 } 484 485 // Setup our internal subscriptions for proposals, votes and append entries. 486 // If we fail to do this for some reason then this is fatal — we cannot 487 // continue setting up or the Raft node may be partially/totally isolated. 488 if err := n.createInternalSubs(); err != nil { 489 n.shutdown(true) 490 return nil, err 491 } 492 493 n.debug("Started") 494 495 // Check if we need to start in observer mode due to lame duck status. 496 // This will stop us from taking on the leader role when we're about to 497 // shutdown anyway. 498 if s.isLameDuckMode() { 499 n.debug("Will start in observer mode due to lame duck status") 500 n.SetObserver(true) 501 } 502 503 // Set the election timer and lost quorum timers to now, so that we 504 // won't accidentally trigger either state without knowing the real state 505 // of the other nodes. 506 n.Lock() 507 n.resetElectionTimeout() 508 n.llqrt = time.Now() 509 n.Unlock() 510 511 // Register the Raft group. 512 labels["group"] = n.group 513 s.registerRaftNode(n.group, n) 514 515 // Start the run goroutine for the Raft state machine. 
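	// -----------------------------------------------------------------
	// Aside (illustrative sketch, not part of the original source): once run()
	// is going, an upper layer typically drains ApplyQ() and acknowledges with
	// Applied(). applyEntry and node are hypothetical names here:
	//
	//	for {
	//		select {
	//		case <-node.ApplyQ().ch:
	//			ces := node.ApplyQ().pop()
	//			for _, ce := range ces {
	//				if ce == nil {
	//					continue // nil marks the end of replay/restore
	//				}
	//				applyEntry(ce)
	//				node.Applied(ce.Index)
	//				ce.ReturnToPool()
	//			}
	//			node.ApplyQ().recycle(&ces)
	//		case <-node.QuitC():
	//			return
	//		}
	//	}
	//
	// The run goroutine started below drives the follower/candidate/leader loops.
	// -----------------------------------------------------------------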
516 s.startGoRoutine(n.run, labels) 517 518 return n, nil 519 } 520 521 // outOfResources checks to see if we are out of resources. 522 func (n *raft) outOfResources() bool { 523 js := n.js 524 if !n.track || js == nil { 525 return false 526 } 527 return js.limitsExceeded(n.wtype) 528 } 529 530 // Maps node names back to server names. 531 func (s *Server) serverNameForNode(node string) string { 532 if si, ok := s.nodeToInfo.Load(node); ok && si != nil { 533 return si.(nodeInfo).name 534 } 535 return _EMPTY_ 536 } 537 538 // Maps node names back to cluster names. 539 func (s *Server) clusterNameForNode(node string) string { 540 if si, ok := s.nodeToInfo.Load(node); ok && si != nil { 541 return si.(nodeInfo).cluster 542 } 543 return _EMPTY_ 544 } 545 546 // Registers the Raft node with the server, as it will track all of the Raft 547 // nodes. 548 func (s *Server) registerRaftNode(group string, n RaftNode) { 549 s.rnMu.Lock() 550 defer s.rnMu.Unlock() 551 if s.raftNodes == nil { 552 s.raftNodes = make(map[string]RaftNode) 553 } 554 s.raftNodes[group] = n 555 } 556 557 // Unregisters the Raft node from the server, i.e. at shutdown. 558 func (s *Server) unregisterRaftNode(group string) { 559 s.rnMu.Lock() 560 defer s.rnMu.Unlock() 561 if s.raftNodes != nil { 562 delete(s.raftNodes, group) 563 } 564 } 565 566 // Returns how many Raft nodes are running in this server instance. 567 func (s *Server) numRaftNodes() int { 568 s.rnMu.Lock() 569 defer s.rnMu.Unlock() 570 return len(s.raftNodes) 571 } 572 573 // Finds the Raft node for a given Raft group, if any. If there is no Raft node 574 // running for this group then it can return nil. 575 func (s *Server) lookupRaftNode(group string) RaftNode { 576 s.rnMu.RLock() 577 defer s.rnMu.RUnlock() 578 var n RaftNode 579 if s.raftNodes != nil { 580 n = s.raftNodes[group] 581 } 582 return n 583 } 584 585 // Reloads the debug state for all running Raft nodes. This is necessary when 586 // the configuration has been reloaded and the debug log level has changed. 587 func (s *Server) reloadDebugRaftNodes(debug bool) { 588 if s == nil { 589 return 590 } 591 s.rnMu.RLock() 592 for _, ni := range s.raftNodes { 593 n := ni.(*raft) 594 n.Lock() 595 n.dflag = debug 596 n.Unlock() 597 } 598 s.rnMu.RUnlock() 599 } 600 601 // Requests that all Raft nodes on this server step down and place them into 602 // observer mode. This is called when the server is shutting down. 603 func (s *Server) stepdownRaftNodes() { 604 if s == nil { 605 return 606 } 607 s.rnMu.RLock() 608 if len(s.raftNodes) == 0 { 609 s.rnMu.RUnlock() 610 return 611 } 612 s.Debugf("Stepping down all leader raft nodes") 613 nodes := make([]RaftNode, 0, len(s.raftNodes)) 614 for _, n := range s.raftNodes { 615 nodes = append(nodes, n) 616 } 617 s.rnMu.RUnlock() 618 619 for _, node := range nodes { 620 if node.Leader() { 621 node.StepDown() 622 } 623 node.SetObserver(true) 624 } 625 } 626 627 // Shuts down all Raft nodes on this server. This is called either when the 628 // server is either entering lame duck mode, shutting down or when JetStream 629 // has been disabled. 
630 func (s *Server) shutdownRaftNodes() { 631 if s == nil { 632 return 633 } 634 s.rnMu.RLock() 635 if len(s.raftNodes) == 0 { 636 s.rnMu.RUnlock() 637 return 638 } 639 nodes := make([]RaftNode, 0, len(s.raftNodes)) 640 s.Debugf("Shutting down all raft nodes") 641 for _, n := range s.raftNodes { 642 nodes = append(nodes, n) 643 } 644 s.rnMu.RUnlock() 645 646 for _, node := range nodes { 647 node.Stop() 648 } 649 } 650 651 // Used in lameduck mode to move off the leaders. 652 // We also put all nodes in observer mode so new leaders 653 // can not be placed on this server. 654 func (s *Server) transferRaftLeaders() bool { 655 if s == nil { 656 return false 657 } 658 s.rnMu.RLock() 659 if len(s.raftNodes) == 0 { 660 s.rnMu.RUnlock() 661 return false 662 } 663 nodes := make([]RaftNode, 0, len(s.raftNodes)) 664 for _, n := range s.raftNodes { 665 nodes = append(nodes, n) 666 } 667 s.rnMu.RUnlock() 668 669 var didTransfer bool 670 for _, node := range nodes { 671 if node.Leader() { 672 node.StepDown() 673 didTransfer = true 674 } 675 node.SetObserver(true) 676 } 677 return didTransfer 678 } 679 680 // Formal API 681 682 // Propose will propose a new entry to the group. 683 // This should only be called on the leader. 684 func (n *raft) Propose(data []byte) error { 685 if state := n.State(); state != Leader { 686 n.debug("Proposal ignored, not leader (state: %v)", state) 687 return errNotLeader 688 } 689 n.RLock() 690 // Error if we had a previous write error. 691 if werr := n.werr; werr != nil { 692 n.RUnlock() 693 return werr 694 } 695 prop := n.prop 696 n.RUnlock() 697 698 prop.push(newEntry(EntryNormal, data)) 699 return nil 700 } 701 702 // ProposeDirect will propose entries directly by skipping the Raft state 703 // machine and sending them straight to the wire instead. 704 // This should only be called on the leader. 705 func (n *raft) ProposeDirect(entries []*Entry) error { 706 if state := n.State(); state != Leader { 707 n.debug("Direct proposal ignored, not leader (state: %v)", state) 708 return errNotLeader 709 } 710 n.RLock() 711 // Error if we had a previous write error. 712 if werr := n.werr; werr != nil { 713 n.RUnlock() 714 return werr 715 } 716 n.RUnlock() 717 718 n.sendAppendEntry(entries) 719 return nil 720 } 721 722 // ForwardProposal will forward the proposal to the leader if known. 723 // If we are the leader this is the same as calling propose. 724 // FIXME(dlc) - We could have a reply subject and wait for a response 725 // for retries, but would need to not block and be in separate Go routine. 726 func (n *raft) ForwardProposal(entry []byte) error { 727 if n.Leader() { 728 return n.Propose(entry) 729 } 730 731 n.sendRPC(n.psubj, _EMPTY_, entry) 732 return nil 733 } 734 735 // ProposeAddPeer is called to add a peer to the group. 736 func (n *raft) ProposeAddPeer(peer string) error { 737 if n.State() != Leader { 738 return errNotLeader 739 } 740 n.RLock() 741 // Error if we had a previous write error. 742 if werr := n.werr; werr != nil { 743 n.RUnlock() 744 return werr 745 } 746 prop := n.prop 747 n.RUnlock() 748 749 prop.push(newEntry(EntryAddPeer, []byte(peer))) 750 return nil 751 } 752 753 // As a leader if we are proposing to remove a peer assume its already gone. 
754 func (n *raft) doRemovePeerAsLeader(peer string) { 755 n.Lock() 756 if n.removed == nil { 757 n.removed = map[string]struct{}{} 758 } 759 n.removed[peer] = struct{}{} 760 if _, ok := n.peers[peer]; ok { 761 delete(n.peers, peer) 762 // We should decrease our cluster size since we are tracking this peer and the peer is most likely already gone. 763 n.adjustClusterSizeAndQuorum() 764 } 765 n.Unlock() 766 } 767 768 // ProposeRemovePeer is called to remove a peer from the group. 769 func (n *raft) ProposeRemovePeer(peer string) error { 770 n.RLock() 771 prop, subj := n.prop, n.rpsubj 772 isLeader := n.State() == Leader 773 werr := n.werr 774 n.RUnlock() 775 776 // Error if we had a previous write error. 777 if werr != nil { 778 return werr 779 } 780 781 // If we are the leader then we are responsible for processing the 782 // peer remove and then notifying the rest of the group that the 783 // peer was removed. 784 if isLeader { 785 prop.push(newEntry(EntryRemovePeer, []byte(peer))) 786 n.doRemovePeerAsLeader(peer) 787 return nil 788 } 789 790 // Otherwise we need to forward the proposal to the leader. 791 n.sendRPC(subj, _EMPTY_, []byte(peer)) 792 return nil 793 } 794 795 // ClusterSize reports back the total cluster size. 796 // This effects quorum etc. 797 func (n *raft) ClusterSize() int { 798 n.Lock() 799 defer n.Unlock() 800 return n.csz 801 } 802 803 // AdjustBootClusterSize can be called to adjust the boot cluster size. 804 // Will error if called on a group with a leader or a previous leader. 805 // This can be helpful in mixed mode. 806 func (n *raft) AdjustBootClusterSize(csz int) error { 807 n.Lock() 808 defer n.Unlock() 809 810 if n.leader != noLeader || n.pleader { 811 return errAdjustBootCluster 812 } 813 // Same floor as bootstrap. 814 if csz < 2 { 815 csz = 2 816 } 817 // Adjust the cluster size and the number of nodes needed to establish 818 // a quorum. 819 n.csz = csz 820 n.qn = n.csz/2 + 1 821 822 return nil 823 } 824 825 // AdjustClusterSize will change the cluster set size. 826 // Must be the leader. 827 func (n *raft) AdjustClusterSize(csz int) error { 828 if n.State() != Leader { 829 return errNotLeader 830 } 831 n.Lock() 832 // Same floor as bootstrap. 833 if csz < 2 { 834 csz = 2 835 } 836 837 // Adjust the cluster size and the number of nodes needed to establish 838 // a quorum. 839 n.csz = csz 840 n.qn = n.csz/2 + 1 841 n.Unlock() 842 843 n.sendPeerState() 844 return nil 845 } 846 847 // PauseApply will allow us to pause processing of append entries onto our 848 // external apply queue. In effect this means that the upper layer will no longer 849 // receive any new entries from the Raft group. 850 func (n *raft) PauseApply() error { 851 if n.State() == Leader { 852 return errAlreadyLeader 853 } 854 855 n.Lock() 856 defer n.Unlock() 857 858 // If we are currently a candidate make sure we step down. 859 if n.State() == Candidate { 860 n.stepdown.push(noLeader) 861 } 862 863 n.debug("Pausing our apply channel") 864 n.paused = true 865 n.hcommit = n.commit 866 // Also prevent us from trying to become a leader while paused and catching up. 867 n.pobserver, n.observer = n.observer, true 868 n.resetElect(48 * time.Hour) 869 870 return nil 871 } 872 873 // ResumeApply will resume sending applies to the external apply queue. This 874 // means that we will start sending new entries to the upper layer. 
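// For example (an illustrative sketch, not part of the original source), a
// caller that needs to install upper-layer state it received out of band can
// bracket that work so no new commits are delivered while it runs:
//
//	if err := n.PauseApply(); err == nil {
//		// ... install/replace upper-layer state here ...
//		n.ResumeApply()
//	}
//
// PauseApply returns errAlreadyLeader when called on the leader, so this
// bracket only applies on followers.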
875 func (n *raft) ResumeApply() { 876 n.Lock() 877 defer n.Unlock() 878 879 if !n.paused { 880 return 881 } 882 883 n.debug("Resuming our apply channel") 884 n.observer, n.pobserver = n.pobserver, false 885 n.paused = false 886 // Run catchup.. 887 if n.hcommit > n.commit { 888 n.debug("Resuming %d replays", n.hcommit+1-n.commit) 889 for index := n.commit + 1; index <= n.hcommit; index++ { 890 if err := n.applyCommit(index); err != nil { 891 n.warn("Got error on apply commit during replay: %v", err) 892 break 893 } 894 // We want to unlock here to allow the upper layers to call Applied() without blocking. 895 n.Unlock() 896 // Give hint to let other Go routines run. 897 // Might not be necessary but seems to make it more fine grained interleaving. 898 runtime.Gosched() 899 // Simply re-acquire 900 n.Lock() 901 // Need to check if we got closed or if we were paused again. 902 if n.State() == Closed || n.paused { 903 return 904 } 905 } 906 } 907 n.hcommit = 0 908 909 // If we had been selected to be the next leader campaign here now that we have resumed. 910 if n.lxfer { 911 n.xferCampaign() 912 } else { 913 n.resetElectionTimeout() 914 } 915 } 916 917 // Applied is a callback that must be called by the upper layer when it 918 // has successfully applied the committed entries that it received from the 919 // apply queue. It will return the number of entries and an estimation of the 920 // byte size that could be removed with a snapshot/compact. 921 func (n *raft) Applied(index uint64) (entries uint64, bytes uint64) { 922 n.Lock() 923 defer n.Unlock() 924 925 // Ignore if not applicable. This can happen during a reset. 926 if index > n.commit { 927 return 0, 0 928 } 929 930 // Ignore if already applied. 931 if index > n.applied { 932 n.applied = index 933 } 934 935 // Calculate the number of entries and estimate the byte size that 936 // we can now remove with a compaction/snapshot. 937 var state StreamState 938 n.wal.FastState(&state) 939 if n.applied > state.FirstSeq { 940 entries = n.applied - state.FirstSeq 941 } 942 if state.Msgs > 0 { 943 bytes = entries * state.Bytes / state.Msgs 944 } 945 return entries, bytes 946 } 947 948 // For capturing data needed by snapshot. 949 type snapshot struct { 950 lastTerm uint64 951 lastIndex uint64 952 peerstate []byte 953 data []byte 954 } 955 956 const minSnapshotLen = 28 957 958 // Encodes a snapshot into a buffer for storage. 959 // Lock should be held. 960 func (n *raft) encodeSnapshot(snap *snapshot) []byte { 961 if snap == nil { 962 return nil 963 } 964 var le = binary.LittleEndian 965 buf := make([]byte, minSnapshotLen+len(snap.peerstate)+len(snap.data)) 966 le.PutUint64(buf[0:], snap.lastTerm) 967 le.PutUint64(buf[8:], snap.lastIndex) 968 // Peer state 969 le.PutUint32(buf[16:], uint32(len(snap.peerstate))) 970 wi := 20 971 copy(buf[wi:], snap.peerstate) 972 wi += len(snap.peerstate) 973 // data itself. 974 copy(buf[wi:], snap.data) 975 wi += len(snap.data) 976 977 // Now do the hash for the end. 978 n.hh.Reset() 979 n.hh.Write(buf[:wi]) 980 checksum := n.hh.Sum(nil) 981 copy(buf[wi:], checksum) 982 wi += len(checksum) 983 return buf[:wi] 984 } 985 986 // SendSnapshot will send the latest snapshot as a normal AE. 987 // Should only be used when the upper layers know this is most recent. 988 // Used when restoring streams, moving a stream from R1 to R>1, etc. 
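// For example (an illustrative sketch, not part of the original source; the
// encoder is hypothetical), an upper layer that has just rebuilt its complete
// state might push it to followers as a single snapshot entry:
//
//	data := encodeStreamState() // hypothetical upper-layer encoder
//	if err := n.SendSnapshot(data); err != nil {
//		// handle error
//	}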
989 func (n *raft) SendSnapshot(data []byte) error { 990 n.sendAppendEntry([]*Entry{{EntrySnapshot, data}}) 991 return nil 992 } 993 994 // Used to install a snapshot for the given term and applied index. This will release 995 // all of the log entries up to and including index. This should not be called with 996 // entries that have been applied to the FSM but have not been applied to the raft state. 997 func (n *raft) InstallSnapshot(data []byte) error { 998 if n.State() == Closed { 999 return errNodeClosed 1000 } 1001 1002 n.Lock() 1003 1004 // If a write error has occurred already then stop here. 1005 if werr := n.werr; werr != nil { 1006 n.Unlock() 1007 return werr 1008 } 1009 1010 // Check that a catchup isn't already taking place. If it is then we won't 1011 // allow installing snapshots until it is done. 1012 if len(n.progress) > 0 { 1013 n.Unlock() 1014 return errCatchupsRunning 1015 } 1016 1017 var state StreamState 1018 n.wal.FastState(&state) 1019 1020 if n.applied == 0 { 1021 n.Unlock() 1022 return errNoSnapAvailable 1023 } 1024 1025 n.debug("Installing snapshot of %d bytes", len(data)) 1026 1027 var term uint64 1028 if ae, _ := n.loadEntry(n.applied); ae != nil { 1029 // Use the term from the most recently applied entry if possible. 1030 term = ae.term 1031 } else if ae, _ = n.loadFirstEntry(); ae != nil { 1032 // Otherwise see if we can find the term from the first entry. 1033 term = ae.term 1034 } else { 1035 // Last resort is to use the last pterm that we knew of. 1036 term = n.pterm 1037 } 1038 1039 snap := &snapshot{ 1040 lastTerm: term, 1041 lastIndex: n.applied, 1042 peerstate: encodePeerState(&peerState{n.peerNames(), n.csz, n.extSt}), 1043 data: data, 1044 } 1045 1046 snapDir := filepath.Join(n.sd, snapshotsDir) 1047 sn := fmt.Sprintf(snapFileT, snap.lastTerm, snap.lastIndex) 1048 sfile := filepath.Join(snapDir, sn) 1049 1050 <-dios 1051 err := os.WriteFile(sfile, n.encodeSnapshot(snap), defaultFilePerms) 1052 dios <- struct{}{} 1053 1054 if err != nil { 1055 n.Unlock() 1056 // We could set write err here, but if this is a temporary situation, too many open files etc. 1057 // we want to retry and snapshots are not fatal. 1058 return err 1059 } 1060 1061 // Remember our latest snapshot file. 1062 n.snapfile = sfile 1063 if _, err := n.wal.Compact(snap.lastIndex + 1); err != nil { 1064 n.setWriteErrLocked(err) 1065 n.Unlock() 1066 return err 1067 } 1068 n.Unlock() 1069 1070 psnaps, _ := os.ReadDir(snapDir) 1071 // Remove any old snapshots. 1072 for _, fi := range psnaps { 1073 pn := fi.Name() 1074 if pn != sn { 1075 os.Remove(filepath.Join(snapDir, pn)) 1076 } 1077 } 1078 1079 return nil 1080 } 1081 1082 // NeedSnapshot returns true if it is necessary to try to install a snapshot, i.e. 1083 // after we have finished recovering/replaying at startup, on a regular interval or 1084 // as a part of cleaning up when shutting down. 1085 func (n *raft) NeedSnapshot() bool { 1086 n.RLock() 1087 defer n.RUnlock() 1088 return n.snapfile == _EMPTY_ && n.applied > 1 1089 } 1090 1091 const ( 1092 snapshotsDir = "snapshots" 1093 snapFileT = "snap.%d.%d" 1094 ) 1095 1096 // termAndIndexFromSnapfile tries to load the snapshot file and returns the term 1097 // and index from that snapshot. 
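// For example, a file named "snap.5.1024" (following the snapFileT pattern
// "snap.%d.%d") parses to term 5 and index 1024; anything that does not match
// the pattern yields errBadSnapName. (Illustrative note, not part of the
// original source.)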
1098 func termAndIndexFromSnapFile(sn string) (term, index uint64, err error) { 1099 if sn == _EMPTY_ { 1100 return 0, 0, errBadSnapName 1101 } 1102 fn := filepath.Base(sn) 1103 if n, err := fmt.Sscanf(fn, snapFileT, &term, &index); err != nil || n != 2 { 1104 return 0, 0, errBadSnapName 1105 } 1106 return term, index, nil 1107 } 1108 1109 // setupLastSnapshot is called at startup to try and recover the last snapshot from 1110 // the disk if possible. We will try to recover the term, index and commit/applied 1111 // indices and then notify the upper layer what we found. Compacts the WAL if needed. 1112 func (n *raft) setupLastSnapshot() { 1113 snapDir := filepath.Join(n.sd, snapshotsDir) 1114 psnaps, err := os.ReadDir(snapDir) 1115 if err != nil { 1116 return 1117 } 1118 1119 var lterm, lindex uint64 1120 var latest string 1121 for _, sf := range psnaps { 1122 sfile := filepath.Join(snapDir, sf.Name()) 1123 var term, index uint64 1124 term, index, err := termAndIndexFromSnapFile(sf.Name()) 1125 if err == nil { 1126 if term > lterm { 1127 lterm, lindex = term, index 1128 latest = sfile 1129 } else if term == lterm && index > lindex { 1130 lindex = index 1131 latest = sfile 1132 } 1133 } else { 1134 // Clean this up, can't parse the name. 1135 // TODO(dlc) - We could read in and check actual contents. 1136 n.debug("Removing snapshot, can't parse name: %q", sf.Name()) 1137 os.Remove(sfile) 1138 } 1139 } 1140 1141 // Now cleanup any old entries 1142 for _, sf := range psnaps { 1143 sfile := filepath.Join(snapDir, sf.Name()) 1144 if sfile != latest { 1145 n.debug("Removing old snapshot: %q", sfile) 1146 os.Remove(sfile) 1147 } 1148 } 1149 1150 if latest == _EMPTY_ { 1151 return 1152 } 1153 1154 // Set latest snapshot we have. 1155 n.Lock() 1156 defer n.Unlock() 1157 1158 n.snapfile = latest 1159 snap, err := n.loadLastSnapshot() 1160 if err != nil { 1161 // We failed to recover the last snapshot for some reason, so we will 1162 // assume it has been corrupted and will try to delete it. 1163 if n.snapfile != _EMPTY_ { 1164 os.Remove(n.snapfile) 1165 n.snapfile = _EMPTY_ 1166 } 1167 return 1168 } 1169 1170 // We successfully recovered the last snapshot from the disk. 1171 // Recover state from the snapshot and then notify the upper layer. 1172 // Compact the WAL when we're done if needed. 1173 n.pindex = snap.lastIndex 1174 n.pterm = snap.lastTerm 1175 n.commit = snap.lastIndex 1176 n.applied = snap.lastIndex 1177 n.apply.push(newCommittedEntry(n.commit, []*Entry{{EntrySnapshot, snap.data}})) 1178 if _, err := n.wal.Compact(snap.lastIndex + 1); err != nil { 1179 n.setWriteErrLocked(err) 1180 } 1181 } 1182 1183 // loadLastSnapshot will load and return our last snapshot. 1184 // Lock should be held. 1185 func (n *raft) loadLastSnapshot() (*snapshot, error) { 1186 if n.snapfile == _EMPTY_ { 1187 return nil, errNoSnapAvailable 1188 } 1189 1190 <-dios 1191 buf, err := os.ReadFile(n.snapfile) 1192 dios <- struct{}{} 1193 1194 if err != nil { 1195 n.warn("Error reading snapshot: %v", err) 1196 os.Remove(n.snapfile) 1197 n.snapfile = _EMPTY_ 1198 return nil, err 1199 } 1200 if len(buf) < minSnapshotLen { 1201 n.warn("Snapshot corrupt, too short") 1202 os.Remove(n.snapfile) 1203 n.snapfile = _EMPTY_ 1204 return nil, errSnapshotCorrupt 1205 } 1206 1207 // Check to make sure hash is consistent. 
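	// -----------------------------------------------------------------
	// Aside (layout reference, not part of the original source): the buffer
	// decoded below was produced by encodeSnapshot above, so it is laid out as:
	//
	//	offset 0                 : lastTerm       (uint64, little endian)
	//	offset 8                 : lastIndex      (uint64, little endian)
	//	offset 16                : len(peerstate) (uint32, little endian)
	//	offset 20                : peerstate      (variable)
	//	offset 20+len(peerstate) : data           (variable)
	//	final 8 bytes            : highwayhash-64 checksum of everything before it
	//
	// minSnapshotLen (28) is the smallest valid size: the 20 header bytes plus
	// the 8-byte checksum, with empty peerstate and data. The trailing checksum
	// is why hoff below is len(buf) - 8.
	// -----------------------------------------------------------------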
1208 hoff := len(buf) - 8 1209 lchk := buf[hoff:] 1210 n.hh.Reset() 1211 n.hh.Write(buf[:hoff]) 1212 if !bytes.Equal(lchk[:], n.hh.Sum(nil)) { 1213 n.warn("Snapshot corrupt, checksums did not match") 1214 os.Remove(n.snapfile) 1215 n.snapfile = _EMPTY_ 1216 return nil, errSnapshotCorrupt 1217 } 1218 1219 var le = binary.LittleEndian 1220 lps := le.Uint32(buf[16:]) 1221 snap := &snapshot{ 1222 lastTerm: le.Uint64(buf[0:]), 1223 lastIndex: le.Uint64(buf[8:]), 1224 peerstate: buf[20 : 20+lps], 1225 data: buf[20+lps : hoff], 1226 } 1227 1228 // We had a bug in 2.9.12 that would allow snapshots on last index of 0. 1229 // Detect that here and return err. 1230 if snap.lastIndex == 0 { 1231 n.warn("Snapshot with last index 0 is invalid, cleaning up") 1232 os.Remove(n.snapfile) 1233 n.snapfile = _EMPTY_ 1234 return nil, errSnapshotCorrupt 1235 } 1236 1237 return snap, nil 1238 } 1239 1240 // Leader returns if we are the leader for our group. 1241 // We use an atomic here now vs acquiring the read lock. 1242 func (n *raft) Leader() bool { 1243 if n == nil { 1244 return false 1245 } 1246 return n.State() == Leader 1247 } 1248 1249 // isCatchingUp returns true if a catchup is currently taking place. 1250 func (n *raft) isCatchingUp() bool { 1251 n.RLock() 1252 defer n.RUnlock() 1253 return n.catchup != nil 1254 } 1255 1256 // isCurrent is called from the healthchecks and returns true if we believe 1257 // that the upper layer is current with the Raft layer, i.e. that it has applied 1258 // all of the commits that we have given it. 1259 // Optionally we can also check whether or not we're making forward progress if we 1260 // aren't current, in which case this function may block for up to ~10ms to find out. 1261 // Lock should be held. 1262 func (n *raft) isCurrent(includeForwardProgress bool) bool { 1263 // Check if we are closed. 1264 if n.State() == Closed { 1265 n.debug("Not current, node is closed") 1266 return false 1267 } 1268 1269 // Check whether we've made progress on any state, 0 is invalid so not healthy. 1270 if n.commit == 0 { 1271 n.debug("Not current, no commits") 1272 return false 1273 } 1274 1275 // If we were previously logging about falling behind, also log when the problem 1276 // was cleared. 1277 clearBehindState := func() { 1278 if n.hcbehind { 1279 n.warn("Health check OK, no longer falling behind") 1280 n.hcbehind = false 1281 } 1282 } 1283 1284 // Make sure we are the leader or we know we have heard from the leader recently. 1285 if n.State() == Leader { 1286 clearBehindState() 1287 return true 1288 } 1289 1290 // Check here on catchup status. 1291 if cs := n.catchup; cs != nil && n.pterm >= cs.cterm && n.pindex >= cs.cindex { 1292 n.cancelCatchup() 1293 } 1294 1295 // Check to see that we have heard from the current leader lately. 1296 if n.leader != noLeader && n.leader != n.id && n.catchup == nil { 1297 okInterval := int64(hbInterval) * 2 1298 ts := time.Now().UnixNano() 1299 if ps := n.peers[n.leader]; ps == nil || ps.ts == 0 && (ts-ps.ts) > okInterval { 1300 n.debug("Not current, no recent leader contact") 1301 return false 1302 } 1303 } 1304 if cs := n.catchup; cs != nil { 1305 n.debug("Not current, still catching up pindex=%d, cindex=%d", n.pindex, cs.cindex) 1306 } 1307 1308 if n.commit == n.applied { 1309 // At this point if we are current, we can return saying so. 1310 clearBehindState() 1311 return true 1312 } else if !includeForwardProgress { 1313 // Otherwise, if we aren't allowed to include forward progress 1314 // (i.e. 
we are checking "current" instead of "healthy") then 1315 // give up now. 1316 return false 1317 } 1318 1319 // Otherwise, wait for a short period of time and see if we are making any 1320 // forward progress. 1321 if startDelta := n.commit - n.applied; startDelta > 0 { 1322 for i := 0; i < 10; i++ { // 10ms, in 1ms increments 1323 n.Unlock() 1324 time.Sleep(time.Millisecond) 1325 n.Lock() 1326 if n.commit-n.applied < startDelta { 1327 // The gap is getting smaller, so we're making forward progress. 1328 clearBehindState() 1329 return true 1330 } 1331 } 1332 } 1333 1334 n.hcbehind = true 1335 n.warn("Falling behind in health check, commit %d != applied %d", n.commit, n.applied) 1336 return false 1337 } 1338 1339 // Current returns if we are the leader for our group or an up to date follower. 1340 func (n *raft) Current() bool { 1341 if n == nil { 1342 return false 1343 } 1344 n.Lock() 1345 defer n.Unlock() 1346 return n.isCurrent(false) 1347 } 1348 1349 // Healthy returns if we are the leader for our group and nearly up-to-date. 1350 func (n *raft) Healthy() bool { 1351 if n == nil { 1352 return false 1353 } 1354 n.Lock() 1355 defer n.Unlock() 1356 return n.isCurrent(true) 1357 } 1358 1359 // HadPreviousLeader indicates if this group ever had a leader. 1360 func (n *raft) HadPreviousLeader() bool { 1361 n.RLock() 1362 defer n.RUnlock() 1363 return n.pleader 1364 } 1365 1366 // GroupLeader returns the current leader of the group. 1367 func (n *raft) GroupLeader() string { 1368 if n == nil { 1369 return noLeader 1370 } 1371 n.RLock() 1372 defer n.RUnlock() 1373 return n.leader 1374 } 1375 1376 // Guess the best next leader. Stepdown will check more thoroughly. 1377 // Lock should be held. 1378 func (n *raft) selectNextLeader() string { 1379 nextLeader, hli := noLeader, uint64(0) 1380 for peer, ps := range n.peers { 1381 if peer == n.id || ps.li <= hli { 1382 continue 1383 } 1384 hli = ps.li 1385 nextLeader = peer 1386 } 1387 return nextLeader 1388 } 1389 1390 // StepDown will have a leader stepdown and optionally do a leader transfer. 1391 func (n *raft) StepDown(preferred ...string) error { 1392 n.Lock() 1393 1394 if len(preferred) > 1 { 1395 n.Unlock() 1396 return errTooManyPrefs 1397 } 1398 1399 if n.State() != Leader { 1400 n.Unlock() 1401 return errNotLeader 1402 } 1403 1404 n.debug("Being asked to stepdown") 1405 1406 // See if we have up to date followers. 1407 maybeLeader := noLeader 1408 if len(preferred) > 0 { 1409 if preferred[0] != _EMPTY_ { 1410 maybeLeader = preferred[0] 1411 } else { 1412 preferred = nil 1413 } 1414 } 1415 1416 // Can't pick ourselves. 1417 if maybeLeader == n.id { 1418 maybeLeader = noLeader 1419 preferred = nil 1420 } 1421 1422 nowts := time.Now().UnixNano() 1423 1424 // If we have a preferred check it first. 1425 if maybeLeader != noLeader { 1426 var isHealthy bool 1427 if ps, ok := n.peers[maybeLeader]; ok { 1428 si, ok := n.s.nodeToInfo.Load(maybeLeader) 1429 isHealthy = ok && !si.(nodeInfo).offline && (nowts-ps.ts) < int64(hbInterval*3) 1430 } 1431 if !isHealthy { 1432 maybeLeader = noLeader 1433 } 1434 } 1435 1436 // If we do not have a preferred at this point pick the first healthy one. 1437 // Make sure not ourselves. 
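	// -----------------------------------------------------------------
	// Aside (illustrative, not part of the original source): from the caller's
	// point of view there are only two shapes of stepdown (preferredID is a
	// hypothetical peer ID):
	//
	//	err := n.StepDown()           // let the leader pick any healthy peer
	//	err = n.StepDown(preferredID) // try preferredID first, fall back if unhealthy
	//
	// Passing more than one preference returns errTooManyPrefs, and calling it
	// on a non-leader returns errNotLeader. The loop below implements the
	// "pick the first healthy peer" fallback.
	// -----------------------------------------------------------------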
1438 if maybeLeader == noLeader { 1439 for peer, ps := range n.peers { 1440 if peer == n.id { 1441 continue 1442 } 1443 si, ok := n.s.nodeToInfo.Load(peer) 1444 isHealthy := ok && !si.(nodeInfo).offline && (nowts-ps.ts) < int64(hbInterval*3) 1445 if isHealthy { 1446 maybeLeader = peer 1447 break 1448 } 1449 } 1450 } 1451 1452 // Clear our vote state. 1453 n.vote = noVote 1454 n.writeTermVote() 1455 1456 stepdown := n.stepdown 1457 prop := n.prop 1458 n.Unlock() 1459 1460 if len(preferred) > 0 && maybeLeader == noLeader { 1461 n.debug("Can not transfer to preferred peer %q", preferred[0]) 1462 } 1463 1464 // If we have a new leader selected, transfer over to them. 1465 if maybeLeader != noLeader { 1466 n.debug("Selected %q for new leader", maybeLeader) 1467 prop.push(newEntry(EntryLeaderTransfer, []byte(maybeLeader))) 1468 } else { 1469 // Force us to stepdown here. 1470 n.debug("Stepping down") 1471 stepdown.push(noLeader) 1472 } 1473 1474 return nil 1475 } 1476 1477 // Campaign will have our node start a leadership vote. 1478 func (n *raft) Campaign() error { 1479 n.Lock() 1480 defer n.Unlock() 1481 return n.campaign() 1482 } 1483 1484 func randCampaignTimeout() time.Duration { 1485 delta := rand.Int63n(int64(maxCampaignTimeout - minCampaignTimeout)) 1486 return (minCampaignTimeout + time.Duration(delta)) 1487 } 1488 1489 // Campaign will have our node start a leadership vote. 1490 // Lock should be held. 1491 func (n *raft) campaign() error { 1492 n.debug("Starting campaign") 1493 if n.State() == Leader { 1494 return errAlreadyLeader 1495 } 1496 n.resetElect(randCampaignTimeout()) 1497 return nil 1498 } 1499 1500 // xferCampaign will have our node start an immediate leadership vote. 1501 // Lock should be held. 1502 func (n *raft) xferCampaign() error { 1503 n.debug("Starting transfer campaign") 1504 if n.State() == Leader { 1505 n.lxfer = false 1506 return errAlreadyLeader 1507 } 1508 n.resetElect(10 * time.Millisecond) 1509 return nil 1510 } 1511 1512 // State returns the current state for this node. 1513 func (n *raft) State() RaftState { 1514 return RaftState(n.state.Load()) 1515 } 1516 1517 // Progress returns the current index, commit and applied values. 1518 func (n *raft) Progress() (index, commit, applied uint64) { 1519 n.RLock() 1520 defer n.RUnlock() 1521 return n.pindex + 1, n.commit, n.applied 1522 } 1523 1524 // Size returns number of entries and total bytes for our WAL. 1525 func (n *raft) Size() (uint64, uint64) { 1526 n.RLock() 1527 var state StreamState 1528 n.wal.FastState(&state) 1529 n.RUnlock() 1530 return state.Msgs, state.Bytes 1531 } 1532 1533 func (n *raft) ID() string { 1534 if n == nil { 1535 return _EMPTY_ 1536 } 1537 n.RLock() 1538 defer n.RUnlock() 1539 return n.id 1540 } 1541 1542 func (n *raft) Group() string { 1543 n.RLock() 1544 defer n.RUnlock() 1545 return n.group 1546 } 1547 1548 func (n *raft) Peers() []*Peer { 1549 n.RLock() 1550 defer n.RUnlock() 1551 1552 var peers []*Peer 1553 for id, ps := range n.peers { 1554 var lag uint64 1555 if n.commit > ps.li { 1556 lag = n.commit - ps.li 1557 } 1558 p := &Peer{ 1559 ID: id, 1560 Current: id == n.leader || ps.li >= n.applied, 1561 Last: time.Unix(0, ps.ts), 1562 Lag: lag, 1563 } 1564 peers = append(peers, p) 1565 } 1566 return peers 1567 } 1568 1569 // Update our known set of peers. 1570 func (n *raft) UpdateKnownPeers(knownPeers []string) { 1571 n.Lock() 1572 // Process like peer state update. 
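	// -----------------------------------------------------------------
	// Aside (illustrative, not part of the original source; the IDs are made
	// up): the caller-supplied slice is treated as the complete peer set, so
	// the cluster size tracks its length:
	//
	//	n.UpdateKnownPeers([]string{"yrzKKRBu", "cnrtt3eg", "S1Nunr6R"})
	//	// peerState{knownPeers, 3, extSt} -> cluster size 3, and by the
	//	// csz/2+1 rule used throughout this file, quorum needs 2 nodes.
	//
	// The peerState built below is what processPeerState consumes.
	// -----------------------------------------------------------------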
1573 ps := &peerState{knownPeers, len(knownPeers), n.extSt} 1574 n.processPeerState(ps) 1575 isLeader := n.State() == Leader 1576 n.Unlock() 1577 1578 // If we are the leader send this update out as well. 1579 if isLeader { 1580 n.sendPeerState() 1581 } 1582 } 1583 1584 // ApplyQ returns the apply queue that new commits will be sent to for the 1585 // upper layer to apply. 1586 func (n *raft) ApplyQ() *ipQueue[*CommittedEntry] { return n.apply } 1587 1588 // LeadChangeC returns the leader change channel, notifying when the Raft 1589 // leader role has moved. 1590 func (n *raft) LeadChangeC() <-chan bool { return n.leadc } 1591 1592 // QuitC returns the quit channel, notifying when the Raft group has shut down. 1593 func (n *raft) QuitC() <-chan struct{} { return n.quit } 1594 1595 func (n *raft) Created() time.Time { 1596 n.RLock() 1597 defer n.RUnlock() 1598 return n.created 1599 } 1600 1601 func (n *raft) Stop() { 1602 n.shutdown(false) 1603 } 1604 1605 func (n *raft) Delete() { 1606 n.shutdown(true) 1607 } 1608 1609 func (n *raft) shutdown(shouldDelete bool) { 1610 n.Lock() 1611 1612 // Returned swap value is the previous state. It looks counter-intuitive 1613 // to do this atomic operation with the lock held, but we have to do so in 1614 // order to make sure that switchState() is not already running. If it is 1615 // then it can potentially update the n.state back to a non-closed state, 1616 // allowing shutdown() to be called again. If that happens then the below 1617 // close(n.quit) will panic from trying to close an already-closed channel. 1618 if n.state.Swap(int32(Closed)) == int32(Closed) { 1619 n.Unlock() 1620 return 1621 } 1622 1623 close(n.quit) 1624 if c := n.c; c != nil { 1625 var subs []*subscription 1626 c.mu.Lock() 1627 for _, sub := range c.subs { 1628 subs = append(subs, sub) 1629 } 1630 c.mu.Unlock() 1631 for _, sub := range subs { 1632 n.unsubscribe(sub) 1633 } 1634 c.closeConnection(InternalClient) 1635 } 1636 s, g, wal := n.s, n.group, n.wal 1637 1638 // Unregistering ipQueues do not prevent them from push/pop 1639 // just will remove them from the central monitoring map 1640 queues := []interface { 1641 unregister() 1642 }{n.reqs, n.votes, n.prop, n.entry, n.resp, n.apply, n.stepdown} 1643 for _, q := range queues { 1644 q.unregister() 1645 } 1646 n.Unlock() 1647 1648 s.unregisterRaftNode(g) 1649 1650 if wal != nil { 1651 if shouldDelete { 1652 wal.Delete() 1653 } else { 1654 wal.Stop() 1655 } 1656 } 1657 1658 if shouldDelete { 1659 // Delete all our peer state and vote state and any snapshots. 1660 os.RemoveAll(n.sd) 1661 n.debug("Deleted") 1662 } else { 1663 n.debug("Shutdown") 1664 } 1665 } 1666 1667 // Wipe will force an on disk state reset and then call Delete(). 1668 // Useful in case we have been stopped before this point. 1669 func (n *raft) Wipe() { 1670 n.RLock() 1671 wal := n.wal 1672 n.RUnlock() 1673 // Delete our underlying storage. 1674 if wal != nil { 1675 wal.Delete() 1676 } 1677 // Now call delete. 
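	// -----------------------------------------------------------------
	// Aside (illustrative summary, not part of the original source): the three
	// teardown entry points above differ only in what happens to stored state:
	//
	//	n.Stop()   // shutdown(false): stop the WAL, keep the store directory
	//	n.Delete() // shutdown(true): delete the WAL and remove the store directory
	//	n.Wipe()   // delete the WAL first, then Delete() for everything else
	//
	// -----------------------------------------------------------------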
1678 n.Delete() 1679 } 1680 1681 const ( 1682 raftAllSubj = "$NRG.>" 1683 raftVoteSubj = "$NRG.V.%s" 1684 raftAppendSubj = "$NRG.AE.%s" 1685 raftPropSubj = "$NRG.P.%s" 1686 raftRemovePeerSubj = "$NRG.RP.%s" 1687 raftReply = "$NRG.R.%s" 1688 raftCatchupReply = "$NRG.CR.%s" 1689 ) 1690 1691 // Lock should be held (due to use of random generator) 1692 func (n *raft) newCatchupInbox() string { 1693 var b [replySuffixLen]byte 1694 rn := fastrand.Uint64() 1695 for i, l := 0, rn; i < len(b); i++ { 1696 b[i] = digits[l%base] 1697 l /= base 1698 } 1699 return fmt.Sprintf(raftCatchupReply, b[:]) 1700 } 1701 1702 func (n *raft) newInbox() string { 1703 var b [replySuffixLen]byte 1704 rn := fastrand.Uint64() 1705 for i, l := 0, rn; i < len(b); i++ { 1706 b[i] = digits[l%base] 1707 l /= base 1708 } 1709 return fmt.Sprintf(raftReply, b[:]) 1710 } 1711 1712 // Our internal subscribe. 1713 // Lock should be held. 1714 func (n *raft) subscribe(subject string, cb msgHandler) (*subscription, error) { 1715 return n.s.systemSubscribe(subject, _EMPTY_, false, n.c, cb) 1716 } 1717 1718 // Lock should be held. 1719 func (n *raft) unsubscribe(sub *subscription) { 1720 if sub != nil { 1721 n.c.processUnsub(sub.sid) 1722 } 1723 } 1724 1725 func (n *raft) createInternalSubs() error { 1726 n.Lock() 1727 defer n.Unlock() 1728 n.vsubj, n.vreply = fmt.Sprintf(raftVoteSubj, n.group), n.newInbox() 1729 n.asubj, n.areply = fmt.Sprintf(raftAppendSubj, n.group), n.newInbox() 1730 n.psubj = fmt.Sprintf(raftPropSubj, n.group) 1731 n.rpsubj = fmt.Sprintf(raftRemovePeerSubj, n.group) 1732 1733 // Votes 1734 if _, err := n.subscribe(n.vreply, n.handleVoteResponse); err != nil { 1735 return err 1736 } 1737 if _, err := n.subscribe(n.vsubj, n.handleVoteRequest); err != nil { 1738 return err 1739 } 1740 // AppendEntry 1741 if _, err := n.subscribe(n.areply, n.handleAppendEntryResponse); err != nil { 1742 return err 1743 } 1744 if sub, err := n.subscribe(n.asubj, n.handleAppendEntry); err != nil { 1745 return err 1746 } else { 1747 n.aesub = sub 1748 } 1749 1750 return nil 1751 } 1752 1753 func randElectionTimeout() time.Duration { 1754 delta := rand.Int63n(int64(maxElectionTimeout - minElectionTimeout)) 1755 return (minElectionTimeout + time.Duration(delta)) 1756 } 1757 1758 // Lock should be held. 1759 func (n *raft) resetElectionTimeout() { 1760 n.resetElect(randElectionTimeout()) 1761 } 1762 1763 func (n *raft) resetElectionTimeoutWithLock() { 1764 n.resetElectWithLock(randElectionTimeout()) 1765 } 1766 1767 // Lock should be held. 1768 func (n *raft) resetElect(et time.Duration) { 1769 if n.elect == nil { 1770 n.elect = time.NewTimer(et) 1771 } else { 1772 if !n.elect.Stop() { 1773 select { 1774 case <-n.elect.C: 1775 default: 1776 } 1777 } 1778 n.elect.Reset(et) 1779 } 1780 } 1781 1782 func (n *raft) resetElectWithLock(et time.Duration) { 1783 n.Lock() 1784 n.resetElect(et) 1785 n.Unlock() 1786 } 1787 1788 // run is the top-level runner for the Raft state machine. Depending on the 1789 // state of the node (leader, follower, candidate, observer), this will call 1790 // through to other functions. It is expected that this function will run for 1791 // the entire life of the Raft node once started. 1792 func (n *raft) run() { 1793 s := n.s 1794 defer s.grWG.Done() 1795 1796 // We want to wait for some routing to be enabled, so we will wait for 1797 // at least a route, leaf or gateway connection to be established before 1798 // starting the run loop. 
1799 for gw := s.gateway; ; { 1800 s.mu.RLock() 1801 ready, gwEnabled := s.numRemotes()+len(s.leafs) > 0, gw.enabled 1802 s.mu.RUnlock() 1803 if !ready && gwEnabled { 1804 gw.RLock() 1805 ready = len(gw.out)+len(gw.in) > 0 1806 gw.RUnlock() 1807 } 1808 if !ready { 1809 select { 1810 case <-s.quitCh: 1811 return 1812 case <-time.After(100 * time.Millisecond): 1813 s.RateLimitWarnf("Waiting for routing to be established...") 1814 } 1815 } else { 1816 break 1817 } 1818 } 1819 1820 // We may have paused adding entries to apply queue, resume here. 1821 // No-op if not paused. 1822 n.ResumeApply() 1823 1824 // Send nil entry to signal the upper layers we are done doing replay/restore. 1825 n.apply.push(nil) 1826 1827 for s.isRunning() { 1828 switch n.State() { 1829 case Follower: 1830 n.runAsFollower() 1831 case Candidate: 1832 n.runAsCandidate() 1833 case Leader: 1834 n.runAsLeader() 1835 case Closed: 1836 return 1837 } 1838 } 1839 } 1840 1841 func (n *raft) debug(format string, args ...any) { 1842 if n.dflag { 1843 nf := fmt.Sprintf("RAFT [%s - %s] %s", n.id, n.group, format) 1844 n.s.Debugf(nf, args...) 1845 } 1846 } 1847 1848 func (n *raft) warn(format string, args ...any) { 1849 nf := fmt.Sprintf("RAFT [%s - %s] %s", n.id, n.group, format) 1850 n.s.RateLimitWarnf(nf, args...) 1851 } 1852 1853 func (n *raft) error(format string, args ...any) { 1854 nf := fmt.Sprintf("RAFT [%s - %s] %s", n.id, n.group, format) 1855 n.s.Errorf(nf, args...) 1856 } 1857 1858 func (n *raft) electTimer() *time.Timer { 1859 n.RLock() 1860 defer n.RUnlock() 1861 return n.elect 1862 } 1863 1864 func (n *raft) IsObserver() bool { 1865 n.RLock() 1866 defer n.RUnlock() 1867 return n.observer 1868 } 1869 1870 // Sets the state to observer only. 1871 func (n *raft) SetObserver(isObserver bool) { 1872 n.setObserver(isObserver, extUndetermined) 1873 } 1874 1875 func (n *raft) setObserver(isObserver bool, extSt extensionState) { 1876 n.Lock() 1877 defer n.Unlock() 1878 n.observer = isObserver 1879 n.extSt = extSt 1880 } 1881 1882 // processAppendEntries is called by the Raft state machine when there are 1883 // new append entries to be committed and sent to the upper state machine. 1884 func (n *raft) processAppendEntries() { 1885 canProcess := true 1886 if n.isClosed() { 1887 n.debug("AppendEntry not processing inbound, closed") 1888 canProcess = false 1889 } 1890 if n.outOfResources() { 1891 n.debug("AppendEntry not processing inbound, no resources") 1892 canProcess = false 1893 } 1894 // Always pop the entries, but check if we can process them. If we can't 1895 // then the entries are effectively dropped. 1896 aes := n.entry.pop() 1897 if canProcess { 1898 for _, ae := range aes { 1899 n.processAppendEntry(ae, ae.sub) 1900 } 1901 } 1902 n.entry.recycle(&aes) 1903 } 1904 1905 // runAsFollower is called by run and will block for as long as the node is 1906 // running in the follower state. 1907 func (n *raft) runAsFollower() { 1908 for { 1909 elect := n.electTimer() 1910 1911 select { 1912 case <-n.entry.ch: 1913 // New append entries have arrived over the network. 1914 n.processAppendEntries() 1915 case <-n.s.quitCh: 1916 // The server is shutting down. 1917 n.shutdown(false) 1918 return 1919 case <-n.quit: 1920 // The Raft node is shutting down. 1921 return 1922 case <-elect.C: 1923 // The election timer has fired so we think it's time to call an election. 1924 // If we are out of resources we just want to stay in this state for the moment. 
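			// -----------------------------------------------------------------
			// Aside (illustrative, not part of the original source): the elect
			// timer that just fired was armed by resetElectionTimeout(), which
			// picks a random duration between the defaults declared earlier in
			// this file:
			//
			//	election timeout:    4s to 9s (randomized on every reset)
			//	campaign timeout:    100ms to 800ms (randomized)
			//	heartbeat interval:  1s
			//	lost quorum interval: 10s
			//
			// The checks below keep us a follower if we are out of resources,
			// an observer, or still catching up.
			// -----------------------------------------------------------------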
1925 if n.outOfResources() { 1926 n.resetElectionTimeoutWithLock() 1927 n.debug("Not switching to candidate, no resources") 1928 } else if n.IsObserver() { 1929 n.resetElectWithLock(48 * time.Hour) 1930 n.debug("Not switching to candidate, observer only") 1931 } else if n.isCatchingUp() { 1932 n.debug("Not switching to candidate, catching up") 1933 // Check to see if our catchup has stalled. 1934 n.Lock() 1935 if n.catchupStalled() { 1936 n.cancelCatchup() 1937 } 1938 n.resetElectionTimeout() 1939 n.Unlock() 1940 } else { 1941 n.switchToCandidate() 1942 return 1943 } 1944 case <-n.votes.ch: 1945 // We're receiving votes from the network, probably because we have only 1946 // just stepped down and they were already in flight. Ignore them. 1947 n.debug("Ignoring old vote response, we have stepped down") 1948 n.votes.popOne() 1949 case <-n.resp.ch: 1950 // We're receiving append entry responses from the network, probably because 1951 // we have only just stepped down and they were already in flight. Ignore them. 1952 n.resp.popOne() 1953 case <-n.reqs.ch: 1954 // We've just received a vote request from the network. 1955 // Because of drain() it is possible that we get nil from popOne(). 1956 if voteReq, ok := n.reqs.popOne(); ok { 1957 n.processVoteRequest(voteReq) 1958 } 1959 case <-n.stepdown.ch: 1960 // We've received a stepdown request, start following the new leader if 1961 // we can. 1962 if newLeader, ok := n.stepdown.popOne(); ok { 1963 n.switchToFollower(newLeader) 1964 return 1965 } 1966 } 1967 } 1968 } 1969 1970 // Pool for CommittedEntry re-use. 1971 var cePool = sync.Pool{ 1972 New: func() any { 1973 return &CommittedEntry{} 1974 }, 1975 } 1976 1977 // CommittedEntry is handed back to the user to apply a commit to their upper layer. 1978 type CommittedEntry struct { 1979 Index uint64 1980 Entries []*Entry 1981 } 1982 1983 // Create a new CommittedEntry. When the returned entry is no longer needed, it 1984 // should be returned to the pool by calling ReturnToPool. 1985 func newCommittedEntry(index uint64, entries []*Entry) *CommittedEntry { 1986 ce := cePool.Get().(*CommittedEntry) 1987 ce.Index, ce.Entries = index, entries 1988 return ce 1989 } 1990 1991 // ReturnToPool returns the CommittedEntry to the pool, after which point it is 1992 // no longer safe to reuse. 1993 func (ce *CommittedEntry) ReturnToPool() { 1994 if ce == nil { 1995 return 1996 } 1997 if len(ce.Entries) > 0 { 1998 for _, e := range ce.Entries { 1999 entryPool.Put(e) 2000 } 2001 } 2002 ce.Index, ce.Entries = 0, nil 2003 cePool.Put(ce) 2004 } 2005 2006 // Pool for Entry re-use. 2007 var entryPool = sync.Pool{ 2008 New: func() any { 2009 return &Entry{} 2010 }, 2011 } 2012 2013 // Helper to create new entries. When the returned entry is no longer needed, it 2014 // should be returned to the entryPool pool. 2015 func newEntry(t EntryType, data []byte) *Entry { 2016 entry := entryPool.Get().(*Entry) 2017 entry.Type, entry.Data = t, data 2018 return entry 2019 } 2020 2021 // Pool for appendEntry re-use. 2022 var aePool = sync.Pool{ 2023 New: func() any { 2024 return &appendEntry{} 2025 }, 2026 } 2027 2028 // appendEntry is the main struct that is used to sync raft peers. 2029 type appendEntry struct { 2030 leader string // The leader that this append entry came from. 2031 term uint64 // The current term, as the leader understands it. 2032 commit uint64 // The commit index, as the leader understands it. 2033 pterm uint64 // The previous term, for checking consistency. 
2034 pindex uint64 // The previous commit index, for checking consistency. 2035 entries []*Entry // Entries to process. 2036 // Below fields are for internal use only: 2037 reply string // Reply subject to respond to once committed. 2038 sub *subscription // The subscription that the append entry came in on. 2039 buf []byte 2040 } 2041 2042 // Create a new appendEntry. 2043 func newAppendEntry(leader string, term, commit, pterm, pindex uint64, entries []*Entry) *appendEntry { 2044 ae := aePool.Get().(*appendEntry) 2045 ae.leader, ae.term, ae.commit, ae.pterm, ae.pindex, ae.entries = leader, term, commit, pterm, pindex, entries 2046 ae.reply, ae.sub, ae.buf = _EMPTY_, nil, nil 2047 return ae 2048 } 2049 2050 // Will return this append entry, and its interior entries to their respective pools. 2051 func (ae *appendEntry) returnToPool() { 2052 ae.entries, ae.buf, ae.sub, ae.reply = nil, nil, nil, _EMPTY_ 2053 aePool.Put(ae) 2054 } 2055 2056 type EntryType uint8 2057 2058 const ( 2059 EntryNormal EntryType = iota 2060 EntryOldSnapshot 2061 EntryPeerState 2062 EntryAddPeer 2063 EntryRemovePeer 2064 EntryLeaderTransfer 2065 EntrySnapshot 2066 ) 2067 2068 func (t EntryType) String() string { 2069 switch t { 2070 case EntryNormal: 2071 return "Normal" 2072 case EntryOldSnapshot: 2073 return "OldSnapshot" 2074 case EntryPeerState: 2075 return "PeerState" 2076 case EntryAddPeer: 2077 return "AddPeer" 2078 case EntryRemovePeer: 2079 return "RemovePeer" 2080 case EntryLeaderTransfer: 2081 return "LeaderTransfer" 2082 case EntrySnapshot: 2083 return "Snapshot" 2084 } 2085 return fmt.Sprintf("Unknown [%d]", uint8(t)) 2086 } 2087 2088 type Entry struct { 2089 Type EntryType 2090 Data []byte 2091 } 2092 2093 func (ae *appendEntry) String() string { 2094 return fmt.Sprintf("&{leader:%s term:%d commit:%d pterm:%d pindex:%d entries: %d}", 2095 ae.leader, ae.term, ae.commit, ae.pterm, ae.pindex, len(ae.entries)) 2096 } 2097 2098 const appendEntryBaseLen = idLen + 4*8 + 2 2099 2100 func (ae *appendEntry) encode(b []byte) ([]byte, error) { 2101 if ll := len(ae.leader); ll != idLen && ll != 0 { 2102 return nil, errLeaderLen 2103 } 2104 if len(ae.entries) > math.MaxUint16 { 2105 return nil, errTooManyEntries 2106 } 2107 2108 var elen int 2109 for _, e := range ae.entries { 2110 elen += len(e.Data) + 1 + 4 // 1 is type, 4 is for size. 2111 } 2112 tlen := appendEntryBaseLen + elen + 1 2113 2114 var buf []byte 2115 if cap(b) >= tlen { 2116 buf = b[:tlen] 2117 } else { 2118 buf = make([]byte, tlen) 2119 } 2120 2121 var le = binary.LittleEndian 2122 copy(buf[:idLen], ae.leader) 2123 le.PutUint64(buf[8:], ae.term) 2124 le.PutUint64(buf[16:], ae.commit) 2125 le.PutUint64(buf[24:], ae.pterm) 2126 le.PutUint64(buf[32:], ae.pindex) 2127 le.PutUint16(buf[40:], uint16(len(ae.entries))) 2128 wi := 42 2129 for _, e := range ae.entries { 2130 le.PutUint32(buf[wi:], uint32(len(e.Data)+1)) 2131 wi += 4 2132 buf[wi] = byte(e.Type) 2133 wi++ 2134 copy(buf[wi:], e.Data) 2135 wi += len(e.Data) 2136 } 2137 return buf[:wi], nil 2138 } 2139 2140 // This can not be used post the wire level callback since we do not copy. 
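// For reference, the wire format produced by encode() above is, with all
// integers little-endian:
//
//	0-7    leader ID (idLen = 8 bytes)
//	8-15   term
//	16-23  commit
//	24-31  pterm
//	32-39  pindex
//	40-41  entry count (uint16)
//	then per entry: uint32 length (len(Data)+1), 1 byte EntryType, Data
//
// appendEntryBaseLen is therefore 42 bytes, and the decode below walks the
// same layout, aliasing each entry's Data directly into msg rather than
// copying it, which is why the buffer must be copied before we get here.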
2141 func (n *raft) decodeAppendEntry(msg []byte, sub *subscription, reply string) (*appendEntry, error) { 2142 if len(msg) < appendEntryBaseLen { 2143 return nil, errBadAppendEntry 2144 } 2145 2146 var le = binary.LittleEndian 2147 2148 ae := newAppendEntry(string(msg[:idLen]), le.Uint64(msg[8:]), le.Uint64(msg[16:]), le.Uint64(msg[24:]), le.Uint64(msg[32:]), nil) 2149 ae.reply, ae.sub = reply, sub 2150 2151 // Decode Entries. 2152 ne, ri := int(le.Uint16(msg[40:])), 42 2153 for i, max := 0, len(msg); i < ne; i++ { 2154 if ri >= max-1 { 2155 return nil, errBadAppendEntry 2156 } 2157 le := int(le.Uint32(msg[ri:])) 2158 ri += 4 2159 if le <= 0 || ri+le > max { 2160 return nil, errBadAppendEntry 2161 } 2162 entry := newEntry(EntryType(msg[ri]), msg[ri+1:ri+le]) 2163 ae.entries = append(ae.entries, entry) 2164 ri += le 2165 } 2166 ae.buf = msg 2167 return ae, nil 2168 } 2169 2170 // Pool for appendEntryResponse re-use. 2171 var arPool = sync.Pool{ 2172 New: func() any { 2173 return &appendEntryResponse{} 2174 }, 2175 } 2176 2177 // We want to make sure this does not change from system changing length of syshash. 2178 const idLen = 8 2179 const appendEntryResponseLen = 24 + 1 2180 2181 // appendEntryResponse is our response to a received appendEntry. 2182 type appendEntryResponse struct { 2183 term uint64 2184 index uint64 2185 peer string 2186 reply string // internal usage. 2187 success bool 2188 } 2189 2190 // Create a new appendEntryResponse. 2191 func newAppendEntryResponse(term, index uint64, peer string, success bool) *appendEntryResponse { 2192 ar := arPool.Get().(*appendEntryResponse) 2193 ar.term, ar.index, ar.peer, ar.success = term, index, peer, success 2194 // Always empty out. 2195 ar.reply = _EMPTY_ 2196 return ar 2197 } 2198 2199 func (ar *appendEntryResponse) encode(b []byte) []byte { 2200 var buf []byte 2201 if cap(b) >= appendEntryResponseLen { 2202 buf = b[:appendEntryResponseLen] 2203 } else { 2204 buf = make([]byte, appendEntryResponseLen) 2205 } 2206 var le = binary.LittleEndian 2207 le.PutUint64(buf[0:], ar.term) 2208 le.PutUint64(buf[8:], ar.index) 2209 copy(buf[16:16+idLen], ar.peer) 2210 if ar.success { 2211 buf[24] = 1 2212 } else { 2213 buf[24] = 0 2214 } 2215 return buf[:appendEntryResponseLen] 2216 } 2217 2218 // Track all peers we may have ever seen to use an string interns for appendEntryResponse decoding. 2219 var peers sync.Map 2220 2221 func (n *raft) decodeAppendEntryResponse(msg []byte) *appendEntryResponse { 2222 if len(msg) != appendEntryResponseLen { 2223 return nil 2224 } 2225 var le = binary.LittleEndian 2226 ar := arPool.Get().(*appendEntryResponse) 2227 ar.term = le.Uint64(msg[0:]) 2228 ar.index = le.Uint64(msg[8:]) 2229 2230 peer, ok := peers.Load(string(msg[16 : 16+idLen])) 2231 if !ok { 2232 // We missed so store inline here. 
2233 peer = string(msg[16 : 16+idLen]) 2234 peers.Store(peer, peer) 2235 } 2236 ar.peer = peer.(string) 2237 ar.success = msg[24] == 1 2238 return ar 2239 } 2240 2241 // Called when a remove peer proposal has been forwarded 2242 func (n *raft) handleForwardedRemovePeerProposal(sub *subscription, c *client, _ *Account, _, reply string, msg []byte) { 2243 n.debug("Received forwarded remove peer proposal: %q", msg) 2244 2245 if !n.Leader() { 2246 n.debug("Ignoring forwarded peer removal proposal, not leader") 2247 return 2248 } 2249 if len(msg) != idLen { 2250 n.warn("Received invalid peer name for remove proposal: %q", msg) 2251 return 2252 } 2253 2254 n.RLock() 2255 prop, werr := n.prop, n.werr 2256 n.RUnlock() 2257 2258 // Ignore if we have had a write error previous. 2259 if werr != nil { 2260 return 2261 } 2262 2263 // Need to copy since this is underlying client/route buffer. 2264 peer := copyBytes(msg) 2265 prop.push(newEntry(EntryRemovePeer, peer)) 2266 } 2267 2268 // Called when a peer has forwarded a proposal. 2269 func (n *raft) handleForwardedProposal(sub *subscription, c *client, _ *Account, _, reply string, msg []byte) { 2270 if !n.Leader() { 2271 n.debug("Ignoring forwarded proposal, not leader") 2272 return 2273 } 2274 // Need to copy since this is underlying client/route buffer. 2275 msg = copyBytes(msg) 2276 2277 n.RLock() 2278 prop, werr := n.prop, n.werr 2279 n.RUnlock() 2280 2281 // Ignore if we have had a write error previous. 2282 if werr != nil { 2283 return 2284 } 2285 2286 prop.push(newEntry(EntryNormal, msg)) 2287 } 2288 2289 func (n *raft) runAsLeader() { 2290 if n.State() == Closed { 2291 return 2292 } 2293 2294 n.RLock() 2295 psubj, rpsubj := n.psubj, n.rpsubj 2296 n.RUnlock() 2297 2298 // For forwarded proposals, both normal and remove peer proposals. 2299 fsub, err := n.subscribe(psubj, n.handleForwardedProposal) 2300 if err != nil { 2301 n.warn("Error subscribing to forwarded proposals: %v", err) 2302 n.stepdown.push(noLeader) 2303 return 2304 } 2305 rpsub, err := n.subscribe(rpsubj, n.handleForwardedRemovePeerProposal) 2306 if err != nil { 2307 n.warn("Error subscribing to forwarded remove peer proposals: %v", err) 2308 n.unsubscribe(fsub) 2309 n.stepdown.push(noLeader) 2310 return 2311 } 2312 2313 // Cleanup our subscription when we leave. 2314 defer func() { 2315 n.Lock() 2316 n.unsubscribe(fsub) 2317 n.unsubscribe(rpsub) 2318 n.Unlock() 2319 }() 2320 2321 // To send out our initial peer state. 2322 n.sendPeerState() 2323 2324 hb := time.NewTicker(hbInterval) 2325 defer hb.Stop() 2326 2327 lq := time.NewTicker(lostQuorumCheck) 2328 defer lq.Stop() 2329 2330 for n.State() == Leader { 2331 select { 2332 case <-n.s.quitCh: 2333 n.shutdown(false) 2334 return 2335 case <-n.quit: 2336 return 2337 case <-n.resp.ch: 2338 ars := n.resp.pop() 2339 for _, ar := range ars { 2340 n.processAppendEntryResponse(ar) 2341 } 2342 n.resp.recycle(&ars) 2343 case <-n.prop.ch: 2344 const maxBatch = 256 * 1024 2345 var entries []*Entry 2346 2347 es := n.prop.pop() 2348 sz := 0 2349 for i, b := range es { 2350 if b.Type == EntryRemovePeer { 2351 n.doRemovePeerAsLeader(string(b.Data)) 2352 } 2353 entries = append(entries, b) 2354 sz += len(b.Data) + 1 2355 if i != len(es)-1 && sz < maxBatch && len(entries) < math.MaxUint16 { 2356 continue 2357 } 2358 n.sendAppendEntry(entries) 2359 2360 // If this is us sending out a leadership transfer stepdown inline here. 
2361 if b.Type == EntryLeaderTransfer { 2362 n.prop.recycle(&es) 2363 n.debug("Stepping down due to leadership transfer") 2364 n.switchToFollower(noLeader) 2365 return 2366 } 2367 // We need to re-create `entries` because there is a reference 2368 // to it in the node's pae map. 2369 entries = nil 2370 } 2371 n.prop.recycle(&es) 2372 2373 case <-hb.C: 2374 if n.notActive() { 2375 n.sendHeartbeat() 2376 } 2377 case <-lq.C: 2378 if n.lostQuorum() { 2379 n.switchToFollower(noLeader) 2380 return 2381 } 2382 case <-n.votes.ch: 2383 // Because of drain() it is possible that we get nil from popOne(). 2384 vresp, ok := n.votes.popOne() 2385 if !ok { 2386 continue 2387 } 2388 if vresp.term > n.Term() { 2389 n.switchToFollower(noLeader) 2390 return 2391 } 2392 n.trackPeer(vresp.peer) 2393 case <-n.reqs.ch: 2394 // Because of drain() it is possible that we get nil from popOne(). 2395 if voteReq, ok := n.reqs.popOne(); ok { 2396 n.processVoteRequest(voteReq) 2397 } 2398 case <-n.stepdown.ch: 2399 if newLeader, ok := n.stepdown.popOne(); ok { 2400 n.switchToFollower(newLeader) 2401 return 2402 } 2403 case <-n.entry.ch: 2404 n.processAppendEntries() 2405 } 2406 } 2407 } 2408 2409 // Quorum reports the quorum status. Will be called on former leaders. 2410 func (n *raft) Quorum() bool { 2411 n.RLock() 2412 defer n.RUnlock() 2413 2414 now, nc := time.Now().UnixNano(), 1 2415 for _, peer := range n.peers { 2416 if now-peer.ts < int64(lostQuorumInterval) { 2417 nc++ 2418 if nc >= n.qn { 2419 return true 2420 } 2421 } 2422 } 2423 return false 2424 } 2425 2426 func (n *raft) lostQuorum() bool { 2427 n.RLock() 2428 defer n.RUnlock() 2429 return n.lostQuorumLocked() 2430 } 2431 2432 func (n *raft) lostQuorumLocked() bool { 2433 // Make sure we let any scale up actions settle before deciding. 2434 if !n.lsut.IsZero() && time.Since(n.lsut) < lostQuorumInterval { 2435 return false 2436 } 2437 2438 now, nc := time.Now().UnixNano(), 1 2439 for _, peer := range n.peers { 2440 if now-peer.ts < int64(lostQuorumInterval) { 2441 nc++ 2442 if nc >= n.qn { 2443 return false 2444 } 2445 } 2446 } 2447 return true 2448 } 2449 2450 // Check for being not active in terms of sending entries. 2451 // Used in determining if we need to send a heartbeat. 2452 func (n *raft) notActive() bool { 2453 n.RLock() 2454 defer n.RUnlock() 2455 return time.Since(n.active) > hbInterval 2456 } 2457 2458 // Return our current term. 2459 func (n *raft) Term() uint64 { 2460 n.RLock() 2461 defer n.RUnlock() 2462 return n.term 2463 } 2464 2465 // Lock should be held. 2466 func (n *raft) loadFirstEntry() (ae *appendEntry, err error) { 2467 var state StreamState 2468 n.wal.FastState(&state) 2469 return n.loadEntry(state.FirstSeq) 2470 } 2471 2472 func (n *raft) runCatchup(ar *appendEntryResponse, indexUpdatesQ *ipQueue[uint64]) { 2473 n.RLock() 2474 s, reply := n.s, n.areply 2475 peer, subj, last := ar.peer, ar.reply, n.pindex 2476 n.RUnlock() 2477 2478 defer s.grWG.Done() 2479 defer arPool.Put(ar) 2480 2481 defer func() { 2482 n.Lock() 2483 delete(n.progress, peer) 2484 if len(n.progress) == 0 { 2485 n.progress = nil 2486 } 2487 // Check if this is a new peer and if so go ahead and propose adding them. 2488 _, exists := n.peers[peer] 2489 n.Unlock() 2490 if !exists { 2491 n.debug("Catchup done for %q, will add into peers", peer) 2492 n.ProposeAddPeer(peer) 2493 } 2494 indexUpdatesQ.unregister() 2495 }() 2496 2497 n.debug("Running catchup for %q", peer) 2498 2499 const maxOutstanding = 2 * 1024 * 1024 // 2MB for now. 
2500 next, total, om := uint64(0), 0, make(map[uint64]int) 2501 2502 sendNext := func() bool { 2503 for total <= maxOutstanding { 2504 next++ 2505 if next > last { 2506 return true 2507 } 2508 ae, err := n.loadEntry(next) 2509 if err != nil { 2510 if err != ErrStoreEOF { 2511 n.warn("Got an error loading %d index: %v", next, err) 2512 } 2513 return true 2514 } 2515 // Update our tracking total. 2516 om[next] = len(ae.buf) 2517 total += len(ae.buf) 2518 n.sendRPC(subj, reply, ae.buf) 2519 } 2520 return false 2521 } 2522 2523 const activityInterval = 2 * time.Second 2524 timeout := time.NewTimer(activityInterval) 2525 defer timeout.Stop() 2526 2527 stepCheck := time.NewTicker(100 * time.Millisecond) 2528 defer stepCheck.Stop() 2529 2530 // Run as long as we are leader and still not caught up. 2531 for n.Leader() { 2532 select { 2533 case <-n.s.quitCh: 2534 n.shutdown(false) 2535 return 2536 case <-n.quit: 2537 return 2538 case <-stepCheck.C: 2539 if !n.Leader() { 2540 n.debug("Catching up canceled, no longer leader") 2541 return 2542 } 2543 case <-timeout.C: 2544 n.debug("Catching up for %q stalled", peer) 2545 return 2546 case <-indexUpdatesQ.ch: 2547 if index, ok := indexUpdatesQ.popOne(); ok { 2548 // Update our activity timer. 2549 timeout.Reset(activityInterval) 2550 // Update outstanding total. 2551 total -= om[index] 2552 delete(om, index) 2553 if next == 0 { 2554 next = index 2555 } 2556 // Check if we are done. 2557 if index > last || sendNext() { 2558 n.debug("Finished catching up") 2559 return 2560 } 2561 } 2562 } 2563 } 2564 } 2565 2566 // Lock should be held. 2567 func (n *raft) sendSnapshotToFollower(subject string) (uint64, error) { 2568 snap, err := n.loadLastSnapshot() 2569 if err != nil { 2570 // We need to stepdown here when this happens. 2571 n.stepdown.push(noLeader) 2572 // We need to reset our state here as well. 2573 n.resetWAL() 2574 return 0, err 2575 } 2576 // Go ahead and send the snapshot and peerstate here as first append entry to the catchup follower. 2577 ae := n.buildAppendEntry([]*Entry{{EntrySnapshot, snap.data}, {EntryPeerState, snap.peerstate}}) 2578 ae.pterm, ae.pindex = snap.lastTerm, snap.lastIndex 2579 var state StreamState 2580 n.wal.FastState(&state) 2581 2582 fpIndex := state.FirstSeq - 1 2583 if snap.lastIndex < fpIndex && state.FirstSeq != 0 { 2584 snap.lastIndex = fpIndex 2585 ae.pindex = fpIndex 2586 } 2587 2588 encoding, err := ae.encode(nil) 2589 if err != nil { 2590 return 0, err 2591 } 2592 n.sendRPC(subject, n.areply, encoding) 2593 return snap.lastIndex, nil 2594 } 2595 2596 func (n *raft) catchupFollower(ar *appendEntryResponse) { 2597 n.debug("Being asked to catch up follower: %q", ar.peer) 2598 n.Lock() 2599 if n.progress == nil { 2600 n.progress = make(map[string]*ipQueue[uint64]) 2601 } else if q, ok := n.progress[ar.peer]; ok { 2602 n.debug("Will cancel existing entry for catching up %q", ar.peer) 2603 delete(n.progress, ar.peer) 2604 q.push(n.pindex) 2605 } 2606 2607 // Check to make sure we have this entry. 2608 start := ar.index + 1 2609 var state StreamState 2610 n.wal.FastState(&state) 2611 2612 if start < state.FirstSeq || (state.Msgs == 0 && start <= state.LastSeq) { 2613 n.debug("Need to send snapshot to follower") 2614 if lastIndex, err := n.sendSnapshotToFollower(ar.reply); err != nil { 2615 n.error("Error sending snapshot to follower [%s]: %v", ar.peer, err) 2616 n.Unlock() 2617 arPool.Put(ar) 2618 return 2619 } else { 2620 start = lastIndex + 1 2621 // If no other entries, we can just return here. 
2622 if state.Msgs == 0 || start > state.LastSeq { 2623 n.debug("Finished catching up") 2624 n.Unlock() 2625 arPool.Put(ar) 2626 return 2627 } 2628 n.debug("Snapshot sent, reset first catchup entry to %d", lastIndex) 2629 } 2630 } 2631 2632 ae, err := n.loadEntry(start) 2633 if err != nil { 2634 n.warn("Request from follower for entry at index [%d] errored for state %+v - %v", start, state, err) 2635 if err == ErrStoreEOF { 2636 // If we are here we are seeing a request for an item beyond our state, meaning we should stepdown. 2637 n.stepdown.push(noLeader) 2638 n.Unlock() 2639 arPool.Put(ar) 2640 return 2641 } 2642 ae, err = n.loadFirstEntry() 2643 } 2644 if err != nil || ae == nil { 2645 n.warn("Could not find a starting entry for catchup request: %v", err) 2646 // If we are here we are seeing a request for an item we do not have, meaning we should stepdown. 2647 // This is possible on a reset of our WAL but the other side has a snapshot already. 2648 // If we do not stepdown this can cycle. 2649 n.stepdown.push(noLeader) 2650 n.Unlock() 2651 arPool.Put(ar) 2652 return 2653 } 2654 if ae.pindex != ar.index || ae.pterm != ar.term { 2655 n.debug("Our first entry [%d:%d] does not match request from follower [%d:%d]", ae.pterm, ae.pindex, ar.term, ar.index) 2656 } 2657 // Create a queue for delivering updates from responses. 2658 indexUpdates := newIPQueue[uint64](n.s, fmt.Sprintf("[ACC:%s] RAFT '%s' indexUpdates", n.accName, n.group)) 2659 indexUpdates.push(ae.pindex) 2660 n.progress[ar.peer] = indexUpdates 2661 n.Unlock() 2662 2663 n.s.startGoRoutine(func() { n.runCatchup(ar, indexUpdates) }) 2664 } 2665 2666 func (n *raft) loadEntry(index uint64) (*appendEntry, error) { 2667 var smp StoreMsg 2668 sm, err := n.wal.LoadMsg(index, &smp) 2669 if err != nil { 2670 return nil, err 2671 } 2672 return n.decodeAppendEntry(sm.msg, nil, _EMPTY_) 2673 } 2674 2675 // applyCommit will update our commit index and apply the entry to the apply queue. 2676 // lock should be held. 2677 func (n *raft) applyCommit(index uint64) error { 2678 if n.State() == Closed { 2679 return errNodeClosed 2680 } 2681 if index <= n.commit { 2682 n.debug("Ignoring apply commit for %d, already processed", index) 2683 return nil 2684 } 2685 original := n.commit 2686 n.commit = index 2687 2688 if n.State() == Leader { 2689 delete(n.acks, index) 2690 } 2691 2692 var fpae bool 2693 2694 ae := n.pae[index] 2695 if ae == nil { 2696 var state StreamState 2697 n.wal.FastState(&state) 2698 if index < state.FirstSeq { 2699 return nil 2700 } 2701 var err error 2702 if ae, err = n.loadEntry(index); err != nil { 2703 if err != ErrStoreClosed && err != ErrStoreEOF { 2704 n.warn("Got an error loading %d index: %v - will reset", index, err) 2705 if n.State() == Leader { 2706 n.stepdown.push(n.selectNextLeader()) 2707 } 2708 // Reset and cancel any catchup. 2709 n.resetWAL() 2710 n.cancelCatchup() 2711 } else { 2712 n.commit = original 2713 } 2714 return errEntryLoadFailed 2715 } 2716 } else { 2717 fpae = true 2718 } 2719 2720 ae.buf = nil 2721 2722 var committed []*Entry 2723 for _, e := range ae.entries { 2724 switch e.Type { 2725 case EntryNormal: 2726 committed = append(committed, e) 2727 case EntryOldSnapshot: 2728 // For old snapshots in our WAL. 
2729 committed = append(committed, newEntry(EntrySnapshot, e.Data)) 2730 case EntrySnapshot: 2731 committed = append(committed, e) 2732 case EntryPeerState: 2733 if n.State() != Leader { 2734 if ps, err := decodePeerState(e.Data); err == nil { 2735 n.processPeerState(ps) 2736 } 2737 } 2738 case EntryAddPeer: 2739 newPeer := string(e.Data) 2740 n.debug("Added peer %q", newPeer) 2741 2742 // Store our peer in our global peer map for all peers. 2743 peers.LoadOrStore(newPeer, newPeer) 2744 2745 // If we were on the removed list reverse that here. 2746 if n.removed != nil { 2747 delete(n.removed, newPeer) 2748 } 2749 2750 if lp, ok := n.peers[newPeer]; !ok { 2751 // We are not tracking this one automatically so we need to bump cluster size. 2752 n.peers[newPeer] = &lps{time.Now().UnixNano(), 0, true} 2753 } else { 2754 // Mark as added. 2755 lp.kp = true 2756 } 2757 // Adjust cluster size and quorum if needed. 2758 n.adjustClusterSizeAndQuorum() 2759 // Write out our new state. 2760 n.writePeerState(&peerState{n.peerNames(), n.csz, n.extSt}) 2761 // We pass these up as well. 2762 committed = append(committed, e) 2763 2764 case EntryRemovePeer: 2765 peer := string(e.Data) 2766 n.debug("Removing peer %q", peer) 2767 2768 // Make sure we have our removed map. 2769 if n.removed == nil { 2770 n.removed = make(map[string]struct{}) 2771 } 2772 n.removed[peer] = struct{}{} 2773 2774 if _, ok := n.peers[peer]; ok { 2775 delete(n.peers, peer) 2776 // We should decrease our cluster size since we are tracking this peer. 2777 n.adjustClusterSizeAndQuorum() 2778 // Write out our new state. 2779 n.writePeerState(&peerState{n.peerNames(), n.csz, n.extSt}) 2780 } 2781 2782 // If this is us and we are the leader we should attempt to stepdown. 2783 if peer == n.id && n.State() == Leader { 2784 n.stepdown.push(n.selectNextLeader()) 2785 } 2786 2787 // Remove from string intern map. 2788 peers.Delete(peer) 2789 2790 // We pass these up as well. 2791 committed = append(committed, e) 2792 } 2793 } 2794 if fpae { 2795 delete(n.pae, index) 2796 } 2797 // Pass to the upper layers if we have normal entries. It is 2798 // entirely possible that 'committed' might be an empty slice here, 2799 // which will happen if we've processed updates inline (like peer 2800 // states). In which case the upper layer will just call down with 2801 // Applied() with no further action. 2802 n.apply.push(newCommittedEntry(index, committed)) 2803 // Place back in the pool. 2804 ae.returnToPool() 2805 return nil 2806 } 2807 2808 // Used to track a success response and apply entries. 2809 func (n *raft) trackResponse(ar *appendEntryResponse) { 2810 if n.State() == Closed { 2811 return 2812 } 2813 2814 n.Lock() 2815 2816 // Update peer's last index. 2817 if ps := n.peers[ar.peer]; ps != nil && ar.index > ps.li { 2818 ps.li = ar.index 2819 } 2820 2821 // If we are tracking this peer as a catchup follower, update that here. 2822 if indexUpdateQ := n.progress[ar.peer]; indexUpdateQ != nil { 2823 indexUpdateQ.push(ar.index) 2824 } 2825 2826 // Ignore items already committed. 2827 if ar.index <= n.commit { 2828 n.Unlock() 2829 return 2830 } 2831 2832 // See if we have items to apply. 2833 var sendHB bool 2834 2835 if results := n.acks[ar.index]; results != nil { 2836 results[ar.peer] = struct{}{} 2837 if nr := len(results); nr >= n.qn { 2838 // We have a quorum. 
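// Quorum means qn = csz/2 + 1 acknowledgements, counting ourselves: the
// leader seeds acks[index] with its own ID when it stores the entry (see
// sendAppendEntry), so in a 3 node group a single successful follower
// response is enough (2 of 3) and a 5 node group needs two. Once the
// threshold is reached everything up to and including ar.index is committed
// and applied in order.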
2839 for index := n.commit + 1; index <= ar.index; index++ { 2840 if err := n.applyCommit(index); err != nil && err != errNodeClosed { 2841 n.error("Got an error applying commit for %d: %v", index, err) 2842 break 2843 } 2844 } 2845 sendHB = n.prop.len() == 0 2846 } 2847 } 2848 n.Unlock() 2849 2850 if sendHB { 2851 n.sendHeartbeat() 2852 } 2853 } 2854 2855 // Used to adjust cluster size and peer count based on added official peers. 2856 // lock should be held. 2857 func (n *raft) adjustClusterSizeAndQuorum() { 2858 pcsz, ncsz := n.csz, 0 2859 for _, peer := range n.peers { 2860 if peer.kp { 2861 ncsz++ 2862 } 2863 } 2864 n.csz = ncsz 2865 n.qn = n.csz/2 + 1 2866 2867 if ncsz > pcsz { 2868 n.debug("Expanding our clustersize: %d -> %d", pcsz, ncsz) 2869 n.lsut = time.Now() 2870 } else if ncsz < pcsz { 2871 n.debug("Decreasing our clustersize: %d -> %d", pcsz, ncsz) 2872 if n.State() == Leader { 2873 go n.sendHeartbeat() 2874 } 2875 } 2876 } 2877 2878 // Track interactions with this peer. 2879 func (n *raft) trackPeer(peer string) error { 2880 n.Lock() 2881 var needPeerAdd, isRemoved bool 2882 if n.removed != nil { 2883 _, isRemoved = n.removed[peer] 2884 } 2885 if n.State() == Leader { 2886 if lp, ok := n.peers[peer]; !ok || !lp.kp { 2887 // Check if this peer had been removed previously. 2888 needPeerAdd = !isRemoved 2889 } 2890 } 2891 if ps := n.peers[peer]; ps != nil { 2892 ps.ts = time.Now().UnixNano() 2893 } else if !isRemoved { 2894 n.peers[peer] = &lps{time.Now().UnixNano(), 0, false} 2895 } 2896 n.Unlock() 2897 2898 if needPeerAdd { 2899 n.ProposeAddPeer(peer) 2900 } 2901 return nil 2902 } 2903 2904 func (n *raft) runAsCandidate() { 2905 n.Lock() 2906 // Drain old responses. 2907 n.votes.drain() 2908 n.Unlock() 2909 2910 // Send out our request for votes. 2911 n.requestVote() 2912 2913 // We vote for ourselves. 2914 votes := map[string]struct{}{ 2915 n.ID(): {}, 2916 } 2917 2918 for { 2919 elect := n.electTimer() 2920 select { 2921 case <-n.entry.ch: 2922 n.processAppendEntries() 2923 case <-n.resp.ch: 2924 // Ignore 2925 n.resp.popOne() 2926 case <-n.s.quitCh: 2927 n.shutdown(false) 2928 return 2929 case <-n.quit: 2930 return 2931 case <-elect.C: 2932 n.switchToCandidate() 2933 return 2934 case <-n.votes.ch: 2935 // Because of drain() it is possible that we get nil from popOne(). 2936 vresp, ok := n.votes.popOne() 2937 if !ok { 2938 continue 2939 } 2940 n.RLock() 2941 nterm := n.term 2942 n.RUnlock() 2943 2944 if vresp.granted && nterm == vresp.term { 2945 // only track peers that would be our followers 2946 n.trackPeer(vresp.peer) 2947 votes[vresp.peer] = struct{}{} 2948 if n.wonElection(len(votes)) { 2949 // Become LEADER if we have won and gotten a quorum with everyone we should hear from. 2950 n.switchToLeader() 2951 return 2952 } 2953 } else if vresp.term > nterm { 2954 // if we observe a bigger term, we should start over again or risk forming a quorum fully knowing 2955 // someone with a better term exists. This is even the right thing to do if won == true. 2956 n.Lock() 2957 n.debug("Stepping down from candidate, detected higher term: %d vs %d", vresp.term, n.term) 2958 n.term = vresp.term 2959 n.vote = noVote 2960 n.writeTermVote() 2961 n.stepdown.push(noLeader) 2962 n.lxfer = false 2963 n.Unlock() 2964 } 2965 case <-n.reqs.ch: 2966 // Because of drain() it is possible that we get nil from popOne(). 
2967 if voteReq, ok := n.reqs.popOne(); ok { 2968 n.processVoteRequest(voteReq) 2969 } 2970 case <-n.stepdown.ch: 2971 if newLeader, ok := n.stepdown.popOne(); ok { 2972 n.switchToFollower(newLeader) 2973 return 2974 } 2975 } 2976 } 2977 } 2978 2979 // handleAppendEntry handles an append entry from the wire. This function 2980 // is an internal callback from the "asubj" append entry subscription. 2981 func (n *raft) handleAppendEntry(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) { 2982 msg = copyBytes(msg) 2983 if ae, err := n.decodeAppendEntry(msg, sub, reply); err == nil { 2984 // Push to the new entry channel. From here one of the worker 2985 // goroutines (runAsLeader, runAsFollower, runAsCandidate) will 2986 // pick it up. 2987 n.entry.push(ae) 2988 } else { 2989 n.warn("AppendEntry failed to be placed on internal channel: corrupt entry") 2990 } 2991 } 2992 2993 // cancelCatchup will stop an in-flight catchup by unsubscribing from the 2994 // catchup subscription. 2995 // Lock should be held. 2996 func (n *raft) cancelCatchup() { 2997 n.debug("Canceling catchup subscription since we are now up to date") 2998 2999 if n.catchup != nil && n.catchup.sub != nil { 3000 n.unsubscribe(n.catchup.sub) 3001 } 3002 n.catchup = nil 3003 } 3004 3005 // catchupStalled will try to determine if we are stalled. This is called 3006 // on a new entry from our leader. 3007 // Lock should be held. 3008 func (n *raft) catchupStalled() bool { 3009 if n.catchup == nil { 3010 return false 3011 } 3012 if n.catchup.pindex == n.pindex { 3013 return time.Since(n.catchup.active) > 2*time.Second 3014 } 3015 n.catchup.pindex = n.pindex 3016 n.catchup.active = time.Now() 3017 return false 3018 } 3019 3020 // createCatchup will create the state needed to track a catchup as it 3021 // runs. It then creates a unique inbox for this catchup and subscribes 3022 // to it. The remote side will stream entries to that subject. 3023 // Lock should be held. 3024 func (n *raft) createCatchup(ae *appendEntry) string { 3025 // Cleanup any old ones. 3026 if n.catchup != nil && n.catchup.sub != nil { 3027 n.unsubscribe(n.catchup.sub) 3028 } 3029 // Snapshot term and index. 3030 n.catchup = &catchupState{ 3031 cterm: ae.pterm, 3032 cindex: ae.pindex, 3033 pterm: n.pterm, 3034 pindex: n.pindex, 3035 active: time.Now(), 3036 } 3037 inbox := n.newCatchupInbox() 3038 sub, _ := n.subscribe(inbox, n.handleAppendEntry) 3039 n.catchup.sub = sub 3040 3041 return inbox 3042 } 3043 3044 // Truncate our WAL and reset. 3045 // Lock should be held. 3046 func (n *raft) truncateWAL(term, index uint64) { 3047 n.debug("Truncating and repairing WAL to Term %d Index %d", term, index) 3048 3049 if term == 0 && index == 0 { 3050 n.warn("Resetting WAL state") 3051 } 3052 3053 defer func() { 3054 // Check to see if we invalidated any snapshots that might have held state 3055 // from the entries we are truncating. 3056 if snap, _ := n.loadLastSnapshot(); snap != nil && snap.lastIndex >= index { 3057 os.Remove(n.snapfile) 3058 n.snapfile = _EMPTY_ 3059 } 3060 // Make sure to reset commit and applied if above 3061 if n.commit > n.pindex { 3062 n.commit = n.pindex 3063 } 3064 if n.applied > n.commit { 3065 n.applied = n.commit 3066 } 3067 }() 3068 3069 if err := n.wal.Truncate(index); err != nil { 3070 // If we get an invalid sequence, reset our wal all together. 
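// Resetting means truncating to sequence 0 and zeroing term/pterm/pindex, so
// we effectively start over with an empty log and rely on a snapshot catchup
// from the current leader to rebuild our state. Truncating to a specific
// index, by contrast, is the normal Raft conflict resolution: drop everything
// from the conflicting entry onward and let the leader resend what follows.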
3071 if err == ErrInvalidSequence { 3072 n.debug("Resetting WAL") 3073 n.wal.Truncate(0) 3074 index, n.term, n.pterm, n.pindex = 0, 0, 0, 0 3075 } else { 3076 n.warn("Error truncating WAL: %v", err) 3077 n.setWriteErrLocked(err) 3078 } 3079 return 3080 } 3081 3082 // Set after we know we have truncated properly. 3083 n.term, n.pterm, n.pindex = term, term, index 3084 } 3085 3086 // Reset our WAL. This is equivalent to truncating all data from the log. 3087 // Lock should be held. 3088 func (n *raft) resetWAL() { 3089 n.truncateWAL(0, 0) 3090 } 3091 3092 // Lock should be held 3093 func (n *raft) updateLeader(newLeader string) { 3094 n.leader = newLeader 3095 if !n.pleader && newLeader != noLeader { 3096 n.pleader = true 3097 } 3098 } 3099 3100 // processAppendEntry will process an appendEntry. This is called either 3101 // during recovery or from processAppendEntries when there are new entries 3102 // to be committed. 3103 func (n *raft) processAppendEntry(ae *appendEntry, sub *subscription) { 3104 n.Lock() 3105 // Don't reset here if we have been asked to assume leader position. 3106 if !n.lxfer { 3107 n.resetElectionTimeout() 3108 } 3109 3110 // Just return if closed or we had previous write error. 3111 if n.State() == Closed || n.werr != nil { 3112 n.Unlock() 3113 return 3114 } 3115 3116 // Scratch buffer for responses. 3117 var scratch [appendEntryResponseLen]byte 3118 arbuf := scratch[:] 3119 3120 // Are we receiving from another leader. 3121 if n.State() == Leader { 3122 // If we are the same we should step down to break the tie. 3123 if ae.term >= n.term { 3124 // If the append entry term is newer than the current term, erase our 3125 // vote. 3126 if ae.term > n.term { 3127 n.term = ae.term 3128 n.vote = noVote 3129 n.writeTermVote() 3130 } 3131 n.debug("Received append entry from another leader, stepping down to %q", ae.leader) 3132 n.stepdown.push(ae.leader) 3133 } else { 3134 // Let them know we are the leader. 3135 ar := newAppendEntryResponse(n.term, n.pindex, n.id, false) 3136 n.debug("AppendEntry ignoring old term from another leader") 3137 n.sendRPC(ae.reply, _EMPTY_, ar.encode(arbuf)) 3138 arPool.Put(ar) 3139 } 3140 // Always return here from processing. 3141 n.Unlock() 3142 return 3143 } 3144 3145 // If we received an append entry as a candidate then it would appear that 3146 // another node has taken on the leader role already, so we should convert 3147 // to a follower of that node instead. 3148 if n.State() == Candidate { 3149 // Ignore old terms, otherwise we might end up stepping down incorrectly. 3150 if ae.term >= n.term { 3151 // If the append entry term is newer than the current term, erase our 3152 // vote. 3153 if ae.term > n.term { 3154 n.term = ae.term 3155 n.vote = noVote 3156 n.writeTermVote() 3157 } 3158 n.debug("Received append entry in candidate state from %q, converting to follower", ae.leader) 3159 n.stepdown.push(ae.leader) 3160 } 3161 } 3162 3163 // Catching up state. 3164 catchingUp := n.catchup != nil 3165 // Is this a new entry? New entries will be delivered on the append entry 3166 // sub, rather than a catch-up sub. 3167 isNew := sub != nil && sub == n.aesub 3168 3169 // Track leader directly 3170 if isNew && ae.leader != noLeader { 3171 if ps := n.peers[ae.leader]; ps != nil { 3172 ps.ts = time.Now().UnixNano() 3173 } else { 3174 n.peers[ae.leader] = &lps{time.Now().UnixNano(), 0, true} 3175 } 3176 } 3177 3178 // If we are catching up ignore old catchup subs. 3179 // This could happen when we stall or cancel a catchup. 
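// Each catchup gets its own inbox subscription (see createCatchup), so an
// entry arriving on anything other than the current catchup sub or the main
// append entry sub belongs to a catchup we have since abandoned, and we drop
// it rather than risk applying stale state.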
3180 if !isNew && catchingUp && sub != n.catchup.sub { 3181 n.Unlock() 3182 n.debug("AppendEntry ignoring old entry from previous catchup") 3183 return 3184 } 3185 3186 // Check state if we are catching up. 3187 if catchingUp { 3188 if cs := n.catchup; cs != nil && n.pterm >= cs.cterm && n.pindex >= cs.cindex { 3189 // If we are here we are good, so if we have a catchup pending we can cancel. 3190 n.cancelCatchup() 3191 // Reset our notion of catching up. 3192 catchingUp = false 3193 } else if isNew { 3194 var ar *appendEntryResponse 3195 var inbox string 3196 // Check to see if we are stalled. If so recreate our catchup state and resend response. 3197 if n.catchupStalled() { 3198 n.debug("Catchup may be stalled, will request again") 3199 inbox = n.createCatchup(ae) 3200 ar = newAppendEntryResponse(n.pterm, n.pindex, n.id, false) 3201 } 3202 n.Unlock() 3203 if ar != nil { 3204 n.sendRPC(ae.reply, inbox, ar.encode(arbuf)) 3205 arPool.Put(ar) 3206 } 3207 // Ignore new while catching up or replaying. 3208 return 3209 } 3210 } 3211 3212 // If this term is greater than ours. 3213 if ae.term > n.term { 3214 n.pterm = ae.pterm 3215 n.term = ae.term 3216 n.vote = noVote 3217 if isNew { 3218 n.writeTermVote() 3219 } 3220 if n.State() != Follower { 3221 n.debug("Term higher than ours and we are not a follower: %v, stepping down to %q", n.State(), ae.leader) 3222 n.stepdown.push(ae.leader) 3223 } 3224 } 3225 3226 if isNew && n.leader != ae.leader && n.State() == Follower { 3227 n.debug("AppendEntry updating leader to %q", ae.leader) 3228 n.updateLeader(ae.leader) 3229 n.writeTermVote() 3230 n.resetElectionTimeout() 3231 n.updateLeadChange(false) 3232 } 3233 3234 if (isNew && ae.pterm != n.pterm) || ae.pindex != n.pindex { 3235 // Check if this is a lower or equal index than what we were expecting. 3236 if ae.pindex <= n.pindex { 3237 n.debug("AppendEntry detected pindex less than ours: %d:%d vs %d:%d", ae.pterm, ae.pindex, n.pterm, n.pindex) 3238 var ar *appendEntryResponse 3239 3240 var success bool 3241 if eae, _ := n.loadEntry(ae.pindex); eae == nil { 3242 // If terms are equal, and we are not catching up, we have simply already processed this message. 3243 // So we will ACK back to the leader. This can happen on server restarts based on timings of snapshots. 3244 if ae.pterm == n.pterm && !catchingUp { 3245 success = true 3246 } else { 3247 n.resetWAL() 3248 } 3249 } else { 3250 // If terms mismatched, or we got an error loading, delete that entry and all others past it. 3251 // Make sure to cancel any catchups in progress. 3252 // Truncate will reset our pterm and pindex. Only do so if we have an entry. 3253 n.truncateWAL(ae.pterm, ae.pindex) 3254 } 3255 // Cancel regardless. 3256 n.cancelCatchup() 3257 3258 // Create response. 3259 ar = newAppendEntryResponse(ae.pterm, ae.pindex, n.id, success) 3260 n.Unlock() 3261 n.sendRPC(ae.reply, _EMPTY_, ar.encode(arbuf)) 3262 arPool.Put(ar) 3263 return 3264 } 3265 3266 // Check if we are catching up. If we are here we know the leader did not have all of the entries 3267 // so make sure this is a snapshot entry. If it is not start the catchup process again since it 3268 // means we may have missed additional messages. 3269 if catchingUp { 3270 // Check if only our terms do not match here. 3271 if ae.pindex == n.pindex { 3272 // Make sure pterms match and we take on the leader's. 3273 // This prevents constant spinning. 
3274 n.truncateWAL(ae.pterm, ae.pindex) 3275 n.cancelCatchup() 3276 n.Unlock() 3277 return 3278 } 3279 // This means we already entered into a catchup state but what the leader sent us did not match what we expected. 3280 // Snapshots and peerstate will always be together when a leader is catching us up in this fashion. 3281 if len(ae.entries) != 2 || ae.entries[0].Type != EntrySnapshot || ae.entries[1].Type != EntryPeerState { 3282 n.warn("Expected first catchup entry to be a snapshot and peerstate, will retry") 3283 n.cancelCatchup() 3284 n.Unlock() 3285 return 3286 } 3287 3288 if ps, err := decodePeerState(ae.entries[1].Data); err == nil { 3289 n.processPeerState(ps) 3290 // Also need to copy from client's buffer. 3291 ae.entries[0].Data = copyBytes(ae.entries[0].Data) 3292 } else { 3293 n.warn("Could not parse snapshot peerstate correctly") 3294 n.cancelCatchup() 3295 n.Unlock() 3296 return 3297 } 3298 3299 n.pindex = ae.pindex 3300 n.pterm = ae.pterm 3301 n.commit = ae.pindex 3302 3303 if _, err := n.wal.Compact(n.pindex + 1); err != nil { 3304 n.setWriteErrLocked(err) 3305 n.Unlock() 3306 return 3307 } 3308 3309 // Now send snapshot to upper levels. Only send the snapshot, not the peerstate entry. 3310 n.apply.push(newCommittedEntry(n.commit, ae.entries[:1])) 3311 n.Unlock() 3312 return 3313 3314 } else { 3315 n.debug("AppendEntry did not match %d %d with %d %d", ae.pterm, ae.pindex, n.pterm, n.pindex) 3316 // Reset our term. 3317 n.term = n.pterm 3318 if ae.pindex > n.pindex { 3319 // Setup our state for catching up. 3320 inbox := n.createCatchup(ae) 3321 ar := newAppendEntryResponse(n.pterm, n.pindex, n.id, false) 3322 n.Unlock() 3323 n.sendRPC(ae.reply, inbox, ar.encode(arbuf)) 3324 arPool.Put(ar) 3325 return 3326 } 3327 } 3328 } 3329 3330 // Save to our WAL if we have entries. 3331 if ae.shouldStore() { 3332 // Only store if an original which will have sub != nil 3333 if sub != nil { 3334 if err := n.storeToWAL(ae); err != nil { 3335 if err != ErrStoreClosed { 3336 n.warn("Error storing entry to WAL: %v", err) 3337 } 3338 n.Unlock() 3339 return 3340 } 3341 // Save in memory for faster processing during applyCommit. 3342 // Only save so many however to avoid memory bloat. 3343 if l := len(n.pae); l <= paeDropThreshold { 3344 n.pae[n.pindex], l = ae, l+1 3345 if l > paeWarnThreshold && l%paeWarnModulo == 0 { 3346 n.warn("%d append entries pending", len(n.pae)) 3347 } 3348 } else { 3349 n.debug("Not saving to append entries pending") 3350 } 3351 } else { 3352 // This is a replay on startup so just take the appendEntry version. 3353 n.pterm = ae.term 3354 n.pindex = ae.pindex + 1 3355 } 3356 } 3357 3358 // Check to see if we have any related entries to process here. 3359 for _, e := range ae.entries { 3360 switch e.Type { 3361 case EntryLeaderTransfer: 3362 // Only process these if they are new, so no replays or catchups. 3363 if isNew { 3364 maybeLeader := string(e.Data) 3365 // This is us. We need to check if we can become the leader. 3366 if maybeLeader == n.id { 3367 // If not an observer and not paused we are good to go. 3368 if !n.observer && !n.paused { 3369 n.lxfer = true 3370 n.xferCampaign() 3371 } else if n.paused && !n.pobserver { 3372 // Here we can become a leader but need to wait for resume of the apply queue. 3373 n.lxfer = true 3374 } 3375 } else if n.vote != noVote { 3376 // Since we are here we are not the chosen one but we should clear any vote preference. 
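// Clearing any recorded vote here is meant to keep the transfer smooth: the
// named peer is about to campaign, and processVoteRequest only grants a vote
// when we have not voted or already voted for that same candidate, so
// shedding a stale preference leaves us free to say yes right away.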
3377 n.vote = noVote 3378 n.writeTermVote() 3379 } 3380 } 3381 case EntryAddPeer: 3382 if newPeer := string(e.Data); len(newPeer) == idLen { 3383 // Track directly, but wait for commit to be official 3384 if ps := n.peers[newPeer]; ps != nil { 3385 ps.ts = time.Now().UnixNano() 3386 } else { 3387 n.peers[newPeer] = &lps{time.Now().UnixNano(), 0, false} 3388 } 3389 // Store our peer in our global peer map for all peers. 3390 peers.LoadOrStore(newPeer, newPeer) 3391 } 3392 } 3393 } 3394 3395 // Apply anything we need here. 3396 if ae.commit > n.commit { 3397 if n.paused { 3398 n.hcommit = ae.commit 3399 n.debug("Paused, not applying %d", ae.commit) 3400 } else { 3401 for index := n.commit + 1; index <= ae.commit; index++ { 3402 if err := n.applyCommit(index); err != nil { 3403 break 3404 } 3405 } 3406 } 3407 } 3408 3409 var ar *appendEntryResponse 3410 if sub != nil { 3411 ar = newAppendEntryResponse(n.pterm, n.pindex, n.id, true) 3412 } 3413 n.Unlock() 3414 3415 // Success. Send our response. 3416 if ar != nil { 3417 n.sendRPC(ae.reply, _EMPTY_, ar.encode(arbuf)) 3418 arPool.Put(ar) 3419 } 3420 } 3421 3422 // processPeerState is called when a peer state entry is received 3423 // over the wire or when we're updating known peers. 3424 // Lock should be held. 3425 func (n *raft) processPeerState(ps *peerState) { 3426 // Update our version of peers to that of the leader. Calculate 3427 // the number of nodes needed to establish a quorum. 3428 n.csz = ps.clusterSize 3429 n.qn = n.csz/2 + 1 3430 3431 old := n.peers 3432 n.peers = make(map[string]*lps) 3433 for _, peer := range ps.knownPeers { 3434 if lp := old[peer]; lp != nil { 3435 lp.kp = true 3436 n.peers[peer] = lp 3437 } else { 3438 n.peers[peer] = &lps{0, 0, true} 3439 } 3440 } 3441 n.debug("Update peers from leader to %+v", n.peers) 3442 n.writePeerState(ps) 3443 } 3444 3445 // processAppendEntryResponse is called when we receive an append entry 3446 // response from another node. They will send a confirmation to tell us 3447 // whether they successfully committed the entry or not. 3448 func (n *raft) processAppendEntryResponse(ar *appendEntryResponse) { 3449 n.trackPeer(ar.peer) 3450 3451 if ar.success { 3452 // The remote node successfully committed the append entry. 3453 n.trackResponse(ar) 3454 arPool.Put(ar) 3455 } else if ar.term > n.term { 3456 // The remote node didn't commit the append entry, it looks like 3457 // they are on a newer term than we are. Step down. 3458 n.Lock() 3459 n.term = ar.term 3460 n.vote = noVote 3461 n.writeTermVote() 3462 n.warn("Detected another leader with higher term, will stepdown and reset") 3463 n.stepdown.push(noLeader) 3464 n.resetWAL() 3465 n.Unlock() 3466 arPool.Put(ar) 3467 } else if ar.reply != _EMPTY_ { 3468 // The remote node didn't commit the append entry and they are 3469 // still on the same term, so let's try to catch them up. 3470 n.catchupFollower(ar) 3471 } 3472 } 3473 3474 // handleAppendEntryResponse processes responses to append entries. 3475 func (n *raft) handleAppendEntryResponse(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) { 3476 ar := n.decodeAppendEntryResponse(msg) 3477 ar.reply = reply 3478 n.resp.push(ar) 3479 } 3480 3481 func (n *raft) buildAppendEntry(entries []*Entry) *appendEntry { 3482 return newAppendEntry(n.id, n.term, n.commit, n.pterm, n.pindex, entries) 3483 } 3484 3485 // Determine if we should store an entry. This stops us from storing 3486 // heartbeat messages. 
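// A heartbeat is simply an append entry with no entries at all (see
// sendHeartbeat/sendAppendEntry): it still resets follower election timers
// and advertises the leader's commit index, but there is nothing worth
// persisting, so it never touches the WAL and never advances pindex.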
3487 func (ae *appendEntry) shouldStore() bool { 3488 return ae != nil && len(ae.entries) > 0 3489 } 3490 3491 // Store our append entry to our WAL. 3492 // lock should be held. 3493 func (n *raft) storeToWAL(ae *appendEntry) error { 3494 if ae == nil { 3495 return fmt.Errorf("raft: Missing append entry for storage") 3496 } 3497 if n.werr != nil { 3498 return n.werr 3499 } 3500 3501 seq, _, err := n.wal.StoreMsg(_EMPTY_, nil, ae.buf) 3502 if err != nil { 3503 n.setWriteErrLocked(err) 3504 return err 3505 } 3506 3507 // Sanity checking for now. 3508 if index := ae.pindex + 1; index != seq { 3509 n.warn("Wrong index, ae is %+v, index stored was %d, n.pindex is %d, will reset", ae, seq, n.pindex) 3510 if n.State() == Leader { 3511 n.stepdown.push(n.selectNextLeader()) 3512 } 3513 // Reset and cancel any catchup. 3514 n.resetWAL() 3515 n.cancelCatchup() 3516 return errEntryStoreFailed 3517 } 3518 3519 n.pterm = ae.term 3520 n.pindex = seq 3521 return nil 3522 } 3523 3524 const ( 3525 paeDropThreshold = 20_000 3526 paeWarnThreshold = 10_000 3527 paeWarnModulo = 5_000 3528 ) 3529 3530 func (n *raft) sendAppendEntry(entries []*Entry) { 3531 n.Lock() 3532 defer n.Unlock() 3533 ae := n.buildAppendEntry(entries) 3534 3535 var err error 3536 var scratch [1024]byte 3537 ae.buf, err = ae.encode(scratch[:]) 3538 if err != nil { 3539 return 3540 } 3541 3542 // If we have entries store this in our wal. 3543 shouldStore := ae.shouldStore() 3544 if shouldStore { 3545 if err := n.storeToWAL(ae); err != nil { 3546 return 3547 } 3548 // We count ourselves. 3549 n.acks[n.pindex] = map[string]struct{}{n.id: {}} 3550 n.active = time.Now() 3551 3552 // Save in memory for faster processing during applyCommit. 3553 n.pae[n.pindex] = ae 3554 if l := len(n.pae); l > paeWarnThreshold && l%paeWarnModulo == 0 { 3555 n.warn("%d append entries pending", len(n.pae)) 3556 } 3557 } 3558 n.sendRPC(n.asubj, n.areply, ae.buf) 3559 if !shouldStore { 3560 ae.returnToPool() 3561 } 3562 } 3563 3564 type extensionState uint16 3565 3566 const ( 3567 extUndetermined = extensionState(iota) 3568 extExtended 3569 extNotExtended 3570 ) 3571 3572 type peerState struct { 3573 knownPeers []string 3574 clusterSize int 3575 domainExt extensionState 3576 } 3577 3578 func peerStateBufSize(ps *peerState) int { 3579 return 4 + 4 + (idLen * len(ps.knownPeers)) + 2 3580 } 3581 3582 func encodePeerState(ps *peerState) []byte { 3583 var le = binary.LittleEndian 3584 buf := make([]byte, peerStateBufSize(ps)) 3585 le.PutUint32(buf[0:], uint32(ps.clusterSize)) 3586 le.PutUint32(buf[4:], uint32(len(ps.knownPeers))) 3587 wi := 8 3588 for _, peer := range ps.knownPeers { 3589 copy(buf[wi:], peer) 3590 wi += idLen 3591 } 3592 le.PutUint16(buf[wi:], uint16(ps.domainExt)) 3593 return buf 3594 } 3595 3596 func decodePeerState(buf []byte) (*peerState, error) { 3597 if len(buf) < 8 { 3598 return nil, errCorruptPeers 3599 } 3600 var le = binary.LittleEndian 3601 ps := &peerState{clusterSize: int(le.Uint32(buf[0:]))} 3602 expectedPeers := int(le.Uint32(buf[4:])) 3603 buf = buf[8:] 3604 ri := 0 3605 for i, n := 0, expectedPeers; i < n && ri < len(buf); i++ { 3606 ps.knownPeers = append(ps.knownPeers, string(buf[ri:ri+idLen])) 3607 ri += idLen 3608 } 3609 if len(ps.knownPeers) != expectedPeers { 3610 return nil, errCorruptPeers 3611 } 3612 if len(buf[ri:]) >= 2 { 3613 ps.domainExt = extensionState(le.Uint16(buf[ri:])) 3614 } 3615 return ps, nil 3616 } 3617 3618 // Lock should be held. 
3619 func (n *raft) peerNames() []string { 3620 var peers []string 3621 for name, peer := range n.peers { 3622 if peer.kp { 3623 peers = append(peers, name) 3624 } 3625 } 3626 return peers 3627 } 3628 3629 func (n *raft) currentPeerState() *peerState { 3630 n.RLock() 3631 ps := &peerState{n.peerNames(), n.csz, n.extSt} 3632 n.RUnlock() 3633 return ps 3634 } 3635 3636 // sendPeerState will send our current peer state to the cluster. 3637 func (n *raft) sendPeerState() { 3638 n.sendAppendEntry([]*Entry{{EntryPeerState, encodePeerState(n.currentPeerState())}}) 3639 } 3640 3641 // Send a heartbeat. 3642 func (n *raft) sendHeartbeat() { 3643 n.sendAppendEntry(nil) 3644 } 3645 3646 type voteRequest struct { 3647 term uint64 3648 lastTerm uint64 3649 lastIndex uint64 3650 candidate string 3651 // internal only. 3652 reply string 3653 } 3654 3655 const voteRequestLen = 24 + idLen 3656 3657 func (vr *voteRequest) encode() []byte { 3658 var buf [voteRequestLen]byte 3659 var le = binary.LittleEndian 3660 le.PutUint64(buf[0:], vr.term) 3661 le.PutUint64(buf[8:], vr.lastTerm) 3662 le.PutUint64(buf[16:], vr.lastIndex) 3663 copy(buf[24:24+idLen], vr.candidate) 3664 3665 return buf[:voteRequestLen] 3666 } 3667 3668 func decodeVoteRequest(msg []byte, reply string) *voteRequest { 3669 if len(msg) != voteRequestLen { 3670 return nil 3671 } 3672 3673 var le = binary.LittleEndian 3674 return &voteRequest{ 3675 term: le.Uint64(msg[0:]), 3676 lastTerm: le.Uint64(msg[8:]), 3677 lastIndex: le.Uint64(msg[16:]), 3678 candidate: string(copyBytes(msg[24 : 24+idLen])), 3679 reply: reply, 3680 } 3681 } 3682 3683 const peerStateFile = "peers.idx" 3684 3685 // Lock should be held. 3686 func (n *raft) writePeerState(ps *peerState) { 3687 pse := encodePeerState(ps) 3688 if bytes.Equal(n.wps, pse) { 3689 return 3690 } 3691 // Stamp latest and write the peer state file. 3692 n.wps = pse 3693 if err := writePeerState(n.sd, ps); err != nil && !n.isClosed() { 3694 n.setWriteErrLocked(err) 3695 n.warn("Error writing peer state file for %q: %v", n.group, err) 3696 } 3697 } 3698 3699 // Writes out our peer state outside of a specific raft context. 3700 func writePeerState(sd string, ps *peerState) error { 3701 psf := filepath.Join(sd, peerStateFile) 3702 if _, err := os.Stat(psf); err != nil && !os.IsNotExist(err) { 3703 return err 3704 } 3705 3706 <-dios 3707 err := os.WriteFile(psf, encodePeerState(ps), defaultFilePerms) 3708 dios <- struct{}{} 3709 3710 return err 3711 } 3712 3713 func readPeerState(sd string) (ps *peerState, err error) { 3714 <-dios 3715 buf, err := os.ReadFile(filepath.Join(sd, peerStateFile)) 3716 dios <- struct{}{} 3717 3718 if err != nil { 3719 return nil, err 3720 } 3721 return decodePeerState(buf) 3722 } 3723 3724 const termVoteFile = "tav.idx" 3725 const termVoteLen = idLen + 8 3726 3727 // Writes out our term & vote outside of a specific raft context. 3728 func writeTermVote(sd string, wtv []byte) error { 3729 psf := filepath.Join(sd, termVoteFile) 3730 if _, err := os.Stat(psf); err != nil && !os.IsNotExist(err) { 3731 return err 3732 } 3733 3734 <-dios 3735 err := os.WriteFile(psf, wtv, defaultFilePerms) 3736 dios <- struct{}{} 3737 3738 return err 3739 } 3740 3741 // readTermVote will read the largest term and who we voted from to stable storage. 3742 // Lock should be held. 
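// The tav.idx layout mirrors writeTermVote: the first 8 bytes are the term as
// a little-endian uint64 and the remaining bytes are the ID of the peer we
// voted for in that term (empty if we have not voted). A file shorter than
// termVoteLen is treated as "no vote recorded" rather than as an error.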
3743 func (n *raft) readTermVote() (term uint64, voted string, err error) { 3744 <-dios 3745 buf, err := os.ReadFile(filepath.Join(n.sd, termVoteFile)) 3746 dios <- struct{}{} 3747 3748 if err != nil { 3749 return 0, noVote, err 3750 } 3751 if len(buf) < termVoteLen { 3752 return 0, noVote, nil 3753 } 3754 var le = binary.LittleEndian 3755 term = le.Uint64(buf[0:]) 3756 voted = string(buf[8:]) 3757 return term, voted, nil 3758 } 3759 3760 // Lock should be held. 3761 func (n *raft) setWriteErrLocked(err error) { 3762 // Check if we are closed already. 3763 if n.State() == Closed { 3764 return 3765 } 3766 // Ignore if already set. 3767 if n.werr == err || err == nil { 3768 return 3769 } 3770 // Ignore non-write errors. 3771 if err == ErrStoreClosed || 3772 err == ErrStoreEOF || 3773 err == ErrInvalidSequence || 3774 err == ErrStoreMsgNotFound || 3775 err == errNoPending || 3776 err == errPartialCache { 3777 return 3778 } 3779 // If this is a not found report but do not disable. 3780 if os.IsNotExist(err) { 3781 n.error("Resource not found: %v", err) 3782 return 3783 } 3784 n.error("Critical write error: %v", err) 3785 n.werr = err 3786 3787 if isOutOfSpaceErr(err) { 3788 // For now since this can be happening all under the covers, we will call up and disable JetStream. 3789 go n.s.handleOutOfSpace(nil) 3790 } 3791 } 3792 3793 // Helper to check if we are closed when we do not hold a lock already. 3794 func (n *raft) isClosed() bool { 3795 return n.State() == Closed 3796 } 3797 3798 // Capture our write error if any and hold. 3799 func (n *raft) setWriteErr(err error) { 3800 n.Lock() 3801 defer n.Unlock() 3802 n.setWriteErrLocked(err) 3803 } 3804 3805 // writeTermVote will record the largest term and who we voted for to stable storage. 3806 // Lock should be held. 3807 func (n *raft) writeTermVote() { 3808 var buf [termVoteLen]byte 3809 var le = binary.LittleEndian 3810 le.PutUint64(buf[0:], n.term) 3811 copy(buf[8:], n.vote) 3812 b := buf[:8+len(n.vote)] 3813 3814 // If the term and vote hasn't changed then don't rewrite to disk. 3815 if bytes.Equal(n.wtv, b) { 3816 return 3817 } 3818 // Stamp latest and write the term & vote file. 3819 n.wtv = b 3820 if err := writeTermVote(n.sd, n.wtv); err != nil && !n.isClosed() { 3821 n.setWriteErrLocked(err) 3822 n.warn("Error writing term and vote file for %q: %v", n.group, err) 3823 } 3824 } 3825 3826 // voteResponse is a response to a vote request. 
3827 type voteResponse struct { 3828 term uint64 3829 peer string 3830 granted bool 3831 } 3832 3833 const voteResponseLen = 8 + 8 + 1 3834 3835 func (vr *voteResponse) encode() []byte { 3836 var buf [voteResponseLen]byte 3837 var le = binary.LittleEndian 3838 le.PutUint64(buf[0:], vr.term) 3839 copy(buf[8:], vr.peer) 3840 if vr.granted { 3841 buf[16] = 1 3842 } else { 3843 buf[16] = 0 3844 } 3845 return buf[:voteResponseLen] 3846 } 3847 3848 func decodeVoteResponse(msg []byte) *voteResponse { 3849 if len(msg) != voteResponseLen { 3850 return nil 3851 } 3852 var le = binary.LittleEndian 3853 vr := &voteResponse{term: le.Uint64(msg[0:]), peer: string(msg[8:16])} 3854 vr.granted = msg[16] == 1 3855 return vr 3856 } 3857 3858 func (n *raft) handleVoteResponse(sub *subscription, c *client, _ *Account, _, reply string, msg []byte) { 3859 vr := decodeVoteResponse(msg) 3860 n.debug("Received a voteResponse %+v", vr) 3861 if vr == nil { 3862 n.error("Received malformed vote response for %q", n.group) 3863 return 3864 } 3865 3866 if state := n.State(); state != Candidate && state != Leader { 3867 n.debug("Ignoring old vote response, we have stepped down") 3868 return 3869 } 3870 3871 n.votes.push(vr) 3872 } 3873 3874 func (n *raft) processVoteRequest(vr *voteRequest) error { 3875 // To simplify calling code, we can possibly pass `nil` to this function. 3876 // If that is the case, does not consider it an error. 3877 if vr == nil { 3878 return nil 3879 } 3880 n.debug("Received a voteRequest %+v", vr) 3881 3882 if err := n.trackPeer(vr.candidate); err != nil { 3883 return err 3884 } 3885 3886 n.Lock() 3887 n.resetElectionTimeout() 3888 3889 vresp := &voteResponse{n.term, n.id, false} 3890 defer n.debug("Sending a voteResponse %+v -> %q", vresp, vr.reply) 3891 3892 // Ignore if we are newer. This is important so that we don't accidentally process 3893 // votes from a previous term if they were still in flight somewhere. 3894 if vr.term < n.term { 3895 n.Unlock() 3896 n.sendReply(vr.reply, vresp.encode()) 3897 return nil 3898 } 3899 3900 // If this is a higher term go ahead and stepdown. 3901 if vr.term > n.term { 3902 if n.State() != Follower { 3903 n.debug("Stepping down from %s, detected higher term: %d vs %d", 3904 strings.ToLower(n.State().String()), vr.term, n.term) 3905 n.stepdown.push(noLeader) 3906 n.term = vr.term 3907 } 3908 n.vote = noVote 3909 n.writeTermVote() 3910 } 3911 3912 // Only way we get to yes is through here. 
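// This is Raft's "at least as up to date" rule: grant the vote only if we
// have not already voted for someone else this term and the candidate's log
// is no older than ours, i.e. its last term is higher than our pterm, or the
// terms match and its last index is at least our pindex. For example, a
// candidate at (term 5, index 100) is rejected by a voter whose log ends at
// (term 5, index 102), but accepted by one whose log ends at (term 4, index 180).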

func (n *raft) processVoteRequest(vr *voteRequest) error {
	// To simplify calling code, we may be passed `nil` here.
	// If that is the case, we do not consider it an error.
	if vr == nil {
		return nil
	}
	n.debug("Received a voteRequest %+v", vr)

	if err := n.trackPeer(vr.candidate); err != nil {
		return err
	}

	n.Lock()
	n.resetElectionTimeout()

	vresp := &voteResponse{n.term, n.id, false}
	defer n.debug("Sending a voteResponse %+v -> %q", vresp, vr.reply)

	// Ignore if we are newer. This is important so that we don't accidentally process
	// votes from a previous term if they were still in flight somewhere.
	if vr.term < n.term {
		n.Unlock()
		n.sendReply(vr.reply, vresp.encode())
		return nil
	}

	// If this is a higher term, go ahead and step down.
	if vr.term > n.term {
		if n.State() != Follower {
			n.debug("Stepping down from %s, detected higher term: %d vs %d",
				strings.ToLower(n.State().String()), vr.term, n.term)
			n.stepdown.push(noLeader)
			n.term = vr.term
		}
		n.vote = noVote
		n.writeTermVote()
	}

	// The only way we get to yes is through here.
	voteOk := n.vote == noVote || n.vote == vr.candidate
	if voteOk && (vr.lastTerm > n.pterm || vr.lastTerm == n.pterm && vr.lastIndex >= n.pindex) {
		vresp.granted = true
		n.term = vr.term
		n.vote = vr.candidate
		n.writeTermVote()
	} else {
		if vr.term >= n.term && n.vote == noVote {
			n.term = vr.term
			n.resetElect(randCampaignTimeout())
		}
	}

	// The term might have changed, so make sure the response carries the most current one.
	vresp.term = n.term

	n.Unlock()

	n.sendReply(vr.reply, vresp.encode())

	return nil
}

func (n *raft) handleVoteRequest(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) {
	vr := decodeVoteRequest(msg, reply)
	if vr == nil {
		n.error("Received malformed vote request for %q", n.group)
		return
	}
	n.reqs.push(vr)
}

func (n *raft) requestVote() {
	n.Lock()
	if n.State() != Candidate {
		n.Unlock()
		return
	}
	n.vote = n.id
	n.writeTermVote()
	vr := voteRequest{n.term, n.pterm, n.pindex, n.id, _EMPTY_}
	subj, reply := n.vsubj, n.vreply
	n.Unlock()

	n.debug("Sending out voteRequest %+v", vr)

	// Now send it out.
	n.sendRPC(subj, reply, vr.encode())
}

func (n *raft) sendRPC(subject, reply string, msg []byte) {
	if n.sq != nil {
		n.sq.send(subject, reply, nil, msg)
	}
}

func (n *raft) sendReply(subject string, msg []byte) {
	if n.sq != nil {
		n.sq.send(subject, _EMPTY_, nil, msg)
	}
}

func (n *raft) wonElection(votes int) bool {
	return votes >= n.quorumNeeded()
}

// Return the quorum size for a given cluster config.
func (n *raft) quorumNeeded() int {
	n.RLock()
	qn := n.qn
	n.RUnlock()
	return qn
}

// Lock should be held.
func (n *raft) updateLeadChange(isLeader bool) {
	// We don't care about values that have not been consumed (transitory states),
	// so we dequeue any state that is pending and push the new one.
	for {
		select {
		case n.leadc <- isLeader:
			return
		default:
			select {
			case <-n.leadc:
			default:
				// May have been consumed by the "reader" goroutine, so go back
				// to the top of the loop and try to send again.
			}
		}
	}
}

// Lock should be held.
func (n *raft) switchState(state RaftState) {
	if n.State() == Closed {
		return
	}

	// Reset the election timer.
	n.resetElectionTimeout()

	if n.State() == Leader && state != Leader {
		n.updateLeadChange(false)
		// Drain the response queue.
		n.resp.drain()
	} else if state == Leader && n.State() != Leader {
		if len(n.pae) > 0 {
			n.pae = make(map[uint64]*appendEntry)
		}
		n.updateLeadChange(true)
	}

	n.state.Store(int32(state))
	n.writeTermVote()
}

const (
	noLeader = _EMPTY_
	noVote   = _EMPTY_
)

func (n *raft) switchToFollower(leader string) {
	if n.State() == Closed {
		return
	}

	n.Lock()
	defer n.Unlock()

	n.debug("Switching to follower")

	n.lxfer = false
	n.updateLeader(leader)
	n.switchState(Follower)
}
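
// Illustrative sketch, not part of the upstream file: wonElection above simply
// compares the number of granted votes (including our own) with the quorum size
// kept on the raft struct. Assuming the usual Raft majority of qn = csz/2 + 1
// (an assumption; the field is maintained elsewhere in this file), a 5-node
// group needs 3 granted votes to win an election. The function name is
// hypothetical.
func electionQuorumExample() {
	const csz = 5   // hypothetical cluster size
	qn := csz/2 + 1 // assumed majority rule -> 3
	for votes := 1; votes <= csz; votes++ {
		fmt.Printf("votes=%d quorum=%d won=%v\n", votes, qn, votes >= qn)
	}
}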

func (n *raft) switchToCandidate() {
	if n.State() == Closed {
		return
	}

	n.Lock()
	defer n.Unlock()

	// If we are catching up (applies are paused) or are in observer mode, we cannot switch.
	if n.observer || n.paused {
		return
	}

	if n.State() != Candidate {
		n.debug("Switching to candidate")
	} else {
		if n.lostQuorumLocked() && time.Since(n.llqrt) > 20*time.Second {
			// We signal to the upper layers so that they can alert on lost quorum.
			n.updateLeadChange(false)
			n.llqrt = time.Now()
		}
	}
	// Increment the term.
	n.term++
	// Clear the current leader.
	n.updateLeader(noLeader)
	n.switchState(Candidate)
}

func (n *raft) switchToLeader() {
	if n.State() == Closed {
		return
	}

	n.Lock()

	n.debug("Switching to leader")

	var state StreamState
	n.wal.FastState(&state)

	// Check if we have items pending as we are taking over.
	sendHB := state.LastSeq > n.commit

	n.lxfer = false
	n.updateLeader(n.id)
	n.switchState(Leader)
	n.Unlock()

	if sendHB {
		n.sendHeartbeat()
	}
}
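
// Illustrative sketch, not part of the upstream file: because updateLeadChange
// drops unconsumed transitory values before pushing the latest one, a consumer
// of LeadChangeC() only ever observes the most recent leadership state. A
// hypothetical watcher might look like this:
func watchLeadChangeExample(n RaftNode, quit <-chan struct{}) {
	for {
		select {
		case isLeader := <-n.LeadChangeC():
			if isLeader {
				// Became leader: start any leader-only work here.
			} else {
				// Lost leadership (or lost quorum): stop leader-only work here.
			}
		case <-quit:
			return
		}
	}
}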