github.com/nats-io/nats-server/v2@v2.11.0-preview.2/server/raft.go (about) 1 // Copyright 2020-2023 The NATS Authors 2 // Licensed under the Apache License, Version 2.0 (the "License"); 3 // you may not use this file except in compliance with the License. 4 // You may obtain a copy of the License at 5 // 6 // http://www.apache.org/licenses/LICENSE-2.0 7 // 8 // Unless required by applicable law or agreed to in writing, software 9 // distributed under the License is distributed on an "AS IS" BASIS, 10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 // See the License for the specific language governing permissions and 12 // limitations under the License. 13 14 package server 15 16 import ( 17 "bytes" 18 "crypto/sha256" 19 "encoding/binary" 20 "errors" 21 "fmt" 22 "hash" 23 "math" 24 "math/rand" 25 "net" 26 "os" 27 "path/filepath" 28 "runtime" 29 "strings" 30 "sync" 31 "sync/atomic" 32 "time" 33 34 "github.com/nats-io/nats-server/v2/internal/fastrand" 35 36 "github.com/minio/highwayhash" 37 ) 38 39 type RaftNode interface { 40 Propose(entry []byte) error 41 ProposeDirect(entries []*Entry) error 42 ForwardProposal(entry []byte) error 43 InstallSnapshot(snap []byte) error 44 SendSnapshot(snap []byte) error 45 NeedSnapshot() bool 46 Applied(index uint64) (entries uint64, bytes uint64) 47 State() RaftState 48 Size() (entries, bytes uint64) 49 Progress() (index, commit, applied uint64) 50 Leader() bool 51 Quorum() bool 52 Current() bool 53 Healthy() bool 54 Term() uint64 55 GroupLeader() string 56 HadPreviousLeader() bool 57 StepDown(preferred ...string) error 58 SetObserver(isObserver bool) 59 IsObserver() bool 60 Campaign() error 61 ID() string 62 Group() string 63 Peers() []*Peer 64 UpdateKnownPeers(knownPeers []string) 65 ProposeAddPeer(peer string) error 66 ProposeRemovePeer(peer string) error 67 AdjustClusterSize(csz int) error 68 AdjustBootClusterSize(csz int) error 69 ClusterSize() int 70 ApplyQ() *ipQueue[*CommittedEntry] 71 PauseApply() error 72 ResumeApply() 73 LeadChangeC() <-chan bool 74 QuitC() <-chan struct{} 75 Created() time.Time 76 Stop() 77 Delete() 78 Wipe() 79 } 80 81 type WAL interface { 82 Type() StorageType 83 StoreMsg(subj string, hdr, msg []byte) (uint64, int64, error) 84 LoadMsg(index uint64, sm *StoreMsg) (*StoreMsg, error) 85 RemoveMsg(index uint64) (bool, error) 86 Compact(index uint64) (uint64, error) 87 Purge() (uint64, error) 88 Truncate(seq uint64) error 89 State() StreamState 90 FastState(*StreamState) 91 Stop() error 92 Delete() error 93 } 94 95 type Peer struct { 96 ID string 97 Current bool 98 Last time.Time 99 Lag uint64 100 } 101 102 type RaftState uint8 103 104 // Allowable states for a NATS Consensus Group. 105 const ( 106 Follower RaftState = iota 107 Leader 108 Candidate 109 Closed 110 ) 111 112 func (state RaftState) String() string { 113 switch state { 114 case Follower: 115 return "FOLLOWER" 116 case Candidate: 117 return "CANDIDATE" 118 case Leader: 119 return "LEADER" 120 case Closed: 121 return "CLOSED" 122 } 123 return "UNKNOWN" 124 } 125 126 type raft struct { 127 sync.RWMutex 128 129 created time.Time // Time that the group was created 130 accName string // Account name of the asset this raft group is for 131 group string // Raft group 132 sd string // Store directory 133 id string // Node ID 134 135 wal WAL // WAL store (filestore or memstore) 136 wtype StorageType // WAL type, e.g. 
FileStorage or MemoryStorage
	track    bool        // Whether we check JetStream storage resource limits (see outOfResources)
	werr     error       // Last write error

	state    atomic.Int32 // RaftState
	hh       hash.Hash64  // Highwayhash, used for snapshots
	snapfile string       // Snapshot filename

	csz   int             // Cluster size
	qn    int             // Number of nodes needed to establish quorum
	peers map[string]*lps // Other peers in the Raft group

	removed map[string]struct{}            // Peers that were removed from the group
	acks    map[uint64]map[string]struct{} // Append entry responses/acks, map of entry index -> peer ID
	pae     map[uint64]*appendEntry        // Pending append entries

	elect  *time.Timer // Election timer, normally accessed via electTimer
	active time.Time   // Last activity time, i.e. for heartbeats
	llqrt  time.Time   // Last quorum lost time
	lsut   time.Time   // Last scale-up time

	term     uint64 // The current vote term
	pterm    uint64 // Previous term from the last snapshot
	pindex   uint64 // Previous index from the last snapshot
	commit   uint64 // Sequence number of the most recent commit
	applied  uint64 // Sequence number of the most recently applied commit
	hcbehind bool   // Were we falling behind at the last health check? (see: isCurrent)

	leader string // The ID of the leader
	vote   string // Our current vote state
	lxfer  bool   // Are we doing a leadership transfer?

	s  *Server    // Reference to top-level server
	c  *client    // Internal client for subscriptions
	js *jetStream // JetStream, if running, to see if we are out of resources

	dflag    bool           // Debug flag
	pleader  bool           // Has the group ever had a leader?
	observer bool           // The node is observing, i.e. not participating in voting
	extSt    extensionState // Extension state

	psubj  string // Proposals subject
	rpsubj string // Remove peers subject
	vsubj  string // Vote requests subject
	vreply string // Vote responses subject
	asubj  string // Append entries subject
	areply string // Append entries responses subject

	sq    *sendq        // Send queue for outbound RPC messages
	aesub *subscription // Subscription for handleAppendEntry callbacks

	wtv []byte // Term and vote to be written
	wps []byte // Peer state to be written

	catchup  *catchupState               // For when we need to catch up as a follower.
	progress map[string]*ipQueue[uint64] // For leader or server catching up a follower.

	paused    bool   // Whether or not applies are paused
	hcommit   uint64 // The commit at the time that applies were paused
	pobserver bool   // Whether we were an observer at the time that applies were paused

	prop     *ipQueue[*Entry]               // Proposals
	entry    *ipQueue[*appendEntry]         // Append entries
	resp     *ipQueue[*appendEntryResponse] // Append entries responses
	apply    *ipQueue[*CommittedEntry]      // Apply queue (committed entries to be passed to upper layer)
	reqs     *ipQueue[*voteRequest]         // Vote requests
	votes    *ipQueue[*voteResponse]        // Vote responses
	stepdown *ipQueue[string]               // Stepdown requests
	leadc    chan bool                      // Leader changes
	quit     chan struct{}                  // Raft group shutdown
}

// catchupState holds our subscription, the catchup term and index,
// as well as the starting term and index and how many updates we have seen.
type catchupState struct {
	sub    *subscription // Subscription that catchup messages will arrive on
	cterm  uint64        // Catchup term
	cindex uint64        // Catchup index
	pterm  uint64        // Starting term
	pindex uint64        // Starting index
	active time.Time     // Last time we received a message for this catchup
}

// lps holds peer state of last time and last index replicated.
type lps struct {
	ts int64  // Last timestamp
	li uint64 // Last index replicated
	kp bool   // Known peer
}

const (
	minElectionTimeoutDefault      = 4 * time.Second
	maxElectionTimeoutDefault      = 9 * time.Second
	minCampaignTimeoutDefault      = 100 * time.Millisecond
	maxCampaignTimeoutDefault      = 8 * minCampaignTimeoutDefault
	hbIntervalDefault              = 1 * time.Second
	lostQuorumIntervalDefault      = hbIntervalDefault * 10 // 10 seconds
	lostQuorumCheckIntervalDefault = hbIntervalDefault * 10 // 10 seconds
)

var (
	minElectionTimeout = minElectionTimeoutDefault
	maxElectionTimeout = maxElectionTimeoutDefault
	minCampaignTimeout = minCampaignTimeoutDefault
	maxCampaignTimeout = maxCampaignTimeoutDefault
	hbInterval         = hbIntervalDefault
	lostQuorumInterval = lostQuorumIntervalDefault
	lostQuorumCheck    = lostQuorumCheckIntervalDefault
)

type RaftConfig struct {
	Name     string
	Store    string
	Log      WAL
	Track    bool
	Observer bool
}

var (
	errNotLeader         = errors.New("raft: not leader")
	errAlreadyLeader     = errors.New("raft: already leader")
	errNilCfg            = errors.New("raft: no config given")
	errCorruptPeers      = errors.New("raft: corrupt peer state")
	errEntryLoadFailed   = errors.New("raft: could not load entry from WAL")
	errEntryStoreFailed  = errors.New("raft: could not store entry to WAL")
	errNodeClosed        = errors.New("raft: node is closed")
	errBadSnapName       = errors.New("raft: snapshot name could not be parsed")
	errNoSnapAvailable   = errors.New("raft: no snapshot available")
	errCatchupsRunning   = errors.New("raft: snapshot can not be installed while catchups running")
	errSnapshotCorrupt   = errors.New("raft: snapshot corrupt")
	errTooManyPrefs      = errors.New("raft: stepdown requires at most one preferred new leader")
	errNoPeerState       = errors.New("raft: no peerstate")
	errAdjustBootCluster = errors.New("raft: can not adjust boot peer size on established group")
	errLeaderLen         = fmt.Errorf("raft: leader should be exactly %d bytes", idLen)
	errTooManyEntries    = errors.New("raft: append entry can contain a max of 64k entries")
	errBadAppendEntry    = errors.New("raft: append entry corrupt")
)

// This will bootstrap a raftNode by writing its config into the store directory.
func (s *Server) bootstrapRaftNode(cfg *RaftConfig, knownPeers []string, allPeersKnown bool) error {
	if cfg == nil {
		return errNilCfg
	}
	// Check validity of peers if presented.
	for _, p := range knownPeers {
		if len(p) != idLen {
			return fmt.Errorf("raft: illegal peer: %q", p)
		}
	}
	expected := len(knownPeers)
	// We need to adjust this if not all peers are known.
	if !allPeersKnown {
		s.Debugf("Determining expected peer size for JetStream meta group")
		if expected < 2 {
			expected = 2
		}
		opts := s.getOpts()
		nrs := len(opts.Routes)

		cn := s.ClusterName()
		ngwps := 0
		for _, gw := range opts.Gateway.Gateways {
			// Ignore our own cluster if specified.
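			// -----------------------------------------------------------------
			// Aside (illustrative, not part of the original source; the numbers
			// are made up): with one known peer, allPeersKnown == false, four
			// configured routes and one remote gateway whose URL resolves to two
			// addresses, the estimate works out as:
			//
			//	expected = len(knownPeers)  // 1
			//	expected = 2                // floored to the bootstrap minimum
			//	nrs      = len(opts.Routes) // 4
			//	ngwps    = 2                // remote gateway addresses
			//	expected = nrs + ngwps      // 6, since 6 > 2
			//
			// The loop below accumulates ngwps, skipping our own cluster.
			// -----------------------------------------------------------------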
299 if gw.Name == cn { 300 continue 301 } 302 for _, u := range gw.URLs { 303 host := u.Hostname() 304 // If this is an IP just add one. 305 if net.ParseIP(host) != nil { 306 ngwps++ 307 } else { 308 addrs, _ := net.LookupHost(host) 309 ngwps += len(addrs) 310 } 311 } 312 } 313 314 if expected < nrs+ngwps { 315 expected = nrs + ngwps 316 s.Debugf("Adjusting expected peer set size to %d with %d known", expected, len(knownPeers)) 317 } 318 } 319 320 // Check the store directory. If we have a memory based WAL we need to make sure the directory is setup. 321 if stat, err := os.Stat(cfg.Store); os.IsNotExist(err) { 322 if err := os.MkdirAll(cfg.Store, 0750); err != nil { 323 return fmt.Errorf("raft: could not create storage directory - %v", err) 324 } 325 } else if stat == nil || !stat.IsDir() { 326 return fmt.Errorf("raft: storage directory is not a directory") 327 } 328 tmpfile, err := os.CreateTemp(cfg.Store, "_test_") 329 if err != nil { 330 return fmt.Errorf("raft: storage directory is not writable") 331 } 332 tmpfile.Close() 333 os.Remove(tmpfile.Name()) 334 335 return writePeerState(cfg.Store, &peerState{knownPeers, expected, extUndetermined}) 336 } 337 338 // startRaftNode will start the raft node. 339 func (s *Server) startRaftNode(accName string, cfg *RaftConfig, labels pprofLabels) (RaftNode, error) { 340 if cfg == nil { 341 return nil, errNilCfg 342 } 343 s.mu.RLock() 344 if s.sys == nil { 345 s.mu.RUnlock() 346 return nil, ErrNoSysAccount 347 } 348 sq := s.sys.sq 349 sacc := s.sys.account 350 hash := s.sys.shash 351 s.mu.RUnlock() 352 353 // Do this here to process error quicker. 354 ps, err := readPeerState(cfg.Store) 355 if err != nil { 356 return nil, err 357 } 358 if ps == nil { 359 return nil, errNoPeerState 360 } 361 362 qpfx := fmt.Sprintf("[ACC:%s] RAFT '%s' ", accName, cfg.Name) 363 n := &raft{ 364 created: time.Now(), 365 id: hash[:idLen], 366 group: cfg.Name, 367 sd: cfg.Store, 368 wal: cfg.Log, 369 wtype: cfg.Log.Type(), 370 track: cfg.Track, 371 csz: ps.clusterSize, 372 qn: ps.clusterSize/2 + 1, 373 peers: make(map[string]*lps), 374 acks: make(map[uint64]map[string]struct{}), 375 pae: make(map[uint64]*appendEntry), 376 s: s, 377 c: s.createInternalSystemClient(), 378 js: s.getJetStream(), 379 sq: sq, 380 quit: make(chan struct{}), 381 reqs: newIPQueue[*voteRequest](s, qpfx+"vreq"), 382 votes: newIPQueue[*voteResponse](s, qpfx+"vresp"), 383 prop: newIPQueue[*Entry](s, qpfx+"entry"), 384 entry: newIPQueue[*appendEntry](s, qpfx+"appendEntry"), 385 resp: newIPQueue[*appendEntryResponse](s, qpfx+"appendEntryResponse"), 386 apply: newIPQueue[*CommittedEntry](s, qpfx+"committedEntry"), 387 stepdown: newIPQueue[string](s, qpfx+"stepdown"), 388 accName: accName, 389 leadc: make(chan bool, 1), 390 observer: cfg.Observer, 391 extSt: ps.domainExt, 392 } 393 n.c.registerWithAccount(sacc) 394 395 if atomic.LoadInt32(&s.logging.debug) > 0 { 396 n.dflag = true 397 } 398 399 // Set up the highwayhash for the snapshots. 400 key := sha256.Sum256([]byte(n.group)) 401 n.hh, _ = highwayhash.New64(key[:]) 402 403 // If we have a term and vote file (tav.idx on the filesystem) then read in 404 // what we think the term and vote was. It's possible these are out of date 405 // so a catch-up may be required. 406 if term, vote, err := n.readTermVote(); err == nil && term > 0 { 407 n.term = term 408 n.vote = vote 409 } 410 411 // Make sure that the snapshots directory exists. 
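	// -----------------------------------------------------------------
	// Aside (illustrative, not part of the original source): by this point the
	// node's durable state consists of:
	//
	//	- the peer state written into cfg.Store by writePeerState at bootstrap
	//	  and read back via readPeerState above,
	//	- the term and vote state ("tav.idx") consulted by readTermVote above,
	//	- the WAL supplied as cfg.Log (filestore or memstore), and
	//	- the snapshots directory created just below, holding files named
	//	  "snap.<term>.<index>".
	//
	// -----------------------------------------------------------------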
412 if err := os.MkdirAll(filepath.Join(n.sd, snapshotsDir), 0750); err != nil { 413 return nil, fmt.Errorf("could not create snapshots directory - %v", err) 414 } 415 416 // Can't recover snapshots if memory based. 417 if _, ok := n.wal.(*memStore); ok { 418 os.Remove(filepath.Join(n.sd, snapshotsDir, "*")) 419 } else { 420 // See if we have any snapshots and if so load and process on startup. 421 n.setupLastSnapshot() 422 } 423 424 truncateAndErr := func(index uint64) { 425 if err := n.wal.Truncate(index); err != nil { 426 n.setWriteErr(err) 427 } 428 } 429 430 // Retrieve the stream state from the WAL. If there are pending append 431 // entries that were committed but not applied before we last shut down, 432 // we will try to replay them and process them here. 433 var state StreamState 434 n.wal.FastState(&state) 435 if state.Msgs > 0 { 436 n.debug("Replaying state of %d entries", state.Msgs) 437 if first, err := n.loadFirstEntry(); err == nil { 438 n.pterm, n.pindex = first.pterm, first.pindex 439 if first.commit > 0 && first.commit > n.commit { 440 n.commit = first.commit 441 } 442 } 443 444 // This process will queue up entries on our applied queue but prior to the upper 445 // state machine running. So we will monitor how much we have queued and if we 446 // reach a limit will pause the apply queue and resume inside of run() go routine. 447 const maxQsz = 32 * 1024 * 1024 // 32MB max 448 449 // It looks like there are entries we have committed but not applied 450 // yet. Replay them. 451 for index, qsz := state.FirstSeq, 0; index <= state.LastSeq; index++ { 452 ae, err := n.loadEntry(index) 453 if err != nil { 454 n.warn("Could not load %d from WAL [%+v]: %v", index, state, err) 455 truncateAndErr(index) 456 break 457 } 458 if ae.pindex != index-1 { 459 n.warn("Corrupt WAL, will truncate") 460 truncateAndErr(index) 461 break 462 } 463 n.processAppendEntry(ae, nil) 464 // Check how much we have queued up so far to determine if we should pause. 465 for _, e := range ae.entries { 466 qsz += len(e.Data) 467 if qsz > maxQsz && !n.paused { 468 n.PauseApply() 469 } 470 } 471 } 472 } 473 474 // Make sure to track ourselves. 475 n.peers[n.id] = &lps{time.Now().UnixNano(), 0, true} 476 477 // Track known peers 478 for _, peer := range ps.knownPeers { 479 if peer != n.id { 480 // Set these to 0 to start but mark as known peer. 481 n.peers[peer] = &lps{0, 0, true} 482 } 483 } 484 485 // Setup our internal subscriptions for proposals, votes and append entries. 486 // If we fail to do this for some reason then this is fatal — we cannot 487 // continue setting up or the Raft node may be partially/totally isolated. 488 if err := n.createInternalSubs(); err != nil { 489 n.shutdown(true) 490 return nil, err 491 } 492 493 n.debug("Started") 494 495 // Check if we need to start in observer mode due to lame duck status. 496 // This will stop us from taking on the leader role when we're about to 497 // shutdown anyway. 498 if s.isLameDuckMode() { 499 n.debug("Will start in observer mode due to lame duck status") 500 n.SetObserver(true) 501 } 502 503 // Set the election timer and lost quorum timers to now, so that we 504 // won't accidentally trigger either state without knowing the real state 505 // of the other nodes. 506 n.Lock() 507 n.resetElectionTimeout() 508 n.llqrt = time.Now() 509 n.Unlock() 510 511 // Register the Raft group. 512 labels["group"] = n.group 513 s.registerRaftNode(n.group, n) 514 515 // Start the run goroutine for the Raft state machine. 
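	// -----------------------------------------------------------------
	// Aside (illustrative sketch, not part of the original source): once run()
	// is going, an upper layer typically drains ApplyQ() and acknowledges with
	// Applied(). applyEntry and node are hypothetical names here:
	//
	//	for {
	//		select {
	//		case <-node.ApplyQ().ch:
	//			ces := node.ApplyQ().pop()
	//			for _, ce := range ces {
	//				if ce == nil {
	//					continue // nil marks the end of replay/restore
	//				}
	//				applyEntry(ce)
	//				node.Applied(ce.Index)
	//				ce.ReturnToPool()
	//			}
	//			node.ApplyQ().recycle(&ces)
	//		case <-node.QuitC():
	//			return
	//		}
	//	}
	//
	// The run goroutine started below drives the follower/candidate/leader loops.
	// -----------------------------------------------------------------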
516 s.startGoRoutine(n.run, labels) 517 518 return n, nil 519 } 520 521 // outOfResources checks to see if we are out of resources. 522 func (n *raft) outOfResources() bool { 523 js := n.js 524 if !n.track || js == nil { 525 return false 526 } 527 return js.limitsExceeded(n.wtype) 528 } 529 530 // Maps node names back to server names. 531 func (s *Server) serverNameForNode(node string) string { 532 if si, ok := s.nodeToInfo.Load(node); ok && si != nil { 533 return si.(nodeInfo).name 534 } 535 return _EMPTY_ 536 } 537 538 // Maps node names back to cluster names. 539 func (s *Server) clusterNameForNode(node string) string { 540 if si, ok := s.nodeToInfo.Load(node); ok && si != nil { 541 return si.(nodeInfo).cluster 542 } 543 return _EMPTY_ 544 } 545 546 // Registers the Raft node with the server, as it will track all of the Raft 547 // nodes. 548 func (s *Server) registerRaftNode(group string, n RaftNode) { 549 s.rnMu.Lock() 550 defer s.rnMu.Unlock() 551 if s.raftNodes == nil { 552 s.raftNodes = make(map[string]RaftNode) 553 } 554 s.raftNodes[group] = n 555 } 556 557 // Unregisters the Raft node from the server, i.e. at shutdown. 558 func (s *Server) unregisterRaftNode(group string) { 559 s.rnMu.Lock() 560 defer s.rnMu.Unlock() 561 if s.raftNodes != nil { 562 delete(s.raftNodes, group) 563 } 564 } 565 566 // Returns how many Raft nodes are running in this server instance. 567 func (s *Server) numRaftNodes() int { 568 s.rnMu.Lock() 569 defer s.rnMu.Unlock() 570 return len(s.raftNodes) 571 } 572 573 // Finds the Raft node for a given Raft group, if any. If there is no Raft node 574 // running for this group then it can return nil. 575 func (s *Server) lookupRaftNode(group string) RaftNode { 576 s.rnMu.RLock() 577 defer s.rnMu.RUnlock() 578 var n RaftNode 579 if s.raftNodes != nil { 580 n = s.raftNodes[group] 581 } 582 return n 583 } 584 585 // Reloads the debug state for all running Raft nodes. This is necessary when 586 // the configuration has been reloaded and the debug log level has changed. 587 func (s *Server) reloadDebugRaftNodes(debug bool) { 588 if s == nil { 589 return 590 } 591 s.rnMu.RLock() 592 for _, ni := range s.raftNodes { 593 n := ni.(*raft) 594 n.Lock() 595 n.dflag = debug 596 n.Unlock() 597 } 598 s.rnMu.RUnlock() 599 } 600 601 // Requests that all Raft nodes on this server step down and place them into 602 // observer mode. This is called when the server is shutting down. 603 func (s *Server) stepdownRaftNodes() { 604 if s == nil { 605 return 606 } 607 s.rnMu.RLock() 608 if len(s.raftNodes) == 0 { 609 s.rnMu.RUnlock() 610 return 611 } 612 s.Debugf("Stepping down all leader raft nodes") 613 nodes := make([]RaftNode, 0, len(s.raftNodes)) 614 for _, n := range s.raftNodes { 615 nodes = append(nodes, n) 616 } 617 s.rnMu.RUnlock() 618 619 for _, node := range nodes { 620 if node.Leader() { 621 node.StepDown() 622 } 623 node.SetObserver(true) 624 } 625 } 626 627 // Shuts down all Raft nodes on this server. This is called either when the 628 // server is either entering lame duck mode, shutting down or when JetStream 629 // has been disabled. 
630 func (s *Server) shutdownRaftNodes() { 631 if s == nil { 632 return 633 } 634 s.rnMu.RLock() 635 if len(s.raftNodes) == 0 { 636 s.rnMu.RUnlock() 637 return 638 } 639 nodes := make([]RaftNode, 0, len(s.raftNodes)) 640 s.Debugf("Shutting down all raft nodes") 641 for _, n := range s.raftNodes { 642 nodes = append(nodes, n) 643 } 644 s.rnMu.RUnlock() 645 646 for _, node := range nodes { 647 node.Stop() 648 } 649 } 650 651 // Used in lameduck mode to move off the leaders. 652 // We also put all nodes in observer mode so new leaders 653 // can not be placed on this server. 654 func (s *Server) transferRaftLeaders() bool { 655 if s == nil { 656 return false 657 } 658 s.rnMu.RLock() 659 if len(s.raftNodes) == 0 { 660 s.rnMu.RUnlock() 661 return false 662 } 663 nodes := make([]RaftNode, 0, len(s.raftNodes)) 664 for _, n := range s.raftNodes { 665 nodes = append(nodes, n) 666 } 667 s.rnMu.RUnlock() 668 669 var didTransfer bool 670 for _, node := range nodes { 671 if node.Leader() { 672 node.StepDown() 673 didTransfer = true 674 } 675 node.SetObserver(true) 676 } 677 return didTransfer 678 } 679 680 // Formal API 681 682 // Propose will propose a new entry to the group. 683 // This should only be called on the leader. 684 func (n *raft) Propose(data []byte) error { 685 if state := n.State(); state != Leader { 686 n.debug("Proposal ignored, not leader (state: %v)", state) 687 return errNotLeader 688 } 689 n.RLock() 690 // Error if we had a previous write error. 691 if werr := n.werr; werr != nil { 692 n.RUnlock() 693 return werr 694 } 695 prop := n.prop 696 n.RUnlock() 697 698 prop.push(newEntry(EntryNormal, data)) 699 return nil 700 } 701 702 // ProposeDirect will propose entries directly by skipping the Raft state 703 // machine and sending them straight to the wire instead. 704 // This should only be called on the leader. 705 func (n *raft) ProposeDirect(entries []*Entry) error { 706 if state := n.State(); state != Leader { 707 n.debug("Direct proposal ignored, not leader (state: %v)", state) 708 return errNotLeader 709 } 710 n.RLock() 711 // Error if we had a previous write error. 712 if werr := n.werr; werr != nil { 713 n.RUnlock() 714 return werr 715 } 716 n.RUnlock() 717 718 n.sendAppendEntry(entries) 719 return nil 720 } 721 722 // ForwardProposal will forward the proposal to the leader if known. 723 // If we are the leader this is the same as calling propose. 724 // FIXME(dlc) - We could have a reply subject and wait for a response 725 // for retries, but would need to not block and be in separate Go routine. 726 func (n *raft) ForwardProposal(entry []byte) error { 727 if n.Leader() { 728 return n.Propose(entry) 729 } 730 731 n.sendRPC(n.psubj, _EMPTY_, entry) 732 return nil 733 } 734 735 // ProposeAddPeer is called to add a peer to the group. 736 func (n *raft) ProposeAddPeer(peer string) error { 737 if n.State() != Leader { 738 return errNotLeader 739 } 740 n.RLock() 741 // Error if we had a previous write error. 742 if werr := n.werr; werr != nil { 743 n.RUnlock() 744 return werr 745 } 746 prop := n.prop 747 n.RUnlock() 748 749 prop.push(newEntry(EntryAddPeer, []byte(peer))) 750 return nil 751 } 752 753 // As a leader if we are proposing to remove a peer assume its already gone. 
754 func (n *raft) doRemovePeerAsLeader(peer string) { 755 n.Lock() 756 if n.removed == nil { 757 n.removed = map[string]struct{}{} 758 } 759 n.removed[peer] = struct{}{} 760 if _, ok := n.peers[peer]; ok { 761 delete(n.peers, peer) 762 // We should decrease our cluster size since we are tracking this peer and the peer is most likely already gone. 763 n.adjustClusterSizeAndQuorum() 764 } 765 n.Unlock() 766 } 767 768 // ProposeRemovePeer is called to remove a peer from the group. 769 func (n *raft) ProposeRemovePeer(peer string) error { 770 n.RLock() 771 prop, subj := n.prop, n.rpsubj 772 isLeader := n.State() == Leader 773 werr := n.werr 774 n.RUnlock() 775 776 // Error if we had a previous write error. 777 if werr != nil { 778 return werr 779 } 780 781 // If we are the leader then we are responsible for processing the 782 // peer remove and then notifying the rest of the group that the 783 // peer was removed. 784 if isLeader { 785 prop.push(newEntry(EntryRemovePeer, []byte(peer))) 786 n.doRemovePeerAsLeader(peer) 787 return nil 788 } 789 790 // Otherwise we need to forward the proposal to the leader. 791 n.sendRPC(subj, _EMPTY_, []byte(peer)) 792 return nil 793 } 794 795 // ClusterSize reports back the total cluster size. 796 // This effects quorum etc. 797 func (n *raft) ClusterSize() int { 798 n.Lock() 799 defer n.Unlock() 800 return n.csz 801 } 802 803 // AdjustBootClusterSize can be called to adjust the boot cluster size. 804 // Will error if called on a group with a leader or a previous leader. 805 // This can be helpful in mixed mode. 806 func (n *raft) AdjustBootClusterSize(csz int) error { 807 n.Lock() 808 defer n.Unlock() 809 810 if n.leader != noLeader || n.pleader { 811 return errAdjustBootCluster 812 } 813 // Same floor as bootstrap. 814 if csz < 2 { 815 csz = 2 816 } 817 // Adjust the cluster size and the number of nodes needed to establish 818 // a quorum. 819 n.csz = csz 820 n.qn = n.csz/2 + 1 821 822 return nil 823 } 824 825 // AdjustClusterSize will change the cluster set size. 826 // Must be the leader. 827 func (n *raft) AdjustClusterSize(csz int) error { 828 if n.State() != Leader { 829 return errNotLeader 830 } 831 n.Lock() 832 // Same floor as bootstrap. 833 if csz < 2 { 834 csz = 2 835 } 836 837 // Adjust the cluster size and the number of nodes needed to establish 838 // a quorum. 839 n.csz = csz 840 n.qn = n.csz/2 + 1 841 n.Unlock() 842 843 n.sendPeerState() 844 return nil 845 } 846 847 // PauseApply will allow us to pause processing of append entries onto our 848 // external apply queue. In effect this means that the upper layer will no longer 849 // receive any new entries from the Raft group. 850 func (n *raft) PauseApply() error { 851 if n.State() == Leader { 852 return errAlreadyLeader 853 } 854 855 n.Lock() 856 defer n.Unlock() 857 858 // If we are currently a candidate make sure we step down. 859 if n.State() == Candidate { 860 n.stepdown.push(noLeader) 861 } 862 863 n.debug("Pausing our apply channel") 864 n.paused = true 865 n.hcommit = n.commit 866 // Also prevent us from trying to become a leader while paused and catching up. 867 n.pobserver, n.observer = n.observer, true 868 n.resetElect(48 * time.Hour) 869 870 return nil 871 } 872 873 // ResumeApply will resume sending applies to the external apply queue. This 874 // means that we will start sending new entries to the upper layer. 
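// For example (an illustrative sketch, not part of the original source), a
// caller that needs to install upper-layer state it received out of band can
// bracket that work so no new commits are delivered while it runs:
//
//	if err := n.PauseApply(); err == nil {
//		// ... install/replace upper-layer state here ...
//		n.ResumeApply()
//	}
//
// PauseApply returns errAlreadyLeader when called on the leader, so this
// bracket only applies on followers.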
875 func (n *raft) ResumeApply() { 876 n.Lock() 877 defer n.Unlock() 878 879 if !n.paused { 880 return 881 } 882 883 n.debug("Resuming our apply channel") 884 n.observer, n.pobserver = n.pobserver, false 885 n.paused = false 886 // Run catchup.. 887 if n.hcommit > n.commit { 888 n.debug("Resuming %d replays", n.hcommit+1-n.commit) 889 for index := n.commit + 1; index <= n.hcommit; index++ { 890 if err := n.applyCommit(index); err != nil { 891 n.warn("Got error on apply commit during replay: %v", err) 892 break 893 } 894 // We want to unlock here to allow the upper layers to call Applied() without blocking. 895 n.Unlock() 896 // Give hint to let other Go routines run. 897 // Might not be necessary but seems to make it more fine grained interleaving. 898 runtime.Gosched() 899 // Simply re-acquire 900 n.Lock() 901 // Need to check if we got closed or if we were paused again. 902 if n.State() == Closed || n.paused { 903 return 904 } 905 } 906 } 907 n.hcommit = 0 908 909 // If we had been selected to be the next leader campaign here now that we have resumed. 910 if n.lxfer { 911 n.xferCampaign() 912 } else { 913 n.resetElectionTimeout() 914 } 915 } 916 917 // Applied is a callback that must be called by the upper layer when it 918 // has successfully applied the committed entries that it received from the 919 // apply queue. It will return the number of entries and an estimation of the 920 // byte size that could be removed with a snapshot/compact. 921 func (n *raft) Applied(index uint64) (entries uint64, bytes uint64) { 922 n.Lock() 923 defer n.Unlock() 924 925 // Ignore if not applicable. This can happen during a reset. 926 if index > n.commit { 927 return 0, 0 928 } 929 930 // Ignore if already applied. 931 if index > n.applied { 932 n.applied = index 933 } 934 935 // Calculate the number of entries and estimate the byte size that 936 // we can now remove with a compaction/snapshot. 937 var state StreamState 938 n.wal.FastState(&state) 939 if n.applied > state.FirstSeq { 940 entries = n.applied - state.FirstSeq 941 } 942 if state.Msgs > 0 { 943 bytes = entries * state.Bytes / state.Msgs 944 } 945 return entries, bytes 946 } 947 948 // For capturing data needed by snapshot. 949 type snapshot struct { 950 lastTerm uint64 951 lastIndex uint64 952 peerstate []byte 953 data []byte 954 } 955 956 const minSnapshotLen = 28 957 958 // Encodes a snapshot into a buffer for storage. 959 // Lock should be held. 960 func (n *raft) encodeSnapshot(snap *snapshot) []byte { 961 if snap == nil { 962 return nil 963 } 964 var le = binary.LittleEndian 965 buf := make([]byte, minSnapshotLen+len(snap.peerstate)+len(snap.data)) 966 le.PutUint64(buf[0:], snap.lastTerm) 967 le.PutUint64(buf[8:], snap.lastIndex) 968 // Peer state 969 le.PutUint32(buf[16:], uint32(len(snap.peerstate))) 970 wi := 20 971 copy(buf[wi:], snap.peerstate) 972 wi += len(snap.peerstate) 973 // data itself. 974 copy(buf[wi:], snap.data) 975 wi += len(snap.data) 976 977 // Now do the hash for the end. 978 n.hh.Reset() 979 n.hh.Write(buf[:wi]) 980 checksum := n.hh.Sum(nil) 981 copy(buf[wi:], checksum) 982 wi += len(checksum) 983 return buf[:wi] 984 } 985 986 // SendSnapshot will send the latest snapshot as a normal AE. 987 // Should only be used when the upper layers know this is most recent. 988 // Used when restoring streams, moving a stream from R1 to R>1, etc. 
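// For example (an illustrative sketch, not part of the original source; the
// encoder is hypothetical), an upper layer that has just rebuilt its complete
// state might push it to followers as a single snapshot entry:
//
//	data := encodeStreamState() // hypothetical upper-layer encoder
//	if err := n.SendSnapshot(data); err != nil {
//		// handle error
//	}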
989 func (n *raft) SendSnapshot(data []byte) error { 990 n.sendAppendEntry([]*Entry{{EntrySnapshot, data}}) 991 return nil 992 } 993 994 // Used to install a snapshot for the given term and applied index. This will release 995 // all of the log entries up to and including index. This should not be called with 996 // entries that have been applied to the FSM but have not been applied to the raft state. 997 func (n *raft) InstallSnapshot(data []byte) error { 998 if n.State() == Closed { 999 return errNodeClosed 1000 } 1001 1002 n.Lock() 1003 1004 // If a write error has occurred already then stop here. 1005 if werr := n.werr; werr != nil { 1006 n.Unlock() 1007 return werr 1008 } 1009 1010 // Check that a catchup isn't already taking place. If it is then we won't 1011 // allow installing snapshots until it is done. 1012 if len(n.progress) > 0 { 1013 n.Unlock() 1014 return errCatchupsRunning 1015 } 1016 1017 var state StreamState 1018 n.wal.FastState(&state) 1019 1020 if n.applied == 0 { 1021 n.Unlock() 1022 return errNoSnapAvailable 1023 } 1024 1025 n.debug("Installing snapshot of %d bytes", len(data)) 1026 1027 var term uint64 1028 if ae, _ := n.loadEntry(n.applied); ae != nil { 1029 // Use the term from the most recently applied entry if possible. 1030 term = ae.term 1031 } else if ae, _ = n.loadFirstEntry(); ae != nil { 1032 // Otherwise see if we can find the term from the first entry. 1033 term = ae.term 1034 } else { 1035 // Last resort is to use the last pterm that we knew of. 1036 term = n.pterm 1037 } 1038 1039 snap := &snapshot{ 1040 lastTerm: term, 1041 lastIndex: n.applied, 1042 peerstate: encodePeerState(&peerState{n.peerNames(), n.csz, n.extSt}), 1043 data: data, 1044 } 1045 1046 snapDir := filepath.Join(n.sd, snapshotsDir) 1047 sn := fmt.Sprintf(snapFileT, snap.lastTerm, snap.lastIndex) 1048 sfile := filepath.Join(snapDir, sn) 1049 1050 <-dios 1051 err := os.WriteFile(sfile, n.encodeSnapshot(snap), defaultFilePerms) 1052 dios <- struct{}{} 1053 1054 if err != nil { 1055 n.Unlock() 1056 // We could set write err here, but if this is a temporary situation, too many open files etc. 1057 // we want to retry and snapshots are not fatal. 1058 return err 1059 } 1060 1061 // Remember our latest snapshot file. 1062 n.snapfile = sfile 1063 if _, err := n.wal.Compact(snap.lastIndex + 1); err != nil { 1064 n.setWriteErrLocked(err) 1065 n.Unlock() 1066 return err 1067 } 1068 n.Unlock() 1069 1070 psnaps, _ := os.ReadDir(snapDir) 1071 // Remove any old snapshots. 1072 for _, fi := range psnaps { 1073 pn := fi.Name() 1074 if pn != sn { 1075 os.Remove(filepath.Join(snapDir, pn)) 1076 } 1077 } 1078 1079 return nil 1080 } 1081 1082 // NeedSnapshot returns true if it is necessary to try to install a snapshot, i.e. 1083 // after we have finished recovering/replaying at startup, on a regular interval or 1084 // as a part of cleaning up when shutting down. 1085 func (n *raft) NeedSnapshot() bool { 1086 n.RLock() 1087 defer n.RUnlock() 1088 return n.snapfile == _EMPTY_ && n.applied > 1 1089 } 1090 1091 const ( 1092 snapshotsDir = "snapshots" 1093 snapFileT = "snap.%d.%d" 1094 ) 1095 1096 // termAndIndexFromSnapfile tries to load the snapshot file and returns the term 1097 // and index from that snapshot. 
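// For example, a file named "snap.5.1024" (following the snapFileT pattern
// "snap.%d.%d") parses to term 5 and index 1024; anything that does not match
// the pattern yields errBadSnapName. (Illustrative note, not part of the
// original source.)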
1098 func termAndIndexFromSnapFile(sn string) (term, index uint64, err error) { 1099 if sn == _EMPTY_ { 1100 return 0, 0, errBadSnapName 1101 } 1102 fn := filepath.Base(sn) 1103 if n, err := fmt.Sscanf(fn, snapFileT, &term, &index); err != nil || n != 2 { 1104 return 0, 0, errBadSnapName 1105 } 1106 return term, index, nil 1107 } 1108 1109 // setupLastSnapshot is called at startup to try and recover the last snapshot from 1110 // the disk if possible. We will try to recover the term, index and commit/applied 1111 // indices and then notify the upper layer what we found. Compacts the WAL if needed. 1112 func (n *raft) setupLastSnapshot() { 1113 snapDir := filepath.Join(n.sd, snapshotsDir) 1114 psnaps, err := os.ReadDir(snapDir) 1115 if err != nil { 1116 return 1117 } 1118 1119 var lterm, lindex uint64 1120 var latest string 1121 for _, sf := range psnaps { 1122 sfile := filepath.Join(snapDir, sf.Name()) 1123 var term, index uint64 1124 term, index, err := termAndIndexFromSnapFile(sf.Name()) 1125 if err == nil { 1126 if term > lterm { 1127 lterm, lindex = term, index 1128 latest = sfile 1129 } else if term == lterm && index > lindex { 1130 lindex = index 1131 latest = sfile 1132 } 1133 } else { 1134 // Clean this up, can't parse the name. 1135 // TODO(dlc) - We could read in and check actual contents. 1136 n.debug("Removing snapshot, can't parse name: %q", sf.Name()) 1137 os.Remove(sfile) 1138 } 1139 } 1140 1141 // Now cleanup any old entries 1142 for _, sf := range psnaps { 1143 sfile := filepath.Join(snapDir, sf.Name()) 1144 if sfile != latest { 1145 n.debug("Removing old snapshot: %q", sfile) 1146 os.Remove(sfile) 1147 } 1148 } 1149 1150 if latest == _EMPTY_ { 1151 return 1152 } 1153 1154 // Set latest snapshot we have. 1155 n.Lock() 1156 defer n.Unlock() 1157 1158 n.snapfile = latest 1159 snap, err := n.loadLastSnapshot() 1160 if err != nil { 1161 // We failed to recover the last snapshot for some reason, so we will 1162 // assume it has been corrupted and will try to delete it. 1163 if n.snapfile != _EMPTY_ { 1164 os.Remove(n.snapfile) 1165 n.snapfile = _EMPTY_ 1166 } 1167 return 1168 } 1169 1170 // We successfully recovered the last snapshot from the disk. 1171 // Recover state from the snapshot and then notify the upper layer. 1172 // Compact the WAL when we're done if needed. 1173 n.pindex = snap.lastIndex 1174 n.pterm = snap.lastTerm 1175 n.commit = snap.lastIndex 1176 n.applied = snap.lastIndex 1177 n.apply.push(newCommittedEntry(n.commit, []*Entry{{EntrySnapshot, snap.data}})) 1178 if _, err := n.wal.Compact(snap.lastIndex + 1); err != nil { 1179 n.setWriteErrLocked(err) 1180 } 1181 } 1182 1183 // loadLastSnapshot will load and return our last snapshot. 1184 // Lock should be held. 1185 func (n *raft) loadLastSnapshot() (*snapshot, error) { 1186 if n.snapfile == _EMPTY_ { 1187 return nil, errNoSnapAvailable 1188 } 1189 1190 <-dios 1191 buf, err := os.ReadFile(n.snapfile) 1192 dios <- struct{}{} 1193 1194 if err != nil { 1195 n.warn("Error reading snapshot: %v", err) 1196 os.Remove(n.snapfile) 1197 n.snapfile = _EMPTY_ 1198 return nil, err 1199 } 1200 if len(buf) < minSnapshotLen { 1201 n.warn("Snapshot corrupt, too short") 1202 os.Remove(n.snapfile) 1203 n.snapfile = _EMPTY_ 1204 return nil, errSnapshotCorrupt 1205 } 1206 1207 // Check to make sure hash is consistent. 
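	// -----------------------------------------------------------------
	// Aside (layout reference, not part of the original source): the buffer
	// decoded below was produced by encodeSnapshot above, so it is laid out as:
	//
	//	offset 0                 : lastTerm       (uint64, little endian)
	//	offset 8                 : lastIndex      (uint64, little endian)
	//	offset 16                : len(peerstate) (uint32, little endian)
	//	offset 20                : peerstate      (variable)
	//	offset 20+len(peerstate) : data           (variable)
	//	final 8 bytes            : highwayhash-64 checksum of everything before it
	//
	// minSnapshotLen (28) is the smallest valid size: the 20 header bytes plus
	// the 8-byte checksum, with empty peerstate and data. The trailing checksum
	// is why hoff below is len(buf) - 8.
	// -----------------------------------------------------------------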
1208 hoff := len(buf) - 8 1209 lchk := buf[hoff:] 1210 n.hh.Reset() 1211 n.hh.Write(buf[:hoff]) 1212 if !bytes.Equal(lchk[:], n.hh.Sum(nil)) { 1213 n.warn("Snapshot corrupt, checksums did not match") 1214 os.Remove(n.snapfile) 1215 n.snapfile = _EMPTY_ 1216 return nil, errSnapshotCorrupt 1217 } 1218 1219 var le = binary.LittleEndian 1220 lps := le.Uint32(buf[16:]) 1221 snap := &snapshot{ 1222 lastTerm: le.Uint64(buf[0:]), 1223 lastIndex: le.Uint64(buf[8:]), 1224 peerstate: buf[20 : 20+lps], 1225 data: buf[20+lps : hoff], 1226 } 1227 1228 // We had a bug in 2.9.12 that would allow snapshots on last index of 0. 1229 // Detect that here and return err. 1230 if snap.lastIndex == 0 { 1231 n.warn("Snapshot with last index 0 is invalid, cleaning up") 1232 os.Remove(n.snapfile) 1233 n.snapfile = _EMPTY_ 1234 return nil, errSnapshotCorrupt 1235 } 1236 1237 return snap, nil 1238 } 1239 1240 // Leader returns if we are the leader for our group. 1241 // We use an atomic here now vs acquiring the read lock. 1242 func (n *raft) Leader() bool { 1243 if n == nil { 1244 return false 1245 } 1246 return n.State() == Leader 1247 } 1248 1249 // isCatchingUp returns true if a catchup is currently taking place. 1250 func (n *raft) isCatchingUp() bool { 1251 n.RLock() 1252 defer n.RUnlock() 1253 return n.catchup != nil 1254 } 1255 1256 // isCurrent is called from the healthchecks and returns true if we believe 1257 // that the upper layer is current with the Raft layer, i.e. that it has applied 1258 // all of the commits that we have given it. 1259 // Optionally we can also check whether or not we're making forward progress if we 1260 // aren't current, in which case this function may block for up to ~10ms to find out. 1261 // Lock should be held. 1262 func (n *raft) isCurrent(includeForwardProgress bool) bool { 1263 // Check if we are closed. 1264 if n.State() == Closed { 1265 n.debug("Not current, node is closed") 1266 return false 1267 } 1268 1269 // Check whether we've made progress on any state, 0 is invalid so not healthy. 1270 if n.commit == 0 { 1271 n.debug("Not current, no commits") 1272 return false 1273 } 1274 1275 // If we were previously logging about falling behind, also log when the problem 1276 // was cleared. 1277 clearBehindState := func() { 1278 if n.hcbehind { 1279 n.warn("Health check OK, no longer falling behind") 1280 n.hcbehind = false 1281 } 1282 } 1283 1284 // Make sure we are the leader or we know we have heard from the leader recently. 1285 if n.State() == Leader { 1286 clearBehindState() 1287 return true 1288 } 1289 1290 // Check here on catchup status. 1291 if cs := n.catchup; cs != nil && n.pterm >= cs.cterm && n.pindex >= cs.cindex { 1292 n.cancelCatchup() 1293 } 1294 1295 // Check to see that we have heard from the current leader lately. 1296 if n.leader != noLeader && n.leader != n.id && n.catchup == nil { 1297 okInterval := int64(hbInterval) * 2 1298 ts := time.Now().UnixNano() 1299 if ps := n.peers[n.leader]; ps == nil || ps.ts == 0 && (ts-ps.ts) > okInterval { 1300 n.debug("Not current, no recent leader contact") 1301 return false 1302 } 1303 } 1304 if cs := n.catchup; cs != nil { 1305 n.debug("Not current, still catching up pindex=%d, cindex=%d", n.pindex, cs.cindex) 1306 } 1307 1308 if n.commit == n.applied { 1309 // At this point if we are current, we can return saying so. 1310 clearBehindState() 1311 return true 1312 } else if !includeForwardProgress { 1313 // Otherwise, if we aren't allowed to include forward progress 1314 // (i.e. 
we are checking "current" instead of "healthy") then 1315 // give up now. 1316 return false 1317 } 1318 1319 // Otherwise, wait for a short period of time and see if we are making any 1320 // forward progress. 1321 if startDelta := n.commit - n.applied; startDelta > 0 { 1322 for i := 0; i < 10; i++ { // 10ms, in 1ms increments 1323 n.Unlock() 1324 time.Sleep(time.Millisecond) 1325 n.Lock() 1326 if n.commit-n.applied < startDelta { 1327 // The gap is getting smaller, so we're making forward progress. 1328 clearBehindState() 1329 return true 1330 } 1331 } 1332 } 1333 1334 n.hcbehind = true 1335 n.warn("Falling behind in health check, commit %d != applied %d", n.commit, n.applied) 1336 return false 1337 } 1338 1339 // Current returns if we are the leader for our group or an up to date follower. 1340 func (n *raft) Current() bool { 1341 if n == nil { 1342 return false 1343 } 1344 n.Lock() 1345 defer n.Unlock() 1346 return n.isCurrent(false) 1347 } 1348 1349 // Healthy returns if we are the leader for our group and nearly up-to-date. 1350 func (n *raft) Healthy() bool { 1351 if n == nil { 1352 return false 1353 } 1354 n.Lock() 1355 defer n.Unlock() 1356 return n.isCurrent(true) 1357 } 1358 1359 // HadPreviousLeader indicates if this group ever had a leader. 1360 func (n *raft) HadPreviousLeader() bool { 1361 n.RLock() 1362 defer n.RUnlock() 1363 return n.pleader 1364 } 1365 1366 // GroupLeader returns the current leader of the group. 1367 func (n *raft) GroupLeader() string { 1368 if n == nil { 1369 return noLeader 1370 } 1371 n.RLock() 1372 defer n.RUnlock() 1373 return n.leader 1374 } 1375 1376 // Guess the best next leader. Stepdown will check more thoroughly. 1377 // Lock should be held. 1378 func (n *raft) selectNextLeader() string { 1379 nextLeader, hli := noLeader, uint64(0) 1380 for peer, ps := range n.peers { 1381 if peer == n.id || ps.li <= hli { 1382 continue 1383 } 1384 hli = ps.li 1385 nextLeader = peer 1386 } 1387 return nextLeader 1388 } 1389 1390 // StepDown will have a leader stepdown and optionally do a leader transfer. 1391 func (n *raft) StepDown(preferred ...string) error { 1392 n.Lock() 1393 1394 if len(preferred) > 1 { 1395 n.Unlock() 1396 return errTooManyPrefs 1397 } 1398 1399 if n.State() != Leader { 1400 n.Unlock() 1401 return errNotLeader 1402 } 1403 1404 n.debug("Being asked to stepdown") 1405 1406 // See if we have up to date followers. 1407 maybeLeader := noLeader 1408 if len(preferred) > 0 { 1409 if preferred[0] != _EMPTY_ { 1410 maybeLeader = preferred[0] 1411 } else { 1412 preferred = nil 1413 } 1414 } 1415 1416 // Can't pick ourselves. 1417 if maybeLeader == n.id { 1418 maybeLeader = noLeader 1419 preferred = nil 1420 } 1421 1422 nowts := time.Now().UnixNano() 1423 1424 // If we have a preferred check it first. 1425 if maybeLeader != noLeader { 1426 var isHealthy bool 1427 if ps, ok := n.peers[maybeLeader]; ok { 1428 si, ok := n.s.nodeToInfo.Load(maybeLeader) 1429 isHealthy = ok && !si.(nodeInfo).offline && (nowts-ps.ts) < int64(hbInterval*3) 1430 } 1431 if !isHealthy { 1432 maybeLeader = noLeader 1433 } 1434 } 1435 1436 // If we do not have a preferred at this point pick the first healthy one. 1437 // Make sure not ourselves. 
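	// -----------------------------------------------------------------
	// Aside (illustrative, not part of the original source): from the caller's
	// point of view there are only two shapes of stepdown (preferredID is a
	// hypothetical peer ID):
	//
	//	err := n.StepDown()           // let the leader pick any healthy peer
	//	err = n.StepDown(preferredID) // try preferredID first, fall back if unhealthy
	//
	// Passing more than one preference returns errTooManyPrefs, and calling it
	// on a non-leader returns errNotLeader. The loop below implements the
	// "pick the first healthy peer" fallback.
	// -----------------------------------------------------------------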
1438 if maybeLeader == noLeader { 1439 for peer, ps := range n.peers { 1440 if peer == n.id { 1441 continue 1442 } 1443 si, ok := n.s.nodeToInfo.Load(peer) 1444 isHealthy := ok && !si.(nodeInfo).offline && (nowts-ps.ts) < int64(hbInterval*3) 1445 if isHealthy { 1446 maybeLeader = peer 1447 break 1448 } 1449 } 1450 } 1451 1452 // Clear our vote state. 1453 n.vote = noVote 1454 n.writeTermVote() 1455 1456 stepdown := n.stepdown 1457 prop := n.prop 1458 n.Unlock() 1459 1460 if len(preferred) > 0 && maybeLeader == noLeader { 1461 n.debug("Can not transfer to preferred peer %q", preferred[0]) 1462 } 1463 1464 // If we have a new leader selected, transfer over to them. 1465 if maybeLeader != noLeader { 1466 n.debug("Selected %q for new leader", maybeLeader) 1467 prop.push(newEntry(EntryLeaderTransfer, []byte(maybeLeader))) 1468 } else { 1469 // Force us to stepdown here. 1470 n.debug("Stepping down") 1471 stepdown.push(noLeader) 1472 } 1473 1474 return nil 1475 } 1476 1477 // Campaign will have our node start a leadership vote. 1478 func (n *raft) Campaign() error { 1479 n.Lock() 1480 defer n.Unlock() 1481 return n.campaign() 1482 } 1483 1484 func randCampaignTimeout() time.Duration { 1485 delta := rand.Int63n(int64(maxCampaignTimeout - minCampaignTimeout)) 1486 return (minCampaignTimeout + time.Duration(delta)) 1487 } 1488 1489 // Campaign will have our node start a leadership vote. 1490 // Lock should be held. 1491 func (n *raft) campaign() error { 1492 n.debug("Starting campaign") 1493 if n.State() == Leader { 1494 return errAlreadyLeader 1495 } 1496 n.resetElect(randCampaignTimeout()) 1497 return nil 1498 } 1499 1500 // xferCampaign will have our node start an immediate leadership vote. 1501 // Lock should be held. 1502 func (n *raft) xferCampaign() error { 1503 n.debug("Starting transfer campaign") 1504 if n.State() == Leader { 1505 n.lxfer = false 1506 return errAlreadyLeader 1507 } 1508 n.resetElect(10 * time.Millisecond) 1509 return nil 1510 } 1511 1512 // State returns the current state for this node. 1513 func (n *raft) State() RaftState { 1514 return RaftState(n.state.Load()) 1515 } 1516 1517 // Progress returns the current index, commit and applied values. 1518 func (n *raft) Progress() (index, commit, applied uint64) { 1519 n.RLock() 1520 defer n.RUnlock() 1521 return n.pindex + 1, n.commit, n.applied 1522 } 1523 1524 // Size returns number of entries and total bytes for our WAL. 1525 func (n *raft) Size() (uint64, uint64) { 1526 n.RLock() 1527 var state StreamState 1528 n.wal.FastState(&state) 1529 n.RUnlock() 1530 return state.Msgs, state.Bytes 1531 } 1532 1533 func (n *raft) ID() string { 1534 if n == nil { 1535 return _EMPTY_ 1536 } 1537 n.RLock() 1538 defer n.RUnlock() 1539 return n.id 1540 } 1541 1542 func (n *raft) Group() string { 1543 n.RLock() 1544 defer n.RUnlock() 1545 return n.group 1546 } 1547 1548 func (n *raft) Peers() []*Peer { 1549 n.RLock() 1550 defer n.RUnlock() 1551 1552 var peers []*Peer 1553 for id, ps := range n.peers { 1554 var lag uint64 1555 if n.commit > ps.li { 1556 lag = n.commit - ps.li 1557 } 1558 p := &Peer{ 1559 ID: id, 1560 Current: id == n.leader || ps.li >= n.applied, 1561 Last: time.Unix(0, ps.ts), 1562 Lag: lag, 1563 } 1564 peers = append(peers, p) 1565 } 1566 return peers 1567 } 1568 1569 // Update our known set of peers. 1570 func (n *raft) UpdateKnownPeers(knownPeers []string) { 1571 n.Lock() 1572 // Process like peer state update. 
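	// -----------------------------------------------------------------
	// Aside (illustrative, not part of the original source; the IDs are made
	// up): the caller-supplied slice is treated as the complete peer set, so
	// the cluster size tracks its length:
	//
	//	n.UpdateKnownPeers([]string{"yrzKKRBu", "cnrtt3eg", "S1Nunr6R"})
	//	// peerState{knownPeers, 3, extSt} -> cluster size 3, and by the
	//	// csz/2+1 rule used throughout this file, quorum needs 2 nodes.
	//
	// The peerState built below is what processPeerState consumes.
	// -----------------------------------------------------------------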
1573 ps := &peerState{knownPeers, len(knownPeers), n.extSt} 1574 n.processPeerState(ps) 1575 isLeader := n.State() == Leader 1576 n.Unlock() 1577 1578 // If we are the leader send this update out as well. 1579 if isLeader { 1580 n.sendPeerState() 1581 } 1582 } 1583 1584 // ApplyQ returns the apply queue that new commits will be sent to for the 1585 // upper layer to apply. 1586 func (n *raft) ApplyQ() *ipQueue[*CommittedEntry] { return n.apply } 1587 1588 // LeadChangeC returns the leader change channel, notifying when the Raft 1589 // leader role has moved. 1590 func (n *raft) LeadChangeC() <-chan bool { return n.leadc } 1591 1592 // QuitC returns the quit channel, notifying when the Raft group has shut down. 1593 func (n *raft) QuitC() <-chan struct{} { return n.quit } 1594 1595 func (n *raft) Created() time.Time { 1596 n.RLock() 1597 defer n.RUnlock() 1598 return n.created 1599 } 1600 1601 func (n *raft) Stop() { 1602 n.shutdown(false) 1603 } 1604 1605 func (n *raft) Delete() { 1606 n.shutdown(true) 1607 } 1608 1609 func (n *raft) shutdown(shouldDelete bool) { 1610 n.Lock() 1611 1612 // Returned swap value is the previous state. It looks counter-intuitive 1613 // to do this atomic operation with the lock held, but we have to do so in 1614 // order to make sure that switchState() is not already running. If it is 1615 // then it can potentially update the n.state back to a non-closed state, 1616 // allowing shutdown() to be called again. If that happens then the below 1617 // close(n.quit) will panic from trying to close an already-closed channel. 1618 if n.state.Swap(int32(Closed)) == int32(Closed) { 1619 n.Unlock() 1620 return 1621 } 1622 1623 close(n.quit) 1624 if c := n.c; c != nil { 1625 var subs []*subscription 1626 c.mu.Lock() 1627 for _, sub := range c.subs { 1628 subs = append(subs, sub) 1629 } 1630 c.mu.Unlock() 1631 for _, sub := range subs { 1632 n.unsubscribe(sub) 1633 } 1634 c.closeConnection(InternalClient) 1635 } 1636 s, g, wal := n.s, n.group, n.wal 1637 1638 // Unregistering ipQueues do not prevent them from push/pop 1639 // just will remove them from the central monitoring map 1640 queues := []interface { 1641 unregister() 1642 }{n.reqs, n.votes, n.prop, n.entry, n.resp, n.apply, n.stepdown} 1643 for _, q := range queues { 1644 q.unregister() 1645 } 1646 n.Unlock() 1647 1648 s.unregisterRaftNode(g) 1649 1650 if wal != nil { 1651 if shouldDelete { 1652 wal.Delete() 1653 } else { 1654 wal.Stop() 1655 } 1656 } 1657 1658 if shouldDelete { 1659 // Delete all our peer state and vote state and any snapshots. 1660 os.RemoveAll(n.sd) 1661 n.debug("Deleted") 1662 } else { 1663 n.debug("Shutdown") 1664 } 1665 } 1666 1667 // Wipe will force an on disk state reset and then call Delete(). 1668 // Useful in case we have been stopped before this point. 1669 func (n *raft) Wipe() { 1670 n.RLock() 1671 wal := n.wal 1672 n.RUnlock() 1673 // Delete our underlying storage. 1674 if wal != nil { 1675 wal.Delete() 1676 } 1677 // Now call delete. 
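	// -----------------------------------------------------------------
	// Aside (illustrative summary, not part of the original source): the three
	// teardown entry points above differ only in what happens to stored state:
	//
	//	n.Stop()   // shutdown(false): stop the WAL, keep the store directory
	//	n.Delete() // shutdown(true): delete the WAL and remove the store directory
	//	n.Wipe()   // delete the WAL first, then Delete() for everything else
	//
	// -----------------------------------------------------------------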
1678 n.Delete() 1679 } 1680 1681 const ( 1682 raftAllSubj = "$NRG.>" 1683 raftVoteSubj = "$NRG.V.%s" 1684 raftAppendSubj = "$NRG.AE.%s" 1685 raftPropSubj = "$NRG.P.%s" 1686 raftRemovePeerSubj = "$NRG.RP.%s" 1687 raftReply = "$NRG.R.%s" 1688 raftCatchupReply = "$NRG.CR.%s" 1689 ) 1690 1691 // Lock should be held (due to use of random generator) 1692 func (n *raft) newCatchupInbox() string { 1693 var b [replySuffixLen]byte 1694 rn := fastrand.Uint64() 1695 for i, l := 0, rn; i < len(b); i++ { 1696 b[i] = digits[l%base] 1697 l /= base 1698 } 1699 return fmt.Sprintf(raftCatchupReply, b[:]) 1700 } 1701 1702 func (n *raft) newInbox() string { 1703 var b [replySuffixLen]byte 1704 rn := fastrand.Uint64() 1705 for i, l := 0, rn; i < len(b); i++ { 1706 b[i] = digits[l%base] 1707 l /= base 1708 } 1709 return fmt.Sprintf(raftReply, b[:]) 1710 } 1711 1712 // Our internal subscribe. 1713 // Lock should be held. 1714 func (n *raft) subscribe(subject string, cb msgHandler) (*subscription, error) { 1715 return n.s.systemSubscribe(subject, _EMPTY_, false, n.c, cb) 1716 } 1717 1718 // Lock should be held. 1719 func (n *raft) unsubscribe(sub *subscription) { 1720 if sub != nil { 1721 n.c.processUnsub(sub.sid) 1722 } 1723 } 1724 1725 func (n *raft) createInternalSubs() error { 1726 n.Lock() 1727 defer n.Unlock() 1728 n.vsubj, n.vreply = fmt.Sprintf(raftVoteSubj, n.group), n.newInbox() 1729 n.asubj, n.areply = fmt.Sprintf(raftAppendSubj, n.group), n.newInbox() 1730 n.psubj = fmt.Sprintf(raftPropSubj, n.group) 1731 n.rpsubj = fmt.Sprintf(raftRemovePeerSubj, n.group) 1732 1733 // Votes 1734 if _, err := n.subscribe(n.vreply, n.handleVoteResponse); err != nil { 1735 return err 1736 } 1737 if _, err := n.subscribe(n.vsubj, n.handleVoteRequest); err != nil { 1738 return err 1739 } 1740 // AppendEntry 1741 if _, err := n.subscribe(n.areply, n.handleAppendEntryResponse); err != nil { 1742 return err 1743 } 1744 if sub, err := n.subscribe(n.asubj, n.handleAppendEntry); err != nil { 1745 return err 1746 } else { 1747 n.aesub = sub 1748 } 1749 1750 return nil 1751 } 1752 1753 func randElectionTimeout() time.Duration { 1754 delta := rand.Int63n(int64(maxElectionTimeout - minElectionTimeout)) 1755 return (minElectionTimeout + time.Duration(delta)) 1756 } 1757 1758 // Lock should be held. 1759 func (n *raft) resetElectionTimeout() { 1760 n.resetElect(randElectionTimeout()) 1761 } 1762 1763 func (n *raft) resetElectionTimeoutWithLock() { 1764 n.resetElectWithLock(randElectionTimeout()) 1765 } 1766 1767 // Lock should be held. 1768 func (n *raft) resetElect(et time.Duration) { 1769 if n.elect == nil { 1770 n.elect = time.NewTimer(et) 1771 } else { 1772 if !n.elect.Stop() { 1773 select { 1774 case <-n.elect.C: 1775 default: 1776 } 1777 } 1778 n.elect.Reset(et) 1779 } 1780 } 1781 1782 func (n *raft) resetElectWithLock(et time.Duration) { 1783 n.Lock() 1784 n.resetElect(et) 1785 n.Unlock() 1786 } 1787 1788 // run is the top-level runner for the Raft state machine. Depending on the 1789 // state of the node (leader, follower, candidate, observer), this will call 1790 // through to other functions. It is expected that this function will run for 1791 // the entire life of the Raft node once started. 1792 func (n *raft) run() { 1793 s := n.s 1794 defer s.grWG.Done() 1795 1796 // We want to wait for some routing to be enabled, so we will wait for 1797 // at least a route, leaf or gateway connection to be established before 1798 // starting the run loop. 
1799 for gw := s.gateway; ; { 1800 s.mu.RLock() 1801 ready, gwEnabled := s.numRemotes()+len(s.leafs) > 0, gw.enabled 1802 s.mu.RUnlock() 1803 if !ready && gwEnabled { 1804 gw.RLock() 1805 ready = len(gw.out)+len(gw.in) > 0 1806 gw.RUnlock() 1807 } 1808 if !ready { 1809 select { 1810 case <-s.quitCh: 1811 return 1812 case <-time.After(100 * time.Millisecond): 1813 s.RateLimitWarnf("Waiting for routing to be established...") 1814 } 1815 } else { 1816 break 1817 } 1818 } 1819 1820 // We may have paused adding entries to apply queue, resume here. 1821 // No-op if not paused. 1822 n.ResumeApply() 1823 1824 // Send nil entry to signal the upper layers we are done doing replay/restore. 1825 n.apply.push(nil) 1826 1827 for s.isRunning() { 1828 switch n.State() { 1829 case Follower: 1830 n.runAsFollower() 1831 case Candidate: 1832 n.runAsCandidate() 1833 case Leader: 1834 n.runAsLeader() 1835 case Closed: 1836 return 1837 } 1838 } 1839 } 1840 1841 func (n *raft) debug(format string, args ...any) { 1842 if n.dflag { 1843 nf := fmt.Sprintf("RAFT [%s - %s] %s", n.id, n.group, format) 1844 n.s.Debugf(nf, args...) 1845 } 1846 } 1847 1848 func (n *raft) warn(format string, args ...any) { 1849 nf := fmt.Sprintf("RAFT [%s - %s] %s", n.id, n.group, format) 1850 n.s.RateLimitWarnf(nf, args...) 1851 } 1852 1853 func (n *raft) error(format string, args ...any) { 1854 nf := fmt.Sprintf("RAFT [%s - %s] %s", n.id, n.group, format) 1855 n.s.Errorf(nf, args...) 1856 } 1857 1858 func (n *raft) electTimer() *time.Timer { 1859 n.RLock() 1860 defer n.RUnlock() 1861 return n.elect 1862 } 1863 1864 func (n *raft) IsObserver() bool { 1865 n.RLock() 1866 defer n.RUnlock() 1867 return n.observer 1868 } 1869 1870 // Sets the state to observer only. 1871 func (n *raft) SetObserver(isObserver bool) { 1872 n.setObserver(isObserver, extUndetermined) 1873 } 1874 1875 func (n *raft) setObserver(isObserver bool, extSt extensionState) { 1876 n.Lock() 1877 defer n.Unlock() 1878 n.observer = isObserver 1879 n.extSt = extSt 1880 } 1881 1882 // processAppendEntries is called by the Raft state machine when there are 1883 // new append entries to be committed and sent to the upper state machine. 1884 func (n *raft) processAppendEntries() { 1885 canProcess := true 1886 if n.isClosed() { 1887 n.debug("AppendEntry not processing inbound, closed") 1888 canProcess = false 1889 } 1890 if n.outOfResources() { 1891 n.debug("AppendEntry not processing inbound, no resources") 1892 canProcess = false 1893 } 1894 // Always pop the entries, but check if we can process them. If we can't 1895 // then the entries are effectively dropped. 1896 aes := n.entry.pop() 1897 if canProcess { 1898 for _, ae := range aes { 1899 n.processAppendEntry(ae, ae.sub) 1900 } 1901 } 1902 n.entry.recycle(&aes) 1903 } 1904 1905 // runAsFollower is called by run and will block for as long as the node is 1906 // running in the follower state. 1907 func (n *raft) runAsFollower() { 1908 for { 1909 elect := n.electTimer() 1910 1911 select { 1912 case <-n.entry.ch: 1913 // New append entries have arrived over the network. 1914 n.processAppendEntries() 1915 case <-n.s.quitCh: 1916 // The server is shutting down. 1917 n.shutdown(false) 1918 return 1919 case <-n.quit: 1920 // The Raft node is shutting down. 1921 return 1922 case <-elect.C: 1923 // The election timer has fired so we think it's time to call an election. 1924 // If we are out of resources we just want to stay in this state for the moment. 
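			// -----------------------------------------------------------------
			// Aside (illustrative, not part of the original source): the elect
			// timer that just fired was armed by resetElectionTimeout(), which
			// picks a random duration between the defaults declared earlier in
			// this file:
			//
			//	election timeout:    4s to 9s (randomized on every reset)
			//	campaign timeout:    100ms to 800ms (randomized)
			//	heartbeat interval:  1s
			//	lost quorum interval: 10s
			//
			// The checks below keep us a follower if we are out of resources,
			// an observer, or still catching up.
			// -----------------------------------------------------------------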
1925 if n.outOfResources() { 1926 n.resetElectionTimeoutWithLock() 1927 n.debug("Not switching to candidate, no resources") 1928 } else if n.IsObserver() { 1929 n.resetElectWithLock(48 * time.Hour) 1930 n.debug("Not switching to candidate, observer only") 1931 } else if n.isCatchingUp() { 1932 n.debug("Not switching to candidate, catching up") 1933 // Check to see if our catchup has stalled. 1934 n.Lock() 1935 if n.catchupStalled() { 1936 n.cancelCatchup() 1937 } 1938 n.resetElectionTimeout() 1939 n.Unlock() 1940 } else { 1941 n.switchToCandidate() 1942 return 1943 } 1944 case <-n.votes.ch: 1945 // We're receiving votes from the network, probably because we have only 1946 // just stepped down and they were already in flight. Ignore them. 1947 n.debug("Ignoring old vote response, we have stepped down") 1948 n.votes.popOne() 1949 case <-n.resp.ch: 1950 // We're receiving append entry responses from the network, probably because 1951 // we have only just stepped down and they were already in flight. Ignore them. 1952 n.resp.popOne() 1953 case <-n.reqs.ch: 1954 // We've just received a vote request from the network. 1955 // Because of drain() it is possible that we get nil from popOne(). 1956 if voteReq, ok := n.reqs.popOne(); ok { 1957 n.processVoteRequest(voteReq) 1958 } 1959 case <-n.stepdown.ch: 1960 // We've received a stepdown request, start following the new leader if 1961 // we can. 1962 if newLeader, ok := n.stepdown.popOne(); ok { 1963 n.switchToFollower(newLeader) 1964 return 1965 } 1966 } 1967 } 1968 } 1969 1970 // Pool for CommittedEntry re-use. 1971 var cePool = sync.Pool{ 1972 New: func() any { 1973 return &CommittedEntry{} 1974 }, 1975 } 1976 1977 // CommittedEntry is handed back to the user to apply a commit to their upper layer. 1978 type CommittedEntry struct { 1979 Index uint64 1980 Entries []*Entry 1981 } 1982 1983 // Create a new CommittedEntry. When the returned entry is no longer needed, it 1984 // should be returned to the pool by calling ReturnToPool. 1985 func newCommittedEntry(index uint64, entries []*Entry) *CommittedEntry { 1986 ce := cePool.Get().(*CommittedEntry) 1987 ce.Index, ce.Entries = index, entries 1988 return ce 1989 } 1990 1991 // ReturnToPool returns the CommittedEntry to the pool, after which point it is 1992 // no longer safe to reuse. 1993 func (ce *CommittedEntry) ReturnToPool() { 1994 if ce == nil { 1995 return 1996 } 1997 if len(ce.Entries) > 0 { 1998 for _, e := range ce.Entries { 1999 entryPool.Put(e) 2000 } 2001 } 2002 ce.Index, ce.Entries = 0, nil 2003 cePool.Put(ce) 2004 } 2005 2006 // Pool for Entry re-use. 2007 var entryPool = sync.Pool{ 2008 New: func() any { 2009 return &Entry{} 2010 }, 2011 } 2012 2013 // Helper to create new entries. When the returned entry is no longer needed, it 2014 // should be returned to the entryPool pool. 2015 func newEntry(t EntryType, data []byte) *Entry { 2016 entry := entryPool.Get().(*Entry) 2017 entry.Type, entry.Data = t, data 2018 return entry 2019 } 2020 2021 // Pool for appendEntry re-use. 2022 var aePool = sync.Pool{ 2023 New: func() any { 2024 return &appendEntry{} 2025 }, 2026 } 2027 2028 // appendEntry is the main struct that is used to sync raft peers. 2029 type appendEntry struct { 2030 leader string // The leader that this append entry came from. 2031 term uint64 // The current term, as the leader understands it. 2032 commit uint64 // The commit index, as the leader understands it. 2033 pterm uint64 // The previous term, for checking consistency. 
2034 pindex uint64 // The previous commit index, for checking consistency. 2035 entries []*Entry // Entries to process. 2036 // Below fields are for internal use only: 2037 reply string // Reply subject to respond to once committed. 2038 sub *subscription // The subscription that the append entry came in on. 2039 buf []byte 2040 } 2041 2042 // Create a new appendEntry. 2043 func newAppendEntry(leader string, term, commit, pterm, pindex uint64, entries []*Entry) *appendEntry { 2044 ae := aePool.Get().(*appendEntry) 2045 ae.leader, ae.term, ae.commit, ae.pterm, ae.pindex, ae.entries = leader, term, commit, pterm, pindex, entries 2046 ae.reply, ae.sub, ae.buf = _EMPTY_, nil, nil 2047 return ae 2048 } 2049 2050 // Will return this append entry, and its interior entries to their respective pools. 2051 func (ae *appendEntry) returnToPool() { 2052 ae.entries, ae.buf, ae.sub, ae.reply = nil, nil, nil, _EMPTY_ 2053 aePool.Put(ae) 2054 } 2055 2056 type EntryType uint8 2057 2058 const ( 2059 EntryNormal EntryType = iota 2060 EntryOldSnapshot 2061 EntryPeerState 2062 EntryAddPeer 2063 EntryRemovePeer 2064 EntryLeaderTransfer 2065 EntrySnapshot 2066 ) 2067 2068 func (t EntryType) String() string { 2069 switch t { 2070 case EntryNormal: 2071 return "Normal" 2072 case EntryOldSnapshot: 2073 return "OldSnapshot" 2074 case EntryPeerState: 2075 return "PeerState" 2076 case EntryAddPeer: 2077 return "AddPeer" 2078 case EntryRemovePeer: 2079 return "RemovePeer" 2080 case EntryLeaderTransfer: 2081 return "LeaderTransfer" 2082 case EntrySnapshot: 2083 return "Snapshot" 2084 } 2085 return fmt.Sprintf("Unknown [%d]", uint8(t)) 2086 } 2087 2088 type Entry struct { 2089 Type EntryType 2090 Data []byte 2091 } 2092 2093 func (ae *appendEntry) String() string { 2094 return fmt.Sprintf("&{leader:%s term:%d commit:%d pterm:%d pindex:%d entries: %d}", 2095 ae.leader, ae.term, ae.commit, ae.pterm, ae.pindex, len(ae.entries)) 2096 } 2097 2098 const appendEntryBaseLen = idLen + 4*8 + 2 2099 2100 func (ae *appendEntry) encode(b []byte) ([]byte, error) { 2101 if ll := len(ae.leader); ll != idLen && ll != 0 { 2102 return nil, errLeaderLen 2103 } 2104 if len(ae.entries) > math.MaxUint16 { 2105 return nil, errTooManyEntries 2106 } 2107 2108 var elen int 2109 for _, e := range ae.entries { 2110 elen += len(e.Data) + 1 + 4 // 1 is type, 4 is for size. 2111 } 2112 tlen := appendEntryBaseLen + elen + 1 2113 2114 var buf []byte 2115 if cap(b) >= tlen { 2116 buf = b[:tlen] 2117 } else { 2118 buf = make([]byte, tlen) 2119 } 2120 2121 var le = binary.LittleEndian 2122 copy(buf[:idLen], ae.leader) 2123 le.PutUint64(buf[8:], ae.term) 2124 le.PutUint64(buf[16:], ae.commit) 2125 le.PutUint64(buf[24:], ae.pterm) 2126 le.PutUint64(buf[32:], ae.pindex) 2127 le.PutUint16(buf[40:], uint16(len(ae.entries))) 2128 wi := 42 2129 for _, e := range ae.entries { 2130 le.PutUint32(buf[wi:], uint32(len(e.Data)+1)) 2131 wi += 4 2132 buf[wi] = byte(e.Type) 2133 wi++ 2134 copy(buf[wi:], e.Data) 2135 wi += len(e.Data) 2136 } 2137 return buf[:wi], nil 2138 } 2139 2140 // This can not be used post the wire level callback since we do not copy. 
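// For reference, the wire format produced by encode() above is, with all
// integers little-endian:
//
//	0-7    leader ID (idLen = 8 bytes)
//	8-15   term
//	16-23  commit
//	24-31  pterm
//	32-39  pindex
//	40-41  entry count (uint16)
//	then per entry: uint32 length (len(Data)+1), 1 byte EntryType, Data
//
// appendEntryBaseLen is therefore 42 bytes, and the decode below walks the
// same layout, aliasing each entry's Data directly into msg rather than
// copying it, which is why the buffer must be copied before we get here.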
2141 func (n *raft) decodeAppendEntry(msg []byte, sub *subscription, reply string) (*appendEntry, error) { 2142 if len(msg) < appendEntryBaseLen { 2143 return nil, errBadAppendEntry 2144 } 2145 2146 var le = binary.LittleEndian 2147 2148 ae := newAppendEntry(string(msg[:idLen]), le.Uint64(msg[8:]), le.Uint64(msg[16:]), le.Uint64(msg[24:]), le.Uint64(msg[32:]), nil) 2149 ae.reply, ae.sub = reply, sub 2150 2151 // Decode Entries. 2152 ne, ri := int(le.Uint16(msg[40:])), 42 2153 for i, max := 0, len(msg); i < ne; i++ { 2154 if ri >= max-1 { 2155 return nil, errBadAppendEntry 2156 } 2157 le := int(le.Uint32(msg[ri:])) 2158 ri += 4 2159 if le <= 0 || ri+le > max { 2160 return nil, errBadAppendEntry 2161 } 2162 entry := newEntry(EntryType(msg[ri]), msg[ri+1:ri+le]) 2163 ae.entries = append(ae.entries, entry) 2164 ri += le 2165 } 2166 ae.buf = msg 2167 return ae, nil 2168 } 2169 2170 // Pool for appendEntryResponse re-use. 2171 var arPool = sync.Pool{ 2172 New: func() any { 2173 return &appendEntryResponse{} 2174 }, 2175 } 2176 2177 // We want to make sure this does not change from system changing length of syshash. 2178 const idLen = 8 2179 const appendEntryResponseLen = 24 + 1 2180 2181 // appendEntryResponse is our response to a received appendEntry. 2182 type appendEntryResponse struct { 2183 term uint64 2184 index uint64 2185 peer string 2186 reply string // internal usage. 2187 success bool 2188 } 2189 2190 // Create a new appendEntryResponse. 2191 func newAppendEntryResponse(term, index uint64, peer string, success bool) *appendEntryResponse { 2192 ar := arPool.Get().(*appendEntryResponse) 2193 ar.term, ar.index, ar.peer, ar.success = term, index, peer, success 2194 // Always empty out. 2195 ar.reply = _EMPTY_ 2196 return ar 2197 } 2198 2199 func (ar *appendEntryResponse) encode(b []byte) []byte { 2200 var buf []byte 2201 if cap(b) >= appendEntryResponseLen { 2202 buf = b[:appendEntryResponseLen] 2203 } else { 2204 buf = make([]byte, appendEntryResponseLen) 2205 } 2206 var le = binary.LittleEndian 2207 le.PutUint64(buf[0:], ar.term) 2208 le.PutUint64(buf[8:], ar.index) 2209 copy(buf[16:16+idLen], ar.peer) 2210 if ar.success { 2211 buf[24] = 1 2212 } else { 2213 buf[24] = 0 2214 } 2215 return buf[:appendEntryResponseLen] 2216 } 2217 2218 // Track all peers we may have ever seen to use an string interns for appendEntryResponse decoding. 2219 var peers sync.Map 2220 2221 func (n *raft) decodeAppendEntryResponse(msg []byte) *appendEntryResponse { 2222 if len(msg) != appendEntryResponseLen { 2223 return nil 2224 } 2225 var le = binary.LittleEndian 2226 ar := arPool.Get().(*appendEntryResponse) 2227 ar.term = le.Uint64(msg[0:]) 2228 ar.index = le.Uint64(msg[8:]) 2229 2230 peer, ok := peers.Load(string(msg[16 : 16+idLen])) 2231 if !ok { 2232 // We missed so store inline here. 
2233 peer = string(msg[16 : 16+idLen]) 2234 peers.Store(peer, peer) 2235 } 2236 ar.peer = peer.(string) 2237 ar.success = msg[24] == 1 2238 return ar 2239 } 2240 2241 // Called when a remove peer proposal has been forwarded 2242 func (n *raft) handleForwardedRemovePeerProposal(sub *subscription, c *client, _ *Account, _, reply string, msg []byte) { 2243 n.debug("Received forwarded remove peer proposal: %q", msg) 2244 2245 if !n.Leader() { 2246 n.debug("Ignoring forwarded peer removal proposal, not leader") 2247 return 2248 } 2249 if len(msg) != idLen { 2250 n.warn("Received invalid peer name for remove proposal: %q", msg) 2251 return 2252 } 2253 2254 n.RLock() 2255 prop, werr := n.prop, n.werr 2256 n.RUnlock() 2257 2258 // Ignore if we have had a write error previous. 2259 if werr != nil { 2260 return 2261 } 2262 2263 // Need to copy since this is underlying client/route buffer. 2264 peer := copyBytes(msg) 2265 prop.push(newEntry(EntryRemovePeer, peer)) 2266 } 2267 2268 // Called when a peer has forwarded a proposal. 2269 func (n *raft) handleForwardedProposal(sub *subscription, c *client, _ *Account, _, reply string, msg []byte) { 2270 if !n.Leader() { 2271 n.debug("Ignoring forwarded proposal, not leader") 2272 return 2273 } 2274 // Need to copy since this is underlying client/route buffer. 2275 msg = copyBytes(msg) 2276 2277 n.RLock() 2278 prop, werr := n.prop, n.werr 2279 n.RUnlock() 2280 2281 // Ignore if we have had a write error previous. 2282 if werr != nil { 2283 return 2284 } 2285 2286 prop.push(newEntry(EntryNormal, msg)) 2287 } 2288 2289 func (n *raft) runAsLeader() { 2290 if n.State() == Closed { 2291 return 2292 } 2293 2294 n.RLock() 2295 psubj, rpsubj := n.psubj, n.rpsubj 2296 n.RUnlock() 2297 2298 // For forwarded proposals, both normal and remove peer proposals. 2299 fsub, err := n.subscribe(psubj, n.handleForwardedProposal) 2300 if err != nil { 2301 n.warn("Error subscribing to forwarded proposals: %v", err) 2302 n.stepdown.push(noLeader) 2303 return 2304 } 2305 rpsub, err := n.subscribe(rpsubj, n.handleForwardedRemovePeerProposal) 2306 if err != nil { 2307 n.warn("Error subscribing to forwarded remove peer proposals: %v", err) 2308 n.unsubscribe(fsub) 2309 n.stepdown.push(noLeader) 2310 return 2311 } 2312 2313 // Cleanup our subscription when we leave. 2314 defer func() { 2315 n.Lock() 2316 n.unsubscribe(fsub) 2317 n.unsubscribe(rpsub) 2318 n.Unlock() 2319 }() 2320 2321 // To send out our initial peer state. 2322 n.sendPeerState() 2323 2324 hb := time.NewTicker(hbInterval) 2325 defer hb.Stop() 2326 2327 lq := time.NewTicker(lostQuorumCheck) 2328 defer lq.Stop() 2329 2330 for n.State() == Leader { 2331 select { 2332 case <-n.s.quitCh: 2333 n.shutdown(false) 2334 return 2335 case <-n.quit: 2336 return 2337 case <-n.resp.ch: 2338 ars := n.resp.pop() 2339 for _, ar := range ars { 2340 n.processAppendEntryResponse(ar) 2341 } 2342 n.resp.recycle(&ars) 2343 case <-n.prop.ch: 2344 const maxBatch = 256 * 1024 2345 var entries []*Entry 2346 2347 es := n.prop.pop() 2348 sz := 0 2349 for i, b := range es { 2350 if b.Type == EntryRemovePeer { 2351 n.doRemovePeerAsLeader(string(b.Data)) 2352 } 2353 entries = append(entries, b) 2354 sz += len(b.Data) + 1 2355 if i != len(es)-1 && sz < maxBatch && len(entries) < math.MaxUint16 { 2356 continue 2357 } 2358 n.sendAppendEntry(entries) 2359 2360 // If this is us sending out a leadership transfer stepdown inline here. 
2361 if b.Type == EntryLeaderTransfer { 2362 n.prop.recycle(&es) 2363 n.debug("Stepping down due to leadership transfer") 2364 n.switchToFollower(noLeader) 2365 return 2366 } 2367 // We need to re-create `entries` because there is a reference 2368 // to it in the node's pae map. 2369 entries = nil 2370 } 2371 n.prop.recycle(&es) 2372 2373 case <-hb.C: 2374 if n.notActive() { 2375 n.sendHeartbeat() 2376 } 2377 case <-lq.C: 2378 if n.lostQuorum() { 2379 n.switchToFollower(noLeader) 2380 return 2381 } 2382 case <-n.votes.ch: 2383 // Because of drain() it is possible that we get nil from popOne(). 2384 vresp, ok := n.votes.popOne() 2385 if !ok { 2386 continue 2387 } 2388 if vresp.term > n.Term() { 2389 n.switchToFollower(noLeader) 2390 return 2391 } 2392 n.trackPeer(vresp.peer) 2393 case <-n.reqs.ch: 2394 // Because of drain() it is possible that we get nil from popOne(). 2395 if voteReq, ok := n.reqs.popOne(); ok { 2396 n.processVoteRequest(voteReq) 2397 } 2398 case <-n.stepdown.ch: 2399 if newLeader, ok := n.stepdown.popOne(); ok { 2400 n.switchToFollower(newLeader) 2401 return 2402 } 2403 case <-n.entry.ch: 2404 n.processAppendEntries() 2405 } 2406 } 2407 } 2408 2409 // Quorum reports the quorum status. Will be called on former leaders. 2410 func (n *raft) Quorum() bool { 2411 n.RLock() 2412 defer n.RUnlock() 2413 2414 now, nc := time.Now().UnixNano(), 1 2415 for _, peer := range n.peers { 2416 if now-peer.ts < int64(lostQuorumInterval) { 2417 nc++ 2418 if nc >= n.qn { 2419 return true 2420 } 2421 } 2422 } 2423 return false 2424 } 2425 2426 func (n *raft) lostQuorum() bool { 2427 n.RLock() 2428 defer n.RUnlock() 2429 return n.lostQuorumLocked() 2430 } 2431 2432 func (n *raft) lostQuorumLocked() bool { 2433 // Make sure we let any scale up actions settle before deciding. 2434 if !n.lsut.IsZero() && time.Since(n.lsut) < lostQuorumInterval { 2435 return false 2436 } 2437 2438 now, nc := time.Now().UnixNano(), 1 2439 for _, peer := range n.peers { 2440 if now-peer.ts < int64(lostQuorumInterval) { 2441 nc++ 2442 if nc >= n.qn { 2443 return false 2444 } 2445 } 2446 } 2447 return true 2448 } 2449 2450 // Check for being not active in terms of sending entries. 2451 // Used in determining if we need to send a heartbeat. 2452 func (n *raft) notActive() bool { 2453 n.RLock() 2454 defer n.RUnlock() 2455 return time.Since(n.active) > hbInterval 2456 } 2457 2458 // Return our current term. 2459 func (n *raft) Term() uint64 { 2460 n.RLock() 2461 defer n.RUnlock() 2462 return n.term 2463 } 2464 2465 // Lock should be held. 2466 func (n *raft) loadFirstEntry() (ae *appendEntry, err error) { 2467 var state StreamState 2468 n.wal.FastState(&state) 2469 return n.loadEntry(state.FirstSeq) 2470 } 2471 2472 func (n *raft) runCatchup(ar *appendEntryResponse, indexUpdatesQ *ipQueue[uint64]) { 2473 n.RLock() 2474 s, reply := n.s, n.areply 2475 peer, subj, last := ar.peer, ar.reply, n.pindex 2476 n.RUnlock() 2477 2478 defer s.grWG.Done() 2479 defer arPool.Put(ar) 2480 2481 defer func() { 2482 n.Lock() 2483 delete(n.progress, peer) 2484 if len(n.progress) == 0 { 2485 n.progress = nil 2486 } 2487 // Check if this is a new peer and if so go ahead and propose adding them. 2488 _, exists := n.peers[peer] 2489 n.Unlock() 2490 if !exists { 2491 n.debug("Catchup done for %q, will add into peers", peer) 2492 n.ProposeAddPeer(peer) 2493 } 2494 indexUpdatesQ.unregister() 2495 }() 2496 2497 n.debug("Running catchup for %q", peer) 2498 2499 const maxOutstanding = 2 * 1024 * 1024 // 2MB for now. 
2500 next, total, om := uint64(0), 0, make(map[uint64]int) 2501 2502 sendNext := func() bool { 2503 for total <= maxOutstanding { 2504 next++ 2505 if next > last { 2506 return true 2507 } 2508 ae, err := n.loadEntry(next) 2509 if err != nil { 2510 if err != ErrStoreEOF { 2511 n.warn("Got an error loading %d index: %v", next, err) 2512 } 2513 return true 2514 } 2515 // Update our tracking total. 2516 om[next] = len(ae.buf) 2517 total += len(ae.buf) 2518 n.sendRPC(subj, reply, ae.buf) 2519 } 2520 return false 2521 } 2522 2523 const activityInterval = 2 * time.Second 2524 timeout := time.NewTimer(activityInterval) 2525 defer timeout.Stop() 2526 2527 stepCheck := time.NewTicker(100 * time.Millisecond) 2528 defer stepCheck.Stop() 2529 2530 // Run as long as we are leader and still not caught up. 2531 for n.Leader() { 2532 select { 2533 case <-n.s.quitCh: 2534 n.shutdown(false) 2535 return 2536 case <-n.quit: 2537 return 2538 case <-stepCheck.C: 2539 if !n.Leader() { 2540 n.debug("Catching up canceled, no longer leader") 2541 return 2542 } 2543 case <-timeout.C: 2544 n.debug("Catching up for %q stalled", peer) 2545 return 2546 case <-indexUpdatesQ.ch: 2547 if index, ok := indexUpdatesQ.popOne(); ok { 2548 // Update our activity timer. 2549 timeout.Reset(activityInterval) 2550 // Update outstanding total. 2551 total -= om[index] 2552 delete(om, index) 2553 if next == 0 { 2554 next = index 2555 } 2556 // Check if we are done. 2557 if index > last || sendNext() { 2558 n.debug("Finished catching up") 2559 return 2560 } 2561 } 2562 } 2563 } 2564 } 2565 2566 // Lock should be held. 2567 func (n *raft) sendSnapshotToFollower(subject string) (uint64, error) { 2568 snap, err := n.loadLastSnapshot() 2569 if err != nil { 2570 // We need to stepdown here when this happens. 2571 n.stepdown.push(noLeader) 2572 // We need to reset our state here as well. 2573 n.resetWAL() 2574 return 0, err 2575 } 2576 // Go ahead and send the snapshot and peerstate here as first append entry to the catchup follower. 2577 ae := n.buildAppendEntry([]*Entry{{EntrySnapshot, snap.data}, {EntryPeerState, snap.peerstate}}) 2578 ae.pterm, ae.pindex = snap.lastTerm, snap.lastIndex 2579 var state StreamState 2580 n.wal.FastState(&state) 2581 2582 fpIndex := state.FirstSeq - 1 2583 if snap.lastIndex < fpIndex && state.FirstSeq != 0 { 2584 snap.lastIndex = fpIndex 2585 ae.pindex = fpIndex 2586 } 2587 2588 encoding, err := ae.encode(nil) 2589 if err != nil { 2590 return 0, err 2591 } 2592 n.sendRPC(subject, n.areply, encoding) 2593 return snap.lastIndex, nil 2594 } 2595 2596 func (n *raft) catchupFollower(ar *appendEntryResponse) { 2597 n.debug("Being asked to catch up follower: %q", ar.peer) 2598 n.Lock() 2599 if n.progress == nil { 2600 n.progress = make(map[string]*ipQueue[uint64]) 2601 } else if q, ok := n.progress[ar.peer]; ok { 2602 n.debug("Will cancel existing entry for catching up %q", ar.peer) 2603 delete(n.progress, ar.peer) 2604 q.push(n.pindex) 2605 } 2606 2607 // Check to make sure we have this entry. 2608 start := ar.index + 1 2609 var state StreamState 2610 n.wal.FastState(&state) 2611 2612 if start < state.FirstSeq || (state.Msgs == 0 && start <= state.LastSeq) { 2613 n.debug("Need to send snapshot to follower") 2614 if lastIndex, err := n.sendSnapshotToFollower(ar.reply); err != nil { 2615 n.error("Error sending snapshot to follower [%s]: %v", ar.peer, err) 2616 n.Unlock() 2617 arPool.Put(ar) 2618 return 2619 } else { 2620 start = lastIndex + 1 2621 // If no other entries, we can just return here. 
2622 if state.Msgs == 0 || start > state.LastSeq { 2623 n.debug("Finished catching up") 2624 n.Unlock() 2625 arPool.Put(ar) 2626 return 2627 } 2628 n.debug("Snapshot sent, reset first catchup entry to %d", lastIndex) 2629 } 2630 } 2631 2632 ae, err := n.loadEntry(start) 2633 if err != nil { 2634 n.warn("Request from follower for entry at index [%d] errored for state %+v - %v", start, state, err) 2635 if err == ErrStoreEOF { 2636 // If we are here we are seeing a request for an item beyond our state, meaning we should stepdown. 2637 n.stepdown.push(noLeader) 2638 n.Unlock() 2639 arPool.Put(ar) 2640 return 2641 } 2642 ae, err = n.loadFirstEntry() 2643 } 2644 if err != nil || ae == nil { 2645 n.warn("Could not find a starting entry for catchup request: %v", err) 2646 // If we are here we are seeing a request for an item we do not have, meaning we should stepdown. 2647 // This is possible on a reset of our WAL but the other side has a snapshot already. 2648 // If we do not stepdown this can cycle. 2649 n.stepdown.push(noLeader) 2650 n.Unlock() 2651 arPool.Put(ar) 2652 return 2653 } 2654 if ae.pindex != ar.index || ae.pterm != ar.term { 2655 n.debug("Our first entry [%d:%d] does not match request from follower [%d:%d]", ae.pterm, ae.pindex, ar.term, ar.index) 2656 } 2657 // Create a queue for delivering updates from responses. 2658 indexUpdates := newIPQueue[uint64](n.s, fmt.Sprintf("[ACC:%s] RAFT '%s' indexUpdates", n.accName, n.group)) 2659 indexUpdates.push(ae.pindex) 2660 n.progress[ar.peer] = indexUpdates 2661 n.Unlock() 2662 2663 n.s.startGoRoutine(func() { n.runCatchup(ar, indexUpdates) }) 2664 } 2665 2666 func (n *raft) loadEntry(index uint64) (*appendEntry, error) { 2667 var smp StoreMsg 2668 sm, err := n.wal.LoadMsg(index, &smp) 2669 if err != nil { 2670 return nil, err 2671 } 2672 return n.decodeAppendEntry(sm.msg, nil, _EMPTY_) 2673 } 2674 2675 // applyCommit will update our commit index and apply the entry to the apply queue. 2676 // lock should be held. 2677 func (n *raft) applyCommit(index uint64) error { 2678 if n.State() == Closed { 2679 return errNodeClosed 2680 } 2681 if index <= n.commit { 2682 n.debug("Ignoring apply commit for %d, already processed", index) 2683 return nil 2684 } 2685 original := n.commit 2686 n.commit = index 2687 2688 if n.State() == Leader { 2689 delete(n.acks, index) 2690 } 2691 2692 var fpae bool 2693 2694 ae := n.pae[index] 2695 if ae == nil { 2696 var state StreamState 2697 n.wal.FastState(&state) 2698 if index < state.FirstSeq { 2699 return nil 2700 } 2701 var err error 2702 if ae, err = n.loadEntry(index); err != nil { 2703 if err != ErrStoreClosed && err != ErrStoreEOF { 2704 n.warn("Got an error loading %d index: %v - will reset", index, err) 2705 if n.State() == Leader { 2706 n.stepdown.push(n.selectNextLeader()) 2707 } 2708 // Reset and cancel any catchup. 2709 n.resetWAL() 2710 n.cancelCatchup() 2711 } else { 2712 n.commit = original 2713 } 2714 return errEntryLoadFailed 2715 } 2716 } else { 2717 fpae = true 2718 } 2719 2720 ae.buf = nil 2721 2722 var committed []*Entry 2723 for _, e := range ae.entries { 2724 switch e.Type { 2725 case EntryNormal: 2726 committed = append(committed, e) 2727 case EntryOldSnapshot: 2728 // For old snapshots in our WAL. 
2729 committed = append(committed, newEntry(EntrySnapshot, e.Data)) 2730 case EntrySnapshot: 2731 committed = append(committed, e) 2732 case EntryPeerState: 2733 if n.State() != Leader { 2734 if ps, err := decodePeerState(e.Data); err == nil { 2735 n.processPeerState(ps) 2736 } 2737 } 2738 case EntryAddPeer: 2739 newPeer := string(e.Data) 2740 n.debug("Added peer %q", newPeer) 2741 2742 // Store our peer in our global peer map for all peers. 2743 peers.LoadOrStore(newPeer, newPeer) 2744 2745 // If we were on the removed list reverse that here. 2746 if n.removed != nil { 2747 delete(n.removed, newPeer) 2748 } 2749 2750 if lp, ok := n.peers[newPeer]; !ok { 2751 // We are not tracking this one automatically so we need to bump cluster size. 2752 n.peers[newPeer] = &lps{time.Now().UnixNano(), 0, true} 2753 } else { 2754 // Mark as added. 2755 lp.kp = true 2756 } 2757 // Adjust cluster size and quorum if needed. 2758 n.adjustClusterSizeAndQuorum() 2759 // Write out our new state. 2760 n.writePeerState(&peerState{n.peerNames(), n.csz, n.extSt}) 2761 // We pass these up as well. 2762 committed = append(committed, e) 2763 2764 case EntryRemovePeer: 2765 peer := string(e.Data) 2766 n.debug("Removing peer %q", peer) 2767 2768 // Make sure we have our removed map. 2769 if n.removed == nil { 2770 n.removed = make(map[string]struct{}) 2771 } 2772 n.removed[peer] = struct{}{} 2773 2774 if _, ok := n.peers[peer]; ok { 2775 delete(n.peers, peer) 2776 // We should decrease our cluster size since we are tracking this peer. 2777 n.adjustClusterSizeAndQuorum() 2778 // Write out our new state. 2779 n.writePeerState(&peerState{n.peerNames(), n.csz, n.extSt}) 2780 } 2781 2782 // If this is us and we are the leader we should attempt to stepdown. 2783 if peer == n.id && n.State() == Leader { 2784 n.stepdown.push(n.selectNextLeader()) 2785 } 2786 2787 // Remove from string intern map. 2788 peers.Delete(peer) 2789 2790 // We pass these up as well. 2791 committed = append(committed, e) 2792 } 2793 } 2794 if fpae { 2795 delete(n.pae, index) 2796 } 2797 // Pass to the upper layers if we have normal entries. It is 2798 // entirely possible that 'committed' might be an empty slice here, 2799 // which will happen if we've processed updates inline (like peer 2800 // states). In which case the upper layer will just call down with 2801 // Applied() with no further action. 2802 n.apply.push(newCommittedEntry(index, committed)) 2803 // Place back in the pool. 2804 ae.returnToPool() 2805 return nil 2806 } 2807 2808 // Used to track a success response and apply entries. 2809 func (n *raft) trackResponse(ar *appendEntryResponse) { 2810 if n.State() == Closed { 2811 return 2812 } 2813 2814 n.Lock() 2815 2816 // Update peer's last index. 2817 if ps := n.peers[ar.peer]; ps != nil && ar.index > ps.li { 2818 ps.li = ar.index 2819 } 2820 2821 // If we are tracking this peer as a catchup follower, update that here. 2822 if indexUpdateQ := n.progress[ar.peer]; indexUpdateQ != nil { 2823 indexUpdateQ.push(ar.index) 2824 } 2825 2826 // Ignore items already committed. 2827 if ar.index <= n.commit { 2828 n.Unlock() 2829 return 2830 } 2831 2832 // See if we have items to apply. 2833 var sendHB bool 2834 2835 if results := n.acks[ar.index]; results != nil { 2836 results[ar.peer] = struct{}{} 2837 if nr := len(results); nr >= n.qn { 2838 // We have a quorum. 
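// Quorum means qn = csz/2 + 1 acknowledgements, counting ourselves: the
// leader seeds acks[index] with its own ID when it stores the entry (see
// sendAppendEntry), so in a 3 node group a single successful follower
// response is enough (2 of 3) and a 5 node group needs two. Once the
// threshold is reached everything up to and including ar.index is committed
// and applied in order.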
2839 for index := n.commit + 1; index <= ar.index; index++ { 2840 if err := n.applyCommit(index); err != nil && err != errNodeClosed { 2841 n.error("Got an error applying commit for %d: %v", index, err) 2842 break 2843 } 2844 } 2845 sendHB = n.prop.len() == 0 2846 } 2847 } 2848 n.Unlock() 2849 2850 if sendHB { 2851 n.sendHeartbeat() 2852 } 2853 } 2854 2855 // Used to adjust cluster size and peer count based on added official peers. 2856 // lock should be held. 2857 func (n *raft) adjustClusterSizeAndQuorum() { 2858 pcsz, ncsz := n.csz, 0 2859 for _, peer := range n.peers { 2860 if peer.kp { 2861 ncsz++ 2862 } 2863 } 2864 n.csz = ncsz 2865 n.qn = n.csz/2 + 1 2866 2867 if ncsz > pcsz { 2868 n.debug("Expanding our clustersize: %d -> %d", pcsz, ncsz) 2869 n.lsut = time.Now() 2870 } else if ncsz < pcsz { 2871 n.debug("Decreasing our clustersize: %d -> %d", pcsz, ncsz) 2872 if n.State() == Leader { 2873 go n.sendHeartbeat() 2874 } 2875 } 2876 } 2877 2878 // Track interactions with this peer. 2879 func (n *raft) trackPeer(peer string) error { 2880 n.Lock() 2881 var needPeerAdd, isRemoved bool 2882 if n.removed != nil { 2883 _, isRemoved = n.removed[peer] 2884 } 2885 if n.State() == Leader { 2886 if lp, ok := n.peers[peer]; !ok || !lp.kp { 2887 // Check if this peer had been removed previously. 2888 needPeerAdd = !isRemoved 2889 } 2890 } 2891 if ps := n.peers[peer]; ps != nil { 2892 ps.ts = time.Now().UnixNano() 2893 } else if !isRemoved { 2894 n.peers[peer] = &lps{time.Now().UnixNano(), 0, false} 2895 } 2896 n.Unlock() 2897 2898 if needPeerAdd { 2899 n.ProposeAddPeer(peer) 2900 } 2901 return nil 2902 } 2903 2904 func (n *raft) runAsCandidate() { 2905 n.Lock() 2906 // Drain old responses. 2907 n.votes.drain() 2908 n.Unlock() 2909 2910 // Send out our request for votes. 2911 n.requestVote() 2912 2913 // We vote for ourselves. 2914 votes := map[string]struct{}{ 2915 n.ID(): {}, 2916 } 2917 2918 for { 2919 elect := n.electTimer() 2920 select { 2921 case <-n.entry.ch: 2922 n.processAppendEntries() 2923 case <-n.resp.ch: 2924 // Ignore 2925 n.resp.popOne() 2926 case <-n.s.quitCh: 2927 n.shutdown(false) 2928 return 2929 case <-n.quit: 2930 return 2931 case <-elect.C: 2932 n.switchToCandidate() 2933 return 2934 case <-n.votes.ch: 2935 // Because of drain() it is possible that we get nil from popOne(). 2936 vresp, ok := n.votes.popOne() 2937 if !ok { 2938 continue 2939 } 2940 n.RLock() 2941 nterm := n.term 2942 n.RUnlock() 2943 2944 if vresp.granted && nterm == vresp.term { 2945 // only track peers that would be our followers 2946 n.trackPeer(vresp.peer) 2947 votes[vresp.peer] = struct{}{} 2948 if n.wonElection(len(votes)) { 2949 // Become LEADER if we have won and gotten a quorum with everyone we should hear from. 2950 n.switchToLeader() 2951 return 2952 } 2953 } else if vresp.term > nterm { 2954 // if we observe a bigger term, we should start over again or risk forming a quorum fully knowing 2955 // someone with a better term exists. This is even the right thing to do if won == true. 2956 n.Lock() 2957 n.debug("Stepping down from candidate, detected higher term: %d vs %d", vresp.term, n.term) 2958 n.term = vresp.term 2959 n.vote = noVote 2960 n.writeTermVote() 2961 n.stepdown.push(noLeader) 2962 n.lxfer = false 2963 n.Unlock() 2964 } 2965 case <-n.reqs.ch: 2966 // Because of drain() it is possible that we get nil from popOne(). 
2967 if voteReq, ok := n.reqs.popOne(); ok { 2968 n.processVoteRequest(voteReq) 2969 } 2970 case <-n.stepdown.ch: 2971 if newLeader, ok := n.stepdown.popOne(); ok { 2972 n.switchToFollower(newLeader) 2973 return 2974 } 2975 } 2976 } 2977 } 2978 2979 // handleAppendEntry handles an append entry from the wire. This function 2980 // is an internal callback from the "asubj" append entry subscription. 2981 func (n *raft) handleAppendEntry(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) { 2982 msg = copyBytes(msg) 2983 if ae, err := n.decodeAppendEntry(msg, sub, reply); err == nil { 2984 // Push to the new entry channel. From here one of the worker 2985 // goroutines (runAsLeader, runAsFollower, runAsCandidate) will 2986 // pick it up. 2987 n.entry.push(ae) 2988 } else { 2989 n.warn("AppendEntry failed to be placed on internal channel: corrupt entry") 2990 } 2991 } 2992 2993 // cancelCatchup will stop an in-flight catchup by unsubscribing from the 2994 // catchup subscription. 2995 // Lock should be held. 2996 func (n *raft) cancelCatchup() { 2997 n.debug("Canceling catchup subscription since we are now up to date") 2998 2999 if n.catchup != nil && n.catchup.sub != nil { 3000 n.unsubscribe(n.catchup.sub) 3001 } 3002 n.catchup = nil 3003 } 3004 3005 // catchupStalled will try to determine if we are stalled. This is called 3006 // on a new entry from our leader. 3007 // Lock should be held. 3008 func (n *raft) catchupStalled() bool { 3009 if n.catchup == nil { 3010 return false 3011 } 3012 if n.catchup.pindex == n.pindex { 3013 return time.Since(n.catchup.active) > 2*time.Second 3014 } 3015 n.catchup.pindex = n.pindex 3016 n.catchup.active = time.Now() 3017 return false 3018 } 3019 3020 // createCatchup will create the state needed to track a catchup as it 3021 // runs. It then creates a unique inbox for this catchup and subscribes 3022 // to it. The remote side will stream entries to that subject. 3023 // Lock should be held. 3024 func (n *raft) createCatchup(ae *appendEntry) string { 3025 // Cleanup any old ones. 3026 if n.catchup != nil && n.catchup.sub != nil { 3027 n.unsubscribe(n.catchup.sub) 3028 } 3029 // Snapshot term and index. 3030 n.catchup = &catchupState{ 3031 cterm: ae.pterm, 3032 cindex: ae.pindex, 3033 pterm: n.pterm, 3034 pindex: n.pindex, 3035 active: time.Now(), 3036 } 3037 inbox := n.newCatchupInbox() 3038 sub, _ := n.subscribe(inbox, n.handleAppendEntry) 3039 n.catchup.sub = sub 3040 3041 return inbox 3042 } 3043 3044 // Truncate our WAL and reset. 3045 // Lock should be held. 3046 func (n *raft) truncateWAL(term, index uint64) { 3047 n.debug("Truncating and repairing WAL to Term %d Index %d", term, index) 3048 3049 if term == 0 && index == 0 { 3050 n.warn("Resetting WAL state") 3051 } 3052 3053 defer func() { 3054 // Check to see if we invalidated any snapshots that might have held state 3055 // from the entries we are truncating. 3056 if snap, _ := n.loadLastSnapshot(); snap != nil && snap.lastIndex >= index { 3057 os.Remove(n.snapfile) 3058 n.snapfile = _EMPTY_ 3059 } 3060 // Make sure to reset commit and applied if above 3061 if n.commit > n.pindex { 3062 n.commit = n.pindex 3063 } 3064 if n.applied > n.commit { 3065 n.applied = n.commit 3066 } 3067 }() 3068 3069 if err := n.wal.Truncate(index); err != nil { 3070 // If we get an invalid sequence, reset our wal all together. 
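// Resetting means truncating to sequence 0 and zeroing term/pterm/pindex, so
// we effectively start over with an empty log and rely on a snapshot catchup
// from the current leader to rebuild our state. Truncating to a specific
// index, by contrast, is the normal Raft conflict resolution: drop everything
// from the conflicting entry onward and let the leader resend what follows.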
3071 if err == ErrInvalidSequence { 3072 n.debug("Resetting WAL") 3073 n.wal.Truncate(0) 3074 index, n.term, n.pterm, n.pindex = 0, 0, 0, 0 3075 } else { 3076 n.warn("Error truncating WAL: %v", err) 3077 n.setWriteErrLocked(err) 3078 } 3079 return 3080 } 3081 3082 // Set after we know we have truncated properly. 3083 n.term, n.pterm, n.pindex = term, term, index 3084 } 3085 3086 // Reset our WAL. This is equivalent to truncating all data from the log. 3087 // Lock should be held. 3088 func (n *raft) resetWAL() { 3089 n.truncateWAL(0, 0) 3090 } 3091 3092 // Lock should be held 3093 func (n *raft) updateLeader(newLeader string) { 3094 n.leader = newLeader 3095 if !n.pleader && newLeader != noLeader { 3096 n.pleader = true 3097 } 3098 } 3099 3100 // processAppendEntry will process an appendEntry. This is called either 3101 // during recovery or from processAppendEntries when there are new entries 3102 // to be committed. 3103 func (n *raft) processAppendEntry(ae *appendEntry, sub *subscription) { 3104 n.Lock() 3105 // Don't reset here if we have been asked to assume leader position. 3106 if !n.lxfer { 3107 n.resetElectionTimeout() 3108 } 3109 3110 // Just return if closed or we had previous write error. 3111 if n.State() == Closed || n.werr != nil { 3112 n.Unlock() 3113 return 3114 } 3115 3116 // Scratch buffer for responses. 3117 var scratch [appendEntryResponseLen]byte 3118 arbuf := scratch[:] 3119 3120 // Are we receiving from another leader. 3121 if n.State() == Leader { 3122 // If we are the same we should step down to break the tie. 3123 if ae.term >= n.term { 3124 // If the append entry term is newer than the current term, erase our 3125 // vote. 3126 if ae.term > n.term { 3127 n.term = ae.term 3128 n.vote = noVote 3129 n.writeTermVote() 3130 } 3131 n.debug("Received append entry from another leader, stepping down to %q", ae.leader) 3132 n.stepdown.push(ae.leader) 3133 } else { 3134 // Let them know we are the leader. 3135 ar := newAppendEntryResponse(n.term, n.pindex, n.id, false) 3136 n.debug("AppendEntry ignoring old term from another leader") 3137 n.sendRPC(ae.reply, _EMPTY_, ar.encode(arbuf)) 3138 arPool.Put(ar) 3139 } 3140 // Always return here from processing. 3141 n.Unlock() 3142 return 3143 } 3144 3145 // If we received an append entry as a candidate then it would appear that 3146 // another node has taken on the leader role already, so we should convert 3147 // to a follower of that node instead. 3148 if n.State() == Candidate { 3149 // Ignore old terms, otherwise we might end up stepping down incorrectly. 3150 if ae.term >= n.term { 3151 // If the append entry term is newer than the current term, erase our 3152 // vote. 3153 if ae.term > n.term { 3154 n.term = ae.term 3155 n.vote = noVote 3156 n.writeTermVote() 3157 } 3158 n.debug("Received append entry in candidate state from %q, converting to follower", ae.leader) 3159 n.stepdown.push(ae.leader) 3160 } 3161 } 3162 3163 // Catching up state. 3164 catchingUp := n.catchup != nil 3165 // Is this a new entry? New entries will be delivered on the append entry 3166 // sub, rather than a catch-up sub. 3167 isNew := sub != nil && sub == n.aesub 3168 3169 // Track leader directly 3170 if isNew && ae.leader != noLeader { 3171 if ps := n.peers[ae.leader]; ps != nil { 3172 ps.ts = time.Now().UnixNano() 3173 } else { 3174 n.peers[ae.leader] = &lps{time.Now().UnixNano(), 0, true} 3175 } 3176 } 3177 3178 // If we are catching up ignore old catchup subs. 3179 // This could happen when we stall or cancel a catchup. 
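// Each catchup gets its own inbox subscription (see createCatchup), so an
// entry arriving on anything other than the current catchup sub or the main
// append entry sub belongs to a catchup we have since abandoned, and we drop
// it rather than risk applying stale state.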
3180 if !isNew && catchingUp && sub != n.catchup.sub { 3181 n.Unlock() 3182 n.debug("AppendEntry ignoring old entry from previous catchup") 3183 return 3184 } 3185 3186 // Check state if we are catching up. 3187 if catchingUp { 3188 if cs := n.catchup; cs != nil && n.pterm >= cs.cterm && n.pindex >= cs.cindex { 3189 // If we are here we are good, so if we have a catchup pending we can cancel. 3190 n.cancelCatchup() 3191 // Reset our notion of catching up. 3192 catchingUp = false 3193 } else if isNew { 3194 var ar *appendEntryResponse 3195 var inbox string 3196 // Check to see if we are stalled. If so recreate our catchup state and resend response. 3197 if n.catchupStalled() { 3198 n.debug("Catchup may be stalled, will request again") 3199 inbox = n.createCatchup(ae) 3200 ar = newAppendEntryResponse(n.pterm, n.pindex, n.id, false) 3201 } 3202 n.Unlock() 3203 if ar != nil { 3204 n.sendRPC(ae.reply, inbox, ar.encode(arbuf)) 3205 arPool.Put(ar) 3206 } 3207 // Ignore new while catching up or replaying. 3208 return 3209 } 3210 } 3211 3212 // If this term is greater than ours. 3213 if ae.term > n.term { 3214 n.pterm = ae.pterm 3215 n.term = ae.term 3216 n.vote = noVote 3217 if isNew { 3218 n.writeTermVote() 3219 } 3220 if n.State() != Follower { 3221 n.debug("Term higher than ours and we are not a follower: %v, stepping down to %q", n.State(), ae.leader) 3222 n.stepdown.push(ae.leader) 3223 } 3224 } 3225 3226 if isNew && n.leader != ae.leader && n.State() == Follower { 3227 n.debug("AppendEntry updating leader to %q", ae.leader) 3228 n.updateLeader(ae.leader) 3229 n.writeTermVote() 3230 n.resetElectionTimeout() 3231 n.updateLeadChange(false) 3232 } 3233 3234 if (isNew && ae.pterm != n.pterm) || ae.pindex != n.pindex { 3235 // Check if this is a lower or equal index than what we were expecting. 3236 if ae.pindex <= n.pindex { 3237 n.debug("AppendEntry detected pindex less than ours: %d:%d vs %d:%d", ae.pterm, ae.pindex, n.pterm, n.pindex) 3238 var ar *appendEntryResponse 3239 3240 var success bool 3241 if eae, _ := n.loadEntry(ae.pindex); eae == nil { 3242 // If terms are equal, and we are not catching up, we have simply already processed this message. 3243 // So we will ACK back to the leader. This can happen on server restarts based on timings of snapshots. 3244 if ae.pterm == n.pterm && !catchingUp { 3245 success = true 3246 } else { 3247 n.resetWAL() 3248 } 3249 } else { 3250 // If terms mismatched, or we got an error loading, delete that entry and all others past it. 3251 // Make sure to cancel any catchups in progress. 3252 // Truncate will reset our pterm and pindex. Only do so if we have an entry. 3253 n.truncateWAL(ae.pterm, ae.pindex) 3254 } 3255 // Cancel regardless. 3256 n.cancelCatchup() 3257 3258 // Create response. 3259 ar = newAppendEntryResponse(ae.pterm, ae.pindex, n.id, success) 3260 n.Unlock() 3261 n.sendRPC(ae.reply, _EMPTY_, ar.encode(arbuf)) 3262 arPool.Put(ar) 3263 return 3264 } 3265 3266 // Check if we are catching up. If we are here we know the leader did not have all of the entries 3267 // so make sure this is a snapshot entry. If it is not start the catchup process again since it 3268 // means we may have missed additional messages. 3269 if catchingUp { 3270 // Check if only our terms do not match here. 3271 if ae.pindex == n.pindex { 3272 // Make sure pterms match and we take on the leader's. 3273 // This prevents constant spinning. 
3274 n.truncateWAL(ae.pterm, ae.pindex) 3275 n.cancelCatchup() 3276 n.Unlock() 3277 return 3278 } 3279 // This means we already entered into a catchup state but what the leader sent us did not match what we expected. 3280 // Snapshots and peerstate will always be together when a leader is catching us up in this fashion. 3281 if len(ae.entries) != 2 || ae.entries[0].Type != EntrySnapshot || ae.entries[1].Type != EntryPeerState { 3282 n.warn("Expected first catchup entry to be a snapshot and peerstate, will retry") 3283 n.cancelCatchup() 3284 n.Unlock() 3285 return 3286 } 3287 3288 if ps, err := decodePeerState(ae.entries[1].Data); err == nil { 3289 n.processPeerState(ps) 3290 // Also need to copy from client's buffer. 3291 ae.entries[0].Data = copyBytes(ae.entries[0].Data) 3292 } else { 3293 n.warn("Could not parse snapshot peerstate correctly") 3294 n.cancelCatchup() 3295 n.Unlock() 3296 return 3297 } 3298 3299 n.pindex = ae.pindex 3300 n.pterm = ae.pterm 3301 n.commit = ae.pindex 3302 3303 if _, err := n.wal.Compact(n.pindex + 1); err != nil { 3304 n.setWriteErrLocked(err) 3305 n.Unlock() 3306 return 3307 } 3308 3309 // Now send snapshot to upper levels. Only send the snapshot, not the peerstate entry. 3310 n.apply.push(newCommittedEntry(n.commit, ae.entries[:1])) 3311 n.Unlock() 3312 return 3313 3314 } else { 3315 n.debug("AppendEntry did not match %d %d with %d %d", ae.pterm, ae.pindex, n.pterm, n.pindex) 3316 // Reset our term. 3317 n.term = n.pterm 3318 if ae.pindex > n.pindex { 3319 // Setup our state for catching up. 3320 inbox := n.createCatchup(ae) 3321 ar := newAppendEntryResponse(n.pterm, n.pindex, n.id, false) 3322 n.Unlock() 3323 n.sendRPC(ae.reply, inbox, ar.encode(arbuf)) 3324 arPool.Put(ar) 3325 return 3326 } 3327 } 3328 } 3329 3330 // Save to our WAL if we have entries. 3331 if ae.shouldStore() { 3332 // Only store if an original which will have sub != nil 3333 if sub != nil { 3334 if err := n.storeToWAL(ae); err != nil { 3335 if err != ErrStoreClosed { 3336 n.warn("Error storing entry to WAL: %v", err) 3337 } 3338 n.Unlock() 3339 return 3340 } 3341 // Save in memory for faster processing during applyCommit. 3342 // Only save so many however to avoid memory bloat. 3343 if l := len(n.pae); l <= paeDropThreshold { 3344 n.pae[n.pindex], l = ae, l+1 3345 if l > paeWarnThreshold && l%paeWarnModulo == 0 { 3346 n.warn("%d append entries pending", len(n.pae)) 3347 } 3348 } else { 3349 n.debug("Not saving to append entries pending") 3350 } 3351 } else { 3352 // This is a replay on startup so just take the appendEntry version. 3353 n.pterm = ae.term 3354 n.pindex = ae.pindex + 1 3355 } 3356 } 3357 3358 // Check to see if we have any related entries to process here. 3359 for _, e := range ae.entries { 3360 switch e.Type { 3361 case EntryLeaderTransfer: 3362 // Only process these if they are new, so no replays or catchups. 3363 if isNew { 3364 maybeLeader := string(e.Data) 3365 // This is us. We need to check if we can become the leader. 3366 if maybeLeader == n.id { 3367 // If not an observer and not paused we are good to go. 3368 if !n.observer && !n.paused { 3369 n.lxfer = true 3370 n.xferCampaign() 3371 } else if n.paused && !n.pobserver { 3372 // Here we can become a leader but need to wait for resume of the apply queue. 3373 n.lxfer = true 3374 } 3375 } else if n.vote != noVote { 3376 // Since we are here we are not the chosen one but we should clear any vote preference. 
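// Clearing any recorded vote here is meant to keep the transfer smooth: the
// named peer is about to campaign, and processVoteRequest only grants a vote
// when we have not voted or already voted for that same candidate, so
// shedding a stale preference leaves us free to say yes right away.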
3377 n.vote = noVote 3378 n.writeTermVote() 3379 } 3380 } 3381 case EntryAddPeer: 3382 if newPeer := string(e.Data); len(newPeer) == idLen { 3383 // Track directly, but wait for commit to be official 3384 if ps := n.peers[newPeer]; ps != nil { 3385 ps.ts = time.Now().UnixNano() 3386 } else { 3387 n.peers[newPeer] = &lps{time.Now().UnixNano(), 0, false} 3388 } 3389 // Store our peer in our global peer map for all peers. 3390 peers.LoadOrStore(newPeer, newPeer) 3391 } 3392 } 3393 } 3394 3395 // Apply anything we need here. 3396 if ae.commit > n.commit { 3397 if n.paused { 3398 n.hcommit = ae.commit 3399 n.debug("Paused, not applying %d", ae.commit) 3400 } else { 3401 for index := n.commit + 1; index <= ae.commit; index++ { 3402 if err := n.applyCommit(index); err != nil { 3403 break 3404 } 3405 } 3406 } 3407 } 3408 3409 var ar *appendEntryResponse 3410 if sub != nil { 3411 ar = newAppendEntryResponse(n.pterm, n.pindex, n.id, true) 3412 } 3413 n.Unlock() 3414 3415 // Success. Send our response. 3416 if ar != nil { 3417 n.sendRPC(ae.reply, _EMPTY_, ar.encode(arbuf)) 3418 arPool.Put(ar) 3419 } 3420 } 3421 3422 // processPeerState is called when a peer state entry is received 3423 // over the wire or when we're updating known peers. 3424 // Lock should be held. 3425 func (n *raft) processPeerState(ps *peerState) { 3426 // Update our version of peers to that of the leader. Calculate 3427 // the number of nodes needed to establish a quorum. 3428 n.csz = ps.clusterSize 3429 n.qn = n.csz/2 + 1 3430 3431 old := n.peers 3432 n.peers = make(map[string]*lps) 3433 for _, peer := range ps.knownPeers { 3434 if lp := old[peer]; lp != nil { 3435 lp.kp = true 3436 n.peers[peer] = lp 3437 } else { 3438 n.peers[peer] = &lps{0, 0, true} 3439 } 3440 } 3441 n.debug("Update peers from leader to %+v", n.peers) 3442 n.writePeerState(ps) 3443 } 3444 3445 // processAppendEntryResponse is called when we receive an append entry 3446 // response from another node. They will send a confirmation to tell us 3447 // whether they successfully committed the entry or not. 3448 func (n *raft) processAppendEntryResponse(ar *appendEntryResponse) { 3449 n.trackPeer(ar.peer) 3450 3451 if ar.success { 3452 // The remote node successfully committed the append entry. 3453 n.trackResponse(ar) 3454 arPool.Put(ar) 3455 } else if ar.term > n.term { 3456 // The remote node didn't commit the append entry, it looks like 3457 // they are on a newer term than we are. Step down. 3458 n.Lock() 3459 n.term = ar.term 3460 n.vote = noVote 3461 n.writeTermVote() 3462 n.warn("Detected another leader with higher term, will stepdown and reset") 3463 n.stepdown.push(noLeader) 3464 n.resetWAL() 3465 n.Unlock() 3466 arPool.Put(ar) 3467 } else if ar.reply != _EMPTY_ { 3468 // The remote node didn't commit the append entry and they are 3469 // still on the same term, so let's try to catch them up. 3470 n.catchupFollower(ar) 3471 } 3472 } 3473 3474 // handleAppendEntryResponse processes responses to append entries. 3475 func (n *raft) handleAppendEntryResponse(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) { 3476 ar := n.decodeAppendEntryResponse(msg) 3477 ar.reply = reply 3478 n.resp.push(ar) 3479 } 3480 3481 func (n *raft) buildAppendEntry(entries []*Entry) *appendEntry { 3482 return newAppendEntry(n.id, n.term, n.commit, n.pterm, n.pindex, entries) 3483 } 3484 3485 // Determine if we should store an entry. This stops us from storing 3486 // heartbeat messages. 
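// A heartbeat is simply an append entry with no entries at all (see
// sendHeartbeat/sendAppendEntry): it still resets follower election timers
// and advertises the leader's commit index, but there is nothing worth
// persisting, so it never touches the WAL and never advances pindex.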
3487 func (ae *appendEntry) shouldStore() bool { 3488 return ae != nil && len(ae.entries) > 0 3489 } 3490 3491 // Store our append entry to our WAL. 3492 // lock should be held. 3493 func (n *raft) storeToWAL(ae *appendEntry) error { 3494 if ae == nil { 3495 return fmt.Errorf("raft: Missing append entry for storage") 3496 } 3497 if n.werr != nil { 3498 return n.werr 3499 } 3500 3501 seq, _, err := n.wal.StoreMsg(_EMPTY_, nil, ae.buf) 3502 if err != nil { 3503 n.setWriteErrLocked(err) 3504 return err 3505 } 3506 3507 // Sanity checking for now. 3508 if index := ae.pindex + 1; index != seq { 3509 n.warn("Wrong index, ae is %+v, index stored was %d, n.pindex is %d, will reset", ae, seq, n.pindex) 3510 if n.State() == Leader { 3511 n.stepdown.push(n.selectNextLeader()) 3512 } 3513 // Reset and cancel any catchup. 3514 n.resetWAL() 3515 n.cancelCatchup() 3516 return errEntryStoreFailed 3517 } 3518 3519 n.pterm = ae.term 3520 n.pindex = seq 3521 return nil 3522 } 3523 3524 const ( 3525 paeDropThreshold = 20_000 3526 paeWarnThreshold = 10_000 3527 paeWarnModulo = 5_000 3528 ) 3529 3530 func (n *raft) sendAppendEntry(entries []*Entry) { 3531 n.Lock() 3532 defer n.Unlock() 3533 ae := n.buildAppendEntry(entries) 3534 3535 var err error 3536 var scratch [1024]byte 3537 ae.buf, err = ae.encode(scratch[:]) 3538 if err != nil { 3539 return 3540 } 3541 3542 // If we have entries store this in our wal. 3543 shouldStore := ae.shouldStore() 3544 if shouldStore { 3545 if err := n.storeToWAL(ae); err != nil { 3546 return 3547 } 3548 // We count ourselves. 3549 n.acks[n.pindex] = map[string]struct{}{n.id: {}} 3550 n.active = time.Now() 3551 3552 // Save in memory for faster processing during applyCommit. 3553 n.pae[n.pindex] = ae 3554 if l := len(n.pae); l > paeWarnThreshold && l%paeWarnModulo == 0 { 3555 n.warn("%d append entries pending", len(n.pae)) 3556 } 3557 } 3558 n.sendRPC(n.asubj, n.areply, ae.buf) 3559 if !shouldStore { 3560 ae.returnToPool() 3561 } 3562 } 3563 3564 type extensionState uint16 3565 3566 const ( 3567 extUndetermined = extensionState(iota) 3568 extExtended 3569 extNotExtended 3570 ) 3571 3572 type peerState struct { 3573 knownPeers []string 3574 clusterSize int 3575 domainExt extensionState 3576 } 3577 3578 func peerStateBufSize(ps *peerState) int { 3579 return 4 + 4 + (idLen * len(ps.knownPeers)) + 2 3580 } 3581 3582 func encodePeerState(ps *peerState) []byte { 3583 var le = binary.LittleEndian 3584 buf := make([]byte, peerStateBufSize(ps)) 3585 le.PutUint32(buf[0:], uint32(ps.clusterSize)) 3586 le.PutUint32(buf[4:], uint32(len(ps.knownPeers))) 3587 wi := 8 3588 for _, peer := range ps.knownPeers { 3589 copy(buf[wi:], peer) 3590 wi += idLen 3591 } 3592 le.PutUint16(buf[wi:], uint16(ps.domainExt)) 3593 return buf 3594 } 3595 3596 func decodePeerState(buf []byte) (*peerState, error) { 3597 if len(buf) < 8 { 3598 return nil, errCorruptPeers 3599 } 3600 var le = binary.LittleEndian 3601 ps := &peerState{clusterSize: int(le.Uint32(buf[0:]))} 3602 expectedPeers := int(le.Uint32(buf[4:])) 3603 buf = buf[8:] 3604 ri := 0 3605 for i, n := 0, expectedPeers; i < n && ri < len(buf); i++ { 3606 ps.knownPeers = append(ps.knownPeers, string(buf[ri:ri+idLen])) 3607 ri += idLen 3608 } 3609 if len(ps.knownPeers) != expectedPeers { 3610 return nil, errCorruptPeers 3611 } 3612 if len(buf[ri:]) >= 2 { 3613 ps.domainExt = extensionState(le.Uint16(buf[ri:])) 3614 } 3615 return ps, nil 3616 } 3617 3618 // Lock should be held. 
3619 func (n *raft) peerNames() []string { 3620 var peers []string 3621 for name, peer := range n.peers { 3622 if peer.kp { 3623 peers = append(peers, name) 3624 } 3625 } 3626 return peers 3627 } 3628 3629 func (n *raft) currentPeerState() *peerState { 3630 n.RLock() 3631 ps := &peerState{n.peerNames(), n.csz, n.extSt} 3632 n.RUnlock() 3633 return ps 3634 } 3635 3636 // sendPeerState will send our current peer state to the cluster. 3637 func (n *raft) sendPeerState() { 3638 n.sendAppendEntry([]*Entry{{EntryPeerState, encodePeerState(n.currentPeerState())}}) 3639 } 3640 3641 // Send a heartbeat. 3642 func (n *raft) sendHeartbeat() { 3643 n.sendAppendEntry(nil) 3644 } 3645 3646 type voteRequest struct { 3647 term uint64 3648 lastTerm uint64 3649 lastIndex uint64 3650 candidate string 3651 // internal only. 3652 reply string 3653 } 3654 3655 const voteRequestLen = 24 + idLen 3656 3657 func (vr *voteRequest) encode() []byte { 3658 var buf [voteRequestLen]byte 3659 var le = binary.LittleEndian 3660 le.PutUint64(buf[0:], vr.term) 3661 le.PutUint64(buf[8:], vr.lastTerm) 3662 le.PutUint64(buf[16:], vr.lastIndex) 3663 copy(buf[24:24+idLen], vr.candidate) 3664 3665 return buf[:voteRequestLen] 3666 } 3667 3668 func decodeVoteRequest(msg []byte, reply string) *voteRequest { 3669 if len(msg) != voteRequestLen { 3670 return nil 3671 } 3672 3673 var le = binary.LittleEndian 3674 return &voteRequest{ 3675 term: le.Uint64(msg[0:]), 3676 lastTerm: le.Uint64(msg[8:]), 3677 lastIndex: le.Uint64(msg[16:]), 3678 candidate: string(copyBytes(msg[24 : 24+idLen])), 3679 reply: reply, 3680 } 3681 } 3682 3683 const peerStateFile = "peers.idx" 3684 3685 // Lock should be held. 3686 func (n *raft) writePeerState(ps *peerState) { 3687 pse := encodePeerState(ps) 3688 if bytes.Equal(n.wps, pse) { 3689 return 3690 } 3691 // Stamp latest and write the peer state file. 3692 n.wps = pse 3693 if err := writePeerState(n.sd, ps); err != nil && !n.isClosed() { 3694 n.setWriteErrLocked(err) 3695 n.warn("Error writing peer state file for %q: %v", n.group, err) 3696 } 3697 } 3698 3699 // Writes out our peer state outside of a specific raft context. 3700 func writePeerState(sd string, ps *peerState) error { 3701 psf := filepath.Join(sd, peerStateFile) 3702 if _, err := os.Stat(psf); err != nil && !os.IsNotExist(err) { 3703 return err 3704 } 3705 3706 <-dios 3707 err := os.WriteFile(psf, encodePeerState(ps), defaultFilePerms) 3708 dios <- struct{}{} 3709 3710 return err 3711 } 3712 3713 func readPeerState(sd string) (ps *peerState, err error) { 3714 <-dios 3715 buf, err := os.ReadFile(filepath.Join(sd, peerStateFile)) 3716 dios <- struct{}{} 3717 3718 if err != nil { 3719 return nil, err 3720 } 3721 return decodePeerState(buf) 3722 } 3723 3724 const termVoteFile = "tav.idx" 3725 const termVoteLen = idLen + 8 3726 3727 // Writes out our term & vote outside of a specific raft context. 3728 func writeTermVote(sd string, wtv []byte) error { 3729 psf := filepath.Join(sd, termVoteFile) 3730 if _, err := os.Stat(psf); err != nil && !os.IsNotExist(err) { 3731 return err 3732 } 3733 3734 <-dios 3735 err := os.WriteFile(psf, wtv, defaultFilePerms) 3736 dios <- struct{}{} 3737 3738 return err 3739 } 3740 3741 // readTermVote will read the largest term and who we voted from to stable storage. 3742 // Lock should be held. 
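// The tav.idx layout mirrors writeTermVote: the first 8 bytes are the term as
// a little-endian uint64 and the remaining bytes are the ID of the peer we
// voted for in that term (empty if we have not voted). A file shorter than
// termVoteLen is treated as "no vote recorded" rather than as an error.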
3743 func (n *raft) readTermVote() (term uint64, voted string, err error) { 3744 <-dios 3745 buf, err := os.ReadFile(filepath.Join(n.sd, termVoteFile)) 3746 dios <- struct{}{} 3747 3748 if err != nil { 3749 return 0, noVote, err 3750 } 3751 if len(buf) < termVoteLen { 3752 return 0, noVote, nil 3753 } 3754 var le = binary.LittleEndian 3755 term = le.Uint64(buf[0:]) 3756 voted = string(buf[8:]) 3757 return term, voted, nil 3758 } 3759 3760 // Lock should be held. 3761 func (n *raft) setWriteErrLocked(err error) { 3762 // Check if we are closed already. 3763 if n.State() == Closed { 3764 return 3765 } 3766 // Ignore if already set. 3767 if n.werr == err || err == nil { 3768 return 3769 } 3770 // Ignore non-write errors. 3771 if err == ErrStoreClosed || 3772 err == ErrStoreEOF || 3773 err == ErrInvalidSequence || 3774 err == ErrStoreMsgNotFound || 3775 err == errNoPending || 3776 err == errPartialCache { 3777 return 3778 } 3779 // If this is a not found report but do not disable. 3780 if os.IsNotExist(err) { 3781 n.error("Resource not found: %v", err) 3782 return 3783 } 3784 n.error("Critical write error: %v", err) 3785 n.werr = err 3786 3787 if isOutOfSpaceErr(err) { 3788 // For now since this can be happening all under the covers, we will call up and disable JetStream. 3789 go n.s.handleOutOfSpace(nil) 3790 } 3791 } 3792 3793 // Helper to check if we are closed when we do not hold a lock already. 3794 func (n *raft) isClosed() bool { 3795 return n.State() == Closed 3796 } 3797 3798 // Capture our write error if any and hold. 3799 func (n *raft) setWriteErr(err error) { 3800 n.Lock() 3801 defer n.Unlock() 3802 n.setWriteErrLocked(err) 3803 } 3804 3805 // writeTermVote will record the largest term and who we voted for to stable storage. 3806 // Lock should be held. 3807 func (n *raft) writeTermVote() { 3808 var buf [termVoteLen]byte 3809 var le = binary.LittleEndian 3810 le.PutUint64(buf[0:], n.term) 3811 copy(buf[8:], n.vote) 3812 b := buf[:8+len(n.vote)] 3813 3814 // If the term and vote hasn't changed then don't rewrite to disk. 3815 if bytes.Equal(n.wtv, b) { 3816 return 3817 } 3818 // Stamp latest and write the term & vote file. 3819 n.wtv = b 3820 if err := writeTermVote(n.sd, n.wtv); err != nil && !n.isClosed() { 3821 n.setWriteErrLocked(err) 3822 n.warn("Error writing term and vote file for %q: %v", n.group, err) 3823 } 3824 } 3825 3826 // voteResponse is a response to a vote request. 
3827 type voteResponse struct { 3828 term uint64 3829 peer string 3830 granted bool 3831 } 3832 3833 const voteResponseLen = 8 + 8 + 1 3834 3835 func (vr *voteResponse) encode() []byte { 3836 var buf [voteResponseLen]byte 3837 var le = binary.LittleEndian 3838 le.PutUint64(buf[0:], vr.term) 3839 copy(buf[8:], vr.peer) 3840 if vr.granted { 3841 buf[16] = 1 3842 } else { 3843 buf[16] = 0 3844 } 3845 return buf[:voteResponseLen] 3846 } 3847 3848 func decodeVoteResponse(msg []byte) *voteResponse { 3849 if len(msg) != voteResponseLen { 3850 return nil 3851 } 3852 var le = binary.LittleEndian 3853 vr := &voteResponse{term: le.Uint64(msg[0:]), peer: string(msg[8:16])} 3854 vr.granted = msg[16] == 1 3855 return vr 3856 } 3857 3858 func (n *raft) handleVoteResponse(sub *subscription, c *client, _ *Account, _, reply string, msg []byte) { 3859 vr := decodeVoteResponse(msg) 3860 n.debug("Received a voteResponse %+v", vr) 3861 if vr == nil { 3862 n.error("Received malformed vote response for %q", n.group) 3863 return 3864 } 3865 3866 if state := n.State(); state != Candidate && state != Leader { 3867 n.debug("Ignoring old vote response, we have stepped down") 3868 return 3869 } 3870 3871 n.votes.push(vr) 3872 } 3873 3874 func (n *raft) processVoteRequest(vr *voteRequest) error { 3875 // To simplify calling code, we can possibly pass `nil` to this function. 3876 // If that is the case, does not consider it an error. 3877 if vr == nil { 3878 return nil 3879 } 3880 n.debug("Received a voteRequest %+v", vr) 3881 3882 if err := n.trackPeer(vr.candidate); err != nil { 3883 return err 3884 } 3885 3886 n.Lock() 3887 n.resetElectionTimeout() 3888 3889 vresp := &voteResponse{n.term, n.id, false} 3890 defer n.debug("Sending a voteResponse %+v -> %q", vresp, vr.reply) 3891 3892 // Ignore if we are newer. This is important so that we don't accidentally process 3893 // votes from a previous term if they were still in flight somewhere. 3894 if vr.term < n.term { 3895 n.Unlock() 3896 n.sendReply(vr.reply, vresp.encode()) 3897 return nil 3898 } 3899 3900 // If this is a higher term go ahead and stepdown. 3901 if vr.term > n.term { 3902 if n.State() != Follower { 3903 n.debug("Stepping down from %s, detected higher term: %d vs %d", 3904 strings.ToLower(n.State().String()), vr.term, n.term) 3905 n.stepdown.push(noLeader) 3906 n.term = vr.term 3907 } 3908 n.vote = noVote 3909 n.writeTermVote() 3910 } 3911 3912 // Only way we get to yes is through here. 
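// This is Raft's "at least as up to date" rule: grant the vote only if we
// have not already voted for someone else this term and the candidate's log
// is no older than ours, i.e. its last term is higher than our pterm, or the
// terms match and its last index is at least our pindex. For example, a
// candidate at (term 5, index 100) is rejected by a voter whose log ends at
// (term 5, index 102), but accepted by one whose log ends at (term 4, index 180).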

func (n *raft) processVoteRequest(vr *voteRequest) error {
	// To simplify calling code, we may be passed `nil` here.
	// If that is the case, we do not consider it an error.
	if vr == nil {
		return nil
	}
	n.debug("Received a voteRequest %+v", vr)

	if err := n.trackPeer(vr.candidate); err != nil {
		return err
	}

	n.Lock()
	n.resetElectionTimeout()

	vresp := &voteResponse{n.term, n.id, false}
	defer n.debug("Sending a voteResponse %+v -> %q", vresp, vr.reply)

	// Ignore if we are newer. This is important so that we don't accidentally process
	// votes from a previous term if they were still in flight somewhere.
	if vr.term < n.term {
		n.Unlock()
		n.sendReply(vr.reply, vresp.encode())
		return nil
	}

	// If this is a higher term, go ahead and step down.
	if vr.term > n.term {
		if n.State() != Follower {
			n.debug("Stepping down from %s, detected higher term: %d vs %d",
				strings.ToLower(n.State().String()), vr.term, n.term)
			n.stepdown.push(noLeader)
			n.term = vr.term
		}
		n.vote = noVote
		n.writeTermVote()
	}

	// The only way we get to yes is through here.
	voteOk := n.vote == noVote || n.vote == vr.candidate
	if voteOk && (vr.lastTerm > n.pterm || vr.lastTerm == n.pterm && vr.lastIndex >= n.pindex) {
		vresp.granted = true
		n.term = vr.term
		n.vote = vr.candidate
		n.writeTermVote()
	} else {
		if vr.term >= n.term && n.vote == noVote {
			n.term = vr.term
			n.resetElect(randCampaignTimeout())
		}
	}

	// The term might have changed, so make sure the response carries the most current one.
	vresp.term = n.term

	n.Unlock()

	n.sendReply(vr.reply, vresp.encode())

	return nil
}

func (n *raft) handleVoteRequest(sub *subscription, c *client, _ *Account, subject, reply string, msg []byte) {
	vr := decodeVoteRequest(msg, reply)
	if vr == nil {
		n.error("Received malformed vote request for %q", n.group)
		return
	}
	n.reqs.push(vr)
}

func (n *raft) requestVote() {
	n.Lock()
	if n.State() != Candidate {
		n.Unlock()
		return
	}
	n.vote = n.id
	n.writeTermVote()
	vr := voteRequest{n.term, n.pterm, n.pindex, n.id, _EMPTY_}
	subj, reply := n.vsubj, n.vreply
	n.Unlock()

	n.debug("Sending out voteRequest %+v", vr)

	// Now send it out.
	n.sendRPC(subj, reply, vr.encode())
}

func (n *raft) sendRPC(subject, reply string, msg []byte) {
	if n.sq != nil {
		n.sq.send(subject, reply, nil, msg)
	}
}

func (n *raft) sendReply(subject string, msg []byte) {
	if n.sq != nil {
		n.sq.send(subject, _EMPTY_, nil, msg)
	}
}

func (n *raft) wonElection(votes int) bool {
	return votes >= n.quorumNeeded()
}

// Return the quorum size for a given cluster config.
func (n *raft) quorumNeeded() int {
	n.RLock()
	qn := n.qn
	n.RUnlock()
	return qn
}

// Lock should be held.
func (n *raft) updateLeadChange(isLeader bool) {
	// We don't care about values that have not been consumed (transitory states),
	// so we dequeue any state that is pending and push the new one.
	for {
		select {
		case n.leadc <- isLeader:
			return
		default:
			select {
			case <-n.leadc:
			default:
				// May have been consumed by the "reader" goroutine, so go back
				// to the top of the loop and try to send again.
			}
		}
	}
}

// Lock should be held.
func (n *raft) switchState(state RaftState) {
	if n.State() == Closed {
		return
	}

	// Reset the election timer.
	n.resetElectionTimeout()

	if n.State() == Leader && state != Leader {
		n.updateLeadChange(false)
		// Drain the response queue.
		n.resp.drain()
	} else if state == Leader && n.State() != Leader {
		if len(n.pae) > 0 {
			n.pae = make(map[uint64]*appendEntry)
		}
		n.updateLeadChange(true)
	}

	n.state.Store(int32(state))
	n.writeTermVote()
}

const (
	noLeader = _EMPTY_
	noVote   = _EMPTY_
)

func (n *raft) switchToFollower(leader string) {
	if n.State() == Closed {
		return
	}

	n.Lock()
	defer n.Unlock()

	n.debug("Switching to follower")

	n.lxfer = false
	n.updateLeader(leader)
	n.switchState(Follower)
}
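
// Illustrative sketch, not part of the upstream file: wonElection above simply
// compares the number of granted votes (including our own) with the quorum size
// kept on the raft struct. Assuming the usual Raft majority of qn = csz/2 + 1
// (an assumption; the field is maintained elsewhere in this file), a 5-node
// group needs 3 granted votes to win an election. The function name is
// hypothetical.
func electionQuorumExample() {
	const csz = 5   // hypothetical cluster size
	qn := csz/2 + 1 // assumed majority rule -> 3
	for votes := 1; votes <= csz; votes++ {
		fmt.Printf("votes=%d quorum=%d won=%v\n", votes, qn, votes >= qn)
	}
}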

func (n *raft) switchToCandidate() {
	if n.State() == Closed {
		return
	}

	n.Lock()
	defer n.Unlock()

	// If we are catching up (applies are paused) or are in observer mode, we cannot switch.
	if n.observer || n.paused {
		return
	}

	if n.State() != Candidate {
		n.debug("Switching to candidate")
	} else {
		if n.lostQuorumLocked() && time.Since(n.llqrt) > 20*time.Second {
			// We signal to the upper layers so that they can alert on lost quorum.
			n.updateLeadChange(false)
			n.llqrt = time.Now()
		}
	}
	// Increment the term.
	n.term++
	// Clear the current leader.
	n.updateLeader(noLeader)
	n.switchState(Candidate)
}

func (n *raft) switchToLeader() {
	if n.State() == Closed {
		return
	}

	n.Lock()

	n.debug("Switching to leader")

	var state StreamState
	n.wal.FastState(&state)

	// Check if we have items pending as we are taking over.
	sendHB := state.LastSeq > n.commit

	n.lxfer = false
	n.updateLeader(n.id)
	n.switchState(Leader)
	n.Unlock()

	if sendHB {
		n.sendHeartbeat()
	}
}
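
// Illustrative sketch, not part of the upstream file: because updateLeadChange
// drops unconsumed transitory values before pushing the latest one, a consumer
// of LeadChangeC() only ever observes the most recent leadership state. A
// hypothetical watcher might look like this:
func watchLeadChangeExample(n RaftNode, quit <-chan struct{}) {
	for {
		select {
		case isLeader := <-n.LeadChangeC():
			if isLeader {
				// Became leader: start any leader-only work here.
			} else {
				// Lost leadership (or lost quorum): stop leader-only work here.
			}
		case <-quit:
			return
		}
	}
}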