github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/manager/state/raft/raft.go

package raft

import (
	"context"
	"fmt"
	"io"
	"math"
	"math/rand"
	"net"
	"sync"
	"sync/atomic"
	"time"

	"code.cloudfoundry.org/clock"
	"github.com/coreos/etcd/pkg/idutil"
	"github.com/coreos/etcd/raft"
	"github.com/coreos/etcd/raft/raftpb"
	"github.com/docker/docker/pkg/signal"
	"github.com/docker/go-events"
	"github.com/docker/go-metrics"
	"github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/ca"
	"github.com/docker/swarmkit/log"
	"github.com/docker/swarmkit/manager/raftselector"
	"github.com/docker/swarmkit/manager/state"
	"github.com/docker/swarmkit/manager/state/raft/membership"
	"github.com/docker/swarmkit/manager/state/raft/storage"
	"github.com/docker/swarmkit/manager/state/raft/transport"
	"github.com/docker/swarmkit/manager/state/store"
	"github.com/docker/swarmkit/watch"
	"github.com/gogo/protobuf/proto"
	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	"golang.org/x/time/rate"
	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/credentials"
	"google.golang.org/grpc/peer"
	"google.golang.org/grpc/status"
)

var (
	// ErrNoRaftMember is thrown when the node is not yet part of a raft cluster
	ErrNoRaftMember = errors.New("raft: node is not yet part of a raft cluster")
	// ErrConfChangeRefused is returned when there is an issue with the configuration change
	ErrConfChangeRefused = errors.New("raft: propose configuration change refused")
	// ErrApplyNotSpecified is returned during the creation of a raft node when no apply method was provided
	ErrApplyNotSpecified = errors.New("raft: apply method was not specified")
	// ErrSetHardState is returned when the node fails to set the hard state
	ErrSetHardState = errors.New("raft: failed to set the hard state for log append entry")
	// ErrStopped is returned when an operation was submitted but the node was stopped in the meantime
	ErrStopped = errors.New("raft: failed to process the request: node is stopped")
	// ErrLostLeadership is returned when an operation was submitted but the node lost leader status before it became committed
	ErrLostLeadership = errors.New("raft: failed to process the request: node lost leader status")
	// ErrRequestTooLarge is returned when a raft internal message is too large to be sent
	ErrRequestTooLarge = errors.New("raft: raft message is too large and can't be sent")
	// ErrCannotRemoveMember is thrown when we try to remove a member from the cluster but this would result in a loss of quorum
	ErrCannotRemoveMember = errors.New("raft: member cannot be removed, because removing it may result in loss of quorum")
	// ErrNoClusterLeader is thrown when the cluster has no elected leader
	ErrNoClusterLeader = errors.New("raft: no elected cluster leader")
	// ErrMemberUnknown is sent in response to a message from an
	// unrecognized peer.
	ErrMemberUnknown = errors.New("raft: member unknown")

	// work around lint
	lostQuorumMessage = "The swarm does not have a leader. It's possible that too few managers are online. Make sure more than half of the managers are online."
	errLostQuorum     = errors.New(lostQuorumMessage)

	// Timer to capture ProposeValue() latency.
	proposeLatencyTimer metrics.Timer
)
// LeadershipState indicates whether the node is a leader or follower.
type LeadershipState int

const (
	// IsLeader indicates that the node is a raft leader.
	IsLeader LeadershipState = iota
	// IsFollower indicates that the node is a raft follower.
	IsFollower

	// lostQuorumTimeout is the number of ticks that can elapse with no
	// leader before LeaderConn starts returning an error right away.
	lostQuorumTimeout = 10
)

// EncryptionKeys are the current and, if necessary, pending DEKs with which to
// encrypt raft data
type EncryptionKeys struct {
	CurrentDEK []byte
	PendingDEK []byte
}

// EncryptionKeyRotator is an interface to find out if any keys need rotating.
type EncryptionKeyRotator interface {
	GetKeys() EncryptionKeys
	UpdateKeys(EncryptionKeys) error
	NeedsRotation() bool
	RotationNotify() chan struct{}
}

// Node represents the Raft Node useful
// configuration.
type Node struct {
	raftNode  raft.Node
	cluster   *membership.Cluster
	transport *transport.Transport

	raftStore           *raft.MemoryStorage
	memoryStore         *store.MemoryStore
	Config              *raft.Config
	opts                NodeOptions
	reqIDGen            *idutil.Generator
	wait                *wait
	campaignWhenAble    bool
	signalledLeadership uint32
	isMember            uint32
	bootstrapMembers    []*api.RaftMember

	// waitProp waits for all the proposals to be terminated before
	// shutting down the node.
	waitProp sync.WaitGroup

	confState       raftpb.ConfState
	appliedIndex    uint64
	snapshotMeta    raftpb.SnapshotMetadata
	writtenWALIndex uint64

	ticker clock.Ticker
	doneCh chan struct{}
	// RemovedFromRaft notifies about node deletion from raft cluster
	RemovedFromRaft chan struct{}
	cancelFunc      func()
	// removeRaftCh notifies about node deletion from raft cluster
	removeRaftCh        chan struct{}
	removeRaftOnce      sync.Once
	leadershipBroadcast *watch.Queue

	// used to coordinate shutdown
	// Lock should be used only in stop(), all other functions should use RLock.
	stopMu sync.RWMutex
	// used for membership management checks
	membershipLock sync.Mutex
	// synchronizes access to n.opts.Addr, and makes sure the address is not
	// updated concurrently with JoinAndStart.
	addrLock sync.Mutex

	snapshotInProgress chan raftpb.SnapshotMetadata
	asyncTasks         sync.WaitGroup

	// stopped chan is used for notifying grpc handlers that the raft node is
	// going to stop.
	stopped chan struct{}

	raftLogger     *storage.EncryptedRaftLogger
	keyRotator     EncryptionKeyRotator
	rotationQueued bool
	clearData      bool

	// waitForAppliedIndex stores the index of the last log that was written using
	// a raft DEK during a raft DEK rotation, so that we won't finish a rotation until
	// a snapshot covering that index has been written encrypted with the new raft DEK
	waitForAppliedIndex uint64
	ticksWithNoLeader   uint32
}
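// The following is an illustrative sketch (not part of the original file) of a
// minimal EncryptionKeyRotator implementation, assuming the keys are simply
// held in memory; the real swarmkit rotator persists the DEKs alongside the
// node's TLS material.
type inMemoryKeyRotator struct {
	mu       sync.Mutex
	keys     EncryptionKeys
	notifyCh chan struct{}
}

func (r *inMemoryKeyRotator) GetKeys() EncryptionKeys {
	r.mu.Lock()
	defer r.mu.Unlock()
	return r.keys
}

func (r *inMemoryKeyRotator) UpdateKeys(k EncryptionKeys) error {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.keys = k
	return nil
}

// NeedsRotation reports whether a pending DEK is waiting to become current.
func (r *inMemoryKeyRotator) NeedsRotation() bool {
	r.mu.Lock()
	defer r.mu.Unlock()
	return r.keys.PendingDEK != nil
}

// RotationNotify returns the channel the raft node selects on to learn that a
// rotation has been requested.
func (r *inMemoryKeyRotator) RotationNotify() chan struct{} {
	return r.notifyCh
}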
// NodeOptions provides node-level options.
type NodeOptions struct {
	// ID is the node's ID, from its certificate's CN field.
	ID string
	// Addr is the address of this node's listener
	Addr string
	// ForceNewCluster defines if we have to force a new cluster
	// because we are recovering from a backup data directory.
	ForceNewCluster bool
	// JoinAddr is the cluster to join. May be an empty string to create
	// a standalone cluster.
	JoinAddr string
	// ForceJoin tells us to join even if already part of a cluster.
	ForceJoin bool
	// Config is the raft config.
	Config *raft.Config
	// StateDir is the directory to store durable state.
	StateDir string
	// TickInterval is the time interval between raft ticks.
	TickInterval time.Duration
	// ClockSource is a Clock interface to use as a time base.
	// Leave this nil except for tests that are designed not to run in real
	// time.
	ClockSource clock.Clock
	// SendTimeout is the timeout for sending messages to other raft
	// nodes. Leave this as 0 to get the default value.
	SendTimeout    time.Duration
	TLSCredentials credentials.TransportCredentials
	KeyRotator     EncryptionKeyRotator
	// DisableStackDump prevents Run from dumping goroutine stacks when the
	// store becomes stuck.
	DisableStackDump bool

	// FIPS specifies whether the raft encryption should be FIPS compliant
	FIPS bool
}

func init() {
	rand.Seed(time.Now().UnixNano())
	ns := metrics.NewNamespace("swarm", "raft", nil)
	proposeLatencyTimer = ns.NewTimer("transaction_latency", "Raft transaction latency.")
	metrics.Register(ns)
}

// NewNode generates a new Raft node
func NewNode(opts NodeOptions) *Node {
	cfg := opts.Config
	if cfg == nil {
		cfg = DefaultNodeConfig()
	}
	if opts.TickInterval == 0 {
		opts.TickInterval = time.Second
	}
	if opts.SendTimeout == 0 {
		opts.SendTimeout = 2 * time.Second
	}

	raftStore := raft.NewMemoryStorage()

	n := &Node{
		cluster:   membership.NewCluster(),
		raftStore: raftStore,
		opts:      opts,
		Config: &raft.Config{
			ElectionTick:    cfg.ElectionTick,
			HeartbeatTick:   cfg.HeartbeatTick,
			Storage:         raftStore,
			MaxSizePerMsg:   cfg.MaxSizePerMsg,
			MaxInflightMsgs: cfg.MaxInflightMsgs,
			Logger:          cfg.Logger,
			CheckQuorum:     cfg.CheckQuorum,
		},
		doneCh:              make(chan struct{}),
		RemovedFromRaft:     make(chan struct{}),
		stopped:             make(chan struct{}),
		leadershipBroadcast: watch.NewQueue(),
		keyRotator:          opts.KeyRotator,
	}
	n.memoryStore = store.NewMemoryStore(n)

	if opts.ClockSource == nil {
		n.ticker = clock.NewClock().NewTicker(opts.TickInterval)
	} else {
		n.ticker = opts.ClockSource.NewTicker(opts.TickInterval)
	}

	n.reqIDGen = idutil.NewGenerator(uint16(n.Config.ID), time.Now())
	n.wait = newWait()

	n.cancelFunc = func(n *Node) func() {
		var cancelOnce sync.Once
		return func() {
			cancelOnce.Do(func() {
				close(n.stopped)
			})
		}
	}(n)

	return n
}

// IsIDRemoved reports if member with id was removed from cluster.
// Part of transport.Raft interface.
func (n *Node) IsIDRemoved(id uint64) bool {
	return n.cluster.IsIDRemoved(id)
}

// NodeRemoved signals that node was removed from cluster and should stop.
// Part of transport.Raft interface.
func (n *Node) NodeRemoved() {
	n.removeRaftOnce.Do(func() {
		atomic.StoreUint32(&n.isMember, 0)
		close(n.RemovedFromRaft)
	})
}

// ReportSnapshot reports snapshot status to underlying raft node.
// Part of transport.Raft interface.
func (n *Node) ReportSnapshot(id uint64, status raft.SnapshotStatus) {
	n.raftNode.ReportSnapshot(id, status)
}

// ReportUnreachable reports to underlying raft node that member with id is
// unreachable.
// Part of transport.Raft interface.
func (n *Node) ReportUnreachable(id uint64) {
	n.raftNode.ReportUnreachable(id)
}
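// The sketch below (not part of the original file) illustrates the intended
// lifecycle of a Node: construct it with NodeOptions, bootstrap or join the
// cluster with JoinAndStart, then drive it with Run. The option values are
// placeholders, and the key rotator reuses the inMemoryKeyRotator sketch
// above.
func exampleBootstrapSingleNode(ctx context.Context, creds credentials.TransportCredentials) error {
	n := NewNode(NodeOptions{
		ID:             "manager-node-id", // hypothetical node ID
		Addr:           "127.0.0.1:4242",  // hypothetical listen address
		StateDir:       "/tmp/raft-state", // hypothetical state directory
		TLSCredentials: creds,
		// JoinAddr left empty: bootstrap a new single-node cluster.
		KeyRotator: &inMemoryKeyRotator{notifyCh: make(chan struct{})},
	})

	// JoinAndStart either restores local state, bootstraps a new cluster, or
	// joins an existing one via JoinAddr.
	if err := n.JoinAndStart(ctx); err != nil {
		return err
	}

	// Run blocks, processing ticks, Ready() batches, and committed entries
	// until the context is cancelled or the node is removed.
	return n.Run(ctx)
}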
// SetAddr provides the raft node's address. This can be used in cases where
// opts.Addr was not provided to NewNode, for example when a port was not bound
// until after the raft node was created.
func (n *Node) SetAddr(ctx context.Context, addr string) error {
	n.addrLock.Lock()
	defer n.addrLock.Unlock()

	n.opts.Addr = addr

	if !n.IsMember() {
		return nil
	}

	newRaftMember := &api.RaftMember{
		RaftID: n.Config.ID,
		NodeID: n.opts.ID,
		Addr:   addr,
	}
	if err := n.cluster.UpdateMember(n.Config.ID, newRaftMember); err != nil {
		return err
	}

	// If the raft node is running, submit a configuration change
	// with the new address.

	// TODO(aaronl): Currently, this node must be the leader to
	// submit this configuration change. This works for the initial
	// use cases (single-node cluster late binding ports, or calling
	// SetAddr before joining a cluster). In the future, we may want
	// to support having a follower proactively change its remote
	// address.

	leadershipCh, cancelWatch := n.SubscribeLeadership()
	defer cancelWatch()

	ctx, cancelCtx := n.WithContext(ctx)
	defer cancelCtx()

	isLeader := atomic.LoadUint32(&n.signalledLeadership) == 1
	for !isLeader {
		select {
		case leadershipChange := <-leadershipCh:
			if leadershipChange == IsLeader {
				isLeader = true
			}
		case <-ctx.Done():
			return ctx.Err()
		}
	}

	return n.updateNodeBlocking(ctx, n.Config.ID, addr)
}

// WithContext returns a context which is cancelled when the parent context is
// cancelled or the node is stopped.
func (n *Node) WithContext(ctx context.Context) (context.Context, context.CancelFunc) {
	ctx, cancel := context.WithCancel(ctx)

	go func() {
		select {
		case <-ctx.Done():
		case <-n.stopped:
			cancel()
		}
	}()
	return ctx, cancel
}

func (n *Node) initTransport() {
	transportConfig := &transport.Config{
		HeartbeatInterval: time.Duration(n.Config.ElectionTick) * n.opts.TickInterval,
		SendTimeout:       n.opts.SendTimeout,
		Credentials:       n.opts.TLSCredentials,
		Raft:              n,
	}
	n.transport = transport.New(transportConfig)
}
errors.Wrap(err, "failed to rejoin cluster") 417 } 418 } 419 n.campaignWhenAble = true 420 n.initTransport() 421 n.raftNode = raft.RestartNode(n.Config) 422 return nil 423 } 424 425 if n.opts.JoinAddr == "" { 426 // First member in the cluster, self-assign ID 427 n.Config.ID = uint64(rand.Int63()) + 1 428 peer, err := n.newRaftLogs(n.opts.ID) 429 if err != nil { 430 return err 431 } 432 n.campaignWhenAble = true 433 n.initTransport() 434 n.raftNode = raft.StartNode(n.Config, []raft.Peer{peer}) 435 return nil 436 } 437 438 // join to existing cluster 439 440 if err := n.joinCluster(ctx); err != nil { 441 return err 442 } 443 444 if _, err := n.newRaftLogs(n.opts.ID); err != nil { 445 return err 446 } 447 448 n.initTransport() 449 n.raftNode = raft.StartNode(n.Config, nil) 450 451 return nil 452 } 453 454 func (n *Node) joinCluster(ctx context.Context) error { 455 if n.opts.Addr == "" { 456 return errors.New("attempted to join raft cluster without knowing own address") 457 } 458 459 conn, err := dial(n.opts.JoinAddr, "tcp", n.opts.TLSCredentials, 10*time.Second) 460 if err != nil { 461 return err 462 } 463 defer conn.Close() 464 client := api.NewRaftMembershipClient(conn) 465 466 joinCtx, joinCancel := context.WithTimeout(ctx, n.reqTimeout()) 467 defer joinCancel() 468 resp, err := client.Join(joinCtx, &api.JoinRequest{ 469 Addr: n.opts.Addr, 470 }) 471 if err != nil { 472 return err 473 } 474 475 n.Config.ID = resp.RaftID 476 n.bootstrapMembers = resp.Members 477 return nil 478 } 479 480 // DefaultNodeConfig returns the default config for a 481 // raft node that can be modified and customized 482 func DefaultNodeConfig() *raft.Config { 483 return &raft.Config{ 484 HeartbeatTick: 1, 485 // Recommended value in etcd/raft is 10 x (HeartbeatTick). 486 // Lower values were seen to have caused instability because of 487 // frequent leader elections when running on flakey networks. 488 ElectionTick: 10, 489 MaxSizePerMsg: math.MaxUint16, 490 MaxInflightMsgs: 256, 491 Logger: log.L, 492 CheckQuorum: true, 493 } 494 } 495 496 // DefaultRaftConfig returns a default api.RaftConfig. 497 func DefaultRaftConfig() api.RaftConfig { 498 return api.RaftConfig{ 499 KeepOldSnapshots: 0, 500 SnapshotInterval: 10000, 501 LogEntriesForSlowFollowers: 500, 502 // Recommended value in etcd/raft is 10 x (HeartbeatTick). 503 // Lower values were seen to have caused instability because of 504 // frequent leader elections when running on flakey networks. 505 HeartbeatTick: 1, 506 ElectionTick: 10, 507 } 508 } 509 510 // MemoryStore returns the memory store that is kept in sync with the raft log. 511 func (n *Node) MemoryStore() *store.MemoryStore { 512 return n.memoryStore 513 } 514 515 func (n *Node) done() { 516 n.cluster.Clear() 517 518 n.ticker.Stop() 519 n.leadershipBroadcast.Close() 520 n.cluster.PeersBroadcast.Close() 521 n.memoryStore.Close() 522 if n.transport != nil { 523 n.transport.Stop() 524 } 525 526 close(n.doneCh) 527 } 528 529 // ClearData tells the raft node to delete its WALs, snapshots, and keys on 530 // shutdown. 531 func (n *Node) ClearData() { 532 n.clearData = true 533 } 534 535 // Run is the main loop for a Raft node, it goes along the state machine, 536 // acting on the messages received from other Raft nodes in the cluster. 537 // 538 // Before running the main loop, it first starts the raft node based on saved 539 // cluster state. If no saved state exists, it starts a single-node cluster. 
func (n *Node) Run(ctx context.Context) error {
	ctx = log.WithLogger(ctx, logrus.WithField("raft_id", fmt.Sprintf("%x", n.Config.ID)))
	ctx, cancel := context.WithCancel(ctx)

	for _, node := range n.bootstrapMembers {
		if err := n.registerNode(node); err != nil {
			log.G(ctx).WithError(err).Errorf("failed to register member %x", node.RaftID)
		}
	}

	defer func() {
		cancel()
		n.stop(ctx)
		if n.clearData {
			// Delete WAL and snapshots, since they are no longer
			// usable.
			if err := n.raftLogger.Clear(ctx); err != nil {
				log.G(ctx).WithError(err).Error("failed to move wal after node removal")
			}
			// clear out the DEKs
			if err := n.keyRotator.UpdateKeys(EncryptionKeys{}); err != nil {
				log.G(ctx).WithError(err).Error("could not remove DEKs")
			}
		}
		n.done()
	}()

	// Flag that indicates if this manager node is *currently* the raft leader.
	wasLeader := false
	transferLeadershipLimit := rate.NewLimiter(rate.Every(time.Minute), 1)

	for {
		select {
		case <-n.ticker.C():
			n.raftNode.Tick()

			if n.leader() == raft.None {
				atomic.AddUint32(&n.ticksWithNoLeader, 1)
			} else {
				atomic.StoreUint32(&n.ticksWithNoLeader, 0)
			}
		case rd := <-n.raftNode.Ready():
			raftConfig := n.getCurrentRaftConfig()

			// Save entries to storage
			if err := n.saveToStorage(ctx, &raftConfig, rd.HardState, rd.Entries, rd.Snapshot); err != nil {
				return errors.Wrap(err, "failed to save entries to storage")
			}

			// If the memory store lock has been held for too long,
			// transferring leadership is an easy way to break out of it.
			if wasLeader &&
				(rd.SoftState == nil || rd.SoftState.RaftState == raft.StateLeader) &&
				n.memoryStore.Wedged() &&
				transferLeadershipLimit.Allow() {
				log.G(ctx).Error("Attempting to transfer leadership")
				if !n.opts.DisableStackDump {
					signal.DumpStacks("")
				}
				transferee, err := n.transport.LongestActive()
				if err != nil {
					log.G(ctx).WithError(err).Error("failed to get longest-active member")
				} else {
					log.G(ctx).Error("data store lock held too long - transferring leadership")
					n.raftNode.TransferLeadership(ctx, n.Config.ID, transferee)
				}
			}

			for _, msg := range rd.Messages {
				// Send raft messages to peers
				if err := n.transport.Send(msg); err != nil {
					log.G(ctx).WithError(err).Error("failed to send message to member")
				}
			}

			// Apply snapshot to memory store. The snapshot
			// was applied to the raft store in
			// saveToStorage.
			if !raft.IsEmptySnap(rd.Snapshot) {
				// Load the snapshot data into the store
				if err := n.restoreFromSnapshot(ctx, rd.Snapshot.Data); err != nil {
					log.G(ctx).WithError(err).Error("failed to restore cluster from snapshot")
				}
				n.appliedIndex = rd.Snapshot.Metadata.Index
				n.snapshotMeta = rd.Snapshot.Metadata
				n.confState = rd.Snapshot.Metadata.ConfState
			}

			// If we cease to be the leader, we must cancel any
			// proposals that are currently waiting for a quorum to
			// acknowledge them. It is still possible for these to
			// become committed, but if that happens we will apply
			// them as any follower would.

			// It is important that we cancel these proposals before
			// calling processCommitted, so processCommitted does
			// not deadlock.
			if rd.SoftState != nil {
				if wasLeader && rd.SoftState.RaftState != raft.StateLeader {
					wasLeader = false
					log.G(ctx).Error("soft state changed, node no longer a leader, resetting and cancelling all waits")

					if atomic.LoadUint32(&n.signalledLeadership) == 1 {
						atomic.StoreUint32(&n.signalledLeadership, 0)
						n.leadershipBroadcast.Publish(IsFollower)
					}

					// It is important that we set n.signalledLeadership to 0
					// before calling n.wait.cancelAll. When a new raft
					// request is registered, it checks n.signalledLeadership
					// afterwards, and cancels the registration if it is 0.
					// If cancelAll was called first, this call might run
					// before the new request registers, but
					// signalledLeadership would be set after the check.
					// Setting signalledLeadership before calling cancelAll
					// ensures that if a new request is registered during
					// this transition, it will either be cancelled by
					// cancelAll, or by its own check of signalledLeadership.
					n.wait.cancelAll()
				} else if !wasLeader && rd.SoftState.RaftState == raft.StateLeader {
					// Node just became a leader.
					wasLeader = true
				}
			}

			// Process committed entries
			for _, entry := range rd.CommittedEntries {
				if err := n.processCommitted(ctx, entry); err != nil {
					log.G(ctx).WithError(err).Error("failed to process committed entries")
				}
			}

			// in case the previous attempt to update the key failed
			n.maybeMarkRotationFinished(ctx)

			// Trigger a snapshot every once in a while
			if n.snapshotInProgress == nil &&
				(n.needsSnapshot(ctx) || raftConfig.SnapshotInterval > 0 &&
					n.appliedIndex-n.snapshotMeta.Index >= raftConfig.SnapshotInterval) {
				n.triggerSnapshot(ctx, raftConfig)
			}

			if wasLeader && atomic.LoadUint32(&n.signalledLeadership) != 1 {
				// If all the entries in the log have become
				// committed, broadcast our leadership status.
				if n.caughtUp() {
					atomic.StoreUint32(&n.signalledLeadership, 1)
					n.leadershipBroadcast.Publish(IsLeader)
				}
			}

			// Advance the state machine
			n.raftNode.Advance()

			// On the first startup, or if we are the only
			// registered member after restoring from the state,
			// campaign to be the leader.
			if n.campaignWhenAble {
				members := n.cluster.Members()
				if len(members) >= 1 {
					n.campaignWhenAble = false
				}
				if len(members) == 1 && members[n.Config.ID] != nil {
					n.raftNode.Campaign(ctx)
				}
			}

		case snapshotMeta := <-n.snapshotInProgress:
			raftConfig := n.getCurrentRaftConfig()
			if snapshotMeta.Index > n.snapshotMeta.Index {
				n.snapshotMeta = snapshotMeta
				if err := n.raftLogger.GC(snapshotMeta.Index, snapshotMeta.Term, raftConfig.KeepOldSnapshots); err != nil {
					log.G(ctx).WithError(err).Error("failed to clean up old snapshots and WALs")
				}
			}
			n.snapshotInProgress = nil
			n.maybeMarkRotationFinished(ctx)
			if n.rotationQueued && n.needsSnapshot(ctx) {
				// there was a key rotation that took place before while the snapshot
				// was in progress - we have to take another snapshot and encrypt with the new key
				n.rotationQueued = false
				n.triggerSnapshot(ctx, raftConfig)
			}
		case <-n.keyRotator.RotationNotify():
			// There are 2 separate checks: rotationQueued, and n.needsSnapshot().
			// We set rotationQueued so that when we are notified of a rotation, we try to
			// do a snapshot as soon as possible.
			// However, if there is an error while doing
			// the snapshot, we don't want to hammer the node attempting to do snapshots over
			// and over. So if doing a snapshot fails, wait until the next entry comes in to
			// try again.
			switch {
			case n.snapshotInProgress != nil:
				n.rotationQueued = true
			case n.needsSnapshot(ctx):
				n.triggerSnapshot(ctx, n.getCurrentRaftConfig())
			}
		case <-ctx.Done():
			return nil
		}
	}
}

func (n *Node) restoreFromSnapshot(ctx context.Context, data []byte) error {
	snapCluster, err := n.clusterSnapshot(data)
	if err != nil {
		return err
	}

	oldMembers := n.cluster.Members()

	for _, member := range snapCluster.Members {
		delete(oldMembers, member.RaftID)
	}

	for _, removedMember := range snapCluster.Removed {
		n.cluster.RemoveMember(removedMember)
		n.transport.RemovePeer(removedMember)
		delete(oldMembers, removedMember)
	}

	for id, member := range oldMembers {
		n.cluster.ClearMember(id)
		if err := n.transport.RemovePeer(member.RaftID); err != nil {
			log.G(ctx).WithError(err).Errorf("failed to remove peer %x from transport", member.RaftID)
		}
	}
	for _, node := range snapCluster.Members {
		if err := n.registerNode(&api.RaftMember{RaftID: node.RaftID, NodeID: node.NodeID, Addr: node.Addr}); err != nil {
			log.G(ctx).WithError(err).Error("failed to register node from snapshot")
		}
	}
	return nil
}

func (n *Node) needsSnapshot(ctx context.Context) bool {
	if n.waitForAppliedIndex == 0 && n.keyRotator.NeedsRotation() {
		keys := n.keyRotator.GetKeys()
		if keys.PendingDEK != nil {
			n.raftLogger.RotateEncryptionKey(keys.PendingDEK)
			// we want to wait for the last index written with the old DEK to be committed, else a snapshot taken
			// may have an index less than the index of a WAL written with an old DEK. We want the next snapshot
			// written with the new key to supersede any WAL written with an old DEK.
			n.waitForAppliedIndex = n.writtenWALIndex
			// if there is already a snapshot at this index or higher, bump the wait index up to 1 higher than the current
			// snapshot index, because the rotation cannot be completed until the next snapshot
			if n.waitForAppliedIndex <= n.snapshotMeta.Index {
				n.waitForAppliedIndex = n.snapshotMeta.Index + 1
			}
			log.G(ctx).Debugf(
				"beginning raft DEK rotation - last indices written with the old key are (snapshot: %d, WAL: %d) - waiting for snapshot of index %d to be written before rotation can be completed", n.snapshotMeta.Index, n.writtenWALIndex, n.waitForAppliedIndex)
		}
	}

	result := n.waitForAppliedIndex > 0 && n.waitForAppliedIndex <= n.appliedIndex
	if result {
		log.G(ctx).Debugf(
			"a snapshot at index %d is needed in order to complete raft DEK rotation - a snapshot with index >= %d can now be triggered",
			n.waitForAppliedIndex, n.appliedIndex)
	}
	return result
}

func (n *Node) maybeMarkRotationFinished(ctx context.Context) {
	if n.waitForAppliedIndex > 0 && n.waitForAppliedIndex <= n.snapshotMeta.Index {
		// this means we tried to rotate - so finish the rotation
		if err := n.keyRotator.UpdateKeys(EncryptionKeys{CurrentDEK: n.raftLogger.EncryptionKey}); err != nil {
			log.G(ctx).WithError(err).Error("failed to update encryption keys after a successful rotation")
		} else {
			log.G(ctx).Debugf(
				"a snapshot with index %d is available, which completes the DEK rotation requiring a snapshot of at least index %d - throwing away DEK and older snapshots encrypted with the old key",
				n.snapshotMeta.Index, n.waitForAppliedIndex)
			n.waitForAppliedIndex = 0

			if err := n.raftLogger.GC(n.snapshotMeta.Index, n.snapshotMeta.Term, 0); err != nil {
				log.G(ctx).WithError(err).Error("failed to remove old snapshots and WALs that were written with the previous raft DEK")
			}
		}
	}
}

func (n *Node) getCurrentRaftConfig() api.RaftConfig {
	raftConfig := DefaultRaftConfig()
	n.memoryStore.View(func(readTx store.ReadTx) {
		clusters, err := store.FindClusters(readTx, store.ByName(store.DefaultClusterName))
		if err == nil && len(clusters) == 1 {
			raftConfig = clusters[0].Spec.Raft
		}
	})
	return raftConfig
}

// Cancel interrupts all ongoing proposals, and prevents new ones from
// starting. This is useful for the shutdown sequence because it allows
// the manager to shut down raft-dependent services that might otherwise
// block on shutdown if quorum isn't met. Then the raft node can be completely
// shut down once no more code is using it.
func (n *Node) Cancel() {
	n.cancelFunc()
}
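// The following sketch (not part of the original file) shows the shutdown
// ordering that the Cancel and Done comments describe, assuming Run was
// started in a separate goroutine with its own cancellable context: interrupt
// proposals first, stop raft-dependent services, then wait for the node to
// stop completely.
func exampleShutdown(ctx context.Context, n *Node, cancelRun context.CancelFunc) {
	// Unblock any callers stuck in ProposeValue and refuse new proposals.
	n.Cancel()

	// ... stop raft-dependent services here ...

	// Stop the Run loop and wait until the node has fully shut down.
	cancelRun()
	select {
	case <-n.Done():
	case <-ctx.Done():
	}
}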
// Done returns a channel which is closed when the raft node is fully stopped.
func (n *Node) Done() <-chan struct{} {
	return n.doneCh
}

func (n *Node) stop(ctx context.Context) {
	n.stopMu.Lock()
	defer n.stopMu.Unlock()

	n.Cancel()
	n.waitProp.Wait()
	n.asyncTasks.Wait()

	n.raftNode.Stop()
	n.ticker.Stop()
	n.raftLogger.Close(ctx)
	atomic.StoreUint32(&n.isMember, 0)
	// TODO(stevvooe): Handle ctx.Done()
}

// isLeader checks if we are the leader or not, without the protection of lock
func (n *Node) isLeader() bool {
	if !n.IsMember() {
		return false
	}

	if n.Status().Lead == n.Config.ID {
		return true
	}
	return false
}

// IsLeader checks if we are the leader or not, with the protection of lock
func (n *Node) IsLeader() bool {
	n.stopMu.RLock()
	defer n.stopMu.RUnlock()

	return n.isLeader()
}

// leader returns the id of the leader, without the protection of lock and
// membership check, so it is the caller's responsibility to check both.
func (n *Node) leader() uint64 {
	return n.Status().Lead
}

// Leader returns the id of the leader, with the protection of lock
func (n *Node) Leader() (uint64, error) {
	n.stopMu.RLock()
	defer n.stopMu.RUnlock()

	if !n.IsMember() {
		return raft.None, ErrNoRaftMember
	}
	leader := n.leader()
	if leader == raft.None {
		return raft.None, ErrNoClusterLeader
	}

	return leader, nil
}

// ReadyForProposals returns true if the node has broadcasted a message
// saying that it has become the leader. This means it is ready to accept
// proposals.
func (n *Node) ReadyForProposals() bool {
	return atomic.LoadUint32(&n.signalledLeadership) == 1
}

func (n *Node) caughtUp() bool {
	// obnoxious function that always returns a nil error
	lastIndex, _ := n.raftStore.LastIndex()
	return n.appliedIndex >= lastIndex
}

// Join asks a member of the raft cluster to propose
// a configuration change and add us as a member, thus
// beginning the log replication process. This method
// is called from an aspiring member to an existing member
func (n *Node) Join(ctx context.Context, req *api.JoinRequest) (*api.JoinResponse, error) {
	nodeInfo, err := ca.RemoteNode(ctx)
	if err != nil {
		return nil, err
	}

	fields := logrus.Fields{
		"node.id": nodeInfo.NodeID,
		"method":  "(*Node).Join",
		"raft_id": fmt.Sprintf("%x", n.Config.ID),
	}
	if nodeInfo.ForwardedBy != nil {
		fields["forwarder.id"] = nodeInfo.ForwardedBy.NodeID
	}
	log := log.G(ctx).WithFields(fields)
	log.Debug("")

	// can't stop the raft node while an async RPC is in progress
	n.stopMu.RLock()
	defer n.stopMu.RUnlock()

	n.membershipLock.Lock()
	defer n.membershipLock.Unlock()

	if !n.IsMember() {
		return nil, status.Errorf(codes.FailedPrecondition, "%s", ErrNoRaftMember.Error())
	}

	if !n.isLeader() {
		return nil, status.Errorf(codes.FailedPrecondition, "%s", ErrLostLeadership.Error())
	}

	remoteAddr := req.Addr

	// If the joining node sent an address like 0.0.0.0:4242, automatically
	// determine its actual address based on the GRPC connection. This
	// avoids the need for a prospective member to know its own address.
	requestHost, requestPort, err := net.SplitHostPort(remoteAddr)
	if err != nil {
		return nil, status.Errorf(codes.InvalidArgument, "invalid address %s in raft join request", remoteAddr)
	}

	requestIP := net.ParseIP(requestHost)
	if requestIP != nil && requestIP.IsUnspecified() {
		remoteHost, _, err := net.SplitHostPort(nodeInfo.RemoteAddr)
		if err != nil {
			return nil, err
		}
		remoteAddr = net.JoinHostPort(remoteHost, requestPort)
	}

	// We do not bother submitting a configuration change for the
	// new member if we can't contact it back using its address
	if err := n.checkHealth(ctx, remoteAddr, 5*time.Second); err != nil {
		return nil, err
	}

	// If the peer is already a member of the cluster, we will only update
	// its information, not add it as a new member. Adding it again would
	// cause the quorum to be computed incorrectly.
	for _, m := range n.cluster.Members() {
		if m.NodeID == nodeInfo.NodeID {
			if remoteAddr == m.Addr {
				return n.joinResponse(m.RaftID), nil
			}
			updatedRaftMember := &api.RaftMember{
				RaftID: m.RaftID,
				NodeID: m.NodeID,
				Addr:   remoteAddr,
			}
			if err := n.cluster.UpdateMember(m.RaftID, updatedRaftMember); err != nil {
				return nil, err
			}

			if err := n.updateNodeBlocking(ctx, m.RaftID, remoteAddr); err != nil {
				log.WithError(err).Error("failed to update node address")
				return nil, err
			}

			log.Info("updated node address")
			return n.joinResponse(m.RaftID), nil
		}
	}

	// Find a unique ID for the joining member.
	var raftID uint64
	for {
		raftID = uint64(rand.Int63()) + 1
		if n.cluster.GetMember(raftID) == nil && !n.cluster.IsIDRemoved(raftID) {
			break
		}
	}

	err = n.addMember(ctx, remoteAddr, raftID, nodeInfo.NodeID)
	if err != nil {
		log.WithError(err).Errorf("failed to add member %x", raftID)
		return nil, err
	}

	log.Debug("node joined")

	return n.joinResponse(raftID), nil
}

func (n *Node) joinResponse(raftID uint64) *api.JoinResponse {
	var nodes []*api.RaftMember
	for _, node := range n.cluster.Members() {
		nodes = append(nodes, &api.RaftMember{
			RaftID: node.RaftID,
			NodeID: node.NodeID,
			Addr:   node.Addr,
		})
	}

	return &api.JoinResponse{Members: nodes, RaftID: raftID}
}

// checkHealth tries to contact an aspiring member through its advertised address
// and checks if its raft server is running.
func (n *Node) checkHealth(ctx context.Context, addr string, timeout time.Duration) error {
	conn, err := dial(addr, "tcp", n.opts.TLSCredentials, timeout)
	if err != nil {
		return err
	}

	defer conn.Close()

	if timeout != 0 {
		tctx, cancel := context.WithTimeout(ctx, timeout)
		defer cancel()
		ctx = tctx
	}

	healthClient := api.NewHealthClient(conn)
	resp, err := healthClient.Check(ctx, &api.HealthCheckRequest{Service: "Raft"})
	if err != nil {
		return errors.Wrap(err, "could not connect to prospective new cluster member using its advertised address")
	}
	if resp.Status != api.HealthCheckResponse_SERVING {
		return fmt.Errorf("health check returned status %s", resp.Status.String())
	}

	return nil
}

// addMember submits a configuration change to add a new member on the raft cluster.
func (n *Node) addMember(ctx context.Context, addr string, raftID uint64, nodeID string) error {
	node := api.RaftMember{
		RaftID: raftID,
		NodeID: nodeID,
		Addr:   addr,
	}

	meta, err := node.Marshal()
	if err != nil {
		return err
	}

	cc := raftpb.ConfChange{
		Type:    raftpb.ConfChangeAddNode,
		NodeID:  raftID,
		Context: meta,
	}

	// Wait for a raft round to process the configuration change
	return n.configure(ctx, cc)
}

// updateNodeBlocking runs a synchronous job to update the node address in the whole cluster.
func (n *Node) updateNodeBlocking(ctx context.Context, id uint64, addr string) error {
	m := n.cluster.GetMember(id)
	if m == nil {
		return errors.Errorf("member %x is not found for update", id)
	}
	node := api.RaftMember{
		RaftID: m.RaftID,
		NodeID: m.NodeID,
		Addr:   addr,
	}

	meta, err := node.Marshal()
	if err != nil {
		return err
	}

	cc := raftpb.ConfChange{
		Type:    raftpb.ConfChangeUpdateNode,
		NodeID:  id,
		Context: meta,
	}

	// Wait for a raft round to process the configuration change
	return n.configure(ctx, cc)
}

// UpdateNode submits a configuration change to change a member's address.
func (n *Node) UpdateNode(id uint64, addr string) {
	ctx, cancel := n.WithContext(context.Background())
	defer cancel()
	// spawn updating info in raft in background to unblock transport
	go func() {
		if err := n.updateNodeBlocking(ctx, id, addr); err != nil {
			log.G(ctx).WithFields(logrus.Fields{"raft_id": n.Config.ID, "update_id": id}).WithError(err).Error("failed to update member address in cluster")
		}
	}()
}

// Leave asks a member of the raft cluster to remove
// us from the raft cluster. This method is called
// from a member who is willing to give up its raft
// membership to an active member of the raft cluster
func (n *Node) Leave(ctx context.Context, req *api.LeaveRequest) (*api.LeaveResponse, error) {
	if req.Node == nil {
		return nil, status.Errorf(codes.InvalidArgument, "no node information provided")
	}

	nodeInfo, err := ca.RemoteNode(ctx)
	if err != nil {
		return nil, err
	}

	ctx, cancel := n.WithContext(ctx)
	defer cancel()

	fields := logrus.Fields{
		"node.id": nodeInfo.NodeID,
		"method":  "(*Node).Leave",
		"raft_id": fmt.Sprintf("%x", n.Config.ID),
	}
	if nodeInfo.ForwardedBy != nil {
		fields["forwarder.id"] = nodeInfo.ForwardedBy.NodeID
	}
	log.G(ctx).WithFields(fields).Debug("")

	if err := n.removeMember(ctx, req.Node.RaftID); err != nil {
		return nil, err
	}

	return &api.LeaveResponse{}, nil
}

// CanRemoveMember checks if a member can be removed from
// the context of the current node.
func (n *Node) CanRemoveMember(id uint64) bool {
	members := n.cluster.Members()
	nreachable := 0 // reachable managers after removal

	for _, m := range members {
		if m.RaftID == id {
			continue
		}

		// Local node from where the remove is issued
		if m.RaftID == n.Config.ID {
			nreachable++
			continue
		}

		if n.transport.Active(m.RaftID) {
			nreachable++
		}
	}

	nquorum := (len(members)-1)/2 + 1

	return nreachable >= nquorum
}

func (n *Node) removeMember(ctx context.Context, id uint64) error {
	// can't stop the raft node while an async RPC is in progress
	n.stopMu.RLock()
	defer n.stopMu.RUnlock()

	if !n.IsMember() {
		return ErrNoRaftMember
	}

	if !n.isLeader() {
		return ErrLostLeadership
	}

	n.membershipLock.Lock()
	defer n.membershipLock.Unlock()
	if !n.CanRemoveMember(id) {
		return ErrCannotRemoveMember
	}

	cc := raftpb.ConfChange{
		ID:      id,
		Type:    raftpb.ConfChangeRemoveNode,
		NodeID:  id,
		Context: []byte(""),
	}
	return n.configure(ctx, cc)
}

// TransferLeadership attempts to transfer leadership to a different node,
// and waits for the transfer to happen.
func (n *Node) TransferLeadership(ctx context.Context) error {
	ctx, cancelTransfer := context.WithTimeout(ctx, n.reqTimeout())
	defer cancelTransfer()

	n.stopMu.RLock()
	defer n.stopMu.RUnlock()

	if !n.IsMember() {
		return ErrNoRaftMember
	}

	if !n.isLeader() {
		return ErrLostLeadership
	}

	transferee, err := n.transport.LongestActive()
	if err != nil {
		return errors.Wrap(err, "failed to get longest-active member")
	}
	start := time.Now()
	n.raftNode.TransferLeadership(ctx, n.Config.ID, transferee)
	ticker := time.NewTicker(n.opts.TickInterval / 10)
	defer ticker.Stop()
	var leader uint64
	for {
		leader = n.leader()
		if leader != raft.None && leader != n.Config.ID {
			break
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-ticker.C:
		}
	}
	log.G(ctx).Infof("raft: transfer leadership %x -> %x finished in %v", n.Config.ID, leader, time.Since(start))
	return nil
}

// RemoveMember submits a configuration change to remove a member from the raft cluster
// after checking if the operation would not result in a loss of quorum.
func (n *Node) RemoveMember(ctx context.Context, id uint64) error {
	ctx, cancel := n.WithContext(ctx)
	defer cancel()
	return n.removeMember(ctx, id)
}
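// quorumAfterRemoval is an illustrative helper (not part of the original file)
// restating the arithmetic CanRemoveMember uses: after one of `members`
// managers is removed, a majority of the remaining members must stay
// reachable. For example, with 5 managers, removal leaves 4, so
// (5-1)/2+1 = 3 of them (including the local node) must be reachable for the
// removal to be allowed.
func quorumAfterRemoval(members int) int {
	return (members-1)/2 + 1
}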
// processRaftMessageLogger is used to lazily create a logger for
// ProcessRaftMessage. Usually nothing will be logged, so it is useful to avoid
// formatting strings and allocating a logger when it won't be used.
func (n *Node) processRaftMessageLogger(ctx context.Context, msg *api.ProcessRaftMessageRequest) *logrus.Entry {
	fields := logrus.Fields{
		"method": "(*Node).ProcessRaftMessage",
	}

	if n.IsMember() {
		fields["raft_id"] = fmt.Sprintf("%x", n.Config.ID)
	}

	if msg != nil && msg.Message != nil {
		fields["from"] = fmt.Sprintf("%x", msg.Message.From)
	}

	return log.G(ctx).WithFields(fields)
}

func (n *Node) reportNewAddress(ctx context.Context, id uint64) error {
	// too early
	if !n.IsMember() {
		return nil
	}
	p, ok := peer.FromContext(ctx)
	if !ok {
		return nil
	}
	oldAddr, err := n.transport.PeerAddr(id)
	if err != nil {
		return err
	}
	if oldAddr == "" {
		// Don't know the address of the peer yet, so can't report an
		// update.
		return nil
	}
	newHost, _, err := net.SplitHostPort(p.Addr.String())
	if err != nil {
		return err
	}
	_, officialPort, err := net.SplitHostPort(oldAddr)
	if err != nil {
		return err
	}
	newAddr := net.JoinHostPort(newHost, officialPort)
	return n.transport.UpdatePeerAddr(id, newAddr)
}

// StreamRaftMessage is the server endpoint for streaming Raft messages.
// It accepts a stream of raft messages to be processed on this raft member,
// returning a StreamRaftMessageResponse when processing of the streamed
// messages is complete.
// It is called from the Raft leader, which uses it to stream messages
// to this raft member.
// A single stream corresponds to a single raft message,
// which may be disassembled and streamed by the sender
// as individual messages. Therefore, each of the messages
// received by the stream will have the same raft message type and index.
// Currently, only messages of type raftpb.MsgSnap can be disassembled, sent
// and received on the stream.
func (n *Node) StreamRaftMessage(stream api.Raft_StreamRaftMessageServer) error {
	// recvdMsg is the current message received from the stream.
	// assembledMessage is where the data from recvdMsg is appended to.
	var recvdMsg, assembledMessage *api.StreamRaftMessageRequest
	var err error

	// First message index.
	var raftMsgIndex uint64

	for {
		recvdMsg, err = stream.Recv()
		if err == io.EOF {
			break
		} else if err != nil {
			log.G(stream.Context()).WithError(err).Error("error while reading from stream")
			return err
		}

		// Initialize the message to be used for assembling
		// the raft message.
		if assembledMessage == nil {
			// For all message types except raftpb.MsgSnap,
			// we don't expect more than a single message
			// on the stream so we'll get an EOF on the next Recv()
			// and go on to process the received message.
			assembledMessage = recvdMsg
			raftMsgIndex = recvdMsg.Message.Index
			continue
		}

		// Verify raft message index.
		if recvdMsg.Message.Index != raftMsgIndex {
			errMsg := fmt.Sprintf("Raft message chunk with index %d is different from the previously received raft message index %d",
				recvdMsg.Message.Index, raftMsgIndex)
			log.G(stream.Context()).Errorf(errMsg)
			return status.Errorf(codes.InvalidArgument, "%s", errMsg)
		}

		// Verify that multiple messages received on a stream
		// can only be of type raftpb.MsgSnap.
		if recvdMsg.Message.Type != raftpb.MsgSnap {
			errMsg := fmt.Sprintf("Raft message chunk is not of type %d",
				raftpb.MsgSnap)
			log.G(stream.Context()).Errorf(errMsg)
			return status.Errorf(codes.InvalidArgument, "%s", errMsg)
		}

		// Append the received snapshot data.
		assembledMessage.Message.Snapshot.Data = append(assembledMessage.Message.Snapshot.Data, recvdMsg.Message.Snapshot.Data...)
	}

	// We should have the complete snapshot. Verify and process.
	if err == io.EOF {
		_, err = n.ProcessRaftMessage(stream.Context(), &api.ProcessRaftMessageRequest{Message: assembledMessage.Message})
		if err == nil {
			// Translate the response of ProcessRaftMessage() from
			// ProcessRaftMessageResponse to StreamRaftMessageResponse if needed.
			return stream.SendAndClose(&api.StreamRaftMessageResponse{})
		}
	}

	return err
}

// ProcessRaftMessage calls 'Step' which advances the
// raft state machine with the provided message on the
// receiving node
func (n *Node) ProcessRaftMessage(ctx context.Context, msg *api.ProcessRaftMessageRequest) (*api.ProcessRaftMessageResponse, error) {
	if msg == nil || msg.Message == nil {
		n.processRaftMessageLogger(ctx, msg).Debug("received empty message")
		return &api.ProcessRaftMessageResponse{}, nil
	}

	// Don't process the message if this comes from
	// a node in the remove set
	if n.cluster.IsIDRemoved(msg.Message.From) {
		n.processRaftMessageLogger(ctx, msg).Debug("received message from removed member")
		return nil, status.Errorf(codes.NotFound, "%s", membership.ErrMemberRemoved.Error())
	}

	ctx, cancel := n.WithContext(ctx)
	defer cancel()

	// TODO(aaronl): Address changes are temporarily disabled.
	// See https://github.com/docker/docker/issues/30455.
	// This should be reenabled in the future with additional
	// safeguards (perhaps storing multiple addresses per node).
	//if err := n.reportNewAddress(ctx, msg.Message.From); err != nil {
	//	log.G(ctx).WithError(err).Errorf("failed to report new address of %x to transport", msg.Message.From)
	//}

	// Reject vote requests from unreachable peers
	if msg.Message.Type == raftpb.MsgVote {
		member := n.cluster.GetMember(msg.Message.From)
		if member == nil {
			n.processRaftMessageLogger(ctx, msg).Debug("received message from unknown member")
			return &api.ProcessRaftMessageResponse{}, nil
		}

		if err := n.transport.HealthCheck(ctx, msg.Message.From); err != nil {
			n.processRaftMessageLogger(ctx, msg).WithError(err).Debug("member which sent vote request failed health check")
			return &api.ProcessRaftMessageResponse{}, nil
		}
	}

	if msg.Message.Type == raftpb.MsgProp {
		// We don't accept forwarded proposals. Our
		// current architecture depends on only the leader
		// making proposals, so in-flight proposals can be
		// guaranteed not to conflict.
		n.processRaftMessageLogger(ctx, msg).Debug("dropped forwarded proposal")
		return &api.ProcessRaftMessageResponse{}, nil
	}

	// can't stop the raft node while an async RPC is in progress
	n.stopMu.RLock()
	defer n.stopMu.RUnlock()

	if n.IsMember() {
		if msg.Message.To != n.Config.ID {
			n.processRaftMessageLogger(ctx, msg).Errorf("received message intended for raft_id %x", msg.Message.To)
			return &api.ProcessRaftMessageResponse{}, nil
		}

		if err := n.raftNode.Step(ctx, *msg.Message); err != nil {
			n.processRaftMessageLogger(ctx, msg).WithError(err).Debug("raft Step failed")
		}
	}

	return &api.ProcessRaftMessageResponse{}, nil
}

// ResolveAddress returns the address where the node with the given raft ID can be reached.
func (n *Node) ResolveAddress(ctx context.Context, msg *api.ResolveAddressRequest) (*api.ResolveAddressResponse, error) {
	if !n.IsMember() {
		return nil, ErrNoRaftMember
	}

	nodeInfo, err := ca.RemoteNode(ctx)
	if err != nil {
		return nil, err
	}

	fields := logrus.Fields{
		"node.id": nodeInfo.NodeID,
		"method":  "(*Node).ResolveAddress",
		"raft_id": fmt.Sprintf("%x", n.Config.ID),
	}
	if nodeInfo.ForwardedBy != nil {
		fields["forwarder.id"] = nodeInfo.ForwardedBy.NodeID
	}
	log.G(ctx).WithFields(fields).Debug("")

	member := n.cluster.GetMember(msg.RaftID)
	if member == nil {
		return nil, status.Errorf(codes.NotFound, "member %x not found", msg.RaftID)
	}
	return &api.ResolveAddressResponse{Addr: member.Addr}, nil
}

func (n *Node) getLeaderConn() (*grpc.ClientConn, error) {
	leader, err := n.Leader()
	if err != nil {
		return nil, err
	}

	if leader == n.Config.ID {
		return nil, raftselector.ErrIsLeader
	}
	conn, err := n.transport.PeerConn(leader)
	if err != nil {
		return nil, errors.Wrap(err, "failed to get connection to leader")
	}
	return conn, nil
}

// LeaderConn returns the current connection to the cluster leader, or
// raftselector.ErrIsLeader if the current machine is the leader.
func (n *Node) LeaderConn(ctx context.Context) (*grpc.ClientConn, error) {
	cc, err := n.getLeaderConn()
	if err == nil {
		return cc, nil
	}
	if err == raftselector.ErrIsLeader {
		return nil, err
	}
	if atomic.LoadUint32(&n.ticksWithNoLeader) > lostQuorumTimeout {
		return nil, errLostQuorum
	}

	ticker := time.NewTicker(1 * time.Second)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			cc, err := n.getLeaderConn()
			if err == nil {
				return cc, nil
			}
			if err == raftselector.ErrIsLeader {
				return nil, err
			}
		case <-ctx.Done():
			return nil, ctx.Err()
		}
	}
}
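// The sketch below (not part of the original file) shows how a caller can use
// LeaderConn to forward a request to the current leader; the Join RPC is used
// here purely as an example of a client built on the returned connection.
func exampleForwardToLeader(ctx context.Context, n *Node, req *api.JoinRequest) (*api.JoinResponse, error) {
	conn, err := n.LeaderConn(ctx)
	if err == raftselector.ErrIsLeader {
		// This node is the leader; handle the request locally instead.
		return n.Join(ctx, req)
	}
	if err != nil {
		return nil, err
	}
	return api.NewRaftMembershipClient(conn).Join(ctx, req)
}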
// registerNode registers a new node on the cluster memberlist
func (n *Node) registerNode(node *api.RaftMember) error {
	if n.cluster.IsIDRemoved(node.RaftID) {
		return nil
	}

	member := &membership.Member{}

	existingMember := n.cluster.GetMember(node.RaftID)
	if existingMember != nil {
		// Member already exists

		// If the address is different from what we thought it was,
		// update it. This can happen if we just joined a cluster
		// and are adding ourself now with the remotely-reachable
		// address.
		if existingMember.Addr != node.Addr {
			if node.RaftID != n.Config.ID {
				if err := n.transport.UpdatePeer(node.RaftID, node.Addr); err != nil {
					return err
				}
			}
			member.RaftMember = node
			n.cluster.AddMember(member)
		}

		return nil
	}

	// Avoid opening a connection to the local node
	if node.RaftID != n.Config.ID {
		if err := n.transport.AddPeer(node.RaftID, node.Addr); err != nil {
			return err
		}
	}

	member.RaftMember = node
	err := n.cluster.AddMember(member)
	if err != nil {
		if rerr := n.transport.RemovePeer(node.RaftID); rerr != nil {
			return errors.Wrapf(rerr, "failed to remove peer after error %v", err)
		}
		return err
	}

	return nil
}

// ProposeValue calls Propose on the underlying raft library (etcd/raft) and waits
// on the commit log action before returning a result
func (n *Node) ProposeValue(ctx context.Context, storeAction []api.StoreAction, cb func()) error {
	defer metrics.StartTimer(proposeLatencyTimer)()
	ctx, cancel := n.WithContext(ctx)
	defer cancel()
	_, err := n.processInternalRaftRequest(ctx, &api.InternalRaftRequest{Action: storeAction}, cb)

	return err
}

// GetVersion returns the sequence information for the current raft round.
func (n *Node) GetVersion() *api.Version {
	n.stopMu.RLock()
	defer n.stopMu.RUnlock()

	if !n.IsMember() {
		return nil
	}

	status := n.Status()
	return &api.Version{Index: status.Commit}
}

// ChangesBetween returns the changes starting after "from", up to and
// including "to". If these changes are not available because the log
// has been compacted, an error will be returned.
func (n *Node) ChangesBetween(from, to api.Version) ([]state.Change, error) {
	n.stopMu.RLock()
	defer n.stopMu.RUnlock()

	if from.Index > to.Index {
		return nil, errors.New("versions are out of order")
	}

	if !n.IsMember() {
		return nil, ErrNoRaftMember
	}

	// never returns error
	last, _ := n.raftStore.LastIndex()

	if to.Index > last {
		return nil, errors.New("last version is out of bounds")
	}

	pbs, err := n.raftStore.Entries(from.Index+1, to.Index+1, math.MaxUint64)
	if err != nil {
		return nil, err
	}

	var changes []state.Change
	for _, pb := range pbs {
		if pb.Type != raftpb.EntryNormal || pb.Data == nil {
			continue
		}
		r := &api.InternalRaftRequest{}
		err := proto.Unmarshal(pb.Data, r)
		if err != nil {
			return nil, errors.Wrap(err, "error unmarshalling internal raft request")
		}

		if r.Action != nil {
			changes = append(changes, state.Change{StoreActions: r.Action, Version: api.Version{Index: pb.Index}})
		}
	}

	return changes, nil
}

// SubscribePeers subscribes to peer updates in the cluster. It always sends
// the full list of peers.
func (n *Node) SubscribePeers() (q chan events.Event, cancel func()) {
	return n.cluster.PeersBroadcast.Watch()
}
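// Illustrative sketch (not part of the original file) of consuming the
// SubscribePeers watch channel; each event signals that a new full peer list
// has been published, and a real consumer would type-assert the event payload.
func examplePeerWatcher(ctx context.Context, n *Node) {
	q, cancel := n.SubscribePeers()
	defer cancel()
	for {
		select {
		case <-q:
			// A peer-list update was published by the membership cluster.
			log.G(ctx).Debug("raft peer list updated")
		case <-ctx.Done():
			return
		}
	}
}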
// GetMemberlist returns the current list of raft members in the cluster.
func (n *Node) GetMemberlist() map[uint64]*api.RaftMember {
	memberlist := make(map[uint64]*api.RaftMember)
	members := n.cluster.Members()
	leaderID, err := n.Leader()
	if err != nil {
		leaderID = raft.None
	}

	for id, member := range members {
		reachability := api.RaftMemberStatus_REACHABLE
		leader := false

		if member.RaftID != n.Config.ID {
			if !n.transport.Active(member.RaftID) {
				reachability = api.RaftMemberStatus_UNREACHABLE
			}
		}

		if member.RaftID == leaderID {
			leader = true
		}

		memberlist[id] = &api.RaftMember{
			RaftID: member.RaftID,
			NodeID: member.NodeID,
			Addr:   member.Addr,
			Status: api.RaftMemberStatus{
				Leader:       leader,
				Reachability: reachability,
			},
		}
	}

	return memberlist
}

// Status returns status of underlying etcd.Node.
func (n *Node) Status() raft.Status {
	return n.raftNode.Status()
}

// GetMemberByNodeID returns member information based
// on its generic Node ID.
func (n *Node) GetMemberByNodeID(nodeID string) *membership.Member {
	members := n.cluster.Members()
	for _, member := range members {
		if member.NodeID == nodeID {
			return member
		}
	}
	return nil
}

// GetNodeIDByRaftID returns the generic Node ID of a member given its raft ID.
// It returns ErrMemberUnknown if the raft ID is unknown.
func (n *Node) GetNodeIDByRaftID(raftID uint64) (string, error) {
	if member, ok := n.cluster.Members()[raftID]; ok {
		return member.NodeID, nil
	}
	// this is the only possible error value that should be returned; the
	// manager code depends on this. if you need to add more errors later, make
	// sure that you update the callers of this method accordingly
	return "", ErrMemberUnknown
}

// IsMember checks if the raft node has effectively joined
// a cluster of existing members.
func (n *Node) IsMember() bool {
	return atomic.LoadUint32(&n.isMember) == 1
}

// Saves a log entry to our Store
func (n *Node) saveToStorage(
	ctx context.Context,
	raftConfig *api.RaftConfig,
	hardState raftpb.HardState,
	entries []raftpb.Entry,
	snapshot raftpb.Snapshot,
) (err error) {

	if !raft.IsEmptySnap(snapshot) {
		if err := n.raftLogger.SaveSnapshot(snapshot); err != nil {
			return errors.Wrap(err, "failed to save snapshot")
		}
		if err := n.raftLogger.GC(snapshot.Metadata.Index, snapshot.Metadata.Term, raftConfig.KeepOldSnapshots); err != nil {
			log.G(ctx).WithError(err).Error("unable to clean old snapshots and WALs")
		}
		if err = n.raftStore.ApplySnapshot(snapshot); err != nil {
			return errors.Wrap(err, "failed to apply snapshot on raft node")
		}
	}

	if err := n.raftLogger.SaveEntries(hardState, entries); err != nil {
		return errors.Wrap(err, "failed to save raft log entries")
	}

	if len(entries) > 0 {
		lastIndex := entries[len(entries)-1].Index
		if lastIndex > n.writtenWALIndex {
			n.writtenWALIndex = lastIndex
		}
	}

	if err = n.raftStore.Append(entries); err != nil {
		return errors.Wrap(err, "failed to append raft log entries")
	}

	return nil
}

// processInternalRaftRequest proposes a value to be appended to the raft log.
1771 // It calls Propose() on etcd/raft, which calls back into the raft FSM,
1772 // which then sends a message to each of the participating nodes
1773 // in the raft group to apply a log entry, and then waits for it to be applied
1774 // on this node. It will block until:
1775 // 1. this node gets the necessary replies back from the participating nodes and also performs the commit itself, or
1776 // 2. there is an error, or
1777 // 3. the raft node finalizes all the proposals on node shutdown.
1778 func (n *Node) processInternalRaftRequest(ctx context.Context, r *api.InternalRaftRequest, cb func()) (proto.Message, error) {
1779 n.stopMu.RLock()
1780 if !n.IsMember() {
1781 n.stopMu.RUnlock()
1782 return nil, ErrStopped
1783 }
1784 n.waitProp.Add(1)
1785 defer n.waitProp.Done()
1786 n.stopMu.RUnlock()
1787
1788 r.ID = n.reqIDGen.Next()
1789
1790 // This must be derived from the context which is cancelled by stop()
1791 // to avoid a deadlock on shutdown.
1792 waitCtx, cancel := context.WithCancel(ctx)
1793
1794 ch := n.wait.register(r.ID, cb, cancel)
1795
1796 // Do this check after calling register to avoid a race.
1797 if atomic.LoadUint32(&n.signalledLeadership) != 1 {
1798 log.G(ctx).Error("node is no longer leader, aborting propose")
1799 n.wait.cancel(r.ID)
1800 return nil, ErrLostLeadership
1801 }
1802
1803 data, err := r.Marshal()
1804 if err != nil {
1805 n.wait.cancel(r.ID)
1806 return nil, err
1807 }
1808
1809 if len(data) > store.MaxTransactionBytes {
1810 n.wait.cancel(r.ID)
1811 return nil, ErrRequestTooLarge
1812 }
1813
1814 err = n.raftNode.Propose(waitCtx, data)
1815 if err != nil {
1816 n.wait.cancel(r.ID)
1817 return nil, err
1818 }
1819
1820 select {
1821 case x, ok := <-ch:
1822 if !ok {
1823 // Wait notification channel was closed. This should only happen if the wait was cancelled.
1824 log.G(ctx).Error("wait cancelled")
1825 if atomic.LoadUint32(&n.signalledLeadership) == 1 {
1826 log.G(ctx).Error("wait cancelled but node is still a leader")
1827 }
1828 return nil, ErrLostLeadership
1829 }
1830 return x.(proto.Message), nil
1831 case <-waitCtx.Done():
1832 n.wait.cancel(r.ID)
1833 // If we can read from the channel, wait item was triggered. Otherwise it was cancelled.
1834 x, ok := <-ch
1835 if !ok {
1836 log.G(ctx).WithError(waitCtx.Err()).Error("wait context cancelled")
1837 if atomic.LoadUint32(&n.signalledLeadership) == 1 {
1838 log.G(ctx).Error("wait context cancelled but node is still a leader")
1839 }
1840 return nil, ErrLostLeadership
1841 }
1842 return x.(proto.Message), nil
1843 case <-ctx.Done():
1844 n.wait.cancel(r.ID)
1845 // If the channel is closed, the wait item was cancelled; otherwise it was triggered.
1846 x, ok := <-ch
1847 if !ok {
1848 return nil, ctx.Err()
1849 }
1850 return x.(proto.Message), nil
1851 }
1852 }
1853
1854 // configure sends a configuration change through consensus and
1855 // then waits for it to be applied to the server. It will block
1856 // until the change is performed or there is an error.
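// The ConfChange is assigned an ID from the request ID generator and a wait
// is registered under that ID; the value delivered on the wait channel is nil
// on success, or the error that caused the change to be rejected.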
1857 func (n *Node) configure(ctx context.Context, cc raftpb.ConfChange) error { 1858 cc.ID = n.reqIDGen.Next() 1859 1860 ctx, cancel := context.WithCancel(ctx) 1861 ch := n.wait.register(cc.ID, nil, cancel) 1862 1863 if err := n.raftNode.ProposeConfChange(ctx, cc); err != nil { 1864 n.wait.cancel(cc.ID) 1865 return err 1866 } 1867 1868 select { 1869 case x := <-ch: 1870 if err, ok := x.(error); ok { 1871 return err 1872 } 1873 if x != nil { 1874 log.G(ctx).Panic("raft: configuration change error, return type should always be error") 1875 } 1876 return nil 1877 case <-ctx.Done(): 1878 n.wait.cancel(cc.ID) 1879 return ctx.Err() 1880 } 1881 } 1882 1883 func (n *Node) processCommitted(ctx context.Context, entry raftpb.Entry) error { 1884 // Process a normal entry 1885 if entry.Type == raftpb.EntryNormal && entry.Data != nil { 1886 if err := n.processEntry(ctx, entry); err != nil { 1887 return err 1888 } 1889 } 1890 1891 // Process a configuration change (add/remove node) 1892 if entry.Type == raftpb.EntryConfChange { 1893 n.processConfChange(ctx, entry) 1894 } 1895 1896 n.appliedIndex = entry.Index 1897 return nil 1898 } 1899 1900 func (n *Node) processEntry(ctx context.Context, entry raftpb.Entry) error { 1901 r := &api.InternalRaftRequest{} 1902 err := proto.Unmarshal(entry.Data, r) 1903 if err != nil { 1904 return err 1905 } 1906 1907 if !n.wait.trigger(r.ID, r) { 1908 // There was no wait on this ID, meaning we don't have a 1909 // transaction in progress that would be committed to the 1910 // memory store by the "trigger" call. This could mean that: 1911 // 1. Startup is in progress, and the raft WAL is being parsed, 1912 // processed and applied to the store, or 1913 // 2. Either a different node wrote this to raft, 1914 // or we wrote it before losing the leader 1915 // position and cancelling the transaction. This entry still needs 1916 // to be committed since other nodes have already committed it. 1917 // Create a new transaction to commit this entry. 1918 1919 // It should not be possible for processInternalRaftRequest 1920 // to be running in this situation, but out of caution we 1921 // cancel any current invocations to avoid a deadlock. 1922 // TODO(anshul) This call is likely redundant, remove after consideration. 
1923 n.wait.cancelAll()
1924
1925 err := n.memoryStore.ApplyStoreActions(r.Action)
1926 if err != nil {
1927 log.G(ctx).WithError(err).Error("failed to apply actions from raft")
1928 }
1929 }
1930 return nil
1931 }
1932
1933 func (n *Node) processConfChange(ctx context.Context, entry raftpb.Entry) {
1934 var (
1935 err error
1936 cc raftpb.ConfChange
1937 )
1938
1939 if err := proto.Unmarshal(entry.Data, &cc); err != nil {
1940 n.wait.trigger(cc.ID, err)
1941 }
1942
1943 if err := n.cluster.ValidateConfigurationChange(cc); err != nil {
1944 n.wait.trigger(cc.ID, err)
1945 }
1946
1947 switch cc.Type {
1948 case raftpb.ConfChangeAddNode:
1949 err = n.applyAddNode(cc)
1950 case raftpb.ConfChangeUpdateNode:
1951 err = n.applyUpdateNode(ctx, cc)
1952 case raftpb.ConfChangeRemoveNode:
1953 err = n.applyRemoveNode(ctx, cc)
1954 }
1955
1956 if err != nil {
1957 n.wait.trigger(cc.ID, err)
1958 }
1959
1960 n.confState = *n.raftNode.ApplyConfChange(cc)
1961 n.wait.trigger(cc.ID, nil)
1962 }
1963
1964 // applyAddNode is called when we receive a ConfChange
1965 // from a member in the raft cluster; it adds a new
1966 // node to the existing raft cluster.
1967 func (n *Node) applyAddNode(cc raftpb.ConfChange) error {
1968 member := &api.RaftMember{}
1969 err := proto.Unmarshal(cc.Context, member)
1970 if err != nil {
1971 return err
1972 }
1973
1974 // ID must be non-zero
1975 if member.RaftID == 0 {
1976 return nil
1977 }
1978
1979 return n.registerNode(member)
1980 }
1981
1982 // applyUpdateNode is called when we receive a ConfChange from a member in the
1983 // raft cluster which updates the address of an existing node.
1984 func (n *Node) applyUpdateNode(ctx context.Context, cc raftpb.ConfChange) error {
1985 newMember := &api.RaftMember{}
1986 err := proto.Unmarshal(cc.Context, newMember)
1987 if err != nil {
1988 return err
1989 }
1990
1991 if newMember.RaftID == n.Config.ID {
1992 return nil
1993 }
1994 if err := n.transport.UpdatePeer(newMember.RaftID, newMember.Addr); err != nil {
1995 return err
1996 }
1997 return n.cluster.UpdateMember(newMember.RaftID, newMember)
1998 }
1999
2000 // applyRemoveNode is called when we receive a ConfChange
2001 // from a member in the raft cluster; it removes a node
2002 // from the existing raft cluster.
2003 func (n *Node) applyRemoveNode(ctx context.Context, cc raftpb.ConfChange) (err error) {
2004 // If the node being removed from the cluster is the
2005 // current leader and this node is a follower, Campaign
2006 // to become the leader.
2007
2008 if cc.NodeID == n.leader() && !n.isLeader() {
2009 if err = n.raftNode.Campaign(ctx); err != nil {
2010 return err
2011 }
2012 }
2013
2014 if cc.NodeID == n.Config.ID {
2015 // wait for the commit ack to be sent before closing connection
2016 n.asyncTasks.Wait()
2017
2018 n.NodeRemoved()
2019 } else if err := n.transport.RemovePeer(cc.NodeID); err != nil {
2020 return err
2021 }
2022
2023 return n.cluster.RemoveMember(cc.NodeID)
2024 }
2025
2026 // SubscribeLeadership returns a channel to which events about leadership
2027 // changes will be sent in the form of raft.LeadershipState. A cancel func
2028 // is also returned; it should be called when the listener is no longer interested in events.
2029 func (n *Node) SubscribeLeadership() (q chan events.Event, cancel func()) {
2030 return n.leadershipBroadcast.Watch()
2031 }
2032
2033 // createConfigChangeEnts creates a series of Raft entries (i.e.
2034 // EntryConfChange) to remove the set of given IDs from the cluster. The ID
2035 // `self` is _not_ removed, even if present in the set.
2036 // If `self` is not inside the given ids, it creates a Raft entry to add a 2037 // default member with the given `self`. 2038 func createConfigChangeEnts(ids []uint64, self uint64, term, index uint64) []raftpb.Entry { 2039 var ents []raftpb.Entry 2040 next := index + 1 2041 found := false 2042 for _, id := range ids { 2043 if id == self { 2044 found = true 2045 continue 2046 } 2047 cc := &raftpb.ConfChange{ 2048 Type: raftpb.ConfChangeRemoveNode, 2049 NodeID: id, 2050 } 2051 data, err := cc.Marshal() 2052 if err != nil { 2053 log.L.WithError(err).Panic("marshal configuration change should never fail") 2054 } 2055 e := raftpb.Entry{ 2056 Type: raftpb.EntryConfChange, 2057 Data: data, 2058 Term: term, 2059 Index: next, 2060 } 2061 ents = append(ents, e) 2062 next++ 2063 } 2064 if !found { 2065 node := &api.RaftMember{RaftID: self} 2066 meta, err := node.Marshal() 2067 if err != nil { 2068 log.L.WithError(err).Panic("marshal member should never fail") 2069 } 2070 cc := &raftpb.ConfChange{ 2071 Type: raftpb.ConfChangeAddNode, 2072 NodeID: self, 2073 Context: meta, 2074 } 2075 data, err := cc.Marshal() 2076 if err != nil { 2077 log.L.WithError(err).Panic("marshal configuration change should never fail") 2078 } 2079 e := raftpb.Entry{ 2080 Type: raftpb.EntryConfChange, 2081 Data: data, 2082 Term: term, 2083 Index: next, 2084 } 2085 ents = append(ents, e) 2086 } 2087 return ents 2088 } 2089 2090 // getIDs returns an ordered set of IDs included in the given snapshot and 2091 // the entries. The given snapshot/entries can contain two kinds of 2092 // ID-related entry: 2093 // - ConfChangeAddNode, in which case the contained ID will be added into the set. 2094 // - ConfChangeRemoveNode, in which case the contained ID will be removed from the set. 2095 func getIDs(snap *raftpb.Snapshot, ents []raftpb.Entry) []uint64 { 2096 ids := make(map[uint64]struct{}) 2097 if snap != nil { 2098 for _, id := range snap.Metadata.ConfState.Nodes { 2099 ids[id] = struct{}{} 2100 } 2101 } 2102 for _, e := range ents { 2103 if e.Type != raftpb.EntryConfChange { 2104 continue 2105 } 2106 if snap != nil && e.Index < snap.Metadata.Index { 2107 continue 2108 } 2109 var cc raftpb.ConfChange 2110 if err := cc.Unmarshal(e.Data); err != nil { 2111 log.L.WithError(err).Panic("unmarshal configuration change should never fail") 2112 } 2113 switch cc.Type { 2114 case raftpb.ConfChangeAddNode: 2115 ids[cc.NodeID] = struct{}{} 2116 case raftpb.ConfChangeRemoveNode: 2117 delete(ids, cc.NodeID) 2118 case raftpb.ConfChangeUpdateNode: 2119 // do nothing 2120 default: 2121 log.L.Panic("ConfChange Type should be either ConfChangeAddNode, or ConfChangeRemoveNode, or ConfChangeUpdateNode!") 2122 } 2123 } 2124 var sids []uint64 2125 for id := range ids { 2126 sids = append(sids, id) 2127 } 2128 return sids 2129 } 2130 2131 func (n *Node) reqTimeout() time.Duration { 2132 return 5*time.Second + 2*time.Duration(n.Config.ElectionTick)*n.opts.TickInterval 2133 }
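// A minimal, illustrative sketch of how ProposeValue and SubscribeLeadership
// can be used from within the package. Assumptions (not taken from callers in
// this file): a running *Node n, and a []api.StoreAction named actions
// prepared by the caller.
//
//	ctx, cancel := context.WithTimeout(context.Background(), n.reqTimeout())
//	defer cancel()
//	if err := n.ProposeValue(ctx, actions, nil); err != nil {
//		// e.g. ErrLostLeadership or ErrRequestTooLarge
//	}
//
//	leadershipCh, cancelWatch := n.SubscribeLeadership()
//	defer cancelWatch()
//	for ev := range leadershipCh {
//		if ev.(LeadershipState) == IsLeader {
//			// this node became the leader
//		}
//	}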