github.com/kaisenlinux/docker.io@v0.0.0-20230510090727-ea55db55fac7/swarmkit/manager/state/raft/raft.go

     1  package raft
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"io"
     7  	"math"
     8  	"math/rand"
     9  	"net"
    10  	"sync"
    11  	"sync/atomic"
    12  	"time"
    13  
    14  	"code.cloudfoundry.org/clock"
    15  	"github.com/coreos/etcd/pkg/idutil"
    16  	"github.com/coreos/etcd/raft"
    17  	"github.com/coreos/etcd/raft/raftpb"
    18  	"github.com/docker/docker/pkg/signal"
    19  	"github.com/docker/go-events"
    20  	"github.com/docker/go-metrics"
    21  	"github.com/docker/swarmkit/api"
    22  	"github.com/docker/swarmkit/ca"
    23  	"github.com/docker/swarmkit/log"
    24  	"github.com/docker/swarmkit/manager/raftselector"
    25  	"github.com/docker/swarmkit/manager/state"
    26  	"github.com/docker/swarmkit/manager/state/raft/membership"
    27  	"github.com/docker/swarmkit/manager/state/raft/storage"
    28  	"github.com/docker/swarmkit/manager/state/raft/transport"
    29  	"github.com/docker/swarmkit/manager/state/store"
    30  	"github.com/docker/swarmkit/watch"
    31  	"github.com/gogo/protobuf/proto"
    32  	"github.com/pkg/errors"
    33  	"github.com/sirupsen/logrus"
    34  	"golang.org/x/time/rate"
    35  	"google.golang.org/grpc"
    36  	"google.golang.org/grpc/codes"
    37  	"google.golang.org/grpc/credentials"
    38  	"google.golang.org/grpc/peer"
    39  	"google.golang.org/grpc/status"
    40  )
    41  
    42  var (
    43  	// ErrNoRaftMember is thrown when the node is not yet part of a raft cluster
    44  	ErrNoRaftMember = errors.New("raft: node is not yet part of a raft cluster")
    45  	// ErrConfChangeRefused is returned when there is an issue with the configuration change
    46  	ErrConfChangeRefused = errors.New("raft: propose configuration change refused")
    47  	// ErrApplyNotSpecified is returned during the creation of a raft node when no apply method was provided
    48  	ErrApplyNotSpecified = errors.New("raft: apply method was not specified")
    49  	// ErrSetHardState is returned when the node fails to set the hard state
    50  	ErrSetHardState = errors.New("raft: failed to set the hard state for log append entry")
    51  	// ErrStopped is returned when an operation was submitted but the node was stopped in the meantime
    52  	ErrStopped = errors.New("raft: failed to process the request: node is stopped")
    53  	// ErrLostLeadership is returned when an operation was submitted but the node lost leader status before it became committed
    54  	ErrLostLeadership = errors.New("raft: failed to process the request: node lost leader status")
    55  	// ErrRequestTooLarge is returned when a raft internal message is too large to be sent
    56  	ErrRequestTooLarge = errors.New("raft: raft message is too large and can't be sent")
    57  	// ErrCannotRemoveMember is thrown when we try to remove a member from the cluster but this would result in a loss of quorum
    58  	ErrCannotRemoveMember = errors.New("raft: member cannot be removed, because removing it may result in loss of quorum")
    59  	// ErrNoClusterLeader is thrown when the cluster has no elected leader
    60  	ErrNoClusterLeader = errors.New("raft: no elected cluster leader")
    61  	// ErrMemberUnknown is sent in response to a message from an
    62  	// unrecognized peer.
    63  	ErrMemberUnknown = errors.New("raft: member unknown")
    64  
    65  	// work around lint
    66  	lostQuorumMessage = "The swarm does not have a leader. It's possible that too few managers are online. Make sure more than half of the managers are online."
    67  	errLostQuorum     = errors.New(lostQuorumMessage)
    68  
    69  	// Timer to capture ProposeValue() latency.
    70  	proposeLatencyTimer metrics.Timer
    71  )
    72  
    73  // LeadershipState indicates whether the node is a leader or follower.
    74  type LeadershipState int
    75  
    76  const (
    77  	// IsLeader indicates that the node is a raft leader.
    78  	IsLeader LeadershipState = iota
    79  	// IsFollower indicates that the node is a raft follower.
    80  	IsFollower
    81  
    82  	// lostQuorumTimeout is the number of ticks that can elapse with no
    83  	// leader before LeaderConn starts returning an error right away.
    84  	lostQuorumTimeout = 10
    85  )
    86  
    87  // EncryptionKeys are the current and, if necessary, pending DEKs with which to
    88  // encrypt raft data
    89  type EncryptionKeys struct {
    90  	CurrentDEK []byte
    91  	PendingDEK []byte
    92  }
    93  
    94  // EncryptionKeyRotator is an interface to find out if any keys need rotating.
    95  type EncryptionKeyRotator interface {
    96  	GetKeys() EncryptionKeys
    97  	UpdateKeys(EncryptionKeys) error
    98  	NeedsRotation() bool
    99  	RotationNotify() chan struct{}
   100  }
   101  
   102  // Node represents a Raft node and holds its
   103  // configuration and runtime state.
   104  type Node struct {
   105  	raftNode  raft.Node
   106  	cluster   *membership.Cluster
   107  	transport *transport.Transport
   108  
   109  	raftStore           *raft.MemoryStorage
   110  	memoryStore         *store.MemoryStore
   111  	Config              *raft.Config
   112  	opts                NodeOptions
   113  	reqIDGen            *idutil.Generator
   114  	wait                *wait
   115  	campaignWhenAble    bool
   116  	signalledLeadership uint32
   117  	isMember            uint32
   118  	bootstrapMembers    []*api.RaftMember
   119  
   120  	// waitProp waits for all the proposals to be terminated before
   121  	// shutting down the node.
   122  	waitProp sync.WaitGroup
   123  
   124  	confState       raftpb.ConfState
   125  	appliedIndex    uint64
   126  	snapshotMeta    raftpb.SnapshotMetadata
   127  	writtenWALIndex uint64
   128  
   129  	ticker clock.Ticker
   130  	doneCh chan struct{}
   131  	// RemovedFromRaft notifies about node deletion from raft cluster
   132  	RemovedFromRaft chan struct{}
   133  	cancelFunc      func()
   134  	// removeRaftCh notifies about node deletion from raft cluster
   135  	removeRaftCh        chan struct{}
   136  	removeRaftOnce      sync.Once
   137  	leadershipBroadcast *watch.Queue
   138  
   139  	// used to coordinate shutdown
   140  	// Lock should be used only in stop(); all other functions should use RLock.
   141  	stopMu sync.RWMutex
   142  	// used for membership management checks
   143  	membershipLock sync.Mutex
   144  	// synchronizes access to n.opts.Addr, and makes sure the address is not
   145  	// updated concurrently with JoinAndStart.
   146  	addrLock sync.Mutex
   147  
   148  	snapshotInProgress chan raftpb.SnapshotMetadata
   149  	asyncTasks         sync.WaitGroup
   150  
   151  	// stopped chan is used for notifying grpc handlers that the raft node
   152  	// is going to stop.
   153  	stopped chan struct{}
   154  
   155  	raftLogger     *storage.EncryptedRaftLogger
   156  	keyRotator     EncryptionKeyRotator
   157  	rotationQueued bool
   158  	clearData      bool
   159  
   160  	// waitForAppliedIndex stores the index of the last log that was written using
   161  	// a raft DEK during a raft DEK rotation, so that we won't finish a rotation until
   162  	// a snapshot covering that index has been written encrypted with the new raft DEK
   163  	waitForAppliedIndex uint64
   164  	ticksWithNoLeader   uint32
   165  }
   166  
   167  // NodeOptions provides node-level options.
   168  type NodeOptions struct {
   169  	// ID is the node's ID, from its certificate's CN field.
   170  	ID string
   171  	// Addr is the address of this node's listener
   172  	Addr string
   173  	// ForceNewCluster defines if we have to force a new cluster
   174  	// because we are recovering from a backup data directory.
   175  	ForceNewCluster bool
   176  	// JoinAddr is the cluster to join. May be an empty string to create
   177  	// a standalone cluster.
   178  	JoinAddr string
   179  	// ForceJoin tells us to join even if already part of a cluster.
   180  	ForceJoin bool
   181  	// Config is the raft config.
   182  	Config *raft.Config
   183  	// StateDir is the directory to store durable state.
   184  	StateDir string
   185  	// TickInterval is the time interval between raft ticks.
   186  	TickInterval time.Duration
   187  	// ClockSource is a Clock interface to use as a time base.
   188  	// Leave this nil except for tests that are designed not to run in real
   189  	// time.
   190  	ClockSource clock.Clock
   191  	// SendTimeout is the timeout for sending messages to other raft
   192  	// nodes. Leave this as 0 to get the default value.
   193  	SendTimeout    time.Duration
   194  	TLSCredentials credentials.TransportCredentials
   195  	KeyRotator     EncryptionKeyRotator
   196  	// DisableStackDump prevents Run from dumping goroutine stacks when the
   197  	// store becomes stuck.
   198  	DisableStackDump bool
   199  
   200  	// FIPS specifies whether the raft encryption should be FIPS compliant
   201  	FIPS bool
   202  }
   203  
   204  func init() {
   205  	rand.Seed(time.Now().UnixNano())
   206  	ns := metrics.NewNamespace("swarm", "raft", nil)
   207  	proposeLatencyTimer = ns.NewTimer("transaction_latency", "Raft transaction latency.")
   208  	metrics.Register(ns)
   209  }
   210  
   211  // NewNode generates a new Raft node
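        //
        // A minimal usage sketch (field values are illustrative, not defaults or
        // requirements; ctx is any context.Context provided by the caller, and
        // error handling is elided):
        //
        //	n := NewNode(NodeOptions{
        //		ID:           "node-id-from-certificate",
        //		Addr:         "10.0.0.1:4242",
        //		StateDir:     "/var/lib/swarm/raft",
        //		TickInterval: time.Second,
        //	})
        //	if err := n.JoinAndStart(ctx); err != nil {
        //		// handle startup error
        //	}
        //	go n.Run(ctx)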
   212  func NewNode(opts NodeOptions) *Node {
   213  	cfg := opts.Config
   214  	if cfg == nil {
   215  		cfg = DefaultNodeConfig()
   216  	}
   217  	if opts.TickInterval == 0 {
   218  		opts.TickInterval = time.Second
   219  	}
   220  	if opts.SendTimeout == 0 {
   221  		opts.SendTimeout = 2 * time.Second
   222  	}
   223  
   224  	raftStore := raft.NewMemoryStorage()
   225  
   226  	n := &Node{
   227  		cluster:   membership.NewCluster(),
   228  		raftStore: raftStore,
   229  		opts:      opts,
   230  		Config: &raft.Config{
   231  			ElectionTick:    cfg.ElectionTick,
   232  			HeartbeatTick:   cfg.HeartbeatTick,
   233  			Storage:         raftStore,
   234  			MaxSizePerMsg:   cfg.MaxSizePerMsg,
   235  			MaxInflightMsgs: cfg.MaxInflightMsgs,
   236  			Logger:          cfg.Logger,
   237  			CheckQuorum:     cfg.CheckQuorum,
   238  		},
   239  		doneCh:              make(chan struct{}),
   240  		RemovedFromRaft:     make(chan struct{}),
   241  		stopped:             make(chan struct{}),
   242  		leadershipBroadcast: watch.NewQueue(),
   243  		keyRotator:          opts.KeyRotator,
   244  	}
   245  	n.memoryStore = store.NewMemoryStore(n)
   246  
   247  	if opts.ClockSource == nil {
   248  		n.ticker = clock.NewClock().NewTicker(opts.TickInterval)
   249  	} else {
   250  		n.ticker = opts.ClockSource.NewTicker(opts.TickInterval)
   251  	}
   252  
   253  	n.reqIDGen = idutil.NewGenerator(uint16(n.Config.ID), time.Now())
   254  	n.wait = newWait()
   255  
   256  	n.cancelFunc = func(n *Node) func() {
   257  		var cancelOnce sync.Once
   258  		return func() {
   259  			cancelOnce.Do(func() {
   260  				close(n.stopped)
   261  			})
   262  		}
   263  	}(n)
   264  
   265  	return n
   266  }
   267  
   268  // IsIDRemoved reports if member with id was removed from cluster.
   269  // Part of transport.Raft interface.
   270  func (n *Node) IsIDRemoved(id uint64) bool {
   271  	return n.cluster.IsIDRemoved(id)
   272  }
   273  
   274  // NodeRemoved signals that node was removed from cluster and should stop.
   275  // Part of transport.Raft interface.
   276  func (n *Node) NodeRemoved() {
   277  	n.removeRaftOnce.Do(func() {
   278  		atomic.StoreUint32(&n.isMember, 0)
   279  		close(n.RemovedFromRaft)
   280  	})
   281  }
   282  
   283  // ReportSnapshot reports snapshot status to underlying raft node.
   284  // Part of transport.Raft interface.
   285  func (n *Node) ReportSnapshot(id uint64, status raft.SnapshotStatus) {
   286  	n.raftNode.ReportSnapshot(id, status)
   287  }
   288  
   289  // ReportUnreachable reports to underlying raft node that member with id is
   290  // unreachable.
   291  // Part of transport.Raft interface.
   292  func (n *Node) ReportUnreachable(id uint64) {
   293  	n.raftNode.ReportUnreachable(id)
   294  }
   295  
   296  // SetAddr provides the raft node's address. This can be used in cases where
   297  // opts.Addr was not provided to NewNode, for example when a port was not bound
   298  // until after the raft node was created.
   299  func (n *Node) SetAddr(ctx context.Context, addr string) error {
   300  	n.addrLock.Lock()
   301  	defer n.addrLock.Unlock()
   302  
   303  	n.opts.Addr = addr
   304  
   305  	if !n.IsMember() {
   306  		return nil
   307  	}
   308  
   309  	newRaftMember := &api.RaftMember{
   310  		RaftID: n.Config.ID,
   311  		NodeID: n.opts.ID,
   312  		Addr:   addr,
   313  	}
   314  	if err := n.cluster.UpdateMember(n.Config.ID, newRaftMember); err != nil {
   315  		return err
   316  	}
   317  
   318  	// If the raft node is running, submit a configuration change
   319  	// with the new address.
   320  
   321  	// TODO(aaronl): Currently, this node must be the leader to
   322  	// submit this configuration change. This works for the initial
   323  	// use cases (single-node cluster late binding ports, or calling
   324  	// SetAddr before joining a cluster). In the future, we may want
   325  	// to support having a follower proactively change its remote
   326  	// address.
   327  
   328  	leadershipCh, cancelWatch := n.SubscribeLeadership()
   329  	defer cancelWatch()
   330  
   331  	ctx, cancelCtx := n.WithContext(ctx)
   332  	defer cancelCtx()
   333  
   334  	isLeader := atomic.LoadUint32(&n.signalledLeadership) == 1
   335  	for !isLeader {
   336  		select {
   337  		case leadershipChange := <-leadershipCh:
   338  			if leadershipChange == IsLeader {
   339  				isLeader = true
   340  			}
   341  		case <-ctx.Done():
   342  			return ctx.Err()
   343  		}
   344  	}
   345  
   346  	return n.updateNodeBlocking(ctx, n.Config.ID, addr)
   347  }
   348  
   349  // WithContext returns a context which is cancelled when the parent context is
   350  // cancelled or the node is stopped.
   351  func (n *Node) WithContext(ctx context.Context) (context.Context, context.CancelFunc) {
   352  	ctx, cancel := context.WithCancel(ctx)
   353  
   354  	go func() {
   355  		select {
   356  		case <-ctx.Done():
   357  		case <-n.stopped:
   358  			cancel()
   359  		}
   360  	}()
   361  	return ctx, cancel
   362  }
   363  
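        // initTransport creates the transport used to exchange raft messages
        // with the other cluster members, reusing this node's TLS credentials
        // and send timeout.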
   364  func (n *Node) initTransport() {
   365  	transportConfig := &transport.Config{
   366  		HeartbeatInterval: time.Duration(n.Config.ElectionTick) * n.opts.TickInterval,
   367  		SendTimeout:       n.opts.SendTimeout,
   368  		Credentials:       n.opts.TLSCredentials,
   369  		Raft:              n,
   370  	}
   371  	n.transport = transport.New(transportConfig)
   372  }
   373  
   374  // JoinAndStart joins and starts the raft server
   375  func (n *Node) JoinAndStart(ctx context.Context) (err error) {
   376  	ctx, cancel := n.WithContext(ctx)
   377  	defer func() {
   378  		cancel()
   379  		if err != nil {
   380  			n.stopMu.Lock()
   381  			// to shutdown transport
   382  			n.cancelFunc()
   383  			n.stopMu.Unlock()
   384  			n.done()
   385  		} else {
   386  			atomic.StoreUint32(&n.isMember, 1)
   387  		}
   388  	}()
   389  
   390  	loadAndStartErr := n.loadAndStart(ctx, n.opts.ForceNewCluster)
   391  	if loadAndStartErr != nil && loadAndStartErr != storage.ErrNoWAL {
   392  		return loadAndStartErr
   393  	}
   394  
   395  	snapshot, err := n.raftStore.Snapshot()
   396  	// Snapshot never returns an error
   397  	if err != nil {
   398  		panic("could not get snapshot of raft store")
   399  	}
   400  
   401  	n.confState = snapshot.Metadata.ConfState
   402  	n.appliedIndex = snapshot.Metadata.Index
   403  	n.snapshotMeta = snapshot.Metadata
   404  	n.writtenWALIndex, _ = n.raftStore.LastIndex() // lastIndex always returns nil as an error
   405  
   406  	n.addrLock.Lock()
   407  	defer n.addrLock.Unlock()
   408  
   409  	// override the module field entirely, since etcd/raft is not exactly a submodule
   410  	n.Config.Logger = log.G(ctx).WithField("module", "raft")
   411  
   412  	// existing raft state was found on disk; restart from it
   413  	if loadAndStartErr == nil {
   414  		if n.opts.JoinAddr != "" && n.opts.ForceJoin {
   415  			if err := n.joinCluster(ctx); err != nil {
   416  				return errors.Wrap(err, "failed to rejoin cluster")
   417  			}
   418  		}
   419  		n.campaignWhenAble = true
   420  		n.initTransport()
   421  		n.raftNode = raft.RestartNode(n.Config)
   422  		return nil
   423  	}
   424  
   425  	if n.opts.JoinAddr == "" {
   426  		// First member in the cluster, self-assign ID
   427  		n.Config.ID = uint64(rand.Int63()) + 1
   428  		peer, err := n.newRaftLogs(n.opts.ID)
   429  		if err != nil {
   430  			return err
   431  		}
   432  		n.campaignWhenAble = true
   433  		n.initTransport()
   434  		n.raftNode = raft.StartNode(n.Config, []raft.Peer{peer})
   435  		return nil
   436  	}
   437  
   438  	// join an existing cluster
   439  
   440  	if err := n.joinCluster(ctx); err != nil {
   441  		return err
   442  	}
   443  
   444  	if _, err := n.newRaftLogs(n.opts.ID); err != nil {
   445  		return err
   446  	}
   447  
   448  	n.initTransport()
   449  	n.raftNode = raft.StartNode(n.Config, nil)
   450  
   451  	return nil
   452  }
   453  
   454  func (n *Node) joinCluster(ctx context.Context) error {
   455  	if n.opts.Addr == "" {
   456  		return errors.New("attempted to join raft cluster without knowing own address")
   457  	}
   458  
   459  	conn, err := dial(n.opts.JoinAddr, "tcp", n.opts.TLSCredentials, 10*time.Second)
   460  	if err != nil {
   461  		return err
   462  	}
   463  	defer conn.Close()
   464  	client := api.NewRaftMembershipClient(conn)
   465  
   466  	joinCtx, joinCancel := context.WithTimeout(ctx, n.reqTimeout())
   467  	defer joinCancel()
   468  	resp, err := client.Join(joinCtx, &api.JoinRequest{
   469  		Addr: n.opts.Addr,
   470  	})
   471  	if err != nil {
   472  		return err
   473  	}
   474  
   475  	n.Config.ID = resp.RaftID
   476  	n.bootstrapMembers = resp.Members
   477  	return nil
   478  }
   479  
   480  // DefaultNodeConfig returns the default config for a
   481  // raft node that can be modified and customized
   482  func DefaultNodeConfig() *raft.Config {
   483  	return &raft.Config{
   484  		HeartbeatTick: 1,
   485  		// Recommended value in etcd/raft is 10 x (HeartbeatTick).
   486  		// Lower values were seen to have caused instability because of
   487  		// frequent leader elections when running on flaky networks.
   488  		ElectionTick:    10,
   489  		MaxSizePerMsg:   math.MaxUint16,
   490  		MaxInflightMsgs: 256,
   491  		Logger:          log.L,
   492  		CheckQuorum:     true,
   493  	}
   494  }
   495  
   496  // DefaultRaftConfig returns a default api.RaftConfig.
   497  func DefaultRaftConfig() api.RaftConfig {
   498  	return api.RaftConfig{
   499  		KeepOldSnapshots:           0,
   500  		SnapshotInterval:           10000,
   501  		LogEntriesForSlowFollowers: 500,
   502  		// Recommended value in etcd/raft is 10 x (HeartbeatTick).
   503  		// Lower values were seen to have caused instability because of
   504  		// frequent leader elections when running on flaky networks.
   505  		HeartbeatTick: 1,
   506  		ElectionTick:  10,
   507  	}
   508  }
   509  
   510  // MemoryStore returns the memory store that is kept in sync with the raft log.
   511  func (n *Node) MemoryStore() *store.MemoryStore {
   512  	return n.memoryStore
   513  }
   514  
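        // done releases resources held by the node once it has stopped: it clears
        // the membership state, stops the ticker, closes the broadcast queues and
        // memory store, stops the transport if one was created, and finally closes
        // doneCh to signal that shutdown is complete.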
   515  func (n *Node) done() {
   516  	n.cluster.Clear()
   517  
   518  	n.ticker.Stop()
   519  	n.leadershipBroadcast.Close()
   520  	n.cluster.PeersBroadcast.Close()
   521  	n.memoryStore.Close()
   522  	if n.transport != nil {
   523  		n.transport.Stop()
   524  	}
   525  
   526  	close(n.doneCh)
   527  }
   528  
   529  // ClearData tells the raft node to delete its WALs, snapshots, and keys on
   530  // shutdown.
   531  func (n *Node) ClearData() {
   532  	n.clearData = true
   533  }
   534  
   535  // Run is the main loop for a Raft node. It drives the state machine,
   536  // acting on the messages received from other Raft nodes in the cluster.
   537  //
   538  // Before running the main loop, it first starts the raft node based on saved
   539  // cluster state. If no saved state exists, it starts a single-node cluster.
   540  func (n *Node) Run(ctx context.Context) error {
   541  	ctx = log.WithLogger(ctx, logrus.WithField("raft_id", fmt.Sprintf("%x", n.Config.ID)))
   542  	ctx, cancel := context.WithCancel(ctx)
   543  
   544  	for _, node := range n.bootstrapMembers {
   545  		if err := n.registerNode(node); err != nil {
   546  			log.G(ctx).WithError(err).Errorf("failed to register member %x", node.RaftID)
   547  		}
   548  	}
   549  
   550  	defer func() {
   551  		cancel()
   552  		n.stop(ctx)
   553  		if n.clearData {
   554  			// Delete WAL and snapshots, since they are no longer
   555  			// usable.
   556  			if err := n.raftLogger.Clear(ctx); err != nil {
   557  				log.G(ctx).WithError(err).Error("failed to move wal after node removal")
   558  			}
   559  			// clear out the DEKs
   560  			if err := n.keyRotator.UpdateKeys(EncryptionKeys{}); err != nil {
   561  				log.G(ctx).WithError(err).Error("could not remove DEKs")
   562  			}
   563  		}
   564  		n.done()
   565  	}()
   566  
   567  	// Flag that indicates if this manager node is *currently* the raft leader.
   568  	wasLeader := false
   569  	transferLeadershipLimit := rate.NewLimiter(rate.Every(time.Minute), 1)
   570  
   571  	for {
   572  		select {
   573  		case <-n.ticker.C():
   574  			n.raftNode.Tick()
   575  
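        			// Track how many ticks have elapsed without a known leader;
        			// LeaderConn uses this counter to fail fast once it exceeds
        			// lostQuorumTimeout.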
   576  			if n.leader() == raft.None {
   577  				atomic.AddUint32(&n.ticksWithNoLeader, 1)
   578  			} else {
   579  				atomic.StoreUint32(&n.ticksWithNoLeader, 0)
   580  			}
   581  		case rd := <-n.raftNode.Ready():
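        			// Handle the Ready struct: persist hard state, entries, and
        			// snapshot to storage, send messages to peers, apply the
        			// snapshot and committed entries to the memory store, then
        			// call Advance.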
   582  			raftConfig := n.getCurrentRaftConfig()
   583  
   584  			// Save entries to storage
   585  			if err := n.saveToStorage(ctx, &raftConfig, rd.HardState, rd.Entries, rd.Snapshot); err != nil {
   586  				return errors.Wrap(err, "failed to save entries to storage")
   587  			}
   588  
   589  			// If the memory store lock has been held for too long,
   590  			// transferring leadership is an easy way to break out of it.
   591  			if wasLeader &&
   592  				(rd.SoftState == nil || rd.SoftState.RaftState == raft.StateLeader) &&
   593  				n.memoryStore.Wedged() &&
   594  				transferLeadershipLimit.Allow() {
   595  				log.G(ctx).Error("Attempting to transfer leadership")
   596  				if !n.opts.DisableStackDump {
   597  					signal.DumpStacks("")
   598  				}
   599  				transferee, err := n.transport.LongestActive()
   600  				if err != nil {
   601  					log.G(ctx).WithError(err).Error("failed to get longest-active member")
   602  				} else {
   603  					log.G(ctx).Error("data store lock held too long - transferring leadership")
   604  					n.raftNode.TransferLeadership(ctx, n.Config.ID, transferee)
   605  				}
   606  			}
   607  
   608  			for _, msg := range rd.Messages {
   609  				// Send raft messages to peers
   610  				if err := n.transport.Send(msg); err != nil {
   611  					log.G(ctx).WithError(err).Error("failed to send message to member")
   612  				}
   613  			}
   614  
   615  			// Apply snapshot to memory store. The snapshot
   616  			// was applied to the raft store in
   617  			// saveToStorage.
   618  			if !raft.IsEmptySnap(rd.Snapshot) {
   619  				// Load the snapshot data into the store
   620  				if err := n.restoreFromSnapshot(ctx, rd.Snapshot.Data); err != nil {
   621  					log.G(ctx).WithError(err).Error("failed to restore cluster from snapshot")
   622  				}
   623  				n.appliedIndex = rd.Snapshot.Metadata.Index
   624  				n.snapshotMeta = rd.Snapshot.Metadata
   625  				n.confState = rd.Snapshot.Metadata.ConfState
   626  			}
   627  
   628  			// If we cease to be the leader, we must cancel any
   629  			// proposals that are currently waiting for a quorum to
   630  			// acknowledge them. It is still possible for these to
   631  			// become committed, but if that happens we will apply
   632  			// them as any follower would.
   633  
   634  			// It is important that we cancel these proposals before
   635  			// calling processCommitted, so processCommitted does
   636  			// not deadlock.
   637  
   638  			if rd.SoftState != nil {
   639  				if wasLeader && rd.SoftState.RaftState != raft.StateLeader {
   640  					wasLeader = false
   641  					log.G(ctx).Error("soft state changed, node no longer a leader, resetting and cancelling all waits")
   642  
   643  					if atomic.LoadUint32(&n.signalledLeadership) == 1 {
   644  						atomic.StoreUint32(&n.signalledLeadership, 0)
   645  						n.leadershipBroadcast.Publish(IsFollower)
   646  					}
   647  
   648  					// It is important that we set n.signalledLeadership to 0
   649  					// before calling n.wait.cancelAll. When a new raft
   650  					// request is registered, it checks n.signalledLeadership
   651  					// afterwards, and cancels the registration if it is 0.
   652  					// If cancelAll was called first, this call might run
   653  					// before the new request registers, but
   654  					// signalledLeadership would be set after the check.
   655  					// Setting signalledLeadership before calling cancelAll
   656  					// ensures that if a new request is registered during
   657  					// this transition, it will either be cancelled by
   658  					// cancelAll, or by its own check of signalledLeadership.
   659  					n.wait.cancelAll()
   660  				} else if !wasLeader && rd.SoftState.RaftState == raft.StateLeader {
   661  					// Node just became a leader.
   662  					wasLeader = true
   663  				}
   664  			}
   665  
   666  			// Process committed entries
   667  			for _, entry := range rd.CommittedEntries {
   668  				if err := n.processCommitted(ctx, entry); err != nil {
   669  					log.G(ctx).WithError(err).Error("failed to process committed entries")
   670  				}
   671  			}
   672  
   673  			// in case the previous attempt to update the key failed
   674  			n.maybeMarkRotationFinished(ctx)
   675  
   676  			// Trigger a snapshot every once in a while
   677  			if n.snapshotInProgress == nil &&
   678  				(n.needsSnapshot(ctx) || raftConfig.SnapshotInterval > 0 &&
   679  					n.appliedIndex-n.snapshotMeta.Index >= raftConfig.SnapshotInterval) {
   680  				n.triggerSnapshot(ctx, raftConfig)
   681  			}
   682  
   683  			if wasLeader && atomic.LoadUint32(&n.signalledLeadership) != 1 {
   684  				// If all the entries in the log have become
   685  				// committed, broadcast our leadership status.
   686  				if n.caughtUp() {
   687  					atomic.StoreUint32(&n.signalledLeadership, 1)
   688  					n.leadershipBroadcast.Publish(IsLeader)
   689  				}
   690  			}
   691  
   692  			// Advance the state machine
   693  			n.raftNode.Advance()
   694  
   695  			// On the first startup, or if we are the only
   696  			// registered member after restoring from the state,
   697  			// campaign to be the leader.
   698  			if n.campaignWhenAble {
   699  				members := n.cluster.Members()
   700  				if len(members) >= 1 {
   701  					n.campaignWhenAble = false
   702  				}
   703  				if len(members) == 1 && members[n.Config.ID] != nil {
   704  					n.raftNode.Campaign(ctx)
   705  				}
   706  			}
   707  
   708  		case snapshotMeta := <-n.snapshotInProgress:
   709  			raftConfig := n.getCurrentRaftConfig()
   710  			if snapshotMeta.Index > n.snapshotMeta.Index {
   711  				n.snapshotMeta = snapshotMeta
   712  				if err := n.raftLogger.GC(snapshotMeta.Index, snapshotMeta.Term, raftConfig.KeepOldSnapshots); err != nil {
   713  					log.G(ctx).WithError(err).Error("failed to clean up old snapshots and WALs")
   714  				}
   715  			}
   716  			n.snapshotInProgress = nil
   717  			n.maybeMarkRotationFinished(ctx)
   718  			if n.rotationQueued && n.needsSnapshot(ctx) {
   719  				// a key rotation took place while the snapshot was in
   720  				// progress - we have to take another snapshot and encrypt it with the new key
   721  				n.rotationQueued = false
   722  				n.triggerSnapshot(ctx, raftConfig)
   723  			}
   724  		case <-n.keyRotator.RotationNotify():
   725  			// There are 2 separate checks:  rotationQueued, and n.needsSnapshot().
   726  			// We set rotationQueued so that when we are notified of a rotation, we try to
   727  			// do a snapshot as soon as possible.  However, if there is an error while doing
   728  			// the snapshot, we don't want to hammer the node attempting to do snapshots over
   729  			// and over.  So if doing a snapshot fails, wait until the next entry comes in to
   730  			// try again.
   731  			switch {
   732  			case n.snapshotInProgress != nil:
   733  				n.rotationQueued = true
   734  			case n.needsSnapshot(ctx):
   735  				n.triggerSnapshot(ctx, n.getCurrentRaftConfig())
   736  			}
   737  		case <-ctx.Done():
   738  			return nil
   739  		}
   740  	}
   741  }
   742  
   743  func (n *Node) restoreFromSnapshot(ctx context.Context, data []byte) error {
   744  	snapCluster, err := n.clusterSnapshot(data)
   745  	if err != nil {
   746  		return err
   747  	}
   748  
   749  	oldMembers := n.cluster.Members()
   750  
   751  	for _, member := range snapCluster.Members {
   752  		delete(oldMembers, member.RaftID)
   753  	}
   754  
   755  	for _, removedMember := range snapCluster.Removed {
   756  		n.cluster.RemoveMember(removedMember)
   757  		n.transport.RemovePeer(removedMember)
   758  		delete(oldMembers, removedMember)
   759  	}
   760  
   761  	for id, member := range oldMembers {
   762  		n.cluster.ClearMember(id)
   763  		if err := n.transport.RemovePeer(member.RaftID); err != nil {
   764  			log.G(ctx).WithError(err).Errorf("failed to remove peer %x from transport", member.RaftID)
   765  		}
   766  	}
   767  	for _, node := range snapCluster.Members {
   768  		if err := n.registerNode(&api.RaftMember{RaftID: node.RaftID, NodeID: node.NodeID, Addr: node.Addr}); err != nil {
   769  			log.G(ctx).WithError(err).Error("failed to register node from snapshot")
   770  		}
   771  	}
   772  	return nil
   773  }
   774  
   775  func (n *Node) needsSnapshot(ctx context.Context) bool {
   776  	if n.waitForAppliedIndex == 0 && n.keyRotator.NeedsRotation() {
   777  		keys := n.keyRotator.GetKeys()
   778  		if keys.PendingDEK != nil {
   779  			n.raftLogger.RotateEncryptionKey(keys.PendingDEK)
   780  			// we want to wait for the last index written with the old DEK to be committed, else a snapshot taken
   781  			// may have an index less than the index of a WAL written with an old DEK.  We want the next snapshot
   782  			// written with the new key to supersede any WAL written with an old DEK.
   783  			n.waitForAppliedIndex = n.writtenWALIndex
   784  			// if there is already a snapshot at this index or higher, bump the wait index up to 1 higher than the current
   785  			// snapshot index, because the rotation cannot be completed until the next snapshot
   786  			if n.waitForAppliedIndex <= n.snapshotMeta.Index {
   787  				n.waitForAppliedIndex = n.snapshotMeta.Index + 1
   788  			}
   789  			log.G(ctx).Debugf(
   790  				"beginning raft DEK rotation - last indices written with the old key are (snapshot: %d, WAL: %d) - waiting for snapshot of index %d to be written before rotation can be completed", n.snapshotMeta.Index, n.writtenWALIndex, n.waitForAppliedIndex)
   791  		}
   792  	}
   793  
   794  	result := n.waitForAppliedIndex > 0 && n.waitForAppliedIndex <= n.appliedIndex
   795  	if result {
   796  		log.G(ctx).Debugf(
   797  			"a snapshot at index %d is needed in order to complete raft DEK rotation - a snapshot with index >= %d can now be triggered",
   798  			n.waitForAppliedIndex, n.appliedIndex)
   799  	}
   800  	return result
   801  }
   802  
   803  func (n *Node) maybeMarkRotationFinished(ctx context.Context) {
   804  	if n.waitForAppliedIndex > 0 && n.waitForAppliedIndex <= n.snapshotMeta.Index {
   805  		// this means we tried to rotate - so finish the rotation
   806  		if err := n.keyRotator.UpdateKeys(EncryptionKeys{CurrentDEK: n.raftLogger.EncryptionKey}); err != nil {
   807  			log.G(ctx).WithError(err).Error("failed to update encryption keys after a successful rotation")
   808  		} else {
   809  			log.G(ctx).Debugf(
   810  				"a snapshot with index %d is available, which completes the DEK rotation requiring a snapshot of at least index %d - throwing away DEK and older snapshots encrypted with the old key",
   811  				n.snapshotMeta.Index, n.waitForAppliedIndex)
   812  			n.waitForAppliedIndex = 0
   813  
   814  			if err := n.raftLogger.GC(n.snapshotMeta.Index, n.snapshotMeta.Term, 0); err != nil {
   815  				log.G(ctx).WithError(err).Error("failed to remove old snapshots and WALs that were written with the previous raft DEK")
   816  			}
   817  		}
   818  	}
   819  }
   820  
   821  func (n *Node) getCurrentRaftConfig() api.RaftConfig {
   822  	raftConfig := DefaultRaftConfig()
   823  	n.memoryStore.View(func(readTx store.ReadTx) {
   824  		clusters, err := store.FindClusters(readTx, store.ByName(store.DefaultClusterName))
   825  		if err == nil && len(clusters) == 1 {
   826  			raftConfig = clusters[0].Spec.Raft
   827  		}
   828  	})
   829  	return raftConfig
   830  }
   831  
   832  // Cancel interrupts all ongoing proposals, and prevents new ones from
   833  // starting. This is useful for the shutdown sequence because it allows
   834  // the manager to shut down raft-dependent services that might otherwise
   835  // block on shutdown if quorum isn't met. Then the raft node can be completely
   836  // shut down once no more code is using it.
   837  func (n *Node) Cancel() {
   838  	n.cancelFunc()
   839  }
   840  
   841  // Done returns channel which is closed when raft node is fully stopped.
   842  func (n *Node) Done() <-chan struct{} {
   843  	return n.doneCh
   844  }
   845  
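        // stop cancels outstanding proposals, waits for in-flight proposals and
        // async tasks to finish, then stops the underlying raft node, ticker, and
        // raft logger, and marks this node as no longer a raft member.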
   846  func (n *Node) stop(ctx context.Context) {
   847  	n.stopMu.Lock()
   848  	defer n.stopMu.Unlock()
   849  
   850  	n.Cancel()
   851  	n.waitProp.Wait()
   852  	n.asyncTasks.Wait()
   853  
   854  	n.raftNode.Stop()
   855  	n.ticker.Stop()
   856  	n.raftLogger.Close(ctx)
   857  	atomic.StoreUint32(&n.isMember, 0)
   858  	// TODO(stevvooe): Handle ctx.Done()
   859  }
   860  
   861  // isLeader checks whether we are the leader, without the protection of a lock
   862  func (n *Node) isLeader() bool {
   863  	if !n.IsMember() {
   864  		return false
   865  	}
   866  
   867  	if n.Status().Lead == n.Config.ID {
   868  		return true
   869  	}
   870  	return false
   871  }
   872  
   873  // IsLeader checks whether we are the leader, with the protection of a lock
   874  func (n *Node) IsLeader() bool {
   875  	n.stopMu.RLock()
   876  	defer n.stopMu.RUnlock()
   877  
   878  	return n.isLeader()
   879  }
   880  
   881  // leader returns the id of the leader, without the protection of a lock or a
   882  // membership check, so those are the caller's responsibility.
   883  func (n *Node) leader() uint64 {
   884  	return n.Status().Lead
   885  }
   886  
   887  // Leader returns the id of the leader, with the protection of a lock
   888  func (n *Node) Leader() (uint64, error) {
   889  	n.stopMu.RLock()
   890  	defer n.stopMu.RUnlock()
   891  
   892  	if !n.IsMember() {
   893  		return raft.None, ErrNoRaftMember
   894  	}
   895  	leader := n.leader()
   896  	if leader == raft.None {
   897  		return raft.None, ErrNoClusterLeader
   898  	}
   899  
   900  	return leader, nil
   901  }
   902  
   903  // ReadyForProposals returns true if the node has broadcasted a message
   904  // saying that it has become the leader. This means it is ready to accept
   905  // proposals.
   906  func (n *Node) ReadyForProposals() bool {
   907  	return atomic.LoadUint32(&n.signalledLeadership) == 1
   908  }
   909  
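        // caughtUp reports whether every entry currently in the raft log has been
        // applied to the memory store.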
   910  func (n *Node) caughtUp() bool {
   911  	// obnoxious function that always returns a nil error
   912  	lastIndex, _ := n.raftStore.LastIndex()
   913  	return n.appliedIndex >= lastIndex
   914  }
   915  
   916  // Join asks a member of the raft cluster to propose
   917  // a configuration change and add us as a member, thus
   918  // beginning the log replication process. This method
   919  // is called by an aspiring member against an existing member.
   920  func (n *Node) Join(ctx context.Context, req *api.JoinRequest) (*api.JoinResponse, error) {
   921  	nodeInfo, err := ca.RemoteNode(ctx)
   922  	if err != nil {
   923  		return nil, err
   924  	}
   925  
   926  	fields := logrus.Fields{
   927  		"node.id": nodeInfo.NodeID,
   928  		"method":  "(*Node).Join",
   929  		"raft_id": fmt.Sprintf("%x", n.Config.ID),
   930  	}
   931  	if nodeInfo.ForwardedBy != nil {
   932  		fields["forwarder.id"] = nodeInfo.ForwardedBy.NodeID
   933  	}
   934  	log := log.G(ctx).WithFields(fields)
   935  	log.Debug("")
   936  
   937  	// can't stop the raft node while an async RPC is in progress
   938  	n.stopMu.RLock()
   939  	defer n.stopMu.RUnlock()
   940  
   941  	n.membershipLock.Lock()
   942  	defer n.membershipLock.Unlock()
   943  
   944  	if !n.IsMember() {
   945  		return nil, status.Errorf(codes.FailedPrecondition, "%s", ErrNoRaftMember.Error())
   946  	}
   947  
   948  	if !n.isLeader() {
   949  		return nil, status.Errorf(codes.FailedPrecondition, "%s", ErrLostLeadership.Error())
   950  	}
   951  
   952  	remoteAddr := req.Addr
   953  
   954  	// If the joining node sent an address like 0.0.0.0:4242, automatically
   955  	// determine its actual address based on the GRPC connection. This
   956  	// avoids the need for a prospective member to know its own address.
   957  
   958  	requestHost, requestPort, err := net.SplitHostPort(remoteAddr)
   959  	if err != nil {
   960  		return nil, status.Errorf(codes.InvalidArgument, "invalid address %s in raft join request", remoteAddr)
   961  	}
   962  
   963  	requestIP := net.ParseIP(requestHost)
   964  	if requestIP != nil && requestIP.IsUnspecified() {
   965  		remoteHost, _, err := net.SplitHostPort(nodeInfo.RemoteAddr)
   966  		if err != nil {
   967  			return nil, err
   968  		}
   969  		remoteAddr = net.JoinHostPort(remoteHost, requestPort)
   970  	}
   971  
   972  	// We do not bother submitting a configuration change for the
   973  	// new member if we can't contact it back using its address
   974  	if err := n.checkHealth(ctx, remoteAddr, 5*time.Second); err != nil {
   975  		return nil, err
   976  	}
   977  
   978  	// If the peer is already a member of the cluster, we will only update
   979  	// its information, not add it as a new member. Adding it again would
   980  	// cause the quorum to be computed incorrectly.
   981  	for _, m := range n.cluster.Members() {
   982  		if m.NodeID == nodeInfo.NodeID {
   983  			if remoteAddr == m.Addr {
   984  				return n.joinResponse(m.RaftID), nil
   985  			}
   986  			updatedRaftMember := &api.RaftMember{
   987  				RaftID: m.RaftID,
   988  				NodeID: m.NodeID,
   989  				Addr:   remoteAddr,
   990  			}
   991  			if err := n.cluster.UpdateMember(m.RaftID, updatedRaftMember); err != nil {
   992  				return nil, err
   993  			}
   994  
   995  			if err := n.updateNodeBlocking(ctx, m.RaftID, remoteAddr); err != nil {
   996  				log.WithError(err).Error("failed to update node address")
   997  				return nil, err
   998  			}
   999  
  1000  			log.Info("updated node address")
  1001  			return n.joinResponse(m.RaftID), nil
  1002  		}
  1003  	}
  1004  
  1005  	// Find a unique ID for the joining member.
  1006  	var raftID uint64
  1007  	for {
  1008  		raftID = uint64(rand.Int63()) + 1
  1009  		if n.cluster.GetMember(raftID) == nil && !n.cluster.IsIDRemoved(raftID) {
  1010  			break
  1011  		}
  1012  	}
  1013  
  1014  	err = n.addMember(ctx, remoteAddr, raftID, nodeInfo.NodeID)
  1015  	if err != nil {
  1016  		log.WithError(err).Errorf("failed to add member %x", raftID)
  1017  		return nil, err
  1018  	}
  1019  
  1020  	log.Debug("node joined")
  1021  
  1022  	return n.joinResponse(raftID), nil
  1023  }
  1024  
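        // joinResponse builds a JoinResponse containing the raft ID assigned to
        // the joining node and the current list of cluster members.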
  1025  func (n *Node) joinResponse(raftID uint64) *api.JoinResponse {
  1026  	var nodes []*api.RaftMember
  1027  	for _, node := range n.cluster.Members() {
  1028  		nodes = append(nodes, &api.RaftMember{
  1029  			RaftID: node.RaftID,
  1030  			NodeID: node.NodeID,
  1031  			Addr:   node.Addr,
  1032  		})
  1033  	}
  1034  
  1035  	return &api.JoinResponse{Members: nodes, RaftID: raftID}
  1036  }
  1037  
  1038  // checkHealth tries to contact an aspiring member through its advertised address
  1039  // and checks if its raft server is running.
  1040  func (n *Node) checkHealth(ctx context.Context, addr string, timeout time.Duration) error {
  1041  	conn, err := dial(addr, "tcp", n.opts.TLSCredentials, timeout)
  1042  	if err != nil {
  1043  		return err
  1044  	}
  1045  
  1046  	defer conn.Close()
  1047  
  1048  	if timeout != 0 {
  1049  		tctx, cancel := context.WithTimeout(ctx, timeout)
  1050  		defer cancel()
  1051  		ctx = tctx
  1052  	}
  1053  
  1054  	healthClient := api.NewHealthClient(conn)
  1055  	resp, err := healthClient.Check(ctx, &api.HealthCheckRequest{Service: "Raft"})
  1056  	if err != nil {
  1057  		return errors.Wrap(err, "could not connect to prospective new cluster member using its advertised address")
  1058  	}
  1059  	if resp.Status != api.HealthCheckResponse_SERVING {
  1060  		return fmt.Errorf("health check returned status %s", resp.Status.String())
  1061  	}
  1062  
  1063  	return nil
  1064  }
  1065  
  1066  // addMember submits a configuration change to add a new member on the raft cluster.
  1067  func (n *Node) addMember(ctx context.Context, addr string, raftID uint64, nodeID string) error {
  1068  	node := api.RaftMember{
  1069  		RaftID: raftID,
  1070  		NodeID: nodeID,
  1071  		Addr:   addr,
  1072  	}
  1073  
  1074  	meta, err := node.Marshal()
  1075  	if err != nil {
  1076  		return err
  1077  	}
  1078  
  1079  	cc := raftpb.ConfChange{
  1080  		Type:    raftpb.ConfChangeAddNode,
  1081  		NodeID:  raftID,
  1082  		Context: meta,
  1083  	}
  1084  
  1085  	// Wait for a raft round to process the configuration change
  1086  	return n.configure(ctx, cc)
  1087  }
  1088  
  1089  // updateNodeBlocking runs a synchronous job to update the node's address across the whole cluster.
  1090  func (n *Node) updateNodeBlocking(ctx context.Context, id uint64, addr string) error {
  1091  	m := n.cluster.GetMember(id)
  1092  	if m == nil {
  1093  		return errors.Errorf("member %x is not found for update", id)
  1094  	}
  1095  	node := api.RaftMember{
  1096  		RaftID: m.RaftID,
  1097  		NodeID: m.NodeID,
  1098  		Addr:   addr,
  1099  	}
  1100  
  1101  	meta, err := node.Marshal()
  1102  	if err != nil {
  1103  		return err
  1104  	}
  1105  
  1106  	cc := raftpb.ConfChange{
  1107  		Type:    raftpb.ConfChangeUpdateNode,
  1108  		NodeID:  id,
  1109  		Context: meta,
  1110  	}
  1111  
  1112  	// Wait for a raft round to process the configuration change
  1113  	return n.configure(ctx, cc)
  1114  }
  1115  
  1116  // UpdateNode submits a configuration change to change a member's address.
  1117  func (n *Node) UpdateNode(id uint64, addr string) {
  1118  	ctx, cancel := n.WithContext(context.Background())
  1119  	defer cancel()
  1120  	// spawn updating info in raft in background to unblock transport
  1121  	go func() {
  1122  		if err := n.updateNodeBlocking(ctx, id, addr); err != nil {
  1123  			log.G(ctx).WithFields(logrus.Fields{"raft_id": n.Config.ID, "update_id": id}).WithError(err).Error("failed to update member address in cluster")
  1124  		}
  1125  	}()
  1126  }
  1127  
  1128  // Leave asks a member of the raft cluster to remove
  1129  // us from the raft cluster. This method is called
  1130  // by a member who is willing to give up its raft
  1131  // membership, against an active member of the raft.
  1132  func (n *Node) Leave(ctx context.Context, req *api.LeaveRequest) (*api.LeaveResponse, error) {
  1133  	if req.Node == nil {
  1134  		return nil, status.Errorf(codes.InvalidArgument, "no node information provided")
  1135  	}
  1136  
  1137  	nodeInfo, err := ca.RemoteNode(ctx)
  1138  	if err != nil {
  1139  		return nil, err
  1140  	}
  1141  
  1142  	ctx, cancel := n.WithContext(ctx)
  1143  	defer cancel()
  1144  
  1145  	fields := logrus.Fields{
  1146  		"node.id": nodeInfo.NodeID,
  1147  		"method":  "(*Node).Leave",
  1148  		"raft_id": fmt.Sprintf("%x", n.Config.ID),
  1149  	}
  1150  	if nodeInfo.ForwardedBy != nil {
  1151  		fields["forwarder.id"] = nodeInfo.ForwardedBy.NodeID
  1152  	}
  1153  	log.G(ctx).WithFields(fields).Debug("")
  1154  
  1155  	if err := n.removeMember(ctx, req.Node.RaftID); err != nil {
  1156  		return nil, err
  1157  	}
  1158  
  1159  	return &api.LeaveResponse{}, nil
  1160  }
  1161  
  1162  // CanRemoveMember checks if a member can be removed from
  1163  // the context of the current node.
  1164  func (n *Node) CanRemoveMember(id uint64) bool {
  1165  	members := n.cluster.Members()
  1166  	nreachable := 0 // reachable managers after removal
  1167  
  1168  	for _, m := range members {
  1169  		if m.RaftID == id {
  1170  			continue
  1171  		}
  1172  
  1173  		// Local node from where the remove is issued
  1174  		if m.RaftID == n.Config.ID {
  1175  			nreachable++
  1176  			continue
  1177  		}
  1178  
  1179  		if n.transport.Active(m.RaftID) {
  1180  			nreachable++
  1181  		}
  1182  	}
  1183  
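        	// Quorum of the cluster as it will be after the removal: a majority
        	// of the remaining len(members)-1 managers must stay reachable.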
  1184  	nquorum := (len(members)-1)/2 + 1
  1185  
  1186  	return nreachable >= nquorum
  1187  }
  1188  
  1189  func (n *Node) removeMember(ctx context.Context, id uint64) error {
  1190  	// can't stop the raft node while an async RPC is in progress
  1191  	n.stopMu.RLock()
  1192  	defer n.stopMu.RUnlock()
  1193  
  1194  	if !n.IsMember() {
  1195  		return ErrNoRaftMember
  1196  	}
  1197  
  1198  	if !n.isLeader() {
  1199  		return ErrLostLeadership
  1200  	}
  1201  
  1202  	n.membershipLock.Lock()
  1203  	defer n.membershipLock.Unlock()
  1204  	if !n.CanRemoveMember(id) {
  1205  		return ErrCannotRemoveMember
  1206  	}
  1207  
  1208  	cc := raftpb.ConfChange{
  1209  		ID:      id,
  1210  		Type:    raftpb.ConfChangeRemoveNode,
  1211  		NodeID:  id,
  1212  		Context: []byte(""),
  1213  	}
  1214  	return n.configure(ctx, cc)
  1215  }
  1216  
  1217  // TransferLeadership attempts to transfer leadership to a different node,
  1218  // and waits for the transfer to happen.
  1219  func (n *Node) TransferLeadership(ctx context.Context) error {
  1220  	ctx, cancelTransfer := context.WithTimeout(ctx, n.reqTimeout())
  1221  	defer cancelTransfer()
  1222  
  1223  	n.stopMu.RLock()
  1224  	defer n.stopMu.RUnlock()
  1225  
  1226  	if !n.IsMember() {
  1227  		return ErrNoRaftMember
  1228  	}
  1229  
  1230  	if !n.isLeader() {
  1231  		return ErrLostLeadership
  1232  	}
  1233  
  1234  	transferee, err := n.transport.LongestActive()
  1235  	if err != nil {
  1236  		return errors.Wrap(err, "failed to get longest-active member")
  1237  	}
  1238  	start := time.Now()
  1239  	n.raftNode.TransferLeadership(ctx, n.Config.ID, transferee)
  1240  	ticker := time.NewTicker(n.opts.TickInterval / 10)
  1241  	defer ticker.Stop()
  1242  	var leader uint64
  1243  	for {
  1244  		leader = n.leader()
  1245  		if leader != raft.None && leader != n.Config.ID {
  1246  			break
  1247  		}
  1248  		select {
  1249  		case <-ctx.Done():
  1250  			return ctx.Err()
  1251  		case <-ticker.C:
  1252  		}
  1253  	}
  1254  	log.G(ctx).Infof("raft: transfer leadership %x -> %x finished in %v", n.Config.ID, leader, time.Since(start))
  1255  	return nil
  1256  }
  1257  
  1258  // RemoveMember submits a configuration change to remove a member from the raft cluster
  1259  // after checking if the operation would not result in a loss of quorum.
  1260  func (n *Node) RemoveMember(ctx context.Context, id uint64) error {
  1261  	ctx, cancel := n.WithContext(ctx)
  1262  	defer cancel()
  1263  	return n.removeMember(ctx, id)
  1264  }
  1265  
  1266  // processRaftMessageLogger is used to lazily create a logger for
  1267  // ProcessRaftMessage. Usually nothing will be logged, so it is useful to avoid
  1268  // formatting strings and allocating a logger when it won't be used.
  1269  func (n *Node) processRaftMessageLogger(ctx context.Context, msg *api.ProcessRaftMessageRequest) *logrus.Entry {
  1270  	fields := logrus.Fields{
  1271  		"method": "(*Node).ProcessRaftMessage",
  1272  	}
  1273  
  1274  	if n.IsMember() {
  1275  		fields["raft_id"] = fmt.Sprintf("%x", n.Config.ID)
  1276  	}
  1277  
  1278  	if msg != nil && msg.Message != nil {
  1279  		fields["from"] = fmt.Sprintf("%x", msg.Message.From)
  1280  	}
  1281  
  1282  	return log.G(ctx).WithFields(fields)
  1283  }
  1284  
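        // reportNewAddress updates the transport's address for the given peer by
        // combining the host observed on the incoming GRPC connection with the
        // port from the peer's previously known address.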
  1285  func (n *Node) reportNewAddress(ctx context.Context, id uint64) error {
  1286  	// too early
  1287  	if !n.IsMember() {
  1288  		return nil
  1289  	}
  1290  	p, ok := peer.FromContext(ctx)
  1291  	if !ok {
  1292  		return nil
  1293  	}
  1294  	oldAddr, err := n.transport.PeerAddr(id)
  1295  	if err != nil {
  1296  		return err
  1297  	}
  1298  	if oldAddr == "" {
  1299  		// Don't know the address of the peer yet, so can't report an
  1300  		// update.
  1301  		return nil
  1302  	}
  1303  	newHost, _, err := net.SplitHostPort(p.Addr.String())
  1304  	if err != nil {
  1305  		return err
  1306  	}
  1307  	_, officialPort, err := net.SplitHostPort(oldAddr)
  1308  	if err != nil {
  1309  		return err
  1310  	}
  1311  	newAddr := net.JoinHostPort(newHost, officialPort)
  1312  	return n.transport.UpdatePeerAddr(id, newAddr)
  1313  }
  1314  
  1315  // StreamRaftMessage is the server endpoint for streaming Raft messages.
  1316  // It accepts a stream of raft messages to be processed on this raft member,
  1317  // returning a StreamRaftMessageResponse when processing of the streamed
  1318  // messages is complete.
  1319  // It is called from the Raft leader, which uses it to stream messages
  1320  // to this raft member.
  1321  // A single stream corresponds to a single raft message,
  1322  // which may be disassembled and streamed by the sender
  1323  // as individual messages. Therefore, each of the messages
  1324  // received by the stream will have the same raft message type and index.
  1325  // Currently, only messages of type raftpb.MsgSnap can be disassembled, sent
  1326  // and received on the stream.
  1327  func (n *Node) StreamRaftMessage(stream api.Raft_StreamRaftMessageServer) error {
  1328  	// recvdMsg is the current message received from the stream.
  1329  	// assembledMessage is where the data from recvdMsg is appended to.
  1330  	var recvdMsg, assembledMessage *api.StreamRaftMessageRequest
  1331  	var err error
  1332  
  1333  	// First message index.
  1334  	var raftMsgIndex uint64
  1335  
  1336  	for {
  1337  		recvdMsg, err = stream.Recv()
  1338  		if err == io.EOF {
  1339  			break
  1340  		} else if err != nil {
  1341  			log.G(stream.Context()).WithError(err).Error("error while reading from stream")
  1342  			return err
  1343  		}
  1344  
  1345  		// Initialize the message to be used for assembling
  1346  		// the raft message.
  1347  		if assembledMessage == nil {
  1348  			// For all message types except raftpb.MsgSnap,
  1349  			// we don't expect more than a single message
  1350  			// on the stream so we'll get an EOF on the next Recv()
  1351  			// and go on to process the received message.
  1352  			assembledMessage = recvdMsg
  1353  			raftMsgIndex = recvdMsg.Message.Index
  1354  			continue
  1355  		}
  1356  
  1357  		// Verify raft message index.
  1358  		if recvdMsg.Message.Index != raftMsgIndex {
  1359  			errMsg := fmt.Sprintf("Raft message chunk with index %d is different from the previously received raft message index %d",
  1360  				recvdMsg.Message.Index, raftMsgIndex)
  1361  			log.G(stream.Context()).Errorf(errMsg)
  1362  			return status.Errorf(codes.InvalidArgument, "%s", errMsg)
  1363  		}
  1364  
  1365  		// Verify that multiple messages received on a stream
  1366  		// can only be of type raftpb.MsgSnap.
  1367  		if recvdMsg.Message.Type != raftpb.MsgSnap {
  1368  			errMsg := fmt.Sprintf("Raft message chunk is not of type %d",
  1369  				raftpb.MsgSnap)
  1370  			log.G(stream.Context()).Errorf(errMsg)
  1371  			return status.Errorf(codes.InvalidArgument, "%s", errMsg)
  1372  		}
  1373  
  1374  		// Append the received snapshot data.
  1375  		assembledMessage.Message.Snapshot.Data = append(assembledMessage.Message.Snapshot.Data, recvdMsg.Message.Snapshot.Data...)
  1376  	}
  1377  
  1378  	// We should have the complete snapshot. Verify and process.
  1379  	if err == io.EOF {
  1380  		_, err = n.ProcessRaftMessage(stream.Context(), &api.ProcessRaftMessageRequest{Message: assembledMessage.Message})
  1381  		if err == nil {
  1382  			// Translate the response of ProcessRaftMessage() from
  1383  			// ProcessRaftMessageResponse to StreamRaftMessageResponse if needed.
  1384  			return stream.SendAndClose(&api.StreamRaftMessageResponse{})
  1385  		}
  1386  	}
  1387  
  1388  	return err
  1389  }
  1390  
  1391  // ProcessRaftMessage calls 'Step' which advances the
  1392  // raft state machine with the provided message on the
  1393  // receiving node
  1394  func (n *Node) ProcessRaftMessage(ctx context.Context, msg *api.ProcessRaftMessageRequest) (*api.ProcessRaftMessageResponse, error) {
  1395  	if msg == nil || msg.Message == nil {
  1396  		n.processRaftMessageLogger(ctx, msg).Debug("received empty message")
  1397  		return &api.ProcessRaftMessageResponse{}, nil
  1398  	}
  1399  
  1400  	// Don't process the message if this comes from
  1401  	// a node in the remove set
  1402  	if n.cluster.IsIDRemoved(msg.Message.From) {
  1403  		n.processRaftMessageLogger(ctx, msg).Debug("received message from removed member")
  1404  		return nil, status.Errorf(codes.NotFound, "%s", membership.ErrMemberRemoved.Error())
  1405  	}
  1406  
  1407  	ctx, cancel := n.WithContext(ctx)
  1408  	defer cancel()
  1409  
  1410  	// TODO(aaronl): Address changes are temporarily disabled.
  1411  	// See https://github.com/docker/docker/issues/30455.
  1412  	// This should be reenabled in the future with additional
  1413  	// safeguards (perhaps storing multiple addresses per node).
  1414  	//if err := n.reportNewAddress(ctx, msg.Message.From); err != nil {
  1415  	//	log.G(ctx).WithError(err).Errorf("failed to report new address of %x to transport", msg.Message.From)
  1416  	//}
  1417  
  1418  	// Reject vote requests from unreachable peers
  1419  	if msg.Message.Type == raftpb.MsgVote {
  1420  		member := n.cluster.GetMember(msg.Message.From)
  1421  		if member == nil {
  1422  			n.processRaftMessageLogger(ctx, msg).Debug("received message from unknown member")
  1423  			return &api.ProcessRaftMessageResponse{}, nil
  1424  		}
  1425  
  1426  		if err := n.transport.HealthCheck(ctx, msg.Message.From); err != nil {
  1427  			n.processRaftMessageLogger(ctx, msg).WithError(err).Debug("member which sent vote request failed health check")
  1428  			return &api.ProcessRaftMessageResponse{}, nil
  1429  		}
  1430  	}
  1431  
  1432  	if msg.Message.Type == raftpb.MsgProp {
  1433  		// We don't accept forwarded proposals. Our
  1434  		// current architecture depends on only the leader
  1435  		// making proposals, so in-flight proposals can be
  1436  		// guaranteed not to conflict.
  1437  		n.processRaftMessageLogger(ctx, msg).Debug("dropped forwarded proposal")
  1438  		return &api.ProcessRaftMessageResponse{}, nil
  1439  	}
  1440  
  1441  	// can't stop the raft node while an async RPC is in progress
  1442  	n.stopMu.RLock()
  1443  	defer n.stopMu.RUnlock()
  1444  
  1445  	if n.IsMember() {
  1446  		if msg.Message.To != n.Config.ID {
  1447  			n.processRaftMessageLogger(ctx, msg).Errorf("received message intended for raft_id %x", msg.Message.To)
  1448  			return &api.ProcessRaftMessageResponse{}, nil
  1449  		}
  1450  
  1451  		if err := n.raftNode.Step(ctx, *msg.Message); err != nil {
  1452  			n.processRaftMessageLogger(ctx, msg).WithError(err).Debug("raft Step failed")
  1453  		}
  1454  	}
  1455  
  1456  	return &api.ProcessRaftMessageResponse{}, nil
  1457  }
  1458  
  1459  // ResolveAddress returns the address for reaching a given node ID.
  1460  func (n *Node) ResolveAddress(ctx context.Context, msg *api.ResolveAddressRequest) (*api.ResolveAddressResponse, error) {
  1461  	if !n.IsMember() {
  1462  		return nil, ErrNoRaftMember
  1463  	}
  1464  
  1465  	nodeInfo, err := ca.RemoteNode(ctx)
  1466  	if err != nil {
  1467  		return nil, err
  1468  	}
  1469  
  1470  	fields := logrus.Fields{
  1471  		"node.id": nodeInfo.NodeID,
  1472  		"method":  "(*Node).ResolveAddress",
  1473  		"raft_id": fmt.Sprintf("%x", n.Config.ID),
  1474  	}
  1475  	if nodeInfo.ForwardedBy != nil {
  1476  		fields["forwarder.id"] = nodeInfo.ForwardedBy.NodeID
  1477  	}
  1478  	log.G(ctx).WithFields(fields).Debug("")
  1479  
  1480  	member := n.cluster.GetMember(msg.RaftID)
  1481  	if member == nil {
  1482  		return nil, status.Errorf(codes.NotFound, "member %x not found", msg.RaftID)
  1483  	}
  1484  	return &api.ResolveAddressResponse{Addr: member.Addr}, nil
  1485  }
  1486  
  1487  func (n *Node) getLeaderConn() (*grpc.ClientConn, error) {
  1488  	leader, err := n.Leader()
  1489  	if err != nil {
  1490  		return nil, err
  1491  	}
  1492  
  1493  	if leader == n.Config.ID {
  1494  		return nil, raftselector.ErrIsLeader
  1495  	}
  1496  	conn, err := n.transport.PeerConn(leader)
  1497  	if err != nil {
  1498  		return nil, errors.Wrap(err, "failed to get connection to leader")
  1499  	}
  1500  	return conn, nil
  1501  }
  1502  
  1503  // LeaderConn returns the current connection to the cluster leader, or
  1504  // raftselector.ErrIsLeader if the current machine is the leader.
  1505  func (n *Node) LeaderConn(ctx context.Context) (*grpc.ClientConn, error) {
  1506  	cc, err := n.getLeaderConn()
  1507  	if err == nil {
  1508  		return cc, nil
  1509  	}
  1510  	if err == raftselector.ErrIsLeader {
  1511  		return nil, err
  1512  	}
  1513  	if atomic.LoadUint32(&n.ticksWithNoLeader) > lostQuorumTimeout {
  1514  		return nil, errLostQuorum
  1515  	}
  1516  
  1517  	ticker := time.NewTicker(1 * time.Second)
  1518  	defer ticker.Stop()
  1519  	for {
  1520  		select {
  1521  		case <-ticker.C:
  1522  			cc, err := n.getLeaderConn()
  1523  			if err == nil {
  1524  				return cc, nil
  1525  			}
  1526  			if err == raftselector.ErrIsLeader {
  1527  				return nil, err
  1528  			}
  1529  		case <-ctx.Done():
  1530  			return nil, ctx.Err()
  1531  		}
  1532  	}
  1533  }
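
// exampleLeaderForwarding is an editor-added sketch, not part of the original
// swarmkit source, illustrating the intended use of LeaderConn: follower
// managers obtain a gRPC connection to the current leader and forward requests
// over it, while the leader itself receives raftselector.ErrIsLeader and
// handles the request locally. The returned bool reports whether this node is
// the leader.
func exampleLeaderForwarding(ctx context.Context, n *Node) (*grpc.ClientConn, bool, error) {
	conn, err := n.LeaderConn(ctx)
	if err == raftselector.ErrIsLeader {
		// This node is the leader; no forwarding is necessary.
		return nil, true, nil
	}
	if err != nil {
		// No leader is reachable (for example, quorum was lost) or the
		// context expired while waiting for one.
		return nil, false, err
	}
	return conn, false, nil
}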
  1534  
  1535  // registerNode registers a new node on the cluster memberlist
  1536  func (n *Node) registerNode(node *api.RaftMember) error {
  1537  	if n.cluster.IsIDRemoved(node.RaftID) {
  1538  		return nil
  1539  	}
  1540  
  1541  	member := &membership.Member{}
  1542  
  1543  	existingMember := n.cluster.GetMember(node.RaftID)
  1544  	if existingMember != nil {
  1545  		// Member already exists
  1546  
  1547  		// If the address is different from what we thought it was,
  1548  		// update it. This can happen if we just joined a cluster
  1549  		// and are adding ourselves now with the remotely-reachable
  1550  		// address.
  1551  		if existingMember.Addr != node.Addr {
  1552  			if node.RaftID != n.Config.ID {
  1553  				if err := n.transport.UpdatePeer(node.RaftID, node.Addr); err != nil {
  1554  					return err
  1555  				}
  1556  			}
  1557  			member.RaftMember = node
  1558  			n.cluster.AddMember(member)
  1559  		}
  1560  
  1561  		return nil
  1562  	}
  1563  
  1564  	// Avoid opening a connection to the local node
  1565  	if node.RaftID != n.Config.ID {
  1566  		if err := n.transport.AddPeer(node.RaftID, node.Addr); err != nil {
  1567  			return err
  1568  		}
  1569  	}
  1570  
  1571  	member.RaftMember = node
  1572  	err := n.cluster.AddMember(member)
  1573  	if err != nil {
  1574  		if rerr := n.transport.RemovePeer(node.RaftID); rerr != nil {
  1575  			return errors.Wrapf(rerr, "failed to remove peer after error %v", err)
  1576  		}
  1577  		return err
  1578  	}
  1579  
  1580  	return nil
  1581  }
  1582  
  1583  // ProposeValue calls Propose on the underlying raft library (etcd/raft) and waits
  1584  // for the proposal to be committed to the raft log before returning a result.
  1585  func (n *Node) ProposeValue(ctx context.Context, storeAction []api.StoreAction, cb func()) error {
  1586  	defer metrics.StartTimer(proposeLatencyTimer)()
  1587  	ctx, cancel := n.WithContext(ctx)
  1588  	defer cancel()
  1589  	_, err := n.processInternalRaftRequest(ctx, &api.InternalRaftRequest{Action: storeAction}, cb)
  1590  
  1591  	return err
  1592  }
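
// exampleProposeValueUsage is an editor-added sketch, not part of the original
// swarmkit source, showing how a caller typically proposes store actions
// through raft. The callback passed to ProposeValue runs once the entry has
// been committed and applied on this node; typical error values include
// ErrLostLeadership and ErrRequestTooLarge. The actions slice is assumed to
// have been prepared by the caller.
func exampleProposeValueUsage(ctx context.Context, n *Node, actions []api.StoreAction) error {
	return n.ProposeValue(ctx, actions, func() {
		// Commit callback: the proposal is now part of the replicated log.
		log.G(ctx).Debug("store actions committed through raft")
	})
}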
  1593  
  1594  // GetVersion returns the sequence information for the current raft round.
  1595  func (n *Node) GetVersion() *api.Version {
  1596  	n.stopMu.RLock()
  1597  	defer n.stopMu.RUnlock()
  1598  
  1599  	if !n.IsMember() {
  1600  		return nil
  1601  	}
  1602  
  1603  	status := n.Status()
  1604  	return &api.Version{Index: status.Commit}
  1605  }
  1606  
  1607  // ChangesBetween returns the changes starting after "from", up to and
  1608  // including "to". If these changes are not available because the log
  1609  // has been compacted, an error will be returned.
  1610  func (n *Node) ChangesBetween(from, to api.Version) ([]state.Change, error) {
  1611  	n.stopMu.RLock()
  1612  	defer n.stopMu.RUnlock()
  1613  
  1614  	if from.Index > to.Index {
  1615  		return nil, errors.New("versions are out of order")
  1616  	}
  1617  
  1618  	if !n.IsMember() {
  1619  		return nil, ErrNoRaftMember
  1620  	}
  1621  
  1622  	// never returns error
  1623  	last, _ := n.raftStore.LastIndex()
  1624  
  1625  	if to.Index > last {
  1626  		return nil, errors.New("last version is out of bounds")
  1627  	}
  1628  
  1629  	pbs, err := n.raftStore.Entries(from.Index+1, to.Index+1, math.MaxUint64)
  1630  	if err != nil {
  1631  		return nil, err
  1632  	}
  1633  
  1634  	var changes []state.Change
  1635  	for _, pb := range pbs {
  1636  		if pb.Type != raftpb.EntryNormal || pb.Data == nil {
  1637  			continue
  1638  		}
  1639  		r := &api.InternalRaftRequest{}
  1640  		err := proto.Unmarshal(pb.Data, r)
  1641  		if err != nil {
  1642  			return nil, errors.Wrap(err, "error unmarshalling internal raft request")
  1643  		}
  1644  
  1645  		if r.Action != nil {
  1646  			changes = append(changes, state.Change{StoreActions: r.Action, Version: api.Version{Index: pb.Index}})
  1647  		}
  1648  	}
  1649  
  1650  	return changes, nil
  1651  }
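
// exampleCatchUpChanges is an editor-added sketch, not part of the original
// swarmkit source, showing how ChangesBetween is typically driven from two
// Version values: it fetches every committed change with an index greater than
// lastSeen.Index and no greater than the current commit index reported by
// GetVersion, or fails if that range has already been compacted out of the log.
func exampleCatchUpChanges(n *Node, lastSeen api.Version) ([]state.Change, error) {
	current := n.GetVersion()
	if current == nil {
		// The node is not (or no longer) a raft member.
		return nil, ErrNoRaftMember
	}
	return n.ChangesBetween(lastSeen, *current)
}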
  1652  
  1653  // SubscribePeers subscribes to peer updates in the cluster. It always sends the
  1654  // full list of peers.
  1655  func (n *Node) SubscribePeers() (q chan events.Event, cancel func()) {
  1656  	return n.cluster.PeersBroadcast.Watch()
  1657  }
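
// exampleWatchPeers is an editor-added sketch, not part of the original
// swarmkit source, of a SubscribePeers consumer. Every event carries the full
// peer list, so the receiver can replace whatever view it held before; the
// payload type is assumed here to be []*api.Peer.
func exampleWatchPeers(ctx context.Context, n *Node) {
	ch, cancel := n.SubscribePeers()
	defer cancel()
	for {
		select {
		case e := <-ch:
			if peers, ok := e.([]*api.Peer); ok {
				log.G(ctx).Debugf("raft peer list updated: %d peers", len(peers))
			}
		case <-ctx.Done():
			return
		}
	}
}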
  1658  
  1659  // GetMemberlist returns the current list of raft members in the cluster.
  1660  func (n *Node) GetMemberlist() map[uint64]*api.RaftMember {
  1661  	memberlist := make(map[uint64]*api.RaftMember)
  1662  	members := n.cluster.Members()
  1663  	leaderID, err := n.Leader()
  1664  	if err != nil {
  1665  		leaderID = raft.None
  1666  	}
  1667  
  1668  	for id, member := range members {
  1669  		reachability := api.RaftMemberStatus_REACHABLE
  1670  		leader := false
  1671  
  1672  		if member.RaftID != n.Config.ID {
  1673  			if !n.transport.Active(member.RaftID) {
  1674  				reachability = api.RaftMemberStatus_UNREACHABLE
  1675  			}
  1676  		}
  1677  
  1678  		if member.RaftID == leaderID {
  1679  			leader = true
  1680  		}
  1681  
  1682  		memberlist[id] = &api.RaftMember{
  1683  			RaftID: member.RaftID,
  1684  			NodeID: member.NodeID,
  1685  			Addr:   member.Addr,
  1686  			Status: api.RaftMemberStatus{
  1687  				Leader:       leader,
  1688  				Reachability: reachability,
  1689  			},
  1690  		}
  1691  	}
  1692  
  1693  	return memberlist
  1694  }
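
// exampleLogMemberlist is an editor-added sketch, not part of the original
// swarmkit source, showing how the map produced by GetMemberlist can be
// consumed: each entry reports the member's address, its reachability as seen
// by the transport, and whether it is the current leader.
func exampleLogMemberlist(ctx context.Context, n *Node) {
	for id, member := range n.GetMemberlist() {
		log.G(ctx).WithFields(logrus.Fields{
			"raft_id":      fmt.Sprintf("%x", id),
			"addr":         member.Addr,
			"leader":       member.Status.Leader,
			"reachability": member.Status.Reachability.String(),
		}).Debug("raft member")
	}
}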
  1695  
  1696  // Status returns the status of the underlying etcd raft node.
  1697  func (n *Node) Status() raft.Status {
  1698  	return n.raftNode.Status()
  1699  }
  1700  
  1701  // GetMemberByNodeID returns member information based
  1702  // on the member's generic node ID.
  1703  func (n *Node) GetMemberByNodeID(nodeID string) *membership.Member {
  1704  	members := n.cluster.Members()
  1705  	for _, member := range members {
  1706  		if member.NodeID == nodeID {
  1707  			return member
  1708  		}
  1709  	}
  1710  	return nil
  1711  }
  1712  
  1713  // GetNodeIDByRaftID returns the generic Node ID of a member given its raft ID.
  1714  // It returns ErrMemberUnknown if the raft ID is unknown.
  1715  func (n *Node) GetNodeIDByRaftID(raftID uint64) (string, error) {
  1716  	if member, ok := n.cluster.Members()[raftID]; ok {
  1717  		return member.NodeID, nil
  1718  	}
  1719  	// this is the only possible error value that should be returned; the
  1720  	// manager code depends on this. if you need to add more errors later, make
  1721  	// sure that you update the callers of this method accordingly
  1722  	return "", ErrMemberUnknown
  1723  }
  1724  
  1725  // IsMember checks if the raft node has effectively joined
  1726  // a cluster of existing members.
  1727  func (n *Node) IsMember() bool {
  1728  	return atomic.LoadUint32(&n.isMember) == 1
  1729  }
  1730  
  1731  // saveToStorage saves the hard state, log entries, and any snapshot to our Store.
  1732  func (n *Node) saveToStorage(
  1733  	ctx context.Context,
  1734  	raftConfig *api.RaftConfig,
  1735  	hardState raftpb.HardState,
  1736  	entries []raftpb.Entry,
  1737  	snapshot raftpb.Snapshot,
  1738  ) (err error) {
  1739  
  1740  	if !raft.IsEmptySnap(snapshot) {
  1741  		if err := n.raftLogger.SaveSnapshot(snapshot); err != nil {
  1742  			return errors.Wrap(err, "failed to save snapshot")
  1743  		}
  1744  		if err := n.raftLogger.GC(snapshot.Metadata.Index, snapshot.Metadata.Term, raftConfig.KeepOldSnapshots); err != nil {
  1745  			log.G(ctx).WithError(err).Error("unable to clean old snapshots and WALs")
  1746  		}
  1747  		if err = n.raftStore.ApplySnapshot(snapshot); err != nil {
  1748  			return errors.Wrap(err, "failed to apply snapshot on raft node")
  1749  		}
  1750  	}
  1751  
  1752  	if err := n.raftLogger.SaveEntries(hardState, entries); err != nil {
  1753  		return errors.Wrap(err, "failed to save raft log entries")
  1754  	}
  1755  
  1756  	if len(entries) > 0 {
  1757  		lastIndex := entries[len(entries)-1].Index
  1758  		if lastIndex > n.writtenWALIndex {
  1759  			n.writtenWALIndex = lastIndex
  1760  		}
  1761  	}
  1762  
  1763  	if err = n.raftStore.Append(entries); err != nil {
  1764  		return errors.Wrap(err, "failed to append raft log entries")
  1765  	}
  1766  
  1767  	return nil
  1768  }
  1769  
  1770  // processInternalRaftRequest proposes a value to be appended to the raft log.
  1771  // It calls Propose() on etcd/raft, which calls back into the raft FSM,
  1772  // which then sends a message to each of the participating nodes
  1773  // in the raft group to apply a log entry and then waits for it to be applied
  1774  // on this node. It will block until:
  1775  // 1. this node gets the necessary replies back from the participating nodes and also performs the commit itself, or
  1776  // 2. an error occurs, or
  1777  // 3. the raft node finalizes all the proposals on node shutdown.
  1778  func (n *Node) processInternalRaftRequest(ctx context.Context, r *api.InternalRaftRequest, cb func()) (proto.Message, error) {
  1779  	n.stopMu.RLock()
  1780  	if !n.IsMember() {
  1781  		n.stopMu.RUnlock()
  1782  		return nil, ErrStopped
  1783  	}
  1784  	n.waitProp.Add(1)
  1785  	defer n.waitProp.Done()
  1786  	n.stopMu.RUnlock()
  1787  
  1788  	r.ID = n.reqIDGen.Next()
  1789  
  1790  	// This must be derived from the context which is cancelled by stop()
  1791  	// to avoid a deadlock on shutdown.
  1792  	waitCtx, cancel := context.WithCancel(ctx)
  1793  
  1794  	ch := n.wait.register(r.ID, cb, cancel)
  1795  
  1796  	// Do this check after calling register to avoid a race.
  1797  	if atomic.LoadUint32(&n.signalledLeadership) != 1 {
  1798  		log.G(ctx).Error("node is no longer leader, aborting propose")
  1799  		n.wait.cancel(r.ID)
  1800  		return nil, ErrLostLeadership
  1801  	}
  1802  
  1803  	data, err := r.Marshal()
  1804  	if err != nil {
  1805  		n.wait.cancel(r.ID)
  1806  		return nil, err
  1807  	}
  1808  
  1809  	if len(data) > store.MaxTransactionBytes {
  1810  		n.wait.cancel(r.ID)
  1811  		return nil, ErrRequestTooLarge
  1812  	}
  1813  
  1814  	err = n.raftNode.Propose(waitCtx, data)
  1815  	if err != nil {
  1816  		n.wait.cancel(r.ID)
  1817  		return nil, err
  1818  	}
  1819  
  1820  	select {
  1821  	case x, ok := <-ch:
  1822  		if !ok {
  1823  			// Wait notification channel was closed. This should only happen if the wait was cancelled.
  1824  			log.G(ctx).Error("wait cancelled")
  1825  			if atomic.LoadUint32(&n.signalledLeadership) == 1 {
  1826  				log.G(ctx).Error("wait cancelled but node is still a leader")
  1827  			}
  1828  			return nil, ErrLostLeadership
  1829  		}
  1830  		return x.(proto.Message), nil
  1831  	case <-waitCtx.Done():
  1832  		n.wait.cancel(r.ID)
  1833  		// If we can read from the channel, wait item was triggered. Otherwise it was cancelled.
  1834  		x, ok := <-ch
  1835  		if !ok {
  1836  			log.G(ctx).WithError(waitCtx.Err()).Error("wait context cancelled")
  1837  			if atomic.LoadUint32(&n.signalledLeadership) == 1 {
  1838  				log.G(ctx).Error("wait context cancelled but node is still a leader")
  1839  			}
  1840  			return nil, ErrLostLeadership
  1841  		}
  1842  		return x.(proto.Message), nil
  1843  	case <-ctx.Done():
  1844  		n.wait.cancel(r.ID)
  1845  		// if channel is closed, wait item was canceled, otherwise it was triggered
  1846  		x, ok := <-ch
  1847  		if !ok {
  1848  			return nil, ctx.Err()
  1849  		}
  1850  		return x.(proto.Message), nil
  1851  	}
  1852  }
  1853  
  1854  // configure sends a configuration change through consensus and
  1855  // then waits for it to be applied to the server. It will block
  1856  // until the change is performed or there is an error.
  1857  func (n *Node) configure(ctx context.Context, cc raftpb.ConfChange) error {
  1858  	cc.ID = n.reqIDGen.Next()
  1859  
  1860  	ctx, cancel := context.WithCancel(ctx)
  1861  	ch := n.wait.register(cc.ID, nil, cancel)
  1862  
  1863  	if err := n.raftNode.ProposeConfChange(ctx, cc); err != nil {
  1864  		n.wait.cancel(cc.ID)
  1865  		return err
  1866  	}
  1867  
  1868  	select {
  1869  	case x := <-ch:
  1870  		if err, ok := x.(error); ok {
  1871  			return err
  1872  		}
  1873  		if x != nil {
  1874  			log.G(ctx).Panic("raft: configuration change error, return type should always be error")
  1875  		}
  1876  		return nil
  1877  	case <-ctx.Done():
  1878  		n.wait.cancel(cc.ID)
  1879  		return ctx.Err()
  1880  	}
  1881  }
  1882  
  1883  func (n *Node) processCommitted(ctx context.Context, entry raftpb.Entry) error {
  1884  	// Process a normal entry
  1885  	if entry.Type == raftpb.EntryNormal && entry.Data != nil {
  1886  		if err := n.processEntry(ctx, entry); err != nil {
  1887  			return err
  1888  		}
  1889  	}
  1890  
  1891  	// Process a configuration change (add/remove node)
  1892  	if entry.Type == raftpb.EntryConfChange {
  1893  		n.processConfChange(ctx, entry)
  1894  	}
  1895  
  1896  	n.appliedIndex = entry.Index
  1897  	return nil
  1898  }
  1899  
  1900  func (n *Node) processEntry(ctx context.Context, entry raftpb.Entry) error {
  1901  	r := &api.InternalRaftRequest{}
  1902  	err := proto.Unmarshal(entry.Data, r)
  1903  	if err != nil {
  1904  		return err
  1905  	}
  1906  
  1907  	if !n.wait.trigger(r.ID, r) {
  1908  		// There was no wait on this ID, meaning we don't have a
  1909  		// transaction in progress that would be committed to the
  1910  		// memory store by the "trigger" call. This could mean that:
  1911  		// 1. Startup is in progress, and the raft WAL is being parsed,
  1912  		// processed and applied to the store, or
  1913  		// 2. Either a different node wrote this to raft,
  1914  		// or we wrote it before losing the leader
  1915  		// position and cancelling the transaction. This entry still needs
  1916  		// to be committed since other nodes have already committed it.
  1917  		// Create a new transaction to commit this entry.
  1918  
  1919  		// It should not be possible for processInternalRaftRequest
  1920  		// to be running in this situation, but out of caution we
  1921  		// cancel any current invocations to avoid a deadlock.
  1922  		// TODO(anshul) This call is likely redundant, remove after consideration.
  1923  		n.wait.cancelAll()
  1924  
  1925  		err := n.memoryStore.ApplyStoreActions(r.Action)
  1926  		if err != nil {
  1927  			log.G(ctx).WithError(err).Error("failed to apply actions from raft")
  1928  		}
  1929  	}
  1930  	return nil
  1931  }
  1932  
  1933  func (n *Node) processConfChange(ctx context.Context, entry raftpb.Entry) {
  1934  	var (
  1935  		err error
  1936  		cc  raftpb.ConfChange
  1937  	)
  1938  
  1939  	if err := proto.Unmarshal(entry.Data, &cc); err != nil {
  1940  		n.wait.trigger(cc.ID, err)
  1941  	}
  1942  
  1943  	if err := n.cluster.ValidateConfigurationChange(cc); err != nil {
  1944  		n.wait.trigger(cc.ID, err)
  1945  	}
  1946  
  1947  	switch cc.Type {
  1948  	case raftpb.ConfChangeAddNode:
  1949  		err = n.applyAddNode(cc)
  1950  	case raftpb.ConfChangeUpdateNode:
  1951  		err = n.applyUpdateNode(ctx, cc)
  1952  	case raftpb.ConfChangeRemoveNode:
  1953  		err = n.applyRemoveNode(ctx, cc)
  1954  	}
  1955  
  1956  	if err != nil {
  1957  		n.wait.trigger(cc.ID, err)
  1958  	}
  1959  
  1960  	n.confState = *n.raftNode.ApplyConfChange(cc)
  1961  	n.wait.trigger(cc.ID, nil)
  1962  }
  1963  
  1964  // applyAddNode is called when we receive a ConfChange
  1965  // from a member in the raft cluster; it adds a new
  1966  // node to the existing raft cluster.
  1967  func (n *Node) applyAddNode(cc raftpb.ConfChange) error {
  1968  	member := &api.RaftMember{}
  1969  	err := proto.Unmarshal(cc.Context, member)
  1970  	if err != nil {
  1971  		return err
  1972  	}
  1973  
  1974  	// ID must be non-zero
  1975  	if member.RaftID == 0 {
  1976  		return nil
  1977  	}
  1978  
  1979  	return n.registerNode(member)
  1980  }
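
// exampleAddMemberConfChange is an editor-added sketch, not part of the
// original swarmkit source, showing the shape of the ConfChange that
// applyAddNode later consumes: the api.RaftMember describing the new node is
// marshalled into the ConfChange's Context field before the change is
// proposed through the consensus machinery.
func exampleAddMemberConfChange(member *api.RaftMember) (raftpb.ConfChange, error) {
	meta, err := member.Marshal()
	if err != nil {
		return raftpb.ConfChange{}, err
	}
	return raftpb.ConfChange{
		Type:    raftpb.ConfChangeAddNode,
		NodeID:  member.RaftID,
		Context: meta,
	}, nil
}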
  1981  
  1982  // applyUpdateNode is called when we receive a ConfChange from a member in the
  1983  // raft cluster which updates the address of an existing node.
  1984  func (n *Node) applyUpdateNode(ctx context.Context, cc raftpb.ConfChange) error {
  1985  	newMember := &api.RaftMember{}
  1986  	err := proto.Unmarshal(cc.Context, newMember)
  1987  	if err != nil {
  1988  		return err
  1989  	}
  1990  
  1991  	if newMember.RaftID == n.Config.ID {
  1992  		return nil
  1993  	}
  1994  	if err := n.transport.UpdatePeer(newMember.RaftID, newMember.Addr); err != nil {
  1995  		return err
  1996  	}
  1997  	return n.cluster.UpdateMember(newMember.RaftID, newMember)
  1998  }
  1999  
  2000  // applyRemoveNode is called when we receive a ConfChange
  2001  // from a member in the raft cluster; it removes a node
  2002  // from the existing raft cluster.
  2003  func (n *Node) applyRemoveNode(ctx context.Context, cc raftpb.ConfChange) (err error) {
  2004  	// If the member being removed is the current leader and this
  2005  	// node is a follower, campaign to become the leader before
  2006  	// applying the removal.
  2007  
  2008  	if cc.NodeID == n.leader() && !n.isLeader() {
  2009  		if err = n.raftNode.Campaign(ctx); err != nil {
  2010  			return err
  2011  		}
  2012  	}
  2013  
  2014  	if cc.NodeID == n.Config.ID {
  2015  		// wait for the commit ack to be sent before closing connection
  2016  		n.asyncTasks.Wait()
  2017  
  2018  		n.NodeRemoved()
  2019  	} else if err := n.transport.RemovePeer(cc.NodeID); err != nil {
  2020  		return err
  2021  	}
  2022  
  2023  	return n.cluster.RemoveMember(cc.NodeID)
  2024  }
  2025  
  2026  // SubscribeLeadership returns a channel to which events about leadership changes
  2027  // will be sent in the form of raft.LeadershipState. A cancel func is also returned;
  2028  // it should be called when the listener is no longer interested in events.
  2029  func (n *Node) SubscribeLeadership() (q chan events.Event, cancel func()) {
  2030  	return n.leadershipBroadcast.Watch()
  2031  }
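
// exampleWatchLeadership is an editor-added sketch, not part of the original
// swarmkit source, showing how a listener consumes leadership events. It
// assumes the package's LeadershipState values IsLeader and IsFollower; the
// cancel func must be called once the listener is done.
func exampleWatchLeadership(ctx context.Context, n *Node) {
	ch, cancel := n.SubscribeLeadership()
	defer cancel()
	for {
		select {
		case e := <-ch:
			if e.(LeadershipState) == IsLeader {
				log.G(ctx).Info("gained raft leadership")
			} else {
				log.G(ctx).Info("lost raft leadership")
			}
		case <-ctx.Done():
			return
		}
	}
}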
  2032  
  2033  // createConfigChangeEnts creates a series of Raft entries (i.e.
  2034  // EntryConfChange) to remove the set of given IDs from the cluster. The ID
  2035  // `self` is _not_ removed, even if present in the set.
  2036  // If `self` is not inside the given ids, it creates a Raft entry to add a
  2037  // default member with the given `self`.
  2038  func createConfigChangeEnts(ids []uint64, self uint64, term, index uint64) []raftpb.Entry {
  2039  	var ents []raftpb.Entry
  2040  	next := index + 1
  2041  	found := false
  2042  	for _, id := range ids {
  2043  		if id == self {
  2044  			found = true
  2045  			continue
  2046  		}
  2047  		cc := &raftpb.ConfChange{
  2048  			Type:   raftpb.ConfChangeRemoveNode,
  2049  			NodeID: id,
  2050  		}
  2051  		data, err := cc.Marshal()
  2052  		if err != nil {
  2053  			log.L.WithError(err).Panic("marshal configuration change should never fail")
  2054  		}
  2055  		e := raftpb.Entry{
  2056  			Type:  raftpb.EntryConfChange,
  2057  			Data:  data,
  2058  			Term:  term,
  2059  			Index: next,
  2060  		}
  2061  		ents = append(ents, e)
  2062  		next++
  2063  	}
  2064  	if !found {
  2065  		node := &api.RaftMember{RaftID: self}
  2066  		meta, err := node.Marshal()
  2067  		if err != nil {
  2068  			log.L.WithError(err).Panic("marshal member should never fail")
  2069  		}
  2070  		cc := &raftpb.ConfChange{
  2071  			Type:    raftpb.ConfChangeAddNode,
  2072  			NodeID:  self,
  2073  			Context: meta,
  2074  		}
  2075  		data, err := cc.Marshal()
  2076  		if err != nil {
  2077  			log.L.WithError(err).Panic("marshal configuration change should never fail")
  2078  		}
  2079  		e := raftpb.Entry{
  2080  			Type:  raftpb.EntryConfChange,
  2081  			Data:  data,
  2082  			Term:  term,
  2083  			Index: next,
  2084  		}
  2085  		ents = append(ents, e)
  2086  	}
  2087  	return ents
  2088  }
  2089  
  2090  // getIDs returns an ordered set of IDs included in the given snapshot and
  2091  // the entries. The given snapshot/entries can contain two kinds of
  2092  // ID-related entry:
  2093  // - ConfChangeAddNode, in which case the contained ID will be added into the set.
  2094  // - ConfChangeRemoveNode, in which case the contained ID will be removed from the set.
  2095  func getIDs(snap *raftpb.Snapshot, ents []raftpb.Entry) []uint64 {
  2096  	ids := make(map[uint64]struct{})
  2097  	if snap != nil {
  2098  		for _, id := range snap.Metadata.ConfState.Nodes {
  2099  			ids[id] = struct{}{}
  2100  		}
  2101  	}
  2102  	for _, e := range ents {
  2103  		if e.Type != raftpb.EntryConfChange {
  2104  			continue
  2105  		}
  2106  		if snap != nil && e.Index < snap.Metadata.Index {
  2107  			continue
  2108  		}
  2109  		var cc raftpb.ConfChange
  2110  		if err := cc.Unmarshal(e.Data); err != nil {
  2111  			log.L.WithError(err).Panic("unmarshal configuration change should never fail")
  2112  		}
  2113  		switch cc.Type {
  2114  		case raftpb.ConfChangeAddNode:
  2115  			ids[cc.NodeID] = struct{}{}
  2116  		case raftpb.ConfChangeRemoveNode:
  2117  			delete(ids, cc.NodeID)
  2118  		case raftpb.ConfChangeUpdateNode:
  2119  			// do nothing
  2120  		default:
  2121  			log.L.Panic("ConfChange Type should be either ConfChangeAddNode, or ConfChangeRemoveNode, or ConfChangeUpdateNode!")
  2122  		}
  2123  	}
  2124  	var sids []uint64
  2125  	for id := range ids {
  2126  		sids = append(sids, id)
  2127  	}
  2128  	return sids
  2129  }
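
// exampleForceNewClusterEntries is an editor-added sketch, not part of the
// original swarmkit source, showing how getIDs and createConfigChangeEnts work
// together: collect the member IDs known to a snapshot plus any later
// conf-change entries, then synthesize entries that remove every member except
// self, appended after the given term and index.
func exampleForceNewClusterEntries(snap *raftpb.Snapshot, ents []raftpb.Entry, self, term, index uint64) []raftpb.Entry {
	ids := getIDs(snap, ents)
	return createConfigChangeEnts(ids, self, term, index)
}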
  2130  
  2131  func (n *Node) reqTimeout() time.Duration {
  2132  	return 5*time.Second + 2*time.Duration(n.Config.ElectionTick)*n.opts.TickInterval
  2133  }