github.com/unionj-cloud/go-doudou/v2@v2.3.5/toolkit/memberlist/state.go

github.com/unionj-cloud/go-doudou/v2@v2.3.5/toolkit/memberlist/state.go (about)

     1  package memberlist
     2  
     3  import (
     4  	"bytes"
     5  	"fmt"
     6  	"github.com/shirou/gopsutil/cpu"
     7  	"math"
     8  	"math/rand"
     9  	"net"
    10  	"strings"
    11  	"sync/atomic"
    12  	"time"
    13  
    14  	metrics "github.com/armon/go-metrics"
    15  )
    16  
    17  type NodeStateType int
    18  
    19  const (
    20  	StateAlive NodeStateType = iota
    21  	StateSuspect
    22  	StateDead
    23  	StateLeft
    24  )
    25  
// Node represents a node in the cluster.
//
// Addr and Port together form the address produced by Address and
// FullAddress; Name is what String returns.
type Node struct {
	Name   string        // Node name; returned by String.
	Addr   string        // Host part of the address used by Address and FullAddress.
	Port   uint16        // Port part of the address used by Address and FullAddress.
	Meta   []byte        // Metadata from the delegate for this node.
	State  NodeStateType // State of the node.
	PMin   uint8         // Minimum protocol version this understands
	PMax   uint8         // Maximum protocol version this understands
	PCur   uint8         // Current version node is speaking
	DMin   uint8         // Min protocol version for the delegate to understand
	DMax   uint8         // Max protocol version for the delegate to understand
	DCur   uint8         // Current version delegate is speaking
	Weight int           // node weight for load balancing
}
    41  
    42  // Address returns the host:port form of a node's address, suitable for use
    43  // with a transport.
    44  func (n *Node) Address() string {
    45  	return joinHostPort(n.Addr, n.Port)
    46  }
    47  
    48  // FullAddress returns the node name and host:port form of a node's address,
    49  // suitable for use with a transport.
    50  func (n *Node) FullAddress() Address {
    51  	return Address{
    52  		Addr: joinHostPort(n.Addr, n.Port),
    53  		Name: n.Name,
    54  	}
    55  }
    56  
    57  // String returns the node name
    58  func (n *Node) String() string {
    59  	return n.Name
    60  }
    61  
// nodeState is used to manage our state view of another node.
//
// It embeds Node and additionally declares its own State and Weight fields.
// Because those are declared at a shallower depth than the embedded ones,
// the selectors n.State and n.Weight on a nodeState refer to the fields
// below, not to Node.State / Node.Weight.
type nodeState struct {
	Node
	Incarnation uint32        // Last known incarnation number
	State       NodeStateType // Current state
	StateChange time.Time     // Time last state change happened
	Weight      int           // node weight for load balancing
	WeightAt    int64         // UTC timestamp which node weight calculated at
}
    71  
    72  func NewNodeState(node Node, state NodeStateType) *nodeState {
    73  	return &nodeState{Node: node, State: state}
    74  }
    75  
    76  // Address returns the host:port form of a node's address, suitable for use
    77  // with a transport.
    78  func (n *nodeState) Address() string {
    79  	return n.Node.Address()
    80  }
    81  
    82  // FullAddress returns the node name and host:port form of a node's address,
    83  // suitable for use with a transport.
    84  func (n *nodeState) FullAddress() Address {
    85  	return n.Node.FullAddress()
    86  }
    87  
    88  func (n *nodeState) DeadOrLeft() bool {
    89  	return n.State == StateDead || n.State == StateLeft
    90  }
    91  
// ackHandler is used to register handlers for incoming acks and nacks.
type ackHandler struct {
	ackFn  func([]byte, time.Time) // called with the ack payload and a timestamp
	nackFn func()                  // called when a nack arrives
	timer  *time.Timer             // timer associated with this handler
}
    98  
    99  // NoPingResponseError is used to indicate a 'ping' packet was
   100  // successfully issued but no response was received
   101  type NoPingResponseError struct {
   102  	node string
   103  }
   104  
   105  func (f NoPingResponseError) Error() string {
   106  	return fmt.Sprintf("No response from node %s", f.node)
   107  }
   108  
   109  // Schedule is used to ensure the Tick is performed periodically. This
   110  // function is safe to call multiple times. If the memberlist is already
   111  // scheduled, then it won't do anything.
   112  func (m *Memberlist) schedule() {
   113  	m.tickerLock.Lock()
   114  	defer m.tickerLock.Unlock()
   115  
   116  	// If we already have tickers, then don't do anything, since we're
   117  	// scheduled
   118  	if len(m.tickers) > 0 {
   119  		return
   120  	}
   121  
   122  	// Create the stop tick channel, a blocking channel. We close this
   123  	// when we should stop the tickers.
   124  	stopCh := make(chan struct{})
   125  
   126  	// Create a new probeTicker
   127  	if m.config.ProbeInterval > 0 {
   128  		t := time.NewTicker(m.config.ProbeInterval)
   129  		go m.triggerFuncDynamic(func() time.Duration {
   130  			return m.config.ProbeInterval
   131  		}, t, stopCh, m.probe)
   132  		m.tickers = append(m.tickers, t)
   133  	}
   134  
   135  	// Create a push pull ticker if needed
   136  	if m.config.PushPullInterval > 0 {
   137  		go m.pushPullTrigger(stopCh)
   138  	}
   139  
   140  	// Create a gossip ticker if needed
   141  	if m.config.GossipInterval > 0 && m.config.GossipNodes > 0 {
   142  		t := time.NewTicker(m.config.GossipInterval)
   143  		go m.triggerFuncDynamic(func() time.Duration {
   144  			return m.config.GossipInterval
   145  		}, t, stopCh, m.gossip)
   146  		m.tickers = append(m.tickers, t)
   147  	}
   148  
   149  	// Create node weight ticker if needed
   150  	if m.config.WeightInterval > 0 {
   151  		t := time.NewTicker(m.config.WeightInterval)
   152  		go m.triggerFunc(m.config.WeightInterval, t.C, stopCh, m.weight)
   153  		m.tickers = append(m.tickers, t)
   154  	}
   155  
   156  	// If we made any tickers, then record the stopTick channel for
   157  	// later.
   158  	if len(m.tickers) > 0 {
   159  		m.stopTick = stopCh
   160  	}
   161  }
   162  
   163  // triggerFunc is used to trigger a function call each time a
   164  // message is received until a stop tick arrives.
   165  func (m *Memberlist) triggerFunc(stagger time.Duration, C <-chan time.Time, stop <-chan struct{}, f func()) {
   166  	// Use a random stagger to avoid syncronizing
   167  	randStagger := time.Duration(uint64(rand.Int63()) % uint64(stagger))
   168  	select {
   169  	case <-time.After(randStagger):
   170  	case <-stop:
   171  		return
   172  	}
   173  	for {
   174  		select {
   175  		case <-C:
   176  			f()
   177  		case <-stop:
   178  			return
   179  		}
   180  	}
   181  }
   182  
   183  func (m *Memberlist) triggerFuncDynamic(getter func() time.Duration, t *time.Ticker, stop <-chan struct{}, f func()) {
   184  	stagger := getter()
   185  	randStagger := time.Duration(uint64(rand.Int63()) % uint64(stagger))
   186  	select {
   187  	case <-time.After(randStagger):
   188  	case <-stop:
   189  		return
   190  	}
   191  	for {
   192  		select {
   193  		case <-t.C:
   194  			t.Reset(getter())
   195  			f()
   196  		case <-stop:
   197  			return
   198  		}
   199  	}
   200  }
   201  
   202  // pushPullTrigger is used to periodically trigger a push/pull until
   203  // a stop tick arrives. We don't use triggerFunc since the push/pull
   204  // timer is dynamically scaled based on cluster size to avoid network
   205  // saturation
   206  func (m *Memberlist) pushPullTrigger(stop <-chan struct{}) {
   207  	interval := m.config.PushPullInterval
   208  
   209  	// Use a random stagger to avoid syncronizing
   210  	randStagger := time.Duration(uint64(rand.Int63()) % uint64(interval))
   211  	select {
   212  	case <-time.After(randStagger):
   213  	case <-stop:
   214  		return
   215  	}
   216  
   217  	// Tick using a dynamic timer
   218  	for {
   219  		tickTime := pushPullScale(m.config.PushPullInterval, m.estNumNodes())
   220  		select {
   221  		case <-time.After(tickTime):
   222  			m.pushPull()
   223  		case <-stop:
   224  			return
   225  		}
   226  	}
   227  }
   228  
   229  // Deschedule is used to stop the background maintenance. This is safe
   230  // to call multiple times.
   231  func (m *Memberlist) deschedule() {
   232  	m.tickerLock.Lock()
   233  	defer m.tickerLock.Unlock()
   234  
   235  	// If we have no tickers, then we aren't scheduled.
   236  	if len(m.tickers) == 0 {
   237  		return
   238  	}
   239  
   240  	// Close the stop channel so all the ticker listeners stop.
   241  	close(m.stopTick)
   242  
   243  	// Explicitly stop all the tickers themselves so they don't take
   244  	// up any more resources, and get rid of the list.
   245  	for _, t := range m.tickers {
   246  		t.Stop()
   247  	}
   248  	m.tickers = nil
   249  }
   250  
   251  // Tick is used to perform a single round of failure detection and gossip
   252  func (m *Memberlist) probe() {
   253  	// Track the number of indexes we've considered probing
   254  	numCheck := 0
   255  START:
   256  	m.nodeLock.RLock()
   257  
   258  	// Make sure we don't wrap around infinitely
   259  	if numCheck >= len(m.nodes) {
   260  		m.nodeLock.RUnlock()
   261  		return
   262  	}
   263  
   264  	// Handle the wrap around case
   265  	if m.probeIndex >= len(m.nodes) {
   266  		m.nodeLock.RUnlock()
   267  		m.resetNodes()
   268  		m.probeIndex = 0
   269  		numCheck++
   270  		goto START
   271  	}
   272  
   273  	// Determine if we should probe this node
   274  	skip := false
   275  	var node nodeState
   276  
   277  	node = *m.nodes[m.probeIndex]
   278  	if node.Name == m.config.Name {
   279  		skip = true
   280  	} else if node.DeadOrLeft() {
   281  		skip = true
   282  	}
   283  
   284  	// Potentially skip
   285  	m.nodeLock.RUnlock()
   286  	m.probeIndex++
   287  	if skip {
   288  		numCheck++
   289  		goto START
   290  	}
   291  
   292  	// Probe the specific node
   293  	m.probeNode(&node)
   294  }
   295  
   296  // probeNodeByAddr just safely calls probeNode given only the address of the node (for tests)
   297  func (m *Memberlist) probeNodeByAddr(addr string) {
   298  	m.nodeLock.RLock()
   299  	n := m.nodeMap[addr]
   300  	m.nodeLock.RUnlock()
   301  
   302  	m.probeNode(n)
   303  }
   304  
   305  // failedRemote checks the error and decides if it indicates a failure on the
   306  // other end.
   307  func failedRemote(err error) bool {
   308  	switch t := err.(type) {
   309  	case *net.OpError:
   310  		if strings.HasPrefix(t.Net, "tcp") {
   311  			switch t.Op {
   312  			case "dial", "read", "write":
   313  				return true
   314  			}
   315  		}
   316  	}
   317  	return false
   318  }
   319  
// probeNode handles a single round of failure checking on a node.
//
// The round proceeds in stages:
//  1. A ping is sent directly to the node (bundled with a suspect message
//     when the node is not currently alive).
//  2. If no completed ack arrives within ProbeTimeout, or the send failed
//     in a way failedRemote classifies as a remote failure, indirect pings
//     are sent through up to IndirectChecks random alive peers and, unless
//     disabled, a fallback ping is attempted via sendPingAndWaitForAck.
//  3. If neither an ack nor the fallback confirms the node, it is marked
//     as suspect.
//
// Throughout, awarenessDelta accumulates the adjustment that the deferred
// call applies to our own health awareness when the function returns.
func (m *Memberlist) probeNode(node *nodeState) {
	defer metrics.MeasureSince([]string{"memberlist", "probeNode"}, time.Now())

	// We use our health awareness to scale the overall probe interval, so we
	// slow down if we detect problems. The ticker that calls us can handle
	// us running over the base interval, and will skip missed ticks.
	probeInterval := m.awareness.ScaleTimeout(m.config.ProbeInterval)
	if probeInterval > m.config.ProbeInterval {
		metrics.IncrCounter([]string{"memberlist", "degraded", "probe"}, 1)
	}

	// Prepare a ping message and setup an ack handler.
	selfAddr, selfPort := m.getAdvertise()
	ping := ping{
		SeqNo:      m.nextSeqNo(),
		Node:       node.Name,
		SourceAddr: selfAddr,
		SourcePort: selfPort,
		SourceNode: m.config.Name,
	}
	// Both channels are buffered with room for the direct probe plus every
	// indirect check.
	ackCh := make(chan ackMessage, m.config.IndirectChecks+1)
	nackCh := make(chan struct{}, m.config.IndirectChecks+1)
	m.setProbeChannels(ping.SeqNo, ackCh, nackCh, probeInterval)

	// Mark the sent time here, which should be after any pre-processing but
	// before system calls to do the actual send. This probably over-reports
	// a bit, but it's the best we can do. We had originally put this right
	// after the I/O, but that would sometimes give negative RTT measurements
	// which was not desirable.
	sent := time.Now()

	// Send a ping to the node. If this node looks like it's suspect or dead,
	// also tack on a suspect message so that it has a chance to refute as
	// soon as possible.
	deadline := sent.Add(probeInterval)
	addr := node.Address()

	// Arrange for our self-awareness to get updated. The deferred closure
	// reads awarenessDelta at return time, so later assignments take effect.
	var awarenessDelta int
	defer func() {
		m.awareness.ApplyDelta(awarenessDelta)
	}()
	if node.State == StateAlive {
		if err := m.encodeAndSendMsg(node.FullAddress(), pingMsg, &ping); err != nil {
			m.logger.Printf("[ERR] memberlist: Failed to send ping: %s", err)
			if failedRemote(err) {
				goto HANDLE_REMOTE_FAILURE
			} else {
				if _, ok := err.(*net.DNSError); ok {
					// The send failed on a DNS error, which failedRemote does
					// not count as a remote failure. Penalize our own health
					// by one and mark the node as suspect right away, without
					// going through the indirect/fallback checks.
					awarenessDelta = 1
					s := suspect{Incarnation: node.Incarnation, Node: node.Name, From: m.config.Name}
					m.suspectNode(&s)
				}
				return
			}
		}
	} else {
		// The node is not alive: send the ping and a suspect message
		// together as one compound packet.
		var msgs [][]byte
		if buf, err := encode(pingMsg, &ping); err != nil {
			m.logger.Printf("[ERR] memberlist: Failed to encode ping message: %s", err)
			return
		} else {
			msgs = append(msgs, buf.Bytes())
		}
		s := suspect{Incarnation: node.Incarnation, Node: node.Name, From: m.config.Name}
		if buf, err := encode(suspectMsg, &s); err != nil {
			m.logger.Printf("[ERR] memberlist: Failed to encode suspect message: %s", err)
			return
		} else {
			msgs = append(msgs, buf.Bytes())
		}

		compound := makeCompoundMessage(msgs)
		if err := m.rawSendMsgPacket(node.FullAddress(), &node.Node, compound.Bytes()); err != nil {
			m.logger.Printf("[ERR] memberlist: Failed to send compound ping and suspect message to %s: %s", addr, err)
			if failedRemote(err) {
				goto HANDLE_REMOTE_FAILURE
			} else {
				return
			}
		}
	}

	// Arrange for our self-awareness to get updated. At this point we've
	// sent the ping, so any return statement means the probe succeeded
	// which will improve our health until we get to the failure scenarios
	// at the end of this function, which will alter this delta variable
	// accordingly.
	awarenessDelta = -1

	// Wait for response or round-trip-time.
	select {
	case v := <-ackCh:
		if v.Complete == true {
			rtt := v.Timestamp.Sub(sent)
			m.logger.Printf("[DEBUG] memberlist: ping remote node %s success in %s", node.Node.Name, rtt.String())
			if m.config.Ping != nil {
				m.config.Ping.NotifyPingComplete(&node.Node, rtt, v.Payload)
			}
			return
		}

		// As an edge case, if we get a timeout, we need to re-enqueue it
		// here to break out of the select below.
		if v.Complete == false {
			ackCh <- v
		}
	case <-time.After(m.config.ProbeTimeout):
		// Note that we don't scale this timeout based on awareness and
		// the health score. That's because we don't really expect waiting
		// longer to help get UDP through. Since health does extend the
		// probe interval it will give the TCP fallback more time, which
		// is more active in dealing with lost packets, and it gives more
		// time to wait for indirect acks/nacks.
		m.logger.Printf("[DEBUG] memberlist: Failed ping: %s (timeout reached)", node.Name)
	}

	// Reached either by falling through from the select above or via goto
	// after a send error that failedRemote classified as a remote failure
	// (in which case awarenessDelta is still 0).
HANDLE_REMOTE_FAILURE:
	// Get some random live nodes.
	m.nodeLock.RLock()
	kNodes := kRandomNodes(m.config.IndirectChecks, m.nodes, func(n *nodeState) bool {
		return n.Name == m.config.Name ||
			n.Name == node.Name ||
			n.State != StateAlive
	})
	m.nodeLock.RUnlock()

	// Attempt an indirect ping.
	expectedNacks := 0
	selfAddr, selfPort = m.getAdvertise()
	ind := indirectPingReq{
		SeqNo:      ping.SeqNo,
		Target:     node.Addr,
		Port:       node.Port,
		Node:       node.Name,
		SourceAddr: selfAddr,
		SourcePort: selfPort,
		SourceNode: m.config.Name,
	}
	for _, peer := range kNodes {
		// We only expect nack to be sent from peers who understand
		// version 4 of the protocol.
		if ind.Nack = peer.PMax >= 4; ind.Nack {
			expectedNacks++
		}

		if err := m.encodeAndSendMsg(peer.FullAddress(), indirectPingMsg, &ind); err != nil {
			m.logger.Printf("[ERR] memberlist: Failed to send indirect ping: %s", err)
		}
	}

	// Also make an attempt to contact the node directly over TCP. This
	// helps prevent confused clients who get isolated from UDP traffic
	// but can still speak TCP (which also means they can possibly report
	// misinformation to other nodes via anti-entropy), avoiding flapping in
	// the cluster.
	//
	// This is a little unusual because we will attempt a TCP ping to any
	// member who understands version 3 of the protocol, regardless of
	// which protocol version we are speaking. That's why we've included a
	// config option to turn this off if desired.
	//
	// fallbackCh is always closed, either by the goroutine below or right
	// away when the fallback is skipped, so ranging over it later terminates.
	fallbackCh := make(chan bool, 1)

	disableTcpPings := m.config.DisableTcpPings ||
		(m.config.DisableTcpPingsForNode != nil && m.config.DisableTcpPingsForNode(node.Name))
	if (!disableTcpPings) && (node.PMax >= 3) {
		go func() {
			defer close(fallbackCh)
			didContact, err := m.sendPingAndWaitForAck(node.FullAddress(), ping, deadline)
			if err != nil {
				m.logger.Printf("[ERR] memberlist: Failed fallback ping: %s", err)
			} else {
				fallbackCh <- didContact
			}
		}()
	} else {
		close(fallbackCh)
	}

	// Wait for the acks or timeout. Note that we don't check the fallback
	// channel here because we want to issue a warning below if that's the
	// *only* way we hear back from the peer, so we have to let this time
	// out first to allow the normal UDP-based acks to come in. The timeout
	// itself arrives on ackCh as a message with Complete set to false.
	select {
	case v := <-ackCh:
		if v.Complete == true {
			return
		}
	}

	// Finally, poll the fallback channel. The timeouts are set such that
	// the channel will have something or be closed without having to wait
	// any additional time here.
	for didContact := range fallbackCh {
		if didContact {
			m.logger.Printf("[WARN] memberlist: Was able to connect to %s but other probes failed, network may be misconfigured", node.Name)
			return
		}
	}
	// Update our self-awareness based on the results of this failed probe.
	// If we don't have peers who will send nacks then we penalize for any
	// failed probe as a simple health metric. If we do have peers to nack
	// verify, then we can use that as a more sophisticated measure of self-
	// health because we assume them to be working, and they can help us
	// decide if the probed node was really dead or if it was something wrong
	// with ourselves.
	awarenessDelta = 0
	if expectedNacks > 0 {
		// Penalize by the number of expected nacks that never arrived.
		if nackCount := len(nackCh); nackCount < expectedNacks {
			awarenessDelta += (expectedNacks - nackCount)
		}
	} else {
		awarenessDelta += 1
	}

	// No acks received from target, suspect it as failed.
	m.logger.Printf("[DEBUG] memberlist: Suspect %s has failed, no acks received", node.Name)
	s := suspect{Incarnation: node.Incarnation, Node: node.Name, From: m.config.Name}
	m.suspectNode(&s)
}
   548  
   549  // Ping initiates a ping to the node with the specified name.
   550  func (m *Memberlist) Ping(node string, addr net.Addr) (time.Duration, error) {
   551  	// Prepare a ping message and setup an ack handler.
   552  	selfAddr, selfPort := m.getAdvertise()
   553  	ping := ping{
   554  		SeqNo:      m.nextSeqNo(),
   555  		Node:       node,
   556  		SourceAddr: selfAddr,
   557  		SourcePort: selfPort,
   558  		SourceNode: m.config.Name,
   559  	}
   560  	ackCh := make(chan ackMessage, m.config.IndirectChecks+1)
   561  	m.setProbeChannels(ping.SeqNo, ackCh, nil, m.config.ProbeInterval)
   562  
   563  	a := Address{Addr: addr.String(), Name: node}
   564  
   565  	// Send a ping to the node.
   566  	if err := m.encodeAndSendMsg(a, pingMsg, &ping); err != nil {
   567  		return 0, err
   568  	}
   569  
   570  	// Mark the sent time here, which should be after any pre-processing and
   571  	// system calls to do the actual send. This probably under-reports a bit,
   572  	// but it's the best we can do.
   573  	sent := time.Now()
   574  
   575  	// Wait for response or timeout.
   576  	select {
   577  	case v := <-ackCh:
   578  		if v.Complete == true {
   579  			return v.Timestamp.Sub(sent), nil
   580  		}
   581  	case <-time.After(m.config.ProbeTimeout):
   582  		// Timeout, return an error below.
   583  	}
   584  
   585  	m.logger.Printf("[DEBUG] memberlist: Failed UDP ping: %v (timeout reached)", node)
   586  	return 0, NoPingResponseError{ping.Node}
   587  }
   588  
   589  // resetNodes is used when the tick wraps around. It will reap the
   590  // dead nodes and shuffle the node list.
   591  func (m *Memberlist) resetNodes() {
   592  	m.nodeLock.Lock()
   593  	defer m.nodeLock.Unlock()
   594  
   595  	// Move dead nodes, but respect gossip to the dead interval
   596  	deadIdx := moveDeadNodes(m.nodes, m.config.GossipToTheDeadTime)
   597  
   598  	// Deregister the dead nodes
   599  	for i := deadIdx; i < len(m.nodes); i++ {
   600  		delete(m.nodeMap, m.nodes[i].Name)
   601  		m.nodes[i] = nil
   602  	}
   603  
   604  	// Trim the nodes to exclude the dead nodes
   605  	m.nodes = m.nodes[0:deadIdx]
   606  
   607  	// Update numNodes after we've trimmed the dead nodes
   608  	atomic.StoreUint32(&m.numNodes, uint32(deadIdx))
   609  
   610  	// Shuffle live nodes
   611  	shuffleNodes(m.nodes)
   612  }
   613  
   614  // gossip is invoked every GossipInterval period to broadcast our gossip
   615  // messages to a few random nodes.
   616  func (m *Memberlist) gossip() {
   617  	defer metrics.MeasureSince([]string{"memberlist", "gossip"}, time.Now())
   618  
   619  	// Get some random live, suspect, or recently dead nodes
   620  	m.nodeLock.RLock()
   621  	kNodes := kRandomNodes(m.config.GossipNodes, m.nodes, func(n *nodeState) bool {
   622  		if n.Name == m.config.Name {
   623  			return true
   624  		}
   625  
   626  		switch n.State {
   627  		case StateAlive, StateSuspect:
   628  			return false
   629  
   630  		case StateDead:
   631  			return time.Since(n.StateChange) > m.config.GossipToTheDeadTime
   632  
   633  		default:
   634  			return true
   635  		}
   636  	})
   637  	m.nodeLock.RUnlock()
   638  
   639  	// Compute the bytes available
   640  	bytesAvail := m.config.UDPBufferSize - compoundHeaderOverhead
   641  	if m.config.EncryptionEnabled() {
   642  		bytesAvail -= encryptOverhead(m.encryptionVersion())
   643  	}
   644  
   645  	for _, node := range kNodes {
   646  		// Get any pending broadcasts
   647  		msgs := m.getBroadcasts(compoundOverhead, bytesAvail)
   648  		if len(msgs) == 0 {
   649  			return
   650  		}
   651  
   652  		addr := node.Address()
   653  		if len(msgs) == 1 {
   654  			// Send single message as is
   655  			if err := m.rawSendMsgPacket(node.FullAddress(), &node, msgs[0]); err != nil {
   656  				m.logger.Printf("[ERR] memberlist: Failed to send gossip to %s: %s", addr, err)
   657  			}
   658  		} else {
   659  			// Otherwise create and send a compound message
   660  			compound := makeCompoundMessage(msgs)
   661  			if err := m.rawSendMsgPacket(node.FullAddress(), &node, compound.Bytes()); err != nil {
   662  				m.logger.Printf("[ERR] memberlist: Failed to send gossip to %s: %s", addr, err)
   663  			}
   664  		}
   665  	}
   666  }
   667  
   668  // weight is invoked every WeightInterval period to calculate local node weight and
   669  // enqueue a message carrying the result
   670  func (m *Memberlist) weight() {
   671  	defer metrics.MeasureSince([]string{"memberlist", "weight"}, time.Now())
   672  
   673  	// Weight = (AwarenessMaxMultiplier - AwarenessScore) * 0.5 + AwarenessMaxMultiplier * CPUIdlePercent * 0.5
   674  	percent, err := cpu.Percent(0, false)
   675  	if err != nil {
   676  		m.logger.Printf("[ERR] memberlist: Failed to get cpu busy percent: %s", err)
   677  		return
   678  	}
   679  	cpuIdlePercent := 100 - percent[0]
   680  	result := int(math.Round(float64(m.config.AwarenessMaxMultiplier-m.awareness.GetHealthScore())*0.6 +
   681  		float64(m.config.AwarenessMaxMultiplier)*cpuIdlePercent/100*0.4))
   682  
   683  	w := weight{Incarnation: m.incarnation, Node: m.config.Name, From: m.config.Name, Weight: result, WeightAt: time.Now().UTC().UnixNano() / 1000000}
   684  	m.encodeWeightMsgAndBroadcast(m.config.Name, w)
   685  	m.logger.Printf("[DEBUG] memberlist: enqueued latest weight of local node %s: %d", m.config.Name, result)
   686  }
   687  
   688  // pushPull is invoked periodically to randomly perform a complete state
   689  // exchange. Used to ensure a high level of convergence, but is also
   690  // reasonably expensive as the entire state of this node is exchanged
   691  // with the other node.
   692  func (m *Memberlist) pushPull() {
   693  	// Get a random live node
   694  	m.nodeLock.RLock()
   695  	nodes := kRandomNodes(1, m.nodes, func(n *nodeState) bool {
   696  		return n.Name == m.config.Name ||
   697  			n.State != StateAlive
   698  	})
   699  	m.nodeLock.RUnlock()
   700  
   701  	// If no nodes, bail
   702  	if len(nodes) == 0 {
   703  		return
   704  	}
   705  	node := nodes[0]
   706  
   707  	// Attempt a push pull
   708  	if err := m.pushPullNode(node.FullAddress(), false); err != nil {
   709  		m.logger.Printf("[ERR] memberlist: Push/Pull with %s failed: %s", node.Name, err)
   710  	}
   711  }
   712  
   713  // pushPullNode does a complete state exchange with a specific node.
   714  func (m *Memberlist) pushPullNode(a Address, join bool) error {
   715  	defer metrics.MeasureSince([]string{"memberlist", "pushPullNode"}, time.Now())
   716  
   717  	// Attempt to send and receive with the node
   718  	remote, userState, err := m.sendAndReceiveState(a, join)
   719  	if err != nil {
   720  		return err
   721  	}
   722  
   723  	if err := m.mergeRemoteState(join, remote, userState); err != nil {
   724  		return err
   725  	}
   726  	return nil
   727  }
   728  
   729  // verifyProtocol verifies that all the remote nodes can speak with our
   730  // nodes and vice versa on both the core protocol as well as the
   731  // delegate protocol level.
   732  //
   733  // The verification works by finding the maximum minimum and
   734  // minimum maximum understood protocol and delegate versions. In other words,
   735  // it finds the common denominator of protocol and delegate version ranges
   736  // for the entire cluster.
   737  //
   738  // After this, it goes through the entire cluster (local and remote) and
   739  // verifies that everyone's speaking protocol versions satisfy this range.
   740  // If this passes, it means that every node can understand each other.
   741  func (m *Memberlist) verifyProtocol(remote []pushNodeState) error {
   742  	m.nodeLock.RLock()
   743  	defer m.nodeLock.RUnlock()
   744  
   745  	// Maximum minimum understood and minimum maximum understood for both
   746  	// the protocol and delegate versions. We use this to verify everyone
   747  	// can be understood.
   748  	var maxpmin, minpmax uint8
   749  	var maxdmin, mindmax uint8
   750  	minpmax = math.MaxUint8
   751  	mindmax = math.MaxUint8
   752  
   753  	for _, rn := range remote {
   754  		// If the node isn't alive, then skip it
   755  		if rn.State != StateAlive {
   756  			continue
   757  		}
   758  
   759  		// Skip nodes that don't have versions set, it just means
   760  		// their version is zero.
   761  		if len(rn.Vsn) == 0 {
   762  			continue
   763  		}
   764  
   765  		if rn.Vsn[0] > maxpmin {
   766  			maxpmin = rn.Vsn[0]
   767  		}
   768  
   769  		if rn.Vsn[1] < minpmax {
   770  			minpmax = rn.Vsn[1]
   771  		}
   772  
   773  		if rn.Vsn[3] > maxdmin {
   774  			maxdmin = rn.Vsn[3]
   775  		}
   776  
   777  		if rn.Vsn[4] < mindmax {
   778  			mindmax = rn.Vsn[4]
   779  		}
   780  	}
   781  
   782  	for _, n := range m.nodes {
   783  		// Ignore non-alive nodes
   784  		if n.State != StateAlive {
   785  			continue
   786  		}
   787  
   788  		if n.PMin > maxpmin {
   789  			maxpmin = n.PMin
   790  		}
   791  
   792  		if n.PMax < minpmax {
   793  			minpmax = n.PMax
   794  		}
   795  
   796  		if n.DMin > maxdmin {
   797  			maxdmin = n.DMin
   798  		}
   799  
   800  		if n.DMax < mindmax {
   801  			mindmax = n.DMax
   802  		}
   803  	}
   804  
   805  	// Now that we definitively know the minimum and maximum understood
   806  	// version that satisfies the whole cluster, we verify that every
   807  	// node in the cluster satisifies this.
   808  	for _, n := range remote {
   809  		var nPCur, nDCur uint8
   810  		if len(n.Vsn) > 0 {
   811  			nPCur = n.Vsn[2]
   812  			nDCur = n.Vsn[5]
   813  		}
   814  
   815  		if nPCur < maxpmin || nPCur > minpmax {
   816  			return fmt.Errorf(
   817  				"Node '%s' protocol version (%d) is incompatible: [%d, %d]",
   818  				n.Name, nPCur, maxpmin, minpmax)
   819  		}
   820  
   821  		if nDCur < maxdmin || nDCur > mindmax {
   822  			return fmt.Errorf(
   823  				"Node '%s' delegate protocol version (%d) is incompatible: [%d, %d]",
   824  				n.Name, nDCur, maxdmin, mindmax)
   825  		}
   826  	}
   827  
   828  	for _, n := range m.nodes {
   829  		nPCur := n.PCur
   830  		nDCur := n.DCur
   831  
   832  		if nPCur < maxpmin || nPCur > minpmax {
   833  			return fmt.Errorf(
   834  				"Node '%s' protocol version (%d) is incompatible: [%d, %d]",
   835  				n.Name, nPCur, maxpmin, minpmax)
   836  		}
   837  
   838  		if nDCur < maxdmin || nDCur > mindmax {
   839  			return fmt.Errorf(
   840  				"Node '%s' delegate protocol version (%d) is incompatible: [%d, %d]",
   841  				n.Name, nDCur, maxdmin, mindmax)
   842  		}
   843  	}
   844  
   845  	return nil
   846  }
   847  
   848  // nextSeqNo returns a usable sequence number in a thread safe way
   849  func (m *Memberlist) nextSeqNo() uint32 {
   850  	return atomic.AddUint32(&m.sequenceNum, 1)
   851  }
   852  
   853  // nextIncarnation returns the next incarnation number in a thread safe way
   854  func (m *Memberlist) nextIncarnation() uint32 {
   855  	return atomic.AddUint32(&m.incarnation, 1)
   856  }
   857  
   858  // skipIncarnation adds the positive offset to the incarnation number.
   859  func (m *Memberlist) skipIncarnation(offset uint32) uint32 {
   860  	return atomic.AddUint32(&m.incarnation, offset)
   861  }
   862  
   863  // estNumNodes is used to get the current estimate of the number of nodes
   864  func (m *Memberlist) estNumNodes() int {
   865  	return int(atomic.LoadUint32(&m.numNodes))
   866  }
   867  
   868  type ackMessage struct {
   869  	Complete  bool
   870  	Payload   []byte
   871  	Timestamp time.Time
   872  }
   873  
   874  // setProbeChannels is used to attach the ackCh to receive a message when an ack
   875  // with a given sequence number is received. The `complete` field of the message
   876  // will be false on timeout. Any nack messages will cause an empty struct to be
   877  // passed to the nackCh, which can be nil if not needed.
   878  func (m *Memberlist) setProbeChannels(seqNo uint32, ackCh chan ackMessage, nackCh chan struct{}, timeout time.Duration) {
   879  	// Create handler functions for acks and nacks
   880  	ackFn := func(payload []byte, timestamp time.Time) {
   881  		select {
   882  		case ackCh <- ackMessage{true, payload, timestamp}:
   883  		default:
   884  		}
   885  	}
   886  	nackFn := func() {
   887  		select {
   888  		case nackCh <- struct{}{}:
   889  		default:
   890  		}
   891  	}
   892  
   893  	// Add the handlers
   894  	ah := &ackHandler{ackFn, nackFn, nil}
   895  	m.ackLock.Lock()
   896  	m.ackHandlers[seqNo] = ah
   897  	m.ackLock.Unlock()
   898  
   899  	// Setup a reaping routing
   900  	ah.timer = time.AfterFunc(timeout, func() {
   901  		m.ackLock.Lock()
   902  		delete(m.ackHandlers, seqNo)
   903  		m.ackLock.Unlock()
   904  		select {
   905  		case ackCh <- ackMessage{false, nil, time.Now()}:
   906  		default:
   907  		}
   908  	})
   909  }
   910  
   911  // setAckHandler is used to attach a handler to be invoked when an ack with a
   912  // given sequence number is received. If a timeout is reached, the handler is
   913  // deleted. This is used for indirect pings so does not configure a function
   914  // for nacks.
   915  func (m *Memberlist) setAckHandler(seqNo uint32, ackFn func([]byte, time.Time), timeout time.Duration) {
   916  	// Add the handler
   917  	ah := &ackHandler{ackFn, nil, nil}
   918  	m.ackLock.Lock()
   919  	m.ackHandlers[seqNo] = ah
   920  	m.ackLock.Unlock()
   921  
   922  	// Setup a reaping routing
   923  	ah.timer = time.AfterFunc(timeout, func() {
   924  		m.ackLock.Lock()
   925  		delete(m.ackHandlers, seqNo)
   926  		m.ackLock.Unlock()
   927  	})
   928  }
   929  
   930  // Invokes an ack handler if any is associated, and reaps the handler immediately
   931  func (m *Memberlist) invokeAckHandler(ack ackResp, timestamp time.Time) {
   932  	m.ackLock.Lock()
   933  	ah, ok := m.ackHandlers[ack.SeqNo]
   934  	delete(m.ackHandlers, ack.SeqNo)
   935  	m.ackLock.Unlock()
   936  	if !ok {
   937  		return
   938  	}
   939  	ah.timer.Stop()
   940  	ah.ackFn(ack.Payload, timestamp)
   941  }
   942  
   943  // Invokes nack handler if any is associated.
   944  func (m *Memberlist) invokeNackHandler(nack nackResp) {
   945  	m.ackLock.Lock()
   946  	ah, ok := m.ackHandlers[nack.SeqNo]
   947  	m.ackLock.Unlock()
   948  	if !ok || ah.nackFn == nil {
   949  		return
   950  	}
   951  	ah.nackFn()
   952  }
   953  
   954  // refute gossips an alive message in response to incoming information that we
   955  // are suspect or dead. It will make sure the incarnation number beats the given
   956  // accusedInc value, or you can supply 0 to just get the next incarnation number.
   957  // This alters the node state that's passed in so this MUST be called while the
   958  // nodeLock is held.
   959  func (m *Memberlist) refute(me *nodeState, accusedInc uint32) {
   960  	// Make sure the incarnation number beats the accusation.
   961  	inc := m.nextIncarnation()
   962  	if accusedInc >= inc {
   963  		inc = m.skipIncarnation(accusedInc - inc + 1)
   964  	}
   965  	me.Incarnation = inc
   966  
   967  	// Decrease our health because we are being asked to refute a problem.
   968  	m.awareness.ApplyDelta(1)
   969  
   970  	// Format and broadcast an alive message.
   971  	a := alive{
   972  		Incarnation: inc,
   973  		Node:        me.Name,
   974  		Addr:        me.Addr,
   975  		Port:        me.Port,
   976  		Meta:        me.Meta,
   977  		Vsn: []uint8{
   978  			me.PMin, me.PMax, me.PCur,
   979  			me.DMin, me.DMax, me.DCur,
   980  		},
   981  	}
   982  	m.encodeAndBroadcast(me.Addr, aliveMsg, a)
   983  }
   984  
   985  // aliveNode is invoked by the network layer when we get a message about a
   986  // live node.
   987  func (m *Memberlist) aliveNode(a *alive, notify chan struct{}, bootstrap bool) {
   988  	m.nodeLock.Lock()
   989  	defer m.nodeLock.Unlock()
   990  	state, ok := m.nodeMap[a.Node]
   991  
   992  	// It is possible that during a Leave(), there is already an aliveMsg
   993  	// in-queue to be processed but blocked by the locks above. If we let
   994  	// that aliveMsg process, it'll cause us to re-join the cluster. This
   995  	// ensures that we don't.
   996  	if m.hasLeft() && a.Node == m.config.Name {
   997  		return
   998  	}
   999  
  1000  	if len(a.Vsn) >= 3 {
  1001  		pMin := a.Vsn[0]
  1002  		pMax := a.Vsn[1]
  1003  		pCur := a.Vsn[2]
  1004  		if pMin == 0 || pMax == 0 || pMin > pMax {
  1005  			m.logger.Printf("[WARN] memberlist: Ignoring an alive message for '%s' (%v:%d) because protocol version(s) are wrong: %d <= %d <= %d should be >0", a.Node, a.Addr, a.Port, pMin, pCur, pMax)
  1006  			return
  1007  		}
  1008  	}
  1009  
  1010  	// Invoke the Alive delegate if any. This can be used to filter out
  1011  	// alive messages based on custom logic. For example, using a cluster name.
  1012  	// Using a merge delegate is not enough, as it is possible for passive
  1013  	// cluster merging to still occur.
  1014  	if m.config.Alive != nil {
  1015  		if len(a.Vsn) < 6 {
  1016  			m.logger.Printf("[WARN] memberlist: ignoring alive message for '%s' (%v:%d) because Vsn is not present",
  1017  				a.Node, a.Addr, a.Port)
  1018  			return
  1019  		}
  1020  		node := &Node{
  1021  			Name: a.Node,
  1022  			Addr: a.Addr,
  1023  			Port: a.Port,
  1024  			Meta: a.Meta,
  1025  			PMin: a.Vsn[0],
  1026  			PMax: a.Vsn[1],
  1027  			PCur: a.Vsn[2],
  1028  			DMin: a.Vsn[3],
  1029  			DMax: a.Vsn[4],
  1030  			DCur: a.Vsn[5],
  1031  		}
  1032  		if err := m.config.Alive.NotifyAlive(node); err != nil {
  1033  			m.logger.Printf("[WARN] memberlist: ignoring alive message for '%s': %s",
  1034  				a.Node, err)
  1035  			return
  1036  		}
  1037  	}
  1038  
  1039  	// Check if we've never seen this node before, and if not, then
  1040  	// store this node in our node map.
  1041  	var updatesNode bool
  1042  	if !ok {
  1043  		errCon := m.config.AddrAllowed(a.Addr)
  1044  		if errCon != nil {
  1045  			m.logger.Printf("[WARN] memberlist: Rejected node %s (%v): %s", a.Node, a.Addr, errCon)
  1046  			return
  1047  		}
  1048  		state = &nodeState{
  1049  			Node: Node{
  1050  				Name: a.Node,
  1051  				Addr: a.Addr,
  1052  				Port: a.Port,
  1053  				Meta: a.Meta,
  1054  			},
  1055  			State: StateDead,
  1056  		}
  1057  		if len(a.Vsn) > 5 {
  1058  			state.PMin = a.Vsn[0]
  1059  			state.PMax = a.Vsn[1]
  1060  			state.PCur = a.Vsn[2]
  1061  			state.DMin = a.Vsn[3]
  1062  			state.DMax = a.Vsn[4]
  1063  			state.DCur = a.Vsn[5]
  1064  		}
  1065  
  1066  		// Add to map
  1067  		m.nodeMap[a.Node] = state
  1068  
  1069  		// Get a random offset. This is important to ensure
  1070  		// the failure detection bound is low on average. If all
  1071  		// nodes did an append, failure detection bound would be
  1072  		// very high.
  1073  		n := len(m.nodes)
  1074  		offset := randomOffset(n)
  1075  
  1076  		// Add at the end and swap with the node at the offset
  1077  		m.nodes = append(m.nodes, state)
  1078  		m.nodes[offset], m.nodes[n] = m.nodes[n], m.nodes[offset]
  1079  
  1080  		// Update numNodes after we've added a new node
  1081  		atomic.AddUint32(&m.numNodes, 1)
  1082  	} else {
  1083  		// Check if this address is different from the existing node unless the old node is dead.
  1084  		if state.Addr != a.Addr || state.Port != a.Port {
  1085  			errCon := m.config.AddrAllowed(a.Addr)
  1086  			if errCon != nil {
  1087  				m.logger.Printf("[WARN] memberlist: Rejected IP update from %v to %v for node %s: %s", a.Node, state.Addr, a.Addr, errCon)
  1088  				return
  1089  			}
  1090  			// If DeadNodeReclaimTime is configured, check if enough time has elapsed since the node died.
  1091  			canReclaim := (m.config.DeadNodeReclaimTime > 0 &&
  1092  				time.Since(state.StateChange) > m.config.DeadNodeReclaimTime)
  1093  
  1094  			// Allow the address to be updated if a dead node is being replaced.
  1095  			if state.State == StateLeft || (state.State == StateDead && canReclaim) {
  1096  				m.logger.Printf("[INFO] memberlist: Updating address for left or failed node %s from %v:%d to %v:%d",
  1097  					state.Name, state.Addr, state.Port, a.Addr, a.Port)
  1098  				updatesNode = true
  1099  			} else {
  1100  				m.logger.Printf("[ERR] memberlist: Conflicting address for %s. Mine: %v:%d Theirs: %v:%d Old state: %v",
  1101  					state.Name, state.Addr, state.Port, a.Addr, a.Port, state.State)
  1102  
  1103  				// Inform the conflict delegate if provided
  1104  				if m.config.Conflict != nil {
  1105  					other := Node{
  1106  						Name: a.Node,
  1107  						Addr: a.Addr,
  1108  						Port: a.Port,
  1109  						Meta: a.Meta,
  1110  					}
  1111  					m.config.Conflict.NotifyConflict(&state.Node, &other)
  1112  				}
  1113  				return
  1114  			}
  1115  		}
  1116  	}
  1117  
  1118  	// Bail if the incarnation number is older, and this is not about us
  1119  	isLocalNode := state.Name == m.config.Name
  1120  	if a.Incarnation <= state.Incarnation && !isLocalNode && !updatesNode {
  1121  		return
  1122  	}
  1123  
  1124  	// Bail if strictly less and this is about us
  1125  	if a.Incarnation < state.Incarnation && isLocalNode {
  1126  		return
  1127  	}
  1128  
  1129  	// Clear out any suspicion timer that may be in effect.
  1130  	delete(m.nodeTimers, a.Node)
  1131  
  1132  	// Store the old state and meta data
  1133  	oldState := state.State
  1134  	oldMeta := state.Meta
  1135  
  1136  	// If this is us we need to refute, otherwise re-broadcast
  1137  	if !bootstrap && isLocalNode {
  1138  		// Compute the version vector
  1139  		versions := []uint8{
  1140  			state.PMin, state.PMax, state.PCur,
  1141  			state.DMin, state.DMax, state.DCur,
  1142  		}
  1143  
  1144  		// If the Incarnation is the same, we need special handling, since it
  1145  		// possible for the following situation to happen:
  1146  		// 1) Start with configuration C, join cluster
  1147  		// 2) Hard fail / Kill / Shutdown
  1148  		// 3) Restart with configuration C', join cluster
  1149  		//
  1150  		// In this case, other nodes and the local node see the same incarnation,
  1151  		// but the values may not be the same. For this reason, we always
  1152  		// need to do an equality check for this Incarnation. In most cases,
  1153  		// we just ignore, but we may need to refute.
  1154  		//
  1155  		if a.Incarnation == state.Incarnation &&
  1156  			bytes.Equal(a.Meta, state.Meta) &&
  1157  			bytes.Equal(a.Vsn, versions) {
  1158  			return
  1159  		}
  1160  		m.refute(state, a.Incarnation)
  1161  		m.logger.Printf("[WARN] memberlist: Refuting an alive message for '%s' (%v:%d) meta:(%v VS %v), vsn:(%v VS %v)", a.Node, a.Addr, a.Port, a.Meta, state.Meta, a.Vsn, versions)
  1162  	} else {
  1163  		m.encodeBroadcastNotify(a.Node, aliveMsg, a, notify)
  1164  
  1165  		// Update protocol versions if it arrived
  1166  		if len(a.Vsn) > 0 {
  1167  			state.PMin = a.Vsn[0]
  1168  			state.PMax = a.Vsn[1]
  1169  			state.PCur = a.Vsn[2]
  1170  			state.DMin = a.Vsn[3]
  1171  			state.DMax = a.Vsn[4]
  1172  			state.DCur = a.Vsn[5]
  1173  		}
  1174  
  1175  		// Update the state and incarnation number
  1176  		state.Incarnation = a.Incarnation
  1177  		state.Meta = a.Meta
  1178  		state.Addr = a.Addr
  1179  		state.Port = a.Port
  1180  		if state.State != StateAlive {
  1181  			state.State = StateAlive
  1182  			state.StateChange = time.Now()
  1183  		}
  1184  	}
  1185  
  1186  	// Update metrics
  1187  	metrics.IncrCounter([]string{"memberlist", "msg", "alive"}, 1)
  1188  
  1189  	// Notify the delegate of any relevant updates
  1190  	if m.config.Events != nil {
  1191  		if oldState == StateDead || oldState == StateLeft {
  1192  			// if Dead/Left -> Alive, notify of join
  1193  			state.Node.State = state.State
  1194  			m.config.Events.NotifyJoin(&state.Node)
  1195  		} else if oldState == StateSuspect {
  1196  			state.Node.State = state.State
  1197  			m.config.Events.NotifySuspectSateChange(&state.Node)
  1198  		} else if !bytes.Equal(oldMeta, state.Meta) {
  1199  			// if Meta changed, trigger an update notification
  1200  			m.config.Events.NotifyUpdate(&state.Node)
  1201  		}
  1202  	}
  1203  }
  1204  
  1205  // suspectNode is invoked by the network layer when we get a message
  1206  // about a suspect node
  1207  func (m *Memberlist) suspectNode(s *suspect) {
  1208  	m.nodeLock.Lock()
  1209  	defer m.nodeLock.Unlock()
  1210  	state, ok := m.nodeMap[s.Node]
  1211  
  1212  	// If we've never heard about this node before, ignore it
  1213  	if !ok {
  1214  		return
  1215  	}
  1216  
  1217  	// Ignore old incarnation numbers
  1218  	if s.Incarnation < state.Incarnation {
  1219  		return
  1220  	}
  1221  
  1222  	// See if there's a suspicion timer we can confirm. If the info is new
  1223  	// to us we will go ahead and re-gossip it. This allows for multiple
  1224  	// independent confirmations to flow even when a node probes a node
  1225  	// that's already suspect.
  1226  	if timer, ok := m.nodeTimers[s.Node]; ok {
  1227  		if timer.Confirm(s.From) {
  1228  			m.encodeAndBroadcast(s.Node, suspectMsg, s)
  1229  		}
  1230  		return
  1231  	}
  1232  
  1233  	// Ignore non-alive nodes
  1234  	if state.State != StateAlive {
  1235  		return
  1236  	}
  1237  
  1238  	// If this is us we need to refute, otherwise re-broadcast
  1239  	if state.Name == m.config.Name {
  1240  		m.refute(state, s.Incarnation)
  1241  		m.logger.Printf("[WARN] memberlist: Refuting a suspect message (from: %s)", s.From)
  1242  		return // Do not mark ourself suspect
  1243  	} else {
  1244  		m.encodeAndBroadcast(s.Node, suspectMsg, s)
  1245  	}
  1246  
  1247  	// Update metrics
  1248  	metrics.IncrCounter([]string{"memberlist", "msg", "suspect"}, 1)
  1249  
  1250  	// Update the state
  1251  	state.Incarnation = s.Incarnation
  1252  	state.State = StateSuspect
  1253  	changeTime := time.Now()
  1254  	state.StateChange = changeTime
  1255  
  1256  	// Setup a suspicion timer. Given that we don't have any known phase
  1257  	// relationship with our peers, we set up k such that we hit the nominal
  1258  	// timeout two probe intervals short of what we expect given the suspicion
  1259  	// multiplier.
  1260  	k := m.config.SuspicionMult - 2
  1261  
  1262  	// If there aren't enough nodes to give the expected confirmations, just
  1263  	// set k to 0 to say that we don't expect any. Note we subtract 2 from n
  1264  	// here to take out ourselves and the node being probed.
  1265  	n := m.estNumNodes()
  1266  	if n-2 < k {
  1267  		k = 0
  1268  	}
  1269  
  1270  	// Compute the timeouts based on the size of the cluster.
  1271  	min := suspicionTimeout(m.config.SuspicionMult, n, m.config.ProbeInterval)
  1272  	max := time.Duration(m.config.SuspicionMaxTimeoutMult) * min
  1273  	fn := func(numConfirmations int) {
  1274  		var d *dead
  1275  
  1276  		m.nodeLock.Lock()
  1277  		state, ok := m.nodeMap[s.Node]
  1278  		timeout := ok && state.State == StateSuspect && state.StateChange == changeTime
  1279  		if timeout {
  1280  			d = &dead{Incarnation: state.Incarnation, Node: state.Name, From: m.config.Name}
  1281  		}
  1282  		m.nodeLock.Unlock()
  1283  
  1284  		if timeout {
  1285  			if k > 0 && numConfirmations < k {
  1286  				metrics.IncrCounter([]string{"memberlist", "degraded", "timeout"}, 1)
  1287  			}
  1288  
  1289  			m.logger.Printf("[INFO] memberlist: Marking %s as failed, suspect timeout reached (%d peer confirmations)",
  1290  				state.Name, numConfirmations)
  1291  
  1292  			m.deadNode(d)
  1293  		}
  1294  	}
  1295  	m.nodeTimers[s.Node] = newSuspicion(s.From, k, min, max, fn)
  1296  	if m.config.Events != nil {
  1297  		state.Node.State = state.State
  1298  		m.config.Events.NotifySuspectSateChange(&state.Node)
  1299  	}
  1300  }
  1301  
  1302  // deadNode is invoked by the network layer when we get a message
  1303  // about a dead node
  1304  func (m *Memberlist) deadNode(d *dead) {
  1305  	m.nodeLock.Lock()
  1306  	defer m.nodeLock.Unlock()
  1307  	state, ok := m.nodeMap[d.Node]
  1308  
  1309  	// If we've never heard about this node before, ignore it
  1310  	if !ok {
  1311  		return
  1312  	}
  1313  
  1314  	// Ignore old incarnation numbers
  1315  	if d.Incarnation < state.Incarnation {
  1316  		return
  1317  	}
  1318  
  1319  	// Clear out any suspicion timer that may be in effect.
  1320  	delete(m.nodeTimers, d.Node)
  1321  
  1322  	// Ignore if node is already dead
  1323  	if state.DeadOrLeft() {
  1324  		return
  1325  	}
  1326  
  1327  	// Check if this is us
  1328  	if state.Name == m.config.Name {
  1329  		// If we are not leaving we need to refute
  1330  		if !m.hasLeft() {
  1331  			m.refute(state, d.Incarnation)
  1332  			m.logger.Printf("[WARN] memberlist: Refuting a dead message (from: %s)", d.From)
  1333  			return // Do not mark ourself dead
  1334  		}
  1335  
  1336  		// If we are leaving, we broadcast and wait
  1337  		m.encodeBroadcastNotify(d.Node, deadMsg, d, m.leaveBroadcast)
  1338  	} else {
  1339  		m.encodeAndBroadcast(d.Node, deadMsg, d)
  1340  	}
  1341  
  1342  	// Update metrics
  1343  	metrics.IncrCounter([]string{"memberlist", "msg", "dead"}, 1)
  1344  
  1345  	// Update the state
  1346  	state.Incarnation = d.Incarnation
  1347  
  1348  	// If the dead message was send by the node itself, mark it is left
  1349  	// instead of dead.
  1350  	if d.Node == d.From {
  1351  		state.State = StateLeft
  1352  	} else {
  1353  		state.State = StateDead
  1354  	}
  1355  	state.StateChange = time.Now()
  1356  
  1357  	// Notify of death
  1358  	if m.config.Events != nil {
  1359  		m.config.Events.NotifyLeave(&state.Node)
  1360  	}
  1361  }
  1362  
  1363  // weightNode is invoked by the network layer when we get a message
  1364  // about node weight
  1365  func (m *Memberlist) weightNode(s *weight) {
  1366  	m.nodeLock.Lock()
  1367  	defer m.nodeLock.Unlock()
  1368  	state, ok := m.nodeMap[s.Node]
  1369  
  1370  	// If we've never heard about this node before, ignore it
  1371  	if !ok {
  1372  		return
  1373  	}
  1374  
  1375  	// Ignore old incarnation numbers
  1376  	if s.Incarnation < state.Incarnation {
  1377  		return
  1378  	}
  1379  
  1380  	// Ignore non-alive nodes or this is about us
  1381  	if state.State != StateAlive || state.Name == m.config.Name {
  1382  		return
  1383  	}
  1384  
  1385  	// Ignore old weight messages
  1386  	if s.WeightAt <= state.WeightAt {
  1387  		return
  1388  	}
  1389  
  1390  	m.encodeWeightMsgAndBroadcast(s.Node, s)
  1391  
  1392  	// Update metrics
  1393  	metrics.IncrCounter([]string{"memberlist", "msg", "weight"}, 1)
  1394  
  1395  	// Update the state
  1396  	old := state.Weight
  1397  	state.Weight = s.Weight
  1398  	state.WeightAt = s.WeightAt
  1399  	if state.Weight != old {
  1400  		if m.config.Events != nil {
  1401  			state.Node.Weight = state.Weight
  1402  			m.config.Events.NotifyWeight(&state.Node)
  1403  		}
  1404  		m.logger.Printf("[DEBUG] memberlist: updated weight (calculated at %s) of node %s from %d to %d",
  1405  			time.Unix(s.WeightAt/1000, (s.WeightAt%1000)*1000000).Local().Format("2006-01-02T15:04:05-0700"), state.Name, old, state.Weight)
  1406  	}
  1407  }
  1408  
  1409  // mergeState is invoked by the network layer when we get a Push/Pull
  1410  // state transfer
  1411  func (m *Memberlist) mergeState(remote []pushNodeState) {
  1412  	for _, r := range remote {
  1413  		switch r.State {
  1414  		case StateAlive:
  1415  			a := alive{
  1416  				Incarnation: r.Incarnation,
  1417  				Node:        r.Name,
  1418  				Addr:        r.Addr,
  1419  				Port:        r.Port,
  1420  				Meta:        r.Meta,
  1421  				Vsn:         r.Vsn,
  1422  			}
  1423  			m.aliveNode(&a, nil, false)
  1424  
  1425  		case StateLeft:
  1426  			d := dead{Incarnation: r.Incarnation, Node: r.Name, From: r.Name}
  1427  			m.deadNode(&d)
  1428  		case StateDead:
  1429  			// If the remote node believes a node is dead, we prefer to
  1430  			// suspect that node instead of declaring it dead instantly
  1431  			fallthrough
  1432  		case StateSuspect:
  1433  			s := suspect{Incarnation: r.Incarnation, Node: r.Name, From: m.config.Name}
  1434  			m.suspectNode(&s)
  1435  		}
  1436  	}
  1437  }