github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/nomad/heartbeat.go (about)

     1  package nomad
     2  
     3  import (
     4  	"errors"
     5  	"sync"
     6  	"time"
     7  
     8  	metrics "github.com/armon/go-metrics"
     9  	log "github.com/hashicorp/go-hclog"
    10  	memdb "github.com/hashicorp/go-memdb"
    11  
    12  	"github.com/hashicorp/nomad/helper"
    13  	"github.com/hashicorp/nomad/nomad/structs"
    14  )
    15  
    16  const (
    17  	// heartbeatNotLeader is the error string returned when the heartbeat request
    18  	// couldn't be completed since the server is not the leader.
    19  	heartbeatNotLeader = "failed to reset heartbeat since server is not leader"
    20  
    21  	// NodeHeartbeatEventMissed is the event used when the Nodes heartbeat is
    22  	// missed.
    23  	NodeHeartbeatEventMissed = "Node heartbeat missed"
    24  )
    25  
    26  var (
    27  	// heartbeatNotLeaderErr is the error returned when the heartbeat request
    28  	// couldn't be completed since the server is not the leader.
    29  	heartbeatNotLeaderErr = errors.New(heartbeatNotLeader)
    30  )
    31  
// nodeHeartbeater is used to track expiration times of node heartbeats. When
// a node's timer expires, invalidateHeartbeat asks for the node status to be
// updated to 'down' (or to 'disconnected' when disconnectState reports that
// the node can disconnect and still has pending reconnects).
type nodeHeartbeater struct {
	*Server
	logger log.Logger

	// heartbeatTimers maps a node ID to the timer tracking the expiration of
	// that node's heartbeat TTL. The map is created lazily by
	// resetHeartbeatTimerLocked and set back to nil by
	// clearAllHeartbeatTimers. All access is guarded by heartbeatTimersLock.
	heartbeatTimers     map[string]*time.Timer
	heartbeatTimersLock sync.Mutex
}
    43  
    44  // newNodeHeartbeater returns a new node heartbeater used to detect and act on
    45  // failed node heartbeats.
    46  func newNodeHeartbeater(s *Server) *nodeHeartbeater {
    47  	return &nodeHeartbeater{
    48  		Server: s,
    49  		logger: s.logger.Named("heartbeat"),
    50  	}
    51  }
    52  
    53  // initializeHeartbeatTimers is used when a leader is newly elected to create
    54  // a new map to track heartbeat expiration and to reset all the timers from
    55  // the previously known set of timers.
    56  func (h *nodeHeartbeater) initializeHeartbeatTimers() error {
    57  	// Scan all nodes and reset their timer
    58  	snap, err := h.fsm.State().Snapshot()
    59  	if err != nil {
    60  		return err
    61  	}
    62  
    63  	// Get an iterator over nodes
    64  	ws := memdb.NewWatchSet()
    65  	iter, err := snap.Nodes(ws)
    66  	if err != nil {
    67  		return err
    68  	}
    69  
    70  	h.heartbeatTimersLock.Lock()
    71  	defer h.heartbeatTimersLock.Unlock()
    72  
    73  	// Handle each node
    74  	for {
    75  		raw := iter.Next()
    76  		if raw == nil {
    77  			break
    78  		}
    79  		node := raw.(*structs.Node)
    80  		if node.TerminalStatus() {
    81  			continue
    82  		}
    83  		h.resetHeartbeatTimerLocked(node.ID, h.config.FailoverHeartbeatTTL)
    84  	}
    85  	return nil
    86  }
    87  
    88  // resetHeartbeatTimer is used to reset the TTL of a heartbeat.
    89  // This can be used for new heartbeats and existing ones.
    90  func (h *nodeHeartbeater) resetHeartbeatTimer(id string) (time.Duration, error) {
    91  	h.heartbeatTimersLock.Lock()
    92  	defer h.heartbeatTimersLock.Unlock()
    93  
    94  	// Do not create a timer for the node since we are not the leader. This
    95  	// check avoids the race in which leadership is lost but a timer is created
    96  	// on this server since it was servicing an RPC during a leadership loss.
    97  	if !h.IsLeader() {
    98  		h.logger.Debug("ignoring resetting node TTL since this server is not the leader", "node_id", id)
    99  		return 0, heartbeatNotLeaderErr
   100  	}
   101  
   102  	// Compute the target TTL value
   103  	n := len(h.heartbeatTimers)
   104  	ttl := helper.RateScaledInterval(h.config.MaxHeartbeatsPerSecond, h.config.MinHeartbeatTTL, n)
   105  	ttl += helper.RandomStagger(ttl)
   106  
   107  	// Reset the TTL
   108  	h.resetHeartbeatTimerLocked(id, ttl+h.config.HeartbeatGrace)
   109  	return ttl, nil
   110  }
   111  
   112  // resetHeartbeatTimerLocked is used to reset a heartbeat timer
   113  // assuming the heartbeatTimerLock is already held
   114  func (h *nodeHeartbeater) resetHeartbeatTimerLocked(id string, ttl time.Duration) {
   115  	// Ensure a timer map exists
   116  	if h.heartbeatTimers == nil {
   117  		h.heartbeatTimers = make(map[string]*time.Timer)
   118  	}
   119  
   120  	// Renew the heartbeat timer if it exists
   121  	if timer, ok := h.heartbeatTimers[id]; ok {
   122  		timer.Reset(ttl)
   123  		return
   124  	}
   125  
   126  	// Create a new timer to track expiration of this heartbeat
   127  	timer := time.AfterFunc(ttl, func() {
   128  		h.invalidateHeartbeat(id)
   129  	})
   130  	h.heartbeatTimers[id] = timer
   131  }
   132  
   133  // invalidateHeartbeat is invoked when a heartbeat TTL is reached and we
   134  // need to invalidate the heartbeat.
   135  func (h *nodeHeartbeater) invalidateHeartbeat(id string) {
   136  	defer metrics.MeasureSince([]string{"nomad", "heartbeat", "invalidate"}, time.Now())
   137  	// Clear the heartbeat timer
   138  	h.heartbeatTimersLock.Lock()
   139  	if timer, ok := h.heartbeatTimers[id]; ok {
   140  		timer.Stop()
   141  		delete(h.heartbeatTimers, id)
   142  	}
   143  	h.heartbeatTimersLock.Unlock()
   144  
   145  	// Do not invalidate the node since we are not the leader. This check avoids
   146  	// the race in which leadership is lost but a timer is created on this
   147  	// server since it was servicing an RPC during a leadership loss.
   148  	if !h.IsLeader() {
   149  		h.logger.Debug("ignoring node TTL since this server is not the leader", "node_id", id)
   150  		return
   151  	}
   152  
   153  	h.logger.Warn("node TTL expired", "node_id", id)
   154  
   155  	canDisconnect, hasPendingReconnects := h.disconnectState(id)
   156  
   157  	// Make a request to update the node status
   158  	req := structs.NodeUpdateStatusRequest{
   159  		NodeID:    id,
   160  		Status:    structs.NodeStatusDown,
   161  		NodeEvent: structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster).SetMessage(NodeHeartbeatEventMissed),
   162  		WriteRequest: structs.WriteRequest{
   163  			Region: h.config.Region,
   164  		},
   165  	}
   166  
   167  	if canDisconnect && hasPendingReconnects {
   168  		req.Status = structs.NodeStatusDisconnected
   169  	}
   170  	var resp structs.NodeUpdateResponse
   171  
   172  	if err := h.RPC("Node.UpdateStatus", &req, &resp); err != nil {
   173  		h.logger.Error("update node status failed", "error", err)
   174  	}
   175  }
   176  
   177  func (h *nodeHeartbeater) disconnectState(id string) (bool, bool) {
   178  	node, err := h.State().NodeByID(nil, id)
   179  	if err != nil {
   180  		h.logger.Error("error retrieving node by id", "error", err)
   181  		return false, false
   182  	}
   183  
   184  	// Exit if the node is already down or just initializing.
   185  	if node.Status == structs.NodeStatusDown || node.Status == structs.NodeStatusInit {
   186  		return false, false
   187  	}
   188  
   189  	allocs, err := h.State().AllocsByNode(nil, id)
   190  	if err != nil {
   191  		h.logger.Error("error retrieving allocs by node", "error", err)
   192  		return false, false
   193  	}
   194  
   195  	now := time.Now().UTC()
   196  	// Check if the node has any allocs that are configured with max_client_disconnect,
   197  	// that are past the disconnect window, and if so, whether it has at least one
   198  	// alloc that isn't yet expired.
   199  	nodeCanDisconnect := false
   200  	for _, alloc := range allocs {
   201  		allocCanDisconnect := alloc.DisconnectTimeout(now).After(now)
   202  		// Only process this until we find that at least one alloc is configured
   203  		// with max_client_disconnect.
   204  		if !nodeCanDisconnect && allocCanDisconnect {
   205  			nodeCanDisconnect = true
   206  		}
   207  		// Only process this until we find one that we want to run and has not
   208  		// yet expired.
   209  		if allocCanDisconnect &&
   210  			alloc.DesiredStatus == structs.AllocDesiredStatusRun &&
   211  			!alloc.Expired(now) {
   212  			return true, true
   213  		}
   214  	}
   215  
   216  	return nodeCanDisconnect, false
   217  }
   218  
   219  // clearHeartbeatTimer is used to clear the heartbeat time for
   220  // a single heartbeat. This is used when a heartbeat is destroyed
   221  // explicitly and no longer needed.
   222  func (h *nodeHeartbeater) clearHeartbeatTimer(id string) error {
   223  	h.heartbeatTimersLock.Lock()
   224  	defer h.heartbeatTimersLock.Unlock()
   225  
   226  	if timer, ok := h.heartbeatTimers[id]; ok {
   227  		timer.Stop()
   228  		delete(h.heartbeatTimers, id)
   229  	}
   230  	return nil
   231  }
   232  
   233  // clearAllHeartbeatTimers is used when a leader is stepping
   234  // down and we no longer need to track any heartbeat timers.
   235  func (h *nodeHeartbeater) clearAllHeartbeatTimers() error {
   236  	h.heartbeatTimersLock.Lock()
   237  	defer h.heartbeatTimersLock.Unlock()
   238  
   239  	for _, t := range h.heartbeatTimers {
   240  		t.Stop()
   241  	}
   242  	h.heartbeatTimers = nil
   243  	return nil
   244  }
   245  
   246  // heartbeatStats is a long running routine used to capture
   247  // the number of active heartbeats being tracked
   248  func (h *nodeHeartbeater) heartbeatStats() {
   249  	for {
   250  		select {
   251  		case <-time.After(5 * time.Second):
   252  			h.heartbeatTimersLock.Lock()
   253  			num := len(h.heartbeatTimers)
   254  			h.heartbeatTimersLock.Unlock()
   255  			metrics.SetGauge([]string{"nomad", "heartbeat", "active"}, float32(num))
   256  
   257  		case <-h.shutdownCh:
   258  			return
   259  		}
   260  	}
   261  }