github.com/hernad/nomad@v1.6.112/nomad/heartbeat.go (about)

     1  // Copyright (c) HashiCorp, Inc.
     2  // SPDX-License-Identifier: MPL-2.0
     3  
     4  package nomad
     5  
     6  import (
     7  	"errors"
     8  	"sync"
     9  	"time"
    10  
    11  	metrics "github.com/armon/go-metrics"
    12  	log "github.com/hashicorp/go-hclog"
    13  	memdb "github.com/hashicorp/go-memdb"
    14  
    15  	"github.com/hernad/nomad/helper"
    16  	"github.com/hernad/nomad/nomad/structs"
    17  )
    18  
const (
	// heartbeatNotLeader is the error string returned when the heartbeat request
	// couldn't be completed since the server is not the leader.
	heartbeatNotLeader = "failed to reset heartbeat since server is not leader"

	// NodeHeartbeatEventMissed is the node event message recorded when a
	// node's heartbeat is missed.
	NodeHeartbeatEventMissed = "Node heartbeat missed"
)
    28  
var (
	// heartbeatNotLeaderErr is the sentinel error returned when the heartbeat
	// request couldn't be completed since the server is not the leader. Its
	// message is the heartbeatNotLeader string.
	heartbeatNotLeaderErr = errors.New(heartbeatNotLeader)
)
    34  
// nodeHeartbeater is used to track expiration times of node heartbeats. If it
// detects an expired node, the node status is updated to be 'down' (or
// 'disconnected' when the node still has allocations that may reconnect).
type nodeHeartbeater struct {
	// Server is embedded so the heartbeater can reach the server's state,
	// configuration, leadership check, and RPC entry point directly.
	*Server
	// logger is the logger used for all heartbeat-related messages.
	logger log.Logger

	// heartbeatTimers track the expiration time of each heartbeat that has
	// a TTL, keyed by node ID. On expiration, the node status is updated.
	// The map may be nil; it is allocated on demand when a timer is reset.
	// All access must hold heartbeatTimersLock.
	heartbeatTimers     map[string]*time.Timer
	heartbeatTimersLock sync.Mutex
}
    46  
    47  // newNodeHeartbeater returns a new node heartbeater used to detect and act on
    48  // failed node heartbeats.
    49  func newNodeHeartbeater(s *Server) *nodeHeartbeater {
    50  	return &nodeHeartbeater{
    51  		Server: s,
    52  		logger: s.logger.Named("heartbeat"),
    53  	}
    54  }
    55  
    56  // initializeHeartbeatTimers is used when a leader is newly elected to create
    57  // a new map to track heartbeat expiration and to reset all the timers from
    58  // the previously known set of timers.
    59  func (h *nodeHeartbeater) initializeHeartbeatTimers() error {
    60  	// Scan all nodes and reset their timer
    61  	snap, err := h.fsm.State().Snapshot()
    62  	if err != nil {
    63  		return err
    64  	}
    65  
    66  	// Get an iterator over nodes
    67  	ws := memdb.NewWatchSet()
    68  	iter, err := snap.Nodes(ws)
    69  	if err != nil {
    70  		return err
    71  	}
    72  
    73  	h.heartbeatTimersLock.Lock()
    74  	defer h.heartbeatTimersLock.Unlock()
    75  
    76  	// Handle each node
    77  	for {
    78  		raw := iter.Next()
    79  		if raw == nil {
    80  			break
    81  		}
    82  		node := raw.(*structs.Node)
    83  		if node.TerminalStatus() {
    84  			continue
    85  		}
    86  		h.resetHeartbeatTimerLocked(node.ID, h.config.FailoverHeartbeatTTL)
    87  	}
    88  	return nil
    89  }
    90  
    91  // resetHeartbeatTimer is used to reset the TTL of a heartbeat.
    92  // This can be used for new heartbeats and existing ones.
    93  func (h *nodeHeartbeater) resetHeartbeatTimer(id string) (time.Duration, error) {
    94  	h.heartbeatTimersLock.Lock()
    95  	defer h.heartbeatTimersLock.Unlock()
    96  
    97  	// Do not create a timer for the node since we are not the leader. This
    98  	// check avoids the race in which leadership is lost but a timer is created
    99  	// on this server since it was servicing an RPC during a leadership loss.
   100  	if !h.IsLeader() {
   101  		h.logger.Debug("ignoring resetting node TTL since this server is not the leader", "node_id", id)
   102  		return 0, heartbeatNotLeaderErr
   103  	}
   104  
   105  	// Compute the target TTL value
   106  	n := len(h.heartbeatTimers)
   107  	ttl := helper.RateScaledInterval(h.config.MaxHeartbeatsPerSecond, h.config.MinHeartbeatTTL, n)
   108  	ttl += helper.RandomStagger(ttl)
   109  
   110  	// Reset the TTL
   111  	h.resetHeartbeatTimerLocked(id, ttl+h.config.HeartbeatGrace)
   112  	return ttl, nil
   113  }
   114  
   115  // resetHeartbeatTimerLocked is used to reset a heartbeat timer
   116  // assuming the heartbeatTimerLock is already held
   117  func (h *nodeHeartbeater) resetHeartbeatTimerLocked(id string, ttl time.Duration) {
   118  	// Ensure a timer map exists
   119  	if h.heartbeatTimers == nil {
   120  		h.heartbeatTimers = make(map[string]*time.Timer)
   121  	}
   122  
   123  	// Renew the heartbeat timer if it exists
   124  	if timer, ok := h.heartbeatTimers[id]; ok {
   125  		timer.Reset(ttl)
   126  		return
   127  	}
   128  
   129  	// Create a new timer to track expiration of this heartbeat
   130  	timer := time.AfterFunc(ttl, func() {
   131  		h.invalidateHeartbeat(id)
   132  	})
   133  	h.heartbeatTimers[id] = timer
   134  }
   135  
   136  // invalidateHeartbeat is invoked when a heartbeat TTL is reached and we
   137  // need to invalidate the heartbeat.
   138  func (h *nodeHeartbeater) invalidateHeartbeat(id string) {
   139  	defer metrics.MeasureSince([]string{"nomad", "heartbeat", "invalidate"}, time.Now())
   140  	// Clear the heartbeat timer
   141  	h.heartbeatTimersLock.Lock()
   142  	if timer, ok := h.heartbeatTimers[id]; ok {
   143  		timer.Stop()
   144  		delete(h.heartbeatTimers, id)
   145  	}
   146  	h.heartbeatTimersLock.Unlock()
   147  
   148  	// Do not invalidate the node since we are not the leader. This check avoids
   149  	// the race in which leadership is lost but a timer is created on this
   150  	// server since it was servicing an RPC during a leadership loss.
   151  	if !h.IsLeader() {
   152  		h.logger.Debug("ignoring node TTL since this server is not the leader", "node_id", id)
   153  		return
   154  	}
   155  
   156  	h.logger.Warn("node TTL expired", "node_id", id)
   157  
   158  	canDisconnect, hasPendingReconnects := h.disconnectState(id)
   159  
   160  	// Make a request to update the node status
   161  	req := structs.NodeUpdateStatusRequest{
   162  		NodeID:    id,
   163  		Status:    structs.NodeStatusDown,
   164  		NodeEvent: structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster).SetMessage(NodeHeartbeatEventMissed),
   165  		WriteRequest: structs.WriteRequest{
   166  			Region: h.config.Region,
   167  		},
   168  	}
   169  
   170  	if canDisconnect && hasPendingReconnects {
   171  		req.Status = structs.NodeStatusDisconnected
   172  	}
   173  	var resp structs.NodeUpdateResponse
   174  
   175  	if err := h.RPC("Node.UpdateStatus", &req, &resp); err != nil {
   176  		h.logger.Error("update node status failed", "error", err)
   177  	}
   178  }
   179  
   180  func (h *nodeHeartbeater) disconnectState(id string) (bool, bool) {
   181  	node, err := h.State().NodeByID(nil, id)
   182  	if err != nil {
   183  		h.logger.Error("error retrieving node by id", "error", err)
   184  		return false, false
   185  	}
   186  
   187  	// Exit if the node is already down or just initializing.
   188  	if node.Status == structs.NodeStatusDown || node.Status == structs.NodeStatusInit {
   189  		return false, false
   190  	}
   191  
   192  	allocs, err := h.State().AllocsByNode(nil, id)
   193  	if err != nil {
   194  		h.logger.Error("error retrieving allocs by node", "error", err)
   195  		return false, false
   196  	}
   197  
   198  	now := time.Now().UTC()
   199  	// Check if the node has any allocs that are configured with max_client_disconnect,
   200  	// that are past the disconnect window, and if so, whether it has at least one
   201  	// alloc that isn't yet expired.
   202  	nodeCanDisconnect := false
   203  	for _, alloc := range allocs {
   204  		allocCanDisconnect := alloc.DisconnectTimeout(now).After(now)
   205  		// Only process this until we find that at least one alloc is configured
   206  		// with max_client_disconnect.
   207  		if !nodeCanDisconnect && allocCanDisconnect {
   208  			nodeCanDisconnect = true
   209  		}
   210  		// Only process this until we find one that we want to run and has not
   211  		// yet expired.
   212  		if allocCanDisconnect &&
   213  			alloc.DesiredStatus == structs.AllocDesiredStatusRun &&
   214  			!alloc.Expired(now) {
   215  			return true, true
   216  		}
   217  	}
   218  
   219  	return nodeCanDisconnect, false
   220  }
   221  
   222  // clearHeartbeatTimer is used to clear the heartbeat time for
   223  // a single heartbeat. This is used when a heartbeat is destroyed
   224  // explicitly and no longer needed.
   225  func (h *nodeHeartbeater) clearHeartbeatTimer(id string) error {
   226  	h.heartbeatTimersLock.Lock()
   227  	defer h.heartbeatTimersLock.Unlock()
   228  
   229  	if timer, ok := h.heartbeatTimers[id]; ok {
   230  		timer.Stop()
   231  		delete(h.heartbeatTimers, id)
   232  	}
   233  	return nil
   234  }
   235  
   236  // clearAllHeartbeatTimers is used when a leader is stepping
   237  // down and we no longer need to track any heartbeat timers.
   238  func (h *nodeHeartbeater) clearAllHeartbeatTimers() error {
   239  	h.heartbeatTimersLock.Lock()
   240  	defer h.heartbeatTimersLock.Unlock()
   241  
   242  	for _, t := range h.heartbeatTimers {
   243  		t.Stop()
   244  	}
   245  	h.heartbeatTimers = nil
   246  	return nil
   247  }
   248  
   249  // heartbeatStats is a long running routine used to capture
   250  // the number of active heartbeats being tracked
   251  func (h *nodeHeartbeater) heartbeatStats() {
   252  	for {
   253  		select {
   254  		case <-time.After(5 * time.Second):
   255  			h.heartbeatTimersLock.Lock()
   256  			num := len(h.heartbeatTimers)
   257  			h.heartbeatTimersLock.Unlock()
   258  			metrics.SetGauge([]string{"nomad", "heartbeat", "active"}, float32(num))
   259  
   260  		case <-h.shutdownCh:
   261  			return
   262  		}
   263  	}
   264  }