github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/nomad/heartbeat.go (about) 1 package nomad 2 3 import ( 4 "errors" 5 "sync" 6 "time" 7 8 metrics "github.com/armon/go-metrics" 9 log "github.com/hashicorp/go-hclog" 10 memdb "github.com/hashicorp/go-memdb" 11 12 "github.com/hashicorp/nomad/helper" 13 "github.com/hashicorp/nomad/nomad/structs" 14 ) 15 16 const ( 17 // heartbeatNotLeader is the error string returned when the heartbeat request 18 // couldn't be completed since the server is not the leader. 19 heartbeatNotLeader = "failed to reset heartbeat since server is not leader" 20 21 // NodeHeartbeatEventMissed is the event used when the Nodes heartbeat is 22 // missed. 23 NodeHeartbeatEventMissed = "Node heartbeat missed" 24 ) 25 26 var ( 27 // heartbeatNotLeaderErr is the error returned when the heartbeat request 28 // couldn't be completed since the server is not the leader. 29 heartbeatNotLeaderErr = errors.New(heartbeatNotLeader) 30 ) 31 32 // nodeHeartbeater is used to track expiration times of node heartbeats. If it 33 // detects an expired node, the node status is updated to be 'down'. 34 type nodeHeartbeater struct { 35 *Server 36 logger log.Logger 37 38 // heartbeatTimers track the expiration time of each heartbeat that has 39 // a TTL. On expiration, the node status is updated to be 'down'. 40 heartbeatTimers map[string]*time.Timer 41 heartbeatTimersLock sync.Mutex 42 } 43 44 // newNodeHeartbeater returns a new node heartbeater used to detect and act on 45 // failed node heartbeats. 46 func newNodeHeartbeater(s *Server) *nodeHeartbeater { 47 return &nodeHeartbeater{ 48 Server: s, 49 logger: s.logger.Named("heartbeat"), 50 } 51 } 52 53 // initializeHeartbeatTimers is used when a leader is newly elected to create 54 // a new map to track heartbeat expiration and to reset all the timers from 55 // the previously known set of timers. 56 func (h *nodeHeartbeater) initializeHeartbeatTimers() error { 57 // Scan all nodes and reset their timer 58 snap, err := h.fsm.State().Snapshot() 59 if err != nil { 60 return err 61 } 62 63 // Get an iterator over nodes 64 ws := memdb.NewWatchSet() 65 iter, err := snap.Nodes(ws) 66 if err != nil { 67 return err 68 } 69 70 h.heartbeatTimersLock.Lock() 71 defer h.heartbeatTimersLock.Unlock() 72 73 // Handle each node 74 for { 75 raw := iter.Next() 76 if raw == nil { 77 break 78 } 79 node := raw.(*structs.Node) 80 if node.TerminalStatus() { 81 continue 82 } 83 h.resetHeartbeatTimerLocked(node.ID, h.config.FailoverHeartbeatTTL) 84 } 85 return nil 86 } 87 88 // resetHeartbeatTimer is used to reset the TTL of a heartbeat. 89 // This can be used for new heartbeats and existing ones. 90 func (h *nodeHeartbeater) resetHeartbeatTimer(id string) (time.Duration, error) { 91 h.heartbeatTimersLock.Lock() 92 defer h.heartbeatTimersLock.Unlock() 93 94 // Do not create a timer for the node since we are not the leader. This 95 // check avoids the race in which leadership is lost but a timer is created 96 // on this server since it was servicing an RPC during a leadership loss. 97 if !h.IsLeader() { 98 h.logger.Debug("ignoring resetting node TTL since this server is not the leader", "node_id", id) 99 return 0, heartbeatNotLeaderErr 100 } 101 102 // Compute the target TTL value 103 n := len(h.heartbeatTimers) 104 ttl := helper.RateScaledInterval(h.config.MaxHeartbeatsPerSecond, h.config.MinHeartbeatTTL, n) 105 ttl += helper.RandomStagger(ttl) 106 107 // Reset the TTL 108 h.resetHeartbeatTimerLocked(id, ttl+h.config.HeartbeatGrace) 109 return ttl, nil 110 } 111 112 // resetHeartbeatTimerLocked is used to reset a heartbeat timer 113 // assuming the heartbeatTimerLock is already held 114 func (h *nodeHeartbeater) resetHeartbeatTimerLocked(id string, ttl time.Duration) { 115 // Ensure a timer map exists 116 if h.heartbeatTimers == nil { 117 h.heartbeatTimers = make(map[string]*time.Timer) 118 } 119 120 // Renew the heartbeat timer if it exists 121 if timer, ok := h.heartbeatTimers[id]; ok { 122 timer.Reset(ttl) 123 return 124 } 125 126 // Create a new timer to track expiration of this heartbeat 127 timer := time.AfterFunc(ttl, func() { 128 h.invalidateHeartbeat(id) 129 }) 130 h.heartbeatTimers[id] = timer 131 } 132 133 // invalidateHeartbeat is invoked when a heartbeat TTL is reached and we 134 // need to invalidate the heartbeat. 135 func (h *nodeHeartbeater) invalidateHeartbeat(id string) { 136 defer metrics.MeasureSince([]string{"nomad", "heartbeat", "invalidate"}, time.Now()) 137 // Clear the heartbeat timer 138 h.heartbeatTimersLock.Lock() 139 if timer, ok := h.heartbeatTimers[id]; ok { 140 timer.Stop() 141 delete(h.heartbeatTimers, id) 142 } 143 h.heartbeatTimersLock.Unlock() 144 145 // Do not invalidate the node since we are not the leader. This check avoids 146 // the race in which leadership is lost but a timer is created on this 147 // server since it was servicing an RPC during a leadership loss. 148 if !h.IsLeader() { 149 h.logger.Debug("ignoring node TTL since this server is not the leader", "node_id", id) 150 return 151 } 152 153 h.logger.Warn("node TTL expired", "node_id", id) 154 155 canDisconnect, hasPendingReconnects := h.disconnectState(id) 156 157 // Make a request to update the node status 158 req := structs.NodeUpdateStatusRequest{ 159 NodeID: id, 160 Status: structs.NodeStatusDown, 161 NodeEvent: structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster).SetMessage(NodeHeartbeatEventMissed), 162 WriteRequest: structs.WriteRequest{ 163 Region: h.config.Region, 164 }, 165 } 166 167 if canDisconnect && hasPendingReconnects { 168 req.Status = structs.NodeStatusDisconnected 169 } 170 var resp structs.NodeUpdateResponse 171 172 if err := h.RPC("Node.UpdateStatus", &req, &resp); err != nil { 173 h.logger.Error("update node status failed", "error", err) 174 } 175 } 176 177 func (h *nodeHeartbeater) disconnectState(id string) (bool, bool) { 178 node, err := h.State().NodeByID(nil, id) 179 if err != nil { 180 h.logger.Error("error retrieving node by id", "error", err) 181 return false, false 182 } 183 184 // Exit if the node is already down or just initializing. 185 if node.Status == structs.NodeStatusDown || node.Status == structs.NodeStatusInit { 186 return false, false 187 } 188 189 allocs, err := h.State().AllocsByNode(nil, id) 190 if err != nil { 191 h.logger.Error("error retrieving allocs by node", "error", err) 192 return false, false 193 } 194 195 now := time.Now().UTC() 196 // Check if the node has any allocs that are configured with max_client_disconnect, 197 // that are past the disconnect window, and if so, whether it has at least one 198 // alloc that isn't yet expired. 199 nodeCanDisconnect := false 200 for _, alloc := range allocs { 201 allocCanDisconnect := alloc.DisconnectTimeout(now).After(now) 202 // Only process this until we find that at least one alloc is configured 203 // with max_client_disconnect. 204 if !nodeCanDisconnect && allocCanDisconnect { 205 nodeCanDisconnect = true 206 } 207 // Only process this until we find one that we want to run and has not 208 // yet expired. 209 if allocCanDisconnect && 210 alloc.DesiredStatus == structs.AllocDesiredStatusRun && 211 !alloc.Expired(now) { 212 return true, true 213 } 214 } 215 216 return nodeCanDisconnect, false 217 } 218 219 // clearHeartbeatTimer is used to clear the heartbeat time for 220 // a single heartbeat. This is used when a heartbeat is destroyed 221 // explicitly and no longer needed. 222 func (h *nodeHeartbeater) clearHeartbeatTimer(id string) error { 223 h.heartbeatTimersLock.Lock() 224 defer h.heartbeatTimersLock.Unlock() 225 226 if timer, ok := h.heartbeatTimers[id]; ok { 227 timer.Stop() 228 delete(h.heartbeatTimers, id) 229 } 230 return nil 231 } 232 233 // clearAllHeartbeatTimers is used when a leader is stepping 234 // down and we no longer need to track any heartbeat timers. 235 func (h *nodeHeartbeater) clearAllHeartbeatTimers() error { 236 h.heartbeatTimersLock.Lock() 237 defer h.heartbeatTimersLock.Unlock() 238 239 for _, t := range h.heartbeatTimers { 240 t.Stop() 241 } 242 h.heartbeatTimers = nil 243 return nil 244 } 245 246 // heartbeatStats is a long running routine used to capture 247 // the number of active heartbeats being tracked 248 func (h *nodeHeartbeater) heartbeatStats() { 249 for { 250 select { 251 case <-time.After(5 * time.Second): 252 h.heartbeatTimersLock.Lock() 253 num := len(h.heartbeatTimers) 254 h.heartbeatTimersLock.Unlock() 255 metrics.SetGauge([]string{"nomad", "heartbeat", "active"}, float32(num)) 256 257 case <-h.shutdownCh: 258 return 259 } 260 } 261 }