github.com/hernad/nomad@v1.6.112/nomad/heartbeat.go (about) 1 // Copyright (c) HashiCorp, Inc. 2 // SPDX-License-Identifier: MPL-2.0 3 4 package nomad 5 6 import ( 7 "errors" 8 "sync" 9 "time" 10 11 metrics "github.com/armon/go-metrics" 12 log "github.com/hashicorp/go-hclog" 13 memdb "github.com/hashicorp/go-memdb" 14 15 "github.com/hernad/nomad/helper" 16 "github.com/hernad/nomad/nomad/structs" 17 ) 18 19 const ( 20 // heartbeatNotLeader is the error string returned when the heartbeat request 21 // couldn't be completed since the server is not the leader. 22 heartbeatNotLeader = "failed to reset heartbeat since server is not leader" 23 24 // NodeHeartbeatEventMissed is the event used when the Nodes heartbeat is 25 // missed. 26 NodeHeartbeatEventMissed = "Node heartbeat missed" 27 ) 28 29 var ( 30 // heartbeatNotLeaderErr is the error returned when the heartbeat request 31 // couldn't be completed since the server is not the leader. 32 heartbeatNotLeaderErr = errors.New(heartbeatNotLeader) 33 ) 34 35 // nodeHeartbeater is used to track expiration times of node heartbeats. If it 36 // detects an expired node, the node status is updated to be 'down'. 37 type nodeHeartbeater struct { 38 *Server 39 logger log.Logger 40 41 // heartbeatTimers track the expiration time of each heartbeat that has 42 // a TTL. On expiration, the node status is updated to be 'down'. 43 heartbeatTimers map[string]*time.Timer 44 heartbeatTimersLock sync.Mutex 45 } 46 47 // newNodeHeartbeater returns a new node heartbeater used to detect and act on 48 // failed node heartbeats. 49 func newNodeHeartbeater(s *Server) *nodeHeartbeater { 50 return &nodeHeartbeater{ 51 Server: s, 52 logger: s.logger.Named("heartbeat"), 53 } 54 } 55 56 // initializeHeartbeatTimers is used when a leader is newly elected to create 57 // a new map to track heartbeat expiration and to reset all the timers from 58 // the previously known set of timers. 59 func (h *nodeHeartbeater) initializeHeartbeatTimers() error { 60 // Scan all nodes and reset their timer 61 snap, err := h.fsm.State().Snapshot() 62 if err != nil { 63 return err 64 } 65 66 // Get an iterator over nodes 67 ws := memdb.NewWatchSet() 68 iter, err := snap.Nodes(ws) 69 if err != nil { 70 return err 71 } 72 73 h.heartbeatTimersLock.Lock() 74 defer h.heartbeatTimersLock.Unlock() 75 76 // Handle each node 77 for { 78 raw := iter.Next() 79 if raw == nil { 80 break 81 } 82 node := raw.(*structs.Node) 83 if node.TerminalStatus() { 84 continue 85 } 86 h.resetHeartbeatTimerLocked(node.ID, h.config.FailoverHeartbeatTTL) 87 } 88 return nil 89 } 90 91 // resetHeartbeatTimer is used to reset the TTL of a heartbeat. 92 // This can be used for new heartbeats and existing ones. 93 func (h *nodeHeartbeater) resetHeartbeatTimer(id string) (time.Duration, error) { 94 h.heartbeatTimersLock.Lock() 95 defer h.heartbeatTimersLock.Unlock() 96 97 // Do not create a timer for the node since we are not the leader. This 98 // check avoids the race in which leadership is lost but a timer is created 99 // on this server since it was servicing an RPC during a leadership loss. 100 if !h.IsLeader() { 101 h.logger.Debug("ignoring resetting node TTL since this server is not the leader", "node_id", id) 102 return 0, heartbeatNotLeaderErr 103 } 104 105 // Compute the target TTL value 106 n := len(h.heartbeatTimers) 107 ttl := helper.RateScaledInterval(h.config.MaxHeartbeatsPerSecond, h.config.MinHeartbeatTTL, n) 108 ttl += helper.RandomStagger(ttl) 109 110 // Reset the TTL 111 h.resetHeartbeatTimerLocked(id, ttl+h.config.HeartbeatGrace) 112 return ttl, nil 113 } 114 115 // resetHeartbeatTimerLocked is used to reset a heartbeat timer 116 // assuming the heartbeatTimerLock is already held 117 func (h *nodeHeartbeater) resetHeartbeatTimerLocked(id string, ttl time.Duration) { 118 // Ensure a timer map exists 119 if h.heartbeatTimers == nil { 120 h.heartbeatTimers = make(map[string]*time.Timer) 121 } 122 123 // Renew the heartbeat timer if it exists 124 if timer, ok := h.heartbeatTimers[id]; ok { 125 timer.Reset(ttl) 126 return 127 } 128 129 // Create a new timer to track expiration of this heartbeat 130 timer := time.AfterFunc(ttl, func() { 131 h.invalidateHeartbeat(id) 132 }) 133 h.heartbeatTimers[id] = timer 134 } 135 136 // invalidateHeartbeat is invoked when a heartbeat TTL is reached and we 137 // need to invalidate the heartbeat. 138 func (h *nodeHeartbeater) invalidateHeartbeat(id string) { 139 defer metrics.MeasureSince([]string{"nomad", "heartbeat", "invalidate"}, time.Now()) 140 // Clear the heartbeat timer 141 h.heartbeatTimersLock.Lock() 142 if timer, ok := h.heartbeatTimers[id]; ok { 143 timer.Stop() 144 delete(h.heartbeatTimers, id) 145 } 146 h.heartbeatTimersLock.Unlock() 147 148 // Do not invalidate the node since we are not the leader. This check avoids 149 // the race in which leadership is lost but a timer is created on this 150 // server since it was servicing an RPC during a leadership loss. 151 if !h.IsLeader() { 152 h.logger.Debug("ignoring node TTL since this server is not the leader", "node_id", id) 153 return 154 } 155 156 h.logger.Warn("node TTL expired", "node_id", id) 157 158 canDisconnect, hasPendingReconnects := h.disconnectState(id) 159 160 // Make a request to update the node status 161 req := structs.NodeUpdateStatusRequest{ 162 NodeID: id, 163 Status: structs.NodeStatusDown, 164 NodeEvent: structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster).SetMessage(NodeHeartbeatEventMissed), 165 WriteRequest: structs.WriteRequest{ 166 Region: h.config.Region, 167 }, 168 } 169 170 if canDisconnect && hasPendingReconnects { 171 req.Status = structs.NodeStatusDisconnected 172 } 173 var resp structs.NodeUpdateResponse 174 175 if err := h.RPC("Node.UpdateStatus", &req, &resp); err != nil { 176 h.logger.Error("update node status failed", "error", err) 177 } 178 } 179 180 func (h *nodeHeartbeater) disconnectState(id string) (bool, bool) { 181 node, err := h.State().NodeByID(nil, id) 182 if err != nil { 183 h.logger.Error("error retrieving node by id", "error", err) 184 return false, false 185 } 186 187 // Exit if the node is already down or just initializing. 188 if node.Status == structs.NodeStatusDown || node.Status == structs.NodeStatusInit { 189 return false, false 190 } 191 192 allocs, err := h.State().AllocsByNode(nil, id) 193 if err != nil { 194 h.logger.Error("error retrieving allocs by node", "error", err) 195 return false, false 196 } 197 198 now := time.Now().UTC() 199 // Check if the node has any allocs that are configured with max_client_disconnect, 200 // that are past the disconnect window, and if so, whether it has at least one 201 // alloc that isn't yet expired. 202 nodeCanDisconnect := false 203 for _, alloc := range allocs { 204 allocCanDisconnect := alloc.DisconnectTimeout(now).After(now) 205 // Only process this until we find that at least one alloc is configured 206 // with max_client_disconnect. 207 if !nodeCanDisconnect && allocCanDisconnect { 208 nodeCanDisconnect = true 209 } 210 // Only process this until we find one that we want to run and has not 211 // yet expired. 212 if allocCanDisconnect && 213 alloc.DesiredStatus == structs.AllocDesiredStatusRun && 214 !alloc.Expired(now) { 215 return true, true 216 } 217 } 218 219 return nodeCanDisconnect, false 220 } 221 222 // clearHeartbeatTimer is used to clear the heartbeat time for 223 // a single heartbeat. This is used when a heartbeat is destroyed 224 // explicitly and no longer needed. 225 func (h *nodeHeartbeater) clearHeartbeatTimer(id string) error { 226 h.heartbeatTimersLock.Lock() 227 defer h.heartbeatTimersLock.Unlock() 228 229 if timer, ok := h.heartbeatTimers[id]; ok { 230 timer.Stop() 231 delete(h.heartbeatTimers, id) 232 } 233 return nil 234 } 235 236 // clearAllHeartbeatTimers is used when a leader is stepping 237 // down and we no longer need to track any heartbeat timers. 238 func (h *nodeHeartbeater) clearAllHeartbeatTimers() error { 239 h.heartbeatTimersLock.Lock() 240 defer h.heartbeatTimersLock.Unlock() 241 242 for _, t := range h.heartbeatTimers { 243 t.Stop() 244 } 245 h.heartbeatTimers = nil 246 return nil 247 } 248 249 // heartbeatStats is a long running routine used to capture 250 // the number of active heartbeats being tracked 251 func (h *nodeHeartbeater) heartbeatStats() { 252 for { 253 select { 254 case <-time.After(5 * time.Second): 255 h.heartbeatTimersLock.Lock() 256 num := len(h.heartbeatTimers) 257 h.heartbeatTimersLock.Unlock() 258 metrics.SetGauge([]string{"nomad", "heartbeat", "active"}, float32(num)) 259 260 case <-h.shutdownCh: 261 return 262 } 263 } 264 }