github.com/bigcommerce/nomad@v0.9.3-bc/nomad/heartbeat.go (about) 1 package nomad 2 3 import ( 4 "errors" 5 "sync" 6 "time" 7 8 metrics "github.com/armon/go-metrics" 9 log "github.com/hashicorp/go-hclog" 10 memdb "github.com/hashicorp/go-memdb" 11 12 "github.com/hashicorp/consul/lib" 13 "github.com/hashicorp/nomad/nomad/structs" 14 ) 15 16 const ( 17 // heartbeatNotLeader is the error string returned when the heartbeat request 18 // couldn't be completed since the server is not the leader. 19 heartbeatNotLeader = "failed to reset heartbeat since server is not leader" 20 21 // NodeHeartbeatEventMissed is the event used when the Nodes heartbeat is 22 // missed. 23 NodeHeartbeatEventMissed = "Node heartbeat missed" 24 ) 25 26 var ( 27 // heartbeatNotLeaderErr is the error returned when the heartbeat request 28 // couldn't be completed since the server is not the leader. 29 heartbeatNotLeaderErr = errors.New(heartbeatNotLeader) 30 ) 31 32 // nodeHeartbeater is used to track expiration times of node heartbeats. If it 33 // detects an expired node, the node status is updated to be 'down'. 34 type nodeHeartbeater struct { 35 *Server 36 logger log.Logger 37 38 // heartbeatTimers track the expiration time of each heartbeat that has 39 // a TTL. On expiration, the node status is updated to be 'down'. 40 heartbeatTimers map[string]*time.Timer 41 heartbeatTimersLock sync.Mutex 42 } 43 44 // newNodeHeartbeater returns a new node heartbeater used to detect and act on 45 // failed node heartbeats. 46 func newNodeHeartbeater(s *Server) *nodeHeartbeater { 47 return &nodeHeartbeater{ 48 Server: s, 49 logger: s.logger.Named("heartbeat"), 50 } 51 } 52 53 // initializeHeartbeatTimers is used when a leader is newly elected to create 54 // a new map to track heartbeat expiration and to reset all the timers from 55 // the previously known set of timers. 56 func (h *nodeHeartbeater) initializeHeartbeatTimers() error { 57 // Scan all nodes and reset their timer 58 snap, err := h.fsm.State().Snapshot() 59 if err != nil { 60 return err 61 } 62 63 // Get an iterator over nodes 64 ws := memdb.NewWatchSet() 65 iter, err := snap.Nodes(ws) 66 if err != nil { 67 return err 68 } 69 70 h.heartbeatTimersLock.Lock() 71 defer h.heartbeatTimersLock.Unlock() 72 73 // Handle each node 74 for { 75 raw := iter.Next() 76 if raw == nil { 77 break 78 } 79 node := raw.(*structs.Node) 80 if node.TerminalStatus() { 81 continue 82 } 83 h.resetHeartbeatTimerLocked(node.ID, h.config.FailoverHeartbeatTTL) 84 } 85 return nil 86 } 87 88 // resetHeartbeatTimer is used to reset the TTL of a heartbeat. 89 // This can be used for new heartbeats and existing ones. 90 func (h *nodeHeartbeater) resetHeartbeatTimer(id string) (time.Duration, error) { 91 h.heartbeatTimersLock.Lock() 92 defer h.heartbeatTimersLock.Unlock() 93 94 // Do not create a timer for the node since we are not the leader. This 95 // check avoids the race in which leadership is lost but a timer is created 96 // on this server since it was servicing an RPC during a leadership loss. 97 if !h.IsLeader() { 98 h.logger.Debug("ignoring resetting node TTL since this server is not the leader", "node_id", id) 99 return 0, heartbeatNotLeaderErr 100 } 101 102 // Compute the target TTL value 103 n := len(h.heartbeatTimers) 104 ttl := lib.RateScaledInterval(h.config.MaxHeartbeatsPerSecond, h.config.MinHeartbeatTTL, n) 105 ttl += lib.RandomStagger(ttl) 106 107 // Reset the TTL 108 h.resetHeartbeatTimerLocked(id, ttl+h.config.HeartbeatGrace) 109 return ttl, nil 110 } 111 112 // resetHeartbeatTimerLocked is used to reset a heartbeat timer 113 // assuming the heartbeatTimerLock is already held 114 func (h *nodeHeartbeater) resetHeartbeatTimerLocked(id string, ttl time.Duration) { 115 // Ensure a timer map exists 116 if h.heartbeatTimers == nil { 117 h.heartbeatTimers = make(map[string]*time.Timer) 118 } 119 120 // Renew the heartbeat timer if it exists 121 if timer, ok := h.heartbeatTimers[id]; ok { 122 timer.Reset(ttl) 123 return 124 } 125 126 // Create a new timer to track expiration of this heartbeat 127 timer := time.AfterFunc(ttl, func() { 128 h.invalidateHeartbeat(id) 129 }) 130 h.heartbeatTimers[id] = timer 131 } 132 133 // invalidateHeartbeat is invoked when a heartbeat TTL is reached and we 134 // need to invalidate the heartbeat. 135 func (h *nodeHeartbeater) invalidateHeartbeat(id string) { 136 defer metrics.MeasureSince([]string{"nomad", "heartbeat", "invalidate"}, time.Now()) 137 // Clear the heartbeat timer 138 h.heartbeatTimersLock.Lock() 139 if timer, ok := h.heartbeatTimers[id]; ok { 140 timer.Stop() 141 delete(h.heartbeatTimers, id) 142 } 143 h.heartbeatTimersLock.Unlock() 144 145 // Do not invalidate the node since we are not the leader. This check avoids 146 // the race in which leadership is lost but a timer is created on this 147 // server since it was servicing an RPC during a leadership loss. 148 if !h.IsLeader() { 149 h.logger.Debug("ignoring node TTL since this server is not the leader", "node_id", id) 150 return 151 } 152 153 h.logger.Warn("node TTL expired", "node_id", id) 154 155 // Make a request to update the node status 156 req := structs.NodeUpdateStatusRequest{ 157 NodeID: id, 158 Status: structs.NodeStatusDown, 159 NodeEvent: structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster).SetMessage(NodeHeartbeatEventMissed), 160 WriteRequest: structs.WriteRequest{ 161 Region: h.config.Region, 162 }, 163 } 164 var resp structs.NodeUpdateResponse 165 if err := h.staticEndpoints.Node.UpdateStatus(&req, &resp); err != nil { 166 h.logger.Error("update node status failed", "error", err) 167 } 168 } 169 170 // clearHeartbeatTimer is used to clear the heartbeat time for 171 // a single heartbeat. This is used when a heartbeat is destroyed 172 // explicitly and no longer needed. 173 func (h *nodeHeartbeater) clearHeartbeatTimer(id string) error { 174 h.heartbeatTimersLock.Lock() 175 defer h.heartbeatTimersLock.Unlock() 176 177 if timer, ok := h.heartbeatTimers[id]; ok { 178 timer.Stop() 179 delete(h.heartbeatTimers, id) 180 } 181 return nil 182 } 183 184 // clearAllHeartbeatTimers is used when a leader is stepping 185 // down and we no longer need to track any heartbeat timers. 186 func (h *nodeHeartbeater) clearAllHeartbeatTimers() error { 187 h.heartbeatTimersLock.Lock() 188 defer h.heartbeatTimersLock.Unlock() 189 190 for _, t := range h.heartbeatTimers { 191 t.Stop() 192 } 193 h.heartbeatTimers = nil 194 return nil 195 } 196 197 // heartbeatStats is a long running routine used to capture 198 // the number of active heartbeats being tracked 199 func (h *nodeHeartbeater) heartbeatStats() { 200 for { 201 select { 202 case <-time.After(5 * time.Second): 203 h.heartbeatTimersLock.Lock() 204 num := len(h.heartbeatTimers) 205 h.heartbeatTimersLock.Unlock() 206 metrics.SetGauge([]string{"nomad", "heartbeat", "active"}, float32(num)) 207 208 case <-h.shutdownCh: 209 return 210 } 211 } 212 }