github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/heartbeat.go (about) 1 package nomad 2 3 import ( 4 "errors" 5 "time" 6 7 "github.com/armon/go-metrics" 8 "github.com/hashicorp/consul/lib" 9 memdb "github.com/hashicorp/go-memdb" 10 "github.com/hashicorp/nomad/nomad/structs" 11 ) 12 13 const ( 14 // heartbeatNotLeader is the error string returned when the heartbeat request 15 // couldn't be completed since the server is not the leader. 16 heartbeatNotLeader = "failed to reset heartbeat since server is not leader" 17 18 // NodeHeartbeatEventMissed is the event used when the Nodes heartbeat is 19 // missed. 20 NodeHeartbeatEventMissed = "Node heartbeat missed" 21 ) 22 23 var ( 24 // heartbeatNotLeaderErr is the error returned when the heartbeat request 25 // couldn't be completed since the server is not the leader. 26 heartbeatNotLeaderErr = errors.New(heartbeatNotLeader) 27 ) 28 29 // initializeHeartbeatTimers is used when a leader is newly elected to create 30 // a new map to track heartbeat expiration and to reset all the timers from 31 // the previously known set of timers. 32 func (s *Server) initializeHeartbeatTimers() error { 33 // Scan all nodes and reset their timer 34 snap, err := s.fsm.State().Snapshot() 35 if err != nil { 36 return err 37 } 38 39 // Get an iterator over nodes 40 ws := memdb.NewWatchSet() 41 iter, err := snap.Nodes(ws) 42 if err != nil { 43 return err 44 } 45 46 s.heartbeatTimersLock.Lock() 47 defer s.heartbeatTimersLock.Unlock() 48 49 // Handle each node 50 for { 51 raw := iter.Next() 52 if raw == nil { 53 break 54 } 55 node := raw.(*structs.Node) 56 if node.TerminalStatus() { 57 continue 58 } 59 s.resetHeartbeatTimerLocked(node.ID, s.config.FailoverHeartbeatTTL) 60 } 61 return nil 62 } 63 64 // resetHeartbeatTimer is used to reset the TTL of a heartbeat. 65 // This can be used for new heartbeats and existing ones. 66 func (s *Server) resetHeartbeatTimer(id string) (time.Duration, error) { 67 s.heartbeatTimersLock.Lock() 68 defer s.heartbeatTimersLock.Unlock() 69 70 // Do not create a timer for the node since we are not the leader. This 71 // check avoids the race in which leadership is lost but a timer is created 72 // on this server since it was servicing an RPC during a leadership loss. 73 if !s.IsLeader() { 74 s.logger.Printf("[DEBUG] nomad.heartbeat: ignoring resetting node %q TTL since this node is not the leader", id) 75 return 0, heartbeatNotLeaderErr 76 } 77 78 // Compute the target TTL value 79 n := len(s.heartbeatTimers) 80 ttl := lib.RateScaledInterval(s.config.MaxHeartbeatsPerSecond, s.config.MinHeartbeatTTL, n) 81 ttl += lib.RandomStagger(ttl) 82 83 // Reset the TTL 84 s.resetHeartbeatTimerLocked(id, ttl+s.config.HeartbeatGrace) 85 return ttl, nil 86 } 87 88 // resetHeartbeatTimerLocked is used to reset a heartbeat timer 89 // assuming the heartbeatTimerLock is already held 90 func (s *Server) resetHeartbeatTimerLocked(id string, ttl time.Duration) { 91 // Ensure a timer map exists 92 if s.heartbeatTimers == nil { 93 s.heartbeatTimers = make(map[string]*time.Timer) 94 } 95 96 // Renew the heartbeat timer if it exists 97 if timer, ok := s.heartbeatTimers[id]; ok { 98 timer.Reset(ttl) 99 return 100 } 101 102 // Create a new timer to track expiration of this heartbeat 103 timer := time.AfterFunc(ttl, func() { 104 s.invalidateHeartbeat(id) 105 }) 106 s.heartbeatTimers[id] = timer 107 } 108 109 // invalidateHeartbeat is invoked when a heartbeat TTL is reached and we 110 // need to invalidate the heartbeat. 111 func (s *Server) invalidateHeartbeat(id string) { 112 defer metrics.MeasureSince([]string{"nomad", "heartbeat", "invalidate"}, time.Now()) 113 // Clear the heartbeat timer 114 s.heartbeatTimersLock.Lock() 115 if timer, ok := s.heartbeatTimers[id]; ok { 116 timer.Stop() 117 delete(s.heartbeatTimers, id) 118 } 119 s.heartbeatTimersLock.Unlock() 120 121 // Do not invalidate the node since we are not the leader. This check avoids 122 // the race in which leadership is lost but a timer is created on this 123 // server since it was servicing an RPC during a leadership loss. 124 if !s.IsLeader() { 125 s.logger.Printf("[DEBUG] nomad.heartbeat: ignoring node %q TTL since this node is not the leader", id) 126 return 127 } 128 129 s.logger.Printf("[WARN] nomad.heartbeat: node '%s' TTL expired", id) 130 131 // Make a request to update the node status 132 req := structs.NodeUpdateStatusRequest{ 133 NodeID: id, 134 Status: structs.NodeStatusDown, 135 NodeEvent: structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster).SetMessage(NodeHeartbeatEventMissed), 136 WriteRequest: structs.WriteRequest{ 137 Region: s.config.Region, 138 }, 139 } 140 var resp structs.NodeUpdateResponse 141 if err := s.staticEndpoints.Node.UpdateStatus(&req, &resp); err != nil { 142 s.logger.Printf("[ERR] nomad.heartbeat: update status failed: %v", err) 143 } 144 } 145 146 // clearHeartbeatTimer is used to clear the heartbeat time for 147 // a single heartbeat. This is used when a heartbeat is destroyed 148 // explicitly and no longer needed. 149 func (s *Server) clearHeartbeatTimer(id string) error { 150 s.heartbeatTimersLock.Lock() 151 defer s.heartbeatTimersLock.Unlock() 152 153 if timer, ok := s.heartbeatTimers[id]; ok { 154 timer.Stop() 155 delete(s.heartbeatTimers, id) 156 } 157 return nil 158 } 159 160 // clearAllHeartbeatTimers is used when a leader is stepping 161 // down and we no longer need to track any heartbeat timers. 162 func (s *Server) clearAllHeartbeatTimers() error { 163 s.heartbeatTimersLock.Lock() 164 defer s.heartbeatTimersLock.Unlock() 165 166 for _, t := range s.heartbeatTimers { 167 t.Stop() 168 } 169 s.heartbeatTimers = nil 170 return nil 171 } 172 173 // heartbeatStats is a long running routine used to capture 174 // the number of active heartbeats being tracked 175 func (s *Server) heartbeatStats() { 176 for { 177 select { 178 case <-time.After(5 * time.Second): 179 s.heartbeatTimersLock.Lock() 180 num := len(s.heartbeatTimers) 181 s.heartbeatTimersLock.Unlock() 182 metrics.SetGauge([]string{"nomad", "heartbeat", "active"}, float32(num)) 183 184 case <-s.shutdownCh: 185 return 186 } 187 } 188 }