github.com/emate/nomad@v0.8.2-wo-binpacking/nomad/heartbeat.go (about) 1 package nomad 2 3 import ( 4 "errors" 5 "time" 6 7 "github.com/armon/go-metrics" 8 "github.com/hashicorp/consul/lib" 9 memdb "github.com/hashicorp/go-memdb" 10 "github.com/hashicorp/nomad/nomad/structs" 11 ) 12 13 const ( 14 // heartbeatNotLeader is the error string returned when the heartbeat request 15 // couldn't be completed since the server is not the leader. 16 heartbeatNotLeader = "failed to reset heartbeat since server is not leader" 17 ) 18 19 var ( 20 // heartbeatNotLeaderErr is the error returned when the heartbeat request 21 // couldn't be completed since the server is not the leader. 22 heartbeatNotLeaderErr = errors.New(heartbeatNotLeader) 23 ) 24 25 // initializeHeartbeatTimers is used when a leader is newly elected to create 26 // a new map to track heartbeat expiration and to reset all the timers from 27 // the previously known set of timers. 28 func (s *Server) initializeHeartbeatTimers() error { 29 // Scan all nodes and reset their timer 30 snap, err := s.fsm.State().Snapshot() 31 if err != nil { 32 return err 33 } 34 35 // Get an iterator over nodes 36 ws := memdb.NewWatchSet() 37 iter, err := snap.Nodes(ws) 38 if err != nil { 39 return err 40 } 41 42 s.heartbeatTimersLock.Lock() 43 defer s.heartbeatTimersLock.Unlock() 44 45 // Handle each node 46 for { 47 raw := iter.Next() 48 if raw == nil { 49 break 50 } 51 node := raw.(*structs.Node) 52 if node.TerminalStatus() { 53 continue 54 } 55 s.resetHeartbeatTimerLocked(node.ID, s.config.FailoverHeartbeatTTL) 56 } 57 return nil 58 } 59 60 // resetHeartbeatTimer is used to reset the TTL of a heartbeat. 61 // This can be used for new heartbeats and existing ones. 62 func (s *Server) resetHeartbeatTimer(id string) (time.Duration, error) { 63 s.heartbeatTimersLock.Lock() 64 defer s.heartbeatTimersLock.Unlock() 65 66 // Do not create a timer for the node since we are not the leader. This 67 // check avoids the race in which leadership is lost but a timer is created 68 // on this server since it was servicing an RPC during a leadership loss. 69 if !s.IsLeader() { 70 s.logger.Printf("[DEBUG] nomad.heartbeat: ignoring resetting node %q TTL since this node is not the leader", id) 71 return 0, heartbeatNotLeaderErr 72 } 73 74 // Compute the target TTL value 75 n := len(s.heartbeatTimers) 76 ttl := lib.RateScaledInterval(s.config.MaxHeartbeatsPerSecond, s.config.MinHeartbeatTTL, n) 77 ttl += lib.RandomStagger(ttl) 78 79 // Reset the TTL 80 s.resetHeartbeatTimerLocked(id, ttl+s.config.HeartbeatGrace) 81 return ttl, nil 82 } 83 84 // resetHeartbeatTimerLocked is used to reset a heartbeat timer 85 // assuming the heartbeatTimerLock is already held 86 func (s *Server) resetHeartbeatTimerLocked(id string, ttl time.Duration) { 87 // Ensure a timer map exists 88 if s.heartbeatTimers == nil { 89 s.heartbeatTimers = make(map[string]*time.Timer) 90 } 91 92 // Renew the heartbeat timer if it exists 93 if timer, ok := s.heartbeatTimers[id]; ok { 94 timer.Reset(ttl) 95 return 96 } 97 98 // Create a new timer to track expiration of this heartbeat 99 timer := time.AfterFunc(ttl, func() { 100 s.invalidateHeartbeat(id) 101 }) 102 s.heartbeatTimers[id] = timer 103 } 104 105 // invalidateHeartbeat is invoked when a heartbeat TTL is reached and we 106 // need to invalidate the heartbeat. 107 func (s *Server) invalidateHeartbeat(id string) { 108 defer metrics.MeasureSince([]string{"nomad", "heartbeat", "invalidate"}, time.Now()) 109 // Clear the heartbeat timer 110 s.heartbeatTimersLock.Lock() 111 delete(s.heartbeatTimers, id) 112 s.heartbeatTimersLock.Unlock() 113 114 // Do not invalidate the node since we are not the leader. This check avoids 115 // the race in which leadership is lost but a timer is created on this 116 // server since it was servicing an RPC during a leadership loss. 117 if !s.IsLeader() { 118 s.logger.Printf("[DEBUG] nomad.heartbeat: ignoring node %q TTL since this node is not the leader", id) 119 return 120 } 121 122 s.logger.Printf("[WARN] nomad.heartbeat: node '%s' TTL expired", id) 123 124 // Make a request to update the node status 125 req := structs.NodeUpdateStatusRequest{ 126 NodeID: id, 127 Status: structs.NodeStatusDown, 128 WriteRequest: structs.WriteRequest{ 129 Region: s.config.Region, 130 }, 131 } 132 var resp structs.NodeUpdateResponse 133 if err := s.staticEndpoints.Node.UpdateStatus(&req, &resp); err != nil { 134 s.logger.Printf("[ERR] nomad.heartbeat: update status failed: %v", err) 135 } 136 } 137 138 // clearHeartbeatTimer is used to clear the heartbeat time for 139 // a single heartbeat. This is used when a heartbeat is destroyed 140 // explicitly and no longer needed. 141 func (s *Server) clearHeartbeatTimer(id string) error { 142 s.heartbeatTimersLock.Lock() 143 defer s.heartbeatTimersLock.Unlock() 144 145 if timer, ok := s.heartbeatTimers[id]; ok { 146 timer.Stop() 147 delete(s.heartbeatTimers, id) 148 } 149 return nil 150 } 151 152 // clearAllHeartbeatTimers is used when a leader is stepping 153 // down and we no longer need to track any heartbeat timers. 154 func (s *Server) clearAllHeartbeatTimers() error { 155 s.heartbeatTimersLock.Lock() 156 defer s.heartbeatTimersLock.Unlock() 157 158 for _, t := range s.heartbeatTimers { 159 t.Stop() 160 } 161 s.heartbeatTimers = nil 162 return nil 163 } 164 165 // heartbeatStats is a long running routine used to capture 166 // the number of active heartbeats being tracked 167 func (s *Server) heartbeatStats() { 168 for { 169 select { 170 case <-time.After(5 * time.Second): 171 s.heartbeatTimersLock.Lock() 172 num := len(s.heartbeatTimers) 173 s.heartbeatTimersLock.Unlock() 174 metrics.SetGauge([]string{"nomad", "heartbeat", "active"}, float32(num)) 175 176 case <-s.shutdownCh: 177 return 178 } 179 } 180 }