github.com/blixtra/nomad@v0.7.2-0.20171221000451-da9a1d7bb050/nomad/heartbeat.go (about) 1 package nomad 2 3 import ( 4 "time" 5 6 "github.com/armon/go-metrics" 7 "github.com/hashicorp/consul/lib" 8 memdb "github.com/hashicorp/go-memdb" 9 "github.com/hashicorp/nomad/nomad/structs" 10 ) 11 12 // initializeHeartbeatTimers is used when a leader is newly elected to create 13 // a new map to track heartbeat expiration and to reset all the timers from 14 // the previously known set of timers. 15 func (s *Server) initializeHeartbeatTimers() error { 16 // Scan all nodes and reset their timer 17 snap, err := s.fsm.State().Snapshot() 18 if err != nil { 19 return err 20 } 21 22 // Get an iterator over nodes 23 ws := memdb.NewWatchSet() 24 iter, err := snap.Nodes(ws) 25 if err != nil { 26 return err 27 } 28 29 s.heartbeatTimersLock.Lock() 30 defer s.heartbeatTimersLock.Unlock() 31 32 // Handle each node 33 for { 34 raw := iter.Next() 35 if raw == nil { 36 break 37 } 38 node := raw.(*structs.Node) 39 if node.TerminalStatus() { 40 continue 41 } 42 s.resetHeartbeatTimerLocked(node.ID, s.config.FailoverHeartbeatTTL) 43 } 44 return nil 45 } 46 47 // resetHeartbeatTimer is used to reset the TTL of a heartbeat. 48 // This can be used for new heartbeats and existing ones. 49 func (s *Server) resetHeartbeatTimer(id string) (time.Duration, error) { 50 s.heartbeatTimersLock.Lock() 51 defer s.heartbeatTimersLock.Unlock() 52 53 // Compute the target TTL value 54 n := len(s.heartbeatTimers) 55 ttl := lib.RateScaledInterval(s.config.MaxHeartbeatsPerSecond, s.config.MinHeartbeatTTL, n) 56 ttl += lib.RandomStagger(ttl) 57 58 // Reset the TTL 59 s.resetHeartbeatTimerLocked(id, ttl+s.config.HeartbeatGrace) 60 return ttl, nil 61 } 62 63 // resetHeartbeatTimerLocked is used to reset a heartbeat timer 64 // assuming the heartbeatTimerLock is already held 65 func (s *Server) resetHeartbeatTimerLocked(id string, ttl time.Duration) { 66 // Ensure a timer map exists 67 if s.heartbeatTimers == nil { 68 s.heartbeatTimers = make(map[string]*time.Timer) 69 } 70 71 // Renew the heartbeat timer if it exists 72 if timer, ok := s.heartbeatTimers[id]; ok { 73 timer.Reset(ttl) 74 return 75 } 76 77 // Create a new timer to track expiration of this heartbeat 78 timer := time.AfterFunc(ttl, func() { 79 s.invalidateHeartbeat(id) 80 }) 81 s.heartbeatTimers[id] = timer 82 } 83 84 // invalidateHeartbeat is invoked when a heartbeat TTL is reached and we 85 // need to invalidate the heartbeat. 86 func (s *Server) invalidateHeartbeat(id string) { 87 defer metrics.MeasureSince([]string{"nomad", "heartbeat", "invalidate"}, time.Now()) 88 // Clear the heartbeat timer 89 s.heartbeatTimersLock.Lock() 90 delete(s.heartbeatTimers, id) 91 s.heartbeatTimersLock.Unlock() 92 s.logger.Printf("[WARN] nomad.heartbeat: node '%s' TTL expired", id) 93 94 // Make a request to update the node status 95 req := structs.NodeUpdateStatusRequest{ 96 NodeID: id, 97 Status: structs.NodeStatusDown, 98 WriteRequest: structs.WriteRequest{ 99 Region: s.config.Region, 100 }, 101 } 102 var resp structs.NodeUpdateResponse 103 if err := s.endpoints.Node.UpdateStatus(&req, &resp); err != nil { 104 s.logger.Printf("[ERR] nomad.heartbeat: update status failed: %v", err) 105 } 106 } 107 108 // clearHeartbeatTimer is used to clear the heartbeat time for 109 // a single heartbeat. This is used when a heartbeat is destroyed 110 // explicitly and no longer needed. 111 func (s *Server) clearHeartbeatTimer(id string) error { 112 s.heartbeatTimersLock.Lock() 113 defer s.heartbeatTimersLock.Unlock() 114 115 if timer, ok := s.heartbeatTimers[id]; ok { 116 timer.Stop() 117 delete(s.heartbeatTimers, id) 118 } 119 return nil 120 } 121 122 // clearAllHeartbeatTimers is used when a leader is stepping 123 // down and we no longer need to track any heartbeat timers. 124 func (s *Server) clearAllHeartbeatTimers() error { 125 s.heartbeatTimersLock.Lock() 126 defer s.heartbeatTimersLock.Unlock() 127 128 for _, t := range s.heartbeatTimers { 129 t.Stop() 130 } 131 s.heartbeatTimers = nil 132 return nil 133 } 134 135 // heartbeatStats is a long running routine used to capture 136 // the number of active heartbeats being tracked 137 func (s *Server) heartbeatStats() { 138 for { 139 select { 140 case <-time.After(5 * time.Second): 141 s.heartbeatTimersLock.Lock() 142 num := len(s.heartbeatTimers) 143 s.heartbeatTimersLock.Unlock() 144 metrics.SetGauge([]string{"nomad", "heartbeat", "active"}, float32(num)) 145 146 case <-s.shutdownCh: 147 return 148 } 149 } 150 }