github.com/maier/nomad@v0.4.1-0.20161110003312-a9e3d0b8549d/nomad/heartbeat.go (about) 1 package nomad 2 3 import ( 4 "time" 5 6 "github.com/armon/go-metrics" 7 "github.com/hashicorp/consul/lib" 8 "github.com/hashicorp/nomad/nomad/structs" 9 ) 10 11 // initializeHeartbeatTimers is used when a leader is newly elected to create 12 // a new map to track heartbeat expiration and to reset all the timers from 13 // the previously known set of timers. 14 func (s *Server) initializeHeartbeatTimers() error { 15 // Scan all nodes and reset their timer 16 snap, err := s.fsm.State().Snapshot() 17 if err != nil { 18 return err 19 } 20 21 // Get an iterator over nodes 22 iter, err := snap.Nodes() 23 if err != nil { 24 return err 25 } 26 27 s.heartbeatTimersLock.Lock() 28 defer s.heartbeatTimersLock.Unlock() 29 30 // Handle each node 31 for { 32 raw := iter.Next() 33 if raw == nil { 34 break 35 } 36 node := raw.(*structs.Node) 37 if node.TerminalStatus() { 38 continue 39 } 40 s.resetHeartbeatTimerLocked(node.ID, s.config.FailoverHeartbeatTTL) 41 } 42 return nil 43 } 44 45 // resetHeartbeatTimer is used to reset the TTL of a heartbeat. 46 // This can be used for new heartbeats and existing ones. 47 func (s *Server) resetHeartbeatTimer(id string) (time.Duration, error) { 48 s.heartbeatTimersLock.Lock() 49 defer s.heartbeatTimersLock.Unlock() 50 51 // Compute the target TTL value 52 n := len(s.heartbeatTimers) 53 ttl := lib.RateScaledInterval(s.config.MaxHeartbeatsPerSecond, s.config.MinHeartbeatTTL, n) 54 ttl += lib.RandomStagger(ttl) 55 56 // Reset the TTL 57 s.resetHeartbeatTimerLocked(id, ttl+s.config.HeartbeatGrace) 58 return ttl, nil 59 } 60 61 // resetHeartbeatTimerLocked is used to reset a heartbeat timer 62 // assuming the heartbeatTimerLock is already held 63 func (s *Server) resetHeartbeatTimerLocked(id string, ttl time.Duration) { 64 // Ensure a timer map exists 65 if s.heartbeatTimers == nil { 66 s.heartbeatTimers = make(map[string]*time.Timer) 67 } 68 69 // Renew the heartbeat timer if it exists 70 if timer, ok := s.heartbeatTimers[id]; ok { 71 timer.Reset(ttl) 72 return 73 } 74 75 // Create a new timer to track expiration of this heartbeat 76 timer := time.AfterFunc(ttl, func() { 77 s.invalidateHeartbeat(id) 78 }) 79 s.heartbeatTimers[id] = timer 80 } 81 82 // invalidateHeartbeat is invoked when a heartbeat TTL is reached and we 83 // need to invalidate the heartbeat. 84 func (s *Server) invalidateHeartbeat(id string) { 85 defer metrics.MeasureSince([]string{"nomad", "heartbeat", "invalidate"}, time.Now()) 86 // Clear the heartbeat timer 87 s.heartbeatTimersLock.Lock() 88 delete(s.heartbeatTimers, id) 89 s.heartbeatTimersLock.Unlock() 90 s.logger.Printf("[WARN] nomad.heartbeat: node '%s' TTL expired", id) 91 92 // Make a request to update the node status 93 req := structs.NodeUpdateStatusRequest{ 94 NodeID: id, 95 Status: structs.NodeStatusDown, 96 WriteRequest: structs.WriteRequest{ 97 Region: s.config.Region, 98 }, 99 } 100 var resp structs.NodeUpdateResponse 101 if err := s.endpoints.Node.UpdateStatus(&req, &resp); err != nil { 102 s.logger.Printf("[ERR] nomad.heartbeat: update status failed: %v", err) 103 } 104 } 105 106 // clearHeartbeatTimer is used to clear the heartbeat time for 107 // a single heartbeat. This is used when a heartbeat is destroyed 108 // explicitly and no longer needed. 109 func (s *Server) clearHeartbeatTimer(id string) error { 110 s.heartbeatTimersLock.Lock() 111 defer s.heartbeatTimersLock.Unlock() 112 113 if timer, ok := s.heartbeatTimers[id]; ok { 114 timer.Stop() 115 delete(s.heartbeatTimers, id) 116 } 117 return nil 118 } 119 120 // clearAllHeartbeatTimers is used when a leader is stepping 121 // down and we no longer need to track any heartbeat timers. 122 func (s *Server) clearAllHeartbeatTimers() error { 123 s.heartbeatTimersLock.Lock() 124 defer s.heartbeatTimersLock.Unlock() 125 126 for _, t := range s.heartbeatTimers { 127 t.Stop() 128 } 129 s.heartbeatTimers = nil 130 return nil 131 } 132 133 // heartbeatStats is a long running routine used to capture 134 // the number of active heartbeats being tracked 135 func (s *Server) heartbeatStats() { 136 for { 137 select { 138 case <-time.After(5 * time.Second): 139 s.heartbeatTimersLock.Lock() 140 num := len(s.heartbeatTimers) 141 s.heartbeatTimersLock.Unlock() 142 metrics.SetGauge([]string{"nomad", "heartbeat", "active"}, float32(num)) 143 144 case <-s.shutdownCh: 145 return 146 } 147 } 148 }