github.com/maier/nomad@v0.4.1-0.20161110003312-a9e3d0b8549d/nomad/heartbeat.go (about)

     1  package nomad
     2  
     3  import (
     4  	"time"
     5  
     6  	"github.com/armon/go-metrics"
     7  	"github.com/hashicorp/consul/lib"
     8  	"github.com/hashicorp/nomad/nomad/structs"
     9  )
    10  
    11  // initializeHeartbeatTimers is used when a leader is newly elected to create
    12  // a new map to track heartbeat expiration and to reset all the timers from
    13  // the previously known set of timers.
    14  func (s *Server) initializeHeartbeatTimers() error {
    15  	// Scan all nodes and reset their timer
    16  	snap, err := s.fsm.State().Snapshot()
    17  	if err != nil {
    18  		return err
    19  	}
    20  
    21  	// Get an iterator over nodes
    22  	iter, err := snap.Nodes()
    23  	if err != nil {
    24  		return err
    25  	}
    26  
    27  	s.heartbeatTimersLock.Lock()
    28  	defer s.heartbeatTimersLock.Unlock()
    29  
    30  	// Handle each node
    31  	for {
    32  		raw := iter.Next()
    33  		if raw == nil {
    34  			break
    35  		}
    36  		node := raw.(*structs.Node)
    37  		if node.TerminalStatus() {
    38  			continue
    39  		}
    40  		s.resetHeartbeatTimerLocked(node.ID, s.config.FailoverHeartbeatTTL)
    41  	}
    42  	return nil
    43  }
    44  
    45  // resetHeartbeatTimer is used to reset the TTL of a heartbeat.
    46  // This can be used for new heartbeats and existing ones.
    47  func (s *Server) resetHeartbeatTimer(id string) (time.Duration, error) {
    48  	s.heartbeatTimersLock.Lock()
    49  	defer s.heartbeatTimersLock.Unlock()
    50  
    51  	// Compute the target TTL value
    52  	n := len(s.heartbeatTimers)
    53  	ttl := lib.RateScaledInterval(s.config.MaxHeartbeatsPerSecond, s.config.MinHeartbeatTTL, n)
    54  	ttl += lib.RandomStagger(ttl)
    55  
    56  	// Reset the TTL
    57  	s.resetHeartbeatTimerLocked(id, ttl+s.config.HeartbeatGrace)
    58  	return ttl, nil
    59  }
    60  
    61  // resetHeartbeatTimerLocked is used to reset a heartbeat timer
    62  // assuming the heartbeatTimerLock is already held
    63  func (s *Server) resetHeartbeatTimerLocked(id string, ttl time.Duration) {
    64  	// Ensure a timer map exists
    65  	if s.heartbeatTimers == nil {
    66  		s.heartbeatTimers = make(map[string]*time.Timer)
    67  	}
    68  
    69  	// Renew the heartbeat timer if it exists
    70  	if timer, ok := s.heartbeatTimers[id]; ok {
    71  		timer.Reset(ttl)
    72  		return
    73  	}
    74  
    75  	// Create a new timer to track expiration of this heartbeat
    76  	timer := time.AfterFunc(ttl, func() {
    77  		s.invalidateHeartbeat(id)
    78  	})
    79  	s.heartbeatTimers[id] = timer
    80  }
    81  
    82  // invalidateHeartbeat is invoked when a heartbeat TTL is reached and we
    83  // need to invalidate the heartbeat.
    84  func (s *Server) invalidateHeartbeat(id string) {
    85  	defer metrics.MeasureSince([]string{"nomad", "heartbeat", "invalidate"}, time.Now())
    86  	// Clear the heartbeat timer
    87  	s.heartbeatTimersLock.Lock()
    88  	delete(s.heartbeatTimers, id)
    89  	s.heartbeatTimersLock.Unlock()
    90  	s.logger.Printf("[WARN] nomad.heartbeat: node '%s' TTL expired", id)
    91  
    92  	// Make a request to update the node status
    93  	req := structs.NodeUpdateStatusRequest{
    94  		NodeID: id,
    95  		Status: structs.NodeStatusDown,
    96  		WriteRequest: structs.WriteRequest{
    97  			Region: s.config.Region,
    98  		},
    99  	}
   100  	var resp structs.NodeUpdateResponse
   101  	if err := s.endpoints.Node.UpdateStatus(&req, &resp); err != nil {
   102  		s.logger.Printf("[ERR] nomad.heartbeat: update status failed: %v", err)
   103  	}
   104  }
   105  
   106  // clearHeartbeatTimer is used to clear the heartbeat time for
   107  // a single heartbeat. This is used when a heartbeat is destroyed
   108  // explicitly and no longer needed.
   109  func (s *Server) clearHeartbeatTimer(id string) error {
   110  	s.heartbeatTimersLock.Lock()
   111  	defer s.heartbeatTimersLock.Unlock()
   112  
   113  	if timer, ok := s.heartbeatTimers[id]; ok {
   114  		timer.Stop()
   115  		delete(s.heartbeatTimers, id)
   116  	}
   117  	return nil
   118  }
   119  
   120  // clearAllHeartbeatTimers is used when a leader is stepping
   121  // down and we no longer need to track any heartbeat timers.
   122  func (s *Server) clearAllHeartbeatTimers() error {
   123  	s.heartbeatTimersLock.Lock()
   124  	defer s.heartbeatTimersLock.Unlock()
   125  
   126  	for _, t := range s.heartbeatTimers {
   127  		t.Stop()
   128  	}
   129  	s.heartbeatTimers = nil
   130  	return nil
   131  }
   132  
   133  // heartbeatStats is a long running routine used to capture
   134  // the number of active heartbeats being tracked
   135  func (s *Server) heartbeatStats() {
   136  	for {
   137  		select {
   138  		case <-time.After(5 * time.Second):
   139  			s.heartbeatTimersLock.Lock()
   140  			num := len(s.heartbeatTimers)
   141  			s.heartbeatTimersLock.Unlock()
   142  			metrics.SetGauge([]string{"nomad", "heartbeat", "active"}, float32(num))
   143  
   144  		case <-s.shutdownCh:
   145  			return
   146  		}
   147  	}
   148  }