github.com/djenriquez/nomad-1@v0.8.1/nomad/heartbeat.go (about)

     1  package nomad
     2  
     3  import (
     4  	"errors"
     5  	"time"
     6  
     7  	"github.com/armon/go-metrics"
     8  	"github.com/hashicorp/consul/lib"
     9  	memdb "github.com/hashicorp/go-memdb"
    10  	"github.com/hashicorp/nomad/nomad/structs"
    11  )
    12  
    13  const (
    14  	// heartbeatNotLeader is the error string returned when the heartbeat request
    15  	// couldn't be completed since the server is not the leader.
    16  	heartbeatNotLeader = "failed to reset heartbeat since server is not leader"
    17  )
    18  
    19  var (
    20  	// heartbeatNotLeaderErr is the error returned when the heartbeat request
    21  	// couldn't be completed since the server is not the leader.
    22  	heartbeatNotLeaderErr = errors.New(heartbeatNotLeader)
    23  )
    24  
    25  // initializeHeartbeatTimers is used when a leader is newly elected to create
    26  // a new map to track heartbeat expiration and to reset all the timers from
    27  // the previously known set of timers.
    28  func (s *Server) initializeHeartbeatTimers() error {
    29  	// Scan all nodes and reset their timer
    30  	snap, err := s.fsm.State().Snapshot()
    31  	if err != nil {
    32  		return err
    33  	}
    34  
    35  	// Get an iterator over nodes
    36  	ws := memdb.NewWatchSet()
    37  	iter, err := snap.Nodes(ws)
    38  	if err != nil {
    39  		return err
    40  	}
    41  
    42  	s.heartbeatTimersLock.Lock()
    43  	defer s.heartbeatTimersLock.Unlock()
    44  
    45  	// Handle each node
    46  	for {
    47  		raw := iter.Next()
    48  		if raw == nil {
    49  			break
    50  		}
    51  		node := raw.(*structs.Node)
    52  		if node.TerminalStatus() {
    53  			continue
    54  		}
    55  		s.resetHeartbeatTimerLocked(node.ID, s.config.FailoverHeartbeatTTL)
    56  	}
    57  	return nil
    58  }
    59  
    60  // resetHeartbeatTimer is used to reset the TTL of a heartbeat.
    61  // This can be used for new heartbeats and existing ones.
    62  func (s *Server) resetHeartbeatTimer(id string) (time.Duration, error) {
    63  	s.heartbeatTimersLock.Lock()
    64  	defer s.heartbeatTimersLock.Unlock()
    65  
    66  	// Do not create a timer for the node since we are not the leader. This
    67  	// check avoids the race in which leadership is lost but a timer is created
    68  	// on this server since it was servicing an RPC during a leadership loss.
    69  	if !s.IsLeader() {
    70  		s.logger.Printf("[DEBUG] nomad.heartbeat: ignoring resetting node %q TTL since this node is not the leader", id)
    71  		return 0, heartbeatNotLeaderErr
    72  	}
    73  
    74  	// Compute the target TTL value
    75  	n := len(s.heartbeatTimers)
    76  	ttl := lib.RateScaledInterval(s.config.MaxHeartbeatsPerSecond, s.config.MinHeartbeatTTL, n)
    77  	ttl += lib.RandomStagger(ttl)
    78  
    79  	// Reset the TTL
    80  	s.resetHeartbeatTimerLocked(id, ttl+s.config.HeartbeatGrace)
    81  	return ttl, nil
    82  }
    83  
    84  // resetHeartbeatTimerLocked is used to reset a heartbeat timer
    85  // assuming the heartbeatTimerLock is already held
    86  func (s *Server) resetHeartbeatTimerLocked(id string, ttl time.Duration) {
    87  	// Ensure a timer map exists
    88  	if s.heartbeatTimers == nil {
    89  		s.heartbeatTimers = make(map[string]*time.Timer)
    90  	}
    91  
    92  	// Renew the heartbeat timer if it exists
    93  	if timer, ok := s.heartbeatTimers[id]; ok {
    94  		timer.Reset(ttl)
    95  		return
    96  	}
    97  
    98  	// Create a new timer to track expiration of this heartbeat
    99  	timer := time.AfterFunc(ttl, func() {
   100  		s.invalidateHeartbeat(id)
   101  	})
   102  	s.heartbeatTimers[id] = timer
   103  }
   104  
   105  // invalidateHeartbeat is invoked when a heartbeat TTL is reached and we
   106  // need to invalidate the heartbeat.
   107  func (s *Server) invalidateHeartbeat(id string) {
   108  	defer metrics.MeasureSince([]string{"nomad", "heartbeat", "invalidate"}, time.Now())
   109  	// Clear the heartbeat timer
   110  	s.heartbeatTimersLock.Lock()
   111  	delete(s.heartbeatTimers, id)
   112  	s.heartbeatTimersLock.Unlock()
   113  
   114  	// Do not invalidate the node since we are not the leader. This check avoids
   115  	// the race in which leadership is lost but a timer is created on this
   116  	// server since it was servicing an RPC during a leadership loss.
   117  	if !s.IsLeader() {
   118  		s.logger.Printf("[DEBUG] nomad.heartbeat: ignoring node %q TTL since this node is not the leader", id)
   119  		return
   120  	}
   121  
   122  	s.logger.Printf("[WARN] nomad.heartbeat: node '%s' TTL expired", id)
   123  
   124  	// Make a request to update the node status
   125  	req := structs.NodeUpdateStatusRequest{
   126  		NodeID: id,
   127  		Status: structs.NodeStatusDown,
   128  		WriteRequest: structs.WriteRequest{
   129  			Region: s.config.Region,
   130  		},
   131  	}
   132  	var resp structs.NodeUpdateResponse
   133  	if err := s.staticEndpoints.Node.UpdateStatus(&req, &resp); err != nil {
   134  		s.logger.Printf("[ERR] nomad.heartbeat: update status failed: %v", err)
   135  	}
   136  }
   137  
   138  // clearHeartbeatTimer is used to clear the heartbeat time for
   139  // a single heartbeat. This is used when a heartbeat is destroyed
   140  // explicitly and no longer needed.
   141  func (s *Server) clearHeartbeatTimer(id string) error {
   142  	s.heartbeatTimersLock.Lock()
   143  	defer s.heartbeatTimersLock.Unlock()
   144  
   145  	if timer, ok := s.heartbeatTimers[id]; ok {
   146  		timer.Stop()
   147  		delete(s.heartbeatTimers, id)
   148  	}
   149  	return nil
   150  }
   151  
   152  // clearAllHeartbeatTimers is used when a leader is stepping
   153  // down and we no longer need to track any heartbeat timers.
   154  func (s *Server) clearAllHeartbeatTimers() error {
   155  	s.heartbeatTimersLock.Lock()
   156  	defer s.heartbeatTimersLock.Unlock()
   157  
   158  	for _, t := range s.heartbeatTimers {
   159  		t.Stop()
   160  	}
   161  	s.heartbeatTimers = nil
   162  	return nil
   163  }
   164  
   165  // heartbeatStats is a long running routine used to capture
   166  // the number of active heartbeats being tracked
   167  func (s *Server) heartbeatStats() {
   168  	for {
   169  		select {
   170  		case <-time.After(5 * time.Second):
   171  			s.heartbeatTimersLock.Lock()
   172  			num := len(s.heartbeatTimers)
   173  			s.heartbeatTimersLock.Unlock()
   174  			metrics.SetGauge([]string{"nomad", "heartbeat", "active"}, float32(num))
   175  
   176  		case <-s.shutdownCh:
   177  			return
   178  		}
   179  	}
   180  }