github.com/zoomfoo/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/nomad/heartbeat.go (about)

     1  package nomad
     2  
     3  import (
     4  	"errors"
     5  	"time"
     6  
     7  	"github.com/armon/go-metrics"
     8  	"github.com/hashicorp/consul/lib"
     9  	memdb "github.com/hashicorp/go-memdb"
    10  	"github.com/hashicorp/nomad/nomad/structs"
    11  )
    12  
const (
	// heartbeatNotLeader is the error string returned when the heartbeat request
	// couldn't be completed since the server is not the leader. It is the
	// message wrapped by heartbeatNotLeaderErr.
	heartbeatNotLeader = "failed to reset heartbeat since server is not leader"

	// NodeHeartbeatEventMissed is the message set on the node event that is
	// submitted when a node's heartbeat is missed.
	NodeHeartbeatEventMissed = "Node heartbeat missed"
)
    22  
var (
	// heartbeatNotLeaderErr is the error returned by resetHeartbeatTimer when
	// the heartbeat request couldn't be completed since the server is not the
	// leader. Its text is the heartbeatNotLeader constant.
	heartbeatNotLeaderErr = errors.New(heartbeatNotLeader)
)
    28  
    29  // initializeHeartbeatTimers is used when a leader is newly elected to create
    30  // a new map to track heartbeat expiration and to reset all the timers from
    31  // the previously known set of timers.
    32  func (s *Server) initializeHeartbeatTimers() error {
    33  	// Scan all nodes and reset their timer
    34  	snap, err := s.fsm.State().Snapshot()
    35  	if err != nil {
    36  		return err
    37  	}
    38  
    39  	// Get an iterator over nodes
    40  	ws := memdb.NewWatchSet()
    41  	iter, err := snap.Nodes(ws)
    42  	if err != nil {
    43  		return err
    44  	}
    45  
    46  	s.heartbeatTimersLock.Lock()
    47  	defer s.heartbeatTimersLock.Unlock()
    48  
    49  	// Handle each node
    50  	for {
    51  		raw := iter.Next()
    52  		if raw == nil {
    53  			break
    54  		}
    55  		node := raw.(*structs.Node)
    56  		if node.TerminalStatus() {
    57  			continue
    58  		}
    59  		s.resetHeartbeatTimerLocked(node.ID, s.config.FailoverHeartbeatTTL)
    60  	}
    61  	return nil
    62  }
    63  
    64  // resetHeartbeatTimer is used to reset the TTL of a heartbeat.
    65  // This can be used for new heartbeats and existing ones.
    66  func (s *Server) resetHeartbeatTimer(id string) (time.Duration, error) {
    67  	s.heartbeatTimersLock.Lock()
    68  	defer s.heartbeatTimersLock.Unlock()
    69  
    70  	// Do not create a timer for the node since we are not the leader. This
    71  	// check avoids the race in which leadership is lost but a timer is created
    72  	// on this server since it was servicing an RPC during a leadership loss.
    73  	if !s.IsLeader() {
    74  		s.logger.Printf("[DEBUG] nomad.heartbeat: ignoring resetting node %q TTL since this node is not the leader", id)
    75  		return 0, heartbeatNotLeaderErr
    76  	}
    77  
    78  	// Compute the target TTL value
    79  	n := len(s.heartbeatTimers)
    80  	ttl := lib.RateScaledInterval(s.config.MaxHeartbeatsPerSecond, s.config.MinHeartbeatTTL, n)
    81  	ttl += lib.RandomStagger(ttl)
    82  
    83  	// Reset the TTL
    84  	s.resetHeartbeatTimerLocked(id, ttl+s.config.HeartbeatGrace)
    85  	return ttl, nil
    86  }
    87  
    88  // resetHeartbeatTimerLocked is used to reset a heartbeat timer
    89  // assuming the heartbeatTimerLock is already held
    90  func (s *Server) resetHeartbeatTimerLocked(id string, ttl time.Duration) {
    91  	// Ensure a timer map exists
    92  	if s.heartbeatTimers == nil {
    93  		s.heartbeatTimers = make(map[string]*time.Timer)
    94  	}
    95  
    96  	// Renew the heartbeat timer if it exists
    97  	if timer, ok := s.heartbeatTimers[id]; ok {
    98  		timer.Reset(ttl)
    99  		return
   100  	}
   101  
   102  	// Create a new timer to track expiration of this heartbeat
   103  	timer := time.AfterFunc(ttl, func() {
   104  		s.invalidateHeartbeat(id)
   105  	})
   106  	s.heartbeatTimers[id] = timer
   107  }
   108  
   109  // invalidateHeartbeat is invoked when a heartbeat TTL is reached and we
   110  // need to invalidate the heartbeat.
   111  func (s *Server) invalidateHeartbeat(id string) {
   112  	defer metrics.MeasureSince([]string{"nomad", "heartbeat", "invalidate"}, time.Now())
   113  	// Clear the heartbeat timer
   114  	s.heartbeatTimersLock.Lock()
   115  	if timer, ok := s.heartbeatTimers[id]; ok {
   116  		timer.Stop()
   117  		delete(s.heartbeatTimers, id)
   118  	}
   119  	s.heartbeatTimersLock.Unlock()
   120  
   121  	// Do not invalidate the node since we are not the leader. This check avoids
   122  	// the race in which leadership is lost but a timer is created on this
   123  	// server since it was servicing an RPC during a leadership loss.
   124  	if !s.IsLeader() {
   125  		s.logger.Printf("[DEBUG] nomad.heartbeat: ignoring node %q TTL since this node is not the leader", id)
   126  		return
   127  	}
   128  
   129  	s.logger.Printf("[WARN] nomad.heartbeat: node '%s' TTL expired", id)
   130  
   131  	// Make a request to update the node status
   132  	req := structs.NodeUpdateStatusRequest{
   133  		NodeID:    id,
   134  		Status:    structs.NodeStatusDown,
   135  		NodeEvent: structs.NewNodeEvent().SetSubsystem(structs.NodeEventSubsystemCluster).SetMessage(NodeHeartbeatEventMissed),
   136  		WriteRequest: structs.WriteRequest{
   137  			Region: s.config.Region,
   138  		},
   139  	}
   140  	var resp structs.NodeUpdateResponse
   141  	if err := s.staticEndpoints.Node.UpdateStatus(&req, &resp); err != nil {
   142  		s.logger.Printf("[ERR] nomad.heartbeat: update status failed: %v", err)
   143  	}
   144  }
   145  
   146  // clearHeartbeatTimer is used to clear the heartbeat time for
   147  // a single heartbeat. This is used when a heartbeat is destroyed
   148  // explicitly and no longer needed.
   149  func (s *Server) clearHeartbeatTimer(id string) error {
   150  	s.heartbeatTimersLock.Lock()
   151  	defer s.heartbeatTimersLock.Unlock()
   152  
   153  	if timer, ok := s.heartbeatTimers[id]; ok {
   154  		timer.Stop()
   155  		delete(s.heartbeatTimers, id)
   156  	}
   157  	return nil
   158  }
   159  
   160  // clearAllHeartbeatTimers is used when a leader is stepping
   161  // down and we no longer need to track any heartbeat timers.
   162  func (s *Server) clearAllHeartbeatTimers() error {
   163  	s.heartbeatTimersLock.Lock()
   164  	defer s.heartbeatTimersLock.Unlock()
   165  
   166  	for _, t := range s.heartbeatTimers {
   167  		t.Stop()
   168  	}
   169  	s.heartbeatTimers = nil
   170  	return nil
   171  }
   172  
   173  // heartbeatStats is a long running routine used to capture
   174  // the number of active heartbeats being tracked
   175  func (s *Server) heartbeatStats() {
   176  	for {
   177  		select {
   178  		case <-time.After(5 * time.Second):
   179  			s.heartbeatTimersLock.Lock()
   180  			num := len(s.heartbeatTimers)
   181  			s.heartbeatTimersLock.Unlock()
   182  			metrics.SetGauge([]string{"nomad", "heartbeat", "active"}, float32(num))
   183  
   184  		case <-s.shutdownCh:
   185  			return
   186  		}
   187  	}
   188  }