github.com/blixtra/nomad@v0.7.2-0.20171221000451-da9a1d7bb050/nomad/heartbeat.go (about)

     1  package nomad
     2  
     3  import (
     4  	"time"
     5  
     6  	"github.com/armon/go-metrics"
     7  	"github.com/hashicorp/consul/lib"
     8  	memdb "github.com/hashicorp/go-memdb"
     9  	"github.com/hashicorp/nomad/nomad/structs"
    10  )
    11  
    12  // initializeHeartbeatTimers is used when a leader is newly elected to create
    13  // a new map to track heartbeat expiration and to reset all the timers from
    14  // the previously known set of timers.
    15  func (s *Server) initializeHeartbeatTimers() error {
    16  	// Scan all nodes and reset their timer
    17  	snap, err := s.fsm.State().Snapshot()
    18  	if err != nil {
    19  		return err
    20  	}
    21  
    22  	// Get an iterator over nodes
    23  	ws := memdb.NewWatchSet()
    24  	iter, err := snap.Nodes(ws)
    25  	if err != nil {
    26  		return err
    27  	}
    28  
    29  	s.heartbeatTimersLock.Lock()
    30  	defer s.heartbeatTimersLock.Unlock()
    31  
    32  	// Handle each node
    33  	for {
    34  		raw := iter.Next()
    35  		if raw == nil {
    36  			break
    37  		}
    38  		node := raw.(*structs.Node)
    39  		if node.TerminalStatus() {
    40  			continue
    41  		}
    42  		s.resetHeartbeatTimerLocked(node.ID, s.config.FailoverHeartbeatTTL)
    43  	}
    44  	return nil
    45  }
    46  
    47  // resetHeartbeatTimer is used to reset the TTL of a heartbeat.
    48  // This can be used for new heartbeats and existing ones.
    49  func (s *Server) resetHeartbeatTimer(id string) (time.Duration, error) {
    50  	s.heartbeatTimersLock.Lock()
    51  	defer s.heartbeatTimersLock.Unlock()
    52  
    53  	// Compute the target TTL value
    54  	n := len(s.heartbeatTimers)
    55  	ttl := lib.RateScaledInterval(s.config.MaxHeartbeatsPerSecond, s.config.MinHeartbeatTTL, n)
    56  	ttl += lib.RandomStagger(ttl)
    57  
    58  	// Reset the TTL
    59  	s.resetHeartbeatTimerLocked(id, ttl+s.config.HeartbeatGrace)
    60  	return ttl, nil
    61  }
    62  
    63  // resetHeartbeatTimerLocked is used to reset a heartbeat timer
    64  // assuming the heartbeatTimerLock is already held
    65  func (s *Server) resetHeartbeatTimerLocked(id string, ttl time.Duration) {
    66  	// Ensure a timer map exists
    67  	if s.heartbeatTimers == nil {
    68  		s.heartbeatTimers = make(map[string]*time.Timer)
    69  	}
    70  
    71  	// Renew the heartbeat timer if it exists
    72  	if timer, ok := s.heartbeatTimers[id]; ok {
    73  		timer.Reset(ttl)
    74  		return
    75  	}
    76  
    77  	// Create a new timer to track expiration of this heartbeat
    78  	timer := time.AfterFunc(ttl, func() {
    79  		s.invalidateHeartbeat(id)
    80  	})
    81  	s.heartbeatTimers[id] = timer
    82  }
    83  
    84  // invalidateHeartbeat is invoked when a heartbeat TTL is reached and we
    85  // need to invalidate the heartbeat.
    86  func (s *Server) invalidateHeartbeat(id string) {
    87  	defer metrics.MeasureSince([]string{"nomad", "heartbeat", "invalidate"}, time.Now())
    88  	// Clear the heartbeat timer
    89  	s.heartbeatTimersLock.Lock()
    90  	delete(s.heartbeatTimers, id)
    91  	s.heartbeatTimersLock.Unlock()
    92  	s.logger.Printf("[WARN] nomad.heartbeat: node '%s' TTL expired", id)
    93  
    94  	// Make a request to update the node status
    95  	req := structs.NodeUpdateStatusRequest{
    96  		NodeID: id,
    97  		Status: structs.NodeStatusDown,
    98  		WriteRequest: structs.WriteRequest{
    99  			Region: s.config.Region,
   100  		},
   101  	}
   102  	var resp structs.NodeUpdateResponse
   103  	if err := s.endpoints.Node.UpdateStatus(&req, &resp); err != nil {
   104  		s.logger.Printf("[ERR] nomad.heartbeat: update status failed: %v", err)
   105  	}
   106  }
   107  
   108  // clearHeartbeatTimer is used to clear the heartbeat time for
   109  // a single heartbeat. This is used when a heartbeat is destroyed
   110  // explicitly and no longer needed.
   111  func (s *Server) clearHeartbeatTimer(id string) error {
   112  	s.heartbeatTimersLock.Lock()
   113  	defer s.heartbeatTimersLock.Unlock()
   114  
   115  	if timer, ok := s.heartbeatTimers[id]; ok {
   116  		timer.Stop()
   117  		delete(s.heartbeatTimers, id)
   118  	}
   119  	return nil
   120  }
   121  
   122  // clearAllHeartbeatTimers is used when a leader is stepping
   123  // down and we no longer need to track any heartbeat timers.
   124  func (s *Server) clearAllHeartbeatTimers() error {
   125  	s.heartbeatTimersLock.Lock()
   126  	defer s.heartbeatTimersLock.Unlock()
   127  
   128  	for _, t := range s.heartbeatTimers {
   129  		t.Stop()
   130  	}
   131  	s.heartbeatTimers = nil
   132  	return nil
   133  }
   134  
   135  // heartbeatStats is a long running routine used to capture
   136  // the number of active heartbeats being tracked
   137  func (s *Server) heartbeatStats() {
   138  	for {
   139  		select {
   140  		case <-time.After(5 * time.Second):
   141  			s.heartbeatTimersLock.Lock()
   142  			num := len(s.heartbeatTimers)
   143  			s.heartbeatTimersLock.Unlock()
   144  			metrics.SetGauge([]string{"nomad", "heartbeat", "active"}, float32(num))
   145  
   146  		case <-s.shutdownCh:
   147  			return
   148  		}
   149  	}
   150  }