github.com/superfly/nomad@v0.10.5-fly/command/agent/consul/check_watcher.go

package consul

import (
	"context"
	"fmt"
	"time"

	log "github.com/hashicorp/go-hclog"

	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// defaultPollFreq is the default rate to poll the Consul Checks API
	defaultPollFreq = 900 * time.Millisecond
)

// ChecksAPI is the part of the Consul API the checkWatcher requires.
type ChecksAPI interface {
	// Checks returns all checks registered with the local Consul agent,
	// keyed by check ID.
	Checks() (map[string]*api.AgentCheck, error)
}
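
// As a rough sketch of how this interface is usually satisfied (assuming the
// vendored Consul client keeps the *api.Agent Checks signature shown above),
// a caller could hand the agent endpoint straight to newCheckWatcher:
//
//	client, _ := api.NewClient(api.DefaultConfig())
//	var checksAPI ChecksAPI = client.Agent()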

// WorkloadRestarter allows the checkWatcher to restart tasks or entire task groups.
type WorkloadRestarter interface {
	Restart(ctx context.Context, event *structs.TaskEvent, failure bool) error
}
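
// Any type with a matching Restart method can be watched; a minimal stub
// (illustration or test double only, not a type from this package) might be:
//
//	type noopRestarter struct{}
//
//	func (noopRestarter) Restart(ctx context.Context, event *structs.TaskEvent, failure bool) error {
//		return nil
//	}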

// checkRestart handles restarting a task if a check is unhealthy.
type checkRestart struct {
	allocID   string
	taskName  string
	checkID   string
	checkName string
	taskKey   string // composite of allocID + taskName for uniqueness

	task           WorkloadRestarter
	grace          time.Duration
	interval       time.Duration
	timeLimit      time.Duration
	ignoreWarnings bool

	// Mutable fields

	// unhealthyState is the time a check first went unhealthy. Set to the
	// zero value if the check passes before timeLimit.
	unhealthyState time.Time

	// graceUntil is when the check's grace period expires and unhealthy
	// checks should be counted.
	graceUntil time.Time

	logger log.Logger
}

// apply updates the restart state for the check and restarts the task if
// necessary. The current timestamp is passed in so all check updates share the
// same view of time (and to ease testing).
//
// It returns true if a restart was triggered, in which case this check should
// be removed (checks are added again on task startup).
func (c *checkRestart) apply(ctx context.Context, now time.Time, status string) bool {
	healthy := func() {
		if !c.unhealthyState.IsZero() {
			c.logger.Debug("canceling restart because check became healthy")
			c.unhealthyState = time.Time{}
		}
	}
	switch status {
	case api.HealthCritical:
	case api.HealthWarning:
		if c.ignoreWarnings {
			// Warnings are ignored, reset state and exit
			healthy()
			return false
		}
	default:
		// All other statuses are ok, reset state and exit
		healthy()
		return false
	}

	if now.Before(c.graceUntil) {
		// In grace period, exit
		return false
	}

	if c.unhealthyState.IsZero() {
		// First failure, set restart deadline
		if c.timeLimit != 0 {
			c.logger.Debug("check became unhealthy. Will restart if check doesn't become healthy", "time_limit", c.timeLimit)
		}
		c.unhealthyState = now
	}

	// restart timeLimit after start of this check becoming unhealthy
	restartAt := c.unhealthyState.Add(c.timeLimit)

	// Must test >= because if limit=1, restartAt == first failure
	if now.Equal(restartAt) || now.After(restartAt) {
		// hasn't become healthy by deadline, restart!
		c.logger.Debug("restarting due to unhealthy check")

		// Tell TaskRunner to restart due to failure
		reason := fmt.Sprintf("healthcheck: check %q unhealthy", c.checkName)
		event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reason)
		go asyncRestart(ctx, c.logger, c.task, event)
		return true
	}

	return false
}
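
// As a worked example of the timing above (illustrative numbers only): with
// check.Interval = 10s and CheckRestart.Limit = 3, Watch below sets
// timeLimit = 10s * (3-1) = 20s. If the check first reports critical at t=0
// (after the grace period), unhealthyState is t=0 and restartAt is t=20s, so
// the restart fires on the poll that observes roughly the third consecutive
// failing result.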

// asyncRestart mimics the pre-0.9 TaskRunner.Restart behavior and is intended
// to be called in a goroutine.
func asyncRestart(ctx context.Context, logger log.Logger, task WorkloadRestarter, event *structs.TaskEvent) {
	// Check watcher restarts are always failures
	const failure = true

	// Restarting is asynchronous so there's no reason to allow this
	// goroutine to block indefinitely.
	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()

	if err := task.Restart(ctx, event, failure); err != nil {
		// Restart errors are not actionable and only relevant when
		// debugging allocation lifecycle management.
		logger.Debug("failed to restart task", "error", err,
			"event_time", event.Time, "event_type", event.Type)
	}
}

// checkWatchUpdate describes a check to add to or remove from the watcher.
type checkWatchUpdate struct {
	checkID      string
	remove       bool
	checkRestart *checkRestart
}

// checkWatcher watches Consul checks and restarts tasks when they're
// unhealthy.
type checkWatcher struct {
	consul ChecksAPI

	// pollFreq is how often to poll the checks API and defaults to
	// defaultPollFreq
	pollFreq time.Duration

	// checkUpdateCh is how watches (and removals) are sent to the main
	// watching loop
	checkUpdateCh chan checkWatchUpdate

	// done is closed when Run has exited
	done chan struct{}

	// lastErr is true if the last Consul call failed. It is used to
	// squelch repeated error messages.
	lastErr bool

	logger log.Logger
}

// newCheckWatcher creates a new checkWatcher but does not call its Run method.
func newCheckWatcher(logger log.Logger, consul ChecksAPI) *checkWatcher {
	return &checkWatcher{
		consul:        consul,
		pollFreq:      defaultPollFreq,
		checkUpdateCh: make(chan checkWatchUpdate, 8),
		done:          make(chan struct{}),
		logger:        logger.ResetNamed("consul.health"),
	}
}
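
// A minimal wiring sketch (assumes a configured Consul client, an hclog
// logger, and a WorkloadRestarter such as a task runner; the names below are
// placeholders and error handling is elided):
//
//	client, _ := api.NewClient(api.DefaultConfig())
//	cw := newCheckWatcher(logger, client.Agent())
//	go cw.Run(ctx)
//	cw.Watch(allocID, taskName, checkID, serviceCheck, restarter)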

// Run the main Consul checks watching loop to restart tasks when their checks
// fail. Blocks until the context is canceled.
func (w *checkWatcher) Run(ctx context.Context) {
	defer close(w.done)

	// map of check IDs to their metadata
	checks := map[string]*checkRestart{}

	// timer for check polling
	checkTimer := time.NewTimer(0)
	defer checkTimer.Stop() // ensure timer is never leaked

	stopTimer := func() {
		checkTimer.Stop()
		select {
		case <-checkTimer.C:
		default:
		}
	}

	// disable by default
	stopTimer()

	// Main watch loop
	for {
		// disable polling if there are no checks
		if len(checks) == 0 {
			stopTimer()
		}

		select {
		case update := <-w.checkUpdateCh:
			if update.remove {
				// Remove a check
				delete(checks, update.checkID)
				continue
			}

			// Add/update a check
			checks[update.checkID] = update.checkRestart
			w.logger.Debug("watching check", "alloc_id", update.checkRestart.allocID,
				"task", update.checkRestart.taskName, "check", update.checkRestart.checkName)

			// if the first check was added, make sure polling is enabled
			if len(checks) == 1 {
				stopTimer()
				checkTimer.Reset(w.pollFreq)
			}

		case <-ctx.Done():
			return

		case <-checkTimer.C:
			checkTimer.Reset(w.pollFreq)

			// Set "now" as the point in time the following check results represent
			now := time.Now()

			results, err := w.consul.Checks()
			if err != nil {
				if !w.lastErr {
					w.lastErr = true
					w.logger.Error("failed retrieving health checks", "error", err)
				}
				continue
			}

			w.lastErr = false

			// Keep track of tasks restarted this period so they
			// are only restarted once and all of their checks are
			// removed.
			restartedTasks := map[string]struct{}{}

			// Loop over watched checks and update their status from results
			for cid, check := range checks {
				// Short-circuit if told to exit
				if ctx.Err() != nil {
					return
				}

				if _, ok := restartedTasks[check.taskKey]; ok {
					// This task was already restarted; remove and skip the check
					delete(checks, cid)
					continue
				}

				result, ok := results[cid]
				if !ok {
					// Only warn if outside the grace period to avoid races with check registration
					if now.After(check.graceUntil) {
						w.logger.Warn("watched check not found in Consul", "check", check.checkName, "check_id", cid)
					}
					continue
				}

				restarted := check.apply(ctx, now, result.Status)
				if restarted {
					// Checks are registered+watched on
					// startup, so it's safe to remove them
					// whenever they're restarted
					delete(checks, cid)

					restartedTasks[check.taskKey] = struct{}{}
				}
			}

			// Ensure even passing checks for restartedTasks are removed
			if len(restartedTasks) > 0 {
				for cid, check := range checks {
					if _, ok := restartedTasks[check.taskKey]; ok {
						delete(checks, cid)
					}
				}
			}
		}
	}
}

// Watch a check and restart its task if unhealthy.
func (w *checkWatcher) Watch(allocID, taskName, checkID string, check *structs.ServiceCheck, restarter WorkloadRestarter) {
	if !check.TriggersRestarts() {
		// Not watched, noop
		return
	}

	c := &checkRestart{
		allocID:        allocID,
		taskName:       taskName,
		checkID:        checkID,
		checkName:      check.Name,
		taskKey:        fmt.Sprintf("%s%s", allocID, taskName), // unique task ID
		task:           restarter,
		interval:       check.Interval,
		grace:          check.CheckRestart.Grace,
		graceUntil:     time.Now().Add(check.CheckRestart.Grace),
		timeLimit:      check.Interval * time.Duration(check.CheckRestart.Limit-1),
		ignoreWarnings: check.CheckRestart.IgnoreWarnings,
		logger:         w.logger.With("alloc_id", allocID, "task", taskName, "check", check.Name),
	}

	update := checkWatchUpdate{
		checkID:      checkID,
		checkRestart: c,
	}

	select {
	case w.checkUpdateCh <- update:
		// sent watch
	case <-w.done:
		// exited; nothing to do
	}
}

// Unwatch a check.
func (w *checkWatcher) Unwatch(cid string) {
	c := checkWatchUpdate{
		checkID: cid,
		remove:  true,
	}
	select {
	case w.checkUpdateCh <- c:
		// sent remove watch
	case <-w.done:
		// exited; nothing to do
	}
}
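
// Illustrative call-site pairing (hypothetical names): the Consul service
// client would typically Watch a check when it registers one carrying a
// check_restart stanza, and Unwatch it when the check is deregistered:
//
//	w.Watch(alloc.ID, task.Name, checkID, check, restarter)
//	// ... later, when the task or check is removed:
//	w.Unwatch(checkID)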