github.com/bigcommerce/nomad@v0.9.3-bc/command/agent/consul/check_watcher.go

github.com/bigcommerce/nomad@v0.9.3-bc/command/agent/consul/check_watcher.go (about)

     1  package consul
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"time"
     7  
     8  	log "github.com/hashicorp/go-hclog"
     9  
    10  	"github.com/hashicorp/consul/api"
    11  	"github.com/hashicorp/nomad/nomad/structs"
    12  )
    13  
    14  const (
    15  	// defaultPollFreq is the default rate to poll the Consul Checks API
    16  	defaultPollFreq = 900 * time.Millisecond
    17  )
    18  
// ChecksAPI is the part of the Consul API the checkWatcher requires.
type ChecksAPI interface {
	// Checks returns all checks as a map. checkWatcher.Run looks up each
	// watched check in the result by its check ID, so implementations are
	// expected to key the map by check ID. A non-nil error makes Run skip
	// the current poll.
	Checks() (map[string]*api.AgentCheck, error)
}
    24  
// TaskRestarter allows the checkWatcher to restart tasks.
type TaskRestarter interface {
	// Restart is called by checkRestart.apply with a TaskRestartSignal
	// event carrying the restart reason and with failure set to true. A nil
	// error is treated as the restart having been triggered; a non-nil
	// error leaves the check watched.
	Restart(ctx context.Context, event *structs.TaskEvent, failure bool) error
}
    29  
// checkRestart handles restarting a task if a check is unhealthy. One is
// built per watched check by checkWatcher.Watch, and its apply method is
// called from checkWatcher.Run with each polled status.
type checkRestart struct {
	allocID   string
	taskName  string
	checkID   string
	checkName string
	taskKey   string // composite of allocID + taskName for uniqueness

	// task is what apply restarts once the check has been unhealthy for
	// timeLimit.
	task TaskRestarter
	// grace and interval are copied by Watch from the check's
	// CheckRestart.Grace and Interval.
	grace    time.Duration
	interval time.Duration
	// timeLimit is how long after the first unhealthy result apply waits
	// before restarting. Watch computes it as Interval * (Limit - 1).
	timeLimit time.Duration
	// ignoreWarnings makes apply treat a warning status as healthy.
	ignoreWarnings bool

	// Mutable fields

	// unhealthyState is the time a check first went unhealthy. Set to the
	// zero value if the check passes before timeLimit.
	unhealthyState time.Time

	// graceUntil is when the check's grace period expires and unhealthy
	// checks should be counted.
	graceUntil time.Time

	logger log.Logger
}
    56  
    57  // apply restart state for check and restart task if necessary. Current
    58  // timestamp is passed in so all check updates have the same view of time (and
    59  // to ease testing).
    60  //
    61  // Returns true if a restart was triggered in which case this check should be
    62  // removed (checks are added on task startup).
    63  func (c *checkRestart) apply(ctx context.Context, now time.Time, status string) bool {
    64  	healthy := func() {
    65  		if !c.unhealthyState.IsZero() {
    66  			c.logger.Debug("canceling restart because check became healthy")
    67  			c.unhealthyState = time.Time{}
    68  		}
    69  	}
    70  	switch status {
    71  	case api.HealthCritical:
    72  	case api.HealthWarning:
    73  		if c.ignoreWarnings {
    74  			// Warnings are ignored, reset state and exit
    75  			healthy()
    76  			return false
    77  		}
    78  	default:
    79  		// All other statuses are ok, reset state and exit
    80  		healthy()
    81  		return false
    82  	}
    83  
    84  	if now.Before(c.graceUntil) {
    85  		// In grace period, exit
    86  		return false
    87  	}
    88  
    89  	if c.unhealthyState.IsZero() {
    90  		// First failure, set restart deadline
    91  		if c.timeLimit != 0 {
    92  			c.logger.Debug("check became unhealthy. Will restart if check doesn't become healthy", "time_limit", c.timeLimit)
    93  		}
    94  		c.unhealthyState = now
    95  	}
    96  
    97  	// restart timeLimit after start of this check becoming unhealthy
    98  	restartAt := c.unhealthyState.Add(c.timeLimit)
    99  
   100  	// Must test >= because if limit=1, restartAt == first failure
   101  	if now.Equal(restartAt) || now.After(restartAt) {
   102  		// hasn't become healthy by deadline, restart!
   103  		c.logger.Debug("restarting due to unhealthy check")
   104  
   105  		// Tell TaskRunner to restart due to failure
   106  		const failure = true
   107  		reason := fmt.Sprintf("healthcheck: check %q unhealthy", c.checkName)
   108  		event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reason)
   109  		err := c.task.Restart(ctx, event, failure)
   110  		if err != nil {
   111  			// Error restarting
   112  			return false
   113  		}
   114  		return true
   115  	}
   116  
   117  	return false
   118  }
   119  
// checkWatchUpdate is a request, sent over checkWatcher.checkUpdateCh, to add
// (or replace) a watched check or to remove one.
type checkWatchUpdate struct {
	// checkID is the key of the check in the watcher's set.
	checkID string
	// remove selects removal; when true checkRestart is not used.
	remove bool
	// checkRestart is the state stored for checkID when remove is false.
	checkRestart *checkRestart
}
   126  
// checkWatcher watches Consul checks and restarts tasks when they're
// unhealthy. Checks are registered with Watch and Unwatch; the Run loop polls
// Consul and applies the results.
type checkWatcher struct {
	// consul is polled by Run for the current status of all checks.
	consul ChecksAPI

	// pollFreq is how often to poll the checks API and defaults to
	// defaultPollFreq
	pollFreq time.Duration

	// checkUpdateCh is how watches (and removals) are sent to the main
	// watching loop
	checkUpdateCh chan checkWatchUpdate

	// done is closed when Run has exited. Watch and Unwatch select on it so
	// they do not block once the loop is gone.
	done chan struct{}

	// lastErr is true if the last Consul call failed. It is used to
	// squelch repeated error messages.
	lastErr bool

	logger log.Logger
}
   149  
   150  // newCheckWatcher creates a new checkWatcher but does not call its Run method.
   151  func newCheckWatcher(logger log.Logger, consul ChecksAPI) *checkWatcher {
   152  	return &checkWatcher{
   153  		consul:        consul,
   154  		pollFreq:      defaultPollFreq,
   155  		checkUpdateCh: make(chan checkWatchUpdate, 8),
   156  		done:          make(chan struct{}),
   157  		logger:        logger.ResetNamed("consul.health"),
   158  	}
   159  }
   160  
// Run the main Consul checks watching loop to restart tasks when their checks
// fail. Blocks until context is canceled.
//
// The set of watched checks lives only in this function: it is modified by
// updates received on checkUpdateCh and by restarts triggered here. Polling
// is only armed while at least one check is watched. w.done is closed on
// return.
func (w *checkWatcher) Run(ctx context.Context) {
	defer close(w.done)

	// map of check IDs to their metadata
	checks := map[string]*checkRestart{}

	// timer for check polling
	checkTimer := time.NewTimer(0)
	defer checkTimer.Stop() // ensure timer is never leaked

	// stopTimer stops the timer and, without blocking, discards a tick that
	// may already be waiting in its channel so a stale tick is not received
	// later.
	stopTimer := func() {
		checkTimer.Stop()
		select {
		case <-checkTimer.C:
		default:
		}
	}

	// disable by default
	stopTimer()

	// Main watch loop
	for {
		// disable polling if there are no checks
		if len(checks) == 0 {
			stopTimer()
		}

		select {
		case update := <-w.checkUpdateCh:
			if update.remove {
				// Remove a check
				delete(checks, update.checkID)
				continue
			}

			// Add/update a check
			checks[update.checkID] = update.checkRestart
			w.logger.Debug("watching check", "alloc_id", update.checkRestart.allocID,
				"task", update.checkRestart.taskName, "check", update.checkRestart.checkName)

			// if first check was added make sure polling is enabled
			if len(checks) == 1 {
				stopTimer()
				checkTimer.Reset(w.pollFreq)
			}

		case <-ctx.Done():
			return

		case <-checkTimer.C:
			// Re-arm before doing any work, so the next tick is pollFreq
			// after this one rather than after the Consul call completes.
			checkTimer.Reset(w.pollFreq)

			// Set "now" as the point in time the following check results represent
			now := time.Now()

			results, err := w.consul.Checks()
			if err != nil {
				// Log only the first failure in a run of failures.
				if !w.lastErr {
					w.lastErr = true
					w.logger.Error("failed retrieving health checks", "error", err)
				}
				continue
			}

			w.lastErr = false

			// Keep track of tasks restarted this period so they
			// are only restarted once and all of their checks are
			// removed.
			restartedTasks := map[string]struct{}{}

			// Loop over watched checks and update their status from results
			for cid, check := range checks {
				// Shortcircuit if told to exit
				if ctx.Err() != nil {
					return
				}

				if _, ok := restartedTasks[check.taskKey]; ok {
					// Check for this task already restarted; remove and skip check
					delete(checks, cid)
					continue
				}

				result, ok := results[cid]
				if !ok {
					// Only warn if outside grace period to avoid races with check registration
					if now.After(check.graceUntil) {
						w.logger.Warn("watched check not found in Consul", "check", check.checkName, "check_id", cid)
					}
					continue
				}

				restarted := check.apply(ctx, now, result.Status)
				if restarted {
					// Checks are registered+watched on
					// startup, so it's safe to remove them
					// whenever they're restarted
					delete(checks, cid)

					restartedTasks[check.taskKey] = struct{}{}
				}
			}

			// Ensure even passing checks for restartedTasks are removed.
			// Checks visited in the loop above before their task was
			// restarted are still in the map, hence this second pass.
			if len(restartedTasks) > 0 {
				for cid, check := range checks {
					if _, ok := restartedTasks[check.taskKey]; ok {
						delete(checks, cid)
					}
				}
			}
		}
	}
}
   279  
   280  // Watch a check and restart its task if unhealthy.
   281  func (w *checkWatcher) Watch(allocID, taskName, checkID string, check *structs.ServiceCheck, restarter TaskRestarter) {
   282  	if !check.TriggersRestarts() {
   283  		// Not watched, noop
   284  		return
   285  	}
   286  
   287  	c := &checkRestart{
   288  		allocID:        allocID,
   289  		taskName:       taskName,
   290  		checkID:        checkID,
   291  		checkName:      check.Name,
   292  		taskKey:        fmt.Sprintf("%s%s", allocID, taskName), // unique task ID
   293  		task:           restarter,
   294  		interval:       check.Interval,
   295  		grace:          check.CheckRestart.Grace,
   296  		graceUntil:     time.Now().Add(check.CheckRestart.Grace),
   297  		timeLimit:      check.Interval * time.Duration(check.CheckRestart.Limit-1),
   298  		ignoreWarnings: check.CheckRestart.IgnoreWarnings,
   299  		logger:         w.logger.With("alloc_id", allocID, "task", taskName, "check", check.Name),
   300  	}
   301  
   302  	update := checkWatchUpdate{
   303  		checkID:      checkID,
   304  		checkRestart: c,
   305  	}
   306  
   307  	select {
   308  	case w.checkUpdateCh <- update:
   309  		// sent watch
   310  	case <-w.done:
   311  		// exited; nothing to do
   312  	}
   313  }
   314  
   315  // Unwatch a check.
   316  func (w *checkWatcher) Unwatch(cid string) {
   317  	c := checkWatchUpdate{
   318  		checkID: cid,
   319  		remove:  true,
   320  	}
   321  	select {
   322  	case w.checkUpdateCh <- c:
   323  		// sent remove watch
   324  	case <-w.done:
   325  		// exited; nothing to do
   326  	}
   327  }