github.com/djenriquez/nomad-1@v0.8.1/command/agent/consul/check_watcher.go (about)

     1  package consul
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"log"
     7  	"time"
     8  
     9  	"github.com/hashicorp/consul/api"
    10  	"github.com/hashicorp/nomad/nomad/structs"
    11  )
    12  
    13  const (
    14  	// defaultPollFreq is the default rate to poll the Consul Checks API
    15  	defaultPollFreq = 900 * time.Millisecond
    16  )
    17  
    18  // ChecksAPI is the part of the Consul API the checkWatcher requires.
    19  type ChecksAPI interface {
    20  	// Checks returns a list of all checks.
    21  	Checks() (map[string]*api.AgentCheck, error)
    22  }
    23  
    24  // TaskRestarter allows the checkWatcher to restart tasks.
    25  type TaskRestarter interface {
    26  	Restart(source, reason string, failure bool)
    27  }
    28  
    29  // checkRestart handles restarting a task if a check is unhealthy.
    30  type checkRestart struct {
    31  	allocID   string
    32  	taskName  string
    33  	checkID   string
    34  	checkName string
    35  	taskKey   string // composite of allocID + taskName for uniqueness
    36  
    37  	task           TaskRestarter
    38  	grace          time.Duration
    39  	interval       time.Duration
    40  	timeLimit      time.Duration
    41  	ignoreWarnings bool
    42  
    43  	// Mutable fields
    44  
    45  	// unhealthyState is the time a check first went unhealthy. Set to the
    46  	// zero value if the check passes before timeLimit.
    47  	unhealthyState time.Time
    48  
    49  	// graceUntil is when the check's grace period expires and unhealthy
    50  	// checks should be counted.
    51  	graceUntil time.Time
    52  
    53  	logger *log.Logger
    54  }
    55  
    56  // apply restart state for check and restart task if necessary. Current
    57  // timestamp is passed in so all check updates have the same view of time (and
    58  // to ease testing).
    59  //
    60  // Returns true if a restart was triggered in which case this check should be
    61  // removed (checks are added on task startup).
    62  func (c *checkRestart) apply(now time.Time, status string) bool {
    63  	healthy := func() {
    64  		if !c.unhealthyState.IsZero() {
    65  			c.logger.Printf("[DEBUG] consul.health: alloc %q task %q check %q became healthy; canceling restart",
    66  				c.allocID, c.taskName, c.checkName)
    67  			c.unhealthyState = time.Time{}
    68  		}
    69  	}
    70  	switch status {
    71  	case api.HealthCritical:
    72  	case api.HealthWarning:
    73  		if c.ignoreWarnings {
    74  			// Warnings are ignored, reset state and exit
    75  			healthy()
    76  			return false
    77  		}
    78  	default:
    79  		// All other statuses are ok, reset state and exit
    80  		healthy()
    81  		return false
    82  	}
    83  
    84  	if now.Before(c.graceUntil) {
    85  		// In grace period, exit
    86  		return false
    87  	}
    88  
    89  	if c.unhealthyState.IsZero() {
    90  		// First failure, set restart deadline
    91  		if c.timeLimit != 0 {
    92  			c.logger.Printf("[DEBUG] consul.health: alloc %q task %q check %q became unhealthy. Restarting in %s if not healthy",
    93  				c.allocID, c.taskName, c.checkName, c.timeLimit)
    94  		}
    95  		c.unhealthyState = now
    96  	}
    97  
    98  	// restart timeLimit after start of this check becoming unhealthy
    99  	restartAt := c.unhealthyState.Add(c.timeLimit)
   100  
   101  	// Must test >= because if limit=1, restartAt == first failure
   102  	if now.Equal(restartAt) || now.After(restartAt) {
   103  		// hasn't become healthy by deadline, restart!
   104  		c.logger.Printf("[DEBUG] consul.health: restarting alloc %q task %q due to unhealthy check %q", c.allocID, c.taskName, c.checkName)
   105  
   106  		// Tell TaskRunner to restart due to failure
   107  		const failure = true
   108  		c.task.Restart("healthcheck", fmt.Sprintf("check %q unhealthy", c.checkName), failure)
   109  		return true
   110  	}
   111  
   112  	return false
   113  }
   114  
   115  // checkWatchUpdates add or remove checks from the watcher
   116  type checkWatchUpdate struct {
   117  	checkID      string
   118  	remove       bool
   119  	checkRestart *checkRestart
   120  }
   121  
   122  // checkWatcher watches Consul checks and restarts tasks when they're
   123  // unhealthy.
   124  type checkWatcher struct {
   125  	consul ChecksAPI
   126  
   127  	// pollFreq is how often to poll the checks API and defaults to
   128  	// defaultPollFreq
   129  	pollFreq time.Duration
   130  
   131  	// checkUpdateCh is how watches (and removals) are sent to the main
   132  	// watching loop
   133  	checkUpdateCh chan checkWatchUpdate
   134  
   135  	// done is closed when Run has exited
   136  	done chan struct{}
   137  
   138  	// lastErr is true if the last Consul call failed. It is used to
   139  	// squelch repeated error messages.
   140  	lastErr bool
   141  
   142  	logger *log.Logger
   143  }
   144  
   145  // newCheckWatcher creates a new checkWatcher but does not call its Run method.
   146  func newCheckWatcher(logger *log.Logger, consul ChecksAPI) *checkWatcher {
   147  	return &checkWatcher{
   148  		consul:        consul,
   149  		pollFreq:      defaultPollFreq,
   150  		checkUpdateCh: make(chan checkWatchUpdate, 8),
   151  		done:          make(chan struct{}),
   152  		logger:        logger,
   153  	}
   154  }
   155  
   156  // Run the main Consul checks watching loop to restart tasks when their checks
   157  // fail. Blocks until context is canceled.
   158  func (w *checkWatcher) Run(ctx context.Context) {
   159  	defer close(w.done)
   160  
   161  	// map of check IDs to their metadata
   162  	checks := map[string]*checkRestart{}
   163  
   164  	// timer for check polling
   165  	checkTimer := time.NewTimer(0)
   166  	defer checkTimer.Stop() // ensure timer is never leaked
   167  
   168  	stopTimer := func() {
   169  		checkTimer.Stop()
   170  		select {
   171  		case <-checkTimer.C:
   172  		default:
   173  		}
   174  	}
   175  
   176  	// disable by default
   177  	stopTimer()
   178  
   179  	// Main watch loop
   180  	for {
   181  		// disable polling if there are no checks
   182  		if len(checks) == 0 {
   183  			stopTimer()
   184  		}
   185  
   186  		select {
   187  		case update := <-w.checkUpdateCh:
   188  			if update.remove {
   189  				// Remove a check
   190  				delete(checks, update.checkID)
   191  				continue
   192  			}
   193  
   194  			// Add/update a check
   195  			checks[update.checkID] = update.checkRestart
   196  			w.logger.Printf("[DEBUG] consul.health: watching alloc %q task %q check %q",
   197  				update.checkRestart.allocID, update.checkRestart.taskName, update.checkRestart.checkName)
   198  
   199  			// if first check was added make sure polling is enabled
   200  			if len(checks) == 1 {
   201  				stopTimer()
   202  				checkTimer.Reset(w.pollFreq)
   203  			}
   204  
   205  		case <-ctx.Done():
   206  			return
   207  
   208  		case <-checkTimer.C:
   209  			checkTimer.Reset(w.pollFreq)
   210  
   211  			// Set "now" as the point in time the following check results represent
   212  			now := time.Now()
   213  
   214  			results, err := w.consul.Checks()
   215  			if err != nil {
   216  				if !w.lastErr {
   217  					w.lastErr = true
   218  					w.logger.Printf("[ERR] consul.health: error retrieving health checks: %q", err)
   219  				}
   220  				continue
   221  			}
   222  
   223  			w.lastErr = false
   224  
   225  			// Keep track of tasks restarted this period so they
   226  			// are only restarted once and all of their checks are
   227  			// removed.
   228  			restartedTasks := map[string]struct{}{}
   229  
   230  			// Loop over watched checks and update their status from results
   231  			for cid, check := range checks {
   232  				if _, ok := restartedTasks[check.taskKey]; ok {
   233  					// Check for this task already restarted; remove and skip check
   234  					delete(checks, cid)
   235  					continue
   236  				}
   237  
   238  				result, ok := results[cid]
   239  				if !ok {
   240  					// Only warn if outside grace period to avoid races with check registration
   241  					if now.After(check.graceUntil) {
   242  						w.logger.Printf("[WARN] consul.health: watched check %q (%s) not found in Consul", check.checkName, cid)
   243  					}
   244  					continue
   245  				}
   246  
   247  				restarted := check.apply(now, result.Status)
   248  				if restarted {
   249  					// Checks are registered+watched on
   250  					// startup, so it's safe to remove them
   251  					// whenever they're restarted
   252  					delete(checks, cid)
   253  
   254  					restartedTasks[check.taskKey] = struct{}{}
   255  				}
   256  			}
   257  
   258  			// Ensure even passing checks for restartedTasks are removed
   259  			if len(restartedTasks) > 0 {
   260  				for cid, check := range checks {
   261  					if _, ok := restartedTasks[check.taskKey]; ok {
   262  						delete(checks, cid)
   263  					}
   264  				}
   265  			}
   266  		}
   267  	}
   268  }
   269  
   270  // Watch a check and restart its task if unhealthy.
   271  func (w *checkWatcher) Watch(allocID, taskName, checkID string, check *structs.ServiceCheck, restarter TaskRestarter) {
   272  	if !check.TriggersRestarts() {
   273  		// Not watched, noop
   274  		return
   275  	}
   276  
   277  	c := &checkRestart{
   278  		allocID:        allocID,
   279  		taskName:       taskName,
   280  		checkID:        checkID,
   281  		checkName:      check.Name,
   282  		taskKey:        fmt.Sprintf("%s%s", allocID, taskName), // unique task ID
   283  		task:           restarter,
   284  		interval:       check.Interval,
   285  		grace:          check.CheckRestart.Grace,
   286  		graceUntil:     time.Now().Add(check.CheckRestart.Grace),
   287  		timeLimit:      check.Interval * time.Duration(check.CheckRestart.Limit-1),
   288  		ignoreWarnings: check.CheckRestart.IgnoreWarnings,
   289  		logger:         w.logger,
   290  	}
   291  
   292  	update := checkWatchUpdate{
   293  		checkID:      checkID,
   294  		checkRestart: c,
   295  	}
   296  
   297  	select {
   298  	case w.checkUpdateCh <- update:
   299  		// sent watch
   300  	case <-w.done:
   301  		// exited; nothing to do
   302  	}
   303  }
   304  
   305  // Unwatch a check.
   306  func (w *checkWatcher) Unwatch(cid string) {
   307  	c := checkWatchUpdate{
   308  		checkID: cid,
   309  		remove:  true,
   310  	}
   311  	select {
   312  	case w.checkUpdateCh <- c:
   313  		// sent remove watch
   314  	case <-w.done:
   315  		// exited; nothing to do
   316  	}
   317  }