github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/client/serviceregistration/watcher.go

package serviceregistration

import (
	"context"
	"fmt"
	"time"

	"github.com/hashicorp/go-hclog"
	"github.com/hashicorp/go-set"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
)

// composite of allocID + taskName for uniqueness
type key string

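// restarter tracks the state of a single watched check and decides, via
// apply, when the owning task should be restarted.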
type restarter struct {
	allocID   string
	taskName  string
	checkID   string
	checkName string
	taskKey   key

	logger         hclog.Logger
	task           WorkloadRestarter
	grace          time.Duration
	interval       time.Duration
	timeLimit      time.Duration
	ignoreWarnings bool

	// unhealthyState is the time a check first went unhealthy. Set to the
	// zero value if the check passes before timeLimit.
	unhealthyState time.Time

	// graceUntil is when the check's grace period expires and unhealthy
	// checks should be counted.
	graceUntil time.Time
}

// apply restart state for check and restart task if necessary. Current
// timestamp is passed in so all check updates have the same view of time (and
// to ease testing).
//
// Returns true if a restart was triggered in which case this check should be
// removed (checks are added on task startup).
func (r *restarter) apply(ctx context.Context, now time.Time, status string) bool {
	healthy := func() {
		if !r.unhealthyState.IsZero() {
			r.logger.Debug("canceling restart because check became healthy")
			r.unhealthyState = time.Time{}
		}
	}
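	// Note: the failing statuses below ("critical", CheckFailure, CheckPending)
	// intentionally have empty case bodies; they do not fall through, they
	// simply exit the switch and continue into the unhealthy handling below.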
	switch status {
	case "critical": // consul
	case string(structs.CheckFailure): // nomad
	case string(structs.CheckPending): // nomad
	case "warning": // consul
		if r.ignoreWarnings {
			// Warnings are ignored, reset state and exit
			healthy()
			return false
		}
	default:
		// All other statuses are ok, reset state and exit
		healthy()
		return false
	}

	if now.Before(r.graceUntil) {
		// In grace period, exit
		return false
	}

	if r.unhealthyState.IsZero() {
		// First failure, set restart deadline
		if r.timeLimit != 0 {
			r.logger.Debug("check became unhealthy. Will restart if check doesn't become healthy", "time_limit", r.timeLimit)
		}
		r.unhealthyState = now
	}

	// restart once timeLimit has elapsed since the check first became unhealthy
	restartAt := r.unhealthyState.Add(r.timeLimit)

	// Must test >= because if limit=1, restartAt == first failure
	if now.Equal(restartAt) || now.After(restartAt) {
		// hasn't become healthy by deadline, restart!
		r.logger.Debug("restarting due to unhealthy check")

		// Tell TaskRunner to restart due to failure
		reason := fmt.Sprintf("healthcheck: check %q unhealthy", r.checkName)
		event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reason)
		go asyncRestart(ctx, r.logger, r.task, event)
		return true
	}

	return false
}

// asyncRestart mimics the pre-0.9 TaskRunner.Restart behavior and is intended
// to be called in a goroutine.
func asyncRestart(ctx context.Context, logger hclog.Logger, task WorkloadRestarter, event *structs.TaskEvent) {
	// Check watcher restarts are always failures
	const failure = true

	// Restarting is asynchronous so there's no reason to allow this
	// goroutine to block indefinitely.
	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()

	if err := task.Restart(ctx, event, failure); err != nil {
		// Restart errors are not actionable and only relevant when
		// debugging allocation lifecycle management.
		logger.Debug("failed to restart task", "error", err, "event_time", event.Time, "event_type", event.Type)
	}
}

// CheckStatusGetter is implemented per-provider.
type CheckStatusGetter interface {
	// Get returns a map from CheckID -> (minimal) CheckStatus
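	//
	// For example (illustrative values; Consul reports statuses such as
	// "passing", "warning", and "critical", while the Nomad provider reports
	// structs.CheckStatus values):
	//
	//	{"check-id-1": "critical", "check-id-2": "passing"}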
	Get() (map[string]string, error)
}

// checkWatchUpdate is used to add or remove a check from the watcher.
type checkWatchUpdate struct {
	checkID string
	remove  bool
	restart *restarter
}

// A CheckWatcher watches for check failures and restarts tasks according to
// their check_restart policy.
type CheckWatcher interface {
	// Run the CheckWatcher. Maintains a background process to continuously
	// monitor active checks. Must be called before Watch or Unwatch. Must be
	// called as a goroutine.
	Run(ctx context.Context)

	// Watch the given check. If the check status enters a failing state, the
	// task associated with the check will be restarted according to its
	// check_restart policy via wr.
	Watch(allocID, taskName, checkID string, check *structs.ServiceCheck, wr WorkloadRestarter)

	// Unwatch causes the CheckWatcher to stop monitoring the check with the
	// given checkID.
	Unwatch(checkID string)
}
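
// A minimal sketch of the expected lifecycle (the surrounding variables here
// are illustrative, not part of this package's API):
//
//	watcher := NewCheckWatcher(logger, getter)
//	go watcher.Run(ctx)
//	watcher.Watch(allocID, taskName, checkID, check, taskRunner)
//	// ... later, when the check is deregistered:
//	watcher.Unwatch(checkID)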

// UniversalCheckWatcher is an implementation of CheckWatcher capable of watching
// checks in the Nomad or Consul service providers.
type UniversalCheckWatcher struct {
	logger hclog.Logger
	getter CheckStatusGetter

	// pollFrequency is how often to poll the checks API
	pollFrequency time.Duration

	// checkUpdateCh sends watches/removals to the main loop
	checkUpdateCh chan checkWatchUpdate

	// done is closed when Run has exited
	done chan struct{}

	// failedPreviousInterval indicates whether the previous poll of the checks
	// API failed; if so, ongoing errors are silenced rather than logged again
	failedPreviousInterval bool
}

func NewCheckWatcher(logger hclog.Logger, getter CheckStatusGetter) *UniversalCheckWatcher {
	return &UniversalCheckWatcher{
		logger:        logger.ResetNamed("watch.checks"),
		getter:        getter,
		pollFrequency: 1 * time.Second,
		checkUpdateCh: make(chan checkWatchUpdate, 8),
		done:          make(chan struct{}),
	}
}

// Watch a check and restart its task if unhealthy.
func (w *UniversalCheckWatcher) Watch(allocID, taskName, checkID string, check *structs.ServiceCheck, wr WorkloadRestarter) {
	if !check.TriggersRestarts() {
		return // check_restart not set; no-op
	}

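	// With check_restart.limit = N, timeLimit below works out to
	// (N-1) * interval: the task is restarted once the check has remained
	// unhealthy for that long after its first observed failure (limit = 1
	// restarts on the first failing observation).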
	c := &restarter{
		allocID:        allocID,
		taskName:       taskName,
		checkID:        checkID,
		checkName:      check.Name,
		taskKey:        key(allocID + taskName),
		task:           wr,
		interval:       check.Interval,
		grace:          check.CheckRestart.Grace,
		graceUntil:     time.Now().Add(check.CheckRestart.Grace),
		timeLimit:      check.Interval * time.Duration(check.CheckRestart.Limit-1),
		ignoreWarnings: check.CheckRestart.IgnoreWarnings,
		logger:         w.logger.With("alloc_id", allocID, "task", taskName, "check", check.Name),
	}

	select {
	case w.checkUpdateCh <- checkWatchUpdate{
		checkID: checkID,
		restart: c,
	}: // activate watch
	case <-w.done: // exited; nothing to do
	}
}

// Unwatch a check.
func (w *UniversalCheckWatcher) Unwatch(checkID string) {
	select {
	case w.checkUpdateCh <- checkWatchUpdate{
		checkID: checkID,
		remove:  true,
	}: // deactivate watch
	case <-w.done: // exited; nothing to do
	}
}

func (w *UniversalCheckWatcher) Run(ctx context.Context) {
	defer close(w.done)

	// map of checkIDs to their restarter handles (contains only checks we are watching)
	watched := make(map[string]*restarter)

	checkTimer, cleanupCheckTimer := helper.NewSafeTimer(0)
	defer cleanupCheckTimer()

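	// stopCheckTimer stops the timer and drains any tick already buffered in
	// its channel, so a stale tick cannot trigger a spurious poll after the
	// timer is later Reset.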
	stopCheckTimer := func() { // todo: refactor using that other pattern
		checkTimer.Stop()
		select {
		case <-checkTimer.C:
		default:
		}
	}

	// initialize with checkTimer disabled
	stopCheckTimer()

	for {
		// disable polling if there are no checks
		if len(watched) == 0 {
			stopCheckTimer()
		}

		select {
		// caller cancelled us; goodbye
		case <-ctx.Done():
			return

		// received an update; add or remove check
		case update := <-w.checkUpdateCh:
			if update.remove {
				delete(watched, update.checkID)
				continue
			}

			watched[update.checkID] = update.restart
			allocID := update.restart.allocID
			taskName := update.restart.taskName
			checkName := update.restart.checkName
			w.logger.Trace("now watching check", "alloc_id", allocID, "task", taskName, "check", checkName)

			// turn on the timer if we are now active
			if len(watched) == 1 {
				stopCheckTimer()
				checkTimer.Reset(w.pollFrequency)
			}

		// poll time; refresh check statuses
		case now := <-checkTimer.C:
			w.interval(ctx, now, watched)
			checkTimer.Reset(w.pollFrequency)
		}
	}
}

func (w *UniversalCheckWatcher) interval(ctx context.Context, now time.Time, watched map[string]*restarter) {
	statuses, err := w.getter.Get()
	if err != nil {
		// Log only the first error of a run of consecutive failures so ongoing
		// problems do not spam the logs, and skip this interval either way.
		if !w.failedPreviousInterval {
			w.failedPreviousInterval = true
			w.logger.Error("failed to retrieve check statuses", "error", err)
		}
		return
	}
	w.failedPreviousInterval = false

	// keep track of tasks restarted this interval
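	// (keyed by allocID+taskName, so once one of a task's checks triggers a
	// restart, the task's remaining checks are skipped and purged below)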
	restarts := set.New[key](len(statuses))

	// iterate over the checks we are watching and apply the current status
	// of each
	for checkID, checkRestarter := range watched {
		if ctx.Err() != nil {
			return // short circuit; caller cancelled us
		}

		if restarts.Contains(checkRestarter.taskKey) {
			// skip; task is already being restarted
			delete(watched, checkID)
			continue
		}

		status, exists := statuses[checkID]
		if !exists {
			// warn only if outside grace period; avoiding race with check registration
			if now.After(checkRestarter.graceUntil) {
				w.logger.Warn("watched check not found", "check_id", checkID)
			}
			continue
		}

		if checkRestarter.apply(ctx, now, status) {
			// check will be re-registered & re-watched on startup
			delete(watched, checkID)
			restarts.Insert(checkRestarter.taskKey)
		}
	}

	// purge passing checks of tasks that are being restarted
	if restarts.Size() > 0 {
		for checkID, checkRestarter := range watched {
			if restarts.Contains(checkRestarter.taskKey) {
				delete(watched, checkID)
			}
		}
	}
}