github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/client/allocrunner/health_hook.go (about)

     1  package allocrunner
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"sync"
     7  	"time"
     8  
     9  	"github.com/hashicorp/go-hclog"
    10  	"github.com/hashicorp/nomad/client/allochealth"
    11  	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
    12  	"github.com/hashicorp/nomad/client/serviceregistration"
    13  	"github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore"
    14  	cstructs "github.com/hashicorp/nomad/client/structs"
    15  	"github.com/hashicorp/nomad/nomad/structs"
    16  )
    17  
    18  // healthSetter is able to set/clear alloc health.
    19  type healthSetter interface {
    20  	// HasHealth returns true if health is already set.
    21  	HasHealth() bool
    22  
    23  	// SetHealth via the mutator.
    24  	SetHealth(healthy, isDeploy bool, taskEvents map[string]*structs.TaskEvent)
    25  
    26  	// ClearHealth for when the deployment ID changes.
    27  	ClearHealth()
    28  }
    29  
    30  // allocHealthWatcherHook is responsible for watching an allocation's task
    31  // status and (optionally) Consul health check status to determine if the
    32  // allocation is healthy or unhealthy. Used by deployments and migrations.
    33  type allocHealthWatcherHook struct {
    34  	healthSetter healthSetter
    35  
    36  	// consul client used to monitor Consul service health checks
    37  	consul serviceregistration.Handler
    38  
    39  	// checkStore is used to monitor Nomad service health checks
    40  	checkStore checkstore.Shim
    41  
    42  	// listener is given to trackers to listen for alloc updates and closed
    43  	// when the alloc is destroyed.
    44  	listener *cstructs.AllocListener
    45  
    46  	// hookLock is held by hook methods to prevent concurrent access by
    47  	// Update and synchronous hooks.
    48  	hookLock sync.Mutex
    49  
    50  	// watchDone is created before calling watchHealth and is closed when
    51  	// watchHealth exits. Must be passed into watchHealth to avoid races.
    52  	// Initialized already closed as Update may be called before Prerun.
    53  	watchDone chan struct{}
    54  
    55  	// ranOnce is set once Prerun or Update have run at least once. This
    56  	// prevents Prerun from running if an Update has already been
    57  	// processed. Must hold hookLock to access.
    58  	ranOnce bool
    59  
    60  	// cancelFn stops the health watching/setting goroutine. Wait on
    61  	// watchLock to block until the watcher exits.
    62  	cancelFn context.CancelFunc
    63  
    64  	// alloc set by new func or Update. Must hold hookLock to access.
    65  	alloc *structs.Allocation
    66  
    67  	// isDeploy is true if monitoring a deployment. Set in init(). Must
    68  	// hold hookLock to access.
    69  	isDeploy bool
    70  
    71  	logger hclog.Logger
    72  }
    73  
    74  func newAllocHealthWatcherHook(logger hclog.Logger, alloc *structs.Allocation, hs healthSetter,
    75  	listener *cstructs.AllocListener, consul serviceregistration.Handler, checkStore checkstore.Shim) interfaces.RunnerHook {
    76  
    77  	// Neither deployments nor migrations care about the health of
    78  	// non-service jobs so never watch their health
    79  	if alloc.Job.Type != structs.JobTypeService {
    80  		return noopAllocHealthWatcherHook{}
    81  	}
    82  
    83  	// Initialize watchDone with a closed chan in case Update runs before Prerun
    84  	closedDone := make(chan struct{})
    85  	close(closedDone)
    86  
    87  	h := &allocHealthWatcherHook{
    88  		alloc:        alloc,
    89  		cancelFn:     func() {}, // initialize to prevent nil func panics
    90  		watchDone:    closedDone,
    91  		consul:       consul,
    92  		checkStore:   checkStore,
    93  		healthSetter: hs,
    94  		listener:     listener,
    95  	}
    96  
    97  	h.logger = logger.Named(h.Name())
    98  	return h
    99  }
   100  
   101  func (h *allocHealthWatcherHook) Name() string {
   102  	return "alloc_health_watcher"
   103  }
   104  
   105  // init starts the allochealth.Tracker and watchHealth goroutine on either
   106  // Prerun or Update. Caller must set/update alloc and logger fields.
   107  //
   108  // Not threadsafe so the caller should lock since Updates occur concurrently.
   109  func (h *allocHealthWatcherHook) init() error {
   110  	// No need to watch health as it's already set
   111  	if h.healthSetter.HasHealth() {
   112  		h.logger.Trace("not watching; already has health set")
   113  		return nil
   114  	}
   115  
   116  	tg := h.alloc.Job.LookupTaskGroup(h.alloc.TaskGroup)
   117  	if tg == nil {
   118  		return fmt.Errorf("task group %q does not exist in job %q", h.alloc.TaskGroup, h.alloc.Job.ID)
   119  	}
   120  
   121  	h.isDeploy = h.alloc.DeploymentID != ""
   122  
   123  	// No need to watch allocs for deployments that rely on operators
   124  	// manually setting health
   125  	if h.isDeploy && (tg.Update.IsEmpty() || tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Manual) {
   126  		return nil
   127  	}
   128  
   129  	// Define the deadline, health method, min healthy time from the
   130  	// deployment if this is a deployment; otherwise from the migration
   131  	// strategy.
   132  	deadline, useChecks, minHealthyTime := getHealthParams(time.Now(), tg, h.isDeploy)
   133  
   134  	// Create a context that is canceled when the tracker should shutdown.
   135  	ctx := context.Background()
   136  	ctx, h.cancelFn = context.WithCancel(ctx)
   137  
   138  	h.logger.Trace("watching", "deadline", deadline, "checks", useChecks, "min_healthy_time", minHealthyTime)
   139  	// Create a new tracker, start it, and watch for health results.
   140  	tracker := allochealth.NewTracker(
   141  		ctx, h.logger, h.alloc, h.listener, h.consul, h.checkStore, minHealthyTime, useChecks,
   142  	)
   143  	tracker.Start()
   144  
   145  	// Create a new done chan and start watching for health updates
   146  	h.watchDone = make(chan struct{})
   147  	go h.watchHealth(ctx, deadline, tracker, h.watchDone)
   148  	return nil
   149  }
   150  
   151  func (h *allocHealthWatcherHook) Prerun() error {
   152  	h.hookLock.Lock()
   153  	defer h.hookLock.Unlock()
   154  
   155  	if h.ranOnce {
   156  		// An Update beat Prerun to running the watcher; noop
   157  		return nil
   158  	}
   159  
   160  	h.ranOnce = true
   161  	return h.init()
   162  }
   163  
   164  func (h *allocHealthWatcherHook) Update(req *interfaces.RunnerUpdateRequest) error {
   165  	h.hookLock.Lock()
   166  	defer h.hookLock.Unlock()
   167  
   168  	// Prevent Prerun from running after an Update
   169  	h.ranOnce = true
   170  
   171  	// Cancel the old watcher and create a new one
   172  	h.cancelFn()
   173  
   174  	// Wait until the watcher exits
   175  	<-h.watchDone
   176  
   177  	// Deployment has changed, reset status
   178  	if req.Alloc.DeploymentID != h.alloc.DeploymentID {
   179  		h.healthSetter.ClearHealth()
   180  	}
   181  
   182  	// Update alloc
   183  	h.alloc = req.Alloc
   184  
   185  	return h.init()
   186  }
   187  
   188  func (h *allocHealthWatcherHook) Postrun() error {
   189  	h.hookLock.Lock()
   190  	defer h.hookLock.Unlock()
   191  
   192  	h.cancelFn()
   193  	h.listener.Close()
   194  
   195  	// Wait until the watcher exits
   196  	<-h.watchDone
   197  
   198  	return nil
   199  }
   200  
   201  func (h *allocHealthWatcherHook) Shutdown() {
   202  	// Same as Postrun
   203  	_ = h.Postrun()
   204  }
   205  
   206  // watchHealth watches alloc health until it is set, the alloc is stopped, the
   207  // deadline is reached, or the context is canceled. watchHealth will be
   208  // canceled and restarted on Updates so calls are serialized with a lock.
   209  func (h *allocHealthWatcherHook) watchHealth(ctx context.Context, deadline time.Time, tracker *allochealth.Tracker, done chan<- struct{}) {
   210  	defer close(done)
   211  
   212  	// Default to unhealthy for the deadline reached case
   213  	healthy := false
   214  
   215  	select {
   216  	case <-ctx.Done():
   217  		// Graceful shutdown
   218  		return
   219  
   220  	case <-tracker.AllocStoppedCh():
   221  		// Allocation has stopped so no need to set health
   222  		return
   223  
   224  	case <-time.After(time.Until(deadline)):
   225  		// Time is up! Fallthrough to set unhealthy.
   226  		h.logger.Trace("deadline reached; setting unhealthy", "deadline", deadline)
   227  
   228  	case healthy = <-tracker.HealthyCh():
   229  		// Health received. Fallthrough to set it.
   230  	}
   231  
   232  	h.logger.Trace("health set", "healthy", healthy)
   233  
   234  	// If this is an unhealthy deployment emit events for tasks
   235  	var taskEvents map[string]*structs.TaskEvent
   236  	if !healthy && h.isDeploy {
   237  		taskEvents = tracker.TaskEvents()
   238  	}
   239  
   240  	h.healthSetter.SetHealth(healthy, h.isDeploy, taskEvents)
   241  }
   242  
   243  // getHealthParams returns the health watcher parameters which vary based on
   244  // whether this allocation is in a deployment or migration.
   245  func getHealthParams(now time.Time, tg *structs.TaskGroup, isDeploy bool) (deadline time.Time, useChecks bool, minHealthyTime time.Duration) {
   246  	if isDeploy {
   247  		deadline = now.Add(tg.Update.HealthyDeadline)
   248  		minHealthyTime = tg.Update.MinHealthyTime
   249  		useChecks = tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks
   250  	} else {
   251  		strategy := tg.Migrate
   252  		if strategy == nil {
   253  			// For backwards compat with pre-0.8 allocations that
   254  			// don't have a migrate strategy set.
   255  			strategy = structs.DefaultMigrateStrategy()
   256  		}
   257  
   258  		deadline = now.Add(strategy.HealthyDeadline)
   259  		minHealthyTime = strategy.MinHealthyTime
   260  		useChecks = strategy.HealthCheck == structs.MigrateStrategyHealthChecks
   261  	}
   262  	return
   263  }
   264  
   265  // noopAllocHealthWatcherHook is an empty hook implementation returned by
   266  // newAllocHealthWatcherHook when an allocation will never need its health
   267  // monitored.
   268  type noopAllocHealthWatcherHook struct{}
   269  
   270  func (noopAllocHealthWatcherHook) Name() string {
   271  	return "alloc_health_watcher"
   272  }