github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/client/allocrunner/health_hook.go (about)

     1  package allocrunner
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"sync"
     7  	"time"
     8  
     9  	log "github.com/hashicorp/go-hclog"
    10  	"github.com/hashicorp/nomad/client/allochealth"
    11  	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
    12  	"github.com/hashicorp/nomad/client/consul"
    13  	cstructs "github.com/hashicorp/nomad/client/structs"
    14  	"github.com/hashicorp/nomad/nomad/structs"
    15  )
    16  
// healthSetter is able to set/clear alloc health.
type healthSetter interface {
	// HasHealth returns true if health is already set.
	HasHealth() bool

	// SetHealth records the alloc's health. isDeploy reports whether the
	// health is being set on behalf of a deployment. taskEvents may be nil;
	// within this file it is only populated for unhealthy deployments.
	SetHealth(healthy, isDeploy bool, taskEvents map[string]*structs.TaskEvent)

	// ClearHealth clears health when the deployment ID changes.
	ClearHealth()
}
    28  
// allocHealthWatcherHook is responsible for watching an allocation's task
// status and (optionally) Consul health check status to determine if the
// allocation is healthy or unhealthy. Used by deployments and migrations.
type allocHealthWatcherHook struct {
	// healthSetter is used to query, set, and clear the alloc's health.
	healthSetter healthSetter

	// consul client used to monitor health checks
	consul consul.ConsulServiceAPI

	// listener is given to trackers to listen for alloc updates and closed
	// when the alloc is destroyed.
	listener *cstructs.AllocListener

	// hookLock is held by hook methods to prevent concurrent access by
	// Update and synchronous hooks.
	hookLock sync.Mutex

	// watchDone is created before calling watchHealth and is closed when
	// watchHealth exits. Must be passed into watchHealth to avoid races.
	// Initialized already closed as Update may be called before Prerun.
	watchDone chan struct{}

	// ranOnce is set once Prerun or Update have run at least once. This
	// prevents Prerun from running if an Update has already been
	// processed. Must hold hookLock to access.
	ranOnce bool

	// cancelFn stops the health watching/setting goroutine. Wait on
	// watchDone to block until the watcher exits.
	cancelFn context.CancelFunc

	// alloc set by new func or Update. Must hold hookLock to access.
	alloc *structs.Allocation

	// isDeploy is true if monitoring a deployment. Set in init(). Must
	// hold hookLock to access.
	isDeploy bool

	// logger is the hook's logger, named after the hook by the constructor.
	logger log.Logger
}
    69  
    70  func newAllocHealthWatcherHook(logger log.Logger, alloc *structs.Allocation, hs healthSetter,
    71  	listener *cstructs.AllocListener, consul consul.ConsulServiceAPI) interfaces.RunnerHook {
    72  
    73  	// Neither deployments nor migrations care about the health of
    74  	// non-service jobs so never watch their health
    75  	if alloc.Job.Type != structs.JobTypeService {
    76  		return noopAllocHealthWatcherHook{}
    77  	}
    78  
    79  	// Initialize watchDone with a closed chan in case Update runs before Prerun
    80  	closedDone := make(chan struct{})
    81  	close(closedDone)
    82  
    83  	h := &allocHealthWatcherHook{
    84  		alloc:        alloc,
    85  		cancelFn:     func() {}, // initialize to prevent nil func panics
    86  		watchDone:    closedDone,
    87  		consul:       consul,
    88  		healthSetter: hs,
    89  		listener:     listener,
    90  	}
    91  
    92  	h.logger = logger.Named(h.Name())
    93  	return h
    94  }
    95  
    96  func (h *allocHealthWatcherHook) Name() string {
    97  	return "alloc_health_watcher"
    98  }
    99  
   100  // init starts the allochealth.Tracker and watchHealth goroutine on either
   101  // Prerun or Update. Caller must set/update alloc and logger fields.
   102  //
   103  // Not threadsafe so the caller should lock since Updates occur concurrently.
   104  func (h *allocHealthWatcherHook) init() error {
   105  	// No need to watch health as it's already set
   106  	if h.healthSetter.HasHealth() {
   107  		h.logger.Trace("not watching; already has health set")
   108  		return nil
   109  	}
   110  
   111  	tg := h.alloc.Job.LookupTaskGroup(h.alloc.TaskGroup)
   112  	if tg == nil {
   113  		return fmt.Errorf("task group %q does not exist in job %q", h.alloc.TaskGroup, h.alloc.Job.ID)
   114  	}
   115  
   116  	h.isDeploy = h.alloc.DeploymentID != ""
   117  
   118  	// No need to watch allocs for deployments that rely on operators
   119  	// manually setting health
   120  	if h.isDeploy && (tg.Update.IsEmpty() || tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Manual) {
   121  		return nil
   122  	}
   123  
   124  	// Define the deadline, health method, min healthy time from the
   125  	// deployment if this is a deployment; otherwise from the migration
   126  	// strategy.
   127  	deadline, useChecks, minHealthyTime := getHealthParams(time.Now(), tg, h.isDeploy)
   128  
   129  	// Create a context that is canceled when the tracker should shutdown.
   130  	ctx := context.Background()
   131  	ctx, h.cancelFn = context.WithCancel(ctx)
   132  
   133  	h.logger.Trace("watching", "deadline", deadline, "checks", useChecks, "min_healthy_time", minHealthyTime)
   134  	// Create a new tracker, start it, and watch for health results.
   135  	tracker := allochealth.NewTracker(ctx, h.logger, h.alloc,
   136  		h.listener, h.consul, minHealthyTime, useChecks)
   137  	tracker.Start()
   138  
   139  	// Create a new done chan and start watching for health updates
   140  	h.watchDone = make(chan struct{})
   141  	go h.watchHealth(ctx, deadline, tracker, h.watchDone)
   142  	return nil
   143  }
   144  
   145  func (h *allocHealthWatcherHook) Prerun() error {
   146  	h.hookLock.Lock()
   147  	defer h.hookLock.Unlock()
   148  
   149  	if h.ranOnce {
   150  		// An Update beat Prerun to running the watcher; noop
   151  		return nil
   152  	}
   153  
   154  	h.ranOnce = true
   155  	return h.init()
   156  }
   157  
   158  func (h *allocHealthWatcherHook) Update(req *interfaces.RunnerUpdateRequest) error {
   159  	h.hookLock.Lock()
   160  	defer h.hookLock.Unlock()
   161  
   162  	// Prevent Prerun from running after an Update
   163  	h.ranOnce = true
   164  
   165  	// Cancel the old watcher and create a new one
   166  	h.cancelFn()
   167  
   168  	// Wait until the watcher exits
   169  	<-h.watchDone
   170  
   171  	// Deployment has changed, reset status
   172  	if req.Alloc.DeploymentID != h.alloc.DeploymentID {
   173  		h.healthSetter.ClearHealth()
   174  	}
   175  
   176  	// Update alloc
   177  	h.alloc = req.Alloc
   178  
   179  	return h.init()
   180  }
   181  
   182  func (h *allocHealthWatcherHook) Postrun() error {
   183  	h.hookLock.Lock()
   184  	defer h.hookLock.Unlock()
   185  
   186  	h.cancelFn()
   187  	h.listener.Close()
   188  
   189  	// Wait until the watcher exits
   190  	<-h.watchDone
   191  
   192  	return nil
   193  }
   194  
   195  func (h *allocHealthWatcherHook) Shutdown() {
   196  	// Same as Postrun
   197  	h.Postrun()
   198  }
   199  
   200  // watchHealth watches alloc health until it is set, the alloc is stopped, the
   201  // deadline is reached, or the context is canceled. watchHealth will be
   202  // canceled and restarted on Updates so calls are serialized with a lock.
   203  func (h *allocHealthWatcherHook) watchHealth(ctx context.Context, deadline time.Time, tracker *allochealth.Tracker, done chan<- struct{}) {
   204  	defer close(done)
   205  
   206  	// Default to unhealthy for the deadline reached case
   207  	healthy := false
   208  
   209  	select {
   210  	case <-ctx.Done():
   211  		// Graceful shutdown
   212  		return
   213  
   214  	case <-tracker.AllocStoppedCh():
   215  		// Allocation has stopped so no need to set health
   216  		return
   217  
   218  	case <-time.After(deadline.Sub(time.Now())):
   219  		// Time is up! Fallthrough to set unhealthy.
   220  		h.logger.Trace("deadline reached; setting unhealthy", "deadline", deadline)
   221  
   222  	case healthy = <-tracker.HealthyCh():
   223  		// Health received. Fallthrough to set it.
   224  	}
   225  
   226  	h.logger.Trace("health set", "healthy", healthy)
   227  
   228  	// If this is an unhealthy deployment emit events for tasks
   229  	var taskEvents map[string]*structs.TaskEvent
   230  	if !healthy && h.isDeploy {
   231  		taskEvents = tracker.TaskEvents()
   232  	}
   233  
   234  	h.healthSetter.SetHealth(healthy, h.isDeploy, taskEvents)
   235  }
   236  
   237  // getHealthParams returns the health watcher parameters which vary based on
   238  // whether this allocation is in a deployment or migration.
   239  func getHealthParams(now time.Time, tg *structs.TaskGroup, isDeploy bool) (deadline time.Time, useChecks bool, minHealthyTime time.Duration) {
   240  	if isDeploy {
   241  		deadline = now.Add(tg.Update.HealthyDeadline)
   242  		minHealthyTime = tg.Update.MinHealthyTime
   243  		useChecks = tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks
   244  	} else {
   245  		strategy := tg.Migrate
   246  		if strategy == nil {
   247  			// For backwards compat with pre-0.8 allocations that
   248  			// don't have a migrate strategy set.
   249  			strategy = structs.DefaultMigrateStrategy()
   250  		}
   251  
   252  		deadline = now.Add(strategy.HealthyDeadline)
   253  		minHealthyTime = strategy.MinHealthyTime
   254  		useChecks = strategy.HealthCheck == structs.MigrateStrategyHealthChecks
   255  	}
   256  	return
   257  }
   258  
   259  // noopAllocHealthWatcherHook is an empty hook implementation returned by
   260  // newAllocHealthWatcherHook when an allocation will never need its health
   261  // monitored.
   262  type noopAllocHealthWatcherHook struct{}
   263  
   264  func (noopAllocHealthWatcherHook) Name() string {
   265  	return "alloc_health_watcher"
   266  }