github.com/djenriquez/nomad-1@v0.8.1/client/alloc_runner_health_watcher.go

     1  package client
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"log"
     7  	"strings"
     8  	"sync"
     9  	"time"
    10  
    11  	"github.com/hashicorp/consul/api"
    12  	cstructs "github.com/hashicorp/nomad/client/structs"
    13  	"github.com/hashicorp/nomad/command/agent/consul"
    14  	"github.com/hashicorp/nomad/helper"
    15  	"github.com/hashicorp/nomad/nomad/structs"
    16  )
    17  
    18  const (
    19  	// consulCheckLookupInterval is the interval at which we check if the
    20  	// Consul checks are healthy or unhealthy.
    21  	consulCheckLookupInterval = 500 * time.Millisecond
    22  
    23  	// allocHealthEventSource is the source used for emitting task events
    24  	allocHealthEventSource = "Alloc Unhealthy"
    25  )
    26  
    27  // watchHealth is responsible for watching an allocation's task status and
    28  // potentially Consul health check status to determine if the allocation is
    29  // healthy or unhealthy.
    30  func (r *AllocRunner) watchHealth(ctx context.Context) {
    31  
    32  	// See if we should watch the alloc's health
    33  	alloc := r.Alloc()
    34  
    35  	// Neither deployments nor migrations care about the health of
    36  	// non-service jobs so never watch their health
    37  	if alloc.Job.Type != structs.JobTypeService {
    38  		return
    39  	}
    40  
    41  	// No need to watch health as it's already set
    42  	if alloc.DeploymentStatus.HasHealth() {
    43  		return
    44  	}
    45  
    46  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
    47  	if tg == nil {
    48  		r.logger.Printf("[ERR] client.alloc_watcher: failed to lookup allocation %q task group %q. Exiting watcher",
    49  			alloc.ID, alloc.TaskGroup)
    50  		return
    51  	}
    52  
    53  	isDeploy := alloc.DeploymentID != ""
    54  
    55  	// No need to watch allocs for deployments that rely on operators
    56  	// manually setting health
    57  	if isDeploy && (tg.Update == nil || tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Manual) {
    58  		return
    59  	}
    60  
    61  	// Get an allocation listener to watch for alloc events
    62  	l := r.allocBroadcast.Listen()
    63  	defer l.Close()
    64  
    65  	// Determine the deadline, whether to use checks, and the minimum healthy
    66  	// time from the deployment's update strategy if this is a deployment;
    67  	// otherwise from the migration strategy.
    68  	var deadline time.Time
    69  	var useChecks bool
    70  	var minHealthyTime time.Duration
    71  
    72  	if isDeploy {
    73  		deadline = time.Now().Add(tg.Update.HealthyDeadline)
    74  		minHealthyTime = tg.Update.MinHealthyTime
    75  		useChecks = tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks
    76  	} else {
    77  		strategy := tg.Migrate
    78  		if strategy == nil {
    79  			// For backwards compat with pre-0.8 allocations that
    80  			// don't have a migrate strategy set.
    81  			strategy = structs.DefaultMigrateStrategy()
    82  		}
    83  		deadline = time.Now().Add(strategy.HealthyDeadline)
    84  		minHealthyTime = strategy.MinHealthyTime
    85  		useChecks = strategy.HealthCheck == structs.MigrateStrategyHealthChecks
    86  	}
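        	// For example, with a minimum healthy time of 10 seconds and a healthy
        	// deadline of 5 minutes, every task in the group (and, when useChecks is
        	// set, every Consul check) must be healthy for 10 consecutive seconds at
        	// some point inside the 5 minute window; otherwise the deadline context
        	// created below expires and the allocation is reported unhealthy.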
    87  
    88  	// Create a new context with the health deadline
    89  	healthCtx, healthCtxCancel := context.WithDeadline(ctx, deadline)
    90  	defer healthCtxCancel()
    91  	r.logger.Printf("[DEBUG] client.alloc_watcher: deadline for alloc %q is at %v (deploy=%t checks=%t)", alloc.ID, deadline, isDeploy, useChecks)
    92  
    93  	// Create the health tracker object
    94  	tracker := newAllocHealthTracker(healthCtx, r.logger, alloc, l, r.consulClient, minHealthyTime, useChecks)
    95  	tracker.Start()
    96  
    97  	allocHealthy := false
    98  	select {
    99  	case <-healthCtx.Done():
   100  		// We were cancelled which means we are no longer needed
   101  		if healthCtx.Err() == context.Canceled {
   102  			return
   103  		}
   104  
   105  		// Since the deadline has been reached we are not healthy
   106  	case <-tracker.AllocStoppedCh():
   107  		// The allocation was stopped so nothing to do
   108  		return
   109  	case healthy := <-tracker.HealthyCh():
   110  		allocHealthy = healthy
   111  	}
   112  
   113  	r.allocLock.Lock()
   114  	r.allocHealth = helper.BoolToPtr(allocHealthy)
   115  	r.allocLock.Unlock()
   116  
   117  	// If the deployment is unhealthy, emit task events explaining why
   118  	if !allocHealthy && isDeploy {
   119  		r.taskLock.RLock()
   120  		for task, event := range tracker.TaskEvents() {
   121  			if tr, ok := r.tasks[task]; ok {
   122  				tr.EmitEvent(allocHealthEventSource, event)
   123  			}
   124  		}
   125  		r.taskLock.RUnlock()
   126  	}
   127  
   128  	r.syncStatus()
   129  }
   130  
   131  // allocHealthTracker tracks the health of an allocation and makes health events
   132  // watchable via channels.
   133  type allocHealthTracker struct {
   134  	// logger is used to log
   135  	logger *log.Logger
   136  
   137  	// ctx and cancelFn are used to shut down the tracker
   138  	ctx      context.Context
   139  	cancelFn context.CancelFunc
   140  
   141  	// alloc is the alloc we are tracking
   142  	alloc *structs.Allocation
   143  
   144  	// tg is the task group we are tracking
   145  	tg *structs.TaskGroup
   146  
   147  	// minHealthyTime is the duration an alloc must remain healthy to be
   148  	// considered healthy
   149  	minHealthyTime time.Duration
   150  
   151  	// useChecks specifies whether to use Consul health checks or not
   152  	useChecks bool
   153  
   154  	// consulCheckCount is the number of checks the task group will attempt to
   155  	// register
   156  	consulCheckCount int
   157  
   158  	// allocUpdates is a listener for retrieving new alloc updates
   159  	allocUpdates *cstructs.AllocListener
   160  
   161  	// consulClient is used to look up the state of the task's checks
   162  	consulClient ConsulServiceAPI
   163  
   164  	// healthy is used to signal whether we have determined the allocation to be
   165  	// healthy or unhealthy
   166  	healthy chan bool
   167  
   168  	// allocStopped is closed when the allocation is stopped and tracking is
   169  	// not needed
   170  	allocStopped chan struct{}
   171  
   172  	// l is used to lock shared fields listed below
   173  	l sync.Mutex
   174  
   175  	// tasksHealthy marks whether all the tasks have met their minimum healthy
   176  	// time requirement (disregards Consul checks)
   177  	tasksHealthy bool
   178  
   179  	// allocFailed marks whether the allocation failed
   180  	allocFailed bool
   181  
   182  	// checksHealthy marks whether all the tasks' Consul checks are healthy
   183  	checksHealthy bool
   184  
   185  	// taskHealth contains the health state for each task
   186  	taskHealth map[string]*taskHealthState
   187  }
   188  
   189  // newAllocHealthTracker returns a health tracker for the given allocation. An
   190  // alloc listener and consul API object are given so that the watcher can detect
   191  // health changes.
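        //
        // A minimal illustrative sketch of the intended usage, mirroring watchHealth
        // above (ctx is assumed to carry the healthy deadline and listener to be an
        // alloc broadcast listener):
        //
        //	t := newAllocHealthTracker(ctx, logger, alloc, listener, consulClient, minHealthyTime, useChecks)
        //	t.Start()
        //	select {
        //	case <-t.AllocStoppedCh():
        //		// alloc stopped; health will not be set
        //	case healthy := <-t.HealthyCh():
        //		// record the reported health
        //	case <-ctx.Done():
        //		// healthy deadline reached (unhealthy), or the watcher was cancelled
        //	}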
   192  func newAllocHealthTracker(parentCtx context.Context, logger *log.Logger, alloc *structs.Allocation,
   193  	allocUpdates *cstructs.AllocListener, consulClient ConsulServiceAPI,
   194  	minHealthyTime time.Duration, useChecks bool) *allocHealthTracker {
   195  
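        	// Note that the healthy channel is buffered (size one) so that
        	// setTaskHealth and setCheckHealth can record a result without blocking
        	// even if the watcher has already stopped receiving.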
   196  	a := &allocHealthTracker{
   197  		logger:         logger,
   198  		healthy:        make(chan bool, 1),
   199  		allocStopped:   make(chan struct{}),
   200  		alloc:          alloc,
   201  		tg:             alloc.Job.LookupTaskGroup(alloc.TaskGroup),
   202  		minHealthyTime: minHealthyTime,
   203  		useChecks:      useChecks,
   204  		allocUpdates:   allocUpdates,
   205  		consulClient:   consulClient,
   206  	}
   207  
   208  	a.taskHealth = make(map[string]*taskHealthState, len(a.tg.Tasks))
   209  	for _, task := range a.tg.Tasks {
   210  		a.taskHealth[task.Name] = &taskHealthState{task: task}
   211  	}
   212  
   213  	for _, task := range a.tg.Tasks {
   214  		for _, s := range task.Services {
   215  			a.consulCheckCount += len(s.Checks)
   216  		}
   217  	}
   218  
   219  	a.ctx, a.cancelFn = context.WithCancel(parentCtx)
   220  	return a
   221  }
   222  
   223  // Start starts the watcher.
   224  func (a *allocHealthTracker) Start() {
   225  	go a.watchTaskEvents()
   226  	if a.useChecks {
   227  		go a.watchConsulEvents()
   228  	}
   229  }
   230  
   231  // HealthyCh returns a channel that will emit a boolean indicating the health of
   232  // the allocation.
   233  func (a *allocHealthTracker) HealthyCh() <-chan bool {
   234  	return a.healthy
   235  }
   236  
   237  // AllocStoppedCh returns a channel that is closed if the allocation is
   238  // stopped. This means that health will not be set.
   239  func (a *allocHealthTracker) AllocStoppedCh() <-chan struct{} {
   240  	return a.allocStopped
   241  }
   242  
   243  // TaskEvents returns a map of events by task. This should only be called after
   244  // health has been determined. Only tasks that have contributed to the
   245  // allocation being unhealthy will have an event.
   246  func (a *allocHealthTracker) TaskEvents() map[string]string {
   247  	a.l.Lock()
   248  	defer a.l.Unlock()
   249  
   250  	// Nothing to do since the failure wasn't task related
   251  	if a.allocFailed {
   252  		return nil
   253  	}
   254  
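        	// The deadline below is the healthy deadline carried by the tracker's
        	// context. TaskEvents is only reached for deployment allocations (see
        	// watchHealth), so tg.Update is non-nil here.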
   255  	deadline, _ := a.ctx.Deadline()
   256  	events := make(map[string]string, len(a.tg.Tasks))
   257  
   258  	// Go through our task information and build the event map
   259  	for task, state := range a.taskHealth {
   260  		useChecks := a.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks
   261  		if e, ok := state.event(deadline, a.tg.Update.MinHealthyTime, useChecks); ok {
   262  			events[task] = e
   263  		}
   264  	}
   265  
   266  	return events
   267  }
   268  
   269  // setTaskHealth is used to set the tasks' health as healthy or unhealthy. If the
   270  // allocation is terminal, health is immediately broadcast.
   271  func (a *allocHealthTracker) setTaskHealth(healthy, terminal bool) {
   272  	a.l.Lock()
   273  	defer a.l.Unlock()
   274  	a.tasksHealthy = healthy
   275  
   276  	// If we are marked healthy but we also require Consul to be healthy and it
   277  	// isn't yet, return, unless the task is terminal
   278  	requireConsul := a.useChecks && a.consulCheckCount > 0
   279  	if !terminal && healthy && requireConsul && !a.checksHealthy {
   280  		return
   281  	}
   282  
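        	// Non-blocking send: the healthy channel is buffered and the receiver
        	// may already have stopped listening.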
   283  	select {
   284  	case a.healthy <- healthy:
   285  	default:
   286  	}
   287  
   288  	// Shutdown the tracker
   289  	a.cancelFn()
   290  }
   291  
   292  // setCheckHealth is used to mark the checks as either healthy or unhealthy.
   293  func (a *allocHealthTracker) setCheckHealth(healthy bool) {
   294  	a.l.Lock()
   295  	defer a.l.Unlock()
   296  	a.checksHealthy = healthy
   297  
   298  	// Only signal if the checks are healthy and so are the tasks
   299  	if !healthy || !a.tasksHealthy {
   300  		return
   301  	}
   302  
   303  	select {
   304  	case a.healthy <- healthy:
   305  	default:
   306  	}
   307  
   308  	// Shutdown the tracker
   309  	a.cancelFn()
   310  }
   311  
   312  // markAllocStopped is used to mark the allocation as having stopped.
   313  func (a *allocHealthTracker) markAllocStopped() {
   314  	close(a.allocStopped)
   315  	a.cancelFn()
   316  }
   317  
   318  // watchTaskEvents is a long lived watcher that watches for the health of the
   319  // allocation's tasks.
   320  func (a *allocHealthTracker) watchTaskEvents() {
   321  	alloc := a.alloc
   322  	allStartedTime := time.Time{}
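        	// Create the healthy timer stopped and drained so that it only fires
        	// once it has been reset after every task is running.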
   323  	healthyTimer := time.NewTimer(0)
   324  	if !healthyTimer.Stop() {
   325  		select {
   326  		case <-healthyTimer.C:
   327  		default:
   328  		}
   329  	}
   330  
   331  	for {
   332  		// If the alloc is being stopped by the server just exit
   333  		switch alloc.DesiredStatus {
   334  		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
   335  			a.logger.Printf("[TRACE] client.alloc_watcher: desired status terminal for alloc %q", alloc.ID)
   336  			a.markAllocStopped()
   337  			return
   338  		}
   339  
   340  		// Store the task states
   341  		a.l.Lock()
   342  		for task, state := range alloc.TaskStates {
   343  			a.taskHealth[task].state = state
   344  		}
   345  		a.l.Unlock()
   346  
   347  		// Detect whether any task has failed and whether all tasks have started
   348  		latestStartTime := time.Time{}
   349  		for _, state := range alloc.TaskStates {
   350  			// One of the tasks has failed so we can exit watching
   351  			if state.Failed || !state.FinishedAt.IsZero() {
   352  				a.setTaskHealth(false, true)
   353  				return
   354  			}
   355  
   356  			if state.State != structs.TaskStateRunning {
   357  				latestStartTime = time.Time{}
   358  				break
   359  			} else if state.StartedAt.After(latestStartTime) {
   360  				latestStartTime = state.StartedAt
   361  			}
   362  		}
   363  
   364  		// If the alloc is marked as failed by the client but none of the
   365  		// individual tasks failed, that means something failed at the alloc
   366  		// level.
   367  		if alloc.ClientStatus == structs.AllocClientStatusFailed {
   368  			a.logger.Printf("[TRACE] client.alloc_watcher: client status failed for alloc %q", alloc.ID)
   369  			a.l.Lock()
   370  			a.allocFailed = true
   371  			a.l.Unlock()
   372  			a.setTaskHealth(false, true)
   373  			return
   374  		}
   375  
   376  		if !latestStartTime.Equal(allStartedTime) {
   377  			// Prevent the timer from firing based on the old start time
   378  			if !healthyTimer.Stop() {
   379  				select {
   380  				case <-healthyTimer.C:
   381  				default:
   382  				}
   383  			}
   384  
   385  			// Set the timer since all tasks are started
   386  			if !latestStartTime.IsZero() {
   387  				allStartedTime = latestStartTime
   388  				healthyTimer.Reset(a.minHealthyTime)
   389  			}
   390  		}
   391  
   392  		select {
   393  		case <-a.ctx.Done():
   394  			return
   395  		case newAlloc, ok := <-a.allocUpdates.Ch:
   396  			if !ok {
   397  				return
   398  			}
   399  			alloc = newAlloc
   400  		case <-healthyTimer.C:
   401  			a.setTaskHealth(true, false)
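        			// Note that we do not return here: when Consul checks are also
        			// required, the tracker is only cancelled once setCheckHealth
        			// reports them healthy as well.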
   402  		}
   403  	}
   404  }
   405  
   406  // watchConsulEvents is a long lived watcher that watches for the health of the
   407  // allocation's Consul checks.
   408  func (a *allocHealthTracker) watchConsulEvents() {
   409  	// checkTicker is the ticker that triggers us to look at the checks in
   410  	// Consul
   411  	checkTicker := time.NewTicker(consulCheckLookupInterval)
   412  	defer checkTicker.Stop()
   413  
   414  	// healthyTimer fires when the checks have been healthy for the
   415  	// MinHealthyTime
   416  	healthyTimer := time.NewTimer(0)
   417  	if !healthyTimer.Stop() {
   418  		select {
   419  		case <-healthyTimer.C:
   420  		default:
   421  		}
   422  	}
   423  
   424  	// primed marks whether the healthy timer has been set
   425  	primed := false
   426  
   427  	// consulChecksErr marks whether the last Consul checks lookup failed
   428  	consulChecksErr := false
   429  
   430  	// allocReg holds the latest Consul registrations for the allocation
   431  	var allocReg *consul.AllocRegistration
   432  
   433  OUTER:
   434  	for {
   435  		select {
   436  		case <-a.ctx.Done():
   437  			return
   438  		case <-checkTicker.C:
   439  			newAllocReg, err := a.consulClient.AllocRegistrations(a.alloc.ID)
   440  			if err != nil {
   441  				if !consulChecksErr {
   442  					consulChecksErr = true
   443  					a.logger.Printf("[WARN] client.alloc_watcher: failed to lookup Consul registrations for allocation %q: %v", a.alloc.ID, err)
   444  				}
   445  				continue OUTER
   446  			} else {
   447  				consulChecksErr = false
   448  				allocReg = newAllocReg
   449  			}
   450  		case <-healthyTimer.C:
   451  			a.setCheckHealth(true)
   452  		}
   453  
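        		// allocReg stays nil until the first successful Consul lookup, so
        		// there may be nothing to evaluate yet.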
   454  		if allocReg == nil {
   455  			continue
   456  		}
   457  
   458  		// Store the task registrations
   459  		a.l.Lock()
   460  		for task, reg := range allocReg.Tasks {
   461  			a.taskHealth[task].taskRegistrations = reg
   462  		}
   463  		a.l.Unlock()
   464  
   465  		// Detect if all the checks are passing
   466  		passed := true
   467  
   468  	CHECKS:
   469  		for _, treg := range allocReg.Tasks {
   470  			for _, sreg := range treg.Services {
   471  				for _, check := range sreg.Checks {
   472  					if check.Status == api.HealthPassing {
   473  						continue
   474  					}
   475  
   476  					passed = false
   477  					a.setCheckHealth(false)
   478  					break CHECKS
   479  				}
   480  			}
   481  		}
   482  
   483  		if !passed {
   484  			// Stop the timer since we have transitioned back to unhealthy
   485  			if primed {
   486  				if !healthyTimer.Stop() {
   487  					select {
   488  					case <-healthyTimer.C:
   489  					default:
   490  					}
   491  				}
   492  				primed = false
   493  			}
   494  		} else if !primed {
   495  			// Reset the timer to fire after MinHealthyTime
   496  			if !healthyTimer.Stop() {
   497  				select {
   498  				case <-healthyTimer.C:
   499  				default:
   500  				}
   501  			}
   502  
   503  			primed = true
   504  			healthyTimer.Reset(a.minHealthyTime)
   505  		}
   506  	}
   507  }
   508  
   509  // taskHealthState captures all known health information about a task. It is
   510  // largely used to determine if the task has contributed to the allocation being
   511  // unhealthy.
   512  type taskHealthState struct {
   513  	task              *structs.Task
   514  	state             *structs.TaskState
   515  	taskRegistrations *consul.TaskRegistration
   516  }
   517  
   518  // event takes the deadline by which the allocation must be healthy, the
   519  // group's minimum healthy time, and whether Consul checks are used. It returns
   520  // a description and true if the task has contributed to the allocation being unhealthy.
   521  func (t *taskHealthState) event(deadline time.Time, minHealthyTime time.Duration, useChecks bool) (string, bool) {
   522  	requireChecks := false
   523  	desiredChecks := 0
   524  	for _, s := range t.task.Services {
   525  		if nc := len(s.Checks); nc > 0 {
   526  			requireChecks = true
   527  			desiredChecks += nc
   528  		}
   529  	}
   530  	requireChecks = requireChecks && useChecks
   531  
   532  	if t.state != nil {
   533  		if t.state.Failed {
   534  			return "Unhealthy because of failed task", true
   535  		}
   536  		if t.state.State != structs.TaskStateRunning {
   537  			return "Task not running by deadline", true
   538  		}
   539  
   540  		// The task is running; check whether it started too late to meet min_healthy_time by the deadline
   541  		if t.state.StartedAt.Add(minHealthyTime).After(deadline) {
   542  			return fmt.Sprintf("Task not running for min_healthy_time of %v by deadline", minHealthyTime), true
   543  		}
   544  	}
   545  
   546  	if t.taskRegistrations != nil {
   547  		var notPassing []string
   548  		passing := 0
   549  
   550  	OUTER:
   551  		for _, sreg := range t.taskRegistrations.Services {
   552  			for _, check := range sreg.Checks {
   553  				if check.Status != api.HealthPassing {
   554  					notPassing = append(notPassing, sreg.Service.Service)
   555  					continue OUTER
   556  				} else {
   557  					passing++
   558  				}
   559  			}
   560  		}
   561  
   562  		if len(notPassing) != 0 {
   563  			return fmt.Sprintf("Services not healthy by deadline: %s", strings.Join(notPassing, ", ")), true
   564  		}
   565  
   566  		if passing != desiredChecks {
   567  			return fmt.Sprintf("Only %d out of %d checks registered and passing", passing, desiredChecks), true
   568  		}
   569  
   570  	} else if requireChecks {
   571  		return "Service checks not registered", true
   572  	}
   573  
   574  	return "", false
   575  }