github.com/blixtra/nomad@v0.7.2-0.20171221000451-da9a1d7bb050/client/alloc_runner_health_watcher.go

     1  package client
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"log"
     7  	"strings"
     8  	"sync"
     9  	"time"
    10  
    11  	"github.com/hashicorp/consul/api"
    12  	cstructs "github.com/hashicorp/nomad/client/structs"
    13  	"github.com/hashicorp/nomad/command/agent/consul"
    14  	"github.com/hashicorp/nomad/helper"
    15  	"github.com/hashicorp/nomad/nomad/structs"
    16  )
    17  
    18  const (
    19  	// consulCheckLookupInterval is the interval at which we check if the
    20  	// Consul checks are healthy or unhealthy.
    21  	consulCheckLookupInterval = 500 * time.Millisecond
    22  
    23  	// allocHealthEventSource is the source used for emitting task events
    24  	allocHealthEventSource = "Alloc Unhealthy"
    25  )
    26  
    27  // watchHealth is responsible for watching an allocation's task status and
    28  // potentially Consul health check status to determine if the allocation is
    29  // healthy or unhealthy.
    30  func (r *AllocRunner) watchHealth(ctx context.Context) {
    31  
    32  	// See if we should watch the alloc's health
    33  	alloc := r.Alloc()
    34  	if alloc.DeploymentID == "" || alloc.DeploymentStatus.IsHealthy() || alloc.DeploymentStatus.IsUnhealthy() {
    35  		return
    36  	}
    37  
    38  	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
    39  	if tg == nil {
    40  		r.logger.Printf("[ERR] client.alloc_watcher: failed to lookup allocation's task group. Exiting watcher")
    41  		return
    42  	} else if tg.Update == nil || tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Manual {
    43  		return
    44  	}
    45  
    46  	// Get an allocation listener to watch for alloc events
    47  	l := r.allocBroadcast.Listen()
    48  	defer l.Close()
    49  
    50  	// Create a new context with the health deadline
    51  	deadline := time.Now().Add(tg.Update.HealthyDeadline)
    52  	healthCtx, healthCtxCancel := context.WithDeadline(ctx, deadline)
    53  	defer healthCtxCancel()
    54  	r.logger.Printf("[DEBUG] client.alloc_watcher: deadline (%v) for alloc %q is at %v", tg.Update.HealthyDeadline, alloc.ID, deadline)
    55  
    56  	// Create the health tracker object
    57  	tracker := newAllocHealthTracker(healthCtx, r.logger, alloc, l, r.consulClient)
    58  	tracker.Start()
    59  
    60  	allocHealthy := false
    61  	select {
    62  	case <-healthCtx.Done():
    63  		// We were cancelled which means we are no longer needed
    64  		if healthCtx.Err() == context.Canceled {
    65  			return
    66  		}
    67  
    68  		// Since the deadline has been reached we are not healthy
    69  	case <-tracker.AllocStoppedCh():
    70  		// The allocation was stopped so nothing to do
    71  		return
    72  	case healthy := <-tracker.HealthyCh():
    73  		allocHealthy = healthy
    74  	}
    75  
    76  	r.allocLock.Lock()
    77  	r.allocHealth = helper.BoolToPtr(allocHealthy)
    78  	r.allocLock.Unlock()
    79  
    80  	// We are unhealthy so emit task events explaining why
    81  	if !allocHealthy {
    82  		r.taskLock.RLock()
    83  		for task, event := range tracker.TaskEvents() {
    84  			if tr, ok := r.tasks[task]; ok {
    85  				tr.EmitEvent(allocHealthEventSource, event)
    86  			}
    87  		}
    88  		r.taskLock.RUnlock()
    89  	}
    90  
    91  	r.syncStatus()
    92  }
    93  
    94  // allocHealthTracker tracks the health of an allocation and makes health events
    95  // watchable via channels.
    96  type allocHealthTracker struct {
    97  	// logger is used to log
    98  	logger *log.Logger
    99  
   100  	// ctx and cancelFn are used to shut down the tracker
   101  	ctx      context.Context
   102  	cancelFn context.CancelFunc
   103  
   104  	// alloc is the alloc we are tracking
   105  	alloc *structs.Allocation
   106  
   107  	// tg is the task group we are tracking
   108  	tg *structs.TaskGroup
   109  
   110  	// consulCheckCount is the number of checks the task group will attempt to
   111  	// register
   112  	consulCheckCount int
   113  
   114  	// allocUpdates is a listener for retrieving new alloc updates
   115  	allocUpdates *cstructs.AllocListener
   116  
   117  	// consulClient is used to look up the state of the task's checks
   118  	consulClient ConsulServiceAPI
   119  
   120  	// healthy is used to signal whether we have determined the allocation to be
   121  	// healthy or unhealthy
   122  	healthy chan bool
   123  
   124  	// allocStopped is triggered when the allocation is stopped and tracking is
   125  	// not needed
   126  	allocStopped chan struct{}
   127  
   128  	// l is used to lock shared fields listed below
   129  	l sync.Mutex
   130  
   131  	// tasksHealthy marks whether all the tasks have met their health check
   132  	// (disregards Consul)
   133  	tasksHealthy bool
   134  
   135  	// allocFailed marks whether the allocation failed
   136  	allocFailed bool
   137  
   138  	// checksHealthy marks whether all the tasks' Consul checks are healthy
   139  	checksHealthy bool
   140  
   141  	// taskHealth contains the health state for each task
   142  	taskHealth map[string]*taskHealthState
   143  }
   144  
   145  // newAllocHealthTracker returns a health tracker for the given allocation. An
   146  // alloc listener and consul API object are given so that the watcher can detect
   147  // health changes.
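        //
        // A minimal usage sketch (mirroring watchHealth above; ctx, logger, alloc,
        // listener and consulClient are assumed to be supplied by the alloc runner):
        //
        //	tracker := newAllocHealthTracker(ctx, logger, alloc, listener, consulClient)
        //	tracker.Start()
        //	select {
        //	case <-ctx.Done():
        //		// deadline reached or watcher cancelled
        //	case <-tracker.AllocStoppedCh():
        //		// allocation stopped; no health to record
        //	case healthy := <-tracker.HealthyCh():
        //		// record the result, e.g. r.allocHealth = helper.BoolToPtr(healthy)
        //	}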
   148  func newAllocHealthTracker(parentCtx context.Context, logger *log.Logger, alloc *structs.Allocation,
   149  	allocUpdates *cstructs.AllocListener, consulClient ConsulServiceAPI) *allocHealthTracker {
   150  
   151  	a := &allocHealthTracker{
   152  		logger:       logger,
   153  		healthy:      make(chan bool, 1),
   154  		allocStopped: make(chan struct{}),
   155  		alloc:        alloc,
   156  		tg:           alloc.Job.LookupTaskGroup(alloc.TaskGroup),
   157  		allocUpdates: allocUpdates,
   158  		consulClient: consulClient,
   159  	}
   160  
   161  	a.taskHealth = make(map[string]*taskHealthState, len(a.tg.Tasks))
   162  	for _, task := range a.tg.Tasks {
   163  		a.taskHealth[task.Name] = &taskHealthState{task: task}
   164  	}
   165  
   166  	for _, task := range a.tg.Tasks {
   167  		for _, s := range task.Services {
   168  			a.consulCheckCount += len(s.Checks)
   169  		}
   170  	}
   171  
   172  	a.ctx, a.cancelFn = context.WithCancel(parentCtx)
   173  	return a
   174  }
   175  
   176  // Start starts the watcher.
   177  func (a *allocHealthTracker) Start() {
   178  	go a.watchTaskEvents()
   179  	if a.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks {
   180  		go a.watchConsulEvents()
   181  	}
   182  }
   183  
   184  // HealthyCh returns a channel that will emit a boolean indicating the health of
   185  // the allocation.
   186  func (a *allocHealthTracker) HealthyCh() <-chan bool {
   187  	return a.healthy
   188  }
   189  
   190  // AllocStoppedCh returns a channel that will be closed if the allocation is
   191  // stopped. This means that health will not be set.
   192  func (a *allocHealthTracker) AllocStoppedCh() <-chan struct{} {
   193  	return a.allocStopped
   194  }
   195  
   196  // TaskEvents returns a map of events by task. This should only be called after
   197  // health has been determined. Only tasks that have contributed to the
   198  // allocation being unhealthy will have an event.
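        //
        // A sketch of typical consumption (as watchHealth does above; r is assumed to
        // be the enclosing AllocRunner and task-lock handling is omitted):
        //
        //	for task, event := range tracker.TaskEvents() {
        //		if tr, ok := r.tasks[task]; ok {
        //			tr.EmitEvent(allocHealthEventSource, event)
        //		}
        //	}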
   199  func (a *allocHealthTracker) TaskEvents() map[string]string {
   200  	a.l.Lock()
   201  	defer a.l.Unlock()
   202  
   203  	// Nothing to do since the failure wasn't task related
   204  	if a.allocFailed {
   205  		return nil
   206  	}
   207  
   208  	deadline, _ := a.ctx.Deadline()
   209  	events := make(map[string]string, len(a.tg.Tasks))
   210  
   211  	// Go through our task information and build the event map
   212  	for task, state := range a.taskHealth {
   213  		if e, ok := state.event(deadline, a.tg.Update); ok {
   214  			events[task] = e
   215  		}
   216  	}
   217  
   218  	return events
   219  }
   220  
   221  // setTaskHealth is used to set the tasks health as healthy or unhealthy. If the
   222  // allocation is terminal, health is immediately broadcast.
   223  func (a *allocHealthTracker) setTaskHealth(healthy, terminal bool) {
   224  	a.l.Lock()
   225  	defer a.l.Unlock()
   226  	a.tasksHealthy = healthy
   227  
   228  	// If we are marked healthy but we also require Consul to be healthy and it
   229  	// isn't yet, return, unless the task is terminal
   230  	requireConsul := a.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks && a.consulCheckCount > 0
   231  	if !terminal && healthy && requireConsul && !a.checksHealthy {
   232  		return
   233  	}
   234  
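        	// Non-blocking send: the healthy channel is buffered with capacity one,
        	// so an earlier result may already be pending and must not block us here.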
   235  	select {
   236  	case a.healthy <- healthy:
   237  	default:
   238  	}
   239  
   240  	// Shutdown the tracker
   241  	a.cancelFn()
   242  }
   243  
   244  // setCheckHealth is used to mark the checks as either healthy or unhealthy.
   245  func (a *allocHealthTracker) setCheckHealth(healthy bool) {
   246  	a.l.Lock()
   247  	defer a.l.Unlock()
   248  	a.checksHealthy = healthy
   249  
   250  	// Only signal if we are healthy and so are the tasks
   251  	if !healthy || !a.tasksHealthy {
   252  		return
   253  	}
   254  
   255  	select {
   256  	case a.healthy <- healthy:
   257  	default:
   258  	}
   259  
   260  	// Shutdown the tracker
   261  	a.cancelFn()
   262  }
   263  
   264  // markAllocStopped is used to mark the allocation as having stopped.
   265  func (a *allocHealthTracker) markAllocStopped() {
   266  	close(a.allocStopped)
   267  	a.cancelFn()
   268  }
   269  
   270  // watchTaskEvents is a long lived watcher that watches for the health of the
   271  // allocation's tasks.
   272  func (a *allocHealthTracker) watchTaskEvents() {
   273  	alloc := a.alloc
   274  	allStartedTime := time.Time{}
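        	// healthyTimer fires once all tasks have been running for MinHealthyTime.
        	// Create it stopped and drained so it only fires after being reset below.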
   275  	healthyTimer := time.NewTimer(0)
   276  	if !healthyTimer.Stop() {
   277  		select {
   278  		case <-healthyTimer.C:
   279  		default:
   280  		}
   281  	}
   282  
   283  	for {
   284  		// If the alloc is being stopped by the server just exit
   285  		switch alloc.DesiredStatus {
   286  		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
   287  			a.logger.Printf("[TRACE] client.alloc_watcher: desired status terminal for alloc %q", alloc.ID)
   288  			a.markAllocStopped()
   289  			return
   290  		}
   291  
   292  		// Store the task states
   293  		a.l.Lock()
   294  		for task, state := range alloc.TaskStates {
   295  			a.taskHealth[task].state = state
   296  		}
   297  		a.l.Unlock()
   298  
   299  		// Detect if the alloc is unhealthy or if all tasks have started
   300  		latestStartTime := time.Time{}
   301  		for _, state := range alloc.TaskStates {
   302  			// One of the tasks has failed so we can exit watching
   303  			if state.Failed || !state.FinishedAt.IsZero() {
   304  				a.setTaskHealth(false, true)
   305  				return
   306  			}
   307  
   308  			if state.State != structs.TaskStateRunning {
   309  				latestStartTime = time.Time{}
   310  				break
   311  			} else if state.StartedAt.After(latestStartTime) {
   312  				latestStartTime = state.StartedAt
   313  			}
   314  		}
   315  
   316  		// If the alloc is marked as failed by the client but none of the
   317  		// individual tasks failed, that means something failed at the alloc
   318  		// level.
   319  		if alloc.ClientStatus == structs.AllocClientStatusFailed {
   320  			a.logger.Printf("[TRACE] client.alloc_watcher: client status failed for alloc %q", alloc.ID)
   321  			a.l.Lock()
   322  			a.allocFailed = true
   323  			a.l.Unlock()
   324  			a.setTaskHealth(false, true)
   325  			return
   326  		}
   327  
   328  		if !latestStartTime.Equal(allStartedTime) {
   329  			// Prevent the timer from firing at the old start time
   330  			if !healthyTimer.Stop() {
   331  				select {
   332  				case <-healthyTimer.C:
   333  				default:
   334  				}
   335  			}
   336  
   337  			// Set the timer since all tasks are started
   338  			if !latestStartTime.IsZero() {
   339  				allStartedTime = latestStartTime
   340  				healthyTimer.Reset(a.tg.Update.MinHealthyTime)
   341  			}
   342  		}
   343  
   344  		select {
   345  		case <-a.ctx.Done():
   346  			return
   347  		case newAlloc, ok := <-a.allocUpdates.Ch:
   348  			if !ok {
   349  				return
   350  			}
   351  			alloc = newAlloc
   352  		case <-healthyTimer.C:
   353  			a.setTaskHealth(true, false)
   354  		}
   355  	}
   356  }
   357  
   358  // watchConsulEvents is a long lived watcher that watches for the health of the
   359  // allocation's Consul checks.
   360  func (a *allocHealthTracker) watchConsulEvents() {
   361  	// checkTicker is the ticker that triggers us to look at the checks in
   362  	// Consul
   363  	checkTicker := time.NewTicker(consulCheckLookupInterval)
   364  	defer checkTicker.Stop()
   365  
   366  	// healthyTimer fires when the checks have been healthy for the
   367  	// MinHealthyTime
   368  	healthyTimer := time.NewTimer(0)
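        	// Stop and drain the timer so it only fires once it has been primed below.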
   369  	if !healthyTimer.Stop() {
   370  		select {
   371  		case <-healthyTimer.C:
   372  		default:
   373  		}
   374  	}
   375  
   376  	// primed marks whether the healthy timer has been set
   377  	primed := false
   378  
   379  	// Store whether the last Consul checks call was successful or not
   380  	consulChecksErr := false
   381  
   382  	// allocReg holds the registered objects in Consul for the allocation
   383  	var allocReg *consul.AllocRegistration
   384  
   385  OUTER:
   386  	for {
   387  		select {
   388  		case <-a.ctx.Done():
   389  			return
   390  		case <-checkTicker.C:
   391  			newAllocReg, err := a.consulClient.AllocRegistrations(a.alloc.ID)
   392  			if err != nil {
   393  				if !consulChecksErr {
   394  					consulChecksErr = true
   395  					a.logger.Printf("[WARN] client.alloc_watcher: failed to lookup Consul registrations for allocation %q: %v", a.alloc.ID, err)
   396  				}
   397  				continue OUTER
   398  			} else {
   399  				consulChecksErr = false
   400  				allocReg = newAllocReg
   401  			}
   402  		case <-healthyTimer.C:
   403  			a.setCheckHealth(true)
   404  		}
   405  
   406  		if allocReg == nil {
   407  			continue
   408  		}
   409  
   410  		// Store the task registrations
   411  		a.l.Lock()
   412  		for task, reg := range allocReg.Tasks {
   413  			a.taskHealth[task].taskRegistrations = reg
   414  		}
   415  		a.l.Unlock()
   416  
   417  		// Detect if all the checks are passing
   418  		passed := true
   419  
   420  	CHECKS:
   421  		for _, treg := range allocReg.Tasks {
   422  			for _, sreg := range treg.Services {
   423  				for _, check := range sreg.Checks {
   424  					if check.Status == api.HealthPassing {
   425  						continue
   426  					}
   427  
   428  					passed = false
   429  					a.setCheckHealth(false)
   430  					break CHECKS
   431  				}
   432  			}
   433  		}
   434  
   435  		if !passed {
   436  			// Stop the timer since we have transitioned back to unhealthy
   437  			if primed {
   438  				if !healthyTimer.Stop() {
   439  					select {
   440  					case <-healthyTimer.C:
   441  					default:
   442  					}
   443  				}
   444  				primed = false
   445  			}
   446  		} else if !primed {
   447  			// Reset the timer to fire after MinHealthyTime
   448  			if !healthyTimer.Stop() {
   449  				select {
   450  				case <-healthyTimer.C:
   451  				default:
   452  				}
   453  			}
   454  
   455  			primed = true
   456  			healthyTimer.Reset(a.tg.Update.MinHealthyTime)
   457  		}
   458  	}
   459  }
   460  
   461  // taskHealthState captures all known health information about a task. It is
   462  // largely used to determine if the task has contributed to the allocation being
   463  // unhealthy.
   464  type taskHealthState struct {
   465  	task              *structs.Task
   466  	state             *structs.TaskState
   467  	taskRegistrations *consul.TaskRegistration
   468  }
   469  
   470  // event takes the deadline time for the allocation to be healthy and the update
   471  // strategy of the group. It returns an event description and true if the task
   472  // has contributed to the allocation being unhealthy.
   473  func (t *taskHealthState) event(deadline time.Time, update *structs.UpdateStrategy) (string, bool) {
   474  	requireChecks := false
   475  	desiredChecks := 0
   476  	for _, s := range t.task.Services {
   477  		if nc := len(s.Checks); nc > 0 {
   478  			requireChecks = true
   479  			desiredChecks += nc
   480  		}
   481  	}
   482  	requireChecks = requireChecks && update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks
   483  
   484  	if t.state != nil {
   485  		if t.state.Failed {
   486  			return "Unhealthy because of failed task", true
   487  		}
   488  		if t.state.State != structs.TaskStateRunning {
   489  			return "Task not running by deadline", true
   490  		}
   491  
   492  		// We are running so check if we have been running long enough
   493  		if t.state.StartedAt.Add(update.MinHealthyTime).After(deadline) {
   494  			return fmt.Sprintf("Task not running for min_healthy_time of %v by deadline", update.MinHealthyTime), true
   495  		}
   496  	}
   497  
   498  	if t.taskRegistrations != nil {
   499  		var notPassing []string
   500  		passing := 0
   501  
   502  	OUTER:
   503  		for _, sreg := range t.taskRegistrations.Services {
   504  			for _, check := range sreg.Checks {
   505  				if check.Status != api.HealthPassing {
   506  					notPassing = append(notPassing, sreg.Service.Service)
   507  					continue OUTER
   508  				} else {
   509  					passing++
   510  				}
   511  			}
   512  		}
   513  
   514  		if len(notPassing) != 0 {
   515  			return fmt.Sprintf("Services not healthy by deadline: %s", strings.Join(notPassing, ", ")), true
   516  		}
   517  
   518  		if passing != desiredChecks {
   519  			return fmt.Sprintf("Only %d out of %d checks registered and passing", passing, desiredChecks), true
   520  		}
   521  
   522  	} else if requireChecks {
   523  		return "Service checks not registered", true
   524  	}
   525  
   526  	return "", false
   527  }