github.com/zhizhiboom/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/client/allocrunner/alloc_runner_health_watcher.go

package allocrunner

import (
	"context"
	"fmt"
	"log"
	"strings"
	"sync"
	"time"

	"github.com/hashicorp/consul/api"
	consulApi "github.com/hashicorp/nomad/client/consul"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/command/agent/consul"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// consulCheckLookupInterval is the interval at which we check if the
	// Consul checks are healthy or unhealthy.
	consulCheckLookupInterval = 500 * time.Millisecond

	// allocHealthEventSource is the source used for emitting task events
	allocHealthEventSource = "Alloc Unhealthy"
)

// watchHealth is responsible for watching an allocation's task status and
// potentially Consul health check status to determine if the allocation is
// healthy or unhealthy.
func (r *AllocRunner) watchHealth(ctx context.Context) {

	// See if we should watch the alloc's health
	alloc := r.Alloc()

	// Neither deployments nor migrations care about the health of
	// non-service jobs, so never watch their health
	if alloc.Job.Type != structs.JobTypeService {
		return
	}

	// No need to watch health as it's already set
	if alloc.DeploymentStatus.HasHealth() {
		return
	}

	tg := alloc.Job.LookupTaskGroup(alloc.TaskGroup)
	if tg == nil {
		r.logger.Printf("[ERR] client.alloc_watcher: failed to lookup allocation %q task group %q. Exiting watcher",
			alloc.ID, alloc.TaskGroup)
		return
	}

	isDeploy := alloc.DeploymentID != ""

	// No need to watch allocs for deployments that rely on operators
	// manually setting health
	if isDeploy && (tg.Update == nil || tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Manual) {
		return
	}

	// Get an allocation listener to watch for alloc events
	l := r.allocBroadcast.Listen()
	defer l.Close()

	// Define the deadline, health method, and min healthy time from the
	// deployment if this is a deployment; otherwise from the migration
	// strategy.
	var deadline time.Time
	var useChecks bool
	var minHealthyTime time.Duration

	if isDeploy {
		deadline = time.Now().Add(tg.Update.HealthyDeadline)
		minHealthyTime = tg.Update.MinHealthyTime
		useChecks = tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks
	} else {
		strategy := tg.Migrate
		if strategy == nil {
			// For backwards compat with pre-0.8 allocations that
			// don't have a migrate strategy set.
			strategy = structs.DefaultMigrateStrategy()
		}
		deadline = time.Now().Add(strategy.HealthyDeadline)
		minHealthyTime = strategy.MinHealthyTime
		useChecks = strategy.HealthCheck == structs.MigrateStrategyHealthChecks
	}

	// Create a new context with the health deadline
	healthCtx, healthCtxCancel := context.WithDeadline(ctx, deadline)
	defer healthCtxCancel()
	r.logger.Printf("[DEBUG] client.alloc_watcher: deadline for alloc %q is at %v (deploy=%t checks=%t)", alloc.ID, deadline, isDeploy, useChecks)

	// Create the health tracker object
	tracker := newAllocHealthTracker(healthCtx, r.logger, alloc, l, r.consulClient, minHealthyTime, useChecks)
	tracker.Start()

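	// Wait for the tracker to report health, for the alloc to be stopped, or
	// for the watch context to be cancelled or reach its deadline.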
	allocHealthy := false
	select {
	case <-healthCtx.Done():
		// We were cancelled, which means we are no longer needed
		if healthCtx.Err() == context.Canceled {
			return
		}

		// Since the deadline has been reached, we are not healthy
	case <-tracker.AllocStoppedCh():
		// The allocation was stopped, so there is nothing to do
		return
	case healthy := <-tracker.HealthyCh():
		allocHealthy = healthy
	}

	r.allocLock.Lock()
	r.allocHealth = helper.BoolToPtr(allocHealthy)
	r.allocHealthTime = time.Now()
	r.allocLock.Unlock()

	// If the deployment is unhealthy, emit task events explaining why
	if !allocHealthy && isDeploy {
		r.taskLock.RLock()
		for task, event := range tracker.TaskEvents() {
			if tr, ok := r.tasks[task]; ok {
				tr.EmitEvent(allocHealthEventSource, event)
			}
		}
		r.taskLock.RUnlock()
	}

	r.syncStatus()
}

// allocHealthTracker tracks the health of an allocation and makes health events
// watchable via channels.
type allocHealthTracker struct {
	// logger is used to log
	logger *log.Logger

	// ctx and cancelFn are used to shut down the tracker
	ctx      context.Context
	cancelFn context.CancelFunc

	// alloc is the alloc we are tracking
	alloc *structs.Allocation

	// tg is the task group we are tracking
	tg *structs.TaskGroup

	// minHealthyTime is the duration an alloc must remain healthy to be
	// considered healthy
	minHealthyTime time.Duration

	// useChecks specifies whether to use Consul health checks or not
	useChecks bool

	// consulCheckCount is the number of checks the task group will attempt to
	// register
	consulCheckCount int

	// allocUpdates is a listener for retrieving new alloc updates
	allocUpdates *cstructs.AllocListener

	// consulClient is used to look up the state of the task's checks
	consulClient consulApi.ConsulServiceAPI

	// healthy is used to signal whether we have determined the allocation to be
	// healthy or unhealthy
	healthy chan bool

	// allocStopped is triggered when the allocation is stopped and tracking is
	// not needed
	allocStopped chan struct{}

	// l is used to lock shared fields listed below
	l sync.Mutex

	// tasksHealthy marks whether all the tasks have met their health check
	// (disregards Consul)
	tasksHealthy bool

	// allocFailed marks whether the allocation failed
	allocFailed bool

	// checksHealthy marks whether all the tasks' Consul checks are healthy
	checksHealthy bool

	// taskHealth contains the health state for each task
	taskHealth map[string]*taskHealthState
}

// newAllocHealthTracker returns a health tracker for the given allocation. An
// alloc listener and consul API object are given so that the watcher can detect
// health changes.
func newAllocHealthTracker(parentCtx context.Context, logger *log.Logger, alloc *structs.Allocation,
	allocUpdates *cstructs.AllocListener, consulClient consulApi.ConsulServiceAPI,
	minHealthyTime time.Duration, useChecks bool) *allocHealthTracker {

	a := &allocHealthTracker{
		logger:         logger,
		healthy:        make(chan bool, 1),
		allocStopped:   make(chan struct{}),
		alloc:          alloc,
		tg:             alloc.Job.LookupTaskGroup(alloc.TaskGroup),
		minHealthyTime: minHealthyTime,
		useChecks:      useChecks,
		allocUpdates:   allocUpdates,
		consulClient:   consulClient,
	}

	a.taskHealth = make(map[string]*taskHealthState, len(a.tg.Tasks))
	for _, task := range a.tg.Tasks {
		a.taskHealth[task.Name] = &taskHealthState{task: task}
	}

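	// Count the Consul checks the task group will register so we know whether
	// check health needs to gate allocation health.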
	for _, task := range a.tg.Tasks {
		for _, s := range task.Services {
			a.consulCheckCount += len(s.Checks)
		}
	}

	a.ctx, a.cancelFn = context.WithCancel(parentCtx)
	return a
}

// Start starts the watcher.
func (a *allocHealthTracker) Start() {
	go a.watchTaskEvents()
	if a.useChecks {
		go a.watchConsulEvents()
	}
}

// HealthyCh returns a channel that will emit a boolean indicating the health of
// the allocation.
func (a *allocHealthTracker) HealthyCh() <-chan bool {
	return a.healthy
}

// AllocStoppedCh returns a channel that will be closed if the allocation is
// stopped. This means that health will not be set.
func (a *allocHealthTracker) AllocStoppedCh() <-chan struct{} {
	return a.allocStopped
}

// TaskEvents returns a map of events by task. This should only be called after
// health has been determined. Only tasks that have contributed to the
// allocation being unhealthy will have an event.
func (a *allocHealthTracker) TaskEvents() map[string]string {
	a.l.Lock()
	defer a.l.Unlock()

	// Nothing to do since the failure wasn't task related
	if a.allocFailed {
		return nil
	}

	deadline, _ := a.ctx.Deadline()
	events := make(map[string]string, len(a.tg.Tasks))

	// Go through our task information and build the event map
	for task, state := range a.taskHealth {
		useChecks := a.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks
		if e, ok := state.event(deadline, a.tg.Update.MinHealthyTime, useChecks); ok {
			events[task] = e
		}
	}

	return events
}

// setTaskHealth is used to set the tasks' health as healthy or unhealthy. If the
// allocation is terminal, health is immediately broadcast.
func (a *allocHealthTracker) setTaskHealth(healthy, terminal bool) {
	a.l.Lock()
	defer a.l.Unlock()
	a.tasksHealthy = healthy

	// If we are marked healthy but we also require Consul to be healthy and it
	// isn't yet, return, unless the task is terminal
	requireConsul := a.useChecks && a.consulCheckCount > 0
	if !terminal && healthy && requireConsul && !a.checksHealthy {
		return
	}

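	// Signal health with a non-blocking send; the channel is buffered so at
	// most one value is ever pending, and a duplicate signal is dropped.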
	select {
	case a.healthy <- healthy:
	default:
	}

	// Shut down the tracker
	a.cancelFn()
}

// setCheckHealth is used to mark the checks as either healthy or unhealthy.
func (a *allocHealthTracker) setCheckHealth(healthy bool) {
	a.l.Lock()
	defer a.l.Unlock()
	a.checksHealthy = healthy

	// Only signal if we are healthy and so are the tasks
	if !healthy || !a.tasksHealthy {
		return
	}

	select {
	case a.healthy <- healthy:
	default:
	}

	// Shut down the tracker
	a.cancelFn()
}

// markAllocStopped is used to mark the allocation as having stopped.
func (a *allocHealthTracker) markAllocStopped() {
	close(a.allocStopped)
	a.cancelFn()
}

// watchTaskEvents is a long-lived watcher that watches for the health of the
// allocation's tasks.
func (a *allocHealthTracker) watchTaskEvents() {
	alloc := a.alloc
	allStartedTime := time.Time{}
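	// The healthy timer starts stopped and drained; it is armed only once all
	// tasks are running and then fires after minHealthyTime.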
	healthyTimer := time.NewTimer(0)
	if !healthyTimer.Stop() {
		select {
		case <-healthyTimer.C:
		default:
		}
	}

	for {
		// If the alloc is being stopped by the server, just exit
		switch alloc.DesiredStatus {
		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
			a.logger.Printf("[TRACE] client.alloc_watcher: desired status terminal for alloc %q", alloc.ID)
			a.markAllocStopped()
			return
		}

		// Store the task states
		a.l.Lock()
		for task, state := range alloc.TaskStates {
			a.taskHealth[task].state = state
		}
		a.l.Unlock()

		// Detect if the alloc is unhealthy or if all tasks have started
		latestStartTime := time.Time{}
		for _, state := range alloc.TaskStates {
			// One of the tasks has failed so we can exit watching
			if state.Failed || !state.FinishedAt.IsZero() {
				a.setTaskHealth(false, true)
				return
			}

			if state.State != structs.TaskStateRunning {
				latestStartTime = time.Time{}
				break
			} else if state.StartedAt.After(latestStartTime) {
				latestStartTime = state.StartedAt
			}
		}

		// If the alloc is marked as failed by the client but none of the
		// individual tasks failed, that means something failed at the alloc
		// level.
		if alloc.ClientStatus == structs.AllocClientStatusFailed {
			a.logger.Printf("[TRACE] client.alloc_watcher: client status failed for alloc %q", alloc.ID)
			a.l.Lock()
			a.allocFailed = true
			a.l.Unlock()
			a.setTaskHealth(false, true)
			return
		}

		if !latestStartTime.Equal(allStartedTime) {
			// Prevent the timer from firing at the old start time
			if !healthyTimer.Stop() {
				select {
				case <-healthyTimer.C:
				default:
				}
			}

			// Set the timer since all tasks are started
			if !latestStartTime.IsZero() {
				allStartedTime = latestStartTime
				healthyTimer.Reset(a.minHealthyTime)
			}
		}

		select {
		case <-a.ctx.Done():
			return
		case newAlloc, ok := <-a.allocUpdates.Ch:
			if !ok {
				return
			}
			alloc = newAlloc
		case <-healthyTimer.C:
			a.setTaskHealth(true, false)
		}
	}
}

// watchConsulEvents is a long-lived watcher that watches for the health of the
// allocation's Consul checks.
func (a *allocHealthTracker) watchConsulEvents() {
	// checkTicker is the ticker that triggers us to look at the checks in
	// Consul
	checkTicker := time.NewTicker(consulCheckLookupInterval)
	defer checkTicker.Stop()

	// healthyTimer fires when the checks have been healthy for the
	// MinHealthyTime
	healthyTimer := time.NewTimer(0)
	if !healthyTimer.Stop() {
		select {
		case <-healthyTimer.C:
		default:
		}
	}

	// primed marks whether the healthy timer has been set
	primed := false

	// Store whether the last Consul checks call was successful or not
	consulChecksErr := false

	// allocReg holds the objects registered in Consul for the allocation
	var allocReg *consul.AllocRegistration

OUTER:
	for {
		select {
		case <-a.ctx.Done():
			return
		case <-checkTicker.C:
			newAllocReg, err := a.consulClient.AllocRegistrations(a.alloc.ID)
			if err != nil {
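				// Only log the first of consecutive failures so a flapping
				// Consul connection does not flood the logs.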
				if !consulChecksErr {
					consulChecksErr = true
					a.logger.Printf("[WARN] client.alloc_watcher: failed to lookup Consul registrations for allocation %q: %v", a.alloc.ID, err)
				}
				continue OUTER
			} else {
				consulChecksErr = false
				allocReg = newAllocReg
			}
		case <-healthyTimer.C:
			a.setCheckHealth(true)
		}

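		// Nothing to inspect until the first successful Consul lookup completes.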
		if allocReg == nil {
			continue
		}

		// Store the task registrations
		a.l.Lock()
		for task, reg := range allocReg.Tasks {
			a.taskHealth[task].taskRegistrations = reg
		}
		a.l.Unlock()

		// Detect if all the checks are passing
		passed := true

	CHECKS:
		for _, treg := range allocReg.Tasks {
			for _, sreg := range treg.Services {
				for _, check := range sreg.Checks {
					if check.Status == api.HealthPassing {
						continue
					}

					passed = false
					a.setCheckHealth(false)
					break CHECKS
				}
			}
		}

		if !passed {
			// Stop the timer since we have transitioned back to unhealthy
			if primed {
				if !healthyTimer.Stop() {
					select {
					case <-healthyTimer.C:
					default:
					}
				}
				primed = false
			}
		} else if !primed {
			// Reset the timer to fire after MinHealthyTime
			if !healthyTimer.Stop() {
				select {
				case <-healthyTimer.C:
				default:
				}
			}

			primed = true
			healthyTimer.Reset(a.minHealthyTime)
		}
	}
}

// taskHealthState captures all known health information about a task. It is
// largely used to determine if the task has contributed to the allocation being
// unhealthy.
type taskHealthState struct {
	task              *structs.Task
	state             *structs.TaskState
	taskRegistrations *consul.TaskRegistration
}

// event takes the deadline by which the allocation must be healthy, the group's
// minimum healthy time, and whether Consul checks are used. It returns true if
// the task has contributed to the allocation being unhealthy and, if so, an
// event description of why.
func (t *taskHealthState) event(deadline time.Time, minHealthyTime time.Duration, useChecks bool) (string, bool) {
	requireChecks := false
	desiredChecks := 0
	for _, s := range t.task.Services {
		if nc := len(s.Checks); nc > 0 {
			requireChecks = true
			desiredChecks += nc
		}
	}
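	// Checks only gate health when the update or migrate strategy uses them.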
	requireChecks = requireChecks && useChecks

	if t.state != nil {
		if t.state.Failed {
			return "Unhealthy because of failed task", true
		}
		if t.state.State != structs.TaskStateRunning {
			return "Task not running by deadline", true
		}

		// We are running, so check if we have been running long enough
		if t.state.StartedAt.Add(minHealthyTime).After(deadline) {
			return fmt.Sprintf("Task not running for min_healthy_time of %v by deadline", minHealthyTime), true
		}
	}

	if t.taskRegistrations != nil {
		var notPassing []string
		passing := 0

	OUTER:
		for _, sreg := range t.taskRegistrations.Services {
			for _, check := range sreg.Checks {
				if check.Status != api.HealthPassing {
					notPassing = append(notPassing, sreg.Service.Service)
					continue OUTER
				} else {
					passing++
				}
			}
		}

		if len(notPassing) != 0 {
			return fmt.Sprintf("Services not healthy by deadline: %s", strings.Join(notPassing, ", ")), true
		}

		if passing != desiredChecks {
			return fmt.Sprintf("Only %d out of %d checks registered and passing", passing, desiredChecks), true
		}

	} else if requireChecks {
		return "Service checks not registered", true
	}

	return "", false
}