github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/client/allochealth/tracker.go

     1  package allochealth
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"strings"
     7  	"sync"
     8  	"time"
     9  
    10  	"github.com/hashicorp/consul/api"
    11  	hclog "github.com/hashicorp/go-hclog"
    12  	cconsul "github.com/hashicorp/nomad/client/consul"
    13  	cstructs "github.com/hashicorp/nomad/client/structs"
    14  	"github.com/hashicorp/nomad/command/agent/consul"
    15  	"github.com/hashicorp/nomad/nomad/structs"
    16  )
    17  
    18  const (
    19  	// AllocHealthEventSource is the source used for emitting task events
    20  	AllocHealthEventSource = "Alloc Unhealthy"
    21  
    22  	// consulCheckLookupInterval is the interval at which we check if the
    23  	// Consul checks are healthy or unhealthy.
    24  	consulCheckLookupInterval = 500 * time.Millisecond
    25  )
    26  
    27  // Tracker tracks the health of an allocation and makes health events watchable
    28  // via channels.
    29  type Tracker struct {
    30  	// ctx and cancelFn is used to shutdown the tracker
    31  	ctx      context.Context
    32  	cancelFn context.CancelFunc
    33  
    34  	// alloc is the alloc we are tracking
    35  	alloc *structs.Allocation
    36  
    37  	// tg is the task group we are tracking
    38  	tg *structs.TaskGroup
    39  
    40  	// minHealthyTime is the duration an alloc must remain healthy to be
    41  	// considered healthy
    42  	minHealthyTime time.Duration
    43  
    44  	// checkLookupInterval is the interval at which we check if the
    45  	// Consul checks are healthy or unhealthy.
    46  	checkLookupInterval time.Duration
    47  
    48  	// useChecks specifies whether to use Consul health checks or not
    49  	useChecks bool
    50  
    51  	// consulCheckCount is the number of checks the task group will attempt to
    52  	// register
    53  	consulCheckCount int
    54  
    55  	// allocUpdates is a listener for retrieving new alloc updates
    56  	allocUpdates *cstructs.AllocListener
    57  
    58  	// consulClient is used to look up the state of the task's checks
    59  	consulClient cconsul.ConsulServiceAPI
    60  
    61  	// healthy is used to signal whether we have determined the allocation to be
    62  	// healthy or unhealthy
    63  	healthy chan bool
    64  
    65  	// allocStopped is closed when the allocation is stopped and tracking is
    66  	// no longer needed
    67  	allocStopped chan struct{}
    68  
    69  	// lifecycleTasks is a map of ephemeral tasks and their lifecycle hooks.
    70  	// These tasks may terminate without affecting alloc health
    71  	lifecycleTasks map[string]string
    72  
    73  	// l is used to lock shared fields listed below
    74  	l sync.Mutex
    75  
    76  	// tasksHealthy marks whether all the tasks have met their health check
    77  	// (disregards Consul)
    78  	tasksHealthy bool
    79  
    80  	// allocFailed marks whether the allocation failed
    81  	allocFailed bool
    82  
    83  	// checksHealthy marks whether all the tasks' Consul checks are healthy
    84  	checksHealthy bool
    85  
    86  	// taskHealth contains the health state for each task
    87  	taskHealth map[string]*taskHealthState
    88  
    89  	logger hclog.Logger
    90  }
    91  
    92  // NewTracker returns a health tracker for the given allocation. An alloc
    93  // listener and consul API object are given so that the watcher can detect
    94  // health changes.
    95  func NewTracker(parentCtx context.Context, logger hclog.Logger, alloc *structs.Allocation,
    96  	allocUpdates *cstructs.AllocListener, consulClient cconsul.ConsulServiceAPI,
    97  	minHealthyTime time.Duration, useChecks bool) *Tracker {
    98  
    99  	// Do not create a named sub-logger as the hook controlling
   100  	// this struct should pass in an appropriately named
   101  	// sub-logger.
   102  	t := &Tracker{
   103  		healthy:             make(chan bool, 1),
   104  		allocStopped:        make(chan struct{}),
   105  		alloc:               alloc,
   106  		tg:                  alloc.Job.LookupTaskGroup(alloc.TaskGroup),
   107  		minHealthyTime:      minHealthyTime,
   108  		useChecks:           useChecks,
   109  		allocUpdates:        allocUpdates,
   110  		consulClient:        consulClient,
   111  		checkLookupInterval: consulCheckLookupInterval,
   112  		logger:              logger,
   113  		lifecycleTasks:      map[string]string{},
   114  	}
   115  
   116  	t.taskHealth = make(map[string]*taskHealthState, len(t.tg.Tasks))
   117  	for _, task := range t.tg.Tasks {
   118  		t.taskHealth[task.Name] = &taskHealthState{task: task}
   119  
   120  		if task.Lifecycle != nil && !task.Lifecycle.Sidecar {
   121  			t.lifecycleTasks[task.Name] = task.Lifecycle.Hook
   122  		}
   123  
   124  		for _, s := range task.Services {
   125  			t.consulCheckCount += len(s.Checks)
   126  		}
   127  	}
   128  
   129  	for _, s := range t.tg.Services {
   130  		t.consulCheckCount += len(s.Checks)
   131  	}
   132  
   133  	t.ctx, t.cancelFn = context.WithCancel(parentCtx)
   134  	return t
   135  }
   136  
   137  // Start starts the watcher.
   138  func (t *Tracker) Start() {
   139  	go t.watchTaskEvents()
   140  	if t.useChecks {
   141  		go t.watchConsulEvents()
   142  	}
   143  }
   144  
   145  // HealthyCh returns a channel that will emit a boolean indicating the health of
   146  // the allocation.
   147  func (t *Tracker) HealthyCh() <-chan bool {
   148  	return t.healthy
   149  }
   150  
   151  // AllocStoppedCh returns a channel that will be closed if the allocation is
   152  // stopped, in which case health will not be set.
   153  func (t *Tracker) AllocStoppedCh() <-chan struct{} {
   154  	return t.allocStopped
   155  }
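
        // The example below is an illustrative sketch, not part of the original file:
        // it shows how a caller such as an alloc runner health hook might wire the
        // tracker together. The function name watchAllocHealthExample and the
        // 10-second minimum healthy time are assumptions made for the example.
        func watchAllocHealthExample(ctx context.Context, logger hclog.Logger, alloc *structs.Allocation,
        	listener *cstructs.AllocListener, consulClient cconsul.ConsulServiceAPI) {
        
        	// Build the tracker using Consul checks and start its watchers.
        	tracker := NewTracker(ctx, logger, alloc, listener, consulClient, 10*time.Second, true)
        	tracker.Start()
        
        	// Wait for the first health determination or for the alloc to stop.
        	select {
        	case healthy := <-tracker.HealthyCh():
        		logger.Info("alloc health determined", "alloc_id", alloc.ID, "healthy", healthy)
        		if !healthy {
        			for task, event := range tracker.TaskEvents() {
        				logger.Warn("unhealthy task", "task", task, "reason", event.DisplayMessage)
        			}
        		}
        	case <-tracker.AllocStoppedCh():
        		logger.Info("alloc stopped before health was set", "alloc_id", alloc.ID)
        	case <-ctx.Done():
        	}
        }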
   156  
   157  // TaskEvents returns a map of events by task. This should only be called after
   158  // health has been determined. Only tasks that have contributed to the
   159  // allocation being unhealthy will have an event.
   160  func (t *Tracker) TaskEvents() map[string]*structs.TaskEvent {
   161  	t.l.Lock()
   162  	defer t.l.Unlock()
   163  
   164  	// Nothing to do since the failure wasn't task related
   165  	if t.allocFailed {
   166  		return nil
   167  	}
   168  
   169  	deadline, _ := t.ctx.Deadline()
   170  	events := make(map[string]*structs.TaskEvent, len(t.tg.Tasks))
   171  
   172  	// Go through our task information and build the event map
   173  	for task, state := range t.taskHealth {
   174  		useChecks := t.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks
   175  		if e, ok := state.event(deadline, t.tg.Update.MinHealthyTime, useChecks); ok {
   176  			events[task] = structs.NewTaskEvent(AllocHealthEventSource).SetMessage(e)
   177  		}
   178  	}
   179  
   180  	return events
   181  }
   182  
   183  // setTaskHealth is used to set the tasks' health as healthy or unhealthy. If
   184  // the allocation is terminal, health is immediately broadcast.
   185  func (t *Tracker) setTaskHealth(healthy, terminal bool) {
   186  	t.l.Lock()
   187  	defer t.l.Unlock()
   188  	t.tasksHealthy = healthy
   189  
   190  	// if unhealthy, force waiting for new checks health status
   191  	if !terminal && !healthy {
   192  		t.checksHealthy = false
   193  		return
   194  	}
   195  
   196  	// If we are marked healthy but we also require Consul to be healthy and it
   197  	// isn't yet, return, unless the task is terminal
   198  	requireConsul := t.useChecks && t.consulCheckCount > 0
   199  	if !terminal && healthy && requireConsul && !t.checksHealthy {
   200  		return
   201  	}
   202  
   203  	select {
   204  	case t.healthy <- healthy:
   205  	default:
   206  	}
   207  
   208  	// Shutdown the tracker
   209  	t.cancelFn()
   210  }
   211  
   212  // setCheckHealth is used to mark the checks as either healthy or unhealthy.
   213  // It returns true if health is propagated and no more health monitoring is needed.
   214  func (t *Tracker) setCheckHealth(healthy bool) bool {
   215  	t.l.Lock()
   216  	defer t.l.Unlock()
   217  
   218  	// check health should always be false if tasks are unhealthy
   219  	// as checks might be missing from unhealthy tasks
   220  	t.checksHealthy = healthy && t.tasksHealthy
   221  
   222  	// Only signal if the checks are healthy and so are the tasks
   223  	if !t.checksHealthy {
   224  		return false
   225  	}
   226  
   227  	select {
   228  	case t.healthy <- healthy:
   229  	default:
   230  	}
   231  
   232  	// Shutdown the tracker
   233  	t.cancelFn()
   234  	return true
   235  }
   236  
   237  // markAllocStopped is used to mark the allocation as having stopped.
   238  func (t *Tracker) markAllocStopped() {
   239  	close(t.allocStopped)
   240  	t.cancelFn()
   241  }
   242  
   243  // watchTaskEvents is a long-lived watcher that watches the health of the
   244  // allocation's tasks.
   245  func (t *Tracker) watchTaskEvents() {
   246  	alloc := t.alloc
   247  	allStartedTime := time.Time{}
   248  	healthyTimer := time.NewTimer(0)
   249  	if !healthyTimer.Stop() {
   250  		select {
   251  		case <-healthyTimer.C:
   252  		default:
   253  		}
   254  	}
   255  
   256  	for {
   257  		// If the alloc is being stopped by the server just exit
   258  		switch alloc.DesiredStatus {
   259  		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
   260  			t.logger.Trace("desired status is terminal for alloc", "alloc_id", alloc.ID, "desired_status", alloc.DesiredStatus)
   261  			t.markAllocStopped()
   262  			return
   263  		}
   264  
   265  		// Store the task states
   266  		t.l.Lock()
   267  		for task, state := range alloc.TaskStates {
   268  			//TODO(schmichael) for now skip unknown tasks as
   269  			//they're task group services which don't currently
   270  			//support checks anyway
   271  			if v, ok := t.taskHealth[task]; ok {
   272  				v.state = state
   273  			}
   274  		}
   275  		t.l.Unlock()
   276  
   277  		// Detect if the alloc is unhealthy or if all tasks have started
   278  		latestStartTime := time.Time{}
   279  		for taskName, state := range alloc.TaskStates {
   280  			// If the task is a poststop task we do not want to evaluate it
   281  			// since it will remain pending until the main task has finished
   282  			// or exited.
   283  			if t.lifecycleTasks[taskName] == structs.TaskLifecycleHookPoststop {
   284  				continue
   285  			}
   286  
   287  			// One of the tasks has failed so we can exit watching
   288  			if state.Failed || (!state.FinishedAt.IsZero() && t.lifecycleTasks[taskName] != structs.TaskLifecycleHookPrestart) {
   289  				t.setTaskHealth(false, true)
   290  				return
   291  			}
   292  
   293  			if state.State == structs.TaskStatePending {
   294  				latestStartTime = time.Time{}
   295  				break
   296  			} else if state.StartedAt.After(latestStartTime) {
   297  				// task is either running or exited successfully
   298  				latestStartTime = state.StartedAt
   299  			}
   300  		}
   301  
   302  		// If the alloc is marked as failed by the client but none of the
   303  		// individual tasks failed, that means something failed at the alloc
   304  		// level.
   305  		if alloc.ClientStatus == structs.AllocClientStatusFailed {
   306  			t.l.Lock()
   307  			t.allocFailed = true
   308  			t.l.Unlock()
   309  
   310  			t.setTaskHealth(false, true)
   311  			return
   312  		}
   313  
   314  		if !latestStartTime.Equal(allStartedTime) {
   315  			// reset task health
   316  			t.setTaskHealth(false, false)
   317  
   318  			// Prevent the timer from firing at the old start time
   319  			if !healthyTimer.Stop() {
   320  				select {
   321  				case <-healthyTimer.C:
   322  				default:
   323  				}
   324  			}
   325  
   326  			// Set the timer since all tasks have started
   327  			if !latestStartTime.IsZero() {
   328  				allStartedTime = latestStartTime
   329  				healthyTimer.Reset(t.minHealthyTime)
   330  			}
   331  		}
   332  
   333  		select {
   334  		case <-t.ctx.Done():
   335  			return
   336  		case newAlloc, ok := <-t.allocUpdates.Ch():
   337  			if !ok {
   338  				return
   339  			}
   340  			alloc = newAlloc
   341  		case <-healthyTimer.C:
   342  			t.setTaskHealth(true, false)
   343  		}
   344  	}
   345  }
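
        // stopAndDrainExample is an illustrative sketch, not part of the original
        // file, of the timer idiom used in watchTaskEvents and watchConsulEvents:
        // stop the timer and, if it has already fired, drain the buffered tick so a
        // later Reset starts from a clean state.
        func stopAndDrainExample(t *time.Timer) {
        	if !t.Stop() {
        		select {
        		case <-t.C:
        		default:
        		}
        	}
        }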
   346  
   347  // watchConsulEvents is a watcher for the health of the allocation's Consul
   348  // checks. If all checks report healthy, the watcher exits after the
   349  // MinHealthyTime has been reached. Otherwise the watcher continues to poll
   350  // the unhealthy checks until the ctx is cancelled.
   351  func (t *Tracker) watchConsulEvents() {
   352  	// checkTicker is the ticker that triggers us to look at the checks in
   353  	// Consul
   354  	checkTicker := time.NewTicker(t.checkLookupInterval)
   355  	defer checkTicker.Stop()
   356  
   357  	// healthyTimer fires when the checks have been healthy for the
   358  	// MinHealthyTime
   359  	healthyTimer := time.NewTimer(0)
   360  	if !healthyTimer.Stop() {
   361  		select {
   362  		case <-healthyTimer.C:
   363  		default:
   364  		}
   365  	}
   366  
   367  	// primed marks whether the healthy timer has been set
   368  	primed := false
   369  
   370  	// Store whether the last Consul checks call was successful or not
   371  	consulChecksErr := false
   372  
   373  	// allocReg holds the objects registered in Consul for the allocation
   374  	var allocReg *consul.AllocRegistration
   375  
   376  OUTER:
   377  	for {
   378  		select {
   379  		case <-t.ctx.Done():
   380  			return
   381  		case <-checkTicker.C:
   382  			newAllocReg, err := t.consulClient.AllocRegistrations(t.alloc.ID)
   383  			if err != nil {
   384  				if !consulChecksErr {
   385  					consulChecksErr = true
   386  					t.logger.Warn("error looking up Consul registrations for allocation", "error", err, "alloc_id", t.alloc.ID)
   387  				}
   388  				continue OUTER
   389  			} else {
   390  				consulChecksErr = false
   391  				allocReg = newAllocReg
   392  			}
   393  		case <-healthyTimer.C:
   394  			if t.setCheckHealth(true) {
   395  				// final health set and propagated
   396  				return
   397  			}
   398  			// tasks are unhealthy, reset and wait until all is healthy
   399  			primed = false
   400  		}
   401  
   402  		if allocReg == nil {
   403  			continue
   404  		}
   405  
   406  		// Store the task registrations
   407  		t.l.Lock()
   408  		for task, reg := range allocReg.Tasks {
   409  			//TODO(schmichael) for now skip unknown tasks as
   410  			//they're task group services which don't currently
   411  			//support checks anyway
   412  			if v, ok := t.taskHealth[task]; ok {
   413  				v.taskRegistrations = reg
   414  			}
   415  		}
   416  		t.l.Unlock()
   417  
   418  		// Detect if all the checks are passing
   419  		passed := true
   420  
   421  	CHECKS:
   422  		for _, treg := range allocReg.Tasks {
   423  			for _, sreg := range treg.Services {
   424  				for _, check := range sreg.Checks {
   425  					onupdate := sreg.CheckOnUpdate[check.CheckID]
   426  					switch check.Status {
   427  					case api.HealthPassing:
   428  						continue
   429  					case api.HealthWarning:
   430  						if onupdate == structs.OnUpdateIgnoreWarn || onupdate == structs.OnUpdateIgnore {
   431  							continue
   432  						}
   433  					case api.HealthCritical:
   434  						if onupdate == structs.OnUpdateIgnore {
   435  							continue
   436  						}
   437  					default:
   438  					}
   439  
   440  					passed = false
   441  					t.setCheckHealth(false)
   442  					break CHECKS
   443  				}
   444  			}
   445  		}
   446  
   447  		if !passed {
   448  			// Stop the timer since we have transitioned back to unhealthy
   449  			if primed {
   450  				if !healthyTimer.Stop() {
   451  					select {
   452  					case <-healthyTimer.C:
   453  					default:
   454  					}
   455  				}
   456  				primed = false
   457  			}
   458  		} else if !primed {
   459  			// Reset the timer to fire after MinHealthyTime
   460  			if !healthyTimer.Stop() {
   461  				select {
   462  				case <-healthyTimer.C:
   463  				default:
   464  				}
   465  			}
   466  
   467  			primed = true
   468  			healthyTimer.Reset(t.minHealthyTime)
   469  		}
   470  	}
   471  }
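
        // checkIsPassingExample is an illustrative sketch, not part of the original
        // file, of the status/on_update decision applied in the CHECKS loop above: a
        // check counts as passing when Consul reports it passing, when it is warning
        // but on_update is ignore_warnings, or when on_update is ignore entirely.
        func checkIsPassingExample(status, onupdate string) bool {
        	switch status {
        	case api.HealthPassing:
        		return true
        	case api.HealthWarning:
        		return onupdate == structs.OnUpdateIgnoreWarn || onupdate == structs.OnUpdateIgnore
        	case api.HealthCritical:
        		return onupdate == structs.OnUpdateIgnore
        	default:
        		// Any other status is treated as not passing, matching the loop above.
        		return false
        	}
        }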
   472  
   473  // taskHealthState captures all known health information about a task. It is
   474  // largely used to determine if the task has contributed to the allocation being
   475  // unhealthy.
   476  type taskHealthState struct {
   477  	task              *structs.Task
   478  	state             *structs.TaskState
   479  	taskRegistrations *consul.ServiceRegistrations
   480  }
   481  
   482  // event takes the deadline by which the allocation must be healthy, the group's
   483  // min_healthy_time, and whether checks are used. It returns a description of why
   484  // the task contributed to the allocation being unhealthy, and true if it did.
   485  func (t *taskHealthState) event(deadline time.Time, minHealthyTime time.Duration, useChecks bool) (string, bool) {
   486  	requireChecks := false
   487  	desiredChecks := 0
   488  	for _, s := range t.task.Services {
   489  		if nc := len(s.Checks); nc > 0 {
   490  			requireChecks = true
   491  			desiredChecks += nc
   492  		}
   493  	}
   494  	requireChecks = requireChecks && useChecks
   495  
   496  	if t.state != nil {
   497  		if t.state.Failed {
   498  			return "Unhealthy because of failed task", true
   499  		}
   500  
   501  		switch t.state.State {
   502  		case structs.TaskStatePending:
   503  			return "Task not running by deadline", true
   504  		case structs.TaskStateDead:
   505  			// non-sidecar lifecycle (hook) tasks are healthy when they exit successfully
   506  			if t.task.Lifecycle == nil || t.task.Lifecycle.Sidecar {
   507  				return "Unhealthy because of dead task", true
   508  			}
   509  		case structs.TaskStateRunning:
   510  			// We are running so check if we have been running long enough
   511  			if t.state.StartedAt.Add(minHealthyTime).After(deadline) {
   512  				return fmt.Sprintf("Task not running for min_healthy_time of %v by deadline", minHealthyTime), true
   513  			}
   514  		}
   515  	}
   516  
   517  	if t.taskRegistrations != nil {
   518  		var notPassing []string
   519  		passing := 0
   520  
   521  	OUTER:
   522  		for _, sreg := range t.taskRegistrations.Services {
   523  			for _, check := range sreg.Checks {
   524  				if check.Status != api.HealthPassing {
   525  					notPassing = append(notPassing, sreg.Service.Service)
   526  					continue OUTER
   527  				} else {
   528  					passing++
   529  				}
   530  			}
   531  		}
   532  
   533  		if len(notPassing) != 0 {
   534  			return fmt.Sprintf("Services not healthy by deadline: %s", strings.Join(notPassing, ", ")), true
   535  		}
   536  
   537  		if passing != desiredChecks {
   538  			return fmt.Sprintf("Only %d out of %d checks registered and passing", passing, desiredChecks), true
   539  		}
   540  
   541  	} else if requireChecks {
   542  		return "Service checks not registered", true
   543  	}
   544  
   545  	return "", false
   546  }
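
        // eventExample is an illustrative sketch, not part of the original file,
        // showing how event is interpreted: a task that only started two seconds
        // before the deadline cannot satisfy a one-minute min_healthy_time, so it is
        // reported as having contributed to the allocation being unhealthy.
        func eventExample() {
        	deadline := time.Now()
        	state := &taskHealthState{
        		task:  &structs.Task{Name: "web"},
        		state: &structs.TaskState{State: structs.TaskStateRunning, StartedAt: deadline.Add(-2 * time.Second)},
        	}
        	if desc, unhealthy := state.event(deadline, time.Minute, false); unhealthy {
        		fmt.Println(desc) // "Task not running for min_healthy_time of 1m0s by deadline"
        	}
        }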