github.com/bigcommerce/nomad@v0.9.3-bc/client/allochealth/tracker.go (about)

     1  package allochealth
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"strings"
     7  	"sync"
     8  	"time"
     9  
    10  	"github.com/hashicorp/consul/api"
    11  	hclog "github.com/hashicorp/go-hclog"
    12  	cconsul "github.com/hashicorp/nomad/client/consul"
    13  	cstructs "github.com/hashicorp/nomad/client/structs"
    14  	"github.com/hashicorp/nomad/command/agent/consul"
    15  	"github.com/hashicorp/nomad/nomad/structs"
    16  )
    17  
const (
	// AllocHealthEventSource is the source used for emitting task events.
	// TaskEvents passes it to structs.NewTaskEvent for every event it builds.
	AllocHealthEventSource = "Alloc Unhealthy"

	// consulCheckLookupInterval is the interval at which we check if the
	// Consul checks are healthy or unhealthy. It is the period of the ticker
	// that drives watchConsulEvents.
	consulCheckLookupInterval = 500 * time.Millisecond
)
    26  
// Tracker tracks the health of an allocation and makes health events watchable
// via channels. It is created with NewTracker and begins watching when Start
// is called; results are observed through HealthyCh and AllocStoppedCh.
type Tracker struct {
	// ctx and cancelFn are used to shut down the tracker. ctx is derived from
	// the parent context handed to NewTracker and is cancelled once health
	// has been determined or the allocation has stopped.
	ctx      context.Context
	cancelFn context.CancelFunc

	// alloc is the alloc we are tracking
	alloc *structs.Allocation

	// tg is the task group we are tracking
	tg *structs.TaskGroup

	// minHealthyTime is the duration an alloc must remain healthy to be
	// considered healthy
	minHealthyTime time.Duration

	// useChecks specifies whether to use Consul health checks or not. The
	// Consul watcher is only started when it is true.
	useChecks bool

	// consulCheckCount is the number of checks the task group will attempt to
	// register. It is computed once in NewTracker by summing the checks of
	// every service of every task.
	consulCheckCount int

	// allocUpdates is a listener for retrieving new alloc updates
	allocUpdates *cstructs.AllocListener

	// consulClient is used to look up the state of the task's checks
	consulClient cconsul.ConsulServiceAPI

	// healthy is used to signal whether we have determined the allocation to be
	// healthy or unhealthy. It is created with a buffer of one and written
	// with a non-blocking send.
	healthy chan bool

	// allocStopped is triggered (closed) when the allocation is stopped and
	// tracking is not needed
	allocStopped chan struct{}

	// l is used to lock shared fields listed below
	l sync.Mutex

	// tasksHealthy marks whether all the tasks have met their health check
	// (disregards Consul)
	tasksHealthy bool

	// allocFailed marks whether the allocation failed
	allocFailed bool

	// checksHealthy marks whether all the task's Consul checks are healthy
	checksHealthy bool

	// taskHealth contains the health state for each task, keyed by task name
	taskHealth map[string]*taskHealthState

	// logger is used as given to NewTracker; no named sub-logger is created.
	logger hclog.Logger
}
    83  
    84  // NewTracker returns a health tracker for the given allocation. An alloc
    85  // listener and consul API object are given so that the watcher can detect
    86  // health changes.
    87  func NewTracker(parentCtx context.Context, logger hclog.Logger, alloc *structs.Allocation,
    88  	allocUpdates *cstructs.AllocListener, consulClient cconsul.ConsulServiceAPI,
    89  	minHealthyTime time.Duration, useChecks bool) *Tracker {
    90  
    91  	// Do not create a named sub-logger as the hook controlling
    92  	// this struct should pass in an appropriately named
    93  	// sub-logger.
    94  	t := &Tracker{
    95  		healthy:        make(chan bool, 1),
    96  		allocStopped:   make(chan struct{}),
    97  		alloc:          alloc,
    98  		tg:             alloc.Job.LookupTaskGroup(alloc.TaskGroup),
    99  		minHealthyTime: minHealthyTime,
   100  		useChecks:      useChecks,
   101  		allocUpdates:   allocUpdates,
   102  		consulClient:   consulClient,
   103  		logger:         logger,
   104  	}
   105  
   106  	t.taskHealth = make(map[string]*taskHealthState, len(t.tg.Tasks))
   107  	for _, task := range t.tg.Tasks {
   108  		t.taskHealth[task.Name] = &taskHealthState{task: task}
   109  	}
   110  
   111  	for _, task := range t.tg.Tasks {
   112  		for _, s := range task.Services {
   113  			t.consulCheckCount += len(s.Checks)
   114  		}
   115  	}
   116  
   117  	t.ctx, t.cancelFn = context.WithCancel(parentCtx)
   118  	return t
   119  }
   120  
   121  // Start starts the watcher.
   122  func (t *Tracker) Start() {
   123  	go t.watchTaskEvents()
   124  	if t.useChecks {
   125  		go t.watchConsulEvents()
   126  	}
   127  }
   128  
// HealthyCh returns a channel that will emit a boolean indicating the health of
// the allocation: true for healthy, false for unhealthy. The channel is
// receive-only for callers and is never closed by the tracker.
func (t *Tracker) HealthyCh() <-chan bool {
	return t.healthy
}
   134  
// AllocStoppedCh returns a channel that will be fired if the allocation is
// stopped. This means that health will not be set. The signal is delivered by
// closing the channel.
func (t *Tracker) AllocStoppedCh() <-chan struct{} {
	return t.allocStopped
}
   140  
   141  // TaskEvents returns a map of events by task. This should only be called after
   142  // health has been determined. Only tasks that have contributed to the
   143  // allocation being unhealthy will have an event.
   144  func (t *Tracker) TaskEvents() map[string]*structs.TaskEvent {
   145  	t.l.Lock()
   146  	defer t.l.Unlock()
   147  
   148  	// Nothing to do since the failure wasn't task related
   149  	if t.allocFailed {
   150  		return nil
   151  	}
   152  
   153  	deadline, _ := t.ctx.Deadline()
   154  	events := make(map[string]*structs.TaskEvent, len(t.tg.Tasks))
   155  
   156  	// Go through are task information and build the event map
   157  	for task, state := range t.taskHealth {
   158  		useChecks := t.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks
   159  		if e, ok := state.event(deadline, t.tg.Update.MinHealthyTime, useChecks); ok {
   160  			events[task] = structs.NewTaskEvent(AllocHealthEventSource).SetMessage(e)
   161  		}
   162  	}
   163  
   164  	return events
   165  }
   166  
   167  // setTaskHealth is used to set the tasks health as healthy or unhealthy. If the
   168  // allocation is terminal, health is immediately broadcasted.
   169  func (t *Tracker) setTaskHealth(healthy, terminal bool) {
   170  	t.l.Lock()
   171  	defer t.l.Unlock()
   172  	t.tasksHealthy = healthy
   173  
   174  	// If we are marked healthy but we also require Consul to be healthy and it
   175  	// isn't yet, return, unless the task is terminal
   176  	requireConsul := t.useChecks && t.consulCheckCount > 0
   177  	if !terminal && healthy && requireConsul && !t.checksHealthy {
   178  		return
   179  	}
   180  
   181  	select {
   182  	case t.healthy <- healthy:
   183  	default:
   184  	}
   185  
   186  	// Shutdown the tracker
   187  	t.cancelFn()
   188  }
   189  
   190  // setCheckHealth is used to mark the checks as either healthy or unhealthy.
   191  func (t *Tracker) setCheckHealth(healthy bool) {
   192  	t.l.Lock()
   193  	defer t.l.Unlock()
   194  	t.checksHealthy = healthy
   195  
   196  	// Only signal if we are healthy and so is the tasks
   197  	if !healthy || !t.tasksHealthy {
   198  		return
   199  	}
   200  
   201  	select {
   202  	case t.healthy <- healthy:
   203  	default:
   204  	}
   205  
   206  	// Shutdown the tracker
   207  	t.cancelFn()
   208  }
   209  
// markAllocStopped is used to mark the allocation as having stopped. It
// closes the allocStopped channel and then cancels the tracker's context.
// Closing an already closed channel panics, so this must run at most once
// per Tracker.
func (t *Tracker) markAllocStopped() {
	close(t.allocStopped)
	t.cancelFn()
}
   215  
   216  // watchTaskEvents is a long lived watcher that watches for the health of the
   217  // allocation's tasks.
   218  func (t *Tracker) watchTaskEvents() {
   219  	alloc := t.alloc
   220  	allStartedTime := time.Time{}
   221  	healthyTimer := time.NewTimer(0)
   222  	if !healthyTimer.Stop() {
   223  		select {
   224  		case <-healthyTimer.C:
   225  		default:
   226  		}
   227  	}
   228  
   229  	for {
   230  		// If the alloc is being stopped by the server just exit
   231  		switch alloc.DesiredStatus {
   232  		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
   233  			t.logger.Trace("desired status is terminal for alloc", "alloc_id", alloc.ID, "desired_status", alloc.DesiredStatus)
   234  			t.markAllocStopped()
   235  			return
   236  		}
   237  
   238  		// Store the task states
   239  		t.l.Lock()
   240  		for task, state := range alloc.TaskStates {
   241  			t.taskHealth[task].state = state
   242  		}
   243  		t.l.Unlock()
   244  
   245  		// Detect if the alloc is unhealthy or if all tasks have started yet
   246  		latestStartTime := time.Time{}
   247  		for _, state := range alloc.TaskStates {
   248  			// One of the tasks has failed so we can exit watching
   249  			if state.Failed || !state.FinishedAt.IsZero() {
   250  				t.setTaskHealth(false, true)
   251  				return
   252  			}
   253  
   254  			if state.State != structs.TaskStateRunning {
   255  				latestStartTime = time.Time{}
   256  				break
   257  			} else if state.StartedAt.After(latestStartTime) {
   258  				latestStartTime = state.StartedAt
   259  			}
   260  		}
   261  
   262  		// If the alloc is marked as failed by the client but none of the
   263  		// individual tasks failed, that means something failed at the alloc
   264  		// level.
   265  		if alloc.ClientStatus == structs.AllocClientStatusFailed {
   266  			t.l.Lock()
   267  			t.allocFailed = true
   268  			t.l.Unlock()
   269  			t.setTaskHealth(false, true)
   270  			return
   271  		}
   272  
   273  		if !latestStartTime.Equal(allStartedTime) {
   274  			// Avoid the timer from firing at the old start time
   275  			if !healthyTimer.Stop() {
   276  				select {
   277  				case <-healthyTimer.C:
   278  				default:
   279  				}
   280  			}
   281  
   282  			// Set the timer since all tasks are started
   283  			if !latestStartTime.IsZero() {
   284  				allStartedTime = latestStartTime
   285  				healthyTimer.Reset(t.minHealthyTime)
   286  			}
   287  		}
   288  
   289  		select {
   290  		case <-t.ctx.Done():
   291  			return
   292  		case newAlloc, ok := <-t.allocUpdates.Ch():
   293  			if !ok {
   294  				return
   295  			}
   296  			alloc = newAlloc
   297  		case <-healthyTimer.C:
   298  			t.setTaskHealth(true, false)
   299  		}
   300  	}
   301  }
   302  
   303  // watchConsulEvents is a long lived watcher for the health of the allocation's
   304  // Consul checks.
   305  func (t *Tracker) watchConsulEvents() {
   306  	// checkTicker is the ticker that triggers us to look at the checks in
   307  	// Consul
   308  	checkTicker := time.NewTicker(consulCheckLookupInterval)
   309  	defer checkTicker.Stop()
   310  
   311  	// healthyTimer fires when the checks have been healthy for the
   312  	// MinHealthyTime
   313  	healthyTimer := time.NewTimer(0)
   314  	if !healthyTimer.Stop() {
   315  		select {
   316  		case <-healthyTimer.C:
   317  		default:
   318  		}
   319  	}
   320  
   321  	// primed marks whether the healthy timer has been set
   322  	primed := false
   323  
   324  	// Store whether the last Consul checks call was successful or not
   325  	consulChecksErr := false
   326  
   327  	// allocReg are the registered objects in Consul for the allocation
   328  	var allocReg *consul.AllocRegistration
   329  
   330  OUTER:
   331  	for {
   332  		select {
   333  		case <-t.ctx.Done():
   334  			return
   335  		case <-checkTicker.C:
   336  			newAllocReg, err := t.consulClient.AllocRegistrations(t.alloc.ID)
   337  			if err != nil {
   338  				if !consulChecksErr {
   339  					consulChecksErr = true
   340  					t.logger.Warn("error looking up Consul registrations for allocation", "error", err, "alloc_id", t.alloc.ID)
   341  				}
   342  				continue OUTER
   343  			} else {
   344  				consulChecksErr = false
   345  				allocReg = newAllocReg
   346  			}
   347  		case <-healthyTimer.C:
   348  			t.setCheckHealth(true)
   349  		}
   350  
   351  		if allocReg == nil {
   352  			continue
   353  		}
   354  
   355  		// Store the task registrations
   356  		t.l.Lock()
   357  		for task, reg := range allocReg.Tasks {
   358  			t.taskHealth[task].taskRegistrations = reg
   359  		}
   360  		t.l.Unlock()
   361  
   362  		// Detect if all the checks are passing
   363  		passed := true
   364  
   365  	CHECKS:
   366  		for _, treg := range allocReg.Tasks {
   367  			for _, sreg := range treg.Services {
   368  				for _, check := range sreg.Checks {
   369  					if check.Status == api.HealthPassing {
   370  						continue
   371  					}
   372  
   373  					passed = false
   374  					t.setCheckHealth(false)
   375  					break CHECKS
   376  				}
   377  			}
   378  		}
   379  
   380  		if !passed {
   381  			// Reset the timer since we have transitioned back to unhealthy
   382  			if primed {
   383  				if !healthyTimer.Stop() {
   384  					select {
   385  					case <-healthyTimer.C:
   386  					default:
   387  					}
   388  				}
   389  				primed = false
   390  			}
   391  		} else if !primed {
   392  			// Reset the timer to fire after MinHealthyTime
   393  			if !healthyTimer.Stop() {
   394  				select {
   395  				case <-healthyTimer.C:
   396  				default:
   397  				}
   398  			}
   399  
   400  			primed = true
   401  			healthyTimer.Reset(t.minHealthyTime)
   402  		}
   403  	}
   404  }
   405  
// taskHealthState captures all known health information about a task. It is
// largely used to determine if the task has contributed to the allocation being
// unhealthy.
type taskHealthState struct {
	// task is the task definition; set when the tracker is created.
	task *structs.Task
	// state is the most recent task state seen by the task watcher; nil until
	// a state for this task has been observed.
	state *structs.TaskState
	// taskRegistrations is the most recent Consul registration seen by the
	// Consul watcher; nil until one has been observed.
	taskRegistrations *consul.TaskRegistration
}
   414  
   415  // event takes the deadline time for the allocation to be healthy and the update
   416  // strategy of the group. It returns true if the task has contributed to the
   417  // allocation being unhealthy and if so, an event description of why.
   418  func (t *taskHealthState) event(deadline time.Time, minHealthyTime time.Duration, useChecks bool) (string, bool) {
   419  	requireChecks := false
   420  	desiredChecks := 0
   421  	for _, s := range t.task.Services {
   422  		if nc := len(s.Checks); nc > 0 {
   423  			requireChecks = true
   424  			desiredChecks += nc
   425  		}
   426  	}
   427  	requireChecks = requireChecks && useChecks
   428  
   429  	if t.state != nil {
   430  		if t.state.Failed {
   431  			return "Unhealthy because of failed task", true
   432  		}
   433  		if t.state.State != structs.TaskStateRunning {
   434  			return "Task not running by deadline", true
   435  		}
   436  
   437  		// We are running so check if we have been running long enough
   438  		if t.state.StartedAt.Add(minHealthyTime).After(deadline) {
   439  			return fmt.Sprintf("Task not running for min_healthy_time of %v by deadline", minHealthyTime), true
   440  		}
   441  	}
   442  
   443  	if t.taskRegistrations != nil {
   444  		var notPassing []string
   445  		passing := 0
   446  
   447  	OUTER:
   448  		for _, sreg := range t.taskRegistrations.Services {
   449  			for _, check := range sreg.Checks {
   450  				if check.Status != api.HealthPassing {
   451  					notPassing = append(notPassing, sreg.Service.Service)
   452  					continue OUTER
   453  				} else {
   454  					passing++
   455  				}
   456  			}
   457  		}
   458  
   459  		if len(notPassing) != 0 {
   460  			return fmt.Sprintf("Services not healthy by deadline: %s", strings.Join(notPassing, ", ")), true
   461  		}
   462  
   463  		if passing != desiredChecks {
   464  			return fmt.Sprintf("Only %d out of %d checks registered and passing", passing, desiredChecks), true
   465  		}
   466  
   467  	} else if requireChecks {
   468  		return "Service checks not registered", true
   469  	}
   470  
   471  	return "", false
   472  }