github.com/manicqin/nomad@v0.9.5/client/allochealth/tracker.go

     1  package allochealth
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"strings"
     7  	"sync"
     8  	"time"
     9  
    10  	"github.com/hashicorp/consul/api"
    11  	hclog "github.com/hashicorp/go-hclog"
    12  	cconsul "github.com/hashicorp/nomad/client/consul"
    13  	cstructs "github.com/hashicorp/nomad/client/structs"
    14  	"github.com/hashicorp/nomad/command/agent/consul"
    15  	"github.com/hashicorp/nomad/nomad/structs"
    16  )
    17  
    18  const (
    19  	// AllocHealthEventSource is the source used for emitting task events
    20  	AllocHealthEventSource = "Alloc Unhealthy"
    21  
     22  	// consulCheckLookupInterval is the interval at which we check if the
    23  	// Consul checks are healthy or unhealthy.
    24  	consulCheckLookupInterval = 500 * time.Millisecond
    25  )
    26  
    27  // Tracker tracks the health of an allocation and makes health events watchable
    28  // via channels.
    29  type Tracker struct {
    30  	// ctx and cancelFn are used to shut down the tracker
    31  	ctx      context.Context
    32  	cancelFn context.CancelFunc
    33  
    34  	// alloc is the alloc we are tracking
    35  	alloc *structs.Allocation
    36  
    37  	// tg is the task group we are tracking
    38  	tg *structs.TaskGroup
    39  
    40  	// minHealthyTime is the duration an alloc must remain healthy to be
    41  	// considered healthy
    42  	minHealthyTime time.Duration
    43  
    44  	// useChecks specifies whether to use Consul health checks or not
    45  	useChecks bool
    46  
    47  	// consulCheckCount is the number of checks the task group will attempt to
    48  	// register
    49  	consulCheckCount int
    50  
    51  	// allocUpdates is a listener for retrieving new alloc updates
    52  	allocUpdates *cstructs.AllocListener
    53  
    54  	// consulClient is used to look up the state of the task's checks
    55  	consulClient cconsul.ConsulServiceAPI
    56  
    57  	// healthy is used to signal whether we have determined the allocation to be
    58  	// healthy or unhealthy
    59  	healthy chan bool
    60  
    61  	// allocStopped is triggered when the allocation is stopped and tracking is
    62  	// not needed
    63  	allocStopped chan struct{}
    64  
    65  	// l is used to lock shared fields listed below
    66  	l sync.Mutex
    67  
    68  	// tasksHealthy marks whether all the tasks have met their health check
    69  	// (disregards Consul)
    70  	tasksHealthy bool
    71  
    72  	// allocFailed marks whether the allocation failed
    73  	allocFailed bool
    74  
    75  	// checksHealthy marks whether all the tasks' Consul checks are healthy
    76  	checksHealthy bool
    77  
    78  	// taskHealth contains the health state for each task
    79  	taskHealth map[string]*taskHealthState
    80  
    81  	logger hclog.Logger
    82  }
    83  
    84  // NewTracker returns a health tracker for the given allocation. An alloc
    85  // listener and consul API object are given so that the watcher can detect
    86  // health changes.
    87  func NewTracker(parentCtx context.Context, logger hclog.Logger, alloc *structs.Allocation,
    88  	allocUpdates *cstructs.AllocListener, consulClient cconsul.ConsulServiceAPI,
    89  	minHealthyTime time.Duration, useChecks bool) *Tracker {
    90  
    91  	// Do not create a named sub-logger as the hook controlling
    92  	// this struct should pass in an appropriately named
    93  	// sub-logger.
    94  	t := &Tracker{
    95  		healthy:        make(chan bool, 1),
    96  		allocStopped:   make(chan struct{}),
    97  		alloc:          alloc,
    98  		tg:             alloc.Job.LookupTaskGroup(alloc.TaskGroup),
    99  		minHealthyTime: minHealthyTime,
   100  		useChecks:      useChecks,
   101  		allocUpdates:   allocUpdates,
   102  		consulClient:   consulClient,
   103  		logger:         logger,
   104  	}
   105  
   106  	t.taskHealth = make(map[string]*taskHealthState, len(t.tg.Tasks))
   107  	for _, task := range t.tg.Tasks {
   108  		t.taskHealth[task.Name] = &taskHealthState{task: task}
   109  	}
   110  
   111  	for _, task := range t.tg.Tasks {
   112  		for _, s := range task.Services {
   113  			t.consulCheckCount += len(s.Checks)
   114  		}
   115  	}
   116  
   117  	t.ctx, t.cancelFn = context.WithCancel(parentCtx)
   118  	return t
   119  }
   120  
   121  // Start starts the watcher.
   122  func (t *Tracker) Start() {
   123  	go t.watchTaskEvents()
   124  	if t.useChecks {
   125  		go t.watchConsulEvents()
   126  	}
   127  }
   128  
   129  // HealthyCh returns a channel that will emit a boolean indicating the health of
   130  // the allocation.
   131  func (t *Tracker) HealthyCh() <-chan bool {
   132  	return t.healthy
   133  }
   134  
    135  // AllocStoppedCh returns a channel that is closed if the allocation is
   136  // stopped. This means that health will not be set.
   137  func (t *Tracker) AllocStoppedCh() <-chan struct{} {
   138  	return t.allocStopped
   139  }
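
         // waitForAllocHealth is an illustrative sketch (the name and signature are
         // hypothetical, not part of this package's API) of how a caller such as a
         // deployment health hook might drive a Tracker and consume its channels.
         func waitForAllocHealth(ctx context.Context, t *Tracker) (healthy, stopped bool) {
         	t.Start()
         	select {
         	case <-ctx.Done():
         		// The caller stopped waiting before health was determined.
         		return false, false
         	case h := <-t.HealthyCh():
         		return h, false
         	case <-t.AllocStoppedCh():
         		// The allocation was stopped, so health will never be set.
         		return false, true
         	}
         }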
   140  
   141  // TaskEvents returns a map of events by task. This should only be called after
   142  // health has been determined. Only tasks that have contributed to the
   143  // allocation being unhealthy will have an event.
   144  func (t *Tracker) TaskEvents() map[string]*structs.TaskEvent {
   145  	t.l.Lock()
   146  	defer t.l.Unlock()
   147  
   148  	// Nothing to do since the failure wasn't task related
   149  	if t.allocFailed {
   150  		return nil
   151  	}
   152  
   153  	deadline, _ := t.ctx.Deadline()
   154  	events := make(map[string]*structs.TaskEvent, len(t.tg.Tasks))
   155  
    156  	// Go through our task information and build the event map
   157  	for task, state := range t.taskHealth {
   158  		useChecks := t.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks
   159  		if e, ok := state.event(deadline, t.tg.Update.MinHealthyTime, useChecks); ok {
   160  			events[task] = structs.NewTaskEvent(AllocHealthEventSource).SetMessage(e)
   161  		}
   162  	}
   163  
   164  	return events
   165  }
   166  
    167  // setTaskHealth is used to set the tasks' health as healthy or unhealthy. If the
    168  // allocation is terminal, health is immediately broadcast.
   169  func (t *Tracker) setTaskHealth(healthy, terminal bool) {
   170  	t.l.Lock()
   171  	defer t.l.Unlock()
   172  	t.tasksHealthy = healthy
   173  
   174  	// If we are marked healthy but we also require Consul to be healthy and it
   175  	// isn't yet, return, unless the task is terminal
   176  	requireConsul := t.useChecks && t.consulCheckCount > 0
   177  	if !terminal && healthy && requireConsul && !t.checksHealthy {
   178  		return
   179  	}
   180  
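         	// Broadcast the result without blocking: healthy is buffered with capacity
         	// one, so the value is retained for a consumer that has not started
         	// receiving yet, and dropped if a previous value is still pending.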
   181  	select {
   182  	case t.healthy <- healthy:
   183  	default:
   184  	}
   185  
   186  	// Shutdown the tracker
   187  	t.cancelFn()
   188  }
   189  
   190  // setCheckHealth is used to mark the checks as either healthy or unhealthy.
   191  func (t *Tracker) setCheckHealth(healthy bool) {
   192  	t.l.Lock()
   193  	defer t.l.Unlock()
   194  	t.checksHealthy = healthy
   195  
    196  	// Only signal if we are healthy and so are the tasks
   197  	if !healthy || !t.tasksHealthy {
   198  		return
   199  	}
   200  
   201  	select {
   202  	case t.healthy <- healthy:
   203  	default:
   204  	}
   205  
   206  	// Shutdown the tracker
   207  	t.cancelFn()
   208  }
   209  
   210  // markAllocStopped is used to mark the allocation as having stopped.
   211  func (t *Tracker) markAllocStopped() {
   212  	close(t.allocStopped)
   213  	t.cancelFn()
   214  }
   215  
    216  // watchTaskEvents is a long-lived watcher that watches for the health of the
   217  // allocation's tasks.
   218  func (t *Tracker) watchTaskEvents() {
   219  	alloc := t.alloc
   220  	allStartedTime := time.Time{}
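         	// Create the healthy timer stopped and drained so it cannot fire until it
         	// is Reset once every task has started running.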
   221  	healthyTimer := time.NewTimer(0)
   222  	if !healthyTimer.Stop() {
   223  		select {
   224  		case <-healthyTimer.C:
   225  		default:
   226  		}
   227  	}
   228  
   229  	for {
   230  		// If the alloc is being stopped by the server just exit
   231  		switch alloc.DesiredStatus {
   232  		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
   233  			t.logger.Trace("desired status is terminal for alloc", "alloc_id", alloc.ID, "desired_status", alloc.DesiredStatus)
   234  			t.markAllocStopped()
   235  			return
   236  		}
   237  
   238  		// Store the task states
   239  		t.l.Lock()
   240  		for task, state := range alloc.TaskStates {
   241  			//TODO(schmichael) for now skip unknown tasks as
   242  			//they're task group services which don't currently
   243  			//support checks anyway
   244  			if v, ok := t.taskHealth[task]; ok {
   245  				v.state = state
   246  			}
   247  		}
   248  		t.l.Unlock()
   249  
    250  		// Detect if the alloc is unhealthy and whether all tasks have started
   251  		latestStartTime := time.Time{}
   252  		for _, state := range alloc.TaskStates {
   253  			// One of the tasks has failed so we can exit watching
   254  			if state.Failed || !state.FinishedAt.IsZero() {
   255  				t.setTaskHealth(false, true)
   256  				return
   257  			}
   258  
   259  			if state.State != structs.TaskStateRunning {
   260  				latestStartTime = time.Time{}
   261  				break
   262  			} else if state.StartedAt.After(latestStartTime) {
   263  				latestStartTime = state.StartedAt
   264  			}
   265  		}
   266  
   267  		// If the alloc is marked as failed by the client but none of the
   268  		// individual tasks failed, that means something failed at the alloc
   269  		// level.
   270  		if alloc.ClientStatus == structs.AllocClientStatusFailed {
   271  			t.l.Lock()
   272  			t.allocFailed = true
   273  			t.l.Unlock()
   274  			t.setTaskHealth(false, true)
   275  			return
   276  		}
   277  
   278  		if !latestStartTime.Equal(allStartedTime) {
    279  			// Prevent the timer from firing at the old start time
   280  			if !healthyTimer.Stop() {
   281  				select {
   282  				case <-healthyTimer.C:
   283  				default:
   284  				}
   285  			}
   286  
    287  			// Set the timer since all tasks have started
   288  			if !latestStartTime.IsZero() {
   289  				allStartedTime = latestStartTime
   290  				healthyTimer.Reset(t.minHealthyTime)
   291  			}
   292  		}
   293  
   294  		select {
   295  		case <-t.ctx.Done():
   296  			return
   297  		case newAlloc, ok := <-t.allocUpdates.Ch():
   298  			if !ok {
   299  				return
   300  			}
   301  			alloc = newAlloc
   302  		case <-healthyTimer.C:
   303  			t.setTaskHealth(true, false)
   304  		}
   305  	}
   306  }
   307  
    308  // watchConsulEvents is a long-lived watcher for the health of the allocation's
   309  // Consul checks.
   310  func (t *Tracker) watchConsulEvents() {
   311  	// checkTicker is the ticker that triggers us to look at the checks in
   312  	// Consul
   313  	checkTicker := time.NewTicker(consulCheckLookupInterval)
   314  	defer checkTicker.Stop()
   315  
   316  	// healthyTimer fires when the checks have been healthy for the
   317  	// MinHealthyTime
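         	// The timer is created stopped and drained; it is only armed once every
         	// check is passing.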
   318  	healthyTimer := time.NewTimer(0)
   319  	if !healthyTimer.Stop() {
   320  		select {
   321  		case <-healthyTimer.C:
   322  		default:
   323  		}
   324  	}
   325  
   326  	// primed marks whether the healthy timer has been set
   327  	primed := false
   328  
   329  	// Store whether the last Consul checks call was successful or not
   330  	consulChecksErr := false
   331  
    332  	// allocReg holds the objects registered in Consul for the allocation
   333  	var allocReg *consul.AllocRegistration
   334  
   335  OUTER:
   336  	for {
   337  		select {
   338  		case <-t.ctx.Done():
   339  			return
   340  		case <-checkTicker.C:
   341  			newAllocReg, err := t.consulClient.AllocRegistrations(t.alloc.ID)
   342  			if err != nil {
   343  				if !consulChecksErr {
   344  					consulChecksErr = true
   345  					t.logger.Warn("error looking up Consul registrations for allocation", "error", err, "alloc_id", t.alloc.ID)
   346  				}
   347  				continue OUTER
   348  			} else {
   349  				consulChecksErr = false
   350  				allocReg = newAllocReg
   351  			}
   352  		case <-healthyTimer.C:
   353  			t.setCheckHealth(true)
   354  		}
   355  
   356  		if allocReg == nil {
   357  			continue
   358  		}
   359  
   360  		// Store the task registrations
   361  		t.l.Lock()
   362  		for task, reg := range allocReg.Tasks {
   363  			//TODO(schmichael) for now skip unknown tasks as
   364  			//they're task group services which don't currently
   365  			//support checks anyway
   366  			if v, ok := t.taskHealth[task]; ok {
   367  				v.taskRegistrations = reg
   368  			}
   369  		}
   370  		t.l.Unlock()
   371  
   372  		// Detect if all the checks are passing
   373  		passed := true
   374  
   375  	CHECKS:
   376  		for _, treg := range allocReg.Tasks {
   377  			for _, sreg := range treg.Services {
   378  				for _, check := range sreg.Checks {
   379  					if check.Status == api.HealthPassing {
   380  						continue
   381  					}
   382  
   383  					passed = false
   384  					t.setCheckHealth(false)
   385  					break CHECKS
   386  				}
   387  			}
   388  		}
   389  
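         		// Manage the healthy timer: any failing check stops and un-primes it;
         		// once all checks pass it is armed so they must stay healthy for
         		// minHealthyTime before setCheckHealth(true) fires above.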
   390  		if !passed {
    391  			// Stop the timer since we have transitioned back to unhealthy
   392  			if primed {
   393  				if !healthyTimer.Stop() {
   394  					select {
   395  					case <-healthyTimer.C:
   396  					default:
   397  					}
   398  				}
   399  				primed = false
   400  			}
   401  		} else if !primed {
   402  			// Reset the timer to fire after MinHealthyTime
   403  			if !healthyTimer.Stop() {
   404  				select {
   405  				case <-healthyTimer.C:
   406  				default:
   407  				}
   408  			}
   409  
   410  			primed = true
   411  			healthyTimer.Reset(t.minHealthyTime)
   412  		}
   413  	}
   414  }
   415  
   416  // taskHealthState captures all known health information about a task. It is
   417  // largely used to determine if the task has contributed to the allocation being
   418  // unhealthy.
   419  type taskHealthState struct {
   420  	task              *structs.Task
   421  	state             *structs.TaskState
   422  	taskRegistrations *consul.ServiceRegistrations
   423  }
   424  
   425  // event takes the deadline time for the allocation to be healthy and the update
    426  // strategy of the group. It returns a description of why the task is unhealthy
    427  // and true if the task has contributed to the allocation being unhealthy.
   428  func (t *taskHealthState) event(deadline time.Time, minHealthyTime time.Duration, useChecks bool) (string, bool) {
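         	// Count the Consul checks this task registers; they only gate health when
         	// the update strategy uses checks.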
   429  	requireChecks := false
   430  	desiredChecks := 0
   431  	for _, s := range t.task.Services {
   432  		if nc := len(s.Checks); nc > 0 {
   433  			requireChecks = true
   434  			desiredChecks += nc
   435  		}
   436  	}
   437  	requireChecks = requireChecks && useChecks
   438  
   439  	if t.state != nil {
   440  		if t.state.Failed {
   441  			return "Unhealthy because of failed task", true
   442  		}
   443  		if t.state.State != structs.TaskStateRunning {
   444  			return "Task not running by deadline", true
   445  		}
   446  
   447  		// We are running so check if we have been running long enough
   448  		if t.state.StartedAt.Add(minHealthyTime).After(deadline) {
   449  			return fmt.Sprintf("Task not running for min_healthy_time of %v by deadline", minHealthyTime), true
   450  		}
   451  	}
   452  
   453  	if t.taskRegistrations != nil {
   454  		var notPassing []string
   455  		passing := 0
   456  
   457  	OUTER:
   458  		for _, sreg := range t.taskRegistrations.Services {
   459  			for _, check := range sreg.Checks {
   460  				if check.Status != api.HealthPassing {
   461  					notPassing = append(notPassing, sreg.Service.Service)
   462  					continue OUTER
   463  				} else {
   464  					passing++
   465  				}
   466  			}
   467  		}
   468  
   469  		if len(notPassing) != 0 {
   470  			return fmt.Sprintf("Services not healthy by deadline: %s", strings.Join(notPassing, ", ")), true
   471  		}
   472  
   473  		if passing != desiredChecks {
   474  			return fmt.Sprintf("Only %d out of %d checks registered and passing", passing, desiredChecks), true
   475  		}
   476  
   477  	} else if requireChecks {
   478  		return "Service checks not registered", true
   479  	}
   480  
   481  	return "", false
   482  }