github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/client/allochealth/tracker.go

package allochealth

import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/go-hclog"
	"github.com/hashicorp/nomad/client/serviceregistration"
	"github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/nomad/structs"
)

const (
	// AllocHealthEventSource is the source used for emitting task events
	AllocHealthEventSource = "Alloc Unhealthy"

	// checkLookupInterval is the pace at which we check if the Consul or Nomad
	// checks for an allocation are healthy or unhealthy.
	checkLookupInterval = 500 * time.Millisecond
)

// Tracker tracks the health of an allocation and makes health events watchable
// via channels.
type Tracker struct {
	// ctx and cancelFn are used to shut down the tracker
	ctx      context.Context
	cancelFn context.CancelFunc

	// alloc is the alloc we are tracking
	alloc *structs.Allocation

	// tg is the task group we are tracking
	tg *structs.TaskGroup

	// minHealthyTime is the duration an alloc must remain healthy to be
	// considered healthy
	minHealthyTime time.Duration

	// checkLookupInterval is the repeated interval after which we check
	// if the Consul checks are healthy or unhealthy.
	checkLookupInterval time.Duration

	// useChecks specifies whether to consider Consul and Nomad service checks.
	useChecks bool

	// consulCheckCount is the total number of Consul service checks in the task
	// group, including task level checks.
	consulCheckCount int

	// nomadCheckCount is the total number of Nomad service checks in the task
	// group, including task level checks.
	nomadCheckCount int

	// allocUpdates is a listener for retrieving new alloc updates
	allocUpdates *cstructs.AllocListener

	// consulClient is used to look up the status of Consul service checks
	consulClient serviceregistration.Handler

	// checkStore is used to look up the status of Nomad service checks
	checkStore checkstore.Shim

	// healthy is used to signal whether we have determined the allocation to be
	// healthy or unhealthy
	healthy chan bool

	// allocStopped is triggered when the allocation is stopped and tracking is
	// not needed
	allocStopped chan struct{}

	// lifecycleTasks is a map of ephemeral tasks and their lifecycle hooks.
	// These tasks may terminate without affecting alloc health
	lifecycleTasks map[string]string

	// lock is used to lock shared fields listed below
	lock sync.Mutex

	// tasksHealthy marks whether all the tasks have met their health check
	// (disregards Consul and Nomad checks)
	tasksHealthy bool

	// allocFailed marks whether the allocation failed
	allocFailed bool

	// checksHealthy marks whether all the tasks' Consul checks are healthy
	checksHealthy bool

	// taskHealth contains the health state for each task in the allocation
	// name -> state
	taskHealth map[string]*taskHealthState

	// logger is for logging things
	logger hclog.Logger
}

// NewTracker returns a health tracker for the given allocation.
//
// Depending on job configuration, an allocation's health takes into consideration:
// - An alloc listener
// - Consul checks (via consul API)
// - Nomad checks (via client state)
func NewTracker(
	parentCtx context.Context,
	logger hclog.Logger,
	alloc *structs.Allocation,
	allocUpdates *cstructs.AllocListener,
	consulClient serviceregistration.Handler,
	checkStore checkstore.Shim,
	minHealthyTime time.Duration,
	useChecks bool,
) *Tracker {

	t := &Tracker{
		healthy:             make(chan bool, 1),
		allocStopped:        make(chan struct{}),
		alloc:               alloc,
		tg:                  alloc.Job.LookupTaskGroup(alloc.TaskGroup),
		minHealthyTime:      minHealthyTime,
		useChecks:           useChecks,
		allocUpdates:        allocUpdates,
		consulClient:        consulClient,
		checkStore:          checkStore,
		checkLookupInterval: checkLookupInterval,
		logger:              logger,
		lifecycleTasks:      map[string]string{},
	}

	t.taskHealth = make(map[string]*taskHealthState, len(t.tg.Tasks))
	for _, task := range t.tg.Tasks {
		t.taskHealth[task.Name] = &taskHealthState{task: task}

		if task.Lifecycle != nil && !task.Lifecycle.Sidecar {
			t.lifecycleTasks[task.Name] = task.Lifecycle.Hook
		}

		c, n := countChecks(task.Services)
		t.consulCheckCount += c
		t.nomadCheckCount += n
	}

	c, n := countChecks(t.tg.Services)
	t.consulCheckCount += c
	t.nomadCheckCount += n

	t.ctx, t.cancelFn = context.WithCancel(parentCtx)
	return t
}

func countChecks(services []*structs.Service) (consul, nomad int) {
	for _, service := range services {
		switch service.Provider {
		case structs.ServiceProviderNomad:
			nomad += len(service.Checks)
		default:
			consul += len(service.Checks)
		}
	}
	return
}

// Start starts the watcher.
func (t *Tracker) Start() {
	go t.watchTaskEvents()

	switch {
	case !t.useChecks:
		return
	case t.consulCheckCount > 0:
		go t.watchConsulEvents()
	case t.nomadCheckCount > 0:
		go t.watchNomadEvents()
	}
}

// HealthyCh returns a channel that will emit a boolean indicating the health of
// the allocation.
func (t *Tracker) HealthyCh() <-chan bool {
	return t.healthy
}

// AllocStoppedCh returns a channel that will be fired if the allocation is
// stopped. This means that health will not be set.
func (t *Tracker) AllocStoppedCh() <-chan struct{} {
	return t.allocStopped
}

// TaskEvents returns a map of events by task. This should only be called after
// health has been determined. Only tasks that have contributed to the
// allocation being unhealthy will have an event.
func (t *Tracker) TaskEvents() map[string]*structs.TaskEvent {
	t.lock.Lock()
	defer t.lock.Unlock()

	// Nothing to do since the failure wasn't task related
	if t.allocFailed {
		return nil
	}

	deadline, _ := t.ctx.Deadline()
	events := make(map[string]*structs.TaskEvent, len(t.tg.Tasks))

	// Go through our task information and build the event map
	for task, state := range t.taskHealth {
		useChecks := t.tg.Update.HealthCheck == structs.UpdateStrategyHealthCheck_Checks
		if e, ok := state.event(deadline, t.tg.Update.HealthyDeadline, t.tg.Update.MinHealthyTime, useChecks); ok {
			events[task] = structs.NewTaskEvent(AllocHealthEventSource).SetMessage(e)
		}
	}

	return events
}
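
// The sketch below is not part of the original file. It is a minimal,
// hypothetical illustration of how a caller might drive the Tracker API
// defined above: build a tracker, start its watchers, then wait for either a
// health verdict or a stop signal. The function name and the hard-coded
// minHealthyTime are assumptions made for this example only.
func exampleAwaitHealth(
	ctx context.Context,
	logger hclog.Logger,
	alloc *structs.Allocation,
	updates *cstructs.AllocListener,
	consul serviceregistration.Handler,
	checks checkstore.Shim,
) bool {
	tracker := NewTracker(ctx, logger, alloc, updates, consul, checks, 10*time.Second, true)
	tracker.Start()

	select {
	case healthy := <-tracker.HealthyCh():
		if !healthy {
			// TaskEvents explains which tasks contributed to the failure.
			for task, event := range tracker.TaskEvents() {
				logger.Warn("task unhealthy", "task", task, "event", event)
			}
		}
		return healthy
	case <-tracker.AllocStoppedCh():
		// The allocation was stopped; no health verdict will be emitted.
		return false
	case <-ctx.Done():
		return false
	}
}
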
// setTaskHealth is used to set the tasks' health as healthy or unhealthy. If the
// allocation is terminal, health is immediately broadcast.
func (t *Tracker) setTaskHealth(healthy, terminal bool) {
	t.lock.Lock()
	defer t.lock.Unlock()

	t.tasksHealthy = healthy

	// if unhealthy, force waiting for new checks health status
	if !terminal && !healthy {
		t.checksHealthy = false
		return
	}

	// If we are marked healthy but we also require Consul checks to be healthy
	// and they are not yet, return, unless the task is terminal.
	usesConsulChecks := t.useChecks && t.consulCheckCount > 0
	if !terminal && healthy && usesConsulChecks && !t.checksHealthy {
		return
	}

	// If we are marked healthy but also require Nomad checks to be healthy and
	// they are not yet, return, unless the task is terminal.
	usesNomadChecks := t.useChecks && t.nomadCheckCount > 0
	if !terminal && healthy && usesNomadChecks && !t.checksHealthy {
		return
	}

	select {
	case t.healthy <- healthy:
		// nothing
	default:
	}

	// Shutdown the tracker
	t.cancelFn()
}

// setCheckHealth is used to mark the checks as either healthy or unhealthy.
// It returns true if health is propagated and no more health monitoring is needed.
//
// todo: this is currently being shared by watchConsulEvents and watchNomadEvents,
// and must be split up if/when we support registering services (and thus checks)
// of different providers.
func (t *Tracker) setCheckHealth(healthy bool) bool {
	t.lock.Lock()
	defer t.lock.Unlock()

	// check health should always be false if tasks are unhealthy
	// as checks might be missing from unhealthy tasks
	t.checksHealthy = healthy && t.tasksHealthy

	// Only signal if we are healthy and so are the tasks
	if !t.checksHealthy {
		return false
	}

	select {
	case t.healthy <- healthy:
		// nothing
	default:
	}

	// Shutdown the tracker, things are healthy so nothing to do
	t.cancelFn()
	return true
}

// markAllocStopped is used to mark the allocation as having stopped.
func (t *Tracker) markAllocStopped() {
	close(t.allocStopped)
	t.cancelFn()
}

// watchTaskEvents is a long lived watcher that watches for the health of the
// allocation's tasks.
func (t *Tracker) watchTaskEvents() {
	alloc := t.alloc
	allStartedTime := time.Time{}

	waiter := newHealthyFuture()

	for {
		// If the alloc is being stopped by the server just exit
		switch alloc.DesiredStatus {
		case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict:
			t.logger.Trace("desired status is terminal for alloc", "alloc_id", alloc.ID, "desired_status", alloc.DesiredStatus)
			t.markAllocStopped()
			return
		}

		// Store the task states
		t.lock.Lock()
		for task, state := range alloc.TaskStates {
			//TODO(schmichael) for now skip unknown tasks as
			//they're task group services which don't currently
			//support checks anyway
			if v, ok := t.taskHealth[task]; ok {
				v.state = state
			}
		}
		t.lock.Unlock()

		// Detect if the alloc is unhealthy or if all tasks have started
		latestStartTime := time.Time{}
		for taskName, state := range alloc.TaskStates {
			// If the task is a poststop task we do not want to evaluate it
			// since it will remain pending until the main task has finished
			// or exited.
			if t.lifecycleTasks[taskName] == structs.TaskLifecycleHookPoststop {
				continue
			}

			// If this is a poststart task which has already succeeded, we
			// should skip evaluation.
			if t.lifecycleTasks[taskName] == structs.TaskLifecycleHookPoststart && state.Successful() {
				continue
			}

			// One of the tasks has failed so we can exit watching
			if state.Failed || (!state.FinishedAt.IsZero() && t.lifecycleTasks[taskName] != structs.TaskLifecycleHookPrestart) {
				t.setTaskHealth(false, true)
				return
			}

			if state.State == structs.TaskStatePending {
				latestStartTime = time.Time{}
				break
			} else if state.StartedAt.After(latestStartTime) {
				// task is either running or exited successfully
				latestStartTime = state.StartedAt
			}
		}

		// If the alloc is marked as failed by the client but none of the
		// individual tasks failed, that means something failed at the alloc
		// level.
		if alloc.ClientStatus == structs.AllocClientStatusFailed {
			t.lock.Lock()
			t.allocFailed = true
			t.lock.Unlock()

			t.setTaskHealth(false, true)
			return
		}

		if !latestStartTime.Equal(allStartedTime) {
			// reset task health
			t.setTaskHealth(false, false)

			// Prevent the timer from firing at the old start time
			waiter.disable()

			// Set the timer since all tasks are started
			if !latestStartTime.IsZero() {
				allStartedTime = latestStartTime
				waiter.wait(t.minHealthyTime)
			}
		}

		select {
		case <-t.ctx.Done():
			return
		case newAlloc, ok := <-t.allocUpdates.Ch():
			if !ok {
				return
			}
			alloc = newAlloc
		case <-waiter.C():
			t.setTaskHealth(true, false)
		}
	}
}

// healthyFuture is used to fire after checks have been healthy for MinHealthyTime
type healthyFuture struct {
	timer *time.Timer
}

// newHealthyFuture will create a healthyFuture in a disabled state, and
// will do nothing until a call to wait takes place
func newHealthyFuture() *healthyFuture {
	timer := time.NewTimer(0)
	ht := &healthyFuture{timer: timer}
	ht.disable()
	return ht
}

// disable the healthyFuture from triggering
func (h *healthyFuture) disable() {
	if !h.timer.Stop() {
		// must ensure channel is clear
		// https://pkg.go.dev/time#Timer.Stop
		select {
		case <-h.timer.C:
		default:
		}
	}
}

// wait will reset the healthyFuture to trigger after dur passes.
func (h *healthyFuture) wait(dur time.Duration) {
	// must ensure timer is stopped
	// https://pkg.go.dev/time#Timer.Reset
	h.disable()
	h.timer.Reset(dur)
}

// C returns a channel on which the future will send when ready.
func (h *healthyFuture) C() <-chan time.Time {
	return h.timer.C
}

// watchConsulEvents is a watcher for the health of the allocation's Consul
// checks. If all checks report healthy the watcher will exit after the
// MinHealthyTime has been reached, otherwise the watcher will continue to
// check unhealthy checks until the ctx is cancelled.
//
// Does not watch Nomad service checks; see watchNomadEvents for those.
func (t *Tracker) watchConsulEvents() {
	// checkTicker is the ticker that triggers us to look at the checks in Consul
	checkTicker := time.NewTicker(t.checkLookupInterval)
	defer checkTicker.Stop()

	// waiter is used to fire when the checks have been healthy for the MinHealthyTime
	waiter := newHealthyFuture()

	// primed marks whether the healthy waiter has been set
	primed := false

	// Store whether the last Consul checks call was successful or not
	consulChecksErr := false

	// allocReg are the registered objects in Consul for the allocation
	var allocReg *serviceregistration.AllocRegistration

OUTER:
	for {
		select {

		// we are shutting down
		case <-t.ctx.Done():
			return

		// it is time to check the checks
		case <-checkTicker.C:
			newAllocReg, err := t.consulClient.AllocRegistrations(t.alloc.ID)
			if err != nil {
				if !consulChecksErr {
					consulChecksErr = true
					t.logger.Warn("error looking up Consul registrations for allocation", "error", err, "alloc_id", t.alloc.ID)
				}
				continue OUTER
			} else {
				consulChecksErr = false
				allocReg = newAllocReg
			}

		// enough time has passed with healthy checks
		case <-waiter.C():
			if t.setCheckHealth(true) {
				// final health set and propagated
				return
			}
			// checks are healthy but tasks are unhealthy,
			// reset and wait until all is healthy
			primed = false
		}

		if allocReg == nil {
			continue
		}

		// Store the task registrations
		t.lock.Lock()
		for task, reg := range allocReg.Tasks {
			if v, ok := t.taskHealth[task]; ok {
				v.taskRegistrations = reg
			}
		}
		t.lock.Unlock()

		// Detect if all the checks are passing
		passed := true

	CHECKS:
		for _, treg := range allocReg.Tasks {
			for _, sreg := range treg.Services {
				for _, check := range sreg.Checks {
					onUpdate := sreg.CheckOnUpdate[check.CheckID]
					switch check.Status {
					case api.HealthPassing:
						continue
					case api.HealthWarning:
						if onUpdate == structs.OnUpdateIgnoreWarn || onUpdate == structs.OnUpdateIgnore {
							continue
						}
					case api.HealthCritical:
						if onUpdate == structs.OnUpdateIgnore {
							continue
						}
					default:
					}

					passed = false
					t.setCheckHealth(false)
					break CHECKS
				}
			}
		}

		if !passed {
			// Reset the timer since we have transitioned back to unhealthy
			if primed {
				primed = false
				waiter.disable()
			}
		} else if !primed {
			// Reset the timer to fire after MinHealthyTime
			primed = true
			waiter.disable()
			waiter.wait(t.minHealthyTime)
		}
	}
}

// watchNomadEvents is a watcher for the health of the allocation's Nomad checks.
// If all checks report healthy the watcher will exit after the MinHealthyTime has
// been reached, otherwise the watcher will continue to check unhealthy checks until
// the ctx is cancelled.
//
// Does not watch Consul service checks; see watchConsulEvents for those.
func (t *Tracker) watchNomadEvents() {
	// checkTicker is the timer that triggers us to look at the checks in Nomad
	checkTicker, cancel := helper.NewSafeTimer(t.checkLookupInterval)
	defer cancel()

	// waiter is used to fire when the checks have been healthy for the MinHealthyTime
	waiter := newHealthyFuture()

	// allocID of the allocation we are watching checks for
	allocID := t.alloc.ID

	// primed marks whether the healthy waiter has been set
	primed := false

	// latest set of nomad check results
	var results map[structs.CheckID]*structs.CheckQueryResult

	for {
		select {

		// tracker has been canceled, so stop waiting
		case <-t.ctx.Done():
			return

		// it is time to check the checks
		case <-checkTicker.C:
			results = t.checkStore.List(allocID)
			checkTicker.Reset(t.checkLookupInterval)

		// enough time has passed with healthy checks
		case <-waiter.C():
			if t.setCheckHealth(true) { // todo(shoenig) this needs to be split between Consul and Nomad
				return // final health set and propagated
			}
			// checks are healthy but tasks are unhealthy, reset and wait
			primed = false
		}

		// scan to see if any checks are failing
		passing := true
		for _, result := range results {
			switch result.Status {
			case structs.CheckSuccess:
				continue
			case structs.CheckFailure:
				if result.Mode == structs.Readiness {
					continue
				}
				passing = false
			default:
				// i.e. pending check; do not consider healthy or ready
				passing = false
			}

			if !passing {
				break // 1+ check is failing; no need to continue
			}
		}

		if !passing {
			// at least one check is failing, transition to unhealthy
			t.setCheckHealth(false)
			primed = false
			waiter.disable()
		}

		if passing && !primed {
			// healthy but not yet primed, set timer to wait
			primed = true
			waiter.wait(t.minHealthyTime)
		}
	}
}

// taskHealthState captures all known health information about a task. It is
// largely used to determine if the task has contributed to the allocation being
// unhealthy.
type taskHealthState struct {
	task              *structs.Task
	state             *structs.TaskState
	taskRegistrations *serviceregistration.ServiceRegistrations
}

// event takes the deadline time for the allocation to be healthy and the update
// strategy of the group. It returns true if the task has contributed to the
// allocation being unhealthy and if so, an event description of why.
func (t *taskHealthState) event(deadline time.Time, healthyDeadline, minHealthyTime time.Duration, useChecks bool) (string, bool) {
	desiredChecks := 0
	for _, s := range t.task.Services {
		if nc := len(s.Checks); nc > 0 {
			desiredChecks += nc
		}
	}
	requireChecks := (desiredChecks > 0) && useChecks

	if t.state != nil {
		if t.state.Failed {
			return "Unhealthy because of failed task", true
		}

		switch t.state.State {
		case structs.TaskStatePending:
			return fmt.Sprintf("Task not running by healthy_deadline of %v", healthyDeadline), true
		case structs.TaskStateDead:
			// non-sidecar hook lifecycle tasks are healthy if they exit with success
			if t.task.Lifecycle == nil || t.task.Lifecycle.Sidecar {
				return "Unhealthy because of dead task", true
			}
		case structs.TaskStateRunning:
			// We are running so check if we have been running long enough
			if t.state.StartedAt.Add(minHealthyTime).After(deadline) {
				return fmt.Sprintf("Task not running for min_healthy_time of %v by healthy_deadline of %v", minHealthyTime, healthyDeadline), true
			}
		}
	}

	if t.taskRegistrations != nil {
		var notPassing []string
		passing := 0

	OUTER:
		for _, sreg := range t.taskRegistrations.Services {
			for _, check := range sreg.Checks {
				if check.Status != api.HealthPassing {
					notPassing = append(notPassing, sreg.Service.Service)
					continue OUTER
				} else {
					passing++
				}
			}
		}

		if len(notPassing) != 0 {
			return fmt.Sprintf("Services not healthy by deadline: %s", strings.Join(notPassing, ", ")), true
		}

		if passing != desiredChecks {
			return fmt.Sprintf("Only %d out of %d checks registered and passing", passing, desiredChecks), true
		}

	} else if requireChecks {
		return "Service checks not registered", true
	}

	return "", false
}
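
// The sketch below is not part of the original file. It is a hypothetical
// illustration of how taskHealthState.event distills a failure reason: a task
// that is still pending at the healthy deadline yields a descriptive message.
// The function name and the literal durations are assumptions for this example.
func exampleExplainPendingTask() (string, bool) {
	ths := &taskHealthState{
		task:  &structs.Task{Name: "web"},
		state: &structs.TaskState{State: structs.TaskStatePending},
	}

	// With a pending task, event reports that the task was not running by the
	// healthy_deadline, regardless of check state.
	deadline := time.Now().Add(5 * time.Minute)
	return ths.event(deadline, 5*time.Minute, 10*time.Second, false)
	// -> "Task not running by healthy_deadline of 5m0s", true
}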