github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/client/allochealth/tracker_test.go

package allochealth

import (
	"context"
	"fmt"
	"sync/atomic"
	"testing"
	"time"

	consulapi "github.com/hashicorp/consul/api"
	"github.com/hashicorp/nomad/ci"
	"github.com/hashicorp/nomad/client/serviceregistration"
	"github.com/hashicorp/nomad/client/serviceregistration/checks/checkstore"
	regmock "github.com/hashicorp/nomad/client/serviceregistration/mock"
	"github.com/hashicorp/nomad/client/state"
	cstructs "github.com/hashicorp/nomad/client/structs"
	"github.com/hashicorp/nomad/helper"
	"github.com/hashicorp/nomad/helper/testlog"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/shoenig/test/must"
	"github.com/stretchr/testify/require"
)

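// TestTracker_ConsulChecks_Healthy asserts that an alloc with a running task
// and a passing Consul check is reported healthy on the tracker's HealthyCh.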
func TestTracker_ConsulChecks_Healthy(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.Alloc()
	alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = 1 // let's speed things up
	task := alloc.Job.TaskGroups[0].Tasks[0]

	// Synthesize running alloc and tasks
	alloc.ClientStatus = structs.AllocClientStatusRunning
	alloc.TaskStates = map[string]*structs.TaskState{
		task.Name: {
			State:     structs.TaskStateRunning,
			StartedAt: time.Now(),
		},
	}

	// Make Consul response
	check := &consulapi.AgentCheck{
		Name:   task.Services[0].Checks[0].Name,
		Status: consulapi.HealthPassing,
	}
	taskRegs := map[string]*serviceregistration.ServiceRegistrations{
		task.Name: {
			Services: map[string]*serviceregistration.ServiceRegistration{
				task.Services[0].Name: {
					Service: &consulapi.AgentService{
						ID:      "foo",
						Service: task.Services[0].Name,
					},
					Checks: []*consulapi.AgentCheck{check},
				},
			},
		},
	}

	logger := testlog.HCLogger(t)
	b := cstructs.NewAllocBroadcaster(logger)
	defer b.Close()

	// Don't reply on the first call
	var called uint64
	consul := regmock.NewServiceRegistrationHandler(logger)
	consul.AllocRegistrationsFn = func(string) (*serviceregistration.AllocRegistration, error) {
		if atomic.AddUint64(&called, 1) == 1 {
			return nil, nil
		}

		reg := &serviceregistration.AllocRegistration{
			Tasks: taskRegs,
		}

		return reg, nil
	}

	ctx, cancelFn := context.WithCancel(context.Background())
	defer cancelFn()

	checks := checkstore.NewStore(logger, state.NewMemDB(logger))
	checkInterval := 10 * time.Millisecond
	tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, time.Millisecond, true)
	tracker.checkLookupInterval = checkInterval
	tracker.Start()

	select {
	case <-time.After(4 * checkInterval):
		require.Fail(t, "timed out while waiting for health")
	case h := <-tracker.HealthyCh():
		require.True(t, h)
	}
}

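// TestTracker_NomadChecks_Healthy asserts that an alloc using the Nomad
// service provider is reported healthy once its check transitions from
// pending to success.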
func TestTracker_NomadChecks_Healthy(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.Alloc()
	alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = 1 // let's speed things up
	alloc.Job.TaskGroups[0].Tasks[0].Services[0].Provider = "nomad"

	logger := testlog.HCLogger(t)
	b := cstructs.NewAllocBroadcaster(logger)
	defer b.Close()

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Synthesize running alloc and tasks
	alloc.ClientStatus = structs.AllocClientStatusRunning
	alloc.TaskStates = map[string]*structs.TaskState{
		alloc.Job.TaskGroups[0].Tasks[0].Name: {
			State:     structs.TaskStateRunning,
			StartedAt: time.Now(),
		},
	}

	checks := checkstore.NewStore(logger, state.NewMemDB(logger))
	err := checks.Set(alloc.ID, &structs.CheckQueryResult{
		ID:        "abc123",
		Mode:      "healthiness",
		Status:    "pending",
		Output:    "nomad: waiting to run",
		Timestamp: time.Now().Unix(),
		Group:     alloc.TaskGroup,
		Task:      alloc.Job.TaskGroups[0].Tasks[0].Name,
		Service:   alloc.Job.TaskGroups[0].Tasks[0].Services[0].Name,
		Check:     alloc.Job.TaskGroups[0].Tasks[0].Services[0].Checks[0].Name,
	})
	must.NoError(t, err)

	consul := regmock.NewServiceRegistrationHandler(logger)
	checkInterval := 10 * time.Millisecond
	tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, time.Millisecond, true)
	tracker.checkLookupInterval = checkInterval
	tracker.Start()

	go func() {
		// wait a bit then update the check to passing
		time.Sleep(15 * time.Millisecond)
		must.NoError(t, checks.Set(alloc.ID, &structs.CheckQueryResult{
			ID:        "abc123",
			Mode:      "healthiness",
			Status:    "success",
			Output:    "nomad: http ok",
			Timestamp: time.Now().Unix(),
			Group:     alloc.TaskGroup,
			Task:      alloc.Job.TaskGroups[0].Tasks[0].Name,
			Service:   alloc.Job.TaskGroups[0].Tasks[0].Services[0].Name,
			Check:     alloc.Job.TaskGroups[0].Tasks[0].Services[0].Checks[0].Name,
		}))
	}()

	select {
	case <-time.After(4 * checkInterval):
		t.Fatalf("timed out while waiting for success")
	case healthy := <-tracker.HealthyCh():
		must.True(t, healthy)
	}
}

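// TestTracker_NomadChecks_Unhealthy asserts that an alloc using the Nomad
// service provider is never reported healthy while its check is failing.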
func TestTracker_NomadChecks_Unhealthy(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.Alloc()
	alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = 1 // let's speed things up
	alloc.Job.TaskGroups[0].Tasks[0].Services[0].Provider = "nomad"

	logger := testlog.HCLogger(t)
	b := cstructs.NewAllocBroadcaster(logger)
	defer b.Close()

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Synthesize running alloc and tasks
	alloc.ClientStatus = structs.AllocClientStatusRunning
	alloc.TaskStates = map[string]*structs.TaskState{
		alloc.Job.TaskGroups[0].Tasks[0].Name: {
			State:     structs.TaskStateRunning,
			StartedAt: time.Now(),
		},
	}

	checks := checkstore.NewStore(logger, state.NewMemDB(logger))
	err := checks.Set(alloc.ID, &structs.CheckQueryResult{
		ID:        "abc123",
		Mode:      "healthiness",
		Status:    "pending", // start out pending
		Output:    "nomad: waiting to run",
		Timestamp: time.Now().Unix(),
		Group:     alloc.TaskGroup,
		Task:      alloc.Job.TaskGroups[0].Tasks[0].Name,
		Service:   alloc.Job.TaskGroups[0].Tasks[0].Services[0].Name,
		Check:     alloc.Job.TaskGroups[0].Tasks[0].Services[0].Checks[0].Name,
	})
	must.NoError(t, err)

	consul := regmock.NewServiceRegistrationHandler(logger)
	checkInterval := 10 * time.Millisecond
	tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, time.Millisecond, true)
	tracker.checkLookupInterval = checkInterval
	tracker.Start()

	go func() {
		// wait a bit then update the check to failing
		time.Sleep(15 * time.Millisecond)
		must.NoError(t, checks.Set(alloc.ID, &structs.CheckQueryResult{
			ID:        "abc123",
			Mode:      "healthiness",
			Status:    "failing",
			Output:    "connection refused",
			Timestamp: time.Now().Unix(),
			Group:     alloc.TaskGroup,
			Task:      alloc.Job.TaskGroups[0].Tasks[0].Name,
			Service:   alloc.Job.TaskGroups[0].Tasks[0].Services[0].Name,
			Check:     alloc.Job.TaskGroups[0].Tasks[0].Services[0].Checks[0].Name,
		}))
	}()

	// make sure we are always unhealthy across 4 check intervals
	for i := 0; i < 4; i++ {
		<-time.After(checkInterval)
		select {
		case <-tracker.HealthyCh():
			t.Fatalf("should not receive on healthy chan with failing check")
		default:
		}
	}
}

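// TestTracker_Checks_PendingPostStop_Healthy asserts that a pending poststop
// task does not prevent the alloc from being reported healthy.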
func TestTracker_Checks_PendingPostStop_Healthy(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.LifecycleAllocWithPoststopDeploy()
	alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = 1 // let's speed things up

	// Synthesize running alloc and tasks
	alloc.ClientStatus = structs.AllocClientStatusRunning
	alloc.TaskStates = map[string]*structs.TaskState{
		"web": {
			State:     structs.TaskStateRunning,
			StartedAt: time.Now(),
		},
		"post": {
			State: structs.TaskStatePending,
		},
	}

	logger := testlog.HCLogger(t)
	b := cstructs.NewAllocBroadcaster(logger)
	defer b.Close()

	consul := regmock.NewServiceRegistrationHandler(logger)
	ctx, cancelFn := context.WithCancel(context.Background())
	defer cancelFn()

	checks := checkstore.NewStore(logger, state.NewMemDB(logger))
	checkInterval := 10 * time.Millisecond
	tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, time.Millisecond, true)
	tracker.checkLookupInterval = checkInterval
	tracker.Start()

	select {
	case <-time.After(4 * checkInterval):
		require.Fail(t, "timed out while waiting for health")
	case h := <-tracker.HealthyCh():
		require.True(t, h)
	}
}

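// TestTracker_Succeeded_PostStart_Healthy asserts that a poststart task that
// completes before the group's minimum healthy time elapses does not prevent
// the alloc from being reported healthy.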
func TestTracker_Succeeded_PostStart_Healthy(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.LifecycleAllocWithPoststartDeploy()
	alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = time.Millisecond * 1
	// Synthesize running alloc and tasks
	alloc.ClientStatus = structs.AllocClientStatusRunning
	alloc.TaskStates = map[string]*structs.TaskState{
		"web": {
			State:     structs.TaskStateRunning,
			StartedAt: time.Now(),
		},
		"post": {
			State:      structs.TaskStateDead,
			StartedAt:  time.Now(),
			FinishedAt: time.Now().Add(alloc.Job.TaskGroups[0].Migrate.MinHealthyTime / 2),
		},
	}

	logger := testlog.HCLogger(t)
	b := cstructs.NewAllocBroadcaster(logger)
	defer b.Close()

	consul := regmock.NewServiceRegistrationHandler(logger)
	ctx, cancelFn := context.WithCancel(context.Background())
	defer cancelFn()

	checks := checkstore.NewStore(logger, state.NewMemDB(logger))
	checkInterval := 10 * time.Millisecond
	tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, alloc.Job.TaskGroups[0].Migrate.MinHealthyTime, true)
	tracker.checkLookupInterval = checkInterval
	tracker.Start()

	select {
	case <-time.After(alloc.Job.TaskGroups[0].Migrate.MinHealthyTime * 2):
		require.Fail(t, "timed out while waiting for health")
	case h := <-tracker.HealthyCh():
		require.True(t, h)
	}
}

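// TestTracker_ConsulChecks_Unhealthy asserts that an alloc with a critical
// Consul check is not reported healthy, even though another check passes.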
func TestTracker_ConsulChecks_Unhealthy(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.Alloc()
	alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = 1 // let's speed things up
	task := alloc.Job.TaskGroups[0].Tasks[0]

	newCheck := task.Services[0].Checks[0].Copy()
	newCheck.Name = "failing-check"
	task.Services[0].Checks = append(task.Services[0].Checks, newCheck)

	// Synthesize running alloc and tasks
	alloc.ClientStatus = structs.AllocClientStatusRunning
	alloc.TaskStates = map[string]*structs.TaskState{
		task.Name: {
			State:     structs.TaskStateRunning,
			StartedAt: time.Now(),
		},
	}

	// Make Consul response
	checkHealthy := &consulapi.AgentCheck{
		Name:   task.Services[0].Checks[0].Name,
		Status: consulapi.HealthPassing,
	}
	checksUnhealthy := &consulapi.AgentCheck{
		Name:   task.Services[0].Checks[1].Name,
		Status: consulapi.HealthCritical,
	}
	taskRegs := map[string]*serviceregistration.ServiceRegistrations{
		task.Name: {
			Services: map[string]*serviceregistration.ServiceRegistration{
				task.Services[0].Name: {
					Service: &consulapi.AgentService{
						ID:      "foo",
						Service: task.Services[0].Name,
					},
					Checks: []*consulapi.AgentCheck{checkHealthy, checksUnhealthy},
				},
			},
		},
	}

	logger := testlog.HCLogger(t)
	b := cstructs.NewAllocBroadcaster(logger)
	defer b.Close()

	// Don't reply on the first call
	var called uint64
	consul := regmock.NewServiceRegistrationHandler(logger)
	consul.AllocRegistrationsFn = func(string) (*serviceregistration.AllocRegistration, error) {
		if atomic.AddUint64(&called, 1) == 1 {
			return nil, nil
		}

		reg := &serviceregistration.AllocRegistration{
			Tasks: taskRegs,
		}

		return reg, nil
	}

	ctx, cancelFn := context.WithCancel(context.Background())
	defer cancelFn()

	checks := checkstore.NewStore(logger, state.NewMemDB(logger))
	checkInterval := 10 * time.Millisecond
	tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, time.Millisecond, true)
	tracker.checkLookupInterval = checkInterval
	tracker.Start()

	// wait until several task registration lookups have occurred
	testutil.WaitForResult(func() (bool, error) {
		lookup := atomic.LoadUint64(&called)
		return lookup >= 4, fmt.Errorf("wait to get more task registration lookups: %v", lookup)
	}, func(err error) {
		require.NoError(t, err)
	})

	tracker.lock.Lock()
	require.False(t, tracker.checksHealthy)
	tracker.lock.Unlock()

	select {
	case v := <-tracker.HealthyCh():
		require.Failf(t, "expected no health value", " got %v", v)
	default:
		// good
	}
}

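// TestTracker_Healthy_IfBothTasksAndConsulChecksAreHealthy asserts that health
// is only propagated once both task health and check health are true at the
// same time.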
func TestTracker_Healthy_IfBothTasksAndConsulChecksAreHealthy(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.Alloc()
	logger := testlog.HCLogger(t)

	ctx, cancelFn := context.WithCancel(context.Background())
	defer cancelFn()

	tracker := NewTracker(ctx, logger, alloc, nil, nil, nil, time.Millisecond, true)

	assertNoHealth := func() {
		require.NoError(t, tracker.ctx.Err())
		select {
		case v := <-tracker.HealthyCh():
			require.Failf(t, "unexpected healthy event", "got %v", v)
		default:
		}
	}

	// first set task health without checks
	tracker.setTaskHealth(true, false)
	assertNoHealth()

	// now fail task health again before checks are successful
	tracker.setTaskHealth(false, false)
	assertNoHealth()

	// now pass health checks - do not propagate health yet
	tracker.setCheckHealth(true)
	assertNoHealth()

	// set tasks to healthy - don't propagate health yet, wait for the next check
	tracker.setTaskHealth(true, false)
	assertNoHealth()

	// set checks to true, now propagate health status
	tracker.setCheckHealth(true)

	require.Error(t, tracker.ctx.Err())
	select {
	case v := <-tracker.HealthyCh():
		require.True(t, v)
	default:
		require.Fail(t, "expected a health status")
	}
}

// TestTracker_Checks_Healthy_Before_TaskHealth asserts that an alloc whose
// checks pass before its tasks are healthy is only marked healthy once the
// tasks become healthy as well.
func TestTracker_Checks_Healthy_Before_TaskHealth(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.Alloc()
	alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = 1 // let's speed things up
	task := alloc.Job.TaskGroups[0].Tasks[0]

	// new task starting unhealthy, without services
	task2 := task.Copy()
	task2.Name = task2.Name + "2"
	task2.Services = nil
	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)

	// Synthesize running alloc and tasks
	alloc.ClientStatus = structs.AllocClientStatusRunning
	alloc.TaskStates = map[string]*structs.TaskState{
		task.Name: {
			State:     structs.TaskStateRunning,
			StartedAt: time.Now(),
		},
		task2.Name: {
			State: structs.TaskStatePending,
		},
	}

	// Make Consul response
	check := &consulapi.AgentCheck{
		Name:   task.Services[0].Checks[0].Name,
		Status: consulapi.HealthPassing,
	}
	taskRegs := map[string]*serviceregistration.ServiceRegistrations{
		task.Name: {
			Services: map[string]*serviceregistration.ServiceRegistration{
				task.Services[0].Name: {
					Service: &consulapi.AgentService{
						ID:      "foo",
						Service: task.Services[0].Name,
					},
					Checks: []*consulapi.AgentCheck{check},
				},
			},
		},
	}

	logger := testlog.HCLogger(t)
	b := cstructs.NewAllocBroadcaster(logger)
	defer b.Close()

	// Don't reply on the first call
	var called uint64
	consul := regmock.NewServiceRegistrationHandler(logger)
	consul.AllocRegistrationsFn = func(string) (*serviceregistration.AllocRegistration, error) {
		if atomic.AddUint64(&called, 1) == 1 {
			return nil, nil
		}

		reg := &serviceregistration.AllocRegistration{
			Tasks: taskRegs,
		}

		return reg, nil
	}

	ctx, cancelFn := context.WithCancel(context.Background())
	defer cancelFn()

	checks := checkstore.NewStore(logger, state.NewMemDB(logger))
	checkInterval := 10 * time.Millisecond
	tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, time.Millisecond, true)
	tracker.checkLookupInterval = checkInterval
	tracker.Start()

	// assert that we don't get marked healthy
	select {
	case <-time.After(4 * checkInterval):
		// still unhealthy, good
	case h := <-tracker.HealthyCh():
		require.Fail(t, "unexpected health event", h)
	}

	helper.WithLock(&tracker.lock, func() {
		require.False(t, tracker.tasksHealthy)
		require.False(t, tracker.checksHealthy)
	})

	// now set task to healthy
	runningAlloc := alloc.Copy()
	runningAlloc.TaskStates = map[string]*structs.TaskState{
		task.Name: {
			State:     structs.TaskStateRunning,
			StartedAt: time.Now(),
		},
		task2.Name: {
			State:     structs.TaskStateRunning,
			StartedAt: time.Now(),
		},
	}
	err := b.Send(runningAlloc)
	require.NoError(t, err)

	// eventually, it is marked as healthy
	select {
	case <-time.After(4 * checkInterval):
		require.Fail(t, "timed out while waiting for health")
	case h := <-tracker.HealthyCh():
		require.True(t, h)
	}
}

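// TestTracker_ConsulChecks_OnUpdate asserts that the on_update setting of a
// Consul check controls whether warning and critical statuses block alloc
// health.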
func TestTracker_ConsulChecks_OnUpdate(t *testing.T) {
	ci.Parallel(t)

	cases := []struct {
		desc          string
		checkOnUpdate string
		consulResp    string
		expectedPass  bool
	}{
		{
			desc:          "check require_healthy consul healthy",
			checkOnUpdate: structs.OnUpdateRequireHealthy,
			consulResp:    consulapi.HealthPassing,
			expectedPass:  true,
		},
		{
			desc:          "check on_update ignore_warning, consul warn",
			checkOnUpdate: structs.OnUpdateIgnoreWarn,
			consulResp:    consulapi.HealthWarning,
			expectedPass:  true,
		},
		{
			desc:          "check on_update ignore_warning, consul critical",
			checkOnUpdate: structs.OnUpdateIgnoreWarn,
			consulResp:    consulapi.HealthCritical,
			expectedPass:  false,
		},
		{
			desc:          "check on_update ignore_warning, consul healthy",
			checkOnUpdate: structs.OnUpdateIgnoreWarn,
			consulResp:    consulapi.HealthPassing,
			expectedPass:  true,
		},
		{
			desc:          "check on_update ignore, consul critical",
			checkOnUpdate: structs.OnUpdateIgnore,
			consulResp:    consulapi.HealthCritical,
			expectedPass:  true,
		},
	}

	for _, tc := range cases {
		t.Run(tc.desc, func(t *testing.T) {

			alloc := mock.Alloc()
			alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = 1 // let's speed things up
			task := alloc.Job.TaskGroups[0].Tasks[0]

			// Synthesize running alloc and tasks
			alloc.ClientStatus = structs.AllocClientStatusRunning
			alloc.TaskStates = map[string]*structs.TaskState{
				task.Name: {
					State:     structs.TaskStateRunning,
					StartedAt: time.Now(),
				},
			}

			// Make Consul response
			check := &consulapi.AgentCheck{
				Name:   task.Services[0].Checks[0].Name,
				Status: tc.consulResp,
			}
			taskRegs := map[string]*serviceregistration.ServiceRegistrations{
				task.Name: {
					Services: map[string]*serviceregistration.ServiceRegistration{
						task.Services[0].Name: {
							Service: &consulapi.AgentService{
								ID:      "foo",
								Service: task.Services[0].Name,
							},
							Checks: []*consulapi.AgentCheck{check},
							CheckOnUpdate: map[string]string{
								check.CheckID: tc.checkOnUpdate,
							},
						},
					},
				},
			}

			logger := testlog.HCLogger(t)
			b := cstructs.NewAllocBroadcaster(logger)
			defer b.Close()

			// Don't reply on the first call
			var called uint64
			consul := regmock.NewServiceRegistrationHandler(logger)
			consul.AllocRegistrationsFn = func(string) (*serviceregistration.AllocRegistration, error) {
				if atomic.AddUint64(&called, 1) == 1 {
					return nil, nil
				}

				reg := &serviceregistration.AllocRegistration{
					Tasks: taskRegs,
				}

				return reg, nil
			}

			ctx, cancelFn := context.WithCancel(context.Background())
			defer cancelFn()

			checks := checkstore.NewStore(logger, state.NewMemDB(logger))
			checkInterval := 10 * time.Millisecond
			tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, time.Millisecond, true)
			tracker.checkLookupInterval = checkInterval
			tracker.Start()

			select {
			case <-time.After(4 * checkInterval):
				if !tc.expectedPass {
					// tracker should still be running
					require.Nil(t, tracker.ctx.Err())
					return
				}
				require.Fail(t, "timed out while waiting for health")
			case h := <-tracker.HealthyCh():
				require.True(t, h)
			}

			// For healthy checks, the tracker should stop watching
			select {
			case <-tracker.ctx.Done():
				// Ok, tracker should exit after reporting healthy
			default:
				require.Fail(t, "expected tracker to exit after reporting healthy")
			}
		})
	}
}

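// TestTracker_NomadChecks_OnUpdate asserts that a Nomad check in readiness
// mode never blocks alloc health, while a failing check in healthiness mode
// does.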
func TestTracker_NomadChecks_OnUpdate(t *testing.T) {
	ci.Parallel(t)

	cases := []struct {
		name         string
		checkMode    structs.CheckMode
		checkResult  structs.CheckStatus
		expectedPass bool
	}{
		{
			name:         "mode is healthiness and check is healthy",
			checkMode:    structs.Healthiness,
			checkResult:  structs.CheckSuccess,
			expectedPass: true,
		},
		{
			name:         "mode is healthiness and check is unhealthy",
			checkMode:    structs.Healthiness,
			checkResult:  structs.CheckFailure,
			expectedPass: false,
		},
		{
			name:         "mode is readiness and check is healthy",
			checkMode:    structs.Readiness,
			checkResult:  structs.CheckSuccess,
			expectedPass: true,
		},
		{
			name:         "mode is readiness and check is unhealthy",
			checkMode:    structs.Readiness,
			checkResult:  structs.CheckFailure,
			expectedPass: true,
		},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			alloc := mock.Alloc()
			alloc.Job.TaskGroups[0].Migrate.MinHealthyTime = 1 // let's speed things up
			alloc.Job.TaskGroups[0].Tasks[0].Services[0].Provider = "nomad"

			logger := testlog.HCLogger(t)
			b := cstructs.NewAllocBroadcaster(logger)
			defer b.Close()

			// Synthesize running alloc and tasks
			alloc.ClientStatus = structs.AllocClientStatusRunning
			alloc.TaskStates = map[string]*structs.TaskState{
				alloc.Job.TaskGroups[0].Tasks[0].Name: {
					State:     structs.TaskStateRunning,
					StartedAt: time.Now(),
				},
			}

			// Set a check that is pending
			checks := checkstore.NewStore(logger, state.NewMemDB(logger))
			err := checks.Set(alloc.ID, &structs.CheckQueryResult{
				ID:        "abc123",
				Mode:      tc.checkMode,
				Status:    structs.CheckPending,
				Output:    "nomad: waiting to run",
				Timestamp: time.Now().Unix(),
				Group:     alloc.TaskGroup,
				Task:      alloc.Job.TaskGroups[0].Tasks[0].Name,
				Service:   alloc.Job.TaskGroups[0].Tasks[0].Services[0].Name,
				Check:     alloc.Job.TaskGroups[0].Tasks[0].Services[0].Checks[0].Name,
			})
			must.NoError(t, err)

			go func() {
				// wait a bit then update the check to the result under test
				time.Sleep(15 * time.Millisecond)
				must.NoError(t, checks.Set(alloc.ID, &structs.CheckQueryResult{
					ID:        "abc123",
					Mode:      tc.checkMode,
					Status:    tc.checkResult,
					Output:    "some output",
					Timestamp: time.Now().Unix(),
					Group:     alloc.TaskGroup,
					Task:      alloc.Job.TaskGroups[0].Tasks[0].Name,
					Service:   alloc.Job.TaskGroups[0].Tasks[0].Services[0].Name,
					Check:     alloc.Job.TaskGroups[0].Tasks[0].Services[0].Checks[0].Name,
				}))
			}()

			ctx, cancel := context.WithCancel(context.Background())
			defer cancel()

			consul := regmock.NewServiceRegistrationHandler(logger)
			minHealthyTime := 1 * time.Millisecond
			tracker := NewTracker(ctx, logger, alloc, b.Listen(), consul, checks, minHealthyTime, true)
			tracker.checkLookupInterval = 10 * time.Millisecond
			tracker.Start()

			select {
			case <-time.After(8 * tracker.checkLookupInterval):
				if !tc.expectedPass {
					// tracker should still be running
					must.NoError(t, tracker.ctx.Err())
					return
				}
				t.Fatal("timed out while waiting for health")
			case h := <-tracker.HealthyCh():
				require.True(t, h)
			}

			// For healthy checks, the tracker should stop watching
			select {
			case <-tracker.ctx.Done():
				// Ok, tracker should exit after reporting healthy
			default:
				t.Fatal("expected tracker to exit after reporting healthy")
			}
		})
	}
}