github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/client/allocrunner/alloc_runner_test.go

package allocrunner

import (
	"errors"
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"testing"
	"time"

	"github.com/hashicorp/consul/api"
	multierror "github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/ci"
	"github.com/hashicorp/nomad/client/allochealth"
	"github.com/hashicorp/nomad/client/allocrunner/tasklifecycle"
	"github.com/hashicorp/nomad/client/allocrunner/taskrunner"
	"github.com/hashicorp/nomad/client/allocwatcher"
	"github.com/hashicorp/nomad/client/serviceregistration"
	regMock "github.com/hashicorp/nomad/client/serviceregistration/mock"
	"github.com/hashicorp/nomad/client/state"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/stretchr/testify/require"
)

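// The tests below share a common harness: testAllocRunnerConfig builds an
// allocRunner config whose StateUpdater is a MockStateUpdater (its Last
// method returns the most recently published allocation update), and whose
// tasks use mock_driver, where the "run_for" config value controls how long
// a task runs. Assertions poll the latest update via testutil.WaitForResult
// instead of sleeping for fixed intervals.
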
// destroy does a blocking destroy on an alloc runner
func destroy(ar *allocRunner) {
	ar.Destroy()
	<-ar.DestroyCh()
}

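// Note that Destroy itself returns without waiting; destroy blocks by
// receiving on DestroyCh, which is closed once the alloc runner has finished
// shutting down and cleaning up.
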
// TestAllocRunner_AllocState_Initialized asserts that TaskStates returned by
// AllocState() are initialized even before the AllocRunner has run.
func TestAllocRunner_AllocState_Initialized(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.Alloc()
	alloc.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)

	allocState := ar.AllocState()

	require.NotNil(t, allocState)
	require.NotNil(t, allocState.TaskStates[conf.Alloc.Job.TaskGroups[0].Tasks[0].Name])
}

// TestAllocRunner_TaskLeader_KillTG asserts that when a leader task dies the
// entire task group is killed.
func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.BatchAlloc()
	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0
	alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy.Attempts = 0

	// Create two tasks in the task group
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Name = "task1"
	task.Driver = "mock_driver"
	task.KillTimeout = 10 * time.Millisecond
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "task2"
	task2.Driver = "mock_driver"
	task2.Leader = true
	task2.Config = map[string]interface{}{
		"run_for": "1s",
	}
	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)
	alloc.AllocatedResources.Tasks[task.Name] = tr
	alloc.AllocatedResources.Tasks[task2.Name] = tr

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	defer destroy(ar)
	go ar.Run()

	// Wait for all tasks to be killed
	upd := conf.StateUpdater.(*MockStateUpdater)
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		// Task1 should be killed because Task2 exited
		state1 := last.TaskStates[task.Name]
		if state1.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
		}
		if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() {
			return false, fmt.Errorf("expected to have a start and finish time")
		}
		if len(state1.Events) < 2 {
			// At least have a received and destroyed
			return false, fmt.Errorf("Unexpected number of events")
		}

		found := false
		killingMsg := ""
		for _, e := range state1.Events {
			if e.Type == structs.TaskLeaderDead {
				found = true
			}
			if e.Type == structs.TaskKilling {
				killingMsg = e.DisplayMessage
			}
		}

		if !found {
			return false, fmt.Errorf("Did not find event %v", structs.TaskLeaderDead)
		}

		expectedKillingMsg := "Sent interrupt. Waiting 10ms before force killing"
		if killingMsg != expectedKillingMsg {
			return false, fmt.Errorf("Unexpected task event message - wanted %q. got %q", expectedKillingMsg, killingMsg)
		}

		// Task2 should be dead
		state2 := last.TaskStates[task2.Name]
		if state2.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
		}
		if state2.FinishedAt.IsZero() || state2.StartedAt.IsZero() {
			return false, fmt.Errorf("expected to have a start and finish time")
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

// TestAllocRunner_Lifecycle_Poststart asserts that a service job with 2
// poststart lifecycle hooks (1 sidecar, 1 ephemeral) starts all 3 tasks, only
// the ephemeral one finishes, and the other 2 exit when the alloc is stopped.
func TestAllocRunner_Lifecycle_Poststart(t *testing.T) {
	alloc := mock.LifecycleAlloc()

	alloc.Job.Type = structs.JobTypeService
	mainTask := alloc.Job.TaskGroups[0].Tasks[0]
	mainTask.Config["run_for"] = "100s"

	sidecarTask := alloc.Job.TaskGroups[0].Tasks[1]
	sidecarTask.Lifecycle.Hook = structs.TaskLifecycleHookPoststart
	sidecarTask.Config["run_for"] = "100s"

	ephemeralTask := alloc.Job.TaskGroups[0].Tasks[2]
	ephemeralTask.Lifecycle.Hook = structs.TaskLifecycleHookPoststart

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	defer destroy(ar)
	go ar.Run()

	upd := conf.StateUpdater.(*MockStateUpdater)

	// Wait for the main and sidecar tasks to be running, and for the
	// ephemeral task to have run and exited.
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		if last.ClientStatus != structs.AllocClientStatusRunning {
			return false, fmt.Errorf("expected alloc to be running not %s", last.ClientStatus)
		}

		if s := last.TaskStates[mainTask.Name].State; s != structs.TaskStateRunning {
			return false, fmt.Errorf("expected main task to be running not %s", s)
		}

		if s := last.TaskStates[sidecarTask.Name].State; s != structs.TaskStateRunning {
			return false, fmt.Errorf("expected sidecar task to be running not %s", s)
		}

		if s := last.TaskStates[ephemeralTask.Name].State; s != structs.TaskStateDead {
			return false, fmt.Errorf("expected ephemeral task to be dead not %s", s)
		}

		if last.TaskStates[ephemeralTask.Name].Failed {
			return false, fmt.Errorf("expected ephemeral task to be successful not failed")
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("error waiting for initial state:\n%v", err)
	})

	// Tell the alloc to stop
	stopAlloc := alloc.Copy()
	stopAlloc.DesiredStatus = structs.AllocDesiredStatusStop
	ar.Update(stopAlloc)

	// Wait for the main and sidecar tasks to stop.
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()

		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("expected alloc to be complete not %s", last.ClientStatus)
		}

		if s := last.TaskStates[mainTask.Name].State; s != structs.TaskStateDead {
			return false, fmt.Errorf("expected main task to be dead not %s", s)
		}

		if last.TaskStates[mainTask.Name].Failed {
			return false, fmt.Errorf("expected main task to be successful not failed")
		}

		if s := last.TaskStates[sidecarTask.Name].State; s != structs.TaskStateDead {
			return false, fmt.Errorf("expected sidecar task to be dead not %s", s)
		}

		if last.TaskStates[sidecarTask.Name].Failed {
			return false, fmt.Errorf("expected sidecar task to be successful not failed")
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("error waiting for stopped state:\n%v", err)
	})
}

// TestAllocRunner_TaskMain_KillTG asserts that when main tasks die the
// entire task group is killed.
func TestAllocRunner_TaskMain_KillTG(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.BatchAlloc()
	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0
	alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy.Attempts = 0

	// Create four tasks in the task group
	prestart := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	prestart.Name = "prestart-sidecar"
	prestart.Driver = "mock_driver"
	prestart.KillTimeout = 10 * time.Millisecond
	prestart.Lifecycle = &structs.TaskLifecycleConfig{
		Hook:    structs.TaskLifecycleHookPrestart,
		Sidecar: true,
	}

	prestart.Config = map[string]interface{}{
		"run_for": "100s",
	}

	poststart := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	poststart.Name = "poststart-sidecar"
	poststart.Driver = "mock_driver"
	poststart.KillTimeout = 10 * time.Millisecond
	poststart.Lifecycle = &structs.TaskLifecycleConfig{
		Hook:    structs.TaskLifecycleHookPoststart,
		Sidecar: true,
	}

	poststart.Config = map[string]interface{}{
		"run_for": "100s",
	}

	// NOTE: both main tasks share the name "task2", so they collapse into a
	// single entry in the task state map; the main1 and main2 assertions
	// below therefore inspect the same state.
	main1 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	main1.Name = "task2"
	main1.Driver = "mock_driver"
	main1.Config = map[string]interface{}{
		"run_for": "1s",
	}

	main2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	main2.Name = "task2"
	main2.Driver = "mock_driver"
	main2.Config = map[string]interface{}{
		"run_for": "2s",
	}

	alloc.Job.TaskGroups[0].Tasks = []*structs.Task{prestart, poststart, main1, main2}
	alloc.AllocatedResources.Tasks = map[string]*structs.AllocatedTaskResources{
		prestart.Name:  tr,
		poststart.Name: tr,
		main1.Name:     tr,
		main2.Name:     tr,
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	defer destroy(ar)
	go ar.Run()

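	// hasTaskMainEvent reports whether the task state contains a
	// TaskMainDead event, i.e. the task was stopped because the group's main
	// tasks had died.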
	hasTaskMainEvent := func(state *structs.TaskState) bool {
		for _, e := range state.Events {
			if e.Type == structs.TaskMainDead {
				return true
			}
		}

		return false
	}

	// Wait for all tasks to be killed
	upd := conf.StateUpdater.(*MockStateUpdater)
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		var state *structs.TaskState

		// both sidecars should be killed because the main tasks exited
		state = last.TaskStates[prestart.Name]
		if state == nil {
			return false, fmt.Errorf("could not find state for task %s", prestart.Name)
		}
		if state.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state.State, structs.TaskStateDead)
		}
		if state.FinishedAt.IsZero() || state.StartedAt.IsZero() {
			return false, fmt.Errorf("expected to have a start and finish time")
		}
		if len(state.Events) < 2 {
			// At least have a received and destroyed
			return false, fmt.Errorf("Unexpected number of events")
		}

		if !hasTaskMainEvent(state) {
			return false, fmt.Errorf("Did not find event %v: %#+v", structs.TaskMainDead, state.Events)
		}

		state = last.TaskStates[poststart.Name]
		if state == nil {
			return false, fmt.Errorf("could not find state for task %s", poststart.Name)
		}
		if state.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state.State, structs.TaskStateDead)
		}
		if state.FinishedAt.IsZero() || state.StartedAt.IsZero() {
			return false, fmt.Errorf("expected to have a start and finish time")
		}
		if len(state.Events) < 2 {
			// At least have a received and destroyed
			return false, fmt.Errorf("Unexpected number of events")
		}

		if !hasTaskMainEvent(state) {
			return false, fmt.Errorf("Did not find event %v: %#+v", structs.TaskMainDead, state.Events)
		}

		// main tasks should die naturally
		state = last.TaskStates[main1.Name]
		if state.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state.State, structs.TaskStateDead)
		}
		if state.FinishedAt.IsZero() || state.StartedAt.IsZero() {
			return false, fmt.Errorf("expected to have a start and finish time")
		}
		if hasTaskMainEvent(state) {
			return false, fmt.Errorf("unexpected event %v in %#+v", structs.TaskMainDead, state.Events)
		}

		state = last.TaskStates[main2.Name]
		if state.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state.State, structs.TaskStateDead)
		}
		if state.FinishedAt.IsZero() || state.StartedAt.IsZero() {
			return false, fmt.Errorf("expected to have a start and finish time")
		}
		if hasTaskMainEvent(state) {
			return false, fmt.Errorf("unexpected event %v in %#+v", structs.TaskMainDead, state.Events)
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

// TestAllocRunner_Lifecycle_Poststop asserts that a service job with a
// poststop lifecycle hook keeps the poststop task pending while the main
// task runs, and only runs the poststop task once the alloc is stopped.
func TestAllocRunner_Lifecycle_Poststop(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.LifecycleAlloc()
	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]

	alloc.Job.Type = structs.JobTypeService
	mainTask := alloc.Job.TaskGroups[0].Tasks[0]
	mainTask.Config["run_for"] = "100s"

	ephemeralTask := alloc.Job.TaskGroups[0].Tasks[1]
	ephemeralTask.Name = "quit"
	ephemeralTask.Lifecycle.Hook = structs.TaskLifecycleHookPoststop
	ephemeralTask.Config["run_for"] = "10s"

	alloc.Job.TaskGroups[0].Tasks = []*structs.Task{mainTask, ephemeralTask}
	alloc.AllocatedResources.Tasks = map[string]*structs.AllocatedTaskResources{
		mainTask.Name:      tr,
		ephemeralTask.Name: tr,
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	defer destroy(ar)
	go ar.Run()

	upd := conf.StateUpdater.(*MockStateUpdater)

	// Wait for the main task to be running
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		if last.ClientStatus != structs.AllocClientStatusRunning {
			return false, fmt.Errorf("expected alloc to be running not %s", last.ClientStatus)
		}

		if s := last.TaskStates[mainTask.Name].State; s != structs.TaskStateRunning {
			return false, fmt.Errorf("expected main task to be running not %s", s)
		}

		if s := last.TaskStates[ephemeralTask.Name].State; s != structs.TaskStatePending {
			return false, fmt.Errorf("expected ephemeral task to be pending not %s", s)
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("error waiting for initial state:\n%v", err)
	})

	// Tell the alloc to stop
	stopAlloc := alloc.Copy()
	stopAlloc.DesiredStatus = structs.AllocDesiredStatusStop
	ar.Update(stopAlloc)

	// Wait for the main task to die & the poststop task to run.
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()

		if last.ClientStatus != structs.AllocClientStatusRunning {
			return false, fmt.Errorf("expected alloc to be running not %s", last.ClientStatus)
		}

		if s := last.TaskStates[mainTask.Name].State; s != structs.TaskStateDead {
			return false, fmt.Errorf("expected main task to be dead not %s", s)
		}

		if s := last.TaskStates[ephemeralTask.Name].State; s != structs.TaskStateRunning {
			return false, fmt.Errorf("expected poststop task to be running not %s", s)
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("error waiting for poststop state:\n%v", err)
	})
}

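// TestAllocRunner_Lifecycle_Restart asserts that restarting individual tasks
// or entire allocations that use lifecycle hooks produces the expected task
// states and restart counts for main, sidecar, one-shot, and poststop tasks.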
func TestAllocRunner_Lifecycle_Restart(t *testing.T) {
	ci.Parallel(t)

	// test cases can use this default alloc or override it via the taskDefs
	// field
	alloc := mock.LifecycleAllocFromTasks([]mock.LifecycleTaskDef{
		{Name: "main", RunFor: "100s", ExitCode: 0, Hook: "", IsSidecar: false},
		{Name: "prestart-oneshot", RunFor: "1s", ExitCode: 0, Hook: "prestart", IsSidecar: false},
		{Name: "prestart-sidecar", RunFor: "100s", ExitCode: 0, Hook: "prestart", IsSidecar: true},
		{Name: "poststart-oneshot", RunFor: "1s", ExitCode: 0, Hook: "poststart", IsSidecar: false},
		{Name: "poststart-sidecar", RunFor: "100s", ExitCode: 0, Hook: "poststart", IsSidecar: true},
		{Name: "poststop", RunFor: "1s", ExitCode: 0, Hook: "poststop", IsSidecar: false},
	})
	alloc.Job.Type = structs.JobTypeService
	rp := &structs.RestartPolicy{
		Attempts: 1,
		Interval: 10 * time.Minute,
		Delay:    1 * time.Nanosecond,
		Mode:     structs.RestartPolicyModeFail,
	}

	ev := &structs.TaskEvent{Type: structs.TaskRestartSignal}

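	// Each case below optionally overrides the default tasks via taskDefs,
	// performs an action against a running allocation (a restart, a server
	// update, or simply waiting), and then asserts the expected per-task
	// states and restart counts. An empty expectedErr means the action must
	// succeed.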
	testCases := []struct {
		name          string
		taskDefs      []mock.LifecycleTaskDef
		isBatch       bool
		hasLeader     bool
		action        func(*allocRunner, *structs.Allocation) error
		expectedErr   string
		expectedAfter map[string]structs.TaskState
	}{
		{
			name: "restart entire allocation",
			action: func(ar *allocRunner, alloc *structs.Allocation) error {
				return ar.RestartAll(ev)
			},
			expectedAfter: map[string]structs.TaskState{
				"main":              structs.TaskState{State: "running", Restarts: 1},
				"prestart-oneshot":  structs.TaskState{State: "dead", Restarts: 1},
				"prestart-sidecar":  structs.TaskState{State: "running", Restarts: 1},
				"poststart-oneshot": structs.TaskState{State: "dead", Restarts: 1},
				"poststart-sidecar": structs.TaskState{State: "running", Restarts: 1},
				"poststop":          structs.TaskState{State: "pending", Restarts: 0},
			},
		},
		{
			name: "restart only running tasks",
			action: func(ar *allocRunner, alloc *structs.Allocation) error {
				return ar.RestartRunning(ev)
			},
			expectedAfter: map[string]structs.TaskState{
				"main":              structs.TaskState{State: "running", Restarts: 1},
				"prestart-oneshot":  structs.TaskState{State: "dead", Restarts: 0},
				"prestart-sidecar":  structs.TaskState{State: "running", Restarts: 1},
				"poststart-oneshot": structs.TaskState{State: "dead", Restarts: 0},
				"poststart-sidecar": structs.TaskState{State: "running", Restarts: 1},
				"poststop":          structs.TaskState{State: "pending", Restarts: 0},
			},
		},
		{
			name: "batch job restart entire allocation",
			taskDefs: []mock.LifecycleTaskDef{
				{Name: "main", RunFor: "100s", ExitCode: 1, Hook: "", IsSidecar: false},
				{Name: "prestart-oneshot", RunFor: "1s", ExitCode: 0, Hook: "prestart", IsSidecar: false},
				{Name: "prestart-sidecar", RunFor: "100s", ExitCode: 0, Hook: "prestart", IsSidecar: true},
				{Name: "poststart-oneshot", RunFor: "1s", ExitCode: 0, Hook: "poststart", IsSidecar: false},
				{Name: "poststart-sidecar", RunFor: "100s", ExitCode: 0, Hook: "poststart", IsSidecar: true},
				{Name: "poststop", RunFor: "1s", ExitCode: 0, Hook: "poststop", IsSidecar: false},
			},
			isBatch: true,
			action: func(ar *allocRunner, alloc *structs.Allocation) error {
				return ar.RestartAll(ev)
			},
			expectedAfter: map[string]structs.TaskState{
				"main":              structs.TaskState{State: "running", Restarts: 1},
				"prestart-oneshot":  structs.TaskState{State: "dead", Restarts: 1},
				"prestart-sidecar":  structs.TaskState{State: "running", Restarts: 1},
				"poststart-oneshot": structs.TaskState{State: "dead", Restarts: 1},
				"poststart-sidecar": structs.TaskState{State: "running", Restarts: 1},
				"poststop":          structs.TaskState{State: "pending", Restarts: 0},
			},
		},
		{
			name: "batch job restart only running tasks",
			taskDefs: []mock.LifecycleTaskDef{
				{Name: "main", RunFor: "100s", ExitCode: 1, Hook: "", IsSidecar: false},
				{Name: "prestart-oneshot", RunFor: "1s", ExitCode: 0, Hook: "prestart", IsSidecar: false},
				{Name: "prestart-sidecar", RunFor: "100s", ExitCode: 0, Hook: "prestart", IsSidecar: true},
				{Name: "poststart-oneshot", RunFor: "1s", ExitCode: 0, Hook: "poststart", IsSidecar: false},
				{Name: "poststart-sidecar", RunFor: "100s", ExitCode: 0, Hook: "poststart", IsSidecar: true},
				{Name: "poststop", RunFor: "1s", ExitCode: 0, Hook: "poststop", IsSidecar: false},
			},
			isBatch: true,
			action: func(ar *allocRunner, alloc *structs.Allocation) error {
				return ar.RestartRunning(ev)
			},
			expectedAfter: map[string]structs.TaskState{
				"main":              structs.TaskState{State: "running", Restarts: 1},
				"prestart-oneshot":  structs.TaskState{State: "dead", Restarts: 0},
				"prestart-sidecar":  structs.TaskState{State: "running", Restarts: 1},
				"poststart-oneshot": structs.TaskState{State: "dead", Restarts: 0},
				"poststart-sidecar": structs.TaskState{State: "running", Restarts: 1},
				"poststop":          structs.TaskState{State: "pending", Restarts: 0},
			},
		},
		{
			name:      "restart entire allocation with leader",
			hasLeader: true,
			action: func(ar *allocRunner, alloc *structs.Allocation) error {
				return ar.RestartAll(ev)
			},
			expectedAfter: map[string]structs.TaskState{
				"main":              structs.TaskState{State: "running", Restarts: 1},
				"prestart-oneshot":  structs.TaskState{State: "dead", Restarts: 1},
				"prestart-sidecar":  structs.TaskState{State: "running", Restarts: 1},
				"poststart-oneshot": structs.TaskState{State: "dead", Restarts: 1},
				"poststart-sidecar": structs.TaskState{State: "running", Restarts: 1},
				"poststop":          structs.TaskState{State: "pending", Restarts: 0},
			},
		},
		{
			name: "stop from server",
			action: func(ar *allocRunner, alloc *structs.Allocation) error {
				stopAlloc := alloc.Copy()
				stopAlloc.DesiredStatus = structs.AllocDesiredStatusStop
				ar.Update(stopAlloc)
				return nil
			},
			expectedAfter: map[string]structs.TaskState{
				"main":              structs.TaskState{State: "dead", Restarts: 0},
				"prestart-oneshot":  structs.TaskState{State: "dead", Restarts: 0},
				"prestart-sidecar":  structs.TaskState{State: "dead", Restarts: 0},
				"poststart-oneshot": structs.TaskState{State: "dead", Restarts: 0},
				"poststart-sidecar": structs.TaskState{State: "dead", Restarts: 0},
				"poststop":          structs.TaskState{State: "dead", Restarts: 0},
			},
		},
		{
			name: "restart main task",
			action: func(ar *allocRunner, alloc *structs.Allocation) error {
				return ar.RestartTask("main", ev)
			},
			expectedAfter: map[string]structs.TaskState{
				"main":              structs.TaskState{State: "running", Restarts: 1},
				"prestart-oneshot":  structs.TaskState{State: "dead", Restarts: 0},
				"prestart-sidecar":  structs.TaskState{State: "running", Restarts: 0},
				"poststart-oneshot": structs.TaskState{State: "dead", Restarts: 0},
				"poststart-sidecar": structs.TaskState{State: "running", Restarts: 0},
				"poststop":          structs.TaskState{State: "pending", Restarts: 0},
			},
		},
		{
			name:      "restart leader main task",
			hasLeader: true,
			action: func(ar *allocRunner, alloc *structs.Allocation) error {
				return ar.RestartTask("main", ev)
			},
			expectedAfter: map[string]structs.TaskState{
				"main":              structs.TaskState{State: "running", Restarts: 1},
				"prestart-oneshot":  structs.TaskState{State: "dead", Restarts: 0},
				"prestart-sidecar":  structs.TaskState{State: "running", Restarts: 0},
				"poststart-oneshot": structs.TaskState{State: "dead", Restarts: 0},
				"poststart-sidecar": structs.TaskState{State: "running", Restarts: 0},
				"poststop":          structs.TaskState{State: "pending", Restarts: 0},
			},
		},
		{
			name: "main task fails and restarts once",
			taskDefs: []mock.LifecycleTaskDef{
				{Name: "main", RunFor: "2s", ExitCode: 1, Hook: "", IsSidecar: false},
				{Name: "prestart-oneshot", RunFor: "1s", ExitCode: 0, Hook: "prestart", IsSidecar: false},
				{Name: "prestart-sidecar", RunFor: "100s", ExitCode: 0, Hook: "prestart", IsSidecar: true},
				{Name: "poststart-oneshot", RunFor: "1s", ExitCode: 0, Hook: "poststart", IsSidecar: false},
				{Name: "poststart-sidecar", RunFor: "100s", ExitCode: 0, Hook: "poststart", IsSidecar: true},
				{Name: "poststop", RunFor: "1s", ExitCode: 0, Hook: "poststop", IsSidecar: false},
			},
			action: func(ar *allocRunner, alloc *structs.Allocation) error {
				time.Sleep(3 * time.Second) // make sure main task has exited
				return nil
			},
			expectedAfter: map[string]structs.TaskState{
				"main":              structs.TaskState{State: "dead", Restarts: 1},
				"prestart-oneshot":  structs.TaskState{State: "dead", Restarts: 0},
				"prestart-sidecar":  structs.TaskState{State: "dead", Restarts: 0},
				"poststart-oneshot": structs.TaskState{State: "dead", Restarts: 0},
				"poststart-sidecar": structs.TaskState{State: "dead", Restarts: 0},
				"poststop":          structs.TaskState{State: "dead", Restarts: 0},
			},
		},
		{
			name: "leader main task fails and restarts once",
			taskDefs: []mock.LifecycleTaskDef{
				{Name: "main", RunFor: "2s", ExitCode: 1, Hook: "", IsSidecar: false},
				{Name: "prestart-oneshot", RunFor: "1s", ExitCode: 0, Hook: "prestart", IsSidecar: false},
				{Name: "prestart-sidecar", RunFor: "100s", ExitCode: 0, Hook: "prestart", IsSidecar: true},
				{Name: "poststart-oneshot", RunFor: "1s", ExitCode: 0, Hook: "poststart", IsSidecar: false},
				{Name: "poststart-sidecar", RunFor: "100s", ExitCode: 0, Hook: "poststart", IsSidecar: true},
				{Name: "poststop", RunFor: "1s", ExitCode: 0, Hook: "poststop", IsSidecar: false},
			},
			hasLeader: true,
			action: func(ar *allocRunner, alloc *structs.Allocation) error {
				time.Sleep(3 * time.Second) // make sure main task has exited
				return nil
			},
			expectedAfter: map[string]structs.TaskState{
				"main":              structs.TaskState{State: "dead", Restarts: 1},
				"prestart-oneshot":  structs.TaskState{State: "dead", Restarts: 0},
				"prestart-sidecar":  structs.TaskState{State: "dead", Restarts: 0},
				"poststart-oneshot": structs.TaskState{State: "dead", Restarts: 0},
				"poststart-sidecar": structs.TaskState{State: "dead", Restarts: 0},
				"poststop":          structs.TaskState{State: "dead", Restarts: 0},
			},
		},
		{
			name: "main stopped unexpectedly and restarts once",
			taskDefs: []mock.LifecycleTaskDef{
				{Name: "main", RunFor: "2s", ExitCode: 0, Hook: "", IsSidecar: false},
				{Name: "prestart-oneshot", RunFor: "1s", ExitCode: 0, Hook: "prestart", IsSidecar: false},
				{Name: "prestart-sidecar", RunFor: "100s", ExitCode: 0, Hook: "prestart", IsSidecar: true},
				{Name: "poststart-oneshot", RunFor: "1s", ExitCode: 0, Hook: "poststart", IsSidecar: false},
				{Name: "poststart-sidecar", RunFor: "100s", ExitCode: 0, Hook: "poststart", IsSidecar: true},
				{Name: "poststop", RunFor: "1s", ExitCode: 0, Hook: "poststop", IsSidecar: false},
			},
			action: func(ar *allocRunner, alloc *structs.Allocation) error {
				time.Sleep(3 * time.Second) // make sure main task has exited
				return nil
			},
			expectedAfter: map[string]structs.TaskState{
				"main":              structs.TaskState{State: "dead", Restarts: 1},
				"prestart-oneshot":  structs.TaskState{State: "dead", Restarts: 0},
				"prestart-sidecar":  structs.TaskState{State: "dead", Restarts: 0},
				"poststart-oneshot": structs.TaskState{State: "dead", Restarts: 0},
				"poststart-sidecar": structs.TaskState{State: "dead", Restarts: 0},
				"poststop":          structs.TaskState{State: "dead", Restarts: 0},
			},
		},
		{
			name: "leader main stopped unexpectedly and restarts once",
			taskDefs: []mock.LifecycleTaskDef{
				{Name: "main", RunFor: "2s", ExitCode: 0, Hook: "", IsSidecar: false},
				{Name: "prestart-oneshot", RunFor: "1s", ExitCode: 0, Hook: "prestart", IsSidecar: false},
				{Name: "prestart-sidecar", RunFor: "100s", ExitCode: 0, Hook: "prestart", IsSidecar: true},
				{Name: "poststart-oneshot", RunFor: "1s", ExitCode: 0, Hook: "poststart", IsSidecar: false},
				{Name: "poststart-sidecar", RunFor: "100s", ExitCode: 0, Hook: "poststart", IsSidecar: true},
				{Name: "poststop", RunFor: "1s", ExitCode: 0, Hook: "poststop", IsSidecar: false},
			},
			hasLeader: true,
			action: func(ar *allocRunner, alloc *structs.Allocation) error {
				time.Sleep(3 * time.Second) // make sure main task has exited
				return nil
			},
			expectedAfter: map[string]structs.TaskState{
				"main":              structs.TaskState{State: "dead", Restarts: 1},
				"prestart-oneshot":  structs.TaskState{State: "dead", Restarts: 0},
				"prestart-sidecar":  structs.TaskState{State: "dead", Restarts: 0},
				"poststart-oneshot": structs.TaskState{State: "dead", Restarts: 0},
				"poststart-sidecar": structs.TaskState{State: "dead", Restarts: 0},
				"poststop":          structs.TaskState{State: "dead", Restarts: 0},
			},
		},
		{
			name: "failed main task cannot be restarted",
			taskDefs: []mock.LifecycleTaskDef{
				{Name: "main", RunFor: "2s", ExitCode: 1, Hook: "", IsSidecar: false},
				{Name: "prestart-oneshot", RunFor: "1s", ExitCode: 0, Hook: "prestart", IsSidecar: false},
				{Name: "prestart-sidecar", RunFor: "100s", ExitCode: 0, Hook: "prestart", IsSidecar: true},
				{Name: "poststart-oneshot", RunFor: "1s", ExitCode: 0, Hook: "poststart", IsSidecar: false},
				{Name: "poststart-sidecar", RunFor: "100s", ExitCode: 0, Hook: "poststart", IsSidecar: true},
				{Name: "poststop", RunFor: "1s", ExitCode: 0, Hook: "poststop", IsSidecar: false},
			},
			action: func(ar *allocRunner, alloc *structs.Allocation) error {
				// make sure main task has had a chance to restart once on its
				// own and fail again before we try to manually restart it
				time.Sleep(5 * time.Second)
				return ar.RestartTask("main", ev)
			},
			expectedErr: "Task not running",
			expectedAfter: map[string]structs.TaskState{
				"main":              structs.TaskState{State: "dead", Restarts: 1},
				"prestart-oneshot":  structs.TaskState{State: "dead", Restarts: 0},
				"prestart-sidecar":  structs.TaskState{State: "dead", Restarts: 0},
				"poststart-oneshot": structs.TaskState{State: "dead", Restarts: 0},
				"poststart-sidecar": structs.TaskState{State: "dead", Restarts: 0},
				"poststop":          structs.TaskState{State: "dead", Restarts: 0},
			},
		},
		{
			name: "restart prestart-sidecar task",
			action: func(ar *allocRunner, alloc *structs.Allocation) error {
				return ar.RestartTask("prestart-sidecar", ev)
			},
			expectedAfter: map[string]structs.TaskState{
				"main":              structs.TaskState{State: "running", Restarts: 0},
				"prestart-oneshot":  structs.TaskState{State: "dead", Restarts: 0},
				"prestart-sidecar":  structs.TaskState{State: "running", Restarts: 1},
				"poststart-oneshot": structs.TaskState{State: "dead", Restarts: 0},
				"poststart-sidecar": structs.TaskState{State: "running", Restarts: 0},
				"poststop":          structs.TaskState{State: "pending", Restarts: 0},
			},
		},
		{
			name: "restart poststart-sidecar task",
			action: func(ar *allocRunner, alloc *structs.Allocation) error {
				return ar.RestartTask("poststart-sidecar", ev)
			},
			expectedAfter: map[string]structs.TaskState{
				"main":              structs.TaskState{State: "running", Restarts: 0},
				"prestart-oneshot":  structs.TaskState{State: "dead", Restarts: 0},
				"prestart-sidecar":  structs.TaskState{State: "running", Restarts: 0},
				"poststart-oneshot": structs.TaskState{State: "dead", Restarts: 0},
				"poststart-sidecar": structs.TaskState{State: "running", Restarts: 1},
				"poststop":          structs.TaskState{State: "pending", Restarts: 0},
			},
		},
	}

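	// Note: tc is rebound inside the loop (tc := tc) so that each parallel
	// subtest captures its own copy rather than the shared range variable.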
	for _, tc := range testCases {
		tc := tc
		t.Run(tc.name, func(t *testing.T) {
			ci.Parallel(t)

			alloc := alloc.Copy()
			alloc.Job.TaskGroups[0].RestartPolicy = rp
			if tc.taskDefs != nil {
				alloc = mock.LifecycleAllocFromTasks(tc.taskDefs)
				alloc.Job.Type = structs.JobTypeService
			}
			for _, task := range alloc.Job.TaskGroups[0].Tasks {
				task.RestartPolicy = rp // tasks inherit the group policy
			}
			if tc.hasLeader {
				for _, task := range alloc.Job.TaskGroups[0].Tasks {
					if task.Name == "main" {
						task.Leader = true
					}
				}
			}
			if tc.isBatch {
				alloc.Job.Type = structs.JobTypeBatch
			}

			conf, cleanup := testAllocRunnerConfig(t, alloc)
			defer cleanup()
			ar, err := NewAllocRunner(conf)
			require.NoError(t, err)
			defer destroy(ar)
			go ar.Run()

			upd := conf.StateUpdater.(*MockStateUpdater)

			// assert our "before" states:
			// - all one-shot tasks should be dead but not failed
			// - all main tasks and sidecars should be running
			// - no tasks should have restarted
			testutil.WaitForResult(func() (bool, error) {
				last := upd.Last()
				if last == nil {
					return false, fmt.Errorf("no update")
				}
				if last.ClientStatus != structs.AllocClientStatusRunning {
					return false, fmt.Errorf(
						"expected alloc to be running not %s", last.ClientStatus)
				}
				var errs *multierror.Error

				expectedBefore := map[string]string{
					"main":              "running",
					"prestart-oneshot":  "dead",
					"prestart-sidecar":  "running",
					"poststart-oneshot": "dead",
					"poststart-sidecar": "running",
					"poststop":          "pending",
				}

				for task, expected := range expectedBefore {
					got, ok := last.TaskStates[task]
					if !ok {
						continue
					}
					if got.State != expected {
						errs = multierror.Append(errs, fmt.Errorf(
							"expected initial state of task %q to be %q not %q",
							task, expected, got.State))
					}
					if got.Restarts != 0 {
						errs = multierror.Append(errs, fmt.Errorf(
							"expected no initial restarts of task %q, not %d",
							task, got.Restarts))
					}
					if expected == "dead" && got.Failed {
						errs = multierror.Append(errs, fmt.Errorf(
							"expected ephemeral task %q to be dead but not failed",
							task))
					}
				}
				if errs.ErrorOrNil() != nil {
					return false, errs.ErrorOrNil()
				}
				return true, nil
			}, func(err error) {
				require.NoError(t, err, "error waiting for initial state")
			})

			// perform the action
			err = tc.action(ar, alloc.Copy())
			if tc.expectedErr != "" {
				require.EqualError(t, err, tc.expectedErr)
			} else {
				require.NoError(t, err)
			}

			// assert our "after" states
			testutil.WaitForResult(func() (bool, error) {
				last := upd.Last()
				if last == nil {
					return false, fmt.Errorf("no update")
				}
				var errs *multierror.Error
				for task, expected := range tc.expectedAfter {
					got, ok := last.TaskStates[task]
					if !ok {
						errs = multierror.Append(errs, fmt.Errorf(
							"no final state found for task %q", task,
						))
						continue
					}
					if got.State != expected.State {
						errs = multierror.Append(errs, fmt.Errorf(
							"expected final state of task %q to be %q not %q",
							task, expected.State, got.State))
					}
					if expected.State == "dead" {
						if got.FinishedAt.IsZero() || got.StartedAt.IsZero() {
							errs = multierror.Append(errs, fmt.Errorf(
								"expected final state of task %q to have start and finish time", task))
						}
						if len(got.Events) < 2 {
							errs = multierror.Append(errs, fmt.Errorf(
								"expected final state of task %q to include at least 2 events", task))
						}
					}

					if got.Restarts != expected.Restarts {
						errs = multierror.Append(errs, fmt.Errorf(
							"expected final restarts of task %q to be %v not %v",
							task, expected.Restarts, got.Restarts))
					}
				}
				if errs.ErrorOrNil() != nil {
					return false, errs.ErrorOrNil()
				}
				return true, nil
			}, func(err error) {
				require.NoError(t, err, "error waiting for final state")
			})
		})
	}
}

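// TestAllocRunner_TaskGroup_ShutdownDelay asserts that a task group
// shutdown_delay keeps tasks running for at least the configured delay after
// the group's Consul service has been deregistered when the alloc is stopped.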
func TestAllocRunner_TaskGroup_ShutdownDelay(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.Alloc()
	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0
	alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy.Attempts = 0

	// Create a group service
	tg := alloc.Job.TaskGroups[0]
	tg.Services = []*structs.Service{
		{
			Name:     "shutdown_service",
			Provider: structs.ServiceProviderConsul,
		},
	}

	// Create two tasks in the task group
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Name = "follower1"
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "leader"
	task2.Driver = "mock_driver"
	task2.Leader = true
	task2.Config = map[string]interface{}{
		"run_for": "10s",
	}

	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)
	alloc.AllocatedResources.Tasks[task.Name] = tr
	alloc.AllocatedResources.Tasks[task2.Name] = tr

	// Set a shutdown delay
	shutdownDelay := 1 * time.Second
	alloc.Job.TaskGroups[0].ShutdownDelay = &shutdownDelay

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	defer destroy(ar)
	go ar.Run()

	// Wait for tasks to start
	upd := conf.StateUpdater.(*MockStateUpdater)
	last := upd.Last()
	testutil.WaitForResult(func() (bool, error) {
		last = upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if n := len(last.TaskStates); n != 2 {
			return false, fmt.Errorf("Not enough task states (want: 2; found %d)", n)
		}
		for name, state := range last.TaskStates {
			if state.State != structs.TaskStateRunning {
				return false, fmt.Errorf("Task %q is not running yet (it's %q)", name, state.State)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Reset updates
	upd.Reset()

	// Stop the alloc
	shutdownInit := time.Now()
	update := alloc.Copy()
	update.DesiredStatus = structs.AllocDesiredStatusStop
	ar.Update(update)

	// Wait for tasks to stop
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		fin := last.TaskStates["leader"].FinishedAt

		if fin.IsZero() {
			return false, nil
		}

		return true, nil
	}, func(err error) {
		last := upd.Last()
		for name, state := range last.TaskStates {
			t.Logf("%s: %s", name, state.State)
		}
		t.Fatalf("err: %v", err)
	})

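	// The mock registration handler records every Consul operation in the
	// order received; the first "remove" of the group service (recorded as
	// "group-web" for mock.Alloc's "web" task group) marks when the service
	// was deregistered relative to task shutdown.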
	// Get consul client operations
	consulClient := conf.Consul.(*regMock.ServiceRegistrationHandler)
	consulOpts := consulClient.GetOps()
	var groupRemoveOp regMock.Operation
	for _, op := range consulOpts {
		// Grab the first deregistration request
		if op.Op == "remove" && op.Name == "group-web" {
			groupRemoveOp = op
			break
		}
	}

	// Ensure the remove operation happened close to shutdown initiation
	require.True(t, groupRemoveOp.OccurredAt.Sub(shutdownInit) < 100*time.Millisecond)

	last = upd.Last()
	minShutdown := shutdownInit.Add(task.ShutdownDelay)
	leaderFinished := last.TaskStates["leader"].FinishedAt
	followerFinished := last.TaskStates["follower1"].FinishedAt

	// Check that both tasks shut down after the earliest possible shutdown time
	require.Greater(t, leaderFinished.UnixNano(), minShutdown.UnixNano())
	require.Greater(t, followerFinished.UnixNano(), minShutdown.UnixNano())

	// Check that there is at least shutdown_delay between the Consul remove
	// operation and the tasks' finish times
	require.True(t, leaderFinished.Sub(groupRemoveOp.OccurredAt) > shutdownDelay)
}

// TestAllocRunner_TaskLeader_StopTG asserts that when stopping an alloc with a
// leader the leader is stopped before other tasks.
func TestAllocRunner_TaskLeader_StopTG(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.Alloc()
	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0
	alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy.Attempts = 0

	// Create 3 tasks in the task group
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Name = "follower1"
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "leader"
	task2.Driver = "mock_driver"
	task2.Leader = true
	task2.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task3 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task3.Name = "follower2"
	task3.Driver = "mock_driver"
	task3.Config = map[string]interface{}{
		"run_for": "10s",
	}
	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2, task3)
	alloc.AllocatedResources.Tasks[task.Name] = tr
	alloc.AllocatedResources.Tasks[task2.Name] = tr
	alloc.AllocatedResources.Tasks[task3.Name] = tr

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	defer destroy(ar)
	go ar.Run()

	// Wait for tasks to start
	upd := conf.StateUpdater.(*MockStateUpdater)
	last := upd.Last()
	testutil.WaitForResult(func() (bool, error) {
		last = upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if n := len(last.TaskStates); n != 3 {
			return false, fmt.Errorf("Not enough task states (want: 3; found %d)", n)
		}
		for name, state := range last.TaskStates {
			if state.State != structs.TaskStateRunning {
				return false, fmt.Errorf("Task %q is not running yet (it's %q)", name, state.State)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Reset updates
	upd.Reset()

	// Stop alloc
	update := alloc.Copy()
	update.DesiredStatus = structs.AllocDesiredStatusStop
	ar.Update(update)

	// Wait for tasks to stop
	testutil.WaitForResult(func() (bool, error) {
		last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower1"].FinishedAt.UnixNano() {
			return false, fmt.Errorf("expected leader to finish before follower1: %s >= %s",
				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower1"].FinishedAt)
		}
		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower2"].FinishedAt.UnixNano() {
			return false, fmt.Errorf("expected leader to finish before follower2: %s >= %s",
				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower2"].FinishedAt)
		}
		return true, nil
	}, func(err error) {
		last := upd.Last()
		for name, state := range last.TaskStates {
			t.Logf("%s: %s", name, state.State)
		}
		t.Fatalf("err: %v", err)
	})
}

// TestAllocRunner_TaskLeader_StopRestoredTG asserts that when stopping a
// restored task group whose leader task failed before the restore, the
// leader is not stopped again since it no longer exists.
// See https://github.com/hashicorp/nomad/issues/3420#issuecomment-341666932
func TestAllocRunner_TaskLeader_StopRestoredTG(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.Alloc()
	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0
	alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy.Attempts = 0

	// Create a leader and follower task in the task group
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Name = "follower1"
	task.Driver = "mock_driver"
	task.KillTimeout = 10 * time.Second
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "leader"
	task2.Driver = "mock_driver"
	task2.Leader = true
	task2.KillTimeout = 10 * time.Millisecond
	task2.Config = map[string]interface{}{
		"run_for": "10s",
	}

	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)
	alloc.AllocatedResources.Tasks[task.Name] = tr
	alloc.AllocatedResources.Tasks[task2.Name] = tr

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	// Use a memory backed statedb
	conf.StateDB = state.NewMemDB(conf.Logger)

	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)

	// Mimic Nomad exiting before the leader's death was able to stop the
	// other tasks.
	ar.tasks["leader"].UpdateState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskKilled))
	ar.tasks["follower1"].UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))

	// Create a new AllocRunner to test RestoreState and Run
	ar2, err := NewAllocRunner(conf)
	require.NoError(t, err)
	defer destroy(ar2)

	if err := ar2.Restore(); err != nil {
		t.Fatalf("error restoring state: %v", err)
	}
	ar2.Run()

	// Wait for tasks to be stopped because the leader is dead
	testutil.WaitForResult(func() (bool, error) {
		alloc := ar2.Alloc()
		// TODO: this loop does not assert anything because alloc.TaskStates
		// is an empty map here
		for task, state := range alloc.TaskStates {
			if state.State != structs.TaskStateDead {
				return false, fmt.Errorf("Task %q should be dead: %v", task, state.State)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Make sure it GCs properly
	ar2.Destroy()

	select {
	case <-ar2.DestroyCh():
		// exited as expected
	case <-time.After(10 * time.Second):
		t.Fatalf("timed out waiting for AR to GC")
	}
}

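// TestAllocRunner_Restore_LifecycleHooks asserts that an alloc runner
// restored mid-lifecycle resumes the task lifecycle coordinator in the
// correct state, unblocking exactly the tasks whose turn it is to run.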
func TestAllocRunner_Restore_LifecycleHooks(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.LifecycleAlloc()

	conf, cleanup := testAllocRunnerConfig(t, alloc)
	defer cleanup()

	// Use a memory backed statedb
	conf.StateDB = state.NewMemDB(conf.Logger)

	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)

	go ar.Run()
	defer destroy(ar)

	// Wait for the coordinator to transition from the "init" state.
	tasklifecycle.WaitNotInitUntil(ar.taskCoordinator, time.Second, func() {
		t.Fatalf("task coordinator didn't transition from init in time")
	})

	// Tasks with prestart hooks should not be blocked from running.
	tasklifecycle.RequireTaskAllowed(t, ar.taskCoordinator, ar.tasks["init"].Task())
	tasklifecycle.RequireTaskAllowed(t, ar.taskCoordinator, ar.tasks["side"].Task())
	tasklifecycle.RequireTaskBlocked(t, ar.taskCoordinator, ar.tasks["web"].Task())
	tasklifecycle.RequireTaskBlocked(t, ar.taskCoordinator, ar.tasks["poststart"].Task())

	// Mimic the client dying while the init task is running, then restarting
	// after the init task has finished and web is running.
	ar.tasks["init"].UpdateState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskTerminated))
	ar.tasks["side"].UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
	ar.tasks["web"].UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))

	// Create a new AllocRunner to test Restore and Run.
	ar2, err := NewAllocRunner(conf)
	require.NoError(t, err)
	require.NoError(t, ar2.Restore())

	go ar2.Run()
	defer destroy(ar2)

	// Wait for the coordinator to transition from the "init" state.
	tasklifecycle.WaitNotInitUntil(ar2.taskCoordinator, time.Second, func() {
		t.Fatalf("task coordinator didn't transition from init in time")
	})

	// Restore resumes execution with the correct lifecycle ordering.
	tasklifecycle.RequireTaskBlocked(t, ar2.taskCoordinator, ar2.tasks["init"].Task())
	tasklifecycle.RequireTaskAllowed(t, ar2.taskCoordinator, ar2.tasks["side"].Task())
	tasklifecycle.RequireTaskAllowed(t, ar2.taskCoordinator, ar2.tasks["web"].Task())
	tasklifecycle.RequireTaskAllowed(t, ar2.taskCoordinator, ar2.tasks["poststart"].Task())
}

  1307  func TestAllocRunner_Update_Semantics(t *testing.T) {
  1308  	ci.Parallel(t)
  1309  	require := require.New(t)
  1310  
  1311  	updatedAlloc := func(a *structs.Allocation) *structs.Allocation {
  1312  		upd := a.CopySkipJob()
  1313  		upd.AllocModifyIndex++
  1314  
  1315  		return upd
  1316  	}
  1317  
  1318  	alloc := mock.Alloc()
  1319  	alloc.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
  1320  	conf, cleanup := testAllocRunnerConfig(t, alloc)
  1321  	defer cleanup()
  1322  
  1323  	ar, err := NewAllocRunner(conf)
  1324  	require.NoError(err)
  1325  
  1326  	upd1 := updatedAlloc(alloc)
  1327  	ar.Update(upd1)
  1328  
  1329  	// Update was placed into a queue
  1330  	require.Len(ar.allocUpdatedCh, 1)
  1331  
  1332  	upd2 := updatedAlloc(alloc)
  1333  	ar.Update(upd2)
  1334  
  1335  	// Allocation was _replaced_
  1336  
  1337  	require.Len(ar.allocUpdatedCh, 1)
  1338  	queuedAlloc := <-ar.allocUpdatedCh
  1339  	require.Equal(upd2, queuedAlloc)
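        	// The update channel behaves like a one-slot mailbox: a newer alloc
        	// overwrites whatever is queued, so the runner only ever acts on the
        	// most recent version it has been handed.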
  1340  
  1341  	// Requeueing older alloc is skipped
  1342  	ar.Update(upd2)
  1343  	ar.Update(upd1)
  1344  
  1345  	queuedAlloc = <-ar.allocUpdatedCh
  1346  	require.Equal(upd2, queuedAlloc)
  1347  
  1348  	// Ignore after watch closed
  1349  
  1350  	close(ar.waitCh)
  1351  
  1352  	ar.Update(upd1)
  1353  
  1354  	// Did not queue the update
  1355  	require.Len(ar.allocUpdatedCh, 0)
  1356  }
  1357  
  1358  // TestAllocRunner_DeploymentHealth_Healthy_Migration asserts that health
  1359  // is reported for migrated allocations, not just those that are part of deployments.
  1360  func TestAllocRunner_DeploymentHealth_Healthy_Migration(t *testing.T) {
  1361  	ci.Parallel(t)
  1362  
  1363  	alloc := mock.Alloc()
  1364  
  1365  	// Ensure the alloc is *not* part of a deployment
  1366  	alloc.DeploymentID = ""
  1367  
  1368  	// Shorten the default migration healthy time
  1369  	tg := alloc.Job.TaskGroups[0]
  1370  	tg.Migrate = structs.DefaultMigrateStrategy()
  1371  	tg.Migrate.MinHealthyTime = 100 * time.Millisecond
  1372  	tg.Migrate.HealthCheck = structs.MigrateStrategyHealthStates
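        	// MigrateStrategyHealthStates bases migration health on task states
        	// alone, so this alloc can become healthy without any Consul checks.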
  1373  
  1374  	task := tg.Tasks[0]
  1375  	task.Driver = "mock_driver"
  1376  	task.Config = map[string]interface{}{
  1377  		"run_for": "30s",
  1378  	}
  1379  
  1380  	conf, cleanup := testAllocRunnerConfig(t, alloc)
  1381  	defer cleanup()
  1382  
  1383  	ar, err := NewAllocRunner(conf)
  1384  	require.NoError(t, err)
  1385  	go ar.Run()
  1386  	defer destroy(ar)
  1387  
  1388  	upd := conf.StateUpdater.(*MockStateUpdater)
  1389  	testutil.WaitForResult(func() (bool, error) {
  1390  		last := upd.Last()
  1391  		if last == nil {
  1392  			return false, fmt.Errorf("No updates")
  1393  		}
  1394  		if !last.DeploymentStatus.HasHealth() {
  1395  			return false, fmt.Errorf("want deployment status healthy; got unset")
  1396  		} else if !*last.DeploymentStatus.Healthy {
  1397  			// This is fatal
  1398  			t.Fatal("want deployment status healthy; got unhealthy")
  1399  		}
  1400  		return true, nil
  1401  	}, func(err error) {
  1402  		require.NoError(t, err)
  1403  	})
  1404  }
  1405  
  1406  // TestAllocRunner_DeploymentHealth_Healthy_NoChecks asserts that the health
  1407  // watcher will mark the allocation as healthy based on task states alone.
  1408  func TestAllocRunner_DeploymentHealth_Healthy_NoChecks(t *testing.T) {
  1409  	ci.Parallel(t)
  1410  
  1411  	alloc := mock.Alloc()
  1412  
  1413  	task := alloc.Job.TaskGroups[0].Tasks[0]
  1414  	task.Driver = "mock_driver"
  1415  	task.Config = map[string]interface{}{
  1416  		"run_for": "10s",
  1417  	}
  1418  
  1419  	// Create a task that takes longer to become healthy
  1420  	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task.Copy())
  1421  	alloc.AllocatedResources.Tasks["task2"] = alloc.AllocatedResources.Tasks["web"].Copy()
  1422  	task2 := alloc.Job.TaskGroups[0].Tasks[1]
  1423  	task2.Name = "task2"
  1424  	task2.Config["start_block_for"] = "500ms"
  1425  
  1426  	// Make the alloc be part of a deployment that uses task states for
  1427  	// health checks
  1428  	alloc.DeploymentID = uuid.Generate()
  1429  	alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
  1430  	alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
  1431  	alloc.Job.TaskGroups[0].Update.MaxParallel = 1
  1432  	alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
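        	// With task-states health checking the alloc is healthy only after
        	// every task has been running for MinHealthyTime, so task2's 500ms
        	// start_block_for delay sets the floor asserted at the end of the test.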
  1433  
  1434  	conf, cleanup := testAllocRunnerConfig(t, alloc)
  1435  	defer cleanup()
  1436  
  1437  	ar, err := NewAllocRunner(conf)
  1438  	require.NoError(t, err)
  1439  
  1440  	start, done := time.Now(), time.Time{}
  1441  	go ar.Run()
  1442  	defer destroy(ar)
  1443  
  1444  	upd := conf.StateUpdater.(*MockStateUpdater)
  1445  	testutil.WaitForResult(func() (bool, error) {
  1446  		last := upd.Last()
  1447  		if last == nil {
  1448  			return false, fmt.Errorf("No updates")
  1449  		}
  1450  		if !last.DeploymentStatus.HasHealth() {
  1451  			return false, fmt.Errorf("want deployment status healthy; got unset")
  1452  		} else if !*last.DeploymentStatus.Healthy {
  1453  			// This is fatal
  1454  			t.Fatal("want deployment status healthy; got unhealthy")
  1455  		}
  1456  
  1457  		// Capture the done timestamp
  1458  		done = last.DeploymentStatus.Timestamp
  1459  		return true, nil
  1460  	}, func(err error) {
  1461  		require.NoError(t, err)
  1462  	})
  1463  
  1464  	if d := done.Sub(start); d < 500*time.Millisecond {
  1465  		t.Fatalf("didn't wait for the second task; only took %v", d)
  1466  	}
  1467  }
  1468  
  1469  // TestAllocRunner_DeploymentHealth_Unhealthy_Checks asserts that the health
  1470  // watcher will mark the allocation as unhealthy with failing checks.
  1471  func TestAllocRunner_DeploymentHealth_Unhealthy_Checks(t *testing.T) {
  1472  	ci.Parallel(t)
  1473  
  1474  	alloc := mock.Alloc()
  1475  	task := alloc.Job.TaskGroups[0].Tasks[0]
  1476  	task.Driver = "mock_driver"
  1477  	task.Config = map[string]interface{}{
  1478  		"run_for": "10s",
  1479  	}
  1480  
  1481  	// Set a service with check
  1482  	task.Services = []*structs.Service{
  1483  		{
  1484  			Name:      "fakeservice",
  1485  			PortLabel: "http",
  1486  			Checks: []*structs.ServiceCheck{
  1487  				{
  1488  					Name:     "fakecheck",
  1489  					Type:     structs.ServiceCheckScript,
  1490  					Command:  "true",
  1491  					Interval: 30 * time.Second,
  1492  					Timeout:  5 * time.Second,
  1493  				},
  1494  			},
  1495  		},
  1496  	}
  1497  
  1498  	// Make the alloc be part of a deployment
  1499  	alloc.DeploymentID = uuid.Generate()
  1500  	alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
  1501  	alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks
  1502  	alloc.Job.TaskGroups[0].Update.MaxParallel = 1
  1503  	alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
  1504  	alloc.Job.TaskGroups[0].Update.HealthyDeadline = 1 * time.Second
  1505  
  1506  	checkUnhealthy := &api.AgentCheck{
  1507  		CheckID: uuid.Generate(),
  1508  		Status:  api.HealthWarning,
  1509  	}
  1510  
  1511  	conf, cleanup := testAllocRunnerConfig(t, alloc)
  1512  	defer cleanup()
  1513  
  1514  	// Always return the check as unhealthy
  1515  	consulClient := conf.Consul.(*regMock.ServiceRegistrationHandler)
  1516  	consulClient.AllocRegistrationsFn = func(allocID string) (*serviceregistration.AllocRegistration, error) {
  1517  		return &serviceregistration.AllocRegistration{
  1518  			Tasks: map[string]*serviceregistration.ServiceRegistrations{
  1519  				task.Name: {
  1520  					Services: map[string]*serviceregistration.ServiceRegistration{
  1521  						"123": {
  1522  							Service: &api.AgentService{Service: "fakeservice"},
  1523  							Checks:  []*api.AgentCheck{checkUnhealthy},
  1524  						},
  1525  					},
  1526  				},
  1527  			},
  1528  		}, nil
  1529  	}
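        	// Because the check never passes, the 1s healthy_deadline expires and
        	// the health watcher marks the deployment unhealthy.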
  1530  
  1531  	ar, err := NewAllocRunner(conf)
  1532  	require.NoError(t, err)
  1533  	go ar.Run()
  1534  	defer destroy(ar)
  1535  
  1536  	var lastUpdate *structs.Allocation
  1537  	upd := conf.StateUpdater.(*MockStateUpdater)
  1538  	testutil.WaitForResult(func() (bool, error) {
  1539  		lastUpdate = upd.Last()
  1540  		if lastUpdate == nil {
  1541  			return false, fmt.Errorf("No updates")
  1542  		}
  1543  		if !lastUpdate.DeploymentStatus.HasHealth() {
  1544  			return false, fmt.Errorf("want deployment status unhealthy; got unset")
  1545  		} else if *lastUpdate.DeploymentStatus.Healthy {
  1546  			// This is fatal
  1547  			t.Fatal("want deployment status unhealthy; got healthy")
  1548  		}
  1549  		return true, nil
  1550  	}, func(err error) {
  1551  		require.NoError(t, err)
  1552  	})
  1553  
  1554  	// Assert that we have an event explaining why we are unhealthy.
  1555  	require.Len(t, lastUpdate.TaskStates, 1)
  1556  	taskState := lastUpdate.TaskStates[task.Name]
  1557  	require.NotNil(t, taskState)
  1558  	require.NotEmpty(t, taskState.Events)
  1559  	last := taskState.Events[len(taskState.Events)-1]
  1560  	require.Equal(t, allochealth.AllocHealthEventSource, last.Type)
  1561  	require.Contains(t, last.Message, "by healthy_deadline")
  1562  }
  1563  
  1564  // TestAllocRunner_Destroy asserts that Destroy kills and cleans up a running
  1565  // alloc.
  1566  func TestAllocRunner_Destroy(t *testing.T) {
  1567  	ci.Parallel(t)
  1568  
  1569  	// Ensure task takes some time
  1570  	alloc := mock.BatchAlloc()
  1571  	task := alloc.Job.TaskGroups[0].Tasks[0]
  1572  	task.Config["run_for"] = "10s"
  1573  
  1574  	conf, cleanup := testAllocRunnerConfig(t, alloc)
  1575  	defer cleanup()
  1576  
  1577  	// Use a MemDB to assert alloc state gets cleaned up
  1578  	conf.StateDB = state.NewMemDB(conf.Logger)
  1579  
  1580  	ar, err := NewAllocRunner(conf)
  1581  	require.NoError(t, err)
  1582  	go ar.Run()
  1583  
  1584  	// Wait for alloc to be running
  1585  	testutil.WaitForResult(func() (bool, error) {
  1586  		state := ar.AllocState()
  1587  
  1588  		return state.ClientStatus == structs.AllocClientStatusRunning,
  1589  			fmt.Errorf("got client status %v; want running", state.ClientStatus)
  1590  	}, func(err error) {
  1591  		require.NoError(t, err)
  1592  	})
  1593  
  1594  	// Assert state was stored
  1595  	ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
  1596  	require.NoError(t, err)
  1597  	require.NotNil(t, ls)
  1598  	require.NotNil(t, ts)
  1599  
  1600  	// Now destroy
  1601  	ar.Destroy()
  1602  
  1603  	select {
  1604  	case <-ar.DestroyCh():
  1605  		// Destroyed properly!
  1606  	case <-time.After(10 * time.Second):
  1607  		require.Fail(t, "timed out waiting for alloc to be destroyed")
  1608  	}
  1609  
  1610  	// Assert alloc is dead
  1611  	state := ar.AllocState()
  1612  	require.Equal(t, structs.AllocClientStatusComplete, state.ClientStatus)
  1613  
  1614  	// Assert the state was cleaned
  1615  	ls, ts, err = conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
  1616  	require.NoError(t, err)
  1617  	require.Nil(t, ls)
  1618  	require.Nil(t, ts)
  1619  
  1620  	// Assert the alloc directory was cleaned
  1621  	if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
  1622  		require.Failf(t, "alloc dir still exists", "%v", ar.allocDir.AllocDir)
  1623  	} else if !os.IsNotExist(err) {
  1624  		require.Failf(t, "expected NotExist error", "found %v", err)
  1625  	}
  1626  }
  1627  
  1628  func TestAllocRunner_SimpleRun(t *testing.T) {
  1629  	ci.Parallel(t)
  1630  
  1631  	alloc := mock.BatchAlloc()
  1632  
  1633  	conf, cleanup := testAllocRunnerConfig(t, alloc)
  1634  	defer cleanup()
  1635  	ar, err := NewAllocRunner(conf)
  1636  	require.NoError(t, err)
  1637  	go ar.Run()
  1638  	defer destroy(ar)
  1639  
  1640  	// Wait for alloc to be running
  1641  	testutil.WaitForResult(func() (bool, error) {
  1642  		state := ar.AllocState()
  1643  
  1644  		if state.ClientStatus != structs.AllocClientStatusComplete {
  1645  			return false, fmt.Errorf("got status %v; want %v", state.ClientStatus, structs.AllocClientStatusComplete)
  1646  		}
  1647  
  1648  		for name, s := range state.TaskStates {
  1649  			if s.FinishedAt.IsZero() {
  1650  				return false, fmt.Errorf("task %q has zero FinishedAt value", name)
  1651  			}
  1652  		}
  1653  
  1654  		return true, nil
  1655  	}, func(err error) {
  1656  		require.NoError(t, err)
  1657  	})
  1658  
  1659  }
  1660  
  1661  // TestAllocRunner_MoveAllocDir asserts that a rescheduled allocation copies
  1662  // ephemeral disk content from the previous alloc run.
  1663  func TestAllocRunner_MoveAllocDir(t *testing.T) {
  1664  	ci.Parallel(t)
  1665  
  1666  	// Step 1: start and run a task
  1667  	alloc := mock.BatchAlloc()
  1668  	conf, cleanup := testAllocRunnerConfig(t, alloc)
  1669  	defer cleanup()
  1670  	ar, err := NewAllocRunner(conf)
  1671  	require.NoError(t, err)
  1672  	ar.Run()
  1673  	defer destroy(ar)
  1674  
  1675  	WaitForClientState(t, ar, structs.AllocClientStatusComplete)
  1676  
  1677  	// Step 2. Modify its directory
  1678  	task := alloc.Job.TaskGroups[0].Tasks[0]
  1679  	dataFile := filepath.Join(ar.allocDir.SharedDir, "data", "data_file")
  1680  	require.NoError(t, ioutil.WriteFile(dataFile, []byte("hello world"), os.ModePerm))
  1681  	taskDir := ar.allocDir.TaskDirs[task.Name]
  1682  	taskLocalFile := filepath.Join(taskDir.LocalDir, "local_file")
  1683  	require.NoError(t, ioutil.WriteFile(taskLocalFile, []byte("good bye world"), os.ModePerm))
  1684  
  1685  	// Step 3. Start a new alloc
  1686  	alloc2 := mock.BatchAlloc()
  1687  	alloc2.PreviousAllocation = alloc.ID
  1688  	alloc2.Job.TaskGroups[0].EphemeralDisk.Sticky = true
  1689  
  1690  	conf2, cleanup := testAllocRunnerConfig(t, alloc2)
  1691  	conf2.PrevAllocWatcher, conf2.PrevAllocMigrator = allocwatcher.NewAllocWatcher(allocwatcher.Config{
  1692  		Alloc:          alloc2,
  1693  		PreviousRunner: ar,
  1694  		Logger:         conf2.Logger,
  1695  	})
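        	// Wiring ar in as the previous runner lets the watcher migrate the
        	// sticky ephemeral disk from ar's alloc dir into ar2's.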
  1696  	defer cleanup()
  1697  	ar2, err := NewAllocRunner(conf2)
  1698  	require.NoError(t, err)
  1699  
  1700  	ar2.Run()
  1701  	defer destroy(ar2)
  1702  
  1703  	WaitForClientState(t, ar2, structs.AllocClientStatusComplete)
  1704  
  1705  	// Ensure that data from ar was moved to ar2
  1706  	dataFile = filepath.Join(ar2.allocDir.SharedDir, "data", "data_file")
  1707  	fileInfo, _ := os.Stat(dataFile)
  1708  	require.NotNilf(t, fileInfo, "file %q not found", dataFile)
  1709  
  1710  	taskDir = ar2.allocDir.TaskDirs[task.Name]
  1711  	taskLocalFile = filepath.Join(taskDir.LocalDir, "local_file")
  1712  	fileInfo, _ = os.Stat(taskLocalFile)
  1713  	require.NotNilf(t, fileInfo, "file %q not found", taskLocalFile)
  1714  
  1715  }
  1716  
  1717  // TestAllocRunner_HandlesArtifactFailure ensures that when one task in a task
  1718  // group is stuck retrying an artifact fetch, the other tasks in the group can
  1719  // still proceed.
  1720  func TestAllocRunner_HandlesArtifactFailure(t *testing.T) {
  1721  	ci.Parallel(t)
  1722  
  1723  	alloc := mock.BatchAlloc()
  1724  	rp := &structs.RestartPolicy{
  1725  		Mode:     structs.RestartPolicyModeFail,
  1726  		Attempts: 1,
  1727  		Delay:    time.Nanosecond,
  1728  		Interval: time.Hour,
  1729  	}
  1730  	alloc.Job.TaskGroups[0].RestartPolicy = rp
  1731  	alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy = rp
  1732  
  1733  	// Create a new task with a bad artifact
  1734  	badtask := alloc.Job.TaskGroups[0].Tasks[0].Copy()
  1735  	badtask.Name = "bad"
  1736  	badtask.Artifacts = []*structs.TaskArtifact{
  1737  		{GetterSource: "http://127.0.0.1:0/foo/bar/baz"},
  1738  	}
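        	// Nothing listens on port 0, so fetching this artifact always fails
        	// and the single restart attempt is exhausted almost immediately.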
  1739  
  1740  	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, badtask)
  1741  	alloc.AllocatedResources.Tasks["bad"] = &structs.AllocatedTaskResources{
  1742  		Cpu: structs.AllocatedCpuResources{
  1743  			CpuShares: 500,
  1744  		},
  1745  		Memory: structs.AllocatedMemoryResources{
  1746  			MemoryMB: 256,
  1747  		},
  1748  	}
  1749  
  1750  	conf, cleanup := testAllocRunnerConfig(t, alloc)
  1751  	defer cleanup()
  1752  	ar, err := NewAllocRunner(conf)
  1753  	require.NoError(t, err)
  1754  	go ar.Run()
  1755  	defer destroy(ar)
  1756  
  1757  	testutil.WaitForResult(func() (bool, error) {
  1758  		state := ar.AllocState()
  1759  
  1760  		switch state.ClientStatus {
  1761  		case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed:
  1762  			return true, nil
  1763  		default:
  1764  			return false, fmt.Errorf("got status %v but want terminal", state.ClientStatus)
  1765  		}
  1766  
  1767  	}, func(err error) {
  1768  		require.NoError(t, err)
  1769  	})
  1770  
  1771  	state := ar.AllocState()
  1772  	require.Equal(t, structs.AllocClientStatusFailed, state.ClientStatus)
  1773  	require.Equal(t, structs.TaskStateDead, state.TaskStates["web"].State)
  1774  	require.True(t, state.TaskStates["web"].Successful())
  1775  	require.Equal(t, structs.TaskStateDead, state.TaskStates["bad"].State)
  1776  	require.True(t, state.TaskStates["bad"].Failed)
  1777  }
  1778  
  1779  // Test that alloc runner kills tasks in task group when another task fails
  1780  func TestAllocRunner_TaskFailed_KillTG(t *testing.T) {
  1781  	ci.Parallel(t)
  1782  
  1783  	alloc := mock.Alloc()
  1784  	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
  1785  	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0
  1786  	alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy.Attempts = 0
  1787  
  1788  	// Create two tasks in the task group
  1789  	task := alloc.Job.TaskGroups[0].Tasks[0]
  1790  	task.Name = "task1"
  1791  	task.Driver = "mock_driver"
  1792  	task.KillTimeout = 10 * time.Millisecond
  1793  	task.Config = map[string]interface{}{
  1794  		"run_for": "10s",
  1795  	}
  1796  	// Set a service with check
  1797  	task.Services = []*structs.Service{
  1798  		{
  1799  			Name:      "fakeservice",
  1800  			PortLabel: "http",
  1801  			Provider:  structs.ServiceProviderConsul,
  1802  			Checks: []*structs.ServiceCheck{
  1803  				{
  1804  					Name:     "fakecheck",
  1805  					Type:     structs.ServiceCheckScript,
  1806  					Command:  "true",
  1807  					Interval: 30 * time.Second,
  1808  					Timeout:  5 * time.Second,
  1809  				},
  1810  			},
  1811  		},
  1812  	}
  1813  
  1814  	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
  1815  	task2.Name = "task 2"
  1816  	task2.Driver = "mock_driver"
  1817  	task2.Config = map[string]interface{}{
  1818  		"start_error": "fail task please",
  1819  	}
  1820  	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)
  1821  	alloc.AllocatedResources.Tasks[task.Name] = tr
  1822  	alloc.AllocatedResources.Tasks[task2.Name] = tr
  1823  
  1824  	// Make the alloc be part of a deployment
  1825  	alloc.DeploymentID = uuid.Generate()
  1826  	alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
  1827  	alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks
  1828  	alloc.Job.TaskGroups[0].Update.MaxParallel = 1
  1829  	alloc.Job.TaskGroups[0].Update.MinHealthyTime = 10 * time.Millisecond
  1830  	alloc.Job.TaskGroups[0].Update.HealthyDeadline = 2 * time.Second
  1831  
  1832  	checkHealthy := &api.AgentCheck{
  1833  		CheckID: uuid.Generate(),
  1834  		Status:  api.HealthPassing,
  1835  	}
  1836  
  1837  	conf, cleanup := testAllocRunnerConfig(t, alloc)
  1838  	defer cleanup()
  1839  
  1840  	consulClient := conf.Consul.(*regMock.ServiceRegistrationHandler)
  1841  	consulClient.AllocRegistrationsFn = func(allocID string) (*serviceregistration.AllocRegistration, error) {
  1842  		return &serviceregistration.AllocRegistration{
  1843  			Tasks: map[string]*serviceregistration.ServiceRegistrations{
  1844  				task.Name: {
  1845  					Services: map[string]*serviceregistration.ServiceRegistration{
  1846  						"123": {
  1847  							Service: &api.AgentService{Service: "fakeservice"},
  1848  							Checks:  []*api.AgentCheck{checkHealthy},
  1849  						},
  1850  					},
  1851  				},
  1852  			},
  1853  		}, nil
  1854  	}
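        	// task1's service check always passes, so the deployment health
        	// outcome below is driven by task2 failing to start, not by Consul.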
  1855  
  1856  	ar, err := NewAllocRunner(conf)
  1857  	require.NoError(t, err)
  1858  	defer destroy(ar)
  1859  	go ar.Run()
  1860  	upd := conf.StateUpdater.(*MockStateUpdater)
  1861  
  1862  	testutil.WaitForResult(func() (bool, error) {
  1863  		last := upd.Last()
  1864  		if last == nil {
  1865  			return false, fmt.Errorf("No updates")
  1866  		}
  1867  		if last.ClientStatus != structs.AllocClientStatusFailed {
  1868  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusFailed)
  1869  		}
  1870  
  1871  		// Task One should be killed
  1872  		state1 := last.TaskStates[task.Name]
  1873  		if state1.State != structs.TaskStateDead {
  1874  			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
  1875  		}
  1876  		if len(state1.Events) < 2 {
  1877  			// At least have a received and destroyed
  1878  			return false, fmt.Errorf("Unexpected number of events")
  1879  		}
  1880  
  1881  		found := false
  1882  		for _, e := range state1.Events {
  1883  			if e.Type == structs.TaskSiblingFailed {
  1884  				found = true
  1885  			}
  1886  		}
  1887  
  1888  		if !found {
  1889  			return false, fmt.Errorf("Did not find event %v", structs.TaskSiblingFailed)
  1890  		}
  1891  
  1892  		// Task Two should be failed
  1893  		state2 := last.TaskStates[task2.Name]
  1894  		if state2.State != structs.TaskStateDead {
  1895  			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
  1896  		}
  1897  		if !state2.Failed {
  1898  			return false, fmt.Errorf("task2 should have failed")
  1899  		}
  1900  
  1901  		if !last.DeploymentStatus.HasHealth() {
  1902  			return false, fmt.Errorf("Expected deployment health to be non nil")
  1903  		}
  1904  
  1905  		return true, nil
  1906  	}, func(err error) {
  1907  		require.NoError(t, err)
  1908  	})
  1909  }
  1910  
  1911  // Test that an alloc becoming terminal stops its tasks and lets the alloc runner be destroyed
  1912  func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) {
  1913  	ci.Parallel(t)
  1914  	alloc := mock.BatchAlloc()
  1915  	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
  1916  	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0
  1917  	alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy.Attempts = 0
  1918  	// Ensure task takes some time
  1919  	task := alloc.Job.TaskGroups[0].Tasks[0]
  1920  	task.Driver = "mock_driver"
  1921  	task.Config["run_for"] = "10s"
  1922  	alloc.AllocatedResources.Tasks[task.Name] = tr
  1923  
  1924  	conf, cleanup := testAllocRunnerConfig(t, alloc)
  1925  	defer cleanup()
  1926  	ar, err := NewAllocRunner(conf)
  1927  	require.NoError(t, err)
  1928  	defer destroy(ar)
  1929  	go ar.Run()
  1930  	upd := conf.StateUpdater.(*MockStateUpdater)
  1931  
  1932  	testutil.WaitForResult(func() (bool, error) {
  1933  		last := upd.Last()
  1934  		if last == nil {
  1935  			return false, fmt.Errorf("No updates")
  1936  		}
  1937  		if last.ClientStatus != structs.AllocClientStatusRunning {
  1938  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
  1939  		}
  1940  		return true, nil
  1941  	}, func(err error) {
  1942  		require.NoError(t, err)
  1943  	})
  1944  
  1945  	// Update the alloc to be terminal which should cause the alloc runner to
  1946  	// stop the tasks and wait for a destroy.
  1947  	update := ar.alloc.Copy()
  1948  	update.DesiredStatus = structs.AllocDesiredStatusStop
  1949  	ar.Update(update)
  1950  
  1951  	testutil.WaitForResult(func() (bool, error) {
  1952  		last := upd.Last()
  1953  		if last == nil {
  1954  			return false, fmt.Errorf("No updates")
  1955  		}
  1956  
  1957  		// Check the status has changed.
  1958  		if last.ClientStatus != structs.AllocClientStatusComplete {
  1959  			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
  1960  		}
  1961  
  1962  		// Check the alloc directory still exists
  1963  		if _, err := os.Stat(ar.allocDir.AllocDir); err != nil {
  1964  			return false, fmt.Errorf("alloc dir destroyed: %v", ar.allocDir.AllocDir)
  1965  		}
  1966  
  1967  		return true, nil
  1968  	}, func(err error) {
  1969  		require.NoError(t, err)
  1970  	})
  1971  
  1972  	// Send the destroy signal and ensure the AllocRunner cleans up.
  1973  	ar.Destroy()
  1974  
  1975  	testutil.WaitForResult(func() (bool, error) {
  1976  		last := upd.Last()
  1977  		if last == nil {
  1978  			return false, fmt.Errorf("No updates")
  1979  		}
  1980  
  1981  		// Check the status has changed.
  1982  		if last.ClientStatus != structs.AllocClientStatusComplete {
  1983  			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
  1984  		}
  1985  
  1986  		// Check the alloc directory was cleaned
  1987  		if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
  1988  			return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir)
  1989  		} else if !os.IsNotExist(err) {
  1990  			return false, fmt.Errorf("stat err: %v", err)
  1991  		}
  1992  
  1993  		return true, nil
  1994  	}, func(err error) {
  1995  		require.NoError(t, err)
  1996  	})
  1997  }
  1998  
  1999  // TestAllocRunner_PersistState_Destroyed asserts that an alloc's state is no longer persisted once it has been destroyed
  2000  func TestAllocRunner_PersistState_Destroyed(t *testing.T) {
  2001  	ci.Parallel(t)
  2002  
  2003  	alloc := mock.BatchAlloc()
  2004  	taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name
  2005  
  2006  	conf, cleanup := testAllocRunnerConfig(t, alloc)
  2007  	conf.StateDB = state.NewMemDB(conf.Logger)
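        	// A memory-backed StateDB lets the test inspect exactly what the
        	// runner has persisted at each stage below.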
  2008  
  2009  	defer cleanup()
  2010  	ar, err := NewAllocRunner(conf)
  2011  	require.NoError(t, err)
  2012  	defer destroy(ar)
  2013  
  2014  	go ar.Run()
  2015  
  2016  	select {
  2017  	case <-ar.WaitCh():
  2018  	case <-time.After(10 * time.Second):
  2019  		require.Fail(t, "timed out waiting for alloc to complete")
  2020  	}
  2021  
  2022  	// test final persisted state upon completion
  2023  	require.NoError(t, ar.PersistState())
  2024  	allocs, _, err := conf.StateDB.GetAllAllocations()
  2025  	require.NoError(t, err)
  2026  	require.Len(t, allocs, 1)
  2027  	require.Equal(t, alloc.ID, allocs[0].ID)
  2028  	_, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, taskName)
  2029  	require.NoError(t, err)
  2030  	require.Equal(t, structs.TaskStateDead, ts.State)
  2031  
  2032  	// check that DB alloc is empty after destroying AR
  2033  	ar.Destroy()
  2034  	select {
  2035  	case <-ar.DestroyCh():
  2036  	case <-time.After(10 * time.Second):
  2037  		require.Fail(t, "timed out waiting for destruction")
  2038  	}
  2039  
  2040  	allocs, _, err = conf.StateDB.GetAllAllocations()
  2041  	require.NoError(t, err)
  2042  	require.Empty(t, allocs)
  2043  	_, ts, err = conf.StateDB.GetTaskRunnerState(alloc.ID, taskName)
  2044  	require.NoError(t, err)
  2045  	require.Nil(t, ts)
  2046  
  2047  	// check that DB alloc is empty after persisting state of destroyed AR
  2048  	ar.PersistState()
  2049  	allocs, _, err = conf.StateDB.GetAllAllocations()
  2050  	require.NoError(t, err)
  2051  	require.Empty(t, allocs)
  2052  	_, ts, err = conf.StateDB.GetTaskRunnerState(alloc.ID, taskName)
  2053  	require.NoError(t, err)
  2054  	require.Nil(t, ts)
  2055  }
  2056  
  2057  func TestAllocRunner_Reconnect(t *testing.T) {
  2058  	ci.Parallel(t)
  2059  
  2060  	type tcase struct {
  2061  		clientStatus string
  2062  		taskState    string
  2063  		taskEvent    *structs.TaskEvent
  2064  	}
  2065  	tcases := []tcase{
  2066  		{
  2067  			structs.AllocClientStatusRunning,
  2068  			structs.TaskStateRunning,
  2069  			structs.NewTaskEvent(structs.TaskStarted),
  2070  		},
  2071  		{
  2072  			structs.AllocClientStatusComplete,
  2073  			structs.TaskStateDead,
  2074  			structs.NewTaskEvent(structs.TaskTerminated),
  2075  		},
  2076  		{
  2077  			structs.AllocClientStatusFailed,
  2078  			structs.TaskStateDead,
  2079  			structs.NewTaskEvent(structs.TaskDriverFailure).SetFailsTask(),
  2080  		},
  2081  		{
  2082  			structs.AllocClientStatusPending,
  2083  			structs.TaskStatePending,
  2084  			structs.NewTaskEvent(structs.TaskReceived),
  2085  		},
  2086  	}
  2087  
  2088  	for _, tc := range tcases {
  2089  		t.Run(tc.clientStatus, func(t *testing.T) {
  2090  			// create a running alloc
  2091  			alloc := mock.BatchAlloc()
  2092  			alloc.AllocModifyIndex = 10
  2093  			alloc.ModifyIndex = 10
  2094  			alloc.ModifyTime = time.Now().UnixNano()
  2095  
  2096  			// Ensure task takes some time
  2097  			task := alloc.Job.TaskGroups[0].Tasks[0]
  2098  			task.Driver = "mock_driver"
  2099  			task.Config["run_for"] = "30s"
  2100  
  2101  			original := alloc.Copy()
  2102  
  2103  			conf, cleanup := testAllocRunnerConfig(t, alloc)
  2104  			defer cleanup()
  2105  
  2106  			ar, err := NewAllocRunner(conf)
  2107  			require.NoError(t, err)
  2108  			defer destroy(ar)
  2109  
  2110  			go ar.Run()
  2111  
  2112  			for _, taskRunner := range ar.tasks {
  2113  				taskRunner.UpdateState(tc.taskState, tc.taskEvent)
  2114  			}
  2115  
  2116  			update := ar.Alloc().Copy()
  2117  
  2118  			update.ClientStatus = structs.AllocClientStatusUnknown
  2119  			update.AllocModifyIndex = original.AllocModifyIndex + 10
  2120  			update.ModifyIndex = original.ModifyIndex + 10
  2121  			update.ModifyTime = original.ModifyTime + 10
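        			// This mimics the server having marked the alloc unknown after
        			// losing contact with the client; Reconnect below reconciles it.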
  2122  
  2123  			err = ar.Reconnect(update)
  2124  			require.NoError(t, err)
  2125  
  2126  			require.Equal(t, tc.clientStatus, ar.AllocState().ClientStatus)
  2127  
  2128  			// Make sure the runner's alloc indexes match the update.
  2129  			require.Equal(t, update.AllocModifyIndex, ar.Alloc().AllocModifyIndex)
  2130  			require.Equal(t, update.ModifyIndex, ar.Alloc().ModifyIndex)
  2131  			require.Equal(t, update.ModifyTime, ar.Alloc().ModifyTime)
  2132  
  2133  			found := false
  2134  
  2135  			updater := conf.StateUpdater.(*MockStateUpdater)
  2136  			var last *structs.Allocation
  2137  			testutil.WaitForResult(func() (bool, error) {
  2138  				last = updater.Last()
  2139  				if last == nil {
  2140  					return false, errors.New("last update nil")
  2141  				}
  2142  
  2143  				states := last.TaskStates
  2144  				for _, s := range states {
  2145  					for _, e := range s.Events {
  2146  						if e.Type == structs.TaskClientReconnected {
  2147  							found = true
  2148  							return true, nil
  2149  						}
  2150  					}
  2151  				}
  2152  
  2153  				return false, errors.New("no reconnect event found")
  2154  			}, func(err error) {
  2155  				require.NoError(t, err)
  2156  			})
  2157  
  2158  			require.True(t, found, "no reconnect event found")
  2159  		})
  2160  	}
  2161  }
  2162  
  2163  // TestAllocRunner_Lifecycle_Shutdown_Order asserts that a service job with 3
  2164  // lifecycle hooks (1 sidecar, 1 ephemeral, 1 poststop) starts all 4 tasks, and shuts down
  2165  // the sidecar after main, but before poststop.
  2166  func TestAllocRunner_Lifecycle_Shutdown_Order(t *testing.T) {
        	ci.Parallel(t)

  2167  	alloc := mock.LifecycleAllocWithPoststopDeploy()
  2168  
  2169  	alloc.Job.Type = structs.JobTypeService
  2170  
  2171  	mainTask := alloc.Job.TaskGroups[0].Tasks[0]
  2172  	mainTask.Config["run_for"] = "100s"
  2173  
  2174  	sidecarTask := alloc.Job.TaskGroups[0].Tasks[1]
  2175  	sidecarTask.Lifecycle.Hook = structs.TaskLifecycleHookPoststart
  2176  	sidecarTask.Config["run_for"] = "100s"
  2177  
  2178  	poststopTask := alloc.Job.TaskGroups[0].Tasks[2]
  2179  	ephemeralTask := alloc.Job.TaskGroups[0].Tasks[3]
  2180  
  2181  	alloc.Job.TaskGroups[0].Tasks = []*structs.Task{mainTask, ephemeralTask, sidecarTask, poststopTask}
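        	// Slice order above does not dictate shutdown order: the runner must
        	// stop main first, then the sidecar, and run poststop last, which the
        	// FinishedAt comparisons below assert.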
  2182  
  2183  	conf, cleanup := testAllocRunnerConfig(t, alloc)
  2184  	defer cleanup()
  2185  	ar, err := NewAllocRunner(conf)
  2186  	require.NoError(t, err)
  2187  	defer destroy(ar)
  2188  	go ar.Run()
  2189  
  2190  	upd := conf.StateUpdater.(*MockStateUpdater)
  2191  
  2192  	// Wait for the main and sidecar tasks to be running, and for the
  2193  	// ephemeral task to have run and exited.
  2194  	testutil.WaitForResult(func() (bool, error) {
  2195  		last := upd.Last()
  2196  		if last == nil {
  2197  			return false, fmt.Errorf("No updates")
  2198  		}
  2199  
  2200  		if last.ClientStatus != structs.AllocClientStatusRunning {
  2201  			return false, fmt.Errorf("expected alloc to be running not %s", last.ClientStatus)
  2202  		}
  2203  
  2204  		if s := last.TaskStates[mainTask.Name].State; s != structs.TaskStateRunning {
  2205  			return false, fmt.Errorf("expected main task to be running not %s", s)
  2206  		}
  2207  
  2208  		if s := last.TaskStates[sidecarTask.Name].State; s != structs.TaskStateRunning {
  2209  			return false, fmt.Errorf("expected sidecar task to be running not %s", s)
  2210  		}
  2211  
  2212  		if s := last.TaskStates[ephemeralTask.Name].State; s != structs.TaskStateDead {
  2213  			return false, fmt.Errorf("expected ephemeral task to be dead not %s", s)
  2214  		}
  2215  
  2216  		if last.TaskStates[ephemeralTask.Name].Failed {
  2217  			return false, fmt.Errorf("expected ephemeral task to be successful not failed")
  2218  		}
  2219  
  2220  		return true, nil
  2221  	}, func(err error) {
  2222  		t.Fatalf("error waiting for initial state:\n%v", err)
  2223  	})
  2224  
  2225  	// Tell the alloc to stop
  2226  	stopAlloc := alloc.Copy()
  2227  	stopAlloc.DesiredStatus = structs.AllocDesiredStatusStop
  2228  	ar.Update(stopAlloc)
  2229  
  2230  	// Wait for tasks to stop.
  2231  	testutil.WaitForResult(func() (bool, error) {
  2232  		last := upd.Last()
  2233  
  2234  		if s := last.TaskStates[ephemeralTask.Name].State; s != structs.TaskStateDead {
  2235  			return false, fmt.Errorf("expected ephemeral task to be dead not %s", s)
  2236  		}
  2237  
  2238  		if last.TaskStates[ephemeralTask.Name].Failed {
  2239  			return false, fmt.Errorf("expected ephemeral task to be successful not failed")
  2240  		}
  2241  
  2242  		if s := last.TaskStates[mainTask.Name].State; s != structs.TaskStateDead {
  2243  			return false, fmt.Errorf("expected main task to be dead not %s", s)
  2244  		}
  2245  
  2246  		if last.TaskStates[mainTask.Name].Failed {
  2247  			return false, fmt.Errorf("expected main task to be successful not failed")
  2248  		}
  2249  
  2250  		if s := last.TaskStates[sidecarTask.Name].State; s != structs.TaskStateDead {
  2251  			return false, fmt.Errorf("expected sidecar task to be dead not %s", s)
  2252  		}
  2253  
  2254  		if last.TaskStates[sidecarTask.Name].Failed {
  2255  			return false, fmt.Errorf("expected sidecar task to be successful not failed")
  2256  		}
  2257  
  2258  		if s := last.TaskStates[poststopTask.Name].State; s != structs.TaskStateRunning {
  2259  			return false, fmt.Errorf("expected poststop task to be running not %s", s)
  2260  		}
  2261  
  2262  		return true, nil
  2263  	}, func(err error) {
  2264  		t.Fatalf("error waiting for kill state:\n%v", err)
  2265  	})
  2266  
  2267  	last := upd.Last()
  2268  	require.Less(t, last.TaskStates[ephemeralTask.Name].FinishedAt, last.TaskStates[mainTask.Name].FinishedAt)
  2269  	require.Less(t, last.TaskStates[mainTask.Name].FinishedAt, last.TaskStates[sidecarTask.Name].FinishedAt)
  2270  
  2271  	// Wait for poststop task to stop.
  2272  	testutil.WaitForResult(func() (bool, error) {
  2273  		last := upd.Last()
  2274  
  2275  		if s := last.TaskStates[poststopTask.Name].State; s != structs.TaskStateDead {
  2276  			return false, fmt.Errorf("expected poststop task to be dead not %s", s)
  2277  		}
  2278  
  2279  		if last.TaskStates[poststopTask.Name].Failed {
  2280  			return false, fmt.Errorf("expected poststop task to be successful not failed")
  2281  		}
  2282  
  2283  		return true, nil
  2284  	}, func(err error) {
  2285  		t.Fatalf("error waiting for poststop state:\n%v", err)
  2286  	})
  2287  
  2288  	last = upd.Last()
  2289  	require.Less(t, last.TaskStates[sidecarTask.Name].FinishedAt, last.TaskStates[poststopTask.Name].FinishedAt)
  2290  }
  2291  
  2292  func TestHasSidecarTasks(t *testing.T) {
  2293  	ci.Parallel(t)
  2294  
  2295  	testCases := []struct {
  2296  		name           string
  2297  		lifecycle      []*structs.TaskLifecycleConfig
  2298  		hasSidecars    bool
  2299  		hasNonsidecars bool
  2300  	}{
  2301  		{
  2302  			name: "all sidecar - one",
  2303  			lifecycle: []*structs.TaskLifecycleConfig{
  2304  				{
  2305  					Hook:    structs.TaskLifecycleHookPrestart,
  2306  					Sidecar: true,
  2307  				},
  2308  			},
  2309  			hasSidecars:    true,
  2310  			hasNonsidecars: false,
  2311  		},
  2312  		{
  2313  			name: "all sidecar - multiple",
  2314  			lifecycle: []*structs.TaskLifecycleConfig{
  2315  				{
  2316  					Hook:    structs.TaskLifecycleHookPrestart,
  2317  					Sidecar: true,
  2318  				},
  2319  				{
  2320  					Hook:    structs.TaskLifecycleHookPrestart,
  2321  					Sidecar: true,
  2322  				},
  2323  				{
  2324  					Hook:    structs.TaskLifecycleHookPrestart,
  2325  					Sidecar: true,
  2326  				},
  2327  			},
  2328  			hasSidecars:    true,
  2329  			hasNonsidecars: false,
  2330  		},
  2331  		{
  2332  			name: "some sidecars, some others",
  2333  			lifecycle: []*structs.TaskLifecycleConfig{
  2334  				nil,
  2335  				{
  2336  					Hook:    structs.TaskLifecycleHookPrestart,
  2337  					Sidecar: false,
  2338  				},
  2339  				{
  2340  					Hook:    structs.TaskLifecycleHookPrestart,
  2341  					Sidecar: true,
  2342  				},
  2343  			},
  2344  			hasSidecars:    true,
  2345  			hasNonsidecars: true,
  2346  		},
  2347  		{
  2348  			name: "no sidecars",
  2349  			lifecycle: []*structs.TaskLifecycleConfig{
  2350  				nil,
  2351  				{
  2352  					Hook:    structs.TaskLifecycleHookPrestart,
  2353  					Sidecar: false,
  2354  				},
  2355  				nil,
  2356  			},
  2357  			hasSidecars:    false,
  2358  			hasNonsidecars: true,
  2359  		},
  2360  	}
  2361  
  2362  	for _, tc := range testCases {
  2363  		t.Run(tc.name, func(t *testing.T) {
  2364  			// Create alloc with the given task lifecycle configurations.
  2365  			alloc := mock.BatchAlloc()
  2366  
  2367  			tasks := []*structs.Task{}
  2368  			resources := map[string]*structs.AllocatedTaskResources{}
  2369  
  2370  			tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
  2371  
  2372  			for i, lifecycle := range tc.lifecycle {
  2373  				task := alloc.Job.TaskGroups[0].Tasks[0].Copy()
  2374  				task.Name = fmt.Sprintf("task%d", i)
  2375  				task.Lifecycle = lifecycle
  2376  				tasks = append(tasks, task)
  2377  				resources[task.Name] = tr
  2378  			}
  2379  
  2380  			alloc.Job.TaskGroups[0].Tasks = tasks
  2381  			alloc.AllocatedResources.Tasks = resources
  2382  
  2383  			// Create alloc runner.
  2384  			arConf, cleanup := testAllocRunnerConfig(t, alloc)
  2385  			defer cleanup()
  2386  
  2387  			ar, err := NewAllocRunner(arConf)
  2388  			require.NoError(t, err)
  2389  
  2390  			require.Equal(t, tc.hasSidecars, hasSidecarTasks(ar.tasks), "sidecars")
  2391  
  2392  			runners := []*taskrunner.TaskRunner{}
  2393  			for _, r := range ar.tasks {
  2394  				runners = append(runners, r)
  2395  			}
  2396  			require.Equal(t, tc.hasNonsidecars, hasNonSidecarTasks(runners), "non-sidecars")
  2397  
  2398  		})
  2399  	}
  2400  }