github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/client/allocrunner/alloc_runner_test.go

     1  package allocrunner
     2  
     3  import (
     4  	"fmt"
     5  	"io/ioutil"
     6  	"os"
     7  	"path/filepath"
     8  	"testing"
     9  	"time"
    10  
    11  	"github.com/hashicorp/consul/api"
    12  	"github.com/hashicorp/nomad/client/allochealth"
    13  	"github.com/hashicorp/nomad/client/allocwatcher"
    14  	cconsul "github.com/hashicorp/nomad/client/consul"
    15  	"github.com/hashicorp/nomad/client/state"
    16  	"github.com/hashicorp/nomad/command/agent/consul"
    17  	"github.com/hashicorp/nomad/helper/uuid"
    18  	"github.com/hashicorp/nomad/nomad/mock"
    19  	"github.com/hashicorp/nomad/nomad/structs"
    20  	"github.com/hashicorp/nomad/testutil"
    21  	"github.com/stretchr/testify/require"
    22  )
    23  
    24  // destroy does a blocking destroy on an alloc runner
    25  func destroy(ar *allocRunner) {
    26  	ar.Destroy()
    27  	<-ar.DestroyCh()
    28  }
    29  
    30  // TestAllocRunner_AllocState_Initialized asserts that TaskStates returned by
    31  // AllocState() are initialized even before the AllocRunner has run.
    32  func TestAllocRunner_AllocState_Initialized(t *testing.T) {
    33  	t.Parallel()
    34  
    35  	alloc := mock.Alloc()
    36  	alloc.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
    37  	conf, cleanup := testAllocRunnerConfig(t, alloc)
    38  	defer cleanup()
    39  
    40  	ar, err := NewAllocRunner(conf)
    41  	require.NoError(t, err)
    42  
    43  	allocState := ar.AllocState()
    44  
    45  	require.NotNil(t, allocState)
    46  	require.NotNil(t, allocState.TaskStates[conf.Alloc.Job.TaskGroups[0].Tasks[0].Name])
    47  }
    48  
    49  // TestAllocRunner_TaskLeader_KillTG asserts that when a leader task dies the
    50  // entire task group is killed.
    51  func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
    52  	t.Parallel()
    53  
    54  	alloc := mock.BatchAlloc()
    55  	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
    56  	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0
    57  	alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy.Attempts = 0
    58  
    59  	// Create two tasks in the task group
    60  	task := alloc.Job.TaskGroups[0].Tasks[0]
    61  	task.Name = "task1"
    62  	task.Driver = "mock_driver"
    63  	task.KillTimeout = 10 * time.Millisecond
    64  	task.Config = map[string]interface{}{
    65  		"run_for": "10s",
    66  	}
    67  
    68  	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
    69  	task2.Name = "task2"
    70  	task2.Driver = "mock_driver"
    71  	task2.Leader = true
    72  	task2.Config = map[string]interface{}{
    73  		"run_for": "1s",
    74  	}
    75  	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)
    76  	alloc.AllocatedResources.Tasks[task.Name] = tr
    77  	alloc.AllocatedResources.Tasks[task2.Name] = tr
    78  
    79  	conf, cleanup := testAllocRunnerConfig(t, alloc)
    80  	defer cleanup()
    81  	ar, err := NewAllocRunner(conf)
    82  	require.NoError(t, err)
    83  	defer destroy(ar)
    84  	go ar.Run()
    85  
    86  	// Wait for all tasks to be killed
    87  	upd := conf.StateUpdater.(*MockStateUpdater)
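        	// MockStateUpdater records the client status updates published by the
        	// alloc runner; Last() returns the most recent one (nil if none yet).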
    88  	testutil.WaitForResult(func() (bool, error) {
    89  		last := upd.Last()
    90  		if last == nil {
    91  			return false, fmt.Errorf("No updates")
    92  		}
    93  		if last.ClientStatus != structs.AllocClientStatusComplete {
    94  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
    95  		}
    96  
    97  		// Task1 should be killed because Task2 exited
    98  		state1 := last.TaskStates[task.Name]
    99  		if state1.State != structs.TaskStateDead {
   100  			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
   101  		}
   102  		if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() {
   103  			return false, fmt.Errorf("expected to have a start and finish time")
   104  		}
   105  		if len(state1.Events) < 2 {
   106  			// At least have a received and destroyed
   107  			return false, fmt.Errorf("Unexpected number of events")
   108  		}
   109  
   110  		found := false
   111  		killingMsg := ""
   112  		for _, e := range state1.Events {
   113  			if e.Type != structs.TaskLeaderDead {
   114  			if e.Type == structs.TaskLeaderDead {
   115  			}
   116  			if e.Type == structs.TaskKilling {
   117  				killingMsg = e.DisplayMessage
   118  			}
   119  		}
   120  
   121  		if !found {
   122  			return false, fmt.Errorf("Did not find event %v", structs.TaskLeaderDead)
   123  		}
   124  
   125  		expectedKillingMsg := "Sent interrupt. Waiting 10ms before force killing"
   126  		if killingMsg != expectedKillingMsg {
   127  			return false, fmt.Errorf("Unexpected task event message - wanted %q. got %q", expectedKillingMsg, killingMsg)
   128  		}
   129  
   130  		// Task Two should be dead
   131  		state2 := last.TaskStates[task2.Name]
   132  		if state2.State != structs.TaskStateDead {
   133  			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
   134  		}
   135  		if state2.FinishedAt.IsZero() || state2.StartedAt.IsZero() {
   136  			return false, fmt.Errorf("expected to have a start and finish time")
   137  		}
   138  
   139  		return true, nil
   140  	}, func(err error) {
   141  		t.Fatalf("err: %v", err)
   142  	})
   143  }
   144  
   145  func TestAllocRunner_TaskGroup_ShutdownDelay(t *testing.T) {
   146  	t.Parallel()
   147  
   148  	alloc := mock.Alloc()
   149  	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
   150  	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0
   151  	alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy.Attempts = 0
   152  
   153  	// Create a group service
   154  	tg := alloc.Job.TaskGroups[0]
   155  	tg.Services = []*structs.Service{
   156  		{
   157  			Name: "shutdown_service",
   158  		},
   159  	}
   160  
   161  	// Create two tasks in the group
   162  	task := alloc.Job.TaskGroups[0].Tasks[0]
   163  	task.Name = "follower1"
   164  	task.Driver = "mock_driver"
   165  	task.Config = map[string]interface{}{
   166  		"run_for": "10s",
   167  	}
   168  
   169  	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
   170  	task2.Name = "leader"
   171  	task2.Driver = "mock_driver"
   172  	task2.Leader = true
   173  	task2.Config = map[string]interface{}{
   174  		"run_for": "10s",
   175  	}
   176  
   177  	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)
   178  	alloc.AllocatedResources.Tasks[task.Name] = tr
   179  	alloc.AllocatedResources.Tasks[task2.Name] = tr
   180  
   181  	// Set a shutdown delay
   182  	shutdownDelay := 1 * time.Second
   183  	alloc.Job.TaskGroups[0].ShutdownDelay = &shutdownDelay
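        	// Stopping a group with shutdown_delay should first deregister the group
        	// service from Consul and then wait the full delay before killing the
        	// tasks; the timing assertions at the end of this test rely on that ordering.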
   184  
   185  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   186  	defer cleanup()
   187  	ar, err := NewAllocRunner(conf)
   188  	require.NoError(t, err)
   189  	defer destroy(ar)
   190  	go ar.Run()
   191  
   192  	// Wait for tasks to start
   193  	upd := conf.StateUpdater.(*MockStateUpdater)
   194  	last := upd.Last()
   195  	testutil.WaitForResult(func() (bool, error) {
   196  		last = upd.Last()
   197  		if last == nil {
   198  			return false, fmt.Errorf("No updates")
   199  		}
   200  		if n := len(last.TaskStates); n != 2 {
   201  			return false, fmt.Errorf("Not enough task states (want: 2; found %d)", n)
   202  		}
   203  		for name, state := range last.TaskStates {
   204  			if state.State != structs.TaskStateRunning {
   205  				return false, fmt.Errorf("Task %q is not running yet (it's %q)", name, state.State)
   206  			}
   207  		}
   208  		return true, nil
   209  	}, func(err error) {
   210  		t.Fatalf("err: %v", err)
   211  	})
   212  
   213  	// Reset updates
   214  	upd.Reset()
   215  
   216  	// Stop alloc
   217  	shutdownInit := time.Now()
   218  	update := alloc.Copy()
   219  	update.DesiredStatus = structs.AllocDesiredStatusStop
   220  	ar.Update(update)
   221  
   222  	// Wait for tasks to stop
   223  	testutil.WaitForResult(func() (bool, error) {
   224  		last := upd.Last()
   225  		if last == nil {
   226  			return false, fmt.Errorf("No updates")
   227  		}
   228  
   229  		fin := last.TaskStates["leader"].FinishedAt
   230  
   231  		if fin.IsZero() {
   232  			return false, nil
   233  		}
   234  
   235  		return true, nil
   236  	}, func(err error) {
   237  		last := upd.Last()
   238  		for name, state := range last.TaskStates {
   239  			t.Logf("%s: %s", name, state.State)
   240  		}
   241  		t.Fatalf("err: %v", err)
   242  	})
   243  
   244  	// Get consul client operations
   245  	consulClient := conf.Consul.(*cconsul.MockConsulServiceClient)
   246  	consulOpts := consulClient.GetOps()
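        	// mock.Alloc's task group is named "web", so the mock Consul client is
        	// expected to record the group service deregistration under "group-web".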
   247  	var groupRemoveOp cconsul.MockConsulOp
   248  	for _, op := range consulOpts {
   249  		// Grab the first deregistration request
   250  		if op.Op == "remove" && op.Name == "group-web" {
   251  			groupRemoveOp = op
   252  			break
   253  		}
   254  	}
   255  
   256  	// Ensure remove operation is close to shutdown initiation
   257  	require.True(t, groupRemoveOp.OccurredAt.Sub(shutdownInit) < 100*time.Millisecond)
   258  
   259  	last = upd.Last()
   260  	minShutdown := shutdownInit.Add(shutdownDelay)
   261  	leaderFinished := last.TaskStates["leader"].FinishedAt
   262  	followerFinished := last.TaskStates["follower1"].FinishedAt
   263  
   264  	// Check that both tasks shut down after min possible shutdown time
   265  	require.Greater(t, leaderFinished.UnixNano(), minShutdown.UnixNano())
   266  	require.Greater(t, followerFinished.UnixNano(), minShutdown.UnixNano())
   267  
   268  	// Check that there is at least shutdown_delay between consul
   269  	// remove operation and task finished at time
   270  	require.True(t, leaderFinished.Sub(groupRemoveOp.OccurredAt) > shutdownDelay)
   271  }
   272  
   273  // TestAllocRunner_TaskLeader_StopTG asserts that when stopping an alloc with a
   274  // leader, the leader is stopped before the other tasks.
   275  func TestAllocRunner_TaskLeader_StopTG(t *testing.T) {
   276  	t.Parallel()
   277  
   278  	alloc := mock.Alloc()
   279  	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
   280  	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0
   281  	alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy.Attempts = 0
   282  
   283  	// Create 3 tasks in the task group
   284  	task := alloc.Job.TaskGroups[0].Tasks[0]
   285  	task.Name = "follower1"
   286  	task.Driver = "mock_driver"
   287  	task.Config = map[string]interface{}{
   288  		"run_for": "10s",
   289  	}
   290  
   291  	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
   292  	task2.Name = "leader"
   293  	task2.Driver = "mock_driver"
   294  	task2.Leader = true
   295  	task2.Config = map[string]interface{}{
   296  		"run_for": "10s",
   297  	}
   298  
   299  	task3 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
   300  	task3.Name = "follower2"
   301  	task3.Driver = "mock_driver"
   302  	task3.Config = map[string]interface{}{
   303  		"run_for": "10s",
   304  	}
   305  	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2, task3)
   306  	alloc.AllocatedResources.Tasks[task.Name] = tr
   307  	alloc.AllocatedResources.Tasks[task2.Name] = tr
   308  	alloc.AllocatedResources.Tasks[task3.Name] = tr
   309  
   310  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   311  	defer cleanup()
   312  	ar, err := NewAllocRunner(conf)
   313  	require.NoError(t, err)
   314  	defer destroy(ar)
   315  	go ar.Run()
   316  
   317  	// Wait for tasks to start
   318  	upd := conf.StateUpdater.(*MockStateUpdater)
   319  	last := upd.Last()
   320  	testutil.WaitForResult(func() (bool, error) {
   321  		last = upd.Last()
   322  		if last == nil {
   323  			return false, fmt.Errorf("No updates")
   324  		}
   325  		if n := len(last.TaskStates); n != 3 {
   326  			return false, fmt.Errorf("Not enough task states (want: 3; found %d)", n)
   327  		}
   328  		for name, state := range last.TaskStates {
   329  			if state.State != structs.TaskStateRunning {
   330  				return false, fmt.Errorf("Task %q is not running yet (it's %q)", name, state.State)
   331  			}
   332  		}
   333  		return true, nil
   334  	}, func(err error) {
   335  		t.Fatalf("err: %v", err)
   336  	})
   337  
   338  	// Reset updates
   339  	upd.Reset()
   340  
   341  	// Stop alloc
   342  	update := alloc.Copy()
   343  	update.DesiredStatus = structs.AllocDesiredStatusStop
   344  	ar.Update(update)
   345  
   346  	// Wait for tasks to stop
   347  	testutil.WaitForResult(func() (bool, error) {
   348  		last := upd.Last()
   349  		if last == nil {
   350  			return false, fmt.Errorf("No updates")
   351  		}
   352  		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower1"].FinishedAt.UnixNano() {
   353  			return false, fmt.Errorf("expected leader to finish before follower1: %s >= %s",
   354  				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower1"].FinishedAt)
   355  		}
   356  		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower2"].FinishedAt.UnixNano() {
   357  			return false, fmt.Errorf("expected leader to finish before follower2: %s >= %s",
   358  				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower2"].FinishedAt)
   359  		}
   360  		return true, nil
   361  	}, func(err error) {
   362  		last := upd.Last()
   363  		for name, state := range last.TaskStates {
   364  			t.Logf("%s: %s", name, state.State)
   365  		}
   366  		t.Fatalf("err: %v", err)
   367  	})
   368  }
   369  
   370  // TestAllocRunner_TaskLeader_StopRestoredTG asserts that when stopping a
   371  // restored task group whose leader failed before the restore, the leader is
   372  // not stopped again since it no longer exists.
   373  // See https://github.com/hashicorp/nomad/issues/3420#issuecomment-341666932
   374  func TestAllocRunner_TaskLeader_StopRestoredTG(t *testing.T) {
   375  	t.Parallel()
   376  
   377  	alloc := mock.Alloc()
   378  	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
   379  	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0
   380  	alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy.Attempts = 0
   381  
   382  	// Create a leader and follower task in the task group
   383  	task := alloc.Job.TaskGroups[0].Tasks[0]
   384  	task.Name = "follower1"
   385  	task.Driver = "mock_driver"
   386  	task.KillTimeout = 10 * time.Second
   387  	task.Config = map[string]interface{}{
   388  		"run_for": "10s",
   389  	}
   390  
   391  	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
   392  	task2.Name = "leader"
   393  	task2.Driver = "mock_driver"
   394  	task2.Leader = true
   395  	task2.KillTimeout = 10 * time.Millisecond
   396  	task2.Config = map[string]interface{}{
   397  		"run_for": "10s",
   398  	}
   399  
   400  	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)
   401  	alloc.AllocatedResources.Tasks[task.Name] = tr
   402  	alloc.AllocatedResources.Tasks[task2.Name] = tr
   403  
   404  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   405  	defer cleanup()
   406  
   407  	// Use a memory backed statedb
   408  	conf.StateDB = state.NewMemDB(conf.Logger)
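        	// A memory-backed state DB lets the task states written below be restored
        	// by the second AllocRunner without touching disk.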
   409  
   410  	ar, err := NewAllocRunner(conf)
   411  	require.NoError(t, err)
   412  
   413  	// Mimic Nomad exiting before the stopping leader was able to stop the other tasks.
   414  	ar.tasks["leader"].UpdateState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskKilled))
   415  	ar.tasks["follower1"].UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
   416  
   417  	// Create a new AllocRunner to test RestoreState and Run
   418  	ar2, err := NewAllocRunner(conf)
   419  	require.NoError(t, err)
   420  	defer destroy(ar2)
   421  
   422  	if err := ar2.Restore(); err != nil {
   423  		t.Fatalf("error restoring state: %v", err)
   424  	}
   425  	ar2.Run()
   426  
   427  	// Wait for tasks to be stopped because leader is dead
   428  	testutil.WaitForResult(func() (bool, error) {
   429  		alloc := ar2.Alloc()
   430  		// TODO: this check is currently a no-op: alloc.TaskStates is an empty map here
   431  		for task, state := range alloc.TaskStates {
   432  			if state.State != structs.TaskStateDead {
   433  				return false, fmt.Errorf("Task %q should be dead: %v", task, state.State)
   434  			}
   435  		}
   436  		return true, nil
   437  	}, func(err error) {
   438  		t.Fatalf("err: %v", err)
   439  	})
   440  
   441  	// Make sure it GCs properly
   442  	ar2.Destroy()
   443  
   444  	select {
   445  	case <-ar2.DestroyCh():
   446  		// exited as expected
   447  	case <-time.After(10 * time.Second):
   448  		t.Fatalf("timed out waiting for AR to GC")
   449  	}
   450  }
   451  
   452  func TestAllocRunner_Restore_LifecycleHooks(t *testing.T) {
   453  	t.Parallel()
   454  
   455  	alloc := mock.LifecycleAlloc()
   456  
   457  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   458  	defer cleanup()
   459  
   460  	// Use a memory backed statedb
   461  	conf.StateDB = state.NewMemDB(conf.Logger)
   462  
   463  	ar, err := NewAllocRunner(conf)
   464  	require.NoError(t, err)
   465  
   466  	// We should see that all tasks with Prestart hooks are not blocked from running:
   467  	// i.e. the "init" and "side" task hook coordinator channels are closed
   468  	require.Truef(t, isChannelClosed(ar.taskHookCoordinator.startConditionForTask(ar.tasks["init"].Task())), "init channel was open, should be closed")
   469  	require.Truef(t, isChannelClosed(ar.taskHookCoordinator.startConditionForTask(ar.tasks["side"].Task())), "side channel was open, should be closed")
   470  
   473  	// Mimic the client dying while the init task was running, and restarting after the init task finished
   474  	ar.tasks["init"].UpdateState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskTerminated))
   475  	ar.tasks["side"].UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
   476  
   477  	// Create a new AllocRunner to test RestoreState and Run
   478  	ar2, err := NewAllocRunner(conf)
   479  	require.NoError(t, err)
   480  
   481  	if err := ar2.Restore(); err != nil {
   482  		t.Fatalf("error restoring state: %v", err)
   483  	}
   484  
   485  	// We want to see Restore resume execution with correct hook ordering:
   486  	// i.e. we should see the "web" main task hook coordinator channel is closed
   487  	require.Truef(t, isChannelClosed(ar2.taskHookCoordinator.startConditionForTask(ar.tasks["web"].Task())), "web channel was open, should be closed")
   488  }
   489  
   490  func TestAllocRunner_Update_Semantics(t *testing.T) {
   491  	t.Parallel()
   492  	require := require.New(t)
   493  
   494  	updatedAlloc := func(a *structs.Allocation) *structs.Allocation {
   495  		upd := a.CopySkipJob()
   496  		upd.AllocModifyIndex++
   497  
   498  		return upd
   499  	}
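        	// Bumping AllocModifyIndex marks each copy as newer than the last; Update
        	// uses this index below to replace queued updates and skip stale ones.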
   500  
   501  	alloc := mock.Alloc()
   502  	alloc.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
   503  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   504  	defer cleanup()
   505  
   506  	ar, err := NewAllocRunner(conf)
   507  	require.NoError(err)
   508  
   509  	upd1 := updatedAlloc(alloc)
   510  	ar.Update(upd1)
   511  
   512  	// Update was placed into a queue
   513  	require.Len(ar.allocUpdatedCh, 1)
   514  
   515  	upd2 := updatedAlloc(alloc)
   516  	ar.Update(upd2)
   517  
   518  	// Allocation was _replaced_
   519  
   520  	require.Len(ar.allocUpdatedCh, 1)
   521  	queuedAlloc := <-ar.allocUpdatedCh
   522  	require.Equal(upd2, queuedAlloc)
   523  
   524  	// Requeueing older alloc is skipped
   525  	ar.Update(upd2)
   526  	ar.Update(upd1)
   527  
   528  	queuedAlloc = <-ar.allocUpdatedCh
   529  	require.Equal(upd2, queuedAlloc)
   530  
   531  	// Ignore after watch closed
   532  
   533  	close(ar.waitCh)
   534  
   535  	ar.Update(upd1)
   536  
   537  	// Did not queue the update
   538  	require.Len(ar.allocUpdatedCh, 0)
   539  }
   540  
   541  // TestAllocRunner_DeploymentHealth_Healthy_Migration asserts that health is
   542  // reported for migrated allocations, not just those that are part of a deployment.
   543  func TestAllocRunner_DeploymentHealth_Healthy_Migration(t *testing.T) {
   544  	t.Parallel()
   545  
   546  	alloc := mock.Alloc()
   547  
   548  	// Ensure the alloc is *not* part of a deployment
   549  	alloc.DeploymentID = ""
   550  
   551  	// Shorten the default migration healthy time
   552  	tg := alloc.Job.TaskGroups[0]
   553  	tg.Migrate = structs.DefaultMigrateStrategy()
   554  	tg.Migrate.MinHealthyTime = 100 * time.Millisecond
   555  	tg.Migrate.HealthCheck = structs.MigrateStrategyHealthStates
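        	// With the task_states health check, migration health is derived from task
        	// states alone; no Consul checks are required for the alloc to become healthy.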
   556  
   557  	task := tg.Tasks[0]
   558  	task.Driver = "mock_driver"
   559  	task.Config = map[string]interface{}{
   560  		"run_for": "30s",
   561  	}
   562  
   563  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   564  	defer cleanup()
   565  
   566  	ar, err := NewAllocRunner(conf)
   567  	require.NoError(t, err)
   568  	go ar.Run()
   569  	defer destroy(ar)
   570  
   571  	upd := conf.StateUpdater.(*MockStateUpdater)
   572  	testutil.WaitForResult(func() (bool, error) {
   573  		last := upd.Last()
   574  		if last == nil {
   575  			return false, fmt.Errorf("No updates")
   576  		}
   577  		if !last.DeploymentStatus.HasHealth() {
   578  			return false, fmt.Errorf("want deployment status healthy; got unset")
   579  		} else if !*last.DeploymentStatus.Healthy {
   580  			// This is fatal
   581  			t.Fatal("want deployment status healthy; got unhealthy")
   582  		}
   583  		return true, nil
   584  	}, func(err error) {
   585  		require.NoError(t, err)
   586  	})
   587  }
   588  
   589  // TestAllocRunner_DeploymentHealth_Healthy_NoChecks asserts that the health
   590  // watcher will mark the allocation as healthy based on task states alone.
   591  func TestAllocRunner_DeploymentHealth_Healthy_NoChecks(t *testing.T) {
   592  	t.Parallel()
   593  
   594  	alloc := mock.Alloc()
   595  
   596  	task := alloc.Job.TaskGroups[0].Tasks[0]
   597  	task.Driver = "mock_driver"
   598  	task.Config = map[string]interface{}{
   599  		"run_for": "10s",
   600  	}
   601  
   602  	// Create a task that takes longer to become healthy
   603  	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task.Copy())
   604  	alloc.AllocatedResources.Tasks["task2"] = alloc.AllocatedResources.Tasks["web"].Copy()
   605  	task2 := alloc.Job.TaskGroups[0].Tasks[1]
   606  	task2.Name = "task2"
   607  	task2.Config["start_block_for"] = "500ms"
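        	// start_block_for makes the mock driver block for 500ms before reporting
        	// the task as started, so the alloc should not become healthy before then;
        	// the elapsed-time check at the end of this test depends on that delay.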
   608  
   609  	// Make the alloc be part of a deployment that uses task states for
   610  	// health checks
   611  	alloc.DeploymentID = uuid.Generate()
   612  	alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
   613  	alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
   614  	alloc.Job.TaskGroups[0].Update.MaxParallel = 1
   615  	alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
   616  
   617  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   618  	defer cleanup()
   619  
   620  	ar, err := NewAllocRunner(conf)
   621  	require.NoError(t, err)
   622  
   623  	start, done := time.Now(), time.Time{}
   624  	go ar.Run()
   625  	defer destroy(ar)
   626  
   627  	upd := conf.StateUpdater.(*MockStateUpdater)
   628  	testutil.WaitForResult(func() (bool, error) {
   629  		last := upd.Last()
   630  		if last == nil {
   631  			return false, fmt.Errorf("No updates")
   632  		}
   633  		if !last.DeploymentStatus.HasHealth() {
   634  			return false, fmt.Errorf("want deployment status healthy; got unset")
   635  		} else if !*last.DeploymentStatus.Healthy {
   636  			// This is fatal
   637  			t.Fatal("want deployment status healthy; got unhealthy")
   638  		}
   639  
   640  		// Capture the done timestamp
   641  		done = last.DeploymentStatus.Timestamp
   642  		return true, nil
   643  	}, func(err error) {
   644  		require.NoError(t, err)
   645  	})
   646  
   647  	if d := done.Sub(start); d < 500*time.Millisecond {
   648  		t.Fatalf("didn't wait for second task group. Only took %v", d)
   649  	}
   650  }
   651  
   652  // TestAllocRunner_DeploymentHealth_Unhealthy_Checks asserts that the health
   653  // watcher will mark the allocation as unhealthy with failing checks.
   654  func TestAllocRunner_DeploymentHealth_Unhealthy_Checks(t *testing.T) {
   655  	t.Parallel()
   656  
   657  	alloc := mock.Alloc()
   658  	task := alloc.Job.TaskGroups[0].Tasks[0]
   659  	task.Driver = "mock_driver"
   660  	task.Config = map[string]interface{}{
   661  		"run_for": "10s",
   662  	}
   663  
   664  	// Set a service with check
   665  	task.Services = []*structs.Service{
   666  		{
   667  			Name:      "fakservice",
   668  			PortLabel: "http",
   669  			Checks: []*structs.ServiceCheck{
   670  				{
   671  					Name:     "fakecheck",
   672  					Type:     structs.ServiceCheckScript,
   673  					Command:  "true",
   674  					Interval: 30 * time.Second,
   675  					Timeout:  5 * time.Second,
   676  				},
   677  			},
   678  		},
   679  	}
   680  
   681  	// Make the alloc be part of a deployment
   682  	alloc.DeploymentID = uuid.Generate()
   683  	alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
   684  	alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks
   685  	alloc.Job.TaskGroups[0].Update.MaxParallel = 1
   686  	alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
   687  	alloc.Job.TaskGroups[0].Update.HealthyDeadline = 1 * time.Second
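        	// The registration below always returns a check in warning status, so the
        	// alloc can never pass the Consul health check and is expected to be marked
        	// unhealthy once the 1s HealthyDeadline expires.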
   688  
   689  	checkUnhealthy := &api.AgentCheck{
   690  		CheckID: uuid.Generate(),
   691  		Status:  api.HealthWarning,
   692  	}
   693  
   694  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   695  	defer cleanup()
   696  
   697  	// Only return the check as healthy after a duration
   698  	consulClient := conf.Consul.(*cconsul.MockConsulServiceClient)
   699  	consulClient.AllocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) {
   700  		return &consul.AllocRegistration{
   701  			Tasks: map[string]*consul.ServiceRegistrations{
   702  				task.Name: {
   703  					Services: map[string]*consul.ServiceRegistration{
   704  						"123": {
   705  							Service: &api.AgentService{Service: "fakeservice"},
   706  							Checks:  []*api.AgentCheck{checkUnhealthy},
   707  						},
   708  					},
   709  				},
   710  			},
   711  		}, nil
   712  	}
   713  
   714  	ar, err := NewAllocRunner(conf)
   715  	require.NoError(t, err)
   716  	go ar.Run()
   717  	defer destroy(ar)
   718  
   719  	var lastUpdate *structs.Allocation
   720  	upd := conf.StateUpdater.(*MockStateUpdater)
   721  	testutil.WaitForResult(func() (bool, error) {
   722  		lastUpdate = upd.Last()
   723  		if lastUpdate == nil {
   724  			return false, fmt.Errorf("No updates")
   725  		}
   726  		if !lastUpdate.DeploymentStatus.HasHealth() {
   727  			return false, fmt.Errorf("want deployment status unhealthy; got unset")
   728  		} else if *lastUpdate.DeploymentStatus.Healthy {
   729  			// This is fatal
   730  			t.Fatal("want deployment status unhealthy; got healthy")
   731  		}
   732  		return true, nil
   733  	}, func(err error) {
   734  		require.NoError(t, err)
   735  	})
   736  
   737  	// Assert that we have an event explaining why we are unhealthy.
   738  	require.Len(t, lastUpdate.TaskStates, 1)
   739  	state := lastUpdate.TaskStates[task.Name]
   740  	require.NotNil(t, state)
   741  	require.NotEmpty(t, state.Events)
   742  	last := state.Events[len(state.Events)-1]
   743  	require.Equal(t, allochealth.AllocHealthEventSource, last.Type)
   744  	require.Contains(t, last.Message, "by deadline")
   745  }
   746  
   747  // TestAllocRunner_Destroy asserts that Destroy kills and cleans up a running
   748  // alloc.
   749  func TestAllocRunner_Destroy(t *testing.T) {
   750  	t.Parallel()
   751  
   752  	// Ensure task takes some time
   753  	alloc := mock.BatchAlloc()
   754  	task := alloc.Job.TaskGroups[0].Tasks[0]
   755  	task.Config["run_for"] = "10s"
   756  
   757  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   758  	defer cleanup()
   759  
   760  	// Use a MemDB to assert alloc state gets cleaned up
   761  	conf.StateDB = state.NewMemDB(conf.Logger)
   762  
   763  	ar, err := NewAllocRunner(conf)
   764  	require.NoError(t, err)
   765  	go ar.Run()
   766  
   767  	// Wait for alloc to be running
   768  	testutil.WaitForResult(func() (bool, error) {
   769  		state := ar.AllocState()
   770  
   771  		return state.ClientStatus == structs.AllocClientStatusRunning,
   772  			fmt.Errorf("got client status %v; want running", state.ClientStatus)
   773  	}, func(err error) {
   774  		require.NoError(t, err)
   775  	})
   776  
   777  	// Assert state was stored
   778  	ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
   779  	require.NoError(t, err)
   780  	require.NotNil(t, ls)
   781  	require.NotNil(t, ts)
   782  
   783  	// Now destroy
   784  	ar.Destroy()
   785  
   786  	select {
   787  	case <-ar.DestroyCh():
   788  		// Destroyed properly!
   789  	case <-time.After(10 * time.Second):
   790  		require.Fail(t, "timed out waiting for alloc to be destroyed")
   791  	}
   792  
   793  	// Assert alloc is dead
   794  	state := ar.AllocState()
   795  	require.Equal(t, structs.AllocClientStatusComplete, state.ClientStatus)
   796  
   797  	// Assert the state was cleaned
   798  	ls, ts, err = conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
   799  	require.NoError(t, err)
   800  	require.Nil(t, ls)
   801  	require.Nil(t, ts)
   802  
   803  	// Assert the alloc directory was cleaned
   804  	if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
   805  		require.Failf(t, "alloc dir still exists", "%v", ar.allocDir.AllocDir)
   806  	} else if !os.IsNotExist(err) {
   807  		require.Failf(t, "expected NotExist error", "found %v", err)
   808  	}
   809  }
   810  
   811  func TestAllocRunner_SimpleRun(t *testing.T) {
   812  	t.Parallel()
   813  
   814  	alloc := mock.BatchAlloc()
   815  
   816  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   817  	defer cleanup()
   818  	ar, err := NewAllocRunner(conf)
   819  	require.NoError(t, err)
   820  	go ar.Run()
   821  	defer destroy(ar)
   822  
   823  	// Wait for alloc to be running
   824  	testutil.WaitForResult(func() (bool, error) {
   825  		state := ar.AllocState()
   826  
   827  		if state.ClientStatus != structs.AllocClientStatusComplete {
   828  			return false, fmt.Errorf("got status %v; want %v", state.ClientStatus, structs.AllocClientStatusComplete)
   829  		}
   830  
   831  		for t, s := range state.TaskStates {
   832  			if s.FinishedAt.IsZero() {
   833  				return false, fmt.Errorf("task %q has zero FinishedAt value", t)
   834  			}
   835  		}
   836  
   837  		return true, nil
   838  	}, func(err error) {
   839  		require.NoError(t, err)
   840  	})
   841  
   842  }
   843  
   844  // TestAllocRunner_MoveAllocDir asserts that a rescheduled
   845  // allocation copies ephemeral disk content from previous alloc run
   846  func TestAllocRunner_MoveAllocDir(t *testing.T) {
   847  	t.Parallel()
   848  
   849  	// Step 1: start and run a task
   850  	alloc := mock.BatchAlloc()
   851  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   852  	defer cleanup()
   853  	ar, err := NewAllocRunner(conf)
   854  	require.NoError(t, err)
   855  	ar.Run()
   856  	defer destroy(ar)
   857  
   858  	require.Equal(t, structs.AllocClientStatusComplete, ar.AllocState().ClientStatus)
   859  
   860  	// Step 2. Modify its directory
   861  	task := alloc.Job.TaskGroups[0].Tasks[0]
   862  	dataFile := filepath.Join(ar.allocDir.SharedDir, "data", "data_file")
   863  	ioutil.WriteFile(dataFile, []byte("hello world"), os.ModePerm)
   864  	taskDir := ar.allocDir.TaskDirs[task.Name]
   865  	taskLocalFile := filepath.Join(taskDir.LocalDir, "local_file")
   866  	ioutil.WriteFile(taskLocalFile, []byte("good bye world"), os.ModePerm)
   867  
   868  	// Step 3. Start a new alloc
   869  	alloc2 := mock.BatchAlloc()
   870  	alloc2.PreviousAllocation = alloc.ID
   871  	alloc2.Job.TaskGroups[0].EphemeralDisk.Sticky = true
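        	// With a sticky ephemeral disk and PreviousAllocation set, the alloc
        	// watcher configured below is expected to move the previous alloc's data
        	// into the new alloc dir, which is what the rest of this test asserts.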
   872  
   873  	conf2, cleanup := testAllocRunnerConfig(t, alloc2)
   874  	conf2.PrevAllocWatcher, conf2.PrevAllocMigrator = allocwatcher.NewAllocWatcher(allocwatcher.Config{
   875  		Alloc:          alloc2,
   876  		PreviousRunner: ar,
   877  		Logger:         conf2.Logger,
   878  	})
   879  	defer cleanup()
   880  	ar2, err := NewAllocRunner(conf2)
   881  	require.NoError(t, err)
   882  
   883  	ar2.Run()
   884  	defer destroy(ar2)
   885  
   886  	require.Equal(t, structs.AllocClientStatusComplete, ar2.AllocState().ClientStatus)
   887  
   888  	// Ensure that data from ar was moved to ar2
   889  	dataFile = filepath.Join(ar2.allocDir.SharedDir, "data", "data_file")
   890  	fileInfo, _ := os.Stat(dataFile)
   891  	require.NotNilf(t, fileInfo, "file %q not found", dataFile)
   892  
   893  	taskDir = ar2.allocDir.TaskDirs[task.Name]
   894  	taskLocalFile = filepath.Join(taskDir.LocalDir, "local_file")
   895  	fileInfo, _ = os.Stat(taskLocalFile)
   896  	require.NotNilf(t, fileInfo, "file %q not found", taskLocalFile)
   897  
   898  }
   899  
   900  // TestAllocRunner_HandlesArtifactFailure ensures that if one task in a task group is
   901  // stuck retrying an artifact fetch, the other tasks in the group are still able
   902  // to proceed.
   903  func TestAllocRunner_HandlesArtifactFailure(t *testing.T) {
   904  	t.Parallel()
   905  
   906  	alloc := mock.BatchAlloc()
   907  	rp := &structs.RestartPolicy{
   908  		Mode:     structs.RestartPolicyModeFail,
   909  		Attempts: 1,
   910  		Delay:    time.Nanosecond,
   911  		Interval: time.Hour,
   912  	}
   913  	alloc.Job.TaskGroups[0].RestartPolicy = rp
   914  	alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy = rp
   915  
   916  	// Create a new task with a bad artifact
   917  	badtask := alloc.Job.TaskGroups[0].Tasks[0].Copy()
   918  	badtask.Name = "bad"
   919  	badtask.Artifacts = []*structs.TaskArtifact{
   920  		{GetterSource: "http://127.0.0.1:0/foo/bar/baz"},
   921  	}
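        	// Fetching from 127.0.0.1:0 can never succeed, so this task should fail
        	// during artifact download and exhaust its single restart attempt while
        	// the other task keeps running.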
   922  
   923  	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, badtask)
   924  	alloc.AllocatedResources.Tasks["bad"] = &structs.AllocatedTaskResources{
   925  		Cpu: structs.AllocatedCpuResources{
   926  			CpuShares: 500,
   927  		},
   928  		Memory: structs.AllocatedMemoryResources{
   929  			MemoryMB: 256,
   930  		},
   931  	}
   932  
   933  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   934  	defer cleanup()
   935  	ar, err := NewAllocRunner(conf)
   936  	require.NoError(t, err)
   937  	go ar.Run()
   938  	defer destroy(ar)
   939  
   940  	testutil.WaitForResult(func() (bool, error) {
   941  		state := ar.AllocState()
   942  
   943  		switch state.ClientStatus {
   944  		case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed:
   945  			return true, nil
   946  		default:
   947  			return false, fmt.Errorf("got status %v but want terminal", state.ClientStatus)
   948  		}
   949  
   950  	}, func(err error) {
   951  		require.NoError(t, err)
   952  	})
   953  
   954  	state := ar.AllocState()
   955  	require.Equal(t, structs.AllocClientStatusFailed, state.ClientStatus)
   956  	require.Equal(t, structs.TaskStateDead, state.TaskStates["web"].State)
   957  	require.True(t, state.TaskStates["web"].Successful())
   958  	require.Equal(t, structs.TaskStateDead, state.TaskStates["bad"].State)
   959  	require.True(t, state.TaskStates["bad"].Failed)
   960  }
   961  
   962  // Test that the alloc runner kills the other tasks in a task group when one task fails
   963  func TestAllocRunner_TaskFailed_KillTG(t *testing.T) {
   964  	alloc := mock.Alloc()
   965  	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
   966  	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0
   967  	alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy.Attempts = 0
   968  
   969  	// Create two tasks in the task group
   970  	task := alloc.Job.TaskGroups[0].Tasks[0]
   971  	task.Name = "task1"
   972  	task.Driver = "mock_driver"
   973  	task.KillTimeout = 10 * time.Millisecond
   974  	task.Config = map[string]interface{}{
   975  		"run_for": "10s",
   976  	}
   977  	// Set a service with check
   978  	task.Services = []*structs.Service{
   979  		{
   980  			Name:      "fakservice",
   981  			PortLabel: "http",
   982  			Checks: []*structs.ServiceCheck{
   983  				{
   984  					Name:     "fakecheck",
   985  					Type:     structs.ServiceCheckScript,
   986  					Command:  "true",
   987  					Interval: 30 * time.Second,
   988  					Timeout:  5 * time.Second,
   989  				},
   990  			},
   991  		},
   992  	}
   993  
   994  	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
   995  	task2.Name = "task 2"
   996  	task2.Driver = "mock_driver"
   997  	task2.Config = map[string]interface{}{
   998  		"start_error": "fail task please",
   999  	}
  1000  	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)
  1001  	alloc.AllocatedResources.Tasks[task.Name] = tr
  1002  	alloc.AllocatedResources.Tasks[task2.Name] = tr
  1003  
  1004  	// Make the alloc be part of a deployment
  1005  	alloc.DeploymentID = uuid.Generate()
  1006  	alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
  1007  	alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks
  1008  	alloc.Job.TaskGroups[0].Update.MaxParallel = 1
  1009  	alloc.Job.TaskGroups[0].Update.MinHealthyTime = 10 * time.Millisecond
  1010  	alloc.Job.TaskGroups[0].Update.HealthyDeadline = 2 * time.Second
  1011  
  1012  	checkHealthy := &api.AgentCheck{
  1013  		CheckID: uuid.Generate(),
  1014  		Status:  api.HealthPassing,
  1015  	}
  1016  
  1017  	conf, cleanup := testAllocRunnerConfig(t, alloc)
  1018  	defer cleanup()
  1019  
  1020  	consulClient := conf.Consul.(*cconsul.MockConsulServiceClient)
  1021  	consulClient.AllocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) {
  1022  		return &consul.AllocRegistration{
  1023  			Tasks: map[string]*consul.ServiceRegistrations{
  1024  				task.Name: {
  1025  					Services: map[string]*consul.ServiceRegistration{
  1026  						"123": {
  1027  							Service: &api.AgentService{Service: "fakeservice"},
  1028  							Checks:  []*api.AgentCheck{checkHealthy},
  1029  						},
  1030  					},
  1031  				},
  1032  			},
  1033  		}, nil
  1034  	}
  1035  
  1036  	ar, err := NewAllocRunner(conf)
  1037  	require.NoError(t, err)
  1038  	defer destroy(ar)
  1039  	go ar.Run()
  1040  	upd := conf.StateUpdater.(*MockStateUpdater)
  1041  
  1042  	testutil.WaitForResult(func() (bool, error) {
  1043  		last := upd.Last()
  1044  		if last == nil {
  1045  			return false, fmt.Errorf("No updates")
  1046  		}
  1047  		if last.ClientStatus != structs.AllocClientStatusFailed {
  1048  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusFailed)
  1049  		}
  1050  
  1051  		// Task One should be killed
  1052  		state1 := last.TaskStates[task.Name]
  1053  		if state1.State != structs.TaskStateDead {
  1054  			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
  1055  		}
  1056  		if len(state1.Events) < 2 {
  1057  			// At least have a received and destroyed
  1058  			return false, fmt.Errorf("Unexpected number of events")
  1059  		}
  1060  
  1061  		found := false
  1062  		for _, e := range state1.Events {
  1063  			if e.Type == structs.TaskSiblingFailed {
  1064  				found = true
  1065  			}
  1066  		}
  1067  
  1068  		if !found {
  1069  			return false, fmt.Errorf("Did not find event %v", structs.TaskSiblingFailed)
  1070  		}
  1071  
  1072  		// Task Two should be failed
  1073  		state2 := last.TaskStates[task2.Name]
  1074  		if state2.State != structs.TaskStateDead {
  1075  			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
  1076  		}
  1077  		if !state2.Failed {
  1078  			return false, fmt.Errorf("task2 should have failed")
  1079  		}
  1080  
  1081  		if !last.DeploymentStatus.HasHealth() {
  1082  			return false, fmt.Errorf("Expected deployment health to be non nil")
  1083  		}
  1084  
  1085  		return true, nil
  1086  	}, func(err error) {
  1087  		require.Fail(t, "err: %v", err)
  1088  	})
  1089  }
  1090  
  1091  // Test that an alloc becoming terminal causes the alloc runner to stop its tasks and clean up on destroy
  1092  func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) {
  1093  	t.Parallel()
  1094  	alloc := mock.BatchAlloc()
  1095  	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
  1096  	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0
  1097  	alloc.Job.TaskGroups[0].Tasks[0].RestartPolicy.Attempts = 0
  1098  	// Ensure task takes some time
  1099  	task := alloc.Job.TaskGroups[0].Tasks[0]
  1100  	task.Driver = "mock_driver"
  1101  	task.Config["run_for"] = "10s"
  1102  	alloc.AllocatedResources.Tasks[task.Name] = tr
  1103  
  1104  	conf, cleanup := testAllocRunnerConfig(t, alloc)
  1105  	defer cleanup()
  1106  	ar, err := NewAllocRunner(conf)
  1107  	require.NoError(t, err)
  1108  	defer destroy(ar)
  1109  	go ar.Run()
  1110  	upd := conf.StateUpdater.(*MockStateUpdater)
  1111  
  1112  	testutil.WaitForResult(func() (bool, error) {
  1113  		last := upd.Last()
  1114  		if last == nil {
  1115  			return false, fmt.Errorf("No updates")
  1116  		}
  1117  		if last.ClientStatus != structs.AllocClientStatusRunning {
  1118  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
  1119  		}
  1120  		return true, nil
  1121  	}, func(err error) {
  1122  		require.Fail(t, "err: %v", err)
  1123  	})
  1124  
  1125  	// Update the alloc to be terminal which should cause the alloc runner to
  1126  	// stop the tasks and wait for a destroy.
  1127  	update := ar.alloc.Copy()
  1128  	update.DesiredStatus = structs.AllocDesiredStatusStop
  1129  	ar.Update(update)
  1130  
  1131  	testutil.WaitForResult(func() (bool, error) {
  1132  		last := upd.Last()
  1133  		if last == nil {
  1134  			return false, fmt.Errorf("No updates")
  1135  		}
  1136  
  1137  		// Check the status has changed.
  1138  		if last.ClientStatus != structs.AllocClientStatusComplete {
  1139  			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
  1140  		}
  1141  
  1142  		// Check the alloc directory still exists
  1143  		if _, err := os.Stat(ar.allocDir.AllocDir); err != nil {
  1144  			return false, fmt.Errorf("alloc dir destroyed: %v", ar.allocDir.AllocDir)
  1145  		}
  1146  
  1147  		return true, nil
  1148  	}, func(err error) {
  1149  		require.Fail(t, "err: %v", err)
  1150  	})
  1151  
  1152  	// Send the destroy signal and ensure the AllocRunner cleans up.
  1153  	ar.Destroy()
  1154  
  1155  	testutil.WaitForResult(func() (bool, error) {
  1156  		last := upd.Last()
  1157  		if last == nil {
  1158  			return false, fmt.Errorf("No updates")
  1159  		}
  1160  
  1161  		// Check the status has changed.
  1162  		if last.ClientStatus != structs.AllocClientStatusComplete {
  1163  			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
  1164  		}
  1165  
  1166  		// Check the alloc directory was cleaned
  1167  		if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
  1168  			return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir)
  1169  		} else if !os.IsNotExist(err) {
  1170  			return false, fmt.Errorf("stat err: %v", err)
  1171  		}
  1172  
  1173  		return true, nil
  1174  	}, func(err error) {
  1175  		require.Fail(t, "err: %v", err)
  1176  	})
  1177  }
  1178  
  1179  // TestAllocRunner_PersistState_Destroyed asserts that an alloc's state is no longer persisted once the alloc runner has been destroyed
  1180  func TestAllocRunner_PersistState_Destroyed(t *testing.T) {
  1181  	t.Parallel()
  1182  
  1183  	alloc := mock.BatchAlloc()
  1184  	taskName := alloc.Job.LookupTaskGroup(alloc.TaskGroup).Tasks[0].Name
  1185  
  1186  	conf, cleanup := testAllocRunnerConfig(t, alloc)
  1187  	conf.StateDB = state.NewMemDB(conf.Logger)
  1188  
  1189  	defer cleanup()
  1190  	ar, err := NewAllocRunner(conf)
  1191  	require.NoError(t, err)
  1192  	defer destroy(ar)
  1193  
  1194  	go ar.Run()
  1195  
  1196  	select {
  1197  	case <-ar.WaitCh():
  1198  	case <-time.After(10 * time.Second):
  1199  		require.Fail(t, "timed out waiting for alloc to complete")
  1200  	}
  1201  
  1202  	// test final persisted state upon completion
  1203  	require.NoError(t, ar.PersistState())
  1204  	allocs, _, err := conf.StateDB.GetAllAllocations()
  1205  	require.NoError(t, err)
  1206  	require.Len(t, allocs, 1)
  1207  	require.Equal(t, alloc.ID, allocs[0].ID)
  1208  	_, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, taskName)
  1209  	require.NoError(t, err)
  1210  	require.Equal(t, structs.TaskStateDead, ts.State)
  1211  
  1212  	// check that DB alloc is empty after destroying AR
  1213  	ar.Destroy()
  1214  	select {
  1215  	case <-ar.DestroyCh():
  1216  	case <-time.After(10 * time.Second):
  1217  		require.Fail(t, "timed out waiting for destruction")
  1218  	}
  1219  
  1220  	allocs, _, err = conf.StateDB.GetAllAllocations()
  1221  	require.NoError(t, err)
  1222  	require.Empty(t, allocs)
  1223  	_, ts, err = conf.StateDB.GetTaskRunnerState(alloc.ID, taskName)
  1224  	require.NoError(t, err)
  1225  	require.Nil(t, ts)
  1226  
  1227  	// check that DB alloc is empty after persisting state of destroyed AR
  1228  	ar.PersistState()
  1229  	allocs, _, err = conf.StateDB.GetAllAllocations()
  1230  	require.NoError(t, err)
  1231  	require.Empty(t, allocs)
  1232  	_, ts, err = conf.StateDB.GetTaskRunnerState(alloc.ID, taskName)
  1233  	require.NoError(t, err)
  1234  	require.Nil(t, ts)
  1235  }