github.com/smithx10/nomad@v0.9.1-rc1/client/allocrunner/alloc_runner_test.go

     1  package allocrunner
     2  
     3  import (
     4  	"fmt"
     5  	"io/ioutil"
     6  	"os"
     7  	"path/filepath"
     8  	"testing"
     9  	"time"
    10  
    11  	"github.com/hashicorp/consul/api"
    12  	"github.com/hashicorp/nomad/client/allochealth"
    13  	"github.com/hashicorp/nomad/client/allocwatcher"
    14  	cconsul "github.com/hashicorp/nomad/client/consul"
    15  	"github.com/hashicorp/nomad/client/state"
    16  	"github.com/hashicorp/nomad/command/agent/consul"
    17  	"github.com/hashicorp/nomad/helper/uuid"
    18  	"github.com/hashicorp/nomad/nomad/mock"
    19  	"github.com/hashicorp/nomad/nomad/structs"
    20  	"github.com/hashicorp/nomad/testutil"
    21  	"github.com/stretchr/testify/require"
    22  )
    23  
    24  // destroy does a blocking destroy on an alloc runner
    25  func destroy(ar *allocRunner) {
    26  	ar.Destroy()
    27  	<-ar.DestroyCh()
    28  }
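
        // destroyWithTimeout is an illustrative sketch, not used by the tests below:
        // a variant of destroy that fails the test instead of blocking forever if the
        // alloc runner never signals destruction. It is built only from calls already
        // exercised in this file (Destroy, DestroyCh, time.After).
        func destroyWithTimeout(t *testing.T, ar *allocRunner, timeout time.Duration) {
        	ar.Destroy()
        	select {
        	case <-ar.DestroyCh():
        		// destroyed as expected
        	case <-time.After(timeout):
        		t.Fatalf("timed out after %s waiting for alloc runner to be destroyed", timeout)
        	}
        }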
    29  
    30  // TestAllocRunner_AllocState_Initialized asserts that the TaskStates returned
    31  // by AllocState() are initialized even before the AllocRunner has run.
    32  func TestAllocRunner_AllocState_Initialized(t *testing.T) {
    33  	t.Parallel()
    34  
    35  	alloc := mock.Alloc()
    36  	alloc.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
    37  	conf, cleanup := testAllocRunnerConfig(t, alloc)
    38  	defer cleanup()
    39  
    40  	ar, err := NewAllocRunner(conf)
    41  	require.NoError(t, err)
    42  
    43  	allocState := ar.AllocState()
    44  
    45  	require.NotNil(t, allocState)
    46  	require.NotNil(t, allocState.TaskStates[conf.Alloc.Job.TaskGroups[0].Tasks[0].Name])
    47  }
    48  
    49  // TestAllocRunner_TaskLeader_KillTG asserts that when a leader task dies the
    50  // entire task group is killed.
    51  func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
    52  	t.Parallel()
    53  
    54  	alloc := mock.BatchAlloc()
    55  	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
    56  	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0
    57  
    58  	// Create two tasks in the task group
    59  	task := alloc.Job.TaskGroups[0].Tasks[0]
    60  	task.Name = "task1"
    61  	task.Driver = "mock_driver"
    62  	task.KillTimeout = 10 * time.Millisecond
    63  	task.Config = map[string]interface{}{
    64  		"run_for": "10s",
    65  	}
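        	// The mock_driver config keys used throughout this file behave roughly as
        	// follows: "run_for" keeps the fake task alive for the given duration,
        	// "start_block_for" delays its start, and "start_error" makes it fail to
        	// start. (Summary inferred from how the tests below use them.)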
    66  
    67  	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
    68  	task2.Name = "task2"
    69  	task2.Driver = "mock_driver"
    70  	task2.Leader = true
    71  	task2.Config = map[string]interface{}{
    72  		"run_for": "1s",
    73  	}
    74  	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)
    75  	alloc.AllocatedResources.Tasks[task.Name] = tr
    76  	alloc.AllocatedResources.Tasks[task2.Name] = tr
    77  
    78  	conf, cleanup := testAllocRunnerConfig(t, alloc)
    79  	defer cleanup()
    80  	ar, err := NewAllocRunner(conf)
    81  	require.NoError(t, err)
    82  	defer destroy(ar)
    83  	go ar.Run()
    84  
    85  	// Wait for all tasks to be killed
    86  	upd := conf.StateUpdater.(*MockStateUpdater)
    87  	testutil.WaitForResult(func() (bool, error) {
    88  		last := upd.Last()
    89  		if last == nil {
    90  			return false, fmt.Errorf("No updates")
    91  		}
    92  		if last.ClientStatus != structs.AllocClientStatusComplete {
    93  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
    94  		}
    95  
    96  		// Task1 should be killed because Task2 exited
    97  		state1 := last.TaskStates[task.Name]
    98  		if state1.State != structs.TaskStateDead {
    99  			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
   100  		}
   101  		if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() {
   102  			return false, fmt.Errorf("expected to have a start and finish time")
   103  		}
   104  		if len(state1.Events) < 2 {
   105  			// Should have at least a received and a destroyed event
   106  			return false, fmt.Errorf("unexpected number of events: %d", len(state1.Events))
   107  		}
   108  
   109  		found := false
   110  		for _, e := range state1.Events {
   111  			if e.Type == structs.TaskLeaderDead {
   112  				found = true
   113  			}
   114  		}
   115  
   116  		if !found {
   117  			return false, fmt.Errorf("Did not find event %v", structs.TaskLeaderDead)
   118  		}
   119  
   120  		// Task Two should be dead
   121  		state2 := last.TaskStates[task2.Name]
   122  		if state2.State != structs.TaskStateDead {
   123  			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
   124  		}
   125  		if state2.FinishedAt.IsZero() || state2.StartedAt.IsZero() {
   126  			return false, fmt.Errorf("expected to have a start and finish time")
   127  		}
   128  
   129  		return true, nil
   130  	}, func(err error) {
   131  		t.Fatalf("err: %v", err)
   132  	})
   133  }
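
        // waitForClientStatus is an illustrative sketch, not wired into the tests in
        // this file, of the polling pattern they repeat by hand: block until the
        // MockStateUpdater reports the wanted client status. It relies only on helpers
        // already used here (testutil.WaitForResult and MockStateUpdater.Last).
        func waitForClientStatus(t *testing.T, upd *MockStateUpdater, want string) *structs.Allocation {
        	var last *structs.Allocation
        	testutil.WaitForResult(func() (bool, error) {
        		last = upd.Last()
        		if last == nil {
        			return false, fmt.Errorf("no updates")
        		}
        		if last.ClientStatus != want {
        			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, want)
        		}
        		return true, nil
        	}, func(err error) {
        		t.Fatalf("err: %v", err)
        	})
        	return last
        }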
   134  
   135  // TestAllocRunner_TaskLeader_StopTG asserts that when stopping an alloc with a
   136  // leader, the leader is stopped before the other tasks.
   137  func TestAllocRunner_TaskLeader_StopTG(t *testing.T) {
   138  	t.Parallel()
   139  
   140  	alloc := mock.Alloc()
   141  	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
   142  	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0
   143  
   144  	// Create 3 tasks in the task group
   145  	task := alloc.Job.TaskGroups[0].Tasks[0]
   146  	task.Name = "follower1"
   147  	task.Driver = "mock_driver"
   148  	task.Config = map[string]interface{}{
   149  		"run_for": "10s",
   150  	}
   151  
   152  	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
   153  	task2.Name = "leader"
   154  	task2.Driver = "mock_driver"
   155  	task2.Leader = true
   156  	task2.Config = map[string]interface{}{
   157  		"run_for": "10s",
   158  	}
   159  
   160  	task3 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
   161  	task3.Name = "follower2"
   162  	task3.Driver = "mock_driver"
   163  	task3.Config = map[string]interface{}{
   164  		"run_for": "10s",
   165  	}
   166  	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2, task3)
   167  	alloc.AllocatedResources.Tasks[task.Name] = tr
   168  	alloc.AllocatedResources.Tasks[task2.Name] = tr
   169  	alloc.AllocatedResources.Tasks[task3.Name] = tr
   170  
   171  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   172  	defer cleanup()
   173  	ar, err := NewAllocRunner(conf)
   174  	require.NoError(t, err)
   175  	defer destroy(ar)
   176  	go ar.Run()
   177  
   178  	// Wait for tasks to start
   179  	upd := conf.StateUpdater.(*MockStateUpdater)
   180  	last := upd.Last()
   181  	testutil.WaitForResult(func() (bool, error) {
   182  		last = upd.Last()
   183  		if last == nil {
   184  			return false, fmt.Errorf("No updates")
   185  		}
   186  		if n := len(last.TaskStates); n != 3 {
   187  			return false, fmt.Errorf("Not enough task states (want: 3; found %d)", n)
   188  		}
   189  		for name, state := range last.TaskStates {
   190  			if state.State != structs.TaskStateRunning {
   191  				return false, fmt.Errorf("Task %q is not running yet (it's %q)", name, state.State)
   192  			}
   193  		}
   194  		return true, nil
   195  	}, func(err error) {
   196  		t.Fatalf("err: %v", err)
   197  	})
   198  
   199  	// Reset updates
   200  	upd.Reset()
   201  
   202  	// Stop alloc
   203  	update := alloc.Copy()
   204  	update.DesiredStatus = structs.AllocDesiredStatusStop
   205  	ar.Update(update)
   206  
   207  	// Wait for tasks to stop
   208  	testutil.WaitForResult(func() (bool, error) {
   209  		last := upd.Last()
   210  		if last == nil {
   211  			return false, fmt.Errorf("No updates")
   212  		}
   213  		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower1"].FinishedAt.UnixNano() {
   214  			return false, fmt.Errorf("expected leader to finish before follower1: %s >= %s",
   215  				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower1"].FinishedAt)
   216  		}
   217  		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower2"].FinishedAt.UnixNano() {
   218  			return false, fmt.Errorf("expected leader to finish before follower2: %s >= %s",
   219  				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower2"].FinishedAt)
   220  		}
   221  		return true, nil
   222  	}, func(err error) {
   223  		last := upd.Last()
   224  		for name, state := range last.TaskStates {
   225  			t.Logf("%s: %s", name, state.State)
   226  		}
   227  		t.Fatalf("err: %v", err)
   228  	})
   229  }
   230  
   231  // TestAllocRunner_TaskLeader_StopRestoredTG asserts that when stopping a
   232  // restored task group whose leader failed before the restore, the leader is
   233  // not stopped again since it no longer exists.
   234  // See https://github.com/hashicorp/nomad/issues/3420#issuecomment-341666932
   235  func TestAllocRunner_TaskLeader_StopRestoredTG(t *testing.T) {
   236  	t.Parallel()
   237  
   238  	alloc := mock.Alloc()
   239  	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
   240  	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0
   241  
   242  	// Create a leader and follower task in the task group
   243  	task := alloc.Job.TaskGroups[0].Tasks[0]
   244  	task.Name = "follower1"
   245  	task.Driver = "mock_driver"
   246  	task.KillTimeout = 10 * time.Second
   247  	task.Config = map[string]interface{}{
   248  		"run_for": "10s",
   249  	}
   250  
   251  	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
   252  	task2.Name = "leader"
   253  	task2.Driver = "mock_driver"
   254  	task2.Leader = true
   255  	task2.KillTimeout = 10 * time.Millisecond
   256  	task2.Config = map[string]interface{}{
   257  		"run_for": "10s",
   258  	}
   259  
   260  	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)
   261  	alloc.AllocatedResources.Tasks[task.Name] = tr
   262  	alloc.AllocatedResources.Tasks[task2.Name] = tr
   263  
   264  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   265  	defer cleanup()
   266  
   267  	// Use a memory backed statedb
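        	// so the second alloc runner below, built from the same conf, can Restore the
        	// task states recorded here by the UpdateState calls on the first runner.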
   268  	conf.StateDB = state.NewMemDB(conf.Logger)
   269  
   270  	ar, err := NewAllocRunner(conf)
   271  	require.NoError(t, err)
   272  
   273  	// Mimic Nomad exiting after the leader stopped but before the other tasks could be stopped.
   274  	ar.tasks["leader"].UpdateState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskKilled))
   275  	ar.tasks["follower1"].UpdateState(structs.TaskStateRunning, structs.NewTaskEvent(structs.TaskStarted))
   276  
   277  	// Create a new AllocRunner to test RestoreState and Run
   278  	ar2, err := NewAllocRunner(conf)
   279  	require.NoError(t, err)
   280  	defer destroy(ar2)
   281  
   282  	if err := ar2.Restore(); err != nil {
   283  		t.Fatalf("error restoring state: %v", err)
   284  	}
   285  	ar2.Run()
   286  
   287  	// Wait for tasks to be stopped because leader is dead
   288  	testutil.WaitForResult(func() (bool, error) {
   289  		alloc := ar2.Alloc()
   290  		for task, state := range alloc.TaskStates {
   291  			if state.State != structs.TaskStateDead {
   292  				return false, fmt.Errorf("Task %q should be dead: %v", task, state.State)
   293  			}
   294  		}
   295  		return true, nil
   296  	}, func(err error) {
   297  		t.Fatalf("err: %v", err)
   298  	})
   299  
   300  	// Make sure it GCs properly
   301  	ar2.Destroy()
   302  
   303  	select {
   304  	case <-ar2.DestroyCh():
   305  		// exited as expected
   306  	case <-time.After(10 * time.Second):
   307  		t.Fatalf("timed out waiting for AR to GC")
   308  	}
   309  }
   310  
   311  func TestAllocRunner_Update_Semantics(t *testing.T) {
   312  	t.Parallel()
   313  	require := require.New(t)
   314  
   315  	updatedAlloc := func(a *structs.Allocation) *structs.Allocation {
   316  		upd := a.CopySkipJob()
   317  		upd.AllocModifyIndex++
   318  
   319  		return upd
   320  	}
   321  
   322  	alloc := mock.Alloc()
   323  	alloc.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
   324  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   325  	defer cleanup()
   326  
   327  	ar, err := NewAllocRunner(conf)
   328  	require.NoError(err)
   329  
   330  	upd1 := updatedAlloc(alloc)
   331  	ar.Update(upd1)
   332  
   333  	// Update was placed into a queue
   334  	require.Len(ar.allocUpdatedCh, 1)
   335  
   336  	upd2 := updatedAlloc(alloc)
   337  	ar.Update(upd2)
   338  
   339  	// Allocation was _replaced_
   340  
   341  	require.Len(ar.allocUpdatedCh, 1)
   342  	queuedAlloc := <-ar.allocUpdatedCh
   343  	require.Equal(upd2, queuedAlloc)
   344  
   345  	// Requeueing older alloc is skipped
   346  	ar.Update(upd2)
   347  	ar.Update(upd1)
   348  
   349  	queuedAlloc = <-ar.allocUpdatedCh
   350  	require.Equal(upd2, queuedAlloc)
   351  
   352  	// Ignore after watch closed
   353  
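        	// Closing waitCh stands in for the alloc runner having already exited, so
        	// further updates should be dropped rather than queued.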
   354  	close(ar.waitCh)
   355  
   356  	ar.Update(upd1)
   357  
   358  	// Did not queue the update
   359  	require.Len(ar.allocUpdatedCh, 0)
   360  }
   361  
   362  // TestAllocRunner_DeploymentHealth_Healthy_Migration asserts that health is
   363  // reported for migrated allocations, not just those that are part of a deployment.
   364  func TestAllocRunner_DeploymentHealth_Healthy_Migration(t *testing.T) {
   365  	t.Parallel()
   366  
   367  	alloc := mock.Alloc()
   368  
   369  	// Ensure the alloc is *not* part of a deployment
   370  	alloc.DeploymentID = ""
   371  
   372  	// Shorten the default migration healthy time
   373  	tg := alloc.Job.TaskGroups[0]
   374  	tg.Migrate = structs.DefaultMigrateStrategy()
   375  	tg.Migrate.MinHealthyTime = 100 * time.Millisecond
   376  	tg.Migrate.HealthCheck = structs.MigrateStrategyHealthStates
   377  
   378  	task := tg.Tasks[0]
   379  	task.Driver = "mock_driver"
   380  	task.Config = map[string]interface{}{
   381  		"run_for": "30s",
   382  	}
   383  
   384  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   385  	defer cleanup()
   386  
   387  	ar, err := NewAllocRunner(conf)
   388  	require.NoError(t, err)
   389  	go ar.Run()
   390  	defer destroy(ar)
   391  
   392  	upd := conf.StateUpdater.(*MockStateUpdater)
   393  	testutil.WaitForResult(func() (bool, error) {
   394  		last := upd.Last()
   395  		if last == nil {
   396  			return false, fmt.Errorf("No updates")
   397  		}
   398  		if !last.DeploymentStatus.HasHealth() {
   399  			return false, fmt.Errorf("want deployment status healthy; got unset")
   400  		} else if !*last.DeploymentStatus.Healthy {
   401  			// This is fatal
   402  			t.Fatal("want deployment status healthy; got unhealthy")
   403  		}
   404  		return true, nil
   405  	}, func(err error) {
   406  		require.NoError(t, err)
   407  	})
   408  }
   409  
   410  // TestAllocRunner_DeploymentHealth_Healthy_NoChecks asserts that the health
   411  // watcher will mark the allocation as healthy based on task states alone.
   412  func TestAllocRunner_DeploymentHealth_Healthy_NoChecks(t *testing.T) {
   413  	t.Parallel()
   414  
   415  	alloc := mock.Alloc()
   416  
   417  	task := alloc.Job.TaskGroups[0].Tasks[0]
   418  	task.Driver = "mock_driver"
   419  	task.Config = map[string]interface{}{
   420  		"run_for": "10s",
   421  	}
   422  
   423  	// Create a task that takes longer to become healthy
   424  	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task.Copy())
   425  	alloc.AllocatedResources.Tasks["task2"] = alloc.AllocatedResources.Tasks["web"].Copy()
   426  	task2 := alloc.Job.TaskGroups[0].Tasks[1]
   427  	task2.Name = "task2"
   428  	task2.Config["start_block_for"] = "500ms"
   429  
   430  	// Make the alloc be part of a deployment that uses task states for
   431  	// health checks
   432  	alloc.DeploymentID = uuid.Generate()
   433  	alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
   434  	alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
   435  	alloc.Job.TaskGroups[0].Update.MaxParallel = 1
   436  	alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
   437  
   438  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   439  	defer cleanup()
   440  
   441  	ar, err := NewAllocRunner(conf)
   442  	require.NoError(t, err)
   443  
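        	// Record when the runner starts and when the deployment is first reported
        	// healthy, so we can assert below that the health watcher waited out task2's
        	// delayed ("start_block_for") start.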
   444  	start, done := time.Now(), time.Time{}
   445  	go ar.Run()
   446  	defer destroy(ar)
   447  
   448  	upd := conf.StateUpdater.(*MockStateUpdater)
   449  	testutil.WaitForResult(func() (bool, error) {
   450  		last := upd.Last()
   451  		if last == nil {
   452  			return false, fmt.Errorf("No updates")
   453  		}
   454  		if !last.DeploymentStatus.HasHealth() {
   455  			return false, fmt.Errorf("want deployment status healthy; got unset")
   456  		} else if !*last.DeploymentStatus.Healthy {
   457  			// This is fatal
   458  			t.Fatal("want deployment status healthy; got unhealthy")
   459  		}
   460  
   461  		// Capture the done timestamp
   462  		done = last.DeploymentStatus.Timestamp
   463  		return true, nil
   464  	}, func(err error) {
   465  		require.NoError(t, err)
   466  	})
   467  
   468  	if d := done.Sub(start); d < 500*time.Millisecond {
   469  		t.Fatalf("didn't wait for second task group. Only took %v", d)
   470  	}
   471  }
   472  
   473  // TestAllocRunner_DeploymentHealth_Unhealthy_Checks asserts that the health
   474  // watcher will mark the allocation as unhealthy with failing checks.
   475  func TestAllocRunner_DeploymentHealth_Unhealthy_Checks(t *testing.T) {
   476  	t.Parallel()
   477  
   478  	alloc := mock.Alloc()
   479  	task := alloc.Job.TaskGroups[0].Tasks[0]
   480  	task.Driver = "mock_driver"
   481  	task.Config = map[string]interface{}{
   482  		"run_for": "10s",
   483  	}
   484  
   485  	// Set a service with check
   486  	task.Services = []*structs.Service{
   487  		{
   488  			Name:      "fakeservice",
   489  			PortLabel: "http",
   490  			Checks: []*structs.ServiceCheck{
   491  				{
   492  					Name:     "fakecheck",
   493  					Type:     structs.ServiceCheckScript,
   494  					Command:  "true",
   495  					Interval: 30 * time.Second,
   496  					Timeout:  5 * time.Second,
   497  				},
   498  			},
   499  		},
   500  	}
   501  
   502  	// Make the alloc be part of a deployment
   503  	alloc.DeploymentID = uuid.Generate()
   504  	alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
   505  	alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks
   506  	alloc.Job.TaskGroups[0].Update.MaxParallel = 1
   507  	alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
   508  	alloc.Job.TaskGroups[0].Update.HealthyDeadline = 1 * time.Second
   509  
   510  	checkUnhealthy := &api.AgentCheck{
   511  		CheckID: uuid.Generate(),
   512  		Status:  api.HealthWarning,
   513  	}
   514  
   515  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   516  	defer cleanup()
   517  
   518  	// Always report the check in a warning state so the alloc never becomes healthy
   519  	consulClient := conf.Consul.(*cconsul.MockConsulServiceClient)
   520  	consulClient.AllocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) {
   521  		return &consul.AllocRegistration{
   522  			Tasks: map[string]*consul.TaskRegistration{
   523  				task.Name: {
   524  					Services: map[string]*consul.ServiceRegistration{
   525  						"123": {
   526  							Service: &api.AgentService{Service: "fakeservice"},
   527  							Checks:  []*api.AgentCheck{checkUnhealthy},
   528  						},
   529  					},
   530  				},
   531  			},
   532  		}, nil
   533  	}
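        	// With the check stuck in a warning state, the health watcher should give up
        	// at the 1s HealthyDeadline and report the deployment unhealthy "by deadline".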
   534  
   535  	ar, err := NewAllocRunner(conf)
   536  	require.NoError(t, err)
   537  	go ar.Run()
   538  	defer destroy(ar)
   539  
   540  	var lastUpdate *structs.Allocation
   541  	upd := conf.StateUpdater.(*MockStateUpdater)
   542  	testutil.WaitForResult(func() (bool, error) {
   543  		lastUpdate = upd.Last()
   544  		if lastUpdate == nil {
   545  			return false, fmt.Errorf("No updates")
   546  		}
   547  		if !lastUpdate.DeploymentStatus.HasHealth() {
   548  			return false, fmt.Errorf("want deployment status unhealthy; got unset")
   549  		} else if *lastUpdate.DeploymentStatus.Healthy {
   550  			// This is fatal
   551  			t.Fatal("want deployment status unhealthy; got healthy")
   552  		}
   553  		return true, nil
   554  	}, func(err error) {
   555  		require.NoError(t, err)
   556  	})
   557  
   558  	// Assert that we have an event explaining why we are unhealthy.
   559  	require.Len(t, lastUpdate.TaskStates, 1)
   560  	state := lastUpdate.TaskStates[task.Name]
   561  	require.NotNil(t, state)
   562  	require.NotEmpty(t, state.Events)
   563  	last := state.Events[len(state.Events)-1]
   564  	require.Equal(t, allochealth.AllocHealthEventSource, last.Type)
   565  	require.Contains(t, last.Message, "by deadline")
   566  }
   567  
   568  // TestAllocRunner_Destroy asserts that Destroy kills and cleans up a running
   569  // alloc.
   570  func TestAllocRunner_Destroy(t *testing.T) {
   571  	t.Parallel()
   572  
   573  	// Ensure task takes some time
   574  	alloc := mock.BatchAlloc()
   575  	task := alloc.Job.TaskGroups[0].Tasks[0]
   576  	task.Config["run_for"] = "10s"
   577  
   578  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   579  	defer cleanup()
   580  
   581  	// Use a MemDB to assert alloc state gets cleaned up
   582  	conf.StateDB = state.NewMemDB(conf.Logger)
   583  
   584  	ar, err := NewAllocRunner(conf)
   585  	require.NoError(t, err)
   586  	go ar.Run()
   587  
   588  	// Wait for alloc to be running
   589  	testutil.WaitForResult(func() (bool, error) {
   590  		state := ar.AllocState()
   591  
   592  		return state.ClientStatus == structs.AllocClientStatusRunning,
   593  			fmt.Errorf("got client status %v; want running", state.ClientStatus)
   594  	}, func(err error) {
   595  		require.NoError(t, err)
   596  	})
   597  
   598  	// Assert state was stored
   599  	ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
   600  	require.NoError(t, err)
   601  	require.NotNil(t, ls)
   602  	require.NotNil(t, ts)
   603  
   604  	// Now destroy
   605  	ar.Destroy()
   606  
   607  	select {
   608  	case <-ar.DestroyCh():
   609  		// Destroyed properly!
   610  	case <-time.After(10 * time.Second):
   611  		require.Fail(t, "timed out waiting for alloc to be destroyed")
   612  	}
   613  
   614  	// Assert alloc is dead
   615  	state := ar.AllocState()
   616  	require.Equal(t, structs.AllocClientStatusComplete, state.ClientStatus)
   617  
   618  	// Assert the state was cleaned
   619  	ls, ts, err = conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
   620  	require.NoError(t, err)
   621  	require.Nil(t, ls)
   622  	require.Nil(t, ts)
   623  
   624  	// Assert the alloc directory was cleaned
   625  	if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
   626  		require.Failf(t, "alloc dir still exists", "%v", ar.allocDir.AllocDir)
   627  	} else if !os.IsNotExist(err) {
   628  		require.Failf(t, "expected NotExist error", "found %v", err)
   629  	}
   630  }
   631  
   632  func TestAllocRunner_SimpleRun(t *testing.T) {
   633  	t.Parallel()
   634  
   635  	alloc := mock.BatchAlloc()
   636  
   637  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   638  	defer cleanup()
   639  	ar, err := NewAllocRunner(conf)
   640  	require.NoError(t, err)
   641  	go ar.Run()
   642  	defer destroy(ar)
   643  
   644  	// Wait for alloc to run to completion
   645  	testutil.WaitForResult(func() (bool, error) {
   646  		state := ar.AllocState()
   647  
   648  		if state.ClientStatus != structs.AllocClientStatusComplete {
   649  			return false, fmt.Errorf("got status %v; want %v", state.ClientStatus, structs.AllocClientStatusComplete)
   650  		}
   651  
   652  		for name, s := range state.TaskStates {
   653  			if s.FinishedAt.IsZero() {
   654  				return false, fmt.Errorf("task %q has zero FinishedAt value", name)
   655  			}
   656  		}
   657  
   658  		return true, nil
   659  	}, func(err error) {
   660  		require.NoError(t, err)
   661  	})
   662  
   663  }
   664  
   665  // TestAllocRunner_MoveAllocDir asserts that a rescheduled
   666  // allocation copies ephemeral disk content from the previous allocation
   667  func TestAllocRunner_MoveAllocDir(t *testing.T) {
   668  	t.Parallel()
   669  
   670  	// Step 1: start and run a task
   671  	alloc := mock.BatchAlloc()
   672  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   673  	defer cleanup()
   674  	ar, err := NewAllocRunner(conf)
   675  	require.NoError(t, err)
   676  	ar.Run()
   677  	defer destroy(ar)
   678  
   679  	require.Equal(t, structs.AllocClientStatusComplete, ar.AllocState().ClientStatus)
   680  
   681  	// Step 2. Modify its directory
   682  	task := alloc.Job.TaskGroups[0].Tasks[0]
   683  	dataFile := filepath.Join(ar.allocDir.SharedDir, "data", "data_file")
   684  	require.NoError(t, ioutil.WriteFile(dataFile, []byte("hello world"), os.ModePerm))
   685  	taskDir := ar.allocDir.TaskDirs[task.Name]
   686  	taskLocalFile := filepath.Join(taskDir.LocalDir, "local_file")
   687  	require.NoError(t, ioutil.WriteFile(taskLocalFile, []byte("good bye world"), os.ModePerm))
   688  
   689  	// Step 3. Start a new alloc
   690  	alloc2 := mock.BatchAlloc()
   691  	alloc2.PreviousAllocation = alloc.ID
   692  	alloc2.Job.TaskGroups[0].EphemeralDisk.Sticky = true
   693  
   694  	conf2, cleanup := testAllocRunnerConfig(t, alloc2)
   695  	conf2.PrevAllocWatcher, conf2.PrevAllocMigrator = allocwatcher.NewAllocWatcher(allocwatcher.Config{
   696  		Alloc:          alloc2,
   697  		PreviousRunner: ar,
   698  		Logger:         conf2.Logger,
   699  	})
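        	// Pointing the watcher/migrator at the first runner is what allows ar2 to
        	// move the sticky ephemeral disk contents instead of starting from an empty
        	// alloc dir.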
   700  	defer cleanup()
   701  	ar2, err := NewAllocRunner(conf2)
   702  	require.NoError(t, err)
   703  
   704  	ar2.Run()
   705  	defer destroy(ar2)
   706  
   707  	require.Equal(t, structs.AllocClientStatusComplete, ar2.AllocState().ClientStatus)
   708  
   709  	// Ensure that data from ar was moved to ar2
   710  	dataFile = filepath.Join(ar2.allocDir.SharedDir, "data", "data_file")
   711  	fileInfo, _ := os.Stat(dataFile)
   712  	require.NotNilf(t, fileInfo, "file %q not found", dataFile)
   713  
   714  	taskDir = ar2.allocDir.TaskDirs[task.Name]
   715  	taskLocalFile = filepath.Join(taskDir.LocalDir, "local_file")
   716  	fileInfo, _ = os.Stat(taskLocalFile)
   717  	require.NotNilf(t, fileInfo, "file %q not found", taskLocalFile)
   718  
   719  }
   720  
   721  // TestAllocRunner_HandlesArtifactFailure ensures that if one task in a task
   722  // group fails to fetch an artifact, the other tasks in the group are still
   723  // able to proceed.
   724  func TestAllocRunner_HandlesArtifactFailure(t *testing.T) {
   725  	t.Parallel()
   726  
   727  	alloc := mock.BatchAlloc()
   728  	alloc.Job.TaskGroups[0].RestartPolicy = &structs.RestartPolicy{
   729  		Mode:     structs.RestartPolicyModeFail,
   730  		Attempts: 1,
   731  		Delay:    time.Nanosecond,
   732  		Interval: time.Hour,
   733  	}
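        	// A single ModeFail restart attempt keeps the bad-artifact task from
        	// retrying indefinitely, so the allocation reaches a terminal state quickly.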
   734  
   735  	// Create a new task with a bad artifact
   736  	badtask := alloc.Job.TaskGroups[0].Tasks[0].Copy()
   737  	badtask.Name = "bad"
   738  	badtask.Artifacts = []*structs.TaskArtifact{
   739  		{GetterSource: "http://127.0.0.1:0/foo/bar/baz"},
   740  	}
   741  
   742  	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, badtask)
   743  	alloc.AllocatedResources.Tasks["bad"] = &structs.AllocatedTaskResources{
   744  		Cpu: structs.AllocatedCpuResources{
   745  			CpuShares: 500,
   746  		},
   747  		Memory: structs.AllocatedMemoryResources{
   748  			MemoryMB: 256,
   749  		},
   750  	}
   751  
   752  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   753  	defer cleanup()
   754  	ar, err := NewAllocRunner(conf)
   755  	require.NoError(t, err)
   756  	go ar.Run()
   757  	defer destroy(ar)
   758  
   759  	testutil.WaitForResult(func() (bool, error) {
   760  		state := ar.AllocState()
   761  
   762  		switch state.ClientStatus {
   763  		case structs.AllocClientStatusComplete, structs.AllocClientStatusFailed:
   764  			return true, nil
   765  		default:
   766  			return false, fmt.Errorf("got status %v but want terminal", state.ClientStatus)
   767  		}
   768  
   769  	}, func(err error) {
   770  		require.NoError(t, err)
   771  	})
   772  
   773  	state := ar.AllocState()
   774  	require.Equal(t, structs.AllocClientStatusFailed, state.ClientStatus)
   775  	require.Equal(t, structs.TaskStateDead, state.TaskStates["web"].State)
   776  	require.True(t, state.TaskStates["web"].Successful())
   777  	require.Equal(t, structs.TaskStateDead, state.TaskStates["bad"].State)
   778  	require.True(t, state.TaskStates["bad"].Failed)
   779  }
   780  
   781  // Test that the alloc runner kills the other tasks in the task group when one task fails
   782  func TestAllocRunner_TaskFailed_KillTG(t *testing.T) {
   783  	alloc := mock.BatchAlloc()
   784  	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
   785  	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0
   786  
   787  	// Create two tasks in the task group
   788  	task := alloc.Job.TaskGroups[0].Tasks[0]
   789  	task.Name = "task1"
   790  	task.Driver = "mock_driver"
   791  	task.KillTimeout = 10 * time.Millisecond
   792  	task.Config = map[string]interface{}{
   793  		"run_for": "10s",
   794  	}
   795  
   796  	task2 := alloc.Job.TaskGroups[0].Tasks[0].Copy()
   797  	task2.Name = "task 2"
   798  	task2.Driver = "mock_driver"
   799  	task2.Config = map[string]interface{}{
   800  		"start_error": "fail task please",
   801  	}
   802  	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, task2)
   803  	alloc.AllocatedResources.Tasks[task.Name] = tr
   804  	alloc.AllocatedResources.Tasks[task2.Name] = tr
   805  
   806  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   807  	defer cleanup()
   808  	ar, err := NewAllocRunner(conf)
   809  	require.NoError(t, err)
   810  	defer destroy(ar)
   811  	go ar.Run()
   812  	upd := conf.StateUpdater.(*MockStateUpdater)
   813  
   814  	testutil.WaitForResult(func() (bool, error) {
   815  		last := upd.Last()
   816  		if last == nil {
   817  			return false, fmt.Errorf("No updates")
   818  		}
   819  		if last.ClientStatus != structs.AllocClientStatusFailed {
   820  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusFailed)
   821  		}
   822  
   823  		// Task One should be killed
   824  		state1 := last.TaskStates[task.Name]
   825  		if state1.State != structs.TaskStateDead {
   826  			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
   827  		}
   828  		if len(state1.Events) < 2 {
   829  			// Should have at least a received and a destroyed event
   830  			return false, fmt.Errorf("unexpected number of events: %d", len(state1.Events))
   831  		}
   832  
   833  		found := false
   834  		for _, e := range state1.Events {
   835  			if e.Type == structs.TaskSiblingFailed {
   836  				found = true
   837  			}
   838  		}
   839  
   840  		if !found {
   841  			return false, fmt.Errorf("Did not find event %v", structs.TaskSiblingFailed)
   842  		}
   843  
   844  		// Task Two should be failed
   845  		state2 := last.TaskStates[task2.Name]
   846  		if state2.State != structs.TaskStateDead {
   847  			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
   848  		}
   849  		if !state2.Failed {
   850  			return false, fmt.Errorf("task2 should have failed")
   851  		}
   852  
   853  		return true, nil
   854  	}, func(err error) {
   855  		require.Failf(t, "error waiting for task states", "%v", err)
   856  	})
   857  }
   858  
   859  // Test that a terminal alloc update stops the tasks and that Destroy then cleans up the alloc runner
   860  func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) {
   861  	t.Parallel()
   862  	alloc := mock.BatchAlloc()
   863  	tr := alloc.AllocatedResources.Tasks[alloc.Job.TaskGroups[0].Tasks[0].Name]
   864  	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 0
   865  	// Ensure task takes some time
   866  	task := alloc.Job.TaskGroups[0].Tasks[0]
   867  	task.Driver = "mock_driver"
   868  	task.Config["run_for"] = "10s"
   869  	alloc.AllocatedResources.Tasks[task.Name] = tr
   870  
   871  	conf, cleanup := testAllocRunnerConfig(t, alloc)
   872  	defer cleanup()
   873  	ar, err := NewAllocRunner(conf)
   874  	require.NoError(t, err)
   875  	defer destroy(ar)
   876  	go ar.Run()
   877  	upd := conf.StateUpdater.(*MockStateUpdater)
   878  
   879  	testutil.WaitForResult(func() (bool, error) {
   880  		last := upd.Last()
   881  		if last == nil {
   882  			return false, fmt.Errorf("No updates")
   883  		}
   884  		if last.ClientStatus != structs.AllocClientStatusRunning {
   885  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
   886  		}
   887  		return true, nil
   888  	}, func(err error) {
   889  		require.Failf(t, "error waiting for alloc to run", "%v", err)
   890  	})
   891  
   892  	// Update the alloc to be terminal which should cause the alloc runner to
   893  	// stop the tasks and wait for a destroy.
   894  	update := ar.alloc.Copy()
   895  	update.DesiredStatus = structs.AllocDesiredStatusStop
   896  	ar.Update(update)
   897  
   898  	testutil.WaitForResult(func() (bool, error) {
   899  		last := upd.Last()
   900  		if last == nil {
   901  			return false, fmt.Errorf("No updates")
   902  		}
   903  
   904  		// Check the status has changed.
   905  		if last.ClientStatus != structs.AllocClientStatusComplete {
   906  			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
   907  		}
   908  
   909  		// Check the alloc directory still exists
   910  		if _, err := os.Stat(ar.allocDir.AllocDir); err != nil {
   911  			return false, fmt.Errorf("alloc dir destroyed: %v", ar.allocDir.AllocDir)
   912  		}
   913  
   914  		return true, nil
   915  	}, func(err error) {
   916  		require.Failf(t, "error waiting for alloc to stop", "%v", err)
   917  	})
   918  
   919  	// Send the destroy signal and ensure the AllocRunner cleans up.
   920  	ar.Destroy()
   921  
   922  	testutil.WaitForResult(func() (bool, error) {
   923  		last := upd.Last()
   924  		if last == nil {
   925  			return false, fmt.Errorf("No updates")
   926  		}
   927  
   928  		// Check the status has changed.
   929  		if last.ClientStatus != structs.AllocClientStatusComplete {
   930  			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
   931  		}
   932  
   933  		// Check the alloc directory was cleaned
   934  		if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
   935  			return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir)
   936  		} else if !os.IsNotExist(err) {
   937  			return false, fmt.Errorf("stat err: %v", err)
   938  		}
   939  
   940  		return true, nil
   941  	}, func(err error) {
   942  		require.Failf(t, "error waiting for alloc to be destroyed", "%v", err)
   943  	})
   944  }