github.com/zhizhiboom/nomad@v0.8.5-0.20180907175415-f28fd3a1a056/client/allocrunner/alloc_runner_test.go

     1  package allocrunner
     2  
     3  import (
     4  	"fmt"
     5  	"io/ioutil"
     6  	"os"
     7  	"path/filepath"
     8  	"strings"
     9  	"testing"
    10  	"time"
    11  
    12  	"github.com/boltdb/bolt"
    13  	"github.com/hashicorp/consul/api"
    14  	"github.com/hashicorp/nomad/command/agent/consul"
    15  	"github.com/hashicorp/nomad/helper/testlog"
    16  	"github.com/hashicorp/nomad/helper/uuid"
    17  	"github.com/hashicorp/nomad/nomad/mock"
    18  	"github.com/hashicorp/nomad/nomad/structs"
    19  	"github.com/hashicorp/nomad/testutil"
    20  	"github.com/stretchr/testify/assert"
    21  
    22  	"github.com/hashicorp/nomad/client/allocrunner/taskrunner"
    23  	consulApi "github.com/hashicorp/nomad/client/consul"
    24  	"github.com/hashicorp/nomad/client/state"
    25  	"github.com/stretchr/testify/require"
    26  )
    27  
    28  // allocationBucketExists checks if the allocation bucket was created.
    29  func allocationBucketExists(tx *bolt.Tx, allocID string) bool {
    30  	bucket, err := state.GetAllocationBucket(tx, allocID)
    31  	return err == nil && bucket != nil
    32  }
    33  
    34  func TestAllocRunner_SimpleRun(t *testing.T) {
    35  	t.Parallel()
    36  	upd, ar := TestAllocRunner(t, false)
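         	// TestAllocRunner returns a MockAllocStateUpdater (upd) and the alloc
         	// runner under test; the boolean argument controls whether task
         	// restarts are enabled in the test helper.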
    37  	go ar.Run()
    38  	defer ar.Destroy()
    39  
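         	// WaitForResult retries the first function until it returns true or
         	// gives up, then invokes the second function with the last error.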
    40  	testutil.WaitForResult(func() (bool, error) {
    41  		last := upd.Last()
    42  		if last == nil {
    43  			return false, fmt.Errorf("No updates")
    44  		}
    45  		if last.ClientStatus != structs.AllocClientStatusComplete {
    46  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
    47  		}
    48  		return true, nil
    49  	}, func(err error) {
    50  		t.Fatalf("err: %v", err)
    51  	})
    52  }
    53  
     54  // Test that FinishedAt is set when the alloc is in a terminal state
    55  func TestAllocRunner_FinishedAtSet(t *testing.T) {
    56  	t.Parallel()
    57  	require := require.New(t)
    58  	_, ar := TestAllocRunner(t, false)
    59  	ar.allocClientStatus = structs.AllocClientStatusFailed
    60  	alloc := ar.Alloc()
    61  	taskFinishedAt := make(map[string]time.Time)
    62  	require.NotEmpty(alloc.TaskStates)
    63  	for name, s := range alloc.TaskStates {
    64  		require.False(s.FinishedAt.IsZero())
    65  		taskFinishedAt[name] = s.FinishedAt
    66  	}
    67  
     68  	// Verify that calling Alloc() again does not mutate FinishedAt
    69  	alloc2 := ar.Alloc()
    70  	for name, s := range alloc2.TaskStates {
    71  		require.Equal(taskFinishedAt[name], s.FinishedAt)
    72  	}
    73  
    74  }
    75  
     76  // Test that FinishedAt is set when the alloc is in a terminal state
    77  func TestAllocRunner_FinishedAtSet_TaskEvents(t *testing.T) {
    78  	t.Parallel()
    79  	require := require.New(t)
    80  	_, ar := TestAllocRunner(t, false)
    81  	ar.taskStates[ar.alloc.Job.TaskGroups[0].Tasks[0].Name] = &structs.TaskState{State: structs.TaskStateDead, Failed: true}
    82  
    83  	alloc := ar.Alloc()
    84  	taskFinishedAt := make(map[string]time.Time)
    85  	require.NotEmpty(alloc.TaskStates)
    86  	for name, s := range alloc.TaskStates {
    87  		require.False(s.FinishedAt.IsZero())
    88  		taskFinishedAt[name] = s.FinishedAt
    89  	}
    90  
     91  	// Verify that calling Alloc() again does not mutate FinishedAt
    92  	alloc2 := ar.Alloc()
    93  	for name, s := range alloc2.TaskStates {
    94  		require.Equal(taskFinishedAt[name], s.FinishedAt)
    95  	}
    96  
    97  }
    98  
    99  // Test that the watcher will mark the allocation as unhealthy.
   100  func TestAllocRunner_DeploymentHealth_Unhealthy_BadStart(t *testing.T) {
   101  	t.Parallel()
   102  	assert := assert.New(t)
   103  
   104  	// Ensure the task fails and restarts
   105  	upd, ar := TestAllocRunner(t, true)
   106  
   107  	// Make the task fail
   108  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   109  	task.Driver = "mock_driver"
   110  	task.Config["start_error"] = "test error"
   111  
   112  	// Make the alloc be part of a deployment
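         	// With the task_states health check, deployment health is derived from
         	// task states alone rather than from Consul checks.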
   113  	ar.alloc.DeploymentID = uuid.Generate()
   114  	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
   115  	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
   116  	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
   117  
   118  	go ar.Run()
   119  	defer ar.Destroy()
   120  
   121  	testutil.WaitForResult(func() (bool, error) {
   122  		last := upd.Last()
   123  		if last == nil {
   124  			return false, fmt.Errorf("No updates")
   125  		}
   126  		if !last.DeploymentStatus.HasHealth() {
   127  			return false, fmt.Errorf("want deployment status unhealthy; got unset")
   128  		} else if *last.DeploymentStatus.Healthy {
   129  			return false, fmt.Errorf("want deployment status unhealthy; got healthy")
   130  		}
   131  		return true, nil
   132  	}, func(err error) {
   133  		t.Fatalf("err: %v", err)
   134  	})
   135  
   136  	// Assert that we have an event explaining why we are unhealthy.
   137  	assert.Len(ar.taskStates, 1)
   138  	state := ar.taskStates[task.Name]
   139  	assert.NotNil(state)
   140  	assert.NotEmpty(state.Events)
   141  	last := state.Events[len(state.Events)-1]
   142  	assert.Equal(allocHealthEventSource, last.Type)
   143  	assert.Contains(last.Message, "failed task")
   144  }
   145  
   146  // Test that the watcher will mark the allocation as unhealthy if it hits its
   147  // deadline.
   148  func TestAllocRunner_DeploymentHealth_Unhealthy_Deadline(t *testing.T) {
   149  	t.Parallel()
   150  
   151  	// Don't restart but force service job type
   152  	upd, ar := TestAllocRunner(t, false)
   153  	ar.alloc.Job.Type = structs.JobTypeService
   154  
   155  	// Make the task block
   156  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   157  	task.Driver = "mock_driver"
   158  	task.Config["start_block_for"] = "4s"
   159  	task.Config["run_for"] = "10s"
   160  
   161  	// Make the alloc be part of a deployment
   162  	ar.alloc.DeploymentID = uuid.Generate()
   163  	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
   164  	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
   165  	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
   166  	ar.alloc.Job.TaskGroups[0].Update.HealthyDeadline = 100 * time.Millisecond
   167  
   168  	go ar.Run()
   169  	defer ar.Destroy()
   170  
   171  	testutil.WaitForResult(func() (bool, error) {
   172  		last := upd.Last()
   173  		if last == nil {
   174  			return false, fmt.Errorf("No updates")
   175  		}
   176  
   177  		// Assert alloc is unhealthy
   178  		if !last.DeploymentStatus.HasHealth() {
   179  			return false, fmt.Errorf("want deployment status unhealthy; got unset")
   180  		} else if *last.DeploymentStatus.Healthy {
   181  			return false, fmt.Errorf("want deployment status unhealthy; got healthy")
   182  		}
   183  
   184  		// Assert there is a task event explaining why we are unhealthy.
   185  		state, ok := last.TaskStates[task.Name]
   186  		if !ok {
   187  			return false, fmt.Errorf("missing state for task %s", task.Name)
   188  		}
   189  		n := len(state.Events)
   190  		if n == 0 {
   191  			return false, fmt.Errorf("no task events")
   192  		}
   193  		lastEvent := state.Events[n-1]
   194  		if lastEvent.Type != allocHealthEventSource {
   195  			return false, fmt.Errorf("expected %q; found %q", allocHealthEventSource, lastEvent.Type)
   196  		}
   197  		if !strings.Contains(lastEvent.Message, "not running by deadline") {
   198  			return false, fmt.Errorf(`expected "not running by deadline" but found: %s`, lastEvent.Message)
   199  		}
   200  
   201  		return true, nil
   202  	}, func(err error) {
   203  		t.Fatalf("err: %v", err)
   204  	})
   205  }
   206  
   207  // Test that the watcher will mark the allocation as healthy.
   208  func TestAllocRunner_DeploymentHealth_Healthy_NoChecks(t *testing.T) {
   209  	t.Parallel()
   210  
    211  	// Enable task restarts
   212  	upd, ar := TestAllocRunner(t, true)
   213  
   214  	// Make the task run healthy
   215  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   216  	task.Driver = "mock_driver"
   217  	task.Config["run_for"] = "10s"
   218  
   219  	// Create a task that takes longer to become healthy
   220  	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task.Copy())
   221  	task2 := ar.alloc.Job.TaskGroups[0].Tasks[1]
   222  	task2.Name = "task 2"
   223  	task2.Config["start_block_for"] = "500ms"
   224  
   225  	// Make the alloc be part of a deployment
   226  	ar.alloc.DeploymentID = uuid.Generate()
   227  	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
   228  	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
   229  	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
   230  	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
   231  
   232  	start := time.Now()
   233  	go ar.Run()
   234  	defer ar.Destroy()
   235  
   236  	testutil.WaitForResult(func() (bool, error) {
   237  		last := upd.Last()
   238  		if last == nil {
   239  			return false, fmt.Errorf("No updates")
   240  		}
   241  		if !last.DeploymentStatus.HasHealth() {
    242  			return false, fmt.Errorf("want deployment status healthy; got unset")
   243  		} else if !*last.DeploymentStatus.Healthy {
   244  			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
   245  		}
   246  		return true, nil
   247  	}, func(err error) {
   248  		t.Fatalf("err: %v", err)
   249  	})
    250  	if d := time.Since(start); d < 500*time.Millisecond {
   251  		t.Fatalf("didn't wait for second task group. Only took %v", d)
   252  	}
   253  }
   254  
   255  // Test that the watcher will mark the allocation as healthy with checks
   256  func TestAllocRunner_DeploymentHealth_Healthy_Checks(t *testing.T) {
   257  	t.Parallel()
   258  
    259  	// Enable task restarts
   260  	upd, ar := TestAllocRunner(t, true)
   261  
    262  	// Make the task run healthy
   263  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   264  	task.Driver = "mock_driver"
   265  	task.Config["run_for"] = "10s"
   266  
   267  	// Create a task that has no checks
   268  	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task.Copy())
   269  	task2 := ar.alloc.Job.TaskGroups[0].Tasks[1]
   270  	task2.Name = "task 2"
   271  	task2.Services = nil
   272  
   273  	// Make the alloc be part of a deployment
   274  	ar.alloc.DeploymentID = uuid.Generate()
   275  	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
   276  	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks
   277  	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
   278  	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
   279  
   280  	checkHealthy := &api.AgentCheck{
   281  		CheckID: uuid.Generate(),
   282  		Status:  api.HealthPassing,
   283  	}
   284  	checkUnhealthy := &api.AgentCheck{
   285  		CheckID: checkHealthy.CheckID,
   286  		Status:  api.HealthWarning,
   287  	}
   288  
   289  	// Only return the check as healthy after a duration
   290  	trigger := time.After(500 * time.Millisecond)
   291  	ar.consulClient.(*consulApi.MockConsulServiceClient).AllocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) {
   292  		select {
   293  		case <-trigger:
   294  			return &consul.AllocRegistration{
   295  				Tasks: map[string]*consul.TaskRegistration{
   296  					task.Name: {
   297  						Services: map[string]*consul.ServiceRegistration{
   298  							"123": {
   299  								Service: &api.AgentService{Service: "foo"},
   300  								Checks:  []*api.AgentCheck{checkHealthy},
   301  							},
   302  						},
   303  					},
   304  				},
   305  			}, nil
   306  		default:
   307  			return &consul.AllocRegistration{
   308  				Tasks: map[string]*consul.TaskRegistration{
   309  					task.Name: {
   310  						Services: map[string]*consul.ServiceRegistration{
   311  							"123": {
   312  								Service: &api.AgentService{Service: "foo"},
   313  								Checks:  []*api.AgentCheck{checkUnhealthy},
   314  							},
   315  						},
   316  					},
   317  				},
   318  			}, nil
   319  		}
   320  	}
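         	// The mocked consul client reports the warning check until the trigger
         	// fires, so the health watcher cannot observe a passing check until
         	// roughly 500ms have elapsed; the elapsed-time assertion below relies
         	// on this.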
   321  
   322  	start := time.Now()
   323  	go ar.Run()
   324  	defer ar.Destroy()
   325  
   326  	testutil.WaitForResult(func() (bool, error) {
   327  		last := upd.Last()
   328  		if last == nil {
   329  			return false, fmt.Errorf("No updates")
   330  		}
   331  		if !last.DeploymentStatus.HasHealth() {
    332  			return false, fmt.Errorf("want deployment status healthy; got unset")
   333  		} else if !*last.DeploymentStatus.Healthy {
   334  			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
   335  		}
   336  		return true, nil
   337  	}, func(err error) {
   338  		t.Fatalf("err: %v", err)
   339  	})
   340  
    341  	if d := time.Since(start); d < 500*time.Millisecond {
   342  		t.Fatalf("didn't wait for second task group. Only took %v", d)
   343  	}
   344  }
   345  
   346  // Test that the watcher will mark the allocation as unhealthy with failing
   347  // checks
   348  func TestAllocRunner_DeploymentHealth_Unhealthy_Checks(t *testing.T) {
   349  	t.Parallel()
   350  	assert := assert.New(t)
   351  
    352  	// Enable task restarts
   353  	upd, ar := TestAllocRunner(t, true)
   354  
    355  	// Make the task run
   356  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   357  	task.Driver = "mock_driver"
   358  	task.Config["run_for"] = "10s"
   359  
   360  	// Make the alloc be part of a deployment
   361  	ar.alloc.DeploymentID = uuid.Generate()
   362  	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
   363  	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks
   364  	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
   365  	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
   366  	ar.alloc.Job.TaskGroups[0].Update.HealthyDeadline = 1 * time.Second
   367  
   368  	checkUnhealthy := &api.AgentCheck{
   369  		CheckID: uuid.Generate(),
   370  		Status:  api.HealthWarning,
   371  	}
   372  
    373  	// Always return the check as unhealthy
   374  	ar.consulClient.(*consulApi.MockConsulServiceClient).AllocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) {
   375  		return &consul.AllocRegistration{
   376  			Tasks: map[string]*consul.TaskRegistration{
   377  				task.Name: {
   378  					Services: map[string]*consul.ServiceRegistration{
   379  						"123": {
   380  							Service: &api.AgentService{Service: "foo"},
   381  							Checks:  []*api.AgentCheck{checkUnhealthy},
   382  						},
   383  					},
   384  				},
   385  			},
   386  		}, nil
   387  	}
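         	// The check never passes, so once the 1s HealthyDeadline elapses the
         	// watcher should mark the allocation unhealthy.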
   388  
   389  	go ar.Run()
   390  	defer ar.Destroy()
   391  
   392  	testutil.WaitForResult(func() (bool, error) {
   393  		last := upd.Last()
   394  		if last == nil {
   395  			return false, fmt.Errorf("No updates")
   396  		}
   397  		if !last.DeploymentStatus.HasHealth() {
   398  			return false, fmt.Errorf("want deployment status unhealthy; got unset")
   399  		} else if *last.DeploymentStatus.Healthy {
   400  			return false, fmt.Errorf("want deployment status unhealthy; got healthy")
   401  		}
   402  		return true, nil
   403  	}, func(err error) {
   404  		t.Fatalf("err: %v", err)
   405  	})
   406  
   407  	// Assert that we have an event explaining why we are unhealthy.
   408  	assert.Len(ar.taskStates, 1)
   409  	state := ar.taskStates[task.Name]
   410  	assert.NotNil(state)
   411  	assert.NotEmpty(state.Events)
   412  	last := state.Events[len(state.Events)-1]
   413  	assert.Equal(allocHealthEventSource, last.Type)
   414  	assert.Contains(last.Message, "Services not healthy by deadline")
   415  }
   416  
   417  // Test that the watcher will mark the allocation as healthy.
   418  func TestAllocRunner_DeploymentHealth_Healthy_UpdatedDeployment(t *testing.T) {
   419  	t.Parallel()
   420  
    421  	// Enable task restarts
   422  	upd, ar := TestAllocRunner(t, true)
   423  
   424  	// Make the task run healthy
   425  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   426  	task.Driver = "mock_driver"
   427  	task.Config["run_for"] = "30s"
   428  
   429  	// Make the alloc be part of a deployment
   430  	ar.alloc.DeploymentID = uuid.Generate()
   431  	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
   432  	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
   433  	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
   434  	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
   435  
   436  	go ar.Run()
   437  	defer ar.Destroy()
   438  
   439  	testutil.WaitForResult(func() (bool, error) {
   440  		last := upd.Last()
   441  		if last == nil {
   442  			return false, fmt.Errorf("No updates")
   443  		}
   444  		if !last.DeploymentStatus.HasHealth() {
    445  			return false, fmt.Errorf("want deployment status healthy; got unset")
   446  		} else if !*last.DeploymentStatus.Healthy {
   447  			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
   448  		}
   449  		return true, nil
   450  	}, func(err error) {
   451  		t.Fatalf("err: %v", err)
   452  	})
   453  
    454  	// Mimic an update to a new deployment ID
   455  	last := upd.Last()
   456  	last.DeploymentStatus = nil
   457  	last.DeploymentID = uuid.Generate()
   458  	ar.Update(last)
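         	// Moving the alloc to a new deployment ID clears its recorded health,
         	// so the watcher should re-evaluate and report healthy again.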
   459  
   460  	testutil.WaitForResult(func() (bool, error) {
   461  		last := upd.Last()
   462  		if !last.DeploymentStatus.HasHealth() {
    463  			return false, fmt.Errorf("want deployment status healthy; got unset")
   464  		} else if !*last.DeploymentStatus.Healthy {
   465  			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
   466  		}
   467  		return true, nil
   468  	}, func(err error) {
   469  		t.Fatalf("err: %v", err)
   470  	})
   471  }
   472  
    473  // Test that health is reported for allocations that are migrated, not just
    474  // for those that are part of a deployment.
   475  func TestAllocRunner_DeploymentHealth_Healthy_Migration(t *testing.T) {
   476  	t.Parallel()
   477  
    478  	// Enable task restarts
   479  	upd, ar := TestAllocRunner(t, true)
   480  
   481  	// Make the task run healthy
   482  	tg := ar.alloc.Job.TaskGroups[0]
   483  	task := tg.Tasks[0]
   484  	task.Driver = "mock_driver"
   485  	task.Config["run_for"] = "30s"
   486  
   487  	// Shorten the default migration healthy time
   488  	tg.Migrate = structs.DefaultMigrateStrategy()
   489  	tg.Migrate.MinHealthyTime = 100 * time.Millisecond
   490  	tg.Migrate.HealthCheck = structs.MigrateStrategyHealthStates
   491  
   492  	// Ensure the alloc is *not* part of a deployment
   493  	ar.alloc.DeploymentID = ""
   494  
   495  	go ar.Run()
   496  	defer ar.Destroy()
   497  
   498  	testutil.WaitForResult(func() (bool, error) {
   499  		last := upd.Last()
   500  		if last == nil {
   501  			return false, fmt.Errorf("No updates")
   502  		}
   503  		if !last.DeploymentStatus.HasHealth() {
    504  			return false, fmt.Errorf("want deployment status healthy; got unset")
   505  		} else if !*last.DeploymentStatus.Healthy {
   506  			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
   507  		}
   508  		return true, nil
   509  	}, func(err error) {
   510  		t.Fatalf("err: %v", err)
   511  	})
   512  }
   513  
   514  // Test that health is *not* reported for batch jobs
   515  func TestAllocRunner_DeploymentHealth_BatchDisabled(t *testing.T) {
   516  	t.Parallel()
   517  
    518  	// Create a batch alloc
   519  	alloc := mock.BatchAlloc()
   520  	tg := alloc.Job.TaskGroups[0]
   521  
    522  	// This should not be possible as validation should prevent batch jobs
    523  	// from having a migration stanza!
   524  	tg.Migrate = structs.DefaultMigrateStrategy()
   525  	tg.Migrate.MinHealthyTime = 1 * time.Millisecond
   526  	tg.Migrate.HealthyDeadline = 2 * time.Millisecond
   527  	tg.Migrate.HealthCheck = structs.MigrateStrategyHealthStates
   528  
   529  	task := tg.Tasks[0]
   530  	task.Driver = "mock_driver"
   531  	task.Config["run_for"] = "5s"
   532  	upd, ar := TestAllocRunnerFromAlloc(t, alloc, false)
   533  
   534  	go ar.Run()
   535  	defer ar.Destroy()
   536  
   537  	testutil.WaitForResult(func() (bool, error) {
   538  		last := upd.Last()
   539  		if last == nil {
   540  			return false, fmt.Errorf("No updates")
   541  		}
   542  		if last.DeploymentStatus != nil {
   543  			return false, fmt.Errorf("unexpected deployment health set: %v", last.DeploymentStatus.Healthy)
   544  		}
   545  		return true, nil
   546  	}, func(err error) {
   547  		t.Fatalf("err: %v", err)
   548  	})
   549  }
   550  
    551  // TestAllocRunner_RetryArtifact ensures that if one task in a task group is
    552  // retrying an artifact fetch, the other tasks in the group can still
    553  // proceed.
   554  func TestAllocRunner_RetryArtifact(t *testing.T) {
   555  	t.Parallel()
   556  
   557  	alloc := mock.Alloc()
   558  	alloc.Job.Type = structs.JobTypeBatch
   559  	alloc.Job.TaskGroups[0].RestartPolicy.Mode = structs.RestartPolicyModeFail
   560  	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 1
   561  	alloc.Job.TaskGroups[0].RestartPolicy.Delay = time.Duration(4*testutil.TestMultiplier()) * time.Second
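         	// The restart policy gives the failing task a single retry with a long
         	// delay, leaving time for the healthy web task to finish while the
         	// artifact fetch is still being retried.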
   562  
   563  	task := alloc.Job.TaskGroups[0].Tasks[0]
   564  	task.Driver = "mock_driver"
   565  	task.Config = map[string]interface{}{
   566  		"exit_code": "0",
   567  		"run_for":   "1s",
   568  	}
   569  
   570  	// Create a new task with a bad artifact
   571  	badtask := alloc.Job.TaskGroups[0].Tasks[0].Copy()
   572  	badtask.Name = "bad"
   573  	badtask.Artifacts = []*structs.TaskArtifact{
   574  		{GetterSource: "http://127.0.0.1:0/foo/bar/baz"},
   575  	}
   576  
   577  	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, badtask)
   578  	upd, ar := TestAllocRunnerFromAlloc(t, alloc, true)
   579  	go ar.Run()
   580  	defer ar.Destroy()
   581  
   582  	testutil.WaitForResult(func() (bool, error) {
   583  		last := upd.Last()
   584  		if last == nil {
   585  			return false, fmt.Errorf("No updates")
   586  		}
   587  
   588  		// web task should have completed successfully while bad task
   589  		// retries artifact fetching
   590  		webstate, ok := last.TaskStates["web"]
   591  		if !ok {
   592  			return false, fmt.Errorf("no task state for web")
   593  		}
   594  		if webstate.State != structs.TaskStateDead {
   595  			return false, fmt.Errorf("expected web to be dead but found %q", last.TaskStates["web"].State)
   596  		}
   597  		if !webstate.Successful() {
   598  			return false, fmt.Errorf("expected web to have exited successfully")
   599  		}
   600  
   601  		// bad task should have failed
   602  		badstate := last.TaskStates["bad"]
   603  		if badstate.State != structs.TaskStateDead {
   604  			return false, fmt.Errorf("expected bad to be dead but found %q", badstate.State)
   605  		}
   606  		if !badstate.Failed {
   607  			return false, fmt.Errorf("expected bad to have failed: %#v", badstate.Events)
   608  		}
   609  		return true, nil
   610  	}, func(err error) {
   611  		t.Fatalf("err: %v", err)
   612  	})
   613  }
   614  
   615  func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) {
   616  	t.Parallel()
   617  	upd, ar := TestAllocRunner(t, false)
   618  
   619  	// Ensure task takes some time
   620  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   621  	task.Driver = "mock_driver"
   622  	task.Config["run_for"] = "10s"
   623  	go ar.Run()
   624  
   625  	testutil.WaitForResult(func() (bool, error) {
   626  		last := upd.Last()
   627  		if last == nil {
   628  			return false, fmt.Errorf("No updates")
   629  		}
   630  		if last.ClientStatus != structs.AllocClientStatusRunning {
   631  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
   632  		}
   633  		return true, nil
   634  	}, func(err error) {
   635  		t.Fatalf("err: %v", err)
   636  	})
   637  
   638  	// Update the alloc to be terminal which should cause the alloc runner to
   639  	// stop the tasks and wait for a destroy.
   640  	update := ar.alloc.Copy()
   641  	update.DesiredStatus = structs.AllocDesiredStatusStop
   642  	ar.Update(update)
   643  
   644  	testutil.WaitForResult(func() (bool, error) {
   645  		last := upd.Last()
   646  		if last == nil {
   647  			return false, fmt.Errorf("No updates")
   648  		}
   649  
   650  		// Check the status has changed.
   651  		if last.ClientStatus != structs.AllocClientStatusComplete {
   652  			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
   653  		}
   654  
   655  		// Check the allocation state still exists
   656  		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
   657  			if !allocationBucketExists(tx, ar.Alloc().ID) {
   658  				return fmt.Errorf("no bucket for alloc")
   659  			}
   660  
   661  			return nil
   662  		}); err != nil {
   663  			return false, fmt.Errorf("state destroyed")
   664  		}
   665  
   666  		// Check the alloc directory still exists
   667  		if _, err := os.Stat(ar.allocDir.AllocDir); err != nil {
   668  			return false, fmt.Errorf("alloc dir destroyed: %v", ar.allocDir.AllocDir)
   669  		}
   670  
   671  		return true, nil
   672  	}, func(err error) {
   673  		t.Fatalf("err: %v", err)
   674  	})
   675  
   676  	// Send the destroy signal and ensure the AllocRunner cleans up.
   677  	ar.Destroy()
   678  
   679  	testutil.WaitForResult(func() (bool, error) {
   680  		last := upd.Last()
   681  		if last == nil {
   682  			return false, fmt.Errorf("No updates")
   683  		}
   684  
   685  		// Check the status has changed.
   686  		if last.ClientStatus != structs.AllocClientStatusComplete {
   687  			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
   688  		}
   689  
   690  		// Check the state was cleaned
   691  		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
   692  			if allocationBucketExists(tx, ar.Alloc().ID) {
   693  				return fmt.Errorf("bucket for alloc exists")
   694  			}
   695  
   696  			return nil
   697  		}); err != nil {
   698  			return false, fmt.Errorf("state not destroyed")
   699  		}
   700  
   701  		// Check the alloc directory was cleaned
   702  		if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
   703  			return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir)
   704  		} else if !os.IsNotExist(err) {
   705  			return false, fmt.Errorf("stat err: %v", err)
   706  		}
   707  
   708  		return true, nil
   709  	}, func(err error) {
   710  		t.Fatalf("err: %v", err)
   711  	})
   712  }
   713  
   714  func TestAllocRunner_Destroy(t *testing.T) {
   715  	t.Parallel()
   716  	upd, ar := TestAllocRunner(t, false)
   717  
   718  	// Ensure task takes some time
   719  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   720  	task.Driver = "mock_driver"
   721  	task.Config["run_for"] = "10s"
   722  	go ar.Run()
   723  	start := time.Now()
   724  
   725  	// Begin the tear down
   726  	go func() {
   727  		time.Sleep(1 * time.Second)
   728  		ar.Destroy()
   729  	}()
   730  
   731  	testutil.WaitForResult(func() (bool, error) {
   732  		last := upd.Last()
   733  		if last == nil {
   734  			return false, fmt.Errorf("No updates")
   735  		}
   736  
   737  		// Check the status has changed.
   738  		if last.ClientStatus != structs.AllocClientStatusComplete {
   739  			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
   740  		}
   741  
   742  		// Check the state was cleaned
   743  		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
   744  			if allocationBucketExists(tx, ar.Alloc().ID) {
   745  				return fmt.Errorf("bucket for alloc exists")
   746  			}
   747  
   748  			return nil
   749  		}); err != nil {
   750  			return false, fmt.Errorf("state not destroyed: %v", err)
   751  		}
   752  
   753  		// Check the alloc directory was cleaned
   754  		if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
   755  			return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir)
   756  		} else if !os.IsNotExist(err) {
   757  			return false, fmt.Errorf("stat err: %v", err)
   758  		}
   759  
   760  		return true, nil
   761  	}, func(err error) {
   762  		t.Fatalf("err: %v", err)
   763  	})
   764  
   765  	if elapsed := time.Since(start); elapsed > 20*time.Second {
   766  		t.Fatalf("took too long to terminate: %s", elapsed)
   767  	}
   768  }
   769  
   770  func TestAllocRunner_Update(t *testing.T) {
   771  	t.Parallel()
   772  	_, ar := TestAllocRunner(t, false)
   773  
   774  	// Deep copy the alloc to avoid races when updating
   775  	newAlloc := ar.Alloc().Copy()
   776  
   777  	// Ensure task takes some time
   778  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   779  	task.Driver = "mock_driver"
   780  	task.Config["run_for"] = "10s"
   781  	go ar.Run()
   782  	defer ar.Destroy()
   783  
   784  	// Update the alloc definition
   785  	newAlloc.Name = "FOO"
   786  	newAlloc.AllocModifyIndex++
   787  	ar.Update(newAlloc)
   788  
   789  	// Check the alloc runner stores the update allocation.
   790  	testutil.WaitForResult(func() (bool, error) {
   791  		return ar.Alloc().Name == "FOO", nil
   792  	}, func(err error) {
   793  		t.Fatalf("err: %v %#v", err, ar.Alloc())
   794  	})
   795  }
   796  
   797  func TestAllocRunner_SaveRestoreState(t *testing.T) {
   798  	t.Parallel()
   799  	alloc := mock.Alloc()
   800  	task := alloc.Job.TaskGroups[0].Tasks[0]
   801  	task.Driver = "mock_driver"
   802  	task.Config = map[string]interface{}{
   803  		"exit_code": "0",
   804  		"run_for":   "10s",
   805  	}
   806  
   807  	upd, ar := TestAllocRunnerFromAlloc(t, alloc, false)
   808  	go ar.Run()
   809  	defer ar.Destroy()
   810  
   811  	// Snapshot state
   812  	testutil.WaitForResult(func() (bool, error) {
   813  		ar.taskLock.RLock()
   814  		defer ar.taskLock.RUnlock()
   815  		return len(ar.tasks) == 1, nil
   816  	}, func(err error) {
   817  		t.Fatalf("task never started: %v", err)
   818  	})
   819  
   820  	err := ar.SaveState()
   821  	if err != nil {
   822  		t.Fatalf("err: %v", err)
   823  	}
   824  
   825  	// Create a new alloc runner
   826  	l2 := testlog.WithPrefix(t, "----- ar2:  ")
   827  	alloc2 := &structs.Allocation{ID: ar.alloc.ID}
   828  	prevAlloc := NewAllocWatcher(alloc2, ar, nil, ar.config, l2, "")
   829  	ar2 := NewAllocRunner(l2, ar.config, ar.stateDB, upd.Update,
   830  		alloc2, ar.vaultClient, ar.consulClient, prevAlloc)
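         	// The second runner shares ar's state DB and config, mirroring how the
         	// client rebuilds alloc runners after an agent restart before calling
         	// RestoreState and Run.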
   831  	err = ar2.RestoreState()
   832  	if err != nil {
   833  		t.Fatalf("err: %v", err)
   834  	}
   835  	go ar2.Run()
   836  
   837  	testutil.WaitForResult(func() (bool, error) {
   838  		if len(ar2.tasks) != 1 {
   839  			return false, fmt.Errorf("Incorrect number of tasks")
   840  		}
   841  
   842  		last := upd.Last()
   843  		if last == nil {
   844  			return false, nil
   845  		}
   846  
   847  		return last.ClientStatus == structs.AllocClientStatusRunning, nil
   848  	}, func(err error) {
   849  		last := upd.Last()
   850  		t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates["web"])
   851  	})
   852  
   853  	// Destroy and wait
   854  	ar2.Destroy()
   855  	start := time.Now()
   856  
   857  	testutil.WaitForResult(func() (bool, error) {
   858  		alloc := ar2.Alloc()
   859  		if alloc.ClientStatus != structs.AllocClientStatusComplete {
   860  			return false, fmt.Errorf("Bad client status; got %v; want %v", alloc.ClientStatus, structs.AllocClientStatusComplete)
   861  		}
   862  		return true, nil
   863  	}, func(err error) {
   864  		last := upd.Last()
   865  		t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates)
   866  	})
   867  
   868  	if time.Since(start) > time.Duration(testutil.TestMultiplier()*5)*time.Second {
   869  		t.Fatalf("took too long to terminate")
   870  	}
   871  }
   872  
   873  func TestAllocRunner_SaveRestoreState_TerminalAlloc(t *testing.T) {
   874  	t.Parallel()
   875  	upd, ar := TestAllocRunner(t, false)
   876  	ar.logger = testlog.WithPrefix(t, "ar1:  ")
   877  
   878  	// Ensure task takes some time
   879  	ar.alloc.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
   880  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   881  	task.Config["run_for"] = "10s"
   882  	go ar.Run()
   883  	defer ar.Destroy()
   884  
   885  	testutil.WaitForResult(func() (bool, error) {
   886  		last := upd.Last()
   887  		if last == nil {
   888  			return false, fmt.Errorf("No updates")
   889  		}
   890  
   891  		if last.ClientStatus != structs.AllocClientStatusRunning {
   892  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
   893  		}
   894  		return true, nil
   895  	}, func(err error) {
   896  		t.Fatalf("err: %v", err)
   897  	})
   898  
   899  	// Update the alloc to be terminal which should cause the alloc runner to
   900  	// stop the tasks and wait for a destroy.
   901  	update := ar.alloc.Copy()
   902  	update.DesiredStatus = structs.AllocDesiredStatusStop
   903  	ar.Update(update)
   904  
   905  	testutil.WaitForResult(func() (bool, error) {
   906  		return ar.Alloc().DesiredStatus == structs.AllocDesiredStatusStop, nil
   907  	}, func(err error) {
   908  		t.Fatalf("err: %v", err)
   909  	})
   910  
   911  	err := ar.SaveState()
   912  	if err != nil {
   913  		t.Fatalf("err: %v", err)
   914  	}
   915  
   916  	// Ensure ar1 doesn't recreate the state file
   917  	ar.allocLock.Lock()
   918  	defer ar.allocLock.Unlock()
   919  
   920  	// Create a new alloc runner
   921  	l2 := testlog.WithPrefix(t, "ar2:  ")
   922  	alloc2 := &structs.Allocation{ID: ar.alloc.ID}
   923  	prevAlloc := NewAllocWatcher(alloc2, ar, nil, ar.config, l2, "")
   924  	ar2 := NewAllocRunner(l2, ar.config, ar.stateDB, upd.Update,
   925  		alloc2, ar.vaultClient, ar.consulClient, prevAlloc)
   926  	err = ar2.RestoreState()
   927  	if err != nil {
   928  		t.Fatalf("err: %v", err)
   929  	}
   930  	ar2.logger.Println("[TESTING] running second alloc runner")
   931  	go ar2.Run()
   932  	defer ar2.Destroy() // Just-in-case of failure before Destroy below
   933  
   934  	testutil.WaitForResult(func() (bool, error) {
   935  		// Check the state still exists
   936  		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
   937  			if !allocationBucketExists(tx, ar2.Alloc().ID) {
   938  				return fmt.Errorf("no bucket for alloc")
   939  			}
   940  
   941  			return nil
   942  		}); err != nil {
   943  			return false, fmt.Errorf("state destroyed")
   944  		}
   945  
   946  		// Check the alloc directory still exists
   947  		if _, err := os.Stat(ar.allocDir.AllocDir); err != nil {
   948  			return false, fmt.Errorf("alloc dir destroyed: %v", ar.allocDir.AllocDir)
   949  		}
   950  
   951  		return true, nil
   952  	}, func(err error) {
   953  		last := upd.Last()
   954  		t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates)
   955  	})
   956  
   957  	// Send the destroy signal and ensure the AllocRunner cleans up.
   958  	ar2.logger.Println("[TESTING] destroying second alloc runner")
   959  	ar2.Destroy()
   960  
   961  	testutil.WaitForResult(func() (bool, error) {
   962  		last := upd.Last()
   963  		if last == nil {
   964  			return false, fmt.Errorf("No updates")
   965  		}
   966  
   967  		// Check the status has changed.
   968  		if last.ClientStatus != structs.AllocClientStatusComplete {
   969  			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
   970  		}
   971  
   972  		// Check the state was cleaned
   973  		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
   974  			if allocationBucketExists(tx, ar2.Alloc().ID) {
   975  				return fmt.Errorf("bucket for alloc exists")
   976  			}
   977  
   978  			return nil
   979  		}); err != nil {
   980  			return false, fmt.Errorf("state not destroyed")
   981  		}
   982  
   983  		// Check the alloc directory was cleaned
   984  		if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
   985  			return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir)
   986  		} else if !os.IsNotExist(err) {
   987  			return false, fmt.Errorf("stat err: %v", err)
   988  		}
   989  
   990  		return true, nil
   991  	}, func(err error) {
   992  		t.Fatalf("err: %v", err)
   993  	})
   994  }
   995  
   996  func TestAllocRunner_TaskFailed_KillTG(t *testing.T) {
   997  	t.Parallel()
   998  	upd, ar := TestAllocRunner(t, false)
   999  
  1000  	// Create two tasks in the task group
  1001  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
  1002  	task.Driver = "mock_driver"
  1003  	task.KillTimeout = 10 * time.Millisecond
  1004  	task.Config = map[string]interface{}{
  1005  		"run_for": "10s",
  1006  	}
  1007  
  1008  	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
  1009  	task2.Name = "task 2"
  1010  	task2.Driver = "mock_driver"
  1011  	task2.Config = map[string]interface{}{
  1012  		"start_error": "fail task please",
  1013  	}
  1014  	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2)
  1015  	ar.alloc.TaskResources[task2.Name] = task2.Resources
  1016  	go ar.Run()
  1017  	defer ar.Destroy()
  1018  
  1019  	testutil.WaitForResult(func() (bool, error) {
  1020  		last := upd.Last()
  1021  		if last == nil {
  1022  			return false, fmt.Errorf("No updates")
  1023  		}
  1024  		if last.ClientStatus != structs.AllocClientStatusFailed {
  1025  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusFailed)
  1026  		}
  1027  
  1028  		// Task One should be killed
  1029  		state1 := last.TaskStates[task.Name]
  1030  		if state1.State != structs.TaskStateDead {
  1031  			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
  1032  		}
  1033  		if len(state1.Events) < 2 {
   1034  			// Expect at least received and terminated events
  1035  			return false, fmt.Errorf("Unexpected number of events")
  1036  		}
  1037  
  1038  		found := false
  1039  		for _, e := range state1.Events {
   1040  			if e.Type == structs.TaskSiblingFailed {
  1041  				found = true
  1042  			}
  1043  		}
  1044  
  1045  		if !found {
  1046  			return false, fmt.Errorf("Did not find event %v", structs.TaskSiblingFailed)
  1047  		}
  1048  
  1049  		// Task Two should be failed
  1050  		state2 := last.TaskStates[task2.Name]
  1051  		if state2.State != structs.TaskStateDead {
  1052  			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
  1053  		}
  1054  		if !state2.Failed {
  1055  			return false, fmt.Errorf("task2 should have failed")
  1056  		}
  1057  
  1058  		return true, nil
  1059  	}, func(err error) {
  1060  		t.Fatalf("err: %v", err)
  1061  	})
  1062  }
  1063  
  1064  func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
  1065  	t.Parallel()
  1066  	upd, ar := TestAllocRunner(t, false)
  1067  
  1068  	// Create two tasks in the task group
  1069  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
  1070  	task.Driver = "mock_driver"
  1071  	task.KillTimeout = 10 * time.Millisecond
  1072  	task.Config = map[string]interface{}{
  1073  		"run_for": "10s",
  1074  	}
  1075  
  1076  	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
  1077  	task2.Name = "task 2"
  1078  	task2.Driver = "mock_driver"
  1079  	task2.Leader = true
  1080  	task2.Config = map[string]interface{}{
  1081  		"run_for": "1s",
  1082  	}
  1083  	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2)
  1084  	ar.alloc.TaskResources[task2.Name] = task2.Resources
  1085  	go ar.Run()
  1086  	defer ar.Destroy()
  1087  
  1088  	testutil.WaitForResult(func() (bool, error) {
  1089  		last := upd.Last()
  1090  		if last == nil {
  1091  			return false, fmt.Errorf("No updates")
  1092  		}
  1093  		if last.ClientStatus != structs.AllocClientStatusComplete {
  1094  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
  1095  		}
  1096  
  1097  		// Task One should be killed
  1098  		state1 := last.TaskStates[task.Name]
  1099  		if state1.State != structs.TaskStateDead {
  1100  			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
  1101  		}
  1102  		if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() {
  1103  			return false, fmt.Errorf("expected to have a start and finish time")
  1104  		}
  1105  		if len(state1.Events) < 2 {
   1106  			// Expect at least received and terminated events
  1107  			return false, fmt.Errorf("Unexpected number of events")
  1108  		}
  1109  
  1110  		found := false
  1111  		for _, e := range state1.Events {
   1112  			if e.Type == structs.TaskLeaderDead {
  1113  				found = true
  1114  			}
  1115  		}
  1116  
  1117  		if !found {
  1118  			return false, fmt.Errorf("Did not find event %v", structs.TaskLeaderDead)
  1119  		}
  1120  
  1121  		// Task Two should be dead
  1122  		state2 := last.TaskStates[task2.Name]
  1123  		if state2.State != structs.TaskStateDead {
  1124  			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
  1125  		}
  1126  		if state2.FinishedAt.IsZero() || state2.StartedAt.IsZero() {
  1127  			return false, fmt.Errorf("expected to have a start and finish time")
  1128  		}
  1129  
  1130  		return true, nil
  1131  	}, func(err error) {
  1132  		t.Fatalf("err: %v", err)
  1133  	})
  1134  }
  1135  
  1136  // TestAllocRunner_TaskLeader_StopTG asserts that when stopping a task group
  1137  // with a leader the leader is stopped before other tasks.
  1138  func TestAllocRunner_TaskLeader_StopTG(t *testing.T) {
  1139  	t.Parallel()
  1140  	upd, ar := TestAllocRunner(t, false)
  1141  
  1142  	// Create 3 tasks in the task group
  1143  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
  1144  	task.Name = "follower1"
  1145  	task.Driver = "mock_driver"
  1146  	task.KillTimeout = 10 * time.Millisecond
  1147  	task.Config = map[string]interface{}{
  1148  		"run_for": "10s",
  1149  	}
  1150  
  1151  	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
  1152  	task2.Name = "leader"
  1153  	task2.Driver = "mock_driver"
  1154  	task2.Leader = true
  1155  	task2.KillTimeout = 10 * time.Millisecond
  1156  	task2.Config = map[string]interface{}{
  1157  		"run_for": "10s",
  1158  	}
  1159  
  1160  	task3 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
  1161  	task3.Name = "follower2"
  1162  	task3.Driver = "mock_driver"
  1163  	task3.KillTimeout = 10 * time.Millisecond
  1164  	task3.Config = map[string]interface{}{
  1165  		"run_for": "10s",
  1166  	}
  1167  	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2, task3)
  1168  	ar.alloc.TaskResources[task2.Name] = task2.Resources
  1169  	defer ar.Destroy()
  1170  
  1171  	go ar.Run()
  1172  
  1173  	// Wait for tasks to start
  1174  	last := upd.Last()
  1175  	testutil.WaitForResult(func() (bool, error) {
  1176  		last = upd.Last()
  1177  		if last == nil {
  1178  			return false, fmt.Errorf("No updates")
  1179  		}
  1180  		if n := len(last.TaskStates); n != 3 {
  1181  			return false, fmt.Errorf("Not enough task states (want: 3; found %d)", n)
  1182  		}
  1183  		for name, state := range last.TaskStates {
  1184  			if state.State != structs.TaskStateRunning {
  1185  				return false, fmt.Errorf("Task %q is not running yet (it's %q)", name, state.State)
  1186  			}
  1187  		}
  1188  		return true, nil
  1189  	}, func(err error) {
  1190  		t.Fatalf("err: %v", err)
  1191  	})
  1192  
  1193  	// Reset updates
  1194  	upd.mu.Lock()
  1195  	upd.Allocs = upd.Allocs[:0]
  1196  	upd.mu.Unlock()
  1197  
  1198  	// Stop alloc
  1199  	update := ar.Alloc()
  1200  	update.DesiredStatus = structs.AllocDesiredStatusStop
  1201  	ar.Update(update)
  1202  
  1203  	// Wait for tasks to stop
  1204  	testutil.WaitForResult(func() (bool, error) {
  1205  		last := upd.Last()
  1206  		if last == nil {
  1207  			return false, fmt.Errorf("No updates")
  1208  		}
  1209  		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower1"].FinishedAt.UnixNano() {
  1210  			return false, fmt.Errorf("expected leader to finish before follower1: %s >= %s",
  1211  				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower1"].FinishedAt)
  1212  		}
  1213  		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower2"].FinishedAt.UnixNano() {
  1214  			return false, fmt.Errorf("expected leader to finish before follower2: %s >= %s",
  1215  				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower2"].FinishedAt)
  1216  		}
  1217  		return true, nil
  1218  	}, func(err error) {
  1219  		last := upd.Last()
  1220  		for name, state := range last.TaskStates {
  1221  			t.Logf("%s: %s", name, state.State)
  1222  		}
  1223  		t.Fatalf("err: %v", err)
  1224  	})
  1225  }
  1226  
   1227  // TestAllocRunner_TaskLeader_StopRestoredTG asserts that when stopping a
   1228  // restored task group whose leader failed before the restore, the leader is
   1229  // not stopped again since it no longer exists.
  1230  // See https://github.com/hashicorp/nomad/issues/3420#issuecomment-341666932
  1231  func TestAllocRunner_TaskLeader_StopRestoredTG(t *testing.T) {
  1232  	t.Skip("Skipping because the functionality being tested doesn't exist")
  1233  	t.Parallel()
  1234  	_, ar := TestAllocRunner(t, false)
  1235  	defer ar.Destroy()
  1236  
  1237  	// Create a leader and follower task in the task group
  1238  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
  1239  	task.Name = "follower1"
  1240  	task.Driver = "mock_driver"
  1241  	task.KillTimeout = 10 * time.Second
  1242  	task.Config = map[string]interface{}{
  1243  		"run_for": "10s",
  1244  	}
  1245  
  1246  	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
  1247  	task2.Name = "leader"
  1248  	task2.Driver = "mock_driver"
  1249  	task2.Leader = true
  1250  	task2.KillTimeout = 10 * time.Millisecond
  1251  	task2.Config = map[string]interface{}{
  1252  		"run_for": "0s",
  1253  	}
  1254  
  1255  	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2)
  1256  	ar.alloc.TaskResources[task2.Name] = task2.Resources
  1257  
   1258  	// Mimic Nomad exiting before the dead leader could trigger stopping the other tasks.
  1259  	ar.tasks = map[string]*taskrunner.TaskRunner{
  1260  		"leader": taskrunner.NewTaskRunner(ar.logger, ar.config, ar.stateDB, ar.setTaskState,
  1261  			ar.allocDir.NewTaskDir(task2.Name), ar.Alloc(), task2.Copy(),
  1262  			ar.vaultClient, ar.consulClient),
  1263  		"follower1": taskrunner.NewTaskRunner(ar.logger, ar.config, ar.stateDB, ar.setTaskState,
  1264  			ar.allocDir.NewTaskDir(task.Name), ar.Alloc(), task.Copy(),
  1265  			ar.vaultClient, ar.consulClient),
  1266  	}
  1267  	ar.taskStates = map[string]*structs.TaskState{
  1268  		"leader":    {State: structs.TaskStateDead},
  1269  		"follower1": {State: structs.TaskStateRunning},
  1270  	}
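         	// Hand-build the runner's task state so the snapshot records a dead
         	// leader and a still-running follower, mimicking an agent that exited
         	// after the leader had already died.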
  1271  	if err := ar.SaveState(); err != nil {
  1272  		t.Fatalf("error saving state: %v", err)
  1273  	}
  1274  
  1275  	// Create a new AllocRunner to test RestoreState and Run
  1276  	upd2 := &MockAllocStateUpdater{}
  1277  	ar2 := NewAllocRunner(ar.logger, ar.config, ar.stateDB, upd2.Update, ar.alloc,
  1278  		ar.vaultClient, ar.consulClient, ar.prevAlloc)
  1279  	defer ar2.Destroy()
  1280  
  1281  	if err := ar2.RestoreState(); err != nil {
  1282  		t.Fatalf("error restoring state: %v", err)
  1283  	}
  1284  	go ar2.Run()
  1285  
  1286  	// Wait for tasks to be stopped because leader is dead
  1287  	testutil.WaitForResult(func() (bool, error) {
  1288  		alloc := ar2.Alloc()
  1289  		for task, state := range alloc.TaskStates {
  1290  			if state.State != structs.TaskStateDead {
  1291  				return false, fmt.Errorf("Task %q should be dead: %v", task, state.State)
  1292  			}
  1293  		}
  1294  		return true, nil
  1295  	}, func(err error) {
  1296  		t.Fatalf("err: %v", err)
  1297  	})
  1298  
  1299  	// Make sure it GCs properly
  1300  	ar2.Destroy()
  1301  
  1302  	select {
  1303  	case <-ar2.WaitCh():
  1304  		// exited as expected
  1305  	case <-time.After(10 * time.Second):
  1306  		t.Fatalf("timed out waiting for AR to GC")
  1307  	}
  1308  }
  1309  
  1310  // TestAllocRunner_MoveAllocDir asserts that a file written to an alloc's
   1311  // local/ dir will be moved to a replacement alloc's local/ dir if sticky
   1312  // volumes are enabled.
  1313  func TestAllocRunner_MoveAllocDir(t *testing.T) {
  1314  	t.Parallel()
  1315  	// Create an alloc runner
  1316  	alloc := mock.Alloc()
  1317  	task := alloc.Job.TaskGroups[0].Tasks[0]
  1318  	task.Driver = "mock_driver"
  1319  	task.Config = map[string]interface{}{
  1320  		"run_for": "1s",
  1321  	}
  1322  	upd, ar := TestAllocRunnerFromAlloc(t, alloc, false)
  1323  	go ar.Run()
  1324  	defer ar.Destroy()
  1325  
  1326  	testutil.WaitForResult(func() (bool, error) {
  1327  		last := upd.Last()
  1328  		if last == nil {
  1329  			return false, fmt.Errorf("No updates")
  1330  		}
  1331  		if last.ClientStatus != structs.AllocClientStatusComplete {
  1332  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
  1333  		}
  1334  		return true, nil
  1335  	}, func(err error) {
  1336  		t.Fatalf("err: %v", err)
  1337  	})
  1338  
   1339  	// Write some data to the alloc's shared data dir and the task's local dir
  1340  	dataFile := filepath.Join(ar.allocDir.SharedDir, "data", "data_file")
  1341  	ioutil.WriteFile(dataFile, []byte("hello world"), os.ModePerm)
  1342  	taskDir := ar.allocDir.TaskDirs[task.Name]
  1343  	taskLocalFile := filepath.Join(taskDir.LocalDir, "local_file")
  1344  	ioutil.WriteFile(taskLocalFile, []byte("good bye world"), os.ModePerm)
  1345  
  1346  	// Create another alloc runner
  1347  	alloc2 := mock.Alloc()
  1348  	alloc2.PreviousAllocation = ar.allocID
  1349  	alloc2.Job.TaskGroups[0].EphemeralDisk.Sticky = true
  1350  	task = alloc2.Job.TaskGroups[0].Tasks[0]
  1351  	task.Driver = "mock_driver"
  1352  	task.Config = map[string]interface{}{
  1353  		"run_for": "1s",
  1354  	}
  1355  	upd2, ar2 := TestAllocRunnerFromAlloc(t, alloc2, false)
  1356  
  1357  	// Set prevAlloc like Client does
  1358  	ar2.prevAlloc = NewAllocWatcher(alloc2, ar, nil, ar2.config, ar2.logger, "")
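         	// Setting prevAlloc wires up the alloc watcher, which handles moving the
         	// previous alloc's data into ar2's alloc dir because the task group's
         	// ephemeral disk is marked sticky.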
  1359  
  1360  	go ar2.Run()
  1361  	defer ar2.Destroy()
  1362  
  1363  	testutil.WaitForResult(func() (bool, error) {
  1364  		last := upd2.Last()
  1365  		if last == nil {
  1366  			return false, fmt.Errorf("No updates")
  1367  		}
  1368  		if last.ClientStatus != structs.AllocClientStatusComplete {
  1369  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
  1370  		}
  1371  		return true, nil
  1372  	}, func(err error) {
  1373  		t.Fatalf("err: %v", err)
  1374  	})
  1375  
  1376  	// Ensure that data from ar was moved to ar2
  1377  	taskDir = ar2.allocDir.TaskDirs[task.Name]
  1378  	taskLocalFile = filepath.Join(taskDir.LocalDir, "local_file")
  1379  	if fileInfo, _ := os.Stat(taskLocalFile); fileInfo == nil {
  1380  		t.Fatalf("file %v not found", taskLocalFile)
  1381  	}
  1382  
  1383  	dataFile = filepath.Join(ar2.allocDir.SharedDir, "data", "data_file")
  1384  	if fileInfo, _ := os.Stat(dataFile); fileInfo == nil {
  1385  		t.Fatalf("file %v not found", dataFile)
  1386  	}
  1387  }