github.com/djenriquez/nomad-1@v0.8.1/client/alloc_runner_test.go

     1  package client
     2  
     3  import (
     4  	"fmt"
     5  	"io/ioutil"
     6  	"os"
     7  	"path/filepath"
     8  	"strings"
     9  	"sync"
    10  	"testing"
    11  	"text/template"
    12  	"time"
    13  
    14  	"github.com/boltdb/bolt"
    15  	"github.com/hashicorp/consul/api"
    16  	"github.com/hashicorp/go-multierror"
    17  	"github.com/hashicorp/nomad/command/agent/consul"
    18  	"github.com/hashicorp/nomad/helper/testlog"
    19  	"github.com/hashicorp/nomad/helper/uuid"
    20  	"github.com/hashicorp/nomad/nomad/mock"
    21  	"github.com/hashicorp/nomad/nomad/structs"
    22  	"github.com/hashicorp/nomad/testutil"
    23  	"github.com/hashicorp/nomad/version"
    24  	"github.com/kr/pretty"
    25  	"github.com/stretchr/testify/assert"
    26  
    27  	"github.com/hashicorp/nomad/client/config"
    28  	"github.com/hashicorp/nomad/client/vaultclient"
    29  	"github.com/stretchr/testify/require"
    30  )
    31  
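        // MockAllocStateUpdater records every allocation update it receives so tests
        // can inspect the most recently synced client-side state, for example:
        //
        //	upd := &MockAllocStateUpdater{}
        //	// ... pass upd.Update to NewAllocRunner ...
        //	last := upd.Last() // copy of the latest alloc, or nil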
    32  type MockAllocStateUpdater struct {
    33  	Allocs []*structs.Allocation
    34  	mu     sync.Mutex
    35  }
    36  
    37  // Update fulfills the TaskStateUpdater interface
    38  func (m *MockAllocStateUpdater) Update(alloc *structs.Allocation) {
    39  	m.mu.Lock()
    40  	m.Allocs = append(m.Allocs, alloc)
    41  	m.mu.Unlock()
    42  }
    43  
    44  // Last returns a copy of the most recently synced alloc, or nil if no updates
        // have been received.
    45  func (m *MockAllocStateUpdater) Last() *structs.Allocation {
    46  	m.mu.Lock()
    47  	defer m.mu.Unlock()
    48  	n := len(m.Allocs)
    49  	if n == 0 {
    50  		return nil
    51  	}
    52  	return m.Allocs[n-1].Copy()
    53  }
    54  
    55  // allocationBucketExists checks if the allocation bucket was created.
    56  func allocationBucketExists(tx *bolt.Tx, allocID string) bool {
    57  	allocations := tx.Bucket(allocationsBucket)
    58  	if allocations == nil {
    59  		return false
    60  	}
    61  
    62  	// Retrieve the specific allocations bucket
    63  	alloc := allocations.Bucket([]byte(allocID))
    64  	return alloc != nil
    65  }
    66  
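        // testAllocRunnerFromAlloc builds an AllocRunner for the given alloc backed by
        // a temporary bolt state DB, a mock Vault client, and a mock Consul client.
        // When restarts is false the job is converted to a batch job with a
        // zero-attempt restart policy so tasks run exactly once.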
    67  func testAllocRunnerFromAlloc(t *testing.T, alloc *structs.Allocation, restarts bool) (*MockAllocStateUpdater, *AllocRunner) {
    68  	conf := config.DefaultConfig()
    69  	conf.Node = mock.Node()
    70  	conf.StateDir = os.TempDir()
    71  	conf.AllocDir = os.TempDir()
    72  	tmp, err := ioutil.TempFile("", "state-db")
        	require.NoError(t, err)
    73  	db, err := bolt.Open(tmp.Name(), 0600, nil)
        	require.NoError(t, err)
    74  	upd := &MockAllocStateUpdater{}
    75  	if !restarts {
    76  		*alloc.Job.LookupTaskGroup(alloc.TaskGroup).RestartPolicy = structs.RestartPolicy{Attempts: 0}
    77  		alloc.Job.Type = structs.JobTypeBatch
    78  	}
    79  	vclient := vaultclient.NewMockVaultClient()
    80  	ar := NewAllocRunner(testlog.Logger(t), conf, db, upd.Update, alloc, vclient, newMockConsulServiceClient(t), noopPrevAlloc{})
    81  	return upd, ar
    82  }
    83  
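        // testAllocRunner builds an AllocRunner around mock.Alloc() whose single task
        // uses the mock driver and completes after 500ms.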
    84  func testAllocRunner(t *testing.T, restarts bool) (*MockAllocStateUpdater, *AllocRunner) {
    85  	// Use mock driver
    86  	alloc := mock.Alloc()
    87  	task := alloc.Job.TaskGroups[0].Tasks[0]
    88  	task.Driver = "mock_driver"
    89  	task.Config["run_for"] = "500ms"
    90  	return testAllocRunnerFromAlloc(t, alloc, restarts)
    91  }
    92  
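        // TestAllocRunner_SimpleRun verifies that a short-lived mock task drives the
        // allocation to the complete client status.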
    93  func TestAllocRunner_SimpleRun(t *testing.T) {
    94  	t.Parallel()
    95  	upd, ar := testAllocRunner(t, false)
    96  	go ar.Run()
    97  	defer ar.Destroy()
    98  
    99  	testutil.WaitForResult(func() (bool, error) {
   100  		last := upd.Last()
   101  		if last == nil {
   102  			return false, fmt.Errorf("No updates")
   103  		}
   104  		if last.ClientStatus != structs.AllocClientStatusComplete {
   105  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
   106  		}
   107  		return true, nil
   108  	}, func(err error) {
   109  		t.Fatalf("err: %v", err)
   110  	})
   111  }
   112  
   113  // Test that FinishedAt is set when the alloc is in a terminal state
   114  func TestAllocRunner_FinishedAtSet(t *testing.T) {
   115  	t.Parallel()
   116  	require := require.New(t)
   117  	_, ar := testAllocRunner(t, false)
   118  	ar.allocClientStatus = structs.AllocClientStatusFailed
   119  	alloc := ar.Alloc()
   120  	taskFinishedAt := make(map[string]time.Time)
   121  	require.NotEmpty(alloc.TaskStates)
   122  	for name, s := range alloc.TaskStates {
   123  		require.False(s.FinishedAt.IsZero())
   124  		taskFinishedAt[name] = s.FinishedAt
   125  	}
   126  
   127  	// Verify that calling again should not mutate finishedAt
   128  	alloc2 := ar.Alloc()
   129  	for name, s := range alloc2.TaskStates {
   130  		require.Equal(taskFinishedAt[name], s.FinishedAt)
   131  	}
   132  
   133  }
   134  
   135  // Test that FinishedAt is set when a task enters a terminal state
   136  func TestAllocRunner_FinishedAtSet_TaskEvents(t *testing.T) {
   137  	t.Parallel()
   138  	require := require.New(t)
   139  	_, ar := testAllocRunner(t, false)
   140  	ar.taskStates[ar.alloc.Job.TaskGroups[0].Tasks[0].Name] = &structs.TaskState{State: structs.TaskStateDead, Failed: true}
   141  
   142  	alloc := ar.Alloc()
   143  	taskFinishedAt := make(map[string]time.Time)
   144  	require.NotEmpty(alloc.TaskStates)
   145  	for name, s := range alloc.TaskStates {
   146  		require.False(s.FinishedAt.IsZero())
   147  		taskFinishedAt[name] = s.FinishedAt
   148  	}
   149  
   150  	// Verify that calling again should not mutate finishedAt
   151  	alloc2 := ar.Alloc()
   152  	for name, s := range alloc2.TaskStates {
   153  		require.Equal(taskFinishedAt[name], s.FinishedAt)
   154  	}
   155  
   156  }
   157  
   158  // Test that the watcher will mark the allocation as unhealthy.
   159  func TestAllocRunner_DeploymentHealth_Unhealthy_BadStart(t *testing.T) {
   160  	t.Parallel()
   161  	assert := assert.New(t)
   162  
   163  	// Ensure the task fails and restarts
   164  	upd, ar := testAllocRunner(t, true)
   165  
   166  	// Make the task fail
   167  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   168  	task.Driver = "mock_driver"
   169  	task.Config["start_error"] = "test error"
   170  
   171  	// Make the alloc be part of a deployment
   172  	ar.alloc.DeploymentID = uuid.Generate()
   173  	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
   174  	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
   175  	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
   176  
   177  	go ar.Run()
   178  	defer ar.Destroy()
   179  
   180  	testutil.WaitForResult(func() (bool, error) {
   181  		last := upd.Last()
   182  		if last == nil {
   183  			return false, fmt.Errorf("No updates")
   184  		}
   185  		if !last.DeploymentStatus.HasHealth() {
   186  			return false, fmt.Errorf("want deployment status unhealthy; got unset")
   187  		} else if *last.DeploymentStatus.Healthy {
   188  			return false, fmt.Errorf("want deployment status unhealthy; got healthy")
   189  		}
   190  		return true, nil
   191  	}, func(err error) {
   192  		t.Fatalf("err: %v", err)
   193  	})
   194  
   195  	// Assert that we have an event explaining why we are unhealthy.
   196  	assert.Len(ar.taskStates, 1)
   197  	state := ar.taskStates[task.Name]
   198  	assert.NotNil(state)
   199  	assert.NotEmpty(state.Events)
   200  	last := state.Events[len(state.Events)-1]
   201  	assert.Equal(allocHealthEventSource, last.Type)
   202  	assert.Contains(last.Message, "failed task")
   203  }
   204  
   205  // Test that the watcher will mark the allocation as unhealthy if it hits its
   206  // deadline.
   207  func TestAllocRunner_DeploymentHealth_Unhealthy_Deadline(t *testing.T) {
   208  	t.Parallel()
   209  	assert := assert.New(t)
   210  
   211  	// Allow the task to restart
   212  	upd, ar := testAllocRunner(t, true)
   213  
   214  	// Make the task block
   215  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   216  	task.Driver = "mock_driver"
   217  	task.Config["start_block_for"] = "4s"
   218  	task.Config["run_for"] = "10s"
   219  
   220  	// Make the alloc be part of a deployment
   221  	ar.alloc.DeploymentID = uuid.Generate()
   222  	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
   223  	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
   224  	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
   225  	ar.alloc.Job.TaskGroups[0].Update.HealthyDeadline = 100 * time.Millisecond
   226  
   227  	go ar.Run()
   228  	defer ar.Destroy()
   229  
   230  	testutil.WaitForResult(func() (bool, error) {
   231  		last := upd.Last()
   232  		if last == nil {
   233  			return false, fmt.Errorf("No updates")
   234  		}
   235  		if !last.DeploymentStatus.HasHealth() {
   236  			return false, fmt.Errorf("want deployment status unhealthy; got unset")
   237  		} else if *last.DeploymentStatus.Healthy {
   238  			return false, fmt.Errorf("want deployment status unhealthy; got healthy")
   239  		}
   240  		return true, nil
   241  	}, func(err error) {
   242  		t.Fatalf("err: %v", err)
   243  	})
   244  
   245  	// Assert that we have an event explaining why we are unhealthy.
   246  	assert.Len(ar.taskStates, 1)
   247  	state := ar.taskStates[task.Name]
   248  	assert.NotNil(state)
   249  	assert.NotEmpty(state.Events)
   250  	last := state.Events[len(state.Events)-1]
   251  	assert.Equal(allocHealthEventSource, last.Type)
   252  	assert.Contains(last.Message, "not running by deadline")
   253  }
   254  
   255  // Test that the watcher will mark the allocation as healthy.
   256  func TestAllocRunner_DeploymentHealth_Healthy_NoChecks(t *testing.T) {
   257  	t.Parallel()
   258  
   259  	// Allow the task to restart
   260  	upd, ar := testAllocRunner(t, true)
   261  
   262  	// Make the task run healthy
   263  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   264  	task.Driver = "mock_driver"
   265  	task.Config["run_for"] = "10s"
   266  
   267  	// Create a task that takes longer to become healthy
   268  	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task.Copy())
   269  	task2 := ar.alloc.Job.TaskGroups[0].Tasks[1]
   270  	task2.Name = "task 2"
   271  	task2.Config["start_block_for"] = "500ms"
   272  
   273  	// Make the alloc be part of a deployment
   274  	ar.alloc.DeploymentID = uuid.Generate()
   275  	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
   276  	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
   277  	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
   278  	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
   279  
   280  	start := time.Now()
   281  	go ar.Run()
   282  	defer ar.Destroy()
   283  
   284  	testutil.WaitForResult(func() (bool, error) {
   285  		last := upd.Last()
   286  		if last == nil {
   287  			return false, fmt.Errorf("No updates")
   288  		}
   289  		if !last.DeploymentStatus.HasHealth() {
   290  			return false, fmt.Errorf("want deployment status unhealthy; got unset")
   291  		} else if !*last.DeploymentStatus.Healthy {
   292  			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
   293  		}
   294  		return true, nil
   295  	}, func(err error) {
   296  		t.Fatalf("err: %v", err)
   297  	})
   298  	if d := time.Since(start); d < 500*time.Millisecond {
   299  		t.Fatalf("didn't wait for second task group. Only took %v", d)
   300  	}
   301  }
   302  
   303  // Test that the watcher will mark the allocation as healthy with checks
   304  func TestAllocRunner_DeploymentHealth_Healthy_Checks(t *testing.T) {
   305  	t.Parallel()
   306  
   307  	// Allow the task to restart
   308  	upd, ar := testAllocRunner(t, true)
   309  
   310  	// Make the task run healthy
   311  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   312  	task.Driver = "mock_driver"
   313  	task.Config["run_for"] = "10s"
   314  
   315  	// Create a task that has no checks
   316  	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task.Copy())
   317  	task2 := ar.alloc.Job.TaskGroups[0].Tasks[1]
   318  	task2.Name = "task 2"
   319  	task2.Services = nil
   320  
   321  	// Make the alloc be part of a deployment
   322  	ar.alloc.DeploymentID = uuid.Generate()
   323  	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
   324  	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks
   325  	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
   326  	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
   327  
   328  	checkHealthy := &api.AgentCheck{
   329  		CheckID: uuid.Generate(),
   330  		Status:  api.HealthPassing,
   331  	}
   332  	checkUnhealthy := &api.AgentCheck{
   333  		CheckID: checkHealthy.CheckID,
   334  		Status:  api.HealthWarning,
   335  	}
   336  
   337  	// Only return the check as healthy after a duration
   338  	trigger := time.After(500 * time.Millisecond)
   339  	ar.consulClient.(*mockConsulServiceClient).allocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) {
   340  		select {
   341  		case <-trigger:
   342  			return &consul.AllocRegistration{
   343  				Tasks: map[string]*consul.TaskRegistration{
   344  					task.Name: {
   345  						Services: map[string]*consul.ServiceRegistration{
   346  							"123": {
   347  								Service: &api.AgentService{Service: "foo"},
   348  								Checks:  []*api.AgentCheck{checkHealthy},
   349  							},
   350  						},
   351  					},
   352  				},
   353  			}, nil
   354  		default:
   355  			return &consul.AllocRegistration{
   356  				Tasks: map[string]*consul.TaskRegistration{
   357  					task.Name: {
   358  						Services: map[string]*consul.ServiceRegistration{
   359  							"123": {
   360  								Service: &api.AgentService{Service: "foo"},
   361  								Checks:  []*api.AgentCheck{checkUnhealthy},
   362  							},
   363  						},
   364  					},
   365  				},
   366  			}, nil
   367  		}
   368  	}
   369  
   370  	start := time.Now()
   371  	go ar.Run()
   372  	defer ar.Destroy()
   373  
   374  	testutil.WaitForResult(func() (bool, error) {
   375  		last := upd.Last()
   376  		if last == nil {
   377  			return false, fmt.Errorf("No updates")
   378  		}
   379  		if !last.DeploymentStatus.HasHealth() {
   380  			return false, fmt.Errorf("want deployment status unhealthy; got unset")
   381  		} else if !*last.DeploymentStatus.Healthy {
   382  			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
   383  		}
   384  		return true, nil
   385  	}, func(err error) {
   386  		t.Fatalf("err: %v", err)
   387  	})
   388  
   389  	if d := time.Since(start); d < 500*time.Millisecond {
   390  		t.Fatalf("didn't wait for second task group. Only took %v", d)
   391  	}
   392  }
   393  
   394  // Test that the watcher will mark the allocation as unhealthy with failing
   395  // checks
   396  func TestAllocRunner_DeploymentHealth_Unhealthy_Checks(t *testing.T) {
   397  	t.Parallel()
   398  	assert := assert.New(t)
   399  
   400  	// Allow the task to restart
   401  	upd, ar := testAllocRunner(t, true)
   402  
   403  	// Make the task run
   404  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   405  	task.Driver = "mock_driver"
   406  	task.Config["run_for"] = "10s"
   407  
   408  	// Make the alloc be part of a deployment
   409  	ar.alloc.DeploymentID = uuid.Generate()
   410  	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
   411  	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks
   412  	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
   413  	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
   414  	ar.alloc.Job.TaskGroups[0].Update.HealthyDeadline = 1 * time.Second
   415  
   416  	checkUnhealthy := &api.AgentCheck{
   417  		CheckID: uuid.Generate(),
   418  		Status:  api.HealthWarning,
   419  	}
   420  
   421  	// Always return the check as unhealthy
   422  	ar.consulClient.(*mockConsulServiceClient).allocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) {
   423  		return &consul.AllocRegistration{
   424  			Tasks: map[string]*consul.TaskRegistration{
   425  				task.Name: {
   426  					Services: map[string]*consul.ServiceRegistration{
   427  						"123": {
   428  							Service: &api.AgentService{Service: "foo"},
   429  							Checks:  []*api.AgentCheck{checkUnhealthy},
   430  						},
   431  					},
   432  				},
   433  			},
   434  		}, nil
   435  	}
   436  
   437  	go ar.Run()
   438  	defer ar.Destroy()
   439  
   440  	testutil.WaitForResult(func() (bool, error) {
   441  		last := upd.Last()
   442  		if last == nil {
   443  			return false, fmt.Errorf("No updates")
   444  		}
   445  		if !last.DeploymentStatus.HasHealth() {
   446  			return false, fmt.Errorf("want deployment status unhealthy; got unset")
   447  		} else if *last.DeploymentStatus.Healthy {
   448  			return false, fmt.Errorf("want deployment status unhealthy; got healthy")
   449  		}
   450  		return true, nil
   451  	}, func(err error) {
   452  		t.Fatalf("err: %v", err)
   453  	})
   454  
   455  	// Assert that we have an event explaining why we are unhealthy.
   456  	assert.Len(ar.taskStates, 1)
   457  	state := ar.taskStates[task.Name]
   458  	assert.NotNil(state)
   459  	assert.NotEmpty(state.Events)
   460  	last := state.Events[len(state.Events)-1]
   461  	assert.Equal(allocHealthEventSource, last.Type)
   462  	assert.Contains(last.Message, "Services not healthy by deadline")
   463  }
   464  
   465  // Test that the watcher will mark the allocation as healthy.
   466  func TestAllocRunner_DeploymentHealth_Healthy_UpdatedDeployment(t *testing.T) {
   467  	t.Parallel()
   468  
   469  	// Allow the task to restart
   470  	upd, ar := testAllocRunner(t, true)
   471  
   472  	// Make the task run healthy
   473  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   474  	task.Driver = "mock_driver"
   475  	task.Config["run_for"] = "30s"
   476  
   477  	// Make the alloc be part of a deployment
   478  	ar.alloc.DeploymentID = uuid.Generate()
   479  	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
   480  	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
   481  	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
   482  	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
   483  
   484  	go ar.Run()
   485  	defer ar.Destroy()
   486  
   487  	testutil.WaitForResult(func() (bool, error) {
   488  		last := upd.Last()
   489  		if last == nil {
   490  			return false, fmt.Errorf("No updates")
   491  		}
   492  		if !last.DeploymentStatus.HasHealth() {
   493  			return false, fmt.Errorf("want deployment status unhealthy; got unset")
   494  		} else if !*last.DeploymentStatus.Healthy {
   495  			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
   496  		}
   497  		return true, nil
   498  	}, func(err error) {
   499  		t.Fatalf("err: %v", err)
   500  	})
   501  
   502  	// Mimic an update to a new deployment ID
   503  	last := upd.Last()
   504  	last.DeploymentStatus = nil
   505  	last.DeploymentID = uuid.Generate()
   506  	ar.Update(last)
   507  
   508  	testutil.WaitForResult(func() (bool, error) {
   509  		last := upd.Last()
   510  		if !last.DeploymentStatus.HasHealth() {
   511  			return false, fmt.Errorf("want deployment status unhealthy; got unset")
   512  		} else if !*last.DeploymentStatus.Healthy {
   513  			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
   514  		}
   515  		return true, nil
   516  	}, func(err error) {
   517  		t.Fatalf("err: %v", err)
   518  	})
   519  }
   520  
   521  // Test that health is reported for allocations that were migrated, not just
   522  // those that are part of a deployment.
   523  func TestAllocRunner_DeploymentHealth_Healthy_Migration(t *testing.T) {
   524  	t.Parallel()
   525  
   526  	// Allow the task to restart
   527  	upd, ar := testAllocRunner(t, true)
   528  
   529  	// Make the task run healthy
   530  	tg := ar.alloc.Job.TaskGroups[0]
   531  	task := tg.Tasks[0]
   532  	task.Driver = "mock_driver"
   533  	task.Config["run_for"] = "30s"
   534  
   535  	// Shorten the default migration healthy time
   536  	tg.Migrate = structs.DefaultMigrateStrategy()
   537  	tg.Migrate.MinHealthyTime = 100 * time.Millisecond
   538  	tg.Migrate.HealthCheck = structs.MigrateStrategyHealthStates
   539  
   540  	// Ensure the alloc is *not* part of a deployment
   541  	ar.alloc.DeploymentID = ""
   542  
   543  	go ar.Run()
   544  	defer ar.Destroy()
   545  
   546  	testutil.WaitForResult(func() (bool, error) {
   547  		last := upd.Last()
   548  		if last == nil {
   549  			return false, fmt.Errorf("No updates")
   550  		}
   551  		if !last.DeploymentStatus.HasHealth() {
   552  			return false, fmt.Errorf("want deployment status unhealthy; got unset")
   553  		} else if !*last.DeploymentStatus.Healthy {
   554  			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
   555  		}
   556  		return true, nil
   557  	}, func(err error) {
   558  		t.Fatalf("err: %v", err)
   559  	})
   560  }
   561  
   562  // Test that health is *not* reported for batch jobs
   563  func TestAllocRunner_DeploymentHealth_BatchDisabled(t *testing.T) {
   564  	t.Parallel()
   565  
   566  	// Create a batch alloc
   567  	alloc := mock.BatchAlloc()
   568  	tg := alloc.Job.TaskGroups[0]
   569  
   570  	// This should not be possible as validation should prevent batch jobs
   571  	// from having a migration stanza!
   572  	tg.Migrate = structs.DefaultMigrateStrategy()
   573  	tg.Migrate.MinHealthyTime = 1 * time.Millisecond
   574  	tg.Migrate.HealthyDeadline = 2 * time.Millisecond
   575  	tg.Migrate.HealthCheck = structs.MigrateStrategyHealthStates
   576  
   577  	task := tg.Tasks[0]
   578  	task.Driver = "mock_driver"
   579  	task.Config["run_for"] = "5s"
   580  	upd, ar := testAllocRunnerFromAlloc(t, alloc, false)
   581  
   582  	go ar.Run()
   583  	defer ar.Destroy()
   584  
   585  	testutil.WaitForResult(func() (bool, error) {
   586  		last := upd.Last()
   587  		if last == nil {
   588  			return false, fmt.Errorf("No updates")
   589  		}
   590  		if last.DeploymentStatus != nil {
   591  			return false, fmt.Errorf("unexpected deployment health set: %v", last.DeploymentStatus.Healthy)
   592  		}
   593  		return true, nil
   594  	}, func(err error) {
   595  		t.Fatalf("err: %v", err)
   596  	})
   597  }
   598  
   599  // TestAllocRunner_RetryArtifact ensures that if one task in a task group is
   600  // retrying an artifact fetch, the other tasks in the group are still able
   601  // to proceed.
   602  func TestAllocRunner_RetryArtifact(t *testing.T) {
   603  	t.Parallel()
   604  
   605  	alloc := mock.Alloc()
   606  	alloc.Job.Type = structs.JobTypeBatch
   607  	alloc.Job.TaskGroups[0].RestartPolicy.Mode = structs.RestartPolicyModeFail
   608  	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 1
   609  	alloc.Job.TaskGroups[0].RestartPolicy.Delay = time.Duration(4*testutil.TestMultiplier()) * time.Second
   610  
   611  	task := alloc.Job.TaskGroups[0].Tasks[0]
   612  	task.Driver = "mock_driver"
   613  	task.Config = map[string]interface{}{
   614  		"exit_code": "0",
   615  		"run_for":   "1s",
   616  	}
   617  
   618  	// Create a new task with a bad artifact
   619  	badtask := alloc.Job.TaskGroups[0].Tasks[0].Copy()
   620  	badtask.Name = "bad"
   621  	badtask.Artifacts = []*structs.TaskArtifact{
   622  		{GetterSource: "http://127.0.0.1:0/foo/bar/baz"},
   623  	}
   624  
   625  	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, badtask)
   626  	upd, ar := testAllocRunnerFromAlloc(t, alloc, true)
   627  	go ar.Run()
   628  	defer ar.Destroy()
   629  
   630  	testutil.WaitForResult(func() (bool, error) {
   631  		last := upd.Last()
   632  		if last == nil {
   633  			return false, fmt.Errorf("No updates")
   634  		}
   635  
   636  		// web task should have completed successfully while bad task
   637  		// retries artifact fetching
   638  		webstate, ok := last.TaskStates["web"]
   639  		if !ok {
   640  			return false, fmt.Errorf("no task state for web")
   641  		}
   642  		if webstate.State != structs.TaskStateDead {
   643  			return false, fmt.Errorf("expected web to be dead but found %q", last.TaskStates["web"].State)
   644  		}
   645  		if !webstate.Successful() {
   646  			return false, fmt.Errorf("expected web to have exited successfully")
   647  		}
   648  
   649  		// bad task should have failed
   650  		badstate := last.TaskStates["bad"]
   651  		if badstate.State != structs.TaskStateDead {
   652  			return false, fmt.Errorf("expected bad to be dead but found %q", badstate.State)
   653  		}
   654  		if !badstate.Failed {
   655  			return false, fmt.Errorf("expected bad to have failed: %#v", badstate.Events)
   656  		}
   657  		return true, nil
   658  	}, func(err error) {
   659  		t.Fatalf("err: %v", err)
   660  	})
   661  }
   662  
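        // TestAllocRunner_TerminalUpdate_Destroy asserts that a terminal desired-status
        // update stops the tasks but leaves the alloc state and directory in place
        // until Destroy is called, after which both are cleaned up.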
   663  func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) {
   664  	t.Parallel()
   665  	upd, ar := testAllocRunner(t, false)
   666  
   667  	// Ensure task takes some time
   668  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   669  	task.Driver = "mock_driver"
   670  	task.Config["run_for"] = "10s"
   671  	go ar.Run()
   672  
   673  	testutil.WaitForResult(func() (bool, error) {
   674  		last := upd.Last()
   675  		if last == nil {
   676  			return false, fmt.Errorf("No updates")
   677  		}
   678  		if last.ClientStatus != structs.AllocClientStatusRunning {
   679  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
   680  		}
   681  		return true, nil
   682  	}, func(err error) {
   683  		t.Fatalf("err: %v", err)
   684  	})
   685  
   686  	// Update the alloc to be terminal which should cause the alloc runner to
   687  	// stop the tasks and wait for a destroy.
   688  	update := ar.alloc.Copy()
   689  	update.DesiredStatus = structs.AllocDesiredStatusStop
   690  	ar.Update(update)
   691  
   692  	testutil.WaitForResult(func() (bool, error) {
   693  		last := upd.Last()
   694  		if last == nil {
   695  			return false, fmt.Errorf("No updates")
   696  		}
   697  
   698  		// Check the status has changed.
   699  		if last.ClientStatus != structs.AllocClientStatusComplete {
   700  			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
   701  		}
   702  
   703  		// Check the allocation state still exists
   704  		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
   705  			if !allocationBucketExists(tx, ar.Alloc().ID) {
   706  				return fmt.Errorf("no bucket for alloc")
   707  			}
   708  
   709  			return nil
   710  		}); err != nil {
   711  			return false, fmt.Errorf("state destroyed")
   712  		}
   713  
   714  		// Check the alloc directory still exists
   715  		if _, err := os.Stat(ar.allocDir.AllocDir); err != nil {
   716  			return false, fmt.Errorf("alloc dir destroyed: %v", ar.allocDir.AllocDir)
   717  		}
   718  
   719  		return true, nil
   720  	}, func(err error) {
   721  		t.Fatalf("err: %v", err)
   722  	})
   723  
   724  	// Send the destroy signal and ensure the AllocRunner cleans up.
   725  	ar.Destroy()
   726  
   727  	testutil.WaitForResult(func() (bool, error) {
   728  		last := upd.Last()
   729  		if last == nil {
   730  			return false, fmt.Errorf("No updates")
   731  		}
   732  
   733  		// Check the status has changed.
   734  		if last.ClientStatus != structs.AllocClientStatusComplete {
   735  			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
   736  		}
   737  
   738  		// Check the state was cleaned
   739  		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
   740  			if allocationBucketExists(tx, ar.Alloc().ID) {
   741  				return fmt.Errorf("bucket for alloc exists")
   742  			}
   743  
   744  			return nil
   745  		}); err != nil {
   746  			return false, fmt.Errorf("state not destroyed")
   747  		}
   748  
   749  		// Check the alloc directory was cleaned
   750  		if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
   751  			return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir)
   752  		} else if !os.IsNotExist(err) {
   753  			return false, fmt.Errorf("stat err: %v", err)
   754  		}
   755  
   756  		return true, nil
   757  	}, func(err error) {
   758  		t.Fatalf("err: %v", err)
   759  	})
   760  }
   761  
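        // TestAllocRunner_Destroy asserts that destroying a running alloc runner stops
        // its tasks and removes both the persisted state and the alloc directory.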
   762  func TestAllocRunner_Destroy(t *testing.T) {
   763  	t.Parallel()
   764  	upd, ar := testAllocRunner(t, false)
   765  
   766  	// Ensure task takes some time
   767  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   768  	task.Driver = "mock_driver"
   769  	task.Config["run_for"] = "10s"
   770  	go ar.Run()
   771  	start := time.Now()
   772  
   773  	// Begin the tear down
   774  	go func() {
   775  		time.Sleep(1 * time.Second)
   776  		ar.Destroy()
   777  	}()
   778  
   779  	testutil.WaitForResult(func() (bool, error) {
   780  		last := upd.Last()
   781  		if last == nil {
   782  			return false, fmt.Errorf("No updates")
   783  		}
   784  
   785  		// Check the status has changed.
   786  		if last.ClientStatus != structs.AllocClientStatusComplete {
   787  			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
   788  		}
   789  
   790  		// Check the state was cleaned
   791  		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
   792  			if allocationBucketExists(tx, ar.Alloc().ID) {
   793  				return fmt.Errorf("bucket for alloc exists")
   794  			}
   795  
   796  			return nil
   797  		}); err != nil {
   798  			return false, fmt.Errorf("state not destroyed: %v", err)
   799  		}
   800  
   801  		// Check the alloc directory was cleaned
   802  		if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
   803  			return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir)
   804  		} else if !os.IsNotExist(err) {
   805  			return false, fmt.Errorf("stat err: %v", err)
   806  		}
   807  
   808  		return true, nil
   809  	}, func(err error) {
   810  		t.Fatalf("err: %v", err)
   811  	})
   812  
   813  	if elapsed := time.Since(start); elapsed > 20*time.Second {
   814  		t.Fatalf("took too long to terminate: %s", elapsed)
   815  	}
   816  }
   817  
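        // TestAllocRunner_Update asserts that Update replaces the allocation stored by
        // the alloc runner.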
   818  func TestAllocRunner_Update(t *testing.T) {
   819  	t.Parallel()
   820  	_, ar := testAllocRunner(t, false)
   821  
   822  	// Deep copy the alloc to avoid races when updating
   823  	newAlloc := ar.Alloc().Copy()
   824  
   825  	// Ensure task takes some time
   826  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   827  	task.Driver = "mock_driver"
   828  	task.Config["run_for"] = "10s"
   829  	go ar.Run()
   830  	defer ar.Destroy()
   831  
   832  	// Update the alloc definition
   833  	newAlloc.Name = "FOO"
   834  	newAlloc.AllocModifyIndex++
   835  	ar.Update(newAlloc)
   836  
   837  	// Check the alloc runner stores the update allocation.
   838  	testutil.WaitForResult(func() (bool, error) {
   839  		return ar.Alloc().Name == "FOO", nil
   840  	}, func(err error) {
   841  		t.Fatalf("err: %v %#v", err, ar.Alloc())
   842  	})
   843  }
   844  
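        // TestAllocRunner_SaveRestoreState asserts that a second alloc runner created
        // against the same state DB resumes the running task after RestoreState.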
   845  func TestAllocRunner_SaveRestoreState(t *testing.T) {
   846  	t.Parallel()
   847  	alloc := mock.Alloc()
   848  	task := alloc.Job.TaskGroups[0].Tasks[0]
   849  	task.Driver = "mock_driver"
   850  	task.Config = map[string]interface{}{
   851  		"exit_code": "0",
   852  		"run_for":   "10s",
   853  	}
   854  
   855  	upd, ar := testAllocRunnerFromAlloc(t, alloc, false)
   856  	go ar.Run()
   857  	defer ar.Destroy()
   858  
   859  	// Snapshot state
   860  	testutil.WaitForResult(func() (bool, error) {
   861  		ar.taskLock.RLock()
   862  		defer ar.taskLock.RUnlock()
   863  		return len(ar.tasks) == 1, nil
   864  	}, func(err error) {
   865  		t.Fatalf("task never started: %v", err)
   866  	})
   867  
   868  	err := ar.SaveState()
   869  	if err != nil {
   870  		t.Fatalf("err: %v", err)
   871  	}
   872  
   873  	// Create a new alloc runner
   874  	l2 := prefixedTestLogger("----- ar2:  ")
   875  	alloc2 := &structs.Allocation{ID: ar.alloc.ID}
   876  	prevAlloc := newAllocWatcher(alloc2, ar, nil, ar.config, l2, "")
   877  	ar2 := NewAllocRunner(l2, ar.config, ar.stateDB, upd.Update,
   878  		alloc2, ar.vaultClient, ar.consulClient, prevAlloc)
   879  	err = ar2.RestoreState()
   880  	if err != nil {
   881  		t.Fatalf("err: %v", err)
   882  	}
   883  	go ar2.Run()
   884  
   885  	testutil.WaitForResult(func() (bool, error) {
   886  		if len(ar2.tasks) != 1 {
   887  			return false, fmt.Errorf("Incorrect number of tasks")
   888  		}
   889  
   890  		last := upd.Last()
   891  		if last == nil {
   892  			return false, nil
   893  		}
   894  
   895  		return last.ClientStatus == structs.AllocClientStatusRunning, nil
   896  	}, func(err error) {
   897  		last := upd.Last()
   898  		t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates["web"])
   899  	})
   900  
   901  	// Destroy and wait
   902  	ar2.Destroy()
   903  	start := time.Now()
   904  
   905  	testutil.WaitForResult(func() (bool, error) {
   906  		alloc := ar2.Alloc()
   907  		if alloc.ClientStatus != structs.AllocClientStatusComplete {
   908  			return false, fmt.Errorf("Bad client status; got %v; want %v", alloc.ClientStatus, structs.AllocClientStatusComplete)
   909  		}
   910  		return true, nil
   911  	}, func(err error) {
   912  		last := upd.Last()
   913  		t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates)
   914  	})
   915  
   916  	if time.Since(start) > time.Duration(testutil.TestMultiplier()*5)*time.Second {
   917  		t.Fatalf("took too long to terminate")
   918  	}
   919  }
   920  
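        // TestAllocRunner_SaveRestoreState_TerminalAlloc asserts that restoring a
        // terminal alloc keeps its state and alloc directory until the restored runner
        // is destroyed.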
   921  func TestAllocRunner_SaveRestoreState_TerminalAlloc(t *testing.T) {
   922  	t.Parallel()
   923  	upd, ar := testAllocRunner(t, false)
   924  	ar.logger = prefixedTestLogger("ar1: ")
   925  
   926  	// Ensure task takes some time
   927  	ar.alloc.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
   928  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   929  	task.Config["run_for"] = "10s"
   930  	go ar.Run()
   931  	defer ar.Destroy()
   932  
   933  	testutil.WaitForResult(func() (bool, error) {
   934  		last := upd.Last()
   935  		if last == nil {
   936  			return false, fmt.Errorf("No updates")
   937  		}
   938  
   939  		if last.ClientStatus != structs.AllocClientStatusRunning {
   940  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
   941  		}
   942  		return true, nil
   943  	}, func(err error) {
   944  		t.Fatalf("err: %v", err)
   945  	})
   946  
   947  	// Update the alloc to be terminal which should cause the alloc runner to
   948  	// stop the tasks and wait for a destroy.
   949  	update := ar.alloc.Copy()
   950  	update.DesiredStatus = structs.AllocDesiredStatusStop
   951  	ar.Update(update)
   952  
   953  	testutil.WaitForResult(func() (bool, error) {
   954  		return ar.Alloc().DesiredStatus == structs.AllocDesiredStatusStop, nil
   955  	}, func(err error) {
   956  		t.Fatalf("err: %v", err)
   957  	})
   958  
   959  	err := ar.SaveState()
   960  	if err != nil {
   961  		t.Fatalf("err: %v", err)
   962  	}
   963  
   964  	// Ensure ar1 doesn't recreate the state file
   965  	ar.allocLock.Lock()
   966  	defer ar.allocLock.Unlock()
   967  
   968  	// Create a new alloc runner
   969  	l2 := prefixedTestLogger("ar2: ")
   970  	alloc2 := &structs.Allocation{ID: ar.alloc.ID}
   971  	prevAlloc := newAllocWatcher(alloc2, ar, nil, ar.config, l2, "")
   972  	ar2 := NewAllocRunner(l2, ar.config, ar.stateDB, upd.Update,
   973  		alloc2, ar.vaultClient, ar.consulClient, prevAlloc)
   974  	err = ar2.RestoreState()
   975  	if err != nil {
   976  		t.Fatalf("err: %v", err)
   977  	}
   978  	ar2.logger.Println("[TESTING] running second alloc runner")
   979  	go ar2.Run()
   980  	defer ar2.Destroy() // Just-in-case of failure before Destroy below
   981  
   982  	testutil.WaitForResult(func() (bool, error) {
   983  		// Check the state still exists
   984  		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
   985  			if !allocationBucketExists(tx, ar2.Alloc().ID) {
   986  				return fmt.Errorf("no bucket for alloc")
   987  			}
   988  
   989  			return nil
   990  		}); err != nil {
   991  			return false, fmt.Errorf("state destroyed")
   992  		}
   993  
   994  		// Check the alloc directory still exists
   995  		if _, err := os.Stat(ar.allocDir.AllocDir); err != nil {
   996  			return false, fmt.Errorf("alloc dir destroyed: %v", ar.allocDir.AllocDir)
   997  		}
   998  
   999  		return true, nil
  1000  	}, func(err error) {
  1001  		last := upd.Last()
  1002  		t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates)
  1003  	})
  1004  
  1005  	// Send the destroy signal and ensure the AllocRunner cleans up.
  1006  	ar2.logger.Println("[TESTING] destroying second alloc runner")
  1007  	ar2.Destroy()
  1008  
  1009  	testutil.WaitForResult(func() (bool, error) {
  1010  		last := upd.Last()
  1011  		if last == nil {
  1012  			return false, fmt.Errorf("No updates")
  1013  		}
  1014  
  1015  		// Check the status has changed.
  1016  		if last.ClientStatus != structs.AllocClientStatusComplete {
  1017  			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
  1018  		}
  1019  
  1020  		// Check the state was cleaned
  1021  		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
  1022  			if allocationBucketExists(tx, ar2.Alloc().ID) {
  1023  				return fmt.Errorf("bucket for alloc exists")
  1024  			}
  1025  
  1026  			return nil
  1027  		}); err != nil {
  1028  			return false, fmt.Errorf("state not destroyed")
  1029  		}
  1030  
  1031  		// Check the alloc directory was cleaned
  1032  		if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
  1033  			return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir)
  1034  		} else if !os.IsNotExist(err) {
  1035  			return false, fmt.Errorf("stat err: %v", err)
  1036  		}
  1037  
  1038  		return true, nil
  1039  	}, func(err error) {
  1040  		t.Fatalf("err: %v", err)
  1041  	})
  1042  }
  1043  
  1044  // TestAllocRunner_SaveRestoreState_Upgrade asserts that pre-0.6 exec tasks are
  1045  // restarted on upgrade.
  1046  func TestAllocRunner_SaveRestoreState_Upgrade(t *testing.T) {
  1047  	t.Parallel()
  1048  	alloc := mock.Alloc()
  1049  	task := alloc.Job.TaskGroups[0].Tasks[0]
  1050  	task.Driver = "mock_driver"
  1051  	task.Config = map[string]interface{}{
  1052  		"exit_code": "0",
  1053  		"run_for":   "10s",
  1054  	}
  1055  
  1056  	upd, ar := testAllocRunnerFromAlloc(t, alloc, false)
  1057  	// Hack in old version to cause an upgrade on RestoreState
  1058  	origConfig := ar.config.Copy()
  1059  	ar.config.Version = &version.VersionInfo{Version: "0.5.6"}
  1060  	go ar.Run()
  1061  	defer ar.Destroy()
  1062  
  1063  	// Snapshot state
  1064  	testutil.WaitForResult(func() (bool, error) {
  1065  		last := upd.Last()
  1066  		if last == nil {
  1067  			return false, fmt.Errorf("No updates")
  1068  		}
  1069  
  1070  		if last.ClientStatus != structs.AllocClientStatusRunning {
  1071  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
  1072  		}
  1073  		return true, nil
  1074  	}, func(err error) {
  1075  		t.Fatalf("task never started: %v", err)
  1076  	})
  1077  
  1078  	err := ar.SaveState()
  1079  	if err != nil {
  1080  		t.Fatalf("err: %v", err)
  1081  	}
  1082  
  1083  	// Create a new alloc runner
  1084  	l2 := prefixedTestLogger("ar2: ")
  1085  	alloc2 := &structs.Allocation{ID: ar.alloc.ID}
  1086  	prevAlloc := newAllocWatcher(alloc2, ar, nil, origConfig, l2, "")
  1087  	ar2 := NewAllocRunner(l2, origConfig, ar.stateDB, upd.Update, alloc2, ar.vaultClient, ar.consulClient, prevAlloc)
  1088  	err = ar2.RestoreState()
  1089  	if err != nil {
  1090  		t.Fatalf("err: %v", err)
  1091  	}
  1092  	go ar2.Run()
  1093  	defer ar2.Destroy() // Just-in-case of failure before Destroy below
  1094  
  1095  	testutil.WaitForResult(func() (bool, error) {
  1096  		last := upd.Last()
  1097  		if last == nil {
  1098  			return false, fmt.Errorf("No updates")
  1099  		}
  1100  		for _, ev := range last.TaskStates["web"].Events {
  1101  			if strings.HasSuffix(ev.RestartReason, pre06ScriptCheckReason) {
  1102  				return true, nil
  1103  			}
  1104  		}
  1105  		return false, fmt.Errorf("no restart with proper reason found")
  1106  	}, func(err error) {
  1107  		last := upd.Last()
  1108  		t.Fatalf("err: %v\nweb state: % #v", err, pretty.Formatter(last.TaskStates["web"]))
  1109  	})
  1110  
  1111  	// Destroy and wait
  1112  	ar2.Destroy()
  1113  	start := time.Now()
  1114  
  1115  	testutil.WaitForResult(func() (bool, error) {
  1116  		alloc := ar2.Alloc()
  1117  		if alloc.ClientStatus != structs.AllocClientStatusComplete {
  1118  			return false, fmt.Errorf("Bad client status; got %v; want %v", alloc.ClientStatus, structs.AllocClientStatusComplete)
  1119  		}
  1120  		return true, nil
  1121  	}, func(err error) {
  1122  		last := upd.Last()
  1123  		t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates)
  1124  	})
  1125  
  1126  	if time.Since(start) > time.Duration(testutil.TestMultiplier()*5)*time.Second {
  1127  		t.Fatalf("took too long to terminate")
  1128  	}
  1129  }
  1130  
  1131  // Ensure pre-#2132 state files containing the Context struct are properly
  1132  // migrated to the new format.
  1133  //
  1134  // Old Context State:
  1135  //
  1136  //  "Context": {
  1137  //    "AllocDir": {
  1138  //      "AllocDir": "/path/to/allocs/2a54fcff-fc44-8d4f-e025-53c48e9cbbbb",
  1139  //      "SharedDir": "/path/to/allocs/2a54fcff-fc44-8d4f-e025-53c48e9cbbbb/alloc",
  1140  //      "TaskDirs": {
  1141  //        "echo1": "/path/to/allocs/2a54fcff-fc44-8d4f-e025-53c48e9cbbbb/echo1"
  1142  //      }
  1143  //    },
  1144  //    "AllocID": "2a54fcff-fc44-8d4f-e025-53c48e9cbbbb"
  1145  //  }
  1146  func TestAllocRunner_RestoreOldState(t *testing.T) {
  1147  	t.Parallel()
  1148  	alloc := mock.Alloc()
  1149  	task := alloc.Job.TaskGroups[0].Tasks[0]
  1150  	task.Driver = "mock_driver"
  1151  	task.Config = map[string]interface{}{
  1152  		"exit_code": "0",
  1153  		"run_for":   "10s",
  1154  	}
  1155  
  1156  	logger := testLogger()
  1157  	conf := config.DefaultConfig()
  1158  	conf.Node = mock.Node()
  1159  	conf.StateDir = os.TempDir()
  1160  	conf.AllocDir = os.TempDir()
  1161  	tmp, err := ioutil.TempFile("", "state-db")
  1162  	if err != nil {
  1163  		t.Fatalf("error creating state db file: %v", err)
  1164  	}
  1165  	db, err := bolt.Open(tmp.Name(), 0600, nil)
  1166  	if err != nil {
  1167  		t.Fatalf("error creating state db: %v", err)
  1168  	}
  1169  
  1170  	if err := os.MkdirAll(filepath.Join(conf.StateDir, "alloc", alloc.ID), 0777); err != nil {
  1171  		t.Fatalf("error creating state dir: %v", err)
  1172  	}
  1173  	statePath := filepath.Join(conf.StateDir, "alloc", alloc.ID, "state.json")
  1174  	w, err := os.Create(statePath)
  1175  	if err != nil {
  1176  		t.Fatalf("error creating state file: %v", err)
  1177  	}
  1178  	tmplctx := &struct {
  1179  		AllocID  string
  1180  		AllocDir string
  1181  	}{alloc.ID, conf.AllocDir}
  1182  	err = template.Must(template.New("test_state").Parse(`{
  1183    "Version": "0.5.1",
  1184    "Alloc": {
  1185      "ID": "{{ .AllocID }}",
  1186      "Name": "example",
  1187      "JobID": "example",
  1188      "Job": {
  1189        "ID": "example",
  1190        "Name": "example",
  1191        "Type": "batch",
  1192        "TaskGroups": [
  1193          {
  1194            "Name": "example",
  1195            "Tasks": [
  1196              {
  1197                "Name": "example",
  1198                "Driver": "mock",
  1199                "Config": {
  1200                  "exit_code": "0",
  1201  		"run_for": "10s"
  1202                }
  1203              }
  1204            ]
  1205          }
  1206        ]
  1207      },
  1208      "TaskGroup": "example",
  1209      "DesiredStatus": "run",
  1210      "ClientStatus": "running",
  1211      "TaskStates": {
  1212        "example": {
  1213          "State": "running",
  1214          "Failed": false,
  1215          "Events": []
  1216        }
  1217      }
  1218    },
  1219    "Context": {
  1220      "AllocDir": {
  1221        "AllocDir": "{{ .AllocDir }}/{{ .AllocID }}",
  1222        "SharedDir": "{{ .AllocDir }}/{{ .AllocID }}/alloc",
  1223        "TaskDirs": {
  1224          "example": "{{ .AllocDir }}/{{ .AllocID }}/example"
  1225        }
  1226      },
  1227      "AllocID": "{{ .AllocID }}"
  1228    }
  1229  }`)).Execute(w, tmplctx)
  1230  	if err != nil {
  1231  		t.Fatalf("error writing state file: %v", err)
  1232  	}
  1233  	w.Close()
  1234  
  1235  	upd := &MockAllocStateUpdater{}
  1236  	*alloc.Job.LookupTaskGroup(alloc.TaskGroup).RestartPolicy = structs.RestartPolicy{Attempts: 0}
  1237  	alloc.Job.Type = structs.JobTypeBatch
  1238  	vclient := vaultclient.NewMockVaultClient()
  1239  	cclient := newMockConsulServiceClient(t)
  1240  	ar := NewAllocRunner(logger, conf, db, upd.Update, alloc, vclient, cclient, noopPrevAlloc{})
  1241  	defer ar.Destroy()
  1242  
  1243  	// RestoreState should fail on the task state since we only test the
  1244  	// alloc state restoring.
  1245  	err = ar.RestoreState()
  1246  	if err == nil {
  1247  		t.Fatal("expected error restoring Task state")
  1248  	}
  1249  	merr, ok := err.(*multierror.Error)
  1250  	if !ok {
  1251  		t.Fatalf("expected RestoreState to return a multierror but found: %T -> %v", err, err)
  1252  	}
  1253  	if len(merr.Errors) != 1 {
  1254  		t.Fatalf("expected exactly 1 error from RestoreState but found: %d: %v", len(merr.Errors), err)
  1255  	}
  1256  	if expected := "failed to get task bucket"; !strings.Contains(merr.Errors[0].Error(), expected) {
  1257  		t.Fatalf("expected %q but got: %q", expected, merr.Errors[0].Error())
  1258  	}
  1259  
  1260  	if err := ar.SaveState(); err != nil {
  1261  		t.Fatalf("error saving new state: %v", err)
  1262  	}
  1263  }
  1264  
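        // TestAllocRunner_TaskFailed_KillTG asserts that when one task in a group fails
        // to start, the allocation is marked failed and the sibling task is killed with
        // a TaskSiblingFailed event.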
  1265  func TestAllocRunner_TaskFailed_KillTG(t *testing.T) {
  1266  	t.Parallel()
  1267  	upd, ar := testAllocRunner(t, false)
  1268  
  1269  	// Create two tasks in the task group
  1270  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
  1271  	task.Driver = "mock_driver"
  1272  	task.KillTimeout = 10 * time.Millisecond
  1273  	task.Config = map[string]interface{}{
  1274  		"run_for": "10s",
  1275  	}
  1276  
  1277  	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
  1278  	task2.Name = "task 2"
  1279  	task2.Driver = "mock_driver"
  1280  	task2.Config = map[string]interface{}{
  1281  		"start_error": "fail task please",
  1282  	}
  1283  	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2)
  1284  	ar.alloc.TaskResources[task2.Name] = task2.Resources
  1285  	go ar.Run()
  1286  	defer ar.Destroy()
  1287  
  1288  	testutil.WaitForResult(func() (bool, error) {
  1289  		last := upd.Last()
  1290  		if last == nil {
  1291  			return false, fmt.Errorf("No updates")
  1292  		}
  1293  		if last.ClientStatus != structs.AllocClientStatusFailed {
  1294  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusFailed)
  1295  		}
  1296  
  1297  		// Task One should be killed
  1298  		state1 := last.TaskStates[task.Name]
  1299  		if state1.State != structs.TaskStateDead {
  1300  			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
  1301  		}
  1302  		if len(state1.Events) < 2 {
  1303  			// At least have a received and destroyed
  1304  			return false, fmt.Errorf("Unexpected number of events")
  1305  		}
  1306  
  1307  		found := false
  1308  		for _, e := range state1.Events {
  1309  			if e.Type == structs.TaskSiblingFailed {
  1310  				found = true
  1311  			}
  1312  		}
  1313  
  1314  		if !found {
  1315  			return false, fmt.Errorf("Did not find event %v", structs.TaskSiblingFailed)
  1316  		}
  1317  
  1318  		// Task Two should be failed
  1319  		state2 := last.TaskStates[task2.Name]
  1320  		if state2.State != structs.TaskStateDead {
  1321  			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
  1322  		}
  1323  		if !state2.Failed {
  1324  			return false, fmt.Errorf("task2 should have failed")
  1325  		}
  1326  
  1327  		return true, nil
  1328  	}, func(err error) {
  1329  		t.Fatalf("err: %v", err)
  1330  	})
  1331  }
  1332  
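        // TestAllocRunner_TaskLeader_KillTG asserts that when the leader task exits,
        // the remaining tasks in the group are killed and record a TaskLeaderDead event.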
  1333  func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
  1334  	t.Parallel()
  1335  	upd, ar := testAllocRunner(t, false)
  1336  
  1337  	// Create two tasks in the task group
  1338  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
  1339  	task.Driver = "mock_driver"
  1340  	task.KillTimeout = 10 * time.Millisecond
  1341  	task.Config = map[string]interface{}{
  1342  		"run_for": "10s",
  1343  	}
  1344  
  1345  	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
  1346  	task2.Name = "task 2"
  1347  	task2.Driver = "mock_driver"
  1348  	task2.Leader = true
  1349  	task2.Config = map[string]interface{}{
  1350  		"run_for": "1s",
  1351  	}
  1352  	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2)
  1353  	ar.alloc.TaskResources[task2.Name] = task2.Resources
  1354  	go ar.Run()
  1355  	defer ar.Destroy()
  1356  
  1357  	testutil.WaitForResult(func() (bool, error) {
  1358  		last := upd.Last()
  1359  		if last == nil {
  1360  			return false, fmt.Errorf("No updates")
  1361  		}
  1362  		if last.ClientStatus != structs.AllocClientStatusComplete {
  1363  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
  1364  		}
  1365  
  1366  		// Task One should be killed
  1367  		state1 := last.TaskStates[task.Name]
  1368  		if state1.State != structs.TaskStateDead {
  1369  			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
  1370  		}
  1371  		if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() {
  1372  			return false, fmt.Errorf("expected to have a start and finish time")
  1373  		}
  1374  		if len(state1.Events) < 2 {
  1375  			// At least have a received and destroyed
  1376  			return false, fmt.Errorf("Unexpected number of events")
  1377  		}
  1378  
  1379  		found := false
  1380  		for _, e := range state1.Events {
  1381  			if e.Type == structs.TaskLeaderDead {
  1382  				found = true
  1383  			}
  1384  		}
  1385  
  1386  		if !found {
  1387  			return false, fmt.Errorf("Did not find event %v", structs.TaskLeaderDead)
  1388  		}
  1389  
  1390  		// Task Two should be dead
  1391  		state2 := last.TaskStates[task2.Name]
  1392  		if state2.State != structs.TaskStateDead {
  1393  			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
  1394  		}
  1395  		if state2.FinishedAt.IsZero() || state2.StartedAt.IsZero() {
  1396  			return false, fmt.Errorf("expected to have a start and finish time")
  1397  		}
  1398  
  1399  		return true, nil
  1400  	}, func(err error) {
  1401  		t.Fatalf("err: %v", err)
  1402  	})
  1403  }
  1404  
  1405  // TestAllocRunner_TaskLeader_StopTG asserts that when stopping a task group
  1406  // with a leader, the leader is stopped before the other tasks.
  1407  func TestAllocRunner_TaskLeader_StopTG(t *testing.T) {
  1408  	t.Parallel()
  1409  	upd, ar := testAllocRunner(t, false)
  1410  
  1411  	// Create 3 tasks in the task group
  1412  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
  1413  	task.Name = "follower1"
  1414  	task.Driver = "mock_driver"
  1415  	task.KillTimeout = 10 * time.Millisecond
  1416  	task.Config = map[string]interface{}{
  1417  		"run_for": "10s",
  1418  	}
  1419  
  1420  	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
  1421  	task2.Name = "leader"
  1422  	task2.Driver = "mock_driver"
  1423  	task2.Leader = true
  1424  	task2.KillTimeout = 10 * time.Millisecond
  1425  	task2.Config = map[string]interface{}{
  1426  		"run_for": "10s",
  1427  	}
  1428  
  1429  	task3 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
  1430  	task3.Name = "follower2"
  1431  	task3.Driver = "mock_driver"
  1432  	task3.KillTimeout = 10 * time.Millisecond
  1433  	task3.Config = map[string]interface{}{
  1434  		"run_for": "10s",
  1435  	}
  1436  	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2, task3)
  1437  	ar.alloc.TaskResources[task2.Name] = task2.Resources
  1438  	defer ar.Destroy()
  1439  
  1440  	go ar.Run()
  1441  
  1442  	// Wait for tasks to start
  1443  	last := upd.Last()
  1444  	testutil.WaitForResult(func() (bool, error) {
  1445  		last = upd.Last()
  1446  		if last == nil {
  1447  			return false, fmt.Errorf("No updates")
  1448  		}
  1449  		if n := len(last.TaskStates); n != 3 {
  1450  			return false, fmt.Errorf("Not enough task states (want: 3; found %d)", n)
  1451  		}
  1452  		for name, state := range last.TaskStates {
  1453  			if state.State != structs.TaskStateRunning {
  1454  				return false, fmt.Errorf("Task %q is not running yet (it's %q)", name, state.State)
  1455  			}
  1456  		}
  1457  		return true, nil
  1458  	}, func(err error) {
  1459  		t.Fatalf("err: %v", err)
  1460  	})
  1461  
  1462  	// Reset updates
  1463  	upd.mu.Lock()
  1464  	upd.Allocs = upd.Allocs[:0]
  1465  	upd.mu.Unlock()
  1466  
  1467  	// Stop alloc
  1468  	update := ar.Alloc()
  1469  	update.DesiredStatus = structs.AllocDesiredStatusStop
  1470  	ar.Update(update)
  1471  
  1472  	// Wait for tasks to stop
  1473  	testutil.WaitForResult(func() (bool, error) {
  1474  		last := upd.Last()
  1475  		if last == nil {
  1476  			return false, fmt.Errorf("No updates")
  1477  		}
  1478  		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower1"].FinishedAt.UnixNano() {
  1479  			return false, fmt.Errorf("expected leader to finish before follower1: %s >= %s",
  1480  				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower1"].FinishedAt)
  1481  		}
  1482  		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower2"].FinishedAt.UnixNano() {
  1483  			return false, fmt.Errorf("expected leader to finish before follower2: %s >= %s",
  1484  				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower2"].FinishedAt)
  1485  		}
  1486  		return true, nil
  1487  	}, func(err error) {
  1488  		last := upd.Last()
  1489  		for name, state := range last.TaskStates {
  1490  			t.Logf("%s: %s", name, state.State)
  1491  		}
  1492  		t.Fatalf("err: %v", err)
  1493  	})
  1494  }
  1495  
  1496  // TestAllocRunner_TaskLeader_StopRestoredTG asserts that when stopping a
  1497  // restored task group whose leader failed before the restore, the leader is
  1498  // not stopped again since it no longer exists.
  1499  // See https://github.com/hashicorp/nomad/issues/3420#issuecomment-341666932
  1500  func TestAllocRunner_TaskLeader_StopRestoredTG(t *testing.T) {
  1501  	t.Parallel()
  1502  	_, ar := testAllocRunner(t, false)
  1503  	defer ar.Destroy()
  1504  
  1505  	// Create a leader and follower task in the task group
  1506  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
  1507  	task.Name = "follower1"
  1508  	task.Driver = "mock_driver"
  1509  	task.KillTimeout = 10 * time.Second
  1510  	task.Config = map[string]interface{}{
  1511  		"run_for": "10s",
  1512  	}
  1513  
  1514  	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
  1515  	task2.Name = "leader"
  1516  	task2.Driver = "mock_driver"
  1517  	task2.Leader = true
  1518  	task2.KillTimeout = 10 * time.Millisecond
  1519  	task2.Config = map[string]interface{}{
  1520  		"run_for": "0s",
  1521  	}
  1522  
  1523  	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2)
  1524  	ar.alloc.TaskResources[task2.Name] = task2.Resources
  1525  
  1526  	// Mimic Nomad exiting before the leader's death could trigger stopping the other tasks.
  1527  	ar.tasks = map[string]*TaskRunner{
  1528  		"leader": NewTaskRunner(ar.logger, ar.config, ar.stateDB, ar.setTaskState,
  1529  			ar.allocDir.NewTaskDir(task2.Name), ar.Alloc(), task2.Copy(),
  1530  			ar.vaultClient, ar.consulClient),
  1531  		"follower1": NewTaskRunner(ar.logger, ar.config, ar.stateDB, ar.setTaskState,
  1532  			ar.allocDir.NewTaskDir(task.Name), ar.Alloc(), task.Copy(),
  1533  			ar.vaultClient, ar.consulClient),
  1534  	}
  1535  	ar.taskStates = map[string]*structs.TaskState{
  1536  		"leader":    {State: structs.TaskStateDead},
  1537  		"follower1": {State: structs.TaskStateRunning},
  1538  	}
  1539  	if err := ar.SaveState(); err != nil {
  1540  		t.Fatalf("error saving state: %v", err)
  1541  	}
  1542  
  1543  	// Create a new AllocRunner to test RestoreState and Run
  1544  	upd2 := &MockAllocStateUpdater{}
  1545  	ar2 := NewAllocRunner(ar.logger, ar.config, ar.stateDB, upd2.Update, ar.alloc,
  1546  		ar.vaultClient, ar.consulClient, ar.prevAlloc)
  1547  	defer ar2.Destroy()
  1548  
  1549  	if err := ar2.RestoreState(); err != nil {
  1550  		t.Fatalf("error restoring state: %v", err)
  1551  	}
  1552  	go ar2.Run()
  1553  
  1554  	// Wait for tasks to be stopped because leader is dead
  1555  	testutil.WaitForResult(func() (bool, error) {
  1556  		last := upd2.Last()
  1557  		if last == nil {
  1558  			return false, fmt.Errorf("No updates")
  1559  		}
  1560  		if actual := last.TaskStates["leader"].State; actual != structs.TaskStateDead {
  1561  			return false, fmt.Errorf("Task leader is not dead yet (it's %q)", actual)
  1562  		}
  1563  		if actual := last.TaskStates["follower1"].State; actual != structs.TaskStateDead {
  1564  			return false, fmt.Errorf("Task follower1 is not dead yet (it's %q)", actual)
  1565  		}
  1566  		return true, nil
  1567  	}, func(err error) {
  1568  		last := upd2.Last()
  1569  		for name, state := range last.TaskStates {
  1570  			t.Logf("%s: %s", name, state.State)
  1571  		}
  1572  		t.Fatalf("err: %v", err)
  1573  	})
  1574  
  1575  	// Make sure it GCs properly
  1576  	ar2.Destroy()
  1577  
  1578  	select {
  1579  	case <-ar2.WaitCh():
  1580  		// exited as expected
  1581  	case <-time.After(10 * time.Second):
  1582  		t.Fatalf("timed out waiting for AR to GC")
  1583  	}
  1584  }
  1585  
  1586  // TestAllocRunner_MoveAllocDir asserts that a file written to an alloc's
  1587  // local/ dir will be moved to a replacement alloc's local/ dir when the
  1588  // ephemeral disk is sticky.
  1589  func TestAllocRunner_MoveAllocDir(t *testing.T) {
  1590  	t.Parallel()
  1591  	// Create an alloc runner
  1592  	alloc := mock.Alloc()
  1593  	task := alloc.Job.TaskGroups[0].Tasks[0]
  1594  	task.Driver = "mock_driver"
  1595  	task.Config = map[string]interface{}{
  1596  		"run_for": "1s",
  1597  	}
  1598  	upd, ar := testAllocRunnerFromAlloc(t, alloc, false)
  1599  	go ar.Run()
  1600  	defer ar.Destroy()
  1601  
  1602  	testutil.WaitForResult(func() (bool, error) {
  1603  		last := upd.Last()
  1604  		if last == nil {
  1605  			return false, fmt.Errorf("No updates")
  1606  		}
  1607  		if last.ClientStatus != structs.AllocClientStatusComplete {
  1608  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
  1609  		}
  1610  		return true, nil
  1611  	}, func(err error) {
  1612  		t.Fatalf("err: %v", err)
  1613  	})
  1614  
  1615  	// Write some data in data dir and task dir of the alloc
  1616  	dataFile := filepath.Join(ar.allocDir.SharedDir, "data", "data_file")
  1617  	ioutil.WriteFile(dataFile, []byte("hello world"), os.ModePerm)
  1618  	taskDir := ar.allocDir.TaskDirs[task.Name]
  1619  	taskLocalFile := filepath.Join(taskDir.LocalDir, "local_file")
  1620  	ioutil.WriteFile(taskLocalFile, []byte("good bye world"), os.ModePerm)
  1621  
  1622  	// Create another alloc runner
  1623  	alloc2 := mock.Alloc()
  1624  	alloc2.PreviousAllocation = ar.allocID
  1625  	alloc2.Job.TaskGroups[0].EphemeralDisk.Sticky = true
  1626  	task = alloc2.Job.TaskGroups[0].Tasks[0]
  1627  	task.Driver = "mock_driver"
  1628  	task.Config = map[string]interface{}{
  1629  		"run_for": "1s",
  1630  	}
  1631  	upd2, ar2 := testAllocRunnerFromAlloc(t, alloc2, false)
  1632  
  1633  	// Set prevAlloc like Client does
  1634  	ar2.prevAlloc = newAllocWatcher(alloc2, ar, nil, ar2.config, ar2.logger, "")
  1635  
  1636  	go ar2.Run()
  1637  	defer ar2.Destroy()
  1638  
  1639  	testutil.WaitForResult(func() (bool, error) {
  1640  		last := upd2.Last()
  1641  		if last == nil {
  1642  			return false, fmt.Errorf("No updates")
  1643  		}
  1644  		if last.ClientStatus != structs.AllocClientStatusComplete {
  1645  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
  1646  		}
  1647  		return true, nil
  1648  	}, func(err error) {
  1649  		t.Fatalf("err: %v", err)
  1650  	})
  1651  
  1652  	// Ensure that data from ar was moved to ar2
  1653  	taskDir = ar2.allocDir.TaskDirs[task.Name]
  1654  	taskLocalFile = filepath.Join(taskDir.LocalDir, "local_file")
  1655  	if fileInfo, _ := os.Stat(taskLocalFile); fileInfo == nil {
  1656  		t.Fatalf("file %v not found", taskLocalFile)
  1657  	}
  1658  
  1659  	dataFile = filepath.Join(ar2.allocDir.SharedDir, "data", "data_file")
  1660  	if fileInfo, _ := os.Stat(dataFile); fileInfo == nil {
  1661  		t.Fatalf("file %v not found", dataFile)
  1662  	}
  1663  }