github.com/hspak/nomad@v0.7.2-0.20180309000617-bc4ae22a39a5/client/alloc_runner_test.go

package client

import (
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"strings"
	"sync"
	"testing"
	"text/template"
	"time"

	"github.com/boltdb/bolt"
	"github.com/hashicorp/consul/api"
	"github.com/hashicorp/go-multierror"
	"github.com/hashicorp/nomad/command/agent/consul"
	"github.com/hashicorp/nomad/helper/testlog"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/hashicorp/nomad/version"
	"github.com/kr/pretty"
	"github.com/stretchr/testify/assert"

	"github.com/hashicorp/nomad/client/config"
	"github.com/hashicorp/nomad/client/vaultclient"
)

type MockAllocStateUpdater struct {
	Allocs []*structs.Allocation
	mu     sync.Mutex
}

// Update fulfills the TaskStateUpdater interface
func (m *MockAllocStateUpdater) Update(alloc *structs.Allocation) {
	m.mu.Lock()
	m.Allocs = append(m.Allocs, alloc)
	m.mu.Unlock()
}

// Last returns the total number of updates and the last alloc (or nil)
func (m *MockAllocStateUpdater) Last() (int, *structs.Allocation) {
	m.mu.Lock()
	defer m.mu.Unlock()
	n := len(m.Allocs)
	if n == 0 {
		return 0, nil
	}
	return n, m.Allocs[n-1].Copy()
}

// allocationBucketExists checks if the allocation bucket was created.
func allocationBucketExists(tx *bolt.Tx, allocID string) bool {
	allocations := tx.Bucket(allocationsBucket)
	if allocations == nil {
		return false
	}

	// Retrieve the specific allocations bucket
	alloc := allocations.Bucket([]byte(allocID))
	return alloc != nil
}

func testAllocRunnerFromAlloc(t *testing.T, alloc *structs.Allocation, restarts bool) (*MockAllocStateUpdater, *AllocRunner) {
	conf := config.DefaultConfig()
	conf.Node = mock.Node()
	conf.StateDir = os.TempDir()
	conf.AllocDir = os.TempDir()
	tmp, _ := ioutil.TempFile("", "state-db")
	db, _ := bolt.Open(tmp.Name(), 0600, nil)
	upd := &MockAllocStateUpdater{}
	if !restarts {
		*alloc.Job.LookupTaskGroup(alloc.TaskGroup).RestartPolicy = structs.RestartPolicy{Attempts: 0}
		alloc.Job.Type = structs.JobTypeBatch
	}
	vclient := vaultclient.NewMockVaultClient()
	ar := NewAllocRunner(testlog.Logger(t), conf, db, upd.Update, alloc, vclient, newMockConsulServiceClient(t), noopPrevAlloc{})
	return upd, ar
}

func testAllocRunner(t *testing.T, restarts bool) (*MockAllocStateUpdater, *AllocRunner) {
	// Use mock driver
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "500ms"
	return testAllocRunnerFromAlloc(t, alloc, restarts)
}

func TestAllocRunner_SimpleRun(t *testing.T) {
	t.Parallel()
	upd, ar := testAllocRunner(t, false)
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

// Test that the watcher will mark the allocation as unhealthy.
func TestAllocRunner_DeploymentHealth_Unhealthy_BadStart(t *testing.T) {
	t.Parallel()
	assert := assert.New(t)

	// Disable restarts so the failed task is not retried
	upd, ar := testAllocRunner(t, false)

	// Make the task fail
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["start_error"] = "test error"

	// Make the alloc be part of a deployment
	ar.alloc.DeploymentID = uuid.Generate()
	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1

	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.DeploymentStatus == nil || last.DeploymentStatus.Healthy == nil {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if *last.DeploymentStatus.Healthy {
			return false, fmt.Errorf("want deployment status unhealthy; got healthy")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Assert that we have an event explaining why we are unhealthy.
	assert.Len(ar.taskStates, 1)
	state := ar.taskStates[task.Name]
	assert.NotNil(state)
	assert.NotEmpty(state.Events)
	last := state.Events[len(state.Events)-1]
	assert.Equal(allocHealthEventSource, last.Type)
	assert.Contains(last.Message, "failed task")
}

// Test that the watcher will mark the allocation as unhealthy if it hits its
// deadline.
func TestAllocRunner_DeploymentHealth_Unhealthy_Deadline(t *testing.T) {
	t.Parallel()
	assert := assert.New(t)

	// Disable restarts
	upd, ar := testAllocRunner(t, false)

	// Make the task block
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["start_block_for"] = "2s"
	task.Config["run_for"] = "10s"

	// Make the alloc be part of a deployment
	ar.alloc.DeploymentID = uuid.Generate()
	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
	ar.alloc.Job.TaskGroups[0].Update.HealthyDeadline = 100 * time.Millisecond

	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.DeploymentStatus == nil || last.DeploymentStatus.Healthy == nil {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if *last.DeploymentStatus.Healthy {
			return false, fmt.Errorf("want deployment status unhealthy; got healthy")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Assert that we have an event explaining why we are unhealthy.
	assert.Len(ar.taskStates, 1)
	state := ar.taskStates[task.Name]
	assert.NotNil(state)
	assert.NotEmpty(state.Events)
	last := state.Events[len(state.Events)-1]
	assert.Equal(allocHealthEventSource, last.Type)
	assert.Contains(last.Message, "not running by deadline")
}

// Test that the watcher will mark the allocation as healthy.
func TestAllocRunner_DeploymentHealth_Healthy_NoChecks(t *testing.T) {
	t.Parallel()

	// Disable restarts
	upd, ar := testAllocRunner(t, false)

	// Make the task run healthy
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "10s"

	// Create a task that takes longer to become healthy
	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task.Copy())
	task2 := ar.alloc.Job.TaskGroups[0].Tasks[1]
	task2.Name = "task 2"
	task2.Config["start_block_for"] = "500ms"

	// Make the alloc be part of a deployment
	ar.alloc.DeploymentID = uuid.Generate()
	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond

	start := time.Now()
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.DeploymentStatus == nil || last.DeploymentStatus.Healthy == nil {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if !*last.DeploymentStatus.Healthy {
			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
	if d := time.Now().Sub(start); d < 500*time.Millisecond {
		t.Fatalf("didn't wait for second task group. Only took %v", d)
	}
}

// Test that the watcher will mark the allocation as healthy with checks
func TestAllocRunner_DeploymentHealth_Healthy_Checks(t *testing.T) {
	t.Parallel()

	// Disable restarts
	upd, ar := testAllocRunner(t, false)

	// Make the task run for a while
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "10s"

	// Create a task that has no checks
	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task.Copy())
	task2 := ar.alloc.Job.TaskGroups[0].Tasks[1]
	task2.Name = "task 2"
	task2.Services = nil

	// Make the alloc be part of a deployment
	ar.alloc.DeploymentID = uuid.Generate()
	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks
	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond

	checkHealthy := &api.AgentCheck{
		CheckID: uuid.Generate(),
		Status:  api.HealthPassing,
	}
	checkUnhealthy := &api.AgentCheck{
		CheckID: checkHealthy.CheckID,
		Status:  api.HealthWarning,
	}

	// Only return the check as healthy after a duration
	trigger := time.After(500 * time.Millisecond)
	ar.consulClient.(*mockConsulServiceClient).allocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) {
		select {
		case <-trigger:
			return &consul.AllocRegistration{
				Tasks: map[string]*consul.TaskRegistration{
					task.Name: {
						Services: map[string]*consul.ServiceRegistration{
							"123": {
								Service: &api.AgentService{Service: "foo"},
								Checks:  []*api.AgentCheck{checkHealthy},
							},
						},
					},
				},
			}, nil
		default:
			return &consul.AllocRegistration{
				Tasks: map[string]*consul.TaskRegistration{
					task.Name: {
						Services: map[string]*consul.ServiceRegistration{
							"123": {
								Service: &api.AgentService{Service: "foo"},
								Checks:  []*api.AgentCheck{checkUnhealthy},
							},
						},
					},
				},
			}, nil
		}
	}

	start := time.Now()
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.DeploymentStatus == nil || last.DeploymentStatus.Healthy == nil {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if !*last.DeploymentStatus.Healthy {
			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	if d := time.Now().Sub(start); d < 500*time.Millisecond {
		t.Fatalf("didn't wait for second task group. Only took %v", d)
	}
}

// Test that the watcher will mark the allocation as unhealthy with failing
// checks
func TestAllocRunner_DeploymentHealth_Unhealthy_Checks(t *testing.T) {
	t.Parallel()
	assert := assert.New(t)

	// Disable restarts
	upd, ar := testAllocRunner(t, false)

	// Make the task run for a while
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "10s"

	// Make the alloc be part of a deployment
	ar.alloc.DeploymentID = uuid.Generate()
	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks
	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
	ar.alloc.Job.TaskGroups[0].Update.HealthyDeadline = 1 * time.Second

	checkUnhealthy := &api.AgentCheck{
		CheckID: uuid.Generate(),
		Status:  api.HealthWarning,
	}

	// Always return the check as unhealthy
	ar.consulClient.(*mockConsulServiceClient).allocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) {
		return &consul.AllocRegistration{
			Tasks: map[string]*consul.TaskRegistration{
				task.Name: {
					Services: map[string]*consul.ServiceRegistration{
						"123": {
							Service: &api.AgentService{Service: "foo"},
							Checks:  []*api.AgentCheck{checkUnhealthy},
						},
					},
				},
			},
		}, nil
	}

	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.DeploymentStatus == nil || last.DeploymentStatus.Healthy == nil {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if *last.DeploymentStatus.Healthy {
			return false, fmt.Errorf("want deployment status unhealthy; got healthy")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Assert that we have an event explaining why we are unhealthy.
	assert.Len(ar.taskStates, 1)
	state := ar.taskStates[task.Name]
	assert.NotNil(state)
	assert.NotEmpty(state.Events)
	last := state.Events[len(state.Events)-1]
	assert.Equal(allocHealthEventSource, last.Type)
	assert.Contains(last.Message, "Services not healthy by deadline")
}

// Test that the watcher will mark the allocation as healthy.
func TestAllocRunner_DeploymentHealth_Healthy_UpdatedDeployment(t *testing.T) {
	t.Parallel()

	// Disable restarts
	upd, ar := testAllocRunner(t, false)

	// Make the task run healthy
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "30s"

	// Make the alloc be part of a deployment
	ar.alloc.DeploymentID = uuid.Generate()
	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond

	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.DeploymentStatus == nil || last.DeploymentStatus.Healthy == nil {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if !*last.DeploymentStatus.Healthy {
			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Mimic an update to a new deployment id
	oldCount, last := upd.Last()
	last.DeploymentStatus = nil
	last.DeploymentID = uuid.Generate()
	ar.Update(last)

	testutil.WaitForResult(func() (bool, error) {
		newCount, last := upd.Last()
		if newCount <= oldCount {
			return false, fmt.Errorf("No new updates")
		}
		if last.DeploymentStatus == nil || last.DeploymentStatus.Healthy == nil {
			return false, fmt.Errorf("want deployment status unhealthy; got unset")
		} else if !*last.DeploymentStatus.Healthy {
			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

// TestAllocRunner_RetryArtifact ensures that if one task in a task group is
// retrying fetching an artifact, other tasks in the group should be able
// to proceed.
func TestAllocRunner_RetryArtifact(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()
	alloc.Job.Type = structs.JobTypeBatch
	alloc.Job.TaskGroups[0].RestartPolicy.Mode = structs.RestartPolicyModeFail
	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 1
	alloc.Job.TaskGroups[0].RestartPolicy.Delay = time.Duration(4*testutil.TestMultiplier()) * time.Second

	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"exit_code": "0",
		"run_for":   "1s",
	}

	// Create a new task with a bad artifact
	badtask := alloc.Job.TaskGroups[0].Tasks[0].Copy()
	badtask.Name = "bad"
	badtask.Artifacts = []*structs.TaskArtifact{
		{GetterSource: "http://127.0.0.1:0/foo/bar/baz"},
	}

	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, badtask)
	upd, ar := testAllocRunnerFromAlloc(t, alloc, true)
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		count, last := upd.Last()
		if min := 6; count < min {
			return false, fmt.Errorf("Not enough updates (%d < %d)", count, min)
		}

		// web task should have completed successfully while bad task
		// retries artifact fetching
		webstate := last.TaskStates["web"]
		if webstate.State != structs.TaskStateDead {
			return false, fmt.Errorf("expected web to be dead but found %q", last.TaskStates["web"].State)
		}
		if !webstate.Successful() {
			return false, fmt.Errorf("expected web to have exited successfully")
		}

		// bad task should have failed
		badstate := last.TaskStates["bad"]
		if badstate.State != structs.TaskStateDead {
			return false, fmt.Errorf("expected bad to be dead but found %q", badstate.State)
		}
		if !badstate.Failed {
			return false, fmt.Errorf("expected bad to have failed: %#v", badstate.Events)
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) {
	t.Parallel()
	upd, ar := testAllocRunner(t, false)

	// Ensure task takes some time
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "10s"
	go ar.Run()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusRunning {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Update the alloc to be terminal which should cause the alloc runner to
	// stop the tasks and wait for a destroy.
	update := ar.alloc.Copy()
	update.DesiredStatus = structs.AllocDesiredStatusStop
	ar.Update(update)

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		// Check the status has changed.
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		// Check the allocation state still exists
		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
			if !allocationBucketExists(tx, ar.Alloc().ID) {
				return fmt.Errorf("no bucket for alloc")
			}

			return nil
		}); err != nil {
			return false, fmt.Errorf("state destroyed")
		}

		// Check the alloc directory still exists
		if _, err := os.Stat(ar.allocDir.AllocDir); err != nil {
			return false, fmt.Errorf("alloc dir destroyed: %v", ar.allocDir.AllocDir)
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Send the destroy signal and ensure the AllocRunner cleans up.
	ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		// Check the status has changed.
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		// Check the state was cleaned
		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
			if allocationBucketExists(tx, ar.Alloc().ID) {
				return fmt.Errorf("bucket for alloc exists")
			}

			return nil
		}); err != nil {
			return false, fmt.Errorf("state not destroyed")
		}

		// Check the alloc directory was cleaned
		if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
			return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir)
		} else if !os.IsNotExist(err) {
			return false, fmt.Errorf("stat err: %v", err)
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

func TestAllocRunner_Destroy(t *testing.T) {
	t.Parallel()
	upd, ar := testAllocRunner(t, false)

	// Ensure task takes some time
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "10s"
	go ar.Run()
	start := time.Now()

	// Begin the tear down
	go func() {
		time.Sleep(1 * time.Second)
		ar.Destroy()
	}()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		// Check the status has changed.
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		// Check the state was cleaned
		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
			if allocationBucketExists(tx, ar.Alloc().ID) {
				return fmt.Errorf("bucket for alloc exists")
			}

			return nil
		}); err != nil {
			return false, fmt.Errorf("state not destroyed: %v", err)
		}

		// Check the alloc directory was cleaned
		if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
			return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir)
		} else if !os.IsNotExist(err) {
			return false, fmt.Errorf("stat err: %v", err)
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	if elapsed := time.Since(start); elapsed > 20*time.Second {
		t.Fatalf("took too long to terminate: %s", elapsed)
	}
}

func TestAllocRunner_Update(t *testing.T) {
	t.Parallel()
	_, ar := testAllocRunner(t, false)

	// Deep copy the alloc to avoid races when updating
	newAlloc := ar.Alloc().Copy()

	// Ensure task takes some time
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config["run_for"] = "10s"
	go ar.Run()
	defer ar.Destroy()

	// Update the alloc definition
	newAlloc.Name = "FOO"
	newAlloc.AllocModifyIndex++
	ar.Update(newAlloc)

	// Check the alloc runner stores the updated allocation.
	testutil.WaitForResult(func() (bool, error) {
		return ar.Alloc().Name == "FOO", nil
	}, func(err error) {
		t.Fatalf("err: %v %#v", err, ar.Alloc())
	})
}

func TestAllocRunner_SaveRestoreState(t *testing.T) {
	t.Parallel()
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"exit_code": "0",
		"run_for":   "10s",
	}

	upd, ar := testAllocRunnerFromAlloc(t, alloc, false)
	go ar.Run()
	defer ar.Destroy()

	// Snapshot state
	testutil.WaitForResult(func() (bool, error) {
		ar.taskLock.RLock()
		defer ar.taskLock.RUnlock()
		return len(ar.tasks) == 1, nil
	}, func(err error) {
		t.Fatalf("task never started: %v", err)
	})

	err := ar.SaveState()
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Create a new alloc runner
	l2 := prefixedTestLogger("----- ar2: ")
	alloc2 := &structs.Allocation{ID: ar.alloc.ID}
	prevAlloc := newAllocWatcher(alloc2, ar, nil, ar.config, l2, "")
	ar2 := NewAllocRunner(l2, ar.config, ar.stateDB, upd.Update,
		alloc2, ar.vaultClient, ar.consulClient, prevAlloc)
	err = ar2.RestoreState()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	go ar2.Run()

	testutil.WaitForResult(func() (bool, error) {
		if len(ar2.tasks) != 1 {
			return false, fmt.Errorf("Incorrect number of tasks")
		}

		_, last := upd.Last()
		if last == nil {
			return false, nil
		}

		return last.ClientStatus == structs.AllocClientStatusRunning, nil
	}, func(err error) {
		_, last := upd.Last()
		t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates["web"])
	})

	// Destroy and wait
	ar2.Destroy()
	start := time.Now()

	testutil.WaitForResult(func() (bool, error) {
		alloc := ar2.Alloc()
		if alloc.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("Bad client status; got %v; want %v", alloc.ClientStatus, structs.AllocClientStatusComplete)
		}
		return true, nil
	}, func(err error) {
		_, last := upd.Last()
		t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates)
	})

	if time.Since(start) > time.Duration(testutil.TestMultiplier()*5)*time.Second {
		t.Fatalf("took too long to terminate")
	}
}

func TestAllocRunner_SaveRestoreState_TerminalAlloc(t *testing.T) {
	t.Parallel()
	upd, ar := testAllocRunner(t, false)
	ar.logger = prefixedTestLogger("ar1: ")

	// Ensure task takes some time
	ar.alloc.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Config["run_for"] = "10s"
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		if last.ClientStatus != structs.AllocClientStatusRunning {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Update the alloc to be terminal which should cause the alloc runner to
	// stop the tasks and wait for a destroy.
	update := ar.alloc.Copy()
	update.DesiredStatus = structs.AllocDesiredStatusStop
	ar.Update(update)

	testutil.WaitForResult(func() (bool, error) {
		return ar.Alloc().DesiredStatus == structs.AllocDesiredStatusStop, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	err := ar.SaveState()
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Ensure ar1 doesn't recreate the state file
	ar.allocLock.Lock()
	defer ar.allocLock.Unlock()

	// Create a new alloc runner
	l2 := prefixedTestLogger("ar2: ")
	alloc2 := &structs.Allocation{ID: ar.alloc.ID}
	prevAlloc := newAllocWatcher(alloc2, ar, nil, ar.config, l2, "")
	ar2 := NewAllocRunner(l2, ar.config, ar.stateDB, upd.Update,
		alloc2, ar.vaultClient, ar.consulClient, prevAlloc)
	err = ar2.RestoreState()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	ar2.logger.Println("[TESTING] running second alloc runner")
	go ar2.Run()
	defer ar2.Destroy() // Just-in-case of failure before Destroy below

	testutil.WaitForResult(func() (bool, error) {
		// Check the state still exists
		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
			if !allocationBucketExists(tx, ar2.Alloc().ID) {
				return fmt.Errorf("no bucket for alloc")
			}

			return nil
		}); err != nil {
			return false, fmt.Errorf("state destroyed")
		}

		// Check the alloc directory still exists
		if _, err := os.Stat(ar.allocDir.AllocDir); err != nil {
			return false, fmt.Errorf("alloc dir destroyed: %v", ar.allocDir.AllocDir)
		}

		return true, nil
	}, func(err error) {
		_, last := upd.Last()
		t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates)
	})

	// Send the destroy signal and ensure the AllocRunner cleans up.
	ar2.logger.Println("[TESTING] destroying second alloc runner")
	ar2.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		// Check the status has changed.
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		// Check the state was cleaned
		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
			if allocationBucketExists(tx, ar2.Alloc().ID) {
				return fmt.Errorf("bucket for alloc exists")
			}

			return nil
		}); err != nil {
			return false, fmt.Errorf("state not destroyed")
		}

		// Check the alloc directory was cleaned
		if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
			return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir)
		} else if !os.IsNotExist(err) {
			return false, fmt.Errorf("stat err: %v", err)
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

// TestAllocRunner_SaveRestoreState_Upgrade asserts that pre-0.6 exec tasks are
// restarted on upgrade.
func TestAllocRunner_SaveRestoreState_Upgrade(t *testing.T) {
	t.Parallel()
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"exit_code": "0",
		"run_for":   "10s",
	}

	upd, ar := testAllocRunnerFromAlloc(t, alloc, false)
	// Hack in old version to cause an upgrade on RestoreState
	origConfig := ar.config.Copy()
	ar.config.Version = &version.VersionInfo{Version: "0.5.6"}
	go ar.Run()
	defer ar.Destroy()

	// Snapshot state
	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}

		if last.ClientStatus != structs.AllocClientStatusRunning {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("task never started: %v", err)
	})

	err := ar.SaveState()
	if err != nil {
		t.Fatalf("err: %v", err)
	}

	// Create a new alloc runner
	l2 := prefixedTestLogger("ar2: ")
	alloc2 := &structs.Allocation{ID: ar.alloc.ID}
	prevAlloc := newAllocWatcher(alloc2, ar, nil, origConfig, l2, "")
	ar2 := NewAllocRunner(l2, origConfig, ar.stateDB, upd.Update, alloc2, ar.vaultClient, ar.consulClient, prevAlloc)
	err = ar2.RestoreState()
	if err != nil {
		t.Fatalf("err: %v", err)
	}
	go ar2.Run()
	defer ar2.Destroy() // Just-in-case of failure before Destroy below

	testutil.WaitForResult(func() (bool, error) {
		count, last := upd.Last()
		if min := 3; count < min {
			return false, fmt.Errorf("expected at least %d updates but found %d", min, count)
		}
		for _, ev := range last.TaskStates["web"].Events {
			if strings.HasSuffix(ev.RestartReason, pre06ScriptCheckReason) {
				return true, nil
			}
		}
		return false, fmt.Errorf("no restart with proper reason found")
	}, func(err error) {
		count, last := upd.Last()
		t.Fatalf("err: %v\nAllocs: %d\nweb state: % #v", err, count, pretty.Formatter(last.TaskStates["web"]))
	})

	// Destroy and wait
	ar2.Destroy()
	start := time.Now()

	testutil.WaitForResult(func() (bool, error) {
		alloc := ar2.Alloc()
		if alloc.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("Bad client status; got %v; want %v", alloc.ClientStatus, structs.AllocClientStatusComplete)
		}
		return true, nil
	}, func(err error) {
		_, last := upd.Last()
		t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates)
	})

	if time.Since(start) > time.Duration(testutil.TestMultiplier()*5)*time.Second {
		t.Fatalf("took too long to terminate")
	}
}

// Ensure pre-#2132 state files containing the Context struct are properly
// migrated to the new format.
//
// Old Context State:
//
//	"Context": {
//		"AllocDir": {
//			"AllocDir": "/path/to/allocs/2a54fcff-fc44-8d4f-e025-53c48e9cbbbb",
//			"SharedDir": "/path/to/allocs/2a54fcff-fc44-8d4f-e025-53c48e9cbbbb/alloc",
//			"TaskDirs": {
//				"echo1": "/path/to/allocs/2a54fcff-fc44-8d4f-e025-53c48e9cbbbb/echo1"
//			}
//		},
//		"AllocID": "2a54fcff-fc44-8d4f-e025-53c48e9cbbbb"
//	}
func TestAllocRunner_RestoreOldState(t *testing.T) {
	t.Parallel()
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"exit_code": "0",
		"run_for":   "10s",
	}

	logger := testLogger()
	conf := config.DefaultConfig()
	conf.Node = mock.Node()
	conf.StateDir = os.TempDir()
	conf.AllocDir = os.TempDir()
	tmp, err := ioutil.TempFile("", "state-db")
	if err != nil {
		t.Fatalf("error creating state db file: %v", err)
	}
	db, err := bolt.Open(tmp.Name(), 0600, nil)
	if err != nil {
		t.Fatalf("error creating state db: %v", err)
	}

	if err := os.MkdirAll(filepath.Join(conf.StateDir, "alloc", alloc.ID), 0777); err != nil {
		t.Fatalf("error creating state dir: %v", err)
	}
	statePath := filepath.Join(conf.StateDir, "alloc", alloc.ID, "state.json")
	w, err := os.Create(statePath)
	if err != nil {
		t.Fatalf("error creating state file: %v", err)
	}
	tmplctx := &struct {
		AllocID  string
		AllocDir string
	}{alloc.ID, conf.AllocDir}
	err = template.Must(template.New("test_state").Parse(`{
  "Version": "0.5.1",
  "Alloc": {
    "ID": "{{ .AllocID }}",
    "Name": "example",
    "JobID": "example",
    "Job": {
      "ID": "example",
      "Name": "example",
      "Type": "batch",
      "TaskGroups": [
        {
          "Name": "example",
          "Tasks": [
            {
              "Name": "example",
              "Driver": "mock",
              "Config": {
                "exit_code": "0",
                "run_for": "10s"
              }
            }
          ]
        }
      ]
    },
    "TaskGroup": "example",
    "DesiredStatus": "run",
    "ClientStatus": "running",
    "TaskStates": {
      "example": {
        "State": "running",
        "Failed": false,
        "Events": []
      }
    }
  },
  "Context": {
    "AllocDir": {
      "AllocDir": "{{ .AllocDir }}/{{ .AllocID }}",
      "SharedDir": "{{ .AllocDir }}/{{ .AllocID }}/alloc",
      "TaskDirs": {
        "example": "{{ .AllocDir }}/{{ .AllocID }}/example"
      }
    },
    "AllocID": "{{ .AllocID }}"
  }
}`)).Execute(w, tmplctx)
	if err != nil {
		t.Fatalf("error writing state file: %v", err)
	}
	w.Close()

	upd := &MockAllocStateUpdater{}
	*alloc.Job.LookupTaskGroup(alloc.TaskGroup).RestartPolicy = structs.RestartPolicy{Attempts: 0}
	alloc.Job.Type = structs.JobTypeBatch
	vclient := vaultclient.NewMockVaultClient()
	cclient := newMockConsulServiceClient(t)
	ar := NewAllocRunner(logger, conf, db, upd.Update, alloc, vclient, cclient, noopPrevAlloc{})
	defer ar.Destroy()

	// RestoreState should fail on the task state since we only test the
	// alloc state restoring.
	err = ar.RestoreState()
	if err == nil {
		t.Fatal("expected error restoring Task state")
	}
	merr, ok := err.(*multierror.Error)
	if !ok {
		t.Fatalf("expected RestoreState to return a multierror but found: %T -> %v", err, err)
	}
	if len(merr.Errors) != 1 {
		t.Fatalf("expected exactly 1 error from RestoreState but found: %d: %v", len(merr.Errors), err)
	}
	if expected := "failed to get task bucket"; !strings.Contains(merr.Errors[0].Error(), expected) {
		t.Fatalf("expected %q but got: %q", expected, merr.Errors[0].Error())
	}

	if err := ar.SaveState(); err != nil {
		t.Fatalf("error saving new state: %v", err)
	}
}

func TestAllocRunner_TaskFailed_KillTG(t *testing.T) {
	t.Parallel()
	upd, ar := testAllocRunner(t, false)

	// Create two tasks in the task group
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.KillTimeout = 10 * time.Millisecond
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "task 2"
	task2.Driver = "mock_driver"
	task2.Config = map[string]interface{}{
		"start_error": "fail task please",
	}
	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2)
	ar.alloc.TaskResources[task2.Name] = task2.Resources
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusFailed {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusFailed)
		}

		// Task One should be killed
		state1 := last.TaskStates[task.Name]
		if state1.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
		}
		if len(state1.Events) < 2 {
			// At least have a received and destroyed
			return false, fmt.Errorf("Unexpected number of events")
		}

		found := false
		for _, e := range state1.Events {
			if e.Type == structs.TaskSiblingFailed {
				found = true
			}
		}

		if !found {
			return false, fmt.Errorf("Did not find event %v", structs.TaskSiblingFailed)
		}

		// Task Two should be failed
		state2 := last.TaskStates[task2.Name]
		if state2.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
		}
		if !state2.Failed {
			return false, fmt.Errorf("task2 should have failed")
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
	t.Parallel()
	upd, ar := testAllocRunner(t, false)

	// Create two tasks in the task group
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.KillTimeout = 10 * time.Millisecond
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "task 2"
	task2.Driver = "mock_driver"
	task2.Leader = true
	task2.Config = map[string]interface{}{
		"run_for": "1s",
	}
	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2)
	ar.alloc.TaskResources[task2.Name] = task2.Resources
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}

		// Task One should be killed
		state1 := last.TaskStates[task.Name]
		if state1.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
		}
		if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() {
			return false, fmt.Errorf("expected to have a start and finish time")
		}
		if len(state1.Events) < 2 {
			// At least have a received and destroyed
			return false, fmt.Errorf("Unexpected number of events")
		}

		found := false
		for _, e := range state1.Events {
			if e.Type == structs.TaskLeaderDead {
				found = true
			}
		}

		if !found {
			return false, fmt.Errorf("Did not find event %v", structs.TaskLeaderDead)
		}

		// Task Two should be dead
		state2 := last.TaskStates[task2.Name]
		if state2.State != structs.TaskStateDead {
			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
		}
		if state2.FinishedAt.IsZero() || state2.StartedAt.IsZero() {
			return false, fmt.Errorf("expected to have a start and finish time")
		}

		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})
}

// TestAllocRunner_TaskLeader_StopTG asserts that when stopping a task group
// with a leader, the leader is stopped before the other tasks.
func TestAllocRunner_TaskLeader_StopTG(t *testing.T) {
	t.Parallel()
	upd, ar := testAllocRunner(t, false)

	// Create 3 tasks in the task group
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Name = "follower1"
	task.Driver = "mock_driver"
	task.KillTimeout = 10 * time.Millisecond
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "leader"
	task2.Driver = "mock_driver"
	task2.Leader = true
	task2.KillTimeout = 10 * time.Millisecond
	task2.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task3 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task3.Name = "follower2"
	task3.Driver = "mock_driver"
	task3.KillTimeout = 10 * time.Millisecond
	task3.Config = map[string]interface{}{
		"run_for": "10s",
	}
	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2, task3)
	ar.alloc.TaskResources[task2.Name] = task2.Resources
	defer ar.Destroy()

	go ar.Run()

	// Wait for tasks to start
	oldCount, last := upd.Last()
	testutil.WaitForResult(func() (bool, error) {
		oldCount, last = upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if n := len(last.TaskStates); n != 3 {
			return false, fmt.Errorf("Not enough task states (want: 3; found %d)", n)
		}
		for name, state := range last.TaskStates {
			if state.State != structs.TaskStateRunning {
				return false, fmt.Errorf("Task %q is not running yet (it's %q)", name, state.State)
			}
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Stop alloc
	update := ar.Alloc()
	update.DesiredStatus = structs.AllocDesiredStatusStop
	ar.Update(update)

	// Wait for tasks to stop
	testutil.WaitForResult(func() (bool, error) {
		newCount, last := upd.Last()
		if newCount == oldCount {
			return false, fmt.Errorf("no new updates (count: %d)", newCount)
		}
		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower1"].FinishedAt.UnixNano() {
			return false, fmt.Errorf("expected leader to finish before follower1: %s >= %s",
				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower1"].FinishedAt)
		}
		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower2"].FinishedAt.UnixNano() {
			return false, fmt.Errorf("expected leader to finish before follower2: %s >= %s",
				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower2"].FinishedAt)
		}
		return true, nil
	}, func(err error) {
		count, last := upd.Last()
		t.Logf("Updates: %d", count)
		for name, state := range last.TaskStates {
			t.Logf("%s: %s", name, state.State)
		}
		t.Fatalf("err: %v", err)
	})
}

// TestAllocRunner_TaskLeader_StopRestoredTG asserts that when stopping a
// restored task group whose leader failed before the restore, the leader is
// not stopped since it no longer exists.
// See https://github.com/hashicorp/nomad/issues/3420#issuecomment-341666932
func TestAllocRunner_TaskLeader_StopRestoredTG(t *testing.T) {
	t.Parallel()
	_, ar := testAllocRunner(t, false)
	defer ar.Destroy()

	// Create a leader and follower task in the task group
	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
	task.Name = "follower1"
	task.Driver = "mock_driver"
	task.KillTimeout = 10 * time.Second
	task.Config = map[string]interface{}{
		"run_for": "10s",
	}

	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
	task2.Name = "leader"
	task2.Driver = "mock_driver"
	task2.Leader = true
	task2.KillTimeout = 10 * time.Millisecond
	task2.Config = map[string]interface{}{
		"run_for": "0s",
	}

	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2)
	ar.alloc.TaskResources[task2.Name] = task2.Resources

	// Mimic Nomad exiting before the leader stopping is able to stop other tasks.
	ar.tasks = map[string]*TaskRunner{
		"leader": NewTaskRunner(ar.logger, ar.config, ar.stateDB, ar.setTaskState,
			ar.allocDir.NewTaskDir(task2.Name), ar.Alloc(), task2.Copy(),
			ar.vaultClient, ar.consulClient),
		"follower1": NewTaskRunner(ar.logger, ar.config, ar.stateDB, ar.setTaskState,
			ar.allocDir.NewTaskDir(task.Name), ar.Alloc(), task.Copy(),
			ar.vaultClient, ar.consulClient),
	}
	ar.taskStates = map[string]*structs.TaskState{
		"leader":    {State: structs.TaskStateDead},
		"follower1": {State: structs.TaskStateRunning},
	}
	if err := ar.SaveState(); err != nil {
		t.Fatalf("error saving state: %v", err)
	}

	// Create a new AllocRunner to test RestoreState and Run
	upd2 := &MockAllocStateUpdater{}
	ar2 := NewAllocRunner(ar.logger, ar.config, ar.stateDB, upd2.Update, ar.alloc,
		ar.vaultClient, ar.consulClient, ar.prevAlloc)
	defer ar2.Destroy()

	if err := ar2.RestoreState(); err != nil {
		t.Fatalf("error restoring state: %v", err)
	}
	go ar2.Run()

	// Wait for tasks to be stopped because leader is dead
	testutil.WaitForResult(func() (bool, error) {
		_, last := upd2.Last()
		if last == nil {
			return false, fmt.Errorf("no updates yet")
		}
		if actual := last.TaskStates["leader"].State; actual != structs.TaskStateDead {
			return false, fmt.Errorf("Task leader is not dead yet (it's %q)", actual)
		}
		if actual := last.TaskStates["follower1"].State; actual != structs.TaskStateDead {
			return false, fmt.Errorf("Task follower1 is not dead yet (it's %q)", actual)
		}
		return true, nil
	}, func(err error) {
		count, last := upd2.Last()
		t.Logf("Updates: %d", count)
		for name, state := range last.TaskStates {
			t.Logf("%s: %s", name, state.State)
		}
		t.Fatalf("err: %v", err)
	})

	// Make sure it GCs properly
	ar2.Destroy()

	select {
	case <-ar2.WaitCh():
		// exited as expected
	case <-time.After(10 * time.Second):
		t.Fatalf("timed out waiting for AR to GC")
	}
}

// TestAllocRunner_MoveAllocDir asserts that a file written to an alloc's
// local/ dir will be moved to a replacement alloc's local/ dir if sticky
// volumes are enabled.
func TestAllocRunner_MoveAllocDir(t *testing.T) {
	t.Parallel()
	// Create an alloc runner
	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "1s",
	}
	upd, ar := testAllocRunnerFromAlloc(t, alloc, false)
	go ar.Run()
	defer ar.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Write some data in data dir and task dir of the alloc
	dataFile := filepath.Join(ar.allocDir.SharedDir, "data", "data_file")
	ioutil.WriteFile(dataFile, []byte("hello world"), os.ModePerm)
	taskDir := ar.allocDir.TaskDirs[task.Name]
	taskLocalFile := filepath.Join(taskDir.LocalDir, "local_file")
	ioutil.WriteFile(taskLocalFile, []byte("good bye world"), os.ModePerm)

	// Create another alloc runner
	alloc2 := mock.Alloc()
	alloc2.PreviousAllocation = ar.allocID
	alloc2.Job.TaskGroups[0].EphemeralDisk.Sticky = true
	task = alloc2.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "1s",
	}
	upd2, ar2 := testAllocRunnerFromAlloc(t, alloc2, false)

	// Set prevAlloc like Client does
	ar2.prevAlloc = newAllocWatcher(alloc2, ar, nil, ar2.config, ar2.logger, "")

	go ar2.Run()
	defer ar2.Destroy()

	testutil.WaitForResult(func() (bool, error) {
		_, last := upd2.Last()
		if last == nil {
			return false, fmt.Errorf("No updates")
		}
		if last.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
		}
		return true, nil
	}, func(err error) {
		t.Fatalf("err: %v", err)
	})

	// Ensure that data from ar was moved to ar2
	taskDir = ar2.allocDir.TaskDirs[task.Name]
	taskLocalFile = filepath.Join(taskDir.LocalDir, "local_file")
	if fileInfo, _ := os.Stat(taskLocalFile); fileInfo == nil {
		t.Fatalf("file %v not found", taskLocalFile)
	}

	dataFile = filepath.Join(ar2.allocDir.SharedDir, "data", "data_file")
	if fileInfo, _ := os.Stat(dataFile); fileInfo == nil {
		t.Fatalf("file %v not found", dataFile)
	}
}