github.com/emate/nomad@v0.8.2-wo-binpacking/client/alloc_runner_test.go

     1  package client
     2  
     3  import (
     4  	"fmt"
     5  	"io/ioutil"
     6  	"os"
     7  	"path/filepath"
     8  	"strings"
     9  	"sync"
    10  	"testing"
    11  	"text/template"
    12  	"time"
    13  
    14  	"github.com/boltdb/bolt"
    15  	"github.com/hashicorp/consul/api"
    16  	"github.com/hashicorp/go-multierror"
    17  	"github.com/hashicorp/nomad/command/agent/consul"
    18  	"github.com/hashicorp/nomad/helper/testlog"
    19  	"github.com/hashicorp/nomad/helper/uuid"
    20  	"github.com/hashicorp/nomad/nomad/mock"
    21  	"github.com/hashicorp/nomad/nomad/structs"
    22  	"github.com/hashicorp/nomad/testutil"
    23  	"github.com/hashicorp/nomad/version"
    24  	"github.com/kr/pretty"
    25  	"github.com/stretchr/testify/assert"
    26  
    27  	"github.com/hashicorp/nomad/client/config"
    28  	"github.com/hashicorp/nomad/client/vaultclient"
    29  	"github.com/stretchr/testify/require"
    30  )
    31  
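// MockAllocStateUpdater records the allocation updates pushed by an
// AllocRunner so tests can inspect the most recent state.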
    32  type MockAllocStateUpdater struct {
    33  	Allocs []*structs.Allocation
    34  	mu     sync.Mutex
    35  }
    36  
    37  // Update fulfills the TaskStateUpdater interface
    38  func (m *MockAllocStateUpdater) Update(alloc *structs.Allocation) {
    39  	m.mu.Lock()
    40  	m.Allocs = append(m.Allocs, alloc)
    41  	m.mu.Unlock()
    42  }
    43  
    44  // Last returns a copy of the last alloc (or nil) sync'd
    45  func (m *MockAllocStateUpdater) Last() *structs.Allocation {
    46  	m.mu.Lock()
    47  	defer m.mu.Unlock()
    48  	n := len(m.Allocs)
    49  	if n == 0 {
    50  		return nil
    51  	}
    52  	return m.Allocs[n-1].Copy()
    53  }
    54  
    55  // allocationBucketExists checks if the allocation bucket was created.
    56  func allocationBucketExists(tx *bolt.Tx, allocID string) bool {
    57  	allocations := tx.Bucket(allocationsBucket)
    58  	if allocations == nil {
    59  		return false
    60  	}
    61  
    62  	// Retrieve the specific allocations bucket
    63  	alloc := allocations.Bucket([]byte(allocID))
    64  	return alloc != nil
    65  }
    66  
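// testAllocRunnerFromAlloc returns a mock state updater and an AllocRunner
// for the given allocation, backed by a temporary BoltDB state file. If
// restarts is false the task group's restart policy is disabled and the job
// is forced to the batch type.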
    67  func testAllocRunnerFromAlloc(t *testing.T, alloc *structs.Allocation, restarts bool) (*MockAllocStateUpdater, *AllocRunner) {
    68  	conf := config.DefaultConfig()
    69  	conf.Node = mock.Node()
    70  	conf.StateDir = os.TempDir()
    71  	conf.AllocDir = os.TempDir()
    72  	tmp, _ := ioutil.TempFile("", "state-db")
    73  	db, _ := bolt.Open(tmp.Name(), 0600, nil)
    74  	upd := &MockAllocStateUpdater{}
    75  	if !restarts {
    76  		*alloc.Job.LookupTaskGroup(alloc.TaskGroup).RestartPolicy = structs.RestartPolicy{Attempts: 0}
    77  		alloc.Job.Type = structs.JobTypeBatch
    78  	}
    79  	vclient := vaultclient.NewMockVaultClient()
    80  	ar := NewAllocRunner(testlog.Logger(t), conf, db, upd.Update, alloc, vclient, newMockConsulServiceClient(t), noopPrevAlloc{})
    81  	return upd, ar
    82  }
    83  
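// testAllocRunner returns a mock state updater and an AllocRunner for a mock
// allocation whose task uses the mock driver and runs for 500ms.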
    84  func testAllocRunner(t *testing.T, restarts bool) (*MockAllocStateUpdater, *AllocRunner) {
    85  	// Use mock driver
    86  	alloc := mock.Alloc()
    87  	task := alloc.Job.TaskGroups[0].Tasks[0]
    88  	task.Driver = "mock_driver"
    89  	task.Config["run_for"] = "500ms"
    90  	return testAllocRunnerFromAlloc(t, alloc, restarts)
    91  }
    92  
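// TestAllocRunner_SimpleRun asserts that a simple batch allocation runs to
// the complete client status.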
    93  func TestAllocRunner_SimpleRun(t *testing.T) {
    94  	t.Parallel()
    95  	upd, ar := testAllocRunner(t, false)
    96  	go ar.Run()
    97  	defer ar.Destroy()
    98  
    99  	testutil.WaitForResult(func() (bool, error) {
   100  		last := upd.Last()
   101  		if last == nil {
   102  			return false, fmt.Errorf("No updates")
   103  		}
   104  		if last.ClientStatus != structs.AllocClientStatusComplete {
   105  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
   106  		}
   107  		return true, nil
   108  	}, func(err error) {
   109  		t.Fatalf("err: %v", err)
   110  	})
   111  }
   112  
    113  // Test that FinishedAt is set when the alloc is in a terminal state
   114  func TestAllocRunner_FinishedAtSet(t *testing.T) {
   115  	t.Parallel()
   116  	require := require.New(t)
   117  	_, ar := testAllocRunner(t, false)
   118  	ar.allocClientStatus = structs.AllocClientStatusFailed
   119  	alloc := ar.Alloc()
   120  	taskFinishedAt := make(map[string]time.Time)
   121  	require.NotEmpty(alloc.TaskStates)
   122  	for name, s := range alloc.TaskStates {
   123  		require.False(s.FinishedAt.IsZero())
   124  		taskFinishedAt[name] = s.FinishedAt
   125  	}
   126  
    127  	// Verify that calling Alloc() again does not mutate FinishedAt
   128  	alloc2 := ar.Alloc()
   129  	for name, s := range alloc2.TaskStates {
   130  		require.Equal(taskFinishedAt[name], s.FinishedAt)
   131  	}
   132  
   133  }
   134  
    135  // Test that FinishedAt is set when a task is in a terminal state
   136  func TestAllocRunner_FinishedAtSet_TaskEvents(t *testing.T) {
   137  	t.Parallel()
   138  	require := require.New(t)
   139  	_, ar := testAllocRunner(t, false)
   140  	ar.taskStates[ar.alloc.Job.TaskGroups[0].Tasks[0].Name] = &structs.TaskState{State: structs.TaskStateDead, Failed: true}
   141  
   142  	alloc := ar.Alloc()
   143  	taskFinishedAt := make(map[string]time.Time)
   144  	require.NotEmpty(alloc.TaskStates)
   145  	for name, s := range alloc.TaskStates {
   146  		require.False(s.FinishedAt.IsZero())
   147  		taskFinishedAt[name] = s.FinishedAt
   148  	}
   149  
    150  	// Verify that calling Alloc() again does not mutate FinishedAt
   151  	alloc2 := ar.Alloc()
   152  	for name, s := range alloc2.TaskStates {
   153  		require.Equal(taskFinishedAt[name], s.FinishedAt)
   154  	}
   155  
   156  }
   157  
   158  // Test that the watcher will mark the allocation as unhealthy.
   159  func TestAllocRunner_DeploymentHealth_Unhealthy_BadStart(t *testing.T) {
   160  	t.Parallel()
   161  	assert := assert.New(t)
   162  
   163  	// Ensure the task fails and restarts
   164  	upd, ar := testAllocRunner(t, true)
   165  
   166  	// Make the task fail
   167  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   168  	task.Driver = "mock_driver"
   169  	task.Config["start_error"] = "test error"
   170  
   171  	// Make the alloc be part of a deployment
   172  	ar.alloc.DeploymentID = uuid.Generate()
   173  	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
   174  	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
   175  	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
   176  
   177  	go ar.Run()
   178  	defer ar.Destroy()
   179  
   180  	testutil.WaitForResult(func() (bool, error) {
   181  		last := upd.Last()
   182  		if last == nil {
   183  			return false, fmt.Errorf("No updates")
   184  		}
   185  		if !last.DeploymentStatus.HasHealth() {
   186  			return false, fmt.Errorf("want deployment status unhealthy; got unset")
   187  		} else if *last.DeploymentStatus.Healthy {
   188  			return false, fmt.Errorf("want deployment status unhealthy; got healthy")
   189  		}
   190  		return true, nil
   191  	}, func(err error) {
   192  		t.Fatalf("err: %v", err)
   193  	})
   194  
   195  	// Assert that we have an event explaining why we are unhealthy.
   196  	assert.Len(ar.taskStates, 1)
   197  	state := ar.taskStates[task.Name]
   198  	assert.NotNil(state)
   199  	assert.NotEmpty(state.Events)
   200  	last := state.Events[len(state.Events)-1]
   201  	assert.Equal(allocHealthEventSource, last.Type)
   202  	assert.Contains(last.Message, "failed task")
   203  }
   204  
   205  // Test that the watcher will mark the allocation as unhealthy if it hits its
   206  // deadline.
   207  func TestAllocRunner_DeploymentHealth_Unhealthy_Deadline(t *testing.T) {
   208  	t.Parallel()
   209  
   210  	// Don't restart but force service job type
   211  	upd, ar := testAllocRunner(t, false)
   212  	ar.alloc.Job.Type = structs.JobTypeService
   213  
   214  	// Make the task block
   215  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   216  	task.Driver = "mock_driver"
   217  	task.Config["start_block_for"] = "4s"
   218  	task.Config["run_for"] = "10s"
   219  
   220  	// Make the alloc be part of a deployment
   221  	ar.alloc.DeploymentID = uuid.Generate()
   222  	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
   223  	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
   224  	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
   225  	ar.alloc.Job.TaskGroups[0].Update.HealthyDeadline = 100 * time.Millisecond
   226  
   227  	go ar.Run()
   228  	defer ar.Destroy()
   229  
   230  	testutil.WaitForResult(func() (bool, error) {
   231  		last := upd.Last()
   232  		if last == nil {
   233  			return false, fmt.Errorf("No updates")
   234  		}
   235  
   236  		// Assert alloc is unhealthy
   237  		if !last.DeploymentStatus.HasHealth() {
   238  			return false, fmt.Errorf("want deployment status unhealthy; got unset")
   239  		} else if *last.DeploymentStatus.Healthy {
   240  			return false, fmt.Errorf("want deployment status unhealthy; got healthy")
   241  		}
   242  
   243  		// Assert there is a task event explaining why we are unhealthy.
   244  		state, ok := last.TaskStates[task.Name]
   245  		if !ok {
   246  			return false, fmt.Errorf("missing state for task %s", task.Name)
   247  		}
   248  		n := len(state.Events)
   249  		if n == 0 {
   250  			return false, fmt.Errorf("no task events")
   251  		}
   252  		lastEvent := state.Events[n-1]
   253  		if lastEvent.Type != allocHealthEventSource {
   254  			return false, fmt.Errorf("expected %q; found %q", allocHealthEventSource, lastEvent.Type)
   255  		}
   256  		if !strings.Contains(lastEvent.Message, "not running by deadline") {
   257  			return false, fmt.Errorf(`expected "not running by deadline" but found: %s`, lastEvent.Message)
   258  		}
   259  
   260  		return true, nil
   261  	}, func(err error) {
   262  		t.Fatalf("err: %v", err)
   263  	})
   264  }
   265  
   266  // Test that the watcher will mark the allocation as healthy.
   267  func TestAllocRunner_DeploymentHealth_Healthy_NoChecks(t *testing.T) {
   268  	t.Parallel()
   269  
    270  	// Allow task restarts
   271  	upd, ar := testAllocRunner(t, true)
   272  
   273  	// Make the task run healthy
   274  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   275  	task.Driver = "mock_driver"
   276  	task.Config["run_for"] = "10s"
   277  
   278  	// Create a task that takes longer to become healthy
   279  	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task.Copy())
   280  	task2 := ar.alloc.Job.TaskGroups[0].Tasks[1]
   281  	task2.Name = "task 2"
   282  	task2.Config["start_block_for"] = "500ms"
   283  
   284  	// Make the alloc be part of a deployment
   285  	ar.alloc.DeploymentID = uuid.Generate()
   286  	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
   287  	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
   288  	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
   289  	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
   290  
   291  	start := time.Now()
   292  	go ar.Run()
   293  	defer ar.Destroy()
   294  
   295  	testutil.WaitForResult(func() (bool, error) {
   296  		last := upd.Last()
   297  		if last == nil {
   298  			return false, fmt.Errorf("No updates")
   299  		}
   300  		if !last.DeploymentStatus.HasHealth() {
    301  			return false, fmt.Errorf("want deployment status healthy; got unset")
   302  		} else if !*last.DeploymentStatus.Healthy {
   303  			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
   304  		}
   305  		return true, nil
   306  	}, func(err error) {
   307  		t.Fatalf("err: %v", err)
   308  	})
    309  	if d := time.Since(start); d < 500*time.Millisecond {
   310  		t.Fatalf("didn't wait for second task group. Only took %v", d)
   311  	}
   312  }
   313  
   314  // Test that the watcher will mark the allocation as healthy with checks
   315  func TestAllocRunner_DeploymentHealth_Healthy_Checks(t *testing.T) {
   316  	t.Parallel()
   317  
    318  	// Allow task restarts
   319  	upd, ar := testAllocRunner(t, true)
   320  
    321  	// Make the task run
   322  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   323  	task.Driver = "mock_driver"
   324  	task.Config["run_for"] = "10s"
   325  
   326  	// Create a task that has no checks
   327  	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task.Copy())
   328  	task2 := ar.alloc.Job.TaskGroups[0].Tasks[1]
   329  	task2.Name = "task 2"
   330  	task2.Services = nil
   331  
   332  	// Make the alloc be part of a deployment
   333  	ar.alloc.DeploymentID = uuid.Generate()
   334  	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
   335  	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks
   336  	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
   337  	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
   338  
   339  	checkHealthy := &api.AgentCheck{
   340  		CheckID: uuid.Generate(),
   341  		Status:  api.HealthPassing,
   342  	}
   343  	checkUnhealthy := &api.AgentCheck{
   344  		CheckID: checkHealthy.CheckID,
   345  		Status:  api.HealthWarning,
   346  	}
   347  
   348  	// Only return the check as healthy after a duration
   349  	trigger := time.After(500 * time.Millisecond)
   350  	ar.consulClient.(*mockConsulServiceClient).allocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) {
   351  		select {
   352  		case <-trigger:
   353  			return &consul.AllocRegistration{
   354  				Tasks: map[string]*consul.TaskRegistration{
   355  					task.Name: {
   356  						Services: map[string]*consul.ServiceRegistration{
   357  							"123": {
   358  								Service: &api.AgentService{Service: "foo"},
   359  								Checks:  []*api.AgentCheck{checkHealthy},
   360  							},
   361  						},
   362  					},
   363  				},
   364  			}, nil
   365  		default:
   366  			return &consul.AllocRegistration{
   367  				Tasks: map[string]*consul.TaskRegistration{
   368  					task.Name: {
   369  						Services: map[string]*consul.ServiceRegistration{
   370  							"123": {
   371  								Service: &api.AgentService{Service: "foo"},
   372  								Checks:  []*api.AgentCheck{checkUnhealthy},
   373  							},
   374  						},
   375  					},
   376  				},
   377  			}, nil
   378  		}
   379  	}
   380  
   381  	start := time.Now()
   382  	go ar.Run()
   383  	defer ar.Destroy()
   384  
   385  	testutil.WaitForResult(func() (bool, error) {
   386  		last := upd.Last()
   387  		if last == nil {
   388  			return false, fmt.Errorf("No updates")
   389  		}
   390  		if !last.DeploymentStatus.HasHealth() {
    391  			return false, fmt.Errorf("want deployment status healthy; got unset")
   392  		} else if !*last.DeploymentStatus.Healthy {
   393  			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
   394  		}
   395  		return true, nil
   396  	}, func(err error) {
   397  		t.Fatalf("err: %v", err)
   398  	})
   399  
    400  	if d := time.Since(start); d < 500*time.Millisecond {
   401  		t.Fatalf("didn't wait for second task group. Only took %v", d)
   402  	}
   403  }
   404  
   405  // Test that the watcher will mark the allocation as unhealthy with failing
   406  // checks
   407  func TestAllocRunner_DeploymentHealth_Unhealthy_Checks(t *testing.T) {
   408  	t.Parallel()
   409  	assert := assert.New(t)
   410  
    411  	// Allow task restarts
   412  	upd, ar := testAllocRunner(t, true)
   413  
    414  	// Make the task run
   415  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   416  	task.Driver = "mock_driver"
   417  	task.Config["run_for"] = "10s"
   418  
   419  	// Make the alloc be part of a deployment
   420  	ar.alloc.DeploymentID = uuid.Generate()
   421  	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
   422  	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_Checks
   423  	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
   424  	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
   425  	ar.alloc.Job.TaskGroups[0].Update.HealthyDeadline = 1 * time.Second
   426  
   427  	checkUnhealthy := &api.AgentCheck{
   428  		CheckID: uuid.Generate(),
   429  		Status:  api.HealthWarning,
   430  	}
   431  
    432  	// Always return the check as unhealthy
   433  	ar.consulClient.(*mockConsulServiceClient).allocRegistrationsFn = func(allocID string) (*consul.AllocRegistration, error) {
   434  		return &consul.AllocRegistration{
   435  			Tasks: map[string]*consul.TaskRegistration{
   436  				task.Name: {
   437  					Services: map[string]*consul.ServiceRegistration{
   438  						"123": {
   439  							Service: &api.AgentService{Service: "foo"},
   440  							Checks:  []*api.AgentCheck{checkUnhealthy},
   441  						},
   442  					},
   443  				},
   444  			},
   445  		}, nil
   446  	}
   447  
   448  	go ar.Run()
   449  	defer ar.Destroy()
   450  
   451  	testutil.WaitForResult(func() (bool, error) {
   452  		last := upd.Last()
   453  		if last == nil {
   454  			return false, fmt.Errorf("No updates")
   455  		}
   456  		if !last.DeploymentStatus.HasHealth() {
   457  			return false, fmt.Errorf("want deployment status unhealthy; got unset")
   458  		} else if *last.DeploymentStatus.Healthy {
   459  			return false, fmt.Errorf("want deployment status unhealthy; got healthy")
   460  		}
   461  		return true, nil
   462  	}, func(err error) {
   463  		t.Fatalf("err: %v", err)
   464  	})
   465  
   466  	// Assert that we have an event explaining why we are unhealthy.
   467  	assert.Len(ar.taskStates, 1)
   468  	state := ar.taskStates[task.Name]
   469  	assert.NotNil(state)
   470  	assert.NotEmpty(state.Events)
   471  	last := state.Events[len(state.Events)-1]
   472  	assert.Equal(allocHealthEventSource, last.Type)
   473  	assert.Contains(last.Message, "Services not healthy by deadline")
   474  }
   475  
   476  // Test that the watcher will mark the allocation as healthy.
   477  func TestAllocRunner_DeploymentHealth_Healthy_UpdatedDeployment(t *testing.T) {
   478  	t.Parallel()
   479  
    480  	// Allow task restarts
   481  	upd, ar := testAllocRunner(t, true)
   482  
   483  	// Make the task run healthy
   484  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   485  	task.Driver = "mock_driver"
   486  	task.Config["run_for"] = "30s"
   487  
   488  	// Make the alloc be part of a deployment
   489  	ar.alloc.DeploymentID = uuid.Generate()
   490  	ar.alloc.Job.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
   491  	ar.alloc.Job.TaskGroups[0].Update.HealthCheck = structs.UpdateStrategyHealthCheck_TaskStates
   492  	ar.alloc.Job.TaskGroups[0].Update.MaxParallel = 1
   493  	ar.alloc.Job.TaskGroups[0].Update.MinHealthyTime = 100 * time.Millisecond
   494  
   495  	go ar.Run()
   496  	defer ar.Destroy()
   497  
   498  	testutil.WaitForResult(func() (bool, error) {
   499  		last := upd.Last()
   500  		if last == nil {
   501  			return false, fmt.Errorf("No updates")
   502  		}
   503  		if !last.DeploymentStatus.HasHealth() {
    504  			return false, fmt.Errorf("want deployment status healthy; got unset")
   505  		} else if !*last.DeploymentStatus.Healthy {
   506  			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
   507  		}
   508  		return true, nil
   509  	}, func(err error) {
   510  		t.Fatalf("err: %v", err)
   511  	})
   512  
    513  	// Mimic an update to a new deployment ID
   514  	last := upd.Last()
   515  	last.DeploymentStatus = nil
   516  	last.DeploymentID = uuid.Generate()
   517  	ar.Update(last)
   518  
   519  	testutil.WaitForResult(func() (bool, error) {
   520  		last := upd.Last()
   521  		if !last.DeploymentStatus.HasHealth() {
    522  			return false, fmt.Errorf("want deployment status healthy; got unset")
   523  		} else if !*last.DeploymentStatus.Healthy {
   524  			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
   525  		}
   526  		return true, nil
   527  	}, func(err error) {
   528  		t.Fatalf("err: %v", err)
   529  	})
   530  }
   531  
    532  // Test that health is reported for allocations that were migrated, not just
    533  // those that are part of deployments.
   534  func TestAllocRunner_DeploymentHealth_Healthy_Migration(t *testing.T) {
   535  	t.Parallel()
   536  
    537  	// Allow task restarts
   538  	upd, ar := testAllocRunner(t, true)
   539  
   540  	// Make the task run healthy
   541  	tg := ar.alloc.Job.TaskGroups[0]
   542  	task := tg.Tasks[0]
   543  	task.Driver = "mock_driver"
   544  	task.Config["run_for"] = "30s"
   545  
   546  	// Shorten the default migration healthy time
   547  	tg.Migrate = structs.DefaultMigrateStrategy()
   548  	tg.Migrate.MinHealthyTime = 100 * time.Millisecond
   549  	tg.Migrate.HealthCheck = structs.MigrateStrategyHealthStates
   550  
   551  	// Ensure the alloc is *not* part of a deployment
   552  	ar.alloc.DeploymentID = ""
   553  
   554  	go ar.Run()
   555  	defer ar.Destroy()
   556  
   557  	testutil.WaitForResult(func() (bool, error) {
   558  		last := upd.Last()
   559  		if last == nil {
   560  			return false, fmt.Errorf("No updates")
   561  		}
   562  		if !last.DeploymentStatus.HasHealth() {
    563  			return false, fmt.Errorf("want deployment status healthy; got unset")
   564  		} else if !*last.DeploymentStatus.Healthy {
   565  			return false, fmt.Errorf("want deployment status healthy; got unhealthy")
   566  		}
   567  		return true, nil
   568  	}, func(err error) {
   569  		t.Fatalf("err: %v", err)
   570  	})
   571  }
   572  
   573  // Test that health is *not* reported for batch jobs
   574  func TestAllocRunner_DeploymentHealth_BatchDisabled(t *testing.T) {
   575  	t.Parallel()
   576  
    577  	// Create a batch alloc
   578  	alloc := mock.BatchAlloc()
   579  	tg := alloc.Job.TaskGroups[0]
   580  
    581  	// This should not be possible as validation should prevent batch jobs
   582  	// from having a migration stanza!
   583  	tg.Migrate = structs.DefaultMigrateStrategy()
   584  	tg.Migrate.MinHealthyTime = 1 * time.Millisecond
   585  	tg.Migrate.HealthyDeadline = 2 * time.Millisecond
   586  	tg.Migrate.HealthCheck = structs.MigrateStrategyHealthStates
   587  
   588  	task := tg.Tasks[0]
   589  	task.Driver = "mock_driver"
   590  	task.Config["run_for"] = "5s"
   591  	upd, ar := testAllocRunnerFromAlloc(t, alloc, false)
   592  
   593  	go ar.Run()
   594  	defer ar.Destroy()
   595  
   596  	testutil.WaitForResult(func() (bool, error) {
   597  		last := upd.Last()
   598  		if last == nil {
   599  			return false, fmt.Errorf("No updates")
   600  		}
   601  		if last.DeploymentStatus != nil {
   602  			return false, fmt.Errorf("unexpected deployment health set: %v", last.DeploymentStatus.Healthy)
   603  		}
   604  		return true, nil
   605  	}, func(err error) {
   606  		t.Fatalf("err: %v", err)
   607  	})
   608  }
   609  
    610  // TestAllocRunner_RetryArtifact ensures that if one task in a task group is
    611  // retrying an artifact fetch, the other tasks in the group are still able
    612  // to proceed.
   613  func TestAllocRunner_RetryArtifact(t *testing.T) {
   614  	t.Parallel()
   615  
   616  	alloc := mock.Alloc()
   617  	alloc.Job.Type = structs.JobTypeBatch
   618  	alloc.Job.TaskGroups[0].RestartPolicy.Mode = structs.RestartPolicyModeFail
   619  	alloc.Job.TaskGroups[0].RestartPolicy.Attempts = 1
   620  	alloc.Job.TaskGroups[0].RestartPolicy.Delay = time.Duration(4*testutil.TestMultiplier()) * time.Second
   621  
   622  	task := alloc.Job.TaskGroups[0].Tasks[0]
   623  	task.Driver = "mock_driver"
   624  	task.Config = map[string]interface{}{
   625  		"exit_code": "0",
   626  		"run_for":   "1s",
   627  	}
   628  
   629  	// Create a new task with a bad artifact
   630  	badtask := alloc.Job.TaskGroups[0].Tasks[0].Copy()
   631  	badtask.Name = "bad"
   632  	badtask.Artifacts = []*structs.TaskArtifact{
   633  		{GetterSource: "http://127.0.0.1:0/foo/bar/baz"},
   634  	}
   635  
   636  	alloc.Job.TaskGroups[0].Tasks = append(alloc.Job.TaskGroups[0].Tasks, badtask)
   637  	upd, ar := testAllocRunnerFromAlloc(t, alloc, true)
   638  	go ar.Run()
   639  	defer ar.Destroy()
   640  
   641  	testutil.WaitForResult(func() (bool, error) {
   642  		last := upd.Last()
   643  		if last == nil {
   644  			return false, fmt.Errorf("No updates")
   645  		}
   646  
   647  		// web task should have completed successfully while bad task
   648  		// retries artifact fetching
   649  		webstate, ok := last.TaskStates["web"]
   650  		if !ok {
   651  			return false, fmt.Errorf("no task state for web")
   652  		}
   653  		if webstate.State != structs.TaskStateDead {
   654  			return false, fmt.Errorf("expected web to be dead but found %q", last.TaskStates["web"].State)
   655  		}
   656  		if !webstate.Successful() {
   657  			return false, fmt.Errorf("expected web to have exited successfully")
   658  		}
   659  
   660  		// bad task should have failed
   661  		badstate := last.TaskStates["bad"]
   662  		if badstate.State != structs.TaskStateDead {
   663  			return false, fmt.Errorf("expected bad to be dead but found %q", badstate.State)
   664  		}
   665  		if !badstate.Failed {
   666  			return false, fmt.Errorf("expected bad to have failed: %#v", badstate.Events)
   667  		}
   668  		return true, nil
   669  	}, func(err error) {
   670  		t.Fatalf("err: %v", err)
   671  	})
   672  }
   673  
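// TestAllocRunner_TerminalUpdate_Destroy asserts that updating an allocation
// to a terminal desired status stops its tasks but keeps the persisted state
// and alloc directory until Destroy is called.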
   674  func TestAllocRunner_TerminalUpdate_Destroy(t *testing.T) {
   675  	t.Parallel()
   676  	upd, ar := testAllocRunner(t, false)
   677  
   678  	// Ensure task takes some time
   679  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   680  	task.Driver = "mock_driver"
   681  	task.Config["run_for"] = "10s"
   682  	go ar.Run()
   683  
   684  	testutil.WaitForResult(func() (bool, error) {
   685  		last := upd.Last()
   686  		if last == nil {
   687  			return false, fmt.Errorf("No updates")
   688  		}
   689  		if last.ClientStatus != structs.AllocClientStatusRunning {
   690  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
   691  		}
   692  		return true, nil
   693  	}, func(err error) {
   694  		t.Fatalf("err: %v", err)
   695  	})
   696  
   697  	// Update the alloc to be terminal which should cause the alloc runner to
   698  	// stop the tasks and wait for a destroy.
   699  	update := ar.alloc.Copy()
   700  	update.DesiredStatus = structs.AllocDesiredStatusStop
   701  	ar.Update(update)
   702  
   703  	testutil.WaitForResult(func() (bool, error) {
   704  		last := upd.Last()
   705  		if last == nil {
   706  			return false, fmt.Errorf("No updates")
   707  		}
   708  
   709  		// Check the status has changed.
   710  		if last.ClientStatus != structs.AllocClientStatusComplete {
   711  			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
   712  		}
   713  
   714  		// Check the allocation state still exists
   715  		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
   716  			if !allocationBucketExists(tx, ar.Alloc().ID) {
   717  				return fmt.Errorf("no bucket for alloc")
   718  			}
   719  
   720  			return nil
   721  		}); err != nil {
   722  			return false, fmt.Errorf("state destroyed")
   723  		}
   724  
   725  		// Check the alloc directory still exists
   726  		if _, err := os.Stat(ar.allocDir.AllocDir); err != nil {
   727  			return false, fmt.Errorf("alloc dir destroyed: %v", ar.allocDir.AllocDir)
   728  		}
   729  
   730  		return true, nil
   731  	}, func(err error) {
   732  		t.Fatalf("err: %v", err)
   733  	})
   734  
   735  	// Send the destroy signal and ensure the AllocRunner cleans up.
   736  	ar.Destroy()
   737  
   738  	testutil.WaitForResult(func() (bool, error) {
   739  		last := upd.Last()
   740  		if last == nil {
   741  			return false, fmt.Errorf("No updates")
   742  		}
   743  
   744  		// Check the status has changed.
   745  		if last.ClientStatus != structs.AllocClientStatusComplete {
   746  			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
   747  		}
   748  
   749  		// Check the state was cleaned
   750  		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
   751  			if allocationBucketExists(tx, ar.Alloc().ID) {
   752  				return fmt.Errorf("bucket for alloc exists")
   753  			}
   754  
   755  			return nil
   756  		}); err != nil {
   757  			return false, fmt.Errorf("state not destroyed")
   758  		}
   759  
   760  		// Check the alloc directory was cleaned
   761  		if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
   762  			return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir)
   763  		} else if !os.IsNotExist(err) {
   764  			return false, fmt.Errorf("stat err: %v", err)
   765  		}
   766  
   767  		return true, nil
   768  	}, func(err error) {
   769  		t.Fatalf("err: %v", err)
   770  	})
   771  }
   772  
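// TestAllocRunner_Destroy asserts that destroying a running allocation stops
// its tasks and cleans up both the persisted state and the alloc directory.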
   773  func TestAllocRunner_Destroy(t *testing.T) {
   774  	t.Parallel()
   775  	upd, ar := testAllocRunner(t, false)
   776  
   777  	// Ensure task takes some time
   778  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   779  	task.Driver = "mock_driver"
   780  	task.Config["run_for"] = "10s"
   781  	go ar.Run()
   782  	start := time.Now()
   783  
   784  	// Begin the tear down
   785  	go func() {
   786  		time.Sleep(1 * time.Second)
   787  		ar.Destroy()
   788  	}()
   789  
   790  	testutil.WaitForResult(func() (bool, error) {
   791  		last := upd.Last()
   792  		if last == nil {
   793  			return false, fmt.Errorf("No updates")
   794  		}
   795  
   796  		// Check the status has changed.
   797  		if last.ClientStatus != structs.AllocClientStatusComplete {
   798  			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
   799  		}
   800  
   801  		// Check the state was cleaned
   802  		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
   803  			if allocationBucketExists(tx, ar.Alloc().ID) {
   804  				return fmt.Errorf("bucket for alloc exists")
   805  			}
   806  
   807  			return nil
   808  		}); err != nil {
   809  			return false, fmt.Errorf("state not destroyed: %v", err)
   810  		}
   811  
   812  		// Check the alloc directory was cleaned
   813  		if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
   814  			return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir)
   815  		} else if !os.IsNotExist(err) {
   816  			return false, fmt.Errorf("stat err: %v", err)
   817  		}
   818  
   819  		return true, nil
   820  	}, func(err error) {
   821  		t.Fatalf("err: %v", err)
   822  	})
   823  
   824  	if elapsed := time.Since(start); elapsed > 20*time.Second {
   825  		t.Fatalf("took too long to terminate: %s", elapsed)
   826  	}
   827  }
   828  
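// TestAllocRunner_Update asserts that the alloc runner stores an updated
// allocation passed to Update.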
   829  func TestAllocRunner_Update(t *testing.T) {
   830  	t.Parallel()
   831  	_, ar := testAllocRunner(t, false)
   832  
   833  	// Deep copy the alloc to avoid races when updating
   834  	newAlloc := ar.Alloc().Copy()
   835  
   836  	// Ensure task takes some time
   837  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   838  	task.Driver = "mock_driver"
   839  	task.Config["run_for"] = "10s"
   840  	go ar.Run()
   841  	defer ar.Destroy()
   842  
   843  	// Update the alloc definition
   844  	newAlloc.Name = "FOO"
   845  	newAlloc.AllocModifyIndex++
   846  	ar.Update(newAlloc)
   847  
    848  	// Check that the alloc runner stores the updated allocation.
   849  	testutil.WaitForResult(func() (bool, error) {
   850  		return ar.Alloc().Name == "FOO", nil
   851  	}, func(err error) {
   852  		t.Fatalf("err: %v %#v", err, ar.Alloc())
   853  	})
   854  }
   855  
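// TestAllocRunner_SaveRestoreState asserts that a saved alloc runner state
// can be restored by a new AllocRunner which then resumes running the task.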
   856  func TestAllocRunner_SaveRestoreState(t *testing.T) {
   857  	t.Parallel()
   858  	alloc := mock.Alloc()
   859  	task := alloc.Job.TaskGroups[0].Tasks[0]
   860  	task.Driver = "mock_driver"
   861  	task.Config = map[string]interface{}{
   862  		"exit_code": "0",
   863  		"run_for":   "10s",
   864  	}
   865  
   866  	upd, ar := testAllocRunnerFromAlloc(t, alloc, false)
   867  	go ar.Run()
   868  	defer ar.Destroy()
   869  
   870  	// Snapshot state
   871  	testutil.WaitForResult(func() (bool, error) {
   872  		ar.taskLock.RLock()
   873  		defer ar.taskLock.RUnlock()
   874  		return len(ar.tasks) == 1, nil
   875  	}, func(err error) {
   876  		t.Fatalf("task never started: %v", err)
   877  	})
   878  
   879  	err := ar.SaveState()
   880  	if err != nil {
   881  		t.Fatalf("err: %v", err)
   882  	}
   883  
   884  	// Create a new alloc runner
   885  	l2 := prefixedTestLogger("----- ar2:  ")
   886  	alloc2 := &structs.Allocation{ID: ar.alloc.ID}
   887  	prevAlloc := newAllocWatcher(alloc2, ar, nil, ar.config, l2, "")
   888  	ar2 := NewAllocRunner(l2, ar.config, ar.stateDB, upd.Update,
   889  		alloc2, ar.vaultClient, ar.consulClient, prevAlloc)
   890  	err = ar2.RestoreState()
   891  	if err != nil {
   892  		t.Fatalf("err: %v", err)
   893  	}
   894  	go ar2.Run()
   895  
   896  	testutil.WaitForResult(func() (bool, error) {
   897  		if len(ar2.tasks) != 1 {
   898  			return false, fmt.Errorf("Incorrect number of tasks")
   899  		}
   900  
   901  		last := upd.Last()
   902  		if last == nil {
   903  			return false, nil
   904  		}
   905  
   906  		return last.ClientStatus == structs.AllocClientStatusRunning, nil
   907  	}, func(err error) {
   908  		last := upd.Last()
   909  		t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates["web"])
   910  	})
   911  
   912  	// Destroy and wait
   913  	ar2.Destroy()
   914  	start := time.Now()
   915  
   916  	testutil.WaitForResult(func() (bool, error) {
   917  		alloc := ar2.Alloc()
   918  		if alloc.ClientStatus != structs.AllocClientStatusComplete {
   919  			return false, fmt.Errorf("Bad client status; got %v; want %v", alloc.ClientStatus, structs.AllocClientStatusComplete)
   920  		}
   921  		return true, nil
   922  	}, func(err error) {
   923  		last := upd.Last()
   924  		t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates)
   925  	})
   926  
   927  	if time.Since(start) > time.Duration(testutil.TestMultiplier()*5)*time.Second {
   928  		t.Fatalf("took too long to terminate")
   929  	}
   930  }
   931  
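// TestAllocRunner_SaveRestoreState_TerminalAlloc asserts that restoring a
// terminal allocation keeps its state and alloc directory until the restored
// runner is destroyed.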
   932  func TestAllocRunner_SaveRestoreState_TerminalAlloc(t *testing.T) {
   933  	t.Parallel()
   934  	upd, ar := testAllocRunner(t, false)
   935  	ar.logger = prefixedTestLogger("ar1: ")
   936  
   937  	// Ensure task takes some time
   938  	ar.alloc.Job.TaskGroups[0].Tasks[0].Driver = "mock_driver"
   939  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
   940  	task.Config["run_for"] = "10s"
   941  	go ar.Run()
   942  	defer ar.Destroy()
   943  
   944  	testutil.WaitForResult(func() (bool, error) {
   945  		last := upd.Last()
   946  		if last == nil {
   947  			return false, fmt.Errorf("No updates")
   948  		}
   949  
   950  		if last.ClientStatus != structs.AllocClientStatusRunning {
   951  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
   952  		}
   953  		return true, nil
   954  	}, func(err error) {
   955  		t.Fatalf("err: %v", err)
   956  	})
   957  
   958  	// Update the alloc to be terminal which should cause the alloc runner to
   959  	// stop the tasks and wait for a destroy.
   960  	update := ar.alloc.Copy()
   961  	update.DesiredStatus = structs.AllocDesiredStatusStop
   962  	ar.Update(update)
   963  
   964  	testutil.WaitForResult(func() (bool, error) {
   965  		return ar.Alloc().DesiredStatus == structs.AllocDesiredStatusStop, nil
   966  	}, func(err error) {
   967  		t.Fatalf("err: %v", err)
   968  	})
   969  
   970  	err := ar.SaveState()
   971  	if err != nil {
   972  		t.Fatalf("err: %v", err)
   973  	}
   974  
   975  	// Ensure ar1 doesn't recreate the state file
   976  	ar.allocLock.Lock()
   977  	defer ar.allocLock.Unlock()
   978  
   979  	// Create a new alloc runner
   980  	l2 := prefixedTestLogger("ar2: ")
   981  	alloc2 := &structs.Allocation{ID: ar.alloc.ID}
   982  	prevAlloc := newAllocWatcher(alloc2, ar, nil, ar.config, l2, "")
   983  	ar2 := NewAllocRunner(l2, ar.config, ar.stateDB, upd.Update,
   984  		alloc2, ar.vaultClient, ar.consulClient, prevAlloc)
   985  	err = ar2.RestoreState()
   986  	if err != nil {
   987  		t.Fatalf("err: %v", err)
   988  	}
   989  	ar2.logger.Println("[TESTING] running second alloc runner")
   990  	go ar2.Run()
   991  	defer ar2.Destroy() // Just-in-case of failure before Destroy below
   992  
   993  	testutil.WaitForResult(func() (bool, error) {
   994  		// Check the state still exists
   995  		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
   996  			if !allocationBucketExists(tx, ar2.Alloc().ID) {
   997  				return fmt.Errorf("no bucket for alloc")
   998  			}
   999  
  1000  			return nil
  1001  		}); err != nil {
  1002  			return false, fmt.Errorf("state destroyed")
  1003  		}
  1004  
  1005  		// Check the alloc directory still exists
  1006  		if _, err := os.Stat(ar.allocDir.AllocDir); err != nil {
  1007  			return false, fmt.Errorf("alloc dir destroyed: %v", ar.allocDir.AllocDir)
  1008  		}
  1009  
  1010  		return true, nil
  1011  	}, func(err error) {
  1012  		last := upd.Last()
  1013  		t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates)
  1014  	})
  1015  
  1016  	// Send the destroy signal and ensure the AllocRunner cleans up.
  1017  	ar2.logger.Println("[TESTING] destroying second alloc runner")
  1018  	ar2.Destroy()
  1019  
  1020  	testutil.WaitForResult(func() (bool, error) {
  1021  		last := upd.Last()
  1022  		if last == nil {
  1023  			return false, fmt.Errorf("No updates")
  1024  		}
  1025  
  1026  		// Check the status has changed.
  1027  		if last.ClientStatus != structs.AllocClientStatusComplete {
  1028  			return false, fmt.Errorf("got client status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
  1029  		}
  1030  
  1031  		// Check the state was cleaned
  1032  		if err := ar.stateDB.View(func(tx *bolt.Tx) error {
  1033  			if allocationBucketExists(tx, ar2.Alloc().ID) {
  1034  				return fmt.Errorf("bucket for alloc exists")
  1035  			}
  1036  
  1037  			return nil
  1038  		}); err != nil {
  1039  			return false, fmt.Errorf("state not destroyed")
  1040  		}
  1041  
  1042  		// Check the alloc directory was cleaned
  1043  		if _, err := os.Stat(ar.allocDir.AllocDir); err == nil {
  1044  			return false, fmt.Errorf("alloc dir still exists: %v", ar.allocDir.AllocDir)
  1045  		} else if !os.IsNotExist(err) {
  1046  			return false, fmt.Errorf("stat err: %v", err)
  1047  		}
  1048  
  1049  		return true, nil
  1050  	}, func(err error) {
  1051  		t.Fatalf("err: %v", err)
  1052  	})
  1053  }
  1054  
  1055  // TestAllocRunner_SaveRestoreState_Upgrade asserts that pre-0.6 exec tasks are
  1056  // restarted on upgrade.
  1057  func TestAllocRunner_SaveRestoreState_Upgrade(t *testing.T) {
  1058  	t.Parallel()
  1059  	alloc := mock.Alloc()
  1060  	task := alloc.Job.TaskGroups[0].Tasks[0]
  1061  	task.Driver = "mock_driver"
  1062  	task.Config = map[string]interface{}{
  1063  		"exit_code": "0",
  1064  		"run_for":   "10s",
  1065  	}
  1066  
  1067  	upd, ar := testAllocRunnerFromAlloc(t, alloc, false)
  1068  	// Hack in old version to cause an upgrade on RestoreState
  1069  	origConfig := ar.config.Copy()
  1070  	ar.config.Version = &version.VersionInfo{Version: "0.5.6"}
  1071  	go ar.Run()
  1072  	defer ar.Destroy()
  1073  
  1074  	// Snapshot state
  1075  	testutil.WaitForResult(func() (bool, error) {
  1076  		last := upd.Last()
  1077  		if last == nil {
  1078  			return false, fmt.Errorf("No updates")
  1079  		}
  1080  
  1081  		if last.ClientStatus != structs.AllocClientStatusRunning {
  1082  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusRunning)
  1083  		}
  1084  		return true, nil
  1085  	}, func(err error) {
  1086  		t.Fatalf("task never started: %v", err)
  1087  	})
  1088  
  1089  	err := ar.SaveState()
  1090  	if err != nil {
  1091  		t.Fatalf("err: %v", err)
  1092  	}
  1093  
  1094  	// Create a new alloc runner
  1095  	l2 := prefixedTestLogger("ar2: ")
  1096  	alloc2 := &structs.Allocation{ID: ar.alloc.ID}
  1097  	prevAlloc := newAllocWatcher(alloc2, ar, nil, origConfig, l2, "")
  1098  	ar2 := NewAllocRunner(l2, origConfig, ar.stateDB, upd.Update, alloc2, ar.vaultClient, ar.consulClient, prevAlloc)
  1099  	err = ar2.RestoreState()
  1100  	if err != nil {
  1101  		t.Fatalf("err: %v", err)
  1102  	}
  1103  	go ar2.Run()
  1104  	defer ar2.Destroy() // Just-in-case of failure before Destroy below
  1105  
  1106  	testutil.WaitForResult(func() (bool, error) {
  1107  		last := upd.Last()
  1108  		if last == nil {
  1109  			return false, fmt.Errorf("No updates")
  1110  		}
  1111  		for _, ev := range last.TaskStates["web"].Events {
  1112  			if strings.HasSuffix(ev.RestartReason, pre06ScriptCheckReason) {
  1113  				return true, nil
  1114  			}
  1115  		}
  1116  		return false, fmt.Errorf("no restart with proper reason found")
  1117  	}, func(err error) {
  1118  		last := upd.Last()
  1119  		t.Fatalf("err: %v\nweb state: % #v", err, pretty.Formatter(last.TaskStates["web"]))
  1120  	})
  1121  
  1122  	// Destroy and wait
  1123  	ar2.Destroy()
  1124  	start := time.Now()
  1125  
  1126  	testutil.WaitForResult(func() (bool, error) {
  1127  		alloc := ar2.Alloc()
  1128  		if alloc.ClientStatus != structs.AllocClientStatusComplete {
  1129  			return false, fmt.Errorf("Bad client status; got %v; want %v", alloc.ClientStatus, structs.AllocClientStatusComplete)
  1130  		}
  1131  		return true, nil
  1132  	}, func(err error) {
  1133  		last := upd.Last()
  1134  		t.Fatalf("err: %v %#v %#v", err, last, last.TaskStates)
  1135  	})
  1136  
  1137  	if time.Since(start) > time.Duration(testutil.TestMultiplier()*5)*time.Second {
  1138  		t.Fatalf("took too long to terminate")
  1139  	}
  1140  }
  1141  
  1142  // Ensure pre-#2132 state files containing the Context struct are properly
  1143  // migrated to the new format.
  1144  //
  1145  // Old Context State:
  1146  //
  1147  //  "Context": {
  1148  //    "AllocDir": {
  1149  //      "AllocDir": "/path/to/allocs/2a54fcff-fc44-8d4f-e025-53c48e9cbbbb",
  1150  //      "SharedDir": "/path/to/allocs/2a54fcff-fc44-8d4f-e025-53c48e9cbbbb/alloc",
  1151  //      "TaskDirs": {
  1152  //        "echo1": "/path/to/allocs/2a54fcff-fc44-8d4f-e025-53c48e9cbbbb/echo1"
  1153  //      }
  1154  //    },
  1155  //    "AllocID": "2a54fcff-fc44-8d4f-e025-53c48e9cbbbb"
  1156  //  }
  1157  func TestAllocRunner_RestoreOldState(t *testing.T) {
  1158  	t.Parallel()
  1159  	alloc := mock.Alloc()
  1160  	task := alloc.Job.TaskGroups[0].Tasks[0]
  1161  	task.Driver = "mock_driver"
  1162  	task.Config = map[string]interface{}{
  1163  		"exit_code": "0",
  1164  		"run_for":   "10s",
  1165  	}
  1166  
  1167  	logger := testLogger()
  1168  	conf := config.DefaultConfig()
  1169  	conf.Node = mock.Node()
  1170  	conf.StateDir = os.TempDir()
  1171  	conf.AllocDir = os.TempDir()
  1172  	tmp, err := ioutil.TempFile("", "state-db")
  1173  	if err != nil {
  1174  		t.Fatalf("error creating state db file: %v", err)
  1175  	}
  1176  	db, err := bolt.Open(tmp.Name(), 0600, nil)
  1177  	if err != nil {
  1178  		t.Fatalf("error creating state db: %v", err)
  1179  	}
  1180  
  1181  	if err := os.MkdirAll(filepath.Join(conf.StateDir, "alloc", alloc.ID), 0777); err != nil {
  1182  		t.Fatalf("error creating state dir: %v", err)
  1183  	}
  1184  	statePath := filepath.Join(conf.StateDir, "alloc", alloc.ID, "state.json")
  1185  	w, err := os.Create(statePath)
  1186  	if err != nil {
  1187  		t.Fatalf("error creating state file: %v", err)
  1188  	}
  1189  	tmplctx := &struct {
  1190  		AllocID  string
  1191  		AllocDir string
  1192  	}{alloc.ID, conf.AllocDir}
  1193  	err = template.Must(template.New("test_state").Parse(`{
  1194    "Version": "0.5.1",
  1195    "Alloc": {
  1196      "ID": "{{ .AllocID }}",
  1197      "Name": "example",
  1198      "JobID": "example",
  1199      "Job": {
  1200        "ID": "example",
  1201        "Name": "example",
  1202        "Type": "batch",
  1203        "TaskGroups": [
  1204          {
  1205            "Name": "example",
  1206            "Tasks": [
  1207              {
  1208                "Name": "example",
  1209                "Driver": "mock",
  1210                "Config": {
  1211                  "exit_code": "0",
  1212  		"run_for": "10s"
  1213                }
  1214              }
  1215            ]
  1216          }
  1217        ]
  1218      },
  1219      "TaskGroup": "example",
  1220      "DesiredStatus": "run",
  1221      "ClientStatus": "running",
  1222      "TaskStates": {
  1223        "example": {
  1224          "State": "running",
  1225          "Failed": false,
  1226          "Events": []
  1227        }
  1228      }
  1229    },
  1230    "Context": {
  1231      "AllocDir": {
  1232        "AllocDir": "{{ .AllocDir }}/{{ .AllocID }}",
  1233        "SharedDir": "{{ .AllocDir }}/{{ .AllocID }}/alloc",
  1234        "TaskDirs": {
  1235          "example": "{{ .AllocDir }}/{{ .AllocID }}/example"
  1236        }
  1237      },
  1238      "AllocID": "{{ .AllocID }}"
  1239    }
  1240  }`)).Execute(w, tmplctx)
  1241  	if err != nil {
  1242  		t.Fatalf("error writing state file: %v", err)
  1243  	}
  1244  	w.Close()
  1245  
  1246  	upd := &MockAllocStateUpdater{}
  1247  	*alloc.Job.LookupTaskGroup(alloc.TaskGroup).RestartPolicy = structs.RestartPolicy{Attempts: 0}
  1248  	alloc.Job.Type = structs.JobTypeBatch
  1249  	vclient := vaultclient.NewMockVaultClient()
  1250  	cclient := newMockConsulServiceClient(t)
  1251  	ar := NewAllocRunner(logger, conf, db, upd.Update, alloc, vclient, cclient, noopPrevAlloc{})
  1252  	defer ar.Destroy()
  1253  
   1254  	// RestoreState should fail on the task state since we only test restoring
   1255  	// the alloc state.
  1256  	err = ar.RestoreState()
  1257  	if err == nil {
  1258  		t.Fatal("expected error restoring Task state")
  1259  	}
  1260  	merr, ok := err.(*multierror.Error)
  1261  	if !ok {
  1262  		t.Fatalf("expected RestoreState to return a multierror but found: %T -> %v", err, err)
  1263  	}
  1264  	if len(merr.Errors) != 1 {
  1265  		t.Fatalf("expected exactly 1 error from RestoreState but found: %d: %v", len(merr.Errors), err)
  1266  	}
  1267  	if expected := "failed to get task bucket"; !strings.Contains(merr.Errors[0].Error(), expected) {
  1268  		t.Fatalf("expected %q but got: %q", expected, merr.Errors[0].Error())
  1269  	}
  1270  
  1271  	if err := ar.SaveState(); err != nil {
  1272  		t.Fatalf("error saving new state: %v", err)
  1273  	}
  1274  }
  1275  
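// TestAllocRunner_TaskFailed_KillTG asserts that when one task in a task
// group fails to start, the remaining tasks are killed and the allocation is
// marked failed.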
  1276  func TestAllocRunner_TaskFailed_KillTG(t *testing.T) {
  1277  	t.Parallel()
  1278  	upd, ar := testAllocRunner(t, false)
  1279  
  1280  	// Create two tasks in the task group
  1281  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
  1282  	task.Driver = "mock_driver"
  1283  	task.KillTimeout = 10 * time.Millisecond
  1284  	task.Config = map[string]interface{}{
  1285  		"run_for": "10s",
  1286  	}
  1287  
  1288  	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
  1289  	task2.Name = "task 2"
  1290  	task2.Driver = "mock_driver"
  1291  	task2.Config = map[string]interface{}{
  1292  		"start_error": "fail task please",
  1293  	}
  1294  	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2)
  1295  	ar.alloc.TaskResources[task2.Name] = task2.Resources
  1296  	go ar.Run()
  1297  	defer ar.Destroy()
  1298  
  1299  	testutil.WaitForResult(func() (bool, error) {
  1300  		last := upd.Last()
  1301  		if last == nil {
  1302  			return false, fmt.Errorf("No updates")
  1303  		}
  1304  		if last.ClientStatus != structs.AllocClientStatusFailed {
  1305  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusFailed)
  1306  		}
  1307  
  1308  		// Task One should be killed
  1309  		state1 := last.TaskStates[task.Name]
  1310  		if state1.State != structs.TaskStateDead {
  1311  			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
  1312  		}
  1313  		if len(state1.Events) < 2 {
  1314  			// At least have a received and destroyed
  1315  			return false, fmt.Errorf("Unexpected number of events")
  1316  		}
  1317  
  1318  		found := false
  1319  		for _, e := range state1.Events {
   1320  			if e.Type == structs.TaskSiblingFailed {
  1321  				found = true
  1322  			}
  1323  		}
  1324  
  1325  		if !found {
  1326  			return false, fmt.Errorf("Did not find event %v", structs.TaskSiblingFailed)
  1327  		}
  1328  
  1329  		// Task Two should be failed
  1330  		state2 := last.TaskStates[task2.Name]
  1331  		if state2.State != structs.TaskStateDead {
  1332  			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
  1333  		}
  1334  		if !state2.Failed {
  1335  			return false, fmt.Errorf("task2 should have failed")
  1336  		}
  1337  
  1338  		return true, nil
  1339  	}, func(err error) {
  1340  		t.Fatalf("err: %v", err)
  1341  	})
  1342  }
  1343  
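// TestAllocRunner_TaskLeader_KillTG asserts that when the leader task exits,
// the other tasks in the group are killed and the allocation completes.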
  1344  func TestAllocRunner_TaskLeader_KillTG(t *testing.T) {
  1345  	t.Parallel()
  1346  	upd, ar := testAllocRunner(t, false)
  1347  
  1348  	// Create two tasks in the task group
  1349  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
  1350  	task.Driver = "mock_driver"
  1351  	task.KillTimeout = 10 * time.Millisecond
  1352  	task.Config = map[string]interface{}{
  1353  		"run_for": "10s",
  1354  	}
  1355  
  1356  	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
  1357  	task2.Name = "task 2"
  1358  	task2.Driver = "mock_driver"
  1359  	task2.Leader = true
  1360  	task2.Config = map[string]interface{}{
  1361  		"run_for": "1s",
  1362  	}
  1363  	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2)
  1364  	ar.alloc.TaskResources[task2.Name] = task2.Resources
  1365  	go ar.Run()
  1366  	defer ar.Destroy()
  1367  
  1368  	testutil.WaitForResult(func() (bool, error) {
  1369  		last := upd.Last()
  1370  		if last == nil {
  1371  			return false, fmt.Errorf("No updates")
  1372  		}
  1373  		if last.ClientStatus != structs.AllocClientStatusComplete {
  1374  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
  1375  		}
  1376  
  1377  		// Task One should be killed
  1378  		state1 := last.TaskStates[task.Name]
  1379  		if state1.State != structs.TaskStateDead {
  1380  			return false, fmt.Errorf("got state %v; want %v", state1.State, structs.TaskStateDead)
  1381  		}
  1382  		if state1.FinishedAt.IsZero() || state1.StartedAt.IsZero() {
  1383  			return false, fmt.Errorf("expected to have a start and finish time")
  1384  		}
  1385  		if len(state1.Events) < 2 {
  1386  			// At least have a received and destroyed
  1387  			return false, fmt.Errorf("Unexpected number of events")
  1388  		}
  1389  
  1390  		found := false
  1391  		for _, e := range state1.Events {
   1392  			if e.Type == structs.TaskLeaderDead {
  1393  				found = true
  1394  			}
  1395  		}
  1396  
  1397  		if !found {
  1398  			return false, fmt.Errorf("Did not find event %v", structs.TaskLeaderDead)
  1399  		}
  1400  
  1401  		// Task Two should be dead
  1402  		state2 := last.TaskStates[task2.Name]
  1403  		if state2.State != structs.TaskStateDead {
  1404  			return false, fmt.Errorf("got state %v; want %v", state2.State, structs.TaskStateDead)
  1405  		}
  1406  		if state2.FinishedAt.IsZero() || state2.StartedAt.IsZero() {
  1407  			return false, fmt.Errorf("expected to have a start and finish time")
  1408  		}
  1409  
  1410  		return true, nil
  1411  	}, func(err error) {
  1412  		t.Fatalf("err: %v", err)
  1413  	})
  1414  }
  1415  
  1416  // TestAllocRunner_TaskLeader_StopTG asserts that when stopping a task group
  1417  // with a leader the leader is stopped before other tasks.
  1418  func TestAllocRunner_TaskLeader_StopTG(t *testing.T) {
  1419  	t.Parallel()
  1420  	upd, ar := testAllocRunner(t, false)
  1421  
  1422  	// Create 3 tasks in the task group
  1423  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
  1424  	task.Name = "follower1"
  1425  	task.Driver = "mock_driver"
  1426  	task.KillTimeout = 10 * time.Millisecond
  1427  	task.Config = map[string]interface{}{
  1428  		"run_for": "10s",
  1429  	}
  1430  
  1431  	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
  1432  	task2.Name = "leader"
  1433  	task2.Driver = "mock_driver"
  1434  	task2.Leader = true
  1435  	task2.KillTimeout = 10 * time.Millisecond
  1436  	task2.Config = map[string]interface{}{
  1437  		"run_for": "10s",
  1438  	}
  1439  
  1440  	task3 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
  1441  	task3.Name = "follower2"
  1442  	task3.Driver = "mock_driver"
  1443  	task3.KillTimeout = 10 * time.Millisecond
  1444  	task3.Config = map[string]interface{}{
  1445  		"run_for": "10s",
  1446  	}
  1447  	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2, task3)
  1448  	ar.alloc.TaskResources[task2.Name] = task2.Resources
  1449  	defer ar.Destroy()
  1450  
  1451  	go ar.Run()
  1452  
  1453  	// Wait for tasks to start
  1454  	last := upd.Last()
  1455  	testutil.WaitForResult(func() (bool, error) {
  1456  		last = upd.Last()
  1457  		if last == nil {
  1458  			return false, fmt.Errorf("No updates")
  1459  		}
  1460  		if n := len(last.TaskStates); n != 3 {
  1461  			return false, fmt.Errorf("Not enough task states (want: 3; found %d)", n)
  1462  		}
  1463  		for name, state := range last.TaskStates {
  1464  			if state.State != structs.TaskStateRunning {
  1465  				return false, fmt.Errorf("Task %q is not running yet (it's %q)", name, state.State)
  1466  			}
  1467  		}
  1468  		return true, nil
  1469  	}, func(err error) {
  1470  		t.Fatalf("err: %v", err)
  1471  	})
  1472  
  1473  	// Reset updates
  1474  	upd.mu.Lock()
  1475  	upd.Allocs = upd.Allocs[:0]
  1476  	upd.mu.Unlock()
  1477  
  1478  	// Stop alloc
  1479  	update := ar.Alloc()
  1480  	update.DesiredStatus = structs.AllocDesiredStatusStop
  1481  	ar.Update(update)
  1482  
  1483  	// Wait for tasks to stop
  1484  	testutil.WaitForResult(func() (bool, error) {
  1485  		last := upd.Last()
  1486  		if last == nil {
  1487  			return false, fmt.Errorf("No updates")
  1488  		}
  1489  		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower1"].FinishedAt.UnixNano() {
  1490  			return false, fmt.Errorf("expected leader to finish before follower1: %s >= %s",
  1491  				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower1"].FinishedAt)
  1492  		}
  1493  		if last.TaskStates["leader"].FinishedAt.UnixNano() >= last.TaskStates["follower2"].FinishedAt.UnixNano() {
  1494  			return false, fmt.Errorf("expected leader to finish before follower2: %s >= %s",
  1495  				last.TaskStates["leader"].FinishedAt, last.TaskStates["follower2"].FinishedAt)
  1496  		}
  1497  		return true, nil
  1498  	}, func(err error) {
  1499  		last := upd.Last()
  1500  		for name, state := range last.TaskStates {
  1501  			t.Logf("%s: %s", name, state.State)
  1502  		}
  1503  		t.Fatalf("err: %v", err)
  1504  	})
  1505  }
  1506  
   1507  // TestAllocRunner_TaskLeader_StopRestoredTG asserts that when stopping a
   1508  // restored task group whose leader failed before the restore, the leader is
   1509  // not stopped again since it no longer exists.
  1510  // See https://github.com/hashicorp/nomad/issues/3420#issuecomment-341666932
  1511  func TestAllocRunner_TaskLeader_StopRestoredTG(t *testing.T) {
  1512  	t.Parallel()
  1513  	_, ar := testAllocRunner(t, false)
  1514  	defer ar.Destroy()
  1515  
  1516  	// Create a leader and follower task in the task group
  1517  	task := ar.alloc.Job.TaskGroups[0].Tasks[0]
  1518  	task.Name = "follower1"
  1519  	task.Driver = "mock_driver"
  1520  	task.KillTimeout = 10 * time.Second
  1521  	task.Config = map[string]interface{}{
  1522  		"run_for": "10s",
  1523  	}
  1524  
  1525  	task2 := ar.alloc.Job.TaskGroups[0].Tasks[0].Copy()
  1526  	task2.Name = "leader"
  1527  	task2.Driver = "mock_driver"
  1528  	task2.Leader = true
  1529  	task2.KillTimeout = 10 * time.Millisecond
  1530  	task2.Config = map[string]interface{}{
  1531  		"run_for": "0s",
  1532  	}
  1533  
  1534  	ar.alloc.Job.TaskGroups[0].Tasks = append(ar.alloc.Job.TaskGroups[0].Tasks, task2)
  1535  	ar.alloc.TaskResources[task2.Name] = task2.Resources
  1536  
   1537  	// Mimic Nomad exiting before the dead leader was able to stop the other tasks.
  1538  	ar.tasks = map[string]*TaskRunner{
  1539  		"leader": NewTaskRunner(ar.logger, ar.config, ar.stateDB, ar.setTaskState,
  1540  			ar.allocDir.NewTaskDir(task2.Name), ar.Alloc(), task2.Copy(),
  1541  			ar.vaultClient, ar.consulClient),
  1542  		"follower1": NewTaskRunner(ar.logger, ar.config, ar.stateDB, ar.setTaskState,
  1543  			ar.allocDir.NewTaskDir(task.Name), ar.Alloc(), task.Copy(),
  1544  			ar.vaultClient, ar.consulClient),
  1545  	}
  1546  	ar.taskStates = map[string]*structs.TaskState{
  1547  		"leader":    {State: structs.TaskStateDead},
  1548  		"follower1": {State: structs.TaskStateRunning},
  1549  	}
  1550  	if err := ar.SaveState(); err != nil {
  1551  		t.Fatalf("error saving state: %v", err)
  1552  	}
  1553  
  1554  	// Create a new AllocRunner to test RestoreState and Run
  1555  	upd2 := &MockAllocStateUpdater{}
  1556  	ar2 := NewAllocRunner(ar.logger, ar.config, ar.stateDB, upd2.Update, ar.alloc,
  1557  		ar.vaultClient, ar.consulClient, ar.prevAlloc)
  1558  	defer ar2.Destroy()
  1559  
  1560  	if err := ar2.RestoreState(); err != nil {
  1561  		t.Fatalf("error restoring state: %v", err)
  1562  	}
  1563  	go ar2.Run()
  1564  
  1565  	// Wait for tasks to be stopped because leader is dead
  1566  	testutil.WaitForResult(func() (bool, error) {
  1567  		last := upd2.Last()
  1568  		if last == nil {
  1569  			return false, fmt.Errorf("No updates")
  1570  		}
  1571  		if actual := last.TaskStates["leader"].State; actual != structs.TaskStateDead {
  1572  			return false, fmt.Errorf("Task leader is not dead yet (it's %q)", actual)
  1573  		}
  1574  		if actual := last.TaskStates["follower1"].State; actual != structs.TaskStateDead {
  1575  			return false, fmt.Errorf("Task follower1 is not dead yet (it's %q)", actual)
  1576  		}
  1577  		return true, nil
  1578  	}, func(err error) {
  1579  		last := upd2.Last()
  1580  		for name, state := range last.TaskStates {
  1581  			t.Logf("%s: %s", name, state.State)
  1582  		}
  1583  		t.Fatalf("err: %v", err)
  1584  	})
  1585  
  1586  	// Make sure it GCs properly
  1587  	ar2.Destroy()
  1588  
  1589  	select {
  1590  	case <-ar2.WaitCh():
  1591  		// exited as expected
  1592  	case <-time.After(10 * time.Second):
  1593  		t.Fatalf("timed out waiting for AR to GC")
  1594  	}
  1595  }
  1596  
  1597  // TestAllocRunner_MoveAllocDir asserts that a file written to an alloc's
  1598  // local/ dir will be moved to a replacement alloc's local/ dir if sticky
   1599  // volumes are enabled.
  1600  func TestAllocRunner_MoveAllocDir(t *testing.T) {
  1601  	t.Parallel()
  1602  	// Create an alloc runner
  1603  	alloc := mock.Alloc()
  1604  	task := alloc.Job.TaskGroups[0].Tasks[0]
  1605  	task.Driver = "mock_driver"
  1606  	task.Config = map[string]interface{}{
  1607  		"run_for": "1s",
  1608  	}
  1609  	upd, ar := testAllocRunnerFromAlloc(t, alloc, false)
  1610  	go ar.Run()
  1611  	defer ar.Destroy()
  1612  
  1613  	testutil.WaitForResult(func() (bool, error) {
  1614  		last := upd.Last()
  1615  		if last == nil {
  1616  			return false, fmt.Errorf("No updates")
  1617  		}
  1618  		if last.ClientStatus != structs.AllocClientStatusComplete {
  1619  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
  1620  		}
  1621  		return true, nil
  1622  	}, func(err error) {
  1623  		t.Fatalf("err: %v", err)
  1624  	})
  1625  
  1626  	// Write some data in data dir and task dir of the alloc
  1627  	dataFile := filepath.Join(ar.allocDir.SharedDir, "data", "data_file")
  1628  	ioutil.WriteFile(dataFile, []byte("hello world"), os.ModePerm)
  1629  	taskDir := ar.allocDir.TaskDirs[task.Name]
  1630  	taskLocalFile := filepath.Join(taskDir.LocalDir, "local_file")
  1631  	ioutil.WriteFile(taskLocalFile, []byte("good bye world"), os.ModePerm)
  1632  
  1633  	// Create another alloc runner
  1634  	alloc2 := mock.Alloc()
  1635  	alloc2.PreviousAllocation = ar.allocID
  1636  	alloc2.Job.TaskGroups[0].EphemeralDisk.Sticky = true
  1637  	task = alloc2.Job.TaskGroups[0].Tasks[0]
  1638  	task.Driver = "mock_driver"
  1639  	task.Config = map[string]interface{}{
  1640  		"run_for": "1s",
  1641  	}
  1642  	upd2, ar2 := testAllocRunnerFromAlloc(t, alloc2, false)
  1643  
  1644  	// Set prevAlloc like Client does
  1645  	ar2.prevAlloc = newAllocWatcher(alloc2, ar, nil, ar2.config, ar2.logger, "")
  1646  
  1647  	go ar2.Run()
  1648  	defer ar2.Destroy()
  1649  
  1650  	testutil.WaitForResult(func() (bool, error) {
  1651  		last := upd2.Last()
  1652  		if last == nil {
  1653  			return false, fmt.Errorf("No updates")
  1654  		}
  1655  		if last.ClientStatus != structs.AllocClientStatusComplete {
  1656  			return false, fmt.Errorf("got status %v; want %v", last.ClientStatus, structs.AllocClientStatusComplete)
  1657  		}
  1658  		return true, nil
  1659  	}, func(err error) {
  1660  		t.Fatalf("err: %v", err)
  1661  	})
  1662  
  1663  	// Ensure that data from ar was moved to ar2
  1664  	taskDir = ar2.allocDir.TaskDirs[task.Name]
  1665  	taskLocalFile = filepath.Join(taskDir.LocalDir, "local_file")
  1666  	if fileInfo, _ := os.Stat(taskLocalFile); fileInfo == nil {
  1667  		t.Fatalf("file %v not found", taskLocalFile)
  1668  	}
  1669  
  1670  	dataFile = filepath.Join(ar2.allocDir.SharedDir, "data", "data_file")
  1671  	if fileInfo, _ := os.Stat(dataFile); fileInfo == nil {
  1672  		t.Fatalf("file %v not found", dataFile)
  1673  	}
  1674  }