github.com/bigcommerce/nomad@v0.9.3-bc/client/allocrunner/alloc_runner_unix_test.go

// +build !windows

package allocrunner

import (
	"encoding/json"
	"fmt"
	"os"
	"syscall"
	"testing"
	"time"

	"github.com/hashicorp/nomad/client/consul"
	"github.com/hashicorp/nomad/client/state"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/stretchr/testify/require"
)

// TestAllocRunner_Restore_RunningTerminal asserts that restoring a terminal
// alloc with a running task properly kills the running task. This is meant
// to simulate a Nomad agent crash after receiving an updated alloc with
// DesiredStatus=Stop, persisting the update, but crashing before terminating
// the task.
func TestAllocRunner_Restore_RunningTerminal(t *testing.T) {
	t.Parallel()

	// 1. Run task
	// 2. Shutdown alloc runner
	// 3. Set alloc.DesiredStatus=stop
	// 4. Start new alloc runner
	// 5. Assert task and logmon are cleaned up

	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
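	// mock_driver's run_for controls how long the fake task reports itself
	// as running; 1h keeps it "running" across the shutdown/restore below.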
	task.Config = map[string]interface{}{
		"run_for": "1h",
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
	defer cleanup()

	// Maintain state for subsequent run
	conf.StateDB = state.NewMemDB(conf.Logger)

	// Start and wait for task to be running
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()
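	// destroy is a test helper defined elsewhere in this package that
	// destroys the runner and blocks until its cleanup completes.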
	defer destroy(ar)

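	// testutil.WaitForResult retries the test function until it returns
	// true or retries are exhausted, handing the last error to the second
	// function on failure.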
	testutil.WaitForResult(func() (bool, error) {
		s := ar.AllocState()
		return s.ClientStatus == structs.AllocClientStatusRunning, fmt.Errorf("expected running, got %s", s.ClientStatus)
	}, func(err error) {
		require.NoError(t, err)
	})

	// Shutdown the AR and manually change the state to mimic a crash where
	// a stopped alloc update is received, but Nomad crashes before
	// stopping the alloc.
	ar.Shutdown()
	select {
	case <-ar.ShutdownCh():
	case <-time.After(30 * time.Second):
		require.Fail(t, "AR took too long to exit")
	}

	// Assert logmon is still running. This is a super ugly hack that pulls
	// logmon's PID out of its reattach config, but it does properly ensure
	// logmon gets cleaned up.
	ls, _, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
	require.NoError(t, err)
	require.NotNil(t, ls)

	logmonReattach := struct {
		Pid int
	}{}
	err = json.Unmarshal([]byte(ls.Hooks["logmon"].Data["reattach_config"]), &logmonReattach)
	require.NoError(t, err)

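	// On Unix, os.FindProcess always succeeds; signal 0 then performs an
	// existence check only, delivering no signal but failing if the
	// process is gone (see kill(2)).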
	logmonProc, _ := os.FindProcess(logmonReattach.Pid)
	require.NoError(t, logmonProc.Signal(syscall.Signal(0)))

	// Fake the alloc being terminal when it is restored
	alloc.DesiredStatus = structs.AllocDesiredStatusStop
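	// Bump the indexes to mimic the server having issued a newer version
	// of the alloc than the one last persisted.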
	alloc.ModifyIndex++
	alloc.AllocModifyIndex++

	// Start a new alloc runner and assert it gets stopped
	conf2, cleanup2 := testAllocRunnerConfig(t, alloc)
	defer cleanup2()

	// Use original statedb to maintain hook state
	conf2.StateDB = conf.StateDB

	// Restore, start, and wait for task to be killed
	ar2, err := NewAllocRunner(conf2)
	require.NoError(t, err)

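	// Restore reloads persisted alloc and task state from the StateDB and
	// reattaches to still-running processes (such as logmon) before Run.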
	require.NoError(t, ar2.Restore())

	go ar2.Run()
	defer destroy(ar2)

	select {
	case <-ar2.WaitCh():
	case <-time.After(30 * time.Second):
		require.Fail(t, "AR took too long to stop")
	}

	// Assert logmon was cleaned up
	require.Error(t, logmonProc.Signal(syscall.Signal(0)))

	// Assert consul was cleaned up:
	//   2 removals (canary+noncanary) during prekill
	//   2 removals (canary+noncanary) during exited
	//   2 removals (canary+noncanary) during stop
	consulOps := conf2.Consul.(*consul.MockConsulServiceClient).GetOps()
	require.Len(t, consulOps, 6)
	for _, op := range consulOps {
		require.Equal(t, "remove", op.Op)
	}

	// Assert terminated task event was emitted
	events := ar2.AllocState().TaskStates[task.Name].Events
	require.Len(t, events, 4)
	require.Equal(t, structs.TaskReceived, events[0].Type)
	require.Equal(t, structs.TaskSetup, events[1].Type)
	require.Equal(t, structs.TaskStarted, events[2].Type)
	require.Equal(t, structs.TaskTerminated, events[3].Type)
}

// TestAllocRunner_Restore_CompletedBatch asserts that restoring a completed
// batch alloc doesn't run it again
func TestAllocRunner_Restore_CompletedBatch(t *testing.T) {
	t.Parallel()

	// 1. Run task and wait for it to complete
	// 2. Start new alloc runner
	// 3. Assert task didn't run again

	alloc := mock.Alloc()
	alloc.Job.Type = structs.JobTypeBatch
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "2ms",
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
	defer cleanup()

	// Maintain state for subsequent run
	conf.StateDB = state.NewMemDB(conf.Logger)

	// Start and wait for the task to complete
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()
	defer destroy(ar)

	testutil.WaitForResult(func() (bool, error) {
		s := ar.AllocState()
		if s.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("expected complete, got %s", s.ClientStatus)
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})

	// Once the job finishes, it shouldn't run again
	require.False(t, ar.shouldRun())
	initialRunEvents := ar.AllocState().TaskStates[task.Name].Events
	require.Len(t, initialRunEvents, 4)

	ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
	require.NoError(t, err)
	require.NotNil(t, ls)
	require.Equal(t, structs.TaskStateDead, ts.State)

	// Start a new alloc runner and assert the task doesn't run again
	conf2, cleanup2 := testAllocRunnerConfig(t, alloc)
	defer cleanup2()

	// Use original statedb to maintain hook state
	conf2.StateDB = conf.StateDB

	// Restore and start; the completed task must not run again
	ar2, err := NewAllocRunner(conf2)
	require.NoError(t, err)

	require.NoError(t, ar2.Restore())

	go ar2.Run()
	defer destroy(ar2)

	// AR waitCh must be closed even when task doesn't run again
	select {
	case <-ar2.WaitCh():
	case <-time.After(10 * time.Second):
		require.Fail(t, "alloc.waitCh wasn't closed")
	}

	// TR waitCh must be closed too!
	select {
	case <-ar2.tasks[task.Name].WaitCh():
	case <-time.After(10 * time.Second):
		require.Fail(t, "tr.waitCh wasn't closed")
	}

	// Assert that events are unmodified, which they would not be if the
	// task had re-run
	events := ar2.AllocState().TaskStates[task.Name].Events
	require.Equal(t, initialRunEvents, events)
}

// TestAllocRunner_PreStartFailuresLeadToFailed asserts that if an alloc's
// prestart hooks fail, the alloc and its tasks transition to the failed
// state
func TestAllocRunner_PreStartFailuresLeadToFailed(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()
	alloc.Job.Type = structs.JobTypeBatch
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "2ms",
	}
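	// Disable restarts so the prestart failure immediately leaves the task
	// dead instead of being retried.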
	alloc.Job.TaskGroups[0].RestartPolicy = &structs.RestartPolicy{
		Attempts: 0,
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
	defer cleanup()

	// Maintain state so it can be inspected after the run
	conf.StateDB = state.NewMemDB(conf.Logger)

	// Create the alloc runner; its prestart will fail via the hook below
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)

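	// Inject the failing prestart hook so the alloc fails before any task
	// ever starts.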
	ar.runnerHooks = append(ar.runnerHooks, &allocFailingPrestartHook{})

	go ar.Run()
	defer destroy(ar)

	select {
	case <-ar.WaitCh():
	case <-time.After(10 * time.Second):
		require.Fail(t, "alloc.waitCh wasn't closed")
	}

	testutil.WaitForResult(func() (bool, error) {
		s := ar.AllocState()
		if s.ClientStatus != structs.AllocClientStatusFailed {
			return false, fmt.Errorf("expected failed, got %s", s.ClientStatus)
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})

	// Once the alloc fails, it shouldn't run again
	require.False(t, ar.shouldRun())
	initialRunEvents := ar.AllocState().TaskStates[task.Name].Events
	require.Len(t, initialRunEvents, 2)

	ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
	require.NoError(t, err)
	require.NotNil(t, ls)
	require.NotNil(t, ts)
	require.Equal(t, structs.TaskStateDead, ts.State)
	require.True(t, ts.Failed)

	// TR waitCh must be closed too!
	select {
	case <-ar.tasks[task.Name].WaitCh():
	case <-time.After(10 * time.Second):
		require.Fail(t, "tr.waitCh wasn't closed")
	}
}

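// allocFailingPrestartHook is a runner hook whose Prerun always errors,
// used above to force the alloc's prestart phase to fail.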
type allocFailingPrestartHook struct{}

func (*allocFailingPrestartHook) Name() string { return "failing_prestart" }

func (*allocFailingPrestartHook) Prerun() error {
	return fmt.Errorf("failing prestart hooks")
}