github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/client/allocrunner/alloc_runner_unix_test.go

// +build !windows

package allocrunner

import (
	"encoding/json"
	"fmt"
	"os"
	"syscall"
	"testing"
	"time"

	"github.com/hashicorp/nomad/client/consul"
	"github.com/hashicorp/nomad/client/state"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/stretchr/testify/require"
)

// TestAllocRunner_Restore_RunningTerminal asserts that restoring a terminal
// alloc with a running task properly kills the running task. This is meant
// to simulate a Nomad agent crash after receiving an updated alloc with
// DesiredStatus=Stop, persisting the update, but crashing before terminating
// the task.
func TestAllocRunner_Restore_RunningTerminal(t *testing.T) {
	t.Parallel()

	// 1. Run task
	// 2. Shutdown alloc runner
	// 3. Set alloc.DesiredStatus=stop
	// 4. Start new alloc runner
	// 5. Assert task and logmon are cleaned up

	alloc := mock.Alloc()
	alloc.Job.TaskGroups[0].Services = []*structs.Service{
		{
			Name:      "foo",
			PortLabel: "8888",
		},
	}
	task := alloc.Job.TaskGroups[0].Tasks[0]
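	// mock_driver's run_for keeps the task alive for an hour, so it is still
	// running when the first alloc runner is shut down below.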
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "1h",
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
	defer cleanup()

	// Maintain state for subsequent run
	conf.StateDB = state.NewMemDB(conf.Logger)

	// Start and wait for task to be running
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()
	defer destroy(ar)

	testutil.WaitForResult(func() (bool, error) {
		s := ar.AllocState()
		return s.ClientStatus == structs.AllocClientStatusRunning, fmt.Errorf("expected running, got %s", s.ClientStatus)
	}, func(err error) {
		require.NoError(t, err)
	})

	// Shutdown the AR and manually change the state to mimic a crash where
	// a stopped alloc update is received, but Nomad crashes before
	// stopping the alloc.
	ar.Shutdown()
	select {
	case <-ar.ShutdownCh():
	case <-time.After(30 * time.Second):
		require.Fail(t, "AR took too long to exit")
	}

	// Assert logmon is still running. This is a super ugly hack that pulls
	// logmon's PID out of its reattach config, but it does properly ensure
	// logmon gets cleaned up.
	ls, _, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
	require.NoError(t, err)
	require.NotNil(t, ls)

	logmonReattach := struct {
		Pid int
	}{}
	err = json.Unmarshal([]byte(ls.Hooks["logmon"].Data["reattach_config"]), &logmonReattach)
	require.NoError(t, err)

	logmonProc, _ := os.FindProcess(logmonReattach.Pid)
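	// Signal 0 performs the usual error checking without actually sending a
	// signal, so this succeeds only if the logmon process is still alive.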
	require.NoError(t, logmonProc.Signal(syscall.Signal(0)))

	// Fake alloc terminal during Restore()
	alloc.DesiredStatus = structs.AllocDesiredStatusStop
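	// Bump the indexes to mimic the alloc having been updated server-side.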
	alloc.ModifyIndex++
	alloc.AllocModifyIndex++

	// Start a new alloc runner and assert it gets stopped
	conf2, cleanup2 := testAllocRunnerConfig(t, alloc)
	defer cleanup2()

	// Use original statedb to maintain hook state
	conf2.StateDB = conf.StateDB

	// Restore, start, and wait for task to be killed
	ar2, err := NewAllocRunner(conf2)
	require.NoError(t, err)

	require.NoError(t, ar2.Restore())

	go ar2.Run()
	defer destroy(ar2)

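	// Wait for the restored runner to stop the terminal alloc and exit; if it
	// never does, the assertions below will fail.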
	select {
	case <-ar2.WaitCh():
	case <-time.After(30 * time.Second):
	}

	// Assert logmon was cleaned up
	require.Error(t, logmonProc.Signal(syscall.Signal(0)))

	// Assert consul was cleaned up:
	//   2 removals (canary+noncanary) during prekill
	//   2 removals (canary+noncanary) during exited
	//   2 removals (canary+noncanary) during stop
	//   2 removals (canary+noncanary) for the group service during stop
	consulOps := conf2.Consul.(*consul.MockConsulServiceClient).GetOps()
	require.Len(t, consulOps, 8)
	for _, op := range consulOps {
		require.Equal(t, "remove", op.Op)
	}

	// Assert terminated task event was emitted
	events := ar2.AllocState().TaskStates[task.Name].Events
	require.Len(t, events, 4)
	require.Equal(t, events[0].Type, structs.TaskReceived)
	require.Equal(t, events[1].Type, structs.TaskSetup)
	require.Equal(t, events[2].Type, structs.TaskStarted)
	require.Equal(t, events[3].Type, structs.TaskTerminated)
}

// TestAllocRunner_Restore_CompletedBatch asserts that restoring a completed
// batch alloc doesn't run it again
func TestAllocRunner_Restore_CompletedBatch(t *testing.T) {
	t.Parallel()

	// 1. Run task and wait for it to complete
	// 2. Start new alloc runner
	// 3. Assert task didn't run again

	alloc := mock.Alloc()
	alloc.Job.Type = structs.JobTypeBatch
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "2ms",
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
	defer cleanup()

	// Maintain state for subsequent run
	conf.StateDB = state.NewMemDB(conf.Logger)

	// Start and wait for the alloc to complete
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()
	defer destroy(ar)

	testutil.WaitForResult(func() (bool, error) {
		s := ar.AllocState()
		if s.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("expected complete, got %s", s.ClientStatus)
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})

	// once job finishes, it shouldn't run again
	require.False(t, ar.shouldRun())
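	// A completed run leaves four events: Received, Task Setup, Started, and
	// Terminated.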
	initialRunEvents := ar.AllocState().TaskStates[task.Name].Events
	require.Len(t, initialRunEvents, 4)

	ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
	require.NoError(t, err)
	require.NotNil(t, ls)
	require.Equal(t, structs.TaskStateDead, ts.State)

	// Start a new alloc runner and assert the task is not run again
	conf2, cleanup2 := testAllocRunnerConfig(t, alloc)
	defer cleanup2()

	// Use original statedb to maintain hook state
	conf2.StateDB = conf.StateDB

	// Restore, start, and wait for the runner to exit without re-running the task
	ar2, err := NewAllocRunner(conf2)
	require.NoError(t, err)

	require.NoError(t, ar2.Restore())

	go ar2.Run()
	defer destroy(ar2)

	// AR waitCh must be closed even when task doesn't run again
	select {
	case <-ar2.WaitCh():
	case <-time.After(10 * time.Second):
		require.Fail(t, "alloc.waitCh wasn't closed")
	}

	// TR waitCh must be closed too!
	select {
	case <-ar2.tasks[task.Name].WaitCh():
	case <-time.After(10 * time.Second):
		require.Fail(t, "tr.waitCh wasn't closed")
	}

	// Assert that events are unmodified, which they would not be if the task
	// had re-run
	events := ar2.AllocState().TaskStates[task.Name].Events
	require.Equal(t, initialRunEvents, events)
}

// TestAllocRunner_PreStartFailuresLeadToFailed asserts that if an alloc
// runner prestart hook fails, the alloc and its tasks transition to the
// failed state
func TestAllocRunner_PreStartFailuresLeadToFailed(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()
	alloc.Job.Type = structs.JobTypeBatch
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "2ms",
	}
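	// Disable restarts so the failure is terminal rather than retried.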
	rp := &structs.RestartPolicy{Attempts: 0}
	alloc.Job.TaskGroups[0].RestartPolicy = rp
	task.RestartPolicy = rp

	conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
	defer cleanup()

	// Maintain state for subsequent run
	conf.StateDB = state.NewMemDB(conf.Logger)

	// Create the alloc runner
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)

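	// Inject an alloc runner hook whose Prerun always fails (see
	// allocFailingPrestartHook below).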
	ar.runnerHooks = append(ar.runnerHooks, &allocFailingPrestartHook{})

	go ar.Run()
	defer destroy(ar)

	select {
	case <-ar.WaitCh():
	case <-time.After(10 * time.Second):
		require.Fail(t, "alloc.waitCh wasn't closed")
	}

	testutil.WaitForResult(func() (bool, error) {
		s := ar.AllocState()
		if s.ClientStatus != structs.AllocClientStatusFailed {
			return false, fmt.Errorf("expected failed, got %s", s.ClientStatus)
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})

	// once the alloc fails, it shouldn't run again
	require.False(t, ar.shouldRun())
	initialRunEvents := ar.AllocState().TaskStates[task.Name].Events
	require.Len(t, initialRunEvents, 2)

	ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
	require.NoError(t, err)
	require.NotNil(t, ls)
	require.NotNil(t, ts)
	require.Equal(t, structs.TaskStateDead, ts.State)
	require.True(t, ts.Failed)

	// TR waitCh must be closed too!
	select {
	case <-ar.tasks[task.Name].WaitCh():
	case <-time.After(10 * time.Second):
		require.Fail(t, "tr.waitCh wasn't closed")
	}
}

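// allocFailingPrestartHook is a minimal alloc runner hook whose Prerun always
// returns an error, used above to force prestart failures.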
type allocFailingPrestartHook struct{}

func (*allocFailingPrestartHook) Name() string { return "failing_prestart" }

func (*allocFailingPrestartHook) Prerun() error {
	return fmt.Errorf("failing prestart hooks")
}