github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/client/allocrunner/alloc_runner_unix_test.go

//go:build !windows
// +build !windows

package allocrunner

import (
	"encoding/json"
	"fmt"
	"os"
	"syscall"
	"testing"
	"time"

	"github.com/hashicorp/nomad/ci"
	regMock "github.com/hashicorp/nomad/client/serviceregistration/mock"
	"github.com/hashicorp/nomad/client/state"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/stretchr/testify/require"
)

// TestAllocRunner_Restore_RunningTerminal asserts that restoring a terminal
// alloc with a running task properly kills the running task. This is meant
// to simulate a Nomad agent crash after receiving an updated alloc with
// DesiredStatus=Stop, persisting the update, but crashing before terminating
// the task.
func TestAllocRunner_Restore_RunningTerminal(t *testing.T) {
	ci.Parallel(t)

	// 1. Run task
	// 2. Shutdown alloc runner
	// 3. Set alloc.DesiredStatus = stop
	// 4. Start new alloc runner
	// 5. Assert task and logmon are cleaned up

	alloc := mock.Alloc()
	alloc.Job.TaskGroups[0].Services = []*structs.Service{
		{
			Name:      "foo",
			PortLabel: "8888",
			Provider:  structs.ServiceProviderConsul,
		},
	}
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "1h",
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
	defer cleanup()

	// Maintain state for subsequent run
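	// (the in-memory StateDB below is shared with the second alloc runner,
	// which is what makes Restore possible across the simulated crash)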
	conf.StateDB = state.NewMemDB(conf.Logger)

	// Start and wait for task to be running
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()
	defer destroy(ar)

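	// testutil.WaitForResult retries the condition until it returns true or
	// retries are exhausted; the returned error is only handed to the
	// failure func on timeout, so returning a non-nil error alongside a
	// true result is harmless.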
	testutil.WaitForResult(func() (bool, error) {
		s := ar.AllocState()
		return s.ClientStatus == structs.AllocClientStatusRunning, fmt.Errorf("expected running, got %s", s.ClientStatus)
	}, func(err error) {
		require.NoError(t, err)
	})

	// Shutdown the AR and manually change the state to mimic a crash where
	// a stopped alloc update is received, but Nomad crashes before
	// stopping the alloc.
	ar.Shutdown()
	select {
	case <-ar.ShutdownCh():
	case <-time.After(30 * time.Second):
		require.Fail(t, "AR took too long to exit")
	}

	// Assert logmon is still running. This is a super ugly hack that pulls
	// logmon's PID out of its reattach config, but it does properly ensure
	// logmon gets cleaned up.
	ls, _, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
	require.NoError(t, err)
	require.NotNil(t, ls)

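	// The logmon hook persists its go-plugin reattach config as JSON; only
	// the Pid field is decoded here.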
	logmonReattach := struct {
		Pid int
	}{}
	err = json.Unmarshal([]byte(ls.Hooks["logmon"].Data["reattach_config"]), &logmonReattach)
	require.NoError(t, err)

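	// os.FindProcess never fails on Unix, and signal 0 performs error
	// checking without delivering a signal (see kill(2)), so a nil error
	// means the logmon process is still alive.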
	logmonProc, _ := os.FindProcess(logmonReattach.Pid)
	require.NoError(t, logmonProc.Signal(syscall.Signal(0)))

	// Fake alloc terminal during Restore()
	alloc.DesiredStatus = structs.AllocDesiredStatusStop
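	// Bump the indexes to mimic a newer, server-sent version of the alloc.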
	alloc.ModifyIndex++
	alloc.AllocModifyIndex++

	// Start a new alloc runner and assert it gets stopped
	conf2, cleanup2 := testAllocRunnerConfig(t, alloc)
	defer cleanup2()

	// Use original statedb to maintain hook state
	conf2.StateDB = conf.StateDB

	// Restore, start, and wait for task to be killed
	ar2, err := NewAllocRunner(conf2)
	require.NoError(t, err)

	require.NoError(t, ar2.Restore())

	go ar2.Run()
	defer destroy(ar2)

	select {
	case <-ar2.WaitCh():
	case <-time.After(30 * time.Second):
		require.Fail(t, "ar2 took too long to exit")
	}

	// Assert logmon was cleaned up
	require.Error(t, logmonProc.Signal(syscall.Signal(0)))

	// Assert consul was cleaned up:
	//   1 removal during prekill
	//    - removal during exited is de-duped due to prekill
	//    - removal during stop is de-duped due to prekill
	//   1 removal group during stop
	consulOps := conf2.Consul.(*regMock.ServiceRegistrationHandler).GetOps()
	require.Len(t, consulOps, 2)
	for _, op := range consulOps {
		require.Equal(t, "remove", op.Op)
	}

	// Assert terminated task event was emitted
	events := ar2.AllocState().TaskStates[task.Name].Events
	require.Len(t, events, 4)
	require.Equal(t, events[0].Type, structs.TaskReceived)
	require.Equal(t, events[1].Type, structs.TaskSetup)
	require.Equal(t, events[2].Type, structs.TaskStarted)
	require.Equal(t, events[3].Type, structs.TaskTerminated)
}

// TestAllocRunner_Restore_CompletedBatch asserts that restoring a completed
// batch alloc doesn't run it again
func TestAllocRunner_Restore_CompletedBatch(t *testing.T) {
	ci.Parallel(t)

	// 1. Run task and wait for it to complete
	// 2. Start new alloc runner
	// 3. Assert task didn't run again

	alloc := mock.Alloc()
	alloc.Job.Type = structs.JobTypeBatch
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
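	// The mock driver exits successfully once run_for elapses, so the batch
	// task completes almost immediately.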
	task.Config = map[string]interface{}{
		"run_for": "2ms",
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
	defer cleanup()

	// Maintain state for subsequent run
	conf.StateDB = state.NewMemDB(conf.Logger)

	// Start and wait for the task to complete
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()
	defer destroy(ar)

	testutil.WaitForResult(func() (bool, error) {
		s := ar.AllocState()
		if s.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("expected complete, got %s", s.ClientStatus)
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})

	// once job finishes, it shouldn't run again
	require.False(t, ar.shouldRun())
	initialRunEvents := ar.AllocState().TaskStates[task.Name].Events
	require.Len(t, initialRunEvents, 4)

	ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
	require.NoError(t, err)
	require.NotNil(t, ls)
	require.Equal(t, structs.TaskStateDead, ts.State)

	// Start a new alloc runner and assert the task is not run again
	conf2, cleanup2 := testAllocRunnerConfig(t, alloc)
	defer cleanup2()

	// Use original statedb to maintain hook state
	conf2.StateDB = conf.StateDB

	// Restore and start; the completed task must not run again
	ar2, err := NewAllocRunner(conf2)
	require.NoError(t, err)

	require.NoError(t, ar2.Restore())

	go ar2.Run()
	defer destroy(ar2)

	// AR waitCh must be open as the task waits for a possible alloc restart.
	select {
	case <-ar2.WaitCh():
		require.Fail(t, "alloc.waitCh was closed")
	default:
	}

	// TR waitCh must be open too!
	select {
	case <-ar2.tasks[task.Name].WaitCh():
		require.Fail(t, "tr.waitCh was closed")
	default:
	}

	// Assert that the events are unmodified, which they would not be if the
	// task had run again
	events := ar2.AllocState().TaskStates[task.Name].Events
	require.Equal(t, initialRunEvents, events)
}

// TestAllocRunner_PreStartFailuresLeadToFailed asserts that if an alloc's
// prestart hooks fail, then the alloc and its tasks transition to a failed
// state.
func TestAllocRunner_PreStartFailuresLeadToFailed(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.Alloc()
	alloc.Job.Type = structs.JobTypeBatch
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "2ms",
	}
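	// Disable restarts so the prestart failure is not retried and the task
	// goes straight to dead/failed.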
	rp := &structs.RestartPolicy{Attempts: 0}
	alloc.Job.TaskGroups[0].RestartPolicy = rp
	task.RestartPolicy = rp

	conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
	defer cleanup()

	// Maintain state for subsequent run
	conf.StateDB = state.NewMemDB(conf.Logger)

	// Create the alloc runner and inject a failing prestart hook
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)

	ar.runnerHooks = append(ar.runnerHooks, &allocFailingPrestartHook{})

	go ar.Run()
	defer destroy(ar)

	select {
	case <-ar.WaitCh():
	case <-time.After(10 * time.Second):
		require.Fail(t, "alloc.waitCh wasn't closed")
	}

	testutil.WaitForResult(func() (bool, error) {
		s := ar.AllocState()
		if s.ClientStatus != structs.AllocClientStatusFailed {
			return false, fmt.Errorf("expected failed, got %s", s.ClientStatus)
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})

	// once job finishes, it shouldn't run again
	require.False(t, ar.shouldRun())
	initialRunEvents := ar.AllocState().TaskStates[task.Name].Events
	require.Len(t, initialRunEvents, 2)

	ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
	require.NoError(t, err)
	require.NotNil(t, ls)
	require.NotNil(t, ts)
	require.Equal(t, structs.TaskStateDead, ts.State)
	require.True(t, ts.Failed)

	// TR waitCh must be closed too!
	select {
	case <-ar.tasks[task.Name].WaitCh():
	case <-time.After(10 * time.Second):
		require.Fail(t, "tr.waitCh wasn't closed")
	}
}

type allocFailingPrestartHook struct{}

func (*allocFailingPrestartHook) Name() string { return "failing_prestart" }

func (*allocFailingPrestartHook) Prerun() error {
	return fmt.Errorf("failing prestart hooks")
}
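
// A compile-time assertion is a common way to keep a test hook in sync with
// the hook interface it is meant to satisfy. It is sketched here as a
// comment because the exact interface name (interfaces.RunnerPrerunHook from
// client/allocrunner/interfaces) is an assumption to verify before adding
// the import:
//
//	var _ interfaces.RunnerPrerunHook = (*allocFailingPrestartHook)(nil)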