github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/client/allocrunner/alloc_runner_unix_test.go

//go:build !windows
// +build !windows

package allocrunner

import (
	"encoding/json"
	"fmt"
	"os"
	"syscall"
	"testing"
	"time"

	"github.com/hashicorp/nomad/ci"
	regMock "github.com/hashicorp/nomad/client/serviceregistration/mock"
	"github.com/hashicorp/nomad/client/state"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/stretchr/testify/require"
)

// TestAllocRunner_Restore_RunningTerminal asserts that restoring a terminal
// alloc with a running task properly kills the running task. This is meant
// to simulate a Nomad agent crash after receiving an updated alloc with
// DesiredStatus=Stop, persisting the update, but crashing before terminating
// the task.
func TestAllocRunner_Restore_RunningTerminal(t *testing.T) {
	ci.Parallel(t)

	// 1. Run task
	// 2. Shutdown alloc runner
	// 3. Set alloc.DesiredStatus=stop
	// 4. Start new alloc runner
	// 5. Assert task and logmon are cleaned up

	alloc := mock.Alloc()
	alloc.Job.TaskGroups[0].Services = []*structs.Service{
		{
			Name:      "foo",
			PortLabel: "8888",
			Provider:  structs.ServiceProviderConsul,
		},
	}
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "1h",
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
	defer cleanup()

	// Maintain state for subsequent run
	conf.StateDB = state.NewMemDB(conf.Logger)

	// Start and wait for task to be running
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()
	defer destroy(ar)

	testutil.WaitForResult(func() (bool, error) {
		s := ar.AllocState()
		return s.ClientStatus == structs.AllocClientStatusRunning, fmt.Errorf("expected running, got %s", s.ClientStatus)
	}, func(err error) {
		require.NoError(t, err)
	})

	// Shutdown the AR and manually change the state to mimic a crash where
	// a stopped alloc update is received, but Nomad crashes before
	// stopping the alloc.
	ar.Shutdown()
	select {
	case <-ar.ShutdownCh():
	case <-time.After(30 * time.Second):
		require.Fail(t, "AR took too long to exit")
	}

	// Assert logmon is still running. This is a super ugly hack that pulls
	// logmon's PID out of its reattach config, but it does properly ensure
	// logmon gets cleaned up.
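	// The logmon hook persists a plugin reattach config in the task runner's
	// hook state; decoding its Pid lets the test send signal 0 to the logmon
	// process to check whether it is still alive.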
	ls, _, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
	require.NoError(t, err)
	require.NotNil(t, ls)

	logmonReattach := struct {
		Pid int
	}{}
	err = json.Unmarshal([]byte(ls.Hooks["logmon"].Data["reattach_config"]), &logmonReattach)
	require.NoError(t, err)

	logmonProc, _ := os.FindProcess(logmonReattach.Pid)
	require.NoError(t, logmonProc.Signal(syscall.Signal(0)))

	// Fake the alloc being terminal during Restore()
	alloc.DesiredStatus = structs.AllocDesiredStatusStop
	alloc.ModifyIndex++
	alloc.AllocModifyIndex++

	// Start a new alloc runner and assert it gets stopped
	conf2, cleanup2 := testAllocRunnerConfig(t, alloc)
	defer cleanup2()

	// Use original statedb to maintain hook state
	conf2.StateDB = conf.StateDB

	// Restore, start, and wait for task to be killed
	ar2, err := NewAllocRunner(conf2)
	require.NoError(t, err)

	require.NoError(t, ar2.Restore())

	go ar2.Run()
	defer destroy(ar2)

	select {
	case <-ar2.WaitCh():
	case <-time.After(30 * time.Second):
	}

	// Assert logmon was cleaned up
	require.Error(t, logmonProc.Signal(syscall.Signal(0)))

	// Assert consul was cleaned up:
	//   1 removal during prekill
	//     - removal during exited is de-duped due to prekill
	//     - removal during stop is de-duped due to prekill
	//   1 removal group during stop
	consulOps := conf2.Consul.(*regMock.ServiceRegistrationHandler).GetOps()
	require.Len(t, consulOps, 2)
	for _, op := range consulOps {
		require.Equal(t, "remove", op.Op)
	}

	// Assert terminated task event was emitted
	events := ar2.AllocState().TaskStates[task.Name].Events
	require.Len(t, events, 4)
	require.Equal(t, events[0].Type, structs.TaskReceived)
	require.Equal(t, events[1].Type, structs.TaskSetup)
	require.Equal(t, events[2].Type, structs.TaskStarted)
	require.Equal(t, events[3].Type, structs.TaskTerminated)
}

// TestAllocRunner_Restore_CompletedBatch asserts that restoring a completed
// batch alloc doesn't run it again
func TestAllocRunner_Restore_CompletedBatch(t *testing.T) {
	ci.Parallel(t)

	// 1. Run task and wait for it to complete
	// 2. Start new alloc runner
	// 3. Assert task didn't run again
	alloc := mock.Alloc()
	alloc.Job.Type = structs.JobTypeBatch
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "2ms",
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
	defer cleanup()

	// Maintain state for subsequent run
	conf.StateDB = state.NewMemDB(conf.Logger)

	// Start and wait for the task to complete
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()
	defer destroy(ar)

	testutil.WaitForResult(func() (bool, error) {
		s := ar.AllocState()
		if s.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("expected complete, got %s", s.ClientStatus)
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})

	// once the job finishes, it shouldn't run again
	require.False(t, ar.shouldRun())
	initialRunEvents := ar.AllocState().TaskStates[task.Name].Events
	require.Len(t, initialRunEvents, 4)

	ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
	require.NoError(t, err)
	require.NotNil(t, ls)
	require.Equal(t, structs.TaskStateDead, ts.State)

	// Start a new alloc runner and assert the task doesn't run again
	conf2, cleanup2 := testAllocRunnerConfig(t, alloc)
	defer cleanup2()

	// Use original statedb to maintain hook state
	conf2.StateDB = conf.StateDB

	// Restore and start the new alloc runner
	ar2, err := NewAllocRunner(conf2)
	require.NoError(t, err)

	require.NoError(t, ar2.Restore())

	go ar2.Run()
	defer destroy(ar2)

	// AR waitCh must be open as the task waits for a possible alloc restart.
	select {
	case <-ar2.WaitCh():
		require.Fail(t, "alloc.waitCh was closed")
	default:
	}

	// TR waitCh must be open too!
	select {
	case <-ar2.tasks[task.Name].WaitCh():
		require.Fail(t, "tr.waitCh was closed")
	default:
	}

	// Assert that the events are unmodified, which they wouldn't be if the
	// task had re-run
	events := ar2.AllocState().TaskStates[task.Name].Events
	require.Equal(t, initialRunEvents, events)
}

// TestAllocRunner_PreStartFailuresLeadToFailed asserts that if an alloc's
// prestart hooks fail, then the alloc and its tasks transition to the
// failed state
func TestAllocRunner_PreStartFailuresLeadToFailed(t *testing.T) {
	ci.Parallel(t)

	alloc := mock.Alloc()
	alloc.Job.Type = structs.JobTypeBatch
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "2ms",
	}
	rp := &structs.RestartPolicy{Attempts: 0}
	alloc.Job.TaskGroups[0].RestartPolicy = rp
	task.RestartPolicy = rp

	conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
	defer cleanup()

	// Maintain state for subsequent run
	conf.StateDB = state.NewMemDB(conf.Logger)

	// Create the alloc runner and add a prestart hook that always fails
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)

	ar.runnerHooks = append(ar.runnerHooks, &allocFailingPrestartHook{})

	go ar.Run()
	defer destroy(ar)

	select {
	case <-ar.WaitCh():
	case <-time.After(10 * time.Second):
		require.Fail(t, "alloc.waitCh wasn't closed")
	}

	testutil.WaitForResult(func() (bool, error) {
		s := ar.AllocState()
		if s.ClientStatus != structs.AllocClientStatusFailed {
			return false, fmt.Errorf("expected failed, got %s", s.ClientStatus)
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})

	// once the job finishes, it shouldn't run again
	require.False(t, ar.shouldRun())
	initialRunEvents := ar.AllocState().TaskStates[task.Name].Events
	require.Len(t, initialRunEvents, 2)

	ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
	require.NoError(t, err)
	require.NotNil(t, ls)
	require.NotNil(t, ts)
	require.Equal(t, structs.TaskStateDead, ts.State)
	require.True(t, ts.Failed)

	// TR waitCh must be closed too!
	select {
	case <-ar.tasks[task.Name].WaitCh():
	case <-time.After(10 * time.Second):
		require.Fail(t, "tr.waitCh wasn't closed")
	}
}

type allocFailingPrestartHook struct{}

func (*allocFailingPrestartHook) Name() string { return "failing_prestart" }

func (*allocFailingPrestartHook) Prerun() error {
	return fmt.Errorf("failing prestart hooks")
}
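// allocFailingPrestartHook only needs a Name() string and a Prerun() error
// method for the alloc runner to invoke it during the prerun phase. As a
// sketch (assuming the client/allocrunner/interfaces package exports a
// RunnerPrerunHook interface with exactly this shape), a compile-time
// assertion could make that explicit:
//
//	var _ interfaces.RunnerPrerunHook = (*allocFailingPrestartHook)(nil)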