github.com/bigcommerce/nomad@v0.9.3-bc/client/allocrunner/alloc_runner_unix_test.go

// +build !windows

package allocrunner

import (
	"encoding/json"
	"fmt"
	"os"
	"syscall"
	"testing"
	"time"

	"github.com/hashicorp/nomad/client/consul"
	"github.com/hashicorp/nomad/client/state"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/stretchr/testify/require"
)

// TestAllocRunner_Restore_RunningTerminal asserts that restoring a terminal
// alloc with a running task properly kills the running task. This is meant
// to simulate a Nomad agent crash after receiving an updated alloc with
// DesiredStatus=Stop, persisting the update, but crashing before terminating
// the task.
func TestAllocRunner_Restore_RunningTerminal(t *testing.T) {
	t.Parallel()

	// 1. Run task
	// 2. Shutdown alloc runner
	// 3. Set alloc.DesiredStatus=stop
	// 4. Start new alloc runner
	// 5. Assert task and logmon are cleaned up

	alloc := mock.Alloc()
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "1h",
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
	defer cleanup()

	// Maintain state for subsequent run
	conf.StateDB = state.NewMemDB(conf.Logger)

	// Start and wait for task to be running
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()
	defer destroy(ar)

	testutil.WaitForResult(func() (bool, error) {
		s := ar.AllocState()
		return s.ClientStatus == structs.AllocClientStatusRunning, fmt.Errorf("expected running, got %s", s.ClientStatus)
	}, func(err error) {
		require.NoError(t, err)
	})

	// Shutdown the AR and manually change the state to mimic a crash where
	// a stopped alloc update is received, but Nomad crashes before
	// stopping the alloc.
	ar.Shutdown()
	select {
	case <-ar.ShutdownCh():
	case <-time.After(30 * time.Second):
		require.Fail(t, "AR took too long to exit")
	}

	// Assert logmon is still running. This is a super ugly hack that pulls
	// logmon's PID out of its reattach config, but it does properly ensure
	// logmon gets cleaned up.
	ls, _, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
	require.NoError(t, err)
	require.NotNil(t, ls)

	logmonReattach := struct {
		Pid int
	}{}
	err = json.Unmarshal([]byte(ls.Hooks["logmon"].Data["reattach_config"]), &logmonReattach)
	require.NoError(t, err)

	// os.FindProcess never returns an error on Unix, so the check below
	// relies on Signal(0) to probe for liveness.
	logmonProc, _ := os.FindProcess(logmonReattach.Pid)
	require.NoError(t, logmonProc.Signal(syscall.Signal(0)))

	// Fake alloc terminal during Restore()
	alloc.DesiredStatus = structs.AllocDesiredStatusStop
	alloc.ModifyIndex++
	alloc.AllocModifyIndex++

	// Start a new alloc runner and assert it gets stopped
	conf2, cleanup2 := testAllocRunnerConfig(t, alloc)
	defer cleanup2()

	// Use original statedb to maintain hook state
	conf2.StateDB = conf.StateDB

	// Restore, start, and wait for task to be killed
	ar2, err := NewAllocRunner(conf2)
	require.NoError(t, err)

	require.NoError(t, ar2.Restore())

	go ar2.Run()
	defer destroy(ar2)

	select {
	case <-ar2.WaitCh():
	case <-time.After(30 * time.Second):
	}

	// Assert logmon was cleaned up
	require.Error(t, logmonProc.Signal(syscall.Signal(0)))

	// Assert consul was cleaned up:
	//   2 removals (canary+noncanary) during prekill
	//   2 removals (canary+noncanary) during exited
	//   2 removals (canary+noncanary) during stop
	consulOps := conf2.Consul.(*consul.MockConsulServiceClient).GetOps()
	require.Len(t, consulOps, 6)
	for _, op := range consulOps {
		require.Equal(t, "remove", op.Op)
	}

	// Assert terminated task event was emitted
	events := ar2.AllocState().TaskStates[task.Name].Events
	require.Len(t, events, 4)
	require.Equal(t, events[0].Type, structs.TaskReceived)
	require.Equal(t, events[1].Type, structs.TaskSetup)
	require.Equal(t, events[2].Type, structs.TaskStarted)
	require.Equal(t, events[3].Type, structs.TaskTerminated)
}
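
// isProcessAlive is a small illustrative sketch, not used by the tests in
// this file, of the liveness-check idiom the logmon assertions above rely
// on: on Unix, sending signal 0 performs existence and permission checks
// without delivering a signal, so a nil error means the PID is still
// running. The helper name is hypothetical; the test inlines the same
// os/syscall calls directly.
func isProcessAlive(pid int) bool {
	proc, err := os.FindProcess(pid)
	if err != nil {
		// On Unix os.FindProcess never fails, but guard anyway.
		return false
	}
	return proc.Signal(syscall.Signal(0)) == nil
}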

// TestAllocRunner_Restore_CompletedBatch asserts that restoring a completed
// batch alloc doesn't run it again.
func TestAllocRunner_Restore_CompletedBatch(t *testing.T) {
	t.Parallel()

	// 1. Run task and wait for it to complete
	// 2. Start new alloc runner
	// 3. Assert task didn't run again

	alloc := mock.Alloc()
	alloc.Job.Type = structs.JobTypeBatch
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "2ms",
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
	defer cleanup()

	// Maintain state for subsequent run
	conf.StateDB = state.NewMemDB(conf.Logger)

	// Start and wait for the task to complete
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()
	defer destroy(ar)

	testutil.WaitForResult(func() (bool, error) {
		s := ar.AllocState()
		if s.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("expected complete, got %s", s.ClientStatus)
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})

	// once the job finishes, it shouldn't run again
	require.False(t, ar.shouldRun())
	initialRunEvents := ar.AllocState().TaskStates[task.Name].Events
	require.Len(t, initialRunEvents, 4)

	ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
	require.NoError(t, err)
	require.NotNil(t, ls)
	require.Equal(t, structs.TaskStateDead, ts.State)

	// Start a new alloc runner and assert the task doesn't run again
	conf2, cleanup2 := testAllocRunnerConfig(t, alloc)
	defer cleanup2()

	// Use original statedb to maintain hook state
	conf2.StateDB = conf.StateDB

	// Restore, start, and assert the task is not run again
	ar2, err := NewAllocRunner(conf2)
	require.NoError(t, err)

	require.NoError(t, ar2.Restore())

	go ar2.Run()
	defer destroy(ar2)

	// AR waitCh must be closed even when task doesn't run again
	select {
	case <-ar2.WaitCh():
	case <-time.After(10 * time.Second):
		require.Fail(t, "alloc.waitCh wasn't closed")
	}

	// TR waitCh must be closed too!
	select {
	case <-ar2.tasks[task.Name].WaitCh():
	case <-time.After(10 * time.Second):
		require.Fail(t, "tr.waitCh wasn't closed")
	}

	// Assert that events are unmodified, which they would not be if the
	// task had run again
	events := ar2.AllocState().TaskStates[task.Name].Events
	require.Equal(t, initialRunEvents, events)
}
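
// waitForClientStatus is an illustrative sketch of the polling pattern used
// in the tests above: block until the alloc runner reports the wanted client
// status or testutil's retry budget is exhausted. The helper is hypothetical
// and not called by these tests, which inline the same WaitForResult calls.
func waitForClientStatus(t *testing.T, ar *allocRunner, want string) {
	testutil.WaitForResult(func() (bool, error) {
		s := ar.AllocState()
		if s.ClientStatus != want {
			return false, fmt.Errorf("expected %s, got %s", want, s.ClientStatus)
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})
}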

// TestAllocRunner_PreStartFailuresLeadToFailed asserts that if an alloc's
// prestart hooks fail, then the alloc and its tasks transition to the
// failed state.
func TestAllocRunner_PreStartFailuresLeadToFailed(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()
	alloc.Job.Type = structs.JobTypeBatch
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "2ms",
	}
	alloc.Job.TaskGroups[0].RestartPolicy = &structs.RestartPolicy{
		Attempts: 0,
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
	defer cleanup()

	// Maintain state for subsequent run
	conf.StateDB = state.NewMemDB(conf.Logger)

	// Create the alloc runner and inject a failing prestart hook
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)

	ar.runnerHooks = append(ar.runnerHooks, &allocFailingPrestartHook{})

	go ar.Run()
	defer destroy(ar)

	select {
	case <-ar.WaitCh():
	case <-time.After(10 * time.Second):
		require.Fail(t, "alloc.waitCh wasn't closed")
	}

	testutil.WaitForResult(func() (bool, error) {
		s := ar.AllocState()
		if s.ClientStatus != structs.AllocClientStatusFailed {
			return false, fmt.Errorf("expected failed, got %s", s.ClientStatus)
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})

	// once the alloc has failed, it shouldn't run again
	require.False(t, ar.shouldRun())
	initialRunEvents := ar.AllocState().TaskStates[task.Name].Events
	require.Len(t, initialRunEvents, 2)

	ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
	require.NoError(t, err)
	require.NotNil(t, ls)
	require.NotNil(t, ts)
	require.Equal(t, structs.TaskStateDead, ts.State)
	require.True(t, ts.Failed)

	// TR waitCh must be closed too!
	select {
	case <-ar.tasks[task.Name].WaitCh():
	case <-time.After(10 * time.Second):
		require.Fail(t, "tr.waitCh wasn't closed")
	}
}

type allocFailingPrestartHook struct{}

func (*allocFailingPrestartHook) Name() string { return "failing_prestart" }

func (*allocFailingPrestartHook) Prerun() error {
	return fmt.Errorf("failing prestart hooks")
}
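
// allocRecordingPrestartHook is an illustrative, hypothetical variation on
// the hook-injection pattern used above: instead of failing, it records that
// Prerun ran so a test could assert the hook executed. It is not used by the
// tests in this file; a test would append it to ar.runnerHooks exactly as
// allocFailingPrestartHook is appended.
type allocRecordingPrestartHook struct {
	ran bool
}

func (*allocRecordingPrestartHook) Name() string { return "recording_prestart" }

func (h *allocRecordingPrestartHook) Prerun() error {
	h.ran = true
	return nil
}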