github.com/iqoqo/nomad@v0.11.3-0.20200911112621-d7021c74d101/client/allocrunner/alloc_runner_unix_test.go

// +build !windows

package allocrunner

import (
	"encoding/json"
	"fmt"
	"os"
	"syscall"
	"testing"
	"time"

	"github.com/hashicorp/nomad/client/consul"
	"github.com/hashicorp/nomad/client/state"
	"github.com/hashicorp/nomad/nomad/mock"
	"github.com/hashicorp/nomad/nomad/structs"
	"github.com/hashicorp/nomad/testutil"
	"github.com/stretchr/testify/require"
)

// TestAllocRunner_Restore_RunningTerminal asserts that restoring a terminal
// alloc with a running task properly kills the running task. This is meant
// to simulate a Nomad agent crash after receiving an updated alloc with
// DesiredStatus=Stop, persisting the update, but crashing before terminating
// the task.
func TestAllocRunner_Restore_RunningTerminal(t *testing.T) {
	t.Parallel()

	// 1. Run task
	// 2. Shutdown alloc runner
	// 3. Set alloc.DesiredStatus=stop
	// 4. Start new alloc runner
	// 5. Assert task and logmon are cleaned up

	alloc := mock.Alloc()
	alloc.Job.TaskGroups[0].Services = []*structs.Service{
		{
			Name:      "foo",
			PortLabel: "8888",
		},
	}
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "1h",
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
	defer cleanup()

	// Maintain state for subsequent run
	conf.StateDB = state.NewMemDB(conf.Logger)

	// Start and wait for task to be running
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()
	defer destroy(ar)

	testutil.WaitForResult(func() (bool, error) {
		s := ar.AllocState()
		return s.ClientStatus == structs.AllocClientStatusRunning, fmt.Errorf("expected running, got %s", s.ClientStatus)
	}, func(err error) {
		require.NoError(t, err)
	})

	// Shutdown the AR and manually change the state to mimic a crash where
	// a stopped alloc update is received, but Nomad crashes before
	// stopping the alloc.
	ar.Shutdown()
	select {
	case <-ar.ShutdownCh():
	case <-time.After(30 * time.Second):
		require.Fail(t, "AR took too long to exit")
	}

	// Assert logmon is still running. This is a super ugly hack that pulls
	// logmon's PID out of its reattach config, but it does properly ensure
	// logmon gets cleaned up.
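	// Sending signal 0 delivers nothing but still runs the kernel's
	// existence and permission checks, so Signal(syscall.Signal(0)) is the
	// standard Unix idiom for asking "is this PID alive?" (see kill(2));
	// the same probe is reused further down to assert the process is gone.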
	ls, _, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
	require.NoError(t, err)
	require.NotNil(t, ls)

	logmonReattach := struct {
		Pid int
	}{}
	err = json.Unmarshal([]byte(ls.Hooks["logmon"].Data["reattach_config"]), &logmonReattach)
	require.NoError(t, err)

	logmonProc, _ := os.FindProcess(logmonReattach.Pid)
	require.NoError(t, logmonProc.Signal(syscall.Signal(0)))

	// Fake the alloc going terminal during Restore(); bumping the modify
	// indexes marks the alloc as updated relative to the persisted copy.
	alloc.DesiredStatus = structs.AllocDesiredStatusStop
	alloc.ModifyIndex++
	alloc.AllocModifyIndex++

	// Start a new alloc runner and assert it gets stopped
	conf2, cleanup2 := testAllocRunnerConfig(t, alloc)
	defer cleanup2()

	// Use original statedb to maintain hook state
	conf2.StateDB = conf.StateDB

	// Restore, start, and wait for task to be killed
	ar2, err := NewAllocRunner(conf2)
	require.NoError(t, err)

	require.NoError(t, ar2.Restore())

	go ar2.Run()
	defer destroy(ar2)

	select {
	case <-ar2.WaitCh():
	case <-time.After(30 * time.Second):
		require.Fail(t, "second AR took too long to exit")
	}

	// Assert logmon was cleaned up
	require.Error(t, logmonProc.Signal(syscall.Signal(0)))

	// Assert consul was cleaned up:
	//   2 removals (canary+noncanary) during prekill
	//   2 removals (canary+noncanary) during exited
	//   2 removals (canary+noncanary) during stop
	//   2 removals (canary+noncanary) group during stop
	consulOps := conf2.Consul.(*consul.MockConsulServiceClient).GetOps()
	require.Len(t, consulOps, 8)
	for _, op := range consulOps {
		require.Equal(t, "remove", op.Op)
	}

	// Assert terminated task event was emitted
	events := ar2.AllocState().TaskStates[task.Name].Events
	require.Len(t, events, 4)
	require.Equal(t, structs.TaskReceived, events[0].Type)
	require.Equal(t, structs.TaskSetup, events[1].Type)
	require.Equal(t, structs.TaskStarted, events[2].Type)
	require.Equal(t, structs.TaskTerminated, events[3].Type)
}
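// waitForClientStatus is an illustrative sketch, not used by the tests in
// this file: it distills the testutil.WaitForResult polling pattern the
// tests repeat into a helper that blocks until the runner reports the
// wanted client status. The *allocRunner parameter type assumes the
// unexported type returned by NewAllocRunner in this package.
func waitForClientStatus(t *testing.T, ar *allocRunner, want string) {
	testutil.WaitForResult(func() (bool, error) {
		s := ar.AllocState()
		if s.ClientStatus != want {
			return false, fmt.Errorf("expected %s, got %s", want, s.ClientStatus)
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})
}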
// TestAllocRunner_Restore_CompletedBatch asserts that restoring a completed
// batch alloc doesn't run it again
func TestAllocRunner_Restore_CompletedBatch(t *testing.T) {
	t.Parallel()

	// 1. Run task and wait for it to complete
	// 2. Start new alloc runner
	// 3. Assert task didn't run again

	alloc := mock.Alloc()
	alloc.Job.Type = structs.JobTypeBatch
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "2ms",
	}

	conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
	defer cleanup()

	// Maintain state for subsequent run
	conf.StateDB = state.NewMemDB(conf.Logger)

	// Start and wait for the task to complete
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	go ar.Run()
	defer destroy(ar)

	testutil.WaitForResult(func() (bool, error) {
		s := ar.AllocState()
		if s.ClientStatus != structs.AllocClientStatusComplete {
			return false, fmt.Errorf("expected complete, got %s", s.ClientStatus)
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})

	// Once the job finishes, it shouldn't run again
	require.False(t, ar.shouldRun())
	initialRunEvents := ar.AllocState().TaskStates[task.Name].Events
	require.Len(t, initialRunEvents, 4)

	ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
	require.NoError(t, err)
	require.NotNil(t, ls)
	require.Equal(t, structs.TaskStateDead, ts.State)

	// Start a new alloc runner and assert the task is not run again
	conf2, cleanup2 := testAllocRunnerConfig(t, alloc)
	defer cleanup2()

	// Use original statedb to maintain hook state
	conf2.StateDB = conf.StateDB

	// Restore and start; the restored runner should see the task as dead
	ar2, err := NewAllocRunner(conf2)
	require.NoError(t, err)

	require.NoError(t, ar2.Restore())

	go ar2.Run()
	defer destroy(ar2)

	// AR waitCh must be closed even when the task doesn't run again
	select {
	case <-ar2.WaitCh():
	case <-time.After(10 * time.Second):
		require.Fail(t, "alloc.waitCh wasn't closed")
	}

	// TR waitCh must be closed too!
	select {
	case <-ar2.tasks[task.Name].WaitCh():
	case <-time.After(10 * time.Second):
		require.Fail(t, "tr.waitCh wasn't closed")
	}

	// Assert that events are unmodified; they would have changed if the
	// task had re-run
	events := ar2.AllocState().TaskStates[task.Name].Events
	require.Equal(t, initialRunEvents, events)
}
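// newRestoredRunner is an illustrative sketch of the restart-simulation
// pattern shared by the tests above, and is not called by them: build a
// second runner config for the same alloc, point it at the previous
// runner's StateDB so persisted hook and task state survive, and Restore()
// before running. The exact signature is an assumption for illustration.
func newRestoredRunner(t *testing.T, alloc *structs.Allocation, db state.StateDB) (*allocRunner, func()) {
	conf, cleanup := testAllocRunnerConfig(t, alloc)
	conf.StateDB = db // reuse the state persisted by the previous runner
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)
	require.NoError(t, ar.Restore())
	return ar, cleanup
}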
// TestAllocRunner_PreStartFailuresLeadToFailed asserts that if an alloc's
// prestart hooks fail, then the alloc and its tasks transition to the
// failed state
func TestAllocRunner_PreStartFailuresLeadToFailed(t *testing.T) {
	t.Parallel()

	alloc := mock.Alloc()
	alloc.Job.Type = structs.JobTypeBatch
	task := alloc.Job.TaskGroups[0].Tasks[0]
	task.Driver = "mock_driver"
	task.Config = map[string]interface{}{
		"run_for": "2ms",
	}
	rp := &structs.RestartPolicy{Attempts: 0}
	alloc.Job.TaskGroups[0].RestartPolicy = rp
	task.RestartPolicy = rp

	conf, cleanup := testAllocRunnerConfig(t, alloc.Copy())
	defer cleanup()

	// Maintain state for subsequent run
	conf.StateDB = state.NewMemDB(conf.Logger)

	// Start the runner with a failing prestart hook injected
	ar, err := NewAllocRunner(conf)
	require.NoError(t, err)

	ar.runnerHooks = append(ar.runnerHooks, &allocFailingPrestartHook{})

	go ar.Run()
	defer destroy(ar)

	select {
	case <-ar.WaitCh():
	case <-time.After(10 * time.Second):
		require.Fail(t, "alloc.waitCh wasn't closed")
	}

	testutil.WaitForResult(func() (bool, error) {
		s := ar.AllocState()
		if s.ClientStatus != structs.AllocClientStatusFailed {
			return false, fmt.Errorf("expected failed, got %s", s.ClientStatus)
		}
		return true, nil
	}, func(err error) {
		require.NoError(t, err)
	})

	// Once the alloc fails, it shouldn't run again
	require.False(t, ar.shouldRun())
	initialRunEvents := ar.AllocState().TaskStates[task.Name].Events
	require.Len(t, initialRunEvents, 2)

	ls, ts, err := conf.StateDB.GetTaskRunnerState(alloc.ID, task.Name)
	require.NoError(t, err)
	require.NotNil(t, ls)
	require.NotNil(t, ts)
	require.Equal(t, structs.TaskStateDead, ts.State)
	require.True(t, ts.Failed)

	// TR waitCh must be closed too!
	select {
	case <-ar.tasks[task.Name].WaitCh():
	case <-time.After(10 * time.Second):
		require.Fail(t, "tr.waitCh wasn't closed")
	}
}

type allocFailingPrestartHook struct{}

func (*allocFailingPrestartHook) Name() string { return "failing_prestart" }

func (*allocFailingPrestartHook) Prerun() error {
	return fmt.Errorf("failing prestart hooks")
}
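// isProcessAlive is an illustrative sketch of the PID probe used in
// TestAllocRunner_Restore_RunningTerminal, not called by these tests. On
// Unix, os.FindProcess always succeeds whether or not the PID exists, so
// the Signal(0) call performs the actual liveness check.
func isProcessAlive(pid int) bool {
	proc, err := os.FindProcess(pid)
	if err != nil {
		return false // not reachable on Unix, but handled for completeness
	}
	return proc.Signal(syscall.Signal(0)) == nil
}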