github.com/bigcommerce/nomad@v0.9.3-bc/e2e/clientstate/clientstate.go

package clientstate

import (
    "bytes"
    "fmt"
    "io/ioutil"
    "math/rand"
    "net/http"
    "os"
    "os/exec"
    "path/filepath"
    "strconv"
    "syscall"
    "time"

    "github.com/hashicorp/nomad/api"
    "github.com/hashicorp/nomad/client/state"
    "github.com/hashicorp/nomad/e2e/e2eutil"
    "github.com/hashicorp/nomad/e2e/execagent"
    "github.com/hashicorp/nomad/e2e/framework"
    "github.com/hashicorp/nomad/helper/discover"
    "github.com/hashicorp/nomad/helper/testlog"
    "github.com/hashicorp/nomad/helper/uuid"
    "github.com/hashicorp/nomad/testutil"
)

func init() {
    framework.AddSuites(&framework.TestSuite{
        Component:   "clientstate",
        CanRunLocal: true,
        Cases: []framework.TestCase{
            &ClientStateTC{},
        },
    })
}

type ClientStateTC struct {
    framework.TC

    // bin is the path to the Nomad binary
    bin string
}

func (tc *ClientStateTC) BeforeAll(f *framework.F) {
    if os.Getenv("NOMAD_TEST_STATE") == "" {
        f.T().Skip("Skipping very slow state corruption test unless NOMAD_TEST_STATE=1")
    }

    bin, err := discover.NomadExecutable()
    f.NoError(err)
    tc.bin = bin
}

// getPID reads the pid file a task wrote into its alloc dir and parses it.
// The file is expected to contain exactly one line: the pid followed by a
// trailing newline.
func getPID(client *api.Client, alloc *api.Allocation, path string) (int, error) {
    allocfs := client.AllocFS()
    r, err := allocfs.Cat(alloc, path, nil)
    if err != nil {
        return 0, err
    }
    defer r.Close()

    out, err := ioutil.ReadAll(r)
    if err != nil {
        return 0, err
    }

    lines := bytes.SplitN(out, []byte{'\n'}, 2)
    if len(lines) != 2 || len(lines[1]) > 0 {
        return 0, fmt.Errorf("expected 1 line, not %q", string(out))
    }

    // Capture pid
    pid, err := strconv.Atoi(string(lines[0]))
    if err != nil {
        return 0, err
    }

    return pid, nil
}
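
// A minimal usage sketch (not part of the original file): the tests below
// poll getPID until the task under test has written its pid file, e.g.
//
//	pid, err := getPID(client, alloc, "sleeper/pid")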

// TestClientState_Kill force kills Nomad agents and restarts them in a tight
// loop to assert Nomad is crash safe.
func (tc *ClientStateTC) TestClientState_Kill(f *framework.F) {
    t := f.T()
    t.Parallel()

    serverOut := testlog.NewPrefixWriter(t, "SERVER: ")
    clientOut := testlog.NewPrefixWriter(t, "CLIENT: ")
    serverAgent, clientAgent, err := execagent.NewClientServerPair(tc.bin, serverOut, clientOut)
    f.NoError(err)

    f.NoError(serverAgent.Start())
    defer serverAgent.Destroy()
    f.NoError(clientAgent.Start())
    defer clientAgent.Destroy()

    // Get a client for the server agent to use even while the client is
    // down.
    client, err := serverAgent.Client()
    f.NoError(err)

    jobID := "sleeper-" + uuid.Generate()[:8]
    allocs := e2eutil.RegisterAndWaitForAllocs(t, client, "clientstate/sleeper.nomad", jobID)
    f.Len(allocs, 1)

    alloc, _, err := client.Allocations().Info(allocs[0].ID, nil)
    f.NoError(err)

    defer func() {
        if _, _, err := client.Jobs().Deregister(jobID, false, nil); err != nil {
            t.Logf("error stopping job: %v", err)
        }

        testutil.WaitForResult(func() (bool, error) {
            sum, _, err := client.Jobs().Summary(jobID, nil)
            if err != nil {
                return false, err
            }
            if r := sum.Summary["sleeper"].Running; r > 0 {
                return false, fmt.Errorf("still running: %d", r)
            }
            return true, nil
        }, func(err error) {
            f.NoError(err)
        })

        //XXX Must use client agent for gc'ing allocs?
        clientAPI, err := clientAgent.Client()
        f.NoError(err)
        if err := clientAPI.Allocations().GC(alloc, nil); err != nil {
            t.Logf("error garbage collecting alloc: %v", err)
        }

        if err := client.System().GarbageCollect(); err != nil {
            t.Logf("error doing full gc: %v", err)
        }

        //HACK to wait until things have GC'd
        time.Sleep(time.Second)
    }()

    assertHealthy := func() {
        t.Helper()
        testutil.WaitForResult(func() (bool, error) {
            alloc, _, err = client.Allocations().Info(alloc.ID, nil)
            f.NoError(err) // should never error

            if len(alloc.TaskStates) == 0 {
                return false, fmt.Errorf("waiting for tasks to start")
            }

            if s := alloc.TaskStates["sleeper"].State; s != "running" {
                return false, fmt.Errorf("task should be running: %q", s)
            }

            // Restarts should never happen
            f.Zero(alloc.TaskStates["sleeper"].Restarts)
            return true, nil
        }, func(err error) {
            f.NoError(err)
        })
    }
    assertHealthy()

    // Find pid
    pid := 0
    testutil.WaitForResult(func() (bool, error) {
        pid, err = getPID(client, alloc, "sleeper/pid")
        return pid > 0, err
    }, func(err error) {
        f.NoError(err)
    })

    // Kill and restart a few times
    tries := 10
    for i := 0; i < tries; i++ {
        t.Logf("TEST RUN %d/%d", i+1, tries)

        // Kill -9 the Agent
        agentPid := clientAgent.Cmd.Process.Pid
        f.NoError(clientAgent.Cmd.Process.Signal(os.Kill))

        state, err := clientAgent.Cmd.Process.Wait()
        f.NoError(err)
        f.False(state.Exited()) // kill signal != exited
        f.False(state.Success())

        // Assert sleeper is still running
        f.NoError(syscall.Kill(pid, 0))
        assertHealthy()

        // Should not be able to reach its filesystem
        _, err = getPID(client, alloc, "sleeper/pid")
        f.Error(err)

        // Restart the agent (have to create a new Cmd)
        clientAgent.Cmd = exec.Command(clientAgent.BinPath, "agent",
            "-config", clientAgent.ConfFile,
            "-data-dir", clientAgent.DataDir,
            "-servers", fmt.Sprintf("127.0.0.1:%d", serverAgent.Vars.RPC),
        )
        clientAgent.Cmd.Stdout = clientOut
        clientAgent.Cmd.Stderr = clientOut
        f.NoError(clientAgent.Start())

        // Assert a new process did start
        f.NotEqual(clientAgent.Cmd.Process.Pid, agentPid)

        // Retrieving the pid should work once it restarts
        testutil.WaitForResult(func() (bool, error) {
            newPid, err := getPID(client, alloc, "sleeper/pid")
            return newPid == pid, err
        }, func(err error) {
            f.NoError(err)
        })

        // Alloc should still be running
        assertHealthy()
    }
}
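
// restartClientAgent is a hypothetical helper, not part of the original
// file: it sketches how the three identical "rebuild Cmd and Start" blocks
// in these tests could be factored out. It assumes the agent type returned
// by execagent.NewClientServerPair is *execagent.NomadAgent and reuses the
// previous Cmd's output writers.
func restartClientAgent(clientAgent, serverAgent *execagent.NomadAgent) error {
    // Preserve the prefix writers wired up by the test.
    stdout, stderr := clientAgent.Cmd.Stdout, clientAgent.Cmd.Stderr

    // A killed process's Cmd cannot be reused; build a fresh one.
    clientAgent.Cmd = exec.Command(clientAgent.BinPath, "agent",
        "-config", clientAgent.ConfFile,
        "-data-dir", clientAgent.DataDir,
        "-servers", fmt.Sprintf("127.0.0.1:%d", serverAgent.Vars.RPC),
    )
    clientAgent.Cmd.Stdout = stdout
    clientAgent.Cmd.Stderr = stderr
    return clientAgent.Start()
}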

// TestClientState_KillDuringRestart force kills Nomad agents and restarts them
// in a tight loop to assert Nomad is crash safe while a task is restarting.
func (tc *ClientStateTC) TestClientState_KillDuringRestart(f *framework.F) {
    t := f.T()
    t.Parallel()

    serverOut := testlog.NewPrefixWriter(t, "SERVER: ")
    clientOut := testlog.NewPrefixWriter(t, "CLIENT: ")
    serverAgent, clientAgent, err := execagent.NewClientServerPair(tc.bin, serverOut, clientOut)
    f.NoError(err)

    f.NoError(serverAgent.Start())
    defer serverAgent.Destroy()

    f.NoError(clientAgent.Start())
    defer clientAgent.Destroy()

    // Get a client for the server agent to use even while the client is
    // down.
    client, err := serverAgent.Client()
    f.NoError(err)

    jobID := "restarter-" + uuid.Generate()[:8]
    allocs := e2eutil.RegisterAndWaitForAllocs(t, client, "clientstate/restarter.nomad", jobID)
    f.Len(allocs, 1)

    alloc, _, err := client.Allocations().Info(allocs[0].ID, nil)
    f.NoError(err)

    defer func() {
        //FIXME(schmichael): this cleanup is insufficient, but I can't
        // figure out how to fix it
        client.Jobs().Deregister(jobID, false, nil)
        client.System().GarbageCollect()
        time.Sleep(time.Second)
    }()

    var restarts uint64
    testutil.WaitForResult(func() (bool, error) {
        alloc, _, err = client.Allocations().Info(alloc.ID, nil)
        f.NoError(err) // should never error

        if len(alloc.TaskStates) == 0 {
            return false, fmt.Errorf("waiting for tasks to start")
        }

        n := alloc.TaskStates["restarter"].Restarts
        if n < restarts {
            // Restarts should never decrease; immediately fail
            f.Failf("restarts decreased", "%d < %d", n, restarts)
        }

        // Capture current restarts
        restarts = n
        return true, nil
    }, func(err error) {
        f.NoError(err)
    })

    dice := rand.New(rand.NewSource(time.Now().UnixNano()))

    // Kill and restart the agent a few times
    i := 0
    for deadline := time.Now().Add(5 * time.Minute); time.Now().Before(deadline); {
        i++
        sleep := time.Duration(1500+dice.Int63n(6000)) * time.Millisecond
        t.Logf("[TEST] ===> Run %d (pid: %d sleeping for %v; last restarts: %d)", i, clientAgent.Cmd.Process.Pid, sleep, restarts)

        time.Sleep(sleep)

        // Ensure restarts are progressing
        alloc, _, err = client.Allocations().Info(alloc.ID, nil)
        f.NoError(err) // should never error
        n := alloc.TaskStates["restarter"].Restarts
        if n < restarts {
            // Restarts should never decrease; immediately fail
            f.Failf("restarts decreased", "%d < %d", n, restarts)
        }
        if i > 5 && n == 0 {
            // At least one restart should have happened by now
            f.Failf("no restarts", "expected at least 1 restart after %d tries", i)
        }
        restarts = n

        // Kill -9 the Agent
        agentPid := clientAgent.Cmd.Process.Pid
        f.NoError(clientAgent.Cmd.Process.Signal(os.Kill))
        t.Logf("[TEST] ===> Killed %d", agentPid)

        state, err := clientAgent.Cmd.Process.Wait()
        f.NoError(err)
        f.False(state.Exited()) // kill signal != exited
        f.False(state.Success())

        // Restart the agent (have to create a new Cmd)
        clientAgent.Cmd = exec.Command(clientAgent.BinPath, "agent",
            "-config", clientAgent.ConfFile,
            "-data-dir", clientAgent.DataDir,
            "-servers", fmt.Sprintf("127.0.0.1:%d", serverAgent.Vars.RPC),
        )
        clientAgent.Cmd.Stdout = clientOut
        clientAgent.Cmd.Stderr = clientOut
        f.NoError(clientAgent.Start())

        // Assert a new process did start
        f.NotEqual(clientAgent.Cmd.Process.Pid, agentPid)
        clientUrl := fmt.Sprintf("http://127.0.0.1:%d/v1/client/stats", clientAgent.Vars.HTTP)
        testutil.WaitForResult(func() (bool, error) {
            resp, err := http.Get(clientUrl)
            if err != nil {
                return false, err
            }
            resp.Body.Close()
            return resp.StatusCode == 200, fmt.Errorf("%d != 200", resp.StatusCode)
        }, func(err error) {
            f.NoError(err)
        })
    }

    t.Logf("[TEST] ===> Final restarts: %d", restarts)
}
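
// waitForClientHTTP is a hypothetical helper (not in the original file)
// capturing the readiness probe used above: poll the restarted client
// agent's stats endpoint until it answers 200.
func waitForClientHTTP(f *framework.F, httpPort int) {
    url := fmt.Sprintf("http://127.0.0.1:%d/v1/client/stats", httpPort)
    testutil.WaitForResult(func() (bool, error) {
        resp, err := http.Get(url)
        if err != nil {
            return false, err
        }
        resp.Body.Close()
        return resp.StatusCode == 200, fmt.Errorf("%d != 200", resp.StatusCode)
    }, func(err error) {
        f.NoError(err)
    })
}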

// TestClientState_Corrupt removes task state from the client's state db to
// assert it recovers.
func (tc *ClientStateTC) TestClientState_Corrupt(f *framework.F) {
    t := f.T()
    t.Parallel()

    serverOut := testlog.NewPrefixWriter(t, "SERVER: ")
    clientOut := testlog.NewPrefixWriter(t, "CLIENT: ")
    serverAgent, clientAgent, err := execagent.NewClientServerPair(tc.bin, serverOut, clientOut)
    f.NoError(err)

    f.NoError(serverAgent.Start())
    defer serverAgent.Destroy()
    f.NoError(clientAgent.Start())
    defer clientAgent.Destroy()

    // Get a client for the server agent to use even while the client is
    // down.
    client, err := serverAgent.Client()
    f.NoError(err)

    jobID := "sleeper-" + uuid.Generate()[:8]
    allocs := e2eutil.RegisterAndWaitForAllocs(t, client, "clientstate/sleeper.nomad", jobID)
    f.Len(allocs, 1)

    alloc, _, err := client.Allocations().Info(allocs[0].ID, nil)
    f.NoError(err)

    defer func() {
        //FIXME(schmichael): this cleanup is insufficient, but I can't
        // figure out how to fix it
        client.Jobs().Deregister(jobID, false, nil)
        client.System().GarbageCollect()
        time.Sleep(time.Second)
    }()

    assertHealthy := func() {
        t.Helper()
        testutil.WaitForResult(func() (bool, error) {
            alloc, _, err = client.Allocations().Info(alloc.ID, nil)
            f.NoError(err) // should never error

            if len(alloc.TaskStates) == 0 {
                return false, fmt.Errorf("waiting for tasks to start")
            }

            if s := alloc.TaskStates["sleeper"].State; s != "running" {
                return false, fmt.Errorf("task should be running: %q", s)
            }

            // Restarts should never happen
            f.Zero(alloc.TaskStates["sleeper"].Restarts)
            return true, nil
        }, func(err error) {
            f.NoError(err)
        })
    }
    assertHealthy()

    // Find pid
    pid := 0
    testutil.WaitForResult(func() (bool, error) {
        pid, err = getPID(client, alloc, "sleeper/pid")
        return pid > 0, err
    }, func(err error) {
        f.NoError(err)
    })

    // Kill and corrupt the state
    agentPid := clientAgent.Cmd.Process.Pid
    f.NoError(clientAgent.Cmd.Process.Signal(os.Interrupt))

    procState, err := clientAgent.Cmd.Process.Wait()
    f.NoError(err)
    f.True(procState.Exited())

    // Assert sleeper is still running
    f.NoError(syscall.Kill(pid, 0))
    assertHealthy()

    // Remove task bucket from client state
    db, err := state.NewBoltStateDB(testlog.HCLogger(t), filepath.Join(clientAgent.DataDir, "client"))
    f.NoError(err)

    f.NoError(db.DeleteTaskBucket(alloc.ID, "sleeper"))
    f.NoError(db.Close())
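
    // Added note (an assumption about client internals, not original text):
    // DeleteTaskBucket above removed only the task's local state; the alloc
    // itself is still known to the client, so on restart the client should
    // launch a fresh copy of the sleeper rather than fail the alloc. The
    // two-pid check below verifies exactly that.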

    // Restart the agent (have to create a new Cmd)
    clientAgent.Cmd = exec.Command(clientAgent.BinPath, "agent",
        "-config", clientAgent.ConfFile,
        "-data-dir", clientAgent.DataDir,
        "-servers", fmt.Sprintf("127.0.0.1:%d", serverAgent.Vars.RPC),
    )
    clientAgent.Cmd.Stdout = clientOut
    clientAgent.Cmd.Stderr = clientOut
    f.NoError(clientAgent.Start())

    // Assert a new process did start
    f.NotEqual(clientAgent.Cmd.Process.Pid, agentPid)

    // Retrieving the pid should work once it restarts.
    // Critically there are now 2 pids, because the client's task state was
    // lost and Nomad started a new copy.
    testutil.WaitForResult(func() (bool, error) {
        allocfs := client.AllocFS()
        r, err := allocfs.Cat(alloc, "sleeper/pid", nil)
        if err != nil {
            return false, err
        }
        defer r.Close()

        out, err := ioutil.ReadAll(r)
        if err != nil {
            return false, err
        }

        lines := bytes.SplitN(out, []byte{'\n'}, 3)
        if len(lines) != 3 || len(lines[2]) > 0 {
            return false, fmt.Errorf("expected 2 lines, not %v", lines)
        }

        return true, nil
    }, func(err error) {
        f.NoError(err)
    })

    // Alloc should still be running
    assertHealthy()
}
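
// assertAllocHealthy is a hypothetical helper (not in the original file)
// capturing the assertHealthy closure duplicated in the Kill and Corrupt
// tests above: wait until the named task is running and assert it has
// never restarted.
func assertAllocHealthy(f *framework.F, client *api.Client, allocID, task string) {
    testutil.WaitForResult(func() (bool, error) {
        alloc, _, err := client.Allocations().Info(allocID, nil)
        f.NoError(err) // should never error

        if len(alloc.TaskStates) == 0 {
            return false, fmt.Errorf("waiting for tasks to start")
        }

        if s := alloc.TaskStates[task].State; s != "running" {
            return false, fmt.Errorf("task should be running: %q", s)
        }

        // Restarts should never happen in these tests
        f.Zero(alloc.TaskStates[task].Restarts)
        return true, nil
    }, func(err error) {
        f.NoError(err)
    })
}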