github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/e2e/clientstate/clientstate.go

package clientstate

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"math/rand"
	"net/http"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"syscall"
	"time"

	"github.com/hashicorp/nomad/api"
	"github.com/hashicorp/nomad/ci"
	"github.com/hashicorp/nomad/client/state"
	"github.com/hashicorp/nomad/e2e/e2eutil"
	"github.com/hashicorp/nomad/e2e/execagent"
	"github.com/hashicorp/nomad/e2e/framework"
	"github.com/hashicorp/nomad/helper/discover"
	"github.com/hashicorp/nomad/helper/testlog"
	"github.com/hashicorp/nomad/helper/uuid"
	"github.com/hashicorp/nomad/testutil"
)

func init() {
	framework.AddSuites(&framework.TestSuite{
		Component:   "clientstate",
		CanRunLocal: true,
		Cases: []framework.TestCase{
			&ClientStateTC{},
		},
	})
}

type ClientStateTC struct {
	framework.TC

	// bin is the path to the Nomad binary
	bin string
}

func (tc *ClientStateTC) BeforeAll(f *framework.F) {
	if os.Getenv("NOMAD_TEST_STATE") == "" {
		f.T().Skip("Skipping very slow state corruption test unless NOMAD_TEST_STATE=1")
	}

	bin, err := discover.NomadExecutable()
	f.NoError(err)
	tc.bin = bin
}

// getPID reads the pid file a task wrote at path within the alloc's
// filesystem and returns the pid it contains. The file must hold exactly
// one newline-terminated line.
func getPID(client *api.Client, alloc *api.Allocation, path string) (int, error) {
	allocfs := client.AllocFS()
	r, err := allocfs.Cat(alloc, path, nil)
	if err != nil {
		return 0, err
	}
	defer r.Close()

	out, err := ioutil.ReadAll(r)
	if err != nil {
		return 0, err
	}

	lines := bytes.SplitN(out, []byte{'\n'}, 2)
	if len(lines) != 2 || len(lines[1]) > 0 {
		return 0, fmt.Errorf("expected 1 line not %q", string(out))
	}

	// Capture pid
	pid, err := strconv.Atoi(string(lines[0]))
	if err != nil {
		return 0, err
	}

	return pid, nil
}
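// waitForPID is a convenience sketch (not part of the upstream file):
// both sleeper-based tests below poll getPID with this exact
// WaitForResult pattern until the task has written its pid file.
func waitForPID(f *framework.F, client *api.Client, alloc *api.Allocation, path string) int {
	pid := 0
	testutil.WaitForResult(func() (bool, error) {
		var err error
		pid, err = getPID(client, alloc, path)
		return pid > 0, err
	}, func(err error) {
		f.NoError(err)
	})
	return pid
}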
// TestClientState_Kill force-kills Nomad agents and restarts them in a tight
// loop to assert Nomad is crash safe.
func (tc *ClientStateTC) TestClientState_Kill(f *framework.F) {
	t := f.T()
	ci.Parallel(t)

	serverOut := testlog.NewPrefixWriter(t, "SERVER: ")
	clientOut := testlog.NewPrefixWriter(t, "CLIENT: ")
	serverAgent, clientAgent, err := execagent.NewClientServerPair(tc.bin, serverOut, clientOut)
	f.NoError(err)

	f.NoError(serverAgent.Start())
	defer serverAgent.Destroy()
	f.NoError(clientAgent.Start())
	defer clientAgent.Destroy()

	// Get a client for the server agent to use even while the client is
	// down.
	client, err := serverAgent.Client()
	f.NoError(err)

	jobID := "sleeper-" + uuid.Generate()[:8]
	allocs := e2eutil.RegisterAndWaitForAllocs(t, client, "clientstate/sleeper.nomad", jobID, "")
	f.Len(allocs, 1)

	alloc, _, err := client.Allocations().Info(allocs[0].ID, nil)
	f.NoError(err)

	defer func() {
		if _, _, err := client.Jobs().Deregister(jobID, false, nil); err != nil {
			t.Logf("error stopping job: %v", err)
		}

		testutil.WaitForResult(func() (bool, error) {
			sum, _, err := client.Jobs().Summary(jobID, nil)
			if err != nil {
				return false, err
			}
			if r := sum.Summary["sleeper"].Running; r > 0 {
				return false, fmt.Errorf("still running: %d", r)
			}
			return true, nil
		}, func(err error) {
			f.NoError(err)
		})

		//XXX Must use client agent for gc'ing allocs?
		clientAPI, err := clientAgent.Client()
		f.NoError(err)
		if err := clientAPI.Allocations().GC(alloc, nil); err != nil {
			t.Logf("error garbage collecting alloc: %v", err)
		}

		if err := client.System().GarbageCollect(); err != nil {
			t.Logf("error doing full gc: %v", err)
		}

		//HACK to wait until things have GC'd
		time.Sleep(time.Second)
	}()

	assertHealthy := func() {
		t.Helper()
		testutil.WaitForResult(func() (bool, error) {
			alloc, _, err = client.Allocations().Info(alloc.ID, nil)
			f.NoError(err) // should never error

			if len(alloc.TaskStates) == 0 {
				return false, fmt.Errorf("waiting for tasks to start")
			}

			if s := alloc.TaskStates["sleeper"].State; s != "running" {
				return false, fmt.Errorf("task should be running: %q", s)
			}

			// Restarts should never happen
			f.Zero(alloc.TaskStates["sleeper"].Restarts)
			return true, nil
		}, func(err error) {
			f.NoError(err)
		})
	}
	assertHealthy()

	// Find the sleeper task's pid
	pid := 0
	testutil.WaitForResult(func() (bool, error) {
		pid, err = getPID(client, alloc, "sleeper/pid")
		return pid > 0, err
	}, func(err error) {
		f.NoError(err)
	})

	// Kill and restart a few times
	tries := 10
	for i := 0; i < tries; i++ {
		t.Logf("TEST RUN %d/%d", i+1, tries)

		// Kill -9 the agent
		agentPid := clientAgent.Cmd.Process.Pid
		f.NoError(clientAgent.Cmd.Process.Signal(os.Kill))

		state, err := clientAgent.Cmd.Process.Wait()
		f.NoError(err)
		f.False(state.Exited()) // kill signal != exited
		f.False(state.Success())

		// Assert the sleeper task is still running
		f.NoError(syscall.Kill(pid, 0))
		assertHealthy()

		// Should not be able to reach its filesystem
		_, err = getPID(client, alloc, "sleeper/pid")
		f.Error(err)

		// Restart the agent (have to create a new Cmd)
		clientAgent.Cmd = exec.Command(clientAgent.BinPath, "agent",
			"-config", clientAgent.ConfFile,
			"-data-dir", clientAgent.DataDir,
			"-servers", fmt.Sprintf("127.0.0.1:%d", serverAgent.Vars.RPC),
		)
		clientAgent.Cmd.Stdout = clientOut
		clientAgent.Cmd.Stderr = clientOut
		f.NoError(clientAgent.Start())

		// Assert a new process did start
		f.NotEqual(clientAgent.Cmd.Process.Pid, agentPid)

		// Retrieving the pid should work once the agent restarts
		testutil.WaitForResult(func() (bool, error) {
			newPid, err := getPID(client, alloc, "sleeper/pid")
			return newPid == pid, err
		}, func(err error) {
			f.NoError(err)
		})

		// Alloc should still be running
		assertHealthy()
	}
}
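// restartAgent is a sketch (not part of the upstream file) of the
// restart step each test repeats inline after killing the client,
// assuming the *execagent.NomadAgent type returned by
// NewClientServerPair. os/exec commands are single-use: once Wait has
// returned, the same Cmd cannot be started again, so a fresh one must
// be built around the same binary, config file, and data dir.
func restartAgent(f *framework.F, clientAgent, serverAgent *execagent.NomadAgent) {
	stdout, stderr := clientAgent.Cmd.Stdout, clientAgent.Cmd.Stderr
	clientAgent.Cmd = exec.Command(clientAgent.BinPath, "agent",
		"-config", clientAgent.ConfFile,
		"-data-dir", clientAgent.DataDir,
		"-servers", fmt.Sprintf("127.0.0.1:%d", serverAgent.Vars.RPC),
	)
	// Keep streaming agent output to the test's prefixed writers.
	clientAgent.Cmd.Stdout = stdout
	clientAgent.Cmd.Stderr = stderr
	f.NoError(clientAgent.Start())
}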
// TestClientState_KillDuringRestart force-kills Nomad agents and restarts them
// in a tight loop to assert Nomad is crash safe while a task is restarting.
func (tc *ClientStateTC) TestClientState_KillDuringRestart(f *framework.F) {
	t := f.T()
	ci.Parallel(t)

	serverOut := testlog.NewPrefixWriter(t, "SERVER: ")
	clientOut := testlog.NewPrefixWriter(t, "CLIENT: ")
	serverAgent, clientAgent, err := execagent.NewClientServerPair(tc.bin, serverOut, clientOut)
	f.NoError(err)

	f.NoError(serverAgent.Start())
	defer serverAgent.Destroy()

	f.NoError(clientAgent.Start())
	defer clientAgent.Destroy()

	// Get a client for the server agent to use even while the client is
	// down.
	client, err := serverAgent.Client()
	f.NoError(err)

	jobID := "restarter-" + uuid.Generate()[:8]
	allocs := e2eutil.RegisterAndWaitForAllocs(t, client, "clientstate/restarter.nomad", jobID, "")
	f.Len(allocs, 1)

	alloc, _, err := client.Allocations().Info(allocs[0].ID, nil)
	f.NoError(err)

	defer func() {
		//FIXME(schmichael): this cleanup is insufficient, but I can't
		// figure out how to fix it
		client.Jobs().Deregister(jobID, false, nil)
		client.System().GarbageCollect()
		time.Sleep(time.Second)
	}()

	var restarts uint64
	testutil.WaitForResult(func() (bool, error) {
		alloc, _, err = client.Allocations().Info(alloc.ID, nil)
		f.NoError(err) // should never error

		if len(alloc.TaskStates) == 0 {
			return false, fmt.Errorf("waiting for tasks to start")
		}

		n := alloc.TaskStates["restarter"].Restarts
		if n < restarts {
			// Restarts should never decrease; immediately fail
			f.Failf("restarts decreased", "%d < %d", n, restarts)
		}

		// Capture current restarts
		restarts = n
		return true, nil
	}, func(err error) {
		f.NoError(err)
	})

	dice := rand.New(rand.NewSource(time.Now().UnixNano()))

	// Kill and restart the agent a few times
	i := 0
	for deadline := time.Now().Add(5 * time.Minute); time.Now().Before(deadline); {
		i++
		sleep := time.Duration(1500+dice.Int63n(6000)) * time.Millisecond
		t.Logf("[TEST] ===> Run %d (pid: %d sleeping for %v; last restarts: %d)", i, clientAgent.Cmd.Process.Pid, sleep, restarts)

		time.Sleep(sleep)

		// Ensure restarts are progressing
		alloc, _, err = client.Allocations().Info(alloc.ID, nil)
		f.NoError(err) // should never error
		n := alloc.TaskStates["restarter"].Restarts
		if n < restarts {
			// Restarts should never decrease; immediately fail
			f.Failf("restarts decreased", "%d < %d", n, restarts)
		}
		if i > 5 && n == 0 {
			// At least one restart should have happened by now
			f.Failf("no restarts", "expected at least 1 restart after %d tries", i)
		}
		restarts = n

		// Kill -9 the agent
		agentPid := clientAgent.Cmd.Process.Pid
		f.NoError(clientAgent.Cmd.Process.Signal(os.Kill))
		t.Logf("[TEST] ===> Killed %d", agentPid)

		state, err := clientAgent.Cmd.Process.Wait()
		f.NoError(err)
		f.False(state.Exited()) // kill signal != exited
		f.False(state.Success())

		// Restart the agent (have to create a new Cmd)
		clientAgent.Cmd = exec.Command(clientAgent.BinPath, "agent",
			"-config", clientAgent.ConfFile,
			"-data-dir", clientAgent.DataDir,
			"-servers", fmt.Sprintf("127.0.0.1:%d", serverAgent.Vars.RPC),
		)
		clientAgent.Cmd.Stdout = clientOut
		clientAgent.Cmd.Stderr = clientOut
		f.NoError(clientAgent.Start())

		// Assert a new process did start
		f.NotEqual(clientAgent.Cmd.Process.Pid, agentPid)
		clientUrl := fmt.Sprintf("http://127.0.0.1:%d/v1/client/stats", clientAgent.Vars.HTTP)
		testutil.WaitForResult(func() (bool, error) {
			resp, err := http.Get(clientUrl)
			if err != nil {
				return false, err
			}
			resp.Body.Close()
			return resp.StatusCode == 200, fmt.Errorf("%d != 200", resp.StatusCode)
		}, func(err error) {
			f.NoError(err)
		})
	}

	t.Logf("[TEST] ===> Final restarts: %d", restarts)
}
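// corruptTaskState is a sketch (not part of the upstream file) of the
// state surgery TestClientState_Corrupt performs below: with the agent
// stopped, open the client's bolt state DB directly and drop a task's
// bucket to simulate on-disk state loss.
func corruptTaskState(f *framework.F, dataDir, allocID, task string) {
	// The client keeps its state DB under <data-dir>/client.
	db, err := state.NewBoltStateDB(testlog.HCLogger(f.T()), filepath.Join(dataDir, "client"))
	f.NoError(err)

	f.NoError(db.DeleteTaskBucket(allocID, task))
	f.NoError(db.Close())
}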
// TestClientState_Corrupt removes task state from the client's state db to
// assert it recovers.
func (tc *ClientStateTC) TestClientState_Corrupt(f *framework.F) {
	t := f.T()
	ci.Parallel(t)

	serverOut := testlog.NewPrefixWriter(t, "SERVER: ")
	clientOut := testlog.NewPrefixWriter(t, "CLIENT: ")
	serverAgent, clientAgent, err := execagent.NewClientServerPair(tc.bin, serverOut, clientOut)
	f.NoError(err)

	f.NoError(serverAgent.Start())
	defer serverAgent.Destroy()
	f.NoError(clientAgent.Start())
	defer clientAgent.Destroy()

	// Get a client for the server agent to use even while the client is
	// down.
	client, err := serverAgent.Client()
	f.NoError(err)

	jobID := "sleeper-" + uuid.Generate()[:8]
	allocs := e2eutil.RegisterAndWaitForAllocs(t, client, "clientstate/sleeper.nomad", jobID, "")
	f.Len(allocs, 1)

	alloc, _, err := client.Allocations().Info(allocs[0].ID, nil)
	f.NoError(err)

	defer func() {
		//FIXME(schmichael): this cleanup is insufficient, but I can't
		// figure out how to fix it
		client.Jobs().Deregister(jobID, false, nil)
		client.System().GarbageCollect()
		time.Sleep(time.Second)
	}()

	assertHealthy := func() {
		t.Helper()
		testutil.WaitForResult(func() (bool, error) {
			alloc, _, err = client.Allocations().Info(alloc.ID, nil)
			f.NoError(err) // should never error

			if len(alloc.TaskStates) == 0 {
				return false, fmt.Errorf("waiting for tasks to start")
			}

			if s := alloc.TaskStates["sleeper"].State; s != "running" {
				return false, fmt.Errorf("task should be running: %q", s)
			}

			// Restarts should never happen
			f.Zero(alloc.TaskStates["sleeper"].Restarts)
			return true, nil
		}, func(err error) {
			f.NoError(err)
		})
	}
	assertHealthy()

	// Find the sleeper task's pid
	pid := 0
	testutil.WaitForResult(func() (bool, error) {
		pid, err = getPID(client, alloc, "sleeper/pid")
		return pid > 0, err
	}, func(err error) {
		f.NoError(err)
	})

	// Stop the agent and corrupt its state
	agentPid := clientAgent.Cmd.Process.Pid
	f.NoError(clientAgent.Cmd.Process.Signal(os.Interrupt))

	procState, err := clientAgent.Cmd.Process.Wait()
	f.NoError(err)
	f.True(procState.Exited())

	// Assert the sleeper task is still running
	f.NoError(syscall.Kill(pid, 0))
	assertHealthy()

	// Remove the task bucket from the client's state db
	db, err := state.NewBoltStateDB(testlog.HCLogger(t), filepath.Join(clientAgent.DataDir, "client"))
	f.NoError(err)

	f.NoError(db.DeleteTaskBucket(alloc.ID, "sleeper"))
	f.NoError(db.Close())

	// Restart the agent (have to create a new Cmd)
	clientAgent.Cmd = exec.Command(clientAgent.BinPath, "agent",
		"-config", clientAgent.ConfFile,
		"-data-dir", clientAgent.DataDir,
		"-servers", fmt.Sprintf("127.0.0.1:%d", serverAgent.Vars.RPC),
	)
	clientAgent.Cmd.Stdout = clientOut
	clientAgent.Cmd.Stderr = clientOut
	f.NoError(clientAgent.Start())

	// Assert a new process did start
	f.NotEqual(clientAgent.Cmd.Process.Pid, agentPid)

	// Retrieving the pid should work once the agent restarts.
	// Critically there are now 2 pids, because the client's task state was
	// lost and Nomad started a new copy of the task.
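	// (A hedged illustration, not part of the original file: the pid file
	// accumulates one line per copy of the task, e.g.
	//
	//	12345
	//	67890
	//
	// which is why the poll below splits on '\n' and requires exactly two
	// lines.)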
	testutil.WaitForResult(func() (bool, error) {
		allocfs := client.AllocFS()
		r, err := allocfs.Cat(alloc, "sleeper/pid", nil)
		if err != nil {
			return false, err
		}
		defer r.Close()

		out, err := ioutil.ReadAll(r)
		if err != nil {
			return false, err
		}

		lines := bytes.SplitN(out, []byte{'\n'}, 3)
		if len(lines) != 3 || len(lines[2]) > 0 {
			return false, fmt.Errorf("expected 2 lines not %v", lines)
		}

		return true, nil
	}, func(err error) {
		f.NoError(err)
	})

	// Alloc should still be running
	assertHealthy()
}
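// waitForClientHTTP is a sketch (not part of the upstream file) of the
// poll TestClientState_KillDuringRestart performs inline after each
// restart: the next kill is only safe to issue once the new agent is
// serving its HTTP API again.
func waitForClientHTTP(f *framework.F, httpPort int) {
	url := fmt.Sprintf("http://127.0.0.1:%d/v1/client/stats", httpPort)
	testutil.WaitForResult(func() (bool, error) {
		resp, err := http.Get(url)
		if err != nil {
			return false, err
		}
		resp.Body.Close()
		// WaitForResult only reports the error if the bool is false, so
		// returning a non-nil error alongside true is harmless.
		return resp.StatusCode == 200, fmt.Errorf("%d != 200", resp.StatusCode)
	}, func(err error) {
		f.NoError(err)
	})
}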