github.com/hernad/nomad@v1.6.112/e2e/clientstate/clientstate.go

// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

package clientstate

import (
	"bytes"
	"fmt"
	"io"
	"math/rand"
	"net/http"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"syscall"
	"time"

	"github.com/hernad/nomad/api"
	"github.com/hernad/nomad/ci"
	"github.com/hernad/nomad/client/state"
	"github.com/hernad/nomad/e2e/e2eutil"
	"github.com/hernad/nomad/e2e/execagent"
	"github.com/hernad/nomad/e2e/framework"
	"github.com/hernad/nomad/helper/discover"
	"github.com/hernad/nomad/helper/testlog"
	"github.com/hernad/nomad/helper/uuid"
	"github.com/hernad/nomad/testutil"
)

func init() {
	framework.AddSuites(&framework.TestSuite{
		Component:   "clientstate",
		CanRunLocal: true,
		Cases: []framework.TestCase{
			&ClientStateTC{},
		},
	})
}

type ClientStateTC struct {
	framework.TC

	// bin is the path to the Nomad binary
	bin string
}

func (tc *ClientStateTC) BeforeAll(f *framework.F) {
	if os.Getenv("NOMAD_TEST_STATE") == "" {
		f.T().Skip("Skipping very slow state corruption test unless NOMAD_TEST_STATE=1")
	}

	bin, err := discover.NomadExecutable()
	f.NoError(err)
	tc.bin = bin
}

// getPID reads the pid file written by a task via the alloc filesystem API
// and returns its contents as an int. The file is expected to contain exactly
// one line: the pid followed by a newline.
func getPID(client *api.Client, alloc *api.Allocation, path string) (int, error) {
	allocfs := client.AllocFS()
	r, err := allocfs.Cat(alloc, path, nil)
	if err != nil {
		return 0, err
	}
	defer r.Close()

	out, err := io.ReadAll(r)
	if err != nil {
		return 0, err
	}

	lines := bytes.SplitN(out, []byte{'\n'}, 2)
	if len(lines) != 2 || len(lines[1]) > 0 {
		return 0, fmt.Errorf("expected 1 line, not %q", string(out))
	}

	// Capture pid
	pid, err := strconv.Atoi(string(lines[0]))
	if err != nil {
		return 0, err
	}

	return pid, nil
}

// TestClientState_Kill force kills the Nomad client agent and restarts it in
// a tight loop to assert Nomad is crash safe.
func (tc *ClientStateTC) TestClientState_Kill(f *framework.F) {
	t := f.T()
	ci.Parallel(t)

	serverOut := testlog.NewPrefixWriter(t, "SERVER: ")
	clientOut := testlog.NewPrefixWriter(t, "CLIENT: ")
	serverAgent, clientAgent, err := execagent.NewClientServerPair(tc.bin, serverOut, clientOut)
	f.NoError(err)

	f.NoError(serverAgent.Start())
	defer serverAgent.Destroy()
	f.NoError(clientAgent.Start())
	defer clientAgent.Destroy()

	// Get an API client for the server agent so it can be used even while
	// the client agent is down.
	client, err := serverAgent.Client()
	f.NoError(err)

	jobID := "sleeper-" + uuid.Generate()[:8]
	allocs := e2eutil.RegisterAndWaitForAllocs(t, client, "clientstate/sleeper.nomad", jobID, "")
	f.Len(allocs, 1)

	alloc, _, err := client.Allocations().Info(allocs[0].ID, nil)
	f.NoError(err)

	defer func() {
		if _, _, err := client.Jobs().Deregister(jobID, false, nil); err != nil {
			t.Logf("error stopping job: %v", err)
		}

		testutil.WaitForResult(func() (bool, error) {
			sum, _, err := client.Jobs().Summary(jobID, nil)
			if err != nil {
				return false, err
			}
			if r := sum.Summary["sleeper"].Running; r > 0 {
				return false, fmt.Errorf("still running: %d", r)
			}
			return true, nil
		}, func(err error) {
			f.NoError(err)
		})

		//XXX Must use client agent for gc'ing allocs?
		clientAPI, err := clientAgent.Client()
		f.NoError(err)
		if err := clientAPI.Allocations().GC(alloc, nil); err != nil {
			t.Logf("error garbage collecting alloc: %v", err)
		}

		if err := client.System().GarbageCollect(); err != nil {
			t.Logf("error doing full gc: %v", err)
		}

		//HACK to wait until things have GC'd
		time.Sleep(time.Second)
	}()

	assertHealthy := func() {
		t.Helper()
		testutil.WaitForResult(func() (bool, error) {
			alloc, _, err = client.Allocations().Info(alloc.ID, nil)
			f.NoError(err) // should never error

			if len(alloc.TaskStates) == 0 {
				return false, fmt.Errorf("waiting for tasks to start")
			}

			if s := alloc.TaskStates["sleeper"].State; s != "running" {
				return false, fmt.Errorf("task should be running: %q", s)
			}

			// Restarts should never happen
			f.Zero(alloc.TaskStates["sleeper"].Restarts)
			return true, nil
		}, func(err error) {
			f.NoError(err)
		})
	}
	assertHealthy()

	// Find pid
	pid := 0
	testutil.WaitForResult(func() (bool, error) {
		pid, err = getPID(client, alloc, "sleeper/pid")
		return pid > 0, err
	}, func(err error) {
		f.NoError(err)
	})

	// Kill and restart a few times
	tries := 10
	for i := 0; i < tries; i++ {
		t.Logf("TEST RUN %d/%d", i+1, tries)

		// Kill -9 the agent
		agentPid := clientAgent.Cmd.Process.Pid
		f.NoError(clientAgent.Cmd.Process.Signal(os.Kill))

		state, err := clientAgent.Cmd.Process.Wait()
		f.NoError(err)
		f.False(state.Exited()) // killed by a signal, not exited normally
		f.False(state.Success())

		// Assert the sleeper task is still running
		f.NoError(syscall.Kill(pid, 0))
		assertHealthy()

		// The alloc filesystem should be unreachable while the client
		// agent is down
		_, err = getPID(client, alloc, "sleeper/pid")
		f.Error(err)

		// Restart the agent (have to create a new Cmd)
		clientAgent.Cmd = exec.Command(clientAgent.BinPath, "agent",
			"-config", clientAgent.ConfFile,
			"-data-dir", clientAgent.DataDir,
			"-servers", fmt.Sprintf("127.0.0.1:%d", serverAgent.Vars.RPC),
		)
		clientAgent.Cmd.Stdout = clientOut
		clientAgent.Cmd.Stderr = clientOut
		f.NoError(clientAgent.Start())

		// Assert a new process did start
		f.NotEqual(clientAgent.Cmd.Process.Pid, agentPid)

		// Retrieving the pid should work once the agent restarts
		testutil.WaitForResult(func() (bool, error) {
			newPid, err := getPID(client, alloc, "sleeper/pid")
			return newPid == pid, err
		}, func(err error) {
			f.NoError(err)
		})

		// Alloc should still be running
		assertHealthy()
	}
}

// TestClientState_KillDuringRestart force kills the Nomad client agent and
// restarts it in a tight loop to assert Nomad is crash safe while a task is
// restarting.
func (tc *ClientStateTC) TestClientState_KillDuringRestart(f *framework.F) {
	t := f.T()
	ci.Parallel(t)

	serverOut := testlog.NewPrefixWriter(t, "SERVER: ")
	clientOut := testlog.NewPrefixWriter(t, "CLIENT: ")
	serverAgent, clientAgent, err := execagent.NewClientServerPair(tc.bin, serverOut, clientOut)
	f.NoError(err)

	f.NoError(serverAgent.Start())
	defer serverAgent.Destroy()

	f.NoError(clientAgent.Start())
	defer clientAgent.Destroy()

	// Get an API client for the server agent so it can be used even while
	// the client agent is down.
	client, err := serverAgent.Client()
	f.NoError(err)

	jobID := "restarter-" + uuid.Generate()[:8]
	allocs := e2eutil.RegisterAndWaitForAllocs(t, client, "clientstate/restarter.nomad", jobID, "")
	f.Len(allocs, 1)

	alloc, _, err := client.Allocations().Info(allocs[0].ID, nil)
	f.NoError(err)

	defer func() {
		//FIXME(schmichael): this cleanup is insufficient, but I can't
		// figure out how to fix it
		client.Jobs().Deregister(jobID, false, nil)
		client.System().GarbageCollect()
		time.Sleep(time.Second)
	}()

	var restarts uint64
	testutil.WaitForResult(func() (bool, error) {
		alloc, _, err = client.Allocations().Info(alloc.ID, nil)
		f.NoError(err) // should never error

		if len(alloc.TaskStates) == 0 {
			return false, fmt.Errorf("waiting for tasks to start")
		}

		n := alloc.TaskStates["restarter"].Restarts
		if n < restarts {
			// Restarts should never decrease; immediately fail
			f.Failf("restarts decreased", "%d < %d", n, restarts)
		}

		// Capture current restarts
		restarts = n
		return true, nil
	}, func(err error) {
		f.NoError(err)
	})

	dice := rand.New(rand.NewSource(time.Now().UnixNano()))

	// Kill and restart the agent a few times
	i := 0
	for deadline := time.Now().Add(5 * time.Minute); time.Now().Before(deadline); {
		i++
		sleep := time.Duration(1500+dice.Int63n(6000)) * time.Millisecond
		t.Logf("[TEST] ===> Run %d (pid: %d sleeping for %v; last restarts: %d)", i, clientAgent.Cmd.Process.Pid, sleep, restarts)

		time.Sleep(sleep)

		// Ensure restarts are progressing
		alloc, _, err = client.Allocations().Info(alloc.ID, nil)
		f.NoError(err) // should never error
		n := alloc.TaskStates["restarter"].Restarts
		if n < restarts {
			// Restarts should never decrease; immediately fail
			f.Failf("restarts decreased", "%d < %d", n, restarts)
		}
		if i > 5 && n == 0 {
			// At least one restart should have happened by now
			f.Failf("no restarts", "expected at least 1 restart after %d tries", i)
		}
		restarts = n

		// Kill -9 the agent
		agentPid := clientAgent.Cmd.Process.Pid
		f.NoError(clientAgent.Cmd.Process.Signal(os.Kill))
		t.Logf("[TEST] ===> Killed %d", agentPid)

		state, err := clientAgent.Cmd.Process.Wait()
		f.NoError(err)
		f.False(state.Exited()) // killed by a signal, not exited normally
		f.False(state.Success())

		// Restart the agent (have to create a new Cmd)
		clientAgent.Cmd = exec.Command(clientAgent.BinPath, "agent",
			"-config", clientAgent.ConfFile,
			"-data-dir", clientAgent.DataDir,
			"-servers", fmt.Sprintf("127.0.0.1:%d", serverAgent.Vars.RPC),
		)
		clientAgent.Cmd.Stdout = clientOut
		clientAgent.Cmd.Stderr = clientOut
		f.NoError(clientAgent.Start())

		// Assert a new process did start
		f.NotEqual(clientAgent.Cmd.Process.Pid, agentPid)

		// Wait for the restarted agent's HTTP API to respond
		clientURL := fmt.Sprintf("http://127.0.0.1:%d/v1/client/stats", clientAgent.Vars.HTTP)
		testutil.WaitForResult(func() (bool, error) {
			resp, err := http.Get(clientURL)
			if err != nil {
				return false, err
			}
			resp.Body.Close()
			return resp.StatusCode == 200, fmt.Errorf("%d != 200", resp.StatusCode)
		}, func(err error) {
			f.NoError(err)
		})
	}

	t.Logf("[TEST] ===> Final restarts: %d", restarts)
}

// TestClientState_Corrupt removes task state from the client's state db to
// assert the client recovers.
func (tc *ClientStateTC) TestClientState_Corrupt(f *framework.F) {
	t := f.T()
	ci.Parallel(t)

	serverOut := testlog.NewPrefixWriter(t, "SERVER: ")
	clientOut := testlog.NewPrefixWriter(t, "CLIENT: ")
	serverAgent, clientAgent, err := execagent.NewClientServerPair(tc.bin, serverOut, clientOut)
	f.NoError(err)

	f.NoError(serverAgent.Start())
	defer serverAgent.Destroy()
	f.NoError(clientAgent.Start())
	defer clientAgent.Destroy()

	// Get an API client for the server agent so it can be used even while
	// the client agent is down.
	client, err := serverAgent.Client()
	f.NoError(err)

	jobID := "sleeper-" + uuid.Generate()[:8]
	allocs := e2eutil.RegisterAndWaitForAllocs(t, client, "clientstate/sleeper.nomad", jobID, "")
	f.Len(allocs, 1)

	alloc, _, err := client.Allocations().Info(allocs[0].ID, nil)
	f.NoError(err)

	defer func() {
		//FIXME(schmichael): this cleanup is insufficient, but I can't
		// figure out how to fix it
		client.Jobs().Deregister(jobID, false, nil)
		client.System().GarbageCollect()
		time.Sleep(time.Second)
	}()

	assertHealthy := func() {
		t.Helper()
		testutil.WaitForResult(func() (bool, error) {
			alloc, _, err = client.Allocations().Info(alloc.ID, nil)
			f.NoError(err) // should never error

			if len(alloc.TaskStates) == 0 {
				return false, fmt.Errorf("waiting for tasks to start")
			}

			if s := alloc.TaskStates["sleeper"].State; s != "running" {
				return false, fmt.Errorf("task should be running: %q", s)
			}

			// Restarts should never happen
			f.Zero(alloc.TaskStates["sleeper"].Restarts)
			return true, nil
		}, func(err error) {
			f.NoError(err)
		})
	}
	assertHealthy()

	// Find pid
	pid := 0
	testutil.WaitForResult(func() (bool, error) {
		pid, err = getPID(client, alloc, "sleeper/pid")
		return pid > 0, err
	}, func(err error) {
		f.NoError(err)
	})

	// Stop the agent with an interrupt so it exits cleanly, then corrupt
	// its state
	agentPid := clientAgent.Cmd.Process.Pid
	f.NoError(clientAgent.Cmd.Process.Signal(os.Interrupt))

	procState, err := clientAgent.Cmd.Process.Wait()
	f.NoError(err)
	f.True(procState.Exited())

	// Assert the sleeper task is still running
	f.NoError(syscall.Kill(pid, 0))
	assertHealthy()

	// Remove the task bucket from the client state db
	db, err := state.NewBoltStateDB(testlog.HCLogger(t), filepath.Join(clientAgent.DataDir, "client"))
	f.NoError(err)

	f.NoError(db.DeleteTaskBucket(alloc.ID, "sleeper"))
	f.NoError(db.Close())

	// Restart the agent (have to create a new Cmd)
	clientAgent.Cmd = exec.Command(clientAgent.BinPath, "agent",
		"-config", clientAgent.ConfFile,
		"-data-dir", clientAgent.DataDir,
		"-servers", fmt.Sprintf("127.0.0.1:%d", serverAgent.Vars.RPC),
	)
	clientAgent.Cmd.Stdout = clientOut
	clientAgent.Cmd.Stderr = clientOut
	f.NoError(clientAgent.Start())

	// Assert a new process did start
	f.NotEqual(clientAgent.Cmd.Process.Pid, agentPid)

	// Retrieving the pid should work once the agent restarts.
	// Critically, there are now 2 pids in the file because the client's
	// task state was lost and Nomad started a new copy of the task.
	testutil.WaitForResult(func() (bool, error) {
		allocfs := client.AllocFS()
		r, err := allocfs.Cat(alloc, "sleeper/pid", nil)
		if err != nil {
			return false, err
		}
		defer r.Close()

		out, err := io.ReadAll(r)
		if err != nil {
			return false, err
		}

		lines := bytes.SplitN(out, []byte{'\n'}, 3)
		if len(lines) != 3 || len(lines[2]) > 0 {
			return false, fmt.Errorf("expected 2 lines, not %v", lines)
		}

		return true, nil
	}, func(err error) {
		f.NoError(err)
	})

	// Alloc should still be running
	assertHealthy()
}