github.com/anth0d/nomad@v0.0.0-20221214183521-ae3a0a2cad06/e2e/rescheduling/rescheduling.go (about) 1 package rescheduling 2 3 import ( 4 "fmt" 5 "os" 6 "reflect" 7 "sort" 8 "time" 9 10 e2e "github.com/hashicorp/nomad/e2e/e2eutil" 11 "github.com/hashicorp/nomad/e2e/framework" 12 "github.com/hashicorp/nomad/helper/uuid" 13 "github.com/hashicorp/nomad/jobspec" 14 "github.com/hashicorp/nomad/testutil" 15 ) 16 17 const ns = "" 18 19 type RescheduleE2ETest struct { 20 framework.TC 21 jobIds []string 22 } 23 24 func init() { 25 framework.AddSuites(&framework.TestSuite{ 26 Component: "Rescheduling", 27 CanRunLocal: true, 28 Consul: true, 29 Cases: []framework.TestCase{ 30 new(RescheduleE2ETest), 31 }, 32 }) 33 34 } 35 36 func (tc *RescheduleE2ETest) BeforeAll(f *framework.F) { 37 e2e.WaitForLeader(f.T(), tc.Nomad()) 38 e2e.WaitForNodesReady(f.T(), tc.Nomad(), 1) 39 } 40 41 func (tc *RescheduleE2ETest) AfterEach(f *framework.F) { 42 if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" { 43 return 44 } 45 46 for _, id := range tc.jobIds { 47 err := e2e.StopJob(id, "-purge") 48 f.Assert().NoError(err) 49 } 50 tc.jobIds = []string{} 51 _, err := e2e.Command("nomad", "system", "gc") 52 f.Assert().NoError(err) 53 } 54 55 // TestNoReschedule runs a job that should fail and never reschedule 56 func (tc *RescheduleE2ETest) TestNoReschedule(f *framework.F) { 57 jobID := "test-no-reschedule-" + uuid.Generate()[0:8] 58 f.NoError(e2e.Register(jobID, "rescheduling/input/norescheduling.nomad")) 59 tc.jobIds = append(tc.jobIds, jobID) 60 61 expected := []string{"failed", "failed", "failed"} 62 f.NoError( 63 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 64 "should have exactly 3 failed allocs", 65 ) 66 } 67 68 // TestNoRescheduleSystem runs a system job that should fail and never reschedule 69 func (tc *RescheduleE2ETest) TestNoRescheduleSystem(f *framework.F) { 70 jobID := "test-reschedule-system-" + uuid.Generate()[0:8] 71 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_system.nomad")) 72 tc.jobIds = append(tc.jobIds, jobID) 73 74 f.NoError( 75 e2e.WaitForAllocStatusComparison( 76 func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, 77 func(got []string) bool { 78 for _, status := range got { 79 if status != "failed" { 80 return false 81 } 82 } 83 return true 84 }, nil, 85 ), 86 "should have only failed allocs", 87 ) 88 } 89 90 // TestDefaultReschedule runs a job that should reschedule after delay 91 func (tc *RescheduleE2ETest) TestDefaultReschedule(f *framework.F) { 92 93 jobID := "test-default-reschedule-" + uuid.Generate()[0:8] 94 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_default.nomad")) 95 tc.jobIds = append(tc.jobIds, jobID) 96 97 expected := []string{"failed", "failed", "failed"} 98 f.NoError( 99 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 100 "should have exactly 3 failed allocs", 101 ) 102 103 // TODO(tgross): return early if "slow" isn't set 104 // wait until first exponential delay kicks in and rescheduling is attempted 105 time.Sleep(time.Second * 35) 106 expected = []string{"failed", "failed", "failed", "failed", "failed", "failed"} 107 f.NoError( 108 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 109 "should have exactly 6 failed allocs after 35s", 110 ) 111 } 112 113 // TestRescheduleMaxAttempts runs a job with a maximum reschedule attempts 114 func (tc *RescheduleE2ETest) TestRescheduleMaxAttempts(f *framework.F) { 115 116 jobID := "test-reschedule-fail-" + uuid.Generate()[0:8] 117 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_fail.nomad")) 118 tc.jobIds = append(tc.jobIds, jobID) 119 120 expected := []string{"failed", "failed", "failed"} 121 f.NoError( 122 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 123 "should have exactly 3 failed allocs", 124 ) 125 126 job, err := jobspec.ParseFile("rescheduling/input/rescheduling_fail.nomad") 127 f.NoError(err) 128 job.ID = &jobID 129 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "sleep 15000"} 130 _, _, err = tc.Nomad().Jobs().Register(job, nil) 131 f.NoError(err, "could not register updated job") 132 133 f.NoError( 134 e2e.WaitForAllocStatusComparison( 135 func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, 136 func(got []string) bool { 137 for _, status := range got { 138 if status == "running" { 139 return true 140 } 141 } 142 return false 143 }, nil, 144 ), 145 "should have at least 1 running alloc", 146 ) 147 } 148 149 // TestRescheduleSuccess runs a job that should be running after rescheduling 150 func (tc *RescheduleE2ETest) TestRescheduleSuccess(f *framework.F) { 151 152 jobID := "test-reschedule-success-" + uuid.Generate()[0:8] 153 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_success.nomad")) 154 tc.jobIds = append(tc.jobIds, jobID) 155 156 f.NoError( 157 e2e.WaitForAllocStatusComparison( 158 func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, 159 func(got []string) bool { 160 for _, status := range got { 161 if status == "running" { 162 return true 163 } 164 } 165 return false 166 }, nil, 167 ), 168 "should have at least 1 running alloc", 169 ) 170 } 171 172 // TestRescheduleWithUpdate updates a running job to fail, and verifies that 173 // it gets rescheduled 174 func (tc *RescheduleE2ETest) TestRescheduleWithUpdate(f *framework.F) { 175 176 jobID := "test-reschedule-update-" + uuid.Generate()[0:8] 177 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_update.nomad")) 178 tc.jobIds = append(tc.jobIds, jobID) 179 180 expected := []string{"running", "running", "running"} 181 f.NoError( 182 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 183 "should have exactly 3 running allocs", 184 ) 185 186 // reschedule to make fail 187 job, err := jobspec.ParseFile("rescheduling/input/rescheduling_update.nomad") 188 f.NoError(err) 189 job.ID = &jobID 190 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} 191 _, _, err = tc.Nomad().Jobs().Register(job, nil) 192 f.NoError(err, "could not register updated job") 193 194 f.NoError( 195 e2e.WaitForAllocStatusComparison( 196 func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) }, 197 func(got []string) bool { return len(got) > 0 }, nil, 198 ), 199 "should have rescheduled allocs until progress deadline", 200 ) 201 } 202 203 // TestRescheduleWithCanary updates a running job to fail, and verify that the 204 // canary gets rescheduled 205 func (tc *RescheduleE2ETest) TestRescheduleWithCanary(f *framework.F) { 206 207 jobID := "test-reschedule-canary-" + uuid.Generate()[0:8] 208 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary.nomad")) 209 tc.jobIds = append(tc.jobIds, jobID) 210 211 expected := []string{"running", "running", "running"} 212 f.NoError( 213 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 214 "should have exactly 3 running allocs", 215 ) 216 217 f.NoError( 218 e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), 219 "deployment should be successful") 220 221 // reschedule to make fail 222 job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary.nomad") 223 f.NoError(err) 224 job.ID = &jobID 225 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} 226 _, _, err = tc.Nomad().Jobs().Register(job, nil) 227 f.NoError(err, "could not register updated job") 228 229 f.NoError( 230 e2e.WaitForAllocStatusComparison( 231 func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) }, 232 func(got []string) bool { return len(got) > 0 }, nil, 233 ), 234 "should have rescheduled allocs until progress deadline", 235 ) 236 237 f.NoError( 238 e2e.WaitForLastDeploymentStatus(jobID, ns, "running", nil), 239 "deployment should be running") 240 } 241 242 // TestRescheduleWithCanaryAutoRevert updates a running job to fail, and 243 // verifies that the job gets reverted. 244 func (tc *RescheduleE2ETest) TestRescheduleWithCanaryAutoRevert(f *framework.F) { 245 246 jobID := "test-reschedule-canary-revert-" + uuid.Generate()[0:8] 247 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary_autorevert.nomad")) 248 tc.jobIds = append(tc.jobIds, jobID) 249 250 expected := []string{"running", "running", "running"} 251 f.NoError( 252 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 253 "should have exactly 3 running allocs", 254 ) 255 256 f.NoError( 257 e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), 258 "deployment should be successful") 259 260 // reschedule to make fail 261 job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary_autorevert.nomad") 262 f.NoError(err) 263 job.ID = &jobID 264 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} 265 _, _, err = tc.Nomad().Jobs().Register(job, nil) 266 f.NoError(err, "could not register updated job") 267 268 f.NoError( 269 e2e.WaitForAllocStatusComparison( 270 func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) }, 271 func(got []string) bool { return len(got) > 0 }, nil, 272 ), 273 "should have new allocs after update", 274 ) 275 276 // then we'll fail and revert 277 expected = []string{"failed", "failed", "failed", "running", "running", "running"} 278 f.NoError( 279 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 280 "should have exactly 3 running reverted allocs", 281 ) 282 283 f.NoError( 284 e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), 285 "deployment should be successful") 286 } 287 288 // TestRescheduleMaxParallel updates a job with a max_parallel config 289 func (tc *RescheduleE2ETest) TestRescheduleMaxParallel(f *framework.F) { 290 291 jobID := "test-reschedule-maxp-" + uuid.Generate()[0:8] 292 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp.nomad")) 293 tc.jobIds = append(tc.jobIds, jobID) 294 295 expected := []string{"running", "running", "running"} 296 f.NoError( 297 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 298 "should have exactly 3 running allocs", 299 ) 300 301 f.NoError( 302 e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), 303 "deployment should be successful") 304 305 // reschedule to make fail 306 job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp.nomad") 307 f.NoError(err) 308 job.ID = &jobID 309 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} 310 _, _, err = tc.Nomad().Jobs().Register(job, nil) 311 f.NoError(err, "could not register updated job") 312 313 expected = []string{"complete", "failed", "failed", "running", "running"} 314 315 f.NoError( 316 e2e.WaitForAllocStatusComparison( 317 func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, 318 func(got []string) bool { 319 sort.Strings(got) 320 return reflect.DeepEqual(got, expected) 321 }, nil, 322 ), 323 "should have failed allocs including rescheduled failed allocs", 324 ) 325 326 f.NoError( 327 e2e.WaitForLastDeploymentStatus(jobID, ns, "running", nil), 328 "deployment should be running") 329 } 330 331 // TestRescheduleMaxParallelAutoRevert updates a job with a max_parallel 332 // config that will autorevert on failure 333 func (tc *RescheduleE2ETest) TestRescheduleMaxParallelAutoRevert(f *framework.F) { 334 335 jobID := "test-reschedule-maxp-revert-" + uuid.Generate()[0:8] 336 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp_autorevert.nomad")) 337 tc.jobIds = append(tc.jobIds, jobID) 338 339 expected := []string{"running", "running", "running"} 340 f.NoError( 341 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 342 "should have exactly 3 running allocs", 343 ) 344 345 f.NoError( 346 e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), 347 "deployment should be successful") 348 349 // reschedule to make fail 350 job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp_autorevert.nomad") 351 f.NoError(err) 352 job.ID = &jobID 353 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} 354 _, _, err = tc.Nomad().Jobs().Register(job, nil) 355 f.NoError(err, "could not e2e.Register updated job") 356 357 f.NoError( 358 e2e.WaitForAllocStatusComparison( 359 func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) }, 360 func(got []string) bool { return len(got) > 0 }, nil, 361 ), 362 "should have new allocs after update", 363 ) 364 365 // wait for the revert 366 expected = []string{"complete", "failed", "running", "running", "running"} 367 f.NoError( 368 e2e.WaitForAllocStatusComparison( 369 func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, 370 func(got []string) bool { 371 sort.Strings(got) 372 return reflect.DeepEqual(got, expected) 373 }, nil, 374 ), 375 "should have one successful, one failed, and 3 reverted allocs", 376 ) 377 378 // at this point the allocs have been checked but we need to wait for the 379 // deployment to be marked complete before we can assert that it's successful 380 // and verify the count of deployments 381 f.NoError( 382 e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), 383 "most recent deployment should be successful") 384 385 out, err := e2e.Command("nomad", "deployment", "status") 386 f.NoError(err, "could not get deployment status") 387 388 results, err := e2e.ParseColumns(out) 389 f.NoError(err, "could not parse deployment status") 390 statuses := map[string]int{} 391 for _, row := range results { 392 if row["Job ID"] == jobID { 393 statuses[row["Status"]]++ 394 } 395 } 396 397 f.Equal(1, statuses["failed"], 398 fmt.Sprintf("expected only 1 failed deployment, got:\n%s", out)) 399 f.Equal(2, statuses["successful"], 400 fmt.Sprintf("expected 2 successful deployments, got:\n%s", out)) 401 } 402 403 // TestRescheduleProgressDeadline verifies the progress deadline is reset with 404 // each healthy allocation, and that a rescheduled allocation does not. 405 func (tc *RescheduleE2ETest) TestRescheduleProgressDeadline(f *framework.F) { 406 407 jobID := "test-reschedule-deadline-" + uuid.Generate()[0:8] 408 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline.nomad")) 409 tc.jobIds = append(tc.jobIds, jobID) 410 411 expected := []string{"running"} 412 f.NoError( 413 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 414 "should have a running allocation", 415 ) 416 417 deploymentID, err := e2e.LastDeploymentID(jobID, ns) 418 f.NoError(err, "couldn't look up deployment") 419 420 oldDeadline, err := getProgressDeadline(deploymentID) 421 f.NoError(err, "could not get progress deadline") 422 time.Sleep(time.Second * 20) 423 424 newDeadline, err := getProgressDeadline(deploymentID) 425 f.NoError(err, "could not get new progress deadline") 426 f.NotEqual(oldDeadline, newDeadline, "progress deadline should have been updated") 427 428 f.NoError(e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), 429 "deployment should be successful") 430 } 431 432 // TestRescheduleProgressDeadlineFail verifies the progress deadline is reset with 433 // each healthy allocation, and that a rescheduled allocation does not. 434 func (tc *RescheduleE2ETest) TestRescheduleProgressDeadlineFail(f *framework.F) { 435 436 jobID := "test-reschedule-deadline-fail" + uuid.Generate()[0:8] 437 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline_fail.nomad")) 438 tc.jobIds = append(tc.jobIds, jobID) 439 440 testutil.WaitForResult(func() (bool, error) { 441 _, err := e2e.LastDeploymentID(jobID, ns) 442 return err == nil, err 443 }, func(err error) { 444 f.NoError(err, "deployment wasn't created yet") 445 }) 446 447 deploymentID, err := e2e.LastDeploymentID(jobID, ns) 448 f.NoError(err, "couldn't look up deployment") 449 450 oldDeadline, err := getProgressDeadline(deploymentID) 451 f.NoError(err, "could not get progress deadline") 452 time.Sleep(time.Second * 20) 453 454 f.NoError(e2e.WaitForLastDeploymentStatus(jobID, ns, "failed", nil), 455 "deployment should be failed") 456 457 f.NoError( 458 e2e.WaitForAllocStatusComparison( 459 func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, 460 func(got []string) bool { 461 for _, status := range got { 462 if status != "failed" { 463 return false 464 } 465 } 466 return true 467 }, nil, 468 ), 469 "should have only failed allocs", 470 ) 471 472 newDeadline, err := getProgressDeadline(deploymentID) 473 f.NoError(err, "could not get new progress deadline") 474 f.Equal(oldDeadline, newDeadline, "progress deadline should not have been updated") 475 } 476 477 func getProgressDeadline(deploymentID string) (time.Time, error) { 478 479 out, err := e2e.Command("nomad", "deployment", "status", deploymentID) 480 if err != nil { 481 return time.Time{}, fmt.Errorf("could not get deployment status: %v\n%v", err, out) 482 } 483 484 section, err := e2e.GetSection(out, "Deployed") 485 if err != nil { 486 return time.Time{}, fmt.Errorf("could not find Deployed section: %w", err) 487 } 488 489 rows, err := e2e.ParseColumns(section) 490 if err != nil { 491 return time.Time{}, fmt.Errorf("could not parse Deployed section: %w", err) 492 } 493 494 layout := "2006-01-02T15:04:05Z07:00" // taken from command/helpers.go 495 raw := rows[0]["Progress Deadline"] 496 return time.Parse(layout, raw) 497 }