github.com/hernad/nomad@v1.6.112/e2e/rescheduling/rescheduling.go (about) 1 // Copyright (c) HashiCorp, Inc. 2 // SPDX-License-Identifier: MPL-2.0 3 4 package rescheduling 5 6 import ( 7 "fmt" 8 "os" 9 "reflect" 10 "sort" 11 "time" 12 13 e2e "github.com/hernad/nomad/e2e/e2eutil" 14 "github.com/hernad/nomad/e2e/framework" 15 "github.com/hernad/nomad/helper/uuid" 16 "github.com/hernad/nomad/jobspec" 17 "github.com/hernad/nomad/testutil" 18 ) 19 20 const ns = "" 21 22 type RescheduleE2ETest struct { 23 framework.TC 24 jobIds []string 25 } 26 27 func init() { 28 framework.AddSuites(&framework.TestSuite{ 29 Component: "Rescheduling", 30 CanRunLocal: true, 31 Consul: true, 32 Cases: []framework.TestCase{ 33 new(RescheduleE2ETest), 34 }, 35 }) 36 37 } 38 39 func (tc *RescheduleE2ETest) BeforeAll(f *framework.F) { 40 e2e.WaitForLeader(f.T(), tc.Nomad()) 41 e2e.WaitForNodesReady(f.T(), tc.Nomad(), 1) 42 } 43 44 func (tc *RescheduleE2ETest) AfterEach(f *framework.F) { 45 if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" { 46 return 47 } 48 49 for _, id := range tc.jobIds { 50 err := e2e.StopJob(id, "-purge") 51 f.Assert().NoError(err) 52 } 53 tc.jobIds = []string{} 54 _, err := e2e.Command("nomad", "system", "gc") 55 f.Assert().NoError(err) 56 } 57 58 // TestNoReschedule runs a job that should fail and never reschedule 59 func (tc *RescheduleE2ETest) TestNoReschedule(f *framework.F) { 60 jobID := "test-no-reschedule-" + uuid.Generate()[0:8] 61 f.NoError(e2e.Register(jobID, "rescheduling/input/norescheduling.nomad")) 62 tc.jobIds = append(tc.jobIds, jobID) 63 64 expected := []string{"failed", "failed", "failed"} 65 f.NoError( 66 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 67 "should have exactly 3 failed allocs", 68 ) 69 } 70 71 // TestNoRescheduleSystem runs a system job that should fail and never reschedule 72 func (tc *RescheduleE2ETest) TestNoRescheduleSystem(f *framework.F) { 73 jobID := "test-reschedule-system-" + uuid.Generate()[0:8] 74 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_system.nomad")) 75 tc.jobIds = append(tc.jobIds, jobID) 76 77 f.NoError( 78 e2e.WaitForAllocStatusComparison( 79 func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, 80 func(got []string) bool { 81 for _, status := range got { 82 if status != "failed" { 83 return false 84 } 85 } 86 return true 87 }, nil, 88 ), 89 "should have only failed allocs", 90 ) 91 } 92 93 // TestDefaultReschedule runs a job that should reschedule after delay 94 func (tc *RescheduleE2ETest) TestDefaultReschedule(f *framework.F) { 95 96 jobID := "test-default-reschedule-" + uuid.Generate()[0:8] 97 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_default.nomad")) 98 tc.jobIds = append(tc.jobIds, jobID) 99 100 expected := []string{"failed", "failed", "failed"} 101 f.NoError( 102 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 103 "should have exactly 3 failed allocs", 104 ) 105 106 // TODO(tgross): return early if "slow" isn't set 107 // wait until first exponential delay kicks in and rescheduling is attempted 108 time.Sleep(time.Second * 35) 109 expected = []string{"failed", "failed", "failed", "failed", "failed", "failed"} 110 f.NoError( 111 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 112 "should have exactly 6 failed allocs after 35s", 113 ) 114 } 115 116 // TestRescheduleMaxAttempts runs a job with a maximum reschedule attempts 117 func (tc *RescheduleE2ETest) TestRescheduleMaxAttempts(f *framework.F) { 118 119 jobID := "test-reschedule-fail-" + uuid.Generate()[0:8] 120 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_fail.nomad")) 121 tc.jobIds = append(tc.jobIds, jobID) 122 123 expected := []string{"failed", "failed", "failed"} 124 f.NoError( 125 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 126 "should have exactly 3 failed allocs", 127 ) 128 129 job, err := jobspec.ParseFile("rescheduling/input/rescheduling_fail.nomad") 130 f.NoError(err) 131 job.ID = &jobID 132 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "sleep 15000"} 133 _, _, err = tc.Nomad().Jobs().Register(job, nil) 134 f.NoError(err, "could not register updated job") 135 136 f.NoError( 137 e2e.WaitForAllocStatusComparison( 138 func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, 139 func(got []string) bool { 140 for _, status := range got { 141 if status == "running" { 142 return true 143 } 144 } 145 return false 146 }, nil, 147 ), 148 "should have at least 1 running alloc", 149 ) 150 } 151 152 // TestRescheduleSuccess runs a job that should be running after rescheduling 153 func (tc *RescheduleE2ETest) TestRescheduleSuccess(f *framework.F) { 154 155 jobID := "test-reschedule-success-" + uuid.Generate()[0:8] 156 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_success.nomad")) 157 tc.jobIds = append(tc.jobIds, jobID) 158 159 f.NoError( 160 e2e.WaitForAllocStatusComparison( 161 func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, 162 func(got []string) bool { 163 for _, status := range got { 164 if status == "running" { 165 return true 166 } 167 } 168 return false 169 }, nil, 170 ), 171 "should have at least 1 running alloc", 172 ) 173 } 174 175 // TestRescheduleWithUpdate updates a running job to fail, and verifies that 176 // it gets rescheduled 177 func (tc *RescheduleE2ETest) TestRescheduleWithUpdate(f *framework.F) { 178 179 jobID := "test-reschedule-update-" + uuid.Generate()[0:8] 180 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_update.nomad")) 181 tc.jobIds = append(tc.jobIds, jobID) 182 183 expected := []string{"running", "running", "running"} 184 f.NoError( 185 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 186 "should have exactly 3 running allocs", 187 ) 188 189 // reschedule to make fail 190 job, err := jobspec.ParseFile("rescheduling/input/rescheduling_update.nomad") 191 f.NoError(err) 192 job.ID = &jobID 193 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} 194 _, _, err = tc.Nomad().Jobs().Register(job, nil) 195 f.NoError(err, "could not register updated job") 196 197 f.NoError( 198 e2e.WaitForAllocStatusComparison( 199 func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) }, 200 func(got []string) bool { return len(got) > 0 }, nil, 201 ), 202 "should have rescheduled allocs until progress deadline", 203 ) 204 } 205 206 // TestRescheduleWithCanary updates a running job to fail, and verify that the 207 // canary gets rescheduled 208 func (tc *RescheduleE2ETest) TestRescheduleWithCanary(f *framework.F) { 209 210 jobID := "test-reschedule-canary-" + uuid.Generate()[0:8] 211 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary.nomad")) 212 tc.jobIds = append(tc.jobIds, jobID) 213 214 expected := []string{"running", "running", "running"} 215 f.NoError( 216 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 217 "should have exactly 3 running allocs", 218 ) 219 220 f.NoError( 221 e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), 222 "deployment should be successful") 223 224 // reschedule to make fail 225 job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary.nomad") 226 f.NoError(err) 227 job.ID = &jobID 228 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} 229 _, _, err = tc.Nomad().Jobs().Register(job, nil) 230 f.NoError(err, "could not register updated job") 231 232 f.NoError( 233 e2e.WaitForAllocStatusComparison( 234 func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) }, 235 func(got []string) bool { return len(got) > 0 }, nil, 236 ), 237 "should have rescheduled allocs until progress deadline", 238 ) 239 240 f.NoError( 241 e2e.WaitForLastDeploymentStatus(jobID, ns, "running", nil), 242 "deployment should be running") 243 } 244 245 // TestRescheduleWithCanaryAutoRevert updates a running job to fail, and 246 // verifies that the job gets reverted. 247 func (tc *RescheduleE2ETest) TestRescheduleWithCanaryAutoRevert(f *framework.F) { 248 249 jobID := "test-reschedule-canary-revert-" + uuid.Generate()[0:8] 250 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary_autorevert.nomad")) 251 tc.jobIds = append(tc.jobIds, jobID) 252 253 expected := []string{"running", "running", "running"} 254 f.NoError( 255 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 256 "should have exactly 3 running allocs", 257 ) 258 259 f.NoError( 260 e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), 261 "deployment should be successful") 262 263 // reschedule to make fail 264 job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary_autorevert.nomad") 265 f.NoError(err) 266 job.ID = &jobID 267 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} 268 _, _, err = tc.Nomad().Jobs().Register(job, nil) 269 f.NoError(err, "could not register updated job") 270 271 f.NoError( 272 e2e.WaitForAllocStatusComparison( 273 func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) }, 274 func(got []string) bool { return len(got) > 0 }, nil, 275 ), 276 "should have new allocs after update", 277 ) 278 279 // then we'll fail and revert 280 expected = []string{"failed", "failed", "failed", "running", "running", "running"} 281 f.NoError( 282 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 283 "should have exactly 3 running reverted allocs", 284 ) 285 286 f.NoError( 287 e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), 288 "deployment should be successful") 289 } 290 291 // TestRescheduleMaxParallel updates a job with a max_parallel config 292 func (tc *RescheduleE2ETest) TestRescheduleMaxParallel(f *framework.F) { 293 294 jobID := "test-reschedule-maxp-" + uuid.Generate()[0:8] 295 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp.nomad")) 296 tc.jobIds = append(tc.jobIds, jobID) 297 298 expected := []string{"running", "running", "running"} 299 f.NoError( 300 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 301 "should have exactly 3 running allocs", 302 ) 303 304 f.NoError( 305 e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), 306 "deployment should be successful") 307 308 // reschedule to make fail 309 job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp.nomad") 310 f.NoError(err) 311 job.ID = &jobID 312 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} 313 _, _, err = tc.Nomad().Jobs().Register(job, nil) 314 f.NoError(err, "could not register updated job") 315 316 expected = []string{"complete", "failed", "failed", "running", "running"} 317 318 f.NoError( 319 e2e.WaitForAllocStatusComparison( 320 func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, 321 func(got []string) bool { 322 sort.Strings(got) 323 return reflect.DeepEqual(got, expected) 324 }, nil, 325 ), 326 "should have failed allocs including rescheduled failed allocs", 327 ) 328 329 f.NoError( 330 e2e.WaitForLastDeploymentStatus(jobID, ns, "running", nil), 331 "deployment should be running") 332 } 333 334 // TestRescheduleMaxParallelAutoRevert updates a job with a max_parallel 335 // config that will autorevert on failure 336 func (tc *RescheduleE2ETest) TestRescheduleMaxParallelAutoRevert(f *framework.F) { 337 338 jobID := "test-reschedule-maxp-revert-" + uuid.Generate()[0:8] 339 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp_autorevert.nomad")) 340 tc.jobIds = append(tc.jobIds, jobID) 341 342 expected := []string{"running", "running", "running"} 343 f.NoError( 344 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 345 "should have exactly 3 running allocs", 346 ) 347 348 f.NoError( 349 e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), 350 "deployment should be successful") 351 352 // reschedule to make fail 353 job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp_autorevert.nomad") 354 f.NoError(err) 355 job.ID = &jobID 356 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} 357 _, _, err = tc.Nomad().Jobs().Register(job, nil) 358 f.NoError(err, "could not e2e.Register updated job") 359 360 f.NoError( 361 e2e.WaitForAllocStatusComparison( 362 func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) }, 363 func(got []string) bool { return len(got) > 0 }, nil, 364 ), 365 "should have new allocs after update", 366 ) 367 368 // wait for the revert 369 expected = []string{"complete", "failed", "running", "running", "running"} 370 f.NoError( 371 e2e.WaitForAllocStatusComparison( 372 func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, 373 func(got []string) bool { 374 sort.Strings(got) 375 return reflect.DeepEqual(got, expected) 376 }, nil, 377 ), 378 "should have one successful, one failed, and 3 reverted allocs", 379 ) 380 381 // at this point the allocs have been checked but we need to wait for the 382 // deployment to be marked complete before we can assert that it's successful 383 // and verify the count of deployments 384 f.NoError( 385 e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), 386 "most recent deployment should be successful") 387 388 out, err := e2e.Command("nomad", "deployment", "status") 389 f.NoError(err, "could not get deployment status") 390 391 results, err := e2e.ParseColumns(out) 392 f.NoError(err, "could not parse deployment status") 393 statuses := map[string]int{} 394 for _, row := range results { 395 if row["Job ID"] == jobID { 396 statuses[row["Status"]]++ 397 } 398 } 399 400 f.Equal(1, statuses["failed"], 401 fmt.Sprintf("expected only 1 failed deployment, got:\n%s", out)) 402 f.Equal(2, statuses["successful"], 403 fmt.Sprintf("expected 2 successful deployments, got:\n%s", out)) 404 } 405 406 // TestRescheduleProgressDeadline verifies the progress deadline is reset with 407 // each healthy allocation, and that a rescheduled allocation does not. 408 func (tc *RescheduleE2ETest) TestRescheduleProgressDeadline(f *framework.F) { 409 410 jobID := "test-reschedule-deadline-" + uuid.Generate()[0:8] 411 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline.nomad")) 412 tc.jobIds = append(tc.jobIds, jobID) 413 414 expected := []string{"running"} 415 f.NoError( 416 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 417 "should have a running allocation", 418 ) 419 420 deploymentID, err := e2e.LastDeploymentID(jobID, ns) 421 f.NoError(err, "couldn't look up deployment") 422 423 oldDeadline, err := getProgressDeadline(deploymentID) 424 f.NoError(err, "could not get progress deadline") 425 time.Sleep(time.Second * 20) 426 427 newDeadline, err := getProgressDeadline(deploymentID) 428 f.NoError(err, "could not get new progress deadline") 429 f.NotEqual(oldDeadline, newDeadline, "progress deadline should have been updated") 430 431 f.NoError(e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), 432 "deployment should be successful") 433 } 434 435 // TestRescheduleProgressDeadlineFail verifies the progress deadline is reset with 436 // each healthy allocation, and that a rescheduled allocation does not. 437 func (tc *RescheduleE2ETest) TestRescheduleProgressDeadlineFail(f *framework.F) { 438 439 jobID := "test-reschedule-deadline-fail" + uuid.Generate()[0:8] 440 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline_fail.nomad")) 441 tc.jobIds = append(tc.jobIds, jobID) 442 443 testutil.WaitForResult(func() (bool, error) { 444 _, err := e2e.LastDeploymentID(jobID, ns) 445 return err == nil, err 446 }, func(err error) { 447 f.NoError(err, "deployment wasn't created yet") 448 }) 449 450 deploymentID, err := e2e.LastDeploymentID(jobID, ns) 451 f.NoError(err, "couldn't look up deployment") 452 453 oldDeadline, err := getProgressDeadline(deploymentID) 454 f.NoError(err, "could not get progress deadline") 455 time.Sleep(time.Second * 20) 456 457 f.NoError(e2e.WaitForLastDeploymentStatus(jobID, ns, "failed", nil), 458 "deployment should be failed") 459 460 f.NoError( 461 e2e.WaitForAllocStatusComparison( 462 func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, 463 func(got []string) bool { 464 for _, status := range got { 465 if status != "failed" { 466 return false 467 } 468 } 469 return true 470 }, nil, 471 ), 472 "should have only failed allocs", 473 ) 474 475 newDeadline, err := getProgressDeadline(deploymentID) 476 f.NoError(err, "could not get new progress deadline") 477 f.Equal(oldDeadline, newDeadline, "progress deadline should not have been updated") 478 } 479 480 func getProgressDeadline(deploymentID string) (time.Time, error) { 481 482 out, err := e2e.Command("nomad", "deployment", "status", deploymentID) 483 if err != nil { 484 return time.Time{}, fmt.Errorf("could not get deployment status: %v\n%v", err, out) 485 } 486 487 section, err := e2e.GetSection(out, "Deployed") 488 if err != nil { 489 return time.Time{}, fmt.Errorf("could not find Deployed section: %w", err) 490 } 491 492 rows, err := e2e.ParseColumns(section) 493 if err != nil { 494 return time.Time{}, fmt.Errorf("could not parse Deployed section: %w", err) 495 } 496 497 layout := "2006-01-02T15:04:05Z07:00" // taken from command/helpers.go 498 raw := rows[0]["Progress Deadline"] 499 return time.Parse(layout, raw) 500 }