github.com/Ilhicas/nomad@v1.0.4-0.20210304152020-e86851182bc3/e2e/rescheduling/rescheduling.go (about) 1 package rescheduling 2 3 import ( 4 "fmt" 5 "os" 6 "reflect" 7 "sort" 8 "time" 9 10 e2e "github.com/hashicorp/nomad/e2e/e2eutil" 11 "github.com/hashicorp/nomad/e2e/framework" 12 "github.com/hashicorp/nomad/helper/uuid" 13 "github.com/hashicorp/nomad/jobspec" 14 ) 15 16 const ns = "" 17 18 type RescheduleE2ETest struct { 19 framework.TC 20 jobIds []string 21 } 22 23 func init() { 24 framework.AddSuites(&framework.TestSuite{ 25 Component: "Rescheduling", 26 CanRunLocal: true, 27 Consul: true, 28 Cases: []framework.TestCase{ 29 new(RescheduleE2ETest), 30 }, 31 }) 32 33 } 34 35 func (tc *RescheduleE2ETest) BeforeAll(f *framework.F) { 36 e2e.WaitForLeader(f.T(), tc.Nomad()) 37 e2e.WaitForNodesReady(f.T(), tc.Nomad(), 1) 38 } 39 40 func (tc *RescheduleE2ETest) AfterEach(f *framework.F) { 41 if os.Getenv("NOMAD_TEST_SKIPCLEANUP") == "1" { 42 return 43 } 44 45 for _, id := range tc.jobIds { 46 _, err := e2e.Command("nomad", "job", "stop", "-purge", id) 47 f.Assert().NoError(err) 48 } 49 tc.jobIds = []string{} 50 _, err := e2e.Command("nomad", "system", "gc") 51 f.Assert().NoError(err) 52 } 53 54 // TestNoReschedule runs a job that should fail and never reschedule 55 func (tc *RescheduleE2ETest) TestNoReschedule(f *framework.F) { 56 jobID := "test-no-reschedule-" + uuid.Generate()[0:8] 57 f.NoError(e2e.Register(jobID, "rescheduling/input/norescheduling.nomad")) 58 tc.jobIds = append(tc.jobIds, jobID) 59 60 expected := []string{"failed", "failed", "failed"} 61 f.NoError( 62 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 63 "should have exactly 3 failed allocs", 64 ) 65 } 66 67 // TestNoRescheduleSystem runs a system job that should fail and never reschedule 68 func (tc *RescheduleE2ETest) TestNoRescheduleSystem(f *framework.F) { 69 jobID := "test-reschedule-system-" + uuid.Generate()[0:8] 70 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_system.nomad")) 71 tc.jobIds = append(tc.jobIds, jobID) 72 73 f.NoError( 74 e2e.WaitForAllocStatusComparison( 75 func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, 76 func(got []string) bool { 77 for _, status := range got { 78 if status != "failed" { 79 return false 80 } 81 } 82 return true 83 }, nil, 84 ), 85 "should have only failed allocs", 86 ) 87 } 88 89 // TestDefaultReschedule runs a job that should reschedule after delay 90 func (tc *RescheduleE2ETest) TestDefaultReschedule(f *framework.F) { 91 92 jobID := "test-default-reschedule-" + uuid.Generate()[0:8] 93 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_default.nomad")) 94 tc.jobIds = append(tc.jobIds, jobID) 95 96 expected := []string{"failed", "failed", "failed"} 97 f.NoError( 98 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 99 "should have exactly 3 failed allocs", 100 ) 101 102 // TODO(tgross): return early if "slow" isn't set 103 // wait until first exponential delay kicks in and rescheduling is attempted 104 time.Sleep(time.Second * 35) 105 expected = []string{"failed", "failed", "failed", "failed", "failed", "failed"} 106 f.NoError( 107 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 108 "should have exactly 6 failed allocs after 35s", 109 ) 110 } 111 112 // TestRescheduleMaxAttempts runs a job with a maximum reschedule attempts 113 func (tc *RescheduleE2ETest) TestRescheduleMaxAttempts(f *framework.F) { 114 115 jobID := "test-reschedule-fail-" + uuid.Generate()[0:8] 116 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_fail.nomad")) 117 tc.jobIds = append(tc.jobIds, jobID) 118 119 expected := []string{"failed", "failed", "failed"} 120 f.NoError( 121 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 122 "should have exactly 3 failed allocs", 123 ) 124 125 job, err := jobspec.ParseFile("rescheduling/input/rescheduling_fail.nomad") 126 f.NoError(err) 127 job.ID = &jobID 128 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "sleep 15000"} 129 _, _, err = tc.Nomad().Jobs().Register(job, nil) 130 f.NoError(err, "could not register updated job") 131 132 f.NoError( 133 e2e.WaitForAllocStatusComparison( 134 func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, 135 func(got []string) bool { 136 for _, status := range got { 137 if status == "running" { 138 return true 139 } 140 } 141 return false 142 }, nil, 143 ), 144 "should have at least 1 running alloc", 145 ) 146 } 147 148 // TestRescheduleSuccess runs a job that should be running after rescheduling 149 func (tc *RescheduleE2ETest) TestRescheduleSuccess(f *framework.F) { 150 151 jobID := "test-reschedule-success-" + uuid.Generate()[0:8] 152 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_success.nomad")) 153 tc.jobIds = append(tc.jobIds, jobID) 154 155 f.NoError( 156 e2e.WaitForAllocStatusComparison( 157 func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, 158 func(got []string) bool { 159 for _, status := range got { 160 if status == "running" { 161 return true 162 } 163 } 164 return false 165 }, nil, 166 ), 167 "should have at least 1 running alloc", 168 ) 169 } 170 171 // TestRescheduleWithUpdate updates a running job to fail, and verifies that 172 // it gets rescheduled 173 func (tc *RescheduleE2ETest) TestRescheduleWithUpdate(f *framework.F) { 174 175 jobID := "test-reschedule-update-" + uuid.Generate()[0:8] 176 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_update.nomad")) 177 tc.jobIds = append(tc.jobIds, jobID) 178 179 expected := []string{"running", "running", "running"} 180 f.NoError( 181 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 182 "should have exactly 3 running allocs", 183 ) 184 185 // reschedule to make fail 186 job, err := jobspec.ParseFile("rescheduling/input/rescheduling_update.nomad") 187 f.NoError(err) 188 job.ID = &jobID 189 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} 190 _, _, err = tc.Nomad().Jobs().Register(job, nil) 191 f.NoError(err, "could not register updated job") 192 193 f.NoError( 194 e2e.WaitForAllocStatusComparison( 195 func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) }, 196 func(got []string) bool { return len(got) > 0 }, nil, 197 ), 198 "should have rescheduled allocs until progress deadline", 199 ) 200 } 201 202 // TestRescheduleWithCanary updates a running job to fail, and verify that the 203 // canary gets rescheduled 204 func (tc *RescheduleE2ETest) TestRescheduleWithCanary(f *framework.F) { 205 206 jobID := "test-reschedule-canary-" + uuid.Generate()[0:8] 207 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary.nomad")) 208 tc.jobIds = append(tc.jobIds, jobID) 209 210 expected := []string{"running", "running", "running"} 211 f.NoError( 212 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 213 "should have exactly 3 running allocs", 214 ) 215 216 f.NoError( 217 e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), 218 "deployment should be successful") 219 220 // reschedule to make fail 221 job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary.nomad") 222 f.NoError(err) 223 job.ID = &jobID 224 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} 225 _, _, err = tc.Nomad().Jobs().Register(job, nil) 226 f.NoError(err, "could not register updated job") 227 228 f.NoError( 229 e2e.WaitForAllocStatusComparison( 230 func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) }, 231 func(got []string) bool { return len(got) > 0 }, nil, 232 ), 233 "should have rescheduled allocs until progress deadline", 234 ) 235 236 f.NoError( 237 e2e.WaitForLastDeploymentStatus(jobID, ns, "running", nil), 238 "deployment should be running") 239 } 240 241 // TestRescheduleWithCanary updates a running job to fail, and verifies that 242 // the job gets reverted 243 func (tc *RescheduleE2ETest) TestRescheduleWithCanaryAutoRevert(f *framework.F) { 244 245 jobID := "test-reschedule-canary-revert-" + uuid.Generate()[0:8] 246 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_canary_autorevert.nomad")) 247 tc.jobIds = append(tc.jobIds, jobID) 248 249 expected := []string{"running", "running", "running"} 250 f.NoError( 251 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 252 "should have exactly 3 running allocs", 253 ) 254 255 f.NoError( 256 e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), 257 "deployment should be successful") 258 259 // reschedule to make fail 260 job, err := jobspec.ParseFile("rescheduling/input/rescheduling_canary_autorevert.nomad") 261 f.NoError(err) 262 job.ID = &jobID 263 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} 264 _, _, err = tc.Nomad().Jobs().Register(job, nil) 265 f.NoError(err, "could not register updated job") 266 267 f.NoError( 268 e2e.WaitForAllocStatusComparison( 269 func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) }, 270 func(got []string) bool { return len(got) > 0 }, nil, 271 ), 272 "should have new allocs after update", 273 ) 274 275 // then we'll fail and revert 276 expected = []string{"failed", "failed", "failed", "running", "running", "running"} 277 f.NoError( 278 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 279 "should have exactly 3 running reverted allocs", 280 ) 281 282 f.NoError( 283 e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), 284 "deployment should be successful") 285 } 286 287 // TestRescheduleMaxParallel updates a job with a max_parallel config 288 func (tc *RescheduleE2ETest) TestRescheduleMaxParallel(f *framework.F) { 289 290 jobID := "test-reschedule-maxp-" + uuid.Generate()[0:8] 291 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp.nomad")) 292 tc.jobIds = append(tc.jobIds, jobID) 293 294 expected := []string{"running", "running", "running"} 295 f.NoError( 296 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 297 "should have exactly 3 running allocs", 298 ) 299 300 f.NoError( 301 e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), 302 "deployment should be successful") 303 304 // reschedule to make fail 305 job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp.nomad") 306 f.NoError(err) 307 job.ID = &jobID 308 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} 309 _, _, err = tc.Nomad().Jobs().Register(job, nil) 310 f.NoError(err, "could not register updated job") 311 312 expected = []string{"complete", "failed", "failed", "running", "running"} 313 314 f.NoError( 315 e2e.WaitForAllocStatusComparison( 316 func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, 317 func(got []string) bool { 318 sort.Strings(got) 319 return reflect.DeepEqual(got, expected) 320 }, nil, 321 ), 322 "should have failed allocs including rescheduled failed allocs", 323 ) 324 325 f.NoError( 326 e2e.WaitForLastDeploymentStatus(jobID, ns, "running", nil), 327 "deployment should be running") 328 } 329 330 // TestRescheduleMaxParallelAutoRevert updates a job with a max_parallel 331 // config that will autorevert on failure 332 func (tc *RescheduleE2ETest) TestRescheduleMaxParallelAutoRevert(f *framework.F) { 333 334 jobID := "test-reschedule-maxp-revert-" + uuid.Generate()[0:8] 335 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_maxp_autorevert.nomad")) 336 tc.jobIds = append(tc.jobIds, jobID) 337 338 expected := []string{"running", "running", "running"} 339 f.NoError( 340 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 341 "should have exactly 3 running allocs", 342 ) 343 344 f.NoError( 345 e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), 346 "deployment should be successful") 347 348 // reschedule to make fail 349 job, err := jobspec.ParseFile("rescheduling/input/rescheduling_maxp_autorevert.nomad") 350 f.NoError(err) 351 job.ID = &jobID 352 job.TaskGroups[0].Tasks[0].Config["args"] = []string{"-c", "lol"} 353 _, _, err = tc.Nomad().Jobs().Register(job, nil) 354 f.NoError(err, "could not e2e.Register updated job") 355 356 f.NoError( 357 e2e.WaitForAllocStatusComparison( 358 func() ([]string, error) { return e2e.AllocStatusesRescheduled(jobID, ns) }, 359 func(got []string) bool { return len(got) > 0 }, nil, 360 ), 361 "should have new allocs after update", 362 ) 363 364 // wait for the revert 365 expected = []string{"complete", "failed", "running", "running", "running"} 366 f.NoError( 367 e2e.WaitForAllocStatusComparison( 368 func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, 369 func(got []string) bool { 370 sort.Strings(got) 371 return reflect.DeepEqual(got, expected) 372 }, nil, 373 ), 374 "should have one successful, one failed, and 3 reverted allocs", 375 ) 376 377 // at this point the allocs have been checked but we need to wait for the 378 // deployment to be marked complete before we can assert that it's successful 379 // and verify the count of deployments 380 f.NoError( 381 e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), 382 "most recent deployment should be successful") 383 384 out, err := e2e.Command("nomad", "deployment", "status") 385 f.NoError(err, "could not get deployment status") 386 387 results, err := e2e.ParseColumns(out) 388 f.NoError(err, "could not parse deployment status") 389 statuses := map[string]int{} 390 for _, row := range results { 391 if row["Job ID"] == jobID { 392 statuses[row["Status"]]++ 393 } 394 } 395 396 f.Equal(1, statuses["failed"], 397 fmt.Sprintf("expected only 1 failed deployment, got:\n%s", out)) 398 f.Equal(2, statuses["successful"], 399 fmt.Sprintf("expected 2 successful deployments, got:\n%s", out)) 400 } 401 402 // TestRescheduleProgressDeadline verifies the progress deadline is reset with 403 // each healthy allocation, and that a rescheduled allocation does not. 404 func (tc *RescheduleE2ETest) TestRescheduleProgressDeadline(f *framework.F) { 405 406 jobID := "test-reschedule-deadline-" + uuid.Generate()[0:8] 407 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline.nomad")) 408 tc.jobIds = append(tc.jobIds, jobID) 409 410 expected := []string{"running"} 411 f.NoError( 412 e2e.WaitForAllocStatusExpected(jobID, ns, expected), 413 "should have a running allocation", 414 ) 415 416 deploymentID, err := e2e.LastDeploymentID(jobID, ns) 417 f.NoError(err, "couldn't look up deployment") 418 419 oldDeadline, err := getProgressDeadline(deploymentID) 420 f.NoError(err, "could not get progress deadline") 421 time.Sleep(time.Second * 20) 422 423 newDeadline, err := getProgressDeadline(deploymentID) 424 f.NoError(err, "could not get new progress deadline") 425 f.NotEqual(oldDeadline, newDeadline, "progress deadline should have been updated") 426 427 f.NoError(e2e.WaitForLastDeploymentStatus(jobID, ns, "successful", nil), 428 "deployment should be successful") 429 } 430 431 // TestRescheduleProgressDeadlineFail verifies the progress deadline is reset with 432 // each healthy allocation, and that a rescheduled allocation does not. 433 func (tc *RescheduleE2ETest) TestRescheduleProgressDeadlineFail(f *framework.F) { 434 435 jobID := "test-reschedule-deadline-fail" + uuid.Generate()[0:8] 436 f.NoError(e2e.Register(jobID, "rescheduling/input/rescheduling_progressdeadline_fail.nomad")) 437 tc.jobIds = append(tc.jobIds, jobID) 438 439 deploymentID, err := e2e.LastDeploymentID(jobID, ns) 440 f.NoError(err, "couldn't look up deployment") 441 442 oldDeadline, err := getProgressDeadline(deploymentID) 443 f.NoError(err, "could not get progress deadline") 444 time.Sleep(time.Second * 20) 445 446 f.NoError(e2e.WaitForLastDeploymentStatus(jobID, ns, "failed", nil), 447 "deployment should be failed") 448 449 f.NoError( 450 e2e.WaitForAllocStatusComparison( 451 func() ([]string, error) { return e2e.AllocStatuses(jobID, ns) }, 452 func(got []string) bool { 453 for _, status := range got { 454 if status != "failed" { 455 return false 456 } 457 } 458 return true 459 }, nil, 460 ), 461 "should have only failed allocs", 462 ) 463 464 newDeadline, err := getProgressDeadline(deploymentID) 465 f.NoError(err, "could not get new progress deadline") 466 f.Equal(oldDeadline, newDeadline, "progress deadline should not have been updated") 467 } 468 469 func getProgressDeadline(deploymentID string) (time.Time, error) { 470 471 out, err := e2e.Command("nomad", "deployment", "status", deploymentID) 472 if err != nil { 473 return time.Time{}, fmt.Errorf("could not get deployment status: %v\n%v", err, out) 474 } 475 476 section, err := e2e.GetSection(out, "Deployed") 477 if err != nil { 478 return time.Time{}, fmt.Errorf("could not find Deployed section: %w", err) 479 } 480 481 rows, err := e2e.ParseColumns(section) 482 if err != nil { 483 return time.Time{}, fmt.Errorf("could not parse Deployed section: %w", err) 484 } 485 486 layout := "2006-01-02T15:04:05Z07:00" // taken from command/helpers.go 487 raw := rows[0]["Progress Deadline"] 488 return time.Parse(layout, raw) 489 }