volcano.sh/volcano@v1.9.0/test/e2e/jobseq/job_error_handling.go (about) 1 /* 2 Copyright 2021 The Volcano Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package jobseq 18 19 import ( 20 "context" 21 "strconv" 22 23 . "github.com/onsi/ginkgo/v2" 24 . "github.com/onsi/gomega" 25 26 v1 "k8s.io/api/core/v1" 27 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 28 29 vcbatch "volcano.sh/apis/pkg/apis/batch/v1alpha1" 30 vcbus "volcano.sh/apis/pkg/apis/bus/v1alpha1" 31 32 jobctl "volcano.sh/volcano/pkg/controllers/job" 33 34 e2eutil "volcano.sh/volcano/test/e2e/util" 35 ) 36 37 var _ = Describe("Job Error Handling", func() { 38 It("job level LifecyclePolicy, Event: PodFailed; Action: RestartJob", func() { 39 By("init test context") 40 context := e2eutil.InitTestContext(e2eutil.Options{}) 41 defer e2eutil.CleanupTestContext(context) 42 43 By("create job") 44 job := e2eutil.CreateJob(context, &e2eutil.JobSpec{ 45 Name: "failed-restart-job", 46 Policies: []vcbatch.LifecyclePolicy{ 47 { 48 Action: vcbus.RestartJobAction, 49 Event: vcbus.PodFailedEvent, 50 }, 51 }, 52 Tasks: []e2eutil.TaskSpec{ 53 { 54 Name: "success", 55 Img: e2eutil.DefaultNginxImage, 56 Min: 2, 57 Rep: 2, 58 }, 59 { 60 Name: "fail", 61 Img: e2eutil.DefaultNginxImage, 62 Min: 2, 63 Rep: 2, 64 Command: "sleep 10s && xxx", 65 RestartPolicy: v1.RestartPolicyNever, 66 }, 67 }, 68 }) 69 70 // job phase: pending -> running -> restarting 71 err := e2eutil.WaitJobPhases(context, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running, vcbatch.Restarting}) 72 Expect(err).NotTo(HaveOccurred()) 73 }) 74 75 It("job level LifecyclePolicy, Event: PodFailed; Action: TerminateJob", func() { 76 By("init test context") 77 context := e2eutil.InitTestContext(e2eutil.Options{}) 78 defer e2eutil.CleanupTestContext(context) 79 80 By("create job") 81 job := e2eutil.CreateJob(context, &e2eutil.JobSpec{ 82 Name: "failed-terminate-job", 83 Policies: []vcbatch.LifecyclePolicy{ 84 { 85 Action: vcbus.TerminateJobAction, 86 Event: vcbus.PodFailedEvent, 87 }, 88 }, 89 Tasks: []e2eutil.TaskSpec{ 90 { 91 Name: "success", 92 Img: e2eutil.DefaultNginxImage, 93 Min: 2, 94 Rep: 2, 95 }, 96 { 97 Name: "fail", 98 Img: e2eutil.DefaultNginxImage, 99 Min: 2, 100 Rep: 2, 101 Command: "sleep 10s && xxx", 102 RestartPolicy: v1.RestartPolicyNever, 103 }, 104 }, 105 }) 106 107 // job phase: pending -> running -> Terminating -> Terminated 108 err := e2eutil.WaitJobPhases(context, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running, vcbatch.Terminating, vcbatch.Terminated}) 109 Expect(err).NotTo(HaveOccurred()) 110 }) 111 112 It("job level LifecyclePolicy, Event: PodFailed; Action: AbortJob", func() { 113 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 114 defer e2eutil.CleanupTestContext(ctx) 115 116 By("create job") 117 job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{ 118 Name: "failed-abort-job", 119 Policies: []vcbatch.LifecyclePolicy{ 120 { 121 Action: vcbus.AbortJobAction, 122 Event: vcbus.PodFailedEvent, 123 }, 124 }, 125 Tasks: []e2eutil.TaskSpec{ 126 { 127 Name: "success", 128 Img: e2eutil.DefaultNginxImage, 129 Min: 2, 130 Rep: 2, 131 }, 132 { 133 Name: "fail", 134 Img: e2eutil.DefaultNginxImage, 135 Min: 2, 136 Rep: 2, 137 Command: "sleep 10s && xxx", 138 RestartPolicy: v1.RestartPolicyNever, 139 }, 140 }, 141 }) 142 143 // job phase: pending -> running -> Aborting -> Aborted 144 err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running, vcbatch.Aborting, vcbatch.Aborted}) 145 Expect(err).NotTo(HaveOccurred()) 146 }) 147 148 It("job level LifecyclePolicy, Event: PodEvicted; Action: RestartJob", func() { 149 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 150 defer e2eutil.CleanupTestContext(ctx) 151 152 By("create job") 153 job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{ 154 Name: "evicted-restart-job", 155 Policies: []vcbatch.LifecyclePolicy{ 156 { 157 Action: vcbus.RestartJobAction, 158 Event: vcbus.PodEvictedEvent, 159 }, 160 }, 161 Tasks: []e2eutil.TaskSpec{ 162 { 163 Name: "success", 164 Img: e2eutil.DefaultNginxImage, 165 Min: 2, 166 Rep: 2, 167 }, 168 { 169 Name: "delete", 170 Img: e2eutil.DefaultNginxImage, 171 Min: 2, 172 Rep: 2, 173 }, 174 }, 175 }) 176 177 // job phase: pending -> running 178 err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running}) 179 Expect(err).NotTo(HaveOccurred()) 180 181 By("delete one pod of job") 182 podName := jobctl.MakePodName(job.Name, "delete", 0) 183 err = ctx.Kubeclient.CoreV1().Pods(job.Namespace).Delete(context.TODO(), podName, metav1.DeleteOptions{}) 184 Expect(err).NotTo(HaveOccurred()) 185 186 // job phase: Restarting -> Running 187 err = e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Restarting, vcbatch.Pending, vcbatch.Running}) 188 Expect(err).NotTo(HaveOccurred()) 189 }) 190 191 It("job level LifecyclePolicy, Event: PodEvicted; Action: TerminateJob", func() { 192 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 193 defer e2eutil.CleanupTestContext(ctx) 194 195 By("create job") 196 job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{ 197 Name: "evicted-terminate-job", 198 Policies: []vcbatch.LifecyclePolicy{ 199 { 200 Action: vcbus.TerminateJobAction, 201 Event: vcbus.PodEvictedEvent, 202 }, 203 }, 204 Tasks: []e2eutil.TaskSpec{ 205 { 206 Name: "success", 207 Img: e2eutil.DefaultNginxImage, 208 Min: 2, 209 Rep: 2, 210 }, 211 { 212 Name: "delete", 213 Img: e2eutil.DefaultNginxImage, 214 Min: 2, 215 Rep: 2, 216 }, 217 }, 218 }) 219 220 // job phase: pending -> running 221 err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running}) 222 Expect(err).NotTo(HaveOccurred()) 223 224 By("delete one pod of job") 225 podName := jobctl.MakePodName(job.Name, "delete", 0) 226 err = ctx.Kubeclient.CoreV1().Pods(job.Namespace).Delete(context.TODO(), podName, metav1.DeleteOptions{}) 227 Expect(err).NotTo(HaveOccurred()) 228 229 // job phase: Terminating -> Terminated 230 err = e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Terminating, vcbatch.Terminated}) 231 Expect(err).NotTo(HaveOccurred()) 232 }) 233 234 It("job level LifecyclePolicy, Event: PodEvicted; Action: AbortJob", func() { 235 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 236 defer e2eutil.CleanupTestContext(ctx) 237 238 By("create job") 239 job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{ 240 Name: "evicted-abort-job", 241 Policies: []vcbatch.LifecyclePolicy{ 242 { 243 Action: vcbus.AbortJobAction, 244 Event: vcbus.PodEvictedEvent, 245 }, 246 }, 247 Tasks: []e2eutil.TaskSpec{ 248 { 249 Name: "success", 250 Img: e2eutil.DefaultNginxImage, 251 Min: 2, 252 Rep: 2, 253 }, 254 { 255 Name: "delete", 256 Img: e2eutil.DefaultNginxImage, 257 Min: 2, 258 Rep: 2, 259 }, 260 }, 261 }) 262 263 // job phase: pending -> running 264 err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running}) 265 Expect(err).NotTo(HaveOccurred()) 266 267 By("delete one pod of job") 268 podName := jobctl.MakePodName(job.Name, "delete", 0) 269 err = ctx.Kubeclient.CoreV1().Pods(job.Namespace).Delete(context.TODO(), podName, metav1.DeleteOptions{}) 270 Expect(err).NotTo(HaveOccurred()) 271 272 // job phase: Aborting -> Aborted 273 err = e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Aborting, vcbatch.Aborted}) 274 Expect(err).NotTo(HaveOccurred()) 275 }) 276 277 It("job level LifecyclePolicy, Event: Any; Action: RestartJob", func() { 278 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 279 defer e2eutil.CleanupTestContext(ctx) 280 281 By("create job") 282 job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{ 283 Name: "any-restart-job", 284 Policies: []vcbatch.LifecyclePolicy{ 285 { 286 Action: vcbus.RestartJobAction, 287 Event: vcbus.AnyEvent, 288 }, 289 }, 290 Tasks: []e2eutil.TaskSpec{ 291 { 292 Name: "success", 293 Img: e2eutil.DefaultNginxImage, 294 Min: 2, 295 Rep: 2, 296 }, 297 { 298 Name: "delete", 299 Img: e2eutil.DefaultNginxImage, 300 Min: 2, 301 Rep: 2, 302 }, 303 }, 304 }) 305 306 // job phase: pending -> running 307 err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running}) 308 Expect(err).NotTo(HaveOccurred()) 309 310 By("delete one pod of job") 311 podName := jobctl.MakePodName(job.Name, "delete", 0) 312 err = ctx.Kubeclient.CoreV1().Pods(job.Namespace).Delete(context.TODO(), podName, metav1.DeleteOptions{}) 313 Expect(err).NotTo(HaveOccurred()) 314 315 // job phase: Restarting -> Running 316 err = e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Restarting, vcbatch.Pending, vcbatch.Running}) 317 Expect(err).NotTo(HaveOccurred()) 318 }) 319 320 It("Job error handling: Restart job when job is unschedulable", func() { 321 By("init test context") 322 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 323 defer e2eutil.CleanupTestContext(ctx) 324 rep := e2eutil.ClusterSize(ctx, e2eutil.OneCPU) 325 326 jobSpec := &e2eutil.JobSpec{ 327 Name: "job-restart-when-unschedulable", 328 Namespace: ctx.Namespace, 329 Policies: []vcbatch.LifecyclePolicy{ 330 { 331 Event: vcbus.JobUnknownEvent, 332 Action: vcbus.RestartJobAction, 333 }, 334 }, 335 Tasks: []e2eutil.TaskSpec{ 336 { 337 Name: "test", 338 Img: e2eutil.DefaultNginxImage, 339 Req: e2eutil.OneCPU, 340 Min: rep, 341 Rep: rep, 342 }, 343 }, 344 } 345 By("Create the Job") 346 job := e2eutil.CreateJob(ctx, jobSpec) 347 err := e2eutil.WaitJobReady(ctx, job) 348 Expect(err).NotTo(HaveOccurred()) 349 350 By("Taint all nodes") 351 taints := []v1.Taint{ 352 { 353 Key: "unschedulable-taint-key", 354 Value: "unschedulable-taint-val", 355 Effect: v1.TaintEffectNoSchedule, 356 }, 357 } 358 err = e2eutil.TaintAllNodes(ctx, taints) 359 Expect(err).NotTo(HaveOccurred()) 360 361 podName := jobctl.MakePodName(job.Name, "test", 0) 362 By("Kill one of the pod in order to trigger unschedulable status") 363 err = ctx.Kubeclient.CoreV1().Pods(job.Namespace).Delete(context.TODO(), podName, metav1.DeleteOptions{}) 364 Expect(err).NotTo(HaveOccurred()) 365 366 By("Job is restarting") 367 err = e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{ 368 vcbatch.Restarting, vcbatch.Pending}) 369 Expect(err).NotTo(HaveOccurred()) 370 371 By("Untaint all nodes") 372 err = e2eutil.RemoveTaintsFromAllNodes(ctx, taints) 373 Expect(err).NotTo(HaveOccurred()) 374 By("Job is running again") 375 err = e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Running}) 376 Expect(err).NotTo(HaveOccurred()) 377 }) 378 379 It("Job error handling: Abort job when job is unschedulable", func() { 380 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 381 defer e2eutil.CleanupTestContext(ctx) 382 rep := e2eutil.ClusterSize(ctx, e2eutil.OneCPU) 383 384 jobSpec := &e2eutil.JobSpec{ 385 Name: "job-abort-when-unschedulable", 386 Namespace: ctx.Namespace, 387 Policies: []vcbatch.LifecyclePolicy{ 388 { 389 Event: vcbus.JobUnknownEvent, 390 Action: vcbus.AbortJobAction, 391 }, 392 }, 393 Tasks: []e2eutil.TaskSpec{ 394 { 395 Name: "test", 396 Img: e2eutil.DefaultNginxImage, 397 Req: e2eutil.OneCPU, 398 Min: rep, 399 Rep: rep, 400 }, 401 }, 402 } 403 By("Create the Job") 404 job := e2eutil.CreateJob(ctx, jobSpec) 405 err := e2eutil.WaitJobReady(ctx, job) 406 Expect(err).NotTo(HaveOccurred()) 407 408 By("Taint all nodes") 409 taints := []v1.Taint{ 410 { 411 Key: "unschedulable-taint-key", 412 Value: "unschedulable-taint-val", 413 Effect: v1.TaintEffectNoSchedule, 414 }, 415 } 416 err = e2eutil.TaintAllNodes(ctx, taints) 417 Expect(err).NotTo(HaveOccurred()) 418 419 podName := jobctl.MakePodName(job.Name, "test", 0) 420 By("Kill one of the pod in order to trigger unschedulable status") 421 err = ctx.Kubeclient.CoreV1().Pods(job.Namespace).Delete(context.TODO(), podName, metav1.DeleteOptions{}) 422 Expect(err).NotTo(HaveOccurred()) 423 424 By("Job is aborted") 425 err = e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{ 426 vcbatch.Aborting, vcbatch.Aborted}) 427 Expect(err).NotTo(HaveOccurred()) 428 429 err = e2eutil.RemoveTaintsFromAllNodes(ctx, taints) 430 Expect(err).NotTo(HaveOccurred()) 431 }) 432 433 It("job level LifecyclePolicy, Event: TaskCompleted; Action: CompletedJob", func() { 434 By("init test context") 435 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 436 defer e2eutil.CleanupTestContext(ctx) 437 438 By("create job") 439 job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{ 440 Name: "any-complete-job", 441 Namespace: ctx.Namespace, 442 Policies: []vcbatch.LifecyclePolicy{ 443 { 444 Action: vcbus.CompleteJobAction, 445 Event: vcbus.TaskCompletedEvent, 446 }, 447 }, 448 Tasks: []e2eutil.TaskSpec{ 449 { 450 Name: "completed-task", 451 Img: e2eutil.DefaultBusyBoxImage, 452 Min: 2, 453 Rep: 2, 454 //Sleep 5 seconds ensure job in running state 455 Command: "sleep 5", 456 }, 457 { 458 Name: "terminating-task", 459 Img: e2eutil.DefaultNginxImage, 460 Min: 2, 461 Rep: 2, 462 }, 463 }, 464 }) 465 466 By("job scheduled, then task 'completed_task' finished and job finally complete") 467 // job phase: pending -> running -> completing -> completed 468 // TODO: skip running -> completing for the github CI pool performance 469 err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{ 470 vcbatch.Pending, vcbatch.Completed}) 471 Expect(err).NotTo(HaveOccurred()) 472 473 }) 474 475 It("job level LifecyclePolicy, Event: TaskFailed; Action: TerminateJob", func() { 476 By("init test context") 477 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 478 defer e2eutil.CleanupTestContext(ctx) 479 480 By("create job") 481 job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{ 482 Name: "task-failed-terminate-job", 483 Namespace: ctx.Namespace, 484 Policies: []vcbatch.LifecyclePolicy{ 485 { 486 Action: vcbus.TerminateJobAction, 487 Event: vcbus.TaskFailedEvent, 488 }, 489 }, 490 Tasks: []e2eutil.TaskSpec{ 491 { 492 Name: "success", 493 Img: e2eutil.DefaultBusyBoxImage, 494 Min: 2, 495 Rep: 2, 496 //Sleep 5 seconds ensure job in running state 497 Command: "sleep 5", 498 }, 499 { 500 Name: "failed", 501 Img: e2eutil.DefaultBusyBoxImage, 502 Min: 2, 503 Rep: 2, 504 Command: "sleep 10s && xxx", 505 RestartPolicy: v1.RestartPolicyNever, 506 MaxRetry: 3, 507 }, 508 }, 509 }) 510 511 // job phase: Pending -> Running 512 err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running}) 513 Expect(err).NotTo(HaveOccurred()) 514 515 By("update one pod of job") 516 podName := jobctl.MakePodName(job.Name, "failed", 0) 517 pod, err := ctx.Kubeclient.CoreV1().Pods(job.Namespace).Get(context.TODO(), podName, metav1.GetOptions{}) 518 Expect(err).NotTo(HaveOccurred()) 519 520 pod.Status.ContainerStatuses = []v1.ContainerStatus{{RestartCount: 4}} 521 _, err = ctx.Kubeclient.CoreV1().Pods(job.Namespace).UpdateStatus(context.TODO(), pod, metav1.UpdateOptions{}) 522 Expect(err).NotTo(HaveOccurred()) 523 524 // job phase: Terminating -> Terminated 525 err = e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Terminating, vcbatch.Terminated}) 526 Expect(err).NotTo(HaveOccurred()) 527 528 }) 529 530 It("job level LifecyclePolicy, error code: 3; Action: RestartJob", func() { 531 By("init test context") 532 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 533 defer e2eutil.CleanupTestContext(ctx) 534 535 By("create job") 536 var erroCode int32 = 3 537 job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{ 538 Name: "errorcode-restart-job", 539 Namespace: ctx.Namespace, 540 Policies: []vcbatch.LifecyclePolicy{ 541 { 542 Action: vcbus.RestartJobAction, 543 ExitCode: &erroCode, 544 }, 545 }, 546 Tasks: []e2eutil.TaskSpec{ 547 { 548 Name: "success", 549 Img: e2eutil.DefaultNginxImage, 550 Min: 1, 551 Rep: 1, 552 }, 553 { 554 Name: "fail", 555 Img: e2eutil.DefaultNginxImage, 556 Min: 1, 557 Rep: 1, 558 Command: "sleep 10s && exit 3", 559 RestartPolicy: v1.RestartPolicyNever, 560 }, 561 }, 562 }) 563 564 // job phase: pending -> running -> restarting 565 err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running, vcbatch.Restarting}) 566 Expect(err).NotTo(HaveOccurred()) 567 }) 568 569 It("job level LifecyclePolicy, Event[]: PodEvicted, PodFailed; Action: TerminateJob", func() { 570 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 571 defer e2eutil.CleanupTestContext(ctx) 572 573 By("create job") 574 job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{ 575 Name: "evicted-terminate-job", 576 Policies: []vcbatch.LifecyclePolicy{ 577 { 578 Action: vcbus.TerminateJobAction, 579 Events: []vcbus.Event{vcbus.PodEvictedEvent, 580 vcbus.PodFailedEvent, 581 vcbus.PodEvictedEvent, 582 }, 583 }, 584 }, 585 Tasks: []e2eutil.TaskSpec{ 586 { 587 Name: "success", 588 Img: e2eutil.DefaultNginxImage, 589 Min: 2, 590 Rep: 2, 591 }, 592 { 593 Name: "delete", 594 Img: e2eutil.DefaultNginxImage, 595 Min: 2, 596 Rep: 2, 597 }, 598 }, 599 }) 600 601 // job phase: pending -> running 602 err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running}) 603 Expect(err).NotTo(HaveOccurred()) 604 605 By("delete one pod of job") 606 podName := jobctl.MakePodName(job.Name, "delete", 0) 607 err = ctx.Kubeclient.CoreV1().Pods(job.Namespace).Delete(context.TODO(), podName, metav1.DeleteOptions{}) 608 Expect(err).NotTo(HaveOccurred()) 609 610 // job phase: Terminating -> Terminated 611 err = e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Terminating, vcbatch.Terminated}) 612 Expect(err).NotTo(HaveOccurred()) 613 }) 614 It("Task level LifecyclePolicy, Event: PodFailed; Action: RestartJob", func() { 615 By("init test context") 616 context := e2eutil.InitTestContext(e2eutil.Options{}) 617 defer e2eutil.CleanupTestContext(context) 618 619 By("create job") 620 job := e2eutil.CreateJob(context, &e2eutil.JobSpec{ 621 Name: "failed-restart-job", 622 Tasks: []e2eutil.TaskSpec{ 623 { 624 Name: "success", 625 Img: e2eutil.DefaultNginxImage, 626 Min: 2, 627 Rep: 2, 628 }, 629 { 630 Name: "fail", 631 Img: e2eutil.DefaultNginxImage, 632 Min: 2, 633 Rep: 2, 634 Command: "sleep 10s && xxx", 635 RestartPolicy: v1.RestartPolicyNever, 636 Policies: []vcbatch.LifecyclePolicy{ 637 { 638 Action: vcbus.RestartJobAction, 639 Event: vcbus.PodFailedEvent, 640 }, 641 }, 642 }, 643 }, 644 }) 645 646 // job phase: pending -> running -> restarting 647 err := e2eutil.WaitJobPhases(context, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running, vcbatch.Restarting}) 648 Expect(err).NotTo(HaveOccurred()) 649 }) 650 It("Task level LifecyclePolicy, Event: PodEvicted; Action: RestartJob", func() { 651 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 652 defer e2eutil.CleanupTestContext(ctx) 653 654 By("create job") 655 job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{ 656 Name: "evicted-restart-job", 657 658 Tasks: []e2eutil.TaskSpec{ 659 { 660 Name: "success", 661 Img: e2eutil.DefaultNginxImage, 662 Min: 2, 663 Rep: 2, 664 }, 665 { 666 Name: "delete", 667 Img: e2eutil.DefaultNginxImage, 668 Min: 2, 669 Rep: 2, 670 Policies: []vcbatch.LifecyclePolicy{ 671 { 672 Action: vcbus.RestartJobAction, 673 Event: vcbus.PodEvictedEvent, 674 }, 675 }, 676 }, 677 }, 678 }) 679 680 // job phase: pending -> running 681 err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running}) 682 Expect(err).NotTo(HaveOccurred()) 683 684 By("delete one pod of job") 685 podName := jobctl.MakePodName(job.Name, "delete", 0) 686 err = ctx.Kubeclient.CoreV1().Pods(job.Namespace).Delete(context.TODO(), podName, metav1.DeleteOptions{}) 687 Expect(err).NotTo(HaveOccurred()) 688 689 // job phase: Restarting -> Running 690 err = e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Restarting, vcbatch.Pending, vcbatch.Running}) 691 Expect(err).NotTo(HaveOccurred()) 692 }) 693 It("Task level LifecyclePolicy, Event: PodEvicted; Action: TerminateJob", func() { 694 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 695 defer e2eutil.CleanupTestContext(ctx) 696 697 By("create job") 698 job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{ 699 Name: "evicted-terminate-job", 700 Tasks: []e2eutil.TaskSpec{ 701 { 702 Name: "success", 703 Img: e2eutil.DefaultNginxImage, 704 Min: 2, 705 Rep: 2, 706 }, 707 { 708 Name: "delete", 709 Img: e2eutil.DefaultNginxImage, 710 Min: 2, 711 Rep: 2, 712 Policies: []vcbatch.LifecyclePolicy{ 713 { 714 Action: vcbus.TerminateJobAction, 715 Event: vcbus.PodEvictedEvent, 716 }, 717 }, 718 }, 719 }, 720 }) 721 722 // job phase: pending -> running 723 err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running}) 724 Expect(err).NotTo(HaveOccurred()) 725 726 By("delete one pod of job") 727 podName := jobctl.MakePodName(job.Name, "delete", 0) 728 err = ctx.Kubeclient.CoreV1().Pods(job.Namespace).Delete(context.TODO(), podName, metav1.DeleteOptions{}) 729 Expect(err).NotTo(HaveOccurred()) 730 731 // job phase: Terminating -> Terminated 732 err = e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Terminating, vcbatch.Terminated}) 733 Expect(err).NotTo(HaveOccurred()) 734 }) 735 It("Task level LifecyclePolicy, Event: TaskCompleted; Action: CompletedJob", func() { 736 ctx := e2eutil.InitTestContext(e2eutil.Options{}) 737 defer e2eutil.CleanupTestContext(ctx) 738 739 By("create job") 740 job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{ 741 Name: "any-complete-job", 742 Tasks: []e2eutil.TaskSpec{ 743 { 744 Name: "completed-task", 745 Img: e2eutil.DefaultBusyBoxImage, 746 Min: 2, 747 Rep: 2, 748 // Sleep 5 seconds ensure job in running state 749 Command: "sleep 5", 750 Policies: []vcbatch.LifecyclePolicy{ 751 { 752 Action: vcbus.CompleteJobAction, 753 Event: vcbus.TaskCompletedEvent, 754 }, 755 }, 756 }, 757 { 758 Name: "terminating-task", 759 Img: e2eutil.DefaultNginxImage, 760 Min: 2, 761 Rep: 2, 762 }, 763 }, 764 }) 765 766 By("job scheduled, then task 'completed_task' finished and job finally complete") 767 // job phase: pending -> running -> completing -> completed 768 err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{ 769 vcbatch.Pending, vcbatch.Completed}) 770 Expect(err).NotTo(HaveOccurred()) 771 772 }) 773 774 It("job level LifecyclePolicy, Event: PodFailed; Action: AbortJob and Task level lifecyclePolicy, Event : PodFailed; Action: RestartJob", func() { 775 By("init test context") 776 context := e2eutil.InitTestContext(e2eutil.Options{}) 777 defer e2eutil.CleanupTestContext(context) 778 779 By("create job") 780 job := e2eutil.CreateJob(context, &e2eutil.JobSpec{ 781 Name: "failed-restart-job", 782 Policies: []vcbatch.LifecyclePolicy{ 783 { 784 Action: vcbus.AbortJobAction, 785 Event: vcbus.PodFailedEvent, 786 }, 787 }, 788 Tasks: []e2eutil.TaskSpec{ 789 { 790 Name: "success", 791 Img: e2eutil.DefaultNginxImage, 792 Min: 2, 793 Rep: 2, 794 }, 795 { 796 Name: "fail", 797 Img: e2eutil.DefaultNginxImage, 798 Min: 2, 799 Rep: 2, 800 Command: "sleep 10s && xxx", 801 RestartPolicy: v1.RestartPolicyNever, 802 Policies: []vcbatch.LifecyclePolicy{ 803 { 804 Action: vcbus.RestartJobAction, 805 Event: vcbus.PodFailedEvent, 806 }, 807 }, 808 }, 809 }, 810 }) 811 812 // job phase: pending -> running -> Restarting 813 err := e2eutil.WaitJobPhases(context, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running, vcbatch.Restarting}) 814 Expect(err).NotTo(HaveOccurred()) 815 }) 816 817 It("Task Priority", func() { 818 By("init test context") 819 context := e2eutil.InitTestContext(e2eutil.Options{ 820 PriorityClasses: map[string]int32{ 821 e2eutil.MasterPriority: e2eutil.MasterPriorityValue, 822 e2eutil.WorkerPriority: e2eutil.WorkerPriorityValue, 823 }, 824 }) 825 defer e2eutil.CleanupTestContext(context) 826 827 rep := e2eutil.ClusterSize(context, e2eutil.OneCPU) 828 nodecount := e2eutil.ClusterNodeNumber(context) 829 By("create job") 830 job := e2eutil.CreateJob(context, &e2eutil.JobSpec{ 831 Name: "task-priority-job", 832 Min: int32(nodecount), 833 Tasks: []e2eutil.TaskSpec{ 834 { 835 Name: "higherprioritytask", 836 Img: e2eutil.DefaultNginxImage, 837 Rep: int32(nodecount), 838 Req: e2eutil.CPUResource(strconv.Itoa(int(rep)/nodecount - 1)), 839 Taskpriority: e2eutil.MasterPriority, 840 }, 841 { 842 Name: "lowerprioritytask", 843 Img: e2eutil.DefaultNginxImage, 844 Rep: int32(nodecount), 845 Req: e2eutil.CPUResource(strconv.Itoa(int(rep)/nodecount - 1)), 846 Taskpriority: e2eutil.MasterPriority, 847 }, 848 }, 849 }) 850 851 // job phase: pending -> running 852 err := e2eutil.WaitJobPhases(context, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running}) 853 Expect(err).NotTo(HaveOccurred()) 854 expteced := map[string]int{ 855 e2eutil.MasterPriority: nodecount, 856 e2eutil.WorkerPriority: 0, 857 } 858 859 err = e2eutil.WaitTasksReadyEx(context, job, expteced) 860 Expect(err).NotTo(HaveOccurred()) 861 }) 862 863 })