volcano.sh/volcano@v1.9.0/test/e2e/jobseq/job_error_handling.go (about)

     1  /*
     2  Copyright 2021 The Volcano Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package jobseq
    18  
    19  import (
    20  	"context"
    21  	"strconv"
    22  
    23  	. "github.com/onsi/ginkgo/v2"
    24  	. "github.com/onsi/gomega"
    25  
    26  	v1 "k8s.io/api/core/v1"
    27  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    28  
    29  	vcbatch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
    30  	vcbus "volcano.sh/apis/pkg/apis/bus/v1alpha1"
    31  
    32  	jobctl "volcano.sh/volcano/pkg/controllers/job"
    33  
    34  	e2eutil "volcano.sh/volcano/test/e2e/util"
    35  )
    36  
    37  var _ = Describe("Job Error Handling", func() {
    38  	It("job level LifecyclePolicy, Event: PodFailed; Action: RestartJob", func() {
    39  		By("init test context")
    40  		context := e2eutil.InitTestContext(e2eutil.Options{})
    41  		defer e2eutil.CleanupTestContext(context)
    42  
    43  		By("create job")
    44  		job := e2eutil.CreateJob(context, &e2eutil.JobSpec{
    45  			Name: "failed-restart-job",
    46  			Policies: []vcbatch.LifecyclePolicy{
    47  				{
    48  					Action: vcbus.RestartJobAction,
    49  					Event:  vcbus.PodFailedEvent,
    50  				},
    51  			},
    52  			Tasks: []e2eutil.TaskSpec{
    53  				{
    54  					Name: "success",
    55  					Img:  e2eutil.DefaultNginxImage,
    56  					Min:  2,
    57  					Rep:  2,
    58  				},
    59  				{
    60  					Name:          "fail",
    61  					Img:           e2eutil.DefaultNginxImage,
    62  					Min:           2,
    63  					Rep:           2,
    64  					Command:       "sleep 10s && xxx",
    65  					RestartPolicy: v1.RestartPolicyNever,
    66  				},
    67  			},
    68  		})
    69  
    70  		// job phase: pending -> running -> restarting
    71  		err := e2eutil.WaitJobPhases(context, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running, vcbatch.Restarting})
    72  		Expect(err).NotTo(HaveOccurred())
    73  	})
    74  
    75  	It("job level LifecyclePolicy, Event: PodFailed; Action: TerminateJob", func() {
    76  		By("init test context")
    77  		context := e2eutil.InitTestContext(e2eutil.Options{})
    78  		defer e2eutil.CleanupTestContext(context)
    79  
    80  		By("create job")
    81  		job := e2eutil.CreateJob(context, &e2eutil.JobSpec{
    82  			Name: "failed-terminate-job",
    83  			Policies: []vcbatch.LifecyclePolicy{
    84  				{
    85  					Action: vcbus.TerminateJobAction,
    86  					Event:  vcbus.PodFailedEvent,
    87  				},
    88  			},
    89  			Tasks: []e2eutil.TaskSpec{
    90  				{
    91  					Name: "success",
    92  					Img:  e2eutil.DefaultNginxImage,
    93  					Min:  2,
    94  					Rep:  2,
    95  				},
    96  				{
    97  					Name:          "fail",
    98  					Img:           e2eutil.DefaultNginxImage,
    99  					Min:           2,
   100  					Rep:           2,
   101  					Command:       "sleep 10s && xxx",
   102  					RestartPolicy: v1.RestartPolicyNever,
   103  				},
   104  			},
   105  		})
   106  
   107  		// job phase: pending -> running -> Terminating -> Terminated
   108  		err := e2eutil.WaitJobPhases(context, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running, vcbatch.Terminating, vcbatch.Terminated})
   109  		Expect(err).NotTo(HaveOccurred())
   110  	})
   111  
   112  	It("job level LifecyclePolicy, Event: PodFailed; Action: AbortJob", func() {
   113  		ctx := e2eutil.InitTestContext(e2eutil.Options{})
   114  		defer e2eutil.CleanupTestContext(ctx)
   115  
   116  		By("create job")
   117  		job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{
   118  			Name: "failed-abort-job",
   119  			Policies: []vcbatch.LifecyclePolicy{
   120  				{
   121  					Action: vcbus.AbortJobAction,
   122  					Event:  vcbus.PodFailedEvent,
   123  				},
   124  			},
   125  			Tasks: []e2eutil.TaskSpec{
   126  				{
   127  					Name: "success",
   128  					Img:  e2eutil.DefaultNginxImage,
   129  					Min:  2,
   130  					Rep:  2,
   131  				},
   132  				{
   133  					Name:          "fail",
   134  					Img:           e2eutil.DefaultNginxImage,
   135  					Min:           2,
   136  					Rep:           2,
   137  					Command:       "sleep 10s && xxx",
   138  					RestartPolicy: v1.RestartPolicyNever,
   139  				},
   140  			},
   141  		})
   142  
   143  		// job phase: pending -> running -> Aborting -> Aborted
   144  		err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running, vcbatch.Aborting, vcbatch.Aborted})
   145  		Expect(err).NotTo(HaveOccurred())
   146  	})
   147  
   148  	It("job level LifecyclePolicy, Event: PodEvicted; Action: RestartJob", func() {
   149  		ctx := e2eutil.InitTestContext(e2eutil.Options{})
   150  		defer e2eutil.CleanupTestContext(ctx)
   151  
   152  		By("create job")
   153  		job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{
   154  			Name: "evicted-restart-job",
   155  			Policies: []vcbatch.LifecyclePolicy{
   156  				{
   157  					Action: vcbus.RestartJobAction,
   158  					Event:  vcbus.PodEvictedEvent,
   159  				},
   160  			},
   161  			Tasks: []e2eutil.TaskSpec{
   162  				{
   163  					Name: "success",
   164  					Img:  e2eutil.DefaultNginxImage,
   165  					Min:  2,
   166  					Rep:  2,
   167  				},
   168  				{
   169  					Name: "delete",
   170  					Img:  e2eutil.DefaultNginxImage,
   171  					Min:  2,
   172  					Rep:  2,
   173  				},
   174  			},
   175  		})
   176  
   177  		// job phase: pending -> running
   178  		err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running})
   179  		Expect(err).NotTo(HaveOccurred())
   180  
   181  		By("delete one pod of job")
   182  		podName := jobctl.MakePodName(job.Name, "delete", 0)
   183  		err = ctx.Kubeclient.CoreV1().Pods(job.Namespace).Delete(context.TODO(), podName, metav1.DeleteOptions{})
   184  		Expect(err).NotTo(HaveOccurred())
   185  
   186  		// job phase: Restarting -> Running
   187  		err = e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Restarting, vcbatch.Pending, vcbatch.Running})
   188  		Expect(err).NotTo(HaveOccurred())
   189  	})
   190  
   191  	It("job level LifecyclePolicy, Event: PodEvicted; Action: TerminateJob", func() {
   192  		ctx := e2eutil.InitTestContext(e2eutil.Options{})
   193  		defer e2eutil.CleanupTestContext(ctx)
   194  
   195  		By("create job")
   196  		job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{
   197  			Name: "evicted-terminate-job",
   198  			Policies: []vcbatch.LifecyclePolicy{
   199  				{
   200  					Action: vcbus.TerminateJobAction,
   201  					Event:  vcbus.PodEvictedEvent,
   202  				},
   203  			},
   204  			Tasks: []e2eutil.TaskSpec{
   205  				{
   206  					Name: "success",
   207  					Img:  e2eutil.DefaultNginxImage,
   208  					Min:  2,
   209  					Rep:  2,
   210  				},
   211  				{
   212  					Name: "delete",
   213  					Img:  e2eutil.DefaultNginxImage,
   214  					Min:  2,
   215  					Rep:  2,
   216  				},
   217  			},
   218  		})
   219  
   220  		// job phase: pending -> running
   221  		err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running})
   222  		Expect(err).NotTo(HaveOccurred())
   223  
   224  		By("delete one pod of job")
   225  		podName := jobctl.MakePodName(job.Name, "delete", 0)
   226  		err = ctx.Kubeclient.CoreV1().Pods(job.Namespace).Delete(context.TODO(), podName, metav1.DeleteOptions{})
   227  		Expect(err).NotTo(HaveOccurred())
   228  
   229  		// job phase: Terminating -> Terminated
   230  		err = e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Terminating, vcbatch.Terminated})
   231  		Expect(err).NotTo(HaveOccurred())
   232  	})
   233  
   234  	It("job level LifecyclePolicy, Event: PodEvicted; Action: AbortJob", func() {
   235  		ctx := e2eutil.InitTestContext(e2eutil.Options{})
   236  		defer e2eutil.CleanupTestContext(ctx)
   237  
   238  		By("create job")
   239  		job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{
   240  			Name: "evicted-abort-job",
   241  			Policies: []vcbatch.LifecyclePolicy{
   242  				{
   243  					Action: vcbus.AbortJobAction,
   244  					Event:  vcbus.PodEvictedEvent,
   245  				},
   246  			},
   247  			Tasks: []e2eutil.TaskSpec{
   248  				{
   249  					Name: "success",
   250  					Img:  e2eutil.DefaultNginxImage,
   251  					Min:  2,
   252  					Rep:  2,
   253  				},
   254  				{
   255  					Name: "delete",
   256  					Img:  e2eutil.DefaultNginxImage,
   257  					Min:  2,
   258  					Rep:  2,
   259  				},
   260  			},
   261  		})
   262  
   263  		// job phase: pending -> running
   264  		err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running})
   265  		Expect(err).NotTo(HaveOccurred())
   266  
   267  		By("delete one pod of job")
   268  		podName := jobctl.MakePodName(job.Name, "delete", 0)
   269  		err = ctx.Kubeclient.CoreV1().Pods(job.Namespace).Delete(context.TODO(), podName, metav1.DeleteOptions{})
   270  		Expect(err).NotTo(HaveOccurred())
   271  
   272  		// job phase: Aborting -> Aborted
   273  		err = e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Aborting, vcbatch.Aborted})
   274  		Expect(err).NotTo(HaveOccurred())
   275  	})
   276  
   277  	It("job level LifecyclePolicy, Event: Any; Action: RestartJob", func() {
   278  		ctx := e2eutil.InitTestContext(e2eutil.Options{})
   279  		defer e2eutil.CleanupTestContext(ctx)
   280  
   281  		By("create job")
   282  		job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{
   283  			Name: "any-restart-job",
   284  			Policies: []vcbatch.LifecyclePolicy{
   285  				{
   286  					Action: vcbus.RestartJobAction,
   287  					Event:  vcbus.AnyEvent,
   288  				},
   289  			},
   290  			Tasks: []e2eutil.TaskSpec{
   291  				{
   292  					Name: "success",
   293  					Img:  e2eutil.DefaultNginxImage,
   294  					Min:  2,
   295  					Rep:  2,
   296  				},
   297  				{
   298  					Name: "delete",
   299  					Img:  e2eutil.DefaultNginxImage,
   300  					Min:  2,
   301  					Rep:  2,
   302  				},
   303  			},
   304  		})
   305  
   306  		// job phase: pending -> running
   307  		err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running})
   308  		Expect(err).NotTo(HaveOccurred())
   309  
   310  		By("delete one pod of job")
   311  		podName := jobctl.MakePodName(job.Name, "delete", 0)
   312  		err = ctx.Kubeclient.CoreV1().Pods(job.Namespace).Delete(context.TODO(), podName, metav1.DeleteOptions{})
   313  		Expect(err).NotTo(HaveOccurred())
   314  
   315  		// job phase: Restarting -> Running
   316  		err = e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Restarting, vcbatch.Pending, vcbatch.Running})
   317  		Expect(err).NotTo(HaveOccurred())
   318  	})
   319  
   320  	It("Job error handling: Restart job when job is unschedulable", func() {
   321  		By("init test context")
   322  		ctx := e2eutil.InitTestContext(e2eutil.Options{})
   323  		defer e2eutil.CleanupTestContext(ctx)
   324  		rep := e2eutil.ClusterSize(ctx, e2eutil.OneCPU)
   325  
   326  		jobSpec := &e2eutil.JobSpec{
   327  			Name:      "job-restart-when-unschedulable",
   328  			Namespace: ctx.Namespace,
   329  			Policies: []vcbatch.LifecyclePolicy{
   330  				{
   331  					Event:  vcbus.JobUnknownEvent,
   332  					Action: vcbus.RestartJobAction,
   333  				},
   334  			},
   335  			Tasks: []e2eutil.TaskSpec{
   336  				{
   337  					Name: "test",
   338  					Img:  e2eutil.DefaultNginxImage,
   339  					Req:  e2eutil.OneCPU,
   340  					Min:  rep,
   341  					Rep:  rep,
   342  				},
   343  			},
   344  		}
   345  		By("Create the Job")
   346  		job := e2eutil.CreateJob(ctx, jobSpec)
   347  		err := e2eutil.WaitJobReady(ctx, job)
   348  		Expect(err).NotTo(HaveOccurred())
   349  
   350  		By("Taint all nodes")
   351  		taints := []v1.Taint{
   352  			{
   353  				Key:    "unschedulable-taint-key",
   354  				Value:  "unschedulable-taint-val",
   355  				Effect: v1.TaintEffectNoSchedule,
   356  			},
   357  		}
   358  		err = e2eutil.TaintAllNodes(ctx, taints)
   359  		Expect(err).NotTo(HaveOccurred())
   360  
   361  		podName := jobctl.MakePodName(job.Name, "test", 0)
   362  		By("Kill one of the pod in order to trigger unschedulable status")
   363  		err = ctx.Kubeclient.CoreV1().Pods(job.Namespace).Delete(context.TODO(), podName, metav1.DeleteOptions{})
   364  		Expect(err).NotTo(HaveOccurred())
   365  
   366  		By("Job is restarting")
   367  		err = e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{
   368  			vcbatch.Restarting, vcbatch.Pending})
   369  		Expect(err).NotTo(HaveOccurred())
   370  
   371  		By("Untaint all nodes")
   372  		err = e2eutil.RemoveTaintsFromAllNodes(ctx, taints)
   373  		Expect(err).NotTo(HaveOccurred())
   374  		By("Job is running again")
   375  		err = e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Running})
   376  		Expect(err).NotTo(HaveOccurred())
   377  	})
   378  
   379  	It("Job error handling: Abort job when job is unschedulable", func() {
   380  		ctx := e2eutil.InitTestContext(e2eutil.Options{})
   381  		defer e2eutil.CleanupTestContext(ctx)
   382  		rep := e2eutil.ClusterSize(ctx, e2eutil.OneCPU)
   383  
   384  		jobSpec := &e2eutil.JobSpec{
   385  			Name:      "job-abort-when-unschedulable",
   386  			Namespace: ctx.Namespace,
   387  			Policies: []vcbatch.LifecyclePolicy{
   388  				{
   389  					Event:  vcbus.JobUnknownEvent,
   390  					Action: vcbus.AbortJobAction,
   391  				},
   392  			},
   393  			Tasks: []e2eutil.TaskSpec{
   394  				{
   395  					Name: "test",
   396  					Img:  e2eutil.DefaultNginxImage,
   397  					Req:  e2eutil.OneCPU,
   398  					Min:  rep,
   399  					Rep:  rep,
   400  				},
   401  			},
   402  		}
   403  		By("Create the Job")
   404  		job := e2eutil.CreateJob(ctx, jobSpec)
   405  		err := e2eutil.WaitJobReady(ctx, job)
   406  		Expect(err).NotTo(HaveOccurred())
   407  
   408  		By("Taint all nodes")
   409  		taints := []v1.Taint{
   410  			{
   411  				Key:    "unschedulable-taint-key",
   412  				Value:  "unschedulable-taint-val",
   413  				Effect: v1.TaintEffectNoSchedule,
   414  			},
   415  		}
   416  		err = e2eutil.TaintAllNodes(ctx, taints)
   417  		Expect(err).NotTo(HaveOccurred())
   418  
   419  		podName := jobctl.MakePodName(job.Name, "test", 0)
   420  		By("Kill one of the pod in order to trigger unschedulable status")
   421  		err = ctx.Kubeclient.CoreV1().Pods(job.Namespace).Delete(context.TODO(), podName, metav1.DeleteOptions{})
   422  		Expect(err).NotTo(HaveOccurred())
   423  
   424  		By("Job is aborted")
   425  		err = e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{
   426  			vcbatch.Aborting, vcbatch.Aborted})
   427  		Expect(err).NotTo(HaveOccurred())
   428  
   429  		err = e2eutil.RemoveTaintsFromAllNodes(ctx, taints)
   430  		Expect(err).NotTo(HaveOccurred())
   431  	})
   432  
   433  	It("job level LifecyclePolicy, Event: TaskCompleted; Action: CompletedJob", func() {
   434  		By("init test context")
   435  		ctx := e2eutil.InitTestContext(e2eutil.Options{})
   436  		defer e2eutil.CleanupTestContext(ctx)
   437  
   438  		By("create job")
   439  		job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{
   440  			Name:      "any-complete-job",
   441  			Namespace: ctx.Namespace,
   442  			Policies: []vcbatch.LifecyclePolicy{
   443  				{
   444  					Action: vcbus.CompleteJobAction,
   445  					Event:  vcbus.TaskCompletedEvent,
   446  				},
   447  			},
   448  			Tasks: []e2eutil.TaskSpec{
   449  				{
   450  					Name: "completed-task",
   451  					Img:  e2eutil.DefaultBusyBoxImage,
   452  					Min:  2,
   453  					Rep:  2,
   454  					//Sleep 5 seconds ensure job in running state
   455  					Command: "sleep 5",
   456  				},
   457  				{
   458  					Name: "terminating-task",
   459  					Img:  e2eutil.DefaultNginxImage,
   460  					Min:  2,
   461  					Rep:  2,
   462  				},
   463  			},
   464  		})
   465  
   466  		By("job scheduled, then task 'completed_task' finished and job finally complete")
   467  		// job phase: pending -> running -> completing -> completed
   468  		// TODO: skip running -> completing for the github CI pool performance
   469  		err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{
   470  			vcbatch.Pending, vcbatch.Completed})
   471  		Expect(err).NotTo(HaveOccurred())
   472  
   473  	})
   474  
   475  	It("job level LifecyclePolicy, Event: TaskFailed; Action: TerminateJob", func() {
   476  		By("init test context")
   477  		ctx := e2eutil.InitTestContext(e2eutil.Options{})
   478  		defer e2eutil.CleanupTestContext(ctx)
   479  
   480  		By("create job")
   481  		job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{
   482  			Name:      "task-failed-terminate-job",
   483  			Namespace: ctx.Namespace,
   484  			Policies: []vcbatch.LifecyclePolicy{
   485  				{
   486  					Action: vcbus.TerminateJobAction,
   487  					Event:  vcbus.TaskFailedEvent,
   488  				},
   489  			},
   490  			Tasks: []e2eutil.TaskSpec{
   491  				{
   492  					Name: "success",
   493  					Img:  e2eutil.DefaultBusyBoxImage,
   494  					Min:  2,
   495  					Rep:  2,
   496  					//Sleep 5 seconds ensure job in running state
   497  					Command: "sleep 5",
   498  				},
   499  				{
   500  					Name:          "failed",
   501  					Img:           e2eutil.DefaultBusyBoxImage,
   502  					Min:           2,
   503  					Rep:           2,
   504  					Command:       "sleep 10s && xxx",
   505  					RestartPolicy: v1.RestartPolicyNever,
   506  					MaxRetry:      3,
   507  				},
   508  			},
   509  		})
   510  
   511  		// job phase: Pending -> Running
   512  		err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running})
   513  		Expect(err).NotTo(HaveOccurred())
   514  
   515  		By("update one pod of job")
   516  		podName := jobctl.MakePodName(job.Name, "failed", 0)
   517  		pod, err := ctx.Kubeclient.CoreV1().Pods(job.Namespace).Get(context.TODO(), podName, metav1.GetOptions{})
   518  		Expect(err).NotTo(HaveOccurred())
   519  
   520  		pod.Status.ContainerStatuses = []v1.ContainerStatus{{RestartCount: 4}}
   521  		_, err = ctx.Kubeclient.CoreV1().Pods(job.Namespace).UpdateStatus(context.TODO(), pod, metav1.UpdateOptions{})
   522  		Expect(err).NotTo(HaveOccurred())
   523  
   524  		// job phase: Terminating -> Terminated
   525  		err = e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Terminating, vcbatch.Terminated})
   526  		Expect(err).NotTo(HaveOccurred())
   527  
   528  	})
   529  
   530  	It("job level LifecyclePolicy, error code: 3; Action: RestartJob", func() {
   531  		By("init test context")
   532  		ctx := e2eutil.InitTestContext(e2eutil.Options{})
   533  		defer e2eutil.CleanupTestContext(ctx)
   534  
   535  		By("create job")
   536  		var erroCode int32 = 3
   537  		job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{
   538  			Name:      "errorcode-restart-job",
   539  			Namespace: ctx.Namespace,
   540  			Policies: []vcbatch.LifecyclePolicy{
   541  				{
   542  					Action:   vcbus.RestartJobAction,
   543  					ExitCode: &erroCode,
   544  				},
   545  			},
   546  			Tasks: []e2eutil.TaskSpec{
   547  				{
   548  					Name: "success",
   549  					Img:  e2eutil.DefaultNginxImage,
   550  					Min:  1,
   551  					Rep:  1,
   552  				},
   553  				{
   554  					Name:          "fail",
   555  					Img:           e2eutil.DefaultNginxImage,
   556  					Min:           1,
   557  					Rep:           1,
   558  					Command:       "sleep 10s && exit 3",
   559  					RestartPolicy: v1.RestartPolicyNever,
   560  				},
   561  			},
   562  		})
   563  
   564  		// job phase: pending -> running -> restarting
   565  		err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running, vcbatch.Restarting})
   566  		Expect(err).NotTo(HaveOccurred())
   567  	})
   568  
   569  	It("job level LifecyclePolicy, Event[]: PodEvicted, PodFailed; Action: TerminateJob", func() {
   570  		ctx := e2eutil.InitTestContext(e2eutil.Options{})
   571  		defer e2eutil.CleanupTestContext(ctx)
   572  
   573  		By("create job")
   574  		job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{
   575  			Name: "evicted-terminate-job",
   576  			Policies: []vcbatch.LifecyclePolicy{
   577  				{
   578  					Action: vcbus.TerminateJobAction,
   579  					Events: []vcbus.Event{vcbus.PodEvictedEvent,
   580  						vcbus.PodFailedEvent,
   581  						vcbus.PodEvictedEvent,
   582  					},
   583  				},
   584  			},
   585  			Tasks: []e2eutil.TaskSpec{
   586  				{
   587  					Name: "success",
   588  					Img:  e2eutil.DefaultNginxImage,
   589  					Min:  2,
   590  					Rep:  2,
   591  				},
   592  				{
   593  					Name: "delete",
   594  					Img:  e2eutil.DefaultNginxImage,
   595  					Min:  2,
   596  					Rep:  2,
   597  				},
   598  			},
   599  		})
   600  
   601  		// job phase: pending -> running
   602  		err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running})
   603  		Expect(err).NotTo(HaveOccurred())
   604  
   605  		By("delete one pod of job")
   606  		podName := jobctl.MakePodName(job.Name, "delete", 0)
   607  		err = ctx.Kubeclient.CoreV1().Pods(job.Namespace).Delete(context.TODO(), podName, metav1.DeleteOptions{})
   608  		Expect(err).NotTo(HaveOccurred())
   609  
   610  		// job phase: Terminating -> Terminated
   611  		err = e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Terminating, vcbatch.Terminated})
   612  		Expect(err).NotTo(HaveOccurred())
   613  	})
   614  	It("Task level LifecyclePolicy, Event: PodFailed; Action: RestartJob", func() {
   615  		By("init test context")
   616  		context := e2eutil.InitTestContext(e2eutil.Options{})
   617  		defer e2eutil.CleanupTestContext(context)
   618  
   619  		By("create job")
   620  		job := e2eutil.CreateJob(context, &e2eutil.JobSpec{
   621  			Name: "failed-restart-job",
   622  			Tasks: []e2eutil.TaskSpec{
   623  				{
   624  					Name: "success",
   625  					Img:  e2eutil.DefaultNginxImage,
   626  					Min:  2,
   627  					Rep:  2,
   628  				},
   629  				{
   630  					Name:          "fail",
   631  					Img:           e2eutil.DefaultNginxImage,
   632  					Min:           2,
   633  					Rep:           2,
   634  					Command:       "sleep 10s && xxx",
   635  					RestartPolicy: v1.RestartPolicyNever,
   636  					Policies: []vcbatch.LifecyclePolicy{
   637  						{
   638  							Action: vcbus.RestartJobAction,
   639  							Event:  vcbus.PodFailedEvent,
   640  						},
   641  					},
   642  				},
   643  			},
   644  		})
   645  
   646  		// job phase: pending -> running -> restarting
   647  		err := e2eutil.WaitJobPhases(context, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running, vcbatch.Restarting})
   648  		Expect(err).NotTo(HaveOccurred())
   649  	})
   650  	It("Task level LifecyclePolicy, Event: PodEvicted; Action: RestartJob", func() {
   651  		ctx := e2eutil.InitTestContext(e2eutil.Options{})
   652  		defer e2eutil.CleanupTestContext(ctx)
   653  
   654  		By("create job")
   655  		job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{
   656  			Name: "evicted-restart-job",
   657  
   658  			Tasks: []e2eutil.TaskSpec{
   659  				{
   660  					Name: "success",
   661  					Img:  e2eutil.DefaultNginxImage,
   662  					Min:  2,
   663  					Rep:  2,
   664  				},
   665  				{
   666  					Name: "delete",
   667  					Img:  e2eutil.DefaultNginxImage,
   668  					Min:  2,
   669  					Rep:  2,
   670  					Policies: []vcbatch.LifecyclePolicy{
   671  						{
   672  							Action: vcbus.RestartJobAction,
   673  							Event:  vcbus.PodEvictedEvent,
   674  						},
   675  					},
   676  				},
   677  			},
   678  		})
   679  
   680  		// job phase: pending -> running
   681  		err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running})
   682  		Expect(err).NotTo(HaveOccurred())
   683  
   684  		By("delete one pod of job")
   685  		podName := jobctl.MakePodName(job.Name, "delete", 0)
   686  		err = ctx.Kubeclient.CoreV1().Pods(job.Namespace).Delete(context.TODO(), podName, metav1.DeleteOptions{})
   687  		Expect(err).NotTo(HaveOccurred())
   688  
   689  		// job phase: Restarting -> Running
   690  		err = e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Restarting, vcbatch.Pending, vcbatch.Running})
   691  		Expect(err).NotTo(HaveOccurred())
   692  	})
   693  	It("Task level LifecyclePolicy, Event: PodEvicted; Action: TerminateJob", func() {
   694  		ctx := e2eutil.InitTestContext(e2eutil.Options{})
   695  		defer e2eutil.CleanupTestContext(ctx)
   696  
   697  		By("create job")
   698  		job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{
   699  			Name: "evicted-terminate-job",
   700  			Tasks: []e2eutil.TaskSpec{
   701  				{
   702  					Name: "success",
   703  					Img:  e2eutil.DefaultNginxImage,
   704  					Min:  2,
   705  					Rep:  2,
   706  				},
   707  				{
   708  					Name: "delete",
   709  					Img:  e2eutil.DefaultNginxImage,
   710  					Min:  2,
   711  					Rep:  2,
   712  					Policies: []vcbatch.LifecyclePolicy{
   713  						{
   714  							Action: vcbus.TerminateJobAction,
   715  							Event:  vcbus.PodEvictedEvent,
   716  						},
   717  					},
   718  				},
   719  			},
   720  		})
   721  
   722  		// job phase: pending -> running
   723  		err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running})
   724  		Expect(err).NotTo(HaveOccurred())
   725  
   726  		By("delete one pod of job")
   727  		podName := jobctl.MakePodName(job.Name, "delete", 0)
   728  		err = ctx.Kubeclient.CoreV1().Pods(job.Namespace).Delete(context.TODO(), podName, metav1.DeleteOptions{})
   729  		Expect(err).NotTo(HaveOccurred())
   730  
   731  		// job phase: Terminating -> Terminated
   732  		err = e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{vcbatch.Terminating, vcbatch.Terminated})
   733  		Expect(err).NotTo(HaveOccurred())
   734  	})
   735  	It("Task level LifecyclePolicy, Event: TaskCompleted; Action: CompletedJob", func() {
   736  		ctx := e2eutil.InitTestContext(e2eutil.Options{})
   737  		defer e2eutil.CleanupTestContext(ctx)
   738  
   739  		By("create job")
   740  		job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{
   741  			Name: "any-complete-job",
   742  			Tasks: []e2eutil.TaskSpec{
   743  				{
   744  					Name: "completed-task",
   745  					Img:  e2eutil.DefaultBusyBoxImage,
   746  					Min:  2,
   747  					Rep:  2,
   748  					// Sleep 5 seconds ensure job in running state
   749  					Command: "sleep 5",
   750  					Policies: []vcbatch.LifecyclePolicy{
   751  						{
   752  							Action: vcbus.CompleteJobAction,
   753  							Event:  vcbus.TaskCompletedEvent,
   754  						},
   755  					},
   756  				},
   757  				{
   758  					Name: "terminating-task",
   759  					Img:  e2eutil.DefaultNginxImage,
   760  					Min:  2,
   761  					Rep:  2,
   762  				},
   763  			},
   764  		})
   765  
   766  		By("job scheduled, then task 'completed_task' finished and job finally complete")
   767  		// job phase: pending -> running -> completing -> completed
   768  		err := e2eutil.WaitJobPhases(ctx, job, []vcbatch.JobPhase{
   769  			vcbatch.Pending, vcbatch.Completed})
   770  		Expect(err).NotTo(HaveOccurred())
   771  
   772  	})
   773  
   774  	It("job level LifecyclePolicy, Event: PodFailed; Action: AbortJob and Task level lifecyclePolicy, Event : PodFailed; Action: RestartJob", func() {
   775  		By("init test context")
   776  		context := e2eutil.InitTestContext(e2eutil.Options{})
   777  		defer e2eutil.CleanupTestContext(context)
   778  
   779  		By("create job")
   780  		job := e2eutil.CreateJob(context, &e2eutil.JobSpec{
   781  			Name: "failed-restart-job",
   782  			Policies: []vcbatch.LifecyclePolicy{
   783  				{
   784  					Action: vcbus.AbortJobAction,
   785  					Event:  vcbus.PodFailedEvent,
   786  				},
   787  			},
   788  			Tasks: []e2eutil.TaskSpec{
   789  				{
   790  					Name: "success",
   791  					Img:  e2eutil.DefaultNginxImage,
   792  					Min:  2,
   793  					Rep:  2,
   794  				},
   795  				{
   796  					Name:          "fail",
   797  					Img:           e2eutil.DefaultNginxImage,
   798  					Min:           2,
   799  					Rep:           2,
   800  					Command:       "sleep 10s && xxx",
   801  					RestartPolicy: v1.RestartPolicyNever,
   802  					Policies: []vcbatch.LifecyclePolicy{
   803  						{
   804  							Action: vcbus.RestartJobAction,
   805  							Event:  vcbus.PodFailedEvent,
   806  						},
   807  					},
   808  				},
   809  			},
   810  		})
   811  
   812  		// job phase: pending -> running -> Restarting
   813  		err := e2eutil.WaitJobPhases(context, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running, vcbatch.Restarting})
   814  		Expect(err).NotTo(HaveOccurred())
   815  	})
   816  
   817  	It("Task Priority", func() {
   818  		By("init test context")
   819  		context := e2eutil.InitTestContext(e2eutil.Options{
   820  			PriorityClasses: map[string]int32{
   821  				e2eutil.MasterPriority: e2eutil.MasterPriorityValue,
   822  				e2eutil.WorkerPriority: e2eutil.WorkerPriorityValue,
   823  			},
   824  		})
   825  		defer e2eutil.CleanupTestContext(context)
   826  
   827  		rep := e2eutil.ClusterSize(context, e2eutil.OneCPU)
   828  		nodecount := e2eutil.ClusterNodeNumber(context)
   829  		By("create job")
   830  		job := e2eutil.CreateJob(context, &e2eutil.JobSpec{
   831  			Name: "task-priority-job",
   832  			Min:  int32(nodecount),
   833  			Tasks: []e2eutil.TaskSpec{
   834  				{
   835  					Name:         "higherprioritytask",
   836  					Img:          e2eutil.DefaultNginxImage,
   837  					Rep:          int32(nodecount),
   838  					Req:          e2eutil.CPUResource(strconv.Itoa(int(rep)/nodecount - 1)),
   839  					Taskpriority: e2eutil.MasterPriority,
   840  				},
   841  				{
   842  					Name:         "lowerprioritytask",
   843  					Img:          e2eutil.DefaultNginxImage,
   844  					Rep:          int32(nodecount),
   845  					Req:          e2eutil.CPUResource(strconv.Itoa(int(rep)/nodecount - 1)),
   846  					Taskpriority: e2eutil.MasterPriority,
   847  				},
   848  			},
   849  		})
   850  
   851  		// job phase: pending -> running
   852  		err := e2eutil.WaitJobPhases(context, job, []vcbatch.JobPhase{vcbatch.Pending, vcbatch.Running})
   853  		Expect(err).NotTo(HaveOccurred())
   854  		expteced := map[string]int{
   855  			e2eutil.MasterPriority: nodecount,
   856  			e2eutil.WorkerPriority: 0,
   857  		}
   858  
   859  		err = e2eutil.WaitTasksReadyEx(context, job, expteced)
   860  		Expect(err).NotTo(HaveOccurred())
   861  	})
   862  
   863  })