volcano.sh/volcano@v1.9.0/test/e2e/util/job.go (about)

     1  /*
     2  Copyright 2021 The Volcano Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package util
    18  
    19  import (
    20  	"context"
    21  	"encoding/json"
    22  	"fmt"
    23  	"strconv"
    24  	"strings"
    25  	"time"
    26  
    27  	. "github.com/onsi/gomega"
    28  	batchv1 "k8s.io/api/batch/v1"
    29  	v1 "k8s.io/api/core/v1"
    30  	"k8s.io/apimachinery/pkg/api/errors"
    31  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    32  	"k8s.io/apimachinery/pkg/types"
    33  	"k8s.io/apimachinery/pkg/util/wait"
    34  
    35  	batchv1alpha1 "volcano.sh/apis/pkg/apis/batch/v1alpha1"
    36  	schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
    37  )
    38  
    39  type TaskSpec struct {
    40  	Name                  string
    41  	Min, Rep              int32
    42  	Img                   string
    43  	Command               string
    44  	WorkingDir            string
    45  	Hostport              int32
    46  	Req                   v1.ResourceList
    47  	Limit                 v1.ResourceList
    48  	Affinity              *v1.Affinity
    49  	Labels                map[string]string
    50  	Policies              []batchv1alpha1.LifecyclePolicy
    51  	RestartPolicy         v1.RestartPolicy
    52  	Tolerations           []v1.Toleration
    53  	DefaultGracefulPeriod *int64
    54  	Taskpriority          string
    55  	MaxRetry              int32
    56  }
    57  
    58  type JobSpec struct {
    59  	Name      string
    60  	Namespace string
    61  	Queue     string
    62  	Tasks     []TaskSpec
    63  	Policies  []batchv1alpha1.LifecyclePolicy
    64  	Min       int32
    65  	Pri       string
    66  	Plugins   map[string][]string
    67  	Volumes   []batchv1alpha1.VolumeSpec
    68  	NodeName  string
    69  	// ttl seconds after job finished
    70  	TTL        *int32
    71  	MinSuccess *int32
    72  	// job max retry
    73  	MaxRetry int32
    74  }
    75  
    76  func Namespace(context *TestContext, job *JobSpec) string {
    77  	if len(job.Namespace) != 0 {
    78  		return job.Namespace
    79  	}
    80  
    81  	return context.Namespace
    82  }
    83  
    84  func CreateJob(context *TestContext, jobSpec *JobSpec) *batchv1alpha1.Job {
    85  	job, err := CreateJobInner(context, jobSpec)
    86  	Expect(err).NotTo(HaveOccurred(), "failed to create job %s in namespace %s", jobSpec.Name, jobSpec.Namespace)
    87  	return job
    88  }
    89  
    90  func CreateJobWithPodGroup(ctx *TestContext, jobSpec *JobSpec,
    91  	pgName string, annotations map[string]string) *batchv1alpha1.Job {
    92  	ns := Namespace(ctx, jobSpec)
    93  
    94  	job := &batchv1alpha1.Job{
    95  		ObjectMeta: metav1.ObjectMeta{
    96  			Name:        jobSpec.Name,
    97  			Namespace:   ns,
    98  			Annotations: annotations,
    99  		},
   100  		Spec: batchv1alpha1.JobSpec{
   101  			Policies:                jobSpec.Policies,
   102  			Queue:                   jobSpec.Queue,
   103  			Plugins:                 jobSpec.Plugins,
   104  			TTLSecondsAfterFinished: jobSpec.TTL,
   105  		},
   106  	}
   107  
   108  	var min int32
   109  	for i, task := range jobSpec.Tasks {
   110  		name := task.Name
   111  		if len(name) == 0 {
   112  			name = fmt.Sprintf("%s-task-%d", jobSpec.Name, i)
   113  		}
   114  
   115  		restartPolicy := v1.RestartPolicyOnFailure
   116  		if len(task.RestartPolicy) > 0 {
   117  			restartPolicy = task.RestartPolicy
   118  		}
   119  
   120  		ts := batchv1alpha1.TaskSpec{
   121  			Name:     name,
   122  			Replicas: task.Rep,
   123  			Policies: task.Policies,
   124  			Template: v1.PodTemplateSpec{
   125  				ObjectMeta: metav1.ObjectMeta{
   126  					Name:   name,
   127  					Labels: task.Labels,
   128  				},
   129  				Spec: v1.PodSpec{
   130  					SchedulerName:     "volcano",
   131  					RestartPolicy:     restartPolicy,
   132  					Containers:        CreateContainers(task.Img, task.Command, task.WorkingDir, task.Req, task.Limit, task.Hostport),
   133  					Affinity:          task.Affinity,
   134  					Tolerations:       task.Tolerations,
   135  					PriorityClassName: task.Taskpriority,
   136  				},
   137  			},
   138  		}
   139  
   140  		if pgName != "" {
   141  			ts.Template.ObjectMeta.Annotations = map[string]string{schedulingv1beta1.KubeGroupNameAnnotationKey: pgName}
   142  		}
   143  
   144  		if task.DefaultGracefulPeriod != nil {
   145  			ts.Template.Spec.TerminationGracePeriodSeconds = task.DefaultGracefulPeriod
   146  		} else {
   147  			// NOTE: TerminationGracePeriodSeconds is set to 3 in default in case of timeout when restarting tasks in test.
   148  			var defaultPeriod int64 = 3
   149  			ts.Template.Spec.TerminationGracePeriodSeconds = &defaultPeriod
   150  		}
   151  
   152  		job.Spec.Tasks = append(job.Spec.Tasks, ts)
   153  
   154  		min += task.Min
   155  	}
   156  
   157  	if jobSpec.Min > 0 {
   158  		job.Spec.MinAvailable = jobSpec.Min
   159  	} else {
   160  		job.Spec.MinAvailable = min
   161  	}
   162  
   163  	if jobSpec.Pri != "" {
   164  		job.Spec.PriorityClassName = jobSpec.Pri
   165  	}
   166  
   167  	job.Spec.Volumes = jobSpec.Volumes
   168  
   169  	jobCreated, err := ctx.Vcclient.BatchV1alpha1().Jobs(job.Namespace).Create(context.TODO(), job, metav1.CreateOptions{})
   170  	Expect(err).NotTo(HaveOccurred(), "failed to create job %s in namespace %s", job.Name, job.Namespace)
   171  
   172  	return jobCreated
   173  }
   174  
   175  func UpdateJob(ctx *TestContext, job *batchv1alpha1.Job) error {
   176  	spec, err := json.Marshal(job.Spec)
   177  	if err != nil {
   178  		return err
   179  	}
   180  	patch := fmt.Sprintf(`[{"op": "replace", "path": "/spec", "value":%s}]`, spec)
   181  	patchBytes := []byte(patch)
   182  	_, err = ctx.Vcclient.BatchV1alpha1().Jobs(job.Namespace).Patch(context.TODO(),
   183  		job.Name, types.JSONPatchType, patchBytes, metav1.PatchOptions{})
   184  	return err
   185  }
   186  
   187  func CreateJobInner(ctx *TestContext, jobSpec *JobSpec) (*batchv1alpha1.Job, error) {
   188  	ns := Namespace(ctx, jobSpec)
   189  
   190  	job := &batchv1alpha1.Job{
   191  		ObjectMeta: metav1.ObjectMeta{
   192  			Name:      jobSpec.Name,
   193  			Namespace: ns,
   194  		},
   195  		Spec: batchv1alpha1.JobSpec{
   196  			SchedulerName:           "volcano",
   197  			Policies:                jobSpec.Policies,
   198  			Queue:                   jobSpec.Queue,
   199  			Plugins:                 jobSpec.Plugins,
   200  			TTLSecondsAfterFinished: jobSpec.TTL,
   201  			MinSuccess:              jobSpec.MinSuccess,
   202  			MaxRetry:                jobSpec.MaxRetry,
   203  		},
   204  	}
   205  
   206  	var min int32
   207  	for i, task := range jobSpec.Tasks {
   208  		name := task.Name
   209  		if len(name) == 0 {
   210  			name = fmt.Sprintf("%s-task-%d", jobSpec.Name, i)
   211  		}
   212  
   213  		restartPolicy := v1.RestartPolicyOnFailure
   214  		if len(task.RestartPolicy) > 0 {
   215  			restartPolicy = task.RestartPolicy
   216  		}
   217  
   218  		maxRetry := task.MaxRetry
   219  		if maxRetry == 0 {
   220  			maxRetry = -1
   221  		}
   222  
   223  		ts := batchv1alpha1.TaskSpec{
   224  			Name:     name,
   225  			Replicas: task.Rep,
   226  			Policies: task.Policies,
   227  			MaxRetry: maxRetry,
   228  			Template: v1.PodTemplateSpec{
   229  				ObjectMeta: metav1.ObjectMeta{
   230  					Name:   name,
   231  					Labels: task.Labels,
   232  				},
   233  				Spec: v1.PodSpec{
   234  					RestartPolicy:     restartPolicy,
   235  					Containers:        CreateContainers(task.Img, task.Command, task.WorkingDir, task.Req, task.Limit, task.Hostport),
   236  					Affinity:          task.Affinity,
   237  					Tolerations:       task.Tolerations,
   238  					PriorityClassName: task.Taskpriority,
   239  				},
   240  			},
   241  		}
   242  		if jobSpec.NodeName != "" {
   243  			ts.Template.Spec.NodeName = jobSpec.NodeName
   244  		}
   245  
   246  		if task.DefaultGracefulPeriod != nil {
   247  			ts.Template.Spec.TerminationGracePeriodSeconds = task.DefaultGracefulPeriod
   248  		} else {
   249  			// NOTE: TerminationGracePeriodSeconds is set to 3 in default in case of timeout when restarting tasks in test.
   250  			var defaultPeriod int64 = 3
   251  			ts.Template.Spec.TerminationGracePeriodSeconds = &defaultPeriod
   252  		}
   253  
   254  		job.Spec.Tasks = append(job.Spec.Tasks, ts)
   255  
   256  		min += task.Min
   257  	}
   258  
   259  	if jobSpec.Min > 0 {
   260  		job.Spec.MinAvailable = jobSpec.Min
   261  	} else {
   262  		job.Spec.MinAvailable = min
   263  	}
   264  
   265  	if jobSpec.Pri != "" {
   266  		job.Spec.PriorityClassName = jobSpec.Pri
   267  	}
   268  
   269  	job.Spec.Volumes = jobSpec.Volumes
   270  
   271  	return ctx.Vcclient.BatchV1alpha1().Jobs(job.Namespace).Create(context.TODO(), job, metav1.CreateOptions{})
   272  }
   273  
   274  func WaitTaskPhase(ctx *TestContext, job *batchv1alpha1.Job, phase []v1.PodPhase, taskNum int) error {
   275  	var additionalError error
   276  	err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) {
   277  		pods, err := ctx.Kubeclient.CoreV1().Pods(job.Namespace).List(context.TODO(), metav1.ListOptions{})
   278  		Expect(err).NotTo(HaveOccurred(), "failed to list pods in namespace %s", job.Namespace)
   279  
   280  		readyTaskNum := 0
   281  		for _, pod := range pods.Items {
   282  			if !metav1.IsControlledBy(&pod, job) {
   283  				continue
   284  			}
   285  
   286  			for _, p := range phase {
   287  				if pod.Status.Phase == p {
   288  					readyTaskNum++
   289  					break
   290  				}
   291  			}
   292  		}
   293  
   294  		ready := taskNum <= readyTaskNum
   295  		if !ready {
   296  			additionalError = fmt.Errorf("expected job '%s' to have %d ready pods, actual got %d", job.Name,
   297  				taskNum,
   298  				readyTaskNum)
   299  		}
   300  		return ready, nil
   301  	})
   302  	if err != nil && strings.Contains(err.Error(), TimeOutMessage) {
   303  		return fmt.Errorf("[Wait time out]: %s", additionalError)
   304  	}
   305  	return err
   306  }
   307  
   308  func taskPhaseEx(ctx *TestContext, job *batchv1alpha1.Job, phase []v1.PodPhase, taskNum map[string]int) error {
   309  	err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) {
   310  
   311  		pods, err := ctx.Kubeclient.CoreV1().Pods(job.Namespace).List(context.TODO(), metav1.ListOptions{})
   312  		Expect(err).NotTo(HaveOccurred(), "failed to list pods in namespace %s", job.Namespace)
   313  
   314  		readyTaskNum := map[string]int{}
   315  		for _, pod := range pods.Items {
   316  			if !metav1.IsControlledBy(&pod, job) {
   317  				continue
   318  			}
   319  
   320  			for _, p := range phase {
   321  				if pod.Status.Phase == p {
   322  					readyTaskNum[pod.Spec.PriorityClassName]++
   323  					break
   324  				}
   325  			}
   326  		}
   327  
   328  		for k, v := range taskNum {
   329  			if v > readyTaskNum[k] {
   330  				return false, nil
   331  			}
   332  		}
   333  
   334  		return true, nil
   335  	})
   336  	if err != nil && strings.Contains(err.Error(), TimeOutMessage) {
   337  		return fmt.Errorf("[Wait time out]")
   338  	}
   339  	return err
   340  
   341  }
   342  
   343  func jobUnschedulable(ctx *TestContext, job *batchv1alpha1.Job, now time.Time) error {
   344  	var additionalError error
   345  	// TODO(k82cn): check Job's Condition instead of PodGroup's event.
   346  	err := wait.Poll(10*time.Second, FiveMinute, func() (bool, error) {
   347  		pgName := job.Name + "-" + string(job.UID)
   348  		pg, err := ctx.Vcclient.SchedulingV1beta1().PodGroups(job.Namespace).Get(context.TODO(), pgName, metav1.GetOptions{})
   349  		if err != nil {
   350  			additionalError = fmt.Errorf("expected to have job's podgroup %s created, actual got error %s",
   351  				job.Name, err.Error())
   352  			return false, nil
   353  		}
   354  
   355  		events, err := ctx.Kubeclient.CoreV1().Events(pg.Namespace).List(context.TODO(), metav1.ListOptions{})
   356  		if err != nil {
   357  			additionalError = fmt.Errorf("expected to have events for job %s, actual got error %s",
   358  				job.Name, err.Error())
   359  			return false, nil
   360  		}
   361  		for _, event := range events.Items {
   362  			target := event.InvolvedObject
   363  			if strings.HasPrefix(target.Name, pg.Name) && target.Namespace == pg.Namespace {
   364  				if event.Reason == string("Unschedulable") || event.Reason == string("FailedScheduling") && event.LastTimestamp.After(now) {
   365  					return true, nil
   366  				}
   367  			}
   368  		}
   369  		additionalError = fmt.Errorf(
   370  			"expected to have 'Unschedulable' events for podgroup %s, actual got nothing", job.Name)
   371  		return false, nil
   372  	})
   373  	if err != nil && strings.Contains(err.Error(), TimeOutMessage) {
   374  		return fmt.Errorf("[Wait time out]: %s", additionalError)
   375  	}
   376  	return err
   377  }
   378  
   379  func JobEvicted(ctx *TestContext, job *batchv1alpha1.Job, time time.Time) wait.ConditionFunc {
   380  	// TODO(k82cn): check Job's conditions instead of PodGroup's event.
   381  	return func() (bool, error) {
   382  		pgName := job.Name + "-" + string(job.UID)
   383  		pg, err := ctx.Vcclient.SchedulingV1beta1().PodGroups(job.Namespace).Get(context.TODO(), pgName, metav1.GetOptions{})
   384  		Expect(err).NotTo(HaveOccurred(), "failed to get pod group of job %s in namespace %s", job.Name, job.Namespace)
   385  
   386  		events, err := ctx.Kubeclient.CoreV1().Events(pg.Namespace).List(context.TODO(), metav1.ListOptions{})
   387  		Expect(err).NotTo(HaveOccurred(), "failed to list events in namespace %s", pg.Namespace)
   388  
   389  		for _, event := range events.Items {
   390  			target := event.InvolvedObject
   391  			if target.Name == pg.Name && target.Namespace == pg.Namespace {
   392  				if event.Reason == string("Evict") && event.LastTimestamp.After(time) {
   393  					return true, nil
   394  				}
   395  			}
   396  		}
   397  		return false, nil
   398  	}
   399  }
   400  
   401  func WaitJobPhases(ctx *TestContext, job *batchv1alpha1.Job, phases []batchv1alpha1.JobPhase) error {
   402  	w, err := ctx.Vcclient.BatchV1alpha1().Jobs(job.Namespace).Watch(context.TODO(), metav1.ListOptions{})
   403  	if err != nil {
   404  		return err
   405  	}
   406  	defer w.Stop()
   407  
   408  	var additionalError error
   409  	total := int32(0)
   410  	for _, task := range job.Spec.Tasks {
   411  		total += task.Replicas
   412  	}
   413  
   414  	ch := w.ResultChan()
   415  	index := 0
   416  	timeout := time.After(TenMinute)
   417  
   418  	for index < len(phases) {
   419  		select {
   420  		case event, open := <-ch:
   421  			if !open {
   422  				return fmt.Errorf("watch channel should be always open")
   423  			}
   424  
   425  			newJob := event.Object.(*batchv1alpha1.Job)
   426  			phase := phases[index]
   427  			if newJob.Name != job.Name || newJob.Namespace != job.Namespace {
   428  				continue
   429  			}
   430  
   431  			if newJob.Status.State.Phase != phase {
   432  				additionalError = fmt.Errorf(
   433  					"expected job '%s' to be in status %s, actual get %s",
   434  					job.Name, phase, newJob.Status.State.Phase)
   435  				continue
   436  			}
   437  
   438  			var flag bool
   439  			switch phase {
   440  			case batchv1alpha1.Pending:
   441  				flag = (newJob.Status.Pending+newJob.Status.Succeeded+
   442  					newJob.Status.Failed+newJob.Status.Running) == 0 ||
   443  					(total-newJob.Status.Terminating >= newJob.Status.MinAvailable)
   444  			case batchv1alpha1.Terminating, batchv1alpha1.Aborting, batchv1alpha1.Restarting, batchv1alpha1.Completing:
   445  				flag = newJob.Status.Terminating > 0
   446  			case batchv1alpha1.Terminated, batchv1alpha1.Aborted, batchv1alpha1.Completed:
   447  				flag = newJob.Status.Pending == 0 &&
   448  					newJob.Status.Running == 0 &&
   449  					newJob.Status.Terminating == 0
   450  			case batchv1alpha1.Running:
   451  				flag = newJob.Status.Running >= newJob.Spec.MinAvailable
   452  			default:
   453  				return fmt.Errorf("unknown phase %s", phase)
   454  			}
   455  
   456  			if !flag {
   457  				additionalError = fmt.Errorf(
   458  					"expected job '%s' to be in status %s, actual detail status %s",
   459  					job.Name, phase, getJobStatusDetail(newJob))
   460  				continue
   461  			}
   462  
   463  			index++
   464  			timeout = time.After(TenMinute)
   465  
   466  		case <-timeout:
   467  			return fmt.Errorf("[Wait time out]: %s", additionalError)
   468  		}
   469  	}
   470  
   471  	return nil
   472  }
   473  
   474  func WaitJobStates(ctx *TestContext, job *batchv1alpha1.Job, phases []batchv1alpha1.JobPhase, waitTime time.Duration) error {
   475  	for _, phase := range phases {
   476  		err := waitJobPhaseExpect(ctx, job, phase, waitTime)
   477  		if err != nil {
   478  			return err
   479  		}
   480  	}
   481  	return nil
   482  }
   483  
   484  func getJobStatusDetail(job *batchv1alpha1.Job) string {
   485  	return fmt.Sprintf("\nName: %s\n Phase: %s\nPending: %d"+
   486  		"\nRunning: %d\nSucceeded: %d\nTerminating: %d\nFailed: %d\n ",
   487  		job.Name, job.Status.State.Phase, job.Status.Pending, job.Status.Running,
   488  		job.Status.Succeeded, job.Status.Terminating, job.Status.Failed)
   489  }
   490  
   491  // WaitJobReady waits for the Job to be ready
   492  func WaitJobReady(ctx *TestContext, job *batchv1alpha1.Job) error {
   493  	return WaitTasksReady(ctx, job, int(job.Spec.MinAvailable))
   494  }
   495  
   496  // WaitJobPending waits for the Job to be pending
   497  func WaitJobPending(ctx *TestContext, job *batchv1alpha1.Job) error {
   498  	return WaitTaskPhase(ctx, job, []v1.PodPhase{v1.PodPending}, int(job.Spec.MinAvailable))
   499  }
   500  
   501  // WaitTasksReady waits for the tasks of a Job to be ready
   502  func WaitTasksReady(ctx *TestContext, job *batchv1alpha1.Job, taskNum int) error {
   503  	return WaitTaskPhase(ctx, job, []v1.PodPhase{v1.PodRunning, v1.PodSucceeded}, taskNum)
   504  }
   505  
   506  func WaitTasksReadyEx(ctx *TestContext, job *batchv1alpha1.Job, taskNum map[string]int) error {
   507  	return taskPhaseEx(ctx, job, []v1.PodPhase{v1.PodRunning, v1.PodSucceeded}, taskNum)
   508  }
   509  
   510  // WaitTasksPending waits for the tasks of a Job to be pending
   511  func WaitTasksPending(ctx *TestContext, job *batchv1alpha1.Job, taskNum int) error {
   512  	return WaitTaskPhase(ctx, job, []v1.PodPhase{v1.PodPending}, taskNum)
   513  }
   514  
   515  // WaitJobStateReady waits for the state of a Job to be ready
   516  func WaitJobStateReady(ctx *TestContext, job *batchv1alpha1.Job) error {
   517  	return waitJobPhaseExpect(ctx, job, batchv1alpha1.Running, FiveMinute)
   518  }
   519  
   520  // WaitJobStatePending waits for the state of a Job to be pending
   521  func WaitJobStatePending(ctx *TestContext, job *batchv1alpha1.Job) error {
   522  	return waitJobPhaseExpect(ctx, job, batchv1alpha1.Pending, FiveMinute)
   523  }
   524  
   525  // WaitJobStateAborted waits for the state of a Job to be aborted
   526  func WaitJobStateAborted(ctx *TestContext, job *batchv1alpha1.Job) error {
   527  	return waitJobPhaseExpect(ctx, job, batchv1alpha1.Aborted, FiveMinute)
   528  }
   529  
   530  // WaitPodPhaseRunningMoreThanNum waits for the number of running pods to be more than specified number
   531  func WaitPodPhaseRunningMoreThanNum(ctx *TestContext, namespace string, num int) error {
   532  	var additionalError error
   533  	err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) {
   534  		clusterPods, err := ctx.Kubeclient.CoreV1().Pods(namespace).List(context.TODO(), metav1.ListOptions{})
   535  		Expect(err).NotTo(HaveOccurred(), "failed to list pods in namespace %s", namespace)
   536  
   537  		runningPodNum := 0
   538  		for _, pod := range clusterPods.Items {
   539  			if pod.Status.Phase == "Running" {
   540  				runningPodNum++
   541  			}
   542  		}
   543  
   544  		expected := runningPodNum >= num
   545  		if !expected {
   546  			additionalError = fmt.Errorf("expected running pod is '%s', actual got %s", strconv.Itoa(runningPodNum), strconv.Itoa(num))
   547  		}
   548  		return expected, nil
   549  	})
   550  	if err != nil && strings.Contains(err.Error(), TimeOutMessage) {
   551  		return fmt.Errorf("[Wait time out]: %s", additionalError)
   552  	}
   553  	return err
   554  }
   555  
   556  func waitJobPhaseExpect(ctx *TestContext, job *batchv1alpha1.Job, state batchv1alpha1.JobPhase, waitTime time.Duration) error {
   557  	var additionalError error
   558  	err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) {
   559  		job, err := ctx.Vcclient.BatchV1alpha1().Jobs(job.Namespace).Get(context.TODO(), job.Name, metav1.GetOptions{})
   560  		Expect(err).NotTo(HaveOccurred())
   561  		expected := job.Status.State.Phase == state
   562  		if !expected {
   563  			additionalError = fmt.Errorf("expected job '%s' phase in %s, actual got %s", job.Name,
   564  				state, job.Status.State.Phase)
   565  		}
   566  		return expected, nil
   567  	})
   568  	if err != nil && strings.Contains(err.Error(), TimeOutMessage) {
   569  		return fmt.Errorf("[Wait time out]: %s", additionalError)
   570  	}
   571  	return err
   572  }
   573  
   574  func WaitJobPhaseReady(ctx *TestContext, job *batchv1.Job) error {
   575  	var additionalError error
   576  
   577  	err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) {
   578  		job, err := ctx.Kubeclient.BatchV1().Jobs(job.Namespace).Get(context.TODO(), job.Name, metav1.GetOptions{})
   579  		Expect(err).NotTo(HaveOccurred())
   580  		expected := job.Status.Active > 0
   581  		if !expected {
   582  			additionalError = fmt.Errorf("expected job '%s' active pod to be greater than 0, actual got %d", job.Name, job.Status.Active)
   583  		}
   584  		return expected, nil
   585  	})
   586  
   587  	if err != nil && strings.Contains(err.Error(), TimeOutMessage) {
   588  		return fmt.Errorf("[Wait time out]: %s", additionalError)
   589  	}
   590  
   591  	return err
   592  }
   593  
   594  func WaitJobUnschedulable(ctx *TestContext, job *batchv1alpha1.Job) error {
   595  	now := time.Now()
   596  	return jobUnschedulable(ctx, job, now)
   597  }
   598  
   599  func CreateContainers(img, command, workingDir string, req, limit v1.ResourceList, hostport int32) []v1.Container {
   600  	var imageRepo []string
   601  	container := v1.Container{
   602  		Image:           img,
   603  		ImagePullPolicy: v1.PullIfNotPresent,
   604  		Resources: v1.ResourceRequirements{
   605  			Requests: req,
   606  			Limits:   limit,
   607  		},
   608  	}
   609  	if !strings.Contains(img, ":") {
   610  		imageRepo = strings.Split(img, "/")
   611  	} else {
   612  		imageRepo = strings.Split(img[:strings.Index(img, ":")], "/")
   613  	}
   614  	container.Name = imageRepo[len(imageRepo)-1]
   615  
   616  	if len(command) > 0 {
   617  		container.Command = []string{"/bin/sh"}
   618  		container.Args = []string{"-c", command}
   619  	}
   620  
   621  	if hostport > 0 {
   622  		container.Ports = []v1.ContainerPort{
   623  			{
   624  				ContainerPort: hostport,
   625  				HostPort:      hostport,
   626  			},
   627  		}
   628  	}
   629  
   630  	if len(workingDir) > 0 {
   631  		container.WorkingDir = workingDir
   632  	}
   633  
   634  	return []v1.Container{container}
   635  }
   636  
   637  // WaitJobCleanedUp waits for the Job to be cleaned up
   638  func WaitJobCleanedUp(ctx *TestContext, cleanupjob *batchv1alpha1.Job) error {
   639  	var additionalError error
   640  
   641  	pods := GetTasksOfJob(ctx, cleanupjob)
   642  
   643  	err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) {
   644  		job, err := ctx.Vcclient.BatchV1alpha1().Jobs(cleanupjob.Namespace).Get(context.TODO(), cleanupjob.Name, metav1.GetOptions{})
   645  		if err != nil && !errors.IsNotFound(err) {
   646  			return false, nil
   647  		}
   648  		if len(job.Name) != 0 {
   649  			additionalError = fmt.Errorf("job %s/%s still exist", job.Namespace, job.Name)
   650  			return false, nil
   651  		}
   652  
   653  		pgName := cleanupjob.Name + "-" + string(cleanupjob.UID)
   654  		pg, err := ctx.Vcclient.SchedulingV1beta1().PodGroups(cleanupjob.Namespace).Get(context.TODO(), pgName, metav1.GetOptions{})
   655  		if err != nil && !errors.IsNotFound(err) {
   656  			return false, nil
   657  		}
   658  		if len(pg.Name) != 0 {
   659  			additionalError = fmt.Errorf("pdgroup %s/%s still exist", job.Namespace, job.Name)
   660  			return false, nil
   661  		}
   662  
   663  		return true, nil
   664  	})
   665  	if err != nil && strings.Contains(err.Error(), TimeOutMessage) {
   666  		return fmt.Errorf("[Wait time out]: %s", additionalError)
   667  	}
   668  
   669  	for _, pod := range pods {
   670  		err := WaitPodGone(ctx, pod.Name, pod.Namespace)
   671  		if err != nil {
   672  			return err
   673  		}
   674  	}
   675  
   676  	return err
   677  }
   678  
   679  // GetTasksOfJob returns the tasks belongs to the job
   680  func GetTasksOfJob(ctx *TestContext, job *batchv1alpha1.Job) []*v1.Pod {
   681  	pods, err := ctx.Kubeclient.CoreV1().Pods(job.Namespace).List(context.TODO(), metav1.ListOptions{})
   682  	Expect(err).NotTo(HaveOccurred(), "failed to list pods in namespace %s", job.Namespace)
   683  
   684  	var tasks []*v1.Pod
   685  
   686  	for _, pod := range pods.Items {
   687  		if !metav1.IsControlledBy(&pod, job) {
   688  			continue
   689  		}
   690  		duplicatePod := pod.DeepCopy()
   691  		tasks = append(tasks, duplicatePod)
   692  	}
   693  
   694  	return tasks
   695  }
   696  
   697  // WaitPodGone waits the Pod to be deleted when aborting a Job
   698  func WaitPodGone(ctx *TestContext, podName, namespace string) error {
   699  	var additionalError error
   700  	err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) {
   701  		_, err := ctx.Kubeclient.CoreV1().Pods(namespace).Get(context.TODO(), podName, metav1.GetOptions{})
   702  		expected := errors.IsNotFound(err)
   703  		if !expected {
   704  			additionalError = fmt.Errorf("job related pod should be deleted when aborting job")
   705  		}
   706  
   707  		return expected, nil
   708  	})
   709  	if err != nil && strings.Contains(err.Error(), TimeOutMessage) {
   710  		return fmt.Errorf("[Wait time out]: %s", additionalError)
   711  	}
   712  	return err
   713  }
   714  
   715  // WaitJobTerminateAction waits for the Job to be terminated
   716  func WaitJobTerminateAction(ctx *TestContext, pg *batchv1alpha1.Job) error {
   717  	return wait.Poll(10*time.Second, FiveMinute, jobTerminateAction(ctx, pg, time.Now()))
   718  }
   719  
   720  func jobTerminateAction(ctx *TestContext, pg *batchv1alpha1.Job, time time.Time) wait.ConditionFunc {
   721  	return func() (bool, error) {
   722  		events, err := ctx.Kubeclient.CoreV1().Events(pg.Namespace).List(context.TODO(), metav1.ListOptions{})
   723  		Expect(err).NotTo(HaveOccurred(), "failed to list events in namespace %s", pg.Namespace)
   724  
   725  		for _, event := range events.Items {
   726  			target := event.InvolvedObject
   727  			if strings.HasPrefix(target.Name, pg.Name) && target.Namespace == pg.Namespace {
   728  				if event.Reason == string(ExecuteAction) && strings.Contains(event.Message, "TerminateJob") && event.LastTimestamp.After(time) {
   729  					return true, nil
   730  				}
   731  			}
   732  		}
   733  
   734  		return false, nil
   735  	}
   736  }
   737  
   738  // WaitPodPhase waits for the Pod to be the specified phase
   739  func WaitPodPhase(ctx *TestContext, pod *v1.Pod, phase []v1.PodPhase) error {
   740  	var additionalError error
   741  	err := wait.Poll(100*time.Millisecond, FiveMinute, func() (bool, error) {
   742  		pods, err := ctx.Kubeclient.CoreV1().Pods(pod.Namespace).List(context.TODO(), metav1.ListOptions{})
   743  		Expect(err).NotTo(HaveOccurred(), "failed to list pods in namespace %s", pod.Namespace)
   744  
   745  		for _, p := range phase {
   746  			for _, pod := range pods.Items {
   747  				if pod.Status.Phase == p {
   748  					return true, nil
   749  				}
   750  			}
   751  		}
   752  
   753  		additionalError = fmt.Errorf("expected pod '%s' to %v, actual got %s", pod.Name, phase, pod.Status.Phase)
   754  		return false, nil
   755  	})
   756  	if err != nil && strings.Contains(err.Error(), TimeOutMessage) {
   757  		return fmt.Errorf("[Wait time out]: %s", additionalError)
   758  	}
   759  	return err
   760  }
   761  
   762  // IsPodScheduled returns whether the Pod is scheduled
   763  func IsPodScheduled(pod *v1.Pod) bool {
   764  	for _, cond := range pod.Status.Conditions {
   765  		if cond.Type == v1.PodScheduled && cond.Status == v1.ConditionTrue {
   766  			return true
   767  		}
   768  	}
   769  	return false
   770  }
   771  
   772  // WaitTasksCompleted waits for the tasks of a job to be completed
   773  func WaitTasksCompleted(ctx *TestContext, job *batchv1alpha1.Job, successNum int32) error {
   774  	var additionalError error
   775  	err := wait.Poll(100*time.Millisecond, TwoMinute, func() (bool, error) {
   776  		pods, err := ctx.Kubeclient.CoreV1().Pods(job.Namespace).List(context.TODO(), metav1.ListOptions{})
   777  		Expect(err).NotTo(HaveOccurred(), "failed to list pods in namespace %s", job.Namespace)
   778  
   779  		var succeeded int32 = 0
   780  		for _, pod := range pods.Items {
   781  			if !metav1.IsControlledBy(&pod, job) {
   782  				continue
   783  			}
   784  
   785  			if pod.Status.Phase == "Succeeded" {
   786  				succeeded++
   787  			}
   788  		}
   789  
   790  		ready := succeeded >= successNum
   791  		if !ready {
   792  			additionalError = fmt.Errorf("expected job '%s' to have %d succeeded pods, actual got %d", job.Name,
   793  				successNum,
   794  				succeeded)
   795  		}
   796  		return ready, nil
   797  	})
   798  	if err != nil && strings.Contains(err.Error(), TimeOutMessage) {
   799  		return fmt.Errorf("[Wait time out]: %s", additionalError)
   800  	}
   801  	return err
   802  }