k8s.io/kubernetes@v1.29.3/test/e2e/apps/job.go

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package apps
    18  
    19  import (
    20  	"context"
    21  	"encoding/json"
    22  	"fmt"
    23  	"strconv"
    24  	"time"
    25  
    26  	batchv1 "k8s.io/api/batch/v1"
    27  	v1 "k8s.io/api/core/v1"
    28  	policyv1 "k8s.io/api/policy/v1"
    29  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    30  	"k8s.io/apimachinery/pkg/api/resource"
    31  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    32  	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
    33  	"k8s.io/apimachinery/pkg/labels"
    34  	"k8s.io/apimachinery/pkg/runtime/schema"
    35  	"k8s.io/apimachinery/pkg/types"
    36  	utilrand "k8s.io/apimachinery/pkg/util/rand"
    37  	"k8s.io/apimachinery/pkg/util/sets"
    38  	"k8s.io/apimachinery/pkg/util/wait"
    39  	"k8s.io/apimachinery/pkg/watch"
    40  	clientset "k8s.io/client-go/kubernetes"
    41  	"k8s.io/client-go/tools/cache"
    42  	watchtools "k8s.io/client-go/tools/watch"
    43  	"k8s.io/client-go/util/retry"
    44  	batchinternal "k8s.io/kubernetes/pkg/apis/batch"
    45  	"k8s.io/kubernetes/test/e2e/framework"
    46  	e2ejob "k8s.io/kubernetes/test/e2e/framework/job"
    47  	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
    48  	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
    49  	e2eresource "k8s.io/kubernetes/test/e2e/framework/resource"
    50  	"k8s.io/kubernetes/test/e2e/scheduling"
    51  	admissionapi "k8s.io/pod-security-admission/api"
    52  	"k8s.io/utils/pointer"
    53  	"k8s.io/utils/ptr"
    54  
    55  	"github.com/onsi/ginkgo/v2"
    56  	"github.com/onsi/gomega"
    57  )
    58  
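        // watchEventConfig bundles the parameters used by waitForJobEvent to match a
        // single expected watch event for a Job: the ListWatch and resource version to
        // start from, the Job's name, the expected event type, and the metadata
        // (label or annotation) key/value the updated object is expected to carry.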
    59  type watchEventConfig struct {
    60  	framework           *framework.Framework
    61  	resourceVersion     string
    62  	w                   *cache.ListWatch
    63  	jobName             string
    64  	watchEvent          watch.EventType
    65  	extJob              *batchv1.Job
    66  	updatedMetadataType string
    67  	updatedKey          string
    68  	updatedValue        string
    69  }
    70  
    71  var _ = SIGDescribe("Job", func() {
    72  	f := framework.NewDefaultFramework("job")
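        	// Several tests below pin the Job's pods to a single node and share state
        	// through hostPath volumes, which is not allowed under the baseline or
        	// restricted Pod Security levels, so the test namespace runs as privileged.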
    73  	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
    74  	parallelism := int32(2)
    75  	completions := int32(4)
    76  
    77  	largeParallelism := int32(90)
    78  	largeCompletions := int32(90)
    79  
    80  	backoffLimit := int32(6) // default value
    81  
    82  	// Simplest case: N pods succeed
    83  	ginkgo.It("should run a job to completion when tasks succeed", func(ctx context.Context) {
    84  		ginkgo.By("Creating a job")
    85  		job := e2ejob.NewTestJob("succeed", "all-succeed", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
    86  		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
    87  		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
    88  
    89  		ginkgo.By("Ensuring job reaches completions")
    90  		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
    91  		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
    92  
    93  		ginkgo.By("Ensuring pods for job exist")
    94  		pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
    95  		framework.ExpectNoError(err, "failed to get pod list for job in namespace: %s", f.Namespace.Name)
    96  		successes := int32(0)
    97  		for _, pod := range pods.Items {
    98  			if pod.Status.Phase == v1.PodSucceeded {
    99  				successes++
   100  			}
   101  		}
   102  		gomega.Expect(successes).To(gomega.Equal(completions), "expected %d successful job pods, but got %d", completions, successes)
   103  	})
   104  
   105  	ginkgo.It("should allow to use the pod failure policy on exit code to fail the job early", func(ctx context.Context) {
   106  
   107  		// We fail the Job's pod only once to ensure the backoffLimit is not
   108  		// reached and thus the job is failed due to the pod failure policy
   109  		// with FailJob action.
   110  		// In order to ensure a Job's pod fails once before succeeding we force
   111  		// the Job's Pods to be scheduled to a single Node and use a hostPath
   112  		// volume to persist data across new Pods.
   113  		ginkgo.By("Looking for a node to schedule job pod")
   114  		node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
   115  		framework.ExpectNoError(err)
   116  
   117  		ginkgo.By("Creating a job")
   118  		job := e2ejob.NewTestJobOnNode("failOnce", "pod-failure-failjob", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
   119  		job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
   120  			Rules: []batchv1.PodFailurePolicyRule{
   121  				{
   122  					Action: batchv1.PodFailurePolicyActionFailJob,
   123  					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
   124  						Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
   125  						Values:   []int32{1},
   126  					},
   127  				},
   128  			},
   129  		}
   130  		job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
   131  		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
   132  
   133  		ginkgo.By("Ensuring job fails")
   134  		err = e2ejob.WaitForJobFailed(f.ClientSet, f.Namespace.Name, job.Name)
   135  		framework.ExpectNoError(err, "failed to ensure job failure in namespace: %s", f.Namespace.Name)
   136  	})
   137  
   138  	ginkgo.It("should allow to use the pod failure policy to not count the failure towards the backoffLimit", func(ctx context.Context) {
   139  
   140  		// We set the backoffLimit to 0 so that any pod failure would trigger
   141  		// job failure if not for the pod failure policy to ignore the failed
   142  		// pods from counting them towards the backoffLimit. Also, we fail the
   143  		// pod only once so that the job eventually succeeds.
   144  		// In order to ensure a Job's pod fails once before succeeding we force
   145  		// the Job's Pods to be scheduled to a single Node and use a hostPath
   146  		// volume to persist data across new Pods.
   147  		backoffLimit := int32(0)
   148  
   149  		ginkgo.By("Looking for a node to schedule job pod")
   150  		node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
   151  		framework.ExpectNoError(err)
   152  
   153  		ginkgo.By("Creating a job")
   154  		job := e2ejob.NewTestJobOnNode("failOnce", "pod-failure-ignore", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
   155  		job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
   156  			Rules: []batchv1.PodFailurePolicyRule{
   157  				{
   158  					Action: batchv1.PodFailurePolicyActionIgnore,
   159  					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
   160  						Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
   161  						Values:   []int32{1},
   162  					},
   163  				},
   164  			},
   165  		}
   166  		job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
   167  		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
   168  
   169  		ginkgo.By("Ensuring job reaches completions")
   170  		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
   171  		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
   172  	})
   173  
   174  	// This test is using an indexed job. The pod corresponding to the 0th index
   175  	// creates a marker file on the host and runs 'forever' until evicted. We use
   176  	// the non-0-indexed pods to determine if the marker file is already
   177  	// created by the 0th indexed pod - the non-0-indexed pods fail and restart
   178  	// until the marker file is created (their potential failures are ignored
   179  	// based on the exit code). Once the marker file is created the 0th indexed
   180  	// pod is evicted (DisruptionTarget condition is added in the process),
   181  	// pod is evicted (DisruptionTarget condition is added in the process);
   182  	// after the restart it runs to successful completion.
   183  	// 1. Select a node to run all Job's pods to ensure the host marker file is accessible by all pods
   184  	// 2. Create the indexed job
   185  	// 3. Wait for all non-0-indexed pods to succeed to ensure the marker file is created by the 0-indexed pod
   186  	// 4. Make sure the 0-indexed pod is running
   187  	// 5. Evict the 0-indexed pod
   188  	// 6. Wait for the job to successfully complete
   189  	ginkgo.DescribeTable("Using a pod failure policy to not count some failures towards the backoffLimit",
   190  		func(ctx context.Context, policy *batchv1.PodFailurePolicy) {
   191  			mode := batchv1.IndexedCompletion
   192  
   193  			// We set the backoffLimit to 0 so that any pod failure would trigger
   194  			// job failure if not for the pod failure policy to ignore the failed
   195  			// pods from counting them towards the backoffLimit.
   196  			backoffLimit := int32(0)
   197  
   198  			ginkgo.By("Looking for a node to schedule job pods")
   199  			node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
   200  			framework.ExpectNoError(err)
   201  
   202  			ginkgo.By("Creating a job")
   203  			job := e2ejob.NewTestJobOnNode("notTerminateOnce", "pod-disruption-failure-ignore", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
   204  			job.Spec.CompletionMode = &mode
   205  			job.Spec.PodFailurePolicy = policy
   206  			job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
   207  			framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
   208  
   209  			ginkgo.By("Waiting for all non-0-indexed pods to succeed to ensure the marker file is created")
   210  			err = e2ejob.WaitForJobPodsSucceeded(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions-1)
   211  			framework.ExpectNoError(err, "failed to wait for all non-0-indexed pods to succeed for job: %s/%s", job.Name, job.Namespace)
   212  
   213  			ginkgo.By("Waiting for the 0-indexed pod to be running")
   214  			err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
   215  			framework.ExpectNoError(err, "failed to wait for the 0-indexed pod to be running for the job: %s/%s", job.Name, job.Namespace)
   216  
   217  			pods, err := e2ejob.GetAllRunningJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
   218  			framework.ExpectNoError(err, "failed to get running pods for the job: %s/%s", job.Name, job.Namespace)
   219  			gomega.Expect(pods).To(gomega.HaveLen(1), "Exactly one running pod is expected")
   220  			pod := pods[0]
   221  			ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
   222  			evictTarget := &policyv1.Eviction{
   223  				ObjectMeta: metav1.ObjectMeta{
   224  					Name:      pod.Name,
   225  					Namespace: pod.Namespace,
   226  				},
   227  			}
   228  			err = f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(ctx, evictTarget)
   229  			framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)
   230  
   231  			ginkgo.By(fmt.Sprintf("Waiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
   232  			err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
   233  			framework.ExpectNoError(err, "failed to wait for the pod to be deleted: %s/%s", pod.Name, pod.Namespace)
   234  
   235  			ginkgo.By("Ensuring job reaches completions")
   236  			err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
   237  			framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
   238  		},
   239  		ginkgo.Entry("Ignore DisruptionTarget condition", &batchv1.PodFailurePolicy{
   240  			Rules: []batchv1.PodFailurePolicyRule{
   241  				{
   242  					// Ignore failures of the non 0-indexed pods which fail until the marker file is created
   243  					Action: batchv1.PodFailurePolicyActionIgnore,
   244  					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
   245  						Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
   246  						Values:   []int32{1},
   247  					},
   248  				},
   249  				{
   250  					// Ignore the pod failure caused by the eviction
   251  					Action: batchv1.PodFailurePolicyActionIgnore,
   252  					OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
   253  						{
   254  							Type:   v1.DisruptionTarget,
   255  							Status: v1.ConditionTrue,
   256  						},
   257  					},
   258  				},
   259  			},
   260  		}),
   261  		ginkgo.Entry("Ignore exit code 137", &batchv1.PodFailurePolicy{
   262  			Rules: []batchv1.PodFailurePolicyRule{
   263  				{
   264  					// Ignore failures of the non 0-indexed pods which fail until the marker file is created
   265  					// And the 137 in the 0-indexed pod due to eviction.
   266  					Action: batchv1.PodFailurePolicyActionIgnore,
   267  					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
   268  						Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
   269  						Values:   []int32{1, 137},
   270  					},
   271  				},
   272  			},
   273  		}),
   274  	)
   275  
   276  	ginkgo.It("should not create pods when created in suspend state", func(ctx context.Context) {
   277  		ginkgo.By("Creating a job with suspend=true")
   278  		job := e2ejob.NewTestJob("succeed", "suspend-true-to-false", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
   279  		job.Spec.Suspend = pointer.BoolPtr(true)
   280  		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
   281  		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
   282  
   283  		ginkgo.By("Checking Job status to observe Suspended state")
   284  		err = e2ejob.WaitForJobSuspend(ctx, f.ClientSet, f.Namespace.Name, job.Name)
   285  		framework.ExpectNoError(err, "failed to observe suspend state of job in namespace: %s", f.Namespace.Name)
   286  
   287  		ginkgo.By("Ensuring pods aren't created for job")
   288  		pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
   289  		framework.ExpectNoError(err, "failed to list pod for a given job %s in namespace %s", job.Name, f.Namespace.Name)
   290  		gomega.Expect(pods.Items).To(gomega.BeEmpty())
   291  
   292  		ginkgo.By("Updating the job with suspend=false")
   293  		job, err = f.ClientSet.BatchV1().Jobs(f.Namespace.Name).Get(ctx, job.Name, metav1.GetOptions{})
   294  		framework.ExpectNoError(err, "failed to get job in namespace: %s", f.Namespace.Name)
   295  		job.Spec.Suspend = pointer.BoolPtr(false)
   296  		job, err = e2ejob.UpdateJob(ctx, f.ClientSet, f.Namespace.Name, job)
   297  		framework.ExpectNoError(err, "failed to update job in namespace: %s", f.Namespace.Name)
   298  
   299  		ginkgo.By("Waiting for job to complete")
   300  		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
   301  		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
   302  	})
   303  
   304  	ginkgo.It("should delete pods when suspended", func(ctx context.Context) {
   305  		ginkgo.By("Creating a job with suspend=false")
   306  		job := e2ejob.NewTestJob("notTerminate", "suspend-false-to-true", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
   307  		job.Spec.Suspend = pointer.Bool(false)
   308  		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
   309  		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
   310  
   311  		ginkgo.By("Ensure pods equal to parallelism count are attached to the job")
   312  		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, parallelism)
   313  		framework.ExpectNoError(err, "failed to ensure number of pods associated with job %s is equal to parallelism count in namespace: %s", job.Name, f.Namespace.Name)
   314  
   315  		ginkgo.By("Updating the job with suspend=true")
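        		// Retry on conflict because the job controller may update the Job object
        		// concurrently, which would cause a single Update call to fail.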
   316  		err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
   317  			job, err = e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, job.Name)
   318  			framework.ExpectNoError(err, "unable to get job %s in namespace %s", job.Name, f.Namespace.Name)
   319  			job.Spec.Suspend = pointer.Bool(true)
   320  			updatedJob, err := e2ejob.UpdateJob(ctx, f.ClientSet, f.Namespace.Name, job)
   321  			if err == nil {
   322  				job = updatedJob
   323  			}
   324  			return err
   325  		})
   326  		framework.ExpectNoError(err, "failed to update job in namespace: %s", f.Namespace.Name)
   327  
   328  		ginkgo.By("Ensuring pods are deleted")
   329  		err = e2ejob.WaitForAllJobPodsGone(ctx, f.ClientSet, f.Namespace.Name, job.Name)
   330  		framework.ExpectNoError(err, "failed to ensure pods are deleted after suspend=true")
   331  
   332  		ginkgo.By("Checking Job status to observe Suspended state")
   333  		job, err = e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, job.Name)
   334  		framework.ExpectNoError(err, "failed to retrieve latest job object")
   335  		exists := false
   336  		for _, c := range job.Status.Conditions {
   337  			if c.Type == batchv1.JobSuspended {
   338  				exists = true
   339  				break
   340  			}
   341  		}
   342  		if !exists {
   343  			framework.Failf("Job was expected to be suspended")
   344  		}
   345  	})
   346  
   347  	ginkgo.It("should recreate pods only after they have failed if pod replacement policy is set to Failed", func(ctx context.Context) {
   348  		ginkgo.By("Creating a job")
   349  		job := e2ejob.NewTestJob("", "pod-recreate-failed", v1.RestartPolicyNever, 1, 1, nil, 1)
   350  		job.Spec.PodReplacementPolicy = ptr.To(batchv1.Failed)
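        		// The container traps SIGTERM and keeps running for a few more seconds before
        		// exiting with 143 (128+SIGTERM), so a deleted pod lingers in the terminating
        		// state long enough to verify that, with the Failed replacement policy, no
        		// substitute pod is created until the old one has actually failed.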
   351  		job.Spec.Template.Spec.Containers[0].Command = []string{"/bin/sh", "-c", `_term(){
   352  	sleep 5
   353  	exit 143
   354  }
   355  trap _term SIGTERM
   356  while true; do
   357  	sleep 1
   358  done`}
   359  		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
   360  		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
   361  
   362  		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
   363  		framework.ExpectNoError(err, "failed to wait for job pod to become running in namespace: %s", f.Namespace.Name)
   364  
   365  		ginkgo.By("Deleting job pod")
   366  		pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
   367  		framework.ExpectNoError(err, "failed to get pod list for job %s in namespace: %s", job.Name, f.Namespace.Name)
   368  
   369  		framework.ExpectNoError(e2epod.DeletePodsWithGracePeriod(ctx, f.ClientSet, pods.Items, 30), "failed to delete pods in namespace: %s", f.Namespace.Name)
   370  
   371  		ginkgo.By("Ensuring pod does not get recreated while it is in terminating state")
   372  		err = e2ejob.WaitForJobState(ctx, f.ClientSet, f.Namespace.Name, job.Name, f.Timeouts.PodDelete, func(job *batchv1.Job) string {
   373  			if job.Status.Active == 0 && job.Status.Failed == 0 && *job.Status.Terminating == 1 {
   374  				return ""
   375  			} else {
   376  				return fmt.Sprintf(
   377  					"expected job to have 0 active pods, 0 failed pods and 1 terminating pod, but got %d active pods, %d failed pods and %d terminating pods",
   378  					job.Status.Active,
   379  					job.Status.Failed,
   380  					*job.Status.Terminating,
   381  				)
   382  			}
   383  		})
   384  		framework.ExpectNoError(err, "failed to ensure pod is not recreated while it is in terminating state")
   385  
   386  		ginkgo.By("Ensuring pod gets recreated after it has failed")
   387  		err = e2ejob.WaitForJobState(ctx, f.ClientSet, f.Namespace.Name, job.Name, f.Timeouts.PodDelete, func(job *batchv1.Job) string {
   388  			if job.Status.Active == 1 && job.Status.Failed == 1 && *job.Status.Terminating == 0 {
   389  				return ""
   390  			} else {
   391  				return fmt.Sprintf(
   392  					"expected job to have 1 active pod, 1 failed pod and 0 terminating pods, but got %d active pods, %d failed pods and %d terminating pods",
   393  					job.Status.Active,
   394  					job.Status.Failed,
   395  					*job.Status.Terminating,
   396  				)
   397  			}
   398  		})
   399  		framework.ExpectNoError(err, "failed to wait for pod to get recreated")
   400  	})
   401  
   402  	/*
   403  		Release: v1.24
   404  		Testname: Ensure Pods of an Indexed Job get a unique index.
   405  		Description: Create an Indexed job. Job MUST complete successfully.
   406  		Ensure that created pods have completion index annotation and environment variable.
   407  	*/
   408  	framework.ConformanceIt("should create pods for an Indexed job with completion indexes and specified hostname", func(ctx context.Context) {
   409  		ginkgo.By("Creating Indexed job")
   410  		job := e2ejob.NewTestJob("succeed", "indexed-job", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
   411  		mode := batchv1.IndexedCompletion
   412  		job.Spec.CompletionMode = &mode
   413  		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
   414  		framework.ExpectNoError(err, "failed to create indexed job in namespace %s", f.Namespace.Name)
   415  
   416  		ginkgo.By("Ensuring job reaches completions")
   417  		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
   418  		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
   419  
   420  		ginkgo.By("Ensuring pods with index for job exist")
   421  		pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
   422  		framework.ExpectNoError(err, "failed to get pod list for job in namespace: %s", f.Namespace.Name)
   423  		succeededIndexes := sets.NewInt()
   424  		for _, pod := range pods.Items {
   425  			if pod.Status.Phase == v1.PodSucceeded && pod.Annotations != nil {
   426  				ix, err := strconv.Atoi(pod.Annotations[batchv1.JobCompletionIndexAnnotation])
   427  				framework.ExpectNoError(err, "failed obtaining completion index from pod in namespace: %s", f.Namespace.Name)
   428  				succeededIndexes.Insert(ix)
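        			// Pods of an Indexed Job get a stable hostname of the form <job-name>-<index>.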
   429  				expectedName := fmt.Sprintf("%s-%d", job.Name, ix)
   430  				gomega.Expect(pod.Spec.Hostname).To(gomega.Equal(expectedName), "expected completed pod with hostname %s, but got %s", expectedName, pod.Spec.Hostname)
   431  			}
   432  		}
   433  		gotIndexes := succeededIndexes.List()
   434  		wantIndexes := []int{0, 1, 2, 3}
   435  		gomega.Expect(gotIndexes).To(gomega.Equal(wantIndexes), "expected completed indexes %s, but got %s", wantIndexes, gotIndexes)
   436  	})
   437  
   438  	/*
   439  		Testcase: Ensure that all indexes are executed for an indexed job with backoffLimitPerIndex despite some failing
   440  		Description: Create an indexed job and ensure that every index ends up either failed or succeeded, depending
   441  		on the end state of the corresponding pods. Pods with odd indexes fail, while pods with even indexes
   442  		succeed. Also, verify that the number of failed pods is twice the number of failed indexes, since
   443  		backoffLimitPerIndex=1 allows one pod recreation before an index is marked as failed.
   444  	*/
   445  	ginkgo.It("should execute all indexes despite some failing when using backoffLimitPerIndex", func(ctx context.Context) {
   446  		ginkgo.By("Creating an indexed job with backoffLimit per index and failing pods")
   447  		job := e2ejob.NewTestJob("failOddSucceedEven", "with-backoff-limit-per-index", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
   448  		job.Spec.BackoffLimit = nil
   449  		job.Spec.BackoffLimitPerIndex = ptr.To[int32](1)
   450  		mode := batchv1.IndexedCompletion
   451  		job.Spec.CompletionMode = &mode
   452  		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
   453  		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
   454  
   455  		ginkgo.By("Waiting for the job to fail as there are failed indexes")
   456  		err = e2ejob.WaitForJobFailed(f.ClientSet, f.Namespace.Name, job.Name)
   457  		framework.ExpectNoError(err, "failed to ensure job failure in namespace: %s", f.Namespace.Name)
   458  
   459  		ginkgo.By("Verifying the Job status fields to ensure all indexes were executed")
   460  		job, err = e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, job.Name)
   461  		framework.ExpectNoError(err, "failed to retrieve latest job object")
   462  		gomega.Expect(job.Status.FailedIndexes).Should(gomega.HaveValue(gomega.Equal("1,3")))
   463  		gomega.Expect(job.Status.CompletedIndexes).Should(gomega.Equal("0,2"))
   464  		gomega.Expect(job.Status.Failed).Should(gomega.Equal(int32(4)))
   465  		gomega.Expect(job.Status.Succeeded).Should(gomega.Equal(int32(2)))
   466  	})
   467  
   468  	/*
   469  		Testcase: Terminate job execution when the maxFailedIndexes is exceeded
   470  		Description: Create an indexed job with backoffLimitPerIndex and maxFailedIndexes.
   471  		Verify the job execution is terminated as soon as the number of failed
   472  		indexes exceeds maxFailedIndexes.
   473  	*/
   474  	ginkgo.It("should terminate job execution when the number of failed indexes exceeds maxFailedIndexes", func(ctx context.Context) {
   475  		// We use parallelism=1 so that only one pod is created, which keeps the assertions below simple.
   476  		parallelism := int32(1)
   477  		ginkgo.By("Creating an indexed job with backoffLimit per index and maxFailedIndexes")
   478  		job := e2ejob.NewTestJob("fail", "with-max-failed-indexes", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
   479  		job.Spec.BackoffLimit = nil
   480  		job.Spec.BackoffLimitPerIndex = ptr.To[int32](0)
   481  		job.Spec.MaxFailedIndexes = ptr.To[int32](0)
   482  
   483  		mode := batchv1.IndexedCompletion
   484  		job.Spec.CompletionMode = &mode
   485  		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
   486  		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
   487  
   488  		ginkgo.By("Waiting for the job to fail as the maxFailedIndexes limit is exceeded")
   489  		err = e2ejob.WaitForJobFailed(f.ClientSet, f.Namespace.Name, job.Name)
   490  		framework.ExpectNoError(err, "failed to ensure job failure in namespace: %s", f.Namespace.Name)
   491  
   492  		ginkgo.By("Verifying the Job status fields to ensure early termination of the job")
   493  		job, err = e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, job.Name)
   494  		framework.ExpectNoError(err, "failed to retrieve latest job object")
   495  		gomega.Expect(job.Status.FailedIndexes).Should(gomega.HaveValue(gomega.Equal("0")))
   496  		gomega.Expect(job.Status.Failed).Should(gomega.Equal(int32(1)))
   497  	})
   498  
   499  	/*
   500  		Testcase: Mark indexes as failed when the FailIndex action is matched in podFailurePolicy
   501  		Description: Create an indexed job with backoffLimitPerIndex, and podFailurePolicy
   502  		with the FailIndex action. Verify the failed pods matching the pod failure policy
   503  		result in marking the corresponding indexes as failed without restarts, despite
   504  		backoffLimitPerIndex > 0.
   505  	*/
   506  	ginkgo.It("should mark indexes as failed when the FailIndex action is matched in podFailurePolicy", func(ctx context.Context) {
   507  		completions := int32(2)
   508  
   509  		ginkgo.By("Creating an indexed job with failing pods matching the FailIndex action")
   510  		job := e2ejob.NewTestJob("failOddSucceedEven", "matching-fail-index-action", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
   511  		job.Spec.BackoffLimit = nil
   512  		job.Spec.BackoffLimitPerIndex = ptr.To[int32](1)
   513  		job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
   514  			Rules: []batchv1.PodFailurePolicyRule{
   515  				{
   516  					Action: batchv1.PodFailurePolicyActionFailIndex,
   517  					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
   518  						Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
   519  						Values:   []int32{1},
   520  					},
   521  				},
   522  			},
   523  		}
   524  		mode := batchv1.IndexedCompletion
   525  		job.Spec.CompletionMode = &mode
   526  		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
   527  		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
   528  
   529  		ginkgo.By("Waiting for the job to fail as an index is marked failed by the FailIndex action")
   530  		err = e2ejob.WaitForJobFailed(f.ClientSet, f.Namespace.Name, job.Name)
   531  		framework.ExpectNoError(err, "failed to ensure job failure in namespace: %s", f.Namespace.Name)
   532  
   533  		ginkgo.By("Verifying the Job status fields to ensure the upper indexes didn't execute")
   534  		job, err = e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, job.Name)
   535  		framework.ExpectNoError(err, "failed to retrieve latest job object")
   536  		gomega.Expect(job.Status.FailedIndexes).Should(gomega.HaveValue(gomega.Equal("1")))
   537  		gomega.Expect(job.Status.CompletedIndexes).Should(gomega.Equal("0"))
   538  		gomega.Expect(job.Status.Failed).Should(gomega.Equal(int32(1)))
   539  		gomega.Expect(job.Status.Succeeded).Should(gomega.Equal(int32(1)))
   540  	})
   541  
   542  	/*
   543  		Testcase: Ensure that the pods associated with the job are removed once the job is deleted
   544  		Description: Create a job and ensure the associated pod count is equal to parallelism count. Delete the
   545  		job and ensure if the pods associated with the job have been removed
   546  	*/
   547  	ginkgo.It("should remove pods when job is deleted", func(ctx context.Context) {
   548  		ginkgo.By("Creating a job")
   549  		job := e2ejob.NewTestJob("notTerminate", "all-pods-removed", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
   550  		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
   551  		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
   552  
   553  		ginkgo.By("Ensure pods equal to parallelism count are attached to the job")
   554  		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, parallelism)
   555  		framework.ExpectNoError(err, "failed to ensure number of pods associated with job %s is equal to parallelism count in namespace: %s", job.Name, f.Namespace.Name)
   556  
   557  		ginkgo.By("Delete the job")
   558  		err = e2eresource.DeleteResourceAndWaitForGC(ctx, f.ClientSet, batchinternal.Kind("Job"), f.Namespace.Name, job.Name)
   559  		framework.ExpectNoError(err, "failed to delete the job in namespace: %s", f.Namespace.Name)
   560  
   561  		ginkgo.By("Ensure the pods associated with the job are also deleted")
   562  		err = e2ejob.WaitForAllJobPodsGone(ctx, f.ClientSet, f.Namespace.Name, job.Name)
   563  		framework.ExpectNoError(err, "failed to get PodList for job %s in namespace: %s", job.Name, f.Namespace.Name)
   564  	})
   565  
   566  	/*
   567  		Release: v1.16
   568  		Testname: Jobs, completion after task failure
   569  		Description: Explicitly cause the tasks to fail once initially. After restarting, the Job MUST
   570  		execute to completion.
   571  	*/
   572  	framework.ConformanceIt("should run a job to completion when tasks sometimes fail and are locally restarted", func(ctx context.Context) {
   573  		ginkgo.By("Creating a job")
   574  		// One failure, then a success, local restarts.
   575  		// We can't use the random failure approach, because kubelet will
   576  		// throttle frequently failing containers in a given pod, ramping
   577  		// up to 5 minutes between restarts, making test timeout due to
   578  		// successive failures too likely with a reasonable test timeout.
   579  		job := e2ejob.NewTestJob("failOnce", "fail-once-local", v1.RestartPolicyOnFailure, parallelism, completions, nil, backoffLimit)
   580  		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
   581  		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
   582  
   583  		ginkgo.By("Ensuring job reaches completions")
   584  		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
   585  		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
   586  	})
   587  
   588  	// Pods sometimes fail, but eventually succeed, after pod restarts
   589  	ginkgo.It("should run a job to completion when tasks sometimes fail and are not locally restarted", func(ctx context.Context) {
   590  		// One failure, then a success, no local restarts.
   591  		// We can't use the random failure approach, because JobController
   592  		// will throttle frequently failing Pods of a given Job, ramping
   593  		// up to 6 minutes between restarts, making test timeout due to
   594  		// successive failures.
   595  		// Instead, we force the Job's Pods to be scheduled to a single Node
   596  		// and use a hostPath volume to persist data across new Pods.
   597  		ginkgo.By("Looking for a node to schedule job pod")
   598  		node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
   599  		framework.ExpectNoError(err)
   600  
   601  		ginkgo.By("Creating a job")
   602  		job := e2ejob.NewTestJobOnNode("failOnce", "fail-once-non-local", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
   603  		job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
   604  		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
   605  
   606  		ginkgo.By("Ensuring job reaches completions")
   607  		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, *job.Spec.Completions)
   608  		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
   609  	})
   610  
   611  	ginkgo.It("should fail when exceeds active deadline", func(ctx context.Context) {
   612  		ginkgo.By("Creating a job")
   613  		var activeDeadlineSeconds int64 = 1
   614  		job := e2ejob.NewTestJob("notTerminate", "exceed-active-deadline", v1.RestartPolicyNever, parallelism, completions, &activeDeadlineSeconds, backoffLimit)
   615  		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
   616  		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
   617  		ginkgo.By("Ensuring job past active deadline")
   618  		err = waitForJobFailure(ctx, f.ClientSet, f.Namespace.Name, job.Name, time.Duration(activeDeadlineSeconds+15)*time.Second, "DeadlineExceeded")
   619  		framework.ExpectNoError(err, "failed to ensure job past active deadline in namespace: %s", f.Namespace.Name)
   620  	})
   621  
   622  	/*
   623  		Release: v1.15
   624  		Testname: Jobs, active pods, graceful termination
   625  		Description: Create a job. Ensure the active pods reflect parallelism in the namespace and delete the job. Job MUST be deleted successfully.
   626  	*/
   627  	framework.ConformanceIt("should delete a job", func(ctx context.Context) {
   628  		ginkgo.By("Creating a job")
   629  		job := e2ejob.NewTestJob("notTerminate", "foo", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
   630  		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
   631  		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
   632  
   633  		ginkgo.By("Ensuring active pods == parallelism")
   634  		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, parallelism)
   635  		framework.ExpectNoError(err, "failed to ensure active pods == parallelism in namespace: %s", f.Namespace.Name)
   636  
   637  		ginkgo.By("delete a job")
   638  		framework.ExpectNoError(e2eresource.DeleteResourceAndWaitForGC(ctx, f.ClientSet, batchinternal.Kind("Job"), f.Namespace.Name, job.Name))
   639  
   640  		ginkgo.By("Ensuring job was deleted")
   641  		_, err = e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, job.Name)
   642  		framework.ExpectError(err, "failed to ensure job %s was deleted in namespace: %s", job.Name, f.Namespace.Name)
   643  		if !apierrors.IsNotFound(err) {
   644  			framework.Failf("failed to ensure job %s was deleted in namespace: %s", job.Name, f.Namespace.Name)
   645  		}
   646  	})
   647  
   648  	/*
   649  		Release: v1.16
   650  		Testname: Jobs, orphan pods, re-adoption
   651  		Description: Create a parallel job. The number of Pods MUST equal the level of parallelism.
   652  		Orphan a Pod by modifying its owner reference. The Job MUST re-adopt the orphan pod.
   653  		Modify the labels of one of the Job's Pods. The Job MUST release the Pod.
   654  	*/
   655  	framework.ConformanceIt("should adopt matching orphans and release non-matching pods", func(ctx context.Context) {
   656  		ginkgo.By("Creating a job")
   657  		job := e2ejob.NewTestJob("notTerminate", "adopt-release", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
   658  		// Replace job with the one returned from Create() so it has the UID.
   659  		// Save Kind since it won't be populated in the returned job.
   660  		kind := job.Kind
   661  		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
   662  		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
   663  		job.Kind = kind
   664  
   665  		ginkgo.By("Ensuring active pods == parallelism")
   666  		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, parallelism)
   667  		framework.ExpectNoError(err, "failed to ensure active pods == parallelism in namespace: %s", f.Namespace.Name)
   668  
   669  		ginkgo.By("Orphaning one of the Job's Pods")
   670  		pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
   671  		framework.ExpectNoError(err, "failed to get PodList for job %s in namespace: %s", job.Name, f.Namespace.Name)
   672  		gomega.Expect(pods.Items).To(gomega.HaveLen(int(parallelism)))
   673  		pod := pods.Items[0]
   674  		e2epod.NewPodClient(f).Update(ctx, pod.Name, func(pod *v1.Pod) {
   675  			pod.OwnerReferences = nil
   676  		})
   677  
   678  		ginkgo.By("Checking that the Job readopts the Pod")
   679  		gomega.Expect(e2epod.WaitForPodCondition(ctx, f.ClientSet, pod.Namespace, pod.Name, "adopted", e2ejob.JobTimeout,
   680  			func(pod *v1.Pod) (bool, error) {
   681  				controllerRef := metav1.GetControllerOf(pod)
   682  				if controllerRef == nil {
   683  					return false, nil
   684  				}
   685  				if controllerRef.Kind != job.Kind || controllerRef.Name != job.Name || controllerRef.UID != job.UID {
   686  					return false, fmt.Errorf("pod has wrong controllerRef: got %v, want %v", controllerRef, job)
   687  				}
   688  				return true, nil
   689  			},
   690  		)).To(gomega.Succeed(), "wait for pod %q to be readopted", pod.Name)
   691  
   692  		ginkgo.By("Removing the labels from the Job's Pod")
   693  		e2epod.NewPodClient(f).Update(ctx, pod.Name, func(pod *v1.Pod) {
   694  			pod.Labels = nil
   695  		})
   696  
   697  		ginkgo.By("Checking that the Job releases the Pod")
   698  		gomega.Expect(e2epod.WaitForPodCondition(ctx, f.ClientSet, pod.Namespace, pod.Name, "released", e2ejob.JobTimeout,
   699  			func(pod *v1.Pod) (bool, error) {
   700  				controllerRef := metav1.GetControllerOf(pod)
   701  				if controllerRef != nil {
   702  					return false, nil
   703  				}
   704  				return true, nil
   705  			},
   706  		)).To(gomega.Succeed(), "wait for pod %q to be released", pod.Name)
   707  	})
   708  
   709  	ginkgo.It("should fail to exceed backoffLimit", func(ctx context.Context) {
   710  		ginkgo.By("Creating a job")
   711  		backoff := 1
   712  		job := e2ejob.NewTestJob("fail", "backofflimit", v1.RestartPolicyNever, 1, 1, nil, int32(backoff))
   713  		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
   714  		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
   715  		ginkgo.By("Ensuring job exceed backofflimit")
   716  
   717  		err = waitForJobFailure(ctx, f.ClientSet, f.Namespace.Name, job.Name, e2ejob.JobTimeout, "BackoffLimitExceeded")
   718  		framework.ExpectNoError(err, "failed to ensure job exceed backofflimit in namespace: %s", f.Namespace.Name)
   719  
   720  		ginkgo.By(fmt.Sprintf("Checking that %d pods were created and have a failed status", backoff+1))
   721  		pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
   722  		framework.ExpectNoError(err, "failed to get PodList for job %s in namespace: %s", job.Name, f.Namespace.Name)
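        		// backoffLimit=1 allows one retry, so exactly backoff+1 pods are expected:
        		// the initial attempt plus a single retry, all of which should have failed.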
   723  		gomega.Expect(pods.Items).To(gomega.HaveLen(backoff + 1))
   724  		for _, pod := range pods.Items {
   725  			gomega.Expect(pod.Status.Phase).To(gomega.Equal(v1.PodFailed))
   726  		}
   727  	})
   728  
   729  	f.It("should run a job to completion with CPU requests", f.WithSerial(), func(ctx context.Context) {
   730  		ginkgo.By("Creating a job with CPU requests")
   731  
   732  		testNodeName := scheduling.GetNodeThatCanRunPod(ctx, f)
   733  		targetNode, err := f.ClientSet.CoreV1().Nodes().Get(ctx, testNodeName, metav1.GetOptions{})
   734  		framework.ExpectNoError(err, "unable to get node object for node %v", testNodeName)
   735  
   736  		cpu, ok := targetNode.Status.Allocatable[v1.ResourceCPU]
   737  		if !ok {
   738  			framework.Failf("Unable to get node's %q cpu", targetNode.Name)
   739  		}
   740  
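        		// Request roughly 20% of the node's allocatable CPU per pod. Because every pod
        		// is pinned to this node via the nodeSelector below, only a few of the 90 pods
        		// can run at a time; the rest stay pending until capacity frees up.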
   741  		cpuRequest := fmt.Sprint(int64(0.2 * float64(cpu.Value())))
   742  
   743  		backoff := 0
   744  		ginkgo.By("Creating a job")
   745  		job := e2ejob.NewTestJob("succeed", "all-succeed", v1.RestartPolicyNever, largeParallelism, largeCompletions, nil, int32(backoff))
   746  		for i := range job.Spec.Template.Spec.Containers {
   747  			job.Spec.Template.Spec.Containers[i].Resources = v1.ResourceRequirements{
   748  				Requests: v1.ResourceList{
   749  					v1.ResourceCPU: resource.MustParse(cpuRequest),
   750  				},
   751  			}
   752  			job.Spec.Template.Spec.NodeSelector = map[string]string{"kubernetes.io/hostname": testNodeName}
   753  		}
   754  
   755  		framework.Logf("Creating job %q with a node hostname selector %q with cpu request %q", job.Name, testNodeName, cpuRequest)
   756  		job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
   757  		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
   758  
   759  		ginkgo.By("Ensuring job reaches completions")
   760  		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, largeCompletions)
   761  		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
   762  
   763  		ginkgo.By("Ensuring pods for job exist")
   764  		pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
   765  		framework.ExpectNoError(err, "failed to get pod list for job in namespace: %s", f.Namespace.Name)
   766  		successes := int32(0)
   767  		for _, pod := range pods.Items {
   768  			if pod.Status.Phase == v1.PodSucceeded {
   769  				successes++
   770  			}
   771  		}
   772  		gomega.Expect(successes).To(gomega.Equal(largeCompletions), "expected %d successful job pods, but got %d", largeCompletions, successes)
   773  	})
   774  
   775  	/*
   776  		Release: v1.24
   777  		Testname: Jobs, apply changes to status
   778  		Description: Attempt to create a running Job which MUST succeed.
   779  		Attempt to patch the Job status to include a new start time which
   780  		MUST succeed. An annotation for the job that was patched MUST be found.
   781  		Attempt to replace the job status with a new start time which MUST
   782  		succeed. Attempt to read its status sub-resource which MUST succeed
   783  	*/
   784  	framework.ConformanceIt("should apply changes to a job status", func(ctx context.Context) {
   785  
   786  		ns := f.Namespace.Name
   787  		jClient := f.ClientSet.BatchV1().Jobs(ns)
   788  
   789  		ginkgo.By("Creating a job")
   790  		job := e2ejob.NewTestJob("notTerminate", "suspend-false-to-true", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
   791  		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
   792  		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
   793  
   794  		ginkgo.By("Ensure pods equal to parallelism count are attached to the job")
   795  		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, parallelism)
   796  		framework.ExpectNoError(err, "failed to ensure number of pods associated with job %s is equal to parallelism count in namespace: %s", job.Name, f.Namespace.Name)
   797  
   798  		// /status subresource operations
   799  		ginkgo.By("patching /status")
   800  		// we need to use RFC3339 version since conversion over the wire cuts nanoseconds
   801  		now1 := metav1.Now().Rfc3339Copy()
   802  		jStatus := batchv1.JobStatus{
   803  			StartTime: &now1,
   804  		}
   805  
   806  		jStatusJSON, err := json.Marshal(jStatus)
   807  		framework.ExpectNoError(err)
   808  		patchedStatus, err := jClient.Patch(ctx, job.Name, types.MergePatchType,
   809  			[]byte(`{"metadata":{"annotations":{"patchedstatus":"true"}},"status":`+string(jStatusJSON)+`}`),
   810  			metav1.PatchOptions{}, "status")
   811  		framework.ExpectNoError(err)
   812  		if !patchedStatus.Status.StartTime.Equal(&now1) {
   813  			framework.Failf("patched object should have the applied StartTime %#v, got %#v instead", jStatus.StartTime, patchedStatus.Status.StartTime)
   814  		}
   815  		gomega.Expect(patchedStatus.Annotations).To(gomega.HaveKeyWithValue("patchedstatus", "true"), "patched object should have the applied annotation")
   816  
   817  		ginkgo.By("updating /status")
   818  		// we need to use RFC3339 version since conversion over the wire cuts nanoseconds
   819  		now2 := metav1.Now().Rfc3339Copy()
   820  		var statusToUpdate, updatedStatus *batchv1.Job
   821  		err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
   822  			statusToUpdate, err = jClient.Get(ctx, job.Name, metav1.GetOptions{})
   823  			if err != nil {
   824  				return err
   825  			}
   826  			statusToUpdate.Status.StartTime = &now2
   827  			updatedStatus, err = jClient.UpdateStatus(ctx, statusToUpdate, metav1.UpdateOptions{})
   828  			return err
   829  		})
   830  		framework.ExpectNoError(err)
   831  		if !updatedStatus.Status.StartTime.Equal(&now2) {
   832  			framework.Failf("updated object status expected to have updated StartTime %#v, got %#v", statusToUpdate.Status.StartTime, updatedStatus.Status.StartTime)
   833  		}
   834  
   835  		ginkgo.By("get /status")
   836  		jResource := schema.GroupVersionResource{Group: "batch", Version: "v1", Resource: "jobs"}
   837  		gottenStatus, err := f.DynamicClient.Resource(jResource).Namespace(ns).Get(ctx, job.Name, metav1.GetOptions{}, "status")
   838  		framework.ExpectNoError(err)
   839  		statusUID, _, err := unstructured.NestedFieldCopy(gottenStatus.Object, "metadata", "uid")
   840  		framework.ExpectNoError(err)
   841  		gomega.Expect(string(job.UID)).To(gomega.Equal(statusUID), fmt.Sprintf("job.UID: %v expected to match statusUID: %v ", job.UID, statusUID))
   842  	})
   843  
   844  	/*
   845  		Release: v1.25
   846  		Testname: Jobs, manage lifecycle
   847  		Description: Attempt to create a suspended Job which MUST succeed.
   848  		Attempt to patch the Job to include a new label which MUST succeed.
   849  		The label MUST be found. Attempt to replace the Job to include a
   850  		new annotation which MUST succeed. The annotation MUST be found.
   851  		Attempt to list all namespaces with a label selector which MUST
   852  		Attempt to list Jobs in all namespaces with a label selector which MUST
   853  		succeed. Exactly one Job MUST be found. It MUST succeed at deleting a
   854  	*/
   855  	framework.ConformanceIt("should manage the lifecycle of a job", func(ctx context.Context) {
   856  		jobName := "e2e-" + utilrand.String(5)
   857  		label := map[string]string{"e2e-job-label": jobName}
   858  		labelSelector := labels.SelectorFromSet(label).String()
   859  
   860  		ns := f.Namespace.Name
   861  		jobClient := f.ClientSet.BatchV1().Jobs(ns)
   862  
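        		// This label-scoped ListWatch feeds waitForJobEvent, which is used below to
        		// observe the patch, update, and delete events for the Job.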
   863  		w := &cache.ListWatch{
   864  			WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
   865  				options.LabelSelector = labelSelector
   866  				return jobClient.Watch(ctx, options)
   867  			},
   868  		}
   869  		jobsList, err := jobClient.List(ctx, metav1.ListOptions{LabelSelector: labelSelector})
   870  		framework.ExpectNoError(err, "failed to list Job")
   871  
   872  		ginkgo.By("Creating a suspended job")
   873  		job := e2ejob.NewTestJob("succeed", jobName, v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
   874  		job.Labels = label
   875  		job.Spec.Suspend = pointer.BoolPtr(true)
   876  		job, err = e2ejob.CreateJob(ctx, f.ClientSet, ns, job)
   877  		framework.ExpectNoError(err, "failed to create job in namespace: %s", ns)
   878  
   879  		ginkgo.By("Patching the Job")
   880  		payload := "{\"metadata\":{\"labels\":{\"" + jobName + "\":\"patched\"}}}"
   881  		patchedJob, err := f.ClientSet.BatchV1().Jobs(ns).Patch(ctx, jobName, types.StrategicMergePatchType, []byte(payload), metav1.PatchOptions{})
   882  		framework.ExpectNoError(err, "failed to patch Job %s in namespace %s", jobName, ns)
   883  
   884  		ginkgo.By("Watching for Job to be patched")
   885  		c := watchEventConfig{
   886  			framework:           f,
   887  			resourceVersion:     jobsList.ResourceVersion,
   888  			w:                   w,
   889  			jobName:             jobName,
   890  			watchEvent:          watch.Modified,
   891  			extJob:              patchedJob,
   892  			updatedMetadataType: "label",
   893  			updatedKey:          jobName,
   894  			updatedValue:        "patched",
   895  		}
   896  		waitForJobEvent(ctx, c)
   897  		gomega.Expect(patchedJob.Labels).To(gomega.HaveKeyWithValue(jobName, "patched"), "Did not find job label for this job. Current labels: %v", patchedJob.Labels)
   898  
   899  		ginkgo.By("Updating the job")
   900  		var updatedJob *batchv1.Job
   901  
   902  		err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
   903  			patchedJob, err = jobClient.Get(ctx, jobName, metav1.GetOptions{})
   904  			framework.ExpectNoError(err, "Unable to get job %s", jobName)
   905  			patchedJob.Spec.Suspend = pointer.BoolPtr(false)
   906  			if patchedJob.Annotations == nil {
   907  				patchedJob.Annotations = map[string]string{}
   908  			}
   909  			patchedJob.Annotations["updated"] = "true"
   910  			updatedJob, err = e2ejob.UpdateJob(ctx, f.ClientSet, ns, patchedJob)
   911  			return err
   912  		})
   913  		framework.ExpectNoError(err, "failed to update job in namespace: %s", ns)
   914  
   915  		ginkgo.By("Watching for Job to be updated")
   916  		c = watchEventConfig{
   917  			framework:           f,
   918  			resourceVersion:     patchedJob.ResourceVersion,
   919  			w:                   w,
   920  			jobName:             jobName,
   921  			watchEvent:          watch.Modified,
   922  			extJob:              updatedJob,
   923  			updatedMetadataType: "annotation",
   924  			updatedKey:          "updated",
   925  			updatedValue:        "true",
   926  		}
   927  		waitForJobEvent(ctx, c)
   928  		gomega.Expect(updatedJob.Annotations).To(gomega.HaveKeyWithValue("updated", "true"), "updated Job should have the applied annotation")
   929  		framework.Logf("Found Job annotations: %#v", updatedJob.Annotations)
   930  
   931  		ginkgo.By("Listing all Jobs with LabelSelector")
   932  		jobs, err := f.ClientSet.BatchV1().Jobs("").List(ctx, metav1.ListOptions{LabelSelector: labelSelector})
   933  		framework.ExpectNoError(err, "Failed to list job. %v", err)
   934  		gomega.Expect(jobs.Items).To(gomega.HaveLen(1), "Failed to find job %v", jobName)
   935  		testJob := jobs.Items[0]
   936  		framework.Logf("Job: %v has labels: %v", testJob.Name, testJob.Labels)
   937  
   938  		ginkgo.By("Waiting for job to complete")
   939  		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, ns, jobName, completions)
   940  		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", ns)
   941  
   942  		ginkgo.By("Delete a job collection with a labelselector")
   943  		propagationPolicy := metav1.DeletePropagationBackground
   944  		err = f.ClientSet.BatchV1().Jobs(ns).DeleteCollection(ctx, metav1.DeleteOptions{PropagationPolicy: &propagationPolicy}, metav1.ListOptions{LabelSelector: labelSelector})
   945  		framework.ExpectNoError(err, "failed to delete job %s in namespace: %s", job.Name, ns)
   946  
   947  		ginkgo.By("Watching for Job to be deleted")
   948  		c = watchEventConfig{
   949  			framework:           f,
   950  			resourceVersion:     updatedJob.ResourceVersion,
   951  			w:                   w,
   952  			jobName:             jobName,
   953  			watchEvent:          watch.Deleted,
   954  			extJob:              &testJob,
   955  			updatedMetadataType: "label",
   956  			updatedKey:          "e2e-job-label",
   957  			updatedValue:        jobName,
   958  		}
   959  		waitForJobEvent(ctx, c)
   960  
   961  		ginkgo.By("Relist jobs to confirm deletion")
   962  		jobs, err = f.ClientSet.BatchV1().Jobs("").List(ctx, metav1.ListOptions{LabelSelector: labelSelector})
   963  		framework.ExpectNoError(err, "Failed to list job. %v", err)
   964  		gomega.Expect(jobs.Items).To(gomega.BeEmpty(), "Found job %v", jobName)
   965  	})
   966  
   967  	ginkgo.It("should update the status ready field", func(ctx context.Context) {
   968  		ginkgo.By("Creating a job with suspend=true")
   969  		job := e2ejob.NewTestJob("notTerminate", "all-ready", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
   970  		job.Spec.Suspend = ptr.To[bool](true)
   971  		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
   972  		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
   973  
   974  		ginkgo.By("Ensure the job controller updates the status.ready field")
   975  		err = e2ejob.WaitForJobReady(ctx, f.ClientSet, f.Namespace.Name, job.Name, ptr.To[int32](0))
   976  		framework.ExpectNoError(err, "failed to ensure job status ready field in namespace: %s", f.Namespace.Name)
   977  
   978  		ginkgo.By("Updating the job with suspend=false")
   979  		err = updateJobSuspendWithRetries(ctx, f, job, ptr.To[bool](false))
   980  		framework.ExpectNoError(err, "failed to update job in namespace: %s", f.Namespace.Name)
   981  
   982  		ginkgo.By("Ensure the job controller updates the status.ready field")
   983  		err = e2ejob.WaitForJobReady(ctx, f.ClientSet, f.Namespace.Name, job.Name, &parallelism)
   984  		framework.ExpectNoError(err, "failed to ensure job status ready field in namespace: %s", f.Namespace.Name)
   985  
   986  		ginkgo.By("Updating the job with suspend=true")
   987  		err = updateJobSuspendWithRetries(ctx, f, job, ptr.To[bool](true))
   988  		framework.ExpectNoError(err, "failed to update job in namespace: %s", f.Namespace.Name)
   989  
   990  		ginkgo.By("Ensure the job controller updates the status.ready field")
   991  		err = e2ejob.WaitForJobReady(ctx, f.ClientSet, f.Namespace.Name, job.Name, ptr.To[int32](0))
   992  		framework.ExpectNoError(err, "failed to ensure job status ready field in namespace: %s", f.Namespace.Name)
   993  	})
   994  })
   995  
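        // updateJobSuspendWithRetries fetches the latest version of the Job, sets
        // .spec.suspend to the given value, and retries on conflict so that concurrent
        // updates from the job controller do not fail the caller.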
   996  func updateJobSuspendWithRetries(ctx context.Context, f *framework.Framework, job *batchv1.Job, suspend *bool) error {
   997  	return retry.RetryOnConflict(retry.DefaultRetry, func() error {
   998  		job, err := e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, job.Name)
   999  		framework.ExpectNoError(err, "unable to get job %s in namespace %s", job.Name, f.Namespace.Name)
  1000  		job.Spec.Suspend = suspend
  1001  		_, err = e2ejob.UpdateJob(ctx, f.ClientSet, f.Namespace.Name, job)
  1002  		return err
  1003  	})
  1004  }
  1005  
  1006  // waitForJobEvent is used to track and log Job events.
  1007  // As delivery of events is not actually guaranteed we
  1008  // will not return an error if we miss the required event.
  1009  func waitForJobEvent(ctx context.Context, config watchEventConfig) {
  1010  	f := config.framework
  1011  	ctx, cancel := context.WithTimeout(ctx, f.Timeouts.PodStartShort)
  1012  	defer cancel()
  1013  	_, err := watchtools.Until(ctx, config.resourceVersion, config.w, func(event watch.Event) (bool, error) {
  1014  		if job, ok := event.Object.(*batchv1.Job); ok {
  1015  
  1016  			var key string
  1017  			switch config.updatedMetadataType {
  1018  			case "annotation":
  1019  				key = job.Annotations[config.updatedKey]
  1020  			case "label":
  1021  				key = job.Labels[config.updatedKey]
  1022  			}
  1023  
  1024  			found := job.ObjectMeta.Name == config.extJob.ObjectMeta.Name &&
  1025  				job.ObjectMeta.Namespace == f.Namespace.Name &&
  1026  				key == config.updatedValue &&
  1027  				event.Type == config.watchEvent
  1028  			if !found {
  1029  				framework.Logf("Event %v observed for Job %v in namespace %v with labels: %v and annotations: %v", event.Type, job.ObjectMeta.Name, job.ObjectMeta.Namespace, job.Labels, job.Annotations)
  1030  				return false, nil
  1031  			}
  1032  			framework.Logf("Event %v found for Job %v in namespace %v with labels: %v and annotations: %v", event.Type, job.ObjectMeta.Name, job.ObjectMeta.Namespace, job.Labels, job.Annotations)
  1033  			return found, nil
  1034  		}
  1035  		framework.Logf("Observed event: %+v", event.Object)
  1036  		return false, nil
  1037  	})
  1038  	if err != nil {
  1039  		j, _ := f.ClientSet.BatchV1().Jobs(f.Namespace.Name).Get(ctx, config.jobName, metav1.GetOptions{})
  1040  		framework.Logf("We missed the %v event. Job details: %+v", config.watchEvent, j)
  1041  	}
  1042  }
  1043  
  1044  // waitForJobFailure uses c to wait for up to timeout for the Job named jobName in namespace ns to fail.
  1045  func waitForJobFailure(ctx context.Context, c clientset.Interface, ns, jobName string, timeout time.Duration, reason string) error {
  1046  	return wait.PollUntilContextTimeout(ctx, framework.Poll, timeout, false, func(ctx context.Context) (bool, error) {
  1047  		curr, err := c.BatchV1().Jobs(ns).Get(ctx, jobName, metav1.GetOptions{})
  1048  		if err != nil {
  1049  			return false, err
  1050  		}
  1051  		for _, c := range curr.Status.Conditions {
  1052  			if c.Type == batchv1.JobFailed && c.Status == v1.ConditionTrue {
  1053  				if reason == "" || reason == c.Reason {
  1054  					return true, nil
  1055  				}
  1056  			}
  1057  		}
  1058  		return false, nil
  1059  	})
  1060  }