k8s.io/kubernetes@v1.29.3/test/e2e/apps/job.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package apps

import (
	"context"
	"encoding/json"
	"fmt"
	"strconv"
	"time"

	batchv1 "k8s.io/api/batch/v1"
	v1 "k8s.io/api/core/v1"
	policyv1 "k8s.io/api/policy/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/types"
	utilrand "k8s.io/apimachinery/pkg/util/rand"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/apimachinery/pkg/watch"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/cache"
	watchtools "k8s.io/client-go/tools/watch"
	"k8s.io/client-go/util/retry"
	batchinternal "k8s.io/kubernetes/pkg/apis/batch"
	"k8s.io/kubernetes/test/e2e/framework"
	e2ejob "k8s.io/kubernetes/test/e2e/framework/job"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	e2eresource "k8s.io/kubernetes/test/e2e/framework/resource"
	"k8s.io/kubernetes/test/e2e/scheduling"
	admissionapi "k8s.io/pod-security-admission/api"
	"k8s.io/utils/pointer"
	"k8s.io/utils/ptr"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
)

// watchEventConfig bundles the parameters used by waitForJobEvent to match a
// specific watch event for a Job.
type watchEventConfig struct {
	framework           *framework.Framework
	resourceVersion     string
	w                   *cache.ListWatch
	jobName             string
	watchEvent          watch.EventType
	extJob              *batchv1.Job
	updatedMetadataType string
	updatedKey          string
	updatedValue        string
}

var _ = SIGDescribe("Job", func() {
	f := framework.NewDefaultFramework("job")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged
	parallelism := int32(2)
	completions := int32(4)

	largeParallelism := int32(90)
	largeCompletions := int32(90)

	backoffLimit := int32(6) // default value

	// Simplest case: N pods succeed
	ginkgo.It("should run a job to completion when tasks succeed", func(ctx context.Context) {
		ginkgo.By("Creating a job")
		job := e2ejob.NewTestJob("succeed", "all-succeed", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring job reaches completions")
		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring pods for job exist")
		pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to get pod list for job in namespace: %s", f.Namespace.Name)
		successes := int32(0)
		for _, pod := range pods.Items {
			if pod.Status.Phase == v1.PodSucceeded {
				successes++
			}
		}
		gomega.Expect(successes).To(gomega.Equal(completions), "expected %d successful job pods, but got %d", completions, successes)
	})

	ginkgo.It("should allow to use the pod failure policy on exit code to fail the job early", func(ctx context.Context) {

		// We fail the Job's pod only once to ensure the backoffLimit is not
		// reached and thus the job is failed due to the pod failure policy
		// with FailJob action.
		// In order to ensure a Job's pod fails once before succeeding we force
		// the Job's Pods to be scheduled to a single Node and use a hostPath
		// volume to persist data across new Pods.
		ginkgo.By("Looking for a node to schedule job pod")
		node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
		framework.ExpectNoError(err)

		ginkgo.By("Creating a job")
		job := e2ejob.NewTestJobOnNode("failOnce", "pod-failure-failjob", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
		job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
			Rules: []batchv1.PodFailurePolicyRule{
				{
					Action: batchv1.PodFailurePolicyActionFailJob,
					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
						Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
						Values:   []int32{1},
					},
				},
			},
		}
		job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring job fails")
		err = e2ejob.WaitForJobFailed(f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to ensure job failure in namespace: %s", f.Namespace.Name)
	})

	ginkgo.It("should allow to use the pod failure policy to not count the failure towards the backoffLimit", func(ctx context.Context) {

		// We set the backoffLimit to 0 so that any pod failure would trigger
		// job failure if not for the pod failure policy, which ignores failed
		// pods so they do not count towards the backoffLimit. Also, we fail the
		// pod only once so that the job eventually succeeds.
		// In order to ensure a Job's pod fails once before succeeding we force
		// the Job's Pods to be scheduled to a single Node and use a hostPath
		// volume to persist data across new Pods.
		backoffLimit := int32(0)

		ginkgo.By("Looking for a node to schedule job pod")
		node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
		framework.ExpectNoError(err)

		ginkgo.By("Creating a job")
		job := e2ejob.NewTestJobOnNode("failOnce", "pod-failure-ignore", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
		job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
			Rules: []batchv1.PodFailurePolicyRule{
				{
					Action: batchv1.PodFailurePolicyActionIgnore,
					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
						Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
						Values:   []int32{1},
					},
				},
			},
		}
		job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring job reaches completions")
		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
	})

	// This test uses an indexed job. The pod corresponding to the 0th index
	// creates a marker file on the host and runs 'forever' until evicted. We use
	// the non-0-indexed pods to determine if the marker file is already
	// created by the 0th indexed pod - the non-0-indexed pods fail and restart
	// until the marker file is created (their potential failures are ignored
	// based on the exit code). Once the marker file is created the 0th indexed
	// pod is evicted (DisruptionTarget condition is added in the process),
	// after restart it runs to successful completion.
	// Steps:
	// 1. Select a node to run all Job's pods to ensure the host marker file is accessible by all pods
	// 2. Create the indexed job
	// 3. Await for all non-0-indexed pods to succeed to ensure the marker file is created by the 0-indexed pod
	// 4. Make sure the 0-indexed pod is running
	// 5. Evict the 0-indexed pod
	// 6. Await for the job to successfully complete
	ginkgo.DescribeTable("Using a pod failure policy to not count some failures towards the backoffLimit",
		func(ctx context.Context, policy *batchv1.PodFailurePolicy) {
			mode := batchv1.IndexedCompletion

			// We set the backoffLimit to 0 so that any pod failure would trigger
			// job failure if not for the pod failure policy, which ignores failed
			// pods so they do not count towards the backoffLimit.
			backoffLimit := int32(0)

			ginkgo.By("Looking for a node to schedule job pods")
			node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
			framework.ExpectNoError(err)

			ginkgo.By("Creating a job")
			job := e2ejob.NewTestJobOnNode("notTerminateOnce", "pod-disruption-failure-ignore", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
			job.Spec.CompletionMode = &mode
			job.Spec.PodFailurePolicy = policy
			job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
			framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

			ginkgo.By("Awaiting for all non 0-indexed pods to succeed to ensure the marker file is created")
			err = e2ejob.WaitForJobPodsSucceeded(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions-1)
			framework.ExpectNoError(err, "failed to await for all non 0-indexed pods to succeed for job: %s/%s", job.Name, job.Namespace)

			ginkgo.By("Awaiting for the 0-indexed pod to be running")
			err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
			framework.ExpectNoError(err, "failed to await for the 0-indexed pod to be running for the job: %s/%s", job.Name, job.Namespace)

			pods, err := e2ejob.GetAllRunningJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
			framework.ExpectNoError(err, "failed to get running pods for the job: %s/%s", job.Name, job.Namespace)
			gomega.Expect(pods).To(gomega.HaveLen(1), "Exactly one running pod is expected")
			pod := pods[0]
			ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
			evictTarget := &policyv1.Eviction{
				ObjectMeta: metav1.ObjectMeta{
					Name:      pod.Name,
					Namespace: pod.Namespace,
				},
			}
			err = f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(ctx, evictTarget)
			framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)

			ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
			err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
			framework.ExpectNoError(err, "failed to await for the pod to be deleted: %s/%s", pod.Name, pod.Namespace)

			ginkgo.By("Ensuring job reaches completions")
			err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
			framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
		},
		ginkgo.Entry("Ignore DisruptionTarget condition", &batchv1.PodFailurePolicy{
			Rules: []batchv1.PodFailurePolicyRule{
				{
					// Ignore failures of the non 0-indexed pods which fail until the marker file is created
					Action: batchv1.PodFailurePolicyActionIgnore,
					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
						Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
						Values:   []int32{1},
					},
				},
				{
					// Ignore the pod failure caused by the eviction
					Action: batchv1.PodFailurePolicyActionIgnore,
					OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
						{
							Type:   v1.DisruptionTarget,
							Status: v1.ConditionTrue,
						},
					},
				},
			},
		}),
		ginkgo.Entry("Ignore exit code 137", &batchv1.PodFailurePolicy{
			Rules: []batchv1.PodFailurePolicyRule{
				{
					// Ignore failures of the non 0-indexed pods which fail until the marker file is created,
					// and the 137 exit code in the 0-indexed pod due to eviction.
					Action: batchv1.PodFailurePolicyActionIgnore,
					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
						Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
						Values:   []int32{1, 137},
					},
				},
			},
		}),
	)

	ginkgo.It("should not create pods when created in suspend state", func(ctx context.Context) {
		ginkgo.By("Creating a job with suspend=true")
		job := e2ejob.NewTestJob("succeed", "suspend-true-to-false", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		job.Spec.Suspend = pointer.BoolPtr(true)
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Checking Job status to observe Suspended state")
		err = e2ejob.WaitForJobSuspend(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to observe suspend state: %s", f.Namespace.Name)

		ginkgo.By("Ensuring pods aren't created for job")
		pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to list pod for a given job %s in namespace %s", job.Name, f.Namespace.Name)
		gomega.Expect(pods.Items).To(gomega.BeEmpty())

		ginkgo.By("Updating the job with suspend=false")
		job, err = f.ClientSet.BatchV1().Jobs(f.Namespace.Name).Get(ctx, job.Name, metav1.GetOptions{})
		framework.ExpectNoError(err, "failed to get job in namespace: %s", f.Namespace.Name)
		job.Spec.Suspend = pointer.BoolPtr(false)
		job, err = e2ejob.UpdateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to update job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Waiting for job to complete")
		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
	})

	ginkgo.It("should delete pods when suspended", func(ctx context.Context) {
		ginkgo.By("Creating a job with suspend=false")
		job := e2ejob.NewTestJob("notTerminate", "suspend-false-to-true", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		job.Spec.Suspend = pointer.Bool(false)
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensure pods equal to parallelism count is attached to the job")
		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, parallelism)
		framework.ExpectNoError(err, "failed to ensure number of pods associated with job %s is equal to parallelism count in namespace: %s", job.Name, f.Namespace.Name)

		ginkgo.By("Updating the job with suspend=true")
		err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
			job, err = e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, job.Name)
			framework.ExpectNoError(err, "unable to get job %s in namespace %s", job.Name, f.Namespace.Name)
			job.Spec.Suspend = pointer.Bool(true)
			updatedJob, err := e2ejob.UpdateJob(ctx, f.ClientSet, f.Namespace.Name, job)
			if err == nil {
				job = updatedJob
			}
			return err
		})
		framework.ExpectNoError(err, "failed to update job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring pods are deleted")
		err = e2ejob.WaitForAllJobPodsGone(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to ensure pods are deleted after suspend=true")

		ginkgo.By("Checking Job status to observe Suspended state")
		job, err = e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to retrieve latest job object")
		exists := false
		for _, c := range job.Status.Conditions {
			if c.Type == batchv1.JobSuspended {
				exists = true
				break
			}
		}
		if !exists {
			framework.Failf("Job was expected to be suspended")
		}
	})

	ginkgo.It("should recreate pods only after they have failed if pod replacement policy is set to Failed", func(ctx context.Context) {
		ginkgo.By("Creating a job")
		job := e2ejob.NewTestJob("", "pod-recreate-failed", v1.RestartPolicyNever, 1, 1, nil, 1)
		job.Spec.PodReplacementPolicy = ptr.To(batchv1.Failed)
		job.Spec.Template.Spec.Containers[0].Command = []string{"/bin/sh", "-c", `_term(){
	sleep 5
	exit 143
}
trap _term SIGTERM
while true; do
	sleep 1
done`}
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
		framework.ExpectNoError(err, "failed to wait for job pod to become running in namespace: %s", f.Namespace.Name)

		ginkgo.By("Deleting job pod")
		pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to get pod list for job %s in namespace: %s", job.Name, f.Namespace.Name)

		framework.ExpectNoError(e2epod.DeletePodsWithGracePeriod(ctx, f.ClientSet, pods.Items, 30), "failed to delete pods in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring pod does not get recreated while it is in terminating state")
		err = e2ejob.WaitForJobState(ctx, f.ClientSet, f.Namespace.Name, job.Name, f.Timeouts.PodDelete, func(job *batchv1.Job) string {
			if job.Status.Active == 0 && job.Status.Failed == 0 && *job.Status.Terminating == 1 {
				return ""
			} else {
				return fmt.Sprintf(
					"expected job to have 0 active pods, 0 failed pods and 1 terminating pod, but got %d active pods, %d failed pods and %d terminating pods",
					job.Status.Active,
					job.Status.Failed,
					*job.Status.Terminating,
				)
			}
		})
		framework.ExpectNoError(err, "failed to ensure pod is not recreated while it is in terminating state")

		ginkgo.By("Ensuring pod gets recreated after it has failed")
		err = e2ejob.WaitForJobState(ctx, f.ClientSet, f.Namespace.Name, job.Name, f.Timeouts.PodDelete, func(job *batchv1.Job) string {
			if job.Status.Active == 1 && job.Status.Failed == 1 && *job.Status.Terminating == 0 {
				return ""
			} else {
				return fmt.Sprintf(
					"expected job to have 1 active pod, 1 failed pod and 0 terminating pods, but got %d active pods, %d failed pods and %d terminating pods",
					job.Status.Active,
					job.Status.Failed,
					*job.Status.Terminating,
				)
			}
		})
		framework.ExpectNoError(err, "failed to wait for pod to get recreated")
	})

	/*
		Release: v1.24
		Testname: Ensure Pods of an Indexed Job get a unique index.
		Description: Create an Indexed job. Job MUST complete successfully.
		Ensure that created pods have completion index annotation and environment variable.
	*/
	framework.ConformanceIt("should create pods for an Indexed job with completion indexes and specified hostname", func(ctx context.Context) {
		ginkgo.By("Creating Indexed job")
		job := e2ejob.NewTestJob("succeed", "indexed-job", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		mode := batchv1.IndexedCompletion
		job.Spec.CompletionMode = &mode
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create indexed job in namespace %s", f.Namespace.Name)

		ginkgo.By("Ensuring job reaches completions")
		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring pods with index for job exist")
		pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to get pod list for job in namespace: %s", f.Namespace.Name)
		succeededIndexes := sets.NewInt()
		for _, pod := range pods.Items {
			if pod.Status.Phase == v1.PodSucceeded && pod.Annotations != nil {
				ix, err := strconv.Atoi(pod.Annotations[batchv1.JobCompletionIndexAnnotation])
				framework.ExpectNoError(err, "failed obtaining completion index from pod in namespace: %s", f.Namespace.Name)
				succeededIndexes.Insert(ix)
				expectedName := fmt.Sprintf("%s-%d", job.Name, ix)
				gomega.Expect(pod.Spec.Hostname).To(gomega.Equal(expectedName), "expected completed pod with hostname %s, but got %s", expectedName, pod.Spec.Hostname)
			}
		}
		gotIndexes := succeededIndexes.List()
		wantIndexes := []int{0, 1, 2, 3}
		gomega.Expect(gotIndexes).To(gomega.Equal(wantIndexes), "expected completed indexes %s, but got %s", wantIndexes, gotIndexes)
	})

	/*
		Testcase: Ensure that all indexes are executed for an indexed job with backoffLimitPerIndex despite some failing
		Description: Create an indexed job and ensure that all indexes are either failed or succeeded, depending
		on the end state of the corresponding pods. Pods with odd indexes fail, while pods with even indexes
		succeed. Also, verify that the number of failed pods is double the number of failing indexes, since
		backoffLimitPerIndex=1 allows for one pod recreation before marking that index as failed.
	*/
	ginkgo.It("should execute all indexes despite some failing when using backoffLimitPerIndex", func(ctx context.Context) {
		ginkgo.By("Creating an indexed job with backoffLimit per index and failing pods")
		job := e2ejob.NewTestJob("failOddSucceedEven", "with-backoff-limit-per-index", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		job.Spec.BackoffLimit = nil
		job.Spec.BackoffLimitPerIndex = ptr.To[int32](1)
		mode := batchv1.IndexedCompletion
		job.Spec.CompletionMode = &mode
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Awaiting for the job to fail as there are failed indexes")
		err = e2ejob.WaitForJobFailed(f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to ensure job failure in namespace: %s", f.Namespace.Name)

		ginkgo.By("Verifying the Job status fields to ensure all indexes were executed")
		job, err = e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to retrieve latest job object")
		gomega.Expect(job.Status.FailedIndexes).Should(gomega.HaveValue(gomega.Equal("1,3")))
		gomega.Expect(job.Status.CompletedIndexes).Should(gomega.Equal("0,2"))
		gomega.Expect(job.Status.Failed).Should(gomega.Equal(int32(4)))
		gomega.Expect(job.Status.Succeeded).Should(gomega.Equal(int32(2)))
	})

	/*
		Testcase: Terminate job execution when the maxFailedIndexes is exceeded
		Description: Create an indexed job with backoffLimitPerIndex and maxFailedIndexes.
		Verify the job execution is terminated as soon as the number of failed
		indexes exceeds maxFailedIndexes.
	*/
	ginkgo.It("should terminate job execution when the number of failed indexes exceeds maxFailedIndexes", func(ctx context.Context) {
		// we use parallelism=1 to make sure in the asserts only one pod was created
		parallelism := int32(1)
		ginkgo.By("Creating an indexed job with backoffLimit per index and maxFailedIndexes")
		job := e2ejob.NewTestJob("fail", "with-max-failed-indexes", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		job.Spec.BackoffLimit = nil
		job.Spec.BackoffLimitPerIndex = ptr.To[int32](0)
		job.Spec.MaxFailedIndexes = ptr.To[int32](0)

		mode := batchv1.IndexedCompletion
		job.Spec.CompletionMode = &mode
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Awaiting for the job to fail as the number of max failed indexes is exceeded")
		err = e2ejob.WaitForJobFailed(f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to ensure job failure in namespace: %s", f.Namespace.Name)

		ginkgo.By("Verifying the Job status fields to ensure early termination of the job")
		job, err = e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to retrieve latest job object")
		gomega.Expect(job.Status.FailedIndexes).Should(gomega.HaveValue(gomega.Equal("0")))
		gomega.Expect(job.Status.Failed).Should(gomega.Equal(int32(1)))
	})

	/*
		Testcase: Mark indexes as failed when the FailIndex action is matched in podFailurePolicy
		Description: Create an indexed job with backoffLimitPerIndex, and podFailurePolicy
		with the FailIndex action. Verify the failed pods matching the pod failure policy
		result in marking the corresponding indexes as failed without restarts, despite
		backoffLimitPerIndex > 0.
	*/
	ginkgo.It("should mark indexes as failed when the FailIndex action is matched in podFailurePolicy", func(ctx context.Context) {
		completions := int32(2)

		ginkgo.By("Creating an indexed job with failing pods matching the FailIndex action")
		job := e2ejob.NewTestJob("failOddSucceedEven", "matching-fail-index-action", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		job.Spec.BackoffLimit = nil
		job.Spec.BackoffLimitPerIndex = ptr.To[int32](1)
		job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
			Rules: []batchv1.PodFailurePolicyRule{
				{
					Action: batchv1.PodFailurePolicyActionFailIndex,
					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
						Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
						Values:   []int32{1},
					},
				},
			},
		}
		mode := batchv1.IndexedCompletion
		job.Spec.CompletionMode = &mode
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Awaiting for the job to fail as all indexes are failed")
		err = e2ejob.WaitForJobFailed(f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to ensure job failure in namespace: %s", f.Namespace.Name)

		ginkgo.By("Verifying the Job status fields to ensure the upper indexes didn't execute")
		job, err = e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to retrieve latest job object")
		gomega.Expect(job.Status.FailedIndexes).Should(gomega.HaveValue(gomega.Equal("1")))
		gomega.Expect(job.Status.CompletedIndexes).Should(gomega.Equal("0"))
		gomega.Expect(job.Status.Failed).Should(gomega.Equal(int32(1)))
		gomega.Expect(job.Status.Succeeded).Should(gomega.Equal(int32(1)))
	})

	/*
		Testcase: Ensure that the pods associated with the job are removed once the job is deleted
		Description: Create a job and ensure the associated pod count is equal to the parallelism count. Delete
		the job and ensure that the pods associated with the job have been removed.
	*/
	ginkgo.It("should remove pods when job is deleted", func(ctx context.Context) {
		ginkgo.By("Creating a job")
		job := e2ejob.NewTestJob("notTerminate", "all-pods-removed", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensure pods equal to parallelism count is attached to the job")
		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, parallelism)
		framework.ExpectNoError(err, "failed to ensure number of pods associated with job %s is equal to parallelism count in namespace: %s", job.Name, f.Namespace.Name)

		ginkgo.By("Delete the job")
		err = e2eresource.DeleteResourceAndWaitForGC(ctx, f.ClientSet, batchinternal.Kind("Job"), f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to delete the job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensure the pods associated with the job are also deleted")
		err = e2ejob.WaitForAllJobPodsGone(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to get PodList for job %s in namespace: %s", job.Name, f.Namespace.Name)
	})

	/*
		Release: v1.16
		Testname: Jobs, completion after task failure
		Description: Explicitly cause the tasks to fail once initially. After restarting, the Job MUST
		execute to completion.
	*/
	framework.ConformanceIt("should run a job to completion when tasks sometimes fail and are locally restarted", func(ctx context.Context) {
		ginkgo.By("Creating a job")
		// One failure, then a success, local restarts.
		// We can't use the random failure approach, because kubelet will
		// throttle frequently failing containers in a given pod, ramping
		// up to 5 minutes between restarts, making test timeout due to
		// successive failures too likely with a reasonable test timeout.
		job := e2ejob.NewTestJob("failOnce", "fail-once-local", v1.RestartPolicyOnFailure, parallelism, completions, nil, backoffLimit)
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring job reaches completions")
		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
	})

	// Pods sometimes fail, but eventually succeed, after pod restarts
	ginkgo.It("should run a job to completion when tasks sometimes fail and are not locally restarted", func(ctx context.Context) {
		// One failure, then a success, no local restarts.
		// We can't use the random failure approach, because JobController
		// will throttle frequently failing Pods of a given Job, ramping
		// up to 6 minutes between restarts, making test timeout due to
		// successive failures.
		// Instead, we force the Job's Pods to be scheduled to a single Node
		// and use a hostPath volume to persist data across new Pods.
		ginkgo.By("Looking for a node to schedule job pod")
		node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
		framework.ExpectNoError(err)

		ginkgo.By("Creating a job")
		job := e2ejob.NewTestJobOnNode("failOnce", "fail-once-non-local", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
		job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring job reaches completions")
		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, *job.Spec.Completions)
		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
	})

	ginkgo.It("should fail when exceeds active deadline", func(ctx context.Context) {
		ginkgo.By("Creating a job")
		var activeDeadlineSeconds int64 = 1
		job := e2ejob.NewTestJob("notTerminate", "exceed-active-deadline", v1.RestartPolicyNever, parallelism, completions, &activeDeadlineSeconds, backoffLimit)
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
		ginkgo.By("Ensuring job past active deadline")
		err = waitForJobFailure(ctx, f.ClientSet, f.Namespace.Name, job.Name, time.Duration(activeDeadlineSeconds+15)*time.Second, "DeadlineExceeded")
		framework.ExpectNoError(err, "failed to ensure job past active deadline in namespace: %s", f.Namespace.Name)
	})

	/*
		Release: v1.15
		Testname: Jobs, active pods, graceful termination
		Description: Create a job. Ensure the active pods reflect parallelism in the namespace and delete the job. Job MUST be deleted successfully.
	*/
	framework.ConformanceIt("should delete a job", func(ctx context.Context) {
		ginkgo.By("Creating a job")
		job := e2ejob.NewTestJob("notTerminate", "foo", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring active pods == parallelism")
		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, parallelism)
		framework.ExpectNoError(err, "failed to ensure active pods == parallelism in namespace: %s", f.Namespace.Name)

		ginkgo.By("delete a job")
		framework.ExpectNoError(e2eresource.DeleteResourceAndWaitForGC(ctx, f.ClientSet, batchinternal.Kind("Job"), f.Namespace.Name, job.Name))

		ginkgo.By("Ensuring job was deleted")
		_, err = e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectError(err, "failed to ensure job %s was deleted in namespace: %s", job.Name, f.Namespace.Name)
		if !apierrors.IsNotFound(err) {
			framework.Failf("failed to ensure job %s was deleted in namespace: %s", job.Name, f.Namespace.Name)
		}
	})

	/*
		Release: v1.16
		Testname: Jobs, orphan pods, re-adoption
		Description: Create a parallel job. The number of Pods MUST equal the level of parallelism.
		Orphan a Pod by modifying its owner reference. The Job MUST re-adopt the orphan pod.
		Modify the labels of one of the Job's Pods. The Job MUST release the Pod.
	*/
	framework.ConformanceIt("should adopt matching orphans and release non-matching pods", func(ctx context.Context) {
		ginkgo.By("Creating a job")
		job := e2ejob.NewTestJob("notTerminate", "adopt-release", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		// Replace job with the one returned from Create() so it has the UID.
		// Save Kind since it won't be populated in the returned job.
		kind := job.Kind
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
		job.Kind = kind

		ginkgo.By("Ensuring active pods == parallelism")
		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, parallelism)
		framework.ExpectNoError(err, "failed to ensure active pods == parallelism in namespace: %s", f.Namespace.Name)

		ginkgo.By("Orphaning one of the Job's Pods")
		pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to get PodList for job %s in namespace: %s", job.Name, f.Namespace.Name)
		gomega.Expect(pods.Items).To(gomega.HaveLen(int(parallelism)))
		pod := pods.Items[0]
		e2epod.NewPodClient(f).Update(ctx, pod.Name, func(pod *v1.Pod) {
			pod.OwnerReferences = nil
		})

		ginkgo.By("Checking that the Job readopts the Pod")
		gomega.Expect(e2epod.WaitForPodCondition(ctx, f.ClientSet, pod.Namespace, pod.Name, "adopted", e2ejob.JobTimeout,
			func(pod *v1.Pod) (bool, error) {
				controllerRef := metav1.GetControllerOf(pod)
				if controllerRef == nil {
					return false, nil
				}
				if controllerRef.Kind != job.Kind || controllerRef.Name != job.Name || controllerRef.UID != job.UID {
					return false, fmt.Errorf("pod has wrong controllerRef: got %v, want %v", controllerRef, job)
				}
				return true, nil
			},
		)).To(gomega.Succeed(), "wait for pod %q to be readopted", pod.Name)

		ginkgo.By("Removing the labels from the Job's Pod")
		e2epod.NewPodClient(f).Update(ctx, pod.Name, func(pod *v1.Pod) {
			pod.Labels = nil
		})

		ginkgo.By("Checking that the Job releases the Pod")
		gomega.Expect(e2epod.WaitForPodCondition(ctx, f.ClientSet, pod.Namespace, pod.Name, "released", e2ejob.JobTimeout,
			func(pod *v1.Pod) (bool, error) {
				controllerRef := metav1.GetControllerOf(pod)
				if controllerRef != nil {
					return false, nil
				}
				return true, nil
			},
		)).To(gomega.Succeed(), "wait for pod %q to be released", pod.Name)
	})

	ginkgo.It("should fail to exceed backoffLimit", func(ctx context.Context) {
		ginkgo.By("Creating a job")
		backoff := 1
		job := e2ejob.NewTestJob("fail", "backofflimit", v1.RestartPolicyNever, 1, 1, nil, int32(backoff))
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
		ginkgo.By("Ensuring job exceed backofflimit")

		err = waitForJobFailure(ctx, f.ClientSet, f.Namespace.Name, job.Name, e2ejob.JobTimeout, "BackoffLimitExceeded")
		framework.ExpectNoError(err, "failed to ensure job exceed backofflimit in namespace: %s", f.Namespace.Name)

		ginkgo.By(fmt.Sprintf("Checking that %d pod created and status is failed", backoff+1))
		pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to get PodList for job %s in namespace: %s", job.Name, f.Namespace.Name)
		gomega.Expect(pods.Items).To(gomega.HaveLen(backoff + 1))
		for _, pod := range pods.Items {
			gomega.Expect(pod.Status.Phase).To(gomega.Equal(v1.PodFailed))
		}
	})

	f.It("should run a job to completion with CPU requests", f.WithSerial(), func(ctx context.Context) {
		ginkgo.By("Creating a job with CPU requests")

		testNodeName := scheduling.GetNodeThatCanRunPod(ctx, f)
		targetNode, err := f.ClientSet.CoreV1().Nodes().Get(ctx, testNodeName, metav1.GetOptions{})
		framework.ExpectNoError(err, "unable to get node object for node %v", testNodeName)

		cpu, ok := targetNode.Status.Allocatable[v1.ResourceCPU]
		if !ok {
			framework.Failf("Unable to get node's %q cpu", targetNode.Name)
		}

		cpuRequest := fmt.Sprint(int64(0.2 * float64(cpu.Value())))

		backoff := 0
		ginkgo.By("Creating a job")
		job := e2ejob.NewTestJob("succeed", "all-succeed", v1.RestartPolicyNever, largeParallelism, largeCompletions, nil, int32(backoff))
		for i := range job.Spec.Template.Spec.Containers {
			job.Spec.Template.Spec.Containers[i].Resources = v1.ResourceRequirements{
				Requests: v1.ResourceList{
					v1.ResourceCPU: resource.MustParse(cpuRequest),
				},
			}
			job.Spec.Template.Spec.NodeSelector = map[string]string{"kubernetes.io/hostname": testNodeName}
		}

		framework.Logf("Creating job %q with a node hostname selector %q with cpu request %q", job.Name, testNodeName, cpuRequest)
		job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring job reaches completions")
		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, largeCompletions)
		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring pods for job exist")
		pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to get pod list for job in namespace: %s", f.Namespace.Name)
		successes := int32(0)
		for _, pod := range pods.Items {
			if pod.Status.Phase == v1.PodSucceeded {
				successes++
			}
		}
		gomega.Expect(successes).To(gomega.Equal(largeCompletions), "expected %d successful job pods, but got %d", largeCompletions, successes)
	})

	/*
		Release: v1.24
		Testname: Jobs, apply changes to status
		Description: Attempt to create a running Job which MUST succeed.
		Attempt to patch the Job status to include a new start time which
		MUST succeed. An annotation for the job that was patched MUST be found.
		Attempt to replace the job status with a new start time which MUST
		succeed. Attempt to read its status sub-resource which MUST succeed.
	*/
	framework.ConformanceIt("should apply changes to a job status", func(ctx context.Context) {

		ns := f.Namespace.Name
		jClient := f.ClientSet.BatchV1().Jobs(ns)

		ginkgo.By("Creating a job")
		job := e2ejob.NewTestJob("notTerminate", "suspend-false-to-true", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensure pods equal to parallelism count is attached to the job")
		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, parallelism)
		framework.ExpectNoError(err, "failed to ensure number of pods associated with job %s is equal to parallelism count in namespace: %s", job.Name, f.Namespace.Name)

		// /status subresource operations
		ginkgo.By("patching /status")
		// we need to use RFC3339 version since conversion over the wire cuts nanoseconds
		now1 := metav1.Now().Rfc3339Copy()
		jStatus := batchv1.JobStatus{
			StartTime: &now1,
		}

		jStatusJSON, err := json.Marshal(jStatus)
		framework.ExpectNoError(err)
		patchedStatus, err := jClient.Patch(ctx, job.Name, types.MergePatchType,
			[]byte(`{"metadata":{"annotations":{"patchedstatus":"true"}},"status":`+string(jStatusJSON)+`}`),
			metav1.PatchOptions{}, "status")
		framework.ExpectNoError(err)
		if !patchedStatus.Status.StartTime.Equal(&now1) {
			framework.Failf("patched object should have the applied StartTime %#v, got %#v instead", jStatus.StartTime, patchedStatus.Status.StartTime)
		}
		gomega.Expect(patchedStatus.Annotations).To(gomega.HaveKeyWithValue("patchedstatus", "true"), "patched object should have the applied annotation")

		ginkgo.By("updating /status")
		// we need to use RFC3339 version since conversion over the wire cuts nanoseconds
		now2 := metav1.Now().Rfc3339Copy()
		var statusToUpdate, updatedStatus *batchv1.Job
		err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
			statusToUpdate, err = jClient.Get(ctx, job.Name, metav1.GetOptions{})
			if err != nil {
				return err
			}
			statusToUpdate.Status.StartTime = &now2
			updatedStatus, err = jClient.UpdateStatus(ctx, statusToUpdate, metav1.UpdateOptions{})
			return err
		})
		framework.ExpectNoError(err)
		if !updatedStatus.Status.StartTime.Equal(&now2) {
			framework.Failf("updated object status expected to have updated StartTime %#v, got %#v", statusToUpdate.Status.StartTime, updatedStatus.Status.StartTime)
		}

		ginkgo.By("get /status")
		jResource := schema.GroupVersionResource{Group: "batch", Version: "v1", Resource: "jobs"}
		gottenStatus, err := f.DynamicClient.Resource(jResource).Namespace(ns).Get(ctx, job.Name, metav1.GetOptions{}, "status")
		framework.ExpectNoError(err)
		statusUID, _, err := unstructured.NestedFieldCopy(gottenStatus.Object, "metadata", "uid")
		framework.ExpectNoError(err)
		gomega.Expect(string(job.UID)).To(gomega.Equal(statusUID), fmt.Sprintf("job.UID: %v expected to match statusUID: %v ", job.UID, statusUID))
	})

	/*
		Release: v1.25
		Testname: Jobs, manage lifecycle
		Description: Attempt to create a suspended Job which MUST succeed.
		Attempt to patch the Job to include a new label which MUST succeed.
		The label MUST be found. Attempt to replace the Job to include a
		new annotation which MUST succeed. The annotation MUST be found.
		Attempt to list all namespaces with a label selector which MUST
		succeed. One list MUST be found. It MUST succeed at deleting a
		collection of jobs via a label selector.
	*/
	framework.ConformanceIt("should manage the lifecycle of a job", func(ctx context.Context) {
		jobName := "e2e-" + utilrand.String(5)
		label := map[string]string{"e2e-job-label": jobName}
		labelSelector := labels.SelectorFromSet(label).String()

		ns := f.Namespace.Name
		jobClient := f.ClientSet.BatchV1().Jobs(ns)

		w := &cache.ListWatch{
			WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
				options.LabelSelector = labelSelector
				return jobClient.Watch(ctx, options)
			},
		}
		jobsList, err := jobClient.List(ctx, metav1.ListOptions{LabelSelector: labelSelector})
		framework.ExpectNoError(err, "failed to list Job")

		ginkgo.By("Creating a suspended job")
		job := e2ejob.NewTestJob("succeed", jobName, v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		job.Labels = label
		job.Spec.Suspend = pointer.BoolPtr(true)
		job, err = e2ejob.CreateJob(ctx, f.ClientSet, ns, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", ns)

		ginkgo.By("Patching the Job")
		payload := "{\"metadata\":{\"labels\":{\"" + jobName + "\":\"patched\"}}}"
		patchedJob, err := f.ClientSet.BatchV1().Jobs(ns).Patch(ctx, jobName, types.StrategicMergePatchType, []byte(payload), metav1.PatchOptions{})
		framework.ExpectNoError(err, "failed to patch Job %s in namespace %s", jobName, ns)

		ginkgo.By("Watching for Job to be patched")
		c := watchEventConfig{
			framework:           f,
			resourceVersion:     jobsList.ResourceVersion,
			w:                   w,
			jobName:             jobName,
			watchEvent:          watch.Modified,
			extJob:              patchedJob,
			updatedMetadataType: "label",
			updatedKey:          jobName,
			updatedValue:        "patched",
		}
		waitForJobEvent(ctx, c)
		gomega.Expect(patchedJob.Labels).To(gomega.HaveKeyWithValue(jobName, "patched"), "Did not find job label for this job. Current labels: %v", patchedJob.Labels)

		ginkgo.By("Updating the job")
		var updatedJob *batchv1.Job

		err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
			patchedJob, err = jobClient.Get(ctx, jobName, metav1.GetOptions{})
			framework.ExpectNoError(err, "Unable to get job %s", jobName)
			patchedJob.Spec.Suspend = pointer.BoolPtr(false)
			if patchedJob.Annotations == nil {
				patchedJob.Annotations = map[string]string{}
			}
			patchedJob.Annotations["updated"] = "true"
			updatedJob, err = e2ejob.UpdateJob(ctx, f.ClientSet, ns, patchedJob)
			return err
		})
		framework.ExpectNoError(err, "failed to update job in namespace: %s", ns)

		ginkgo.By("Watching for Job to be updated")
		c = watchEventConfig{
			framework:           f,
			resourceVersion:     patchedJob.ResourceVersion,
			w:                   w,
			jobName:             jobName,
			watchEvent:          watch.Modified,
			extJob:              updatedJob,
			updatedMetadataType: "annotation",
			updatedKey:          "updated",
			updatedValue:        "true",
		}
		waitForJobEvent(ctx, c)
		gomega.Expect(updatedJob.Annotations).To(gomega.HaveKeyWithValue("updated", "true"), "updated Job should have the applied annotation")
		framework.Logf("Found Job annotations: %#v", patchedJob.Annotations)

		ginkgo.By("Listing all Jobs with LabelSelector")
		jobs, err := f.ClientSet.BatchV1().Jobs("").List(ctx, metav1.ListOptions{LabelSelector: labelSelector})
		framework.ExpectNoError(err, "Failed to list job. %v", err)
		gomega.Expect(jobs.Items).To(gomega.HaveLen(1), "Failed to find job %v", jobName)
		testJob := jobs.Items[0]
		framework.Logf("Job: %v has labels: %v", testJob.Name, testJob.Labels)

		ginkgo.By("Waiting for job to complete")
		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, ns, jobName, completions)
		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", ns)

		ginkgo.By("Delete a job collection with a labelselector")
		propagationPolicy := metav1.DeletePropagationBackground
		err = f.ClientSet.BatchV1().Jobs(ns).DeleteCollection(ctx, metav1.DeleteOptions{PropagationPolicy: &propagationPolicy}, metav1.ListOptions{LabelSelector: labelSelector})
		framework.ExpectNoError(err, "failed to delete job %s in namespace: %s", job.Name, ns)

		ginkgo.By("Watching for Job to be deleted")
		c = watchEventConfig{
			framework:           f,
			resourceVersion:     updatedJob.ResourceVersion,
			w:                   w,
			jobName:             jobName,
			watchEvent:          watch.Deleted,
			extJob:              &testJob,
			updatedMetadataType: "label",
			updatedKey:          "e2e-job-label",
			updatedValue:        jobName,
		}
		waitForJobEvent(ctx, c)

		ginkgo.By("Relist jobs to confirm deletion")
		jobs, err = f.ClientSet.BatchV1().Jobs("").List(ctx, metav1.ListOptions{LabelSelector: labelSelector})
		framework.ExpectNoError(err, "Failed to list job. %v", err)
		gomega.Expect(jobs.Items).To(gomega.BeEmpty(), "Found job %v", jobName)
	})

	ginkgo.It("should update the status ready field", func(ctx context.Context) {
		ginkgo.By("Creating a job with suspend=true")
		job := e2ejob.NewTestJob("notTerminate", "all-ready", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		job.Spec.Suspend = ptr.To[bool](true)
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensure the job controller updates the status.ready field")
		err = e2ejob.WaitForJobReady(ctx, f.ClientSet, f.Namespace.Name, job.Name, ptr.To[int32](0))
		framework.ExpectNoError(err, "failed to ensure job status ready field in namespace: %s", f.Namespace.Name)

		ginkgo.By("Updating the job with suspend=false")
		err = updateJobSuspendWithRetries(ctx, f, job, ptr.To[bool](false))
		framework.ExpectNoError(err, "failed to update job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensure the job controller updates the status.ready field")
		err = e2ejob.WaitForJobReady(ctx, f.ClientSet, f.Namespace.Name, job.Name, &parallelism)
		framework.ExpectNoError(err, "failed to ensure job status ready field in namespace: %s", f.Namespace.Name)

		ginkgo.By("Updating the job with suspend=true")
		err = updateJobSuspendWithRetries(ctx, f, job, ptr.To[bool](true))
		framework.ExpectNoError(err, "failed to update job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensure the job controller updates the status.ready field")
		err = e2ejob.WaitForJobReady(ctx, f.ClientSet, f.Namespace.Name, job.Name, ptr.To[int32](0))
		framework.ExpectNoError(err, "failed to ensure job status ready field in namespace: %s", f.Namespace.Name)
	})
})

// updateJobSuspendWithRetries sets the job's spec.suspend field to the given
// value, retrying on conflict errors.
func updateJobSuspendWithRetries(ctx context.Context, f *framework.Framework, job *batchv1.Job, suspend *bool) error {
	return retry.RetryOnConflict(retry.DefaultRetry, func() error {
		job, err := e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "unable to get job %s in namespace %s", job.Name, f.Namespace.Name)
		job.Spec.Suspend = suspend
		_, err = e2ejob.UpdateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		return err
	})
}

// waitForJobEvent is used to track and log Job events.
// As delivery of events is not actually guaranteed we
// will not return an error if we miss the required event.
func waitForJobEvent(ctx context.Context, config watchEventConfig) {
	f := config.framework
	ctx, cancel := context.WithTimeout(ctx, f.Timeouts.PodStartShort)
	defer cancel()
	_, err := watchtools.Until(ctx, config.resourceVersion, config.w, func(event watch.Event) (bool, error) {
		if job, ok := event.Object.(*batchv1.Job); ok {

			var key string
			switch config.updatedMetadataType {
			case "annotation":
				key = job.Annotations[config.updatedKey]
			case "label":
				key = job.Labels[config.updatedKey]
			}

			found := job.ObjectMeta.Name == config.extJob.ObjectMeta.Name &&
				job.ObjectMeta.Namespace == f.Namespace.Name &&
				key == config.updatedValue &&
				event.Type == config.watchEvent
			if !found {
				framework.Logf("Event %v observed for Job %v in namespace %v with labels: %v and annotations: %v", event.Type, job.ObjectMeta.Name, job.ObjectMeta.Namespace, job.Labels, job.Annotations)
				return false, nil
			}
			framework.Logf("Event %v found for Job %v in namespace %v with labels: %v and annotations: %v", event.Type, job.ObjectMeta.Name, job.ObjectMeta.Namespace, job.Labels, job.Annotations)
			return found, nil
		}
		framework.Logf("Observed event: %+v", event.Object)
		return false, nil
	})
	if err != nil {
		j, _ := f.ClientSet.BatchV1().Jobs(f.Namespace.Name).Get(ctx, config.jobName, metav1.GetOptions{})
		framework.Logf("We missed the %v event. Job details: %+v", config.watchEvent, j)
	}
}

// waitForJobFailure uses c to wait for up to timeout for the Job named jobName in namespace ns to fail.
func waitForJobFailure(ctx context.Context, c clientset.Interface, ns, jobName string, timeout time.Duration, reason string) error {
	return wait.Poll(framework.Poll, timeout, func() (bool, error) {
		curr, err := c.BatchV1().Jobs(ns).Get(ctx, jobName, metav1.GetOptions{})
		if err != nil {
			return false, err
		}
		for _, c := range curr.Status.Conditions {
			if c.Type == batchv1.JobFailed && c.Status == v1.ConditionTrue {
				if reason == "" || reason == c.Reason {
					return true, nil
				}
			}
		}
		return false, nil
	})
}