k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/test/e2e/apps/job.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package apps

import (
	"context"
	"encoding/json"
	"fmt"
	"strconv"
	"time"

	batchv1 "k8s.io/api/batch/v1"
	v1 "k8s.io/api/core/v1"
	policyv1 "k8s.io/api/policy/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/types"
	utilrand "k8s.io/apimachinery/pkg/util/rand"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/apimachinery/pkg/watch"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/cache"
	watchtools "k8s.io/client-go/tools/watch"
	"k8s.io/client-go/util/retry"
	batchinternal "k8s.io/kubernetes/pkg/apis/batch"
	"k8s.io/kubernetes/test/e2e/framework"
	e2ejob "k8s.io/kubernetes/test/e2e/framework/job"
	e2enode "k8s.io/kubernetes/test/e2e/framework/node"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	e2eresource "k8s.io/kubernetes/test/e2e/framework/resource"
	"k8s.io/kubernetes/test/e2e/scheduling"
	admissionapi "k8s.io/pod-security-admission/api"
	"k8s.io/utils/pointer"
	"k8s.io/utils/ptr"

	"github.com/onsi/ginkgo/v2"
	"github.com/onsi/gomega"
)

type watchEventConfig struct {
	framework           *framework.Framework
	resourceVersion     string
	w                   *cache.ListWatch
	jobName             string
	watchEvent          watch.EventType
	extJob              *batchv1.Job
	updatedMetadataType string
	updatedKey          string
	updatedValue        string
}

var _ = SIGDescribe("Job", func() {
	f := framework.NewDefaultFramework("job")
	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged

	// Simplest case: N pods succeed
	ginkgo.It("should run a job to completion when tasks succeed", func(ctx context.Context) {
		parallelism := int32(2)
		completions := int32(4)
		backoffLimit := int32(6) // default value

		ginkgo.By("Creating a job")
		job := e2ejob.NewTestJob("succeed", "all-succeed", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring job reaches completions")
		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring pods for job exist")
		pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to get pod list for job in namespace: %s", f.Namespace.Name)
		successes := int32(0)
		for _, pod := range pods.Items {
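			// Count only the pods that reached the Succeeded phase; this tally must equal spec.completions below.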
			if pod.Status.Phase == v1.PodSucceeded {
				successes++
			}
		}
		gomega.Expect(successes).To(gomega.Equal(completions), "expected %d successful job pods, but got %d", completions, successes)
	})

	ginkgo.It("should allow to use the pod failure policy on exit code to fail the job early", func(ctx context.Context) {

		// We fail the Job's pod only once to ensure the backoffLimit is not
		// reached and thus the job is failed due to the pod failure policy
		// with FailJob action.
		// In order to ensure a Job's pod fails once before succeeding we force
		// the Job's Pods to be scheduled to a single Node and use a hostPath
		// volume to persist data across new Pods.
		ginkgo.By("Looking for a node to schedule job pod")
		node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
		framework.ExpectNoError(err)

		parallelism := int32(2)
		completions := int32(4)
		backoffLimit := int32(6) // default value
		ginkgo.By("Creating a job")
		job := e2ejob.NewTestJobOnNode("failOnce", "pod-failure-failjob", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
		job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
			Rules: []batchv1.PodFailurePolicyRule{
				{
					Action: batchv1.PodFailurePolicyActionFailJob,
					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
						Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
						Values:   []int32{1},
					},
				},
			},
		}
		job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring job fails")
		err = e2ejob.WaitForJobFailed(f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to ensure job failure in namespace: %s", f.Namespace.Name)
	})

	ginkgo.It("should allow to use the pod failure policy to not count the failure towards the backoffLimit", func(ctx context.Context) {

		// We set the backoffLimit to 0 so that any pod failure would trigger
		// job failure if not for the pod failure policy to ignore the failed
		// pods from counting them towards the backoffLimit. Also, we fail the
		// pod only once so that the job eventually succeeds.
		// In order to ensure a Job's pod fails once before succeeding we force
		// the Job's Pods to be scheduled to a single Node and use a hostPath
		// volume to persist data across new Pods.
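		// With backoffLimit=0 below, the Ignore rule in the pod failure policy is the only
		// thing that keeps the single expected pod failure from failing the whole Job.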
		parallelism := int32(2)
		completions := int32(4)
		backoffLimit := int32(0)

		ginkgo.By("Looking for a node to schedule job pod")
		node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
		framework.ExpectNoError(err)

		ginkgo.By("Creating a job")
		job := e2ejob.NewTestJobOnNode("failOnce", "pod-failure-ignore", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
		job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
			Rules: []batchv1.PodFailurePolicyRule{
				{
					Action: batchv1.PodFailurePolicyActionIgnore,
					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
						Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
						Values:   []int32{1},
					},
				},
			},
		}
		job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring job reaches completions")
		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
	})

	// This test uses an indexed job. The pod corresponding to the 0th index
	// creates a marker file on the host and runs 'forever' until evicted. We use
	// the non-0-indexed pods to determine if the marker file is already
	// created by the 0th indexed pod - the non-0-indexed pods fail and restart
	// until the marker file is created (their potential failures are ignored
	// based on the exit code). Once the marker file is created the 0th indexed
	// pod is evicted (DisruptionTarget condition is added in the process);
	// after restart, it runs to successful completion.
	// Steps:
	// 1. Select a node to run all Job's pods to ensure the host marker file is accessible by all pods
	// 2. Create the indexed job
	// 3. Wait for all non-0-indexed pods to succeed to ensure the marker file is created by the 0-indexed pod
	// 4. Make sure the 0-indexed pod is running
	// 5. Evict the 0-indexed pod
	// 6. Wait for the job to successfully complete
	ginkgo.DescribeTable("Using a pod failure policy to not count some failures towards the backoffLimit",
		func(ctx context.Context, policy *batchv1.PodFailurePolicy) {
			mode := batchv1.IndexedCompletion

			// We set the backoffLimit to 0 so that any pod failure would trigger
			// job failure if not for the pod failure policy to ignore the failed
			// pods from counting them towards the backoffLimit.
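			// The table entries below exercise two different Ignore rules: one matching the
			// DisruptionTarget pod condition, the other matching exit code 137.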
			parallelism := int32(2)
			completions := int32(4)
			backoffLimit := int32(0)

			ginkgo.By("Looking for a node to schedule job pods")
			node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
			framework.ExpectNoError(err)

			ginkgo.By("Creating a job")
			job := e2ejob.NewTestJobOnNode("notTerminateOnce", "pod-disruption-failure-ignore", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
			job.Spec.CompletionMode = &mode
			job.Spec.PodFailurePolicy = policy
			job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
			framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

			ginkgo.By("Awaiting for all non 0-indexed pods to succeed to ensure the marker file is created")
			err = e2ejob.WaitForJobPodsSucceeded(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions-1)
			framework.ExpectNoError(err, "failed to await for all non 0-indexed pods to succeed for job: %s/%s", job.Name, job.Namespace)

			ginkgo.By("Awaiting for the 0-indexed pod to be running")
			err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
			framework.ExpectNoError(err, "failed to await for the 0-indexed pod to be running for the job: %s/%s", job.Name, job.Namespace)

			pods, err := e2ejob.GetAllRunningJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
			framework.ExpectNoError(err, "failed to get running pods for the job: %s/%s", job.Name, job.Namespace)
			gomega.Expect(pods).To(gomega.HaveLen(1), "Exactly one running pod is expected")
			pod := pods[0]
			ginkgo.By(fmt.Sprintf("Evicting the running pod: %s/%s", pod.Name, pod.Namespace))
			evictTarget := &policyv1.Eviction{
				ObjectMeta: metav1.ObjectMeta{
					Name:      pod.Name,
					Namespace: pod.Namespace,
				},
			}
			err = f.ClientSet.CoreV1().Pods(pod.Namespace).EvictV1(ctx, evictTarget)
			framework.ExpectNoError(err, "failed to evict the pod: %s/%s", pod.Name, pod.Namespace)

			ginkgo.By(fmt.Sprintf("Awaiting for the pod: %s/%s to be deleted", pod.Name, pod.Namespace))
			err = e2epod.WaitForPodNotFoundInNamespace(ctx, f.ClientSet, pod.Name, pod.Namespace, f.Timeouts.PodDelete)
			framework.ExpectNoError(err, "failed to await for the pod to be deleted: %s/%s", pod.Name, pod.Namespace)

			ginkgo.By("Ensuring job reaches completions")
			err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
			framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
		},
		ginkgo.Entry("Ignore DisruptionTarget condition", &batchv1.PodFailurePolicy{
			Rules: []batchv1.PodFailurePolicyRule{
				{
					// Ignore failures of the non 0-indexed pods which fail until the marker file is created
					Action: batchv1.PodFailurePolicyActionIgnore,
					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
						Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
						Values:   []int32{1},
					},
				},
				{
					// Ignore the pod failure caused by the eviction
					Action: batchv1.PodFailurePolicyActionIgnore,
					OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
						{
							Type:   v1.DisruptionTarget,
							Status: v1.ConditionTrue,
						},
					},
				},
			},
		}),
		ginkgo.Entry("Ignore exit code 137", &batchv1.PodFailurePolicy{
			Rules: []batchv1.PodFailurePolicyRule{
				{
					// Ignore failures of the non 0-indexed pods which fail until the marker file is created
					// And the 137 in the 0-indexed pod due to eviction.
					Action: batchv1.PodFailurePolicyActionIgnore,
					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
						Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
						Values:   []int32{1, 137},
					},
				},
			},
		}),
	)

	ginkgo.It("should not create pods when created in suspend state", func(ctx context.Context) {
		parallelism := int32(2)
		completions := int32(4)
		backoffLimit := int32(6) // default value

		ginkgo.By("Creating a job with suspend=true")
		job := e2ejob.NewTestJob("succeed", "suspend-true-to-false", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		job.Spec.Suspend = pointer.BoolPtr(true)
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Checking Job status to observe Suspended state")
		err = e2ejob.WaitForJobSuspend(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to observe suspend state: %s", f.Namespace.Name)

		ginkgo.By("Ensuring pods aren't created for job")
		pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to list pod for a given job %s in namespace %s", job.Name, f.Namespace.Name)
		gomega.Expect(pods.Items).To(gomega.BeEmpty())

		ginkgo.By("Updating the job with suspend=false")
		job, err = f.ClientSet.BatchV1().Jobs(f.Namespace.Name).Get(ctx, job.Name, metav1.GetOptions{})
		framework.ExpectNoError(err, "failed to get job in namespace: %s", f.Namespace.Name)
		job.Spec.Suspend = pointer.BoolPtr(false)
		job, err = e2ejob.UpdateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to update job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Waiting for job to complete")
		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
	})

	ginkgo.It("should delete pods when suspended", func(ctx context.Context) {
		parallelism := int32(2)
		completions := int32(4)
		backoffLimit := int32(6) // default value

		ginkgo.By("Creating a job with suspend=false")
		job := e2ejob.NewTestJob("notTerminate", "suspend-false-to-true", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		job.Spec.Suspend = pointer.Bool(false)
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensure pods equal to parallelism count is attached to the job")
		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, parallelism)
		framework.ExpectNoError(err, "failed to ensure number of pods associated with job %s is equal to parallelism count in namespace: %s", job.Name, f.Namespace.Name)

		ginkgo.By("Updating the job with suspend=true")
		err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
			job, err = e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, job.Name)
			framework.ExpectNoError(err, "unable to get job %s in namespace %s", job.Name, f.Namespace.Name)
			job.Spec.Suspend = pointer.Bool(true)
			updatedJob, err := e2ejob.UpdateJob(ctx, f.ClientSet, f.Namespace.Name, job)
			if err == nil {
				job = updatedJob
			}
			return err
		})
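		// RetryOnConflict re-reads the Job and reapplies the change on resourceVersion
		// conflicts; any error left over after the retries is fatal for the test.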
		framework.ExpectNoError(err, "failed to update job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring pods are deleted")
		err = e2ejob.WaitForAllJobPodsGone(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to ensure pods are deleted after suspend=true")

		ginkgo.By("Checking Job status to observe Suspended state")
		job, err = e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to retrieve latest job object")
		exists := false
		for _, c := range job.Status.Conditions {
			if c.Type == batchv1.JobSuspended {
				exists = true
				break
			}
		}
		if !exists {
			framework.Failf("Job was expected to be suspended")
		}
	})

	ginkgo.It("should recreate pods only after they have failed if pod replacement policy is set to Failed", func(ctx context.Context) {
		ginkgo.By("Creating a job")
		job := e2ejob.NewTestJob("", "pod-recreate-failed", v1.RestartPolicyNever, 1, 1, nil, 1)
		job.Spec.PodReplacementPolicy = ptr.To(batchv1.Failed)
		job.Spec.Template.Spec.Containers[0].Command = []string{"/bin/sh", "-c", `_term(){
	sleep 5
	exit 143
}
trap _term SIGTERM
while true; do
	sleep 1
done`}
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, 1)
		framework.ExpectNoError(err, "failed to wait for job pod to become running in namespace: %s", f.Namespace.Name)

		ginkgo.By("Deleting job pod")
		pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to get pod list for job %s in namespace: %s", job.Name, f.Namespace.Name)

		framework.ExpectNoError(e2epod.DeletePodsWithGracePeriod(ctx, f.ClientSet, pods.Items, 30), "failed to delete pods in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring pod does not get recreated while it is in terminating state")
		err = e2ejob.WaitForJobState(ctx, f.ClientSet, f.Namespace.Name, job.Name, f.Timeouts.PodDelete, func(job *batchv1.Job) string {
			if job.Status.Active == 0 && job.Status.Failed == 0 && *job.Status.Terminating == 1 {
				return ""
			} else {
				return fmt.Sprintf(
					"expected job to have 0 active pods, 0 failed pods and 1 terminating pod, but got %d active pods, %d failed pods and %d terminating pods",
					job.Status.Active,
					job.Status.Failed,
					*job.Status.Terminating,
				)
			}
		})
		framework.ExpectNoError(err, "failed to ensure pod is not recreated while it is in terminating state")

		ginkgo.By("Ensuring pod gets recreated after it has failed")
		err = e2ejob.WaitForJobState(ctx, f.ClientSet, f.Namespace.Name, job.Name, f.Timeouts.PodDelete, func(job *batchv1.Job) string {
			if job.Status.Active == 1 && job.Status.Failed == 1 && *job.Status.Terminating == 0 {
				return ""
			} else {
				return fmt.Sprintf(
					"expected job to have 1 active pod, 1 failed pod and 0 terminating pods, but got %d active pods, %d failed pods and %d terminating pods",
					job.Status.Active,
					job.Status.Failed,
					*job.Status.Terminating,
				)
			}
		})
		framework.ExpectNoError(err, "failed to wait for pod to get recreated")
	})

	/*
		Release: v1.24
		Testname: Ensure Pods of an Indexed Job get a unique index.
		Description: Create an Indexed job. Job MUST complete successfully.
		Ensure that created pods have completion index annotation and environment variable.
	*/
	framework.ConformanceIt("should create pods for an Indexed job with completion indexes and specified hostname", func(ctx context.Context) {
		parallelism := int32(2)
		completions := int32(4)
		backoffLimit := int32(6) // default value

		ginkgo.By("Creating Indexed job")
		job := e2ejob.NewTestJob("succeed", "indexed-job", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		mode := batchv1.IndexedCompletion
		job.Spec.CompletionMode = &mode
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create indexed job in namespace %s", f.Namespace.Name)

		ginkgo.By("Ensuring job reaches completions")
		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring pods with index for job exist")
		pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to get pod list for job in namespace: %s", f.Namespace.Name)
		succeededIndexes := sets.NewInt()
		for _, pod := range pods.Items {
			if pod.Status.Phase == v1.PodSucceeded && pod.Annotations != nil {
				ix, err := strconv.Atoi(pod.Annotations[batchv1.JobCompletionIndexAnnotation])
				framework.ExpectNoError(err, "failed obtaining completion index from pod in namespace: %s", f.Namespace.Name)
				succeededIndexes.Insert(ix)
				expectedName := fmt.Sprintf("%s-%d", job.Name, ix)
				gomega.Expect(pod.Spec.Hostname).To(gomega.Equal(expectedName), "expected completed pod with hostname %s, but got %s", expectedName, pod.Spec.Hostname)
			}
		}
		gotIndexes := succeededIndexes.List()
		wantIndexes := []int{0, 1, 2, 3}
		gomega.Expect(gotIndexes).To(gomega.Equal(wantIndexes), "expected completed indexes %s, but got %s", wantIndexes, gotIndexes)
	})

	/*
		Testcase: Ensure that all indexes are executed for an indexed job with backoffLimitPerIndex despite some failing
		Description: Create an indexed job and ensure that all indexes are either failed or succeeded, depending
		on the end state of the corresponding pods. Pods with odd indexes fail, while the pods with even indexes
		succeed. Also, verify that the number of failed pods is double the number of failing indexes, since
		backoffLimitPerIndex=1 allows for one pod recreation before marking the index as failed.
	*/
	ginkgo.It("should execute all indexes despite some failing when using backoffLimitPerIndex", func(ctx context.Context) {
		parallelism := int32(2)
		completions := int32(4)
		backoffLimit := int32(6) // default value

		ginkgo.By("Creating an indexed job with backoffLimit per index and failing pods")
		job := e2ejob.NewTestJob("failOddSucceedEven", "with-backoff-limit-per-index", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		job.Spec.BackoffLimit = nil
		job.Spec.BackoffLimitPerIndex = ptr.To[int32](1)
		mode := batchv1.IndexedCompletion
		job.Spec.CompletionMode = &mode
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Awaiting for the job to fail as there are failed indexes")
		err = e2ejob.WaitForJobFailed(f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to ensure job failure in namespace: %s", f.Namespace.Name)

		ginkgo.By("Verifying the Job status fields to ensure all indexes were executed")
		job, err = e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to retrieve latest job object")
		gomega.Expect(job.Status.FailedIndexes).Should(gomega.HaveValue(gomega.Equal("1,3")))
		gomega.Expect(job.Status.CompletedIndexes).Should(gomega.Equal("0,2"))
		gomega.Expect(job.Status.Failed).Should(gomega.Equal(int32(4)))
		gomega.Expect(job.Status.Succeeded).Should(gomega.Equal(int32(2)))
	})

	/*
		Testcase: Terminate job execution when the maxFailedIndexes is exceeded
		Description: Create an indexed job with backoffLimitPerIndex and maxFailedIndexes.
		Verify the job execution is terminated as soon as the number of failed
		indexes exceeds maxFailedIndexes.
	*/
	ginkgo.It("should terminate job execution when the number of failed indexes exceeds maxFailedIndexes", func(ctx context.Context) {
		// We use parallelism=1 to make sure only one pod is created, so the asserts below are deterministic.
		parallelism := int32(1)
		completions := int32(4)
		backoffLimit := int32(6) // default value

		ginkgo.By("Creating an indexed job with backoffLimit per index and maxFailedIndexes")
		job := e2ejob.NewTestJob("fail", "with-max-failed-indexes", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		job.Spec.BackoffLimit = nil
		job.Spec.BackoffLimitPerIndex = ptr.To[int32](0)
		job.Spec.MaxFailedIndexes = ptr.To[int32](0)

		mode := batchv1.IndexedCompletion
		job.Spec.CompletionMode = &mode
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Awaiting for the job to fail as the number of max failed indexes is exceeded")
		err = e2ejob.WaitForJobFailed(f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to ensure job failure in namespace: %s", f.Namespace.Name)

		ginkgo.By("Verifying the Job status fields to ensure early termination of the job")
		job, err = e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to retrieve latest job object")
		gomega.Expect(job.Status.FailedIndexes).Should(gomega.HaveValue(gomega.Equal("0")))
		gomega.Expect(job.Status.Failed).Should(gomega.Equal(int32(1)))
	})

	/*
		Testcase: Mark indexes as failed when the FailIndex action is matched in podFailurePolicy
		Description: Create an indexed job with backoffLimitPerIndex, and podFailurePolicy
		with the FailIndex action. Verify the failed pods matching the pod failure policy
		result in marking the corresponding indexes as failed without restarts, despite
		backoffLimitPerIndex > 0.
	*/
	ginkgo.It("should mark indexes as failed when the FailIndex action is matched in podFailurePolicy", func(ctx context.Context) {
		parallelism := int32(2)
		completions := int32(2)
		backoffLimit := int32(6) // default value

		ginkgo.By("Creating an indexed job with failing pods matching the FailIndex action")
		job := e2ejob.NewTestJob("failOddSucceedEven", "matching-fail-index-action", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		job.Spec.BackoffLimit = nil
		job.Spec.BackoffLimitPerIndex = ptr.To[int32](1)
		job.Spec.PodFailurePolicy = &batchv1.PodFailurePolicy{
			Rules: []batchv1.PodFailurePolicyRule{
				{
					Action: batchv1.PodFailurePolicyActionFailIndex,
					OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
						Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
						Values:   []int32{1},
					},
				},
			},
		}
		mode := batchv1.IndexedCompletion
		job.Spec.CompletionMode = &mode
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Awaiting for the job to fail as there is a failed index")
		err = e2ejob.WaitForJobFailed(f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to ensure job failure in namespace: %s", f.Namespace.Name)

		ginkgo.By("Verifying the Job status fields to ensure the failed index wasn't retried")
		job, err = e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to retrieve latest job object")
		gomega.Expect(job.Status.FailedIndexes).Should(gomega.HaveValue(gomega.Equal("1")))
		gomega.Expect(job.Status.CompletedIndexes).Should(gomega.Equal("0"))
		gomega.Expect(job.Status.Failed).Should(gomega.Equal(int32(1)))
		gomega.Expect(job.Status.Succeeded).Should(gomega.Equal(int32(1)))
	})

	/*
		Testcase: Ensure that the pods associated with the job are removed once the job is deleted
		Description: Create a job and ensure the associated pod count is equal to parallelism count.
Delete the 570 job and ensure if the pods associated with the job have been removed 571 */ 572 ginkgo.It("should remove pods when job is deleted", func(ctx context.Context) { 573 parallelism := int32(2) 574 completions := int32(4) 575 backoffLimit := int32(6) // default value 576 577 ginkgo.By("Creating a job") 578 job := e2ejob.NewTestJob("notTerminate", "all-pods-removed", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit) 579 job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job) 580 framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name) 581 582 ginkgo.By("Ensure pods equal to parallelism count is attached to the job") 583 err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, parallelism) 584 framework.ExpectNoError(err, "failed to ensure number of pods associated with job %s is equal to parallelism count in namespace: %s", job.Name, f.Namespace.Name) 585 586 ginkgo.By("Delete the job") 587 err = e2eresource.DeleteResourceAndWaitForGC(ctx, f.ClientSet, batchinternal.Kind("Job"), f.Namespace.Name, job.Name) 588 framework.ExpectNoError(err, "failed to delete the job in namespace: %s", f.Namespace.Name) 589 590 ginkgo.By("Ensure the pods associated with the job are also deleted") 591 err = e2ejob.WaitForAllJobPodsGone(ctx, f.ClientSet, f.Namespace.Name, job.Name) 592 framework.ExpectNoError(err, "failed to get PodList for job %s in namespace: %s", job.Name, f.Namespace.Name) 593 }) 594 595 /* 596 Release: v1.16 597 Testname: Jobs, completion after task failure 598 Description: Explicitly cause the tasks to fail once initially. After restarting, the Job MUST 599 execute to completion. 600 */ 601 framework.ConformanceIt("should run a job to completion when tasks sometimes fail and are locally restarted", func(ctx context.Context) { 602 parallelism := int32(2) 603 completions := int32(4) 604 backoffLimit := int32(6) // default value 605 606 ginkgo.By("Creating a job") 607 // One failure, then a success, local restarts. 608 // We can't use the random failure approach, because kubelet will 609 // throttle frequently failing containers in a given pod, ramping 610 // up to 5 minutes between restarts, making test timeout due to 611 // successive failures too likely with a reasonable test timeout. 612 job := e2ejob.NewTestJob("failOnce", "fail-once-local", v1.RestartPolicyOnFailure, parallelism, completions, nil, backoffLimit) 613 job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job) 614 framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name) 615 616 ginkgo.By("Ensuring job reaches completions") 617 err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions) 618 framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name) 619 }) 620 621 // Pods sometimes fail, but eventually succeed, after pod restarts 622 ginkgo.It("should run a job to completion when tasks sometimes fail and are not locally restarted", func(ctx context.Context) { 623 // One failure, then a success, no local restarts. 624 // We can't use the random failure approach, because JobController 625 // will throttle frequently failing Pods of a given Job, ramping 626 // up to 6 minutes between restarts, making test timeout due to 627 // successive failures. 628 // Instead, we force the Job's Pods to be scheduled to a single Node 629 // and use a hostPath volume to persist data across new Pods. 
		ginkgo.By("Looking for a node to schedule job pod")
		node, err := e2enode.GetRandomReadySchedulableNode(ctx, f.ClientSet)
		framework.ExpectNoError(err)

		parallelism := int32(2)
		completions := int32(4)
		backoffLimit := int32(6) // default value

		ginkgo.By("Creating a job")
		job := e2ejob.NewTestJobOnNode("failOnce", "fail-once-non-local", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit, node.Name)
		job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring job reaches completions")
		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, *job.Spec.Completions)
		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)
	})

	ginkgo.It("should fail when exceeds active deadline", func(ctx context.Context) {
		activeDeadlineSeconds := int64(1)
		parallelism := int32(2)
		completions := int32(4)
		backoffLimit := int32(6) // default value

		ginkgo.By("Creating a job")
		job := e2ejob.NewTestJob("notTerminate", "exceed-active-deadline", v1.RestartPolicyNever, parallelism, completions, &activeDeadlineSeconds, backoffLimit)
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
		ginkgo.By("Ensuring job past active deadline")
		err = waitForJobFailure(ctx, f.ClientSet, f.Namespace.Name, job.Name, time.Duration(activeDeadlineSeconds+15)*time.Second, "DeadlineExceeded")
		framework.ExpectNoError(err, "failed to ensure job past active deadline in namespace: %s", f.Namespace.Name)
	})

	/*
		Release: v1.15
		Testname: Jobs, active pods, graceful termination
		Description: Create a job. Ensure the active pods reflect parallelism in the namespace and delete the job. Job MUST be deleted successfully.
	*/
	framework.ConformanceIt("should delete a job", func(ctx context.Context) {
		parallelism := int32(2)
		completions := int32(4)
		backoffLimit := int32(6) // default value

		ginkgo.By("Creating a job")
		job := e2ejob.NewTestJob("notTerminate", "foo", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring active pods == parallelism")
		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, parallelism)
		framework.ExpectNoError(err, "failed to ensure active pods == parallelism in namespace: %s", f.Namespace.Name)

		ginkgo.By("delete a job")
		framework.ExpectNoError(e2eresource.DeleteResourceAndWaitForGC(ctx, f.ClientSet, batchinternal.Kind("Job"), f.Namespace.Name, job.Name))

		ginkgo.By("Ensuring job was deleted")
		_, err = e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		gomega.Expect(err).To(gomega.MatchError(apierrors.IsNotFound, fmt.Sprintf("failed to ensure job %s was deleted in namespace: %s", job.Name, f.Namespace.Name)))
	})

	/*
		Release: v1.16
		Testname: Jobs, orphan pods, re-adoption
		Description: Create a parallel job. The number of Pods MUST equal the level of parallelism.
		Orphan a Pod by modifying its owner reference. The Job MUST re-adopt the orphan pod.
		Modify the labels of one of the Job's Pods. The Job MUST release the Pod.
	*/
	framework.ConformanceIt("should adopt matching orphans and release non-matching pods", func(ctx context.Context) {
		parallelism := int32(2)
		completions := int32(4)
		backoffLimit := int32(6) // default value

		ginkgo.By("Creating a job")
		job := e2ejob.NewTestJob("notTerminate", "adopt-release", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		// Replace job with the one returned from Create() so it has the UID.
		// Save Kind since it won't be populated in the returned job.
		kind := job.Kind
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
		job.Kind = kind

		ginkgo.By("Ensuring active pods == parallelism")
		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, parallelism)
		framework.ExpectNoError(err, "failed to ensure active pods == parallelism in namespace: %s", f.Namespace.Name)

		ginkgo.By("Orphaning one of the Job's Pods")
		pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to get PodList for job %s in namespace: %s", job.Name, f.Namespace.Name)
		gomega.Expect(pods.Items).To(gomega.HaveLen(int(parallelism)))
		pod := pods.Items[0]
		e2epod.NewPodClient(f).Update(ctx, pod.Name, func(pod *v1.Pod) {
			pod.OwnerReferences = nil
		})

		ginkgo.By("Checking that the Job readopts the Pod")
		gomega.Expect(e2epod.WaitForPodCondition(ctx, f.ClientSet, pod.Namespace, pod.Name, "adopted", e2ejob.JobTimeout,
			func(pod *v1.Pod) (bool, error) {
				controllerRef := metav1.GetControllerOf(pod)
				if controllerRef == nil {
					return false, nil
				}
				if controllerRef.Kind != job.Kind || controllerRef.Name != job.Name || controllerRef.UID != job.UID {
					return false, fmt.Errorf("pod has wrong controllerRef: got %v, want %v", controllerRef, job)
				}
				return true, nil
			},
		)).To(gomega.Succeed(), "wait for pod %q to be readopted", pod.Name)

		ginkgo.By("Removing the labels from the Job's Pod")
		e2epod.NewPodClient(f).Update(ctx, pod.Name, func(pod *v1.Pod) {
			pod.Labels = nil
		})

		ginkgo.By("Checking that the Job releases the Pod")
		gomega.Expect(e2epod.WaitForPodCondition(ctx, f.ClientSet, pod.Namespace, pod.Name, "released", e2ejob.JobTimeout,
			func(pod *v1.Pod) (bool, error) {
				controllerRef := metav1.GetControllerOf(pod)
				if controllerRef != nil {
					return false, nil
				}
				return true, nil
			},
		)).To(gomega.Succeed(), "wait for pod %q to be released", pod.Name)
	})

	ginkgo.It("should fail to exceed backoffLimit", func(ctx context.Context) {
		ginkgo.By("Creating a job")
		backoff := 1
		job := e2ejob.NewTestJob("fail", "backofflimit", v1.RestartPolicyNever, 1, 1, nil, int32(backoff))
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)
		ginkgo.By("Ensuring job exceeds backoffLimit")

		err = waitForJobFailure(ctx, f.ClientSet, f.Namespace.Name, job.Name, e2ejob.JobTimeout, "BackoffLimitExceeded")
		framework.ExpectNoError(err, "failed to ensure job exceed backofflimit in namespace: %s", f.Namespace.Name)

		ginkgo.By(fmt.Sprintf("Checking that %d pods were created and their status is failed", backoff+1))
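		// backoffLimit=1 allows one pod recreation, so backoff+1 failed pods are expected in total.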
		pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to get PodList for job %s in namespace: %s", job.Name, f.Namespace.Name)
		gomega.Expect(pods.Items).To(gomega.HaveLen(backoff + 1))
		for _, pod := range pods.Items {
			gomega.Expect(pod.Status.Phase).To(gomega.Equal(v1.PodFailed))
		}
	})

	f.It("should run a job to completion with CPU requests", f.WithSerial(), func(ctx context.Context) {
		ginkgo.By("Creating a job with CPU requests")

		testNodeName := scheduling.GetNodeThatCanRunPod(ctx, f)
		targetNode, err := f.ClientSet.CoreV1().Nodes().Get(ctx, testNodeName, metav1.GetOptions{})
		framework.ExpectNoError(err, "unable to get node object for node %v", testNodeName)

		cpu, ok := targetNode.Status.Allocatable[v1.ResourceCPU]
		if !ok {
			framework.Failf("Unable to get node's %q cpu", targetNode.Name)
		}

		cpuRequest := fmt.Sprint(int64(0.2 * float64(cpu.Value())))

		parallelism := int32(90)
		completions := int32(90)
		backoffLimit := int32(0)

		ginkgo.By("Creating a job")
		job := e2ejob.NewTestJob("succeed", "all-succeed", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		for i := range job.Spec.Template.Spec.Containers {
			job.Spec.Template.Spec.Containers[i].Resources = v1.ResourceRequirements{
				Requests: v1.ResourceList{
					v1.ResourceCPU: resource.MustParse(cpuRequest),
				},
			}
			job.Spec.Template.Spec.NodeSelector = map[string]string{"kubernetes.io/hostname": testNodeName}
		}

		framework.Logf("Creating job %q with a node hostname selector %q with cpu request %q", job.Name, testNodeName, cpuRequest)
		job, err = e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring job reaches completions")
		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, f.Namespace.Name, job.Name, completions)
		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensuring pods for job exist")
		pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, job.Name)
		framework.ExpectNoError(err, "failed to get pod list for job in namespace: %s", f.Namespace.Name)
		successes := int32(0)
		for _, pod := range pods.Items {
			if pod.Status.Phase == v1.PodSucceeded {
				successes++
			}
		}
		gomega.Expect(successes).To(gomega.Equal(completions), "expected %d successful job pods, but got %d", completions, successes)
	})

	/*
		Release: v1.24
		Testname: Jobs, apply changes to status
		Description: Attempt to create a running Job which MUST succeed.
		Attempt to patch the Job status which MUST succeed.
		An annotation for the job that was patched MUST be found.
		Attempt to replace the job status with update which MUST succeed.
		Attempt to read its status sub-resource which MUST succeed.
	*/
	framework.ConformanceIt("should apply changes to a job status", func(ctx context.Context) {

		ns := f.Namespace.Name
		jClient := f.ClientSet.BatchV1().Jobs(ns)

		parallelism := int32(2)
		completions := int32(4)
		backoffLimit := int32(6) // default value

		ginkgo.By("Creating a job")
		job := e2ejob.NewTestJob("notTerminate", "suspend-false-to-true", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name)

		ginkgo.By("Ensure pods equal to parallelism count is attached to the job")
		err = e2ejob.WaitForJobPodsRunning(ctx, f.ClientSet, f.Namespace.Name, job.Name, parallelism)
		framework.ExpectNoError(err, "failed to ensure number of pods associated with job %s is equal to parallelism count in namespace: %s", job.Name, f.Namespace.Name)

		customConditionType := batchv1.JobConditionType("CustomConditionType")
		// /status subresource operations
		ginkgo.By("patching /status")
		// we need to use RFC3339 version since conversion over the wire cuts nanoseconds
		now1 := metav1.Now().Rfc3339Copy()
		jStatus := batchv1.JobStatus{
			Conditions: []batchv1.JobCondition{
				{
					Type:               customConditionType,
					Status:             v1.ConditionTrue,
					LastTransitionTime: now1,
				},
			},
		}

		jStatusJSON, err := json.Marshal(jStatus)
		framework.ExpectNoError(err)
		patchedStatus, err := jClient.Patch(ctx, job.Name, types.MergePatchType,
			[]byte(`{"metadata":{"annotations":{"patchedstatus":"true"}},"status":`+string(jStatusJSON)+`}`),
			metav1.PatchOptions{}, "status")
		framework.ExpectNoError(err)
		if condition := findConditionByType(patchedStatus.Status.Conditions, customConditionType); condition != nil {
			if !condition.LastTransitionTime.Equal(&now1) {
				framework.Failf("patched object should have the applied condition with LastTransitionTime %#v, got %#v instead", now1, condition.LastTransitionTime)
			}
		} else {
			framework.Failf("patched object does not have the required condition %v", customConditionType)
		}
		gomega.Expect(patchedStatus.Annotations).To(gomega.HaveKeyWithValue("patchedstatus", "true"), "patched object should have the applied annotation")

		ginkgo.By("updating /status")
		// we need to use RFC3339 version since conversion over the wire cuts nanoseconds
		now2 := metav1.Now().Rfc3339Copy()
		var statusToUpdate, updatedStatus *batchv1.Job
		err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
			statusToUpdate, err = jClient.Get(ctx, job.Name, metav1.GetOptions{})
			if err != nil {
				return err
			}
			if condition := findConditionByType(statusToUpdate.Status.Conditions, customConditionType); condition != nil {
				condition.LastTransitionTime = now2
			} else {
				framework.Failf("patched object does not have the required condition %v", customConditionType)
			}
			updatedStatus, err = jClient.UpdateStatus(ctx, statusToUpdate, metav1.UpdateOptions{})
			return err
		})
		framework.ExpectNoError(err)
		if condition := findConditionByType(updatedStatus.Status.Conditions, customConditionType); condition != nil {
			if !condition.LastTransitionTime.Equal(&now2) {
				framework.Failf("updated object should have the applied condition with LastTransitionTime %#v, got %#v instead", now2, condition.LastTransitionTime)
			}
		} else {
			framework.Failf("updated object does not have the required condition %v", customConditionType)
		}

		ginkgo.By("get /status")
		jResource := schema.GroupVersionResource{Group: "batch", Version: "v1", Resource: "jobs"}
		gottenStatus, err := f.DynamicClient.Resource(jResource).Namespace(ns).Get(ctx, job.Name, metav1.GetOptions{}, "status")
		framework.ExpectNoError(err)
		statusUID, _, err := unstructured.NestedFieldCopy(gottenStatus.Object, "metadata", "uid")
		framework.ExpectNoError(err)
		gomega.Expect(string(job.UID)).To(gomega.Equal(statusUID), fmt.Sprintf("job.UID: %v expected to match statusUID: %v ", job.UID, statusUID))
	})

	/*
		Release: v1.25
		Testname: Jobs, manage lifecycle
		Description: Attempt to create a suspended Job which MUST succeed.
		Attempt to patch the Job to include a new label which MUST succeed.
		The label MUST be found. Attempt to replace the Job to include a
		new annotation which MUST succeed. The annotation MUST be found.
		Attempt to list all Jobs with a label selector which MUST
		succeed. One Job MUST be found. It MUST succeed at deleting a
		collection of jobs via a label selector.
	*/
	framework.ConformanceIt("should manage the lifecycle of a job", func(ctx context.Context) {
		jobName := "e2e-" + utilrand.String(5)
		label := map[string]string{"e2e-job-label": jobName}
		labelSelector := labels.SelectorFromSet(label).String()

		ns := f.Namespace.Name
		jobClient := f.ClientSet.BatchV1().Jobs(ns)

		w := &cache.ListWatch{
			WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
				options.LabelSelector = labelSelector
				return jobClient.Watch(ctx, options)
			},
		}
		jobsList, err := jobClient.List(ctx, metav1.ListOptions{LabelSelector: labelSelector})
		framework.ExpectNoError(err, "failed to list Job")

		parallelism := int32(2)
		completions := int32(4)
		backoffLimit := int32(6) // default value

		ginkgo.By("Creating a suspended job")
		job := e2ejob.NewTestJob("succeed", jobName, v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit)
		job.Labels = label
		job.Spec.Suspend = pointer.BoolPtr(true)
		job, err = e2ejob.CreateJob(ctx, f.ClientSet, ns, job)
		framework.ExpectNoError(err, "failed to create job in namespace: %s", ns)

		ginkgo.By("Patching the Job")
		payload := "{\"metadata\":{\"labels\":{\"" + jobName + "\":\"patched\"}}}"
		patchedJob, err := f.ClientSet.BatchV1().Jobs(ns).Patch(ctx, jobName, types.StrategicMergePatchType, []byte(payload), metav1.PatchOptions{})
		framework.ExpectNoError(err, "failed to patch Job %s in namespace %s", jobName, ns)

		ginkgo.By("Watching for Job to be patched")
		c := watchEventConfig{
			framework:           f,
			resourceVersion:     jobsList.ResourceVersion,
			w:                   w,
			jobName:             jobName,
			watchEvent:          watch.Modified,
			extJob:              patchedJob,
			updatedMetadataType: "label",
			updatedKey:          jobName,
			updatedValue:        "patched",
		}
		waitForJobEvent(ctx, c)
		gomega.Expect(patchedJob.Labels).To(gomega.HaveKeyWithValue(jobName, "patched"), "Did not find job label for this job. Current labels: %v", patchedJob.Labels)
		ginkgo.By("Updating the job")
		var updatedJob *batchv1.Job

		err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
			patchedJob, err = jobClient.Get(ctx, jobName, metav1.GetOptions{})
			framework.ExpectNoError(err, "Unable to get job %s", jobName)
			patchedJob.Spec.Suspend = pointer.BoolPtr(false)
			if patchedJob.Annotations == nil {
				patchedJob.Annotations = map[string]string{}
			}
			patchedJob.Annotations["updated"] = "true"
			updatedJob, err = e2ejob.UpdateJob(ctx, f.ClientSet, ns, patchedJob)
			return err
		})
		framework.ExpectNoError(err, "failed to update job in namespace: %s", ns)

		ginkgo.By("Watching for Job to be updated")
		c = watchEventConfig{
			framework:           f,
			resourceVersion:     patchedJob.ResourceVersion,
			w:                   w,
			jobName:             jobName,
			watchEvent:          watch.Modified,
			extJob:              updatedJob,
			updatedMetadataType: "annotation",
			updatedKey:          "updated",
			updatedValue:        "true",
		}
		waitForJobEvent(ctx, c)
		gomega.Expect(updatedJob.Annotations).To(gomega.HaveKeyWithValue("updated", "true"), "updated Job should have the applied annotation")
		framework.Logf("Found Job annotations: %#v", patchedJob.Annotations)

		ginkgo.By("Listing all Jobs with LabelSelector")
		jobs, err := f.ClientSet.BatchV1().Jobs("").List(ctx, metav1.ListOptions{LabelSelector: labelSelector})
		framework.ExpectNoError(err, "Failed to list job. %v", err)
		gomega.Expect(jobs.Items).To(gomega.HaveLen(1), "Failed to find job %v", jobName)
		testJob := jobs.Items[0]
		framework.Logf("Job: %v has labels: %v", testJob.Name, testJob.Labels)

		ginkgo.By("Waiting for job to complete")
		err = e2ejob.WaitForJobComplete(ctx, f.ClientSet, ns, jobName, completions)
		framework.ExpectNoError(err, "failed to ensure job completion in namespace: %s", ns)

		ginkgo.By("Delete a job collection with a labelselector")
		propagationPolicy := metav1.DeletePropagationBackground
		err = f.ClientSet.BatchV1().Jobs(ns).DeleteCollection(ctx, metav1.DeleteOptions{PropagationPolicy: &propagationPolicy}, metav1.ListOptions{LabelSelector: labelSelector})
		framework.ExpectNoError(err, "failed to delete job %s in namespace: %s", job.Name, ns)

		ginkgo.By("Watching for Job to be deleted")
		c = watchEventConfig{
			framework:           f,
			resourceVersion:     updatedJob.ResourceVersion,
			w:                   w,
			jobName:             jobName,
			watchEvent:          watch.Deleted,
			extJob:              &testJob,
			updatedMetadataType: "label",
			updatedKey:          "e2e-job-label",
			updatedValue:        jobName,
		}
		waitForJobEvent(ctx, c)

		ginkgo.By("Relist jobs to confirm deletion")
		jobs, err = f.ClientSet.BatchV1().Jobs("").List(ctx, metav1.ListOptions{LabelSelector: labelSelector})
		framework.ExpectNoError(err, "Failed to list job. %v", err)
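		// After DeleteCollection with the label selector, relisting must return no Jobs.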
%v", err) 1040 gomega.Expect(jobs.Items).To(gomega.BeEmpty(), "Found job %v", jobName) 1041 }) 1042 1043 ginkgo.It("should update the status ready field", func(ctx context.Context) { 1044 parallelism := int32(2) 1045 completions := int32(4) 1046 backoffLimit := int32(6) // default value 1047 1048 ginkgo.By("Creating a job with suspend=true") 1049 job := e2ejob.NewTestJob("notTerminate", "all-ready", v1.RestartPolicyNever, parallelism, completions, nil, backoffLimit) 1050 job.Spec.Suspend = ptr.To[bool](true) 1051 job, err := e2ejob.CreateJob(ctx, f.ClientSet, f.Namespace.Name, job) 1052 framework.ExpectNoError(err, "failed to create job in namespace: %s", f.Namespace.Name) 1053 1054 ginkgo.By("Ensure the job controller updates the status.ready field") 1055 err = e2ejob.WaitForJobReady(ctx, f.ClientSet, f.Namespace.Name, job.Name, ptr.To[int32](0)) 1056 framework.ExpectNoError(err, "failed to ensure job status ready field in namespace: %s", f.Namespace.Name) 1057 1058 ginkgo.By("Updating the job with suspend=false") 1059 err = updateJobSuspendWithRetries(ctx, f, job, ptr.To[bool](false)) 1060 framework.ExpectNoError(err, "failed to update job in namespace: %s", f.Namespace.Name) 1061 1062 ginkgo.By("Ensure the job controller updates the status.ready field") 1063 err = e2ejob.WaitForJobReady(ctx, f.ClientSet, f.Namespace.Name, job.Name, ¶llelism) 1064 framework.ExpectNoError(err, "failed to ensure job status ready field in namespace: %s", f.Namespace.Name) 1065 1066 ginkgo.By("Updating the job with suspend=true") 1067 err = updateJobSuspendWithRetries(ctx, f, job, ptr.To[bool](true)) 1068 framework.ExpectNoError(err, "failed to update job in namespace: %s", f.Namespace.Name) 1069 1070 ginkgo.By("Ensure the job controller updates the status.ready field") 1071 err = e2ejob.WaitForJobReady(ctx, f.ClientSet, f.Namespace.Name, job.Name, ptr.To[int32](0)) 1072 framework.ExpectNoError(err, "failed to ensure job status ready field in namespace: %s", f.Namespace.Name) 1073 }) 1074 }) 1075 1076 func updateJobSuspendWithRetries(ctx context.Context, f *framework.Framework, job *batchv1.Job, suspend *bool) error { 1077 return retry.RetryOnConflict(retry.DefaultRetry, func() error { 1078 job, err := e2ejob.GetJob(ctx, f.ClientSet, f.Namespace.Name, job.Name) 1079 framework.ExpectNoError(err, "unable to get job %s in namespace %s", job.Name, f.Namespace.Name) 1080 job.Spec.Suspend = suspend 1081 _, err = e2ejob.UpdateJob(ctx, f.ClientSet, f.Namespace.Name, job) 1082 return err 1083 }) 1084 } 1085 1086 // waitForJobEvent is used to track and log Job events. 1087 // As delivery of events is not actually guaranteed we 1088 // will not return an error if we miss the required event. 
func waitForJobEvent(ctx context.Context, config watchEventConfig) {
	f := config.framework
	ctx, cancel := context.WithTimeout(ctx, f.Timeouts.PodStartShort)
	defer cancel()
	_, err := watchtools.Until(ctx, config.resourceVersion, config.w, func(event watch.Event) (bool, error) {
		if job, ok := event.Object.(*batchv1.Job); ok {

			var key string
			switch config.updatedMetadataType {
			case "annotation":
				key = job.Annotations[config.updatedKey]
			case "label":
				key = job.Labels[config.updatedKey]
			}

			found := job.ObjectMeta.Name == config.extJob.ObjectMeta.Name &&
				job.ObjectMeta.Namespace == f.Namespace.Name &&
				key == config.updatedValue &&
				event.Type == config.watchEvent
			if !found {
				framework.Logf("Event %v observed for Job %v in namespace %v with labels: %v and annotations: %v", event.Type, job.ObjectMeta.Name, job.ObjectMeta.Namespace, job.Labels, job.Annotations)
				return false, nil
			}
			framework.Logf("Event %v found for Job %v in namespace %v with labels: %v and annotations: %v", event.Type, job.ObjectMeta.Name, job.ObjectMeta.Namespace, job.Labels, job.Annotations)
			return found, nil
		}
		framework.Logf("Observed event: %+v", event.Object)
		return false, nil
	})
	if err != nil {
		j, _ := f.ClientSet.BatchV1().Jobs(f.Namespace.Name).Get(ctx, config.jobName, metav1.GetOptions{})
		framework.Logf("We missed the %v event. Job details: %+v", config.watchEvent, j)
	}
}

// waitForJobFailure uses c to wait for up to timeout for the Job named jobName in namespace ns to fail.
func waitForJobFailure(ctx context.Context, c clientset.Interface, ns, jobName string, timeout time.Duration, reason string) error {
	return wait.Poll(framework.Poll, timeout, func() (bool, error) {
		curr, err := c.BatchV1().Jobs(ns).Get(ctx, jobName, metav1.GetOptions{})
		if err != nil {
			return false, err
		}
		for _, c := range curr.Status.Conditions {
			if c.Type == batchv1.JobFailed && c.Status == v1.ConditionTrue {
				if reason == "" || reason == c.Reason {
					return true, nil
				}
			}
		}
		return false, nil
	})
}

func findConditionByType(list []batchv1.JobCondition, cType batchv1.JobConditionType) *batchv1.JobCondition {
	for i := range list {
		if list[i].Type == cType {
			return &list[i]
		}
	}
	return nil
}