k8s.io/kubernetes@v1.29.3/test/integration/job/job_test.go (about) 1 /* 2 Copyright 2021 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package job 18 19 import ( 20 "context" 21 "errors" 22 "fmt" 23 "sort" 24 "strconv" 25 "strings" 26 "sync" 27 "sync/atomic" 28 "testing" 29 "time" 30 31 "github.com/google/go-cmp/cmp" 32 batchv1 "k8s.io/api/batch/v1" 33 v1 "k8s.io/api/core/v1" 34 eventsv1 "k8s.io/api/events/v1" 35 apierrors "k8s.io/apimachinery/pkg/api/errors" 36 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 37 "k8s.io/apimachinery/pkg/runtime/schema" 38 "k8s.io/apimachinery/pkg/types" 39 "k8s.io/apimachinery/pkg/util/sets" 40 "k8s.io/apimachinery/pkg/util/validation/field" 41 "k8s.io/apimachinery/pkg/util/wait" 42 "k8s.io/apimachinery/pkg/watch" 43 "k8s.io/apiserver/pkg/util/feature" 44 "k8s.io/client-go/informers" 45 clientset "k8s.io/client-go/kubernetes" 46 typedv1 "k8s.io/client-go/kubernetes/typed/batch/v1" 47 restclient "k8s.io/client-go/rest" 48 "k8s.io/client-go/util/retry" 49 featuregatetesting "k8s.io/component-base/featuregate/testing" 50 basemetrics "k8s.io/component-base/metrics" 51 "k8s.io/component-base/metrics/testutil" 52 "k8s.io/klog/v2" 53 kubeapiservertesting "k8s.io/kubernetes/cmd/kube-apiserver/app/testing" 54 podutil "k8s.io/kubernetes/pkg/api/v1/pod" 55 jobcontroller "k8s.io/kubernetes/pkg/controller/job" 56 "k8s.io/kubernetes/pkg/controller/job/metrics" 57 "k8s.io/kubernetes/pkg/features" 58 
"k8s.io/kubernetes/test/integration/framework" 59 "k8s.io/kubernetes/test/integration/util" 60 "k8s.io/utils/ptr" 61 ) 62 63 const waitInterval = time.Second 64 const fastPodFailureBackoff = 100 * time.Millisecond 65 66 type metricLabelsWithValue struct { 67 Labels []string 68 Value int 69 } 70 71 func validateCounterMetric(ctx context.Context, t *testing.T, counterVec *basemetrics.CounterVec, wantMetric metricLabelsWithValue) { 72 t.Helper() 73 var cmpErr error 74 err := wait.PollUntilContextTimeout(ctx, 10*time.Millisecond, 10*time.Second, true, func(ctx context.Context) (bool, error) { 75 cmpErr = nil 76 value, err := testutil.GetCounterMetricValue(counterVec.WithLabelValues(wantMetric.Labels...)) 77 if err != nil { 78 return true, fmt.Errorf("collecting the %q metric: %q", counterVec.Name, err) 79 } 80 if wantMetric.Value != int(value) { 81 cmpErr = fmt.Errorf("Unexpected metric delta for %q metric with labels %q. want: %v, got: %v", counterVec.Name, wantMetric.Labels, wantMetric.Value, int(value)) 82 return false, nil 83 } 84 return true, nil 85 }) 86 if err != nil { 87 t.Errorf("Failed waiting for expected metric: %q", err) 88 } 89 if cmpErr != nil { 90 t.Error(cmpErr) 91 } 92 } 93 94 func validateTerminatedPodsTrackingFinalizerMetric(ctx context.Context, t *testing.T, want int) { 95 validateCounterMetric(ctx, t, metrics.TerminatedPodsTrackingFinalizerTotal, metricLabelsWithValue{ 96 Value: want, 97 Labels: []string{metrics.Add}, 98 }) 99 validateCounterMetric(ctx, t, metrics.TerminatedPodsTrackingFinalizerTotal, metricLabelsWithValue{ 100 Value: want, 101 Labels: []string{metrics.Delete}, 102 }) 103 } 104 105 // TestJobPodFailurePolicyWithFailedPodDeletedDuringControllerRestart verifies that the job is properly marked as Failed 106 // in a scenario when the job controller crashes between removing pod finalizers and marking the job as Failed (based on 107 // the pod failure policy). After the finalizer for the failed pod is removed we remove the failed pod. 
This step is 108 // done to simulate what PodGC would do. Then, the test spawns the second instance of the controller to check that it 109 // will pick up the job state properly and will mark it as Failed, even if th pod triggering the pod failure policy is 110 // already deleted. 111 // Note: this scenario requires the use of finalizers. Without finalizers there is no guarantee a failed pod would be 112 // checked against the pod failure policy rules before its removal by PodGC. 113 func TestJobPodFailurePolicyWithFailedPodDeletedDuringControllerRestart(t *testing.T) { 114 count := 3 115 job := batchv1.Job{ 116 Spec: batchv1.JobSpec{ 117 Template: v1.PodTemplateSpec{ 118 Spec: v1.PodSpec{ 119 Containers: []v1.Container{ 120 { 121 Name: "main-container", 122 Image: "foo", 123 ImagePullPolicy: v1.PullIfNotPresent, 124 TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError, 125 }, 126 }, 127 }, 128 }, 129 Parallelism: ptr.To(int32(count)), 130 Completions: ptr.To(int32(count)), 131 PodFailurePolicy: &batchv1.PodFailurePolicy{ 132 Rules: []batchv1.PodFailurePolicyRule{ 133 { 134 Action: batchv1.PodFailurePolicyActionFailJob, 135 OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{ 136 Operator: batchv1.PodFailurePolicyOnExitCodesOpIn, 137 Values: []int32{5}, 138 }, 139 }, 140 }, 141 }, 142 }, 143 } 144 podStatusMatchingOnExitCodesTerminateRule := v1.PodStatus{ 145 Phase: v1.PodFailed, 146 ContainerStatuses: []v1.ContainerStatus{ 147 { 148 Name: "main-container", 149 State: v1.ContainerState{ 150 Terminated: &v1.ContainerStateTerminated{ 151 ExitCode: 5, 152 }, 153 }, 154 }, 155 }, 156 } 157 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, true)() 158 closeFn, restConfig, cs, ns := setup(t, "simple") 159 defer closeFn() 160 161 // Make the job controller significantly slower to trigger race condition. 
162 restConfig.QPS = 1 163 restConfig.Burst = 1 164 ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig) 165 defer func() { 166 cancel() 167 }() 168 resetMetrics() 169 restConfig.QPS = 200 170 restConfig.Burst = 200 171 172 // create a job with a failed pod matching the exit code rule and a couple of successful pods 173 jobObj, err := createJobWithDefaults(ctx, cs, ns.Name, &job) 174 if err != nil { 175 t.Fatalf("Failed to create Job: %v", err) 176 } 177 validateJobPodsStatus(ctx, t, cs, jobObj, podsByStatus{ 178 Active: count, 179 Ready: ptr.To[int32](0), 180 Terminating: ptr.To[int32](0), 181 }) 182 183 jobPods, err := getJobPods(ctx, t, cs, jobObj, func(s v1.PodStatus) bool { 184 return (s.Phase == v1.PodPending || s.Phase == v1.PodRunning) 185 }) 186 if err != nil { 187 t.Fatalf("Failed to list Job Pods: %v", err) 188 } 189 190 failedIndex := 1 191 wg := sync.WaitGroup{} 192 wg.Add(1) 193 194 // Await for the failed pod (with index failedIndex) to have its finalizer 195 // removed. The finalizer will be removed by the job controller just after 196 // appending the FailureTarget condition to the job to mark it as targeted 197 // for failure. 198 go func(ctx context.Context) { 199 err := wait.PollUntilContextTimeout(ctx, 10*time.Millisecond, time.Minute, true, func(ctx context.Context) (bool, error) { 200 failedPodUpdated, err := cs.CoreV1().Pods(jobObj.Namespace).Get(ctx, jobPods[failedIndex].Name, metav1.GetOptions{}) 201 if err != nil { 202 return true, err 203 } 204 if len(failedPodUpdated.Finalizers) == 0 { 205 return true, nil 206 } 207 return false, nil 208 }) 209 if err != nil { 210 t.Logf("Failed awaiting for the finalizer removal for pod %v", klog.KObj(jobPods[failedIndex])) 211 } 212 wg.Done() 213 }(ctx) 214 215 // We update one pod as failed with state matching the pod failure policy rule. This results in removal 216 // of the pod finalizer from the pod by the job controller. 
217 failedPod := jobPods[failedIndex] 218 updatedPod := failedPod.DeepCopy() 219 updatedPod.Status = podStatusMatchingOnExitCodesTerminateRule 220 _, err = updatePodStatuses(ctx, cs, []v1.Pod{*updatedPod}) 221 if err != nil { 222 t.Fatalf("Failed to update pod statuses %q for pods of job %q", err, klog.KObj(jobObj)) 223 } 224 wg.Wait() 225 226 t.Logf("Finalizer is removed for the failed pod %q. Shutting down the controller.", klog.KObj(failedPod)) 227 // shut down the first job controller as soon as it removed the finalizer for the failed pod. This will 228 // likely happen before the first controller is able to mark the job as Failed. 229 cancel() 230 231 // Delete the failed pod to make sure it is not used by the second instance of the controller 232 ctx, cancel = context.WithCancel(context.Background()) 233 err = cs.CoreV1().Pods(failedPod.Namespace).Delete(ctx, failedPod.Name, metav1.DeleteOptions{GracePeriodSeconds: ptr.To[int64](0)}) 234 if err != nil { 235 t.Fatalf("Error: '%v' while deleting pod: '%v'", err, klog.KObj(failedPod)) 236 } 237 t.Logf("The failed pod %q is deleted", klog.KObj(failedPod)) 238 cancel() 239 240 // start the second controller to promote the interim FailureTarget job condition as Failed 241 ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig) 242 // verify the job is correctly marked as Failed 243 validateJobFailed(ctx, t, cs, jobObj) 244 validateNoOrphanPodsWithFinalizers(ctx, t, cs, jobObj) 245 } 246 247 // TestJobPodFailurePolicy tests handling of pod failures with respect to the 248 // configured pod failure policy rules 249 func TestJobPodFailurePolicy(t *testing.T) { 250 t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff)) 251 job := batchv1.Job{ 252 Spec: batchv1.JobSpec{ 253 Template: v1.PodTemplateSpec{ 254 Spec: v1.PodSpec{ 255 Containers: []v1.Container{ 256 { 257 Name: "main-container", 258 Image: "foo", 259 ImagePullPolicy: v1.PullIfNotPresent, 260 
TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError, 261 }, 262 }, 263 }, 264 }, 265 PodFailurePolicy: &batchv1.PodFailurePolicy{ 266 Rules: []batchv1.PodFailurePolicyRule{ 267 { 268 Action: batchv1.PodFailurePolicyActionIgnore, 269 OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{ 270 { 271 Type: v1.DisruptionTarget, 272 }, 273 }, 274 }, 275 { 276 Action: batchv1.PodFailurePolicyActionCount, 277 OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{ 278 Operator: batchv1.PodFailurePolicyOnExitCodesOpIn, 279 Values: []int32{10}, 280 }, 281 }, 282 { 283 Action: batchv1.PodFailurePolicyActionFailJob, 284 OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{ 285 Operator: batchv1.PodFailurePolicyOnExitCodesOpIn, 286 Values: []int32{5, 6, 7}, 287 }, 288 }, 289 }, 290 }, 291 }, 292 } 293 podStatusMatchingOnExitCodesTerminateRule := v1.PodStatus{ 294 Phase: v1.PodFailed, 295 ContainerStatuses: []v1.ContainerStatus{ 296 { 297 Name: "main-container", 298 State: v1.ContainerState{ 299 Terminated: &v1.ContainerStateTerminated{ 300 ExitCode: 5, 301 }, 302 }, 303 }, 304 }, 305 } 306 podStatusMatchingOnExitCodesCountRule := v1.PodStatus{ 307 Phase: v1.PodFailed, 308 ContainerStatuses: []v1.ContainerStatus{ 309 { 310 Name: "main-container", 311 State: v1.ContainerState{ 312 Terminated: &v1.ContainerStateTerminated{ 313 ExitCode: 10, 314 }, 315 }, 316 }, 317 }, 318 } 319 podStatusMatchingOnPodConditionsIgnoreRule := v1.PodStatus{ 320 Phase: v1.PodFailed, 321 Conditions: []v1.PodCondition{ 322 { 323 Type: v1.DisruptionTarget, 324 Status: v1.ConditionTrue, 325 }, 326 }, 327 } 328 podStatusNotMatchingAnyRule := v1.PodStatus{ 329 Phase: v1.PodFailed, 330 ContainerStatuses: []v1.ContainerStatus{ 331 { 332 State: v1.ContainerState{ 333 Terminated: &v1.ContainerStateTerminated{}, 334 }, 335 }, 336 }, 337 } 338 testCases := map[string]struct { 339 enableJobPodFailurePolicy bool 340 restartController bool 341 job batchv1.Job 342 podStatus 
v1.PodStatus 343 wantActive int 344 wantFailed int 345 wantJobConditionType batchv1.JobConditionType 346 wantJobFinishedMetric metricLabelsWithValue 347 wantPodFailuresHandledByPolicyRuleMetric *metricLabelsWithValue 348 }{ 349 "pod status matching the configured FailJob rule on exit codes; job terminated when JobPodFailurePolicy enabled": { 350 enableJobPodFailurePolicy: true, 351 job: job, 352 podStatus: podStatusMatchingOnExitCodesTerminateRule, 353 wantActive: 0, 354 wantFailed: 1, 355 wantJobConditionType: batchv1.JobFailed, 356 wantJobFinishedMetric: metricLabelsWithValue{ 357 Labels: []string{"NonIndexed", "failed", "PodFailurePolicy"}, 358 Value: 1, 359 }, 360 wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{ 361 Labels: []string{"FailJob"}, 362 Value: 1, 363 }, 364 }, 365 "pod status matching the configured FailJob rule on exit codes; with controller restart; job terminated when JobPodFailurePolicy enabled": { 366 enableJobPodFailurePolicy: true, 367 restartController: true, 368 job: job, 369 podStatus: podStatusMatchingOnExitCodesTerminateRule, 370 wantActive: 0, 371 wantFailed: 1, 372 wantJobConditionType: batchv1.JobFailed, 373 wantJobFinishedMetric: metricLabelsWithValue{ 374 Labels: []string{"NonIndexed", "failed", "PodFailurePolicy"}, 375 Value: 1, 376 }, 377 }, 378 "pod status matching the configured FailJob rule on exit codes; default handling when JobPodFailurePolicy disabled": { 379 enableJobPodFailurePolicy: false, 380 job: job, 381 podStatus: podStatusMatchingOnExitCodesTerminateRule, 382 wantActive: 1, 383 wantFailed: 1, 384 wantJobConditionType: batchv1.JobComplete, 385 wantJobFinishedMetric: metricLabelsWithValue{ 386 Labels: []string{"NonIndexed", "succeeded", ""}, 387 Value: 1, 388 }, 389 }, 390 "pod status matching the configured Ignore rule on pod conditions; pod failure not counted when JobPodFailurePolicy enabled": { 391 enableJobPodFailurePolicy: true, 392 job: job, 393 podStatus: 
podStatusMatchingOnPodConditionsIgnoreRule, 394 wantActive: 1, 395 wantFailed: 0, 396 wantJobConditionType: batchv1.JobComplete, 397 wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{ 398 Labels: []string{"Ignore"}, 399 Value: 1, 400 }, 401 wantJobFinishedMetric: metricLabelsWithValue{ 402 Labels: []string{"NonIndexed", "succeeded", ""}, 403 Value: 1, 404 }, 405 }, 406 "pod status matching the configured Count rule on exit codes; pod failure counted when JobPodFailurePolicy enabled": { 407 enableJobPodFailurePolicy: true, 408 job: job, 409 podStatus: podStatusMatchingOnExitCodesCountRule, 410 wantActive: 1, 411 wantFailed: 1, 412 wantJobConditionType: batchv1.JobComplete, 413 wantJobFinishedMetric: metricLabelsWithValue{ 414 Labels: []string{"NonIndexed", "succeeded", ""}, 415 Value: 1, 416 }, 417 wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{ 418 Labels: []string{"Count"}, 419 Value: 1, 420 }, 421 }, 422 "pod status non-matching any configured rule; pod failure counted when JobPodFailurePolicy enabled": { 423 enableJobPodFailurePolicy: true, 424 job: job, 425 podStatus: podStatusNotMatchingAnyRule, 426 wantActive: 1, 427 wantFailed: 1, 428 wantJobConditionType: batchv1.JobComplete, 429 wantJobFinishedMetric: metricLabelsWithValue{ 430 Labels: []string{"NonIndexed", "succeeded", ""}, 431 Value: 1, 432 }, 433 wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{ 434 Labels: []string{"Count"}, 435 Value: 0, 436 }, 437 }, 438 } 439 for name, test := range testCases { 440 t.Run(name, func(t *testing.T) { 441 resetMetrics() 442 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, test.enableJobPodFailurePolicy)() 443 444 closeFn, restConfig, clientSet, ns := setup(t, "simple") 445 defer closeFn() 446 ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig) 447 defer func() { 448 cancel() 449 }() 450 451 jobObj, err := createJobWithDefaults(ctx, clientSet, 
ns.Name, &test.job) 452 if err != nil { 453 t.Fatalf("Error %q while creating the job %q", err, jobObj.Name) 454 } 455 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 456 Active: 1, 457 Ready: ptr.To[int32](0), 458 Terminating: ptr.To[int32](0), 459 }) 460 461 op := func(p *v1.Pod) bool { 462 p.Status = test.podStatus 463 return true 464 } 465 466 if err, _ := updateJobPodsStatus(ctx, clientSet, jobObj, op, 1); err != nil { 467 t.Fatalf("Error %q while updating pod status for Job: %q", err, jobObj.Name) 468 } 469 470 if test.restartController { 471 cancel() 472 ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig) 473 } 474 475 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 476 Active: test.wantActive, 477 Failed: test.wantFailed, 478 Ready: ptr.To[int32](0), 479 Terminating: ptr.To[int32](0), 480 }) 481 482 if test.wantJobConditionType == batchv1.JobComplete { 483 if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil { 484 t.Fatalf("Failed setting phase %q on Job Pod: %q", v1.PodSucceeded, err) 485 } 486 } 487 validateJobCondition(ctx, t, clientSet, jobObj, test.wantJobConditionType) 488 validateCounterMetric(ctx, t, metrics.JobFinishedNum, test.wantJobFinishedMetric) 489 if test.wantPodFailuresHandledByPolicyRuleMetric != nil { 490 validateCounterMetric(ctx, t, metrics.PodFailuresHandledByFailurePolicy, *test.wantPodFailuresHandledByPolicyRuleMetric) 491 } 492 validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj) 493 }) 494 } 495 } 496 497 // TestBackoffLimitPerIndex_DelayedPodDeletion tests the pod deletion is delayed 498 // until the replacement pod is created, so that the replacement pod has the 499 // index-failure-count annotation bumped, when BackoffLimitPerIndex is used. 
500 func TestBackoffLimitPerIndex_DelayedPodDeletion(t *testing.T) { 501 t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff)) 502 503 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)() 504 closeFn, restConfig, clientSet, ns := setup(t, "backoff-limit-per-index-failed") 505 defer closeFn() 506 ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig) 507 defer func() { 508 cancel() 509 }() 510 511 jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{ 512 Spec: batchv1.JobSpec{ 513 Parallelism: ptr.To[int32](1), 514 Completions: ptr.To[int32](1), 515 BackoffLimitPerIndex: ptr.To[int32](1), 516 CompletionMode: completionModePtr(batchv1.IndexedCompletion), 517 }, 518 }) 519 if err != nil { 520 t.Fatalf("Failed to create Job: %v", err) 521 } 522 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 523 Active: 1, 524 Ready: ptr.To[int32](0), 525 Terminating: ptr.To[int32](0), 526 }) 527 validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0), "", ptr.To("")) 528 529 // First pod from index 0 failed. 
530 if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 0); err != nil { 531 t.Fatal("Failed trying to fail pod with index 0") 532 } 533 // Delete the failed pod 534 pod, err := getJobPodForIndex(ctx, clientSet, jobObj, 0, func(_ *v1.Pod) bool { return true }) 535 if err != nil { 536 t.Fatalf("failed to get terminal pod for index: %v", 0) 537 } 538 if err := clientSet.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}); err != nil { 539 t.Fatalf("failed to delete pod: %v, error: %v", klog.KObj(pod), err) 540 } 541 542 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 543 Active: 1, 544 Failed: 1, 545 Ready: ptr.To[int32](0), 546 Terminating: ptr.To[int32](0), 547 }) 548 validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0), "", ptr.To("")) 549 550 // Verify the replacement pod is created and has the index-failure-count 551 // annotation bumped. 552 replacement, err := getActivePodForIndex(ctx, clientSet, jobObj, 0) 553 if err != nil { 554 t.Fatalf("Failed to get active replacement pod for index: %v, error: %v", 0, err) 555 } 556 gotIndexFailureCount, err := getIndexFailureCount(replacement) 557 if err != nil { 558 t.Fatalf("Failed read the index failure count annotation for pod: %v, error: %v", klog.KObj(replacement), err) 559 } 560 if diff := cmp.Diff(1, gotIndexFailureCount); diff != "" { 561 t.Errorf("Unexpected index failure count for the replacement pod: %s", diff) 562 } 563 if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 0); err != nil { 564 t.Fatal("Failed trying to fail pod with index 0") 565 } 566 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 567 Active: 0, 568 Succeeded: 1, 569 Failed: 1, 570 Ready: ptr.To[int32](0), 571 Terminating: ptr.To[int32](0), 572 }) 573 validateJobSucceeded(ctx, t, clientSet, jobObj) 574 } 575 576 // TestBackoffLimitPerIndex_Reenabling tests handling of pod failures when 577 // reenabling the BackoffLimitPerIndex feature. 
578 func TestBackoffLimitPerIndex_Reenabling(t *testing.T) { 579 t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff)) 580 581 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)() 582 closeFn, restConfig, clientSet, ns := setup(t, "backoff-limit-per-index-reenabled") 583 defer closeFn() 584 ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig) 585 defer cancel() 586 resetMetrics() 587 588 jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{ 589 Spec: batchv1.JobSpec{ 590 Parallelism: ptr.To[int32](3), 591 Completions: ptr.To[int32](3), 592 BackoffLimitPerIndex: ptr.To[int32](0), 593 CompletionMode: completionModePtr(batchv1.IndexedCompletion), 594 }, 595 }) 596 if err != nil { 597 t.Fatalf("Failed to create Job: %v", err) 598 } 599 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 600 Active: 3, 601 Ready: ptr.To[int32](0), 602 Terminating: ptr.To[int32](0), 603 }) 604 validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1, 2), "", ptr.To("")) 605 606 // First pod from index 0 failed 607 if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 0); err != nil { 608 t.Fatal("Failed trying to fail pod with index 0") 609 } 610 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 611 Active: 2, 612 Failed: 1, 613 Ready: ptr.To[int32](0), 614 Terminating: ptr.To[int32](0), 615 }) 616 validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(1, 2), "", ptr.To("0")) 617 618 // Disable the feature 619 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, false)() 620 621 // First pod from index 1 failed 622 if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil { 623 t.Fatal("Failed trying to fail pod with index 1") 624 } 625 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 626 
Active: 3, 627 Failed: 2, 628 Ready: ptr.To[int32](0), 629 Terminating: ptr.To[int32](0), 630 }) 631 validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1, 2), "", nil) 632 633 // Reenable the feature 634 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)() 635 636 // First pod from index 2 failed 637 if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 2); err != nil { 638 t.Fatal("Failed trying to fail pod with index 2") 639 } 640 641 // Verify the indexes 0 and 1 are active as the failed pods don't have 642 // finalizers at this point, so they are ignored. 643 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 644 Active: 2, 645 Failed: 3, 646 Ready: ptr.To[int32](0), 647 Terminating: ptr.To[int32](0), 648 }) 649 validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", ptr.To("2")) 650 651 // mark remaining pods are Succeeded and verify Job status 652 if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 2); err != nil { 653 t.Fatalf("Failed setting phase %q on Job Pod: %q", v1.PodSucceeded, err) 654 } 655 validateJobFailed(ctx, t, clientSet, jobObj) 656 validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj) 657 } 658 659 // TestBackoffLimitPerIndex_JobPodsCreatedWithExponentialBackoff tests that the 660 // pods are recreated with expotential backoff delay computed independently 661 // per index. 
Scenario: 662 // - fail index 0 663 // - fail index 0 664 // - fail index 1 665 // - succeed index 0 666 // - fail index 1 667 // - succeed index 1 668 func TestBackoffLimitPerIndex_JobPodsCreatedWithExponentialBackoff(t *testing.T) { 669 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)() 670 t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, 2*time.Second)) 671 672 closeFn, restConfig, clientSet, ns := setup(t, "simple") 673 defer closeFn() 674 ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig) 675 defer cancel() 676 677 jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{ 678 Spec: batchv1.JobSpec{ 679 Completions: ptr.To[int32](2), 680 Parallelism: ptr.To[int32](2), 681 BackoffLimitPerIndex: ptr.To[int32](2), 682 CompletionMode: completionModePtr(batchv1.IndexedCompletion), 683 }, 684 }) 685 if err != nil { 686 t.Fatalf("Could not create job: %v", err) 687 } 688 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 689 Active: 2, 690 Ready: ptr.To[int32](0), 691 Terminating: ptr.To[int32](0), 692 }) 693 validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", ptr.To("")) 694 695 // Fail the first pod for index 0 696 if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 0); err != nil { 697 t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err) 698 } 699 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 700 Active: 2, 701 Failed: 1, 702 Ready: ptr.To[int32](0), 703 Terminating: ptr.To[int32](0), 704 }) 705 validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", ptr.To("")) 706 707 // Fail the second pod for index 0 708 if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 0); err != nil { 709 t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err) 710 } 711 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 712 
Active: 2, 713 Failed: 2, 714 Ready: ptr.To[int32](0), 715 Terminating: ptr.To[int32](0), 716 }) 717 validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", ptr.To("")) 718 719 // Fail the first pod for index 1 720 if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil { 721 t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err) 722 } 723 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 724 Active: 2, 725 Failed: 3, 726 Ready: ptr.To[int32](0), 727 Terminating: ptr.To[int32](0), 728 }) 729 validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", ptr.To("")) 730 731 // Succeed the third pod for index 0 732 if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 0); err != nil { 733 t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodSucceeded, err) 734 } 735 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 736 Active: 1, 737 Failed: 3, 738 Succeeded: 1, 739 Ready: ptr.To[int32](0), 740 Terminating: ptr.To[int32](0), 741 }) 742 validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(1), "0", ptr.To("")) 743 744 // Fail the second pod for index 1 745 if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil { 746 t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err) 747 } 748 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 749 Active: 1, 750 Failed: 4, 751 Succeeded: 1, 752 Ready: ptr.To[int32](0), 753 Terminating: ptr.To[int32](0), 754 }) 755 validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(1), "0", ptr.To("")) 756 757 // Succeed the third pod for index 1 758 if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil { 759 t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodSucceeded, err) 760 } 761 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 762 Active: 0, 763 Failed: 4, 764 Succeeded: 2, 765 Ready: ptr.To[int32](0), 766 
Terminating: ptr.To[int32](0),
	})
	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New[int](), "0,1", ptr.To(""))
	validateJobSucceeded(ctx, t, clientSet, jobObj)

	for index := 0; index < int(*jobObj.Spec.Completions); index++ {
		podsForIndex, err := getJobPodsForIndex(ctx, clientSet, jobObj, index, func(_ *v1.Pod) bool { return true })
		if err != nil {
			t.Fatalf("Failed to list job %q pods for index %v, error: %v", klog.KObj(jobObj), index, err)
		}
		validateExpotentialBackoffDelay(t, jobcontroller.DefaultJobPodFailureBackOff, podsForIndex)
	}
}

// TestBackoffLimitPerIndex tests handling of job and its pods when
// backoff limit per index is used.
//
// Each test case creates an Indexed Job, then applies a sequence of pod
// terminations. After every termination the Job status counters, the
// active/completed/failed indexes and (optionally) the index failure count
// annotation of the replacement pod are validated. Any pods still expected to
// be active after the last termination are marked as succeeded before the
// final Job condition and the job_finished_indexes_total metric are checked.
func TestBackoffLimitPerIndex(t *testing.T) {
	t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))

	// podTerminationWithExpectations describes a single pod termination and
	// the Job state expected afterwards:
	//   - index: completion index whose active pod gets the new status.
	//   - status: pod status written through the status subresource.
	//   - wantActive/wantFailed/wantSucceeded: expected Job status counters.
	//   - wantActiveIndexes/wantCompletedIndexes/wantFailedIndexes: expected
	//     per-index state passed to validateIndexedJobPods.
	//   - wantReplacementPodFailureCount: when non-nil, the expected value of
	//     the index failure count annotation on the replacement pod.
	type podTerminationWithExpectations struct {
		index                          int
		status                         v1.PodStatus
		wantActive                     int
		wantFailed                     int
		wantSucceeded                  int
		wantActiveIndexes              sets.Set[int]
		wantCompletedIndexes           string
		wantFailedIndexes              *string
		wantReplacementPodFailureCount *int
	}

	podTemplateSpec := v1.PodTemplateSpec{
		Spec: v1.PodSpec{
			Containers: []v1.Container{
				{
					Name:                     "main-container",
					Image:                    "foo",
					ImagePullPolicy:          v1.PullIfNotPresent,
					TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
				},
			},
		},
	}
	testCases := map[string]struct {
		job                               batchv1.Job
		podTerminations                   []podTerminationWithExpectations
		wantJobConditionType              batchv1.JobConditionType
		wantJobFinishedIndexesTotalMetric []metricLabelsWithValue
	}{
		"job succeeded": {
			job: batchv1.Job{
				Spec: batchv1.JobSpec{
					Parallelism:          ptr.To[int32](2),
					Completions:          ptr.To[int32](2),
					CompletionMode:       completionModePtr(batchv1.IndexedCompletion),
					BackoffLimitPerIndex: ptr.To[int32](1),
					Template:             podTemplateSpec,
				},
			},
			podTerminations: []podTerminationWithExpectations{
				{
					status: v1.PodStatus{
						Phase: v1.PodFailed,
					},
					wantActive:                     2,
					wantFailed:                     1,
					wantActiveIndexes:              sets.New(0, 1),
					wantFailedIndexes:              ptr.To(""),
					wantReplacementPodFailureCount: ptr.To(1),
				},
			},
			wantJobConditionType: batchv1.JobComplete,
			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
				{
					Labels: []string{"succeeded", "perIndex"},
					Value:  2,
				},
			},
		},
		"job index fails due to exceeding backoff limit per index": {
			job: batchv1.Job{
				Spec: batchv1.JobSpec{
					Parallelism:          ptr.To[int32](2),
					Completions:          ptr.To[int32](2),
					CompletionMode:       completionModePtr(batchv1.IndexedCompletion),
					BackoffLimitPerIndex: ptr.To[int32](2),
					Template:             podTemplateSpec,
				},
			},
			podTerminations: []podTerminationWithExpectations{
				{
					status: v1.PodStatus{
						Phase: v1.PodFailed,
					},
					wantActive:                     2,
					wantFailed:                     1,
					wantActiveIndexes:              sets.New(0, 1),
					wantFailedIndexes:              ptr.To(""),
					wantReplacementPodFailureCount: ptr.To(1),
				},
				{
					status: v1.PodStatus{
						Phase: v1.PodFailed,
					},
					wantActive:                     2,
					wantFailed:                     2,
					wantActiveIndexes:              sets.New(0, 1),
					wantFailedIndexes:              ptr.To(""),
					wantReplacementPodFailureCount: ptr.To(2),
				},
				{
					status: v1.PodStatus{
						Phase: v1.PodFailed,
					},
					wantActive:        1,
					wantFailed:        3,
					wantActiveIndexes: sets.New(1),
					wantFailedIndexes: ptr.To("0"),
				},
			},
			wantJobConditionType: batchv1.JobFailed,
			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
				{
					Labels: []string{"failed", "perIndex"},
					Value:  1,
				},
				{
					Labels: []string{"succeeded", "perIndex"},
					Value:  1,
				},
			},
		},
		"job index fails due to exceeding the global backoff limit first": {
			job: batchv1.Job{
				Spec: batchv1.JobSpec{
					Parallelism:          ptr.To[int32](3),
					Completions:          ptr.To[int32](3),
					CompletionMode:       completionModePtr(batchv1.IndexedCompletion),
					BackoffLimitPerIndex: ptr.To[int32](1),
					BackoffLimit:         ptr.To[int32](2),
					Template:             podTemplateSpec,
				},
			},
			podTerminations: []podTerminationWithExpectations{
				{
					index: 0,
					status: v1.PodStatus{
						Phase: v1.PodFailed,
					},
					wantActive:        3,
					wantFailed:        1,
					wantActiveIndexes: sets.New(0, 1, 2),
					wantFailedIndexes: ptr.To(""),
				},
				{
					index: 1,
					status: v1.PodStatus{
						Phase: v1.PodFailed,
					},
					wantActive:        3,
					wantFailed:        2,
					wantActiveIndexes: sets.New(0, 1, 2),
					wantFailedIndexes: ptr.To(""),
				},
				{
					index: 2,
					status: v1.PodStatus{
						Phase: v1.PodFailed,
					},
					wantFailed:        5,
					wantFailedIndexes: ptr.To(""),
				},
			},
			wantJobConditionType: batchv1.JobFailed,
			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
				{
					Labels: []string{"succeeded", "perIndex"},
					Value:  0,
				},
				{
					Labels: []string{"failed", "perIndex"},
					Value:  0,
				},
			},
		},
		"job continues execution after a failed index, the job is marked Failed due to the failed index": {
			job: batchv1.Job{
				Spec: batchv1.JobSpec{
					Parallelism:          ptr.To[int32](2),
					Completions:          ptr.To[int32](2),
					CompletionMode:       completionModePtr(batchv1.IndexedCompletion),
					BackoffLimitPerIndex: ptr.To[int32](0),
					Template:             podTemplateSpec,
				},
			},
			podTerminations: []podTerminationWithExpectations{
				{
					index: 0,
					status: v1.PodStatus{
						Phase: v1.PodFailed,
					},
					wantActive:        1,
					wantFailed:        1,
					wantActiveIndexes: sets.New(1),
					wantFailedIndexes: ptr.To("0"),
				},
				{
					index: 1,
					status: v1.PodStatus{
						Phase: v1.PodSucceeded,
					},
					wantFailed:           1,
					wantSucceeded:        1,
					wantFailedIndexes:    ptr.To("0"),
					wantCompletedIndexes: "1",
				},
			},
			wantJobConditionType: batchv1.JobFailed,
			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
				{
					Labels: []string{"succeeded", "perIndex"},
					Value:  1,
				},
				{
					Labels: []string{"failed", "perIndex"},
					Value:  1,
				},
			},
		},
		"job execution terminated early due to exceeding max failed indexes": {
			job: batchv1.Job{
				Spec: batchv1.JobSpec{
					Parallelism:          ptr.To[int32](3),
					Completions:          ptr.To[int32](3),
					CompletionMode:       completionModePtr(batchv1.IndexedCompletion),
					BackoffLimitPerIndex: ptr.To[int32](0),
					MaxFailedIndexes:     ptr.To[int32](1),
					Template:             podTemplateSpec,
				},
			},
			podTerminations: []podTerminationWithExpectations{
				{
					index: 0,
					status: v1.PodStatus{
						Phase: v1.PodFailed,
					},
					wantActive:        2,
					wantFailed:        1,
					wantActiveIndexes: sets.New(1, 2),
					wantFailedIndexes: ptr.To("0"),
				},
				{
					index: 1,
					status: v1.PodStatus{
						Phase: v1.PodFailed,
					},
					wantActive:        0,
					wantFailed:        3,
					wantFailedIndexes: ptr.To("0,1"),
				},
			},
			wantJobConditionType: batchv1.JobFailed,
			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
				{
					Labels: []string{"failed", "perIndex"},
					Value:  2,
				},
			},
		},
		"pod failure matching pod failure policy rule with FailIndex action": {
			job: batchv1.Job{
				Spec: batchv1.JobSpec{
					Parallelism:          ptr.To[int32](2),
					Completions:          ptr.To[int32](2),
					CompletionMode:       completionModePtr(batchv1.IndexedCompletion),
					BackoffLimitPerIndex: ptr.To[int32](1),
					Template:             podTemplateSpec,
					PodFailurePolicy: &batchv1.PodFailurePolicy{
						Rules: []batchv1.PodFailurePolicyRule{
							{
								Action: batchv1.PodFailurePolicyActionFailIndex,
								OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
									Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
									Values:   []int32{13},
								},
							},
							{
								Action: batchv1.PodFailurePolicyActionFailIndex,
								OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
									{
										Type:   v1.DisruptionTarget,
										Status: v1.ConditionTrue,
									},
								},
							},
						},
					},
				},
			},
			podTerminations: []podTerminationWithExpectations{
				{
					index: 0,
					status: v1.PodStatus{
						Phase: v1.PodFailed,
						ContainerStatuses: []v1.ContainerStatus{
							{
								State: v1.ContainerState{
									Terminated: &v1.ContainerStateTerminated{
										ExitCode: 13,
									},
								},
							},
						},
					},
					wantActive:        1,
					wantFailed:        1,
					wantActiveIndexes: sets.New(1),
					wantFailedIndexes: ptr.To("0"),
				},
				{
					index: 1,
					status: v1.PodStatus{
						Phase: v1.PodFailed,
						Conditions: []v1.PodCondition{
							{
								Type:   v1.DisruptionTarget,
								Status: v1.ConditionTrue,
							},
						},
					},
					wantFailed:        2,
					wantFailedIndexes: ptr.To("0,1"),
				},
			},
			wantJobConditionType: batchv1.JobFailed,
			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
				{
					Labels: []string{"failed", "perIndex"},
					Value:  2,
				},
			},
		},
	}
	for name, test := range testCases {
		t.Run(name, func(t *testing.T) {
			resetMetrics()
			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, true)()
			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)()

			closeFn, restConfig, clientSet, ns := setup(t, "simple")
			defer closeFn()
			ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
			defer cancel()
			jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &test.job)
			if err != nil {
				// Do not read fields of jobObj here: the create call failed, so
				// the returned object carries no usable name.
				t.Fatalf("Failed to create Job: %v", err)
			}
			validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
				Active:      int(*test.job.Spec.Parallelism),
				Ready:       ptr.To[int32](0),
				Terminating: ptr.To[int32](0),
			})
			for _, podTermination := range test.podTerminations {
				pod, err := getActivePodForIndex(ctx, clientSet, jobObj, podTermination.index)
				if err != nil {
					t.Fatalf("listing Job Pods: %q", err)
				}
				pod.Status = podTermination.status
				if _, err = clientSet.CoreV1().Pods(pod.Namespace).UpdateStatus(ctx, pod, metav1.UpdateOptions{}); err != nil {
					t.Fatalf("Error updating the pod %q: %q", klog.KObj(pod), err)
				}
				validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
					Active:      podTermination.wantActive,
					Succeeded:   podTermination.wantSucceeded,
					Failed:      podTermination.wantFailed,
					Ready:       ptr.To[int32](0),
					Terminating: ptr.To[int32](0),
				})
				validateIndexedJobPods(ctx, t, clientSet, jobObj, podTermination.wantActiveIndexes, podTermination.wantCompletedIndexes, podTermination.wantFailedIndexes)
				if podTermination.wantReplacementPodFailureCount != nil {
					replacement, err := getActivePodForIndex(ctx, clientSet, jobObj, podTermination.index)
					if err != nil {
						t.Fatalf("Failed to get active replacement pod for index: %v, error: %v", podTermination.index, err)
					}
					gotReplacementPodFailureCount, err := getIndexFailureCount(replacement)
					if err != nil {
						t.Fatalf("Failed to read the index failure count annotation for pod: %v, error: %v", klog.KObj(replacement), err)
					}
					if *podTermination.wantReplacementPodFailureCount != gotReplacementPodFailureCount {
						t.Fatalf("Unexpected value of the index failure count annotation. Want: %v, got: %v", *podTermination.wantReplacementPodFailureCount, gotReplacementPodFailureCount)
					}
				}
			}

			// Let the pods that are still expected to be running succeed so
			// that the Job can reach its terminal condition.
			remainingActive := test.podTerminations[len(test.podTerminations)-1].wantActive
			if remainingActive > 0 {
				if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, remainingActive); err != nil {
					t.Fatalf("Failed setting phase %q on Job Pod: %q", v1.PodSucceeded, err)
				}
			}
			validateJobCondition(ctx, t, clientSet, jobObj, test.wantJobConditionType)
			for _, wantMetricValue := range test.wantJobFinishedIndexesTotalMetric {
				validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, wantMetricValue)
			}
			validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
		})
	}
}

// getIndexFailureCount returns the integer value of the
// batchv1.JobIndexFailureCountAnnotation annotation of the given pod. It
// returns an error if the pod has no annotations, if the annotation is
// missing, or if its value is not a valid integer.
func getIndexFailureCount(p *v1.Pod) (int, error) {
	if p.Annotations == nil {
		return 0, errors.New("no annotations found")
	}
	v, ok := p.Annotations[batchv1.JobIndexFailureCountAnnotation]
	if !ok {
		return 0, fmt.Errorf("annotation %s not found", batchv1.JobIndexFailureCountAnnotation)
	}
	count, err := strconv.Atoi(v)
	if err != nil {
		return 0, fmt.Errorf("parsing annotation %s value %q: %w", batchv1.JobIndexFailureCountAnnotation, v, err)
	}
	return count, nil
}

// completionModePtr returns a pointer to a copy of the given completion mode.
func completionModePtr(cm batchv1.CompletionMode) *batchv1.CompletionMode {
	return &cm
}

// TestNonParallelJob tests a Job that only executes one Pod. The test
// recreates the Job controller at some points to make sure a new controller
// is able to pick up.
1195 func TestNonParallelJob(t *testing.T) { 1196 t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff)) 1197 closeFn, restConfig, clientSet, ns := setup(t, "simple") 1198 defer closeFn() 1199 ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig) 1200 defer func() { 1201 cancel() 1202 }() 1203 1204 jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{}) 1205 if err != nil { 1206 t.Fatalf("Failed to create Job: %v", err) 1207 } 1208 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 1209 Active: 1, 1210 Ready: ptr.To[int32](0), 1211 Terminating: ptr.To[int32](0), 1212 }) 1213 1214 // Restarting controller. 1215 cancel() 1216 ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig) 1217 1218 // Failed Pod is replaced. 1219 if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil { 1220 t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err) 1221 } 1222 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 1223 Active: 1, 1224 Failed: 1, 1225 Ready: ptr.To[int32](0), 1226 Terminating: ptr.To[int32](0), 1227 }) 1228 validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{ 1229 Labels: []string{"NonIndexed", "failed"}, 1230 Value: 1, 1231 }) 1232 1233 // Restarting controller. 1234 cancel() 1235 ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig) 1236 1237 // No more Pods are created after the Pod succeeds. 
1238 if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil { 1239 t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodSucceeded, err) 1240 } 1241 validateJobSucceeded(ctx, t, clientSet, jobObj) 1242 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 1243 Failed: 1, 1244 Succeeded: 1, 1245 Ready: ptr.To[int32](0), 1246 Terminating: ptr.To[int32](0), 1247 }) 1248 validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj) 1249 validateCounterMetric(ctx, t, metrics.JobFinishedNum, metricLabelsWithValue{ 1250 Labels: []string{"NonIndexed", "succeeded", ""}, 1251 Value: 1, 1252 }) 1253 validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{ 1254 Labels: []string{"NonIndexed", "succeeded"}, 1255 Value: 1, 1256 }) 1257 } 1258 1259 func TestParallelJob(t *testing.T) { 1260 t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff)) 1261 closeFn, restConfig, clientSet, ns := setup(t, "parallel") 1262 defer closeFn() 1263 ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig) 1264 defer cancel() 1265 resetMetrics() 1266 1267 jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{ 1268 Spec: batchv1.JobSpec{ 1269 Parallelism: ptr.To[int32](5), 1270 }, 1271 }) 1272 if err != nil { 1273 t.Fatalf("Failed to create Job: %v", err) 1274 } 1275 want := podsByStatus{ 1276 Active: 5, 1277 Ready: ptr.To[int32](0), 1278 Terminating: ptr.To[int32](0), 1279 } 1280 validateJobPodsStatus(ctx, t, clientSet, jobObj, want) 1281 1282 // Tracks ready pods, if enabled. 1283 if err, _ := setJobPodsReady(ctx, clientSet, jobObj, 2); err != nil { 1284 t.Fatalf("Failed Marking Pods as ready: %v", err) 1285 } 1286 want.Ready = ptr.To[int32](2) 1287 validateJobPodsStatus(ctx, t, clientSet, jobObj, want) 1288 1289 // Failed Pods are replaced. 
1290 if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 2); err != nil { 1291 t.Fatalf("Failed setting phase %s on Job Pods: %v", v1.PodFailed, err) 1292 } 1293 want = podsByStatus{ 1294 Active: 5, 1295 Failed: 2, 1296 Ready: ptr.To[int32](0), 1297 Terminating: ptr.To[int32](0), 1298 } 1299 validateJobPodsStatus(ctx, t, clientSet, jobObj, want) 1300 // Once one Pod succeeds, no more Pods are created, even if some fail. 1301 if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil { 1302 t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodSucceeded, err) 1303 } 1304 want = podsByStatus{ 1305 Failed: 2, 1306 Succeeded: 1, 1307 Active: 4, 1308 Ready: ptr.To[int32](0), 1309 Terminating: ptr.To[int32](0), 1310 } 1311 validateJobPodsStatus(ctx, t, clientSet, jobObj, want) 1312 if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 2); err != nil { 1313 t.Fatalf("Failed setting phase %s on Job Pods: %v", v1.PodFailed, err) 1314 } 1315 want = podsByStatus{ 1316 Failed: 4, 1317 Succeeded: 1, 1318 Active: 2, 1319 Ready: ptr.To[int32](0), 1320 Terminating: ptr.To[int32](0), 1321 } 1322 validateJobPodsStatus(ctx, t, clientSet, jobObj, want) 1323 // No more Pods are created after remaining Pods succeed. 
1324 if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 2); err != nil { 1325 t.Fatalf("Failed setting phase %s on Job Pods: %v", v1.PodSucceeded, err) 1326 } 1327 validateJobSucceeded(ctx, t, clientSet, jobObj) 1328 want = podsByStatus{ 1329 Failed: 4, 1330 Succeeded: 3, 1331 Ready: ptr.To[int32](0), 1332 Terminating: ptr.To[int32](0), 1333 } 1334 validateJobPodsStatus(ctx, t, clientSet, jobObj, want) 1335 validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj) 1336 validateTerminatedPodsTrackingFinalizerMetric(ctx, t, 7) 1337 validateCounterMetric(ctx, t, metrics.JobFinishedNum, metricLabelsWithValue{ 1338 Labels: []string{"NonIndexed", "succeeded", ""}, 1339 Value: 1, 1340 }) 1341 validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{ 1342 Labels: []string{"NonIndexed", "succeeded"}, 1343 Value: 3, 1344 }) 1345 validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{ 1346 Labels: []string{"NonIndexed", "failed"}, 1347 Value: 4, 1348 }) 1349 } 1350 1351 func TestParallelJobChangingParallelism(t *testing.T) { 1352 closeFn, restConfig, clientSet, ns := setup(t, "parallel") 1353 defer closeFn() 1354 ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig) 1355 defer cancel() 1356 1357 jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{ 1358 Spec: batchv1.JobSpec{ 1359 BackoffLimit: ptr.To[int32](2), 1360 Parallelism: ptr.To[int32](5), 1361 }, 1362 }) 1363 if err != nil { 1364 t.Fatalf("Failed to create Job: %v", err) 1365 } 1366 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 1367 Active: 5, 1368 Ready: ptr.To[int32](0), 1369 Terminating: ptr.To[int32](0), 1370 }) 1371 1372 // Reduce parallelism by a number greater than backoffLimit. 
1373 patch := []byte(`{"spec":{"parallelism":2}}`) 1374 jobObj, err = clientSet.BatchV1().Jobs(ns.Name).Patch(ctx, jobObj.Name, types.StrategicMergePatchType, patch, metav1.PatchOptions{}) 1375 if err != nil { 1376 t.Fatalf("Updating Job: %v", err) 1377 } 1378 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 1379 Active: 2, 1380 Ready: ptr.To[int32](0), 1381 Terminating: ptr.To[int32](0), 1382 }) 1383 1384 // Increase parallelism again. 1385 patch = []byte(`{"spec":{"parallelism":4}}`) 1386 jobObj, err = clientSet.BatchV1().Jobs(ns.Name).Patch(ctx, jobObj.Name, types.StrategicMergePatchType, patch, metav1.PatchOptions{}) 1387 if err != nil { 1388 t.Fatalf("Updating Job: %v", err) 1389 } 1390 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 1391 Active: 4, 1392 Ready: ptr.To[int32](0), 1393 Terminating: ptr.To[int32](0), 1394 }) 1395 1396 // Succeed Job 1397 if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 4); err != nil { 1398 t.Fatalf("Failed setting phase %s on Job Pods: %v", v1.PodFailed, err) 1399 } 1400 validateJobSucceeded(ctx, t, clientSet, jobObj) 1401 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 1402 Succeeded: 4, 1403 Ready: ptr.To[int32](0), 1404 Terminating: ptr.To[int32](0), 1405 }) 1406 validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj) 1407 } 1408 1409 func TestParallelJobWithCompletions(t *testing.T) { 1410 // Lower limits for a job sync so that we can test partial updates with a low 1411 // number of pods. 
1412 t.Cleanup(setDuringTest(&jobcontroller.MaxUncountedPods, 10)) 1413 t.Cleanup(setDuringTest(&jobcontroller.MaxPodCreateDeletePerSync, 10)) 1414 t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff)) 1415 closeFn, restConfig, clientSet, ns := setup(t, "completions") 1416 defer closeFn() 1417 ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig) 1418 defer cancel() 1419 resetMetrics() 1420 1421 jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{ 1422 Spec: batchv1.JobSpec{ 1423 Parallelism: ptr.To[int32](54), 1424 Completions: ptr.To[int32](56), 1425 }, 1426 }) 1427 if err != nil { 1428 t.Fatalf("Failed to create Job: %v", err) 1429 } 1430 want := podsByStatus{ 1431 Active: 54, 1432 Ready: ptr.To[int32](0), 1433 Terminating: ptr.To[int32](0), 1434 } 1435 validateJobPodsStatus(ctx, t, clientSet, jobObj, want) 1436 // Tracks ready pods, if enabled. 1437 if err, _ := setJobPodsReady(ctx, clientSet, jobObj, 52); err != nil { 1438 t.Fatalf("Failed Marking Pods as ready: %v", err) 1439 } 1440 want.Ready = ptr.To[int32](52) 1441 validateJobPodsStatus(ctx, t, clientSet, jobObj, want) 1442 1443 // Failed Pods are replaced. 1444 if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 2); err != nil { 1445 t.Fatalf("Failed setting phase %s on Job Pods: %v", v1.PodFailed, err) 1446 } 1447 want = podsByStatus{ 1448 Active: 54, 1449 Failed: 2, 1450 Ready: ptr.To[int32](50), 1451 Terminating: ptr.To[int32](0), 1452 } 1453 validateJobPodsStatus(ctx, t, clientSet, jobObj, want) 1454 // Pods are created until the number of succeeded Pods equals completions. 
1455 if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 53); err != nil { 1456 t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodSucceeded, err) 1457 } 1458 want = podsByStatus{ 1459 Failed: 2, 1460 Succeeded: 53, 1461 Active: 3, 1462 Ready: ptr.To[int32](0), 1463 Terminating: ptr.To[int32](0), 1464 } 1465 validateJobPodsStatus(ctx, t, clientSet, jobObj, want) 1466 // No more Pods are created after the Job completes. 1467 if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 3); err != nil { 1468 t.Fatalf("Failed setting phase %s on Job Pods: %v", v1.PodSucceeded, err) 1469 } 1470 validateJobSucceeded(ctx, t, clientSet, jobObj) 1471 want = podsByStatus{ 1472 Failed: 2, 1473 Succeeded: 56, 1474 Ready: ptr.To[int32](0), 1475 Terminating: ptr.To[int32](0), 1476 } 1477 validateJobPodsStatus(ctx, t, clientSet, jobObj, want) 1478 validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj) 1479 validateCounterMetric(ctx, t, metrics.JobFinishedNum, metricLabelsWithValue{ 1480 Labels: []string{"NonIndexed", "succeeded", ""}, 1481 Value: 1, 1482 }) 1483 validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{ 1484 Labels: []string{"NonIndexed", "succeeded"}, 1485 Value: 56, 1486 }) 1487 validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{ 1488 Labels: []string{"NonIndexed", "failed"}, 1489 Value: 2, 1490 }) 1491 } 1492 1493 func TestIndexedJob(t *testing.T) { 1494 t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff)) 1495 closeFn, restConfig, clientSet, ns := setup(t, "indexed") 1496 defer closeFn() 1497 ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig) 1498 defer cancel() 1499 resetMetrics() 1500 1501 mode := batchv1.IndexedCompletion 1502 jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{ 1503 Spec: batchv1.JobSpec{ 1504 Parallelism: ptr.To[int32](3), 1505 Completions: ptr.To[int32](4), 1506 
CompletionMode: &mode, 1507 }, 1508 }) 1509 if err != nil { 1510 t.Fatalf("Failed to create Job: %v", err) 1511 } 1512 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 1513 Active: 3, 1514 Ready: ptr.To[int32](0), 1515 Terminating: ptr.To[int32](0), 1516 }) 1517 validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1, 2), "", nil) 1518 validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{ 1519 Labels: []string{"succeeded", "global"}, 1520 Value: 0, 1521 }) 1522 1523 // One Pod succeeds. 1524 if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil { 1525 t.Fatal("Failed trying to succeed pod with index 1") 1526 } 1527 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 1528 Active: 3, 1529 Succeeded: 1, 1530 Ready: ptr.To[int32](0), 1531 Terminating: ptr.To[int32](0), 1532 }) 1533 validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 2, 3), "1", nil) 1534 validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{ 1535 Labels: []string{"succeeded", "global"}, 1536 Value: 1, 1537 }) 1538 1539 // One Pod fails, which should be recreated. 1540 if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 2); err != nil { 1541 t.Fatal("Failed trying to succeed pod with index 2") 1542 } 1543 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 1544 Active: 3, 1545 Failed: 1, 1546 Succeeded: 1, 1547 Ready: ptr.To[int32](0), 1548 Terminating: ptr.To[int32](0), 1549 }) 1550 validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 2, 3), "1", nil) 1551 validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{ 1552 Labels: []string{"succeeded", "global"}, 1553 Value: 1, 1554 }) 1555 1556 // Remaining Pods succeed. 
1557 if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 3); err != nil { 1558 t.Fatal("Failed trying to succeed remaining pods") 1559 } 1560 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 1561 Active: 0, 1562 Failed: 1, 1563 Succeeded: 4, 1564 Ready: ptr.To[int32](0), 1565 Terminating: ptr.To[int32](0), 1566 }) 1567 validateIndexedJobPods(ctx, t, clientSet, jobObj, nil, "0-3", nil) 1568 validateJobSucceeded(ctx, t, clientSet, jobObj) 1569 validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj) 1570 validateTerminatedPodsTrackingFinalizerMetric(ctx, t, 5) 1571 validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{ 1572 Labels: []string{"succeeded", "global"}, 1573 Value: 4, 1574 }) 1575 validateCounterMetric(ctx, t, metrics.JobFinishedNum, metricLabelsWithValue{ 1576 Labels: []string{"Indexed", "succeeded", ""}, 1577 Value: 1, 1578 }) 1579 validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{ 1580 Labels: []string{"Indexed", "succeeded"}, 1581 Value: 4, 1582 }) 1583 validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{ 1584 Labels: []string{"Indexed", "failed"}, 1585 Value: 1, 1586 }) 1587 } 1588 1589 func TestJobPodReplacementPolicy(t *testing.T) { 1590 indexedCompletion := batchv1.IndexedCompletion 1591 nonIndexedCompletion := batchv1.NonIndexedCompletion 1592 var podReplacementPolicy = func(obj batchv1.PodReplacementPolicy) *batchv1.PodReplacementPolicy { 1593 return &obj 1594 } 1595 type jobStatus struct { 1596 active int 1597 failed int 1598 terminating *int32 1599 } 1600 type jobPodsCreationMetrics struct { 1601 new int 1602 recreateTerminatingOrFailed int 1603 recreateFailed int 1604 } 1605 cases := map[string]struct { 1606 podReplacementPolicyEnabled bool 1607 jobSpec *batchv1.JobSpec 1608 wantStatusAfterDeletion jobStatus 1609 wantStatusAfterFailure jobStatus 1610 wantMetrics jobPodsCreationMetrics 1611 }{ 1612 "feature flag off, delete & 
fail pods, recreate terminating pods, and verify job status counters": { 1613 jobSpec: &batchv1.JobSpec{ 1614 Parallelism: ptr.To[int32](2), 1615 Completions: ptr.To[int32](2), 1616 CompletionMode: &indexedCompletion, 1617 Template: v1.PodTemplateSpec{ 1618 ObjectMeta: metav1.ObjectMeta{ 1619 Finalizers: []string{"fake.example.com/blockDeletion"}, 1620 }, 1621 }, 1622 }, 1623 wantStatusAfterDeletion: jobStatus{ 1624 active: 2, 1625 failed: 2, 1626 }, 1627 wantStatusAfterFailure: jobStatus{ 1628 active: 2, 1629 failed: 2, 1630 }, 1631 wantMetrics: jobPodsCreationMetrics{ 1632 new: 4, 1633 }, 1634 }, 1635 "feature flag true, TerminatingOrFailed policy, delete & fail pods, recreate terminating pods, and verify job status counters": { 1636 podReplacementPolicyEnabled: true, 1637 jobSpec: &batchv1.JobSpec{ 1638 Parallelism: ptr.To[int32](2), 1639 Completions: ptr.To[int32](2), 1640 CompletionMode: &indexedCompletion, 1641 PodReplacementPolicy: podReplacementPolicy(batchv1.TerminatingOrFailed), 1642 Template: v1.PodTemplateSpec{ 1643 ObjectMeta: metav1.ObjectMeta{ 1644 Finalizers: []string{"fake.example.com/blockDeletion"}, 1645 }, 1646 }, 1647 }, 1648 wantStatusAfterDeletion: jobStatus{ 1649 active: 2, 1650 failed: 2, 1651 terminating: ptr.To[int32](2), 1652 }, 1653 wantStatusAfterFailure: jobStatus{ 1654 active: 2, 1655 failed: 2, 1656 terminating: ptr.To[int32](0), 1657 }, 1658 wantMetrics: jobPodsCreationMetrics{ 1659 new: 2, 1660 recreateTerminatingOrFailed: 2, 1661 }, 1662 }, 1663 "feature flag true with NonIndexedJob, TerminatingOrFailed policy, delete & fail pods, recreate terminating pods, and verify job status counters": { 1664 podReplacementPolicyEnabled: true, 1665 jobSpec: &batchv1.JobSpec{ 1666 Parallelism: ptr.To[int32](2), 1667 Completions: ptr.To[int32](2), 1668 CompletionMode: &indexedCompletion, 1669 PodReplacementPolicy: podReplacementPolicy(batchv1.TerminatingOrFailed), 1670 Template: v1.PodTemplateSpec{ 1671 ObjectMeta: metav1.ObjectMeta{ 1672 
Finalizers: []string{"fake.example.com/blockDeletion"}, 1673 }, 1674 }, 1675 }, 1676 wantStatusAfterDeletion: jobStatus{ 1677 active: 2, 1678 failed: 2, 1679 terminating: ptr.To[int32](2), 1680 }, 1681 wantStatusAfterFailure: jobStatus{ 1682 active: 2, 1683 failed: 2, 1684 terminating: ptr.To[int32](0), 1685 }, 1686 wantMetrics: jobPodsCreationMetrics{ 1687 new: 2, 1688 recreateTerminatingOrFailed: 2, 1689 }, 1690 }, 1691 "feature flag false, podFailurePolicy enabled, delete & fail pods, recreate failed pods, and verify job status counters": { 1692 podReplacementPolicyEnabled: false, 1693 jobSpec: &batchv1.JobSpec{ 1694 Parallelism: ptr.To[int32](2), 1695 Completions: ptr.To[int32](2), 1696 CompletionMode: &nonIndexedCompletion, 1697 PodReplacementPolicy: podReplacementPolicy(batchv1.Failed), 1698 Template: v1.PodTemplateSpec{ 1699 ObjectMeta: metav1.ObjectMeta{ 1700 Finalizers: []string{"fake.example.com/blockDeletion"}, 1701 }, 1702 }, 1703 PodFailurePolicy: &batchv1.PodFailurePolicy{ 1704 Rules: []batchv1.PodFailurePolicyRule{ 1705 { 1706 Action: batchv1.PodFailurePolicyActionFailJob, 1707 OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{ 1708 Operator: batchv1.PodFailurePolicyOnExitCodesOpIn, 1709 Values: []int32{5}, 1710 }, 1711 }, 1712 }, 1713 }, 1714 }, 1715 wantStatusAfterDeletion: jobStatus{ 1716 active: 2, 1717 }, 1718 wantStatusAfterFailure: jobStatus{ 1719 active: 2, 1720 }, 1721 wantMetrics: jobPodsCreationMetrics{ 1722 new: 2, 1723 }, 1724 }, 1725 "feature flag true, Failed policy, delete & fail pods, recreate failed pods, and verify job status counters": { 1726 podReplacementPolicyEnabled: true, 1727 jobSpec: &batchv1.JobSpec{ 1728 Parallelism: ptr.To[int32](2), 1729 Completions: ptr.To[int32](2), 1730 CompletionMode: &indexedCompletion, 1731 PodReplacementPolicy: podReplacementPolicy(batchv1.Failed), 1732 Template: v1.PodTemplateSpec{ 1733 ObjectMeta: metav1.ObjectMeta{ 1734 Finalizers: []string{"fake.example.com/blockDeletion"}, 1735 }, 
1736 }, 1737 }, 1738 wantStatusAfterDeletion: jobStatus{ 1739 active: 0, 1740 failed: 0, 1741 terminating: ptr.To[int32](2), 1742 }, 1743 wantStatusAfterFailure: jobStatus{ 1744 active: 2, 1745 failed: 2, 1746 terminating: ptr.To[int32](0), 1747 }, 1748 wantMetrics: jobPodsCreationMetrics{ 1749 new: 2, 1750 recreateFailed: 2, 1751 }, 1752 }, 1753 "feature flag true with NonIndexedJob, Failed policy, delete & fail pods, recreate failed pods, and verify job status counters": { 1754 podReplacementPolicyEnabled: true, 1755 jobSpec: &batchv1.JobSpec{ 1756 Parallelism: ptr.To[int32](2), 1757 Completions: ptr.To[int32](2), 1758 CompletionMode: &nonIndexedCompletion, 1759 PodReplacementPolicy: podReplacementPolicy(batchv1.Failed), 1760 Template: v1.PodTemplateSpec{ 1761 ObjectMeta: metav1.ObjectMeta{ 1762 Finalizers: []string{"fake.example.com/blockDeletion"}, 1763 }, 1764 }, 1765 }, 1766 wantStatusAfterDeletion: jobStatus{ 1767 active: 0, 1768 failed: 0, 1769 terminating: ptr.To[int32](2), 1770 }, 1771 wantStatusAfterFailure: jobStatus{ 1772 active: 2, 1773 failed: 2, 1774 terminating: ptr.To[int32](0), 1775 }, 1776 wantMetrics: jobPodsCreationMetrics{ 1777 new: 2, 1778 recreateFailed: 2, 1779 }, 1780 }, 1781 } 1782 for name, tc := range cases { 1783 tc := tc 1784 t.Run(name, func(t *testing.T) { 1785 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodReplacementPolicy, tc.podReplacementPolicyEnabled)() 1786 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, tc.jobSpec.PodFailurePolicy != nil)() 1787 1788 closeFn, restConfig, clientSet, ns := setup(t, "pod-replacement-policy") 1789 t.Cleanup(closeFn) 1790 ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig) 1791 t.Cleanup(cancel) 1792 resetMetrics() 1793 1794 jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{ 1795 Spec: *tc.jobSpec, 1796 }) 1797 if err != nil { 1798 t.Fatalf("Failed 
to create Job: %v", err) 1799 } 1800 jobClient := clientSet.BatchV1().Jobs(jobObj.Namespace) 1801 1802 waitForPodsToBeActive(ctx, t, jobClient, 2, jobObj) 1803 t.Cleanup(func() { removePodsFinalizer(ctx, t, clientSet, ns.Name) }) 1804 1805 deletePods(ctx, t, clientSet, ns.Name) 1806 1807 validateJobsPodsStatusOnly(ctx, t, clientSet, jobObj, podsByStatus{ 1808 Terminating: tc.wantStatusAfterDeletion.terminating, 1809 Failed: tc.wantStatusAfterDeletion.failed, 1810 Active: tc.wantStatusAfterDeletion.active, 1811 Ready: ptr.To[int32](0), 1812 }) 1813 1814 failTerminatingPods(ctx, t, clientSet, ns.Name) 1815 validateJobsPodsStatusOnly(ctx, t, clientSet, jobObj, podsByStatus{ 1816 Terminating: tc.wantStatusAfterFailure.terminating, 1817 Failed: tc.wantStatusAfterFailure.failed, 1818 Active: tc.wantStatusAfterFailure.active, 1819 Ready: ptr.To[int32](0), 1820 }) 1821 1822 validateCounterMetric( 1823 ctx, 1824 t, 1825 metrics.JobPodsCreationTotal, 1826 metricLabelsWithValue{Labels: []string{"new", "succeeded"}, Value: tc.wantMetrics.new}, 1827 ) 1828 validateCounterMetric( 1829 ctx, 1830 t, 1831 metrics.JobPodsCreationTotal, 1832 metricLabelsWithValue{Labels: []string{"recreate_terminating_or_failed", "succeeded"}, Value: tc.wantMetrics.recreateTerminatingOrFailed}, 1833 ) 1834 validateCounterMetric( 1835 ctx, 1836 t, 1837 metrics.JobPodsCreationTotal, 1838 metricLabelsWithValue{Labels: []string{"recreate_failed", "succeeded"}, Value: tc.wantMetrics.recreateFailed}, 1839 ) 1840 }) 1841 } 1842 } 1843 1844 // This tests the feature enable -> disable -> enable path for PodReplacementPolicy. 1845 // We verify that Failed case works as expected when turned on. 1846 // Disable reverts to previous behavior. 1847 // Enabling will then match the original failed case. 
1848 func TestJobPodReplacementPolicyFeatureToggling(t *testing.T) { 1849 const podCount int32 = 2 1850 jobSpec := batchv1.JobSpec{ 1851 Parallelism: ptr.To(podCount), 1852 Completions: ptr.To(podCount), 1853 CompletionMode: ptr.To(batchv1.NonIndexedCompletion), 1854 PodReplacementPolicy: ptr.To(batchv1.Failed), 1855 } 1856 wantTerminating := ptr.To(podCount) 1857 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodReplacementPolicy, true)() 1858 closeFn, restConfig, clientSet, ns := setup(t, "pod-replacement-policy") 1859 defer closeFn() 1860 ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig) 1861 defer func() { 1862 cancel() 1863 }() 1864 resetMetrics() 1865 1866 jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{ 1867 Spec: jobSpec, 1868 }) 1869 if err != nil { 1870 t.Fatalf("Failed to create Job: %v", err) 1871 } 1872 jobClient := clientSet.BatchV1().Jobs(jobObj.Namespace) 1873 1874 waitForPodsToBeActive(ctx, t, jobClient, 2, jobObj) 1875 deletePods(ctx, t, clientSet, jobObj.Namespace) 1876 validateJobsPodsStatusOnly(ctx, t, clientSet, jobObj, podsByStatus{ 1877 Terminating: wantTerminating, 1878 Failed: 0, 1879 Ready: ptr.To[int32](0), 1880 }) 1881 // Disable controller and turn feature off. 1882 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodReplacementPolicy, false)() 1883 cancel() 1884 ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig) 1885 1886 validateJobsPodsStatusOnly(ctx, t, clientSet, jobObj, podsByStatus{ 1887 Terminating: nil, 1888 Failed: int(podCount), 1889 Ready: ptr.To[int32](0), 1890 Active: int(podCount), 1891 }) 1892 // Disable the controller and turn feature on again. 
1893 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodReplacementPolicy, true)() 1894 cancel() 1895 ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig) 1896 waitForPodsToBeActive(ctx, t, jobClient, 2, jobObj) 1897 deletePods(ctx, t, clientSet, jobObj.Namespace) 1898 1899 validateJobsPodsStatusOnly(ctx, t, clientSet, jobObj, podsByStatus{ 1900 Terminating: wantTerminating, 1901 Failed: int(podCount), 1902 Active: 0, 1903 Ready: ptr.To[int32](0), 1904 }) 1905 } 1906 1907 func TestElasticIndexedJob(t *testing.T) { 1908 const initialCompletions int32 = 3 1909 type jobUpdate struct { 1910 completions *int32 1911 succeedIndexes []int 1912 failIndexes []int 1913 wantSucceededIndexes string 1914 wantFailed int 1915 wantRemainingIndexes sets.Set[int] 1916 wantActivePods int 1917 } 1918 cases := map[string]struct { 1919 featureGate bool 1920 jobUpdates []jobUpdate 1921 wantErr *apierrors.StatusError 1922 }{ 1923 "feature flag off, mutation not allowed": { 1924 jobUpdates: []jobUpdate{ 1925 { 1926 completions: ptr.To[int32](4), 1927 }, 1928 }, 1929 wantErr: apierrors.NewInvalid( 1930 schema.GroupKind{Group: "batch", Kind: "Job"}, 1931 "test-job", 1932 field.ErrorList{field.Invalid(field.NewPath("spec", "completions"), 4, "field is immutable")}, 1933 ), 1934 }, 1935 "scale up": { 1936 featureGate: true, 1937 jobUpdates: []jobUpdate{ 1938 { 1939 // Scale up completions 3->4 then succeed indexes 0-3 1940 completions: ptr.To[int32](4), 1941 succeedIndexes: []int{0, 1, 2, 3}, 1942 wantSucceededIndexes: "0-3", 1943 }, 1944 }, 1945 }, 1946 "scale down": { 1947 featureGate: true, 1948 jobUpdates: []jobUpdate{ 1949 // First succeed index 1 and fail index 2 while completions is still original value (3). 
1950 { 1951 succeedIndexes: []int{1}, 1952 failIndexes: []int{2}, 1953 wantSucceededIndexes: "1", 1954 wantFailed: 1, 1955 wantRemainingIndexes: sets.New(0, 2), 1956 wantActivePods: 2, 1957 }, 1958 // Scale down completions 3->1, verify prev failure out of range still counts 1959 // but succeeded out of range does not. 1960 { 1961 completions: ptr.To[int32](1), 1962 succeedIndexes: []int{0}, 1963 wantSucceededIndexes: "0", 1964 wantFailed: 1, 1965 }, 1966 }, 1967 }, 1968 "index finishes successfully, scale down, scale up": { 1969 featureGate: true, 1970 jobUpdates: []jobUpdate{ 1971 // First succeed index 2 while completions is still original value (3). 1972 { 1973 succeedIndexes: []int{2}, 1974 wantSucceededIndexes: "2", 1975 wantRemainingIndexes: sets.New(0, 1), 1976 wantActivePods: 2, 1977 }, 1978 // Scale completions down 3->2 to exclude previously succeeded index. 1979 { 1980 completions: ptr.To[int32](2), 1981 wantRemainingIndexes: sets.New(0, 1), 1982 wantActivePods: 2, 1983 }, 1984 // Scale completions back up to include previously succeeded index that was temporarily out of range. 1985 { 1986 completions: ptr.To[int32](3), 1987 succeedIndexes: []int{0, 1, 2}, 1988 wantSucceededIndexes: "0-2", 1989 }, 1990 }, 1991 }, 1992 "scale down to 0, verify that the job succeeds": { 1993 featureGate: true, 1994 jobUpdates: []jobUpdate{ 1995 { 1996 completions: ptr.To[int32](0), 1997 }, 1998 }, 1999 }, 2000 } 2001 2002 for name, tc := range cases { 2003 tc := tc 2004 t.Run(name, func(t *testing.T) { 2005 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.ElasticIndexedJob, tc.featureGate)() 2006 closeFn, restConfig, clientSet, ns := setup(t, "indexed") 2007 defer closeFn() 2008 ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig) 2009 defer cancel() 2010 resetMetrics() 2011 2012 // Set up initial Job in Indexed completion mode. 
2013 mode := batchv1.IndexedCompletion 2014 jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{ 2015 Spec: batchv1.JobSpec{ 2016 Parallelism: ptr.To(initialCompletions), 2017 Completions: ptr.To(initialCompletions), 2018 CompletionMode: &mode, 2019 }, 2020 }) 2021 if err != nil { 2022 t.Fatalf("Failed to create Job: %v", err) 2023 } 2024 jobClient := clientSet.BatchV1().Jobs(jobObj.Namespace) 2025 2026 // Wait for pods to start up. 2027 err = wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, wait.ForeverTestTimeout, true, func(ctx context.Context) (done bool, err error) { 2028 job, err := jobClient.Get(ctx, jobObj.Name, metav1.GetOptions{}) 2029 if err != nil { 2030 return false, err 2031 } 2032 if job.Status.Active == initialCompletions { 2033 return true, nil 2034 } 2035 return false, nil 2036 }) 2037 if err != nil { 2038 t.Fatalf("Error waiting for Job pods to become active: %v", err) 2039 } 2040 2041 for _, update := range tc.jobUpdates { 2042 // Update Job spec if necessary. 2043 if update.completions != nil { 2044 if jobObj, err = updateJob(ctx, jobClient, jobObj.Name, func(j *batchv1.Job) { 2045 j.Spec.Completions = update.completions 2046 j.Spec.Parallelism = update.completions 2047 }); err != nil { 2048 if diff := cmp.Diff(tc.wantErr, err); diff != "" { 2049 t.Fatalf("Unexpected or missing errors (-want/+got): %s", diff) 2050 } 2051 return 2052 } 2053 } 2054 2055 // Succeed specified indexes. 2056 for _, idx := range update.succeedIndexes { 2057 if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, idx); err != nil { 2058 t.Fatalf("Failed trying to succeed pod with index %d: %v", idx, err) 2059 } 2060 } 2061 2062 // Fail specified indexes. 
2063 for _, idx := range update.failIndexes { 2064 if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, idx); err != nil { 2065 t.Fatalf("Failed trying to fail pod with index %d: %v", idx, err) 2066 } 2067 } 2068 2069 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 2070 Active: update.wantActivePods, 2071 Succeeded: len(update.succeedIndexes), 2072 Failed: update.wantFailed, 2073 Ready: ptr.To[int32](0), 2074 Terminating: ptr.To[int32](0), 2075 }) 2076 validateIndexedJobPods(ctx, t, clientSet, jobObj, update.wantRemainingIndexes, update.wantSucceededIndexes, nil) 2077 } 2078 2079 validateJobSucceeded(ctx, t, clientSet, jobObj) 2080 }) 2081 } 2082 } 2083 2084 // BenchmarkLargeIndexedJob benchmarks the completion of an Indexed Job. 2085 // We expect that large jobs are more commonly used as Indexed. And they are 2086 // also faster to track, as they need less API calls. 2087 func BenchmarkLargeIndexedJob(b *testing.B) { 2088 closeFn, restConfig, clientSet, ns := setup(b, "indexed") 2089 restConfig.QPS = 100 2090 restConfig.Burst = 100 2091 defer closeFn() 2092 ctx, cancel := startJobControllerAndWaitForCaches(b, restConfig) 2093 defer cancel() 2094 backoff := wait.Backoff{ 2095 Duration: time.Second, 2096 Factor: 1.5, 2097 Steps: 30, 2098 Cap: 5 * time.Minute, 2099 } 2100 cases := map[string]struct { 2101 nPods int32 2102 backoffLimitPerIndex *int32 2103 }{ 2104 "regular indexed job without failures; size=10": { 2105 nPods: 10, 2106 }, 2107 "job with backoffLimitPerIndex without failures; size=10": { 2108 nPods: 10, 2109 backoffLimitPerIndex: ptr.To[int32](1), 2110 }, 2111 "regular indexed job without failures; size=100": { 2112 nPods: 100, 2113 }, 2114 "job with backoffLimitPerIndex without failures; size=100": { 2115 nPods: 100, 2116 backoffLimitPerIndex: ptr.To[int32](1), 2117 }, 2118 } 2119 mode := batchv1.IndexedCompletion 2120 for name, tc := range cases { 2121 b.Run(name, func(b *testing.B) { 2122 enableJobBackoffLimitPerIndex := 
tc.backoffLimitPerIndex != nil 2123 defer featuregatetesting.SetFeatureGateDuringTest(b, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, enableJobBackoffLimitPerIndex)() 2124 b.ResetTimer() 2125 for n := 0; n < b.N; n++ { 2126 b.StartTimer() 2127 jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{ 2128 ObjectMeta: metav1.ObjectMeta{ 2129 Name: fmt.Sprintf("npods-%d-%d-%v", tc.nPods, n, enableJobBackoffLimitPerIndex), 2130 }, 2131 Spec: batchv1.JobSpec{ 2132 Parallelism: ptr.To(tc.nPods), 2133 Completions: ptr.To(tc.nPods), 2134 CompletionMode: &mode, 2135 BackoffLimitPerIndex: tc.backoffLimitPerIndex, 2136 }, 2137 }) 2138 if err != nil { 2139 b.Fatalf("Failed to create Job: %v", err) 2140 } 2141 b.Cleanup(func() { 2142 if err := cleanUp(ctx, clientSet, jobObj); err != nil { 2143 b.Fatalf("Failed cleanup: %v", err) 2144 } 2145 }) 2146 remaining := int(tc.nPods) 2147 if err := wait.ExponentialBackoff(backoff, func() (done bool, err error) { 2148 if err, succ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, remaining); err != nil { 2149 remaining -= succ 2150 b.Logf("Transient failure succeeding pods: %v", err) 2151 return false, nil 2152 } 2153 return true, nil 2154 }); err != nil { 2155 b.Fatalf("Could not succeed the remaining %d pods: %v", remaining, err) 2156 } 2157 validateJobSucceeded(ctx, b, clientSet, jobObj) 2158 b.StopTimer() 2159 } 2160 }) 2161 } 2162 } 2163 2164 // BenchmarkLargeFailureHandling benchmarks the handling of numerous pod failures 2165 // of an Indexed Job. We set minimal backoff delay to make the job controller 2166 // performance comparable for indexed jobs with global backoffLimit, and those 2167 // with backoffLimit per-index, despite different patterns of handling failures. 
2168 func BenchmarkLargeFailureHandling(b *testing.B) { 2169 b.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff)) 2170 b.Cleanup(setDurationDuringTest(&jobcontroller.MaxJobPodFailureBackOff, fastPodFailureBackoff)) 2171 closeFn, restConfig, clientSet, ns := setup(b, "indexed") 2172 restConfig.QPS = 100 2173 restConfig.Burst = 100 2174 defer closeFn() 2175 ctx, cancel := startJobControllerAndWaitForCaches(b, restConfig) 2176 defer cancel() 2177 backoff := wait.Backoff{ 2178 Duration: time.Second, 2179 Factor: 1.5, 2180 Steps: 30, 2181 Cap: 5 * time.Minute, 2182 } 2183 cases := map[string]struct { 2184 nPods int32 2185 backoffLimitPerIndex *int32 2186 customTimeout *time.Duration 2187 }{ 2188 "regular indexed job with failures; size=10": { 2189 nPods: 10, 2190 }, 2191 "job with backoffLimitPerIndex with failures; size=10": { 2192 nPods: 10, 2193 backoffLimitPerIndex: ptr.To[int32](1), 2194 }, 2195 "regular indexed job with failures; size=100": { 2196 nPods: 100, 2197 }, 2198 "job with backoffLimitPerIndex with failures; size=100": { 2199 nPods: 100, 2200 backoffLimitPerIndex: ptr.To[int32](1), 2201 }, 2202 } 2203 mode := batchv1.IndexedCompletion 2204 for name, tc := range cases { 2205 b.Run(name, func(b *testing.B) { 2206 enableJobBackoffLimitPerIndex := tc.backoffLimitPerIndex != nil 2207 timeout := ptr.Deref(tc.customTimeout, wait.ForeverTestTimeout) 2208 defer featuregatetesting.SetFeatureGateDuringTest(b, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, enableJobBackoffLimitPerIndex)() 2209 b.ResetTimer() 2210 for n := 0; n < b.N; n++ { 2211 b.StopTimer() 2212 jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{ 2213 ObjectMeta: metav1.ObjectMeta{ 2214 Name: fmt.Sprintf("npods-%d-%d-%v", tc.nPods, n, enableJobBackoffLimitPerIndex), 2215 }, 2216 Spec: batchv1.JobSpec{ 2217 Parallelism: ptr.To(tc.nPods), 2218 Completions: ptr.To(tc.nPods), 2219 CompletionMode: &mode, 2220 
BackoffLimitPerIndex: tc.backoffLimitPerIndex, 2221 BackoffLimit: ptr.To(tc.nPods), 2222 }, 2223 }) 2224 if err != nil { 2225 b.Fatalf("Failed to create Job: %v", err) 2226 } 2227 b.Cleanup(func() { 2228 if err := cleanUp(ctx, clientSet, jobObj); err != nil { 2229 b.Fatalf("Failed cleanup: %v", err) 2230 } 2231 }) 2232 validateJobsPodsStatusOnlyWithTimeout(ctx, b, clientSet, jobObj, podsByStatus{ 2233 Active: int(tc.nPods), 2234 Ready: ptr.To[int32](0), 2235 Terminating: ptr.To[int32](0), 2236 }, timeout) 2237 2238 b.StartTimer() 2239 remaining := int(tc.nPods) 2240 if err := wait.ExponentialBackoff(backoff, func() (done bool, err error) { 2241 if err, fail := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, remaining); err != nil { 2242 remaining -= fail 2243 b.Logf("Transient failure failing pods: %v", err) 2244 return false, nil 2245 } 2246 return true, nil 2247 }); err != nil { 2248 b.Fatalf("Could not succeed the remaining %d pods: %v", remaining, err) 2249 } 2250 validateJobsPodsStatusOnlyWithTimeout(ctx, b, clientSet, jobObj, podsByStatus{ 2251 Active: int(tc.nPods), 2252 Ready: ptr.To[int32](0), 2253 Failed: int(tc.nPods), 2254 Terminating: ptr.To[int32](0), 2255 }, timeout) 2256 b.StopTimer() 2257 } 2258 }) 2259 } 2260 } 2261 2262 // cleanUp deletes all pods and the job 2263 func cleanUp(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job) error { 2264 // Clean up pods in pages, because DeleteCollection might timeout. 
2265 // #90743 2266 for { 2267 pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{Limit: 1}) 2268 if err != nil { 2269 return err 2270 } 2271 if len(pods.Items) == 0 { 2272 break 2273 } 2274 err = clientSet.CoreV1().Pods(jobObj.Namespace).DeleteCollection(ctx, 2275 metav1.DeleteOptions{}, 2276 metav1.ListOptions{ 2277 Limit: 1000, 2278 }) 2279 if err != nil { 2280 return err 2281 } 2282 } 2283 return clientSet.BatchV1().Jobs(jobObj.Namespace).Delete(ctx, jobObj.Name, metav1.DeleteOptions{}) 2284 } 2285 2286 func TestOrphanPodsFinalizersClearedWithGC(t *testing.T) { 2287 for _, policy := range []metav1.DeletionPropagation{metav1.DeletePropagationOrphan, metav1.DeletePropagationBackground, metav1.DeletePropagationForeground} { 2288 t.Run(string(policy), func(t *testing.T) { 2289 closeFn, restConfig, clientSet, ns := setup(t, "simple") 2290 defer closeFn() 2291 informerSet := informers.NewSharedInformerFactory(clientset.NewForConfigOrDie(restclient.AddUserAgent(restConfig, "controller-informers")), 0) 2292 // Make the job controller significantly slower to trigger race condition. 2293 restConfig.QPS = 1 2294 restConfig.Burst = 1 2295 jc, ctx, cancel := createJobControllerWithSharedInformers(t, restConfig, informerSet) 2296 resetMetrics() 2297 defer cancel() 2298 restConfig.QPS = 200 2299 restConfig.Burst = 200 2300 runGC := util.CreateGCController(ctx, t, *restConfig, informerSet) 2301 informerSet.Start(ctx.Done()) 2302 go jc.Run(ctx, 1) 2303 runGC() 2304 2305 jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{ 2306 Spec: batchv1.JobSpec{ 2307 Parallelism: ptr.To[int32](2), 2308 }, 2309 }) 2310 if err != nil { 2311 t.Fatalf("Failed to create Job: %v", err) 2312 } 2313 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 2314 Active: 2, 2315 Ready: ptr.To[int32](0), 2316 Terminating: ptr.To[int32](0), 2317 }) 2318 2319 // Delete Job. The GC should delete the pods in cascade. 
2320 err = clientSet.BatchV1().Jobs(jobObj.Namespace).Delete(ctx, jobObj.Name, metav1.DeleteOptions{ 2321 PropagationPolicy: &policy, 2322 }) 2323 if err != nil { 2324 t.Fatalf("Failed to delete job: %v", err) 2325 } 2326 validateNoOrphanPodsWithFinalizers(ctx, t, clientSet, jobObj) 2327 // Pods never finished, so they are not counted in the metric. 2328 validateTerminatedPodsTrackingFinalizerMetric(ctx, t, 0) 2329 }) 2330 } 2331 } 2332 2333 func TestFinalizersClearedWhenBackoffLimitExceeded(t *testing.T) { 2334 // Set a maximum number of uncounted pods below parallelism, to ensure it 2335 // doesn't affect the termination of pods. 2336 t.Cleanup(setDuringTest(&jobcontroller.MaxUncountedPods, 50)) 2337 closeFn, restConfig, clientSet, ns := setup(t, "simple") 2338 defer closeFn() 2339 ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig) 2340 defer cancel() 2341 2342 // Job tracking with finalizers requires less calls in Indexed mode, 2343 // so it's more likely to process all finalizers before all the pods 2344 // are visible. 2345 mode := batchv1.IndexedCompletion 2346 jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{ 2347 Spec: batchv1.JobSpec{ 2348 CompletionMode: &mode, 2349 Completions: ptr.To[int32](100), 2350 Parallelism: ptr.To[int32](100), 2351 BackoffLimit: ptr.To[int32](0), 2352 }, 2353 }) 2354 if err != nil { 2355 t.Fatalf("Could not create job: %v", err) 2356 } 2357 2358 // Fail a pod ASAP. 
2359 err = wait.PollUntilContextTimeout(ctx, time.Millisecond, wait.ForeverTestTimeout, true, func(ctx context.Context) (done bool, err error) { 2360 if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil { 2361 return false, nil 2362 } 2363 return true, nil 2364 }) 2365 if err != nil { 2366 t.Fatalf("Could not fail pod: %v", err) 2367 } 2368 2369 validateJobFailed(ctx, t, clientSet, jobObj) 2370 validateCounterMetric(ctx, t, metrics.JobFinishedNum, metricLabelsWithValue{ 2371 Labels: []string{"Indexed", "failed", "BackoffLimitExceeded"}, 2372 Value: 1, 2373 }) 2374 2375 validateNoOrphanPodsWithFinalizers(ctx, t, clientSet, jobObj) 2376 } 2377 2378 func TestJobPodsCreatedWithExponentialBackoff(t *testing.T) { 2379 t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, 2*time.Second)) 2380 closeFn, restConfig, clientSet, ns := setup(t, "simple") 2381 defer closeFn() 2382 ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig) 2383 defer cancel() 2384 2385 jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{}) 2386 if err != nil { 2387 t.Fatalf("Could not create job: %v", err) 2388 } 2389 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 2390 Active: 1, 2391 Ready: ptr.To[int32](0), 2392 Terminating: ptr.To[int32](0), 2393 }) 2394 2395 // Fail the first pod 2396 if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil { 2397 t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err) 2398 } 2399 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 2400 Active: 1, 2401 Ready: ptr.To[int32](0), 2402 Failed: 1, 2403 Terminating: ptr.To[int32](0), 2404 }) 2405 2406 // Fail the second pod 2407 if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil { 2408 t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err) 2409 } 2410 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 2411 Active: 
1, 2412 Ready: ptr.To[int32](0), 2413 Failed: 2, 2414 Terminating: ptr.To[int32](0), 2415 }) 2416 2417 jobPods, err := getJobPods(ctx, t, clientSet, jobObj, func(ps v1.PodStatus) bool { return true }) 2418 if err != nil { 2419 t.Fatalf("Failed to list Job Pods: %v", err) 2420 } 2421 if len(jobPods) != 3 { 2422 t.Fatalf("Expected to get %v pods, received %v", 4, len(jobPods)) 2423 } 2424 validateExpotentialBackoffDelay(t, jobcontroller.DefaultJobPodFailureBackOff, jobPods) 2425 } 2426 2427 func validateExpotentialBackoffDelay(t *testing.T, defaultPodFailureBackoff time.Duration, pods []*v1.Pod) { 2428 t.Helper() 2429 creationTime := []time.Time{} 2430 finishTime := []time.Time{} 2431 for _, pod := range pods { 2432 creationTime = append(creationTime, pod.CreationTimestamp.Time) 2433 if len(pod.Status.ContainerStatuses) > 0 { 2434 finishTime = append(finishTime, pod.Status.ContainerStatuses[0].State.Terminated.FinishedAt.Time) 2435 } 2436 } 2437 2438 sort.Slice(creationTime, func(i, j int) bool { 2439 return creationTime[i].Before(creationTime[j]) 2440 }) 2441 sort.Slice(finishTime, func(i, j int) bool { 2442 return finishTime[i].Before(finishTime[j]) 2443 }) 2444 2445 diff := creationTime[1].Sub(finishTime[0]) 2446 2447 if diff < defaultPodFailureBackoff { 2448 t.Fatalf("Second pod should be created at least %v seconds after the first pod, time difference: %v", defaultPodFailureBackoff, diff) 2449 } 2450 2451 if diff >= 2*defaultPodFailureBackoff { 2452 t.Fatalf("Second pod should be created before %v seconds after the first pod, time difference: %v", 2*defaultPodFailureBackoff, diff) 2453 } 2454 2455 diff = creationTime[2].Sub(finishTime[1]) 2456 2457 if diff < 2*defaultPodFailureBackoff { 2458 t.Fatalf("Third pod should be created at least %v seconds after the second pod, time difference: %v", 2*defaultPodFailureBackoff, diff) 2459 } 2460 2461 if diff >= 4*defaultPodFailureBackoff { 2462 t.Fatalf("Third pod should be created before %v seconds after the second pod, 
time difference: %v", 4*defaultPodFailureBackoff, diff) 2463 } 2464 } 2465 2466 // TestJobFailedWithInterrupts tests that a job were one pod fails and the rest 2467 // succeed is marked as Failed, even if the controller fails in the middle. 2468 func TestJobFailedWithInterrupts(t *testing.T) { 2469 closeFn, restConfig, clientSet, ns := setup(t, "simple") 2470 defer closeFn() 2471 ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig) 2472 defer func() { 2473 cancel() 2474 }() 2475 jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{ 2476 Spec: batchv1.JobSpec{ 2477 Completions: ptr.To[int32](10), 2478 Parallelism: ptr.To[int32](10), 2479 BackoffLimit: ptr.To[int32](0), 2480 Template: v1.PodTemplateSpec{ 2481 Spec: v1.PodSpec{ 2482 NodeName: "foo", // Scheduled pods are not deleted immediately. 2483 }, 2484 }, 2485 }, 2486 }) 2487 if err != nil { 2488 t.Fatalf("Could not create job: %v", err) 2489 } 2490 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 2491 Active: 10, 2492 Ready: ptr.To[int32](0), 2493 Terminating: ptr.To[int32](0), 2494 }) 2495 t.Log("Finishing pods") 2496 if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil { 2497 t.Fatalf("Could not fail a pod: %v", err) 2498 } 2499 remaining := 9 2500 if err := wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, wait.ForeverTestTimeout, true, func(ctx context.Context) (done bool, err error) { 2501 if err, succ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, remaining); err != nil { 2502 remaining -= succ 2503 t.Logf("Transient failure succeeding pods: %v", err) 2504 return false, nil 2505 } 2506 return true, nil 2507 }); err != nil { 2508 t.Fatalf("Could not succeed the remaining %d pods: %v", remaining, err) 2509 } 2510 t.Log("Recreating job controller") 2511 cancel() 2512 ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig) 2513 validateJobCondition(ctx, t, clientSet, jobObj, batchv1.JobFailed) 2514 } 2515 
2516 func validateNoOrphanPodsWithFinalizers(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job) { 2517 t.Helper() 2518 orphanPods := 0 2519 if err := wait.PollUntilContextTimeout(ctx, waitInterval, wait.ForeverTestTimeout, true, func(ctx context.Context) (done bool, err error) { 2520 pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{ 2521 LabelSelector: metav1.FormatLabelSelector(jobObj.Spec.Selector), 2522 }) 2523 if err != nil { 2524 return false, err 2525 } 2526 orphanPods = 0 2527 for _, pod := range pods.Items { 2528 if hasJobTrackingFinalizer(&pod) { 2529 orphanPods++ 2530 } 2531 } 2532 return orphanPods == 0, nil 2533 }); err != nil { 2534 t.Errorf("Failed waiting for pods to be freed from finalizer: %v", err) 2535 t.Logf("Last saw %d orphan pods", orphanPods) 2536 } 2537 } 2538 2539 func TestOrphanPodsFinalizersClearedOnRestart(t *testing.T) { 2540 // Step 0: create job. 2541 closeFn, restConfig, clientSet, ns := setup(t, "simple") 2542 defer closeFn() 2543 ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig) 2544 defer func() { 2545 cancel() 2546 }() 2547 2548 jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{ 2549 Spec: batchv1.JobSpec{ 2550 Parallelism: ptr.To[int32](1), 2551 }, 2552 }) 2553 if err != nil { 2554 t.Fatalf("Failed to create Job: %v", err) 2555 } 2556 validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{ 2557 Active: 1, 2558 Ready: ptr.To[int32](0), 2559 Terminating: ptr.To[int32](0), 2560 }) 2561 2562 // Step 2: Delete the Job while the controller is stopped. 2563 cancel() 2564 2565 err = clientSet.BatchV1().Jobs(jobObj.Namespace).Delete(context.Background(), jobObj.Name, metav1.DeleteOptions{}) 2566 if err != nil { 2567 t.Fatalf("Failed to delete job: %v", err) 2568 } 2569 2570 // Step 3: Restart controller. 
2571 ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig) 2572 validateNoOrphanPodsWithFinalizers(ctx, t, clientSet, jobObj) 2573 } 2574 2575 func TestSuspendJob(t *testing.T) { 2576 type step struct { 2577 flag bool 2578 wantActive int 2579 wantStatus v1.ConditionStatus 2580 wantReason string 2581 } 2582 testCases := []struct { 2583 featureGate bool 2584 create step 2585 update step 2586 }{ 2587 // Exhaustively test all combinations other than trivial true->true and 2588 // false->false cases. 2589 { 2590 create: step{flag: false, wantActive: 2}, 2591 update: step{flag: true, wantActive: 0, wantStatus: v1.ConditionTrue, wantReason: "Suspended"}, 2592 }, 2593 { 2594 create: step{flag: true, wantActive: 0, wantStatus: v1.ConditionTrue, wantReason: "Suspended"}, 2595 update: step{flag: false, wantActive: 2, wantStatus: v1.ConditionFalse, wantReason: "Resumed"}, 2596 }, 2597 } 2598 2599 for _, tc := range testCases { 2600 name := fmt.Sprintf("feature=%v,create=%v,update=%v", tc.featureGate, tc.create.flag, tc.update.flag) 2601 t.Run(name, func(t *testing.T) { 2602 closeFn, restConfig, clientSet, ns := setup(t, "suspend") 2603 defer closeFn() 2604 ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig) 2605 defer cancel() 2606 events, err := clientSet.EventsV1().Events(ns.Name).Watch(ctx, metav1.ListOptions{}) 2607 if err != nil { 2608 t.Fatal(err) 2609 } 2610 defer events.Stop() 2611 2612 parallelism := int32(2) 2613 job, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{ 2614 Spec: batchv1.JobSpec{ 2615 Parallelism: ptr.To(parallelism), 2616 Completions: ptr.To[int32](4), 2617 Suspend: ptr.To(tc.create.flag), 2618 }, 2619 }) 2620 if err != nil { 2621 t.Fatalf("Failed to create Job: %v", err) 2622 } 2623 2624 validate := func(s string, active int, status v1.ConditionStatus, reason string) { 2625 validateJobPodsStatus(ctx, t, clientSet, job, podsByStatus{ 2626 Active: active, 2627 Ready: ptr.To[int32](0), 2628 Terminating: 
ptr.To[int32](0), 2629 }) 2630 job, err = clientSet.BatchV1().Jobs(ns.Name).Get(ctx, job.Name, metav1.GetOptions{}) 2631 if err != nil { 2632 t.Fatalf("Failed to get Job after %s: %v", s, err) 2633 } 2634 if got, want := getJobConditionStatus(ctx, job, batchv1.JobSuspended), status; got != want { 2635 t.Errorf("Unexpected Job condition %q status after %s: got %q, want %q", batchv1.JobSuspended, s, got, want) 2636 } 2637 if err := waitForEvent(ctx, events, job.UID, reason); err != nil { 2638 t.Errorf("Waiting for event with reason %q after %s: %v", reason, s, err) 2639 } 2640 } 2641 validate("create", tc.create.wantActive, tc.create.wantStatus, tc.create.wantReason) 2642 2643 job.Spec.Suspend = ptr.To(tc.update.flag) 2644 job, err = clientSet.BatchV1().Jobs(ns.Name).Update(ctx, job, metav1.UpdateOptions{}) 2645 if err != nil { 2646 t.Fatalf("Failed to update Job: %v", err) 2647 } 2648 validate("update", tc.update.wantActive, tc.update.wantStatus, tc.update.wantReason) 2649 }) 2650 } 2651 } 2652 2653 func TestSuspendJobControllerRestart(t *testing.T) { 2654 closeFn, restConfig, clientSet, ns := setup(t, "suspend") 2655 defer closeFn() 2656 ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig) 2657 defer cancel() 2658 2659 job, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{ 2660 Spec: batchv1.JobSpec{ 2661 Parallelism: ptr.To[int32](2), 2662 Completions: ptr.To[int32](4), 2663 Suspend: ptr.To(true), 2664 }, 2665 }) 2666 if err != nil { 2667 t.Fatalf("Failed to create Job: %v", err) 2668 } 2669 validateJobPodsStatus(ctx, t, clientSet, job, podsByStatus{ 2670 Active: 0, 2671 Ready: ptr.To[int32](0), 2672 Terminating: ptr.To[int32](0), 2673 }) 2674 } 2675 2676 func TestNodeSelectorUpdate(t *testing.T) { 2677 closeFn, restConfig, clientSet, ns := setup(t, "suspend") 2678 defer closeFn() 2679 ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig) 2680 defer cancel() 2681 2682 job, err := createJobWithDefaults(ctx, clientSet, 
ns.Name, &batchv1.Job{Spec: batchv1.JobSpec{ 2683 Parallelism: ptr.To[int32](1), 2684 Suspend: ptr.To(true), 2685 }}) 2686 if err != nil { 2687 t.Fatalf("Failed to create Job: %v", err) 2688 } 2689 jobName := job.Name 2690 jobNamespace := job.Namespace 2691 jobClient := clientSet.BatchV1().Jobs(jobNamespace) 2692 2693 // (1) Unsuspend and set node selector in the same update. 2694 nodeSelector := map[string]string{"foo": "bar"} 2695 if _, err := updateJob(ctx, jobClient, jobName, func(j *batchv1.Job) { 2696 j.Spec.Template.Spec.NodeSelector = nodeSelector 2697 j.Spec.Suspend = ptr.To(false) 2698 }); err != nil { 2699 t.Errorf("Unexpected error: %v", err) 2700 } 2701 2702 // (2) Check that the pod was created using the expected node selector. 2703 2704 var pod *v1.Pod 2705 if err := wait.PollUntilContextTimeout(ctx, waitInterval, wait.ForeverTestTimeout, true, func(ctx context.Context) (bool, error) { 2706 pods, err := clientSet.CoreV1().Pods(jobNamespace).List(ctx, metav1.ListOptions{}) 2707 if err != nil { 2708 t.Fatalf("Failed to list Job Pods: %v", err) 2709 } 2710 if len(pods.Items) == 0 { 2711 return false, nil 2712 } 2713 pod = &pods.Items[0] 2714 return true, nil 2715 }); err != nil || pod == nil { 2716 t.Fatalf("pod not found: %v", err) 2717 } 2718 2719 // if the feature gate is enabled, then the job should now be unsuspended and 2720 // the pod has the node selector. 2721 if diff := cmp.Diff(nodeSelector, pod.Spec.NodeSelector); diff != "" { 2722 t.Errorf("Unexpected nodeSelector (-want,+got):\n%s", diff) 2723 } 2724 2725 // (3) Update node selector again. It should fail since the job is unsuspended. 
2726 _, err = updateJob(ctx, jobClient, jobName, func(j *batchv1.Job) { 2727 j.Spec.Template.Spec.NodeSelector = map[string]string{"foo": "baz"} 2728 }) 2729 2730 if err == nil || !strings.Contains(err.Error(), "spec.template: Invalid value") { 2731 t.Errorf("Expected \"spec.template: Invalid value\" error, got: %v", err) 2732 } 2733 2734 } 2735 2736 type podsByStatus struct { 2737 Active int 2738 Ready *int32 2739 Failed int 2740 Succeeded int 2741 Terminating *int32 2742 } 2743 2744 func validateJobsPodsStatusOnly(ctx context.Context, t testing.TB, clientSet clientset.Interface, jobObj *batchv1.Job, desired podsByStatus) { 2745 t.Helper() 2746 validateJobsPodsStatusOnlyWithTimeout(ctx, t, clientSet, jobObj, desired, wait.ForeverTestTimeout) 2747 } 2748 2749 func validateJobsPodsStatusOnlyWithTimeout(ctx context.Context, t testing.TB, clientSet clientset.Interface, jobObj *batchv1.Job, desired podsByStatus, timeout time.Duration) { 2750 t.Helper() 2751 var actualCounts podsByStatus 2752 if err := wait.PollUntilContextTimeout(ctx, waitInterval, timeout, true, func(ctx context.Context) (bool, error) { 2753 updatedJob, err := clientSet.BatchV1().Jobs(jobObj.Namespace).Get(ctx, jobObj.Name, metav1.GetOptions{}) 2754 if err != nil { 2755 t.Fatalf("Failed to get updated Job: %v", err) 2756 } 2757 actualCounts = podsByStatus{ 2758 Active: int(updatedJob.Status.Active), 2759 Ready: updatedJob.Status.Ready, 2760 Succeeded: int(updatedJob.Status.Succeeded), 2761 Failed: int(updatedJob.Status.Failed), 2762 Terminating: updatedJob.Status.Terminating, 2763 } 2764 return cmp.Equal(actualCounts, desired), nil 2765 }); err != nil { 2766 diff := cmp.Diff(desired, actualCounts) 2767 t.Errorf("Waiting for Job Status: %v\nPods (-want,+got):\n%s", err, diff) 2768 } 2769 } 2770 2771 func validateJobPodsStatus(ctx context.Context, t testing.TB, clientSet clientset.Interface, jobObj *batchv1.Job, desired podsByStatus) { 2772 t.Helper() 2773 validateJobsPodsStatusOnly(ctx, t, clientSet, 
jobObj, desired) 2774 var active []*v1.Pod 2775 if err := wait.PollUntilContextTimeout(ctx, waitInterval, wait.ForeverTestTimeout, true, func(ctx context.Context) (bool, error) { 2776 pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{}) 2777 if err != nil { 2778 t.Fatalf("Failed to list Job Pods: %v", err) 2779 } 2780 active = nil 2781 for _, pod := range pods.Items { 2782 phase := pod.Status.Phase 2783 if metav1.IsControlledBy(&pod, jobObj) && (phase == v1.PodPending || phase == v1.PodRunning) { 2784 p := pod 2785 active = append(active, &p) 2786 } 2787 } 2788 return len(active) == desired.Active, nil 2789 }); err != nil { 2790 if len(active) != desired.Active { 2791 t.Errorf("Found %d active Pods, want %d", len(active), desired.Active) 2792 } 2793 } 2794 for _, p := range active { 2795 if !hasJobTrackingFinalizer(p) { 2796 t.Errorf("Active pod %s doesn't have tracking finalizer", p.Name) 2797 } 2798 } 2799 } 2800 2801 func getJobPods(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job, filter func(v1.PodStatus) bool) ([]*v1.Pod, error) { 2802 t.Helper() 2803 allPods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{}) 2804 if err != nil { 2805 return nil, err 2806 } 2807 jobPods := make([]*v1.Pod, 0, 0) 2808 for _, pod := range allPods.Items { 2809 if metav1.IsControlledBy(&pod, jobObj) && filter(pod.Status) { 2810 p := pod 2811 jobPods = append(jobPods, &p) 2812 } 2813 } 2814 return jobPods, nil 2815 } 2816 2817 func validateFinishedPodsNoFinalizer(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job) { 2818 t.Helper() 2819 pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{}) 2820 if err != nil { 2821 t.Fatalf("Failed to list Job Pods: %v", err) 2822 } 2823 for _, pod := range pods.Items { 2824 phase := pod.Status.Phase 2825 if metav1.IsControlledBy(&pod, jobObj) && (phase == v1.PodPending || phase == 
v1.PodRunning) && hasJobTrackingFinalizer(&pod) { 2826 t.Errorf("Finished pod %s still has a tracking finalizer", pod.Name) 2827 } 2828 } 2829 } 2830 2831 // validateIndexedJobPods validates indexes and hostname of 2832 // active and completed Pods of an Indexed Job. 2833 // Call after validateJobPodsStatus 2834 func validateIndexedJobPods(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job, wantActive sets.Set[int], gotCompleted string, wantFailed *string) { 2835 t.Helper() 2836 updatedJob, err := clientSet.BatchV1().Jobs(jobObj.Namespace).Get(ctx, jobObj.Name, metav1.GetOptions{}) 2837 if err != nil { 2838 t.Fatalf("Failed to get updated Job: %v", err) 2839 } 2840 if updatedJob.Status.CompletedIndexes != gotCompleted { 2841 t.Errorf("Got completed indexes %q, want %q", updatedJob.Status.CompletedIndexes, gotCompleted) 2842 } 2843 if diff := cmp.Diff(wantFailed, updatedJob.Status.FailedIndexes); diff != "" { 2844 t.Errorf("Got unexpected failed indexes: %s", diff) 2845 } 2846 pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{}) 2847 if err != nil { 2848 t.Fatalf("Failed to list Job Pods: %v", err) 2849 } 2850 gotActive := sets.New[int]() 2851 for _, pod := range pods.Items { 2852 if metav1.IsControlledBy(&pod, jobObj) { 2853 if pod.Status.Phase == v1.PodPending || pod.Status.Phase == v1.PodRunning { 2854 ix, err := getCompletionIndex(&pod) 2855 if err != nil { 2856 t.Errorf("Failed getting completion index for pod %s: %v", pod.Name, err) 2857 } else { 2858 gotActive.Insert(ix) 2859 } 2860 expectedName := fmt.Sprintf("%s-%d", jobObj.Name, ix) 2861 if diff := cmp.Equal(expectedName, pod.Spec.Hostname); !diff { 2862 t.Errorf("Got pod hostname %s, want %s", pod.Spec.Hostname, expectedName) 2863 } 2864 } 2865 } 2866 } 2867 if wantActive == nil { 2868 wantActive = sets.New[int]() 2869 } 2870 if diff := cmp.Diff(sets.List(wantActive), sets.List(gotActive)); diff != "" { 2871 t.Errorf("Unexpected active 
indexes (-want,+got):\n%s", diff) 2872 } 2873 } 2874 2875 func waitForEvent(ctx context.Context, events watch.Interface, uid types.UID, reason string) error { 2876 if reason == "" { 2877 return nil 2878 } 2879 return wait.PollUntilContextTimeout(ctx, waitInterval, wait.ForeverTestTimeout, true, func(ctx context.Context) (bool, error) { 2880 for { 2881 var ev watch.Event 2882 select { 2883 case ev = <-events.ResultChan(): 2884 default: 2885 return false, nil 2886 } 2887 e, ok := ev.Object.(*eventsv1.Event) 2888 if !ok { 2889 continue 2890 } 2891 ctrl := "job-controller" 2892 if (e.ReportingController == ctrl || e.DeprecatedSource.Component == ctrl) && e.Reason == reason && e.Regarding.UID == uid { 2893 return true, nil 2894 } 2895 } 2896 }) 2897 } 2898 2899 func getJobConditionStatus(ctx context.Context, job *batchv1.Job, cType batchv1.JobConditionType) v1.ConditionStatus { 2900 for _, cond := range job.Status.Conditions { 2901 if cond.Type == cType { 2902 return cond.Status 2903 } 2904 } 2905 return "" 2906 } 2907 2908 func validateJobFailed(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job) { 2909 t.Helper() 2910 validateJobCondition(ctx, t, clientSet, jobObj, batchv1.JobFailed) 2911 } 2912 2913 func validateJobSucceeded(ctx context.Context, t testing.TB, clientSet clientset.Interface, jobObj *batchv1.Job) { 2914 t.Helper() 2915 validateJobCondition(ctx, t, clientSet, jobObj, batchv1.JobComplete) 2916 } 2917 2918 func validateJobCondition(ctx context.Context, t testing.TB, clientSet clientset.Interface, jobObj *batchv1.Job, cond batchv1.JobConditionType) { 2919 t.Helper() 2920 if err := wait.PollUntilContextTimeout(ctx, waitInterval, wait.ForeverTestTimeout, true, func(ctx context.Context) (bool, error) { 2921 j, err := clientSet.BatchV1().Jobs(jobObj.Namespace).Get(ctx, jobObj.Name, metav1.GetOptions{}) 2922 if err != nil { 2923 t.Fatalf("Failed to obtain updated Job: %v", err) 2924 } 2925 return getJobConditionStatus(ctx, j, 
cond) == v1.ConditionTrue, nil 2926 }); err != nil { 2927 t.Errorf("Waiting for Job to have condition %s: %v", cond, err) 2928 } 2929 } 2930 2931 func setJobPodsPhase(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, phase v1.PodPhase, cnt int) (error, int) { 2932 op := func(p *v1.Pod) bool { 2933 p.Status.Phase = phase 2934 if phase == v1.PodFailed || phase == v1.PodSucceeded { 2935 p.Status.ContainerStatuses = []v1.ContainerStatus{ 2936 { 2937 State: v1.ContainerState{ 2938 Terminated: &v1.ContainerStateTerminated{ 2939 FinishedAt: metav1.Now(), 2940 }, 2941 }, 2942 }, 2943 } 2944 } 2945 return true 2946 } 2947 return updateJobPodsStatus(ctx, clientSet, jobObj, op, cnt) 2948 } 2949 2950 func setJobPodsReady(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, cnt int) (error, int) { 2951 op := func(p *v1.Pod) bool { 2952 if podutil.IsPodReady(p) { 2953 return false 2954 } 2955 p.Status.Conditions = append(p.Status.Conditions, v1.PodCondition{ 2956 Type: v1.PodReady, 2957 Status: v1.ConditionTrue, 2958 }) 2959 return true 2960 } 2961 return updateJobPodsStatus(ctx, clientSet, jobObj, op, cnt) 2962 } 2963 2964 func updateJobPodsStatus(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, op func(*v1.Pod) bool, cnt int) (error, int) { 2965 pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{}) 2966 if err != nil { 2967 return fmt.Errorf("listing Job Pods: %w", err), 0 2968 } 2969 updates := make([]v1.Pod, 0, cnt) 2970 for _, pod := range pods.Items { 2971 if len(updates) == cnt { 2972 break 2973 } 2974 if p := pod.Status.Phase; metav1.IsControlledBy(&pod, jobObj) && p != v1.PodFailed && p != v1.PodSucceeded { 2975 if !op(&pod) { 2976 continue 2977 } 2978 updates = append(updates, pod) 2979 } 2980 } 2981 successful, err := updatePodStatuses(ctx, clientSet, updates) 2982 if successful != cnt { 2983 return fmt.Errorf("couldn't set phase on %d Job pods", cnt-successful), 
successful 2984 } 2985 return err, successful 2986 } 2987 2988 func updatePodStatuses(ctx context.Context, clientSet clientset.Interface, updates []v1.Pod) (int, error) { 2989 wg := sync.WaitGroup{} 2990 wg.Add(len(updates)) 2991 errCh := make(chan error, len(updates)) 2992 var updated int32 2993 2994 for _, pod := range updates { 2995 pod := pod 2996 go func() { 2997 _, err := clientSet.CoreV1().Pods(pod.Namespace).UpdateStatus(ctx, &pod, metav1.UpdateOptions{}) 2998 if err != nil { 2999 errCh <- err 3000 } else { 3001 atomic.AddInt32(&updated, 1) 3002 } 3003 wg.Done() 3004 }() 3005 } 3006 wg.Wait() 3007 3008 select { 3009 case err := <-errCh: 3010 return int(updated), fmt.Errorf("updating Pod status: %w", err) 3011 default: 3012 } 3013 return int(updated), nil 3014 } 3015 3016 func setJobPhaseForIndex(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, phase v1.PodPhase, ix int) error { 3017 pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{}) 3018 if err != nil { 3019 return fmt.Errorf("listing Job Pods: %w", err) 3020 } 3021 for _, pod := range pods.Items { 3022 if p := pod.Status.Phase; !metav1.IsControlledBy(&pod, jobObj) || p == v1.PodFailed || p == v1.PodSucceeded { 3023 continue 3024 } 3025 if pix, err := getCompletionIndex(&pod); err == nil && pix == ix { 3026 pod.Status.Phase = phase 3027 if phase == v1.PodFailed || phase == v1.PodSucceeded { 3028 pod.Status.ContainerStatuses = []v1.ContainerStatus{ 3029 { 3030 State: v1.ContainerState{ 3031 Terminated: &v1.ContainerStateTerminated{ 3032 FinishedAt: metav1.Now(), 3033 }, 3034 }, 3035 }, 3036 } 3037 } 3038 _, err := clientSet.CoreV1().Pods(pod.Namespace).UpdateStatus(ctx, &pod, metav1.UpdateOptions{}) 3039 if err != nil { 3040 return fmt.Errorf("updating pod %s status: %w", pod.Name, err) 3041 } 3042 return nil 3043 } 3044 } 3045 return errors.New("no pod matching index found") 3046 } 3047 3048 func getActivePodForIndex(ctx context.Context, clientSet 
clientset.Interface, jobObj *batchv1.Job, ix int) (*v1.Pod, error) { 3049 return getJobPodForIndex(ctx, clientSet, jobObj, ix, func(p *v1.Pod) bool { 3050 return !podutil.IsPodTerminal(p) 3051 }) 3052 } 3053 3054 func getJobPodForIndex(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, ix int, filter func(*v1.Pod) bool) (*v1.Pod, error) { 3055 pods, err := getJobPodsForIndex(ctx, clientSet, jobObj, ix, filter) 3056 if err != nil { 3057 return nil, err 3058 } 3059 if len(pods) == 0 { 3060 return nil, fmt.Errorf("Pod not found for index: %v", ix) 3061 } 3062 return pods[0], nil 3063 } 3064 3065 func getJobPodsForIndex(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, ix int, filter func(*v1.Pod) bool) ([]*v1.Pod, error) { 3066 pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{}) 3067 if err != nil { 3068 return nil, fmt.Errorf("listing Job Pods: %w", err) 3069 } 3070 var result []*v1.Pod 3071 for _, pod := range pods.Items { 3072 pod := pod 3073 if !metav1.IsControlledBy(&pod, jobObj) { 3074 continue 3075 } 3076 if !filter(&pod) { 3077 continue 3078 } 3079 if pix, err := getCompletionIndex(&pod); err == nil && pix == ix { 3080 result = append(result, &pod) 3081 } 3082 } 3083 return result, nil 3084 } 3085 3086 func getCompletionIndex(p *v1.Pod) (int, error) { 3087 if p.Annotations == nil { 3088 return 0, errors.New("no annotations found") 3089 } 3090 v, ok := p.Annotations[batchv1.JobCompletionIndexAnnotation] 3091 if !ok { 3092 return 0, fmt.Errorf("annotation %s not found", batchv1.JobCompletionIndexAnnotation) 3093 } 3094 return strconv.Atoi(v) 3095 } 3096 3097 func createJobWithDefaults(ctx context.Context, clientSet clientset.Interface, ns string, jobObj *batchv1.Job) (*batchv1.Job, error) { 3098 if jobObj.Name == "" { 3099 jobObj.Name = "test-job" 3100 } 3101 if len(jobObj.Spec.Template.Spec.Containers) == 0 { 3102 jobObj.Spec.Template.Spec.Containers = []v1.Container{ 3103 {Name: 
"foo", Image: "bar"}, 3104 } 3105 } 3106 if jobObj.Spec.Template.Spec.RestartPolicy == "" { 3107 jobObj.Spec.Template.Spec.RestartPolicy = v1.RestartPolicyNever 3108 } 3109 return clientSet.BatchV1().Jobs(ns).Create(ctx, jobObj, metav1.CreateOptions{}) 3110 } 3111 3112 func setup(t testing.TB, nsBaseName string) (framework.TearDownFunc, *restclient.Config, clientset.Interface, *v1.Namespace) { 3113 // Disable ServiceAccount admission plugin as we don't have serviceaccount controller running. 3114 server := kubeapiservertesting.StartTestServerOrDie(t, nil, []string{"--disable-admission-plugins=ServiceAccount"}, framework.SharedEtcd()) 3115 3116 config := restclient.CopyConfig(server.ClientConfig) 3117 config.QPS = 200 3118 config.Burst = 200 3119 config.Timeout = 0 3120 clientSet, err := clientset.NewForConfig(config) 3121 if err != nil { 3122 t.Fatalf("Error creating clientset: %v", err) 3123 } 3124 3125 ns := framework.CreateNamespaceOrDie(clientSet, nsBaseName, t) 3126 closeFn := func() { 3127 framework.DeleteNamespaceOrDie(clientSet, ns, t) 3128 server.TearDownFn() 3129 } 3130 return closeFn, config, clientSet, ns 3131 } 3132 3133 func startJobControllerAndWaitForCaches(tb testing.TB, restConfig *restclient.Config) (context.Context, context.CancelFunc) { 3134 tb.Helper() 3135 informerSet := informers.NewSharedInformerFactory(clientset.NewForConfigOrDie(restclient.AddUserAgent(restConfig, "job-informers")), 0) 3136 jc, ctx, cancel := createJobControllerWithSharedInformers(tb, restConfig, informerSet) 3137 informerSet.Start(ctx.Done()) 3138 go jc.Run(ctx, 1) 3139 3140 // since this method starts the controller in a separate goroutine 3141 // and the tests don't check /readyz there is no way 3142 // the tests can tell it is safe to call the server and requests won't be rejected 3143 // thus we wait until caches have synced 3144 informerSet.WaitForCacheSync(ctx.Done()) 3145 return ctx, cancel 3146 } 3147 3148 func resetMetrics() { 3149 
metrics.TerminatedPodsTrackingFinalizerTotal.Reset() 3150 metrics.JobFinishedNum.Reset() 3151 metrics.JobPodsFinished.Reset() 3152 metrics.PodFailuresHandledByFailurePolicy.Reset() 3153 metrics.JobFinishedIndexesTotal.Reset() 3154 metrics.JobPodsCreationTotal.Reset() 3155 } 3156 3157 func createJobControllerWithSharedInformers(tb testing.TB, restConfig *restclient.Config, informerSet informers.SharedInformerFactory) (*jobcontroller.Controller, context.Context, context.CancelFunc) { 3158 tb.Helper() 3159 clientSet := clientset.NewForConfigOrDie(restclient.AddUserAgent(restConfig, "job-controller")) 3160 ctx, cancel := context.WithCancel(context.Background()) 3161 jc, err := jobcontroller.NewController(ctx, informerSet.Core().V1().Pods(), informerSet.Batch().V1().Jobs(), clientSet) 3162 if err != nil { 3163 tb.Fatalf("Error creating Job controller: %v", err) 3164 } 3165 return jc, ctx, cancel 3166 } 3167 3168 func hasJobTrackingFinalizer(obj metav1.Object) bool { 3169 for _, fin := range obj.GetFinalizers() { 3170 if fin == batchv1.JobTrackingFinalizer { 3171 return true 3172 } 3173 } 3174 return false 3175 } 3176 3177 func setDuringTest(val *int, newVal int) func() { 3178 origVal := *val 3179 *val = newVal 3180 return func() { 3181 *val = origVal 3182 } 3183 } 3184 3185 func setDurationDuringTest(val *time.Duration, newVal time.Duration) func() { 3186 origVal := *val 3187 *val = newVal 3188 return func() { 3189 *val = origVal 3190 } 3191 } 3192 3193 func updateJob(ctx context.Context, jobClient typedv1.JobInterface, jobName string, updateFunc func(*batchv1.Job)) (*batchv1.Job, error) { 3194 var job *batchv1.Job 3195 err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { 3196 newJob, err := jobClient.Get(ctx, jobName, metav1.GetOptions{}) 3197 if err != nil { 3198 return err 3199 } 3200 updateFunc(newJob) 3201 job, err = jobClient.Update(ctx, newJob, metav1.UpdateOptions{}) 3202 return err 3203 }) 3204 return job, err 3205 } 3206 3207 func 
waitForPodsToBeActive(ctx context.Context, t *testing.T, jobClient typedv1.JobInterface, podCount int32, jobObj *batchv1.Job) { 3208 t.Helper() 3209 err := wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, wait.ForeverTestTimeout, true, func(context.Context) (done bool, err error) { 3210 job, err := jobClient.Get(ctx, jobObj.Name, metav1.GetOptions{}) 3211 if err != nil { 3212 return false, err 3213 } 3214 return job.Status.Active == podCount, nil 3215 }) 3216 if err != nil { 3217 t.Fatalf("Error waiting for Job pods to become active: %v", err) 3218 } 3219 } 3220 3221 func deletePods(ctx context.Context, t *testing.T, clientSet clientset.Interface, namespace string) { 3222 t.Helper() 3223 err := clientSet.CoreV1().Pods(namespace).DeleteCollection(ctx, 3224 metav1.DeleteOptions{}, 3225 metav1.ListOptions{ 3226 Limit: 1000, 3227 }) 3228 if err != nil { 3229 t.Fatalf("Failed to cleanup Pods: %v", err) 3230 } 3231 } 3232 3233 func removePodsFinalizer(ctx context.Context, t *testing.T, clientSet clientset.Interface, namespace string) { 3234 t.Helper() 3235 pods, err := clientSet.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{}) 3236 if err != nil { 3237 t.Fatalf("Failed to list pods: %v", err) 3238 } 3239 updatePod(ctx, t, clientSet, pods.Items, func(pod *v1.Pod) { 3240 for i, finalizer := range pod.Finalizers { 3241 if finalizer == "fake.example.com/blockDeletion" { 3242 pod.Finalizers = append(pod.Finalizers[:i], pod.Finalizers[i+1:]...) 
3243 } 3244 } 3245 }) 3246 } 3247 3248 func updatePod(ctx context.Context, t *testing.T, clientSet clientset.Interface, pods []v1.Pod, updateFunc func(*v1.Pod)) { 3249 t.Helper() 3250 for _, val := range pods { 3251 if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error { 3252 newPod, err := clientSet.CoreV1().Pods(val.Namespace).Get(ctx, val.Name, metav1.GetOptions{}) 3253 if err != nil { 3254 return err 3255 } 3256 updateFunc(newPod) 3257 _, err = clientSet.CoreV1().Pods(val.Namespace).Update(ctx, newPod, metav1.UpdateOptions{}) 3258 return err 3259 }); err != nil { 3260 t.Fatalf("Failed to update pod %s: %v", val.Name, err) 3261 } 3262 } 3263 } 3264 3265 func failTerminatingPods(ctx context.Context, t *testing.T, clientSet clientset.Interface, namespace string) { 3266 t.Helper() 3267 pods, err := clientSet.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{}) 3268 if err != nil { 3269 t.Fatalf("Failed to list pods: %v", err) 3270 } 3271 var terminatingPods []v1.Pod 3272 for _, pod := range pods.Items { 3273 if pod.DeletionTimestamp != nil { 3274 pod.Status.Phase = v1.PodFailed 3275 terminatingPods = append(terminatingPods, pod) 3276 } 3277 } 3278 _, err = updatePodStatuses(ctx, clientSet, terminatingPods) 3279 if err != nil { 3280 t.Fatalf("Failed to update pod statuses: %v", err) 3281 } 3282 }