k8s.io/kubernetes@v1.29.3/pkg/controller/job/job_controller_test.go

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package job

import (
	"context"
	"errors"
	"fmt"
	"math"
	"sort"
	"strconv"
	"testing"
	"time"

	"github.com/google/go-cmp/cmp"
	"github.com/google/go-cmp/cmp/cmpopts"
	batch "k8s.io/api/batch/v1"
	v1 "k8s.io/api/core/v1"
	apiequality "k8s.io/apimachinery/pkg/api/equality"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/rand"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/uuid"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/apimachinery/pkg/watch"
	"k8s.io/apiserver/pkg/util/feature"
	"k8s.io/client-go/informers"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/fake"
	restclient "k8s.io/client-go/rest"
	core "k8s.io/client-go/testing"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/util/workqueue"
	featuregatetesting "k8s.io/component-base/featuregate/testing"
	metricstestutil "k8s.io/component-base/metrics/testutil"
	"k8s.io/klog/v2"
	"k8s.io/klog/v2/ktesting"
	_ "k8s.io/kubernetes/pkg/apis/core/install"
	"k8s.io/kubernetes/pkg/controller"
	"k8s.io/kubernetes/pkg/controller/job/metrics"
	"k8s.io/kubernetes/pkg/controller/testutil"
	"k8s.io/kubernetes/pkg/features"
	"k8s.io/utils/clock"
	clocktesting "k8s.io/utils/clock/testing"
	"k8s.io/utils/ptr"
)

var realClock = &clock.RealClock{}
var alwaysReady = func() bool { return true }

const fastSyncJobBatchPeriod = 10 * time.Millisecond
const fastJobApiBackoff = 10 * time.Millisecond
const fastRequeue = 10 * time.Millisecond

// testFinishedAt represents a time one second after the Unix epoch;
// it is used in test cases where we don't want back-off to kick in.
var testFinishedAt = metav1.NewTime((time.Time{}).Add(time.Second))

func newJobWithName(name string, parallelism, completions, backoffLimit int32, completionMode batch.CompletionMode) *batch.Job {
	j := &batch.Job{
		TypeMeta: metav1.TypeMeta{Kind: "Job"},
		ObjectMeta: metav1.ObjectMeta{
			Name:      name,
			UID:       uuid.NewUUID(),
			Namespace: metav1.NamespaceDefault,
		},
		Spec: batch.JobSpec{
			Selector: &metav1.LabelSelector{
				MatchLabels: map[string]string{"foo": "bar"},
			},
			Template: v1.PodTemplateSpec{
				ObjectMeta: metav1.ObjectMeta{
					Labels: map[string]string{
						"foo": "bar",
					},
				},
				Spec: v1.PodSpec{
					Containers: []v1.Container{
						{Image: "foo/bar"},
					},
				},
			},
		},
	}
	if completionMode != "" {
		j.Spec.CompletionMode = &completionMode
	}
	// Special case: -1 for either completions or parallelism means leave nil (negative values are not
	// allowed in practice by validation).
	if completions >= 0 {
		j.Spec.Completions = &completions
	} else {
		j.Spec.Completions = nil
	}
	if parallelism >= 0 {
		j.Spec.Parallelism = &parallelism
	} else {
		j.Spec.Parallelism = nil
	}
	j.Spec.BackoffLimit = &backoffLimit

	return j
}

func newJob(parallelism, completions, backoffLimit int32, completionMode batch.CompletionMode) *batch.Job {
	return newJobWithName("foobar", parallelism, completions, backoffLimit, completionMode)
}

func newControllerFromClient(ctx context.Context, t *testing.T, kubeClient clientset.Interface, resyncPeriod controller.ResyncPeriodFunc) (*Controller, informers.SharedInformerFactory) {
	t.Helper()
	return newControllerFromClientWithClock(ctx, t, kubeClient, resyncPeriod, realClock)
}

func newControllerFromClientWithClock(ctx context.Context, t *testing.T, kubeClient clientset.Interface, resyncPeriod controller.ResyncPeriodFunc, clock clock.WithTicker) (*Controller, informers.SharedInformerFactory) {
	t.Helper()
	sharedInformers := informers.NewSharedInformerFactory(kubeClient, resyncPeriod())
	jm, err := newControllerWithClock(ctx, sharedInformers.Core().V1().Pods(), sharedInformers.Batch().V1().Jobs(), kubeClient, clock)
	if err != nil {
		t.Fatalf("Error creating Job controller: %v", err)
	}
	jm.podControl = &controller.FakePodControl{}
	return jm, sharedInformers
}

func newPod(name string, job *batch.Job) *v1.Pod {
	return &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:            name,
			UID:             types.UID(name),
			Labels:          job.Spec.Selector.MatchLabels,
			Namespace:       job.Namespace,
			OwnerReferences: []metav1.OwnerReference{*metav1.NewControllerRef(job, controllerKind)},
		},
	}
}

// create count pods with the given phase for the given job
func newPodList(count int, status v1.PodPhase, job *batch.Job) []*v1.Pod {
	var pods []*v1.Pod
	for i := 0; i < count; i++ {
		newPod := newPod(fmt.Sprintf("pod-%v", rand.String(10)), job)
		newPod.Status = v1.PodStatus{Phase: status}
		newPod.Status.ContainerStatuses = []v1.ContainerStatus{
			{
				State: v1.ContainerState{
					Terminated: &v1.ContainerStateTerminated{
						FinishedAt: testFinishedAt,
					},
				},
			},
		}
		newPod.Finalizers = append(newPod.Finalizers, batch.JobTrackingFinalizer)
		pods = append(pods, newPod)
	}
	return pods
}

func setPodsStatuses(podIndexer cache.Indexer, job *batch.Job, pendingPods, activePods, succeededPods, failedPods, terminatingPods, readyPods int) {
	for _, pod := range newPodList(pendingPods, v1.PodPending, job) {
		podIndexer.Add(pod)
	}
	running := newPodList(activePods, v1.PodRunning, job)
	for i, p := range running {
		if i >= readyPods {
			break
		}
		p.Status.Conditions = append(p.Status.Conditions, v1.PodCondition{
			Type:   v1.PodReady,
			Status: v1.ConditionTrue,
		})
	}
	for _, pod := range running {
		podIndexer.Add(pod)
	}
	for _, pod := range newPodList(succeededPods, v1.PodSucceeded, job) {
		podIndexer.Add(pod)
	}
	for _, pod := range newPodList(failedPods, v1.PodFailed, job) {
		podIndexer.Add(pod)
	}
	terminating := newPodList(terminatingPods, v1.PodRunning, job)
	for _, p := range terminating {
		now := metav1.Now()
		p.DeletionTimestamp = &now
	}
	for _, pod := range terminating {
		podIndexer.Add(pod)
	}
}

func setPodsStatusesWithIndexes(podIndexer cache.Indexer, job *batch.Job, status
[]indexPhase) { 209 for _, s := range status { 210 p := newPod(fmt.Sprintf("pod-%s", rand.String(10)), job) 211 p.Status = v1.PodStatus{Phase: s.Phase} 212 if s.Phase == v1.PodFailed || s.Phase == v1.PodSucceeded { 213 p.Status.ContainerStatuses = []v1.ContainerStatus{ 214 { 215 State: v1.ContainerState{ 216 Terminated: &v1.ContainerStateTerminated{ 217 FinishedAt: testFinishedAt, 218 }, 219 }, 220 }, 221 } 222 } 223 if s.Index != noIndex { 224 p.Annotations = map[string]string{ 225 batch.JobCompletionIndexAnnotation: s.Index, 226 } 227 p.Spec.Hostname = fmt.Sprintf("%s-%s", job.Name, s.Index) 228 } 229 p.Finalizers = append(p.Finalizers, batch.JobTrackingFinalizer) 230 podIndexer.Add(p) 231 } 232 } 233 234 type jobInitialStatus struct { 235 active int 236 succeed int 237 failed int 238 startTime *time.Time 239 } 240 241 func TestControllerSyncJob(t *testing.T) { 242 _, ctx := ktesting.NewTestContext(t) 243 jobConditionComplete := batch.JobComplete 244 jobConditionFailed := batch.JobFailed 245 jobConditionSuspended := batch.JobSuspended 246 referenceTime := time.Now() 247 248 testCases := map[string]struct { 249 // job setup 250 parallelism int32 251 completions int32 252 backoffLimit int32 253 deleting bool 254 podLimit int 255 completionMode batch.CompletionMode 256 wasSuspended bool 257 suspend bool 258 podReplacementPolicy *batch.PodReplacementPolicy 259 podFailurePolicy *batch.PodFailurePolicy 260 initialStatus *jobInitialStatus 261 backoffRecord *backoffRecord 262 controllerTime *time.Time 263 264 // pod setup 265 266 // If a podControllerError is set, finalizers are not able to be removed. 267 // This means that there is no status update so the counters for 268 // failedPods and succeededPods cannot be incremented. 269 podControllerError error 270 pendingPods int 271 activePods int 272 readyPods int 273 succeededPods int 274 failedPods int 275 terminatingPods int 276 podsWithIndexes []indexPhase 277 fakeExpectationAtCreation int32 // negative: ExpectDeletions, positive: ExpectCreations 278 279 // expectations 280 expectedCreations int32 281 expectedDeletions int32 282 expectedActive int32 283 expectedReady *int32 284 expectedSucceeded int32 285 expectedCompletedIdxs string 286 expectedFailed int32 287 expectedTerminating *int32 288 expectedCondition *batch.JobConditionType 289 expectedConditionStatus v1.ConditionStatus 290 expectedConditionReason string 291 expectedCreatedIndexes sets.Set[int] 292 expectedPodPatches int 293 294 // features 295 podIndexLabelDisabled bool 296 jobPodReplacementPolicy bool 297 jobPodFailurePolicy bool 298 }{ 299 "job start": { 300 parallelism: 2, 301 completions: 5, 302 backoffLimit: 6, 303 expectedCreations: 2, 304 expectedActive: 2, 305 expectedReady: ptr.To[int32](0), 306 }, 307 "WQ job start": { 308 parallelism: 2, 309 completions: -1, 310 backoffLimit: 6, 311 expectedCreations: 2, 312 expectedActive: 2, 313 expectedReady: ptr.To[int32](0), 314 }, 315 "pending pods": { 316 parallelism: 2, 317 completions: 5, 318 backoffLimit: 6, 319 pendingPods: 2, 320 expectedActive: 2, 321 expectedReady: ptr.To[int32](0), 322 }, 323 "correct # of pods": { 324 parallelism: 3, 325 completions: 5, 326 backoffLimit: 6, 327 activePods: 3, 328 readyPods: 2, 329 expectedActive: 3, 330 expectedReady: ptr.To[int32](2), 331 }, 332 "WQ job: correct # of pods": { 333 parallelism: 2, 334 completions: -1, 335 backoffLimit: 6, 336 activePods: 2, 337 expectedActive: 2, 338 expectedReady: ptr.To[int32](0), 339 }, 340 "too few active pods": { 341 parallelism: 2, 342 completions: 5, 
343 backoffLimit: 6, 344 activePods: 1, 345 succeededPods: 1, 346 expectedCreations: 1, 347 expectedActive: 2, 348 expectedSucceeded: 1, 349 expectedPodPatches: 1, 350 expectedReady: ptr.To[int32](0), 351 }, 352 "WQ job: recreate pods when failed": { 353 parallelism: 1, 354 completions: -1, 355 backoffLimit: 6, 356 activePods: 1, 357 failedPods: 1, 358 podReplacementPolicy: podReplacementPolicy(batch.Failed), 359 jobPodReplacementPolicy: true, 360 terminatingPods: 1, 361 expectedTerminating: ptr.To[int32](1), 362 expectedReady: ptr.To[int32](0), 363 // Removes finalizer and deletes one failed pod 364 expectedPodPatches: 1, 365 expectedFailed: 1, 366 expectedActive: 1, 367 }, 368 "WQ job: turn on PodReplacementPolicy but not set PodReplacementPolicy": { 369 parallelism: 1, 370 completions: 1, 371 backoffLimit: 6, 372 activePods: 1, 373 failedPods: 1, 374 jobPodReplacementPolicy: true, 375 expectedTerminating: ptr.To[int32](1), 376 expectedReady: ptr.To[int32](0), 377 terminatingPods: 1, 378 expectedActive: 1, 379 expectedPodPatches: 2, 380 expectedFailed: 2, 381 }, 382 "WQ job: recreate pods when terminating or failed": { 383 parallelism: 1, 384 completions: -1, 385 backoffLimit: 6, 386 activePods: 1, 387 failedPods: 1, 388 podReplacementPolicy: podReplacementPolicy(batch.TerminatingOrFailed), 389 jobPodReplacementPolicy: true, 390 terminatingPods: 1, 391 expectedTerminating: ptr.To[int32](1), 392 expectedReady: ptr.To[int32](0), 393 expectedActive: 1, 394 expectedPodPatches: 2, 395 expectedFailed: 2, 396 }, 397 "more terminating pods than parallelism": { 398 parallelism: 1, 399 completions: 1, 400 backoffLimit: 6, 401 activePods: 2, 402 failedPods: 0, 403 terminatingPods: 4, 404 podReplacementPolicy: podReplacementPolicy(batch.Failed), 405 jobPodReplacementPolicy: true, 406 expectedTerminating: ptr.To[int32](4), 407 expectedReady: ptr.To[int32](0), 408 expectedActive: 1, 409 expectedDeletions: 1, 410 expectedPodPatches: 1, 411 }, 412 "more terminating pods than parallelism; PodFailurePolicy used": { 413 // Repro for https://github.com/kubernetes/kubernetes/issues/122235 414 parallelism: 1, 415 completions: 1, 416 backoffLimit: 6, 417 activePods: 2, 418 failedPods: 0, 419 terminatingPods: 4, 420 jobPodFailurePolicy: true, 421 podFailurePolicy: &batch.PodFailurePolicy{}, 422 expectedTerminating: nil, 423 expectedReady: ptr.To[int32](0), 424 expectedActive: 1, 425 expectedDeletions: 1, 426 expectedPodPatches: 1, 427 }, 428 "too few active pods and active back-off": { 429 parallelism: 1, 430 completions: 1, 431 backoffLimit: 6, 432 backoffRecord: &backoffRecord{ 433 failuresAfterLastSuccess: 1, 434 lastFailureTime: &referenceTime, 435 }, 436 initialStatus: &jobInitialStatus{ 437 startTime: func() *time.Time { 438 now := time.Now() 439 return &now 440 }(), 441 }, 442 activePods: 0, 443 succeededPods: 0, 444 expectedCreations: 0, 445 expectedActive: 0, 446 expectedSucceeded: 0, 447 expectedPodPatches: 0, 448 expectedReady: ptr.To[int32](0), 449 controllerTime: &referenceTime, 450 }, 451 "too few active pods and no back-offs": { 452 parallelism: 1, 453 completions: 1, 454 backoffLimit: 6, 455 backoffRecord: &backoffRecord{ 456 failuresAfterLastSuccess: 0, 457 lastFailureTime: &referenceTime, 458 }, 459 activePods: 0, 460 succeededPods: 0, 461 expectedCreations: 1, 462 expectedActive: 1, 463 expectedSucceeded: 0, 464 expectedPodPatches: 0, 465 expectedReady: ptr.To[int32](0), 466 controllerTime: &referenceTime, 467 }, 468 "too few active pods with a dynamic job": { 469 parallelism: 2, 470 
completions: -1, 471 backoffLimit: 6, 472 activePods: 1, 473 expectedCreations: 1, 474 expectedActive: 2, 475 expectedReady: ptr.To[int32](0), 476 }, 477 "too few active pods, with controller error": { 478 parallelism: 2, 479 completions: 5, 480 backoffLimit: 6, 481 podControllerError: fmt.Errorf("fake error"), 482 activePods: 1, 483 succeededPods: 1, 484 expectedCreations: 1, 485 expectedActive: 1, 486 expectedSucceeded: 0, 487 expectedPodPatches: 1, 488 expectedReady: ptr.To[int32](0), 489 }, 490 "too many active pods": { 491 parallelism: 2, 492 completions: 5, 493 backoffLimit: 6, 494 activePods: 3, 495 expectedDeletions: 1, 496 expectedActive: 2, 497 expectedPodPatches: 1, 498 expectedReady: ptr.To[int32](0), 499 }, 500 "too many active pods, with controller error": { 501 parallelism: 2, 502 completions: 5, 503 backoffLimit: 6, 504 podControllerError: fmt.Errorf("fake error"), 505 activePods: 3, 506 expectedDeletions: 0, 507 expectedPodPatches: 1, 508 expectedActive: 3, 509 expectedReady: ptr.To[int32](0), 510 }, 511 "failed + succeed pods: reset backoff delay": { 512 parallelism: 2, 513 completions: 5, 514 backoffLimit: 6, 515 activePods: 1, 516 succeededPods: 1, 517 failedPods: 1, 518 expectedCreations: 1, 519 expectedActive: 2, 520 expectedSucceeded: 1, 521 expectedFailed: 1, 522 expectedPodPatches: 2, 523 expectedReady: ptr.To[int32](0), 524 }, 525 "new failed pod": { 526 parallelism: 2, 527 completions: 5, 528 backoffLimit: 6, 529 activePods: 1, 530 failedPods: 1, 531 expectedCreations: 1, 532 expectedActive: 2, 533 expectedFailed: 1, 534 expectedPodPatches: 1, 535 expectedReady: ptr.To[int32](0), 536 }, 537 "no new pod; possible finalizer update of failed pod": { 538 parallelism: 1, 539 completions: 1, 540 backoffLimit: 6, 541 initialStatus: &jobInitialStatus{ 542 active: 1, 543 succeed: 0, 544 failed: 1, 545 }, 546 activePods: 1, 547 failedPods: 0, 548 expectedCreations: 0, 549 expectedActive: 1, 550 expectedFailed: 1, 551 expectedPodPatches: 0, 552 expectedReady: ptr.To[int32](0), 553 }, 554 "only new failed pod with controller error": { 555 parallelism: 2, 556 completions: 5, 557 backoffLimit: 6, 558 podControllerError: fmt.Errorf("fake error"), 559 activePods: 1, 560 failedPods: 1, 561 expectedCreations: 1, 562 expectedActive: 1, 563 expectedFailed: 0, 564 expectedPodPatches: 1, 565 expectedReady: ptr.To[int32](0), 566 }, 567 "job finish": { 568 parallelism: 2, 569 completions: 5, 570 backoffLimit: 6, 571 succeededPods: 5, 572 expectedSucceeded: 5, 573 expectedCondition: &jobConditionComplete, 574 expectedConditionStatus: v1.ConditionTrue, 575 expectedPodPatches: 5, 576 expectedReady: ptr.To[int32](0), 577 }, 578 "WQ job finishing": { 579 parallelism: 2, 580 completions: -1, 581 backoffLimit: 6, 582 activePods: 1, 583 succeededPods: 1, 584 expectedActive: 1, 585 expectedSucceeded: 1, 586 expectedPodPatches: 1, 587 expectedReady: ptr.To[int32](0), 588 }, 589 "WQ job all finished": { 590 parallelism: 2, 591 completions: -1, 592 backoffLimit: 6, 593 succeededPods: 2, 594 expectedSucceeded: 2, 595 expectedCondition: &jobConditionComplete, 596 expectedConditionStatus: v1.ConditionTrue, 597 expectedPodPatches: 2, 598 expectedReady: ptr.To[int32](0), 599 }, 600 "WQ job all finished despite one failure": { 601 parallelism: 2, 602 completions: -1, 603 backoffLimit: 6, 604 succeededPods: 1, 605 failedPods: 1, 606 expectedSucceeded: 1, 607 expectedFailed: 1, 608 expectedCondition: &jobConditionComplete, 609 expectedConditionStatus: v1.ConditionTrue, 610 expectedPodPatches: 2, 611 
expectedReady: ptr.To[int32](0), 612 }, 613 "more active pods than parallelism": { 614 parallelism: 2, 615 completions: 5, 616 backoffLimit: 6, 617 activePods: 10, 618 expectedDeletions: 8, 619 expectedActive: 2, 620 expectedPodPatches: 8, 621 expectedReady: ptr.To[int32](0), 622 }, 623 "more active pods than remaining completions": { 624 parallelism: 3, 625 completions: 4, 626 backoffLimit: 6, 627 activePods: 3, 628 succeededPods: 2, 629 expectedDeletions: 1, 630 expectedActive: 2, 631 expectedSucceeded: 2, 632 expectedPodPatches: 3, 633 expectedReady: ptr.To[int32](0), 634 }, 635 "status change": { 636 parallelism: 2, 637 completions: 5, 638 backoffLimit: 6, 639 activePods: 2, 640 succeededPods: 2, 641 expectedActive: 2, 642 expectedSucceeded: 2, 643 expectedPodPatches: 2, 644 expectedReady: ptr.To[int32](0), 645 }, 646 "deleting job": { 647 parallelism: 2, 648 completions: 5, 649 backoffLimit: 6, 650 deleting: true, 651 pendingPods: 1, 652 activePods: 1, 653 succeededPods: 1, 654 expectedActive: 2, 655 expectedSucceeded: 1, 656 expectedPodPatches: 3, 657 expectedReady: ptr.To[int32](0), 658 }, 659 "limited pods": { 660 parallelism: 100, 661 completions: 200, 662 backoffLimit: 6, 663 podLimit: 10, 664 expectedCreations: 10, 665 expectedActive: 10, 666 expectedReady: ptr.To[int32](0), 667 }, 668 "too many job failures": { 669 parallelism: 2, 670 completions: 5, 671 deleting: true, 672 failedPods: 1, 673 expectedFailed: 1, 674 expectedCondition: &jobConditionFailed, 675 expectedConditionStatus: v1.ConditionTrue, 676 expectedConditionReason: "BackoffLimitExceeded", 677 expectedPodPatches: 1, 678 expectedReady: ptr.To[int32](0), 679 }, 680 "job failures, unsatisfied expectations": { 681 parallelism: 2, 682 completions: 5, 683 deleting: true, 684 failedPods: 1, 685 fakeExpectationAtCreation: 1, 686 expectedFailed: 1, 687 expectedPodPatches: 1, 688 expectedReady: ptr.To[int32](0), 689 }, 690 "indexed job start": { 691 parallelism: 2, 692 completions: 5, 693 backoffLimit: 6, 694 completionMode: batch.IndexedCompletion, 695 expectedCreations: 2, 696 expectedActive: 2, 697 expectedCreatedIndexes: sets.New(0, 1), 698 expectedReady: ptr.To[int32](0), 699 }, 700 "indexed job with some pods deleted, podReplacementPolicy Failed": { 701 parallelism: 2, 702 completions: 5, 703 backoffLimit: 6, 704 completionMode: batch.IndexedCompletion, 705 expectedCreations: 1, 706 expectedActive: 1, 707 expectedCreatedIndexes: sets.New(0), 708 podReplacementPolicy: podReplacementPolicy(batch.Failed), 709 jobPodReplacementPolicy: true, 710 terminatingPods: 1, 711 expectedTerminating: ptr.To[int32](1), 712 expectedReady: ptr.To[int32](0), 713 }, 714 "indexed job with some pods deleted, podReplacementPolicy TerminatingOrFailed": { 715 parallelism: 2, 716 completions: 5, 717 backoffLimit: 6, 718 completionMode: batch.IndexedCompletion, 719 expectedCreations: 2, 720 expectedActive: 2, 721 expectedCreatedIndexes: sets.New(0, 1), 722 podReplacementPolicy: podReplacementPolicy(batch.TerminatingOrFailed), 723 jobPodReplacementPolicy: true, 724 terminatingPods: 1, 725 expectedTerminating: ptr.To[int32](1), 726 expectedReady: ptr.To[int32](0), 727 expectedPodPatches: 1, 728 }, 729 "indexed job completed": { 730 parallelism: 2, 731 completions: 3, 732 backoffLimit: 6, 733 completionMode: batch.IndexedCompletion, 734 podsWithIndexes: []indexPhase{ 735 {"0", v1.PodSucceeded}, 736 {"1", v1.PodFailed}, 737 {"1", v1.PodSucceeded}, 738 {"2", v1.PodSucceeded}, 739 }, 740 expectedSucceeded: 3, 741 expectedFailed: 1, 742 
expectedCompletedIdxs: "0-2", 743 expectedCondition: &jobConditionComplete, 744 expectedConditionStatus: v1.ConditionTrue, 745 expectedPodPatches: 4, 746 expectedReady: ptr.To[int32](0), 747 }, 748 "indexed job repeated completed index": { 749 parallelism: 2, 750 completions: 3, 751 backoffLimit: 6, 752 completionMode: batch.IndexedCompletion, 753 podsWithIndexes: []indexPhase{ 754 {"0", v1.PodSucceeded}, 755 {"1", v1.PodSucceeded}, 756 {"1", v1.PodSucceeded}, 757 }, 758 expectedCreations: 1, 759 expectedActive: 1, 760 expectedSucceeded: 2, 761 expectedCompletedIdxs: "0,1", 762 expectedCreatedIndexes: sets.New(2), 763 expectedPodPatches: 3, 764 expectedReady: ptr.To[int32](0), 765 }, 766 "indexed job some running and completed pods": { 767 parallelism: 8, 768 completions: 20, 769 backoffLimit: 6, 770 completionMode: batch.IndexedCompletion, 771 podsWithIndexes: []indexPhase{ 772 {"0", v1.PodRunning}, 773 {"2", v1.PodSucceeded}, 774 {"3", v1.PodPending}, 775 {"4", v1.PodSucceeded}, 776 {"5", v1.PodSucceeded}, 777 {"7", v1.PodSucceeded}, 778 {"8", v1.PodSucceeded}, 779 {"9", v1.PodSucceeded}, 780 }, 781 expectedCreations: 6, 782 expectedActive: 8, 783 expectedSucceeded: 6, 784 expectedCompletedIdxs: "2,4,5,7-9", 785 expectedCreatedIndexes: sets.New(1, 6, 10, 11, 12, 13), 786 expectedPodPatches: 6, 787 expectedReady: ptr.To[int32](0), 788 }, 789 "indexed job some failed pods": { 790 parallelism: 3, 791 completions: 4, 792 backoffLimit: 6, 793 completionMode: batch.IndexedCompletion, 794 podsWithIndexes: []indexPhase{ 795 {"0", v1.PodFailed}, 796 {"1", v1.PodPending}, 797 {"2", v1.PodFailed}, 798 }, 799 expectedCreations: 2, 800 expectedActive: 3, 801 expectedFailed: 2, 802 expectedCreatedIndexes: sets.New(0, 2), 803 expectedPodPatches: 2, 804 expectedReady: ptr.To[int32](0), 805 }, 806 "indexed job some pods without index": { 807 parallelism: 2, 808 completions: 5, 809 backoffLimit: 6, 810 completionMode: batch.IndexedCompletion, 811 activePods: 1, 812 succeededPods: 1, 813 failedPods: 1, 814 podsWithIndexes: []indexPhase{ 815 {"invalid", v1.PodRunning}, 816 {"invalid", v1.PodSucceeded}, 817 {"invalid", v1.PodFailed}, 818 {"invalid", v1.PodPending}, 819 {"0", v1.PodSucceeded}, 820 {"1", v1.PodRunning}, 821 {"2", v1.PodRunning}, 822 }, 823 expectedDeletions: 3, 824 expectedActive: 2, 825 expectedSucceeded: 1, 826 expectedFailed: 0, 827 expectedCompletedIdxs: "0", 828 expectedPodPatches: 8, 829 expectedReady: ptr.To[int32](0), 830 }, 831 "indexed job repeated indexes": { 832 parallelism: 5, 833 completions: 5, 834 backoffLimit: 6, 835 completionMode: batch.IndexedCompletion, 836 succeededPods: 1, 837 failedPods: 1, 838 podsWithIndexes: []indexPhase{ 839 {"invalid", v1.PodRunning}, 840 {"0", v1.PodSucceeded}, 841 {"1", v1.PodRunning}, 842 {"2", v1.PodRunning}, 843 {"2", v1.PodPending}, 844 }, 845 expectedCreations: 0, 846 expectedDeletions: 2, 847 expectedActive: 2, 848 expectedSucceeded: 1, 849 expectedCompletedIdxs: "0", 850 expectedPodPatches: 5, 851 expectedReady: ptr.To[int32](0), 852 }, 853 "indexed job with indexes outside of range": { 854 parallelism: 2, 855 completions: 5, 856 backoffLimit: 6, 857 completionMode: batch.IndexedCompletion, 858 podsWithIndexes: []indexPhase{ 859 {"0", v1.PodSucceeded}, 860 {"5", v1.PodRunning}, 861 {"6", v1.PodSucceeded}, 862 {"7", v1.PodPending}, 863 {"8", v1.PodFailed}, 864 }, 865 expectedCreations: 0, // only one of creations and deletions can happen in a sync 866 expectedSucceeded: 1, 867 expectedDeletions: 2, 868 expectedCompletedIdxs: "0", 869 
expectedActive: 0, 870 expectedFailed: 0, 871 expectedPodPatches: 5, 872 expectedReady: ptr.To[int32](0), 873 }, 874 "suspending a job with satisfied expectations": { 875 // Suspended Job should delete active pods when expectations are 876 // satisfied. 877 suspend: true, 878 parallelism: 2, 879 activePods: 2, // parallelism == active, expectations satisfied 880 completions: 4, 881 backoffLimit: 6, 882 expectedCreations: 0, 883 expectedDeletions: 2, 884 expectedActive: 0, 885 expectedCondition: &jobConditionSuspended, 886 expectedConditionStatus: v1.ConditionTrue, 887 expectedConditionReason: "JobSuspended", 888 expectedPodPatches: 2, 889 expectedReady: ptr.To[int32](0), 890 }, 891 "suspending a job with unsatisfied expectations": { 892 // Unlike the previous test, we expect the controller to NOT suspend the 893 // Job in the syncJob call because the controller will wait for 894 // expectations to be satisfied first. The next syncJob call (not tested 895 // here) will be the same as the previous test. 896 suspend: true, 897 parallelism: 2, 898 activePods: 3, // active > parallelism, expectations unsatisfied 899 fakeExpectationAtCreation: -1, // the controller is expecting a deletion 900 completions: 4, 901 backoffLimit: 6, 902 expectedCreations: 0, 903 expectedDeletions: 0, 904 expectedActive: 3, 905 expectedReady: ptr.To[int32](0), 906 }, 907 "resuming a suspended job": { 908 wasSuspended: true, 909 suspend: false, 910 parallelism: 2, 911 completions: 4, 912 backoffLimit: 6, 913 expectedCreations: 2, 914 expectedDeletions: 0, 915 expectedActive: 2, 916 expectedCondition: &jobConditionSuspended, 917 expectedConditionStatus: v1.ConditionFalse, 918 expectedConditionReason: "JobResumed", 919 expectedReady: ptr.To[int32](0), 920 }, 921 "suspending a deleted job": { 922 // We would normally expect the active pods to be deleted (see a few test 923 // cases above), but since this job is being deleted, we don't expect 924 // anything changed here from before the job was suspended. The 925 // JobSuspended condition is also missing. 
926 suspend: true, 927 deleting: true, 928 parallelism: 2, 929 activePods: 2, // parallelism == active, expectations satisfied 930 completions: 4, 931 backoffLimit: 6, 932 expectedCreations: 0, 933 expectedDeletions: 0, 934 expectedActive: 2, 935 expectedPodPatches: 2, 936 expectedReady: ptr.To[int32](0), 937 }, 938 "indexed job with podIndexLabel feature disabled": { 939 parallelism: 2, 940 completions: 5, 941 backoffLimit: 6, 942 completionMode: batch.IndexedCompletion, 943 expectedCreations: 2, 944 expectedActive: 2, 945 expectedCreatedIndexes: sets.New(0, 1), 946 podIndexLabelDisabled: true, 947 expectedReady: ptr.To[int32](0), 948 }, 949 } 950 951 for name, tc := range testCases { 952 t.Run(name, func(t *testing.T) { 953 logger, _ := ktesting.NewTestContext(t) 954 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.PodIndexLabel, !tc.podIndexLabelDisabled)() 955 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodReplacementPolicy, tc.jobPodReplacementPolicy)() 956 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, tc.jobPodFailurePolicy)() 957 // job manager setup 958 clientSet := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 959 960 var fakeClock clock.WithTicker 961 if tc.controllerTime != nil { 962 fakeClock = clocktesting.NewFakeClock(*tc.controllerTime) 963 } else { 964 fakeClock = clocktesting.NewFakeClock(time.Now()) 965 } 966 967 manager, sharedInformerFactory := newControllerFromClientWithClock(ctx, t, clientSet, controller.NoResyncPeriodFunc, fakeClock) 968 fakePodControl := controller.FakePodControl{Err: tc.podControllerError, CreateLimit: tc.podLimit} 969 manager.podControl = &fakePodControl 970 manager.podStoreSynced = alwaysReady 971 manager.jobStoreSynced = alwaysReady 972 973 // job & pods setup 974 job := newJob(tc.parallelism, tc.completions, tc.backoffLimit, tc.completionMode) 975 job.Spec.Suspend = ptr.To(tc.suspend) 976 if tc.jobPodReplacementPolicy { 977 job.Spec.PodReplacementPolicy = tc.podReplacementPolicy 978 } 979 if tc.jobPodFailurePolicy { 980 job.Spec.PodFailurePolicy = tc.podFailurePolicy 981 } 982 if tc.initialStatus != nil { 983 startTime := metav1.Now() 984 job.Status.StartTime = &startTime 985 job.Status.Active = int32(tc.initialStatus.active) 986 job.Status.Succeeded = int32(tc.initialStatus.succeed) 987 job.Status.Failed = int32(tc.initialStatus.failed) 988 if tc.initialStatus.startTime != nil { 989 startTime := metav1.NewTime(*tc.initialStatus.startTime) 990 job.Status.StartTime = &startTime 991 } 992 } 993 994 key, err := controller.KeyFunc(job) 995 if err != nil { 996 t.Errorf("Unexpected error getting job key: %v", err) 997 } 998 999 if tc.backoffRecord != nil { 1000 tc.backoffRecord.key = key 1001 manager.podBackoffStore.updateBackoffRecord(*tc.backoffRecord) 1002 } 1003 if tc.fakeExpectationAtCreation < 0 { 1004 manager.expectations.ExpectDeletions(logger, key, int(-tc.fakeExpectationAtCreation)) 1005 } else if tc.fakeExpectationAtCreation > 0 { 1006 manager.expectations.ExpectCreations(logger, key, int(tc.fakeExpectationAtCreation)) 1007 } 1008 if tc.wasSuspended { 1009 job.Status.Conditions = append(job.Status.Conditions, *newCondition(batch.JobSuspended, v1.ConditionTrue, "JobSuspended", "Job suspended", realClock.Now())) 1010 } 1011 if tc.deleting { 1012 now := metav1.Now() 1013 
job.DeletionTimestamp = &now 1014 } 1015 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 1016 podIndexer := sharedInformerFactory.Core().V1().Pods().Informer().GetIndexer() 1017 setPodsStatuses(podIndexer, job, tc.pendingPods, tc.activePods, tc.succeededPods, tc.failedPods, tc.terminatingPods, tc.readyPods) 1018 setPodsStatusesWithIndexes(podIndexer, job, tc.podsWithIndexes) 1019 1020 actual := job 1021 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 1022 actual = job 1023 return job, nil 1024 } 1025 1026 // run 1027 err = manager.syncJob(context.TODO(), testutil.GetKey(job, t)) 1028 1029 // We need requeue syncJob task if podController error 1030 if tc.podControllerError != nil { 1031 if err == nil { 1032 t.Error("Syncing jobs expected to return error on podControl exception") 1033 } 1034 } else if tc.podLimit != 0 && fakePodControl.CreateCallCount > tc.podLimit { 1035 if err == nil { 1036 t.Error("Syncing jobs expected to return error when reached the podControl limit") 1037 } 1038 } else if err != nil { 1039 t.Errorf("Unexpected error when syncing jobs: %v", err) 1040 } 1041 // validate created/deleted pods 1042 if int32(len(fakePodControl.Templates)) != tc.expectedCreations { 1043 t.Errorf("Unexpected number of creates. Expected %d, saw %d\n", tc.expectedCreations, len(fakePodControl.Templates)) 1044 } 1045 if tc.completionMode == batch.IndexedCompletion { 1046 checkIndexedJobPods(t, &fakePodControl, tc.expectedCreatedIndexes, job.Name, tc.podIndexLabelDisabled) 1047 } else { 1048 for _, p := range fakePodControl.Templates { 1049 // Fake pod control doesn't add generate name from the owner reference. 1050 if p.GenerateName != "" { 1051 t.Errorf("Got pod generate name %s, want %s", p.GenerateName, "") 1052 } 1053 if p.Spec.Hostname != "" { 1054 t.Errorf("Got pod hostname %q, want none", p.Spec.Hostname) 1055 } 1056 } 1057 } 1058 if int32(len(fakePodControl.DeletePodName)) != tc.expectedDeletions { 1059 t.Errorf("Unexpected number of deletes. Expected %d, saw %d\n", tc.expectedDeletions, len(fakePodControl.DeletePodName)) 1060 } 1061 // Each create should have an accompanying ControllerRef. 1062 if len(fakePodControl.ControllerRefs) != int(tc.expectedCreations) { 1063 t.Errorf("Unexpected number of ControllerRefs. Expected %d, saw %d\n", tc.expectedCreations, len(fakePodControl.ControllerRefs)) 1064 } 1065 // Make sure the ControllerRefs are correct. 1066 for _, controllerRef := range fakePodControl.ControllerRefs { 1067 if got, want := controllerRef.APIVersion, "batch/v1"; got != want { 1068 t.Errorf("controllerRef.APIVersion = %q, want %q", got, want) 1069 } 1070 if got, want := controllerRef.Kind, "Job"; got != want { 1071 t.Errorf("controllerRef.Kind = %q, want %q", got, want) 1072 } 1073 if got, want := controllerRef.Name, job.Name; got != want { 1074 t.Errorf("controllerRef.Name = %q, want %q", got, want) 1075 } 1076 if got, want := controllerRef.UID, job.UID; got != want { 1077 t.Errorf("controllerRef.UID = %q, want %q", got, want) 1078 } 1079 if controllerRef.Controller == nil || *controllerRef.Controller != true { 1080 t.Errorf("controllerRef.Controller is not set to true") 1081 } 1082 } 1083 // validate status 1084 if actual.Status.Active != tc.expectedActive { 1085 t.Errorf("Unexpected number of active pods. 
Expected %d, saw %d\n", tc.expectedActive, actual.Status.Active) 1086 } 1087 if diff := cmp.Diff(tc.expectedReady, actual.Status.Ready); diff != "" { 1088 t.Errorf("Unexpected number of ready pods (-want,+got): %s", diff) 1089 } 1090 if actual.Status.Succeeded != tc.expectedSucceeded { 1091 t.Errorf("Unexpected number of succeeded pods. Expected %d, saw %d\n", tc.expectedSucceeded, actual.Status.Succeeded) 1092 } 1093 if diff := cmp.Diff(tc.expectedCompletedIdxs, actual.Status.CompletedIndexes); diff != "" { 1094 t.Errorf("Unexpected completed indexes (-want,+got):\n%s", diff) 1095 } 1096 if actual.Status.Failed != tc.expectedFailed { 1097 t.Errorf("Unexpected number of failed pods. Expected %d, saw %d\n", tc.expectedFailed, actual.Status.Failed) 1098 } 1099 if diff := cmp.Diff(tc.expectedTerminating, actual.Status.Terminating); diff != "" { 1100 t.Errorf("Unexpected number of terminating pods (-want,+got): %s", diff) 1101 } 1102 if actual.Status.StartTime != nil && tc.suspend { 1103 t.Error("Unexpected .status.startTime not nil when suspend is true") 1104 } 1105 if actual.Status.StartTime == nil && !tc.suspend { 1106 t.Error("Missing .status.startTime") 1107 } 1108 // validate conditions 1109 if tc.expectedCondition != nil { 1110 if !getCondition(actual, *tc.expectedCondition, tc.expectedConditionStatus, tc.expectedConditionReason) { 1111 t.Errorf("Expected completion condition. Got %#v", actual.Status.Conditions) 1112 } 1113 } else { 1114 if cond := hasTrueCondition(actual); cond != nil { 1115 t.Errorf("Got condition %s, want none", *cond) 1116 } 1117 } 1118 if tc.expectedCondition == nil && tc.suspend && len(actual.Status.Conditions) != 0 { 1119 t.Errorf("Unexpected conditions %v", actual.Status.Conditions) 1120 } 1121 // validate slow start 1122 expectedLimit := 0 1123 for pass := uint8(0); expectedLimit <= tc.podLimit; pass++ { 1124 expectedLimit += controller.SlowStartInitialBatchSize << pass 1125 } 1126 if tc.podLimit > 0 && fakePodControl.CreateCallCount > expectedLimit { 1127 t.Errorf("Unexpected number of create calls. 
Expected <= %d, saw %d\n", fakePodControl.CreateLimit*2, fakePodControl.CreateCallCount) 1128 } 1129 if p := len(fakePodControl.Patches); p != tc.expectedPodPatches { 1130 t.Errorf("Got %d pod patches, want %d", p, tc.expectedPodPatches) 1131 } 1132 }) 1133 } 1134 } 1135 1136 func checkIndexedJobPods(t *testing.T, control *controller.FakePodControl, wantIndexes sets.Set[int], jobName string, podIndexLabelDisabled bool) { 1137 t.Helper() 1138 gotIndexes := sets.New[int]() 1139 for _, p := range control.Templates { 1140 checkJobCompletionEnvVariable(t, &p.Spec, podIndexLabelDisabled) 1141 if !podIndexLabelDisabled { 1142 checkJobCompletionLabel(t, &p) 1143 } 1144 ix := getCompletionIndex(p.Annotations) 1145 if ix == -1 { 1146 t.Errorf("Created pod %s didn't have completion index", p.Name) 1147 } else { 1148 gotIndexes.Insert(ix) 1149 } 1150 expectedName := fmt.Sprintf("%s-%d", jobName, ix) 1151 if expectedName != p.Spec.Hostname { 1152 t.Errorf("Got pod hostname %s, want %s", p.Spec.Hostname, expectedName) 1153 } 1154 expectedName += "-" 1155 if expectedName != p.GenerateName { 1156 t.Errorf("Got pod generate name %s, want %s", p.GenerateName, expectedName) 1157 } 1158 } 1159 if diff := cmp.Diff(sets.List(wantIndexes), sets.List(gotIndexes)); diff != "" { 1160 t.Errorf("Unexpected created completion indexes (-want,+got):\n%s", diff) 1161 } 1162 } 1163 1164 func TestGetNewFinshedPods(t *testing.T) { 1165 cases := map[string]struct { 1166 job batch.Job 1167 pods []*v1.Pod 1168 expectedRmFinalizers sets.Set[string] 1169 wantSucceeded int32 1170 wantFailed int32 1171 }{ 1172 "some counted": { 1173 job: batch.Job{ 1174 Status: batch.JobStatus{ 1175 Succeeded: 2, 1176 Failed: 1, 1177 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1178 }, 1179 }, 1180 pods: []*v1.Pod{ 1181 buildPod().uid("a").phase(v1.PodSucceeded).Pod, 1182 buildPod().uid("b").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1183 buildPod().uid("c").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1184 buildPod().uid("d").phase(v1.PodFailed).Pod, 1185 buildPod().uid("e").phase(v1.PodFailed).trackingFinalizer().Pod, 1186 buildPod().uid("f").phase(v1.PodRunning).Pod, 1187 }, 1188 wantSucceeded: 4, 1189 wantFailed: 2, 1190 }, 1191 "some uncounted": { 1192 job: batch.Job{ 1193 Status: batch.JobStatus{ 1194 Succeeded: 1, 1195 Failed: 1, 1196 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1197 Succeeded: []types.UID{"a", "c"}, 1198 Failed: []types.UID{"e", "f"}, 1199 }, 1200 }, 1201 }, 1202 pods: []*v1.Pod{ 1203 buildPod().uid("a").phase(v1.PodSucceeded).Pod, 1204 buildPod().uid("b").phase(v1.PodSucceeded).Pod, 1205 buildPod().uid("c").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1206 buildPod().uid("d").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1207 buildPod().uid("e").phase(v1.PodFailed).Pod, 1208 buildPod().uid("f").phase(v1.PodFailed).trackingFinalizer().Pod, 1209 buildPod().uid("g").phase(v1.PodFailed).trackingFinalizer().Pod, 1210 }, 1211 wantSucceeded: 4, 1212 wantFailed: 4, 1213 }, 1214 "with expected removed finalizers": { 1215 job: batch.Job{ 1216 Status: batch.JobStatus{ 1217 Succeeded: 2, 1218 Failed: 2, 1219 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1220 Succeeded: []types.UID{"a"}, 1221 Failed: []types.UID{"d"}, 1222 }, 1223 }, 1224 }, 1225 expectedRmFinalizers: sets.New("b", "f"), 1226 pods: []*v1.Pod{ 1227 buildPod().uid("a").phase(v1.PodSucceeded).Pod, 1228 buildPod().uid("b").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1229 
				buildPod().uid("c").phase(v1.PodSucceeded).trackingFinalizer().Pod,
				buildPod().uid("d").phase(v1.PodFailed).Pod,
				buildPod().uid("e").phase(v1.PodFailed).trackingFinalizer().Pod,
				buildPod().uid("f").phase(v1.PodFailed).trackingFinalizer().Pod,
				buildPod().uid("g").phase(v1.PodFailed).trackingFinalizer().Pod,
			},
			wantSucceeded: 4,
			wantFailed:    5,
		},
		"deleted pods": {
			job: batch.Job{
				Status: batch.JobStatus{
					Succeeded:               1,
					Failed:                  1,
					UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
				},
			},
			pods: []*v1.Pod{
				buildPod().uid("a").phase(v1.PodSucceeded).trackingFinalizer().deletionTimestamp().Pod,
				buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().deletionTimestamp().Pod,
				buildPod().uid("c").phase(v1.PodRunning).trackingFinalizer().deletionTimestamp().Pod,
				buildPod().uid("d").phase(v1.PodPending).trackingFinalizer().deletionTimestamp().Pod,
				buildPod().uid("e").phase(v1.PodRunning).deletionTimestamp().Pod,
				buildPod().uid("f").phase(v1.PodPending).deletionTimestamp().Pod,
			},
			wantSucceeded: 2,
			wantFailed:    4,
		},
	}
	for name, tc := range cases {
		t.Run(name, func(t *testing.T) {
			uncounted := newUncountedTerminatedPods(*tc.job.Status.UncountedTerminatedPods)
			jobCtx := &syncJobCtx{job: &tc.job, pods: tc.pods, uncounted: uncounted, expectedRmFinalizers: tc.expectedRmFinalizers}
			succeededPods, failedPods := getNewFinishedPods(jobCtx)
			succeeded := int32(len(succeededPods)) + tc.job.Status.Succeeded + int32(len(uncounted.succeeded))
			failed := int32(len(failedPods)) + tc.job.Status.Failed + int32(len(uncounted.failed))
			if succeeded != tc.wantSucceeded {
				t.Errorf("getStatus reports %d succeeded pods, want %d", succeeded, tc.wantSucceeded)
			}
			if failed != tc.wantFailed {
				t.Errorf("getStatus reports %d failed pods, want %d", failed, tc.wantFailed)
			}
		})
	}
}

func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
	logger, ctx := ktesting.NewTestContext(t)
	succeededCond := newCondition(batch.JobComplete, v1.ConditionTrue, "", "", realClock.Now())
	failedCond := newCondition(batch.JobFailed, v1.ConditionTrue, "", "", realClock.Now())
	indexedCompletion := batch.IndexedCompletion
	mockErr := errors.New("mock error")
	cases := map[string]struct {
		job                     batch.Job
		pods                    []*v1.Pod
		finishedCond            *batch.JobCondition
		expectedRmFinalizers    sets.Set[string]
		needsFlush              bool
		statusUpdateErr         error
		podControlErr           error
		wantErr                 error
		wantRmFinalizers        int
		wantStatusUpdates       []batch.JobStatus
		wantSucceededPodsMetric int
		wantFailedPodsMetric    int

		// features
		enableJobBackoffLimitPerIndex bool
	}{
		"no updates": {},
		"new active": {
			job: batch.Job{
				Status: batch.JobStatus{
					Active: 1,
				},
			},
			needsFlush: true,
			wantStatusUpdates: []batch.JobStatus{
				{
					UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
					Active:                  1,
				},
			},
		},
		"track finished pods": {
			pods: []*v1.Pod{
				buildPod().uid("a").phase(v1.PodSucceeded).trackingFinalizer().Pod,
				buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().Pod,
				buildPod().uid("c").phase(v1.PodSucceeded).trackingFinalizer().deletionTimestamp().Pod,
				buildPod().uid("d").phase(v1.PodFailed).trackingFinalizer().deletionTimestamp().Pod,
buildPod().uid("e").phase(v1.PodPending).trackingFinalizer().deletionTimestamp().Pod, 1320 buildPod().phase(v1.PodPending).trackingFinalizer().Pod, 1321 buildPod().phase(v1.PodRunning).trackingFinalizer().Pod, 1322 }, 1323 wantRmFinalizers: 5, 1324 wantStatusUpdates: []batch.JobStatus{ 1325 { 1326 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1327 Succeeded: []types.UID{"a", "c"}, 1328 Failed: []types.UID{"b", "d", "e"}, 1329 }, 1330 }, 1331 { 1332 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1333 Succeeded: 2, 1334 Failed: 3, 1335 }, 1336 }, 1337 wantSucceededPodsMetric: 2, 1338 wantFailedPodsMetric: 3, 1339 }, 1340 "past and new finished pods": { 1341 job: batch.Job{ 1342 Status: batch.JobStatus{ 1343 Active: 1, 1344 Succeeded: 2, 1345 Failed: 3, 1346 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1347 Succeeded: []types.UID{"a", "e"}, 1348 Failed: []types.UID{"b", "f"}, 1349 }, 1350 }, 1351 }, 1352 pods: []*v1.Pod{ 1353 buildPod().uid("e").phase(v1.PodSucceeded).Pod, 1354 buildPod().phase(v1.PodFailed).Pod, 1355 buildPod().phase(v1.PodPending).Pod, 1356 buildPod().uid("a").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1357 buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().Pod, 1358 buildPod().uid("c").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1359 buildPod().uid("d").phase(v1.PodFailed).trackingFinalizer().Pod, 1360 }, 1361 wantRmFinalizers: 4, 1362 wantStatusUpdates: []batch.JobStatus{ 1363 { 1364 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1365 Succeeded: []types.UID{"a", "c"}, 1366 Failed: []types.UID{"b", "d"}, 1367 }, 1368 Active: 1, 1369 Succeeded: 3, 1370 Failed: 4, 1371 }, 1372 { 1373 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1374 Active: 1, 1375 Succeeded: 5, 1376 Failed: 6, 1377 }, 1378 }, 1379 wantSucceededPodsMetric: 3, 1380 wantFailedPodsMetric: 3, 1381 }, 1382 "expecting removed finalizers": { 1383 job: batch.Job{ 1384 Status: batch.JobStatus{ 1385 Succeeded: 2, 1386 Failed: 3, 1387 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1388 Succeeded: []types.UID{"a", "g"}, 1389 Failed: []types.UID{"b", "h"}, 1390 }, 1391 }, 1392 }, 1393 expectedRmFinalizers: sets.New("c", "d", "g", "h"), 1394 pods: []*v1.Pod{ 1395 buildPod().uid("a").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1396 buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().Pod, 1397 buildPod().uid("c").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1398 buildPod().uid("d").phase(v1.PodFailed).trackingFinalizer().Pod, 1399 buildPod().uid("e").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1400 buildPod().uid("f").phase(v1.PodFailed).trackingFinalizer().Pod, 1401 buildPod().uid("g").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1402 buildPod().uid("h").phase(v1.PodFailed).trackingFinalizer().Pod, 1403 }, 1404 wantRmFinalizers: 4, 1405 wantStatusUpdates: []batch.JobStatus{ 1406 { 1407 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1408 Succeeded: []types.UID{"a", "e"}, 1409 Failed: []types.UID{"b", "f"}, 1410 }, 1411 Succeeded: 3, 1412 Failed: 4, 1413 }, 1414 { 1415 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1416 Succeeded: 5, 1417 Failed: 6, 1418 }, 1419 }, 1420 wantSucceededPodsMetric: 3, 1421 wantFailedPodsMetric: 3, 1422 }, 1423 "succeeding job": { 1424 pods: []*v1.Pod{ 1425 buildPod().uid("a").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1426 buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().Pod, 1427 }, 1428 finishedCond: succeededCond, 1429 wantRmFinalizers: 2, 1430 
wantStatusUpdates: []batch.JobStatus{ 1431 { 1432 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1433 Succeeded: []types.UID{"a"}, 1434 Failed: []types.UID{"b"}, 1435 }, 1436 }, 1437 { 1438 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1439 Succeeded: 1, 1440 Failed: 1, 1441 Conditions: []batch.JobCondition{*succeededCond}, 1442 CompletionTime: &succeededCond.LastTransitionTime, 1443 }, 1444 }, 1445 wantSucceededPodsMetric: 1, 1446 wantFailedPodsMetric: 1, 1447 }, 1448 "failing job": { 1449 pods: []*v1.Pod{ 1450 buildPod().uid("a").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1451 buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().Pod, 1452 buildPod().uid("c").phase(v1.PodRunning).trackingFinalizer().Pod, 1453 }, 1454 finishedCond: failedCond, 1455 // Running pod counts as failed. 1456 wantRmFinalizers: 3, 1457 wantStatusUpdates: []batch.JobStatus{ 1458 { 1459 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1460 Succeeded: []types.UID{"a"}, 1461 Failed: []types.UID{"b", "c"}, 1462 }, 1463 }, 1464 { 1465 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1466 Succeeded: 1, 1467 Failed: 2, 1468 Conditions: []batch.JobCondition{*failedCond}, 1469 }, 1470 }, 1471 wantSucceededPodsMetric: 1, 1472 wantFailedPodsMetric: 2, 1473 }, 1474 "deleted job": { 1475 job: batch.Job{ 1476 ObjectMeta: metav1.ObjectMeta{ 1477 DeletionTimestamp: &metav1.Time{}, 1478 }, 1479 Status: batch.JobStatus{ 1480 Active: 1, 1481 }, 1482 }, 1483 pods: []*v1.Pod{ 1484 buildPod().uid("a").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1485 buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().Pod, 1486 buildPod().phase(v1.PodRunning).trackingFinalizer().Pod, 1487 }, 1488 // Removing finalizer from Running pod, but doesn't count as failed. 
1489 wantRmFinalizers: 3, 1490 wantStatusUpdates: []batch.JobStatus{ 1491 { 1492 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1493 Succeeded: []types.UID{"a"}, 1494 Failed: []types.UID{"b"}, 1495 }, 1496 Active: 1, 1497 }, 1498 { 1499 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1500 Active: 1, 1501 Succeeded: 1, 1502 Failed: 1, 1503 }, 1504 }, 1505 wantSucceededPodsMetric: 1, 1506 wantFailedPodsMetric: 1, 1507 }, 1508 "status update error": { 1509 pods: []*v1.Pod{ 1510 buildPod().uid("a").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1511 buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().Pod, 1512 }, 1513 statusUpdateErr: mockErr, 1514 wantErr: mockErr, 1515 wantStatusUpdates: []batch.JobStatus{ 1516 { 1517 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1518 Succeeded: []types.UID{"a"}, 1519 Failed: []types.UID{"b"}, 1520 }, 1521 }, 1522 }, 1523 }, 1524 "pod patch errors": { 1525 pods: []*v1.Pod{ 1526 buildPod().uid("a").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1527 buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().Pod, 1528 }, 1529 podControlErr: mockErr, 1530 wantErr: mockErr, 1531 wantRmFinalizers: 2, 1532 wantStatusUpdates: []batch.JobStatus{ 1533 { 1534 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1535 Succeeded: []types.UID{"a"}, 1536 Failed: []types.UID{"b"}, 1537 }, 1538 }, 1539 }, 1540 }, 1541 "pod patch errors with partial success": { 1542 job: batch.Job{ 1543 Status: batch.JobStatus{ 1544 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1545 Succeeded: []types.UID{"a"}, 1546 Failed: []types.UID{"b"}, 1547 }, 1548 }, 1549 }, 1550 pods: []*v1.Pod{ 1551 buildPod().uid("a").phase(v1.PodSucceeded).Pod, 1552 buildPod().uid("c").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1553 buildPod().uid("d").phase(v1.PodFailed).trackingFinalizer().Pod, 1554 }, 1555 podControlErr: mockErr, 1556 wantErr: mockErr, 1557 wantRmFinalizers: 2, 1558 wantStatusUpdates: []batch.JobStatus{ 1559 { 1560 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1561 Succeeded: []types.UID{"c"}, 1562 Failed: []types.UID{"d"}, 1563 }, 1564 Succeeded: 1, 1565 Failed: 1, 1566 }, 1567 }, 1568 }, 1569 "indexed job new successful pods": { 1570 job: batch.Job{ 1571 Spec: batch.JobSpec{ 1572 CompletionMode: &indexedCompletion, 1573 Completions: ptr.To[int32](6), 1574 }, 1575 Status: batch.JobStatus{ 1576 Active: 1, 1577 }, 1578 }, 1579 pods: []*v1.Pod{ 1580 buildPod().phase(v1.PodSucceeded).trackingFinalizer().index("1").Pod, 1581 buildPod().phase(v1.PodSucceeded).trackingFinalizer().index("3").Pod, 1582 buildPod().phase(v1.PodSucceeded).trackingFinalizer().index("3").Pod, 1583 buildPod().phase(v1.PodRunning).trackingFinalizer().index("5").Pod, 1584 buildPod().phase(v1.PodSucceeded).trackingFinalizer().Pod, 1585 }, 1586 wantRmFinalizers: 4, 1587 wantStatusUpdates: []batch.JobStatus{ 1588 { 1589 Active: 1, 1590 Succeeded: 2, 1591 CompletedIndexes: "1,3", 1592 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1593 }, 1594 }, 1595 wantSucceededPodsMetric: 2, 1596 }, 1597 "indexed job prev successful pods outside current completions index range with no new succeeded pods": { 1598 job: batch.Job{ 1599 Spec: batch.JobSpec{ 1600 CompletionMode: &indexedCompletion, 1601 Completions: ptr.To[int32](2), 1602 Parallelism: ptr.To[int32](2), 1603 }, 1604 Status: batch.JobStatus{ 1605 Active: 2, 1606 Succeeded: 1, 1607 CompletedIndexes: "3", 1608 }, 1609 }, 1610 pods: []*v1.Pod{ 1611 
buildPod().phase(v1.PodRunning).trackingFinalizer().index("0").Pod, 1612 buildPod().phase(v1.PodRunning).trackingFinalizer().index("1").Pod, 1613 }, 1614 wantRmFinalizers: 0, 1615 wantStatusUpdates: []batch.JobStatus{ 1616 { 1617 Active: 2, 1618 Succeeded: 0, 1619 CompletedIndexes: "", 1620 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1621 }, 1622 }, 1623 }, 1624 "indexed job prev successful pods outside current completions index range with new succeeded pods in range": { 1625 job: batch.Job{ 1626 Spec: batch.JobSpec{ 1627 CompletionMode: &indexedCompletion, 1628 Completions: ptr.To[int32](2), 1629 Parallelism: ptr.To[int32](2), 1630 }, 1631 Status: batch.JobStatus{ 1632 Active: 2, 1633 Succeeded: 1, 1634 CompletedIndexes: "3", 1635 }, 1636 }, 1637 pods: []*v1.Pod{ 1638 buildPod().phase(v1.PodRunning).trackingFinalizer().index("0").Pod, 1639 buildPod().phase(v1.PodSucceeded).trackingFinalizer().index("1").Pod, 1640 }, 1641 wantRmFinalizers: 1, 1642 wantStatusUpdates: []batch.JobStatus{ 1643 { 1644 Active: 2, 1645 Succeeded: 1, 1646 CompletedIndexes: "1", 1647 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1648 }, 1649 }, 1650 wantSucceededPodsMetric: 1, 1651 }, 1652 "indexed job new failed pods": { 1653 job: batch.Job{ 1654 Spec: batch.JobSpec{ 1655 CompletionMode: &indexedCompletion, 1656 Completions: ptr.To[int32](6), 1657 }, 1658 Status: batch.JobStatus{ 1659 Active: 1, 1660 }, 1661 }, 1662 pods: []*v1.Pod{ 1663 buildPod().uid("a").phase(v1.PodFailed).trackingFinalizer().index("1").Pod, 1664 buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().index("3").Pod, 1665 buildPod().uid("c").phase(v1.PodFailed).trackingFinalizer().index("3").Pod, 1666 buildPod().uid("d").phase(v1.PodRunning).trackingFinalizer().index("5").Pod, 1667 buildPod().phase(v1.PodFailed).trackingFinalizer().Pod, 1668 }, 1669 wantRmFinalizers: 4, 1670 wantStatusUpdates: []batch.JobStatus{ 1671 { 1672 Active: 1, 1673 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1674 Failed: []types.UID{"a", "b", "c"}, 1675 }, 1676 }, 1677 { 1678 Active: 1, 1679 Failed: 3, 1680 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1681 }, 1682 }, 1683 wantFailedPodsMetric: 3, 1684 }, 1685 "indexed job past and new pods": { 1686 job: batch.Job{ 1687 Spec: batch.JobSpec{ 1688 CompletionMode: &indexedCompletion, 1689 Completions: ptr.To[int32](7), 1690 }, 1691 Status: batch.JobStatus{ 1692 Failed: 2, 1693 Succeeded: 5, 1694 CompletedIndexes: "0-2,4,6,7", 1695 }, 1696 }, 1697 pods: []*v1.Pod{ 1698 buildPod().phase(v1.PodSucceeded).index("0").Pod, 1699 buildPod().phase(v1.PodFailed).index("1").Pod, 1700 buildPod().phase(v1.PodSucceeded).trackingFinalizer().index("1").Pod, 1701 buildPod().phase(v1.PodSucceeded).trackingFinalizer().index("3").Pod, 1702 buildPod().uid("a").phase(v1.PodFailed).trackingFinalizer().index("2").Pod, 1703 buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().index("5").Pod, 1704 }, 1705 wantRmFinalizers: 4, 1706 wantStatusUpdates: []batch.JobStatus{ 1707 { 1708 Succeeded: 6, 1709 Failed: 2, 1710 CompletedIndexes: "0-4,6", 1711 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1712 Failed: []types.UID{"a", "b"}, 1713 }, 1714 }, 1715 { 1716 Succeeded: 6, 1717 Failed: 4, 1718 CompletedIndexes: "0-4,6", 1719 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1720 }, 1721 }, 1722 wantSucceededPodsMetric: 1, 1723 wantFailedPodsMetric: 2, 1724 }, 1725 "too many finished": { 1726 job: batch.Job{ 1727 Status: batch.JobStatus{ 1728 UncountedTerminatedPods: 
&batch.UncountedTerminatedPods{ 1729 Failed: []types.UID{"a", "b"}, 1730 }, 1731 }, 1732 }, 1733 pods: func() []*v1.Pod { 1734 pods := make([]*v1.Pod, 500) 1735 for i := range pods { 1736 pods[i] = buildPod().uid(strconv.Itoa(i)).phase(v1.PodSucceeded).trackingFinalizer().Pod 1737 } 1738 pods = append(pods, buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().Pod) 1739 return pods 1740 }(), 1741 wantRmFinalizers: 499, 1742 wantStatusUpdates: []batch.JobStatus{ 1743 { 1744 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1745 Succeeded: func() []types.UID { 1746 uids := make([]types.UID, 499) 1747 for i := range uids { 1748 uids[i] = types.UID(strconv.Itoa(i)) 1749 } 1750 return uids 1751 }(), 1752 Failed: []types.UID{"b"}, 1753 }, 1754 Failed: 1, 1755 }, 1756 { 1757 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1758 Failed: []types.UID{"b"}, 1759 }, 1760 Succeeded: 499, 1761 Failed: 1, 1762 }, 1763 }, 1764 wantSucceededPodsMetric: 499, 1765 wantFailedPodsMetric: 1, 1766 }, 1767 "too many indexed finished": { 1768 job: batch.Job{ 1769 Spec: batch.JobSpec{ 1770 CompletionMode: &indexedCompletion, 1771 Completions: ptr.To[int32](501), 1772 }, 1773 }, 1774 pods: func() []*v1.Pod { 1775 pods := make([]*v1.Pod, 501) 1776 for i := range pods { 1777 pods[i] = buildPod().uid(strconv.Itoa(i)).index(strconv.Itoa(i)).phase(v1.PodSucceeded).trackingFinalizer().Pod 1778 } 1779 return pods 1780 }(), 1781 wantRmFinalizers: 500, 1782 wantStatusUpdates: []batch.JobStatus{ 1783 { 1784 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1785 CompletedIndexes: "0-499", 1786 Succeeded: 500, 1787 }, 1788 }, 1789 wantSucceededPodsMetric: 500, 1790 }, 1791 "pod flips from failed to succeeded": { 1792 job: batch.Job{ 1793 Spec: batch.JobSpec{ 1794 Completions: ptr.To[int32](2), 1795 Parallelism: ptr.To[int32](2), 1796 }, 1797 Status: batch.JobStatus{ 1798 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1799 Failed: []types.UID{"a", "b"}, 1800 }, 1801 }, 1802 }, 1803 pods: []*v1.Pod{ 1804 buildPod().uid("a").phase(v1.PodFailed).trackingFinalizer().Pod, 1805 buildPod().uid("b").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1806 }, 1807 finishedCond: failedCond, 1808 wantRmFinalizers: 2, 1809 wantStatusUpdates: []batch.JobStatus{ 1810 { 1811 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1812 Failed: 2, 1813 Conditions: []batch.JobCondition{*failedCond}, 1814 }, 1815 }, 1816 wantFailedPodsMetric: 2, 1817 }, 1818 "indexed job with a failed pod with delayed finalizer removal; the pod is not counted": { 1819 enableJobBackoffLimitPerIndex: true, 1820 job: batch.Job{ 1821 Spec: batch.JobSpec{ 1822 CompletionMode: &indexedCompletion, 1823 Completions: ptr.To[int32](6), 1824 BackoffLimitPerIndex: ptr.To[int32](1), 1825 }, 1826 }, 1827 pods: []*v1.Pod{ 1828 buildPod().uid("a").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().index("1").Pod, 1829 }, 1830 wantStatusUpdates: []batch.JobStatus{ 1831 { 1832 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1833 FailedIndexes: ptr.To(""), 1834 }, 1835 }, 1836 }, 1837 "indexed job with a failed pod which is recreated by a running pod; the pod is counted": { 1838 enableJobBackoffLimitPerIndex: true, 1839 job: batch.Job{ 1840 Spec: batch.JobSpec{ 1841 CompletionMode: &indexedCompletion, 1842 Completions: ptr.To[int32](6), 1843 BackoffLimitPerIndex: ptr.To[int32](1), 1844 }, 1845 Status: batch.JobStatus{ 1846 Active: 1, 1847 }, 1848 }, 1849 pods: []*v1.Pod{ 1850 
buildPod().uid("a1").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().index("1").Pod, 1851 buildPod().uid("a2").phase(v1.PodRunning).indexFailureCount("1").trackingFinalizer().index("1").Pod, 1852 }, 1853 wantRmFinalizers: 1, 1854 wantStatusUpdates: []batch.JobStatus{ 1855 { 1856 Active: 1, 1857 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1858 Failed: []types.UID{"a1"}, 1859 }, 1860 FailedIndexes: ptr.To(""), 1861 }, 1862 { 1863 Active: 1, 1864 Failed: 1, 1865 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1866 FailedIndexes: ptr.To(""), 1867 }, 1868 }, 1869 wantFailedPodsMetric: 1, 1870 }, 1871 "indexed job with a failed pod for a failed index; the pod is counted": { 1872 enableJobBackoffLimitPerIndex: true, 1873 job: batch.Job{ 1874 Spec: batch.JobSpec{ 1875 CompletionMode: &indexedCompletion, 1876 Completions: ptr.To[int32](6), 1877 BackoffLimitPerIndex: ptr.To[int32](1), 1878 }, 1879 }, 1880 pods: []*v1.Pod{ 1881 buildPod().uid("a").phase(v1.PodFailed).indexFailureCount("1").trackingFinalizer().index("1").Pod, 1882 }, 1883 wantRmFinalizers: 1, 1884 wantStatusUpdates: []batch.JobStatus{ 1885 { 1886 FailedIndexes: ptr.To("1"), 1887 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1888 Failed: []types.UID{"a"}, 1889 }, 1890 }, 1891 { 1892 Failed: 1, 1893 FailedIndexes: ptr.To("1"), 1894 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1895 }, 1896 }, 1897 wantFailedPodsMetric: 1, 1898 }, 1899 } 1900 for name, tc := range cases { 1901 t.Run(name, func(t *testing.T) { 1902 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, tc.enableJobBackoffLimitPerIndex)() 1903 clientSet := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 1904 manager, _ := newControllerFromClient(ctx, t, clientSet, controller.NoResyncPeriodFunc) 1905 fakePodControl := controller.FakePodControl{Err: tc.podControlErr} 1906 metrics.JobPodsFinished.Reset() 1907 manager.podControl = &fakePodControl 1908 var statusUpdates []batch.JobStatus 1909 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 1910 statusUpdates = append(statusUpdates, *job.Status.DeepCopy()) 1911 return job, tc.statusUpdateErr 1912 } 1913 job := tc.job.DeepCopy() 1914 if job.Status.UncountedTerminatedPods == nil { 1915 job.Status.UncountedTerminatedPods = &batch.UncountedTerminatedPods{} 1916 } 1917 jobCtx := &syncJobCtx{ 1918 job: job, 1919 pods: tc.pods, 1920 uncounted: newUncountedTerminatedPods(*job.Status.UncountedTerminatedPods), 1921 expectedRmFinalizers: tc.expectedRmFinalizers, 1922 finishedCondition: tc.finishedCond, 1923 } 1924 if isIndexedJob(job) { 1925 jobCtx.succeededIndexes = parseIndexesFromString(logger, job.Status.CompletedIndexes, int(*job.Spec.Completions)) 1926 if tc.enableJobBackoffLimitPerIndex && job.Spec.BackoffLimitPerIndex != nil { 1927 jobCtx.failedIndexes = calculateFailedIndexes(logger, job, tc.pods) 1928 jobCtx.activePods = controller.FilterActivePods(logger, tc.pods) 1929 jobCtx.podsWithDelayedDeletionPerIndex = getPodsWithDelayedDeletionPerIndex(logger, jobCtx) 1930 } 1931 } 1932 1933 err := manager.trackJobStatusAndRemoveFinalizers(ctx, jobCtx, tc.needsFlush) 1934 if !errors.Is(err, tc.wantErr) { 1935 t.Errorf("Got error %v, want %v", err, tc.wantErr) 1936 } 1937 if diff := cmp.Diff(tc.wantStatusUpdates, statusUpdates, 
cmpopts.IgnoreFields(batch.JobCondition{}, "LastProbeTime", "LastTransitionTime")); diff != "" { 1938 t.Errorf("Unexpected status updates (-want,+got):\n%s", diff) 1939 } 1940 rmFinalizers := len(fakePodControl.Patches) 1941 if rmFinalizers != tc.wantRmFinalizers { 1942 t.Errorf("Removed %d finalizers, want %d", rmFinalizers, tc.wantRmFinalizers) 1943 } 1944 if tc.wantErr == nil { 1945 completionMode := completionModeStr(job) 1946 v, err := metricstestutil.GetCounterMetricValue(metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Succeeded)) 1947 if err != nil { 1948 t.Fatalf("Obtaining succeeded job_pods_finished_total: %v", err) 1949 } 1950 if float64(tc.wantSucceededPodsMetric) != v { 1951 t.Errorf("Metric reports %.0f succeeded pods, want %d", v, tc.wantSucceededPodsMetric) 1952 } 1953 v, err = metricstestutil.GetCounterMetricValue(metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Failed)) 1954 if err != nil { 1955 t.Fatalf("Obtaining failed job_pods_finished_total: %v", err) 1956 } 1957 if float64(tc.wantFailedPodsMetric) != v { 1958 t.Errorf("Metric reports %.0f failed pods, want %d", v, tc.wantFailedPodsMetric) 1959 } 1960 } 1961 }) 1962 } 1963 } 1964 1965 // TestSyncJobPastDeadline verifies tracking of the active deadline in a single syncJob call. 1966 func TestSyncJobPastDeadline(t *testing.T) { 1967 _, ctx := ktesting.NewTestContext(t) 1968 testCases := map[string]struct { 1969 // job setup 1970 parallelism int32 1971 completions int32 1972 activeDeadlineSeconds int64 1973 startTime int64 1974 backoffLimit int32 1975 suspend bool 1976 1977 // pod setup 1978 activePods int 1979 succeededPods int 1980 failedPods int 1981 1982 // expectations 1983 expectedDeletions int32 1984 expectedActive int32 1985 expectedSucceeded int32 1986 expectedFailed int32 1987 expectedCondition batch.JobConditionType 1988 expectedConditionReason string 1989 }{ 1990 "activeDeadlineSeconds less than single pod execution": { 1991 parallelism: 1, 1992 completions: 1, 1993 activeDeadlineSeconds: 10, 1994 startTime: 15, 1995 backoffLimit: 6, 1996 activePods: 1, 1997 expectedDeletions: 1, 1998 expectedFailed: 1, 1999 expectedCondition: batch.JobFailed, 2000 expectedConditionReason: batch.JobReasonDeadlineExceeded, 2001 }, 2002 "activeDeadlineSeconds longer than single pod execution": { 2003 parallelism: 1, 2004 completions: 2, 2005 activeDeadlineSeconds: 10, 2006 startTime: 15, 2007 backoffLimit: 6, 2008 activePods: 1, 2009 succeededPods: 1, 2010 expectedDeletions: 1, 2011 expectedSucceeded: 1, 2012 expectedFailed: 1, 2013 expectedCondition: batch.JobFailed, 2014 expectedConditionReason: batch.JobReasonDeadlineExceeded, 2015 }, 2016 "activeDeadlineSeconds times out before any pod starts": { 2017 parallelism: 1, 2018 completions: 1, 2019 activeDeadlineSeconds: 10, 2020 startTime: 10, 2021 backoffLimit: 6, 2022 expectedCondition: batch.JobFailed, 2023 expectedConditionReason: batch.JobReasonDeadlineExceeded, 2024 }, 2025 "activeDeadlineSeconds with backoffLimit reached": { 2026 parallelism: 1, 2027 completions: 1, 2028 activeDeadlineSeconds: 1, 2029 startTime: 10, 2030 failedPods: 1, 2031 expectedFailed: 1, 2032 expectedCondition: batch.JobFailed, 2033 expectedConditionReason: batch.JobReasonBackoffLimitExceeded, 2034 }, 2035 "activeDeadlineSeconds is not triggered when Job is suspended": { 2036 suspend: true, 2037 parallelism: 1, 2038 completions: 2, 2039 activeDeadlineSeconds: 10, 2040 startTime: 15, 2041 backoffLimit: 6, 2042 expectedCondition: batch.JobSuspended, 2043
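// With spec.suspend set, the active deadline is not enforced; this case expects a
// JobSuspended condition (with the reason "JobSuspended" just below) rather than
// DeadlineExceeded.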
expectedConditionReason: "JobSuspended", 2044 }, 2045 } 2046 2047 for name, tc := range testCases { 2048 t.Run(name, func(t *testing.T) { 2049 // job manager setup 2050 clientSet := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 2051 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientSet, controller.NoResyncPeriodFunc) 2052 fakePodControl := controller.FakePodControl{} 2053 manager.podControl = &fakePodControl 2054 manager.podStoreSynced = alwaysReady 2055 manager.jobStoreSynced = alwaysReady 2056 var actual *batch.Job 2057 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 2058 actual = job 2059 return job, nil 2060 } 2061 2062 // job & pods setup 2063 job := newJob(tc.parallelism, tc.completions, tc.backoffLimit, batch.NonIndexedCompletion) 2064 job.Spec.ActiveDeadlineSeconds = &tc.activeDeadlineSeconds 2065 job.Spec.Suspend = ptr.To(tc.suspend) 2066 start := metav1.Unix(metav1.Now().Time.Unix()-tc.startTime, 0) 2067 job.Status.StartTime = &start 2068 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 2069 podIndexer := sharedInformerFactory.Core().V1().Pods().Informer().GetIndexer() 2070 setPodsStatuses(podIndexer, job, 0, tc.activePods, tc.succeededPods, tc.failedPods, 0, 0) 2071 2072 // run 2073 err := manager.syncJob(context.TODO(), testutil.GetKey(job, t)) 2074 if err != nil { 2075 t.Errorf("Unexpected error when syncing jobs %v", err) 2076 } 2077 // validate created/deleted pods 2078 if int32(len(fakePodControl.Templates)) != 0 { 2079 t.Errorf("Unexpected number of creates. Expected 0, saw %d\n", len(fakePodControl.Templates)) 2080 } 2081 if int32(len(fakePodControl.DeletePodName)) != tc.expectedDeletions { 2082 t.Errorf("Unexpected number of deletes. Expected %d, saw %d\n", tc.expectedDeletions, len(fakePodControl.DeletePodName)) 2083 } 2084 // validate status 2085 if actual.Status.Active != tc.expectedActive { 2086 t.Errorf("Unexpected number of active pods. Expected %d, saw %d\n", tc.expectedActive, actual.Status.Active) 2087 } 2088 if actual.Status.Succeeded != tc.expectedSucceeded { 2089 t.Errorf("Unexpected number of succeeded pods. Expected %d, saw %d\n", tc.expectedSucceeded, actual.Status.Succeeded) 2090 } 2091 if actual.Status.Failed != tc.expectedFailed { 2092 t.Errorf("Unexpected number of failed pods. Expected %d, saw %d\n", tc.expectedFailed, actual.Status.Failed) 2093 } 2094 if actual.Status.StartTime == nil { 2095 t.Error("Missing .status.startTime") 2096 } 2097 // validate conditions 2098 if !getCondition(actual, tc.expectedCondition, v1.ConditionTrue, tc.expectedConditionReason) { 2099 t.Errorf("Expected fail condition. Got %#v", actual.Status.Conditions) 2100 } 2101 }) 2102 } 2103 } 2104 2105 func getCondition(job *batch.Job, condition batch.JobConditionType, status v1.ConditionStatus, reason string) bool { 2106 for _, v := range job.Status.Conditions { 2107 if v.Type == condition && v.Status == status && v.Reason == reason { 2108 return true 2109 } 2110 } 2111 return false 2112 } 2113 2114 func hasTrueCondition(job *batch.Job) *batch.JobConditionType { 2115 for _, v := range job.Status.Conditions { 2116 if v.Status == v1.ConditionTrue { 2117 return &v.Type 2118 } 2119 } 2120 return nil 2121 } 2122 2123 // TestPastDeadlineJobFinished ensures that a Job is correctly tracked until 2124 // reaching the active deadline, at which point it is marked as Failed. 
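// The test drives time with a fake clock instead of waiting in real time. A minimal
// sketch of the interaction it relies on (illustrative only; it assumes that
// clocktesting.FakeClock.Sleep steps the fake time forward rather than blocking, and
// that Since measures against that fake time):
//
//	fakeClock := clocktesting.NewFakeClock(time.Now())
//	start := fakeClock.Now()
//	fakeClock.Sleep(2 * time.Second)  // advances fake time by 2s without blocking
//	elapsed := fakeClock.Since(start) // ~2s, enough to exceed a 1s activeDeadlineSeconds
//	_ = elapsed
//
// Each failed poll below calls manager.clock.Sleep, which is what eventually pushes the
// Job past its deadline so the controller can add the DeadlineExceeded condition.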
2125 func TestPastDeadlineJobFinished(t *testing.T) { 2126 _, ctx := ktesting.NewTestContext(t) 2127 clientset := fake.NewSimpleClientset() 2128 fakeClock := clocktesting.NewFakeClock(time.Now().Truncate(time.Second)) 2129 manager, sharedInformerFactory := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClock) 2130 manager.podStoreSynced = alwaysReady 2131 manager.jobStoreSynced = alwaysReady 2132 manager.expectations = FakeJobExpectations{ 2133 controller.NewControllerExpectations(), true, func() { 2134 }, 2135 } 2136 ctx, cancel := context.WithCancel(context.Background()) 2137 defer cancel() 2138 sharedInformerFactory.Start(ctx.Done()) 2139 sharedInformerFactory.WaitForCacheSync(ctx.Done()) 2140 2141 go manager.Run(ctx, 1) 2142 2143 tests := []struct { 2144 name string 2145 setStartTime bool 2146 jobName string 2147 }{ 2148 { 2149 name: "New job created without start time being set", 2150 setStartTime: false, 2151 jobName: "job1", 2152 }, 2153 { 2154 name: "New job created with start time being set", 2155 setStartTime: true, 2156 jobName: "job2", 2157 }, 2158 } 2159 for _, tc := range tests { 2160 t.Run(tc.name, func(t *testing.T) { 2161 job := newJobWithName(tc.jobName, 1, 1, 6, batch.NonIndexedCompletion) 2162 job.Spec.ActiveDeadlineSeconds = ptr.To[int64](1) 2163 if tc.setStartTime { 2164 start := metav1.NewTime(fakeClock.Now()) 2165 job.Status.StartTime = &start 2166 } 2167 2168 _, err := clientset.BatchV1().Jobs(job.GetNamespace()).Create(ctx, job, metav1.CreateOptions{}) 2169 if err != nil { 2170 t.Errorf("Could not create Job: %v", err) 2171 } 2172 2173 var j *batch.Job 2174 err = wait.PollUntilContextTimeout(ctx, 200*time.Microsecond, 3*time.Second, true, func(ctx context.Context) (done bool, err error) { 2175 j, err = clientset.BatchV1().Jobs(metav1.NamespaceDefault).Get(ctx, job.GetName(), metav1.GetOptions{}) 2176 if err != nil { 2177 return false, err 2178 } 2179 return j.Status.StartTime != nil, nil 2180 }) 2181 if err != nil { 2182 t.Errorf("Job failed to ensure that start time was set: %v", err) 2183 } 2184 err = wait.PollUntilContextTimeout(ctx, 100*time.Millisecond, 3*time.Second, false, func(ctx context.Context) (done bool, err error) { 2185 j, err = clientset.BatchV1().Jobs(metav1.NamespaceDefault).Get(ctx, job.GetName(), metav1.GetOptions{}) 2186 if err != nil { 2187 return false, nil 2188 } 2189 if getCondition(j, batch.JobFailed, v1.ConditionTrue, batch.JobReasonDeadlineExceeded) { 2190 if manager.clock.Since(j.Status.StartTime.Time) < time.Duration(*j.Spec.ActiveDeadlineSeconds)*time.Second { 2191 return true, errors.New("Job contains DeadlineExceeded condition earlier than expected") 2192 } 2193 return true, nil 2194 } 2195 manager.clock.Sleep(100 * time.Millisecond) 2196 return false, nil 2197 }) 2198 if err != nil { 2199 t.Errorf("Job failed to enforce activeDeadlineSeconds configuration. 
Expected condition with Reason 'DeadlineExceeded' was not found in %v", j.Status) 2200 } 2201 }) 2202 } 2203 } 2204 2205 func TestSingleJobFailedCondition(t *testing.T) { 2206 _, ctx := ktesting.NewTestContext(t) 2207 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 2208 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 2209 fakePodControl := controller.FakePodControl{} 2210 manager.podControl = &fakePodControl 2211 manager.podStoreSynced = alwaysReady 2212 manager.jobStoreSynced = alwaysReady 2213 var actual *batch.Job 2214 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 2215 actual = job 2216 return job, nil 2217 } 2218 2219 job := newJob(1, 1, 6, batch.NonIndexedCompletion) 2220 job.Spec.ActiveDeadlineSeconds = ptr.To[int64](10) 2221 start := metav1.Unix(metav1.Now().Time.Unix()-15, 0) 2222 job.Status.StartTime = &start 2223 job.Status.Conditions = append(job.Status.Conditions, *newCondition(batch.JobFailed, v1.ConditionFalse, "DeadlineExceeded", "Job was active longer than specified deadline", realClock.Now())) 2224 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 2225 err := manager.syncJob(context.TODO(), testutil.GetKey(job, t)) 2226 if err != nil { 2227 t.Errorf("Unexpected error when syncing jobs %v", err) 2228 } 2229 if len(fakePodControl.DeletePodName) != 0 { 2230 t.Errorf("Unexpected number of deletes. Expected %d, saw %d\n", 0, len(fakePodControl.DeletePodName)) 2231 } 2232 if actual == nil { 2233 t.Error("Expected job modification\n") 2234 } 2235 failedConditions := getConditionsByType(actual.Status.Conditions, batch.JobFailed) 2236 if len(failedConditions) != 1 { 2237 t.Error("Unexpected number of failed conditions\n") 2238 } 2239 if failedConditions[0].Status != v1.ConditionTrue { 2240 t.Errorf("Unexpected status for the failed condition. Expected: %v, saw %v\n", v1.ConditionTrue, failedConditions[0].Status) 2241 } 2242 2243 } 2244 2245 func TestSyncJobComplete(t *testing.T) { 2246 _, ctx := ktesting.NewTestContext(t) 2247 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 2248 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 2249 fakePodControl := controller.FakePodControl{} 2250 manager.podControl = &fakePodControl 2251 manager.podStoreSynced = alwaysReady 2252 manager.jobStoreSynced = alwaysReady 2253 2254 job := newJob(1, 1, 6, batch.NonIndexedCompletion) 2255 job.Status.Conditions = append(job.Status.Conditions, *newCondition(batch.JobComplete, v1.ConditionTrue, "", "", realClock.Now())) 2256 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 2257 err := manager.syncJob(context.TODO(), testutil.GetKey(job, t)) 2258 if err != nil { 2259 t.Fatalf("Unexpected error when syncing jobs %v", err) 2260 } 2261 actual, err := manager.jobLister.Jobs(job.Namespace).Get(job.Name) 2262 if err != nil { 2263 t.Fatalf("Unexpected error when trying to get job from the store: %v", err) 2264 } 2265 // Verify that after syncing a complete job, the conditions are the same. 
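// A Job that already carries a JobComplete condition with status True is treated as
// finished, so syncJob is expected to leave the status untouched rather than re-evaluate
// pods; the single pre-set condition should therefore remain the only one present.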
2266 if got, expected := len(actual.Status.Conditions), 1; got != expected { 2267 t.Fatalf("Unexpected job status conditions amount; expected %d, got %d", expected, got) 2268 } 2269 } 2270 2271 func TestSyncJobDeleted(t *testing.T) { 2272 _, ctx := ktesting.NewTestContext(t) 2273 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 2274 manager, _ := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 2275 fakePodControl := controller.FakePodControl{} 2276 manager.podControl = &fakePodControl 2277 manager.podStoreSynced = alwaysReady 2278 manager.jobStoreSynced = alwaysReady 2279 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 2280 return job, nil 2281 } 2282 job := newJob(2, 2, 6, batch.NonIndexedCompletion) 2283 err := manager.syncJob(context.TODO(), testutil.GetKey(job, t)) 2284 if err != nil { 2285 t.Errorf("Unexpected error when syncing jobs %v", err) 2286 } 2287 if len(fakePodControl.Templates) != 0 { 2288 t.Errorf("Unexpected number of creates. Expected %d, saw %d\n", 0, len(fakePodControl.Templates)) 2289 } 2290 if len(fakePodControl.DeletePodName) != 0 { 2291 t.Errorf("Unexpected number of deletes. Expected %d, saw %d\n", 0, len(fakePodControl.DeletePodName)) 2292 } 2293 } 2294 2295 func TestSyncJobWithJobPodFailurePolicy(t *testing.T) { 2296 _, ctx := ktesting.NewTestContext(t) 2297 now := metav1.Now() 2298 indexedCompletionMode := batch.IndexedCompletion 2299 validObjectMeta := metav1.ObjectMeta{ 2300 Name: "foobar", 2301 UID: uuid.NewUUID(), 2302 Namespace: metav1.NamespaceDefault, 2303 } 2304 validSelector := &metav1.LabelSelector{ 2305 MatchLabels: map[string]string{"foo": "bar"}, 2306 } 2307 validTemplate := v1.PodTemplateSpec{ 2308 ObjectMeta: metav1.ObjectMeta{ 2309 Labels: map[string]string{ 2310 "foo": "bar", 2311 }, 2312 }, 2313 Spec: v1.PodSpec{ 2314 Containers: []v1.Container{ 2315 {Image: "foo/bar"}, 2316 }, 2317 }, 2318 } 2319 2320 onExitCodeRules := []batch.PodFailurePolicyRule{ 2321 { 2322 Action: batch.PodFailurePolicyActionIgnore, 2323 OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ 2324 Operator: batch.PodFailurePolicyOnExitCodesOpIn, 2325 Values: []int32{1, 2, 3}, 2326 }, 2327 }, 2328 { 2329 Action: batch.PodFailurePolicyActionFailJob, 2330 OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ 2331 Operator: batch.PodFailurePolicyOnExitCodesOpIn, 2332 Values: []int32{5, 6, 7}, 2333 }, 2334 }, 2335 } 2336 2337 testCases := map[string]struct { 2338 enableJobPodFailurePolicy bool 2339 enablePodDisruptionConditions bool 2340 enableJobPodReplacementPolicy bool 2341 job batch.Job 2342 pods []v1.Pod 2343 wantConditions *[]batch.JobCondition 2344 wantStatusFailed int32 2345 wantStatusActive int32 2346 wantStatusSucceeded int32 2347 wantStatusTerminating *int32 2348 }{ 2349 "default handling for pod failure if the container matching the exit codes does not match the containerName restriction": { 2350 enableJobPodFailurePolicy: true, 2351 job: batch.Job{ 2352 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2353 ObjectMeta: validObjectMeta, 2354 Spec: batch.JobSpec{ 2355 Selector: validSelector, 2356 Template: validTemplate, 2357 Parallelism: ptr.To[int32](1), 2358 Completions: ptr.To[int32](1), 2359 BackoffLimit: ptr.To[int32](6), 2360 PodFailurePolicy: &batch.PodFailurePolicy{ 2361 Rules: []batch.PodFailurePolicyRule{ 2362 { 2363 Action: batch.PodFailurePolicyActionIgnore, 
2364 OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ 2365 ContainerName: ptr.To("main-container"), 2366 Operator: batch.PodFailurePolicyOnExitCodesOpIn, 2367 Values: []int32{1, 2, 3}, 2368 }, 2369 }, 2370 { 2371 Action: batch.PodFailurePolicyActionFailJob, 2372 OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ 2373 ContainerName: ptr.To("main-container"), 2374 Operator: batch.PodFailurePolicyOnExitCodesOpIn, 2375 Values: []int32{5, 6, 7}, 2376 }, 2377 }, 2378 }, 2379 }, 2380 }, 2381 }, 2382 pods: []v1.Pod{ 2383 { 2384 Status: v1.PodStatus{ 2385 Phase: v1.PodFailed, 2386 ContainerStatuses: []v1.ContainerStatus{ 2387 { 2388 Name: "monitoring-container", 2389 State: v1.ContainerState{ 2390 Terminated: &v1.ContainerStateTerminated{ 2391 ExitCode: 5, 2392 }, 2393 }, 2394 }, 2395 { 2396 Name: "main-container", 2397 State: v1.ContainerState{ 2398 Terminated: &v1.ContainerStateTerminated{ 2399 ExitCode: 42, 2400 FinishedAt: testFinishedAt, 2401 }, 2402 }, 2403 }, 2404 }, 2405 }, 2406 }, 2407 }, 2408 wantConditions: nil, 2409 wantStatusActive: 1, 2410 wantStatusSucceeded: 0, 2411 wantStatusFailed: 1, 2412 }, 2413 "running pod should not result in job fail based on OnExitCodes": { 2414 enableJobPodFailurePolicy: true, 2415 job: batch.Job{ 2416 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2417 ObjectMeta: validObjectMeta, 2418 Spec: batch.JobSpec{ 2419 Selector: validSelector, 2420 Template: validTemplate, 2421 Parallelism: ptr.To[int32](1), 2422 Completions: ptr.To[int32](1), 2423 BackoffLimit: ptr.To[int32](6), 2424 PodFailurePolicy: &batch.PodFailurePolicy{ 2425 Rules: onExitCodeRules, 2426 }, 2427 }, 2428 }, 2429 pods: []v1.Pod{ 2430 { 2431 Status: v1.PodStatus{ 2432 Phase: v1.PodRunning, 2433 ContainerStatuses: []v1.ContainerStatus{ 2434 { 2435 Name: "main-container", 2436 State: v1.ContainerState{ 2437 Terminated: &v1.ContainerStateTerminated{ 2438 ExitCode: 5, 2439 }, 2440 }, 2441 }, 2442 }, 2443 }, 2444 }, 2445 }, 2446 wantConditions: nil, 2447 wantStatusActive: 1, 2448 wantStatusFailed: 0, 2449 wantStatusSucceeded: 0, 2450 }, 2451 "fail job based on OnExitCodes": { 2452 enableJobPodFailurePolicy: true, 2453 job: batch.Job{ 2454 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2455 ObjectMeta: validObjectMeta, 2456 Spec: batch.JobSpec{ 2457 Selector: validSelector, 2458 Template: validTemplate, 2459 Parallelism: ptr.To[int32](1), 2460 Completions: ptr.To[int32](1), 2461 BackoffLimit: ptr.To[int32](6), 2462 PodFailurePolicy: &batch.PodFailurePolicy{ 2463 Rules: onExitCodeRules, 2464 }, 2465 }, 2466 }, 2467 pods: []v1.Pod{ 2468 { 2469 Status: v1.PodStatus{ 2470 Phase: v1.PodFailed, 2471 ContainerStatuses: []v1.ContainerStatus{ 2472 { 2473 Name: "main-container", 2474 State: v1.ContainerState{ 2475 Terminated: &v1.ContainerStateTerminated{ 2476 ExitCode: 5, 2477 }, 2478 }, 2479 }, 2480 }, 2481 }, 2482 }, 2483 }, 2484 wantConditions: &[]batch.JobCondition{ 2485 { 2486 Type: batch.JobFailed, 2487 Status: v1.ConditionTrue, 2488 Reason: batch.JobReasonPodFailurePolicy, 2489 Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1", 2490 }, 2491 }, 2492 wantStatusActive: 0, 2493 wantStatusFailed: 1, 2494 wantStatusSucceeded: 0, 2495 }, 2496 "job marked already as failure target with failed pod": { 2497 enableJobPodFailurePolicy: true, 2498 job: batch.Job{ 2499 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2500 ObjectMeta: validObjectMeta, 2501 Spec: batch.JobSpec{ 2502 Selector: validSelector, 2503 Template: validTemplate, 2504 Parallelism: 
ptr.To[int32](1), 2505 Completions: ptr.To[int32](1), 2506 BackoffLimit: ptr.To[int32](6), 2507 PodFailurePolicy: &batch.PodFailurePolicy{ 2508 Rules: onExitCodeRules, 2509 }, 2510 }, 2511 Status: batch.JobStatus{ 2512 Conditions: []batch.JobCondition{ 2513 { 2514 Type: batch.JobFailureTarget, 2515 Status: v1.ConditionTrue, 2516 Reason: batch.JobReasonPodFailurePolicy, 2517 Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1", 2518 }, 2519 }, 2520 }, 2521 }, 2522 pods: []v1.Pod{ 2523 { 2524 Status: v1.PodStatus{ 2525 Phase: v1.PodFailed, 2526 ContainerStatuses: []v1.ContainerStatus{ 2527 { 2528 Name: "main-container", 2529 State: v1.ContainerState{ 2530 Terminated: &v1.ContainerStateTerminated{ 2531 ExitCode: 5, 2532 }, 2533 }, 2534 }, 2535 }, 2536 }, 2537 }, 2538 }, 2539 wantConditions: &[]batch.JobCondition{ 2540 { 2541 Type: batch.JobFailed, 2542 Status: v1.ConditionTrue, 2543 Reason: batch.JobReasonPodFailurePolicy, 2544 Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1", 2545 }, 2546 }, 2547 wantStatusActive: 0, 2548 wantStatusFailed: 1, 2549 wantStatusSucceeded: 0, 2550 }, 2551 "job marked already as failure target with failed pod, message based on already deleted pod": { 2552 enableJobPodFailurePolicy: true, 2553 job: batch.Job{ 2554 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2555 ObjectMeta: validObjectMeta, 2556 Spec: batch.JobSpec{ 2557 Selector: validSelector, 2558 Template: validTemplate, 2559 Parallelism: ptr.To[int32](1), 2560 Completions: ptr.To[int32](1), 2561 BackoffLimit: ptr.To[int32](6), 2562 PodFailurePolicy: &batch.PodFailurePolicy{ 2563 Rules: onExitCodeRules, 2564 }, 2565 }, 2566 Status: batch.JobStatus{ 2567 Conditions: []batch.JobCondition{ 2568 { 2569 Type: batch.JobFailureTarget, 2570 Status: v1.ConditionTrue, 2571 Reason: batch.JobReasonPodFailurePolicy, 2572 Message: "Container main-container for pod default/already-deleted-pod failed with exit code 5 matching FailJob rule at index 1", 2573 }, 2574 }, 2575 }, 2576 }, 2577 pods: []v1.Pod{ 2578 { 2579 Status: v1.PodStatus{ 2580 Phase: v1.PodFailed, 2581 ContainerStatuses: []v1.ContainerStatus{ 2582 { 2583 Name: "main-container", 2584 State: v1.ContainerState{ 2585 Terminated: &v1.ContainerStateTerminated{ 2586 ExitCode: 5, 2587 }, 2588 }, 2589 }, 2590 }, 2591 }, 2592 }, 2593 }, 2594 wantConditions: &[]batch.JobCondition{ 2595 { 2596 Type: batch.JobFailed, 2597 Status: v1.ConditionTrue, 2598 Reason: batch.JobReasonPodFailurePolicy, 2599 Message: "Container main-container for pod default/already-deleted-pod failed with exit code 5 matching FailJob rule at index 1", 2600 }, 2601 }, 2602 wantStatusActive: 0, 2603 wantStatusFailed: 1, 2604 wantStatusSucceeded: 0, 2605 }, 2606 "default handling for a failed pod when the feature is disabled even, despite matching rule": { 2607 enableJobPodFailurePolicy: false, 2608 job: batch.Job{ 2609 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2610 ObjectMeta: validObjectMeta, 2611 Spec: batch.JobSpec{ 2612 Selector: validSelector, 2613 Template: validTemplate, 2614 Parallelism: ptr.To[int32](1), 2615 Completions: ptr.To[int32](1), 2616 BackoffLimit: ptr.To[int32](6), 2617 PodFailurePolicy: &batch.PodFailurePolicy{ 2618 Rules: onExitCodeRules, 2619 }, 2620 }, 2621 }, 2622 pods: []v1.Pod{ 2623 { 2624 Status: v1.PodStatus{ 2625 Phase: v1.PodFailed, 2626 ContainerStatuses: []v1.ContainerStatus{ 2627 { 2628 Name: "main-container", 2629 State: v1.ContainerState{ 2630 
Terminated: &v1.ContainerStateTerminated{ 2631 ExitCode: 5, 2632 FinishedAt: testFinishedAt, 2633 }, 2634 }, 2635 }, 2636 }, 2637 }, 2638 }, 2639 }, 2640 wantConditions: nil, 2641 wantStatusActive: 1, 2642 wantStatusFailed: 1, 2643 wantStatusSucceeded: 0, 2644 }, 2645 "fail job with multiple pods": { 2646 enableJobPodFailurePolicy: true, 2647 job: batch.Job{ 2648 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2649 ObjectMeta: validObjectMeta, 2650 Spec: batch.JobSpec{ 2651 Selector: validSelector, 2652 Template: validTemplate, 2653 Parallelism: ptr.To[int32](2), 2654 Completions: ptr.To[int32](2), 2655 BackoffLimit: ptr.To[int32](6), 2656 PodFailurePolicy: &batch.PodFailurePolicy{ 2657 Rules: onExitCodeRules, 2658 }, 2659 }, 2660 }, 2661 pods: []v1.Pod{ 2662 { 2663 Status: v1.PodStatus{ 2664 Phase: v1.PodRunning, 2665 }, 2666 }, 2667 { 2668 Status: v1.PodStatus{ 2669 Phase: v1.PodFailed, 2670 ContainerStatuses: []v1.ContainerStatus{ 2671 { 2672 Name: "main-container", 2673 State: v1.ContainerState{ 2674 Terminated: &v1.ContainerStateTerminated{ 2675 ExitCode: 5, 2676 }, 2677 }, 2678 }, 2679 }, 2680 }, 2681 }, 2682 }, 2683 wantConditions: &[]batch.JobCondition{ 2684 { 2685 Type: batch.JobFailed, 2686 Status: v1.ConditionTrue, 2687 Reason: batch.JobReasonPodFailurePolicy, 2688 Message: "Container main-container for pod default/mypod-1 failed with exit code 5 matching FailJob rule at index 1", 2689 }, 2690 }, 2691 wantStatusActive: 0, 2692 wantStatusFailed: 2, 2693 wantStatusSucceeded: 0, 2694 }, 2695 "fail indexed job based on OnExitCodes": { 2696 enableJobPodFailurePolicy: true, 2697 job: batch.Job{ 2698 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2699 ObjectMeta: validObjectMeta, 2700 Spec: batch.JobSpec{ 2701 Selector: validSelector, 2702 Template: validTemplate, 2703 CompletionMode: &indexedCompletionMode, 2704 Parallelism: ptr.To[int32](1), 2705 Completions: ptr.To[int32](1), 2706 BackoffLimit: ptr.To[int32](6), 2707 PodFailurePolicy: &batch.PodFailurePolicy{ 2708 Rules: onExitCodeRules, 2709 }, 2710 }, 2711 }, 2712 pods: []v1.Pod{ 2713 { 2714 Status: v1.PodStatus{ 2715 Phase: v1.PodFailed, 2716 ContainerStatuses: []v1.ContainerStatus{ 2717 { 2718 Name: "main-container", 2719 State: v1.ContainerState{ 2720 Terminated: &v1.ContainerStateTerminated{ 2721 ExitCode: 5, 2722 }, 2723 }, 2724 }, 2725 }, 2726 }, 2727 }, 2728 }, 2729 wantConditions: &[]batch.JobCondition{ 2730 { 2731 Type: batch.JobFailed, 2732 Status: v1.ConditionTrue, 2733 Reason: batch.JobReasonPodFailurePolicy, 2734 Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1", 2735 }, 2736 }, 2737 wantStatusActive: 0, 2738 wantStatusFailed: 1, 2739 wantStatusSucceeded: 0, 2740 }, 2741 "fail job based on OnExitCodes with NotIn operator": { 2742 enableJobPodFailurePolicy: true, 2743 job: batch.Job{ 2744 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2745 ObjectMeta: validObjectMeta, 2746 Spec: batch.JobSpec{ 2747 Selector: validSelector, 2748 Template: validTemplate, 2749 Parallelism: ptr.To[int32](1), 2750 Completions: ptr.To[int32](1), 2751 BackoffLimit: ptr.To[int32](6), 2752 PodFailurePolicy: &batch.PodFailurePolicy{ 2753 Rules: []batch.PodFailurePolicyRule{ 2754 { 2755 Action: batch.PodFailurePolicyActionFailJob, 2756 OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ 2757 Operator: batch.PodFailurePolicyOnExitCodesOpNotIn, 2758 Values: []int32{5, 6, 7}, 2759 }, 2760 }, 2761 }, 2762 }, 2763 }, 2764 }, 2765 pods: []v1.Pod{ 2766 { 2767 Status: v1.PodStatus{ 2768 Phase: 
v1.PodFailed, 2769 ContainerStatuses: []v1.ContainerStatus{ 2770 { 2771 Name: "main-container", 2772 State: v1.ContainerState{ 2773 Terminated: &v1.ContainerStateTerminated{ 2774 ExitCode: 42, 2775 }, 2776 }, 2777 }, 2778 }, 2779 }, 2780 }, 2781 }, 2782 wantConditions: &[]batch.JobCondition{ 2783 { 2784 Type: batch.JobFailed, 2785 Status: v1.ConditionTrue, 2786 Reason: batch.JobReasonPodFailurePolicy, 2787 Message: "Container main-container for pod default/mypod-0 failed with exit code 42 matching FailJob rule at index 0", 2788 }, 2789 }, 2790 wantStatusActive: 0, 2791 wantStatusFailed: 1, 2792 wantStatusSucceeded: 0, 2793 }, 2794 "default handling job based on OnExitCodes with NotIn operator": { 2795 enableJobPodFailurePolicy: true, 2796 job: batch.Job{ 2797 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2798 ObjectMeta: validObjectMeta, 2799 Spec: batch.JobSpec{ 2800 Selector: validSelector, 2801 Template: validTemplate, 2802 Parallelism: ptr.To[int32](1), 2803 Completions: ptr.To[int32](1), 2804 BackoffLimit: ptr.To[int32](6), 2805 PodFailurePolicy: &batch.PodFailurePolicy{ 2806 Rules: []batch.PodFailurePolicyRule{ 2807 { 2808 Action: batch.PodFailurePolicyActionFailJob, 2809 OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ 2810 Operator: batch.PodFailurePolicyOnExitCodesOpNotIn, 2811 Values: []int32{5, 6, 7}, 2812 }, 2813 }, 2814 }, 2815 }, 2816 }, 2817 }, 2818 pods: []v1.Pod{ 2819 { 2820 Status: v1.PodStatus{ 2821 Phase: v1.PodFailed, 2822 ContainerStatuses: []v1.ContainerStatus{ 2823 { 2824 Name: "main-container", 2825 State: v1.ContainerState{ 2826 Terminated: &v1.ContainerStateTerminated{ 2827 ExitCode: 5, 2828 FinishedAt: testFinishedAt, 2829 }, 2830 }, 2831 }, 2832 }, 2833 }, 2834 }, 2835 }, 2836 wantConditions: nil, 2837 wantStatusActive: 1, 2838 wantStatusFailed: 1, 2839 wantStatusSucceeded: 0, 2840 }, 2841 "fail job based on OnExitCodes for InitContainer": { 2842 enableJobPodFailurePolicy: true, 2843 job: batch.Job{ 2844 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2845 ObjectMeta: validObjectMeta, 2846 Spec: batch.JobSpec{ 2847 Selector: validSelector, 2848 Template: validTemplate, 2849 Parallelism: ptr.To[int32](1), 2850 Completions: ptr.To[int32](1), 2851 BackoffLimit: ptr.To[int32](6), 2852 PodFailurePolicy: &batch.PodFailurePolicy{ 2853 Rules: onExitCodeRules, 2854 }, 2855 }, 2856 }, 2857 pods: []v1.Pod{ 2858 { 2859 Status: v1.PodStatus{ 2860 Phase: v1.PodFailed, 2861 InitContainerStatuses: []v1.ContainerStatus{ 2862 { 2863 Name: "init-container", 2864 State: v1.ContainerState{ 2865 Terminated: &v1.ContainerStateTerminated{ 2866 ExitCode: 5, 2867 }, 2868 }, 2869 }, 2870 }, 2871 ContainerStatuses: []v1.ContainerStatus{ 2872 { 2873 Name: "main-container", 2874 State: v1.ContainerState{ 2875 Terminated: &v1.ContainerStateTerminated{ 2876 ExitCode: 143, 2877 }, 2878 }, 2879 }, 2880 }, 2881 }, 2882 }, 2883 }, 2884 wantConditions: &[]batch.JobCondition{ 2885 { 2886 Type: batch.JobFailed, 2887 Status: v1.ConditionTrue, 2888 Reason: batch.JobReasonPodFailurePolicy, 2889 Message: "Container init-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1", 2890 }, 2891 }, 2892 wantStatusActive: 0, 2893 wantStatusFailed: 1, 2894 wantStatusSucceeded: 0, 2895 }, 2896 "ignore pod failure; both rules are matching, the first is executed only": { 2897 enableJobPodFailurePolicy: true, 2898 job: batch.Job{ 2899 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2900 ObjectMeta: validObjectMeta, 2901 Spec: batch.JobSpec{ 2902 Selector: validSelector, 2903 Template: 
validTemplate, 2904 Parallelism: ptr.To[int32](1), 2905 Completions: ptr.To[int32](1), 2906 BackoffLimit: ptr.To[int32](0), 2907 PodFailurePolicy: &batch.PodFailurePolicy{ 2908 Rules: onExitCodeRules, 2909 }, 2910 }, 2911 }, 2912 pods: []v1.Pod{ 2913 { 2914 Status: v1.PodStatus{ 2915 Phase: v1.PodFailed, 2916 ContainerStatuses: []v1.ContainerStatus{ 2917 { 2918 Name: "container1", 2919 State: v1.ContainerState{ 2920 Terminated: &v1.ContainerStateTerminated{ 2921 ExitCode: 2, 2922 }, 2923 }, 2924 }, 2925 { 2926 Name: "container2", 2927 State: v1.ContainerState{ 2928 Terminated: &v1.ContainerStateTerminated{ 2929 ExitCode: 6, 2930 }, 2931 }, 2932 }, 2933 }, 2934 }, 2935 }, 2936 }, 2937 wantConditions: nil, 2938 wantStatusActive: 1, 2939 wantStatusFailed: 0, 2940 wantStatusSucceeded: 0, 2941 }, 2942 "ignore pod failure based on OnExitCodes": { 2943 enableJobPodFailurePolicy: true, 2944 job: batch.Job{ 2945 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2946 ObjectMeta: validObjectMeta, 2947 Spec: batch.JobSpec{ 2948 Selector: validSelector, 2949 Template: validTemplate, 2950 Parallelism: ptr.To[int32](1), 2951 Completions: ptr.To[int32](1), 2952 BackoffLimit: ptr.To[int32](0), 2953 PodFailurePolicy: &batch.PodFailurePolicy{ 2954 Rules: onExitCodeRules, 2955 }, 2956 }, 2957 }, 2958 pods: []v1.Pod{ 2959 { 2960 Status: v1.PodStatus{ 2961 Phase: v1.PodFailed, 2962 ContainerStatuses: []v1.ContainerStatus{ 2963 { 2964 State: v1.ContainerState{ 2965 Terminated: &v1.ContainerStateTerminated{ 2966 ExitCode: 1, 2967 }, 2968 }, 2969 }, 2970 }, 2971 }, 2972 }, 2973 }, 2974 wantConditions: nil, 2975 wantStatusActive: 1, 2976 wantStatusFailed: 0, 2977 wantStatusSucceeded: 0, 2978 }, 2979 "default job based on OnExitCodes": { 2980 enableJobPodFailurePolicy: true, 2981 job: batch.Job{ 2982 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2983 ObjectMeta: validObjectMeta, 2984 Spec: batch.JobSpec{ 2985 Selector: validSelector, 2986 Template: validTemplate, 2987 Parallelism: ptr.To[int32](1), 2988 Completions: ptr.To[int32](1), 2989 BackoffLimit: ptr.To[int32](0), 2990 PodFailurePolicy: &batch.PodFailurePolicy{ 2991 Rules: onExitCodeRules, 2992 }, 2993 }, 2994 }, 2995 pods: []v1.Pod{ 2996 { 2997 Status: v1.PodStatus{ 2998 Phase: v1.PodFailed, 2999 ContainerStatuses: []v1.ContainerStatus{ 3000 { 3001 State: v1.ContainerState{ 3002 Terminated: &v1.ContainerStateTerminated{ 3003 ExitCode: 10, 3004 }, 3005 }, 3006 }, 3007 }, 3008 }, 3009 }, 3010 }, 3011 wantConditions: &[]batch.JobCondition{ 3012 { 3013 Type: batch.JobFailed, 3014 Status: v1.ConditionTrue, 3015 Reason: batch.JobReasonBackoffLimitExceeded, 3016 Message: "Job has reached the specified backoff limit", 3017 }, 3018 }, 3019 wantStatusActive: 0, 3020 wantStatusFailed: 1, 3021 wantStatusSucceeded: 0, 3022 }, 3023 "count pod failure based on OnExitCodes; both rules are matching, the first is executed only": { 3024 enableJobPodFailurePolicy: true, 3025 job: batch.Job{ 3026 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3027 ObjectMeta: validObjectMeta, 3028 Spec: batch.JobSpec{ 3029 Selector: validSelector, 3030 Template: validTemplate, 3031 Parallelism: ptr.To[int32](1), 3032 Completions: ptr.To[int32](1), 3033 BackoffLimit: ptr.To[int32](6), 3034 PodFailurePolicy: &batch.PodFailurePolicy{ 3035 Rules: []batch.PodFailurePolicyRule{ 3036 { 3037 Action: batch.PodFailurePolicyActionCount, 3038 OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ 3039 Operator: batch.PodFailurePolicyOnExitCodesOpIn, 3040 Values: []int32{1, 2}, 3041 }, 3042 }, 3043 { 3044 Action: 
batch.PodFailurePolicyActionIgnore, 3045 OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ 3046 Operator: batch.PodFailurePolicyOnExitCodesOpIn, 3047 Values: []int32{2, 3}, 3048 }, 3049 }, 3050 }, 3051 }, 3052 }, 3053 }, 3054 pods: []v1.Pod{ 3055 { 3056 Status: v1.PodStatus{ 3057 Phase: v1.PodFailed, 3058 ContainerStatuses: []v1.ContainerStatus{ 3059 { 3060 State: v1.ContainerState{ 3061 Terminated: &v1.ContainerStateTerminated{ 3062 ExitCode: 2, 3063 FinishedAt: testFinishedAt, 3064 }, 3065 }, 3066 }, 3067 }, 3068 }, 3069 }, 3070 }, 3071 wantConditions: nil, 3072 wantStatusActive: 1, 3073 wantStatusFailed: 1, 3074 wantStatusSucceeded: 0, 3075 }, 3076 "count pod failure based on OnPodConditions; both rules are matching, the first is executed only": { 3077 enableJobPodFailurePolicy: true, 3078 job: batch.Job{ 3079 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3080 ObjectMeta: validObjectMeta, 3081 Spec: batch.JobSpec{ 3082 Selector: validSelector, 3083 Template: validTemplate, 3084 Parallelism: ptr.To[int32](1), 3085 Completions: ptr.To[int32](1), 3086 BackoffLimit: ptr.To[int32](6), 3087 PodFailurePolicy: &batch.PodFailurePolicy{ 3088 Rules: []batch.PodFailurePolicyRule{ 3089 { 3090 Action: batch.PodFailurePolicyActionCount, 3091 OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{ 3092 { 3093 Type: v1.PodConditionType("ResourceLimitExceeded"), 3094 Status: v1.ConditionTrue, 3095 }, 3096 }, 3097 }, 3098 { 3099 Action: batch.PodFailurePolicyActionIgnore, 3100 OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{ 3101 { 3102 Type: v1.DisruptionTarget, 3103 Status: v1.ConditionTrue, 3104 }, 3105 }, 3106 }, 3107 }, 3108 }, 3109 }, 3110 }, 3111 pods: []v1.Pod{ 3112 { 3113 Status: v1.PodStatus{ 3114 Phase: v1.PodFailed, 3115 Conditions: []v1.PodCondition{ 3116 { 3117 Type: v1.PodConditionType("ResourceLimitExceeded"), 3118 Status: v1.ConditionTrue, 3119 }, 3120 { 3121 Type: v1.DisruptionTarget, 3122 Status: v1.ConditionTrue, 3123 }, 3124 }, 3125 ContainerStatuses: []v1.ContainerStatus{ 3126 { 3127 State: v1.ContainerState{ 3128 Terminated: &v1.ContainerStateTerminated{ 3129 FinishedAt: testFinishedAt, 3130 }, 3131 }, 3132 }, 3133 }, 3134 }, 3135 }, 3136 }, 3137 wantConditions: nil, 3138 wantStatusActive: 1, 3139 wantStatusFailed: 1, 3140 wantStatusSucceeded: 0, 3141 }, 3142 "ignore pod failure based on OnPodConditions": { 3143 enableJobPodFailurePolicy: true, 3144 job: batch.Job{ 3145 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3146 ObjectMeta: validObjectMeta, 3147 Spec: batch.JobSpec{ 3148 Selector: validSelector, 3149 Template: validTemplate, 3150 Parallelism: ptr.To[int32](1), 3151 Completions: ptr.To[int32](1), 3152 BackoffLimit: ptr.To[int32](0), 3153 PodFailurePolicy: &batch.PodFailurePolicy{ 3154 Rules: []batch.PodFailurePolicyRule{ 3155 { 3156 Action: batch.PodFailurePolicyActionIgnore, 3157 OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{ 3158 { 3159 Type: v1.DisruptionTarget, 3160 Status: v1.ConditionTrue, 3161 }, 3162 }, 3163 }, 3164 }, 3165 }, 3166 }, 3167 }, 3168 pods: []v1.Pod{ 3169 { 3170 Status: v1.PodStatus{ 3171 Phase: v1.PodFailed, 3172 Conditions: []v1.PodCondition{ 3173 { 3174 Type: v1.DisruptionTarget, 3175 Status: v1.ConditionTrue, 3176 }, 3177 }, 3178 }, 3179 }, 3180 }, 3181 wantConditions: nil, 3182 wantStatusActive: 1, 3183 wantStatusFailed: 0, 3184 wantStatusSucceeded: 0, 3185 }, 3186 "ignore pod failure based on OnPodConditions, ignored failures delays pod recreation": { 3187 enableJobPodFailurePolicy: true, 3188 job: batch.Job{ 
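// In this case the failed pod is still terminating (DeletionTimestamp is set) and its
// failure matches an Ignore rule, so it is neither counted as failed nor replaced yet;
// that is why both Active and Failed are expected to stay at 0 until the pod is gone.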
3189 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3190 ObjectMeta: validObjectMeta, 3191 Spec: batch.JobSpec{ 3192 Selector: validSelector, 3193 Template: validTemplate, 3194 Parallelism: ptr.To[int32](1), 3195 Completions: ptr.To[int32](1), 3196 BackoffLimit: ptr.To[int32](0), 3197 PodFailurePolicy: &batch.PodFailurePolicy{ 3198 Rules: []batch.PodFailurePolicyRule{ 3199 { 3200 Action: batch.PodFailurePolicyActionIgnore, 3201 OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{ 3202 { 3203 Type: v1.DisruptionTarget, 3204 Status: v1.ConditionTrue, 3205 }, 3206 }, 3207 }, 3208 }, 3209 }, 3210 }, 3211 }, 3212 pods: []v1.Pod{ 3213 { 3214 ObjectMeta: metav1.ObjectMeta{ 3215 DeletionTimestamp: &now, 3216 }, 3217 Status: v1.PodStatus{ 3218 Phase: v1.PodFailed, 3219 Conditions: []v1.PodCondition{ 3220 { 3221 Type: v1.DisruptionTarget, 3222 Status: v1.ConditionTrue, 3223 }, 3224 }, 3225 }, 3226 }, 3227 }, 3228 wantConditions: nil, 3229 wantStatusActive: 0, 3230 wantStatusFailed: 0, 3231 wantStatusSucceeded: 0, 3232 }, 3233 "fail job based on OnPodConditions": { 3234 enableJobPodFailurePolicy: true, 3235 job: batch.Job{ 3236 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3237 ObjectMeta: validObjectMeta, 3238 Spec: batch.JobSpec{ 3239 Selector: validSelector, 3240 Template: validTemplate, 3241 Parallelism: ptr.To[int32](1), 3242 Completions: ptr.To[int32](1), 3243 BackoffLimit: ptr.To[int32](6), 3244 PodFailurePolicy: &batch.PodFailurePolicy{ 3245 Rules: []batch.PodFailurePolicyRule{ 3246 { 3247 Action: batch.PodFailurePolicyActionFailJob, 3248 OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{ 3249 { 3250 Type: v1.DisruptionTarget, 3251 Status: v1.ConditionTrue, 3252 }, 3253 }, 3254 }, 3255 }, 3256 }, 3257 }, 3258 }, 3259 pods: []v1.Pod{ 3260 { 3261 Status: v1.PodStatus{ 3262 Phase: v1.PodFailed, 3263 Conditions: []v1.PodCondition{ 3264 { 3265 Type: v1.DisruptionTarget, 3266 Status: v1.ConditionTrue, 3267 }, 3268 }, 3269 }, 3270 }, 3271 }, 3272 wantConditions: &[]batch.JobCondition{ 3273 { 3274 Type: batch.JobFailed, 3275 Status: v1.ConditionTrue, 3276 Reason: batch.JobReasonPodFailurePolicy, 3277 Message: "Pod default/mypod-0 has condition DisruptionTarget matching FailJob rule at index 0", 3278 }, 3279 }, 3280 wantStatusActive: 0, 3281 wantStatusFailed: 1, 3282 wantStatusSucceeded: 0, 3283 }, 3284 "terminating Pod considered failed when PodDisruptionConditions is disabled": { 3285 enableJobPodFailurePolicy: true, 3286 job: batch.Job{ 3287 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3288 ObjectMeta: validObjectMeta, 3289 Spec: batch.JobSpec{ 3290 Parallelism: ptr.To[int32](1), 3291 Selector: validSelector, 3292 Template: validTemplate, 3293 BackoffLimit: ptr.To[int32](0), 3294 PodFailurePolicy: &batch.PodFailurePolicy{ 3295 Rules: []batch.PodFailurePolicyRule{ 3296 { 3297 Action: batch.PodFailurePolicyActionCount, 3298 OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{ 3299 { 3300 Type: v1.DisruptionTarget, 3301 Status: v1.ConditionTrue, 3302 }, 3303 }, 3304 }, 3305 }, 3306 }, 3307 }, 3308 }, 3309 pods: []v1.Pod{ 3310 { 3311 ObjectMeta: metav1.ObjectMeta{ 3312 DeletionTimestamp: &now, 3313 }, 3314 }, 3315 }, 3316 }, 3317 "terminating Pod not considered failed when PodDisruptionConditions is enabled": { 3318 enableJobPodFailurePolicy: true, 3319 enablePodDisruptionConditions: true, 3320 job: batch.Job{ 3321 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3322 ObjectMeta: validObjectMeta, 3323 Spec: batch.JobSpec{ 3324 Parallelism: ptr.To[int32](1), 3325 Selector: validSelector, 3326 
Template: validTemplate, 3327 BackoffLimit: ptr.To[int32](0), 3328 PodFailurePolicy: &batch.PodFailurePolicy{ 3329 Rules: []batch.PodFailurePolicyRule{ 3330 { 3331 Action: batch.PodFailurePolicyActionCount, 3332 OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{ 3333 { 3334 Type: v1.DisruptionTarget, 3335 Status: v1.ConditionTrue, 3336 }, 3337 }, 3338 }, 3339 }, 3340 }, 3341 }, 3342 }, 3343 pods: []v1.Pod{ 3344 { 3345 ObjectMeta: metav1.ObjectMeta{ 3346 DeletionTimestamp: &now, 3347 }, 3348 Status: v1.PodStatus{ 3349 Phase: v1.PodRunning, 3350 }, 3351 }, 3352 }, 3353 }, 3354 } 3355 for name, tc := range testCases { 3356 t.Run(name, func(t *testing.T) { 3357 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, tc.enableJobPodFailurePolicy)() 3358 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.PodDisruptionConditions, tc.enablePodDisruptionConditions)() 3359 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodReplacementPolicy, tc.enableJobPodReplacementPolicy)() 3360 3361 if tc.job.Spec.PodReplacementPolicy == nil { 3362 tc.job.Spec.PodReplacementPolicy = podReplacementPolicy(batch.Failed) 3363 } 3364 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 3365 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 3366 fakePodControl := controller.FakePodControl{} 3367 manager.podControl = &fakePodControl 3368 manager.podStoreSynced = alwaysReady 3369 manager.jobStoreSynced = alwaysReady 3370 job := &tc.job 3371 3372 actual := job 3373 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 3374 actual = job 3375 return job, nil 3376 } 3377 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 3378 for i, pod := range tc.pods { 3379 pod := pod 3380 pb := podBuilder{Pod: &pod}.name(fmt.Sprintf("mypod-%d", i)).job(job) 3381 if job.Spec.CompletionMode != nil && *job.Spec.CompletionMode == batch.IndexedCompletion { 3382 pb.index(fmt.Sprintf("%v", i)) 3383 } 3384 pb = pb.trackingFinalizer() 3385 sharedInformerFactory.Core().V1().Pods().Informer().GetIndexer().Add(pb.Pod) 3386 } 3387 3388 manager.syncJob(context.TODO(), testutil.GetKey(job, t)) 3389 3390 if tc.wantConditions != nil { 3391 for _, wantCondition := range *tc.wantConditions { 3392 conditions := getConditionsByType(actual.Status.Conditions, wantCondition.Type) 3393 if len(conditions) != 1 { 3394 t.Fatalf("Expected a single completion condition. Got %#v for type: %q", conditions, wantCondition.Type) 3395 } 3396 condition := *conditions[0] 3397 if diff := cmp.Diff(wantCondition, condition, cmpopts.IgnoreFields(batch.JobCondition{}, "LastProbeTime", "LastTransitionTime")); diff != "" { 3398 t.Errorf("Unexpected job condition (-want,+got):\n%s", diff) 3399 } 3400 } 3401 } else { 3402 if cond := hasTrueCondition(actual); cond != nil { 3403 t.Errorf("Got condition %s, want none", *cond) 3404 } 3405 } 3406 // validate status 3407 if actual.Status.Active != tc.wantStatusActive { 3408 t.Errorf("unexpected number of active pods. Expected %d, saw %d\n", tc.wantStatusActive, actual.Status.Active) 3409 } 3410 if actual.Status.Succeeded != tc.wantStatusSucceeded { 3411 t.Errorf("unexpected number of succeeded pods. 
Expected %d, saw %d\n", tc.wantStatusSucceeded, actual.Status.Succeeded) 3412 } 3413 if actual.Status.Failed != tc.wantStatusFailed { 3414 t.Errorf("unexpected number of failed pods. Expected %d, saw %d\n", tc.wantStatusFailed, actual.Status.Failed) 3415 } 3416 if ptr.Deref(actual.Status.Terminating, 0) != ptr.Deref(tc.wantStatusTerminating, 0) { 3417 t.Errorf("unexpected number of terminating pods. Expected %d, saw %d\n", ptr.Deref(tc.wantStatusTerminating, 0), ptr.Deref(actual.Status.Terminating, 0)) 3418 } 3419 }) 3420 } 3421 } 3422 3423 func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) { 3424 _, ctx := ktesting.NewTestContext(t) 3425 now := time.Now() 3426 validObjectMeta := metav1.ObjectMeta{ 3427 Name: "foobar", 3428 UID: uuid.NewUUID(), 3429 Namespace: metav1.NamespaceDefault, 3430 } 3431 validSelector := &metav1.LabelSelector{ 3432 MatchLabels: map[string]string{"foo": "bar"}, 3433 } 3434 validTemplate := v1.PodTemplateSpec{ 3435 ObjectMeta: metav1.ObjectMeta{ 3436 Labels: map[string]string{ 3437 "foo": "bar", 3438 }, 3439 }, 3440 Spec: v1.PodSpec{ 3441 Containers: []v1.Container{ 3442 {Image: "foo/bar"}, 3443 }, 3444 }, 3445 } 3446 3447 testCases := map[string]struct { 3448 enableJobBackoffLimitPerIndex bool 3449 enableJobPodFailurePolicy bool 3450 job batch.Job 3451 pods []v1.Pod 3452 wantStatus batch.JobStatus 3453 }{ 3454 "successful job after a single failure within index": { 3455 enableJobBackoffLimitPerIndex: true, 3456 job: batch.Job{ 3457 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3458 ObjectMeta: validObjectMeta, 3459 Spec: batch.JobSpec{ 3460 Selector: validSelector, 3461 Template: validTemplate, 3462 Parallelism: ptr.To[int32](2), 3463 Completions: ptr.To[int32](2), 3464 BackoffLimit: ptr.To[int32](math.MaxInt32), 3465 CompletionMode: completionModePtr(batch.IndexedCompletion), 3466 BackoffLimitPerIndex: ptr.To[int32](1), 3467 }, 3468 }, 3469 pods: []v1.Pod{ 3470 *buildPod().uid("a1").index("0").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().Pod, 3471 *buildPod().uid("a2").index("0").phase(v1.PodSucceeded).indexFailureCount("1").trackingFinalizer().Pod, 3472 *buildPod().uid("b").index("1").phase(v1.PodSucceeded).indexFailureCount("0").trackingFinalizer().Pod, 3473 }, 3474 wantStatus: batch.JobStatus{ 3475 Failed: 1, 3476 Succeeded: 2, 3477 Terminating: ptr.To[int32](0), 3478 CompletedIndexes: "0,1", 3479 FailedIndexes: ptr.To(""), 3480 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 3481 Conditions: []batch.JobCondition{ 3482 { 3483 Type: batch.JobComplete, 3484 Status: v1.ConditionTrue, 3485 }, 3486 }, 3487 }, 3488 }, 3489 "single failed pod, not counted as the replacement pod creation is delayed": { 3490 enableJobBackoffLimitPerIndex: true, 3491 job: batch.Job{ 3492 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3493 ObjectMeta: validObjectMeta, 3494 Spec: batch.JobSpec{ 3495 Selector: validSelector, 3496 Template: validTemplate, 3497 Parallelism: ptr.To[int32](2), 3498 Completions: ptr.To[int32](2), 3499 BackoffLimit: ptr.To[int32](math.MaxInt32), 3500 CompletionMode: completionModePtr(batch.IndexedCompletion), 3501 BackoffLimitPerIndex: ptr.To[int32](1), 3502 }, 3503 }, 3504 pods: []v1.Pod{ 3505 *buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().Pod, 3506 }, 3507 wantStatus: batch.JobStatus{ 3508 Active: 2, 3509 Terminating: ptr.To[int32](0), 3510 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 3511 FailedIndexes: ptr.To(""), 3512 }, 3513 }, 3514 "single failed pod replaced already": { 
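// Pods here carry a per-index failure count (set via the builder's indexFailureCount
// helper, presumably backed by the job's index failure-count annotation). Because a
// replacement pod with a higher count already exists for index 0, the earlier failed pod
// is counted right away (Failed: 1) instead of having its finalizer removal delayed, in
// contrast to the previous case.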
3515 enableJobBackoffLimitPerIndex: true, 3516 job: batch.Job{ 3517 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3518 ObjectMeta: validObjectMeta, 3519 Spec: batch.JobSpec{ 3520 Selector: validSelector, 3521 Template: validTemplate, 3522 Parallelism: ptr.To[int32](2), 3523 Completions: ptr.To[int32](2), 3524 BackoffLimit: ptr.To[int32](math.MaxInt32), 3525 CompletionMode: completionModePtr(batch.IndexedCompletion), 3526 BackoffLimitPerIndex: ptr.To[int32](1), 3527 }, 3528 }, 3529 pods: []v1.Pod{ 3530 *buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().Pod, 3531 *buildPod().uid("b").index("0").phase(v1.PodPending).indexFailureCount("1").trackingFinalizer().Pod, 3532 }, 3533 wantStatus: batch.JobStatus{ 3534 Active: 2, 3535 Failed: 1, 3536 Terminating: ptr.To[int32](0), 3537 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 3538 FailedIndexes: ptr.To(""), 3539 }, 3540 }, 3541 "single failed index due to exceeding the backoff limit per index, the job continues": { 3542 enableJobBackoffLimitPerIndex: true, 3543 job: batch.Job{ 3544 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3545 ObjectMeta: validObjectMeta, 3546 Spec: batch.JobSpec{ 3547 Selector: validSelector, 3548 Template: validTemplate, 3549 Parallelism: ptr.To[int32](2), 3550 Completions: ptr.To[int32](2), 3551 BackoffLimit: ptr.To[int32](math.MaxInt32), 3552 CompletionMode: completionModePtr(batch.IndexedCompletion), 3553 BackoffLimitPerIndex: ptr.To[int32](1), 3554 }, 3555 }, 3556 pods: []v1.Pod{ 3557 *buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("1").trackingFinalizer().Pod, 3558 }, 3559 wantStatus: batch.JobStatus{ 3560 Active: 1, 3561 Failed: 1, 3562 FailedIndexes: ptr.To("0"), 3563 Terminating: ptr.To[int32](0), 3564 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 3565 }, 3566 }, 3567 "single failed index due to FailIndex action, the job continues": { 3568 enableJobBackoffLimitPerIndex: true, 3569 enableJobPodFailurePolicy: true, 3570 job: batch.Job{ 3571 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3572 ObjectMeta: validObjectMeta, 3573 Spec: batch.JobSpec{ 3574 Selector: validSelector, 3575 Template: validTemplate, 3576 Parallelism: ptr.To[int32](2), 3577 Completions: ptr.To[int32](2), 3578 BackoffLimit: ptr.To[int32](math.MaxInt32), 3579 CompletionMode: completionModePtr(batch.IndexedCompletion), 3580 BackoffLimitPerIndex: ptr.To[int32](1), 3581 PodFailurePolicy: &batch.PodFailurePolicy{ 3582 Rules: []batch.PodFailurePolicyRule{ 3583 { 3584 Action: batch.PodFailurePolicyActionFailIndex, 3585 OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ 3586 Operator: batch.PodFailurePolicyOnExitCodesOpIn, 3587 Values: []int32{3}, 3588 }, 3589 }, 3590 }, 3591 }, 3592 }, 3593 }, 3594 pods: []v1.Pod{ 3595 *buildPod().uid("a").index("0").status(v1.PodStatus{ 3596 Phase: v1.PodFailed, 3597 ContainerStatuses: []v1.ContainerStatus{ 3598 { 3599 State: v1.ContainerState{ 3600 Terminated: &v1.ContainerStateTerminated{ 3601 ExitCode: 3, 3602 }, 3603 }, 3604 }, 3605 }, 3606 }).indexFailureCount("0").trackingFinalizer().Pod, 3607 }, 3608 wantStatus: batch.JobStatus{ 3609 Active: 1, 3610 Failed: 1, 3611 FailedIndexes: ptr.To("0"), 3612 Terminating: ptr.To[int32](0), 3613 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 3614 }, 3615 }, 3616 "job failed index due to FailJob action": { 3617 enableJobBackoffLimitPerIndex: true, 3618 enableJobPodFailurePolicy: true, 3619 job: batch.Job{ 3620 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3621 ObjectMeta: validObjectMeta, 
3622 Spec: batch.JobSpec{ 3623 Selector: validSelector, 3624 Template: validTemplate, 3625 Parallelism: ptr.To[int32](2), 3626 Completions: ptr.To[int32](2), 3627 BackoffLimit: ptr.To[int32](6), 3628 CompletionMode: completionModePtr(batch.IndexedCompletion), 3629 BackoffLimitPerIndex: ptr.To[int32](1), 3630 PodFailurePolicy: &batch.PodFailurePolicy{ 3631 Rules: []batch.PodFailurePolicyRule{ 3632 { 3633 Action: batch.PodFailurePolicyActionFailJob, 3634 OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ 3635 Operator: batch.PodFailurePolicyOnExitCodesOpIn, 3636 Values: []int32{3}, 3637 }, 3638 }, 3639 }, 3640 }, 3641 }, 3642 }, 3643 pods: []v1.Pod{ 3644 *buildPod().uid("a").index("0").status(v1.PodStatus{ 3645 Phase: v1.PodFailed, 3646 ContainerStatuses: []v1.ContainerStatus{ 3647 { 3648 Name: "x", 3649 State: v1.ContainerState{ 3650 Terminated: &v1.ContainerStateTerminated{ 3651 ExitCode: 3, 3652 }, 3653 }, 3654 }, 3655 }, 3656 }).indexFailureCount("0").trackingFinalizer().Pod, 3657 }, 3658 wantStatus: batch.JobStatus{ 3659 Active: 0, 3660 Failed: 1, 3661 FailedIndexes: ptr.To(""), 3662 Terminating: ptr.To[int32](0), 3663 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 3664 Conditions: []batch.JobCondition{ 3665 { 3666 Type: batch.JobFailureTarget, 3667 Status: v1.ConditionTrue, 3668 Reason: batch.JobReasonPodFailurePolicy, 3669 Message: "Container x for pod default/mypod-0 failed with exit code 3 matching FailJob rule at index 0", 3670 }, 3671 { 3672 Type: batch.JobFailed, 3673 Status: v1.ConditionTrue, 3674 Reason: batch.JobReasonPodFailurePolicy, 3675 Message: "Container x for pod default/mypod-0 failed with exit code 3 matching FailJob rule at index 0", 3676 }, 3677 }, 3678 }, 3679 }, 3680 "job pod failure ignored due to matching Ignore action": { 3681 enableJobBackoffLimitPerIndex: true, 3682 enableJobPodFailurePolicy: true, 3683 job: batch.Job{ 3684 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3685 ObjectMeta: validObjectMeta, 3686 Spec: batch.JobSpec{ 3687 Selector: validSelector, 3688 Template: validTemplate, 3689 Parallelism: ptr.To[int32](2), 3690 Completions: ptr.To[int32](2), 3691 BackoffLimit: ptr.To[int32](6), 3692 CompletionMode: completionModePtr(batch.IndexedCompletion), 3693 BackoffLimitPerIndex: ptr.To[int32](1), 3694 PodFailurePolicy: &batch.PodFailurePolicy{ 3695 Rules: []batch.PodFailurePolicyRule{ 3696 { 3697 Action: batch.PodFailurePolicyActionIgnore, 3698 OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ 3699 Operator: batch.PodFailurePolicyOnExitCodesOpIn, 3700 Values: []int32{3}, 3701 }, 3702 }, 3703 }, 3704 }, 3705 }, 3706 }, 3707 pods: []v1.Pod{ 3708 *buildPod().uid("a").index("0").status(v1.PodStatus{ 3709 Phase: v1.PodFailed, 3710 ContainerStatuses: []v1.ContainerStatus{ 3711 { 3712 Name: "x", 3713 State: v1.ContainerState{ 3714 Terminated: &v1.ContainerStateTerminated{ 3715 ExitCode: 3, 3716 }, 3717 }, 3718 }, 3719 }, 3720 }).indexFailureCount("0").trackingFinalizer().Pod, 3721 }, 3722 wantStatus: batch.JobStatus{ 3723 Active: 2, 3724 Failed: 0, 3725 FailedIndexes: ptr.To(""), 3726 Terminating: ptr.To[int32](0), 3727 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 3728 }, 3729 }, 3730 "job failed due to exceeding backoffLimit before backoffLimitPerIndex": { 3731 enableJobBackoffLimitPerIndex: true, 3732 job: batch.Job{ 3733 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3734 ObjectMeta: validObjectMeta, 3735 Spec: batch.JobSpec{ 3736 Selector: validSelector, 3737 Template: validTemplate, 3738 Parallelism: ptr.To[int32](2), 3739 
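// Note: spec.backoffLimit just below is 1 while backoffLimitPerIndex is also 1, so the
// two pod failures across different indexes trip the global backoff limit first; the
// expected outcome is a BackoffLimitExceeded condition rather than failed indexes.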
Completions: ptr.To[int32](2), 3740 BackoffLimit: ptr.To[int32](1), 3741 CompletionMode: completionModePtr(batch.IndexedCompletion), 3742 BackoffLimitPerIndex: ptr.To[int32](1), 3743 }, 3744 }, 3745 pods: []v1.Pod{ 3746 *buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().Pod, 3747 *buildPod().uid("b").index("1").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().Pod, 3748 }, 3749 wantStatus: batch.JobStatus{ 3750 Failed: 2, 3751 Succeeded: 0, 3752 FailedIndexes: ptr.To(""), 3753 Terminating: ptr.To[int32](0), 3754 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 3755 Conditions: []batch.JobCondition{ 3756 { 3757 Type: batch.JobFailed, 3758 Status: v1.ConditionTrue, 3759 Reason: batch.JobReasonBackoffLimitExceeded, 3760 Message: "Job has reached the specified backoff limit", 3761 }, 3762 }, 3763 }, 3764 }, 3765 "job failed due to failed indexes": { 3766 enableJobBackoffLimitPerIndex: true, 3767 job: batch.Job{ 3768 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3769 ObjectMeta: validObjectMeta, 3770 Spec: batch.JobSpec{ 3771 Selector: validSelector, 3772 Template: validTemplate, 3773 Parallelism: ptr.To[int32](2), 3774 Completions: ptr.To[int32](2), 3775 BackoffLimit: ptr.To[int32](math.MaxInt32), 3776 CompletionMode: completionModePtr(batch.IndexedCompletion), 3777 BackoffLimitPerIndex: ptr.To[int32](1), 3778 }, 3779 }, 3780 pods: []v1.Pod{ 3781 *buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("1").trackingFinalizer().Pod, 3782 *buildPod().uid("b").index("1").phase(v1.PodSucceeded).indexFailureCount("0").trackingFinalizer().Pod, 3783 }, 3784 wantStatus: batch.JobStatus{ 3785 Failed: 1, 3786 Succeeded: 1, 3787 Terminating: ptr.To[int32](0), 3788 FailedIndexes: ptr.To("0"), 3789 CompletedIndexes: "1", 3790 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 3791 Conditions: []batch.JobCondition{ 3792 { 3793 Type: batch.JobFailed, 3794 Status: v1.ConditionTrue, 3795 Reason: batch.JobReasonFailedIndexes, 3796 Message: "Job has failed indexes", 3797 }, 3798 }, 3799 }, 3800 }, 3801 "job failed due to exceeding max failed indexes": { 3802 enableJobBackoffLimitPerIndex: true, 3803 job: batch.Job{ 3804 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3805 ObjectMeta: validObjectMeta, 3806 Spec: batch.JobSpec{ 3807 Selector: validSelector, 3808 Template: validTemplate, 3809 Parallelism: ptr.To[int32](4), 3810 Completions: ptr.To[int32](4), 3811 BackoffLimit: ptr.To[int32](math.MaxInt32), 3812 CompletionMode: completionModePtr(batch.IndexedCompletion), 3813 BackoffLimitPerIndex: ptr.To[int32](1), 3814 MaxFailedIndexes: ptr.To[int32](1), 3815 }, 3816 }, 3817 pods: []v1.Pod{ 3818 *buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("1").trackingFinalizer().Pod, 3819 *buildPod().uid("b").index("1").phase(v1.PodSucceeded).indexFailureCount("0").trackingFinalizer().Pod, 3820 *buildPod().uid("c").index("2").phase(v1.PodFailed).indexFailureCount("1").trackingFinalizer().Pod, 3821 *buildPod().uid("d").index("3").phase(v1.PodRunning).indexFailureCount("0").trackingFinalizer().Pod, 3822 }, 3823 wantStatus: batch.JobStatus{ 3824 Failed: 3, 3825 Succeeded: 1, 3826 Terminating: ptr.To[int32](0), 3827 FailedIndexes: ptr.To("0,2"), 3828 CompletedIndexes: "1", 3829 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 3830 Conditions: []batch.JobCondition{ 3831 { 3832 Type: batch.JobFailed, 3833 Status: v1.ConditionTrue, 3834 Reason: batch.JobReasonMaxFailedIndexesExceeded, 3835 Message: "Job has exceeded the specified 
maximal number of failed indexes", 3836 }, 3837 }, 3838 }, 3839 }, 3840 "job with finished indexes; failedIndexes are cleaned when JobBackoffLimitPerIndex disabled": { 3841 enableJobBackoffLimitPerIndex: false, 3842 job: batch.Job{ 3843 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3844 ObjectMeta: validObjectMeta, 3845 Spec: batch.JobSpec{ 3846 Selector: validSelector, 3847 Template: validTemplate, 3848 Parallelism: ptr.To[int32](3), 3849 Completions: ptr.To[int32](3), 3850 BackoffLimit: ptr.To[int32](math.MaxInt32), 3851 CompletionMode: completionModePtr(batch.IndexedCompletion), 3852 BackoffLimitPerIndex: ptr.To[int32](1), 3853 }, 3854 Status: batch.JobStatus{ 3855 FailedIndexes: ptr.To("0"), 3856 CompletedIndexes: "1", 3857 }, 3858 }, 3859 pods: []v1.Pod{ 3860 *buildPod().uid("c").index("2").phase(v1.PodPending).indexFailureCount("1").trackingFinalizer().Pod, 3861 }, 3862 wantStatus: batch.JobStatus{ 3863 Active: 2, 3864 Succeeded: 1, 3865 Terminating: ptr.To[int32](0), 3866 CompletedIndexes: "1", 3867 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 3868 }, 3869 }, 3870 } 3871 for name, tc := range testCases { 3872 t.Run(name, func(t *testing.T) { 3873 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, tc.enableJobBackoffLimitPerIndex)() 3874 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, tc.enableJobPodFailurePolicy)() 3875 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 3876 fakeClock := clocktesting.NewFakeClock(now) 3877 manager, sharedInformerFactory := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClock) 3878 fakePodControl := controller.FakePodControl{} 3879 manager.podControl = &fakePodControl 3880 manager.podStoreSynced = alwaysReady 3881 manager.jobStoreSynced = alwaysReady 3882 job := &tc.job 3883 3884 actual := job 3885 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 3886 actual = job 3887 return job, nil 3888 } 3889 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 3890 for i, pod := range tc.pods { 3891 pod := pod 3892 pb := podBuilder{Pod: &pod}.name(fmt.Sprintf("mypod-%d", i)).job(job) 3893 if job.Spec.CompletionMode != nil && *job.Spec.CompletionMode == batch.IndexedCompletion { 3894 pb.index(fmt.Sprintf("%v", getCompletionIndex(pod.Annotations))) 3895 } 3896 pb = pb.trackingFinalizer() 3897 sharedInformerFactory.Core().V1().Pods().Informer().GetIndexer().Add(pb.Pod) 3898 } 3899 3900 manager.syncJob(context.TODO(), testutil.GetKey(job, t)) 3901 3902 // validate relevant fields of the status 3903 if diff := cmp.Diff(tc.wantStatus, actual.Status, 3904 cmpopts.IgnoreFields(batch.JobStatus{}, "StartTime", "CompletionTime", "Ready"), 3905 cmpopts.IgnoreFields(batch.JobCondition{}, "LastProbeTime", "LastTransitionTime")); diff != "" { 3906 t.Errorf("unexpected job status. 
Diff: %s\n", diff) 3907 } 3908 }) 3909 } 3910 } 3911 3912 func TestSyncJobUpdateRequeue(t *testing.T) { 3913 _, ctx := ktesting.NewTestContext(t) 3914 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 3915 cases := map[string]struct { 3916 updateErr error 3917 wantRequeued bool 3918 }{ 3919 "no error": { 3920 wantRequeued: false, 3921 }, 3922 "generic error": { 3923 updateErr: fmt.Errorf("update error"), 3924 wantRequeued: true, 3925 }, 3926 "conflict error": { 3927 updateErr: apierrors.NewConflict(schema.GroupResource{}, "", nil), 3928 wantRequeued: true, 3929 }, 3930 } 3931 for name, tc := range cases { 3932 t.Run(name, func(t *testing.T) { 3933 t.Cleanup(setDurationDuringTest(&DefaultJobApiBackOff, fastJobApiBackoff)) 3934 fakeClient := clocktesting.NewFakeClock(time.Now()) 3935 manager, sharedInformerFactory := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClient) 3936 fakePodControl := controller.FakePodControl{} 3937 manager.podControl = &fakePodControl 3938 manager.podStoreSynced = alwaysReady 3939 manager.jobStoreSynced = alwaysReady 3940 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 3941 return job, tc.updateErr 3942 } 3943 job := newJob(2, 2, 6, batch.NonIndexedCompletion) 3944 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 3945 manager.queue.Add(testutil.GetKey(job, t)) 3946 manager.processNextWorkItem(context.TODO()) 3947 if tc.wantRequeued { 3948 verifyEmptyQueueAndAwaitForQueueLen(ctx, t, manager, 1) 3949 } else { 3950 // We advance the clock to make sure there are not items awaiting 3951 // to be added into the queue. We also sleep a little to give the 3952 // delaying queue time to move the potential items from pre-queue 3953 // into the queue asynchronously. 
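// manager.clock is the fake clock injected above, so Sleep here advances the
// fake time past the (shortened) DefaultJobApiBackOff instead of blocking;
// only the short real sleep below actually waits for the background goroutine.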
3954 manager.clock.Sleep(fastJobApiBackoff) 3955 time.Sleep(time.Millisecond) 3956 verifyEmptyQueue(ctx, t, manager) 3957 } 3958 }) 3959 } 3960 } 3961 3962 func TestUpdateJobRequeue(t *testing.T) { 3963 logger, ctx := ktesting.NewTestContext(t) 3964 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 3965 cases := map[string]struct { 3966 oldJob *batch.Job 3967 updateFn func(job *batch.Job) 3968 wantRequeuedImmediately bool 3969 }{ 3970 "spec update": { 3971 oldJob: newJob(1, 1, 1, batch.IndexedCompletion), 3972 updateFn: func(job *batch.Job) { 3973 job.Spec.Suspend = ptr.To(false) 3974 job.Generation++ 3975 }, 3976 wantRequeuedImmediately: true, 3977 }, 3978 "status update": { 3979 oldJob: newJob(1, 1, 1, batch.IndexedCompletion), 3980 updateFn: func(job *batch.Job) { 3981 job.Status.StartTime = &metav1.Time{Time: time.Now()} 3982 }, 3983 wantRequeuedImmediately: false, 3984 }, 3985 } 3986 for name, tc := range cases { 3987 t.Run(name, func(t *testing.T) { 3988 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 3989 manager.podStoreSynced = alwaysReady 3990 manager.jobStoreSynced = alwaysReady 3991 3992 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(tc.oldJob) 3993 newJob := tc.oldJob.DeepCopy() 3994 if tc.updateFn != nil { 3995 tc.updateFn(newJob) 3996 } 3997 manager.updateJob(logger, tc.oldJob, newJob) 3998 gotRequeuedImmediately := manager.queue.Len() > 0 3999 if tc.wantRequeuedImmediately != gotRequeuedImmediately { 4000 t.Fatalf("Want immediate requeue: %v, got immediate requeue: %v", tc.wantRequeuedImmediately, gotRequeuedImmediately) 4001 } 4002 }) 4003 } 4004 } 4005 4006 func TestGetPodCreationInfoForIndependentIndexes(t *testing.T) { 4007 logger, ctx := ktesting.NewTestContext(t) 4008 now := time.Now() 4009 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4010 cases := map[string]struct { 4011 indexesToAdd []int 4012 podsWithDelayedDeletionPerIndex map[int]*v1.Pod 4013 wantIndexesToAdd []int 4014 wantRemainingTime time.Duration 4015 }{ 4016 "simple index creation": { 4017 indexesToAdd: []int{1, 3}, 4018 wantIndexesToAdd: []int{1, 3}, 4019 }, 4020 "subset of indexes can be recreated now": { 4021 indexesToAdd: []int{1, 3}, 4022 podsWithDelayedDeletionPerIndex: map[int]*v1.Pod{ 4023 1: buildPod().indexFailureCount("0").index("1").customDeletionTimestamp(now).Pod, 4024 }, 4025 wantIndexesToAdd: []int{3}, 4026 }, 4027 "subset of indexes can be recreated now as the pods failed long time ago": { 4028 indexesToAdd: []int{1, 3}, 4029 podsWithDelayedDeletionPerIndex: map[int]*v1.Pod{ 4030 1: buildPod().indexFailureCount("0").customDeletionTimestamp(now).Pod, 4031 3: buildPod().indexFailureCount("0").customDeletionTimestamp(now.Add(-DefaultJobPodFailureBackOff)).Pod, 4032 }, 4033 wantIndexesToAdd: []int{3}, 4034 }, 4035 "no indexes can be recreated now, need to wait default pod failure backoff": { 4036 indexesToAdd: []int{1, 2, 3}, 4037 podsWithDelayedDeletionPerIndex: map[int]*v1.Pod{ 4038 1: buildPod().indexFailureCount("1").customDeletionTimestamp(now).Pod, 4039 2: buildPod().indexFailureCount("0").customDeletionTimestamp(now).Pod, 4040 3: buildPod().indexFailureCount("2").customDeletionTimestamp(now).Pod, 4041 }, 4042 wantRemainingTime: DefaultJobPodFailureBackOff, 
4043 }, 4044 "no indexes can be recreated now, need to wait but 1s already passed": { 4045 indexesToAdd: []int{1, 2, 3}, 4046 podsWithDelayedDeletionPerIndex: map[int]*v1.Pod{ 4047 1: buildPod().indexFailureCount("1").customDeletionTimestamp(now.Add(-time.Second)).Pod, 4048 2: buildPod().indexFailureCount("0").customDeletionTimestamp(now.Add(-time.Second)).Pod, 4049 3: buildPod().indexFailureCount("2").customDeletionTimestamp(now.Add(-time.Second)).Pod, 4050 }, 4051 wantRemainingTime: DefaultJobPodFailureBackOff - time.Second, 4052 }, 4053 } 4054 for name, tc := range cases { 4055 t.Run(name, func(t *testing.T) { 4056 fakeClock := clocktesting.NewFakeClock(now) 4057 manager, _ := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClock) 4058 gotIndexesToAdd, gotRemainingTime := manager.getPodCreationInfoForIndependentIndexes(logger, tc.indexesToAdd, tc.podsWithDelayedDeletionPerIndex) 4059 if diff := cmp.Diff(tc.wantIndexesToAdd, gotIndexesToAdd); diff != "" { 4060 t.Fatalf("Unexpected indexes to add: %s", diff) 4061 } 4062 if diff := cmp.Diff(tc.wantRemainingTime, gotRemainingTime); diff != "" { 4063 t.Fatalf("Unexpected remaining time: %s", diff) 4064 } 4065 }) 4066 } 4067 } 4068 4069 func TestJobPodLookup(t *testing.T) { 4070 _, ctx := ktesting.NewTestContext(t) 4071 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4072 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 4073 manager.podStoreSynced = alwaysReady 4074 manager.jobStoreSynced = alwaysReady 4075 testCases := []struct { 4076 job *batch.Job 4077 pod *v1.Pod 4078 4079 expectedName string 4080 }{ 4081 // pods without labels don't match any job 4082 { 4083 job: &batch.Job{ 4084 ObjectMeta: metav1.ObjectMeta{Name: "basic"}, 4085 }, 4086 pod: &v1.Pod{ 4087 ObjectMeta: metav1.ObjectMeta{Name: "foo1", Namespace: metav1.NamespaceAll}, 4088 }, 4089 expectedName: "", 4090 }, 4091 // matching labels, different namespace 4092 { 4093 job: &batch.Job{ 4094 ObjectMeta: metav1.ObjectMeta{Name: "foo"}, 4095 Spec: batch.JobSpec{ 4096 Selector: &metav1.LabelSelector{ 4097 MatchLabels: map[string]string{"foo": "bar"}, 4098 }, 4099 }, 4100 }, 4101 pod: &v1.Pod{ 4102 ObjectMeta: metav1.ObjectMeta{ 4103 Name: "foo2", 4104 Namespace: "ns", 4105 Labels: map[string]string{"foo": "bar"}, 4106 }, 4107 }, 4108 expectedName: "", 4109 }, 4110 // matching ns and labels returns 4111 { 4112 job: &batch.Job{ 4113 ObjectMeta: metav1.ObjectMeta{Name: "bar", Namespace: "ns"}, 4114 Spec: batch.JobSpec{ 4115 Selector: &metav1.LabelSelector{ 4116 MatchExpressions: []metav1.LabelSelectorRequirement{ 4117 { 4118 Key: "foo", 4119 Operator: metav1.LabelSelectorOpIn, 4120 Values: []string{"bar"}, 4121 }, 4122 }, 4123 }, 4124 }, 4125 }, 4126 pod: &v1.Pod{ 4127 ObjectMeta: metav1.ObjectMeta{ 4128 Name: "foo3", 4129 Namespace: "ns", 4130 Labels: map[string]string{"foo": "bar"}, 4131 }, 4132 }, 4133 expectedName: "bar", 4134 }, 4135 } 4136 for _, tc := range testCases { 4137 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(tc.job) 4138 if jobs := manager.getPodJobs(tc.pod); len(jobs) > 0 { 4139 if got, want := len(jobs), 1; got != want { 4140 t.Errorf("len(jobs) = %v, want %v", got, want) 4141 } 4142 job := jobs[0] 4143 if tc.expectedName != job.Name { 4144 t.Errorf("Got job %+v expected %+v", job.Name, tc.expectedName) 4145 } 4146 } else if 
tc.expectedName != "" { 4147 t.Errorf("Expected a job %v pod %v, found none", tc.expectedName, tc.pod.Name) 4148 } 4149 } 4150 } 4151 4152 func TestGetPodsForJob(t *testing.T) { 4153 _, ctx := ktesting.NewTestContext(t) 4154 job := newJob(1, 1, 6, batch.NonIndexedCompletion) 4155 job.Name = "test_job" 4156 otherJob := newJob(1, 1, 6, batch.NonIndexedCompletion) 4157 otherJob.Name = "other_job" 4158 cases := map[string]struct { 4159 jobDeleted bool 4160 jobDeletedInCache bool 4161 pods []*v1.Pod 4162 wantPods []string 4163 wantPodsFinalizer []string 4164 }{ 4165 "only matching": { 4166 pods: []*v1.Pod{ 4167 buildPod().name("pod1").job(job).trackingFinalizer().Pod, 4168 buildPod().name("pod2").job(otherJob).Pod, 4169 buildPod().name("pod3").ns(job.Namespace).Pod, 4170 buildPod().name("pod4").job(job).Pod, 4171 }, 4172 wantPods: []string{"pod1", "pod4"}, 4173 wantPodsFinalizer: []string{"pod1"}, 4174 }, 4175 "adopt": { 4176 pods: []*v1.Pod{ 4177 buildPod().name("pod1").job(job).Pod, 4178 buildPod().name("pod2").job(job).clearOwner().Pod, 4179 buildPod().name("pod3").job(otherJob).Pod, 4180 }, 4181 wantPods: []string{"pod1", "pod2"}, 4182 wantPodsFinalizer: []string{"pod2"}, 4183 }, 4184 "no adopt when deleting": { 4185 jobDeleted: true, 4186 jobDeletedInCache: true, 4187 pods: []*v1.Pod{ 4188 buildPod().name("pod1").job(job).Pod, 4189 buildPod().name("pod2").job(job).clearOwner().Pod, 4190 }, 4191 wantPods: []string{"pod1"}, 4192 }, 4193 "no adopt when deleting race": { 4194 jobDeleted: true, 4195 pods: []*v1.Pod{ 4196 buildPod().name("pod1").job(job).Pod, 4197 buildPod().name("pod2").job(job).clearOwner().Pod, 4198 }, 4199 wantPods: []string{"pod1"}, 4200 }, 4201 "release": { 4202 pods: []*v1.Pod{ 4203 buildPod().name("pod1").job(job).Pod, 4204 buildPod().name("pod2").job(job).clearLabels().Pod, 4205 }, 4206 wantPods: []string{"pod1"}, 4207 }, 4208 } 4209 for name, tc := range cases { 4210 t.Run(name, func(t *testing.T) { 4211 job := job.DeepCopy() 4212 if tc.jobDeleted { 4213 job.DeletionTimestamp = &metav1.Time{} 4214 } 4215 clientSet := fake.NewSimpleClientset(job, otherJob) 4216 jm, informer := newControllerFromClient(ctx, t, clientSet, controller.NoResyncPeriodFunc) 4217 jm.podStoreSynced = alwaysReady 4218 jm.jobStoreSynced = alwaysReady 4219 cachedJob := job.DeepCopy() 4220 if tc.jobDeletedInCache { 4221 cachedJob.DeletionTimestamp = &metav1.Time{} 4222 } 4223 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(cachedJob) 4224 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(otherJob) 4225 for _, p := range tc.pods { 4226 informer.Core().V1().Pods().Informer().GetIndexer().Add(p) 4227 } 4228 4229 pods, err := jm.getPodsForJob(context.TODO(), job) 4230 if err != nil { 4231 t.Fatalf("getPodsForJob() error: %v", err) 4232 } 4233 got := make([]string, len(pods)) 4234 var gotFinalizer []string 4235 for i, p := range pods { 4236 got[i] = p.Name 4237 if hasJobTrackingFinalizer(p) { 4238 gotFinalizer = append(gotFinalizer, p.Name) 4239 } 4240 } 4241 sort.Strings(got) 4242 if diff := cmp.Diff(tc.wantPods, got); diff != "" { 4243 t.Errorf("getPodsForJob() returned (-want,+got):\n%s", diff) 4244 } 4245 sort.Strings(gotFinalizer) 4246 if diff := cmp.Diff(tc.wantPodsFinalizer, gotFinalizer); diff != "" { 4247 t.Errorf("Pods with finalizers (-want,+got):\n%s", diff) 4248 } 4249 }) 4250 } 4251 } 4252 4253 func TestAddPod(t *testing.T) { 4254 t.Cleanup(setDurationDuringTest(&syncJobBatchPeriod, fastSyncJobBatchPeriod)) 4255 _, ctx := ktesting.NewTestContext(t) 4256 logger := 
klog.FromContext(ctx) 4257 4258 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4259 fakeClock := clocktesting.NewFakeClock(time.Now()) 4260 jm, informer := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClock) 4261 jm.podStoreSynced = alwaysReady 4262 jm.jobStoreSynced = alwaysReady 4263 4264 job1 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4265 job1.Name = "job1" 4266 job2 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4267 job2.Name = "job2" 4268 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job1) 4269 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job2) 4270 4271 pod1 := newPod("pod1", job1) 4272 pod2 := newPod("pod2", job2) 4273 informer.Core().V1().Pods().Informer().GetIndexer().Add(pod1) 4274 informer.Core().V1().Pods().Informer().GetIndexer().Add(pod2) 4275 4276 jm.addPod(logger, pod1) 4277 verifyEmptyQueueAndAwaitForQueueLen(ctx, t, jm, 1) 4278 key, done := jm.queue.Get() 4279 if key == nil || done { 4280 t.Fatalf("failed to enqueue controller for pod %v", pod1.Name) 4281 } 4282 expectedKey, _ := controller.KeyFunc(job1) 4283 if got, want := key.(string), expectedKey; got != want { 4284 t.Errorf("queue.Get() = %v, want %v", got, want) 4285 } 4286 4287 jm.addPod(logger, pod2) 4288 verifyEmptyQueueAndAwaitForQueueLen(ctx, t, jm, 1) 4289 key, done = jm.queue.Get() 4290 if key == nil || done { 4291 t.Fatalf("failed to enqueue controller for pod %v", pod2.Name) 4292 } 4293 expectedKey, _ = controller.KeyFunc(job2) 4294 if got, want := key.(string), expectedKey; got != want { 4295 t.Errorf("queue.Get() = %v, want %v", got, want) 4296 } 4297 } 4298 4299 func TestAddPodOrphan(t *testing.T) { 4300 t.Cleanup(setDurationDuringTest(&syncJobBatchPeriod, fastSyncJobBatchPeriod)) 4301 logger, ctx := ktesting.NewTestContext(t) 4302 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4303 fakeClock := clocktesting.NewFakeClock(time.Now()) 4304 jm, informer := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClock) 4305 jm.podStoreSynced = alwaysReady 4306 jm.jobStoreSynced = alwaysReady 4307 4308 job1 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4309 job1.Name = "job1" 4310 job2 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4311 job2.Name = "job2" 4312 job3 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4313 job3.Name = "job3" 4314 job3.Spec.Selector.MatchLabels = map[string]string{"other": "labels"} 4315 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job1) 4316 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job2) 4317 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job3) 4318 4319 pod1 := newPod("pod1", job1) 4320 // Make pod an orphan. Expect all matching controllers to be queued. 
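// job1 and job2 share the same pod selector, while job3 was given a different
// one above, so the orphaned pod matches exactly two Jobs and the expected
// queue length below is 2.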
4321 pod1.OwnerReferences = nil 4322 informer.Core().V1().Pods().Informer().GetIndexer().Add(pod1) 4323 4324 jm.addPod(logger, pod1) 4325 verifyEmptyQueueAndAwaitForQueueLen(ctx, t, jm, 2) 4326 } 4327 4328 func TestUpdatePod(t *testing.T) { 4329 t.Cleanup(setDurationDuringTest(&syncJobBatchPeriod, fastSyncJobBatchPeriod)) 4330 _, ctx := ktesting.NewTestContext(t) 4331 logger := klog.FromContext(ctx) 4332 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4333 fakeClock := clocktesting.NewFakeClock(time.Now()) 4334 jm, informer := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClock) 4335 jm.podStoreSynced = alwaysReady 4336 jm.jobStoreSynced = alwaysReady 4337 4338 job1 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4339 job1.Name = "job1" 4340 job2 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4341 job2.Name = "job2" 4342 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job1) 4343 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job2) 4344 4345 pod1 := newPod("pod1", job1) 4346 pod2 := newPod("pod2", job2) 4347 informer.Core().V1().Pods().Informer().GetIndexer().Add(pod1) 4348 informer.Core().V1().Pods().Informer().GetIndexer().Add(pod2) 4349 4350 prev := *pod1 4351 bumpResourceVersion(pod1) 4352 jm.updatePod(logger, &prev, pod1) 4353 verifyEmptyQueueAndAwaitForQueueLen(ctx, t, jm, 1) 4354 key, done := jm.queue.Get() 4355 if key == nil || done { 4356 t.Fatalf("failed to enqueue controller for pod %v", pod1.Name) 4357 } 4358 expectedKey, _ := controller.KeyFunc(job1) 4359 if got, want := key.(string), expectedKey; got != want { 4360 t.Errorf("queue.Get() = %v, want %v", got, want) 4361 } 4362 4363 prev = *pod2 4364 bumpResourceVersion(pod2) 4365 jm.updatePod(logger, &prev, pod2) 4366 verifyEmptyQueueAndAwaitForQueueLen(ctx, t, jm, 1) 4367 key, done = jm.queue.Get() 4368 if key == nil || done { 4369 t.Fatalf("failed to enqueue controller for pod %v", pod2.Name) 4370 } 4371 expectedKey, _ = controller.KeyFunc(job2) 4372 if got, want := key.(string), expectedKey; got != want { 4373 t.Errorf("queue.Get() = %v, want %v", got, want) 4374 } 4375 } 4376 4377 func TestUpdatePodOrphanWithNewLabels(t *testing.T) { 4378 t.Cleanup(setDurationDuringTest(&syncJobBatchPeriod, fastSyncJobBatchPeriod)) 4379 logger, ctx := ktesting.NewTestContext(t) 4380 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4381 fakeClock := clocktesting.NewFakeClock(time.Now()) 4382 jm, informer := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClock) 4383 jm.podStoreSynced = alwaysReady 4384 jm.jobStoreSynced = alwaysReady 4385 4386 job1 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4387 job1.Name = "job1" 4388 job2 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4389 job2.Name = "job2" 4390 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job1) 4391 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job2) 4392 4393 pod1 := newPod("pod1", job1) 4394 pod1.OwnerReferences = nil 4395 informer.Core().V1().Pods().Informer().GetIndexer().Add(pod1) 4396 4397 // Labels changed on orphan. Expect newly matching controllers to queue. 
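// The old pod carries labels that match no Job, while the current pod keeps
// the labels shared by job1 and job2, so updatePod enqueues both Jobs.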
4398 prev := *pod1 4399 prev.Labels = map[string]string{"foo2": "bar2"} 4400 bumpResourceVersion(pod1) 4401 jm.updatePod(logger, &prev, pod1) 4402 verifyEmptyQueueAndAwaitForQueueLen(ctx, t, jm, 2) 4403 } 4404 4405 func TestUpdatePodChangeControllerRef(t *testing.T) { 4406 t.Cleanup(setDurationDuringTest(&syncJobBatchPeriod, fastSyncJobBatchPeriod)) 4407 _, ctx := ktesting.NewTestContext(t) 4408 logger := klog.FromContext(ctx) 4409 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4410 fakeClock := clocktesting.NewFakeClock(time.Now()) 4411 jm, informer := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClock) 4412 jm.podStoreSynced = alwaysReady 4413 jm.jobStoreSynced = alwaysReady 4414 4415 job1 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4416 job1.Name = "job1" 4417 job2 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4418 job2.Name = "job2" 4419 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job1) 4420 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job2) 4421 4422 pod1 := newPod("pod1", job1) 4423 informer.Core().V1().Pods().Informer().GetIndexer().Add(pod1) 4424 4425 // Changed ControllerRef. Expect both old and new to queue. 4426 prev := *pod1 4427 prev.OwnerReferences = []metav1.OwnerReference{*metav1.NewControllerRef(job2, controllerKind)} 4428 bumpResourceVersion(pod1) 4429 jm.updatePod(logger, &prev, pod1) 4430 verifyEmptyQueueAndAwaitForQueueLen(ctx, t, jm, 2) 4431 } 4432 4433 func TestUpdatePodRelease(t *testing.T) { 4434 t.Cleanup(setDurationDuringTest(&syncJobBatchPeriod, fastSyncJobBatchPeriod)) 4435 _, ctx := ktesting.NewTestContext(t) 4436 logger := klog.FromContext(ctx) 4437 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4438 fakeClock := clocktesting.NewFakeClock(time.Now()) 4439 jm, informer := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClock) 4440 jm.podStoreSynced = alwaysReady 4441 jm.jobStoreSynced = alwaysReady 4442 4443 job1 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4444 job1.Name = "job1" 4445 job2 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4446 job2.Name = "job2" 4447 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job1) 4448 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job2) 4449 4450 pod1 := newPod("pod1", job1) 4451 informer.Core().V1().Pods().Informer().GetIndexer().Add(pod1) 4452 4453 // Remove ControllerRef. Expect all matching to queue for adoption. 
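// Only the current pod drops its owner, making it an orphan eligible for
// adoption; both job1 and job2 match its labels, hence the expected queue
// length of 2.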
4454 prev := *pod1 4455 pod1.OwnerReferences = nil 4456 bumpResourceVersion(pod1) 4457 jm.updatePod(logger, &prev, pod1) 4458 verifyEmptyQueueAndAwaitForQueueLen(ctx, t, jm, 2) 4459 } 4460 4461 func TestDeletePod(t *testing.T) { 4462 t.Cleanup(setDurationDuringTest(&syncJobBatchPeriod, fastSyncJobBatchPeriod)) 4463 logger, ctx := ktesting.NewTestContext(t) 4464 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4465 fakeClock := clocktesting.NewFakeClock(time.Now()) 4466 jm, informer := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClock) 4467 jm.podStoreSynced = alwaysReady 4468 jm.jobStoreSynced = alwaysReady 4469 4470 job1 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4471 job1.Name = "job1" 4472 job2 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4473 job2.Name = "job2" 4474 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job1) 4475 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job2) 4476 4477 pod1 := newPod("pod1", job1) 4478 pod2 := newPod("pod2", job2) 4479 informer.Core().V1().Pods().Informer().GetIndexer().Add(pod1) 4480 informer.Core().V1().Pods().Informer().GetIndexer().Add(pod2) 4481 4482 jm.deletePod(logger, pod1, true) 4483 verifyEmptyQueueAndAwaitForQueueLen(ctx, t, jm, 1) 4484 key, done := jm.queue.Get() 4485 if key == nil || done { 4486 t.Fatalf("failed to enqueue controller for pod %v", pod1.Name) 4487 } 4488 expectedKey, _ := controller.KeyFunc(job1) 4489 if got, want := key.(string), expectedKey; got != want { 4490 t.Errorf("queue.Get() = %v, want %v", got, want) 4491 } 4492 4493 jm.deletePod(logger, pod2, true) 4494 verifyEmptyQueueAndAwaitForQueueLen(ctx, t, jm, 1) 4495 key, done = jm.queue.Get() 4496 if key == nil || done { 4497 t.Fatalf("failed to enqueue controller for pod %v", pod2.Name) 4498 } 4499 expectedKey, _ = controller.KeyFunc(job2) 4500 if got, want := key.(string), expectedKey; got != want { 4501 t.Errorf("queue.Get() = %v, want %v", got, want) 4502 } 4503 } 4504 4505 func TestDeletePodOrphan(t *testing.T) { 4506 // Disable batching of pod updates to show it does not get requeued at all 4507 t.Cleanup(setDurationDuringTest(&syncJobBatchPeriod, 0)) 4508 logger, ctx := ktesting.NewTestContext(t) 4509 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4510 jm, informer := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 4511 jm.podStoreSynced = alwaysReady 4512 jm.jobStoreSynced = alwaysReady 4513 4514 job1 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4515 job1.Name = "job1" 4516 job2 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4517 job2.Name = "job2" 4518 job3 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4519 job3.Name = "job3" 4520 job3.Spec.Selector.MatchLabels = map[string]string{"other": "labels"} 4521 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job1) 4522 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job2) 4523 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job3) 4524 4525 pod1 := newPod("pod1", job1) 4526 pod1.OwnerReferences = nil 4527 informer.Core().V1().Pods().Informer().GetIndexer().Add(pod1) 4528 4529 jm.deletePod(logger, pod1, true) 4530 if got, want := jm.queue.Len(), 0; got != want { 4531 t.Fatalf("queue.Len() = %v, want %v", got, want) 4532 } 4533 } 4534 4535 type 
FakeJobExpectations struct { 4536 *controller.ControllerExpectations 4537 satisfied bool 4538 expSatisfied func() 4539 } 4540 4541 func (fe FakeJobExpectations) SatisfiedExpectations(logger klog.Logger, controllerKey string) bool { 4542 fe.expSatisfied() 4543 return fe.satisfied 4544 } 4545 4546 // TestSyncJobExpectations tests that a pod cannot sneak in between counting active pods 4547 // and checking expectations. 4548 func TestSyncJobExpectations(t *testing.T) { 4549 _, ctx := ktesting.NewTestContext(t) 4550 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4551 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 4552 fakePodControl := controller.FakePodControl{} 4553 manager.podControl = &fakePodControl 4554 manager.podStoreSynced = alwaysReady 4555 manager.jobStoreSynced = alwaysReady 4556 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 4557 return job, nil 4558 } 4559 4560 job := newJob(2, 2, 6, batch.NonIndexedCompletion) 4561 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 4562 pods := newPodList(2, v1.PodPending, job) 4563 podIndexer := sharedInformerFactory.Core().V1().Pods().Informer().GetIndexer() 4564 podIndexer.Add(pods[0]) 4565 4566 manager.expectations = FakeJobExpectations{ 4567 controller.NewControllerExpectations(), true, func() { 4568 // If we check active pods before checking expectations, the job 4569 // will create a new replica because it doesn't see this pod, but 4570 // has fulfilled its expectations. 4571 podIndexer.Add(pods[1]) 4572 }, 4573 } 4574 manager.syncJob(context.TODO(), testutil.GetKey(job, t)) 4575 if len(fakePodControl.Templates) != 0 { 4576 t.Errorf("Unexpected number of creates. Expected %d, saw %d\n", 0, len(fakePodControl.Templates)) 4577 } 4578 if len(fakePodControl.DeletePodName) != 0 { 4579 t.Errorf("Unexpected number of deletes. Expected %d, saw %d\n", 0, len(fakePodControl.DeletePodName)) 4580 } 4581 } 4582 4583 func TestWatchJobs(t *testing.T) { 4584 _, ctx := ktesting.NewTestContext(t) 4585 clientset := fake.NewSimpleClientset() 4586 fakeWatch := watch.NewFake() 4587 clientset.PrependWatchReactor("jobs", core.DefaultWatchReactor(fakeWatch, nil)) 4588 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 4589 manager.podStoreSynced = alwaysReady 4590 manager.jobStoreSynced = alwaysReady 4591 4592 var testJob batch.Job 4593 received := make(chan struct{}) 4594 4595 // The update sent through the fakeWatcher should make its way into the workqueue, 4596 // and eventually into the syncHandler. 4597 manager.syncHandler = func(ctx context.Context, key string) error { 4598 defer close(received) 4599 ns, name, err := cache.SplitMetaNamespaceKey(key) 4600 if err != nil { 4601 t.Errorf("Error getting namespace/name from key %v: %v", key, err) 4602 } 4603 job, err := manager.jobLister.Jobs(ns).Get(name) 4604 if err != nil || job == nil { 4605 t.Errorf("Expected to find job under key %v: %v", key, err) 4606 return nil 4607 } 4608 if !apiequality.Semantic.DeepDerivative(*job, testJob) { 4609 t.Errorf("Expected %#v, but got %#v", testJob, *job) 4610 } 4611 return nil 4612 } 4613 // Start only the job watcher and the workqueue, send a watch event, 4614 // and make sure it hits the sync method. 
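// Receiving on the received channel below is what ends the test; the stubbed
// syncHandler closes it once the Job's key reaches the sync path.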
4615 stopCh := make(chan struct{}) 4616 defer close(stopCh) 4617 sharedInformerFactory.Start(stopCh) 4618 go manager.Run(context.TODO(), 1) 4619 4620 // We're sending new job to see if it reaches syncHandler. 4621 testJob.Namespace = "bar" 4622 testJob.Name = "foo" 4623 fakeWatch.Add(&testJob) 4624 t.Log("Waiting for job to reach syncHandler") 4625 <-received 4626 } 4627 4628 func TestWatchPods(t *testing.T) { 4629 _, ctx := ktesting.NewTestContext(t) 4630 testJob := newJob(2, 2, 6, batch.NonIndexedCompletion) 4631 clientset := fake.NewSimpleClientset(testJob) 4632 fakeWatch := watch.NewFake() 4633 clientset.PrependWatchReactor("pods", core.DefaultWatchReactor(fakeWatch, nil)) 4634 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 4635 manager.podStoreSynced = alwaysReady 4636 manager.jobStoreSynced = alwaysReady 4637 4638 // Put one job and one pod into the store 4639 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(testJob) 4640 received := make(chan struct{}) 4641 // The pod update sent through the fakeWatcher should figure out the managing job and 4642 // send it into the syncHandler. 4643 manager.syncHandler = func(ctx context.Context, key string) error { 4644 ns, name, err := cache.SplitMetaNamespaceKey(key) 4645 if err != nil { 4646 t.Errorf("Error getting namespace/name from key %v: %v", key, err) 4647 } 4648 job, err := manager.jobLister.Jobs(ns).Get(name) 4649 if err != nil { 4650 t.Errorf("Expected to find job under key %v: %v", key, err) 4651 } 4652 if !apiequality.Semantic.DeepDerivative(job, testJob) { 4653 t.Errorf("\nExpected %#v,\nbut got %#v", testJob, job) 4654 close(received) 4655 return nil 4656 } 4657 close(received) 4658 return nil 4659 } 4660 // Start only the pod watcher and the workqueue, send a watch event, 4661 // and make sure it hits the sync method for the right job. 4662 stopCh := make(chan struct{}) 4663 defer close(stopCh) 4664 go sharedInformerFactory.Core().V1().Pods().Informer().Run(stopCh) 4665 go manager.Run(context.TODO(), 1) 4666 4667 pods := newPodList(1, v1.PodRunning, testJob) 4668 testPod := pods[0] 4669 testPod.Status.Phase = v1.PodFailed 4670 fakeWatch.Add(testPod) 4671 4672 t.Log("Waiting for pod to reach syncHandler") 4673 <-received 4674 } 4675 4676 func TestWatchOrphanPods(t *testing.T) { 4677 _, ctx := ktesting.NewTestContext(t) 4678 clientset := fake.NewSimpleClientset() 4679 sharedInformers := informers.NewSharedInformerFactory(clientset, controller.NoResyncPeriodFunc()) 4680 manager, err := NewController(ctx, sharedInformers.Core().V1().Pods(), sharedInformers.Batch().V1().Jobs(), clientset) 4681 if err != nil { 4682 t.Fatalf("Error creating Job controller: %v", err) 4683 } 4684 manager.podStoreSynced = alwaysReady 4685 manager.jobStoreSynced = alwaysReady 4686 4687 stopCh := make(chan struct{}) 4688 defer close(stopCh) 4689 podInformer := sharedInformers.Core().V1().Pods().Informer() 4690 go podInformer.Run(stopCh) 4691 cache.WaitForCacheSync(stopCh, podInformer.HasSynced) 4692 go manager.Run(context.TODO(), 1) 4693 4694 // Create job but don't add it to the store. 
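// Every case builds a pod that carries the Job tracking finalizer and a
// deletion timestamp; the finalizer must be stripped whether the pod has no
// owner at all, references a Job that cannot be found, or belongs to a Job
// that has already finished.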
4695 cases := map[string]struct { 4696 job *batch.Job 4697 inCache bool 4698 }{ 4699 "job_does_not_exist": { 4700 job: newJob(2, 2, 6, batch.NonIndexedCompletion), 4701 }, 4702 "orphan": {}, 4703 "job_finished": { 4704 job: func() *batch.Job { 4705 j := newJob(2, 2, 6, batch.NonIndexedCompletion) 4706 j.Status.Conditions = append(j.Status.Conditions, batch.JobCondition{ 4707 Type: batch.JobComplete, 4708 Status: v1.ConditionTrue, 4709 }) 4710 return j 4711 }(), 4712 inCache: true, 4713 }, 4714 } 4715 for name, tc := range cases { 4716 t.Run(name, func(t *testing.T) { 4717 if tc.inCache { 4718 if err := sharedInformers.Batch().V1().Jobs().Informer().GetIndexer().Add(tc.job); err != nil { 4719 t.Fatalf("Failed to insert job in index: %v", err) 4720 } 4721 t.Cleanup(func() { 4722 sharedInformers.Batch().V1().Jobs().Informer().GetIndexer().Delete(tc.job) 4723 }) 4724 } 4725 4726 podBuilder := buildPod().name(name).deletionTimestamp().trackingFinalizer() 4727 if tc.job != nil { 4728 podBuilder = podBuilder.job(tc.job) 4729 } 4730 orphanPod := podBuilder.Pod 4731 orphanPod, err := clientset.CoreV1().Pods("default").Create(context.Background(), orphanPod, metav1.CreateOptions{}) 4732 if err != nil { 4733 t.Fatalf("Creating orphan pod: %v", err) 4734 } 4735 4736 if err := wait.PollUntilContextTimeout(ctx, 100*time.Millisecond, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) { 4737 p, err := clientset.CoreV1().Pods(orphanPod.Namespace).Get(context.Background(), orphanPod.Name, metav1.GetOptions{}) 4738 if err != nil { 4739 return false, err 4740 } 4741 return !hasJobTrackingFinalizer(p), nil 4742 }); err != nil { 4743 t.Errorf("Waiting for Pod to get the finalizer removed: %v", err) 4744 } 4745 }) 4746 } 4747 } 4748 4749 func bumpResourceVersion(obj metav1.Object) { 4750 ver, _ := strconv.ParseInt(obj.GetResourceVersion(), 10, 32) 4751 obj.SetResourceVersion(strconv.FormatInt(ver+1, 10)) 4752 } 4753 4754 func TestJobApiBackoffReset(t *testing.T) { 4755 t.Cleanup(setDurationDuringTest(&DefaultJobApiBackOff, fastJobApiBackoff)) 4756 _, ctx := ktesting.NewTestContext(t) 4757 4758 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4759 fakeClock := clocktesting.NewFakeClock(time.Now()) 4760 manager, sharedInformerFactory := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClock) 4761 fakePodControl := controller.FakePodControl{} 4762 manager.podControl = &fakePodControl 4763 manager.podStoreSynced = alwaysReady 4764 manager.jobStoreSynced = alwaysReady 4765 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 4766 return job, nil 4767 } 4768 4769 job := newJob(1, 1, 2, batch.NonIndexedCompletion) 4770 key := testutil.GetKey(job, t) 4771 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 4772 4773 // error returned make the key requeued 4774 fakePodControl.Err = errors.New("Controller error") 4775 manager.queue.Add(key) 4776 manager.processNextWorkItem(context.TODO()) 4777 retries := manager.queue.NumRequeues(key) 4778 if retries != 1 { 4779 t.Fatalf("%s: expected exactly 1 retry, got %d", job.Name, retries) 4780 } 4781 // await for the actual requeue after processing of the pending queue is done 4782 awaitForQueueLen(ctx, t, manager, 1) 4783 4784 // the queue is emptied on success 4785 fakePodControl.Err = nil 4786 
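// A successful sync calls Forget on the key, so no retry is scheduled and the
// queue should stay empty afterwards.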
manager.processNextWorkItem(context.TODO()) 4787 verifyEmptyQueue(ctx, t, manager) 4788 } 4789 4790 var _ workqueue.RateLimitingInterface = &fakeRateLimitingQueue{} 4791 4792 type fakeRateLimitingQueue struct { 4793 workqueue.Interface 4794 requeues int 4795 item interface{} 4796 duration time.Duration 4797 } 4798 4799 func (f *fakeRateLimitingQueue) AddRateLimited(item interface{}) {} 4800 func (f *fakeRateLimitingQueue) Forget(item interface{}) { 4801 f.requeues = 0 4802 } 4803 func (f *fakeRateLimitingQueue) NumRequeues(item interface{}) int { 4804 return f.requeues 4805 } 4806 func (f *fakeRateLimitingQueue) AddAfter(item interface{}, duration time.Duration) { 4807 f.item = item 4808 f.duration = duration 4809 } 4810 4811 func TestJobBackoff(t *testing.T) { 4812 _, ctx := ktesting.NewTestContext(t) 4813 logger := klog.FromContext(ctx) 4814 job := newJob(1, 1, 1, batch.NonIndexedCompletion) 4815 oldPod := newPod(fmt.Sprintf("pod-%v", rand.String(10)), job) 4816 oldPod.ResourceVersion = "1" 4817 newPod := oldPod.DeepCopy() 4818 newPod.ResourceVersion = "2" 4819 4820 testCases := map[string]struct { 4821 requeues int 4822 oldPodPhase v1.PodPhase 4823 phase v1.PodPhase 4824 wantBackoff time.Duration 4825 }{ 4826 "failure with pod updates batching": { 4827 requeues: 0, 4828 phase: v1.PodFailed, 4829 wantBackoff: syncJobBatchPeriod, 4830 }, 4831 } 4832 4833 for name, tc := range testCases { 4834 t.Run(name, func(t *testing.T) { 4835 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4836 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 4837 fakePodControl := controller.FakePodControl{} 4838 manager.podControl = &fakePodControl 4839 manager.podStoreSynced = alwaysReady 4840 manager.jobStoreSynced = alwaysReady 4841 queue := &fakeRateLimitingQueue{} 4842 manager.queue = queue 4843 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 4844 4845 queue.requeues = tc.requeues 4846 newPod.Status.Phase = tc.phase 4847 oldPod.Status.Phase = v1.PodRunning 4848 if tc.oldPodPhase != "" { 4849 oldPod.Status.Phase = tc.oldPodPhase 4850 } 4851 manager.updatePod(logger, oldPod, newPod) 4852 if queue.duration != tc.wantBackoff { 4853 t.Errorf("unexpected backoff %v, expected %v", queue.duration, tc.wantBackoff) 4854 } 4855 }) 4856 } 4857 } 4858 4859 func TestJobBackoffForOnFailure(t *testing.T) { 4860 _, ctx := ktesting.NewTestContext(t) 4861 jobConditionComplete := batch.JobComplete 4862 jobConditionFailed := batch.JobFailed 4863 jobConditionSuspended := batch.JobSuspended 4864 4865 testCases := map[string]struct { 4866 // job setup 4867 parallelism int32 4868 completions int32 4869 backoffLimit int32 4870 suspend bool 4871 4872 // pod setup 4873 restartCounts []int32 4874 podPhase v1.PodPhase 4875 4876 // expectations 4877 expectedActive int32 4878 expectedSucceeded int32 4879 expectedFailed int32 4880 expectedCondition *batch.JobConditionType 4881 expectedConditionReason string 4882 }{ 4883 "backoffLimit 0 should have 1 pod active": { 4884 1, 1, 0, 4885 false, []int32{0}, v1.PodRunning, 4886 1, 0, 0, nil, "", 4887 }, 4888 "backoffLimit 1 with restartCount 0 should have 1 pod active": { 4889 1, 1, 1, 4890 false, []int32{0}, v1.PodRunning, 4891 1, 0, 0, nil, "", 4892 }, 4893 "backoffLimit 1 with restartCount 1 and podRunning should have 0 pod active": { 4894 1, 1, 1, 4895 false, []int32{1}, v1.PodRunning, 
4896 0, 0, 1, &jobConditionFailed, "BackoffLimitExceeded", 4897 }, 4898 "backoffLimit 1 with restartCount 1 and podPending should have 0 pod active": { 4899 1, 1, 1, 4900 false, []int32{1}, v1.PodPending, 4901 0, 0, 1, &jobConditionFailed, "BackoffLimitExceeded", 4902 }, 4903 "too many job failures with podRunning - single pod": { 4904 1, 5, 2, 4905 false, []int32{2}, v1.PodRunning, 4906 0, 0, 1, &jobConditionFailed, "BackoffLimitExceeded", 4907 }, 4908 "too many job failures with podPending - single pod": { 4909 1, 5, 2, 4910 false, []int32{2}, v1.PodPending, 4911 0, 0, 1, &jobConditionFailed, "BackoffLimitExceeded", 4912 }, 4913 "too many job failures with podRunning - multiple pods": { 4914 2, 5, 2, 4915 false, []int32{1, 1}, v1.PodRunning, 4916 0, 0, 2, &jobConditionFailed, "BackoffLimitExceeded", 4917 }, 4918 "too many job failures with podPending - multiple pods": { 4919 2, 5, 2, 4920 false, []int32{1, 1}, v1.PodPending, 4921 0, 0, 2, &jobConditionFailed, "BackoffLimitExceeded", 4922 }, 4923 "not enough failures": { 4924 2, 5, 3, 4925 false, []int32{1, 1}, v1.PodRunning, 4926 2, 0, 0, nil, "", 4927 }, 4928 "suspending a job": { 4929 2, 4, 6, 4930 true, []int32{1, 1}, v1.PodRunning, 4931 0, 0, 0, &jobConditionSuspended, "JobSuspended", 4932 }, 4933 "finshed job": { 4934 2, 4, 6, 4935 true, []int32{1, 1, 2, 0}, v1.PodSucceeded, 4936 0, 4, 0, &jobConditionComplete, "", 4937 }, 4938 } 4939 4940 for name, tc := range testCases { 4941 t.Run(name, func(t *testing.T) { 4942 // job manager setup 4943 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4944 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 4945 fakePodControl := controller.FakePodControl{} 4946 manager.podControl = &fakePodControl 4947 manager.podStoreSynced = alwaysReady 4948 manager.jobStoreSynced = alwaysReady 4949 var actual *batch.Job 4950 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 4951 actual = job 4952 return job, nil 4953 } 4954 4955 // job & pods setup 4956 job := newJob(tc.parallelism, tc.completions, tc.backoffLimit, batch.NonIndexedCompletion) 4957 job.Spec.Template.Spec.RestartPolicy = v1.RestartPolicyOnFailure 4958 job.Spec.Suspend = ptr.To(tc.suspend) 4959 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 4960 podIndexer := sharedInformerFactory.Core().V1().Pods().Informer().GetIndexer() 4961 for i, pod := range newPodList(len(tc.restartCounts), tc.podPhase, job) { 4962 pod.Status.ContainerStatuses = []v1.ContainerStatus{{RestartCount: tc.restartCounts[i]}} 4963 podIndexer.Add(pod) 4964 } 4965 4966 // run 4967 err := manager.syncJob(context.TODO(), testutil.GetKey(job, t)) 4968 4969 if err != nil { 4970 t.Errorf("unexpected error syncing job. Got %#v", err) 4971 } 4972 // validate status 4973 if actual.Status.Active != tc.expectedActive { 4974 t.Errorf("unexpected number of active pods. Expected %d, saw %d\n", tc.expectedActive, actual.Status.Active) 4975 } 4976 if actual.Status.Succeeded != tc.expectedSucceeded { 4977 t.Errorf("unexpected number of succeeded pods. Expected %d, saw %d\n", tc.expectedSucceeded, actual.Status.Succeeded) 4978 } 4979 if actual.Status.Failed != tc.expectedFailed { 4980 t.Errorf("unexpected number of failed pods. 
Expected %d, saw %d\n", tc.expectedFailed, actual.Status.Failed) 4981 } 4982 // validate conditions 4983 if tc.expectedCondition != nil && !getCondition(actual, *tc.expectedCondition, v1.ConditionTrue, tc.expectedConditionReason) { 4984 t.Errorf("expected completion condition. Got %#v", actual.Status.Conditions) 4985 } 4986 }) 4987 } 4988 } 4989 4990 func TestJobBackoffOnRestartPolicyNever(t *testing.T) { 4991 _, ctx := ktesting.NewTestContext(t) 4992 jobConditionFailed := batch.JobFailed 4993 4994 testCases := map[string]struct { 4995 // job setup 4996 parallelism int32 4997 completions int32 4998 backoffLimit int32 4999 5000 // pod setup 5001 activePodsPhase v1.PodPhase 5002 activePods int 5003 failedPods int 5004 5005 // expectations 5006 expectedActive int32 5007 expectedSucceeded int32 5008 expectedFailed int32 5009 expectedCondition *batch.JobConditionType 5010 expectedConditionReason string 5011 }{ 5012 "not enough failures with backoffLimit 0 - single pod": { 5013 1, 1, 0, 5014 v1.PodRunning, 1, 0, 5015 1, 0, 0, nil, "", 5016 }, 5017 "not enough failures with backoffLimit 1 - single pod": { 5018 1, 1, 1, 5019 "", 0, 1, 5020 1, 0, 1, nil, "", 5021 }, 5022 "too many failures with backoffLimit 1 - single pod": { 5023 1, 1, 1, 5024 "", 0, 2, 5025 0, 0, 2, &jobConditionFailed, "BackoffLimitExceeded", 5026 }, 5027 "not enough failures with backoffLimit 6 - multiple pods": { 5028 2, 2, 6, 5029 v1.PodRunning, 1, 6, 5030 2, 0, 6, nil, "", 5031 }, 5032 "too many failures with backoffLimit 6 - multiple pods": { 5033 2, 2, 6, 5034 "", 0, 7, 5035 0, 0, 7, &jobConditionFailed, "BackoffLimitExceeded", 5036 }, 5037 } 5038 5039 for name, tc := range testCases { 5040 t.Run(name, func(t *testing.T) { 5041 // job manager setup 5042 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 5043 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 5044 fakePodControl := controller.FakePodControl{} 5045 manager.podControl = &fakePodControl 5046 manager.podStoreSynced = alwaysReady 5047 manager.jobStoreSynced = alwaysReady 5048 var actual *batch.Job 5049 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 5050 actual = job 5051 return job, nil 5052 } 5053 5054 // job & pods setup 5055 job := newJob(tc.parallelism, tc.completions, tc.backoffLimit, batch.NonIndexedCompletion) 5056 job.Spec.Template.Spec.RestartPolicy = v1.RestartPolicyNever 5057 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 5058 podIndexer := sharedInformerFactory.Core().V1().Pods().Informer().GetIndexer() 5059 for _, pod := range newPodList(tc.failedPods, v1.PodFailed, job) { 5060 pod.Status.ContainerStatuses = []v1.ContainerStatus{{State: v1.ContainerState{Terminated: &v1.ContainerStateTerminated{ 5061 FinishedAt: testFinishedAt, 5062 }}}} 5063 podIndexer.Add(pod) 5064 } 5065 for _, pod := range newPodList(tc.activePods, tc.activePodsPhase, job) { 5066 podIndexer.Add(pod) 5067 } 5068 5069 // run 5070 err := manager.syncJob(context.TODO(), testutil.GetKey(job, t)) 5071 if err != nil { 5072 t.Fatalf("unexpected error syncing job: %#v\n", err) 5073 } 5074 // validate status 5075 if actual.Status.Active != tc.expectedActive { 5076 t.Errorf("unexpected number of active pods. 
Expected %d, saw %d\n", tc.expectedActive, actual.Status.Active) 5077 } 5078 if actual.Status.Succeeded != tc.expectedSucceeded { 5079 t.Errorf("unexpected number of succeeded pods. Expected %d, saw %d\n", tc.expectedSucceeded, actual.Status.Succeeded) 5080 } 5081 if actual.Status.Failed != tc.expectedFailed { 5082 t.Errorf("unexpected number of failed pods. Expected %d, saw %d\n", tc.expectedFailed, actual.Status.Failed) 5083 } 5084 // validate conditions 5085 if tc.expectedCondition != nil && !getCondition(actual, *tc.expectedCondition, v1.ConditionTrue, tc.expectedConditionReason) { 5086 t.Errorf("expected completion condition. Got %#v", actual.Status.Conditions) 5087 } 5088 }) 5089 } 5090 } 5091 5092 func TestEnsureJobConditions(t *testing.T) { 5093 testCases := []struct { 5094 name string 5095 haveList []batch.JobCondition 5096 wantType batch.JobConditionType 5097 wantStatus v1.ConditionStatus 5098 wantReason string 5099 expectList []batch.JobCondition 5100 expectUpdate bool 5101 }{ 5102 { 5103 name: "append true condition", 5104 haveList: []batch.JobCondition{}, 5105 wantType: batch.JobSuspended, 5106 wantStatus: v1.ConditionTrue, 5107 wantReason: "foo", 5108 expectList: []batch.JobCondition{*newCondition(batch.JobSuspended, v1.ConditionTrue, "foo", "", realClock.Now())}, 5109 expectUpdate: true, 5110 }, 5111 { 5112 name: "append false condition", 5113 haveList: []batch.JobCondition{}, 5114 wantType: batch.JobSuspended, 5115 wantStatus: v1.ConditionFalse, 5116 wantReason: "foo", 5117 expectList: []batch.JobCondition{}, 5118 expectUpdate: false, 5119 }, 5120 { 5121 name: "update true condition reason", 5122 haveList: []batch.JobCondition{*newCondition(batch.JobSuspended, v1.ConditionTrue, "foo", "", realClock.Now())}, 5123 wantType: batch.JobSuspended, 5124 wantStatus: v1.ConditionTrue, 5125 wantReason: "bar", 5126 expectList: []batch.JobCondition{*newCondition(batch.JobSuspended, v1.ConditionTrue, "bar", "", realClock.Now())}, 5127 expectUpdate: true, 5128 }, 5129 { 5130 name: "update true condition status", 5131 haveList: []batch.JobCondition{*newCondition(batch.JobSuspended, v1.ConditionTrue, "foo", "", realClock.Now())}, 5132 wantType: batch.JobSuspended, 5133 wantStatus: v1.ConditionFalse, 5134 wantReason: "foo", 5135 expectList: []batch.JobCondition{*newCondition(batch.JobSuspended, v1.ConditionFalse, "foo", "", realClock.Now())}, 5136 expectUpdate: true, 5137 }, 5138 { 5139 name: "update false condition status", 5140 haveList: []batch.JobCondition{*newCondition(batch.JobSuspended, v1.ConditionFalse, "foo", "", realClock.Now())}, 5141 wantType: batch.JobSuspended, 5142 wantStatus: v1.ConditionTrue, 5143 wantReason: "foo", 5144 expectList: []batch.JobCondition{*newCondition(batch.JobSuspended, v1.ConditionTrue, "foo", "", realClock.Now())}, 5145 expectUpdate: true, 5146 }, 5147 { 5148 name: "condition already exists", 5149 haveList: []batch.JobCondition{*newCondition(batch.JobSuspended, v1.ConditionTrue, "foo", "", realClock.Now())}, 5150 wantType: batch.JobSuspended, 5151 wantStatus: v1.ConditionTrue, 5152 wantReason: "foo", 5153 expectList: []batch.JobCondition{*newCondition(batch.JobSuspended, v1.ConditionTrue, "foo", "", realClock.Now())}, 5154 expectUpdate: false, 5155 }, 5156 } 5157 for _, tc := range testCases { 5158 t.Run(tc.name, func(t *testing.T) { 5159 gotList, isUpdated := ensureJobConditionStatus(tc.haveList, tc.wantType, tc.wantStatus, tc.wantReason, "", realClock.Now()) 5160 if isUpdated != tc.expectUpdate { 5161 t.Errorf("Got isUpdated=%v, want %v", isUpdated, 
tc.expectUpdate) 5162 } 5163 if len(gotList) != len(tc.expectList) { 5164 t.Errorf("got a list of length %d, want %d", len(gotList), len(tc.expectList)) 5165 } 5166 if diff := cmp.Diff(tc.expectList, gotList, cmpopts.IgnoreFields(batch.JobCondition{}, "LastProbeTime", "LastTransitionTime")); diff != "" { 5167 t.Errorf("Unexpected JobCondition list: (-want,+got):\n%s", diff) 5168 } 5169 }) 5170 } 5171 } 5172 5173 func TestFinalizersRemovedExpectations(t *testing.T) { 5174 _, ctx := ktesting.NewTestContext(t) 5175 clientset := fake.NewSimpleClientset() 5176 sharedInformers := informers.NewSharedInformerFactory(clientset, controller.NoResyncPeriodFunc()) 5177 manager, err := NewController(ctx, sharedInformers.Core().V1().Pods(), sharedInformers.Batch().V1().Jobs(), clientset) 5178 if err != nil { 5179 t.Fatalf("Error creating Job controller: %v", err) 5180 } 5181 manager.podStoreSynced = alwaysReady 5182 manager.jobStoreSynced = alwaysReady 5183 manager.podControl = &controller.FakePodControl{Err: errors.New("fake pod controller error")} 5184 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 5185 return job, nil 5186 } 5187 5188 job := newJob(2, 2, 6, batch.NonIndexedCompletion) 5189 sharedInformers.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 5190 pods := append(newPodList(2, v1.PodSucceeded, job), newPodList(2, v1.PodFailed, job)...) 5191 podInformer := sharedInformers.Core().V1().Pods().Informer() 5192 podIndexer := podInformer.GetIndexer() 5193 uids := sets.New[string]() 5194 for i := range pods { 5195 clientset.Tracker().Add(pods[i]) 5196 podIndexer.Add(pods[i]) 5197 uids.Insert(string(pods[i].UID)) 5198 } 5199 jobKey := testutil.GetKey(job, t) 5200 5201 manager.syncJob(context.TODO(), jobKey) 5202 gotExpectedUIDs := manager.finalizerExpectations.getExpectedUIDs(jobKey) 5203 if len(gotExpectedUIDs) != 0 { 5204 t.Errorf("Got unwanted expectations for removed finalizers after first syncJob with client failures:\n%s", sets.List(gotExpectedUIDs)) 5205 } 5206 5207 // Remove failures and re-sync. 5208 manager.podControl.(*controller.FakePodControl).Err = nil 5209 manager.syncJob(context.TODO(), jobKey) 5210 gotExpectedUIDs = manager.finalizerExpectations.getExpectedUIDs(jobKey) 5211 if diff := cmp.Diff(uids, gotExpectedUIDs); diff != "" { 5212 t.Errorf("Different expectations for removed finalizers after syncJob (-want,+got):\n%s", diff) 5213 } 5214 5215 stopCh := make(chan struct{}) 5216 defer close(stopCh) 5217 go sharedInformers.Core().V1().Pods().Informer().Run(stopCh) 5218 cache.WaitForCacheSync(stopCh, podInformer.HasSynced) 5219 5220 // Make sure the first syncJob sets the expectations, even after the caches synced. 5221 gotExpectedUIDs = manager.finalizerExpectations.getExpectedUIDs(jobKey) 5222 if diff := cmp.Diff(uids, gotExpectedUIDs); diff != "" { 5223 t.Errorf("Different expectations for removed finalizers after syncJob and cacheSync (-want,+got):\n%s", diff) 5224 } 5225 5226 // Change pods in different ways. 
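// pods[0] and pods[1] lose their finalizer, pods[3] is deleted outright, and
// only pods[2] keeps its finalizer, so the expectation set below is expected
// to shrink to pods[2]'s UID.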

	podsResource := schema.GroupVersionResource{Version: "v1", Resource: "pods"}

	update := pods[0].DeepCopy()
	update.Finalizers = nil
	update.ResourceVersion = "1"
	err = clientset.Tracker().Update(podsResource, update, update.Namespace)
	if err != nil {
		t.Errorf("Removing finalizer: %v", err)
	}

	update = pods[1].DeepCopy()
	update.Finalizers = nil
	update.DeletionTimestamp = &metav1.Time{Time: time.Now()}
	update.ResourceVersion = "1"
	err = clientset.Tracker().Update(podsResource, update, update.Namespace)
	if err != nil {
		t.Errorf("Removing finalizer and setting deletion timestamp: %v", err)
	}

	// Preserve the finalizer.
	update = pods[2].DeepCopy()
	update.DeletionTimestamp = &metav1.Time{Time: time.Now()}
	update.ResourceVersion = "1"
	err = clientset.Tracker().Update(podsResource, update, update.Namespace)
	if err != nil {
		t.Errorf("Setting deletion timestamp: %v", err)
	}

	err = clientset.Tracker().Delete(podsResource, pods[3].Namespace, pods[3].Name)
	if err != nil {
		t.Errorf("Deleting pod that had finalizer: %v", err)
	}

	uids = sets.New(string(pods[2].UID))
	var diff string
	if err := wait.PollUntilContextTimeout(ctx, 100*time.Millisecond, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
		gotExpectedUIDs = manager.finalizerExpectations.getExpectedUIDs(jobKey)
		diff = cmp.Diff(uids, gotExpectedUIDs)
		return diff == "", nil
	}); err != nil {
		t.Errorf("Timeout waiting for expectations (-want, +got):\n%s", diff)
	}
}

func TestFinalizerCleanup(t *testing.T) {
	_, ctx := ktesting.NewTestContext(t)
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	clientset := fake.NewSimpleClientset()
	sharedInformers := informers.NewSharedInformerFactory(clientset, controller.NoResyncPeriodFunc())
	manager, err := NewController(ctx, sharedInformers.Core().V1().Pods(), sharedInformers.Batch().V1().Jobs(), clientset)
	if err != nil {
		t.Fatalf("Error creating Job controller: %v", err)
	}
	manager.podStoreSynced = alwaysReady
	manager.jobStoreSynced = alwaysReady

	// Initialize the controller with 0 workers to make sure the
	// pod finalizers are not removed by the "syncJob" function.
	go manager.Run(ctx, 0)

	// Start the Pod and Job informers.
	sharedInformers.Start(ctx.Done())
	sharedInformers.WaitForCacheSync(ctx.Done())

	// Create a simple Job.
	job := newJob(1, 1, 1, batch.NonIndexedCompletion)
	job, err = clientset.BatchV1().Jobs(job.GetNamespace()).Create(ctx, job, metav1.CreateOptions{})
	if err != nil {
		t.Fatalf("Creating job: %v", err)
	}

	// Create a Pod with the job tracking finalizer.
	pod := newPod("test-pod", job)
	pod.Finalizers = append(pod.Finalizers, batch.JobTrackingFinalizer)
	pod, err = clientset.CoreV1().Pods(pod.GetNamespace()).Create(ctx, pod, metav1.CreateOptions{})
	if err != nil {
		t.Fatalf("Creating pod: %v", err)
	}

	// Mark Job as complete.
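	// Because Run was started with zero syncJob workers, any finalizer removal
	// observed below has to come from the controller's handling of pods that
	// belong to an already-finished Job rather than from the syncJob loop.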
	job.Status.Conditions = append(job.Status.Conditions, batch.JobCondition{
		Type:   batch.JobComplete,
		Status: v1.ConditionTrue,
	})
	_, err = clientset.BatchV1().Jobs(job.GetNamespace()).UpdateStatus(ctx, job, metav1.UpdateOptions{})
	if err != nil {
		t.Fatalf("Updating job status: %v", err)
	}

	// Verify the pod finalizer is removed for a finished Job,
	// even if the job's pods are not tracked by the main reconciliation loop.
	if err := wait.PollUntilContextTimeout(ctx, 100*time.Millisecond, wait.ForeverTestTimeout, true, func(ctx context.Context) (bool, error) {
		p, err := clientset.CoreV1().Pods(pod.Namespace).Get(ctx, pod.Name, metav1.GetOptions{})
		if err != nil {
			return false, err
		}
		return !hasJobTrackingFinalizer(p), nil
	}); err != nil {
		t.Errorf("Waiting for Pod to get the finalizer removed: %v", err)
	}
}

func checkJobCompletionLabel(t *testing.T, p *v1.PodTemplateSpec) {
	t.Helper()
	labels := p.GetLabels()
	if labels == nil || labels[batch.JobCompletionIndexAnnotation] == "" {
		t.Errorf("missing expected pod label %s", batch.JobCompletionIndexAnnotation)
	}
}

func checkJobCompletionEnvVariable(t *testing.T, spec *v1.PodSpec, podIndexLabelDisabled bool) {
	t.Helper()
	var fieldPath string
	if podIndexLabelDisabled {
		fieldPath = fmt.Sprintf("metadata.annotations['%s']", batch.JobCompletionIndexAnnotation)
	} else {
		fieldPath = fmt.Sprintf("metadata.labels['%s']", batch.JobCompletionIndexAnnotation)
	}
	want := []v1.EnvVar{
		{
			Name: "JOB_COMPLETION_INDEX",
			ValueFrom: &v1.EnvVarSource{
				FieldRef: &v1.ObjectFieldSelector{
					FieldPath: fieldPath,
				},
			},
		},
	}
	for _, c := range spec.InitContainers {
		if diff := cmp.Diff(want, c.Env); diff != "" {
			t.Errorf("Unexpected Env in container %s (-want,+got):\n%s", c.Name, diff)
		}
	}
	for _, c := range spec.Containers {
		if diff := cmp.Diff(want, c.Env); diff != "" {
			t.Errorf("Unexpected Env in container %s (-want,+got):\n%s", c.Name, diff)
		}
	}
}

func podReplacementPolicy(m batch.PodReplacementPolicy) *batch.PodReplacementPolicy {
	return &m
}

func verifyEmptyQueueAndAwaitForQueueLen(ctx context.Context, t *testing.T, jm *Controller, wantQueueLen int) {
	t.Helper()
	verifyEmptyQueue(ctx, t, jm)
	awaitForQueueLen(ctx, t, jm, wantQueueLen)
}

func awaitForQueueLen(ctx context.Context, t *testing.T, jm *Controller, wantQueueLen int) {
	t.Helper()
	verifyEmptyQueue(ctx, t, jm)
	if err := wait.PollUntilContextTimeout(ctx, fastRequeue, time.Second, true, func(ctx context.Context) (bool, error) {
		if requeued := jm.queue.Len() == wantQueueLen; requeued {
			return true, nil
		}
		jm.clock.Sleep(fastRequeue)
		return false, nil
	}); err != nil {
		t.Errorf("Timed out waiting for expected queue.Len(). want %v, got: %v", wantQueueLen, jm.queue.Len())
	}
}

func verifyEmptyQueue(ctx context.Context, t *testing.T, jm *Controller) {
	t.Helper()
	if jm.queue.Len() > 0 {
		t.Errorf("Unexpected queue.Len(). Want: %d, got: %d", 0, jm.queue.Len())
	}
}

// podBuilder is a small fluent helper for constructing Pods in tests.
type podBuilder struct {
	*v1.Pod
}

// buildPod returns a podBuilder wrapping a Pod with a random UID.
func buildPod() podBuilder {
	return podBuilder{Pod: &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			UID: types.UID(rand.String(5)),
		},
	}}
}

// getConditionsByType returns pointers to all conditions in list with the given type.
func getConditionsByType(list []batch.JobCondition, cType batch.JobConditionType) []*batch.JobCondition {
	var result []*batch.JobCondition
	for i := range list {
		if list[i].Type == cType {
			result = append(result, &list[i])
		}
	}
	return result
}

func (pb podBuilder) name(n string) podBuilder {
	pb.Name = n
	return pb
}

func (pb podBuilder) ns(n string) podBuilder {
	pb.Namespace = n
	return pb
}

func (pb podBuilder) uid(u string) podBuilder {
	pb.UID = types.UID(u)
	return pb
}

func (pb podBuilder) job(j *batch.Job) podBuilder {
	pb.Labels = j.Spec.Selector.MatchLabels
	pb.Namespace = j.Namespace
	pb.OwnerReferences = []metav1.OwnerReference{*metav1.NewControllerRef(j, controllerKind)}
	return pb
}

func (pb podBuilder) clearOwner() podBuilder {
	pb.OwnerReferences = nil
	return pb
}

func (pb podBuilder) clearLabels() podBuilder {
	pb.Labels = nil
	return pb
}

func (pb podBuilder) index(ix string) podBuilder {
	return pb.annotation(batch.JobCompletionIndexAnnotation, ix)
}

func (pb podBuilder) indexFailureCount(count string) podBuilder {
	return pb.annotation(batch.JobIndexFailureCountAnnotation, count)
}

func (pb podBuilder) indexIgnoredFailureCount(count string) podBuilder {
	return pb.annotation(batch.JobIndexIgnoredFailureCountAnnotation, count)
}

func (pb podBuilder) annotation(key, value string) podBuilder {
	if pb.Annotations == nil {
		pb.Annotations = make(map[string]string)
	}
	pb.Annotations[key] = value
	return pb
}

func (pb podBuilder) status(s v1.PodStatus) podBuilder {
	pb.Status = s
	return pb
}

func (pb podBuilder) phase(p v1.PodPhase) podBuilder {
	pb.Status.Phase = p
	return pb
}

func (pb podBuilder) trackingFinalizer() podBuilder {
	for _, f := range pb.Finalizers {
		if f == batch.JobTrackingFinalizer {
			return pb
		}
	}
	pb.Finalizers = append(pb.Finalizers, batch.JobTrackingFinalizer)
	return pb
}

func (pb podBuilder) deletionTimestamp() podBuilder {
	pb.DeletionTimestamp = &metav1.Time{}
	return pb
}

func (pb podBuilder) customDeletionTimestamp(t time.Time) podBuilder {
	pb.DeletionTimestamp = &metav1.Time{Time: t}
	return pb
}

func completionModePtr(m batch.CompletionMode) *batch.CompletionMode {
	return &m
}

// setDurationDuringTest overrides *val with newVal and returns a function that
// restores the original value; callers are expected to defer the returned function.
func setDurationDuringTest(val *time.Duration, newVal time.Duration) func() {
	origVal := *val
	*val = newVal
	return func() {
		*val = origVal
	}
}
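// Illustrative usage sketch (not taken from this file): setDurationDuringTest is
// meant to be paired with defer so the package-level duration it overrides is
// restored when the test returns. DefaultJobApiBackOff is only assumed here as
// an example of such a duration; any *time.Duration a test tunes works the same.
//
//	func TestWithFastApiBackoff(t *testing.T) {
//		// Shrink the backoff for the duration of this test, then restore it.
//		defer setDurationDuringTest(&DefaultJobApiBackOff, fastJobApiBackoff)()
//		// ... exercise the controller with the fast backoff in effect ...
//	}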