k8s.io/kubernetes@v1.29.3/pkg/controller/job/job_controller_test.go

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package job

import (
	"context"
	"errors"
	"fmt"
	"math"
	"sort"
	"strconv"
	"testing"
	"time"

	"github.com/google/go-cmp/cmp"
	"github.com/google/go-cmp/cmp/cmpopts"
	batch "k8s.io/api/batch/v1"
	v1 "k8s.io/api/core/v1"
	apiequality "k8s.io/apimachinery/pkg/api/equality"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/rand"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/uuid"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/apimachinery/pkg/watch"
	"k8s.io/apiserver/pkg/util/feature"
	"k8s.io/client-go/informers"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/fake"
	restclient "k8s.io/client-go/rest"
	core "k8s.io/client-go/testing"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/util/workqueue"
	featuregatetesting "k8s.io/component-base/featuregate/testing"
	metricstestutil "k8s.io/component-base/metrics/testutil"
	"k8s.io/klog/v2"
	"k8s.io/klog/v2/ktesting"
	_ "k8s.io/kubernetes/pkg/apis/core/install"
	"k8s.io/kubernetes/pkg/controller"
	"k8s.io/kubernetes/pkg/controller/job/metrics"
	"k8s.io/kubernetes/pkg/controller/testutil"
	"k8s.io/kubernetes/pkg/features"
	"k8s.io/utils/clock"
	clocktesting "k8s.io/utils/clock/testing"
	"k8s.io/utils/ptr"
)

var realClock = &clock.RealClock{}
var alwaysReady = func() bool { return true }

const fastSyncJobBatchPeriod = 10 * time.Millisecond
const fastJobApiBackoff = 10 * time.Millisecond
const fastRequeue = 10 * time.Millisecond

// testFinishedAt represents a time one second after the Unix epoch;
// it is used in test cases where we don't want back-off to kick in.
var testFinishedAt = metav1.NewTime((time.Time{}).Add(time.Second))

func newJobWithName(name string, parallelism, completions, backoffLimit int32, completionMode batch.CompletionMode) *batch.Job {
	j := &batch.Job{
		TypeMeta: metav1.TypeMeta{Kind: "Job"},
		ObjectMeta: metav1.ObjectMeta{
			Name:      name,
			UID:       uuid.NewUUID(),
			Namespace: metav1.NamespaceDefault,
		},
		Spec: batch.JobSpec{
			Selector: &metav1.LabelSelector{
				MatchLabels: map[string]string{"foo": "bar"},
			},
			Template: v1.PodTemplateSpec{
				ObjectMeta: metav1.ObjectMeta{
					Labels: map[string]string{
						"foo": "bar",
					},
				},
				Spec: v1.PodSpec{
					Containers: []v1.Container{
						{Image: "foo/bar"},
					},
				},
			},
		},
	}
	if completionMode != "" {
		j.Spec.CompletionMode = &completionMode
	}
	// Special case: -1 for either completions or parallelism means leave nil (negative values are not
	// allowed in practice by validation).
	if completions >= 0 {
		j.Spec.Completions = &completions
	} else {
		j.Spec.Completions = nil
	}
	if parallelism >= 0 {
		j.Spec.Parallelism = &parallelism
	} else {
		j.Spec.Parallelism = nil
	}
	j.Spec.BackoffLimit = &backoffLimit

	return j
}

func newJob(parallelism, completions, backoffLimit int32, completionMode batch.CompletionMode) *batch.Job {
	return newJobWithName("foobar", parallelism, completions, backoffLimit, completionMode)
}

func newControllerFromClient(ctx context.Context, t *testing.T, kubeClient clientset.Interface, resyncPeriod controller.ResyncPeriodFunc) (*Controller, informers.SharedInformerFactory) {
	t.Helper()
	return newControllerFromClientWithClock(ctx, t, kubeClient, resyncPeriod, realClock)
}

func newControllerFromClientWithClock(ctx context.Context, t *testing.T, kubeClient clientset.Interface, resyncPeriod controller.ResyncPeriodFunc, clock clock.WithTicker) (*Controller, informers.SharedInformerFactory) {
	t.Helper()
	sharedInformers := informers.NewSharedInformerFactory(kubeClient, resyncPeriod())
	jm, err := newControllerWithClock(ctx, sharedInformers.Core().V1().Pods(), sharedInformers.Batch().V1().Jobs(), kubeClient, clock)
	if err != nil {
		t.Fatalf("Error creating Job controller: %v", err)
	}
	jm.podControl = &controller.FakePodControl{}
	return jm, sharedInformers
}

func newPod(name string, job *batch.Job) *v1.Pod {
	return &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:            name,
			UID:             types.UID(name),
			Labels:          job.Spec.Selector.MatchLabels,
			Namespace:       job.Namespace,
			OwnerReferences: []metav1.OwnerReference{*metav1.NewControllerRef(job, controllerKind)},
		},
	}
}

// create count pods with the given phase for the given job
func newPodList(count int, status v1.PodPhase, job *batch.Job) []*v1.Pod {
	var pods []*v1.Pod
	for i := 0; i < count; i++ {
		newPod := newPod(fmt.Sprintf("pod-%v", rand.String(10)), job)
		newPod.Status = v1.PodStatus{Phase: status}
		newPod.Status.ContainerStatuses = []v1.ContainerStatus{
			{
				State: v1.ContainerState{
					Terminated: &v1.ContainerStateTerminated{
						FinishedAt: testFinishedAt,
					},
				},
			},
		}
		newPod.Finalizers = append(newPod.Finalizers, batch.JobTrackingFinalizer)
		pods = append(pods, newPod)
	}
	return pods
}

func setPodsStatuses(podIndexer cache.Indexer, job *batch.Job, pendingPods, activePods, succeededPods, failedPods, terminatingPods, readyPods int) {
	for _, pod := range newPodList(pendingPods, v1.PodPending, job) {
		podIndexer.Add(pod)
	}
	running := newPodList(activePods, v1.PodRunning, job)
	for i, p := range running {
		if i >= readyPods {
			break
		}
		p.Status.Conditions = append(p.Status.Conditions, v1.PodCondition{
			Type:   v1.PodReady,
			Status: v1.ConditionTrue,
		})
	}
	for _, pod := range running {
		podIndexer.Add(pod)
	}
	for _, pod := range newPodList(succeededPods, v1.PodSucceeded, job) {
		podIndexer.Add(pod)
	}
	for _, pod := range newPodList(failedPods, v1.PodFailed, job) {
		podIndexer.Add(pod)
	}
	terminating := newPodList(terminatingPods, v1.PodRunning, job)
	for _, p := range terminating {
		now := metav1.Now()
		p.DeletionTimestamp = &now
	}
	for _, pod := range terminating {
		podIndexer.Add(pod)
	}
}

func setPodsStatusesWithIndexes(podIndexer cache.Indexer, job *batch.Job, status
[]indexPhase) { 209 for _, s := range status { 210 p := newPod(fmt.Sprintf("pod-%s", rand.String(10)), job) 211 p.Status = v1.PodStatus{Phase: s.Phase} 212 if s.Phase == v1.PodFailed || s.Phase == v1.PodSucceeded { 213 p.Status.ContainerStatuses = []v1.ContainerStatus{ 214 { 215 State: v1.ContainerState{ 216 Terminated: &v1.ContainerStateTerminated{ 217 FinishedAt: testFinishedAt, 218 }, 219 }, 220 }, 221 } 222 } 223 if s.Index != noIndex { 224 p.Annotations = map[string]string{ 225 batch.JobCompletionIndexAnnotation: s.Index, 226 } 227 p.Spec.Hostname = fmt.Sprintf("%s-%s", job.Name, s.Index) 228 } 229 p.Finalizers = append(p.Finalizers, batch.JobTrackingFinalizer) 230 podIndexer.Add(p) 231 } 232 } 233 234 type jobInitialStatus struct { 235 active int 236 succeed int 237 failed int 238 startTime *time.Time 239 } 240 241 func TestControllerSyncJob(t *testing.T) { 242 _, ctx := ktesting.NewTestContext(t) 243 jobConditionComplete := batch.JobComplete 244 jobConditionFailed := batch.JobFailed 245 jobConditionSuspended := batch.JobSuspended 246 referenceTime := time.Now() 247 248 testCases := map[string]struct { 249 // job setup 250 parallelism int32 251 completions int32 252 backoffLimit int32 253 deleting bool 254 podLimit int 255 completionMode batch.CompletionMode 256 wasSuspended bool 257 suspend bool 258 podReplacementPolicy *batch.PodReplacementPolicy 259 podFailurePolicy *batch.PodFailurePolicy 260 initialStatus *jobInitialStatus 261 backoffRecord *backoffRecord 262 controllerTime *time.Time 263 264 // pod setup 265 266 // If a podControllerError is set, finalizers are not able to be removed. 267 // This means that there is no status update so the counters for 268 // failedPods and succeededPods cannot be incremented. 269 podControllerError error 270 pendingPods int 271 activePods int 272 readyPods int 273 succeededPods int 274 failedPods int 275 terminatingPods int 276 podsWithIndexes []indexPhase 277 fakeExpectationAtCreation int32 // negative: ExpectDeletions, positive: ExpectCreations 278 279 // expectations 280 expectedCreations int32 281 expectedDeletions int32 282 expectedActive int32 283 expectedReady *int32 284 expectedSucceeded int32 285 expectedCompletedIdxs string 286 expectedFailed int32 287 expectedTerminating *int32 288 expectedCondition *batch.JobConditionType 289 expectedConditionStatus v1.ConditionStatus 290 expectedConditionReason string 291 expectedCreatedIndexes sets.Set[int] 292 expectedPodPatches int 293 294 // features 295 podIndexLabelDisabled bool 296 jobPodReplacementPolicy bool 297 jobPodFailurePolicy bool 298 }{ 299 "job start": { 300 parallelism: 2, 301 completions: 5, 302 backoffLimit: 6, 303 expectedCreations: 2, 304 expectedActive: 2, 305 expectedReady: ptr.To[int32](0), 306 }, 307 "WQ job start": { 308 parallelism: 2, 309 completions: -1, 310 backoffLimit: 6, 311 expectedCreations: 2, 312 expectedActive: 2, 313 expectedReady: ptr.To[int32](0), 314 }, 315 "pending pods": { 316 parallelism: 2, 317 completions: 5, 318 backoffLimit: 6, 319 pendingPods: 2, 320 expectedActive: 2, 321 expectedReady: ptr.To[int32](0), 322 }, 323 "correct # of pods": { 324 parallelism: 3, 325 completions: 5, 326 backoffLimit: 6, 327 activePods: 3, 328 readyPods: 2, 329 expectedActive: 3, 330 expectedReady: ptr.To[int32](2), 331 }, 332 "WQ job: correct # of pods": { 333 parallelism: 2, 334 completions: -1, 335 backoffLimit: 6, 336 activePods: 2, 337 expectedActive: 2, 338 expectedReady: ptr.To[int32](0), 339 }, 340 "too few active pods": { 341 parallelism: 2, 342 completions: 5, 
343 backoffLimit: 6, 344 activePods: 1, 345 succeededPods: 1, 346 expectedCreations: 1, 347 expectedActive: 2, 348 expectedSucceeded: 1, 349 expectedPodPatches: 1, 350 expectedReady: ptr.To[int32](0), 351 }, 352 "WQ job: recreate pods when failed": { 353 parallelism: 1, 354 completions: -1, 355 backoffLimit: 6, 356 activePods: 1, 357 failedPods: 1, 358 podReplacementPolicy: podReplacementPolicy(batch.Failed), 359 jobPodReplacementPolicy: true, 360 terminatingPods: 1, 361 expectedTerminating: ptr.To[int32](1), 362 expectedReady: ptr.To[int32](0), 363 // Removes finalizer and deletes one failed pod 364 expectedPodPatches: 1, 365 expectedFailed: 1, 366 expectedActive: 1, 367 }, 368 "WQ job: turn on PodReplacementPolicy but not set PodReplacementPolicy": { 369 parallelism: 1, 370 completions: 1, 371 backoffLimit: 6, 372 activePods: 1, 373 failedPods: 1, 374 jobPodReplacementPolicy: true, 375 expectedTerminating: ptr.To[int32](1), 376 expectedReady: ptr.To[int32](0), 377 terminatingPods: 1, 378 expectedActive: 1, 379 expectedPodPatches: 2, 380 expectedFailed: 2, 381 }, 382 "WQ job: recreate pods when terminating or failed": { 383 parallelism: 1, 384 completions: -1, 385 backoffLimit: 6, 386 activePods: 1, 387 failedPods: 1, 388 podReplacementPolicy: podReplacementPolicy(batch.TerminatingOrFailed), 389 jobPodReplacementPolicy: true, 390 terminatingPods: 1, 391 expectedTerminating: ptr.To[int32](1), 392 expectedReady: ptr.To[int32](0), 393 expectedActive: 1, 394 expectedPodPatches: 2, 395 expectedFailed: 2, 396 }, 397 "more terminating pods than parallelism": { 398 parallelism: 1, 399 completions: 1, 400 backoffLimit: 6, 401 activePods: 2, 402 failedPods: 0, 403 terminatingPods: 4, 404 podReplacementPolicy: podReplacementPolicy(batch.Failed), 405 jobPodReplacementPolicy: true, 406 expectedTerminating: ptr.To[int32](4), 407 expectedReady: ptr.To[int32](0), 408 expectedActive: 1, 409 expectedDeletions: 1, 410 expectedPodPatches: 1, 411 }, 412 "more terminating pods than parallelism; PodFailurePolicy used": { 413 // Repro for https://github.com/kubernetes/kubernetes/issues/122235 414 parallelism: 1, 415 completions: 1, 416 backoffLimit: 6, 417 activePods: 2, 418 failedPods: 0, 419 terminatingPods: 4, 420 jobPodFailurePolicy: true, 421 podFailurePolicy: &batch.PodFailurePolicy{}, 422 expectedTerminating: nil, 423 expectedReady: ptr.To[int32](0), 424 expectedActive: 1, 425 expectedDeletions: 1, 426 expectedPodPatches: 1, 427 }, 428 "too few active pods and active back-off": { 429 parallelism: 1, 430 completions: 1, 431 backoffLimit: 6, 432 backoffRecord: &backoffRecord{ 433 failuresAfterLastSuccess: 1, 434 lastFailureTime: &referenceTime, 435 }, 436 initialStatus: &jobInitialStatus{ 437 startTime: func() *time.Time { 438 now := time.Now() 439 return &now 440 }(), 441 }, 442 activePods: 0, 443 succeededPods: 0, 444 expectedCreations: 0, 445 expectedActive: 0, 446 expectedSucceeded: 0, 447 expectedPodPatches: 0, 448 expectedReady: ptr.To[int32](0), 449 controllerTime: &referenceTime, 450 }, 451 "too few active pods and no back-offs": { 452 parallelism: 1, 453 completions: 1, 454 backoffLimit: 6, 455 backoffRecord: &backoffRecord{ 456 failuresAfterLastSuccess: 0, 457 lastFailureTime: &referenceTime, 458 }, 459 activePods: 0, 460 succeededPods: 0, 461 expectedCreations: 1, 462 expectedActive: 1, 463 expectedSucceeded: 0, 464 expectedPodPatches: 0, 465 expectedReady: ptr.To[int32](0), 466 controllerTime: &referenceTime, 467 }, 468 "too few active pods with a dynamic job": { 469 parallelism: 2, 470 
completions: -1, 471 backoffLimit: 6, 472 activePods: 1, 473 expectedCreations: 1, 474 expectedActive: 2, 475 expectedReady: ptr.To[int32](0), 476 }, 477 "too few active pods, with controller error": { 478 parallelism: 2, 479 completions: 5, 480 backoffLimit: 6, 481 podControllerError: fmt.Errorf("fake error"), 482 activePods: 1, 483 succeededPods: 1, 484 expectedCreations: 1, 485 expectedActive: 1, 486 expectedSucceeded: 0, 487 expectedPodPatches: 1, 488 expectedReady: ptr.To[int32](0), 489 }, 490 "too many active pods": { 491 parallelism: 2, 492 completions: 5, 493 backoffLimit: 6, 494 activePods: 3, 495 expectedDeletions: 1, 496 expectedActive: 2, 497 expectedPodPatches: 1, 498 expectedReady: ptr.To[int32](0), 499 }, 500 "too many active pods, with controller error": { 501 parallelism: 2, 502 completions: 5, 503 backoffLimit: 6, 504 podControllerError: fmt.Errorf("fake error"), 505 activePods: 3, 506 expectedDeletions: 0, 507 expectedPodPatches: 1, 508 expectedActive: 3, 509 expectedReady: ptr.To[int32](0), 510 }, 511 "failed + succeed pods: reset backoff delay": { 512 parallelism: 2, 513 completions: 5, 514 backoffLimit: 6, 515 activePods: 1, 516 succeededPods: 1, 517 failedPods: 1, 518 expectedCreations: 1, 519 expectedActive: 2, 520 expectedSucceeded: 1, 521 expectedFailed: 1, 522 expectedPodPatches: 2, 523 expectedReady: ptr.To[int32](0), 524 }, 525 "new failed pod": { 526 parallelism: 2, 527 completions: 5, 528 backoffLimit: 6, 529 activePods: 1, 530 failedPods: 1, 531 expectedCreations: 1, 532 expectedActive: 2, 533 expectedFailed: 1, 534 expectedPodPatches: 1, 535 expectedReady: ptr.To[int32](0), 536 }, 537 "no new pod; possible finalizer update of failed pod": { 538 parallelism: 1, 539 completions: 1, 540 backoffLimit: 6, 541 initialStatus: &jobInitialStatus{ 542 active: 1, 543 succeed: 0, 544 failed: 1, 545 }, 546 activePods: 1, 547 failedPods: 0, 548 expectedCreations: 0, 549 expectedActive: 1, 550 expectedFailed: 1, 551 expectedPodPatches: 0, 552 expectedReady: ptr.To[int32](0), 553 }, 554 "only new failed pod with controller error": { 555 parallelism: 2, 556 completions: 5, 557 backoffLimit: 6, 558 podControllerError: fmt.Errorf("fake error"), 559 activePods: 1, 560 failedPods: 1, 561 expectedCreations: 1, 562 expectedActive: 1, 563 expectedFailed: 0, 564 expectedPodPatches: 1, 565 expectedReady: ptr.To[int32](0), 566 }, 567 "job finish": { 568 parallelism: 2, 569 completions: 5, 570 backoffLimit: 6, 571 succeededPods: 5, 572 expectedSucceeded: 5, 573 expectedCondition: &jobConditionComplete, 574 expectedConditionStatus: v1.ConditionTrue, 575 expectedPodPatches: 5, 576 expectedReady: ptr.To[int32](0), 577 }, 578 "WQ job finishing": { 579 parallelism: 2, 580 completions: -1, 581 backoffLimit: 6, 582 activePods: 1, 583 succeededPods: 1, 584 expectedActive: 1, 585 expectedSucceeded: 1, 586 expectedPodPatches: 1, 587 expectedReady: ptr.To[int32](0), 588 }, 589 "WQ job all finished": { 590 parallelism: 2, 591 completions: -1, 592 backoffLimit: 6, 593 succeededPods: 2, 594 expectedSucceeded: 2, 595 expectedCondition: &jobConditionComplete, 596 expectedConditionStatus: v1.ConditionTrue, 597 expectedPodPatches: 2, 598 expectedReady: ptr.To[int32](0), 599 }, 600 "WQ job all finished despite one failure": { 601 parallelism: 2, 602 completions: -1, 603 backoffLimit: 6, 604 succeededPods: 1, 605 failedPods: 1, 606 expectedSucceeded: 1, 607 expectedFailed: 1, 608 expectedCondition: &jobConditionComplete, 609 expectedConditionStatus: v1.ConditionTrue, 610 expectedPodPatches: 2, 611 
expectedReady: ptr.To[int32](0), 612 }, 613 "more active pods than parallelism": { 614 parallelism: 2, 615 completions: 5, 616 backoffLimit: 6, 617 activePods: 10, 618 expectedDeletions: 8, 619 expectedActive: 2, 620 expectedPodPatches: 8, 621 expectedReady: ptr.To[int32](0), 622 }, 623 "more active pods than remaining completions": { 624 parallelism: 3, 625 completions: 4, 626 backoffLimit: 6, 627 activePods: 3, 628 succeededPods: 2, 629 expectedDeletions: 1, 630 expectedActive: 2, 631 expectedSucceeded: 2, 632 expectedPodPatches: 3, 633 expectedReady: ptr.To[int32](0), 634 }, 635 "status change": { 636 parallelism: 2, 637 completions: 5, 638 backoffLimit: 6, 639 activePods: 2, 640 succeededPods: 2, 641 expectedActive: 2, 642 expectedSucceeded: 2, 643 expectedPodPatches: 2, 644 expectedReady: ptr.To[int32](0), 645 }, 646 "deleting job": { 647 parallelism: 2, 648 completions: 5, 649 backoffLimit: 6, 650 deleting: true, 651 pendingPods: 1, 652 activePods: 1, 653 succeededPods: 1, 654 expectedActive: 2, 655 expectedSucceeded: 1, 656 expectedPodPatches: 3, 657 expectedReady: ptr.To[int32](0), 658 }, 659 "limited pods": { 660 parallelism: 100, 661 completions: 200, 662 backoffLimit: 6, 663 podLimit: 10, 664 expectedCreations: 10, 665 expectedActive: 10, 666 expectedReady: ptr.To[int32](0), 667 }, 668 "too many job failures": { 669 parallelism: 2, 670 completions: 5, 671 deleting: true, 672 failedPods: 1, 673 expectedFailed: 1, 674 expectedCondition: &jobConditionFailed, 675 expectedConditionStatus: v1.ConditionTrue, 676 expectedConditionReason: "BackoffLimitExceeded", 677 expectedPodPatches: 1, 678 expectedReady: ptr.To[int32](0), 679 }, 680 "job failures, unsatisfied expectations": { 681 parallelism: 2, 682 completions: 5, 683 deleting: true, 684 failedPods: 1, 685 fakeExpectationAtCreation: 1, 686 expectedFailed: 1, 687 expectedPodPatches: 1, 688 expectedReady: ptr.To[int32](0), 689 }, 690 "indexed job start": { 691 parallelism: 2, 692 completions: 5, 693 backoffLimit: 6, 694 completionMode: batch.IndexedCompletion, 695 expectedCreations: 2, 696 expectedActive: 2, 697 expectedCreatedIndexes: sets.New(0, 1), 698 expectedReady: ptr.To[int32](0), 699 }, 700 "indexed job with some pods deleted, podReplacementPolicy Failed": { 701 parallelism: 2, 702 completions: 5, 703 backoffLimit: 6, 704 completionMode: batch.IndexedCompletion, 705 expectedCreations: 1, 706 expectedActive: 1, 707 expectedCreatedIndexes: sets.New(0), 708 podReplacementPolicy: podReplacementPolicy(batch.Failed), 709 jobPodReplacementPolicy: true, 710 terminatingPods: 1, 711 expectedTerminating: ptr.To[int32](1), 712 expectedReady: ptr.To[int32](0), 713 }, 714 "indexed job with some pods deleted, podReplacementPolicy TerminatingOrFailed": { 715 parallelism: 2, 716 completions: 5, 717 backoffLimit: 6, 718 completionMode: batch.IndexedCompletion, 719 expectedCreations: 2, 720 expectedActive: 2, 721 expectedCreatedIndexes: sets.New(0, 1), 722 podReplacementPolicy: podReplacementPolicy(batch.TerminatingOrFailed), 723 jobPodReplacementPolicy: true, 724 terminatingPods: 1, 725 expectedTerminating: ptr.To[int32](1), 726 expectedReady: ptr.To[int32](0), 727 expectedPodPatches: 1, 728 }, 729 "indexed job completed": { 730 parallelism: 2, 731 completions: 3, 732 backoffLimit: 6, 733 completionMode: batch.IndexedCompletion, 734 podsWithIndexes: []indexPhase{ 735 {"0", v1.PodSucceeded}, 736 {"1", v1.PodFailed}, 737 {"1", v1.PodSucceeded}, 738 {"2", v1.PodSucceeded}, 739 }, 740 expectedSucceeded: 3, 741 expectedFailed: 1, 742 
expectedCompletedIdxs: "0-2", 743 expectedCondition: &jobConditionComplete, 744 expectedConditionStatus: v1.ConditionTrue, 745 expectedPodPatches: 4, 746 expectedReady: ptr.To[int32](0), 747 }, 748 "indexed job repeated completed index": { 749 parallelism: 2, 750 completions: 3, 751 backoffLimit: 6, 752 completionMode: batch.IndexedCompletion, 753 podsWithIndexes: []indexPhase{ 754 {"0", v1.PodSucceeded}, 755 {"1", v1.PodSucceeded}, 756 {"1", v1.PodSucceeded}, 757 }, 758 expectedCreations: 1, 759 expectedActive: 1, 760 expectedSucceeded: 2, 761 expectedCompletedIdxs: "0,1", 762 expectedCreatedIndexes: sets.New(2), 763 expectedPodPatches: 3, 764 expectedReady: ptr.To[int32](0), 765 }, 766 "indexed job some running and completed pods": { 767 parallelism: 8, 768 completions: 20, 769 backoffLimit: 6, 770 completionMode: batch.IndexedCompletion, 771 podsWithIndexes: []indexPhase{ 772 {"0", v1.PodRunning}, 773 {"2", v1.PodSucceeded}, 774 {"3", v1.PodPending}, 775 {"4", v1.PodSucceeded}, 776 {"5", v1.PodSucceeded}, 777 {"7", v1.PodSucceeded}, 778 {"8", v1.PodSucceeded}, 779 {"9", v1.PodSucceeded}, 780 }, 781 expectedCreations: 6, 782 expectedActive: 8, 783 expectedSucceeded: 6, 784 expectedCompletedIdxs: "2,4,5,7-9", 785 expectedCreatedIndexes: sets.New(1, 6, 10, 11, 12, 13), 786 expectedPodPatches: 6, 787 expectedReady: ptr.To[int32](0), 788 }, 789 "indexed job some failed pods": { 790 parallelism: 3, 791 completions: 4, 792 backoffLimit: 6, 793 completionMode: batch.IndexedCompletion, 794 podsWithIndexes: []indexPhase{ 795 {"0", v1.PodFailed}, 796 {"1", v1.PodPending}, 797 {"2", v1.PodFailed}, 798 }, 799 expectedCreations: 2, 800 expectedActive: 3, 801 expectedFailed: 2, 802 expectedCreatedIndexes: sets.New(0, 2), 803 expectedPodPatches: 2, 804 expectedReady: ptr.To[int32](0), 805 }, 806 "indexed job some pods without index": { 807 parallelism: 2, 808 completions: 5, 809 backoffLimit: 6, 810 completionMode: batch.IndexedCompletion, 811 activePods: 1, 812 succeededPods: 1, 813 failedPods: 1, 814 podsWithIndexes: []indexPhase{ 815 {"invalid", v1.PodRunning}, 816 {"invalid", v1.PodSucceeded}, 817 {"invalid", v1.PodFailed}, 818 {"invalid", v1.PodPending}, 819 {"0", v1.PodSucceeded}, 820 {"1", v1.PodRunning}, 821 {"2", v1.PodRunning}, 822 }, 823 expectedDeletions: 3, 824 expectedActive: 2, 825 expectedSucceeded: 1, 826 expectedFailed: 0, 827 expectedCompletedIdxs: "0", 828 expectedPodPatches: 8, 829 expectedReady: ptr.To[int32](0), 830 }, 831 "indexed job repeated indexes": { 832 parallelism: 5, 833 completions: 5, 834 backoffLimit: 6, 835 completionMode: batch.IndexedCompletion, 836 succeededPods: 1, 837 failedPods: 1, 838 podsWithIndexes: []indexPhase{ 839 {"invalid", v1.PodRunning}, 840 {"0", v1.PodSucceeded}, 841 {"1", v1.PodRunning}, 842 {"2", v1.PodRunning}, 843 {"2", v1.PodPending}, 844 }, 845 expectedCreations: 0, 846 expectedDeletions: 2, 847 expectedActive: 2, 848 expectedSucceeded: 1, 849 expectedCompletedIdxs: "0", 850 expectedPodPatches: 5, 851 expectedReady: ptr.To[int32](0), 852 }, 853 "indexed job with indexes outside of range": { 854 parallelism: 2, 855 completions: 5, 856 backoffLimit: 6, 857 completionMode: batch.IndexedCompletion, 858 podsWithIndexes: []indexPhase{ 859 {"0", v1.PodSucceeded}, 860 {"5", v1.PodRunning}, 861 {"6", v1.PodSucceeded}, 862 {"7", v1.PodPending}, 863 {"8", v1.PodFailed}, 864 }, 865 expectedCreations: 0, // only one of creations and deletions can happen in a sync 866 expectedSucceeded: 1, 867 expectedDeletions: 2, 868 expectedCompletedIdxs: "0", 869 
expectedActive: 0, 870 expectedFailed: 0, 871 expectedPodPatches: 5, 872 expectedReady: ptr.To[int32](0), 873 }, 874 "suspending a job with satisfied expectations": { 875 // Suspended Job should delete active pods when expectations are 876 // satisfied. 877 suspend: true, 878 parallelism: 2, 879 activePods: 2, // parallelism == active, expectations satisfied 880 completions: 4, 881 backoffLimit: 6, 882 expectedCreations: 0, 883 expectedDeletions: 2, 884 expectedActive: 0, 885 expectedCondition: &jobConditionSuspended, 886 expectedConditionStatus: v1.ConditionTrue, 887 expectedConditionReason: "JobSuspended", 888 expectedPodPatches: 2, 889 expectedReady: ptr.To[int32](0), 890 }, 891 "suspending a job with unsatisfied expectations": { 892 // Unlike the previous test, we expect the controller to NOT suspend the 893 // Job in the syncJob call because the controller will wait for 894 // expectations to be satisfied first. The next syncJob call (not tested 895 // here) will be the same as the previous test. 896 suspend: true, 897 parallelism: 2, 898 activePods: 3, // active > parallelism, expectations unsatisfied 899 fakeExpectationAtCreation: -1, // the controller is expecting a deletion 900 completions: 4, 901 backoffLimit: 6, 902 expectedCreations: 0, 903 expectedDeletions: 0, 904 expectedActive: 3, 905 expectedReady: ptr.To[int32](0), 906 }, 907 "resuming a suspended job": { 908 wasSuspended: true, 909 suspend: false, 910 parallelism: 2, 911 completions: 4, 912 backoffLimit: 6, 913 expectedCreations: 2, 914 expectedDeletions: 0, 915 expectedActive: 2, 916 expectedCondition: &jobConditionSuspended, 917 expectedConditionStatus: v1.ConditionFalse, 918 expectedConditionReason: "JobResumed", 919 expectedReady: ptr.To[int32](0), 920 }, 921 "suspending a deleted job": { 922 // We would normally expect the active pods to be deleted (see a few test 923 // cases above), but since this job is being deleted, we don't expect 924 // anything changed here from before the job was suspended. The 925 // JobSuspended condition is also missing. 
926 suspend: true, 927 deleting: true, 928 parallelism: 2, 929 activePods: 2, // parallelism == active, expectations satisfied 930 completions: 4, 931 backoffLimit: 6, 932 expectedCreations: 0, 933 expectedDeletions: 0, 934 expectedActive: 2, 935 expectedPodPatches: 2, 936 expectedReady: ptr.To[int32](0), 937 }, 938 "indexed job with podIndexLabel feature disabled": { 939 parallelism: 2, 940 completions: 5, 941 backoffLimit: 6, 942 completionMode: batch.IndexedCompletion, 943 expectedCreations: 2, 944 expectedActive: 2, 945 expectedCreatedIndexes: sets.New(0, 1), 946 podIndexLabelDisabled: true, 947 expectedReady: ptr.To[int32](0), 948 }, 949 } 950 951 for name, tc := range testCases { 952 t.Run(name, func(t *testing.T) { 953 logger, _ := ktesting.NewTestContext(t) 954 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.PodIndexLabel, !tc.podIndexLabelDisabled)() 955 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodReplacementPolicy, tc.jobPodReplacementPolicy)() 956 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, tc.jobPodFailurePolicy)() 957 // job manager setup 958 clientSet := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 959 960 var fakeClock clock.WithTicker 961 if tc.controllerTime != nil { 962 fakeClock = clocktesting.NewFakeClock(*tc.controllerTime) 963 } else { 964 fakeClock = clocktesting.NewFakeClock(time.Now()) 965 } 966 967 manager, sharedInformerFactory := newControllerFromClientWithClock(ctx, t, clientSet, controller.NoResyncPeriodFunc, fakeClock) 968 fakePodControl := controller.FakePodControl{Err: tc.podControllerError, CreateLimit: tc.podLimit} 969 manager.podControl = &fakePodControl 970 manager.podStoreSynced = alwaysReady 971 manager.jobStoreSynced = alwaysReady 972 973 // job & pods setup 974 job := newJob(tc.parallelism, tc.completions, tc.backoffLimit, tc.completionMode) 975 job.Spec.Suspend = ptr.To(tc.suspend) 976 if tc.jobPodReplacementPolicy { 977 job.Spec.PodReplacementPolicy = tc.podReplacementPolicy 978 } 979 if tc.jobPodFailurePolicy { 980 job.Spec.PodFailurePolicy = tc.podFailurePolicy 981 } 982 if tc.initialStatus != nil { 983 startTime := metav1.Now() 984 job.Status.StartTime = &startTime 985 job.Status.Active = int32(tc.initialStatus.active) 986 job.Status.Succeeded = int32(tc.initialStatus.succeed) 987 job.Status.Failed = int32(tc.initialStatus.failed) 988 if tc.initialStatus.startTime != nil { 989 startTime := metav1.NewTime(*tc.initialStatus.startTime) 990 job.Status.StartTime = &startTime 991 } 992 } 993 994 key, err := controller.KeyFunc(job) 995 if err != nil { 996 t.Errorf("Unexpected error getting job key: %v", err) 997 } 998 999 if tc.backoffRecord != nil { 1000 tc.backoffRecord.key = key 1001 manager.podBackoffStore.updateBackoffRecord(*tc.backoffRecord) 1002 } 1003 if tc.fakeExpectationAtCreation < 0 { 1004 manager.expectations.ExpectDeletions(logger, key, int(-tc.fakeExpectationAtCreation)) 1005 } else if tc.fakeExpectationAtCreation > 0 { 1006 manager.expectations.ExpectCreations(logger, key, int(tc.fakeExpectationAtCreation)) 1007 } 1008 if tc.wasSuspended { 1009 job.Status.Conditions = append(job.Status.Conditions, *newCondition(batch.JobSuspended, v1.ConditionTrue, "JobSuspended", "Job suspended", realClock.Now())) 1010 } 1011 if tc.deleting { 1012 now := metav1.Now() 1013 
job.DeletionTimestamp = &now 1014 } 1015 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 1016 podIndexer := sharedInformerFactory.Core().V1().Pods().Informer().GetIndexer() 1017 setPodsStatuses(podIndexer, job, tc.pendingPods, tc.activePods, tc.succeededPods, tc.failedPods, tc.terminatingPods, tc.readyPods) 1018 setPodsStatusesWithIndexes(podIndexer, job, tc.podsWithIndexes) 1019 1020 actual := job 1021 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 1022 actual = job 1023 return job, nil 1024 } 1025 1026 // run 1027 err = manager.syncJob(context.TODO(), testutil.GetKey(job, t)) 1028 1029 // We need requeue syncJob task if podController error 1030 if tc.podControllerError != nil { 1031 if err == nil { 1032 t.Error("Syncing jobs expected to return error on podControl exception") 1033 } 1034 } else if tc.podLimit != 0 && fakePodControl.CreateCallCount > tc.podLimit { 1035 if err == nil { 1036 t.Error("Syncing jobs expected to return error when reached the podControl limit") 1037 } 1038 } else if err != nil { 1039 t.Errorf("Unexpected error when syncing jobs: %v", err) 1040 } 1041 // validate created/deleted pods 1042 if int32(len(fakePodControl.Templates)) != tc.expectedCreations { 1043 t.Errorf("Unexpected number of creates. Expected %d, saw %d\n", tc.expectedCreations, len(fakePodControl.Templates)) 1044 } 1045 if tc.completionMode == batch.IndexedCompletion { 1046 checkIndexedJobPods(t, &fakePodControl, tc.expectedCreatedIndexes, job.Name, tc.podIndexLabelDisabled) 1047 } else { 1048 for _, p := range fakePodControl.Templates { 1049 // Fake pod control doesn't add generate name from the owner reference. 1050 if p.GenerateName != "" { 1051 t.Errorf("Got pod generate name %s, want %s", p.GenerateName, "") 1052 } 1053 if p.Spec.Hostname != "" { 1054 t.Errorf("Got pod hostname %q, want none", p.Spec.Hostname) 1055 } 1056 } 1057 } 1058 if int32(len(fakePodControl.DeletePodName)) != tc.expectedDeletions { 1059 t.Errorf("Unexpected number of deletes. Expected %d, saw %d\n", tc.expectedDeletions, len(fakePodControl.DeletePodName)) 1060 } 1061 // Each create should have an accompanying ControllerRef. 1062 if len(fakePodControl.ControllerRefs) != int(tc.expectedCreations) { 1063 t.Errorf("Unexpected number of ControllerRefs. Expected %d, saw %d\n", tc.expectedCreations, len(fakePodControl.ControllerRefs)) 1064 } 1065 // Make sure the ControllerRefs are correct. 1066 for _, controllerRef := range fakePodControl.ControllerRefs { 1067 if got, want := controllerRef.APIVersion, "batch/v1"; got != want { 1068 t.Errorf("controllerRef.APIVersion = %q, want %q", got, want) 1069 } 1070 if got, want := controllerRef.Kind, "Job"; got != want { 1071 t.Errorf("controllerRef.Kind = %q, want %q", got, want) 1072 } 1073 if got, want := controllerRef.Name, job.Name; got != want { 1074 t.Errorf("controllerRef.Name = %q, want %q", got, want) 1075 } 1076 if got, want := controllerRef.UID, job.UID; got != want { 1077 t.Errorf("controllerRef.UID = %q, want %q", got, want) 1078 } 1079 if controllerRef.Controller == nil || *controllerRef.Controller != true { 1080 t.Errorf("controllerRef.Controller is not set to true") 1081 } 1082 } 1083 // validate status 1084 if actual.Status.Active != tc.expectedActive { 1085 t.Errorf("Unexpected number of active pods. 
Expected %d, saw %d\n", tc.expectedActive, actual.Status.Active) 1086 } 1087 if diff := cmp.Diff(tc.expectedReady, actual.Status.Ready); diff != "" { 1088 t.Errorf("Unexpected number of ready pods (-want,+got): %s", diff) 1089 } 1090 if actual.Status.Succeeded != tc.expectedSucceeded { 1091 t.Errorf("Unexpected number of succeeded pods. Expected %d, saw %d\n", tc.expectedSucceeded, actual.Status.Succeeded) 1092 } 1093 if diff := cmp.Diff(tc.expectedCompletedIdxs, actual.Status.CompletedIndexes); diff != "" { 1094 t.Errorf("Unexpected completed indexes (-want,+got):\n%s", diff) 1095 } 1096 if actual.Status.Failed != tc.expectedFailed { 1097 t.Errorf("Unexpected number of failed pods. Expected %d, saw %d\n", tc.expectedFailed, actual.Status.Failed) 1098 } 1099 if diff := cmp.Diff(tc.expectedTerminating, actual.Status.Terminating); diff != "" { 1100 t.Errorf("Unexpected number of terminating pods (-want,+got): %s", diff) 1101 } 1102 if actual.Status.StartTime != nil && tc.suspend { 1103 t.Error("Unexpected .status.startTime not nil when suspend is true") 1104 } 1105 if actual.Status.StartTime == nil && !tc.suspend { 1106 t.Error("Missing .status.startTime") 1107 } 1108 // validate conditions 1109 if tc.expectedCondition != nil { 1110 if !getCondition(actual, *tc.expectedCondition, tc.expectedConditionStatus, tc.expectedConditionReason) { 1111 t.Errorf("Expected completion condition. Got %#v", actual.Status.Conditions) 1112 } 1113 } else { 1114 if cond := hasTrueCondition(actual); cond != nil { 1115 t.Errorf("Got condition %s, want none", *cond) 1116 } 1117 } 1118 if tc.expectedCondition == nil && tc.suspend && len(actual.Status.Conditions) != 0 { 1119 t.Errorf("Unexpected conditions %v", actual.Status.Conditions) 1120 } 1121 // validate slow start 1122 expectedLimit := 0 1123 for pass := uint8(0); expectedLimit <= tc.podLimit; pass++ { 1124 expectedLimit += controller.SlowStartInitialBatchSize << pass 1125 } 1126 if tc.podLimit > 0 && fakePodControl.CreateCallCount > expectedLimit { 1127 t.Errorf("Unexpected number of create calls. 
Expected <= %d, saw %d\n", fakePodControl.CreateLimit*2, fakePodControl.CreateCallCount) 1128 } 1129 if p := len(fakePodControl.Patches); p != tc.expectedPodPatches { 1130 t.Errorf("Got %d pod patches, want %d", p, tc.expectedPodPatches) 1131 } 1132 }) 1133 } 1134 } 1135 1136 func checkIndexedJobPods(t *testing.T, control *controller.FakePodControl, wantIndexes sets.Set[int], jobName string, podIndexLabelDisabled bool) { 1137 t.Helper() 1138 gotIndexes := sets.New[int]() 1139 for _, p := range control.Templates { 1140 checkJobCompletionEnvVariable(t, &p.Spec, podIndexLabelDisabled) 1141 if !podIndexLabelDisabled { 1142 checkJobCompletionLabel(t, &p) 1143 } 1144 ix := getCompletionIndex(p.Annotations) 1145 if ix == -1 { 1146 t.Errorf("Created pod %s didn't have completion index", p.Name) 1147 } else { 1148 gotIndexes.Insert(ix) 1149 } 1150 expectedName := fmt.Sprintf("%s-%d", jobName, ix) 1151 if expectedName != p.Spec.Hostname { 1152 t.Errorf("Got pod hostname %s, want %s", p.Spec.Hostname, expectedName) 1153 } 1154 expectedName += "-" 1155 if expectedName != p.GenerateName { 1156 t.Errorf("Got pod generate name %s, want %s", p.GenerateName, expectedName) 1157 } 1158 } 1159 if diff := cmp.Diff(sets.List(wantIndexes), sets.List(gotIndexes)); diff != "" { 1160 t.Errorf("Unexpected created completion indexes (-want,+got):\n%s", diff) 1161 } 1162 } 1163 1164 func TestGetNewFinshedPods(t *testing.T) { 1165 cases := map[string]struct { 1166 job batch.Job 1167 pods []*v1.Pod 1168 expectedRmFinalizers sets.Set[string] 1169 wantSucceeded int32 1170 wantFailed int32 1171 }{ 1172 "some counted": { 1173 job: batch.Job{ 1174 Status: batch.JobStatus{ 1175 Succeeded: 2, 1176 Failed: 1, 1177 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1178 }, 1179 }, 1180 pods: []*v1.Pod{ 1181 buildPod().uid("a").phase(v1.PodSucceeded).Pod, 1182 buildPod().uid("b").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1183 buildPod().uid("c").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1184 buildPod().uid("d").phase(v1.PodFailed).Pod, 1185 buildPod().uid("e").phase(v1.PodFailed).trackingFinalizer().Pod, 1186 buildPod().uid("f").phase(v1.PodRunning).Pod, 1187 }, 1188 wantSucceeded: 4, 1189 wantFailed: 2, 1190 }, 1191 "some uncounted": { 1192 job: batch.Job{ 1193 Status: batch.JobStatus{ 1194 Succeeded: 1, 1195 Failed: 1, 1196 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1197 Succeeded: []types.UID{"a", "c"}, 1198 Failed: []types.UID{"e", "f"}, 1199 }, 1200 }, 1201 }, 1202 pods: []*v1.Pod{ 1203 buildPod().uid("a").phase(v1.PodSucceeded).Pod, 1204 buildPod().uid("b").phase(v1.PodSucceeded).Pod, 1205 buildPod().uid("c").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1206 buildPod().uid("d").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1207 buildPod().uid("e").phase(v1.PodFailed).Pod, 1208 buildPod().uid("f").phase(v1.PodFailed).trackingFinalizer().Pod, 1209 buildPod().uid("g").phase(v1.PodFailed).trackingFinalizer().Pod, 1210 }, 1211 wantSucceeded: 4, 1212 wantFailed: 4, 1213 }, 1214 "with expected removed finalizers": { 1215 job: batch.Job{ 1216 Status: batch.JobStatus{ 1217 Succeeded: 2, 1218 Failed: 2, 1219 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1220 Succeeded: []types.UID{"a"}, 1221 Failed: []types.UID{"d"}, 1222 }, 1223 }, 1224 }, 1225 expectedRmFinalizers: sets.New("b", "f"), 1226 pods: []*v1.Pod{ 1227 buildPod().uid("a").phase(v1.PodSucceeded).Pod, 1228 buildPod().uid("b").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1229 
				buildPod().uid("c").phase(v1.PodSucceeded).trackingFinalizer().Pod,
				buildPod().uid("d").phase(v1.PodFailed).Pod,
				buildPod().uid("e").phase(v1.PodFailed).trackingFinalizer().Pod,
				buildPod().uid("f").phase(v1.PodFailed).trackingFinalizer().Pod,
				buildPod().uid("g").phase(v1.PodFailed).trackingFinalizer().Pod,
			},
			wantSucceeded: 4,
			wantFailed:    5,
		},
		"deleted pods": {
			job: batch.Job{
				Status: batch.JobStatus{
					Succeeded:               1,
					Failed:                  1,
					UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
				},
			},
			pods: []*v1.Pod{
				buildPod().uid("a").phase(v1.PodSucceeded).trackingFinalizer().deletionTimestamp().Pod,
				buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().deletionTimestamp().Pod,
				buildPod().uid("c").phase(v1.PodRunning).trackingFinalizer().deletionTimestamp().Pod,
				buildPod().uid("d").phase(v1.PodPending).trackingFinalizer().deletionTimestamp().Pod,
				buildPod().uid("e").phase(v1.PodRunning).deletionTimestamp().Pod,
				buildPod().uid("f").phase(v1.PodPending).deletionTimestamp().Pod,
			},
			wantSucceeded: 2,
			wantFailed:    4,
		},
	}
	for name, tc := range cases {
		t.Run(name, func(t *testing.T) {
			uncounted := newUncountedTerminatedPods(*tc.job.Status.UncountedTerminatedPods)
			jobCtx := &syncJobCtx{job: &tc.job, pods: tc.pods, uncounted: uncounted, expectedRmFinalizers: tc.expectedRmFinalizers}
			succeededPods, failedPods := getNewFinishedPods(jobCtx)
			succeeded := int32(len(succeededPods)) + tc.job.Status.Succeeded + int32(len(uncounted.succeeded))
			failed := int32(len(failedPods)) + tc.job.Status.Failed + int32(len(uncounted.failed))
			if succeeded != tc.wantSucceeded {
				t.Errorf("getStatus reports %d succeeded pods, want %d", succeeded, tc.wantSucceeded)
			}
			if failed != tc.wantFailed {
				t.Errorf("getStatus reports %d failed pods, want %d", failed, tc.wantFailed)
			}
		})
	}
}

func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
	logger, ctx := ktesting.NewTestContext(t)
	succeededCond := newCondition(batch.JobComplete, v1.ConditionTrue, "", "", realClock.Now())
	failedCond := newCondition(batch.JobFailed, v1.ConditionTrue, "", "", realClock.Now())
	indexedCompletion := batch.IndexedCompletion
	mockErr := errors.New("mock error")
	cases := map[string]struct {
		job                     batch.Job
		pods                    []*v1.Pod
		finishedCond            *batch.JobCondition
		expectedRmFinalizers    sets.Set[string]
		needsFlush              bool
		statusUpdateErr         error
		podControlErr           error
		wantErr                 error
		wantRmFinalizers        int
		wantStatusUpdates       []batch.JobStatus
		wantSucceededPodsMetric int
		wantFailedPodsMetric    int

		// features
		enableJobBackoffLimitPerIndex bool
	}{
		"no updates": {},
		"new active": {
			job: batch.Job{
				Status: batch.JobStatus{
					Active: 1,
				},
			},
			needsFlush: true,
			wantStatusUpdates: []batch.JobStatus{
				{
					UncountedTerminatedPods: &batch.UncountedTerminatedPods{},
					Active:                  1,
				},
			},
		},
		"track finished pods": {
			pods: []*v1.Pod{
				buildPod().uid("a").phase(v1.PodSucceeded).trackingFinalizer().Pod,
				buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().Pod,
				buildPod().uid("c").phase(v1.PodSucceeded).trackingFinalizer().deletionTimestamp().Pod,
				buildPod().uid("d").phase(v1.PodFailed).trackingFinalizer().deletionTimestamp().Pod,
buildPod().uid("e").phase(v1.PodPending).trackingFinalizer().deletionTimestamp().Pod, 1320 buildPod().phase(v1.PodPending).trackingFinalizer().Pod, 1321 buildPod().phase(v1.PodRunning).trackingFinalizer().Pod, 1322 }, 1323 wantRmFinalizers: 5, 1324 wantStatusUpdates: []batch.JobStatus{ 1325 { 1326 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1327 Succeeded: []types.UID{"a", "c"}, 1328 Failed: []types.UID{"b", "d", "e"}, 1329 }, 1330 }, 1331 { 1332 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1333 Succeeded: 2, 1334 Failed: 3, 1335 }, 1336 }, 1337 wantSucceededPodsMetric: 2, 1338 wantFailedPodsMetric: 3, 1339 }, 1340 "past and new finished pods": { 1341 job: batch.Job{ 1342 Status: batch.JobStatus{ 1343 Active: 1, 1344 Succeeded: 2, 1345 Failed: 3, 1346 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1347 Succeeded: []types.UID{"a", "e"}, 1348 Failed: []types.UID{"b", "f"}, 1349 }, 1350 }, 1351 }, 1352 pods: []*v1.Pod{ 1353 buildPod().uid("e").phase(v1.PodSucceeded).Pod, 1354 buildPod().phase(v1.PodFailed).Pod, 1355 buildPod().phase(v1.PodPending).Pod, 1356 buildPod().uid("a").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1357 buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().Pod, 1358 buildPod().uid("c").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1359 buildPod().uid("d").phase(v1.PodFailed).trackingFinalizer().Pod, 1360 }, 1361 wantRmFinalizers: 4, 1362 wantStatusUpdates: []batch.JobStatus{ 1363 { 1364 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1365 Succeeded: []types.UID{"a", "c"}, 1366 Failed: []types.UID{"b", "d"}, 1367 }, 1368 Active: 1, 1369 Succeeded: 3, 1370 Failed: 4, 1371 }, 1372 { 1373 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1374 Active: 1, 1375 Succeeded: 5, 1376 Failed: 6, 1377 }, 1378 }, 1379 wantSucceededPodsMetric: 3, 1380 wantFailedPodsMetric: 3, 1381 }, 1382 "expecting removed finalizers": { 1383 job: batch.Job{ 1384 Status: batch.JobStatus{ 1385 Succeeded: 2, 1386 Failed: 3, 1387 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1388 Succeeded: []types.UID{"a", "g"}, 1389 Failed: []types.UID{"b", "h"}, 1390 }, 1391 }, 1392 }, 1393 expectedRmFinalizers: sets.New("c", "d", "g", "h"), 1394 pods: []*v1.Pod{ 1395 buildPod().uid("a").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1396 buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().Pod, 1397 buildPod().uid("c").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1398 buildPod().uid("d").phase(v1.PodFailed).trackingFinalizer().Pod, 1399 buildPod().uid("e").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1400 buildPod().uid("f").phase(v1.PodFailed).trackingFinalizer().Pod, 1401 buildPod().uid("g").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1402 buildPod().uid("h").phase(v1.PodFailed).trackingFinalizer().Pod, 1403 }, 1404 wantRmFinalizers: 4, 1405 wantStatusUpdates: []batch.JobStatus{ 1406 { 1407 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1408 Succeeded: []types.UID{"a", "e"}, 1409 Failed: []types.UID{"b", "f"}, 1410 }, 1411 Succeeded: 3, 1412 Failed: 4, 1413 }, 1414 { 1415 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1416 Succeeded: 5, 1417 Failed: 6, 1418 }, 1419 }, 1420 wantSucceededPodsMetric: 3, 1421 wantFailedPodsMetric: 3, 1422 }, 1423 "succeeding job": { 1424 pods: []*v1.Pod{ 1425 buildPod().uid("a").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1426 buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().Pod, 1427 }, 1428 finishedCond: succeededCond, 1429 wantRmFinalizers: 2, 1430 
wantStatusUpdates: []batch.JobStatus{ 1431 { 1432 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1433 Succeeded: []types.UID{"a"}, 1434 Failed: []types.UID{"b"}, 1435 }, 1436 }, 1437 { 1438 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1439 Succeeded: 1, 1440 Failed: 1, 1441 Conditions: []batch.JobCondition{*succeededCond}, 1442 CompletionTime: &succeededCond.LastTransitionTime, 1443 }, 1444 }, 1445 wantSucceededPodsMetric: 1, 1446 wantFailedPodsMetric: 1, 1447 }, 1448 "failing job": { 1449 pods: []*v1.Pod{ 1450 buildPod().uid("a").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1451 buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().Pod, 1452 buildPod().uid("c").phase(v1.PodRunning).trackingFinalizer().Pod, 1453 }, 1454 finishedCond: failedCond, 1455 // Running pod counts as failed. 1456 wantRmFinalizers: 3, 1457 wantStatusUpdates: []batch.JobStatus{ 1458 { 1459 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1460 Succeeded: []types.UID{"a"}, 1461 Failed: []types.UID{"b", "c"}, 1462 }, 1463 }, 1464 { 1465 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1466 Succeeded: 1, 1467 Failed: 2, 1468 Conditions: []batch.JobCondition{*failedCond}, 1469 }, 1470 }, 1471 wantSucceededPodsMetric: 1, 1472 wantFailedPodsMetric: 2, 1473 }, 1474 "deleted job": { 1475 job: batch.Job{ 1476 ObjectMeta: metav1.ObjectMeta{ 1477 DeletionTimestamp: &metav1.Time{}, 1478 }, 1479 Status: batch.JobStatus{ 1480 Active: 1, 1481 }, 1482 }, 1483 pods: []*v1.Pod{ 1484 buildPod().uid("a").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1485 buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().Pod, 1486 buildPod().phase(v1.PodRunning).trackingFinalizer().Pod, 1487 }, 1488 // Removing finalizer from Running pod, but doesn't count as failed. 
1489 wantRmFinalizers: 3, 1490 wantStatusUpdates: []batch.JobStatus{ 1491 { 1492 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1493 Succeeded: []types.UID{"a"}, 1494 Failed: []types.UID{"b"}, 1495 }, 1496 Active: 1, 1497 }, 1498 { 1499 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1500 Active: 1, 1501 Succeeded: 1, 1502 Failed: 1, 1503 }, 1504 }, 1505 wantSucceededPodsMetric: 1, 1506 wantFailedPodsMetric: 1, 1507 }, 1508 "status update error": { 1509 pods: []*v1.Pod{ 1510 buildPod().uid("a").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1511 buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().Pod, 1512 }, 1513 statusUpdateErr: mockErr, 1514 wantErr: mockErr, 1515 wantStatusUpdates: []batch.JobStatus{ 1516 { 1517 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1518 Succeeded: []types.UID{"a"}, 1519 Failed: []types.UID{"b"}, 1520 }, 1521 }, 1522 }, 1523 }, 1524 "pod patch errors": { 1525 pods: []*v1.Pod{ 1526 buildPod().uid("a").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1527 buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().Pod, 1528 }, 1529 podControlErr: mockErr, 1530 wantErr: mockErr, 1531 wantRmFinalizers: 2, 1532 wantStatusUpdates: []batch.JobStatus{ 1533 { 1534 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1535 Succeeded: []types.UID{"a"}, 1536 Failed: []types.UID{"b"}, 1537 }, 1538 }, 1539 }, 1540 }, 1541 "pod patch errors with partial success": { 1542 job: batch.Job{ 1543 Status: batch.JobStatus{ 1544 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1545 Succeeded: []types.UID{"a"}, 1546 Failed: []types.UID{"b"}, 1547 }, 1548 }, 1549 }, 1550 pods: []*v1.Pod{ 1551 buildPod().uid("a").phase(v1.PodSucceeded).Pod, 1552 buildPod().uid("c").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1553 buildPod().uid("d").phase(v1.PodFailed).trackingFinalizer().Pod, 1554 }, 1555 podControlErr: mockErr, 1556 wantErr: mockErr, 1557 wantRmFinalizers: 2, 1558 wantStatusUpdates: []batch.JobStatus{ 1559 { 1560 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1561 Succeeded: []types.UID{"c"}, 1562 Failed: []types.UID{"d"}, 1563 }, 1564 Succeeded: 1, 1565 Failed: 1, 1566 }, 1567 }, 1568 }, 1569 "indexed job new successful pods": { 1570 job: batch.Job{ 1571 Spec: batch.JobSpec{ 1572 CompletionMode: &indexedCompletion, 1573 Completions: ptr.To[int32](6), 1574 }, 1575 Status: batch.JobStatus{ 1576 Active: 1, 1577 }, 1578 }, 1579 pods: []*v1.Pod{ 1580 buildPod().phase(v1.PodSucceeded).trackingFinalizer().index("1").Pod, 1581 buildPod().phase(v1.PodSucceeded).trackingFinalizer().index("3").Pod, 1582 buildPod().phase(v1.PodSucceeded).trackingFinalizer().index("3").Pod, 1583 buildPod().phase(v1.PodRunning).trackingFinalizer().index("5").Pod, 1584 buildPod().phase(v1.PodSucceeded).trackingFinalizer().Pod, 1585 }, 1586 wantRmFinalizers: 4, 1587 wantStatusUpdates: []batch.JobStatus{ 1588 { 1589 Active: 1, 1590 Succeeded: 2, 1591 CompletedIndexes: "1,3", 1592 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1593 }, 1594 }, 1595 wantSucceededPodsMetric: 2, 1596 }, 1597 "indexed job prev successful pods outside current completions index range with no new succeeded pods": { 1598 job: batch.Job{ 1599 Spec: batch.JobSpec{ 1600 CompletionMode: &indexedCompletion, 1601 Completions: ptr.To[int32](2), 1602 Parallelism: ptr.To[int32](2), 1603 }, 1604 Status: batch.JobStatus{ 1605 Active: 2, 1606 Succeeded: 1, 1607 CompletedIndexes: "3", 1608 }, 1609 }, 1610 pods: []*v1.Pod{ 1611 
buildPod().phase(v1.PodRunning).trackingFinalizer().index("0").Pod, 1612 buildPod().phase(v1.PodRunning).trackingFinalizer().index("1").Pod, 1613 }, 1614 wantRmFinalizers: 0, 1615 wantStatusUpdates: []batch.JobStatus{ 1616 { 1617 Active: 2, 1618 Succeeded: 0, 1619 CompletedIndexes: "", 1620 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1621 }, 1622 }, 1623 }, 1624 "indexed job prev successful pods outside current completions index range with new succeeded pods in range": { 1625 job: batch.Job{ 1626 Spec: batch.JobSpec{ 1627 CompletionMode: &indexedCompletion, 1628 Completions: ptr.To[int32](2), 1629 Parallelism: ptr.To[int32](2), 1630 }, 1631 Status: batch.JobStatus{ 1632 Active: 2, 1633 Succeeded: 1, 1634 CompletedIndexes: "3", 1635 }, 1636 }, 1637 pods: []*v1.Pod{ 1638 buildPod().phase(v1.PodRunning).trackingFinalizer().index("0").Pod, 1639 buildPod().phase(v1.PodSucceeded).trackingFinalizer().index("1").Pod, 1640 }, 1641 wantRmFinalizers: 1, 1642 wantStatusUpdates: []batch.JobStatus{ 1643 { 1644 Active: 2, 1645 Succeeded: 1, 1646 CompletedIndexes: "1", 1647 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1648 }, 1649 }, 1650 wantSucceededPodsMetric: 1, 1651 }, 1652 "indexed job new failed pods": { 1653 job: batch.Job{ 1654 Spec: batch.JobSpec{ 1655 CompletionMode: &indexedCompletion, 1656 Completions: ptr.To[int32](6), 1657 }, 1658 Status: batch.JobStatus{ 1659 Active: 1, 1660 }, 1661 }, 1662 pods: []*v1.Pod{ 1663 buildPod().uid("a").phase(v1.PodFailed).trackingFinalizer().index("1").Pod, 1664 buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().index("3").Pod, 1665 buildPod().uid("c").phase(v1.PodFailed).trackingFinalizer().index("3").Pod, 1666 buildPod().uid("d").phase(v1.PodRunning).trackingFinalizer().index("5").Pod, 1667 buildPod().phase(v1.PodFailed).trackingFinalizer().Pod, 1668 }, 1669 wantRmFinalizers: 4, 1670 wantStatusUpdates: []batch.JobStatus{ 1671 { 1672 Active: 1, 1673 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1674 Failed: []types.UID{"a", "b", "c"}, 1675 }, 1676 }, 1677 { 1678 Active: 1, 1679 Failed: 3, 1680 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1681 }, 1682 }, 1683 wantFailedPodsMetric: 3, 1684 }, 1685 "indexed job past and new pods": { 1686 job: batch.Job{ 1687 Spec: batch.JobSpec{ 1688 CompletionMode: &indexedCompletion, 1689 Completions: ptr.To[int32](7), 1690 }, 1691 Status: batch.JobStatus{ 1692 Failed: 2, 1693 Succeeded: 5, 1694 CompletedIndexes: "0-2,4,6,7", 1695 }, 1696 }, 1697 pods: []*v1.Pod{ 1698 buildPod().phase(v1.PodSucceeded).index("0").Pod, 1699 buildPod().phase(v1.PodFailed).index("1").Pod, 1700 buildPod().phase(v1.PodSucceeded).trackingFinalizer().index("1").Pod, 1701 buildPod().phase(v1.PodSucceeded).trackingFinalizer().index("3").Pod, 1702 buildPod().uid("a").phase(v1.PodFailed).trackingFinalizer().index("2").Pod, 1703 buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().index("5").Pod, 1704 }, 1705 wantRmFinalizers: 4, 1706 wantStatusUpdates: []batch.JobStatus{ 1707 { 1708 Succeeded: 6, 1709 Failed: 2, 1710 CompletedIndexes: "0-4,6", 1711 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1712 Failed: []types.UID{"a", "b"}, 1713 }, 1714 }, 1715 { 1716 Succeeded: 6, 1717 Failed: 4, 1718 CompletedIndexes: "0-4,6", 1719 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1720 }, 1721 }, 1722 wantSucceededPodsMetric: 1, 1723 wantFailedPodsMetric: 2, 1724 }, 1725 "too many finished": { 1726 job: batch.Job{ 1727 Status: batch.JobStatus{ 1728 UncountedTerminatedPods: 
&batch.UncountedTerminatedPods{ 1729 Failed: []types.UID{"a", "b"}, 1730 }, 1731 }, 1732 }, 1733 pods: func() []*v1.Pod { 1734 pods := make([]*v1.Pod, 500) 1735 for i := range pods { 1736 pods[i] = buildPod().uid(strconv.Itoa(i)).phase(v1.PodSucceeded).trackingFinalizer().Pod 1737 } 1738 pods = append(pods, buildPod().uid("b").phase(v1.PodFailed).trackingFinalizer().Pod) 1739 return pods 1740 }(), 1741 wantRmFinalizers: 499, 1742 wantStatusUpdates: []batch.JobStatus{ 1743 { 1744 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1745 Succeeded: func() []types.UID { 1746 uids := make([]types.UID, 499) 1747 for i := range uids { 1748 uids[i] = types.UID(strconv.Itoa(i)) 1749 } 1750 return uids 1751 }(), 1752 Failed: []types.UID{"b"}, 1753 }, 1754 Failed: 1, 1755 }, 1756 { 1757 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1758 Failed: []types.UID{"b"}, 1759 }, 1760 Succeeded: 499, 1761 Failed: 1, 1762 }, 1763 }, 1764 wantSucceededPodsMetric: 499, 1765 wantFailedPodsMetric: 1, 1766 }, 1767 "too many indexed finished": { 1768 job: batch.Job{ 1769 Spec: batch.JobSpec{ 1770 CompletionMode: &indexedCompletion, 1771 Completions: ptr.To[int32](501), 1772 }, 1773 }, 1774 pods: func() []*v1.Pod { 1775 pods := make([]*v1.Pod, 501) 1776 for i := range pods { 1777 pods[i] = buildPod().uid(strconv.Itoa(i)).index(strconv.Itoa(i)).phase(v1.PodSucceeded).trackingFinalizer().Pod 1778 } 1779 return pods 1780 }(), 1781 wantRmFinalizers: 500, 1782 wantStatusUpdates: []batch.JobStatus{ 1783 { 1784 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1785 CompletedIndexes: "0-499", 1786 Succeeded: 500, 1787 }, 1788 }, 1789 wantSucceededPodsMetric: 500, 1790 }, 1791 "pod flips from failed to succeeded": { 1792 job: batch.Job{ 1793 Spec: batch.JobSpec{ 1794 Completions: ptr.To[int32](2), 1795 Parallelism: ptr.To[int32](2), 1796 }, 1797 Status: batch.JobStatus{ 1798 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1799 Failed: []types.UID{"a", "b"}, 1800 }, 1801 }, 1802 }, 1803 pods: []*v1.Pod{ 1804 buildPod().uid("a").phase(v1.PodFailed).trackingFinalizer().Pod, 1805 buildPod().uid("b").phase(v1.PodSucceeded).trackingFinalizer().Pod, 1806 }, 1807 finishedCond: failedCond, 1808 wantRmFinalizers: 2, 1809 wantStatusUpdates: []batch.JobStatus{ 1810 { 1811 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1812 Failed: 2, 1813 Conditions: []batch.JobCondition{*failedCond}, 1814 }, 1815 }, 1816 wantFailedPodsMetric: 2, 1817 }, 1818 "indexed job with a failed pod with delayed finalizer removal; the pod is not counted": { 1819 enableJobBackoffLimitPerIndex: true, 1820 job: batch.Job{ 1821 Spec: batch.JobSpec{ 1822 CompletionMode: &indexedCompletion, 1823 Completions: ptr.To[int32](6), 1824 BackoffLimitPerIndex: ptr.To[int32](1), 1825 }, 1826 }, 1827 pods: []*v1.Pod{ 1828 buildPod().uid("a").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().index("1").Pod, 1829 }, 1830 wantStatusUpdates: []batch.JobStatus{ 1831 { 1832 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1833 FailedIndexes: ptr.To(""), 1834 }, 1835 }, 1836 }, 1837 "indexed job with a failed pod which is recreated by a running pod; the pod is counted": { 1838 enableJobBackoffLimitPerIndex: true, 1839 job: batch.Job{ 1840 Spec: batch.JobSpec{ 1841 CompletionMode: &indexedCompletion, 1842 Completions: ptr.To[int32](6), 1843 BackoffLimitPerIndex: ptr.To[int32](1), 1844 }, 1845 Status: batch.JobStatus{ 1846 Active: 1, 1847 }, 1848 }, 1849 pods: []*v1.Pod{ 1850 
buildPod().uid("a1").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().index("1").Pod, 1851 buildPod().uid("a2").phase(v1.PodRunning).indexFailureCount("1").trackingFinalizer().index("1").Pod, 1852 }, 1853 wantRmFinalizers: 1, 1854 wantStatusUpdates: []batch.JobStatus{ 1855 { 1856 Active: 1, 1857 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1858 Failed: []types.UID{"a1"}, 1859 }, 1860 FailedIndexes: ptr.To(""), 1861 }, 1862 { 1863 Active: 1, 1864 Failed: 1, 1865 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1866 FailedIndexes: ptr.To(""), 1867 }, 1868 }, 1869 wantFailedPodsMetric: 1, 1870 }, 1871 "indexed job with a failed pod for a failed index; the pod is counted": { 1872 enableJobBackoffLimitPerIndex: true, 1873 job: batch.Job{ 1874 Spec: batch.JobSpec{ 1875 CompletionMode: &indexedCompletion, 1876 Completions: ptr.To[int32](6), 1877 BackoffLimitPerIndex: ptr.To[int32](1), 1878 }, 1879 }, 1880 pods: []*v1.Pod{ 1881 buildPod().uid("a").phase(v1.PodFailed).indexFailureCount("1").trackingFinalizer().index("1").Pod, 1882 }, 1883 wantRmFinalizers: 1, 1884 wantStatusUpdates: []batch.JobStatus{ 1885 { 1886 FailedIndexes: ptr.To("1"), 1887 UncountedTerminatedPods: &batch.UncountedTerminatedPods{ 1888 Failed: []types.UID{"a"}, 1889 }, 1890 }, 1891 { 1892 Failed: 1, 1893 FailedIndexes: ptr.To("1"), 1894 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 1895 }, 1896 }, 1897 wantFailedPodsMetric: 1, 1898 }, 1899 } 1900 for name, tc := range cases { 1901 t.Run(name, func(t *testing.T) { 1902 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, tc.enableJobBackoffLimitPerIndex)() 1903 clientSet := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 1904 manager, _ := newControllerFromClient(ctx, t, clientSet, controller.NoResyncPeriodFunc) 1905 fakePodControl := controller.FakePodControl{Err: tc.podControlErr} 1906 metrics.JobPodsFinished.Reset() 1907 manager.podControl = &fakePodControl 1908 var statusUpdates []batch.JobStatus 1909 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 1910 statusUpdates = append(statusUpdates, *job.Status.DeepCopy()) 1911 return job, tc.statusUpdateErr 1912 } 1913 job := tc.job.DeepCopy() 1914 if job.Status.UncountedTerminatedPods == nil { 1915 job.Status.UncountedTerminatedPods = &batch.UncountedTerminatedPods{} 1916 } 1917 jobCtx := &syncJobCtx{ 1918 job: job, 1919 pods: tc.pods, 1920 uncounted: newUncountedTerminatedPods(*job.Status.UncountedTerminatedPods), 1921 expectedRmFinalizers: tc.expectedRmFinalizers, 1922 finishedCondition: tc.finishedCond, 1923 } 1924 if isIndexedJob(job) { 1925 jobCtx.succeededIndexes = parseIndexesFromString(logger, job.Status.CompletedIndexes, int(*job.Spec.Completions)) 1926 if tc.enableJobBackoffLimitPerIndex && job.Spec.BackoffLimitPerIndex != nil { 1927 jobCtx.failedIndexes = calculateFailedIndexes(logger, job, tc.pods) 1928 jobCtx.activePods = controller.FilterActivePods(logger, tc.pods) 1929 jobCtx.podsWithDelayedDeletionPerIndex = getPodsWithDelayedDeletionPerIndex(logger, jobCtx) 1930 } 1931 } 1932 1933 err := manager.trackJobStatusAndRemoveFinalizers(ctx, jobCtx, tc.needsFlush) 1934 if !errors.Is(err, tc.wantErr) { 1935 t.Errorf("Got error %v, want %v", err, tc.wantErr) 1936 } 1937 if diff := cmp.Diff(tc.wantStatusUpdates, statusUpdates, 
cmpopts.IgnoreFields(batch.JobCondition{}, "LastProbeTime", "LastTransitionTime")); diff != "" { 1938 t.Errorf("Unexpected status updates (-want,+got):\n%s", diff) 1939 } 1940 rmFinalizers := len(fakePodControl.Patches) 1941 if rmFinalizers != tc.wantRmFinalizers { 1942 t.Errorf("Removed %d finalizers, want %d", rmFinalizers, tc.wantRmFinalizers) 1943 } 1944 if tc.wantErr == nil { 1945 completionMode := completionModeStr(job) 1946 v, err := metricstestutil.GetCounterMetricValue(metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Succeeded)) 1947 if err != nil { 1948 t.Fatalf("Obtaining succeeded job_pods_finished_total: %v", err) 1949 } 1950 if float64(tc.wantSucceededPodsMetric) != v { 1951 t.Errorf("Metric reports %.0f succeeded pods, want %d", v, tc.wantSucceededPodsMetric) 1952 } 1953 v, err = metricstestutil.GetCounterMetricValue(metrics.JobPodsFinished.WithLabelValues(completionMode, metrics.Failed)) 1954 if err != nil { 1955 t.Fatalf("Obtaining failed job_pods_finished_total: %v", err) 1956 } 1957 if float64(tc.wantFailedPodsMetric) != v { 1958 t.Errorf("Metric reports %.0f failed pods, want %d", v, tc.wantFailedPodsMetric) 1959 } 1960 } 1961 }) 1962 } 1963 } 1964 1965 // TestSyncJobPastDeadline verifies tracking of the active deadline in a single syncJob call. 1966 func TestSyncJobPastDeadline(t *testing.T) { 1967 _, ctx := ktesting.NewTestContext(t) 1968 testCases := map[string]struct { 1969 // job setup 1970 parallelism int32 1971 completions int32 1972 activeDeadlineSeconds int64 1973 startTime int64 1974 backoffLimit int32 1975 suspend bool 1976 1977 // pod setup 1978 activePods int 1979 succeededPods int 1980 failedPods int 1981 1982 // expectations 1983 expectedDeletions int32 1984 expectedActive int32 1985 expectedSucceeded int32 1986 expectedFailed int32 1987 expectedCondition batch.JobConditionType 1988 expectedConditionReason string 1989 }{ 1990 "activeDeadlineSeconds less than single pod execution": { 1991 parallelism: 1, 1992 completions: 1, 1993 activeDeadlineSeconds: 10, 1994 startTime: 15, 1995 backoffLimit: 6, 1996 activePods: 1, 1997 expectedDeletions: 1, 1998 expectedFailed: 1, 1999 expectedCondition: batch.JobFailed, 2000 expectedConditionReason: batch.JobReasonDeadlineExceeded, 2001 }, 2002 "activeDeadlineSeconds longer than single pod execution": { 2003 parallelism: 1, 2004 completions: 2, 2005 activeDeadlineSeconds: 10, 2006 startTime: 15, 2007 backoffLimit: 6, 2008 activePods: 1, 2009 succeededPods: 1, 2010 expectedDeletions: 1, 2011 expectedSucceeded: 1, 2012 expectedFailed: 1, 2013 expectedCondition: batch.JobFailed, 2014 expectedConditionReason: batch.JobReasonDeadlineExceeded, 2015 }, 2016 "activeDeadlineSeconds times out before any pod starts": { 2017 parallelism: 1, 2018 completions: 1, 2019 activeDeadlineSeconds: 10, 2020 startTime: 10, 2021 backoffLimit: 6, 2022 expectedCondition: batch.JobFailed, 2023 expectedConditionReason: batch.JobReasonDeadlineExceeded, 2024 }, 2025 "activeDeadlineSeconds with backoffLimit reached": { 2026 parallelism: 1, 2027 completions: 1, 2028 activeDeadlineSeconds: 1, 2029 startTime: 10, 2030 failedPods: 1, 2031 expectedFailed: 1, 2032 expectedCondition: batch.JobFailed, 2033 expectedConditionReason: batch.JobReasonBackoffLimitExceeded, 2034 }, 2035 "activeDeadlineSeconds is not triggered when Job is suspended": { 2036 suspend: true, 2037 parallelism: 1, 2038 completions: 2, 2039 activeDeadlineSeconds: 10, 2040 startTime: 15, 2041 backoffLimit: 6, 2042 expectedCondition: batch.JobSuspended, 2043
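// With spec.suspend set, the active deadline is not enforced; this case expects a
// JobSuspended condition (with the reason "JobSuspended" just below) rather than
// DeadlineExceeded.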
expectedConditionReason: "JobSuspended", 2044 }, 2045 } 2046 2047 for name, tc := range testCases { 2048 t.Run(name, func(t *testing.T) { 2049 // job manager setup 2050 clientSet := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 2051 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientSet, controller.NoResyncPeriodFunc) 2052 fakePodControl := controller.FakePodControl{} 2053 manager.podControl = &fakePodControl 2054 manager.podStoreSynced = alwaysReady 2055 manager.jobStoreSynced = alwaysReady 2056 var actual *batch.Job 2057 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 2058 actual = job 2059 return job, nil 2060 } 2061 2062 // job & pods setup 2063 job := newJob(tc.parallelism, tc.completions, tc.backoffLimit, batch.NonIndexedCompletion) 2064 job.Spec.ActiveDeadlineSeconds = &tc.activeDeadlineSeconds 2065 job.Spec.Suspend = ptr.To(tc.suspend) 2066 start := metav1.Unix(metav1.Now().Time.Unix()-tc.startTime, 0) 2067 job.Status.StartTime = &start 2068 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 2069 podIndexer := sharedInformerFactory.Core().V1().Pods().Informer().GetIndexer() 2070 setPodsStatuses(podIndexer, job, 0, tc.activePods, tc.succeededPods, tc.failedPods, 0, 0) 2071 2072 // run 2073 err := manager.syncJob(context.TODO(), testutil.GetKey(job, t)) 2074 if err != nil { 2075 t.Errorf("Unexpected error when syncing jobs %v", err) 2076 } 2077 // validate created/deleted pods 2078 if int32(len(fakePodControl.Templates)) != 0 { 2079 t.Errorf("Unexpected number of creates. Expected 0, saw %d\n", len(fakePodControl.Templates)) 2080 } 2081 if int32(len(fakePodControl.DeletePodName)) != tc.expectedDeletions { 2082 t.Errorf("Unexpected number of deletes. Expected %d, saw %d\n", tc.expectedDeletions, len(fakePodControl.DeletePodName)) 2083 } 2084 // validate status 2085 if actual.Status.Active != tc.expectedActive { 2086 t.Errorf("Unexpected number of active pods. Expected %d, saw %d\n", tc.expectedActive, actual.Status.Active) 2087 } 2088 if actual.Status.Succeeded != tc.expectedSucceeded { 2089 t.Errorf("Unexpected number of succeeded pods. Expected %d, saw %d\n", tc.expectedSucceeded, actual.Status.Succeeded) 2090 } 2091 if actual.Status.Failed != tc.expectedFailed { 2092 t.Errorf("Unexpected number of failed pods. Expected %d, saw %d\n", tc.expectedFailed, actual.Status.Failed) 2093 } 2094 if actual.Status.StartTime == nil { 2095 t.Error("Missing .status.startTime") 2096 } 2097 // validate conditions 2098 if !getCondition(actual, tc.expectedCondition, v1.ConditionTrue, tc.expectedConditionReason) { 2099 t.Errorf("Expected fail condition. Got %#v", actual.Status.Conditions) 2100 } 2101 }) 2102 } 2103 } 2104 2105 func getCondition(job *batch.Job, condition batch.JobConditionType, status v1.ConditionStatus, reason string) bool { 2106 for _, v := range job.Status.Conditions { 2107 if v.Type == condition && v.Status == status && v.Reason == reason { 2108 return true 2109 } 2110 } 2111 return false 2112 } 2113 2114 func hasTrueCondition(job *batch.Job) *batch.JobConditionType { 2115 for _, v := range job.Status.Conditions { 2116 if v.Status == v1.ConditionTrue { 2117 return &v.Type 2118 } 2119 } 2120 return nil 2121 } 2122 2123 // TestPastDeadlineJobFinished ensures that a Job is correctly tracked until 2124 // reaching the active deadline, at which point it is marked as Failed. 
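// The test drives time with a fake clock instead of waiting in real time. A minimal
// sketch of the interaction it relies on (illustrative only; it assumes that
// clocktesting.FakeClock.Sleep steps the fake time forward rather than blocking, and
// that Since measures against that fake time):
//
//	fakeClock := clocktesting.NewFakeClock(time.Now())
//	start := fakeClock.Now()
//	fakeClock.Sleep(2 * time.Second)  // advances fake time by 2s without blocking
//	elapsed := fakeClock.Since(start) // ~2s, enough to exceed a 1s activeDeadlineSeconds
//	_ = elapsed
//
// Each failed poll below calls manager.clock.Sleep, which is what eventually pushes the
// Job past its deadline so the controller can add the DeadlineExceeded condition.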
2125 func TestPastDeadlineJobFinished(t *testing.T) { 2126 _, ctx := ktesting.NewTestContext(t) 2127 clientset := fake.NewSimpleClientset() 2128 fakeClock := clocktesting.NewFakeClock(time.Now().Truncate(time.Second)) 2129 manager, sharedInformerFactory := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClock) 2130 manager.podStoreSynced = alwaysReady 2131 manager.jobStoreSynced = alwaysReady 2132 manager.expectations = FakeJobExpectations{ 2133 controller.NewControllerExpectations(), true, func() { 2134 }, 2135 } 2136 ctx, cancel := context.WithCancel(context.Background()) 2137 defer cancel() 2138 sharedInformerFactory.Start(ctx.Done()) 2139 sharedInformerFactory.WaitForCacheSync(ctx.Done()) 2140 2141 go manager.Run(ctx, 1) 2142 2143 tests := []struct { 2144 name string 2145 setStartTime bool 2146 jobName string 2147 }{ 2148 { 2149 name: "New job created without start time being set", 2150 setStartTime: false, 2151 jobName: "job1", 2152 }, 2153 { 2154 name: "New job created with start time being set", 2155 setStartTime: true, 2156 jobName: "job2", 2157 }, 2158 } 2159 for _, tc := range tests { 2160 t.Run(tc.name, func(t *testing.T) { 2161 job := newJobWithName(tc.jobName, 1, 1, 6, batch.NonIndexedCompletion) 2162 job.Spec.ActiveDeadlineSeconds = ptr.To[int64](1) 2163 if tc.setStartTime { 2164 start := metav1.NewTime(fakeClock.Now()) 2165 job.Status.StartTime = &start 2166 } 2167 2168 _, err := clientset.BatchV1().Jobs(job.GetNamespace()).Create(ctx, job, metav1.CreateOptions{}) 2169 if err != nil { 2170 t.Errorf("Could not create Job: %v", err) 2171 } 2172 2173 var j *batch.Job 2174 err = wait.PollUntilContextTimeout(ctx, 200*time.Microsecond, 3*time.Second, true, func(ctx context.Context) (done bool, err error) { 2175 j, err = clientset.BatchV1().Jobs(metav1.NamespaceDefault).Get(ctx, job.GetName(), metav1.GetOptions{}) 2176 if err != nil { 2177 return false, err 2178 } 2179 return j.Status.StartTime != nil, nil 2180 }) 2181 if err != nil { 2182 t.Errorf("Job failed to ensure that start time was set: %v", err) 2183 } 2184 err = wait.PollUntilContextTimeout(ctx, 100*time.Millisecond, 3*time.Second, false, func(ctx context.Context) (done bool, err error) { 2185 j, err = clientset.BatchV1().Jobs(metav1.NamespaceDefault).Get(ctx, job.GetName(), metav1.GetOptions{}) 2186 if err != nil { 2187 return false, nil 2188 } 2189 if getCondition(j, batch.JobFailed, v1.ConditionTrue, batch.JobReasonDeadlineExceeded) { 2190 if manager.clock.Since(j.Status.StartTime.Time) < time.Duration(*j.Spec.ActiveDeadlineSeconds)*time.Second { 2191 return true, errors.New("Job contains DeadlineExceeded condition earlier than expected") 2192 } 2193 return true, nil 2194 } 2195 manager.clock.Sleep(100 * time.Millisecond) 2196 return false, nil 2197 }) 2198 if err != nil { 2199 t.Errorf("Job failed to enforce activeDeadlineSeconds configuration. 
Expected condition with Reason 'DeadlineExceeded' was not found in %v", j.Status) 2200 } 2201 }) 2202 } 2203 } 2204 2205 func TestSingleJobFailedCondition(t *testing.T) { 2206 _, ctx := ktesting.NewTestContext(t) 2207 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 2208 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 2209 fakePodControl := controller.FakePodControl{} 2210 manager.podControl = &fakePodControl 2211 manager.podStoreSynced = alwaysReady 2212 manager.jobStoreSynced = alwaysReady 2213 var actual *batch.Job 2214 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 2215 actual = job 2216 return job, nil 2217 } 2218 2219 job := newJob(1, 1, 6, batch.NonIndexedCompletion) 2220 job.Spec.ActiveDeadlineSeconds = ptr.To[int64](10) 2221 start := metav1.Unix(metav1.Now().Time.Unix()-15, 0) 2222 job.Status.StartTime = &start 2223 job.Status.Conditions = append(job.Status.Conditions, *newCondition(batch.JobFailed, v1.ConditionFalse, "DeadlineExceeded", "Job was active longer than specified deadline", realClock.Now())) 2224 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 2225 err := manager.syncJob(context.TODO(), testutil.GetKey(job, t)) 2226 if err != nil { 2227 t.Errorf("Unexpected error when syncing jobs %v", err) 2228 } 2229 if len(fakePodControl.DeletePodName) != 0 { 2230 t.Errorf("Unexpected number of deletes. Expected %d, saw %d\n", 0, len(fakePodControl.DeletePodName)) 2231 } 2232 if actual == nil { 2233 t.Error("Expected job modification\n") 2234 } 2235 failedConditions := getConditionsByType(actual.Status.Conditions, batch.JobFailed) 2236 if len(failedConditions) != 1 { 2237 t.Error("Unexpected number of failed conditions\n") 2238 } 2239 if failedConditions[0].Status != v1.ConditionTrue { 2240 t.Errorf("Unexpected status for the failed condition. Expected: %v, saw %v\n", v1.ConditionTrue, failedConditions[0].Status) 2241 } 2242 2243 } 2244 2245 func TestSyncJobComplete(t *testing.T) { 2246 _, ctx := ktesting.NewTestContext(t) 2247 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 2248 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 2249 fakePodControl := controller.FakePodControl{} 2250 manager.podControl = &fakePodControl 2251 manager.podStoreSynced = alwaysReady 2252 manager.jobStoreSynced = alwaysReady 2253 2254 job := newJob(1, 1, 6, batch.NonIndexedCompletion) 2255 job.Status.Conditions = append(job.Status.Conditions, *newCondition(batch.JobComplete, v1.ConditionTrue, "", "", realClock.Now())) 2256 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 2257 err := manager.syncJob(context.TODO(), testutil.GetKey(job, t)) 2258 if err != nil { 2259 t.Fatalf("Unexpected error when syncing jobs %v", err) 2260 } 2261 actual, err := manager.jobLister.Jobs(job.Namespace).Get(job.Name) 2262 if err != nil { 2263 t.Fatalf("Unexpected error when trying to get job from the store: %v", err) 2264 } 2265 // Verify that after syncing a complete job, the conditions are the same. 
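// A Job that already carries a JobComplete condition with status True is treated as
// finished, so syncJob is expected to leave the status untouched rather than re-evaluate
// pods; the single pre-set condition should therefore remain the only one present.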
2266 if got, expected := len(actual.Status.Conditions), 1; got != expected { 2267 t.Fatalf("Unexpected job status conditions amount; expected %d, got %d", expected, got) 2268 } 2269 } 2270 2271 func TestSyncJobDeleted(t *testing.T) { 2272 _, ctx := ktesting.NewTestContext(t) 2273 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 2274 manager, _ := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 2275 fakePodControl := controller.FakePodControl{} 2276 manager.podControl = &fakePodControl 2277 manager.podStoreSynced = alwaysReady 2278 manager.jobStoreSynced = alwaysReady 2279 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 2280 return job, nil 2281 } 2282 job := newJob(2, 2, 6, batch.NonIndexedCompletion) 2283 err := manager.syncJob(context.TODO(), testutil.GetKey(job, t)) 2284 if err != nil { 2285 t.Errorf("Unexpected error when syncing jobs %v", err) 2286 } 2287 if len(fakePodControl.Templates) != 0 { 2288 t.Errorf("Unexpected number of creates. Expected %d, saw %d\n", 0, len(fakePodControl.Templates)) 2289 } 2290 if len(fakePodControl.DeletePodName) != 0 { 2291 t.Errorf("Unexpected number of deletes. Expected %d, saw %d\n", 0, len(fakePodControl.DeletePodName)) 2292 } 2293 } 2294 2295 func TestSyncJobWithJobPodFailurePolicy(t *testing.T) { 2296 _, ctx := ktesting.NewTestContext(t) 2297 now := metav1.Now() 2298 indexedCompletionMode := batch.IndexedCompletion 2299 validObjectMeta := metav1.ObjectMeta{ 2300 Name: "foobar", 2301 UID: uuid.NewUUID(), 2302 Namespace: metav1.NamespaceDefault, 2303 } 2304 validSelector := &metav1.LabelSelector{ 2305 MatchLabels: map[string]string{"foo": "bar"}, 2306 } 2307 validTemplate := v1.PodTemplateSpec{ 2308 ObjectMeta: metav1.ObjectMeta{ 2309 Labels: map[string]string{ 2310 "foo": "bar", 2311 }, 2312 }, 2313 Spec: v1.PodSpec{ 2314 Containers: []v1.Container{ 2315 {Image: "foo/bar"}, 2316 }, 2317 }, 2318 } 2319 2320 onExitCodeRules := []batch.PodFailurePolicyRule{ 2321 { 2322 Action: batch.PodFailurePolicyActionIgnore, 2323 OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ 2324 Operator: batch.PodFailurePolicyOnExitCodesOpIn, 2325 Values: []int32{1, 2, 3}, 2326 }, 2327 }, 2328 { 2329 Action: batch.PodFailurePolicyActionFailJob, 2330 OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ 2331 Operator: batch.PodFailurePolicyOnExitCodesOpIn, 2332 Values: []int32{5, 6, 7}, 2333 }, 2334 }, 2335 } 2336 2337 testCases := map[string]struct { 2338 enableJobPodFailurePolicy bool 2339 enablePodDisruptionConditions bool 2340 enableJobPodReplacementPolicy bool 2341 job batch.Job 2342 pods []v1.Pod 2343 wantConditions *[]batch.JobCondition 2344 wantStatusFailed int32 2345 wantStatusActive int32 2346 wantStatusSucceeded int32 2347 wantStatusTerminating *int32 2348 }{ 2349 "default handling for pod failure if the container matching the exit codes does not match the containerName restriction": { 2350 enableJobPodFailurePolicy: true, 2351 job: batch.Job{ 2352 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2353 ObjectMeta: validObjectMeta, 2354 Spec: batch.JobSpec{ 2355 Selector: validSelector, 2356 Template: validTemplate, 2357 Parallelism: ptr.To[int32](1), 2358 Completions: ptr.To[int32](1), 2359 BackoffLimit: ptr.To[int32](6), 2360 PodFailurePolicy: &batch.PodFailurePolicy{ 2361 Rules: []batch.PodFailurePolicyRule{ 2362 { 2363 Action: batch.PodFailurePolicyActionIgnore, 
2364 OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ 2365 ContainerName: ptr.To("main-container"), 2366 Operator: batch.PodFailurePolicyOnExitCodesOpIn, 2367 Values: []int32{1, 2, 3}, 2368 }, 2369 }, 2370 { 2371 Action: batch.PodFailurePolicyActionFailJob, 2372 OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ 2373 ContainerName: ptr.To("main-container"), 2374 Operator: batch.PodFailurePolicyOnExitCodesOpIn, 2375 Values: []int32{5, 6, 7}, 2376 }, 2377 }, 2378 }, 2379 }, 2380 }, 2381 }, 2382 pods: []v1.Pod{ 2383 { 2384 Status: v1.PodStatus{ 2385 Phase: v1.PodFailed, 2386 ContainerStatuses: []v1.ContainerStatus{ 2387 { 2388 Name: "monitoring-container", 2389 State: v1.ContainerState{ 2390 Terminated: &v1.ContainerStateTerminated{ 2391 ExitCode: 5, 2392 }, 2393 }, 2394 }, 2395 { 2396 Name: "main-container", 2397 State: v1.ContainerState{ 2398 Terminated: &v1.ContainerStateTerminated{ 2399 ExitCode: 42, 2400 FinishedAt: testFinishedAt, 2401 }, 2402 }, 2403 }, 2404 }, 2405 }, 2406 }, 2407 }, 2408 wantConditions: nil, 2409 wantStatusActive: 1, 2410 wantStatusSucceeded: 0, 2411 wantStatusFailed: 1, 2412 }, 2413 "running pod should not result in job fail based on OnExitCodes": { 2414 enableJobPodFailurePolicy: true, 2415 job: batch.Job{ 2416 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2417 ObjectMeta: validObjectMeta, 2418 Spec: batch.JobSpec{ 2419 Selector: validSelector, 2420 Template: validTemplate, 2421 Parallelism: ptr.To[int32](1), 2422 Completions: ptr.To[int32](1), 2423 BackoffLimit: ptr.To[int32](6), 2424 PodFailurePolicy: &batch.PodFailurePolicy{ 2425 Rules: onExitCodeRules, 2426 }, 2427 }, 2428 }, 2429 pods: []v1.Pod{ 2430 { 2431 Status: v1.PodStatus{ 2432 Phase: v1.PodRunning, 2433 ContainerStatuses: []v1.ContainerStatus{ 2434 { 2435 Name: "main-container", 2436 State: v1.ContainerState{ 2437 Terminated: &v1.ContainerStateTerminated{ 2438 ExitCode: 5, 2439 }, 2440 }, 2441 }, 2442 }, 2443 }, 2444 }, 2445 }, 2446 wantConditions: nil, 2447 wantStatusActive: 1, 2448 wantStatusFailed: 0, 2449 wantStatusSucceeded: 0, 2450 }, 2451 "fail job based on OnExitCodes": { 2452 enableJobPodFailurePolicy: true, 2453 job: batch.Job{ 2454 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2455 ObjectMeta: validObjectMeta, 2456 Spec: batch.JobSpec{ 2457 Selector: validSelector, 2458 Template: validTemplate, 2459 Parallelism: ptr.To[int32](1), 2460 Completions: ptr.To[int32](1), 2461 BackoffLimit: ptr.To[int32](6), 2462 PodFailurePolicy: &batch.PodFailurePolicy{ 2463 Rules: onExitCodeRules, 2464 }, 2465 }, 2466 }, 2467 pods: []v1.Pod{ 2468 { 2469 Status: v1.PodStatus{ 2470 Phase: v1.PodFailed, 2471 ContainerStatuses: []v1.ContainerStatus{ 2472 { 2473 Name: "main-container", 2474 State: v1.ContainerState{ 2475 Terminated: &v1.ContainerStateTerminated{ 2476 ExitCode: 5, 2477 }, 2478 }, 2479 }, 2480 }, 2481 }, 2482 }, 2483 }, 2484 wantConditions: &[]batch.JobCondition{ 2485 { 2486 Type: batch.JobFailed, 2487 Status: v1.ConditionTrue, 2488 Reason: batch.JobReasonPodFailurePolicy, 2489 Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1", 2490 }, 2491 }, 2492 wantStatusActive: 0, 2493 wantStatusFailed: 1, 2494 wantStatusSucceeded: 0, 2495 }, 2496 "job marked already as failure target with failed pod": { 2497 enableJobPodFailurePolicy: true, 2498 job: batch.Job{ 2499 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2500 ObjectMeta: validObjectMeta, 2501 Spec: batch.JobSpec{ 2502 Selector: validSelector, 2503 Template: validTemplate, 2504 Parallelism: 
ptr.To[int32](1), 2505 Completions: ptr.To[int32](1), 2506 BackoffLimit: ptr.To[int32](6), 2507 PodFailurePolicy: &batch.PodFailurePolicy{ 2508 Rules: onExitCodeRules, 2509 }, 2510 }, 2511 Status: batch.JobStatus{ 2512 Conditions: []batch.JobCondition{ 2513 { 2514 Type: batch.JobFailureTarget, 2515 Status: v1.ConditionTrue, 2516 Reason: batch.JobReasonPodFailurePolicy, 2517 Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1", 2518 }, 2519 }, 2520 }, 2521 }, 2522 pods: []v1.Pod{ 2523 { 2524 Status: v1.PodStatus{ 2525 Phase: v1.PodFailed, 2526 ContainerStatuses: []v1.ContainerStatus{ 2527 { 2528 Name: "main-container", 2529 State: v1.ContainerState{ 2530 Terminated: &v1.ContainerStateTerminated{ 2531 ExitCode: 5, 2532 }, 2533 }, 2534 }, 2535 }, 2536 }, 2537 }, 2538 }, 2539 wantConditions: &[]batch.JobCondition{ 2540 { 2541 Type: batch.JobFailed, 2542 Status: v1.ConditionTrue, 2543 Reason: batch.JobReasonPodFailurePolicy, 2544 Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1", 2545 }, 2546 }, 2547 wantStatusActive: 0, 2548 wantStatusFailed: 1, 2549 wantStatusSucceeded: 0, 2550 }, 2551 "job marked already as failure target with failed pod, message based on already deleted pod": { 2552 enableJobPodFailurePolicy: true, 2553 job: batch.Job{ 2554 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2555 ObjectMeta: validObjectMeta, 2556 Spec: batch.JobSpec{ 2557 Selector: validSelector, 2558 Template: validTemplate, 2559 Parallelism: ptr.To[int32](1), 2560 Completions: ptr.To[int32](1), 2561 BackoffLimit: ptr.To[int32](6), 2562 PodFailurePolicy: &batch.PodFailurePolicy{ 2563 Rules: onExitCodeRules, 2564 }, 2565 }, 2566 Status: batch.JobStatus{ 2567 Conditions: []batch.JobCondition{ 2568 { 2569 Type: batch.JobFailureTarget, 2570 Status: v1.ConditionTrue, 2571 Reason: batch.JobReasonPodFailurePolicy, 2572 Message: "Container main-container for pod default/already-deleted-pod failed with exit code 5 matching FailJob rule at index 1", 2573 }, 2574 }, 2575 }, 2576 }, 2577 pods: []v1.Pod{ 2578 { 2579 Status: v1.PodStatus{ 2580 Phase: v1.PodFailed, 2581 ContainerStatuses: []v1.ContainerStatus{ 2582 { 2583 Name: "main-container", 2584 State: v1.ContainerState{ 2585 Terminated: &v1.ContainerStateTerminated{ 2586 ExitCode: 5, 2587 }, 2588 }, 2589 }, 2590 }, 2591 }, 2592 }, 2593 }, 2594 wantConditions: &[]batch.JobCondition{ 2595 { 2596 Type: batch.JobFailed, 2597 Status: v1.ConditionTrue, 2598 Reason: batch.JobReasonPodFailurePolicy, 2599 Message: "Container main-container for pod default/already-deleted-pod failed with exit code 5 matching FailJob rule at index 1", 2600 }, 2601 }, 2602 wantStatusActive: 0, 2603 wantStatusFailed: 1, 2604 wantStatusSucceeded: 0, 2605 }, 2606 "default handling for a failed pod when the feature is disabled even, despite matching rule": { 2607 enableJobPodFailurePolicy: false, 2608 job: batch.Job{ 2609 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2610 ObjectMeta: validObjectMeta, 2611 Spec: batch.JobSpec{ 2612 Selector: validSelector, 2613 Template: validTemplate, 2614 Parallelism: ptr.To[int32](1), 2615 Completions: ptr.To[int32](1), 2616 BackoffLimit: ptr.To[int32](6), 2617 PodFailurePolicy: &batch.PodFailurePolicy{ 2618 Rules: onExitCodeRules, 2619 }, 2620 }, 2621 }, 2622 pods: []v1.Pod{ 2623 { 2624 Status: v1.PodStatus{ 2625 Phase: v1.PodFailed, 2626 ContainerStatuses: []v1.ContainerStatus{ 2627 { 2628 Name: "main-container", 2629 State: v1.ContainerState{ 2630 
Terminated: &v1.ContainerStateTerminated{ 2631 ExitCode: 5, 2632 FinishedAt: testFinishedAt, 2633 }, 2634 }, 2635 }, 2636 }, 2637 }, 2638 }, 2639 }, 2640 wantConditions: nil, 2641 wantStatusActive: 1, 2642 wantStatusFailed: 1, 2643 wantStatusSucceeded: 0, 2644 }, 2645 "fail job with multiple pods": { 2646 enableJobPodFailurePolicy: true, 2647 job: batch.Job{ 2648 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2649 ObjectMeta: validObjectMeta, 2650 Spec: batch.JobSpec{ 2651 Selector: validSelector, 2652 Template: validTemplate, 2653 Parallelism: ptr.To[int32](2), 2654 Completions: ptr.To[int32](2), 2655 BackoffLimit: ptr.To[int32](6), 2656 PodFailurePolicy: &batch.PodFailurePolicy{ 2657 Rules: onExitCodeRules, 2658 }, 2659 }, 2660 }, 2661 pods: []v1.Pod{ 2662 { 2663 Status: v1.PodStatus{ 2664 Phase: v1.PodRunning, 2665 }, 2666 }, 2667 { 2668 Status: v1.PodStatus{ 2669 Phase: v1.PodFailed, 2670 ContainerStatuses: []v1.ContainerStatus{ 2671 { 2672 Name: "main-container", 2673 State: v1.ContainerState{ 2674 Terminated: &v1.ContainerStateTerminated{ 2675 ExitCode: 5, 2676 }, 2677 }, 2678 }, 2679 }, 2680 }, 2681 }, 2682 }, 2683 wantConditions: &[]batch.JobCondition{ 2684 { 2685 Type: batch.JobFailed, 2686 Status: v1.ConditionTrue, 2687 Reason: batch.JobReasonPodFailurePolicy, 2688 Message: "Container main-container for pod default/mypod-1 failed with exit code 5 matching FailJob rule at index 1", 2689 }, 2690 }, 2691 wantStatusActive: 0, 2692 wantStatusFailed: 2, 2693 wantStatusSucceeded: 0, 2694 }, 2695 "fail indexed job based on OnExitCodes": { 2696 enableJobPodFailurePolicy: true, 2697 job: batch.Job{ 2698 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2699 ObjectMeta: validObjectMeta, 2700 Spec: batch.JobSpec{ 2701 Selector: validSelector, 2702 Template: validTemplate, 2703 CompletionMode: &indexedCompletionMode, 2704 Parallelism: ptr.To[int32](1), 2705 Completions: ptr.To[int32](1), 2706 BackoffLimit: ptr.To[int32](6), 2707 PodFailurePolicy: &batch.PodFailurePolicy{ 2708 Rules: onExitCodeRules, 2709 }, 2710 }, 2711 }, 2712 pods: []v1.Pod{ 2713 { 2714 Status: v1.PodStatus{ 2715 Phase: v1.PodFailed, 2716 ContainerStatuses: []v1.ContainerStatus{ 2717 { 2718 Name: "main-container", 2719 State: v1.ContainerState{ 2720 Terminated: &v1.ContainerStateTerminated{ 2721 ExitCode: 5, 2722 }, 2723 }, 2724 }, 2725 }, 2726 }, 2727 }, 2728 }, 2729 wantConditions: &[]batch.JobCondition{ 2730 { 2731 Type: batch.JobFailed, 2732 Status: v1.ConditionTrue, 2733 Reason: batch.JobReasonPodFailurePolicy, 2734 Message: "Container main-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1", 2735 }, 2736 }, 2737 wantStatusActive: 0, 2738 wantStatusFailed: 1, 2739 wantStatusSucceeded: 0, 2740 }, 2741 "fail job based on OnExitCodes with NotIn operator": { 2742 enableJobPodFailurePolicy: true, 2743 job: batch.Job{ 2744 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2745 ObjectMeta: validObjectMeta, 2746 Spec: batch.JobSpec{ 2747 Selector: validSelector, 2748 Template: validTemplate, 2749 Parallelism: ptr.To[int32](1), 2750 Completions: ptr.To[int32](1), 2751 BackoffLimit: ptr.To[int32](6), 2752 PodFailurePolicy: &batch.PodFailurePolicy{ 2753 Rules: []batch.PodFailurePolicyRule{ 2754 { 2755 Action: batch.PodFailurePolicyActionFailJob, 2756 OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ 2757 Operator: batch.PodFailurePolicyOnExitCodesOpNotIn, 2758 Values: []int32{5, 6, 7}, 2759 }, 2760 }, 2761 }, 2762 }, 2763 }, 2764 }, 2765 pods: []v1.Pod{ 2766 { 2767 Status: v1.PodStatus{ 2768 Phase: 
v1.PodFailed, 2769 ContainerStatuses: []v1.ContainerStatus{ 2770 { 2771 Name: "main-container", 2772 State: v1.ContainerState{ 2773 Terminated: &v1.ContainerStateTerminated{ 2774 ExitCode: 42, 2775 }, 2776 }, 2777 }, 2778 }, 2779 }, 2780 }, 2781 }, 2782 wantConditions: &[]batch.JobCondition{ 2783 { 2784 Type: batch.JobFailed, 2785 Status: v1.ConditionTrue, 2786 Reason: batch.JobReasonPodFailurePolicy, 2787 Message: "Container main-container for pod default/mypod-0 failed with exit code 42 matching FailJob rule at index 0", 2788 }, 2789 }, 2790 wantStatusActive: 0, 2791 wantStatusFailed: 1, 2792 wantStatusSucceeded: 0, 2793 }, 2794 "default handling job based on OnExitCodes with NotIn operator": { 2795 enableJobPodFailurePolicy: true, 2796 job: batch.Job{ 2797 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2798 ObjectMeta: validObjectMeta, 2799 Spec: batch.JobSpec{ 2800 Selector: validSelector, 2801 Template: validTemplate, 2802 Parallelism: ptr.To[int32](1), 2803 Completions: ptr.To[int32](1), 2804 BackoffLimit: ptr.To[int32](6), 2805 PodFailurePolicy: &batch.PodFailurePolicy{ 2806 Rules: []batch.PodFailurePolicyRule{ 2807 { 2808 Action: batch.PodFailurePolicyActionFailJob, 2809 OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ 2810 Operator: batch.PodFailurePolicyOnExitCodesOpNotIn, 2811 Values: []int32{5, 6, 7}, 2812 }, 2813 }, 2814 }, 2815 }, 2816 }, 2817 }, 2818 pods: []v1.Pod{ 2819 { 2820 Status: v1.PodStatus{ 2821 Phase: v1.PodFailed, 2822 ContainerStatuses: []v1.ContainerStatus{ 2823 { 2824 Name: "main-container", 2825 State: v1.ContainerState{ 2826 Terminated: &v1.ContainerStateTerminated{ 2827 ExitCode: 5, 2828 FinishedAt: testFinishedAt, 2829 }, 2830 }, 2831 }, 2832 }, 2833 }, 2834 }, 2835 }, 2836 wantConditions: nil, 2837 wantStatusActive: 1, 2838 wantStatusFailed: 1, 2839 wantStatusSucceeded: 0, 2840 }, 2841 "fail job based on OnExitCodes for InitContainer": { 2842 enableJobPodFailurePolicy: true, 2843 job: batch.Job{ 2844 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2845 ObjectMeta: validObjectMeta, 2846 Spec: batch.JobSpec{ 2847 Selector: validSelector, 2848 Template: validTemplate, 2849 Parallelism: ptr.To[int32](1), 2850 Completions: ptr.To[int32](1), 2851 BackoffLimit: ptr.To[int32](6), 2852 PodFailurePolicy: &batch.PodFailurePolicy{ 2853 Rules: onExitCodeRules, 2854 }, 2855 }, 2856 }, 2857 pods: []v1.Pod{ 2858 { 2859 Status: v1.PodStatus{ 2860 Phase: v1.PodFailed, 2861 InitContainerStatuses: []v1.ContainerStatus{ 2862 { 2863 Name: "init-container", 2864 State: v1.ContainerState{ 2865 Terminated: &v1.ContainerStateTerminated{ 2866 ExitCode: 5, 2867 }, 2868 }, 2869 }, 2870 }, 2871 ContainerStatuses: []v1.ContainerStatus{ 2872 { 2873 Name: "main-container", 2874 State: v1.ContainerState{ 2875 Terminated: &v1.ContainerStateTerminated{ 2876 ExitCode: 143, 2877 }, 2878 }, 2879 }, 2880 }, 2881 }, 2882 }, 2883 }, 2884 wantConditions: &[]batch.JobCondition{ 2885 { 2886 Type: batch.JobFailed, 2887 Status: v1.ConditionTrue, 2888 Reason: batch.JobReasonPodFailurePolicy, 2889 Message: "Container init-container for pod default/mypod-0 failed with exit code 5 matching FailJob rule at index 1", 2890 }, 2891 }, 2892 wantStatusActive: 0, 2893 wantStatusFailed: 1, 2894 wantStatusSucceeded: 0, 2895 }, 2896 "ignore pod failure; both rules are matching, the first is executed only": { 2897 enableJobPodFailurePolicy: true, 2898 job: batch.Job{ 2899 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2900 ObjectMeta: validObjectMeta, 2901 Spec: batch.JobSpec{ 2902 Selector: validSelector, 2903 Template: 
validTemplate, 2904 Parallelism: ptr.To[int32](1), 2905 Completions: ptr.To[int32](1), 2906 BackoffLimit: ptr.To[int32](0), 2907 PodFailurePolicy: &batch.PodFailurePolicy{ 2908 Rules: onExitCodeRules, 2909 }, 2910 }, 2911 }, 2912 pods: []v1.Pod{ 2913 { 2914 Status: v1.PodStatus{ 2915 Phase: v1.PodFailed, 2916 ContainerStatuses: []v1.ContainerStatus{ 2917 { 2918 Name: "container1", 2919 State: v1.ContainerState{ 2920 Terminated: &v1.ContainerStateTerminated{ 2921 ExitCode: 2, 2922 }, 2923 }, 2924 }, 2925 { 2926 Name: "container2", 2927 State: v1.ContainerState{ 2928 Terminated: &v1.ContainerStateTerminated{ 2929 ExitCode: 6, 2930 }, 2931 }, 2932 }, 2933 }, 2934 }, 2935 }, 2936 }, 2937 wantConditions: nil, 2938 wantStatusActive: 1, 2939 wantStatusFailed: 0, 2940 wantStatusSucceeded: 0, 2941 }, 2942 "ignore pod failure based on OnExitCodes": { 2943 enableJobPodFailurePolicy: true, 2944 job: batch.Job{ 2945 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2946 ObjectMeta: validObjectMeta, 2947 Spec: batch.JobSpec{ 2948 Selector: validSelector, 2949 Template: validTemplate, 2950 Parallelism: ptr.To[int32](1), 2951 Completions: ptr.To[int32](1), 2952 BackoffLimit: ptr.To[int32](0), 2953 PodFailurePolicy: &batch.PodFailurePolicy{ 2954 Rules: onExitCodeRules, 2955 }, 2956 }, 2957 }, 2958 pods: []v1.Pod{ 2959 { 2960 Status: v1.PodStatus{ 2961 Phase: v1.PodFailed, 2962 ContainerStatuses: []v1.ContainerStatus{ 2963 { 2964 State: v1.ContainerState{ 2965 Terminated: &v1.ContainerStateTerminated{ 2966 ExitCode: 1, 2967 }, 2968 }, 2969 }, 2970 }, 2971 }, 2972 }, 2973 }, 2974 wantConditions: nil, 2975 wantStatusActive: 1, 2976 wantStatusFailed: 0, 2977 wantStatusSucceeded: 0, 2978 }, 2979 "default job based on OnExitCodes": { 2980 enableJobPodFailurePolicy: true, 2981 job: batch.Job{ 2982 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 2983 ObjectMeta: validObjectMeta, 2984 Spec: batch.JobSpec{ 2985 Selector: validSelector, 2986 Template: validTemplate, 2987 Parallelism: ptr.To[int32](1), 2988 Completions: ptr.To[int32](1), 2989 BackoffLimit: ptr.To[int32](0), 2990 PodFailurePolicy: &batch.PodFailurePolicy{ 2991 Rules: onExitCodeRules, 2992 }, 2993 }, 2994 }, 2995 pods: []v1.Pod{ 2996 { 2997 Status: v1.PodStatus{ 2998 Phase: v1.PodFailed, 2999 ContainerStatuses: []v1.ContainerStatus{ 3000 { 3001 State: v1.ContainerState{ 3002 Terminated: &v1.ContainerStateTerminated{ 3003 ExitCode: 10, 3004 }, 3005 }, 3006 }, 3007 }, 3008 }, 3009 }, 3010 }, 3011 wantConditions: &[]batch.JobCondition{ 3012 { 3013 Type: batch.JobFailed, 3014 Status: v1.ConditionTrue, 3015 Reason: batch.JobReasonBackoffLimitExceeded, 3016 Message: "Job has reached the specified backoff limit", 3017 }, 3018 }, 3019 wantStatusActive: 0, 3020 wantStatusFailed: 1, 3021 wantStatusSucceeded: 0, 3022 }, 3023 "count pod failure based on OnExitCodes; both rules are matching, the first is executed only": { 3024 enableJobPodFailurePolicy: true, 3025 job: batch.Job{ 3026 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3027 ObjectMeta: validObjectMeta, 3028 Spec: batch.JobSpec{ 3029 Selector: validSelector, 3030 Template: validTemplate, 3031 Parallelism: ptr.To[int32](1), 3032 Completions: ptr.To[int32](1), 3033 BackoffLimit: ptr.To[int32](6), 3034 PodFailurePolicy: &batch.PodFailurePolicy{ 3035 Rules: []batch.PodFailurePolicyRule{ 3036 { 3037 Action: batch.PodFailurePolicyActionCount, 3038 OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ 3039 Operator: batch.PodFailurePolicyOnExitCodesOpIn, 3040 Values: []int32{1, 2}, 3041 }, 3042 }, 3043 { 3044 Action: 
batch.PodFailurePolicyActionIgnore, 3045 OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ 3046 Operator: batch.PodFailurePolicyOnExitCodesOpIn, 3047 Values: []int32{2, 3}, 3048 }, 3049 }, 3050 }, 3051 }, 3052 }, 3053 }, 3054 pods: []v1.Pod{ 3055 { 3056 Status: v1.PodStatus{ 3057 Phase: v1.PodFailed, 3058 ContainerStatuses: []v1.ContainerStatus{ 3059 { 3060 State: v1.ContainerState{ 3061 Terminated: &v1.ContainerStateTerminated{ 3062 ExitCode: 2, 3063 FinishedAt: testFinishedAt, 3064 }, 3065 }, 3066 }, 3067 }, 3068 }, 3069 }, 3070 }, 3071 wantConditions: nil, 3072 wantStatusActive: 1, 3073 wantStatusFailed: 1, 3074 wantStatusSucceeded: 0, 3075 }, 3076 "count pod failure based on OnPodConditions; both rules are matching, the first is executed only": { 3077 enableJobPodFailurePolicy: true, 3078 job: batch.Job{ 3079 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3080 ObjectMeta: validObjectMeta, 3081 Spec: batch.JobSpec{ 3082 Selector: validSelector, 3083 Template: validTemplate, 3084 Parallelism: ptr.To[int32](1), 3085 Completions: ptr.To[int32](1), 3086 BackoffLimit: ptr.To[int32](6), 3087 PodFailurePolicy: &batch.PodFailurePolicy{ 3088 Rules: []batch.PodFailurePolicyRule{ 3089 { 3090 Action: batch.PodFailurePolicyActionCount, 3091 OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{ 3092 { 3093 Type: v1.PodConditionType("ResourceLimitExceeded"), 3094 Status: v1.ConditionTrue, 3095 }, 3096 }, 3097 }, 3098 { 3099 Action: batch.PodFailurePolicyActionIgnore, 3100 OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{ 3101 { 3102 Type: v1.DisruptionTarget, 3103 Status: v1.ConditionTrue, 3104 }, 3105 }, 3106 }, 3107 }, 3108 }, 3109 }, 3110 }, 3111 pods: []v1.Pod{ 3112 { 3113 Status: v1.PodStatus{ 3114 Phase: v1.PodFailed, 3115 Conditions: []v1.PodCondition{ 3116 { 3117 Type: v1.PodConditionType("ResourceLimitExceeded"), 3118 Status: v1.ConditionTrue, 3119 }, 3120 { 3121 Type: v1.DisruptionTarget, 3122 Status: v1.ConditionTrue, 3123 }, 3124 }, 3125 ContainerStatuses: []v1.ContainerStatus{ 3126 { 3127 State: v1.ContainerState{ 3128 Terminated: &v1.ContainerStateTerminated{ 3129 FinishedAt: testFinishedAt, 3130 }, 3131 }, 3132 }, 3133 }, 3134 }, 3135 }, 3136 }, 3137 wantConditions: nil, 3138 wantStatusActive: 1, 3139 wantStatusFailed: 1, 3140 wantStatusSucceeded: 0, 3141 }, 3142 "ignore pod failure based on OnPodConditions": { 3143 enableJobPodFailurePolicy: true, 3144 job: batch.Job{ 3145 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3146 ObjectMeta: validObjectMeta, 3147 Spec: batch.JobSpec{ 3148 Selector: validSelector, 3149 Template: validTemplate, 3150 Parallelism: ptr.To[int32](1), 3151 Completions: ptr.To[int32](1), 3152 BackoffLimit: ptr.To[int32](0), 3153 PodFailurePolicy: &batch.PodFailurePolicy{ 3154 Rules: []batch.PodFailurePolicyRule{ 3155 { 3156 Action: batch.PodFailurePolicyActionIgnore, 3157 OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{ 3158 { 3159 Type: v1.DisruptionTarget, 3160 Status: v1.ConditionTrue, 3161 }, 3162 }, 3163 }, 3164 }, 3165 }, 3166 }, 3167 }, 3168 pods: []v1.Pod{ 3169 { 3170 Status: v1.PodStatus{ 3171 Phase: v1.PodFailed, 3172 Conditions: []v1.PodCondition{ 3173 { 3174 Type: v1.DisruptionTarget, 3175 Status: v1.ConditionTrue, 3176 }, 3177 }, 3178 }, 3179 }, 3180 }, 3181 wantConditions: nil, 3182 wantStatusActive: 1, 3183 wantStatusFailed: 0, 3184 wantStatusSucceeded: 0, 3185 }, 3186 "ignore pod failure based on OnPodConditions, ignored failures delays pod recreation": { 3187 enableJobPodFailurePolicy: true, 3188 job: batch.Job{ 
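// In this case the failed pod is still terminating (DeletionTimestamp is set) and its
// failure matches an Ignore rule, so it is neither counted as failed nor replaced yet;
// that is why both Active and Failed are expected to stay at 0 until the pod is gone.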
3189 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3190 ObjectMeta: validObjectMeta, 3191 Spec: batch.JobSpec{ 3192 Selector: validSelector, 3193 Template: validTemplate, 3194 Parallelism: ptr.To[int32](1), 3195 Completions: ptr.To[int32](1), 3196 BackoffLimit: ptr.To[int32](0), 3197 PodFailurePolicy: &batch.PodFailurePolicy{ 3198 Rules: []batch.PodFailurePolicyRule{ 3199 { 3200 Action: batch.PodFailurePolicyActionIgnore, 3201 OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{ 3202 { 3203 Type: v1.DisruptionTarget, 3204 Status: v1.ConditionTrue, 3205 }, 3206 }, 3207 }, 3208 }, 3209 }, 3210 }, 3211 }, 3212 pods: []v1.Pod{ 3213 { 3214 ObjectMeta: metav1.ObjectMeta{ 3215 DeletionTimestamp: &now, 3216 }, 3217 Status: v1.PodStatus{ 3218 Phase: v1.PodFailed, 3219 Conditions: []v1.PodCondition{ 3220 { 3221 Type: v1.DisruptionTarget, 3222 Status: v1.ConditionTrue, 3223 }, 3224 }, 3225 }, 3226 }, 3227 }, 3228 wantConditions: nil, 3229 wantStatusActive: 0, 3230 wantStatusFailed: 0, 3231 wantStatusSucceeded: 0, 3232 }, 3233 "fail job based on OnPodConditions": { 3234 enableJobPodFailurePolicy: true, 3235 job: batch.Job{ 3236 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3237 ObjectMeta: validObjectMeta, 3238 Spec: batch.JobSpec{ 3239 Selector: validSelector, 3240 Template: validTemplate, 3241 Parallelism: ptr.To[int32](1), 3242 Completions: ptr.To[int32](1), 3243 BackoffLimit: ptr.To[int32](6), 3244 PodFailurePolicy: &batch.PodFailurePolicy{ 3245 Rules: []batch.PodFailurePolicyRule{ 3246 { 3247 Action: batch.PodFailurePolicyActionFailJob, 3248 OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{ 3249 { 3250 Type: v1.DisruptionTarget, 3251 Status: v1.ConditionTrue, 3252 }, 3253 }, 3254 }, 3255 }, 3256 }, 3257 }, 3258 }, 3259 pods: []v1.Pod{ 3260 { 3261 Status: v1.PodStatus{ 3262 Phase: v1.PodFailed, 3263 Conditions: []v1.PodCondition{ 3264 { 3265 Type: v1.DisruptionTarget, 3266 Status: v1.ConditionTrue, 3267 }, 3268 }, 3269 }, 3270 }, 3271 }, 3272 wantConditions: &[]batch.JobCondition{ 3273 { 3274 Type: batch.JobFailed, 3275 Status: v1.ConditionTrue, 3276 Reason: batch.JobReasonPodFailurePolicy, 3277 Message: "Pod default/mypod-0 has condition DisruptionTarget matching FailJob rule at index 0", 3278 }, 3279 }, 3280 wantStatusActive: 0, 3281 wantStatusFailed: 1, 3282 wantStatusSucceeded: 0, 3283 }, 3284 "terminating Pod considered failed when PodDisruptionConditions is disabled": { 3285 enableJobPodFailurePolicy: true, 3286 job: batch.Job{ 3287 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3288 ObjectMeta: validObjectMeta, 3289 Spec: batch.JobSpec{ 3290 Parallelism: ptr.To[int32](1), 3291 Selector: validSelector, 3292 Template: validTemplate, 3293 BackoffLimit: ptr.To[int32](0), 3294 PodFailurePolicy: &batch.PodFailurePolicy{ 3295 Rules: []batch.PodFailurePolicyRule{ 3296 { 3297 Action: batch.PodFailurePolicyActionCount, 3298 OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{ 3299 { 3300 Type: v1.DisruptionTarget, 3301 Status: v1.ConditionTrue, 3302 }, 3303 }, 3304 }, 3305 }, 3306 }, 3307 }, 3308 }, 3309 pods: []v1.Pod{ 3310 { 3311 ObjectMeta: metav1.ObjectMeta{ 3312 DeletionTimestamp: &now, 3313 }, 3314 }, 3315 }, 3316 }, 3317 "terminating Pod not considered failed when PodDisruptionConditions is enabled": { 3318 enableJobPodFailurePolicy: true, 3319 enablePodDisruptionConditions: true, 3320 job: batch.Job{ 3321 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3322 ObjectMeta: validObjectMeta, 3323 Spec: batch.JobSpec{ 3324 Parallelism: ptr.To[int32](1), 3325 Selector: validSelector, 3326 
Template: validTemplate, 3327 BackoffLimit: ptr.To[int32](0), 3328 PodFailurePolicy: &batch.PodFailurePolicy{ 3329 Rules: []batch.PodFailurePolicyRule{ 3330 { 3331 Action: batch.PodFailurePolicyActionCount, 3332 OnPodConditions: []batch.PodFailurePolicyOnPodConditionsPattern{ 3333 { 3334 Type: v1.DisruptionTarget, 3335 Status: v1.ConditionTrue, 3336 }, 3337 }, 3338 }, 3339 }, 3340 }, 3341 }, 3342 }, 3343 pods: []v1.Pod{ 3344 { 3345 ObjectMeta: metav1.ObjectMeta{ 3346 DeletionTimestamp: &now, 3347 }, 3348 Status: v1.PodStatus{ 3349 Phase: v1.PodRunning, 3350 }, 3351 }, 3352 }, 3353 }, 3354 } 3355 for name, tc := range testCases { 3356 t.Run(name, func(t *testing.T) { 3357 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, tc.enableJobPodFailurePolicy)() 3358 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.PodDisruptionConditions, tc.enablePodDisruptionConditions)() 3359 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodReplacementPolicy, tc.enableJobPodReplacementPolicy)() 3360 3361 if tc.job.Spec.PodReplacementPolicy == nil { 3362 tc.job.Spec.PodReplacementPolicy = podReplacementPolicy(batch.Failed) 3363 } 3364 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 3365 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 3366 fakePodControl := controller.FakePodControl{} 3367 manager.podControl = &fakePodControl 3368 manager.podStoreSynced = alwaysReady 3369 manager.jobStoreSynced = alwaysReady 3370 job := &tc.job 3371 3372 actual := job 3373 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 3374 actual = job 3375 return job, nil 3376 } 3377 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 3378 for i, pod := range tc.pods { 3379 pod := pod 3380 pb := podBuilder{Pod: &pod}.name(fmt.Sprintf("mypod-%d", i)).job(job) 3381 if job.Spec.CompletionMode != nil && *job.Spec.CompletionMode == batch.IndexedCompletion { 3382 pb.index(fmt.Sprintf("%v", i)) 3383 } 3384 pb = pb.trackingFinalizer() 3385 sharedInformerFactory.Core().V1().Pods().Informer().GetIndexer().Add(pb.Pod) 3386 } 3387 3388 manager.syncJob(context.TODO(), testutil.GetKey(job, t)) 3389 3390 if tc.wantConditions != nil { 3391 for _, wantCondition := range *tc.wantConditions { 3392 conditions := getConditionsByType(actual.Status.Conditions, wantCondition.Type) 3393 if len(conditions) != 1 { 3394 t.Fatalf("Expected a single completion condition. Got %#v for type: %q", conditions, wantCondition.Type) 3395 } 3396 condition := *conditions[0] 3397 if diff := cmp.Diff(wantCondition, condition, cmpopts.IgnoreFields(batch.JobCondition{}, "LastProbeTime", "LastTransitionTime")); diff != "" { 3398 t.Errorf("Unexpected job condition (-want,+got):\n%s", diff) 3399 } 3400 } 3401 } else { 3402 if cond := hasTrueCondition(actual); cond != nil { 3403 t.Errorf("Got condition %s, want none", *cond) 3404 } 3405 } 3406 // validate status 3407 if actual.Status.Active != tc.wantStatusActive { 3408 t.Errorf("unexpected number of active pods. Expected %d, saw %d\n", tc.wantStatusActive, actual.Status.Active) 3409 } 3410 if actual.Status.Succeeded != tc.wantStatusSucceeded { 3411 t.Errorf("unexpected number of succeeded pods. 
Expected %d, saw %d\n", tc.wantStatusSucceeded, actual.Status.Succeeded) 3412 } 3413 if actual.Status.Failed != tc.wantStatusFailed { 3414 t.Errorf("unexpected number of failed pods. Expected %d, saw %d\n", tc.wantStatusFailed, actual.Status.Failed) 3415 } 3416 if ptr.Deref(actual.Status.Terminating, 0) != ptr.Deref(tc.wantStatusTerminating, 0) { 3417 t.Errorf("unexpected number of terminating pods. Expected %d, saw %d\n", ptr.Deref(tc.wantStatusTerminating, 0), ptr.Deref(actual.Status.Terminating, 0)) 3418 } 3419 }) 3420 } 3421 } 3422 3423 func TestSyncJobWithJobBackoffLimitPerIndex(t *testing.T) { 3424 _, ctx := ktesting.NewTestContext(t) 3425 now := time.Now() 3426 validObjectMeta := metav1.ObjectMeta{ 3427 Name: "foobar", 3428 UID: uuid.NewUUID(), 3429 Namespace: metav1.NamespaceDefault, 3430 } 3431 validSelector := &metav1.LabelSelector{ 3432 MatchLabels: map[string]string{"foo": "bar"}, 3433 } 3434 validTemplate := v1.PodTemplateSpec{ 3435 ObjectMeta: metav1.ObjectMeta{ 3436 Labels: map[string]string{ 3437 "foo": "bar", 3438 }, 3439 }, 3440 Spec: v1.PodSpec{ 3441 Containers: []v1.Container{ 3442 {Image: "foo/bar"}, 3443 }, 3444 }, 3445 } 3446 3447 testCases := map[string]struct { 3448 enableJobBackoffLimitPerIndex bool 3449 enableJobPodFailurePolicy bool 3450 job batch.Job 3451 pods []v1.Pod 3452 wantStatus batch.JobStatus 3453 }{ 3454 "successful job after a single failure within index": { 3455 enableJobBackoffLimitPerIndex: true, 3456 job: batch.Job{ 3457 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3458 ObjectMeta: validObjectMeta, 3459 Spec: batch.JobSpec{ 3460 Selector: validSelector, 3461 Template: validTemplate, 3462 Parallelism: ptr.To[int32](2), 3463 Completions: ptr.To[int32](2), 3464 BackoffLimit: ptr.To[int32](math.MaxInt32), 3465 CompletionMode: completionModePtr(batch.IndexedCompletion), 3466 BackoffLimitPerIndex: ptr.To[int32](1), 3467 }, 3468 }, 3469 pods: []v1.Pod{ 3470 *buildPod().uid("a1").index("0").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().Pod, 3471 *buildPod().uid("a2").index("0").phase(v1.PodSucceeded).indexFailureCount("1").trackingFinalizer().Pod, 3472 *buildPod().uid("b").index("1").phase(v1.PodSucceeded).indexFailureCount("0").trackingFinalizer().Pod, 3473 }, 3474 wantStatus: batch.JobStatus{ 3475 Failed: 1, 3476 Succeeded: 2, 3477 Terminating: ptr.To[int32](0), 3478 CompletedIndexes: "0,1", 3479 FailedIndexes: ptr.To(""), 3480 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 3481 Conditions: []batch.JobCondition{ 3482 { 3483 Type: batch.JobComplete, 3484 Status: v1.ConditionTrue, 3485 }, 3486 }, 3487 }, 3488 }, 3489 "single failed pod, not counted as the replacement pod creation is delayed": { 3490 enableJobBackoffLimitPerIndex: true, 3491 job: batch.Job{ 3492 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3493 ObjectMeta: validObjectMeta, 3494 Spec: batch.JobSpec{ 3495 Selector: validSelector, 3496 Template: validTemplate, 3497 Parallelism: ptr.To[int32](2), 3498 Completions: ptr.To[int32](2), 3499 BackoffLimit: ptr.To[int32](math.MaxInt32), 3500 CompletionMode: completionModePtr(batch.IndexedCompletion), 3501 BackoffLimitPerIndex: ptr.To[int32](1), 3502 }, 3503 }, 3504 pods: []v1.Pod{ 3505 *buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().Pod, 3506 }, 3507 wantStatus: batch.JobStatus{ 3508 Active: 2, 3509 Terminating: ptr.To[int32](0), 3510 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 3511 FailedIndexes: ptr.To(""), 3512 }, 3513 }, 3514 "single failed pod replaced already": { 
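// Pods here carry a per-index failure count (set via the builder's indexFailureCount
// helper, presumably backed by the job's index failure-count annotation). Because a
// replacement pod with a higher count already exists for index 0, the earlier failed pod
// is counted right away (Failed: 1) instead of having its finalizer removal delayed, in
// contrast to the previous case.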
3515 enableJobBackoffLimitPerIndex: true, 3516 job: batch.Job{ 3517 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3518 ObjectMeta: validObjectMeta, 3519 Spec: batch.JobSpec{ 3520 Selector: validSelector, 3521 Template: validTemplate, 3522 Parallelism: ptr.To[int32](2), 3523 Completions: ptr.To[int32](2), 3524 BackoffLimit: ptr.To[int32](math.MaxInt32), 3525 CompletionMode: completionModePtr(batch.IndexedCompletion), 3526 BackoffLimitPerIndex: ptr.To[int32](1), 3527 }, 3528 }, 3529 pods: []v1.Pod{ 3530 *buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().Pod, 3531 *buildPod().uid("b").index("0").phase(v1.PodPending).indexFailureCount("1").trackingFinalizer().Pod, 3532 }, 3533 wantStatus: batch.JobStatus{ 3534 Active: 2, 3535 Failed: 1, 3536 Terminating: ptr.To[int32](0), 3537 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 3538 FailedIndexes: ptr.To(""), 3539 }, 3540 }, 3541 "single failed index due to exceeding the backoff limit per index, the job continues": { 3542 enableJobBackoffLimitPerIndex: true, 3543 job: batch.Job{ 3544 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3545 ObjectMeta: validObjectMeta, 3546 Spec: batch.JobSpec{ 3547 Selector: validSelector, 3548 Template: validTemplate, 3549 Parallelism: ptr.To[int32](2), 3550 Completions: ptr.To[int32](2), 3551 BackoffLimit: ptr.To[int32](math.MaxInt32), 3552 CompletionMode: completionModePtr(batch.IndexedCompletion), 3553 BackoffLimitPerIndex: ptr.To[int32](1), 3554 }, 3555 }, 3556 pods: []v1.Pod{ 3557 *buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("1").trackingFinalizer().Pod, 3558 }, 3559 wantStatus: batch.JobStatus{ 3560 Active: 1, 3561 Failed: 1, 3562 FailedIndexes: ptr.To("0"), 3563 Terminating: ptr.To[int32](0), 3564 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 3565 }, 3566 }, 3567 "single failed index due to FailIndex action, the job continues": { 3568 enableJobBackoffLimitPerIndex: true, 3569 enableJobPodFailurePolicy: true, 3570 job: batch.Job{ 3571 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3572 ObjectMeta: validObjectMeta, 3573 Spec: batch.JobSpec{ 3574 Selector: validSelector, 3575 Template: validTemplate, 3576 Parallelism: ptr.To[int32](2), 3577 Completions: ptr.To[int32](2), 3578 BackoffLimit: ptr.To[int32](math.MaxInt32), 3579 CompletionMode: completionModePtr(batch.IndexedCompletion), 3580 BackoffLimitPerIndex: ptr.To[int32](1), 3581 PodFailurePolicy: &batch.PodFailurePolicy{ 3582 Rules: []batch.PodFailurePolicyRule{ 3583 { 3584 Action: batch.PodFailurePolicyActionFailIndex, 3585 OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ 3586 Operator: batch.PodFailurePolicyOnExitCodesOpIn, 3587 Values: []int32{3}, 3588 }, 3589 }, 3590 }, 3591 }, 3592 }, 3593 }, 3594 pods: []v1.Pod{ 3595 *buildPod().uid("a").index("0").status(v1.PodStatus{ 3596 Phase: v1.PodFailed, 3597 ContainerStatuses: []v1.ContainerStatus{ 3598 { 3599 State: v1.ContainerState{ 3600 Terminated: &v1.ContainerStateTerminated{ 3601 ExitCode: 3, 3602 }, 3603 }, 3604 }, 3605 }, 3606 }).indexFailureCount("0").trackingFinalizer().Pod, 3607 }, 3608 wantStatus: batch.JobStatus{ 3609 Active: 1, 3610 Failed: 1, 3611 FailedIndexes: ptr.To("0"), 3612 Terminating: ptr.To[int32](0), 3613 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 3614 }, 3615 }, 3616 "job failed index due to FailJob action": { 3617 enableJobBackoffLimitPerIndex: true, 3618 enableJobPodFailurePolicy: true, 3619 job: batch.Job{ 3620 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3621 ObjectMeta: validObjectMeta, 
3622 Spec: batch.JobSpec{ 3623 Selector: validSelector, 3624 Template: validTemplate, 3625 Parallelism: ptr.To[int32](2), 3626 Completions: ptr.To[int32](2), 3627 BackoffLimit: ptr.To[int32](6), 3628 CompletionMode: completionModePtr(batch.IndexedCompletion), 3629 BackoffLimitPerIndex: ptr.To[int32](1), 3630 PodFailurePolicy: &batch.PodFailurePolicy{ 3631 Rules: []batch.PodFailurePolicyRule{ 3632 { 3633 Action: batch.PodFailurePolicyActionFailJob, 3634 OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ 3635 Operator: batch.PodFailurePolicyOnExitCodesOpIn, 3636 Values: []int32{3}, 3637 }, 3638 }, 3639 }, 3640 }, 3641 }, 3642 }, 3643 pods: []v1.Pod{ 3644 *buildPod().uid("a").index("0").status(v1.PodStatus{ 3645 Phase: v1.PodFailed, 3646 ContainerStatuses: []v1.ContainerStatus{ 3647 { 3648 Name: "x", 3649 State: v1.ContainerState{ 3650 Terminated: &v1.ContainerStateTerminated{ 3651 ExitCode: 3, 3652 }, 3653 }, 3654 }, 3655 }, 3656 }).indexFailureCount("0").trackingFinalizer().Pod, 3657 }, 3658 wantStatus: batch.JobStatus{ 3659 Active: 0, 3660 Failed: 1, 3661 FailedIndexes: ptr.To(""), 3662 Terminating: ptr.To[int32](0), 3663 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 3664 Conditions: []batch.JobCondition{ 3665 { 3666 Type: batch.JobFailureTarget, 3667 Status: v1.ConditionTrue, 3668 Reason: batch.JobReasonPodFailurePolicy, 3669 Message: "Container x for pod default/mypod-0 failed with exit code 3 matching FailJob rule at index 0", 3670 }, 3671 { 3672 Type: batch.JobFailed, 3673 Status: v1.ConditionTrue, 3674 Reason: batch.JobReasonPodFailurePolicy, 3675 Message: "Container x for pod default/mypod-0 failed with exit code 3 matching FailJob rule at index 0", 3676 }, 3677 }, 3678 }, 3679 }, 3680 "job pod failure ignored due to matching Ignore action": { 3681 enableJobBackoffLimitPerIndex: true, 3682 enableJobPodFailurePolicy: true, 3683 job: batch.Job{ 3684 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3685 ObjectMeta: validObjectMeta, 3686 Spec: batch.JobSpec{ 3687 Selector: validSelector, 3688 Template: validTemplate, 3689 Parallelism: ptr.To[int32](2), 3690 Completions: ptr.To[int32](2), 3691 BackoffLimit: ptr.To[int32](6), 3692 CompletionMode: completionModePtr(batch.IndexedCompletion), 3693 BackoffLimitPerIndex: ptr.To[int32](1), 3694 PodFailurePolicy: &batch.PodFailurePolicy{ 3695 Rules: []batch.PodFailurePolicyRule{ 3696 { 3697 Action: batch.PodFailurePolicyActionIgnore, 3698 OnExitCodes: &batch.PodFailurePolicyOnExitCodesRequirement{ 3699 Operator: batch.PodFailurePolicyOnExitCodesOpIn, 3700 Values: []int32{3}, 3701 }, 3702 }, 3703 }, 3704 }, 3705 }, 3706 }, 3707 pods: []v1.Pod{ 3708 *buildPod().uid("a").index("0").status(v1.PodStatus{ 3709 Phase: v1.PodFailed, 3710 ContainerStatuses: []v1.ContainerStatus{ 3711 { 3712 Name: "x", 3713 State: v1.ContainerState{ 3714 Terminated: &v1.ContainerStateTerminated{ 3715 ExitCode: 3, 3716 }, 3717 }, 3718 }, 3719 }, 3720 }).indexFailureCount("0").trackingFinalizer().Pod, 3721 }, 3722 wantStatus: batch.JobStatus{ 3723 Active: 2, 3724 Failed: 0, 3725 FailedIndexes: ptr.To(""), 3726 Terminating: ptr.To[int32](0), 3727 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 3728 }, 3729 }, 3730 "job failed due to exceeding backoffLimit before backoffLimitPerIndex": { 3731 enableJobBackoffLimitPerIndex: true, 3732 job: batch.Job{ 3733 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3734 ObjectMeta: validObjectMeta, 3735 Spec: batch.JobSpec{ 3736 Selector: validSelector, 3737 Template: validTemplate, 3738 Parallelism: ptr.To[int32](2), 3739 
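// Note: spec.backoffLimit just below is 1 while backoffLimitPerIndex is also 1, so the
// two pod failures across different indexes trip the global backoff limit first; the
// expected outcome is a BackoffLimitExceeded condition rather than failed indexes.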
Completions: ptr.To[int32](2), 3740 BackoffLimit: ptr.To[int32](1), 3741 CompletionMode: completionModePtr(batch.IndexedCompletion), 3742 BackoffLimitPerIndex: ptr.To[int32](1), 3743 }, 3744 }, 3745 pods: []v1.Pod{ 3746 *buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().Pod, 3747 *buildPod().uid("b").index("1").phase(v1.PodFailed).indexFailureCount("0").trackingFinalizer().Pod, 3748 }, 3749 wantStatus: batch.JobStatus{ 3750 Failed: 2, 3751 Succeeded: 0, 3752 FailedIndexes: ptr.To(""), 3753 Terminating: ptr.To[int32](0), 3754 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 3755 Conditions: []batch.JobCondition{ 3756 { 3757 Type: batch.JobFailed, 3758 Status: v1.ConditionTrue, 3759 Reason: batch.JobReasonBackoffLimitExceeded, 3760 Message: "Job has reached the specified backoff limit", 3761 }, 3762 }, 3763 }, 3764 }, 3765 "job failed due to failed indexes": { 3766 enableJobBackoffLimitPerIndex: true, 3767 job: batch.Job{ 3768 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3769 ObjectMeta: validObjectMeta, 3770 Spec: batch.JobSpec{ 3771 Selector: validSelector, 3772 Template: validTemplate, 3773 Parallelism: ptr.To[int32](2), 3774 Completions: ptr.To[int32](2), 3775 BackoffLimit: ptr.To[int32](math.MaxInt32), 3776 CompletionMode: completionModePtr(batch.IndexedCompletion), 3777 BackoffLimitPerIndex: ptr.To[int32](1), 3778 }, 3779 }, 3780 pods: []v1.Pod{ 3781 *buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("1").trackingFinalizer().Pod, 3782 *buildPod().uid("b").index("1").phase(v1.PodSucceeded).indexFailureCount("0").trackingFinalizer().Pod, 3783 }, 3784 wantStatus: batch.JobStatus{ 3785 Failed: 1, 3786 Succeeded: 1, 3787 Terminating: ptr.To[int32](0), 3788 FailedIndexes: ptr.To("0"), 3789 CompletedIndexes: "1", 3790 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 3791 Conditions: []batch.JobCondition{ 3792 { 3793 Type: batch.JobFailed, 3794 Status: v1.ConditionTrue, 3795 Reason: batch.JobReasonFailedIndexes, 3796 Message: "Job has failed indexes", 3797 }, 3798 }, 3799 }, 3800 }, 3801 "job failed due to exceeding max failed indexes": { 3802 enableJobBackoffLimitPerIndex: true, 3803 job: batch.Job{ 3804 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3805 ObjectMeta: validObjectMeta, 3806 Spec: batch.JobSpec{ 3807 Selector: validSelector, 3808 Template: validTemplate, 3809 Parallelism: ptr.To[int32](4), 3810 Completions: ptr.To[int32](4), 3811 BackoffLimit: ptr.To[int32](math.MaxInt32), 3812 CompletionMode: completionModePtr(batch.IndexedCompletion), 3813 BackoffLimitPerIndex: ptr.To[int32](1), 3814 MaxFailedIndexes: ptr.To[int32](1), 3815 }, 3816 }, 3817 pods: []v1.Pod{ 3818 *buildPod().uid("a").index("0").phase(v1.PodFailed).indexFailureCount("1").trackingFinalizer().Pod, 3819 *buildPod().uid("b").index("1").phase(v1.PodSucceeded).indexFailureCount("0").trackingFinalizer().Pod, 3820 *buildPod().uid("c").index("2").phase(v1.PodFailed).indexFailureCount("1").trackingFinalizer().Pod, 3821 *buildPod().uid("d").index("3").phase(v1.PodRunning).indexFailureCount("0").trackingFinalizer().Pod, 3822 }, 3823 wantStatus: batch.JobStatus{ 3824 Failed: 3, 3825 Succeeded: 1, 3826 Terminating: ptr.To[int32](0), 3827 FailedIndexes: ptr.To("0,2"), 3828 CompletedIndexes: "1", 3829 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 3830 Conditions: []batch.JobCondition{ 3831 { 3832 Type: batch.JobFailed, 3833 Status: v1.ConditionTrue, 3834 Reason: batch.JobReasonMaxFailedIndexesExceeded, 3835 Message: "Job has exceeded the specified 
maximal number of failed indexes", 3836 }, 3837 }, 3838 }, 3839 }, 3840 "job with finished indexes; failedIndexes are cleaned when JobBackoffLimitPerIndex disabled": { 3841 enableJobBackoffLimitPerIndex: false, 3842 job: batch.Job{ 3843 TypeMeta: metav1.TypeMeta{Kind: "Job"}, 3844 ObjectMeta: validObjectMeta, 3845 Spec: batch.JobSpec{ 3846 Selector: validSelector, 3847 Template: validTemplate, 3848 Parallelism: ptr.To[int32](3), 3849 Completions: ptr.To[int32](3), 3850 BackoffLimit: ptr.To[int32](math.MaxInt32), 3851 CompletionMode: completionModePtr(batch.IndexedCompletion), 3852 BackoffLimitPerIndex: ptr.To[int32](1), 3853 }, 3854 Status: batch.JobStatus{ 3855 FailedIndexes: ptr.To("0"), 3856 CompletedIndexes: "1", 3857 }, 3858 }, 3859 pods: []v1.Pod{ 3860 *buildPod().uid("c").index("2").phase(v1.PodPending).indexFailureCount("1").trackingFinalizer().Pod, 3861 }, 3862 wantStatus: batch.JobStatus{ 3863 Active: 2, 3864 Succeeded: 1, 3865 Terminating: ptr.To[int32](0), 3866 CompletedIndexes: "1", 3867 UncountedTerminatedPods: &batch.UncountedTerminatedPods{}, 3868 }, 3869 }, 3870 } 3871 for name, tc := range testCases { 3872 t.Run(name, func(t *testing.T) { 3873 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, tc.enableJobBackoffLimitPerIndex)() 3874 defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, tc.enableJobPodFailurePolicy)() 3875 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 3876 fakeClock := clocktesting.NewFakeClock(now) 3877 manager, sharedInformerFactory := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClock) 3878 fakePodControl := controller.FakePodControl{} 3879 manager.podControl = &fakePodControl 3880 manager.podStoreSynced = alwaysReady 3881 manager.jobStoreSynced = alwaysReady 3882 job := &tc.job 3883 3884 actual := job 3885 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 3886 actual = job 3887 return job, nil 3888 } 3889 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 3890 for i, pod := range tc.pods { 3891 pod := pod 3892 pb := podBuilder{Pod: &pod}.name(fmt.Sprintf("mypod-%d", i)).job(job) 3893 if job.Spec.CompletionMode != nil && *job.Spec.CompletionMode == batch.IndexedCompletion { 3894 pb.index(fmt.Sprintf("%v", getCompletionIndex(pod.Annotations))) 3895 } 3896 pb = pb.trackingFinalizer() 3897 sharedInformerFactory.Core().V1().Pods().Informer().GetIndexer().Add(pb.Pod) 3898 } 3899 3900 manager.syncJob(context.TODO(), testutil.GetKey(job, t)) 3901 3902 // validate relevant fields of the status 3903 if diff := cmp.Diff(tc.wantStatus, actual.Status, 3904 cmpopts.IgnoreFields(batch.JobStatus{}, "StartTime", "CompletionTime", "Ready"), 3905 cmpopts.IgnoreFields(batch.JobCondition{}, "LastProbeTime", "LastTransitionTime")); diff != "" { 3906 t.Errorf("unexpected job status. 
Diff: %s\n", diff) 3907 } 3908 }) 3909 } 3910 } 3911 3912 func TestSyncJobUpdateRequeue(t *testing.T) { 3913 _, ctx := ktesting.NewTestContext(t) 3914 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 3915 cases := map[string]struct { 3916 updateErr error 3917 wantRequeued bool 3918 }{ 3919 "no error": { 3920 wantRequeued: false, 3921 }, 3922 "generic error": { 3923 updateErr: fmt.Errorf("update error"), 3924 wantRequeued: true, 3925 }, 3926 "conflict error": { 3927 updateErr: apierrors.NewConflict(schema.GroupResource{}, "", nil), 3928 wantRequeued: true, 3929 }, 3930 } 3931 for name, tc := range cases { 3932 t.Run(name, func(t *testing.T) { 3933 t.Cleanup(setDurationDuringTest(&DefaultJobApiBackOff, fastJobApiBackoff)) 3934 fakeClient := clocktesting.NewFakeClock(time.Now()) 3935 manager, sharedInformerFactory := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClient) 3936 fakePodControl := controller.FakePodControl{} 3937 manager.podControl = &fakePodControl 3938 manager.podStoreSynced = alwaysReady 3939 manager.jobStoreSynced = alwaysReady 3940 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 3941 return job, tc.updateErr 3942 } 3943 job := newJob(2, 2, 6, batch.NonIndexedCompletion) 3944 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 3945 manager.queue.Add(testutil.GetKey(job, t)) 3946 manager.processNextWorkItem(context.TODO()) 3947 if tc.wantRequeued { 3948 verifyEmptyQueueAndAwaitForQueueLen(ctx, t, manager, 1) 3949 } else { 3950 // We advance the clock to make sure there are not items awaiting 3951 // to be added into the queue. We also sleep a little to give the 3952 // delaying queue time to move the potential items from pre-queue 3953 // into the queue asynchronously. 
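// manager.clock is the fake clock injected above, so Sleep here advances the
// fake time past the (shortened) DefaultJobApiBackOff instead of blocking;
// only the short real sleep below actually waits for the background goroutine.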
3954 manager.clock.Sleep(fastJobApiBackoff) 3955 time.Sleep(time.Millisecond) 3956 verifyEmptyQueue(ctx, t, manager) 3957 } 3958 }) 3959 } 3960 } 3961 3962 func TestUpdateJobRequeue(t *testing.T) { 3963 logger, ctx := ktesting.NewTestContext(t) 3964 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 3965 cases := map[string]struct { 3966 oldJob *batch.Job 3967 updateFn func(job *batch.Job) 3968 wantRequeuedImmediately bool 3969 }{ 3970 "spec update": { 3971 oldJob: newJob(1, 1, 1, batch.IndexedCompletion), 3972 updateFn: func(job *batch.Job) { 3973 job.Spec.Suspend = ptr.To(false) 3974 job.Generation++ 3975 }, 3976 wantRequeuedImmediately: true, 3977 }, 3978 "status update": { 3979 oldJob: newJob(1, 1, 1, batch.IndexedCompletion), 3980 updateFn: func(job *batch.Job) { 3981 job.Status.StartTime = &metav1.Time{Time: time.Now()} 3982 }, 3983 wantRequeuedImmediately: false, 3984 }, 3985 } 3986 for name, tc := range cases { 3987 t.Run(name, func(t *testing.T) { 3988 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 3989 manager.podStoreSynced = alwaysReady 3990 manager.jobStoreSynced = alwaysReady 3991 3992 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(tc.oldJob) 3993 newJob := tc.oldJob.DeepCopy() 3994 if tc.updateFn != nil { 3995 tc.updateFn(newJob) 3996 } 3997 manager.updateJob(logger, tc.oldJob, newJob) 3998 gotRequeuedImmediately := manager.queue.Len() > 0 3999 if tc.wantRequeuedImmediately != gotRequeuedImmediately { 4000 t.Fatalf("Want immediate requeue: %v, got immediate requeue: %v", tc.wantRequeuedImmediately, gotRequeuedImmediately) 4001 } 4002 }) 4003 } 4004 } 4005 4006 func TestGetPodCreationInfoForIndependentIndexes(t *testing.T) { 4007 logger, ctx := ktesting.NewTestContext(t) 4008 now := time.Now() 4009 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4010 cases := map[string]struct { 4011 indexesToAdd []int 4012 podsWithDelayedDeletionPerIndex map[int]*v1.Pod 4013 wantIndexesToAdd []int 4014 wantRemainingTime time.Duration 4015 }{ 4016 "simple index creation": { 4017 indexesToAdd: []int{1, 3}, 4018 wantIndexesToAdd: []int{1, 3}, 4019 }, 4020 "subset of indexes can be recreated now": { 4021 indexesToAdd: []int{1, 3}, 4022 podsWithDelayedDeletionPerIndex: map[int]*v1.Pod{ 4023 1: buildPod().indexFailureCount("0").index("1").customDeletionTimestamp(now).Pod, 4024 }, 4025 wantIndexesToAdd: []int{3}, 4026 }, 4027 "subset of indexes can be recreated now as the pods failed long time ago": { 4028 indexesToAdd: []int{1, 3}, 4029 podsWithDelayedDeletionPerIndex: map[int]*v1.Pod{ 4030 1: buildPod().indexFailureCount("0").customDeletionTimestamp(now).Pod, 4031 3: buildPod().indexFailureCount("0").customDeletionTimestamp(now.Add(-DefaultJobPodFailureBackOff)).Pod, 4032 }, 4033 wantIndexesToAdd: []int{3}, 4034 }, 4035 "no indexes can be recreated now, need to wait default pod failure backoff": { 4036 indexesToAdd: []int{1, 2, 3}, 4037 podsWithDelayedDeletionPerIndex: map[int]*v1.Pod{ 4038 1: buildPod().indexFailureCount("1").customDeletionTimestamp(now).Pod, 4039 2: buildPod().indexFailureCount("0").customDeletionTimestamp(now).Pod, 4040 3: buildPod().indexFailureCount("2").customDeletionTimestamp(now).Pod, 4041 }, 4042 wantRemainingTime: DefaultJobPodFailureBackOff, 
4043 }, 4044 "no indexes can be recreated now, need to wait but 1s already passed": { 4045 indexesToAdd: []int{1, 2, 3}, 4046 podsWithDelayedDeletionPerIndex: map[int]*v1.Pod{ 4047 1: buildPod().indexFailureCount("1").customDeletionTimestamp(now.Add(-time.Second)).Pod, 4048 2: buildPod().indexFailureCount("0").customDeletionTimestamp(now.Add(-time.Second)).Pod, 4049 3: buildPod().indexFailureCount("2").customDeletionTimestamp(now.Add(-time.Second)).Pod, 4050 }, 4051 wantRemainingTime: DefaultJobPodFailureBackOff - time.Second, 4052 }, 4053 } 4054 for name, tc := range cases { 4055 t.Run(name, func(t *testing.T) { 4056 fakeClock := clocktesting.NewFakeClock(now) 4057 manager, _ := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClock) 4058 gotIndexesToAdd, gotRemainingTime := manager.getPodCreationInfoForIndependentIndexes(logger, tc.indexesToAdd, tc.podsWithDelayedDeletionPerIndex) 4059 if diff := cmp.Diff(tc.wantIndexesToAdd, gotIndexesToAdd); diff != "" { 4060 t.Fatalf("Unexpected indexes to add: %s", diff) 4061 } 4062 if diff := cmp.Diff(tc.wantRemainingTime, gotRemainingTime); diff != "" { 4063 t.Fatalf("Unexpected remaining time: %s", diff) 4064 } 4065 }) 4066 } 4067 } 4068 4069 func TestJobPodLookup(t *testing.T) { 4070 _, ctx := ktesting.NewTestContext(t) 4071 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4072 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 4073 manager.podStoreSynced = alwaysReady 4074 manager.jobStoreSynced = alwaysReady 4075 testCases := []struct { 4076 job *batch.Job 4077 pod *v1.Pod 4078 4079 expectedName string 4080 }{ 4081 // pods without labels don't match any job 4082 { 4083 job: &batch.Job{ 4084 ObjectMeta: metav1.ObjectMeta{Name: "basic"}, 4085 }, 4086 pod: &v1.Pod{ 4087 ObjectMeta: metav1.ObjectMeta{Name: "foo1", Namespace: metav1.NamespaceAll}, 4088 }, 4089 expectedName: "", 4090 }, 4091 // matching labels, different namespace 4092 { 4093 job: &batch.Job{ 4094 ObjectMeta: metav1.ObjectMeta{Name: "foo"}, 4095 Spec: batch.JobSpec{ 4096 Selector: &metav1.LabelSelector{ 4097 MatchLabels: map[string]string{"foo": "bar"}, 4098 }, 4099 }, 4100 }, 4101 pod: &v1.Pod{ 4102 ObjectMeta: metav1.ObjectMeta{ 4103 Name: "foo2", 4104 Namespace: "ns", 4105 Labels: map[string]string{"foo": "bar"}, 4106 }, 4107 }, 4108 expectedName: "", 4109 }, 4110 // matching ns and labels returns 4111 { 4112 job: &batch.Job{ 4113 ObjectMeta: metav1.ObjectMeta{Name: "bar", Namespace: "ns"}, 4114 Spec: batch.JobSpec{ 4115 Selector: &metav1.LabelSelector{ 4116 MatchExpressions: []metav1.LabelSelectorRequirement{ 4117 { 4118 Key: "foo", 4119 Operator: metav1.LabelSelectorOpIn, 4120 Values: []string{"bar"}, 4121 }, 4122 }, 4123 }, 4124 }, 4125 }, 4126 pod: &v1.Pod{ 4127 ObjectMeta: metav1.ObjectMeta{ 4128 Name: "foo3", 4129 Namespace: "ns", 4130 Labels: map[string]string{"foo": "bar"}, 4131 }, 4132 }, 4133 expectedName: "bar", 4134 }, 4135 } 4136 for _, tc := range testCases { 4137 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(tc.job) 4138 if jobs := manager.getPodJobs(tc.pod); len(jobs) > 0 { 4139 if got, want := len(jobs), 1; got != want { 4140 t.Errorf("len(jobs) = %v, want %v", got, want) 4141 } 4142 job := jobs[0] 4143 if tc.expectedName != job.Name { 4144 t.Errorf("Got job %+v expected %+v", job.Name, tc.expectedName) 4145 } 4146 } else if 
tc.expectedName != "" { 4147 t.Errorf("Expected a job %v pod %v, found none", tc.expectedName, tc.pod.Name) 4148 } 4149 } 4150 } 4151 4152 func TestGetPodsForJob(t *testing.T) { 4153 _, ctx := ktesting.NewTestContext(t) 4154 job := newJob(1, 1, 6, batch.NonIndexedCompletion) 4155 job.Name = "test_job" 4156 otherJob := newJob(1, 1, 6, batch.NonIndexedCompletion) 4157 otherJob.Name = "other_job" 4158 cases := map[string]struct { 4159 jobDeleted bool 4160 jobDeletedInCache bool 4161 pods []*v1.Pod 4162 wantPods []string 4163 wantPodsFinalizer []string 4164 }{ 4165 "only matching": { 4166 pods: []*v1.Pod{ 4167 buildPod().name("pod1").job(job).trackingFinalizer().Pod, 4168 buildPod().name("pod2").job(otherJob).Pod, 4169 buildPod().name("pod3").ns(job.Namespace).Pod, 4170 buildPod().name("pod4").job(job).Pod, 4171 }, 4172 wantPods: []string{"pod1", "pod4"}, 4173 wantPodsFinalizer: []string{"pod1"}, 4174 }, 4175 "adopt": { 4176 pods: []*v1.Pod{ 4177 buildPod().name("pod1").job(job).Pod, 4178 buildPod().name("pod2").job(job).clearOwner().Pod, 4179 buildPod().name("pod3").job(otherJob).Pod, 4180 }, 4181 wantPods: []string{"pod1", "pod2"}, 4182 wantPodsFinalizer: []string{"pod2"}, 4183 }, 4184 "no adopt when deleting": { 4185 jobDeleted: true, 4186 jobDeletedInCache: true, 4187 pods: []*v1.Pod{ 4188 buildPod().name("pod1").job(job).Pod, 4189 buildPod().name("pod2").job(job).clearOwner().Pod, 4190 }, 4191 wantPods: []string{"pod1"}, 4192 }, 4193 "no adopt when deleting race": { 4194 jobDeleted: true, 4195 pods: []*v1.Pod{ 4196 buildPod().name("pod1").job(job).Pod, 4197 buildPod().name("pod2").job(job).clearOwner().Pod, 4198 }, 4199 wantPods: []string{"pod1"}, 4200 }, 4201 "release": { 4202 pods: []*v1.Pod{ 4203 buildPod().name("pod1").job(job).Pod, 4204 buildPod().name("pod2").job(job).clearLabels().Pod, 4205 }, 4206 wantPods: []string{"pod1"}, 4207 }, 4208 } 4209 for name, tc := range cases { 4210 t.Run(name, func(t *testing.T) { 4211 job := job.DeepCopy() 4212 if tc.jobDeleted { 4213 job.DeletionTimestamp = &metav1.Time{} 4214 } 4215 clientSet := fake.NewSimpleClientset(job, otherJob) 4216 jm, informer := newControllerFromClient(ctx, t, clientSet, controller.NoResyncPeriodFunc) 4217 jm.podStoreSynced = alwaysReady 4218 jm.jobStoreSynced = alwaysReady 4219 cachedJob := job.DeepCopy() 4220 if tc.jobDeletedInCache { 4221 cachedJob.DeletionTimestamp = &metav1.Time{} 4222 } 4223 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(cachedJob) 4224 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(otherJob) 4225 for _, p := range tc.pods { 4226 informer.Core().V1().Pods().Informer().GetIndexer().Add(p) 4227 } 4228 4229 pods, err := jm.getPodsForJob(context.TODO(), job) 4230 if err != nil { 4231 t.Fatalf("getPodsForJob() error: %v", err) 4232 } 4233 got := make([]string, len(pods)) 4234 var gotFinalizer []string 4235 for i, p := range pods { 4236 got[i] = p.Name 4237 if hasJobTrackingFinalizer(p) { 4238 gotFinalizer = append(gotFinalizer, p.Name) 4239 } 4240 } 4241 sort.Strings(got) 4242 if diff := cmp.Diff(tc.wantPods, got); diff != "" { 4243 t.Errorf("getPodsForJob() returned (-want,+got):\n%s", diff) 4244 } 4245 sort.Strings(gotFinalizer) 4246 if diff := cmp.Diff(tc.wantPodsFinalizer, gotFinalizer); diff != "" { 4247 t.Errorf("Pods with finalizers (-want,+got):\n%s", diff) 4248 } 4249 }) 4250 } 4251 } 4252 4253 func TestAddPod(t *testing.T) { 4254 t.Cleanup(setDurationDuringTest(&syncJobBatchPeriod, fastSyncJobBatchPeriod)) 4255 _, ctx := ktesting.NewTestContext(t) 4256 logger := 
klog.FromContext(ctx) 4257 4258 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4259 fakeClock := clocktesting.NewFakeClock(time.Now()) 4260 jm, informer := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClock) 4261 jm.podStoreSynced = alwaysReady 4262 jm.jobStoreSynced = alwaysReady 4263 4264 job1 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4265 job1.Name = "job1" 4266 job2 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4267 job2.Name = "job2" 4268 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job1) 4269 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job2) 4270 4271 pod1 := newPod("pod1", job1) 4272 pod2 := newPod("pod2", job2) 4273 informer.Core().V1().Pods().Informer().GetIndexer().Add(pod1) 4274 informer.Core().V1().Pods().Informer().GetIndexer().Add(pod2) 4275 4276 jm.addPod(logger, pod1) 4277 verifyEmptyQueueAndAwaitForQueueLen(ctx, t, jm, 1) 4278 key, done := jm.queue.Get() 4279 if key == nil || done { 4280 t.Fatalf("failed to enqueue controller for pod %v", pod1.Name) 4281 } 4282 expectedKey, _ := controller.KeyFunc(job1) 4283 if got, want := key.(string), expectedKey; got != want { 4284 t.Errorf("queue.Get() = %v, want %v", got, want) 4285 } 4286 4287 jm.addPod(logger, pod2) 4288 verifyEmptyQueueAndAwaitForQueueLen(ctx, t, jm, 1) 4289 key, done = jm.queue.Get() 4290 if key == nil || done { 4291 t.Fatalf("failed to enqueue controller for pod %v", pod2.Name) 4292 } 4293 expectedKey, _ = controller.KeyFunc(job2) 4294 if got, want := key.(string), expectedKey; got != want { 4295 t.Errorf("queue.Get() = %v, want %v", got, want) 4296 } 4297 } 4298 4299 func TestAddPodOrphan(t *testing.T) { 4300 t.Cleanup(setDurationDuringTest(&syncJobBatchPeriod, fastSyncJobBatchPeriod)) 4301 logger, ctx := ktesting.NewTestContext(t) 4302 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4303 fakeClock := clocktesting.NewFakeClock(time.Now()) 4304 jm, informer := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClock) 4305 jm.podStoreSynced = alwaysReady 4306 jm.jobStoreSynced = alwaysReady 4307 4308 job1 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4309 job1.Name = "job1" 4310 job2 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4311 job2.Name = "job2" 4312 job3 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4313 job3.Name = "job3" 4314 job3.Spec.Selector.MatchLabels = map[string]string{"other": "labels"} 4315 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job1) 4316 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job2) 4317 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job3) 4318 4319 pod1 := newPod("pod1", job1) 4320 // Make pod an orphan. Expect all matching controllers to be queued. 
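// job1 and job2 share the same pod selector, while job3 was given a different
// one above, so the orphaned pod matches exactly two Jobs and the expected
// queue length below is 2.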
4321 pod1.OwnerReferences = nil 4322 informer.Core().V1().Pods().Informer().GetIndexer().Add(pod1) 4323 4324 jm.addPod(logger, pod1) 4325 verifyEmptyQueueAndAwaitForQueueLen(ctx, t, jm, 2) 4326 } 4327 4328 func TestUpdatePod(t *testing.T) { 4329 t.Cleanup(setDurationDuringTest(&syncJobBatchPeriod, fastSyncJobBatchPeriod)) 4330 _, ctx := ktesting.NewTestContext(t) 4331 logger := klog.FromContext(ctx) 4332 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4333 fakeClock := clocktesting.NewFakeClock(time.Now()) 4334 jm, informer := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClock) 4335 jm.podStoreSynced = alwaysReady 4336 jm.jobStoreSynced = alwaysReady 4337 4338 job1 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4339 job1.Name = "job1" 4340 job2 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4341 job2.Name = "job2" 4342 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job1) 4343 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job2) 4344 4345 pod1 := newPod("pod1", job1) 4346 pod2 := newPod("pod2", job2) 4347 informer.Core().V1().Pods().Informer().GetIndexer().Add(pod1) 4348 informer.Core().V1().Pods().Informer().GetIndexer().Add(pod2) 4349 4350 prev := *pod1 4351 bumpResourceVersion(pod1) 4352 jm.updatePod(logger, &prev, pod1) 4353 verifyEmptyQueueAndAwaitForQueueLen(ctx, t, jm, 1) 4354 key, done := jm.queue.Get() 4355 if key == nil || done { 4356 t.Fatalf("failed to enqueue controller for pod %v", pod1.Name) 4357 } 4358 expectedKey, _ := controller.KeyFunc(job1) 4359 if got, want := key.(string), expectedKey; got != want { 4360 t.Errorf("queue.Get() = %v, want %v", got, want) 4361 } 4362 4363 prev = *pod2 4364 bumpResourceVersion(pod2) 4365 jm.updatePod(logger, &prev, pod2) 4366 verifyEmptyQueueAndAwaitForQueueLen(ctx, t, jm, 1) 4367 key, done = jm.queue.Get() 4368 if key == nil || done { 4369 t.Fatalf("failed to enqueue controller for pod %v", pod2.Name) 4370 } 4371 expectedKey, _ = controller.KeyFunc(job2) 4372 if got, want := key.(string), expectedKey; got != want { 4373 t.Errorf("queue.Get() = %v, want %v", got, want) 4374 } 4375 } 4376 4377 func TestUpdatePodOrphanWithNewLabels(t *testing.T) { 4378 t.Cleanup(setDurationDuringTest(&syncJobBatchPeriod, fastSyncJobBatchPeriod)) 4379 logger, ctx := ktesting.NewTestContext(t) 4380 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4381 fakeClock := clocktesting.NewFakeClock(time.Now()) 4382 jm, informer := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClock) 4383 jm.podStoreSynced = alwaysReady 4384 jm.jobStoreSynced = alwaysReady 4385 4386 job1 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4387 job1.Name = "job1" 4388 job2 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4389 job2.Name = "job2" 4390 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job1) 4391 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job2) 4392 4393 pod1 := newPod("pod1", job1) 4394 pod1.OwnerReferences = nil 4395 informer.Core().V1().Pods().Informer().GetIndexer().Add(pod1) 4396 4397 // Labels changed on orphan. Expect newly matching controllers to queue. 
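// The old pod carries labels that match no Job, while the current pod keeps
// the labels shared by job1 and job2, so updatePod enqueues both Jobs.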
4398 prev := *pod1 4399 prev.Labels = map[string]string{"foo2": "bar2"} 4400 bumpResourceVersion(pod1) 4401 jm.updatePod(logger, &prev, pod1) 4402 verifyEmptyQueueAndAwaitForQueueLen(ctx, t, jm, 2) 4403 } 4404 4405 func TestUpdatePodChangeControllerRef(t *testing.T) { 4406 t.Cleanup(setDurationDuringTest(&syncJobBatchPeriod, fastSyncJobBatchPeriod)) 4407 _, ctx := ktesting.NewTestContext(t) 4408 logger := klog.FromContext(ctx) 4409 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4410 fakeClock := clocktesting.NewFakeClock(time.Now()) 4411 jm, informer := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClock) 4412 jm.podStoreSynced = alwaysReady 4413 jm.jobStoreSynced = alwaysReady 4414 4415 job1 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4416 job1.Name = "job1" 4417 job2 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4418 job2.Name = "job2" 4419 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job1) 4420 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job2) 4421 4422 pod1 := newPod("pod1", job1) 4423 informer.Core().V1().Pods().Informer().GetIndexer().Add(pod1) 4424 4425 // Changed ControllerRef. Expect both old and new to queue. 4426 prev := *pod1 4427 prev.OwnerReferences = []metav1.OwnerReference{*metav1.NewControllerRef(job2, controllerKind)} 4428 bumpResourceVersion(pod1) 4429 jm.updatePod(logger, &prev, pod1) 4430 verifyEmptyQueueAndAwaitForQueueLen(ctx, t, jm, 2) 4431 } 4432 4433 func TestUpdatePodRelease(t *testing.T) { 4434 t.Cleanup(setDurationDuringTest(&syncJobBatchPeriod, fastSyncJobBatchPeriod)) 4435 _, ctx := ktesting.NewTestContext(t) 4436 logger := klog.FromContext(ctx) 4437 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4438 fakeClock := clocktesting.NewFakeClock(time.Now()) 4439 jm, informer := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClock) 4440 jm.podStoreSynced = alwaysReady 4441 jm.jobStoreSynced = alwaysReady 4442 4443 job1 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4444 job1.Name = "job1" 4445 job2 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4446 job2.Name = "job2" 4447 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job1) 4448 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job2) 4449 4450 pod1 := newPod("pod1", job1) 4451 informer.Core().V1().Pods().Informer().GetIndexer().Add(pod1) 4452 4453 // Remove ControllerRef. Expect all matching to queue for adoption. 
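// Only the current pod drops its owner, making it an orphan eligible for
// adoption; both job1 and job2 match its labels, hence the expected queue
// length of 2.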
4454 prev := *pod1 4455 pod1.OwnerReferences = nil 4456 bumpResourceVersion(pod1) 4457 jm.updatePod(logger, &prev, pod1) 4458 verifyEmptyQueueAndAwaitForQueueLen(ctx, t, jm, 2) 4459 } 4460 4461 func TestDeletePod(t *testing.T) { 4462 t.Cleanup(setDurationDuringTest(&syncJobBatchPeriod, fastSyncJobBatchPeriod)) 4463 logger, ctx := ktesting.NewTestContext(t) 4464 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4465 fakeClock := clocktesting.NewFakeClock(time.Now()) 4466 jm, informer := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClock) 4467 jm.podStoreSynced = alwaysReady 4468 jm.jobStoreSynced = alwaysReady 4469 4470 job1 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4471 job1.Name = "job1" 4472 job2 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4473 job2.Name = "job2" 4474 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job1) 4475 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job2) 4476 4477 pod1 := newPod("pod1", job1) 4478 pod2 := newPod("pod2", job2) 4479 informer.Core().V1().Pods().Informer().GetIndexer().Add(pod1) 4480 informer.Core().V1().Pods().Informer().GetIndexer().Add(pod2) 4481 4482 jm.deletePod(logger, pod1, true) 4483 verifyEmptyQueueAndAwaitForQueueLen(ctx, t, jm, 1) 4484 key, done := jm.queue.Get() 4485 if key == nil || done { 4486 t.Fatalf("failed to enqueue controller for pod %v", pod1.Name) 4487 } 4488 expectedKey, _ := controller.KeyFunc(job1) 4489 if got, want := key.(string), expectedKey; got != want { 4490 t.Errorf("queue.Get() = %v, want %v", got, want) 4491 } 4492 4493 jm.deletePod(logger, pod2, true) 4494 verifyEmptyQueueAndAwaitForQueueLen(ctx, t, jm, 1) 4495 key, done = jm.queue.Get() 4496 if key == nil || done { 4497 t.Fatalf("failed to enqueue controller for pod %v", pod2.Name) 4498 } 4499 expectedKey, _ = controller.KeyFunc(job2) 4500 if got, want := key.(string), expectedKey; got != want { 4501 t.Errorf("queue.Get() = %v, want %v", got, want) 4502 } 4503 } 4504 4505 func TestDeletePodOrphan(t *testing.T) { 4506 // Disable batching of pod updates to show it does not get requeued at all 4507 t.Cleanup(setDurationDuringTest(&syncJobBatchPeriod, 0)) 4508 logger, ctx := ktesting.NewTestContext(t) 4509 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4510 jm, informer := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 4511 jm.podStoreSynced = alwaysReady 4512 jm.jobStoreSynced = alwaysReady 4513 4514 job1 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4515 job1.Name = "job1" 4516 job2 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4517 job2.Name = "job2" 4518 job3 := newJob(1, 1, 6, batch.NonIndexedCompletion) 4519 job3.Name = "job3" 4520 job3.Spec.Selector.MatchLabels = map[string]string{"other": "labels"} 4521 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job1) 4522 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job2) 4523 informer.Batch().V1().Jobs().Informer().GetIndexer().Add(job3) 4524 4525 pod1 := newPod("pod1", job1) 4526 pod1.OwnerReferences = nil 4527 informer.Core().V1().Pods().Informer().GetIndexer().Add(pod1) 4528 4529 jm.deletePod(logger, pod1, true) 4530 if got, want := jm.queue.Len(), 0; got != want { 4531 t.Fatalf("queue.Len() = %v, want %v", got, want) 4532 } 4533 } 4534 4535 type 
FakeJobExpectations struct { 4536 *controller.ControllerExpectations 4537 satisfied bool 4538 expSatisfied func() 4539 } 4540 4541 func (fe FakeJobExpectations) SatisfiedExpectations(logger klog.Logger, controllerKey string) bool { 4542 fe.expSatisfied() 4543 return fe.satisfied 4544 } 4545 4546 // TestSyncJobExpectations tests that a pod cannot sneak in between counting active pods 4547 // and checking expectations. 4548 func TestSyncJobExpectations(t *testing.T) { 4549 _, ctx := ktesting.NewTestContext(t) 4550 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4551 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 4552 fakePodControl := controller.FakePodControl{} 4553 manager.podControl = &fakePodControl 4554 manager.podStoreSynced = alwaysReady 4555 manager.jobStoreSynced = alwaysReady 4556 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 4557 return job, nil 4558 } 4559 4560 job := newJob(2, 2, 6, batch.NonIndexedCompletion) 4561 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 4562 pods := newPodList(2, v1.PodPending, job) 4563 podIndexer := sharedInformerFactory.Core().V1().Pods().Informer().GetIndexer() 4564 podIndexer.Add(pods[0]) 4565 4566 manager.expectations = FakeJobExpectations{ 4567 controller.NewControllerExpectations(), true, func() { 4568 // If we check active pods before checking expectations, the job 4569 // will create a new replica because it doesn't see this pod, but 4570 // has fulfilled its expectations. 4571 podIndexer.Add(pods[1]) 4572 }, 4573 } 4574 manager.syncJob(context.TODO(), testutil.GetKey(job, t)) 4575 if len(fakePodControl.Templates) != 0 { 4576 t.Errorf("Unexpected number of creates. Expected %d, saw %d\n", 0, len(fakePodControl.Templates)) 4577 } 4578 if len(fakePodControl.DeletePodName) != 0 { 4579 t.Errorf("Unexpected number of deletes. Expected %d, saw %d\n", 0, len(fakePodControl.DeletePodName)) 4580 } 4581 } 4582 4583 func TestWatchJobs(t *testing.T) { 4584 _, ctx := ktesting.NewTestContext(t) 4585 clientset := fake.NewSimpleClientset() 4586 fakeWatch := watch.NewFake() 4587 clientset.PrependWatchReactor("jobs", core.DefaultWatchReactor(fakeWatch, nil)) 4588 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 4589 manager.podStoreSynced = alwaysReady 4590 manager.jobStoreSynced = alwaysReady 4591 4592 var testJob batch.Job 4593 received := make(chan struct{}) 4594 4595 // The update sent through the fakeWatcher should make its way into the workqueue, 4596 // and eventually into the syncHandler. 4597 manager.syncHandler = func(ctx context.Context, key string) error { 4598 defer close(received) 4599 ns, name, err := cache.SplitMetaNamespaceKey(key) 4600 if err != nil { 4601 t.Errorf("Error getting namespace/name from key %v: %v", key, err) 4602 } 4603 job, err := manager.jobLister.Jobs(ns).Get(name) 4604 if err != nil || job == nil { 4605 t.Errorf("Expected to find job under key %v: %v", key, err) 4606 return nil 4607 } 4608 if !apiequality.Semantic.DeepDerivative(*job, testJob) { 4609 t.Errorf("Expected %#v, but got %#v", testJob, *job) 4610 } 4611 return nil 4612 } 4613 // Start only the job watcher and the workqueue, send a watch event, 4614 // and make sure it hits the sync method. 
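// Receiving on the received channel below is what ends the test; the stubbed
// syncHandler closes it once the Job's key reaches the sync path.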
4615 stopCh := make(chan struct{}) 4616 defer close(stopCh) 4617 sharedInformerFactory.Start(stopCh) 4618 go manager.Run(context.TODO(), 1) 4619 4620 // We're sending new job to see if it reaches syncHandler. 4621 testJob.Namespace = "bar" 4622 testJob.Name = "foo" 4623 fakeWatch.Add(&testJob) 4624 t.Log("Waiting for job to reach syncHandler") 4625 <-received 4626 } 4627 4628 func TestWatchPods(t *testing.T) { 4629 _, ctx := ktesting.NewTestContext(t) 4630 testJob := newJob(2, 2, 6, batch.NonIndexedCompletion) 4631 clientset := fake.NewSimpleClientset(testJob) 4632 fakeWatch := watch.NewFake() 4633 clientset.PrependWatchReactor("pods", core.DefaultWatchReactor(fakeWatch, nil)) 4634 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 4635 manager.podStoreSynced = alwaysReady 4636 manager.jobStoreSynced = alwaysReady 4637 4638 // Put one job and one pod into the store 4639 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(testJob) 4640 received := make(chan struct{}) 4641 // The pod update sent through the fakeWatcher should figure out the managing job and 4642 // send it into the syncHandler. 4643 manager.syncHandler = func(ctx context.Context, key string) error { 4644 ns, name, err := cache.SplitMetaNamespaceKey(key) 4645 if err != nil { 4646 t.Errorf("Error getting namespace/name from key %v: %v", key, err) 4647 } 4648 job, err := manager.jobLister.Jobs(ns).Get(name) 4649 if err != nil { 4650 t.Errorf("Expected to find job under key %v: %v", key, err) 4651 } 4652 if !apiequality.Semantic.DeepDerivative(job, testJob) { 4653 t.Errorf("\nExpected %#v,\nbut got %#v", testJob, job) 4654 close(received) 4655 return nil 4656 } 4657 close(received) 4658 return nil 4659 } 4660 // Start only the pod watcher and the workqueue, send a watch event, 4661 // and make sure it hits the sync method for the right job. 4662 stopCh := make(chan struct{}) 4663 defer close(stopCh) 4664 go sharedInformerFactory.Core().V1().Pods().Informer().Run(stopCh) 4665 go manager.Run(context.TODO(), 1) 4666 4667 pods := newPodList(1, v1.PodRunning, testJob) 4668 testPod := pods[0] 4669 testPod.Status.Phase = v1.PodFailed 4670 fakeWatch.Add(testPod) 4671 4672 t.Log("Waiting for pod to reach syncHandler") 4673 <-received 4674 } 4675 4676 func TestWatchOrphanPods(t *testing.T) { 4677 _, ctx := ktesting.NewTestContext(t) 4678 clientset := fake.NewSimpleClientset() 4679 sharedInformers := informers.NewSharedInformerFactory(clientset, controller.NoResyncPeriodFunc()) 4680 manager, err := NewController(ctx, sharedInformers.Core().V1().Pods(), sharedInformers.Batch().V1().Jobs(), clientset) 4681 if err != nil { 4682 t.Fatalf("Error creating Job controller: %v", err) 4683 } 4684 manager.podStoreSynced = alwaysReady 4685 manager.jobStoreSynced = alwaysReady 4686 4687 stopCh := make(chan struct{}) 4688 defer close(stopCh) 4689 podInformer := sharedInformers.Core().V1().Pods().Informer() 4690 go podInformer.Run(stopCh) 4691 cache.WaitForCacheSync(stopCh, podInformer.HasSynced) 4692 go manager.Run(context.TODO(), 1) 4693 4694 // Create job but don't add it to the store. 
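// Every case builds a pod that carries the Job tracking finalizer and a
// deletion timestamp; the finalizer must be stripped whether the pod has no
// owner at all, references a Job that cannot be found, or belongs to a Job
// that has already finished.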
4695 cases := map[string]struct { 4696 job *batch.Job 4697 inCache bool 4698 }{ 4699 "job_does_not_exist": { 4700 job: newJob(2, 2, 6, batch.NonIndexedCompletion), 4701 }, 4702 "orphan": {}, 4703 "job_finished": { 4704 job: func() *batch.Job { 4705 j := newJob(2, 2, 6, batch.NonIndexedCompletion) 4706 j.Status.Conditions = append(j.Status.Conditions, batch.JobCondition{ 4707 Type: batch.JobComplete, 4708 Status: v1.ConditionTrue, 4709 }) 4710 return j 4711 }(), 4712 inCache: true, 4713 }, 4714 } 4715 for name, tc := range cases { 4716 t.Run(name, func(t *testing.T) { 4717 if tc.inCache { 4718 if err := sharedInformers.Batch().V1().Jobs().Informer().GetIndexer().Add(tc.job); err != nil { 4719 t.Fatalf("Failed to insert job in index: %v", err) 4720 } 4721 t.Cleanup(func() { 4722 sharedInformers.Batch().V1().Jobs().Informer().GetIndexer().Delete(tc.job) 4723 }) 4724 } 4725 4726 podBuilder := buildPod().name(name).deletionTimestamp().trackingFinalizer() 4727 if tc.job != nil { 4728 podBuilder = podBuilder.job(tc.job) 4729 } 4730 orphanPod := podBuilder.Pod 4731 orphanPod, err := clientset.CoreV1().Pods("default").Create(context.Background(), orphanPod, metav1.CreateOptions{}) 4732 if err != nil { 4733 t.Fatalf("Creating orphan pod: %v", err) 4734 } 4735 4736 if err := wait.PollUntilContextTimeout(ctx, 100*time.Millisecond, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) { 4737 p, err := clientset.CoreV1().Pods(orphanPod.Namespace).Get(context.Background(), orphanPod.Name, metav1.GetOptions{}) 4738 if err != nil { 4739 return false, err 4740 } 4741 return !hasJobTrackingFinalizer(p), nil 4742 }); err != nil { 4743 t.Errorf("Waiting for Pod to get the finalizer removed: %v", err) 4744 } 4745 }) 4746 } 4747 } 4748 4749 func bumpResourceVersion(obj metav1.Object) { 4750 ver, _ := strconv.ParseInt(obj.GetResourceVersion(), 10, 32) 4751 obj.SetResourceVersion(strconv.FormatInt(ver+1, 10)) 4752 } 4753 4754 func TestJobApiBackoffReset(t *testing.T) { 4755 t.Cleanup(setDurationDuringTest(&DefaultJobApiBackOff, fastJobApiBackoff)) 4756 _, ctx := ktesting.NewTestContext(t) 4757 4758 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4759 fakeClock := clocktesting.NewFakeClock(time.Now()) 4760 manager, sharedInformerFactory := newControllerFromClientWithClock(ctx, t, clientset, controller.NoResyncPeriodFunc, fakeClock) 4761 fakePodControl := controller.FakePodControl{} 4762 manager.podControl = &fakePodControl 4763 manager.podStoreSynced = alwaysReady 4764 manager.jobStoreSynced = alwaysReady 4765 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 4766 return job, nil 4767 } 4768 4769 job := newJob(1, 1, 2, batch.NonIndexedCompletion) 4770 key := testutil.GetKey(job, t) 4771 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 4772 4773 // error returned make the key requeued 4774 fakePodControl.Err = errors.New("Controller error") 4775 manager.queue.Add(key) 4776 manager.processNextWorkItem(context.TODO()) 4777 retries := manager.queue.NumRequeues(key) 4778 if retries != 1 { 4779 t.Fatalf("%s: expected exactly 1 retry, got %d", job.Name, retries) 4780 } 4781 // await for the actual requeue after processing of the pending queue is done 4782 awaitForQueueLen(ctx, t, manager, 1) 4783 4784 // the queue is emptied on success 4785 fakePodControl.Err = nil 4786 
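// A successful sync calls Forget on the key, so no retry is scheduled and the
// queue should stay empty afterwards.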
manager.processNextWorkItem(context.TODO()) 4787 verifyEmptyQueue(ctx, t, manager) 4788 } 4789 4790 var _ workqueue.RateLimitingInterface = &fakeRateLimitingQueue{} 4791 4792 type fakeRateLimitingQueue struct { 4793 workqueue.Interface 4794 requeues int 4795 item interface{} 4796 duration time.Duration 4797 } 4798 4799 func (f *fakeRateLimitingQueue) AddRateLimited(item interface{}) {} 4800 func (f *fakeRateLimitingQueue) Forget(item interface{}) { 4801 f.requeues = 0 4802 } 4803 func (f *fakeRateLimitingQueue) NumRequeues(item interface{}) int { 4804 return f.requeues 4805 } 4806 func (f *fakeRateLimitingQueue) AddAfter(item interface{}, duration time.Duration) { 4807 f.item = item 4808 f.duration = duration 4809 } 4810 4811 func TestJobBackoff(t *testing.T) { 4812 _, ctx := ktesting.NewTestContext(t) 4813 logger := klog.FromContext(ctx) 4814 job := newJob(1, 1, 1, batch.NonIndexedCompletion) 4815 oldPod := newPod(fmt.Sprintf("pod-%v", rand.String(10)), job) 4816 oldPod.ResourceVersion = "1" 4817 newPod := oldPod.DeepCopy() 4818 newPod.ResourceVersion = "2" 4819 4820 testCases := map[string]struct { 4821 requeues int 4822 oldPodPhase v1.PodPhase 4823 phase v1.PodPhase 4824 wantBackoff time.Duration 4825 }{ 4826 "failure with pod updates batching": { 4827 requeues: 0, 4828 phase: v1.PodFailed, 4829 wantBackoff: syncJobBatchPeriod, 4830 }, 4831 } 4832 4833 for name, tc := range testCases { 4834 t.Run(name, func(t *testing.T) { 4835 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4836 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 4837 fakePodControl := controller.FakePodControl{} 4838 manager.podControl = &fakePodControl 4839 manager.podStoreSynced = alwaysReady 4840 manager.jobStoreSynced = alwaysReady 4841 queue := &fakeRateLimitingQueue{} 4842 manager.queue = queue 4843 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 4844 4845 queue.requeues = tc.requeues 4846 newPod.Status.Phase = tc.phase 4847 oldPod.Status.Phase = v1.PodRunning 4848 if tc.oldPodPhase != "" { 4849 oldPod.Status.Phase = tc.oldPodPhase 4850 } 4851 manager.updatePod(logger, oldPod, newPod) 4852 if queue.duration != tc.wantBackoff { 4853 t.Errorf("unexpected backoff %v, expected %v", queue.duration, tc.wantBackoff) 4854 } 4855 }) 4856 } 4857 } 4858 4859 func TestJobBackoffForOnFailure(t *testing.T) { 4860 _, ctx := ktesting.NewTestContext(t) 4861 jobConditionComplete := batch.JobComplete 4862 jobConditionFailed := batch.JobFailed 4863 jobConditionSuspended := batch.JobSuspended 4864 4865 testCases := map[string]struct { 4866 // job setup 4867 parallelism int32 4868 completions int32 4869 backoffLimit int32 4870 suspend bool 4871 4872 // pod setup 4873 restartCounts []int32 4874 podPhase v1.PodPhase 4875 4876 // expectations 4877 expectedActive int32 4878 expectedSucceeded int32 4879 expectedFailed int32 4880 expectedCondition *batch.JobConditionType 4881 expectedConditionReason string 4882 }{ 4883 "backoffLimit 0 should have 1 pod active": { 4884 1, 1, 0, 4885 false, []int32{0}, v1.PodRunning, 4886 1, 0, 0, nil, "", 4887 }, 4888 "backoffLimit 1 with restartCount 0 should have 1 pod active": { 4889 1, 1, 1, 4890 false, []int32{0}, v1.PodRunning, 4891 1, 0, 0, nil, "", 4892 }, 4893 "backoffLimit 1 with restartCount 1 and podRunning should have 0 pod active": { 4894 1, 1, 1, 4895 false, []int32{1}, v1.PodRunning, 
4896 0, 0, 1, &jobConditionFailed, "BackoffLimitExceeded", 4897 }, 4898 "backoffLimit 1 with restartCount 1 and podPending should have 0 pod active": { 4899 1, 1, 1, 4900 false, []int32{1}, v1.PodPending, 4901 0, 0, 1, &jobConditionFailed, "BackoffLimitExceeded", 4902 }, 4903 "too many job failures with podRunning - single pod": { 4904 1, 5, 2, 4905 false, []int32{2}, v1.PodRunning, 4906 0, 0, 1, &jobConditionFailed, "BackoffLimitExceeded", 4907 }, 4908 "too many job failures with podPending - single pod": { 4909 1, 5, 2, 4910 false, []int32{2}, v1.PodPending, 4911 0, 0, 1, &jobConditionFailed, "BackoffLimitExceeded", 4912 }, 4913 "too many job failures with podRunning - multiple pods": { 4914 2, 5, 2, 4915 false, []int32{1, 1}, v1.PodRunning, 4916 0, 0, 2, &jobConditionFailed, "BackoffLimitExceeded", 4917 }, 4918 "too many job failures with podPending - multiple pods": { 4919 2, 5, 2, 4920 false, []int32{1, 1}, v1.PodPending, 4921 0, 0, 2, &jobConditionFailed, "BackoffLimitExceeded", 4922 }, 4923 "not enough failures": { 4924 2, 5, 3, 4925 false, []int32{1, 1}, v1.PodRunning, 4926 2, 0, 0, nil, "", 4927 }, 4928 "suspending a job": { 4929 2, 4, 6, 4930 true, []int32{1, 1}, v1.PodRunning, 4931 0, 0, 0, &jobConditionSuspended, "JobSuspended", 4932 }, 4933 "finshed job": { 4934 2, 4, 6, 4935 true, []int32{1, 1, 2, 0}, v1.PodSucceeded, 4936 0, 4, 0, &jobConditionComplete, "", 4937 }, 4938 } 4939 4940 for name, tc := range testCases { 4941 t.Run(name, func(t *testing.T) { 4942 // job manager setup 4943 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 4944 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 4945 fakePodControl := controller.FakePodControl{} 4946 manager.podControl = &fakePodControl 4947 manager.podStoreSynced = alwaysReady 4948 manager.jobStoreSynced = alwaysReady 4949 var actual *batch.Job 4950 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 4951 actual = job 4952 return job, nil 4953 } 4954 4955 // job & pods setup 4956 job := newJob(tc.parallelism, tc.completions, tc.backoffLimit, batch.NonIndexedCompletion) 4957 job.Spec.Template.Spec.RestartPolicy = v1.RestartPolicyOnFailure 4958 job.Spec.Suspend = ptr.To(tc.suspend) 4959 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 4960 podIndexer := sharedInformerFactory.Core().V1().Pods().Informer().GetIndexer() 4961 for i, pod := range newPodList(len(tc.restartCounts), tc.podPhase, job) { 4962 pod.Status.ContainerStatuses = []v1.ContainerStatus{{RestartCount: tc.restartCounts[i]}} 4963 podIndexer.Add(pod) 4964 } 4965 4966 // run 4967 err := manager.syncJob(context.TODO(), testutil.GetKey(job, t)) 4968 4969 if err != nil { 4970 t.Errorf("unexpected error syncing job. Got %#v", err) 4971 } 4972 // validate status 4973 if actual.Status.Active != tc.expectedActive { 4974 t.Errorf("unexpected number of active pods. Expected %d, saw %d\n", tc.expectedActive, actual.Status.Active) 4975 } 4976 if actual.Status.Succeeded != tc.expectedSucceeded { 4977 t.Errorf("unexpected number of succeeded pods. Expected %d, saw %d\n", tc.expectedSucceeded, actual.Status.Succeeded) 4978 } 4979 if actual.Status.Failed != tc.expectedFailed { 4980 t.Errorf("unexpected number of failed pods. 
Expected %d, saw %d\n", tc.expectedFailed, actual.Status.Failed) 4981 } 4982 // validate conditions 4983 if tc.expectedCondition != nil && !getCondition(actual, *tc.expectedCondition, v1.ConditionTrue, tc.expectedConditionReason) { 4984 t.Errorf("expected completion condition. Got %#v", actual.Status.Conditions) 4985 } 4986 }) 4987 } 4988 } 4989 4990 func TestJobBackoffOnRestartPolicyNever(t *testing.T) { 4991 _, ctx := ktesting.NewTestContext(t) 4992 jobConditionFailed := batch.JobFailed 4993 4994 testCases := map[string]struct { 4995 // job setup 4996 parallelism int32 4997 completions int32 4998 backoffLimit int32 4999 5000 // pod setup 5001 activePodsPhase v1.PodPhase 5002 activePods int 5003 failedPods int 5004 5005 // expectations 5006 expectedActive int32 5007 expectedSucceeded int32 5008 expectedFailed int32 5009 expectedCondition *batch.JobConditionType 5010 expectedConditionReason string 5011 }{ 5012 "not enough failures with backoffLimit 0 - single pod": { 5013 1, 1, 0, 5014 v1.PodRunning, 1, 0, 5015 1, 0, 0, nil, "", 5016 }, 5017 "not enough failures with backoffLimit 1 - single pod": { 5018 1, 1, 1, 5019 "", 0, 1, 5020 1, 0, 1, nil, "", 5021 }, 5022 "too many failures with backoffLimit 1 - single pod": { 5023 1, 1, 1, 5024 "", 0, 2, 5025 0, 0, 2, &jobConditionFailed, "BackoffLimitExceeded", 5026 }, 5027 "not enough failures with backoffLimit 6 - multiple pods": { 5028 2, 2, 6, 5029 v1.PodRunning, 1, 6, 5030 2, 0, 6, nil, "", 5031 }, 5032 "too many failures with backoffLimit 6 - multiple pods": { 5033 2, 2, 6, 5034 "", 0, 7, 5035 0, 0, 7, &jobConditionFailed, "BackoffLimitExceeded", 5036 }, 5037 } 5038 5039 for name, tc := range testCases { 5040 t.Run(name, func(t *testing.T) { 5041 // job manager setup 5042 clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}}) 5043 manager, sharedInformerFactory := newControllerFromClient(ctx, t, clientset, controller.NoResyncPeriodFunc) 5044 fakePodControl := controller.FakePodControl{} 5045 manager.podControl = &fakePodControl 5046 manager.podStoreSynced = alwaysReady 5047 manager.jobStoreSynced = alwaysReady 5048 var actual *batch.Job 5049 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 5050 actual = job 5051 return job, nil 5052 } 5053 5054 // job & pods setup 5055 job := newJob(tc.parallelism, tc.completions, tc.backoffLimit, batch.NonIndexedCompletion) 5056 job.Spec.Template.Spec.RestartPolicy = v1.RestartPolicyNever 5057 sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 5058 podIndexer := sharedInformerFactory.Core().V1().Pods().Informer().GetIndexer() 5059 for _, pod := range newPodList(tc.failedPods, v1.PodFailed, job) { 5060 pod.Status.ContainerStatuses = []v1.ContainerStatus{{State: v1.ContainerState{Terminated: &v1.ContainerStateTerminated{ 5061 FinishedAt: testFinishedAt, 5062 }}}} 5063 podIndexer.Add(pod) 5064 } 5065 for _, pod := range newPodList(tc.activePods, tc.activePodsPhase, job) { 5066 podIndexer.Add(pod) 5067 } 5068 5069 // run 5070 err := manager.syncJob(context.TODO(), testutil.GetKey(job, t)) 5071 if err != nil { 5072 t.Fatalf("unexpected error syncing job: %#v\n", err) 5073 } 5074 // validate status 5075 if actual.Status.Active != tc.expectedActive { 5076 t.Errorf("unexpected number of active pods. 
Expected %d, saw %d\n", tc.expectedActive, actual.Status.Active) 5077 } 5078 if actual.Status.Succeeded != tc.expectedSucceeded { 5079 t.Errorf("unexpected number of succeeded pods. Expected %d, saw %d\n", tc.expectedSucceeded, actual.Status.Succeeded) 5080 } 5081 if actual.Status.Failed != tc.expectedFailed { 5082 t.Errorf("unexpected number of failed pods. Expected %d, saw %d\n", tc.expectedFailed, actual.Status.Failed) 5083 } 5084 // validate conditions 5085 if tc.expectedCondition != nil && !getCondition(actual, *tc.expectedCondition, v1.ConditionTrue, tc.expectedConditionReason) { 5086 t.Errorf("expected completion condition. Got %#v", actual.Status.Conditions) 5087 } 5088 }) 5089 } 5090 } 5091 5092 func TestEnsureJobConditions(t *testing.T) { 5093 testCases := []struct { 5094 name string 5095 haveList []batch.JobCondition 5096 wantType batch.JobConditionType 5097 wantStatus v1.ConditionStatus 5098 wantReason string 5099 expectList []batch.JobCondition 5100 expectUpdate bool 5101 }{ 5102 { 5103 name: "append true condition", 5104 haveList: []batch.JobCondition{}, 5105 wantType: batch.JobSuspended, 5106 wantStatus: v1.ConditionTrue, 5107 wantReason: "foo", 5108 expectList: []batch.JobCondition{*newCondition(batch.JobSuspended, v1.ConditionTrue, "foo", "", realClock.Now())}, 5109 expectUpdate: true, 5110 }, 5111 { 5112 name: "append false condition", 5113 haveList: []batch.JobCondition{}, 5114 wantType: batch.JobSuspended, 5115 wantStatus: v1.ConditionFalse, 5116 wantReason: "foo", 5117 expectList: []batch.JobCondition{}, 5118 expectUpdate: false, 5119 }, 5120 { 5121 name: "update true condition reason", 5122 haveList: []batch.JobCondition{*newCondition(batch.JobSuspended, v1.ConditionTrue, "foo", "", realClock.Now())}, 5123 wantType: batch.JobSuspended, 5124 wantStatus: v1.ConditionTrue, 5125 wantReason: "bar", 5126 expectList: []batch.JobCondition{*newCondition(batch.JobSuspended, v1.ConditionTrue, "bar", "", realClock.Now())}, 5127 expectUpdate: true, 5128 }, 5129 { 5130 name: "update true condition status", 5131 haveList: []batch.JobCondition{*newCondition(batch.JobSuspended, v1.ConditionTrue, "foo", "", realClock.Now())}, 5132 wantType: batch.JobSuspended, 5133 wantStatus: v1.ConditionFalse, 5134 wantReason: "foo", 5135 expectList: []batch.JobCondition{*newCondition(batch.JobSuspended, v1.ConditionFalse, "foo", "", realClock.Now())}, 5136 expectUpdate: true, 5137 }, 5138 { 5139 name: "update false condition status", 5140 haveList: []batch.JobCondition{*newCondition(batch.JobSuspended, v1.ConditionFalse, "foo", "", realClock.Now())}, 5141 wantType: batch.JobSuspended, 5142 wantStatus: v1.ConditionTrue, 5143 wantReason: "foo", 5144 expectList: []batch.JobCondition{*newCondition(batch.JobSuspended, v1.ConditionTrue, "foo", "", realClock.Now())}, 5145 expectUpdate: true, 5146 }, 5147 { 5148 name: "condition already exists", 5149 haveList: []batch.JobCondition{*newCondition(batch.JobSuspended, v1.ConditionTrue, "foo", "", realClock.Now())}, 5150 wantType: batch.JobSuspended, 5151 wantStatus: v1.ConditionTrue, 5152 wantReason: "foo", 5153 expectList: []batch.JobCondition{*newCondition(batch.JobSuspended, v1.ConditionTrue, "foo", "", realClock.Now())}, 5154 expectUpdate: false, 5155 }, 5156 } 5157 for _, tc := range testCases { 5158 t.Run(tc.name, func(t *testing.T) { 5159 gotList, isUpdated := ensureJobConditionStatus(tc.haveList, tc.wantType, tc.wantStatus, tc.wantReason, "", realClock.Now()) 5160 if isUpdated != tc.expectUpdate { 5161 t.Errorf("Got isUpdated=%v, want %v", isUpdated, 
tc.expectUpdate) 5162 } 5163 if len(gotList) != len(tc.expectList) { 5164 t.Errorf("got a list of length %d, want %d", len(gotList), len(tc.expectList)) 5165 } 5166 if diff := cmp.Diff(tc.expectList, gotList, cmpopts.IgnoreFields(batch.JobCondition{}, "LastProbeTime", "LastTransitionTime")); diff != "" { 5167 t.Errorf("Unexpected JobCondition list: (-want,+got):\n%s", diff) 5168 } 5169 }) 5170 } 5171 } 5172 5173 func TestFinalizersRemovedExpectations(t *testing.T) { 5174 _, ctx := ktesting.NewTestContext(t) 5175 clientset := fake.NewSimpleClientset() 5176 sharedInformers := informers.NewSharedInformerFactory(clientset, controller.NoResyncPeriodFunc()) 5177 manager, err := NewController(ctx, sharedInformers.Core().V1().Pods(), sharedInformers.Batch().V1().Jobs(), clientset) 5178 if err != nil { 5179 t.Fatalf("Error creating Job controller: %v", err) 5180 } 5181 manager.podStoreSynced = alwaysReady 5182 manager.jobStoreSynced = alwaysReady 5183 manager.podControl = &controller.FakePodControl{Err: errors.New("fake pod controller error")} 5184 manager.updateStatusHandler = func(ctx context.Context, job *batch.Job) (*batch.Job, error) { 5185 return job, nil 5186 } 5187 5188 job := newJob(2, 2, 6, batch.NonIndexedCompletion) 5189 sharedInformers.Batch().V1().Jobs().Informer().GetIndexer().Add(job) 5190 pods := append(newPodList(2, v1.PodSucceeded, job), newPodList(2, v1.PodFailed, job)...) 5191 podInformer := sharedInformers.Core().V1().Pods().Informer() 5192 podIndexer := podInformer.GetIndexer() 5193 uids := sets.New[string]() 5194 for i := range pods { 5195 clientset.Tracker().Add(pods[i]) 5196 podIndexer.Add(pods[i]) 5197 uids.Insert(string(pods[i].UID)) 5198 } 5199 jobKey := testutil.GetKey(job, t) 5200 5201 manager.syncJob(context.TODO(), jobKey) 5202 gotExpectedUIDs := manager.finalizerExpectations.getExpectedUIDs(jobKey) 5203 if len(gotExpectedUIDs) != 0 { 5204 t.Errorf("Got unwanted expectations for removed finalizers after first syncJob with client failures:\n%s", sets.List(gotExpectedUIDs)) 5205 } 5206 5207 // Remove failures and re-sync. 5208 manager.podControl.(*controller.FakePodControl).Err = nil 5209 manager.syncJob(context.TODO(), jobKey) 5210 gotExpectedUIDs = manager.finalizerExpectations.getExpectedUIDs(jobKey) 5211 if diff := cmp.Diff(uids, gotExpectedUIDs); diff != "" { 5212 t.Errorf("Different expectations for removed finalizers after syncJob (-want,+got):\n%s", diff) 5213 } 5214 5215 stopCh := make(chan struct{}) 5216 defer close(stopCh) 5217 go sharedInformers.Core().V1().Pods().Informer().Run(stopCh) 5218 cache.WaitForCacheSync(stopCh, podInformer.HasSynced) 5219 5220 // Make sure the first syncJob sets the expectations, even after the caches synced. 5221 gotExpectedUIDs = manager.finalizerExpectations.getExpectedUIDs(jobKey) 5222 if diff := cmp.Diff(uids, gotExpectedUIDs); diff != "" { 5223 t.Errorf("Different expectations for removed finalizers after syncJob and cacheSync (-want,+got):\n%s", diff) 5224 } 5225 5226 // Change pods in different ways. 
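// pods[0] and pods[1] lose their finalizer, pods[3] is deleted outright, and
// only pods[2] keeps its finalizer, so the expectation set below is expected
// to shrink to pods[2]'s UID.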

	podsResource := schema.GroupVersionResource{Version: "v1", Resource: "pods"}

	update := pods[0].DeepCopy()
	update.Finalizers = nil
	update.ResourceVersion = "1"
	err = clientset.Tracker().Update(podsResource, update, update.Namespace)
	if err != nil {
		t.Errorf("Removing finalizer: %v", err)
	}

	update = pods[1].DeepCopy()
	update.Finalizers = nil
	update.DeletionTimestamp = &metav1.Time{Time: time.Now()}
	update.ResourceVersion = "1"
	err = clientset.Tracker().Update(podsResource, update, update.Namespace)
	if err != nil {
		t.Errorf("Removing finalizer and setting deletion timestamp: %v", err)
	}

	// Preserve the finalizer.
	update = pods[2].DeepCopy()
	update.DeletionTimestamp = &metav1.Time{Time: time.Now()}
	update.ResourceVersion = "1"
	err = clientset.Tracker().Update(podsResource, update, update.Namespace)
	if err != nil {
		t.Errorf("Setting deletion timestamp: %v", err)
	}

	err = clientset.Tracker().Delete(podsResource, pods[3].Namespace, pods[3].Name)
	if err != nil {
		t.Errorf("Deleting pod that had finalizer: %v", err)
	}

	uids = sets.New(string(pods[2].UID))
	var diff string
	if err := wait.PollUntilContextTimeout(ctx, 100*time.Millisecond, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
		gotExpectedUIDs = manager.finalizerExpectations.getExpectedUIDs(jobKey)
		diff = cmp.Diff(uids, gotExpectedUIDs)
		return diff == "", nil
	}); err != nil {
		t.Errorf("Timeout waiting for expectations (-want, +got):\n%s", diff)
	}
}

func TestFinalizerCleanup(t *testing.T) {
	_, ctx := ktesting.NewTestContext(t)
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	clientset := fake.NewSimpleClientset()
	sharedInformers := informers.NewSharedInformerFactory(clientset, controller.NoResyncPeriodFunc())
	manager, err := NewController(ctx, sharedInformers.Core().V1().Pods(), sharedInformers.Batch().V1().Jobs(), clientset)
	if err != nil {
		t.Fatalf("Error creating Job controller: %v", err)
	}
	manager.podStoreSynced = alwaysReady
	manager.jobStoreSynced = alwaysReady

	// Initialize the controller with 0 workers to make sure the
	// pod finalizers are not removed by the "syncJob" function.
	go manager.Run(ctx, 0)

	// Start the Pod and Job informers.
	sharedInformers.Start(ctx.Done())
	sharedInformers.WaitForCacheSync(ctx.Done())

	// Create a simple Job.
	job := newJob(1, 1, 1, batch.NonIndexedCompletion)
	job, err = clientset.BatchV1().Jobs(job.GetNamespace()).Create(ctx, job, metav1.CreateOptions{})
	if err != nil {
		t.Fatalf("Creating job: %v", err)
	}

	// Create a Pod with the job tracking finalizer.
	pod := newPod("test-pod", job)
	pod.Finalizers = append(pod.Finalizers, batch.JobTrackingFinalizer)
	pod, err = clientset.CoreV1().Pods(pod.GetNamespace()).Create(ctx, pod, metav1.CreateOptions{})
	if err != nil {
		t.Fatalf("Creating pod: %v", err)
	}

	// Mark Job as complete.
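	// Because Run was started with zero syncJob workers, any finalizer removal
	// observed below has to come from the controller's handling of pods that
	// belong to an already-finished Job rather than from the syncJob loop.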
	job.Status.Conditions = append(job.Status.Conditions, batch.JobCondition{
		Type:   batch.JobComplete,
		Status: v1.ConditionTrue,
	})
	_, err = clientset.BatchV1().Jobs(job.GetNamespace()).UpdateStatus(ctx, job, metav1.UpdateOptions{})
	if err != nil {
		t.Fatalf("Updating job status: %v", err)
	}

	// Verify the pod finalizer is removed for a finished Job,
	// even if the job's pods are not tracked by the main reconciliation loop.
	if err := wait.PollUntilContextTimeout(ctx, 100*time.Millisecond, wait.ForeverTestTimeout, true, func(ctx context.Context) (bool, error) {
		p, err := clientset.CoreV1().Pods(pod.Namespace).Get(ctx, pod.Name, metav1.GetOptions{})
		if err != nil {
			return false, err
		}
		return !hasJobTrackingFinalizer(p), nil
	}); err != nil {
		t.Errorf("Waiting for Pod to get the finalizer removed: %v", err)
	}
}

func checkJobCompletionLabel(t *testing.T, p *v1.PodTemplateSpec) {
	t.Helper()
	labels := p.GetLabels()
	if labels == nil || labels[batch.JobCompletionIndexAnnotation] == "" {
		t.Errorf("missing expected pod label %s", batch.JobCompletionIndexAnnotation)
	}
}

func checkJobCompletionEnvVariable(t *testing.T, spec *v1.PodSpec, podIndexLabelDisabled bool) {
	t.Helper()
	var fieldPath string
	if podIndexLabelDisabled {
		fieldPath = fmt.Sprintf("metadata.annotations['%s']", batch.JobCompletionIndexAnnotation)
	} else {
		fieldPath = fmt.Sprintf("metadata.labels['%s']", batch.JobCompletionIndexAnnotation)
	}
	want := []v1.EnvVar{
		{
			Name: "JOB_COMPLETION_INDEX",
			ValueFrom: &v1.EnvVarSource{
				FieldRef: &v1.ObjectFieldSelector{
					FieldPath: fieldPath,
				},
			},
		},
	}
	for _, c := range spec.InitContainers {
		if diff := cmp.Diff(want, c.Env); diff != "" {
			t.Errorf("Unexpected Env in container %s (-want,+got):\n%s", c.Name, diff)
		}
	}
	for _, c := range spec.Containers {
		if diff := cmp.Diff(want, c.Env); diff != "" {
			t.Errorf("Unexpected Env in container %s (-want,+got):\n%s", c.Name, diff)
		}
	}
}

func podReplacementPolicy(m batch.PodReplacementPolicy) *batch.PodReplacementPolicy {
	return &m
}

func verifyEmptyQueueAndAwaitForQueueLen(ctx context.Context, t *testing.T, jm *Controller, wantQueueLen int) {
	t.Helper()
	verifyEmptyQueue(ctx, t, jm)
	awaitForQueueLen(ctx, t, jm, wantQueueLen)
}

func awaitForQueueLen(ctx context.Context, t *testing.T, jm *Controller, wantQueueLen int) {
	t.Helper()
	verifyEmptyQueue(ctx, t, jm)
	if err := wait.PollUntilContextTimeout(ctx, fastRequeue, time.Second, true, func(ctx context.Context) (bool, error) {
		if requeued := jm.queue.Len() == wantQueueLen; requeued {
			return true, nil
		}
		jm.clock.Sleep(fastRequeue)
		return false, nil
	}); err != nil {
		t.Errorf("Timed out waiting for expected queue.Len(). want %v, got: %v", wantQueueLen, jm.queue.Len())
	}
}

func verifyEmptyQueue(ctx context.Context, t *testing.T, jm *Controller) {
	t.Helper()
	if jm.queue.Len() > 0 {
		t.Errorf("Unexpected queue.Len(). Want: %d, got: %d", 0, jm.queue.Len())
	}
}

// podBuilder is a small fluent helper for constructing Pods in tests.
type podBuilder struct {
	*v1.Pod
}

// buildPod returns a podBuilder wrapping a Pod with a random UID.
func buildPod() podBuilder {
	return podBuilder{Pod: &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			UID: types.UID(rand.String(5)),
		},
	}}
}

// getConditionsByType returns pointers to all conditions in list with the given type.
func getConditionsByType(list []batch.JobCondition, cType batch.JobConditionType) []*batch.JobCondition {
	var result []*batch.JobCondition
	for i := range list {
		if list[i].Type == cType {
			result = append(result, &list[i])
		}
	}
	return result
}

func (pb podBuilder) name(n string) podBuilder {
	pb.Name = n
	return pb
}

func (pb podBuilder) ns(n string) podBuilder {
	pb.Namespace = n
	return pb
}

func (pb podBuilder) uid(u string) podBuilder {
	pb.UID = types.UID(u)
	return pb
}

func (pb podBuilder) job(j *batch.Job) podBuilder {
	pb.Labels = j.Spec.Selector.MatchLabels
	pb.Namespace = j.Namespace
	pb.OwnerReferences = []metav1.OwnerReference{*metav1.NewControllerRef(j, controllerKind)}
	return pb
}

func (pb podBuilder) clearOwner() podBuilder {
	pb.OwnerReferences = nil
	return pb
}

func (pb podBuilder) clearLabels() podBuilder {
	pb.Labels = nil
	return pb
}

func (pb podBuilder) index(ix string) podBuilder {
	return pb.annotation(batch.JobCompletionIndexAnnotation, ix)
}

func (pb podBuilder) indexFailureCount(count string) podBuilder {
	return pb.annotation(batch.JobIndexFailureCountAnnotation, count)
}

func (pb podBuilder) indexIgnoredFailureCount(count string) podBuilder {
	return pb.annotation(batch.JobIndexIgnoredFailureCountAnnotation, count)
}

func (pb podBuilder) annotation(key, value string) podBuilder {
	if pb.Annotations == nil {
		pb.Annotations = make(map[string]string)
	}
	pb.Annotations[key] = value
	return pb
}

func (pb podBuilder) status(s v1.PodStatus) podBuilder {
	pb.Status = s
	return pb
}

func (pb podBuilder) phase(p v1.PodPhase) podBuilder {
	pb.Status.Phase = p
	return pb
}

func (pb podBuilder) trackingFinalizer() podBuilder {
	for _, f := range pb.Finalizers {
		if f == batch.JobTrackingFinalizer {
			return pb
		}
	}
	pb.Finalizers = append(pb.Finalizers, batch.JobTrackingFinalizer)
	return pb
}

func (pb podBuilder) deletionTimestamp() podBuilder {
	pb.DeletionTimestamp = &metav1.Time{}
	return pb
}

func (pb podBuilder) customDeletionTimestamp(t time.Time) podBuilder {
	pb.DeletionTimestamp = &metav1.Time{Time: t}
	return pb
}

func completionModePtr(m batch.CompletionMode) *batch.CompletionMode {
	return &m
}

// setDurationDuringTest overrides *val with newVal and returns a function that
// restores the original value; callers are expected to defer the returned function.
func setDurationDuringTest(val *time.Duration, newVal time.Duration) func() {
	origVal := *val
	*val = newVal
	return func() {
		*val = origVal
	}
}
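// Illustrative usage sketch (not taken from this file): setDurationDuringTest is
// meant to be paired with defer so the package-level duration it overrides is
// restored when the test returns. DefaultJobApiBackOff is only assumed here as
// an example of such a duration; any *time.Duration a test tunes works the same.
//
//	func TestWithFastApiBackoff(t *testing.T) {
//		// Shrink the backoff for the duration of this test, then restore it.
//		defer setDurationDuringTest(&DefaultJobApiBackOff, fastJobApiBackoff)()
//		// ... exercise the controller with the fast backoff in effect ...
//	}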