k8s.io/kubernetes@v1.29.3/test/integration/job/job_test.go

k8s.io/kubernetes@v1.29.3/test/integration/job/job_test.go (about)

     1  /*
     2  Copyright 2021 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package job
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	"fmt"
    23  	"sort"
    24  	"strconv"
    25  	"strings"
    26  	"sync"
    27  	"sync/atomic"
    28  	"testing"
    29  	"time"
    30  
    31  	"github.com/google/go-cmp/cmp"
    32  	batchv1 "k8s.io/api/batch/v1"
    33  	v1 "k8s.io/api/core/v1"
    34  	eventsv1 "k8s.io/api/events/v1"
    35  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    36  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    37  	"k8s.io/apimachinery/pkg/runtime/schema"
    38  	"k8s.io/apimachinery/pkg/types"
    39  	"k8s.io/apimachinery/pkg/util/sets"
    40  	"k8s.io/apimachinery/pkg/util/validation/field"
    41  	"k8s.io/apimachinery/pkg/util/wait"
    42  	"k8s.io/apimachinery/pkg/watch"
    43  	"k8s.io/apiserver/pkg/util/feature"
    44  	"k8s.io/client-go/informers"
    45  	clientset "k8s.io/client-go/kubernetes"
    46  	typedv1 "k8s.io/client-go/kubernetes/typed/batch/v1"
    47  	restclient "k8s.io/client-go/rest"
    48  	"k8s.io/client-go/util/retry"
    49  	featuregatetesting "k8s.io/component-base/featuregate/testing"
    50  	basemetrics "k8s.io/component-base/metrics"
    51  	"k8s.io/component-base/metrics/testutil"
    52  	"k8s.io/klog/v2"
    53  	kubeapiservertesting "k8s.io/kubernetes/cmd/kube-apiserver/app/testing"
    54  	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
    55  	jobcontroller "k8s.io/kubernetes/pkg/controller/job"
    56  	"k8s.io/kubernetes/pkg/controller/job/metrics"
    57  	"k8s.io/kubernetes/pkg/features"
    58  	"k8s.io/kubernetes/test/integration/framework"
    59  	"k8s.io/kubernetes/test/integration/util"
    60  	"k8s.io/utils/ptr"
    61  )
    62  
    63  const waitInterval = time.Second
    64  const fastPodFailureBackoff = 100 * time.Millisecond
    65  
    66  type metricLabelsWithValue struct {
    67  	Labels []string
    68  	Value  int
    69  }
    70  
    71  func validateCounterMetric(ctx context.Context, t *testing.T, counterVec *basemetrics.CounterVec, wantMetric metricLabelsWithValue) {
    72  	t.Helper()
    73  	var cmpErr error
    74  	err := wait.PollUntilContextTimeout(ctx, 10*time.Millisecond, 10*time.Second, true, func(ctx context.Context) (bool, error) {
    75  		cmpErr = nil
    76  		value, err := testutil.GetCounterMetricValue(counterVec.WithLabelValues(wantMetric.Labels...))
    77  		if err != nil {
    78  			return true, fmt.Errorf("collecting the %q metric: %q", counterVec.Name, err)
    79  		}
    80  		if wantMetric.Value != int(value) {
    81  			cmpErr = fmt.Errorf("Unexpected metric delta for %q metric with labels %q. want: %v, got: %v", counterVec.Name, wantMetric.Labels, wantMetric.Value, int(value))
    82  			return false, nil
    83  		}
    84  		return true, nil
    85  	})
    86  	if err != nil {
    87  		t.Errorf("Failed waiting for expected metric: %q", err)
    88  	}
    89  	if cmpErr != nil {
    90  		t.Error(cmpErr)
    91  	}
    92  }
    93  
    94  func validateTerminatedPodsTrackingFinalizerMetric(ctx context.Context, t *testing.T, want int) {
    95  	validateCounterMetric(ctx, t, metrics.TerminatedPodsTrackingFinalizerTotal, metricLabelsWithValue{
    96  		Value:  want,
    97  		Labels: []string{metrics.Add},
    98  	})
    99  	validateCounterMetric(ctx, t, metrics.TerminatedPodsTrackingFinalizerTotal, metricLabelsWithValue{
   100  		Value:  want,
   101  		Labels: []string{metrics.Delete},
   102  	})
   103  }
   104  
   105  // TestJobPodFailurePolicyWithFailedPodDeletedDuringControllerRestart verifies that the job is properly marked as Failed
   106  // in a scenario when the job controller crashes between removing pod finalizers and marking the job as Failed (based on
   107  // the pod failure policy). After the finalizer for the failed pod is removed we remove the failed pod. This step is
   108  // done to simulate what PodGC would do. Then, the test spawns the second instance of the controller to check that it
   109  // will pick up the job state properly and will mark it as Failed, even if th pod triggering the pod failure policy is
   110  // already deleted.
   111  // Note: this scenario requires the use of finalizers. Without finalizers there is no guarantee a failed pod would be
   112  // checked against the pod failure policy rules before its removal by PodGC.
   113  func TestJobPodFailurePolicyWithFailedPodDeletedDuringControllerRestart(t *testing.T) {
   114  	count := 3
   115  	job := batchv1.Job{
   116  		Spec: batchv1.JobSpec{
   117  			Template: v1.PodTemplateSpec{
   118  				Spec: v1.PodSpec{
   119  					Containers: []v1.Container{
   120  						{
   121  							Name:                     "main-container",
   122  							Image:                    "foo",
   123  							ImagePullPolicy:          v1.PullIfNotPresent,
   124  							TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
   125  						},
   126  					},
   127  				},
   128  			},
   129  			Parallelism: ptr.To(int32(count)),
   130  			Completions: ptr.To(int32(count)),
   131  			PodFailurePolicy: &batchv1.PodFailurePolicy{
   132  				Rules: []batchv1.PodFailurePolicyRule{
   133  					{
   134  						Action: batchv1.PodFailurePolicyActionFailJob,
   135  						OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
   136  							Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
   137  							Values:   []int32{5},
   138  						},
   139  					},
   140  				},
   141  			},
   142  		},
   143  	}
   144  	podStatusMatchingOnExitCodesTerminateRule := v1.PodStatus{
   145  		Phase: v1.PodFailed,
   146  		ContainerStatuses: []v1.ContainerStatus{
   147  			{
   148  				Name: "main-container",
   149  				State: v1.ContainerState{
   150  					Terminated: &v1.ContainerStateTerminated{
   151  						ExitCode: 5,
   152  					},
   153  				},
   154  			},
   155  		},
   156  	}
   157  	defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, true)()
   158  	closeFn, restConfig, cs, ns := setup(t, "simple")
   159  	defer closeFn()
   160  
   161  	// Make the job controller significantly slower to trigger race condition.
   162  	restConfig.QPS = 1
   163  	restConfig.Burst = 1
   164  	ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
   165  	defer func() {
   166  		cancel()
   167  	}()
   168  	resetMetrics()
   169  	restConfig.QPS = 200
   170  	restConfig.Burst = 200
   171  
   172  	// create a job with a failed pod matching the exit code rule and a couple of successful pods
   173  	jobObj, err := createJobWithDefaults(ctx, cs, ns.Name, &job)
   174  	if err != nil {
   175  		t.Fatalf("Failed to create Job: %v", err)
   176  	}
   177  	validateJobPodsStatus(ctx, t, cs, jobObj, podsByStatus{
   178  		Active:      count,
   179  		Ready:       ptr.To[int32](0),
   180  		Terminating: ptr.To[int32](0),
   181  	})
   182  
   183  	jobPods, err := getJobPods(ctx, t, cs, jobObj, func(s v1.PodStatus) bool {
   184  		return (s.Phase == v1.PodPending || s.Phase == v1.PodRunning)
   185  	})
   186  	if err != nil {
   187  		t.Fatalf("Failed to list Job Pods: %v", err)
   188  	}
   189  
   190  	failedIndex := 1
   191  	wg := sync.WaitGroup{}
   192  	wg.Add(1)
   193  
   194  	// Await for the failed pod (with index failedIndex) to have its finalizer
   195  	// removed. The finalizer will be removed by the job controller just after
   196  	// appending the FailureTarget condition to the job to mark it as targeted
   197  	// for failure.
   198  	go func(ctx context.Context) {
   199  		err := wait.PollUntilContextTimeout(ctx, 10*time.Millisecond, time.Minute, true, func(ctx context.Context) (bool, error) {
   200  			failedPodUpdated, err := cs.CoreV1().Pods(jobObj.Namespace).Get(ctx, jobPods[failedIndex].Name, metav1.GetOptions{})
   201  			if err != nil {
   202  				return true, err
   203  			}
   204  			if len(failedPodUpdated.Finalizers) == 0 {
   205  				return true, nil
   206  			}
   207  			return false, nil
   208  		})
   209  		if err != nil {
   210  			t.Logf("Failed awaiting for the finalizer removal for pod %v", klog.KObj(jobPods[failedIndex]))
   211  		}
   212  		wg.Done()
   213  	}(ctx)
   214  
   215  	// We update one pod as failed with state matching the pod failure policy rule. This results in removal
   216  	// of the pod finalizer from the pod by the job controller.
   217  	failedPod := jobPods[failedIndex]
   218  	updatedPod := failedPod.DeepCopy()
   219  	updatedPod.Status = podStatusMatchingOnExitCodesTerminateRule
   220  	_, err = updatePodStatuses(ctx, cs, []v1.Pod{*updatedPod})
   221  	if err != nil {
   222  		t.Fatalf("Failed to update pod statuses %q for pods of job %q", err, klog.KObj(jobObj))
   223  	}
   224  	wg.Wait()
   225  
   226  	t.Logf("Finalizer is removed for the failed pod %q. Shutting down the controller.", klog.KObj(failedPod))
   227  	// shut down the first job controller as soon as it removed the finalizer for the failed pod. This will
   228  	// likely happen before the first controller is able to mark the job as Failed.
   229  	cancel()
   230  
   231  	// Delete the failed pod to make sure it is not used by the second instance of the controller
   232  	ctx, cancel = context.WithCancel(context.Background())
   233  	err = cs.CoreV1().Pods(failedPod.Namespace).Delete(ctx, failedPod.Name, metav1.DeleteOptions{GracePeriodSeconds: ptr.To[int64](0)})
   234  	if err != nil {
   235  		t.Fatalf("Error: '%v' while deleting pod: '%v'", err, klog.KObj(failedPod))
   236  	}
   237  	t.Logf("The failed pod %q is deleted", klog.KObj(failedPod))
   238  	cancel()
   239  
   240  	// start the second controller to promote the interim FailureTarget job condition as Failed
   241  	ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig)
   242  	// verify the job is correctly marked as Failed
   243  	validateJobFailed(ctx, t, cs, jobObj)
   244  	validateNoOrphanPodsWithFinalizers(ctx, t, cs, jobObj)
   245  }
   246  
   247  // TestJobPodFailurePolicy tests handling of pod failures with respect to the
   248  // configured pod failure policy rules
   249  func TestJobPodFailurePolicy(t *testing.T) {
   250  	t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
   251  	job := batchv1.Job{
   252  		Spec: batchv1.JobSpec{
   253  			Template: v1.PodTemplateSpec{
   254  				Spec: v1.PodSpec{
   255  					Containers: []v1.Container{
   256  						{
   257  							Name:                     "main-container",
   258  							Image:                    "foo",
   259  							ImagePullPolicy:          v1.PullIfNotPresent,
   260  							TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
   261  						},
   262  					},
   263  				},
   264  			},
   265  			PodFailurePolicy: &batchv1.PodFailurePolicy{
   266  				Rules: []batchv1.PodFailurePolicyRule{
   267  					{
   268  						Action: batchv1.PodFailurePolicyActionIgnore,
   269  						OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
   270  							{
   271  								Type: v1.DisruptionTarget,
   272  							},
   273  						},
   274  					},
   275  					{
   276  						Action: batchv1.PodFailurePolicyActionCount,
   277  						OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
   278  							Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
   279  							Values:   []int32{10},
   280  						},
   281  					},
   282  					{
   283  						Action: batchv1.PodFailurePolicyActionFailJob,
   284  						OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
   285  							Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
   286  							Values:   []int32{5, 6, 7},
   287  						},
   288  					},
   289  				},
   290  			},
   291  		},
   292  	}
   293  	podStatusMatchingOnExitCodesTerminateRule := v1.PodStatus{
   294  		Phase: v1.PodFailed,
   295  		ContainerStatuses: []v1.ContainerStatus{
   296  			{
   297  				Name: "main-container",
   298  				State: v1.ContainerState{
   299  					Terminated: &v1.ContainerStateTerminated{
   300  						ExitCode: 5,
   301  					},
   302  				},
   303  			},
   304  		},
   305  	}
   306  	podStatusMatchingOnExitCodesCountRule := v1.PodStatus{
   307  		Phase: v1.PodFailed,
   308  		ContainerStatuses: []v1.ContainerStatus{
   309  			{
   310  				Name: "main-container",
   311  				State: v1.ContainerState{
   312  					Terminated: &v1.ContainerStateTerminated{
   313  						ExitCode: 10,
   314  					},
   315  				},
   316  			},
   317  		},
   318  	}
   319  	podStatusMatchingOnPodConditionsIgnoreRule := v1.PodStatus{
   320  		Phase: v1.PodFailed,
   321  		Conditions: []v1.PodCondition{
   322  			{
   323  				Type:   v1.DisruptionTarget,
   324  				Status: v1.ConditionTrue,
   325  			},
   326  		},
   327  	}
   328  	podStatusNotMatchingAnyRule := v1.PodStatus{
   329  		Phase: v1.PodFailed,
   330  		ContainerStatuses: []v1.ContainerStatus{
   331  			{
   332  				State: v1.ContainerState{
   333  					Terminated: &v1.ContainerStateTerminated{},
   334  				},
   335  			},
   336  		},
   337  	}
   338  	testCases := map[string]struct {
   339  		enableJobPodFailurePolicy                bool
   340  		restartController                        bool
   341  		job                                      batchv1.Job
   342  		podStatus                                v1.PodStatus
   343  		wantActive                               int
   344  		wantFailed                               int
   345  		wantJobConditionType                     batchv1.JobConditionType
   346  		wantJobFinishedMetric                    metricLabelsWithValue
   347  		wantPodFailuresHandledByPolicyRuleMetric *metricLabelsWithValue
   348  	}{
   349  		"pod status matching the configured FailJob rule on exit codes; job terminated when JobPodFailurePolicy enabled": {
   350  			enableJobPodFailurePolicy: true,
   351  			job:                       job,
   352  			podStatus:                 podStatusMatchingOnExitCodesTerminateRule,
   353  			wantActive:                0,
   354  			wantFailed:                1,
   355  			wantJobConditionType:      batchv1.JobFailed,
   356  			wantJobFinishedMetric: metricLabelsWithValue{
   357  				Labels: []string{"NonIndexed", "failed", "PodFailurePolicy"},
   358  				Value:  1,
   359  			},
   360  			wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{
   361  				Labels: []string{"FailJob"},
   362  				Value:  1,
   363  			},
   364  		},
   365  		"pod status matching the configured FailJob rule on exit codes; with controller restart; job terminated when JobPodFailurePolicy enabled": {
   366  			enableJobPodFailurePolicy: true,
   367  			restartController:         true,
   368  			job:                       job,
   369  			podStatus:                 podStatusMatchingOnExitCodesTerminateRule,
   370  			wantActive:                0,
   371  			wantFailed:                1,
   372  			wantJobConditionType:      batchv1.JobFailed,
   373  			wantJobFinishedMetric: metricLabelsWithValue{
   374  				Labels: []string{"NonIndexed", "failed", "PodFailurePolicy"},
   375  				Value:  1,
   376  			},
   377  		},
   378  		"pod status matching the configured FailJob rule on exit codes; default handling when JobPodFailurePolicy disabled": {
   379  			enableJobPodFailurePolicy: false,
   380  			job:                       job,
   381  			podStatus:                 podStatusMatchingOnExitCodesTerminateRule,
   382  			wantActive:                1,
   383  			wantFailed:                1,
   384  			wantJobConditionType:      batchv1.JobComplete,
   385  			wantJobFinishedMetric: metricLabelsWithValue{
   386  				Labels: []string{"NonIndexed", "succeeded", ""},
   387  				Value:  1,
   388  			},
   389  		},
   390  		"pod status matching the configured Ignore rule on pod conditions; pod failure not counted when JobPodFailurePolicy enabled": {
   391  			enableJobPodFailurePolicy: true,
   392  			job:                       job,
   393  			podStatus:                 podStatusMatchingOnPodConditionsIgnoreRule,
   394  			wantActive:                1,
   395  			wantFailed:                0,
   396  			wantJobConditionType:      batchv1.JobComplete,
   397  			wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{
   398  				Labels: []string{"Ignore"},
   399  				Value:  1,
   400  			},
   401  			wantJobFinishedMetric: metricLabelsWithValue{
   402  				Labels: []string{"NonIndexed", "succeeded", ""},
   403  				Value:  1,
   404  			},
   405  		},
   406  		"pod status matching the configured Count rule on exit codes; pod failure counted when JobPodFailurePolicy enabled": {
   407  			enableJobPodFailurePolicy: true,
   408  			job:                       job,
   409  			podStatus:                 podStatusMatchingOnExitCodesCountRule,
   410  			wantActive:                1,
   411  			wantFailed:                1,
   412  			wantJobConditionType:      batchv1.JobComplete,
   413  			wantJobFinishedMetric: metricLabelsWithValue{
   414  				Labels: []string{"NonIndexed", "succeeded", ""},
   415  				Value:  1,
   416  			},
   417  			wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{
   418  				Labels: []string{"Count"},
   419  				Value:  1,
   420  			},
   421  		},
   422  		"pod status non-matching any configured rule; pod failure counted when JobPodFailurePolicy enabled": {
   423  			enableJobPodFailurePolicy: true,
   424  			job:                       job,
   425  			podStatus:                 podStatusNotMatchingAnyRule,
   426  			wantActive:                1,
   427  			wantFailed:                1,
   428  			wantJobConditionType:      batchv1.JobComplete,
   429  			wantJobFinishedMetric: metricLabelsWithValue{
   430  				Labels: []string{"NonIndexed", "succeeded", ""},
   431  				Value:  1,
   432  			},
   433  			wantPodFailuresHandledByPolicyRuleMetric: &metricLabelsWithValue{
   434  				Labels: []string{"Count"},
   435  				Value:  0,
   436  			},
   437  		},
   438  	}
   439  	for name, test := range testCases {
   440  		t.Run(name, func(t *testing.T) {
   441  			resetMetrics()
   442  			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, test.enableJobPodFailurePolicy)()
   443  
   444  			closeFn, restConfig, clientSet, ns := setup(t, "simple")
   445  			defer closeFn()
   446  			ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
   447  			defer func() {
   448  				cancel()
   449  			}()
   450  
   451  			jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &test.job)
   452  			if err != nil {
   453  				t.Fatalf("Error %q while creating the job %q", err, jobObj.Name)
   454  			}
   455  			validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
   456  				Active:      1,
   457  				Ready:       ptr.To[int32](0),
   458  				Terminating: ptr.To[int32](0),
   459  			})
   460  
   461  			op := func(p *v1.Pod) bool {
   462  				p.Status = test.podStatus
   463  				return true
   464  			}
   465  
   466  			if err, _ := updateJobPodsStatus(ctx, clientSet, jobObj, op, 1); err != nil {
   467  				t.Fatalf("Error %q while updating pod status for Job: %q", err, jobObj.Name)
   468  			}
   469  
   470  			if test.restartController {
   471  				cancel()
   472  				ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig)
   473  			}
   474  
   475  			validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
   476  				Active:      test.wantActive,
   477  				Failed:      test.wantFailed,
   478  				Ready:       ptr.To[int32](0),
   479  				Terminating: ptr.To[int32](0),
   480  			})
   481  
   482  			if test.wantJobConditionType == batchv1.JobComplete {
   483  				if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil {
   484  					t.Fatalf("Failed setting phase %q on Job Pod: %q", v1.PodSucceeded, err)
   485  				}
   486  			}
   487  			validateJobCondition(ctx, t, clientSet, jobObj, test.wantJobConditionType)
   488  			validateCounterMetric(ctx, t, metrics.JobFinishedNum, test.wantJobFinishedMetric)
   489  			if test.wantPodFailuresHandledByPolicyRuleMetric != nil {
   490  				validateCounterMetric(ctx, t, metrics.PodFailuresHandledByFailurePolicy, *test.wantPodFailuresHandledByPolicyRuleMetric)
   491  			}
   492  			validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
   493  		})
   494  	}
   495  }
   496  
   497  // TestBackoffLimitPerIndex_DelayedPodDeletion tests the pod deletion is delayed
   498  // until the replacement pod is created, so that the replacement pod has the
   499  // index-failure-count annotation bumped, when BackoffLimitPerIndex is used.
   500  func TestBackoffLimitPerIndex_DelayedPodDeletion(t *testing.T) {
   501  	t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
   502  
   503  	defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)()
   504  	closeFn, restConfig, clientSet, ns := setup(t, "backoff-limit-per-index-failed")
   505  	defer closeFn()
   506  	ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
   507  	defer func() {
   508  		cancel()
   509  	}()
   510  
   511  	jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
   512  		Spec: batchv1.JobSpec{
   513  			Parallelism:          ptr.To[int32](1),
   514  			Completions:          ptr.To[int32](1),
   515  			BackoffLimitPerIndex: ptr.To[int32](1),
   516  			CompletionMode:       completionModePtr(batchv1.IndexedCompletion),
   517  		},
   518  	})
   519  	if err != nil {
   520  		t.Fatalf("Failed to create Job: %v", err)
   521  	}
   522  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
   523  		Active:      1,
   524  		Ready:       ptr.To[int32](0),
   525  		Terminating: ptr.To[int32](0),
   526  	})
   527  	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0), "", ptr.To(""))
   528  
   529  	// First pod from index 0 failed.
   530  	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 0); err != nil {
   531  		t.Fatal("Failed trying to fail pod with index 0")
   532  	}
   533  	// Delete the failed pod
   534  	pod, err := getJobPodForIndex(ctx, clientSet, jobObj, 0, func(_ *v1.Pod) bool { return true })
   535  	if err != nil {
   536  		t.Fatalf("failed to get terminal pod for index: %v", 0)
   537  	}
   538  	if err := clientSet.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}); err != nil {
   539  		t.Fatalf("failed to delete pod: %v, error: %v", klog.KObj(pod), err)
   540  	}
   541  
   542  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
   543  		Active:      1,
   544  		Failed:      1,
   545  		Ready:       ptr.To[int32](0),
   546  		Terminating: ptr.To[int32](0),
   547  	})
   548  	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0), "", ptr.To(""))
   549  
   550  	// Verify the replacement pod is created and has the index-failure-count
   551  	// annotation bumped.
   552  	replacement, err := getActivePodForIndex(ctx, clientSet, jobObj, 0)
   553  	if err != nil {
   554  		t.Fatalf("Failed to get active replacement pod for index: %v, error: %v", 0, err)
   555  	}
   556  	gotIndexFailureCount, err := getIndexFailureCount(replacement)
   557  	if err != nil {
   558  		t.Fatalf("Failed read the index failure count annotation for pod: %v, error: %v", klog.KObj(replacement), err)
   559  	}
   560  	if diff := cmp.Diff(1, gotIndexFailureCount); diff != "" {
   561  		t.Errorf("Unexpected index failure count for the replacement pod: %s", diff)
   562  	}
   563  	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 0); err != nil {
   564  		t.Fatal("Failed trying to fail pod with index 0")
   565  	}
   566  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
   567  		Active:      0,
   568  		Succeeded:   1,
   569  		Failed:      1,
   570  		Ready:       ptr.To[int32](0),
   571  		Terminating: ptr.To[int32](0),
   572  	})
   573  	validateJobSucceeded(ctx, t, clientSet, jobObj)
   574  }
   575  
   576  // TestBackoffLimitPerIndex_Reenabling tests handling of pod failures when
   577  // reenabling the BackoffLimitPerIndex feature.
   578  func TestBackoffLimitPerIndex_Reenabling(t *testing.T) {
   579  	t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
   580  
   581  	defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)()
   582  	closeFn, restConfig, clientSet, ns := setup(t, "backoff-limit-per-index-reenabled")
   583  	defer closeFn()
   584  	ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
   585  	defer cancel()
   586  	resetMetrics()
   587  
   588  	jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
   589  		Spec: batchv1.JobSpec{
   590  			Parallelism:          ptr.To[int32](3),
   591  			Completions:          ptr.To[int32](3),
   592  			BackoffLimitPerIndex: ptr.To[int32](0),
   593  			CompletionMode:       completionModePtr(batchv1.IndexedCompletion),
   594  		},
   595  	})
   596  	if err != nil {
   597  		t.Fatalf("Failed to create Job: %v", err)
   598  	}
   599  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
   600  		Active:      3,
   601  		Ready:       ptr.To[int32](0),
   602  		Terminating: ptr.To[int32](0),
   603  	})
   604  	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1, 2), "", ptr.To(""))
   605  
   606  	// First pod from index 0 failed
   607  	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 0); err != nil {
   608  		t.Fatal("Failed trying to fail pod with index 0")
   609  	}
   610  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
   611  		Active:      2,
   612  		Failed:      1,
   613  		Ready:       ptr.To[int32](0),
   614  		Terminating: ptr.To[int32](0),
   615  	})
   616  	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(1, 2), "", ptr.To("0"))
   617  
   618  	// Disable the feature
   619  	defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, false)()
   620  
   621  	// First pod from index 1 failed
   622  	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil {
   623  		t.Fatal("Failed trying to fail pod with index 1")
   624  	}
   625  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
   626  		Active:      3,
   627  		Failed:      2,
   628  		Ready:       ptr.To[int32](0),
   629  		Terminating: ptr.To[int32](0),
   630  	})
   631  	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1, 2), "", nil)
   632  
   633  	// Reenable the feature
   634  	defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)()
   635  
   636  	// First pod from index 2 failed
   637  	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 2); err != nil {
   638  		t.Fatal("Failed trying to fail pod with index 2")
   639  	}
   640  
   641  	// Verify the indexes 0 and 1 are active as the failed pods don't have
   642  	// finalizers at this point, so they are ignored.
   643  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
   644  		Active:      2,
   645  		Failed:      3,
   646  		Ready:       ptr.To[int32](0),
   647  		Terminating: ptr.To[int32](0),
   648  	})
   649  	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", ptr.To("2"))
   650  
   651  	// mark remaining pods are Succeeded and verify Job status
   652  	if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 2); err != nil {
   653  		t.Fatalf("Failed setting phase %q on Job Pod: %q", v1.PodSucceeded, err)
   654  	}
   655  	validateJobFailed(ctx, t, clientSet, jobObj)
   656  	validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
   657  }
   658  
   659  // TestBackoffLimitPerIndex_JobPodsCreatedWithExponentialBackoff tests that the
   660  // pods are recreated with exponential backoff delay computed independently
   661  // per index. Scenario:
   662  // - fail index 0
   663  // - fail index 0
   664  // - fail index 1
   665  // - succeed index 0
   666  // - fail index 1
   667  // - succeed index 1
   668  func TestBackoffLimitPerIndex_JobPodsCreatedWithExponentialBackoff(t *testing.T) {
   669  	defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)()
   670  	t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, 2*time.Second))
   671  
   672  	closeFn, restConfig, clientSet, ns := setup(t, "simple")
   673  	defer closeFn()
   674  	ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
   675  	defer cancel()
   676  
   677  	jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
   678  		Spec: batchv1.JobSpec{
   679  			Completions:          ptr.To[int32](2),
   680  			Parallelism:          ptr.To[int32](2),
   681  			BackoffLimitPerIndex: ptr.To[int32](2),
   682  			CompletionMode:       completionModePtr(batchv1.IndexedCompletion),
   683  		},
   684  	})
   685  	if err != nil {
   686  		t.Fatalf("Could not create job: %v", err)
   687  	}
   688  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
   689  		Active:      2,
   690  		Ready:       ptr.To[int32](0),
   691  		Terminating: ptr.To[int32](0),
   692  	})
   693  	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", ptr.To(""))
   694  
   695  	// Fail the first pod for index 0
   696  	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 0); err != nil {
   697  		t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err)
   698  	}
   699  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
   700  		Active:      2,
   701  		Failed:      1,
   702  		Ready:       ptr.To[int32](0),
   703  		Terminating: ptr.To[int32](0),
   704  	})
   705  	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", ptr.To(""))
   706  
   707  	// Fail the second pod for index 0
   708  	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 0); err != nil {
   709  		t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err)
   710  	}
   711  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
   712  		Active:      2,
   713  		Failed:      2,
   714  		Ready:       ptr.To[int32](0),
   715  		Terminating: ptr.To[int32](0),
   716  	})
   717  	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", ptr.To(""))
   718  
   719  	// Fail the first pod for index 1
   720  	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil {
   721  		t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err)
   722  	}
   723  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
   724  		Active:      2,
   725  		Failed:      3,
   726  		Ready:       ptr.To[int32](0),
   727  		Terminating: ptr.To[int32](0),
   728  	})
   729  	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1), "", ptr.To(""))
   730  
   731  	// Succeed the third pod for index 0
   732  	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 0); err != nil {
   733  		t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodSucceeded, err)
   734  	}
   735  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
   736  		Active:      1,
   737  		Failed:      3,
   738  		Succeeded:   1,
   739  		Ready:       ptr.To[int32](0),
   740  		Terminating: ptr.To[int32](0),
   741  	})
   742  	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(1), "0", ptr.To(""))
   743  
   744  	// Fail the second pod for index 1
   745  	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil {
   746  		t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err)
   747  	}
   748  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
   749  		Active:      1,
   750  		Failed:      4,
   751  		Succeeded:   1,
   752  		Ready:       ptr.To[int32](0),
   753  		Terminating: ptr.To[int32](0),
   754  	})
   755  	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(1), "0", ptr.To(""))
   756  
   757  	// Succeed the third pod for index 1
   758  	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil {
   759  		t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodSucceeded, err)
   760  	}
   761  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
   762  		Active:      0,
   763  		Failed:      4,
   764  		Succeeded:   2,
   765  		Ready:       ptr.To[int32](0),
   766  		Terminating: ptr.To[int32](0),
   767  	})
   768  	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New[int](), "0,1", ptr.To(""))
   769  	validateJobSucceeded(ctx, t, clientSet, jobObj)
   770  
   771  	for index := 0; index < int(*jobObj.Spec.Completions); index++ {
   772  		podsForIndex, err := getJobPodsForIndex(ctx, clientSet, jobObj, index, func(_ *v1.Pod) bool { return true })
   773  		if err != nil {
   774  			t.Fatalf("Failed to list job %q pods for index %v, error: %v", klog.KObj(jobObj), index, err)
   775  		}
   776  		validateExpotentialBackoffDelay(t, jobcontroller.DefaultJobPodFailureBackOff, podsForIndex)
   777  	}
   778  }
   779  
   780  // TestBackoffLimitPerIndex tests handling of job and its pods when
   781  // backoff limit per index is used.
   782  func TestBackoffLimitPerIndex(t *testing.T) {
   783  	t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
   784  
   785  	type podTerminationWithExpectations struct {
   786  		index                          int
   787  		status                         v1.PodStatus
   788  		wantActive                     int
   789  		wantFailed                     int
   790  		wantSucceeded                  int
   791  		wantActiveIndexes              sets.Set[int]
   792  		wantCompletedIndexes           string
   793  		wantFailedIndexes              *string
   794  		wantReplacementPodFailureCount *int
   795  	}
   796  
   797  	podTemplateSpec := v1.PodTemplateSpec{
   798  		Spec: v1.PodSpec{
   799  			Containers: []v1.Container{
   800  				{
   801  					Name:                     "main-container",
   802  					Image:                    "foo",
   803  					ImagePullPolicy:          v1.PullIfNotPresent,
   804  					TerminationMessagePolicy: v1.TerminationMessageFallbackToLogsOnError,
   805  				},
   806  			},
   807  		},
   808  	}
   809  	testCases := map[string]struct {
   810  		job                               batchv1.Job
   811  		podTerminations                   []podTerminationWithExpectations
   812  		wantJobConditionType              batchv1.JobConditionType
   813  		wantJobFinishedIndexesTotalMetric []metricLabelsWithValue
   814  	}{
   815  		"job succeeded": {
   816  			job: batchv1.Job{
   817  				Spec: batchv1.JobSpec{
   818  					Parallelism:          ptr.To[int32](2),
   819  					Completions:          ptr.To[int32](2),
   820  					CompletionMode:       completionModePtr(batchv1.IndexedCompletion),
   821  					BackoffLimitPerIndex: ptr.To[int32](1),
   822  					Template:             podTemplateSpec,
   823  				},
   824  			},
   825  			podTerminations: []podTerminationWithExpectations{
   826  				{
   827  					status: v1.PodStatus{
   828  						Phase: v1.PodFailed,
   829  					},
   830  					wantActive:                     2,
   831  					wantFailed:                     1,
   832  					wantActiveIndexes:              sets.New(0, 1),
   833  					wantFailedIndexes:              ptr.To(""),
   834  					wantReplacementPodFailureCount: ptr.To(1),
   835  				},
   836  			},
   837  			wantJobConditionType: batchv1.JobComplete,
   838  			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
   839  				{
   840  					Labels: []string{"succeeded", "perIndex"},
   841  					Value:  2,
   842  				},
   843  			},
   844  		},
   845  		"job index fails due to exceeding backoff limit per index": {
   846  			job: batchv1.Job{
   847  				Spec: batchv1.JobSpec{
   848  					Parallelism:          ptr.To[int32](2),
   849  					Completions:          ptr.To[int32](2),
   850  					CompletionMode:       completionModePtr(batchv1.IndexedCompletion),
   851  					BackoffLimitPerIndex: ptr.To[int32](2),
   852  					Template:             podTemplateSpec,
   853  				},
   854  			},
   855  			podTerminations: []podTerminationWithExpectations{
   856  				{
   857  					status: v1.PodStatus{
   858  						Phase: v1.PodFailed,
   859  					},
   860  					wantActive:                     2,
   861  					wantFailed:                     1,
   862  					wantActiveIndexes:              sets.New(0, 1),
   863  					wantFailedIndexes:              ptr.To(""),
   864  					wantReplacementPodFailureCount: ptr.To(1),
   865  				},
   866  				{
   867  					status: v1.PodStatus{
   868  						Phase: v1.PodFailed,
   869  					},
   870  					wantActive:                     2,
   871  					wantFailed:                     2,
   872  					wantActiveIndexes:              sets.New(0, 1),
   873  					wantFailedIndexes:              ptr.To(""),
   874  					wantReplacementPodFailureCount: ptr.To(2),
   875  				},
   876  				{
   877  					status: v1.PodStatus{
   878  						Phase: v1.PodFailed,
   879  					},
   880  					wantActive:        1,
   881  					wantFailed:        3,
   882  					wantActiveIndexes: sets.New(1),
   883  					wantFailedIndexes: ptr.To("0"),
   884  				},
   885  			},
   886  			wantJobConditionType: batchv1.JobFailed,
   887  			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
   888  				{
   889  					Labels: []string{"failed", "perIndex"},
   890  					Value:  1,
   891  				},
   892  				{
   893  					Labels: []string{"succeeded", "perIndex"},
   894  					Value:  1,
   895  				},
   896  			},
   897  		},
   898  		"job index fails due to exceeding the global backoff limit first": {
   899  			job: batchv1.Job{
   900  				Spec: batchv1.JobSpec{
   901  					Parallelism:          ptr.To[int32](3),
   902  					Completions:          ptr.To[int32](3),
   903  					CompletionMode:       completionModePtr(batchv1.IndexedCompletion),
   904  					BackoffLimitPerIndex: ptr.To[int32](1),
   905  					BackoffLimit:         ptr.To[int32](2),
   906  					Template:             podTemplateSpec,
   907  				},
   908  			},
   909  			podTerminations: []podTerminationWithExpectations{
   910  				{
   911  					index: 0,
   912  					status: v1.PodStatus{
   913  						Phase: v1.PodFailed,
   914  					},
   915  					wantActive:        3,
   916  					wantFailed:        1,
   917  					wantActiveIndexes: sets.New(0, 1, 2),
   918  					wantFailedIndexes: ptr.To(""),
   919  				},
   920  				{
   921  					index: 1,
   922  					status: v1.PodStatus{
   923  						Phase: v1.PodFailed,
   924  					},
   925  					wantActive:        3,
   926  					wantFailed:        2,
   927  					wantActiveIndexes: sets.New(0, 1, 2),
   928  					wantFailedIndexes: ptr.To(""),
   929  				},
   930  				{
   931  					index: 2,
   932  					status: v1.PodStatus{
   933  						Phase: v1.PodFailed,
   934  					},
   935  					wantFailed:        5,
   936  					wantFailedIndexes: ptr.To(""),
   937  				},
   938  			},
   939  			wantJobConditionType: batchv1.JobFailed,
   940  			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
   941  				{
   942  					Labels: []string{"succeeded", "perIndex"},
   943  					Value:  0,
   944  				},
   945  				{
   946  					Labels: []string{"failed", "perIndex"},
   947  					Value:  0,
   948  				},
   949  			},
   950  		},
   951  		"job continues execution after a failed index, the job is marked Failed due to the failed index": {
   952  			job: batchv1.Job{
   953  				Spec: batchv1.JobSpec{
   954  					Parallelism:          ptr.To[int32](2),
   955  					Completions:          ptr.To[int32](2),
   956  					CompletionMode:       completionModePtr(batchv1.IndexedCompletion),
   957  					BackoffLimitPerIndex: ptr.To[int32](0),
   958  					Template:             podTemplateSpec,
   959  				},
   960  			},
   961  			podTerminations: []podTerminationWithExpectations{
   962  				{
   963  					index: 0,
   964  					status: v1.PodStatus{
   965  						Phase: v1.PodFailed,
   966  					},
   967  					wantActive:        1,
   968  					wantFailed:        1,
   969  					wantActiveIndexes: sets.New(1),
   970  					wantFailedIndexes: ptr.To("0"),
   971  				},
   972  				{
   973  					index: 1,
   974  					status: v1.PodStatus{
   975  						Phase: v1.PodSucceeded,
   976  					},
   977  					wantFailed:           1,
   978  					wantSucceeded:        1,
   979  					wantFailedIndexes:    ptr.To("0"),
   980  					wantCompletedIndexes: "1",
   981  				},
   982  			},
   983  			wantJobConditionType: batchv1.JobFailed,
   984  			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
   985  				{
   986  					Labels: []string{"succeeded", "perIndex"},
   987  					Value:  1,
   988  				},
   989  				{
   990  					Labels: []string{"failed", "perIndex"},
   991  					Value:  1,
   992  				},
   993  			},
   994  		},
   995  		"job execution terminated early due to exceeding max failed indexes": {
   996  			job: batchv1.Job{
   997  				Spec: batchv1.JobSpec{
   998  					Parallelism:          ptr.To[int32](3),
   999  					Completions:          ptr.To[int32](3),
  1000  					CompletionMode:       completionModePtr(batchv1.IndexedCompletion),
  1001  					BackoffLimitPerIndex: ptr.To[int32](0),
  1002  					MaxFailedIndexes:     ptr.To[int32](1),
  1003  					Template:             podTemplateSpec,
  1004  				},
  1005  			},
  1006  			podTerminations: []podTerminationWithExpectations{
  1007  				{
  1008  					index: 0,
  1009  					status: v1.PodStatus{
  1010  						Phase: v1.PodFailed,
  1011  					},
  1012  					wantActive:        2,
  1013  					wantFailed:        1,
  1014  					wantActiveIndexes: sets.New(1, 2),
  1015  					wantFailedIndexes: ptr.To("0"),
  1016  				},
  1017  				{
  1018  					index: 1,
  1019  					status: v1.PodStatus{
  1020  						Phase: v1.PodFailed,
  1021  					},
  1022  					wantActive:        0,
  1023  					wantFailed:        3,
  1024  					wantFailedIndexes: ptr.To("0,1"),
  1025  				},
  1026  			},
  1027  			wantJobConditionType: batchv1.JobFailed,
  1028  			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
  1029  				{
  1030  					Labels: []string{"failed", "perIndex"},
  1031  					Value:  2,
  1032  				},
  1033  			},
  1034  		},
  1035  		"pod failure matching pod failure policy rule with FailIndex action": {
  1036  			job: batchv1.Job{
  1037  				Spec: batchv1.JobSpec{
  1038  					Parallelism:          ptr.To[int32](2),
  1039  					Completions:          ptr.To[int32](2),
  1040  					CompletionMode:       completionModePtr(batchv1.IndexedCompletion),
  1041  					BackoffLimitPerIndex: ptr.To[int32](1),
  1042  					Template:             podTemplateSpec,
  1043  					PodFailurePolicy: &batchv1.PodFailurePolicy{
  1044  						Rules: []batchv1.PodFailurePolicyRule{
  1045  							{
  1046  								Action: batchv1.PodFailurePolicyActionFailIndex,
  1047  								OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
  1048  									Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
  1049  									Values:   []int32{13},
  1050  								},
  1051  							},
  1052  							{
  1053  								Action: batchv1.PodFailurePolicyActionFailIndex,
  1054  								OnPodConditions: []batchv1.PodFailurePolicyOnPodConditionsPattern{
  1055  									{
  1056  										Type:   v1.DisruptionTarget,
  1057  										Status: v1.ConditionTrue,
  1058  									},
  1059  								},
  1060  							},
  1061  						},
  1062  					},
  1063  				},
  1064  			},
  1065  			podTerminations: []podTerminationWithExpectations{
  1066  				{
  1067  					index: 0,
  1068  					status: v1.PodStatus{
  1069  						Phase: v1.PodFailed,
  1070  						ContainerStatuses: []v1.ContainerStatus{
  1071  							{
  1072  								State: v1.ContainerState{
  1073  									Terminated: &v1.ContainerStateTerminated{
  1074  										ExitCode: 13,
  1075  									},
  1076  								},
  1077  							},
  1078  						},
  1079  					},
  1080  					wantActive:        1,
  1081  					wantFailed:        1,
  1082  					wantActiveIndexes: sets.New(1),
  1083  					wantFailedIndexes: ptr.To("0"),
  1084  				},
  1085  				{
  1086  					index: 1,
  1087  					status: v1.PodStatus{
  1088  						Phase: v1.PodFailed,
  1089  						Conditions: []v1.PodCondition{
  1090  							{
  1091  								Type:   v1.DisruptionTarget,
  1092  								Status: v1.ConditionTrue,
  1093  							},
  1094  						},
  1095  					},
  1096  					wantFailed:        2,
  1097  					wantFailedIndexes: ptr.To("0,1"),
  1098  				},
  1099  			},
  1100  			wantJobConditionType: batchv1.JobFailed,
  1101  			wantJobFinishedIndexesTotalMetric: []metricLabelsWithValue{
  1102  				{
  1103  					Labels: []string{"failed", "perIndex"},
  1104  					Value:  2,
  1105  				},
  1106  			},
  1107  		},
  1108  	}
  1109  	for name, test := range testCases {
  1110  		t.Run(name, func(t *testing.T) {
  1111  			resetMetrics()
  1112  			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, true)()
  1113  			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, true)()
  1114  
  1115  			closeFn, restConfig, clientSet, ns := setup(t, "simple")
  1116  			defer closeFn()
  1117  			ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
  1118  			defer func() {
  1119  				cancel()
  1120  			}()
  1121  			jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &test.job)
  1122  			if err != nil {
  1123  				t.Fatalf("Error %q while creating the job %q", err, jobObj.Name)
  1124  			}
  1125  			validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
  1126  				Active:      int(*test.job.Spec.Parallelism),
  1127  				Ready:       ptr.To[int32](0),
  1128  				Terminating: ptr.To[int32](0),
  1129  			})
  1130  			for _, podTermination := range test.podTerminations {
  1131  				pod, err := getActivePodForIndex(ctx, clientSet, jobObj, podTermination.index)
  1132  				if err != nil {
  1133  					t.Fatalf("listing Job Pods: %q", err)
  1134  				}
  1135  				pod.Status = podTermination.status
  1136  				if _, err = clientSet.CoreV1().Pods(pod.Namespace).UpdateStatus(ctx, pod, metav1.UpdateOptions{}); err != nil {
  1137  					t.Fatalf("Error updating the pod %q: %q", klog.KObj(pod), err)
  1138  				}
  1139  				validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
  1140  					Active:      podTermination.wantActive,
  1141  					Succeeded:   podTermination.wantSucceeded,
  1142  					Failed:      podTermination.wantFailed,
  1143  					Ready:       ptr.To[int32](0),
  1144  					Terminating: ptr.To[int32](0),
  1145  				})
  1146  				validateIndexedJobPods(ctx, t, clientSet, jobObj, podTermination.wantActiveIndexes, podTermination.wantCompletedIndexes, podTermination.wantFailedIndexes)
  1147  				if podTermination.wantReplacementPodFailureCount != nil {
  1148  					replacement, err := getActivePodForIndex(ctx, clientSet, jobObj, podTermination.index)
  1149  					if err != nil {
  1150  						t.Fatalf("Failed to get active replacement pod for index: %v, error: %v", podTermination.index, err)
  1151  					}
  1152  					gotReplacementPodFailureCount, err := getIndexFailureCount(replacement)
  1153  					if err != nil {
  1154  						t.Fatalf("Failed read the index failure count annotation for pod: %v, error: %v", klog.KObj(replacement), err)
  1155  					}
  1156  					if *podTermination.wantReplacementPodFailureCount != gotReplacementPodFailureCount {
  1157  						t.Fatalf("Unexpected value of the index failure count annotation. Want: %v, got: %v", *podTermination.wantReplacementPodFailureCount, gotReplacementPodFailureCount)
  1158  					}
  1159  				}
  1160  			}
  1161  
  1162  			remainingActive := test.podTerminations[len(test.podTerminations)-1].wantActive
  1163  			if remainingActive > 0 {
  1164  				if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, remainingActive); err != nil {
  1165  					t.Fatalf("Failed setting phase %q on Job Pod: %q", v1.PodSucceeded, err)
  1166  				}
  1167  			}
  1168  			validateJobCondition(ctx, t, clientSet, jobObj, test.wantJobConditionType)
  1169  			for _, wantMetricValue := range test.wantJobFinishedIndexesTotalMetric {
  1170  				validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, wantMetricValue)
  1171  			}
  1172  			validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
  1173  		})
  1174  	}
  1175  }
  1176  
  1177  func getIndexFailureCount(p *v1.Pod) (int, error) {
  1178  	if p.Annotations == nil {
  1179  		return 0, errors.New("no annotations found")
  1180  	}
  1181  	v, ok := p.Annotations[batchv1.JobIndexFailureCountAnnotation]
  1182  	if !ok {
  1183  		return 0, fmt.Errorf("annotation %s not found", batchv1.JobIndexFailureCountAnnotation)
  1184  	}
  1185  	return strconv.Atoi(v)
  1186  }
  1187  
  1188  func completionModePtr(cm batchv1.CompletionMode) *batchv1.CompletionMode {
  1189  	return &cm
  1190  }
  1191  
  1192  // TestNonParallelJob tests that a Job that only executes one Pod. The test
  1193  // recreates the Job controller at some points to make sure a new controller
  1194  // is able to pickup.
  1195  func TestNonParallelJob(t *testing.T) {
  1196  	t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
  1197  	closeFn, restConfig, clientSet, ns := setup(t, "simple")
  1198  	defer closeFn()
  1199  	ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
  1200  	defer func() {
  1201  		cancel()
  1202  	}()
  1203  
  1204  	jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{})
  1205  	if err != nil {
  1206  		t.Fatalf("Failed to create Job: %v", err)
  1207  	}
  1208  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
  1209  		Active:      1,
  1210  		Ready:       ptr.To[int32](0),
  1211  		Terminating: ptr.To[int32](0),
  1212  	})
  1213  
  1214  	// Restarting controller.
  1215  	cancel()
  1216  	ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig)
  1217  
  1218  	// Failed Pod is replaced.
  1219  	if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil {
  1220  		t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err)
  1221  	}
  1222  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
  1223  		Active:      1,
  1224  		Failed:      1,
  1225  		Ready:       ptr.To[int32](0),
  1226  		Terminating: ptr.To[int32](0),
  1227  	})
  1228  	validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{
  1229  		Labels: []string{"NonIndexed", "failed"},
  1230  		Value:  1,
  1231  	})
  1232  
  1233  	// Restarting controller.
  1234  	cancel()
  1235  	ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig)
  1236  
  1237  	// No more Pods are created after the Pod succeeds.
  1238  	if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil {
  1239  		t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodSucceeded, err)
  1240  	}
  1241  	validateJobSucceeded(ctx, t, clientSet, jobObj)
  1242  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
  1243  		Failed:      1,
  1244  		Succeeded:   1,
  1245  		Ready:       ptr.To[int32](0),
  1246  		Terminating: ptr.To[int32](0),
  1247  	})
  1248  	validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
  1249  	validateCounterMetric(ctx, t, metrics.JobFinishedNum, metricLabelsWithValue{
  1250  		Labels: []string{"NonIndexed", "succeeded", ""},
  1251  		Value:  1,
  1252  	})
  1253  	validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{
  1254  		Labels: []string{"NonIndexed", "succeeded"},
  1255  		Value:  1,
  1256  	})
  1257  }
  1258  
  1259  func TestParallelJob(t *testing.T) {
  1260  	t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
  1261  	closeFn, restConfig, clientSet, ns := setup(t, "parallel")
  1262  	defer closeFn()
  1263  	ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
  1264  	defer cancel()
  1265  	resetMetrics()
  1266  
  1267  	jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
  1268  		Spec: batchv1.JobSpec{
  1269  			Parallelism: ptr.To[int32](5),
  1270  		},
  1271  	})
  1272  	if err != nil {
  1273  		t.Fatalf("Failed to create Job: %v", err)
  1274  	}
  1275  	want := podsByStatus{
  1276  		Active:      5,
  1277  		Ready:       ptr.To[int32](0),
  1278  		Terminating: ptr.To[int32](0),
  1279  	}
  1280  	validateJobPodsStatus(ctx, t, clientSet, jobObj, want)
  1281  
  1282  	// Tracks ready pods, if enabled.
  1283  	if err, _ := setJobPodsReady(ctx, clientSet, jobObj, 2); err != nil {
  1284  		t.Fatalf("Failed Marking Pods as ready: %v", err)
  1285  	}
  1286  	want.Ready = ptr.To[int32](2)
  1287  	validateJobPodsStatus(ctx, t, clientSet, jobObj, want)
  1288  
  1289  	// Failed Pods are replaced.
  1290  	if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 2); err != nil {
  1291  		t.Fatalf("Failed setting phase %s on Job Pods: %v", v1.PodFailed, err)
  1292  	}
  1293  	want = podsByStatus{
  1294  		Active:      5,
  1295  		Failed:      2,
  1296  		Ready:       ptr.To[int32](0),
  1297  		Terminating: ptr.To[int32](0),
  1298  	}
  1299  	validateJobPodsStatus(ctx, t, clientSet, jobObj, want)
  1300  	// Once one Pod succeeds, no more Pods are created, even if some fail.
  1301  	if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil {
  1302  		t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodSucceeded, err)
  1303  	}
  1304  	want = podsByStatus{
  1305  		Failed:      2,
  1306  		Succeeded:   1,
  1307  		Active:      4,
  1308  		Ready:       ptr.To[int32](0),
  1309  		Terminating: ptr.To[int32](0),
  1310  	}
  1311  	validateJobPodsStatus(ctx, t, clientSet, jobObj, want)
  1312  	if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 2); err != nil {
  1313  		t.Fatalf("Failed setting phase %s on Job Pods: %v", v1.PodFailed, err)
  1314  	}
  1315  	want = podsByStatus{
  1316  		Failed:      4,
  1317  		Succeeded:   1,
  1318  		Active:      2,
  1319  		Ready:       ptr.To[int32](0),
  1320  		Terminating: ptr.To[int32](0),
  1321  	}
  1322  	validateJobPodsStatus(ctx, t, clientSet, jobObj, want)
  1323  	// No more Pods are created after remaining Pods succeed.
  1324  	if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 2); err != nil {
  1325  		t.Fatalf("Failed setting phase %s on Job Pods: %v", v1.PodSucceeded, err)
  1326  	}
  1327  	validateJobSucceeded(ctx, t, clientSet, jobObj)
  1328  	want = podsByStatus{
  1329  		Failed:      4,
  1330  		Succeeded:   3,
  1331  		Ready:       ptr.To[int32](0),
  1332  		Terminating: ptr.To[int32](0),
  1333  	}
  1334  	validateJobPodsStatus(ctx, t, clientSet, jobObj, want)
  1335  	validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
  1336  	validateTerminatedPodsTrackingFinalizerMetric(ctx, t, 7)
  1337  	validateCounterMetric(ctx, t, metrics.JobFinishedNum, metricLabelsWithValue{
  1338  		Labels: []string{"NonIndexed", "succeeded", ""},
  1339  		Value:  1,
  1340  	})
  1341  	validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{
  1342  		Labels: []string{"NonIndexed", "succeeded"},
  1343  		Value:  3,
  1344  	})
  1345  	validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{
  1346  		Labels: []string{"NonIndexed", "failed"},
  1347  		Value:  4,
  1348  	})
  1349  }
  1350  
  1351  func TestParallelJobChangingParallelism(t *testing.T) {
  1352  	closeFn, restConfig, clientSet, ns := setup(t, "parallel")
  1353  	defer closeFn()
  1354  	ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
  1355  	defer cancel()
  1356  
  1357  	jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
  1358  		Spec: batchv1.JobSpec{
  1359  			BackoffLimit: ptr.To[int32](2),
  1360  			Parallelism:  ptr.To[int32](5),
  1361  		},
  1362  	})
  1363  	if err != nil {
  1364  		t.Fatalf("Failed to create Job: %v", err)
  1365  	}
  1366  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
  1367  		Active:      5,
  1368  		Ready:       ptr.To[int32](0),
  1369  		Terminating: ptr.To[int32](0),
  1370  	})
  1371  
  1372  	// Reduce parallelism by a number greater than backoffLimit.
  1373  	patch := []byte(`{"spec":{"parallelism":2}}`)
  1374  	jobObj, err = clientSet.BatchV1().Jobs(ns.Name).Patch(ctx, jobObj.Name, types.StrategicMergePatchType, patch, metav1.PatchOptions{})
  1375  	if err != nil {
  1376  		t.Fatalf("Updating Job: %v", err)
  1377  	}
  1378  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
  1379  		Active:      2,
  1380  		Ready:       ptr.To[int32](0),
  1381  		Terminating: ptr.To[int32](0),
  1382  	})
  1383  
  1384  	// Increase parallelism again.
  1385  	patch = []byte(`{"spec":{"parallelism":4}}`)
  1386  	jobObj, err = clientSet.BatchV1().Jobs(ns.Name).Patch(ctx, jobObj.Name, types.StrategicMergePatchType, patch, metav1.PatchOptions{})
  1387  	if err != nil {
  1388  		t.Fatalf("Updating Job: %v", err)
  1389  	}
  1390  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
  1391  		Active:      4,
  1392  		Ready:       ptr.To[int32](0),
  1393  		Terminating: ptr.To[int32](0),
  1394  	})
  1395  
  1396  	// Succeed Job
  1397  	if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 4); err != nil {
  1398  		t.Fatalf("Failed setting phase %s on Job Pods: %v", v1.PodFailed, err)
  1399  	}
  1400  	validateJobSucceeded(ctx, t, clientSet, jobObj)
  1401  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
  1402  		Succeeded:   4,
  1403  		Ready:       ptr.To[int32](0),
  1404  		Terminating: ptr.To[int32](0),
  1405  	})
  1406  	validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
  1407  }
  1408  
  1409  func TestParallelJobWithCompletions(t *testing.T) {
  1410  	// Lower limits for a job sync so that we can test partial updates with a low
  1411  	// number of pods.
  1412  	t.Cleanup(setDuringTest(&jobcontroller.MaxUncountedPods, 10))
  1413  	t.Cleanup(setDuringTest(&jobcontroller.MaxPodCreateDeletePerSync, 10))
  1414  	t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
  1415  	closeFn, restConfig, clientSet, ns := setup(t, "completions")
  1416  	defer closeFn()
  1417  	ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
  1418  	defer cancel()
  1419  	resetMetrics()
  1420  
  1421  	jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
  1422  		Spec: batchv1.JobSpec{
  1423  			Parallelism: ptr.To[int32](54),
  1424  			Completions: ptr.To[int32](56),
  1425  		},
  1426  	})
  1427  	if err != nil {
  1428  		t.Fatalf("Failed to create Job: %v", err)
  1429  	}
  1430  	want := podsByStatus{
  1431  		Active:      54,
  1432  		Ready:       ptr.To[int32](0),
  1433  		Terminating: ptr.To[int32](0),
  1434  	}
  1435  	validateJobPodsStatus(ctx, t, clientSet, jobObj, want)
  1436  	// Tracks ready pods, if enabled.
  1437  	if err, _ := setJobPodsReady(ctx, clientSet, jobObj, 52); err != nil {
  1438  		t.Fatalf("Failed Marking Pods as ready: %v", err)
  1439  	}
  1440  	want.Ready = ptr.To[int32](52)
  1441  	validateJobPodsStatus(ctx, t, clientSet, jobObj, want)
  1442  
  1443  	// Failed Pods are replaced.
  1444  	if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 2); err != nil {
  1445  		t.Fatalf("Failed setting phase %s on Job Pods: %v", v1.PodFailed, err)
  1446  	}
  1447  	want = podsByStatus{
  1448  		Active:      54,
  1449  		Failed:      2,
  1450  		Ready:       ptr.To[int32](50),
  1451  		Terminating: ptr.To[int32](0),
  1452  	}
  1453  	validateJobPodsStatus(ctx, t, clientSet, jobObj, want)
  1454  	// Pods are created until the number of succeeded Pods equals completions.
  1455  	if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 53); err != nil {
  1456  		t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodSucceeded, err)
  1457  	}
  1458  	want = podsByStatus{
  1459  		Failed:      2,
  1460  		Succeeded:   53,
  1461  		Active:      3,
  1462  		Ready:       ptr.To[int32](0),
  1463  		Terminating: ptr.To[int32](0),
  1464  	}
  1465  	validateJobPodsStatus(ctx, t, clientSet, jobObj, want)
  1466  	// No more Pods are created after the Job completes.
  1467  	if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 3); err != nil {
  1468  		t.Fatalf("Failed setting phase %s on Job Pods: %v", v1.PodSucceeded, err)
  1469  	}
  1470  	validateJobSucceeded(ctx, t, clientSet, jobObj)
  1471  	want = podsByStatus{
  1472  		Failed:      2,
  1473  		Succeeded:   56,
  1474  		Ready:       ptr.To[int32](0),
  1475  		Terminating: ptr.To[int32](0),
  1476  	}
  1477  	validateJobPodsStatus(ctx, t, clientSet, jobObj, want)
  1478  	validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
  1479  	validateCounterMetric(ctx, t, metrics.JobFinishedNum, metricLabelsWithValue{
  1480  		Labels: []string{"NonIndexed", "succeeded", ""},
  1481  		Value:  1,
  1482  	})
  1483  	validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{
  1484  		Labels: []string{"NonIndexed", "succeeded"},
  1485  		Value:  56,
  1486  	})
  1487  	validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{
  1488  		Labels: []string{"NonIndexed", "failed"},
  1489  		Value:  2,
  1490  	})
  1491  }
  1492  
  1493  func TestIndexedJob(t *testing.T) {
  1494  	t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
  1495  	closeFn, restConfig, clientSet, ns := setup(t, "indexed")
  1496  	defer closeFn()
  1497  	ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
  1498  	defer cancel()
  1499  	resetMetrics()
  1500  
  1501  	mode := batchv1.IndexedCompletion
  1502  	jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
  1503  		Spec: batchv1.JobSpec{
  1504  			Parallelism:    ptr.To[int32](3),
  1505  			Completions:    ptr.To[int32](4),
  1506  			CompletionMode: &mode,
  1507  		},
  1508  	})
  1509  	if err != nil {
  1510  		t.Fatalf("Failed to create Job: %v", err)
  1511  	}
  1512  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
  1513  		Active:      3,
  1514  		Ready:       ptr.To[int32](0),
  1515  		Terminating: ptr.To[int32](0),
  1516  	})
  1517  	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 1, 2), "", nil)
  1518  	validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{
  1519  		Labels: []string{"succeeded", "global"},
  1520  		Value:  0,
  1521  	})
  1522  
  1523  	// One Pod succeeds.
  1524  	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, 1); err != nil {
  1525  		t.Fatal("Failed trying to succeed pod with index 1")
  1526  	}
  1527  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
  1528  		Active:      3,
  1529  		Succeeded:   1,
  1530  		Ready:       ptr.To[int32](0),
  1531  		Terminating: ptr.To[int32](0),
  1532  	})
  1533  	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 2, 3), "1", nil)
  1534  	validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{
  1535  		Labels: []string{"succeeded", "global"},
  1536  		Value:  1,
  1537  	})
  1538  
  1539  	// One Pod fails, which should be recreated.
  1540  	if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, 2); err != nil {
  1541  		t.Fatal("Failed trying to succeed pod with index 2")
  1542  	}
  1543  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
  1544  		Active:      3,
  1545  		Failed:      1,
  1546  		Succeeded:   1,
  1547  		Ready:       ptr.To[int32](0),
  1548  		Terminating: ptr.To[int32](0),
  1549  	})
  1550  	validateIndexedJobPods(ctx, t, clientSet, jobObj, sets.New(0, 2, 3), "1", nil)
  1551  	validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{
  1552  		Labels: []string{"succeeded", "global"},
  1553  		Value:  1,
  1554  	})
  1555  
  1556  	// Remaining Pods succeed.
  1557  	if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, 3); err != nil {
  1558  		t.Fatal("Failed trying to succeed remaining pods")
  1559  	}
  1560  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
  1561  		Active:      0,
  1562  		Failed:      1,
  1563  		Succeeded:   4,
  1564  		Ready:       ptr.To[int32](0),
  1565  		Terminating: ptr.To[int32](0),
  1566  	})
  1567  	validateIndexedJobPods(ctx, t, clientSet, jobObj, nil, "0-3", nil)
  1568  	validateJobSucceeded(ctx, t, clientSet, jobObj)
  1569  	validateFinishedPodsNoFinalizer(ctx, t, clientSet, jobObj)
  1570  	validateTerminatedPodsTrackingFinalizerMetric(ctx, t, 5)
  1571  	validateCounterMetric(ctx, t, metrics.JobFinishedIndexesTotal, metricLabelsWithValue{
  1572  		Labels: []string{"succeeded", "global"},
  1573  		Value:  4,
  1574  	})
  1575  	validateCounterMetric(ctx, t, metrics.JobFinishedNum, metricLabelsWithValue{
  1576  		Labels: []string{"Indexed", "succeeded", ""},
  1577  		Value:  1,
  1578  	})
  1579  	validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{
  1580  		Labels: []string{"Indexed", "succeeded"},
  1581  		Value:  4,
  1582  	})
  1583  	validateCounterMetric(ctx, t, metrics.JobPodsFinished, metricLabelsWithValue{
  1584  		Labels: []string{"Indexed", "failed"},
  1585  		Value:  1,
  1586  	})
  1587  }
  1588  
  1589  func TestJobPodReplacementPolicy(t *testing.T) {
  1590  	indexedCompletion := batchv1.IndexedCompletion
  1591  	nonIndexedCompletion := batchv1.NonIndexedCompletion
  1592  	var podReplacementPolicy = func(obj batchv1.PodReplacementPolicy) *batchv1.PodReplacementPolicy {
  1593  		return &obj
  1594  	}
  1595  	type jobStatus struct {
  1596  		active      int
  1597  		failed      int
  1598  		terminating *int32
  1599  	}
  1600  	type jobPodsCreationMetrics struct {
  1601  		new                         int
  1602  		recreateTerminatingOrFailed int
  1603  		recreateFailed              int
  1604  	}
  1605  	cases := map[string]struct {
  1606  		podReplacementPolicyEnabled bool
  1607  		jobSpec                     *batchv1.JobSpec
  1608  		wantStatusAfterDeletion     jobStatus
  1609  		wantStatusAfterFailure      jobStatus
  1610  		wantMetrics                 jobPodsCreationMetrics
  1611  	}{
  1612  		"feature flag off, delete & fail pods, recreate terminating pods, and verify job status counters": {
  1613  			jobSpec: &batchv1.JobSpec{
  1614  				Parallelism:    ptr.To[int32](2),
  1615  				Completions:    ptr.To[int32](2),
  1616  				CompletionMode: &indexedCompletion,
  1617  				Template: v1.PodTemplateSpec{
  1618  					ObjectMeta: metav1.ObjectMeta{
  1619  						Finalizers: []string{"fake.example.com/blockDeletion"},
  1620  					},
  1621  				},
  1622  			},
  1623  			wantStatusAfterDeletion: jobStatus{
  1624  				active: 2,
  1625  				failed: 2,
  1626  			},
  1627  			wantStatusAfterFailure: jobStatus{
  1628  				active: 2,
  1629  				failed: 2,
  1630  			},
  1631  			wantMetrics: jobPodsCreationMetrics{
  1632  				new: 4,
  1633  			},
  1634  		},
  1635  		"feature flag true, TerminatingOrFailed policy, delete & fail pods, recreate terminating pods, and verify job status counters": {
  1636  			podReplacementPolicyEnabled: true,
  1637  			jobSpec: &batchv1.JobSpec{
  1638  				Parallelism:          ptr.To[int32](2),
  1639  				Completions:          ptr.To[int32](2),
  1640  				CompletionMode:       &indexedCompletion,
  1641  				PodReplacementPolicy: podReplacementPolicy(batchv1.TerminatingOrFailed),
  1642  				Template: v1.PodTemplateSpec{
  1643  					ObjectMeta: metav1.ObjectMeta{
  1644  						Finalizers: []string{"fake.example.com/blockDeletion"},
  1645  					},
  1646  				},
  1647  			},
  1648  			wantStatusAfterDeletion: jobStatus{
  1649  				active:      2,
  1650  				failed:      2,
  1651  				terminating: ptr.To[int32](2),
  1652  			},
  1653  			wantStatusAfterFailure: jobStatus{
  1654  				active:      2,
  1655  				failed:      2,
  1656  				terminating: ptr.To[int32](0),
  1657  			},
  1658  			wantMetrics: jobPodsCreationMetrics{
  1659  				new:                         2,
  1660  				recreateTerminatingOrFailed: 2,
  1661  			},
  1662  		},
  1663  		"feature flag true with NonIndexedJob, TerminatingOrFailed policy, delete & fail pods, recreate terminating pods, and verify job status counters": {
  1664  			podReplacementPolicyEnabled: true,
  1665  			jobSpec: &batchv1.JobSpec{
  1666  				Parallelism:          ptr.To[int32](2),
  1667  				Completions:          ptr.To[int32](2),
  1668  				CompletionMode:       &indexedCompletion,
  1669  				PodReplacementPolicy: podReplacementPolicy(batchv1.TerminatingOrFailed),
  1670  				Template: v1.PodTemplateSpec{
  1671  					ObjectMeta: metav1.ObjectMeta{
  1672  						Finalizers: []string{"fake.example.com/blockDeletion"},
  1673  					},
  1674  				},
  1675  			},
  1676  			wantStatusAfterDeletion: jobStatus{
  1677  				active:      2,
  1678  				failed:      2,
  1679  				terminating: ptr.To[int32](2),
  1680  			},
  1681  			wantStatusAfterFailure: jobStatus{
  1682  				active:      2,
  1683  				failed:      2,
  1684  				terminating: ptr.To[int32](0),
  1685  			},
  1686  			wantMetrics: jobPodsCreationMetrics{
  1687  				new:                         2,
  1688  				recreateTerminatingOrFailed: 2,
  1689  			},
  1690  		},
  1691  		"feature flag false, podFailurePolicy enabled, delete & fail pods, recreate failed pods, and verify job status counters": {
  1692  			podReplacementPolicyEnabled: false,
  1693  			jobSpec: &batchv1.JobSpec{
  1694  				Parallelism:          ptr.To[int32](2),
  1695  				Completions:          ptr.To[int32](2),
  1696  				CompletionMode:       &nonIndexedCompletion,
  1697  				PodReplacementPolicy: podReplacementPolicy(batchv1.Failed),
  1698  				Template: v1.PodTemplateSpec{
  1699  					ObjectMeta: metav1.ObjectMeta{
  1700  						Finalizers: []string{"fake.example.com/blockDeletion"},
  1701  					},
  1702  				},
  1703  				PodFailurePolicy: &batchv1.PodFailurePolicy{
  1704  					Rules: []batchv1.PodFailurePolicyRule{
  1705  						{
  1706  							Action: batchv1.PodFailurePolicyActionFailJob,
  1707  							OnExitCodes: &batchv1.PodFailurePolicyOnExitCodesRequirement{
  1708  								Operator: batchv1.PodFailurePolicyOnExitCodesOpIn,
  1709  								Values:   []int32{5},
  1710  							},
  1711  						},
  1712  					},
  1713  				},
  1714  			},
  1715  			wantStatusAfterDeletion: jobStatus{
  1716  				active: 2,
  1717  			},
  1718  			wantStatusAfterFailure: jobStatus{
  1719  				active: 2,
  1720  			},
  1721  			wantMetrics: jobPodsCreationMetrics{
  1722  				new: 2,
  1723  			},
  1724  		},
  1725  		"feature flag true, Failed policy, delete & fail pods, recreate failed pods, and verify job status counters": {
  1726  			podReplacementPolicyEnabled: true,
  1727  			jobSpec: &batchv1.JobSpec{
  1728  				Parallelism:          ptr.To[int32](2),
  1729  				Completions:          ptr.To[int32](2),
  1730  				CompletionMode:       &indexedCompletion,
  1731  				PodReplacementPolicy: podReplacementPolicy(batchv1.Failed),
  1732  				Template: v1.PodTemplateSpec{
  1733  					ObjectMeta: metav1.ObjectMeta{
  1734  						Finalizers: []string{"fake.example.com/blockDeletion"},
  1735  					},
  1736  				},
  1737  			},
  1738  			wantStatusAfterDeletion: jobStatus{
  1739  				active:      0,
  1740  				failed:      0,
  1741  				terminating: ptr.To[int32](2),
  1742  			},
  1743  			wantStatusAfterFailure: jobStatus{
  1744  				active:      2,
  1745  				failed:      2,
  1746  				terminating: ptr.To[int32](0),
  1747  			},
  1748  			wantMetrics: jobPodsCreationMetrics{
  1749  				new:            2,
  1750  				recreateFailed: 2,
  1751  			},
  1752  		},
  1753  		"feature flag true with NonIndexedJob, Failed policy, delete & fail pods, recreate failed pods, and verify job status counters": {
  1754  			podReplacementPolicyEnabled: true,
  1755  			jobSpec: &batchv1.JobSpec{
  1756  				Parallelism:          ptr.To[int32](2),
  1757  				Completions:          ptr.To[int32](2),
  1758  				CompletionMode:       &nonIndexedCompletion,
  1759  				PodReplacementPolicy: podReplacementPolicy(batchv1.Failed),
  1760  				Template: v1.PodTemplateSpec{
  1761  					ObjectMeta: metav1.ObjectMeta{
  1762  						Finalizers: []string{"fake.example.com/blockDeletion"},
  1763  					},
  1764  				},
  1765  			},
  1766  			wantStatusAfterDeletion: jobStatus{
  1767  				active:      0,
  1768  				failed:      0,
  1769  				terminating: ptr.To[int32](2),
  1770  			},
  1771  			wantStatusAfterFailure: jobStatus{
  1772  				active:      2,
  1773  				failed:      2,
  1774  				terminating: ptr.To[int32](0),
  1775  			},
  1776  			wantMetrics: jobPodsCreationMetrics{
  1777  				new:            2,
  1778  				recreateFailed: 2,
  1779  			},
  1780  		},
  1781  	}
  1782  	for name, tc := range cases {
  1783  		tc := tc
  1784  		t.Run(name, func(t *testing.T) {
  1785  			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodReplacementPolicy, tc.podReplacementPolicyEnabled)()
  1786  			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodFailurePolicy, tc.jobSpec.PodFailurePolicy != nil)()
  1787  
  1788  			closeFn, restConfig, clientSet, ns := setup(t, "pod-replacement-policy")
  1789  			t.Cleanup(closeFn)
  1790  			ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
  1791  			t.Cleanup(cancel)
  1792  			resetMetrics()
  1793  
  1794  			jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
  1795  				Spec: *tc.jobSpec,
  1796  			})
  1797  			if err != nil {
  1798  				t.Fatalf("Failed to create Job: %v", err)
  1799  			}
  1800  			jobClient := clientSet.BatchV1().Jobs(jobObj.Namespace)
  1801  
  1802  			waitForPodsToBeActive(ctx, t, jobClient, 2, jobObj)
  1803  			t.Cleanup(func() { removePodsFinalizer(ctx, t, clientSet, ns.Name) })
  1804  
  1805  			deletePods(ctx, t, clientSet, ns.Name)
  1806  
  1807  			validateJobsPodsStatusOnly(ctx, t, clientSet, jobObj, podsByStatus{
  1808  				Terminating: tc.wantStatusAfterDeletion.terminating,
  1809  				Failed:      tc.wantStatusAfterDeletion.failed,
  1810  				Active:      tc.wantStatusAfterDeletion.active,
  1811  				Ready:       ptr.To[int32](0),
  1812  			})
  1813  
  1814  			failTerminatingPods(ctx, t, clientSet, ns.Name)
  1815  			validateJobsPodsStatusOnly(ctx, t, clientSet, jobObj, podsByStatus{
  1816  				Terminating: tc.wantStatusAfterFailure.terminating,
  1817  				Failed:      tc.wantStatusAfterFailure.failed,
  1818  				Active:      tc.wantStatusAfterFailure.active,
  1819  				Ready:       ptr.To[int32](0),
  1820  			})
  1821  
  1822  			validateCounterMetric(
  1823  				ctx,
  1824  				t,
  1825  				metrics.JobPodsCreationTotal,
  1826  				metricLabelsWithValue{Labels: []string{"new", "succeeded"}, Value: tc.wantMetrics.new},
  1827  			)
  1828  			validateCounterMetric(
  1829  				ctx,
  1830  				t,
  1831  				metrics.JobPodsCreationTotal,
  1832  				metricLabelsWithValue{Labels: []string{"recreate_terminating_or_failed", "succeeded"}, Value: tc.wantMetrics.recreateTerminatingOrFailed},
  1833  			)
  1834  			validateCounterMetric(
  1835  				ctx,
  1836  				t,
  1837  				metrics.JobPodsCreationTotal,
  1838  				metricLabelsWithValue{Labels: []string{"recreate_failed", "succeeded"}, Value: tc.wantMetrics.recreateFailed},
  1839  			)
  1840  		})
  1841  	}
  1842  }
  1843  
  1844  // This tests the feature enable -> disable -> enable path for PodReplacementPolicy.
  1845  // We verify that Failed case works as expected when turned on.
  1846  // Disable reverts to previous behavior.
  1847  // Enabling will then match the original failed case.
  1848  func TestJobPodReplacementPolicyFeatureToggling(t *testing.T) {
  1849  	const podCount int32 = 2
  1850  	jobSpec := batchv1.JobSpec{
  1851  		Parallelism:          ptr.To(podCount),
  1852  		Completions:          ptr.To(podCount),
  1853  		CompletionMode:       ptr.To(batchv1.NonIndexedCompletion),
  1854  		PodReplacementPolicy: ptr.To(batchv1.Failed),
  1855  	}
  1856  	wantTerminating := ptr.To(podCount)
  1857  	defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodReplacementPolicy, true)()
  1858  	closeFn, restConfig, clientSet, ns := setup(t, "pod-replacement-policy")
  1859  	defer closeFn()
  1860  	ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
  1861  	defer func() {
  1862  		cancel()
  1863  	}()
  1864  	resetMetrics()
  1865  
  1866  	jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
  1867  		Spec: jobSpec,
  1868  	})
  1869  	if err != nil {
  1870  		t.Fatalf("Failed to create Job: %v", err)
  1871  	}
  1872  	jobClient := clientSet.BatchV1().Jobs(jobObj.Namespace)
  1873  
  1874  	waitForPodsToBeActive(ctx, t, jobClient, 2, jobObj)
  1875  	deletePods(ctx, t, clientSet, jobObj.Namespace)
  1876  	validateJobsPodsStatusOnly(ctx, t, clientSet, jobObj, podsByStatus{
  1877  		Terminating: wantTerminating,
  1878  		Failed:      0,
  1879  		Ready:       ptr.To[int32](0),
  1880  	})
  1881  	// Disable controller and turn feature off.
  1882  	defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodReplacementPolicy, false)()
  1883  	cancel()
  1884  	ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig)
  1885  
  1886  	validateJobsPodsStatusOnly(ctx, t, clientSet, jobObj, podsByStatus{
  1887  		Terminating: nil,
  1888  		Failed:      int(podCount),
  1889  		Ready:       ptr.To[int32](0),
  1890  		Active:      int(podCount),
  1891  	})
  1892  	// Disable the controller and turn feature on again.
  1893  	defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobPodReplacementPolicy, true)()
  1894  	cancel()
  1895  	ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig)
  1896  	waitForPodsToBeActive(ctx, t, jobClient, 2, jobObj)
  1897  	deletePods(ctx, t, clientSet, jobObj.Namespace)
  1898  
  1899  	validateJobsPodsStatusOnly(ctx, t, clientSet, jobObj, podsByStatus{
  1900  		Terminating: wantTerminating,
  1901  		Failed:      int(podCount),
  1902  		Active:      0,
  1903  		Ready:       ptr.To[int32](0),
  1904  	})
  1905  }
  1906  
  1907  func TestElasticIndexedJob(t *testing.T) {
  1908  	const initialCompletions int32 = 3
  1909  	type jobUpdate struct {
  1910  		completions          *int32
  1911  		succeedIndexes       []int
  1912  		failIndexes          []int
  1913  		wantSucceededIndexes string
  1914  		wantFailed           int
  1915  		wantRemainingIndexes sets.Set[int]
  1916  		wantActivePods       int
  1917  	}
  1918  	cases := map[string]struct {
  1919  		featureGate bool
  1920  		jobUpdates  []jobUpdate
  1921  		wantErr     *apierrors.StatusError
  1922  	}{
  1923  		"feature flag off, mutation not allowed": {
  1924  			jobUpdates: []jobUpdate{
  1925  				{
  1926  					completions: ptr.To[int32](4),
  1927  				},
  1928  			},
  1929  			wantErr: apierrors.NewInvalid(
  1930  				schema.GroupKind{Group: "batch", Kind: "Job"},
  1931  				"test-job",
  1932  				field.ErrorList{field.Invalid(field.NewPath("spec", "completions"), 4, "field is immutable")},
  1933  			),
  1934  		},
  1935  		"scale up": {
  1936  			featureGate: true,
  1937  			jobUpdates: []jobUpdate{
  1938  				{
  1939  					// Scale up completions 3->4 then succeed indexes 0-3
  1940  					completions:          ptr.To[int32](4),
  1941  					succeedIndexes:       []int{0, 1, 2, 3},
  1942  					wantSucceededIndexes: "0-3",
  1943  				},
  1944  			},
  1945  		},
  1946  		"scale down": {
  1947  			featureGate: true,
  1948  			jobUpdates: []jobUpdate{
  1949  				// First succeed index 1 and fail index 2 while completions is still original value (3).
  1950  				{
  1951  					succeedIndexes:       []int{1},
  1952  					failIndexes:          []int{2},
  1953  					wantSucceededIndexes: "1",
  1954  					wantFailed:           1,
  1955  					wantRemainingIndexes: sets.New(0, 2),
  1956  					wantActivePods:       2,
  1957  				},
  1958  				// Scale down completions 3->1, verify prev failure out of range still counts
  1959  				// but succeeded out of range does not.
  1960  				{
  1961  					completions:          ptr.To[int32](1),
  1962  					succeedIndexes:       []int{0},
  1963  					wantSucceededIndexes: "0",
  1964  					wantFailed:           1,
  1965  				},
  1966  			},
  1967  		},
  1968  		"index finishes successfully, scale down, scale up": {
  1969  			featureGate: true,
  1970  			jobUpdates: []jobUpdate{
  1971  				// First succeed index 2 while completions is still original value (3).
  1972  				{
  1973  					succeedIndexes:       []int{2},
  1974  					wantSucceededIndexes: "2",
  1975  					wantRemainingIndexes: sets.New(0, 1),
  1976  					wantActivePods:       2,
  1977  				},
  1978  				// Scale completions down 3->2 to exclude previously succeeded index.
  1979  				{
  1980  					completions:          ptr.To[int32](2),
  1981  					wantRemainingIndexes: sets.New(0, 1),
  1982  					wantActivePods:       2,
  1983  				},
  1984  				// Scale completions back up to include previously succeeded index that was temporarily out of range.
  1985  				{
  1986  					completions:          ptr.To[int32](3),
  1987  					succeedIndexes:       []int{0, 1, 2},
  1988  					wantSucceededIndexes: "0-2",
  1989  				},
  1990  			},
  1991  		},
  1992  		"scale down to 0, verify that the job succeeds": {
  1993  			featureGate: true,
  1994  			jobUpdates: []jobUpdate{
  1995  				{
  1996  					completions: ptr.To[int32](0),
  1997  				},
  1998  			},
  1999  		},
  2000  	}
  2001  
  2002  	for name, tc := range cases {
  2003  		tc := tc
  2004  		t.Run(name, func(t *testing.T) {
  2005  			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.ElasticIndexedJob, tc.featureGate)()
  2006  			closeFn, restConfig, clientSet, ns := setup(t, "indexed")
  2007  			defer closeFn()
  2008  			ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
  2009  			defer cancel()
  2010  			resetMetrics()
  2011  
  2012  			// Set up initial Job in Indexed completion mode.
  2013  			mode := batchv1.IndexedCompletion
  2014  			jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
  2015  				Spec: batchv1.JobSpec{
  2016  					Parallelism:    ptr.To(initialCompletions),
  2017  					Completions:    ptr.To(initialCompletions),
  2018  					CompletionMode: &mode,
  2019  				},
  2020  			})
  2021  			if err != nil {
  2022  				t.Fatalf("Failed to create Job: %v", err)
  2023  			}
  2024  			jobClient := clientSet.BatchV1().Jobs(jobObj.Namespace)
  2025  
  2026  			// Wait for pods to start up.
  2027  			err = wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, wait.ForeverTestTimeout, true, func(ctx context.Context) (done bool, err error) {
  2028  				job, err := jobClient.Get(ctx, jobObj.Name, metav1.GetOptions{})
  2029  				if err != nil {
  2030  					return false, err
  2031  				}
  2032  				if job.Status.Active == initialCompletions {
  2033  					return true, nil
  2034  				}
  2035  				return false, nil
  2036  			})
  2037  			if err != nil {
  2038  				t.Fatalf("Error waiting for Job pods to become active: %v", err)
  2039  			}
  2040  
  2041  			for _, update := range tc.jobUpdates {
  2042  				// Update Job spec if necessary.
  2043  				if update.completions != nil {
  2044  					if jobObj, err = updateJob(ctx, jobClient, jobObj.Name, func(j *batchv1.Job) {
  2045  						j.Spec.Completions = update.completions
  2046  						j.Spec.Parallelism = update.completions
  2047  					}); err != nil {
  2048  						if diff := cmp.Diff(tc.wantErr, err); diff != "" {
  2049  							t.Fatalf("Unexpected or missing errors (-want/+got): %s", diff)
  2050  						}
  2051  						return
  2052  					}
  2053  				}
  2054  
  2055  				// Succeed specified indexes.
  2056  				for _, idx := range update.succeedIndexes {
  2057  					if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodSucceeded, idx); err != nil {
  2058  						t.Fatalf("Failed trying to succeed pod with index %d: %v", idx, err)
  2059  					}
  2060  				}
  2061  
  2062  				// Fail specified indexes.
  2063  				for _, idx := range update.failIndexes {
  2064  					if err := setJobPhaseForIndex(ctx, clientSet, jobObj, v1.PodFailed, idx); err != nil {
  2065  						t.Fatalf("Failed trying to fail pod with index %d: %v", idx, err)
  2066  					}
  2067  				}
  2068  
  2069  				validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
  2070  					Active:      update.wantActivePods,
  2071  					Succeeded:   len(update.succeedIndexes),
  2072  					Failed:      update.wantFailed,
  2073  					Ready:       ptr.To[int32](0),
  2074  					Terminating: ptr.To[int32](0),
  2075  				})
  2076  				validateIndexedJobPods(ctx, t, clientSet, jobObj, update.wantRemainingIndexes, update.wantSucceededIndexes, nil)
  2077  			}
  2078  
  2079  			validateJobSucceeded(ctx, t, clientSet, jobObj)
  2080  		})
  2081  	}
  2082  }
  2083  
  2084  // BenchmarkLargeIndexedJob benchmarks the completion of an Indexed Job.
  2085  // We expect that large jobs are more commonly used as Indexed. And they are
  2086  // also faster to track, as they need less API calls.
  2087  func BenchmarkLargeIndexedJob(b *testing.B) {
  2088  	closeFn, restConfig, clientSet, ns := setup(b, "indexed")
  2089  	restConfig.QPS = 100
  2090  	restConfig.Burst = 100
  2091  	defer closeFn()
  2092  	ctx, cancel := startJobControllerAndWaitForCaches(b, restConfig)
  2093  	defer cancel()
  2094  	backoff := wait.Backoff{
  2095  		Duration: time.Second,
  2096  		Factor:   1.5,
  2097  		Steps:    30,
  2098  		Cap:      5 * time.Minute,
  2099  	}
  2100  	cases := map[string]struct {
  2101  		nPods                int32
  2102  		backoffLimitPerIndex *int32
  2103  	}{
  2104  		"regular indexed job without failures; size=10": {
  2105  			nPods: 10,
  2106  		},
  2107  		"job with backoffLimitPerIndex without failures; size=10": {
  2108  			nPods:                10,
  2109  			backoffLimitPerIndex: ptr.To[int32](1),
  2110  		},
  2111  		"regular indexed job without failures; size=100": {
  2112  			nPods: 100,
  2113  		},
  2114  		"job with backoffLimitPerIndex without failures; size=100": {
  2115  			nPods:                100,
  2116  			backoffLimitPerIndex: ptr.To[int32](1),
  2117  		},
  2118  	}
  2119  	mode := batchv1.IndexedCompletion
  2120  	for name, tc := range cases {
  2121  		b.Run(name, func(b *testing.B) {
  2122  			enableJobBackoffLimitPerIndex := tc.backoffLimitPerIndex != nil
  2123  			defer featuregatetesting.SetFeatureGateDuringTest(b, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, enableJobBackoffLimitPerIndex)()
  2124  			b.ResetTimer()
  2125  			for n := 0; n < b.N; n++ {
  2126  				b.StartTimer()
  2127  				jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
  2128  					ObjectMeta: metav1.ObjectMeta{
  2129  						Name: fmt.Sprintf("npods-%d-%d-%v", tc.nPods, n, enableJobBackoffLimitPerIndex),
  2130  					},
  2131  					Spec: batchv1.JobSpec{
  2132  						Parallelism:          ptr.To(tc.nPods),
  2133  						Completions:          ptr.To(tc.nPods),
  2134  						CompletionMode:       &mode,
  2135  						BackoffLimitPerIndex: tc.backoffLimitPerIndex,
  2136  					},
  2137  				})
  2138  				if err != nil {
  2139  					b.Fatalf("Failed to create Job: %v", err)
  2140  				}
  2141  				b.Cleanup(func() {
  2142  					if err := cleanUp(ctx, clientSet, jobObj); err != nil {
  2143  						b.Fatalf("Failed cleanup: %v", err)
  2144  					}
  2145  				})
  2146  				remaining := int(tc.nPods)
  2147  				if err := wait.ExponentialBackoff(backoff, func() (done bool, err error) {
  2148  					if err, succ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, remaining); err != nil {
  2149  						remaining -= succ
  2150  						b.Logf("Transient failure succeeding pods: %v", err)
  2151  						return false, nil
  2152  					}
  2153  					return true, nil
  2154  				}); err != nil {
  2155  					b.Fatalf("Could not succeed the remaining %d pods: %v", remaining, err)
  2156  				}
  2157  				validateJobSucceeded(ctx, b, clientSet, jobObj)
  2158  				b.StopTimer()
  2159  			}
  2160  		})
  2161  	}
  2162  }
  2163  
  2164  // BenchmarkLargeFailureHandling benchmarks the handling of numerous pod failures
  2165  // of an Indexed Job. We set minimal backoff delay to make the job controller
  2166  // performance comparable for indexed jobs with global backoffLimit, and those
  2167  // with backoffLimit per-index, despite different patterns of handling failures.
  2168  func BenchmarkLargeFailureHandling(b *testing.B) {
  2169  	b.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
  2170  	b.Cleanup(setDurationDuringTest(&jobcontroller.MaxJobPodFailureBackOff, fastPodFailureBackoff))
  2171  	closeFn, restConfig, clientSet, ns := setup(b, "indexed")
  2172  	restConfig.QPS = 100
  2173  	restConfig.Burst = 100
  2174  	defer closeFn()
  2175  	ctx, cancel := startJobControllerAndWaitForCaches(b, restConfig)
  2176  	defer cancel()
  2177  	backoff := wait.Backoff{
  2178  		Duration: time.Second,
  2179  		Factor:   1.5,
  2180  		Steps:    30,
  2181  		Cap:      5 * time.Minute,
  2182  	}
  2183  	cases := map[string]struct {
  2184  		nPods                int32
  2185  		backoffLimitPerIndex *int32
  2186  		customTimeout        *time.Duration
  2187  	}{
  2188  		"regular indexed job with failures; size=10": {
  2189  			nPods: 10,
  2190  		},
  2191  		"job with backoffLimitPerIndex with failures; size=10": {
  2192  			nPods:                10,
  2193  			backoffLimitPerIndex: ptr.To[int32](1),
  2194  		},
  2195  		"regular indexed job with failures; size=100": {
  2196  			nPods: 100,
  2197  		},
  2198  		"job with backoffLimitPerIndex with failures; size=100": {
  2199  			nPods:                100,
  2200  			backoffLimitPerIndex: ptr.To[int32](1),
  2201  		},
  2202  	}
  2203  	mode := batchv1.IndexedCompletion
  2204  	for name, tc := range cases {
  2205  		b.Run(name, func(b *testing.B) {
  2206  			enableJobBackoffLimitPerIndex := tc.backoffLimitPerIndex != nil
  2207  			timeout := ptr.Deref(tc.customTimeout, wait.ForeverTestTimeout)
  2208  			defer featuregatetesting.SetFeatureGateDuringTest(b, feature.DefaultFeatureGate, features.JobBackoffLimitPerIndex, enableJobBackoffLimitPerIndex)()
  2209  			b.ResetTimer()
  2210  			for n := 0; n < b.N; n++ {
  2211  				b.StopTimer()
  2212  				jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
  2213  					ObjectMeta: metav1.ObjectMeta{
  2214  						Name: fmt.Sprintf("npods-%d-%d-%v", tc.nPods, n, enableJobBackoffLimitPerIndex),
  2215  					},
  2216  					Spec: batchv1.JobSpec{
  2217  						Parallelism:          ptr.To(tc.nPods),
  2218  						Completions:          ptr.To(tc.nPods),
  2219  						CompletionMode:       &mode,
  2220  						BackoffLimitPerIndex: tc.backoffLimitPerIndex,
  2221  						BackoffLimit:         ptr.To(tc.nPods),
  2222  					},
  2223  				})
  2224  				if err != nil {
  2225  					b.Fatalf("Failed to create Job: %v", err)
  2226  				}
  2227  				b.Cleanup(func() {
  2228  					if err := cleanUp(ctx, clientSet, jobObj); err != nil {
  2229  						b.Fatalf("Failed cleanup: %v", err)
  2230  					}
  2231  				})
  2232  				validateJobsPodsStatusOnlyWithTimeout(ctx, b, clientSet, jobObj, podsByStatus{
  2233  					Active:      int(tc.nPods),
  2234  					Ready:       ptr.To[int32](0),
  2235  					Terminating: ptr.To[int32](0),
  2236  				}, timeout)
  2237  
  2238  				b.StartTimer()
  2239  				remaining := int(tc.nPods)
  2240  				if err := wait.ExponentialBackoff(backoff, func() (done bool, err error) {
  2241  					if err, fail := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, remaining); err != nil {
  2242  						remaining -= fail
  2243  						b.Logf("Transient failure failing pods: %v", err)
  2244  						return false, nil
  2245  					}
  2246  					return true, nil
  2247  				}); err != nil {
  2248  					b.Fatalf("Could not succeed the remaining %d pods: %v", remaining, err)
  2249  				}
  2250  				validateJobsPodsStatusOnlyWithTimeout(ctx, b, clientSet, jobObj, podsByStatus{
  2251  					Active:      int(tc.nPods),
  2252  					Ready:       ptr.To[int32](0),
  2253  					Failed:      int(tc.nPods),
  2254  					Terminating: ptr.To[int32](0),
  2255  				}, timeout)
  2256  				b.StopTimer()
  2257  			}
  2258  		})
  2259  	}
  2260  }
  2261  
  2262  // cleanUp deletes all pods and the job
  2263  func cleanUp(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job) error {
  2264  	// Clean up pods in pages, because DeleteCollection might timeout.
  2265  	// #90743
  2266  	for {
  2267  		pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{Limit: 1})
  2268  		if err != nil {
  2269  			return err
  2270  		}
  2271  		if len(pods.Items) == 0 {
  2272  			break
  2273  		}
  2274  		err = clientSet.CoreV1().Pods(jobObj.Namespace).DeleteCollection(ctx,
  2275  			metav1.DeleteOptions{},
  2276  			metav1.ListOptions{
  2277  				Limit: 1000,
  2278  			})
  2279  		if err != nil {
  2280  			return err
  2281  		}
  2282  	}
  2283  	return clientSet.BatchV1().Jobs(jobObj.Namespace).Delete(ctx, jobObj.Name, metav1.DeleteOptions{})
  2284  }
  2285  
// TestOrphanPodsFinalizersClearedWithGC runs one subtest per deletion
// propagation policy (Orphan, Background, Foreground). Each subtest starts the
// Job controller and the GC controller on shared informers, creates a Job
// with parallelism 2, deletes it using the policy under test, and then checks
// that no pod matching the Job's selector keeps the Job tracking finalizer and
// that the terminated-pods tracking-finalizer metric stays at 0.
func TestOrphanPodsFinalizersClearedWithGC(t *testing.T) {
	for _, policy := range []metav1.DeletionPropagation{metav1.DeletePropagationOrphan, metav1.DeletePropagationBackground, metav1.DeletePropagationForeground} {
		t.Run(string(policy), func(t *testing.T) {
			closeFn, restConfig, clientSet, ns := setup(t, "simple")
			defer closeFn()
			// The informer client is constructed before the QPS/Burst overrides
			// below are applied.
			// NOTE(review): presumably AddUserAgent works on a copy of restConfig,
			// so informers are not throttled — confirm.
			informerSet := informers.NewSharedInformerFactory(clientset.NewForConfigOrDie(restclient.AddUserAgent(restConfig, "controller-informers")), 0)
			// Make the job controller significantly slower to trigger race condition.
			restConfig.QPS = 1
			restConfig.Burst = 1
			jc, ctx, cancel := createJobControllerWithSharedInformers(t, restConfig, informerSet)
			resetMetrics()
			defer cancel()
			// Raise the limits again before the config is handed, by value, to the
			// GC controller.
			restConfig.QPS = 200
			restConfig.Burst = 200
			runGC := util.CreateGCController(ctx, t, *restConfig, informerSet)
			informerSet.Start(ctx.Done())
			go jc.Run(ctx, 1)
			runGC()

			jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
				Spec: batchv1.JobSpec{
					Parallelism: ptr.To[int32](2),
				},
			})
			if err != nil {
				t.Fatalf("Failed to create Job: %v", err)
			}
			// Wait until both pods are active before deleting the Job.
			validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
				Active:      2,
				Ready:       ptr.To[int32](0),
				Terminating: ptr.To[int32](0),
			})

			// Delete Job with the propagation policy under test. The GC should
			// delete the pods in cascade.
			err = clientSet.BatchV1().Jobs(jobObj.Namespace).Delete(ctx, jobObj.Name, metav1.DeleteOptions{
				PropagationPolicy: &policy,
			})
			if err != nil {
				t.Fatalf("Failed to delete job: %v", err)
			}
			validateNoOrphanPodsWithFinalizers(ctx, t, clientSet, jobObj)
			// Pods never finished, so they are not counted in the metric.
			validateTerminatedPodsTrackingFinalizerMetric(ctx, t, 0)
		})
	}
}
  2332  
  2333  func TestFinalizersClearedWhenBackoffLimitExceeded(t *testing.T) {
  2334  	// Set a maximum number of uncounted pods below parallelism, to ensure it
  2335  	// doesn't affect the termination of pods.
  2336  	t.Cleanup(setDuringTest(&jobcontroller.MaxUncountedPods, 50))
  2337  	closeFn, restConfig, clientSet, ns := setup(t, "simple")
  2338  	defer closeFn()
  2339  	ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
  2340  	defer cancel()
  2341  
  2342  	// Job tracking with finalizers requires less calls in Indexed mode,
  2343  	// so it's more likely to process all finalizers before all the pods
  2344  	// are visible.
  2345  	mode := batchv1.IndexedCompletion
  2346  	jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
  2347  		Spec: batchv1.JobSpec{
  2348  			CompletionMode: &mode,
  2349  			Completions:    ptr.To[int32](100),
  2350  			Parallelism:    ptr.To[int32](100),
  2351  			BackoffLimit:   ptr.To[int32](0),
  2352  		},
  2353  	})
  2354  	if err != nil {
  2355  		t.Fatalf("Could not create job: %v", err)
  2356  	}
  2357  
  2358  	// Fail a pod ASAP.
  2359  	err = wait.PollUntilContextTimeout(ctx, time.Millisecond, wait.ForeverTestTimeout, true, func(ctx context.Context) (done bool, err error) {
  2360  		if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil {
  2361  			return false, nil
  2362  		}
  2363  		return true, nil
  2364  	})
  2365  	if err != nil {
  2366  		t.Fatalf("Could not fail pod: %v", err)
  2367  	}
  2368  
  2369  	validateJobFailed(ctx, t, clientSet, jobObj)
  2370  	validateCounterMetric(ctx, t, metrics.JobFinishedNum, metricLabelsWithValue{
  2371  		Labels: []string{"Indexed", "failed", "BackoffLimitExceeded"},
  2372  		Value:  1,
  2373  	})
  2374  
  2375  	validateNoOrphanPodsWithFinalizers(ctx, t, clientSet, jobObj)
  2376  }
  2377  
  2378  func TestJobPodsCreatedWithExponentialBackoff(t *testing.T) {
  2379  	t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, 2*time.Second))
  2380  	closeFn, restConfig, clientSet, ns := setup(t, "simple")
  2381  	defer closeFn()
  2382  	ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
  2383  	defer cancel()
  2384  
  2385  	jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{})
  2386  	if err != nil {
  2387  		t.Fatalf("Could not create job: %v", err)
  2388  	}
  2389  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
  2390  		Active:      1,
  2391  		Ready:       ptr.To[int32](0),
  2392  		Terminating: ptr.To[int32](0),
  2393  	})
  2394  
  2395  	// Fail the first pod
  2396  	if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil {
  2397  		t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err)
  2398  	}
  2399  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
  2400  		Active:      1,
  2401  		Ready:       ptr.To[int32](0),
  2402  		Failed:      1,
  2403  		Terminating: ptr.To[int32](0),
  2404  	})
  2405  
  2406  	// Fail the second pod
  2407  	if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil {
  2408  		t.Fatalf("Failed setting phase %s on Job Pod: %v", v1.PodFailed, err)
  2409  	}
  2410  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
  2411  		Active:      1,
  2412  		Ready:       ptr.To[int32](0),
  2413  		Failed:      2,
  2414  		Terminating: ptr.To[int32](0),
  2415  	})
  2416  
  2417  	jobPods, err := getJobPods(ctx, t, clientSet, jobObj, func(ps v1.PodStatus) bool { return true })
  2418  	if err != nil {
  2419  		t.Fatalf("Failed to list Job Pods: %v", err)
  2420  	}
  2421  	if len(jobPods) != 3 {
  2422  		t.Fatalf("Expected to get %v pods, received %v", 4, len(jobPods))
  2423  	}
  2424  	validateExpotentialBackoffDelay(t, jobcontroller.DefaultJobPodFailureBackOff, jobPods)
  2425  }
  2426  
  2427  func validateExpotentialBackoffDelay(t *testing.T, defaultPodFailureBackoff time.Duration, pods []*v1.Pod) {
  2428  	t.Helper()
  2429  	creationTime := []time.Time{}
  2430  	finishTime := []time.Time{}
  2431  	for _, pod := range pods {
  2432  		creationTime = append(creationTime, pod.CreationTimestamp.Time)
  2433  		if len(pod.Status.ContainerStatuses) > 0 {
  2434  			finishTime = append(finishTime, pod.Status.ContainerStatuses[0].State.Terminated.FinishedAt.Time)
  2435  		}
  2436  	}
  2437  
  2438  	sort.Slice(creationTime, func(i, j int) bool {
  2439  		return creationTime[i].Before(creationTime[j])
  2440  	})
  2441  	sort.Slice(finishTime, func(i, j int) bool {
  2442  		return finishTime[i].Before(finishTime[j])
  2443  	})
  2444  
  2445  	diff := creationTime[1].Sub(finishTime[0])
  2446  
  2447  	if diff < defaultPodFailureBackoff {
  2448  		t.Fatalf("Second pod should be created at least %v seconds after the first pod, time difference: %v", defaultPodFailureBackoff, diff)
  2449  	}
  2450  
  2451  	if diff >= 2*defaultPodFailureBackoff {
  2452  		t.Fatalf("Second pod should be created before %v seconds after the first pod, time difference: %v", 2*defaultPodFailureBackoff, diff)
  2453  	}
  2454  
  2455  	diff = creationTime[2].Sub(finishTime[1])
  2456  
  2457  	if diff < 2*defaultPodFailureBackoff {
  2458  		t.Fatalf("Third pod should be created at least %v seconds after the second pod, time difference: %v", 2*defaultPodFailureBackoff, diff)
  2459  	}
  2460  
  2461  	if diff >= 4*defaultPodFailureBackoff {
  2462  		t.Fatalf("Third pod should be created before %v seconds after the second pod, time difference: %v", 4*defaultPodFailureBackoff, diff)
  2463  	}
  2464  }
  2465  
  2466  // TestJobFailedWithInterrupts tests that a job were one pod fails and the rest
  2467  // succeed is marked as Failed, even if the controller fails in the middle.
  2468  func TestJobFailedWithInterrupts(t *testing.T) {
  2469  	closeFn, restConfig, clientSet, ns := setup(t, "simple")
  2470  	defer closeFn()
  2471  	ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
  2472  	defer func() {
  2473  		cancel()
  2474  	}()
  2475  	jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
  2476  		Spec: batchv1.JobSpec{
  2477  			Completions:  ptr.To[int32](10),
  2478  			Parallelism:  ptr.To[int32](10),
  2479  			BackoffLimit: ptr.To[int32](0),
  2480  			Template: v1.PodTemplateSpec{
  2481  				Spec: v1.PodSpec{
  2482  					NodeName: "foo", // Scheduled pods are not deleted immediately.
  2483  				},
  2484  			},
  2485  		},
  2486  	})
  2487  	if err != nil {
  2488  		t.Fatalf("Could not create job: %v", err)
  2489  	}
  2490  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
  2491  		Active:      10,
  2492  		Ready:       ptr.To[int32](0),
  2493  		Terminating: ptr.To[int32](0),
  2494  	})
  2495  	t.Log("Finishing pods")
  2496  	if err, _ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodFailed, 1); err != nil {
  2497  		t.Fatalf("Could not fail a pod: %v", err)
  2498  	}
  2499  	remaining := 9
  2500  	if err := wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, wait.ForeverTestTimeout, true, func(ctx context.Context) (done bool, err error) {
  2501  		if err, succ := setJobPodsPhase(ctx, clientSet, jobObj, v1.PodSucceeded, remaining); err != nil {
  2502  			remaining -= succ
  2503  			t.Logf("Transient failure succeeding pods: %v", err)
  2504  			return false, nil
  2505  		}
  2506  		return true, nil
  2507  	}); err != nil {
  2508  		t.Fatalf("Could not succeed the remaining %d pods: %v", remaining, err)
  2509  	}
  2510  	t.Log("Recreating job controller")
  2511  	cancel()
  2512  	ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig)
  2513  	validateJobCondition(ctx, t, clientSet, jobObj, batchv1.JobFailed)
  2514  }
  2515  
  2516  func validateNoOrphanPodsWithFinalizers(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job) {
  2517  	t.Helper()
  2518  	orphanPods := 0
  2519  	if err := wait.PollUntilContextTimeout(ctx, waitInterval, wait.ForeverTestTimeout, true, func(ctx context.Context) (done bool, err error) {
  2520  		pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{
  2521  			LabelSelector: metav1.FormatLabelSelector(jobObj.Spec.Selector),
  2522  		})
  2523  		if err != nil {
  2524  			return false, err
  2525  		}
  2526  		orphanPods = 0
  2527  		for _, pod := range pods.Items {
  2528  			if hasJobTrackingFinalizer(&pod) {
  2529  				orphanPods++
  2530  			}
  2531  		}
  2532  		return orphanPods == 0, nil
  2533  	}); err != nil {
  2534  		t.Errorf("Failed waiting for pods to be freed from finalizer: %v", err)
  2535  		t.Logf("Last saw %d orphan pods", orphanPods)
  2536  	}
  2537  }
  2538  
  2539  func TestOrphanPodsFinalizersClearedOnRestart(t *testing.T) {
  2540  	// Step 0: create job.
  2541  	closeFn, restConfig, clientSet, ns := setup(t, "simple")
  2542  	defer closeFn()
  2543  	ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
  2544  	defer func() {
  2545  		cancel()
  2546  	}()
  2547  
  2548  	jobObj, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
  2549  		Spec: batchv1.JobSpec{
  2550  			Parallelism: ptr.To[int32](1),
  2551  		},
  2552  	})
  2553  	if err != nil {
  2554  		t.Fatalf("Failed to create Job: %v", err)
  2555  	}
  2556  	validateJobPodsStatus(ctx, t, clientSet, jobObj, podsByStatus{
  2557  		Active:      1,
  2558  		Ready:       ptr.To[int32](0),
  2559  		Terminating: ptr.To[int32](0),
  2560  	})
  2561  
  2562  	// Step 2: Delete the Job while the controller is stopped.
  2563  	cancel()
  2564  
  2565  	err = clientSet.BatchV1().Jobs(jobObj.Namespace).Delete(context.Background(), jobObj.Name, metav1.DeleteOptions{})
  2566  	if err != nil {
  2567  		t.Fatalf("Failed to delete job: %v", err)
  2568  	}
  2569  
  2570  	// Step 3: Restart controller.
  2571  	ctx, cancel = startJobControllerAndWaitForCaches(t, restConfig)
  2572  	validateNoOrphanPodsWithFinalizers(ctx, t, clientSet, jobObj)
  2573  }
  2574  
// TestSuspendJob creates a Job with a given value of spec.suspend, flips the
// flag with an update, and after each step verifies the number of active
// pods, the status of the JobSuspended condition, and that the job controller
// emitted an event with the expected reason.
func TestSuspendJob(t *testing.T) {
	// step describes one phase of a test case: the value of spec.suspend to
	// apply and what is expected to be observed afterwards.
	type step struct {
		flag       bool
		wantActive int
		wantStatus v1.ConditionStatus
		wantReason string
	}
	testCases := []struct {
		// featureGate is only used to build the subtest name; none of the cases
		// below set it.
		featureGate bool
		create      step
		update      step
	}{
		// Exhaustively test all combinations other than trivial true->true and
		// false->false cases.
		{
			create: step{flag: false, wantActive: 2},
			update: step{flag: true, wantActive: 0, wantStatus: v1.ConditionTrue, wantReason: "Suspended"},
		},
		{
			create: step{flag: true, wantActive: 0, wantStatus: v1.ConditionTrue, wantReason: "Suspended"},
			update: step{flag: false, wantActive: 2, wantStatus: v1.ConditionFalse, wantReason: "Resumed"},
		},
	}

	for _, tc := range testCases {
		name := fmt.Sprintf("feature=%v,create=%v,update=%v", tc.featureGate, tc.create.flag, tc.update.flag)
		t.Run(name, func(t *testing.T) {
			closeFn, restConfig, clientSet, ns := setup(t, "suspend")
			defer closeFn()
			ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
			defer cancel()
			// Open the event watch before the Job exists so that events emitted
			// for it are delivered on this watch.
			events, err := clientSet.EventsV1().Events(ns.Name).Watch(ctx, metav1.ListOptions{})
			if err != nil {
				t.Fatal(err)
			}
			defer events.Stop()

			parallelism := int32(2)
			job, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
				Spec: batchv1.JobSpec{
					Parallelism: ptr.To(parallelism),
					Completions: ptr.To[int32](4),
					Suspend:     ptr.To(tc.create.flag),
				},
			})
			if err != nil {
				t.Fatalf("Failed to create Job: %v", err)
			}

			// validate checks the pod counts, refreshes the outer job variable from
			// the API server, and then checks the JobSuspended condition status and
			// waits for an event with the given reason. s names the step in
			// failure messages. An empty reason makes waitForEvent return
			// immediately.
			validate := func(s string, active int, status v1.ConditionStatus, reason string) {
				validateJobPodsStatus(ctx, t, clientSet, job, podsByStatus{
					Active:      active,
					Ready:       ptr.To[int32](0),
					Terminating: ptr.To[int32](0),
				})
				job, err = clientSet.BatchV1().Jobs(ns.Name).Get(ctx, job.Name, metav1.GetOptions{})
				if err != nil {
					t.Fatalf("Failed to get Job after %s: %v", s, err)
				}
				if got, want := getJobConditionStatus(ctx, job, batchv1.JobSuspended), status; got != want {
					t.Errorf("Unexpected Job condition %q status after %s: got %q, want %q", batchv1.JobSuspended, s, got, want)
				}
				if err := waitForEvent(ctx, events, job.UID, reason); err != nil {
					t.Errorf("Waiting for event with reason %q after %s: %v", reason, s, err)
				}
			}
			validate("create", tc.create.wantActive, tc.create.wantStatus, tc.create.wantReason)

			// Flip spec.suspend on the object refreshed by validate and push the
			// update.
			job.Spec.Suspend = ptr.To(tc.update.flag)
			job, err = clientSet.BatchV1().Jobs(ns.Name).Update(ctx, job, metav1.UpdateOptions{})
			if err != nil {
				t.Fatalf("Failed to update Job: %v", err)
			}
			validate("update", tc.update.wantActive, tc.update.wantStatus, tc.update.wantReason)
		})
	}
}
  2652  
  2653  func TestSuspendJobControllerRestart(t *testing.T) {
  2654  	closeFn, restConfig, clientSet, ns := setup(t, "suspend")
  2655  	defer closeFn()
  2656  	ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
  2657  	defer cancel()
  2658  
  2659  	job, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{
  2660  		Spec: batchv1.JobSpec{
  2661  			Parallelism: ptr.To[int32](2),
  2662  			Completions: ptr.To[int32](4),
  2663  			Suspend:     ptr.To(true),
  2664  		},
  2665  	})
  2666  	if err != nil {
  2667  		t.Fatalf("Failed to create Job: %v", err)
  2668  	}
  2669  	validateJobPodsStatus(ctx, t, clientSet, job, podsByStatus{
  2670  		Active:      0,
  2671  		Ready:       ptr.To[int32](0),
  2672  		Terminating: ptr.To[int32](0),
  2673  	})
  2674  }
  2675  
// TestNodeSelectorUpdate verifies that the pod template's node selector can be
// changed in the same update that unsuspends a Job, that the created pod
// carries that selector, and that a later change to the selector of the
// unsuspended Job is rejected with a "spec.template: Invalid value" error.
func TestNodeSelectorUpdate(t *testing.T) {
	closeFn, restConfig, clientSet, ns := setup(t, "suspend")
	defer closeFn()
	ctx, cancel := startJobControllerAndWaitForCaches(t, restConfig)
	defer cancel()

	// Start from a suspended Job.
	job, err := createJobWithDefaults(ctx, clientSet, ns.Name, &batchv1.Job{Spec: batchv1.JobSpec{
		Parallelism: ptr.To[int32](1),
		Suspend:     ptr.To(true),
	}})
	if err != nil {
		t.Fatalf("Failed to create Job: %v", err)
	}
	jobName := job.Name
	jobNamespace := job.Namespace
	jobClient := clientSet.BatchV1().Jobs(jobNamespace)

	// (1) Unsuspend and set node selector in the same update.
	nodeSelector := map[string]string{"foo": "bar"}
	if _, err := updateJob(ctx, jobClient, jobName, func(j *batchv1.Job) {
		j.Spec.Template.Spec.NodeSelector = nodeSelector
		j.Spec.Suspend = ptr.To(false)
	}); err != nil {
		t.Errorf("Unexpected error: %v", err)
	}

	// (2) Check that the pod was created using the expected node selector.

	// Poll until the namespace contains at least one pod and keep the first one
	// returned by the listing.
	var pod *v1.Pod
	if err := wait.PollUntilContextTimeout(ctx, waitInterval, wait.ForeverTestTimeout, true, func(ctx context.Context) (bool, error) {
		pods, err := clientSet.CoreV1().Pods(jobNamespace).List(ctx, metav1.ListOptions{})
		if err != nil {
			t.Fatalf("Failed to list Job Pods: %v", err)
		}
		if len(pods.Items) == 0 {
			return false, nil
		}
		pod = &pods.Items[0]
		return true, nil
	}); err != nil || pod == nil {
		t.Fatalf("pod not found: %v", err)
	}

	// The job should now be unsuspended and the pod should carry the node
	// selector that was set in step (1).
	if diff := cmp.Diff(nodeSelector, pod.Spec.NodeSelector); diff != "" {
		t.Errorf("Unexpected nodeSelector (-want,+got):\n%s", diff)
	}

	// (3) Update node selector again. It should fail since the job is unsuspended.
	_, err = updateJob(ctx, jobClient, jobName, func(j *batchv1.Job) {
		j.Spec.Template.Spec.NodeSelector = map[string]string{"foo": "baz"}
	})

	if err == nil || !strings.Contains(err.Error(), "spec.template: Invalid value") {
		t.Errorf("Expected \"spec.template: Invalid value\" error, got: %v", err)
	}

}
  2735  
// podsByStatus is the set of pod counters that the validation helpers compare
// against a Job's status. Active, Failed and Succeeded are plain counts
// converted from the corresponding Job status fields; Ready and Terminating
// are pointers copied as-is from the Job status, so an unset status field
// (nil) is distinct from a pointer to zero when compared.
type podsByStatus struct {
	Active      int
	Ready       *int32
	Failed      int
	Succeeded   int
	Terminating *int32
}
  2743  
  2744  func validateJobsPodsStatusOnly(ctx context.Context, t testing.TB, clientSet clientset.Interface, jobObj *batchv1.Job, desired podsByStatus) {
  2745  	t.Helper()
  2746  	validateJobsPodsStatusOnlyWithTimeout(ctx, t, clientSet, jobObj, desired, wait.ForeverTestTimeout)
  2747  }
  2748  
  2749  func validateJobsPodsStatusOnlyWithTimeout(ctx context.Context, t testing.TB, clientSet clientset.Interface, jobObj *batchv1.Job, desired podsByStatus, timeout time.Duration) {
  2750  	t.Helper()
  2751  	var actualCounts podsByStatus
  2752  	if err := wait.PollUntilContextTimeout(ctx, waitInterval, timeout, true, func(ctx context.Context) (bool, error) {
  2753  		updatedJob, err := clientSet.BatchV1().Jobs(jobObj.Namespace).Get(ctx, jobObj.Name, metav1.GetOptions{})
  2754  		if err != nil {
  2755  			t.Fatalf("Failed to get updated Job: %v", err)
  2756  		}
  2757  		actualCounts = podsByStatus{
  2758  			Active:      int(updatedJob.Status.Active),
  2759  			Ready:       updatedJob.Status.Ready,
  2760  			Succeeded:   int(updatedJob.Status.Succeeded),
  2761  			Failed:      int(updatedJob.Status.Failed),
  2762  			Terminating: updatedJob.Status.Terminating,
  2763  		}
  2764  		return cmp.Equal(actualCounts, desired), nil
  2765  	}); err != nil {
  2766  		diff := cmp.Diff(desired, actualCounts)
  2767  		t.Errorf("Waiting for Job Status: %v\nPods (-want,+got):\n%s", err, diff)
  2768  	}
  2769  }
  2770  
  2771  func validateJobPodsStatus(ctx context.Context, t testing.TB, clientSet clientset.Interface, jobObj *batchv1.Job, desired podsByStatus) {
  2772  	t.Helper()
  2773  	validateJobsPodsStatusOnly(ctx, t, clientSet, jobObj, desired)
  2774  	var active []*v1.Pod
  2775  	if err := wait.PollUntilContextTimeout(ctx, waitInterval, wait.ForeverTestTimeout, true, func(ctx context.Context) (bool, error) {
  2776  		pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{})
  2777  		if err != nil {
  2778  			t.Fatalf("Failed to list Job Pods: %v", err)
  2779  		}
  2780  		active = nil
  2781  		for _, pod := range pods.Items {
  2782  			phase := pod.Status.Phase
  2783  			if metav1.IsControlledBy(&pod, jobObj) && (phase == v1.PodPending || phase == v1.PodRunning) {
  2784  				p := pod
  2785  				active = append(active, &p)
  2786  			}
  2787  		}
  2788  		return len(active) == desired.Active, nil
  2789  	}); err != nil {
  2790  		if len(active) != desired.Active {
  2791  			t.Errorf("Found %d active Pods, want %d", len(active), desired.Active)
  2792  		}
  2793  	}
  2794  	for _, p := range active {
  2795  		if !hasJobTrackingFinalizer(p) {
  2796  			t.Errorf("Active pod %s doesn't have tracking finalizer", p.Name)
  2797  		}
  2798  	}
  2799  }
  2800  
  2801  func getJobPods(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job, filter func(v1.PodStatus) bool) ([]*v1.Pod, error) {
  2802  	t.Helper()
  2803  	allPods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{})
  2804  	if err != nil {
  2805  		return nil, err
  2806  	}
  2807  	jobPods := make([]*v1.Pod, 0, 0)
  2808  	for _, pod := range allPods.Items {
  2809  		if metav1.IsControlledBy(&pod, jobObj) && filter(pod.Status) {
  2810  			p := pod
  2811  			jobPods = append(jobPods, &p)
  2812  		}
  2813  	}
  2814  	return jobPods, nil
  2815  }
  2816  
  2817  func validateFinishedPodsNoFinalizer(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job) {
  2818  	t.Helper()
  2819  	pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{})
  2820  	if err != nil {
  2821  		t.Fatalf("Failed to list Job Pods: %v", err)
  2822  	}
  2823  	for _, pod := range pods.Items {
  2824  		phase := pod.Status.Phase
  2825  		if metav1.IsControlledBy(&pod, jobObj) && (phase == v1.PodPending || phase == v1.PodRunning) && hasJobTrackingFinalizer(&pod) {
  2826  			t.Errorf("Finished pod %s still has a tracking finalizer", pod.Name)
  2827  		}
  2828  	}
  2829  }
  2830  
  2831  // validateIndexedJobPods validates indexes and hostname of
  2832  // active and completed Pods of an Indexed Job.
  2833  // Call after validateJobPodsStatus
  2834  func validateIndexedJobPods(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job, wantActive sets.Set[int], gotCompleted string, wantFailed *string) {
  2835  	t.Helper()
  2836  	updatedJob, err := clientSet.BatchV1().Jobs(jobObj.Namespace).Get(ctx, jobObj.Name, metav1.GetOptions{})
  2837  	if err != nil {
  2838  		t.Fatalf("Failed to get updated Job: %v", err)
  2839  	}
  2840  	if updatedJob.Status.CompletedIndexes != gotCompleted {
  2841  		t.Errorf("Got completed indexes %q, want %q", updatedJob.Status.CompletedIndexes, gotCompleted)
  2842  	}
  2843  	if diff := cmp.Diff(wantFailed, updatedJob.Status.FailedIndexes); diff != "" {
  2844  		t.Errorf("Got unexpected failed indexes: %s", diff)
  2845  	}
  2846  	pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{})
  2847  	if err != nil {
  2848  		t.Fatalf("Failed to list Job Pods: %v", err)
  2849  	}
  2850  	gotActive := sets.New[int]()
  2851  	for _, pod := range pods.Items {
  2852  		if metav1.IsControlledBy(&pod, jobObj) {
  2853  			if pod.Status.Phase == v1.PodPending || pod.Status.Phase == v1.PodRunning {
  2854  				ix, err := getCompletionIndex(&pod)
  2855  				if err != nil {
  2856  					t.Errorf("Failed getting completion index for pod %s: %v", pod.Name, err)
  2857  				} else {
  2858  					gotActive.Insert(ix)
  2859  				}
  2860  				expectedName := fmt.Sprintf("%s-%d", jobObj.Name, ix)
  2861  				if diff := cmp.Equal(expectedName, pod.Spec.Hostname); !diff {
  2862  					t.Errorf("Got pod hostname %s, want %s", pod.Spec.Hostname, expectedName)
  2863  				}
  2864  			}
  2865  		}
  2866  	}
  2867  	if wantActive == nil {
  2868  		wantActive = sets.New[int]()
  2869  	}
  2870  	if diff := cmp.Diff(sets.List(wantActive), sets.List(gotActive)); diff != "" {
  2871  		t.Errorf("Unexpected active indexes (-want,+got):\n%s", diff)
  2872  	}
  2873  }
  2874  
  2875  func waitForEvent(ctx context.Context, events watch.Interface, uid types.UID, reason string) error {
  2876  	if reason == "" {
  2877  		return nil
  2878  	}
  2879  	return wait.PollUntilContextTimeout(ctx, waitInterval, wait.ForeverTestTimeout, true, func(ctx context.Context) (bool, error) {
  2880  		for {
  2881  			var ev watch.Event
  2882  			select {
  2883  			case ev = <-events.ResultChan():
  2884  			default:
  2885  				return false, nil
  2886  			}
  2887  			e, ok := ev.Object.(*eventsv1.Event)
  2888  			if !ok {
  2889  				continue
  2890  			}
  2891  			ctrl := "job-controller"
  2892  			if (e.ReportingController == ctrl || e.DeprecatedSource.Component == ctrl) && e.Reason == reason && e.Regarding.UID == uid {
  2893  				return true, nil
  2894  			}
  2895  		}
  2896  	})
  2897  }
  2898  
  2899  func getJobConditionStatus(ctx context.Context, job *batchv1.Job, cType batchv1.JobConditionType) v1.ConditionStatus {
  2900  	for _, cond := range job.Status.Conditions {
  2901  		if cond.Type == cType {
  2902  			return cond.Status
  2903  		}
  2904  	}
  2905  	return ""
  2906  }
  2907  
  2908  func validateJobFailed(ctx context.Context, t *testing.T, clientSet clientset.Interface, jobObj *batchv1.Job) {
  2909  	t.Helper()
  2910  	validateJobCondition(ctx, t, clientSet, jobObj, batchv1.JobFailed)
  2911  }
  2912  
  2913  func validateJobSucceeded(ctx context.Context, t testing.TB, clientSet clientset.Interface, jobObj *batchv1.Job) {
  2914  	t.Helper()
  2915  	validateJobCondition(ctx, t, clientSet, jobObj, batchv1.JobComplete)
  2916  }
  2917  
  2918  func validateJobCondition(ctx context.Context, t testing.TB, clientSet clientset.Interface, jobObj *batchv1.Job, cond batchv1.JobConditionType) {
  2919  	t.Helper()
  2920  	if err := wait.PollUntilContextTimeout(ctx, waitInterval, wait.ForeverTestTimeout, true, func(ctx context.Context) (bool, error) {
  2921  		j, err := clientSet.BatchV1().Jobs(jobObj.Namespace).Get(ctx, jobObj.Name, metav1.GetOptions{})
  2922  		if err != nil {
  2923  			t.Fatalf("Failed to obtain updated Job: %v", err)
  2924  		}
  2925  		return getJobConditionStatus(ctx, j, cond) == v1.ConditionTrue, nil
  2926  	}); err != nil {
  2927  		t.Errorf("Waiting for Job to have condition %s: %v", cond, err)
  2928  	}
  2929  }
  2930  
  2931  func setJobPodsPhase(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, phase v1.PodPhase, cnt int) (error, int) {
  2932  	op := func(p *v1.Pod) bool {
  2933  		p.Status.Phase = phase
  2934  		if phase == v1.PodFailed || phase == v1.PodSucceeded {
  2935  			p.Status.ContainerStatuses = []v1.ContainerStatus{
  2936  				{
  2937  					State: v1.ContainerState{
  2938  						Terminated: &v1.ContainerStateTerminated{
  2939  							FinishedAt: metav1.Now(),
  2940  						},
  2941  					},
  2942  				},
  2943  			}
  2944  		}
  2945  		return true
  2946  	}
  2947  	return updateJobPodsStatus(ctx, clientSet, jobObj, op, cnt)
  2948  }
  2949  
  2950  func setJobPodsReady(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, cnt int) (error, int) {
  2951  	op := func(p *v1.Pod) bool {
  2952  		if podutil.IsPodReady(p) {
  2953  			return false
  2954  		}
  2955  		p.Status.Conditions = append(p.Status.Conditions, v1.PodCondition{
  2956  			Type:   v1.PodReady,
  2957  			Status: v1.ConditionTrue,
  2958  		})
  2959  		return true
  2960  	}
  2961  	return updateJobPodsStatus(ctx, clientSet, jobObj, op, cnt)
  2962  }
  2963  
  2964  func updateJobPodsStatus(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, op func(*v1.Pod) bool, cnt int) (error, int) {
  2965  	pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{})
  2966  	if err != nil {
  2967  		return fmt.Errorf("listing Job Pods: %w", err), 0
  2968  	}
  2969  	updates := make([]v1.Pod, 0, cnt)
  2970  	for _, pod := range pods.Items {
  2971  		if len(updates) == cnt {
  2972  			break
  2973  		}
  2974  		if p := pod.Status.Phase; metav1.IsControlledBy(&pod, jobObj) && p != v1.PodFailed && p != v1.PodSucceeded {
  2975  			if !op(&pod) {
  2976  				continue
  2977  			}
  2978  			updates = append(updates, pod)
  2979  		}
  2980  	}
  2981  	successful, err := updatePodStatuses(ctx, clientSet, updates)
  2982  	if successful != cnt {
  2983  		return fmt.Errorf("couldn't set phase on %d Job pods", cnt-successful), successful
  2984  	}
  2985  	return err, successful
  2986  }
  2987  
  2988  func updatePodStatuses(ctx context.Context, clientSet clientset.Interface, updates []v1.Pod) (int, error) {
  2989  	wg := sync.WaitGroup{}
  2990  	wg.Add(len(updates))
  2991  	errCh := make(chan error, len(updates))
  2992  	var updated int32
  2993  
  2994  	for _, pod := range updates {
  2995  		pod := pod
  2996  		go func() {
  2997  			_, err := clientSet.CoreV1().Pods(pod.Namespace).UpdateStatus(ctx, &pod, metav1.UpdateOptions{})
  2998  			if err != nil {
  2999  				errCh <- err
  3000  			} else {
  3001  				atomic.AddInt32(&updated, 1)
  3002  			}
  3003  			wg.Done()
  3004  		}()
  3005  	}
  3006  	wg.Wait()
  3007  
  3008  	select {
  3009  	case err := <-errCh:
  3010  		return int(updated), fmt.Errorf("updating Pod status: %w", err)
  3011  	default:
  3012  	}
  3013  	return int(updated), nil
  3014  }
  3015  
  3016  func setJobPhaseForIndex(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, phase v1.PodPhase, ix int) error {
  3017  	pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{})
  3018  	if err != nil {
  3019  		return fmt.Errorf("listing Job Pods: %w", err)
  3020  	}
  3021  	for _, pod := range pods.Items {
  3022  		if p := pod.Status.Phase; !metav1.IsControlledBy(&pod, jobObj) || p == v1.PodFailed || p == v1.PodSucceeded {
  3023  			continue
  3024  		}
  3025  		if pix, err := getCompletionIndex(&pod); err == nil && pix == ix {
  3026  			pod.Status.Phase = phase
  3027  			if phase == v1.PodFailed || phase == v1.PodSucceeded {
  3028  				pod.Status.ContainerStatuses = []v1.ContainerStatus{
  3029  					{
  3030  						State: v1.ContainerState{
  3031  							Terminated: &v1.ContainerStateTerminated{
  3032  								FinishedAt: metav1.Now(),
  3033  							},
  3034  						},
  3035  					},
  3036  				}
  3037  			}
  3038  			_, err := clientSet.CoreV1().Pods(pod.Namespace).UpdateStatus(ctx, &pod, metav1.UpdateOptions{})
  3039  			if err != nil {
  3040  				return fmt.Errorf("updating pod %s status: %w", pod.Name, err)
  3041  			}
  3042  			return nil
  3043  		}
  3044  	}
  3045  	return errors.New("no pod matching index found")
  3046  }
  3047  
  3048  func getActivePodForIndex(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, ix int) (*v1.Pod, error) {
  3049  	return getJobPodForIndex(ctx, clientSet, jobObj, ix, func(p *v1.Pod) bool {
  3050  		return !podutil.IsPodTerminal(p)
  3051  	})
  3052  }
  3053  
  3054  func getJobPodForIndex(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, ix int, filter func(*v1.Pod) bool) (*v1.Pod, error) {
  3055  	pods, err := getJobPodsForIndex(ctx, clientSet, jobObj, ix, filter)
  3056  	if err != nil {
  3057  		return nil, err
  3058  	}
  3059  	if len(pods) == 0 {
  3060  		return nil, fmt.Errorf("Pod not found for index: %v", ix)
  3061  	}
  3062  	return pods[0], nil
  3063  }
  3064  
  3065  func getJobPodsForIndex(ctx context.Context, clientSet clientset.Interface, jobObj *batchv1.Job, ix int, filter func(*v1.Pod) bool) ([]*v1.Pod, error) {
  3066  	pods, err := clientSet.CoreV1().Pods(jobObj.Namespace).List(ctx, metav1.ListOptions{})
  3067  	if err != nil {
  3068  		return nil, fmt.Errorf("listing Job Pods: %w", err)
  3069  	}
  3070  	var result []*v1.Pod
  3071  	for _, pod := range pods.Items {
  3072  		pod := pod
  3073  		if !metav1.IsControlledBy(&pod, jobObj) {
  3074  			continue
  3075  		}
  3076  		if !filter(&pod) {
  3077  			continue
  3078  		}
  3079  		if pix, err := getCompletionIndex(&pod); err == nil && pix == ix {
  3080  			result = append(result, &pod)
  3081  		}
  3082  	}
  3083  	return result, nil
  3084  }
  3085  
  3086  func getCompletionIndex(p *v1.Pod) (int, error) {
  3087  	if p.Annotations == nil {
  3088  		return 0, errors.New("no annotations found")
  3089  	}
  3090  	v, ok := p.Annotations[batchv1.JobCompletionIndexAnnotation]
  3091  	if !ok {
  3092  		return 0, fmt.Errorf("annotation %s not found", batchv1.JobCompletionIndexAnnotation)
  3093  	}
  3094  	return strconv.Atoi(v)
  3095  }
  3096  
  3097  func createJobWithDefaults(ctx context.Context, clientSet clientset.Interface, ns string, jobObj *batchv1.Job) (*batchv1.Job, error) {
  3098  	if jobObj.Name == "" {
  3099  		jobObj.Name = "test-job"
  3100  	}
  3101  	if len(jobObj.Spec.Template.Spec.Containers) == 0 {
  3102  		jobObj.Spec.Template.Spec.Containers = []v1.Container{
  3103  			{Name: "foo", Image: "bar"},
  3104  		}
  3105  	}
  3106  	if jobObj.Spec.Template.Spec.RestartPolicy == "" {
  3107  		jobObj.Spec.Template.Spec.RestartPolicy = v1.RestartPolicyNever
  3108  	}
  3109  	return clientSet.BatchV1().Jobs(ns).Create(ctx, jobObj, metav1.CreateOptions{})
  3110  }
  3111  
  3112  func setup(t testing.TB, nsBaseName string) (framework.TearDownFunc, *restclient.Config, clientset.Interface, *v1.Namespace) {
  3113  	// Disable ServiceAccount admission plugin as we don't have serviceaccount controller running.
  3114  	server := kubeapiservertesting.StartTestServerOrDie(t, nil, []string{"--disable-admission-plugins=ServiceAccount"}, framework.SharedEtcd())
  3115  
  3116  	config := restclient.CopyConfig(server.ClientConfig)
  3117  	config.QPS = 200
  3118  	config.Burst = 200
  3119  	config.Timeout = 0
  3120  	clientSet, err := clientset.NewForConfig(config)
  3121  	if err != nil {
  3122  		t.Fatalf("Error creating clientset: %v", err)
  3123  	}
  3124  
  3125  	ns := framework.CreateNamespaceOrDie(clientSet, nsBaseName, t)
  3126  	closeFn := func() {
  3127  		framework.DeleteNamespaceOrDie(clientSet, ns, t)
  3128  		server.TearDownFn()
  3129  	}
  3130  	return closeFn, config, clientSet, ns
  3131  }
  3132  
  3133  func startJobControllerAndWaitForCaches(tb testing.TB, restConfig *restclient.Config) (context.Context, context.CancelFunc) {
  3134  	tb.Helper()
  3135  	informerSet := informers.NewSharedInformerFactory(clientset.NewForConfigOrDie(restclient.AddUserAgent(restConfig, "job-informers")), 0)
  3136  	jc, ctx, cancel := createJobControllerWithSharedInformers(tb, restConfig, informerSet)
  3137  	informerSet.Start(ctx.Done())
  3138  	go jc.Run(ctx, 1)
  3139  
  3140  	// since this method starts the controller in a separate goroutine
  3141  	// and the tests don't check /readyz there is no way
  3142  	// the tests can tell it is safe to call the server and requests won't be rejected
  3143  	// thus we wait until caches have synced
  3144  	informerSet.WaitForCacheSync(ctx.Done())
  3145  	return ctx, cancel
  3146  }
  3147  
  3148  func resetMetrics() {
  3149  	metrics.TerminatedPodsTrackingFinalizerTotal.Reset()
  3150  	metrics.JobFinishedNum.Reset()
  3151  	metrics.JobPodsFinished.Reset()
  3152  	metrics.PodFailuresHandledByFailurePolicy.Reset()
  3153  	metrics.JobFinishedIndexesTotal.Reset()
  3154  	metrics.JobPodsCreationTotal.Reset()
  3155  }
  3156  
  3157  func createJobControllerWithSharedInformers(tb testing.TB, restConfig *restclient.Config, informerSet informers.SharedInformerFactory) (*jobcontroller.Controller, context.Context, context.CancelFunc) {
  3158  	tb.Helper()
  3159  	clientSet := clientset.NewForConfigOrDie(restclient.AddUserAgent(restConfig, "job-controller"))
  3160  	ctx, cancel := context.WithCancel(context.Background())
  3161  	jc, err := jobcontroller.NewController(ctx, informerSet.Core().V1().Pods(), informerSet.Batch().V1().Jobs(), clientSet)
  3162  	if err != nil {
  3163  		tb.Fatalf("Error creating Job controller: %v", err)
  3164  	}
  3165  	return jc, ctx, cancel
  3166  }
  3167  
  3168  func hasJobTrackingFinalizer(obj metav1.Object) bool {
  3169  	for _, fin := range obj.GetFinalizers() {
  3170  		if fin == batchv1.JobTrackingFinalizer {
  3171  			return true
  3172  		}
  3173  	}
  3174  	return false
  3175  }
  3176  
  3177  func setDuringTest(val *int, newVal int) func() {
  3178  	origVal := *val
  3179  	*val = newVal
  3180  	return func() {
  3181  		*val = origVal
  3182  	}
  3183  }
  3184  
  3185  func setDurationDuringTest(val *time.Duration, newVal time.Duration) func() {
  3186  	origVal := *val
  3187  	*val = newVal
  3188  	return func() {
  3189  		*val = origVal
  3190  	}
  3191  }
  3192  
  3193  func updateJob(ctx context.Context, jobClient typedv1.JobInterface, jobName string, updateFunc func(*batchv1.Job)) (*batchv1.Job, error) {
  3194  	var job *batchv1.Job
  3195  	err := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
  3196  		newJob, err := jobClient.Get(ctx, jobName, metav1.GetOptions{})
  3197  		if err != nil {
  3198  			return err
  3199  		}
  3200  		updateFunc(newJob)
  3201  		job, err = jobClient.Update(ctx, newJob, metav1.UpdateOptions{})
  3202  		return err
  3203  	})
  3204  	return job, err
  3205  }
  3206  
  3207  func waitForPodsToBeActive(ctx context.Context, t *testing.T, jobClient typedv1.JobInterface, podCount int32, jobObj *batchv1.Job) {
  3208  	t.Helper()
  3209  	err := wait.PollUntilContextTimeout(ctx, 5*time.Millisecond, wait.ForeverTestTimeout, true, func(context.Context) (done bool, err error) {
  3210  		job, err := jobClient.Get(ctx, jobObj.Name, metav1.GetOptions{})
  3211  		if err != nil {
  3212  			return false, err
  3213  		}
  3214  		return job.Status.Active == podCount, nil
  3215  	})
  3216  	if err != nil {
  3217  		t.Fatalf("Error waiting for Job pods to become active: %v", err)
  3218  	}
  3219  }
  3220  
  3221  func deletePods(ctx context.Context, t *testing.T, clientSet clientset.Interface, namespace string) {
  3222  	t.Helper()
  3223  	err := clientSet.CoreV1().Pods(namespace).DeleteCollection(ctx,
  3224  		metav1.DeleteOptions{},
  3225  		metav1.ListOptions{
  3226  			Limit: 1000,
  3227  		})
  3228  	if err != nil {
  3229  		t.Fatalf("Failed to cleanup Pods: %v", err)
  3230  	}
  3231  }
  3232  
  3233  func removePodsFinalizer(ctx context.Context, t *testing.T, clientSet clientset.Interface, namespace string) {
  3234  	t.Helper()
  3235  	pods, err := clientSet.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{})
  3236  	if err != nil {
  3237  		t.Fatalf("Failed to list pods: %v", err)
  3238  	}
  3239  	updatePod(ctx, t, clientSet, pods.Items, func(pod *v1.Pod) {
  3240  		for i, finalizer := range pod.Finalizers {
  3241  			if finalizer == "fake.example.com/blockDeletion" {
  3242  				pod.Finalizers = append(pod.Finalizers[:i], pod.Finalizers[i+1:]...)
  3243  			}
  3244  		}
  3245  	})
  3246  }
  3247  
  3248  func updatePod(ctx context.Context, t *testing.T, clientSet clientset.Interface, pods []v1.Pod, updateFunc func(*v1.Pod)) {
  3249  	t.Helper()
  3250  	for _, val := range pods {
  3251  		if err := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
  3252  			newPod, err := clientSet.CoreV1().Pods(val.Namespace).Get(ctx, val.Name, metav1.GetOptions{})
  3253  			if err != nil {
  3254  				return err
  3255  			}
  3256  			updateFunc(newPod)
  3257  			_, err = clientSet.CoreV1().Pods(val.Namespace).Update(ctx, newPod, metav1.UpdateOptions{})
  3258  			return err
  3259  		}); err != nil {
  3260  			t.Fatalf("Failed to update pod %s: %v", val.Name, err)
  3261  		}
  3262  	}
  3263  }
  3264  
  3265  func failTerminatingPods(ctx context.Context, t *testing.T, clientSet clientset.Interface, namespace string) {
  3266  	t.Helper()
  3267  	pods, err := clientSet.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{})
  3268  	if err != nil {
  3269  		t.Fatalf("Failed to list pods: %v", err)
  3270  	}
  3271  	var terminatingPods []v1.Pod
  3272  	for _, pod := range pods.Items {
  3273  		if pod.DeletionTimestamp != nil {
  3274  			pod.Status.Phase = v1.PodFailed
  3275  			terminatingPods = append(terminatingPods, pod)
  3276  		}
  3277  	}
  3278  	_, err = updatePodStatuses(ctx, clientSet, terminatingPods)
  3279  	if err != nil {
  3280  		t.Fatalf("Failed to update pod statuses: %v", err)
  3281  	}
  3282  }