github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/tensorflow/status_test.go (about)

     1  // Copyright 2021 The Kubeflow Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tensorflow
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  
    21  	. "github.com/onsi/ginkgo/v2"
    22  	. "github.com/onsi/gomega"
    23  	corev1 "k8s.io/api/core/v1"
    24  	v1 "k8s.io/api/core/v1"
    25  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    26  	"k8s.io/apimachinery/pkg/types"
    27  	"k8s.io/apimachinery/pkg/util/uuid"
    28  	"sigs.k8s.io/controller-runtime/pkg/client"
    29  
    30  	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
    31  	tftestutil "github.com/kubeflow/training-operator/pkg/controller.v1/tensorflow/testutil"
    32  	"github.com/kubeflow/training-operator/pkg/util"
    33  	"github.com/kubeflow/training-operator/pkg/util/testutil"
    34  )
    35  
    36  var _ = Describe("TFJob controller", func() {
    37  	Context("Test Failed", func() {
    38  		It("should update TFJob with failed status", func() {
    39  			By("creating a TFJob with replicaStatues initialized")
    40  			tfJob := tftestutil.NewTFJob(3, 0)
    41  			initializeReplicaStatuses(&tfJob.Status, kubeflowv1.TFJobReplicaTypeWorker)
    42  
    43  			By("prepare pod")
    44  			refs := []metav1.OwnerReference{
    45  				*reconciler.GenOwnerReference(tfJob),
    46  			}
    47  			pod := tftestutil.NewBasePod("pod", tfJob, refs)
    48  			pod.Status.Phase = v1.PodFailed
    49  
    50  			By("update job replica statuses")
    51  			updateJobReplicaStatuses(&tfJob.Status, kubeflowv1.TFJobReplicaTypeWorker, pod)
    52  			Expect(tfJob.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypeWorker].Failed).Should(Equal(int32(1)))
    53  
    54  			By("update job status")
    55  			Expect(reconciler.UpdateJobStatus(tfJob, tfJob.Spec.TFReplicaSpecs, &tfJob.Status)).To(Succeed())
    56  
    57  			By("finding failed job status")
    58  			found := false
    59  			for _, condition := range tfJob.Status.Conditions {
    60  				if condition.Type == kubeflowv1.JobFailed {
    61  					found = true
    62  				}
    63  			}
    64  			Expect(found).To(BeTrue())
    65  		})
    66  	})
    67  
    68  	Context("Test Status", func() {
    69  		It("should update TFJob with desired status", func() {
    70  			type testCase struct {
    71  				description string
    72  				tfJob       *kubeflowv1.TFJob
    73  
    74  				expectedFailedPS    int32
    75  				expectedSucceededPS int32
    76  				expectedActivePS    int32
    77  
    78  				expectedFailedWorker    int32
    79  				expectedSucceededWorker int32
    80  				expectedActiveWorker    int32
    81  
    82  				expectedFailedChief    int32
    83  				expectedSucceededChief int32
    84  				expectedActiveChief    int32
    85  
    86  				restart          bool
    87  				worker0Completed bool
    88  
    89  				expectedType kubeflowv1.JobConditionType
    90  			}
    91  
    92  			testCases := []testCase{
    93  				{
    94  					description:             "Chief worker is succeeded",
    95  					tfJob:                   tftestutil.NewTFJobWithChief(1, 0),
    96  					expectedFailedPS:        0,
    97  					expectedSucceededPS:     0,
    98  					expectedActivePS:        0,
    99  					expectedFailedWorker:    0,
   100  					expectedSucceededWorker: 1,
   101  					expectedActiveWorker:    0,
   102  					expectedFailedChief:     0,
   103  					expectedSucceededChief:  1,
   104  					expectedActiveChief:     0,
   105  					restart:                 false,
   106  					worker0Completed:        false,
   107  					expectedType:            kubeflowv1.JobSucceeded,
   108  				},
   109  				{
   110  					description:             "Chief worker is running",
   111  					tfJob:                   tftestutil.NewTFJobWithChief(1, 0),
   112  					expectedFailedPS:        0,
   113  					expectedSucceededPS:     0,
   114  					expectedActivePS:        0,
   115  					expectedFailedWorker:    0,
   116  					expectedSucceededWorker: 0,
   117  					expectedActiveWorker:    0,
   118  					expectedFailedChief:     0,
   119  					expectedSucceededChief:  0,
   120  					expectedActiveChief:     1,
   121  					restart:                 false,
   122  					worker0Completed:        false,
   123  					expectedType:            kubeflowv1.JobRunning,
   124  				},
   125  				{
   126  					description:             "Chief worker is failed",
   127  					tfJob:                   tftestutil.NewTFJobWithChief(1, 0),
   128  					expectedFailedPS:        0,
   129  					expectedSucceededPS:     0,
   130  					expectedActivePS:        0,
   131  					expectedFailedWorker:    0,
   132  					expectedSucceededWorker: 0,
   133  					expectedActiveWorker:    0,
   134  					expectedFailedChief:     1,
   135  					expectedSucceededChief:  0,
   136  					expectedActiveChief:     0,
   137  					restart:                 false,
   138  					worker0Completed:        false,
   139  					expectedType:            kubeflowv1.JobFailed,
   140  				},
   141  				{
   142  					description:             "(No chief worker) Worker is failed",
   143  					tfJob:                   tftestutil.NewTFJob(1, 0),
   144  					expectedFailedPS:        0,
   145  					expectedSucceededPS:     0,
   146  					expectedActivePS:        0,
   147  					expectedFailedWorker:    1,
   148  					expectedSucceededWorker: 0,
   149  					expectedActiveWorker:    0,
   150  					expectedFailedChief:     0,
   151  					expectedSucceededChief:  0,
   152  					expectedActiveChief:     0,
   153  					restart:                 false,
   154  					worker0Completed:        false,
   155  					expectedType:            kubeflowv1.JobFailed,
   156  				},
   157  				{
   158  					description:             "(No chief worker) Worker is succeeded",
   159  					tfJob:                   tftestutil.NewTFJob(1, 0),
   160  					expectedFailedPS:        0,
   161  					expectedSucceededPS:     0,
   162  					expectedActivePS:        0,
   163  					expectedFailedWorker:    0,
   164  					expectedSucceededWorker: 1,
   165  					expectedActiveWorker:    0,
   166  					expectedFailedChief:     0,
   167  					expectedSucceededChief:  0,
   168  					expectedActiveChief:     0,
   169  					restart:                 false,
   170  					worker0Completed:        false,
   171  					expectedType:            kubeflowv1.JobSucceeded,
   172  				},
   173  				{
   174  					description:             "(No chief worker) Worker is running",
   175  					tfJob:                   tftestutil.NewTFJob(1, 0),
   176  					expectedFailedPS:        0,
   177  					expectedSucceededPS:     0,
   178  					expectedActivePS:        0,
   179  					expectedFailedWorker:    0,
   180  					expectedSucceededWorker: 0,
   181  					expectedActiveWorker:    1,
   182  					expectedFailedChief:     0,
   183  					expectedSucceededChief:  0,
   184  					expectedActiveChief:     0,
   185  					restart:                 false,
   186  					worker0Completed:        false,
   187  					expectedType:            kubeflowv1.JobRunning,
   188  				},
   189  				{
   190  					description:             "(No chief worker) 2 workers are succeeded, 2 workers are active",
   191  					tfJob:                   tftestutil.NewTFJob(4, 2),
   192  					expectedFailedPS:        0,
   193  					expectedSucceededPS:     0,
   194  					expectedActivePS:        2,
   195  					expectedFailedWorker:    0,
   196  					expectedSucceededWorker: 2,
   197  					expectedActiveWorker:    2,
   198  					expectedFailedChief:     0,
   199  					expectedSucceededChief:  0,
   200  					expectedActiveChief:     0,
   201  					restart:                 false,
   202  					worker0Completed:        false,
   203  					expectedType:            kubeflowv1.JobRunning,
   204  				},
   205  				{
   206  					description:             "(No chief worker) 2 workers are running, 2 workers are failed",
   207  					tfJob:                   tftestutil.NewTFJob(4, 2),
   208  					expectedFailedPS:        0,
   209  					expectedSucceededPS:     0,
   210  					expectedActivePS:        2,
   211  					expectedFailedWorker:    2,
   212  					expectedSucceededWorker: 0,
   213  					expectedActiveWorker:    2,
   214  					expectedFailedChief:     0,
   215  					expectedSucceededChief:  0,
   216  					expectedActiveChief:     0,
   217  					restart:                 false,
   218  					worker0Completed:        false,
   219  					expectedType:            kubeflowv1.JobFailed,
   220  				},
   221  				{
   222  					description:             "(No chief worker) 2 workers are succeeded, 2 workers are failed",
   223  					tfJob:                   tftestutil.NewTFJob(4, 2),
   224  					expectedFailedPS:        0,
   225  					expectedSucceededPS:     0,
   226  					expectedActivePS:        2,
   227  					expectedFailedWorker:    2,
   228  					expectedSucceededWorker: 2,
   229  					expectedActiveWorker:    0,
   230  					expectedFailedChief:     0,
   231  					expectedSucceededChief:  0,
   232  					expectedActiveChief:     0,
   233  					restart:                 false,
   234  					worker0Completed:        false,
   235  					expectedType:            kubeflowv1.JobFailed,
   236  				},
   237  				{
   238  					description:             "(No chief worker) worker-0 are succeeded, 3 workers are active",
   239  					tfJob:                   tftestutil.NewTFJob(4, 2),
   240  					expectedFailedPS:        0,
   241  					expectedSucceededPS:     0,
   242  					expectedActivePS:        2,
   243  					expectedFailedWorker:    0,
   244  					expectedSucceededWorker: 1,
   245  					expectedActiveWorker:    3,
   246  					expectedFailedChief:     0,
   247  					expectedSucceededChief:  0,
   248  					expectedActiveChief:     0,
   249  					restart:                 false,
   250  					worker0Completed:        true,
   251  					expectedType:            kubeflowv1.JobSucceeded,
   252  				},
   253  				{
   254  					description:             "(No chief worker, successPolicy: AllWorkers) worker-0 are succeeded, 3 workers are active",
   255  					tfJob:                   tftestutil.NewTFJobWithSuccessPolicy(4, 0, kubeflowv1.SuccessPolicyAllWorkers),
   256  					expectedFailedPS:        0,
   257  					expectedSucceededPS:     0,
   258  					expectedActivePS:        0,
   259  					expectedFailedWorker:    0,
   260  					expectedSucceededWorker: 1,
   261  					expectedActiveWorker:    3,
   262  					expectedFailedChief:     0,
   263  					expectedSucceededChief:  0,
   264  					expectedActiveChief:     0,
   265  					restart:                 false,
   266  					worker0Completed:        true,
   267  					expectedType:            kubeflowv1.JobRunning,
   268  				},
   269  				{
   270  					description:             "(No chief worker, successPolicy: AllWorkers) 4 workers are succeeded",
   271  					tfJob:                   tftestutil.NewTFJobWithSuccessPolicy(4, 0, kubeflowv1.SuccessPolicyAllWorkers),
   272  					expectedFailedPS:        0,
   273  					expectedSucceededPS:     0,
   274  					expectedActivePS:        0,
   275  					expectedFailedWorker:    0,
   276  					expectedSucceededWorker: 4,
   277  					expectedActiveWorker:    0,
   278  					expectedFailedChief:     0,
   279  					expectedSucceededChief:  0,
   280  					expectedActiveChief:     0,
   281  					restart:                 false,
   282  					worker0Completed:        true,
   283  					expectedType:            kubeflowv1.JobSucceeded,
   284  				},
   285  				{
   286  					description:             "(No chief worker, successPolicy: AllWorkers) worker-0 is succeeded, 2 workers are running, 1 worker is failed",
   287  					tfJob:                   tftestutil.NewTFJobWithSuccessPolicy(4, 0, kubeflowv1.SuccessPolicyAllWorkers),
   288  					expectedFailedPS:        0,
   289  					expectedSucceededPS:     0,
   290  					expectedActivePS:        0,
   291  					expectedFailedWorker:    1,
   292  					expectedSucceededWorker: 1,
   293  					expectedActiveWorker:    2,
   294  					expectedFailedChief:     0,
   295  					expectedSucceededChief:  0,
   296  					expectedActiveChief:     0,
   297  					restart:                 false,
   298  					worker0Completed:        true,
   299  					expectedType:            kubeflowv1.JobFailed,
   300  				},
   301  				{
   302  					description:             "Chief is running, workers are failed",
   303  					tfJob:                   tftestutil.NewTFJobWithChief(4, 2),
   304  					expectedFailedPS:        0,
   305  					expectedSucceededPS:     0,
   306  					expectedActivePS:        2,
   307  					expectedFailedWorker:    4,
   308  					expectedSucceededWorker: 0,
   309  					expectedActiveWorker:    0,
   310  					expectedFailedChief:     0,
   311  					expectedSucceededChief:  0,
   312  					expectedActiveChief:     1,
   313  					restart:                 false,
   314  					worker0Completed:        false,
   315  					expectedType:            kubeflowv1.JobRunning,
   316  				},
   317  				{
   318  					description:             "Chief is running, workers are succeeded",
   319  					tfJob:                   tftestutil.NewTFJobWithChief(4, 2),
   320  					expectedFailedPS:        0,
   321  					expectedSucceededPS:     0,
   322  					expectedActivePS:        2,
   323  					expectedFailedWorker:    0,
   324  					expectedSucceededWorker: 4,
   325  					expectedActiveWorker:    0,
   326  					expectedFailedChief:     0,
   327  					expectedSucceededChief:  0,
   328  					expectedActiveChief:     1,
   329  					restart:                 false,
   330  					worker0Completed:        false,
   331  					expectedType:            kubeflowv1.JobRunning,
   332  				},
   333  				{
   334  					description:             "Chief is running, a PS is failed",
   335  					tfJob:                   tftestutil.NewTFJobWithChief(4, 2),
   336  					expectedFailedPS:        1,
   337  					expectedSucceededPS:     0,
   338  					expectedActivePS:        1,
   339  					expectedFailedWorker:    0,
   340  					expectedSucceededWorker: 4,
   341  					expectedActiveWorker:    0,
   342  					expectedFailedChief:     0,
   343  					expectedSucceededChief:  0,
   344  					expectedActiveChief:     1,
   345  					restart:                 false,
   346  					worker0Completed:        false,
   347  					expectedType:            kubeflowv1.JobFailed,
   348  				},
   349  				{
   350  					description:             "Chief is failed, workers are succeeded",
   351  					tfJob:                   tftestutil.NewTFJobWithChief(4, 2),
   352  					expectedFailedPS:        0,
   353  					expectedSucceededPS:     0,
   354  					expectedActivePS:        2,
   355  					expectedFailedWorker:    0,
   356  					expectedSucceededWorker: 4,
   357  					expectedActiveWorker:    0,
   358  					expectedFailedChief:     1,
   359  					expectedSucceededChief:  0,
   360  					expectedActiveChief:     0,
   361  					restart:                 false,
   362  					worker0Completed:        false,
   363  					expectedType:            kubeflowv1.JobFailed,
   364  				},
   365  				{
   366  					description:             "Chief is succeeded, workers are failed",
   367  					tfJob:                   tftestutil.NewTFJobWithChief(4, 2),
   368  					expectedFailedPS:        0,
   369  					expectedSucceededPS:     0,
   370  					expectedActivePS:        2,
   371  					expectedFailedWorker:    4,
   372  					expectedSucceededWorker: 0,
   373  					expectedActiveWorker:    0,
   374  					expectedFailedChief:     0,
   375  					expectedSucceededChief:  1,
   376  					expectedActiveChief:     0,
   377  					restart:                 false,
   378  					worker0Completed:        false,
   379  					expectedType:            kubeflowv1.JobSucceeded,
   380  				},
   381  				{
   382  					description:             "Chief is failed and restarting",
   383  					tfJob:                   tftestutil.NewTFJobWithChief(4, 2),
   384  					expectedFailedPS:        0,
   385  					expectedSucceededPS:     0,
   386  					expectedActivePS:        2,
   387  					expectedFailedWorker:    4,
   388  					expectedSucceededWorker: 0,
   389  					expectedActiveWorker:    0,
   390  					expectedFailedChief:     1,
   391  					expectedSucceededChief:  0,
   392  					expectedActiveChief:     0,
   393  					restart:                 true,
   394  					worker0Completed:        false,
   395  					expectedType:            kubeflowv1.JobRestarting,
   396  				},
   397  			}
   398  
   399  			jobNameTemplate := "test-status-%d"
   400  			for i, c := range testCases {
   401  				reconciler.Log.Info("testing case", "description", c.description)
   402  				c.tfJob.SetName(fmt.Sprintf(jobNameTemplate, i))
   403  				c.tfJob.SetUID(uuid.NewUUID())
   404  
   405  				initializeReplicaStatuses(&c.tfJob.Status, kubeflowv1.TFJobReplicaTypeWorker)
   406  				initializeReplicaStatuses(&c.tfJob.Status, kubeflowv1.TFJobReplicaTypeChief)
   407  				initializeReplicaStatuses(&c.tfJob.Status, kubeflowv1.TFJobReplicaTypePS)
   408  
   409  				setStatusForTest(c.tfJob, kubeflowv1.TFJobReplicaTypePS, c.expectedFailedPS, c.expectedSucceededPS, c.expectedActivePS, c.restart, c.worker0Completed, testK8sClient)
   410  				setStatusForTest(c.tfJob, kubeflowv1.TFJobReplicaTypeWorker, c.expectedFailedWorker, c.expectedSucceededWorker, c.expectedActiveWorker, c.restart, c.worker0Completed, testK8sClient)
   411  				setStatusForTest(c.tfJob, kubeflowv1.TFJobReplicaTypeChief, c.expectedFailedChief, c.expectedSucceededChief, c.expectedActiveChief, c.restart, c.worker0Completed, testK8sClient)
   412  
   413  				// Adding this section to make sure all pods are created and cached
   414  				Eventually(func() error {
   415  					podList := &corev1.PodList{}
   416  					basicLabels := reconciler.GenLabels(c.tfJob.GetName())
   417  					selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{
   418  						MatchLabels: basicLabels,
   419  					})
   420  					if err != nil {
   421  						return err
   422  					}
   423  					listOpt := client.MatchingLabelsSelector{
   424  						Selector: selector,
   425  					}
   426  					err = testK8sClient.List(context.Background(), podList, listOpt)
   427  					if err != nil {
   428  						return nil
   429  					}
   430  					totalExpectedPodCount := c.expectedFailedPS + c.expectedSucceededPS + c.expectedActivePS +
   431  						c.expectedFailedWorker + c.expectedSucceededWorker + c.expectedActiveWorker +
   432  						c.expectedFailedChief + c.expectedSucceededChief + c.expectedActiveChief
   433  					if len(podList.Items) != int(totalExpectedPodCount) {
   434  						return fmt.Errorf("pod number (%d) for %s not match for expected pod number %d",
   435  							len(podList.Items), c.tfJob.GetName(), totalExpectedPodCount)
   436  					}
   437  					return nil
   438  				}, testutil.Timeout, testutil.Interval).Should(BeNil())
   439  
   440  				_ = reconciler.ReconcileJobs(c.tfJob, c.tfJob.Spec.TFReplicaSpecs, c.tfJob.Status, &c.tfJob.Spec.RunPolicy)
   441  
   442  				Expect(filterOutConditionTest(c.tfJob.Status)).Should(Succeed())
   443  
   444  				reconciler.Log.Info("checking status", "tfJob.Status", c.tfJob.Status)
   445  				found := false
   446  				for _, condition := range c.tfJob.Status.Conditions {
   447  					if condition.Type == c.expectedType {
   448  						found = true
   449  					}
   450  				}
   451  				Expect(found).To(BeTrue())
   452  				reconciler.Log.Info("passed!",
   453  					"job name", c.tfJob.GetName(), "job description", c.description)
   454  			}
   455  		})
   456  	})
   457  })
   458  
   459  func setStatusForTest(tfJob *kubeflowv1.TFJob, rtype kubeflowv1.ReplicaType, failed, succeeded, active int32, restart bool, worker0Completed bool, client client.Client) {
   460  	if restart == true {
   461  		tfJob.Spec.TFReplicaSpecs[rtype].RestartPolicy = kubeflowv1.RestartPolicyExitCode
   462  	}
   463  
   464  	basicLabels := reconciler.GenLabels(tfJob.GetName())
   465  	ctx := context.Background()
   466  
   467  	Expect(rtype).Should(BeElementOf([]kubeflowv1.ReplicaType{
   468  		kubeflowv1.TFJobReplicaTypeWorker,
   469  		kubeflowv1.TFJobReplicaTypePS,
   470  		kubeflowv1.TFJobReplicaTypeChief,
   471  	}))
   472  
   473  	refs := []metav1.OwnerReference{
   474  		*reconciler.GenOwnerReference(tfJob),
   475  	}
   476  
   477  	var i int32
   478  	index := 0
   479  	for i = 0; i < succeeded; i++ {
   480  		pod := tftestutil.NewPod(tfJob, rtype, index, refs)
   481  		for k, v := range basicLabels {
   482  			pod.Labels[k] = v
   483  		}
   484  		po := &corev1.Pod{}
   485  		Expect(client.Create(ctx, pod)).Should(Succeed())
   486  
   487  		key := genKeyFromJob(pod)
   488  		Eventually(func() error {
   489  			po = &corev1.Pod{}
   490  			if err := client.Get(ctx, key, po); err != nil {
   491  				return err
   492  			}
   493  
   494  			po.Status.Phase = corev1.PodSucceeded
   495  			if worker0Completed == true && rtype == kubeflowv1.TFJobReplicaTypeWorker && index == 0 {
   496  				po.Status.ContainerStatuses = []corev1.ContainerStatus{
   497  					{
   498  						Name: reconciler.GetDefaultContainerName(),
   499  						State: corev1.ContainerState{
   500  							Terminated: &corev1.ContainerStateTerminated{
   501  								ExitCode: int32(0), // exit with 0
   502  							},
   503  						},
   504  					},
   505  				}
   506  			}
   507  
   508  			return client.Status().Update(ctx, po)
   509  		}, testutil.Timeout, testutil.Interval).Should(BeNil())
   510  
   511  		updateJobReplicaStatuses(&tfJob.Status, rtype, po)
   512  
   513  		index++
   514  	}
   515  
   516  	for i = 0; i < failed; i++ {
   517  		pod := tftestutil.NewPod(tfJob, rtype, index, refs)
   518  		for k, v := range basicLabels {
   519  			pod.Labels[k] = v
   520  		}
   521  		po := &corev1.Pod{}
   522  		Expect(client.Create(ctx, pod)).Should(Succeed())
   523  
   524  		key := genKeyFromJob(pod)
   525  		Eventually(func() error {
   526  			po = &corev1.Pod{}
   527  			if err := client.Get(ctx, key, po); err != nil {
   528  				return err
   529  			}
   530  
   531  			po.Status.Phase = corev1.PodFailed
   532  			if restart == true {
   533  				if po.Status.ContainerStatuses == nil {
   534  					po.Status.ContainerStatuses = []corev1.ContainerStatus{
   535  						{
   536  							Name: reconciler.GetDefaultContainerName(),
   537  							State: corev1.ContainerState{
   538  								Terminated: &corev1.ContainerStateTerminated{
   539  									ExitCode: int32(130), // 130 is a retryable code
   540  								},
   541  							},
   542  						},
   543  					}
   544  				}
   545  			}
   546  
   547  			return client.Status().Update(ctx, po)
   548  		}, testutil.Timeout, testutil.Interval).Should(BeNil())
   549  
   550  		updateJobReplicaStatuses(&tfJob.Status, rtype, po)
   551  		index++
   552  	}
   553  
   554  	for i = 0; i < active; i++ {
   555  		pod := tftestutil.NewPod(tfJob, rtype, index, refs)
   556  		for k, v := range basicLabels {
   557  			pod.Labels[k] = v
   558  		}
   559  		po := &corev1.Pod{}
   560  		Expect(client.Create(ctx, pod)).Should(Succeed())
   561  
   562  		key := genKeyFromJob(pod)
   563  		Eventually(func() error {
   564  			po = &corev1.Pod{}
   565  			if err := client.Get(ctx, key, po); err != nil {
   566  				return err
   567  			}
   568  
   569  			po.Status.Phase = corev1.PodRunning
   570  
   571  			return client.Status().Update(ctx, po)
   572  		}, testutil.Timeout, testutil.Interval).Should(BeNil())
   573  
   574  		updateJobReplicaStatuses(&tfJob.Status, rtype, po)
   575  		index++
   576  	}
   577  }
   578  
   579  func genKeyFromJob(job client.Object) types.NamespacedName {
   580  	ns := metav1.NamespaceDefault
   581  	if job.GetNamespace() != "" {
   582  		ns = job.GetNamespace()
   583  	}
   584  	return types.NamespacedName{
   585  		Namespace: ns,
   586  		Name:      job.GetName(),
   587  	}
   588  }
   589  
   590  func filterOutConditionTest(status kubeflowv1.JobStatus) error {
   591  	flag := util.IsFailed(status) || util.IsSucceeded(status)
   592  	for _, condition := range status.Conditions {
   593  		if flag && condition.Type == kubeflowv1.JobRunning && condition.Status == corev1.ConditionTrue {
   594  			return fmt.Errorf("error condition status when succeeded or failed")
   595  		}
   596  	}
   597  	return nil
   598  }