github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/tensorflow/job_test.go (about)

     1  // Copyright 2021 The Kubeflow Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tensorflow
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"strconv"
    21  	"time"
    22  
    23  	"github.com/google/go-cmp/cmp/cmpopts"
    24  	. "github.com/onsi/ginkgo/v2"
    25  	. "github.com/onsi/gomega"
    26  	corev1 "k8s.io/api/core/v1"
    27  	"k8s.io/apimachinery/pkg/api/errors"
    28  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    29  	"k8s.io/apimachinery/pkg/types"
    30  	"k8s.io/apimachinery/pkg/util/intstr"
    31  	"k8s.io/apimachinery/pkg/util/uuid"
    32  	"k8s.io/utils/pointer"
    33  	"sigs.k8s.io/controller-runtime/pkg/client"
    34  
    35  	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
    36  	"github.com/kubeflow/training-operator/pkg/controller.v1/common"
    37  	tftestutil "github.com/kubeflow/training-operator/pkg/controller.v1/tensorflow/testutil"
    38  	commonutil "github.com/kubeflow/training-operator/pkg/util"
    39  	"github.com/kubeflow/training-operator/pkg/util/testutil"
    40  )
    41  
    42  var _ = Describe("TFJob controller", func() {
    43  	Context("Test Add TFJob", func() {
    44  		It("should get the exact TFJob", func() {
    45  			By("submitting an TFJob")
    46  
    47  			testJobName := "test-case-12"
    48  			testNamespace := metav1.NamespaceDefault
    49  
    50  			decoyJobName := "decoy-case-34"
    51  
    52  			ctx := context.Background()
    53  
    54  			tfJob := tftestutil.NewTFJob(1, 0)
    55  			tfJob.SetName(testJobName)
    56  			tfJob.SetNamespace(testNamespace)
    57  
    58  			decoyJob := tftestutil.NewTFJob(2, 3)
    59  			decoyJob.SetName(decoyJobName)
    60  			decoyJob.SetNamespace(testNamespace)
    61  
    62  			Expect(testK8sClient.Create(ctx, tfJob)).Should(Succeed())
    63  			Expect(testK8sClient.Create(ctx, decoyJob)).Should(Succeed())
    64  
    65  			key := types.NamespacedName{
    66  				Namespace: testNamespace,
    67  				Name:      testJobName,
    68  			}
    69  			Eventually(func() error {
    70  				job := &kubeflowv1.TFJob{}
    71  				return reconciler.Get(ctx, key, job)
    72  			}, testutil.Timeout, testutil.Interval).Should(BeNil())
    73  
    74  			Expect(testK8sClient.Delete(ctx, tfJob)).Should(Succeed())
    75  			Expect(testK8sClient.Delete(ctx, decoyJob)).Should(Succeed())
    76  		})
    77  	})
    78  
    79  	Context("Test Copy Labels and Annotation", func() {
    80  		It("should copy labels and annotation from the spec to generated Pods", func() {
    81  			ctx := context.Background()
    82  			testAnnotationKey := "annotation1"
    83  			testAnnotationVal := "1"
    84  			testLabelKey := "label1"
    85  			testLabelVal := "1"
    86  
    87  			testJobName := "test-copy-labels-anno"
    88  			tfjob := tftestutil.NewTFJob(1, 0)
    89  			tfjob.SetName(testJobName)
    90  			annotations := map[string]string{
    91  				testAnnotationKey: testAnnotationVal,
    92  			}
    93  			labels := map[string]string{
    94  				testLabelKey: testLabelVal,
    95  			}
    96  			tfjob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeWorker].Template.Labels = labels
    97  			tfjob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeWorker].Template.Annotations = annotations
    98  
    99  			By("submitting an TFJob with specific labels and annotations")
   100  			Expect(testK8sClient.Create(ctx, tfjob)).Should(Succeed())
   101  
   102  			Eventually(func() error {
   103  				pod := &corev1.Pod{}
   104  				key := types.NamespacedName{
   105  					Namespace: metav1.NamespaceDefault,
   106  					Name:      common.GenGeneralName(tfjob.Name, "worker", "0"),
   107  				}
   108  				err := testK8sClient.Get(ctx, key, pod)
   109  				if err != nil {
   110  					return err
   111  				}
   112  
   113  				if pod.Annotations == nil {
   114  					return fmt.Errorf("annotation of %s/%s is nil", pod.GetNamespace(), pod.GetName())
   115  				}
   116  				if val, exist := pod.Annotations[testAnnotationKey]; exist {
   117  					if val != testAnnotationVal {
   118  						return fmt.Errorf("annotation of %s not match with %s", testAnnotationKey, testAnnotationVal)
   119  					}
   120  				} else {
   121  					return fmt.Errorf("annotation %s not found", testAnnotationKey)
   122  				}
   123  
   124  				if pod.Labels == nil {
   125  					return fmt.Errorf("label of %s/%s is nil", pod.GetNamespace(), pod.GetName())
   126  				}
   127  				if val, exist := pod.Labels[testLabelKey]; exist {
   128  					if val != testLabelVal {
   129  						return fmt.Errorf("annotation of %s not match with %s", testLabelKey, testLabelVal)
   130  					}
   131  				} else {
   132  					return fmt.Errorf("label %s not found", testLabelKey)
   133  				}
   134  
   135  				return nil
   136  			}, testutil.Timeout, testutil.Interval).Should(BeNil())
   137  		})
   138  	})
   139  
   140  	Context("Test Delete Pods and Services", func() {
   141  		It("it should clean associated Pods and Services according to clean policy", func() {
   142  			type testCase struct {
   143  				description string
   144  				tfJob       *kubeflowv1.TFJob
   145  
   146  				pendingWorkerPods   int32
   147  				activeWorkerPods    int32
   148  				succeededWorkerPods int32
   149  				failedWorkerPods    int32
   150  
   151  				pendingPSPods   int32
   152  				activePSPods    int32
   153  				succeededPSPods int32
   154  				failedPSPods    int32
   155  
   156  				activeWorkerServices int32
   157  				activePSServices     int32
   158  
   159  				expectedPodRemaining int
   160  			}
   161  
   162  			testCases := []testCase{
   163  				{
   164  					description: "4 workers and 2 ps is running, policy is all",
   165  					tfJob:       tftestutil.NewTFJobWithCleanPolicy(0, 4, 2, kubeflowv1.CleanPodPolicyAll),
   166  
   167  					pendingWorkerPods:   0,
   168  					activeWorkerPods:    4,
   169  					succeededWorkerPods: 0,
   170  					failedWorkerPods:    0,
   171  
   172  					pendingPSPods:   0,
   173  					activePSPods:    2,
   174  					succeededPSPods: 0,
   175  					failedPSPods:    0,
   176  
   177  					activeWorkerServices: 4,
   178  					activePSServices:     2,
   179  
   180  					expectedPodRemaining: 0,
   181  				},
   182  				{
   183  					description: "4 workers and 2 ps is running, policy is running",
   184  					tfJob:       tftestutil.NewTFJobWithCleanPolicy(0, 4, 2, kubeflowv1.CleanPodPolicyRunning),
   185  
   186  					pendingWorkerPods:   0,
   187  					activeWorkerPods:    4,
   188  					succeededWorkerPods: 0,
   189  					failedWorkerPods:    0,
   190  
   191  					pendingPSPods:   0,
   192  					activePSPods:    2,
   193  					succeededPSPods: 0,
   194  					failedPSPods:    0,
   195  
   196  					activeWorkerServices: 4,
   197  					activePSServices:     2,
   198  
   199  					expectedPodRemaining: 0,
   200  				},
   201  				{
   202  					description: "4 workers and 2 ps is succeeded, policy is running",
   203  					tfJob:       tftestutil.NewTFJobWithCleanPolicy(0, 4, 2, kubeflowv1.CleanPodPolicyRunning),
   204  
   205  					pendingWorkerPods:   0,
   206  					activeWorkerPods:    0,
   207  					succeededWorkerPods: 4,
   208  					failedWorkerPods:    0,
   209  
   210  					pendingPSPods:   0,
   211  					activePSPods:    0,
   212  					succeededPSPods: 2,
   213  					failedPSPods:    0,
   214  
   215  					activeWorkerServices: 4,
   216  					activePSServices:     2,
   217  
   218  					expectedPodRemaining: 6,
   219  				},
   220  				{
   221  					description: "4 workers and 2 ps is succeeded, policy is None",
   222  					tfJob:       tftestutil.NewTFJobWithCleanPolicy(0, 4, 2, kubeflowv1.CleanPodPolicyNone),
   223  
   224  					pendingWorkerPods:   0,
   225  					activeWorkerPods:    0,
   226  					succeededWorkerPods: 4,
   227  					failedWorkerPods:    0,
   228  
   229  					pendingPSPods:   0,
   230  					activePSPods:    0,
   231  					succeededPSPods: 2,
   232  					failedPSPods:    0,
   233  
   234  					activeWorkerServices: 4,
   235  					activePSServices:     2,
   236  
   237  					expectedPodRemaining: 6,
   238  				},
   239  			}
   240  
   241  			jobNameTemplate := "test-del-pod-svc-%d"
   242  			for idx, tc := range testCases {
   243  				By(fmt.Sprintf("preparing cases %s", tc.description))
   244  				ctx := context.Background()
   245  				tc.tfJob.SetName(fmt.Sprintf(jobNameTemplate, idx))
   246  				tc.tfJob.SetUID(uuid.NewUUID())
   247  				commonutil.UpdateJobConditions(&tc.tfJob.Status, kubeflowv1.JobSucceeded, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobSucceededReason), "")
   248  
   249  				refs := []metav1.OwnerReference{
   250  					*reconciler.GenOwnerReference(tc.tfJob),
   251  				}
   252  
   253  				basicLabels := reconciler.GenLabels(tc.tfJob.GetName())
   254  				selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{
   255  					MatchLabels: basicLabels,
   256  				})
   257  				Expect(err).Should(BeNil())
   258  				listOpt := client.MatchingLabelsSelector{
   259  					Selector: selector,
   260  				}
   261  
   262  				By("creating Services and Pods with designed phases")
   263  				tftestutil.SetPodsStatuses(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypeWorker,
   264  					tc.pendingWorkerPods, tc.activeWorkerPods, tc.succeededWorkerPods, tc.failedWorkerPods,
   265  					nil, refs, basicLabels)
   266  				tftestutil.SetPodsStatuses(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypePS,
   267  					tc.pendingPSPods, tc.activePSPods, tc.succeededPSPods, tc.failedPSPods,
   268  					nil, refs, basicLabels)
   269  
   270  				tftestutil.SetServices(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypeWorker, tc.activeWorkerServices, refs, basicLabels)
   271  				tftestutil.SetServices(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypePS, tc.activePSServices, refs, basicLabels)
   272  
   273  				podList := &corev1.PodList{}
   274  				Expect(testK8sClient.List(ctx, podList, listOpt)).Should(Succeed())
   275  				Expect(len(podList.Items)).To(Equal(
   276  					int(tc.pendingPSPods + tc.activePSPods + tc.failedPSPods + tc.succeededPSPods +
   277  						tc.pendingWorkerPods + tc.activeWorkerPods + tc.failedWorkerPods + tc.succeededWorkerPods)))
   278  
   279  				By("calling ReconcileJob")
   280  				_ = reconciler.ReconcileJobs(tc.tfJob, tc.tfJob.Spec.TFReplicaSpecs, tc.tfJob.Status, &tc.tfJob.Spec.RunPolicy)
   281  
   282  				podList = &corev1.PodList{}
   283  				Expect(testK8sClient.List(ctx, podList, listOpt, client.InNamespace(tc.tfJob.GetNamespace()))).Should(Succeed())
   284  				podRemainingCount := len(podList.Items)
   285  				Expect(podRemainingCount).To(Equal(tc.expectedPodRemaining))
   286  
   287  				svcList := &corev1.ServiceList{}
   288  				Expect(testK8sClient.List(ctx, svcList, listOpt)).Should(Succeed())
   289  				svcRemainingCount := len(svcList.Items)
   290  				Expect(svcRemainingCount).To(Equal(tc.expectedPodRemaining))
   291  			}
   292  		})
   293  	})
   294  
   295  	Context("Test Active Deadline Seconds", func() {
   296  		It("clean desired Pods and Services according to TFJob config", func() {
   297  			type testCase struct {
   298  				description string
   299  				tfJob       *kubeflowv1.TFJob
   300  
   301  				pendingWorkerPods   int32
   302  				activeWorkerPods    int32
   303  				succeededWorkerPods int32
   304  				failedWorkerPods    int32
   305  
   306  				pendingPSPods   int32
   307  				activePSPods    int32
   308  				succeededPSPods int32
   309  				failedPSPods    int32
   310  
   311  				activeWorkerServices int32
   312  				activePSServices     int32
   313  
   314  				expectedPodRemaining int
   315  			}
   316  
   317  			ads2 := int64(2)
   318  			adsTest2 := &ads2
   319  			testCases := []testCase{
   320  				{
   321  					description: "4 workers and 2 ps is running, ActiveDeadlineSeconds unset",
   322  					tfJob:       tftestutil.NewTFJobWithActiveDeadlineSeconds(0, 4, 2, nil),
   323  
   324  					pendingWorkerPods:   0,
   325  					activeWorkerPods:    4,
   326  					succeededWorkerPods: 0,
   327  					failedWorkerPods:    0,
   328  
   329  					pendingPSPods:   0,
   330  					activePSPods:    2,
   331  					succeededPSPods: 0,
   332  					failedPSPods:    0,
   333  
   334  					activeWorkerServices: 4,
   335  					activePSServices:     2,
   336  
   337  					expectedPodRemaining: 6,
   338  				},
   339  				{
   340  					description: "4 workers and 2 ps is running, ActiveDeadlineSeconds is 2",
   341  					tfJob:       tftestutil.NewTFJobWithActiveDeadlineSeconds(0, 4, 2, adsTest2),
   342  
   343  					pendingWorkerPods:   0,
   344  					activeWorkerPods:    4,
   345  					succeededWorkerPods: 0,
   346  					failedWorkerPods:    0,
   347  
   348  					pendingPSPods:   0,
   349  					activePSPods:    2,
   350  					succeededPSPods: 0,
   351  					failedPSPods:    0,
   352  
   353  					activeWorkerServices: 4,
   354  					activePSServices:     2,
   355  
   356  					expectedPodRemaining: 0,
   357  				},
   358  			}
   359  			jobNameTemplate := "test-ads-%d"
   360  			for idx, tc := range testCases {
   361  				By(fmt.Sprintf("preparing cases %s", tc.description))
   362  				ctx := context.Background()
   363  				tc.tfJob.SetName(fmt.Sprintf(jobNameTemplate, idx))
   364  				tc.tfJob.SetUID(uuid.NewUUID())
   365  
   366  				refs := []metav1.OwnerReference{
   367  					*reconciler.GenOwnerReference(tc.tfJob),
   368  				}
   369  
   370  				basicLabels := reconciler.GenLabels(tc.tfJob.GetName())
   371  				selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{
   372  					MatchLabels: basicLabels,
   373  				})
   374  				Expect(err).Should(BeNil())
   375  				listOpt := client.MatchingLabelsSelector{
   376  					Selector: selector,
   377  				}
   378  
   379  				By("creating Services and Pods with designed phases")
   380  				tftestutil.SetPodsStatuses(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypeWorker,
   381  					tc.pendingWorkerPods, tc.activeWorkerPods, tc.succeededWorkerPods, tc.failedWorkerPods,
   382  					nil, refs, basicLabels)
   383  				tftestutil.SetPodsStatuses(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypePS,
   384  					tc.pendingPSPods, tc.activePSPods, tc.succeededPSPods, tc.failedPSPods,
   385  					nil, refs, basicLabels)
   386  
   387  				tftestutil.SetServices(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypeWorker, tc.activeWorkerServices, refs, basicLabels)
   388  				tftestutil.SetServices(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypePS, tc.activePSServices, refs, basicLabels)
   389  
   390  				podList := &corev1.PodList{}
   391  				Expect(testK8sClient.List(ctx, podList, listOpt)).Should(Succeed())
   392  				Expect(len(podList.Items)).To(Equal(
   393  					int(tc.pendingPSPods + tc.activePSPods + tc.failedPSPods + tc.succeededPSPods +
   394  						tc.pendingWorkerPods + tc.activeWorkerPods + tc.failedWorkerPods + tc.succeededWorkerPods)))
   395  
   396  				By("waiting enough time")
   397  				now := metav1.Now()
   398  				tc.tfJob.Status.StartTime = &now
   399  				ads := tc.tfJob.Spec.RunPolicy.ActiveDeadlineSeconds
   400  				if ads != nil {
   401  					dur := time.Second * time.Duration(*ads)
   402  					time.Sleep(dur)
   403  				}
   404  
   405  				By("calling ReconcileJob")
   406  				_ = reconciler.ReconcileJobs(tc.tfJob, tc.tfJob.Spec.TFReplicaSpecs, tc.tfJob.Status, &tc.tfJob.Spec.RunPolicy)
   407  
   408  				podList = &corev1.PodList{}
   409  				Expect(testK8sClient.List(ctx, podList, listOpt, client.InNamespace(tc.tfJob.GetNamespace()))).Should(Succeed())
   410  				podRemainingCount := len(podList.Items)
   411  				Expect(podRemainingCount).To(Equal(tc.expectedPodRemaining))
   412  
   413  				svcList := &corev1.ServiceList{}
   414  				Expect(testK8sClient.List(ctx, svcList, listOpt)).Should(Succeed())
   415  				svcRemainingCount := len(svcList.Items)
   416  				Expect(svcRemainingCount).To(Equal(tc.expectedPodRemaining))
   417  			}
   418  		})
   419  	})
   420  
   421  	Context("Test Backoff For On Failure(", func() {
   422  		It("clean desired Pods and Services according to TFJob config", func() {
   423  			type testCase struct {
   424  				description string
   425  				tfJob       *kubeflowv1.TFJob
   426  
   427  				pendingWorkerPods   int32
   428  				activeWorkerPods    int32
   429  				succeededWorkerPods int32
   430  				failedWorkerPods    int32
   431  
   432  				restartCounts []int32
   433  
   434  				pendingPSPods   int32
   435  				activePSPods    int32
   436  				succeededPSPods int32
   437  				failedPSPods    int32
   438  
   439  				activeWorkerServices int32
   440  				activePSServices     int32
   441  
   442  				expectedPodRemaining int
   443  			}
   444  
   445  			backoffLimit4 := int32(4)
   446  			backoffLimitTest4 := &backoffLimit4
   447  			testCases := []testCase{
   448  				{
   449  					description: "4 workers each having 1 restartCount and 2 ps is running, backoffLimit 4 ",
   450  					tfJob:       tftestutil.NewTFJobWithBackoffLimit(0, 4, 2, backoffLimitTest4),
   451  
   452  					pendingWorkerPods:   0,
   453  					activeWorkerPods:    4,
   454  					succeededWorkerPods: 0,
   455  					failedWorkerPods:    0,
   456  
   457  					restartCounts: []int32{1, 1, 1, 1},
   458  
   459  					pendingPSPods:   0,
   460  					activePSPods:    2,
   461  					succeededPSPods: 0,
   462  					failedPSPods:    0,
   463  
   464  					activeWorkerServices: 4,
   465  					activePSServices:     2,
   466  
   467  					expectedPodRemaining: 0,
   468  				},
   469  			}
   470  
   471  			jobNameTemplate := "test-bof-%d"
   472  			for idx, tc := range testCases {
   473  				By(fmt.Sprintf("preparing cases %s", tc.description))
   474  				ctx := context.Background()
   475  				tc.tfJob.SetName(fmt.Sprintf(jobNameTemplate, idx))
   476  				tc.tfJob.SetUID(uuid.NewUUID())
   477  
   478  				refs := []metav1.OwnerReference{
   479  					*reconciler.GenOwnerReference(tc.tfJob),
   480  				}
   481  
   482  				basicLabels := reconciler.GenLabels(tc.tfJob.GetName())
   483  				selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{
   484  					MatchLabels: basicLabels,
   485  				})
   486  				Expect(err).Should(BeNil())
   487  				listOpt := client.MatchingLabelsSelector{
   488  					Selector: selector,
   489  				}
   490  
   491  				By("creating Services and Pods with designed phases")
   492  				tftestutil.SetPodsStatuses(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypeWorker,
   493  					tc.pendingWorkerPods, tc.activeWorkerPods, tc.succeededWorkerPods, tc.failedWorkerPods,
   494  					tc.restartCounts, refs, basicLabels)
   495  				tftestutil.SetPodsStatuses(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypePS,
   496  					tc.pendingPSPods, tc.activePSPods, tc.succeededPSPods, tc.failedPSPods,
   497  					tc.restartCounts, refs, basicLabels)
   498  
   499  				tftestutil.SetServices(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypeWorker, tc.activeWorkerServices, refs, basicLabels)
   500  				tftestutil.SetServices(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypePS, tc.activePSServices, refs, basicLabels)
   501  
   502  				podList := &corev1.PodList{}
   503  				Expect(testK8sClient.List(ctx, podList, listOpt)).Should(Succeed())
   504  				Expect(len(podList.Items)).To(Equal(
   505  					int(tc.pendingPSPods + tc.activePSPods + tc.failedPSPods + tc.succeededPSPods +
   506  						tc.pendingWorkerPods + tc.activeWorkerPods + tc.failedWorkerPods + tc.succeededWorkerPods)))
   507  
   508  				By("calling ReconcileJob")
   509  				_ = reconciler.ReconcileJobs(tc.tfJob, tc.tfJob.Spec.TFReplicaSpecs, tc.tfJob.Status, &tc.tfJob.Spec.RunPolicy)
   510  
   511  				podList = &corev1.PodList{}
   512  				Expect(testK8sClient.List(ctx, podList, listOpt, client.InNamespace(tc.tfJob.GetNamespace()))).Should(Succeed())
   513  				podRemainingCount := len(podList.Items)
   514  				Expect(podRemainingCount).To(Equal(tc.expectedPodRemaining))
   515  
   516  				svcList := &corev1.ServiceList{}
   517  				Expect(testK8sClient.List(ctx, svcList, listOpt)).Should(Succeed())
   518  				svcRemainingCount := len(svcList.Items)
   519  				Expect(svcRemainingCount).To(Equal(tc.expectedPodRemaining))
   520  			}
   521  		})
   522  	})
   523  
   524  	Context("Test TTL Seconds After Finished", func() {
   525  		It("should delete job when expired time is up", func() {
   526  			type testCase struct {
   527  				description string
   528  				tfJob       *kubeflowv1.TFJob
   529  				phase       corev1.PodPhase
   530  			}
   531  			testCases := []testCase{
   532  				{
   533  					description: "succeeded job with TTL 3s",
   534  					tfJob:       tftestutil.NewTFJobWithCleanupJobDelay(0, 1, 0, pointer.Int32(3)),
   535  					phase:       corev1.PodSucceeded,
   536  				},
   537  				{
   538  					description: "failed job with TTL 3s",
   539  					tfJob:       tftestutil.NewTFJobWithCleanupJobDelay(0, 1, 0, pointer.Int32(3)),
   540  					phase:       corev1.PodFailed,
   541  				},
   542  			}
   543  			jobNameTemplate := "test-bof-%d"
   544  			for idx, tc := range testCases {
   545  				By(fmt.Sprintf("preparing cases %s", tc.description))
   546  				ctx := context.Background()
   547  				name := fmt.Sprintf(jobNameTemplate, idx)
   548  				tc.tfJob.SetName(name)
   549  				tc.tfJob.CreationTimestamp = metav1.Now()
   550  
   551  				By("creating a TFJob")
   552  				Expect(reconciler.Create(ctx, tc.tfJob)).Should(Succeed())
   553  
   554  				// We need to wait for synchronizing cache.
   555  				By("getting a created TFJob")
   556  				var updatedTFJob kubeflowv1.TFJob
   557  				Eventually(func() error {
   558  					return reconciler.Get(ctx, client.ObjectKeyFromObject(tc.tfJob), &updatedTFJob)
   559  				}, testutil.Timeout, testutil.Interval).Should(BeNil())
   560  
   561  				initializeReplicaStatuses(&updatedTFJob.Status, kubeflowv1.TFJobReplicaTypeWorker)
   562  
   563  				By("prepare pod")
   564  				refs := []metav1.OwnerReference{
   565  					*reconciler.GenOwnerReference(tc.tfJob),
   566  				}
   567  				pod := tftestutil.NewBasePod("pod", tc.tfJob, refs)
   568  				pod.Status.Phase = tc.phase
   569  
   570  				By("update job replica statuses")
   571  				updateJobReplicaStatuses(&updatedTFJob.Status, kubeflowv1.TFJobReplicaTypeWorker, pod)
   572  
   573  				By("update job status")
   574  				Expect(reconciler.UpdateJobStatus(&updatedTFJob, updatedTFJob.Spec.TFReplicaSpecs, &updatedTFJob.Status)).To(Succeed())
   575  				By("updating job status...")
   576  				Expect(reconciler.Status().Update(ctx, &updatedTFJob)).To(Succeed())
   577  
   578  				By("waiting for updating replicaStatus for workers")
   579  				Eventually(func() *kubeflowv1.ReplicaStatus {
   580  					var getTFJob kubeflowv1.TFJob
   581  					Expect(reconciler.Get(ctx, client.ObjectKeyFromObject(tc.tfJob), &getTFJob)).Should(Succeed())
   582  					return getTFJob.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypeWorker]
   583  				}, testutil.Timeout, testutil.Interval).ShouldNot(BeNil())
   584  
   585  				ttl := updatedTFJob.Spec.RunPolicy.TTLSecondsAfterFinished
   586  				if ttl != nil {
   587  					dur := time.Second * time.Duration(*ttl)
   588  					time.Sleep(dur)
   589  				}
   590  
   591  				Eventually(func() error {
   592  					tfJob := &kubeflowv1.TFJob{}
   593  					key := types.NamespacedName{
   594  						Namespace: metav1.NamespaceDefault,
   595  						Name:      name,
   596  					}
   597  					if err := reconciler.Get(ctx, key, tfJob); err != nil {
   598  						if errors.IsNotFound(err) {
   599  							return nil
   600  						}
   601  						return err
   602  					}
   603  					return fmt.Errorf("job %s still remains", name)
   604  				}, testutil.Timeout, testutil.Interval).Should(BeNil())
   605  			}
   606  		})
   607  	})
   608  })
   609  
   610  var _ = Describe("Test for controller.v1/common", func() {
   611  	var (
   612  		ctx = context.Background()
   613  		ns  *corev1.Namespace
   614  		now metav1.Time
   615  	)
   616  	BeforeEach(func() {
   617  		ns = &corev1.Namespace{
   618  			ObjectMeta: metav1.ObjectMeta{
   619  				GenerateName: "tfjob-ns-",
   620  			},
   621  		}
   622  		now = metav1.Now()
   623  		Expect(testK8sClient.Create(ctx, ns)).Should(Succeed())
   624  	})
   625  	AfterEach(func() {
   626  		Expect(testK8sClient.Delete(ctx, ns)).Should(Succeed())
   627  	})
   628  
   629  	type cleanUpCases struct {
   630  		tfJob              *kubeflowv1.TFJob
   631  		runPolicy          *kubeflowv1.RunPolicy
   632  		jobStatus          kubeflowv1.JobStatus
   633  		wantTFJobIsRemoved bool
   634  		wantErr            bool
   635  	}
   636  	DescribeTable("TFJob is created and is cleaned up",
   637  		func(tc *cleanUpCases) {
   638  			tc.tfJob.SetNamespace(ns.Name)
   639  			Expect(testK8sClient.Create(ctx, tc.tfJob)).Should(Succeed())
   640  
   641  			if tc.wantErr {
   642  				Expect(reconciler.CleanupJob(tc.runPolicy, tc.jobStatus, tc.tfJob)).ShouldNot(Succeed())
   643  			} else {
   644  				Expect(reconciler.CleanupJob(tc.runPolicy, tc.jobStatus, tc.tfJob)).Should(Succeed())
   645  			}
   646  			if tc.wantTFJobIsRemoved {
   647  				Eventually(func() bool {
   648  					gotErr := testK8sClient.Get(ctx, client.ObjectKeyFromObject(tc.tfJob), &kubeflowv1.TFJob{})
   649  					return errors.IsNotFound(gotErr)
   650  				}, testutil.Timeout, testutil.Interval).Should(BeTrue())
   651  			} else {
   652  				Eventually(func() error {
   653  					return testK8sClient.Get(ctx, client.ObjectKeyFromObject(tc.tfJob), &kubeflowv1.TFJob{})
   654  				}, testutil.Timeout, testutil.Interval).Should(BeNil())
   655  			}
   656  		},
   657  		Entry("TFJob shouldn't be removed since TTL is nil", &cleanUpCases{
   658  			tfJob: tftestutil.NewTFJobWithCleanupJobDelay(1, 2, 0, nil),
   659  			runPolicy: &kubeflowv1.RunPolicy{
   660  				TTLSecondsAfterFinished: nil,
   661  			},
   662  			jobStatus:          kubeflowv1.JobStatus{},
   663  			wantTFJobIsRemoved: false,
   664  			wantErr:            false,
   665  		}),
   666  		Entry("Error is occurred since completionTime is nil", &cleanUpCases{
   667  			tfJob: tftestutil.NewTFJobWithCleanupJobDelay(1, 2, 0, pointer.Int32(10)),
   668  			runPolicy: &kubeflowv1.RunPolicy{
   669  				TTLSecondsAfterFinished: pointer.Int32(10),
   670  			},
   671  			jobStatus: kubeflowv1.JobStatus{
   672  				CompletionTime: nil,
   673  			},
   674  			wantTFJobIsRemoved: false,
   675  			wantErr:            true,
   676  		}),
   677  		Entry("TFJob is removed since exceeded TTL (TTL is 180s)", &cleanUpCases{
   678  			tfJob: tftestutil.NewTFJobWithCleanupJobDelay(1, 2, 0, pointer.Int32(180)),
   679  			runPolicy: &kubeflowv1.RunPolicy{
   680  				TTLSecondsAfterFinished: pointer.Int32(180),
   681  			},
   682  			jobStatus: kubeflowv1.JobStatus{
   683  				CompletionTime: &metav1.Time{
   684  					Time: now.AddDate(0, 0, -1),
   685  				},
   686  			},
   687  			wantTFJobIsRemoved: true,
   688  			wantErr:            false,
   689  		}),
   690  		Entry("TFJob is removed since (TTL is 0s)", &cleanUpCases{
   691  			tfJob: tftestutil.NewTFJobWithCleanupJobDelay(1, 2, 0, pointer.Int32(0)),
   692  			runPolicy: &kubeflowv1.RunPolicy{
   693  				TTLSecondsAfterFinished: pointer.Int32(0),
   694  			},
   695  			jobStatus: kubeflowv1.JobStatus{
   696  				CompletionTime: &now,
   697  			},
   698  			wantTFJobIsRemoved: true,
   699  			wantErr:            false,
   700  		}),
   701  	)
   702  
   703  	type createServiceCases struct {
   704  		tfJob   *kubeflowv1.TFJob
   705  		rType   kubeflowv1.ReplicaType
   706  		spec    *kubeflowv1.ReplicaSpec
   707  		uid     types.UID
   708  		index   int
   709  		wantErr bool
   710  	}
   711  	DescribeTable("CreateNewService",
   712  		func(tc *createServiceCases) {
   713  			tc.tfJob.SetUID(tc.uid)
   714  			tc.tfJob.SetNamespace(ns.Name)
   715  
   716  			gotErr := reconciler.CreateNewService(tc.tfJob, tc.rType, tc.spec, strconv.Itoa(tc.index))
   717  			if tc.wantErr {
   718  				Expect(gotErr).ShouldNot(Succeed())
   719  			} else {
   720  				Expect(gotErr).Should(Succeed())
   721  
   722  				svcInternalTPC := corev1.ServiceInternalTrafficPolicyCluster
   723  				svcSingleStack := corev1.IPFamilyPolicySingleStack
   724  				wantSvc := &corev1.Service{
   725  					ObjectMeta: metav1.ObjectMeta{
   726  						Name:      fmt.Sprintf("%s-%s-%d", tc.tfJob.Name, tc.rType, tc.index),
   727  						Namespace: ns.Name,
   728  						OwnerReferences: []metav1.OwnerReference{
   729  							*reconciler.GenOwnerReference(tc.tfJob),
   730  						},
   731  						Labels: map[string]string{
   732  							kubeflowv1.JobNameLabel:      tc.tfJob.Name,
   733  							kubeflowv1.OperatorNameLabel: controllerName,
   734  							kubeflowv1.ReplicaIndexLabel: strconv.Itoa(tc.index),
   735  							kubeflowv1.ReplicaTypeLabel:  "",
   736  						},
   737  					},
   738  					Spec: corev1.ServiceSpec{
   739  						Ports: []corev1.ServicePort{
   740  							{
   741  								Name:     kubeflowv1.TFJobDefaultPortName,
   742  								Protocol: corev1.ProtocolTCP,
   743  								Port:     kubeflowv1.TFJobDefaultPort,
   744  								TargetPort: intstr.IntOrString{
   745  									IntVal: kubeflowv1.TFJobDefaultPort,
   746  								},
   747  							},
   748  						},
   749  						Selector: map[string]string{
   750  							kubeflowv1.JobNameLabel:      tc.tfJob.Name,
   751  							kubeflowv1.OperatorNameLabel: controllerName,
   752  							kubeflowv1.ReplicaIndexLabel: strconv.Itoa(tc.index),
   753  							kubeflowv1.ReplicaTypeLabel:  "",
   754  						},
   755  						ClusterIP:             corev1.ClusterIPNone,
   756  						Type:                  corev1.ServiceTypeClusterIP,
   757  						ClusterIPs:            []string{corev1.ClusterIPNone},
   758  						SessionAffinity:       corev1.ClusterIPNone,
   759  						IPFamilies:            []corev1.IPFamily{corev1.IPv4Protocol},
   760  						IPFamilyPolicy:        &svcSingleStack,
   761  						InternalTrafficPolicy: &svcInternalTPC,
   762  					},
   763  				}
   764  				Eventually(func() *corev1.Service {
   765  					svc := &corev1.Service{}
   766  					Expect(testK8sClient.Get(ctx, client.ObjectKeyFromObject(wantSvc), svc)).Should(Succeed())
   767  					return svc
   768  				}, testutil.Timeout, testutil.Interval).Should(BeComparableTo(wantSvc,
   769  					cmpopts.IgnoreFields(metav1.ObjectMeta{}, "UID", "ResourceVersion", "Generation", "CreationTimestamp", "ManagedFields")))
   770  			}
   771  		},
   772  		Entry("Failed to create service since containerPort is missing", &createServiceCases{
   773  			tfJob: tftestutil.NewTFJobV2(2, 0, 0, 1, 0),
   774  			spec: &kubeflowv1.ReplicaSpec{
   775  				Template: corev1.PodTemplateSpec{
   776  					Spec: corev1.PodSpec{
   777  						Containers: []corev1.Container{
   778  							{
   779  								Name: kubeflowv1.TFJobDefaultContainerName,
   780  							},
   781  						},
   782  					},
   783  				},
   784  			},
   785  			index:   0,
   786  			wantErr: true,
   787  		}),
   788  		Entry("Failed to create service since Job's ownerReference is invalid", &createServiceCases{
   789  			tfJob:   tftestutil.NewTFJobV2(2, 0, 0, 1, 0),
   790  			spec:    &kubeflowv1.ReplicaSpec{Template: tftestutil.NewTFReplicaSpecTemplate()},
   791  			index:   1,
   792  			wantErr: true,
   793  		}),
   794  		Entry("Succeeded to create service", &createServiceCases{
   795  			tfJob:   tftestutil.NewTFJobV2(2, 0, 0, 1, 0),
   796  			spec:    &kubeflowv1.ReplicaSpec{Template: tftestutil.NewTFReplicaSpecTemplate()},
   797  			index:   0,
   798  			wantErr: false,
   799  			uid:     uuid.NewUUID(),
   800  		}),
   801  	)
   802  })