github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/tensorflow/tfjob_controller_test.go (about)

     1  // Copyright 2021 The Kubeflow Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package tensorflow
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  
    21  	. "github.com/onsi/ginkgo/v2"
    22  	. "github.com/onsi/gomega"
    23  	corev1 "k8s.io/api/core/v1"
    24  	"k8s.io/apimachinery/pkg/api/errors"
    25  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    26  	"k8s.io/apimachinery/pkg/types"
    27  	"k8s.io/apimachinery/pkg/util/uuid"
    28  	"k8s.io/utils/pointer"
    29  	"sigs.k8s.io/controller-runtime/pkg/client"
    30  
    31  	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
    32  	tftestutil "github.com/kubeflow/training-operator/pkg/controller.v1/tensorflow/testutil"
    33  	commonutil "github.com/kubeflow/training-operator/pkg/util"
    34  	"github.com/kubeflow/training-operator/pkg/util/testutil"
    35  )
    36  
    37  var _ = Describe("TFJob controller", func() {
    38  	Context("Test Normal Path", func() {
    39  		It("should create desired Pods and Services", func() {
    40  			var (
    41  				tfJobRunning   = kubeflowv1.JobRunning
    42  				tfJobSucceeded = kubeflowv1.JobSucceeded
    43  			)
    44  
    45  			testCases := map[string]struct {
    46  				worker int
    47  				ps     int
    48  
    49  				// pod setup
    50  				// ControllerError error
    51  				// jobKeyForget    bool
    52  
    53  				pendingWorkerPods   int32
    54  				activeWorkerPods    int32
    55  				succeededWorkerPods int32
    56  				failedWorkerPods    int32
    57  
    58  				pendingPSPods   int32
    59  				activePSPods    int32
    60  				succeededPSPods int32
    61  				failedPSPods    int32
    62  
    63  				activeWorkerServices int32
    64  				activePSServices     int32
    65  
    66  				// expectations
    67  				expectedPodCreations     int32
    68  				expectedPodDeletions     int32
    69  				expectedServiceCreations int32
    70  
    71  				expectedActiveWorkerPods    int32
    72  				expectedSucceededWorkerPods int32
    73  				expectedFailedWorkerPods    int32
    74  
    75  				expectedActivePSPods    int32
    76  				expectedSucceededPSPods int32
    77  				expectedFailedPSPods    int32
    78  
    79  				expectedCondition       *kubeflowv1.JobConditionType
    80  				expectedConditionReason string
    81  
    82  				// There are some cases that should not check start time since the field should be set in the previous sync loop.
    83  				needCheckStartTime bool
    84  			}{
    85  				"Local TFJob is created": {
    86  					1, 0,
    87  					0, 0, 0, 0,
    88  					0, 0, 0, 0,
    89  					0, 0,
    90  					1, 0, 1,
    91  					0, 0, 0,
    92  					0, 0, 0,
    93  					// We can not check if it is created since the condition is set in addTFJob.
    94  					nil, "",
    95  					false,
    96  				},
    97  				"Distributed TFJob (4 workers, 2 PS) is created": {
    98  					4, 2,
    99  					0, 0, 0, 0,
   100  					0, 0, 0, 0,
   101  					0, 0,
   102  					6, 0, 6,
   103  					0, 0, 0,
   104  					0, 0, 0,
   105  					nil, "",
   106  					false,
   107  				},
   108  				"Distributed TFJob (4 workers, 2 PS) is created and all replicas are pending": {
   109  					4, 2,
   110  					4, 0, 0, 0,
   111  					2, 0, 0, 0,
   112  					4, 2,
   113  					0, 0, 0,
   114  					0, 0, 0,
   115  					0, 0, 0,
   116  					nil, "",
   117  					false,
   118  				},
   119  				"Distributed TFJob (4 workers, 2 PS) is created and all replicas are running": {
   120  					4, 2,
   121  					0, 4, 0, 0,
   122  					0, 2, 0, 0,
   123  					4, 2,
   124  					0, 0, 0,
   125  					4, 0, 0,
   126  					2, 0, 0,
   127  					&tfJobRunning, commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobRunningReason),
   128  					true,
   129  				},
   130  				"Distributed TFJob (4 workers, 2 PS) is created, 2 workers, 1 PS are pending": {
   131  					4, 2,
   132  					2, 0, 0, 0,
   133  					1, 0, 0, 0,
   134  					2, 1,
   135  					3, 0, 3,
   136  					0, 0, 0,
   137  					0, 0, 0,
   138  					nil, "",
   139  					false,
   140  				},
   141  				"Distributed TFJob (4 workers, 2 PS) is created, 2 workers, 1 PS are pending, 1 worker is running": {
   142  					4, 2,
   143  					2, 1, 0, 0,
   144  					1, 0, 0, 0,
   145  					3, 1,
   146  					2, 0, 2,
   147  					1, 0, 0,
   148  					0, 0, 0,
   149  					&tfJobRunning, commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobRunningReason),
   150  					false,
   151  				},
   152  				"Distributed TFJob (4 workers, 2 PS) is created, 2 workers, 1 PS are pending, 1 worker is succeeded": {
   153  					4, 2,
   154  					2, 0, 1, 0,
   155  					1, 0, 0, 0,
   156  					3, 1,
   157  					2, 0, 2,
   158  					0, 1, 0,
   159  					0, 0, 0,
   160  					nil, "",
   161  					false,
   162  				},
   163  				"Distributed TFJob (4 workers, 2 PS) is succeeded": {
   164  					4, 2,
   165  					0, 0, 4, 0,
   166  					0, 0, 2, 0,
   167  					4, 2,
   168  					0, 0, 0,
   169  					0, 4, 0,
   170  					0, 2, 0,
   171  					&tfJobSucceeded, commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobSucceededReason),
   172  					false,
   173  				},
   174  			}
   175  
   176  			jobNameTemplate := "test-case-norm-%d"
   177  			caseIdx := 0
   178  			for name, tc := range testCases {
   179  				By(name)
   180  				ctx := context.Background()
   181  				jobName := fmt.Sprintf(jobNameTemplate, caseIdx)
   182  				caseIdx++
   183  
   184  				tfJob := tftestutil.NewTFJob(tc.worker, tc.ps)
   185  				tfJob.SetName(jobName)
   186  				tfJob.SetUID(uuid.NewUUID())
   187  
   188  				refs := []metav1.OwnerReference{*reconciler.GenOwnerReference(tfJob)}
   189  				basicLabels := reconciler.GenLabels(tfJob.GetName())
   190  
   191  				tftestutil.SetPodsStatuses(testK8sClient, tfJob, kubeflowv1.TFJobReplicaTypeWorker, tc.pendingWorkerPods, tc.activeWorkerPods, tc.succeededWorkerPods, tc.failedWorkerPods, nil, refs, basicLabels)
   192  				tftestutil.SetPodsStatuses(testK8sClient, tfJob, kubeflowv1.TFJobReplicaTypePS, tc.pendingPSPods, tc.activePSPods, tc.succeededPSPods, tc.failedPSPods, nil, refs, basicLabels)
   193  
   194  				tftestutil.SetServices(testK8sClient, tfJob, kubeflowv1.TFJobReplicaTypeWorker, tc.activeWorkerServices, refs, basicLabels)
   195  				tftestutil.SetServices(testK8sClient, tfJob, kubeflowv1.TFJobReplicaTypePS, tc.activePSServices, refs, basicLabels)
   196  
   197  				totalPodNumber := int(tc.pendingWorkerPods + tc.activeWorkerPods + tc.succeededWorkerPods + tc.failedWorkerPods + tc.pendingPSPods + tc.activePSPods + tc.succeededPSPods + tc.failedPSPods)
   198  				totalServiceNumber := int(tc.activeWorkerServices + tc.activePSServices)
   199  
   200  				selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{MatchLabels: reconciler.GenLabels(tfJob.GetName())})
   201  				Expect(err).Should(BeNil())
   202  				listOpt := client.MatchingLabelsSelector{Selector: selector}
   203  				Eventually(func() error {
   204  					podList := &corev1.PodList{}
   205  					svcList := &corev1.ServiceList{}
   206  
   207  					err = testK8sClient.List(ctx, podList, listOpt)
   208  					if err != nil {
   209  						return err
   210  					}
   211  					if len(podList.Items) != totalPodNumber {
   212  						return fmt.Errorf("expected %d Pods, got %d", totalPodNumber, len(podList.Items))
   213  					}
   214  
   215  					err = testK8sClient.List(ctx, svcList, listOpt)
   216  					if err != nil {
   217  						return err
   218  					}
   219  					if len(svcList.Items) != totalServiceNumber {
   220  						return fmt.Errorf("expected %d Services, got %d", totalServiceNumber, len(svcList.Items))
   221  					}
   222  					return nil
   223  				}).Should(BeNil())
   224  
   225  				_ = reconciler.ReconcileJobs(tfJob, tfJob.Spec.TFReplicaSpecs, tfJob.Status, &tfJob.Spec.RunPolicy)
   226  
   227  				// Check the number of Pods and Services
   228  				//var pods []*corev1.Pod = nil
   229  				//var svcs []*corev1.Service = nil
   230  				Eventually(func() error {
   231  					podList := &corev1.PodList{}
   232  					svcList := &corev1.ServiceList{}
   233  
   234  					err = testK8sClient.List(ctx, podList, listOpt)
   235  					if err != nil {
   236  						return err
   237  					}
   238  					podCreatedNumber := 0
   239  					if len(podList.Items) > totalPodNumber {
   240  						podCreatedNumber = len(podList.Items) - totalPodNumber
   241  					}
   242  					podDeletedNumber := 0
   243  					if len(podList.Items) < totalPodNumber {
   244  						podDeletedNumber = totalPodNumber - len(podList.Items)
   245  					}
   246  					if podCreatedNumber != int(tc.expectedPodCreations) {
   247  						return fmt.Errorf("%s: unexpected number of pod creates.  Expected %d, saw %d\n", name, tc.expectedPodCreations, podCreatedNumber)
   248  					}
   249  					if podDeletedNumber != int(tc.expectedPodDeletions) {
   250  						return fmt.Errorf("%s: unexpected number of service creates.  Expected %d, saw %d\n", name, tc.expectedServiceCreations, podDeletedNumber)
   251  					}
   252  					// check controller references for all pods
   253  					for _, p := range podList.Items {
   254  						for _, ref := range p.GetOwnerReferences() {
   255  							if ref.APIVersion != kubeflowv1.SchemeGroupVersion.String() {
   256  								return fmt.Errorf("controllerRef.APIVersion = %q, want %q", ref.APIVersion, kubeflowv1.SchemeGroupVersion.String())
   257  							}
   258  							if ref.Kind != kubeflowv1.TFJobKind {
   259  								return fmt.Errorf("controllerRef.MPIKind = %q, want %q", ref.Kind, kubeflowv1.TFJobKind)
   260  							}
   261  							if ref.Name != tfJob.GetName() {
   262  								return fmt.Errorf("controllerRef.Name = %q, want %q", ref.Name, tfJob.GetName())
   263  							}
   264  							if ref.UID != tfJob.GetUID() {
   265  								return fmt.Errorf("controllerRef.UID = %q, want %q", ref.UID, tfJob.GetUID())
   266  							}
   267  						}
   268  					}
   269  
   270  					err = testK8sClient.List(ctx, svcList, listOpt)
   271  					if err != nil {
   272  						return err
   273  					}
   274  					serviceCreatedNumber := 0
   275  					if len(svcList.Items) > totalServiceNumber {
   276  						serviceCreatedNumber = len(svcList.Items) - totalServiceNumber
   277  					}
   278  					if serviceCreatedNumber != int(tc.expectedServiceCreations) {
   279  						return fmt.Errorf("%s: unexpected number of pod deletes.  Expected %d, saw %d\n", name, tc.expectedPodDeletions, serviceCreatedNumber)
   280  					}
   281  					// check controller reference for all services
   282  					for _, s := range svcList.Items {
   283  						for _, ref := range s.GetOwnerReferences() {
   284  							if ref.APIVersion != kubeflowv1.SchemeGroupVersion.String() {
   285  								return fmt.Errorf("controllerRef.APIVersion = %q, want %q", ref.APIVersion, kubeflowv1.SchemeGroupVersion.String())
   286  							}
   287  							if ref.Kind != kubeflowv1.TFJobKind {
   288  								return fmt.Errorf("controllerRef.MPIKind = %q, want %q", ref.Kind, kubeflowv1.TFJobKind)
   289  							}
   290  							if ref.Name != tfJob.GetName() {
   291  								return fmt.Errorf("controllerRef.Name = %q, want %q", ref.Name, tfJob.GetName())
   292  							}
   293  							if ref.UID != tfJob.GetUID() {
   294  								return fmt.Errorf("controllerRef.UID = %q, want %q", ref.UID, tfJob.GetUID())
   295  							}
   296  						}
   297  					}
   298  					return nil
   299  				}).Should(BeNil())
   300  
   301  				// Validate Worker status
   302  				if tfJob.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypeWorker] != nil {
   303  					Expect(tfJob.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypeWorker].Active).To(Equal(tc.expectedActiveWorkerPods))
   304  					Expect(tfJob.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypeWorker].Succeeded).To(Equal(tc.expectedSucceededWorkerPods))
   305  					Expect(tfJob.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypeWorker].Failed).To(Equal(tc.expectedFailedWorkerPods))
   306  				}
   307  				// Validate PS status
   308  				if tfJob.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypePS] != nil {
   309  					Expect(tfJob.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypePS].Active).To(Equal(tc.expectedActivePSPods))
   310  					Expect(tfJob.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypePS].Succeeded).To(Equal(tc.expectedSucceededPSPods))
   311  					Expect(tfJob.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypePS].Failed).To(Equal(tc.expectedFailedPSPods))
   312  				}
   313  
   314  				// Validate StartTime
   315  				if tc.needCheckStartTime {
   316  					Expect(tfJob.Status.StartTime).NotTo(BeNil())
   317  				}
   318  
   319  				// Validate Conditions
   320  				if tc.expectedCondition != nil {
   321  					Expect(tftestutil.CheckCondition(tfJob, *tc.expectedCondition, tc.expectedConditionReason)).Should(BeTrue())
   322  				}
   323  			}
   324  		})
   325  	})
   326  
   327  	Context("TFJob with suspend semantics", func() {
		// name is the TFJob name used by every spec in this context; the pod and
		// service keys below are derived from it.
		const name = "test-job"
		// Shared per-spec fixtures. All except ctx are (re)assigned in BeforeEach.
		var (
			ns         *corev1.Namespace    // namespace created for the current spec
			job        *kubeflowv1.TFJob    // TFJob under test (created by each spec, deleted in AfterEach)
			jobKey     types.NamespacedName // key of job
			chiefKey   types.NamespacedName // key "<name>-chief-0", used for both the chief Pod and Service
			worker0Key types.NamespacedName // key "<name>-worker-0", used for both the worker Pod and Service
			ctx        = context.Background()
		)
   337  		BeforeEach(func() {
   338  			ns = &corev1.Namespace{
   339  				ObjectMeta: metav1.ObjectMeta{
   340  					GenerateName: "tensorflow-test-",
   341  				},
   342  			}
   343  			Expect(testK8sClient.Create(ctx, ns)).Should(Succeed())
   344  
   345  			// chief=1, worker=1
   346  			job = tftestutil.NewTFJobV2(1, 0, 0, 1, 0)
   347  			job.SetName(name)
   348  			job.SetNamespace(ns.Name)
   349  			jobKey = client.ObjectKeyFromObject(job)
   350  			chiefKey = types.NamespacedName{
   351  				Name:      fmt.Sprintf("%s-chief-0", name),
   352  				Namespace: ns.Name,
   353  			}
   354  			worker0Key = types.NamespacedName{
   355  				Name:      fmt.Sprintf("%s-worker-0", name),
   356  				Namespace: ns.Name,
   357  			}
   358  		})
   359  		AfterEach(func() {
   360  			Expect(testK8sClient.Delete(ctx, job)).Should(Succeed())
   361  			Expect(testK8sClient.Delete(ctx, ns)).Should(Succeed())
   362  		})
   363  
   364  		It("Shouldn't create resources if TFJob is suspended", func() {
   365  			By("By creating a new TFJob with suspend=true")
   366  			job.Spec.RunPolicy.Suspend = pointer.Bool(true)
   367  			Expect(testK8sClient.Create(ctx, job)).Should(Succeed())
   368  
   369  			created := &kubeflowv1.TFJob{}
   370  			chiefPod := &corev1.Pod{}
   371  			workerPod := &corev1.Pod{}
   372  			chiefSvc := &corev1.Service{}
   373  			workerSvc := &corev1.Service{}
   374  
   375  			By("Checking created TFJob")
   376  			Eventually(func() bool {
   377  				err := testK8sClient.Get(ctx, jobKey, created)
   378  				return err == nil
   379  			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
   380  			By("Checking created TFJob has a nil startTime")
   381  			Consistently(func() *metav1.Time {
   382  				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
   383  				return created.Status.StartTime
   384  			}, testutil.ConsistentDuration, testutil.Interval).Should(BeNil())
   385  
   386  			By("Checking if the pods and services aren't created")
   387  			Consistently(func() bool {
   388  				errChiefPod := testK8sClient.Get(ctx, chiefKey, chiefPod)
   389  				errWorkerPod := testK8sClient.Get(ctx, worker0Key, workerPod)
   390  				errChiefSvc := testK8sClient.Get(ctx, chiefKey, chiefSvc)
   391  				errWorkerSvc := testK8sClient.Get(ctx, worker0Key, workerSvc)
   392  				return errors.IsNotFound(errChiefPod) && errors.IsNotFound(errWorkerPod) &&
   393  					errors.IsNotFound(errChiefSvc) && errors.IsNotFound(errWorkerSvc)
   394  			}, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue())
   395  
   396  			By("Checking if the TFJob has suspended condition")
   397  			Eventually(func() []kubeflowv1.JobCondition {
   398  				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
   399  				return created.Status.Conditions
   400  			}, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition{
   401  				{
   402  					Type:    kubeflowv1.JobCreated,
   403  					Status:  corev1.ConditionTrue,
   404  					Reason:  commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobCreatedReason),
   405  					Message: fmt.Sprintf("TFJob %s is created.", name),
   406  				},
   407  				{
   408  					Type:    kubeflowv1.JobSuspended,
   409  					Status:  corev1.ConditionTrue,
   410  					Reason:  commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobSuspendedReason),
   411  					Message: fmt.Sprintf("TFJob %s is suspended.", name),
   412  				},
   413  			}, testutil.IgnoreJobConditionsTimes))
   414  		})
   415  
		// End-to-end suspend/resume flow for a running TFJob:
		//  1. create the job and wait for its start time, Pods and Services;
		//  2. mark both Pods Running and expect the Created+Running conditions;
		//  3. set suspend=true and expect Pods/Services to be removed and stay
		//     removed, the start time to be unchanged, and the Suspended condition;
		//  4. set suspend=false and expect Pods/Services to be recreated, the
		//     resumed conditions, and a start time different from the original.
		It("Should delete resources after TFJob is suspended; Should resume TFJob after TFJob is unsuspended", func() {
			By("By creating a new TFJob")
			Expect(testK8sClient.Create(ctx, job)).Should(Succeed())

			// Scratch objects that the Get calls below decode into.
			created := &kubeflowv1.TFJob{}
			chiefPod := &corev1.Pod{}
			workerPod := &corev1.Pod{}
			chiefSvc := &corev1.Service{}
			workerSvc := &corev1.Service{}

			// We'll need to retry getting this newly created TFJob, given that creation may not immediately happen.
			By("Checking created TFJob")
			Eventually(func() bool {
				err := testK8sClient.Get(ctx, jobKey, created)
				return err == nil
			}, testutil.Timeout, testutil.Interval).Should(BeTrue())

			// Remember the first non-nil start time; it is compared against the
			// status during suspension and after resumption.
			var startTimeBeforeSuspended *metav1.Time
			Eventually(func() *metav1.Time {
				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
				startTimeBeforeSuspended = created.Status.StartTime
				return startTimeBeforeSuspended
			}, testutil.Timeout, testutil.Interval).ShouldNot(BeNil())

			By("Checking the created pods and services")
			Eventually(func() bool {
				errChief := testK8sClient.Get(ctx, chiefKey, chiefPod)
				errWorker := testK8sClient.Get(ctx, worker0Key, workerPod)
				return errChief == nil && errWorker == nil
			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
			Eventually(func() bool {
				errChief := testK8sClient.Get(ctx, chiefKey, chiefSvc)
				errWorker := testK8sClient.Get(ctx, worker0Key, workerSvc)
				return errChief == nil && errWorker == nil
			}, testutil.Timeout, testutil.Interval).Should(BeTrue())

			By("Updating the pod's phase with Running")
			// The test writes the Running phase itself through the status
			// subresource; each attempt re-reads the Pod first and is retried
			// until the update succeeds.
			Eventually(func() error {
				Expect(testK8sClient.Get(ctx, chiefKey, chiefPod)).Should(Succeed())
				chiefPod.Status.Phase = corev1.PodRunning
				return testK8sClient.Status().Update(ctx, chiefPod)
			}, testutil.Timeout, testutil.Interval).Should(Succeed())
			Eventually(func() error {
				Expect(testK8sClient.Get(ctx, worker0Key, workerPod)).Should(Succeed())
				workerPod.Status.Phase = corev1.PodRunning
				return testK8sClient.Status().Update(ctx, workerPod)
			}, testutil.Timeout, testutil.Interval).Should(Succeed())

			By("Checking the TFJob's condition")
			Eventually(func() []kubeflowv1.JobCondition {
				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
				return created.Status.Conditions
			}, testutil.Timeout, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition{
				{
					Type:    kubeflowv1.JobCreated,
					Status:  corev1.ConditionTrue,
					Reason:  commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobCreatedReason),
					Message: fmt.Sprintf("TFJob %s is created.", name),
				},
				{
					Type:    kubeflowv1.JobRunning,
					Status:  corev1.ConditionTrue,
					Reason:  commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobRunningReason),
					Message: fmt.Sprintf("TFJob %s/%s is running.", ns.Name, name),
				},
			}, testutil.IgnoreJobConditionsTimes))

			By("Updating the TFJob with suspend=true")
			// Re-read before each update attempt so the update is applied to the
			// latest fetched copy of the job.
			Eventually(func() error {
				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
				created.Spec.RunPolicy.Suspend = pointer.Bool(true)
				return testK8sClient.Update(ctx, created)
			}, testutil.Timeout, testutil.Interval).Should(Succeed())

			By("Checking if the pods and services are removed")
			Eventually(func() bool {
				errChief := testK8sClient.Get(ctx, chiefKey, chiefPod)
				errWorker := testK8sClient.Get(ctx, worker0Key, workerPod)
				return errors.IsNotFound(errChief) && errors.IsNotFound(errWorker)
			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
			Eventually(func() bool {
				errChief := testK8sClient.Get(ctx, chiefKey, chiefSvc)
				errWorker := testK8sClient.Get(ctx, worker0Key, workerSvc)
				return errors.IsNotFound(errChief) && errors.IsNotFound(errWorker)
			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
			// Once gone, the Pods and Services must stay gone while suspended.
			Consistently(func() bool {
				errChiefPod := testK8sClient.Get(ctx, chiefKey, chiefPod)
				errWorkerPod := testK8sClient.Get(ctx, worker0Key, workerPod)
				errChiefSvc := testK8sClient.Get(ctx, chiefKey, chiefSvc)
				errWorkerSvc := testK8sClient.Get(ctx, worker0Key, workerSvc)
				return errors.IsNotFound(errChiefPod) && errors.IsNotFound(errWorkerPod) &&
					errors.IsNotFound(errChiefSvc) && errors.IsNotFound(errWorkerSvc)
			}, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue())

			By("Checking if the TFJob has a suspended condition")
			// While suspended: no active replicas and the start time still equals
			// the value captured before suspension.
			// NOTE(review): the ReplicaStatuses lookups are dereferenced without a
			// nil check — this relies on both entries being present in the status.
			Eventually(func() bool {
				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
				return created.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypeChief].Active == 0 &&
					created.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypeWorker].Active == 0 &&
					created.Status.StartTime.Equal(startTimeBeforeSuspended)
			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
			Consistently(func() bool {
				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
				return created.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypeChief].Active == 0 &&
					created.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypeWorker].Active == 0 &&
					created.Status.StartTime.Equal(startTimeBeforeSuspended)
			}, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue())
			// created holds the copy fetched by the last Get in the Consistently above.
			Expect(created.Status.Conditions).Should(BeComparableTo([]kubeflowv1.JobCondition{
				{
					Type:    kubeflowv1.JobCreated,
					Status:  corev1.ConditionTrue,
					Reason:  commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobCreatedReason),
					Message: fmt.Sprintf("TFJob %s is created.", name),
				},
				{
					Type:    kubeflowv1.JobRunning,
					Status:  corev1.ConditionFalse,
					Reason:  commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobSuspendedReason),
					Message: fmt.Sprintf("TFJob %s is suspended.", name),
				},
				{
					Type:    kubeflowv1.JobSuspended,
					Reason:  commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobSuspendedReason),
					Message: fmt.Sprintf("TFJob %s is suspended.", name),
					Status:  corev1.ConditionTrue,
				},
			}, testutil.IgnoreJobConditionsTimes))

			By("Unsuspending the TFJob")
			Eventually(func() error {
				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
				created.Spec.RunPolicy.Suspend = pointer.Bool(false)
				return testK8sClient.Update(ctx, created)
			}, testutil.Timeout, testutil.Interval).Should(Succeed())
			Eventually(func() *metav1.Time {
				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
				return created.Status.StartTime
			}, testutil.Timeout, testutil.Interval).ShouldNot(BeNil())

			By("Check if the pods and services are created")
			Eventually(func() error {
				return testK8sClient.Get(ctx, chiefKey, chiefPod)
			}, testutil.Timeout, testutil.Interval).Should(BeNil())
			Eventually(func() error {
				return testK8sClient.Get(ctx, worker0Key, workerPod)
			}, testutil.Timeout, testutil.Interval).Should(BeNil())
			Eventually(func() error {
				return testK8sClient.Get(ctx, chiefKey, chiefSvc)
			}, testutil.Timeout, testutil.Interval).Should(BeNil())
			Eventually(func() error {
				return testK8sClient.Get(ctx, worker0Key, workerSvc)
			}, testutil.Timeout, testutil.Interval).Should(BeNil())

			By("Updating Pod's condition with running")
			// Same manual phase update as before suspension, now on the recreated Pods.
			Eventually(func() error {
				Expect(testK8sClient.Get(ctx, chiefKey, chiefPod)).Should(Succeed())
				chiefPod.Status.Phase = corev1.PodRunning
				return testK8sClient.Status().Update(ctx, chiefPod)
			}, testutil.Timeout, testutil.Interval).Should(Succeed())
			Eventually(func() error {
				Expect(testK8sClient.Get(ctx, worker0Key, workerPod)).Should(Succeed())
				workerPod.Status.Phase = corev1.PodRunning
				return testK8sClient.Status().Update(ctx, workerPod)
			}, testutil.Timeout, testutil.Interval).Should(Succeed())

			By("Checking if the TFJob has resumed conditions")
			Eventually(func() []kubeflowv1.JobCondition {
				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
				return created.Status.Conditions
			}, testutil.Timeout, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition{
				{
					Type:    kubeflowv1.JobCreated,
					Status:  corev1.ConditionTrue,
					Reason:  commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobCreatedReason),
					Message: fmt.Sprintf("TFJob %s is created.", name),
				},
				{
					Type:    kubeflowv1.JobSuspended,
					Reason:  commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobResumedReason),
					Message: fmt.Sprintf("TFJob %s is resumed.", name),
					Status:  corev1.ConditionFalse,
				},
				{
					Type:    kubeflowv1.JobRunning,
					Status:  corev1.ConditionTrue,
					Reason:  commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobRunningReason),
					Message: fmt.Sprintf("TFJob %s/%s is running.", ns.Name, name),
				},
			}, testutil.IgnoreJobConditionsTimes))

			By("Checking if the startTime is updated")
			// created is the copy fetched by the last Get in the Eventually above.
			Expect(created.Status.StartTime).ShouldNot(Equal(startTimeBeforeSuspended))
		})
   609  	})
   610  })