github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/tensorflow/tfjob_controller_test.go (about) 1 // Copyright 2021 The Kubeflow Authors 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package tensorflow 16 17 import ( 18 "context" 19 "fmt" 20 21 . "github.com/onsi/ginkgo/v2" 22 . "github.com/onsi/gomega" 23 corev1 "k8s.io/api/core/v1" 24 "k8s.io/apimachinery/pkg/api/errors" 25 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 26 "k8s.io/apimachinery/pkg/types" 27 "k8s.io/apimachinery/pkg/util/uuid" 28 "k8s.io/utils/pointer" 29 "sigs.k8s.io/controller-runtime/pkg/client" 30 31 kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" 32 tftestutil "github.com/kubeflow/training-operator/pkg/controller.v1/tensorflow/testutil" 33 commonutil "github.com/kubeflow/training-operator/pkg/util" 34 "github.com/kubeflow/training-operator/pkg/util/testutil" 35 ) 36 37 var _ = Describe("TFJob controller", func() { 38 Context("Test Normal Path", func() { 39 It("should create desired Pods and Services", func() { 40 var ( 41 tfJobRunning = kubeflowv1.JobRunning 42 tfJobSucceeded = kubeflowv1.JobSucceeded 43 ) 44 45 testCases := map[string]struct { 46 worker int 47 ps int 48 49 // pod setup 50 // ControllerError error 51 // jobKeyForget bool 52 53 pendingWorkerPods int32 54 activeWorkerPods int32 55 succeededWorkerPods int32 56 failedWorkerPods int32 57 58 pendingPSPods int32 59 activePSPods int32 60 succeededPSPods int32 61 failedPSPods int32 62 63 activeWorkerServices int32 64 activePSServices int32 65 66 // expectations 67 expectedPodCreations int32 68 expectedPodDeletions int32 69 expectedServiceCreations int32 70 71 expectedActiveWorkerPods int32 72 expectedSucceededWorkerPods int32 73 expectedFailedWorkerPods int32 74 75 expectedActivePSPods int32 76 expectedSucceededPSPods int32 77 expectedFailedPSPods int32 78 79 expectedCondition *kubeflowv1.JobConditionType 80 expectedConditionReason string 81 82 // There are some cases that should not check start time since the field should be set in the previous sync loop. 83 needCheckStartTime bool 84 }{ 85 "Local TFJob is created": { 86 1, 0, 87 0, 0, 0, 0, 88 0, 0, 0, 0, 89 0, 0, 90 1, 0, 1, 91 0, 0, 0, 92 0, 0, 0, 93 // We can not check if it is created since the condition is set in addTFJob. 94 nil, "", 95 false, 96 }, 97 "Distributed TFJob (4 workers, 2 PS) is created": { 98 4, 2, 99 0, 0, 0, 0, 100 0, 0, 0, 0, 101 0, 0, 102 6, 0, 6, 103 0, 0, 0, 104 0, 0, 0, 105 nil, "", 106 false, 107 }, 108 "Distributed TFJob (4 workers, 2 PS) is created and all replicas are pending": { 109 4, 2, 110 4, 0, 0, 0, 111 2, 0, 0, 0, 112 4, 2, 113 0, 0, 0, 114 0, 0, 0, 115 0, 0, 0, 116 nil, "", 117 false, 118 }, 119 "Distributed TFJob (4 workers, 2 PS) is created and all replicas are running": { 120 4, 2, 121 0, 4, 0, 0, 122 0, 2, 0, 0, 123 4, 2, 124 0, 0, 0, 125 4, 0, 0, 126 2, 0, 0, 127 &tfJobRunning, commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobRunningReason), 128 true, 129 }, 130 "Distributed TFJob (4 workers, 2 PS) is created, 2 workers, 1 PS are pending": { 131 4, 2, 132 2, 0, 0, 0, 133 1, 0, 0, 0, 134 2, 1, 135 3, 0, 3, 136 0, 0, 0, 137 0, 0, 0, 138 nil, "", 139 false, 140 }, 141 "Distributed TFJob (4 workers, 2 PS) is created, 2 workers, 1 PS are pending, 1 worker is running": { 142 4, 2, 143 2, 1, 0, 0, 144 1, 0, 0, 0, 145 3, 1, 146 2, 0, 2, 147 1, 0, 0, 148 0, 0, 0, 149 &tfJobRunning, commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobRunningReason), 150 false, 151 }, 152 "Distributed TFJob (4 workers, 2 PS) is created, 2 workers, 1 PS are pending, 1 worker is succeeded": { 153 4, 2, 154 2, 0, 1, 0, 155 1, 0, 0, 0, 156 3, 1, 157 2, 0, 2, 158 0, 1, 0, 159 0, 0, 0, 160 nil, "", 161 false, 162 }, 163 "Distributed TFJob (4 workers, 2 PS) is succeeded": { 164 4, 2, 165 0, 0, 4, 0, 166 0, 0, 2, 0, 167 4, 2, 168 0, 0, 0, 169 0, 4, 0, 170 0, 2, 0, 171 &tfJobSucceeded, commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobSucceededReason), 172 false, 173 }, 174 } 175 176 jobNameTemplate := "test-case-norm-%d" 177 caseIdx := 0 178 for name, tc := range testCases { 179 By(name) 180 ctx := context.Background() 181 jobName := fmt.Sprintf(jobNameTemplate, caseIdx) 182 caseIdx++ 183 184 tfJob := tftestutil.NewTFJob(tc.worker, tc.ps) 185 tfJob.SetName(jobName) 186 tfJob.SetUID(uuid.NewUUID()) 187 188 refs := []metav1.OwnerReference{*reconciler.GenOwnerReference(tfJob)} 189 basicLabels := reconciler.GenLabels(tfJob.GetName()) 190 191 tftestutil.SetPodsStatuses(testK8sClient, tfJob, kubeflowv1.TFJobReplicaTypeWorker, tc.pendingWorkerPods, tc.activeWorkerPods, tc.succeededWorkerPods, tc.failedWorkerPods, nil, refs, basicLabels) 192 tftestutil.SetPodsStatuses(testK8sClient, tfJob, kubeflowv1.TFJobReplicaTypePS, tc.pendingPSPods, tc.activePSPods, tc.succeededPSPods, tc.failedPSPods, nil, refs, basicLabels) 193 194 tftestutil.SetServices(testK8sClient, tfJob, kubeflowv1.TFJobReplicaTypeWorker, tc.activeWorkerServices, refs, basicLabels) 195 tftestutil.SetServices(testK8sClient, tfJob, kubeflowv1.TFJobReplicaTypePS, tc.activePSServices, refs, basicLabels) 196 197 totalPodNumber := int(tc.pendingWorkerPods + tc.activeWorkerPods + tc.succeededWorkerPods + tc.failedWorkerPods + tc.pendingPSPods + tc.activePSPods + tc.succeededPSPods + tc.failedPSPods) 198 totalServiceNumber := int(tc.activeWorkerServices + tc.activePSServices) 199 200 selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{MatchLabels: reconciler.GenLabels(tfJob.GetName())}) 201 Expect(err).Should(BeNil()) 202 listOpt := client.MatchingLabelsSelector{Selector: selector} 203 Eventually(func() error { 204 podList := &corev1.PodList{} 205 svcList := &corev1.ServiceList{} 206 207 err = testK8sClient.List(ctx, podList, listOpt) 208 if err != nil { 209 return err 210 } 211 if len(podList.Items) != totalPodNumber { 212 return fmt.Errorf("expected %d Pods, got %d", totalPodNumber, len(podList.Items)) 213 } 214 215 err = testK8sClient.List(ctx, svcList, listOpt) 216 if err != nil { 217 return err 218 } 219 if len(svcList.Items) != totalServiceNumber { 220 return fmt.Errorf("expected %d Services, got %d", totalServiceNumber, len(svcList.Items)) 221 } 222 return nil 223 }).Should(BeNil()) 224 225 _ = reconciler.ReconcileJobs(tfJob, tfJob.Spec.TFReplicaSpecs, tfJob.Status, &tfJob.Spec.RunPolicy) 226 227 // Check the number of Pods and Services 228 //var pods []*corev1.Pod = nil 229 //var svcs []*corev1.Service = nil 230 Eventually(func() error { 231 podList := &corev1.PodList{} 232 svcList := &corev1.ServiceList{} 233 234 err = testK8sClient.List(ctx, podList, listOpt) 235 if err != nil { 236 return err 237 } 238 podCreatedNumber := 0 239 if len(podList.Items) > totalPodNumber { 240 podCreatedNumber = len(podList.Items) - totalPodNumber 241 } 242 podDeletedNumber := 0 243 if len(podList.Items) < totalPodNumber { 244 podDeletedNumber = totalPodNumber - len(podList.Items) 245 } 246 if podCreatedNumber != int(tc.expectedPodCreations) { 247 return fmt.Errorf("%s: unexpected number of pod creates. Expected %d, saw %d\n", name, tc.expectedPodCreations, podCreatedNumber) 248 } 249 if podDeletedNumber != int(tc.expectedPodDeletions) { 250 return fmt.Errorf("%s: unexpected number of service creates. Expected %d, saw %d\n", name, tc.expectedServiceCreations, podDeletedNumber) 251 } 252 // check controller references for all pods 253 for _, p := range podList.Items { 254 for _, ref := range p.GetOwnerReferences() { 255 if ref.APIVersion != kubeflowv1.SchemeGroupVersion.String() { 256 return fmt.Errorf("controllerRef.APIVersion = %q, want %q", ref.APIVersion, kubeflowv1.SchemeGroupVersion.String()) 257 } 258 if ref.Kind != kubeflowv1.TFJobKind { 259 return fmt.Errorf("controllerRef.MPIKind = %q, want %q", ref.Kind, kubeflowv1.TFJobKind) 260 } 261 if ref.Name != tfJob.GetName() { 262 return fmt.Errorf("controllerRef.Name = %q, want %q", ref.Name, tfJob.GetName()) 263 } 264 if ref.UID != tfJob.GetUID() { 265 return fmt.Errorf("controllerRef.UID = %q, want %q", ref.UID, tfJob.GetUID()) 266 } 267 } 268 } 269 270 err = testK8sClient.List(ctx, svcList, listOpt) 271 if err != nil { 272 return err 273 } 274 serviceCreatedNumber := 0 275 if len(svcList.Items) > totalServiceNumber { 276 serviceCreatedNumber = len(svcList.Items) - totalServiceNumber 277 } 278 if serviceCreatedNumber != int(tc.expectedServiceCreations) { 279 return fmt.Errorf("%s: unexpected number of pod deletes. Expected %d, saw %d\n", name, tc.expectedPodDeletions, serviceCreatedNumber) 280 } 281 // check controller reference for all services 282 for _, s := range svcList.Items { 283 for _, ref := range s.GetOwnerReferences() { 284 if ref.APIVersion != kubeflowv1.SchemeGroupVersion.String() { 285 return fmt.Errorf("controllerRef.APIVersion = %q, want %q", ref.APIVersion, kubeflowv1.SchemeGroupVersion.String()) 286 } 287 if ref.Kind != kubeflowv1.TFJobKind { 288 return fmt.Errorf("controllerRef.MPIKind = %q, want %q", ref.Kind, kubeflowv1.TFJobKind) 289 } 290 if ref.Name != tfJob.GetName() { 291 return fmt.Errorf("controllerRef.Name = %q, want %q", ref.Name, tfJob.GetName()) 292 } 293 if ref.UID != tfJob.GetUID() { 294 return fmt.Errorf("controllerRef.UID = %q, want %q", ref.UID, tfJob.GetUID()) 295 } 296 } 297 } 298 return nil 299 }).Should(BeNil()) 300 301 // Validate Worker status 302 if tfJob.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypeWorker] != nil { 303 Expect(tfJob.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypeWorker].Active).To(Equal(tc.expectedActiveWorkerPods)) 304 Expect(tfJob.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypeWorker].Succeeded).To(Equal(tc.expectedSucceededWorkerPods)) 305 Expect(tfJob.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypeWorker].Failed).To(Equal(tc.expectedFailedWorkerPods)) 306 } 307 // Validate PS status 308 if tfJob.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypePS] != nil { 309 Expect(tfJob.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypePS].Active).To(Equal(tc.expectedActivePSPods)) 310 Expect(tfJob.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypePS].Succeeded).To(Equal(tc.expectedSucceededPSPods)) 311 Expect(tfJob.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypePS].Failed).To(Equal(tc.expectedFailedPSPods)) 312 } 313 314 // Validate StartTime 315 if tc.needCheckStartTime { 316 Expect(tfJob.Status.StartTime).NotTo(BeNil()) 317 } 318 319 // Validate Conditions 320 if tc.expectedCondition != nil { 321 Expect(tftestutil.CheckCondition(tfJob, *tc.expectedCondition, tc.expectedConditionReason)).Should(BeTrue()) 322 } 323 } 324 }) 325 }) 326 327 Context("TFJob with suspend semantics", func() { 328 const name = "test-job" 329 var ( 330 ns *corev1.Namespace 331 job *kubeflowv1.TFJob 332 jobKey types.NamespacedName 333 chiefKey types.NamespacedName 334 worker0Key types.NamespacedName 335 ctx = context.Background() 336 ) 337 BeforeEach(func() { 338 ns = &corev1.Namespace{ 339 ObjectMeta: metav1.ObjectMeta{ 340 GenerateName: "tensorflow-test-", 341 }, 342 } 343 Expect(testK8sClient.Create(ctx, ns)).Should(Succeed()) 344 345 // chief=1, worker=1 346 job = tftestutil.NewTFJobV2(1, 0, 0, 1, 0) 347 job.SetName(name) 348 job.SetNamespace(ns.Name) 349 jobKey = client.ObjectKeyFromObject(job) 350 chiefKey = types.NamespacedName{ 351 Name: fmt.Sprintf("%s-chief-0", name), 352 Namespace: ns.Name, 353 } 354 worker0Key = types.NamespacedName{ 355 Name: fmt.Sprintf("%s-worker-0", name), 356 Namespace: ns.Name, 357 } 358 }) 359 AfterEach(func() { 360 Expect(testK8sClient.Delete(ctx, job)).Should(Succeed()) 361 Expect(testK8sClient.Delete(ctx, ns)).Should(Succeed()) 362 }) 363 364 It("Shouldn't create resources if TFJob is suspended", func() { 365 By("By creating a new TFJob with suspend=true") 366 job.Spec.RunPolicy.Suspend = pointer.Bool(true) 367 Expect(testK8sClient.Create(ctx, job)).Should(Succeed()) 368 369 created := &kubeflowv1.TFJob{} 370 chiefPod := &corev1.Pod{} 371 workerPod := &corev1.Pod{} 372 chiefSvc := &corev1.Service{} 373 workerSvc := &corev1.Service{} 374 375 By("Checking created TFJob") 376 Eventually(func() bool { 377 err := testK8sClient.Get(ctx, jobKey, created) 378 return err == nil 379 }, testutil.Timeout, testutil.Interval).Should(BeTrue()) 380 By("Checking created TFJob has a nil startTime") 381 Consistently(func() *metav1.Time { 382 Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) 383 return created.Status.StartTime 384 }, testutil.ConsistentDuration, testutil.Interval).Should(BeNil()) 385 386 By("Checking if the pods and services aren't created") 387 Consistently(func() bool { 388 errChiefPod := testK8sClient.Get(ctx, chiefKey, chiefPod) 389 errWorkerPod := testK8sClient.Get(ctx, worker0Key, workerPod) 390 errChiefSvc := testK8sClient.Get(ctx, chiefKey, chiefSvc) 391 errWorkerSvc := testK8sClient.Get(ctx, worker0Key, workerSvc) 392 return errors.IsNotFound(errChiefPod) && errors.IsNotFound(errWorkerPod) && 393 errors.IsNotFound(errChiefSvc) && errors.IsNotFound(errWorkerSvc) 394 }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue()) 395 396 By("Checking if the TFJob has suspended condition") 397 Eventually(func() []kubeflowv1.JobCondition { 398 Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) 399 return created.Status.Conditions 400 }, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition{ 401 { 402 Type: kubeflowv1.JobCreated, 403 Status: corev1.ConditionTrue, 404 Reason: commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobCreatedReason), 405 Message: fmt.Sprintf("TFJob %s is created.", name), 406 }, 407 { 408 Type: kubeflowv1.JobSuspended, 409 Status: corev1.ConditionTrue, 410 Reason: commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobSuspendedReason), 411 Message: fmt.Sprintf("TFJob %s is suspended.", name), 412 }, 413 }, testutil.IgnoreJobConditionsTimes)) 414 }) 415 416 It("Should delete resources after TFJob is suspended; Should resume TFJob after TFJob is unsuspended", func() { 417 By("By creating a new TFJob") 418 Expect(testK8sClient.Create(ctx, job)).Should(Succeed()) 419 420 created := &kubeflowv1.TFJob{} 421 chiefPod := &corev1.Pod{} 422 workerPod := &corev1.Pod{} 423 chiefSvc := &corev1.Service{} 424 workerSvc := &corev1.Service{} 425 426 // We'll need to retry getting this newly created TFJob, given that creation may not immediately happen. 427 By("Checking created TFJob") 428 Eventually(func() bool { 429 err := testK8sClient.Get(ctx, jobKey, created) 430 return err == nil 431 }, testutil.Timeout, testutil.Interval).Should(BeTrue()) 432 433 var startTimeBeforeSuspended *metav1.Time 434 Eventually(func() *metav1.Time { 435 Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) 436 startTimeBeforeSuspended = created.Status.StartTime 437 return startTimeBeforeSuspended 438 }, testutil.Timeout, testutil.Interval).ShouldNot(BeNil()) 439 440 By("Checking the created pods and services") 441 Eventually(func() bool { 442 errChief := testK8sClient.Get(ctx, chiefKey, chiefPod) 443 errWorker := testK8sClient.Get(ctx, worker0Key, workerPod) 444 return errChief == nil && errWorker == nil 445 }, testutil.Timeout, testutil.Interval).Should(BeTrue()) 446 Eventually(func() bool { 447 errChief := testK8sClient.Get(ctx, chiefKey, chiefSvc) 448 errWorker := testK8sClient.Get(ctx, worker0Key, workerSvc) 449 return errChief == nil && errWorker == nil 450 }, testutil.Timeout, testutil.Interval).Should(BeTrue()) 451 452 By("Updating the pod's phase with Running") 453 Eventually(func() error { 454 Expect(testK8sClient.Get(ctx, chiefKey, chiefPod)).Should(Succeed()) 455 chiefPod.Status.Phase = corev1.PodRunning 456 return testK8sClient.Status().Update(ctx, chiefPod) 457 }, testutil.Timeout, testutil.Interval).Should(Succeed()) 458 Eventually(func() error { 459 Expect(testK8sClient.Get(ctx, worker0Key, workerPod)).Should(Succeed()) 460 workerPod.Status.Phase = corev1.PodRunning 461 return testK8sClient.Status().Update(ctx, workerPod) 462 }, testutil.Timeout, testutil.Interval).Should(Succeed()) 463 464 By("Checking the TFJob's condition") 465 Eventually(func() []kubeflowv1.JobCondition { 466 Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) 467 return created.Status.Conditions 468 }, testutil.Timeout, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition{ 469 { 470 Type: kubeflowv1.JobCreated, 471 Status: corev1.ConditionTrue, 472 Reason: commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobCreatedReason), 473 Message: fmt.Sprintf("TFJob %s is created.", name), 474 }, 475 { 476 Type: kubeflowv1.JobRunning, 477 Status: corev1.ConditionTrue, 478 Reason: commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobRunningReason), 479 Message: fmt.Sprintf("TFJob %s/%s is running.", ns.Name, name), 480 }, 481 }, testutil.IgnoreJobConditionsTimes)) 482 483 By("Updating the TFJob with suspend=true") 484 Eventually(func() error { 485 Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) 486 created.Spec.RunPolicy.Suspend = pointer.Bool(true) 487 return testK8sClient.Update(ctx, created) 488 }, testutil.Timeout, testutil.Interval).Should(Succeed()) 489 490 By("Checking if the pods and services are removed") 491 Eventually(func() bool { 492 errChief := testK8sClient.Get(ctx, chiefKey, chiefPod) 493 errWorker := testK8sClient.Get(ctx, worker0Key, workerPod) 494 return errors.IsNotFound(errChief) && errors.IsNotFound(errWorker) 495 }, testutil.Timeout, testutil.Interval).Should(BeTrue()) 496 Eventually(func() bool { 497 errChief := testK8sClient.Get(ctx, chiefKey, chiefSvc) 498 errWorker := testK8sClient.Get(ctx, worker0Key, workerSvc) 499 return errors.IsNotFound(errChief) && errors.IsNotFound(errWorker) 500 }, testutil.Timeout, testutil.Interval).Should(BeTrue()) 501 Consistently(func() bool { 502 errChiefPod := testK8sClient.Get(ctx, chiefKey, chiefPod) 503 errWorkerPod := testK8sClient.Get(ctx, worker0Key, workerPod) 504 errChiefSvc := testK8sClient.Get(ctx, chiefKey, chiefSvc) 505 errWorkerSvc := testK8sClient.Get(ctx, worker0Key, workerSvc) 506 return errors.IsNotFound(errChiefPod) && errors.IsNotFound(errWorkerPod) && 507 errors.IsNotFound(errChiefSvc) && errors.IsNotFound(errWorkerSvc) 508 }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue()) 509 510 By("Checking if the TFJob has a suspended condition") 511 Eventually(func() bool { 512 Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) 513 return created.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypeChief].Active == 0 && 514 created.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypeWorker].Active == 0 && 515 created.Status.StartTime.Equal(startTimeBeforeSuspended) 516 }, testutil.Timeout, testutil.Interval).Should(BeTrue()) 517 Consistently(func() bool { 518 Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) 519 return created.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypeChief].Active == 0 && 520 created.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypeWorker].Active == 0 && 521 created.Status.StartTime.Equal(startTimeBeforeSuspended) 522 }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue()) 523 Expect(created.Status.Conditions).Should(BeComparableTo([]kubeflowv1.JobCondition{ 524 { 525 Type: kubeflowv1.JobCreated, 526 Status: corev1.ConditionTrue, 527 Reason: commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobCreatedReason), 528 Message: fmt.Sprintf("TFJob %s is created.", name), 529 }, 530 { 531 Type: kubeflowv1.JobRunning, 532 Status: corev1.ConditionFalse, 533 Reason: commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobSuspendedReason), 534 Message: fmt.Sprintf("TFJob %s is suspended.", name), 535 }, 536 { 537 Type: kubeflowv1.JobSuspended, 538 Reason: commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobSuspendedReason), 539 Message: fmt.Sprintf("TFJob %s is suspended.", name), 540 Status: corev1.ConditionTrue, 541 }, 542 }, testutil.IgnoreJobConditionsTimes)) 543 544 By("Unsuspending the TFJob") 545 Eventually(func() error { 546 Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) 547 created.Spec.RunPolicy.Suspend = pointer.Bool(false) 548 return testK8sClient.Update(ctx, created) 549 }, testutil.Timeout, testutil.Interval).Should(Succeed()) 550 Eventually(func() *metav1.Time { 551 Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) 552 return created.Status.StartTime 553 }, testutil.Timeout, testutil.Interval).ShouldNot(BeNil()) 554 555 By("Check if the pods and services are created") 556 Eventually(func() error { 557 return testK8sClient.Get(ctx, chiefKey, chiefPod) 558 }, testutil.Timeout, testutil.Interval).Should(BeNil()) 559 Eventually(func() error { 560 return testK8sClient.Get(ctx, worker0Key, workerPod) 561 }, testutil.Timeout, testutil.Interval).Should(BeNil()) 562 Eventually(func() error { 563 return testK8sClient.Get(ctx, chiefKey, chiefSvc) 564 }, testutil.Timeout, testutil.Interval).Should(BeNil()) 565 Eventually(func() error { 566 return testK8sClient.Get(ctx, worker0Key, workerSvc) 567 }, testutil.Timeout, testutil.Interval).Should(BeNil()) 568 569 By("Updating Pod's condition with running") 570 Eventually(func() error { 571 Expect(testK8sClient.Get(ctx, chiefKey, chiefPod)).Should(Succeed()) 572 chiefPod.Status.Phase = corev1.PodRunning 573 return testK8sClient.Status().Update(ctx, chiefPod) 574 }, testutil.Timeout, testutil.Interval).Should(Succeed()) 575 Eventually(func() error { 576 Expect(testK8sClient.Get(ctx, worker0Key, workerPod)).Should(Succeed()) 577 workerPod.Status.Phase = corev1.PodRunning 578 return testK8sClient.Status().Update(ctx, workerPod) 579 }, testutil.Timeout, testutil.Interval).Should(Succeed()) 580 581 By("Checking if the TFJob has resumed conditions") 582 Eventually(func() []kubeflowv1.JobCondition { 583 Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed()) 584 return created.Status.Conditions 585 }, testutil.Timeout, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition{ 586 { 587 Type: kubeflowv1.JobCreated, 588 Status: corev1.ConditionTrue, 589 Reason: commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobCreatedReason), 590 Message: fmt.Sprintf("TFJob %s is created.", name), 591 }, 592 { 593 Type: kubeflowv1.JobSuspended, 594 Reason: commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobResumedReason), 595 Message: fmt.Sprintf("TFJob %s is resumed.", name), 596 Status: corev1.ConditionFalse, 597 }, 598 { 599 Type: kubeflowv1.JobRunning, 600 Status: corev1.ConditionTrue, 601 Reason: commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobRunningReason), 602 Message: fmt.Sprintf("TFJob %s/%s is running.", ns.Name, name), 603 }, 604 }, testutil.IgnoreJobConditionsTimes)) 605 606 By("Checking if the startTime is updated") 607 Expect(created.Status.StartTime).ShouldNot(Equal(startTimeBeforeSuspended)) 608 }) 609 }) 610 })