github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/tensorflow/job_test.go

// Copyright 2021 The Kubeflow Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tensorflow

import (
    "context"
    "fmt"
    "strconv"
    "time"

    "github.com/google/go-cmp/cmp/cmpopts"
    . "github.com/onsi/ginkgo/v2"
    . "github.com/onsi/gomega"
    corev1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/errors"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/types"
    "k8s.io/apimachinery/pkg/util/intstr"
    "k8s.io/apimachinery/pkg/util/uuid"
    "k8s.io/utils/pointer"
    "sigs.k8s.io/controller-runtime/pkg/client"

    kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
    "github.com/kubeflow/training-operator/pkg/controller.v1/common"
    tftestutil "github.com/kubeflow/training-operator/pkg/controller.v1/tensorflow/testutil"
    commonutil "github.com/kubeflow/training-operator/pkg/util"
    "github.com/kubeflow/training-operator/pkg/util/testutil"
)

var _ = Describe("TFJob controller", func() {
    Context("Test Add TFJob", func() {
        It("should get the exact TFJob", func() {
            By("submitting a TFJob")

            testJobName := "test-case-12"
            testNamespace := metav1.NamespaceDefault

            decoyJobName := "decoy-case-34"

            ctx := context.Background()

            tfJob := tftestutil.NewTFJob(1, 0)
            tfJob.SetName(testJobName)
            tfJob.SetNamespace(testNamespace)

            decoyJob := tftestutil.NewTFJob(2, 3)
            decoyJob.SetName(decoyJobName)
            decoyJob.SetNamespace(testNamespace)

            Expect(testK8sClient.Create(ctx, tfJob)).Should(Succeed())
            Expect(testK8sClient.Create(ctx, decoyJob)).Should(Succeed())

            key := types.NamespacedName{
                Namespace: testNamespace,
                Name:      testJobName,
            }
            Eventually(func() error {
                job := &kubeflowv1.TFJob{}
                return reconciler.Get(ctx, key, job)
            }, testutil.Timeout, testutil.Interval).Should(BeNil())

            Expect(testK8sClient.Delete(ctx, tfJob)).Should(Succeed())
            Expect(testK8sClient.Delete(ctx, decoyJob)).Should(Succeed())
        })
    })

    Context("Test Copy Labels and Annotations", func() {
        It("should copy labels and annotations from the spec to generated Pods", func() {
            ctx := context.Background()
            testAnnotationKey := "annotation1"
            testAnnotationVal := "1"
            testLabelKey := "label1"
            testLabelVal := "1"

            testJobName := "test-copy-labels-anno"
            tfjob := tftestutil.NewTFJob(1, 0)
            tfjob.SetName(testJobName)
            annotations := map[string]string{
                testAnnotationKey: testAnnotationVal,
            }
            labels := map[string]string{
                testLabelKey: testLabelVal,
            }
            tfjob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeWorker].Template.Labels = labels
            tfjob.Spec.TFReplicaSpecs[kubeflowv1.TFJobReplicaTypeWorker].Template.Annotations = annotations

            By("submitting a TFJob with specific labels and annotations")
            Expect(testK8sClient.Create(ctx, tfjob)).Should(Succeed())

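            // Pod creation by the controller is asynchronous, so poll until the
            // worker-0 Pod exists and carries the labels and annotations copied
            // from the worker replica template.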
            Eventually(func() error {
                pod := &corev1.Pod{}
                key := types.NamespacedName{
                    Namespace: metav1.NamespaceDefault,
                    Name:      common.GenGeneralName(tfjob.Name, "worker", "0"),
                }
                err := testK8sClient.Get(ctx, key, pod)
                if err != nil {
                    return err
                }

                if pod.Annotations == nil {
                    return fmt.Errorf("annotations of %s/%s are nil", pod.GetNamespace(), pod.GetName())
                }
                if val, exist := pod.Annotations[testAnnotationKey]; exist {
                    if val != testAnnotationVal {
                        return fmt.Errorf("annotation %s does not match %s", testAnnotationKey, testAnnotationVal)
                    }
                } else {
                    return fmt.Errorf("annotation %s not found", testAnnotationKey)
                }

                if pod.Labels == nil {
                    return fmt.Errorf("labels of %s/%s are nil", pod.GetNamespace(), pod.GetName())
                }
                if val, exist := pod.Labels[testLabelKey]; exist {
                    if val != testLabelVal {
                        return fmt.Errorf("label %s does not match %s", testLabelKey, testLabelVal)
                    }
                } else {
                    return fmt.Errorf("label %s not found", testLabelKey)
                }

                return nil
            }, testutil.Timeout, testutil.Interval).Should(BeNil())
        })
    })

    Context("Test Delete Pods and Services", func() {
        It("should clean associated Pods and Services according to clean policy", func() {
            type testCase struct {
                description string
                tfJob       *kubeflowv1.TFJob

                pendingWorkerPods   int32
                activeWorkerPods    int32
                succeededWorkerPods int32
                failedWorkerPods    int32

                pendingPSPods   int32
                activePSPods    int32
                succeededPSPods int32
                failedPSPods    int32

                activeWorkerServices int32
                activePSServices     int32

                expectedPodRemaining int
            }

            testCases := []testCase{
                {
                    description: "4 workers and 2 ps are running, policy is all",
                    tfJob:       tftestutil.NewTFJobWithCleanPolicy(0, 4, 2, kubeflowv1.CleanPodPolicyAll),

                    pendingWorkerPods:   0,
                    activeWorkerPods:    4,
                    succeededWorkerPods: 0,
                    failedWorkerPods:    0,

                    pendingPSPods:   0,
                    activePSPods:    2,
                    succeededPSPods: 0,
                    failedPSPods:    0,

                    activeWorkerServices: 4,
                    activePSServices:     2,

                    expectedPodRemaining: 0,
                },
                {
                    description: "4 workers and 2 ps are running, policy is running",
                    tfJob:       tftestutil.NewTFJobWithCleanPolicy(0, 4, 2, kubeflowv1.CleanPodPolicyRunning),

                    pendingWorkerPods:   0,
                    activeWorkerPods:    4,
                    succeededWorkerPods: 0,
                    failedWorkerPods:    0,

                    pendingPSPods:   0,
                    activePSPods:    2,
                    succeededPSPods: 0,
                    failedPSPods:    0,

                    activeWorkerServices: 4,
                    activePSServices:     2,

                    expectedPodRemaining: 0,
                },
                {
                    description: "4 workers and 2 ps have succeeded, policy is running",
                    tfJob:       tftestutil.NewTFJobWithCleanPolicy(0, 4, 2, kubeflowv1.CleanPodPolicyRunning),

                    pendingWorkerPods:   0,
                    activeWorkerPods:    0,
                    succeededWorkerPods: 4,
                    failedWorkerPods:    0,

                    pendingPSPods:   0,
                    activePSPods:    0,
                    succeededPSPods: 2,
                    failedPSPods:    0,

                    activeWorkerServices: 4,
                    activePSServices:     2,

                    expectedPodRemaining: 6,
                },
                {
                    description: "4 workers and 2 ps have succeeded, policy is None",
                    tfJob:       tftestutil.NewTFJobWithCleanPolicy(0, 4, 2, kubeflowv1.CleanPodPolicyNone),

                    pendingWorkerPods:   0,
                    activeWorkerPods:    0,
                    succeededWorkerPods: 4,
                    failedWorkerPods:    0,

                    pendingPSPods:   0,
                    activePSPods:    0,
                    succeededPSPods: 2,
                    failedPSPods:    0,

                    activeWorkerServices: 4,
                    activePSServices:     2,

                    expectedPodRemaining: 6,
                },
            }

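            // Expected semantics exercised above: CleanPodPolicyAll removes every
            // pod once the job finishes, CleanPodPolicyRunning removes only pods
            // that are still running (so the six succeeded pods survive), and
            // CleanPodPolicyNone leaves all pods in place.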
            jobNameTemplate := "test-del-pod-svc-%d"
            for idx, tc := range testCases {
                By(fmt.Sprintf("preparing case %s", tc.description))
                ctx := context.Background()
                tc.tfJob.SetName(fmt.Sprintf(jobNameTemplate, idx))
                tc.tfJob.SetUID(uuid.NewUUID())
                commonutil.UpdateJobConditions(&tc.tfJob.Status, kubeflowv1.JobSucceeded, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.TFJobKind, commonutil.JobSucceededReason), "")

                refs := []metav1.OwnerReference{
                    *reconciler.GenOwnerReference(tc.tfJob),
                }

                basicLabels := reconciler.GenLabels(tc.tfJob.GetName())
                selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{
                    MatchLabels: basicLabels,
                })
                Expect(err).Should(BeNil())
                listOpt := client.MatchingLabelsSelector{
                    Selector: selector,
                }

                By("creating Services and Pods with the desired phases")
                tftestutil.SetPodsStatuses(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypeWorker,
                    tc.pendingWorkerPods, tc.activeWorkerPods, tc.succeededWorkerPods, tc.failedWorkerPods,
                    nil, refs, basicLabels)
                tftestutil.SetPodsStatuses(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypePS,
                    tc.pendingPSPods, tc.activePSPods, tc.succeededPSPods, tc.failedPSPods,
                    nil, refs, basicLabels)

                tftestutil.SetServices(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypeWorker, tc.activeWorkerServices, refs, basicLabels)
                tftestutil.SetServices(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypePS, tc.activePSServices, refs, basicLabels)

                podList := &corev1.PodList{}
                Expect(testK8sClient.List(ctx, podList, listOpt)).Should(Succeed())
                Expect(len(podList.Items)).To(Equal(
                    int(tc.pendingPSPods + tc.activePSPods + tc.failedPSPods + tc.succeededPSPods +
                        tc.pendingWorkerPods + tc.activeWorkerPods + tc.failedWorkerPods + tc.succeededWorkerPods)))

                By("calling ReconcileJobs")
                _ = reconciler.ReconcileJobs(tc.tfJob, tc.tfJob.Spec.TFReplicaSpecs, tc.tfJob.Status, &tc.tfJob.Spec.RunPolicy)

                podList = &corev1.PodList{}
                Expect(testK8sClient.List(ctx, podList, listOpt, client.InNamespace(tc.tfJob.GetNamespace()))).Should(Succeed())
                podRemainingCount := len(podList.Items)
                Expect(podRemainingCount).To(Equal(tc.expectedPodRemaining))

                svcList := &corev1.ServiceList{}
                Expect(testK8sClient.List(ctx, svcList, listOpt)).Should(Succeed())
                svcRemainingCount := len(svcList.Items)
                Expect(svcRemainingCount).To(Equal(tc.expectedPodRemaining))
            }
        })
    })

    Context("Test Active Deadline Seconds", func() {
        It("should clean desired Pods and Services according to TFJob config", func() {
            type testCase struct {
                description string
                tfJob       *kubeflowv1.TFJob

                pendingWorkerPods   int32
                activeWorkerPods    int32
                succeededWorkerPods int32
                failedWorkerPods    int32

                pendingPSPods   int32
                activePSPods    int32
                succeededPSPods int32
                failedPSPods    int32

                activeWorkerServices int32
                activePSServices     int32

                expectedPodRemaining int
            }

            ads2 := int64(2)
            adsTest2 := &ads2
            testCases := []testCase{
                {
                    description: "4 workers and 2 ps are running, ActiveDeadlineSeconds unset",
                    tfJob:       tftestutil.NewTFJobWithActiveDeadlineSeconds(0, 4, 2, nil),

                    pendingWorkerPods:   0,
                    activeWorkerPods:    4,
                    succeededWorkerPods: 0,
                    failedWorkerPods:    0,

                    pendingPSPods:   0,
                    activePSPods:    2,
                    succeededPSPods: 0,
                    failedPSPods:    0,

                    activeWorkerServices: 4,
                    activePSServices:     2,

                    expectedPodRemaining: 6,
                },
                {
                    description: "4 workers and 2 ps are running, ActiveDeadlineSeconds is 2",
                    tfJob:       tftestutil.NewTFJobWithActiveDeadlineSeconds(0, 4, 2, adsTest2),

                    pendingWorkerPods:   0,
                    activeWorkerPods:    4,
                    succeededWorkerPods: 0,
                    failedWorkerPods:    0,

                    pendingPSPods:   0,
                    activePSPods:    2,
                    succeededPSPods: 0,
                    failedPSPods:    0,

                    activeWorkerServices: 4,
                    activePSServices:     2,

                    expectedPodRemaining: 0,
                },
            }

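            // When ActiveDeadlineSeconds is set, a job whose StartTime lies more
            // than that many seconds in the past is treated as expired, so its
            // pods and services are cleaned up; with it unset, everything stays.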
            jobNameTemplate := "test-ads-%d"
            for idx, tc := range testCases {
                By(fmt.Sprintf("preparing case %s", tc.description))
                ctx := context.Background()
                tc.tfJob.SetName(fmt.Sprintf(jobNameTemplate, idx))
                tc.tfJob.SetUID(uuid.NewUUID())

                refs := []metav1.OwnerReference{
                    *reconciler.GenOwnerReference(tc.tfJob),
                }

                basicLabels := reconciler.GenLabels(tc.tfJob.GetName())
                selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{
                    MatchLabels: basicLabels,
                })
                Expect(err).Should(BeNil())
                listOpt := client.MatchingLabelsSelector{
                    Selector: selector,
                }

                By("creating Services and Pods with the desired phases")
                tftestutil.SetPodsStatuses(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypeWorker,
                    tc.pendingWorkerPods, tc.activeWorkerPods, tc.succeededWorkerPods, tc.failedWorkerPods,
                    nil, refs, basicLabels)
                tftestutil.SetPodsStatuses(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypePS,
                    tc.pendingPSPods, tc.activePSPods, tc.succeededPSPods, tc.failedPSPods,
                    nil, refs, basicLabels)

                tftestutil.SetServices(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypeWorker, tc.activeWorkerServices, refs, basicLabels)
                tftestutil.SetServices(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypePS, tc.activePSServices, refs, basicLabels)

                podList := &corev1.PodList{}
                Expect(testK8sClient.List(ctx, podList, listOpt)).Should(Succeed())
                Expect(len(podList.Items)).To(Equal(
                    int(tc.pendingPSPods + tc.activePSPods + tc.failedPSPods + tc.succeededPSPods +
                        tc.pendingWorkerPods + tc.activeWorkerPods + tc.failedWorkerPods + tc.succeededWorkerPods)))

                By("waiting enough time")
                now := metav1.Now()
                tc.tfJob.Status.StartTime = &now
                ads := tc.tfJob.Spec.RunPolicy.ActiveDeadlineSeconds
                if ads != nil {
                    dur := time.Second * time.Duration(*ads)
                    time.Sleep(dur)
                }

                By("calling ReconcileJobs")
                _ = reconciler.ReconcileJobs(tc.tfJob, tc.tfJob.Spec.TFReplicaSpecs, tc.tfJob.Status, &tc.tfJob.Spec.RunPolicy)

                podList = &corev1.PodList{}
                Expect(testK8sClient.List(ctx, podList, listOpt, client.InNamespace(tc.tfJob.GetNamespace()))).Should(Succeed())
                podRemainingCount := len(podList.Items)
                Expect(podRemainingCount).To(Equal(tc.expectedPodRemaining))

                svcList := &corev1.ServiceList{}
                Expect(testK8sClient.List(ctx, svcList, listOpt)).Should(Succeed())
                svcRemainingCount := len(svcList.Items)
                Expect(svcRemainingCount).To(Equal(tc.expectedPodRemaining))
            }
        })
    })

    Context("Test Backoff For On Failure", func() {
        It("should clean desired Pods and Services according to TFJob config", func() {
            type testCase struct {
                description string
                tfJob       *kubeflowv1.TFJob

                pendingWorkerPods   int32
                activeWorkerPods    int32
                succeededWorkerPods int32
                failedWorkerPods    int32

                restartCounts []int32

                pendingPSPods   int32
                activePSPods    int32
                succeededPSPods int32
                failedPSPods    int32

                activeWorkerServices int32
                activePSServices     int32

                expectedPodRemaining int
            }

            backoffLimit4 := int32(4)
            backoffLimitTest4 := &backoffLimit4
            testCases := []testCase{
                {
                    description: "4 workers each having 1 restartCount and 2 ps are running, backoffLimit 4",
                    tfJob:       tftestutil.NewTFJobWithBackoffLimit(0, 4, 2, backoffLimitTest4),

                    pendingWorkerPods:   0,
                    activeWorkerPods:    4,
                    succeededWorkerPods: 0,
                    failedWorkerPods:    0,

                    restartCounts: []int32{1, 1, 1, 1},

                    pendingPSPods:   0,
                    activePSPods:    2,
                    succeededPSPods: 0,
                    failedPSPods:    0,

                    activeWorkerServices: 4,
                    activePSServices:     2,

                    expectedPodRemaining: 0,
                },
            }

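            // This case drives the accumulated worker restart count (4 pods with
            // 1 restart each) up to the backoffLimit of 4, after which the
            // reconciler is expected to fail the job and remove its pods and
            // services.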
            jobNameTemplate := "test-bof-%d"
            for idx, tc := range testCases {
                By(fmt.Sprintf("preparing case %s", tc.description))
                ctx := context.Background()
                tc.tfJob.SetName(fmt.Sprintf(jobNameTemplate, idx))
                tc.tfJob.SetUID(uuid.NewUUID())

                refs := []metav1.OwnerReference{
                    *reconciler.GenOwnerReference(tc.tfJob),
                }

                basicLabels := reconciler.GenLabels(tc.tfJob.GetName())
                selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{
                    MatchLabels: basicLabels,
                })
                Expect(err).Should(BeNil())
                listOpt := client.MatchingLabelsSelector{
                    Selector: selector,
                }

                By("creating Services and Pods with the desired phases")
                tftestutil.SetPodsStatuses(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypeWorker,
                    tc.pendingWorkerPods, tc.activeWorkerPods, tc.succeededWorkerPods, tc.failedWorkerPods,
                    tc.restartCounts, refs, basicLabels)
                tftestutil.SetPodsStatuses(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypePS,
                    tc.pendingPSPods, tc.activePSPods, tc.succeededPSPods, tc.failedPSPods,
                    tc.restartCounts, refs, basicLabels)

                tftestutil.SetServices(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypeWorker, tc.activeWorkerServices, refs, basicLabels)
                tftestutil.SetServices(testK8sClient, tc.tfJob, kubeflowv1.TFJobReplicaTypePS, tc.activePSServices, refs, basicLabels)

                podList := &corev1.PodList{}
                Expect(testK8sClient.List(ctx, podList, listOpt)).Should(Succeed())
                Expect(len(podList.Items)).To(Equal(
                    int(tc.pendingPSPods + tc.activePSPods + tc.failedPSPods + tc.succeededPSPods +
                        tc.pendingWorkerPods + tc.activeWorkerPods + tc.failedWorkerPods + tc.succeededWorkerPods)))

                By("calling ReconcileJobs")
                _ = reconciler.ReconcileJobs(tc.tfJob, tc.tfJob.Spec.TFReplicaSpecs, tc.tfJob.Status, &tc.tfJob.Spec.RunPolicy)

                podList = &corev1.PodList{}
                Expect(testK8sClient.List(ctx, podList, listOpt, client.InNamespace(tc.tfJob.GetNamespace()))).Should(Succeed())
                podRemainingCount := len(podList.Items)
                Expect(podRemainingCount).To(Equal(tc.expectedPodRemaining))

                svcList := &corev1.ServiceList{}
                Expect(testK8sClient.List(ctx, svcList, listOpt)).Should(Succeed())
                svcRemainingCount := len(svcList.Items)
                Expect(svcRemainingCount).To(Equal(tc.expectedPodRemaining))
            }
        })
    })

    Context("Test TTL Seconds After Finished", func() {
        It("should delete the job once its TTL has expired", func() {
            type testCase struct {
                description string
                tfJob       *kubeflowv1.TFJob
                phase       corev1.PodPhase
            }
            testCases := []testCase{
                {
                    description: "succeeded job with TTL 3s",
                    tfJob:       tftestutil.NewTFJobWithCleanupJobDelay(0, 1, 0, pointer.Int32(3)),
                    phase:       corev1.PodSucceeded,
                },
                {
                    description: "failed job with TTL 3s",
                    tfJob:       tftestutil.NewTFJobWithCleanupJobDelay(0, 1, 0, pointer.Int32(3)),
                    phase:       corev1.PodFailed,
                },
            }

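            // TTLSecondsAfterFinished is counted from the moment the job reaches
            // a terminal state; the loop below marks the single worker pod
            // succeeded or failed, waits out the TTL, and then expects the TFJob
            // object itself to be garbage-collected.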
            jobNameTemplate := "test-ttl-%d"
            for idx, tc := range testCases {
                By(fmt.Sprintf("preparing case %s", tc.description))
                ctx := context.Background()
                name := fmt.Sprintf(jobNameTemplate, idx)
                tc.tfJob.SetName(name)
                tc.tfJob.CreationTimestamp = metav1.Now()

                By("creating a TFJob")
                Expect(reconciler.Create(ctx, tc.tfJob)).Should(Succeed())

                // We need to wait for the cache to synchronize.
                By("getting the created TFJob")
                var updatedTFJob kubeflowv1.TFJob
                Eventually(func() error {
                    return reconciler.Get(ctx, client.ObjectKeyFromObject(tc.tfJob), &updatedTFJob)
                }, testutil.Timeout, testutil.Interval).Should(BeNil())

                initializeReplicaStatuses(&updatedTFJob.Status, kubeflowv1.TFJobReplicaTypeWorker)

                By("preparing a pod")
                refs := []metav1.OwnerReference{
                    *reconciler.GenOwnerReference(tc.tfJob),
                }
                pod := tftestutil.NewBasePod("pod", tc.tfJob, refs)
                pod.Status.Phase = tc.phase

                By("updating job replica statuses")
                updateJobReplicaStatuses(&updatedTFJob.Status, kubeflowv1.TFJobReplicaTypeWorker, pod)

                By("updating the job status")
                Expect(reconciler.UpdateJobStatus(&updatedTFJob, updatedTFJob.Spec.TFReplicaSpecs, &updatedTFJob.Status)).To(Succeed())
                By("persisting the job status")
                Expect(reconciler.Status().Update(ctx, &updatedTFJob)).To(Succeed())

                By("waiting for the worker replicaStatus to be updated")
                Eventually(func() *kubeflowv1.ReplicaStatus {
                    var getTFJob kubeflowv1.TFJob
                    Expect(reconciler.Get(ctx, client.ObjectKeyFromObject(tc.tfJob), &getTFJob)).Should(Succeed())
                    return getTFJob.Status.ReplicaStatuses[kubeflowv1.TFJobReplicaTypeWorker]
                }, testutil.Timeout, testutil.Interval).ShouldNot(BeNil())

                ttl := updatedTFJob.Spec.RunPolicy.TTLSecondsAfterFinished
                if ttl != nil {
                    dur := time.Second * time.Duration(*ttl)
                    time.Sleep(dur)
                }

                Eventually(func() error {
                    tfJob := &kubeflowv1.TFJob{}
                    key := types.NamespacedName{
                        Namespace: metav1.NamespaceDefault,
                        Name:      name,
                    }
                    if err := reconciler.Get(ctx, key, tfJob); err != nil {
                        if errors.IsNotFound(err) {
                            return nil
                        }
                        return err
                    }
                    return fmt.Errorf("job %s still remains", name)
                }, testutil.Timeout, testutil.Interval).Should(BeNil())
            }
        })
    })
})

var _ = Describe("Test for controller.v1/common", func() {
    var (
        ctx = context.Background()
        ns  *corev1.Namespace
        now metav1.Time
    )
    BeforeEach(func() {
        ns = &corev1.Namespace{
            ObjectMeta: metav1.ObjectMeta{
                GenerateName: "tfjob-ns-",
            },
        }
        now = metav1.Now()
        Expect(testK8sClient.Create(ctx, ns)).Should(Succeed())
    })
    AfterEach(func() {
        Expect(testK8sClient.Delete(ctx, ns)).Should(Succeed())
    })

    type cleanUpCases struct {
        tfJob              *kubeflowv1.TFJob
        runPolicy          *kubeflowv1.RunPolicy
        jobStatus          kubeflowv1.JobStatus
        wantTFJobIsRemoved bool
        wantErr            bool
    }

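    // CleanupJob removes a finished TFJob once TTLSecondsAfterFinished has
    // elapsed past CompletionTime: a nil TTL disables cleanup, a TTL of 0
    // deletes the job immediately, and a non-nil TTL with a nil CompletionTime
    // is an error.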
    DescribeTable("TFJob is created and is cleaned up",
        func(tc *cleanUpCases) {
            tc.tfJob.SetNamespace(ns.Name)
            Expect(testK8sClient.Create(ctx, tc.tfJob)).Should(Succeed())

            if tc.wantErr {
                Expect(reconciler.CleanupJob(tc.runPolicy, tc.jobStatus, tc.tfJob)).ShouldNot(Succeed())
            } else {
                Expect(reconciler.CleanupJob(tc.runPolicy, tc.jobStatus, tc.tfJob)).Should(Succeed())
            }
            if tc.wantTFJobIsRemoved {
                Eventually(func() bool {
                    gotErr := testK8sClient.Get(ctx, client.ObjectKeyFromObject(tc.tfJob), &kubeflowv1.TFJob{})
                    return errors.IsNotFound(gotErr)
                }, testutil.Timeout, testutil.Interval).Should(BeTrue())
            } else {
                Eventually(func() error {
                    return testK8sClient.Get(ctx, client.ObjectKeyFromObject(tc.tfJob), &kubeflowv1.TFJob{})
                }, testutil.Timeout, testutil.Interval).Should(BeNil())
            }
        },
        Entry("TFJob shouldn't be removed since TTL is nil", &cleanUpCases{
            tfJob: tftestutil.NewTFJobWithCleanupJobDelay(1, 2, 0, nil),
            runPolicy: &kubeflowv1.RunPolicy{
                TTLSecondsAfterFinished: nil,
            },
            jobStatus:          kubeflowv1.JobStatus{},
            wantTFJobIsRemoved: false,
            wantErr:            false,
        }),
        Entry("An error occurs since completionTime is nil", &cleanUpCases{
            tfJob: tftestutil.NewTFJobWithCleanupJobDelay(1, 2, 0, pointer.Int32(10)),
            runPolicy: &kubeflowv1.RunPolicy{
                TTLSecondsAfterFinished: pointer.Int32(10),
            },
            jobStatus: kubeflowv1.JobStatus{
                CompletionTime: nil,
            },
            wantTFJobIsRemoved: false,
            wantErr:            true,
        }),
        Entry("TFJob is removed since it exceeded the TTL (TTL is 180s)", &cleanUpCases{
            tfJob: tftestutil.NewTFJobWithCleanupJobDelay(1, 2, 0, pointer.Int32(180)),
            runPolicy: &kubeflowv1.RunPolicy{
                TTLSecondsAfterFinished: pointer.Int32(180),
            },
            jobStatus: kubeflowv1.JobStatus{
                CompletionTime: &metav1.Time{
                    Time: now.AddDate(0, 0, -1),
                },
            },
            wantTFJobIsRemoved: true,
            wantErr:            false,
        }),
        Entry("TFJob is removed immediately (TTL is 0s)", &cleanUpCases{
            tfJob: tftestutil.NewTFJobWithCleanupJobDelay(1, 2, 0, pointer.Int32(0)),
            runPolicy: &kubeflowv1.RunPolicy{
                TTLSecondsAfterFinished: pointer.Int32(0),
            },
            jobStatus: kubeflowv1.JobStatus{
                CompletionTime: &now,
            },
            wantTFJobIsRemoved: true,
            wantErr:            false,
        }),
    )

    type createServiceCases struct {
        tfJob   *kubeflowv1.TFJob
        rType   kubeflowv1.ReplicaType
        spec    *kubeflowv1.ReplicaSpec
        uid     types.UID
        index   int
        wantErr bool
    }

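    // CreateNewService is expected to create a headless ClusterIP Service
    // (clusterIP: None) named <job>-<replica-type>-<index> that exposes the
    // default TFJob port, and to fail when the container port is missing or
    // the job's ownerReference is invalid.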
    DescribeTable("CreateNewService",
        func(tc *createServiceCases) {
            tc.tfJob.SetUID(tc.uid)
            tc.tfJob.SetNamespace(ns.Name)

            gotErr := reconciler.CreateNewService(tc.tfJob, tc.rType, tc.spec, strconv.Itoa(tc.index))
            if tc.wantErr {
                Expect(gotErr).ShouldNot(Succeed())
            } else {
                Expect(gotErr).Should(Succeed())

                svcInternalTPC := corev1.ServiceInternalTrafficPolicyCluster
                svcSingleStack := corev1.IPFamilyPolicySingleStack
                wantSvc := &corev1.Service{
                    ObjectMeta: metav1.ObjectMeta{
                        Name:      fmt.Sprintf("%s-%s-%d", tc.tfJob.Name, tc.rType, tc.index),
                        Namespace: ns.Name,
                        OwnerReferences: []metav1.OwnerReference{
                            *reconciler.GenOwnerReference(tc.tfJob),
                        },
                        Labels: map[string]string{
                            kubeflowv1.JobNameLabel:      tc.tfJob.Name,
                            kubeflowv1.OperatorNameLabel: controllerName,
                            kubeflowv1.ReplicaIndexLabel: strconv.Itoa(tc.index),
                            kubeflowv1.ReplicaTypeLabel:  "",
                        },
                    },
                    Spec: corev1.ServiceSpec{
                        Ports: []corev1.ServicePort{
                            {
                                Name:     kubeflowv1.TFJobDefaultPortName,
                                Protocol: corev1.ProtocolTCP,
                                Port:     kubeflowv1.TFJobDefaultPort,
                                TargetPort: intstr.IntOrString{
                                    IntVal: kubeflowv1.TFJobDefaultPort,
                                },
                            },
                        },
                        Selector: map[string]string{
                            kubeflowv1.JobNameLabel:      tc.tfJob.Name,
                            kubeflowv1.OperatorNameLabel: controllerName,
                            kubeflowv1.ReplicaIndexLabel: strconv.Itoa(tc.index),
                            kubeflowv1.ReplicaTypeLabel:  "",
                        },
                        ClusterIP:             corev1.ClusterIPNone,
                        Type:                  corev1.ServiceTypeClusterIP,
                        ClusterIPs:            []string{corev1.ClusterIPNone},
                        SessionAffinity:       corev1.ServiceAffinityNone,
                        IPFamilies:            []corev1.IPFamily{corev1.IPv4Protocol},
                        IPFamilyPolicy:        &svcSingleStack,
                        InternalTrafficPolicy: &svcInternalTPC,
                    },
                }
                Eventually(func() *corev1.Service {
                    svc := &corev1.Service{}
                    Expect(testK8sClient.Get(ctx, client.ObjectKeyFromObject(wantSvc), svc)).Should(Succeed())
                    return svc
                }, testutil.Timeout, testutil.Interval).Should(BeComparableTo(wantSvc,
                    cmpopts.IgnoreFields(metav1.ObjectMeta{}, "UID", "ResourceVersion", "Generation", "CreationTimestamp", "ManagedFields")))
            }
        },
        Entry("Failed to create service since containerPort is missing", &createServiceCases{
            tfJob: tftestutil.NewTFJobV2(2, 0, 0, 1, 0),
            spec: &kubeflowv1.ReplicaSpec{
                Template: corev1.PodTemplateSpec{
                    Spec: corev1.PodSpec{
                        Containers: []corev1.Container{
                            {
                                Name: kubeflowv1.TFJobDefaultContainerName,
                            },
                        },
                    },
                },
            },
            index:   0,
            wantErr: true,
        }),
        Entry("Failed to create service since Job's ownerReference is invalid", &createServiceCases{
            tfJob:   tftestutil.NewTFJobV2(2, 0, 0, 1, 0),
            spec:    &kubeflowv1.ReplicaSpec{Template: tftestutil.NewTFReplicaSpecTemplate()},
            index:   1,
            wantErr: true,
        }),
        Entry("Succeeded to create service", &createServiceCases{
            tfJob:   tftestutil.NewTFJobV2(2, 0, 0, 1, 0),
            spec:    &kubeflowv1.ReplicaSpec{Template: tftestutil.NewTFReplicaSpecTemplate()},
            index:   0,
            wantErr: false,
            uid:     uuid.NewUUID(),
        }),
    )
})