github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/mpi/mpijob_controller_test.go

// Copyright 2021 The Kubeflow Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mpi

import (
    "context"
    "fmt"
    "strings"

    common "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
    . "github.com/onsi/ginkgo/v2"
    . "github.com/onsi/gomega"
    corev1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/errors"
    "k8s.io/apimachinery/pkg/api/resource"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/types"
    "k8s.io/utils/pointer"
    ctrl "sigs.k8s.io/controller-runtime"
    "sigs.k8s.io/controller-runtime/pkg/client"

    kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
    commonutil "github.com/kubeflow/training-operator/pkg/util"
    "github.com/kubeflow/training-operator/pkg/util/testutil"
)

const (
    gpuResourceName         = "nvidia.com/gpu"
    extendedGPUResourceName = "vendor-domain/gpu"
)

// newMPIJobCommon builds a minimal MPIJob with one launcher and one worker
// replica spec, and optionally records the start and completion times.
func newMPIJobCommon(name string, startTime, completionTime *metav1.Time) *kubeflowv1.MPIJob {
    mpiJob := &kubeflowv1.MPIJob{
        TypeMeta: metav1.TypeMeta{APIVersion: kubeflowv1.SchemeGroupVersion.String()},
        ObjectMeta: metav1.ObjectMeta{
            Name:      name,
            Namespace: metav1.NamespaceDefault,
        },
        Spec: kubeflowv1.MPIJobSpec{
            RunPolicy: common.RunPolicy{
                CleanPodPolicy: kubeflowv1.CleanPodPolicyPointer(kubeflowv1.CleanPodPolicyAll),
            },
            MPIReplicaSpecs: map[common.ReplicaType]*common.ReplicaSpec{
                kubeflowv1.MPIJobReplicaTypeWorker: {
                    Template: corev1.PodTemplateSpec{
                        Spec: corev1.PodSpec{
                            Containers: []corev1.Container{
                                {
                                    Name:  "foo",
                                    Image: "bar",
                                },
                            },
                        },
                    },
                },
                kubeflowv1.MPIJobReplicaTypeLauncher: {
                    Template: corev1.PodTemplateSpec{
                        Spec: corev1.PodSpec{
                            Containers: []corev1.Container{
                                {
                                    Name:  "foo",
                                    Image: "bar",
                                },
                            },
                        },
                    },
                },
            },
        },
        Status: common.JobStatus{},
    }

    if startTime != nil {
        mpiJob.Status.StartTime = startTime
    }
    if completionTime != nil {
        mpiJob.Status.CompletionTime = completionTime
    }

    return mpiJob
}

// newMPIJobOld builds on newMPIJobCommon: it sets the worker replica count and
// gives every worker container a resource limit of pusPerReplica processing
// units of the named resource.
func newMPIJobOld(name string, replicas *int32, pusPerReplica int64, resourceName string, startTime, completionTime *metav1.Time) *kubeflowv1.MPIJob {
    mpiJob := newMPIJobCommon(name, startTime, completionTime)

    mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeWorker].Replicas = replicas

    workerContainers := mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeWorker].Template.Spec.Containers
    for i := range workerContainers {
        container := &workerContainers[i]
        container.Resources = corev1.ResourceRequirements{
            Limits: corev1.ResourceList{
                corev1.ResourceName(resourceName): *resource.NewQuantity(pusPerReplica, resource.DecimalExponent),
            },
        }
    }

    return mpiJob
}
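// newExampleMPIJob is an illustrative sketch, not part of the original suite;
// the function name and argument values are hypothetical. It shows how the
// constructors above compose: a 4-worker MPIJob requesting one nvidia.com/gpu
// per worker, with no recorded start or completion time.
func newExampleMPIJob() *kubeflowv1.MPIJob {
    return newMPIJobOld("example", pointer.Int32(4), 1, gpuResourceName, nil, nil)
}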
// newMPIJob is an alias for newMPIJobWithLauncher, the constructor most tests use.
var newMPIJob = newMPIJobWithLauncher

// newMPIJobWithLauncher builds on newMPIJobOld and additionally pins the
// launcher replica count to 1, with the same per-replica resource limit.
func newMPIJobWithLauncher(name string, replicas *int32, pusPerReplica int64, resourceName string, startTime, completionTime *metav1.Time) *kubeflowv1.MPIJob {
    mpiJob := newMPIJobOld(name, replicas, pusPerReplica, resourceName, startTime, completionTime)

    mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeLauncher].Replicas = pointer.Int32(1)

    launcherContainers := mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeLauncher].Template.Spec.Containers
    for i := range launcherContainers {
        container := &launcherContainers[i]
        container.Resources = corev1.ResourceRequirements{
            Limits: corev1.ResourceList{
                corev1.ResourceName(resourceName): *resource.NewQuantity(pusPerReplica, resource.DecimalExponent),
            },
        }
    }

    return mpiJob
}

var _ = Describe("MPIJob controller", func() {
    Context("Test launcher is GPU launcher", func() {
        It("Should pass GPU Launcher verification", func() {
            By("By creating MPIJobs with various resource configurations")

            testCases := map[string]struct {
                gpu      string
                expected bool
            }{
                "isNvidiaGPU": {
                    gpu:      gpuResourceName,
                    expected: true,
                },
                "isExtendedGPU": {
                    gpu:      extendedGPUResourceName,
                    expected: true,
                },
                "notGPU": {
                    gpu:      "vendor-domain/resourcetype",
                    expected: false,
                },
            }

            startTime := metav1.Now()
            completionTime := metav1.Now()

            for testName, testCase := range testCases {
                mpiJob := newMPIJobWithLauncher("test-"+strings.ToLower(testName),
                    pointer.Int32(64), 1, testCase.gpu, &startTime, &completionTime)
                Expect(isGPULauncher(mpiJob)).To(Equal(testCase.expected))
            }
        })
    })

    Context("Test MPIJob with succeeded launcher Pod", func() {
        It("Should contain the desired launcher ReplicaStatus", func() {
            By("By marking a launcher pod with Phase Succeeded")
            ctx := context.Background()
            startTime := metav1.Now()
            completionTime := metav1.Now()

            jobName := "test-launcher-succeeded"

            mpiJob := newMPIJobWithLauncher(jobName, pointer.Int32(64), 1, gpuResourceName, &startTime, &completionTime)
            Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

            launcher := reconciler.newLauncher(mpiJob, "kubectl-delivery", isGPULauncher(mpiJob))
            launcher.Status.Phase = corev1.PodSucceeded

            launcherKey := types.NamespacedName{
                Namespace: metav1.NamespaceDefault,
                Name:      launcher.GetName(),
            }
            Eventually(func() error {
                launcherCreated := &corev1.Pod{}
                if err := testK8sClient.Get(ctx, launcherKey, launcherCreated); err != nil {
                    return err
                }
                launcherCreated.Status.Phase = corev1.PodSucceeded
                return testK8sClient.Status().Update(ctx, launcherCreated)
            }, testutil.Timeout, testutil.Interval).Should(BeNil())

            created := &kubeflowv1.MPIJob{}
            launcherStatus := &common.ReplicaStatus{
                Active:    0,
                Succeeded: 1,
                Failed:    0,
            }
            Eventually(func() bool {
                err := testK8sClient.Get(ctx, types.NamespacedName{Namespace: metav1.NamespaceDefault, Name: jobName}, created)
                if err != nil {
                    return false
                }
                return ReplicaStatusMatch(created.Status.ReplicaStatuses, kubeflowv1.MPIJobReplicaTypeLauncher, launcherStatus)
            }, testutil.Timeout, testutil.Interval).Should(BeTrue())
        })
    })
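    // The ReplicaStatus Contexts in this suite share one pattern: the test
    // environment (presumably envtest, which runs no kubelet) never changes a
    // pod's phase on its own, so each test sets the phase by hand through the
    // status subresource and then polls the MPIJob until the controller folds
    // the phase into ReplicaStatuses. A condensed, illustrative sketch of the
    // driving loop, with key standing in for the pod's NamespacedName:
    //
    //	Eventually(func() error {
    //		pod := &corev1.Pod{}
    //		if err := testK8sClient.Get(ctx, key, pod); err != nil {
    //			return err
    //		}
    //		pod.Status.Phase = corev1.PodSucceeded // or PodFailed / PodRunning / PodPending
    //		return testK8sClient.Status().Update(ctx, pod)
    //	}, testutil.Timeout, testutil.Interval).Should(BeNil())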
    Context("Test MPIJob with failed launcher Pod", func() {
        It("Should contain the desired launcher ReplicaStatus", func() {
            By("By marking a launcher pod with Phase Failed")
            ctx := context.Background()
            startTime := metav1.Now()
            completionTime := metav1.Now()

            jobName := "test-launcher-failed"

            mpiJob := newMPIJobWithLauncher(jobName, pointer.Int32(64), 1, gpuResourceName, &startTime, &completionTime)
            Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

            launcher := reconciler.newLauncher(mpiJob, "kubectl-delivery", isGPULauncher(mpiJob))
            launcherKey := types.NamespacedName{
                Namespace: metav1.NamespaceDefault,
                Name:      launcher.GetName(),
            }
            Eventually(func() error {
                launcherCreated := &corev1.Pod{}
                if err := testK8sClient.Get(ctx, launcherKey, launcherCreated); err != nil {
                    return err
                }
                launcherCreated.Status.Phase = corev1.PodFailed
                return testK8sClient.Status().Update(ctx, launcherCreated)
            }, testutil.Timeout, testutil.Interval).Should(BeNil())

            launcherStatus := &common.ReplicaStatus{
                Active:    0,
                Succeeded: 0,
                Failed:    1,
            }
            created := &kubeflowv1.MPIJob{}
            Eventually(func() bool {
                err := testK8sClient.Get(ctx, types.NamespacedName{Namespace: metav1.NamespaceDefault, Name: jobName}, created)
                if err != nil {
                    return false
                }
                return ReplicaStatusMatch(created.Status.ReplicaStatuses, kubeflowv1.MPIJobReplicaTypeLauncher, launcherStatus)
            }, testutil.Timeout, testutil.Interval).Should(BeTrue())
        })
    })

    Context("Test MPIJob with succeeded launcher pod", func() {
        It("Should contain the desired ReplicaStatuses for the workers", func() {
            By("By marking the launcher Pod as Succeeded")
            ctx := context.Background()
            startTime := metav1.Now()
            completionTime := metav1.Now()

            jobName := "test-launcher-succeeded2"

            mpiJob := newMPIJobWithLauncher(jobName, pointer.Int32(64), 1, gpuResourceName, &startTime, &completionTime)
            Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

            launcher := reconciler.newLauncher(mpiJob, "kubectl-delivery", isGPULauncher(mpiJob))
            launcher.Status.Phase = corev1.PodSucceeded

            launcherKey := types.NamespacedName{
                Namespace: metav1.NamespaceDefault,
                Name:      launcher.GetName(),
            }
            Eventually(func() error {
                launcherCreated := &corev1.Pod{}
                if err := testK8sClient.Get(ctx, launcherKey, launcherCreated); err != nil {
                    return err
                }
                launcherCreated.Status.Phase = corev1.PodSucceeded
                return testK8sClient.Status().Update(ctx, launcherCreated)
            }, testutil.Timeout, testutil.Interval).Should(BeNil())

            created := &kubeflowv1.MPIJob{}
            // No worker pod is ever driven to Running here, so every worker
            // counter should stay zero.
            workerStatus := &common.ReplicaStatus{
                Active:    0,
                Succeeded: 0,
                Failed:    0,
            }
            Eventually(func() bool {
                err := testK8sClient.Get(ctx, types.NamespacedName{Namespace: metav1.NamespaceDefault, Name: jobName}, created)
                if err != nil {
                    return false
                }
                return ReplicaStatusMatch(created.Status.ReplicaStatuses, kubeflowv1.MPIJobReplicaTypeWorker, workerStatus)
            }, testutil.Timeout, testutil.Interval).Should(BeTrue())
        })
    })
    Context("Test MPIJob with Running launcher Pod and Pending worker Pods", func() {
        It("Should contain desired ReplicaStatuses", func() {
            By("By marking an active launcher pod and pending worker pods")

            ctx := context.Background()
            startTime := metav1.Now()
            completionTime := metav1.Now()

            jobName := "test-launcher-running-worker-pending"

            var replicas int32 = 8
            mpiJob := newMPIJobWithLauncher(jobName, &replicas, 1, gpuResourceName, &startTime, &completionTime)
            Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

            launcher := reconciler.newLauncher(mpiJob, "kubectl-delivery", isGPULauncher(mpiJob))
            launcherKey := types.NamespacedName{
                Namespace: metav1.NamespaceDefault,
                Name:      launcher.GetName(),
            }
            Eventually(func() error {
                launcherCreated := &corev1.Pod{}
                if err := testK8sClient.Get(ctx, launcherKey, launcherCreated); err != nil {
                    return err
                }
                launcherCreated.Status.Phase = corev1.PodRunning
                return testK8sClient.Status().Update(ctx, launcherCreated)
            }, testutil.Timeout, testutil.Interval).Should(BeNil())

            for i := 0; i < int(replicas); i++ {
                name := fmt.Sprintf("%s-%d", mpiJob.Name+workerSuffix, i)
                worker := reconciler.newWorker(mpiJob, name)
                workerKey := types.NamespacedName{
                    Namespace: metav1.NamespaceDefault,
                    Name:      worker.GetName(),
                }
                Eventually(func() error {
                    workerCreated := &corev1.Pod{}
                    if err := testK8sClient.Get(ctx, workerKey, workerCreated); err != nil {
                        return err
                    }
                    workerCreated.Status.Phase = corev1.PodPending
                    return testK8sClient.Status().Update(ctx, workerCreated)
                }, testutil.Timeout, testutil.Interval).Should(BeNil())
            }

            key := types.NamespacedName{
                Namespace: metav1.NamespaceDefault,
                Name:      jobName,
            }
            launcherStatus := &common.ReplicaStatus{
                Active:    1,
                Succeeded: 0,
                Failed:    0,
            }
            workerStatus := &common.ReplicaStatus{
                Active:    0,
                Succeeded: 0,
                Failed:    0,
            }
            Eventually(func() bool {
                created := &kubeflowv1.MPIJob{}
                err := testK8sClient.Get(ctx, key, created)
                if err != nil {
                    return false
                }
                return ReplicaStatusMatch(created.Status.ReplicaStatuses, kubeflowv1.MPIJobReplicaTypeLauncher,
                    launcherStatus) && ReplicaStatusMatch(created.Status.ReplicaStatuses, kubeflowv1.MPIJobReplicaTypeWorker,
                    workerStatus)
            }, testutil.Timeout, testutil.Interval).Should(BeTrue())
        })
    })
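    // Worker pods are addressed by the deterministic name
    // <job><workerSuffix>-<index> (for example "myjob-worker-0", assuming
    // workerSuffix is "-worker"), which is why the loops above and below can
    // Get each worker directly instead of listing pods by label:
    //
    //	name := fmt.Sprintf("%s-%d", mpiJob.Name+workerSuffix, i)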
    Context("Test MPIJob with Running launcher Pod and Running worker Pods", func() {
        It("Should contain desired ReplicaStatuses", func() {
            By("By creating an active launcher pod and active worker pods")

            ctx := context.Background()
            startTime := metav1.Now()
            completionTime := metav1.Now()

            jobName := "test-launcher-running-worker-running"

            var replicas int32 = 8
            mpiJob := newMPIJob(jobName, &replicas, 1, gpuResourceName, &startTime, &completionTime)
            Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

            launcher := reconciler.newLauncher(mpiJob, "kubectl-delivery", isGPULauncher(mpiJob))
            launcherKey := types.NamespacedName{
                Namespace: metav1.NamespaceDefault,
                Name:      launcher.GetName(),
            }
            Eventually(func() error {
                launcherCreated := &corev1.Pod{}
                if err := testK8sClient.Get(ctx, launcherKey, launcherCreated); err != nil {
                    return err
                }
                launcherCreated.Status.Phase = corev1.PodRunning
                return testK8sClient.Status().Update(ctx, launcherCreated)
            }, testutil.Timeout, testutil.Interval).Should(BeNil())

            for i := 0; i < int(replicas); i++ {
                name := fmt.Sprintf("%s-%d", mpiJob.Name+workerSuffix, i)
                worker := reconciler.newWorker(mpiJob, name)
                workerKey := types.NamespacedName{
                    Namespace: metav1.NamespaceDefault,
                    Name:      worker.GetName(),
                }
                Eventually(func() error {
                    workerCreated := &corev1.Pod{}
                    if err := testK8sClient.Get(ctx, workerKey, workerCreated); err != nil {
                        return err
                    }
                    workerCreated.Status.Phase = corev1.PodRunning
                    return testK8sClient.Status().Update(ctx, workerCreated)
                }, testutil.Timeout, testutil.Interval).Should(BeNil())
            }

            key := types.NamespacedName{
                Namespace: metav1.NamespaceDefault,
                Name:      jobName,
            }
            launcherStatus := &common.ReplicaStatus{
                Active:    1,
                Succeeded: 0,
                Failed:    0,
            }
            workerStatus := &common.ReplicaStatus{
                Active:    8,
                Succeeded: 0,
                Failed:    0,
            }
            Eventually(func() bool {
                created := &kubeflowv1.MPIJob{}
                err := testK8sClient.Get(ctx, key, created)
                if err != nil {
                    return false
                }
                return ReplicaStatusMatch(created.Status.ReplicaStatuses, kubeflowv1.MPIJobReplicaTypeLauncher,
                    launcherStatus) && ReplicaStatusMatch(created.Status.ReplicaStatuses, kubeflowv1.MPIJobReplicaTypeWorker,
                    workerStatus)
            }, testutil.Timeout, testutil.Interval).Should(BeTrue())
        })
    })

    Context("Test MPIJob with Running worker Pods", func() {
        It("Should contain desired ReplicaStatuses and create a launcher pod", func() {
            By("By creating only active worker pods")

            ctx := context.Background()
            startTime := metav1.Now()
            completionTime := metav1.Now()

            jobName := "test-worker-running"

            var replicas int32 = 16
            mpiJob := newMPIJob(jobName, &replicas, 1, gpuResourceName, &startTime, &completionTime)
            Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

            for i := 0; i < int(replicas); i++ {
                name := fmt.Sprintf("%s-%d", mpiJob.Name+workerSuffix, i)
                worker := reconciler.newWorker(mpiJob, name)
                workerKey := types.NamespacedName{
                    Namespace: metav1.NamespaceDefault,
                    Name:      worker.GetName(),
                }
                Eventually(func() error {
                    workerCreated := &corev1.Pod{}
                    if err := testK8sClient.Get(ctx, workerKey, workerCreated); err != nil {
                        return err
                    }
                    workerCreated.Status.Phase = corev1.PodRunning
                    return testK8sClient.Status().Update(ctx, workerCreated)
                }, testutil.Timeout, testutil.Interval).Should(BeNil())
            }

            launcherKey := types.NamespacedName{
                Namespace: metav1.NamespaceDefault,
                Name:      mpiJob.Name + launcherSuffix,
            }
            // Wait until the controller has created the launcher pod for the
            // running workers.
            launcher := &corev1.Pod{}
            Eventually(func() bool {
                err := testK8sClient.Get(ctx, launcherKey, launcher)
                return err == nil
            }, testutil.Timeout, testutil.Interval).Should(BeTrue())

            key := types.NamespacedName{
                Namespace: metav1.NamespaceDefault,
                Name:      jobName,
            }
            launcherStatus := &common.ReplicaStatus{
                Active:    0,
                Succeeded: 0,
                Failed:    0,
            }
            workerStatus := &common.ReplicaStatus{
                Active:    16,
                Succeeded: 0,
                Failed:    0,
            }
            Eventually(func() bool {
                created := &kubeflowv1.MPIJob{}
                err := testK8sClient.Get(ctx, key, created)
                if err != nil {
                    return false
                }
                return ReplicaStatusMatch(created.Status.ReplicaStatuses, kubeflowv1.MPIJobReplicaTypeLauncher,
                    launcherStatus) && ReplicaStatusMatch(created.Status.ReplicaStatuses, kubeflowv1.MPIJobReplicaTypeWorker,
                    workerStatus)
            }, testutil.Timeout, testutil.Interval).Should(BeTrue())
        })
    })
    Context("MPIJob not found", func() {
        It("Should do nothing", func() {
            By("Calling the Reconcile method")
            jobName := "test-not-exist"

            ctx := context.Background()

            req := ctrl.Request{NamespacedName: types.NamespacedName{
                Namespace: metav1.NamespaceDefault,
                Name:      jobName,
            }}
            _, err := reconciler.Reconcile(ctx, req)
            Expect(err).Should(BeNil())
        })
    })

    Context("MPIJob with launcher Pod not controlled by itself", func() {
        It("Should return an error", func() {
            By("Calling the Reconcile method")
            jobName := "test-launcher-orphan"
            testKind := "Pod"

            ctx := context.Background()
            startTime := metav1.Now()
            completionTime := metav1.Now()

            mpiJob := newMPIJob(jobName, pointer.Int32(64), 1, gpuResourceName, &startTime, &completionTime)

            launcher := reconciler.newLauncher(mpiJob, "kubectl-delivery", isGPULauncher(mpiJob))
            launcher.OwnerReferences = nil
            Expect(testK8sClient.Create(ctx, launcher)).Should(Succeed())

            Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

            req := ctrl.Request{NamespacedName: types.NamespacedName{
                Namespace: metav1.NamespaceDefault,
                Name:      mpiJob.GetName(),
            }}
            expectedErr := fmt.Errorf(MessageResourceExists, launcher.Name, testKind)
            Eventually(func() error {
                _, err := reconciler.Reconcile(ctx, req)
                return err
            }, testutil.Timeout, testutil.Interval).Should(MatchError(expectedErr))
        })
    })

    Context("MPIJob with worker Pod not controlled by itself", func() {
        It("Should return an error", func() {
            By("Calling the Reconcile method")
            jobName := "test-worker-orphan"
            testKind := "Pod"

            ctx := context.Background()
            startTime := metav1.Now()
            completionTime := metav1.Now()

            mpiJob := newMPIJob(jobName, pointer.Int32(1), 1, gpuResourceName, &startTime, &completionTime)

            for i := 0; i < 1; i++ {
                name := fmt.Sprintf("%s-%d", mpiJob.Name+workerSuffix, i)
                worker := reconciler.newWorker(mpiJob, name)
                worker.OwnerReferences = nil
                Expect(testK8sClient.Create(ctx, worker)).Should(Succeed())
            }

            Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

            req := ctrl.Request{NamespacedName: types.NamespacedName{
                Namespace: metav1.NamespaceDefault,
                Name:      mpiJob.GetName(),
            }}
            expectedErr := fmt.Errorf(MessageResourceExists, fmt.Sprintf("%s-%d", mpiJob.Name+workerSuffix, 0), testKind)
            Eventually(func() error {
                _, err := reconciler.Reconcile(ctx, req)
                return err
            }, testutil.Timeout, testutil.Interval).Should(MatchError(expectedErr))
        })
    })

    Context("MPIJob with ConfigMap not controlled by itself", func() {
        It("Should return an error", func() {
            By("Calling the Reconcile method")
            jobName := "test-cm-orphan"
            testKind := "ConfigMap"

            ctx := context.Background()
            startTime := metav1.Now()
            completionTime := metav1.Now()

            mpiJob := newMPIJob(jobName, pointer.Int32(64), 1, gpuResourceName, &startTime, &completionTime)

            cm := newConfigMap(mpiJob, 64, isGPULauncher(mpiJob))
            cm.OwnerReferences = nil
            Expect(testK8sClient.Create(ctx, cm)).Should(Succeed())

            Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

            req := ctrl.Request{NamespacedName: types.NamespacedName{
                Namespace: metav1.NamespaceDefault,
                Name:      mpiJob.GetName(),
            }}
            expectedErr := fmt.Errorf(MessageResourceExists, cm.Name, testKind)
            Eventually(func() error {
                _, err := reconciler.Reconcile(ctx, req)
                return err
            }, testutil.Timeout, testutil.Interval).Should(MatchError(expectedErr))
        })
    })
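    // The "not controlled by itself" Contexts above and below all follow one
    // recipe: pre-create a dependent resource (Pod, ConfigMap, ServiceAccount,
    // Role, RoleBinding) with its OwnerReferences stripped, then expect
    // Reconcile to refuse to adopt it and to keep returning the same
    // formatted error:
    //
    //	expectedErr := fmt.Errorf(MessageResourceExists, resourceName, resourceKind)
    //
    // where resourceName and resourceKind are placeholders for the orphaned
    // object's name and kind.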
    Context("MPIJob with ServiceAccount not controlled by itself", func() {
        It("Should return an error", func() {
            By("Calling the Reconcile method")
            jobName := "test-sa-orphan"
            testKind := "ServiceAccount"

            ctx := context.Background()
            startTime := metav1.Now()
            completionTime := metav1.Now()

            mpiJob := newMPIJob(jobName, pointer.Int32(64), 1, gpuResourceName, &startTime, &completionTime)

            sa := newLauncherServiceAccount(mpiJob)
            sa.OwnerReferences = nil
            Expect(testK8sClient.Create(ctx, sa)).Should(Succeed())

            Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

            req := ctrl.Request{NamespacedName: types.NamespacedName{
                Namespace: metav1.NamespaceDefault,
                Name:      mpiJob.GetName(),
            }}
            expectedErr := fmt.Errorf(MessageResourceExists, sa.Name, testKind)
            Eventually(func() error {
                _, err := reconciler.Reconcile(ctx, req)
                return err
            }, testutil.Timeout, testutil.Interval).Should(MatchError(expectedErr))
        })
    })

    Context("MPIJob with Role not controlled by itself", func() {
        It("Should return an error", func() {
            By("Calling the Reconcile method")
            jobName := "test-role-orphan"
            testKind := "Role"

            ctx := context.Background()
            startTime := metav1.Now()
            completionTime := metav1.Now()

            mpiJob := newMPIJob(jobName, pointer.Int32(64), 1, gpuResourceName, &startTime, &completionTime)

            role := newLauncherRole(mpiJob, 64)
            role.OwnerReferences = nil
            Expect(testK8sClient.Create(ctx, role)).Should(Succeed())

            Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

            req := ctrl.Request{NamespacedName: types.NamespacedName{
                Namespace: metav1.NamespaceDefault,
                Name:      mpiJob.GetName(),
            }}
            expectedErr := fmt.Errorf(MessageResourceExists, role.Name, testKind)
            Eventually(func() error {
                _, err := reconciler.Reconcile(ctx, req)
                return err
            }, testutil.Timeout, testutil.Interval).Should(MatchError(expectedErr))
        })
    })

    Context("MPIJob with RoleBinding not controlled by itself", func() {
        It("Should return an error", func() {
            By("Calling the Reconcile method")
            jobName := "test-rb-orphan"
            testKind := "RoleBinding"

            ctx := context.Background()
            startTime := metav1.Now()
            completionTime := metav1.Now()

            mpiJob := newMPIJob(jobName, pointer.Int32(64), 1, gpuResourceName, &startTime, &completionTime)

            rb := newLauncherRoleBinding(mpiJob)
            rb.OwnerReferences = nil
            Expect(testK8sClient.Create(ctx, rb)).Should(Succeed())

            Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

            req := ctrl.Request{NamespacedName: types.NamespacedName{
                Namespace: metav1.NamespaceDefault,
                Name:      mpiJob.GetName(),
            }}
            expectedErr := fmt.Errorf(MessageResourceExists, rb.Name, testKind)
            Eventually(func() error {
                _, err := reconciler.Reconcile(ctx, req)
                return err
            }, testutil.Timeout, testutil.Interval).Should(MatchError(expectedErr))
        })
    })
    Context("Test launcher's Intel MPI handling", func() {
        It("Should create a launcher job with Intel MPI env variables", func() {
            By("By creating MPIJobs with and without preset env variables")

            testCases := map[string]struct {
                envVariables         map[string]string
                expectedEnvVariables map[string]string
            }{
                "withoutIMPIValues": {
                    envVariables: map[string]string{
                        "X_MPI_HYDRA_BOOTSTRAP": "foo",
                    },
                    expectedEnvVariables: map[string]string{
                        "I_MPI_HYDRA_BOOTSTRAP":      iMPIDefaultBootstrap,
                        "I_MPI_HYDRA_BOOTSTRAP_EXEC": fmt.Sprintf("%s/%s", configMountPath, kubexecScriptName),
                    },
                },
                "withIMPIBootstrap": {
                    envVariables: map[string]string{
                        "I_MPI_HYDRA_BOOTSTRAP": "RSH",
                    },
                    expectedEnvVariables: map[string]string{
                        "I_MPI_HYDRA_BOOTSTRAP":      "RSH",
                        "I_MPI_HYDRA_BOOTSTRAP_EXEC": fmt.Sprintf("%s/%s", configMountPath, kubexecScriptName),
                    },
                },
                "withIMPIBootstrapExec": {
                    envVariables: map[string]string{
                        "I_MPI_HYDRA_BOOTSTRAP_EXEC": "/script.sh",
                    },
                    expectedEnvVariables: map[string]string{
                        "I_MPI_HYDRA_BOOTSTRAP":      iMPIDefaultBootstrap,
                        "I_MPI_HYDRA_BOOTSTRAP_EXEC": "/script.sh",
                    },
                },
                "withIMPIBootstrapAndExec": {
                    envVariables: map[string]string{
                        "I_MPI_HYDRA_BOOTSTRAP":      "RSH",
                        "I_MPI_HYDRA_BOOTSTRAP_EXEC": "/script.sh",
                    },
                    expectedEnvVariables: map[string]string{
                        "I_MPI_HYDRA_BOOTSTRAP":      "RSH",
                        "I_MPI_HYDRA_BOOTSTRAP_EXEC": "/script.sh",
                    },
                },
            }

            for testName, testCase := range testCases {
                ctx := context.Background()
                startTime := metav1.Now()
                completionTime := metav1.Now()

                jobName := "test-launcher-creation-" + strings.ToLower(testName)

                mpiJob := newMPIJob(jobName, pointer.Int32(1), 1, gpuResourceName, &startTime, &completionTime)
                Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

                template := &mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeLauncher].Template
                Expect(template.Spec.Containers).To(HaveLen(1))

                cont := &template.Spec.Containers[0]

                for k, v := range testCase.envVariables {
                    cont.Env = append(cont.Env,
                        corev1.EnvVar{
                            Name:  k,
                            Value: v,
                        },
                    )
                }

                launcher := reconciler.newLauncher(mpiJob, "kubectl-delivery", false)

                Expect(launcher.Spec.Containers).To(HaveLen(1))
                for expectedKey, expectedValue := range testCase.expectedEnvVariables {
                    Expect(launcher.Spec.Containers[0].Env).Should(ContainElements(
                        corev1.EnvVar{
                            Name:  expectedKey,
                            Value: expectedValue,
                        }),
                    )
                }
            }
        })
    })
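    // The Intel MPI cases above encode a defaulting rule for the launcher
    // container: both I_MPI_HYDRA_BOOTSTRAP and I_MPI_HYDRA_BOOTSTRAP_EXEC
    // must end up set, but values the user already supplied win over the
    // defaults. A hedged sketch of that rule (env is a hypothetical map of
    // the container's variables keyed by name; newLauncher's own code is
    // authoritative):
    //
    //	if _, ok := env["I_MPI_HYDRA_BOOTSTRAP"]; !ok {
    //		env["I_MPI_HYDRA_BOOTSTRAP"] = iMPIDefaultBootstrap
    //	}
    //	if _, ok := env["I_MPI_HYDRA_BOOTSTRAP_EXEC"]; !ok {
    //		env["I_MPI_HYDRA_BOOTSTRAP_EXEC"] = fmt.Sprintf("%s/%s", configMountPath, kubexecScriptName)
    //	}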
    Context("When creating the MPIJob with the suspend semantics", func() {
        const name = "test-job"
        var (
            ns          *corev1.Namespace
            job         *kubeflowv1.MPIJob
            jobKey      types.NamespacedName
            launcherKey types.NamespacedName
            worker0Key  types.NamespacedName
            ctx         = context.Background()
        )
        BeforeEach(func() {
            ns = &corev1.Namespace{
                ObjectMeta: metav1.ObjectMeta{
                    GenerateName: "mpijob-test-",
                },
            }
            Expect(testK8sClient.Create(ctx, ns)).Should(Succeed())

            now := metav1.Now()
            job = newMPIJob(name, pointer.Int32(1), 1, gpuResourceName, &now, &now)
            job.Namespace = ns.Name
            jobKey = client.ObjectKeyFromObject(job)
            launcherKey = types.NamespacedName{
                Name:      fmt.Sprintf("%s-launcher", name),
                Namespace: ns.Name,
            }
            worker0Key = types.NamespacedName{
                Name:      fmt.Sprintf("%s-worker-0", name),
                Namespace: ns.Name,
            }
        })
        AfterEach(func() {
            Expect(testK8sClient.Delete(ctx, job)).Should(Succeed())
            Expect(testK8sClient.Delete(ctx, ns)).Should(Succeed())
        })
        It("Shouldn't create resources if MPIJob is suspended", func() {
            By("By creating a new MPIJob with suspend=true")
            job.Spec.RunPolicy.Suspend = pointer.Bool(true)
            Expect(testK8sClient.Create(ctx, job)).Should(Succeed())

            created := &kubeflowv1.MPIJob{}
            launcherPod := &corev1.Pod{}
            workerPod := &corev1.Pod{}

            By("Checking the created MPIJob")
            Eventually(func() bool {
                err := testK8sClient.Get(ctx, jobKey, created)
                return err == nil
            }, testutil.Timeout, testutil.Interval).Should(BeTrue())
            By("Checking that the created MPIJob has a nil startTime")
            Consistently(func() *metav1.Time {
                Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
                return created.Status.StartTime
            }, testutil.ConsistentDuration, testutil.Interval).Should(BeNil())

            By("Checking if the pods aren't created")
            Consistently(func() bool {
                errLauncherPod := testK8sClient.Get(ctx, launcherKey, launcherPod)
                errWorkerPod := testK8sClient.Get(ctx, worker0Key, workerPod)
                return errors.IsNotFound(errLauncherPod) && errors.IsNotFound(errWorkerPod)
            }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue())

            By("Checking if the MPIJob has a suspended condition")
            Eventually(func() []kubeflowv1.JobCondition {
                Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
                return created.Status.Conditions
            }, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition{
                {
                    Type:    kubeflowv1.JobCreated,
                    Status:  corev1.ConditionTrue,
                    Reason:  commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobCreatedReason),
                    Message: fmt.Sprintf("MPIJob %s is created.", name),
                },
                {
                    Type:    kubeflowv1.JobSuspended,
                    Status:  corev1.ConditionTrue,
                    Reason:  commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobSuspendedReason),
                    Message: fmt.Sprintf("MPIJob %s is suspended.", name),
                },
            }, testutil.IgnoreJobConditionsTimes))
        })
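        // The It above covers the suspend-at-creation path: no startTime, no
        // pods, and a JobSuspended condition from the start. The It below
        // exercises the other path: a job that runs, is suspended mid-flight
        // (pods deleted, startTime preserved), and is then resumed.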
        It("Should delete resources after MPIJob is suspended; Should resume MPIJob after MPIJob is unsuspended", func() {
            By("By creating a new MPIJob")
            Expect(testK8sClient.Create(ctx, job)).Should(Succeed())

            created := &kubeflowv1.MPIJob{}
            launcherPod := &corev1.Pod{}
            workerPod := &corev1.Pod{}

            // We'll need to retry getting this newly created MPIJob, given that creation may not immediately happen.
            By("Checking the created MPIJob")
            Eventually(func() bool {
                err := testK8sClient.Get(ctx, jobKey, created)
                return err == nil
            }, testutil.Timeout, testutil.Interval).Should(BeTrue())

            var startTimeBeforeSuspended *metav1.Time
            Eventually(func() *metav1.Time {
                Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
                startTimeBeforeSuspended = created.Status.StartTime
                return startTimeBeforeSuspended
            }, testutil.Timeout, testutil.Interval).ShouldNot(BeNil())

            By("Checking the created pods")
            Eventually(func() bool {
                errLauncher := testK8sClient.Get(ctx, launcherKey, launcherPod)
                errWorker := testK8sClient.Get(ctx, worker0Key, workerPod)
                return errLauncher == nil && errWorker == nil
            }, testutil.Timeout, testutil.Interval).Should(BeTrue())

            By("Updating the Pods' phases to Running")
            Eventually(func() error {
                Expect(testK8sClient.Get(ctx, launcherKey, launcherPod)).Should(Succeed())
                launcherPod.Status.Phase = corev1.PodRunning
                return testK8sClient.Status().Update(ctx, launcherPod)
            }, testutil.Timeout, testutil.Interval).Should(Succeed())
            Eventually(func() error {
                Expect(testK8sClient.Get(ctx, worker0Key, workerPod)).Should(Succeed())
                workerPod.Status.Phase = corev1.PodRunning
                return testK8sClient.Status().Update(ctx, workerPod)
            }, testutil.Timeout, testutil.Interval).Should(Succeed())

            By("Checking the MPIJob's conditions")
            Eventually(func() []kubeflowv1.JobCondition {
                Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
                return created.Status.Conditions
            }, testutil.Timeout, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition{
                {
                    Type:    kubeflowv1.JobCreated,
                    Status:  corev1.ConditionTrue,
                    Reason:  commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobCreatedReason),
                    Message: fmt.Sprintf("MPIJob %s is created.", name),
                },
                {
                    Type:    kubeflowv1.JobRunning,
                    Status:  corev1.ConditionTrue,
                    Reason:  commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobRunningReason),
                    Message: fmt.Sprintf("MPIJob %s is running.", name),
                },
            }, testutil.IgnoreJobConditionsTimes))

            By("Updating the MPIJob with suspend=true")
            Eventually(func() error {
                Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
                created.Spec.RunPolicy.Suspend = pointer.Bool(true)
                return testK8sClient.Update(ctx, created)
            }, testutil.Timeout, testutil.Interval).Should(Succeed())

            By("Checking if the pods are removed")
            Eventually(func() bool {
                errLauncher := testK8sClient.Get(ctx, launcherKey, launcherPod)
                errWorker := testK8sClient.Get(ctx, worker0Key, workerPod)
                return errors.IsNotFound(errLauncher) && errors.IsNotFound(errWorker)
            }, testutil.Timeout, testutil.Interval).Should(BeTrue())
            Consistently(func() bool {
                errLauncherPod := testK8sClient.Get(ctx, launcherKey, launcherPod)
                errWorkerPod := testK8sClient.Get(ctx, worker0Key, workerPod)
                return errors.IsNotFound(errLauncherPod) && errors.IsNotFound(errWorkerPod)
            }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue())
            By("Checking if the MPIJob has a suspended condition")
            Eventually(func() bool {
                Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
                return created.Status.ReplicaStatuses[kubeflowv1.MPIJobReplicaTypeLauncher].Active == 0 &&
                    created.Status.ReplicaStatuses[kubeflowv1.MPIJobReplicaTypeWorker].Active == 0 &&
                    created.Status.StartTime.Equal(startTimeBeforeSuspended)
            }, testutil.Timeout, testutil.Interval).Should(BeTrue())
            Consistently(func() bool {
                Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
                return created.Status.ReplicaStatuses[kubeflowv1.MPIJobReplicaTypeLauncher].Active == 0 &&
                    created.Status.ReplicaStatuses[kubeflowv1.MPIJobReplicaTypeWorker].Active == 0 &&
                    created.Status.StartTime.Equal(startTimeBeforeSuspended)
            }, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue())
            Expect(created.Status.Conditions).Should(BeComparableTo([]kubeflowv1.JobCondition{
                {
                    Type:    kubeflowv1.JobCreated,
                    Status:  corev1.ConditionTrue,
                    Reason:  commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobCreatedReason),
                    Message: fmt.Sprintf("MPIJob %s is created.", name),
                },
                {
                    Type:    kubeflowv1.JobRunning,
                    Status:  corev1.ConditionFalse,
                    Reason:  commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobSuspendedReason),
                    Message: fmt.Sprintf("MPIJob %s is suspended.", name),
                },
                {
                    Type:    kubeflowv1.JobSuspended,
                    Status:  corev1.ConditionTrue,
                    Reason:  commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobSuspendedReason),
                    Message: fmt.Sprintf("MPIJob %s is suspended.", name),
                },
            }, testutil.IgnoreJobConditionsTimes))

            By("Unsuspending the MPIJob")
            Eventually(func() error {
                Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
                created.Spec.RunPolicy.Suspend = pointer.Bool(false)
                return testK8sClient.Update(ctx, created)
            }, testutil.Timeout, testutil.Interval).Should(Succeed())
            Eventually(func() *metav1.Time {
                Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
                return created.Status.StartTime
            }, testutil.Timeout, testutil.Interval).ShouldNot(BeNil())

            By("Checking if the pods are created")
            Eventually(func() error {
                return testK8sClient.Get(ctx, launcherKey, launcherPod)
            }, testutil.Timeout, testutil.Interval).Should(BeNil())
            Eventually(func() error {
                return testK8sClient.Get(ctx, worker0Key, workerPod)
            }, testutil.Timeout, testutil.Interval).Should(BeNil())

            By("Updating the Pods' phases to Running")
            Eventually(func() error {
                Expect(testK8sClient.Get(ctx, launcherKey, launcherPod)).Should(Succeed())
                launcherPod.Status.Phase = corev1.PodRunning
                return testK8sClient.Status().Update(ctx, launcherPod)
            }, testutil.Timeout, testutil.Interval).Should(Succeed())
            Eventually(func() error {
                Expect(testK8sClient.Get(ctx, worker0Key, workerPod)).Should(Succeed())
                workerPod.Status.Phase = corev1.PodRunning
                return testK8sClient.Status().Update(ctx, workerPod)
            }, testutil.Timeout, testutil.Interval).Should(Succeed())
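            // Resuming is expected to re-record startTime, so the final check
            // below compares it against the value captured before suspension.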
            By("Checking if the MPIJob has resumed conditions")
            Eventually(func() []kubeflowv1.JobCondition {
                Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
                return created.Status.Conditions
            }, testutil.Timeout, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition{
                {
                    Type:    kubeflowv1.JobCreated,
                    Status:  corev1.ConditionTrue,
                    Reason:  commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobCreatedReason),
                    Message: fmt.Sprintf("MPIJob %s is created.", name),
                },
                {
                    Type:    kubeflowv1.JobSuspended,
                    Status:  corev1.ConditionFalse,
                    Reason:  commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobResumedReason),
                    Message: fmt.Sprintf("MPIJob %s is resumed.", name),
                },
                {
                    Type:    kubeflowv1.JobRunning,
                    Status:  corev1.ConditionTrue,
                    Reason:  commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobRunningReason),
                    Message: fmt.Sprintf("MPIJob %s is running.", name),
                },
            }, testutil.IgnoreJobConditionsTimes))

            By("Checking if the startTime is updated")
            Expect(created.Status.StartTime).ShouldNot(Equal(startTimeBeforeSuspended))
        })
    })
})

// ReplicaStatusMatch reports whether the recorded ReplicaStatus for the given
// replica type exactly matches the expected Active/Succeeded/Failed counters.
func ReplicaStatusMatch(replicaStatuses map[common.ReplicaType]*common.ReplicaStatus,
    replicaType common.ReplicaType, status *common.ReplicaStatus) bool {
    if replicaStatuses == nil {
        return false
    }
    val, exist := replicaStatuses[replicaType]
    if !exist {
        return false
    }
    return val.Active == status.Active &&
        val.Succeeded == status.Succeeded &&
        val.Failed == status.Failed
}
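// replicaStatusMatchExample is an illustrative sketch, not part of the
// original suite; the function name and values are hypothetical. It shows how
// ReplicaStatusMatch is meant to be read: the recorded counters for one
// replica type must equal the expected counters exactly.
func replicaStatusMatchExample() bool {
    statuses := map[common.ReplicaType]*common.ReplicaStatus{
        kubeflowv1.MPIJobReplicaTypeWorker: {Active: 2},
    }
    want := &common.ReplicaStatus{Active: 2, Succeeded: 0, Failed: 0}
    // true: Active, Succeeded, and Failed all match for the worker type.
    return ReplicaStatusMatch(statuses, kubeflowv1.MPIJobReplicaTypeWorker, want)
}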