github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/mpi/mpijob_controller_test.go

// Copyright 2021 The Kubeflow Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mpi

import (
	"context"
	"fmt"
	"strings"

	common "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/utils/pointer"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
	commonutil "github.com/kubeflow/training-operator/pkg/util"
	"github.com/kubeflow/training-operator/pkg/util/testutil"
)

const (
	gpuResourceName         = "nvidia.com/gpu"
	extendedGPUResourceName = "vendor-domain/gpu"
)

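// newMPIJobCommon returns a minimal MPIJob with one launcher and one worker
// replica spec; start and completion times are set only when provided.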
func newMPIJobCommon(name string, startTime, completionTime *metav1.Time) *kubeflowv1.MPIJob {
	mpiJob := &kubeflowv1.MPIJob{
		TypeMeta: metav1.TypeMeta{APIVersion: kubeflowv1.SchemeGroupVersion.String()},
		ObjectMeta: metav1.ObjectMeta{
			Name:      name,
			Namespace: metav1.NamespaceDefault,
		},
		Spec: kubeflowv1.MPIJobSpec{
			RunPolicy: common.RunPolicy{
				CleanPodPolicy: kubeflowv1.CleanPodPolicyPointer(kubeflowv1.CleanPodPolicyAll),
			},
			MPIReplicaSpecs: map[common.ReplicaType]*common.ReplicaSpec{
				kubeflowv1.MPIJobReplicaTypeWorker: {
					Template: corev1.PodTemplateSpec{
						Spec: corev1.PodSpec{
							Containers: []corev1.Container{
								{
									Name:  "foo",
									Image: "bar",
								},
							},
						},
					},
				},
				kubeflowv1.MPIJobReplicaTypeLauncher: {
					Template: corev1.PodTemplateSpec{
						Spec: corev1.PodSpec{
							Containers: []corev1.Container{
								{
									Name:  "foo",
									Image: "bar",
								},
							},
						},
					},
				},
			},
		},
		Status: common.JobStatus{},
	}

	if startTime != nil {
		mpiJob.Status.StartTime = startTime
	}
	if completionTime != nil {
		mpiJob.Status.CompletionTime = completionTime
	}

	return mpiJob
}

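// newMPIJobOld builds on newMPIJobCommon, setting the worker replica count and
// putting a per-replica limit for the given resource (e.g. GPUs) on every
// worker container.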
func newMPIJobOld(name string, replicas *int32, pusPerReplica int64, resourceName string, startTime, completionTime *metav1.Time) *kubeflowv1.MPIJob {
	mpiJob := newMPIJobCommon(name, startTime, completionTime)

	mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeWorker].Replicas = replicas

	workerContainers := mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeWorker].Template.Spec.Containers
	for i := range workerContainers {
		container := &workerContainers[i]
		container.Resources = corev1.ResourceRequirements{
			Limits: corev1.ResourceList{
				corev1.ResourceName(resourceName): *resource.NewQuantity(pusPerReplica, resource.DecimalExponent),
			},
		}
	}

	return mpiJob
}

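// newMPIJob aliases newMPIJobWithLauncher; most specs below want a fully
// specified launcher.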
var newMPIJob = newMPIJobWithLauncher

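// newMPIJobWithLauncher additionally pins the launcher to a single replica and
// applies the same resource limits to the launcher containers.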
func newMPIJobWithLauncher(name string, replicas *int32, pusPerReplica int64, resourceName string, startTime, completionTime *metav1.Time) *kubeflowv1.MPIJob {
	mpiJob := newMPIJobOld(name, replicas, pusPerReplica, resourceName, startTime, completionTime)

	mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeLauncher].Replicas = pointer.Int32(1)

	launcherContainers := mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeLauncher].Template.Spec.Containers
	for i := range launcherContainers {
		container := &launcherContainers[i]
		container.Resources = corev1.ResourceRequirements{
			Limits: corev1.ResourceList{
				corev1.ResourceName(resourceName): *resource.NewQuantity(pusPerReplica, resource.DecimalExponent),
			},
		}
	}

	return mpiJob
}

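// testK8sClient and reconciler used throughout these specs are suite-level
// fixtures, created when the envtest environment for this package is
// bootstrapped.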
var _ = Describe("MPIJob controller", func() {
	Context("Test launcher is GPU launcher", func() {
		It("Should pass GPU Launcher verification", func() {
			By("By creating MPIJobs with various resource configuration")

			testCases := map[string]struct {
				gpu      string
				expected bool
			}{
				"isNvidiaGPU": {
					gpu:      gpuResourceName,
					expected: true,
				},
				"isExtendedGPU": {
					gpu:      extendedGPUResourceName,
					expected: true,
				},
				"notGPU": {
					gpu:      "vendor-domain/resourcetype",
					expected: false,
				},
			}
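			// isGPULauncher (defined in the controller) appears to flag the
			// launcher as GPU-based when a launcher resource limit name
			// carries a "gpu" suffix; these cases pin that behavior.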

			startTime := metav1.Now()
			completionTime := metav1.Now()

			for testName, testCase := range testCases {
				mpiJob := newMPIJobWithLauncher("test-"+strings.ToLower(testName),
					pointer.Int32(64), 1, testCase.gpu, &startTime, &completionTime)
				Expect(isGPULauncher(mpiJob)).To(Equal(testCase.expected))
			}
		})
	})

	Context("Test MPIJob with succeeded launcher Pod", func() {
		It("Should contain the desired launcher ReplicaStatus", func() {
			By("By marking a launcher pod with Phase Succeeded")
			ctx := context.Background()
			startTime := metav1.Now()
			completionTime := metav1.Now()

			jobName := "test-launcher-succeeded"

			mpiJob := newMPIJobWithLauncher(jobName, pointer.Int32(64), 1, gpuResourceName, &startTime, &completionTime)
			Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

			launcher := reconciler.newLauncher(mpiJob, "kubectl-delivery", isGPULauncher(mpiJob))
			launcher.Status.Phase = corev1.PodSucceeded

			launcherKey := types.NamespacedName{
				Namespace: metav1.NamespaceDefault,
				Name:      launcher.GetName(),
			}
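			// Retry the status update: the controller may not have created
			// the launcher Pod yet when this first runs.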
			Eventually(func() error {
				launcherCreated := &corev1.Pod{}
				if err := testK8sClient.Get(ctx, launcherKey, launcherCreated); err != nil {
					return err
				}
				launcherCreated.Status.Phase = corev1.PodSucceeded
				return testK8sClient.Status().Update(ctx, launcherCreated)
			}, testutil.Timeout, testutil.Interval).Should(BeNil())

			created := &kubeflowv1.MPIJob{}
			launcherStatus := &common.ReplicaStatus{
				Active:    0,
				Succeeded: 1,
				Failed:    0,
			}
			Eventually(func() bool {
				err := testK8sClient.Get(ctx, types.NamespacedName{Namespace: metav1.NamespaceDefault, Name: jobName}, created)
				if err != nil {
					return false
				}
				return ReplicaStatusMatch(created.Status.ReplicaStatuses, kubeflowv1.MPIJobReplicaTypeLauncher, launcherStatus)
			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
		})
	})

	Context("Test MPIJob with failed launcher Pod", func() {
		It("Should contain the desired launcher ReplicaStatus", func() {
			By("By marking a launcher pod with Phase Failed")
			ctx := context.Background()
			startTime := metav1.Now()
			completionTime := metav1.Now()

			jobName := "test-launcher-failed"

			mpiJob := newMPIJobWithLauncher(jobName, pointer.Int32(64), 1, gpuResourceName, &startTime, &completionTime)
			Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

			launcher := reconciler.newLauncher(mpiJob, "kubectl-delivery", isGPULauncher(mpiJob))
			launcherKey := types.NamespacedName{
				Namespace: metav1.NamespaceDefault,
				Name:      launcher.GetName(),
			}
			Eventually(func() error {
				launcherCreated := &corev1.Pod{}
				if err := testK8sClient.Get(ctx, launcherKey, launcherCreated); err != nil {
					return err
				}
				launcherCreated.Status.Phase = corev1.PodFailed
				return testK8sClient.Status().Update(ctx, launcherCreated)
			}, testutil.Timeout, testutil.Interval).Should(BeNil())

			launcherStatus := &common.ReplicaStatus{
				Active:    0,
				Succeeded: 0,
				Failed:    1,
			}
			created := &kubeflowv1.MPIJob{}
			Eventually(func() bool {
				err := testK8sClient.Get(ctx, types.NamespacedName{Namespace: metav1.NamespaceDefault, Name: jobName}, created)
				if err != nil {
					return false
				}
				return ReplicaStatusMatch(created.Status.ReplicaStatuses, kubeflowv1.MPIJobReplicaTypeLauncher, launcherStatus)
			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
		})
	})

	Context("Test MPIJob with succeeded launcher pod", func() {
		It("Should contain desired ReplicaStatuses for worker", func() {
			By("By marking the launcher Pod as Succeeded")
			ctx := context.Background()
			startTime := metav1.Now()
			completionTime := metav1.Now()

			jobName := "test-launcher-succeeded2"

			mpiJob := newMPIJobWithLauncher(jobName, pointer.Int32(64), 1, gpuResourceName, &startTime, &completionTime)
			Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

			launcher := reconciler.newLauncher(mpiJob, "kubectl-delivery", isGPULauncher(mpiJob))
			launcher.Status.Phase = corev1.PodSucceeded

			launcherKey := types.NamespacedName{
				Namespace: metav1.NamespaceDefault,
				Name:      launcher.GetName(),
			}
			Eventually(func() error {
				launcherCreated := &corev1.Pod{}
				if err := testK8sClient.Get(ctx, launcherKey, launcherCreated); err != nil {
					return err
				}
				launcherCreated.Status.Phase = corev1.PodSucceeded
				return testK8sClient.Status().Update(ctx, launcherCreated)
			}, testutil.Timeout, testutil.Interval).Should(BeNil())

			created := &kubeflowv1.MPIJob{}
			workerStatus := &common.ReplicaStatus{
				Active:    0,
				Succeeded: 0,
				Failed:    0,
			}
			Eventually(func() bool {
				err := testK8sClient.Get(ctx, types.NamespacedName{Namespace: metav1.NamespaceDefault, Name: jobName}, created)
				if err != nil {
					return false
				}
				return ReplicaStatusMatch(created.Status.ReplicaStatuses, kubeflowv1.MPIJobReplicaTypeWorker, workerStatus)
			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
		})
	})

	Context("Test MPIJob with Running launcher Pod and Pending worker Pods", func() {
		It("Should contain desired ReplicaStatuses", func() {
			By("By marking the launcher pod Running and the worker pods Pending")

			ctx := context.Background()
			startTime := metav1.Now()
			completionTime := metav1.Now()

			jobName := "test-launcher-running-worker-pending"

			var replicas int32 = 8
			mpiJob := newMPIJobWithLauncher(jobName, &replicas, 1, gpuResourceName, &startTime, &completionTime)
			Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

			launcher := reconciler.newLauncher(mpiJob, "kubectl-delivery", isGPULauncher(mpiJob))
			launcherKey := types.NamespacedName{
				Namespace: metav1.NamespaceDefault,
				Name:      launcher.GetName(),
			}
			Eventually(func() error {
				launcherCreated := &corev1.Pod{}
				if err := testK8sClient.Get(ctx, launcherKey, launcherCreated); err != nil {
					return err
				}
				launcherCreated.Status.Phase = corev1.PodRunning
				return testK8sClient.Status().Update(ctx, launcherCreated)
			}, testutil.Timeout, testutil.Interval).Should(BeNil())

			for i := 0; i < int(replicas); i++ {
				name := fmt.Sprintf("%s-%d", mpiJob.Name+workerSuffix, i)
				worker := reconciler.newWorker(mpiJob, name)
				workerKey := types.NamespacedName{
					Namespace: metav1.NamespaceDefault,
					Name:      worker.GetName(),
				}
				Eventually(func() error {
					workerCreated := &corev1.Pod{}
					if err := testK8sClient.Get(ctx, workerKey, workerCreated); err != nil {
						return err
					}
					workerCreated.Status.Phase = corev1.PodPending
					return testK8sClient.Status().Update(ctx, workerCreated)
				}, testutil.Timeout, testutil.Interval).Should(BeNil())
			}

			key := types.NamespacedName{
				Namespace: metav1.NamespaceDefault,
				Name:      jobName,
			}
			launcherStatus := &common.ReplicaStatus{
				Active:    1,
				Succeeded: 0,
				Failed:    0,
			}
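			// Pending workers are not counted as Active, so the expected
			// worker counts below stay at zero.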
			workerStatus := &common.ReplicaStatus{
				Active:    0,
				Succeeded: 0,
				Failed:    0,
			}
			Eventually(func() bool {
				created := &kubeflowv1.MPIJob{}
				err := testK8sClient.Get(ctx, key, created)
				if err != nil {
					return false
				}
				return ReplicaStatusMatch(created.Status.ReplicaStatuses, kubeflowv1.MPIJobReplicaTypeLauncher,
					launcherStatus) && ReplicaStatusMatch(created.Status.ReplicaStatuses, kubeflowv1.MPIJobReplicaTypeWorker,
					workerStatus)
			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
		})
	})

	Context("Test MPIJob with Running launcher Pod and Running worker Pods", func() {
		It("Should contain desired ReplicaStatuses", func() {
			By("By creating an active launcher pod and active worker pods")

			ctx := context.Background()
			startTime := metav1.Now()
			completionTime := metav1.Now()

			jobName := "test-launcher-running-worker-running"

			var replicas int32 = 8
			mpiJob := newMPIJob(jobName, &replicas, 1, gpuResourceName, &startTime, &completionTime)
			Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

			launcher := reconciler.newLauncher(mpiJob, "kubectl-delivery", isGPULauncher(mpiJob))
			launcherKey := types.NamespacedName{
				Namespace: metav1.NamespaceDefault,
				Name:      launcher.GetName(),
			}
			Eventually(func() error {
				launcherCreated := &corev1.Pod{}
				if err := testK8sClient.Get(ctx, launcherKey, launcherCreated); err != nil {
					return err
				}
				launcherCreated.Status.Phase = corev1.PodRunning
				return testK8sClient.Status().Update(ctx, launcherCreated)
			}, testutil.Timeout, testutil.Interval).Should(BeNil())

			for i := 0; i < int(replicas); i++ {
				name := fmt.Sprintf("%s-%d", mpiJob.Name+workerSuffix, i)
				worker := reconciler.newWorker(mpiJob, name)
				workerKey := types.NamespacedName{
					Namespace: metav1.NamespaceDefault,
					Name:      worker.GetName(),
				}
				Eventually(func() error {
					workerCreated := &corev1.Pod{}
					if err := testK8sClient.Get(ctx, workerKey, workerCreated); err != nil {
						return err
					}
					workerCreated.Status.Phase = corev1.PodRunning
					return testK8sClient.Status().Update(ctx, workerCreated)
				}, testutil.Timeout, testutil.Interval).Should(BeNil())
			}

			key := types.NamespacedName{
				Namespace: metav1.NamespaceDefault,
				Name:      jobName,
			}
			launcherStatus := &common.ReplicaStatus{
				Active:    1,
				Succeeded: 0,
				Failed:    0,
			}
			workerStatus := &common.ReplicaStatus{
				Active:    8,
				Succeeded: 0,
				Failed:    0,
			}
			Eventually(func() bool {
				created := &kubeflowv1.MPIJob{}
				err := testK8sClient.Get(ctx, key, created)
				if err != nil {
					return false
				}
				return ReplicaStatusMatch(created.Status.ReplicaStatuses, kubeflowv1.MPIJobReplicaTypeLauncher,
					launcherStatus) && ReplicaStatusMatch(created.Status.ReplicaStatuses, kubeflowv1.MPIJobReplicaTypeWorker,
					workerStatus)
			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
		})
	})

	Context("Test MPIJob with Running worker Pods", func() {
		It("Should contain desired ReplicaStatuses and create a launcher pod", func() {
			By("By creating only active worker pods")

			ctx := context.Background()
			startTime := metav1.Now()
			completionTime := metav1.Now()

			jobName := "test-worker-running"

			var replicas int32 = 16
			mpiJob := newMPIJob(jobName, &replicas, 1, gpuResourceName, &startTime, &completionTime)
			Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

			for i := 0; i < int(replicas); i++ {
				name := fmt.Sprintf("%s-%d", mpiJob.Name+workerSuffix, i)
				worker := reconciler.newWorker(mpiJob, name)
				workerKey := types.NamespacedName{
					Namespace: metav1.NamespaceDefault,
					Name:      worker.GetName(),
				}
				Eventually(func() error {
					workerCreated := &corev1.Pod{}
					if err := testK8sClient.Get(ctx, workerKey, workerCreated); err != nil {
						return err
					}
					workerCreated.Status.Phase = corev1.PodRunning
					return testK8sClient.Status().Update(ctx, workerCreated)
				}, testutil.Timeout, testutil.Interval).Should(BeNil())
			}

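			// No launcher Pod is created by hand here; the controller itself
			// is expected to create one under mpiJob.Name + launcherSuffix.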
			launcherKey := types.NamespacedName{
				Namespace: metav1.NamespaceDefault,
				Name:      mpiJob.Name + launcherSuffix,
			}
			launcher := &corev1.Pod{}
			Eventually(func() bool {
				err := testK8sClient.Get(ctx, launcherKey, launcher)
				return err == nil
			}, testutil.Timeout, testutil.Interval).Should(BeTrue())

			key := types.NamespacedName{
				Namespace: metav1.NamespaceDefault,
				Name:      jobName,
			}
			launcherStatus := &common.ReplicaStatus{
				Active:    0,
				Succeeded: 0,
				Failed:    0,
			}
			workerStatus := &common.ReplicaStatus{
				Active:    16,
				Succeeded: 0,
				Failed:    0,
			}
			Eventually(func() bool {
				created := &kubeflowv1.MPIJob{}
				err := testK8sClient.Get(ctx, key, created)
				if err != nil {
					return false
				}
				return ReplicaStatusMatch(created.Status.ReplicaStatuses, kubeflowv1.MPIJobReplicaTypeLauncher,
					launcherStatus) && ReplicaStatusMatch(created.Status.ReplicaStatuses, kubeflowv1.MPIJobReplicaTypeWorker,
					workerStatus)
			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
		})
	})

	Context("MPIJob not found", func() {
		It("Should do nothing", func() {
			By("Calling Reconcile method")
			jobName := "test-not-exist"

			ctx := context.Background()

			req := ctrl.Request{NamespacedName: types.NamespacedName{
				Namespace: metav1.NamespaceDefault,
				Name:      jobName,
			}}
			_, err := reconciler.Reconcile(ctx, req)
			Expect(err).Should(BeNil())
		})
	})

	Context("MPIJob with launcher Pod not controlled by itself", func() {
		It("Should return error", func() {
			By("Calling Reconcile method")
			jobName := "test-launcher-orphan"
			testKind := "Pod"

			ctx := context.Background()
			startTime := metav1.Now()
			completionTime := metav1.Now()

			mpiJob := newMPIJob(jobName, pointer.Int32(64), 1, gpuResourceName, &startTime, &completionTime)

			launcher := reconciler.newLauncher(mpiJob, "kubectl-delivery", isGPULauncher(mpiJob))
			launcher.OwnerReferences = nil
			Expect(testK8sClient.Create(ctx, launcher)).Should(Succeed())

			Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

			req := ctrl.Request{NamespacedName: types.NamespacedName{
				Namespace: metav1.NamespaceDefault,
				Name:      mpiJob.GetName(),
			}}
			expectedErr := fmt.Errorf(MessageResourceExists, launcher.Name, testKind)
			Eventually(func() error {
				_, err := reconciler.Reconcile(ctx, req)
				return err
			}, testutil.Timeout, testutil.Interval).Should(MatchError(expectedErr))
		})
	})

	Context("MPIJob with worker Pod not controlled by itself", func() {
		It("Should return error", func() {
			By("Calling Reconcile method")
			jobName := "test-worker-orphan"
			testKind := "Pod"

			ctx := context.Background()
			startTime := metav1.Now()
			completionTime := metav1.Now()

			mpiJob := newMPIJob(jobName, pointer.Int32(1), 1, gpuResourceName, &startTime, &completionTime)

			for i := 0; i < 1; i++ {
				name := fmt.Sprintf("%s-%d", mpiJob.Name+workerSuffix, i)
				worker := reconciler.newWorker(mpiJob, name)
				worker.OwnerReferences = nil
				Expect(testK8sClient.Create(ctx, worker)).Should(Succeed())
			}

			Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

			req := ctrl.Request{NamespacedName: types.NamespacedName{
				Namespace: metav1.NamespaceDefault,
				Name:      mpiJob.GetName(),
			}}
			expectedErr := fmt.Errorf(MessageResourceExists, fmt.Sprintf("%s-%d", mpiJob.Name+workerSuffix, 0), testKind)
			Eventually(func() error {
				_, err := reconciler.Reconcile(ctx, req)
				return err
			}, testutil.Timeout, testutil.Interval).Should(MatchError(expectedErr))
		})
	})

	Context("MPIJob with ConfigMap not controlled by itself", func() {
		It("Should return error", func() {
			By("Calling Reconcile method")
			jobName := "test-cm-orphan"
			testKind := "ConfigMap"

			ctx := context.Background()
			startTime := metav1.Now()
			completionTime := metav1.Now()

			mpiJob := newMPIJob(jobName, pointer.Int32(64), 1, gpuResourceName, &startTime, &completionTime)

			cm := newConfigMap(mpiJob, 64, isGPULauncher(mpiJob))
			cm.OwnerReferences = nil
			Expect(testK8sClient.Create(ctx, cm)).Should(Succeed())

			Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

			req := ctrl.Request{NamespacedName: types.NamespacedName{
				Namespace: metav1.NamespaceDefault,
				Name:      mpiJob.GetName(),
			}}
			expectedErr := fmt.Errorf(MessageResourceExists, cm.Name, testKind)
			Eventually(func() error {
				_, err := reconciler.Reconcile(ctx, req)
				return err
			}, testutil.Timeout, testutil.Interval).Should(MatchError(expectedErr))
		})
	})

	Context("MPIJob with ServiceAccount not controlled by itself", func() {
		It("Should return error", func() {
			By("Calling Reconcile method")
			jobName := "test-sa-orphan"
			testKind := "ServiceAccount"

			ctx := context.Background()
			startTime := metav1.Now()
			completionTime := metav1.Now()

			mpiJob := newMPIJob(jobName, pointer.Int32(64), 1, gpuResourceName, &startTime, &completionTime)

			sa := newLauncherServiceAccount(mpiJob)
			sa.OwnerReferences = nil
			Expect(testK8sClient.Create(ctx, sa)).Should(Succeed())

			Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

			req := ctrl.Request{NamespacedName: types.NamespacedName{
				Namespace: metav1.NamespaceDefault,
				Name:      mpiJob.GetName(),
			}}
			expectedErr := fmt.Errorf(MessageResourceExists, sa.Name, testKind)
			Eventually(func() error {
				_, err := reconciler.Reconcile(ctx, req)
				return err
			}, testutil.Timeout, testutil.Interval).Should(MatchError(expectedErr))
		})
	})

	Context("MPIJob with Role not controlled by itself", func() {
		It("Should return error", func() {
			By("Calling Reconcile method")
			jobName := "test-role-orphan"
			testKind := "Role"

			ctx := context.Background()
			startTime := metav1.Now()
			completionTime := metav1.Now()

			mpiJob := newMPIJob(jobName, pointer.Int32(64), 1, gpuResourceName, &startTime, &completionTime)

			role := newLauncherRole(mpiJob, 64)
			role.OwnerReferences = nil
			Expect(testK8sClient.Create(ctx, role)).Should(Succeed())

			Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

			req := ctrl.Request{NamespacedName: types.NamespacedName{
				Namespace: metav1.NamespaceDefault,
				Name:      mpiJob.GetName(),
			}}
			expectedErr := fmt.Errorf(MessageResourceExists, role.Name, testKind)
			Eventually(func() error {
				_, err := reconciler.Reconcile(ctx, req)
				return err
			}, testutil.Timeout, testutil.Interval).Should(MatchError(expectedErr))
		})
	})

	Context("MPIJob with RoleBinding not controlled by itself", func() {
		It("Should return error", func() {
			By("Calling Reconcile method")
			jobName := "test-rb-orphan"
			testKind := "RoleBinding"

			ctx := context.Background()
			startTime := metav1.Now()
			completionTime := metav1.Now()

			mpiJob := newMPIJob(jobName, pointer.Int32(64), 1, gpuResourceName, &startTime, &completionTime)

			rb := newLauncherRoleBinding(mpiJob)
			rb.OwnerReferences = nil
			Expect(testK8sClient.Create(ctx, rb)).Should(Succeed())

			Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

			req := ctrl.Request{NamespacedName: types.NamespacedName{
				Namespace: metav1.NamespaceDefault,
				Name:      mpiJob.GetName(),
			}}
			expectedErr := fmt.Errorf(MessageResourceExists, rb.Name, testKind)
			Eventually(func() error {
				_, err := reconciler.Reconcile(ctx, req)
				return err
			}, testutil.Timeout, testutil.Interval).Should(MatchError(expectedErr))
		})
	})

	Context("Test launcher's Intel MPI handling", func() {
		It("Should create a launcher Pod with Intel MPI env variables", func() {
			By("By creating MPIJobs with and without preset env variables")

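			// Each case seeds the launcher container spec with env variables
			// and asserts that the launcher Pod built by the controller
			// carries the expected I_MPI_* values.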
			testCases := map[string]struct {
				envVariables         map[string]string
				expectedEnvVariables map[string]string
			}{
				"withoutIMPIValues": {
					envVariables: map[string]string{
						"X_MPI_HYDRA_BOOTSTRAP": "foo",
					},
					expectedEnvVariables: map[string]string{
						"I_MPI_HYDRA_BOOTSTRAP":      iMPIDefaultBootstrap,
						"I_MPI_HYDRA_BOOTSTRAP_EXEC": fmt.Sprintf("%s/%s", configMountPath, kubexecScriptName),
					},
				},
				"withIMPIBootstrap": {
					envVariables: map[string]string{
						"I_MPI_HYDRA_BOOTSTRAP": "RSH",
					},
					expectedEnvVariables: map[string]string{
						"I_MPI_HYDRA_BOOTSTRAP":      "RSH",
						"I_MPI_HYDRA_BOOTSTRAP_EXEC": fmt.Sprintf("%s/%s", configMountPath, kubexecScriptName),
					},
				},
				"withIMPIBootstrapExec": {
					envVariables: map[string]string{
						"I_MPI_HYDRA_BOOTSTRAP_EXEC": "/script.sh",
					},
					expectedEnvVariables: map[string]string{
						"I_MPI_HYDRA_BOOTSTRAP":      iMPIDefaultBootstrap,
						"I_MPI_HYDRA_BOOTSTRAP_EXEC": "/script.sh",
					},
				},
				"withIMPIBootstrapAndExec": {
					envVariables: map[string]string{
						"I_MPI_HYDRA_BOOTSTRAP":      "RSH",
						"I_MPI_HYDRA_BOOTSTRAP_EXEC": "/script.sh",
					},
					expectedEnvVariables: map[string]string{
						"I_MPI_HYDRA_BOOTSTRAP":      "RSH",
						"I_MPI_HYDRA_BOOTSTRAP_EXEC": "/script.sh",
					},
				},
			}

			for testName, testCase := range testCases {
				ctx := context.Background()
				startTime := metav1.Now()
				completionTime := metav1.Now()

				jobName := "test-launcher-creation-" + strings.ToLower(testName)

				mpiJob := newMPIJob(jobName, pointer.Int32(1), 1, gpuResourceName, &startTime, &completionTime)
				Expect(testK8sClient.Create(ctx, mpiJob)).Should(Succeed())

				template := &mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeLauncher].Template
				Expect(template.Spec.Containers).To(HaveLen(1))

				cont := &template.Spec.Containers[0]

				for k, v := range testCase.envVariables {
					cont.Env = append(cont.Env,
						corev1.EnvVar{
							Name:  k,
							Value: v,
						},
					)
				}

				launcher := reconciler.newLauncher(mpiJob, "kubectl-delivery", false)

				Expect(launcher.Spec.Containers).To(HaveLen(1))
				for expectedKey, expectedValue := range testCase.expectedEnvVariables {
					Expect(launcher.Spec.Containers[0].Env).Should(ContainElements(
						corev1.EnvVar{
							Name:  expectedKey,
							Value: expectedValue,
						}),
					)
				}
			}
		})
	})

	Context("When creating the MPIJob with the suspend semantics", func() {
		const name = "test-job"
		var (
			ns          *corev1.Namespace
			job         *kubeflowv1.MPIJob
			jobKey      types.NamespacedName
			launcherKey types.NamespacedName
			worker0Key  types.NamespacedName
			ctx         = context.Background()
		)
		BeforeEach(func() {
			ns = &corev1.Namespace{
				ObjectMeta: metav1.ObjectMeta{
					GenerateName: "mpijob-test-",
				},
			}
			Expect(testK8sClient.Create(ctx, ns)).Should(Succeed())

			now := metav1.Now()
			job = newMPIJob(name, pointer.Int32(1), 1, gpuResourceName, &now, &now)
			job.Namespace = ns.Name
			jobKey = client.ObjectKeyFromObject(job)
			launcherKey = types.NamespacedName{
				Name:      fmt.Sprintf("%s-launcher", name),
				Namespace: ns.Name,
			}
			worker0Key = types.NamespacedName{
				Name:      fmt.Sprintf("%s-worker-0", name),
				Namespace: ns.Name,
			}
		})
		AfterEach(func() {
			Expect(testK8sClient.Delete(ctx, job)).Should(Succeed())
			Expect(testK8sClient.Delete(ctx, ns)).Should(Succeed())
		})
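		// While RunPolicy.Suspend is true, the controller must not create any
		// pods and must leave startTime unset; unsuspending resumes the job.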
		It("Shouldn't create resources if MPIJob is suspended", func() {
			By("By creating a new MPIJob with suspend=true")
			job.Spec.RunPolicy.Suspend = pointer.Bool(true)
			Expect(testK8sClient.Create(ctx, job)).Should(Succeed())

			created := &kubeflowv1.MPIJob{}
			launcherPod := &corev1.Pod{}
			workerPod := &corev1.Pod{}

			By("Checking created MPIJob")
			Eventually(func() bool {
				err := testK8sClient.Get(ctx, jobKey, created)
				return err == nil
			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
			By("Checking created MPIJob has a nil startTime")
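			// Consistently requires the assertion to hold for the entire
			// ConsistentDuration, not just at a single poll.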
			Consistently(func() *metav1.Time {
				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
				return created.Status.StartTime
			}, testutil.ConsistentDuration, testutil.Interval).Should(BeNil())

			By("Checking if the pods aren't created")
			Consistently(func() bool {
				errLauncherPod := testK8sClient.Get(ctx, launcherKey, launcherPod)
				errWorkerPod := testK8sClient.Get(ctx, worker0Key, workerPod)
				return errors.IsNotFound(errLauncherPod) && errors.IsNotFound(errWorkerPod)
			}, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue())

			By("Checking if the MPIJob has suspended condition")
			Eventually(func() []kubeflowv1.JobCondition {
				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
				return created.Status.Conditions
			}, testutil.ConsistentDuration, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition{
				{
					Type:    kubeflowv1.JobCreated,
					Status:  corev1.ConditionTrue,
					Reason:  commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobCreatedReason),
					Message: fmt.Sprintf("MPIJob %s is created.", name),
				},
				{
					Type:    kubeflowv1.JobSuspended,
					Status:  corev1.ConditionTrue,
					Reason:  commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobSuspendedReason),
					Message: fmt.Sprintf("MPIJob %s is suspended.", name),
				},
			}, testutil.IgnoreJobConditionsTimes))
		})

		It("Should delete resources after MPIJob is suspended; Should resume MPIJob after MPIJob is unsuspended", func() {
			By("By creating a new MPIJob")
			Expect(testK8sClient.Create(ctx, job)).Should(Succeed())

			created := &kubeflowv1.MPIJob{}
			launcherPod := &corev1.Pod{}
			workerPod := &corev1.Pod{}

			// We'll need to retry getting this newly created MPIJob, given that creation may not immediately happen.
			By("Checking created MPIJob")
			Eventually(func() bool {
				err := testK8sClient.Get(ctx, jobKey, created)
				return err == nil
			}, testutil.Timeout, testutil.Interval).Should(BeTrue())

			var startTimeBeforeSuspended *metav1.Time
			Eventually(func() *metav1.Time {
				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
				startTimeBeforeSuspended = created.Status.StartTime
				return startTimeBeforeSuspended
			}, testutil.Timeout, testutil.Interval).ShouldNot(BeNil())

			By("Checking the created pods")
			Eventually(func() bool {
				errLauncher := testK8sClient.Get(ctx, launcherKey, launcherPod)
				errWorker := testK8sClient.Get(ctx, worker0Key, workerPod)
				return errLauncher == nil && errWorker == nil
			}, testutil.Timeout, testutil.Interval).Should(BeTrue())

			By("Updating the Pod's phase with Running")
			Eventually(func() error {
				Expect(testK8sClient.Get(ctx, launcherKey, launcherPod)).Should(Succeed())
				launcherPod.Status.Phase = corev1.PodRunning
				return testK8sClient.Status().Update(ctx, launcherPod)
			}, testutil.Timeout, testutil.Interval).Should(Succeed())
			Eventually(func() error {
				Expect(testK8sClient.Get(ctx, worker0Key, workerPod)).Should(Succeed())
				workerPod.Status.Phase = corev1.PodRunning
				return testK8sClient.Status().Update(ctx, workerPod)
			}, testutil.Timeout, testutil.Interval).Should(Succeed())

			By("Checking the MPIJob's condition")
			Eventually(func() []kubeflowv1.JobCondition {
				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
				return created.Status.Conditions
			}, testutil.Timeout, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition{
				{
					Type:    kubeflowv1.JobCreated,
					Status:  corev1.ConditionTrue,
					Reason:  commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobCreatedReason),
					Message: fmt.Sprintf("MPIJob %s is created.", name),
				},
				{
					Type:    kubeflowv1.JobRunning,
					Status:  corev1.ConditionTrue,
					Reason:  commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobRunningReason),
					Message: fmt.Sprintf("MPIJob %s is running.", name),
				},
			}, testutil.IgnoreJobConditionsTimes))

			By("Updating the MPIJob with suspend=true")
			Eventually(func() error {
				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
				created.Spec.RunPolicy.Suspend = pointer.Bool(true)
				return testK8sClient.Update(ctx, created)
			}, testutil.Timeout, testutil.Interval).Should(Succeed())

			By("Checking if the pods are removed")
			Eventually(func() bool {
				errLauncher := testK8sClient.Get(ctx, launcherKey, launcherPod)
				errWorker := testK8sClient.Get(ctx, worker0Key, workerPod)
				return errors.IsNotFound(errLauncher) && errors.IsNotFound(errWorker)
			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
			Consistently(func() bool {
				errLauncherPod := testK8sClient.Get(ctx, launcherKey, launcherPod)
				errWorkerPod := testK8sClient.Get(ctx, worker0Key, workerPod)
				return errors.IsNotFound(errLauncherPod) && errors.IsNotFound(errWorkerPod)
			}, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue())

			By("Checking if the MPIJob has a suspended condition")
			Eventually(func() bool {
				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
				return created.Status.ReplicaStatuses[kubeflowv1.MPIJobReplicaTypeLauncher].Active == 0 &&
					created.Status.ReplicaStatuses[kubeflowv1.MPIJobReplicaTypeWorker].Active == 0 &&
					created.Status.StartTime.Equal(startTimeBeforeSuspended)
			}, testutil.Timeout, testutil.Interval).Should(BeTrue())
			Consistently(func() bool {
				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
				return created.Status.ReplicaStatuses[kubeflowv1.MPIJobReplicaTypeLauncher].Active == 0 &&
					created.Status.ReplicaStatuses[kubeflowv1.MPIJobReplicaTypeWorker].Active == 0 &&
					created.Status.StartTime.Equal(startTimeBeforeSuspended)
			}, testutil.ConsistentDuration, testutil.Interval).Should(BeTrue())
			Expect(created.Status.Conditions).Should(BeComparableTo([]kubeflowv1.JobCondition{
				{
					Type:    kubeflowv1.JobCreated,
					Status:  corev1.ConditionTrue,
					Reason:  commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobCreatedReason),
					Message: fmt.Sprintf("MPIJob %s is created.", name),
				},
				{
					Type:    kubeflowv1.JobRunning,
					Status:  corev1.ConditionFalse,
					Reason:  commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobSuspendedReason),
					Message: fmt.Sprintf("MPIJob %s is suspended.", name),
				},
				{
					Type:    kubeflowv1.JobSuspended,
					Status:  corev1.ConditionTrue,
					Reason:  commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobSuspendedReason),
					Message: fmt.Sprintf("MPIJob %s is suspended.", name),
				},
			}, testutil.IgnoreJobConditionsTimes))

			By("Unsuspending the MPIJob")
			Eventually(func() error {
				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
				created.Spec.RunPolicy.Suspend = pointer.Bool(false)
				return testK8sClient.Update(ctx, created)
			}, testutil.Timeout, testutil.Interval).Should(Succeed())
			Eventually(func() *metav1.Time {
				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
				return created.Status.StartTime
			}, testutil.Timeout, testutil.Interval).ShouldNot(BeNil())

			By("Checking if the pods are created")
			Eventually(func() error {
				return testK8sClient.Get(ctx, launcherKey, launcherPod)
			}, testutil.Timeout, testutil.Interval).Should(BeNil())
			Eventually(func() error {
				return testK8sClient.Get(ctx, worker0Key, workerPod)
			}, testutil.Timeout, testutil.Interval).Should(BeNil())

			By("Updating the Pod's phase with Running")
			Eventually(func() error {
				Expect(testK8sClient.Get(ctx, launcherKey, launcherPod)).Should(Succeed())
				launcherPod.Status.Phase = corev1.PodRunning
				return testK8sClient.Status().Update(ctx, launcherPod)
			}, testutil.Timeout, testutil.Interval).Should(Succeed())
			Eventually(func() error {
				Expect(testK8sClient.Get(ctx, worker0Key, workerPod)).Should(Succeed())
				workerPod.Status.Phase = corev1.PodRunning
				return testK8sClient.Status().Update(ctx, workerPod)
			}, testutil.Timeout, testutil.Interval).Should(Succeed())

			By("Checking if the MPIJob has resumed conditions")
			Eventually(func() []kubeflowv1.JobCondition {
				Expect(testK8sClient.Get(ctx, jobKey, created)).Should(Succeed())
				return created.Status.Conditions
			}, testutil.Timeout, testutil.Interval).Should(BeComparableTo([]kubeflowv1.JobCondition{
				{
					Type:    kubeflowv1.JobCreated,
					Status:  corev1.ConditionTrue,
					Reason:  commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobCreatedReason),
					Message: fmt.Sprintf("MPIJob %s is created.", name),
				},
				{
					Type:    kubeflowv1.JobSuspended,
					Status:  corev1.ConditionFalse,
					Reason:  commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobResumedReason),
					Message: fmt.Sprintf("MPIJob %s is resumed.", name),
				},
				{
					Type:    kubeflowv1.JobRunning,
					Status:  corev1.ConditionTrue,
					Reason:  commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobRunningReason),
					Message: fmt.Sprintf("MPIJob %s is running.", name),
				},
			}, testutil.IgnoreJobConditionsTimes))

			By("Checking if the startTime is updated")
			Expect(created.Status.StartTime).ShouldNot(Equal(startTimeBeforeSuspended))
		})
	})
})

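// ReplicaStatusMatch reports whether the tracked counts for replicaType equal
// the expected Active, Succeeded, and Failed counts.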
func ReplicaStatusMatch(replicaStatuses map[common.ReplicaType]*common.ReplicaStatus,
	replicaType common.ReplicaType, status *common.ReplicaStatus) bool {
	if replicaStatuses == nil {
		return false
	}
	val, exist := replicaStatuses[replicaType]
	if !exist {
		return false
	}
	return val.Active == status.Active &&
		val.Succeeded == status.Succeeded &&
		val.Failed == status.Failed
}