github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/mpi/mpijob_controller.go

     1  // Copyright 2021 The Kubeflow Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package mpi
    16  
    17  import (
    18  	"bytes"
    19  	"context"
    20  	"fmt"
    21  	"reflect"
    22  	"sort"
    23  	"strconv"
    24  	"strings"
    25  	"time"
    26  
    27  	"github.com/go-logr/logr"
    28  	"github.com/sirupsen/logrus"
    29  	corev1 "k8s.io/api/core/v1"
    30  	rbacv1 "k8s.io/api/rbac/v1"
    31  	"k8s.io/apimachinery/pkg/api/errors"
    32  	"k8s.io/apimachinery/pkg/api/resource"
    33  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    34  	"k8s.io/apimachinery/pkg/runtime"
    35  	"k8s.io/apimachinery/pkg/runtime/schema"
    36  	"k8s.io/apimachinery/pkg/types"
    37  	"k8s.io/client-go/informers"
    38  	kubeclientset "k8s.io/client-go/kubernetes"
    39  	"k8s.io/client-go/tools/record"
    40  	"k8s.io/klog"
    41  	ctrl "sigs.k8s.io/controller-runtime"
    42  	"sigs.k8s.io/controller-runtime/pkg/client"
    43  	"sigs.k8s.io/controller-runtime/pkg/controller"
    44  	"sigs.k8s.io/controller-runtime/pkg/event"
    45  	"sigs.k8s.io/controller-runtime/pkg/handler"
    46  	"sigs.k8s.io/controller-runtime/pkg/log"
    47  	"sigs.k8s.io/controller-runtime/pkg/manager"
    48  	"sigs.k8s.io/controller-runtime/pkg/predicate"
    49  	"sigs.k8s.io/controller-runtime/pkg/source"
    50  	schedulerpluginsv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1"
    51  	"volcano.sh/apis/pkg/apis/scheduling/v1beta1"
    52  
    53  	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
    54  	trainingoperatorcommon "github.com/kubeflow/training-operator/pkg/common"
    55  	"github.com/kubeflow/training-operator/pkg/common/util"
    56  	ctlrconfig "github.com/kubeflow/training-operator/pkg/config"
    57  	"github.com/kubeflow/training-operator/pkg/controller.v1/common"
    58  	"github.com/kubeflow/training-operator/pkg/controller.v1/control"
    59  	"github.com/kubeflow/training-operator/pkg/controller.v1/expectation"
    60  	commonutil "github.com/kubeflow/training-operator/pkg/util"
    61  )
    62  
    63  const (
    64  	FailedDeleteJobReason     = "FailedDeleteJob"
    65  	SuccessfulDeleteJobReason = "SuccessfulDeleteJob"
    66  
    67  	controllerName  = "mpijob-controller"
    68  	labelMPIJobName = "mpi-job-name"
    69  )
    70  
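         // NewReconciler constructs an MPIJobReconciler and wires it into the shared
         // JobController: pod/service controls, controller expectations, and the
         // PriorityClass informer, then applies the gang-scheduling setup function.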
    71  func NewReconciler(mgr manager.Manager, gangSchedulingSetupFunc common.GangSchedulingSetupFunc) *MPIJobReconciler {
    72  	r := &MPIJobReconciler{
    73  		Client:    mgr.GetClient(),
    74  		Scheme:    mgr.GetScheme(),
    75  		recorder:  mgr.GetEventRecorderFor(controllerName),
    76  		apiReader: mgr.GetAPIReader(),
    77  		Log:       log.Log,
    78  	}
    79  
    80  	cfg := mgr.GetConfig()
    81  	kubeClientSet := kubeclientset.NewForConfigOrDie(cfg)
    82  	sharedInformers := informers.NewSharedInformerFactory(kubeClientSet, 0)
    83  	priorityClassInformer := sharedInformers.Scheduling().V1().PriorityClasses()
    84  
    85  	r.JobController = common.JobController{
    86  		Controller:                  r,
    87  		Expectations:                expectation.NewControllerExpectations(),
    88  		WorkQueue:                   &util.FakeWorkQueue{},
    89  		Recorder:                    r.recorder,
    90  		KubeClientSet:               kubeClientSet,
    91  		PriorityClassLister:         priorityClassInformer.Lister(),
    92  		PriorityClassInformerSynced: priorityClassInformer.Informer().HasSynced,
    93  		PodControl:                  control.RealPodControl{KubeClient: kubeClientSet, Recorder: r.recorder},
    94  		ServiceControl:              control.RealServiceControl{KubeClient: kubeClientSet, Recorder: r.recorder},
    95  	}
    96  
    97  	gangSchedulingSetupFunc(&r.JobController)
    98  
    99  	return r
   100  }
   101  
    102  // MPIJobReconciler reconciles an MPIJob object
   103  type MPIJobReconciler struct {
   104  	common.JobController
   105  	client.Client
   106  	Scheme    *runtime.Scheme
   107  	recorder  record.EventRecorder
   108  	apiReader client.Reader
   109  	Log       logr.Logger
   110  }
   111  
   112  //+kubebuilder:rbac:groups=kubeflow.org,resources=mpijobs,verbs=get;list;watch;create;update;patch;delete
   113  //+kubebuilder:rbac:groups=kubeflow.org,resources=mpijobs/status,verbs=get;update;patch
   114  //+kubebuilder:rbac:groups=kubeflow.org,resources=mpijobs/finalizers,verbs=update
   115  //+kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;update;patch;delete
   116  //+kubebuilder:rbac:groups="",resources=serviceaccounts,verbs=get;list;watch;create
   117  //+kubebuilder:rbac:groups="",resources=configmaps,verbs=list;watch;create;update
   118  //+kubebuilder:rbac:groups="rbac.authorization.k8s.io",resources=roles,verbs=list;watch;create;update
   119  //+kubebuilder:rbac:groups="rbac.authorization.k8s.io",resources=rolebindings,verbs=list;watch;create;update
   120  //+kubebuilder:rbac:groups="",resources=pods/exec,verbs=create
   121  //+kubebuilder:rbac:groups=scheduling.volcano.sh,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
   122  //+kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
   123  //+kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch;delete
   124  
   125  // Reconcile is part of the main kubernetes reconciliation loop which aims to
   126  // move the current state of the cluster closer to the desired state.
   127  func (jc *MPIJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
   128  	_ = log.FromContext(ctx)
   129  	logger := jc.Log.WithValues(kubeflowv1.MPIJobSingular, req.NamespacedName)
   130  
   131  	mpijob := &kubeflowv1.MPIJob{}
   132  	err := jc.Get(ctx, req.NamespacedName, mpijob)
   133  	if err != nil {
    134  		logger.Info("unable to fetch MPIJob", "namespacedName", req.NamespacedName.String(), "err", err.Error())
   135  		return ctrl.Result{}, client.IgnoreNotFound(err)
   136  	}
   137  
   138  	if err = kubeflowv1.ValidateV1MpiJobSpec(&mpijob.Spec); err != nil {
   139  		logger.Error(err, "MPIJob failed validation")
   140  		jc.Recorder.Eventf(mpijob, corev1.EventTypeWarning, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobFailedValidationReason),
   141  			"MPIJob failed validation because %s", err)
   142  		return ctrl.Result{}, err
   143  	}
   144  
    145  	// Skip reconciliation for an MPIJob that is being deleted.
   146  	if mpijob.GetDeletionTimestamp() != nil {
   147  		return ctrl.Result{}, nil
   148  	}
   149  
    150  	// Apply registered defaults to the MPIJob.
   151  	jc.Scheme.Default(mpijob)
   152  
    153  	// 1) validation rules out a CleanPodPolicy with contradicting values
    154  	// 2) if both fields are empty, the Default function fills them with None
    155  	// 3) if only one field is set, sync its value to the other
   156  	cleanPolicyDefined := mpijob.Spec.CleanPodPolicy
   157  	if mpijob.Spec.RunPolicy.CleanPodPolicy != nil {
   158  		cleanPolicyDefined = mpijob.Spec.RunPolicy.CleanPodPolicy
   159  	}
   160  	mpijob.Spec.CleanPodPolicy = cleanPolicyDefined
   161  	mpijob.Spec.RunPolicy.CleanPodPolicy = cleanPolicyDefined
   162  
    163  	// Use the common JobController to reconcile the job-related Pods.
    164  	// MPIJob does not need Services.
   165  	err = jc.ReconcileJobs(mpijob, mpijob.Spec.MPIReplicaSpecs, mpijob.Status, &mpijob.Spec.RunPolicy)
   166  	if err != nil {
   167  		logrus.Warnf("Reconcile MPIJob error %v", err)
   168  		return ctrl.Result{}, err
   169  	}
   170  
   171  	t, err := util.DurationUntilExpireTime(&mpijob.Spec.RunPolicy, mpijob.Status)
   172  	if err != nil {
   173  		logrus.Warnf("Reconcile MPIJob Job error %v", err)
   174  		return ctrl.Result{}, err
   175  	}
   176  	if t >= 0 {
   177  		return ctrl.Result{Requeue: true, RequeueAfter: t}, nil
   178  	}
   179  
   180  	return ctrl.Result{}, nil
   181  }
   182  
   183  // SetupWithManager sets up the controller with the Manager.
   184  func (jc *MPIJobReconciler) SetupWithManager(mgr ctrl.Manager, controllerThreads int) error {
   185  	c, err := controller.New(jc.ControllerName(), mgr, controller.Options{
   186  		Reconciler:              jc,
   187  		MaxConcurrentReconciles: controllerThreads,
   188  	})
   189  	if err != nil {
   190  		return err
   191  	}
   192  
    193  	// using onOwnerCreateFunc makes it easier to set defaults
   194  	if err = c.Watch(source.Kind(mgr.GetCache(), &kubeflowv1.MPIJob{}), &handler.EnqueueRequestForObject{},
   195  		predicate.Funcs{CreateFunc: jc.onOwnerCreateFunc()},
   196  	); err != nil {
   197  		return err
   198  	}
   199  
   200  	// eventHandler for owned objects
   201  	eventHandler := handler.EnqueueRequestForOwner(mgr.GetScheme(), mgr.GetRESTMapper(), &kubeflowv1.MPIJob{}, handler.OnlyControllerOwner())
   202  	predicates := predicate.Funcs{
   203  		CreateFunc: util.OnDependentCreateFunc(jc.Expectations),
   204  		UpdateFunc: util.OnDependentUpdateFunc(&jc.JobController),
   205  		DeleteFunc: util.OnDependentDeleteFunc(jc.Expectations),
   206  	}
   207  	// Create generic predicates
   208  	genericPredicates := predicate.Funcs{
   209  		CreateFunc: util.OnDependentCreateFuncGeneric(jc.Expectations),
   210  		UpdateFunc: util.OnDependentUpdateFuncGeneric(&jc.JobController),
   211  		DeleteFunc: util.OnDependentDeleteFuncGeneric(jc.Expectations),
   212  	}
   213  	// inject watching for job related pod
   214  	if err = c.Watch(source.Kind(mgr.GetCache(), &corev1.Pod{}), eventHandler, predicates); err != nil {
   215  		return err
   216  	}
   217  	// inject watching for job related ConfigMap
   218  	if err = c.Watch(source.Kind(mgr.GetCache(), &corev1.ConfigMap{}), eventHandler, genericPredicates); err != nil {
   219  		return err
   220  	}
   221  	// inject watching for job related Role
   222  	if err = c.Watch(source.Kind(mgr.GetCache(), &rbacv1.Role{}), eventHandler, genericPredicates); err != nil {
   223  		return err
   224  	}
   225  	// inject watching for job related RoleBinding
   226  	if err = c.Watch(source.Kind(mgr.GetCache(), &rbacv1.RoleBinding{}), eventHandler, genericPredicates); err != nil {
   227  		return err
   228  	}
   229  	// inject watching for job related ServiceAccount
   230  	if err = c.Watch(source.Kind(mgr.GetCache(), &corev1.ServiceAccount{}), eventHandler, genericPredicates); err != nil {
   231  		return err
   232  	}
   233  	// skip watching volcano PodGroup if volcano PodGroup is not installed
   234  	if _, err = mgr.GetRESTMapper().RESTMapping(schema.GroupKind{Group: v1beta1.GroupName, Kind: "PodGroup"},
   235  		v1beta1.SchemeGroupVersion.Version,
   236  	); err == nil {
   237  		// inject watching for job related volcano PodGroup
   238  		if err = c.Watch(source.Kind(mgr.GetCache(), &v1beta1.PodGroup{}), eventHandler, genericPredicates); err != nil {
   239  			return err
   240  		}
   241  	}
   242  	// skip watching scheduler-plugins PodGroup if scheduler-plugins PodGroup is not installed
   243  	if _, err = mgr.GetRESTMapper().RESTMapping(
   244  		schema.GroupKind{Group: schedulerpluginsv1alpha1.SchemeGroupVersion.Group, Kind: "PodGroup"},
   245  		schedulerpluginsv1alpha1.SchemeGroupVersion.Version,
   246  	); err == nil {
   247  		// inject watching for job related scheduler-plugins PodGroup
   248  		if err = c.Watch(source.Kind(mgr.GetCache(), &schedulerpluginsv1alpha1.PodGroup{}), eventHandler, genericPredicates); err != nil {
   249  			return err
   250  		}
   251  	}
   252  
   253  	return nil
   254  }
   255  
   256  // ReconcileServices is overridden because mpi-reconciler.v1 does not need to reconcile services
   257  func (jc *MPIJobReconciler) ReconcileServices(
   258  	job metav1.Object,
   259  	services []*corev1.Service,
   260  	rtype kubeflowv1.ReplicaType,
   261  	spec *kubeflowv1.ReplicaSpec) error {
   262  	return nil
   263  }
   264  
   265  func (jc *MPIJobReconciler) ControllerName() string {
   266  	return controllerName
   267  }
   268  
   269  func (jc *MPIJobReconciler) GetAPIGroupVersionKind() schema.GroupVersionKind {
   270  	return kubeflowv1.GroupVersion.WithKind(kubeflowv1.MPIJobKind)
   271  }
   272  
   273  func (jc *MPIJobReconciler) GetAPIGroupVersion() schema.GroupVersion {
   274  	return kubeflowv1.GroupVersion
   275  }
   276  
   277  func (jc *MPIJobReconciler) GetGroupNameLabelValue() string {
   278  	return kubeflowv1.GroupVersion.Group
   279  }
   280  
   281  func (jc *MPIJobReconciler) GetFrameworkName() string {
   282  	return kubeflowv1.MPIJobFrameworkName
   283  }
   284  
   285  // SetClusterSpec is overridden because no cluster spec is needed for MPIJob
   286  func (jc *MPIJobReconciler) SetClusterSpec(job interface{}, podTemplate *corev1.PodTemplateSpec, rtype, index string) error {
   287  	return nil
   288  }
   289  
   290  func (jc *MPIJobReconciler) GetDefaultContainerName() string {
   291  	return kubeflowv1.MPIJobDefaultContainerName
   292  }
   293  
   294  func (jc *MPIJobReconciler) GetDefaultContainerPortName() string {
   295  	return kubeflowv1.MPIJobDefaultPortName
   296  }
   297  
   298  func (jc *MPIJobReconciler) IsMasterRole(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec,
   299  	rtype kubeflowv1.ReplicaType, index int) bool {
   300  	return string(rtype) == string(kubeflowv1.MPIJobReplicaTypeLauncher)
   301  }
   302  
   303  func (jc *MPIJobReconciler) GetJobFromInformerCache(namespace, name string) (metav1.Object, error) {
   304  	mpijob := &kubeflowv1.MPIJob{}
   305  	err := jc.Get(context.Background(), types.NamespacedName{
   306  		Namespace: namespace, Name: name,
   307  	}, mpijob)
   308  	return mpijob, err
   309  }
   310  
    311  // onOwnerCreateFunc modifies the creation condition.
   312  func (jc *MPIJobReconciler) onOwnerCreateFunc() func(event.CreateEvent) bool {
   313  	return func(e event.CreateEvent) bool {
   314  		mpiJob, ok := e.Object.(*kubeflowv1.MPIJob)
   315  		if !ok {
   316  			return true
   317  		}
   318  
   319  		jc.Scheme.Default(mpiJob)
   320  		msg := fmt.Sprintf("MPIJob %s is created.", e.Object.GetName())
   321  		logrus.Info(msg)
   322  		trainingoperatorcommon.CreatedJobsCounterInc(mpiJob.Namespace, jc.GetFrameworkName())
   323  		commonutil.UpdateJobConditions(&mpiJob.Status, kubeflowv1.JobCreated, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobCreatedReason), msg)
   324  		return true
   325  	}
   326  }
   327  
   328  func (jc *MPIJobReconciler) ReconcilePods(
   329  	job interface{},
   330  	jobStatus *kubeflowv1.JobStatus,
   331  	pods []*corev1.Pod,
   332  	rtype kubeflowv1.ReplicaType,
   333  	spec *kubeflowv1.ReplicaSpec,
   334  	replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec,
   335  ) error {
   336  
   337  	mpiJob, ok := job.(*kubeflowv1.MPIJob)
   338  	if !ok {
    339  		return fmt.Errorf("%+v is not a type of MPIJob", job)
   340  	}
   341  
   342  	// first set StartTime.
   343  	if jobStatus.StartTime == nil {
   344  		now := metav1.Now()
   345  		jobStatus.StartTime = &now
   346  	}
   347  
   348  	initializeReplicaStatuses(jobStatus, rtype)
   349  
    350  	// Get the launcher Pod for this MPIJob.
   351  	launcher, err := jc.getLauncherJob(mpiJob)
   352  	if err != nil {
   353  		return err
   354  	}
   355  
   356  	var worker []*corev1.Pod
   357  	// We're done if the launcher either succeeded or failed.
   358  	done := launcher != nil && isPodFinished(launcher)
   359  
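         	// While the launcher has not finished, make sure the dependent resources
         	// (ServiceAccount, ConfigMap, Role, RoleBinding, worker Pods and the
         	// launcher Pod itself) exist, creating them as needed.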
   360  	if !done {
   361  		workerSpec := mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeWorker]
   362  		workerReplicas := int32(0)
   363  		if workerSpec != nil && workerSpec.Replicas != nil {
   364  			workerReplicas = *workerSpec.Replicas
   365  		}
   366  		isGPULauncher := isGPULauncher(mpiJob)
   367  
   368  		// Get the launcher ServiceAccount for this MPIJob.
   369  		if sa, err := jc.getOrCreateLauncherServiceAccount(mpiJob); sa == nil || err != nil {
   370  			return err
   371  		}
   372  
   373  		// Get the ConfigMap for this MPIJob.
   374  		if config, err := jc.getOrCreateConfigMap(mpiJob, workerReplicas, isGPULauncher); config == nil || err != nil {
   375  			return err
   376  		}
   377  
   378  		// Get the launcher Role for this MPIJob.
   379  		if r, err := jc.getOrCreateLauncherRole(mpiJob, workerReplicas); r == nil || err != nil {
   380  			return err
   381  		}
   382  
   383  		// Get the launcher RoleBinding for this MPIJob.
   384  		if rb, err := jc.getLauncherRoleBinding(mpiJob); rb == nil || err != nil {
   385  			return err
   386  		}
   387  
   388  		worker, err = jc.getOrCreateWorker(mpiJob)
   389  		if err != nil {
   390  			return err
   391  		}
   392  
   393  		if launcher == nil {
   394  			launcher, err = jc.KubeClientSet.CoreV1().Pods(mpiJob.Namespace).Create(context.Background(), jc.newLauncher(mpiJob, ctlrconfig.Config.MPIKubectlDeliveryImage, isGPULauncher), metav1.CreateOptions{})
   395  			if err != nil {
    396  				jc.Recorder.Eventf(mpiJob, corev1.EventTypeWarning, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobFailedReason), "failed to create launcher pod: %v", err)
   397  				return err
   398  			} else {
    399  				jc.Recorder.Eventf(mpiJob, corev1.EventTypeNormal, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobRunningReason), "launcher pod created successfully: %v", launcher.Name)
   400  			}
   401  		}
   402  	}
   403  
   404  	// Finally, we update the status block of the MPIJob resource to reflect the
   405  	// current state of the world.
   406  	err = jc.updateMPIJobStatus(mpiJob, launcher, worker)
   407  	if err != nil {
   408  		return err
   409  	}
   410  	return nil
   411  }
   412  
   413  func (jc *MPIJobReconciler) updateMPIJobStatus(mpiJob *kubeflowv1.MPIJob, launcher *corev1.Pod, worker []*corev1.Pod) error {
   414  	if launcher != nil {
   415  		initializeMPIJobStatuses(mpiJob, kubeflowv1.MPIJobReplicaTypeLauncher)
   416  		if isPodSucceeded(launcher) {
   417  			mpiJob.Status.ReplicaStatuses[kubeflowv1.MPIJobReplicaTypeLauncher].Succeeded = 1
   418  			msg := fmt.Sprintf("MPIJob %s/%s successfully completed.", mpiJob.Namespace, mpiJob.Name)
    419  			jc.Recorder.Event(mpiJob, corev1.EventTypeNormal, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobSucceededReason), msg)
   420  			if mpiJob.Status.CompletionTime == nil {
   421  				now := metav1.Now()
   422  				mpiJob.Status.CompletionTime = &now
   423  			}
   424  			err := updateMPIJobConditions(mpiJob, kubeflowv1.JobSucceeded, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobSucceededReason), msg)
   425  			if err != nil {
   426  				return err
   427  			}
   428  		} else if isPodFailed(launcher) {
   429  			mpiJob.Status.ReplicaStatuses[kubeflowv1.MPIJobReplicaTypeLauncher].Failed = 1
   430  			msg := fmt.Sprintf("MPIJob %s/%s has failed", mpiJob.Namespace, mpiJob.Name)
   431  			reason := launcher.Status.Reason
   432  			if reason == "" {
   433  				reason = commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobFailedReason)
   434  			}
   435  			jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, reason, msg)
   436  			if reason == "Evicted" {
   437  				reason = mpiJobEvict
   438  			} else if !isEvicted(mpiJob.Status) && mpiJob.Status.CompletionTime == nil {
   439  				now := metav1.Now()
   440  				mpiJob.Status.CompletionTime = &now
   441  			}
   442  			err := updateMPIJobConditions(mpiJob, kubeflowv1.JobFailed, reason, msg)
   443  			if err != nil {
   444  				klog.Errorf("Append mpiJob(%s/%s) condition error: %v", mpiJob.Namespace, mpiJob.Name, err)
   445  				return err
   446  			}
   447  
   448  		} else if isPodRunning(launcher) {
   449  			mpiJob.Status.ReplicaStatuses[kubeflowv1.MPIJobReplicaTypeLauncher].Active = 1
   450  		}
   451  	}
   452  
   453  	var (
   454  		running = 0
   455  		evict   = 0
   456  	)
   457  
   458  	initializeMPIJobStatuses(mpiJob, kubeflowv1.MPIJobReplicaTypeWorker)
   459  	for i := 0; i < len(worker); i++ {
   460  		switch worker[i].Status.Phase {
   461  		case corev1.PodFailed:
   462  			mpiJob.Status.ReplicaStatuses[kubeflowv1.MPIJobReplicaTypeWorker].Failed += 1
   463  			if worker[i].Status.Reason == "Evicted" {
   464  				evict += 1
   465  			}
   466  		case corev1.PodSucceeded:
   467  			mpiJob.Status.ReplicaStatuses[kubeflowv1.MPIJobReplicaTypeWorker].Succeeded += 1
   468  		case corev1.PodRunning:
   469  			running += 1
   470  			mpiJob.Status.ReplicaStatuses[kubeflowv1.MPIJobReplicaTypeWorker].Active += 1
   471  		}
   472  	}
   473  	if evict > 0 {
   474  		msg := fmt.Sprintf("%d/%d workers are evicted", evict, len(worker))
   475  		if err := updateMPIJobConditions(mpiJob, kubeflowv1.JobFailed, mpiJobEvict, msg); err != nil {
   476  			return err
   477  		}
   478  		jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, mpiJobEvict, msg)
   479  	}
   480  
   481  	if launcher != nil && launcher.Status.Phase == corev1.PodRunning && running == len(worker) {
   482  		msg := fmt.Sprintf("MPIJob %s/%s is running.", mpiJob.Namespace, mpiJob.Name)
   483  		err := updateMPIJobConditions(mpiJob, kubeflowv1.JobRunning, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobRunningReason), msg)
   484  		if err != nil {
   485  			return err
   486  		}
   487  		jc.Recorder.Eventf(mpiJob, corev1.EventTypeNormal, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobRunningReason), "MPIJob %s/%s is running", mpiJob.Namespace, mpiJob.Name)
   488  	}
   489  	return nil
   490  }
   491  
   492  func (jc *MPIJobReconciler) GetJobFromAPIClient(namespace, name string) (metav1.Object, error) {
   493  	job := &kubeflowv1.MPIJob{}
   494  
   495  	err := jc.apiReader.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: name}, job)
   496  	if err != nil {
   497  		if errors.IsNotFound(err) {
    498  			logrus.Errorf("MPIJob %s/%s not found: %v", namespace, name, err)
    499  		} else {
    500  			logrus.Errorf("failed to get MPIJob %s/%s from api-server: %v", namespace, name, err)
   501  		}
   502  		return nil, err
   503  	}
   504  	return job, nil
   505  }
   506  
   507  // GetPodsForJob returns the set of pods that this job should manage.
   508  // It also reconciles ControllerRef by adopting/orphaning.
   509  // Note that the returned Pods are pointers into the cache.
   510  func (jc *MPIJobReconciler) GetPodsForJob(jobObject interface{}) ([]*corev1.Pod, error) {
   511  	job, ok := jobObject.(metav1.Object)
   512  	if !ok {
   513  		return nil, fmt.Errorf("job is not of type metav1.Object")
   514  	}
   515  
   516  	// Create selector.
   517  	selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{
   518  		MatchLabels: jc.GenLabels(job.GetName()),
   519  	})
   520  
   521  	if err != nil {
   522  		return nil, fmt.Errorf("couldn't convert Job selector: %v", err)
   523  	}
   524  	// List all pods to include those that don't match the selector anymore
   525  	// but have a ControllerRef pointing to this controller.
   526  	podlist := &corev1.PodList{}
   527  	err = jc.List(context.Background(), podlist,
   528  		client.MatchingLabelsSelector{Selector: selector}, client.InNamespace(job.GetNamespace()))
   529  	if err != nil {
   530  		return nil, err
   531  	}
   532  
   533  	return util.JobControlledPodList(podlist.Items, job), nil
   534  }
   535  
   536  func (jc *MPIJobReconciler) DeleteJob(job interface{}) error {
   537  	mpiJob, ok := job.(*kubeflowv1.MPIJob)
   538  	if !ok {
    539  		return fmt.Errorf("%+v is not a type of MPIJob", job)
   540  	}
   541  
   542  	log := commonutil.LoggerForJob(mpiJob)
   543  	if err := jc.Delete(context.Background(), mpiJob); err != nil {
   544  		jc.Recorder.Eventf(mpiJob, corev1.EventTypeWarning, FailedDeleteJobReason, "Error deleting: %v", err)
   545  		log.Errorf("failed to delete job %s/%s, %v", mpiJob.Namespace, mpiJob.Name, err)
   546  		return err
   547  	}
   548  
   549  	jc.Recorder.Eventf(mpiJob, corev1.EventTypeNormal, SuccessfulDeleteJobReason, "Deleted job: %v", mpiJob.Name)
   550  	log.Infof("job %s/%s has been deleted", mpiJob.Namespace, mpiJob.Name)
   551  	trainingoperatorcommon.DeletedJobsCounterInc(mpiJob.Namespace, jc.GetFrameworkName())
   552  	return nil
   553  }
   554  
   555  // GetServicesForJob returns the set of services that this job should manage.
   556  // It also reconciles ControllerRef by adopting/orphaning.
   557  // Note that the returned services are pointers into the cache.
   558  func (jc *MPIJobReconciler) GetServicesForJob(jobObject interface{}) ([]*corev1.Service, error) {
   559  	return nil, nil
   560  }
   561  
   562  func (jc *MPIJobReconciler) UpdateJobStatus(job interface{}, replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, jobStatus *kubeflowv1.JobStatus) error {
   563  	mpiJob, ok := job.(*kubeflowv1.MPIJob)
   564  	if !ok {
   565  		return fmt.Errorf("%+v is not a type of MPIJob", job)
   566  	}
   567  
   568  	for rtype, spec := range replicas {
   569  		status := jobStatus.ReplicaStatuses[rtype]
   570  
   571  		succeeded := status.Succeeded
   572  		expected := *(spec.Replicas) - succeeded
   573  		running := status.Active
   574  		failed := status.Failed
   575  
    576  		logrus.Infof("MPIJob=%s, ReplicaType=%s expected=%d, running=%d, succeeded=%d, failed=%d",
   577  			mpiJob.Name, rtype, expected, running, succeeded, failed)
   578  
   579  		if rtype == kubeflowv1.MPIJobReplicaTypeLauncher {
   580  			if running > 0 {
   581  				msg := fmt.Sprintf("MPIJob %s is running.", mpiJob.Name)
   582  				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobRunningReason), msg)
   583  			}
    584  			// when the launcher has succeeded, the job is finished.
   585  			if expected == 0 {
   586  				msg := fmt.Sprintf("MPIJob %s is successfully completed.", mpiJob.Name)
   587  				logrus.Info(msg)
   588  				jc.Recorder.Event(mpiJob, corev1.EventTypeNormal, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobSucceededReason), msg)
   589  				if jobStatus.CompletionTime == nil {
   590  					now := metav1.Now()
   591  					jobStatus.CompletionTime = &now
   592  				}
   593  				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobSucceeded, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobSucceededReason), msg)
   594  				trainingoperatorcommon.SuccessfulJobsCounterInc(mpiJob.Namespace, jc.GetFrameworkName())
   595  				return nil
   596  			}
   597  		}
   598  		if failed > 0 {
   599  			if spec.RestartPolicy == kubeflowv1.RestartPolicyExitCode {
   600  				msg := fmt.Sprintf("MPIJob %s is restarting because %d %s replica(s) failed.", mpiJob.Name, failed, rtype)
   601  				jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobRestartingReason), msg)
   602  				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRestarting, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobRestartingReason), msg)
   603  				trainingoperatorcommon.RestartedJobsCounterInc(mpiJob.Namespace, jc.GetFrameworkName())
   604  			} else {
    605  				msg := fmt.Sprintf("MPIJob %s has failed because %d %s replica(s) failed.", mpiJob.Name, failed, rtype)
    606  				jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobFailedReason), msg)
   607  				if jobStatus.CompletionTime == nil {
   608  					now := metav1.Now()
   609  					jobStatus.CompletionTime = &now
   610  				}
    611  				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobFailed, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobFailedReason), msg)
   612  				trainingoperatorcommon.FailedJobsCounterInc(mpiJob.Namespace, jc.GetFrameworkName())
   613  			}
   614  		}
   615  	}
   616  	mpiJob.Status = *jobStatus.DeepCopy()
   617  	return nil
   618  }
   619  
   620  func (jc *MPIJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *kubeflowv1.JobStatus) error {
   621  	if jobStatus.ReplicaStatuses == nil {
   622  		jobStatus.ReplicaStatuses = map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaStatus{}
   623  	}
   624  
    625  	mpiJob, ok := job.(*kubeflowv1.MPIJob)
    626  	if !ok {
    627  		return fmt.Errorf("%+v is not a type of MPIJob", job)
    628  	}
    629  	trainingoperatorcommon.ClearGeneratedFields(&mpiJob.ObjectMeta)
   630  
   631  	startTime := time.Now()
   632  	logger := commonutil.LoggerForJob(mpiJob)
   633  	defer func() {
    634  		logger.Infof("Finished updating MPIJob %q status (%v)",
   635  			mpiJob.Name, time.Since(startTime))
   636  	}()
   637  
   638  	mpiJob = mpiJob.DeepCopy()
   639  	mpiJob.Status = *jobStatus.DeepCopy()
   640  
   641  	result := jc.Status().Update(context.Background(), mpiJob)
   642  
    643  	if result != nil {
    644  		jc.Log.WithValues("mpijob", types.NamespacedName{
    645  			Namespace: mpiJob.GetNamespace(),
    646  			Name:      mpiJob.GetName(),
    647  		}).Error(result, "failed to update MPIJob status")
    648  		return result
    649  	}
   650  
   651  	return nil
   652  }
   653  
    654  // getLauncherJob gets the launcher Pod controlled by this MPIJob.
   655  func (jc *MPIJobReconciler) getLauncherJob(mpiJob *kubeflowv1.MPIJob) (*corev1.Pod, error) {
   656  	launcher := &corev1.Pod{}
   657  	NamespacedName := types.NamespacedName{Namespace: mpiJob.Namespace, Name: mpiJob.Name + launcherSuffix}
   658  	err := jc.Get(context.Background(), NamespacedName, launcher)
   659  	if errors.IsNotFound(err) {
   660  		return nil, nil
   661  	}
   662  	if err != nil {
   663  		// If an error occurs during Get, we'll requeue the item so we can
   664  		// attempt processing again later. This could have been caused by a
   665  		// temporary network failure, or any other transient reason.
   666  		return nil, err
   667  	}
   668  
   669  	// If the launcher is not controlled by this MPIJob resource, we should log
   670  	// a warning to the event recorder and return.
   671  	if !metav1.IsControlledBy(launcher, mpiJob) {
   672  		msg := fmt.Sprintf(MessageResourceExists, launcher.Name, launcher.Kind)
   673  		jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, ErrResourceExists, msg)
    674  		return launcher, fmt.Errorf("%s", msg)
   675  	}
   676  	return launcher, nil
   677  }
   678  
   679  // getOrCreateConfigMap gets the ConfigMap controlled by this MPIJob, or creates
   680  // one if it doesn't exist.
   681  func (jc *MPIJobReconciler) getOrCreateConfigMap(mpiJob *kubeflowv1.MPIJob, workerReplicas int32, isGPULauncher bool) (*corev1.ConfigMap, error) {
   682  	newCM := newConfigMap(mpiJob, workerReplicas, isGPULauncher)
   683  	podList, err := jc.getRunningWorkerPods(mpiJob)
   684  	if err != nil {
   685  		return nil, err
   686  	}
   687  	updateDiscoverHostsInConfigMap(newCM, mpiJob, podList, isGPULauncher)
   688  
   689  	cm := &corev1.ConfigMap{}
   690  	NamespacedName := types.NamespacedName{Namespace: mpiJob.Namespace, Name: mpiJob.Name + configSuffix}
   691  	err = jc.Get(context.Background(), NamespacedName, cm)
   692  
   693  	// If the ConfigMap doesn't exist, we'll create it.
   694  	if errors.IsNotFound(err) {
   695  		cm, err = jc.KubeClientSet.CoreV1().ConfigMaps(mpiJob.Namespace).Create(context.Background(), newCM, metav1.CreateOptions{})
   696  	}
   697  	// If an error occurs during Get/Create, we'll requeue the item so we
   698  	// can attempt processing again later. This could have been caused by a
   699  	// temporary network failure, or any other transient reason.
   700  	if err != nil {
   701  		return nil, err
   702  	}
   703  
   704  	// If the ConfigMap is not controlled by this MPIJob resource, we
   705  	// should log a warning to the event recorder and return.
   706  	if !metav1.IsControlledBy(cm, mpiJob) {
   707  		msg := fmt.Sprintf(MessageResourceExists, cm.Name, cm.Kind)
   708  		jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, ErrResourceExists, msg)
    709  		return nil, fmt.Errorf("%s", msg)
   710  	}
   711  
   712  	// If the ConfigMap is changed, update it
   713  	if !reflect.DeepEqual(cm.Data, newCM.Data) {
   714  		cm, err = jc.KubeClientSet.CoreV1().ConfigMaps(mpiJob.Namespace).Update(context.Background(), newCM, metav1.UpdateOptions{})
   715  		if err != nil {
   716  			return nil, err
   717  		}
   718  	}
   719  
   720  	return cm, nil
   721  }
   722  
   723  // getOrCreateLauncherServiceAccount gets the launcher ServiceAccount controlled
   724  // by this MPIJob, or creates one if it doesn't exist.
   725  func (jc *MPIJobReconciler) getOrCreateLauncherServiceAccount(mpiJob *kubeflowv1.MPIJob) (*corev1.ServiceAccount, error) {
   726  
   727  	sa := &corev1.ServiceAccount{}
   728  	NamespacedName := types.NamespacedName{Namespace: mpiJob.Namespace, Name: mpiJob.Name + launcherSuffix}
   729  	err := jc.Get(context.Background(), NamespacedName, sa)
   730  
   731  	if err == nil {
    732  		jc.Recorder.Eventf(mpiJob, corev1.EventTypeNormal, "ServiceAccountExists", "ServiceAccount: %v", sa.Name)
   733  	}
   734  
   735  	if errors.IsNotFound(err) {
   736  		sa, err = jc.KubeClientSet.CoreV1().ServiceAccounts(mpiJob.Namespace).Create(context.Background(), newLauncherServiceAccount(mpiJob), metav1.CreateOptions{})
   737  	}
   738  	// If an error occurs during Get/Create, we'll requeue the item so we
   739  	// can attempt processing again later. This could have been caused by a
   740  	// temporary network failure, or any other transient reason.
   741  	if err != nil {
   742  		return nil, err
   743  	}
   744  	// If the launcher ServiceAccount is not controlled by this MPIJob resource, we
   745  	// should log a warning to the event recorder and return.
   746  	if !metav1.IsControlledBy(sa, mpiJob) {
   747  		msg := fmt.Sprintf(MessageResourceExists, sa.Name, sa.Kind)
   748  		jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, ErrResourceExists, msg)
    749  		return nil, fmt.Errorf("%s", msg)
   750  	}
   751  
   752  	return sa, nil
   753  }
   754  
    755  // getOrCreateLauncherRole gets the launcher Role controlled by this MPIJob, or creates one if it doesn't exist.
   756  func (jc *MPIJobReconciler) getOrCreateLauncherRole(mpiJob *kubeflowv1.MPIJob, workerReplicas int32) (*rbacv1.Role, error) {
   757  	role := &rbacv1.Role{}
   758  	NamespacedName := types.NamespacedName{Namespace: mpiJob.Namespace, Name: mpiJob.Name + launcherSuffix}
   759  	err := jc.Get(context.Background(), NamespacedName, role)
   760  
   761  	if err == nil {
    762  		jc.Recorder.Eventf(mpiJob, corev1.EventTypeNormal, "LauncherRoleExists", "LauncherRole: %v", role.Name)
   763  	}
   764  
   765  	launcherRole := newLauncherRole(mpiJob, workerReplicas)
   766  	// If the Role doesn't exist, we'll create it.
   767  	if errors.IsNotFound(err) {
   768  		role, err = jc.KubeClientSet.RbacV1().Roles(mpiJob.Namespace).Create(context.Background(), launcherRole, metav1.CreateOptions{})
   769  	}
   770  	// If an error occurs during Get/Create, we'll requeue the item so we
   771  	// can attempt processing again later. This could have been caused by a
   772  	// temporary network failure, or any other transient reason.
   773  	if err != nil {
   774  		return nil, err
   775  	}
   776  	// If the launcher Role is not controlled by this MPIJob resource, we
   777  	// should log a warning to the event recorder and return.
   778  	if !metav1.IsControlledBy(role, mpiJob) {
   779  		msg := fmt.Sprintf(MessageResourceExists, role.Name, role.Kind)
   780  		jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, ErrResourceExists, msg)
    781  		return nil, fmt.Errorf("%s", msg)
   782  	}
   783  
   784  	if !reflect.DeepEqual(role.Rules, launcherRole.Rules) {
   785  		role, err = jc.KubeClientSet.RbacV1().Roles(mpiJob.Namespace).Update(context.Background(), launcherRole, metav1.UpdateOptions{})
   786  		if err != nil {
   787  			return nil, err
   788  		}
   789  	}
   790  
   791  	return role, nil
   792  }
   793  
   794  // getLauncherRoleBinding gets the launcher RoleBinding controlled by this
   795  // MPIJob, or creates one if it doesn't exist.
   796  func (jc *MPIJobReconciler) getLauncherRoleBinding(mpiJob *kubeflowv1.MPIJob) (*rbacv1.RoleBinding, error) {
   797  	rb := &rbacv1.RoleBinding{}
   798  	NamespacedName := types.NamespacedName{Namespace: mpiJob.Namespace, Name: mpiJob.Name + launcherSuffix}
   799  	err := jc.Get(context.Background(), NamespacedName, rb)
    800  
    801  	if err == nil {
    802  		jc.Recorder.Eventf(mpiJob, corev1.EventTypeNormal, "RoleBindingExists", "RoleBinding: %v", rb.Name)
    803  	}
    804  
    805  	// If the RoleBinding doesn't exist, we'll create it.
   806  	if errors.IsNotFound(err) {
   807  		rb, err = jc.KubeClientSet.RbacV1().RoleBindings(mpiJob.Namespace).Create(context.Background(), newLauncherRoleBinding(mpiJob), metav1.CreateOptions{})
   808  	}
   809  	// If an error occurs during Get/Create, we'll requeue the item so we
   810  	// can attempt processing again later. This could have been caused by a
   811  	// temporary network failure, or any other transient reason.
   812  	if err != nil {
   813  		return nil, err
   814  	}
   815  	// If the launcher RoleBinding is not controlled by this MPIJob resource, we
   816  	// should log a warning to the event recorder and return.
   817  	if !metav1.IsControlledBy(rb, mpiJob) {
   818  		msg := fmt.Sprintf(MessageResourceExists, rb.Name, rb.Kind)
   819  		jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, ErrResourceExists, msg)
    820  		return nil, fmt.Errorf("%s", msg)
   821  	}
   822  
   823  	return rb, nil
   824  }
   825  
    826  // getOrCreateWorker gets the worker Pods controlled by this
    827  // MPIJob, or creates them if they don't exist.
   828  func (jc *MPIJobReconciler) getOrCreateWorker(mpiJob *kubeflowv1.MPIJob) ([]*corev1.Pod, error) {
   829  	var (
   830  		workerPrefix   string        = mpiJob.Name + workerSuffix
   831  		workerPods     []*corev1.Pod = []*corev1.Pod{}
   832  		i              int32         = 0
   833  		workerReplicas *int32
   834  	)
   835  	if worker, ok := mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeWorker]; ok && worker != nil {
   836  		workerReplicas = worker.Replicas
   837  	} else {
   838  		return workerPods, nil
   839  	}
   840  
   841  	// Remove Pods when replicas are scaled down
   842  	genericLabels := jc.GenLabels(mpiJob.GetName())
   843  	selector, err := workerSelector(genericLabels)
   844  	if err != nil {
   845  		return nil, err
   846  	}
   847  
   848  	podlist := &corev1.PodList{}
   849  	err = jc.List(context.Background(), podlist, client.MatchingLabelsSelector{Selector: selector}, client.InNamespace(mpiJob.GetNamespace()))
   850  
   851  	if err != nil {
   852  		return nil, err
   853  	}
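         	// If more worker Pods exist than desired, delete those whose replica
         	// index is at or above the desired replica count.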
   854  	if len(podlist.Items) > int(*workerReplicas) {
   855  		for _, pod := range podlist.Items {
   856  			indexStr, ok := pod.Labels[kubeflowv1.ReplicaIndexLabel]
    857  			if !ok {
    858  				return nil, fmt.Errorf("pod %s/%s is missing the label %s", pod.Namespace, pod.Name, kubeflowv1.ReplicaIndexLabel)
    859  			}
   860  			index, err := strconv.Atoi(indexStr)
   861  			if err == nil {
   862  				if index >= int(*workerReplicas) {
   863  					err = jc.KubeClientSet.CoreV1().Pods(pod.Namespace).Delete(context.Background(), pod.Name, metav1.DeleteOptions{})
   864  					if err != nil {
   865  						return nil, err
   866  					}
   867  				}
   868  			}
   869  		}
   870  	}
   871  
   872  	for ; i < *workerReplicas; i++ {
   873  		name := fmt.Sprintf("%s-%d", workerPrefix, i)
   874  
   875  		pod := &corev1.Pod{}
   876  		NamespacedName := types.NamespacedName{Namespace: mpiJob.Namespace, Name: name}
   877  		err := jc.Get(context.Background(), NamespacedName, pod)
   878  
   879  		// If the worker Pod doesn't exist, we'll create it.
   880  		if errors.IsNotFound(err) {
   881  			worker := jc.newWorker(mpiJob, name)
   882  			if worker == nil {
   883  				msg := fmt.Sprintf(MessageResourceDoesNotExist, "Worker")
   884  				jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, ErrResourceDoesNotExist, msg)
    885  				err = fmt.Errorf("%s", msg)
   886  				return nil, err
   887  			}
   888  			// Insert ReplicaIndexLabel
   889  			worker.Labels[kubeflowv1.ReplicaIndexLabel] = strconv.Itoa(int(i))
   890  			pod, err = jc.KubeClientSet.CoreV1().Pods(mpiJob.Namespace).Create(context.Background(), worker, metav1.CreateOptions{})
   891  			if err == nil {
   892  				jc.Recorder.Eventf(mpiJob, corev1.EventTypeNormal, "SuccessfulCreatePod", "Created worker pod: %v", pod.Name)
   893  			} else {
    894  				jc.Recorder.Eventf(mpiJob, corev1.EventTypeWarning, "FailedCreatePod", "Failed to create worker pod: %v", err)
   895  			}
   896  		}
   897  
   898  		// If an error occurs during Get/Create, we'll requeue the item so we
   899  		// can attempt processing again later. This could have been caused by a
   900  		// temporary network failure, or any other transient reason.
   901  		if err != nil && !errors.IsNotFound(err) {
   902  			jc.Recorder.Eventf(mpiJob, corev1.EventTypeWarning, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobFailedReason),
   903  				"worker pod created failed: %v", err)
   904  			return nil, err
   905  		}
   906  		// If the worker is not controlled by this MPIJob resource, we should log
   907  		// a warning to the event recorder and return.
   908  		if pod != nil && !metav1.IsControlledBy(pod, mpiJob) {
   909  			msg := fmt.Sprintf(MessageResourceExists, pod.Name, pod.Kind)
   910  			jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, ErrResourceExists, msg)
    911  			return nil, fmt.Errorf("%s", msg)
   912  		}
   913  		workerPods = append(workerPods, pod)
   914  	}
   915  
   916  	return workerPods, nil
   917  }
   918  
   919  // newWorker creates a new worker Pod for an MPIJob resource. It also
   920  // sets the appropriate OwnerReferences on the resource so handleObject can
   921  // discover the MPIJob resource that 'owns' it.
   922  func (jc *MPIJobReconciler) newWorker(mpiJob *kubeflowv1.MPIJob, name string) *corev1.Pod {
   923  	genericLabels := jc.GenLabels(mpiJob.GetName())
   924  	labels := defaultWorkerLabels(genericLabels)
   925  
   926  	podSpec := mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeWorker].Template.DeepCopy()
   927  
   928  	// keep the labels which are set in PodTemplate
   929  	if len(podSpec.Labels) == 0 {
   930  		podSpec.Labels = make(map[string]string)
   931  	}
   932  
   933  	for key, value := range labels {
   934  		podSpec.Labels[key] = value
   935  	}
   936  	setRestartPolicy(podSpec, mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeWorker])
    937  	logger := commonutil.LoggerForReplica(mpiJob, strings.ToLower(string(kubeflowv1.MPIJobReplicaTypeWorker)))
   938  	if len(podSpec.Spec.Containers) == 0 {
   939  		klog.Errorln("Worker pod does not have any containers in its spec")
   940  		return nil
   941  	}
   942  	container := podSpec.Spec.Containers[0]
   943  	if len(container.Command) == 0 {
   944  		container.Command = []string{"sleep"}
   945  		container.Args = []string{"365d"}
   946  	}
   947  
   948  	// We need the kubexec.sh script here because Open MPI checks for the path
   949  	// in every rank.
   950  	container.VolumeMounts = append(container.VolumeMounts, corev1.VolumeMount{
   951  		Name:      configVolumeName,
   952  		MountPath: configMountPath,
   953  	})
   954  	podSpec.Spec.Containers[0] = container
   955  
   956  	scriptMode := int32(0555)
   957  	podSpec.Spec.Volumes = append(podSpec.Spec.Volumes, corev1.Volume{
   958  		Name: configVolumeName,
   959  		VolumeSource: corev1.VolumeSource{
   960  			ConfigMap: &corev1.ConfigMapVolumeSource{
   961  				LocalObjectReference: corev1.LocalObjectReference{
   962  					Name: mpiJob.Name + configSuffix,
   963  				},
   964  				Items: []corev1.KeyToPath{
   965  					{
   966  						Key:  kubexecScriptName,
   967  						Path: kubexecScriptName,
   968  						Mode: &scriptMode,
   969  					},
   970  				},
   971  			},
   972  		},
   973  	})
   974  
   975  	// if gang-scheduling is enabled:
    976  	// 1. if the user has specified another scheduler, we report a warning without overriding any fields.
    977  	// 2. if no SchedulerName is set for the pods, we set it to the configured gang scheduler.
   978  	if jc.Config.EnableGangScheduling() {
   979  		if !util.IsGangSchedulerSet(mpiJob.Spec.MPIReplicaSpecs, jc.PodGroupControl.GetSchedulerName()) {
   980  			errMsg := "Another scheduler is specified when gang-scheduling is enabled and it will not be overwritten"
   981  			logger.Warning(errMsg)
   982  			jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, podTemplateSchedulerNameReason, errMsg)
   983  		}
   984  
   985  		rtWorker := strings.ToLower(string(kubeflowv1.MPIJobReplicaTypeWorker))
   986  		jc.PodGroupControl.DecoratePodTemplateSpec(podSpec, mpiJob, rtWorker)
   987  	}
   988  
   989  	return &corev1.Pod{
   990  		ObjectMeta: metav1.ObjectMeta{
   991  			Name:        name,
   992  			Namespace:   mpiJob.Namespace,
   993  			Labels:      podSpec.Labels,
   994  			Annotations: podSpec.Annotations,
   995  			OwnerReferences: []metav1.OwnerReference{
   996  				*metav1.NewControllerRef(mpiJob, kubeflowv1.MPIJobSchemeGroupVersionKind),
   997  			},
   998  		},
   999  		Spec: podSpec.Spec,
  1000  	}
  1001  }
  1002  
   1003  // newLauncher creates a new launcher Pod for an MPIJob resource. It also sets
  1004  // the appropriate OwnerReferences on the resource so handleObject can discover
  1005  // the MPIJob resource that 'owns' it.
  1006  func (jc *MPIJobReconciler) newLauncher(mpiJob *kubeflowv1.MPIJob, kubectlDeliveryImage string, isGPULauncher bool) *corev1.Pod {
  1007  	launcherName := mpiJob.Name + launcherSuffix
  1008  
  1009  	genericLabels := jc.GenLabels(mpiJob.GetName())
  1010  	labels := defaultLauncherLabels(genericLabels)
  1011  
  1012  	masterRole := jc.IsMasterRole(mpiJob.Spec.MPIReplicaSpecs, kubeflowv1.MPIJobReplicaTypeLauncher, 0)
  1013  	if masterRole {
  1014  		labels[kubeflowv1.JobRoleLabel] = "master"
  1015  	}
  1016  	podSpec := mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeLauncher].Template.DeepCopy()
   1017  	// copy the labels and annotations from the PodTemplate to the pod
  1018  	if len(podSpec.Labels) == 0 {
  1019  		podSpec.Labels = make(map[string]string)
  1020  	}
  1021  	for key, value := range labels {
  1022  		podSpec.Labels[key] = value
  1023  	}
  1024  
  1025  	logger := commonutil.LoggerForReplica(mpiJob, strings.ToLower(string(kubeflowv1.MPIJobReplicaTypeLauncher)))
  1026  	// add SchedulerName to podSpec
  1027  	if jc.Config.EnableGangScheduling() {
  1028  		if !util.IsGangSchedulerSet(mpiJob.Spec.MPIReplicaSpecs, jc.PodGroupControl.GetSchedulerName()) {
  1029  			errMsg := "Another scheduler is specified when gang-scheduling is enabled and it will not be overwritten"
  1030  			logger.Warning(errMsg)
  1031  			jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, podTemplateSchedulerNameReason, errMsg)
  1032  		}
  1033  
  1034  		rt := strings.ToLower(string(kubeflowv1.MPIJobReplicaTypeLauncher))
  1035  		jc.PodGroupControl.DecoratePodTemplateSpec(podSpec, mpiJob, rt)
  1036  	}
  1037  
  1038  	podSpec.Spec.ServiceAccountName = launcherName
  1039  	podSpec.Spec.InitContainers = append(podSpec.Spec.InitContainers, corev1.Container{
  1040  		Name:            kubectlDeliveryName,
  1041  		Image:           kubectlDeliveryImage,
  1042  		ImagePullPolicy: corev1.PullIfNotPresent,
  1043  		Env: []corev1.EnvVar{
  1044  			{
  1045  				Name:  kubectlTargetDirEnv,
  1046  				Value: kubectlMountPath,
  1047  			},
  1048  			{
  1049  				Name:  "NAMESPACE",
  1050  				Value: mpiJob.Namespace,
  1051  			},
  1052  		},
  1053  		VolumeMounts: []corev1.VolumeMount{
  1054  			{
  1055  				Name:      kubectlVolumeName,
  1056  				MountPath: kubectlMountPath,
  1057  			},
  1058  			{
  1059  				Name:      configVolumeName,
  1060  				MountPath: configMountPath,
  1061  			},
  1062  		},
  1063  		Resources: corev1.ResourceRequirements{
  1064  			Limits: corev1.ResourceList{
  1065  				corev1.ResourceCPU:              resource.MustParse(initContainerCpu),
  1066  				corev1.ResourceMemory:           resource.MustParse(initContainerMem),
  1067  				corev1.ResourceEphemeralStorage: resource.MustParse(initContainerEphStorage),
  1068  			},
  1069  			Requests: corev1.ResourceList{
  1070  				corev1.ResourceCPU:              resource.MustParse(initContainerCpu),
  1071  				corev1.ResourceMemory:           resource.MustParse(initContainerMem),
  1072  				corev1.ResourceEphemeralStorage: resource.MustParse(initContainerEphStorage),
  1073  			},
  1074  		},
  1075  	})
  1076  	if len(podSpec.Spec.Containers) == 0 {
  1077  		klog.Errorln("Launcher pod does not have any containers in its spec")
  1078  		msg := fmt.Sprintf(MessageResourceDoesNotExist, "Launcher")
  1079  		jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, ErrResourceDoesNotExist, msg)
  1080  		return nil
  1081  	}
  1082  	container := podSpec.Spec.Containers[0]
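         	// OMPI_MCA_plm_rsh_agent makes Open MPI launch remote ranks through the
         	// generated kubexec.sh (kubectl exec) instead of ssh, and
         	// OMPI_MCA_orte_default_hostfile points it at the generated hostfile.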
  1083  	container.Env = append(container.Env,
  1084  		corev1.EnvVar{
  1085  			Name:  "OMPI_MCA_plm_rsh_agent",
  1086  			Value: fmt.Sprintf("%s/%s", configMountPath, kubexecScriptName),
  1087  		},
  1088  		corev1.EnvVar{
  1089  			Name:  "OMPI_MCA_orte_default_hostfile",
  1090  			Value: fmt.Sprintf("%s/%s", configMountPath, hostfileName),
  1091  		},
  1092  	)
  1093  
  1094  	if !isGPULauncher {
  1095  		container.Env = append(container.Env,
  1096  			// We overwrite these environment variables so that users will not
  1097  			// be mistakenly using GPU resources for launcher due to potential
  1098  			// issues with scheduler/container technologies.
  1099  			corev1.EnvVar{
  1100  				Name:  "NVIDIA_VISIBLE_DEVICES",
  1101  				Value: "",
  1102  			},
  1103  			corev1.EnvVar{
  1104  				Name:  "NVIDIA_DRIVER_CAPABILITIES",
  1105  				Value: "",
  1106  			})
  1107  	}
  1108  
  1109  	// Add default Intel MPI bootstrap variables if not provided by the user.
  1110  	bootstrap, exec := hasIntelMPIBootstrapValues(container.Env)
  1111  	if !bootstrap {
  1112  		container.Env = append(container.Env,
  1113  			corev1.EnvVar{
  1114  				Name:  "I_MPI_HYDRA_BOOTSTRAP",
  1115  				Value: iMPIDefaultBootstrap,
  1116  			},
  1117  		)
  1118  	}
  1119  	if !exec {
  1120  		container.Env = append(container.Env,
  1121  			corev1.EnvVar{
  1122  				Name:  "I_MPI_HYDRA_BOOTSTRAP_EXEC",
  1123  				Value: fmt.Sprintf("%s/%s", configMountPath, kubexecScriptName),
  1124  			},
  1125  		)
  1126  	}
  1127  
  1128  	container.VolumeMounts = append(container.VolumeMounts,
  1129  		corev1.VolumeMount{
  1130  			Name:      kubectlVolumeName,
  1131  			MountPath: kubectlMountPath,
  1132  		},
  1133  		corev1.VolumeMount{
  1134  			Name:      configVolumeName,
  1135  			MountPath: configMountPath,
  1136  		})
  1137  	podSpec.Spec.Containers[0] = container
  1138  
   1139  	// Submit a warning event if the user specifies a restart policy for
   1140  	// the pod template. We recommend setting it at the replica level.
  1141  	if podSpec.Spec.RestartPolicy != corev1.RestartPolicy("") {
  1142  		errMsg := "Restart policy in pod template will be overwritten by restart policy in replica spec"
  1143  		klog.Warning(errMsg)
  1144  		jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, podTemplateRestartPolicyReason, errMsg)
  1145  	}
  1146  	setRestartPolicy(podSpec, mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeLauncher])
  1147  
  1148  	scriptsMode := int32(0555)
  1149  	hostfileMode := int32(0444)
  1150  	podSpec.Spec.Volumes = append(podSpec.Spec.Volumes,
  1151  		corev1.Volume{
  1152  			Name: kubectlVolumeName,
  1153  			VolumeSource: corev1.VolumeSource{
  1154  				EmptyDir: &corev1.EmptyDirVolumeSource{},
  1155  			},
  1156  		},
  1157  		corev1.Volume{
  1158  			Name: configVolumeName,
  1159  			VolumeSource: corev1.VolumeSource{
  1160  				ConfigMap: &corev1.ConfigMapVolumeSource{
  1161  					LocalObjectReference: corev1.LocalObjectReference{
  1162  						Name: mpiJob.Name + configSuffix,
  1163  					},
  1164  					Items: []corev1.KeyToPath{
  1165  						{
  1166  							Key:  kubexecScriptName,
  1167  							Path: kubexecScriptName,
  1168  							Mode: &scriptsMode,
  1169  						},
  1170  						{
  1171  							Key:  hostfileName,
  1172  							Path: hostfileName,
  1173  							Mode: &hostfileMode,
  1174  						},
  1175  						{
  1176  							Key:  discoverHostsScriptName,
  1177  							Path: discoverHostsScriptName,
  1178  							Mode: &scriptsMode,
  1179  						},
  1180  					},
  1181  				},
  1182  			},
  1183  		})
  1184  	return &corev1.Pod{
  1185  		ObjectMeta: metav1.ObjectMeta{
  1186  			Name:        launcherName,
  1187  			Namespace:   mpiJob.Namespace,
  1188  			Labels:      podSpec.Labels,
  1189  			Annotations: podSpec.Annotations,
  1190  			OwnerReferences: []metav1.OwnerReference{
  1191  				*metav1.NewControllerRef(mpiJob, kubeflowv1.MPIJobSchemeGroupVersionKind),
  1192  			},
  1193  		},
  1194  		Spec: podSpec.Spec,
  1195  	}
  1196  }
  1197  
   1198  // getRunningWorkerPods gets all worker Pods in the Running phase controlled by this MPIJob.
  1199  func (jc *MPIJobReconciler) getRunningWorkerPods(mpiJob *kubeflowv1.MPIJob) ([]*corev1.Pod, error) {
  1200  	genericLabels := jc.GenLabels(mpiJob.GetName())
  1201  	selector, err := workerSelector(genericLabels)
  1202  	if err != nil {
  1203  		return nil, err
  1204  	}
  1205  
  1206  	podFullList := &corev1.PodList{}
  1207  	err = jc.List(context.Background(), podFullList, client.MatchingLabelsSelector{Selector: selector}, client.InNamespace(mpiJob.GetNamespace()))
  1209  	if err != nil {
  1210  		return nil, err
  1211  	}
  1212  	// Only running Pods should be included within the `discover_hosts.sh` script.
  1213  	var podList []corev1.Pod
  1214  	for idx, pod := range podFullList.Items {
  1215  		if pod.Status.Phase == corev1.PodRunning {
  1216  			podList = append(podList, podFullList.Items[idx])
  1217  		}
  1218  	}
  1219  	return util.JobControlledPodList(podList, mpiJob), nil
  1220  }
  1221  
  1222  // newConfigMap creates a new ConfigMap containing configurations for an MPIJob
  1223  // resource. It also sets the appropriate OwnerReferences on the resource so
  1224  // handleObject can discover the MPIJob resource that 'owns' it.
  1225  func newConfigMap(mpiJob *kubeflowv1.MPIJob, workerReplicas int32, isGPULauncher bool) *corev1.ConfigMap {
  1226  	kubexec := fmt.Sprintf(`#!/bin/sh
  1227  set -x
  1228  POD_NAME=$1
  1229  shift
  1230  %s/kubectl exec ${POD_NAME}`, kubectlMountPath)
  1231  	if len(mpiJob.Spec.MainContainer) > 0 {
  1232  		kubexec = fmt.Sprintf("%s --container %s", kubexec, mpiJob.Spec.MainContainer)
  1233  	}
  1234  	kubexec = fmt.Sprintf("%s -- /bin/sh -c \"$*\"", kubexec)
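         	// The generated kubexec.sh stands in for ssh when MPI launches remote
         	// processes: it takes a Pod name as its first argument and runs the rest
         	// of the command inside that Pod via "kubectl exec". Roughly (the exact
         	// path comes from kubectlMountPath):
         	//
         	//   #!/bin/sh
         	//   set -x
         	//   POD_NAME=$1
         	//   shift
         	//   <kubectlMountPath>/kubectl exec ${POD_NAME} [--container <main>] -- /bin/sh -c "$*"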
  1235  
  1236  	// If no processing unit is specified, default to 1 slot.
  1237  	slots := 1
  1238  	if mpiJob.Spec.SlotsPerWorker != nil {
  1239  		slots = int(*mpiJob.Spec.SlotsPerWorker)
  1240  	}
  1241  	var buffer bytes.Buffer
  1242  	if isGPULauncher {
  1243  		buffer.WriteString(fmt.Sprintf("%s%s slots=%d\n", mpiJob.Name, launcherSuffix, slots))
  1244  	}
  1245  	for i := 0; i < int(workerReplicas); i++ {
  1246  		buffer.WriteString(fmt.Sprintf("%s%s-%d slots=%d\n", mpiJob.Name, workerSuffix, i, slots))
  1247  	}
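         	// The resulting hostfile contains one "<name> slots=<slots>" line per worker
         	// (plus one for the launcher when it is a GPU launcher), and is consumed by
         	// Open MPI through OMPI_MCA_orte_default_hostfile.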
  1248  
  1249  	return &corev1.ConfigMap{
  1250  		ObjectMeta: metav1.ObjectMeta{
  1251  			Name:      mpiJob.Name + configSuffix,
  1252  			Namespace: mpiJob.Namespace,
  1253  			Labels: map[string]string{
  1254  				"app": mpiJob.Name,
  1255  			},
  1256  			OwnerReferences: []metav1.OwnerReference{
  1257  				*metav1.NewControllerRef(mpiJob, kubeflowv1.MPIJobSchemeGroupVersionKind),
  1258  			},
  1259  		},
  1260  		Data: map[string]string{
  1261  			hostfileName:      buffer.String(),
  1262  			kubexecScriptName: kubexec,
  1263  		},
  1264  	}
  1265  }
  1266  
  1267  // updateDiscoverHostsInConfigMap updates the ConfigMap if the content of `discover_hosts.sh` changes.
  1268  func updateDiscoverHostsInConfigMap(configMap *corev1.ConfigMap, mpiJob *kubeflowv1.MPIJob, runningPods []*corev1.Pod, isGPULauncher bool) {
  1269  	slots := 1
  1270  	if mpiJob.Spec.SlotsPerWorker != nil {
  1271  		slots = int(*mpiJob.Spec.SlotsPerWorker)
  1272  	}
  1273  
  1274  	// Sort the slice of Pods to make sure the order of entries in `discover_hosts.sh` is maintained.
  1275  	sort.Slice(runningPods, func(i, j int) bool {
  1276  		return runningPods[i].Name < runningPods[j].Name
  1277  	})
  1278  
  1279  	discoverHosts := "#!/bin/sh"
  1280  	if isGPULauncher {
  1281  		discoverHosts = fmt.Sprintf("%s\necho %s%s:%d\n", discoverHosts, mpiJob.Name, launcherSuffix, slots)
  1282  	}
  1283  	for _, p := range runningPods {
  1284  		discoverHosts = fmt.Sprintf("%s\necho %s:%d", discoverHosts, p.Name, slots)
  1285  	}
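         	// discover_hosts.sh echoes one "<pod-name>:<slots>" line per running worker
         	// (plus the launcher when it is a GPU launcher), so the launcher can discover
         	// the currently available hosts at runtime.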
  1286  
  1287  	oldDiscoverHosts, exist := configMap.Data[discoverHostsScriptName]
  1288  	if exist {
  1289  		if oldDiscoverHosts == discoverHosts {
  1290  			return
  1291  		}
  1292  	}
  1293  	configMap.Data[discoverHostsScriptName] = discoverHosts
  1294  }
  1295  
  1296  // newLauncherServiceAccount creates a new launcher ServiceAccount for an MPIJob
  1297  // resource. It also sets the appropriate OwnerReferences on the resource so
  1298  // handleObject can discover the MPIJob resource that 'owns' it.
  1299  func newLauncherServiceAccount(mpiJob *kubeflowv1.MPIJob) *corev1.ServiceAccount {
  1300  	return &corev1.ServiceAccount{
  1301  		ObjectMeta: metav1.ObjectMeta{
  1302  			Name:      mpiJob.Name + launcherSuffix,
  1303  			Namespace: mpiJob.Namespace,
  1304  			Labels: map[string]string{
  1305  				"app": mpiJob.Name,
  1306  			},
  1307  			OwnerReferences: []metav1.OwnerReference{
  1308  				*metav1.NewControllerRef(mpiJob, kubeflowv1.MPIJobSchemeGroupVersionKind),
  1309  			},
  1310  		},
  1311  	}
  1312  }
  1313  
  1314  // newLauncherRole creates a new launcher Role for an MPIJob resource. It also
  1315  // sets the appropriate OwnerReferences on the resource so handleObject can
  1316  // discover the MPIJob resource that 'owns' it.
  1317  func newLauncherRole(mpiJob *kubeflowv1.MPIJob, workerReplicas int32) *rbacv1.Role {
  1318  	var podNames []string
  1319  	for i := 0; i < int(workerReplicas); i++ {
  1320  		podNames = append(podNames, fmt.Sprintf("%s%s-%d", mpiJob.Name, workerSuffix, i))
  1321  	}
  1322  	return &rbacv1.Role{
  1323  		ObjectMeta: metav1.ObjectMeta{
  1324  			Name:      mpiJob.Name + launcherSuffix,
  1325  			Namespace: mpiJob.Namespace,
  1326  			Labels: map[string]string{
  1327  				"app": mpiJob.Name,
  1328  			},
  1329  			OwnerReferences: []metav1.OwnerReference{
  1330  				*metav1.NewControllerRef(mpiJob, kubeflowv1.MPIJobSchemeGroupVersionKind),
  1331  			},
  1332  		},
  1333  		Rules: []rbacv1.PolicyRule{
  1334  			{
  1335  				Verbs:     []string{"get", "list", "watch"},
  1336  				APIGroups: []string{""},
  1337  				Resources: []string{"pods"},
  1338  			},
  1339  			{
  1340  				Verbs:         []string{"create"},
  1341  				APIGroups:     []string{""},
  1342  				Resources:     []string{"pods/exec"},
  1343  				ResourceNames: podNames,
  1344  			},
  1345  		},
  1346  	}
  1347  }
  1348  
  1349  // newLauncherRoleBinding creates a new launcher RoleBinding for an MPIJob
  1350  // resource. It also sets the appropriate OwnerReferences on the resource so
  1351  // handleObject can discover the MPIJob resource that 'owns' it.
  1352  func newLauncherRoleBinding(mpiJob *kubeflowv1.MPIJob) *rbacv1.RoleBinding {
  1353  	launcherName := mpiJob.Name + launcherSuffix
  1354  	return &rbacv1.RoleBinding{
  1355  		ObjectMeta: metav1.ObjectMeta{
  1356  			Name:      launcherName,
  1357  			Namespace: mpiJob.Namespace,
  1358  			Labels: map[string]string{
  1359  				"app": mpiJob.Name,
  1360  			},
  1361  			OwnerReferences: []metav1.OwnerReference{
  1362  				*metav1.NewControllerRef(mpiJob, kubeflowv1.MPIJobSchemeGroupVersionKind),
  1363  			},
  1364  		},
  1365  		Subjects: []rbacv1.Subject{
  1366  			{
  1367  				Kind:      rbacv1.ServiceAccountKind,
  1368  				Name:      launcherName,
  1369  				Namespace: mpiJob.Namespace,
  1370  			},
  1371  		},
  1372  		RoleRef: rbacv1.RoleRef{
  1373  			APIGroup: rbacv1.GroupName,
  1374  			Kind:     "Role",
  1375  			Name:     launcherName,
  1376  		},
  1377  	}
  1378  }
  1379  
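         // setRestartPolicy applies the replica-level restart policy to the pod template.
         // RestartPolicyExitCode has no direct corev1 equivalent (exit-code handling is
         // done by the controller), so the Pod is created with RestartPolicyNever; other
         // values map directly to the corresponding corev1.RestartPolicy.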
  1380  func setRestartPolicy(podTemplateSpec *corev1.PodTemplateSpec, spec *kubeflowv1.ReplicaSpec) {
  1381  	if spec.RestartPolicy == kubeflowv1.RestartPolicyExitCode {
  1382  		podTemplateSpec.Spec.RestartPolicy = corev1.RestartPolicyNever
  1383  	} else {
  1384  		podTemplateSpec.Spec.RestartPolicy = corev1.RestartPolicy(spec.RestartPolicy)
  1385  	}
  1386  }