github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/pytorchjob_controller.go

     1  // Copyright 2021 The Kubeflow Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package pytorch
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"strings"
    21  	"time"
    22  
    23  	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
    24  	trainingoperatorcommon "github.com/kubeflow/training-operator/pkg/common"
    25  	"github.com/kubeflow/training-operator/pkg/common/util"
    26  	"github.com/kubeflow/training-operator/pkg/controller.v1/common"
    27  	"github.com/kubeflow/training-operator/pkg/controller.v1/control"
    28  	"github.com/kubeflow/training-operator/pkg/controller.v1/expectation"
    29  	commonutil "github.com/kubeflow/training-operator/pkg/util"
    30  
    31  	"github.com/go-logr/logr"
    32  	"github.com/sirupsen/logrus"
    33  	corev1 "k8s.io/api/core/v1"
    34  	"k8s.io/apimachinery/pkg/api/equality"
    35  	"k8s.io/apimachinery/pkg/api/errors"
    36  	"k8s.io/apimachinery/pkg/api/meta"
    37  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    38  	"k8s.io/apimachinery/pkg/runtime"
    39  	"k8s.io/apimachinery/pkg/runtime/schema"
    40  	"k8s.io/apimachinery/pkg/types"
    41  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    42  	"k8s.io/client-go/informers"
    43  	kubeclientset "k8s.io/client-go/kubernetes"
    44  	"k8s.io/client-go/tools/record"
    45  	ctrl "sigs.k8s.io/controller-runtime"
    46  	"sigs.k8s.io/controller-runtime/pkg/client"
    47  	"sigs.k8s.io/controller-runtime/pkg/controller"
    48  	"sigs.k8s.io/controller-runtime/pkg/event"
    49  	"sigs.k8s.io/controller-runtime/pkg/handler"
    50  	"sigs.k8s.io/controller-runtime/pkg/log"
    51  	"sigs.k8s.io/controller-runtime/pkg/manager"
    52  	"sigs.k8s.io/controller-runtime/pkg/predicate"
    53  	"sigs.k8s.io/controller-runtime/pkg/source"
    54  	schedulerpluginsv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1"
    55  	"volcano.sh/apis/pkg/apis/scheduling/v1beta1"
    56  )
    57  
    58  const (
    59  	controllerName = "pytorchjob-controller"
    60  )
    61  
    62  // NewReconciler creates a PyTorchJob Reconciler
    63  func NewReconciler(mgr manager.Manager, gangSchedulingSetupFunc common.GangSchedulingSetupFunc) *PyTorchJobReconciler {
    64  	r := &PyTorchJobReconciler{
    65  		Client:    mgr.GetClient(),
    66  		Scheme:    mgr.GetScheme(),
    67  		recorder:  mgr.GetEventRecorderFor(controllerName),
    68  		apiReader: mgr.GetAPIReader(),
    69  		Log:       log.Log,
    70  	}
    71  
    72  	// Create clients
    73  	cfg := mgr.GetConfig()
    74  	kubeClientSet := kubeclientset.NewForConfigOrDie(cfg)
    75  	sharedInformers := informers.NewSharedInformerFactory(kubeClientSet, 0)
    76  	priorityClassInformer := sharedInformers.Scheduling().V1().PriorityClasses()
    77  
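         	// The embedded JobController implements the generic pod/service
         	// reconciliation shared by all training job kinds; this reconciler
         	// plugs in as its Controller. The work queue is a stub
         	// (util.FakeWorkQueue) because reconciles are scheduled by the
         	// controller-runtime controller created in SetupWithManager.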
    78  	// Initialize common job controller
    79  	r.JobController = common.JobController{
    80  		Controller:                  r,
    81  		Expectations:                expectation.NewControllerExpectations(),
    82  		WorkQueue:                   &util.FakeWorkQueue{},
    83  		Recorder:                    r.recorder,
    84  		KubeClientSet:               kubeClientSet,
    85  		PriorityClassLister:         priorityClassInformer.Lister(),
    86  		PriorityClassInformerSynced: priorityClassInformer.Informer().HasSynced,
    87  		PodControl:                  control.RealPodControl{KubeClient: kubeClientSet, Recorder: r.recorder},
    88  		ServiceControl:              control.RealServiceControl{KubeClient: kubeClientSet, Recorder: r.recorder},
    89  	}
    90  
    91  	gangSchedulingSetupFunc(&r.JobController)
    92  
    93  	return r
    94  }
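
         // Example (hypothetical wiring, not part of this file): how the reconciler
         // is typically registered with a controller-runtime manager. The empty
         // closure stands in for a real gang-scheduling setup function.
         //
         //	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{})
         //	if err != nil {
         //		panic(err)
         //	}
         //	r := NewReconciler(mgr, func(jc *common.JobController) {})
         //	if err := r.SetupWithManager(mgr, 1); err != nil {
         //		panic(err)
         //	}
         //	_ = mgr.Start(ctrl.SetupSignalHandler())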
    95  
    96  // PyTorchJobReconciler reconciles a PyTorchJob object
    97  type PyTorchJobReconciler struct {
    98  	common.JobController
    99  	client.Client
   100  	Scheme    *runtime.Scheme
   101  	Log       logr.Logger
   102  	recorder  record.EventRecorder
   103  	apiReader client.Reader
   104  }
   105  
   106  //+kubebuilder:rbac:groups=kubeflow.org,resources=pytorchjobs,verbs=get;list;watch;create;update;patch;delete
   107  //+kubebuilder:rbac:groups=kubeflow.org,resources=pytorchjobs/status,verbs=get;update;patch
   108  //+kubebuilder:rbac:groups=kubeflow.org,resources=pytorchjobs/finalizers,verbs=update
   109  //+kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;update;patch;delete
   110  //+kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;delete
   111  //+kubebuilder:rbac:groups=autoscaling,resources=horizontalpodautoscalers,verbs=get;list;watch;create;update;patch;delete
   112  //+kubebuilder:rbac:groups=scheduling.volcano.sh,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
   113  //+kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
   114  //+kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch;delete
   115  
    116  // Reconcile is part of the main Kubernetes reconciliation loop, which aims to
    117  // move the current state of the cluster closer to the desired state. It
    118  // compares the state specified by the PyTorchJob object against the actual
    119  // cluster state, and then performs operations to make the cluster state
    120  // reflect the state specified by the user.
   121  //
   122  // For more details, check Reconcile and its Result here:
   123  // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.8.3/pkg/reconcile
   124  func (r *PyTorchJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
   125  	_ = log.FromContext(ctx)
   126  	logger := r.Log.WithValues(kubeflowv1.PyTorchJobSingular, req.NamespacedName)
   127  
   128  	pytorchjob := &kubeflowv1.PyTorchJob{}
   129  	err := r.Get(ctx, req.NamespacedName, pytorchjob)
   130  	if err != nil {
   131  		logger.Info(err.Error(), "unable to fetch PyTorchJob", req.NamespacedName.String())
   132  		return ctrl.Result{}, client.IgnoreNotFound(err)
   133  	}
   134  
   135  	if err = kubeflowv1.ValidateV1PyTorchJob(pytorchjob); err != nil {
   136  		logger.Error(err, "PyTorchJob failed validation")
   137  		r.Recorder.Eventf(pytorchjob, corev1.EventTypeWarning, commonutil.NewReason(kubeflowv1.PyTorchJobKind, commonutil.JobFailedValidationReason),
   138  			"PyTorchJob failed validation because %s", err)
   139  		return ctrl.Result{}, err
   140  	}
   141  
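         	// The expectations mechanism mirrors the pattern used by the core
         	// Kubernetes controllers: it tracks pod/service creations and deletions
         	// this controller has issued but not yet observed in its cache.
         	// SatisfiedExpectations returns false while such events are outstanding,
         	// so the job is skipped instead of being reconciled against a stale view.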
   142  	// Check if reconciliation is needed
   143  	jobKey, err := common.KeyFunc(pytorchjob)
   144  	if err != nil {
   145  		utilruntime.HandleError(fmt.Errorf("couldn't get jobKey for job object %#v: %v", pytorchjob, err))
   146  	}
   147  
   148  	replicaTypes := util.GetReplicaTypes(pytorchjob.Spec.PyTorchReplicaSpecs)
   149  	needReconcile := util.SatisfiedExpectations(r.Expectations, jobKey, replicaTypes)
   150  
   151  	if !needReconcile || pytorchjob.GetDeletionTimestamp() != nil {
    152  		logger.Info("reconcile cancelled, job does not need to reconcile or has been deleted",
   153  			"sync", needReconcile, "deleted", pytorchjob.GetDeletionTimestamp() != nil)
   154  		return ctrl.Result{}, nil
   155  	}
   156  
    157  	// Apply registered scheme defaults to the pytorch job
   158  	r.Scheme.Default(pytorchjob)
   159  
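         	// ReconcileHPA manages the HorizontalPodAutoscaler for elastic jobs
         	// (driven by spec.elasticPolicy); for jobs without an elastic policy it
         	// is effectively a no-op.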
   160  	err = r.ReconcileHPA(pytorchjob)
   161  	if err != nil {
   162  		logger.Error(err, "Reconcile PyTorchJob HPA error")
   163  		return ctrl.Result{}, err
   164  	}
   165  	// Use common to reconcile the job related pod and service
   166  	err = r.ReconcileJobs(pytorchjob, pytorchjob.Spec.PyTorchReplicaSpecs, pytorchjob.Status, &pytorchjob.Spec.RunPolicy)
   167  	if err != nil {
   168  		logger.Error(err, "Reconcile PyTorchJob error")
   169  		return ctrl.Result{}, err
   170  	}
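         	// DurationUntilExpireTime implements TTLSecondsAfterFinished: for a
         	// finished job with a TTL it returns the time remaining until cleanup,
         	// and the requeue below schedules another reconcile at that point; a
         	// negative value means there is nothing to schedule.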
   171  	t, err := util.DurationUntilExpireTime(&pytorchjob.Spec.RunPolicy, pytorchjob.Status)
   172  	if err != nil {
   173  		logrus.Warnf("Reconcile PyTorchJob error %v", err)
   174  		return ctrl.Result{}, err
   175  	}
   176  	if t >= 0 {
   177  		return ctrl.Result{Requeue: true, RequeueAfter: t}, nil
   178  	}
   179  
   180  	return ctrl.Result{}, nil
   181  }
   182  
   183  // SetupWithManager sets up the controller with the Manager.
   184  func (r *PyTorchJobReconciler) SetupWithManager(mgr ctrl.Manager, controllerThreads int) error {
   185  	c, err := controller.New(r.ControllerName(), mgr, controller.Options{
   186  		Reconciler:              r,
   187  		MaxConcurrentReconciles: controllerThreads,
   188  	})
   189  	if err != nil {
   190  		return err
   191  	}
   192  
    193  	// Using onOwnerCreateFunc as the create predicate makes it easier to set defaults on newly observed jobs
   194  	if err = c.Watch(source.Kind(mgr.GetCache(), &kubeflowv1.PyTorchJob{}), &handler.EnqueueRequestForObject{},
   195  		predicate.Funcs{CreateFunc: r.onOwnerCreateFunc()},
   196  	); err != nil {
   197  		return err
   198  	}
   199  
   200  	// eventHandler for owned object
   201  	eventHandler := handler.EnqueueRequestForOwner(mgr.GetScheme(), mgr.GetRESTMapper(), &kubeflowv1.PyTorchJob{}, handler.OnlyControllerOwner())
   202  	predicates := predicate.Funcs{
   203  		CreateFunc: util.OnDependentCreateFunc(r.Expectations),
   204  		UpdateFunc: util.OnDependentUpdateFunc(&r.JobController),
   205  		DeleteFunc: util.OnDependentDeleteFunc(r.Expectations),
   206  	}
   207  	// Create generic predicates
   208  	genericPredicates := predicate.Funcs{
   209  		CreateFunc: util.OnDependentCreateFuncGeneric(r.Expectations),
   210  		UpdateFunc: util.OnDependentUpdateFuncGeneric(&r.JobController),
   211  		DeleteFunc: util.OnDependentDeleteFuncGeneric(r.Expectations),
   212  	}
   213  	// inject watching for job related pod
   214  	if err = c.Watch(source.Kind(mgr.GetCache(), &corev1.Pod{}), eventHandler, predicates); err != nil {
   215  		return err
   216  	}
   217  	// inject watching for job related service
   218  	if err = c.Watch(source.Kind(mgr.GetCache(), &corev1.Service{}), eventHandler, predicates); err != nil {
   219  		return err
   220  	}
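         	// The RESTMapper lookups below act as "is this CRD installed?" probes:
         	// a PodGroup watch is only registered when the corresponding API is
         	// discoverable, so the controller also runs on clusters without Volcano
         	// or the scheduler-plugins scheduler.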
   221  	// skip watching volcano PodGroup if volcano PodGroup is not installed
   222  	if _, err = mgr.GetRESTMapper().RESTMapping(schema.GroupKind{Group: v1beta1.GroupName, Kind: "PodGroup"},
   223  		v1beta1.SchemeGroupVersion.Version); err == nil {
   224  		// inject watching for job related volcano PodGroup
   225  		if err = c.Watch(source.Kind(mgr.GetCache(), &v1beta1.PodGroup{}), eventHandler, genericPredicates); err != nil {
   226  			return err
   227  		}
   228  	}
   229  	// skip watching scheduler-plugins PodGroup if scheduler-plugins PodGroup is not installed
   230  	if _, err = mgr.GetRESTMapper().RESTMapping(schema.GroupKind{Group: schedulerpluginsv1alpha1.SchemeGroupVersion.Group, Kind: "PodGroup"},
   231  		schedulerpluginsv1alpha1.SchemeGroupVersion.Version); err == nil {
   232  		// inject watching for job related scheduler-plugins PodGroup
   233  		if err = c.Watch(source.Kind(mgr.GetCache(), &schedulerpluginsv1alpha1.PodGroup{}), eventHandler, genericPredicates); err != nil {
   234  			return err
   235  		}
   236  	}
   237  	return nil
   238  }
   239  
   240  func (r *PyTorchJobReconciler) ControllerName() string {
   241  	return controllerName
   242  }
   243  
   244  func (r *PyTorchJobReconciler) GetAPIGroupVersionKind() schema.GroupVersionKind {
   245  	return kubeflowv1.GroupVersion.WithKind(kubeflowv1.PyTorchJobKind)
   246  }
   247  
   248  func (r *PyTorchJobReconciler) GetAPIGroupVersion() schema.GroupVersion {
   249  	return kubeflowv1.GroupVersion
   250  }
   251  
   252  func (r *PyTorchJobReconciler) GetGroupNameLabelValue() string {
   253  	return kubeflowv1.GroupVersion.Group
   254  }
   255  
   256  func (r *PyTorchJobReconciler) GetFrameworkName() string {
   257  	return kubeflowv1.PyTorchJobFrameworkName
   258  }
   259  
   260  func (r *PyTorchJobReconciler) GetJobFromInformerCache(namespace, name string) (metav1.Object, error) {
   261  	job := &kubeflowv1.PyTorchJob{}
   262  	err := r.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: name}, job)
   263  	if err != nil {
   264  		if errors.IsNotFound(err) {
   265  			logrus.Error(err, "pytorch job not found", "namespace", namespace, "name", name)
   266  		} else {
   267  			logrus.Error(err, "failed to get job from api-server", "namespace", namespace, "name", name)
   268  		}
   269  		return nil, err
   270  	}
   271  	return job, nil
   272  }
   273  
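         // GetJobFromAPIClient reads the PyTorchJob directly from the API server via
         // the manager's API reader, bypassing the informer cache that backs
         // GetJobFromInformerCache above.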
   274  func (r *PyTorchJobReconciler) GetJobFromAPIClient(namespace, name string) (metav1.Object, error) {
   275  	job := &kubeflowv1.PyTorchJob{}
   276  
   277  	err := r.apiReader.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: name}, job)
   278  	if err != nil {
   279  		if errors.IsNotFound(err) {
   280  			logrus.Error(err, "pytorch job not found", "namespace", namespace, "name", name)
   281  		} else {
   282  			logrus.Error(err, "failed to get job from api-server", "namespace", namespace, "name", name)
   283  		}
   284  		return nil, err
   285  	}
   286  	return job, nil
   287  }
   288  
   289  func (r *PyTorchJobReconciler) GetPodsForJob(obj interface{}) ([]*corev1.Pod, error) {
   290  	job, err := meta.Accessor(obj)
   291  	if err != nil {
   292  		return nil, err
   293  	}
   294  
   295  	// List all pods to include those that don't match the selector anymore
   296  	// but have a ControllerRef pointing to this controller.
   297  	podlist := &corev1.PodList{}
   298  	err = r.List(context.Background(), podlist, client.MatchingLabels(r.GenLabels(job.GetName())), client.InNamespace(job.GetNamespace()))
   299  	if err != nil {
   300  		return nil, err
   301  	}
   302  
   303  	return util.JobControlledPodList(podlist.Items, job), nil
   304  }
   305  
   306  func (r *PyTorchJobReconciler) GetServicesForJob(obj interface{}) ([]*corev1.Service, error) {
   307  	job, err := meta.Accessor(obj)
   308  	if err != nil {
   309  		return nil, err
   310  	}
   311  
    312  	// List all services to include those that don't match the selector anymore
    313  	// but have a ControllerRef pointing to this controller.
   314  	serviceList := &corev1.ServiceList{}
   315  	err = r.List(context.Background(), serviceList, client.MatchingLabels(r.GenLabels(job.GetName())), client.InNamespace(job.GetNamespace()))
   316  	if err != nil {
   317  		return nil, err
   318  	}
   319  
   320  	ret := util.ConvertServiceList(serviceList.Items)
   321  	return ret, nil
   322  }
   323  
   324  func (r *PyTorchJobReconciler) DeleteJob(job interface{}) error {
   325  	pytorchjob, ok := job.(*kubeflowv1.PyTorchJob)
   326  	if !ok {
   327  		return fmt.Errorf("%+v is not a type of PyTorchJob", job)
   328  	}
   329  	if err := r.Delete(context.Background(), pytorchjob); err != nil {
   330  		r.recorder.Eventf(pytorchjob, corev1.EventTypeWarning, control.FailedDeletePodReason, "Error deleting: %v", err)
   331  		logrus.Error(err, "failed to delete job", "namespace", pytorchjob.Namespace, "name", pytorchjob.Name)
   332  		return err
   333  	}
   334  	r.recorder.Eventf(pytorchjob, corev1.EventTypeNormal, control.SuccessfulDeletePodReason, "Deleted job: %v", pytorchjob.Name)
   335  	logrus.Info("job deleted", "namespace", pytorchjob.Namespace, "name", pytorchjob.Name)
   336  	trainingoperatorcommon.DeletedJobsCounterInc(pytorchjob.Namespace, r.GetFrameworkName())
   337  	return nil
   338  }
   339  
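         // GenLabelSelector builds the selector that matches the pods of one replica
         // type of this job: the generic job labels from GenLabels plus the
         // replica-type label (for example, a Worker selector would include
         // training.kubeflow.org/replica-type=worker; the exact label keys come from
         // the common JobController).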
    340  func (r *PyTorchJobReconciler) GenLabelSelector(jobName string,
    341  	rtype kubeflowv1.ReplicaType) *metav1.LabelSelector {
    342  	labels := r.GenLabels(jobName)
   343  	labels[kubeflowv1.ReplicaTypeLabel] = strings.ToLower(string(rtype))
   344  
   345  	return &metav1.LabelSelector{
   346  		MatchLabels: labels,
   347  	}
   348  }
   349  
   350  // UpdateJobStatus updates the job status and job conditions
   351  func (r *PyTorchJobReconciler) UpdateJobStatus(job interface{},
   352  	replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec,
   353  	jobStatus *kubeflowv1.JobStatus) error {
   354  	pytorchjob, ok := job.(*kubeflowv1.PyTorchJob)
   355  	if !ok {
   356  		return fmt.Errorf("%+v is not a type of PyTorchJob", job)
   357  	}
   358  	pytorchjobKey, err := common.KeyFunc(pytorchjob)
   359  	if err != nil {
   360  		utilruntime.HandleError(fmt.Errorf("couldn't get key for pytorchjob object %#v: %v", pytorchjob, err))
   361  		return err
   362  	}
   363  
   364  	logger := commonutil.LoggerForJob(pytorchjob)
   365  
   366  	// Set StartTime.
   367  	if jobStatus.StartTime == nil {
   368  		now := metav1.Now()
   369  		jobStatus.StartTime = &now
   370  		// enqueue a sync to check if job past ActiveDeadlineSeconds
   371  		if pytorchjob.Spec.RunPolicy.ActiveDeadlineSeconds != nil {
   372  			logger.Infof("Job with ActiveDeadlineSeconds will sync after %d seconds", *pytorchjob.Spec.RunPolicy.ActiveDeadlineSeconds)
   373  			r.WorkQueue.AddAfter(pytorchjobKey, time.Duration(*pytorchjob.Spec.RunPolicy.ActiveDeadlineSeconds)*time.Second)
   374  		}
   375  	}
   376  
   377  	for rtype, spec := range replicas {
   378  		status := jobStatus.ReplicaStatuses[rtype]
   379  		// Generate the label selector.
   380  		status.Selector = metav1.FormatLabelSelector(r.GenLabelSelector(pytorchjob.Name, rtype))
   381  
   382  		succeeded := status.Succeeded
   383  		expected := *(spec.Replicas) - succeeded
   384  		running := status.Active
   385  		failed := status.Failed
   386  		specReplicas := *spec.Replicas
   387  
   388  		logrus.Infof("PyTorchJob=%s, ReplicaType=%s expected=%d, running=%d, succeeded=%d, failed=%d, Replicas=%d",
   389  			pytorchjob.Name, rtype, expected, running, succeeded, failed, specReplicas)
   390  
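         		// With a Master replica present, the job-level Running/Succeeded
         		// conditions follow the master; without one (for example in elastic
         		// mode), they are derived from the workers in the branch below.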
   391  		if ContainsMasterSpec(replicas) {
   392  			if rtype == kubeflowv1.PyTorchJobReplicaTypeMaster {
   393  				if running > 0 {
   394  					msg := fmt.Sprintf("PyTorchJob %s is running.", pytorchjob.Name)
   395  					commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.PyTorchJobKind, commonutil.JobRunningReason), msg)
   396  				}
    397  				// When the master has succeeded, the job is finished.
   398  				if expected == 0 {
   399  					msg := fmt.Sprintf("PyTorchJob %s is successfully completed.", pytorchjob.Name)
   400  					logrus.Info(msg)
   401  					r.Recorder.Event(pytorchjob, corev1.EventTypeNormal, commonutil.NewReason(kubeflowv1.PyTorchJobKind, commonutil.JobSucceededReason), msg)
   402  					if jobStatus.CompletionTime == nil {
   403  						now := metav1.Now()
   404  						jobStatus.CompletionTime = &now
   405  					}
   406  					commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobSucceeded, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.PyTorchJobKind, commonutil.JobSucceededReason), msg)
   407  					trainingoperatorcommon.SuccessfulJobsCounterInc(pytorchjob.Namespace, r.GetFrameworkName())
   408  					return nil
   409  				}
   410  			}
   411  		} else {
   412  			if rtype == kubeflowv1.PyTorchJobReplicaTypeWorker {
   413  				// TODO(gaocegege): Support SuccessPolicy
   414  				// Leave a succeeded condition for the following two cases:
   415  				// 1. If all workers are succeeded.
   416  				// 2. If `ElasticPolicy` is not nil and any worker has completed.
   417  				if expected == 0 || (pytorchjob.Spec.ElasticPolicy != nil && succeeded > 0) {
   418  					msg := fmt.Sprintf("PyTorchJob %s/%s successfully completed.",
   419  						pytorchjob.Namespace, pytorchjob.Name)
   420  					r.recorder.Event(pytorchjob, corev1.EventTypeNormal, commonutil.NewReason(kubeflowv1.PyTorchJobKind, commonutil.JobSucceededReason), msg)
   421  					if jobStatus.CompletionTime == nil {
   422  						now := metav1.Now()
   423  						jobStatus.CompletionTime = &now
   424  					}
   425  					commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobSucceeded, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.PyTorchJobKind, commonutil.JobSucceededReason), msg)
   426  					trainingoperatorcommon.SuccessfulJobsCounterInc(pytorchjob.Namespace, r.GetFrameworkName())
   427  				} else if running > 0 {
   428  					// Some workers are still running, leave a running condition.
   429  					msg := fmt.Sprintf("PyTorchJob %s/%s is running.",
   430  						pytorchjob.Namespace, pytorchjob.Name)
   431  					commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.PyTorchJobKind, commonutil.JobRunningReason), msg)
   432  				}
   433  			}
   434  		}
   435  
   436  		if failed > 0 && (specReplicas > succeeded+running) {
   437  			if spec.RestartPolicy != kubeflowv1.RestartPolicyNever {
   438  				msg := fmt.Sprintf("PyTorchJob %s is restarting because %d %s replica(s) failed.", pytorchjob.Name, failed, rtype)
   439  				r.Recorder.Event(pytorchjob, corev1.EventTypeWarning, commonutil.NewReason(kubeflowv1.PyTorchJobKind, commonutil.JobRestartingReason), msg)
   440  				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRestarting, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.PyTorchJobKind, commonutil.JobRestartingReason), msg)
   441  				trainingoperatorcommon.RestartedJobsCounterInc(pytorchjob.Namespace, r.GetFrameworkName())
   442  			} else {
   443  				msg := fmt.Sprintf("PyTorchJob %s is failed because %d %s replica(s) failed.", pytorchjob.Name, failed, rtype)
   444  				r.Recorder.Event(pytorchjob, corev1.EventTypeNormal, commonutil.NewReason(kubeflowv1.PyTorchJobKind, commonutil.JobFailedReason), msg)
   445  				if jobStatus.CompletionTime == nil {
   446  					now := metav1.Now()
   447  					jobStatus.CompletionTime = &now
   448  				}
   449  				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobFailed, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.PyTorchJobKind, commonutil.JobFailedReason), msg)
   450  				trainingoperatorcommon.FailedJobsCounterInc(pytorchjob.Namespace, r.GetFrameworkName())
   451  			}
   452  		}
   453  	}
   454  	return nil
   455  }
   456  
   457  // ContainsMasterSpec returns true if the pytorchjob contains master spec.
   458  func ContainsMasterSpec(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) bool {
   459  	if _, ok := replicas[kubeflowv1.PyTorchJobReplicaTypeMaster]; ok {
   460  		return true
   461  	}
   462  	return false
   463  }
   464  
    465  // UpdateJobStatusInApiServer updates the job status in the API server.
   466  func (r *PyTorchJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *kubeflowv1.JobStatus) error {
   467  	if jobStatus.ReplicaStatuses == nil {
   468  		jobStatus.ReplicaStatuses = map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaStatus{}
   469  	}
   470  
    471  	pytorchjob, ok := job.(*kubeflowv1.PyTorchJob)
    472  	if !ok {
    473  		return fmt.Errorf("%+v is not a type of PyTorchJob", job)
    474  	}
    475  	trainingoperatorcommon.ClearGeneratedFields(&pytorchjob.ObjectMeta)
   476  
    477  	// If the job status passed in differs from the status stored in the job, update based on the passed-in one.
   478  	if !equality.Semantic.DeepEqual(&pytorchjob.Status, jobStatus) {
   479  		pytorchjob = pytorchjob.DeepCopy()
   480  		pytorchjob.Status = *jobStatus.DeepCopy()
   481  	}
   482  
   483  	result := r.Status().Update(context.Background(), pytorchjob)
   484  
    485  	if result != nil {
    486  		r.Log.WithValues("pytorchjob", types.NamespacedName{
    487  			Namespace: pytorchjob.GetNamespace(),
    488  			Name:      pytorchjob.GetName(),
    489  		}).Error(result, "failed to update PyTorchJob status")
    490  		return result
    491  	}
   492  
   493  	return nil
   494  }
   495  
   496  // SetClusterSpec sets the cluster spec and init container for the pod
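         //
         // setPodEnv fills in the PyTorch distributed environment for the containers
         // (variables such as MASTER_ADDR, MASTER_PORT, WORLD_SIZE and RANK), and
         // setInitContainer adds an init container to worker pods that waits for the
         // master address to become resolvable; both helpers are defined elsewhere in
         // this package.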
   497  func (r *PyTorchJobReconciler) SetClusterSpec(job interface{}, podTemplate *corev1.PodTemplateSpec, rtype, index string) error {
   498  	if err := setPodEnv(job, podTemplate, rtype, index); err != nil {
   499  		return err
   500  	}
   501  	if err := setInitContainer(job, podTemplate, rtype, index, r.Log); err != nil {
   502  		return err
   503  	}
   504  	return nil
   505  }
   506  
   507  func (r *PyTorchJobReconciler) GetDefaultContainerName() string {
   508  	return kubeflowv1.PyTorchJobDefaultContainerName
   509  }
   510  
   511  func (r *PyTorchJobReconciler) GetDefaultContainerPortName() string {
   512  	return kubeflowv1.PyTorchJobDefaultPortName
   513  }
   514  
   515  func (r *PyTorchJobReconciler) IsMasterRole(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec,
   516  	rtype kubeflowv1.ReplicaType, index int) bool {
   517  	return string(rtype) == string(kubeflowv1.PyTorchJobReplicaTypeMaster)
   518  }
   519  
    520  // onOwnerCreateFunc applies defaults and records the Created condition when a PyTorchJob creation event is observed.
   521  func (r *PyTorchJobReconciler) onOwnerCreateFunc() func(event.CreateEvent) bool {
   522  	return func(e event.CreateEvent) bool {
   523  		pytorchjob, ok := e.Object.(*kubeflowv1.PyTorchJob)
   524  		if !ok {
   525  			return true
   526  		}
   527  		r.Scheme.Default(pytorchjob)
   528  		msg := fmt.Sprintf("PyTorchJob %s is created.", e.Object.GetName())
   529  		logrus.Info(msg)
   530  		trainingoperatorcommon.CreatedJobsCounterInc(pytorchjob.Namespace, r.GetFrameworkName())
   531  		commonutil.UpdateJobConditions(&pytorchjob.Status, kubeflowv1.JobCreated, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.PyTorchJobKind, commonutil.JobCreatedReason), msg)
   532  		return true
   533  	}
   534  }