github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go (about)

     1  // Copyright 2022 The Kubeflow Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package paddle
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"strings"
    21  	"time"
    22  
    23  	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
    24  	trainingoperatorcommon "github.com/kubeflow/training-operator/pkg/common"
    25  	"github.com/kubeflow/training-operator/pkg/common/util"
    26  	"github.com/kubeflow/training-operator/pkg/controller.v1/common"
    27  	"github.com/kubeflow/training-operator/pkg/controller.v1/control"
    28  	"github.com/kubeflow/training-operator/pkg/controller.v1/expectation"
    29  	commonutil "github.com/kubeflow/training-operator/pkg/util"
    30  
    31  	"github.com/go-logr/logr"
    32  	"github.com/sirupsen/logrus"
    33  	corev1 "k8s.io/api/core/v1"
    34  	"k8s.io/apimachinery/pkg/api/equality"
    35  	"k8s.io/apimachinery/pkg/api/errors"
    36  	"k8s.io/apimachinery/pkg/api/meta"
    37  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    38  	"k8s.io/apimachinery/pkg/runtime"
    39  	"k8s.io/apimachinery/pkg/runtime/schema"
    40  	"k8s.io/apimachinery/pkg/types"
    41  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    42  	"k8s.io/client-go/informers"
    43  	kubeclientset "k8s.io/client-go/kubernetes"
    44  	"k8s.io/client-go/tools/record"
    45  	ctrl "sigs.k8s.io/controller-runtime"
    46  	"sigs.k8s.io/controller-runtime/pkg/client"
    47  	"sigs.k8s.io/controller-runtime/pkg/controller"
    48  	"sigs.k8s.io/controller-runtime/pkg/event"
    49  	"sigs.k8s.io/controller-runtime/pkg/handler"
    50  	"sigs.k8s.io/controller-runtime/pkg/log"
    51  	"sigs.k8s.io/controller-runtime/pkg/manager"
    52  	"sigs.k8s.io/controller-runtime/pkg/predicate"
    53  	"sigs.k8s.io/controller-runtime/pkg/source"
    54  	schedulerpluginsv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1"
    55  	"volcano.sh/apis/pkg/apis/scheduling/v1beta1"
    56  )
    57  
    58  const (
    59  	controllerName = "paddlejob-controller"
    60  )
    61  
    62  // NewReconciler creates a PaddleJob Reconciler
    63  func NewReconciler(mgr manager.Manager, gangSchedulingSetupFunc common.GangSchedulingSetupFunc) *PaddleJobReconciler {
    64  	r := &PaddleJobReconciler{
    65  		Client:    mgr.GetClient(),
    66  		Scheme:    mgr.GetScheme(),
    67  		recorder:  mgr.GetEventRecorderFor(controllerName),
    68  		apiReader: mgr.GetAPIReader(),
    69  		Log:       log.Log,
    70  	}
    71  
    72  	// Create clients
    73  	cfg := mgr.GetConfig()
    74  	kubeClientSet := kubeclientset.NewForConfigOrDie(cfg)
    75  	sharedInformers := informers.NewSharedInformerFactory(kubeClientSet, 0)
    76  	priorityClassInformer := sharedInformers.Scheduling().V1().PriorityClasses()
    77  
    78  	// Initialize common job controller
    79  	r.JobController = common.JobController{
    80  		Controller:                  r,
    81  		Expectations:                expectation.NewControllerExpectations(),
    82  		WorkQueue:                   &util.FakeWorkQueue{},
    83  		Recorder:                    r.recorder,
    84  		KubeClientSet:               kubeClientSet,
    85  		PriorityClassLister:         priorityClassInformer.Lister(),
    86  		PriorityClassInformerSynced: priorityClassInformer.Informer().HasSynced,
    87  		PodControl:                  control.RealPodControl{KubeClient: kubeClientSet, Recorder: r.recorder},
    88  		ServiceControl:              control.RealServiceControl{KubeClient: kubeClientSet, Recorder: r.recorder},
    89  	}
    90  
    91  	gangSchedulingSetupFunc(&r.JobController)
    92  
    93  	return r
    94  }
    95  
// PaddleJobReconciler reconciles a PaddleJob object
type PaddleJobReconciler struct {
	common.JobController                      // shared pod/service reconciliation logic and expectations
	client.Client                             // cache-backed client used for most reads and writes
	Scheme               *runtime.Scheme      // scheme used for defaulting and owner references
	Log                  logr.Logger          // structured logger for this controller
	recorder             record.EventRecorder // emits Kubernetes events on PaddleJob objects
	apiReader            client.Reader        // uncached reader that queries the API server directly
}
   105  
   106  //+kubebuilder:rbac:groups=kubeflow.org,resources=paddlejobs,verbs=get;list;watch;create;update;patch;delete
   107  //+kubebuilder:rbac:groups=kubeflow.org,resources=paddlejobs/status,verbs=get;update;patch
   108  //+kubebuilder:rbac:groups=kubeflow.org,resources=paddlejobs/finalizers,verbs=update
   109  //+kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;update;patch;delete
   110  //+kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;delete
   111  //+kubebuilder:rbac:groups=scheduling.volcano.sh,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
   112  //+kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
   113  //+kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch;delete
   114  
   115  // Reconcile is part of the main kubernetes reconciliation loop which aims to
   116  // move the current state of the cluster closer to the desired state.
   117  // the PaddleJob object against the actual cluster state, and then
   118  // perform operations to make the cluster state reflect the state specified by
   119  // the user.
   120  //
   121  // For more details, check Reconcile and its Result here:
   122  // - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.8.3/pkg/reconcile
   123  func (r *PaddleJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
   124  	_ = log.FromContext(ctx)
   125  	logger := r.Log.WithValues(kubeflowv1.PaddleJobSingular, req.NamespacedName)
   126  
   127  	paddlejob := &kubeflowv1.PaddleJob{}
   128  	err := r.Get(ctx, req.NamespacedName, paddlejob)
   129  	if err != nil {
   130  		logger.Info(err.Error(), "unable to fetch PaddleJob", req.NamespacedName.String())
   131  		return ctrl.Result{}, client.IgnoreNotFound(err)
   132  	}
   133  
   134  	if err = kubeflowv1.ValidateV1PaddleJob(paddlejob); err != nil {
   135  		logger.Error(err, "PaddleJob failed validation")
   136  		r.Recorder.Eventf(paddlejob, corev1.EventTypeWarning, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobFailedValidationReason),
   137  			"PaddleJob failed validation because %s", err)
   138  		return ctrl.Result{}, err
   139  	}
   140  
   141  	// Check if reconciliation is needed
   142  	jobKey, err := common.KeyFunc(paddlejob)
   143  	if err != nil {
   144  		utilruntime.HandleError(fmt.Errorf("couldn't get jobKey for job object %#v: %v", paddlejob, err))
   145  	}
   146  
   147  	replicaTypes := util.GetReplicaTypes(paddlejob.Spec.PaddleReplicaSpecs)
   148  	needReconcile := util.SatisfiedExpectations(r.Expectations, jobKey, replicaTypes)
   149  
   150  	if !needReconcile || paddlejob.GetDeletionTimestamp() != nil {
   151  		logger.Info("reconcile cancelled, job does not need to do reconcile or has been deleted",
   152  			"sync", needReconcile, "deleted", paddlejob.GetDeletionTimestamp() != nil)
   153  		return ctrl.Result{}, nil
   154  	}
   155  
   156  	// Set default priorities to paddle job
   157  	r.Scheme.Default(paddlejob)
   158  
   159  	// Use common to reconcile the job related pod and service
   160  	err = r.ReconcileJobs(paddlejob, paddlejob.Spec.PaddleReplicaSpecs, paddlejob.Status, &paddlejob.Spec.RunPolicy)
   161  	if err != nil {
   162  		logger.Error(err, "Reconcile PaddleJob error")
   163  		return ctrl.Result{}, err
   164  	}
   165  
   166  	t, err := util.DurationUntilExpireTime(&paddlejob.Spec.RunPolicy, paddlejob.Status)
   167  	if err != nil {
   168  		logrus.Warnf("Reconcile PaddleJob error %v", err)
   169  		return ctrl.Result{}, err
   170  	}
   171  	if t >= 0 {
   172  		return ctrl.Result{Requeue: true, RequeueAfter: t}, nil
   173  	}
   174  
   175  	return ctrl.Result{}, nil
   176  }
   177  
// SetupWithManager sets up the controller with the Manager.
// It registers a watch on PaddleJob itself plus watches on dependent Pods and
// Services; PodGroup watches (volcano and scheduler-plugins) are registered
// only when the corresponding API is actually installed in the cluster.
func (r *PaddleJobReconciler) SetupWithManager(mgr ctrl.Manager, controllerThreads int) error {
	c, err := controller.New(r.ControllerName(), mgr, controller.Options{
		Reconciler:              r,
		MaxConcurrentReconciles: controllerThreads,
	})

	if err != nil {
		return err
	}

	// using onOwnerCreateFunc is easier to set defaults
	if err = c.Watch(source.Kind(mgr.GetCache(), &kubeflowv1.PaddleJob{}), &handler.EnqueueRequestForObject{},
		predicate.Funcs{CreateFunc: r.onOwnerCreateFunc()},
	); err != nil {
		return err
	}

	// eventHandler for owned objects
	eventHandler := handler.EnqueueRequestForOwner(mgr.GetScheme(), mgr.GetRESTMapper(), &kubeflowv1.PaddleJob{}, handler.OnlyControllerOwner())
	// Pod/Service predicates keep the controller's expectations in sync with
	// observed creations/updates/deletions of dependents.
	predicates := predicate.Funcs{
		CreateFunc: util.OnDependentCreateFunc(r.Expectations),
		UpdateFunc: util.OnDependentUpdateFunc(&r.JobController),
		DeleteFunc: util.OnDependentDeleteFunc(r.Expectations),
	}
	// Create generic predicates
	genericPredicates := predicate.Funcs{
		CreateFunc: util.OnDependentCreateFuncGeneric(r.Expectations),
		UpdateFunc: util.OnDependentUpdateFuncGeneric(&r.JobController),
		DeleteFunc: util.OnDependentDeleteFuncGeneric(r.Expectations),
	}
	// inject watching for job related pod
	if err = c.Watch(source.Kind(mgr.GetCache(), &corev1.Pod{}), eventHandler, predicates); err != nil {
		return err
	}
	// inject watching for job related service
	if err = c.Watch(source.Kind(mgr.GetCache(), &corev1.Service{}), eventHandler, predicates); err != nil {
		return err
	}
	// skip watching volcano PodGroup if volcano PodGroup is not installed
	// (RESTMapping succeeds only when the kind is registered with the API server)
	if _, err = mgr.GetRESTMapper().RESTMapping(schema.GroupKind{Group: v1beta1.GroupName, Kind: "PodGroup"},
		v1beta1.SchemeGroupVersion.Version,
	); err == nil {
		// inject watching for job related volcano PodGroup
		if err = c.Watch(source.Kind(mgr.GetCache(), &v1beta1.PodGroup{}), eventHandler, genericPredicates); err != nil {
			return err
		}
	}
	// skip watching scheduler-plugins PodGroup if scheduler-plugins PodGroup is not installed
	if _, err = mgr.GetRESTMapper().RESTMapping(
		schema.GroupKind{Group: schedulerpluginsv1alpha1.SchemeGroupVersion.Group, Kind: "PodGroup"},
		schedulerpluginsv1alpha1.SchemeGroupVersion.Version,
	); err == nil {
		// inject watching for job related scheduler-plugins PodGroup
		if err = c.Watch(source.Kind(mgr.GetCache(), &schedulerpluginsv1alpha1.PodGroup{}), eventHandler, genericPredicates); err != nil {
			return err
		}
	}

	return nil
}
   239  
   240  func (r *PaddleJobReconciler) ControllerName() string {
   241  	return controllerName
   242  }
   243  
   244  func (r *PaddleJobReconciler) GetAPIGroupVersionKind() schema.GroupVersionKind {
   245  	return kubeflowv1.GroupVersion.WithKind(kubeflowv1.PaddleJobKind)
   246  }
   247  
   248  func (r *PaddleJobReconciler) GetAPIGroupVersion() schema.GroupVersion {
   249  	return kubeflowv1.GroupVersion
   250  }
   251  
   252  func (r *PaddleJobReconciler) GetGroupNameLabelValue() string {
   253  	return kubeflowv1.GroupVersion.Group
   254  }
   255  
   256  func (r *PaddleJobReconciler) GetFrameworkName() string {
   257  	return kubeflowv1.PaddleJobFrameworkName
   258  }
   259  
   260  func (r *PaddleJobReconciler) GetJobFromInformerCache(namespace, name string) (metav1.Object, error) {
   261  	job := &kubeflowv1.PaddleJob{}
   262  	err := r.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: name}, job)
   263  	if err != nil {
   264  		if errors.IsNotFound(err) {
   265  			logrus.Error(err, "paddle job not found", "namespace", namespace, "name", name)
   266  		} else {
   267  			logrus.Error(err, "failed to get job from api-server", "namespace", namespace, "name", name)
   268  		}
   269  		return nil, err
   270  	}
   271  	return job, nil
   272  }
   273  
   274  func (r *PaddleJobReconciler) GetJobFromAPIClient(namespace, name string) (metav1.Object, error) {
   275  	job := &kubeflowv1.PaddleJob{}
   276  
   277  	err := r.apiReader.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: name}, job)
   278  	if err != nil {
   279  		if errors.IsNotFound(err) {
   280  			logrus.Error(err, "paddle job not found", "namespace", namespace, "name", name)
   281  		} else {
   282  			logrus.Error(err, "failed to get job from api-server", "namespace", namespace, "name", name)
   283  		}
   284  		return nil, err
   285  	}
   286  	return job, nil
   287  }
   288  
   289  func (r *PaddleJobReconciler) GetPodsForJob(obj interface{}) ([]*corev1.Pod, error) {
   290  	job, err := meta.Accessor(obj)
   291  	if err != nil {
   292  		return nil, err
   293  	}
   294  
   295  	// List all pods to include those that don't match the selector anymore
   296  	// but have a ControllerRef pointing to this controller.
   297  	podlist := &corev1.PodList{}
   298  	err = r.List(context.Background(), podlist, client.MatchingLabels(r.GenLabels(job.GetName())), client.InNamespace(job.GetNamespace()))
   299  	if err != nil {
   300  		return nil, err
   301  	}
   302  
   303  	return util.JobControlledPodList(podlist.Items, job), nil
   304  }
   305  
   306  func (r *PaddleJobReconciler) GetServicesForJob(obj interface{}) ([]*corev1.Service, error) {
   307  	job, err := meta.Accessor(obj)
   308  	if err != nil {
   309  		return nil, err
   310  	}
   311  
   312  	// List all pods to include those that don't match the selector anymore
   313  	// but have a ControllerRef pointing to this controller.
   314  	serviceList := &corev1.ServiceList{}
   315  	err = r.List(context.Background(), serviceList, client.MatchingLabels(r.GenLabels(job.GetName())), client.InNamespace(job.GetNamespace()))
   316  	if err != nil {
   317  		return nil, err
   318  	}
   319  
   320  	ret := util.ConvertServiceList(serviceList.Items)
   321  	return ret, nil
   322  }
   323  
   324  func (r *PaddleJobReconciler) DeleteJob(job interface{}) error {
   325  	paddlejob, ok := job.(*kubeflowv1.PaddleJob)
   326  	if !ok {
   327  		return fmt.Errorf("%+v is not a type of PaddleJob", job)
   328  	}
   329  	if err := r.Delete(context.Background(), paddlejob); err != nil {
   330  		r.recorder.Eventf(paddlejob, corev1.EventTypeWarning, control.FailedDeletePodReason, "Error deleting: %v", err)
   331  		logrus.Error(err, "failed to delete job", "namespace", paddlejob.Namespace, "name", paddlejob.Name)
   332  		return err
   333  	}
   334  	r.recorder.Eventf(paddlejob, corev1.EventTypeNormal, control.SuccessfulDeletePodReason, "Deleted job: %v", paddlejob.Name)
   335  	logrus.Info("job deleted", "namespace", paddlejob.Namespace, "name", paddlejob.Name)
   336  	trainingoperatorcommon.DeletedJobsCounterInc(paddlejob.Namespace, r.GetFrameworkName())
   337  	return nil
   338  }
   339  
   340  func (jc *PaddleJobReconciler) GenLabelSelector(jobName string,
   341  	rtype kubeflowv1.ReplicaType) *metav1.LabelSelector {
   342  	labels := jc.GenLabels(jobName)
   343  	labels[kubeflowv1.ReplicaTypeLabel] = strings.ToLower(string(rtype))
   344  
   345  	return &metav1.LabelSelector{
   346  		MatchLabels: labels,
   347  	}
   348  }
   349  
// UpdateJobStatus updates the job status and job conditions
//
// For each replica type it refreshes the status selector and derives the
// job-level conditions (Running/Succeeded/Restarting/Failed) from the
// per-replica pod counts. When a Master replica exists, success is driven by
// the Master; otherwise it is driven by the Workers. A failed replica flips
// the job to Restarting or Failed depending on its RestartPolicy.
func (r *PaddleJobReconciler) UpdateJobStatus(job interface{},
	replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec,
	jobStatus *kubeflowv1.JobStatus) error {
	paddlejob, ok := job.(*kubeflowv1.PaddleJob)
	if !ok {
		return fmt.Errorf("%+v is not a type of PaddleJob", job)
	}

	paddlejobKey, err := common.KeyFunc(paddlejob)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for paddlejob object %#v: %v", paddlejob, err))
		return err
	}

	logger := commonutil.LoggerForJob(paddlejob)

	// Set StartTime.
	if jobStatus.StartTime == nil {
		now := metav1.Now()
		jobStatus.StartTime = &now
		// enqueue a sync to check if job past ActiveDeadlineSeconds
		if paddlejob.Spec.RunPolicy.ActiveDeadlineSeconds != nil {
			logger.Infof("Job with ActiveDeadlineSeconds will sync after %d seconds", *paddlejob.Spec.RunPolicy.ActiveDeadlineSeconds)
			r.WorkQueue.AddAfter(paddlejobKey, time.Duration(*paddlejob.Spec.RunPolicy.ActiveDeadlineSeconds)*time.Second)
		}
	}

	for rtype, spec := range replicas {
		// NOTE(review): assumes ReplicaStatuses already has an entry for every
		// replica type (a nil status would panic below); presumably populated
		// by the shared JobController before this callback — confirm against
		// common.ReconcileJobs.
		status := jobStatus.ReplicaStatuses[rtype]
		// Generate the label selector.
		status.Selector = metav1.FormatLabelSelector(r.GenLabelSelector(paddlejob.Name, rtype))

		// expected = replicas of this type that have not yet succeeded.
		succeeded := status.Succeeded
		expected := *(spec.Replicas) - succeeded
		running := status.Active
		failed := status.Failed
		specReplicas := *spec.Replicas

		logrus.Infof("PaddleJob=%s, ReplicaType=%s expected=%d, running=%d, succeeded=%d, failed=%d, Replicas=%d",
			paddlejob.Name, rtype, expected, running, succeeded, failed, specReplicas)

		if ContainsMasterSpec(replicas) {
			// Master-driven completion: only the Master replica decides
			// Running/Succeeded for the whole job.
			if rtype == kubeflowv1.PaddleJobReplicaTypeMaster {
				if running > 0 {
					msg := fmt.Sprintf("PaddleJob %s is running.", paddlejob.Name)
					commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobRunningReason), msg)
				}
				// when master is succeed, the job is finished.
				if expected == 0 {
					msg := fmt.Sprintf("PaddleJob %s is successfully completed.", paddlejob.Name)
					logrus.Info(msg)
					r.Recorder.Event(paddlejob, corev1.EventTypeNormal, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobSucceededReason), msg)
					if jobStatus.CompletionTime == nil {
						now := metav1.Now()
						jobStatus.CompletionTime = &now
					}
					commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobSucceeded, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobSucceededReason), msg)
					trainingoperatorcommon.SuccessfulJobsCounterInc(paddlejob.Namespace, r.GetFrameworkName())
					// Job finished: skip processing the remaining replica types.
					return nil
				}
			}
		} else {
			// No Master: Worker replicas decide Running/Succeeded.
			if rtype == kubeflowv1.PaddleJobReplicaTypeWorker {
				// TODO(gaocegege): Support SuccessPolicy
				if expected == 0 {
					msg := fmt.Sprintf("PaddleJob %s/%s successfully completed.",
						paddlejob.Namespace, paddlejob.Name)
					r.recorder.Event(paddlejob, corev1.EventTypeNormal, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobSucceededReason), msg)
					if jobStatus.CompletionTime == nil {
						now := metav1.Now()
						jobStatus.CompletionTime = &now
					}
					commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobSucceeded, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobSucceededReason), msg)
					trainingoperatorcommon.SuccessfulJobsCounterInc(paddlejob.Namespace, r.GetFrameworkName())
				} else if running > 0 {
					// Some workers are still running, leave a running condition.
					msg := fmt.Sprintf("PaddleJob %s/%s is running.",
						paddlejob.Namespace, paddlejob.Name)
					commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobRunningReason), msg)
				}
			}
		}

		// Some replicas failed and not all pods are accounted for as succeeded
		// or running: restart or fail the job per its RestartPolicy.
		if failed > 0 && (specReplicas > succeeded+running) {
			if spec.RestartPolicy != kubeflowv1.RestartPolicyNever {
				msg := fmt.Sprintf("PaddleJob %s is restarting because %d %s replica(s) failed.", paddlejob.Name, failed, rtype)
				r.Recorder.Event(paddlejob, corev1.EventTypeWarning, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobRestartingReason), msg)
				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRestarting, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobRestartingReason), msg)
				trainingoperatorcommon.RestartedJobsCounterInc(paddlejob.Namespace, r.GetFrameworkName())
			} else {
				msg := fmt.Sprintf("PaddleJob %s is failed because %d %s replica(s) failed.", paddlejob.Name, failed, rtype)
				r.Recorder.Event(paddlejob, corev1.EventTypeNormal, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobFailedReason), msg)
				if jobStatus.CompletionTime == nil {
					now := metav1.Now()
					jobStatus.CompletionTime = &now
				}
				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobFailed, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobFailedReason), msg)
				trainingoperatorcommon.FailedJobsCounterInc(paddlejob.Namespace, r.GetFrameworkName())
			}
		}
	}

	return nil
}
   455  
   456  // ContainsMasterSpec returns true if the paddlejob contains master spec.
   457  func ContainsMasterSpec(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) bool {
   458  	if _, ok := replicas[kubeflowv1.PaddleJobReplicaTypeMaster]; ok {
   459  		return true
   460  	}
   461  	return false
   462  }
   463  
   464  // UpdateJobStatusInApiServer updates the job status in to cluster.
   465  func (r *PaddleJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *kubeflowv1.JobStatus) error {
   466  	if jobStatus.ReplicaStatuses == nil {
   467  		jobStatus.ReplicaStatuses = map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaStatus{}
   468  	}
   469  
   470  	paddlejob, ok := job.(*kubeflowv1.PaddleJob)
   471  	trainingoperatorcommon.ClearGeneratedFields(&paddlejob.ObjectMeta)
   472  	if !ok {
   473  		return fmt.Errorf("%+v is not a type of PaddleJob", job)
   474  	}
   475  
   476  	// Job status passed in differs with status in job, update in basis of the passed in one.
   477  	if !equality.Semantic.DeepEqual(&paddlejob.Status, jobStatus) {
   478  		paddlejob = paddlejob.DeepCopy()
   479  		paddlejob.Status = *jobStatus.DeepCopy()
   480  	}
   481  
   482  	result := r.Status().Update(context.Background(), paddlejob)
   483  
   484  	if result != nil {
   485  		r.Log.WithValues("paddlejob", types.NamespacedName{
   486  			Namespace: paddlejob.GetNamespace(),
   487  			Name:      paddlejob.GetName(),
   488  		})
   489  		return result
   490  	}
   491  
   492  	return nil
   493  }
   494  
   495  // SetClusterSpec sets the cluster spec and init container for the pod
   496  func (r *PaddleJobReconciler) SetClusterSpec(job interface{}, podTemplate *corev1.PodTemplateSpec, rtype, index string) error {
   497  	// TODO
   498  	if err := setPodEnv(job, podTemplate, rtype, index); err != nil {
   499  		return err
   500  	}
   501  	return nil
   502  }
   503  
   504  func (r *PaddleJobReconciler) GetDefaultContainerName() string {
   505  	return kubeflowv1.PaddleJobDefaultContainerName
   506  }
   507  
   508  func (r *PaddleJobReconciler) GetDefaultContainerPortName() string {
   509  	return kubeflowv1.PaddleJobDefaultPortName
   510  }
   511  
   512  func (r *PaddleJobReconciler) IsMasterRole(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec,
   513  	rtype kubeflowv1.ReplicaType, index int) bool {
   514  	return string(rtype) == string(kubeflowv1.PaddleJobReplicaTypeMaster)
   515  }
   516  
   517  // onOwnerCreateFunc modify creation condition.
   518  func (r *PaddleJobReconciler) onOwnerCreateFunc() func(event.CreateEvent) bool {
   519  	return func(e event.CreateEvent) bool {
   520  		paddlejob, ok := e.Object.(*kubeflowv1.PaddleJob)
   521  		if !ok {
   522  			return true
   523  		}
   524  		r.Scheme.Default(paddlejob)
   525  		msg := fmt.Sprintf("PaddleJob %s is created.", e.Object.GetName())
   526  		logrus.Info(msg)
   527  		trainingoperatorcommon.CreatedJobsCounterInc(paddlejob.Namespace, r.GetFrameworkName())
   528  		commonutil.UpdateJobConditions(&paddlejob.Status, kubeflowv1.JobCreated, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobCreatedReason), msg)
   529  		return true
   530  	}
   531  }