github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/xgboost/xgboostjob_controller.go (about)

     1  // Copyright 2021 The Kubeflow Authors
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //      http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package xgboost
    16  
    17  import (
    18  	"context"
    19  	"fmt"
    20  	"reflect"
    21  	"time"
    22  
    23  	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
    24  	trainingoperatorcommon "github.com/kubeflow/training-operator/pkg/common"
    25  	"github.com/kubeflow/training-operator/pkg/common/util"
    26  	"github.com/kubeflow/training-operator/pkg/controller.v1/common"
    27  	"github.com/kubeflow/training-operator/pkg/controller.v1/control"
    28  	"github.com/kubeflow/training-operator/pkg/controller.v1/expectation"
    29  	commonutil "github.com/kubeflow/training-operator/pkg/util"
    30  
    31  	"github.com/go-logr/logr"
    32  	"github.com/sirupsen/logrus"
    33  	corev1 "k8s.io/api/core/v1"
    34  	"k8s.io/apimachinery/pkg/api/errors"
    35  	"k8s.io/apimachinery/pkg/api/meta"
    36  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    37  	"k8s.io/apimachinery/pkg/runtime"
    38  	"k8s.io/apimachinery/pkg/runtime/schema"
    39  	"k8s.io/apimachinery/pkg/types"
    40  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    41  	"k8s.io/client-go/informers"
    42  	kubeclientset "k8s.io/client-go/kubernetes"
    43  	"k8s.io/client-go/tools/record"
    44  	ctrl "sigs.k8s.io/controller-runtime"
    45  	"sigs.k8s.io/controller-runtime/pkg/client"
    46  	"sigs.k8s.io/controller-runtime/pkg/controller"
    47  	"sigs.k8s.io/controller-runtime/pkg/event"
    48  	"sigs.k8s.io/controller-runtime/pkg/handler"
    49  	"sigs.k8s.io/controller-runtime/pkg/manager"
    50  	"sigs.k8s.io/controller-runtime/pkg/predicate"
    51  	"sigs.k8s.io/controller-runtime/pkg/reconcile"
    52  	"sigs.k8s.io/controller-runtime/pkg/source"
    53  	schedulerpluginsv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1"
    54  	"volcano.sh/apis/pkg/apis/scheduling/v1beta1"
    55  )
    56  
const (
	// controllerName is the name used when creating the controller and when
	// requesting the event recorder from the manager.
	controllerName = "xgboostjob-controller"

	// Reasons for job events.

	// FailedDeleteJobReason is the event reason recorded when deleting an
	// XGBoostJob returns an error.
	FailedDeleteJobReason     = "FailedDeleteJob"
	// SuccessfulDeleteJobReason is the event reason recorded after an
	// XGBoostJob has been deleted.
	SuccessfulDeleteJobReason = "SuccessfulDeleteJob"
)
    64  
    65  // NewReconciler creates a XGBoostJob Reconciler
    66  func NewReconciler(mgr manager.Manager, gangSchedulingSetupFunc common.GangSchedulingSetupFunc) *XGBoostJobReconciler {
    67  	r := &XGBoostJobReconciler{
    68  		Client:    mgr.GetClient(),
    69  		Scheme:    mgr.GetScheme(),
    70  		recorder:  mgr.GetEventRecorderFor(controllerName),
    71  		apiReader: mgr.GetAPIReader(),
    72  		Log:       ctrl.Log.WithName("controllers").WithName(kubeflowv1.XGBoostJobKind),
    73  	}
    74  
    75  	// Create clients
    76  	cfg := mgr.GetConfig()
    77  	kubeClientSet := kubeclientset.NewForConfigOrDie(cfg)
    78  	sharedInformers := informers.NewSharedInformerFactory(kubeClientSet, 0)
    79  	priorityClassInformer := sharedInformers.Scheduling().V1().PriorityClasses()
    80  
    81  	// Initialize common job controller
    82  	r.JobController = common.JobController{
    83  		Controller:                  r,
    84  		Expectations:                expectation.NewControllerExpectations(),
    85  		WorkQueue:                   &util.FakeWorkQueue{},
    86  		Recorder:                    r.recorder,
    87  		KubeClientSet:               kubeClientSet,
    88  		PriorityClassLister:         priorityClassInformer.Lister(),
    89  		PriorityClassInformerSynced: priorityClassInformer.Informer().HasSynced,
    90  		PodControl:                  control.RealPodControl{KubeClient: kubeClientSet, Recorder: r.recorder},
    91  		ServiceControl:              control.RealServiceControl{KubeClient: kubeClientSet, Recorder: r.recorder},
    92  	}
    93  
    94  	gangSchedulingSetupFunc(&r.JobController)
    95  
    96  	return r
    97  }
    98  
// XGBoostJobReconciler reconciles a XGBoostJob object
type XGBoostJobReconciler struct {
	// JobController is the shared job controller; NewReconciler populates it
	// and points its Controller field back at this reconciler.
	common.JobController
	// Client is the manager's client (mgr.GetClient()), used for Get, List,
	// Delete and status updates.
	client.Client
	// Log is the controller logger, named "controllers" / XGBoostJobKind.
	Log       logr.Logger
	// Scheme is the manager's scheme; it is used to apply defaults to jobs.
	Scheme    *runtime.Scheme
	// recorder emits events under controllerName.
	recorder  record.EventRecorder
	// apiReader is the manager's API reader (mgr.GetAPIReader()), used by
	// GetJobFromAPIClient.
	apiReader client.Reader
}
   108  
   109  //+kubebuilder:rbac:groups=kubeflow.org,resources=xgboostjobs,verbs=get;list;watch;create;update;patch;delete
   110  //+kubebuilder:rbac:groups=kubeflow.org,resources=xgboostjobs/status,verbs=get;update;patch
   111  //+kubebuilder:rbac:groups=kubeflow.org,resources=xgboostjobs/finalizers,verbs=update
   112  //+kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;update;patch;delete
   113  //+kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;delete
   114  //+kubebuilder:rbac:groups=scheduling.volcano.sh,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
   115  //+kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
   116  //+kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch;delete
   117  
   118  // Reconcile reads that state of the cluster for a XGBoostJob object and makes changes based on the state read
   119  // and what is in the XGBoostJob.Spec
   120  // Automatically generate RBAC rules to allow the Controller to read and write Deployments
   121  func (r *XGBoostJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
   122  	logger := r.Log.WithValues(kubeflowv1.XGBoostJobSingular, req.NamespacedName)
   123  
   124  	xgboostjob := &kubeflowv1.XGBoostJob{}
   125  	err := r.Get(ctx, req.NamespacedName, xgboostjob)
   126  	if err != nil {
   127  		logger.Info(err.Error(), "unable to fetch XGBoostJob", req.NamespacedName.String())
   128  		// Object not found, return.  Created objects are automatically garbage collected.
   129  		// For additional cleanup logic use finalizers.
   130  		return ctrl.Result{}, client.IgnoreNotFound(err)
   131  	}
   132  
   133  	if err = kubeflowv1.ValidateV1XGBoostJob(xgboostjob); err != nil {
   134  		logger.Error(err, "XGBoostJob failed validation")
   135  		r.Recorder.Eventf(xgboostjob, corev1.EventTypeWarning, commonutil.NewReason(kubeflowv1.XGBoostJobKind, commonutil.JobFailedValidationReason),
   136  			"XGBoostJob failed validation because %s", err)
   137  		return ctrl.Result{}, err
   138  	}
   139  
   140  	// Check reconcile is required.
   141  	jobKey, err := common.KeyFunc(xgboostjob)
   142  	if err != nil {
   143  		utilruntime.HandleError(fmt.Errorf("couldn't get jobKey for job object %#v: %v", xgboostjob, err))
   144  	}
   145  
   146  	replicaTypes := util.GetReplicaTypes(xgboostjob.Spec.XGBReplicaSpecs)
   147  	needSync := util.SatisfiedExpectations(r.Expectations, jobKey, replicaTypes)
   148  
   149  	if !needSync || xgboostjob.GetDeletionTimestamp() != nil {
   150  		logger.Info("reconcile cancelled, job does not need to do reconcile or has been deleted",
   151  			"sync", needSync, "deleted", xgboostjob.GetDeletionTimestamp() != nil)
   152  		return reconcile.Result{}, nil
   153  	}
   154  
   155  	// Set default priorities for xgboost job
   156  	r.Scheme.Default(xgboostjob)
   157  
   158  	// Use common to reconcile the job related pod and service
   159  	err = r.ReconcileJobs(xgboostjob, xgboostjob.Spec.XGBReplicaSpecs, xgboostjob.Status, &xgboostjob.Spec.RunPolicy)
   160  	if err != nil {
   161  		logger.V(1).Error(err, "Reconcile XGBoost Job error")
   162  		return ctrl.Result{}, err
   163  	}
   164  
   165  	t, err := util.DurationUntilExpireTime(&xgboostjob.Spec.RunPolicy, xgboostjob.Status)
   166  	if err != nil {
   167  		logrus.Warnf("Reconcile XGBoost Job error %v", err)
   168  		return ctrl.Result{}, err
   169  	}
   170  	if t >= 0 {
   171  		return ctrl.Result{Requeue: true, RequeueAfter: t}, nil
   172  	}
   173  
   174  	return reconcile.Result{}, nil
   175  }
   176  
   177  // SetupWithManager sets up the controller with the Manager.
   178  func (r *XGBoostJobReconciler) SetupWithManager(mgr ctrl.Manager, controllerThreads int) error {
   179  	c, err := controller.New(r.ControllerName(), mgr, controller.Options{
   180  		Reconciler:              r,
   181  		MaxConcurrentReconciles: controllerThreads,
   182  	})
   183  	if err != nil {
   184  		return err
   185  	}
   186  
   187  	// using onOwnerCreateFunc is easier to set defaults
   188  	if err = c.Watch(source.Kind(mgr.GetCache(), &kubeflowv1.XGBoostJob{}), &handler.EnqueueRequestForObject{},
   189  		predicate.Funcs{CreateFunc: r.onOwnerCreateFunc()},
   190  	); err != nil {
   191  		return err
   192  	}
   193  
   194  	// eventHandler for owned objects
   195  	eventHandler := handler.EnqueueRequestForOwner(mgr.GetScheme(), mgr.GetRESTMapper(), &kubeflowv1.XGBoostJob{}, handler.OnlyControllerOwner())
   196  	predicates := predicate.Funcs{
   197  		CreateFunc: util.OnDependentCreateFunc(r.Expectations),
   198  		UpdateFunc: util.OnDependentUpdateFunc(&r.JobController),
   199  		DeleteFunc: util.OnDependentDeleteFunc(r.Expectations),
   200  	}
   201  	// Create generic predicates
   202  	genericPredicates := predicate.Funcs{
   203  		CreateFunc: util.OnDependentCreateFuncGeneric(r.Expectations),
   204  		UpdateFunc: util.OnDependentUpdateFuncGeneric(&r.JobController),
   205  		DeleteFunc: util.OnDependentDeleteFuncGeneric(r.Expectations),
   206  	}
   207  	// inject watching for job related pod
   208  	if err = c.Watch(source.Kind(mgr.GetCache(), &corev1.Pod{}), eventHandler, predicates); err != nil {
   209  		return err
   210  	}
   211  	// inject watching for job related service
   212  	if err = c.Watch(source.Kind(mgr.GetCache(), &corev1.Service{}), eventHandler, predicates); err != nil {
   213  		return err
   214  	}
   215  	// skip watching volcano PodGroup if volcano PodGroup is not installed
   216  	if _, err = mgr.GetRESTMapper().RESTMapping(schema.GroupKind{Group: v1beta1.GroupName, Kind: "PodGroup"},
   217  		v1beta1.SchemeGroupVersion.Version); err == nil {
   218  		// inject watching for job related volcano PodGroup
   219  		if err = c.Watch(source.Kind(mgr.GetCache(), &v1beta1.PodGroup{}), eventHandler, genericPredicates); err != nil {
   220  			return err
   221  		}
   222  	}
   223  	// skip watching scheduler-plugins PodGroup if scheduler-plugins PodGroup is not installed
   224  	if _, err = mgr.GetRESTMapper().RESTMapping(schema.GroupKind{Group: schedulerpluginsv1alpha1.SchemeGroupVersion.Group, Kind: "PodGroup"},
   225  		schedulerpluginsv1alpha1.SchemeGroupVersion.Version); err == nil {
   226  		// inject watching for job related scheduler-plugins PodGroup
   227  		if err = c.Watch(source.Kind(mgr.GetCache(), &schedulerpluginsv1alpha1.PodGroup{}), eventHandler, genericPredicates); err != nil {
   228  			return err
   229  		}
   230  	}
   231  	return nil
   232  }
   233  
// ControllerName returns the name of this controller, the package-level
// controllerName constant.
func (r *XGBoostJobReconciler) ControllerName() string {
	return controllerName
}
   237  
   238  func (r *XGBoostJobReconciler) GetAPIGroupVersionKind() schema.GroupVersionKind {
   239  	return kubeflowv1.GroupVersion.WithKind(kubeflowv1.XGBoostJobKind)
   240  }
   241  
// GetAPIGroupVersion returns the kubeflow API group/version.
func (r *XGBoostJobReconciler) GetAPIGroupVersion() schema.GroupVersion {
	return kubeflowv1.GroupVersion
}
   245  
   246  func (r *XGBoostJobReconciler) GetGroupNameLabelValue() string {
   247  	return kubeflowv1.GroupVersion.Group
   248  }
   249  
// GetFrameworkName returns the XGBoostJob framework name; within this file it
// is passed to the job counters as the framework argument.
func (r *XGBoostJobReconciler) GetFrameworkName() string {
	return kubeflowv1.XGBoostJobFrameworkName
}
   253  
   254  // GetJobFromInformerCache returns the Job from Informer Cache
   255  func (r *XGBoostJobReconciler) GetJobFromInformerCache(namespace, name string) (metav1.Object, error) {
   256  	job := &kubeflowv1.XGBoostJob{}
   257  	// Default reader for XGBoostJob is cache reader.
   258  	err := r.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: name}, job)
   259  	if err != nil {
   260  		if errors.IsNotFound(err) {
   261  			r.Log.Error(err, "xgboost job not found", "namespace", namespace, "name", name)
   262  		} else {
   263  			r.Log.Error(err, "failed to get job from api-server", "namespace", namespace, "name", name)
   264  		}
   265  		return nil, err
   266  	}
   267  	return job, nil
   268  }
   269  
   270  // GetJobFromAPIClient returns the Job from API server
   271  func (r *XGBoostJobReconciler) GetJobFromAPIClient(namespace, name string) (metav1.Object, error) {
   272  	job := &kubeflowv1.XGBoostJob{}
   273  
   274  	err := r.apiReader.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: name}, job)
   275  	if err != nil {
   276  		if errors.IsNotFound(err) {
   277  			r.Log.Error(err, "xgboost job not found", "namespace", namespace, "name", name)
   278  		} else {
   279  			r.Log.Error(err, "failed to get job from api-server", "namespace", namespace, "name", name)
   280  		}
   281  		return nil, err
   282  	}
   283  	return job, nil
   284  }
   285  
   286  // GetPodsForJob returns the pods managed by the job. This can be achieved by selecting pods using label key "job-name"
   287  // i.e. all pods created by the job will come with label "job-name" = <this_job_name>
   288  func (r *XGBoostJobReconciler) GetPodsForJob(obj interface{}) ([]*corev1.Pod, error) {
   289  	job, err := meta.Accessor(obj)
   290  	if err != nil {
   291  		return nil, err
   292  	}
   293  	// List all pods to include those that don't match the selector anymore
   294  	// but have a ControllerRef pointing to this controller.
   295  	podlist := &corev1.PodList{}
   296  	err = r.List(context.Background(), podlist, client.MatchingLabels(r.GenLabels(job.GetName())), client.InNamespace(job.GetNamespace()))
   297  	if err != nil {
   298  		return nil, err
   299  	}
   300  
   301  	return util.JobControlledPodList(podlist.Items, job), nil
   302  }
   303  
   304  // GetServicesForJob returns the services managed by the job. This can be achieved by selecting services using label key "job-name"
   305  // i.e. all services created by the job will come with label "job-name" = <this_job_name>
   306  func (r *XGBoostJobReconciler) GetServicesForJob(obj interface{}) ([]*corev1.Service, error) {
   307  	job, err := meta.Accessor(obj)
   308  	if err != nil {
   309  		return nil, fmt.Errorf("%+v is not a type of XGBoostJob", job)
   310  	}
   311  	// List all pods to include those that don't match the selector anymore
   312  	// but have a ControllerRef pointing to this controller.
   313  	serviceList := &corev1.ServiceList{}
   314  	err = r.List(context.Background(), serviceList, client.MatchingLabels(r.GenLabels(job.GetName())), client.InNamespace(job.GetNamespace()))
   315  	if err != nil {
   316  		return nil, err
   317  	}
   318  
   319  	ret := util.ConvertServiceList(serviceList.Items)
   320  	return ret, nil
   321  }
   322  
   323  // DeleteJob deletes the job
   324  func (r *XGBoostJobReconciler) DeleteJob(job interface{}) error {
   325  	xgboostjob, ok := job.(*kubeflowv1.XGBoostJob)
   326  	if !ok {
   327  		return fmt.Errorf("%+v is not a type of XGBoostJob", xgboostjob)
   328  	}
   329  	if err := r.Delete(context.Background(), xgboostjob); err != nil {
   330  		r.recorder.Eventf(xgboostjob, corev1.EventTypeWarning, FailedDeleteJobReason, "Error deleting: %v", err)
   331  		r.Log.Error(err, "failed to delete job", "namespace", xgboostjob.Namespace, "name", xgboostjob.Name)
   332  		return err
   333  	}
   334  	r.recorder.Eventf(xgboostjob, corev1.EventTypeNormal, SuccessfulDeleteJobReason, "Deleted job: %v", xgboostjob.Name)
   335  	r.Log.Info("job deleted", "namespace", xgboostjob.Namespace, "name", xgboostjob.Name)
   336  	trainingoperatorcommon.DeletedJobsCounterInc(xgboostjob.Namespace, r.GetFrameworkName())
   337  	return nil
   338  }
   339  
   340  // UpdateJobStatus updates the job status and job conditions
   341  func (r *XGBoostJobReconciler) UpdateJobStatus(job interface{}, replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, jobStatus *kubeflowv1.JobStatus) error {
   342  	xgboostJob, ok := job.(*kubeflowv1.XGBoostJob)
   343  	if !ok {
   344  		return fmt.Errorf("%+v is not a type of xgboostJob", xgboostJob)
   345  	}
   346  
   347  	xgboostJobKey, err := common.KeyFunc(xgboostJob)
   348  	if err != nil {
   349  		utilruntime.HandleError(fmt.Errorf("couldn't get key for xgboostjob object %#v: %v", xgboostJob, err))
   350  		return err
   351  	}
   352  
   353  	logger := commonutil.LoggerForJob(xgboostJob)
   354  
   355  	// Set StartTime.
   356  	if jobStatus.StartTime == nil {
   357  		now := metav1.Now()
   358  		jobStatus.StartTime = &now
   359  		// enqueue a sync to check if job past ActiveDeadlineSeconds
   360  		if xgboostJob.Spec.RunPolicy.ActiveDeadlineSeconds != nil {
   361  			logger.Infof("Job with ActiveDeadlineSeconds will sync after %d seconds", *xgboostJob.Spec.RunPolicy.ActiveDeadlineSeconds)
   362  			r.WorkQueue.AddAfter(xgboostJobKey, time.Duration(*xgboostJob.Spec.RunPolicy.ActiveDeadlineSeconds)*time.Second)
   363  		}
   364  	}
   365  
   366  	for rtype, spec := range replicas {
   367  		status := jobStatus.ReplicaStatuses[rtype]
   368  
   369  		succeeded := status.Succeeded
   370  		expected := *(spec.Replicas) - succeeded
   371  		running := status.Active
   372  		failed := status.Failed
   373  		runningMsg := fmt.Sprintf("XGBoostJob %s is running.", xgboostJob.Name)
   374  
   375  		logrus.Infof("XGBoostJob=%s, ReplicaType=%s expected=%d, running=%d, succeeded=%d , failed=%d",
   376  			xgboostJob.Name, rtype, expected, running, succeeded, failed)
   377  
   378  		if rtype == kubeflowv1.XGBoostJobReplicaTypeMaster {
   379  			if running > 0 {
   380  				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.XGBoostJobKind, commonutil.JobRunningReason), runningMsg)
   381  			}
   382  			// when master is succeed, the job is finished.
   383  			if expected == 0 {
   384  				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.XGBoostJobKind, commonutil.JobRunningReason), runningMsg)
   385  				msg := fmt.Sprintf("XGBoostJob %s is successfully completed.", xgboostJob.Name)
   386  				logrus.Info(msg)
   387  				r.Recorder.Event(xgboostJob, corev1.EventTypeNormal, commonutil.NewReason(kubeflowv1.XGBoostJobKind, commonutil.JobSucceededReason), msg)
   388  				if jobStatus.CompletionTime == nil {
   389  					now := metav1.Now()
   390  					jobStatus.CompletionTime = &now
   391  				}
   392  				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobSucceeded, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.XGBoostJobKind, commonutil.JobSucceededReason), msg)
   393  				trainingoperatorcommon.SuccessfulJobsCounterInc(xgboostJob.Namespace, r.GetFrameworkName())
   394  				return nil
   395  			}
   396  		}
   397  		if failed > 0 {
   398  			commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.XGBoostJobKind, commonutil.JobRunningReason), runningMsg)
   399  			if spec.RestartPolicy == kubeflowv1.RestartPolicyExitCode {
   400  				msg := fmt.Sprintf("XGBoostJob %s is restarting because %d %s replica(s) failed.", xgboostJob.Name, failed, rtype)
   401  				r.Recorder.Event(xgboostJob, corev1.EventTypeWarning, commonutil.NewReason(kubeflowv1.XGBoostJobKind, commonutil.JobRestartingReason), msg)
   402  				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRestarting, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.XGBoostJobKind, commonutil.JobRestartingReason), msg)
   403  				trainingoperatorcommon.RestartedJobsCounterInc(xgboostJob.Namespace, r.GetFrameworkName())
   404  			} else {
   405  				msg := fmt.Sprintf("XGBoostJob %s is failed because %d %s replica(s) failed.", xgboostJob.Name, failed, rtype)
   406  				r.Recorder.Event(xgboostJob, corev1.EventTypeNormal, commonutil.NewReason(kubeflowv1.XGBoostJobKind, commonutil.JobFailedReason), msg)
   407  				if jobStatus.CompletionTime == nil {
   408  					now := metav1.Now()
   409  					jobStatus.CompletionTime = &now
   410  				}
   411  				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobFailed, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.XGBoostJobKind, commonutil.JobFailedReason), msg)
   412  				trainingoperatorcommon.FailedJobsCounterInc(xgboostJob.Namespace, r.GetFrameworkName())
   413  			}
   414  		}
   415  	}
   416  	return nil
   417  }
   418  
   419  // UpdateJobStatusInApiServer updates the job status in to cluster.
   420  func (r *XGBoostJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *kubeflowv1.JobStatus) error {
   421  	if jobStatus.ReplicaStatuses == nil {
   422  		jobStatus.ReplicaStatuses = map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaStatus{}
   423  	}
   424  
   425  	xgboostjob, ok := job.(*kubeflowv1.XGBoostJob)
   426  	if !ok {
   427  		return fmt.Errorf("%+v is not a type of XGBoostJob", xgboostjob)
   428  	}
   429  
   430  	// Job status passed in differs with status in job, update in basis of the passed in one.
   431  	if !reflect.DeepEqual(&xgboostjob.Status, jobStatus) {
   432  		xgboostjob = xgboostjob.DeepCopy()
   433  		xgboostjob.Status = *jobStatus.DeepCopy()
   434  	}
   435  
   436  	result := r.Status().Update(context.Background(), xgboostjob)
   437  
   438  	if result != nil {
   439  		commonutil.LoggerForJob(xgboostjob).Error(result, "failed to update XGBoost Job conditions in the API server")
   440  		return result
   441  	}
   442  
   443  	return nil
   444  }
   445  
// SetClusterSpec sets the cluster spec for the pod.
// It forwards all of its arguments unchanged to SetPodEnv and returns that
// function's error.
func (r *XGBoostJobReconciler) SetClusterSpec(job interface{}, podTemplate *corev1.PodTemplateSpec, rtype, index string) error {
	return SetPodEnv(job, podTemplate, rtype, index)
}
   450  
// GetDefaultContainerName returns the default container name defined for
// XGBoostJob in the kubeflow v1 API package.
func (r *XGBoostJobReconciler) GetDefaultContainerName() string {
	return kubeflowv1.XGBoostJobDefaultContainerName
}
   454  
// GetDefaultContainerPortName returns the default port name defined for
// XGBoostJob in the kubeflow v1 API package.
func (r *XGBoostJobReconciler) GetDefaultContainerPortName() string {
	return kubeflowv1.XGBoostJobDefaultPortName
}
   458  
   459  func (r *XGBoostJobReconciler) IsMasterRole(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec,
   460  	rtype kubeflowv1.ReplicaType, index int) bool {
   461  	return string(rtype) == string(kubeflowv1.XGBoostJobReplicaTypeMaster)
   462  }
   463  
   464  // onOwnerCreateFunc modify creation condition.
   465  func (r *XGBoostJobReconciler) onOwnerCreateFunc() func(event.CreateEvent) bool {
   466  	return func(e event.CreateEvent) bool {
   467  		xgboostJob, ok := e.Object.(*kubeflowv1.XGBoostJob)
   468  		if !ok {
   469  			return true
   470  		}
   471  		r.Scheme.Default(xgboostJob)
   472  		msg := fmt.Sprintf("XGBoostJob %s is created.", e.Object.GetName())
   473  		logrus.Info()
   474  		trainingoperatorcommon.CreatedJobsCounterInc(xgboostJob.Namespace, r.GetFrameworkName())
   475  		commonutil.UpdateJobConditions(&xgboostJob.Status, kubeflowv1.JobCreated, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.XGBoostJobKind, commonutil.JobCreatedReason), msg)
   476  		return true
   477  	}
   478  }