github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/xgboost/xgboostjob_controller.go

// Copyright 2021 The Kubeflow Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package xgboost

import (
	"context"
	"fmt"
	"reflect"
	"time"

	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
	trainingoperatorcommon "github.com/kubeflow/training-operator/pkg/common"
	"github.com/kubeflow/training-operator/pkg/common/util"
	"github.com/kubeflow/training-operator/pkg/controller.v1/common"
	"github.com/kubeflow/training-operator/pkg/controller.v1/control"
	"github.com/kubeflow/training-operator/pkg/controller.v1/expectation"
	commonutil "github.com/kubeflow/training-operator/pkg/util"

	"github.com/go-logr/logr"
	"github.com/sirupsen/logrus"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/types"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/client-go/informers"
	kubeclientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/record"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/event"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/manager"
	"sigs.k8s.io/controller-runtime/pkg/predicate"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"
	"sigs.k8s.io/controller-runtime/pkg/source"
	schedulerpluginsv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1"
	"volcano.sh/apis/pkg/apis/scheduling/v1beta1"
)
const (
	controllerName = "xgboostjob-controller"

	// Reasons for job events.
	FailedDeleteJobReason     = "FailedDeleteJob"
	SuccessfulDeleteJobReason = "SuccessfulDeleteJob"
)

// NewReconciler creates an XGBoostJob Reconciler.
func NewReconciler(mgr manager.Manager, gangSchedulingSetupFunc common.GangSchedulingSetupFunc) *XGBoostJobReconciler {
	r := &XGBoostJobReconciler{
		Client:    mgr.GetClient(),
		Scheme:    mgr.GetScheme(),
		recorder:  mgr.GetEventRecorderFor(controllerName),
		apiReader: mgr.GetAPIReader(),
		Log:       ctrl.Log.WithName("controllers").WithName(kubeflowv1.XGBoostJobKind),
	}

	// Create clients.
	cfg := mgr.GetConfig()
	kubeClientSet := kubeclientset.NewForConfigOrDie(cfg)
	sharedInformers := informers.NewSharedInformerFactory(kubeClientSet, 0)
	priorityClassInformer := sharedInformers.Scheduling().V1().PriorityClasses()

	// Initialize the common job controller.
	r.JobController = common.JobController{
		Controller:                  r,
		Expectations:                expectation.NewControllerExpectations(),
		WorkQueue:                   &util.FakeWorkQueue{},
		Recorder:                    r.recorder,
		KubeClientSet:               kubeClientSet,
		PriorityClassLister:         priorityClassInformer.Lister(),
		PriorityClassInformerSynced: priorityClassInformer.Informer().HasSynced,
		PodControl:                  control.RealPodControl{KubeClient: kubeClientSet, Recorder: r.recorder},
		ServiceControl:              control.RealServiceControl{KubeClient: kubeClientSet, Recorder: r.recorder},
	}

	gangSchedulingSetupFunc(&r.JobController)

	return r
}
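// Wiring sketch (illustrative, not part of the original file): the operator's
// entrypoint constructs this reconciler and registers it with a
// controller-runtime manager roughly as follows, assuming the no-op
// gang-scheduling setup helper common.GenNonGangSchedulerSetupFunc:
//
//	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{})
//	if err != nil {
//		panic(err)
//	}
//	reconciler := NewReconciler(mgr, common.GenNonGangSchedulerSetupFunc())
//	if err := reconciler.SetupWithManager(mgr, 1); err != nil {
//		panic(err)
//	}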
// XGBoostJobReconciler reconciles an XGBoostJob object.
type XGBoostJobReconciler struct {
	common.JobController
	client.Client
	Log       logr.Logger
	Scheme    *runtime.Scheme
	recorder  record.EventRecorder
	apiReader client.Reader
}

//+kubebuilder:rbac:groups=kubeflow.org,resources=xgboostjobs,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=kubeflow.org,resources=xgboostjobs/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=kubeflow.org,resources=xgboostjobs/finalizers,verbs=update
//+kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;delete
//+kubebuilder:rbac:groups=scheduling.volcano.sh,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch;delete

// Reconcile reads the state of the cluster for an XGBoostJob object and makes
// changes based on the state read and what is in the XGBoostJob.Spec. The
// kubebuilder markers above generate the RBAC rules the controller needs for
// the jobs and their dependent pods, services, and pod groups.
func (r *XGBoostJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	logger := r.Log.WithValues(kubeflowv1.XGBoostJobSingular, req.NamespacedName)

	xgboostjob := &kubeflowv1.XGBoostJob{}
	err := r.Get(ctx, req.NamespacedName, xgboostjob)
	if err != nil {
		logger.Info("unable to fetch XGBoostJob", "namespacedName", req.NamespacedName.String(), "err", err.Error())
		// Object not found, return. Created objects are automatically garbage collected.
		// For additional cleanup logic use finalizers.
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}

	if err = kubeflowv1.ValidateV1XGBoostJob(xgboostjob); err != nil {
		logger.Error(err, "XGBoostJob failed validation")
		r.Recorder.Eventf(xgboostjob, corev1.EventTypeWarning, commonutil.NewReason(kubeflowv1.XGBoostJobKind, commonutil.JobFailedValidationReason),
			"XGBoostJob failed validation because %s", err)
		return ctrl.Result{}, err
	}

	// Check whether reconciliation is required.
	jobKey, err := common.KeyFunc(xgboostjob)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get jobKey for job object %#v: %v", xgboostjob, err))
	}

	replicaTypes := util.GetReplicaTypes(xgboostjob.Spec.XGBReplicaSpecs)
	needSync := util.SatisfiedExpectations(r.Expectations, jobKey, replicaTypes)

	if !needSync || xgboostjob.GetDeletionTimestamp() != nil {
		logger.Info("reconcile cancelled, job does not need to reconcile or has been deleted",
			"sync", needSync, "deleted", xgboostjob.GetDeletionTimestamp() != nil)
		return reconcile.Result{}, nil
	}

	// Set default values for the xgboost job.
	r.Scheme.Default(xgboostjob)

	// Use the common controller to reconcile the job-related pods and services.
	err = r.ReconcileJobs(xgboostjob, xgboostjob.Spec.XGBReplicaSpecs, xgboostjob.Status, &xgboostjob.Spec.RunPolicy)
	if err != nil {
		logger.V(1).Error(err, "Reconcile XGBoost Job error")
		return ctrl.Result{}, err
	}

	t, err := util.DurationUntilExpireTime(&xgboostjob.Spec.RunPolicy, xgboostjob.Status)
	if err != nil {
		logrus.Warnf("Reconcile XGBoost Job error %v", err)
		return ctrl.Result{}, err
	}
	if t >= 0 {
		return ctrl.Result{Requeue: true, RequeueAfter: t}, nil
	}

	return reconcile.Result{}, nil
}
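// Requeue sketch (illustrative, not part of the original file): the requeue at
// the end of Reconcile exists so a finished job is revisited once its TTL
// expires. Assuming DurationUntilExpireTime returns the remaining TTL for a
// finished job with TTLSecondsAfterFinished set, and a negative duration
// otherwise:
//
//	ttl := int32(60)
//	runPolicy := kubeflowv1.RunPolicy{TTLSecondsAfterFinished: &ttl}
//	// For a job that finished 45 seconds ago, t would be roughly 15 seconds,
//	// and the reconcile result asks for a requeue after that duration.
//	t, _ := util.DurationUntilExpireTime(&runPolicy, jobStatus)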
// SetupWithManager sets up the controller with the Manager.
func (r *XGBoostJobReconciler) SetupWithManager(mgr ctrl.Manager, controllerThreads int) error {
	c, err := controller.New(r.ControllerName(), mgr, controller.Options{
		Reconciler:              r,
		MaxConcurrentReconciles: controllerThreads,
	})
	if err != nil {
		return err
	}

	// Using onOwnerCreateFunc makes it easier to set defaults.
	if err = c.Watch(source.Kind(mgr.GetCache(), &kubeflowv1.XGBoostJob{}), &handler.EnqueueRequestForObject{},
		predicate.Funcs{CreateFunc: r.onOwnerCreateFunc()},
	); err != nil {
		return err
	}

	// eventHandler for owned objects.
	eventHandler := handler.EnqueueRequestForOwner(mgr.GetScheme(), mgr.GetRESTMapper(), &kubeflowv1.XGBoostJob{}, handler.OnlyControllerOwner())
	predicates := predicate.Funcs{
		CreateFunc: util.OnDependentCreateFunc(r.Expectations),
		UpdateFunc: util.OnDependentUpdateFunc(&r.JobController),
		DeleteFunc: util.OnDependentDeleteFunc(r.Expectations),
	}
	// Create generic predicates.
	genericPredicates := predicate.Funcs{
		CreateFunc: util.OnDependentCreateFuncGeneric(r.Expectations),
		UpdateFunc: util.OnDependentUpdateFuncGeneric(&r.JobController),
		DeleteFunc: util.OnDependentDeleteFuncGeneric(r.Expectations),
	}
	// Watch the pods owned by the job.
	if err = c.Watch(source.Kind(mgr.GetCache(), &corev1.Pod{}), eventHandler, predicates); err != nil {
		return err
	}
	// Watch the services owned by the job.
	if err = c.Watch(source.Kind(mgr.GetCache(), &corev1.Service{}), eventHandler, predicates); err != nil {
		return err
	}
	// Skip watching volcano PodGroups if the volcano PodGroup CRD is not installed.
	if _, err = mgr.GetRESTMapper().RESTMapping(schema.GroupKind{Group: v1beta1.GroupName, Kind: "PodGroup"},
		v1beta1.SchemeGroupVersion.Version); err == nil {
		// Watch the volcano PodGroups owned by the job.
		if err = c.Watch(source.Kind(mgr.GetCache(), &v1beta1.PodGroup{}), eventHandler, genericPredicates); err != nil {
			return err
		}
	}
	// Skip watching scheduler-plugins PodGroups if the scheduler-plugins PodGroup CRD is not installed.
	if _, err = mgr.GetRESTMapper().RESTMapping(schema.GroupKind{Group: schedulerpluginsv1alpha1.SchemeGroupVersion.Group, Kind: "PodGroup"},
		schedulerpluginsv1alpha1.SchemeGroupVersion.Version); err == nil {
		// Watch the scheduler-plugins PodGroups owned by the job.
		if err = c.Watch(source.Kind(mgr.GetCache(), &schedulerpluginsv1alpha1.PodGroup{}), eventHandler, genericPredicates); err != nil {
			return err
		}
	}
	return nil
}
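// CRD-detection sketch (illustrative, not part of the original file): the
// RESTMapping probes above succeed only when the corresponding PodGroup CRD is
// registered in the cluster; without it the mapper returns a no-match error,
// which could also be recognized explicitly:
//
//	_, err := mgr.GetRESTMapper().RESTMapping(
//		schema.GroupKind{Group: v1beta1.GroupName, Kind: "PodGroup"},
//		v1beta1.SchemeGroupVersion.Version)
//	if meta.IsNoMatchError(err) {
//		// volcano is not installed; the PodGroup watch is skipped.
//	}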
func (r *XGBoostJobReconciler) ControllerName() string {
	return controllerName
}

func (r *XGBoostJobReconciler) GetAPIGroupVersionKind() schema.GroupVersionKind {
	return kubeflowv1.GroupVersion.WithKind(kubeflowv1.XGBoostJobKind)
}

func (r *XGBoostJobReconciler) GetAPIGroupVersion() schema.GroupVersion {
	return kubeflowv1.GroupVersion
}

func (r *XGBoostJobReconciler) GetGroupNameLabelValue() string {
	return kubeflowv1.GroupVersion.Group
}

func (r *XGBoostJobReconciler) GetFrameworkName() string {
	return kubeflowv1.XGBoostJobFrameworkName
}

// GetJobFromInformerCache returns the Job from the informer cache.
func (r *XGBoostJobReconciler) GetJobFromInformerCache(namespace, name string) (metav1.Object, error) {
	job := &kubeflowv1.XGBoostJob{}
	// The default reader for XGBoostJob is the cache reader.
	err := r.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: name}, job)
	if err != nil {
		if errors.IsNotFound(err) {
			r.Log.Error(err, "xgboost job not found", "namespace", namespace, "name", name)
		} else {
			r.Log.Error(err, "failed to get job from the informer cache", "namespace", namespace, "name", name)
		}
		return nil, err
	}
	return job, nil
}

// GetJobFromAPIClient returns the Job from the API server.
func (r *XGBoostJobReconciler) GetJobFromAPIClient(namespace, name string) (metav1.Object, error) {
	job := &kubeflowv1.XGBoostJob{}

	err := r.apiReader.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: name}, job)
	if err != nil {
		if errors.IsNotFound(err) {
			r.Log.Error(err, "xgboost job not found", "namespace", namespace, "name", name)
		} else {
			r.Log.Error(err, "failed to get job from api-server", "namespace", namespace, "name", name)
		}
		return nil, err
	}
	return job, nil
}

// GetPodsForJob returns the pods managed by the job. This can be achieved by selecting pods using label key "job-name",
// i.e. all pods created by the job will come with label "job-name" = <this_job_name>.
func (r *XGBoostJobReconciler) GetPodsForJob(obj interface{}) ([]*corev1.Pod, error) {
	job, err := meta.Accessor(obj)
	if err != nil {
		return nil, err
	}
	// List all pods to include those that don't match the selector anymore
	// but have a ControllerRef pointing to this controller.
	podlist := &corev1.PodList{}
	err = r.List(context.Background(), podlist, client.MatchingLabels(r.GenLabels(job.GetName())), client.InNamespace(job.GetNamespace()))
	if err != nil {
		return nil, err
	}

	return util.JobControlledPodList(podlist.Items, job), nil
}
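// Label-selection sketch (illustrative, not part of the original file):
// GenLabels comes from the embedded common.JobController and yields the
// operator's shared selector labels, so the List call above is roughly
// equivalent to a selector like:
//
//	kubectl get pods -n <namespace> -l training.kubeflow.org/job-name=<job-name>
//
// The exact label keys are defined by the common controller, not this file.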
// GetServicesForJob returns the services managed by the job. This can be achieved by selecting services using label key "job-name",
// i.e. all services created by the job will come with label "job-name" = <this_job_name>.
func (r *XGBoostJobReconciler) GetServicesForJob(obj interface{}) ([]*corev1.Service, error) {
	job, err := meta.Accessor(obj)
	if err != nil {
		return nil, fmt.Errorf("%+v is not a type of XGBoostJob", job)
	}
	// List all services to include those that don't match the selector anymore
	// but have a ControllerRef pointing to this controller.
	serviceList := &corev1.ServiceList{}
	err = r.List(context.Background(), serviceList, client.MatchingLabels(r.GenLabels(job.GetName())), client.InNamespace(job.GetNamespace()))
	if err != nil {
		return nil, err
	}

	ret := util.ConvertServiceList(serviceList.Items)
	return ret, nil
}

// DeleteJob deletes the job.
func (r *XGBoostJobReconciler) DeleteJob(job interface{}) error {
	xgboostjob, ok := job.(*kubeflowv1.XGBoostJob)
	if !ok {
		return fmt.Errorf("%+v is not a type of XGBoostJob", xgboostjob)
	}
	if err := r.Delete(context.Background(), xgboostjob); err != nil {
		r.recorder.Eventf(xgboostjob, corev1.EventTypeWarning, FailedDeleteJobReason, "Error deleting: %v", err)
		r.Log.Error(err, "failed to delete job", "namespace", xgboostjob.Namespace, "name", xgboostjob.Name)
		return err
	}
	r.recorder.Eventf(xgboostjob, corev1.EventTypeNormal, SuccessfulDeleteJobReason, "Deleted job: %v", xgboostjob.Name)
	r.Log.Info("job deleted", "namespace", xgboostjob.Namespace, "name", xgboostjob.Name)
	trainingoperatorcommon.DeletedJobsCounterInc(xgboostjob.Namespace, r.GetFrameworkName())
	return nil
}
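// Replica-accounting sketch (illustrative, not part of the original file): in
// UpdateJobStatus below, "expected" counts the replicas that have not yet
// succeeded. For a Master spec with Replicas=1 whose single pod has succeeded:
//
//	expected := *spec.Replicas - status.Succeeded // 1 - 1 == 0
//
// and expected == 0 on the Master replica type is what marks the whole job
// as succeeded.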
// UpdateJobStatus updates the job status and job conditions.
func (r *XGBoostJobReconciler) UpdateJobStatus(job interface{}, replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, jobStatus *kubeflowv1.JobStatus) error {
	xgboostJob, ok := job.(*kubeflowv1.XGBoostJob)
	if !ok {
		return fmt.Errorf("%+v is not a type of XGBoostJob", xgboostJob)
	}

	xgboostJobKey, err := common.KeyFunc(xgboostJob)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for xgboostjob object %#v: %v", xgboostJob, err))
		return err
	}

	logger := commonutil.LoggerForJob(xgboostJob)

	// Set StartTime.
	if jobStatus.StartTime == nil {
		now := metav1.Now()
		jobStatus.StartTime = &now
		// Enqueue a sync to check if the job is past ActiveDeadlineSeconds.
		if xgboostJob.Spec.RunPolicy.ActiveDeadlineSeconds != nil {
			logger.Infof("Job with ActiveDeadlineSeconds will sync after %d seconds", *xgboostJob.Spec.RunPolicy.ActiveDeadlineSeconds)
			r.WorkQueue.AddAfter(xgboostJobKey, time.Duration(*xgboostJob.Spec.RunPolicy.ActiveDeadlineSeconds)*time.Second)
		}
	}

	for rtype, spec := range replicas {
		status := jobStatus.ReplicaStatuses[rtype]

		succeeded := status.Succeeded
		expected := *(spec.Replicas) - succeeded
		running := status.Active
		failed := status.Failed
		runningMsg := fmt.Sprintf("XGBoostJob %s is running.", xgboostJob.Name)

		logrus.Infof("XGBoostJob=%s, ReplicaType=%s expected=%d, running=%d, succeeded=%d, failed=%d",
			xgboostJob.Name, rtype, expected, running, succeeded, failed)

		if rtype == kubeflowv1.XGBoostJobReplicaTypeMaster {
			if running > 0 {
				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.XGBoostJobKind, commonutil.JobRunningReason), runningMsg)
			}
			// When the master has succeeded, the job is finished.
			if expected == 0 {
				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.XGBoostJobKind, commonutil.JobRunningReason), runningMsg)
				msg := fmt.Sprintf("XGBoostJob %s is successfully completed.", xgboostJob.Name)
				logrus.Info(msg)
				r.Recorder.Event(xgboostJob, corev1.EventTypeNormal, commonutil.NewReason(kubeflowv1.XGBoostJobKind, commonutil.JobSucceededReason), msg)
				if jobStatus.CompletionTime == nil {
					now := metav1.Now()
					jobStatus.CompletionTime = &now
				}
				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobSucceeded, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.XGBoostJobKind, commonutil.JobSucceededReason), msg)
				trainingoperatorcommon.SuccessfulJobsCounterInc(xgboostJob.Namespace, r.GetFrameworkName())
				return nil
			}
		}
		if failed > 0 {
			commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.XGBoostJobKind, commonutil.JobRunningReason), runningMsg)
			if spec.RestartPolicy == kubeflowv1.RestartPolicyExitCode {
				msg := fmt.Sprintf("XGBoostJob %s is restarting because %d %s replica(s) failed.", xgboostJob.Name, failed, rtype)
				r.Recorder.Event(xgboostJob, corev1.EventTypeWarning, commonutil.NewReason(kubeflowv1.XGBoostJobKind, commonutil.JobRestartingReason), msg)
				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRestarting, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.XGBoostJobKind, commonutil.JobRestartingReason), msg)
				trainingoperatorcommon.RestartedJobsCounterInc(xgboostJob.Namespace, r.GetFrameworkName())
			} else {
				msg := fmt.Sprintf("XGBoostJob %s has failed because %d %s replica(s) failed.", xgboostJob.Name, failed, rtype)
				r.Recorder.Event(xgboostJob, corev1.EventTypeNormal, commonutil.NewReason(kubeflowv1.XGBoostJobKind, commonutil.JobFailedReason), msg)
				if jobStatus.CompletionTime == nil {
					now := metav1.Now()
					jobStatus.CompletionTime = &now
				}
				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobFailed, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.XGBoostJobKind, commonutil.JobFailedReason), msg)
				trainingoperatorcommon.FailedJobsCounterInc(xgboostJob.Namespace, r.GetFrameworkName())
			}
		}
	}
	return nil
}
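// Condition sketch (illustrative, not part of the original file): after the
// Succeeded branch above, the job status carries a condition roughly
// equivalent to:
//
//	kubeflowv1.JobCondition{
//		Type:    kubeflowv1.JobSucceeded,
//		Status:  corev1.ConditionTrue,
//		Reason:  commonutil.NewReason(kubeflowv1.XGBoostJobKind, commonutil.JobSucceededReason),
//		Message: "XGBoostJob <name> is successfully completed.",
//	}
//
// which UpdateJobStatusInApiServer below persists via the status subresource.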
// UpdateJobStatusInApiServer updates the job status in the API server.
func (r *XGBoostJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *kubeflowv1.JobStatus) error {
	if jobStatus.ReplicaStatuses == nil {
		jobStatus.ReplicaStatuses = map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaStatus{}
	}

	xgboostjob, ok := job.(*kubeflowv1.XGBoostJob)
	if !ok {
		return fmt.Errorf("%+v is not a type of XGBoostJob", xgboostjob)
	}

	// If the passed-in status differs from the status stored in the job,
	// update on the basis of the passed-in one.
	if !reflect.DeepEqual(&xgboostjob.Status, jobStatus) {
		xgboostjob = xgboostjob.DeepCopy()
		xgboostjob.Status = *jobStatus.DeepCopy()
	}

	result := r.Status().Update(context.Background(), xgboostjob)
	if result != nil {
		commonutil.LoggerForJob(xgboostjob).Error(result, "failed to update XGBoost Job conditions in the API server")
		return result
	}

	return nil
}

// SetClusterSpec sets the cluster spec for the pod.
func (r *XGBoostJobReconciler) SetClusterSpec(job interface{}, podTemplate *corev1.PodTemplateSpec, rtype, index string) error {
	return SetPodEnv(job, podTemplate, rtype, index)
}

func (r *XGBoostJobReconciler) GetDefaultContainerName() string {
	return kubeflowv1.XGBoostJobDefaultContainerName
}

func (r *XGBoostJobReconciler) GetDefaultContainerPortName() string {
	return kubeflowv1.XGBoostJobDefaultPortName
}

func (r *XGBoostJobReconciler) IsMasterRole(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec,
	rtype kubeflowv1.ReplicaType, index int) bool {
	return string(rtype) == string(kubeflowv1.XGBoostJobReplicaTypeMaster)
}

// onOwnerCreateFunc modifies the creation condition.
func (r *XGBoostJobReconciler) onOwnerCreateFunc() func(event.CreateEvent) bool {
	return func(e event.CreateEvent) bool {
		xgboostJob, ok := e.Object.(*kubeflowv1.XGBoostJob)
		if !ok {
			return true
		}
		r.Scheme.Default(xgboostJob)
		msg := fmt.Sprintf("XGBoostJob %s is created.", e.Object.GetName())
		logrus.Info(msg)
		trainingoperatorcommon.CreatedJobsCounterInc(xgboostJob.Namespace, r.GetFrameworkName())
		commonutil.UpdateJobConditions(&xgboostJob.Status, kubeflowv1.JobCreated, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.XGBoostJobKind, commonutil.JobCreatedReason), msg)
		return true
	}
}