github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/paddlepaddle/paddlepaddle_controller.go

// Copyright 2022 The Kubeflow Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package paddle

import (
	"context"
	"fmt"
	"strings"
	"time"

	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
	trainingoperatorcommon "github.com/kubeflow/training-operator/pkg/common"
	"github.com/kubeflow/training-operator/pkg/common/util"
	"github.com/kubeflow/training-operator/pkg/controller.v1/common"
	"github.com/kubeflow/training-operator/pkg/controller.v1/control"
	"github.com/kubeflow/training-operator/pkg/controller.v1/expectation"
	commonutil "github.com/kubeflow/training-operator/pkg/util"

	"github.com/go-logr/logr"
	"github.com/sirupsen/logrus"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/equality"
	"k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/types"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/client-go/informers"
	kubeclientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/record"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/event"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/manager"
	"sigs.k8s.io/controller-runtime/pkg/predicate"
	"sigs.k8s.io/controller-runtime/pkg/source"
	schedulerpluginsv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1"
	"volcano.sh/apis/pkg/apis/scheduling/v1beta1"
)

const (
	controllerName = "paddlejob-controller"
)

// NewReconciler creates a PaddleJob Reconciler
func NewReconciler(mgr manager.Manager, gangSchedulingSetupFunc common.GangSchedulingSetupFunc) *PaddleJobReconciler {
	r := &PaddleJobReconciler{
		Client:    mgr.GetClient(),
		Scheme:    mgr.GetScheme(),
		recorder:  mgr.GetEventRecorderFor(controllerName),
		apiReader: mgr.GetAPIReader(),
		Log:       log.Log,
	}

	// Create clients
	cfg := mgr.GetConfig()
	kubeClientSet := kubeclientset.NewForConfigOrDie(cfg)
	sharedInformers := informers.NewSharedInformerFactory(kubeClientSet, 0)
	priorityClassInformer := sharedInformers.Scheduling().V1().PriorityClasses()

	// Initialize common job controller
	r.JobController = common.JobController{
		Controller:                  r,
		Expectations:                expectation.NewControllerExpectations(),
		WorkQueue:                   &util.FakeWorkQueue{},
		Recorder:                    r.recorder,
		KubeClientSet:               kubeClientSet,
		PriorityClassLister:         priorityClassInformer.Lister(),
		PriorityClassInformerSynced: priorityClassInformer.Informer().HasSynced,
		PodControl:                  control.RealPodControl{KubeClient: kubeClientSet, Recorder: r.recorder},
		ServiceControl:              control.RealServiceControl{KubeClient: kubeClientSet, Recorder: r.recorder},
	}

	gangSchedulingSetupFunc(&r.JobController)

	return r
}

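// How this reconciler is typically wired up from the operator's entrypoint is
// sketched below. This is a minimal, illustrative example only: the names mgr,
// gangSchedulingSetupFunc, controllerThreads, and setupLog are assumptions, not
// part of this file; NewReconciler and SetupWithManager are defined in this package.
//
//	reconciler := NewReconciler(mgr, gangSchedulingSetupFunc)
//	if err := reconciler.SetupWithManager(mgr, controllerThreads); err != nil {
//		setupLog.Error(err, "unable to set up controller", "controller", kubeflowv1.PaddleJobKind)
//		os.Exit(1)
//	}
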
// PaddleJobReconciler reconciles a PaddleJob object
type PaddleJobReconciler struct {
	common.JobController
	client.Client
	Scheme    *runtime.Scheme
	Log       logr.Logger
	recorder  record.EventRecorder
	apiReader client.Reader
}

//+kubebuilder:rbac:groups=kubeflow.org,resources=paddlejobs,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=kubeflow.org,resources=paddlejobs/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=kubeflow.org,resources=paddlejobs/finalizers,verbs=update
//+kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;delete
//+kubebuilder:rbac:groups=scheduling.volcano.sh,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch;delete

// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state. It
// compares the state specified by the PaddleJob object against the actual
// cluster state, and then performs operations to make the cluster state
// reflect the state specified by the user.
//
// For more details, check Reconcile and its Result here:
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.8.3/pkg/reconcile
func (r *PaddleJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	_ = log.FromContext(ctx)
	logger := r.Log.WithValues(kubeflowv1.PaddleJobSingular, req.NamespacedName)

	paddlejob := &kubeflowv1.PaddleJob{}
	err := r.Get(ctx, req.NamespacedName, paddlejob)
	if err != nil {
		logger.Info(err.Error(), "unable to fetch PaddleJob", req.NamespacedName.String())
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}

	if err = kubeflowv1.ValidateV1PaddleJob(paddlejob); err != nil {
		logger.Error(err, "PaddleJob failed validation")
		r.Recorder.Eventf(paddlejob, corev1.EventTypeWarning, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobFailedValidationReason),
			"PaddleJob failed validation because %s", err)
		return ctrl.Result{}, err
	}

	// Check if reconciliation is needed
	jobKey, err := common.KeyFunc(paddlejob)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get jobKey for job object %#v: %v", paddlejob, err))
	}

	replicaTypes := util.GetReplicaTypes(paddlejob.Spec.PaddleReplicaSpecs)
	needReconcile := util.SatisfiedExpectations(r.Expectations, jobKey, replicaTypes)

	if !needReconcile || paddlejob.GetDeletionTimestamp() != nil {
		logger.Info("reconcile cancelled, job does not need to do reconcile or has been deleted",
			"sync", needReconcile, "deleted", paddlejob.GetDeletionTimestamp() != nil)
		return ctrl.Result{}, nil
	}

	// Apply defaults to the paddle job
	r.Scheme.Default(paddlejob)

	// Use the common JobController to reconcile the job's pods and services
	err = r.ReconcileJobs(paddlejob, paddlejob.Spec.PaddleReplicaSpecs, paddlejob.Status, &paddlejob.Spec.RunPolicy)
	if err != nil {
		logger.Error(err, "Reconcile PaddleJob error")
		return ctrl.Result{}, err
	}

	t, err := util.DurationUntilExpireTime(&paddlejob.Spec.RunPolicy, paddlejob.Status)
	if err != nil {
		logrus.Warnf("Reconcile PaddleJob error %v", err)
		return ctrl.Result{}, err
	}
	if t >= 0 {
		return ctrl.Result{Requeue: true, RequeueAfter: t}, nil
	}

	return ctrl.Result{}, nil
}

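// Note on requeueing: when a finished job sets RunPolicy.TTLSecondsAfterFinished,
// util.DurationUntilExpireTime above is expected to return a non-negative duration,
// so the job is requeued and cleaned up once the TTL expires. A minimal sketch of
// expressing that policy on a PaddleJob (the value is illustrative):
//
//	ttl := int32(600)
//	paddlejob.Spec.RunPolicy.TTLSecondsAfterFinished = &ttl // requeue roughly 10 minutes after the job finishes
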
// SetupWithManager sets up the controller with the Manager.
func (r *PaddleJobReconciler) SetupWithManager(mgr ctrl.Manager, controllerThreads int) error {
	c, err := controller.New(r.ControllerName(), mgr, controller.Options{
		Reconciler:              r,
		MaxConcurrentReconciles: controllerThreads,
	})

	if err != nil {
		return err
	}

	// using onOwnerCreateFunc is easier to set defaults
	if err = c.Watch(source.Kind(mgr.GetCache(), &kubeflowv1.PaddleJob{}), &handler.EnqueueRequestForObject{},
		predicate.Funcs{CreateFunc: r.onOwnerCreateFunc()},
	); err != nil {
		return err
	}

	// eventHandler for owned objects
	eventHandler := handler.EnqueueRequestForOwner(mgr.GetScheme(), mgr.GetRESTMapper(), &kubeflowv1.PaddleJob{}, handler.OnlyControllerOwner())
	predicates := predicate.Funcs{
		CreateFunc: util.OnDependentCreateFunc(r.Expectations),
		UpdateFunc: util.OnDependentUpdateFunc(&r.JobController),
		DeleteFunc: util.OnDependentDeleteFunc(r.Expectations),
	}
	// Create generic predicates
	genericPredicates := predicate.Funcs{
		CreateFunc: util.OnDependentCreateFuncGeneric(r.Expectations),
		UpdateFunc: util.OnDependentUpdateFuncGeneric(&r.JobController),
		DeleteFunc: util.OnDependentDeleteFuncGeneric(r.Expectations),
	}
	// inject watching for job related pod
	if err = c.Watch(source.Kind(mgr.GetCache(), &corev1.Pod{}), eventHandler, predicates); err != nil {
		return err
	}
	// inject watching for job related service
	if err = c.Watch(source.Kind(mgr.GetCache(), &corev1.Service{}), eventHandler, predicates); err != nil {
		return err
	}
	// skip watching volcano PodGroup if volcano PodGroup is not installed
	if _, err = mgr.GetRESTMapper().RESTMapping(schema.GroupKind{Group: v1beta1.GroupName, Kind: "PodGroup"},
		v1beta1.SchemeGroupVersion.Version,
	); err == nil {
		// inject watching for job related volcano PodGroup
		if err = c.Watch(source.Kind(mgr.GetCache(), &v1beta1.PodGroup{}), eventHandler, genericPredicates); err != nil {
			return err
		}
	}
	// skip watching scheduler-plugins PodGroup if scheduler-plugins PodGroup is not installed
	if _, err = mgr.GetRESTMapper().RESTMapping(
		schema.GroupKind{Group: schedulerpluginsv1alpha1.SchemeGroupVersion.Group, Kind: "PodGroup"},
		schedulerpluginsv1alpha1.SchemeGroupVersion.Version,
	); err == nil {
		// inject watching for job related scheduler-plugins PodGroup
		if err = c.Watch(source.Kind(mgr.GetCache(), &schedulerpluginsv1alpha1.PodGroup{}), eventHandler, genericPredicates); err != nil {
			return err
		}
	}

	return nil
}

func (r *PaddleJobReconciler) ControllerName() string {
	return controllerName
}

func (r *PaddleJobReconciler) GetAPIGroupVersionKind() schema.GroupVersionKind {
	return kubeflowv1.GroupVersion.WithKind(kubeflowv1.PaddleJobKind)
}

func (r *PaddleJobReconciler) GetAPIGroupVersion() schema.GroupVersion {
	return kubeflowv1.GroupVersion
}

func (r *PaddleJobReconciler) GetGroupNameLabelValue() string {
	return kubeflowv1.GroupVersion.Group
}

func (r *PaddleJobReconciler) GetFrameworkName() string {
	return kubeflowv1.PaddleJobFrameworkName
}

func (r *PaddleJobReconciler) GetJobFromInformerCache(namespace, name string) (metav1.Object, error) {
	job := &kubeflowv1.PaddleJob{}
	err := r.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: name}, job)
	if err != nil {
		if errors.IsNotFound(err) {
			logrus.Error(err, "paddle job not found", "namespace", namespace, "name", name)
		} else {
			logrus.Error(err, "failed to get job from api-server", "namespace", namespace, "name", name)
		}
		return nil, err
	}
	return job, nil
}

func (r *PaddleJobReconciler) GetJobFromAPIClient(namespace, name string) (metav1.Object, error) {
	job := &kubeflowv1.PaddleJob{}

	err := r.apiReader.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: name}, job)
	if err != nil {
		if errors.IsNotFound(err) {
			logrus.Error(err, "paddle job not found", "namespace", namespace, "name", name)
		} else {
			logrus.Error(err, "failed to get job from api-server", "namespace", namespace, "name", name)
		}
		return nil, err
	}
	return job, nil
}

func (r *PaddleJobReconciler) GetPodsForJob(obj interface{}) ([]*corev1.Pod, error) {
	job, err := meta.Accessor(obj)
	if err != nil {
		return nil, err
	}

	// List all pods to include those that don't match the selector anymore
	// but have a ControllerRef pointing to this controller.
	podlist := &corev1.PodList{}
	err = r.List(context.Background(), podlist, client.MatchingLabels(r.GenLabels(job.GetName())), client.InNamespace(job.GetNamespace()))
	if err != nil {
		return nil, err
	}

	return util.JobControlledPodList(podlist.Items, job), nil
}

func (r *PaddleJobReconciler) GetServicesForJob(obj interface{}) ([]*corev1.Service, error) {
	job, err := meta.Accessor(obj)
	if err != nil {
		return nil, err
	}

	// List all services to include those that don't match the selector anymore
	// but have a ControllerRef pointing to this controller.
	serviceList := &corev1.ServiceList{}
	err = r.List(context.Background(), serviceList, client.MatchingLabels(r.GenLabels(job.GetName())), client.InNamespace(job.GetNamespace()))
	if err != nil {
		return nil, err
	}

	ret := util.ConvertServiceList(serviceList.Items)
	return ret, nil
}

func (r *PaddleJobReconciler) DeleteJob(job interface{}) error {
	paddlejob, ok := job.(*kubeflowv1.PaddleJob)
	if !ok {
		return fmt.Errorf("%+v is not a type of PaddleJob", job)
	}
	if err := r.Delete(context.Background(), paddlejob); err != nil {
		r.recorder.Eventf(paddlejob, corev1.EventTypeWarning, control.FailedDeletePodReason, "Error deleting: %v", err)
		logrus.Error(err, "failed to delete job", "namespace", paddlejob.Namespace, "name", paddlejob.Name)
		return err
	}
	r.recorder.Eventf(paddlejob, corev1.EventTypeNormal, control.SuccessfulDeletePodReason, "Deleted job: %v", paddlejob.Name)
	logrus.Info("job deleted", "namespace", paddlejob.Namespace, "name", paddlejob.Name)
	trainingoperatorcommon.DeletedJobsCounterInc(paddlejob.Namespace, r.GetFrameworkName())
	return nil
}

func (jc *PaddleJobReconciler) GenLabelSelector(jobName string,
	rtype kubeflowv1.ReplicaType) *metav1.LabelSelector {
	labels := jc.GenLabels(jobName)
	labels[kubeflowv1.ReplicaTypeLabel] = strings.ToLower(string(rtype))

	return &metav1.LabelSelector{
		MatchLabels: labels,
	}
}

// UpdateJobStatus updates the job status and job conditions
func (r *PaddleJobReconciler) UpdateJobStatus(job interface{},
	replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec,
	jobStatus *kubeflowv1.JobStatus) error {
	paddlejob, ok := job.(*kubeflowv1.PaddleJob)
	if !ok {
		return fmt.Errorf("%+v is not a type of PaddleJob", job)
	}

	paddlejobKey, err := common.KeyFunc(paddlejob)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for paddlejob object %#v: %v", paddlejob, err))
		return err
	}

	logger := commonutil.LoggerForJob(paddlejob)

	// Set StartTime.
	if jobStatus.StartTime == nil {
		now := metav1.Now()
		jobStatus.StartTime = &now
		// enqueue a sync to check if job past ActiveDeadlineSeconds
		if paddlejob.Spec.RunPolicy.ActiveDeadlineSeconds != nil {
			logger.Infof("Job with ActiveDeadlineSeconds will sync after %d seconds", *paddlejob.Spec.RunPolicy.ActiveDeadlineSeconds)
			r.WorkQueue.AddAfter(paddlejobKey, time.Duration(*paddlejob.Spec.RunPolicy.ActiveDeadlineSeconds)*time.Second)
		}
	}

	for rtype, spec := range replicas {
		status := jobStatus.ReplicaStatuses[rtype]
		// Generate the label selector.
		status.Selector = metav1.FormatLabelSelector(r.GenLabelSelector(paddlejob.Name, rtype))

		succeeded := status.Succeeded
		expected := *(spec.Replicas) - succeeded
		running := status.Active
		failed := status.Failed
		specReplicas := *spec.Replicas

		logrus.Infof("PaddleJob=%s, ReplicaType=%s expected=%d, running=%d, succeeded=%d, failed=%d, Replicas=%d",
			paddlejob.Name, rtype, expected, running, succeeded, failed, specReplicas)

		if ContainsMasterSpec(replicas) {
			if rtype == kubeflowv1.PaddleJobReplicaTypeMaster {
				if running > 0 {
					msg := fmt.Sprintf("PaddleJob %s is running.", paddlejob.Name)
					commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobRunningReason), msg)
				}
				// when the master has succeeded, the job is finished.
				if expected == 0 {
					msg := fmt.Sprintf("PaddleJob %s is successfully completed.", paddlejob.Name)
					logrus.Info(msg)
					r.Recorder.Event(paddlejob, corev1.EventTypeNormal, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobSucceededReason), msg)
					if jobStatus.CompletionTime == nil {
						now := metav1.Now()
						jobStatus.CompletionTime = &now
					}
					commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobSucceeded, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobSucceededReason), msg)
					trainingoperatorcommon.SuccessfulJobsCounterInc(paddlejob.Namespace, r.GetFrameworkName())
					return nil
				}
			}
		} else {
			if rtype == kubeflowv1.PaddleJobReplicaTypeWorker {
				// TODO(gaocegege): Support SuccessPolicy
				if expected == 0 {
					msg := fmt.Sprintf("PaddleJob %s/%s successfully completed.",
						paddlejob.Namespace, paddlejob.Name)
					r.recorder.Event(paddlejob, corev1.EventTypeNormal, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobSucceededReason), msg)
					if jobStatus.CompletionTime == nil {
						now := metav1.Now()
						jobStatus.CompletionTime = &now
					}
					commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobSucceeded, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobSucceededReason), msg)
					trainingoperatorcommon.SuccessfulJobsCounterInc(paddlejob.Namespace, r.GetFrameworkName())
				} else if running > 0 {
					// Some workers are still running, leave a running condition.
					msg := fmt.Sprintf("PaddleJob %s/%s is running.",
						paddlejob.Namespace, paddlejob.Name)
					commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobRunningReason), msg)
				}
			}
		}

		if failed > 0 && (specReplicas > succeeded+running) {
			if spec.RestartPolicy != kubeflowv1.RestartPolicyNever {
				msg := fmt.Sprintf("PaddleJob %s is restarting because %d %s replica(s) failed.", paddlejob.Name, failed, rtype)
				r.Recorder.Event(paddlejob, corev1.EventTypeWarning, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobRestartingReason), msg)
				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRestarting, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobRestartingReason), msg)
				trainingoperatorcommon.RestartedJobsCounterInc(paddlejob.Namespace, r.GetFrameworkName())
			} else {
				msg := fmt.Sprintf("PaddleJob %s is failed because %d %s replica(s) failed.", paddlejob.Name, failed, rtype)
				r.Recorder.Event(paddlejob, corev1.EventTypeNormal, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobFailedReason), msg)
				if jobStatus.CompletionTime == nil {
					now := metav1.Now()
					jobStatus.CompletionTime = &now
				}
				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobFailed, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobFailedReason), msg)
				trainingoperatorcommon.FailedJobsCounterInc(paddlejob.Namespace, r.GetFrameworkName())
			}
		}
	}

	return nil
}

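// The bookkeeping above treats whichever replica type acts as the completion
// marker differently: with a Master replica declared, the job completes once the
// master succeeds; otherwise every Worker replica must succeed. An illustrative
// trace of the "expected" arithmetic for a worker-only job (numbers are made up):
//
//	// spec.Replicas = 4, status.Succeeded = 4
//	// expected = *spec.Replicas - succeeded = 4 - 4 = 0 -> the JobSucceeded condition is set
//	// spec.Replicas = 4, status.Succeeded = 2, status.Active = 2
//	// expected = 2, running = 2                         -> the JobRunning condition is kept
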
// ContainsMasterSpec returns true if the paddlejob contains master spec.
func ContainsMasterSpec(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec) bool {
	if _, ok := replicas[kubeflowv1.PaddleJobReplicaTypeMaster]; ok {
		return true
	}
	return false
}

// UpdateJobStatusInApiServer updates the job status in the cluster.
func (r *PaddleJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *kubeflowv1.JobStatus) error {
	if jobStatus.ReplicaStatuses == nil {
		jobStatus.ReplicaStatuses = map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaStatus{}
	}

	paddlejob, ok := job.(*kubeflowv1.PaddleJob)
	if !ok {
		return fmt.Errorf("%+v is not a type of PaddleJob", job)
	}
	trainingoperatorcommon.ClearGeneratedFields(&paddlejob.ObjectMeta)

	// If the job status passed in differs from the status stored in the job,
	// update the job on the basis of the passed-in one.
	if !equality.Semantic.DeepEqual(&paddlejob.Status, jobStatus) {
		paddlejob = paddlejob.DeepCopy()
		paddlejob.Status = *jobStatus.DeepCopy()
	}

	result := r.Status().Update(context.Background(), paddlejob)

	if result != nil {
		r.Log.WithValues("paddlejob", types.NamespacedName{
			Namespace: paddlejob.GetNamespace(),
			Name:      paddlejob.GetName(),
		})
		return result
	}

	return nil
}

// SetClusterSpec sets the cluster spec and init container for the pod
func (r *PaddleJobReconciler) SetClusterSpec(job interface{}, podTemplate *corev1.PodTemplateSpec, rtype, index string) error {
	// TODO
	if err := setPodEnv(job, podTemplate, rtype, index); err != nil {
		return err
	}
	return nil
}

func (r *PaddleJobReconciler) GetDefaultContainerName() string {
	return kubeflowv1.PaddleJobDefaultContainerName
}

func (r *PaddleJobReconciler) GetDefaultContainerPortName() string {
	return kubeflowv1.PaddleJobDefaultPortName
}

func (r *PaddleJobReconciler) IsMasterRole(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec,
	rtype kubeflowv1.ReplicaType, index int) bool {
	return string(rtype) == string(kubeflowv1.PaddleJobReplicaTypeMaster)
}

// onOwnerCreateFunc modifies the creation condition.
func (r *PaddleJobReconciler) onOwnerCreateFunc() func(event.CreateEvent) bool {
	return func(e event.CreateEvent) bool {
		paddlejob, ok := e.Object.(*kubeflowv1.PaddleJob)
		if !ok {
			return true
		}
		r.Scheme.Default(paddlejob)
		msg := fmt.Sprintf("PaddleJob %s is created.", e.Object.GetName())
		logrus.Info(msg)
		trainingoperatorcommon.CreatedJobsCounterInc(paddlejob.Namespace, r.GetFrameworkName())
		commonutil.UpdateJobConditions(&paddlejob.Status, kubeflowv1.JobCreated, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.PaddleJobKind, commonutil.JobCreatedReason), msg)
		return true
	}
}
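
// The exported methods above are the hooks that the embedded common.JobController
// drives from ReconcileJobs: GetPodsForJob / GetServicesForJob enumerate owned
// objects, SetClusterSpec injects the distributed-training environment into each
// pod template, and UpdateJobStatus / UpdateJobStatusInApiServer record the
// resulting conditions. A rough, simplified sketch of the per-reconcile call order
// (the real sequencing lives in pkg/controller.v1/common; variable names here are
// illustrative):
//
//	pods, _ := r.GetPodsForJob(paddlejob)
//	services, _ := r.GetServicesForJob(paddlejob)
//	_ = r.SetClusterSpec(paddlejob, podTemplate, rtype, index) // for each pod to be (re)created
//	_ = r.UpdateJobStatus(paddlejob, paddlejob.Spec.PaddleReplicaSpecs, &jobStatus)
//	_ = r.UpdateJobStatusInApiServer(paddlejob, &jobStatus)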