github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/mpi/mpijob_controller.go

// Copyright 2021 The Kubeflow Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package mpi

import (
	"bytes"
	"context"
	"fmt"
	"reflect"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/go-logr/logr"
	"github.com/sirupsen/logrus"
	corev1 "k8s.io/api/core/v1"
	rbacv1 "k8s.io/api/rbac/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/informers"
	kubeclientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/record"
	"k8s.io/klog"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/event"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/manager"
	"sigs.k8s.io/controller-runtime/pkg/predicate"
	"sigs.k8s.io/controller-runtime/pkg/source"
	schedulerpluginsv1alpha1 "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1"
	"volcano.sh/apis/pkg/apis/scheduling/v1beta1"

	kubeflowv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
	trainingoperatorcommon "github.com/kubeflow/training-operator/pkg/common"
	"github.com/kubeflow/training-operator/pkg/common/util"
	ctlrconfig "github.com/kubeflow/training-operator/pkg/config"
	"github.com/kubeflow/training-operator/pkg/controller.v1/common"
	"github.com/kubeflow/training-operator/pkg/controller.v1/control"
	"github.com/kubeflow/training-operator/pkg/controller.v1/expectation"
	commonutil "github.com/kubeflow/training-operator/pkg/util"
)

const (
	FailedDeleteJobReason     = "FailedDeleteJob"
	SuccessfulDeleteJobReason = "SuccessfulDeleteJob"

	controllerName  = "mpijob-controller"
	labelMPIJobName = "mpi-job-name"
)

func NewReconciler(mgr manager.Manager, gangSchedulingSetupFunc common.GangSchedulingSetupFunc) *MPIJobReconciler {
	r := &MPIJobReconciler{
		Client:    mgr.GetClient(),
		Scheme:    mgr.GetScheme(),
		recorder:  mgr.GetEventRecorderFor(controllerName),
		apiReader: mgr.GetAPIReader(),
		Log:       log.Log,
	}

	cfg := mgr.GetConfig()
	kubeClientSet := kubeclientset.NewForConfigOrDie(cfg)
	sharedInformers := informers.NewSharedInformerFactory(kubeClientSet, 0)
	priorityClassInformer := sharedInformers.Scheduling().V1().PriorityClasses()

	r.JobController = common.JobController{
		Controller:                  r,
		Expectations:                expectation.NewControllerExpectations(),
		WorkQueue:                   &util.FakeWorkQueue{},
		Recorder:                    r.recorder,
		KubeClientSet:               kubeClientSet,
		PriorityClassLister:         priorityClassInformer.Lister(),
		PriorityClassInformerSynced: priorityClassInformer.Informer().HasSynced,
		PodControl:                  control.RealPodControl{KubeClient: kubeClientSet, Recorder: r.recorder},
		ServiceControl:              control.RealServiceControl{KubeClient: kubeClientSet, Recorder: r.recorder},
	}

	gangSchedulingSetupFunc(&r.JobController)

	return r
}

// MPIJobReconciler reconciles a MPIJob object
type MPIJobReconciler struct {
	common.JobController
	client.Client
	Scheme    *runtime.Scheme
	recorder  record.EventRecorder
	apiReader client.Reader
	Log       logr.Logger
}

//+kubebuilder:rbac:groups=kubeflow.org,resources=mpijobs,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=kubeflow.org,resources=mpijobs/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=kubeflow.org,resources=mpijobs/finalizers,verbs=update
//+kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups="",resources=serviceaccounts,verbs=get;list;watch;create
//+kubebuilder:rbac:groups="",resources=configmaps,verbs=list;watch;create;update
//+kubebuilder:rbac:groups="rbac.authorization.k8s.io",resources=roles,verbs=list;watch;create;update
//+kubebuilder:rbac:groups="rbac.authorization.k8s.io",resources=rolebindings,verbs=list;watch;create;update
//+kubebuilder:rbac:groups="",resources=pods/exec,verbs=create
//+kubebuilder:rbac:groups=scheduling.volcano.sh,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch;delete
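
// The sketch below is illustrative only and is not part of the original file: it shows one
// plausible way NewReconciler and SetupWithManager could be wired into a controller-runtime
// manager. The scheme registration and the no-op gang-scheduling setup function are assumptions
// for the example (the real operator does this wiring in its own command entrypoint), and the
// example assumes common.GangSchedulingSetupFunc is a func(*common.JobController).
//
//	scheme := runtime.NewScheme()
//	_ = kubeflowv1.AddToScheme(scheme)
//	_ = corev1.AddToScheme(scheme)
//
//	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{Scheme: scheme})
//	if err != nil {
//		panic(err)
//	}
//
//	// Hypothetical no-op gang-scheduling setup.
//	noGang := func(jc *common.JobController) {}
//
//	reconciler := NewReconciler(mgr, noGang)
//	if err := reconciler.SetupWithManager(mgr, 1); err != nil {
//		panic(err)
//	}
//	if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
//		panic(err)
//	}
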
// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
func (jc *MPIJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	_ = log.FromContext(ctx)
	logger := jc.Log.WithValues(kubeflowv1.MPIJobSingular, req.NamespacedName)

	mpijob := &kubeflowv1.MPIJob{}
	err := jc.Get(ctx, req.NamespacedName, mpijob)
	if err != nil {
		logger.Info(err.Error(), "unable to fetch MPIJob", req.NamespacedName.String())
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}

	if err = kubeflowv1.ValidateV1MpiJobSpec(&mpijob.Spec); err != nil {
		logger.Error(err, "MPIJob failed validation")
		jc.Recorder.Eventf(mpijob, corev1.EventTypeWarning, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobFailedValidationReason),
			"MPIJob failed validation because %s", err)
		return ctrl.Result{}, err
	}

	// Skip reconciliation for an MPIJob that is being deleted.
	if mpijob.GetDeletionTimestamp() != nil {
		return ctrl.Result{}, nil
	}

	// Set default values for the MPIJob.
	jc.Scheme.Default(mpijob)

	// Keep the two CleanPodPolicy fields in sync:
	// 1) validation rules out contradicting values,
	// 2) if both fields are empty, the Default function fills in None,
	// 3) if only one field is set, copy its value to the other.
	cleanPolicyDefined := mpijob.Spec.CleanPodPolicy
	if mpijob.Spec.RunPolicy.CleanPodPolicy != nil {
		cleanPolicyDefined = mpijob.Spec.RunPolicy.CleanPodPolicy
	}
	mpijob.Spec.CleanPodPolicy = cleanPolicyDefined
	mpijob.Spec.RunPolicy.CleanPodPolicy = cleanPolicyDefined

	// Use the common JobController to reconcile the job-related pods and services.
	// MPIJob does not need any services.
	err = jc.ReconcileJobs(mpijob, mpijob.Spec.MPIReplicaSpecs, mpijob.Status, &mpijob.Spec.RunPolicy)
	if err != nil {
		logrus.Warnf("Reconcile MPIJob error %v", err)
		return ctrl.Result{}, err
	}

	t, err := util.DurationUntilExpireTime(&mpijob.Spec.RunPolicy, mpijob.Status)
	if err != nil {
		logrus.Warnf("Reconcile MPIJob error %v", err)
		return ctrl.Result{}, err
	}
	if t >= 0 {
		return ctrl.Result{Requeue: true, RequeueAfter: t}, nil
	}

	return ctrl.Result{}, nil
}
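
// For orientation, a minimal MPIJob manifest consumed by the Reconcile loop above might look
// roughly like the sketch below. It is illustrative only: the name, image, command, and replica
// counts are placeholders, and the field names follow the kubeflow.org/v1 MPIJob API referenced
// here as kubeflowv1.MPIJob.
//
//	apiVersion: kubeflow.org/v1
//	kind: MPIJob
//	metadata:
//	  name: example-mpijob
//	spec:
//	  slotsPerWorker: 1
//	  runPolicy:
//	    cleanPodPolicy: Running
//	  mpiReplicaSpecs:
//	    Launcher:
//	      replicas: 1
//	      template:
//	        spec:
//	          containers:
//	            - name: mpi
//	              image: example.com/mpi-app:latest
//	              command: ["mpirun", "-n", "2", "/opt/app"]
//	    Worker:
//	      replicas: 2
//	      template:
//	        spec:
//	          containers:
//	            - name: mpi
//	              image: example.com/mpi-app:latest
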
// SetupWithManager sets up the controller with the Manager.
func (jc *MPIJobReconciler) SetupWithManager(mgr ctrl.Manager, controllerThreads int) error {
	c, err := controller.New(jc.ControllerName(), mgr, controller.Options{
		Reconciler:              jc,
		MaxConcurrentReconciles: controllerThreads,
	})
	if err != nil {
		return err
	}

	// Using onOwnerCreateFunc makes it easier to set defaults.
	if err = c.Watch(source.Kind(mgr.GetCache(), &kubeflowv1.MPIJob{}), &handler.EnqueueRequestForObject{},
		predicate.Funcs{CreateFunc: jc.onOwnerCreateFunc()},
	); err != nil {
		return err
	}

	// eventHandler for owned objects
	eventHandler := handler.EnqueueRequestForOwner(mgr.GetScheme(), mgr.GetRESTMapper(), &kubeflowv1.MPIJob{}, handler.OnlyControllerOwner())
	predicates := predicate.Funcs{
		CreateFunc: util.OnDependentCreateFunc(jc.Expectations),
		UpdateFunc: util.OnDependentUpdateFunc(&jc.JobController),
		DeleteFunc: util.OnDependentDeleteFunc(jc.Expectations),
	}
	// Create generic predicates
	genericPredicates := predicate.Funcs{
		CreateFunc: util.OnDependentCreateFuncGeneric(jc.Expectations),
		UpdateFunc: util.OnDependentUpdateFuncGeneric(&jc.JobController),
		DeleteFunc: util.OnDependentDeleteFuncGeneric(jc.Expectations),
	}
	// inject watching for job related pod
	if err = c.Watch(source.Kind(mgr.GetCache(), &corev1.Pod{}), eventHandler, predicates); err != nil {
		return err
	}
	// inject watching for job related ConfigMap
	if err = c.Watch(source.Kind(mgr.GetCache(), &corev1.ConfigMap{}), eventHandler, genericPredicates); err != nil {
		return err
	}
	// inject watching for job related Role
	if err = c.Watch(source.Kind(mgr.GetCache(), &rbacv1.Role{}), eventHandler, genericPredicates); err != nil {
		return err
	}
	// inject watching for job related RoleBinding
	if err = c.Watch(source.Kind(mgr.GetCache(), &rbacv1.RoleBinding{}), eventHandler, genericPredicates); err != nil {
		return err
	}
	// inject watching for job related ServiceAccount
	if err = c.Watch(source.Kind(mgr.GetCache(), &corev1.ServiceAccount{}), eventHandler, genericPredicates); err != nil {
		return err
	}
	// skip watching volcano PodGroup if volcano PodGroup is not installed
	if _, err = mgr.GetRESTMapper().RESTMapping(schema.GroupKind{Group: v1beta1.GroupName, Kind: "PodGroup"},
		v1beta1.SchemeGroupVersion.Version,
	); err == nil {
		// inject watching for job related volcano PodGroup
		if err = c.Watch(source.Kind(mgr.GetCache(), &v1beta1.PodGroup{}), eventHandler, genericPredicates); err != nil {
			return err
		}
	}
	// skip watching scheduler-plugins PodGroup if scheduler-plugins PodGroup is not installed
	if _, err = mgr.GetRESTMapper().RESTMapping(
		schema.GroupKind{Group: schedulerpluginsv1alpha1.SchemeGroupVersion.Group, Kind: "PodGroup"},
		schedulerpluginsv1alpha1.SchemeGroupVersion.Version,
	); err == nil {
		// inject watching for job related scheduler-plugins PodGroup
		if err = c.Watch(source.Kind(mgr.GetCache(), &schedulerpluginsv1alpha1.PodGroup{}), eventHandler, genericPredicates); err != nil {
			return err
		}
	}

	return nil
}

// ReconcileServices is overridden because mpi-reconciler.v1 does not need to reconcile services
func (jc *MPIJobReconciler) ReconcileServices(
	job metav1.Object,
	services []*corev1.Service,
	rtype kubeflowv1.ReplicaType,
	spec *kubeflowv1.ReplicaSpec) error {
	return nil
}
func (jc *MPIJobReconciler) ControllerName() string {
	return controllerName
}

func (jc *MPIJobReconciler) GetAPIGroupVersionKind() schema.GroupVersionKind {
	return kubeflowv1.GroupVersion.WithKind(kubeflowv1.MPIJobKind)
}

func (jc *MPIJobReconciler) GetAPIGroupVersion() schema.GroupVersion {
	return kubeflowv1.GroupVersion
}

func (jc *MPIJobReconciler) GetGroupNameLabelValue() string {
	return kubeflowv1.GroupVersion.Group
}

func (jc *MPIJobReconciler) GetFrameworkName() string {
	return kubeflowv1.MPIJobFrameworkName
}

// SetClusterSpec is overridden because no cluster spec is needed for MPIJob
func (jc *MPIJobReconciler) SetClusterSpec(job interface{}, podTemplate *corev1.PodTemplateSpec, rtype, index string) error {
	return nil
}

func (jc *MPIJobReconciler) GetDefaultContainerName() string {
	return kubeflowv1.MPIJobDefaultContainerName
}

func (jc *MPIJobReconciler) GetDefaultContainerPortName() string {
	return kubeflowv1.MPIJobDefaultPortName
}

func (jc *MPIJobReconciler) IsMasterRole(replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec,
	rtype kubeflowv1.ReplicaType, index int) bool {
	return string(rtype) == string(kubeflowv1.MPIJobReplicaTypeLauncher)
}

func (jc *MPIJobReconciler) GetJobFromInformerCache(namespace, name string) (metav1.Object, error) {
	mpijob := &kubeflowv1.MPIJob{}
	err := jc.Get(context.Background(), types.NamespacedName{
		Namespace: namespace, Name: name,
	}, mpijob)
	return mpijob, err
}

// onOwnerCreateFunc modifies the creation condition.
func (jc *MPIJobReconciler) onOwnerCreateFunc() func(event.CreateEvent) bool {
	return func(e event.CreateEvent) bool {
		mpiJob, ok := e.Object.(*kubeflowv1.MPIJob)
		if !ok {
			return true
		}

		jc.Scheme.Default(mpiJob)
		msg := fmt.Sprintf("MPIJob %s is created.", e.Object.GetName())
		logrus.Info(msg)
		trainingoperatorcommon.CreatedJobsCounterInc(mpiJob.Namespace, jc.GetFrameworkName())
		commonutil.UpdateJobConditions(&mpiJob.Status, kubeflowv1.JobCreated, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobCreatedReason), msg)
		return true
	}
}

func (jc *MPIJobReconciler) ReconcilePods(
	job interface{},
	jobStatus *kubeflowv1.JobStatus,
	pods []*corev1.Pod,
	rtype kubeflowv1.ReplicaType,
	spec *kubeflowv1.ReplicaSpec,
	replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec,
) error {

	mpiJob, ok := job.(*kubeflowv1.MPIJob)
	if !ok {
		return fmt.Errorf("%+v is not a type of MPIJob", job)
	}

	// First, set StartTime.
	if jobStatus.StartTime == nil {
		now := metav1.Now()
		jobStatus.StartTime = &now
	}

	initializeReplicaStatuses(jobStatus, rtype)

	// Get the launcher Pod for this MPIJob.
	launcher, err := jc.getLauncherJob(mpiJob)
	if err != nil {
		return err
	}

	var worker []*corev1.Pod
	// We're done if the launcher either succeeded or failed.
	done := launcher != nil && isPodFinished(launcher)

	if !done {
		workerSpec := mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeWorker]
		workerReplicas := int32(0)
		if workerSpec != nil && workerSpec.Replicas != nil {
			workerReplicas = *workerSpec.Replicas
		}
		isGPULauncher := isGPULauncher(mpiJob)

		// Get the launcher ServiceAccount for this MPIJob.
		if sa, err := jc.getOrCreateLauncherServiceAccount(mpiJob); sa == nil || err != nil {
			return err
		}

		// Get the ConfigMap for this MPIJob.
		if config, err := jc.getOrCreateConfigMap(mpiJob, workerReplicas, isGPULauncher); config == nil || err != nil {
			return err
		}

		// Get the launcher Role for this MPIJob.
		if r, err := jc.getOrCreateLauncherRole(mpiJob, workerReplicas); r == nil || err != nil {
			return err
		}

		// Get the launcher RoleBinding for this MPIJob.
		if rb, err := jc.getLauncherRoleBinding(mpiJob); rb == nil || err != nil {
			return err
		}

		worker, err = jc.getOrCreateWorker(mpiJob)
		if err != nil {
			return err
		}

		if launcher == nil {
			launcher, err = jc.KubeClientSet.CoreV1().Pods(mpiJob.Namespace).Create(context.Background(), jc.newLauncher(mpiJob, ctlrconfig.Config.MPIKubectlDeliveryImage, isGPULauncher), metav1.CreateOptions{})
			if err != nil {
				jc.Recorder.Eventf(mpiJob, corev1.EventTypeWarning, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobFailedReason), "launcher pod creation failed: %v", err)
				return err
			} else {
				jc.Recorder.Eventf(mpiJob, corev1.EventTypeNormal, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobRunningReason), "launcher pod created successfully: %v", launcher.Name)
			}
		}
	}

	// Finally, we update the status block of the MPIJob resource to reflect the
	// current state of the world.
	err = jc.updateMPIJobStatus(mpiJob, launcher, worker)
	if err != nil {
		return err
	}
	return nil
}

func (jc *MPIJobReconciler) updateMPIJobStatus(mpiJob *kubeflowv1.MPIJob, launcher *corev1.Pod, worker []*corev1.Pod) error {
	if launcher != nil {
		initializeMPIJobStatuses(mpiJob, kubeflowv1.MPIJobReplicaTypeLauncher)
		if isPodSucceeded(launcher) {
			mpiJob.Status.ReplicaStatuses[kubeflowv1.MPIJobReplicaTypeLauncher].Succeeded = 1
			msg := fmt.Sprintf("MPIJob %s/%s successfully completed.", mpiJob.Namespace, mpiJob.Name)
			jc.Recorder.Event(mpiJob, corev1.EventTypeNormal, commonutil.NewReason(kubeflowv1.MPIJobPlural, commonutil.JobSucceededReason), msg)
			if mpiJob.Status.CompletionTime == nil {
				now := metav1.Now()
				mpiJob.Status.CompletionTime = &now
			}
			err := updateMPIJobConditions(mpiJob, kubeflowv1.JobSucceeded, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobSucceededReason), msg)
			if err != nil {
				return err
			}
		} else if isPodFailed(launcher) {
			mpiJob.Status.ReplicaStatuses[kubeflowv1.MPIJobReplicaTypeLauncher].Failed = 1
			msg := fmt.Sprintf("MPIJob %s/%s has failed", mpiJob.Namespace, mpiJob.Name)
			reason := launcher.Status.Reason
			if reason == "" {
				reason = commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobFailedReason)
			}
			jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, reason, msg)
			if reason == "Evicted" {
				reason = mpiJobEvict
			} else if !isEvicted(mpiJob.Status) && mpiJob.Status.CompletionTime == nil {
				now := metav1.Now()
				mpiJob.Status.CompletionTime = &now
			}
			err := updateMPIJobConditions(mpiJob, kubeflowv1.JobFailed, reason, msg)
			if err != nil {
				klog.Errorf("Append mpiJob(%s/%s) condition error: %v", mpiJob.Namespace, mpiJob.Name, err)
				return err
			}

		} else if isPodRunning(launcher) {
			mpiJob.Status.ReplicaStatuses[kubeflowv1.MPIJobReplicaTypeLauncher].Active = 1
		}
	}

	var (
		running = 0
		evict   = 0
	)

	initializeMPIJobStatuses(mpiJob, kubeflowv1.MPIJobReplicaTypeWorker)
	for i := 0; i < len(worker); i++ {
		switch worker[i].Status.Phase {
		case corev1.PodFailed:
			mpiJob.Status.ReplicaStatuses[kubeflowv1.MPIJobReplicaTypeWorker].Failed += 1
			if worker[i].Status.Reason == "Evicted" {
				evict += 1
			}
		case corev1.PodSucceeded:
			mpiJob.Status.ReplicaStatuses[kubeflowv1.MPIJobReplicaTypeWorker].Succeeded += 1
		case corev1.PodRunning:
			running += 1
			mpiJob.Status.ReplicaStatuses[kubeflowv1.MPIJobReplicaTypeWorker].Active += 1
		}
	}
	if evict > 0 {
		msg := fmt.Sprintf("%d/%d workers are evicted", evict, len(worker))
		if err := updateMPIJobConditions(mpiJob, kubeflowv1.JobFailed, mpiJobEvict, msg); err != nil {
			return err
		}
		jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, mpiJobEvict, msg)
	}

	if launcher != nil && launcher.Status.Phase == corev1.PodRunning && running == len(worker) {
		msg := fmt.Sprintf("MPIJob %s/%s is running.", mpiJob.Namespace, mpiJob.Name)
		err := updateMPIJobConditions(mpiJob, kubeflowv1.JobRunning, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobRunningReason), msg)
		if err != nil {
			return err
		}
		jc.Recorder.Eventf(mpiJob, corev1.EventTypeNormal, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobRunningReason), "MPIJob %s/%s is running", mpiJob.Namespace, mpiJob.Name)
	}
	return nil
}

func (jc *MPIJobReconciler) GetJobFromAPIClient(namespace, name string) (metav1.Object, error) {
	job := &kubeflowv1.MPIJob{}

	err := jc.apiReader.Get(context.Background(), types.NamespacedName{Namespace: namespace, Name: name}, job)
	if err != nil {
		if errors.IsNotFound(err) {
			logrus.Error(err, "MPIJob not found", "namespace", namespace, "name", name)
		} else {
			logrus.Error(err, "failed to get job from api-server", "namespace", namespace, "name", name)
		}
		return nil, err
	}
	return job, nil
}

// GetPodsForJob returns the set of pods that this job should manage.
// It also reconciles ControllerRef by adopting/orphaning.
// Note that the returned Pods are pointers into the cache.
func (jc *MPIJobReconciler) GetPodsForJob(jobObject interface{}) ([]*corev1.Pod, error) {
	job, ok := jobObject.(metav1.Object)
	if !ok {
		return nil, fmt.Errorf("job is not of type metav1.Object")
	}

	// Create selector.
	selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{
		MatchLabels: jc.GenLabels(job.GetName()),
	})

	if err != nil {
		return nil, fmt.Errorf("couldn't convert Job selector: %v", err)
	}
	// List all pods to include those that don't match the selector anymore
	// but have a ControllerRef pointing to this controller.
	podlist := &corev1.PodList{}
	err = jc.List(context.Background(), podlist,
		client.MatchingLabelsSelector{Selector: selector}, client.InNamespace(job.GetNamespace()))
	if err != nil {
		return nil, err
	}

	return util.JobControlledPodList(podlist.Items, job), nil
}

func (jc *MPIJobReconciler) DeleteJob(job interface{}) error {
	mpiJob, ok := job.(*kubeflowv1.MPIJob)
	if !ok {
		return fmt.Errorf("%+v is not a type of MPIJob", job)
	}

	log := commonutil.LoggerForJob(mpiJob)
	if err := jc.Delete(context.Background(), mpiJob); err != nil {
		jc.Recorder.Eventf(mpiJob, corev1.EventTypeWarning, FailedDeleteJobReason, "Error deleting: %v", err)
		log.Errorf("failed to delete job %s/%s, %v", mpiJob.Namespace, mpiJob.Name, err)
		return err
	}

	jc.Recorder.Eventf(mpiJob, corev1.EventTypeNormal, SuccessfulDeleteJobReason, "Deleted job: %v", mpiJob.Name)
	log.Infof("job %s/%s has been deleted", mpiJob.Namespace, mpiJob.Name)
	trainingoperatorcommon.DeletedJobsCounterInc(mpiJob.Namespace, jc.GetFrameworkName())
	return nil
}

// GetServicesForJob returns the set of services that this job should manage.
// It also reconciles ControllerRef by adopting/orphaning.
// Note that the returned services are pointers into the cache.
func (jc *MPIJobReconciler) GetServicesForJob(jobObject interface{}) ([]*corev1.Service, error) {
	return nil, nil
}

func (jc *MPIJobReconciler) UpdateJobStatus(job interface{}, replicas map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaSpec, jobStatus *kubeflowv1.JobStatus) error {
	mpiJob, ok := job.(*kubeflowv1.MPIJob)
	if !ok {
		return fmt.Errorf("%+v is not a type of MPIJob", job)
	}

	for rtype, spec := range replicas {
		status := jobStatus.ReplicaStatuses[rtype]

		succeeded := status.Succeeded
		expected := *(spec.Replicas) - succeeded
		running := status.Active
		failed := status.Failed

		logrus.Infof("MPIJob=%s, ReplicaType=%s expected=%d, running=%d, succeeded=%d, failed=%d",
			mpiJob.Name, rtype, expected, running, succeeded, failed)

		if rtype == kubeflowv1.MPIJobReplicaTypeLauncher {
			if running > 0 {
				msg := fmt.Sprintf("MPIJob %s is running.", mpiJob.Name)
				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRunning, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobRunningReason), msg)
			}
			// When the launcher succeeds, the job is finished.
			if expected == 0 {
				msg := fmt.Sprintf("MPIJob %s successfully completed.", mpiJob.Name)
				logrus.Info(msg)
				jc.Recorder.Event(mpiJob, corev1.EventTypeNormal, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobSucceededReason), msg)
				if jobStatus.CompletionTime == nil {
					now := metav1.Now()
					jobStatus.CompletionTime = &now
				}
				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobSucceeded, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobSucceededReason), msg)
				trainingoperatorcommon.SuccessfulJobsCounterInc(mpiJob.Namespace, jc.GetFrameworkName())
				return nil
			}
		}
		if failed > 0 {
			if spec.RestartPolicy == kubeflowv1.RestartPolicyExitCode {
				msg := fmt.Sprintf("MPIJob %s is restarting because %d %s replica(s) failed.", mpiJob.Name, failed, rtype)
				jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobRestartingReason), msg)
				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobRestarting, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobRestartingReason), msg)
				trainingoperatorcommon.RestartedJobsCounterInc(mpiJob.Namespace, jc.GetFrameworkName())
			} else {
				msg := fmt.Sprintf("MPIJob %s has failed because %d %s replica(s) failed.", mpiJob.Name, failed, rtype)
				jc.Recorder.Event(mpiJob, corev1.EventTypeNormal, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobFailedReason), msg)
				if jobStatus.CompletionTime == nil {
					now := metav1.Now()
					jobStatus.CompletionTime = &now
				}
				commonutil.UpdateJobConditions(jobStatus, kubeflowv1.JobFailed, corev1.ConditionTrue, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobFailedReason), msg)
				trainingoperatorcommon.FailedJobsCounterInc(mpiJob.Namespace, jc.GetFrameworkName())
			}
		}
	}
	mpiJob.Status = *jobStatus.DeepCopy()
	return nil
}

func (jc *MPIJobReconciler) UpdateJobStatusInApiServer(job interface{}, jobStatus *kubeflowv1.JobStatus) error {
	if jobStatus.ReplicaStatuses == nil {
		jobStatus.ReplicaStatuses = map[kubeflowv1.ReplicaType]*kubeflowv1.ReplicaStatus{}
	}

	mpiJob, ok := job.(*kubeflowv1.MPIJob)
	if !ok {
		return fmt.Errorf("%+v is not a type of MPIJob", job)
	}
	trainingoperatorcommon.ClearGeneratedFields(&mpiJob.ObjectMeta)

	startTime := time.Now()
	logger := commonutil.LoggerForJob(mpiJob)
	defer func() {
		logger.Infof("Finished updating MPIJob status %q (%v)",
			mpiJob.Name, time.Since(startTime))
	}()

	mpiJob = mpiJob.DeepCopy()
	mpiJob.Status = *jobStatus.DeepCopy()

	result := jc.Status().Update(context.Background(), mpiJob)

	if result != nil {
		jc.Log.WithValues("mpijob", types.NamespacedName{
			Namespace: mpiJob.GetNamespace(),
			Name:      mpiJob.GetName(),
		}).Error(result, "failed to update MPIJob status")
		return result
	}

	return nil
}

// getLauncherJob gets the launcher Pod controlled by this MPIJob.
func (jc *MPIJobReconciler) getLauncherJob(mpiJob *kubeflowv1.MPIJob) (*corev1.Pod, error) {
	launcher := &corev1.Pod{}
	NamespacedName := types.NamespacedName{Namespace: mpiJob.Namespace, Name: mpiJob.Name + launcherSuffix}
	err := jc.Get(context.Background(), NamespacedName, launcher)
	if errors.IsNotFound(err) {
		return nil, nil
	}
	if err != nil {
		// If an error occurs during Get, we'll requeue the item so we can
		// attempt processing again later. This could have been caused by a
		// temporary network failure, or any other transient reason.
		return nil, err
	}

	// If the launcher is not controlled by this MPIJob resource, we should log
	// a warning to the event recorder and return.
	if !metav1.IsControlledBy(launcher, mpiJob) {
		msg := fmt.Sprintf(MessageResourceExists, launcher.Name, launcher.Kind)
		jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, ErrResourceExists, msg)
		return launcher, fmt.Errorf(msg)
	}
	return launcher, nil
}

// getOrCreateConfigMap gets the ConfigMap controlled by this MPIJob, or creates
// one if it doesn't exist.
func (jc *MPIJobReconciler) getOrCreateConfigMap(mpiJob *kubeflowv1.MPIJob, workerReplicas int32, isGPULauncher bool) (*corev1.ConfigMap, error) {
	newCM := newConfigMap(mpiJob, workerReplicas, isGPULauncher)
	podList, err := jc.getRunningWorkerPods(mpiJob)
	if err != nil {
		return nil, err
	}
	updateDiscoverHostsInConfigMap(newCM, mpiJob, podList, isGPULauncher)

	cm := &corev1.ConfigMap{}
	NamespacedName := types.NamespacedName{Namespace: mpiJob.Namespace, Name: mpiJob.Name + configSuffix}
	err = jc.Get(context.Background(), NamespacedName, cm)

	// If the ConfigMap doesn't exist, we'll create it.
	if errors.IsNotFound(err) {
		cm, err = jc.KubeClientSet.CoreV1().ConfigMaps(mpiJob.Namespace).Create(context.Background(), newCM, metav1.CreateOptions{})
	}
	// If an error occurs during Get/Create, we'll requeue the item so we
	// can attempt processing again later. This could have been caused by a
	// temporary network failure, or any other transient reason.
	if err != nil {
		return nil, err
	}

	// If the ConfigMap is not controlled by this MPIJob resource, we
	// should log a warning to the event recorder and return.
	if !metav1.IsControlledBy(cm, mpiJob) {
		msg := fmt.Sprintf(MessageResourceExists, cm.Name, cm.Kind)
		jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, ErrResourceExists, msg)
		return nil, fmt.Errorf(msg)
	}

	// If the ConfigMap is changed, update it.
	if !reflect.DeepEqual(cm.Data, newCM.Data) {
		cm, err = jc.KubeClientSet.CoreV1().ConfigMaps(mpiJob.Namespace).Update(context.Background(), newCM, metav1.UpdateOptions{})
		if err != nil {
			return nil, err
		}
	}

	return cm, nil
}

// getOrCreateLauncherServiceAccount gets the launcher ServiceAccount controlled
// by this MPIJob, or creates one if it doesn't exist.
func (jc *MPIJobReconciler) getOrCreateLauncherServiceAccount(mpiJob *kubeflowv1.MPIJob) (*corev1.ServiceAccount, error) {
	sa := &corev1.ServiceAccount{}
	NamespacedName := types.NamespacedName{Namespace: mpiJob.Namespace, Name: mpiJob.Name + launcherSuffix}
	err := jc.Get(context.Background(), NamespacedName, sa)

	if err == nil {
		jc.Recorder.Eventf(mpiJob, corev1.EventTypeNormal, "ServiceAccountExists", "ServiceAccount: %v", sa.Name)
	}

	if errors.IsNotFound(err) {
		sa, err = jc.KubeClientSet.CoreV1().ServiceAccounts(mpiJob.Namespace).Create(context.Background(), newLauncherServiceAccount(mpiJob), metav1.CreateOptions{})
	}
	// If an error occurs during Get/Create, we'll requeue the item so we
	// can attempt processing again later. This could have been caused by a
	// temporary network failure, or any other transient reason.
	if err != nil {
		return nil, err
	}
	// If the launcher ServiceAccount is not controlled by this MPIJob resource, we
	// should log a warning to the event recorder and return.
	if !metav1.IsControlledBy(sa, mpiJob) {
		msg := fmt.Sprintf(MessageResourceExists, sa.Name, sa.Kind)
		jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, ErrResourceExists, msg)
		return nil, fmt.Errorf(msg)
	}

	return sa, nil
}

// getOrCreateLauncherRole gets the launcher Role controlled by this MPIJob, or
// creates one if it doesn't exist.
func (jc *MPIJobReconciler) getOrCreateLauncherRole(mpiJob *kubeflowv1.MPIJob, workerReplicas int32) (*rbacv1.Role, error) {
	role := &rbacv1.Role{}
	NamespacedName := types.NamespacedName{Namespace: mpiJob.Namespace, Name: mpiJob.Name + launcherSuffix}
	err := jc.Get(context.Background(), NamespacedName, role)

	if err == nil {
		jc.Recorder.Eventf(mpiJob, corev1.EventTypeNormal, "LauncherRoleExists", "LauncherRole: %v", role.Name)
	}

	launcherRole := newLauncherRole(mpiJob, workerReplicas)
	// If the Role doesn't exist, we'll create it.
	if errors.IsNotFound(err) {
		role, err = jc.KubeClientSet.RbacV1().Roles(mpiJob.Namespace).Create(context.Background(), launcherRole, metav1.CreateOptions{})
	}
	// If an error occurs during Get/Create, we'll requeue the item so we
	// can attempt processing again later. This could have been caused by a
	// temporary network failure, or any other transient reason.
	if err != nil {
		return nil, err
	}
	// If the launcher Role is not controlled by this MPIJob resource, we
	// should log a warning to the event recorder and return.
	if !metav1.IsControlledBy(role, mpiJob) {
		msg := fmt.Sprintf(MessageResourceExists, role.Name, role.Kind)
		jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, ErrResourceExists, msg)
		return nil, fmt.Errorf(msg)
	}

	if !reflect.DeepEqual(role.Rules, launcherRole.Rules) {
		role, err = jc.KubeClientSet.RbacV1().Roles(mpiJob.Namespace).Update(context.Background(), launcherRole, metav1.UpdateOptions{})
		if err != nil {
			return nil, err
		}
	}

	return role, nil
}

// getLauncherRoleBinding gets the launcher RoleBinding controlled by this
// MPIJob, or creates one if it doesn't exist.
func (jc *MPIJobReconciler) getLauncherRoleBinding(mpiJob *kubeflowv1.MPIJob) (*rbacv1.RoleBinding, error) {
	rb := &rbacv1.RoleBinding{}
	NamespacedName := types.NamespacedName{Namespace: mpiJob.Namespace, Name: mpiJob.Name + launcherSuffix}
	err := jc.Get(context.Background(), NamespacedName, rb)

	if err == nil {
		jc.Recorder.Eventf(mpiJob, corev1.EventTypeNormal, "RoleBindingExists", "RoleBinding: %v", rb.Name)
	}

	// If the RoleBinding doesn't exist, we'll create it.
	if errors.IsNotFound(err) {
		rb, err = jc.KubeClientSet.RbacV1().RoleBindings(mpiJob.Namespace).Create(context.Background(), newLauncherRoleBinding(mpiJob), metav1.CreateOptions{})
	}
	// If an error occurs during Get/Create, we'll requeue the item so we
	// can attempt processing again later. This could have been caused by a
	// temporary network failure, or any other transient reason.
	if err != nil {
		return nil, err
	}
	// If the launcher RoleBinding is not controlled by this MPIJob resource, we
	// should log a warning to the event recorder and return.
	if !metav1.IsControlledBy(rb, mpiJob) {
		msg := fmt.Sprintf(MessageResourceExists, rb.Name, rb.Kind)
		jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, ErrResourceExists, msg)
		return nil, fmt.Errorf(msg)
	}

	return rb, nil
}

// getOrCreateWorker gets the worker Pods controlled by this
// MPIJob, or creates them if they don't exist.
func (jc *MPIJobReconciler) getOrCreateWorker(mpiJob *kubeflowv1.MPIJob) ([]*corev1.Pod, error) {
	var (
		workerPrefix   string        = mpiJob.Name + workerSuffix
		workerPods     []*corev1.Pod = []*corev1.Pod{}
		i              int32         = 0
		workerReplicas *int32
	)
	if worker, ok := mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeWorker]; ok && worker != nil {
		workerReplicas = worker.Replicas
	} else {
		return workerPods, nil
	}

	// Remove Pods when replicas are scaled down.
	genericLabels := jc.GenLabels(mpiJob.GetName())
	selector, err := workerSelector(genericLabels)
	if err != nil {
		return nil, err
	}

	podlist := &corev1.PodList{}
	err = jc.List(context.Background(), podlist, client.MatchingLabelsSelector{Selector: selector}, client.InNamespace(mpiJob.GetNamespace()))

	if err != nil {
		return nil, err
	}
	if len(podlist.Items) > int(*workerReplicas) {
		for _, pod := range podlist.Items {
			indexStr, ok := pod.Labels[kubeflowv1.ReplicaIndexLabel]
			if !ok {
				return nil, fmt.Errorf("pod %s/%s is missing the label %s", pod.Namespace, pod.Name, kubeflowv1.ReplicaIndexLabel)
			}
			index, err := strconv.Atoi(indexStr)
			if err == nil {
				if index >= int(*workerReplicas) {
					err = jc.KubeClientSet.CoreV1().Pods(pod.Namespace).Delete(context.Background(), pod.Name, metav1.DeleteOptions{})
					if err != nil {
						return nil, err
					}
				}
			}
		}
	}

	for ; i < *workerReplicas; i++ {
		name := fmt.Sprintf("%s-%d", workerPrefix, i)

		pod := &corev1.Pod{}
		NamespacedName := types.NamespacedName{Namespace: mpiJob.Namespace, Name: name}
		err := jc.Get(context.Background(), NamespacedName, pod)

		// If the worker Pod doesn't exist, we'll create it.
		if errors.IsNotFound(err) {
			worker := jc.newWorker(mpiJob, name)
			if worker == nil {
				msg := fmt.Sprintf(MessageResourceDoesNotExist, "Worker")
				jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, ErrResourceDoesNotExist, msg)
				err = fmt.Errorf(msg)
				return nil, err
			}
			// Insert ReplicaIndexLabel
			worker.Labels[kubeflowv1.ReplicaIndexLabel] = strconv.Itoa(int(i))
			pod, err = jc.KubeClientSet.CoreV1().Pods(mpiJob.Namespace).Create(context.Background(), worker, metav1.CreateOptions{})
			if err == nil {
				jc.Recorder.Eventf(mpiJob, corev1.EventTypeNormal, "SuccessfulCreatePod", "Created worker pod: %v", pod.Name)
			} else {
				jc.Recorder.Eventf(mpiJob, corev1.EventTypeWarning, "FailedCreatePod", "Failed to create worker pod: %v", err)
			}
		}

		// If an error occurs during Get/Create, we'll requeue the item so we
		// can attempt processing again later. This could have been caused by a
		// temporary network failure, or any other transient reason.
		if err != nil && !errors.IsNotFound(err) {
			jc.Recorder.Eventf(mpiJob, corev1.EventTypeWarning, commonutil.NewReason(kubeflowv1.MPIJobKind, commonutil.JobFailedReason),
				"worker pod creation failed: %v", err)
			return nil, err
		}
		// If the worker is not controlled by this MPIJob resource, we should log
		// a warning to the event recorder and return.
		if pod != nil && !metav1.IsControlledBy(pod, mpiJob) {
			msg := fmt.Sprintf(MessageResourceExists, pod.Name, pod.Kind)
			jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, ErrResourceExists, msg)
			return nil, fmt.Errorf(msg)
		}
		workerPods = append(workerPods, pod)
	}

	return workerPods, nil
}

// newWorker creates a new worker Pod for an MPIJob resource. It also
// sets the appropriate OwnerReferences on the resource so handleObject can
// discover the MPIJob resource that 'owns' it.
func (jc *MPIJobReconciler) newWorker(mpiJob *kubeflowv1.MPIJob, name string) *corev1.Pod {
	genericLabels := jc.GenLabels(mpiJob.GetName())
	labels := defaultWorkerLabels(genericLabels)

	podSpec := mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeWorker].Template.DeepCopy()

	// keep the labels which are set in the PodTemplate
	if len(podSpec.Labels) == 0 {
		podSpec.Labels = make(map[string]string)
	}

	for key, value := range labels {
		podSpec.Labels[key] = value
	}
	setRestartPolicy(podSpec, mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeWorker])
	logger := commonutil.LoggerForReplica(mpiJob, strings.ToLower(string(kubeflowv1.MPIJobReplicaTypeWorker)))
	if len(podSpec.Spec.Containers) == 0 {
		klog.Errorln("Worker pod does not have any containers in its spec")
		return nil
	}
	container := podSpec.Spec.Containers[0]
	if len(container.Command) == 0 {
		container.Command = []string{"sleep"}
		container.Args = []string{"365d"}
	}

	// We need the kubexec.sh script here because Open MPI checks for the path
	// in every rank.
	container.VolumeMounts = append(container.VolumeMounts, corev1.VolumeMount{
		Name:      configVolumeName,
		MountPath: configMountPath,
	})
	podSpec.Spec.Containers[0] = container

	scriptMode := int32(0555)
	podSpec.Spec.Volumes = append(podSpec.Spec.Volumes, corev1.Volume{
		Name: configVolumeName,
		VolumeSource: corev1.VolumeSource{
			ConfigMap: &corev1.ConfigMapVolumeSource{
				LocalObjectReference: corev1.LocalObjectReference{
					Name: mpiJob.Name + configSuffix,
				},
				Items: []corev1.KeyToPath{
					{
						Key:  kubexecScriptName,
						Path: kubexecScriptName,
						Mode: &scriptMode,
					},
				},
			},
		},
	})

	// if gang-scheduling is enabled:
	// 1. if the user has specified another scheduler, we report a warning without overriding any fields.
	// 2. if no SchedulerName is set for the pods, then we set the SchedulerName to "volcano".
	if jc.Config.EnableGangScheduling() {
		if !util.IsGangSchedulerSet(mpiJob.Spec.MPIReplicaSpecs, jc.PodGroupControl.GetSchedulerName()) {
			errMsg := "Another scheduler is specified when gang-scheduling is enabled and it will not be overwritten"
			logger.Warning(errMsg)
			jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, podTemplateSchedulerNameReason, errMsg)
		}

		rtWorker := strings.ToLower(string(kubeflowv1.MPIJobReplicaTypeWorker))
		jc.PodGroupControl.DecoratePodTemplateSpec(podSpec, mpiJob, rtWorker)
	}

	return &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:        name,
			Namespace:   mpiJob.Namespace,
			Labels:      podSpec.Labels,
			Annotations: podSpec.Annotations,
			OwnerReferences: []metav1.OwnerReference{
				*metav1.NewControllerRef(mpiJob, kubeflowv1.MPIJobSchemeGroupVersionKind),
			},
		},
		Spec: podSpec.Spec,
	}
}

// newLauncher creates a new launcher Pod for an MPIJob resource. It also sets
// the appropriate OwnerReferences on the resource so handleObject can discover
// the MPIJob resource that 'owns' it.
func (jc *MPIJobReconciler) newLauncher(mpiJob *kubeflowv1.MPIJob, kubectlDeliveryImage string, isGPULauncher bool) *corev1.Pod {
	launcherName := mpiJob.Name + launcherSuffix

	genericLabels := jc.GenLabels(mpiJob.GetName())
	labels := defaultLauncherLabels(genericLabels)

	masterRole := jc.IsMasterRole(mpiJob.Spec.MPIReplicaSpecs, kubeflowv1.MPIJobReplicaTypeLauncher, 0)
	if masterRole {
		labels[kubeflowv1.JobRoleLabel] = "master"
	}
	podSpec := mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeLauncher].Template.DeepCopy()
	// copy the labels from the PodTemplate to the pod
	if len(podSpec.Labels) == 0 {
		podSpec.Labels = make(map[string]string)
	}
	for key, value := range labels {
		podSpec.Labels[key] = value
	}

	logger := commonutil.LoggerForReplica(mpiJob, strings.ToLower(string(kubeflowv1.MPIJobReplicaTypeLauncher)))
	// add SchedulerName to podSpec
	if jc.Config.EnableGangScheduling() {
		if !util.IsGangSchedulerSet(mpiJob.Spec.MPIReplicaSpecs, jc.PodGroupControl.GetSchedulerName()) {
			errMsg := "Another scheduler is specified when gang-scheduling is enabled and it will not be overwritten"
			logger.Warning(errMsg)
			jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, podTemplateSchedulerNameReason, errMsg)
		}

		rt := strings.ToLower(string(kubeflowv1.MPIJobReplicaTypeLauncher))
		jc.PodGroupControl.DecoratePodTemplateSpec(podSpec, mpiJob, rt)
	}

	podSpec.Spec.ServiceAccountName = launcherName
	podSpec.Spec.InitContainers = append(podSpec.Spec.InitContainers, corev1.Container{
		Name:            kubectlDeliveryName,
		Image:           kubectlDeliveryImage,
		ImagePullPolicy: corev1.PullIfNotPresent,
		Env: []corev1.EnvVar{
			{
				Name:  kubectlTargetDirEnv,
				Value: kubectlMountPath,
			},
			{
				Name:  "NAMESPACE",
				Value: mpiJob.Namespace,
			},
		},
		VolumeMounts: []corev1.VolumeMount{
			{
				Name:      kubectlVolumeName,
				MountPath: kubectlMountPath,
			},
			{
				Name:      configVolumeName,
				MountPath: configMountPath,
			},
		},
		Resources: corev1.ResourceRequirements{
			Limits: corev1.ResourceList{
				corev1.ResourceCPU:              resource.MustParse(initContainerCpu),
				corev1.ResourceMemory:           resource.MustParse(initContainerMem),
				corev1.ResourceEphemeralStorage: resource.MustParse(initContainerEphStorage),
			},
			Requests: corev1.ResourceList{
				corev1.ResourceCPU:              resource.MustParse(initContainerCpu),
				corev1.ResourceMemory:           resource.MustParse(initContainerMem),
				corev1.ResourceEphemeralStorage: resource.MustParse(initContainerEphStorage),
			},
		},
	})
	if len(podSpec.Spec.Containers) == 0 {
		klog.Errorln("Launcher pod does not have any containers in its spec")
		msg := fmt.Sprintf(MessageResourceDoesNotExist, "Launcher")
		jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, ErrResourceDoesNotExist, msg)
		return nil
	}
	container := podSpec.Spec.Containers[0]
	container.Env = append(container.Env,
		corev1.EnvVar{
			Name:  "OMPI_MCA_plm_rsh_agent",
			Value: fmt.Sprintf("%s/%s", configMountPath, kubexecScriptName),
		},
		corev1.EnvVar{
			Name:  "OMPI_MCA_orte_default_hostfile",
			Value: fmt.Sprintf("%s/%s", configMountPath, hostfileName),
		},
	)

	if !isGPULauncher {
		container.Env = append(container.Env,
			// We overwrite these environment variables so that users will not
			// be mistakenly using GPU resources for the launcher due to potential
			// issues with scheduler/container technologies.
			corev1.EnvVar{
				Name:  "NVIDIA_VISIBLE_DEVICES",
				Value: "",
			},
			corev1.EnvVar{
				Name:  "NVIDIA_DRIVER_CAPABILITIES",
				Value: "",
			})
	}

	// Add default Intel MPI bootstrap variables if not provided by the user.
	bootstrap, exec := hasIntelMPIBootstrapValues(container.Env)
	if !bootstrap {
		container.Env = append(container.Env,
			corev1.EnvVar{
				Name:  "I_MPI_HYDRA_BOOTSTRAP",
				Value: iMPIDefaultBootstrap,
			},
		)
	}
	if !exec {
		container.Env = append(container.Env,
			corev1.EnvVar{
				Name:  "I_MPI_HYDRA_BOOTSTRAP_EXEC",
				Value: fmt.Sprintf("%s/%s", configMountPath, kubexecScriptName),
			},
		)
	}

	container.VolumeMounts = append(container.VolumeMounts,
		corev1.VolumeMount{
			Name:      kubectlVolumeName,
			MountPath: kubectlMountPath,
		},
		corev1.VolumeMount{
			Name:      configVolumeName,
			MountPath: configMountPath,
		})
	podSpec.Spec.Containers[0] = container

	// Submit a warning event if the user specifies a restart policy in the pod
	// template. We recommend setting it at the replica level instead.
	if podSpec.Spec.RestartPolicy != corev1.RestartPolicy("") {
		errMsg := "Restart policy in pod template will be overwritten by restart policy in replica spec"
		klog.Warning(errMsg)
		jc.Recorder.Event(mpiJob, corev1.EventTypeWarning, podTemplateRestartPolicyReason, errMsg)
	}
	setRestartPolicy(podSpec, mpiJob.Spec.MPIReplicaSpecs[kubeflowv1.MPIJobReplicaTypeLauncher])

	scriptsMode := int32(0555)
	hostfileMode := int32(0444)
	podSpec.Spec.Volumes = append(podSpec.Spec.Volumes,
		corev1.Volume{
			Name: kubectlVolumeName,
			VolumeSource: corev1.VolumeSource{
				EmptyDir: &corev1.EmptyDirVolumeSource{},
			},
		},
		corev1.Volume{
			Name: configVolumeName,
			VolumeSource: corev1.VolumeSource{
				ConfigMap: &corev1.ConfigMapVolumeSource{
					LocalObjectReference: corev1.LocalObjectReference{
						Name: mpiJob.Name + configSuffix,
					},
					Items: []corev1.KeyToPath{
						{
							Key:  kubexecScriptName,
							Path: kubexecScriptName,
							Mode: &scriptsMode,
						},
						{
							Key:  hostfileName,
							Path: hostfileName,
							Mode: &hostfileMode,
						},
						{
							Key:  discoverHostsScriptName,
							Path: discoverHostsScriptName,
							Mode: &scriptsMode,
						},
					},
				},
			},
		})
	return &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:        launcherName,
			Namespace:   mpiJob.Namespace,
			Labels:      podSpec.Labels,
			Annotations: podSpec.Annotations,
			OwnerReferences: []metav1.OwnerReference{
				*metav1.NewControllerRef(mpiJob, kubeflowv1.MPIJobSchemeGroupVersionKind),
			},
		},
		Spec: podSpec.Spec,
	}
}

// getRunningWorkerPods gets all worker Pods with Running phase controlled by this MPIJob.
func (jc *MPIJobReconciler) getRunningWorkerPods(mpiJob *kubeflowv1.MPIJob) ([]*corev1.Pod, error) {
	genericLabels := jc.GenLabels(mpiJob.GetName())
	selector, err := workerSelector(genericLabels)
	if err != nil {
		return nil, err
	}

	podFullList := &corev1.PodList{}
	err = jc.List(context.Background(), podFullList, client.MatchingLabelsSelector{Selector: selector}, client.InNamespace(mpiJob.GetNamespace()))
	//podFullList, err := r.PodLister.List(selector)
	if err != nil {
		return nil, err
	}
	// Only running Pods should be included within the `discover_hosts.sh` script.
	var podList []corev1.Pod
	for idx, pod := range podFullList.Items {
		if pod.Status.Phase == corev1.PodRunning {
			podList = append(podList, podFullList.Items[idx])
		}
	}
	return util.JobControlledPodList(podList, mpiJob), nil
}

// newConfigMap creates a new ConfigMap containing configurations for an MPIJob
// resource. It also sets the appropriate OwnerReferences on the resource so
// handleObject can discover the MPIJob resource that 'owns' it.
func newConfigMap(mpiJob *kubeflowv1.MPIJob, workerReplicas int32, isGPULauncher bool) *corev1.ConfigMap {
	kubexec := fmt.Sprintf(`#!/bin/sh
set -x
POD_NAME=$1
shift
%s/kubectl exec ${POD_NAME}`, kubectlMountPath)
	if len(mpiJob.Spec.MainContainer) > 0 {
		kubexec = fmt.Sprintf("%s --container %s", kubexec, mpiJob.Spec.MainContainer)
	}
	kubexec = fmt.Sprintf("%s -- /bin/sh -c \"$*\"", kubexec)

	// If no processing unit is specified, default to 1 slot.
	slots := 1
	if mpiJob.Spec.SlotsPerWorker != nil {
		slots = int(*mpiJob.Spec.SlotsPerWorker)
	}
	var buffer bytes.Buffer
	if isGPULauncher {
		buffer.WriteString(fmt.Sprintf("%s%s slots=%d\n", mpiJob.Name, launcherSuffix, slots))
	}
	for i := 0; i < int(workerReplicas); i++ {
		buffer.WriteString(fmt.Sprintf("%s%s-%d slots=%d\n", mpiJob.Name, workerSuffix, i, slots))
	}

	return &corev1.ConfigMap{
		ObjectMeta: metav1.ObjectMeta{
			Name:      mpiJob.Name + configSuffix,
			Namespace: mpiJob.Namespace,
			Labels: map[string]string{
				"app": mpiJob.Name,
			},
			OwnerReferences: []metav1.OwnerReference{
				*metav1.NewControllerRef(mpiJob, kubeflowv1.MPIJobSchemeGroupVersionKind),
			},
		},
		Data: map[string]string{
			hostfileName:      buffer.String(),
			kubexecScriptName: kubexec,
		},
	}
}
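
// For illustration only: for a hypothetical MPIJob named "pi" with two workers,
// slotsPerWorker=1, and a non-GPU launcher, and assuming workerSuffix resolves to
// "-worker" and kubectlMountPath to a directory such as /opt/kube (both constants are
// defined elsewhere in this package), the ConfigMap built above would contain roughly:
//
//	hostfile:
//	  pi-worker-0 slots=1
//	  pi-worker-1 slots=1
//
//	kubexec.sh:
//	  #!/bin/sh
//	  set -x
//	  POD_NAME=$1
//	  shift
//	  /opt/kube/kubectl exec ${POD_NAME} -- /bin/sh -c "$*"
//
// updateDiscoverHostsInConfigMap below then adds a discover_hosts.sh entry that echoes
// one line per running worker pod, e.g. "echo pi-worker-0:1".
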
// updateDiscoverHostsInConfigMap updates the ConfigMap if the content of `discover_hosts.sh` changes.
func updateDiscoverHostsInConfigMap(configMap *corev1.ConfigMap, mpiJob *kubeflowv1.MPIJob, runningPods []*corev1.Pod, isGPULauncher bool) {
	slots := 1
	if mpiJob.Spec.SlotsPerWorker != nil {
		slots = int(*mpiJob.Spec.SlotsPerWorker)
	}

	// Sort the slice of Pods to make sure the order of entries in `discover_hosts.sh` is maintained.
	sort.Slice(runningPods, func(i, j int) bool {
		return runningPods[i].Name < runningPods[j].Name
	})

	discoverHosts := "#!/bin/sh"
	if isGPULauncher {
		discoverHosts = fmt.Sprintf("%s\necho %s%s:%d\n", discoverHosts, mpiJob.Name, launcherSuffix, slots)
	}
	for _, p := range runningPods {
		discoverHosts = fmt.Sprintf("%s\necho %s:%d", discoverHosts, p.Name, slots)
	}

	oldDiscoverHosts, exist := configMap.Data[discoverHostsScriptName]
	if exist {
		if oldDiscoverHosts == discoverHosts {
			return
		}
	}
	configMap.Data[discoverHostsScriptName] = discoverHosts
}

// newLauncherServiceAccount creates a new launcher ServiceAccount for an MPIJob
// resource. It also sets the appropriate OwnerReferences on the resource so
// handleObject can discover the MPIJob resource that 'owns' it.
func newLauncherServiceAccount(mpiJob *kubeflowv1.MPIJob) *corev1.ServiceAccount {
	return &corev1.ServiceAccount{
		ObjectMeta: metav1.ObjectMeta{
			Name:      mpiJob.Name + launcherSuffix,
			Namespace: mpiJob.Namespace,
			Labels: map[string]string{
				"app": mpiJob.Name,
			},
			OwnerReferences: []metav1.OwnerReference{
				*metav1.NewControllerRef(mpiJob, kubeflowv1.MPIJobSchemeGroupVersionKind),
			},
		},
	}
}

// newLauncherRole creates a new launcher Role for an MPIJob resource. It also
// sets the appropriate OwnerReferences on the resource so handleObject can
// discover the MPIJob resource that 'owns' it.
func newLauncherRole(mpiJob *kubeflowv1.MPIJob, workerReplicas int32) *rbacv1.Role {
	var podNames []string
	for i := 0; i < int(workerReplicas); i++ {
		podNames = append(podNames, fmt.Sprintf("%s%s-%d", mpiJob.Name, workerSuffix, i))
	}
	return &rbacv1.Role{
		ObjectMeta: metav1.ObjectMeta{
			Name:      mpiJob.Name + launcherSuffix,
			Namespace: mpiJob.Namespace,
			Labels: map[string]string{
				"app": mpiJob.Name,
			},
			OwnerReferences: []metav1.OwnerReference{
				*metav1.NewControllerRef(mpiJob, kubeflowv1.MPIJobSchemeGroupVersionKind),
			},
		},
		Rules: []rbacv1.PolicyRule{
			{
				Verbs:     []string{"get", "list", "watch"},
				APIGroups: []string{""},
				Resources: []string{"pods"},
			},
			{
				Verbs:         []string{"create"},
				APIGroups:     []string{""},
				Resources:     []string{"pods/exec"},
				ResourceNames: podNames,
			},
		},
	}
}

// newLauncherRoleBinding creates a new launcher RoleBinding for an MPIJob
// resource. It also sets the appropriate OwnerReferences on the resource so
// handleObject can discover the MPIJob resource that 'owns' it.
func newLauncherRoleBinding(mpiJob *kubeflowv1.MPIJob) *rbacv1.RoleBinding {
	launcherName := mpiJob.Name + launcherSuffix
	return &rbacv1.RoleBinding{
		ObjectMeta: metav1.ObjectMeta{
			Name:      launcherName,
			Namespace: mpiJob.Namespace,
			Labels: map[string]string{
				"app": mpiJob.Name,
			},
			OwnerReferences: []metav1.OwnerReference{
				*metav1.NewControllerRef(mpiJob, kubeflowv1.MPIJobSchemeGroupVersionKind),
			},
		},
		Subjects: []rbacv1.Subject{
			{
				Kind:      rbacv1.ServiceAccountKind,
				Name:      launcherName,
				Namespace: mpiJob.Namespace,
			},
		},
		RoleRef: rbacv1.RoleRef{
			APIGroup: rbacv1.GroupName,
			Kind:     "Role",
			Name:     launcherName,
		},
	}
}

func setRestartPolicy(podTemplateSpec *corev1.PodTemplateSpec, spec *kubeflowv1.ReplicaSpec) {
	if spec.RestartPolicy == kubeflowv1.RestartPolicyExitCode {
		podTemplateSpec.Spec.RestartPolicy = corev1.RestartPolicyNever
	} else {
		podTemplateSpec.Spec.RestartPolicy = corev1.RestartPolicy(spec.RestartPolicy)
	}
}