github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/common/pod.go

// Copyright 2019 The Kubeflow Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package common

import (
	"fmt"
	"reflect"
	"strconv"
	"strings"

	apiv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
	trainingoperatorcommon "github.com/kubeflow/training-operator/pkg/common"
	"github.com/kubeflow/training-operator/pkg/controller.v1/control"
	"github.com/kubeflow/training-operator/pkg/controller.v1/expectation"
	"github.com/kubeflow/training-operator/pkg/core"
	commonutil "github.com/kubeflow/training-operator/pkg/util"
	utillabels "github.com/kubeflow/training-operator/pkg/util/labels"
	trainutil "github.com/kubeflow/training-operator/pkg/util/train"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	log "github.com/sirupsen/logrus"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/runtime"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/client-go/tools/cache"
)

const (
	// podTemplateRestartPolicyReason is the warning reason when the restart
	// policy is set in the pod template.
	podTemplateRestartPolicyReason = "SettedPodTemplateRestartPolicy"
	// exitedWithCodeReason is the normal reason when the pod exits with an exit code.
	exitedWithCodeReason = "ExitedWithCode"
	// podTemplateSchedulerNameReason is the warning reason when another scheduler name
	// is set in the pod template while gang-scheduling is enabled.
	podTemplateSchedulerNameReason = "SettedPodTemplateSchedulerName"
)

var (
	// Prometheus metrics
	createdPodsCount = promauto.NewCounter(prometheus.CounterOpts{
		Name: "created_pods_total",
		Help: "The total number of created pods",
	})
	deletedPodsCount = promauto.NewCounter(prometheus.CounterOpts{
		Name: "deleted_pods_total",
		Help: "The total number of deleted pods",
	})
	failedPodsCount = promauto.NewCounter(prometheus.CounterOpts{
		Name: "failed_pods_total",
		Help: "The total number of failed pods",
	})
)

// AddPod enqueues the job that manages the newly created pod and updates the
// job's expectations.
func (jc *JobController) AddPod(obj interface{}) {
	pod := obj.(*v1.Pod)
	if pod.DeletionTimestamp != nil {
		// On a restart of the controller manager, it's possible a new pod shows up
		// in a state that is already pending deletion. Prevent the pod from being
		// counted as a creation observation.
		// jc.deletePod(pod)
		return
	}

	// If it has a ControllerRef, that's all that matters.
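	// Orphan pods (with no ControllerRef) are ignored here; adoption and
	// orphaning are reconciled later by GetPodsForJob via ClaimPods.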
	if controllerRef := metav1.GetControllerOf(pod); controllerRef != nil {
		job := jc.resolveControllerRef(pod.Namespace, controllerRef)

		logger := commonutil.LoggerForPod(pod, jc.Controller.GetAPIGroupVersionKind().Kind)

		if job == nil {
			if utillabels.HasKnownLabels(pod.Labels, jc.Controller.GetGroupNameLabelValue()) {
				logger.Info("This pod's job does not exist")
			}
			return
		}

		jobKey, err := KeyFunc(job)
		if err != nil {
			logger.Infof("Failed to get the job key: %v", err)
			return
		}

		rType, err := utillabels.ReplicaType(pod.Labels)
		if err != nil {
			logger.Infof("This pod may not have been created by %v", jc.Controller.ControllerName())
			return
		}

		expectationPodsKey := expectation.GenExpectationPodsKey(jobKey, string(rType))

		jc.Expectations.CreationObserved(expectationPodsKey)
		// TODO: we may need to add backoff here
		jc.WorkQueue.Add(jobKey)

		return
	}
}

// UpdatePod figures out what job is managing the updated pod and wakes it up.
// If the labels of the pod have changed, we need to awaken both the old
// and the new job. old and cur must be *v1.Pod types.
func (jc *JobController) UpdatePod(old, cur interface{}) {
	curPod := cur.(*v1.Pod)
	oldPod := old.(*v1.Pod)
	if curPod.ResourceVersion == oldPod.ResourceVersion {
		// Periodic resync will send update events for all known pods.
		// Two different versions of the same pod will always have different RVs.
		return
	}

	logger := commonutil.LoggerForPod(curPod, jc.Controller.GetAPIGroupVersionKind().Kind)
	curControllerRef := metav1.GetControllerOf(curPod)
	oldControllerRef := metav1.GetControllerOf(oldPod)
	controllerRefChanged := !reflect.DeepEqual(curControllerRef, oldControllerRef)
	if controllerRefChanged && oldControllerRef != nil {
		// The ControllerRef was changed. Sync the old controller, if any.
		if job := jc.resolveControllerRef(oldPod.Namespace, oldControllerRef); job != nil {
			logger.Infof("pod ControllerRef updated: %v, %v", curPod, oldPod)
			jobKey, err := KeyFunc(job)
			if err != nil {
				return
			}
			// TODO: we may need to add backoff here
			jc.WorkQueue.Add(jobKey)
		}
	}

	// If it has a ControllerRef, that's all that matters.
	if curControllerRef != nil {
		job := jc.resolveControllerRef(curPod.Namespace, curControllerRef)
		if job == nil {
			return
		}
		logger.Debugf("pod has a ControllerRef: %v, %v", curPod, oldPod)
		jobKey, err := KeyFunc(job)
		if err != nil {
			return
		}
		// TODO: we may need to add backoff here
		jc.WorkQueue.Add(jobKey)
		return
	}
}

// DeletePod enqueues the job that manages the deleted pod and updates the
// job's expectations. obj could be a *v1.Pod, or a DeletedFinalStateUnknown
// marker item.
func (jc *JobController) DeletePod(obj interface{}) {
	pod, ok := obj.(*v1.Pod)

	// When a delete is dropped, the relist will notice a pod in the store not
	// in the list, leading to the insertion of a tombstone object which contains
	// the deleted key/value. Note that this value might be stale. If the pod
	// changed labels, the new job will not be woken up until the periodic resync.
	if !ok {
		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %+v", obj))
			return
		}
		pod, ok = tombstone.Obj.(*v1.Pod)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a pod %+v", obj))
			return
		}
	}

	// Create the logger only after the pod has been resolved from a possible
	// tombstone; constructing it from a nil pod would panic.
	logger := commonutil.LoggerForPod(pod, jc.Controller.GetAPIGroupVersionKind().Kind)

	controllerRef := metav1.GetControllerOf(pod)
	if controllerRef == nil {
		// No controller should care about orphans being deleted.
		return
	}
	job := jc.resolveControllerRef(pod.Namespace, controllerRef)
	if job == nil {
		return
	}
	jobKey, err := KeyFunc(job)
	if err != nil {
		return
	}

	rType, err := utillabels.ReplicaType(pod.Labels)
	if err != nil {
		logger.Infof("This pod may not have been created by %v", jc.Controller.ControllerName())
		return
	}

	expectationPodsKey := expectation.GenExpectationPodsKey(jobKey, string(rType))

	jc.Expectations.DeletionObserved(expectationPodsKey)
	deletedPodsCount.Inc()
	// TODO: we may need to add backoff here
	jc.WorkQueue.Add(jobKey)
}

// GetPodsForJob returns the set of pods that this job should manage.
// It also reconciles ControllerRef by adopting/orphaning.
// Note that the returned Pods are pointers into the cache.
func (jc *JobController) GetPodsForJob(jobObject interface{}) ([]*v1.Pod, error) {
	job, ok := jobObject.(metav1.Object)
	if !ok {
		return nil, fmt.Errorf("job is not of type metav1.Object")
	}

	// Create selector.
	selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{
		MatchLabels: jc.GenLabels(job.GetName()),
	})
	if err != nil {
		return nil, fmt.Errorf("couldn't convert Job selector: %v", err)
	}
	// List all pods to include those that don't match the selector anymore
	// but have a ControllerRef pointing to this controller.
	pods, err := jc.PodLister.Pods(job.GetNamespace()).List(labels.Everything())
	if err != nil {
		return nil, err
	}

	// If any adoptions are attempted, we should first recheck for deletion
	// with an uncached quorum read sometime after listing Pods (see #42639).
	canAdoptFunc := RecheckDeletionTimestamp(func() (metav1.Object, error) {
		fresh, err := jc.Controller.GetJobFromAPIClient(job.GetNamespace(), job.GetName())
		if err != nil {
			return nil, err
		}
		if fresh.GetUID() != job.GetUID() {
			return nil, fmt.Errorf("original Job %v/%v is gone: got uid %v, wanted %v", job.GetNamespace(), job.GetName(), fresh.GetUID(), job.GetUID())
		}
		return fresh, nil
	})
	cm := control.NewPodControllerRefManager(jc.PodControl, job, selector, jc.Controller.GetAPIGroupVersionKind(), canAdoptFunc)
	return cm.ClaimPods(pods)
}

// FilterPodsForReplicaType returns the pods that belong to the given replicaType.
func (jc *JobController) FilterPodsForReplicaType(pods []*v1.Pod, replicaType string) ([]*v1.Pod, error) {
	return core.FilterPodsForReplicaType(pods, replicaType)
}

// GetPodSlices returns a slice whose i-th element holds the pods with
// replica-index i. It gives the caller enough information to decide whether
// to scale resources up or down.
func (jc *JobController) GetPodSlices(pods []*v1.Pod, replicas int, logger *log.Entry) [][]*v1.Pod {
	return core.GetPodSlices(pods, replicas, logger)
}

// ReconcilePods checks and updates pods for each given ReplicaSpec.
// It will requeue the job in case of an error while creating/deleting pods.
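//
// A minimal usage sketch (illustrative only; tfJob, jobStatus, and pods are
// hypothetical variables, not part of this package): a framework controller
// typically calls ReconcilePods once per replica type during its sync loop.
//
//	for rType, spec := range tfJob.Spec.TFReplicaSpecs {
//		if err := jc.ReconcilePods(tfJob, &jobStatus, pods, rType, spec, tfJob.Spec.TFReplicaSpecs); err != nil {
//			return err
//		}
//	}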
func (jc *JobController) ReconcilePods(
	job interface{},
	jobStatus *apiv1.JobStatus,
	pods []*v1.Pod,
	rType apiv1.ReplicaType,
	spec *apiv1.ReplicaSpec,
	replicas map[apiv1.ReplicaType]*apiv1.ReplicaSpec) error {

	// Convert ReplicaType to a lowercase string.
	rt := strings.ToLower(string(rType))
	metaObject, ok := job.(metav1.Object)
	if !ok {
		return fmt.Errorf("job is not a metav1.Object type")
	}
	runtimeObject, ok := job.(runtime.Object)
	if !ok {
		return fmt.Errorf("job is not a runtime.Object type")
	}
	jobKey, err := KeyFunc(metaObject)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for job object %#v: %v", job, err))
		return err
	}
	jobKind := jc.Controller.GetAPIGroupVersionKind().Kind
	expectationPodsKey := expectation.GenExpectationPodsKey(jobKey, rt)

	logger := commonutil.LoggerForReplica(metaObject, rt)
	// Get all pods for the type rt.
	pods, err = jc.FilterPodsForReplicaType(pods, rt)
	if err != nil {
		return err
	}
	numReplicas := int(*spec.Replicas)
	var masterRole bool

	initializeReplicaStatuses(jobStatus, rType)

	// GetPodSlices returns enough information here to decide whether to
	// add, remove, or update resources.
	//
	// For example, assume we have pods with replica-index 0, 1, and 2.
	// If replicas is 4, it returns a slice of size 4 ([[0],[1],[2],[]]),
	// and a pod with replica-index 3 will be created.
	//
	// If replicas is 1, it returns a slice of size 3 ([[0],[1],[2]]);
	// the pods with replica-index 1 and 2 are out of range and will be deleted.
	podSlices := jc.GetPodSlices(pods, numReplicas, logger)
	for index, podSlice := range podSlices {
		if len(podSlice) > 1 {
			logger.Warningf("We have too many pods for %s %d", rt, index)
		} else if len(podSlice) == 0 {
			logger.Infof("Need to create new pod: %s-%d", rt, index)

			// Check if this replica is the master role.
			masterRole = jc.Controller.IsMasterRole(replicas, rType, index)
			err = jc.createNewPod(job, rt, index, spec, masterRole, replicas)
			if err != nil {
				return err
			}
		} else {
			// Check the status of the current pod.
			pod := podSlice[0]

			// Check if the index is in the valid range; if not, we should kill the pod.
			if index < 0 || index >= numReplicas {
				err = jc.PodControl.DeletePod(pod.Namespace, pod.Name, runtimeObject)
				if err != nil {
					return err
				}
				// Deletion is expected.
				jc.Expectations.RaiseExpectations(expectationPodsKey, 0, 1)
			}

			// Get the exit code of the default container.
			var exitCode int32 = 0xbeef // magic number: no terminated state observed yet
			for _, status := range pod.Status.ContainerStatuses {
				state := status.State
				if status.Name == jc.Controller.GetDefaultContainerName() && state.Terminated != nil {
					exitCode = state.Terminated.ExitCode
					logger.Infof("Pod: %v.%v exited with code %v", pod.Namespace, pod.Name, exitCode)
					jc.Recorder.Eventf(runtimeObject, v1.EventTypeNormal, exitedWithCodeReason, "Pod: %v.%v exited with code %v", pod.Namespace, pod.Name, exitCode)
				}
			}
			// Check if the pod is retryable.
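			// A failed pod is restarted when the replica's RestartPolicy is
			// OnFailure or Always, or when it is ExitCode and the container's
			// exit code is judged retryable by trainutil.IsRetryableExitCode.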
			if pod.Status.Phase == v1.PodFailed &&
				(spec.RestartPolicy == apiv1.RestartPolicyExitCode && trainutil.IsRetryableExitCode(exitCode) ||
					spec.RestartPolicy == apiv1.RestartPolicyOnFailure ||
					spec.RestartPolicy == apiv1.RestartPolicyAlways) {
				failedPodsCount.Inc()
				logger.Infof("Need to restart the pod: %v.%v", pod.Namespace, pod.Name)
				if err := jc.PodControl.DeletePod(pod.Namespace, pod.Name, runtimeObject); err != nil {
					return err
				}
				// Deletion is expected.
				jc.Expectations.RaiseExpectations(expectationPodsKey, 0, 1)

				msg := fmt.Sprintf("job %s is restarting because %s replica(s) failed.",
					metaObject.GetName(), rType)
				jc.Recorder.Event(runtimeObject, v1.EventTypeWarning, commonutil.NewReason(jobKind, commonutil.JobRestartingReason), msg)
				commonutil.UpdateJobConditions(jobStatus, apiv1.JobRestarting, v1.ConditionTrue, commonutil.NewReason(jobKind, commonutil.JobRestartingReason), msg)
				trainingoperatorcommon.RestartedJobsCounterInc(metaObject.GetNamespace(), jc.Controller.GetFrameworkName())
			}

			updateJobReplicaStatuses(jobStatus, rType, pod)
		}
	}
	return nil
}

// createNewPod creates a new pod for the given index and type.
func (jc *JobController) createNewPod(job interface{}, rt string, index int, spec *apiv1.ReplicaSpec, masterRole bool,
	replicas map[apiv1.ReplicaType]*apiv1.ReplicaSpec) error {

	metaObject, ok := job.(metav1.Object)
	if !ok {
		return fmt.Errorf("job is not a metav1.Object type")
	}
	runtimeObject, ok := job.(runtime.Object)
	if !ok {
		return fmt.Errorf("job is not a runtime.Object type")
	}
	jobKey, err := KeyFunc(metaObject)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for job object %#v: %v", job, err))
		return err
	}
	logger := commonutil.LoggerForReplica(metaObject, rt)

	// Set type and index labels for the worker.
	labels := jc.GenLabels(metaObject.GetName())
	utillabels.SetReplicaType(labels, rt)
	utillabels.SetReplicaIndex(labels, index)

	if masterRole {
		utillabels.SetJobRole(labels, "master")
	}

	podTemplate := spec.Template.DeepCopy()

	idxStr := strconv.Itoa(index)
	// Set the name for the template.
	podTemplate.Name = GenGeneralName(metaObject.GetName(), rt, idxStr)

	if podTemplate.Labels == nil {
		podTemplate.Labels = make(map[string]string)
	}

	for key, value := range labels {
		podTemplate.Labels[key] = value
	}

	if err := jc.Controller.SetClusterSpec(job, podTemplate, rt, idxStr); err != nil {
		return err
	}

	// Submit a warning event if the user specifies a restart policy in the
	// pod template. We recommend setting it at the replica level instead.
	if podTemplate.Spec.RestartPolicy != v1.RestartPolicy("") {
		errMsg := "Restart policy in pod template will be overwritten by restart policy in replica spec"
		logger.Warning(errMsg)
		jc.Recorder.Event(runtimeObject, v1.EventTypeWarning, podTemplateRestartPolicyReason, errMsg)
	}
	core.SetRestartPolicy(podTemplate, spec)

	// If gang-scheduling is enabled:
	// 1. if the user has specified another scheduler, we report a warning without overriding any fields;
	// 2. if no SchedulerName is set for the pods, we set the SchedulerName to the gang scheduler name.
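	// (DecoratePodTemplateSpec below is expected to perform step 2 and to
	// attach the pod to the job's PodGroup for the gang scheduler.)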
	if jc.Config.EnableGangScheduling() {
		if isCustomSchedulerSet(replicas, jc.PodGroupControl.GetSchedulerName()) {
			errMsg := "Another scheduler is specified when gang-scheduling is enabled and it will not be overwritten"
			logger.Warning(errMsg)
			jc.Recorder.Event(runtimeObject, v1.EventTypeWarning, podTemplateSchedulerNameReason, errMsg)
		}
		jc.PodGroupControl.DecoratePodTemplateSpec(podTemplate, metaObject, rt)
	}

	// Creation is expected when there is no error returned.
	// We use `RaiseExpectations` here to accumulate expectations, since
	// `SetExpectations` has no such ability.
	expectationPodsKey := expectation.GenExpectationPodsKey(jobKey, rt)
	jc.Expectations.RaiseExpectations(expectationPodsKey, 1, 0)

	controllerRef := jc.GenOwnerReference(metaObject)
	err = jc.PodControl.CreatePodsWithControllerRef(metaObject.GetNamespace(), podTemplate, runtimeObject, controllerRef)
	if err != nil && errors.IsTimeout(err) {
		// The pod is created but its initialization has timed out.
		// If the initialization is successful eventually, the
		// controller will observe the creation via the informer.
		// If the initialization fails, or if the pod stays
		// uninitialized for a long time, the informer will not
		// receive any update, and the controller will create a new
		// pod when the expectation expires.
		return nil
	} else if err != nil {
		// Since an error occurred (the informer won't observe this pod),
		// we decrement the expected number of creations
		// and wait until the next reconciliation.
		jc.Expectations.CreationObserved(expectationPodsKey)
		return err
	}
	createdPodsCount.Inc()
	return nil
}

func isCustomSchedulerSet(replicas map[apiv1.ReplicaType]*apiv1.ReplicaSpec, gangSchedulerName string) bool {
	for _, spec := range replicas {
		if spec.Template.Spec.SchedulerName != "" && spec.Template.Spec.SchedulerName != gangSchedulerName {
			return true
		}
	}
	return false
}
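// Illustrative only (a minimal sketch, not part of this package's API):
// the event handlers above are typically registered on a shared pod informer
// so that pod events requeue the owning job, e.g.
//
//	podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
//		AddFunc:    jc.AddPod,
//		UpdateFunc: jc.UpdatePod,
//		DeleteFunc: jc.DeletePod,
//	})
//
// where podInformer is a hypothetical core-v1 PodInformer from client-go.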