github.com/kubeflow/training-operator@v1.7.0/pkg/controller.v1/common/pod.go

// Copyright 2019 The Kubeflow Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package common

import (
	"fmt"
	"reflect"
	"strconv"
	"strings"

	apiv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
	trainingoperatorcommon "github.com/kubeflow/training-operator/pkg/common"
	"github.com/kubeflow/training-operator/pkg/controller.v1/control"
	"github.com/kubeflow/training-operator/pkg/controller.v1/expectation"
	"github.com/kubeflow/training-operator/pkg/core"
	commonutil "github.com/kubeflow/training-operator/pkg/util"
	utillabels "github.com/kubeflow/training-operator/pkg/util/labels"
	trainutil "github.com/kubeflow/training-operator/pkg/util/train"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
	log "github.com/sirupsen/logrus"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/runtime"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/client-go/tools/cache"
)

const (
	// podTemplateRestartPolicyReason is the warning reason when the restart
	// policy is set in the pod template.
	podTemplateRestartPolicyReason = "SettedPodTemplateRestartPolicy"
	// exitedWithCodeReason is the normal reason when the pod exited because of its exit code.
	exitedWithCodeReason = "ExitedWithCode"
	// podTemplateSchedulerNameReason is the warning reason when another scheduler name is set
	// in pod templates while gang-scheduling is enabled.
	podTemplateSchedulerNameReason = "SettedPodTemplateSchedulerName"
)

var (
	// Prometheus metrics
	createdPodsCount = promauto.NewCounter(prometheus.CounterOpts{
		Name: "created_pods_total",
		Help: "The total number of created pods",
	})
	deletedPodsCount = promauto.NewCounter(prometheus.CounterOpts{
		Name: "deleted_pods_total",
		Help: "The total number of deleted pods",
	})
	failedPodsCount = promauto.NewCounter(prometheus.CounterOpts{
		Name: "failed_pods_total",
		Help: "The total number of failed pods",
	})
)

// AddPod is called when a pod is created. It enqueues the job that manages
// the pod and updates the job's expectations.
func (jc *JobController) AddPod(obj interface{}) {
	pod := obj.(*v1.Pod)
	if pod.DeletionTimestamp != nil {
		// On a restart of the controller manager, it's possible a new pod shows up in a state that
		// is already pending deletion. Prevent the pod from being counted as a creation observation.
		// jc.deletePod(pod)
		return
	}

	// If it has a ControllerRef, that's all that matters.
	if controllerRef := metav1.GetControllerOf(pod); controllerRef != nil {
		job := jc.resolveControllerRef(pod.Namespace, controllerRef)

		logger := commonutil.LoggerForPod(pod, jc.Controller.GetAPIGroupVersionKind().Kind)

		if job == nil {
			if utillabels.HasKnownLabels(pod.Labels, jc.Controller.GetGroupNameLabelValue()) {
				logger.Info("This pod's job does not exist")
			}
			return
		}

		jobKey, err := KeyFunc(job)
		if err != nil {
			logger.Infof("Failed to get the job key: %v", err)
			return
		}

		rType, err := utillabels.ReplicaType(pod.Labels)
		if err != nil {
			logger.Infof("This pod was probably not created by %v", jc.Controller.ControllerName())
			return
		}

		expectationPodsKey := expectation.GenExpectationPodsKey(jobKey, string(rType))

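		// CreationObserved decrements the number of pod creations this job
		// still expects to see for the replica type; once the outstanding
		// count drops to zero, the expectations are satisfied and the job
		// sync can trust the pod cache again.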
		jc.Expectations.CreationObserved(expectationPodsKey)
		// TODO: we may need to add backoff here
		jc.WorkQueue.Add(jobKey)

		return
	}

}

// UpdatePod is called when a pod is updated. It figures out what job manages
// the pod and wakes it up. If the labels of the pod have changed, we need to
// awaken both the old and the new job. old and cur must be *v1.Pod types.
func (jc *JobController) UpdatePod(old, cur interface{}) {
	curPod := cur.(*v1.Pod)
	oldPod := old.(*v1.Pod)
	if curPod.ResourceVersion == oldPod.ResourceVersion {
		// Periodic resync will send update events for all known pods.
		// Two different versions of the same pod will always have different RVs.
		return
	}

	logger := commonutil.LoggerForPod(curPod, jc.Controller.GetAPIGroupVersionKind().Kind)
	curControllerRef := metav1.GetControllerOf(curPod)
	oldControllerRef := metav1.GetControllerOf(oldPod)
	controllerRefChanged := !reflect.DeepEqual(curControllerRef, oldControllerRef)
	if controllerRefChanged && oldControllerRef != nil {
		// The ControllerRef was changed. Sync the old controller, if any.
		if job := jc.resolveControllerRef(oldPod.Namespace, oldControllerRef); job != nil {
			logger.Infof("pod ControllerRef updated: %v, %v", curPod, oldPod)
			jobKey, err := KeyFunc(job)
			if err != nil {
				return
			}
			// TODO: we may need to add backoff here
			jc.WorkQueue.Add(jobKey)
		}
	}

	// If it has a ControllerRef, that's all that matters.
	if curControllerRef != nil {
		job := jc.resolveControllerRef(curPod.Namespace, curControllerRef)
		if job == nil {
			return
		}
		logger.Debugf("pod has a ControllerRef: %v, %v", curPod, oldPod)
		jobKey, err := KeyFunc(job)
		if err != nil {
			return
		}
		// TODO: we may need to add backoff here
		jc.WorkQueue.Add(jobKey)
		return
	}
}

// DeletePod is called when a pod is deleted. It enqueues the job that manages
// the pod and updates the job's expectations.
// obj could be a *v1.Pod, or a DeletionFinalStateUnknown marker item.
func (jc *JobController) DeletePod(obj interface{}) {
	pod, ok := obj.(*v1.Pod)

	// When a delete is dropped, the relist will notice a pod in the store not
	// in the list, leading to the insertion of a tombstone object which contains
	// the deleted key/value. Note that this value might be stale. If the pod
	// changed labels the new job will not be woken up till the periodic resync.
	if !ok {
		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %+v", obj))
			return
		}
		pod, ok = tombstone.Obj.(*v1.Pod)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a pod %+v", obj))
			return
		}
	}

	// Create the logger only after pod is known to be a valid *v1.Pod;
	// creating it before the tombstone handling above could dereference a nil pod.
	logger := commonutil.LoggerForPod(pod, jc.Controller.GetAPIGroupVersionKind().Kind)

	controllerRef := metav1.GetControllerOf(pod)
	if controllerRef == nil {
		// No controller should care about orphans being deleted.
		return
	}
	job := jc.resolveControllerRef(pod.Namespace, controllerRef)
	if job == nil {
		return
	}
	jobKey, err := KeyFunc(job)
	if err != nil {
		return
	}

	rType, err := utillabels.ReplicaType(pod.Labels)
	if err != nil {
		logger.Infof("This pod was probably not created by %v", jc.Controller.ControllerName())
		return
	}

	expectationPodsKey := expectation.GenExpectationPodsKey(jobKey, string(rType))

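	// DeletionObserved decrements the number of pod deletions this job still
	// expects to see for the replica type, mirroring the CreationObserved
	// bookkeeping in AddPod.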
	jc.Expectations.DeletionObserved(expectationPodsKey)
	deletedPodsCount.Inc()
	// TODO: we may need to add backoff here
	jc.WorkQueue.Add(jobKey)
}

// GetPodsForJob returns the set of pods that this job should manage.
// It also reconciles ControllerRef by adopting/orphaning.
// Note that the returned Pods are pointers into the cache.
func (jc *JobController) GetPodsForJob(jobObject interface{}) ([]*v1.Pod, error) {
	job, ok := jobObject.(metav1.Object)
	if !ok {
		return nil, fmt.Errorf("job is not of type metav1.Object")
	}

	// Create selector.
	selector, err := metav1.LabelSelectorAsSelector(&metav1.LabelSelector{
		MatchLabels: jc.GenLabels(job.GetName()),
	})

	if err != nil {
		return nil, fmt.Errorf("couldn't convert Job selector: %v", err)
	}
	// List all pods to include those that don't match the selector anymore
	// but have a ControllerRef pointing to this controller.
	pods, err := jc.PodLister.Pods(job.GetNamespace()).List(labels.Everything())
	if err != nil {
		return nil, err
	}

	// If any adoptions are attempted, we should first recheck for deletion
	// with an uncached quorum read sometime after listing Pods (see #42639).
	canAdoptFunc := RecheckDeletionTimestamp(func() (metav1.Object, error) {
		fresh, err := jc.Controller.GetJobFromAPIClient(job.GetNamespace(), job.GetName())
		if err != nil {
			return nil, err
		}
		if fresh.GetUID() != job.GetUID() {
			return nil, fmt.Errorf("original Job %v/%v is gone: got uid %v, wanted %v", job.GetNamespace(), job.GetName(), fresh.GetUID(), job.GetUID())
		}
		return fresh, nil
	})
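	// ClaimPods adopts orphaned pods that match the selector and releases
	// claimed pods that no longer match it; adoption first consults
	// canAdoptFunc so a job that is being deleted never adopts new pods.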
	cm := control.NewPodControllerRefManager(jc.PodControl, job, selector, jc.Controller.GetAPIGroupVersionKind(), canAdoptFunc)
	return cm.ClaimPods(pods)
}

// FilterPodsForReplicaType returns the pods that belong to a replicaType.
func (jc *JobController) FilterPodsForReplicaType(pods []*v1.Pod, replicaType string) ([]*v1.Pod, error) {
	return core.FilterPodsForReplicaType(pods, replicaType)
}

// GetPodSlices returns a slice of pod slices, one per replica index.
// It gives the caller enough information to decide whether to scale
// resources up or down.
func (jc *JobController) GetPodSlices(pods []*v1.Pod, replicas int, logger *log.Entry) [][]*v1.Pod {
	return core.GetPodSlices(pods, replicas, logger)
}

// ReconcilePods checks and updates pods for each given ReplicaSpec.
// It will requeue the job in case of an error while creating/deleting pods.
func (jc *JobController) ReconcilePods(
	job interface{},
	jobStatus *apiv1.JobStatus,
	pods []*v1.Pod,
	rType apiv1.ReplicaType,
	spec *apiv1.ReplicaSpec,
	replicas map[apiv1.ReplicaType]*apiv1.ReplicaSpec) error {

	// Convert the ReplicaType to a lower-case string.
	rt := strings.ToLower(string(rType))
	metaObject, ok := job.(metav1.Object)
	if !ok {
		return fmt.Errorf("job is not a metav1.Object type")
	}
	runtimeObject, ok := job.(runtime.Object)
	if !ok {
		return fmt.Errorf("job is not a runtime.Object type")
	}
	jobKey, err := KeyFunc(metaObject)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for job object %#v: %v", job, err))
		return err
	}
	jobKind := jc.Controller.GetAPIGroupVersionKind().Kind
	expectationPodsKey := expectation.GenExpectationPodsKey(jobKey, rt)

	logger := commonutil.LoggerForReplica(metaObject, rt)
	// Get all pods for the type rt.
	pods, err = jc.FilterPodsForReplicaType(pods, rt)
	if err != nil {
		return err
	}
	numReplicas := int(*spec.Replicas)
	var masterRole bool

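	// Reset the replica status counters (active/succeeded/failed) for this
	// replica type before they are recomputed from the pod list below.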
	initializeReplicaStatuses(jobStatus, rType)

	// GetPodSlices returns enough information here to decide whether to add, remove, or update resources.
	//
	// For example, assume we have pods with replica-index 0, 1, 2.
	// If replicas is 4, it returns a slice of size 4 ([[0],[1],[2],[]]), and a pod with replica-index 3 will be created.
	//
	// If replicas is 1, it returns a slice of size 3 ([[0],[1],[2]]); the pods with replica-index 1 and 2 are out of range and will be deleted.
	podSlices := jc.GetPodSlices(pods, numReplicas, logger)
	for index, podSlice := range podSlices {
		if len(podSlice) > 1 {
			logger.Warningf("We have too many pods for %s %d", rt, index)
		} else if len(podSlice) == 0 {
			logger.Infof("Need to create new pod: %s-%d", rt, index)

			// Check if this replica holds the master role.
			masterRole = jc.Controller.IsMasterRole(replicas, rType, index)
			err = jc.createNewPod(job, rt, index, spec, masterRole, replicas)
			if err != nil {
				return err
			}
		} else {
			// Check the status of the current pod.
			pod := podSlice[0]

			// Check if the index is in the valid range; if not, kill the pod.
			if index < 0 || index >= numReplicas {
				err = jc.PodControl.DeletePod(pod.Namespace, pod.Name, runtimeObject)
				if err != nil {
					return err
				}
				// Deletion is expected.
				jc.Expectations.RaiseExpectations(expectationPodsKey, 0, 1)
				// The pod is being deleted; skip the exit-code and restart checks below.
				continue
			}

			// Get the exit code of the default container.
			var exitCode int32 = 0xbeef // sentinel: no terminated state observed yet
			for _, status := range pod.Status.ContainerStatuses {
				state := status.State
				if status.Name == jc.Controller.GetDefaultContainerName() && state.Terminated != nil {
					exitCode = state.Terminated.ExitCode
					logger.Infof("Pod: %v.%v exited with code %v", pod.Namespace, pod.Name, exitCode)
					jc.Recorder.Eventf(runtimeObject, v1.EventTypeNormal, exitedWithCodeReason, "Pod: %v.%v exited with code %v", pod.Namespace, pod.Name, exitCode)
				}
			}
			// Check if the pod is retryable.
			if pod.Status.Phase == v1.PodFailed &&
				(spec.RestartPolicy == apiv1.RestartPolicyExitCode && trainutil.IsRetryableExitCode(exitCode) ||
					spec.RestartPolicy == apiv1.RestartPolicyOnFailure ||
					spec.RestartPolicy == apiv1.RestartPolicyAlways) {
				failedPodsCount.Inc()
				logger.Infof("Need to restart the pod: %v.%v", pod.Namespace, pod.Name)
				if err := jc.PodControl.DeletePod(pod.Namespace, pod.Name, runtimeObject); err != nil {
					return err
				}
				// Deletion is expected.
				jc.Expectations.RaiseExpectations(expectationPodsKey, 0, 1)

				msg := fmt.Sprintf("job %s is restarting because %s replica(s) failed.",
					metaObject.GetName(), rType)
				jc.Recorder.Event(runtimeObject, v1.EventTypeWarning, commonutil.NewReason(jobKind, commonutil.JobRestartingReason), msg)
				commonutil.UpdateJobConditions(jobStatus, apiv1.JobRestarting, v1.ConditionTrue, commonutil.NewReason(jobKind, commonutil.JobRestartingReason), msg)
				trainingoperatorcommon.RestartedJobsCounterInc(metaObject.GetNamespace(), jc.Controller.GetFrameworkName())
			}

			updateJobReplicaStatuses(jobStatus, rType, pod)
		}
	}
	return nil
}

// createNewPod creates a new pod for the given index and type.
func (jc *JobController) createNewPod(job interface{}, rt string, index int, spec *apiv1.ReplicaSpec, masterRole bool,
	replicas map[apiv1.ReplicaType]*apiv1.ReplicaSpec) error {

	metaObject, ok := job.(metav1.Object)
	if !ok {
		return fmt.Errorf("job is not a metav1.Object type")
	}
	runtimeObject, ok := job.(runtime.Object)
	if !ok {
		return fmt.Errorf("job is not a runtime.Object type")
	}
	jobKey, err := KeyFunc(metaObject)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for job object %#v: %v", job, err))
		return err
	}
	logger := commonutil.LoggerForReplica(metaObject, rt)

	// Set type and index for the worker.
	labels := jc.GenLabels(metaObject.GetName())
	utillabels.SetReplicaType(labels, rt)
	utillabels.SetReplicaIndex(labels, index)

	if masterRole {
		utillabels.SetJobRole(labels, "master")
	}

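	// Deep-copy the template so the per-replica mutations below (name, labels,
	// cluster spec, restart policy) do not leak into the shared ReplicaSpec.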
	podTemplate := spec.Template.DeepCopy()

	idxStr := strconv.Itoa(index)
	// Set name for the template.
	podTemplate.Name = GenGeneralName(metaObject.GetName(), rt, idxStr)

	if podTemplate.Labels == nil {
		podTemplate.Labels = make(map[string]string)
	}

	for key, value := range labels {
		podTemplate.Labels[key] = value
	}

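	// Let the framework-specific controller inject its cluster configuration
	// (for example, distributed-training environment variables) into the pod
	// template before the pod is created.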
	if err := jc.Controller.SetClusterSpec(job, podTemplate, rt, idxStr); err != nil {
		return err
	}

	// Submit a warning event if the user specifies a restart policy in the
	// pod template. We recommend setting it at the replica level.
	if podTemplate.Spec.RestartPolicy != v1.RestartPolicy("") {
		errMsg := "Restart policy in pod template will be overwritten by restart policy in replica spec"
		logger.Warning(errMsg)
		jc.Recorder.Event(runtimeObject, v1.EventTypeWarning, podTemplateRestartPolicyReason, errMsg)
	}
	core.SetRestartPolicy(podTemplate, spec)

	// If gang-scheduling is enabled:
	// 1. if the user has specified another scheduler, we report a warning without overriding any fields.
	// 2. if no SchedulerName is set for the pods, we set the SchedulerName to the gang scheduler's name.
	if jc.Config.EnableGangScheduling() {
		if isCustomSchedulerSet(replicas, jc.PodGroupControl.GetSchedulerName()) {
			errMsg := "Another scheduler is specified when gang-scheduling is enabled and it will not be overwritten"
			logger.Warning(errMsg)
			jc.Recorder.Event(runtimeObject, v1.EventTypeWarning, podTemplateSchedulerNameReason, errMsg)
		}
		jc.PodGroupControl.DecoratePodTemplateSpec(podTemplate, metaObject, rt)
	}

	// Creation is expected when no error is returned.
	// We use RaiseExpectations here to accumulate expectations, since
	// SetExpectations overwrites the current count instead of adding to it.
	expectationPodsKey := expectation.GenExpectationPodsKey(jobKey, rt)
	jc.Expectations.RaiseExpectations(expectationPodsKey, 1, 0)

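	// Create the pod with an OwnerReference pointing back at the job, so that
	// AddPod/UpdatePod/DeletePod can resolve the managing job from the
	// ControllerRef and garbage collection cleans the pod up with the job.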
	controllerRef := jc.GenOwnerReference(metaObject)
	err = jc.PodControl.CreatePodsWithControllerRef(metaObject.GetNamespace(), podTemplate, runtimeObject, controllerRef)
	if err != nil && errors.IsTimeout(err) {
		// Pod is created but its initialization has timed out.
		// If the initialization is successful eventually, the
		// controller will observe the creation via the informer.
		// If the initialization fails, or if the pod stays
		// uninitialized for a long time, the informer will not
		// receive any update, and the controller will create a new
		// pod when the expectation expires.
		return nil
	} else if err != nil {
		// Since an error occurred (the informer won't observe this pod),
		// we decrement the expected number of creates
		// and wait until the next reconciliation.
		jc.Expectations.CreationObserved(expectationPodsKey)
		return err
	}
	createdPodsCount.Inc()
	return nil
}

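// isCustomSchedulerSet reports whether any replica template specifies a
// scheduler other than the configured gang scheduler.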
func isCustomSchedulerSet(replicas map[apiv1.ReplicaType]*apiv1.ReplicaSpec, gangSchedulerName string) bool {
	for _, spec := range replicas {
		if spec.Template.Spec.SchedulerName != "" && spec.Template.Spec.SchedulerName != gangSchedulerName {
			return true
		}
	}
	return false
}