github.com/kubeflow/training-operator@v1.7.0/pkg/core/job.go (about)

     1  /*
     2  Copyright 2023 The Kubeflow Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package core
    18  
    19  import (
    20  	"sort"
    21  	"strings"
    22  	"time"
    23  
    24  	log "github.com/sirupsen/logrus"
    25  
    26  	apiv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
    27  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    28  
    29  	v1 "k8s.io/api/core/v1"
    30  	"k8s.io/apimachinery/pkg/runtime"
    31  	"k8s.io/client-go/tools/record"
    32  )
    33  
    34  // RecordAbnormalPods records the active pod whose latest condition is not in True status.
    35  func RecordAbnormalPods(activePods []*v1.Pod, object runtime.Object, recorder record.EventRecorder) {
    36  	for _, pod := range activePods {
    37  		// If the pod starts running, should checks the container statuses rather than the conditions.
    38  		recordContainerStatus := func(status *v1.ContainerStatus) {
    39  			if status.State.Terminated != nil && status.State.Terminated.ExitCode != 0 {
    40  				terminated := status.State.Terminated
    41  				recorder.Eventf(object, v1.EventTypeWarning, terminated.Reason,
    42  					"Error pod %s container %s exitCode: %d terminated message: %s",
    43  					pod.Name, status.Name, terminated.ExitCode, terminated.Message)
    44  			}
    45  			// The terminated state and waiting state don't simultaneously exists, checks them at the same time.
    46  			if status.State.Waiting != nil && status.State.Waiting.Message != "" {
    47  				wait := status.State.Waiting
    48  				recorder.Eventf(object, v1.EventTypeWarning, wait.Reason,
    49  					"Error pod %s container %s waiting message: %s", pod.Name, status.Name, wait.Message)
    50  			}
    51  		}
    52  		if len(pod.Status.ContainerStatuses) != 0 {
    53  			for _, status := range pod.Status.ContainerStatuses {
    54  				recordContainerStatus(&status)
    55  			}
    56  			// If the pod has container status info, that means the init container statuses are normal.
    57  			continue
    58  		}
    59  		if len(pod.Status.InitContainerStatuses) != 0 {
    60  			for _, status := range pod.Status.InitContainerStatuses {
    61  				recordContainerStatus(&status)
    62  			}
    63  			continue
    64  		}
    65  		if len(pod.Status.Conditions) == 0 {
    66  			continue
    67  		}
    68  		// Should not modify the original pod which is stored in the informer cache.
    69  		status := pod.Status.DeepCopy()
    70  		sort.Slice(status.Conditions, func(i, j int) bool {
    71  			return status.Conditions[i].LastTransitionTime.After(status.Conditions[j].LastTransitionTime.Time)
    72  		})
    73  		condition := status.Conditions[0]
    74  		if condition.Status == v1.ConditionTrue {
    75  			continue
    76  		}
    77  		recorder.Eventf(object, v1.EventTypeWarning, condition.Reason, "Error pod %s condition message: %s", pod.Name, condition.Message)
    78  	}
    79  }
    80  
    81  // PastActiveDeadline checks if job has ActiveDeadlineSeconds field set and if it is exceeded.
    82  func PastActiveDeadline(runPolicy *apiv1.RunPolicy, jobStatus apiv1.JobStatus) bool {
    83  	if runPolicy.ActiveDeadlineSeconds == nil || jobStatus.StartTime == nil {
    84  		return false
    85  	}
    86  	now := metav1.Now()
    87  	start := jobStatus.StartTime.Time
    88  	duration := now.Time.Sub(start)
    89  	allowedDuration := time.Duration(*runPolicy.ActiveDeadlineSeconds) * time.Second
    90  	return duration >= allowedDuration
    91  }
    92  
    93  // PastBackoffLimit checks if container restartCounts sum exceeds BackoffLimit
    94  // this method applies only to pods when restartPolicy is one of OnFailure, Always or ExitCode
    95  func PastBackoffLimit(jobName string, runPolicy *apiv1.RunPolicy,
    96  	replicas map[apiv1.ReplicaType]*apiv1.ReplicaSpec, pods []*v1.Pod,
    97  	podFilterFunc func(pods []*v1.Pod, replicaType string) ([]*v1.Pod, error)) (bool, error) {
    98  	if runPolicy.BackoffLimit == nil {
    99  		return false, nil
   100  	}
   101  	result := int32(0)
   102  	for rtype, spec := range replicas {
   103  		if spec.RestartPolicy != apiv1.RestartPolicyOnFailure && spec.RestartPolicy != apiv1.RestartPolicyAlways && spec.RestartPolicy != apiv1.RestartPolicyExitCode {
   104  			log.Warnf("The restart policy of replica %v of the job %v is not OnFailure, Always or ExitCode. Not counted in backoff limit.", rtype, jobName)
   105  			continue
   106  		}
   107  		// Convert ReplicaType to lower string.
   108  		rt := strings.ToLower(string(rtype))
   109  		pods, err := podFilterFunc(pods, rt)
   110  		if err != nil {
   111  			return false, err
   112  		}
   113  		for i := range pods {
   114  			po := pods[i]
   115  			if po.Status.Phase != v1.PodRunning {
   116  				continue
   117  			}
   118  			for j := range po.Status.InitContainerStatuses {
   119  				stat := po.Status.InitContainerStatuses[j]
   120  				result += stat.RestartCount
   121  			}
   122  			for j := range po.Status.ContainerStatuses {
   123  				stat := po.Status.ContainerStatuses[j]
   124  				result += stat.RestartCount
   125  			}
   126  		}
   127  	}
   128  
   129  	if *runPolicy.BackoffLimit == 0 {
   130  		return result > 0, nil
   131  	}
   132  	return result >= *runPolicy.BackoffLimit, nil
   133  }