github.com/kubeflow/training-operator@v1.7.0/pkg/core/job.go (about) 1 /* 2 Copyright 2023 The Kubeflow Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package core 18 19 import ( 20 "sort" 21 "strings" 22 "time" 23 24 log "github.com/sirupsen/logrus" 25 26 apiv1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" 27 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 28 29 v1 "k8s.io/api/core/v1" 30 "k8s.io/apimachinery/pkg/runtime" 31 "k8s.io/client-go/tools/record" 32 ) 33 34 // RecordAbnormalPods records the active pod whose latest condition is not in True status. 35 func RecordAbnormalPods(activePods []*v1.Pod, object runtime.Object, recorder record.EventRecorder) { 36 for _, pod := range activePods { 37 // If the pod starts running, should checks the container statuses rather than the conditions. 38 recordContainerStatus := func(status *v1.ContainerStatus) { 39 if status.State.Terminated != nil && status.State.Terminated.ExitCode != 0 { 40 terminated := status.State.Terminated 41 recorder.Eventf(object, v1.EventTypeWarning, terminated.Reason, 42 "Error pod %s container %s exitCode: %d terminated message: %s", 43 pod.Name, status.Name, terminated.ExitCode, terminated.Message) 44 } 45 // The terminated state and waiting state don't simultaneously exists, checks them at the same time. 46 if status.State.Waiting != nil && status.State.Waiting.Message != "" { 47 wait := status.State.Waiting 48 recorder.Eventf(object, v1.EventTypeWarning, wait.Reason, 49 "Error pod %s container %s waiting message: %s", pod.Name, status.Name, wait.Message) 50 } 51 } 52 if len(pod.Status.ContainerStatuses) != 0 { 53 for _, status := range pod.Status.ContainerStatuses { 54 recordContainerStatus(&status) 55 } 56 // If the pod has container status info, that means the init container statuses are normal. 57 continue 58 } 59 if len(pod.Status.InitContainerStatuses) != 0 { 60 for _, status := range pod.Status.InitContainerStatuses { 61 recordContainerStatus(&status) 62 } 63 continue 64 } 65 if len(pod.Status.Conditions) == 0 { 66 continue 67 } 68 // Should not modify the original pod which is stored in the informer cache. 69 status := pod.Status.DeepCopy() 70 sort.Slice(status.Conditions, func(i, j int) bool { 71 return status.Conditions[i].LastTransitionTime.After(status.Conditions[j].LastTransitionTime.Time) 72 }) 73 condition := status.Conditions[0] 74 if condition.Status == v1.ConditionTrue { 75 continue 76 } 77 recorder.Eventf(object, v1.EventTypeWarning, condition.Reason, "Error pod %s condition message: %s", pod.Name, condition.Message) 78 } 79 } 80 81 // PastActiveDeadline checks if job has ActiveDeadlineSeconds field set and if it is exceeded. 82 func PastActiveDeadline(runPolicy *apiv1.RunPolicy, jobStatus apiv1.JobStatus) bool { 83 if runPolicy.ActiveDeadlineSeconds == nil || jobStatus.StartTime == nil { 84 return false 85 } 86 now := metav1.Now() 87 start := jobStatus.StartTime.Time 88 duration := now.Time.Sub(start) 89 allowedDuration := time.Duration(*runPolicy.ActiveDeadlineSeconds) * time.Second 90 return duration >= allowedDuration 91 } 92 93 // PastBackoffLimit checks if container restartCounts sum exceeds BackoffLimit 94 // this method applies only to pods when restartPolicy is one of OnFailure, Always or ExitCode 95 func PastBackoffLimit(jobName string, runPolicy *apiv1.RunPolicy, 96 replicas map[apiv1.ReplicaType]*apiv1.ReplicaSpec, pods []*v1.Pod, 97 podFilterFunc func(pods []*v1.Pod, replicaType string) ([]*v1.Pod, error)) (bool, error) { 98 if runPolicy.BackoffLimit == nil { 99 return false, nil 100 } 101 result := int32(0) 102 for rtype, spec := range replicas { 103 if spec.RestartPolicy != apiv1.RestartPolicyOnFailure && spec.RestartPolicy != apiv1.RestartPolicyAlways && spec.RestartPolicy != apiv1.RestartPolicyExitCode { 104 log.Warnf("The restart policy of replica %v of the job %v is not OnFailure, Always or ExitCode. Not counted in backoff limit.", rtype, jobName) 105 continue 106 } 107 // Convert ReplicaType to lower string. 108 rt := strings.ToLower(string(rtype)) 109 pods, err := podFilterFunc(pods, rt) 110 if err != nil { 111 return false, err 112 } 113 for i := range pods { 114 po := pods[i] 115 if po.Status.Phase != v1.PodRunning { 116 continue 117 } 118 for j := range po.Status.InitContainerStatuses { 119 stat := po.Status.InitContainerStatuses[j] 120 result += stat.RestartCount 121 } 122 for j := range po.Status.ContainerStatuses { 123 stat := po.Status.ContainerStatuses[j] 124 result += stat.RestartCount 125 } 126 } 127 } 128 129 if *runPolicy.BackoffLimit == 0 { 130 return result > 0, nil 131 } 132 return result >= *runPolicy.BackoffLimit, nil 133 }