sigs.k8s.io/kueue@v0.6.2/pkg/controller/jobs/kubeflow/kubeflowjob/kubeflowjob_controller.go (about) 1 /* 2 Copyright 2023 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package kubeflowjob 18 19 import ( 20 "strings" 21 22 kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" 23 corev1 "k8s.io/api/core/v1" 24 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 25 "k8s.io/apimachinery/pkg/runtime/schema" 26 "k8s.io/utils/ptr" 27 "sigs.k8s.io/controller-runtime/pkg/client" 28 29 kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1" 30 "sigs.k8s.io/kueue/pkg/controller/jobframework" 31 "sigs.k8s.io/kueue/pkg/podset" 32 ) 33 34 type KubeflowJob struct { 35 KFJobControl KFJobControl 36 } 37 38 var _ jobframework.GenericJob = (*KubeflowJob)(nil) 39 var _ jobframework.JobWithPriorityClass = (*KubeflowJob)(nil) 40 41 func (j *KubeflowJob) Object() client.Object { 42 return j.KFJobControl.Object() 43 } 44 45 func (j *KubeflowJob) IsSuspended() bool { 46 return j.KFJobControl.RunPolicy().Suspend != nil && *j.KFJobControl.RunPolicy().Suspend 47 } 48 49 func (j *KubeflowJob) Suspend() { 50 j.KFJobControl.RunPolicy().Suspend = ptr.To(true) 51 } 52 53 func (j *KubeflowJob) RunWithPodSetsInfo(podSetsInfo []podset.PodSetInfo) error { 54 j.KFJobControl.RunPolicy().Suspend = ptr.To(false) 55 orderedReplicaTypes := j.OrderedReplicaTypes() 56 57 if len(podSetsInfo) != len(orderedReplicaTypes) { 58 return podset.BadPodSetsInfoLenError(len(orderedReplicaTypes), len(podSetsInfo)) 59 } 60 // The node selectors are provided in the same order as the generated list of 61 // podSets, use the same ordering logic to restore them. 62 for index := range podSetsInfo { 63 replicaType := orderedReplicaTypes[index] 64 info := podSetsInfo[index] 65 replica := &j.KFJobControl.ReplicaSpecs()[replicaType].Template 66 if err := podset.Merge(&replica.ObjectMeta, &replica.Spec, info); err != nil { 67 return err 68 } 69 70 } 71 return nil 72 } 73 74 func (j *KubeflowJob) RestorePodSetsInfo(podSetsInfo []podset.PodSetInfo) bool { 75 orderedReplicaTypes := j.OrderedReplicaTypes() 76 changed := false 77 for index, info := range podSetsInfo { 78 replicaType := orderedReplicaTypes[index] 79 replica := &j.KFJobControl.ReplicaSpecs()[replicaType].Template 80 changed = podset.RestorePodSpec(&replica.ObjectMeta, &replica.Spec, info) || changed 81 } 82 return changed 83 } 84 85 func (j *KubeflowJob) Finished() (metav1.Condition, bool) { 86 var conditionType kftraining.JobConditionType 87 var finished bool 88 if j.KFJobControl.JobStatus() == nil { 89 return metav1.Condition{}, false 90 } 91 for _, c := range j.KFJobControl.JobStatus().Conditions { 92 if (c.Type == kftraining.JobSucceeded || c.Type == kftraining.JobFailed) && c.Status == corev1.ConditionTrue { 93 conditionType = c.Type 94 finished = true 95 break 96 } 97 } 98 message := "Job finished successfully" 99 if conditionType == kftraining.JobFailed { 100 message = "Job failed" 101 } 102 condition := metav1.Condition{ 103 Type: kueue.WorkloadFinished, 104 Status: metav1.ConditionTrue, 105 Reason: "JobFinished", 106 Message: message, 107 } 108 return condition, finished 109 } 110 111 func (j *KubeflowJob) PodSets() []kueue.PodSet { 112 replicaTypes := j.OrderedReplicaTypes() 113 podSets := make([]kueue.PodSet, len(replicaTypes)) 114 for index, replicaType := range replicaTypes { 115 podSets[index] = kueue.PodSet{ 116 Name: strings.ToLower(string(replicaType)), 117 Template: *j.KFJobControl.ReplicaSpecs()[replicaType].Template.DeepCopy(), 118 Count: podsCount(j.KFJobControl.ReplicaSpecs(), replicaType), 119 } 120 } 121 return podSets 122 } 123 124 func (j *KubeflowJob) IsActive() bool { 125 for _, replicaStatus := range j.KFJobControl.JobStatus().ReplicaStatuses { 126 if replicaStatus.Active != 0 { 127 return true 128 } 129 } 130 return false 131 } 132 133 func (j *KubeflowJob) PodsReady() bool { 134 for _, c := range j.KFJobControl.JobStatus().Conditions { 135 if c.Type == kftraining.JobRunning && c.Status == corev1.ConditionTrue { 136 return true 137 } 138 } 139 return false 140 } 141 142 func (j *KubeflowJob) GVK() schema.GroupVersionKind { 143 return j.KFJobControl.GVK() 144 } 145 146 // PriorityClass calculates the priorityClass name needed for workload according to the following priorities: 147 // 1. .spec.runPolicy.schedulingPolicy.priorityClass 148 // 2. .spec.replicaSpecs[OrderedReplicaTypes[0]].template.spec.priorityClassName 149 // 3. .spec.replicaSpecs[OrderedReplicaTypes[1]].template.spec.priorityClassName 150 // 4. ... 151 // 152 // This function is inspired by an analogous one in mpi-controller: 153 // https://github.com/kubeflow/mpi-operator/blob/5946ef4157599a474ab82ff80e780d5c2546c9ee/pkg/controller/podgroup.go#L69-L72 154 func (j *KubeflowJob) PriorityClass() string { 155 if j.KFJobControl.RunPolicy().SchedulingPolicy != nil && len(j.KFJobControl.RunPolicy().SchedulingPolicy.PriorityClass) != 0 { 156 return j.KFJobControl.RunPolicy().SchedulingPolicy.PriorityClass 157 } 158 replicaTypes := j.OrderedReplicaTypes() 159 for _, replicaType := range replicaTypes { 160 if m := j.KFJobControl.ReplicaSpecs()[replicaType]; m != nil && len(m.Template.Spec.PriorityClassName) != 0 { 161 return m.Template.Spec.PriorityClassName 162 } 163 } 164 return "" 165 } 166 167 func (j *KubeflowJob) OrderedReplicaTypes() []kftraining.ReplicaType { 168 replicaTypes := j.KFJobControl.OrderedReplicaTypes() 169 result := make([]kftraining.ReplicaType, 0, len(replicaTypes)) 170 for _, replicaType := range replicaTypes { 171 if j.KFJobControl.ReplicaSpecs()[replicaType] != nil { 172 result = append(result, replicaType) 173 } 174 } 175 return result 176 } 177 178 func podsCount(replicaSpecs map[kftraining.ReplicaType]*kftraining.ReplicaSpec, replicaType kftraining.ReplicaType) int32 { 179 return ptr.Deref(replicaSpecs[replicaType].Replicas, 1) 180 }