sigs.k8s.io/kueue@v0.6.2/pkg/controller/jobs/kubeflow/kubeflowjob/kubeflowjob_controller.go (about)

     1  /*
     2  Copyright 2023 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package kubeflowjob
    18  
    19  import (
    20  	"strings"
    21  
    22  	kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
    23  	corev1 "k8s.io/api/core/v1"
    24  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    25  	"k8s.io/apimachinery/pkg/runtime/schema"
    26  	"k8s.io/utils/ptr"
    27  	"sigs.k8s.io/controller-runtime/pkg/client"
    28  
    29  	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
    30  	"sigs.k8s.io/kueue/pkg/controller/jobframework"
    31  	"sigs.k8s.io/kueue/pkg/podset"
    32  )
    33  
    34  type KubeflowJob struct {
    35  	KFJobControl KFJobControl
    36  }
    37  
    38  var _ jobframework.GenericJob = (*KubeflowJob)(nil)
    39  var _ jobframework.JobWithPriorityClass = (*KubeflowJob)(nil)
    40  
    41  func (j *KubeflowJob) Object() client.Object {
    42  	return j.KFJobControl.Object()
    43  }
    44  
    45  func (j *KubeflowJob) IsSuspended() bool {
    46  	return j.KFJobControl.RunPolicy().Suspend != nil && *j.KFJobControl.RunPolicy().Suspend
    47  }
    48  
    49  func (j *KubeflowJob) Suspend() {
    50  	j.KFJobControl.RunPolicy().Suspend = ptr.To(true)
    51  }
    52  
    53  func (j *KubeflowJob) RunWithPodSetsInfo(podSetsInfo []podset.PodSetInfo) error {
    54  	j.KFJobControl.RunPolicy().Suspend = ptr.To(false)
    55  	orderedReplicaTypes := j.OrderedReplicaTypes()
    56  
    57  	if len(podSetsInfo) != len(orderedReplicaTypes) {
    58  		return podset.BadPodSetsInfoLenError(len(orderedReplicaTypes), len(podSetsInfo))
    59  	}
    60  	// The node selectors are provided in the same order as the generated list of
    61  	// podSets, use the same ordering logic to restore them.
    62  	for index := range podSetsInfo {
    63  		replicaType := orderedReplicaTypes[index]
    64  		info := podSetsInfo[index]
    65  		replica := &j.KFJobControl.ReplicaSpecs()[replicaType].Template
    66  		if err := podset.Merge(&replica.ObjectMeta, &replica.Spec, info); err != nil {
    67  			return err
    68  		}
    69  
    70  	}
    71  	return nil
    72  }
    73  
    74  func (j *KubeflowJob) RestorePodSetsInfo(podSetsInfo []podset.PodSetInfo) bool {
    75  	orderedReplicaTypes := j.OrderedReplicaTypes()
    76  	changed := false
    77  	for index, info := range podSetsInfo {
    78  		replicaType := orderedReplicaTypes[index]
    79  		replica := &j.KFJobControl.ReplicaSpecs()[replicaType].Template
    80  		changed = podset.RestorePodSpec(&replica.ObjectMeta, &replica.Spec, info) || changed
    81  	}
    82  	return changed
    83  }
    84  
    85  func (j *KubeflowJob) Finished() (metav1.Condition, bool) {
    86  	var conditionType kftraining.JobConditionType
    87  	var finished bool
    88  	if j.KFJobControl.JobStatus() == nil {
    89  		return metav1.Condition{}, false
    90  	}
    91  	for _, c := range j.KFJobControl.JobStatus().Conditions {
    92  		if (c.Type == kftraining.JobSucceeded || c.Type == kftraining.JobFailed) && c.Status == corev1.ConditionTrue {
    93  			conditionType = c.Type
    94  			finished = true
    95  			break
    96  		}
    97  	}
    98  	message := "Job finished successfully"
    99  	if conditionType == kftraining.JobFailed {
   100  		message = "Job failed"
   101  	}
   102  	condition := metav1.Condition{
   103  		Type:    kueue.WorkloadFinished,
   104  		Status:  metav1.ConditionTrue,
   105  		Reason:  "JobFinished",
   106  		Message: message,
   107  	}
   108  	return condition, finished
   109  }
   110  
   111  func (j *KubeflowJob) PodSets() []kueue.PodSet {
   112  	replicaTypes := j.OrderedReplicaTypes()
   113  	podSets := make([]kueue.PodSet, len(replicaTypes))
   114  	for index, replicaType := range replicaTypes {
   115  		podSets[index] = kueue.PodSet{
   116  			Name:     strings.ToLower(string(replicaType)),
   117  			Template: *j.KFJobControl.ReplicaSpecs()[replicaType].Template.DeepCopy(),
   118  			Count:    podsCount(j.KFJobControl.ReplicaSpecs(), replicaType),
   119  		}
   120  	}
   121  	return podSets
   122  }
   123  
   124  func (j *KubeflowJob) IsActive() bool {
   125  	for _, replicaStatus := range j.KFJobControl.JobStatus().ReplicaStatuses {
   126  		if replicaStatus.Active != 0 {
   127  			return true
   128  		}
   129  	}
   130  	return false
   131  }
   132  
   133  func (j *KubeflowJob) PodsReady() bool {
   134  	for _, c := range j.KFJobControl.JobStatus().Conditions {
   135  		if c.Type == kftraining.JobRunning && c.Status == corev1.ConditionTrue {
   136  			return true
   137  		}
   138  	}
   139  	return false
   140  }
   141  
   142  func (j *KubeflowJob) GVK() schema.GroupVersionKind {
   143  	return j.KFJobControl.GVK()
   144  }
   145  
   146  // PriorityClass calculates the priorityClass name needed for workload according to the following priorities:
   147  //  1. .spec.runPolicy.schedulingPolicy.priorityClass
   148  //  2. .spec.replicaSpecs[OrderedReplicaTypes[0]].template.spec.priorityClassName
   149  //  3. .spec.replicaSpecs[OrderedReplicaTypes[1]].template.spec.priorityClassName
   150  //  4. ...
   151  //
   152  // This function is inspired by an analogous one in mpi-controller:
   153  // https://github.com/kubeflow/mpi-operator/blob/5946ef4157599a474ab82ff80e780d5c2546c9ee/pkg/controller/podgroup.go#L69-L72
   154  func (j *KubeflowJob) PriorityClass() string {
   155  	if j.KFJobControl.RunPolicy().SchedulingPolicy != nil && len(j.KFJobControl.RunPolicy().SchedulingPolicy.PriorityClass) != 0 {
   156  		return j.KFJobControl.RunPolicy().SchedulingPolicy.PriorityClass
   157  	}
   158  	replicaTypes := j.OrderedReplicaTypes()
   159  	for _, replicaType := range replicaTypes {
   160  		if m := j.KFJobControl.ReplicaSpecs()[replicaType]; m != nil && len(m.Template.Spec.PriorityClassName) != 0 {
   161  			return m.Template.Spec.PriorityClassName
   162  		}
   163  	}
   164  	return ""
   165  }
   166  
   167  func (j *KubeflowJob) OrderedReplicaTypes() []kftraining.ReplicaType {
   168  	replicaTypes := j.KFJobControl.OrderedReplicaTypes()
   169  	result := make([]kftraining.ReplicaType, 0, len(replicaTypes))
   170  	for _, replicaType := range replicaTypes {
   171  		if j.KFJobControl.ReplicaSpecs()[replicaType] != nil {
   172  			result = append(result, replicaType)
   173  		}
   174  	}
   175  	return result
   176  }
   177  
   178  func podsCount(replicaSpecs map[kftraining.ReplicaType]*kftraining.ReplicaSpec, replicaType kftraining.ReplicaType) int32 {
   179  	return ptr.Deref(replicaSpecs[replicaType].Replicas, 1)
   180  }