sigs.k8s.io/kueue@v0.6.2/pkg/controller/jobs/jobset/jobset_controller.go (about)

     1  /*
     2  Copyright 2023 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package jobset
    18  
    19  import (
    20  	"context"
    21  	"strings"
    22  
    23  	apimeta "k8s.io/apimachinery/pkg/api/meta"
    24  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    25  	"k8s.io/apimachinery/pkg/runtime"
    26  	"k8s.io/apimachinery/pkg/runtime/schema"
    27  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    28  	"k8s.io/utils/ptr"
    29  	"sigs.k8s.io/controller-runtime/pkg/client"
    30  	jobsetapi "sigs.k8s.io/jobset/api/jobset/v1alpha2"
    31  
    32  	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
    33  	"sigs.k8s.io/kueue/pkg/controller/jobframework"
    34  	"sigs.k8s.io/kueue/pkg/podset"
    35  	"sigs.k8s.io/kueue/pkg/util/slices"
    36  )
    37  
var (
	// gvk is the GroupVersionKind for the "JobSet" kind in the jobsetapi group/version.
	gvk           = jobsetapi.GroupVersion.WithKind("JobSet")
	// FrameworkName is the name under which this integration is registered
	// with the job framework (see init).
	FrameworkName = "jobset.x-k8s.io/jobset"
)
    42  
// init registers the JobSet integration and its callbacks with the job
// framework under FrameworkName. The error returned by the registration is
// handed to utilruntime.Must.
func init() {
	utilruntime.Must(jobframework.RegisterIntegration(FrameworkName, jobframework.IntegrationCallbacks{
		SetupIndexes:           SetupIndexes,
		NewReconciler:          NewReconciler,
		SetupWebhook:           SetupJobSetWebhook,
		JobType:                &jobsetapi.JobSet{},
		AddToScheme:            jobsetapi.AddToScheme,
		IsManagingObjectsOwner: isJobSet,
	}))
}
    53  
    54  // +kubebuilder:rbac:groups=scheduling.k8s.io,resources=priorityclasses,verbs=list;get;watch
    55  // +kubebuilder:rbac:groups="",resources=events,verbs=create;watch;update;patch
    56  // +kubebuilder:rbac:groups=jobset.x-k8s.io,resources=jobsets,verbs=get;list;watch;update;patch
    57  // +kubebuilder:rbac:groups=jobset.x-k8s.io,resources=jobsets/status,verbs=get;update
    58  // +kubebuilder:rbac:groups=jobset.x-k8s.io,resources=jobsets/finalizers,verbs=get;update
    59  // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads,verbs=get;list;watch;create;update;patch;delete
    60  // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads/status,verbs=get;update;patch
    61  // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads/finalizers,verbs=update
    62  // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=resourceflavors,verbs=get;list;watch
    63  // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloadpriorityclasses,verbs=get;list;watch
    64  
// NewReconciler is the reconciler factory for this integration. It is built by
// the job framework's generic factory from a constructor that returns a fresh,
// empty *JobSet on every call.
var NewReconciler = jobframework.NewGenericReconcilerFactory(func() jobframework.GenericJob { return &JobSet{} })
    66  
    67  func isJobSet(owner *metav1.OwnerReference) bool {
    68  	return owner.Kind == "JobSet" && strings.HasPrefix(owner.APIVersion, "jobset.x-k8s.io/v1")
    69  }
    70  
// JobSet is a named type over jobsetapi.JobSet; the methods in this file that
// adapt a JobSet to the job framework interfaces are declared on it.
type JobSet jobsetapi.JobSet
    72  
// Compile-time checks that *JobSet satisfies the job framework interfaces.
var _ jobframework.GenericJob = (*JobSet)(nil)
var _ jobframework.JobWithReclaimablePods = (*JobSet)(nil)
    75  
    76  func fromObject(obj runtime.Object) *JobSet {
    77  	return (*JobSet)(obj.(*jobsetapi.JobSet))
    78  }
    79  
// Object returns the receiver converted back to *jobsetapi.JobSet, as a
// client.Object. No copy is made; the result points at the same data.
func (j *JobSet) Object() client.Object {
	return (*jobsetapi.JobSet)(j)
}
    83  
    84  func (j *JobSet) IsSuspended() bool {
    85  	return ptr.Deref(j.Spec.Suspend, false)
    86  }
    87  
    88  func (j *JobSet) IsActive() bool {
    89  	for i := range j.Status.ReplicatedJobsStatus {
    90  		if j.Status.ReplicatedJobsStatus[i].Active > 0 {
    91  			return true
    92  		}
    93  	}
    94  	return false
    95  }
    96  
// Suspend marks the JobSet as suspended by setting Spec.Suspend to true.
func (j *JobSet) Suspend() {
	j.Spec.Suspend = ptr.To(true)
}
   100  
// GVK returns the package-level GroupVersionKind of the JobSet kind.
func (j *JobSet) GVK() schema.GroupVersionKind {
	return gvk
}
   104  
   105  func (j *JobSet) PodSets() []kueue.PodSet {
   106  	podSets := make([]kueue.PodSet, len(j.Spec.ReplicatedJobs))
   107  	for index, replicatedJob := range j.Spec.ReplicatedJobs {
   108  		podSets[index] = kueue.PodSet{
   109  			Name:     replicatedJob.Name,
   110  			Template: *replicatedJob.Template.Spec.Template.DeepCopy(),
   111  			Count:    podsCount(&replicatedJob),
   112  		}
   113  	}
   114  	return podSets
   115  }
   116  
   117  func (j *JobSet) RunWithPodSetsInfo(podSetsInfo []podset.PodSetInfo) error {
   118  	j.Spec.Suspend = ptr.To(false)
   119  	if len(podSetsInfo) != len(j.Spec.ReplicatedJobs) {
   120  		return podset.BadPodSetsInfoLenError(len(j.Spec.ReplicatedJobs), len(podSetsInfo))
   121  	}
   122  
   123  	// If there are Jobs already created by the JobSet, their node selectors will be updated by the JobSet controller
   124  	// before unsuspending the individual Jobs.
   125  	for index := range j.Spec.ReplicatedJobs {
   126  		template := &j.Spec.ReplicatedJobs[index].Template.Spec.Template
   127  		info := podSetsInfo[index]
   128  		if err := podset.Merge(&template.ObjectMeta, &template.Spec, info); err != nil {
   129  			return nil
   130  		}
   131  	}
   132  	return nil
   133  }
   134  
   135  func (j *JobSet) RestorePodSetsInfo(podSetsInfo []podset.PodSetInfo) bool {
   136  	if len(podSetsInfo) == 0 {
   137  		return false
   138  	}
   139  	changed := false
   140  	for index := range j.Spec.ReplicatedJobs {
   141  		replica := &j.Spec.ReplicatedJobs[index].Template.Spec.Template
   142  		info := podSetsInfo[index]
   143  		changed = podset.RestorePodSpec(&replica.ObjectMeta, &replica.Spec, info) || changed
   144  	}
   145  	return changed
   146  }
   147  
   148  func (j *JobSet) Finished() (metav1.Condition, bool) {
   149  	if apimeta.IsStatusConditionTrue(j.Status.Conditions, string(jobsetapi.JobSetCompleted)) {
   150  		condition := metav1.Condition{
   151  			Type:    kueue.WorkloadFinished,
   152  			Status:  metav1.ConditionTrue,
   153  			Reason:  "JobSetFinished",
   154  			Message: "JobSet finished successfully",
   155  		}
   156  		return condition, true
   157  	}
   158  	if apimeta.IsStatusConditionTrue(j.Status.Conditions, string(jobsetapi.JobSetFailed)) {
   159  		condition := metav1.Condition{
   160  			Type:    kueue.WorkloadFinished,
   161  			Status:  metav1.ConditionTrue,
   162  			Reason:  "JobSetFinished",
   163  			Message: "JobSet failed",
   164  		}
   165  		return condition, true
   166  	}
   167  	return metav1.Condition{}, false
   168  }
   169  
   170  func (j *JobSet) PodsReady() bool {
   171  	var replicas int32
   172  	for _, replicatedJob := range j.Spec.ReplicatedJobs {
   173  		replicas += int32(replicatedJob.Replicas)
   174  	}
   175  	var readyReplicas int32
   176  	for _, replicatedJobStatus := range j.Status.ReplicatedJobsStatus {
   177  		readyReplicas += replicatedJobStatus.Ready + replicatedJobStatus.Succeeded
   178  	}
   179  	return replicas == readyReplicas
   180  }
   181  
   182  func (j *JobSet) ReclaimablePods() ([]kueue.ReclaimablePod, error) {
   183  	if len(j.Status.ReplicatedJobsStatus) == 0 {
   184  		return nil, nil
   185  	}
   186  
   187  	ret := make([]kueue.ReclaimablePod, 0, len(j.Spec.ReplicatedJobs))
   188  	statuses := slices.ToRefMap(j.Status.ReplicatedJobsStatus, func(js *jobsetapi.ReplicatedJobStatus) string { return js.Name })
   189  
   190  	for i := range j.Spec.ReplicatedJobs {
   191  		spec := &j.Spec.ReplicatedJobs[i]
   192  		if status, found := statuses[spec.Name]; found && status.Succeeded > 0 {
   193  			if status.Succeeded > 0 && status.Succeeded <= int32(spec.Replicas) {
   194  				ret = append(ret, kueue.ReclaimablePod{
   195  					Name:  spec.Name,
   196  					Count: status.Succeeded * podsCountPerReplica(spec),
   197  				})
   198  			}
   199  		}
   200  	}
   201  	return ret, nil
   202  }
   203  
   204  func podsCountPerReplica(rj *jobsetapi.ReplicatedJob) int32 {
   205  	spec := &rj.Template.Spec
   206  	// parallelism is always set as it is otherwise defaulted by k8s to 1
   207  	jobPodsCount := ptr.Deref(spec.Parallelism, 1)
   208  	if comp := ptr.Deref(spec.Completions, jobPodsCount); comp < jobPodsCount {
   209  		jobPodsCount = comp
   210  	}
   211  	return jobPodsCount
   212  }
   213  
   214  func podsCount(rj *jobsetapi.ReplicatedJob) int32 {
   215  	// The JobSet's operator validates that this will not overflow.
   216  	return int32(rj.Replicas) * podsCountPerReplica(rj)
   217  }
   218  
// SetupIndexes sets up the job framework's workload owner index for the JobSet
// GroupVersionKind on the given field indexer and returns the resulting error.
func SetupIndexes(ctx context.Context, indexer client.FieldIndexer) error {
	return jobframework.SetupWorkloadOwnerIndex(ctx, indexer, gvk)
}
   222  
// GetWorkloadNameForJobSet returns the workload name the job framework derives
// for an owner with the given JobSet name and the JobSet GroupVersionKind.
func GetWorkloadNameForJobSet(jobSetName string) string {
	return jobframework.GetWorkloadNameForOwnerWithGVK(jobSetName, gvk)
}