sigs.k8s.io/kueue@v0.6.2/pkg/controller/jobs/job/job_controller.go

/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package job

import (
	"context"
	"fmt"
	"strconv"

	batchv1 "k8s.io/api/batch/v1"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/types"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/client-go/util/workqueue"
	"k8s.io/klog/v2"
	"k8s.io/utils/ptr"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/builder"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/event"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"

	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/controller/jobframework"
	"sigs.k8s.io/kueue/pkg/podset"
)

var (
	parentWorkloadKey = ".metadata.parentWorkload"
	gvk               = batchv1.SchemeGroupVersion.WithKind("Job")

	FrameworkName = "batch/job"
)

const (
	JobMinParallelismAnnotation              = "kueue.x-k8s.io/job-min-parallelism"
	JobCompletionsEqualParallelismAnnotation = "kueue.x-k8s.io/job-completions-equal-parallelism"
)
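// Example (hypothetical manifest, for illustration): a batch/v1 Job opting into
// partial admission via the annotations above. With these set, Kueue may admit
// the Job with a parallelism between the declared minimum and spec.parallelism
// (see RunWithPodSetsInfo below), optionally keeping completions equal to the
// adjusted parallelism:
//
//	apiVersion: batch/v1
//	kind: Job
//	metadata:
//	  annotations:
//	    kueue.x-k8s.io/job-min-parallelism: "2"
//	    kueue.x-k8s.io/job-completions-equal-parallelism: "true"
//	spec:
//	  parallelism: 4
//	  completions: 4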
func init() {
	utilruntime.Must(jobframework.RegisterIntegration(FrameworkName, jobframework.IntegrationCallbacks{
		SetupIndexes:           SetupIndexes,
		NewReconciler:          NewReconciler,
		SetupWebhook:           SetupWebhook,
		JobType:                &batchv1.Job{},
		IsManagingObjectsOwner: isJob,
	}))
}

// +kubebuilder:rbac:groups=scheduling.k8s.io,resources=priorityclasses,verbs=list;get;watch
// +kubebuilder:rbac:groups="",resources=events,verbs=create;watch;update;patch
// +kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch;update;patch
// +kubebuilder:rbac:groups=batch,resources=jobs/status,verbs=get;update
// +kubebuilder:rbac:groups=batch,resources=jobs/finalizers,verbs=get;update;patch
// +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads/finalizers,verbs=update
// +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=resourceflavors,verbs=get;list;watch
// +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloadpriorityclasses,verbs=get;list;watch

var NewReconciler = jobframework.NewGenericReconcilerFactory(
	func() jobframework.GenericJob {
		return &Job{}
	}, func(b *builder.Builder, c client.Client) *builder.Builder {
		return b.Watches(&kueue.Workload{}, &parentWorkloadHandler{client: c})
	},
)

func isJob(owner *metav1.OwnerReference) bool {
	return owner.Kind == "Job" && owner.APIVersion == gvk.GroupVersion().String()
}

type parentWorkloadHandler struct {
	client client.Client
}

func (h *parentWorkloadHandler) Create(ctx context.Context, e event.CreateEvent, q workqueue.RateLimitingInterface) {
	h.queueReconcileForChildJob(ctx, e.Object, q)
}

func (h *parentWorkloadHandler) Update(ctx context.Context, e event.UpdateEvent, q workqueue.RateLimitingInterface) {
	h.queueReconcileForChildJob(ctx, e.ObjectNew, q)
}

func (h *parentWorkloadHandler) Delete(context.Context, event.DeleteEvent, workqueue.RateLimitingInterface) {
}

func (h *parentWorkloadHandler) Generic(ctx context.Context, e event.GenericEvent, q workqueue.RateLimitingInterface) {
}

// queueReconcileForChildJob queues reconciliation of the child jobs (jobs with the
// parent-workload annotation) in reaction to the parent-workload events.
func (h *parentWorkloadHandler) queueReconcileForChildJob(ctx context.Context, object client.Object, q workqueue.RateLimitingInterface) {
	w, ok := object.(*kueue.Workload)
	if !ok {
		return
	}
	log := ctrl.LoggerFrom(ctx).WithValues("workload", klog.KObj(w))
	ctx = ctrl.LoggerInto(ctx, log)
	log.V(5).Info("Queueing reconcile for child jobs")
	var childJobs batchv1.JobList
	// This lookup relies on the parentWorkloadKey field index registered in SetupIndexes below.
	if err := h.client.List(ctx, &childJobs, client.InNamespace(w.Namespace), client.MatchingFields{parentWorkloadKey: w.Name}); err != nil {
		klog.Error(err, "Unable to list child jobs")
		return
	}
	for _, childJob := range childJobs.Items {
		log.V(5).Info("Queueing reconcile for child job", "job", klog.KObj(&childJob))
		q.Add(reconcile.Request{
			NamespacedName: types.NamespacedName{
				Name:      childJob.Name,
				Namespace: w.Namespace,
			},
		})
	}
}

type Job batchv1.Job

var _ jobframework.GenericJob = (*Job)(nil)
var _ jobframework.JobWithReclaimablePods = (*Job)(nil)
var _ jobframework.JobWithCustomStop = (*Job)(nil)
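// Example (hypothetical, for illustration only): because Job is a defined type
// over batchv1.Job, a *batchv1.Job fetched from the cluster can be viewed
// through the jobframework interfaces with a zero-cost pointer conversion:
//
//	var kJob *batchv1.Job // assume obtained via the client
//	var gj jobframework.GenericJob = (*Job)(kJob)
//	suspended := gj.IsSuspended()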
func (j *Job) Object() client.Object {
	return (*batchv1.Job)(j)
}

func fromObject(o runtime.Object) *Job {
	return (*Job)(o.(*batchv1.Job))
}

func (j *Job) IsSuspended() bool {
	return j.Spec.Suspend != nil && *j.Spec.Suspend
}

func (j *Job) IsActive() bool {
	return j.Status.Active != 0
}

func (j *Job) Suspend() {
	j.Spec.Suspend = ptr.To(true)
}

func (j *Job) Stop(ctx context.Context, c client.Client, podSetsInfo []podset.PodSetInfo, _ jobframework.StopReason, eventMsg string) (bool, error) {
	stoppedNow := false
	if !j.IsSuspended() {
		j.Suspend()
		if err := c.Update(ctx, j.Object()); err != nil {
			return false, fmt.Errorf("suspend: %w", err)
		}
		stoppedNow = true
	}

	// Reset start time if necessary, so we can update the scheduling directives.
	if j.Status.StartTime != nil {
		j.Status.StartTime = nil
		if err := c.Status().Update(ctx, j.Object()); err != nil {
			return stoppedNow, fmt.Errorf("reset status: %w", err)
		}
	}

	if changed := j.RestorePodSetsInfo(podSetsInfo); !changed {
		return stoppedNow, nil
	}
	if err := c.Update(ctx, j.Object()); err != nil {
		return false, fmt.Errorf("restore info: %w", err)
	}
	return stoppedNow, nil
}

func (j *Job) GVK() schema.GroupVersionKind {
	return gvk
}

func (j *Job) ReclaimablePods() ([]kueue.ReclaimablePod, error) {
	parallelism := ptr.Deref(j.Spec.Parallelism, 1)
	if parallelism == 1 || j.Status.Succeeded == 0 {
		return nil, nil
	}

	remaining := ptr.Deref(j.Spec.Completions, parallelism) - j.Status.Succeeded
	if remaining >= parallelism {
		return nil, nil
	}

	return []kueue.ReclaimablePod{{
		Name:  kueue.DefaultPodSetName,
		Count: parallelism - remaining,
	}}, nil
}
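// Worked example (illustrative numbers): with parallelism=4, completions=10 and
// 8 succeeded pods, remaining = 10-8 = 2 < 4, so parallelism-remaining = 2 pods
// of the default pod set are reported as reclaimable and their quota can be
// released while the job finishes.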
// The following labels are managed internally by the batch/job controller; we
// should not propagate them to the workload.
var (
	// the legacy names are no longer defined in the api, only in k/k/pkg/apis/batch
	legacyJobNameLabel       = "job-name"
	legacyControllerUidLabel = "controller-uid"
	ManagedLabels            = []string{legacyJobNameLabel, legacyControllerUidLabel, batchv1.JobNameLabel, batchv1.ControllerUidLabel}
)

func cleanManagedLabels(pt *corev1.PodTemplateSpec) *corev1.PodTemplateSpec {
	for _, managedLabel := range ManagedLabels {
		delete(pt.Labels, managedLabel)
	}
	return pt
}

func (j *Job) PodSets() []kueue.PodSet {
	return []kueue.PodSet{
		{
			Name:     kueue.DefaultPodSetName,
			Template: *cleanManagedLabels(j.Spec.Template.DeepCopy()),
			Count:    j.podsCount(),
			MinCount: j.minPodsCount(),
		},
	}
}

func (j *Job) RunWithPodSetsInfo(podSetsInfo []podset.PodSetInfo) error {
	j.Spec.Suspend = ptr.To(false)
	if len(podSetsInfo) != 1 {
		return podset.BadPodSetsInfoLenError(1, len(podSetsInfo))
	}

	info := podSetsInfo[0]

	if j.minPodsCount() != nil {
		j.Spec.Parallelism = ptr.To(info.Count)
		if j.syncCompletionWithParallelism() {
			j.Spec.Completions = j.Spec.Parallelism
		}
	}
	return podset.Merge(&j.Spec.Template.ObjectMeta, &j.Spec.Template.Spec, info)
}

func (j *Job) RestorePodSetsInfo(podSetsInfo []podset.PodSetInfo) bool {
	if len(podSetsInfo) == 0 {
		return false
	}

	changed := false
	// if the job accepts partial admission
	if j.minPodsCount() != nil && ptr.Deref(j.Spec.Parallelism, 0) != podSetsInfo[0].Count {
		changed = true
		j.Spec.Parallelism = ptr.To(podSetsInfo[0].Count)
		if j.syncCompletionWithParallelism() {
			j.Spec.Completions = j.Spec.Parallelism
		}
	}
	info := podSetsInfo[0]
	for _, managedLabel := range ManagedLabels {
		if v, found := j.Spec.Template.Labels[managedLabel]; found {
			info.AddOrUpdateLabel(managedLabel, v)
		}
	}
	changed = podset.RestorePodSpec(&j.Spec.Template.ObjectMeta, &j.Spec.Template.Spec, info) || changed
	return changed
}

func (j *Job) Finished() (metav1.Condition, bool) {
	var conditionType batchv1.JobConditionType
	var finished bool

	for _, c := range j.Status.Conditions {
		if (c.Type == batchv1.JobComplete || c.Type == batchv1.JobFailed) && c.Status == corev1.ConditionTrue {
			conditionType = c.Type
			finished = true
			break
		}
	}

	condition := metav1.Condition{
		Type:    kueue.WorkloadFinished,
		Status:  metav1.ConditionTrue,
		Reason:  "JobFinished",
		Message: "Job finished successfully",
	}
	if conditionType == batchv1.JobFailed {
		condition.Message = "Job failed"
	}

	return condition, finished
}

func (j *Job) PodsReady() bool {
	ready := ptr.Deref(j.Status.Ready, 0)
	return j.Status.Succeeded+ready >= j.podsCount()
}

func (j *Job) podsCount() int32 {
	// parallelism is always set as it is otherwise defaulted by k8s to 1
	podsCount := *(j.Spec.Parallelism)
	if j.Spec.Completions != nil && *j.Spec.Completions < podsCount {
		podsCount = *j.Spec.Completions
	}
	return podsCount
}

func (j *Job) minPodsCount() *int32 {
	if strVal, found := j.GetAnnotations()[JobMinParallelismAnnotation]; found {
		if iVal, err := strconv.Atoi(strVal); err == nil {
			return ptr.To[int32](int32(iVal))
		}
	}
	return nil
}

func (j *Job) syncCompletionWithParallelism() bool {
	if strVal, found := j.GetAnnotations()[JobCompletionsEqualParallelismAnnotation]; found {
		if bVal, err := strconv.ParseBool(strVal); err == nil {
			return bVal
		}
	}
	return false
}

func SetupIndexes(ctx context.Context, indexer client.FieldIndexer) error {
	if err := indexer.IndexField(ctx, &batchv1.Job{}, parentWorkloadKey, func(o client.Object) []string {
		job := fromObject(o)
		if pwName := jobframework.ParentWorkloadName(job); pwName != "" {
			return []string{pwName}
		}
		return nil
	}); err != nil {
		return err
	}
	return jobframework.SetupWorkloadOwnerIndex(ctx, indexer, gvk)
}

func GetWorkloadNameForJob(jobName string) string {
	return jobframework.GetWorkloadNameForOwnerWithGVK(jobName, gvk)
}
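// Example wiring (hypothetical sketch; mgr is assumed to be a controller-runtime
// manager.Manager, and the reconciler itself is registered through the
// jobframework integration machinery above):
//
//	if err := SetupIndexes(ctx, mgr.GetFieldIndexer()); err != nil {
//		return err
//	}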