sigs.k8s.io/kueue@v0.6.2/pkg/controller/jobframework/reconciler.go (about) 1 /* 2 Copyright 2023 The Kubernetes Authors. 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 http://www.apache.org/licenses/LICENSE-2.0 7 Unless required by applicable law or agreed to in writing, software 8 distributed under the License is distributed on an "AS IS" BASIS, 9 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 See the License for the specific language governing permissions and 11 limitations under the License. 12 */ 13 14 package jobframework 15 16 import ( 17 "context" 18 "errors" 19 "fmt" 20 21 "github.com/go-logr/logr" 22 corev1 "k8s.io/api/core/v1" 23 apierrors "k8s.io/apimachinery/pkg/api/errors" 24 apimeta "k8s.io/apimachinery/pkg/api/meta" 25 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 26 "k8s.io/apimachinery/pkg/types" 27 "k8s.io/apimachinery/pkg/util/sets" 28 "k8s.io/apimachinery/pkg/util/validation" 29 "k8s.io/client-go/tools/record" 30 "k8s.io/klog/v2" 31 "k8s.io/utils/ptr" 32 ctrl "sigs.k8s.io/controller-runtime" 33 "sigs.k8s.io/controller-runtime/pkg/builder" 34 "sigs.k8s.io/controller-runtime/pkg/client" 35 36 configapi "sigs.k8s.io/kueue/apis/config/v1beta1" 37 kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1" 38 "sigs.k8s.io/kueue/pkg/constants" 39 controllerconsts "sigs.k8s.io/kueue/pkg/controller/constants" 40 "sigs.k8s.io/kueue/pkg/features" 41 "sigs.k8s.io/kueue/pkg/podset" 42 "sigs.k8s.io/kueue/pkg/util/equality" 43 "sigs.k8s.io/kueue/pkg/util/kubeversion" 44 "sigs.k8s.io/kueue/pkg/util/maps" 45 utilpriority "sigs.k8s.io/kueue/pkg/util/priority" 46 "sigs.k8s.io/kueue/pkg/util/slices" 47 "sigs.k8s.io/kueue/pkg/workload" 48 ) 49 50 const ( 51 FailedToStartFinishedReason = "FailedToStart" 52 ) 53 54 var ( 55 ErrChildJobOwnerNotFound = fmt.Errorf("owner isn't set even though %s annotation is set", controllerconsts.ParentWorkloadAnnotation) 56 ErrUnknownWorkloadOwner = errors.New("workload owner is unknown") 57 ErrWorkloadOwnerNotFound = errors.New("workload owner not found") 58 ErrNoMatchingWorkloads = errors.New("no matching workloads") 59 ErrExtraWorkloads = errors.New("extra workloads") 60 ) 61 62 // JobReconciler reconciles a GenericJob object 63 type JobReconciler struct { 64 client client.Client 65 record record.EventRecorder 66 manageJobsWithoutQueueName bool 67 waitForPodsReady bool 68 } 69 70 type Options struct { 71 ManageJobsWithoutQueueName bool 72 WaitForPodsReady bool 73 KubeServerVersion *kubeversion.ServerVersionFetcher 74 // IntegrationOptions key is "$GROUP/$VERSION, Kind=$KIND". 75 IntegrationOptions map[string]any 76 EnabledFrameworks sets.Set[string] 77 ManagerName string 78 } 79 80 // Option configures the reconciler. 81 type Option func(*Options) 82 83 func ProcessOptions(opts ...Option) Options { 84 options := defaultOptions 85 for _, opt := range opts { 86 opt(&options) 87 } 88 return options 89 } 90 91 // WithManageJobsWithoutQueueName indicates if the controller should reconcile 92 // jobs that don't set the queue name annotation. 93 func WithManageJobsWithoutQueueName(f bool) Option { 94 return func(o *Options) { 95 o.ManageJobsWithoutQueueName = f 96 } 97 } 98 99 // WithWaitForPodsReady indicates if the controller should add the PodsReady 100 // condition to the workload when the corresponding job has all pods ready 101 // or succeeded. 102 func WithWaitForPodsReady(w *configapi.WaitForPodsReady) Option { 103 return func(o *Options) { 104 o.WaitForPodsReady = w != nil && w.Enable 105 } 106 } 107 108 func WithKubeServerVersion(v *kubeversion.ServerVersionFetcher) Option { 109 return func(o *Options) { 110 o.KubeServerVersion = v 111 } 112 } 113 114 // WithIntegrationOptions adds integrations options like podOptions. 115 // The second arg, `opts` should be recognized as any option struct. 116 func WithIntegrationOptions(integrationName string, opts any) Option { 117 return func(o *Options) { 118 if len(o.IntegrationOptions) == 0 { 119 o.IntegrationOptions = make(map[string]any) 120 } 121 o.IntegrationOptions[integrationName] = opts 122 } 123 } 124 125 // WithEnabledFrameworks adds framework names enabled in the ConfigAPI. 126 func WithEnabledFrameworks(i *configapi.Integrations) Option { 127 return func(o *Options) { 128 if i == nil || len(i.Frameworks) == 0 { 129 return 130 } 131 o.EnabledFrameworks = sets.New(i.Frameworks...) 132 } 133 } 134 135 // WithManagerName adds the kueue's manager name. 136 func WithManagerName(n string) Option { 137 return func(o *Options) { 138 o.ManagerName = n 139 } 140 } 141 142 var defaultOptions = Options{} 143 144 func NewReconciler( 145 client client.Client, 146 record record.EventRecorder, 147 opts ...Option) *JobReconciler { 148 options := ProcessOptions(opts...) 149 150 return &JobReconciler{ 151 client: client, 152 record: record, 153 manageJobsWithoutQueueName: options.ManageJobsWithoutQueueName, 154 waitForPodsReady: options.WaitForPodsReady, 155 } 156 } 157 158 func (r *JobReconciler) ReconcileGenericJob(ctx context.Context, req ctrl.Request, job GenericJob) (result ctrl.Result, err error) { 159 object := job.Object() 160 log := ctrl.LoggerFrom(ctx).WithValues("job", req.String(), "gvk", job.GVK()) 161 ctx = ctrl.LoggerInto(ctx, log) 162 163 defer func() { 164 err = r.ignoreUnretryableError(log, err) 165 }() 166 167 dropFinalizers := false 168 if cJob, isComposable := job.(ComposableJob); isComposable { 169 dropFinalizers, err = cJob.Load(ctx, r.client, &req.NamespacedName) 170 } else { 171 err = r.client.Get(ctx, req.NamespacedName, object) 172 dropFinalizers = apierrors.IsNotFound(err) || !object.GetDeletionTimestamp().IsZero() 173 } 174 175 if jws, implements := job.(JobWithSkip); implements { 176 if jws.Skip() { 177 return ctrl.Result{}, nil 178 } 179 } 180 181 if dropFinalizers { 182 // Remove workload finalizer 183 workloads := &kueue.WorkloadList{} 184 185 if cJob, isComposable := job.(ComposableJob); isComposable { 186 var err error 187 workloads, err = cJob.ListChildWorkloads(ctx, r.client, req.NamespacedName) 188 if err != nil { 189 log.Error(err, "Removing finalizer") 190 return ctrl.Result{}, err 191 } 192 } else { 193 if err := r.client.List(ctx, workloads, client.InNamespace(req.Namespace), 194 client.MatchingFields{GetOwnerKey(job.GVK()): req.Name}); err != nil { 195 log.Error(err, "Unable to list child workloads") 196 return ctrl.Result{}, err 197 } 198 } 199 for i := range workloads.Items { 200 err := workload.RemoveFinalizer(ctx, r.client, &workloads.Items[i]) 201 if client.IgnoreNotFound(err) != nil { 202 log.Error(err, "Removing finalizer") 203 return ctrl.Result{}, err 204 } 205 } 206 207 // Remove job finalizer 208 if !object.GetDeletionTimestamp().IsZero() { 209 if err = r.finalizeJob(ctx, job); err != nil { 210 return ctrl.Result{}, err 211 } 212 } 213 return ctrl.Result{}, nil 214 } 215 216 if err != nil { 217 return ctrl.Result{}, client.IgnoreNotFound(err) 218 } 219 220 isStandaloneJob := ParentWorkloadName(job) == "" 221 222 // when manageJobsWithoutQueueName is disabled we only reconcile jobs that have either 223 // queue-name or the parent-workload annotation set. 224 // If the parent-workload annotation is set, it also checks whether the parent job has queue-name label. 225 if !r.manageJobsWithoutQueueName && QueueName(job) == "" { 226 if isStandaloneJob { 227 log.V(3).Info("Neither queue-name label, nor parent-workload annotation is set, ignoring the job", 228 "queueName", QueueName(job), "parentWorkload", ParentWorkloadName(job)) 229 return ctrl.Result{}, nil 230 } 231 isParentJobManaged, err := r.IsParentJobManaged(ctx, job.Object(), req.Namespace) 232 if err != nil { 233 log.Error(err, "couldn't check whether the parent job is managed by kueue") 234 return ctrl.Result{}, err 235 } 236 if !isParentJobManaged { 237 log.V(3).Info("parent-workload annotation is set, and the parent job doesn't have a queue-name label, ignoring the job", 238 "parentWorkload", ParentWorkloadName(job)) 239 return ctrl.Result{}, nil 240 } 241 } 242 243 // if this is a non-standalone job, suspend the job if its parent workload is not found or not admitted. 244 if !isStandaloneJob { 245 _, finished := job.Finished() 246 if !finished && !job.IsSuspended() { 247 if parentWorkload, err := r.getParentWorkload(ctx, job, object); err != nil { 248 log.Error(err, "couldn't get the parent job workload") 249 return ctrl.Result{}, err 250 } else if parentWorkload == nil || !workload.IsAdmitted(parentWorkload) { 251 // suspend it 252 job.Suspend() 253 if err := r.client.Update(ctx, object); err != nil { 254 log.Error(err, "suspending child job failed") 255 return ctrl.Result{}, err 256 } 257 r.record.Event(object, corev1.EventTypeNormal, ReasonSuspended, "Kueue managed child job suspended") 258 } 259 } 260 return ctrl.Result{}, nil 261 } 262 263 log.V(2).Info("Reconciling Job") 264 265 // 1. make sure there is only a single existing instance of the workload. 266 // If there's no workload exists and job is unsuspended, we'll stop it immediately. 267 wl, err := r.ensureOneWorkload(ctx, job, object) 268 if err != nil { 269 return ctrl.Result{}, err 270 } 271 272 if wl != nil && apimeta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadFinished) { 273 if err := r.finalizeJob(ctx, job); err != nil { 274 return ctrl.Result{}, err 275 } 276 277 r.record.Eventf(object, corev1.EventTypeNormal, ReasonFinishedWorkload, 278 "Workload '%s' is declared finished", workload.Key(wl)) 279 return ctrl.Result{}, workload.RemoveFinalizer(ctx, r.client, wl) 280 } 281 282 // 1.1 If the workload is pending deletion, suspend the job if needed 283 // and drop the finalizer. 284 if wl != nil && !wl.DeletionTimestamp.IsZero() { 285 log.V(2).Info("The workload is marked for deletion") 286 err := r.stopJob(ctx, job, wl, StopReasonWorkloadDeleted, "Workload is deleted") 287 if err != nil { 288 log.Error(err, "Suspending job with deleted workload") 289 } 290 291 if err == nil && wl != nil { 292 err = workload.RemoveFinalizer(ctx, r.client, wl) 293 } 294 return ctrl.Result{}, err 295 } 296 297 // 2. handle job is finished. 298 if condition, finished := job.Finished(); finished { 299 if wl != nil && !apimeta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadFinished) { 300 err := workload.UpdateStatus(ctx, r.client, wl, condition.Type, condition.Status, condition.Reason, condition.Message, constants.JobControllerName) 301 if err != nil && !apierrors.IsNotFound(err) { 302 return ctrl.Result{}, err 303 } 304 r.record.Eventf(object, corev1.EventTypeNormal, ReasonFinishedWorkload, 305 "Workload '%s' is declared finished", workload.Key(wl)) 306 } 307 308 // Execute job finalization logic 309 if err := r.finalizeJob(ctx, job); err != nil { 310 return ctrl.Result{}, err 311 } 312 313 return ctrl.Result{}, nil 314 } 315 316 // 3. handle workload is nil. 317 if wl == nil { 318 err := r.handleJobWithNoWorkload(ctx, job, object) 319 if err != nil { 320 if IsUnretryableError(err) { 321 log.V(3).Info("Handling job with no workload", "unretryableError", err) 322 } else { 323 log.Error(err, "Handling job with no workload") 324 } 325 } 326 return ctrl.Result{}, err 327 } 328 329 // 4. update reclaimable counts if implemented by the job 330 if jobRecl, implementsReclaimable := job.(JobWithReclaimablePods); implementsReclaimable { 331 reclPods, err := jobRecl.ReclaimablePods() 332 if err != nil { 333 log.Error(err, "Getting reclaimable pods") 334 return ctrl.Result{}, err 335 } 336 337 if !workload.ReclaimablePodsAreEqual(reclPods, wl.Status.ReclaimablePods) { 338 err = workload.UpdateReclaimablePods(ctx, r.client, wl, reclPods) 339 if err != nil { 340 log.Error(err, "Updating reclaimable pods") 341 return ctrl.Result{}, err 342 } 343 return ctrl.Result{}, nil 344 } 345 } 346 347 // 5. handle WaitForPodsReady only for a standalone job. 348 // handle a job when waitForPodsReady is enabled, and it is the main job 349 if r.waitForPodsReady { 350 log.V(5).Info("Handling a job when waitForPodsReady is enabled") 351 condition := generatePodsReadyCondition(job, wl) 352 // optimization to avoid sending the update request if the status didn't change 353 if !apimeta.IsStatusConditionPresentAndEqual(wl.Status.Conditions, condition.Type, condition.Status) { 354 log.V(3).Info(fmt.Sprintf("Updating the PodsReady condition with status: %v", condition.Status)) 355 apimeta.SetStatusCondition(&wl.Status.Conditions, condition) 356 err := workload.UpdateStatus(ctx, r.client, wl, condition.Type, condition.Status, condition.Reason, condition.Message, constants.JobControllerName) 357 if err != nil { 358 log.Error(err, "Updating workload status") 359 } 360 } 361 } 362 363 // 6. handle eviction 364 if evCond := apimeta.FindStatusCondition(wl.Status.Conditions, kueue.WorkloadEvicted); evCond != nil && evCond.Status == metav1.ConditionTrue { 365 if err := r.stopJob(ctx, job, wl, StopReasonWorkloadEvicted, evCond.Message); err != nil { 366 return ctrl.Result{}, err 367 } 368 if workload.HasQuotaReservation(wl) { 369 if !job.IsActive() { 370 log.V(6).Info("The job is no longer active, clear the workloads admission") 371 _ = workload.UnsetQuotaReservationWithCondition(wl, "Pending", evCond.Message) 372 err := workload.ApplyAdmissionStatus(ctx, r.client, wl, true) 373 if err != nil { 374 return ctrl.Result{}, fmt.Errorf("clearing admission: %w", err) 375 } 376 } 377 } 378 return ctrl.Result{}, nil 379 } 380 381 // 7. handle job is suspended. 382 if job.IsSuspended() { 383 // start the job if the workload has been admitted, and the job is still suspended 384 if workload.IsAdmitted(wl) { 385 log.V(2).Info("Job admitted, unsuspending") 386 err := r.startJob(ctx, job, object, wl) 387 if err != nil { 388 log.Error(err, "Unsuspending job") 389 if podset.IsPermanent(err) { 390 // Mark the workload as finished with failure since the is no point to retry. 391 errUpdateStatus := workload.UpdateStatus(ctx, r.client, wl, kueue.WorkloadFinished, metav1.ConditionTrue, FailedToStartFinishedReason, err.Error(), constants.JobControllerName) 392 if errUpdateStatus != nil { 393 log.Error(errUpdateStatus, "Updating workload status, on start failure %s", err.Error()) 394 } 395 return ctrl.Result{}, errUpdateStatus 396 } 397 } 398 return ctrl.Result{}, err 399 } 400 401 // update queue name if changed. 402 q := QueueName(job) 403 if wl.Spec.QueueName != q { 404 log.V(2).Info("Job changed queues, updating workload") 405 wl.Spec.QueueName = q 406 err := r.client.Update(ctx, wl) 407 if err != nil { 408 log.Error(err, "Updating workload queue") 409 } 410 return ctrl.Result{}, err 411 } 412 log.V(3).Info("Job is suspended and workload not yet admitted by a clusterQueue, nothing to do") 413 return ctrl.Result{}, nil 414 } 415 416 // 8. handle workload is deactivated. 417 if !ptr.Deref(wl.Spec.Active, true) { 418 workload.SetEvictedCondition(wl, kueue.WorkloadEvictedByDeactivation, "The workload is deactivated") 419 err := workload.ApplyAdmissionStatus(ctx, r.client, wl, true) 420 if err != nil { 421 return ctrl.Result{}, fmt.Errorf("setting eviction: %w", err) 422 } 423 return ctrl.Result{}, nil 424 } 425 426 // 9. handle job is unsuspended. 427 if !workload.IsAdmitted(wl) { 428 // the job must be suspended if the workload is not yet admitted. 429 log.V(2).Info("Running job is not admitted by a cluster queue, suspending") 430 err := r.stopJob(ctx, job, wl, StopReasonNotAdmitted, "Not admitted by cluster queue") 431 if err != nil { 432 log.Error(err, "Suspending job with non admitted workload") 433 } 434 return ctrl.Result{}, err 435 } 436 437 // workload is admitted and job is running, nothing to do. 438 log.V(3).Info("Job running with admitted workload, nothing to do") 439 return ctrl.Result{}, nil 440 } 441 442 // IsParentJobManaged checks whether the parent job is managed by kueue. 443 func (r *JobReconciler) IsParentJobManaged(ctx context.Context, jobObj client.Object, namespace string) (bool, error) { 444 owner := metav1.GetControllerOf(jobObj) 445 if owner == nil { 446 return false, ErrChildJobOwnerNotFound 447 } 448 parentJob := GetEmptyOwnerObject(owner) 449 if parentJob == nil { 450 return false, fmt.Errorf("workload owner %v: %w", owner, ErrUnknownWorkloadOwner) 451 } 452 if err := r.client.Get(ctx, client.ObjectKey{Name: owner.Name, Namespace: namespace}, parentJob); err != nil { 453 return false, errors.Join(ErrWorkloadOwnerNotFound, err) 454 } 455 return QueueNameForObject(parentJob) != "", nil 456 } 457 458 func (r *JobReconciler) getParentWorkload(ctx context.Context, job GenericJob, object client.Object) (*kueue.Workload, error) { 459 pw := kueue.Workload{} 460 namespacedName := types.NamespacedName{ 461 Name: ParentWorkloadName(job), 462 Namespace: object.GetNamespace(), 463 } 464 if err := r.client.Get(ctx, namespacedName, &pw); err != nil { 465 return nil, client.IgnoreNotFound(err) 466 } else { 467 return &pw, nil 468 } 469 } 470 471 // ensureOneWorkload will query for the single matched workload corresponding to job and return it. 472 // If there are more than one workload, we should delete the excess ones. 473 // The returned workload could be nil. 474 func (r *JobReconciler) ensureOneWorkload(ctx context.Context, job GenericJob, object client.Object) (*kueue.Workload, error) { 475 log := ctrl.LoggerFrom(ctx) 476 477 if prebuiltWorkloadName, usePrebuiltWorkload := PrebuiltWorkloadFor(job); usePrebuiltWorkload { 478 wl := &kueue.Workload{} 479 err := r.client.Get(ctx, types.NamespacedName{Name: prebuiltWorkloadName, Namespace: object.GetNamespace()}, wl) 480 if err != nil { 481 return nil, client.IgnoreNotFound(err) 482 } 483 484 if owns, err := r.ensurePrebuiltWorkloadOwnership(ctx, wl, object); !owns || err != nil { 485 return nil, err 486 } 487 488 if inSync, err := r.ensurePrebuiltWorkloadInSync(ctx, wl, job); !inSync || err != nil { 489 return nil, err 490 } 491 return wl, nil 492 } 493 494 // Find a matching workload first if there is one. 495 var toDelete []*kueue.Workload 496 var match *kueue.Workload 497 if cj, implements := job.(ComposableJob); implements { 498 var err error 499 match, toDelete, err = cj.FindMatchingWorkloads(ctx, r.client, r.record) 500 if err != nil { 501 log.Error(err, "Composable job is unable to find matching workloads") 502 return nil, err 503 } 504 } else { 505 var err error 506 match, toDelete, err = FindMatchingWorkloads(ctx, r.client, job) 507 if err != nil { 508 log.Error(err, "Unable to list child workloads") 509 return nil, err 510 } 511 } 512 513 var toUpdate *kueue.Workload 514 if match == nil && len(toDelete) > 0 && job.IsSuspended() && !workload.HasQuotaReservation(toDelete[0]) { 515 toUpdate = toDelete[0] 516 toDelete = toDelete[1:] 517 } 518 519 // If there is no matching workload and the job is running, suspend it. 520 if match == nil && !job.IsSuspended() { 521 log.V(2).Info("job with no matching workload, suspending") 522 var w *kueue.Workload 523 if len(toDelete) == 1 { 524 // The job may have been modified and hence the existing workload 525 // doesn't match the job anymore. All bets are off if there are more 526 // than one workload... 527 w = toDelete[0] 528 } 529 530 if _, finished := job.Finished(); !finished { 531 var msg string 532 if w == nil { 533 msg = "Missing Workload; unable to restore pod templates" 534 } else { 535 msg = "No matching Workload; restoring pod templates according to existent Workload" 536 } 537 if err := r.stopJob(ctx, job, w, StopReasonNoMatchingWorkload, msg); err != nil { 538 return nil, fmt.Errorf("stopping job with no matching workload: %w", err) 539 } 540 } 541 } 542 543 // Delete duplicate workload instances. 544 existedWls := 0 545 for _, wl := range toDelete { 546 wlKey := workload.Key(wl) 547 err := workload.RemoveFinalizer(ctx, r.client, wl) 548 if err != nil && !apierrors.IsNotFound(err) { 549 return nil, fmt.Errorf("failed to remove workload finalizer for: %w ", err) 550 } 551 552 err = r.client.Delete(ctx, wl) 553 if err != nil && !apierrors.IsNotFound(err) { 554 return nil, fmt.Errorf("deleting not matching workload: %w", err) 555 } 556 if err == nil { 557 existedWls++ 558 r.record.Eventf(object, corev1.EventTypeNormal, ReasonDeletedWorkload, 559 "Deleted not matching Workload: %v", wlKey) 560 } 561 } 562 563 if existedWls != 0 { 564 if match == nil { 565 return nil, fmt.Errorf("%w: deleted %d workloads", ErrNoMatchingWorkloads, len(toDelete)) 566 } 567 return nil, fmt.Errorf("%w: deleted %d workloads", ErrExtraWorkloads, len(toDelete)) 568 } 569 570 if toUpdate != nil { 571 return r.updateWorkloadToMatchJob(ctx, job, object, toUpdate) 572 } 573 574 return match, nil 575 } 576 577 func FindMatchingWorkloads(ctx context.Context, c client.Client, job GenericJob) (match *kueue.Workload, toDelete []*kueue.Workload, err error) { 578 object := job.Object() 579 580 workloads := &kueue.WorkloadList{} 581 if err := c.List(ctx, workloads, client.InNamespace(object.GetNamespace()), 582 client.MatchingFields{GetOwnerKey(job.GVK()): object.GetName()}); err != nil { 583 return nil, nil, err 584 } 585 586 for i := range workloads.Items { 587 w := &workloads.Items[i] 588 if match == nil && equivalentToWorkload(ctx, c, job, w) { 589 match = w 590 } else { 591 toDelete = append(toDelete, w) 592 } 593 } 594 595 return match, toDelete, nil 596 } 597 598 func (r *JobReconciler) ensurePrebuiltWorkloadOwnership(ctx context.Context, wl *kueue.Workload, object client.Object) (bool, error) { 599 if !metav1.IsControlledBy(wl, object) { 600 if err := ctrl.SetControllerReference(object, wl, r.client.Scheme()); err != nil { 601 // don't return an error here, since a retry cannot give a different result, 602 // log the error. 603 log := ctrl.LoggerFrom(ctx) 604 log.Error(err, "Cannot take ownership of the workload") 605 return false, nil 606 } 607 608 if errs := validation.IsValidLabelValue(string(object.GetUID())); len(errs) == 0 { 609 wl.Labels = maps.MergeKeepFirst(map[string]string{controllerconsts.JobUIDLabel: string(object.GetUID())}, wl.Labels) 610 } 611 612 if err := r.client.Update(ctx, wl); err != nil { 613 return false, err 614 } 615 } 616 return true, nil 617 } 618 619 func (r *JobReconciler) ensurePrebuiltWorkloadInSync(ctx context.Context, wl *kueue.Workload, job GenericJob) (bool, error) { 620 if !equivalentToWorkload(ctx, r.client, job, wl) { 621 // mark the workload as finished 622 err := workload.UpdateStatus(ctx, r.client, wl, 623 kueue.WorkloadFinished, 624 metav1.ConditionTrue, 625 "OutOfSync", 626 "The prebuilt workload is out of sync with its user job", 627 constants.JobControllerName) 628 return false, err 629 } 630 return true, nil 631 } 632 633 // expectedRunningPodSets gets the expected podsets during the job execution, returns nil if the workload has no reservation or 634 // the admission does not match. 635 func expectedRunningPodSets(ctx context.Context, c client.Client, wl *kueue.Workload) []kueue.PodSet { 636 if !workload.HasQuotaReservation(wl) { 637 return nil 638 } 639 info, err := getPodSetsInfoFromStatus(ctx, c, wl) 640 if err != nil { 641 return nil 642 } 643 infoMap := slices.ToRefMap(info, func(psi *podset.PodSetInfo) string { return psi.Name }) 644 runningPodSets := wl.Spec.DeepCopy().PodSets 645 canBePartiallyAdmitted := workload.CanBePartiallyAdmitted(wl) 646 for i := range runningPodSets { 647 ps := &runningPodSets[i] 648 psi, found := infoMap[ps.Name] 649 if !found { 650 return nil 651 } 652 err := podset.Merge(&ps.Template.ObjectMeta, &ps.Template.Spec, *psi) 653 if err != nil { 654 return nil 655 } 656 if canBePartiallyAdmitted && ps.MinCount != nil { 657 // update the expected running count 658 ps.Count = psi.Count 659 } 660 } 661 return runningPodSets 662 } 663 664 // equivalentToWorkload checks if the job corresponds to the workload 665 func equivalentToWorkload(ctx context.Context, c client.Client, job GenericJob, wl *kueue.Workload) bool { 666 owner := metav1.GetControllerOf(wl) 667 // Indexes don't work in unit tests, so we explicitly check for the 668 // owner here. 669 if owner.Name != job.Object().GetName() { 670 return false 671 } 672 673 jobPodSets := clearMinCountsIfFeatureDisabled(job.PodSets()) 674 675 if runningPodSets := expectedRunningPodSets(ctx, c, wl); runningPodSets != nil { 676 if equality.ComparePodSetSlices(jobPodSets, runningPodSets) { 677 return true 678 } 679 // If the workload is admitted but the job is suspended, do the check 680 // against the non-running info. 681 // This might allow some violating jobs to pass equivalency checks, but their 682 // workloads would be invalidated in the next sync after unsuspending. 683 return job.IsSuspended() && equality.ComparePodSetSlices(jobPodSets, wl.Spec.PodSets) 684 } 685 686 return equality.ComparePodSetSlices(jobPodSets, wl.Spec.PodSets) 687 } 688 689 func (r *JobReconciler) updateWorkloadToMatchJob(ctx context.Context, job GenericJob, object client.Object, wl *kueue.Workload) (*kueue.Workload, error) { 690 newWl, err := r.constructWorkload(ctx, job, object) 691 if err != nil { 692 return nil, fmt.Errorf("can't construct workload for update: %w", err) 693 } 694 err = r.prepareWorkload(ctx, job, newWl) 695 if err != nil { 696 return nil, fmt.Errorf("can't construct workload for update: %w", err) 697 } 698 wl.Spec = newWl.Spec 699 if err = r.client.Update(ctx, wl); err != nil { 700 return nil, fmt.Errorf("updating existed workload: %w", err) 701 } 702 703 r.record.Eventf(object, corev1.EventTypeNormal, ReasonUpdatedWorkload, 704 "Updated not matching Workload for suspended job: %v", klog.KObj(wl)) 705 return newWl, nil 706 } 707 708 // startJob will unsuspend the job, and also inject the node affinity. 709 func (r *JobReconciler) startJob(ctx context.Context, job GenericJob, object client.Object, wl *kueue.Workload) error { 710 info, err := getPodSetsInfoFromStatus(ctx, r.client, wl) 711 if err != nil { 712 return err 713 } 714 msg := fmt.Sprintf("Admitted by clusterQueue %v", wl.Status.Admission.ClusterQueue) 715 716 if cj, implements := job.(ComposableJob); implements { 717 if err := cj.Run(ctx, r.client, info, r.record, msg); err != nil { 718 return err 719 } 720 } else { 721 if runErr := job.RunWithPodSetsInfo(info); runErr != nil { 722 return runErr 723 } 724 725 if err := r.client.Update(ctx, object); err != nil { 726 return err 727 } 728 r.record.Event(object, corev1.EventTypeNormal, ReasonStarted, msg) 729 } 730 731 return nil 732 } 733 734 // stopJob will suspend the job, and also restore node affinity, reset job status if needed. 735 // Returns whether any operation was done to stop the job or an error. 736 func (r *JobReconciler) stopJob(ctx context.Context, job GenericJob, wl *kueue.Workload, stopReason StopReason, eventMsg string) error { 737 object := job.Object() 738 739 info := GetPodSetsInfoFromWorkload(wl) 740 741 if jws, implements := job.(JobWithCustomStop); implements { 742 stoppedNow, err := jws.Stop(ctx, r.client, info, stopReason, eventMsg) 743 if stoppedNow { 744 r.record.Event(object, corev1.EventTypeNormal, ReasonStopped, eventMsg) 745 } 746 return err 747 } 748 749 if jws, implements := job.(ComposableJob); implements { 750 stoppedNow, err := jws.Stop(ctx, r.client, info, stopReason, eventMsg) 751 for _, objStoppedNow := range stoppedNow { 752 r.record.Event(objStoppedNow, corev1.EventTypeNormal, ReasonStopped, eventMsg) 753 } 754 return err 755 } 756 757 if job.IsSuspended() { 758 return nil 759 } 760 761 job.Suspend() 762 if info != nil { 763 job.RestorePodSetsInfo(info) 764 } 765 if err := r.client.Update(ctx, object); err != nil { 766 return err 767 } 768 769 r.record.Event(object, corev1.EventTypeNormal, ReasonStopped, eventMsg) 770 return nil 771 } 772 773 func (r *JobReconciler) finalizeJob(ctx context.Context, job GenericJob) error { 774 if jwf, implements := job.(JobWithFinalize); implements { 775 if err := jwf.Finalize(ctx, r.client); err != nil { 776 return err 777 } 778 } 779 780 return nil 781 } 782 783 // constructWorkload will derive a workload from the corresponding job. 784 func (r *JobReconciler) constructWorkload(ctx context.Context, job GenericJob, object client.Object) (*kueue.Workload, error) { 785 log := ctrl.LoggerFrom(ctx) 786 787 if cj, implements := job.(ComposableJob); implements { 788 wl, err := cj.ConstructComposableWorkload(ctx, r.client, r.record) 789 if err != nil { 790 return nil, err 791 } 792 793 return wl, nil 794 } 795 796 podSets := job.PodSets() 797 798 wl := &kueue.Workload{ 799 ObjectMeta: metav1.ObjectMeta{ 800 Name: GetWorkloadNameForOwnerWithGVK(object.GetName(), job.GVK()), 801 Namespace: object.GetNamespace(), 802 Labels: map[string]string{}, 803 Finalizers: []string{kueue.ResourceInUseFinalizerName}, 804 }, 805 Spec: kueue.WorkloadSpec{ 806 PodSets: podSets, 807 QueueName: QueueName(job), 808 }, 809 } 810 811 jobUid := string(job.Object().GetUID()) 812 if errs := validation.IsValidLabelValue(jobUid); len(errs) == 0 { 813 wl.Labels[controllerconsts.JobUIDLabel] = jobUid 814 } else { 815 log.V(2).Info( 816 "Validation of the owner job UID label has failed. Creating workload without the label.", 817 "ValidationErrors", errs, 818 "LabelValue", jobUid, 819 ) 820 } 821 822 if err := ctrl.SetControllerReference(object, wl, r.client.Scheme()); err != nil { 823 return nil, err 824 } 825 return wl, nil 826 } 827 828 // prepareWorkload adds the priority information for the constructed workload 829 func (r *JobReconciler) prepareWorkload(ctx context.Context, job GenericJob, wl *kueue.Workload) error { 830 priorityClassName, source, p, err := r.extractPriority(ctx, wl.Spec.PodSets, job) 831 if err != nil { 832 return err 833 } 834 835 wl.Spec.PriorityClassName = priorityClassName 836 wl.Spec.Priority = &p 837 wl.Spec.PriorityClassSource = source 838 839 wl.Spec.PodSets = clearMinCountsIfFeatureDisabled(wl.Spec.PodSets) 840 841 return nil 842 } 843 844 func (r *JobReconciler) extractPriority(ctx context.Context, podSets []kueue.PodSet, job GenericJob) (string, string, int32, error) { 845 if workloadPriorityClass := workloadPriorityClassName(job); len(workloadPriorityClass) > 0 { 846 return utilpriority.GetPriorityFromWorkloadPriorityClass(ctx, r.client, workloadPriorityClass) 847 } 848 if jobWithPriorityClass, isImplemented := job.(JobWithPriorityClass); isImplemented { 849 return utilpriority.GetPriorityFromPriorityClass( 850 ctx, r.client, jobWithPriorityClass.PriorityClass()) 851 } 852 return utilpriority.GetPriorityFromPriorityClass( 853 ctx, r.client, extractPriorityFromPodSets(podSets)) 854 } 855 856 func extractPriorityFromPodSets(podSets []kueue.PodSet) string { 857 for _, podSet := range podSets { 858 if len(podSet.Template.Spec.PriorityClassName) > 0 { 859 return podSet.Template.Spec.PriorityClassName 860 } 861 } 862 return "" 863 } 864 865 // getPodSetsInfoFromStatus extracts podSetsInfo from workload status, based on 866 // admission, and admission checks. 867 func getPodSetsInfoFromStatus(ctx context.Context, c client.Client, w *kueue.Workload) ([]podset.PodSetInfo, error) { 868 if len(w.Status.Admission.PodSetAssignments) == 0 { 869 return nil, nil 870 } 871 872 podSetsInfo := make([]podset.PodSetInfo, len(w.Status.Admission.PodSetAssignments)) 873 874 for i, podSetFlavor := range w.Status.Admission.PodSetAssignments { 875 info, err := podset.FromAssignment(ctx, c, &podSetFlavor, w.Spec.PodSets[i].Count) 876 if err != nil { 877 return nil, err 878 } 879 880 for _, admissionCheck := range w.Status.AdmissionChecks { 881 for _, podSetUpdate := range admissionCheck.PodSetUpdates { 882 if podSetUpdate.Name == info.Name { 883 if err := info.Merge(podset.FromUpdate(&podSetUpdate)); err != nil { 884 return nil, fmt.Errorf("in admission check %q: %w", admissionCheck.Name, err) 885 } 886 break 887 } 888 } 889 } 890 podSetsInfo[i] = info 891 } 892 return podSetsInfo, nil 893 } 894 895 func (r *JobReconciler) handleJobWithNoWorkload(ctx context.Context, job GenericJob, object client.Object) error { 896 log := ctrl.LoggerFrom(ctx) 897 898 _, usePrebuiltWorkload := PrebuiltWorkloadFor(job) 899 if usePrebuiltWorkload { 900 // Stop the job if not already suspended 901 if stopErr := r.stopJob(ctx, job, nil, StopReasonNoMatchingWorkload, "missing workload"); stopErr != nil { 902 return stopErr 903 } 904 } 905 906 // Wait until there are no active pods. 907 if job.IsActive() { 908 log.V(2).Info("Job is suspended but still has active pods, waiting") 909 return nil 910 } 911 912 if usePrebuiltWorkload { 913 log.V(2).Info("Skip workload creation for job with prebuilt workload") 914 return nil 915 } 916 917 // Create the corresponding workload. 918 wl, err := r.constructWorkload(ctx, job, object) 919 if err != nil { 920 return err 921 } 922 err = r.prepareWorkload(ctx, job, wl) 923 if err != nil { 924 return err 925 } 926 if err = r.client.Create(ctx, wl); err != nil { 927 return err 928 } 929 r.record.Eventf(object, corev1.EventTypeNormal, ReasonCreatedWorkload, 930 "Created Workload: %v", workload.Key(wl)) 931 return nil 932 } 933 934 func (r *JobReconciler) ignoreUnretryableError(log logr.Logger, err error) error { 935 if IsUnretryableError(err) { 936 log.V(2).Info("Received an unretryable error", "error", err) 937 return nil 938 } 939 return err 940 } 941 942 func generatePodsReadyCondition(job GenericJob, wl *kueue.Workload) metav1.Condition { 943 conditionStatus := metav1.ConditionFalse 944 message := "Not all pods are ready or succeeded" 945 // Once PodsReady=True it stays as long as the workload remains admitted to 946 // avoid unnecessary flickering the condition when the pods transition 947 // from Ready to Completed. As pods finish, they transition first into the 948 // uncountedTerminatedPods staging area, before passing to the 949 // succeeded/failed counters. 950 if workload.IsAdmitted(wl) && (job.PodsReady() || apimeta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadPodsReady)) { 951 conditionStatus = metav1.ConditionTrue 952 message = "All pods were ready or succeeded since the workload admission" 953 } 954 return metav1.Condition{ 955 Type: kueue.WorkloadPodsReady, 956 Status: conditionStatus, 957 Reason: "PodsReady", 958 Message: message, 959 } 960 } 961 962 // GetPodSetsInfoFromWorkload retrieve the podSetsInfo slice from the 963 // provided workload's spec 964 func GetPodSetsInfoFromWorkload(wl *kueue.Workload) []podset.PodSetInfo { 965 if wl == nil { 966 return nil 967 } 968 969 return slices.Map(wl.Spec.PodSets, podset.FromPodSet) 970 971 } 972 973 type ReconcilerSetup func(*builder.Builder, client.Client) *builder.Builder 974 975 // NewGenericReconcilerFactory creates a new reconciler factory for a concrete GenericJob type. 976 // newJob should return a new empty job. 977 func NewGenericReconcilerFactory(newJob func() GenericJob, setup ...ReconcilerSetup) ReconcilerFactory { 978 return func(client client.Client, record record.EventRecorder, opts ...Option) JobReconcilerInterface { 979 return &genericReconciler{ 980 jr: NewReconciler(client, record, opts...), 981 newJob: newJob, 982 setup: setup, 983 } 984 } 985 } 986 987 type genericReconciler struct { 988 jr *JobReconciler 989 newJob func() GenericJob 990 setup []ReconcilerSetup 991 } 992 993 func (r *genericReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { 994 return r.jr.ReconcileGenericJob(ctx, req, r.newJob()) 995 } 996 997 func (r *genericReconciler) SetupWithManager(mgr ctrl.Manager) error { 998 b := ctrl.NewControllerManagedBy(mgr). 999 For(r.newJob().Object()).Owns(&kueue.Workload{}) 1000 c := mgr.GetClient() 1001 for _, f := range r.setup { 1002 b = f(b, c) 1003 } 1004 return b.Complete(r) 1005 } 1006 1007 // clearMinCountsIfFeatureDisabled sets the minCount for all podSets to nil if the PartialAdmission feature is not enabled 1008 func clearMinCountsIfFeatureDisabled(in []kueue.PodSet) []kueue.PodSet { 1009 if features.Enabled(features.PartialAdmission) || len(in) == 0 { 1010 return in 1011 } 1012 for i := range in { 1013 in[i].MinCount = nil 1014 } 1015 return in 1016 }