sigs.k8s.io/kueue@v0.6.2/pkg/scheduler/scheduler.go

/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package scheduler

import (
	"context"
	"fmt"
	"maps"
	"sort"
	"strings"
	"time"

	"github.com/go-logr/logr"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	apimeta "k8s.io/apimachinery/pkg/api/meta"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/tools/record"
	"k8s.io/klog/v2"
	"k8s.io/utils/field"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	config "sigs.k8s.io/kueue/apis/config/v1beta1"
	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/cache"
	"sigs.k8s.io/kueue/pkg/features"
	"sigs.k8s.io/kueue/pkg/metrics"
	"sigs.k8s.io/kueue/pkg/queue"
	"sigs.k8s.io/kueue/pkg/scheduler/flavorassigner"
	"sigs.k8s.io/kueue/pkg/scheduler/preemption"
	"sigs.k8s.io/kueue/pkg/util/api"
	"sigs.k8s.io/kueue/pkg/util/limitrange"
	utilmaps "sigs.k8s.io/kueue/pkg/util/maps"
	"sigs.k8s.io/kueue/pkg/util/priority"
	"sigs.k8s.io/kueue/pkg/util/resource"
	"sigs.k8s.io/kueue/pkg/util/routine"
	"sigs.k8s.io/kueue/pkg/workload"
)

const (
	errCouldNotAdmitWL = "Could not admit Workload and assign flavors in apiserver"
)

type Scheduler struct {
	queues                  *queue.Manager
	cache                   *cache.Cache
	client                  client.Client
	recorder                record.EventRecorder
	admissionRoutineWrapper routine.Wrapper
	preemptor               *preemption.Preemptor
	// Stubs.
	applyAdmission func(context.Context, *kueue.Workload) error

	workloadOrdering workload.Ordering
}

type options struct {
	podsReadyRequeuingTimestamp config.RequeuingTimestamp
}

// Option configures the scheduler.
type Option func(*options)

var defaultOptions = options{
	podsReadyRequeuingTimestamp: config.EvictionTimestamp,
}

// WithPodsReadyRequeuingTimestamp sets the timestamp that is used for ordering
// workloads that have been requeued due to the PodsReady condition.
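//
// A minimal usage sketch (the surrounding variable names and the
// config.CreationTimestamp constant are assumptions for illustration, not taken
// from this file):
//
//	sched := scheduler.New(queues, cqCache, cl, recorder,
//		scheduler.WithPodsReadyRequeuingTimestamp(config.CreationTimestamp))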
func WithPodsReadyRequeuingTimestamp(ts config.RequeuingTimestamp) Option {
	return func(o *options) {
		o.podsReadyRequeuingTimestamp = ts
	}
}

func New(queues *queue.Manager, cache *cache.Cache, cl client.Client, recorder record.EventRecorder, opts ...Option) *Scheduler {
	options := defaultOptions
	for _, opt := range opts {
		opt(&options)
	}
	wo := workload.Ordering{
		PodsReadyRequeuingTimestamp: options.podsReadyRequeuingTimestamp,
	}
	s := &Scheduler{
		queues:                  queues,
		cache:                   cache,
		client:                  cl,
		recorder:                recorder,
		preemptor:               preemption.New(cl, wo, recorder),
		admissionRoutineWrapper: routine.DefaultWrapper,
		workloadOrdering:        wo,
	}
	s.applyAdmission = s.applyAdmissionWithSSA
	return s
}

// Start implements the Runnable interface to run the scheduler as a controller.
func (s *Scheduler) Start(ctx context.Context) error {
	log := ctrl.LoggerFrom(ctx).WithName("scheduler")
	ctx = ctrl.LoggerInto(ctx, log)
	go wait.UntilWithContext(ctx, s.schedule, 0)
	return nil
}

// NeedLeaderElection implements the LeaderElectionRunnable interface to make the
// scheduler run in leader election mode.
func (s *Scheduler) NeedLeaderElection() bool {
	return true
}

func (s *Scheduler) setAdmissionRoutineWrapper(wrapper routine.Wrapper) {
	s.admissionRoutineWrapper = wrapper
}

// cohortsUsage tracks the usage accumulated per cohort during a scheduling cycle.
type cohortsUsage map[string]cache.FlavorResourceQuantities

func (cu *cohortsUsage) add(cohort string, assignment cache.FlavorResourceQuantities) {
	cohortUsage := (*cu)[cohort]
	if cohortUsage == nil {
		cohortUsage = make(cache.FlavorResourceQuantities, len(assignment))
	}

	for flavor, resources := range assignment {
		if _, found := cohortUsage[flavor]; found {
			cohortUsage[flavor] = utilmaps.Merge(cohortUsage[flavor], resources, func(a, b int64) int64 { return a + b })
		} else {
			cohortUsage[flavor] = maps.Clone(resources)
		}
	}
	(*cu)[cohort] = cohortUsage
}

func (cu *cohortsUsage) totalUsageForCommonFlavorResources(cohort string, assignment cache.FlavorResourceQuantities) cache.FlavorResourceQuantities {
	return utilmaps.Intersect((*cu)[cohort], assignment, func(a, b map[corev1.ResourceName]int64) map[corev1.ResourceName]int64 {
		return utilmaps.Intersect(a, b, func(a, b int64) int64 { return a + b })
	})
}

func (cu *cohortsUsage) hasCommonFlavorResources(cohort string, assignment cache.FlavorResourceQuantities) bool {
	cohortUsage, cohortFound := (*cu)[cohort]
	if !cohortFound {
		return false
	}
	for flavor, assignmentResources := range assignment {
		if cohortResources, found := cohortUsage[flavor]; found {
			for resName := range assignmentResources {
				if _, found := cohortResources[resName]; found {
					return true
				}
			}
		}
	}
	return false
}

func (s *Scheduler) schedule(ctx context.Context) {
	log := ctrl.LoggerFrom(ctx)

	// 1. Get the heads from the queues, including their desired clusterQueue.
	// This operation blocks while the queues are empty.
	headWorkloads := s.queues.Heads(ctx)
	// If there are no elements, it means that the program is finishing.
	if len(headWorkloads) == 0 {
		return
	}
	startTime := time.Now()

	// 2. Take a snapshot of the cache.
	snapshot := s.cache.Snapshot()
	logSnapshotIfVerbose(log, &snapshot)

	// 3. Calculate requirements (resource flavors, borrowing) for admitting workloads.
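	// Workloads that cannot be admitted right now still produce an entry with an
	// inadmissible message, so they can be requeued with that reason in step 6.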
	entries := s.nominate(ctx, headWorkloads, snapshot)

	// 4. Sort entries based on borrowing, priorities (if enabled) and timestamps.
	sort.Sort(entryOrdering{
		entries:          entries,
		workloadOrdering: s.workloadOrdering,
	})

	// 5. Admit entries, ensuring that no more than one workload gets
	// admitted by a cohort (if borrowing).
	// This is because there can be other workloads deeper in a clusterQueue whose
	// head got admitted that should be scheduled in the cohort before the heads
	// of other clusterQueues.
	cycleCohortsUsage := cohortsUsage{}
	cycleCohortsSkipPreemption := sets.New[string]()
	for i := range entries {
		e := &entries[i]
		mode := e.assignment.RepresentativeMode()
		if mode == flavorassigner.NoFit {
			continue
		}

		cq := snapshot.ClusterQueues[e.ClusterQueue]
		if cq.Cohort != nil {
			sum := cycleCohortsUsage.totalUsageForCommonFlavorResources(cq.Cohort.Name, e.assignment.Usage)
			// Check whether there was an assignment in this cycle that could render the next assignments invalid:
			// - If the workload no longer fits in the cohort.
			// - If there was another assignment in the cohort, then the preemption calculation is no longer valid.
			if cycleCohortsUsage.hasCommonFlavorResources(cq.Cohort.Name, e.assignment.Usage) &&
				((mode == flavorassigner.Fit && !cq.FitInCohort(sum)) ||
					(mode == flavorassigner.Preempt && cycleCohortsSkipPreemption.Has(cq.Cohort.Name))) {
				e.status = skipped
				e.inadmissibleMsg = "other workloads in the cohort were prioritized"
				// When this workload needs borrowing and another workload in the cohort
				// that doesn't need borrowing was assigned first, this workload will be
				// nominated again. In that case we should not skip the previously tried flavors.
				e.LastAssignment = nil
				continue
			}
			// Even if the workload will not be admitted after this point, due to preemption pending or other failures,
			// we should still account for its usage.
			cycleCohortsUsage.add(cq.Cohort.Name, resourcesToReserve(e, cq))
		}
		log := log.WithValues("workload", klog.KObj(e.Obj), "clusterQueue", klog.KRef("", e.ClusterQueue))
		ctx := ctrl.LoggerInto(ctx, log)
		if e.assignment.RepresentativeMode() != flavorassigner.Fit {
			if len(e.preemptionTargets) != 0 {
				// If preemptions are issued, the next attempt should try all the flavors.
				e.LastAssignment = nil
				preempted, err := s.preemptor.IssuePreemptions(ctx, e.preemptionTargets, cq)
				if err != nil {
					log.Error(err, "Failed to preempt workloads")
				}
				if preempted != 0 {
					e.inadmissibleMsg += fmt.Sprintf(". Pending the preemption of %d workload(s)", preempted)
					e.requeueReason = queue.RequeueReasonPendingPreemption
				}
				if cq.Cohort != nil {
					cycleCohortsSkipPreemption.Insert(cq.Cohort.Name)
				}
			} else {
				log.V(2).Info("Workload requires preemption, but there are no candidate workloads allowed for preemption", "preemption", cq.Preemption)
			}
			continue
		}
		if !s.cache.PodsReadyForAllAdmittedWorkloads(log) {
			log.V(5).Info("Waiting for all admitted workloads to be in the PodsReady condition")
			// If WaitForPodsReady is enabled and WaitForPodsReady.BlockAdmission is true,
			// block admission until all currently admitted workloads are in the
			// PodsReady condition.
			workload.UnsetQuotaReservationWithCondition(e.Obj, "Waiting", "waiting for all admitted workloads to be in PodsReady condition")
			if err := workload.ApplyAdmissionStatus(ctx, s.client, e.Obj, false); err != nil {
				log.Error(err, "Could not update Workload status")
			}
			s.cache.WaitForPodsReady(ctx)
			log.V(5).Info("Finished waiting for all admitted workloads to be in the PodsReady condition")
		}
		e.status = nominated
		if err := s.admit(ctx, e, cq.AdmissionChecks); err != nil {
			e.inadmissibleMsg = fmt.Sprintf("Failed to admit workload: %v", err)
		}
		if cq.Cohort != nil {
			cycleCohortsSkipPreemption.Insert(cq.Cohort.Name)
		}
	}

	// 6. Requeue the heads that were not scheduled.
	result := metrics.AdmissionResultInadmissible
	for _, e := range entries {
		logAdmissionAttemptIfVerbose(log, &e)
		if e.status != assumed {
			s.requeueAndUpdate(log, ctx, e)
		} else {
			result = metrics.AdmissionResultSuccess
		}
	}
	metrics.AdmissionAttempt(result, time.Since(startTime))
}

type entryStatus string

const (
	// nominated indicates that the workload was nominated for admission.
	nominated entryStatus = "nominated"
	// skipped indicates that the workload was skipped in this cycle.
	skipped entryStatus = "skipped"
	// assumed indicates that the workload was assumed to have been admitted.
	assumed entryStatus = "assumed"
	// notNominated indicates that the workload was never nominated for admission.
	notNominated entryStatus = ""
)

// entry holds requirements for a workload to be admitted by a clusterQueue.
type entry struct {
	// workload.Info holds the workload from the API as well as resource usage
	// and flavors assigned.
	workload.Info
	assignment        flavorassigner.Assignment
	status            entryStatus
	inadmissibleMsg   string
	requeueReason     queue.RequeueReason
	preemptionTargets []*workload.Info
}

// nominate returns the workloads with their requirements (resource flavors, borrowing) as if
// they were admitted by the clusterQueues in the snapshot.
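// Workloads that fail validation, or whose clusterQueue is missing, inactive, or
// does not match the namespace, are returned with an inadmissible message instead
// of an assignment.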
func (s *Scheduler) nominate(ctx context.Context, workloads []workload.Info, snap cache.Snapshot) []entry {
	log := ctrl.LoggerFrom(ctx)
	entries := make([]entry, 0, len(workloads))
	for _, w := range workloads {
		log := log.WithValues("workload", klog.KObj(w.Obj), "clusterQueue", klog.KRef("", w.ClusterQueue))
		cq := snap.ClusterQueues[w.ClusterQueue]
		ns := corev1.Namespace{}
		e := entry{Info: w}
		if s.cache.IsAssumedOrAdmittedWorkload(w) {
			log.Info("Workload skipped from admission because it's already assumed or admitted", "workload", klog.KObj(w.Obj))
			continue
		} else if workload.HasRetryOrRejectedChecks(w.Obj) {
			e.inadmissibleMsg = "The workload has failed admission checks"
		} else if snap.InactiveClusterQueueSets.Has(w.ClusterQueue) {
			e.inadmissibleMsg = fmt.Sprintf("ClusterQueue %s is inactive", w.ClusterQueue)
		} else if cq == nil {
			e.inadmissibleMsg = fmt.Sprintf("ClusterQueue %s not found", w.ClusterQueue)
		} else if err := s.client.Get(ctx, types.NamespacedName{Name: w.Obj.Namespace}, &ns); err != nil {
			e.inadmissibleMsg = fmt.Sprintf("Could not obtain workload namespace: %v", err)
		} else if !cq.NamespaceSelector.Matches(labels.Set(ns.Labels)) {
			e.inadmissibleMsg = "Workload namespace doesn't match ClusterQueue selector"
			e.requeueReason = queue.RequeueReasonNamespaceMismatch
		} else if err := s.validateResources(&w); err != nil {
			e.inadmissibleMsg = err.Error()
		} else if err := s.validateLimitRange(ctx, &w); err != nil {
			e.inadmissibleMsg = err.Error()
		} else {
			e.assignment, e.preemptionTargets = s.getAssignments(log, &e.Info, &snap)
			e.inadmissibleMsg = e.assignment.Message()
			e.Info.LastAssignment = &e.assignment.LastState
		}
		entries = append(entries, e)
	}
	return entries
}

// resourcesToReserve calculates how much of the available resources in the cq/cohort assignment should be reserved.
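// For example (illustrative numbers): in Preempt mode without borrowing, if a
// flavor's nominal quota is 10, the clusterQueue already uses 8, and the workload
// requests 5, only min(5, 10-8) = 2 is reserved; the remainder must come from
// quota released by the preempted workloads.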
func resourcesToReserve(e *entry, cq *cache.ClusterQueue) cache.FlavorResourceQuantities {
	if e.assignment.RepresentativeMode() != flavorassigner.Preempt {
		return e.assignment.Usage
	}
	reservedUsage := make(cache.FlavorResourceQuantities)
	for flavor, resourceUsage := range e.assignment.Usage {
		reservedUsage[flavor] = make(map[corev1.ResourceName]int64)
		for resource, usage := range resourceUsage {
			rg := cq.RGByResource[resource]
			cqQuota := cache.ResourceQuota{}
			for _, cqFlavor := range rg.Flavors {
				if cqFlavor.Name == flavor {
					cqQuota = *cqFlavor.Resources[resource]
					break
				}
			}
			if !e.assignment.Borrowing {
				reservedUsage[flavor][resource] = max(0, min(usage, cqQuota.Nominal-cq.Usage[flavor][resource]))
			} else {
				if cqQuota.BorrowingLimit == nil {
					reservedUsage[flavor][resource] = usage
				} else {
					reservedUsage[flavor][resource] = min(usage, cqQuota.Nominal+*cqQuota.BorrowingLimit-cq.Usage[flavor][resource])
				}
			}
		}
	}
	return reservedUsage
}

type partialAssignment struct {
	assignment        flavorassigner.Assignment
	preemptionTargets []*workload.Info
}

func (s *Scheduler) getAssignments(log logr.Logger, wl *workload.Info, snap *cache.Snapshot) (flavorassigner.Assignment, []*workload.Info) {
	cq := snap.ClusterQueues[wl.ClusterQueue]
	fullAssignment := flavorassigner.AssignFlavors(log, wl, snap.ResourceFlavors, cq, nil)
	var faPreemptionTargets []*workload.Info

	arm := fullAssignment.RepresentativeMode()
	if arm == flavorassigner.Fit {
		return fullAssignment, nil
	}

	if arm == flavorassigner.Preempt {
		faPreemptionTargets = s.preemptor.GetTargets(*wl, fullAssignment, snap)
	}

	// If the PartialAdmission feature gate is not enabled or preemption targets were
	// found, use the full assignment.
	if !features.Enabled(features.PartialAdmission) || len(faPreemptionTargets) > 0 {
		return fullAssignment, faPreemptionTargets
	}

	if wl.CanBePartiallyAdmitted() {
		reducer := flavorassigner.NewPodSetReducer(wl.Obj.Spec.PodSets, func(nextCounts []int32) (*partialAssignment, bool) {
			assignment := flavorassigner.AssignFlavors(log, wl, snap.ResourceFlavors, cq, nextCounts)
			if assignment.RepresentativeMode() == flavorassigner.Fit {
				return &partialAssignment{assignment: assignment}, true
			}
			preemptionTargets := s.preemptor.GetTargets(*wl, assignment, snap)
			if len(preemptionTargets) > 0 {
				return &partialAssignment{assignment: assignment, preemptionTargets: preemptionTargets}, true
			}
			return nil, false
		})
		if pa, found := reducer.Search(); found {
			return pa.assignment, pa.preemptionTargets
		}
	}
	return fullAssignment, nil
}

// validateResources validates that requested resources are less than or equal
// to limits.
func (s *Scheduler) validateResources(wi *workload.Info) error {
	podsetsPath := field.NewPath("podSets")
	// requests should not exceed limits.
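	// Violations from every init container and regular container are aggregated
	// into a single error so all offending resources are reported at once.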
	allReasons := []string{}
	for i := range wi.Obj.Spec.PodSets {
		ps := &wi.Obj.Spec.PodSets[i]
		psPath := podsetsPath.Child(ps.Name)
		for i := range ps.Template.Spec.InitContainers {
			c := ps.Template.Spec.InitContainers[i]
			if list := resource.GetGreaterKeys(c.Resources.Requests, c.Resources.Limits); len(list) > 0 {
				allReasons = append(allReasons, fmt.Sprintf("%s[%s] requests exceed its limits",
					psPath.Child("initContainers").Index(i).String(),
					strings.Join(list, ", ")))
			}
		}

		for i := range ps.Template.Spec.Containers {
			c := ps.Template.Spec.Containers[i]
			if list := resource.GetGreaterKeys(c.Resources.Requests, c.Resources.Limits); len(list) > 0 {
				allReasons = append(allReasons, fmt.Sprintf("%s[%s] requests exceed its limits",
					psPath.Child("containers").Index(i).String(),
					strings.Join(list, ", ")))
			}
		}
	}
	if len(allReasons) > 0 {
		return fmt.Errorf("resource validation failed: %s", strings.Join(allReasons, "; "))
	}
	return nil
}

// validateLimitRange validates that the requested resources fit within the
// LimitRanges defined in the namespace.
func (s *Scheduler) validateLimitRange(ctx context.Context, wi *workload.Info) error {
	podsetsPath := field.NewPath("podSets")
	// Get the range summary from the namespace.
	list := corev1.LimitRangeList{}
	if err := s.client.List(ctx, &list, &client.ListOptions{Namespace: wi.Obj.Namespace}); err != nil {
		return err
	}
	if len(list.Items) == 0 {
		return nil
	}
	summary := limitrange.Summarize(list.Items...)

	// Verify each podSet against the summary.
	allReasons := []string{}
	for i := range wi.Obj.Spec.PodSets {
		ps := &wi.Obj.Spec.PodSets[i]
		allReasons = append(allReasons, summary.ValidatePodSpec(&ps.Template.Spec, podsetsPath.Child(ps.Name))...)
	}
	if len(allReasons) > 0 {
		return fmt.Errorf("didn't satisfy LimitRange constraints: %s", strings.Join(allReasons, "; "))
	}
	return nil
}

// admit sets the admitting clusterQueue and flavors into the workload of
// the entry, and asynchronously updates the object in the apiserver after
// assuming it in the cache.
func (s *Scheduler) admit(ctx context.Context, e *entry, mustHaveChecks sets.Set[string]) error {
	log := ctrl.LoggerFrom(ctx)
	newWorkload := e.Obj.DeepCopy()
	admission := &kueue.Admission{
		ClusterQueue:      kueue.ClusterQueueReference(e.ClusterQueue),
		PodSetAssignments: e.assignment.ToAPI(),
	}

	workload.SetQuotaReservation(newWorkload, admission)
	if workload.HasAllChecks(newWorkload, mustHaveChecks) {
		// Sync the Admitted condition; ignore the result since an API update is always done.
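		// The Admitted condition is only synced here when the workload already carries
		// a state for every AdmissionCheck required by the clusterQueue; otherwise only
		// quota is reserved at this point.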
		_ = workload.SyncAdmittedCondition(newWorkload)
	}
	if err := s.cache.AssumeWorkload(newWorkload); err != nil {
		return err
	}
	e.status = assumed
	log.V(2).Info("Workload assumed in the cache")

	s.admissionRoutineWrapper.Run(func() {
		err := s.applyAdmission(ctx, newWorkload)
		if err == nil {
			waitStarted := e.Obj.CreationTimestamp.Time
			if c := apimeta.FindStatusCondition(e.Obj.Status.Conditions, kueue.WorkloadEvicted); c != nil {
				waitStarted = c.LastTransitionTime.Time
			}
			waitTime := time.Since(waitStarted)
			s.recorder.Eventf(newWorkload, corev1.EventTypeNormal, "QuotaReserved", "Quota reserved in ClusterQueue %v, wait time since queued was %.0fs", admission.ClusterQueue, waitTime.Seconds())
			if workload.IsAdmitted(newWorkload) {
				s.recorder.Eventf(newWorkload, corev1.EventTypeNormal, "Admitted", "Admitted by ClusterQueue %v, wait time since reservation was 0s ", admission.ClusterQueue)
			}
			metrics.AdmittedWorkload(admission.ClusterQueue, waitTime)
			log.V(2).Info("Workload successfully admitted and assigned flavors", "assignments", admission.PodSetAssignments)
			return
		}
		// Ignore errors because the workload or clusterQueue could have been deleted
		// by an event.
		_ = s.cache.ForgetWorkload(newWorkload)
		if errors.IsNotFound(err) {
			log.V(2).Info("Workload not admitted because it was deleted")
			return
		}

		log.Error(err, errCouldNotAdmitWL)
		s.requeueAndUpdate(log, ctx, *e)
	})

	return nil
}

func (s *Scheduler) applyAdmissionWithSSA(ctx context.Context, w *kueue.Workload) error {
	return workload.ApplyAdmissionStatus(ctx, s.client, w, false)
}

type entryOrdering struct {
	entries          []entry
	workloadOrdering workload.Ordering
}

func (e entryOrdering) Len() int {
	return len(e.entries)
}

func (e entryOrdering) Swap(i, j int) {
	e.entries[i], e.entries[j] = e.entries[j], e.entries[i]
}

// Less implements the ordering criteria:
// 1. request under nominal quota before borrowing.
// 2. higher priority first.
// 3. FIFO on eviction or creation timestamp.
func (e entryOrdering) Less(i, j int) bool {
	a := e.entries[i]
	b := e.entries[j]

	// 1. Request under nominal quota.
	aBorrows := a.assignment.Borrows()
	bBorrows := b.assignment.Borrows()
	if aBorrows != bBorrows {
		return !aBorrows
	}

	// 2. Higher priority first if not disabled.
	if features.Enabled(features.PrioritySortingWithinCohort) {
		p1 := priority.Priority(a.Obj)
		p2 := priority.Priority(b.Obj)
		if p1 != p2 {
			return p1 > p2
		}
	}

	// 3. FIFO.
	aComparisonTimestamp := e.workloadOrdering.GetQueueOrderTimestamp(a.Obj)
	bComparisonTimestamp := e.workloadOrdering.GetQueueOrderTimestamp(b.Obj)
	return aComparisonTimestamp.Before(bComparisonTimestamp)
}

func (s *Scheduler) requeueAndUpdate(log logr.Logger, ctx context.Context, e entry) {
	if e.status != notNominated && e.requeueReason == queue.RequeueReasonGeneric {
		// Failed after nomination is the only reason why a workload would be requeued downstream.
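		// More specific reasons set earlier (e.g. namespace mismatch or pending
		// preemption) are preserved as-is.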
		e.requeueReason = queue.RequeueReasonFailedAfterNomination
	}
	added := s.queues.RequeueWorkload(ctx, &e.Info, e.requeueReason)
	log.V(2).Info("Workload re-queued", "workload", klog.KObj(e.Obj), "clusterQueue", klog.KRef("", e.ClusterQueue), "queue", klog.KRef(e.Obj.Namespace, e.Obj.Spec.QueueName), "requeueReason", e.requeueReason, "added", added)

	if e.status == notNominated || e.status == skipped {
		if workload.UnsetQuotaReservationWithCondition(e.Obj, "Pending", e.inadmissibleMsg) {
			err := workload.ApplyAdmissionStatus(ctx, s.client, e.Obj, true)
			if err != nil {
				log.Error(err, "Could not update Workload status")
			}
		}
		s.recorder.Eventf(e.Obj, corev1.EventTypeNormal, "Pending", api.TruncateEventMessage(e.inadmissibleMsg))
	}
}