sigs.k8s.io/kueue@v0.6.2/pkg/scheduler/scheduler.go

     1  /*
     2  Copyright 2022 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package scheduler
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"maps"
    23  	"sort"
    24  	"strings"
    25  	"time"
    26  
    27  	"github.com/go-logr/logr"
    28  	corev1 "k8s.io/api/core/v1"
    29  	"k8s.io/apimachinery/pkg/api/errors"
    30  	apimeta "k8s.io/apimachinery/pkg/api/meta"
    31  	"k8s.io/apimachinery/pkg/labels"
    32  	"k8s.io/apimachinery/pkg/types"
    33  	"k8s.io/apimachinery/pkg/util/sets"
    34  	"k8s.io/apimachinery/pkg/util/wait"
    35  	"k8s.io/client-go/tools/record"
    36  	"k8s.io/klog/v2"
    37  	"k8s.io/utils/field"
    38  	ctrl "sigs.k8s.io/controller-runtime"
    39  	"sigs.k8s.io/controller-runtime/pkg/client"
    40  
    41  	config "sigs.k8s.io/kueue/apis/config/v1beta1"
    42  	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
    43  	"sigs.k8s.io/kueue/pkg/cache"
    44  	"sigs.k8s.io/kueue/pkg/features"
    45  	"sigs.k8s.io/kueue/pkg/metrics"
    46  	"sigs.k8s.io/kueue/pkg/queue"
    47  	"sigs.k8s.io/kueue/pkg/scheduler/flavorassigner"
    48  	"sigs.k8s.io/kueue/pkg/scheduler/preemption"
    49  	"sigs.k8s.io/kueue/pkg/util/api"
    50  	"sigs.k8s.io/kueue/pkg/util/limitrange"
    51  	utilmaps "sigs.k8s.io/kueue/pkg/util/maps"
    52  	"sigs.k8s.io/kueue/pkg/util/priority"
    53  	"sigs.k8s.io/kueue/pkg/util/resource"
    54  	"sigs.k8s.io/kueue/pkg/util/routine"
    55  	"sigs.k8s.io/kueue/pkg/workload"
    56  )
    57  
    58  const (
    59  	errCouldNotAdmitWL = "Could not admit Workload and assign flavors in apiserver"
    60  )
    61  
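         // Scheduler performs admission of queued workloads: each scheduling cycle takes the
         // heads of the cluster queues, assigns resource flavors (possibly issuing preemptions),
         // and admits the workloads that fit.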
    62  type Scheduler struct {
    63  	queues                  *queue.Manager
    64  	cache                   *cache.Cache
    65  	client                  client.Client
    66  	recorder                record.EventRecorder
    67  	admissionRoutineWrapper routine.Wrapper
    68  	preemptor               *preemption.Preemptor
    69  	// Stubs.
    70  	applyAdmission func(context.Context, *kueue.Workload) error
    71  
    72  	workloadOrdering workload.Ordering
    73  }
    74  
    75  type options struct {
    76  	podsReadyRequeuingTimestamp config.RequeuingTimestamp
    77  }
    78  
     79  // Option configures the scheduler.
    80  type Option func(*options)
    81  
    82  var defaultOptions = options{
    83  	podsReadyRequeuingTimestamp: config.EvictionTimestamp,
    84  }
    85  
    86  // WithPodsReadyRequeuingTimestamp sets the timestamp that is used for ordering
    87  // workloads that have been requeued due to the PodsReady condition.
    88  func WithPodsReadyRequeuingTimestamp(ts config.RequeuingTimestamp) Option {
    89  	return func(o *options) {
    90  		o.podsReadyRequeuingTimestamp = ts
    91  	}
    92  }
    93  
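         // New returns a Scheduler that admits workloads from the given queue manager, using
         // the cache for snapshots and for assuming admitted workloads.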
    94  func New(queues *queue.Manager, cache *cache.Cache, cl client.Client, recorder record.EventRecorder, opts ...Option) *Scheduler {
    95  	options := defaultOptions
    96  	for _, opt := range opts {
    97  		opt(&options)
    98  	}
    99  	wo := workload.Ordering{
   100  		PodsReadyRequeuingTimestamp: options.podsReadyRequeuingTimestamp,
   101  	}
   102  	s := &Scheduler{
   103  		queues:                  queues,
   104  		cache:                   cache,
   105  		client:                  cl,
   106  		recorder:                recorder,
   107  		preemptor:               preemption.New(cl, wo, recorder),
   108  		admissionRoutineWrapper: routine.DefaultWrapper,
   109  		workloadOrdering:        wo,
   110  	}
   111  	s.applyAdmission = s.applyAdmissionWithSSA
   112  	return s
   113  }
   114  
   115  // Start implements the Runnable interface to run scheduler as a controller.
   116  func (s *Scheduler) Start(ctx context.Context) error {
   117  	log := ctrl.LoggerFrom(ctx).WithName("scheduler")
   118  	ctx = ctrl.LoggerInto(ctx, log)
   119  	go wait.UntilWithContext(ctx, s.schedule, 0)
   120  	return nil
   121  }
   122  
    123  // NeedLeaderElection implements the LeaderElectionRunnable interface so that the
    124  // scheduler only runs on the instance that holds the leader election lease.
   125  func (s *Scheduler) NeedLeaderElection() bool {
   126  	return true
   127  }
   128  
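         // setAdmissionRoutineWrapper overrides the wrapper used to run the asynchronous
         // admission routine, e.g. to make admission synchronous in tests.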
   129  func (s *Scheduler) setAdmissionRoutineWrapper(wrapper routine.Wrapper) {
   130  	s.admissionRoutineWrapper = wrapper
   131  }
   132  
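         // cohortsUsage accumulates, per cohort name, the flavor/resource quantities assigned
         // to workloads earlier in the current scheduling cycle.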
   133  type cohortsUsage map[string]cache.FlavorResourceQuantities
   134  
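         // add merges the assignment's usage into the cohort's accumulated usage, summing the
         // quantities of flavor/resource pairs that are already tracked. For example, adding
         // {f1: {cpu: 2}} to an accumulated {f1: {cpu: 3}, f2: {cpu: 1}} yields
         // {f1: {cpu: 5}, f2: {cpu: 1}}.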
    135  func (cu *cohortsUsage) add(cohort string, assignment cache.FlavorResourceQuantities) {
   136  	cohortUsage := (*cu)[cohort]
   137  	if cohortUsage == nil {
    138  		cohortUsage = make(cache.FlavorResourceQuantities, len(assignment))
   139  	}
   140  
    141  	for flavor, resources := range assignment {
   142  		if _, found := cohortUsage[flavor]; found {
   143  			cohortUsage[flavor] = utilmaps.Merge(cohortUsage[flavor], resources, func(a, b int64) int64 { return a + b })
   144  		} else {
   145  			cohortUsage[flavor] = maps.Clone(resources)
   146  		}
   147  	}
   148  	(*cu)[cohort] = cohortUsage
   149  }
   150  
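         // totalUsageForCommonFlavorResources returns the sum of the cohort's accumulated usage
         // and the assignment's usage, restricted to the flavor/resource pairs present in both.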
    151  func (cu *cohortsUsage) totalUsageForCommonFlavorResources(cohort string, assignment cache.FlavorResourceQuantities) cache.FlavorResourceQuantities {
    152  	return utilmaps.Intersect((*cu)[cohort], assignment, func(a, b map[corev1.ResourceName]int64) map[corev1.ResourceName]int64 {
   153  		return utilmaps.Intersect(a, b, func(a, b int64) int64 { return a + b })
   154  	})
   155  }
   156  
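         // hasCommonFlavorResources reports whether the assignment uses at least one
         // flavor/resource pair for which the cohort already accumulated usage in this cycle.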
    157  func (cu *cohortsUsage) hasCommonFlavorResources(cohort string, assignment cache.FlavorResourceQuantities) bool {
   158  	cohortUsage, cohortFound := (*cu)[cohort]
   159  	if !cohortFound {
   160  		return false
   161  	}
    162  	for flavor, assignmentResources := range assignment {
    163  		if cohortResources, found := cohortUsage[flavor]; found {
    164  			for resName := range assignmentResources {
   165  				if _, found := cohortResources[resName]; found {
   166  					return true
   167  				}
   168  			}
   169  		}
   170  	}
   171  	return false
   172  }
   173  
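         // schedule runs a single scheduling cycle: it pops the heads of the queues, snapshots
         // the cache, nominates and sorts the candidate workloads, admits (or preempts for) the
         // entries that fit, and requeues the rest.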
   174  func (s *Scheduler) schedule(ctx context.Context) {
   175  	log := ctrl.LoggerFrom(ctx)
   176  
   177  	// 1. Get the heads from the queues, including their desired clusterQueue.
   178  	// This operation blocks while the queues are empty.
   179  	headWorkloads := s.queues.Heads(ctx)
   180  	// If there are no elements, it means that the program is finishing.
   181  	if len(headWorkloads) == 0 {
   182  		return
   183  	}
   184  	startTime := time.Now()
   185  
   186  	// 2. Take a snapshot of the cache.
   187  	snapshot := s.cache.Snapshot()
   188  	logSnapshotIfVerbose(log, &snapshot)
   189  
   190  	// 3. Calculate requirements (resource flavors, borrowing) for admitting workloads.
   191  	entries := s.nominate(ctx, headWorkloads, snapshot)
   192  
   193  	// 4. Sort entries based on borrowing, priorities (if enabled) and timestamps.
   194  	sort.Sort(entryOrdering{
   195  		entries:          entries,
   196  		workloadOrdering: s.workloadOrdering,
   197  	})
   198  
   199  	// 5. Admit entries, ensuring that no more than one workload gets
   200  	// admitted by a cohort (if borrowing).
   201  	// This is because there can be other workloads deeper in a clusterQueue whose
   202  	// head got admitted that should be scheduled in the cohort before the heads
   203  	// of other clusterQueues.
   204  	cycleCohortsUsage := cohortsUsage{}
   205  	cycleCohortsSkipPreemption := sets.New[string]()
   206  	for i := range entries {
   207  		e := &entries[i]
   208  		mode := e.assignment.RepresentativeMode()
   209  		if mode == flavorassigner.NoFit {
   210  			continue
   211  		}
   212  
   213  		cq := snapshot.ClusterQueues[e.ClusterQueue]
   214  		if cq.Cohort != nil {
   215  			sum := cycleCohortsUsage.totalUsageForCommonFlavorResources(cq.Cohort.Name, e.assignment.Usage)
   216  			// Check whether there was an assignment in this cycle that could render the next assignments invalid:
   217  			// - If the workload no longer fits in the cohort.
   218  			// - If there was another assignment in the cohort, then the preemption calculation is no longer valid.
   219  			if cycleCohortsUsage.hasCommonFlavorResources(cq.Cohort.Name, e.assignment.Usage) &&
   220  				((mode == flavorassigner.Fit && !cq.FitInCohort(sum)) ||
   221  					(mode == flavorassigner.Preempt && cycleCohortsSkipPreemption.Has(cq.Cohort.Name))) {
   222  				e.status = skipped
   223  				e.inadmissibleMsg = "other workloads in the cohort were prioritized"
    224  				// If this workload needs borrowing while another workload in the cohort does not,
    225  				// the borrowing workload will be attempted again in a later cycle. In that case, do
    226  				// not skip the previously tried flavors.
   227  				e.LastAssignment = nil
   228  				continue
   229  			}
   230  			// Even if the workload will not be admitted after this point, due to preemption pending or other failures,
   231  			// we should still account for its usage.
   232  			cycleCohortsUsage.add(cq.Cohort.Name, resourcesToReserve(e, cq))
   233  		}
   234  		log := log.WithValues("workload", klog.KObj(e.Obj), "clusterQueue", klog.KRef("", e.ClusterQueue))
   235  		ctx := ctrl.LoggerInto(ctx, log)
   236  		if e.assignment.RepresentativeMode() != flavorassigner.Fit {
   237  			if len(e.preemptionTargets) != 0 {
   238  				// If preemptions are issued, the next attempt should try all the flavors.
   239  				e.LastAssignment = nil
   240  				preempted, err := s.preemptor.IssuePreemptions(ctx, e.preemptionTargets, cq)
   241  				if err != nil {
   242  					log.Error(err, "Failed to preempt workloads")
   243  				}
   244  				if preempted != 0 {
   245  					e.inadmissibleMsg += fmt.Sprintf(". Pending the preemption of %d workload(s)", preempted)
   246  					e.requeueReason = queue.RequeueReasonPendingPreemption
   247  				}
   248  				if cq.Cohort != nil {
   249  					cycleCohortsSkipPreemption.Insert(cq.Cohort.Name)
   250  				}
   251  			} else {
   252  				log.V(2).Info("Workload requires preemption, but there are no candidate workloads allowed for preemption", "preemption", cq.Preemption)
   253  			}
   254  			continue
   255  		}
   256  		if !s.cache.PodsReadyForAllAdmittedWorkloads(log) {
   257  			log.V(5).Info("Waiting for all admitted workloads to be in the PodsReady condition")
    258  			// If waitForPodsReady is enabled and waitForPodsReady.blockAdmission is true,
    259  			// block admission until all currently admitted workloads are in the
    260  			// PodsReady condition.
   261  			workload.UnsetQuotaReservationWithCondition(e.Obj, "Waiting", "waiting for all admitted workloads to be in PodsReady condition")
   262  			if err := workload.ApplyAdmissionStatus(ctx, s.client, e.Obj, false); err != nil {
   263  				log.Error(err, "Could not update Workload status")
   264  			}
   265  			s.cache.WaitForPodsReady(ctx)
   266  			log.V(5).Info("Finished waiting for all admitted workloads to be in the PodsReady condition")
   267  		}
   268  		e.status = nominated
   269  		if err := s.admit(ctx, e, cq.AdmissionChecks); err != nil {
   270  			e.inadmissibleMsg = fmt.Sprintf("Failed to admit workload: %v", err)
   271  		}
   272  		if cq.Cohort != nil {
   273  			cycleCohortsSkipPreemption.Insert(cq.Cohort.Name)
   274  		}
   275  	}
   276  
   277  	// 6. Requeue the heads that were not scheduled.
   278  	result := metrics.AdmissionResultInadmissible
   279  	for _, e := range entries {
   280  		logAdmissionAttemptIfVerbose(log, &e)
   281  		if e.status != assumed {
   282  			s.requeueAndUpdate(log, ctx, e)
   283  		} else {
   284  			result = metrics.AdmissionResultSuccess
   285  		}
   286  	}
   287  	metrics.AdmissionAttempt(result, time.Since(startTime))
   288  }
   289  
   290  type entryStatus string
   291  
   292  const (
    293  	// indicates that the workload was nominated for admission.
    294  	nominated entryStatus = "nominated"
    295  	// indicates that the workload was skipped in this cycle.
    296  	skipped entryStatus = "skipped"
    297  	// indicates that the workload was assumed to have been admitted.
   298  	assumed entryStatus = "assumed"
   299  	// indicates that the workload was never nominated for admission.
   300  	notNominated entryStatus = ""
   301  )
   302  
   303  // entry holds requirements for a workload to be admitted by a clusterQueue.
   304  type entry struct {
   305  	// workload.Info holds the workload from the API as well as resource usage
   306  	// and flavors assigned.
   307  	workload.Info
   308  	assignment        flavorassigner.Assignment
   309  	status            entryStatus
   310  	inadmissibleMsg   string
   311  	requeueReason     queue.RequeueReason
   312  	preemptionTargets []*workload.Info
   313  }
   314  
    315  // nominate returns the workloads with their requirements (resource flavors, borrowing) as if
    316  // they were to be admitted by the clusterQueues in the snapshot.
   317  func (s *Scheduler) nominate(ctx context.Context, workloads []workload.Info, snap cache.Snapshot) []entry {
   318  	log := ctrl.LoggerFrom(ctx)
   319  	entries := make([]entry, 0, len(workloads))
   320  	for _, w := range workloads {
   321  		log := log.WithValues("workload", klog.KObj(w.Obj), "clusterQueue", klog.KRef("", w.ClusterQueue))
   322  		cq := snap.ClusterQueues[w.ClusterQueue]
   323  		ns := corev1.Namespace{}
   324  		e := entry{Info: w}
   325  		if s.cache.IsAssumedOrAdmittedWorkload(w) {
    326  			log.Info("Workload skipped from admission because it's already assumed or admitted")
   327  			continue
   328  		} else if workload.HasRetryOrRejectedChecks(w.Obj) {
   329  			e.inadmissibleMsg = "The workload has failed admission checks"
   330  		} else if snap.InactiveClusterQueueSets.Has(w.ClusterQueue) {
   331  			e.inadmissibleMsg = fmt.Sprintf("ClusterQueue %s is inactive", w.ClusterQueue)
   332  		} else if cq == nil {
   333  			e.inadmissibleMsg = fmt.Sprintf("ClusterQueue %s not found", w.ClusterQueue)
   334  		} else if err := s.client.Get(ctx, types.NamespacedName{Name: w.Obj.Namespace}, &ns); err != nil {
   335  			e.inadmissibleMsg = fmt.Sprintf("Could not obtain workload namespace: %v", err)
   336  		} else if !cq.NamespaceSelector.Matches(labels.Set(ns.Labels)) {
   337  			e.inadmissibleMsg = "Workload namespace doesn't match ClusterQueue selector"
   338  			e.requeueReason = queue.RequeueReasonNamespaceMismatch
   339  		} else if err := s.validateResources(&w); err != nil {
   340  			e.inadmissibleMsg = err.Error()
   341  		} else if err := s.validateLimitRange(ctx, &w); err != nil {
   342  			e.inadmissibleMsg = err.Error()
   343  		} else {
   344  			e.assignment, e.preemptionTargets = s.getAssignments(log, &e.Info, &snap)
   345  			e.inadmissibleMsg = e.assignment.Message()
   346  			e.Info.LastAssignment = &e.assignment.LastState
   347  		}
   348  		entries = append(entries, e)
   349  	}
   350  	return entries
   351  }
   352  
    353  // resourcesToReserve calculates how much of the assignment's usage should be reserved in the clusterQueue/cohort.
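         // When the representative mode is Preempt, the reserved amount is capped at the quota
         // still unused by the clusterQueue: its nominal quota, or nominal plus borrowing limit
         // when the assignment borrows (uncapped if no borrowing limit is set).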
   354  func resourcesToReserve(e *entry, cq *cache.ClusterQueue) cache.FlavorResourceQuantities {
   355  	if e.assignment.RepresentativeMode() != flavorassigner.Preempt {
   356  		return e.assignment.Usage
   357  	}
   358  	reservedUsage := make(cache.FlavorResourceQuantities)
   359  	for flavor, resourceUsage := range e.assignment.Usage {
   360  		reservedUsage[flavor] = make(map[corev1.ResourceName]int64)
   361  		for resource, usage := range resourceUsage {
   362  			rg := cq.RGByResource[resource]
   363  			cqQuota := cache.ResourceQuota{}
   364  			for _, cqFlavor := range rg.Flavors {
   365  				if cqFlavor.Name == flavor {
   366  					cqQuota = *cqFlavor.Resources[resource]
   367  					break
   368  				}
   369  			}
   370  			if !e.assignment.Borrowing {
   371  				reservedUsage[flavor][resource] = max(0, min(usage, cqQuota.Nominal-cq.Usage[flavor][resource]))
   372  			} else {
   373  				if cqQuota.BorrowingLimit == nil {
   374  					reservedUsage[flavor][resource] = usage
   375  				} else {
   376  					reservedUsage[flavor][resource] = min(usage, cqQuota.Nominal+*cqQuota.BorrowingLimit-cq.Usage[flavor][resource])
   377  				}
   378  			}
   379  
   380  		}
   381  	}
   382  	return reservedUsage
   383  }
   384  
   385  type partialAssignment struct {
   386  	assignment        flavorassigner.Assignment
   387  	preemptionTargets []*workload.Info
   388  }
   389  
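         // getAssignments returns the flavor assignment for the workload and, when preemption is
         // required, the workloads targeted for preemption. If the full assignment doesn't fit,
         // no preemption targets exist, and the PartialAdmission feature gate is enabled, it
         // searches for a reduced pod count that can be admitted instead.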
   390  func (s *Scheduler) getAssignments(log logr.Logger, wl *workload.Info, snap *cache.Snapshot) (flavorassigner.Assignment, []*workload.Info) {
   391  	cq := snap.ClusterQueues[wl.ClusterQueue]
   392  	fullAssignment := flavorassigner.AssignFlavors(log, wl, snap.ResourceFlavors, cq, nil)
    393  	var faPreemptionTargets []*workload.Info
   394  
   395  	arm := fullAssignment.RepresentativeMode()
   396  	if arm == flavorassigner.Fit {
   397  		return fullAssignment, nil
   398  	}
   399  
   400  	if arm == flavorassigner.Preempt {
    401  		faPreemptionTargets = s.preemptor.GetTargets(*wl, fullAssignment, snap)
   402  	}
   403  
    404  	// Return the full assignment if the PartialAdmission feature gate is not enabled or preemption targets were found.
    405  	if !features.Enabled(features.PartialAdmission) || len(faPreemptionTargets) > 0 {
    406  		return fullAssignment, faPreemptionTargets
   407  	}
   408  
   409  	if wl.CanBePartiallyAdmitted() {
   410  		reducer := flavorassigner.NewPodSetReducer(wl.Obj.Spec.PodSets, func(nextCounts []int32) (*partialAssignment, bool) {
   411  			assignment := flavorassigner.AssignFlavors(log, wl, snap.ResourceFlavors, cq, nextCounts)
   412  			if assignment.RepresentativeMode() == flavorassigner.Fit {
   413  				return &partialAssignment{assignment: assignment}, true
   414  			}
   415  			preemptionTargets := s.preemptor.GetTargets(*wl, assignment, snap)
   416  			if len(preemptionTargets) > 0 {
   417  
   418  				return &partialAssignment{assignment: assignment, preemptionTargets: preemptionTargets}, true
   419  			}
   420  			return nil, false
   421  
   422  		})
   423  		if pa, found := reducer.Search(); found {
   424  			return pa.assignment, pa.preemptionTargets
   425  		}
   426  	}
   427  	return fullAssignment, nil
   428  }
   429  
   430  // validateResources validates that requested resources are less or equal
   431  // to limits.
   432  func (s *Scheduler) validateResources(wi *workload.Info) error {
   433  	podsetsPath := field.NewPath("podSets")
   434  	// requests should be less than limits.
   435  	allReasons := []string{}
   436  	for i := range wi.Obj.Spec.PodSets {
   437  		ps := &wi.Obj.Spec.PodSets[i]
   438  		psPath := podsetsPath.Child(ps.Name)
   439  		for i := range ps.Template.Spec.InitContainers {
   440  			c := ps.Template.Spec.InitContainers[i]
   441  			if list := resource.GetGreaterKeys(c.Resources.Requests, c.Resources.Limits); len(list) > 0 {
    442  				allReasons = append(allReasons, fmt.Sprintf("%s[%s] requests exceed its limits",
   443  					psPath.Child("initContainers").Index(i).String(),
   444  					strings.Join(list, ", ")))
   445  			}
   446  		}
   447  
   448  		for i := range ps.Template.Spec.Containers {
   449  			c := ps.Template.Spec.Containers[i]
   450  			if list := resource.GetGreaterKeys(c.Resources.Requests, c.Resources.Limits); len(list) > 0 {
    451  				allReasons = append(allReasons, fmt.Sprintf("%s[%s] requests exceed its limits",
   452  					psPath.Child("containers").Index(i).String(),
   453  					strings.Join(list, ", ")))
   454  			}
   455  		}
   456  	}
   457  	if len(allReasons) > 0 {
   458  		return fmt.Errorf("resource validation failed: %s", strings.Join(allReasons, "; "))
   459  	}
   460  	return nil
   461  }
   462  
   463  // validateLimitRange validates that the requested resources fit into the namespace defined
   464  // limitRanges.
   465  func (s *Scheduler) validateLimitRange(ctx context.Context, wi *workload.Info) error {
   466  	podsetsPath := field.NewPath("podSets")
   467  	// get the range summary from the namespace.
   468  	list := corev1.LimitRangeList{}
   469  	if err := s.client.List(ctx, &list, &client.ListOptions{Namespace: wi.Obj.Namespace}); err != nil {
   470  		return err
   471  	}
   472  	if len(list.Items) == 0 {
   473  		return nil
   474  	}
   475  	summary := limitrange.Summarize(list.Items...)
   476  
   477  	// verify
   478  	allReasons := []string{}
   479  	for i := range wi.Obj.Spec.PodSets {
   480  		ps := &wi.Obj.Spec.PodSets[i]
   481  		allReasons = append(allReasons, summary.ValidatePodSpec(&ps.Template.Spec, podsetsPath.Child(ps.Name))...)
   482  	}
   483  	if len(allReasons) > 0 {
   484  		return fmt.Errorf("didn't satisfy LimitRange constraints: %s", strings.Join(allReasons, "; "))
   485  	}
   486  	return nil
   487  }
   488  
   489  // admit sets the admitting clusterQueue and flavors into the workload of
   490  // the entry, and asynchronously updates the object in the apiserver after
   491  // assuming it in the cache.
   492  func (s *Scheduler) admit(ctx context.Context, e *entry, mustHaveChecks sets.Set[string]) error {
   493  	log := ctrl.LoggerFrom(ctx)
   494  	newWorkload := e.Obj.DeepCopy()
   495  	admission := &kueue.Admission{
   496  		ClusterQueue:      kueue.ClusterQueueReference(e.ClusterQueue),
   497  		PodSetAssignments: e.assignment.ToAPI(),
   498  	}
   499  
   500  	workload.SetQuotaReservation(newWorkload, admission)
   501  	if workload.HasAllChecks(newWorkload, mustHaveChecks) {
   502  		// sync Admitted, ignore the result since an API update is always done.
   503  		_ = workload.SyncAdmittedCondition(newWorkload)
   504  	}
   505  	if err := s.cache.AssumeWorkload(newWorkload); err != nil {
   506  		return err
   507  	}
   508  	e.status = assumed
   509  	log.V(2).Info("Workload assumed in the cache")
   510  
   511  	s.admissionRoutineWrapper.Run(func() {
   512  		err := s.applyAdmission(ctx, newWorkload)
   513  		if err == nil {
   514  			waitStarted := e.Obj.CreationTimestamp.Time
   515  			if c := apimeta.FindStatusCondition(e.Obj.Status.Conditions, kueue.WorkloadEvicted); c != nil {
   516  				waitStarted = c.LastTransitionTime.Time
   517  			}
   518  			waitTime := time.Since(waitStarted)
   519  			s.recorder.Eventf(newWorkload, corev1.EventTypeNormal, "QuotaReserved", "Quota reserved in ClusterQueue %v, wait time since queued was %.0fs", admission.ClusterQueue, waitTime.Seconds())
   520  			if workload.IsAdmitted(newWorkload) {
    521  				s.recorder.Eventf(newWorkload, corev1.EventTypeNormal, "Admitted", "Admitted by ClusterQueue %v, wait time since reservation was 0s", admission.ClusterQueue)
   522  			}
   523  			metrics.AdmittedWorkload(admission.ClusterQueue, waitTime)
   524  			log.V(2).Info("Workload successfully admitted and assigned flavors", "assignments", admission.PodSetAssignments)
   525  			return
   526  		}
   527  		// Ignore errors because the workload or clusterQueue could have been deleted
   528  		// by an event.
   529  		_ = s.cache.ForgetWorkload(newWorkload)
   530  		if errors.IsNotFound(err) {
   531  			log.V(2).Info("Workload not admitted because it was deleted")
   532  			return
   533  		}
   534  
   535  		log.Error(err, errCouldNotAdmitWL)
   536  		s.requeueAndUpdate(log, ctx, *e)
   537  	})
   538  
   539  	return nil
   540  }
   541  
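         // applyAdmissionWithSSA persists the workload's admission status in the apiserver
         // using server-side apply.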
   542  func (s *Scheduler) applyAdmissionWithSSA(ctx context.Context, w *kueue.Workload) error {
   543  	return workload.ApplyAdmissionStatus(ctx, s.client, w, false)
   544  }
   545  
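         // entryOrdering implements sort.Interface to order the nominated entries within a cycle.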
   546  type entryOrdering struct {
   547  	entries          []entry
   548  	workloadOrdering workload.Ordering
   549  }
   550  
   551  func (e entryOrdering) Len() int {
   552  	return len(e.entries)
   553  }
   554  
   555  func (e entryOrdering) Swap(i, j int) {
   556  	e.entries[i], e.entries[j] = e.entries[j], e.entries[i]
   557  }
   558  
   559  // Less is the ordering criteria:
   560  // 1. request under nominal quota before borrowing.
   561  // 2. higher priority first.
   562  // 3. FIFO on eviction or creation timestamp.
   563  func (e entryOrdering) Less(i, j int) bool {
   564  	a := e.entries[i]
   565  	b := e.entries[j]
   566  
   567  	// 1. Request under nominal quota.
   568  	aBorrows := a.assignment.Borrows()
   569  	bBorrows := b.assignment.Borrows()
   570  	if aBorrows != bBorrows {
   571  		return !aBorrows
   572  	}
   573  
   574  	// 2. Higher priority first if not disabled.
   575  	if features.Enabled(features.PrioritySortingWithinCohort) {
   576  		p1 := priority.Priority(a.Obj)
   577  		p2 := priority.Priority(b.Obj)
   578  		if p1 != p2 {
   579  			return p1 > p2
   580  		}
   581  	}
   582  
   583  	// 3. FIFO.
   584  	aComparisonTimestamp := e.workloadOrdering.GetQueueOrderTimestamp(a.Obj)
   585  	bComparisonTimestamp := e.workloadOrdering.GetQueueOrderTimestamp(b.Obj)
   586  	return aComparisonTimestamp.Before(bComparisonTimestamp)
   587  }
   588  
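         // requeueAndUpdate puts the entry back into its queue and, if the workload was never
         // nominated or was skipped, records the inadmissible reason in the workload status and
         // emits a "Pending" event.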
   589  func (s *Scheduler) requeueAndUpdate(log logr.Logger, ctx context.Context, e entry) {
   590  	if e.status != notNominated && e.requeueReason == queue.RequeueReasonGeneric {
   591  		// Failed after nomination is the only reason why a workload would be requeued downstream.
   592  		e.requeueReason = queue.RequeueReasonFailedAfterNomination
   593  	}
   594  	added := s.queues.RequeueWorkload(ctx, &e.Info, e.requeueReason)
   595  	log.V(2).Info("Workload re-queued", "workload", klog.KObj(e.Obj), "clusterQueue", klog.KRef("", e.ClusterQueue), "queue", klog.KRef(e.Obj.Namespace, e.Obj.Spec.QueueName), "requeueReason", e.requeueReason, "added", added)
   596  
   597  	if e.status == notNominated || e.status == skipped {
   598  		if workload.UnsetQuotaReservationWithCondition(e.Obj, "Pending", e.inadmissibleMsg) {
   599  			err := workload.ApplyAdmissionStatus(ctx, s.client, e.Obj, true)
   600  			if err != nil {
   601  				log.Error(err, "Could not update Workload status")
   602  			}
   603  		}
   604  		s.recorder.Eventf(e.Obj, corev1.EventTypeNormal, "Pending", api.TruncateEventMessage(e.inadmissibleMsg))
   605  	}
   606  }