sigs.k8s.io/kueue@v0.6.2/pkg/controller/core/workload_controller.go

/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package core

import (
	"context"
	"fmt"
	"time"

	"github.com/go-logr/logr"
	corev1 "k8s.io/api/core/v1"
	nodev1 "k8s.io/api/node/v1"
	"k8s.io/apimachinery/pkg/api/equality"
	apimeta "k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/tools/record"
	"k8s.io/client-go/util/workqueue"
	"k8s.io/klog/v2"
	"k8s.io/utils/clock"
	"k8s.io/utils/ptr"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/controller"
	"sigs.k8s.io/controller-runtime/pkg/event"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"

	config "sigs.k8s.io/kueue/apis/config/v1beta1"
	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/cache"
	"sigs.k8s.io/kueue/pkg/constants"
	"sigs.k8s.io/kueue/pkg/controller/core/indexer"
	"sigs.k8s.io/kueue/pkg/queue"
	"sigs.k8s.io/kueue/pkg/util/slices"
	"sigs.k8s.io/kueue/pkg/workload"
)

const (
	// statuses for logging purposes
	pending  = "pending"
	admitted = "admitted"
	finished = "finished"
)

var (
	realClock = clock.RealClock{}
)

type options struct {
	watchers                   []WorkloadUpdateWatcher
	podsReadyTimeout           *time.Duration
	requeuingBackoffLimitCount *int32
}

// Option configures the reconciler.
type Option func(*options)

// WithPodsReadyTimeout indicates if the controller should interrupt startup
// of a workload if it exceeds the timeout to reach the PodsReady=True condition.
func WithPodsReadyTimeout(value *time.Duration) Option {
	return func(o *options) {
		o.podsReadyTimeout = value
	}
}

// WithRequeuingBackoffLimitCount indicates if the controller should deactivate
// a workload once its requeue count reaches the limit.
func WithRequeuingBackoffLimitCount(value *int32) Option {
	return func(o *options) {
		o.requeuingBackoffLimitCount = value
	}
}

// WithWorkloadUpdateWatchers allows specifying the workload update watchers.
func WithWorkloadUpdateWatchers(value ...WorkloadUpdateWatcher) Option {
	return func(o *options) {
		o.watchers = value
	}
}

var defaultOptions = options{}

type WorkloadUpdateWatcher interface {
	NotifyWorkloadUpdate(oldWl, newWl *kueue.Workload)
}
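
// A WorkloadUpdateWatcher can be any type that wants to observe workload
// transitions without reconciling them itself. A minimal sketch (illustrative
// only; the loggingWatcher type below is hypothetical and not part of this
// package):
//
//	type loggingWatcher struct{ log logr.Logger }
//
//	func (w *loggingWatcher) NotifyWorkloadUpdate(oldWl, newWl *kueue.Workload) {
//		// oldWl is nil on create, newWl is nil on delete.
//		switch {
//		case oldWl == nil:
//			w.log.Info("workload created", "workload", klog.KObj(newWl))
//		case newWl == nil:
//			w.log.Info("workload deleted", "workload", klog.KObj(oldWl))
//		default:
//			w.log.Info("workload updated", "workload", klog.KObj(newWl))
//		}
//	}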

// WorkloadReconciler reconciles a Workload object
type WorkloadReconciler struct {
	log                        logr.Logger
	queues                     *queue.Manager
	cache                      *cache.Cache
	client                     client.Client
	watchers                   []WorkloadUpdateWatcher
	podsReadyTimeout           *time.Duration
	requeuingBackoffLimitCount *int32
	recorder                   record.EventRecorder
}

func NewWorkloadReconciler(client client.Client, queues *queue.Manager, cache *cache.Cache, recorder record.EventRecorder, opts ...Option) *WorkloadReconciler {
	options := defaultOptions
	for _, opt := range opts {
		opt(&options)
	}

	return &WorkloadReconciler{
		log:                        ctrl.Log.WithName("workload-reconciler"),
		client:                     client,
		queues:                     queues,
		cache:                      cache,
		watchers:                   options.watchers,
		podsReadyTimeout:           options.podsReadyTimeout,
		requeuingBackoffLimitCount: options.requeuingBackoffLimitCount,
		recorder:                   recorder,
	}
}
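
// Construction uses the functional-options pattern above. A minimal wiring
// sketch (illustrative; `mgr`, `queues`, `cc`, and the recorder name are
// assumptions standing in for the manager setup in main):
//
//	timeout := 5 * time.Minute
//	rec := NewWorkloadReconciler(
//		mgr.GetClient(),
//		queues, // *queue.Manager
//		cc,     // *cache.Cache
//		mgr.GetEventRecorderFor("kueue"),
//		WithPodsReadyTimeout(&timeout),
//		WithRequeuingBackoffLimitCount(ptr.To[int32](5)),
//	)
//
// Options left unset keep their zero values, so the reconciler degrades to
// "no PodsReady timeout" and "unlimited requeues" by default.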

// +kubebuilder:rbac:groups="",resources=events,verbs=create;watch;update;patch
// +kubebuilder:rbac:groups="",resources=limitranges,verbs=get;list;watch
// +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads/finalizers,verbs=update
// +kubebuilder:rbac:groups=node.k8s.io,resources=runtimeclasses,verbs=get;list;watch

func (r *WorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	var wl kueue.Workload
	if err := r.client.Get(ctx, req.NamespacedName, &wl); err != nil {
		// we'll ignore not-found errors, since there is nothing to do.
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}
	log := ctrl.LoggerFrom(ctx).WithValues("workload", klog.KObj(&wl))
	ctx = ctrl.LoggerInto(ctx, log)
	log.V(2).Info("Reconciling Workload")

	// If a deactivated workload is re-activated, we need to reset the RequeueState.
	if wl.Status.RequeueState != nil && ptr.Deref(wl.Spec.Active, true) && workload.IsEvictedByDeactivation(&wl) {
		wl.Status.RequeueState = nil
		return ctrl.Result{}, workload.ApplyAdmissionStatus(ctx, r.client, &wl, true)
	}

	if len(wl.ObjectMeta.OwnerReferences) == 0 && !wl.DeletionTimestamp.IsZero() {
		return ctrl.Result{}, workload.RemoveFinalizer(ctx, r.client, &wl)
	}

	if apimeta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadFinished) {
		return ctrl.Result{}, nil
	}

	cqName, cqOk := r.queues.ClusterQueueForWorkload(&wl)
	if cqOk {
		if updated, err := r.reconcileSyncAdmissionChecks(ctx, &wl, cqName); updated || err != nil {
			return ctrl.Result{}, err
		}
	}

	// If the workload is admitted, updating the status here would set the Admitted condition to
	// false before the workload's eviction.
	if !workload.IsAdmitted(&wl) && workload.SyncAdmittedCondition(&wl) {
		if err := workload.ApplyAdmissionStatus(ctx, r.client, &wl, true); err != nil {
			return ctrl.Result{}, err
		}
		if workload.IsAdmitted(&wl) {
			c := apimeta.FindStatusCondition(wl.Status.Conditions, kueue.WorkloadQuotaReserved)
			r.recorder.Eventf(&wl, corev1.EventTypeNormal, "Admitted", "Admitted by ClusterQueue %v, wait time since reservation was %.0fs", wl.Status.Admission.ClusterQueue, time.Since(c.LastTransitionTime.Time).Seconds())
		}
		return ctrl.Result{}, nil
	}

	if workload.HasQuotaReservation(&wl) {
		if evictionTriggered, err := r.reconcileCheckBasedEviction(ctx, &wl); evictionTriggered || err != nil {
			return ctrl.Result{}, err
		}

		if updated, err := r.reconcileOnClusterQueueActiveState(ctx, &wl, cqName); updated || err != nil {
			return ctrl.Result{}, err
		}

		return r.reconcileNotReadyTimeout(ctx, req, &wl)
	}

	// At this point the workload is not admitted; if it has rejected admission checks, mark it as finished.
	if rejectedChecks := workload.GetRejectedChecks(&wl); len(rejectedChecks) > 0 {
		log.V(3).Info("Workload has Rejected admission checks, Finish with failure")
		err := workload.UpdateStatus(ctx, r.client, &wl, kueue.WorkloadFinished,
			metav1.ConditionTrue,
			"AdmissionChecksRejected",
			fmt.Sprintf("Admission checks %v are rejected", rejectedChecks),
			constants.KueueName)
		if err == nil {
			for _, owner := range wl.OwnerReferences {
				uowner := unstructured.Unstructured{}
				uowner.SetKind(owner.Kind)
				uowner.SetAPIVersion(owner.APIVersion)
				uowner.SetName(owner.Name)
				uowner.SetNamespace(wl.Namespace)
				uowner.SetUID(owner.UID)
				r.recorder.Eventf(&uowner, corev1.EventTypeNormal, "WorkloadFinished", "Admission checks %v are rejected", rejectedChecks)
			}
		}
		return ctrl.Result{}, err
	}

	switch {
	case !r.queues.QueueForWorkloadExists(&wl):
		log.V(3).Info("Workload is inadmissible because of missing LocalQueue", "localQueue", klog.KRef(wl.Namespace, wl.Spec.QueueName))
		if workload.UnsetQuotaReservationWithCondition(&wl, "Inadmissible", fmt.Sprintf("LocalQueue %s doesn't exist", wl.Spec.QueueName)) {
			err := workload.ApplyAdmissionStatus(ctx, r.client, &wl, true)
			return ctrl.Result{}, client.IgnoreNotFound(err)
		}
	case !cqOk:
		log.V(3).Info("Workload is inadmissible because of missing ClusterQueue", "clusterQueue", klog.KRef("", cqName))
		if workload.UnsetQuotaReservationWithCondition(&wl, "Inadmissible", fmt.Sprintf("ClusterQueue %s doesn't exist", cqName)) {
			err := workload.ApplyAdmissionStatus(ctx, r.client, &wl, true)
			return ctrl.Result{}, client.IgnoreNotFound(err)
		}
	case !r.cache.ClusterQueueActive(cqName):
		log.V(3).Info("Workload is inadmissible because ClusterQueue is inactive", "clusterQueue", klog.KRef("", cqName))
		if workload.UnsetQuotaReservationWithCondition(&wl, "Inadmissible", fmt.Sprintf("ClusterQueue %s is inactive", cqName)) {
			err := workload.ApplyAdmissionStatus(ctx, r.client, &wl, true)
			return ctrl.Result{}, client.IgnoreNotFound(err)
		}
	}

	return ctrl.Result{}, nil
}

func (r *WorkloadReconciler) reconcileCheckBasedEviction(ctx context.Context, wl *kueue.Workload) (bool, error) {
	if apimeta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadEvicted) || !workload.HasRetryOrRejectedChecks(wl) {
		return false, nil
	}
	log := ctrl.LoggerFrom(ctx)
	log.V(3).Info("Workload is evicted due to admission checks")
	workload.SetEvictedCondition(wl, kueue.WorkloadEvictedByAdmissionCheck, "At least one admission check is false")
	err := workload.ApplyAdmissionStatus(ctx, r.client, wl, true)
	return true, client.IgnoreNotFound(err)
}

func (r *WorkloadReconciler) reconcileSyncAdmissionChecks(ctx context.Context, wl *kueue.Workload, cqName string) (bool, error) {
	// Because we need to react to ClusterQueue API events, reading the list of checks
	// from the cache could lead to race conditions; read the ClusterQueue from the API instead.
	queue := kueue.ClusterQueue{}
	if err := r.client.Get(ctx, types.NamespacedName{Name: cqName}, &queue); err != nil {
		return false, err
	}

	queueAdmissionChecks := queue.Spec.AdmissionChecks
	newChecks, shouldUpdate := syncAdmissionCheckConditions(wl.Status.AdmissionChecks, queueAdmissionChecks)
	if shouldUpdate {
		log := ctrl.LoggerFrom(ctx)
		log.V(3).Info("The workload needs admission checks updates", "clusterQueue", klog.KRef("", cqName), "admissionChecks", queueAdmissionChecks)
		wl.Status.AdmissionChecks = newChecks
		err := r.client.Status().Update(ctx, wl)
		return true, client.IgnoreNotFound(err)
	}
	return false, nil
}

func (r *WorkloadReconciler) reconcileOnClusterQueueActiveState(ctx context.Context, wl *kueue.Workload, cqName string) (bool, error) {
	queue := kueue.ClusterQueue{}
	err := r.client.Get(ctx, types.NamespacedName{Name: cqName}, &queue)
	if client.IgnoreNotFound(err) != nil {
		return false, err
	}

	queueStopPolicy := ptr.Deref(queue.Spec.StopPolicy, kueue.None)

	log := ctrl.LoggerFrom(ctx)
	if workload.IsAdmitted(wl) {
		if queueStopPolicy != kueue.HoldAndDrain {
			return false, nil
		}
		log.V(3).Info("Workload is evicted because the ClusterQueue is stopped", "clusterQueue", klog.KRef("", cqName))
		workload.SetEvictedCondition(wl, kueue.WorkloadEvictedByClusterQueueStopped, "The ClusterQueue is stopped")
		err := workload.ApplyAdmissionStatus(ctx, r.client, wl, true)
		return true, client.IgnoreNotFound(err)
	}

	if err != nil || !queue.DeletionTimestamp.IsZero() {
		log.V(3).Info("Workload is inadmissible because the ClusterQueue is terminating or missing", "clusterQueue", klog.KRef("", cqName))
		_ = workload.UnsetQuotaReservationWithCondition(wl, "Inadmissible", fmt.Sprintf("ClusterQueue %s is terminating or missing", cqName))
		return true, workload.ApplyAdmissionStatus(ctx, r.client, wl, true)
	}

	if queueStopPolicy != kueue.None {
		log.V(3).Info("Workload is inadmissible because the ClusterQueue is stopped", "clusterQueue", klog.KRef("", cqName))
		_ = workload.UnsetQuotaReservationWithCondition(wl, "Inadmissible", fmt.Sprintf("ClusterQueue %s is stopped", cqName))
		return true, workload.ApplyAdmissionStatus(ctx, r.client, wl, true)
	}

	return false, nil
}

func syncAdmissionCheckConditions(conds []kueue.AdmissionCheckState, queueChecks []string) ([]kueue.AdmissionCheckState, bool) {
	if len(queueChecks) == 0 {
		return nil, len(conds) > 0
	}

	shouldUpdate := false
	currentChecks := slices.ToRefMap(conds, func(c *kueue.AdmissionCheckState) string { return c.Name })
	for _, t := range queueChecks {
		if _, found := currentChecks[t]; !found {
			workload.SetAdmissionCheckState(&conds, kueue.AdmissionCheckState{
				Name:  t,
				State: kueue.CheckStatePending,
			})
			shouldUpdate = true
		}
	}

	// If the workload carries more check conditions than the queue has checks,
	// the stale conditions need to be cleaned up.
	if len(conds) > len(queueChecks) {
		newConds := make([]kueue.AdmissionCheckState, 0, len(queueChecks))
		queueChecksSet := sets.New(queueChecks...)
		shouldUpdate = true
		for i := range conds {
			c := &conds[i]
			if queueChecksSet.Has(c.Name) {
				newConds = append(newConds, *c)
			}
		}
		conds = newConds
	}
	return conds, shouldUpdate
}
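
// A worked example of the sync above (illustrative): if the workload carries
// states for checks {"a", "b"} and the ClusterQueue now lists {"b", "c"}, the
// function first appends a Pending state for "c", then drops "a" in the
// cleanup pass. With queueChecks empty, it returns (nil, true) whenever any
// stale state remains.
//
//	conds := []kueue.AdmissionCheckState{
//		{Name: "a", State: kueue.CheckStateReady},
//		{Name: "b", State: kueue.CheckStatePending},
//	}
//	newConds, changed := syncAdmissionCheckConditions(conds, []string{"b", "c"})
//	// changed == true; newConds holds "b" (unchanged) and "c" (Pending).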

func (r *WorkloadReconciler) reconcileNotReadyTimeout(ctx context.Context, req ctrl.Request, wl *kueue.Workload) (ctrl.Result, error) {
	log := ctrl.LoggerFrom(ctx)

	if !ptr.Deref(wl.Spec.Active, true) || apimeta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadEvicted) {
		// the workload has already been evicted by the PodsReadyTimeout or been deactivated.
		return ctrl.Result{}, nil
	}
	countingTowardsTimeout, recheckAfter := r.admittedNotReadyWorkload(wl, realClock)
	if !countingTowardsTimeout {
		return ctrl.Result{}, nil
	}
	if recheckAfter > 0 {
		log.V(4).Info("Workload not yet ready and did not exceed its timeout", "recheckAfter", recheckAfter)
		return ctrl.Result{RequeueAfter: recheckAfter}, nil
	}
	log.V(2).Info("Start the eviction of the workload due to exceeding the PodsReady timeout")
	if deactivated, err := r.triggerDeactivationOrBackoffRequeue(ctx, wl); deactivated || err != nil {
		return ctrl.Result{}, err
	}
	workload.SetEvictedCondition(wl, kueue.WorkloadEvictedByPodsReadyTimeout, fmt.Sprintf("Exceeded the PodsReady timeout %s", req.NamespacedName.String()))
	err := workload.ApplyAdmissionStatus(ctx, r.client, wl, true)
	return ctrl.Result{}, client.IgnoreNotFound(err)
}

// triggerDeactivationOrBackoffRequeue deactivates a workload (".spec.active"="false")
// if the number of requeues has already exceeded the requeuing backoff limit.
// Otherwise, it increments the requeue count and updates the time at which the
// workload is to be requeued. The first return value is true if the workload was deactivated.
func (r *WorkloadReconciler) triggerDeactivationOrBackoffRequeue(ctx context.Context, wl *kueue.Workload) (bool, error) {
	if !workload.HasRequeueState(wl) {
		wl.Status.RequeueState = &kueue.RequeueState{}
	}
	// If requeuingBackoffLimitCount is nil, the workload is requeued endlessly.
	requeuingCount := ptr.Deref(wl.Status.RequeueState.Count, 0) + 1
	if r.requeuingBackoffLimitCount != nil && requeuingCount > *r.requeuingBackoffLimitCount {
		wl.Spec.Active = ptr.To(false)
		if err := r.client.Update(ctx, wl); err != nil {
			return false, err
		}
		r.recorder.Eventf(wl, corev1.EventTypeNormal, kueue.WorkloadEvictedByDeactivation,
			"Deactivated Workload %q because it reached the re-queue backoffLimitCount", klog.KObj(wl))
		return true, nil
	}
	// Every backoff duration is about "1.41284738^(n-1)+Rand", where "n" represents the "requeuingCount"
	// and "Rand" represents the random jitter. During this time, the workload is treated as inadmissible and
	// other workloads get a chance to be admitted.
	// Considering the ".waitForPodsReady.timeout", a workload evicted with the PodsReadyTimeout reason
	// keeps being requeued for "t(n+1) + SUM[k=1,n](1.41284738^(k-1) + Rand)" seconds,
	// where "t" represents "waitForPodsReady.timeout".
	// Given a "backoffLimitCount" of "30" and the default "waitForPodsReady.timeout" of "300",
	// the result is roughly 24 hours (+Rand seconds).
	backoff := &wait.Backoff{
		Duration: 1 * time.Second,
		Factor:   1.41284738,
		Jitter:   0.0001,
		Steps:    int(requeuingCount),
	}
	var waitDuration time.Duration
	for backoff.Steps > 0 {
		waitDuration = backoff.Step()
	}
	wl.Status.RequeueState.RequeueAt = ptr.To(metav1.NewTime(time.Now().Add(waitDuration)))
	wl.Status.RequeueState.Count = &requeuingCount
	return false, nil
}
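
// A worked example of the backoff math above (illustrative, ignoring jitter):
// the loop steps wait.Backoff requeuingCount times and keeps the last value,
// so the n-th requeue waits about 1.41284738^(n-1) seconds: ~1s, ~1.4s, ~2s,
// ~2.8s, ... and roughly 22,500s (~6.3h) for the 30th requeue. Summed with the
// 31 PodsReady timeout windows of 300s each, the total is about 86,400s, i.e.
// the ~24 hours the comment above refers to. A standalone sketch of the same
// computation, for some assumed count n:
//
//	b := &wait.Backoff{Duration: time.Second, Factor: 1.41284738, Jitter: 0.0001, Steps: n}
//	var d time.Duration
//	for b.Steps > 0 {
//		d = b.Step() // after the loop, d is approximately 1.41284738^(n-1) seconds
//	}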

func (r *WorkloadReconciler) Create(e event.CreateEvent) bool {
	wl, isWorkload := e.Object.(*kueue.Workload)
	if !isWorkload {
		// this event will be handled by the LimitRange/RuntimeClass handler
		return true
	}
	defer r.notifyWatchers(nil, wl)
	status := workloadStatus(wl)
	log := r.log.WithValues("workload", klog.KObj(wl), "queue", wl.Spec.QueueName, "status", status)
	log.V(2).Info("Workload create event")

	if status == finished {
		return true
	}

	ctx := ctrl.LoggerInto(context.Background(), log)
	wlCopy := wl.DeepCopy()
	workload.AdjustResources(ctx, r.client, wlCopy)

	if !workload.HasQuotaReservation(wl) {
		if !r.queues.AddOrUpdateWorkload(wlCopy) {
			log.V(2).Info("Queue for workload didn't exist; ignored for now")
		}
		return true
	}
	if !r.cache.AddOrUpdateWorkload(wlCopy) {
		log.V(2).Info("ClusterQueue for workload didn't exist; ignored for now")
	}

	return true
}

func (r *WorkloadReconciler) Delete(e event.DeleteEvent) bool {
	wl, isWorkload := e.Object.(*kueue.Workload)
	if !isWorkload {
		// this event will be handled by the LimitRange/RuntimeClass handler
		return true
	}
	defer r.notifyWatchers(wl, nil)
	status := "unknown"
	if !e.DeleteStateUnknown {
		status = workloadStatus(wl)
	}
	log := r.log.WithValues("workload", klog.KObj(wl), "queue", wl.Spec.QueueName, "status", status)
	log.V(2).Info("Workload delete event")
	ctx := ctrl.LoggerInto(context.Background(), log)

	// When a ClusterQueue is assigned to a workload, we assume the workload in the
	// cache. If the delete state is unknown, the workload could have been assumed,
	// and we need to clear it from the cache.
	if workload.HasQuotaReservation(wl) || e.DeleteStateUnknown {
		// trigger the move of associated inadmissibleWorkloads if required.
		r.queues.QueueAssociatedInadmissibleWorkloadsAfter(ctx, wl, func() {
			// Delete the workload from cache while holding the queues lock
			// to guarantee that requeued workloads are taken into account before
			// the next scheduling cycle.
			if err := r.cache.DeleteWorkload(wl); err != nil {
				if !e.DeleteStateUnknown {
					log.Error(err, "Failed to delete workload from cache")
				}
			}
		})
	}

	// Even if the state is unknown, the last cached state tells us whether the
	// workload was in the queues and should be cleared from them.
	r.queues.DeleteWorkload(wl)

	return true
}

func (r *WorkloadReconciler) Update(e event.UpdateEvent) bool {
	oldWl, isWorkload := e.ObjectOld.(*kueue.Workload)
	if !isWorkload {
		// this event will be handled by the LimitRange/RuntimeClass handler
		return true
	}
	wl := e.ObjectNew.(*kueue.Workload)
	defer r.notifyWatchers(oldWl, wl)

	status := workloadStatus(wl)
	log := r.log.WithValues("workload", klog.KObj(wl), "queue", wl.Spec.QueueName, "status", status)
	ctx := ctrl.LoggerInto(context.Background(), log)
	active := ptr.Deref(wl.Spec.Active, true)

	prevQueue := oldWl.Spec.QueueName
	if prevQueue != wl.Spec.QueueName {
		log = log.WithValues("prevQueue", prevQueue)
	}
	prevStatus := workloadStatus(oldWl)
	if prevStatus != status {
		log = log.WithValues("prevStatus", prevStatus)
	}
	if workload.HasQuotaReservation(wl) {
		log = log.WithValues("clusterQueue", wl.Status.Admission.ClusterQueue)
	}
	if workload.HasQuotaReservation(oldWl) && (!workload.HasQuotaReservation(wl) || wl.Status.Admission.ClusterQueue != oldWl.Status.Admission.ClusterQueue) {
		log = log.WithValues("prevClusterQueue", oldWl.Status.Admission.ClusterQueue)
	}
	log.V(2).Info("Workload update event")

	wlCopy := wl.DeepCopy()
	// We do not handle the old workload here, as it will be deleted or replaced by the new one anyway.
	workload.AdjustResources(ctrl.LoggerInto(ctx, log), r.client, wlCopy)

	switch {
	case status == finished || !active:
		if !active {
			log.V(2).Info("Workload will not be queued because the workload is not active", "workload", klog.KObj(wl))
		}
		// The workload could have been in the queues if we missed an event.
		r.queues.DeleteWorkload(wl)

		// trigger the move of associated inadmissibleWorkloads, if there are any.
		r.queues.QueueAssociatedInadmissibleWorkloadsAfter(ctx, wl, func() {
			// Delete the workload from cache while holding the queues lock
			// to guarantee that requeued workloads are taken into account before
			// the next scheduling cycle.
			if err := r.cache.DeleteWorkload(oldWl); err != nil && prevStatus == admitted {
				log.Error(err, "Failed to delete workload from cache")
			}
		})

	case prevStatus == pending && status == pending:
		if !r.queues.UpdateWorkload(oldWl, wlCopy) {
			log.V(2).Info("Queue for updated workload didn't exist; ignoring for now")
		}

	case prevStatus == pending && status == admitted:
		r.queues.DeleteWorkload(oldWl)
		if !r.cache.AddOrUpdateWorkload(wlCopy) {
			log.V(2).Info("ClusterQueue for workload didn't exist; ignored for now")
		}
	case prevStatus == admitted && status == pending:
		// trigger the move of associated inadmissibleWorkloads, if there are any.
		r.queues.QueueAssociatedInadmissibleWorkloadsAfter(ctx, wl, func() {
			// Delete the workload from cache while holding the queues lock
			// to guarantee that requeued workloads are taken into account before
			// the next scheduling cycle.
			if err := r.cache.DeleteWorkload(wl); err != nil {
				log.Error(err, "Failed to delete workload from cache")
			}
		})
		var backoff time.Duration
		if wlCopy.Status.RequeueState != nil && wlCopy.Status.RequeueState.RequeueAt != nil {
			backoff = time.Until(wl.Status.RequeueState.RequeueAt.Time)
		}
		if backoff <= 0 {
			if !r.queues.AddOrUpdateWorkload(wlCopy) {
				log.V(2).Info("Queue for workload didn't exist; ignored for now")
			}
		} else {
			log.V(3).Info("Workload to be requeued after backoff", "backoff", backoff, "requeueAt", wl.Status.RequeueState.RequeueAt.Time)
			time.AfterFunc(backoff, func() {
				updatedWl := kueue.Workload{}
				err := r.client.Get(ctx, client.ObjectKeyFromObject(wl), &updatedWl)
				if err == nil && workloadStatus(&updatedWl) == pending {
					if !r.queues.AddOrUpdateWorkload(wlCopy) {
						log.V(2).Info("Queue for workload didn't exist; ignored for now")
					} else {
						log.V(3).Info("Workload requeued after backoff")
					}
				}
			})
		}
	case prevStatus == admitted && status == admitted && !equality.Semantic.DeepEqual(oldWl.Status.ReclaimablePods, wl.Status.ReclaimablePods):
		// trigger the move of associated inadmissibleWorkloads, if there are any.
		r.queues.QueueAssociatedInadmissibleWorkloadsAfter(ctx, wl, func() {
			// Update the workload in cache while holding the queues lock
			// to guarantee that requeued workloads are taken into account before
			// the next scheduling cycle.
			if err := r.cache.UpdateWorkload(oldWl, wlCopy); err != nil {
				log.Error(err, "Failed to update workload in cache")
			}
		})

	default:
		// Workload update in the cache is handled here; however, some fields are immutable
		// and are not supposed to actually change anything.
		if err := r.cache.UpdateWorkload(oldWl, wlCopy); err != nil {
			log.Error(err, "Updating workload in cache")
		}
	}

	return true
}

func (r *WorkloadReconciler) Generic(e event.GenericEvent) bool {
	r.log.V(3).Info("Ignore generic event", "obj", klog.KObj(e.Object), "kind", e.Object.GetObjectKind().GroupVersionKind())
	return false
}

func (r *WorkloadReconciler) notifyWatchers(oldWl, newWl *kueue.Workload) {
	for _, w := range r.watchers {
		w.NotifyWorkloadUpdate(oldWl, newWl)
	}
}

// SetupWithManager sets up the controller with the Manager.
func (r *WorkloadReconciler) SetupWithManager(mgr ctrl.Manager, cfg *config.Configuration) error {
	ruh := &resourceUpdatesHandler{
		r: r,
	}
	return ctrl.NewControllerManagedBy(mgr).
		For(&kueue.Workload{}).
		WithOptions(controller.Options{NeedLeaderElection: ptr.To(false)}).
		Watches(&corev1.LimitRange{}, ruh).
		Watches(&nodev1.RuntimeClass{}, ruh).
		Watches(&kueue.ClusterQueue{}, &workloadCqHandler{client: r.client}).
		WithEventFilter(r).
		Complete(WithLeadingManager(mgr, r, &kueue.Workload{}, cfg))
}
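
// The builder call above wires three watch sources into one reconciler:
// Workload events pass through the reconciler itself as an event filter
// (WithEventFilter(r) uses the Create/Update/Delete/Generic predicate methods
// defined in this file), while LimitRange/RuntimeClass and ClusterQueue events
// fan out to affected workloads via the two handlers below. A minimal caller
// sketch (illustrative; `rec`, `mgr`, and `cfg` are assumed to come from the
// manager setup):
//
//	if err := rec.SetupWithManager(mgr, cfg); err != nil {
//		return err // surface the wiring error at startup
//	}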

// admittedNotReadyWorkload returns a pair of values. The first boolean indicates
// whether the workload is currently counting towards the PodsReady timeout, i.e.
// it has the Admitted condition True and the PodsReady condition not equal to
// True (False or not set). The second value is the remaining time until the
// specified timeout is exceeded, counted from the later of the LastTransitionTimes
// of the Admitted and PodsReady conditions.
func (r *WorkloadReconciler) admittedNotReadyWorkload(wl *kueue.Workload, clock clock.Clock) (bool, time.Duration) {
	if r.podsReadyTimeout == nil {
		// the timeout is not configured for the workload controller
		return false, 0
	}
	if !workload.IsAdmitted(wl) {
		// the workload is not admitted so there is no need to time it out
		return false, 0
	}

	podsReadyCond := apimeta.FindStatusCondition(wl.Status.Conditions, kueue.WorkloadPodsReady)
	if podsReadyCond != nil && podsReadyCond.Status == metav1.ConditionTrue {
		return false, 0
	}
	admittedCond := apimeta.FindStatusCondition(wl.Status.Conditions, kueue.WorkloadAdmitted)
	elapsedTime := clock.Since(admittedCond.LastTransitionTime.Time)
	if podsReadyCond != nil && podsReadyCond.Status == metav1.ConditionFalse && podsReadyCond.LastTransitionTime.After(admittedCond.LastTransitionTime.Time) {
		elapsedTime = clock.Since(podsReadyCond.LastTransitionTime.Time)
	}
	waitFor := *r.podsReadyTimeout - elapsedTime
	if waitFor < 0 {
		waitFor = 0
	}
	return true, waitFor
}
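
// A worked example of the timing above (illustrative): with a 300s
// podsReadyTimeout, a workload admitted at t=0 whose PodsReady condition
// flipped to False at t=120 counts from t=120 (the later transition), so a
// call at t=200 returns (true, 220s) and Reconcile requeues with
// RequeueAfter=220s; from t=420 onward it returns (true, 0) and eviction
// starts. A test-style sketch, assuming testclock is
// k8s.io/utils/clock/testing:
//
//	fakeClock := testclock.NewFakeClock(admittedAt.Add(200 * time.Second))
//	counting, waitFor := r.admittedNotReadyWorkload(wl, fakeClock)
//	// counting == true, waitFor == 220*time.Second in the scenario above.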

func workloadStatus(w *kueue.Workload) string {
	if apimeta.IsStatusConditionTrue(w.Status.Conditions, kueue.WorkloadFinished) {
		return finished
	}
	if workload.HasQuotaReservation(w) {
		return admitted
	}
	return pending
}

type resourceUpdatesHandler struct {
	r *WorkloadReconciler
}

func (h *resourceUpdatesHandler) Create(ctx context.Context, e event.CreateEvent, q workqueue.RateLimitingInterface) {
	log := ctrl.LoggerFrom(ctx).WithValues("kind", e.Object.GetObjectKind())
	ctx = ctrl.LoggerInto(ctx, log)
	log.V(5).Info("Create event")
	h.handle(ctx, e.Object, q)
}

func (h *resourceUpdatesHandler) Update(ctx context.Context, e event.UpdateEvent, q workqueue.RateLimitingInterface) {
	log := ctrl.LoggerFrom(ctx).WithValues("kind", e.ObjectNew.GetObjectKind())
	ctx = ctrl.LoggerInto(ctx, log)
	log.V(5).Info("Update event")
	h.handle(ctx, e.ObjectNew, q)
}

func (h *resourceUpdatesHandler) Delete(ctx context.Context, e event.DeleteEvent, q workqueue.RateLimitingInterface) {
	log := ctrl.LoggerFrom(ctx).WithValues("kind", e.Object.GetObjectKind())
	ctx = ctrl.LoggerInto(ctx, log)
	log.V(5).Info("Delete event")
	h.handle(ctx, e.Object, q)
}

func (h *resourceUpdatesHandler) Generic(_ context.Context, _ event.GenericEvent, _ workqueue.RateLimitingInterface) {
}

func (h *resourceUpdatesHandler) handle(ctx context.Context, obj client.Object, q workqueue.RateLimitingInterface) {
	switch v := obj.(type) {
	case *corev1.LimitRange:
		log := ctrl.LoggerFrom(ctx).WithValues("limitRange", klog.KObj(v))
		ctx = ctrl.LoggerInto(ctx, log)
		h.queueReconcileForPending(ctx, q, client.InNamespace(v.Namespace))
	case *nodev1.RuntimeClass:
		log := ctrl.LoggerFrom(ctx).WithValues("runtimeClass", klog.KObj(v))
		ctx = ctrl.LoggerInto(ctx, log)
		h.queueReconcileForPending(ctx, q, client.MatchingFields{indexer.WorkloadRuntimeClassKey: v.Name})
	default:
		panic(v)
	}
}

func (h *resourceUpdatesHandler) queueReconcileForPending(ctx context.Context, _ workqueue.RateLimitingInterface, opts ...client.ListOption) {
	log := ctrl.LoggerFrom(ctx)
	lst := kueue.WorkloadList{}
	opts = append(opts, client.MatchingFields{indexer.WorkloadQuotaReservedKey: string(metav1.ConditionFalse)})
	err := h.r.client.List(ctx, &lst, opts...)
	if err != nil {
		log.Error(err, "Could not list pending workloads")
	}
	log.V(4).Info("Updating pending workload requests", "count", len(lst.Items))
	for _, w := range lst.Items {
		wlCopy := w.DeepCopy()
		log := log.WithValues("workload", klog.KObj(wlCopy))
		log.V(5).Info("Queue reconcile for workload")
		workload.AdjustResources(ctrl.LoggerInto(ctx, log), h.r.client, wlCopy)
		if !h.r.queues.AddOrUpdateWorkload(wlCopy) {
			log.V(2).Info("Queue for workload didn't exist")
		}
	}
}

type workloadCqHandler struct {
	client client.Client
}

var _ handler.EventHandler = (*workloadCqHandler)(nil)

// Create is called in response to a create event.
func (w *workloadCqHandler) Create(ctx context.Context, ev event.CreateEvent, wq workqueue.RateLimitingInterface) {
	if cq, isQueue := ev.Object.(*kueue.ClusterQueue); isQueue {
		w.queueReconcileForWorkloads(ctx, cq.Name, wq)
	}
}

// Update is called in response to an update event.
func (w *workloadCqHandler) Update(ctx context.Context, ev event.UpdateEvent, wq workqueue.RateLimitingInterface) {
	log := ctrl.LoggerFrom(ctx).WithValues("clusterQueue", klog.KObj(ev.ObjectNew))
	ctx = ctrl.LoggerInto(ctx, log)
	log.V(5).Info("Workload cluster queue update event")
	oldCq, oldIsQueue := ev.ObjectOld.(*kueue.ClusterQueue)
	newCq, newIsQueue := ev.ObjectNew.(*kueue.ClusterQueue)

	if !oldIsQueue || !newIsQueue {
		return
	}

	if !newCq.DeletionTimestamp.IsZero() ||
		!slices.CmpNoOrder(oldCq.Spec.AdmissionChecks, newCq.Spec.AdmissionChecks) ||
		!ptr.Equal(oldCq.Spec.StopPolicy, newCq.Spec.StopPolicy) {
		w.queueReconcileForWorkloads(ctx, newCq.Name, wq)
	}
}

// Delete is called in response to a delete event.
func (w *workloadCqHandler) Delete(ctx context.Context, ev event.DeleteEvent, wq workqueue.RateLimitingInterface) {
	if cq, isQueue := ev.Object.(*kueue.ClusterQueue); isQueue {
		w.queueReconcileForWorkloads(ctx, cq.Name, wq)
	}
}

// Generic is called in response to an event of an unknown type or a synthetic event triggered as a cron or
// external trigger request.
func (w *workloadCqHandler) Generic(_ context.Context, _ event.GenericEvent, _ workqueue.RateLimitingInterface) {
	// nothing to do here
}

func (w *workloadCqHandler) queueReconcileForWorkloads(ctx context.Context, cqName string, wq workqueue.RateLimitingInterface) {
	log := ctrl.LoggerFrom(ctx)
	lst := kueue.LocalQueueList{}
	err := w.client.List(ctx, &lst, client.MatchingFields{indexer.QueueClusterQueueKey: cqName})
	if err != nil {
		log.Error(err, "Could not list the ClusterQueue's LocalQueues")
	}
	for _, lq := range lst.Items {
		log := log.WithValues("localQueue", klog.KObj(&lq))
		ctx = ctrl.LoggerInto(ctx, log)
		w.queueReconcileForWorkloadsOfLocalQueue(ctx, lq.Namespace, lq.Name, wq)
	}
}

func (w *workloadCqHandler) queueReconcileForWorkloadsOfLocalQueue(ctx context.Context, namespace string, name string, wq workqueue.RateLimitingInterface) {
	log := ctrl.LoggerFrom(ctx)
	lst := kueue.WorkloadList{}
	err := w.client.List(ctx, &lst, &client.ListOptions{Namespace: namespace}, client.MatchingFields{indexer.WorkloadQueueKey: name})
	if err != nil {
		log.Error(err, "Could not list the LocalQueue's workloads")
	}
	for _, wl := range lst.Items {
		log := log.WithValues("workload", klog.KObj(&wl))
		req := reconcile.Request{
			NamespacedName: types.NamespacedName{
				Name:      wl.Name,
				Namespace: wl.Namespace,
			},
		}
		wq.Add(req)
		log.V(5).Info("Queued reconcile for workload")
	}
}
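
// The MatchingFields lookups above only work if the corresponding field
// indexes were registered with the manager at startup. A hedged sketch of such
// a registration (illustrative; in kueue the actual registration lives in
// pkg/controller/core/indexer, and the extraction function shown here is
// hypothetical):
//
//	err := mgr.GetFieldIndexer().IndexField(ctx, &kueue.Workload{}, indexer.WorkloadQueueKey,
//		func(obj client.Object) []string {
//			wl, ok := obj.(*kueue.Workload)
//			if !ok || wl.Spec.QueueName == "" {
//				return nil
//			}
//			return []string{wl.Spec.QueueName}
//		})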