sigs.k8s.io/kueue@v0.6.2/pkg/controller/jobframework/reconciler.go (about)

     1  /*
     2  Copyright 2023 The Kubernetes Authors.
     3  Licensed under the Apache License, Version 2.0 (the "License");
     4  you may not use this file except in compliance with the License.
     5  You may obtain a copy of the License at
     6      http://www.apache.org/licenses/LICENSE-2.0
     7  Unless required by applicable law or agreed to in writing, software
     8  distributed under the License is distributed on an "AS IS" BASIS,
     9  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    10  See the License for the specific language governing permissions and
    11  limitations under the License.
    12  */
    13  
    14  package jobframework
    15  
    16  import (
    17  	"context"
    18  	"errors"
    19  	"fmt"
    20  
    21  	"github.com/go-logr/logr"
    22  	corev1 "k8s.io/api/core/v1"
    23  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    24  	apimeta "k8s.io/apimachinery/pkg/api/meta"
    25  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    26  	"k8s.io/apimachinery/pkg/types"
    27  	"k8s.io/apimachinery/pkg/util/sets"
    28  	"k8s.io/apimachinery/pkg/util/validation"
    29  	"k8s.io/client-go/tools/record"
    30  	"k8s.io/klog/v2"
    31  	"k8s.io/utils/ptr"
    32  	ctrl "sigs.k8s.io/controller-runtime"
    33  	"sigs.k8s.io/controller-runtime/pkg/builder"
    34  	"sigs.k8s.io/controller-runtime/pkg/client"
    35  
    36  	configapi "sigs.k8s.io/kueue/apis/config/v1beta1"
    37  	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
    38  	"sigs.k8s.io/kueue/pkg/constants"
    39  	controllerconsts "sigs.k8s.io/kueue/pkg/controller/constants"
    40  	"sigs.k8s.io/kueue/pkg/features"
    41  	"sigs.k8s.io/kueue/pkg/podset"
    42  	"sigs.k8s.io/kueue/pkg/util/equality"
    43  	"sigs.k8s.io/kueue/pkg/util/kubeversion"
    44  	"sigs.k8s.io/kueue/pkg/util/maps"
    45  	utilpriority "sigs.k8s.io/kueue/pkg/util/priority"
    46  	"sigs.k8s.io/kueue/pkg/util/slices"
    47  	"sigs.k8s.io/kueue/pkg/workload"
    48  )
    49  
const (
	// FailedToStartFinishedReason is the reason recorded on the
	// WorkloadFinished condition when unsuspending the job fails with a
	// permanent error, i.e. when retrying the start cannot succeed.
	FailedToStartFinishedReason = "FailedToStart"
)
    53  
// Sentinel errors returned (possibly wrapped) by the job reconciler. Compare
// against them with errors.Is.
var (
	// ErrChildJobOwnerNotFound is returned by IsParentJobManaged when the job
	// has no controller owner reference.
	ErrChildJobOwnerNotFound = fmt.Errorf("owner isn't set even though %s annotation is set", controllerconsts.ParentWorkloadAnnotation)
	// ErrUnknownWorkloadOwner is wrapped when no empty object can be built
	// for the owner reference of a job.
	ErrUnknownWorkloadOwner = errors.New("workload owner is unknown")
	// ErrWorkloadOwnerNotFound is joined with the client error when fetching
	// the owner object fails.
	ErrWorkloadOwnerNotFound = errors.New("workload owner not found")
	// ErrNoMatchingWorkloads is wrapped by ensureOneWorkload when workloads
	// were deleted and none of the existing ones matched the job.
	ErrNoMatchingWorkloads = errors.New("no matching workloads")
	// ErrExtraWorkloads is wrapped by ensureOneWorkload when workloads were
	// deleted in addition to a matching one.
	ErrExtraWorkloads = errors.New("extra workloads")
)
    61  
// JobReconciler reconciles a GenericJob object
type JobReconciler struct {
	// client is used for all reads and writes of jobs and workloads.
	client client.Client
	// record emits events on the reconciled objects.
	record record.EventRecorder
	// manageJobsWithoutQueueName, when false, makes the reconciler ignore
	// jobs that have no queue name unless their parent job is managed.
	manageJobsWithoutQueueName bool
	// waitForPodsReady, when true, makes the reconciler maintain the
	// PodsReady condition on the workload.
	waitForPodsReady bool
}
    69  
// Options collects the settings that Option functions can configure.
// NewReconciler consumes ManageJobsWithoutQueueName and WaitForPodsReady;
// the remaining fields are only stored here by their With* option.
type Options struct {
	// ManageJobsWithoutQueueName is set by WithManageJobsWithoutQueueName.
	ManageJobsWithoutQueueName bool
	// WaitForPodsReady is set by WithWaitForPodsReady.
	WaitForPodsReady bool
	// KubeServerVersion is set by WithKubeServerVersion.
	KubeServerVersion *kubeversion.ServerVersionFetcher
	// IntegrationOptions key is "$GROUP/$VERSION, Kind=$KIND".
	IntegrationOptions map[string]any
	// EnabledFrameworks is set by WithEnabledFrameworks.
	EnabledFrameworks sets.Set[string]
	// ManagerName is set by WithManagerName.
	ManagerName string
}
    79  
// Option configures the reconciler by mutating the Options value it is
// given. Options are applied in order by ProcessOptions.
type Option func(*Options)
    82  
    83  func ProcessOptions(opts ...Option) Options {
    84  	options := defaultOptions
    85  	for _, opt := range opts {
    86  		opt(&options)
    87  	}
    88  	return options
    89  }
    90  
    91  // WithManageJobsWithoutQueueName indicates if the controller should reconcile
    92  // jobs that don't set the queue name annotation.
    93  func WithManageJobsWithoutQueueName(f bool) Option {
    94  	return func(o *Options) {
    95  		o.ManageJobsWithoutQueueName = f
    96  	}
    97  }
    98  
    99  // WithWaitForPodsReady indicates if the controller should add the PodsReady
   100  // condition to the workload when the corresponding job has all pods ready
   101  // or succeeded.
   102  func WithWaitForPodsReady(w *configapi.WaitForPodsReady) Option {
   103  	return func(o *Options) {
   104  		o.WaitForPodsReady = w != nil && w.Enable
   105  	}
   106  }
   107  
   108  func WithKubeServerVersion(v *kubeversion.ServerVersionFetcher) Option {
   109  	return func(o *Options) {
   110  		o.KubeServerVersion = v
   111  	}
   112  }
   113  
   114  // WithIntegrationOptions adds integrations options like podOptions.
   115  // The second arg, `opts` should be recognized as any option struct.
   116  func WithIntegrationOptions(integrationName string, opts any) Option {
   117  	return func(o *Options) {
   118  		if len(o.IntegrationOptions) == 0 {
   119  			o.IntegrationOptions = make(map[string]any)
   120  		}
   121  		o.IntegrationOptions[integrationName] = opts
   122  	}
   123  }
   124  
   125  // WithEnabledFrameworks adds framework names enabled in the ConfigAPI.
   126  func WithEnabledFrameworks(i *configapi.Integrations) Option {
   127  	return func(o *Options) {
   128  		if i == nil || len(i.Frameworks) == 0 {
   129  			return
   130  		}
   131  		o.EnabledFrameworks = sets.New(i.Frameworks...)
   132  	}
   133  }
   134  
   135  // WithManagerName adds the kueue's manager name.
   136  func WithManagerName(n string) Option {
   137  	return func(o *Options) {
   138  		o.ManagerName = n
   139  	}
   140  }
   141  
// defaultOptions is the zero-valued baseline that ProcessOptions copies
// before applying the caller-supplied options.
var defaultOptions = Options{}
   143  
   144  func NewReconciler(
   145  	client client.Client,
   146  	record record.EventRecorder,
   147  	opts ...Option) *JobReconciler {
   148  	options := ProcessOptions(opts...)
   149  
   150  	return &JobReconciler{
   151  		client:                     client,
   152  		record:                     record,
   153  		manageJobsWithoutQueueName: options.ManageJobsWithoutQueueName,
   154  		waitForPodsReady:           options.WaitForPodsReady,
   155  	}
   156  }
   157  
   158  func (r *JobReconciler) ReconcileGenericJob(ctx context.Context, req ctrl.Request, job GenericJob) (result ctrl.Result, err error) {
   159  	object := job.Object()
   160  	log := ctrl.LoggerFrom(ctx).WithValues("job", req.String(), "gvk", job.GVK())
   161  	ctx = ctrl.LoggerInto(ctx, log)
   162  
   163  	defer func() {
   164  		err = r.ignoreUnretryableError(log, err)
   165  	}()
   166  
   167  	dropFinalizers := false
   168  	if cJob, isComposable := job.(ComposableJob); isComposable {
   169  		dropFinalizers, err = cJob.Load(ctx, r.client, &req.NamespacedName)
   170  	} else {
   171  		err = r.client.Get(ctx, req.NamespacedName, object)
   172  		dropFinalizers = apierrors.IsNotFound(err) || !object.GetDeletionTimestamp().IsZero()
   173  	}
   174  
   175  	if jws, implements := job.(JobWithSkip); implements {
   176  		if jws.Skip() {
   177  			return ctrl.Result{}, nil
   178  		}
   179  	}
   180  
   181  	if dropFinalizers {
   182  		// Remove workload finalizer
   183  		workloads := &kueue.WorkloadList{}
   184  
   185  		if cJob, isComposable := job.(ComposableJob); isComposable {
   186  			var err error
   187  			workloads, err = cJob.ListChildWorkloads(ctx, r.client, req.NamespacedName)
   188  			if err != nil {
   189  				log.Error(err, "Removing finalizer")
   190  				return ctrl.Result{}, err
   191  			}
   192  		} else {
   193  			if err := r.client.List(ctx, workloads, client.InNamespace(req.Namespace),
   194  				client.MatchingFields{GetOwnerKey(job.GVK()): req.Name}); err != nil {
   195  				log.Error(err, "Unable to list child workloads")
   196  				return ctrl.Result{}, err
   197  			}
   198  		}
   199  		for i := range workloads.Items {
   200  			err := workload.RemoveFinalizer(ctx, r.client, &workloads.Items[i])
   201  			if client.IgnoreNotFound(err) != nil {
   202  				log.Error(err, "Removing finalizer")
   203  				return ctrl.Result{}, err
   204  			}
   205  		}
   206  
   207  		// Remove job finalizer
   208  		if !object.GetDeletionTimestamp().IsZero() {
   209  			if err = r.finalizeJob(ctx, job); err != nil {
   210  				return ctrl.Result{}, err
   211  			}
   212  		}
   213  		return ctrl.Result{}, nil
   214  	}
   215  
   216  	if err != nil {
   217  		return ctrl.Result{}, client.IgnoreNotFound(err)
   218  	}
   219  
   220  	isStandaloneJob := ParentWorkloadName(job) == ""
   221  
   222  	// when manageJobsWithoutQueueName is disabled we only reconcile jobs that have either
   223  	// queue-name or the parent-workload annotation set.
   224  	// If the parent-workload annotation is set, it also checks whether the parent job has queue-name label.
   225  	if !r.manageJobsWithoutQueueName && QueueName(job) == "" {
   226  		if isStandaloneJob {
   227  			log.V(3).Info("Neither queue-name label, nor parent-workload annotation is set, ignoring the job",
   228  				"queueName", QueueName(job), "parentWorkload", ParentWorkloadName(job))
   229  			return ctrl.Result{}, nil
   230  		}
   231  		isParentJobManaged, err := r.IsParentJobManaged(ctx, job.Object(), req.Namespace)
   232  		if err != nil {
   233  			log.Error(err, "couldn't check whether the parent job is managed by kueue")
   234  			return ctrl.Result{}, err
   235  		}
   236  		if !isParentJobManaged {
   237  			log.V(3).Info("parent-workload annotation is set, and the parent job doesn't have a queue-name label, ignoring the job",
   238  				"parentWorkload", ParentWorkloadName(job))
   239  			return ctrl.Result{}, nil
   240  		}
   241  	}
   242  
   243  	// if this is a non-standalone job, suspend the job if its parent workload is not found or not admitted.
   244  	if !isStandaloneJob {
   245  		_, finished := job.Finished()
   246  		if !finished && !job.IsSuspended() {
   247  			if parentWorkload, err := r.getParentWorkload(ctx, job, object); err != nil {
   248  				log.Error(err, "couldn't get the parent job workload")
   249  				return ctrl.Result{}, err
   250  			} else if parentWorkload == nil || !workload.IsAdmitted(parentWorkload) {
   251  				// suspend it
   252  				job.Suspend()
   253  				if err := r.client.Update(ctx, object); err != nil {
   254  					log.Error(err, "suspending child job failed")
   255  					return ctrl.Result{}, err
   256  				}
   257  				r.record.Event(object, corev1.EventTypeNormal, ReasonSuspended, "Kueue managed child job suspended")
   258  			}
   259  		}
   260  		return ctrl.Result{}, nil
   261  	}
   262  
   263  	log.V(2).Info("Reconciling Job")
   264  
   265  	// 1. make sure there is only a single existing instance of the workload.
   266  	// If there's no workload exists and job is unsuspended, we'll stop it immediately.
   267  	wl, err := r.ensureOneWorkload(ctx, job, object)
   268  	if err != nil {
   269  		return ctrl.Result{}, err
   270  	}
   271  
   272  	if wl != nil && apimeta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadFinished) {
   273  		if err := r.finalizeJob(ctx, job); err != nil {
   274  			return ctrl.Result{}, err
   275  		}
   276  
   277  		r.record.Eventf(object, corev1.EventTypeNormal, ReasonFinishedWorkload,
   278  			"Workload '%s' is declared finished", workload.Key(wl))
   279  		return ctrl.Result{}, workload.RemoveFinalizer(ctx, r.client, wl)
   280  	}
   281  
   282  	// 1.1 If the workload is pending deletion, suspend the job if needed
   283  	// and drop the finalizer.
   284  	if wl != nil && !wl.DeletionTimestamp.IsZero() {
   285  		log.V(2).Info("The workload is marked for deletion")
   286  		err := r.stopJob(ctx, job, wl, StopReasonWorkloadDeleted, "Workload is deleted")
   287  		if err != nil {
   288  			log.Error(err, "Suspending job with deleted workload")
   289  		}
   290  
   291  		if err == nil && wl != nil {
   292  			err = workload.RemoveFinalizer(ctx, r.client, wl)
   293  		}
   294  		return ctrl.Result{}, err
   295  	}
   296  
   297  	// 2. handle job is finished.
   298  	if condition, finished := job.Finished(); finished {
   299  		if wl != nil && !apimeta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadFinished) {
   300  			err := workload.UpdateStatus(ctx, r.client, wl, condition.Type, condition.Status, condition.Reason, condition.Message, constants.JobControllerName)
   301  			if err != nil && !apierrors.IsNotFound(err) {
   302  				return ctrl.Result{}, err
   303  			}
   304  			r.record.Eventf(object, corev1.EventTypeNormal, ReasonFinishedWorkload,
   305  				"Workload '%s' is declared finished", workload.Key(wl))
   306  		}
   307  
   308  		// Execute job finalization logic
   309  		if err := r.finalizeJob(ctx, job); err != nil {
   310  			return ctrl.Result{}, err
   311  		}
   312  
   313  		return ctrl.Result{}, nil
   314  	}
   315  
   316  	// 3. handle workload is nil.
   317  	if wl == nil {
   318  		err := r.handleJobWithNoWorkload(ctx, job, object)
   319  		if err != nil {
   320  			if IsUnretryableError(err) {
   321  				log.V(3).Info("Handling job with no workload", "unretryableError", err)
   322  			} else {
   323  				log.Error(err, "Handling job with no workload")
   324  			}
   325  		}
   326  		return ctrl.Result{}, err
   327  	}
   328  
   329  	// 4. update reclaimable counts if implemented by the job
   330  	if jobRecl, implementsReclaimable := job.(JobWithReclaimablePods); implementsReclaimable {
   331  		reclPods, err := jobRecl.ReclaimablePods()
   332  		if err != nil {
   333  			log.Error(err, "Getting reclaimable pods")
   334  			return ctrl.Result{}, err
   335  		}
   336  
   337  		if !workload.ReclaimablePodsAreEqual(reclPods, wl.Status.ReclaimablePods) {
   338  			err = workload.UpdateReclaimablePods(ctx, r.client, wl, reclPods)
   339  			if err != nil {
   340  				log.Error(err, "Updating reclaimable pods")
   341  				return ctrl.Result{}, err
   342  			}
   343  			return ctrl.Result{}, nil
   344  		}
   345  	}
   346  
   347  	// 5. handle WaitForPodsReady only for a standalone job.
   348  	// handle a job when waitForPodsReady is enabled, and it is the main job
   349  	if r.waitForPodsReady {
   350  		log.V(5).Info("Handling a job when waitForPodsReady is enabled")
   351  		condition := generatePodsReadyCondition(job, wl)
   352  		// optimization to avoid sending the update request if the status didn't change
   353  		if !apimeta.IsStatusConditionPresentAndEqual(wl.Status.Conditions, condition.Type, condition.Status) {
   354  			log.V(3).Info(fmt.Sprintf("Updating the PodsReady condition with status: %v", condition.Status))
   355  			apimeta.SetStatusCondition(&wl.Status.Conditions, condition)
   356  			err := workload.UpdateStatus(ctx, r.client, wl, condition.Type, condition.Status, condition.Reason, condition.Message, constants.JobControllerName)
   357  			if err != nil {
   358  				log.Error(err, "Updating workload status")
   359  			}
   360  		}
   361  	}
   362  
   363  	// 6. handle eviction
   364  	if evCond := apimeta.FindStatusCondition(wl.Status.Conditions, kueue.WorkloadEvicted); evCond != nil && evCond.Status == metav1.ConditionTrue {
   365  		if err := r.stopJob(ctx, job, wl, StopReasonWorkloadEvicted, evCond.Message); err != nil {
   366  			return ctrl.Result{}, err
   367  		}
   368  		if workload.HasQuotaReservation(wl) {
   369  			if !job.IsActive() {
   370  				log.V(6).Info("The job is no longer active, clear the workloads admission")
   371  				_ = workload.UnsetQuotaReservationWithCondition(wl, "Pending", evCond.Message)
   372  				err := workload.ApplyAdmissionStatus(ctx, r.client, wl, true)
   373  				if err != nil {
   374  					return ctrl.Result{}, fmt.Errorf("clearing admission: %w", err)
   375  				}
   376  			}
   377  		}
   378  		return ctrl.Result{}, nil
   379  	}
   380  
   381  	// 7. handle job is suspended.
   382  	if job.IsSuspended() {
   383  		// start the job if the workload has been admitted, and the job is still suspended
   384  		if workload.IsAdmitted(wl) {
   385  			log.V(2).Info("Job admitted, unsuspending")
   386  			err := r.startJob(ctx, job, object, wl)
   387  			if err != nil {
   388  				log.Error(err, "Unsuspending job")
   389  				if podset.IsPermanent(err) {
   390  					// Mark the workload as finished with failure since the is no point to retry.
   391  					errUpdateStatus := workload.UpdateStatus(ctx, r.client, wl, kueue.WorkloadFinished, metav1.ConditionTrue, FailedToStartFinishedReason, err.Error(), constants.JobControllerName)
   392  					if errUpdateStatus != nil {
   393  						log.Error(errUpdateStatus, "Updating workload status, on start failure %s", err.Error())
   394  					}
   395  					return ctrl.Result{}, errUpdateStatus
   396  				}
   397  			}
   398  			return ctrl.Result{}, err
   399  		}
   400  
   401  		// update queue name if changed.
   402  		q := QueueName(job)
   403  		if wl.Spec.QueueName != q {
   404  			log.V(2).Info("Job changed queues, updating workload")
   405  			wl.Spec.QueueName = q
   406  			err := r.client.Update(ctx, wl)
   407  			if err != nil {
   408  				log.Error(err, "Updating workload queue")
   409  			}
   410  			return ctrl.Result{}, err
   411  		}
   412  		log.V(3).Info("Job is suspended and workload not yet admitted by a clusterQueue, nothing to do")
   413  		return ctrl.Result{}, nil
   414  	}
   415  
   416  	// 8. handle workload is deactivated.
   417  	if !ptr.Deref(wl.Spec.Active, true) {
   418  		workload.SetEvictedCondition(wl, kueue.WorkloadEvictedByDeactivation, "The workload is deactivated")
   419  		err := workload.ApplyAdmissionStatus(ctx, r.client, wl, true)
   420  		if err != nil {
   421  			return ctrl.Result{}, fmt.Errorf("setting eviction: %w", err)
   422  		}
   423  		return ctrl.Result{}, nil
   424  	}
   425  
   426  	// 9. handle job is unsuspended.
   427  	if !workload.IsAdmitted(wl) {
   428  		// the job must be suspended if the workload is not yet admitted.
   429  		log.V(2).Info("Running job is not admitted by a cluster queue, suspending")
   430  		err := r.stopJob(ctx, job, wl, StopReasonNotAdmitted, "Not admitted by cluster queue")
   431  		if err != nil {
   432  			log.Error(err, "Suspending job with non admitted workload")
   433  		}
   434  		return ctrl.Result{}, err
   435  	}
   436  
   437  	// workload is admitted and job is running, nothing to do.
   438  	log.V(3).Info("Job running with admitted workload, nothing to do")
   439  	return ctrl.Result{}, nil
   440  }
   441  
   442  // IsParentJobManaged checks whether the parent job is managed by kueue.
   443  func (r *JobReconciler) IsParentJobManaged(ctx context.Context, jobObj client.Object, namespace string) (bool, error) {
   444  	owner := metav1.GetControllerOf(jobObj)
   445  	if owner == nil {
   446  		return false, ErrChildJobOwnerNotFound
   447  	}
   448  	parentJob := GetEmptyOwnerObject(owner)
   449  	if parentJob == nil {
   450  		return false, fmt.Errorf("workload owner %v: %w", owner, ErrUnknownWorkloadOwner)
   451  	}
   452  	if err := r.client.Get(ctx, client.ObjectKey{Name: owner.Name, Namespace: namespace}, parentJob); err != nil {
   453  		return false, errors.Join(ErrWorkloadOwnerNotFound, err)
   454  	}
   455  	return QueueNameForObject(parentJob) != "", nil
   456  }
   457  
   458  func (r *JobReconciler) getParentWorkload(ctx context.Context, job GenericJob, object client.Object) (*kueue.Workload, error) {
   459  	pw := kueue.Workload{}
   460  	namespacedName := types.NamespacedName{
   461  		Name:      ParentWorkloadName(job),
   462  		Namespace: object.GetNamespace(),
   463  	}
   464  	if err := r.client.Get(ctx, namespacedName, &pw); err != nil {
   465  		return nil, client.IgnoreNotFound(err)
   466  	} else {
   467  		return &pw, nil
   468  	}
   469  }
   470  
   471  // ensureOneWorkload will query for the single matched workload corresponding to job and return it.
   472  // If there are more than one workload, we should delete the excess ones.
   473  // The returned workload could be nil.
   474  func (r *JobReconciler) ensureOneWorkload(ctx context.Context, job GenericJob, object client.Object) (*kueue.Workload, error) {
   475  	log := ctrl.LoggerFrom(ctx)
   476  
   477  	if prebuiltWorkloadName, usePrebuiltWorkload := PrebuiltWorkloadFor(job); usePrebuiltWorkload {
   478  		wl := &kueue.Workload{}
   479  		err := r.client.Get(ctx, types.NamespacedName{Name: prebuiltWorkloadName, Namespace: object.GetNamespace()}, wl)
   480  		if err != nil {
   481  			return nil, client.IgnoreNotFound(err)
   482  		}
   483  
   484  		if owns, err := r.ensurePrebuiltWorkloadOwnership(ctx, wl, object); !owns || err != nil {
   485  			return nil, err
   486  		}
   487  
   488  		if inSync, err := r.ensurePrebuiltWorkloadInSync(ctx, wl, job); !inSync || err != nil {
   489  			return nil, err
   490  		}
   491  		return wl, nil
   492  	}
   493  
   494  	// Find a matching workload first if there is one.
   495  	var toDelete []*kueue.Workload
   496  	var match *kueue.Workload
   497  	if cj, implements := job.(ComposableJob); implements {
   498  		var err error
   499  		match, toDelete, err = cj.FindMatchingWorkloads(ctx, r.client, r.record)
   500  		if err != nil {
   501  			log.Error(err, "Composable job is unable to find matching workloads")
   502  			return nil, err
   503  		}
   504  	} else {
   505  		var err error
   506  		match, toDelete, err = FindMatchingWorkloads(ctx, r.client, job)
   507  		if err != nil {
   508  			log.Error(err, "Unable to list child workloads")
   509  			return nil, err
   510  		}
   511  	}
   512  
   513  	var toUpdate *kueue.Workload
   514  	if match == nil && len(toDelete) > 0 && job.IsSuspended() && !workload.HasQuotaReservation(toDelete[0]) {
   515  		toUpdate = toDelete[0]
   516  		toDelete = toDelete[1:]
   517  	}
   518  
   519  	// If there is no matching workload and the job is running, suspend it.
   520  	if match == nil && !job.IsSuspended() {
   521  		log.V(2).Info("job with no matching workload, suspending")
   522  		var w *kueue.Workload
   523  		if len(toDelete) == 1 {
   524  			// The job may have been modified and hence the existing workload
   525  			// doesn't match the job anymore. All bets are off if there are more
   526  			// than one workload...
   527  			w = toDelete[0]
   528  		}
   529  
   530  		if _, finished := job.Finished(); !finished {
   531  			var msg string
   532  			if w == nil {
   533  				msg = "Missing Workload; unable to restore pod templates"
   534  			} else {
   535  				msg = "No matching Workload; restoring pod templates according to existent Workload"
   536  			}
   537  			if err := r.stopJob(ctx, job, w, StopReasonNoMatchingWorkload, msg); err != nil {
   538  				return nil, fmt.Errorf("stopping job with no matching workload: %w", err)
   539  			}
   540  		}
   541  	}
   542  
   543  	// Delete duplicate workload instances.
   544  	existedWls := 0
   545  	for _, wl := range toDelete {
   546  		wlKey := workload.Key(wl)
   547  		err := workload.RemoveFinalizer(ctx, r.client, wl)
   548  		if err != nil && !apierrors.IsNotFound(err) {
   549  			return nil, fmt.Errorf("failed to remove workload finalizer for: %w ", err)
   550  		}
   551  
   552  		err = r.client.Delete(ctx, wl)
   553  		if err != nil && !apierrors.IsNotFound(err) {
   554  			return nil, fmt.Errorf("deleting not matching workload: %w", err)
   555  		}
   556  		if err == nil {
   557  			existedWls++
   558  			r.record.Eventf(object, corev1.EventTypeNormal, ReasonDeletedWorkload,
   559  				"Deleted not matching Workload: %v", wlKey)
   560  		}
   561  	}
   562  
   563  	if existedWls != 0 {
   564  		if match == nil {
   565  			return nil, fmt.Errorf("%w: deleted %d workloads", ErrNoMatchingWorkloads, len(toDelete))
   566  		}
   567  		return nil, fmt.Errorf("%w: deleted %d workloads", ErrExtraWorkloads, len(toDelete))
   568  	}
   569  
   570  	if toUpdate != nil {
   571  		return r.updateWorkloadToMatchJob(ctx, job, object, toUpdate)
   572  	}
   573  
   574  	return match, nil
   575  }
   576  
   577  func FindMatchingWorkloads(ctx context.Context, c client.Client, job GenericJob) (match *kueue.Workload, toDelete []*kueue.Workload, err error) {
   578  	object := job.Object()
   579  
   580  	workloads := &kueue.WorkloadList{}
   581  	if err := c.List(ctx, workloads, client.InNamespace(object.GetNamespace()),
   582  		client.MatchingFields{GetOwnerKey(job.GVK()): object.GetName()}); err != nil {
   583  		return nil, nil, err
   584  	}
   585  
   586  	for i := range workloads.Items {
   587  		w := &workloads.Items[i]
   588  		if match == nil && equivalentToWorkload(ctx, c, job, w) {
   589  			match = w
   590  		} else {
   591  			toDelete = append(toDelete, w)
   592  		}
   593  	}
   594  
   595  	return match, toDelete, nil
   596  }
   597  
   598  func (r *JobReconciler) ensurePrebuiltWorkloadOwnership(ctx context.Context, wl *kueue.Workload, object client.Object) (bool, error) {
   599  	if !metav1.IsControlledBy(wl, object) {
   600  		if err := ctrl.SetControllerReference(object, wl, r.client.Scheme()); err != nil {
   601  			// don't return an error here, since a retry cannot give a different result,
   602  			// log the error.
   603  			log := ctrl.LoggerFrom(ctx)
   604  			log.Error(err, "Cannot take ownership of the workload")
   605  			return false, nil
   606  		}
   607  
   608  		if errs := validation.IsValidLabelValue(string(object.GetUID())); len(errs) == 0 {
   609  			wl.Labels = maps.MergeKeepFirst(map[string]string{controllerconsts.JobUIDLabel: string(object.GetUID())}, wl.Labels)
   610  		}
   611  
   612  		if err := r.client.Update(ctx, wl); err != nil {
   613  			return false, err
   614  		}
   615  	}
   616  	return true, nil
   617  }
   618  
   619  func (r *JobReconciler) ensurePrebuiltWorkloadInSync(ctx context.Context, wl *kueue.Workload, job GenericJob) (bool, error) {
   620  	if !equivalentToWorkload(ctx, r.client, job, wl) {
   621  		// mark the workload as finished
   622  		err := workload.UpdateStatus(ctx, r.client, wl,
   623  			kueue.WorkloadFinished,
   624  			metav1.ConditionTrue,
   625  			"OutOfSync",
   626  			"The prebuilt workload is out of sync with its user job",
   627  			constants.JobControllerName)
   628  		return false, err
   629  	}
   630  	return true, nil
   631  }
   632  
   633  // expectedRunningPodSets gets the expected podsets during the job execution, returns nil if the workload has no reservation or
   634  // the admission does not match.
   635  func expectedRunningPodSets(ctx context.Context, c client.Client, wl *kueue.Workload) []kueue.PodSet {
   636  	if !workload.HasQuotaReservation(wl) {
   637  		return nil
   638  	}
   639  	info, err := getPodSetsInfoFromStatus(ctx, c, wl)
   640  	if err != nil {
   641  		return nil
   642  	}
   643  	infoMap := slices.ToRefMap(info, func(psi *podset.PodSetInfo) string { return psi.Name })
   644  	runningPodSets := wl.Spec.DeepCopy().PodSets
   645  	canBePartiallyAdmitted := workload.CanBePartiallyAdmitted(wl)
   646  	for i := range runningPodSets {
   647  		ps := &runningPodSets[i]
   648  		psi, found := infoMap[ps.Name]
   649  		if !found {
   650  			return nil
   651  		}
   652  		err := podset.Merge(&ps.Template.ObjectMeta, &ps.Template.Spec, *psi)
   653  		if err != nil {
   654  			return nil
   655  		}
   656  		if canBePartiallyAdmitted && ps.MinCount != nil {
   657  			// update the expected running count
   658  			ps.Count = psi.Count
   659  		}
   660  	}
   661  	return runningPodSets
   662  }
   663  
   664  // equivalentToWorkload checks if the job corresponds to the workload
   665  func equivalentToWorkload(ctx context.Context, c client.Client, job GenericJob, wl *kueue.Workload) bool {
   666  	owner := metav1.GetControllerOf(wl)
   667  	// Indexes don't work in unit tests, so we explicitly check for the
   668  	// owner here.
   669  	if owner.Name != job.Object().GetName() {
   670  		return false
   671  	}
   672  
   673  	jobPodSets := clearMinCountsIfFeatureDisabled(job.PodSets())
   674  
   675  	if runningPodSets := expectedRunningPodSets(ctx, c, wl); runningPodSets != nil {
   676  		if equality.ComparePodSetSlices(jobPodSets, runningPodSets) {
   677  			return true
   678  		}
   679  		// If the workload is admitted but the job is suspended, do the check
   680  		// against the non-running info.
   681  		// This might allow some violating jobs to pass equivalency checks, but their
   682  		// workloads would be invalidated in the next sync after unsuspending.
   683  		return job.IsSuspended() && equality.ComparePodSetSlices(jobPodSets, wl.Spec.PodSets)
   684  	}
   685  
   686  	return equality.ComparePodSetSlices(jobPodSets, wl.Spec.PodSets)
   687  }
   688  
   689  func (r *JobReconciler) updateWorkloadToMatchJob(ctx context.Context, job GenericJob, object client.Object, wl *kueue.Workload) (*kueue.Workload, error) {
   690  	newWl, err := r.constructWorkload(ctx, job, object)
   691  	if err != nil {
   692  		return nil, fmt.Errorf("can't construct workload for update: %w", err)
   693  	}
   694  	err = r.prepareWorkload(ctx, job, newWl)
   695  	if err != nil {
   696  		return nil, fmt.Errorf("can't construct workload for update: %w", err)
   697  	}
   698  	wl.Spec = newWl.Spec
   699  	if err = r.client.Update(ctx, wl); err != nil {
   700  		return nil, fmt.Errorf("updating existed workload: %w", err)
   701  	}
   702  
   703  	r.record.Eventf(object, corev1.EventTypeNormal, ReasonUpdatedWorkload,
   704  		"Updated not matching Workload for suspended job: %v", klog.KObj(wl))
   705  	return newWl, nil
   706  }
   707  
   708  // startJob will unsuspend the job, and also inject the node affinity.
   709  func (r *JobReconciler) startJob(ctx context.Context, job GenericJob, object client.Object, wl *kueue.Workload) error {
   710  	info, err := getPodSetsInfoFromStatus(ctx, r.client, wl)
   711  	if err != nil {
   712  		return err
   713  	}
   714  	msg := fmt.Sprintf("Admitted by clusterQueue %v", wl.Status.Admission.ClusterQueue)
   715  
   716  	if cj, implements := job.(ComposableJob); implements {
   717  		if err := cj.Run(ctx, r.client, info, r.record, msg); err != nil {
   718  			return err
   719  		}
   720  	} else {
   721  		if runErr := job.RunWithPodSetsInfo(info); runErr != nil {
   722  			return runErr
   723  		}
   724  
   725  		if err := r.client.Update(ctx, object); err != nil {
   726  			return err
   727  		}
   728  		r.record.Event(object, corev1.EventTypeNormal, ReasonStarted, msg)
   729  	}
   730  
   731  	return nil
   732  }
   733  
   734  // stopJob will suspend the job, and also restore node affinity, reset job status if needed.
   735  // Returns whether any operation was done to stop the job or an error.
   736  func (r *JobReconciler) stopJob(ctx context.Context, job GenericJob, wl *kueue.Workload, stopReason StopReason, eventMsg string) error {
   737  	object := job.Object()
   738  
   739  	info := GetPodSetsInfoFromWorkload(wl)
   740  
   741  	if jws, implements := job.(JobWithCustomStop); implements {
   742  		stoppedNow, err := jws.Stop(ctx, r.client, info, stopReason, eventMsg)
   743  		if stoppedNow {
   744  			r.record.Event(object, corev1.EventTypeNormal, ReasonStopped, eventMsg)
   745  		}
   746  		return err
   747  	}
   748  
   749  	if jws, implements := job.(ComposableJob); implements {
   750  		stoppedNow, err := jws.Stop(ctx, r.client, info, stopReason, eventMsg)
   751  		for _, objStoppedNow := range stoppedNow {
   752  			r.record.Event(objStoppedNow, corev1.EventTypeNormal, ReasonStopped, eventMsg)
   753  		}
   754  		return err
   755  	}
   756  
   757  	if job.IsSuspended() {
   758  		return nil
   759  	}
   760  
   761  	job.Suspend()
   762  	if info != nil {
   763  		job.RestorePodSetsInfo(info)
   764  	}
   765  	if err := r.client.Update(ctx, object); err != nil {
   766  		return err
   767  	}
   768  
   769  	r.record.Event(object, corev1.EventTypeNormal, ReasonStopped, eventMsg)
   770  	return nil
   771  }
   772  
   773  func (r *JobReconciler) finalizeJob(ctx context.Context, job GenericJob) error {
   774  	if jwf, implements := job.(JobWithFinalize); implements {
   775  		if err := jwf.Finalize(ctx, r.client); err != nil {
   776  			return err
   777  		}
   778  	}
   779  
   780  	return nil
   781  }
   782  
   783  // constructWorkload will derive a workload from the corresponding job.
   784  func (r *JobReconciler) constructWorkload(ctx context.Context, job GenericJob, object client.Object) (*kueue.Workload, error) {
   785  	log := ctrl.LoggerFrom(ctx)
   786  
   787  	if cj, implements := job.(ComposableJob); implements {
   788  		wl, err := cj.ConstructComposableWorkload(ctx, r.client, r.record)
   789  		if err != nil {
   790  			return nil, err
   791  		}
   792  
   793  		return wl, nil
   794  	}
   795  
   796  	podSets := job.PodSets()
   797  
   798  	wl := &kueue.Workload{
   799  		ObjectMeta: metav1.ObjectMeta{
   800  			Name:       GetWorkloadNameForOwnerWithGVK(object.GetName(), job.GVK()),
   801  			Namespace:  object.GetNamespace(),
   802  			Labels:     map[string]string{},
   803  			Finalizers: []string{kueue.ResourceInUseFinalizerName},
   804  		},
   805  		Spec: kueue.WorkloadSpec{
   806  			PodSets:   podSets,
   807  			QueueName: QueueName(job),
   808  		},
   809  	}
   810  
   811  	jobUid := string(job.Object().GetUID())
   812  	if errs := validation.IsValidLabelValue(jobUid); len(errs) == 0 {
   813  		wl.Labels[controllerconsts.JobUIDLabel] = jobUid
   814  	} else {
   815  		log.V(2).Info(
   816  			"Validation of the owner job UID label has failed. Creating workload without the label.",
   817  			"ValidationErrors", errs,
   818  			"LabelValue", jobUid,
   819  		)
   820  	}
   821  
   822  	if err := ctrl.SetControllerReference(object, wl, r.client.Scheme()); err != nil {
   823  		return nil, err
   824  	}
   825  	return wl, nil
   826  }
   827  
   828  // prepareWorkload adds the priority information for the constructed workload
   829  func (r *JobReconciler) prepareWorkload(ctx context.Context, job GenericJob, wl *kueue.Workload) error {
   830  	priorityClassName, source, p, err := r.extractPriority(ctx, wl.Spec.PodSets, job)
   831  	if err != nil {
   832  		return err
   833  	}
   834  
   835  	wl.Spec.PriorityClassName = priorityClassName
   836  	wl.Spec.Priority = &p
   837  	wl.Spec.PriorityClassSource = source
   838  
   839  	wl.Spec.PodSets = clearMinCountsIfFeatureDisabled(wl.Spec.PodSets)
   840  
   841  	return nil
   842  }
   843  
   844  func (r *JobReconciler) extractPriority(ctx context.Context, podSets []kueue.PodSet, job GenericJob) (string, string, int32, error) {
   845  	if workloadPriorityClass := workloadPriorityClassName(job); len(workloadPriorityClass) > 0 {
   846  		return utilpriority.GetPriorityFromWorkloadPriorityClass(ctx, r.client, workloadPriorityClass)
   847  	}
   848  	if jobWithPriorityClass, isImplemented := job.(JobWithPriorityClass); isImplemented {
   849  		return utilpriority.GetPriorityFromPriorityClass(
   850  			ctx, r.client, jobWithPriorityClass.PriorityClass())
   851  	}
   852  	return utilpriority.GetPriorityFromPriorityClass(
   853  		ctx, r.client, extractPriorityFromPodSets(podSets))
   854  }
   855  
   856  func extractPriorityFromPodSets(podSets []kueue.PodSet) string {
   857  	for _, podSet := range podSets {
   858  		if len(podSet.Template.Spec.PriorityClassName) > 0 {
   859  			return podSet.Template.Spec.PriorityClassName
   860  		}
   861  	}
   862  	return ""
   863  }
   864  
   865  // getPodSetsInfoFromStatus extracts podSetsInfo from workload status, based on
   866  // admission, and admission checks.
   867  func getPodSetsInfoFromStatus(ctx context.Context, c client.Client, w *kueue.Workload) ([]podset.PodSetInfo, error) {
   868  	if len(w.Status.Admission.PodSetAssignments) == 0 {
   869  		return nil, nil
   870  	}
   871  
   872  	podSetsInfo := make([]podset.PodSetInfo, len(w.Status.Admission.PodSetAssignments))
   873  
   874  	for i, podSetFlavor := range w.Status.Admission.PodSetAssignments {
   875  		info, err := podset.FromAssignment(ctx, c, &podSetFlavor, w.Spec.PodSets[i].Count)
   876  		if err != nil {
   877  			return nil, err
   878  		}
   879  
   880  		for _, admissionCheck := range w.Status.AdmissionChecks {
   881  			for _, podSetUpdate := range admissionCheck.PodSetUpdates {
   882  				if podSetUpdate.Name == info.Name {
   883  					if err := info.Merge(podset.FromUpdate(&podSetUpdate)); err != nil {
   884  						return nil, fmt.Errorf("in admission check %q: %w", admissionCheck.Name, err)
   885  					}
   886  					break
   887  				}
   888  			}
   889  		}
   890  		podSetsInfo[i] = info
   891  	}
   892  	return podSetsInfo, nil
   893  }
   894  
   895  func (r *JobReconciler) handleJobWithNoWorkload(ctx context.Context, job GenericJob, object client.Object) error {
   896  	log := ctrl.LoggerFrom(ctx)
   897  
   898  	_, usePrebuiltWorkload := PrebuiltWorkloadFor(job)
   899  	if usePrebuiltWorkload {
   900  		// Stop the job if not already suspended
   901  		if stopErr := r.stopJob(ctx, job, nil, StopReasonNoMatchingWorkload, "missing workload"); stopErr != nil {
   902  			return stopErr
   903  		}
   904  	}
   905  
   906  	// Wait until there are no active pods.
   907  	if job.IsActive() {
   908  		log.V(2).Info("Job is suspended but still has active pods, waiting")
   909  		return nil
   910  	}
   911  
   912  	if usePrebuiltWorkload {
   913  		log.V(2).Info("Skip workload creation for job with prebuilt workload")
   914  		return nil
   915  	}
   916  
   917  	// Create the corresponding workload.
   918  	wl, err := r.constructWorkload(ctx, job, object)
   919  	if err != nil {
   920  		return err
   921  	}
   922  	err = r.prepareWorkload(ctx, job, wl)
   923  	if err != nil {
   924  		return err
   925  	}
   926  	if err = r.client.Create(ctx, wl); err != nil {
   927  		return err
   928  	}
   929  	r.record.Eventf(object, corev1.EventTypeNormal, ReasonCreatedWorkload,
   930  		"Created Workload: %v", workload.Key(wl))
   931  	return nil
   932  }
   933  
   934  func (r *JobReconciler) ignoreUnretryableError(log logr.Logger, err error) error {
   935  	if IsUnretryableError(err) {
   936  		log.V(2).Info("Received an unretryable error", "error", err)
   937  		return nil
   938  	}
   939  	return err
   940  }
   941  
   942  func generatePodsReadyCondition(job GenericJob, wl *kueue.Workload) metav1.Condition {
   943  	conditionStatus := metav1.ConditionFalse
   944  	message := "Not all pods are ready or succeeded"
   945  	// Once PodsReady=True it stays as long as the workload remains admitted to
   946  	// avoid unnecessary flickering the condition when the pods transition
   947  	// from Ready to Completed. As pods finish, they transition first into the
   948  	// uncountedTerminatedPods staging area, before passing to the
   949  	// succeeded/failed counters.
   950  	if workload.IsAdmitted(wl) && (job.PodsReady() || apimeta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadPodsReady)) {
   951  		conditionStatus = metav1.ConditionTrue
   952  		message = "All pods were ready or succeeded since the workload admission"
   953  	}
   954  	return metav1.Condition{
   955  		Type:    kueue.WorkloadPodsReady,
   956  		Status:  conditionStatus,
   957  		Reason:  "PodsReady",
   958  		Message: message,
   959  	}
   960  }
   961  
   962  // GetPodSetsInfoFromWorkload retrieve the podSetsInfo slice from the
   963  // provided workload's spec
   964  func GetPodSetsInfoFromWorkload(wl *kueue.Workload) []podset.PodSetInfo {
   965  	if wl == nil {
   966  		return nil
   967  	}
   968  
   969  	return slices.Map(wl.Spec.PodSets, podset.FromPodSet)
   970  
   971  }
   972  
// ReconcilerSetup customizes the controller builder of a generic reconciler.
// It receives the builder and the manager's client, and returns the builder
// that is used from then on.
type ReconcilerSetup func(*builder.Builder, client.Client) *builder.Builder
   974  
   975  // NewGenericReconcilerFactory creates a new reconciler factory for a concrete GenericJob type.
   976  // newJob should return a new empty job.
   977  func NewGenericReconcilerFactory(newJob func() GenericJob, setup ...ReconcilerSetup) ReconcilerFactory {
   978  	return func(client client.Client, record record.EventRecorder, opts ...Option) JobReconcilerInterface {
   979  		return &genericReconciler{
   980  			jr:     NewReconciler(client, record, opts...),
   981  			newJob: newJob,
   982  			setup:  setup,
   983  		}
   984  	}
   985  }
   986  
// genericReconciler adapts the shared JobReconciler to one concrete
// GenericJob type.
type genericReconciler struct {
	// jr is the JobReconciler that Reconcile delegates to.
	jr *JobReconciler
	// newJob returns a new job instance; it is called for every reconcile
	// request and when registering the reconciled type with the manager.
	newJob func() GenericJob
	// setup holds the builder customizations applied in SetupWithManager.
	setup []ReconcilerSetup
}
   992  
   993  func (r *genericReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
   994  	return r.jr.ReconcileGenericJob(ctx, req, r.newJob())
   995  }
   996  
   997  func (r *genericReconciler) SetupWithManager(mgr ctrl.Manager) error {
   998  	b := ctrl.NewControllerManagedBy(mgr).
   999  		For(r.newJob().Object()).Owns(&kueue.Workload{})
  1000  	c := mgr.GetClient()
  1001  	for _, f := range r.setup {
  1002  		b = f(b, c)
  1003  	}
  1004  	return b.Complete(r)
  1005  }
  1006  
  1007  // clearMinCountsIfFeatureDisabled sets the minCount for all podSets to nil if the PartialAdmission feature is not enabled
  1008  func clearMinCountsIfFeatureDisabled(in []kueue.PodSet) []kueue.PodSet {
  1009  	if features.Enabled(features.PartialAdmission) || len(in) == 0 {
  1010  		return in
  1011  	}
  1012  	for i := range in {
  1013  		in[i].MinCount = nil
  1014  	}
  1015  	return in
  1016  }