sigs.k8s.io/kueue@v0.6.2/pkg/controller/admissionchecks/provisioning/controller.go (about)

     1  /*
     2  Copyright 2023 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package provisioning
    18  
    19  import (
    20  	"context"
    21  	"crypto/sha1"
    22  	"encoding/hex"
    23  	"errors"
    24  	"fmt"
    25  	"maps"
    26  	"regexp"
    27  	"strconv"
    28  	"time"
    29  
    30  	corev1 "k8s.io/api/core/v1"
    31  	apimeta "k8s.io/apimachinery/pkg/api/meta"
    32  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    33  	"k8s.io/apimachinery/pkg/types"
    34  	"k8s.io/apimachinery/pkg/util/sets"
    35  	autoscaling "k8s.io/autoscaler/cluster-autoscaler/apis/provisioningrequest/autoscaling.x-k8s.io/v1beta1"
    36  	"k8s.io/client-go/tools/record"
    37  	"k8s.io/client-go/util/workqueue"
    38  	"k8s.io/klog/v2"
    39  	"k8s.io/utils/ptr"
    40  	ctrl "sigs.k8s.io/controller-runtime"
    41  	"sigs.k8s.io/controller-runtime/pkg/client"
    42  	"sigs.k8s.io/controller-runtime/pkg/event"
    43  	"sigs.k8s.io/controller-runtime/pkg/handler"
    44  	"sigs.k8s.io/controller-runtime/pkg/log"
    45  	"sigs.k8s.io/controller-runtime/pkg/reconcile"
    46  
    47  	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
    48  	"sigs.k8s.io/kueue/pkg/podset"
    49  	"sigs.k8s.io/kueue/pkg/util/admissioncheck"
    50  	"sigs.k8s.io/kueue/pkg/util/api"
    51  	"sigs.k8s.io/kueue/pkg/util/slices"
    52  	"sigs.k8s.io/kueue/pkg/workload"
    53  )
    54  
    55  const (
    56  	objNameHashLength = 5
    57  	// 253 is the maximal length for a CRD name. We need to subtract one for '-', and the hash length.
    58  	objNameMaxPrefixLength = 252 - objNameHashLength
    59  	podTemplatesPrefix     = "ppt"
    60  )
    61  
    62  var (
    63  	errInconsistentPodSetAssignments = errors.New("inconsistent podSet assignments")
    64  )
    65  
    66  var (
    67  	MaxRetries        int32 = 3
    68  	MinBackoffSeconds int32 = 60
    69  )
    70  
// provisioningConfigHelper is the admissioncheck.ConfigHelper instantiation
// used to look up the ProvisioningRequestConfig of an admission check.
type provisioningConfigHelper = admissioncheck.ConfigHelper[*kueue.ProvisioningRequestConfig, kueue.ProvisioningRequestConfig]

// newProvisioningConfigHelper builds a provisioningConfigHelper backed by the
// client c.
func newProvisioningConfigHelper(c client.Client) (*provisioningConfigHelper, error) {
	return admissioncheck.NewConfigHelper[*kueue.ProvisioningRequestConfig](c)
}
    76  
// Controller reconciles Workloads by managing the ProvisioningRequests they
// own and by keeping the related admission check states in sync.
type Controller struct {
	// client is used for all reads and writes against the API server.
	client client.Client
	// helper resolves the ProvisioningRequestConfig of an admission check.
	helper *provisioningConfigHelper
	// record emits events on the reconciled workloads.
	record record.EventRecorder
}

// Compile-time check that Controller implements reconcile.Reconciler.
var _ reconcile.Reconciler = (*Controller)(nil)
    84  
    85  // +kubebuilder:rbac:groups="",resources=events,verbs=create;watch;update
    86  // +kubebuilder:rbac:groups="",resources=podtemplates,verbs=get;list;watch;create;delete;update
    87  // +kubebuilder:rbac:groups=autoscaling.x-k8s.io,resources=provisioningrequests,verbs=get;list;watch;create;update;patch;delete
    88  // +kubebuilder:rbac:groups=autoscaling.x-k8s.io,resources=provisioningrequests/status,verbs=get
    89  // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads,verbs=get;list;watch;update;patch;delete
    90  // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads/status,verbs=get;update;patch
    91  // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=admissionchecks,verbs=get;list;watch
    92  // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=provisioningrequestconfigs,verbs=get;list;watch
    93  
    94  func NewController(client client.Client, record record.EventRecorder) (*Controller, error) {
    95  	helper, err := newProvisioningConfigHelper(client)
    96  	if err != nil {
    97  		return nil, err
    98  	}
    99  	return &Controller{
   100  		client: client,
   101  		record: record,
   102  		helper: helper,
   103  	}, nil
   104  }
   105  
   106  // Reconcile performs a full reconciliation for the object referred to by the Request.
   107  // The Controller will requeue the Request to be processed again if an error is non-nil or
   108  // Result.Requeue is true, otherwise upon completion it will remove the work from the queue.
   109  func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
   110  	wl := &kueue.Workload{}
   111  	log := ctrl.LoggerFrom(ctx)
   112  	log.V(2).Info("Reconcile workload")
   113  
   114  	err := c.client.Get(ctx, req.NamespacedName, wl)
   115  	if err != nil {
   116  		return reconcile.Result{}, client.IgnoreNotFound(err)
   117  	}
   118  
   119  	if !workload.HasQuotaReservation(wl) || apimeta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadFinished) {
   120  		//1.2 workload has no reservation or is finished
   121  		log.V(5).Info("workload with no reservation, delete owned requests")
   122  		return reconcile.Result{}, c.deleteOwnedProvisionRequests(ctx, req.Namespace, req.Name)
   123  	}
   124  
   125  	// get the lists of relevant checks
   126  	relevantChecks, err := admissioncheck.FilterForController(ctx, c.client, wl.Status.AdmissionChecks, ControllerName)
   127  	if err != nil {
   128  		return reconcile.Result{}, err
   129  	}
   130  
   131  	list := &autoscaling.ProvisioningRequestList{}
   132  	if err := c.client.List(ctx, list, client.InNamespace(wl.Namespace), client.MatchingFields{RequestsOwnedByWorkloadKey: wl.Name}); client.IgnoreNotFound(err) != nil {
   133  		return reconcile.Result{}, err
   134  	}
   135  	ownedPrs := list.Items
   136  	activeOrLastPRForChecks := c.activeOrLastPRForChecks(ctx, wl, relevantChecks, ownedPrs)
   137  
   138  	if workload.IsAdmitted(wl) {
   139  		// check the state of the provision requests, eventually toggle the checks to false
   140  		// otherwise there is nothing to here
   141  		log.V(5).Info("workload admitted, sync checks")
   142  		return reconcile.Result{}, c.syncCheckStates(ctx, wl, relevantChecks, activeOrLastPRForChecks)
   143  	}
   144  
   145  	err = c.deleteUnusedProvisioningRequests(ctx, ownedPrs, activeOrLastPRForChecks)
   146  	if err != nil {
   147  		log.V(2).Error(err, "syncOwnedProvisionRequest failed to delete unused provisioning requests")
   148  		return reconcile.Result{}, err
   149  	}
   150  
   151  	requeAfter, err := c.syncOwnedProvisionRequest(ctx, wl, relevantChecks, activeOrLastPRForChecks)
   152  	if err != nil {
   153  		// this can also delete unneeded checks
   154  		log.V(2).Error(err, "syncOwnedProvisionRequest failed")
   155  		return reconcile.Result{}, err
   156  	}
   157  
   158  	err = c.syncCheckStates(ctx, wl, relevantChecks, activeOrLastPRForChecks)
   159  	if err != nil {
   160  		return reconcile.Result{}, err
   161  	}
   162  	if requeAfter != nil {
   163  		return reconcile.Result{RequeueAfter: *requeAfter}, nil
   164  	}
   165  	return reconcile.Result{}, nil
   166  }
   167  
   168  func (c *Controller) activeOrLastPRForChecks(ctx context.Context, wl *kueue.Workload, relevantChecks []string, ownedPrs []autoscaling.ProvisioningRequest) map[string]*autoscaling.ProvisioningRequest {
   169  	activeOrLastPRForChecks := make(map[string]*autoscaling.ProvisioningRequest)
   170  	for _, checkName := range relevantChecks {
   171  		for i := range ownedPrs {
   172  			req := &ownedPrs[i]
   173  			// PRs relevant for the admission check
   174  			if matches(req, wl.Name, checkName) {
   175  				prc, err := c.helper.ConfigForAdmissionCheck(ctx, checkName)
   176  				if err == nil && c.reqIsNeeded(ctx, wl, prc) && requestHasParamaters(req, prc) {
   177  					if currPr, exists := activeOrLastPRForChecks[checkName]; !exists || getAttempt(ctx, currPr, wl.Name, checkName) < getAttempt(ctx, req, wl.Name, checkName) {
   178  						activeOrLastPRForChecks[checkName] = req
   179  					}
   180  				}
   181  			}
   182  		}
   183  	}
   184  	return activeOrLastPRForChecks
   185  }
   186  
   187  func (c *Controller) deleteUnusedProvisioningRequests(ctx context.Context, ownedPrs []autoscaling.ProvisioningRequest, activeOrLastPRForChecks map[string]*autoscaling.ProvisioningRequest) error {
   188  	log := ctrl.LoggerFrom(ctx)
   189  	prNames := sets.New[string]()
   190  	for _, pr := range activeOrLastPRForChecks {
   191  		prNames.Insert(pr.Name)
   192  	}
   193  	for _, pr := range ownedPrs {
   194  		req := &pr
   195  		if !prNames.Has(req.Name) {
   196  			if err := c.client.Delete(ctx, req); client.IgnoreNotFound(err) != nil {
   197  				log.V(5).Error(err, "deleting the request", "req", klog.KObj(req))
   198  				return err
   199  			}
   200  		}
   201  	}
   202  	return nil
   203  }
   204  
   205  func (c *Controller) deleteOwnedProvisionRequests(ctx context.Context, namespace string, name string) error {
   206  	list := &autoscaling.ProvisioningRequestList{}
   207  	if err := c.client.List(ctx, list, client.InNamespace(namespace), client.MatchingFields{RequestsOwnedByWorkloadKey: name}); err != nil {
   208  		return client.IgnoreNotFound(err)
   209  	}
   210  
   211  	for i := range list.Items {
   212  		if err := c.client.Delete(ctx, &list.Items[i]); client.IgnoreNotFound(err) != nil {
   213  			return fmt.Errorf("delete requests for %s/%s: %w", namespace, name, err)
   214  		}
   215  	}
   216  	return nil
   217  }
   218  
   219  func (c *Controller) syncOwnedProvisionRequest(ctx context.Context, wl *kueue.Workload, relevantChecks []string, activeOrLastPRForChecks map[string]*autoscaling.ProvisioningRequest) (*time.Duration, error) {
   220  	log := ctrl.LoggerFrom(ctx)
   221  	var requeAfter *time.Duration
   222  	for _, checkName := range relevantChecks {
   223  		//get the config
   224  		prc, err := c.helper.ConfigForAdmissionCheck(ctx, checkName)
   225  		if err != nil {
   226  			// the check is not active
   227  			continue
   228  		}
   229  		if !c.reqIsNeeded(ctx, wl, prc) {
   230  			continue
   231  		}
   232  		if ac := workload.FindAdmissionCheck(wl.Status.AdmissionChecks, checkName); ac != nil && ac.State == kueue.CheckStateReady {
   233  			log.V(2).Info("Skip syncing of the ProvReq for admission check which is Ready", "workload", klog.KObj(wl), "admissionCheck", checkName)
   234  			continue
   235  		}
   236  
   237  		oldPr, exists := activeOrLastPRForChecks[checkName]
   238  		attempt := int32(1)
   239  		shouldCreatePr := false
   240  		if exists {
   241  			attempt = getAttempt(ctx, oldPr, wl.Name, checkName)
   242  			if apimeta.IsStatusConditionTrue(oldPr.Status.Conditions, autoscaling.Failed) {
   243  				if attempt <= MaxRetries {
   244  					prFailed := apimeta.FindStatusCondition(oldPr.Status.Conditions, autoscaling.Failed)
   245  					remainingTime := remainingTime(prc, attempt, prFailed.LastTransitionTime.Time)
   246  					if remainingTime <= 0 {
   247  						shouldCreatePr = true
   248  						attempt += 1
   249  					} else if requeAfter == nil || remainingTime < *requeAfter {
   250  						requeAfter = &remainingTime
   251  					}
   252  				}
   253  			}
   254  		} else {
   255  			shouldCreatePr = true
   256  		}
   257  		requestName := GetProvisioningRequestName(wl.Name, checkName, attempt)
   258  		if shouldCreatePr {
   259  			log.V(3).Info("Creating ProvisioningRequest", "requestName", requestName, "attempt", attempt)
   260  			req := &autoscaling.ProvisioningRequest{
   261  				ObjectMeta: metav1.ObjectMeta{
   262  					Name:      requestName,
   263  					Namespace: wl.Namespace,
   264  				},
   265  				Spec: autoscaling.ProvisioningRequestSpec{
   266  					ProvisioningClassName: prc.Spec.ProvisioningClassName,
   267  					Parameters:            parametersKueueToProvisioning(prc.Spec.Parameters),
   268  				},
   269  			}
   270  
   271  			expectedPodSets := requiredPodSets(wl.Spec.PodSets, prc.Spec.ManagedResources)
   272  			psaMap := slices.ToRefMap(wl.Status.Admission.PodSetAssignments, func(p *kueue.PodSetAssignment) string { return p.Name })
   273  			podSetMap := slices.ToRefMap(wl.Spec.PodSets, func(ps *kueue.PodSet) string { return ps.Name })
   274  			for _, psName := range expectedPodSets {
   275  				ps, psFound := podSetMap[psName]
   276  				psa, psaFound := psaMap[psName]
   277  				if !psFound || !psaFound {
   278  					return nil, errInconsistentPodSetAssignments
   279  				}
   280  				req.Spec.PodSets = append(req.Spec.PodSets, autoscaling.PodSet{
   281  					PodTemplateRef: autoscaling.Reference{
   282  						Name: getProvisioningRequestPodTemplateName(requestName, psName),
   283  					},
   284  					Count: ptr.Deref(psa.Count, ps.Count),
   285  				})
   286  			}
   287  
   288  			if err := ctrl.SetControllerReference(wl, req, c.client.Scheme()); err != nil {
   289  				return nil, err
   290  			}
   291  
   292  			if err := c.client.Create(ctx, req); err != nil {
   293  				return nil, err
   294  			}
   295  			c.record.Eventf(wl, corev1.EventTypeNormal, "ProvisioningRequestCreated", "Created ProvisioningRequest: %q", req.Name)
   296  			activeOrLastPRForChecks[checkName] = req
   297  		}
   298  		if err := c.syncProvisionRequestsPodTemplates(ctx, wl, requestName, prc); err != nil {
   299  			return nil, err
   300  		}
   301  	}
   302  	return requeAfter, nil
   303  }
   304  
   305  func (c *Controller) syncProvisionRequestsPodTemplates(ctx context.Context, wl *kueue.Workload, prName string, prc *kueue.ProvisioningRequestConfig) error {
   306  	request := &autoscaling.ProvisioningRequest{}
   307  	requestKey := types.NamespacedName{
   308  		Name:      prName,
   309  		Namespace: wl.Namespace,
   310  	}
   311  	err := c.client.Get(ctx, requestKey, request)
   312  	if err != nil {
   313  		return client.IgnoreNotFound(err)
   314  	}
   315  
   316  	expectedPodSets := requiredPodSets(wl.Spec.PodSets, prc.Spec.ManagedResources)
   317  	podsetRefsMap := slices.ToMap(expectedPodSets, func(i int) (string, string) {
   318  		return getProvisioningRequestPodTemplateName(prName, expectedPodSets[i]), expectedPodSets[i]
   319  	})
   320  
   321  	// the order of the podSets should be the same in the workload and prov. req.
   322  	// if the number is different, just delete the request
   323  	if len(request.Spec.PodSets) != len(expectedPodSets) {
   324  		return c.client.Delete(ctx, request)
   325  	}
   326  
   327  	psaMap := slices.ToRefMap(wl.Status.Admission.PodSetAssignments, func(p *kueue.PodSetAssignment) string { return p.Name })
   328  	podSetMap := slices.ToRefMap(wl.Spec.PodSets, func(ps *kueue.PodSet) string { return ps.Name })
   329  
   330  	for i := range request.Spec.PodSets {
   331  		reqPS := &request.Spec.PodSets[i]
   332  		psName, refFound := podsetRefsMap[reqPS.PodTemplateRef.Name]
   333  		ps, psFound := podSetMap[psName]
   334  		psa, psaFound := psaMap[psName]
   335  
   336  		if !refFound || !psFound || !psaFound || ptr.Deref(psa.Count, 0) != reqPS.Count {
   337  			return c.client.Delete(ctx, request)
   338  		}
   339  
   340  		pt := &corev1.PodTemplate{}
   341  		ptKey := types.NamespacedName{
   342  			Namespace: request.Namespace,
   343  			Name:      reqPS.PodTemplateRef.Name,
   344  		}
   345  
   346  		err := c.client.Get(ctx, ptKey, pt)
   347  
   348  		if client.IgnoreNotFound(err) != nil {
   349  			return err
   350  		}
   351  
   352  		if err != nil {
   353  			// it's a not found, so create it
   354  			newPt := &corev1.PodTemplate{
   355  				ObjectMeta: metav1.ObjectMeta{
   356  					Name:      ptKey.Name,
   357  					Namespace: ptKey.Namespace,
   358  				},
   359  				Template: ps.Template,
   360  			}
   361  
   362  			// apply the admission node selectors to the Template
   363  			psi, err := podset.FromAssignment(ctx, c.client, psaMap[psName], reqPS.Count)
   364  			if err != nil {
   365  				return err
   366  			}
   367  
   368  			err = podset.Merge(&newPt.Template.ObjectMeta, &newPt.Template.Spec, psi)
   369  			if err != nil {
   370  				return err
   371  			}
   372  
   373  			if err := ctrl.SetControllerReference(request, newPt, c.client.Scheme()); err != nil {
   374  				return err
   375  			}
   376  
   377  			if err = c.client.Create(ctx, newPt); err != nil {
   378  				return err
   379  			}
   380  		}
   381  		// maybe check the consistency deeper
   382  	}
   383  	return nil
   384  }
   385  
   386  func (c *Controller) reqIsNeeded(ctx context.Context, wl *kueue.Workload, prc *kueue.ProvisioningRequestConfig) bool {
   387  	return len(requiredPodSets(wl.Spec.PodSets, prc.Spec.ManagedResources)) > 0
   388  }
   389  
   390  func requiredPodSets(podSets []kueue.PodSet, resources []corev1.ResourceName) []string {
   391  	resourcesSet := sets.New(resources...)
   392  	users := make([]string, 0, len(podSets))
   393  	for i := range podSets {
   394  		ps := &podSets[i]
   395  		if len(resources) == 0 || podUses(&ps.Template.Spec, resourcesSet) {
   396  			users = append(users, ps.Name)
   397  		}
   398  	}
   399  	return users
   400  }
   401  
   402  func podUses(pod *corev1.PodSpec, resourceSet sets.Set[corev1.ResourceName]) bool {
   403  	for i := range pod.InitContainers {
   404  		if containerUses(&pod.InitContainers[i], resourceSet) {
   405  			return true
   406  		}
   407  	}
   408  	for i := range pod.Containers {
   409  		if containerUses(&pod.Containers[i], resourceSet) {
   410  			return true
   411  		}
   412  	}
   413  	return false
   414  }
   415  
   416  func containerUses(cont *corev1.Container, resourceSet sets.Set[corev1.ResourceName]) bool {
   417  	for r := range cont.Resources.Requests {
   418  		if resourceSet.Has(r) {
   419  			return true
   420  		}
   421  	}
   422  	return false
   423  }
   424  
   425  func parametersKueueToProvisioning(in map[string]kueue.Parameter) map[string]autoscaling.Parameter {
   426  	if in == nil {
   427  		return nil
   428  	}
   429  
   430  	out := make(map[string]autoscaling.Parameter, len(in))
   431  	for k, v := range in {
   432  		out[k] = autoscaling.Parameter(v)
   433  	}
   434  	return out
   435  }
   436  
   437  func requestHasParamaters(req *autoscaling.ProvisioningRequest, prc *kueue.ProvisioningRequestConfig) bool {
   438  	if req.Spec.ProvisioningClassName != prc.Spec.ProvisioningClassName {
   439  		return false
   440  	}
   441  	if len(req.Spec.Parameters) != len(prc.Spec.Parameters) {
   442  		return false
   443  	}
   444  	for k, vReq := range req.Spec.Parameters {
   445  		if vCfg, found := prc.Spec.Parameters[k]; !found || vReq != autoscaling.Parameter(vCfg) {
   446  			return false
   447  		}
   448  	}
   449  	return true
   450  }
   451  
   452  func (c *Controller) syncCheckStates(ctx context.Context, wl *kueue.Workload, checks []string, activeOrLastPRForChecks map[string]*autoscaling.ProvisioningRequest) error {
   453  	log := ctrl.LoggerFrom(ctx)
   454  	checksMap := slices.ToRefMap(wl.Status.AdmissionChecks, func(c *kueue.AdmissionCheckState) string { return c.Name })
   455  	wlPatch := workload.BaseSSAWorkload(wl)
   456  	recorderMessages := make([]string, 0, len(checks))
   457  	updated := false
   458  	for _, check := range checks {
   459  		checkState := *checksMap[check]
   460  		if prc, err := c.helper.ConfigForAdmissionCheck(ctx, check); err != nil {
   461  			// the check is not active
   462  			if checkState.State != kueue.CheckStatePending || checkState.Message != CheckInactiveMessage {
   463  				updated = true
   464  				checkState.State = kueue.CheckStatePending
   465  				checkState.Message = CheckInactiveMessage
   466  			}
   467  		} else if !c.reqIsNeeded(ctx, wl, prc) {
   468  			if checkState.State != kueue.CheckStateReady {
   469  				updated = true
   470  				checkState.State = kueue.CheckStateReady
   471  				checkState.Message = NoRequestNeeded
   472  				checkState.PodSetUpdates = nil
   473  			}
   474  		} else {
   475  			pr := activeOrLastPRForChecks[check]
   476  			if pr == nil {
   477  				return nil
   478  			}
   479  
   480  			prFailed := apimeta.IsStatusConditionTrue(pr.Status.Conditions, autoscaling.Failed)
   481  			prProvisioned := apimeta.IsStatusConditionTrue(pr.Status.Conditions, autoscaling.Provisioned)
   482  			log.V(3).Info("Synchronizing admission check state based on provisioning request", "wl", klog.KObj(wl), "check", check, "prName", pr.Name, "failed", prFailed, "accepted", prProvisioned)
   483  
   484  			switch {
   485  			case prFailed:
   486  				if checkState.State != kueue.CheckStateRejected {
   487  					if attempt := getAttempt(ctx, pr, wl.Name, check); attempt <= MaxRetries {
   488  						// it is going to be retried
   489  						message := fmt.Sprintf("Retrying after failure: %s", apimeta.FindStatusCondition(pr.Status.Conditions, autoscaling.Failed).Message)
   490  						updated = updated || checkState.State != kueue.CheckStatePending || checkState.Message != message
   491  						checkState.State = kueue.CheckStatePending
   492  						checkState.Message = message
   493  					} else {
   494  						updated = true
   495  						checkState.State = kueue.CheckStateRejected
   496  						checkState.Message = apimeta.FindStatusCondition(pr.Status.Conditions, autoscaling.Failed).Message
   497  					}
   498  				}
   499  			case prProvisioned:
   500  				if checkState.State != kueue.CheckStateReady {
   501  					updated = true
   502  					checkState.State = kueue.CheckStateReady
   503  					// add the pod podSetUpdates
   504  					checkState.PodSetUpdates = podSetUpdates(wl, pr)
   505  				}
   506  			default:
   507  				if checkState.State != kueue.CheckStatePending {
   508  					updated = true
   509  					checkState.State = kueue.CheckStatePending
   510  				}
   511  			}
   512  		}
   513  
   514  		existingCondition := workload.FindAdmissionCheck(wlPatch.Status.AdmissionChecks, checkState.Name)
   515  		if existingCondition != nil && existingCondition.State != checkState.State {
   516  			message := fmt.Sprintf("Admission check %s updated state from %s to %s", checkState.Name, existingCondition.State, checkState.State)
   517  			if checkState.Message != "" {
   518  				message += fmt.Sprintf(" with message %s", checkState.Message)
   519  			}
   520  			recorderMessages = append(recorderMessages, message)
   521  		}
   522  
   523  		workload.SetAdmissionCheckState(&wlPatch.Status.AdmissionChecks, checkState)
   524  	}
   525  	if updated {
   526  		if err := c.client.Status().Patch(ctx, wlPatch, client.Apply, client.FieldOwner(ControllerName), client.ForceOwnership); err != nil {
   527  			return err
   528  		}
   529  		for i := range recorderMessages {
   530  			c.record.Event(wl, corev1.EventTypeNormal, "AdmissionCheckUpdated", api.TruncateEventMessage(recorderMessages[i]))
   531  		}
   532  	}
   533  	return nil
   534  }
   535  
   536  func podSetUpdates(wl *kueue.Workload, pr *autoscaling.ProvisioningRequest) []kueue.PodSetUpdate {
   537  	podSets := wl.Spec.PodSets
   538  	refMap := slices.ToMap(podSets, func(i int) (string, string) {
   539  		return getProvisioningRequestPodTemplateName(pr.Name, podSets[i].Name), podSets[i].Name
   540  	})
   541  	return slices.Map(pr.Spec.PodSets, func(ps *autoscaling.PodSet) kueue.PodSetUpdate {
   542  		return kueue.PodSetUpdate{
   543  			Name:        refMap[ps.PodTemplateRef.Name],
   544  			Annotations: map[string]string{ConsumesAnnotationKey: pr.Name},
   545  		}
   546  	})
   547  }
   548  
// acHandler reacts to AdmissionCheck events by enqueueing the workloads that
// use the affected admission check.
type acHandler struct {
	// client is used to list the workloads that reference an admission check.
	client client.Client
}

// Compile-time check that acHandler implements handler.EventHandler.
var _ handler.EventHandler = (*acHandler)(nil)
   554  
   555  func (a *acHandler) Create(ctx context.Context, event event.CreateEvent, q workqueue.RateLimitingInterface) {
   556  	ac, isAc := event.Object.(*kueue.AdmissionCheck)
   557  	if !isAc {
   558  		return
   559  	}
   560  
   561  	if ac.Spec.ControllerName == ControllerName {
   562  		err := a.reconcileWorkloadsUsing(ctx, ac.Name, q)
   563  		if err != nil {
   564  			ctrl.LoggerFrom(ctx).V(5).Error(err, "Failure on create event", "admissionCheck", klog.KObj(ac))
   565  		}
   566  	}
   567  }
   568  
   569  func (a *acHandler) Update(ctx context.Context, event event.UpdateEvent, q workqueue.RateLimitingInterface) {
   570  	oldAc, isOldAc := event.ObjectOld.(*kueue.AdmissionCheck)
   571  	newAc, isNewAc := event.ObjectNew.(*kueue.AdmissionCheck)
   572  	if !isNewAc || !isOldAc {
   573  		return
   574  	}
   575  
   576  	if oldAc.Spec.ControllerName == ControllerName || newAc.Spec.ControllerName == ControllerName {
   577  		err := a.reconcileWorkloadsUsing(ctx, oldAc.Name, q)
   578  		if err != nil {
   579  			ctrl.LoggerFrom(ctx).V(5).Error(err, "Failure on update event", "admissionCheck", klog.KObj(oldAc))
   580  		}
   581  	}
   582  }
   583  
   584  func (a *acHandler) Delete(ctx context.Context, event event.DeleteEvent, q workqueue.RateLimitingInterface) {
   585  	ac, isAc := event.Object.(*kueue.AdmissionCheck)
   586  	if !isAc {
   587  		return
   588  	}
   589  
   590  	if ac.Spec.ControllerName == ControllerName {
   591  		err := a.reconcileWorkloadsUsing(ctx, ac.Name, q)
   592  		if err != nil {
   593  			ctrl.LoggerFrom(ctx).V(5).Error(err, "Failure on delete event", "admissionCheck", klog.KObj(ac))
   594  		}
   595  	}
   596  }
   597  
// Generic ignores generic events; it only exists to satisfy
// handler.EventHandler.
func (a *acHandler) Generic(_ context.Context, _ event.GenericEvent, _ workqueue.RateLimitingInterface) {
	// nothing to do for now
}
   601  
   602  func (a *acHandler) reconcileWorkloadsUsing(ctx context.Context, check string, q workqueue.RateLimitingInterface) error {
   603  	list := &kueue.WorkloadList{}
   604  	if err := a.client.List(ctx, list, client.MatchingFields{WorkloadsWithAdmissionCheckKey: check}); client.IgnoreNotFound(err) != nil {
   605  		return err
   606  	}
   607  
   608  	for i := range list.Items {
   609  		wl := &list.Items[i]
   610  		req := reconcile.Request{
   611  			NamespacedName: types.NamespacedName{
   612  				Name:      wl.Name,
   613  				Namespace: wl.Namespace,
   614  			},
   615  		}
   616  		q.Add(req)
   617  	}
   618  
   619  	return nil
   620  }
   621  
// prcHandler reacts to ProvisioningRequestConfig events on behalf of the
// admission checks that use the affected configuration.
type prcHandler struct {
	// client is used to list the admission checks that use a configuration.
	client client.Client
	// acHandlerOverride, when set, is called for every admission check using
	// the configuration instead of enqueueing the admission check itself.
	acHandlerOverride func(ctx context.Context, config string, q workqueue.RateLimitingInterface) error
}

// Compile-time check that prcHandler implements handler.EventHandler.
var _ handler.EventHandler = (*prcHandler)(nil)
   628  
   629  func (p *prcHandler) Create(ctx context.Context, event event.CreateEvent, q workqueue.RateLimitingInterface) {
   630  	prc, isPRC := event.Object.(*kueue.ProvisioningRequestConfig)
   631  	if !isPRC {
   632  		return
   633  	}
   634  	err := p.reconcileWorkloadsUsing(ctx, prc.Name, q)
   635  	if err != nil {
   636  		ctrl.LoggerFrom(ctx).V(5).Error(err, "Failure on create event", "provisioningRequestConfig", klog.KObj(prc))
   637  	}
   638  }
   639  
   640  func (p *prcHandler) Update(ctx context.Context, event event.UpdateEvent, q workqueue.RateLimitingInterface) {
   641  	oldPRC, isOldPRC := event.ObjectOld.(*kueue.ProvisioningRequestConfig)
   642  	newPRC, isNewPRC := event.ObjectNew.(*kueue.ProvisioningRequestConfig)
   643  	if !isNewPRC || !isOldPRC {
   644  		return
   645  	}
   646  
   647  	if oldPRC.Spec.ProvisioningClassName != newPRC.Spec.ProvisioningClassName || !maps.Equal(oldPRC.Spec.Parameters, newPRC.Spec.Parameters) || !slices.CmpNoOrder(oldPRC.Spec.ManagedResources, newPRC.Spec.ManagedResources) {
   648  		err := p.reconcileWorkloadsUsing(ctx, oldPRC.Name, q)
   649  		if err != nil {
   650  			ctrl.LoggerFrom(ctx).V(5).Error(err, "Failure on update event", "provisioningRequestConfig", klog.KObj(oldPRC))
   651  		}
   652  	}
   653  }
   654  
   655  func (p *prcHandler) Delete(ctx context.Context, event event.DeleteEvent, q workqueue.RateLimitingInterface) {
   656  	prc, isPRC := event.Object.(*kueue.ProvisioningRequestConfig)
   657  	if !isPRC {
   658  		return
   659  	}
   660  	err := p.reconcileWorkloadsUsing(ctx, prc.Name, q)
   661  	if err != nil {
   662  		ctrl.LoggerFrom(ctx).V(5).Error(err, "Failure on delete event", "provisioningRequestConfig", klog.KObj(prc))
   663  	}
   664  }
   665  
// Generic ignores generic events; it only exists to satisfy
// handler.EventHandler.
func (p *prcHandler) Generic(_ context.Context, _ event.GenericEvent, _ workqueue.RateLimitingInterface) {
	// nothing to do for now
}
   669  
   670  func (p *prcHandler) reconcileWorkloadsUsing(ctx context.Context, config string, q workqueue.RateLimitingInterface) error {
   671  	list := &kueue.AdmissionCheckList{}
   672  	if err := p.client.List(ctx, list, client.MatchingFields{AdmissionCheckUsingConfigKey: config}); client.IgnoreNotFound(err) != nil {
   673  		return err
   674  	}
   675  	users := slices.Map(list.Items, func(ac *kueue.AdmissionCheck) string { return ac.Name })
   676  	for _, user := range users {
   677  		if p.acHandlerOverride != nil {
   678  			if err := p.acHandlerOverride(ctx, user, q); err != nil {
   679  				return err
   680  			}
   681  		} else {
   682  			req := reconcile.Request{
   683  				NamespacedName: types.NamespacedName{
   684  					Name: user,
   685  				},
   686  			}
   687  			q.Add(req)
   688  		}
   689  	}
   690  	return nil
   691  }
   692  
   693  func (c *Controller) SetupWithManager(mgr ctrl.Manager) error {
   694  	ach := &acHandler{
   695  		client: c.client,
   696  	}
   697  	prch := &prcHandler{
   698  		client:            c.client,
   699  		acHandlerOverride: ach.reconcileWorkloadsUsing,
   700  	}
   701  	err := ctrl.NewControllerManagedBy(mgr).
   702  		For(&kueue.Workload{}).
   703  		Owns(&autoscaling.ProvisioningRequest{}).
   704  		Watches(&kueue.AdmissionCheck{}, ach).
   705  		Watches(&kueue.ProvisioningRequestConfig{}, prch).
   706  		Complete(c)
   707  	if err != nil {
   708  		return err
   709  	}
   710  
   711  	prcACh := &prcHandler{
   712  		client: c.client,
   713  	}
   714  	acReconciler := &acReconciler{
   715  		client: c.client,
   716  		helper: c.helper,
   717  	}
   718  
   719  	return ctrl.NewControllerManagedBy(mgr).
   720  		For(&kueue.AdmissionCheck{}).
   721  		Watches(&kueue.ProvisioningRequestConfig{}, prcACh).
   722  		Complete(acReconciler)
   723  }
   724  
   725  func GetProvisioningRequestName(workloadName, checkName string, attempt int32) string {
   726  	fullName := fmt.Sprintf("%s-%s-%d", workloadName, checkName, int(attempt))
   727  	return limitObjectName(fullName)
   728  }
   729  
   730  func getProvisioningRequestNamePrefix(workloadName, checkName string) string {
   731  	fullName := fmt.Sprintf("%s-%s-", workloadName, checkName)
   732  	return limitObjectName(fullName)
   733  }
   734  
   735  func getProvisioningRequestPodTemplateName(prName, podsetName string) string {
   736  	fullName := fmt.Sprintf("%s-%s-%s", podTemplatesPrefix, prName, podsetName)
   737  	return limitObjectName(fullName)
   738  }
   739  
   740  func limitObjectName(fullName string) string {
   741  	if len(fullName) <= objNameMaxPrefixLength {
   742  		return fullName
   743  	}
   744  	h := sha1.New()
   745  	h.Write([]byte(fullName))
   746  	hashBytes := hex.EncodeToString(h.Sum(nil))
   747  	return fmt.Sprintf("%s-%s", fullName[:objNameMaxPrefixLength], hashBytes[:objNameHashLength])
   748  }
   749  
   750  func matches(pr *autoscaling.ProvisioningRequest, workloadName, checkName string) bool {
   751  	attemptRegex := getAttemptRegex(workloadName, checkName)
   752  	matches := attemptRegex.FindStringSubmatch(pr.Name)
   753  	return len(matches) > 0
   754  }
   755  
   756  func getAttempt(ctx context.Context, pr *autoscaling.ProvisioningRequest, workloadName, checkName string) int32 {
   757  	logger := log.FromContext(ctx)
   758  	attemptRegex := getAttemptRegex(workloadName, checkName)
   759  	matches := attemptRegex.FindStringSubmatch(pr.Name)
   760  	if len(matches) > 0 {
   761  		number, err := strconv.Atoi(matches[1])
   762  		if err != nil {
   763  			logger.Error(err, "Parsing the attempt number from provisioning request", "requestName", pr.Name)
   764  			return 1
   765  		} else {
   766  			return int32(number)
   767  		}
   768  	} else {
   769  		logger.Info("No attempt suffix in provisioning request", "requestName", pr.Name)
   770  		return 1
   771  	}
   772  }
   773  
   774  func getAttemptRegex(workloadName, checkName string) *regexp.Regexp {
   775  	prefix := getProvisioningRequestNamePrefix(workloadName, checkName)
   776  	escapedPrefix := regexp.QuoteMeta(prefix)
   777  	return regexp.MustCompile("^" + escapedPrefix + "([0-9]+)$")
   778  }
   779  
   780  func remainingTime(prc *kueue.ProvisioningRequestConfig, failuresCount int32, lastFailureTime time.Time) time.Duration {
   781  	defaultBackoff := time.Duration(MinBackoffSeconds) * time.Second
   782  	maxBackoff := 30 * time.Minute
   783  	backoffDuration := defaultBackoff
   784  	for i := 1; i < int(failuresCount); i++ {
   785  		backoffDuration = backoffDuration * 2
   786  		if backoffDuration >= maxBackoff {
   787  			backoffDuration = maxBackoff
   788  			break
   789  		}
   790  	}
   791  	timeElapsedSinceLastFailure := time.Since(lastFailureTime)
   792  	return backoffDuration - timeElapsedSinceLastFailure
   793  }