sigs.k8s.io/kueue@v0.6.2/pkg/controller/admissionchecks/provisioning/controller.go (about) 1 /* 2 Copyright 2023 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package provisioning 18 19 import ( 20 "context" 21 "crypto/sha1" 22 "encoding/hex" 23 "errors" 24 "fmt" 25 "maps" 26 "regexp" 27 "strconv" 28 "time" 29 30 corev1 "k8s.io/api/core/v1" 31 apimeta "k8s.io/apimachinery/pkg/api/meta" 32 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 33 "k8s.io/apimachinery/pkg/types" 34 "k8s.io/apimachinery/pkg/util/sets" 35 autoscaling "k8s.io/autoscaler/cluster-autoscaler/apis/provisioningrequest/autoscaling.x-k8s.io/v1beta1" 36 "k8s.io/client-go/tools/record" 37 "k8s.io/client-go/util/workqueue" 38 "k8s.io/klog/v2" 39 "k8s.io/utils/ptr" 40 ctrl "sigs.k8s.io/controller-runtime" 41 "sigs.k8s.io/controller-runtime/pkg/client" 42 "sigs.k8s.io/controller-runtime/pkg/event" 43 "sigs.k8s.io/controller-runtime/pkg/handler" 44 "sigs.k8s.io/controller-runtime/pkg/log" 45 "sigs.k8s.io/controller-runtime/pkg/reconcile" 46 47 kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1" 48 "sigs.k8s.io/kueue/pkg/podset" 49 "sigs.k8s.io/kueue/pkg/util/admissioncheck" 50 "sigs.k8s.io/kueue/pkg/util/api" 51 "sigs.k8s.io/kueue/pkg/util/slices" 52 "sigs.k8s.io/kueue/pkg/workload" 53 ) 54 55 const ( 56 objNameHashLength = 5 57 // 253 is the maximal length for a CRD name. We need to subtract one for '-', and the hash length. 58 objNameMaxPrefixLength = 252 - objNameHashLength 59 podTemplatesPrefix = "ppt" 60 ) 61 62 var ( 63 errInconsistentPodSetAssignments = errors.New("inconsistent podSet assignments") 64 ) 65 66 var ( 67 MaxRetries int32 = 3 68 MinBackoffSeconds int32 = 60 69 ) 70 71 type provisioningConfigHelper = admissioncheck.ConfigHelper[*kueue.ProvisioningRequestConfig, kueue.ProvisioningRequestConfig] 72 73 func newProvisioningConfigHelper(c client.Client) (*provisioningConfigHelper, error) { 74 return admissioncheck.NewConfigHelper[*kueue.ProvisioningRequestConfig](c) 75 } 76 77 type Controller struct { 78 client client.Client 79 helper *provisioningConfigHelper 80 record record.EventRecorder 81 } 82 83 var _ reconcile.Reconciler = (*Controller)(nil) 84 85 // +kubebuilder:rbac:groups="",resources=events,verbs=create;watch;update 86 // +kubebuilder:rbac:groups="",resources=podtemplates,verbs=get;list;watch;create;delete;update 87 // +kubebuilder:rbac:groups=autoscaling.x-k8s.io,resources=provisioningrequests,verbs=get;list;watch;create;update;patch;delete 88 // +kubebuilder:rbac:groups=autoscaling.x-k8s.io,resources=provisioningrequests/status,verbs=get 89 // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads,verbs=get;list;watch;update;patch;delete 90 // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads/status,verbs=get;update;patch 91 // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=admissionchecks,verbs=get;list;watch 92 // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=provisioningrequestconfigs,verbs=get;list;watch 93 94 func NewController(client client.Client, record record.EventRecorder) (*Controller, error) { 95 helper, err := newProvisioningConfigHelper(client) 96 if err != nil { 97 return nil, err 98 } 99 return &Controller{ 100 client: client, 101 record: record, 102 helper: helper, 103 }, nil 104 } 105 106 // Reconcile performs a full reconciliation for the object referred to by the Request. 107 // The Controller will requeue the Request to be processed again if an error is non-nil or 108 // Result.Requeue is true, otherwise upon completion it will remove the work from the queue. 109 func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { 110 wl := &kueue.Workload{} 111 log := ctrl.LoggerFrom(ctx) 112 log.V(2).Info("Reconcile workload") 113 114 err := c.client.Get(ctx, req.NamespacedName, wl) 115 if err != nil { 116 return reconcile.Result{}, client.IgnoreNotFound(err) 117 } 118 119 if !workload.HasQuotaReservation(wl) || apimeta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadFinished) { 120 //1.2 workload has no reservation or is finished 121 log.V(5).Info("workload with no reservation, delete owned requests") 122 return reconcile.Result{}, c.deleteOwnedProvisionRequests(ctx, req.Namespace, req.Name) 123 } 124 125 // get the lists of relevant checks 126 relevantChecks, err := admissioncheck.FilterForController(ctx, c.client, wl.Status.AdmissionChecks, ControllerName) 127 if err != nil { 128 return reconcile.Result{}, err 129 } 130 131 list := &autoscaling.ProvisioningRequestList{} 132 if err := c.client.List(ctx, list, client.InNamespace(wl.Namespace), client.MatchingFields{RequestsOwnedByWorkloadKey: wl.Name}); client.IgnoreNotFound(err) != nil { 133 return reconcile.Result{}, err 134 } 135 ownedPrs := list.Items 136 activeOrLastPRForChecks := c.activeOrLastPRForChecks(ctx, wl, relevantChecks, ownedPrs) 137 138 if workload.IsAdmitted(wl) { 139 // check the state of the provision requests, eventually toggle the checks to false 140 // otherwise there is nothing to here 141 log.V(5).Info("workload admitted, sync checks") 142 return reconcile.Result{}, c.syncCheckStates(ctx, wl, relevantChecks, activeOrLastPRForChecks) 143 } 144 145 err = c.deleteUnusedProvisioningRequests(ctx, ownedPrs, activeOrLastPRForChecks) 146 if err != nil { 147 log.V(2).Error(err, "syncOwnedProvisionRequest failed to delete unused provisioning requests") 148 return reconcile.Result{}, err 149 } 150 151 requeAfter, err := c.syncOwnedProvisionRequest(ctx, wl, relevantChecks, activeOrLastPRForChecks) 152 if err != nil { 153 // this can also delete unneeded checks 154 log.V(2).Error(err, "syncOwnedProvisionRequest failed") 155 return reconcile.Result{}, err 156 } 157 158 err = c.syncCheckStates(ctx, wl, relevantChecks, activeOrLastPRForChecks) 159 if err != nil { 160 return reconcile.Result{}, err 161 } 162 if requeAfter != nil { 163 return reconcile.Result{RequeueAfter: *requeAfter}, nil 164 } 165 return reconcile.Result{}, nil 166 } 167 168 func (c *Controller) activeOrLastPRForChecks(ctx context.Context, wl *kueue.Workload, relevantChecks []string, ownedPrs []autoscaling.ProvisioningRequest) map[string]*autoscaling.ProvisioningRequest { 169 activeOrLastPRForChecks := make(map[string]*autoscaling.ProvisioningRequest) 170 for _, checkName := range relevantChecks { 171 for i := range ownedPrs { 172 req := &ownedPrs[i] 173 // PRs relevant for the admission check 174 if matches(req, wl.Name, checkName) { 175 prc, err := c.helper.ConfigForAdmissionCheck(ctx, checkName) 176 if err == nil && c.reqIsNeeded(ctx, wl, prc) && requestHasParamaters(req, prc) { 177 if currPr, exists := activeOrLastPRForChecks[checkName]; !exists || getAttempt(ctx, currPr, wl.Name, checkName) < getAttempt(ctx, req, wl.Name, checkName) { 178 activeOrLastPRForChecks[checkName] = req 179 } 180 } 181 } 182 } 183 } 184 return activeOrLastPRForChecks 185 } 186 187 func (c *Controller) deleteUnusedProvisioningRequests(ctx context.Context, ownedPrs []autoscaling.ProvisioningRequest, activeOrLastPRForChecks map[string]*autoscaling.ProvisioningRequest) error { 188 log := ctrl.LoggerFrom(ctx) 189 prNames := sets.New[string]() 190 for _, pr := range activeOrLastPRForChecks { 191 prNames.Insert(pr.Name) 192 } 193 for _, pr := range ownedPrs { 194 req := &pr 195 if !prNames.Has(req.Name) { 196 if err := c.client.Delete(ctx, req); client.IgnoreNotFound(err) != nil { 197 log.V(5).Error(err, "deleting the request", "req", klog.KObj(req)) 198 return err 199 } 200 } 201 } 202 return nil 203 } 204 205 func (c *Controller) deleteOwnedProvisionRequests(ctx context.Context, namespace string, name string) error { 206 list := &autoscaling.ProvisioningRequestList{} 207 if err := c.client.List(ctx, list, client.InNamespace(namespace), client.MatchingFields{RequestsOwnedByWorkloadKey: name}); err != nil { 208 return client.IgnoreNotFound(err) 209 } 210 211 for i := range list.Items { 212 if err := c.client.Delete(ctx, &list.Items[i]); client.IgnoreNotFound(err) != nil { 213 return fmt.Errorf("delete requests for %s/%s: %w", namespace, name, err) 214 } 215 } 216 return nil 217 } 218 219 func (c *Controller) syncOwnedProvisionRequest(ctx context.Context, wl *kueue.Workload, relevantChecks []string, activeOrLastPRForChecks map[string]*autoscaling.ProvisioningRequest) (*time.Duration, error) { 220 log := ctrl.LoggerFrom(ctx) 221 var requeAfter *time.Duration 222 for _, checkName := range relevantChecks { 223 //get the config 224 prc, err := c.helper.ConfigForAdmissionCheck(ctx, checkName) 225 if err != nil { 226 // the check is not active 227 continue 228 } 229 if !c.reqIsNeeded(ctx, wl, prc) { 230 continue 231 } 232 if ac := workload.FindAdmissionCheck(wl.Status.AdmissionChecks, checkName); ac != nil && ac.State == kueue.CheckStateReady { 233 log.V(2).Info("Skip syncing of the ProvReq for admission check which is Ready", "workload", klog.KObj(wl), "admissionCheck", checkName) 234 continue 235 } 236 237 oldPr, exists := activeOrLastPRForChecks[checkName] 238 attempt := int32(1) 239 shouldCreatePr := false 240 if exists { 241 attempt = getAttempt(ctx, oldPr, wl.Name, checkName) 242 if apimeta.IsStatusConditionTrue(oldPr.Status.Conditions, autoscaling.Failed) { 243 if attempt <= MaxRetries { 244 prFailed := apimeta.FindStatusCondition(oldPr.Status.Conditions, autoscaling.Failed) 245 remainingTime := remainingTime(prc, attempt, prFailed.LastTransitionTime.Time) 246 if remainingTime <= 0 { 247 shouldCreatePr = true 248 attempt += 1 249 } else if requeAfter == nil || remainingTime < *requeAfter { 250 requeAfter = &remainingTime 251 } 252 } 253 } 254 } else { 255 shouldCreatePr = true 256 } 257 requestName := GetProvisioningRequestName(wl.Name, checkName, attempt) 258 if shouldCreatePr { 259 log.V(3).Info("Creating ProvisioningRequest", "requestName", requestName, "attempt", attempt) 260 req := &autoscaling.ProvisioningRequest{ 261 ObjectMeta: metav1.ObjectMeta{ 262 Name: requestName, 263 Namespace: wl.Namespace, 264 }, 265 Spec: autoscaling.ProvisioningRequestSpec{ 266 ProvisioningClassName: prc.Spec.ProvisioningClassName, 267 Parameters: parametersKueueToProvisioning(prc.Spec.Parameters), 268 }, 269 } 270 271 expectedPodSets := requiredPodSets(wl.Spec.PodSets, prc.Spec.ManagedResources) 272 psaMap := slices.ToRefMap(wl.Status.Admission.PodSetAssignments, func(p *kueue.PodSetAssignment) string { return p.Name }) 273 podSetMap := slices.ToRefMap(wl.Spec.PodSets, func(ps *kueue.PodSet) string { return ps.Name }) 274 for _, psName := range expectedPodSets { 275 ps, psFound := podSetMap[psName] 276 psa, psaFound := psaMap[psName] 277 if !psFound || !psaFound { 278 return nil, errInconsistentPodSetAssignments 279 } 280 req.Spec.PodSets = append(req.Spec.PodSets, autoscaling.PodSet{ 281 PodTemplateRef: autoscaling.Reference{ 282 Name: getProvisioningRequestPodTemplateName(requestName, psName), 283 }, 284 Count: ptr.Deref(psa.Count, ps.Count), 285 }) 286 } 287 288 if err := ctrl.SetControllerReference(wl, req, c.client.Scheme()); err != nil { 289 return nil, err 290 } 291 292 if err := c.client.Create(ctx, req); err != nil { 293 return nil, err 294 } 295 c.record.Eventf(wl, corev1.EventTypeNormal, "ProvisioningRequestCreated", "Created ProvisioningRequest: %q", req.Name) 296 activeOrLastPRForChecks[checkName] = req 297 } 298 if err := c.syncProvisionRequestsPodTemplates(ctx, wl, requestName, prc); err != nil { 299 return nil, err 300 } 301 } 302 return requeAfter, nil 303 } 304 305 func (c *Controller) syncProvisionRequestsPodTemplates(ctx context.Context, wl *kueue.Workload, prName string, prc *kueue.ProvisioningRequestConfig) error { 306 request := &autoscaling.ProvisioningRequest{} 307 requestKey := types.NamespacedName{ 308 Name: prName, 309 Namespace: wl.Namespace, 310 } 311 err := c.client.Get(ctx, requestKey, request) 312 if err != nil { 313 return client.IgnoreNotFound(err) 314 } 315 316 expectedPodSets := requiredPodSets(wl.Spec.PodSets, prc.Spec.ManagedResources) 317 podsetRefsMap := slices.ToMap(expectedPodSets, func(i int) (string, string) { 318 return getProvisioningRequestPodTemplateName(prName, expectedPodSets[i]), expectedPodSets[i] 319 }) 320 321 // the order of the podSets should be the same in the workload and prov. req. 322 // if the number is different, just delete the request 323 if len(request.Spec.PodSets) != len(expectedPodSets) { 324 return c.client.Delete(ctx, request) 325 } 326 327 psaMap := slices.ToRefMap(wl.Status.Admission.PodSetAssignments, func(p *kueue.PodSetAssignment) string { return p.Name }) 328 podSetMap := slices.ToRefMap(wl.Spec.PodSets, func(ps *kueue.PodSet) string { return ps.Name }) 329 330 for i := range request.Spec.PodSets { 331 reqPS := &request.Spec.PodSets[i] 332 psName, refFound := podsetRefsMap[reqPS.PodTemplateRef.Name] 333 ps, psFound := podSetMap[psName] 334 psa, psaFound := psaMap[psName] 335 336 if !refFound || !psFound || !psaFound || ptr.Deref(psa.Count, 0) != reqPS.Count { 337 return c.client.Delete(ctx, request) 338 } 339 340 pt := &corev1.PodTemplate{} 341 ptKey := types.NamespacedName{ 342 Namespace: request.Namespace, 343 Name: reqPS.PodTemplateRef.Name, 344 } 345 346 err := c.client.Get(ctx, ptKey, pt) 347 348 if client.IgnoreNotFound(err) != nil { 349 return err 350 } 351 352 if err != nil { 353 // it's a not found, so create it 354 newPt := &corev1.PodTemplate{ 355 ObjectMeta: metav1.ObjectMeta{ 356 Name: ptKey.Name, 357 Namespace: ptKey.Namespace, 358 }, 359 Template: ps.Template, 360 } 361 362 // apply the admission node selectors to the Template 363 psi, err := podset.FromAssignment(ctx, c.client, psaMap[psName], reqPS.Count) 364 if err != nil { 365 return err 366 } 367 368 err = podset.Merge(&newPt.Template.ObjectMeta, &newPt.Template.Spec, psi) 369 if err != nil { 370 return err 371 } 372 373 if err := ctrl.SetControllerReference(request, newPt, c.client.Scheme()); err != nil { 374 return err 375 } 376 377 if err = c.client.Create(ctx, newPt); err != nil { 378 return err 379 } 380 } 381 // maybe check the consistency deeper 382 } 383 return nil 384 } 385 386 func (c *Controller) reqIsNeeded(ctx context.Context, wl *kueue.Workload, prc *kueue.ProvisioningRequestConfig) bool { 387 return len(requiredPodSets(wl.Spec.PodSets, prc.Spec.ManagedResources)) > 0 388 } 389 390 func requiredPodSets(podSets []kueue.PodSet, resources []corev1.ResourceName) []string { 391 resourcesSet := sets.New(resources...) 392 users := make([]string, 0, len(podSets)) 393 for i := range podSets { 394 ps := &podSets[i] 395 if len(resources) == 0 || podUses(&ps.Template.Spec, resourcesSet) { 396 users = append(users, ps.Name) 397 } 398 } 399 return users 400 } 401 402 func podUses(pod *corev1.PodSpec, resourceSet sets.Set[corev1.ResourceName]) bool { 403 for i := range pod.InitContainers { 404 if containerUses(&pod.InitContainers[i], resourceSet) { 405 return true 406 } 407 } 408 for i := range pod.Containers { 409 if containerUses(&pod.Containers[i], resourceSet) { 410 return true 411 } 412 } 413 return false 414 } 415 416 func containerUses(cont *corev1.Container, resourceSet sets.Set[corev1.ResourceName]) bool { 417 for r := range cont.Resources.Requests { 418 if resourceSet.Has(r) { 419 return true 420 } 421 } 422 return false 423 } 424 425 func parametersKueueToProvisioning(in map[string]kueue.Parameter) map[string]autoscaling.Parameter { 426 if in == nil { 427 return nil 428 } 429 430 out := make(map[string]autoscaling.Parameter, len(in)) 431 for k, v := range in { 432 out[k] = autoscaling.Parameter(v) 433 } 434 return out 435 } 436 437 func requestHasParamaters(req *autoscaling.ProvisioningRequest, prc *kueue.ProvisioningRequestConfig) bool { 438 if req.Spec.ProvisioningClassName != prc.Spec.ProvisioningClassName { 439 return false 440 } 441 if len(req.Spec.Parameters) != len(prc.Spec.Parameters) { 442 return false 443 } 444 for k, vReq := range req.Spec.Parameters { 445 if vCfg, found := prc.Spec.Parameters[k]; !found || vReq != autoscaling.Parameter(vCfg) { 446 return false 447 } 448 } 449 return true 450 } 451 452 func (c *Controller) syncCheckStates(ctx context.Context, wl *kueue.Workload, checks []string, activeOrLastPRForChecks map[string]*autoscaling.ProvisioningRequest) error { 453 log := ctrl.LoggerFrom(ctx) 454 checksMap := slices.ToRefMap(wl.Status.AdmissionChecks, func(c *kueue.AdmissionCheckState) string { return c.Name }) 455 wlPatch := workload.BaseSSAWorkload(wl) 456 recorderMessages := make([]string, 0, len(checks)) 457 updated := false 458 for _, check := range checks { 459 checkState := *checksMap[check] 460 if prc, err := c.helper.ConfigForAdmissionCheck(ctx, check); err != nil { 461 // the check is not active 462 if checkState.State != kueue.CheckStatePending || checkState.Message != CheckInactiveMessage { 463 updated = true 464 checkState.State = kueue.CheckStatePending 465 checkState.Message = CheckInactiveMessage 466 } 467 } else if !c.reqIsNeeded(ctx, wl, prc) { 468 if checkState.State != kueue.CheckStateReady { 469 updated = true 470 checkState.State = kueue.CheckStateReady 471 checkState.Message = NoRequestNeeded 472 checkState.PodSetUpdates = nil 473 } 474 } else { 475 pr := activeOrLastPRForChecks[check] 476 if pr == nil { 477 return nil 478 } 479 480 prFailed := apimeta.IsStatusConditionTrue(pr.Status.Conditions, autoscaling.Failed) 481 prProvisioned := apimeta.IsStatusConditionTrue(pr.Status.Conditions, autoscaling.Provisioned) 482 log.V(3).Info("Synchronizing admission check state based on provisioning request", "wl", klog.KObj(wl), "check", check, "prName", pr.Name, "failed", prFailed, "accepted", prProvisioned) 483 484 switch { 485 case prFailed: 486 if checkState.State != kueue.CheckStateRejected { 487 if attempt := getAttempt(ctx, pr, wl.Name, check); attempt <= MaxRetries { 488 // it is going to be retried 489 message := fmt.Sprintf("Retrying after failure: %s", apimeta.FindStatusCondition(pr.Status.Conditions, autoscaling.Failed).Message) 490 updated = updated || checkState.State != kueue.CheckStatePending || checkState.Message != message 491 checkState.State = kueue.CheckStatePending 492 checkState.Message = message 493 } else { 494 updated = true 495 checkState.State = kueue.CheckStateRejected 496 checkState.Message = apimeta.FindStatusCondition(pr.Status.Conditions, autoscaling.Failed).Message 497 } 498 } 499 case prProvisioned: 500 if checkState.State != kueue.CheckStateReady { 501 updated = true 502 checkState.State = kueue.CheckStateReady 503 // add the pod podSetUpdates 504 checkState.PodSetUpdates = podSetUpdates(wl, pr) 505 } 506 default: 507 if checkState.State != kueue.CheckStatePending { 508 updated = true 509 checkState.State = kueue.CheckStatePending 510 } 511 } 512 } 513 514 existingCondition := workload.FindAdmissionCheck(wlPatch.Status.AdmissionChecks, checkState.Name) 515 if existingCondition != nil && existingCondition.State != checkState.State { 516 message := fmt.Sprintf("Admission check %s updated state from %s to %s", checkState.Name, existingCondition.State, checkState.State) 517 if checkState.Message != "" { 518 message += fmt.Sprintf(" with message %s", checkState.Message) 519 } 520 recorderMessages = append(recorderMessages, message) 521 } 522 523 workload.SetAdmissionCheckState(&wlPatch.Status.AdmissionChecks, checkState) 524 } 525 if updated { 526 if err := c.client.Status().Patch(ctx, wlPatch, client.Apply, client.FieldOwner(ControllerName), client.ForceOwnership); err != nil { 527 return err 528 } 529 for i := range recorderMessages { 530 c.record.Event(wl, corev1.EventTypeNormal, "AdmissionCheckUpdated", api.TruncateEventMessage(recorderMessages[i])) 531 } 532 } 533 return nil 534 } 535 536 func podSetUpdates(wl *kueue.Workload, pr *autoscaling.ProvisioningRequest) []kueue.PodSetUpdate { 537 podSets := wl.Spec.PodSets 538 refMap := slices.ToMap(podSets, func(i int) (string, string) { 539 return getProvisioningRequestPodTemplateName(pr.Name, podSets[i].Name), podSets[i].Name 540 }) 541 return slices.Map(pr.Spec.PodSets, func(ps *autoscaling.PodSet) kueue.PodSetUpdate { 542 return kueue.PodSetUpdate{ 543 Name: refMap[ps.PodTemplateRef.Name], 544 Annotations: map[string]string{ConsumesAnnotationKey: pr.Name}, 545 } 546 }) 547 } 548 549 type acHandler struct { 550 client client.Client 551 } 552 553 var _ handler.EventHandler = (*acHandler)(nil) 554 555 func (a *acHandler) Create(ctx context.Context, event event.CreateEvent, q workqueue.RateLimitingInterface) { 556 ac, isAc := event.Object.(*kueue.AdmissionCheck) 557 if !isAc { 558 return 559 } 560 561 if ac.Spec.ControllerName == ControllerName { 562 err := a.reconcileWorkloadsUsing(ctx, ac.Name, q) 563 if err != nil { 564 ctrl.LoggerFrom(ctx).V(5).Error(err, "Failure on create event", "admissionCheck", klog.KObj(ac)) 565 } 566 } 567 } 568 569 func (a *acHandler) Update(ctx context.Context, event event.UpdateEvent, q workqueue.RateLimitingInterface) { 570 oldAc, isOldAc := event.ObjectOld.(*kueue.AdmissionCheck) 571 newAc, isNewAc := event.ObjectNew.(*kueue.AdmissionCheck) 572 if !isNewAc || !isOldAc { 573 return 574 } 575 576 if oldAc.Spec.ControllerName == ControllerName || newAc.Spec.ControllerName == ControllerName { 577 err := a.reconcileWorkloadsUsing(ctx, oldAc.Name, q) 578 if err != nil { 579 ctrl.LoggerFrom(ctx).V(5).Error(err, "Failure on update event", "admissionCheck", klog.KObj(oldAc)) 580 } 581 } 582 } 583 584 func (a *acHandler) Delete(ctx context.Context, event event.DeleteEvent, q workqueue.RateLimitingInterface) { 585 ac, isAc := event.Object.(*kueue.AdmissionCheck) 586 if !isAc { 587 return 588 } 589 590 if ac.Spec.ControllerName == ControllerName { 591 err := a.reconcileWorkloadsUsing(ctx, ac.Name, q) 592 if err != nil { 593 ctrl.LoggerFrom(ctx).V(5).Error(err, "Failure on delete event", "admissionCheck", klog.KObj(ac)) 594 } 595 } 596 } 597 598 func (a *acHandler) Generic(_ context.Context, _ event.GenericEvent, _ workqueue.RateLimitingInterface) { 599 // nothing to do for now 600 } 601 602 func (a *acHandler) reconcileWorkloadsUsing(ctx context.Context, check string, q workqueue.RateLimitingInterface) error { 603 list := &kueue.WorkloadList{} 604 if err := a.client.List(ctx, list, client.MatchingFields{WorkloadsWithAdmissionCheckKey: check}); client.IgnoreNotFound(err) != nil { 605 return err 606 } 607 608 for i := range list.Items { 609 wl := &list.Items[i] 610 req := reconcile.Request{ 611 NamespacedName: types.NamespacedName{ 612 Name: wl.Name, 613 Namespace: wl.Namespace, 614 }, 615 } 616 q.Add(req) 617 } 618 619 return nil 620 } 621 622 type prcHandler struct { 623 client client.Client 624 acHandlerOverride func(ctx context.Context, config string, q workqueue.RateLimitingInterface) error 625 } 626 627 var _ handler.EventHandler = (*prcHandler)(nil) 628 629 func (p *prcHandler) Create(ctx context.Context, event event.CreateEvent, q workqueue.RateLimitingInterface) { 630 prc, isPRC := event.Object.(*kueue.ProvisioningRequestConfig) 631 if !isPRC { 632 return 633 } 634 err := p.reconcileWorkloadsUsing(ctx, prc.Name, q) 635 if err != nil { 636 ctrl.LoggerFrom(ctx).V(5).Error(err, "Failure on create event", "provisioningRequestConfig", klog.KObj(prc)) 637 } 638 } 639 640 func (p *prcHandler) Update(ctx context.Context, event event.UpdateEvent, q workqueue.RateLimitingInterface) { 641 oldPRC, isOldPRC := event.ObjectOld.(*kueue.ProvisioningRequestConfig) 642 newPRC, isNewPRC := event.ObjectNew.(*kueue.ProvisioningRequestConfig) 643 if !isNewPRC || !isOldPRC { 644 return 645 } 646 647 if oldPRC.Spec.ProvisioningClassName != newPRC.Spec.ProvisioningClassName || !maps.Equal(oldPRC.Spec.Parameters, newPRC.Spec.Parameters) || !slices.CmpNoOrder(oldPRC.Spec.ManagedResources, newPRC.Spec.ManagedResources) { 648 err := p.reconcileWorkloadsUsing(ctx, oldPRC.Name, q) 649 if err != nil { 650 ctrl.LoggerFrom(ctx).V(5).Error(err, "Failure on update event", "provisioningRequestConfig", klog.KObj(oldPRC)) 651 } 652 } 653 } 654 655 func (p *prcHandler) Delete(ctx context.Context, event event.DeleteEvent, q workqueue.RateLimitingInterface) { 656 prc, isPRC := event.Object.(*kueue.ProvisioningRequestConfig) 657 if !isPRC { 658 return 659 } 660 err := p.reconcileWorkloadsUsing(ctx, prc.Name, q) 661 if err != nil { 662 ctrl.LoggerFrom(ctx).V(5).Error(err, "Failure on delete event", "provisioningRequestConfig", klog.KObj(prc)) 663 } 664 } 665 666 func (p *prcHandler) Generic(_ context.Context, _ event.GenericEvent, _ workqueue.RateLimitingInterface) { 667 // nothing to do for now 668 } 669 670 func (p *prcHandler) reconcileWorkloadsUsing(ctx context.Context, config string, q workqueue.RateLimitingInterface) error { 671 list := &kueue.AdmissionCheckList{} 672 if err := p.client.List(ctx, list, client.MatchingFields{AdmissionCheckUsingConfigKey: config}); client.IgnoreNotFound(err) != nil { 673 return err 674 } 675 users := slices.Map(list.Items, func(ac *kueue.AdmissionCheck) string { return ac.Name }) 676 for _, user := range users { 677 if p.acHandlerOverride != nil { 678 if err := p.acHandlerOverride(ctx, user, q); err != nil { 679 return err 680 } 681 } else { 682 req := reconcile.Request{ 683 NamespacedName: types.NamespacedName{ 684 Name: user, 685 }, 686 } 687 q.Add(req) 688 } 689 } 690 return nil 691 } 692 693 func (c *Controller) SetupWithManager(mgr ctrl.Manager) error { 694 ach := &acHandler{ 695 client: c.client, 696 } 697 prch := &prcHandler{ 698 client: c.client, 699 acHandlerOverride: ach.reconcileWorkloadsUsing, 700 } 701 err := ctrl.NewControllerManagedBy(mgr). 702 For(&kueue.Workload{}). 703 Owns(&autoscaling.ProvisioningRequest{}). 704 Watches(&kueue.AdmissionCheck{}, ach). 705 Watches(&kueue.ProvisioningRequestConfig{}, prch). 706 Complete(c) 707 if err != nil { 708 return err 709 } 710 711 prcACh := &prcHandler{ 712 client: c.client, 713 } 714 acReconciler := &acReconciler{ 715 client: c.client, 716 helper: c.helper, 717 } 718 719 return ctrl.NewControllerManagedBy(mgr). 720 For(&kueue.AdmissionCheck{}). 721 Watches(&kueue.ProvisioningRequestConfig{}, prcACh). 722 Complete(acReconciler) 723 } 724 725 func GetProvisioningRequestName(workloadName, checkName string, attempt int32) string { 726 fullName := fmt.Sprintf("%s-%s-%d", workloadName, checkName, int(attempt)) 727 return limitObjectName(fullName) 728 } 729 730 func getProvisioningRequestNamePrefix(workloadName, checkName string) string { 731 fullName := fmt.Sprintf("%s-%s-", workloadName, checkName) 732 return limitObjectName(fullName) 733 } 734 735 func getProvisioningRequestPodTemplateName(prName, podsetName string) string { 736 fullName := fmt.Sprintf("%s-%s-%s", podTemplatesPrefix, prName, podsetName) 737 return limitObjectName(fullName) 738 } 739 740 func limitObjectName(fullName string) string { 741 if len(fullName) <= objNameMaxPrefixLength { 742 return fullName 743 } 744 h := sha1.New() 745 h.Write([]byte(fullName)) 746 hashBytes := hex.EncodeToString(h.Sum(nil)) 747 return fmt.Sprintf("%s-%s", fullName[:objNameMaxPrefixLength], hashBytes[:objNameHashLength]) 748 } 749 750 func matches(pr *autoscaling.ProvisioningRequest, workloadName, checkName string) bool { 751 attemptRegex := getAttemptRegex(workloadName, checkName) 752 matches := attemptRegex.FindStringSubmatch(pr.Name) 753 return len(matches) > 0 754 } 755 756 func getAttempt(ctx context.Context, pr *autoscaling.ProvisioningRequest, workloadName, checkName string) int32 { 757 logger := log.FromContext(ctx) 758 attemptRegex := getAttemptRegex(workloadName, checkName) 759 matches := attemptRegex.FindStringSubmatch(pr.Name) 760 if len(matches) > 0 { 761 number, err := strconv.Atoi(matches[1]) 762 if err != nil { 763 logger.Error(err, "Parsing the attempt number from provisioning request", "requestName", pr.Name) 764 return 1 765 } else { 766 return int32(number) 767 } 768 } else { 769 logger.Info("No attempt suffix in provisioning request", "requestName", pr.Name) 770 return 1 771 } 772 } 773 774 func getAttemptRegex(workloadName, checkName string) *regexp.Regexp { 775 prefix := getProvisioningRequestNamePrefix(workloadName, checkName) 776 escapedPrefix := regexp.QuoteMeta(prefix) 777 return regexp.MustCompile("^" + escapedPrefix + "([0-9]+)$") 778 } 779 780 func remainingTime(prc *kueue.ProvisioningRequestConfig, failuresCount int32, lastFailureTime time.Time) time.Duration { 781 defaultBackoff := time.Duration(MinBackoffSeconds) * time.Second 782 maxBackoff := 30 * time.Minute 783 backoffDuration := defaultBackoff 784 for i := 1; i < int(failuresCount); i++ { 785 backoffDuration = backoffDuration * 2 786 if backoffDuration >= maxBackoff { 787 backoffDuration = maxBackoff 788 break 789 } 790 } 791 timeElapsedSinceLastFailure := time.Since(lastFailureTime) 792 return backoffDuration - timeElapsedSinceLastFailure 793 }