sigs.k8s.io/kueue@v0.6.2/pkg/controller/core/clusterqueue_controller.go (about) 1 /* 2 Copyright 2022 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package core 18 19 import ( 20 "context" 21 "time" 22 23 "github.com/go-logr/logr" 24 corev1 "k8s.io/api/core/v1" 25 "k8s.io/apimachinery/pkg/api/equality" 26 "k8s.io/apimachinery/pkg/api/meta" 27 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 28 "k8s.io/apimachinery/pkg/types" 29 "k8s.io/apimachinery/pkg/util/sets" 30 "k8s.io/apimachinery/pkg/util/wait" 31 "k8s.io/client-go/util/workqueue" 32 "k8s.io/klog/v2" 33 "k8s.io/utils/ptr" 34 ctrl "sigs.k8s.io/controller-runtime" 35 "sigs.k8s.io/controller-runtime/pkg/client" 36 "sigs.k8s.io/controller-runtime/pkg/controller" 37 "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" 38 "sigs.k8s.io/controller-runtime/pkg/event" 39 "sigs.k8s.io/controller-runtime/pkg/reconcile" 40 "sigs.k8s.io/controller-runtime/pkg/source" 41 42 config "sigs.k8s.io/kueue/apis/config/v1beta1" 43 kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1" 44 "sigs.k8s.io/kueue/pkg/cache" 45 "sigs.k8s.io/kueue/pkg/constants" 46 "sigs.k8s.io/kueue/pkg/features" 47 "sigs.k8s.io/kueue/pkg/metrics" 48 "sigs.k8s.io/kueue/pkg/queue" 49 "sigs.k8s.io/kueue/pkg/util/resource" 50 "sigs.k8s.io/kueue/pkg/util/slices" 51 "sigs.k8s.io/kueue/pkg/workload" 52 ) 53 54 const snapshotWorkers = 5 55 56 type ClusterQueueUpdateWatcher interface { 57 NotifyClusterQueueUpdate(*kueue.ClusterQueue, *kueue.ClusterQueue) 58 } 59 60 // ClusterQueueReconciler reconciles a ClusterQueue object 61 type ClusterQueueReconciler struct { 62 client client.Client 63 log logr.Logger 64 qManager *queue.Manager 65 cache *cache.Cache 66 snapshotsQueue workqueue.Interface 67 wlUpdateCh chan event.GenericEvent 68 rfUpdateCh chan event.GenericEvent 69 acUpdateCh chan event.GenericEvent 70 snapUpdateCh chan event.GenericEvent 71 watchers []ClusterQueueUpdateWatcher 72 reportResourceMetrics bool 73 queueVisibilityUpdateInterval time.Duration 74 queueVisibilityClusterQueuesMaxCount int32 75 } 76 77 type ClusterQueueReconcilerOptions struct { 78 Watchers []ClusterQueueUpdateWatcher 79 ReportResourceMetrics bool 80 QueueVisibilityUpdateInterval time.Duration 81 QueueVisibilityClusterQueuesMaxCount int32 82 } 83 84 // ClusterQueueReconcilerOption configures the reconciler. 85 type ClusterQueueReconcilerOption func(*ClusterQueueReconcilerOptions) 86 87 func WithWatchers(watchers ...ClusterQueueUpdateWatcher) ClusterQueueReconcilerOption { 88 return func(o *ClusterQueueReconcilerOptions) { 89 o.Watchers = watchers 90 } 91 } 92 93 func WithReportResourceMetrics(report bool) ClusterQueueReconcilerOption { 94 return func(o *ClusterQueueReconcilerOptions) { 95 o.ReportResourceMetrics = report 96 } 97 } 98 99 // WithQueueVisibilityUpdateInterval specifies the time interval for updates to the structure 100 // of the top pending workloads in the queues. 101 func WithQueueVisibilityUpdateInterval(interval time.Duration) ClusterQueueReconcilerOption { 102 return func(o *ClusterQueueReconcilerOptions) { 103 o.QueueVisibilityUpdateInterval = interval 104 } 105 } 106 107 // WithQueueVisibilityClusterQueuesMaxCount indicates the maximal number of pending workloads exposed in the 108 // cluster queue status 109 func WithQueueVisibilityClusterQueuesMaxCount(value int32) ClusterQueueReconcilerOption { 110 return func(o *ClusterQueueReconcilerOptions) { 111 o.QueueVisibilityClusterQueuesMaxCount = value 112 } 113 } 114 115 var defaultCQOptions = ClusterQueueReconcilerOptions{} 116 117 func NewClusterQueueReconciler( 118 client client.Client, 119 qMgr *queue.Manager, 120 cache *cache.Cache, 121 opts ...ClusterQueueReconcilerOption, 122 ) *ClusterQueueReconciler { 123 options := defaultCQOptions 124 for _, opt := range opts { 125 opt(&options) 126 } 127 return &ClusterQueueReconciler{ 128 client: client, 129 log: ctrl.Log.WithName("cluster-queue-reconciler"), 130 qManager: qMgr, 131 cache: cache, 132 snapshotsQueue: workqueue.New(), 133 wlUpdateCh: make(chan event.GenericEvent, updateChBuffer), 134 rfUpdateCh: make(chan event.GenericEvent, updateChBuffer), 135 acUpdateCh: make(chan event.GenericEvent, updateChBuffer), 136 snapUpdateCh: make(chan event.GenericEvent, updateChBuffer), 137 watchers: options.Watchers, 138 reportResourceMetrics: options.ReportResourceMetrics, 139 queueVisibilityUpdateInterval: options.QueueVisibilityUpdateInterval, 140 queueVisibilityClusterQueuesMaxCount: options.QueueVisibilityClusterQueuesMaxCount, 141 } 142 } 143 144 // +kubebuilder:rbac:groups="",resources=namespaces,verbs=get;list;watch 145 // +kubebuilder:rbac:groups="",resources=events,verbs=create;watch;update;patch 146 // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=clusterqueues,verbs=get;list;watch;create;update;patch;delete 147 // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=clusterqueues/status,verbs=get;update;patch 148 // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=clusterqueues/finalizers,verbs=update 149 150 func (r *ClusterQueueReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { 151 var cqObj kueue.ClusterQueue 152 if err := r.client.Get(ctx, req.NamespacedName, &cqObj); err != nil { 153 // we'll ignore not-found errors, since there is nothing to do. 154 return ctrl.Result{}, client.IgnoreNotFound(err) 155 } 156 log := ctrl.LoggerFrom(ctx).WithValues("clusterQueue", klog.KObj(&cqObj)) 157 ctx = ctrl.LoggerInto(ctx, log) 158 log.V(2).Info("Reconciling ClusterQueue") 159 160 if cqObj.ObjectMeta.DeletionTimestamp.IsZero() { 161 // Although we'll add the finalizer via webhook mutation now, this is still useful 162 // as a fallback. 163 if !controllerutil.ContainsFinalizer(&cqObj, kueue.ResourceInUseFinalizerName) { 164 controllerutil.AddFinalizer(&cqObj, kueue.ResourceInUseFinalizerName) 165 if err := r.client.Update(ctx, &cqObj); err != nil { 166 return ctrl.Result{}, client.IgnoreNotFound(err) 167 } 168 } 169 } else { 170 if !r.cache.ClusterQueueTerminating(cqObj.Name) { 171 r.cache.TerminateClusterQueue(cqObj.Name) 172 } 173 174 if controllerutil.ContainsFinalizer(&cqObj, kueue.ResourceInUseFinalizerName) { 175 // The clusterQueue is being deleted, remove the finalizer only if 176 // there are no active reserving workloads. 177 if r.cache.ClusterQueueEmpty(cqObj.Name) { 178 controllerutil.RemoveFinalizer(&cqObj, kueue.ResourceInUseFinalizerName) 179 if err := r.client.Update(ctx, &cqObj); err != nil { 180 return ctrl.Result{}, client.IgnoreNotFound(err) 181 } 182 } 183 return ctrl.Result{}, nil 184 } 185 } 186 187 newCQObj := cqObj.DeepCopy() 188 cqCondition, reason, msg := r.cache.ClusterQueueReadiness(newCQObj.Name) 189 if err := r.updateCqStatusIfChanged(ctx, newCQObj, cqCondition, reason, msg); err != nil { 190 return ctrl.Result{}, client.IgnoreNotFound(err) 191 } 192 return ctrl.Result{}, nil 193 } 194 195 func (r *ClusterQueueReconciler) NotifyWorkloadUpdate(oldWl, newWl *kueue.Workload) { 196 if oldWl != nil { 197 r.wlUpdateCh <- event.GenericEvent{Object: oldWl} 198 if newWl != nil && oldWl.Spec.QueueName != newWl.Spec.QueueName { 199 r.wlUpdateCh <- event.GenericEvent{Object: newWl} 200 } 201 return 202 } 203 if newWl != nil { 204 r.wlUpdateCh <- event.GenericEvent{Object: newWl} 205 } 206 } 207 208 func (r *ClusterQueueReconciler) notifyWatchers(oldCQ, newCQ *kueue.ClusterQueue) { 209 for _, w := range r.watchers { 210 w.NotifyClusterQueueUpdate(oldCQ, newCQ) 211 } 212 } 213 214 // NotifyResourceFlavorUpdate ignores updates since they have no impact on the ClusterQueue's readiness. 215 func (r *ClusterQueueReconciler) NotifyResourceFlavorUpdate(oldRF, newRF *kueue.ResourceFlavor) { 216 // if oldRF is nil, it's a create event. 217 if oldRF == nil { 218 r.rfUpdateCh <- event.GenericEvent{Object: newRF} 219 return 220 } 221 222 // if newRF is nil, it's a delete event. 223 if newRF == nil { 224 r.rfUpdateCh <- event.GenericEvent{Object: oldRF} 225 return 226 } 227 } 228 229 func (r *ClusterQueueReconciler) NotifyAdmissionCheckUpdate(oldAc, newAc *kueue.AdmissionCheck) { 230 switch { 231 case oldAc != nil: 232 r.acUpdateCh <- event.GenericEvent{Object: oldAc} 233 case newAc != nil: 234 r.acUpdateCh <- event.GenericEvent{Object: newAc} 235 } 236 } 237 238 // Event handlers return true to signal the controller to reconcile the 239 // ClusterQueue associated with the event. 240 241 func (r *ClusterQueueReconciler) Create(e event.CreateEvent) bool { 242 cq, match := e.Object.(*kueue.ClusterQueue) 243 if !match { 244 // No need to interact with the cache for other objects. 245 return true 246 } 247 defer r.notifyWatchers(nil, cq) 248 249 log := r.log.WithValues("clusterQueue", klog.KObj(cq)) 250 log.V(2).Info("ClusterQueue create event") 251 ctx := ctrl.LoggerInto(context.Background(), log) 252 if err := r.cache.AddClusterQueue(ctx, cq); err != nil { 253 log.Error(err, "Failed to add clusterQueue to cache") 254 } 255 256 if err := r.qManager.AddClusterQueue(ctx, cq); err != nil { 257 log.Error(err, "Failed to add clusterQueue to queue manager") 258 } 259 260 if r.reportResourceMetrics { 261 recordResourceMetrics(cq) 262 } 263 264 return true 265 } 266 267 func (r *ClusterQueueReconciler) Delete(e event.DeleteEvent) bool { 268 cq, match := e.Object.(*kueue.ClusterQueue) 269 if !match { 270 // No need to interact with the cache for other objects. 271 return true 272 } 273 defer r.notifyWatchers(cq, nil) 274 275 r.log.V(2).Info("ClusterQueue delete event", "clusterQueue", klog.KObj(cq)) 276 r.cache.DeleteClusterQueue(cq) 277 r.qManager.DeleteClusterQueue(cq) 278 r.qManager.DeleteSnapshot(cq) 279 280 metrics.ClearClusterQueueResourceMetrics(cq.Name) 281 r.log.V(2).Info("Cleared resource metrics for deleted ClusterQueue.", "clusterQueue", klog.KObj(cq)) 282 283 return true 284 } 285 286 func (r *ClusterQueueReconciler) Update(e event.UpdateEvent) bool { 287 oldCq, match := e.ObjectOld.(*kueue.ClusterQueue) 288 if !match { 289 // No need to interact with the cache for other objects. 290 return true 291 } 292 newCq, match := e.ObjectNew.(*kueue.ClusterQueue) 293 if !match { 294 // No need to interact with the cache for other objects. 295 return true 296 } 297 298 log := r.log.WithValues("clusterQueue", klog.KObj(newCq)) 299 log.V(2).Info("ClusterQueue update event") 300 301 if newCq.DeletionTimestamp != nil { 302 return true 303 } 304 defer r.notifyWatchers(oldCq, newCq) 305 specUpdated := !equality.Semantic.DeepEqual(oldCq.Spec, newCq.Spec) 306 307 if err := r.cache.UpdateClusterQueue(newCq); err != nil { 308 log.Error(err, "Failed to update clusterQueue in cache") 309 } 310 if err := r.qManager.UpdateClusterQueue(context.Background(), newCq, specUpdated); err != nil { 311 log.Error(err, "Failed to update clusterQueue in queue manager") 312 } 313 314 if r.reportResourceMetrics { 315 updateResourceMetrics(oldCq, newCq) 316 } 317 return true 318 } 319 320 func (r *ClusterQueueReconciler) Generic(e event.GenericEvent) bool { 321 r.log.V(2).Info("Got generic event", "obj", klog.KObj(e.Object), "kind", e.Object.GetObjectKind().GroupVersionKind()) 322 return true 323 } 324 325 func recordResourceMetrics(cq *kueue.ClusterQueue) { 326 for rgi := range cq.Spec.ResourceGroups { 327 rg := &cq.Spec.ResourceGroups[rgi] 328 for fqi := range rg.Flavors { 329 fq := &rg.Flavors[fqi] 330 for ri := range fq.Resources { 331 r := &fq.Resources[ri] 332 nominal := resource.QuantityToFloat(&r.NominalQuota) 333 borrow := resource.QuantityToFloat(r.BorrowingLimit) 334 lend := resource.QuantityToFloat(r.LendingLimit) 335 metrics.ReportClusterQueueQuotas(cq.Spec.Cohort, cq.Name, string(fq.Name), string(r.Name), nominal, borrow, lend) 336 } 337 } 338 } 339 340 for fri := range cq.Status.FlavorsReservation { 341 fr := &cq.Status.FlavorsReservation[fri] 342 for ri := range fr.Resources { 343 r := &fr.Resources[ri] 344 metrics.ReportClusterQueueResourceReservations(cq.Spec.Cohort, cq.Name, string(fr.Name), string(r.Name), resource.QuantityToFloat(&r.Total)) 345 } 346 } 347 348 for fui := range cq.Status.FlavorsUsage { 349 fu := &cq.Status.FlavorsUsage[fui] 350 for ri := range fu.Resources { 351 r := &fu.Resources[ri] 352 metrics.ReportClusterQueueResourceUsage(cq.Spec.Cohort, cq.Name, string(fu.Name), string(r.Name), resource.QuantityToFloat(&r.Total)) 353 } 354 } 355 } 356 357 func updateResourceMetrics(oldCq, newCq *kueue.ClusterQueue) { 358 // if the cohort changed, drop all the old metrics 359 if oldCq.Spec.Cohort != newCq.Spec.Cohort { 360 metrics.ClearClusterQueueResourceMetrics(oldCq.Name) 361 } else { 362 // selective remove 363 clearOldResourceQuotas(oldCq, newCq) 364 } 365 recordResourceMetrics(newCq) 366 } 367 368 func clearOldResourceQuotas(oldCq, newCq *kueue.ClusterQueue) { 369 for rgi := range oldCq.Spec.ResourceGroups { 370 oldRG := &oldCq.Spec.ResourceGroups[rgi] 371 newFlavors := map[kueue.ResourceFlavorReference]*kueue.FlavorQuotas{} 372 if rgi < len(newCq.Spec.ResourceGroups) && len(newCq.Spec.ResourceGroups[rgi].Flavors) > 0 { 373 newFlavors = slices.ToRefMap(newCq.Spec.ResourceGroups[rgi].Flavors, func(f *kueue.FlavorQuotas) kueue.ResourceFlavorReference { return f.Name }) 374 } 375 376 for fi := range oldRG.Flavors { 377 flavor := &oldRG.Flavors[fi] 378 if newFlavor, found := newFlavors[flavor.Name]; !found || len(newFlavor.Resources) == 0 { 379 metrics.ClearClusterQueueResourceQuotas(oldCq.Name, string(flavor.Name), "") 380 } else { 381 // check all resources 382 newResources := slices.ToRefMap(newFlavor.Resources, func(r *kueue.ResourceQuota) corev1.ResourceName { return r.Name }) 383 for ri := range flavor.Resources { 384 rname := flavor.Resources[ri].Name 385 if _, found := newResources[rname]; !found { 386 metrics.ClearClusterQueueResourceQuotas(oldCq.Name, string(flavor.Name), string(rname)) 387 } 388 } 389 } 390 } 391 } 392 393 // reservation metrics 394 if len(oldCq.Status.FlavorsReservation) > 0 { 395 newFlavors := map[kueue.ResourceFlavorReference]*kueue.FlavorUsage{} 396 if len(newCq.Status.FlavorsReservation) > 0 { 397 newFlavors = slices.ToRefMap(newCq.Status.FlavorsReservation, func(f *kueue.FlavorUsage) kueue.ResourceFlavorReference { return f.Name }) 398 } 399 for fi := range oldCq.Status.FlavorsReservation { 400 flavor := &oldCq.Status.FlavorsReservation[fi] 401 if newFlavor, found := newFlavors[flavor.Name]; !found || len(newFlavor.Resources) == 0 { 402 metrics.ClearClusterQueueResourceReservations(oldCq.Name, string(flavor.Name), "") 403 } else { 404 newResources := slices.ToRefMap(newFlavor.Resources, func(r *kueue.ResourceUsage) corev1.ResourceName { return r.Name }) 405 for ri := range flavor.Resources { 406 rname := flavor.Resources[ri].Name 407 if _, found := newResources[rname]; !found { 408 metrics.ClearClusterQueueResourceReservations(oldCq.Name, string(flavor.Name), string(rname)) 409 } 410 } 411 } 412 } 413 } 414 415 // usage metrics 416 if len(oldCq.Status.FlavorsUsage) > 0 { 417 newFlavors := map[kueue.ResourceFlavorReference]*kueue.FlavorUsage{} 418 if len(newCq.Status.FlavorsUsage) > 0 { 419 newFlavors = slices.ToRefMap(newCq.Status.FlavorsUsage, func(f *kueue.FlavorUsage) kueue.ResourceFlavorReference { return f.Name }) 420 } 421 for fi := range oldCq.Status.FlavorsUsage { 422 flavor := &oldCq.Status.FlavorsUsage[fi] 423 if newFlavor, found := newFlavors[flavor.Name]; !found || len(newFlavor.Resources) == 0 { 424 metrics.ClearClusterQueueResourceUsage(oldCq.Name, string(flavor.Name), "") 425 } else { 426 newResources := slices.ToRefMap(newFlavor.Resources, func(r *kueue.ResourceUsage) corev1.ResourceName { return r.Name }) 427 for ri := range flavor.Resources { 428 rname := flavor.Resources[ri].Name 429 if _, found := newResources[rname]; !found { 430 metrics.ClearClusterQueueResourceUsage(oldCq.Name, string(flavor.Name), string(rname)) 431 } 432 } 433 } 434 } 435 } 436 } 437 438 // cqWorkloadHandler signals the controller to reconcile the ClusterQueue 439 // associated to the workload in the event. 440 // Since the events come from a channel Source, only the Generic handler will 441 // receive events. 442 type cqWorkloadHandler struct { 443 qManager *queue.Manager 444 } 445 446 func (h *cqWorkloadHandler) Create(context.Context, event.CreateEvent, workqueue.RateLimitingInterface) { 447 } 448 449 func (h *cqWorkloadHandler) Update(context.Context, event.UpdateEvent, workqueue.RateLimitingInterface) { 450 } 451 452 func (h *cqWorkloadHandler) Delete(context.Context, event.DeleteEvent, workqueue.RateLimitingInterface) { 453 } 454 455 func (h *cqWorkloadHandler) Generic(_ context.Context, e event.GenericEvent, q workqueue.RateLimitingInterface) { 456 w := e.Object.(*kueue.Workload) 457 req := h.requestForWorkloadClusterQueue(w) 458 if req != nil { 459 q.AddAfter(*req, constants.UpdatesBatchPeriod) 460 } 461 } 462 463 func (h *cqWorkloadHandler) requestForWorkloadClusterQueue(w *kueue.Workload) *reconcile.Request { 464 var name string 465 if workload.HasQuotaReservation(w) { 466 name = string(w.Status.Admission.ClusterQueue) 467 } else { 468 var ok bool 469 name, ok = h.qManager.ClusterQueueForWorkload(w) 470 if !ok { 471 return nil 472 } 473 } 474 return &reconcile.Request{ 475 NamespacedName: types.NamespacedName{ 476 Name: name, 477 }, 478 } 479 } 480 481 // cqNamespaceHandler handles namespace update events. 482 type cqNamespaceHandler struct { 483 qManager *queue.Manager 484 cache *cache.Cache 485 } 486 487 func (h *cqNamespaceHandler) Create(ctx context.Context, e event.CreateEvent, q workqueue.RateLimitingInterface) { 488 } 489 490 func (h *cqNamespaceHandler) Update(ctx context.Context, e event.UpdateEvent, q workqueue.RateLimitingInterface) { 491 oldNs := e.ObjectOld.(*corev1.Namespace) 492 oldMatchingCqs := h.cache.MatchingClusterQueues(oldNs.Labels) 493 newNs := e.ObjectNew.(*corev1.Namespace) 494 newMatchingCqs := h.cache.MatchingClusterQueues(newNs.Labels) 495 cqs := sets.New[string]() 496 for cq := range newMatchingCqs { 497 if !oldMatchingCqs.Has(cq) { 498 cqs.Insert(cq) 499 } 500 } 501 h.qManager.QueueInadmissibleWorkloads(ctx, cqs) 502 } 503 504 func (h *cqNamespaceHandler) Delete(context.Context, event.DeleteEvent, workqueue.RateLimitingInterface) { 505 } 506 507 func (h *cqNamespaceHandler) Generic(context.Context, event.GenericEvent, workqueue.RateLimitingInterface) { 508 } 509 510 type cqResourceFlavorHandler struct { 511 cache *cache.Cache 512 } 513 514 func (h *cqResourceFlavorHandler) Create(context.Context, event.CreateEvent, workqueue.RateLimitingInterface) { 515 } 516 517 func (h *cqResourceFlavorHandler) Update(context.Context, event.UpdateEvent, workqueue.RateLimitingInterface) { 518 } 519 520 func (h *cqResourceFlavorHandler) Delete(context.Context, event.DeleteEvent, workqueue.RateLimitingInterface) { 521 } 522 523 func (h *cqResourceFlavorHandler) Generic(_ context.Context, e event.GenericEvent, q workqueue.RateLimitingInterface) { 524 rf, ok := e.Object.(*kueue.ResourceFlavor) 525 if !ok { 526 return 527 } 528 529 if cqs := h.cache.ClusterQueuesUsingFlavor(rf.Name); len(cqs) != 0 { 530 for _, cq := range cqs { 531 req := reconcile.Request{ 532 NamespacedName: types.NamespacedName{ 533 Name: cq, 534 }} 535 q.Add(req) 536 } 537 } 538 } 539 540 type cqAdmissionCheckHandler struct { 541 cache *cache.Cache 542 } 543 544 type cqSnapshotHandler struct { 545 queueVisibilityUpdateInterval time.Duration 546 } 547 548 func (h *cqAdmissionCheckHandler) Create(context.Context, event.CreateEvent, workqueue.RateLimitingInterface) { 549 } 550 551 func (h *cqAdmissionCheckHandler) Update(context.Context, event.UpdateEvent, workqueue.RateLimitingInterface) { 552 } 553 554 func (h *cqAdmissionCheckHandler) Delete(context.Context, event.DeleteEvent, workqueue.RateLimitingInterface) { 555 } 556 557 func (h *cqAdmissionCheckHandler) Generic(_ context.Context, e event.GenericEvent, q workqueue.RateLimitingInterface) { 558 ac, isAc := e.Object.(*kueue.AdmissionCheck) 559 if !isAc { 560 return 561 } 562 563 if cqs := h.cache.ClusterQueuesUsingAdmissionCheck(ac.Name); len(cqs) != 0 { 564 for _, cq := range cqs { 565 req := reconcile.Request{ 566 NamespacedName: types.NamespacedName{ 567 Name: cq, 568 }} 569 q.Add(req) 570 } 571 } 572 } 573 574 func (h *cqSnapshotHandler) Create(context.Context, event.CreateEvent, workqueue.RateLimitingInterface) { 575 } 576 577 func (h *cqSnapshotHandler) Update(context.Context, event.UpdateEvent, workqueue.RateLimitingInterface) { 578 } 579 580 func (h *cqSnapshotHandler) Delete(context.Context, event.DeleteEvent, workqueue.RateLimitingInterface) { 581 } 582 583 func (h *cqSnapshotHandler) Generic(_ context.Context, e event.GenericEvent, q workqueue.RateLimitingInterface) { 584 cq, isCq := e.Object.(*kueue.ClusterQueue) 585 if !isCq { 586 return 587 } 588 remainingTime := constants.UpdatesBatchPeriod 589 if cq.Status.PendingWorkloadsStatus != nil { 590 remainingTime = h.queueVisibilityUpdateInterval - time.Since(cq.Status.PendingWorkloadsStatus.LastChangeTime.Time) 591 if remainingTime <= constants.UpdatesBatchPeriod { 592 remainingTime = constants.UpdatesBatchPeriod 593 } 594 } 595 q.AddAfter(reconcile.Request{ 596 NamespacedName: types.NamespacedName{ 597 Name: cq.Name, 598 }}, remainingTime) 599 } 600 601 // SetupWithManager sets up the controller with the Manager. 602 func (r *ClusterQueueReconciler) SetupWithManager(mgr ctrl.Manager, cfg *config.Configuration) error { 603 wHandler := cqWorkloadHandler{ 604 qManager: r.qManager, 605 } 606 nsHandler := cqNamespaceHandler{ 607 qManager: r.qManager, 608 cache: r.cache, 609 } 610 rfHandler := cqResourceFlavorHandler{ 611 cache: r.cache, 612 } 613 acHandler := cqAdmissionCheckHandler{ 614 cache: r.cache, 615 } 616 snapHandler := cqSnapshotHandler{ 617 queueVisibilityUpdateInterval: r.queueVisibilityUpdateInterval, 618 } 619 return ctrl.NewControllerManagedBy(mgr). 620 For(&kueue.ClusterQueue{}). 621 WithOptions(controller.Options{NeedLeaderElection: ptr.To(false)}). 622 Watches(&corev1.Namespace{}, &nsHandler). 623 WatchesRawSource(&source.Channel{Source: r.wlUpdateCh}, &wHandler). 624 WatchesRawSource(&source.Channel{Source: r.rfUpdateCh}, &rfHandler). 625 WatchesRawSource(&source.Channel{Source: r.acUpdateCh}, &acHandler). 626 WatchesRawSource(&source.Channel{Source: r.snapUpdateCh}, &snapHandler). 627 WithEventFilter(r). 628 Complete(WithLeadingManager(mgr, r, &kueue.ClusterQueue{}, cfg)) 629 } 630 631 func (r *ClusterQueueReconciler) updateCqStatusIfChanged( 632 ctx context.Context, 633 cq *kueue.ClusterQueue, 634 conditionStatus metav1.ConditionStatus, 635 reason, msg string, 636 ) error { 637 oldStatus := cq.Status.DeepCopy() 638 pendingWorkloads := r.qManager.Pending(cq) 639 stats, err := r.cache.Usage(cq) 640 if err != nil { 641 r.log.Error(err, "Failed getting usage from cache") 642 // This is likely because the cluster queue was recently removed, 643 // but we didn't process that event yet. 644 return err 645 } 646 cq.Status.FlavorsReservation = stats.ReservedResources 647 cq.Status.FlavorsUsage = stats.AdmittedResources 648 cq.Status.ReservingWorkloads = int32(stats.ReservingWorkloads) 649 cq.Status.AdmittedWorkloads = int32(stats.AdmittedWorkloads) 650 cq.Status.PendingWorkloads = int32(pendingWorkloads) 651 cq.Status.PendingWorkloadsStatus = r.getWorkloadsStatus(cq) 652 meta.SetStatusCondition(&cq.Status.Conditions, metav1.Condition{ 653 Type: kueue.ClusterQueueActive, 654 Status: conditionStatus, 655 Reason: reason, 656 Message: msg, 657 }) 658 if !equality.Semantic.DeepEqual(cq.Status, oldStatus) { 659 return r.client.Status().Update(ctx, cq) 660 } 661 return nil 662 } 663 664 // Taking snapshot of cluster queue is enabled when maxcount non-zero 665 func (r *ClusterQueueReconciler) isVisibilityEnabled() bool { 666 return features.Enabled(features.QueueVisibility) && r.queueVisibilityClusterQueuesMaxCount > 0 667 } 668 669 func (r *ClusterQueueReconciler) getWorkloadsStatus(cq *kueue.ClusterQueue) *kueue.ClusterQueuePendingWorkloadsStatus { 670 if !r.isVisibilityEnabled() { 671 return nil 672 } 673 pendingWorkloads := r.qManager.GetSnapshot(cq.Name) 674 if cq.Status.PendingWorkloadsStatus == nil || 675 cq.Status.PendingWorkloadsStatus.Head == nil || 676 !equality.Semantic.DeepEqual(cq.Status.PendingWorkloadsStatus.Head, pendingWorkloads) { 677 return &kueue.ClusterQueuePendingWorkloadsStatus{ 678 Head: pendingWorkloads, 679 LastChangeTime: metav1.Time{Time: time.Now()}, 680 } 681 } 682 return cq.Status.PendingWorkloadsStatus 683 } 684 685 func (r *ClusterQueueReconciler) Start(ctx context.Context) error { 686 if !r.isVisibilityEnabled() { 687 return nil 688 } 689 690 defer r.snapshotsQueue.ShutDown() 691 692 for i := 0; i < snapshotWorkers; i++ { 693 go wait.UntilWithContext(ctx, r.takeSnapshot, r.queueVisibilityUpdateInterval) 694 } 695 696 go wait.UntilWithContext(ctx, r.enqueueTakeSnapshot, r.queueVisibilityUpdateInterval) 697 698 <-ctx.Done() 699 700 return nil 701 } 702 703 func (r *ClusterQueueReconciler) enqueueTakeSnapshot(ctx context.Context) { 704 for _, cq := range r.qManager.GetClusterQueueNames() { 705 r.snapshotsQueue.Add(cq) 706 } 707 } 708 709 func (r *ClusterQueueReconciler) takeSnapshot(ctx context.Context) { 710 for r.processNextSnapshot(ctx) { 711 } 712 } 713 714 func (r *ClusterQueueReconciler) processNextSnapshot(ctx context.Context) bool { 715 log := ctrl.LoggerFrom(ctx).WithName("processNextSnapshot") 716 717 key, quit := r.snapshotsQueue.Get() 718 if quit { 719 return false 720 } 721 722 startTime := time.Now() 723 defer func() { 724 log.V(5).Info("Finished snapshot job", "key", key, "elapsed", time.Since(startTime)) 725 }() 726 727 defer r.snapshotsQueue.Done(key) 728 729 cqName := key.(string) 730 if r.qManager.UpdateSnapshot(cqName, r.queueVisibilityClusterQueuesMaxCount) { 731 log.V(5).Info("Triggering CQ update due to snapshot change", "clusterQueue", klog.KRef("", cqName)) 732 r.snapUpdateCh <- event.GenericEvent{Object: &kueue.ClusterQueue{ 733 ObjectMeta: metav1.ObjectMeta{ 734 Name: cqName, 735 }, 736 }} 737 } 738 return true 739 }