sigs.k8s.io/kueue@v0.6.2/pkg/controller/core/workload_controller.go

/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package core

import (
    "context"
    "fmt"
    "time"

    "github.com/go-logr/logr"
    corev1 "k8s.io/api/core/v1"
    nodev1 "k8s.io/api/node/v1"
    "k8s.io/apimachinery/pkg/api/equality"
    apimeta "k8s.io/apimachinery/pkg/api/meta"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
    "k8s.io/apimachinery/pkg/types"
    "k8s.io/apimachinery/pkg/util/sets"
    "k8s.io/apimachinery/pkg/util/wait"
    "k8s.io/client-go/tools/record"
    "k8s.io/client-go/util/workqueue"
    "k8s.io/klog/v2"
    "k8s.io/utils/clock"
    "k8s.io/utils/ptr"
    ctrl "sigs.k8s.io/controller-runtime"
    "sigs.k8s.io/controller-runtime/pkg/client"
    "sigs.k8s.io/controller-runtime/pkg/controller"
    "sigs.k8s.io/controller-runtime/pkg/event"
    "sigs.k8s.io/controller-runtime/pkg/handler"
    "sigs.k8s.io/controller-runtime/pkg/reconcile"

    config "sigs.k8s.io/kueue/apis/config/v1beta1"
    kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
    "sigs.k8s.io/kueue/pkg/cache"
    "sigs.k8s.io/kueue/pkg/constants"
    "sigs.k8s.io/kueue/pkg/controller/core/indexer"
    "sigs.k8s.io/kueue/pkg/queue"
    "sigs.k8s.io/kueue/pkg/util/slices"
    "sigs.k8s.io/kueue/pkg/workload"
)

const (
    // statuses for logging purposes
    pending  = "pending"
    admitted = "admitted"
    finished = "finished"
)

var (
    realClock = clock.RealClock{}
)

type options struct {
    watchers                   []WorkloadUpdateWatcher
    podsReadyTimeout           *time.Duration
    requeuingBackoffLimitCount *int32
}

// Option configures the reconciler.
type Option func(*options)

// WithPodsReadyTimeout indicates if the controller should interrupt startup
// of a workload if it exceeds the timeout to reach the PodsReady=True condition.
func WithPodsReadyTimeout(value *time.Duration) Option {
    return func(o *options) {
        o.podsReadyTimeout = value
    }
}

// WithRequeuingBackoffLimitCount indicates if the controller should deactivate a workload
// once the number of requeues reaches this limit.
func WithRequeuingBackoffLimitCount(value *int32) Option {
    return func(o *options) {
        o.requeuingBackoffLimitCount = value
    }
}

// WithWorkloadUpdateWatchers allows specifying the workload update watchers.
func WithWorkloadUpdateWatchers(value ...WorkloadUpdateWatcher) Option {
    return func(o *options) {
        o.watchers = value
    }
}
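
// Editor's illustration (not part of the original source): a minimal sketch of
// how a caller could combine the Option funcs above when constructing the
// reconciler. The concrete values are hypothetical; in Kueue they are derived
// from the waitForPodsReady section of the Configuration API.
func exampleNewWorkloadReconcilerWithOptions(c client.Client, q *queue.Manager, cc *cache.Cache, rec record.EventRecorder) *WorkloadReconciler {
    return NewWorkloadReconciler(c, q, cc, rec,
        // Evict a workload that does not reach PodsReady=True within 5 minutes.
        WithPodsReadyTimeout(ptr.To(5*time.Minute)),
        // Deactivate a workload after it has been requeued 10 times.
        WithRequeuingBackoffLimitCount(ptr.To[int32](10)),
    )
}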

var defaultOptions = options{}

type WorkloadUpdateWatcher interface {
    NotifyWorkloadUpdate(oldWl, newWl *kueue.Workload)
}

// WorkloadReconciler reconciles a Workload object
type WorkloadReconciler struct {
    log                        logr.Logger
    queues                     *queue.Manager
    cache                      *cache.Cache
    client                     client.Client
    watchers                   []WorkloadUpdateWatcher
    podsReadyTimeout           *time.Duration
    requeuingBackoffLimitCount *int32
    recorder                   record.EventRecorder
}

func NewWorkloadReconciler(client client.Client, queues *queue.Manager, cache *cache.Cache, recorder record.EventRecorder, opts ...Option) *WorkloadReconciler {
    options := defaultOptions
    for _, opt := range opts {
        opt(&options)
    }

    return &WorkloadReconciler{
        log:                        ctrl.Log.WithName("workload-reconciler"),
        client:                     client,
        queues:                     queues,
        cache:                      cache,
        watchers:                   options.watchers,
        podsReadyTimeout:           options.podsReadyTimeout,
        requeuingBackoffLimitCount: options.requeuingBackoffLimitCount,
        recorder:                   recorder,
    }
}

// +kubebuilder:rbac:groups="",resources=events,verbs=create;watch;update;patch
// +kubebuilder:rbac:groups="",resources=limitranges,verbs=get;list;watch
// +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads/finalizers,verbs=update
// +kubebuilder:rbac:groups=node.k8s.io,resources=runtimeclasses,verbs=get;list;watch

func (r *WorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
    var wl kueue.Workload
    if err := r.client.Get(ctx, req.NamespacedName, &wl); err != nil {
        // we'll ignore not-found errors, since there is nothing to do.
        return ctrl.Result{}, client.IgnoreNotFound(err)
    }
    log := ctrl.LoggerFrom(ctx).WithValues("workload", klog.KObj(&wl))
    ctx = ctrl.LoggerInto(ctx, log)
    log.V(2).Info("Reconciling Workload")

    // If a deactivated workload is re-activated, we need to reset the RequeueState.
    if wl.Status.RequeueState != nil && ptr.Deref(wl.Spec.Active, true) && workload.IsEvictedByDeactivation(&wl) {
        wl.Status.RequeueState = nil
        return ctrl.Result{}, workload.ApplyAdmissionStatus(ctx, r.client, &wl, true)
    }

    if len(wl.ObjectMeta.OwnerReferences) == 0 && !wl.DeletionTimestamp.IsZero() {
        return ctrl.Result{}, workload.RemoveFinalizer(ctx, r.client, &wl)
    }

    if apimeta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadFinished) {
        return ctrl.Result{}, nil
    }

    cqName, cqOk := r.queues.ClusterQueueForWorkload(&wl)
    if cqOk {
        if updated, err := r.reconcileSyncAdmissionChecks(ctx, &wl, cqName); updated || err != nil {
            return ctrl.Result{}, err
        }
    }

    // If the workload is admitted, updating the status here would set the Admitted condition to
    // false before the workload's eviction.
    if !workload.IsAdmitted(&wl) && workload.SyncAdmittedCondition(&wl) {
        if err := workload.ApplyAdmissionStatus(ctx, r.client, &wl, true); err != nil {
            return ctrl.Result{}, err
        }
        if workload.IsAdmitted(&wl) {
            c := apimeta.FindStatusCondition(wl.Status.Conditions, kueue.WorkloadQuotaReserved)
            r.recorder.Eventf(&wl, corev1.EventTypeNormal, "Admitted", "Admitted by ClusterQueue %v, wait time since reservation was %.0fs", wl.Status.Admission.ClusterQueue, time.Since(c.LastTransitionTime.Time).Seconds())
        }
        return ctrl.Result{}, nil
    }

    if workload.HasQuotaReservation(&wl) {
        if evictionTriggered, err := r.reconcileCheckBasedEviction(ctx, &wl); evictionTriggered || err != nil {
            return ctrl.Result{}, err
        }

        if updated, err := r.reconcileOnClusterQueueActiveState(ctx, &wl, cqName); updated || err != nil {
            return ctrl.Result{}, err
        }

        return r.reconcileNotReadyTimeout(ctx, req, &wl)
    }

    // At this point the workload is not admitted; if it has rejected admission checks, mark it as finished.
    if rejectedChecks := workload.GetRejectedChecks(&wl); len(rejectedChecks) > 0 {
        log.V(3).Info("Workload has Rejected admission checks, Finish with failure")
        err := workload.UpdateStatus(ctx, r.client, &wl, kueue.WorkloadFinished,
            metav1.ConditionTrue,
            "AdmissionChecksRejected",
            fmt.Sprintf("Admission checks %v are rejected", rejectedChecks),
            constants.KueueName)
        if err == nil {
            for _, owner := range wl.OwnerReferences {
                uowner := unstructured.Unstructured{}
                uowner.SetKind(owner.Kind)
                uowner.SetAPIVersion(owner.APIVersion)
                uowner.SetName(owner.Name)
                uowner.SetNamespace(wl.Namespace)
                uowner.SetUID(owner.UID)
                r.recorder.Eventf(&uowner, corev1.EventTypeNormal, "WorkloadFinished", "Admission checks %v are rejected", rejectedChecks)
            }
        }
        return ctrl.Result{}, err
    }

    switch {
    case !r.queues.QueueForWorkloadExists(&wl):
        log.V(3).Info("Workload is inadmissible because of missing LocalQueue", "localQueue", klog.KRef(wl.Namespace, wl.Spec.QueueName))
        if workload.UnsetQuotaReservationWithCondition(&wl, "Inadmissible", fmt.Sprintf("LocalQueue %s doesn't exist", wl.Spec.QueueName)) {
            err := workload.ApplyAdmissionStatus(ctx, r.client, &wl, true)
            return ctrl.Result{}, client.IgnoreNotFound(err)
        }
    case !cqOk:
        log.V(3).Info("Workload is inadmissible because of missing ClusterQueue", "clusterQueue", klog.KRef("", cqName))
        if workload.UnsetQuotaReservationWithCondition(&wl, "Inadmissible", fmt.Sprintf("ClusterQueue %s doesn't exist", cqName)) {
            err := workload.ApplyAdmissionStatus(ctx, r.client, &wl, true)
            return ctrl.Result{}, client.IgnoreNotFound(err)
        }
    case !r.cache.ClusterQueueActive(cqName):
        log.V(3).Info("Workload is inadmissible because ClusterQueue is inactive", "clusterQueue", klog.KRef("", cqName))
        if workload.UnsetQuotaReservationWithCondition(&wl, "Inadmissible", fmt.Sprintf("ClusterQueue %s is inactive", cqName)) {
            err := workload.ApplyAdmissionStatus(ctx, r.client, &wl, true)
            return ctrl.Result{}, client.IgnoreNotFound(err)
        }
    }

    return ctrl.Result{}, nil
}

func (r *WorkloadReconciler) reconcileCheckBasedEviction(ctx context.Context, wl *kueue.Workload) (bool, error) {
    if apimeta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadEvicted) || !workload.HasRetryOrRejectedChecks(wl) {
        return false, nil
    }
    log := ctrl.LoggerFrom(ctx)
log.V(3).Info("Workload is evicted due to admission checks") 250 workload.SetEvictedCondition(wl, kueue.WorkloadEvictedByAdmissionCheck, "At least one admission check is false") 251 err := workload.ApplyAdmissionStatus(ctx, r.client, wl, true) 252 return true, client.IgnoreNotFound(err) 253 } 254 255 func (r *WorkloadReconciler) reconcileSyncAdmissionChecks(ctx context.Context, wl *kueue.Workload, cqName string) (bool, error) { 256 // because we need to react to API cluster queue events, the list of checks from a cache can lead to race conditions 257 queue := kueue.ClusterQueue{} 258 if err := r.client.Get(ctx, types.NamespacedName{Name: cqName}, &queue); err != nil { 259 return false, err 260 } 261 262 queueAdmissionChecks := queue.Spec.AdmissionChecks 263 newChecks, shouldUpdate := syncAdmissionCheckConditions(wl.Status.AdmissionChecks, queueAdmissionChecks) 264 if shouldUpdate { 265 log := ctrl.LoggerFrom(ctx) 266 log.V(3).Info("The workload needs admission checks updates", "clusterQueue", klog.KRef("", cqName), "admissionChecks", queueAdmissionChecks) 267 wl.Status.AdmissionChecks = newChecks 268 err := r.client.Status().Update(ctx, wl) 269 return true, client.IgnoreNotFound(err) 270 } 271 return false, nil 272 } 273 274 func (r *WorkloadReconciler) reconcileOnClusterQueueActiveState(ctx context.Context, wl *kueue.Workload, cqName string) (bool, error) { 275 queue := kueue.ClusterQueue{} 276 err := r.client.Get(ctx, types.NamespacedName{Name: cqName}, &queue) 277 if client.IgnoreNotFound(err) != nil { 278 return false, err 279 } 280 281 queueStopPolicy := ptr.Deref(queue.Spec.StopPolicy, kueue.None) 282 283 log := ctrl.LoggerFrom(ctx) 284 if workload.IsAdmitted(wl) { 285 if queueStopPolicy != kueue.HoldAndDrain { 286 return false, nil 287 } 288 log.V(3).Info("Workload is evicted because the ClusterQueue is stopped", "clusterQueue", klog.KRef("", cqName)) 289 workload.SetEvictedCondition(wl, kueue.WorkloadEvictedByClusterQueueStopped, "The ClusterQueue is stopped") 290 err := workload.ApplyAdmissionStatus(ctx, r.client, wl, true) 291 return true, client.IgnoreNotFound(err) 292 } 293 294 if err != nil || !queue.DeletionTimestamp.IsZero() { 295 log.V(3).Info("Workload is inadmissible because the ClusterQueue is terminating or missing", "clusterQueue", klog.KRef("", cqName)) 296 _ = workload.UnsetQuotaReservationWithCondition(wl, "Inadmissible", fmt.Sprintf("ClusterQueue %s is terminating or missing", cqName)) 297 return true, workload.ApplyAdmissionStatus(ctx, r.client, wl, true) 298 } 299 300 if queueStopPolicy != kueue.None { 301 log.V(3).Info("Workload is inadmissible because the ClusterQueue is stopped", "clusterQueue", klog.KRef("", cqName)) 302 _ = workload.UnsetQuotaReservationWithCondition(wl, "Inadmissible", fmt.Sprintf("ClusterQueue %s is stopped", cqName)) 303 return true, workload.ApplyAdmissionStatus(ctx, r.client, wl, true) 304 } 305 306 return false, nil 307 } 308 309 func syncAdmissionCheckConditions(conds []kueue.AdmissionCheckState, queueChecks []string) ([]kueue.AdmissionCheckState, bool) { 310 if len(queueChecks) == 0 { 311 return nil, len(conds) > 0 312 } 313 314 shouldUpdate := false 315 currentChecks := slices.ToRefMap(conds, func(c *kueue.AdmissionCheckState) string { return c.Name }) 316 for _, t := range queueChecks { 317 if _, found := currentChecks[t]; !found { 318 workload.SetAdmissionCheckState(&conds, kueue.AdmissionCheckState{ 319 Name: t, 320 State: kueue.CheckStatePending, 321 }) 322 shouldUpdate = true 323 } 324 } 325 326 // if the workload conditions 
    if len(conds) > len(queueChecks) {
        newConds := make([]kueue.AdmissionCheckState, 0, len(queueChecks))
        queueChecksSet := sets.New(queueChecks...)
        shouldUpdate = true
        for i := range conds {
            c := &conds[i]
            if queueChecksSet.Has(c.Name) {
                newConds = append(newConds, *c)
            }
        }
        conds = newConds
    }
    return conds, shouldUpdate
}
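
// Editor's illustration (not part of the original source): what
// syncAdmissionCheckConditions does for a sample input. The check names are
// made up for the example.
func exampleSyncAdmissionCheckConditions() {
    current := []kueue.AdmissionCheckState{
        {Name: "check-a", State: kueue.CheckStateReady},
        {Name: "stale-check", State: kueue.CheckStatePending},
    }
    // The ClusterQueue now requires "check-a" and "check-b": the stale entry is
    // dropped and a Pending state is added for "check-b".
    synced, updated := syncAdmissionCheckConditions(current, []string{"check-a", "check-b"})
    _ = synced  // holds a Ready state for "check-a" and a Pending state for "check-b"
    _ = updated // true, so the caller updates the workload status
}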

func (r *WorkloadReconciler) reconcileNotReadyTimeout(ctx context.Context, req ctrl.Request, wl *kueue.Workload) (ctrl.Result, error) {
    log := ctrl.LoggerFrom(ctx)

    if !ptr.Deref(wl.Spec.Active, true) || apimeta.IsStatusConditionTrue(wl.Status.Conditions, kueue.WorkloadEvicted) {
        // the workload has already been evicted by the PodsReadyTimeout or has been deactivated.
        return ctrl.Result{}, nil
    }
    countingTowardsTimeout, recheckAfter := r.admittedNotReadyWorkload(wl, realClock)
    if !countingTowardsTimeout {
        return ctrl.Result{}, nil
    }
    if recheckAfter > 0 {
        log.V(4).Info("Workload not yet ready and did not exceed its timeout", "recheckAfter", recheckAfter)
        return ctrl.Result{RequeueAfter: recheckAfter}, nil
    }
    log.V(2).Info("Start the eviction of the workload due to exceeding the PodsReady timeout")
    if deactivated, err := r.triggerDeactivationOrBackoffRequeue(ctx, wl); deactivated || err != nil {
        return ctrl.Result{}, err
    }
    workload.SetEvictedCondition(wl, kueue.WorkloadEvictedByPodsReadyTimeout, fmt.Sprintf("Exceeded the PodsReady timeout %s", req.NamespacedName.String()))
    err := workload.ApplyAdmissionStatus(ctx, r.client, wl, true)
    return ctrl.Result{}, client.IgnoreNotFound(err)
}

// triggerDeactivationOrBackoffRequeue deactivates a workload (".spec.active"="false")
// if the number of requeues has already exceeded the requeuing backoff limit.
// Otherwise, it increments the requeuing count and updates the time at which the
// workload can be requeued. It returns true as the first value if the workload was deactivated.
func (r *WorkloadReconciler) triggerDeactivationOrBackoffRequeue(ctx context.Context, wl *kueue.Workload) (bool, error) {
    if !workload.HasRequeueState(wl) {
        wl.Status.RequeueState = &kueue.RequeueState{}
    }
    // If requeuingBackoffLimitCount is nil, the workload is requeued endlessly.
    requeuingCount := ptr.Deref(wl.Status.RequeueState.Count, 0) + 1
    if r.requeuingBackoffLimitCount != nil && requeuingCount > *r.requeuingBackoffLimitCount {
        wl.Spec.Active = ptr.To(false)
        if err := r.client.Update(ctx, wl); err != nil {
            return false, err
        }
        r.recorder.Eventf(wl, corev1.EventTypeNormal, kueue.WorkloadEvictedByDeactivation,
            "Deactivated Workload %q by reached re-queue backoffLimitCount", klog.KObj(wl))
        return true, nil
    }
    // Every backoff duration is about "1.41284738^(n-1)+Rand" seconds, where "n" is the
    // requeuingCount and "Rand" is the random jitter. During this time the workload is
    // treated as inadmissible, so other workloads have a chance to be admitted.
    // Taking ".waitForPodsReady.timeout" into account, a workload evicted with the
    // PodsReadyTimeout reason keeps being requeued for
    // "t(n+1) + SUM[k=1,n](1.41284738^(k-1) + Rand)" seconds, where "t" is "waitForPodsReady.timeout".
    // With "backoffLimitCount" equal to "30" and "waitForPodsReady.timeout" equal to "300" (the default),
    // this adds up to roughly 24 hours (plus jitter).
    backoff := &wait.Backoff{
        Duration: 1 * time.Second,
        Factor:   1.41284738,
        Jitter:   0.0001,
        Steps:    int(requeuingCount),
    }
    var waitDuration time.Duration
    for backoff.Steps > 0 {
        waitDuration = backoff.Step()
    }
    wl.Status.RequeueState.RequeueAt = ptr.To(metav1.NewTime(time.Now().Add(waitDuration)))
    wl.Status.RequeueState.Count = &requeuingCount
    return false, nil
}
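
// Editor's illustration (not part of the original source): an approximation of the
// total requeuing window described in the comment above. Jitter is ignored and
// timeout stands for ".waitForPodsReady.timeout".
func exampleTotalRequeueWindow(timeout time.Duration, backoffLimitCount int32) time.Duration {
    // t(n+1): every admission attempt, including the last one, waits for the
    // PodsReady timeout before the workload is evicted again.
    total := time.Duration(backoffLimitCount+1) * timeout
    // SUM[k=1,n] 1.41284738^(k-1) seconds: the backoff pause before each requeue.
    delay := 1 * time.Second
    for k := int32(1); k <= backoffLimitCount; k++ {
        total += delay
        delay = time.Duration(float64(delay) * 1.41284738)
    }
    // With timeout=300s and backoffLimitCount=30 this comes out to roughly 24 hours,
    // matching the figure quoted above.
    return total
}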

func (r *WorkloadReconciler) Create(e event.CreateEvent) bool {
    wl, isWorkload := e.Object.(*kueue.Workload)
    if !isWorkload {
        // this event will be handled by the LimitRange/RuntimeClass handler
        return true
    }
    defer r.notifyWatchers(nil, wl)
    status := workloadStatus(wl)
    log := r.log.WithValues("workload", klog.KObj(wl), "queue", wl.Spec.QueueName, "status", status)
    log.V(2).Info("Workload create event")

    if status == finished {
        return true
    }

    ctx := ctrl.LoggerInto(context.Background(), log)
    wlCopy := wl.DeepCopy()
    workload.AdjustResources(ctx, r.client, wlCopy)

    if !workload.HasQuotaReservation(wl) {
        if !r.queues.AddOrUpdateWorkload(wlCopy) {
            log.V(2).Info("Queue for workload didn't exist; ignored for now")
        }
        return true
    }
    if !r.cache.AddOrUpdateWorkload(wlCopy) {
        log.V(2).Info("ClusterQueue for workload didn't exist; ignored for now")
    }

    return true
}

func (r *WorkloadReconciler) Delete(e event.DeleteEvent) bool {
    wl, isWorkload := e.Object.(*kueue.Workload)
    if !isWorkload {
        // this event will be handled by the LimitRange/RuntimeClass handler
        return true
    }
    defer r.notifyWatchers(wl, nil)
    status := "unknown"
    if !e.DeleteStateUnknown {
        status = workloadStatus(wl)
    }
    log := r.log.WithValues("workload", klog.KObj(wl), "queue", wl.Spec.QueueName, "status", status)
    log.V(2).Info("Workload delete event")
    ctx := ctrl.LoggerInto(context.Background(), log)

    // When assigning a clusterQueue to a workload, we assume it in the cache. If
    // the state is unknown, the workload could have been assumed, and we need
    // to clear it from the cache.
    if workload.HasQuotaReservation(wl) || e.DeleteStateUnknown {
        // trigger the move of associated inadmissibleWorkloads if required.
        r.queues.QueueAssociatedInadmissibleWorkloadsAfter(ctx, wl, func() {
            // Delete the workload from cache while holding the queues lock
            // to guarantee that requeued workloads are taken into account before
            // the next scheduling cycle.
            if err := r.cache.DeleteWorkload(wl); err != nil {
                if !e.DeleteStateUnknown {
                    log.Error(err, "Failed to delete workload from cache")
                }
            }
        })
    }

    // Even if the state is unknown, the last cached state tells us whether the
    // workload was in the queues and should be cleared from them.
    r.queues.DeleteWorkload(wl)

    return true
}

func (r *WorkloadReconciler) Update(e event.UpdateEvent) bool {
    oldWl, isWorkload := e.ObjectOld.(*kueue.Workload)
    if !isWorkload {
        // this event will be handled by the LimitRange/RuntimeClass handler
        return true
    }
    wl := e.ObjectNew.(*kueue.Workload)
    defer r.notifyWatchers(oldWl, wl)

    status := workloadStatus(wl)
    log := r.log.WithValues("workload", klog.KObj(wl), "queue", wl.Spec.QueueName, "status", status)
    ctx := ctrl.LoggerInto(context.Background(), log)
    active := ptr.Deref(wl.Spec.Active, true)

    prevQueue := oldWl.Spec.QueueName
    if prevQueue != wl.Spec.QueueName {
        log = log.WithValues("prevQueue", prevQueue)
    }
    prevStatus := workloadStatus(oldWl)
    if prevStatus != status {
        log = log.WithValues("prevStatus", prevStatus)
    }
    if workload.HasQuotaReservation(wl) {
        log = log.WithValues("clusterQueue", wl.Status.Admission.ClusterQueue)
    }
    if workload.HasQuotaReservation(oldWl) && (!workload.HasQuotaReservation(wl) || wl.Status.Admission.ClusterQueue != oldWl.Status.Admission.ClusterQueue) {
        log = log.WithValues("prevClusterQueue", oldWl.Status.Admission.ClusterQueue)
    }
    log.V(2).Info("Workload update event")

    wlCopy := wl.DeepCopy()
    // We do not handle the old workload here as it will be deleted or replaced by the new one anyway.
    workload.AdjustResources(ctrl.LoggerInto(ctx, log), r.client, wlCopy)

    switch {
    case status == finished || !active:
        if !active {
            log.V(2).Info("Workload will not be queued because the workload is not active", "workload", klog.KObj(wl))
        }
        // The workload could have been in the queues if we missed an event.
        r.queues.DeleteWorkload(wl)

        // trigger the move of associated inadmissibleWorkloads, if there are any.
        r.queues.QueueAssociatedInadmissibleWorkloadsAfter(ctx, wl, func() {
            // Delete the workload from cache while holding the queues lock
            // to guarantee that requeued workloads are taken into account before
            // the next scheduling cycle.
            if err := r.cache.DeleteWorkload(oldWl); err != nil && prevStatus == admitted {
                log.Error(err, "Failed to delete workload from cache")
            }
        })

    case prevStatus == pending && status == pending:
        if !r.queues.UpdateWorkload(oldWl, wlCopy) {
            log.V(2).Info("Queue for updated workload didn't exist; ignoring for now")
        }

    case prevStatus == pending && status == admitted:
        r.queues.DeleteWorkload(oldWl)
        if !r.cache.AddOrUpdateWorkload(wlCopy) {
            log.V(2).Info("ClusterQueue for workload didn't exist; ignored for now")
        }
    case prevStatus == admitted && status == pending:
        // trigger the move of associated inadmissibleWorkloads, if there are any.
        r.queues.QueueAssociatedInadmissibleWorkloadsAfter(ctx, wl, func() {
            // Delete the workload from cache while holding the queues lock
            // to guarantee that requeued workloads are taken into account before
            // the next scheduling cycle.
            if err := r.cache.DeleteWorkload(wl); err != nil {
                log.Error(err, "Failed to delete workload from cache")
            }
        })
        var backoff time.Duration
        if wlCopy.Status.RequeueState != nil && wlCopy.Status.RequeueState.RequeueAt != nil {
            backoff = time.Until(wl.Status.RequeueState.RequeueAt.Time)
        }
        if backoff <= 0 {
            if !r.queues.AddOrUpdateWorkload(wlCopy) {
                log.V(2).Info("Queue for workload didn't exist; ignored for now")
            }
        } else {
            log.V(3).Info("Workload to be requeued after backoff", "backoff", backoff, "requeueAt", wl.Status.RequeueState.RequeueAt.Time)
            time.AfterFunc(backoff, func() {
                updatedWl := kueue.Workload{}
                err := r.client.Get(ctx, client.ObjectKeyFromObject(wl), &updatedWl)
                if err == nil && workloadStatus(&updatedWl) == pending {
                    if !r.queues.AddOrUpdateWorkload(wlCopy) {
                        log.V(2).Info("Queue for workload didn't exist; ignored for now")
                    } else {
                        log.V(3).Info("Workload requeued after backoff")
                    }
                }
            })
        }
    case prevStatus == admitted && status == admitted && !equality.Semantic.DeepEqual(oldWl.Status.ReclaimablePods, wl.Status.ReclaimablePods):
        // trigger the move of associated inadmissibleWorkloads, if there are any.
        r.queues.QueueAssociatedInadmissibleWorkloadsAfter(ctx, wl, func() {
            // Update the workload in the cache while holding the queues lock
            // to guarantee that requeued workloads are taken into account before
            // the next scheduling cycle.
            if err := r.cache.UpdateWorkload(oldWl, wlCopy); err != nil {
                log.Error(err, "Failed to update workload in cache")
            }
        })

    default:
        // Workload update in the cache is handled here; however, some fields are immutable
        // and are not supposed to actually change anything.
        if err := r.cache.UpdateWorkload(oldWl, wlCopy); err != nil {
            log.Error(err, "Updating workload in cache")
        }
    }

    return true
}

func (r *WorkloadReconciler) Generic(e event.GenericEvent) bool {
    r.log.V(3).Info("Ignore generic event", "obj", klog.KObj(e.Object), "kind", e.Object.GetObjectKind().GroupVersionKind())
    return false
}

func (r *WorkloadReconciler) notifyWatchers(oldWl, newWl *kueue.Workload) {
    for _, w := range r.watchers {
        w.NotifyWorkloadUpdate(oldWl, newWl)
    }
}

// SetupWithManager sets up the controller with the Manager.
func (r *WorkloadReconciler) SetupWithManager(mgr ctrl.Manager, cfg *config.Configuration) error {
    ruh := &resourceUpdatesHandler{
        r: r,
    }
    return ctrl.NewControllerManagedBy(mgr).
        For(&kueue.Workload{}).
        WithOptions(controller.Options{NeedLeaderElection: ptr.To(false)}).
        Watches(&corev1.LimitRange{}, ruh).
        Watches(&nodev1.RuntimeClass{}, ruh).
        Watches(&kueue.ClusterQueue{}, &workloadCqHandler{client: r.client}).
        WithEventFilter(r).
        Complete(WithLeadingManager(mgr, r, &kueue.Workload{}, cfg))
}
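
// Editor's sketch (not part of the original source): hypothetical wiring of the
// reconciler into a controller-runtime manager. In Kueue the actual setup lives
// in the controller registration code; the recorder name below is made up.
func exampleSetupWorkloadController(mgr ctrl.Manager, cfg *config.Configuration, qm *queue.Manager, cc *cache.Cache) error {
    rec := NewWorkloadReconciler(
        mgr.GetClient(), qm, cc,
        mgr.GetEventRecorderFor("workload-controller"),
    )
    return rec.SetupWithManager(mgr, cfg)
}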

// admittedNotReadyWorkload returns a pair of values. The first boolean determines
// whether the workload is currently counting towards the PodsReady timeout, i.e.
// it has the Admitted condition True and the PodsReady condition not equal to
// True (False or not set). The second value is the remaining time until the
// specified timeout is exceeded, counted from the later of the LastTransitionTimes
// of the Admitted and PodsReady conditions.
func (r *WorkloadReconciler) admittedNotReadyWorkload(wl *kueue.Workload, clock clock.Clock) (bool, time.Duration) {
    if r.podsReadyTimeout == nil {
        // the timeout is not configured for the workload controller
        return false, 0
    }
    if !workload.IsAdmitted(wl) {
        // the workload is not admitted so there is no need to time it out
        return false, 0
    }

    podsReadyCond := apimeta.FindStatusCondition(wl.Status.Conditions, kueue.WorkloadPodsReady)
    if podsReadyCond != nil && podsReadyCond.Status == metav1.ConditionTrue {
        return false, 0
    }
    admittedCond := apimeta.FindStatusCondition(wl.Status.Conditions, kueue.WorkloadAdmitted)
    elapsedTime := clock.Since(admittedCond.LastTransitionTime.Time)
    if podsReadyCond != nil && podsReadyCond.Status == metav1.ConditionFalse && podsReadyCond.LastTransitionTime.After(admittedCond.LastTransitionTime.Time) {
        elapsedTime = clock.Since(podsReadyCond.LastTransitionTime.Time)
    }
    waitFor := *r.podsReadyTimeout - elapsedTime
    if waitFor < 0 {
        waitFor = 0
    }
    return true, waitFor
}
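
// Editor's illustration (not part of the original source): the timing semantics of
// admittedNotReadyWorkload. With a 5-minute PodsReady timeout and a workload that
// was admitted 2 minutes ago (PodsReady not yet True), the workload counts towards
// the timeout and should be rechecked in roughly 3 minutes. The values are made up.
func exampleAdmittedNotReadyWorkload(c clock.Clock) (bool, time.Duration) {
    r := &WorkloadReconciler{podsReadyTimeout: ptr.To(5 * time.Minute)}
    wl := &kueue.Workload{
        Status: kueue.WorkloadStatus{
            Conditions: []metav1.Condition{{
                Type:               kueue.WorkloadAdmitted,
                Status:             metav1.ConditionTrue,
                Reason:             "Admitted",
                LastTransitionTime: metav1.NewTime(c.Now().Add(-2 * time.Minute)),
            }},
        },
    }
    return r.admittedNotReadyWorkload(wl, c) // true, ~3m
}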

func workloadStatus(w *kueue.Workload) string {
    if apimeta.IsStatusConditionTrue(w.Status.Conditions, kueue.WorkloadFinished) {
        return finished
    }
    if workload.HasQuotaReservation(w) {
        return admitted
    }
    return pending
}

type resourceUpdatesHandler struct {
    r *WorkloadReconciler
}

func (h *resourceUpdatesHandler) Create(ctx context.Context, e event.CreateEvent, q workqueue.RateLimitingInterface) {
    log := ctrl.LoggerFrom(ctx).WithValues("kind", e.Object.GetObjectKind())
    ctx = ctrl.LoggerInto(ctx, log)
    log.V(5).Info("Create event")
    h.handle(ctx, e.Object, q)
}

func (h *resourceUpdatesHandler) Update(ctx context.Context, e event.UpdateEvent, q workqueue.RateLimitingInterface) {
    log := ctrl.LoggerFrom(ctx).WithValues("kind", e.ObjectNew.GetObjectKind())
    ctx = ctrl.LoggerInto(ctx, log)
    log.V(5).Info("Update event")
    h.handle(ctx, e.ObjectNew, q)
}

func (h *resourceUpdatesHandler) Delete(ctx context.Context, e event.DeleteEvent, q workqueue.RateLimitingInterface) {
    log := ctrl.LoggerFrom(ctx).WithValues("kind", e.Object.GetObjectKind())
    ctx = ctrl.LoggerInto(ctx, log)
    log.V(5).Info("Delete event")
    h.handle(ctx, e.Object, q)
}

func (h *resourceUpdatesHandler) Generic(_ context.Context, _ event.GenericEvent, _ workqueue.RateLimitingInterface) {
}

func (h *resourceUpdatesHandler) handle(ctx context.Context, obj client.Object, q workqueue.RateLimitingInterface) {
    switch v := obj.(type) {
    case *corev1.LimitRange:
        log := ctrl.LoggerFrom(ctx).WithValues("limitRange", klog.KObj(v))
        ctx = ctrl.LoggerInto(ctx, log)
        h.queueReconcileForPending(ctx, q, client.InNamespace(v.Namespace))
    case *nodev1.RuntimeClass:
        log := ctrl.LoggerFrom(ctx).WithValues("runtimeClass", klog.KObj(v))
        ctx = ctrl.LoggerInto(ctx, log)
        h.queueReconcileForPending(ctx, q, client.MatchingFields{indexer.WorkloadRuntimeClassKey: v.Name})
    default:
        panic(v)
    }
}

func (h *resourceUpdatesHandler) queueReconcileForPending(ctx context.Context, _ workqueue.RateLimitingInterface, opts ...client.ListOption) {
    log := ctrl.LoggerFrom(ctx)
    lst := kueue.WorkloadList{}
    opts = append(opts, client.MatchingFields{indexer.WorkloadQuotaReservedKey: string(metav1.ConditionFalse)})
    err := h.r.client.List(ctx, &lst, opts...)
    if err != nil {
        log.Error(err, "Could not list pending workloads")
    }
    log.V(4).Info("Updating pending workload requests", "count", len(lst.Items))
    for _, w := range lst.Items {
        wlCopy := w.DeepCopy()
        log := log.WithValues("workload", klog.KObj(wlCopy))
        log.V(5).Info("Queue reconcile for")
        workload.AdjustResources(ctrl.LoggerInto(ctx, log), h.r.client, wlCopy)
        if !h.r.queues.AddOrUpdateWorkload(wlCopy) {
            log.V(2).Info("Queue for workload didn't exist")
        }
    }
}

type workloadCqHandler struct {
    client client.Client
}

var _ handler.EventHandler = (*workloadCqHandler)(nil)

// Create is called in response to a create event.
func (w *workloadCqHandler) Create(ctx context.Context, ev event.CreateEvent, wq workqueue.RateLimitingInterface) {
    if cq, isQueue := ev.Object.(*kueue.ClusterQueue); isQueue {
        w.queueReconcileForWorkloads(ctx, cq.Name, wq)
    }
}

// Update is called in response to an update event.
func (w *workloadCqHandler) Update(ctx context.Context, ev event.UpdateEvent, wq workqueue.RateLimitingInterface) {
    log := ctrl.LoggerFrom(ctx).WithValues("clusterQueue", klog.KObj(ev.ObjectNew))
    ctx = ctrl.LoggerInto(ctx, log)
    log.V(5).Info("Workload cluster queue update event")
    oldCq, oldIsQueue := ev.ObjectOld.(*kueue.ClusterQueue)
    newCq, newIsQueue := ev.ObjectNew.(*kueue.ClusterQueue)

    if !oldIsQueue || !newIsQueue {
        return
    }

    if !newCq.DeletionTimestamp.IsZero() ||
        !slices.CmpNoOrder(oldCq.Spec.AdmissionChecks, newCq.Spec.AdmissionChecks) ||
        !ptr.Equal(oldCq.Spec.StopPolicy, newCq.Spec.StopPolicy) {
        w.queueReconcileForWorkloads(ctx, newCq.Name, wq)
    }
}

// Delete is called in response to a delete event.
func (w *workloadCqHandler) Delete(ctx context.Context, ev event.DeleteEvent, wq workqueue.RateLimitingInterface) {
    if cq, isQueue := ev.Object.(*kueue.ClusterQueue); isQueue {
        w.queueReconcileForWorkloads(ctx, cq.Name, wq)
    }
}

// Generic is called in response to an event of an unknown type or a synthetic event
// triggered as a cron or external trigger request.
func (w *workloadCqHandler) Generic(_ context.Context, _ event.GenericEvent, _ workqueue.RateLimitingInterface) {
    // nothing to do here
}

func (w *workloadCqHandler) queueReconcileForWorkloads(ctx context.Context, cqName string, wq workqueue.RateLimitingInterface) {
    log := ctrl.LoggerFrom(ctx)
    lst := kueue.LocalQueueList{}
    err := w.client.List(ctx, &lst, client.MatchingFields{indexer.QueueClusterQueueKey: cqName})
    if err != nil {
        log.Error(err, "Could not list cluster queues local queues")
    }
    for _, lq := range lst.Items {
        log := log.WithValues("localQueue", klog.KObj(&lq))
        ctx = ctrl.LoggerInto(ctx, log)
        w.queueReconcileForWorkloadsOfLocalQueue(ctx, lq.Namespace, lq.Name, wq)
    }
}

func (w *workloadCqHandler) queueReconcileForWorkloadsOfLocalQueue(ctx context.Context, namespace string, name string, wq workqueue.RateLimitingInterface) {
    log := ctrl.LoggerFrom(ctx)
    lst := kueue.WorkloadList{}
    err := w.client.List(ctx, &lst, &client.ListOptions{Namespace: namespace}, client.MatchingFields{indexer.WorkloadQueueKey: name})
    if err != nil {
        log.Error(err, "Could not list cluster queues workloads")
    }
    for _, wl := range lst.Items {
        log := log.WithValues("workload", klog.KObj(&wl))
        req := reconcile.Request{
            NamespacedName: types.NamespacedName{
                Name:      wl.Name,
                Namespace: wl.Namespace,
            },
        }
        wq.Add(req)
        log.V(5).Info("Queued reconcile for workload")
    }
}