sigs.k8s.io/kueue@v0.6.2/pkg/queue/manager.go

/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package queue

import (
	"context"
	"errors"
	"fmt"
	"sync"

	"k8s.io/apimachinery/pkg/api/equality"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/util/sets"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	config "sigs.k8s.io/kueue/apis/config/v1beta1"
	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	utilindexer "sigs.k8s.io/kueue/pkg/controller/core/indexer"
	"sigs.k8s.io/kueue/pkg/metrics"
	"sigs.k8s.io/kueue/pkg/workload"
)

var (
	errQueueDoesNotExist         = errors.New("queue doesn't exist")
	errClusterQueueDoesNotExist  = errors.New("clusterQueue doesn't exist")
	errClusterQueueAlreadyExists = errors.New("clusterQueue already exists")
)

type options struct {
	podsReadyRequeuingTimestamp config.RequeuingTimestamp
}

// Option configures the manager.
type Option func(*options)

var defaultOptions = options{
	podsReadyRequeuingTimestamp: config.EvictionTimestamp,
}

// WithPodsReadyRequeuingTimestamp sets the timestamp that is used for ordering
// workloads that have been requeued due to the PodsReady condition.
func WithPodsReadyRequeuingTimestamp(ts config.RequeuingTimestamp) Option {
	return func(o *options) {
		o.podsReadyRequeuingTimestamp = ts
	}
}
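
// Illustrative sketch of how a caller might combine this option with
// NewManager (defined below). Names such as k8sClient and cqCache are
// placeholders for a client.Client and a StatusChecker implementation:
//
//	qManager := queue.NewManager(k8sClient, cqCache,
//		queue.WithPodsReadyRequeuingTimestamp(config.EvictionTimestamp))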

type Manager struct {
	sync.RWMutex
	cond sync.Cond

	client        client.Client
	statusChecker StatusChecker
	clusterQueues map[string]ClusterQueue
	localQueues   map[string]*LocalQueue

	snapshotsMutex sync.RWMutex
	snapshots      map[string][]kueue.ClusterQueuePendingWorkload

	// Key is cohort's name. Value is a set of associated ClusterQueue names.
	cohorts map[string]sets.Set[string]

	workloadOrdering workload.Ordering
}

func NewManager(client client.Client, checker StatusChecker, opts ...Option) *Manager {
	options := defaultOptions
	for _, opt := range opts {
		opt(&options)
	}
	m := &Manager{
		client:         client,
		statusChecker:  checker,
		localQueues:    make(map[string]*LocalQueue),
		clusterQueues:  make(map[string]ClusterQueue),
		cohorts:        make(map[string]sets.Set[string]),
		snapshotsMutex: sync.RWMutex{},
		snapshots:      make(map[string][]kueue.ClusterQueuePendingWorkload, 0),
		workloadOrdering: workload.Ordering{
			PodsReadyRequeuingTimestamp: options.podsReadyRequeuingTimestamp,
		},
	}
	m.cond.L = &m.RWMutex
	return m
}

func (m *Manager) AddClusterQueue(ctx context.Context, cq *kueue.ClusterQueue) error {
	m.Lock()
	defer m.Unlock()

	if _, ok := m.clusterQueues[cq.Name]; ok {
		return errClusterQueueAlreadyExists
	}

	cqImpl, err := newClusterQueue(cq, m.workloadOrdering)
	if err != nil {
		return err
	}
	m.clusterQueues[cq.Name] = cqImpl

	cohort := cq.Spec.Cohort
	if cohort != "" {
		m.addCohort(cohort, cq.Name)
	}

	// Iterate through existing queues, as queues corresponding to this cluster
	// queue might have been added earlier.
	var queues kueue.LocalQueueList
	if err := m.client.List(ctx, &queues, client.MatchingFields{utilindexer.QueueClusterQueueKey: cq.Name}); err != nil {
		return fmt.Errorf("listing queues pointing to the cluster queue: %w", err)
	}
	addedWorkloads := false
	for _, q := range queues.Items {
		qImpl := m.localQueues[Key(&q)]
		if qImpl != nil {
			added := cqImpl.AddFromLocalQueue(qImpl)
			addedWorkloads = addedWorkloads || added
		}
	}

	queued := m.queueAllInadmissibleWorkloadsInCohort(ctx, cqImpl)
	m.reportPendingWorkloads(cq.Name, cqImpl)
	if queued || addedWorkloads {
		m.Broadcast()
	}
	return nil
}
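
// Illustrative sketch of how a ClusterQueue event handler might feed the
// method above; qManager, ctx, cq and log are placeholders supplied by the caller:
//
//	// On a ClusterQueue create event:
//	if err := qManager.AddClusterQueue(ctx, cq); err != nil {
//		log.Error(err, "Failed to add ClusterQueue to the queue manager")
//	}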

func (m *Manager) UpdateClusterQueue(ctx context.Context, cq *kueue.ClusterQueue, specUpdated bool) error {
	m.Lock()
	defer m.Unlock()
	cqImpl, ok := m.clusterQueues[cq.Name]
	if !ok {
		return errClusterQueueDoesNotExist
	}

	oldCohort := cqImpl.Cohort()
	oldActive := cqImpl.Active()
	// TODO(#8): recreate heap based on a change of queueing policy.
	if err := cqImpl.Update(cq); err != nil {
		return err
	}
	newCohort := cqImpl.Cohort()
	if oldCohort != newCohort {
		m.updateCohort(oldCohort, newCohort, cq.Name)
	}

	// TODO(#8): Selectively move workloads based on the exact event.
	// If any workload becomes admissible or the queue becomes active.
	if (specUpdated && m.queueAllInadmissibleWorkloadsInCohort(ctx, cqImpl)) || (!oldActive && cqImpl.Active()) {
		m.reportPendingWorkloads(cq.Name, cqImpl)
		m.Broadcast()
	}
	return nil
}

func (m *Manager) DeleteClusterQueue(cq *kueue.ClusterQueue) {
	m.Lock()
	defer m.Unlock()
	cqImpl := m.clusterQueues[cq.Name]
	if cqImpl == nil {
		return
	}
	delete(m.clusterQueues, cq.Name)
	metrics.ClearQueueSystemMetrics(cq.Name)

	cohort := cq.Spec.Cohort
	m.deleteCohort(cohort, cq.Name)
}

func (m *Manager) AddLocalQueue(ctx context.Context, q *kueue.LocalQueue) error {
	m.Lock()
	defer m.Unlock()

	key := Key(q)
	if _, ok := m.localQueues[key]; ok {
		return fmt.Errorf("queue %q already exists", q.Name)
	}
	qImpl := newLocalQueue(q)
	m.localQueues[key] = qImpl
	// Iterate through existing workloads, as workloads corresponding to this
	// queue might have been added earlier.
	var workloads kueue.WorkloadList
	if err := m.client.List(ctx, &workloads, client.MatchingFields{utilindexer.WorkloadQueueKey: q.Name}, client.InNamespace(q.Namespace)); err != nil {
		return fmt.Errorf("listing workloads that match the queue: %w", err)
	}
	for _, w := range workloads.Items {
		w := w
		if workload.HasQuotaReservation(&w) {
			continue
		}
		workload.AdjustResources(ctx, m.client, &w)
		qImpl.AddOrUpdate(workload.NewInfo(&w))
	}
	cq := m.clusterQueues[qImpl.ClusterQueue]
	if cq != nil && cq.AddFromLocalQueue(qImpl) {
		m.Broadcast()
	}
	return nil
}

func (m *Manager) UpdateLocalQueue(q *kueue.LocalQueue) error {
	m.Lock()
	defer m.Unlock()
	qImpl, ok := m.localQueues[Key(q)]
	if !ok {
		return errQueueDoesNotExist
	}
	if qImpl.ClusterQueue != string(q.Spec.ClusterQueue) {
		oldCQ := m.clusterQueues[qImpl.ClusterQueue]
		if oldCQ != nil {
			oldCQ.DeleteFromLocalQueue(qImpl)
		}
		newCQ := m.clusterQueues[string(q.Spec.ClusterQueue)]
		if newCQ != nil && newCQ.AddFromLocalQueue(qImpl) {
			m.Broadcast()
		}
	}
	qImpl.update(q)
	return nil
}

func (m *Manager) DeleteLocalQueue(q *kueue.LocalQueue) {
	m.Lock()
	defer m.Unlock()
	key := Key(q)
	qImpl := m.localQueues[key]
	if qImpl == nil {
		return
	}
	cq := m.clusterQueues[qImpl.ClusterQueue]
	if cq != nil {
		cq.DeleteFromLocalQueue(qImpl)
	}
	delete(m.localQueues, key)
}

func (m *Manager) PendingWorkloads(q *kueue.LocalQueue) (int32, error) {
	m.RLock()
	defer m.RUnlock()

	qImpl, ok := m.localQueues[Key(q)]
	if !ok {
		return 0, errQueueDoesNotExist
	}

	return int32(len(qImpl.items)), nil
}

func (m *Manager) Pending(cq *kueue.ClusterQueue) int {
	m.RLock()
	defer m.RUnlock()
	return m.clusterQueues[cq.Name].Pending()
}

func (m *Manager) QueueForWorkloadExists(wl *kueue.Workload) bool {
	m.RLock()
	defer m.RUnlock()
	_, ok := m.localQueues[workload.QueueKey(wl)]
	return ok
}
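
// Illustrative sketch of how a LocalQueue reconciler might use PendingWorkloads
// to fill in status; qManager and lq are placeholders, and the status field
// name is an assumption:
//
//	if pending, err := qManager.PendingWorkloads(lq); err == nil {
//		lq.Status.PendingWorkloads = pending
//	}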

// ClusterQueueForWorkload returns the name of the ClusterQueue where the
// workload should be queued and whether it exists.
// Returns an empty string if the queue doesn't exist.
func (m *Manager) ClusterQueueForWorkload(wl *kueue.Workload) (string, bool) {
	m.RLock()
	defer m.RUnlock()
	q, ok := m.localQueues[workload.QueueKey(wl)]
	if !ok {
		return "", false
	}
	_, ok = m.clusterQueues[q.ClusterQueue]
	return q.ClusterQueue, ok
}

// AddOrUpdateWorkload adds or updates a workload in the corresponding queue.
// Returns whether the queue existed.
func (m *Manager) AddOrUpdateWorkload(w *kueue.Workload) bool {
	m.Lock()
	defer m.Unlock()
	return m.addOrUpdateWorkload(w)
}

func (m *Manager) addOrUpdateWorkload(w *kueue.Workload) bool {
	qKey := workload.QueueKey(w)
	q := m.localQueues[qKey]
	if q == nil {
		return false
	}
	wInfo := workload.NewInfo(w)
	q.AddOrUpdate(wInfo)
	cq := m.clusterQueues[q.ClusterQueue]
	if cq == nil {
		return false
	}
	cq.PushOrUpdate(wInfo)
	m.reportPendingWorkloads(q.ClusterQueue, cq)
	m.Broadcast()
	return true
}

// RequeueWorkload requeues the workload, ensuring that the queue and the
// workload still exist in the client cache and that the workload is not
// admitted. It won't requeue if the workload is already in the queue
// (possible if the workload was updated).
func (m *Manager) RequeueWorkload(ctx context.Context, info *workload.Info, reason RequeueReason) bool {
	m.Lock()
	defer m.Unlock()

	var w kueue.Workload
	// Always get the newest workload to avoid requeuing an out-of-date object.
	err := m.client.Get(ctx, client.ObjectKeyFromObject(info.Obj), &w)
	// Since the client is cached, the only possible error is NotFound.
	if apierrors.IsNotFound(err) || workload.HasQuotaReservation(&w) {
		return false
	}

	q := m.localQueues[workload.QueueKey(&w)]
	if q == nil {
		return false
	}
	info.Update(&w)
	q.AddOrUpdate(info)
	cq := m.clusterQueues[q.ClusterQueue]
	if cq == nil {
		return false
	}

	added := cq.RequeueIfNotPresent(info, reason)
	m.reportPendingWorkloads(q.ClusterQueue, cq)
	if added {
		m.Broadcast()
	}
	return added
}

func (m *Manager) DeleteWorkload(w *kueue.Workload) {
	m.Lock()
	m.deleteWorkloadFromQueueAndClusterQueue(w, workload.QueueKey(w))
	m.Unlock()
}

func (m *Manager) deleteWorkloadFromQueueAndClusterQueue(w *kueue.Workload, qKey string) {
	q := m.localQueues[qKey]
	if q == nil {
		return
	}
	delete(q.items, workload.Key(w))
	cq := m.clusterQueues[q.ClusterQueue]
	if cq != nil {
		cq.Delete(w)
		m.reportPendingWorkloads(q.ClusterQueue, cq)
	}
}
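
// Illustrative sketch of how a Workload event handler might use the methods
// above; qManager, wl and log are placeholders:
//
//	// On a Workload create/update event, when the workload has no quota reservation:
//	if !qManager.AddOrUpdateWorkload(wl) {
//		log.V(2).Info("LocalQueue for workload doesn't exist yet", "workload", wl.Name)
//	}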

// QueueAssociatedInadmissibleWorkloadsAfter requeues into the heaps all
// previously inadmissible workloads in the same ClusterQueue and cohort (if
// they exist) as the provided admitted workload.
// An optional action can be executed at the beginning of the function,
// while holding the lock, to provide atomicity with the operations in the
// queues.
func (m *Manager) QueueAssociatedInadmissibleWorkloadsAfter(ctx context.Context, w *kueue.Workload, action func()) {
	m.Lock()
	defer m.Unlock()
	if action != nil {
		action()
	}

	q := m.localQueues[workload.QueueKey(w)]
	if q == nil {
		return
	}
	cq := m.clusterQueues[q.ClusterQueue]
	if cq == nil {
		return
	}

	if m.queueAllInadmissibleWorkloadsInCohort(ctx, cq) {
		m.Broadcast()
	}
}

// QueueInadmissibleWorkloads moves all inadmissibleWorkloads in the
// corresponding ClusterQueues to the heap. If at least one workload is queued,
// the event is broadcast.
func (m *Manager) QueueInadmissibleWorkloads(ctx context.Context, cqNames sets.Set[string]) {
	m.Lock()
	defer m.Unlock()
	if len(cqNames) == 0 {
		return
	}

	var queued bool
	for name := range cqNames {
		cq, exists := m.clusterQueues[name]
		if !exists {
			continue
		}
		if m.queueAllInadmissibleWorkloadsInCohort(ctx, cq) {
			queued = true
		}
	}

	if queued {
		m.Broadcast()
	}
}

// queueAllInadmissibleWorkloadsInCohort moves all workloads in the same
// cohort as this ClusterQueue from inadmissibleWorkloads to the heap. If the
// cohort of this ClusterQueue is empty, it just moves the workloads in this
// ClusterQueue. It returns true if at least one workload is moved, and false
// otherwise.
// The events listed below could make workloads in the same cohort admissible,
// so queueAllInadmissibleWorkloadsInCohort needs to be invoked on:
// 1. delete events for any admitted workload in the cohort.
// 2. add events of any cluster queue in the cohort.
// 3. update events of any cluster queue in the cohort.
func (m *Manager) queueAllInadmissibleWorkloadsInCohort(ctx context.Context, cq ClusterQueue) bool {
	cohort := cq.Cohort()
	if cohort == "" {
		return cq.QueueInadmissibleWorkloads(ctx, m.client)
	}

	queued := false
	for cqName := range m.cohorts[cohort] {
		if clusterQueue, ok := m.clusterQueues[cqName]; ok {
			queued = clusterQueue.QueueInadmissibleWorkloads(ctx, m.client) || queued
		}
	}
	return queued
}

// UpdateWorkload updates the workload in the corresponding queue, or adds it
// if it didn't exist. Returns whether the queue existed.
func (m *Manager) UpdateWorkload(oldW, w *kueue.Workload) bool {
	m.Lock()
	defer m.Unlock()
	if oldW.Spec.QueueName != w.Spec.QueueName {
		m.deleteWorkloadFromQueueAndClusterQueue(w, workload.QueueKey(oldW))
	}
	return m.addOrUpdateWorkload(w)
}

// CleanUpOnContext tracks the context. When closed, it wakes routines waiting
// on elements to be available. It should be called before doing any calls to
// Heads.
func (m *Manager) CleanUpOnContext(ctx context.Context) {
	<-ctx.Done()
	m.Broadcast()
}
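
// Illustrative sketch of QueueAssociatedInadmissibleWorkloadsAfter in use: when
// a finished workload releases quota, its cohort peers are requeued atomically
// with the caller's own bookkeeping. qManager, wl and forgetWorkload are placeholders:
//
//	qManager.QueueAssociatedInadmissibleWorkloadsAfter(ctx, wl, func() {
//		forgetWorkload(wl) // e.g. remove the workload from the caller's cache
//	})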

// Heads returns the heads of the queues, along with their associated ClusterQueue.
// It blocks while the queues are empty, until they have elements or the context terminates.
func (m *Manager) Heads(ctx context.Context) []workload.Info {
	m.Lock()
	defer m.Unlock()
	log := ctrl.LoggerFrom(ctx)
	for {
		workloads := m.heads()
		log.V(3).Info("Obtained ClusterQueue heads", "count", len(workloads))
		if len(workloads) != 0 {
			return workloads
		}
		select {
		case <-ctx.Done():
			return nil
		default:
			m.cond.Wait()
		}
	}
}

func (m *Manager) heads() []workload.Info {
	var workloads []workload.Info
	for cqName, cq := range m.clusterQueues {
		// Cache might be nil in tests; if it is, skip the check.
		if m.statusChecker != nil && !m.statusChecker.ClusterQueueActive(cqName) {
			continue
		}
		wl := cq.Pop()
		if wl == nil {
			continue
		}
		m.reportPendingWorkloads(cqName, cq)
		wlCopy := *wl
		wlCopy.ClusterQueue = cqName
		workloads = append(workloads, wlCopy)
		q := m.localQueues[workload.QueueKey(wl.Obj)]
		delete(q.items, workload.Key(wl.Obj))
	}
	return workloads
}

func (m *Manager) addCohort(cohort string, cqName string) {
	if m.cohorts[cohort] == nil {
		m.cohorts[cohort] = make(sets.Set[string])
	}
	m.cohorts[cohort].Insert(cqName)
}

func (m *Manager) deleteCohort(cohort string, cqName string) {
	if cohort == "" {
		return
	}
	if m.cohorts[cohort] != nil {
		m.cohorts[cohort].Delete(cqName)
		if len(m.cohorts[cohort]) == 0 {
			delete(m.cohorts, cohort)
		}
	}
}

func (m *Manager) updateCohort(oldCohort string, newCohort string, cqName string) {
	m.deleteCohort(oldCohort, cqName)
	m.addCohort(newCohort, cqName)
}

func (m *Manager) Broadcast() {
	m.cond.Broadcast()
}

func (m *Manager) reportPendingWorkloads(cqName string, cq ClusterQueue) {
	active := cq.PendingActive()
	inadmissible := cq.PendingInadmissible()
	if m.statusChecker != nil && !m.statusChecker.ClusterQueueActive(cqName) {
		inadmissible += active
		active = 0
	}
	metrics.ReportPendingWorkloads(cqName, active, inadmissible)
}

func (m *Manager) GetClusterQueueNames() []string {
	m.RLock()
	defer m.RUnlock()
	clusterQueueNames := make([]string, 0, len(m.clusterQueues))
	for k := range m.clusterQueues {
		clusterQueueNames = append(clusterQueueNames, k)
	}
	return clusterQueueNames
}

func (m *Manager) getClusterQueue(cqName string) ClusterQueue {
	m.RLock()
	defer m.RUnlock()
	return m.clusterQueues[cqName]
}

func (m *Manager) PendingWorkloadsInfo(cqName string) []*workload.Info {
	cq := m.getClusterQueue(cqName)
	if cq == nil {
		return nil
	}
	return cq.Snapshot()
}

func (m *Manager) ClusterQueueFromLocalQueue(lqName string) (string, error) {
	if lq, ok := m.localQueues[lqName]; ok {
		return lq.ClusterQueue, nil
	}
	return "", errQueueDoesNotExist
}
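
// Illustrative sketch of a consumer loop over Heads. Per the CleanUpOnContext
// comment above, it is started first so that a cancelled context unblocks the
// wait; qManager and admit are placeholders:
//
//	go qManager.CleanUpOnContext(ctx)
//	for {
//		heads := qManager.Heads(ctx) // blocks until a head is available or ctx ends
//		if len(heads) == 0 {
//			return // context cancelled
//		}
//		for _, wl := range heads {
//			admit(wl)
//		}
//	}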

// UpdateSnapshot computes the new snapshot and replaces it if it differs from
// the previous version. It returns true if the snapshot was actually updated.
func (m *Manager) UpdateSnapshot(cqName string, maxCount int32) bool {
	cq := m.getClusterQueue(cqName)
	if cq == nil {
		return false
	}
	newSnapshot := make([]kueue.ClusterQueuePendingWorkload, 0)
	for index, info := range cq.Snapshot() {
		if int32(index) >= maxCount {
			break
		}
		if info == nil {
			continue
		}
		newSnapshot = append(newSnapshot, kueue.ClusterQueuePendingWorkload{
			Name:      info.Obj.Name,
			Namespace: info.Obj.Namespace,
		})
	}
	prevSnapshot := m.GetSnapshot(cqName)
	if !equality.Semantic.DeepEqual(prevSnapshot, newSnapshot) {
		m.setSnapshot(cqName, newSnapshot)
		return true
	}
	return false
}

func (m *Manager) setSnapshot(cqName string, workloads []kueue.ClusterQueuePendingWorkload) {
	m.snapshotsMutex.Lock()
	defer m.snapshotsMutex.Unlock()
	m.snapshots[cqName] = workloads
}

func (m *Manager) GetSnapshot(cqName string) []kueue.ClusterQueuePendingWorkload {
	m.snapshotsMutex.RLock()
	defer m.snapshotsMutex.RUnlock()
	return m.snapshots[cqName]
}

func (m *Manager) DeleteSnapshot(cq *kueue.ClusterQueue) {
	m.snapshotsMutex.Lock()
	defer m.snapshotsMutex.Unlock()
	delete(m.snapshots, cq.Name)
}
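
// Illustrative sketch of how the snapshot helpers might be driven by a periodic
// visibility refresh; qManager, cqName, maxPendingWorkloads and
// publishPendingWorkloads are placeholders:
//
//	if qManager.UpdateSnapshot(cqName, maxPendingWorkloads) {
//		pending := qManager.GetSnapshot(cqName)
//		publishPendingWorkloads(cqName, pending) // e.g. update status/visibility data
//	}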