sigs.k8s.io/kueue@v0.6.2/pkg/cache/clusterqueue.go (about) 1 /* 2 Copyright 2023 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package cache 18 19 import ( 20 "errors" 21 "fmt" 22 "strings" 23 24 corev1 "k8s.io/api/core/v1" 25 "k8s.io/apimachinery/pkg/api/equality" 26 apimeta "k8s.io/apimachinery/pkg/api/meta" 27 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 28 "k8s.io/apimachinery/pkg/labels" 29 "k8s.io/apimachinery/pkg/util/sets" 30 "k8s.io/utils/ptr" 31 32 kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1" 33 "sigs.k8s.io/kueue/pkg/features" 34 "sigs.k8s.io/kueue/pkg/metrics" 35 "sigs.k8s.io/kueue/pkg/workload" 36 ) 37 38 var ( 39 errQueueAlreadyExists = errors.New("queue already exists") 40 ) 41 42 // ClusterQueue is the internal implementation of kueue.ClusterQueue that 43 // holds admitted workloads. 44 type ClusterQueue struct { 45 Name string 46 Cohort *Cohort 47 ResourceGroups []ResourceGroup 48 RGByResource map[corev1.ResourceName]*ResourceGroup 49 Usage FlavorResourceQuantities 50 AdmittedUsage FlavorResourceQuantities 51 Workloads map[string]*workload.Info 52 WorkloadsNotReady sets.Set[string] 53 NamespaceSelector labels.Selector 54 Preemption kueue.ClusterQueuePreemption 55 FlavorFungibility kueue.FlavorFungibility 56 AdmissionChecks sets.Set[string] 57 Status metrics.ClusterQueueStatus 58 // GuaranteedQuota records how much resource quota the ClusterQueue reserved 59 // when feature LendingLimit is enabled and flavor's lendingLimit is not nil. 60 GuaranteedQuota FlavorResourceQuantities 61 // AllocatableResourceGeneration will be increased when some admitted workloads are 62 // deleted, or the resource groups are changed. 63 AllocatableResourceGeneration int64 64 65 // The following fields are not populated in a snapshot. 66 67 // Key is localQueue's key (namespace/name). 68 localQueues map[string]*queue 69 podsReadyTracking bool 70 hasMissingFlavors bool 71 hasMissingOrInactiveAdmissionChecks bool 72 admittedWorkloadsCount int 73 isStopped bool 74 } 75 76 // Cohort is a set of ClusterQueues that can borrow resources from each other. 77 type Cohort struct { 78 Name string 79 Members sets.Set[*ClusterQueue] 80 81 // These fields are only populated for a snapshot. This field equals to 82 // the sum of LendingLimit when feature LendingLimit enabled. 83 RequestableResources FlavorResourceQuantities 84 Usage FlavorResourceQuantities 85 // This field will only be set in snapshot. This field equals to 86 // the sum of allocatable generation among its members. 87 AllocatableResourceGeneration int64 88 } 89 90 type ResourceGroup struct { 91 CoveredResources sets.Set[corev1.ResourceName] 92 Flavors []FlavorQuotas 93 // The set of key labels from all flavors. 94 // Those keys define the affinity terms of a workload 95 // that can be matched against the flavors. 96 LabelKeys sets.Set[string] 97 } 98 99 // FlavorQuotas holds a processed ClusterQueue flavor quota. 100 type FlavorQuotas struct { 101 Name kueue.ResourceFlavorReference 102 Resources map[corev1.ResourceName]*ResourceQuota 103 } 104 105 type ResourceQuota struct { 106 Nominal int64 107 BorrowingLimit *int64 108 LendingLimit *int64 109 } 110 111 type FlavorResourceQuantities map[kueue.ResourceFlavorReference]map[corev1.ResourceName]int64 112 113 type queue struct { 114 key string 115 reservingWorkloads int 116 admittedWorkloads int 117 //TODO: rename this to better distinguish between reserved and "in use" quantities 118 usage FlavorResourceQuantities 119 admittedUsage FlavorResourceQuantities 120 } 121 122 func newCohort(name string, size int) *Cohort { 123 return &Cohort{ 124 Name: name, 125 Members: make(sets.Set[*ClusterQueue], size), 126 } 127 } 128 129 func (c *ClusterQueue) FitInCohort(q FlavorResourceQuantities) bool { 130 for flavor, qResources := range q { 131 if _, flavorFound := c.Cohort.RequestableResources[flavor]; flavorFound { 132 for resource, value := range qResources { 133 available := c.RequestableCohortQuota(flavor, resource) - c.UsedCohortQuota(flavor, resource) 134 if available < value { 135 return false 136 } 137 } 138 } else { 139 return false 140 } 141 } 142 return true 143 } 144 145 func (c *ClusterQueue) IsBorrowing() bool { 146 if c.Cohort == nil || len(c.Usage) == 0 { 147 return false 148 } 149 for _, rg := range c.ResourceGroups { 150 for _, flvQuotas := range rg.Flavors { 151 if flvUsage, isUsing := c.Usage[flvQuotas.Name]; isUsing { 152 for rName, rQuota := range flvQuotas.Resources { 153 used := flvUsage[rName] 154 if used > rQuota.Nominal { 155 return true 156 } 157 } 158 } 159 } 160 } 161 return false 162 } 163 164 func (c *ClusterQueue) Active() bool { 165 return c.Status == active 166 } 167 168 var defaultPreemption = kueue.ClusterQueuePreemption{ 169 ReclaimWithinCohort: kueue.PreemptionPolicyNever, 170 WithinClusterQueue: kueue.PreemptionPolicyNever, 171 } 172 173 var defaultFlavorFungibility = kueue.FlavorFungibility{WhenCanBorrow: kueue.Borrow, WhenCanPreempt: kueue.TryNextFlavor} 174 175 func (c *ClusterQueue) update(in *kueue.ClusterQueue, resourceFlavors map[kueue.ResourceFlavorReference]*kueue.ResourceFlavor, admissionChecks map[string]AdmissionCheck) error { 176 c.updateResourceGroups(in.Spec.ResourceGroups) 177 nsSelector, err := metav1.LabelSelectorAsSelector(in.Spec.NamespaceSelector) 178 if err != nil { 179 return err 180 } 181 c.NamespaceSelector = nsSelector 182 183 c.isStopped = ptr.Deref(in.Spec.StopPolicy, kueue.None) != kueue.None 184 185 c.AdmissionChecks = sets.New(in.Spec.AdmissionChecks...) 186 187 c.Usage = filterQuantities(c.Usage, in.Spec.ResourceGroups) 188 c.AdmittedUsage = filterQuantities(c.AdmittedUsage, in.Spec.ResourceGroups) 189 c.UpdateWithFlavors(resourceFlavors) 190 c.updateWithAdmissionChecks(admissionChecks) 191 192 if in.Spec.Preemption != nil { 193 c.Preemption = *in.Spec.Preemption 194 } else { 195 c.Preemption = defaultPreemption 196 } 197 198 if in.Spec.FlavorFungibility != nil { 199 c.FlavorFungibility = *in.Spec.FlavorFungibility 200 if c.FlavorFungibility.WhenCanBorrow == "" { 201 c.FlavorFungibility.WhenCanBorrow = defaultFlavorFungibility.WhenCanBorrow 202 } 203 if c.FlavorFungibility.WhenCanPreempt == "" { 204 c.FlavorFungibility.WhenCanPreempt = defaultFlavorFungibility.WhenCanPreempt 205 } 206 } else { 207 c.FlavorFungibility = defaultFlavorFungibility 208 } 209 210 if features.Enabled(features.LendingLimit) { 211 var guaranteedQuota FlavorResourceQuantities 212 for _, rg := range c.ResourceGroups { 213 for _, flvQuotas := range rg.Flavors { 214 for rName, rQuota := range flvQuotas.Resources { 215 if rQuota.LendingLimit != nil { 216 if guaranteedQuota == nil { 217 guaranteedQuota = make(FlavorResourceQuantities) 218 } 219 if guaranteedQuota[flvQuotas.Name] == nil { 220 guaranteedQuota[flvQuotas.Name] = make(map[corev1.ResourceName]int64) 221 } 222 guaranteedQuota[flvQuotas.Name][rName] = rQuota.Nominal - *rQuota.LendingLimit 223 } 224 } 225 } 226 } 227 c.GuaranteedQuota = guaranteedQuota 228 } 229 230 return nil 231 } 232 233 func filterQuantities(orig FlavorResourceQuantities, resourceGroups []kueue.ResourceGroup) FlavorResourceQuantities { 234 ret := make(FlavorResourceQuantities) 235 for _, rg := range resourceGroups { 236 for _, f := range rg.Flavors { 237 existingUsedResources := orig[f.Name] 238 usedResources := make(map[corev1.ResourceName]int64, len(f.Resources)) 239 for _, r := range f.Resources { 240 usedResources[r.Name] = existingUsedResources[r.Name] 241 } 242 ret[f.Name] = usedResources 243 } 244 } 245 return ret 246 } 247 248 func (c *ClusterQueue) updateResourceGroups(in []kueue.ResourceGroup) { 249 oldRG := c.ResourceGroups 250 c.ResourceGroups = make([]ResourceGroup, len(in)) 251 for i, rgIn := range in { 252 rg := &c.ResourceGroups[i] 253 *rg = ResourceGroup{ 254 CoveredResources: sets.New(rgIn.CoveredResources...), 255 Flavors: make([]FlavorQuotas, 0, len(rgIn.Flavors)), 256 } 257 for i := range rgIn.Flavors { 258 fIn := &rgIn.Flavors[i] 259 fQuotas := FlavorQuotas{ 260 Name: fIn.Name, 261 Resources: make(map[corev1.ResourceName]*ResourceQuota, len(fIn.Resources)), 262 } 263 for _, rIn := range fIn.Resources { 264 rQuota := ResourceQuota{ 265 Nominal: workload.ResourceValue(rIn.Name, rIn.NominalQuota), 266 } 267 if rIn.BorrowingLimit != nil { 268 rQuota.BorrowingLimit = ptr.To(workload.ResourceValue(rIn.Name, *rIn.BorrowingLimit)) 269 } 270 if features.Enabled(features.LendingLimit) && rIn.LendingLimit != nil { 271 rQuota.LendingLimit = ptr.To(workload.ResourceValue(rIn.Name, *rIn.LendingLimit)) 272 } 273 fQuotas.Resources[rIn.Name] = &rQuota 274 } 275 rg.Flavors = append(rg.Flavors, fQuotas) 276 } 277 } 278 // Start at 1, for backwards compatibility. 279 if c.AllocatableResourceGeneration == 0 || !equality.Semantic.DeepEqual(oldRG, c.ResourceGroups) { 280 c.AllocatableResourceGeneration++ 281 } 282 c.UpdateRGByResource() 283 } 284 285 func (c *ClusterQueue) UpdateRGByResource() { 286 c.RGByResource = make(map[corev1.ResourceName]*ResourceGroup) 287 for i := range c.ResourceGroups { 288 rg := &c.ResourceGroups[i] 289 for rName := range rg.CoveredResources { 290 c.RGByResource[rName] = rg 291 } 292 } 293 } 294 295 func (c *ClusterQueue) updateQueueStatus() { 296 status := active 297 if c.hasMissingFlavors || c.hasMissingOrInactiveAdmissionChecks || c.isStopped { 298 status = pending 299 } 300 if c.Status == terminating { 301 status = terminating 302 } 303 if status != c.Status { 304 c.Status = status 305 metrics.ReportClusterQueueStatus(c.Name, c.Status) 306 } 307 } 308 309 func (c *ClusterQueue) inactiveReason() (string, string) { 310 switch c.Status { 311 case terminating: 312 return "Terminating", "Can't admit new workloads; clusterQueue is terminating" 313 case pending: 314 reasons := make([]string, 0, 3) 315 if c.isStopped { 316 reasons = append(reasons, "Stopped") 317 } 318 if c.hasMissingFlavors { 319 reasons = append(reasons, "FlavorNotFound") 320 } 321 if c.hasMissingOrInactiveAdmissionChecks { 322 reasons = append(reasons, "CheckNotFoundOrInactive") 323 } 324 325 if len(reasons) == 0 { 326 return "Unknown", "Can't admit new workloads." 327 } 328 329 return reasons[0], strings.Join([]string{"Can't admit new workloads:", strings.Join(reasons, ", ")}, " ") 330 } 331 return "Ready", "Can admit new flavors" 332 } 333 334 // UpdateWithFlavors updates a ClusterQueue based on the passed ResourceFlavors set. 335 // Exported only for testing. 336 func (c *ClusterQueue) UpdateWithFlavors(flavors map[kueue.ResourceFlavorReference]*kueue.ResourceFlavor) { 337 c.hasMissingFlavors = c.updateLabelKeys(flavors) 338 c.updateQueueStatus() 339 } 340 341 func (c *ClusterQueue) updateLabelKeys(flavors map[kueue.ResourceFlavorReference]*kueue.ResourceFlavor) bool { 342 var flavorNotFound bool 343 for i := range c.ResourceGroups { 344 rg := &c.ResourceGroups[i] 345 if len(rg.Flavors) == 0 { 346 rg.LabelKeys = nil 347 continue 348 } 349 keys := sets.New[string]() 350 for _, rf := range rg.Flavors { 351 if flv, exist := flavors[rf.Name]; exist { 352 for k := range flv.Spec.NodeLabels { 353 keys.Insert(k) 354 } 355 } else { 356 flavorNotFound = true 357 } 358 } 359 360 if keys.Len() > 0 { 361 rg.LabelKeys = keys 362 } 363 } 364 365 return flavorNotFound 366 } 367 368 // updateWithAdmissionChecks updates a ClusterQueue based on the passed AdmissionChecks set. 369 func (c *ClusterQueue) updateWithAdmissionChecks(checks map[string]AdmissionCheck) { 370 hasMissing := false 371 for acName := range c.AdmissionChecks { 372 if ac, found := checks[acName]; !found || !ac.Active { 373 hasMissing = true 374 break 375 } 376 } 377 378 if hasMissing != c.hasMissingOrInactiveAdmissionChecks { 379 c.hasMissingOrInactiveAdmissionChecks = hasMissing 380 c.updateQueueStatus() 381 } 382 } 383 384 func (c *ClusterQueue) addWorkload(w *kueue.Workload) error { 385 k := workload.Key(w) 386 if _, exist := c.Workloads[k]; exist { 387 return fmt.Errorf("workload already exists in ClusterQueue") 388 } 389 wi := workload.NewInfo(w) 390 c.Workloads[k] = wi 391 c.updateWorkloadUsage(wi, 1) 392 if c.podsReadyTracking && !apimeta.IsStatusConditionTrue(w.Status.Conditions, kueue.WorkloadPodsReady) { 393 c.WorkloadsNotReady.Insert(k) 394 } 395 c.reportActiveWorkloads() 396 return nil 397 } 398 399 func (c *ClusterQueue) deleteWorkload(w *kueue.Workload) { 400 k := workload.Key(w) 401 wi, exist := c.Workloads[k] 402 if !exist { 403 return 404 } 405 c.updateWorkloadUsage(wi, -1) 406 if c.podsReadyTracking && !apimeta.IsStatusConditionTrue(w.Status.Conditions, kueue.WorkloadPodsReady) { 407 c.WorkloadsNotReady.Delete(k) 408 } 409 // we only increase the AllocatableResourceGeneration cause the add of workload won't make more 410 // workloads fit in ClusterQueue. 411 c.AllocatableResourceGeneration++ 412 413 delete(c.Workloads, k) 414 c.reportActiveWorkloads() 415 } 416 417 func (c *ClusterQueue) reportActiveWorkloads() { 418 metrics.AdmittedActiveWorkloads.WithLabelValues(c.Name).Set(float64(c.admittedWorkloadsCount)) 419 metrics.ReservingActiveWorkloads.WithLabelValues(c.Name).Set(float64(len(c.Workloads))) 420 } 421 422 // updateWorkloadUsage updates the usage of the ClusterQueue for the workload 423 // and the number of admitted workloads for local queues. 424 func (c *ClusterQueue) updateWorkloadUsage(wi *workload.Info, m int64) { 425 admitted := workload.IsAdmitted(wi.Obj) 426 updateUsage(wi, c.Usage, m) 427 if admitted { 428 updateUsage(wi, c.AdmittedUsage, m) 429 c.admittedWorkloadsCount += int(m) 430 } 431 qKey := workload.QueueKey(wi.Obj) 432 if lq, ok := c.localQueues[qKey]; ok { 433 updateUsage(wi, lq.usage, m) 434 lq.reservingWorkloads += int(m) 435 if admitted { 436 updateUsage(wi, lq.admittedUsage, m) 437 lq.admittedWorkloads += int(m) 438 } 439 } 440 } 441 442 func updateUsage(wi *workload.Info, flvUsage FlavorResourceQuantities, m int64) { 443 for _, ps := range wi.TotalRequests { 444 for wlRes, wlResFlv := range ps.Flavors { 445 v, wlResExist := ps.Requests[wlRes] 446 flv, flvExist := flvUsage[wlResFlv] 447 if flvExist && wlResExist { 448 if _, exists := flv[wlRes]; exists { 449 flv[wlRes] += v * m 450 } 451 } 452 } 453 } 454 } 455 456 func updateCohortUsage(wi *workload.Info, cq *ClusterQueue, m int64) { 457 for _, ps := range wi.TotalRequests { 458 for wlRes, wlResFlv := range ps.Flavors { 459 v, wlResExist := ps.Requests[wlRes] 460 flv, flvExist := cq.Cohort.Usage[wlResFlv] 461 if flvExist && wlResExist { 462 if _, exists := flv[wlRes]; exists { 463 after := cq.Usage[wlResFlv][wlRes] - cq.guaranteedQuota(wlResFlv, wlRes) 464 // rollback update cq.Usage 465 before := after - v*m 466 if before > 0 { 467 flv[wlRes] -= before 468 } 469 // simulate updating cq.Usage 470 if after > 0 { 471 flv[wlRes] += after 472 } 473 } 474 } 475 } 476 } 477 } 478 479 func (c *ClusterQueue) addLocalQueue(q *kueue.LocalQueue) error { 480 qKey := queueKey(q) 481 if _, ok := c.localQueues[qKey]; ok { 482 return errQueueAlreadyExists 483 } 484 // We need to count the workloads, because they could have been added before 485 // receiving the queue add event. 486 qImpl := &queue{ 487 key: qKey, 488 reservingWorkloads: 0, 489 usage: make(FlavorResourceQuantities), 490 } 491 if err := qImpl.resetFlavorsAndResources(c.Usage, c.AdmittedUsage); err != nil { 492 return err 493 } 494 for _, wl := range c.Workloads { 495 if workloadBelongsToLocalQueue(wl.Obj, q) { 496 updateUsage(wl, qImpl.usage, 1) 497 qImpl.reservingWorkloads++ 498 if workload.IsAdmitted(wl.Obj) { 499 updateUsage(wl, qImpl.admittedUsage, 1) 500 qImpl.admittedWorkloads++ 501 } 502 } 503 } 504 c.localQueues[qKey] = qImpl 505 return nil 506 } 507 508 func (c *ClusterQueue) deleteLocalQueue(q *kueue.LocalQueue) { 509 qKey := queueKey(q) 510 delete(c.localQueues, qKey) 511 } 512 513 func (c *ClusterQueue) flavorInUse(flavor string) bool { 514 for _, rg := range c.ResourceGroups { 515 for _, f := range rg.Flavors { 516 if kueue.ResourceFlavorReference(flavor) == f.Name { 517 return true 518 } 519 } 520 } 521 return false 522 } 523 524 func (q *queue) resetFlavorsAndResources(cqUsage FlavorResourceQuantities, cqAdmittedUsage FlavorResourceQuantities) error { 525 // Clean up removed flavors or resources. 526 q.usage = resetUsage(q.usage, cqUsage) 527 q.admittedUsage = resetUsage(q.admittedUsage, cqAdmittedUsage) 528 return nil 529 } 530 531 func resetUsage(lqUsage FlavorResourceQuantities, cqUsage FlavorResourceQuantities) FlavorResourceQuantities { 532 usedFlavorResources := make(FlavorResourceQuantities) 533 for cqFlv, cqRes := range cqUsage { 534 existingUsedResources := lqUsage[cqFlv] 535 usedResources := make(map[corev1.ResourceName]int64, len(cqRes)) 536 for rName := range cqRes { 537 usedResources[rName] = existingUsedResources[rName] 538 } 539 usedFlavorResources[cqFlv] = usedResources 540 } 541 return usedFlavorResources 542 } 543 544 func workloadBelongsToLocalQueue(wl *kueue.Workload, q *kueue.LocalQueue) bool { 545 return wl.Namespace == q.Namespace && wl.Spec.QueueName == q.Name 546 } 547 548 // RequestableCohortQuota returns the total available quota by the flavor and resource name in the cohort. 549 // LendingLimit will also be counted here if feature LendingLimit enabled. 550 // Please note that for different clusterQueues, the requestable quota is different, 551 // they should be calculated dynamically. 552 func (c *ClusterQueue) RequestableCohortQuota(fName kueue.ResourceFlavorReference, rName corev1.ResourceName) (val int64) { 553 if c.Cohort.RequestableResources == nil || c.Cohort.RequestableResources[fName] == nil { 554 return 0 555 } 556 requestableCohortQuota := c.Cohort.RequestableResources[fName][rName] 557 558 // When feature LendingLimit enabled, cohort.requestableResource accumulated the lendingLimit if not null 559 // rather than the flavor's quota, then the total available quota should include its own guaranteed resources. 560 requestableCohortQuota += c.guaranteedQuota(fName, rName) 561 562 return requestableCohortQuota 563 } 564 565 func (c *ClusterQueue) guaranteedQuota(fName kueue.ResourceFlavorReference, rName corev1.ResourceName) (val int64) { 566 if !features.Enabled(features.LendingLimit) { 567 return 0 568 } 569 if c.GuaranteedQuota == nil || c.GuaranteedQuota[fName] == nil { 570 return 0 571 } 572 return c.GuaranteedQuota[fName][rName] 573 } 574 575 // UsedCohortQuota returns the used quota by the flavor and resource name in the cohort. 576 // Note that when LendingLimit enabled, the usage is not equal to the total used quota but the one 577 // minus the guaranteed resources, this is only for judging whether workloads fit in the cohort. 578 func (c *ClusterQueue) UsedCohortQuota(fName kueue.ResourceFlavorReference, rName corev1.ResourceName) (val int64) { 579 if c.Cohort.Usage == nil || c.Cohort.Usage[fName] == nil { 580 return 0 581 } 582 583 cohortUsage := c.Cohort.Usage[fName][rName] 584 585 // When feature LendingLimit enabled, cohortUsage is the sum of usage in LendingLimit. 586 // If cqUsage < c.guaranteedQuota, it means the cq is not using all its guaranteedQuota, 587 // need to count the cqUsage in, otherwise need to count the guaranteedQuota in. 588 if features.Enabled(features.LendingLimit) { 589 cqUsage := c.Usage[fName][rName] 590 if cqUsage < c.guaranteedQuota(fName, rName) { 591 cohortUsage += cqUsage 592 } else { 593 cohortUsage += c.guaranteedQuota(fName, rName) 594 } 595 } 596 597 return cohortUsage 598 }