// Source: sigs.k8s.io/kueue@v0.6.2/pkg/scheduler/flavorassigner/flavorassigner.go

/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package flavorassigner

import (
	"errors"
	"fmt"
	"sort"
	"strings"

	"github.com/go-logr/logr"
	"github.com/google/go-cmp/cmp"
	"github.com/google/go-cmp/cmp/cmpopts"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/sets"
	corev1helpers "k8s.io/component-helpers/scheduling/corev1"
	"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
	"k8s.io/utils/ptr"

	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/cache"
	"sigs.k8s.io/kueue/pkg/features"
	"sigs.k8s.io/kueue/pkg/workload"
)

// Assignment is the outcome of assigning flavors to the pod sets of a
// workload.
type Assignment struct {
	// PodSets holds one entry per pod set that was processed, in the order
	// they were processed.
	PodSets []PodSetAssignment
	// Borrowing is set to true as soon as one of the recorded flavor
	// assignments is marked as borrowing.
	Borrowing bool
	// LastState records the ClusterQueue and cohort resource generations seen
	// while computing this assignment, plus the per-pod-set index of the last
	// flavor tried for each resource.
	LastState workload.AssigmentClusterQueueState

	// Usage is the accumulated Usage of resources as pod sets get
	// flavors assigned.
	Usage cache.FlavorResourceQuantities

	// representativeMode is the cached representative mode for this assignment.
	representativeMode *FlavorAssignmentMode
}

// Borrows returns whether the assignment requires borrowing.
func (a *Assignment) Borrows() bool {
	return a.Borrowing
}

// RepresentativeMode calculates the representative mode for the assignment as
// the worst assignment mode among all the pod sets.
61 func (a *Assignment) RepresentativeMode() FlavorAssignmentMode { 62 if len(a.PodSets) == 0 { 63 // No assignments calculated. 64 return NoFit 65 } 66 if a.representativeMode != nil { 67 return *a.representativeMode 68 } 69 mode := Fit 70 for _, ps := range a.PodSets { 71 psMode := ps.RepresentativeMode() 72 if psMode < mode { 73 mode = psMode 74 } 75 } 76 a.representativeMode = &mode 77 return mode 78 } 79 80 func (a *Assignment) Message() string { 81 var builder strings.Builder 82 for _, ps := range a.PodSets { 83 if ps.Status == nil { 84 continue 85 } 86 if ps.Status.IsError() { 87 return fmt.Sprintf("failed to assign flavors to pod set %s: %v", ps.Name, ps.Status.err) 88 } 89 if builder.Len() > 0 { 90 builder.WriteString("; ") 91 } 92 builder.WriteString("couldn't assign flavors to pod set ") 93 builder.WriteString(ps.Name) 94 builder.WriteString(": ") 95 builder.WriteString(ps.Status.Message()) 96 } 97 return builder.String() 98 } 99 100 func (a *Assignment) ToAPI() []kueue.PodSetAssignment { 101 psFlavors := make([]kueue.PodSetAssignment, len(a.PodSets)) 102 for i := range psFlavors { 103 psFlavors[i] = a.PodSets[i].toAPI() 104 } 105 return psFlavors 106 } 107 108 type Status struct { 109 reasons []string 110 err error 111 } 112 113 func (s *Status) IsError() bool { 114 return s != nil && s.err != nil 115 } 116 117 func (s *Status) append(r ...string) *Status { 118 s.reasons = append(s.reasons, r...) 
119 return s 120 } 121 122 func (s *Status) Message() string { 123 if s == nil { 124 return "" 125 } 126 if s.err != nil { 127 return s.err.Error() 128 } 129 sort.Strings(s.reasons) 130 return strings.Join(s.reasons, ", ") 131 } 132 133 func (s *Status) Equal(o *Status) bool { 134 if s == nil || o == nil { 135 return s == o 136 } 137 if s.err != nil { 138 return errors.Is(s.err, o.err) 139 } 140 return cmp.Equal(s.reasons, o.reasons, cmpopts.SortSlices(func(a, b string) bool { 141 return a < b 142 })) 143 } 144 145 // PodSetAssignment holds the assigned flavors and status messages for each of 146 // the resources that the pod set requests. Each assigned flavor is accompanied 147 // with an AssignmentMode. 148 // Empty .Flavors can be interpreted as NoFit mode for all the resources. 149 // Empty .Status can be interpreted as Fit mode for all the resources. 150 // .Flavors and .Status can't be empty at the same time, once PodSetAssignment 151 // is fully calculated. 152 type PodSetAssignment struct { 153 Name string 154 Flavors ResourceAssignment 155 Status *Status 156 Requests corev1.ResourceList 157 Count int32 158 } 159 160 // RepresentativeMode calculates the representative mode for this assignment as 161 // the worst assignment mode among all assigned flavors. 
func (psa *PodSetAssignment) RepresentativeMode() FlavorAssignmentMode {
	// A nil Status is treated as Fit without looking at the flavors.
	if psa.Status == nil {
		return Fit
	}
	// A Status without any assigned flavor means nothing could be assigned.
	if len(psa.Flavors) == 0 {
		return NoFit
	}
	// Modes are ordered from worst (NoFit) to best (Fit), so the worst mode is
	// the minimum.
	mode := Fit
	for _, flvAssignment := range psa.Flavors {
		if flvAssignment.Mode < mode {
			mode = flvAssignment.Mode
		}
	}
	return mode
}

// ResourceAssignment maps a resource name to the flavor assigned to it.
type ResourceAssignment map[corev1.ResourceName]*FlavorAssignment

// toAPI converts the pod set assignment into its kueue API form. Only the
// flavor name of each resource is carried over; the mode, the tried flavor
// index and the Status are not part of the result.
func (psa *PodSetAssignment) toAPI() kueue.PodSetAssignment {
	flavors := make(map[corev1.ResourceName]kueue.ResourceFlavorReference, len(psa.Flavors))
	for res, flvAssignment := range psa.Flavors {
		flavors[res] = flvAssignment.Name
	}
	return kueue.PodSetAssignment{
		Name:          psa.Name,
		Flavors:       flavors,
		ResourceUsage: psa.Requests,
		Count:         ptr.To(psa.Count),
	}
}

// FlavorAssignmentMode describes whether the flavor can be assigned immediately
// or what needs to happen, so it can be assigned.
type FlavorAssignmentMode int

// The flavor assignment modes below are ordered from lowest to highest
// preference.
const (
	// NoFit means that there is not enough quota to assign this flavor.
	NoFit FlavorAssignmentMode = iota
	// Preempt means that there is not enough unused nominal quota in the ClusterQueue
	// or cohort. Preempting other workloads in the ClusterQueue or cohort, or
	// waiting for them to finish might make it possible to assign this flavor.
	Preempt
	// Fit means that there is enough unused quota in the cohort to assign this
	// flavor.
	Fit
)

// String returns the name of the mode, or "Unknown" for a value that is not
// one of the declared constants.
func (m FlavorAssignmentMode) String() string {
	switch m {
	case NoFit:
		return "NoFit"
	case Preempt:
		return "Preempt"
	case Fit:
		return "Fit"
	}
	return "Unknown"
}

// FlavorAssignment is the flavor chosen for a single resource, together with
// how it can be assigned.
type FlavorAssignment struct {
	// Name is the assigned flavor.
	Name kueue.ResourceFlavorReference
	// Mode is the mode reported by fitsResourceQuota for this resource.
	Mode FlavorAssignmentMode
	// TriedFlavorIdx is only written when the FlavorFungibility feature is
	// enabled. It is the index, within the resource group's flavors, of the
	// last flavor that was evaluated, or -1 if that was the last flavor of the
	// group.
	TriedFlavorIdx int
	// borrow is the borrowing flag reported by fitsResourceQuota for this
	// resource.
	borrow bool
}

// lastAssignmentOutdated reports whether the allocatable resource generation
// of the ClusterQueue, or of its cohort when it has one, is newer than the
// generation recorded in wl.LastAssignment. wl.LastAssignment must not be nil.
func lastAssignmentOutdated(wl *workload.Info, cq *cache.ClusterQueue) bool {
	return cq.AllocatableResourceGeneration > wl.LastAssignment.ClusterQueueGeneration ||
		(cq.Cohort != nil && cq.Cohort.AllocatableResourceGeneration > wl.LastAssignment.CohortGeneration)
}

// AssignFlavors assigns a flavor to each of the resources requested in each pod set.
// The result for each pod set is accompanied with reasons why the flavor can't
// be assigned immediately. Each assigned flavor is accompanied with a
// FlavorAssignmentMode.
//
// As a side effect, wl.LastAssignment is set to nil when it is outdated.
// If counts is empty, wl.TotalRequests is used as is. Otherwise pod set i is
// scaled to counts[i], so counts needs an entry for every element of
// wl.TotalRequests.
func AssignFlavors(log logr.Logger, wl *workload.Info, resourceFlavors map[kueue.ResourceFlavorReference]*kueue.ResourceFlavor, cq *cache.ClusterQueue, counts []int32) Assignment {
	if wl.LastAssignment != nil && lastAssignmentOutdated(wl, cq) {
		// Only build the key/value list when verbosity 6 is enabled.
		if logV := log.V(6); logV.Enabled() {
			keysValues := []any{
				"cq.AllocatableResourceGeneration", cq.AllocatableResourceGeneration,
				"wl.LastAssignment.ClusterQueueGeneration", wl.LastAssignment.ClusterQueueGeneration,
			}
			if cq.Cohort != nil {
				keysValues = append(keysValues,
					"cq.Cohort.AllocatableResourceGeneration", cq.Cohort.AllocatableResourceGeneration,
					"wl.LastAssignment.CohortGeneration", wl.LastAssignment.CohortGeneration,
				)
			}
			logV.Info("Clearing Workload's last assignment because it was outdated", keysValues...)
		}
		wl.LastAssignment = nil
	}

	if len(counts) == 0 {
		return assignFlavors(log, wl.TotalRequests, wl.Obj.Spec.PodSets, resourceFlavors, cq, wl.LastAssignment)
	}

	// Assign against a scaled copy of the requests, one entry per pod set.
	currentResources := make([]workload.PodSetResources, len(wl.TotalRequests))
	for i := range wl.TotalRequests {
		currentResources[i] = *wl.TotalRequests[i].ScaledTo(counts[i])
	}
	return assignFlavors(log, currentResources, wl.Obj.Spec.PodSets, resourceFlavors, cq, wl.LastAssignment)
}

// assignFlavors computes the assignment for the given per-pod-set requests.
// requests[i] is matched with podSets[i], whose pod spec provides the
// tolerations and node affinity used to select flavors.
//
// Processing stops at the first pod set whose Status carries an error, or that
// has requests but ended up without flavors. That pod set is still recorded, so
// the returned Assignment may cover only a prefix of requests.
func assignFlavors(log logr.Logger, requests []workload.PodSetResources, podSets []kueue.PodSet, resourceFlavors map[kueue.ResourceFlavorReference]*kueue.ResourceFlavor, cq *cache.ClusterQueue, lastAssignment *workload.AssigmentClusterQueueState) Assignment {
	assignment := Assignment{
		PodSets: make([]PodSetAssignment, 0, len(requests)),
		Usage:   make(cache.FlavorResourceQuantities),
		LastState: workload.AssigmentClusterQueueState{
			LastTriedFlavorIdx:     make([]map[corev1.ResourceName]int, 0, len(podSets)),
			CohortGeneration:       0,
			ClusterQueueGeneration: cq.AllocatableResourceGeneration,
		},
	}
	if cq.Cohort != nil {
		assignment.LastState.CohortGeneration = cq.Cohort.AllocatableResourceGeneration
	}

	for i, podSet := range requests {
		// When the ClusterQueue covers the pods resource, request one unit per
		// pod. NOTE: podSet is a copy of the slice element, but this write goes
		// through its Requests map, so it is visible to whoever else holds
		// that map.
		if _, found := cq.RGByResource[corev1.ResourcePods]; found {
			podSet.Requests[corev1.ResourcePods] = int64(podSet.Count)
		}

		psAssignment := PodSetAssignment{
			Name:     podSet.Name,
			Flavors:  make(ResourceAssignment, len(podSet.Requests)),
			Requests: podSet.Requests.ToResourceList(),
			Count:    podSet.Count,
		}

		for resName := range podSet.Requests {
			if _, found := psAssignment.Flavors[resName]; found {
				// This resource got assigned the same flavor as its resource group.
				// No need to compute again.
				continue
			}
			rg, found := cq.RGByResource[resName]
			if !found {
				psAssignment.Flavors = nil
				psAssignment.Status = &Status{
					reasons: []string{fmt.Sprintf("resource %s unavailable in ClusterQueue", resName)},
				}
				break
			}
			// Look up the flavor index recorded for this pod set and resource
			// by the previous attempt; -1 means there is none.
			lastFlavorAssignment := -1
			if lastAssignment != nil && len(lastAssignment.LastTriedFlavorIdx) > i {
				idx, ok := lastAssignment.LastTriedFlavorIdx[i][resName]
				if ok {
					lastFlavorAssignment = idx
				}
			}
			flavors, status := assignment.findFlavorForResourceGroup(log, rg, podSet.Requests, resourceFlavors, cq, &podSets[i].Template.Spec, lastFlavorAssignment)
			if status.IsError() || len(flavors) == 0 {
				// Drop any flavors already assigned to this pod set: it is
				// reported as not assignable as a whole.
				psAssignment.Flavors = nil
				psAssignment.Status = status
				break
			}
			psAssignment.append(flavors, status)
		}

		// Record the pod set even when it failed, so its Status is reported.
		assignment.append(podSet.Requests, &psAssignment)
		if psAssignment.Status.IsError() || (len(podSet.Requests) > 0 && len(psAssignment.Flavors) == 0) {
			return assignment
		}
	}
	return assignment
}

// append merges flavors into psa.Flavors, overwriting entries for the same
// resource, and folds status into psa.Status. While psa.Status is nil the
// given status is adopted as is; afterwards only the reasons of a non-nil
// status are appended. psa.Flavors must be a non-nil map.
func (psa *PodSetAssignment) append(flavors ResourceAssignment, status *Status) {
	for resource, assignment := range flavors {
		psa.Flavors[resource] = assignment
	}
	if psa.Status == nil {
		psa.Status = status
	} else if status != nil {
		psa.Status.reasons = append(psa.Status.reasons, status.reasons...)
	}
}

// append records a copy of psAssignment in a.PodSets and, for every resource
// with an assigned flavor, adds requests[resource] to a.Usage under that
// flavor. It sets a.Borrowing if any of the flavor assignments borrows, and
// appends the pod set's per-resource TriedFlavorIdx values to
// a.LastState.LastTriedFlavorIdx. a.Usage must be a non-nil map.
func (a *Assignment) append(requests workload.Requests, psAssignment *PodSetAssignment) {
	flavorIdx := make(map[corev1.ResourceName]int, len(psAssignment.Flavors))
	a.PodSets = append(a.PodSets, *psAssignment)
	for resource, flvAssignment := range psAssignment.Flavors {
		if flvAssignment.borrow {
			a.Borrowing = true
		}
		if a.Usage[flvAssignment.Name] == nil {
			a.Usage[flvAssignment.Name] = make(map[corev1.ResourceName]int64)
		}
		a.Usage[flvAssignment.Name][resource] += requests[resource]
		flavorIdx[resource] = flvAssignment.TriedFlavorIdx
	}
	a.LastState.LastTriedFlavorIdx = append(a.LastState.LastTriedFlavorIdx, flavorIdx)
}

// findFlavorForResourceGroup finds the flavor which can satisfy the resource
// request, along with the information about resources that need to be borrowed.
// If the flavor cannot be immediately assigned, it returns a status with
// reasons or failure.
//
// Only the requests covered by rg are considered. lastAssignment is the flavor
// index recorded by a previous attempt (-1 for none); it is only consulted
// when the FlavorFungibility feature is enabled.
//
// When the selected assignment has mode Fit, the returned Status is nil.
// Otherwise the Status holds the reasons collected from every flavor that was
// examined. If evaluating the node affinity selector fails, it returns a nil
// assignment and a Status holding that error.
func (a *Assignment) findFlavorForResourceGroup(
	log logr.Logger,
	rg *cache.ResourceGroup,
	requests workload.Requests,
	resourceFlavors map[kueue.ResourceFlavorReference]*kueue.ResourceFlavor,
	cq *cache.ClusterQueue,
	spec *corev1.PodSpec,
	lastAssignment int) (ResourceAssignment, *Status) {
	status := &Status{}
	requests = filterRequestedResources(requests, rg.CoveredResources)

	var bestAssignment ResourceAssignment
	bestAssignmentMode := NoFit

	// We will only check against the flavors' labels for the resource.
	selector := flavorSelector(spec, rg.LabelKeys)
	// flavorIdx tracks the index of the last flavor that got past the
	// existence, taint and affinity checks below; -1 if none did.
	flavorIdx := -1
	for idx, flvQuotas := range rg.Flavors {
		// With FlavorFungibility enabled, skip the flavors up to and including
		// the one recorded by the previous attempt.
		if features.Enabled(features.FlavorFungibility) && idx <= lastAssignment {
			continue
		}
		flavor, exist := resourceFlavors[flvQuotas.Name]
		if !exist {
			log.Error(nil, "Flavor not found", "Flavor", flvQuotas.Name)
			status.append(fmt.Sprintf("flavor %s not found", flvQuotas.Name))
			continue
		}
		// Only NoSchedule and NoExecute taints are taken into account.
		taint, untolerated := corev1helpers.FindMatchingUntoleratedTaint(flavor.Spec.NodeTaints, spec.Tolerations, func(t *corev1.Taint) bool {
			return t.Effect == corev1.TaintEffectNoSchedule || t.Effect == corev1.TaintEffectNoExecute
		})
		if untolerated {
			status.append(fmt.Sprintf("untolerated taint %s in flavor %s", taint, flvQuotas.Name))
			continue
		}
		// Match the selector against a synthetic node that carries the
		// flavor's node labels.
		if match, err := selector.Match(&corev1.Node{ObjectMeta: metav1.ObjectMeta{Labels: flavor.Spec.NodeLabels}}); !match || err != nil {
			if err != nil {
				status.err = err
				return nil, status
			}
			status.append(fmt.Sprintf("flavor %s doesn't match node affinity", flvQuotas.Name))
			continue
		}

		flavorIdx = idx
		needsBorrowing := false
		assignments := make(ResourceAssignment, len(requests))
		// Calculate representativeMode for this assignment as the worst mode among all requests.
		representativeMode := Fit
		for rName, val := range requests {
			resQuota := flvQuotas.Resources[rName]
			// Check considering the flavor usage by previous pod sets.
			mode, borrow, s := fitsResourceQuota(flvQuotas.Name, rName, val+a.Usage[flvQuotas.Name][rName], cq, resQuota)
			if s != nil {
				status.reasons = append(status.reasons, s.reasons...)
			}
			if mode < representativeMode {
				representativeMode = mode
			}
			needsBorrowing = needsBorrowing || borrow
			if representativeMode == NoFit {
				// The flavor doesn't fit, no need to check other resources.
				break
			}

			assignments[rName] = &FlavorAssignment{
				Name:   flvQuotas.Name,
				Mode:   mode,
				borrow: borrow,
			}
		}

		if features.Enabled(features.FlavorFungibility) {
			// Settle on this flavor as soon as the fungibility policy says
			// there is no need to look further, whatever was seen before.
			if !shouldTryNextFlavor(representativeMode, cq.FlavorFungibility, needsBorrowing) {
				bestAssignment = assignments
				bestAssignmentMode = representativeMode
				break
			}
			// Otherwise remember it only if it is strictly better than the
			// best one so far, and keep looking.
			if representativeMode > bestAssignmentMode {
				bestAssignment = assignments
				bestAssignmentMode = representativeMode
			}
		} else {
			// Keep the first flavor with the strictly best mode seen so far.
			if representativeMode > bestAssignmentMode {
				bestAssignment = assignments
				bestAssignmentMode = representativeMode
				if bestAssignmentMode == Fit {
					// All the resources fit in the cohort, no need to check more flavors.
					return bestAssignment, nil
				}
			}
		}
	}

	if features.Enabled(features.FlavorFungibility) {
		// Stamp every resource of the chosen assignment with the index of the
		// last flavor evaluated, which is not necessarily the chosen flavor.
		for _, assignment := range bestAssignment {
			if flavorIdx == len(rg.Flavors)-1 {
				// We have reached the last flavor; try from the first flavor next time.
				assignment.TriedFlavorIdx = -1
			} else {
				assignment.TriedFlavorIdx = flavorIdx
			}
		}
		if bestAssignmentMode == Fit {
			return bestAssignment, nil
		}
	}
	return bestAssignment, status
}

// shouldTryNextFlavor reports whether the flavor search should move on to the
// next flavor instead of settling on the one just evaluated.
//
// It returns false, meaning the current flavor is accepted, when:
//   - the mode is Preempt, WhenCanPreempt is Preempt, and either no borrowing
//     is needed or WhenCanBorrow is Borrow;
//   - the mode is Fit with borrowing and WhenCanBorrow is Borrow;
//   - the mode is Fit without borrowing.
//
// In every other case it returns true.
func shouldTryNextFlavor(representativeMode FlavorAssignmentMode, flavorFungibility kueue.FlavorFungibility, needsBorrowing bool) bool {
	policyPreempt := flavorFungibility.WhenCanPreempt
	policyBorrow := flavorFungibility.WhenCanBorrow
	if representativeMode == Preempt && policyPreempt == kueue.Preempt {
		if !needsBorrowing || policyBorrow == kueue.Borrow {
			return false
		}
	}

	if representativeMode == Fit && needsBorrowing && policyBorrow == kueue.Borrow {
		return false
	}

	if representativeMode == Fit && !needsBorrowing {
		return false
	}

	return true
}

// flavorSelector builds the required node affinity of the pod spec, restricted
// to the label keys in allowedKeys. Node selector entries and match
// expressions with other keys are dropped. Only MatchExpressions are copied
// from the node selector terms. The input spec is not modified.
func flavorSelector(spec *corev1.PodSpec, allowedKeys sets.Set[string]) nodeaffinity.RequiredNodeAffinity {
	// This function generally replicates the implementation of kube-scheduler's NodeAffinity
	// Filter plugin as of v1.24.
	var specCopy corev1.PodSpec

	// Remove affinity constraints with irrelevant keys.
	if len(spec.NodeSelector) != 0 {
		specCopy.NodeSelector = map[string]string{}
		for k, v := range spec.NodeSelector {
			if allowedKeys.Has(k) {
				specCopy.NodeSelector[k] = v
			}
		}
	}

	affinity := spec.Affinity
	if affinity != nil && affinity.NodeAffinity != nil && affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil {
		var termsCopy []corev1.NodeSelectorTerm
		for _, t := range affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution.NodeSelectorTerms {
			var expCopy []corev1.NodeSelectorRequirement
			for _, e := range t.MatchExpressions {
				if allowedKeys.Has(e.Key) {
					expCopy = append(expCopy, e)
				}
			}
			// If a term becomes empty, it means node affinity matches any flavor since those terms are ORed,
			// and so matching gets reduced to spec.NodeSelector
			if len(expCopy) == 0 {
				termsCopy = nil
				break
			}
			termsCopy = append(termsCopy, corev1.NodeSelectorTerm{MatchExpressions: expCopy})
		}
		// Leave specCopy.Affinity unset when no term survived the filtering.
		if len(termsCopy) != 0 {
			specCopy.Affinity = &corev1.Affinity{
				NodeAffinity: &corev1.NodeAffinity{
					RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{
						NodeSelectorTerms: termsCopy,
					},
				},
			}
		}
	}
	return nodeaffinity.GetRequiredNodeAffinity(&corev1.Pod{Spec: specCopy})
}

// fitsResourceQuota returns how this flavor could be assigned to the resource,
// according to the remaining quota in the ClusterQueue and cohort.
// If it fits, also returns if borrowing required. Similarly, it returns information
// if borrowing is required when preempting.
542 // If the flavor doesn't satisfy limits immediately (when waiting or preemption 543 // could help), it returns a Status with reasons. 544 func fitsResourceQuota(fName kueue.ResourceFlavorReference, rName corev1.ResourceName, val int64, cq *cache.ClusterQueue, rQuota *cache.ResourceQuota) (FlavorAssignmentMode, bool, *Status) { 545 var status Status 546 var borrow bool 547 used := cq.Usage[fName][rName] 548 mode := NoFit 549 if val <= rQuota.Nominal { 550 // The request can be satisfied by the nominal quota, assuming quota is 551 // reclaimed from the cohort or assuming all active workloads in the 552 // ClusterQueue are preempted. 553 mode = Preempt 554 } 555 cohortAvailable := rQuota.Nominal 556 if cq.Cohort != nil { 557 cohortAvailable = cq.RequestableCohortQuota(fName, rName) 558 } 559 560 if cq.Preemption.BorrowWithinCohort != nil && cq.Preemption.BorrowWithinCohort.Policy != kueue.BorrowWithinCohortPolicyNever { 561 // when preemption with borrowing is enabled, we can succeed to admit the 562 // workload if preemption is used. 
563 if (rQuota.BorrowingLimit == nil || val <= rQuota.Nominal+*rQuota.BorrowingLimit) && val <= cohortAvailable { 564 mode = Preempt 565 borrow = val > rQuota.Nominal 566 } 567 } 568 if rQuota.BorrowingLimit != nil && used+val > rQuota.Nominal+*rQuota.BorrowingLimit { 569 status.append(fmt.Sprintf("borrowing limit for %s in flavor %s exceeded", rName, fName)) 570 return mode, borrow, &status 571 } 572 573 cohortUsed := used 574 if cq.Cohort != nil { 575 cohortUsed = cq.UsedCohortQuota(fName, rName) 576 } 577 578 lack := cohortUsed + val - cohortAvailable 579 if lack <= 0 { 580 return Fit, used+val > rQuota.Nominal, nil 581 } 582 583 lackQuantity := workload.ResourceQuantity(rName, lack) 584 msg := fmt.Sprintf("insufficient unused quota in cohort for %s in flavor %s, %s more needed", rName, fName, &lackQuantity) 585 if cq.Cohort == nil { 586 if mode == NoFit { 587 msg = fmt.Sprintf("insufficient quota for %s in flavor %s in ClusterQueue", rName, fName) 588 } else { 589 msg = fmt.Sprintf("insufficient unused quota for %s in flavor %s, %s more needed", rName, fName, &lackQuantity) 590 } 591 } 592 status.append(msg) 593 return mode, borrow, &status 594 } 595 596 func filterRequestedResources(req workload.Requests, allowList sets.Set[corev1.ResourceName]) workload.Requests { 597 filtered := make(workload.Requests) 598 for n, v := range req { 599 if allowList.Has(n) { 600 filtered[n] = v 601 } 602 } 603 return filtered 604 }