sigs.k8s.io/kueue@v0.6.2/pkg/workload/workload.go (about) 1 /* 2 Copyright 2022 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package workload 18 19 import ( 20 "context" 21 "fmt" 22 "maps" 23 "strings" 24 25 corev1 "k8s.io/api/core/v1" 26 apimeta "k8s.io/apimachinery/pkg/api/meta" 27 "k8s.io/apimachinery/pkg/api/resource" 28 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 29 "k8s.io/utils/ptr" 30 "sigs.k8s.io/controller-runtime/pkg/client" 31 "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" 32 33 config "sigs.k8s.io/kueue/apis/config/v1beta1" 34 kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1" 35 "sigs.k8s.io/kueue/pkg/constants" 36 "sigs.k8s.io/kueue/pkg/util/api" 37 "sigs.k8s.io/kueue/pkg/util/limitrange" 38 ) 39 40 var ( 41 admissionManagedConditions = []string{kueue.WorkloadQuotaReserved, kueue.WorkloadEvicted, kueue.WorkloadAdmitted} 42 ) 43 44 type AssigmentClusterQueueState struct { 45 LastTriedFlavorIdx []map[corev1.ResourceName]int 46 CohortGeneration int64 47 ClusterQueueGeneration int64 48 } 49 50 func (s *AssigmentClusterQueueState) Clone() *AssigmentClusterQueueState { 51 c := AssigmentClusterQueueState{ 52 LastTriedFlavorIdx: make([]map[corev1.ResourceName]int, len(s.LastTriedFlavorIdx)), 53 CohortGeneration: s.CohortGeneration, 54 ClusterQueueGeneration: s.ClusterQueueGeneration, 55 } 56 for ps, flavorIdx := range s.LastTriedFlavorIdx { 57 c.LastTriedFlavorIdx[ps] = maps.Clone(flavorIdx) 58 } 59 return &c 60 } 61 62 // PendingFlavors returns whether there are pending flavors to try 63 // after the last attempt. 64 func (s *AssigmentClusterQueueState) PendingFlavors() bool { 65 if s == nil { 66 // This is only reached in unit tests. 67 return false 68 } 69 for _, podSetIdxs := range s.LastTriedFlavorIdx { 70 for _, idx := range podSetIdxs { 71 if idx != -1 { 72 return true 73 } 74 } 75 } 76 return false 77 } 78 79 // Info holds a Workload object and some pre-processing. 80 type Info struct { 81 Obj *kueue.Workload 82 // list of total resources requested by the podsets. 83 TotalRequests []PodSetResources 84 // Populated from the queue during admission or from the admission field if 85 // already admitted. 86 ClusterQueue string 87 LastAssignment *AssigmentClusterQueueState 88 } 89 90 type PodSetResources struct { 91 Name string 92 Requests Requests 93 Count int32 94 Flavors map[corev1.ResourceName]kueue.ResourceFlavorReference 95 } 96 97 func (psr *PodSetResources) ScaledTo(newCount int32) *PodSetResources { 98 ret := &PodSetResources{ 99 Name: psr.Name, 100 Requests: maps.Clone(psr.Requests), 101 Count: psr.Count, 102 Flavors: maps.Clone(psr.Flavors), 103 } 104 ret.Requests.scaleDown(int64(ret.Count)) 105 ret.Requests.scaleUp(int64(newCount)) 106 ret.Count = newCount 107 return ret 108 } 109 110 func NewInfo(w *kueue.Workload) *Info { 111 info := &Info{ 112 Obj: w, 113 } 114 if w.Status.Admission != nil { 115 info.ClusterQueue = string(w.Status.Admission.ClusterQueue) 116 info.TotalRequests = totalRequestsFromAdmission(w) 117 } else { 118 info.TotalRequests = totalRequestsFromPodSets(w) 119 } 120 return info 121 } 122 123 func (i *Info) Update(wl *kueue.Workload) { 124 i.Obj = wl 125 } 126 127 func (i *Info) CanBePartiallyAdmitted() bool { 128 return CanBePartiallyAdmitted(i.Obj) 129 } 130 131 func CanBePartiallyAdmitted(wl *kueue.Workload) bool { 132 ps := wl.Spec.PodSets 133 for psi := range ps { 134 if ps[psi].Count > ptr.Deref(ps[psi].MinCount, ps[psi].Count) { 135 return true 136 } 137 } 138 return false 139 } 140 141 func Key(w *kueue.Workload) string { 142 return fmt.Sprintf("%s/%s", w.Namespace, w.Name) 143 } 144 145 func QueueKey(w *kueue.Workload) string { 146 return fmt.Sprintf("%s/%s", w.Namespace, w.Spec.QueueName) 147 } 148 149 func reclaimableCounts(wl *kueue.Workload) map[string]int32 { 150 ret := make(map[string]int32, len(wl.Status.ReclaimablePods)) 151 for i := range wl.Status.ReclaimablePods { 152 reclaimInfo := &wl.Status.ReclaimablePods[i] 153 ret[reclaimInfo.Name] = reclaimInfo.Count 154 } 155 return ret 156 } 157 158 func podSetsCounts(wl *kueue.Workload) map[string]int32 { 159 160 ret := make(map[string]int32, len(wl.Spec.PodSets)) 161 for i := range wl.Spec.PodSets { 162 ps := &wl.Spec.PodSets[i] 163 ret[ps.Name] = ps.Count 164 } 165 return ret 166 } 167 168 func podSetsCountsAfterReclaim(wl *kueue.Workload) map[string]int32 { 169 totalCounts := podSetsCounts(wl) 170 reclaimCounts := reclaimableCounts(wl) 171 for podSetName := range totalCounts { 172 if rc, found := reclaimCounts[podSetName]; found { 173 totalCounts[podSetName] -= rc 174 } 175 } 176 return totalCounts 177 } 178 179 func totalRequestsFromPodSets(wl *kueue.Workload) []PodSetResources { 180 if len(wl.Spec.PodSets) == 0 { 181 return nil 182 } 183 res := make([]PodSetResources, 0, len(wl.Spec.PodSets)) 184 currentCounts := podSetsCountsAfterReclaim(wl) 185 for _, ps := range wl.Spec.PodSets { 186 count := currentCounts[ps.Name] 187 setRes := PodSetResources{ 188 Name: ps.Name, 189 Count: count, 190 } 191 setRes.Requests = newRequests(limitrange.TotalRequests(&ps.Template.Spec)) 192 setRes.Requests.scaleUp(int64(count)) 193 res = append(res, setRes) 194 } 195 return res 196 } 197 198 func totalRequestsFromAdmission(wl *kueue.Workload) []PodSetResources { 199 if wl.Status.Admission == nil { 200 return nil 201 } 202 res := make([]PodSetResources, 0, len(wl.Spec.PodSets)) 203 currentCounts := podSetsCountsAfterReclaim(wl) 204 totalCounts := podSetsCounts(wl) 205 for _, psa := range wl.Status.Admission.PodSetAssignments { 206 setRes := PodSetResources{ 207 Name: psa.Name, 208 Flavors: psa.Flavors, 209 Count: ptr.Deref(psa.Count, totalCounts[psa.Name]), 210 Requests: newRequests(psa.ResourceUsage), 211 } 212 213 if count := currentCounts[psa.Name]; count != setRes.Count { 214 setRes.Requests.scaleDown(int64(setRes.Count)) 215 setRes.Requests.scaleUp(int64(count)) 216 setRes.Count = count 217 } 218 219 res = append(res, setRes) 220 } 221 return res 222 } 223 224 // The following resources calculations are inspired on 225 // https://github.com/kubernetes/kubernetes/blob/master/pkg/scheduler/framework/types.go 226 227 // Requests maps ResourceName to flavor to value; for CPU it is tracked in MilliCPU. 228 type Requests map[corev1.ResourceName]int64 229 230 func newRequests(rl corev1.ResourceList) Requests { 231 r := Requests{} 232 for name, quant := range rl { 233 r[name] = ResourceValue(name, quant) 234 } 235 return r 236 } 237 238 func (r Requests) ToResourceList() corev1.ResourceList { 239 ret := make(corev1.ResourceList, len(r)) 240 for k, v := range r { 241 ret[k] = ResourceQuantity(k, v) 242 } 243 return ret 244 } 245 246 // ResourceValue returns the integer value for the resource name. 247 // It's milli-units for CPU and absolute units for everything else. 248 func ResourceValue(name corev1.ResourceName, q resource.Quantity) int64 { 249 if name == corev1.ResourceCPU { 250 return q.MilliValue() 251 } 252 return q.Value() 253 } 254 255 func ResourceQuantity(name corev1.ResourceName, v int64) resource.Quantity { 256 switch name { 257 case corev1.ResourceCPU: 258 return *resource.NewMilliQuantity(v, resource.DecimalSI) 259 case corev1.ResourceMemory, corev1.ResourceEphemeralStorage: 260 return *resource.NewQuantity(v, resource.BinarySI) 261 default: 262 if strings.HasPrefix(string(name), corev1.ResourceHugePagesPrefix) { 263 return *resource.NewQuantity(v, resource.BinarySI) 264 } 265 return *resource.NewQuantity(v, resource.DecimalSI) 266 } 267 } 268 269 func (r Requests) scaleUp(f int64) { 270 for name := range r { 271 r[name] *= f 272 } 273 } 274 275 func (r Requests) scaleDown(f int64) { 276 for name := range r { 277 r[name] /= f 278 } 279 } 280 281 // UpdateStatus updates the condition of a workload with ssa, 282 // fieldManager being set to managerPrefix + "-" + conditionType 283 func UpdateStatus(ctx context.Context, 284 c client.Client, 285 wl *kueue.Workload, 286 conditionType string, 287 conditionStatus metav1.ConditionStatus, 288 reason, message string, 289 managerPrefix string) error { 290 now := metav1.Now() 291 condition := metav1.Condition{ 292 Type: conditionType, 293 Status: conditionStatus, 294 LastTransitionTime: now, 295 Reason: reason, 296 Message: api.TruncateConditionMessage(message), 297 } 298 299 newWl := BaseSSAWorkload(wl) 300 newWl.Status.Conditions = []metav1.Condition{condition} 301 return c.Status().Patch(ctx, newWl, client.Apply, client.FieldOwner(managerPrefix+"-"+condition.Type)) 302 } 303 304 // UnsetQuotaReservationWithCondition sets the QuotaReserved condition to false and clears 305 // the admission. 306 // Returns whether any change was done. 307 func UnsetQuotaReservationWithCondition(wl *kueue.Workload, reason, message string) bool { 308 condition := metav1.Condition{ 309 Type: kueue.WorkloadQuotaReserved, 310 Status: metav1.ConditionFalse, 311 LastTransitionTime: metav1.Now(), 312 Reason: reason, 313 Message: api.TruncateConditionMessage(message), 314 } 315 changed := apimeta.SetStatusCondition(&wl.Status.Conditions, condition) 316 if wl.Status.Admission != nil { 317 wl.Status.Admission = nil 318 changed = true 319 } 320 321 // Reset the admitted condition if necessary. 322 if SyncAdmittedCondition(wl) { 323 changed = true 324 } 325 return changed 326 } 327 328 // BaseSSAWorkload creates a new object based on the input workload that 329 // only contains the fields necessary to identify the original object. 330 // The object can be used in as a base for Server-Side-Apply. 331 func BaseSSAWorkload(w *kueue.Workload) *kueue.Workload { 332 wlCopy := &kueue.Workload{ 333 ObjectMeta: metav1.ObjectMeta{ 334 UID: w.UID, 335 Name: w.Name, 336 Namespace: w.Namespace, 337 Generation: w.Generation, // Produce a conflict if there was a change in the spec. 338 }, 339 TypeMeta: w.TypeMeta, 340 } 341 if wlCopy.APIVersion == "" { 342 wlCopy.APIVersion = kueue.GroupVersion.String() 343 } 344 if wlCopy.Kind == "" { 345 wlCopy.Kind = "Workload" 346 } 347 return wlCopy 348 } 349 350 // SetQuotaReservation applies the provided admission to the workload. 351 // The WorkloadAdmitted and WorkloadEvicted are added or updated if necessary. 352 func SetQuotaReservation(w *kueue.Workload, admission *kueue.Admission) { 353 w.Status.Admission = admission 354 admittedCond := metav1.Condition{ 355 Type: kueue.WorkloadQuotaReserved, 356 Status: metav1.ConditionTrue, 357 LastTransitionTime: metav1.Now(), 358 Reason: "QuotaReserved", 359 Message: fmt.Sprintf("Quota reserved in ClusterQueue %s", w.Status.Admission.ClusterQueue), 360 } 361 apimeta.SetStatusCondition(&w.Status.Conditions, admittedCond) 362 363 //reset Evicted condition if present. 364 if evictedCond := apimeta.FindStatusCondition(w.Status.Conditions, kueue.WorkloadEvicted); evictedCond != nil { 365 evictedCond.Status = metav1.ConditionFalse 366 evictedCond.LastTransitionTime = metav1.Now() 367 } 368 } 369 370 func SetEvictedCondition(w *kueue.Workload, reason string, message string) { 371 condition := metav1.Condition{ 372 Type: kueue.WorkloadEvicted, 373 Status: metav1.ConditionTrue, 374 LastTransitionTime: metav1.Now(), 375 Reason: reason, 376 Message: message, 377 } 378 apimeta.SetStatusCondition(&w.Status.Conditions, condition) 379 } 380 381 // admissionPatch creates a new object based on the input workload that contains 382 // the admission and related conditions. The object can be used in Server-Side-Apply. 383 func admissionPatch(w *kueue.Workload) *kueue.Workload { 384 wlCopy := BaseSSAWorkload(w) 385 386 wlCopy.Status.Admission = w.Status.Admission.DeepCopy() 387 wlCopy.Status.RequeueState = w.Status.RequeueState.DeepCopy() 388 for _, conditionName := range admissionManagedConditions { 389 if existing := apimeta.FindStatusCondition(w.Status.Conditions, conditionName); existing != nil { 390 wlCopy.Status.Conditions = append(wlCopy.Status.Conditions, *existing.DeepCopy()) 391 } 392 } 393 return wlCopy 394 } 395 396 // ApplyAdmissionStatus updated all the admission related status fields of a workload with SSA. 397 // if strict is true, resourceVersion will be part of the patch, make this call fail if Workload 398 // was changed. 399 func ApplyAdmissionStatus(ctx context.Context, c client.Client, w *kueue.Workload, strict bool) error { 400 patch := admissionPatch(w) 401 if strict { 402 patch.ResourceVersion = w.ResourceVersion 403 } 404 return c.Status().Patch(ctx, patch, client.Apply, client.FieldOwner(constants.AdmissionName)) 405 } 406 407 type Ordering struct { 408 PodsReadyRequeuingTimestamp config.RequeuingTimestamp 409 } 410 411 // GetQueueOrderTimestamp return the timestamp to be used by the scheduler. It could 412 // be the workload creation time or the last time a PodsReady timeout has occurred. 413 func (o Ordering) GetQueueOrderTimestamp(w *kueue.Workload) *metav1.Time { 414 if o.PodsReadyRequeuingTimestamp == config.EvictionTimestamp { 415 if evictedCond, evictedByTimout := IsEvictedByPodsReadyTimeout(w); evictedByTimout { 416 return &evictedCond.LastTransitionTime 417 } 418 } 419 return &w.CreationTimestamp 420 } 421 422 // HasQuotaReservation checks if workload is admitted based on conditions 423 func HasQuotaReservation(w *kueue.Workload) bool { 424 return apimeta.IsStatusConditionTrue(w.Status.Conditions, kueue.WorkloadQuotaReserved) 425 } 426 427 // UpdateReclaimablePods updates the ReclaimablePods list for the workload wit SSA. 428 func UpdateReclaimablePods(ctx context.Context, c client.Client, w *kueue.Workload, reclaimablePods []kueue.ReclaimablePod) error { 429 patch := BaseSSAWorkload(w) 430 patch.Status.ReclaimablePods = reclaimablePods 431 return c.Status().Patch(ctx, patch, client.Apply, client.FieldOwner(constants.ReclaimablePodsMgr)) 432 } 433 434 // ReclaimablePodsAreEqual checks if two Reclaimable pods are semantically equal 435 // having the same length and all keys have the same value. 436 func ReclaimablePodsAreEqual(a, b []kueue.ReclaimablePod) bool { 437 if len(a) != len(b) { 438 return false 439 } 440 441 mb := make(map[string]int32, len(b)) 442 for i := range b { 443 mb[b[i].Name] = b[i].Count 444 } 445 446 for i := range a { 447 if bCount, found := mb[a[i].Name]; !found || bCount != a[i].Count { 448 return false 449 } 450 } 451 return true 452 } 453 454 // HasRequeueState returns true if the workload has re-queue state. 455 func HasRequeueState(w *kueue.Workload) bool { 456 return w.Status.RequeueState != nil 457 } 458 459 // IsAdmitted returns true if the workload is admitted. 460 func IsAdmitted(w *kueue.Workload) bool { 461 return apimeta.IsStatusConditionTrue(w.Status.Conditions, kueue.WorkloadAdmitted) 462 } 463 464 // IsFinished returns true if the workload is finished. 465 func IsFinished(w *kueue.Workload) bool { 466 return apimeta.IsStatusConditionTrue(w.Status.Conditions, kueue.WorkloadFinished) 467 } 468 469 // IsEvictedByDeactivation returns true if the workload is evicted by deactivation. 470 func IsEvictedByDeactivation(w *kueue.Workload) bool { 471 cond := apimeta.FindStatusCondition(w.Status.Conditions, kueue.WorkloadEvicted) 472 return cond != nil && cond.Status == metav1.ConditionTrue && cond.Reason == kueue.WorkloadEvictedByDeactivation 473 } 474 475 func IsEvictedByPodsReadyTimeout(w *kueue.Workload) (*metav1.Condition, bool) { 476 cond := apimeta.FindStatusCondition(w.Status.Conditions, kueue.WorkloadEvicted) 477 if cond == nil || cond.Status != metav1.ConditionTrue || cond.Reason != kueue.WorkloadEvictedByPodsReadyTimeout { 478 return nil, false 479 } 480 return cond, true 481 } 482 483 func RemoveFinalizer(ctx context.Context, c client.Client, wl *kueue.Workload) error { 484 if controllerutil.RemoveFinalizer(wl, kueue.ResourceInUseFinalizerName) { 485 return c.Update(ctx, wl) 486 } 487 return nil 488 }