sigs.k8s.io/kueue@v0.6.2/pkg/scheduler/preemption/preemption.go (about) 1 /* 2 Copyright 2023 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package preemption 18 19 import ( 20 "context" 21 "sort" 22 "sync/atomic" 23 "time" 24 25 corev1 "k8s.io/api/core/v1" 26 "k8s.io/apimachinery/pkg/api/meta" 27 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 28 "k8s.io/apimachinery/pkg/util/sets" 29 "k8s.io/client-go/tools/record" 30 "k8s.io/client-go/util/workqueue" 31 "k8s.io/klog/v2" 32 "k8s.io/utils/ptr" 33 ctrl "sigs.k8s.io/controller-runtime" 34 "sigs.k8s.io/controller-runtime/pkg/client" 35 36 kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1" 37 "sigs.k8s.io/kueue/pkg/cache" 38 "sigs.k8s.io/kueue/pkg/scheduler/flavorassigner" 39 "sigs.k8s.io/kueue/pkg/util/priority" 40 "sigs.k8s.io/kueue/pkg/util/routine" 41 "sigs.k8s.io/kueue/pkg/workload" 42 ) 43 44 const parallelPreemptions = 8 45 46 type Preemptor struct { 47 client client.Client 48 recorder record.EventRecorder 49 50 workloadOrdering workload.Ordering 51 52 // stubs 53 applyPreemption func(context.Context, *kueue.Workload) error 54 } 55 56 func New(cl client.Client, workloadOrdering workload.Ordering, recorder record.EventRecorder) *Preemptor { 57 p := &Preemptor{ 58 client: cl, 59 recorder: recorder, 60 workloadOrdering: workloadOrdering, 61 } 62 p.applyPreemption = p.applyPreemptionWithSSA 63 return p 64 } 65 66 func (p *Preemptor) OverrideApply(f func(context.Context, *kueue.Workload) error) { 67 p.applyPreemption = f 68 } 69 70 func candidatesOnlyFromQueue(candidates []*workload.Info, clusterQueue string) []*workload.Info { 71 result := make([]*workload.Info, 0, len(candidates)) 72 for _, wi := range candidates { 73 if wi.ClusterQueue == clusterQueue { 74 result = append(result, wi) 75 } 76 } 77 return result 78 } 79 80 // GetTargets returns the list of workloads that should be evicted in order to make room for wl. 81 func (p *Preemptor) GetTargets(wl workload.Info, assignment flavorassigner.Assignment, snapshot *cache.Snapshot) []*workload.Info { 82 resPerFlv := resourcesRequiringPreemption(assignment) 83 cq := snapshot.ClusterQueues[wl.ClusterQueue] 84 85 candidates := findCandidates(wl.Obj, p.workloadOrdering, cq, resPerFlv) 86 if len(candidates) == 0 { 87 return nil 88 } 89 sort.Slice(candidates, candidatesOrdering(candidates, cq.Name, time.Now())) 90 91 sameQueueCandidates := candidatesOnlyFromQueue(candidates, wl.ClusterQueue) 92 93 // To avoid flapping, Kueue only allows preemption of workloads from the same 94 // queue if borrowing. Preemption of workloads from queues can happen only 95 // if not borrowing at the same time. Kueue prioritizes preemption of 96 // workloads from the other queues (that borrowed resources) first, before 97 // trying to preempt more own workloads and borrow at the same time. 98 99 if len(sameQueueCandidates) == len(candidates) { 100 // There is no possible preemption of workloads from other queues, 101 // so we'll try borrowing. 102 return minimalPreemptions(&wl, assignment, snapshot, resPerFlv, candidates, true, nil) 103 } 104 105 // There is a potential of preemption of workloads from the other queue in the 106 // cohort. We proceed with borrowing only if the dedicated policy 107 // (borrowWithinCohort) is enabled. This ensures the preempted workloads 108 // have lower priority, and so they will not preempt the preemptor when 109 // requeued. 110 borrowWithinCohort := cq.Preemption.BorrowWithinCohort 111 if borrowWithinCohort != nil && borrowWithinCohort.Policy != kueue.BorrowWithinCohortPolicyNever { 112 allowBorrowingBelowPriority := ptr.To(priority.Priority(wl.Obj)) 113 if borrowWithinCohort.MaxPriorityThreshold != nil && *borrowWithinCohort.MaxPriorityThreshold < *allowBorrowingBelowPriority { 114 allowBorrowingBelowPriority = ptr.To(*borrowWithinCohort.MaxPriorityThreshold + 1) 115 } 116 return minimalPreemptions(&wl, assignment, snapshot, resPerFlv, candidates, true, allowBorrowingBelowPriority) 117 } 118 targets := minimalPreemptions(&wl, assignment, snapshot, resPerFlv, candidates, false, nil) 119 if len(targets) == 0 { 120 // Another attempt. This time only candidates from the same queue, but 121 // with borrowing. The previous attempt didn't try borrowing and had broader 122 // scope of preemption. 123 targets = minimalPreemptions(&wl, assignment, snapshot, resPerFlv, sameQueueCandidates, true, nil) 124 } 125 return targets 126 } 127 128 // IssuePreemptions marks the target workloads as evicted. 129 func (p *Preemptor) IssuePreemptions(ctx context.Context, targets []*workload.Info, cq *cache.ClusterQueue) (int, error) { 130 log := ctrl.LoggerFrom(ctx) 131 errCh := routine.NewErrorChannel() 132 ctx, cancel := context.WithCancel(ctx) 133 var successfullyPreempted int64 134 defer cancel() 135 workqueue.ParallelizeUntil(ctx, parallelPreemptions, len(targets), func(i int) { 136 target := targets[i] 137 if !meta.IsStatusConditionTrue(target.Obj.Status.Conditions, kueue.WorkloadEvicted) { 138 err := p.applyPreemption(ctx, target.Obj) 139 if err != nil { 140 errCh.SendErrorWithCancel(err, cancel) 141 return 142 } 143 144 origin := "ClusterQueue" 145 if cq.Name != target.ClusterQueue { 146 origin = "cohort" 147 } 148 log.V(3).Info("Preempted", "targetWorkload", klog.KObj(target.Obj)) 149 p.recorder.Eventf(target.Obj, corev1.EventTypeNormal, "Preempted", "Preempted by another workload in the %s", origin) 150 } else { 151 log.V(3).Info("Preemption ongoing", "targetWorkload", klog.KObj(target.Obj)) 152 } 153 atomic.AddInt64(&successfullyPreempted, 1) 154 }) 155 return int(successfullyPreempted), errCh.ReceiveError() 156 } 157 158 func (p *Preemptor) applyPreemptionWithSSA(ctx context.Context, w *kueue.Workload) error { 159 w = w.DeepCopy() 160 workload.SetEvictedCondition(w, kueue.WorkloadEvictedByPreemption, "Preempted to accommodate a higher priority Workload") 161 return workload.ApplyAdmissionStatus(ctx, p.client, w, false) 162 } 163 164 // minimalPreemptions implements a heuristic to find a minimal set of Workloads 165 // to preempt. 166 // The heuristic first removes candidates, in the input order, while their 167 // ClusterQueues are still borrowing resources and while the incoming Workload 168 // doesn't fit in the quota. 169 // Once the Workload fits, the heuristic tries to add Workloads back, in the 170 // reverse order in which they were removed, while the incoming Workload still 171 // fits. 172 func minimalPreemptions(wl *workload.Info, assignment flavorassigner.Assignment, snapshot *cache.Snapshot, resPerFlv resourcesPerFlavor, candidates []*workload.Info, allowBorrowing bool, allowBorrowingBelowPriority *int32) []*workload.Info { 173 wlReq := totalRequestsForAssignment(wl, assignment) 174 cq := snapshot.ClusterQueues[wl.ClusterQueue] 175 176 // Simulate removing all candidates from the ClusterQueue and cohort. 177 var targets []*workload.Info 178 fits := false 179 for _, candWl := range candidates { 180 candCQ := snapshot.ClusterQueues[candWl.ClusterQueue] 181 if cq != candCQ && !cqIsBorrowing(candCQ, resPerFlv) { 182 continue 183 } 184 if cq != candCQ && allowBorrowingBelowPriority != nil && priority.Priority(candWl.Obj) >= *allowBorrowingBelowPriority { 185 // We set allowBorrowing=false if there is a candidate with priority 186 // exceeding allowBorrowingBelowPriority added to targets. 187 // 188 // We need to be careful mutating allowBorrowing. We rely on the 189 // fact that once there is a candidate exceeding the priority added 190 // to targets, then at least one such candidate is present in the 191 // final set of targets (after the second phase of the function). 192 // 193 // This is true, because the candidates are ordered according 194 // to priorities (from lowest to highest, using candidatesOrdering), 195 // and the last added target is not removed in the second phase of 196 // the function. 197 allowBorrowing = false 198 } 199 snapshot.RemoveWorkload(candWl) 200 targets = append(targets, candWl) 201 if workloadFits(wlReq, cq, allowBorrowing) { 202 fits = true 203 break 204 } 205 } 206 if !fits { 207 // Reset changes to the snapshot. 208 for _, t := range targets { 209 snapshot.AddWorkload(t) 210 } 211 return nil 212 } 213 214 // In the reverse order, check if any of the workloads can be added back. 215 for i := len(targets) - 2; i >= 0; i-- { 216 snapshot.AddWorkload(targets[i]) 217 if workloadFits(wlReq, cq, allowBorrowing) { 218 // O(1) deletion: copy the last element into index i and reduce size. 219 targets[i] = targets[len(targets)-1] 220 targets = targets[:len(targets)-1] 221 } else { 222 snapshot.RemoveWorkload(targets[i]) 223 } 224 } 225 // Reset changes to the snapshot. 226 for _, t := range targets { 227 snapshot.AddWorkload(t) 228 } 229 230 return targets 231 } 232 233 type resourcesPerFlavor map[kueue.ResourceFlavorReference]sets.Set[corev1.ResourceName] 234 235 func resourcesRequiringPreemption(assignment flavorassigner.Assignment) resourcesPerFlavor { 236 resPerFlavor := make(resourcesPerFlavor) 237 for _, ps := range assignment.PodSets { 238 for res, flvAssignment := range ps.Flavors { 239 // assignments with NoFit mode wouldn't enter the preemption path. 240 if flvAssignment.Mode != flavorassigner.Preempt { 241 continue 242 } 243 if resPerFlavor[flvAssignment.Name] == nil { 244 resPerFlavor[flvAssignment.Name] = sets.New(res) 245 } else { 246 resPerFlavor[flvAssignment.Name].Insert(res) 247 } 248 } 249 } 250 return resPerFlavor 251 } 252 253 // findCandidates obtains candidates for preemption within the ClusterQueue and 254 // cohort that respect the preemption policy and are using a resource that the 255 // preempting workload needs. 256 func findCandidates(wl *kueue.Workload, wo workload.Ordering, cq *cache.ClusterQueue, resPerFlv resourcesPerFlavor) []*workload.Info { 257 var candidates []*workload.Info 258 wlPriority := priority.Priority(wl) 259 260 if cq.Preemption.WithinClusterQueue != kueue.PreemptionPolicyNever { 261 considerSamePrio := (cq.Preemption.WithinClusterQueue == kueue.PreemptionPolicyLowerOrNewerEqualPriority) 262 preemptorTS := wo.GetQueueOrderTimestamp(wl) 263 264 for _, candidateWl := range cq.Workloads { 265 candidatePriority := priority.Priority(candidateWl.Obj) 266 if candidatePriority > wlPriority { 267 continue 268 } 269 270 if candidatePriority == wlPriority && !(considerSamePrio && preemptorTS.Before(wo.GetQueueOrderTimestamp(candidateWl.Obj))) { 271 continue 272 } 273 274 if !workloadUsesResources(candidateWl, resPerFlv) { 275 continue 276 } 277 candidates = append(candidates, candidateWl) 278 } 279 } 280 281 if cq.Cohort != nil && cq.Preemption.ReclaimWithinCohort != kueue.PreemptionPolicyNever { 282 for cohortCQ := range cq.Cohort.Members { 283 if cq == cohortCQ || !cqIsBorrowing(cohortCQ, resPerFlv) { 284 // Can't reclaim quota from itself or ClusterQueues that are not borrowing. 285 continue 286 } 287 onlyLowerPrio := true 288 if cq.Preemption.ReclaimWithinCohort == kueue.PreemptionPolicyAny { 289 onlyLowerPrio = false 290 } 291 for _, candidateWl := range cohortCQ.Workloads { 292 if onlyLowerPrio && priority.Priority(candidateWl.Obj) >= priority.Priority(wl) { 293 continue 294 } 295 if !workloadUsesResources(candidateWl, resPerFlv) { 296 continue 297 } 298 candidates = append(candidates, candidateWl) 299 } 300 } 301 } 302 return candidates 303 } 304 305 func cqIsBorrowing(cq *cache.ClusterQueue, resPerFlv resourcesPerFlavor) bool { 306 if cq.Cohort == nil { 307 return false 308 } 309 for _, rg := range cq.ResourceGroups { 310 for _, fQuotas := range rg.Flavors { 311 fUsage := cq.Usage[fQuotas.Name] 312 for rName := range resPerFlv[fQuotas.Name] { 313 if fUsage[rName] > fQuotas.Resources[rName].Nominal { 314 return true 315 } 316 } 317 } 318 } 319 return false 320 } 321 322 func workloadUsesResources(wl *workload.Info, resPerFlv resourcesPerFlavor) bool { 323 for _, ps := range wl.TotalRequests { 324 for res, flv := range ps.Flavors { 325 if resPerFlv[flv].Has(res) { 326 return true 327 } 328 } 329 } 330 return false 331 } 332 333 func totalRequestsForAssignment(wl *workload.Info, assignment flavorassigner.Assignment) cache.FlavorResourceQuantities { 334 usage := make(cache.FlavorResourceQuantities) 335 for i, ps := range wl.TotalRequests { 336 for res, q := range ps.Requests { 337 flv := assignment.PodSets[i].Flavors[res].Name 338 resUsage := usage[flv] 339 if resUsage == nil { 340 resUsage = make(map[corev1.ResourceName]int64) 341 usage[flv] = resUsage 342 } 343 resUsage[res] += q 344 } 345 } 346 return usage 347 } 348 349 // workloadFits determines if the workload requests would fit given the 350 // requestable resources and simulated usage of the ClusterQueue and its cohort, 351 // if it belongs to one. 352 func workloadFits(wlReq cache.FlavorResourceQuantities, cq *cache.ClusterQueue, allowBorrowing bool) bool { 353 for _, rg := range cq.ResourceGroups { 354 for _, flvQuotas := range rg.Flavors { 355 flvReq, found := wlReq[flvQuotas.Name] 356 if !found { 357 // Workload doesn't request this flavor. 358 continue 359 } 360 cqResUsage := cq.Usage[flvQuotas.Name] 361 for rName, rReq := range flvReq { 362 resource := flvQuotas.Resources[rName] 363 364 if cq.Cohort == nil || !allowBorrowing { 365 if cqResUsage[rName]+rReq > resource.Nominal { 366 return false 367 } 368 } else { 369 // When resource.BorrowingLimit == nil there is no borrowing 370 // limit, so we can skip the check. 371 if resource.BorrowingLimit != nil { 372 if cqResUsage[rName]+rReq > resource.Nominal+*resource.BorrowingLimit { 373 return false 374 } 375 } 376 } 377 378 if cq.Cohort != nil { 379 cohortResUsage := cq.UsedCohortQuota(flvQuotas.Name, rName) 380 requestableQuota := cq.RequestableCohortQuota(flvQuotas.Name, rName) 381 if cohortResUsage+rReq > requestableQuota { 382 return false 383 } 384 } 385 } 386 } 387 } 388 return true 389 } 390 391 // candidatesOrdering criteria: 392 // 0. Workloads already marked for preemption first. 393 // 1. Workloads from other ClusterQueues in the cohort before the ones in the 394 // same ClusterQueue as the preemptor. 395 // 2. Workloads with lower priority first. 396 // 3. Workloads admitted more recently first. 397 func candidatesOrdering(candidates []*workload.Info, cq string, now time.Time) func(int, int) bool { 398 return func(i, j int) bool { 399 a := candidates[i] 400 b := candidates[j] 401 aEvicted := meta.IsStatusConditionTrue(a.Obj.Status.Conditions, kueue.WorkloadEvicted) 402 bEvicted := meta.IsStatusConditionTrue(b.Obj.Status.Conditions, kueue.WorkloadEvicted) 403 if aEvicted != bEvicted { 404 return aEvicted 405 } 406 aInCQ := a.ClusterQueue == cq 407 bInCQ := b.ClusterQueue == cq 408 if aInCQ != bInCQ { 409 return !aInCQ 410 } 411 pa := priority.Priority(a.Obj) 412 pb := priority.Priority(b.Obj) 413 if pa != pb { 414 return pa < pb 415 } 416 timeA := quotaReservationTime(a.Obj, now) 417 timeB := quotaReservationTime(b.Obj, now) 418 if !timeA.Equal(timeB) { 419 return timeA.After(timeB) 420 } 421 // Arbitrary comparison for deterministic sorting. 422 return a.Obj.UID < b.Obj.UID 423 } 424 } 425 426 func quotaReservationTime(wl *kueue.Workload, now time.Time) time.Time { 427 cond := meta.FindStatusCondition(wl.Status.Conditions, kueue.WorkloadQuotaReserved) 428 if cond == nil || cond.Status != metav1.ConditionTrue { 429 // The condition wasn't populated yet, use the current time. 430 return now 431 } 432 return cond.LastTransitionTime.Time 433 }