sigs.k8s.io/kueue@v0.6.2/pkg/scheduler/preemption/preemption.go

/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package preemption

import (
	"context"
	"sort"
	"sync/atomic"
	"time"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/client-go/tools/record"
	"k8s.io/client-go/util/workqueue"
	"k8s.io/klog/v2"
	"k8s.io/utils/ptr"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
	"sigs.k8s.io/kueue/pkg/cache"
	"sigs.k8s.io/kueue/pkg/scheduler/flavorassigner"
	"sigs.k8s.io/kueue/pkg/util/priority"
	"sigs.k8s.io/kueue/pkg/util/routine"
	"sigs.k8s.io/kueue/pkg/workload"
)

// parallelPreemptions caps the number of preemption calls issued concurrently.
const parallelPreemptions = 8

// Preemptor computes which admitted Workloads need to be evicted to make room
// for a pending Workload, and issues those preemptions.
type Preemptor struct {
	client   client.Client
	recorder record.EventRecorder

	workloadOrdering workload.Ordering

	// stubs
	applyPreemption func(context.Context, *kueue.Workload) error
}

// New returns a Preemptor that applies preemptions with server-side apply.
func New(cl client.Client, workloadOrdering workload.Ordering, recorder record.EventRecorder) *Preemptor {
	p := &Preemptor{
		client:           cl,
		recorder:         recorder,
		workloadOrdering: workloadOrdering,
	}
	p.applyPreemption = p.applyPreemptionWithSSA
	return p
}

// OverrideApply replaces the function used to apply a preemption, primarily
// so it can be stubbed out in tests.
func (p *Preemptor) OverrideApply(f func(context.Context, *kueue.Workload) error) {
	p.applyPreemption = f
}

// candidatesOnlyFromQueue filters the candidates down to those admitted in
// the given ClusterQueue.
func candidatesOnlyFromQueue(candidates []*workload.Info, clusterQueue string) []*workload.Info {
	result := make([]*workload.Info, 0, len(candidates))
	for _, wi := range candidates {
		if wi.ClusterQueue == clusterQueue {
			result = append(result, wi)
		}
	}
	return result
}

// GetTargets returns the list of workloads that should be evicted in order to
// make room for wl.
func (p *Preemptor) GetTargets(wl workload.Info, assignment flavorassigner.Assignment, snapshot *cache.Snapshot) []*workload.Info {
	resPerFlv := resourcesRequiringPreemption(assignment)
	cq := snapshot.ClusterQueues[wl.ClusterQueue]

	candidates := findCandidates(wl.Obj, p.workloadOrdering, cq, resPerFlv)
	if len(candidates) == 0 {
		return nil
	}
	sort.Slice(candidates, candidatesOrdering(candidates, cq.Name, time.Now()))

	sameQueueCandidates := candidatesOnlyFromQueue(candidates, wl.ClusterQueue)

	// To avoid flapping, Kueue only allows preemption of workloads from the
	// same queue while borrowing. Preemption of workloads from other queues
	// can happen only when not borrowing at the same time. Kueue prioritizes
	// preempting workloads from the other queues (those that borrowed
	// resources) before trying to preempt more of its own workloads while
	// borrowing.

	if len(sameQueueCandidates) == len(candidates) {
		// There are no candidates from other queues, so preemption can only
		// target the same queue; we'll try borrowing.
		return minimalPreemptions(&wl, assignment, snapshot, resPerFlv, candidates, true, nil)
	}

	// There are candidates from other queues in the cohort. We proceed with
	// borrowing only if the dedicated policy (borrowWithinCohort) is enabled.
	// The policy ensures the preempted workloads have lower priority, so they
	// will not preempt the preemptor when requeued.
	borrowWithinCohort := cq.Preemption.BorrowWithinCohort
	if borrowWithinCohort != nil && borrowWithinCohort.Policy != kueue.BorrowWithinCohortPolicyNever {
		allowBorrowingBelowPriority := ptr.To(priority.Priority(wl.Obj))
		if borrowWithinCohort.MaxPriorityThreshold != nil && *borrowWithinCohort.MaxPriorityThreshold < *allowBorrowingBelowPriority {
			allowBorrowingBelowPriority = ptr.To(*borrowWithinCohort.MaxPriorityThreshold + 1)
		}
		return minimalPreemptions(&wl, assignment, snapshot, resPerFlv, candidates, true, allowBorrowingBelowPriority)
	}
	targets := minimalPreemptions(&wl, assignment, snapshot, resPerFlv, candidates, false, nil)
	if len(targets) == 0 {
		// Another attempt. This time only candidates from the same queue, but
		// with borrowing. The previous attempt didn't try borrowing and had a
		// broader scope of preemption.
		targets = minimalPreemptions(&wl, assignment, snapshot, resPerFlv, sameQueueCandidates, true, nil)
	}
	return targets
}
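
// Illustrative only: a minimal sketch of how a caller (such as the scheduler)
// might wire GetTargets and IssuePreemptions together. The variables ctx, cl,
// wl, assignment and snapshot are assumptions of the sketch, not values
// defined in this package.
//
//	preemptor := preemption.New(cl, workloadOrdering, recorder)
//	targets := preemptor.GetTargets(wl, assignment, snapshot)
//	if len(targets) > 0 {
//		count, err := preemptor.IssuePreemptions(ctx, targets, snapshot.ClusterQueues[wl.ClusterQueue])
//		if err != nil {
//			// Requeue wl and retry on the next scheduling cycle.
//		}
//		_ = count // number of preemptions successfully issued
//	}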

// IssuePreemptions marks the target workloads as evicted.
func (p *Preemptor) IssuePreemptions(ctx context.Context, targets []*workload.Info, cq *cache.ClusterQueue) (int, error) {
	log := ctrl.LoggerFrom(ctx)
	errCh := routine.NewErrorChannel()
	ctx, cancel := context.WithCancel(ctx)
	var successfullyPreempted int64
	defer cancel()
	workqueue.ParallelizeUntil(ctx, parallelPreemptions, len(targets), func(i int) {
		target := targets[i]
		if !meta.IsStatusConditionTrue(target.Obj.Status.Conditions, kueue.WorkloadEvicted) {
			err := p.applyPreemption(ctx, target.Obj)
			if err != nil {
				errCh.SendErrorWithCancel(err, cancel)
				return
			}

			origin := "ClusterQueue"
			if cq.Name != target.ClusterQueue {
				origin = "cohort"
			}
			log.V(3).Info("Preempted", "targetWorkload", klog.KObj(target.Obj))
			p.recorder.Eventf(target.Obj, corev1.EventTypeNormal, "Preempted", "Preempted by another workload in the %s", origin)
		} else {
			log.V(3).Info("Preemption ongoing", "targetWorkload", klog.KObj(target.Obj))
		}
		atomic.AddInt64(&successfullyPreempted, 1)
	})
	return int(successfullyPreempted), errCh.ReceiveError()
}
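
// The fan-out above is a common client-go idiom: workqueue.ParallelizeUntil
// bounds concurrency, routine.ErrorChannel surfaces the first error while
// canceling in-flight work, and an atomic counter tallies successes. A minimal
// standalone sketch of the same pattern, where items and doWork are
// hypothetical stand-ins for the per-item work:
//
//	errCh := routine.NewErrorChannel()
//	ctx, cancel := context.WithCancel(ctx)
//	defer cancel()
//	var done int64
//	workqueue.ParallelizeUntil(ctx, parallelPreemptions, len(items), func(i int) {
//		if err := doWork(ctx, items[i]); err != nil {
//			errCh.SendErrorWithCancel(err, cancel)
//			return
//		}
//		atomic.AddInt64(&done, 1)
//	})
//	return int(done), errCh.ReceiveError()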

// applyPreemptionWithSSA sets the Evicted condition on a copy of the Workload
// and patches its status via server-side apply.
func (p *Preemptor) applyPreemptionWithSSA(ctx context.Context, w *kueue.Workload) error {
	w = w.DeepCopy()
	workload.SetEvictedCondition(w, kueue.WorkloadEvictedByPreemption, "Preempted to accommodate a higher priority Workload")
	return workload.ApplyAdmissionStatus(ctx, p.client, w, false)
}

// minimalPreemptions implements a heuristic to find a minimal set of Workloads
// to preempt.
// The heuristic first removes candidates, in the input order, while their
// ClusterQueues are still borrowing resources and while the incoming Workload
// doesn't fit in the quota.
// Once the Workload fits, the heuristic tries to add Workloads back, in the
// reverse order in which they were removed, while the incoming Workload still
// fits.
func minimalPreemptions(wl *workload.Info, assignment flavorassigner.Assignment, snapshot *cache.Snapshot, resPerFlv resourcesPerFlavor, candidates []*workload.Info, allowBorrowing bool, allowBorrowingBelowPriority *int32) []*workload.Info {
	wlReq := totalRequestsForAssignment(wl, assignment)
	cq := snapshot.ClusterQueues[wl.ClusterQueue]

	// Simulate removing all candidates from the ClusterQueue and cohort.
	var targets []*workload.Info
	fits := false
	for _, candWl := range candidates {
		candCQ := snapshot.ClusterQueues[candWl.ClusterQueue]
		if cq != candCQ && !cqIsBorrowing(candCQ, resPerFlv) {
			continue
		}
		if cq != candCQ && allowBorrowingBelowPriority != nil && priority.Priority(candWl.Obj) >= *allowBorrowingBelowPriority {
			// We set allowBorrowing=false if a candidate with priority at or
			// above allowBorrowingBelowPriority is added to targets.
			//
			// We need to be careful mutating allowBorrowing. We rely on the
			// fact that once such a candidate is added to targets, at least
			// one of them remains in the final set of targets (after the
			// second phase of the function).
			//
			// This is true because the candidates are ordered by priority
			// (from lowest to highest, using candidatesOrdering), and the
			// last added target is never removed in the second phase of the
			// function.
			allowBorrowing = false
		}
		snapshot.RemoveWorkload(candWl)
		targets = append(targets, candWl)
		if workloadFits(wlReq, cq, allowBorrowing) {
			fits = true
			break
		}
	}
	if !fits {
		// Reset changes to the snapshot.
		for _, t := range targets {
			snapshot.AddWorkload(t)
		}
		return nil
	}

	// In reverse order, check whether any of the workloads can be added back.
	// The last added target is skipped: it is what made the incoming Workload
	// fit.
	for i := len(targets) - 2; i >= 0; i-- {
		snapshot.AddWorkload(targets[i])
		if workloadFits(wlReq, cq, allowBorrowing) {
			// O(1) deletion: copy the last element into index i and reduce size.
			targets[i] = targets[len(targets)-1]
			targets = targets[:len(targets)-1]
		} else {
			snapshot.RemoveWorkload(targets[i])
		}
	}
	// Reset changes to the snapshot.
	for _, t := range targets {
		snapshot.AddWorkload(t)
	}

	return targets
}
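
// Worked example with hypothetical numbers: suppose the incoming Workload
// needs 6 CPUs of quota and the ordered candidates free 2, 3 and 4 CPUs
// respectively. Phase one removes candidates until the Workload fits: 2, then
// 2+3=5, still short, then 2+3+4=9, which fits. Phase two walks back from the
// second-to-last removal: re-adding the 3-CPU candidate leaves 2+4=6 freed,
// which still fits, so it is dropped from targets; re-adding the 2-CPU
// candidate leaves only 4 freed, which no longer fits, so it stays. The final
// targets free exactly 6 CPUs instead of 9.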

// resourcesPerFlavor maps a resource flavor to the set of resources in that
// flavor for which the incoming Workload requires preemption.
type resourcesPerFlavor map[kueue.ResourceFlavorReference]sets.Set[corev1.ResourceName]

func resourcesRequiringPreemption(assignment flavorassigner.Assignment) resourcesPerFlavor {
	resPerFlavor := make(resourcesPerFlavor)
	for _, ps := range assignment.PodSets {
		for res, flvAssignment := range ps.Flavors {
			// Only flavors assigned in Preempt mode matter here; assignments
			// with NoFit mode wouldn't enter the preemption path.
			if flvAssignment.Mode != flavorassigner.Preempt {
				continue
			}
			if resPerFlavor[flvAssignment.Name] == nil {
				resPerFlavor[flvAssignment.Name] = sets.New(res)
			} else {
				resPerFlavor[flvAssignment.Name].Insert(res)
			}
		}
	}
	return resPerFlavor
}
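
// For instance (hypothetical flavor names), if preemption is needed for CPU
// on flavor "on-demand" and for both CPU and memory on flavor "spot", the
// result has the shape:
//
//	resourcesPerFlavor{
//		"on-demand": sets.New(corev1.ResourceCPU),
//		"spot":      sets.New(corev1.ResourceCPU, corev1.ResourceMemory),
//	}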

// findCandidates obtains candidates for preemption within the ClusterQueue and
// cohort that respect the preemption policy and are using a resource that the
// preempting workload needs.
func findCandidates(wl *kueue.Workload, wo workload.Ordering, cq *cache.ClusterQueue, resPerFlv resourcesPerFlavor) []*workload.Info {
	var candidates []*workload.Info
	wlPriority := priority.Priority(wl)

	if cq.Preemption.WithinClusterQueue != kueue.PreemptionPolicyNever {
		considerSamePrio := (cq.Preemption.WithinClusterQueue == kueue.PreemptionPolicyLowerOrNewerEqualPriority)
		preemptorTS := wo.GetQueueOrderTimestamp(wl)

		for _, candidateWl := range cq.Workloads {
			candidatePriority := priority.Priority(candidateWl.Obj)
			if candidatePriority > wlPriority {
				continue
			}

			if candidatePriority == wlPriority && !(considerSamePrio && preemptorTS.Before(wo.GetQueueOrderTimestamp(candidateWl.Obj))) {
				continue
			}

			if !workloadUsesResources(candidateWl, resPerFlv) {
				continue
			}
			candidates = append(candidates, candidateWl)
		}
	}

	if cq.Cohort != nil && cq.Preemption.ReclaimWithinCohort != kueue.PreemptionPolicyNever {
		for cohortCQ := range cq.Cohort.Members {
			if cq == cohortCQ || !cqIsBorrowing(cohortCQ, resPerFlv) {
				// Can't reclaim quota from itself or ClusterQueues that are not borrowing.
				continue
			}
			onlyLowerPrio := true
			if cq.Preemption.ReclaimWithinCohort == kueue.PreemptionPolicyAny {
				onlyLowerPrio = false
			}
			for _, candidateWl := range cohortCQ.Workloads {
				if onlyLowerPrio && priority.Priority(candidateWl.Obj) >= priority.Priority(wl) {
					continue
				}
				if !workloadUsesResources(candidateWl, resPerFlv) {
					continue
				}
				candidates = append(candidates, candidateWl)
			}
		}
	}
	return candidates
}
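
// For orientation, the candidate rules above are driven by the ClusterQueue's
// preemption spec. A hypothetical configuration that enables both the
// same-queue and the cohort paths (the field and constant names are taken
// from the kueue v1beta1 API, not from this file):
//
//	preemption := kueue.ClusterQueuePreemption{
//		WithinClusterQueue:  kueue.PreemptionPolicyLowerOrNewerEqualPriority,
//		ReclaimWithinCohort: kueue.PreemptionPolicyAny,
//	}
//
// With LowerOrNewerEqualPriority, an equal-priority workload in the same queue
// is a candidate only if it was queued after the preemptor; with Any, every
// workload in a borrowing cohort queue is a candidate regardless of priority.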

// cqIsBorrowing reports whether the ClusterQueue's usage exceeds its nominal
// quota for any of the flavor/resource pairs that require preemption, i.e.
// whether it is currently borrowing from its cohort.
func cqIsBorrowing(cq *cache.ClusterQueue, resPerFlv resourcesPerFlavor) bool {
	if cq.Cohort == nil {
		return false
	}
	for _, rg := range cq.ResourceGroups {
		for _, fQuotas := range rg.Flavors {
			fUsage := cq.Usage[fQuotas.Name]
			for rName := range resPerFlv[fQuotas.Name] {
				if fUsage[rName] > fQuotas.Resources[rName].Nominal {
					return true
				}
			}
		}
	}
	return false
}

// workloadUsesResources reports whether the workload consumes any
// flavor/resource pair for which preemption is required.
func workloadUsesResources(wl *workload.Info, resPerFlv resourcesPerFlavor) bool {
	for _, ps := range wl.TotalRequests {
		for res, flv := range ps.Flavors {
			if resPerFlv[flv].Has(res) {
				return true
			}
		}
	}
	return false
}

// totalRequestsForAssignment aggregates the workload's total requests per
// assigned flavor and resource name.
func totalRequestsForAssignment(wl *workload.Info, assignment flavorassigner.Assignment) cache.FlavorResourceQuantities {
	usage := make(cache.FlavorResourceQuantities)
	for i, ps := range wl.TotalRequests {
		for res, q := range ps.Requests {
			flv := assignment.PodSets[i].Flavors[res].Name
			resUsage := usage[flv]
			if resUsage == nil {
				resUsage = make(map[corev1.ResourceName]int64)
				usage[flv] = resUsage
			}
			resUsage[res] += q
		}
	}
	return usage
}
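
// Shape example with hypothetical values: a Workload assigned the "default"
// flavor for CPU and requesting a total of 6 CPUs would aggregate to the
// following (assuming Kueue's convention of tracking CPU in milli-units):
//
//	cache.FlavorResourceQuantities{
//		"default": {corev1.ResourceCPU: 6000},
//	}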

// workloadFits determines if the workload requests would fit given the
// requestable resources and simulated usage of the ClusterQueue and its cohort,
// if it belongs to one.
func workloadFits(wlReq cache.FlavorResourceQuantities, cq *cache.ClusterQueue, allowBorrowing bool) bool {
	for _, rg := range cq.ResourceGroups {
		for _, flvQuotas := range rg.Flavors {
			flvReq, found := wlReq[flvQuotas.Name]
			if !found {
				// Workload doesn't request this flavor.
				continue
			}
			cqResUsage := cq.Usage[flvQuotas.Name]
			for rName, rReq := range flvReq {
				resource := flvQuotas.Resources[rName]

				if cq.Cohort == nil || !allowBorrowing {
					if cqResUsage[rName]+rReq > resource.Nominal {
						return false
					}
				} else {
					// When resource.BorrowingLimit == nil there is no borrowing
					// limit, so we can skip the check.
					if resource.BorrowingLimit != nil {
						if cqResUsage[rName]+rReq > resource.Nominal+*resource.BorrowingLimit {
							return false
						}
					}
				}

				if cq.Cohort != nil {
					cohortResUsage := cq.UsedCohortQuota(flvQuotas.Name, rName)
					requestableQuota := cq.RequestableCohortQuota(flvQuotas.Name, rName)
					if cohortResUsage+rReq > requestableQuota {
						return false
					}
				}
			}
		}
	}
	return true
}
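
// Numeric sketch with hypothetical quotas: for a flavor with Nominal=10 CPUs
// and BorrowingLimit=5, current usage of 8 and a request of 4, the
// non-borrowing branch fails (8+4 > 10) while the borrowing branch passes
// (8+4 <= 10+5); admission then still depends on the cohort-wide check, which
// compares usage and requestable quota aggregated across the whole cohort.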

// candidatesOrdering returns a less function for sort.Slice that orders
// candidates by the following criteria:
// 0. Workloads already marked for preemption first.
// 1. Workloads from other ClusterQueues in the cohort before the ones in the
// same ClusterQueue as the preemptor.
// 2. Workloads with lower priority first.
// 3. Workloads admitted more recently first.
func candidatesOrdering(candidates []*workload.Info, cq string, now time.Time) func(int, int) bool {
	return func(i, j int) bool {
		a := candidates[i]
		b := candidates[j]
		aEvicted := meta.IsStatusConditionTrue(a.Obj.Status.Conditions, kueue.WorkloadEvicted)
		bEvicted := meta.IsStatusConditionTrue(b.Obj.Status.Conditions, kueue.WorkloadEvicted)
		if aEvicted != bEvicted {
			return aEvicted
		}
		aInCQ := a.ClusterQueue == cq
		bInCQ := b.ClusterQueue == cq
		if aInCQ != bInCQ {
			return !aInCQ
		}
		pa := priority.Priority(a.Obj)
		pb := priority.Priority(b.Obj)
		if pa != pb {
			return pa < pb
		}
		timeA := quotaReservationTime(a.Obj, now)
		timeB := quotaReservationTime(b.Obj, now)
		if !timeA.Equal(timeB) {
			return timeA.After(timeB)
		}
		// Arbitrary comparison for deterministic sorting.
		return a.Obj.UID < b.Obj.UID
	}
}
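
// The less function is meant for sort.Slice, as GetTargets uses it:
//
//	sort.Slice(candidates, candidatesOrdering(candidates, cq.Name, time.Now()))
//
// Worked example with hypothetical candidates: an already-evicted workload
// sorts before everything else; among the rest, a cohort workload with
// priority 100 sorts before a same-queue workload with priority 0, because
// criterion 1 outranks priority; remaining ties fall through to the most
// recent quota reservation and, finally, to the UID.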

// quotaReservationTime returns the time at which the workload's quota was
// reserved, falling back to now when the QuotaReserved condition is not set.
func quotaReservationTime(wl *kueue.Workload, now time.Time) time.Time {
	cond := meta.FindStatusCondition(wl.Status.Conditions, kueue.WorkloadQuotaReserved)
	if cond == nil || cond.Status != metav1.ConditionTrue {
		// The condition hasn't been populated yet; use the current time.
		return now
	}
	return cond.LastTransitionTime.Time
}