sigs.k8s.io/kueue@v0.6.2/pkg/workload/workload.go (about)

     1  /*
     2  Copyright 2022 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package workload
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"maps"
    23  	"strings"
    24  
    25  	corev1 "k8s.io/api/core/v1"
    26  	apimeta "k8s.io/apimachinery/pkg/api/meta"
    27  	"k8s.io/apimachinery/pkg/api/resource"
    28  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    29  	"k8s.io/utils/ptr"
    30  	"sigs.k8s.io/controller-runtime/pkg/client"
    31  	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
    32  
    33  	config "sigs.k8s.io/kueue/apis/config/v1beta1"
    34  	kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
    35  	"sigs.k8s.io/kueue/pkg/constants"
    36  	"sigs.k8s.io/kueue/pkg/util/api"
    37  	"sigs.k8s.io/kueue/pkg/util/limitrange"
    38  )
    39  
    40  var (
    41  	admissionManagedConditions = []string{kueue.WorkloadQuotaReserved, kueue.WorkloadEvicted, kueue.WorkloadAdmitted}
    42  )
    43  
    44  type AssigmentClusterQueueState struct {
    45  	LastTriedFlavorIdx     []map[corev1.ResourceName]int
    46  	CohortGeneration       int64
    47  	ClusterQueueGeneration int64
    48  }
    49  
    50  func (s *AssigmentClusterQueueState) Clone() *AssigmentClusterQueueState {
    51  	c := AssigmentClusterQueueState{
    52  		LastTriedFlavorIdx:     make([]map[corev1.ResourceName]int, len(s.LastTriedFlavorIdx)),
    53  		CohortGeneration:       s.CohortGeneration,
    54  		ClusterQueueGeneration: s.ClusterQueueGeneration,
    55  	}
    56  	for ps, flavorIdx := range s.LastTriedFlavorIdx {
    57  		c.LastTriedFlavorIdx[ps] = maps.Clone(flavorIdx)
    58  	}
    59  	return &c
    60  }
    61  
    62  // PendingFlavors returns whether there are pending flavors to try
    63  // after the last attempt.
    64  func (s *AssigmentClusterQueueState) PendingFlavors() bool {
    65  	if s == nil {
    66  		// This is only reached in unit tests.
    67  		return false
    68  	}
    69  	for _, podSetIdxs := range s.LastTriedFlavorIdx {
    70  		for _, idx := range podSetIdxs {
    71  			if idx != -1 {
    72  				return true
    73  			}
    74  		}
    75  	}
    76  	return false
    77  }
    78  
    79  // Info holds a Workload object and some pre-processing.
    80  type Info struct {
    81  	Obj *kueue.Workload
    82  	// list of total resources requested by the podsets.
    83  	TotalRequests []PodSetResources
    84  	// Populated from the queue during admission or from the admission field if
    85  	// already admitted.
    86  	ClusterQueue   string
    87  	LastAssignment *AssigmentClusterQueueState
    88  }
    89  
    90  type PodSetResources struct {
    91  	Name     string
    92  	Requests Requests
    93  	Count    int32
    94  	Flavors  map[corev1.ResourceName]kueue.ResourceFlavorReference
    95  }
    96  
    97  func (psr *PodSetResources) ScaledTo(newCount int32) *PodSetResources {
    98  	ret := &PodSetResources{
    99  		Name:     psr.Name,
   100  		Requests: maps.Clone(psr.Requests),
   101  		Count:    psr.Count,
   102  		Flavors:  maps.Clone(psr.Flavors),
   103  	}
   104  	ret.Requests.scaleDown(int64(ret.Count))
   105  	ret.Requests.scaleUp(int64(newCount))
   106  	ret.Count = newCount
   107  	return ret
   108  }
   109  
   110  func NewInfo(w *kueue.Workload) *Info {
   111  	info := &Info{
   112  		Obj: w,
   113  	}
   114  	if w.Status.Admission != nil {
   115  		info.ClusterQueue = string(w.Status.Admission.ClusterQueue)
   116  		info.TotalRequests = totalRequestsFromAdmission(w)
   117  	} else {
   118  		info.TotalRequests = totalRequestsFromPodSets(w)
   119  	}
   120  	return info
   121  }
   122  
   123  func (i *Info) Update(wl *kueue.Workload) {
   124  	i.Obj = wl
   125  }
   126  
   127  func (i *Info) CanBePartiallyAdmitted() bool {
   128  	return CanBePartiallyAdmitted(i.Obj)
   129  }
   130  
   131  func CanBePartiallyAdmitted(wl *kueue.Workload) bool {
   132  	ps := wl.Spec.PodSets
   133  	for psi := range ps {
   134  		if ps[psi].Count > ptr.Deref(ps[psi].MinCount, ps[psi].Count) {
   135  			return true
   136  		}
   137  	}
   138  	return false
   139  }
   140  
   141  func Key(w *kueue.Workload) string {
   142  	return fmt.Sprintf("%s/%s", w.Namespace, w.Name)
   143  }
   144  
   145  func QueueKey(w *kueue.Workload) string {
   146  	return fmt.Sprintf("%s/%s", w.Namespace, w.Spec.QueueName)
   147  }
   148  
   149  func reclaimableCounts(wl *kueue.Workload) map[string]int32 {
   150  	ret := make(map[string]int32, len(wl.Status.ReclaimablePods))
   151  	for i := range wl.Status.ReclaimablePods {
   152  		reclaimInfo := &wl.Status.ReclaimablePods[i]
   153  		ret[reclaimInfo.Name] = reclaimInfo.Count
   154  	}
   155  	return ret
   156  }
   157  
   158  func podSetsCounts(wl *kueue.Workload) map[string]int32 {
   159  
   160  	ret := make(map[string]int32, len(wl.Spec.PodSets))
   161  	for i := range wl.Spec.PodSets {
   162  		ps := &wl.Spec.PodSets[i]
   163  		ret[ps.Name] = ps.Count
   164  	}
   165  	return ret
   166  }
   167  
   168  func podSetsCountsAfterReclaim(wl *kueue.Workload) map[string]int32 {
   169  	totalCounts := podSetsCounts(wl)
   170  	reclaimCounts := reclaimableCounts(wl)
   171  	for podSetName := range totalCounts {
   172  		if rc, found := reclaimCounts[podSetName]; found {
   173  			totalCounts[podSetName] -= rc
   174  		}
   175  	}
   176  	return totalCounts
   177  }
   178  
   179  func totalRequestsFromPodSets(wl *kueue.Workload) []PodSetResources {
   180  	if len(wl.Spec.PodSets) == 0 {
   181  		return nil
   182  	}
   183  	res := make([]PodSetResources, 0, len(wl.Spec.PodSets))
   184  	currentCounts := podSetsCountsAfterReclaim(wl)
   185  	for _, ps := range wl.Spec.PodSets {
   186  		count := currentCounts[ps.Name]
   187  		setRes := PodSetResources{
   188  			Name:  ps.Name,
   189  			Count: count,
   190  		}
   191  		setRes.Requests = newRequests(limitrange.TotalRequests(&ps.Template.Spec))
   192  		setRes.Requests.scaleUp(int64(count))
   193  		res = append(res, setRes)
   194  	}
   195  	return res
   196  }
   197  
   198  func totalRequestsFromAdmission(wl *kueue.Workload) []PodSetResources {
   199  	if wl.Status.Admission == nil {
   200  		return nil
   201  	}
   202  	res := make([]PodSetResources, 0, len(wl.Spec.PodSets))
   203  	currentCounts := podSetsCountsAfterReclaim(wl)
   204  	totalCounts := podSetsCounts(wl)
   205  	for _, psa := range wl.Status.Admission.PodSetAssignments {
   206  		setRes := PodSetResources{
   207  			Name:     psa.Name,
   208  			Flavors:  psa.Flavors,
   209  			Count:    ptr.Deref(psa.Count, totalCounts[psa.Name]),
   210  			Requests: newRequests(psa.ResourceUsage),
   211  		}
   212  
   213  		if count := currentCounts[psa.Name]; count != setRes.Count {
   214  			setRes.Requests.scaleDown(int64(setRes.Count))
   215  			setRes.Requests.scaleUp(int64(count))
   216  			setRes.Count = count
   217  		}
   218  
   219  		res = append(res, setRes)
   220  	}
   221  	return res
   222  }
   223  
// The following resource calculations are inspired by
// https://github.com/kubernetes/kubernetes/blob/master/pkg/scheduler/framework/types.go
   226  
   227  // Requests maps ResourceName to flavor to value; for CPU it is tracked in MilliCPU.
   228  type Requests map[corev1.ResourceName]int64
   229  
   230  func newRequests(rl corev1.ResourceList) Requests {
   231  	r := Requests{}
   232  	for name, quant := range rl {
   233  		r[name] = ResourceValue(name, quant)
   234  	}
   235  	return r
   236  }
   237  
   238  func (r Requests) ToResourceList() corev1.ResourceList {
   239  	ret := make(corev1.ResourceList, len(r))
   240  	for k, v := range r {
   241  		ret[k] = ResourceQuantity(k, v)
   242  	}
   243  	return ret
   244  }
   245  
   246  // ResourceValue returns the integer value for the resource name.
   247  // It's milli-units for CPU and absolute units for everything else.
   248  func ResourceValue(name corev1.ResourceName, q resource.Quantity) int64 {
   249  	if name == corev1.ResourceCPU {
   250  		return q.MilliValue()
   251  	}
   252  	return q.Value()
   253  }
   254  
   255  func ResourceQuantity(name corev1.ResourceName, v int64) resource.Quantity {
   256  	switch name {
   257  	case corev1.ResourceCPU:
   258  		return *resource.NewMilliQuantity(v, resource.DecimalSI)
   259  	case corev1.ResourceMemory, corev1.ResourceEphemeralStorage:
   260  		return *resource.NewQuantity(v, resource.BinarySI)
   261  	default:
   262  		if strings.HasPrefix(string(name), corev1.ResourceHugePagesPrefix) {
   263  			return *resource.NewQuantity(v, resource.BinarySI)
   264  		}
   265  		return *resource.NewQuantity(v, resource.DecimalSI)
   266  	}
   267  }
   268  
   269  func (r Requests) scaleUp(f int64) {
   270  	for name := range r {
   271  		r[name] *= f
   272  	}
   273  }
   274  
   275  func (r Requests) scaleDown(f int64) {
   276  	for name := range r {
   277  		r[name] /= f
   278  	}
   279  }
   280  
   281  // UpdateStatus updates the condition of a workload with ssa,
   282  // fieldManager being set to managerPrefix + "-" + conditionType
   283  func UpdateStatus(ctx context.Context,
   284  	c client.Client,
   285  	wl *kueue.Workload,
   286  	conditionType string,
   287  	conditionStatus metav1.ConditionStatus,
   288  	reason, message string,
   289  	managerPrefix string) error {
   290  	now := metav1.Now()
   291  	condition := metav1.Condition{
   292  		Type:               conditionType,
   293  		Status:             conditionStatus,
   294  		LastTransitionTime: now,
   295  		Reason:             reason,
   296  		Message:            api.TruncateConditionMessage(message),
   297  	}
   298  
   299  	newWl := BaseSSAWorkload(wl)
   300  	newWl.Status.Conditions = []metav1.Condition{condition}
   301  	return c.Status().Patch(ctx, newWl, client.Apply, client.FieldOwner(managerPrefix+"-"+condition.Type))
   302  }
   303  
   304  // UnsetQuotaReservationWithCondition sets the QuotaReserved condition to false and clears
   305  // the admission.
   306  // Returns whether any change was done.
   307  func UnsetQuotaReservationWithCondition(wl *kueue.Workload, reason, message string) bool {
   308  	condition := metav1.Condition{
   309  		Type:               kueue.WorkloadQuotaReserved,
   310  		Status:             metav1.ConditionFalse,
   311  		LastTransitionTime: metav1.Now(),
   312  		Reason:             reason,
   313  		Message:            api.TruncateConditionMessage(message),
   314  	}
   315  	changed := apimeta.SetStatusCondition(&wl.Status.Conditions, condition)
   316  	if wl.Status.Admission != nil {
   317  		wl.Status.Admission = nil
   318  		changed = true
   319  	}
   320  
   321  	// Reset the admitted condition if necessary.
   322  	if SyncAdmittedCondition(wl) {
   323  		changed = true
   324  	}
   325  	return changed
   326  }
   327  
   328  // BaseSSAWorkload creates a new object based on the input workload that
   329  // only contains the fields necessary to identify the original object.
   330  // The object can be used in as a base for Server-Side-Apply.
   331  func BaseSSAWorkload(w *kueue.Workload) *kueue.Workload {
   332  	wlCopy := &kueue.Workload{
   333  		ObjectMeta: metav1.ObjectMeta{
   334  			UID:        w.UID,
   335  			Name:       w.Name,
   336  			Namespace:  w.Namespace,
   337  			Generation: w.Generation, // Produce a conflict if there was a change in the spec.
   338  		},
   339  		TypeMeta: w.TypeMeta,
   340  	}
   341  	if wlCopy.APIVersion == "" {
   342  		wlCopy.APIVersion = kueue.GroupVersion.String()
   343  	}
   344  	if wlCopy.Kind == "" {
   345  		wlCopy.Kind = "Workload"
   346  	}
   347  	return wlCopy
   348  }
   349  
   350  // SetQuotaReservation applies the provided admission to the workload.
   351  // The WorkloadAdmitted and WorkloadEvicted are added or updated if necessary.
   352  func SetQuotaReservation(w *kueue.Workload, admission *kueue.Admission) {
   353  	w.Status.Admission = admission
   354  	admittedCond := metav1.Condition{
   355  		Type:               kueue.WorkloadQuotaReserved,
   356  		Status:             metav1.ConditionTrue,
   357  		LastTransitionTime: metav1.Now(),
   358  		Reason:             "QuotaReserved",
   359  		Message:            fmt.Sprintf("Quota reserved in ClusterQueue %s", w.Status.Admission.ClusterQueue),
   360  	}
   361  	apimeta.SetStatusCondition(&w.Status.Conditions, admittedCond)
   362  
   363  	//reset Evicted condition if present.
   364  	if evictedCond := apimeta.FindStatusCondition(w.Status.Conditions, kueue.WorkloadEvicted); evictedCond != nil {
   365  		evictedCond.Status = metav1.ConditionFalse
   366  		evictedCond.LastTransitionTime = metav1.Now()
   367  	}
   368  }
   369  
   370  func SetEvictedCondition(w *kueue.Workload, reason string, message string) {
   371  	condition := metav1.Condition{
   372  		Type:               kueue.WorkloadEvicted,
   373  		Status:             metav1.ConditionTrue,
   374  		LastTransitionTime: metav1.Now(),
   375  		Reason:             reason,
   376  		Message:            message,
   377  	}
   378  	apimeta.SetStatusCondition(&w.Status.Conditions, condition)
   379  }
   380  
   381  // admissionPatch creates a new object based on the input workload that contains
   382  // the admission and related conditions. The object can be used in Server-Side-Apply.
   383  func admissionPatch(w *kueue.Workload) *kueue.Workload {
   384  	wlCopy := BaseSSAWorkload(w)
   385  
   386  	wlCopy.Status.Admission = w.Status.Admission.DeepCopy()
   387  	wlCopy.Status.RequeueState = w.Status.RequeueState.DeepCopy()
   388  	for _, conditionName := range admissionManagedConditions {
   389  		if existing := apimeta.FindStatusCondition(w.Status.Conditions, conditionName); existing != nil {
   390  			wlCopy.Status.Conditions = append(wlCopy.Status.Conditions, *existing.DeepCopy())
   391  		}
   392  	}
   393  	return wlCopy
   394  }
   395  
   396  // ApplyAdmissionStatus updated all the admission related status fields of a workload with SSA.
   397  // if strict is true, resourceVersion will be part of the patch, make this call fail if Workload
   398  // was changed.
   399  func ApplyAdmissionStatus(ctx context.Context, c client.Client, w *kueue.Workload, strict bool) error {
   400  	patch := admissionPatch(w)
   401  	if strict {
   402  		patch.ResourceVersion = w.ResourceVersion
   403  	}
   404  	return c.Status().Patch(ctx, patch, client.Apply, client.FieldOwner(constants.AdmissionName))
   405  }
   406  
   407  type Ordering struct {
   408  	PodsReadyRequeuingTimestamp config.RequeuingTimestamp
   409  }
   410  
   411  // GetQueueOrderTimestamp return the timestamp to be used by the scheduler. It could
   412  // be the workload creation time or the last time a PodsReady timeout has occurred.
   413  func (o Ordering) GetQueueOrderTimestamp(w *kueue.Workload) *metav1.Time {
   414  	if o.PodsReadyRequeuingTimestamp == config.EvictionTimestamp {
   415  		if evictedCond, evictedByTimout := IsEvictedByPodsReadyTimeout(w); evictedByTimout {
   416  			return &evictedCond.LastTransitionTime
   417  		}
   418  	}
   419  	return &w.CreationTimestamp
   420  }
   421  
   422  // HasQuotaReservation checks if workload is admitted based on conditions
   423  func HasQuotaReservation(w *kueue.Workload) bool {
   424  	return apimeta.IsStatusConditionTrue(w.Status.Conditions, kueue.WorkloadQuotaReserved)
   425  }
   426  
   427  // UpdateReclaimablePods updates the ReclaimablePods list for the workload wit SSA.
   428  func UpdateReclaimablePods(ctx context.Context, c client.Client, w *kueue.Workload, reclaimablePods []kueue.ReclaimablePod) error {
   429  	patch := BaseSSAWorkload(w)
   430  	patch.Status.ReclaimablePods = reclaimablePods
   431  	return c.Status().Patch(ctx, patch, client.Apply, client.FieldOwner(constants.ReclaimablePodsMgr))
   432  }
   433  
   434  // ReclaimablePodsAreEqual checks if two Reclaimable pods are semantically equal
   435  // having the same length and all keys have the same value.
   436  func ReclaimablePodsAreEqual(a, b []kueue.ReclaimablePod) bool {
   437  	if len(a) != len(b) {
   438  		return false
   439  	}
   440  
   441  	mb := make(map[string]int32, len(b))
   442  	for i := range b {
   443  		mb[b[i].Name] = b[i].Count
   444  	}
   445  
   446  	for i := range a {
   447  		if bCount, found := mb[a[i].Name]; !found || bCount != a[i].Count {
   448  			return false
   449  		}
   450  	}
   451  	return true
   452  }
   453  
   454  // HasRequeueState returns true if the workload has re-queue state.
   455  func HasRequeueState(w *kueue.Workload) bool {
   456  	return w.Status.RequeueState != nil
   457  }
   458  
   459  // IsAdmitted returns true if the workload is admitted.
   460  func IsAdmitted(w *kueue.Workload) bool {
   461  	return apimeta.IsStatusConditionTrue(w.Status.Conditions, kueue.WorkloadAdmitted)
   462  }
   463  
   464  // IsFinished returns true if the workload is finished.
   465  func IsFinished(w *kueue.Workload) bool {
   466  	return apimeta.IsStatusConditionTrue(w.Status.Conditions, kueue.WorkloadFinished)
   467  }
   468  
   469  // IsEvictedByDeactivation returns true if the workload is evicted by deactivation.
   470  func IsEvictedByDeactivation(w *kueue.Workload) bool {
   471  	cond := apimeta.FindStatusCondition(w.Status.Conditions, kueue.WorkloadEvicted)
   472  	return cond != nil && cond.Status == metav1.ConditionTrue && cond.Reason == kueue.WorkloadEvictedByDeactivation
   473  }
   474  
   475  func IsEvictedByPodsReadyTimeout(w *kueue.Workload) (*metav1.Condition, bool) {
   476  	cond := apimeta.FindStatusCondition(w.Status.Conditions, kueue.WorkloadEvicted)
   477  	if cond == nil || cond.Status != metav1.ConditionTrue || cond.Reason != kueue.WorkloadEvictedByPodsReadyTimeout {
   478  		return nil, false
   479  	}
   480  	return cond, true
   481  }
   482  
   483  func RemoveFinalizer(ctx context.Context, c client.Client, wl *kueue.Workload) error {
   484  	if controllerutil.RemoveFinalizer(wl, kueue.ResourceInUseFinalizerName) {
   485  		return c.Update(ctx, wl)
   486  	}
   487  	return nil
   488  }