k8s.io/kubernetes@v1.29.3/pkg/quota/v1/evaluator/core/pods.go (about)

     1  /*
     2  Copyright 2016 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package core
    18  
    19  import (
    20  	"fmt"
    21  	"strings"
    22  	"time"
    23  
    24  	corev1 "k8s.io/api/core/v1"
    25  	"k8s.io/apimachinery/pkg/api/resource"
    26  	"k8s.io/apimachinery/pkg/labels"
    27  	"k8s.io/apimachinery/pkg/runtime"
    28  	"k8s.io/apimachinery/pkg/runtime/schema"
    29  	"k8s.io/apimachinery/pkg/util/sets"
    30  	"k8s.io/apiserver/pkg/admission"
    31  	quota "k8s.io/apiserver/pkg/quota/v1"
    32  	"k8s.io/apiserver/pkg/quota/v1/generic"
    33  	"k8s.io/apiserver/pkg/util/feature"
    34  	"k8s.io/utils/clock"
    35  
    36  	resourcehelper "k8s.io/kubernetes/pkg/api/v1/resource"
    37  	api "k8s.io/kubernetes/pkg/apis/core"
    38  	k8s_api_v1 "k8s.io/kubernetes/pkg/apis/core/v1"
    39  	"k8s.io/kubernetes/pkg/apis/core/v1/helper"
    40  	"k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
    41  	"k8s.io/kubernetes/pkg/features"
    42  )
    43  
    44  // the name used for object count quota
    45  var podObjectCountName = generic.ObjectCountQuotaResourceNameFor(corev1.SchemeGroupVersion.WithResource("pods").GroupResource())
    46  
    47  // podResources are the set of resources managed by quota associated with pods.
    48  var podResources = []corev1.ResourceName{
    49  	podObjectCountName,
    50  	corev1.ResourceCPU,
    51  	corev1.ResourceMemory,
    52  	corev1.ResourceEphemeralStorage,
    53  	corev1.ResourceRequestsCPU,
    54  	corev1.ResourceRequestsMemory,
    55  	corev1.ResourceRequestsEphemeralStorage,
    56  	corev1.ResourceLimitsCPU,
    57  	corev1.ResourceLimitsMemory,
    58  	corev1.ResourceLimitsEphemeralStorage,
    59  	corev1.ResourcePods,
    60  }
    61  
    62  // podResourcePrefixes are the set of prefixes for resources (Hugepages, and other
    63  // potential extended resources with specific prefix) managed by quota associated with pods.
    64  var podResourcePrefixes = []string{
    65  	corev1.ResourceHugePagesPrefix,
    66  	corev1.ResourceRequestsHugePagesPrefix,
    67  }
    68  
    69  // requestedResourcePrefixes are the set of prefixes for resources
    70  // that might be declared in pod's Resources.Requests/Limits
    71  var requestedResourcePrefixes = []string{
    72  	corev1.ResourceHugePagesPrefix,
    73  }
    74  
    75  // maskResourceWithPrefix mask resource with certain prefix
    76  // e.g. hugepages-XXX -> requests.hugepages-XXX
    77  func maskResourceWithPrefix(resource corev1.ResourceName, prefix string) corev1.ResourceName {
    78  	return corev1.ResourceName(fmt.Sprintf("%s%s", prefix, string(resource)))
    79  }
    80  
    81  // isExtendedResourceNameForQuota returns true if the extended resource name
    82  // has the quota related resource prefix.
    83  func isExtendedResourceNameForQuota(name corev1.ResourceName) bool {
    84  	// As overcommit is not supported by extended resources for now,
    85  	// only quota objects in format of "requests.resourceName" is allowed.
    86  	return !helper.IsNativeResource(name) && strings.HasPrefix(string(name), corev1.DefaultResourceRequestsPrefix)
    87  }
    88  
// validationSet is the set of resource names that Constraints intersects with
// the quota's required resources; each container of an incoming pod must set
// every resource that survives that intersection.
//
// NOTE: it was a mistake, but if a quota tracks cpu or memory related resources,
// the incoming pod is required to have those values set.  we should not repeat
// this mistake for other future resources (gpus, ephemeral-storage,etc).
// do not add more resources to this list!
var validationSet = sets.NewString(
	string(corev1.ResourceCPU),
	string(corev1.ResourceMemory),
	string(corev1.ResourceRequestsCPU),
	string(corev1.ResourceRequestsMemory),
	string(corev1.ResourceLimitsCPU),
	string(corev1.ResourceLimitsMemory),
)
   101  
   102  // NewPodEvaluator returns an evaluator that can evaluate pods
   103  func NewPodEvaluator(f quota.ListerForResourceFunc, clock clock.Clock) quota.Evaluator {
   104  	listFuncByNamespace := generic.ListResourceUsingListerFunc(f, corev1.SchemeGroupVersion.WithResource("pods"))
   105  	podEvaluator := &podEvaluator{listFuncByNamespace: listFuncByNamespace, clock: clock}
   106  	return podEvaluator
   107  }
   108  
// podEvaluator knows how to measure usage of pods.
// It is created by NewPodEvaluator and is asserted to implement quota.Evaluator.
type podEvaluator struct {
	// listFuncByNamespace knows how to list pods; it is handed to
	// generic.CalculateUsageStats by UsageStats.
	listFuncByNamespace generic.ListFuncByNamespace
	// clock is used to track time; Usage passes it to PodUsageFunc.
	clock clock.Clock
}
   116  
   117  // Constraints verifies that all required resources are present on the pod
   118  // In addition, it validates that the resources are valid (i.e. requests < limits)
   119  func (p *podEvaluator) Constraints(required []corev1.ResourceName, item runtime.Object) error {
   120  	pod, err := toExternalPodOrError(item)
   121  	if err != nil {
   122  		return err
   123  	}
   124  
   125  	// BACKWARD COMPATIBILITY REQUIREMENT: if we quota cpu or memory, then each container
   126  	// must make an explicit request for the resource.  this was a mistake.  it coupled
   127  	// validation with resource counting, but we did this before QoS was even defined.
   128  	// let's not make that mistake again with other resources now that QoS is defined.
   129  	requiredSet := quota.ToSet(required).Intersection(validationSet)
   130  	missingSetResourceToContainerNames := make(map[string]sets.String)
   131  	for i := range pod.Spec.Containers {
   132  		enforcePodContainerConstraints(&pod.Spec.Containers[i], requiredSet, missingSetResourceToContainerNames)
   133  	}
   134  	for i := range pod.Spec.InitContainers {
   135  		enforcePodContainerConstraints(&pod.Spec.InitContainers[i], requiredSet, missingSetResourceToContainerNames)
   136  	}
   137  	if len(missingSetResourceToContainerNames) == 0 {
   138  		return nil
   139  	}
   140  	var resources = sets.NewString()
   141  	for resource := range missingSetResourceToContainerNames {
   142  		resources.Insert(resource)
   143  	}
   144  	var errorMessages = make([]string, 0, len(missingSetResourceToContainerNames))
   145  	for _, resource := range resources.List() {
   146  		errorMessages = append(errorMessages, fmt.Sprintf("%s for: %s", resource, strings.Join(missingSetResourceToContainerNames[resource].List(), ",")))
   147  	}
   148  	return fmt.Errorf("must specify %s", strings.Join(errorMessages, "; "))
   149  }
   150  
// GroupResource returns the group/resource that this evaluator tracks:
// the "pods" resource of the corev1 scheme group version.
func (p *podEvaluator) GroupResource() schema.GroupResource {
	return corev1.SchemeGroupVersion.WithResource("pods").GroupResource()
}
   155  
   156  // Handles returns true if the evaluator should handle the specified attributes.
   157  func (p *podEvaluator) Handles(a admission.Attributes) bool {
   158  	op := a.GetOperation()
   159  	if op == admission.Create {
   160  		return true
   161  	}
   162  	if feature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) && op == admission.Update {
   163  		return true
   164  	}
   165  	return false
   166  }
   167  
// Matches returns true if the evaluator matches the specified quota with the provided input item.
// The decision is delegated to generic.Matches, using this evaluator's
// MatchingResources and podMatchesScopeFunc.
func (p *podEvaluator) Matches(resourceQuota *corev1.ResourceQuota, item runtime.Object) (bool, error) {
	return generic.Matches(resourceQuota, item, p.MatchingResources, podMatchesScopeFunc)
}
   172  
   173  // MatchingResources takes the input specified list of resources and returns the set of resources it matches.
   174  func (p *podEvaluator) MatchingResources(input []corev1.ResourceName) []corev1.ResourceName {
   175  	result := quota.Intersection(input, podResources)
   176  	for _, resource := range input {
   177  		// for resources with certain prefix, e.g. hugepages
   178  		if quota.ContainsPrefix(podResourcePrefixes, resource) {
   179  			result = append(result, resource)
   180  		}
   181  		// for extended resources
   182  		if isExtendedResourceNameForQuota(resource) {
   183  			result = append(result, resource)
   184  		}
   185  	}
   186  
   187  	return result
   188  }
   189  
   190  // MatchingScopes takes the input specified list of scopes and pod object. Returns the set of scope selectors pod matches.
   191  func (p *podEvaluator) MatchingScopes(item runtime.Object, scopeSelectors []corev1.ScopedResourceSelectorRequirement) ([]corev1.ScopedResourceSelectorRequirement, error) {
   192  	matchedScopes := []corev1.ScopedResourceSelectorRequirement{}
   193  	for _, selector := range scopeSelectors {
   194  		match, err := podMatchesScopeFunc(selector, item)
   195  		if err != nil {
   196  			return []corev1.ScopedResourceSelectorRequirement{}, fmt.Errorf("error on matching scope %v: %v", selector, err)
   197  		}
   198  		if match {
   199  			matchedScopes = append(matchedScopes, selector)
   200  		}
   201  	}
   202  	return matchedScopes, nil
   203  }
   204  
   205  // UncoveredQuotaScopes takes the input matched scopes which are limited by configuration and the matched quota scopes.
   206  // It returns the scopes which are in limited scopes but don't have a corresponding covering quota scope
   207  func (p *podEvaluator) UncoveredQuotaScopes(limitedScopes []corev1.ScopedResourceSelectorRequirement, matchedQuotaScopes []corev1.ScopedResourceSelectorRequirement) ([]corev1.ScopedResourceSelectorRequirement, error) {
   208  	uncoveredScopes := []corev1.ScopedResourceSelectorRequirement{}
   209  	for _, selector := range limitedScopes {
   210  		isCovered := false
   211  		for _, matchedScopeSelector := range matchedQuotaScopes {
   212  			if matchedScopeSelector.ScopeName == selector.ScopeName {
   213  				isCovered = true
   214  				break
   215  			}
   216  		}
   217  
   218  		if !isCovered {
   219  			uncoveredScopes = append(uncoveredScopes, selector)
   220  		}
   221  	}
   222  	return uncoveredScopes, nil
   223  }
   224  
// Usage knows how to measure usage associated with pods.
// It returns whatever PodUsageFunc computes for item using the evaluator's clock.
func (p *podEvaluator) Usage(item runtime.Object) (corev1.ResourceList, error) {
	// delegate to normal usage
	return PodUsageFunc(item, p.clock)
}
   230  
// UsageStats calculates aggregate usage for the object.
// It delegates to generic.CalculateUsageStats, supplying the evaluator's pod
// lister, podMatchesScopeFunc for scope matching, and Usage for per-pod usage.
func (p *podEvaluator) UsageStats(options quota.UsageStatsOptions) (quota.UsageStats, error) {
	return generic.CalculateUsageStats(options, p.listFuncByNamespace, podMatchesScopeFunc, p.Usage)
}
   235  
   236  // verifies we implement the required interface.
   237  var _ quota.Evaluator = &podEvaluator{}
   238  
   239  // enforcePodContainerConstraints checks for required resources that are not set on this container and
   240  // adds them to missingSet.
   241  func enforcePodContainerConstraints(container *corev1.Container, requiredSet sets.String, missingSetResourceToContainerNames map[string]sets.String) {
   242  	requests := container.Resources.Requests
   243  	limits := container.Resources.Limits
   244  	containerUsage := podComputeUsageHelper(requests, limits)
   245  	containerSet := quota.ToSet(quota.ResourceNames(containerUsage))
   246  	if !containerSet.Equal(requiredSet) {
   247  		if difference := requiredSet.Difference(containerSet); difference.Len() != 0 {
   248  			for _, diff := range difference.List() {
   249  				if _, ok := missingSetResourceToContainerNames[diff]; !ok {
   250  					missingSetResourceToContainerNames[diff] = sets.NewString(container.Name)
   251  				} else {
   252  					missingSetResourceToContainerNames[diff].Insert(container.Name)
   253  				}
   254  			}
   255  		}
   256  	}
   257  }
   258  
   259  // podComputeUsageHelper can summarize the pod compute quota usage based on requests and limits
   260  func podComputeUsageHelper(requests corev1.ResourceList, limits corev1.ResourceList) corev1.ResourceList {
   261  	result := corev1.ResourceList{}
   262  	result[corev1.ResourcePods] = resource.MustParse("1")
   263  	if request, found := requests[corev1.ResourceCPU]; found {
   264  		result[corev1.ResourceCPU] = request
   265  		result[corev1.ResourceRequestsCPU] = request
   266  	}
   267  	if limit, found := limits[corev1.ResourceCPU]; found {
   268  		result[corev1.ResourceLimitsCPU] = limit
   269  	}
   270  	if request, found := requests[corev1.ResourceMemory]; found {
   271  		result[corev1.ResourceMemory] = request
   272  		result[corev1.ResourceRequestsMemory] = request
   273  	}
   274  	if limit, found := limits[corev1.ResourceMemory]; found {
   275  		result[corev1.ResourceLimitsMemory] = limit
   276  	}
   277  	if request, found := requests[corev1.ResourceEphemeralStorage]; found {
   278  		result[corev1.ResourceEphemeralStorage] = request
   279  		result[corev1.ResourceRequestsEphemeralStorage] = request
   280  	}
   281  	if limit, found := limits[corev1.ResourceEphemeralStorage]; found {
   282  		result[corev1.ResourceLimitsEphemeralStorage] = limit
   283  	}
   284  	for resource, request := range requests {
   285  		// for resources with certain prefix, e.g. hugepages
   286  		if quota.ContainsPrefix(requestedResourcePrefixes, resource) {
   287  			result[resource] = request
   288  			result[maskResourceWithPrefix(resource, corev1.DefaultResourceRequestsPrefix)] = request
   289  		}
   290  		// for extended resources
   291  		if helper.IsExtendedResourceName(resource) {
   292  			// only quota objects in format of "requests.resourceName" is allowed for extended resource.
   293  			result[maskResourceWithPrefix(resource, corev1.DefaultResourceRequestsPrefix)] = request
   294  		}
   295  	}
   296  
   297  	return result
   298  }
   299  
   300  func toExternalPodOrError(obj runtime.Object) (*corev1.Pod, error) {
   301  	pod := &corev1.Pod{}
   302  	switch t := obj.(type) {
   303  	case *corev1.Pod:
   304  		pod = t
   305  	case *api.Pod:
   306  		if err := k8s_api_v1.Convert_core_Pod_To_v1_Pod(t, pod, nil); err != nil {
   307  			return nil, err
   308  		}
   309  	default:
   310  		return nil, fmt.Errorf("expect *api.Pod or *v1.Pod, got %v", t)
   311  	}
   312  	return pod, nil
   313  }
   314  
   315  // podMatchesScopeFunc is a function that knows how to evaluate if a pod matches a scope
   316  func podMatchesScopeFunc(selector corev1.ScopedResourceSelectorRequirement, object runtime.Object) (bool, error) {
   317  	pod, err := toExternalPodOrError(object)
   318  	if err != nil {
   319  		return false, err
   320  	}
   321  	switch selector.ScopeName {
   322  	case corev1.ResourceQuotaScopeTerminating:
   323  		return isTerminating(pod), nil
   324  	case corev1.ResourceQuotaScopeNotTerminating:
   325  		return !isTerminating(pod), nil
   326  	case corev1.ResourceQuotaScopeBestEffort:
   327  		return isBestEffort(pod), nil
   328  	case corev1.ResourceQuotaScopeNotBestEffort:
   329  		return !isBestEffort(pod), nil
   330  	case corev1.ResourceQuotaScopePriorityClass:
   331  		if selector.Operator == corev1.ScopeSelectorOpExists {
   332  			// This is just checking for existence of a priorityClass on the pod,
   333  			// no need to take the overhead of selector parsing/evaluation.
   334  			return len(pod.Spec.PriorityClassName) != 0, nil
   335  		}
   336  		return podMatchesSelector(pod, selector)
   337  	case corev1.ResourceQuotaScopeCrossNamespacePodAffinity:
   338  		return usesCrossNamespacePodAffinity(pod), nil
   339  	}
   340  	return false, nil
   341  }
   342  
   343  // PodUsageFunc returns the quota usage for a pod.
   344  // A pod is charged for quota if the following are not true.
   345  //   - pod has a terminal phase (failed or succeeded)
   346  //   - pod has been marked for deletion and grace period has expired
   347  func PodUsageFunc(obj runtime.Object, clock clock.Clock) (corev1.ResourceList, error) {
   348  	pod, err := toExternalPodOrError(obj)
   349  	if err != nil {
   350  		return corev1.ResourceList{}, err
   351  	}
   352  
   353  	// always quota the object count (even if the pod is end of life)
   354  	// object count quotas track all objects that are in storage.
   355  	// where "pods" tracks all pods that have not reached a terminal state,
   356  	// count/pods tracks all pods independent of state.
   357  	result := corev1.ResourceList{
   358  		podObjectCountName: *(resource.NewQuantity(1, resource.DecimalSI)),
   359  	}
   360  
   361  	// by convention, we do not quota compute resources that have reached end-of life
   362  	// note: the "pods" resource is considered a compute resource since it is tied to life-cycle.
   363  	if !QuotaV1Pod(pod, clock) {
   364  		return result, nil
   365  	}
   366  
   367  	opts := resourcehelper.PodResourcesOptions{
   368  		InPlacePodVerticalScalingEnabled: feature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
   369  	}
   370  	requests := resourcehelper.PodRequests(pod, opts)
   371  	limits := resourcehelper.PodLimits(pod, opts)
   372  
   373  	result = quota.Add(result, podComputeUsageHelper(requests, limits))
   374  	return result, nil
   375  }
   376  
// isBestEffort reports whether the QoS class computed by qos.GetPodQOS for the
// pod is BestEffort.
func isBestEffort(pod *corev1.Pod) bool {
	return qos.GetPodQOS(pod) == corev1.PodQOSBestEffort
}
   380  
   381  func isTerminating(pod *corev1.Pod) bool {
   382  	if pod.Spec.ActiveDeadlineSeconds != nil && *pod.Spec.ActiveDeadlineSeconds >= int64(0) {
   383  		return true
   384  	}
   385  	return false
   386  }
   387  
   388  func podMatchesSelector(pod *corev1.Pod, selector corev1.ScopedResourceSelectorRequirement) (bool, error) {
   389  	labelSelector, err := helper.ScopedResourceSelectorRequirementsAsSelector(selector)
   390  	if err != nil {
   391  		return false, fmt.Errorf("failed to parse and convert selector: %v", err)
   392  	}
   393  	var m map[string]string
   394  	if len(pod.Spec.PriorityClassName) != 0 {
   395  		m = map[string]string{string(corev1.ResourceQuotaScopePriorityClass): pod.Spec.PriorityClassName}
   396  	}
   397  	if labelSelector.Matches(labels.Set(m)) {
   398  		return true, nil
   399  	}
   400  	return false, nil
   401  }
   402  
// crossNamespacePodAffinityTerm reports whether the affinity term lists any
// namespaces or sets a namespace selector.
func crossNamespacePodAffinityTerm(term *corev1.PodAffinityTerm) bool {
	return len(term.Namespaces) != 0 || term.NamespaceSelector != nil
}
   406  
   407  func crossNamespacePodAffinityTerms(terms []corev1.PodAffinityTerm) bool {
   408  	for _, t := range terms {
   409  		if crossNamespacePodAffinityTerm(&t) {
   410  			return true
   411  		}
   412  	}
   413  	return false
   414  }
   415  
   416  func crossNamespaceWeightedPodAffinityTerms(terms []corev1.WeightedPodAffinityTerm) bool {
   417  	for _, t := range terms {
   418  		if crossNamespacePodAffinityTerm(&t.PodAffinityTerm) {
   419  			return true
   420  		}
   421  	}
   422  	return false
   423  }
   424  
   425  func usesCrossNamespacePodAffinity(pod *corev1.Pod) bool {
   426  	if pod == nil || pod.Spec.Affinity == nil {
   427  		return false
   428  	}
   429  
   430  	affinity := pod.Spec.Affinity.PodAffinity
   431  	if affinity != nil {
   432  		if crossNamespacePodAffinityTerms(affinity.RequiredDuringSchedulingIgnoredDuringExecution) {
   433  			return true
   434  		}
   435  		if crossNamespaceWeightedPodAffinityTerms(affinity.PreferredDuringSchedulingIgnoredDuringExecution) {
   436  			return true
   437  		}
   438  	}
   439  
   440  	antiAffinity := pod.Spec.Affinity.PodAntiAffinity
   441  	if antiAffinity != nil {
   442  		if crossNamespacePodAffinityTerms(antiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) {
   443  			return true
   444  		}
   445  		if crossNamespaceWeightedPodAffinityTerms(antiAffinity.PreferredDuringSchedulingIgnoredDuringExecution) {
   446  			return true
   447  		}
   448  	}
   449  
   450  	return false
   451  }
   452  
   453  // QuotaV1Pod returns true if the pod is eligible to track against a quota
   454  // if it's not in a terminal state according to its phase.
   455  func QuotaV1Pod(pod *corev1.Pod, clock clock.Clock) bool {
   456  	// if pod is terminal, ignore it for quota
   457  	if corev1.PodFailed == pod.Status.Phase || corev1.PodSucceeded == pod.Status.Phase {
   458  		return false
   459  	}
   460  	// if pods are stuck terminating (for example, a node is lost), we do not want
   461  	// to charge the user for that pod in quota because it could prevent them from
   462  	// scaling up new pods to service their application.
   463  	if pod.DeletionTimestamp != nil && pod.DeletionGracePeriodSeconds != nil {
   464  		now := clock.Now()
   465  		deletionTime := pod.DeletionTimestamp.Time
   466  		gracePeriod := time.Duration(*pod.DeletionGracePeriodSeconds) * time.Second
   467  		if now.After(deletionTime.Add(gracePeriod)) {
   468  			return false
   469  		}
   470  	}
   471  	return true
   472  }