k8s.io/kubernetes@v1.29.3/pkg/quota/v1/evaluator/core/pods.go (about) 1 /* 2 Copyright 2016 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package core 18 19 import ( 20 "fmt" 21 "strings" 22 "time" 23 24 corev1 "k8s.io/api/core/v1" 25 "k8s.io/apimachinery/pkg/api/resource" 26 "k8s.io/apimachinery/pkg/labels" 27 "k8s.io/apimachinery/pkg/runtime" 28 "k8s.io/apimachinery/pkg/runtime/schema" 29 "k8s.io/apimachinery/pkg/util/sets" 30 "k8s.io/apiserver/pkg/admission" 31 quota "k8s.io/apiserver/pkg/quota/v1" 32 "k8s.io/apiserver/pkg/quota/v1/generic" 33 "k8s.io/apiserver/pkg/util/feature" 34 "k8s.io/utils/clock" 35 36 resourcehelper "k8s.io/kubernetes/pkg/api/v1/resource" 37 api "k8s.io/kubernetes/pkg/apis/core" 38 k8s_api_v1 "k8s.io/kubernetes/pkg/apis/core/v1" 39 "k8s.io/kubernetes/pkg/apis/core/v1/helper" 40 "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos" 41 "k8s.io/kubernetes/pkg/features" 42 ) 43 44 // the name used for object count quota 45 var podObjectCountName = generic.ObjectCountQuotaResourceNameFor(corev1.SchemeGroupVersion.WithResource("pods").GroupResource()) 46 47 // podResources are the set of resources managed by quota associated with pods. 48 var podResources = []corev1.ResourceName{ 49 podObjectCountName, 50 corev1.ResourceCPU, 51 corev1.ResourceMemory, 52 corev1.ResourceEphemeralStorage, 53 corev1.ResourceRequestsCPU, 54 corev1.ResourceRequestsMemory, 55 corev1.ResourceRequestsEphemeralStorage, 56 corev1.ResourceLimitsCPU, 57 corev1.ResourceLimitsMemory, 58 corev1.ResourceLimitsEphemeralStorage, 59 corev1.ResourcePods, 60 } 61 62 // podResourcePrefixes are the set of prefixes for resources (Hugepages, and other 63 // potential extended resources with specific prefix) managed by quota associated with pods. 64 var podResourcePrefixes = []string{ 65 corev1.ResourceHugePagesPrefix, 66 corev1.ResourceRequestsHugePagesPrefix, 67 } 68 69 // requestedResourcePrefixes are the set of prefixes for resources 70 // that might be declared in pod's Resources.Requests/Limits 71 var requestedResourcePrefixes = []string{ 72 corev1.ResourceHugePagesPrefix, 73 } 74 75 // maskResourceWithPrefix mask resource with certain prefix 76 // e.g. hugepages-XXX -> requests.hugepages-XXX 77 func maskResourceWithPrefix(resource corev1.ResourceName, prefix string) corev1.ResourceName { 78 return corev1.ResourceName(fmt.Sprintf("%s%s", prefix, string(resource))) 79 } 80 81 // isExtendedResourceNameForQuota returns true if the extended resource name 82 // has the quota related resource prefix. 83 func isExtendedResourceNameForQuota(name corev1.ResourceName) bool { 84 // As overcommit is not supported by extended resources for now, 85 // only quota objects in format of "requests.resourceName" is allowed. 86 return !helper.IsNativeResource(name) && strings.HasPrefix(string(name), corev1.DefaultResourceRequestsPrefix) 87 } 88 89 // NOTE: it was a mistake, but if a quota tracks cpu or memory related resources, 90 // the incoming pod is required to have those values set. we should not repeat 91 // this mistake for other future resources (gpus, ephemeral-storage,etc). 92 // do not add more resources to this list! 93 var validationSet = sets.NewString( 94 string(corev1.ResourceCPU), 95 string(corev1.ResourceMemory), 96 string(corev1.ResourceRequestsCPU), 97 string(corev1.ResourceRequestsMemory), 98 string(corev1.ResourceLimitsCPU), 99 string(corev1.ResourceLimitsMemory), 100 ) 101 102 // NewPodEvaluator returns an evaluator that can evaluate pods 103 func NewPodEvaluator(f quota.ListerForResourceFunc, clock clock.Clock) quota.Evaluator { 104 listFuncByNamespace := generic.ListResourceUsingListerFunc(f, corev1.SchemeGroupVersion.WithResource("pods")) 105 podEvaluator := &podEvaluator{listFuncByNamespace: listFuncByNamespace, clock: clock} 106 return podEvaluator 107 } 108 109 // podEvaluator knows how to measure usage of pods. 110 type podEvaluator struct { 111 // knows how to list pods 112 listFuncByNamespace generic.ListFuncByNamespace 113 // used to track time 114 clock clock.Clock 115 } 116 117 // Constraints verifies that all required resources are present on the pod 118 // In addition, it validates that the resources are valid (i.e. requests < limits) 119 func (p *podEvaluator) Constraints(required []corev1.ResourceName, item runtime.Object) error { 120 pod, err := toExternalPodOrError(item) 121 if err != nil { 122 return err 123 } 124 125 // BACKWARD COMPATIBILITY REQUIREMENT: if we quota cpu or memory, then each container 126 // must make an explicit request for the resource. this was a mistake. it coupled 127 // validation with resource counting, but we did this before QoS was even defined. 128 // let's not make that mistake again with other resources now that QoS is defined. 129 requiredSet := quota.ToSet(required).Intersection(validationSet) 130 missingSetResourceToContainerNames := make(map[string]sets.String) 131 for i := range pod.Spec.Containers { 132 enforcePodContainerConstraints(&pod.Spec.Containers[i], requiredSet, missingSetResourceToContainerNames) 133 } 134 for i := range pod.Spec.InitContainers { 135 enforcePodContainerConstraints(&pod.Spec.InitContainers[i], requiredSet, missingSetResourceToContainerNames) 136 } 137 if len(missingSetResourceToContainerNames) == 0 { 138 return nil 139 } 140 var resources = sets.NewString() 141 for resource := range missingSetResourceToContainerNames { 142 resources.Insert(resource) 143 } 144 var errorMessages = make([]string, 0, len(missingSetResourceToContainerNames)) 145 for _, resource := range resources.List() { 146 errorMessages = append(errorMessages, fmt.Sprintf("%s for: %s", resource, strings.Join(missingSetResourceToContainerNames[resource].List(), ","))) 147 } 148 return fmt.Errorf("must specify %s", strings.Join(errorMessages, "; ")) 149 } 150 151 // GroupResource that this evaluator tracks 152 func (p *podEvaluator) GroupResource() schema.GroupResource { 153 return corev1.SchemeGroupVersion.WithResource("pods").GroupResource() 154 } 155 156 // Handles returns true if the evaluator should handle the specified attributes. 157 func (p *podEvaluator) Handles(a admission.Attributes) bool { 158 op := a.GetOperation() 159 if op == admission.Create { 160 return true 161 } 162 if feature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) && op == admission.Update { 163 return true 164 } 165 return false 166 } 167 168 // Matches returns true if the evaluator matches the specified quota with the provided input item 169 func (p *podEvaluator) Matches(resourceQuota *corev1.ResourceQuota, item runtime.Object) (bool, error) { 170 return generic.Matches(resourceQuota, item, p.MatchingResources, podMatchesScopeFunc) 171 } 172 173 // MatchingResources takes the input specified list of resources and returns the set of resources it matches. 174 func (p *podEvaluator) MatchingResources(input []corev1.ResourceName) []corev1.ResourceName { 175 result := quota.Intersection(input, podResources) 176 for _, resource := range input { 177 // for resources with certain prefix, e.g. hugepages 178 if quota.ContainsPrefix(podResourcePrefixes, resource) { 179 result = append(result, resource) 180 } 181 // for extended resources 182 if isExtendedResourceNameForQuota(resource) { 183 result = append(result, resource) 184 } 185 } 186 187 return result 188 } 189 190 // MatchingScopes takes the input specified list of scopes and pod object. Returns the set of scope selectors pod matches. 191 func (p *podEvaluator) MatchingScopes(item runtime.Object, scopeSelectors []corev1.ScopedResourceSelectorRequirement) ([]corev1.ScopedResourceSelectorRequirement, error) { 192 matchedScopes := []corev1.ScopedResourceSelectorRequirement{} 193 for _, selector := range scopeSelectors { 194 match, err := podMatchesScopeFunc(selector, item) 195 if err != nil { 196 return []corev1.ScopedResourceSelectorRequirement{}, fmt.Errorf("error on matching scope %v: %v", selector, err) 197 } 198 if match { 199 matchedScopes = append(matchedScopes, selector) 200 } 201 } 202 return matchedScopes, nil 203 } 204 205 // UncoveredQuotaScopes takes the input matched scopes which are limited by configuration and the matched quota scopes. 206 // It returns the scopes which are in limited scopes but don't have a corresponding covering quota scope 207 func (p *podEvaluator) UncoveredQuotaScopes(limitedScopes []corev1.ScopedResourceSelectorRequirement, matchedQuotaScopes []corev1.ScopedResourceSelectorRequirement) ([]corev1.ScopedResourceSelectorRequirement, error) { 208 uncoveredScopes := []corev1.ScopedResourceSelectorRequirement{} 209 for _, selector := range limitedScopes { 210 isCovered := false 211 for _, matchedScopeSelector := range matchedQuotaScopes { 212 if matchedScopeSelector.ScopeName == selector.ScopeName { 213 isCovered = true 214 break 215 } 216 } 217 218 if !isCovered { 219 uncoveredScopes = append(uncoveredScopes, selector) 220 } 221 } 222 return uncoveredScopes, nil 223 } 224 225 // Usage knows how to measure usage associated with pods 226 func (p *podEvaluator) Usage(item runtime.Object) (corev1.ResourceList, error) { 227 // delegate to normal usage 228 return PodUsageFunc(item, p.clock) 229 } 230 231 // UsageStats calculates aggregate usage for the object. 232 func (p *podEvaluator) UsageStats(options quota.UsageStatsOptions) (quota.UsageStats, error) { 233 return generic.CalculateUsageStats(options, p.listFuncByNamespace, podMatchesScopeFunc, p.Usage) 234 } 235 236 // verifies we implement the required interface. 237 var _ quota.Evaluator = &podEvaluator{} 238 239 // enforcePodContainerConstraints checks for required resources that are not set on this container and 240 // adds them to missingSet. 241 func enforcePodContainerConstraints(container *corev1.Container, requiredSet sets.String, missingSetResourceToContainerNames map[string]sets.String) { 242 requests := container.Resources.Requests 243 limits := container.Resources.Limits 244 containerUsage := podComputeUsageHelper(requests, limits) 245 containerSet := quota.ToSet(quota.ResourceNames(containerUsage)) 246 if !containerSet.Equal(requiredSet) { 247 if difference := requiredSet.Difference(containerSet); difference.Len() != 0 { 248 for _, diff := range difference.List() { 249 if _, ok := missingSetResourceToContainerNames[diff]; !ok { 250 missingSetResourceToContainerNames[diff] = sets.NewString(container.Name) 251 } else { 252 missingSetResourceToContainerNames[diff].Insert(container.Name) 253 } 254 } 255 } 256 } 257 } 258 259 // podComputeUsageHelper can summarize the pod compute quota usage based on requests and limits 260 func podComputeUsageHelper(requests corev1.ResourceList, limits corev1.ResourceList) corev1.ResourceList { 261 result := corev1.ResourceList{} 262 result[corev1.ResourcePods] = resource.MustParse("1") 263 if request, found := requests[corev1.ResourceCPU]; found { 264 result[corev1.ResourceCPU] = request 265 result[corev1.ResourceRequestsCPU] = request 266 } 267 if limit, found := limits[corev1.ResourceCPU]; found { 268 result[corev1.ResourceLimitsCPU] = limit 269 } 270 if request, found := requests[corev1.ResourceMemory]; found { 271 result[corev1.ResourceMemory] = request 272 result[corev1.ResourceRequestsMemory] = request 273 } 274 if limit, found := limits[corev1.ResourceMemory]; found { 275 result[corev1.ResourceLimitsMemory] = limit 276 } 277 if request, found := requests[corev1.ResourceEphemeralStorage]; found { 278 result[corev1.ResourceEphemeralStorage] = request 279 result[corev1.ResourceRequestsEphemeralStorage] = request 280 } 281 if limit, found := limits[corev1.ResourceEphemeralStorage]; found { 282 result[corev1.ResourceLimitsEphemeralStorage] = limit 283 } 284 for resource, request := range requests { 285 // for resources with certain prefix, e.g. hugepages 286 if quota.ContainsPrefix(requestedResourcePrefixes, resource) { 287 result[resource] = request 288 result[maskResourceWithPrefix(resource, corev1.DefaultResourceRequestsPrefix)] = request 289 } 290 // for extended resources 291 if helper.IsExtendedResourceName(resource) { 292 // only quota objects in format of "requests.resourceName" is allowed for extended resource. 293 result[maskResourceWithPrefix(resource, corev1.DefaultResourceRequestsPrefix)] = request 294 } 295 } 296 297 return result 298 } 299 300 func toExternalPodOrError(obj runtime.Object) (*corev1.Pod, error) { 301 pod := &corev1.Pod{} 302 switch t := obj.(type) { 303 case *corev1.Pod: 304 pod = t 305 case *api.Pod: 306 if err := k8s_api_v1.Convert_core_Pod_To_v1_Pod(t, pod, nil); err != nil { 307 return nil, err 308 } 309 default: 310 return nil, fmt.Errorf("expect *api.Pod or *v1.Pod, got %v", t) 311 } 312 return pod, nil 313 } 314 315 // podMatchesScopeFunc is a function that knows how to evaluate if a pod matches a scope 316 func podMatchesScopeFunc(selector corev1.ScopedResourceSelectorRequirement, object runtime.Object) (bool, error) { 317 pod, err := toExternalPodOrError(object) 318 if err != nil { 319 return false, err 320 } 321 switch selector.ScopeName { 322 case corev1.ResourceQuotaScopeTerminating: 323 return isTerminating(pod), nil 324 case corev1.ResourceQuotaScopeNotTerminating: 325 return !isTerminating(pod), nil 326 case corev1.ResourceQuotaScopeBestEffort: 327 return isBestEffort(pod), nil 328 case corev1.ResourceQuotaScopeNotBestEffort: 329 return !isBestEffort(pod), nil 330 case corev1.ResourceQuotaScopePriorityClass: 331 if selector.Operator == corev1.ScopeSelectorOpExists { 332 // This is just checking for existence of a priorityClass on the pod, 333 // no need to take the overhead of selector parsing/evaluation. 334 return len(pod.Spec.PriorityClassName) != 0, nil 335 } 336 return podMatchesSelector(pod, selector) 337 case corev1.ResourceQuotaScopeCrossNamespacePodAffinity: 338 return usesCrossNamespacePodAffinity(pod), nil 339 } 340 return false, nil 341 } 342 343 // PodUsageFunc returns the quota usage for a pod. 344 // A pod is charged for quota if the following are not true. 345 // - pod has a terminal phase (failed or succeeded) 346 // - pod has been marked for deletion and grace period has expired 347 func PodUsageFunc(obj runtime.Object, clock clock.Clock) (corev1.ResourceList, error) { 348 pod, err := toExternalPodOrError(obj) 349 if err != nil { 350 return corev1.ResourceList{}, err 351 } 352 353 // always quota the object count (even if the pod is end of life) 354 // object count quotas track all objects that are in storage. 355 // where "pods" tracks all pods that have not reached a terminal state, 356 // count/pods tracks all pods independent of state. 357 result := corev1.ResourceList{ 358 podObjectCountName: *(resource.NewQuantity(1, resource.DecimalSI)), 359 } 360 361 // by convention, we do not quota compute resources that have reached end-of life 362 // note: the "pods" resource is considered a compute resource since it is tied to life-cycle. 363 if !QuotaV1Pod(pod, clock) { 364 return result, nil 365 } 366 367 opts := resourcehelper.PodResourcesOptions{ 368 InPlacePodVerticalScalingEnabled: feature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling), 369 } 370 requests := resourcehelper.PodRequests(pod, opts) 371 limits := resourcehelper.PodLimits(pod, opts) 372 373 result = quota.Add(result, podComputeUsageHelper(requests, limits)) 374 return result, nil 375 } 376 377 func isBestEffort(pod *corev1.Pod) bool { 378 return qos.GetPodQOS(pod) == corev1.PodQOSBestEffort 379 } 380 381 func isTerminating(pod *corev1.Pod) bool { 382 if pod.Spec.ActiveDeadlineSeconds != nil && *pod.Spec.ActiveDeadlineSeconds >= int64(0) { 383 return true 384 } 385 return false 386 } 387 388 func podMatchesSelector(pod *corev1.Pod, selector corev1.ScopedResourceSelectorRequirement) (bool, error) { 389 labelSelector, err := helper.ScopedResourceSelectorRequirementsAsSelector(selector) 390 if err != nil { 391 return false, fmt.Errorf("failed to parse and convert selector: %v", err) 392 } 393 var m map[string]string 394 if len(pod.Spec.PriorityClassName) != 0 { 395 m = map[string]string{string(corev1.ResourceQuotaScopePriorityClass): pod.Spec.PriorityClassName} 396 } 397 if labelSelector.Matches(labels.Set(m)) { 398 return true, nil 399 } 400 return false, nil 401 } 402 403 func crossNamespacePodAffinityTerm(term *corev1.PodAffinityTerm) bool { 404 return len(term.Namespaces) != 0 || term.NamespaceSelector != nil 405 } 406 407 func crossNamespacePodAffinityTerms(terms []corev1.PodAffinityTerm) bool { 408 for _, t := range terms { 409 if crossNamespacePodAffinityTerm(&t) { 410 return true 411 } 412 } 413 return false 414 } 415 416 func crossNamespaceWeightedPodAffinityTerms(terms []corev1.WeightedPodAffinityTerm) bool { 417 for _, t := range terms { 418 if crossNamespacePodAffinityTerm(&t.PodAffinityTerm) { 419 return true 420 } 421 } 422 return false 423 } 424 425 func usesCrossNamespacePodAffinity(pod *corev1.Pod) bool { 426 if pod == nil || pod.Spec.Affinity == nil { 427 return false 428 } 429 430 affinity := pod.Spec.Affinity.PodAffinity 431 if affinity != nil { 432 if crossNamespacePodAffinityTerms(affinity.RequiredDuringSchedulingIgnoredDuringExecution) { 433 return true 434 } 435 if crossNamespaceWeightedPodAffinityTerms(affinity.PreferredDuringSchedulingIgnoredDuringExecution) { 436 return true 437 } 438 } 439 440 antiAffinity := pod.Spec.Affinity.PodAntiAffinity 441 if antiAffinity != nil { 442 if crossNamespacePodAffinityTerms(antiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) { 443 return true 444 } 445 if crossNamespaceWeightedPodAffinityTerms(antiAffinity.PreferredDuringSchedulingIgnoredDuringExecution) { 446 return true 447 } 448 } 449 450 return false 451 } 452 453 // QuotaV1Pod returns true if the pod is eligible to track against a quota 454 // if it's not in a terminal state according to its phase. 455 func QuotaV1Pod(pod *corev1.Pod, clock clock.Clock) bool { 456 // if pod is terminal, ignore it for quota 457 if corev1.PodFailed == pod.Status.Phase || corev1.PodSucceeded == pod.Status.Phase { 458 return false 459 } 460 // if pods are stuck terminating (for example, a node is lost), we do not want 461 // to charge the user for that pod in quota because it could prevent them from 462 // scaling up new pods to service their application. 463 if pod.DeletionTimestamp != nil && pod.DeletionGracePeriodSeconds != nil { 464 now := clock.Now() 465 deletionTime := pod.DeletionTimestamp.Time 466 gracePeriod := time.Duration(*pod.DeletionGracePeriodSeconds) * time.Second 467 if now.After(deletionTime.Add(gracePeriod)) { 468 return false 469 } 470 } 471 return true 472 }