k8s.io/kubernetes@v1.29.3/pkg/scheduler/framework/types.go

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package framework

import (
	"errors"
	"fmt"
	"sort"
	"strings"
	"sync/atomic"
	"time"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	utilerrors "k8s.io/apimachinery/pkg/util/errors"
	"k8s.io/apimachinery/pkg/util/sets"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/klog/v2"

	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	resourcehelper "k8s.io/kubernetes/pkg/api/v1/resource"
	"k8s.io/kubernetes/pkg/features"
	schedutil "k8s.io/kubernetes/pkg/scheduler/util"
)

var generation int64

// ActionType is an integer to represent one type of resource change.
// Different ActionTypes can be bit-wised to compose new semantics.
type ActionType int64

// Constants for ActionTypes.
const (
	Add    ActionType = 1 << iota // 1
	Delete                        // 10
	// UpdateNodeXYZ is only applicable for Node events.
	UpdateNodeAllocatable // 100
	UpdateNodeLabel       // 1000
	UpdateNodeTaint       // 10000
	UpdateNodeCondition   // 100000

	All ActionType = 1<<iota - 1 // 111111

	// Use the general Update type if you don't know or don't care about the specific sub-Update type to use.
	Update = UpdateNodeAllocatable | UpdateNodeLabel | UpdateNodeTaint | UpdateNodeCondition
)

// GVK is short for group/version/kind, which can uniquely represent a particular API resource.
type GVK string

// Constants for GVKs.
const (
	Pod                   GVK = "Pod"
	Node                  GVK = "Node"
	PersistentVolume      GVK = "PersistentVolume"
	PersistentVolumeClaim GVK = "PersistentVolumeClaim"
	PodSchedulingContext  GVK = "PodSchedulingContext"
	ResourceClaim         GVK = "ResourceClaim"
	ResourceClass         GVK = "ResourceClass"
	StorageClass          GVK = "storage.k8s.io/StorageClass"
	CSINode               GVK = "storage.k8s.io/CSINode"
	CSIDriver             GVK = "storage.k8s.io/CSIDriver"
	CSIStorageCapacity    GVK = "storage.k8s.io/CSIStorageCapacity"
	WildCard              GVK = "*"
)

// ClusterEventWithHint pairs a ClusterEvent with an optional QueueingHintFn.
type ClusterEventWithHint struct {
	Event ClusterEvent
	// QueueingHintFn is executed for Pods rejected by this plugin when the above Event happens,
	// and filters out events to reduce useless retries of the Pods' scheduling.
	// It's an optional field. If not set,
	// the scheduling of Pods will always be retried with backoff when this Event happens
	// (the same as Queue).
	QueueingHintFn QueueingHintFn
}
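
// A minimal sketch of how a plugin might use ClusterEventWithHint in its
// EventsToRegister hook (the EnqueueExtensions interface). The plugin type,
// label key, and hint logic below are hypothetical and only illustrate the
// Queue/QueueSkip contract.
//
//	func (pl *sampleNodeLabelPlugin) EventsToRegister() []ClusterEventWithHint {
//		return []ClusterEventWithHint{
//			{
//				Event: ClusterEvent{Resource: Node, ActionType: Add | UpdateNodeLabel},
//				QueueingHintFn: func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (QueueingHint, error) {
//					node, ok := newObj.(*v1.Node)
//					if !ok {
//						// Returning an error makes the caller fall back to Queue.
//						return Queue, fmt.Errorf("unexpected object type %T", newObj)
//					}
//					if node.Labels["example.com/schedulable"] == "true" {
//						// The added/updated Node may make the rejected Pod schedulable.
//						return Queue, nil
//					}
//					// The event cannot help this Pod; skip the retry.
//					return QueueSkip, nil
//				},
//			},
//		}
//	}
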
// QueueingHintFn returns a hint that signals whether the event can make a Pod,
// which was rejected by this plugin in the past scheduling cycle, schedulable or not.
// It's called before a Pod gets moved from unschedulableQ to backoffQ or activeQ.
// If it returns an error, the caller treats the returned QueueingHint as `Queue`,
// regardless of the hint returned here, so that the Pod won't be stuck in the unschedulable pod pool.
//
// - `pod`: the Pod to be enqueued, which was rejected by this plugin in the past.
// - `oldObj`, `newObj`: the objects involved in that event.
//   - For example, if the event is "Node deleted", `oldObj` will be that deleted Node.
//   - `oldObj` is nil if the event is an add event.
//   - `newObj` is nil if the event is a delete event.
type QueueingHintFn func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (QueueingHint, error)

// QueueingHint is the result type of a QueueingHintFn.
type QueueingHint int

const (
	// QueueSkip implies that the cluster event has no impact on
	// scheduling of the pod.
	QueueSkip QueueingHint = iota

	// Queue implies that the Pod may be schedulable by the event.
	Queue
)

func (s QueueingHint) String() string {
	switch s {
	case QueueSkip:
		return "QueueSkip"
	case Queue:
		return "Queue"
	}
	return ""
}

// ClusterEvent abstracts how a system resource's state gets changed.
// Resource represents the standard API resources such as Pod, Node, etc.
// ActionType denotes the specific change such as Add, Update or Delete.
type ClusterEvent struct {
	Resource   GVK
	ActionType ActionType
	Label      string
}

// IsWildCard returns true if ClusterEvent follows WildCard semantics.
func (ce ClusterEvent) IsWildCard() bool {
	return ce.Resource == WildCard && ce.ActionType == All
}

// UnrollWildCardResource expands the WildCard event into per-resource events covering all action types.
func UnrollWildCardResource() []ClusterEventWithHint {
	return []ClusterEventWithHint{
		{Event: ClusterEvent{Resource: Pod, ActionType: All}},
		{Event: ClusterEvent{Resource: Node, ActionType: All}},
		{Event: ClusterEvent{Resource: CSINode, ActionType: All}},
		{Event: ClusterEvent{Resource: CSIDriver, ActionType: All}},
		{Event: ClusterEvent{Resource: CSIStorageCapacity, ActionType: All}},
		{Event: ClusterEvent{Resource: PersistentVolume, ActionType: All}},
		{Event: ClusterEvent{Resource: PersistentVolumeClaim, ActionType: All}},
		{Event: ClusterEvent{Resource: StorageClass, ActionType: All}},
		{Event: ClusterEvent{Resource: PodSchedulingContext, ActionType: All}},
	}
}

// QueuedPodInfo is a Pod wrapper with additional information related to
// the pod's status in the scheduling queue, such as the timestamp when
// it's added to the queue.
type QueuedPodInfo struct {
	*PodInfo
	// The time the pod was added to the scheduling queue.
	Timestamp time.Time
	// Number of schedule attempts before successfully scheduled.
	// It's used to record the # attempts metric.
	Attempts int
	// The time when the pod is added to the queue for the first time. The pod may be added
	// back to the queue multiple times before it's successfully scheduled.
	// It shouldn't be updated once initialized. It's used to record the e2e scheduling
	// latency for a pod.
	InitialAttemptTimestamp *time.Time
	// UnschedulablePlugins records the plugin names that the Pod failed with Unschedulable or UnschedulableAndUnresolvable status.
	// It's registered only when the Pod is rejected in PreFilter, Filter, Reserve, or Permit (WaitOnPermit).
	UnschedulablePlugins sets.Set[string]
	// PendingPlugins records the plugin names that the Pod failed with Pending status.
	PendingPlugins sets.Set[string]
	// Whether the Pod is scheduling gated (by PreEnqueuePlugins) or not.
	Gated bool
}

// DeepCopy returns a deep copy of the QueuedPodInfo object.
func (pqi *QueuedPodInfo) DeepCopy() *QueuedPodInfo {
	return &QueuedPodInfo{
		PodInfo:                 pqi.PodInfo.DeepCopy(),
		Timestamp:               pqi.Timestamp,
		Attempts:                pqi.Attempts,
		InitialAttemptTimestamp: pqi.InitialAttemptTimestamp,
		UnschedulablePlugins:    pqi.UnschedulablePlugins.Clone(),
		Gated:                   pqi.Gated,
	}
}

// PodInfo is a wrapper to a Pod with additional pre-computed information to
// accelerate processing. This information is typically immutable (e.g., pre-processed
// inter-pod affinity selectors).
type PodInfo struct {
	Pod                        *v1.Pod
	RequiredAffinityTerms      []AffinityTerm
	RequiredAntiAffinityTerms  []AffinityTerm
	PreferredAffinityTerms     []WeightedAffinityTerm
	PreferredAntiAffinityTerms []WeightedAffinityTerm
}

// DeepCopy returns a deep copy of the PodInfo object.
func (pi *PodInfo) DeepCopy() *PodInfo {
	return &PodInfo{
		Pod:                        pi.Pod.DeepCopy(),
		RequiredAffinityTerms:      pi.RequiredAffinityTerms,
		RequiredAntiAffinityTerms:  pi.RequiredAntiAffinityTerms,
		PreferredAffinityTerms:     pi.PreferredAffinityTerms,
		PreferredAntiAffinityTerms: pi.PreferredAntiAffinityTerms,
	}
}

// Update refreshes this PodInfo from the given pod. It recomputes all pre-computed
// information by default, and only updates the pod in place when the PodInfo has
// already been instantiated and the passed pod is the exact same one (same UID) as the original.
func (pi *PodInfo) Update(pod *v1.Pod) error {
	if pod != nil && pi.Pod != nil && pi.Pod.UID == pod.UID {
		// PodInfo includes immutable information, and so it is safe to update the pod in place if it is
		// the exact same pod
		pi.Pod = pod
		return nil
	}
	var preferredAffinityTerms []v1.WeightedPodAffinityTerm
	var preferredAntiAffinityTerms []v1.WeightedPodAffinityTerm
	if affinity := pod.Spec.Affinity; affinity != nil {
		if a := affinity.PodAffinity; a != nil {
			preferredAffinityTerms = a.PreferredDuringSchedulingIgnoredDuringExecution
		}
		if a := affinity.PodAntiAffinity; a != nil {
			preferredAntiAffinityTerms = a.PreferredDuringSchedulingIgnoredDuringExecution
		}
	}

	// Attempt to parse the affinity terms
	var parseErrs []error
	requiredAffinityTerms, err := getAffinityTerms(pod, getPodAffinityTerms(pod.Spec.Affinity))
	if err != nil {
		parseErrs = append(parseErrs, fmt.Errorf("requiredAffinityTerms: %w", err))
	}
	requiredAntiAffinityTerms, err := getAffinityTerms(pod,
		getPodAntiAffinityTerms(pod.Spec.Affinity))
	if err != nil {
		parseErrs = append(parseErrs, fmt.Errorf("requiredAntiAffinityTerms: %w", err))
	}
	weightedAffinityTerms, err := getWeightedAffinityTerms(pod, preferredAffinityTerms)
	if err != nil {
		parseErrs = append(parseErrs, fmt.Errorf("preferredAffinityTerms: %w", err))
	}
	weightedAntiAffinityTerms, err := getWeightedAffinityTerms(pod, preferredAntiAffinityTerms)
	if err != nil {
		parseErrs = append(parseErrs, fmt.Errorf("preferredAntiAffinityTerms: %w", err))
	}

	pi.Pod = pod
	pi.RequiredAffinityTerms = requiredAffinityTerms
	pi.RequiredAntiAffinityTerms = requiredAntiAffinityTerms
	pi.PreferredAffinityTerms = weightedAffinityTerms
	pi.PreferredAntiAffinityTerms = weightedAntiAffinityTerms
	return utilerrors.NewAggregate(parseErrs)
}
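
// A small usage sketch (illustrative caller code) of building a PodInfo up front so the
// affinity terms are parsed once and reused across scheduling cycles. The pod and logger
// variables are hypothetical.
//
//	podInfo, err := NewPodInfo(pod)
//	if err != nil {
//		// err aggregates every affinity-term selector that failed to parse.
//		logger.Error(err, "Failed to pre-process affinity terms", "pod", klog.KObj(pod))
//	}
//	// Passing the same pod object (same UID) again is a cheap in-place update.
//	_ = podInfo.Update(pod)
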
// AffinityTerm is a processed version of v1.PodAffinityTerm.
type AffinityTerm struct {
	Namespaces        sets.Set[string]
	Selector          labels.Selector
	TopologyKey       string
	NamespaceSelector labels.Selector
}

// Matches returns true if the pod is in one of the term's Namespaces (or matches its
// NamespaceSelector) and the pod's labels match the term's Selector.
func (at *AffinityTerm) Matches(pod *v1.Pod, nsLabels labels.Set) bool {
	if at.Namespaces.Has(pod.Namespace) || at.NamespaceSelector.Matches(nsLabels) {
		return at.Selector.Matches(labels.Set(pod.Labels))
	}
	return false
}

// WeightedAffinityTerm is a "processed" representation of v1.WeightedAffinityTerm.
type WeightedAffinityTerm struct {
	AffinityTerm
	Weight int32
}

// Diagnosis records the details to diagnose a scheduling failure.
type Diagnosis struct {
	NodeToStatusMap NodeToStatusMap
	// UnschedulablePlugins are plugins that return Unschedulable or UnschedulableAndUnresolvable.
	UnschedulablePlugins sets.Set[string]
	// PendingPlugins are plugins that return Pending.
	PendingPlugins sets.Set[string]
	// PreFilterMsg records the messages returned from PreFilter plugins.
	PreFilterMsg string
	// PostFilterMsg records the messages returned from PostFilter plugins.
	PostFilterMsg string
}

// FitError describes a fit error of a pod.
type FitError struct {
	Pod         *v1.Pod
	NumAllNodes int
	Diagnosis   Diagnosis
}

const (
	// NoNodeAvailableMsg is used to format the message when no nodes are available.
	NoNodeAvailableMsg = "0/%v nodes are available"
)

// AddPluginStatus records the plugin that produced the given status in
// UnschedulablePlugins and/or PendingPlugins.
func (d *Diagnosis) AddPluginStatus(sts *Status) {
	if sts.Plugin() == "" {
		return
	}
	if sts.IsRejected() {
		if d.UnschedulablePlugins == nil {
			d.UnschedulablePlugins = sets.New[string]()
		}
		d.UnschedulablePlugins.Insert(sts.Plugin())
	}
	if sts.Code() == Pending {
		if d.PendingPlugins == nil {
			d.PendingPlugins = sets.New[string]()
		}
		d.PendingPlugins.Insert(sts.Plugin())
	}
}

// Error returns detailed information of why the pod failed to fit on each node.
// A message format is "0/X nodes are available: <PreFilterMsg>. <FilterMsg>. <PostFilterMsg>."
func (f *FitError) Error() string {
	reasonMsg := fmt.Sprintf(NoNodeAvailableMsg+":", f.NumAllNodes)
	preFilterMsg := f.Diagnosis.PreFilterMsg
	if preFilterMsg != "" {
		// PreFilter plugin returns unschedulable.
		// Add the messages from PreFilter plugins to reasonMsg.
		reasonMsg += fmt.Sprintf(" %v.", preFilterMsg)
	}

	if preFilterMsg == "" {
		// The scheduling cycle went through the PreFilter extension point successfully.
		//
		// When the prefilter plugin returns unschedulable,
		// the scheduling framework inserts the same unschedulable status to all nodes in NodeToStatusMap.
		// So, we shouldn't add the message from NodeToStatusMap when the PreFilter failed.
		// Otherwise, we will have duplicated reasons in the error message.
		reasons := make(map[string]int)
		for _, status := range f.Diagnosis.NodeToStatusMap {
			for _, reason := range status.Reasons() {
				reasons[reason]++
			}
		}

		sortReasonsHistogram := func() []string {
			var reasonStrings []string
			for k, v := range reasons {
				reasonStrings = append(reasonStrings, fmt.Sprintf("%v %v", v, k))
			}
			sort.Strings(reasonStrings)
			return reasonStrings
		}
		sortedFilterMsg := sortReasonsHistogram()
		if len(sortedFilterMsg) != 0 {
			reasonMsg += fmt.Sprintf(" %v.", strings.Join(sortedFilterMsg, ", "))
		}
	}

	// Add the messages from PostFilter plugins to reasonMsg.
	// We can add this message regardless of whether the scheduling cycle fails at PreFilter or Filter
	// since we may run PostFilter (if enabled) in both cases.
	postFilterMsg := f.Diagnosis.PostFilterMsg
	if postFilterMsg != "" {
		reasonMsg += fmt.Sprintf(" %v", postFilterMsg)
	}
	return reasonMsg
}

func newAffinityTerm(pod *v1.Pod, term *v1.PodAffinityTerm) (*AffinityTerm, error) {
	selector, err := metav1.LabelSelectorAsSelector(term.LabelSelector)
	if err != nil {
		return nil, err
	}

	namespaces := getNamespacesFromPodAffinityTerm(pod, term)
	nsSelector, err := metav1.LabelSelectorAsSelector(term.NamespaceSelector)
	if err != nil {
		return nil, err
	}

	return &AffinityTerm{Namespaces: namespaces, Selector: selector, TopologyKey: term.TopologyKey, NamespaceSelector: nsSelector}, nil
}

// getAffinityTerms receives a Pod and affinity terms and returns the namespaces and
// selectors of the terms.
func getAffinityTerms(pod *v1.Pod, v1Terms []v1.PodAffinityTerm) ([]AffinityTerm, error) {
	if v1Terms == nil {
		return nil, nil
	}

	var terms []AffinityTerm
	for i := range v1Terms {
		t, err := newAffinityTerm(pod, &v1Terms[i])
		if err != nil {
			// We get here if the label selector failed to process
			return nil, err
		}
		terms = append(terms, *t)
	}
	return terms, nil
}

// getWeightedAffinityTerms returns the list of processed affinity terms.
func getWeightedAffinityTerms(pod *v1.Pod, v1Terms []v1.WeightedPodAffinityTerm) ([]WeightedAffinityTerm, error) {
	if v1Terms == nil {
		return nil, nil
	}

	var terms []WeightedAffinityTerm
	for i := range v1Terms {
		t, err := newAffinityTerm(pod, &v1Terms[i].PodAffinityTerm)
		if err != nil {
			// We get here if the label selector failed to process
			return nil, err
		}
		terms = append(terms, WeightedAffinityTerm{AffinityTerm: *t, Weight: v1Terms[i].Weight})
	}
	return terms, nil
}

// NewPodInfo returns a new PodInfo.
func NewPodInfo(pod *v1.Pod) (*PodInfo, error) {
	pInfo := &PodInfo{}
	err := pInfo.Update(pod)
	return pInfo, err
}

func getPodAffinityTerms(affinity *v1.Affinity) (terms []v1.PodAffinityTerm) {
	if affinity != nil && affinity.PodAffinity != nil {
		if len(affinity.PodAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 {
			terms = affinity.PodAffinity.RequiredDuringSchedulingIgnoredDuringExecution
		}
		// TODO: Uncomment this block when implementing RequiredDuringSchedulingRequiredDuringExecution.
		// if len(affinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
		//	terms = append(terms, affinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
		// }
	}
	return terms
}

func getPodAntiAffinityTerms(affinity *v1.Affinity) (terms []v1.PodAffinityTerm) {
	if affinity != nil && affinity.PodAntiAffinity != nil {
		if len(affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 {
			terms = affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution
		}
		// TODO: Uncomment this block when implementing RequiredDuringSchedulingRequiredDuringExecution.
		// if len(affinity.PodAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
		//	terms = append(terms, affinity.PodAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
		// }
	}
	return terms
}

// getNamespacesFromPodAffinityTerm returns a set of names according to the namespaces indicated in podAffinityTerm.
// If Namespaces is empty it considers the given pod's namespace.
func getNamespacesFromPodAffinityTerm(pod *v1.Pod, podAffinityTerm *v1.PodAffinityTerm) sets.Set[string] {
	names := sets.Set[string]{}
	if len(podAffinityTerm.Namespaces) == 0 && podAffinityTerm.NamespaceSelector == nil {
		names.Insert(pod.Namespace)
	} else {
		names.Insert(podAffinityTerm.Namespaces...)
	}
	return names
}

// ImageStateSummary provides summarized information about the state of an image.
type ImageStateSummary struct {
	// Size of the image
	Size int64
	// Used to track how many nodes have this image; it is computed from the Nodes field below
	// during the execution of Snapshot.
	NumNodes int
	// A set of node names for nodes having this image present. This field is used for
	// keeping track of the nodes during update/add/remove events.
	Nodes sets.Set[string]
}

// Snapshot returns a copy of the ImageStateSummary without the Nodes field.
func (iss *ImageStateSummary) Snapshot() *ImageStateSummary {
	return &ImageStateSummary{
		Size:     iss.Size,
		NumNodes: iss.Nodes.Len(),
	}
}

// NodeInfo is node level aggregated information.
type NodeInfo struct {
	// Overall node information.
	node *v1.Node

	// Pods running on the node.
	Pods []*PodInfo

	// The subset of pods with affinity.
	PodsWithAffinity []*PodInfo

	// The subset of pods with required anti-affinity.
	PodsWithRequiredAntiAffinity []*PodInfo

	// Ports allocated on the node.
	UsedPorts HostPortInfo

	// Total requested resources of all pods on this node. This includes assumed
	// pods, which scheduler has sent for binding, but may not be scheduled yet.
	Requested *Resource
	// Total requested resources of all pods on this node with a minimum value
	// applied to each container's CPU and memory requests. This does not reflect
	// the actual resource requests for this node, but is used to avoid scheduling
	// many zero-request pods onto one node.
	NonZeroRequested *Resource
	// We store allocatedResources (which is Node.Status.Allocatable.*) explicitly
	// as int64, to avoid conversions and map accesses.
	Allocatable *Resource

	// ImageStates holds the entry of an image if and only if this image is on the node. The entry can be used for
	// checking an image's existence and advanced usage (e.g., image locality scheduling policy) based on the image
	// state information.
	ImageStates map[string]*ImageStateSummary

	// PVCRefCounts contains a mapping of PVC names to the number of pods on the node using them.
	// Keys are in the format "namespace/name".
	PVCRefCounts map[string]int

	// Whenever NodeInfo changes, generation is bumped.
	// This is used to avoid cloning it if the object didn't change.
	Generation int64
}

// nextGeneration: Let's make sure history never forgets the name...
// Increments the generation number monotonically ensuring that generation numbers never collide.
// Collision of the generation numbers would be particularly problematic if a node was deleted and
// added back with the same name. See issue#63262.
func nextGeneration() int64 {
	return atomic.AddInt64(&generation, 1)
}

// Resource is a collection of compute resources.
type Resource struct {
	MilliCPU         int64
	Memory           int64
	EphemeralStorage int64
	// We store allowedPodNumber (which is Node.Status.Allocatable.Pods().Value())
	// explicitly as int, to avoid conversions and improve performance.
	AllowedPodNumber int
	// ScalarResources
	ScalarResources map[v1.ResourceName]int64
}

// NewResource creates a Resource from a ResourceList.
func NewResource(rl v1.ResourceList) *Resource {
	r := &Resource{}
	r.Add(rl)
	return r
}

// Add adds ResourceList into Resource.
func (r *Resource) Add(rl v1.ResourceList) {
	if r == nil {
		return
	}

	for rName, rQuant := range rl {
		switch rName {
		case v1.ResourceCPU:
			r.MilliCPU += rQuant.MilliValue()
		case v1.ResourceMemory:
			r.Memory += rQuant.Value()
		case v1.ResourcePods:
			r.AllowedPodNumber += int(rQuant.Value())
		case v1.ResourceEphemeralStorage:
			r.EphemeralStorage += rQuant.Value()
		default:
			if schedutil.IsScalarResourceName(rName) {
				r.AddScalar(rName, rQuant.Value())
			}
		}
	}
}

// Clone returns a copy of this resource.
func (r *Resource) Clone() *Resource {
	res := &Resource{
		MilliCPU:         r.MilliCPU,
		Memory:           r.Memory,
		AllowedPodNumber: r.AllowedPodNumber,
		EphemeralStorage: r.EphemeralStorage,
	}
	if r.ScalarResources != nil {
		res.ScalarResources = make(map[v1.ResourceName]int64, len(r.ScalarResources))
		for k, v := range r.ScalarResources {
			res.ScalarResources[k] = v
		}
	}
	return res
}

// AddScalar adds the given quantity to the named scalar resource.
func (r *Resource) AddScalar(name v1.ResourceName, quantity int64) {
	r.SetScalar(name, r.ScalarResources[name]+quantity)
}

// SetScalar sets the named scalar resource to the given quantity.
func (r *Resource) SetScalar(name v1.ResourceName, quantity int64) {
	// Lazily allocate scalar resource map.
	if r.ScalarResources == nil {
		r.ScalarResources = map[v1.ResourceName]int64{}
	}
	r.ScalarResources[name] = quantity
}

// SetMaxResource compares with the given ResourceList and takes the max value for each Resource.
func (r *Resource) SetMaxResource(rl v1.ResourceList) {
	if r == nil {
		return
	}

	for rName, rQuantity := range rl {
		switch rName {
		case v1.ResourceMemory:
			r.Memory = max(r.Memory, rQuantity.Value())
		case v1.ResourceCPU:
			r.MilliCPU = max(r.MilliCPU, rQuantity.MilliValue())
		case v1.ResourceEphemeralStorage:
			r.EphemeralStorage = max(r.EphemeralStorage, rQuantity.Value())
		default:
			if schedutil.IsScalarResourceName(rName) {
				r.SetScalar(rName, max(r.ScalarResources[rName], rQuantity.Value()))
			}
		}
	}
}
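
// A short usage sketch (illustrative caller code) of building a Resource from a ResourceList
// and folding in a second list with SetMaxResource. The quantities are hypothetical; resource
// below refers to k8s.io/apimachinery/pkg/api/resource.
//
//	alloc := NewResource(v1.ResourceList{
//		v1.ResourceCPU:    resource.MustParse("4"),
//		v1.ResourceMemory: resource.MustParse("8Gi"),
//		v1.ResourcePods:   resource.MustParse("110"),
//	})
//	// alloc.MilliCPU == 4000, alloc.Memory == 8Gi in bytes, alloc.AllowedPodNumber == 110
//	alloc.SetMaxResource(v1.ResourceList{v1.ResourceCPU: resource.MustParse("8")})
//	// alloc.MilliCPU is now 8000; the other fields are unchanged.
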
// NewNodeInfo returns a ready to use empty NodeInfo object.
// If any pods are given in arguments, their information will be aggregated in
// the returned object.
func NewNodeInfo(pods ...*v1.Pod) *NodeInfo {
	ni := &NodeInfo{
		Requested:        &Resource{},
		NonZeroRequested: &Resource{},
		Allocatable:      &Resource{},
		Generation:       nextGeneration(),
		UsedPorts:        make(HostPortInfo),
		ImageStates:      make(map[string]*ImageStateSummary),
		PVCRefCounts:     make(map[string]int),
	}
	for _, pod := range pods {
		ni.AddPod(pod)
	}
	return ni
}

// Node returns overall information about this node.
func (n *NodeInfo) Node() *v1.Node {
	if n == nil {
		return nil
	}
	return n.node
}

// Snapshot returns a copy of this node, except that ImageStates is copied without the Nodes field.
func (n *NodeInfo) Snapshot() *NodeInfo {
	clone := &NodeInfo{
		node:             n.node,
		Requested:        n.Requested.Clone(),
		NonZeroRequested: n.NonZeroRequested.Clone(),
		Allocatable:      n.Allocatable.Clone(),
		UsedPorts:        make(HostPortInfo),
		ImageStates:      make(map[string]*ImageStateSummary),
		PVCRefCounts:     make(map[string]int),
		Generation:       n.Generation,
	}
	if len(n.Pods) > 0 {
		clone.Pods = append([]*PodInfo(nil), n.Pods...)
	}
	if len(n.UsedPorts) > 0 {
		// HostPortInfo is a map-in-map struct,
		// make sure it's deep copied.
		for ip, portMap := range n.UsedPorts {
			clone.UsedPorts[ip] = make(map[ProtocolPort]struct{})
			for protocolPort, v := range portMap {
				clone.UsedPorts[ip][protocolPort] = v
			}
		}
	}
	if len(n.PodsWithAffinity) > 0 {
		clone.PodsWithAffinity = append([]*PodInfo(nil), n.PodsWithAffinity...)
	}
	if len(n.PodsWithRequiredAntiAffinity) > 0 {
		clone.PodsWithRequiredAntiAffinity = append([]*PodInfo(nil), n.PodsWithRequiredAntiAffinity...)
	}
	if len(n.ImageStates) > 0 {
		state := make(map[string]*ImageStateSummary, len(n.ImageStates))
		for imageName, imageState := range n.ImageStates {
			state[imageName] = imageState.Snapshot()
		}
		clone.ImageStates = state
	}
	for key, value := range n.PVCRefCounts {
		clone.PVCRefCounts[key] = value
	}
	return clone
}

// String returns a human-readable representation of this NodeInfo.
func (n *NodeInfo) String() string {
	podKeys := make([]string, len(n.Pods))
	for i, p := range n.Pods {
		podKeys[i] = p.Pod.Name
	}
	return fmt.Sprintf("&NodeInfo{Pods:%v, RequestedResource:%#v, NonZeroRequest: %#v, UsedPort: %#v, AllocatableResource:%#v}",
		podKeys, n.Requested, n.NonZeroRequested, n.UsedPorts, n.Allocatable)
}

// AddPodInfo adds pod information to this NodeInfo.
// Consider using this instead of AddPod if a PodInfo is already computed.
func (n *NodeInfo) AddPodInfo(podInfo *PodInfo) {
	n.Pods = append(n.Pods, podInfo)
	if podWithAffinity(podInfo.Pod) {
		n.PodsWithAffinity = append(n.PodsWithAffinity, podInfo)
	}
	if podWithRequiredAntiAffinity(podInfo.Pod) {
		n.PodsWithRequiredAntiAffinity = append(n.PodsWithRequiredAntiAffinity, podInfo)
	}
	n.update(podInfo.Pod, 1)
}

// AddPod is a wrapper around AddPodInfo.
func (n *NodeInfo) AddPod(pod *v1.Pod) {
	// Ignore this err since the apiserver doesn't properly validate affinity terms
	// and we can't fix the validation for backwards compatibility.
	podInfo, _ := NewPodInfo(pod)
	n.AddPodInfo(podInfo)
}

func podWithAffinity(p *v1.Pod) bool {
	affinity := p.Spec.Affinity
	return affinity != nil && (affinity.PodAffinity != nil || affinity.PodAntiAffinity != nil)
}

func podWithRequiredAntiAffinity(p *v1.Pod) bool {
	affinity := p.Spec.Affinity
	return affinity != nil && affinity.PodAntiAffinity != nil &&
		len(affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0
}

func removeFromSlice(logger klog.Logger, s []*PodInfo, k string) ([]*PodInfo, bool) {
	var removed bool
	for i := range s {
		tmpKey, err := GetPodKey(s[i].Pod)
		if err != nil {
			logger.Error(err, "Cannot get pod key", "pod", klog.KObj(s[i].Pod))
			continue
		}
		if k == tmpKey {
			// delete the element
			s[i] = s[len(s)-1]
			s = s[:len(s)-1]
			removed = true
			break
		}
	}
	// resets the slices to nil so that we can do DeepEqual in unit tests.
	if len(s) == 0 {
		return nil, removed
	}
	return s, removed
}

// RemovePod subtracts pod information from this NodeInfo.
func (n *NodeInfo) RemovePod(logger klog.Logger, pod *v1.Pod) error {
	k, err := GetPodKey(pod)
	if err != nil {
		return err
	}
	if podWithAffinity(pod) {
		n.PodsWithAffinity, _ = removeFromSlice(logger, n.PodsWithAffinity, k)
	}
	if podWithRequiredAntiAffinity(pod) {
		n.PodsWithRequiredAntiAffinity, _ = removeFromSlice(logger, n.PodsWithRequiredAntiAffinity, k)
	}

	var removed bool
	if n.Pods, removed = removeFromSlice(logger, n.Pods, k); removed {
		n.update(pod, -1)
		return nil
	}
	return fmt.Errorf("no corresponding pod %s in pods of node %s", pod.Name, n.node.Name)
}

// update node info based on the pod and sign.
// The sign will be set to `+1` when AddPod and to `-1` when RemovePod.
func (n *NodeInfo) update(pod *v1.Pod, sign int64) {
	res, non0CPU, non0Mem := calculateResource(pod)
	n.Requested.MilliCPU += sign * res.MilliCPU
	n.Requested.Memory += sign * res.Memory
	n.Requested.EphemeralStorage += sign * res.EphemeralStorage
	if n.Requested.ScalarResources == nil && len(res.ScalarResources) > 0 {
		n.Requested.ScalarResources = map[v1.ResourceName]int64{}
	}
	for rName, rQuant := range res.ScalarResources {
		n.Requested.ScalarResources[rName] += sign * rQuant
	}
	n.NonZeroRequested.MilliCPU += sign * non0CPU
	n.NonZeroRequested.Memory += sign * non0Mem

	// Consume ports when pod added or release ports when pod removed.
	n.updateUsedPorts(pod, sign > 0)
	n.updatePVCRefCounts(pod, sign > 0)

	n.Generation = nextGeneration()
}

func max(a, b int64) int64 {
	if a >= b {
		return a
	}
	return b
}

// calculateResource computes the pod's total resource requests, along with its non-zero
// CPU and memory requests (unspecified CPU/memory requests are counted using default minimums).
func calculateResource(pod *v1.Pod) (Resource, int64, int64) {
	var non0InitCPU, non0InitMem int64
	var non0CPU, non0Mem int64
	requests := resourcehelper.PodRequests(pod, resourcehelper.PodResourcesOptions{
		InPlacePodVerticalScalingEnabled: utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
		ContainerFn: func(requests v1.ResourceList, containerType podutil.ContainerType) {
			non0CPUReq, non0MemReq := schedutil.GetNonzeroRequests(&requests)
			switch containerType {
			case podutil.Containers:
				non0CPU += non0CPUReq
				non0Mem += non0MemReq
			case podutil.InitContainers:
				non0InitCPU = max(non0InitCPU, non0CPUReq)
				non0InitMem = max(non0InitMem, non0MemReq)
			}
		},
	})

	non0CPU = max(non0CPU, non0InitCPU)
	non0Mem = max(non0Mem, non0InitMem)

	// If Overhead is being utilized, add to the non-zero cpu/memory tracking for the pod. It has already been added
	// into ScalarResources since it is part of requests
	if pod.Spec.Overhead != nil {
		if _, found := pod.Spec.Overhead[v1.ResourceCPU]; found {
			non0CPU += pod.Spec.Overhead.Cpu().MilliValue()
		}

		if _, found := pod.Spec.Overhead[v1.ResourceMemory]; found {
			non0Mem += pod.Spec.Overhead.Memory().Value()
		}
	}
	var res Resource
	res.Add(requests)
	return res, non0CPU, non0Mem
}

// updateUsedPorts updates the UsedPorts of NodeInfo.
func (n *NodeInfo) updateUsedPorts(pod *v1.Pod, add bool) {
	for _, container := range pod.Spec.Containers {
		for _, podPort := range container.Ports {
			if add {
				n.UsedPorts.Add(podPort.HostIP, string(podPort.Protocol), podPort.HostPort)
			} else {
				n.UsedPorts.Remove(podPort.HostIP, string(podPort.Protocol), podPort.HostPort)
			}
		}
	}
}

// updatePVCRefCounts updates the PVCRefCounts of NodeInfo.
func (n *NodeInfo) updatePVCRefCounts(pod *v1.Pod, add bool) {
	for _, v := range pod.Spec.Volumes {
		if v.PersistentVolumeClaim == nil {
			continue
		}

		key := GetNamespacedName(pod.Namespace, v.PersistentVolumeClaim.ClaimName)
		if add {
			n.PVCRefCounts[key] += 1
		} else {
			n.PVCRefCounts[key] -= 1
			if n.PVCRefCounts[key] <= 0 {
				delete(n.PVCRefCounts, key)
			}
		}
	}
}

// SetNode sets the overall node information.
func (n *NodeInfo) SetNode(node *v1.Node) {
	n.node = node
	n.Allocatable = NewResource(node.Status.Allocatable)
	n.Generation = nextGeneration()
}

// RemoveNode removes the node object, leaving all other tracking information.
func (n *NodeInfo) RemoveNode() {
	n.node = nil
	n.Generation = nextGeneration()
}

// GetPodKey returns the string key of a pod.
func GetPodKey(pod *v1.Pod) (string, error) {
	uid := string(pod.UID)
	if len(uid) == 0 {
		return "", errors.New("cannot get cache key for pod with empty UID")
	}
	return uid, nil
}

// GetNamespacedName returns the string format of a namespaced resource name.
func GetNamespacedName(namespace, name string) string {
	return fmt.Sprintf("%s/%s", namespace, name)
}
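
// A rough lifecycle sketch (illustrative caller code) showing how the scheduler's cache
// typically drives a NodeInfo. The node, pod, and logger variables are hypothetical.
//
//	ni := NewNodeInfo()
//	ni.SetNode(node) // records allocatable resources and bumps Generation
//	ni.AddPod(pod)   // updates Requested, NonZeroRequested, UsedPorts, and PVCRefCounts
//	snapshot := ni.Snapshot() // copy taken for a scheduling cycle; shares PodInfo pointers
//	if err := ni.RemovePod(logger, pod); err != nil {
//		logger.Error(err, "Pod was not tracked on this node", "pod", klog.KObj(pod))
//	}
//	_ = snapshot
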
// DefaultBindAllHostIP defines the default IP address used to bind to all hosts.
const DefaultBindAllHostIP = "0.0.0.0"

// ProtocolPort represents a protocol port pair, e.g. tcp:80.
type ProtocolPort struct {
	Protocol string
	Port     int32
}

// NewProtocolPort creates a ProtocolPort instance.
func NewProtocolPort(protocol string, port int32) *ProtocolPort {
	pp := &ProtocolPort{
		Protocol: protocol,
		Port:     port,
	}

	if len(pp.Protocol) == 0 {
		pp.Protocol = string(v1.ProtocolTCP)
	}

	return pp
}

// HostPortInfo stores a mapping from IP to a set of ProtocolPorts.
type HostPortInfo map[string]map[ProtocolPort]struct{}

// Add adds (ip, protocol, port) to HostPortInfo.
func (h HostPortInfo) Add(ip, protocol string, port int32) {
	if port <= 0 {
		return
	}

	h.sanitize(&ip, &protocol)

	pp := NewProtocolPort(protocol, port)
	if _, ok := h[ip]; !ok {
		h[ip] = map[ProtocolPort]struct{}{
			*pp: {},
		}
		return
	}

	h[ip][*pp] = struct{}{}
}

// Remove removes (ip, protocol, port) from HostPortInfo.
func (h HostPortInfo) Remove(ip, protocol string, port int32) {
	if port <= 0 {
		return
	}

	h.sanitize(&ip, &protocol)

	pp := NewProtocolPort(protocol, port)
	if m, ok := h[ip]; ok {
		delete(m, *pp)
		if len(h[ip]) == 0 {
			delete(h, ip)
		}
	}
}

// Len returns the total number of (ip, protocol, port) tuples in HostPortInfo.
func (h HostPortInfo) Len() int {
	length := 0
	for _, m := range h {
		length += len(m)
	}
	return length
}

// CheckConflict checks if the input (ip, protocol, port) conflicts with the existing
// ones in HostPortInfo.
func (h HostPortInfo) CheckConflict(ip, protocol string, port int32) bool {
	if port <= 0 {
		return false
	}

	h.sanitize(&ip, &protocol)

	pp := NewProtocolPort(protocol, port)

	// If ip is 0.0.0.0, check every IP's (protocol, port) pairs.
	if ip == DefaultBindAllHostIP {
		for _, m := range h {
			if _, ok := m[*pp]; ok {
				return true
			}
		}
		return false
	}

	// If ip isn't 0.0.0.0, only check the given IP's and 0.0.0.0's (protocol, port) pairs.
	for _, key := range []string{DefaultBindAllHostIP, ip} {
		if m, ok := h[key]; ok {
			if _, ok2 := m[*pp]; ok2 {
				return true
			}
		}
	}

	return false
}

// sanitize the parameters.
func (h HostPortInfo) sanitize(ip, protocol *string) {
	if len(*ip) == 0 {
		*ip = DefaultBindAllHostIP
	}
	if len(*protocol) == 0 {
		*protocol = string(v1.ProtocolTCP)
	}
}
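
// A brief usage sketch (illustrative caller code) of how HostPortInfo tracks host ports
// and how CheckConflict handles the 0.0.0.0 wildcard.
//
//	hpi := make(HostPortInfo)
//	hpi.Add("", "TCP", 8080)                        // empty IP is sanitized to 0.0.0.0
//	_ = hpi.CheckConflict("127.0.0.1", "TCP", 8080) // true: 0.0.0.0 conflicts with any IP
//	_ = hpi.CheckConflict("127.0.0.1", "UDP", 8080) // false: different protocol
//	hpi.Remove("0.0.0.0", "TCP", 8080)
//	_ = hpi.Len() // 0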