k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/scheduler/framework/types.go

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package framework

import (
	"errors"
	"fmt"
	"sort"
	"strings"
	"sync/atomic"
	"time"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	utilerrors "k8s.io/apimachinery/pkg/util/errors"
	"k8s.io/apimachinery/pkg/util/sets"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/klog/v2"

	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	resourcehelper "k8s.io/kubernetes/pkg/api/v1/resource"
	"k8s.io/kubernetes/pkg/features"
	schedutil "k8s.io/kubernetes/pkg/scheduler/util"
)

var generation int64

// ActionType is an integer to represent one type of resource change.
// Different ActionTypes can be combined bit-wise to compose new semantics.
type ActionType int64

// Constants for ActionTypes.
const (
	Add    ActionType = 1 << iota // 1
	Delete                        // 10
	// UpdateNodeXYZ is only applicable for Node events.
	UpdateNodeAllocatable // 100
	UpdateNodeLabel       // 1000
	UpdateNodeTaint       // 10000
	UpdateNodeCondition   // 100000
	UpdateNodeAnnotation  // 1000000

	All ActionType = 1<<iota - 1 // 1111111

	// Use the general Update type if you don't know or don't care about the specific sub-Update type to use.
	Update = UpdateNodeAllocatable | UpdateNodeLabel | UpdateNodeTaint | UpdateNodeCondition | UpdateNodeAnnotation
)
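
// The ActionType values above are bit flags, so interest in several kinds of
// change can be expressed by OR-ing them together and tested with a bitwise AND.
// exampleComposeActionTypes is a minimal illustrative sketch of that usage; it is
// not referenced by the scheduler, and the variable names are hypothetical.
func exampleComposeActionTypes() bool {
	// A plugin that only cares about label or taint changes on Nodes.
	interested := UpdateNodeLabel | UpdateNodeTaint
	// The change that actually happened in the cluster.
	happened := UpdateNodeTaint
	// A non-zero intersection means the event is relevant to the plugin.
	return interested&happened != 0
}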

// GVK is short for group/version/kind, which can uniquely represent a particular API resource.
type GVK string

// Constants for GVKs.
const (
	// A couple of notes about how the scheduler handles Pod events:
	// - Add: add events could be triggered by either a newly created Pod or an existing Pod that is scheduled to a Node.
	// - Delete: delete events could be triggered by:
	//           - a Pod that is deleted
	//           - a Pod that was assumed, but gets un-assumed due to some errors in the binding cycle.
	//           - an existing Pod that was unscheduled but gets scheduled to a Node.
	Pod GVK = "Pod"
	// A note about the NodeAdd event and the UpdateNodeTaint event:
	// The NodeAdd QueueingHint isn't always called because of an internal feature called preCheck.
	// That is definitely not something plugin developers expect,
	// and registering the UpdateNodeTaint event is the only mitigation for now.
	// So, kube-scheduler registers the UpdateNodeTaint event for plugins that have the NodeAdd event
	// but don't have the UpdateNodeTaint event.
	// This hurts requeuing efficiency, but it is a lot better than having some Pods stuck in the
	// unschedulable pod pool.
	// This behavior will be removed when we remove the preCheck feature.
	// See: https://github.com/kubernetes/kubernetes/issues/110175
	Node                    GVK = "Node"
	PersistentVolume        GVK = "PersistentVolume"
	PersistentVolumeClaim   GVK = "PersistentVolumeClaim"
	CSINode                 GVK = "storage.k8s.io/CSINode"
	CSIDriver               GVK = "storage.k8s.io/CSIDriver"
	CSIStorageCapacity      GVK = "storage.k8s.io/CSIStorageCapacity"
	StorageClass            GVK = "storage.k8s.io/StorageClass"
	PodSchedulingContext    GVK = "PodSchedulingContext"
	ResourceClaim           GVK = "ResourceClaim"
	ResourceClass           GVK = "ResourceClass"
	ResourceClaimParameters GVK = "ResourceClaimParameters"
	ResourceClassParameters GVK = "ResourceClassParameters"

	// WildCard is a special GVK to match all resources.
	// e.g., If you register `{Resource: "*", ActionType: All}` in EventsToRegister,
	// all incoming clusterEvents will be admitted. Be careful when registering it: it will
	// increase the computing pressure in requeueing, so only do so if you really need it.
	//
	// Meanwhile, if the incoming clusterEvent is a wildcard one, all pods
	// will be moved from the unschedulablePod pool to activeQ/backoffQ forcibly.
	WildCard GVK = "*"
)

type ClusterEventWithHint struct {
	Event ClusterEvent
	// QueueingHintFn is executed for a Pod rejected by this plugin when the above Event happens,
	// and filters out events to reduce useless retries of the Pod's scheduling.
	// It's an optional field. If not set,
	// the scheduling of Pods will always be retried with backoff when this Event happens.
	// (the same as Queue)
	QueueingHintFn QueueingHintFn
}

// QueueingHintFn returns a hint that signals whether the event can make a Pod,
// which was rejected by this plugin in the past scheduling cycle, schedulable or not.
// It's called before a Pod gets moved from unschedulableQ to backoffQ or activeQ.
// If it returns an error, the caller takes the QueueingHint as `Queue`, whatever is returned here, so that
// we can prevent the Pod from being stuck in the unschedulable pod pool.
//
// - `pod`: the Pod to be enqueued, which was rejected by this plugin in the past.
// - `oldObj` `newObj`: the object involved in that event.
//   - For example, if the given event is "Node deleted", `oldObj` will be that deleted Node.
//   - `oldObj` is nil if the event is an add event.
//   - `newObj` is nil if the event is a delete event.
type QueueingHintFn func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (QueueingHint, error)

type QueueingHint int

const (
	// QueueSkip implies that the cluster event has no impact on
	// scheduling of the pod.
	QueueSkip QueueingHint = iota

	// Queue implies that the Pod may be schedulable by the event.
	Queue
)

func (s QueueingHint) String() string {
	switch s {
	case QueueSkip:
		return "QueueSkip"
	case Queue:
		return "Queue"
	}
	return ""
}

// ClusterEvent abstracts how a system resource's state gets changed.
// Resource represents the standard API resources such as Pod, Node, etc.
// ActionType denotes the specific change such as Add, Update or Delete.
type ClusterEvent struct {
	Resource   GVK
	ActionType ActionType
	Label      string
}

// IsWildCard returns true if ClusterEvent follows WildCard semantics
func (ce ClusterEvent) IsWildCard() bool {
	return ce.Resource == WildCard && ce.ActionType == All
}
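
// exampleClusterEventWithHint is a minimal illustrative sketch of how a plugin
// could pair a ClusterEvent with a QueueingHintFn in its EventsToRegister. The
// "interesting label" logic and the zoneLabel constant are hypothetical, not part
// of this package; a real plugin would inspect whatever actually rejected the Pod.
func exampleClusterEventWithHint() ClusterEventWithHint {
	const zoneLabel = "topology.kubernetes.io/zone"
	return ClusterEventWithHint{
		Event: ClusterEvent{Resource: Node, ActionType: Add | UpdateNodeLabel},
		QueueingHintFn: func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (QueueingHint, error) {
			newNode, ok := newObj.(*v1.Node)
			if !ok {
				// Returning an error makes the caller fall back to Queue, which is
				// the safe direction: the Pod gets retried instead of getting stuck.
				return Queue, fmt.Errorf("expected *v1.Node, got %T", newObj)
			}
			if oldNode, ok := oldObj.(*v1.Node); ok && oldNode.Labels[zoneLabel] == newNode.Labels[zoneLabel] {
				// The label this hypothetical plugin cares about didn't change,
				// so the event cannot make the Pod schedulable.
				return QueueSkip, nil
			}
			return Queue, nil
		},
	}
}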

// Match returns true if the ClusterEvent is matched with the incoming event.
// If ce.Resource is "*", there's no requirement on the incoming event's Resource.
// Conversely, if the incoming event's Resource is "*", ce.Resource must also be "*" to match.
//
// Note: we have a special case here when the incoming event is a wildcard event:
// it will force all Pods to move to activeQ/backoffQ,
// but we take it as an unmatched event unless ce is also a wildcard one.
func (ce ClusterEvent) Match(event ClusterEvent) bool {
	return ce.IsWildCard() || (ce.Resource == WildCard || ce.Resource == event.Resource) && ce.ActionType&event.ActionType != 0
}
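
// exampleMatch is a minimal illustrative sketch of the Match semantics with
// hypothetical events: a registered {Node, Update} entry matches a concrete
// UpdateNodeLabel event, because Update is the union of all UpdateNodeXYZ flags,
// while a Pod event does not match it at all.
func exampleMatch() (bool, bool) {
	registered := ClusterEvent{Resource: Node, ActionType: Update}
	nodeLabelChanged := ClusterEvent{Resource: Node, ActionType: UpdateNodeLabel}
	podAdded := ClusterEvent{Resource: Pod, ActionType: Add}
	return registered.Match(nodeLabelChanged), registered.Match(podAdded) // true, false
}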

func UnrollWildCardResource() []ClusterEventWithHint {
	return []ClusterEventWithHint{
		{Event: ClusterEvent{Resource: Pod, ActionType: All}},
		{Event: ClusterEvent{Resource: Node, ActionType: All}},
		{Event: ClusterEvent{Resource: PersistentVolume, ActionType: All}},
		{Event: ClusterEvent{Resource: PersistentVolumeClaim, ActionType: All}},
		{Event: ClusterEvent{Resource: CSINode, ActionType: All}},
		{Event: ClusterEvent{Resource: CSIDriver, ActionType: All}},
		{Event: ClusterEvent{Resource: CSIStorageCapacity, ActionType: All}},
		{Event: ClusterEvent{Resource: StorageClass, ActionType: All}},
		{Event: ClusterEvent{Resource: PodSchedulingContext, ActionType: All}},
		{Event: ClusterEvent{Resource: ResourceClaim, ActionType: All}},
		{Event: ClusterEvent{Resource: ResourceClass, ActionType: All}},
		{Event: ClusterEvent{Resource: ResourceClaimParameters, ActionType: All}},
		{Event: ClusterEvent{Resource: ResourceClassParameters, ActionType: All}},
	}
}

// QueuedPodInfo is a Pod wrapper with additional information related to
// the pod's status in the scheduling queue, such as the timestamp when
// it was added to the queue.
type QueuedPodInfo struct {
	*PodInfo
	// The time the pod was added to the scheduling queue.
	Timestamp time.Time
	// Number of schedule attempts before successfully scheduled.
	// It's used to record the number of attempts metric.
	Attempts int
	// The time when the pod was added to the queue for the first time. The pod may be added
	// back to the queue multiple times before it's successfully scheduled.
	// It shouldn't be updated once initialized. It's used to record the e2e scheduling
	// latency for a pod.
	InitialAttemptTimestamp *time.Time
	// UnschedulablePlugins records the plugin names that the Pod failed with Unschedulable or UnschedulableAndUnresolvable status.
	// It's registered only when the Pod is rejected in PreFilter, Filter, Reserve, or Permit (WaitOnPermit).
	UnschedulablePlugins sets.Set[string]
	// PendingPlugins records the plugin names that the Pod failed with Pending status.
	PendingPlugins sets.Set[string]
	// Whether the Pod is scheduling gated (by PreEnqueuePlugins) or not.
	Gated bool
}

// DeepCopy returns a deep copy of the QueuedPodInfo object.
func (pqi *QueuedPodInfo) DeepCopy() *QueuedPodInfo {
	return &QueuedPodInfo{
		PodInfo:                 pqi.PodInfo.DeepCopy(),
		Timestamp:               pqi.Timestamp,
		Attempts:                pqi.Attempts,
		InitialAttemptTimestamp: pqi.InitialAttemptTimestamp,
		UnschedulablePlugins:    pqi.UnschedulablePlugins.Clone(),
		Gated:                   pqi.Gated,
	}
}

// PodInfo is a wrapper to a Pod with additional pre-computed information to
// accelerate processing. This information is typically immutable (e.g., pre-processed
// inter-pod affinity selectors).
type PodInfo struct {
	Pod                        *v1.Pod
	RequiredAffinityTerms      []AffinityTerm
	RequiredAntiAffinityTerms  []AffinityTerm
	PreferredAffinityTerms     []WeightedAffinityTerm
	PreferredAntiAffinityTerms []WeightedAffinityTerm
}

// DeepCopy returns a deep copy of the PodInfo object.
func (pi *PodInfo) DeepCopy() *PodInfo {
	return &PodInfo{
		Pod:                        pi.Pod.DeepCopy(),
		RequiredAffinityTerms:      pi.RequiredAffinityTerms,
		RequiredAntiAffinityTerms:  pi.RequiredAntiAffinityTerms,
		PreferredAffinityTerms:     pi.PreferredAffinityTerms,
		PreferredAntiAffinityTerms: pi.PreferredAntiAffinityTerms,
	}
}

// Update creates a fully new PodInfo by default, and only updates the pod in place
// when the PodInfo has already been instantiated and the passed pod is the exact
// same one as the original pod.
func (pi *PodInfo) Update(pod *v1.Pod) error {
	if pod != nil && pi.Pod != nil && pi.Pod.UID == pod.UID {
		// PodInfo includes immutable information, so it is safe to update the pod in place if it is
		// the exact same pod.
		pi.Pod = pod
		return nil
	}
	var preferredAffinityTerms []v1.WeightedPodAffinityTerm
	var preferredAntiAffinityTerms []v1.WeightedPodAffinityTerm
	if affinity := pod.Spec.Affinity; affinity != nil {
		if a := affinity.PodAffinity; a != nil {
			preferredAffinityTerms = a.PreferredDuringSchedulingIgnoredDuringExecution
		}
		if a := affinity.PodAntiAffinity; a != nil {
			preferredAntiAffinityTerms = a.PreferredDuringSchedulingIgnoredDuringExecution
		}
	}

	// Attempt to parse the affinity terms.
	var parseErrs []error
	requiredAffinityTerms, err := GetAffinityTerms(pod, GetPodAffinityTerms(pod.Spec.Affinity))
	if err != nil {
		parseErrs = append(parseErrs, fmt.Errorf("requiredAffinityTerms: %w", err))
	}
	requiredAntiAffinityTerms, err := GetAffinityTerms(pod,
		GetPodAntiAffinityTerms(pod.Spec.Affinity))
	if err != nil {
		parseErrs = append(parseErrs, fmt.Errorf("requiredAntiAffinityTerms: %w", err))
	}
	weightedAffinityTerms, err := getWeightedAffinityTerms(pod, preferredAffinityTerms)
	if err != nil {
		parseErrs = append(parseErrs, fmt.Errorf("preferredAffinityTerms: %w", err))
	}
	weightedAntiAffinityTerms, err := getWeightedAffinityTerms(pod, preferredAntiAffinityTerms)
	if err != nil {
		parseErrs = append(parseErrs, fmt.Errorf("preferredAntiAffinityTerms: %w", err))
	}

	pi.Pod = pod
	pi.RequiredAffinityTerms = requiredAffinityTerms
	pi.RequiredAntiAffinityTerms = requiredAntiAffinityTerms
	pi.PreferredAffinityTerms = weightedAffinityTerms
	pi.PreferredAntiAffinityTerms = weightedAntiAffinityTerms
	return utilerrors.NewAggregate(parseErrs)
}

// AffinityTerm is a processed version of v1.PodAffinityTerm.
type AffinityTerm struct {
	Namespaces        sets.Set[string]
	Selector          labels.Selector
	TopologyKey       string
	NamespaceSelector labels.Selector
}

// Matches returns true if the pod matches the label selector and namespaces or namespace selector.
func (at *AffinityTerm) Matches(pod *v1.Pod, nsLabels labels.Set) bool {
	if at.Namespaces.Has(pod.Namespace) || at.NamespaceSelector.Matches(nsLabels) {
		return at.Selector.Matches(labels.Set(pod.Labels))
	}
	return false
}
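
// exampleAffinityTermMatches is a minimal illustrative sketch of evaluating a
// pre-processed AffinityTerm against a candidate Pod; the namespace, labels, and
// selector here are hypothetical values, not anything the scheduler produces.
func exampleAffinityTermMatches() bool {
	term := AffinityTerm{
		Namespaces:  sets.New("default"),
		Selector:    labels.SelectorFromSet(labels.Set{"app": "web"}),
		TopologyKey: "kubernetes.io/hostname",
		// labels.Nothing() matches no namespace labels, so only the explicit
		// Namespaces set above is considered.
		NamespaceSelector: labels.Nothing(),
	}
	candidate := &v1.Pod{}
	candidate.Namespace = "default"
	candidate.Labels = map[string]string{"app": "web"}
	// nsLabels (the candidate namespace's labels) is irrelevant here because the
	// namespace name already matches, so nil is passed.
	return term.Matches(candidate, nil) // true
}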

// WeightedAffinityTerm is a "processed" representation of v1.WeightedAffinityTerm.
type WeightedAffinityTerm struct {
	AffinityTerm
	Weight int32
}

// ExtenderName is a fake plugin name put in UnschedulablePlugins when an Extender rejects some Nodes.
const ExtenderName = "Extender"

// Diagnosis records the details to diagnose a scheduling failure.
type Diagnosis struct {
	// NodeToStatusMap records the status of each node
	// if they're rejected in PreFilter (via PreFilterResult) or Filter plugins.
	// Nodes that pass PreFilter/Filter plugins are not included in this map.
	NodeToStatusMap NodeToStatusMap
	// UnschedulablePlugins are plugins that return Unschedulable or UnschedulableAndUnresolvable.
	UnschedulablePlugins sets.Set[string]
	// PendingPlugins are plugins that return Pending.
	PendingPlugins sets.Set[string]
	// PreFilterMsg records the messages returned from PreFilter plugins.
	PreFilterMsg string
	// PostFilterMsg records the messages returned from PostFilter plugins.
	PostFilterMsg string
	// EvaluatedNodes records the number of nodes evaluated by the Filter stage.
	// It is used for debugging purposes only.
	EvaluatedNodes int
}

// FitError describes a fit error of a pod.
type FitError struct {
	Pod         *v1.Pod
	NumAllNodes int
	Diagnosis   Diagnosis
}

const (
	// NoNodeAvailableMsg is used to format the message when no nodes are available.
	NoNodeAvailableMsg = "0/%v nodes are available"
)

func (d *Diagnosis) AddPluginStatus(sts *Status) {
	if sts.Plugin() == "" {
		return
	}
	if sts.IsRejected() {
		if d.UnschedulablePlugins == nil {
			d.UnschedulablePlugins = sets.New[string]()
		}
		d.UnschedulablePlugins.Insert(sts.Plugin())
	}
	if sts.Code() == Pending {
		if d.PendingPlugins == nil {
			d.PendingPlugins = sets.New[string]()
		}
		d.PendingPlugins.Insert(sts.Plugin())
	}
}

// Error returns detailed information of why the pod failed to fit on each node.
// The message format is "0/X nodes are available: <PreFilterMsg>. <FilterMsg>. <PostFilterMsg>."
func (f *FitError) Error() string {
	reasonMsg := fmt.Sprintf(NoNodeAvailableMsg+":", f.NumAllNodes)
	preFilterMsg := f.Diagnosis.PreFilterMsg
	if preFilterMsg != "" {
		// A PreFilter plugin returned unschedulable.
		// Add the messages from PreFilter plugins to reasonMsg.
		reasonMsg += fmt.Sprintf(" %v.", preFilterMsg)
	}

	if preFilterMsg == "" {
		// The scheduling cycle went through the PreFilter extension point successfully.
		//
		// When a PreFilter plugin returns unschedulable,
		// the scheduling framework inserts the same unschedulable status into all nodes in NodeToStatusMap.
		// So, we shouldn't add the message from NodeToStatusMap when PreFilter failed.
		// Otherwise, we would have duplicated reasons in the error message.
		reasons := make(map[string]int)
		for _, status := range f.Diagnosis.NodeToStatusMap {
			for _, reason := range status.Reasons() {
				reasons[reason]++
			}
		}

		sortReasonsHistogram := func() []string {
			var reasonStrings []string
			for k, v := range reasons {
				reasonStrings = append(reasonStrings, fmt.Sprintf("%v %v", v, k))
			}
			sort.Strings(reasonStrings)
			return reasonStrings
		}
		sortedFilterMsg := sortReasonsHistogram()
		if len(sortedFilterMsg) != 0 {
			reasonMsg += fmt.Sprintf(" %v.", strings.Join(sortedFilterMsg, ", "))
		}
	}

	// Add the messages from PostFilter plugins to reasonMsg.
	// We can add this message regardless of whether the scheduling cycle fails at PreFilter or Filter
	// since we may run PostFilter (if enabled) in both cases.
	postFilterMsg := f.Diagnosis.PostFilterMsg
	if postFilterMsg != "" {
		reasonMsg += fmt.Sprintf(" %v", postFilterMsg)
	}
	return reasonMsg
}
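
// exampleFitErrorMessage is a minimal illustrative sketch of the message produced
// by FitError.Error; the pod, node count, and diagnosis text are hypothetical.
func exampleFitErrorMessage() string {
	err := &FitError{
		Pod:         &v1.Pod{},
		NumAllNodes: 3,
		Diagnosis: Diagnosis{
			PreFilterMsg: "node(s) didn't satisfy plugin Foo",
		},
	}
	// Because PreFilterMsg is set, the per-node Filter reasons are skipped and the
	// result is "0/3 nodes are available: node(s) didn't satisfy plugin Foo.".
	return err.Error()
}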

func newAffinityTerm(pod *v1.Pod, term *v1.PodAffinityTerm) (*AffinityTerm, error) {
	selector, err := metav1.LabelSelectorAsSelector(term.LabelSelector)
	if err != nil {
		return nil, err
	}

	namespaces := getNamespacesFromPodAffinityTerm(pod, term)
	nsSelector, err := metav1.LabelSelectorAsSelector(term.NamespaceSelector)
	if err != nil {
		return nil, err
	}

	return &AffinityTerm{Namespaces: namespaces, Selector: selector, TopologyKey: term.TopologyKey, NamespaceSelector: nsSelector}, nil
}

// GetAffinityTerms receives a Pod and affinity terms and returns the namespaces and
// selectors of the terms.
func GetAffinityTerms(pod *v1.Pod, v1Terms []v1.PodAffinityTerm) ([]AffinityTerm, error) {
	if v1Terms == nil {
		return nil, nil
	}

	var terms []AffinityTerm
	for i := range v1Terms {
		t, err := newAffinityTerm(pod, &v1Terms[i])
		if err != nil {
			// We get here if the label selector failed to process.
			return nil, err
		}
		terms = append(terms, *t)
	}
	return terms, nil
}

// getWeightedAffinityTerms returns the list of processed affinity terms.
func getWeightedAffinityTerms(pod *v1.Pod, v1Terms []v1.WeightedPodAffinityTerm) ([]WeightedAffinityTerm, error) {
	if v1Terms == nil {
		return nil, nil
	}

	var terms []WeightedAffinityTerm
	for i := range v1Terms {
		t, err := newAffinityTerm(pod, &v1Terms[i].PodAffinityTerm)
		if err != nil {
			// We get here if the label selector failed to process.
			return nil, err
		}
		terms = append(terms, WeightedAffinityTerm{AffinityTerm: *t, Weight: v1Terms[i].Weight})
	}
	return terms, nil
}

// NewPodInfo returns a new PodInfo.
func NewPodInfo(pod *v1.Pod) (*PodInfo, error) {
	pInfo := &PodInfo{}
	err := pInfo.Update(pod)
	return pInfo, err
}
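
// exampleUpdatePodInfo is a minimal illustrative sketch of the PodInfo.Update fast
// path: when the incoming object is the same Pod (same UID), only the Pod pointer
// is swapped and the pre-computed affinity terms are reused; otherwise everything
// is re-computed. The pod arguments here are hypothetical.
func exampleUpdatePodInfo(oldPod, newPod *v1.Pod) (*PodInfo, error) {
	pInfo, err := NewPodInfo(oldPod)
	if err != nil {
		return nil, err
	}
	// Same UID: cheap pointer swap. Different UID: full re-parse of affinity terms.
	return pInfo, pInfo.Update(newPod)
}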

func GetPodAffinityTerms(affinity *v1.Affinity) (terms []v1.PodAffinityTerm) {
	if affinity != nil && affinity.PodAffinity != nil {
		if len(affinity.PodAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 {
			terms = affinity.PodAffinity.RequiredDuringSchedulingIgnoredDuringExecution
		}
		// TODO: Uncomment this block when we implement RequiredDuringSchedulingRequiredDuringExecution.
		// if len(affinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
		//	terms = append(terms, affinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
		// }
	}
	return terms
}

func GetPodAntiAffinityTerms(affinity *v1.Affinity) (terms []v1.PodAffinityTerm) {
	if affinity != nil && affinity.PodAntiAffinity != nil {
		if len(affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 {
			terms = affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution
		}
		// TODO: Uncomment this block when we implement RequiredDuringSchedulingRequiredDuringExecution.
		// if len(affinity.PodAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
		//	terms = append(terms, affinity.PodAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
		// }
	}
	return terms
}

// getNamespacesFromPodAffinityTerm returns a set of names according to the namespaces indicated in podAffinityTerm.
// If namespaces is empty and no namespace selector is set, it considers the given pod's namespace.
func getNamespacesFromPodAffinityTerm(pod *v1.Pod, podAffinityTerm *v1.PodAffinityTerm) sets.Set[string] {
	names := sets.Set[string]{}
	if len(podAffinityTerm.Namespaces) == 0 && podAffinityTerm.NamespaceSelector == nil {
		names.Insert(pod.Namespace)
	} else {
		names.Insert(podAffinityTerm.Namespaces...)
	}
	return names
}
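
// exampleAffinityNamespaceDefaulting is a minimal illustrative sketch of the
// defaulting rule above: with no explicit namespaces and no namespace selector,
// the affinity term is scoped to the pod's own namespace. The namespace and
// topology key values are hypothetical.
func exampleAffinityNamespaceDefaulting() sets.Set[string] {
	pod := &v1.Pod{}
	pod.Namespace = "team-a"
	term := &v1.PodAffinityTerm{TopologyKey: "kubernetes.io/hostname"}
	return getNamespacesFromPodAffinityTerm(pod, term) // {"team-a"}
}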

// ImageStateSummary provides summarized information about the state of an image.
type ImageStateSummary struct {
	// Size of the image.
	Size int64
	// NumNodes tracks how many nodes have this image; it is computed from the Nodes field below
	// during the execution of Snapshot.
	NumNodes int
	// Nodes is a set of node names for nodes having this image present. This field is used for
	// keeping track of the nodes during update/add/remove events.
	Nodes sets.Set[string]
}

// Snapshot returns a copy of the ImageStateSummary without the Nodes field.
func (iss *ImageStateSummary) Snapshot() *ImageStateSummary {
	return &ImageStateSummary{
		Size:     iss.Size,
		NumNodes: iss.Nodes.Len(),
	}
}

// NodeInfo is node level aggregated information.
type NodeInfo struct {
	// Overall node information.
	node *v1.Node

	// Pods running on the node.
	Pods []*PodInfo

	// The subset of pods with affinity.
	PodsWithAffinity []*PodInfo

	// The subset of pods with required anti-affinity.
	PodsWithRequiredAntiAffinity []*PodInfo

	// Ports allocated on the node.
	UsedPorts HostPortInfo

	// Total requested resources of all pods on this node. This includes assumed
	// pods, which the scheduler has sent for binding but which may not be scheduled yet.
	Requested *Resource
	// Total requested resources of all pods on this node with a minimum value
	// applied to each container's CPU and memory requests. This does not reflect
	// the actual resource requests for this node, but is used to avoid scheduling
	// many zero-request pods onto one node.
	NonZeroRequested *Resource
	// We store allocatedResources (which is Node.Status.Allocatable.*) explicitly
	// as int64, to avoid conversions and map accesses.
	Allocatable *Resource

	// ImageStates holds the entry of an image if and only if this image is on the node. The entry can be used for
	// checking an image's existence and advanced usage (e.g., image locality scheduling policy) based on the image
	// state information.
	ImageStates map[string]*ImageStateSummary

	// PVCRefCounts contains a mapping of PVC names to the number of pods on the node using it.
	// Keys are in the format "namespace/name".
	PVCRefCounts map[string]int

	// Whenever NodeInfo changes, generation is bumped.
	// This is used to avoid cloning it if the object didn't change.
	Generation int64
}

// nextGeneration: Let's make sure history never forgets the name...
// Increments the generation number monotonically, ensuring that generation numbers never collide.
// Collision of the generation numbers would be particularly problematic if a node was deleted and
// added back with the same name. See issue #63262.
func nextGeneration() int64 {
	return atomic.AddInt64(&generation, 1)
}

// Resource is a collection of compute resources.
type Resource struct {
	MilliCPU         int64
	Memory           int64
	EphemeralStorage int64
	// We store allowedPodNumber (which is Node.Status.Allocatable.Pods().Value())
	// explicitly as int, to avoid conversions and improve performance.
	AllowedPodNumber int
	// ScalarResources
	ScalarResources map[v1.ResourceName]int64
}

// NewResource creates a Resource from a ResourceList.
func NewResource(rl v1.ResourceList) *Resource {
	r := &Resource{}
	r.Add(rl)
	return r
}

// Add adds a ResourceList into Resource.
func (r *Resource) Add(rl v1.ResourceList) {
	if r == nil {
		return
	}

	for rName, rQuant := range rl {
		switch rName {
		case v1.ResourceCPU:
			r.MilliCPU += rQuant.MilliValue()
		case v1.ResourceMemory:
			r.Memory += rQuant.Value()
		case v1.ResourcePods:
			r.AllowedPodNumber += int(rQuant.Value())
		case v1.ResourceEphemeralStorage:
			r.EphemeralStorage += rQuant.Value()
		default:
			if schedutil.IsScalarResourceName(rName) {
				r.AddScalar(rName, rQuant.Value())
			}
		}
	}
}

// Clone returns a copy of this resource.
func (r *Resource) Clone() *Resource {
	res := &Resource{
		MilliCPU:         r.MilliCPU,
		Memory:           r.Memory,
		AllowedPodNumber: r.AllowedPodNumber,
		EphemeralStorage: r.EphemeralStorage,
	}
	if r.ScalarResources != nil {
		res.ScalarResources = make(map[v1.ResourceName]int64, len(r.ScalarResources))
		for k, v := range r.ScalarResources {
			res.ScalarResources[k] = v
		}
	}
	return res
}

// AddScalar adds the given quantity to the named scalar resource.
func (r *Resource) AddScalar(name v1.ResourceName, quantity int64) {
	r.SetScalar(name, r.ScalarResources[name]+quantity)
}

// SetScalar sets the named scalar resource to the given quantity.
func (r *Resource) SetScalar(name v1.ResourceName, quantity int64) {
	// Lazily allocate the scalar resource map.
	if r.ScalarResources == nil {
		r.ScalarResources = map[v1.ResourceName]int64{}
	}
	r.ScalarResources[name] = quantity
}

// SetMaxResource compares with a ResourceList and takes the max value for each Resource.
func (r *Resource) SetMaxResource(rl v1.ResourceList) {
	if r == nil {
		return
	}

	for rName, rQuantity := range rl {
		switch rName {
		case v1.ResourceMemory:
			r.Memory = max(r.Memory, rQuantity.Value())
		case v1.ResourceCPU:
			r.MilliCPU = max(r.MilliCPU, rQuantity.MilliValue())
		case v1.ResourceEphemeralStorage:
			r.EphemeralStorage = max(r.EphemeralStorage, rQuantity.Value())
		default:
			if schedutil.IsScalarResourceName(rName) {
				r.SetScalar(rName, max(r.ScalarResources[rName], rQuantity.Value()))
			}
		}
	}
}
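
// exampleAccumulateRequests is a minimal illustrative sketch of how per-pod
// request lists are folded into a single Resource: cpu is tracked in milli-units,
// memory and ephemeral-storage in bytes, pods as a count, and any other scalar
// resource name (e.g. an extended resource) lands in ScalarResources. The
// podRequests argument is hypothetical input.
func exampleAccumulateRequests(podRequests []v1.ResourceList) *Resource {
	total := &Resource{}
	for _, rl := range podRequests {
		total.Add(rl)
	}
	return total
}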

// NewNodeInfo returns a ready to use empty NodeInfo object.
// If any pods are given in arguments, their information will be aggregated in
// the returned object.
func NewNodeInfo(pods ...*v1.Pod) *NodeInfo {
	ni := &NodeInfo{
		Requested:        &Resource{},
		NonZeroRequested: &Resource{},
		Allocatable:      &Resource{},
		Generation:       nextGeneration(),
		UsedPorts:        make(HostPortInfo),
		ImageStates:      make(map[string]*ImageStateSummary),
		PVCRefCounts:     make(map[string]int),
	}
	for _, pod := range pods {
		ni.AddPod(pod)
	}
	return ni
}

// Node returns overall information about this node.
func (n *NodeInfo) Node() *v1.Node {
	if n == nil {
		return nil
	}
	return n.node
}

// Snapshot returns a copy of this node, except that ImageStates is copied without the Nodes field.
func (n *NodeInfo) Snapshot() *NodeInfo {
	clone := &NodeInfo{
		node:             n.node,
		Requested:        n.Requested.Clone(),
		NonZeroRequested: n.NonZeroRequested.Clone(),
		Allocatable:      n.Allocatable.Clone(),
		UsedPorts:        make(HostPortInfo),
		ImageStates:      make(map[string]*ImageStateSummary),
		PVCRefCounts:     make(map[string]int),
		Generation:       n.Generation,
	}
	if len(n.Pods) > 0 {
		clone.Pods = append([]*PodInfo(nil), n.Pods...)
	}
	if len(n.UsedPorts) > 0 {
		// HostPortInfo is a map-in-map struct,
		// so make sure it's deep copied.
		for ip, portMap := range n.UsedPorts {
			clone.UsedPorts[ip] = make(map[ProtocolPort]struct{})
			for protocolPort, v := range portMap {
				clone.UsedPorts[ip][protocolPort] = v
			}
		}
	}
	if len(n.PodsWithAffinity) > 0 {
		clone.PodsWithAffinity = append([]*PodInfo(nil), n.PodsWithAffinity...)
	}
	if len(n.PodsWithRequiredAntiAffinity) > 0 {
		clone.PodsWithRequiredAntiAffinity = append([]*PodInfo(nil), n.PodsWithRequiredAntiAffinity...)
	}
	if len(n.ImageStates) > 0 {
		state := make(map[string]*ImageStateSummary, len(n.ImageStates))
		for imageName, imageState := range n.ImageStates {
			state[imageName] = imageState.Snapshot()
		}
		clone.ImageStates = state
	}
	for key, value := range n.PVCRefCounts {
		clone.PVCRefCounts[key] = value
	}
	return clone
}

// String returns a human-readable representation of this NodeInfo.
func (n *NodeInfo) String() string {
	podKeys := make([]string, len(n.Pods))
	for i, p := range n.Pods {
		podKeys[i] = p.Pod.Name
	}
	return fmt.Sprintf("&NodeInfo{Pods:%v, RequestedResource:%#v, NonZeroRequest: %#v, UsedPort: %#v, AllocatableResource:%#v}",
		podKeys, n.Requested, n.NonZeroRequested, n.UsedPorts, n.Allocatable)
}

// AddPodInfo adds pod information to this NodeInfo.
// Consider using this instead of AddPod if a PodInfo is already computed.
func (n *NodeInfo) AddPodInfo(podInfo *PodInfo) {
	n.Pods = append(n.Pods, podInfo)
	if podWithAffinity(podInfo.Pod) {
		n.PodsWithAffinity = append(n.PodsWithAffinity, podInfo)
	}
	if podWithRequiredAntiAffinity(podInfo.Pod) {
		n.PodsWithRequiredAntiAffinity = append(n.PodsWithRequiredAntiAffinity, podInfo)
	}
	n.update(podInfo.Pod, 1)
}

// AddPod is a wrapper around AddPodInfo.
func (n *NodeInfo) AddPod(pod *v1.Pod) {
	// Ignore this error since the apiserver doesn't properly validate affinity terms
	// and we can't fix the validation for backwards compatibility.
	podInfo, _ := NewPodInfo(pod)
	n.AddPodInfo(podInfo)
}

func podWithAffinity(p *v1.Pod) bool {
	affinity := p.Spec.Affinity
	return affinity != nil && (affinity.PodAffinity != nil || affinity.PodAntiAffinity != nil)
}

func podWithRequiredAntiAffinity(p *v1.Pod) bool {
	affinity := p.Spec.Affinity
	return affinity != nil && affinity.PodAntiAffinity != nil &&
		len(affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0
}

func removeFromSlice(logger klog.Logger, s []*PodInfo, k string) ([]*PodInfo, bool) {
	var removed bool
	for i := range s {
		tmpKey, err := GetPodKey(s[i].Pod)
		if err != nil {
			logger.Error(err, "Cannot get pod key", "pod", klog.KObj(s[i].Pod))
			continue
		}
		if k == tmpKey {
			// Delete the element.
			s[i] = s[len(s)-1]
			s = s[:len(s)-1]
			removed = true
			break
		}
	}
	// Reset the slice to nil so that we can do DeepEqual in unit tests.
	if len(s) == 0 {
		return nil, removed
	}
	return s, removed
}

// RemovePod subtracts pod information from this NodeInfo.
func (n *NodeInfo) RemovePod(logger klog.Logger, pod *v1.Pod) error {
	k, err := GetPodKey(pod)
	if err != nil {
		return err
	}
	if podWithAffinity(pod) {
		n.PodsWithAffinity, _ = removeFromSlice(logger, n.PodsWithAffinity, k)
	}
	if podWithRequiredAntiAffinity(pod) {
		n.PodsWithRequiredAntiAffinity, _ = removeFromSlice(logger, n.PodsWithRequiredAntiAffinity, k)
	}

	var removed bool
	if n.Pods, removed = removeFromSlice(logger, n.Pods, k); removed {
		n.update(pod, -1)
		return nil
	}
	return fmt.Errorf("no corresponding pod %s in pods of node %s", pod.Name, n.node.Name)
}
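
// exampleNodeInfoAccounting is a minimal illustrative sketch of the add/remove
// bookkeeping on NodeInfo; the node and pod arguments are hypothetical, and the
// pod must carry a UID so RemovePod can find it again by key.
func exampleNodeInfoAccounting(logger klog.Logger, node *v1.Node, pod *v1.Pod) error {
	ni := NewNodeInfo()
	ni.SetNode(node)
	// AddPod accumulates Requested/NonZeroRequested, UsedPorts, and PVCRefCounts,
	// and bumps Generation.
	ni.AddPod(pod)
	// RemovePod reverses exactly the same accounting.
	return ni.RemovePod(logger, pod)
}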

// update node info based on the pod and sign.
// The sign will be set to `+1` when AddPod and to `-1` when RemovePod.
func (n *NodeInfo) update(pod *v1.Pod, sign int64) {
	res, non0CPU, non0Mem := calculateResource(pod)
	n.Requested.MilliCPU += sign * res.MilliCPU
	n.Requested.Memory += sign * res.Memory
	n.Requested.EphemeralStorage += sign * res.EphemeralStorage
	if n.Requested.ScalarResources == nil && len(res.ScalarResources) > 0 {
		n.Requested.ScalarResources = map[v1.ResourceName]int64{}
	}
	for rName, rQuant := range res.ScalarResources {
		n.Requested.ScalarResources[rName] += sign * rQuant
	}
	n.NonZeroRequested.MilliCPU += sign * non0CPU
	n.NonZeroRequested.Memory += sign * non0Mem

	// Consume ports when a pod is added, or release ports when a pod is removed.
	n.updateUsedPorts(pod, sign > 0)
	n.updatePVCRefCounts(pod, sign > 0)

	n.Generation = nextGeneration()
}

func calculateResource(pod *v1.Pod) (Resource, int64, int64) {
	var non0InitCPU, non0InitMem int64
	var non0CPU, non0Mem int64
	requests := resourcehelper.PodRequests(pod, resourcehelper.PodResourcesOptions{
		InPlacePodVerticalScalingEnabled: utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
		ContainerFn: func(requests v1.ResourceList, containerType podutil.ContainerType) {
			non0CPUReq, non0MemReq := schedutil.GetNonzeroRequests(&requests)
			switch containerType {
			case podutil.Containers:
				non0CPU += non0CPUReq
				non0Mem += non0MemReq
			case podutil.InitContainers:
				non0InitCPU = max(non0InitCPU, non0CPUReq)
				non0InitMem = max(non0InitMem, non0MemReq)
			}
		},
	})

	non0CPU = max(non0CPU, non0InitCPU)
	non0Mem = max(non0Mem, non0InitMem)

	// If Overhead is being utilized, add it to the non-zero cpu/memory tracking for the pod. It has already been added
	// into ScalarResources since it is part of requests.
	if pod.Spec.Overhead != nil {
		if _, found := pod.Spec.Overhead[v1.ResourceCPU]; found {
			non0CPU += pod.Spec.Overhead.Cpu().MilliValue()
		}

		if _, found := pod.Spec.Overhead[v1.ResourceMemory]; found {
			non0Mem += pod.Spec.Overhead.Memory().Value()
		}
	}
	var res Resource
	res.Add(requests)
	return res, non0CPU, non0Mem
}

// updateUsedPorts updates the UsedPorts of NodeInfo.
func (n *NodeInfo) updateUsedPorts(pod *v1.Pod, add bool) {
	for _, container := range pod.Spec.Containers {
		for _, podPort := range container.Ports {
			if add {
				n.UsedPorts.Add(podPort.HostIP, string(podPort.Protocol), podPort.HostPort)
			} else {
				n.UsedPorts.Remove(podPort.HostIP, string(podPort.Protocol), podPort.HostPort)
			}
		}
	}
}

// updatePVCRefCounts updates the PVCRefCounts of NodeInfo.
func (n *NodeInfo) updatePVCRefCounts(pod *v1.Pod, add bool) {
	for _, v := range pod.Spec.Volumes {
		if v.PersistentVolumeClaim == nil {
			continue
		}

		key := GetNamespacedName(pod.Namespace, v.PersistentVolumeClaim.ClaimName)
		if add {
			n.PVCRefCounts[key] += 1
		} else {
			n.PVCRefCounts[key] -= 1
			if n.PVCRefCounts[key] <= 0 {
				delete(n.PVCRefCounts, key)
			}
		}
	}
}

// SetNode sets the overall node information.
func (n *NodeInfo) SetNode(node *v1.Node) {
	n.node = node
	n.Allocatable = NewResource(node.Status.Allocatable)
	n.Generation = nextGeneration()
}

// RemoveNode removes the node object, leaving all other tracking information.
func (n *NodeInfo) RemoveNode() {
	n.node = nil
	n.Generation = nextGeneration()
}

// GetPodKey returns the string key of a pod.
func GetPodKey(pod *v1.Pod) (string, error) {
	uid := string(pod.UID)
	if len(uid) == 0 {
		return "", errors.New("cannot get cache key for pod with empty UID")
	}
	return uid, nil
}

// GetNamespacedName returns the string format of a namespaced resource name.
func GetNamespacedName(namespace, name string) string {
	return fmt.Sprintf("%s/%s", namespace, name)
}

// DefaultBindAllHostIP defines the default IP address used to bind to all interfaces.
const DefaultBindAllHostIP = "0.0.0.0"

// ProtocolPort represents a protocol/port pair, e.g. tcp:80.
type ProtocolPort struct {
	Protocol string
	Port     int32
}

// NewProtocolPort creates a ProtocolPort instance.
func NewProtocolPort(protocol string, port int32) *ProtocolPort {
	pp := &ProtocolPort{
		Protocol: protocol,
		Port:     port,
	}

	if len(pp.Protocol) == 0 {
		pp.Protocol = string(v1.ProtocolTCP)
	}

	return pp
}

// HostPortInfo stores a mapping from IP to a set of ProtocolPorts.
type HostPortInfo map[string]map[ProtocolPort]struct{}

// Add adds (ip, protocol, port) to HostPortInfo.
func (h HostPortInfo) Add(ip, protocol string, port int32) {
	if port <= 0 {
		return
	}

	h.sanitize(&ip, &protocol)

	pp := NewProtocolPort(protocol, port)
	if _, ok := h[ip]; !ok {
		h[ip] = map[ProtocolPort]struct{}{
			*pp: {},
		}
		return
	}

	h[ip][*pp] = struct{}{}
}

// Remove removes (ip, protocol, port) from HostPortInfo.
func (h HostPortInfo) Remove(ip, protocol string, port int32) {
	if port <= 0 {
		return
	}

	h.sanitize(&ip, &protocol)

	pp := NewProtocolPort(protocol, port)
	if m, ok := h[ip]; ok {
		delete(m, *pp)
		if len(h[ip]) == 0 {
			delete(h, ip)
		}
	}
}

// Len returns the total number of (ip, protocol, port) tuples in HostPortInfo.
func (h HostPortInfo) Len() int {
	length := 0
	for _, m := range h {
		length += len(m)
	}
	return length
}

// CheckConflict checks if the input (ip, protocol, port) conflicts with the existing
// ones in HostPortInfo.
func (h HostPortInfo) CheckConflict(ip, protocol string, port int32) bool {
	if port <= 0 {
		return false
	}

	h.sanitize(&ip, &protocol)

	pp := NewProtocolPort(protocol, port)

	// If ip is 0.0.0.0, check every IP's (protocol, port) pairs.
	if ip == DefaultBindAllHostIP {
		for _, m := range h {
			if _, ok := m[*pp]; ok {
				return true
			}
		}
		return false
	}

	// If ip isn't 0.0.0.0, only check this IP's and 0.0.0.0's (protocol, port) pairs.
	for _, key := range []string{DefaultBindAllHostIP, ip} {
		if m, ok := h[key]; ok {
			if _, ok2 := m[*pp]; ok2 {
				return true
			}
		}
	}

	return false
}

// sanitize the parameters.
func (h HostPortInfo) sanitize(ip, protocol *string) {
	if len(*ip) == 0 {
		*ip = DefaultBindAllHostIP
	}
	if len(*protocol) == 0 {
		*protocol = string(v1.ProtocolTCP)
	}
}
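
// exampleHostPortConflict is a minimal illustrative sketch of the wildcard
// semantics above: a port registered on 0.0.0.0 conflicts with the same
// protocol/port on any concrete IP; the addresses and ports are hypothetical.
func exampleHostPortConflict() (bool, bool) {
	used := make(HostPortInfo)
	// An empty IP is sanitized to 0.0.0.0 and an empty protocol to TCP.
	used.Add("", "", 8080)
	samePortOnConcreteIP := used.CheckConflict("10.0.0.7", "TCP", 8080) // true
	differentPort := used.CheckConflict("10.0.0.7", "TCP", 9090)       // false
	return samePortOnConcreteIP, differentPort
}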