k8s.io/kubernetes@v1.29.3/pkg/scheduler/framework/types.go

     1  /*
     2  Copyright 2015 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package framework
    18  
    19  import (
    20  	"errors"
    21  	"fmt"
    22  	"sort"
    23  	"strings"
    24  	"sync/atomic"
    25  	"time"
    26  
    27  	v1 "k8s.io/api/core/v1"
    28  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    29  	"k8s.io/apimachinery/pkg/labels"
    30  	utilerrors "k8s.io/apimachinery/pkg/util/errors"
    31  	"k8s.io/apimachinery/pkg/util/sets"
    32  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    33  	"k8s.io/klog/v2"
    34  
    35  	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
    36  	resourcehelper "k8s.io/kubernetes/pkg/api/v1/resource"
    37  	"k8s.io/kubernetes/pkg/features"
    38  	schedutil "k8s.io/kubernetes/pkg/scheduler/util"
    39  )
    40  
    41  var generation int64
    42  
    43  // ActionType is an integer to represent one type of resource change.
    44  // Different ActionTypes can be bitwise OR'ed to compose new semantics.
    45  type ActionType int64
    46  
    47  // Constants for ActionTypes.
    48  const (
    49  	Add    ActionType = 1 << iota // 1
    50  	Delete                        // 10
    51  	// UpdateNodeXYZ is only applicable for Node events.
    52  	UpdateNodeAllocatable // 100
    53  	UpdateNodeLabel       // 1000
    54  	UpdateNodeTaint       // 10000
    55  	UpdateNodeCondition   // 100000
    56  
    57  	All ActionType = 1<<iota - 1 // 111111
    58  
    59  	// Use the general Update type if you don't know or don't care about the specific sub-Update type.
    60  	Update = UpdateNodeAllocatable | UpdateNodeLabel | UpdateNodeTaint | UpdateNodeCondition
    61  )
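
// Illustrative sketch: because ActionTypes are bit flags, a caller can OR them
// together to build an interest mask and test an incoming event's ActionType with
// a bitwise AND. The interest mask and helper below are hypothetical examples,
// not part of this package's API.
func exampleActionTypeMask() bool {
	// A plugin that only cares about Node label or taint changes.
	interested := UpdateNodeLabel | UpdateNodeTaint

	event := ClusterEvent{Resource: Node, ActionType: UpdateNodeTaint}
	// A non-zero result means the event overlaps with the mask and is relevant.
	return event.ActionType&interested != 0
}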
    62  
    63  // GVK is short for group/version/kind, which can uniquely represent a particular API resource.
    64  type GVK string
    65  
    66  // Constants for GVKs.
    67  const (
    68  	Pod                   GVK = "Pod"
    69  	Node                  GVK = "Node"
    70  	PersistentVolume      GVK = "PersistentVolume"
    71  	PersistentVolumeClaim GVK = "PersistentVolumeClaim"
    72  	PodSchedulingContext  GVK = "PodSchedulingContext"
    73  	ResourceClaim         GVK = "ResourceClaim"
    74  	ResourceClass         GVK = "ResourceClass"
    75  	StorageClass          GVK = "storage.k8s.io/StorageClass"
    76  	CSINode               GVK = "storage.k8s.io/CSINode"
    77  	CSIDriver             GVK = "storage.k8s.io/CSIDriver"
    78  	CSIStorageCapacity    GVK = "storage.k8s.io/CSIStorageCapacity"
    79  	WildCard              GVK = "*"
    80  )
    81  
    82  type ClusterEventWithHint struct {
    83  	Event ClusterEvent
    84  	// QueueingHintFn is executed for Pods rejected by this plugin when the above Event happens,
    85  	// and filters out events to reduce useless retries of Pod scheduling.
    86  	// It's an optional field. If not set,
    87  	// the scheduling of Pods will always be retried with backoff when this Event happens
    88  	// (the same as returning Queue).
    89  	QueueingHintFn QueueingHintFn
    90  }
    91  
    92  // QueueingHintFn returns a hint that signals whether the event can make a Pod,
    93  // which was rejected by this plugin in a past scheduling cycle, schedulable or not.
    94  // It's called before a Pod gets moved from unschedulableQ to backoffQ or activeQ.
    95  // If it returns an error, the caller treats the returned QueueingHint as `Queue`, regardless of the value
    96  // returned here, so that the Pod is not stuck in the unschedulable pod pool.
    97  //
    98  // - `pod`: the Pod to be enqueued, which was rejected by this plugin in the past.
    99  // - `oldObj` `newObj`: the object involved in that event.
   100  //   - For example, if the given event is "Node deleted", `oldObj` will be that deleted Node.
   101  //   - `oldObj` is nil if the event is an add event.
   102  //   - `newObj` is nil if the event is a delete event.
   103  type QueueingHintFn func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (QueueingHint, error)
   104  
   105  type QueueingHint int
   106  
   107  const (
   108  	// QueueSkip implies that the cluster event has no impact on
   109  	// scheduling of the pod.
   110  	QueueSkip QueueingHint = iota
   111  
   112  	// Queue implies that the Pod may be schedulable by the event.
   113  	Queue
   114  )
   115  
   116  func (s QueueingHint) String() string {
   117  	switch s {
   118  	case QueueSkip:
   119  		return "QueueSkip"
   120  	case Queue:
   121  		return "Queue"
   122  	}
   123  	return ""
   124  }
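
// Illustrative sketch: pairing an event with a QueueingHintFn when a plugin
// registers its cluster events. The label key "example.com/ready" and the
// filtering logic are hypothetical; a real plugin would encode its own retry
// conditions here.
func exampleNodeAddEventWithHint() ClusterEventWithHint {
	return ClusterEventWithHint{
		Event: ClusterEvent{Resource: Node, ActionType: Add},
		QueueingHintFn: func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (QueueingHint, error) {
			node, ok := newObj.(*v1.Node)
			if !ok {
				// Returning an error makes the caller fall back to Queue.
				return Queue, fmt.Errorf("unexpected object type %T", newObj)
			}
			if node.Labels["example.com/ready"] == "true" {
				return Queue, nil
			}
			return QueueSkip, nil
		},
	}
}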
   125  
   126  // ClusterEvent abstracts how a system resource's state gets changed.
   127  // Resource represents the standard API resources such as Pod, Node, etc.
   128  // ActionType denotes the specific change such as Add, Update or Delete.
   129  type ClusterEvent struct {
   130  	Resource   GVK
   131  	ActionType ActionType
   132  	Label      string
   133  }
   134  
   135  // IsWildCard returns true if ClusterEvent follows WildCard semantics
   136  func (ce ClusterEvent) IsWildCard() bool {
   137  	return ce.Resource == WildCard && ce.ActionType == All
   138  }
   139  
   140  func UnrollWildCardResource() []ClusterEventWithHint {
   141  	return []ClusterEventWithHint{
   142  		{Event: ClusterEvent{Resource: Pod, ActionType: All}},
   143  		{Event: ClusterEvent{Resource: Node, ActionType: All}},
   144  		{Event: ClusterEvent{Resource: CSINode, ActionType: All}},
   145  		{Event: ClusterEvent{Resource: CSIDriver, ActionType: All}},
   146  		{Event: ClusterEvent{Resource: CSIStorageCapacity, ActionType: All}},
   147  		{Event: ClusterEvent{Resource: PersistentVolume, ActionType: All}},
   148  		{Event: ClusterEvent{Resource: PersistentVolumeClaim, ActionType: All}},
   149  		{Event: ClusterEvent{Resource: StorageClass, ActionType: All}},
   150  		{Event: ClusterEvent{Resource: PodSchedulingContext, ActionType: All}},
   151  	}
   152  }
   153  
   154  // QueuedPodInfo is a Pod wrapper with additional information related to
   155  // the pod's status in the scheduling queue, such as the timestamp when
   156  // it's added to the queue.
   157  type QueuedPodInfo struct {
   158  	*PodInfo
   159  	// The time the pod was added to the scheduling queue.
   160  	Timestamp time.Time
   161  	// Number of schedule attempts before successfully scheduled.
   162  	// It's used to record the # attempts metric.
   163  	Attempts int
   164  	// The time when the pod is added to the queue for the first time. The pod may be added
   165  	// back to the queue multiple times before it's successfully scheduled.
   166  	// It shouldn't be updated once initialized. It's used to record the e2e scheduling
   167  	// latency for a pod.
   168  	InitialAttemptTimestamp *time.Time
   169  	// UnschedulablePlugins records the plugin names that the Pod failed with Unschedulable or UnschedulableAndUnresolvable status.
   170  	// It's registered only when the Pod is rejected in PreFilter, Filter, Reserve, or Permit (WaitOnPermit).
   171  	UnschedulablePlugins sets.Set[string]
   172  	// PendingPlugins records the plugin names that the Pod failed with Pending status.
   173  	PendingPlugins sets.Set[string]
   174  	// Whether the Pod is scheduling gated (by PreEnqueuePlugins) or not.
   175  	Gated bool
   176  }
   177  
   178  // DeepCopy returns a deep copy of the QueuedPodInfo object.
   179  func (pqi *QueuedPodInfo) DeepCopy() *QueuedPodInfo {
   180  	return &QueuedPodInfo{
   181  		PodInfo:                 pqi.PodInfo.DeepCopy(),
   182  		Timestamp:               pqi.Timestamp,
   183  		Attempts:                pqi.Attempts,
   184  		InitialAttemptTimestamp: pqi.InitialAttemptTimestamp,
   185  		UnschedulablePlugins:    pqi.UnschedulablePlugins.Clone(),
   186  		Gated:                   pqi.Gated,
   187  	}
   188  }
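
// Illustrative sketch: how a QueuedPodInfo might be initialized when a Pod first
// enters the scheduling queue. The concrete initialization lives in the queue
// implementation, not in this file; the field values below simply follow the field
// comments above.
func exampleNewQueuedPodInfo(pod *v1.Pod) (*QueuedPodInfo, error) {
	pi, err := NewPodInfo(pod)
	if err != nil {
		return nil, err
	}
	now := time.Now()
	return &QueuedPodInfo{
		PodInfo:                 pi,
		Timestamp:               now,
		InitialAttemptTimestamp: &now,
		UnschedulablePlugins:    sets.New[string](),
		PendingPlugins:          sets.New[string](),
	}, nil
}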
   189  
   190  // PodInfo is a wrapper to a Pod with additional pre-computed information to
   191  // accelerate processing. This information is typically immutable (e.g., pre-processed
   192  // inter-pod affinity selectors).
   193  type PodInfo struct {
   194  	Pod                        *v1.Pod
   195  	RequiredAffinityTerms      []AffinityTerm
   196  	RequiredAntiAffinityTerms  []AffinityTerm
   197  	PreferredAffinityTerms     []WeightedAffinityTerm
   198  	PreferredAntiAffinityTerms []WeightedAffinityTerm
   199  }
   200  
   201  // DeepCopy returns a deep copy of the PodInfo object.
   202  func (pi *PodInfo) DeepCopy() *PodInfo {
   203  	return &PodInfo{
   204  		Pod:                        pi.Pod.DeepCopy(),
   205  		RequiredAffinityTerms:      pi.RequiredAffinityTerms,
   206  		RequiredAntiAffinityTerms:  pi.RequiredAntiAffinityTerms,
   207  		PreferredAffinityTerms:     pi.PreferredAffinityTerms,
   208  		PreferredAntiAffinityTerms: pi.PreferredAntiAffinityTerms,
   209  	}
   210  }
   211  
   212  // Update recomputes the whole PodInfo by default. It only updates the pod in place when the
   213  // PodInfo has already been instantiated and the passed pod is the exact same one (same UID) as the original pod.
   214  func (pi *PodInfo) Update(pod *v1.Pod) error {
   215  	if pod != nil && pi.Pod != nil && pi.Pod.UID == pod.UID {
   216  		// PodInfo includes immutable information, and so it is safe to update the pod in place if it is
   217  		// the exact same pod
   218  		pi.Pod = pod
   219  		return nil
   220  	}
   221  	var preferredAffinityTerms []v1.WeightedPodAffinityTerm
   222  	var preferredAntiAffinityTerms []v1.WeightedPodAffinityTerm
   223  	if affinity := pod.Spec.Affinity; affinity != nil {
   224  		if a := affinity.PodAffinity; a != nil {
   225  			preferredAffinityTerms = a.PreferredDuringSchedulingIgnoredDuringExecution
   226  		}
   227  		if a := affinity.PodAntiAffinity; a != nil {
   228  			preferredAntiAffinityTerms = a.PreferredDuringSchedulingIgnoredDuringExecution
   229  		}
   230  	}
   231  
   232  	// Attempt to parse the affinity terms
   233  	var parseErrs []error
   234  	requiredAffinityTerms, err := getAffinityTerms(pod, getPodAffinityTerms(pod.Spec.Affinity))
   235  	if err != nil {
   236  		parseErrs = append(parseErrs, fmt.Errorf("requiredAffinityTerms: %w", err))
   237  	}
   238  	requiredAntiAffinityTerms, err := getAffinityTerms(pod,
   239  		getPodAntiAffinityTerms(pod.Spec.Affinity))
   240  	if err != nil {
   241  		parseErrs = append(parseErrs, fmt.Errorf("requiredAntiAffinityTerms: %w", err))
   242  	}
   243  	weightedAffinityTerms, err := getWeightedAffinityTerms(pod, preferredAffinityTerms)
   244  	if err != nil {
   245  		parseErrs = append(parseErrs, fmt.Errorf("preferredAffinityTerms: %w", err))
   246  	}
   247  	weightedAntiAffinityTerms, err := getWeightedAffinityTerms(pod, preferredAntiAffinityTerms)
   248  	if err != nil {
   249  		parseErrs = append(parseErrs, fmt.Errorf("preferredAntiAffinityTerms: %w", err))
   250  	}
   251  
   252  	pi.Pod = pod
   253  	pi.RequiredAffinityTerms = requiredAffinityTerms
   254  	pi.RequiredAntiAffinityTerms = requiredAntiAffinityTerms
   255  	pi.PreferredAffinityTerms = weightedAffinityTerms
   256  	pi.PreferredAntiAffinityTerms = weightedAntiAffinityTerms
   257  	return utilerrors.NewAggregate(parseErrs)
   258  }
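
// Illustrative sketch: the two paths through Update. With the same UID, only the
// Pod pointer is swapped and the pre-computed affinity terms are kept; with a
// different Pod, everything is re-parsed. The helper below is hypothetical.
func examplePodInfoRefresh(oldPod, newerPod *v1.Pod) (*PodInfo, error) {
	pi, err := NewPodInfo(oldPod)
	if err != nil {
		return nil, err
	}
	// newerPod is assumed to be an updated version of the same Pod (same UID),
	// so this takes the cheap in-place path.
	if err := pi.Update(newerPod); err != nil {
		return nil, err
	}
	return pi, nil
}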
   259  
   260  // AffinityTerm is a processed version of v1.PodAffinityTerm.
   261  type AffinityTerm struct {
   262  	Namespaces        sets.Set[string]
   263  	Selector          labels.Selector
   264  	TopologyKey       string
   265  	NamespaceSelector labels.Selector
   266  }
   267  
   268  // Matches returns true if the pod matches the label selector and namespaces or namespace selector.
   269  func (at *AffinityTerm) Matches(pod *v1.Pod, nsLabels labels.Set) bool {
   270  	if at.Namespaces.Has(pod.Namespace) || at.NamespaceSelector.Matches(nsLabels) {
   271  		return at.Selector.Matches(labels.Set(pod.Labels))
   272  	}
   273  	return false
   274  }
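
// Illustrative sketch: Matches first checks whether the candidate Pod's namespace
// is selected (listed in Namespaces or matched by NamespaceSelector) and only then
// applies the label Selector. The term below is hypothetical.
func exampleAffinityTermMatch(candidate *v1.Pod, candidateNSLabels labels.Set) bool {
	term := AffinityTerm{
		Namespaces:        sets.New("default"),
		Selector:          labels.SelectorFromSet(labels.Set{"app": "web"}),
		TopologyKey:       "kubernetes.io/hostname",
		NamespaceSelector: labels.Nothing(), // no namespace selector; rely on Namespaces
	}
	return term.Matches(candidate, candidateNSLabels)
}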
   275  
   276  // WeightedAffinityTerm is a "processed" representation of v1.WeightedAffinityTerm.
   277  type WeightedAffinityTerm struct {
   278  	AffinityTerm
   279  	Weight int32
   280  }
   281  
   282  // Diagnosis records the details to diagnose a scheduling failure.
   283  type Diagnosis struct {
   284  	NodeToStatusMap NodeToStatusMap
   285  	// UnschedulablePlugins are plugins that returned Unschedulable or UnschedulableAndUnresolvable.
   286  	UnschedulablePlugins sets.Set[string]
   287  	// PendingPlugins are plugins that returned Pending.
   288  	PendingPlugins sets.Set[string]
   289  	// PreFilterMsg records the messages returned from PreFilter plugins.
   290  	PreFilterMsg string
   291  	// PostFilterMsg records the messages returned from PostFilter plugins.
   292  	PostFilterMsg string
   293  }
   294  
   295  // FitError describes a fit error of a pod.
   296  type FitError struct {
   297  	Pod         *v1.Pod
   298  	NumAllNodes int
   299  	Diagnosis   Diagnosis
   300  }
   301  
   302  const (
   303  	// NoNodeAvailableMsg is used to format the message when no nodes are available.
   304  	NoNodeAvailableMsg = "0/%v nodes are available"
   305  )
   306  
   307  func (d *Diagnosis) AddPluginStatus(sts *Status) {
   308  	if sts.Plugin() == "" {
   309  		return
   310  	}
   311  	if sts.IsRejected() {
   312  		if d.UnschedulablePlugins == nil {
   313  			d.UnschedulablePlugins = sets.New[string]()
   314  		}
   315  		d.UnschedulablePlugins.Insert(sts.Plugin())
   316  	}
   317  	if sts.Code() == Pending {
   318  		if d.PendingPlugins == nil {
   319  			d.PendingPlugins = sets.New[string]()
   320  		}
   321  		d.PendingPlugins.Insert(sts.Plugin())
   322  	}
   323  }
   324  
   325  // Error returns detailed information of why the pod failed to fit on each node.
   326  // A message format is "0/X nodes are available: <PreFilterMsg>. <FilterMsg>. <PostFilterMsg>."
   327  func (f *FitError) Error() string {
   328  	reasonMsg := fmt.Sprintf(NoNodeAvailableMsg+":", f.NumAllNodes)
   329  	preFilterMsg := f.Diagnosis.PreFilterMsg
   330  	if preFilterMsg != "" {
   331  		// PreFilter plugin returns unschedulable.
   332  		// Add the messages from PreFilter plugins to reasonMsg.
   333  		reasonMsg += fmt.Sprintf(" %v.", preFilterMsg)
   334  	}
   335  
   336  	if preFilterMsg == "" {
   337  		// the scheduling cycle went through the PreFilter extension point successfully.
   338  		//
   339  		// When a PreFilter plugin returns unschedulable,
   340  		// the scheduling framework inserts the same unschedulable status into all nodes in NodeToStatusMap.
   341  		// So, we shouldn't add the message from NodeToStatusMap when the PreFilter failed.
   342  		// Otherwise, we will have duplicated reasons in the error message.
   343  		reasons := make(map[string]int)
   344  		for _, status := range f.Diagnosis.NodeToStatusMap {
   345  			for _, reason := range status.Reasons() {
   346  				reasons[reason]++
   347  			}
   348  		}
   349  
   350  		sortReasonsHistogram := func() []string {
   351  			var reasonStrings []string
   352  			for k, v := range reasons {
   353  				reasonStrings = append(reasonStrings, fmt.Sprintf("%v %v", v, k))
   354  			}
   355  			sort.Strings(reasonStrings)
   356  			return reasonStrings
   357  		}
   358  		sortedFilterMsg := sortReasonsHistogram()
   359  		if len(sortedFilterMsg) != 0 {
   360  			reasonMsg += fmt.Sprintf(" %v.", strings.Join(sortedFilterMsg, ", "))
   361  		}
   362  	}
   363  
   364  	// Add the messages from PostFilter plugins to reasonMsg.
   365  	// We can add this message regardless of whether the scheduling cycle fails at PreFilter or Filter
   366  	// since we may run PostFilter (if enabled) in both cases.
   367  	postFilterMsg := f.Diagnosis.PostFilterMsg
   368  	if postFilterMsg != "" {
   369  		reasonMsg += fmt.Sprintf(" %v", postFilterMsg)
   370  	}
   371  	return reasonMsg
   372  }
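
// Illustrative sketch: how the Diagnosis fields surface in the final message. With
// three nodes and a filter reason recorded for each of them, Error() produces
// something like "0/3 nodes are available: 3 <filter reason>. <post-filter message>"
// (the exact reason strings come from the plugin Statuses in NodeToStatusMap).
func exampleFitErrorMessage(pod *v1.Pod, nodeStatuses NodeToStatusMap, postFilterMsg string) string {
	fitErr := &FitError{
		Pod:         pod,
		NumAllNodes: 3,
		Diagnosis: Diagnosis{
			NodeToStatusMap: nodeStatuses,
			PostFilterMsg:   postFilterMsg,
		},
	}
	return fitErr.Error()
}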
   373  
   374  func newAffinityTerm(pod *v1.Pod, term *v1.PodAffinityTerm) (*AffinityTerm, error) {
   375  	selector, err := metav1.LabelSelectorAsSelector(term.LabelSelector)
   376  	if err != nil {
   377  		return nil, err
   378  	}
   379  
   380  	namespaces := getNamespacesFromPodAffinityTerm(pod, term)
   381  	nsSelector, err := metav1.LabelSelectorAsSelector(term.NamespaceSelector)
   382  	if err != nil {
   383  		return nil, err
   384  	}
   385  
   386  	return &AffinityTerm{Namespaces: namespaces, Selector: selector, TopologyKey: term.TopologyKey, NamespaceSelector: nsSelector}, nil
   387  }
   388  
   389  // getAffinityTerms receives a Pod and affinity terms and returns the processed terms
   390  // (parsed namespaces and selectors).
   391  func getAffinityTerms(pod *v1.Pod, v1Terms []v1.PodAffinityTerm) ([]AffinityTerm, error) {
   392  	if v1Terms == nil {
   393  		return nil, nil
   394  	}
   395  
   396  	var terms []AffinityTerm
   397  	for i := range v1Terms {
   398  		t, err := newAffinityTerm(pod, &v1Terms[i])
   399  		if err != nil {
   400  			// We get here if the label selector failed to process
   401  			return nil, err
   402  		}
   403  		terms = append(terms, *t)
   404  	}
   405  	return terms, nil
   406  }
   407  
   408  // getWeightedAffinityTerms returns the list of processed affinity terms.
   409  func getWeightedAffinityTerms(pod *v1.Pod, v1Terms []v1.WeightedPodAffinityTerm) ([]WeightedAffinityTerm, error) {
   410  	if v1Terms == nil {
   411  		return nil, nil
   412  	}
   413  
   414  	var terms []WeightedAffinityTerm
   415  	for i := range v1Terms {
   416  		t, err := newAffinityTerm(pod, &v1Terms[i].PodAffinityTerm)
   417  		if err != nil {
   418  			// We get here if the label selector failed to process
   419  			return nil, err
   420  		}
   421  		terms = append(terms, WeightedAffinityTerm{AffinityTerm: *t, Weight: v1Terms[i].Weight})
   422  	}
   423  	return terms, nil
   424  }
   425  
   426  // NewPodInfo returns a new PodInfo.
   427  func NewPodInfo(pod *v1.Pod) (*PodInfo, error) {
   428  	pInfo := &PodInfo{}
   429  	err := pInfo.Update(pod)
   430  	return pInfo, err
   431  }
   432  
   433  func getPodAffinityTerms(affinity *v1.Affinity) (terms []v1.PodAffinityTerm) {
   434  	if affinity != nil && affinity.PodAffinity != nil {
   435  		if len(affinity.PodAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 {
   436  			terms = affinity.PodAffinity.RequiredDuringSchedulingIgnoredDuringExecution
   437  		}
   438  		// TODO: Uncomment this block when implement RequiredDuringSchedulingRequiredDuringExecution.
   439  		// if len(affinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
   440  		//	terms = append(terms, affinity.PodAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
   441  		// }
   442  	}
   443  	return terms
   444  }
   445  
   446  func getPodAntiAffinityTerms(affinity *v1.Affinity) (terms []v1.PodAffinityTerm) {
   447  	if affinity != nil && affinity.PodAntiAffinity != nil {
   448  		if len(affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 {
   449  			terms = affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution
   450  		}
   451  		// TODO: Uncomment this block when implement RequiredDuringSchedulingRequiredDuringExecution.
   452  		// if len(affinity.PodAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution) != 0 {
   453  		//	terms = append(terms, affinity.PodAntiAffinity.RequiredDuringSchedulingRequiredDuringExecution...)
   454  		// }
   455  	}
   456  	return terms
   457  }
   458  
   459  // returns a set of names according to the namespaces indicated in podAffinityTerm.
   460  // If namespaces is empty and the namespace selector is nil, it considers the given pod's namespace.
   461  func getNamespacesFromPodAffinityTerm(pod *v1.Pod, podAffinityTerm *v1.PodAffinityTerm) sets.Set[string] {
   462  	names := sets.Set[string]{}
   463  	if len(podAffinityTerm.Namespaces) == 0 && podAffinityTerm.NamespaceSelector == nil {
   464  		names.Insert(pod.Namespace)
   465  	} else {
   466  		names.Insert(podAffinityTerm.Namespaces...)
   467  	}
   468  	return names
   469  }
   470  
   471  // ImageStateSummary provides summarized information about the state of an image.
   472  type ImageStateSummary struct {
   473  	// Size of the image
   474  	Size int64
   475  	// Used to track how many nodes have this image; it is computed from the Nodes field below
   476  	// during the execution of Snapshot.
   477  	NumNodes int
   478  	// A set of node names for nodes having this image present. This field is used for
   479  	// keeping track of the nodes during update/add/remove events.
   480  	Nodes sets.Set[string]
   481  }
   482  
   483  // Snapshot returns a copy of the ImageStateSummary without the Nodes field.
   484  func (iss *ImageStateSummary) Snapshot() *ImageStateSummary {
   485  	return &ImageStateSummary{
   486  		Size:     iss.Size,
   487  		NumNodes: iss.Nodes.Len(),
   488  	}
   489  }
   490  
   491  // NodeInfo is node level aggregated information.
   492  type NodeInfo struct {
   493  	// Overall node information.
   494  	node *v1.Node
   495  
   496  	// Pods running on the node.
   497  	Pods []*PodInfo
   498  
   499  	// The subset of pods with affinity.
   500  	PodsWithAffinity []*PodInfo
   501  
   502  	// The subset of pods with required anti-affinity.
   503  	PodsWithRequiredAntiAffinity []*PodInfo
   504  
   505  	// Ports allocated on the node.
   506  	UsedPorts HostPortInfo
   507  
   508  	// Total requested resources of all pods on this node. This includes assumed
   509  	// pods, which the scheduler has sent for binding but may not be scheduled yet.
   510  	Requested *Resource
   511  	// Total requested resources of all pods on this node with a minimum value
   512  	// applied to each container's CPU and memory requests. This does not reflect
   513  	// the actual resource requests for this node, but is used to avoid scheduling
   514  	// many zero-request pods onto one node.
   515  	NonZeroRequested *Resource
   516  	// We store allocatedResources (which is Node.Status.Allocatable.*) explicitly
   517  	// as int64, to avoid conversions and map lookups.
   518  	Allocatable *Resource
   519  
   520  	// ImageStates holds the entry of an image if and only if this image is on the node. The entry can be used for
   521  	// checking an image's existence and advanced usage (e.g., image locality scheduling policy) based on the image
   522  	// state information.
   523  	ImageStates map[string]*ImageStateSummary
   524  
   525  	// PVCRefCounts contains a mapping of PVC names to the number of pods on the node using it.
   526  	// Keys are in the format "namespace/name".
   527  	PVCRefCounts map[string]int
   528  
   529  	// Whenever NodeInfo changes, generation is bumped.
   530  	// This is used to avoid cloning it if the object didn't change.
   531  	Generation int64
   532  }
   533  
   534  // nextGeneration: Let's make sure history never forgets the name...
   535  // Increments the generation number monotonically ensuring that generation numbers never collide.
   536  // Collision of the generation numbers would be particularly problematic if a node was deleted and
   537  // added back with the same name. See issue#63262.
   538  func nextGeneration() int64 {
   539  	return atomic.AddInt64(&generation, 1)
   540  }
   541  
   542  // Resource is a collection of compute resources.
   543  type Resource struct {
   544  	MilliCPU         int64
   545  	Memory           int64
   546  	EphemeralStorage int64
   547  	// We store allowedPodNumber (which is Node.Status.Allocatable.Pods().Value())
   548  	// explicitly as int, to avoid conversions and improve performance.
   549  	AllowedPodNumber int
   550  	// ScalarResources
   551  	ScalarResources map[v1.ResourceName]int64
   552  }
   553  
   554  // NewResource creates a Resource from ResourceList
   555  func NewResource(rl v1.ResourceList) *Resource {
   556  	r := &Resource{}
   557  	r.Add(rl)
   558  	return r
   559  }
   560  
   561  // Add adds ResourceList into Resource.
   562  func (r *Resource) Add(rl v1.ResourceList) {
   563  	if r == nil {
   564  		return
   565  	}
   566  
   567  	for rName, rQuant := range rl {
   568  		switch rName {
   569  		case v1.ResourceCPU:
   570  			r.MilliCPU += rQuant.MilliValue()
   571  		case v1.ResourceMemory:
   572  			r.Memory += rQuant.Value()
   573  		case v1.ResourcePods:
   574  			r.AllowedPodNumber += int(rQuant.Value())
   575  		case v1.ResourceEphemeralStorage:
   576  			r.EphemeralStorage += rQuant.Value()
   577  		default:
   578  			if schedutil.IsScalarResourceName(rName) {
   579  				r.AddScalar(rName, rQuant.Value())
   580  			}
   581  		}
   582  	}
   583  }
   584  
   585  // Clone returns a copy of this resource.
   586  func (r *Resource) Clone() *Resource {
   587  	res := &Resource{
   588  		MilliCPU:         r.MilliCPU,
   589  		Memory:           r.Memory,
   590  		AllowedPodNumber: r.AllowedPodNumber,
   591  		EphemeralStorage: r.EphemeralStorage,
   592  	}
   593  	if r.ScalarResources != nil {
   594  		res.ScalarResources = make(map[v1.ResourceName]int64, len(r.ScalarResources))
   595  		for k, v := range r.ScalarResources {
   596  			res.ScalarResources[k] = v
   597  		}
   598  	}
   599  	return res
   600  }
   601  
   602  // AddScalar adds the given quantity to the named scalar resource.
   603  func (r *Resource) AddScalar(name v1.ResourceName, quantity int64) {
   604  	r.SetScalar(name, r.ScalarResources[name]+quantity)
   605  }
   606  
   607  // SetScalar sets the named scalar resource to the given quantity.
   608  func (r *Resource) SetScalar(name v1.ResourceName, quantity int64) {
   609  	// Lazily allocate scalar resource map.
   610  	if r.ScalarResources == nil {
   611  		r.ScalarResources = map[v1.ResourceName]int64{}
   612  	}
   613  	r.ScalarResources[name] = quantity
   614  }
   615  
   616  // SetMaxResource compares with the given ResourceList and takes the max value for each resource.
   617  func (r *Resource) SetMaxResource(rl v1.ResourceList) {
   618  	if r == nil {
   619  		return
   620  	}
   621  
   622  	for rName, rQuantity := range rl {
   623  		switch rName {
   624  		case v1.ResourceMemory:
   625  			r.Memory = max(r.Memory, rQuantity.Value())
   626  		case v1.ResourceCPU:
   627  			r.MilliCPU = max(r.MilliCPU, rQuantity.MilliValue())
   628  		case v1.ResourceEphemeralStorage:
   629  			r.EphemeralStorage = max(r.EphemeralStorage, rQuantity.Value())
   630  		default:
   631  			if schedutil.IsScalarResourceName(rName) {
   632  				r.SetScalar(rName, max(r.ScalarResources[rName], rQuantity.Value()))
   633  			}
   634  		}
   635  	}
   636  }
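
// Illustrative sketch: Resource arithmetic as used when building NodeInfo. CPU is
// tracked in milli-units and memory in bytes; scalar resources (the extended
// resource name below is hypothetical) live in the lazily allocated ScalarResources map.
func exampleResourceArithmetic(allocatable v1.ResourceList) *Resource {
	r := NewResource(allocatable) // convert the ResourceList once, up front
	r.AddScalar(v1.ResourceName("example.com/gpu"), 2)

	clone := r.Clone() // deep copy, including ScalarResources
	clone.SetMaxResource(allocatable)
	return clone
}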
   637  
   638  // NewNodeInfo returns a ready to use empty NodeInfo object.
   639  // If any pods are given in arguments, their information will be aggregated in
   640  // the returned object.
   641  func NewNodeInfo(pods ...*v1.Pod) *NodeInfo {
   642  	ni := &NodeInfo{
   643  		Requested:        &Resource{},
   644  		NonZeroRequested: &Resource{},
   645  		Allocatable:      &Resource{},
   646  		Generation:       nextGeneration(),
   647  		UsedPorts:        make(HostPortInfo),
   648  		ImageStates:      make(map[string]*ImageStateSummary),
   649  		PVCRefCounts:     make(map[string]int),
   650  	}
   651  	for _, pod := range pods {
   652  		ni.AddPod(pod)
   653  	}
   654  	return ni
   655  }
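
// Illustrative sketch: a typical NodeInfo lifecycle as maintained by the scheduler
// cache. SetNode attaches the Node (and its allocatable resources); AddPod and
// RemovePod keep the aggregated requests, host ports, and PVC reference counts in
// sync, bumping Generation on every change.
func exampleNodeInfoLifecycle(logger klog.Logger, node *v1.Node, pod *v1.Pod) error {
	ni := NewNodeInfo()
	ni.SetNode(node)
	ni.AddPod(pod)
	// Consumers read the aggregates, e.g. ni.Requested and ni.UsedPorts.
	return ni.RemovePod(logger, pod)
}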
   656  
   657  // Node returns overall information about this node.
   658  func (n *NodeInfo) Node() *v1.Node {
   659  	if n == nil {
   660  		return nil
   661  	}
   662  	return n.node
   663  }
   664  
   665  // Snapshot returns a copy of this node, except that ImageStates is copied without the Nodes field.
   666  func (n *NodeInfo) Snapshot() *NodeInfo {
   667  	clone := &NodeInfo{
   668  		node:             n.node,
   669  		Requested:        n.Requested.Clone(),
   670  		NonZeroRequested: n.NonZeroRequested.Clone(),
   671  		Allocatable:      n.Allocatable.Clone(),
   672  		UsedPorts:        make(HostPortInfo),
   673  		ImageStates:      make(map[string]*ImageStateSummary),
   674  		PVCRefCounts:     make(map[string]int),
   675  		Generation:       n.Generation,
   676  	}
   677  	if len(n.Pods) > 0 {
   678  		clone.Pods = append([]*PodInfo(nil), n.Pods...)
   679  	}
   680  	if len(n.UsedPorts) > 0 {
   681  		// HostPortInfo is a map-in-map struct
   682  		// make sure it's deep copied
   683  		for ip, portMap := range n.UsedPorts {
   684  			clone.UsedPorts[ip] = make(map[ProtocolPort]struct{})
   685  			for protocolPort, v := range portMap {
   686  				clone.UsedPorts[ip][protocolPort] = v
   687  			}
   688  		}
   689  	}
   690  	if len(n.PodsWithAffinity) > 0 {
   691  		clone.PodsWithAffinity = append([]*PodInfo(nil), n.PodsWithAffinity...)
   692  	}
   693  	if len(n.PodsWithRequiredAntiAffinity) > 0 {
   694  		clone.PodsWithRequiredAntiAffinity = append([]*PodInfo(nil), n.PodsWithRequiredAntiAffinity...)
   695  	}
   696  	if len(n.ImageStates) > 0 {
   697  		state := make(map[string]*ImageStateSummary, len(n.ImageStates))
   698  		for imageName, imageState := range n.ImageStates {
   699  			state[imageName] = imageState.Snapshot()
   700  		}
   701  		clone.ImageStates = state
   702  	}
   703  	for key, value := range n.PVCRefCounts {
   704  		clone.PVCRefCounts[key] = value
   705  	}
   706  	return clone
   707  }
   708  
   709  // String returns a human-readable representation of this NodeInfo.
   710  func (n *NodeInfo) String() string {
   711  	podKeys := make([]string, len(n.Pods))
   712  	for i, p := range n.Pods {
   713  		podKeys[i] = p.Pod.Name
   714  	}
   715  	return fmt.Sprintf("&NodeInfo{Pods:%v, RequestedResource:%#v, NonZeroRequest: %#v, UsedPort: %#v, AllocatableResource:%#v}",
   716  		podKeys, n.Requested, n.NonZeroRequested, n.UsedPorts, n.Allocatable)
   717  }
   718  
   719  // AddPodInfo adds pod information to this NodeInfo.
   720  // Consider using this instead of AddPod if a PodInfo is already computed.
   721  func (n *NodeInfo) AddPodInfo(podInfo *PodInfo) {
   722  	n.Pods = append(n.Pods, podInfo)
   723  	if podWithAffinity(podInfo.Pod) {
   724  		n.PodsWithAffinity = append(n.PodsWithAffinity, podInfo)
   725  	}
   726  	if podWithRequiredAntiAffinity(podInfo.Pod) {
   727  		n.PodsWithRequiredAntiAffinity = append(n.PodsWithRequiredAntiAffinity, podInfo)
   728  	}
   729  	n.update(podInfo.Pod, 1)
   730  }
   731  
   732  // AddPod is a wrapper around AddPodInfo.
   733  func (n *NodeInfo) AddPod(pod *v1.Pod) {
   734  	// ignore this err since apiserver doesn't properly validate affinity terms
   735  	// and we can't fix the validation for backwards compatibility.
   736  	podInfo, _ := NewPodInfo(pod)
   737  	n.AddPodInfo(podInfo)
   738  }
   739  
   740  func podWithAffinity(p *v1.Pod) bool {
   741  	affinity := p.Spec.Affinity
   742  	return affinity != nil && (affinity.PodAffinity != nil || affinity.PodAntiAffinity != nil)
   743  }
   744  
   745  func podWithRequiredAntiAffinity(p *v1.Pod) bool {
   746  	affinity := p.Spec.Affinity
   747  	return affinity != nil && affinity.PodAntiAffinity != nil &&
   748  		len(affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0
   749  }
   750  
   751  func removeFromSlice(logger klog.Logger, s []*PodInfo, k string) ([]*PodInfo, bool) {
   752  	var removed bool
   753  	for i := range s {
   754  		tmpKey, err := GetPodKey(s[i].Pod)
   755  		if err != nil {
   756  			logger.Error(err, "Cannot get pod key", "pod", klog.KObj(s[i].Pod))
   757  			continue
   758  		}
   759  		if k == tmpKey {
   760  			// delete the element
   761  			s[i] = s[len(s)-1]
   762  			s = s[:len(s)-1]
   763  			removed = true
   764  			break
   765  		}
   766  	}
   767  	// reset the slice to nil so that we can do DeepEqual in unit tests.
   768  	if len(s) == 0 {
   769  		return nil, removed
   770  	}
   771  	return s, removed
   772  }
   773  
   774  // RemovePod subtracts pod information from this NodeInfo.
   775  func (n *NodeInfo) RemovePod(logger klog.Logger, pod *v1.Pod) error {
   776  	k, err := GetPodKey(pod)
   777  	if err != nil {
   778  		return err
   779  	}
   780  	if podWithAffinity(pod) {
   781  		n.PodsWithAffinity, _ = removeFromSlice(logger, n.PodsWithAffinity, k)
   782  	}
   783  	if podWithRequiredAntiAffinity(pod) {
   784  		n.PodsWithRequiredAntiAffinity, _ = removeFromSlice(logger, n.PodsWithRequiredAntiAffinity, k)
   785  	}
   786  
   787  	var removed bool
   788  	if n.Pods, removed = removeFromSlice(logger, n.Pods, k); removed {
   789  		n.update(pod, -1)
   790  		return nil
   791  	}
   792  	return fmt.Errorf("no corresponding pod %s in pods of node %s", pod.Name, n.node.Name)
   793  }
   794  
   795  // update updates the node info based on the pod and sign.
   796  // The sign is `+1` when called from AddPod and `-1` when called from RemovePod.
   797  func (n *NodeInfo) update(pod *v1.Pod, sign int64) {
   798  	res, non0CPU, non0Mem := calculateResource(pod)
   799  	n.Requested.MilliCPU += sign * res.MilliCPU
   800  	n.Requested.Memory += sign * res.Memory
   801  	n.Requested.EphemeralStorage += sign * res.EphemeralStorage
   802  	if n.Requested.ScalarResources == nil && len(res.ScalarResources) > 0 {
   803  		n.Requested.ScalarResources = map[v1.ResourceName]int64{}
   804  	}
   805  	for rName, rQuant := range res.ScalarResources {
   806  		n.Requested.ScalarResources[rName] += sign * rQuant
   807  	}
   808  	n.NonZeroRequested.MilliCPU += sign * non0CPU
   809  	n.NonZeroRequested.Memory += sign * non0Mem
   810  
   811  	// Consume ports when pod added or release ports when pod removed.
   812  	n.updateUsedPorts(pod, sign > 0)
   813  	n.updatePVCRefCounts(pod, sign > 0)
   814  
   815  	n.Generation = nextGeneration()
   816  }
   817  
   818  func max(a, b int64) int64 {
   819  	if a >= b {
   820  		return a
   821  	}
   822  	return b
   823  }
   824  
   825  func calculateResource(pod *v1.Pod) (Resource, int64, int64) {
   826  	var non0InitCPU, non0InitMem int64
   827  	var non0CPU, non0Mem int64
   828  	requests := resourcehelper.PodRequests(pod, resourcehelper.PodResourcesOptions{
   829  		InPlacePodVerticalScalingEnabled: utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling),
   830  		ContainerFn: func(requests v1.ResourceList, containerType podutil.ContainerType) {
   831  			non0CPUReq, non0MemReq := schedutil.GetNonzeroRequests(&requests)
   832  			switch containerType {
   833  			case podutil.Containers:
   834  				non0CPU += non0CPUReq
   835  				non0Mem += non0MemReq
   836  			case podutil.InitContainers:
   837  				non0InitCPU = max(non0InitCPU, non0CPUReq)
   838  				non0InitMem = max(non0InitMem, non0MemReq)
   839  			}
   840  		},
   841  	})
   842  
   843  	non0CPU = max(non0CPU, non0InitCPU)
   844  	non0Mem = max(non0Mem, non0InitMem)
   845  
   846  	// If Overhead is being utilized, add it to the non-zero cpu/memory tracking for the pod. It has already been
   847  	// added into the total requests above, since PodRequests accounts for Overhead.
   848  	if pod.Spec.Overhead != nil {
   849  		if _, found := pod.Spec.Overhead[v1.ResourceCPU]; found {
   850  			non0CPU += pod.Spec.Overhead.Cpu().MilliValue()
   851  		}
   852  
   853  		if _, found := pod.Spec.Overhead[v1.ResourceMemory]; found {
   854  			non0Mem += pod.Spec.Overhead.Memory().Value()
   855  		}
   856  	}
   857  	var res Resource
   858  	res.Add(requests)
   859  	return res, non0CPU, non0Mem
   860  }
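
// Illustrative sketch of the non-zero accounting above, with hypothetical numbers:
// regular containers are summed, init containers contribute their maximum, the
// larger of the two wins, and any pod Overhead is added on top.
func exampleNonZeroCPUAccounting() int64 {
	containerCPUs := []int64{100, 200} // two regular containers, milli-CPU
	initContainerCPUs := []int64{250}  // one init container, milli-CPU
	overheadCPU := int64(10)           // pod overhead, milli-CPU

	var sumCPU, maxInitCPU int64
	for _, c := range containerCPUs {
		sumCPU += c
	}
	for _, c := range initContainerCPUs {
		maxInitCPU = max(maxInitCPU, c)
	}
	return max(sumCPU, maxInitCPU) + overheadCPU // max(300, 250) + 10 = 310
}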
   861  
   862  // updateUsedPorts updates the UsedPorts of NodeInfo.
   863  func (n *NodeInfo) updateUsedPorts(pod *v1.Pod, add bool) {
   864  	for _, container := range pod.Spec.Containers {
   865  		for _, podPort := range container.Ports {
   866  			if add {
   867  				n.UsedPorts.Add(podPort.HostIP, string(podPort.Protocol), podPort.HostPort)
   868  			} else {
   869  				n.UsedPorts.Remove(podPort.HostIP, string(podPort.Protocol), podPort.HostPort)
   870  			}
   871  		}
   872  	}
   873  }
   874  
   875  // updatePVCRefCounts updates the PVCRefCounts of NodeInfo.
   876  func (n *NodeInfo) updatePVCRefCounts(pod *v1.Pod, add bool) {
   877  	for _, v := range pod.Spec.Volumes {
   878  		if v.PersistentVolumeClaim == nil {
   879  			continue
   880  		}
   881  
   882  		key := GetNamespacedName(pod.Namespace, v.PersistentVolumeClaim.ClaimName)
   883  		if add {
   884  			n.PVCRefCounts[key] += 1
   885  		} else {
   886  			n.PVCRefCounts[key] -= 1
   887  			if n.PVCRefCounts[key] <= 0 {
   888  				delete(n.PVCRefCounts, key)
   889  			}
   890  		}
   891  	}
   892  }
   893  
   894  // SetNode sets the overall node information.
   895  func (n *NodeInfo) SetNode(node *v1.Node) {
   896  	n.node = node
   897  	n.Allocatable = NewResource(node.Status.Allocatable)
   898  	n.Generation = nextGeneration()
   899  }
   900  
   901  // RemoveNode removes the node object, leaving all other tracking information.
   902  func (n *NodeInfo) RemoveNode() {
   903  	n.node = nil
   904  	n.Generation = nextGeneration()
   905  }
   906  
   907  // GetPodKey returns the string key of a pod.
   908  func GetPodKey(pod *v1.Pod) (string, error) {
   909  	uid := string(pod.UID)
   910  	if len(uid) == 0 {
   911  		return "", errors.New("cannot get cache key for pod with empty UID")
   912  	}
   913  	return uid, nil
   914  }
   915  
   916  // GetNamespacedName returns the string format of a namespaced resource name.
   917  func GetNamespacedName(namespace, name string) string {
   918  	return fmt.Sprintf("%s/%s", namespace, name)
   919  }
   920  
   921  // DefaultBindAllHostIP defines the default IP address used to bind to all hosts.
   922  const DefaultBindAllHostIP = "0.0.0.0"
   923  
   924  // ProtocolPort represents a protocol port pair, e.g. tcp:80.
   925  type ProtocolPort struct {
   926  	Protocol string
   927  	Port     int32
   928  }
   929  
   930  // NewProtocolPort creates a ProtocolPort instance.
   931  func NewProtocolPort(protocol string, port int32) *ProtocolPort {
   932  	pp := &ProtocolPort{
   933  		Protocol: protocol,
   934  		Port:     port,
   935  	}
   936  
   937  	if len(pp.Protocol) == 0 {
   938  		pp.Protocol = string(v1.ProtocolTCP)
   939  	}
   940  
   941  	return pp
   942  }
   943  
   944  // HostPortInfo stores a mapping from IP to a set of ProtocolPorts
   945  type HostPortInfo map[string]map[ProtocolPort]struct{}
   946  
   947  // Add adds (ip, protocol, port) to HostPortInfo
   948  func (h HostPortInfo) Add(ip, protocol string, port int32) {
   949  	if port <= 0 {
   950  		return
   951  	}
   952  
   953  	h.sanitize(&ip, &protocol)
   954  
   955  	pp := NewProtocolPort(protocol, port)
   956  	if _, ok := h[ip]; !ok {
   957  		h[ip] = map[ProtocolPort]struct{}{
   958  			*pp: {},
   959  		}
   960  		return
   961  	}
   962  
   963  	h[ip][*pp] = struct{}{}
   964  }
   965  
   966  // Remove removes (ip, protocol, port) from HostPortInfo
   967  func (h HostPortInfo) Remove(ip, protocol string, port int32) {
   968  	if port <= 0 {
   969  		return
   970  	}
   971  
   972  	h.sanitize(&ip, &protocol)
   973  
   974  	pp := NewProtocolPort(protocol, port)
   975  	if m, ok := h[ip]; ok {
   976  		delete(m, *pp)
   977  		if len(h[ip]) == 0 {
   978  			delete(h, ip)
   979  		}
   980  	}
   981  }
   982  
   983  // Len returns the total number of (ip, protocol, port) tuples in HostPortInfo
   984  func (h HostPortInfo) Len() int {
   985  	length := 0
   986  	for _, m := range h {
   987  		length += len(m)
   988  	}
   989  	return length
   990  }
   991  
   992  // CheckConflict checks if the input (ip, protocol, port) conflicts with the existing
   993  // ones in HostPortInfo.
   994  func (h HostPortInfo) CheckConflict(ip, protocol string, port int32) bool {
   995  	if port <= 0 {
   996  		return false
   997  	}
   998  
   999  	h.sanitize(&ip, &protocol)
  1000  
  1001  	pp := NewProtocolPort(protocol, port)
  1002  
  1003  	// If ip is 0.0.0.0, check all IPs' (protocol, port) pairs
  1004  	if ip == DefaultBindAllHostIP {
  1005  		for _, m := range h {
  1006  			if _, ok := m[*pp]; ok {
  1007  				return true
  1008  			}
  1009  		}
  1010  		return false
  1011  	}
  1012  
  1013  	// If ip isn't 0.0.0.0, only check the given IP's and 0.0.0.0's (protocol, port) pairs
  1014  	for _, key := range []string{DefaultBindAllHostIP, ip} {
  1015  		if m, ok := h[key]; ok {
  1016  			if _, ok2 := m[*pp]; ok2 {
  1017  				return true
  1018  			}
  1019  		}
  1020  	}
  1021  
  1022  	return false
  1023  }
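
// Illustrative sketch: the 0.0.0.0 wildcard behavior of CheckConflict. A port
// registered on a specific IP conflicts with the same protocol/port on 0.0.0.0
// (and vice versa), but not with the same port on a different specific IP.
func exampleHostPortConflicts() {
	hpi := make(HostPortInfo)
	hpi.Add("127.0.0.1", "TCP", 8080)

	fmt.Println(hpi.CheckConflict("127.0.0.1", "TCP", 8080)) // true: same IP, protocol and port
	fmt.Println(hpi.CheckConflict("0.0.0.0", "TCP", 8080))   // true: the wildcard IP checks every IP
	fmt.Println(hpi.CheckConflict("10.0.0.1", "TCP", 8080))  // false: a different specific IP
}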
  1024  
  1025  // sanitize the parameters
  1026  func (h HostPortInfo) sanitize(ip, protocol *string) {
  1027  	if len(*ip) == 0 {
  1028  		*ip = DefaultBindAllHostIP
  1029  	}
  1030  	if len(*protocol) == 0 {
  1031  		*protocol = string(v1.ProtocolTCP)
  1032  	}
  1033  }