k8s.io/kubernetes@v1.29.3/pkg/scheduler/framework/interface.go (about)

     1  /*
     2  Copyright 2019 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  // This file defines the scheduling framework plugin interfaces.
    18  
    19  package framework
    20  
import (
	"context"
	"errors"
	"math"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/google/go-cmp/cmp"
	"github.com/google/go-cmp/cmp/cmpopts"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/client-go/informers"
	clientset "k8s.io/client-go/kubernetes"
	restclient "k8s.io/client-go/rest"
	"k8s.io/client-go/tools/events"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/scheduler/apis/config"
	"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
)
    42  
// NodeScoreList declares a list of nodes and their scores.
// Each element is a NodeScore, i.e. one node name paired with one score.
type NodeScoreList []NodeScore
    45  
// NodeScore is a struct with node name and score.
// Name holds the node name and Score holds the score for that node.
type NodeScore struct {
	Name  string
	Score int64
}
    51  
// NodeToStatusMap maps a node name to the Status recorded for that node.
type NodeToStatusMap map[string]*Status
    54  
// NodePluginScores is a struct with a node name and the scores for that node.
type NodePluginScores struct {
	// Name is the node name.
	Name string
	// Scores holds the individual scores from plugins and extenders.
	Scores []PluginScore
	// TotalScore is the total of the scores in Scores.
	TotalScore int64
}
    64  
// PluginScore is a struct with plugin/extender name and score.
// Score is the score attributed to the plugin or extender identified by Name.
type PluginScore struct {
	// Name is the name of plugin or extender.
	Name  string
	Score int64
}
    71  
    72  // Code is the Status code/type which is returned from plugins.
    73  type Code int
    74  
    75  // These are predefined codes used in a Status.
    76  // Note: when you add a new status, you have to add it in `codes` slice below.
    77  const (
    78  	// Success means that plugin ran correctly and found pod schedulable.
    79  	// NOTE: A nil status is also considered as "Success".
    80  	Success Code = iota
    81  	// Error is one of the failures, used for internal plugin errors, unexpected input, etc.
    82  	// Plugin shouldn't return this code for expected failures, like Unschedulable.
    83  	// Since it's the unexpected failure, the scheduling queue registers the pod without unschedulable plugins.
    84  	// Meaning, the Pod will be requeued to activeQ/backoffQ soon.
    85  	Error
    86  	// Unschedulable is one of the failures, used when a plugin finds a pod unschedulable.
    87  	// If it's returned from PreFilter or Filter, the scheduler might attempt to
    88  	// run other postFilter plugins like preemption to get this pod scheduled.
    89  	// Use UnschedulableAndUnresolvable to make the scheduler skipping other postFilter plugins.
    90  	// The accompanying status message should explain why the pod is unschedulable.
    91  	//
    92  	// We regard the backoff as a penalty of wasting the scheduling cycle.
    93  	// When the scheduling queue requeues Pods, which was rejected with Unschedulable in the last scheduling,
    94  	// the Pod goes through backoff.
    95  	Unschedulable
    96  	// UnschedulableAndUnresolvable is used when a plugin finds a pod unschedulable and
    97  	// other postFilter plugins like preemption would not change anything.
    98  	// Plugins should return Unschedulable if it is possible that the pod can get scheduled
    99  	// after running other postFilter plugins.
   100  	// The accompanying status message should explain why the pod is unschedulable.
   101  	//
   102  	// We regard the backoff as a penalty of wasting the scheduling cycle.
   103  	// When the scheduling queue requeues Pods, which was rejected with Unschedulable in the last scheduling,
   104  	// the Pod goes through backoff.
   105  	UnschedulableAndUnresolvable
   106  	// Wait is used when a Permit plugin finds a pod scheduling should wait.
   107  	Wait
   108  	// Skip is used in the following scenarios:
   109  	// - when a Bind plugin chooses to skip binding.
   110  	// - when a PreFilter plugin returns Skip so that coupled Filter plugin/PreFilterExtensions() will be skipped.
   111  	// - when a PreScore plugin returns Skip so that coupled Score plugin will be skipped.
   112  	Skip
   113  	// Pending means that the scheduling process is finished successfully,
   114  	// but the plugin wants to stop the scheduling cycle/binding cycle here.
   115  	//
   116  	// For example, the DRA plugin sometimes needs to wait for the external device driver
   117  	// to provision the resource for the Pod.
   118  	// It's different from when to return Unschedulable/UnschedulableAndUnresolvable,
   119  	// because in this case, the scheduler decides where the Pod can go successfully,
   120  	// but we need to wait for the external component to do something based on that scheduling result.
   121  	//
   122  	// We regard the backoff as a penalty of wasting the scheduling cycle.
   123  	// In the case of returning Pending, we cannot say the scheduling cycle is wasted
   124  	// because the scheduling result is used to proceed the Pod's scheduling forward,
   125  	// that particular scheduling cycle is failed though.
   126  	// So, Pods rejected by such reasons don't need to suffer a penalty (backoff).
   127  	// When the scheduling queue requeues Pods, which was rejected with Pending in the last scheduling,
   128  	// the Pod goes to activeQ directly ignoring backoff.
   129  	Pending
   130  )
   131  
   132  // This list should be exactly the same as the codes iota defined above in the same order.
   133  var codes = []string{"Success", "Error", "Unschedulable", "UnschedulableAndUnresolvable", "Wait", "Skip", "Pending"}
   134  
   135  func (c Code) String() string {
   136  	return codes[c]
   137  }
   138  
// Score bounds used by the scheduling framework.
const (
	// MaxNodeScore is the maximum score a Score plugin is expected to return.
	MaxNodeScore int64 = 100

	// MinNodeScore is the minimum score a Score plugin is expected to return.
	MinNodeScore int64 = 0

	// MaxTotalScore is the maximum total score; it is set to math.MaxInt64.
	MaxTotalScore int64 = math.MaxInt64
)
   149  
// PodsToActivateKey is a reserved state key for stashing pods.
// If the stashed pods are present in unschedulablePods or backoffQ, they will be
// activated (i.e., moved to activeQ) in two phases:
// - end of a scheduling cycle if it succeeds (will be cleared from `PodsToActivate` if activated)
// - end of a binding cycle if it succeeds
var PodsToActivateKey StateKey = "kubernetes.io/pods-to-activate"
   156  
// PodsToActivate stores pods to be activated.
// The embedded sync.Mutex exposes Lock/Unlock directly on the struct;
// presumably it is meant to guard Map — confirm against callers.
type PodsToActivate struct {
	sync.Mutex
	// Map is keyed with namespaced pod name, and valued with the pod.
	Map map[string]*v1.Pod
}
   163  
// Clone returns the receiver itself rather than a copy, so the returned
// StateData shares the same Map and mutex as the original.
func (s *PodsToActivate) Clone() StateData {
	return s
}
   168  
   169  // NewPodsToActivate instantiates a PodsToActivate object.
   170  func NewPodsToActivate() *PodsToActivate {
   171  	return &PodsToActivate{Map: make(map[string]*v1.Pod)}
   172  }
   173  
// Status indicates the result of running a plugin. It consists of a code, a
// message, (optionally) an error, and the name of the plugin it failed in.
// When the status code is not Success, the reasons should explain why.
// And, when code is Success, all the other fields should be empty.
// NOTE: A nil Status is also considered as Success.
type Status struct {
	code    Code
	reasons []string
	err     error
	// plugin is an optional field that records the name of the plugin that caused this status.
	// It's set by the framework when code is Unschedulable, UnschedulableAndUnresolvable or Pending.
	plugin string
}
   187  
// WithError sets the given error as s.err and returns the same Status
// object, so the call can be chained.
// The receiver must be non-nil: the method writes to a field of s.
func (s *Status) WithError(err error) *Status {
	s.err = err
	return s
}
   192  
   193  // Code returns code of the Status.
   194  func (s *Status) Code() Code {
   195  	if s == nil {
   196  		return Success
   197  	}
   198  	return s.code
   199  }
   200  
   201  // Message returns a concatenated message on reasons of the Status.
   202  func (s *Status) Message() string {
   203  	if s == nil {
   204  		return ""
   205  	}
   206  	return strings.Join(s.Reasons(), ", ")
   207  }
   208  
// SetPlugin sets the given plugin name to s.plugin.
// The receiver must be non-nil: the method writes to a field of s.
func (s *Status) SetPlugin(plugin string) {
	s.plugin = plugin
}
   213  
   214  // WithPlugin sets the given plugin name to s.plugin,
   215  // and returns the given status object.
   216  func (s *Status) WithPlugin(plugin string) *Status {
   217  	s.SetPlugin(plugin)
   218  	return s
   219  }
   220  
   221  // Plugin returns the plugin name which caused this status.
   222  func (s *Status) Plugin() string {
   223  	return s.plugin
   224  }
   225  
   226  // Reasons returns reasons of the Status.
   227  func (s *Status) Reasons() []string {
   228  	if s.err != nil {
   229  		return append([]string{s.err.Error()}, s.reasons...)
   230  	}
   231  	return s.reasons
   232  }
   233  
// AppendReason appends given reason to the Status.
// The receiver must be non-nil: the method writes to a field of s.
func (s *Status) AppendReason(reason string) {
	s.reasons = append(s.reasons, reason)
}
   238  
   239  // IsSuccess returns true if and only if "Status" is nil or Code is "Success".
   240  func (s *Status) IsSuccess() bool {
   241  	return s.Code() == Success
   242  }
   243  
   244  // IsWait returns true if and only if "Status" is non-nil and its Code is "Wait".
   245  func (s *Status) IsWait() bool {
   246  	return s.Code() == Wait
   247  }
   248  
   249  // IsSkip returns true if and only if "Status" is non-nil and its Code is "Skip".
   250  func (s *Status) IsSkip() bool {
   251  	return s.Code() == Skip
   252  }
   253  
   254  // IsRejected returns true if "Status" is Unschedulable (Unschedulable, UnschedulableAndUnresolvable, or Pending).
   255  func (s *Status) IsRejected() bool {
   256  	code := s.Code()
   257  	return code == Unschedulable || code == UnschedulableAndUnresolvable || code == Pending
   258  }
   259  
   260  // AsError returns nil if the status is a success, a wait or a skip; otherwise returns an "error" object
   261  // with a concatenated message on reasons of the Status.
   262  func (s *Status) AsError() error {
   263  	if s.IsSuccess() || s.IsWait() || s.IsSkip() {
   264  		return nil
   265  	}
   266  	if s.err != nil {
   267  		return s.err
   268  	}
   269  	return errors.New(s.Message())
   270  }
   271  
   272  // Equal checks equality of two statuses. This is useful for testing with
   273  // cmp.Equal.
   274  func (s *Status) Equal(x *Status) bool {
   275  	if s == nil || x == nil {
   276  		return s.IsSuccess() && x.IsSuccess()
   277  	}
   278  	if s.code != x.code {
   279  		return false
   280  	}
   281  	if !cmp.Equal(s.err, x.err, cmpopts.EquateErrors()) {
   282  		return false
   283  	}
   284  	if !cmp.Equal(s.reasons, x.reasons) {
   285  		return false
   286  	}
   287  	return cmp.Equal(s.plugin, x.plugin)
   288  }
   289  
   290  // NewStatus makes a Status out of the given arguments and returns its pointer.
   291  func NewStatus(code Code, reasons ...string) *Status {
   292  	s := &Status{
   293  		code:    code,
   294  		reasons: reasons,
   295  	}
   296  	return s
   297  }
   298  
   299  // AsStatus wraps an error in a Status.
   300  func AsStatus(err error) *Status {
   301  	if err == nil {
   302  		return nil
   303  	}
   304  	return &Status{
   305  		code: Error,
   306  		err:  err,
   307  	}
   308  }
   309  
// WaitingPod represents a pod currently waiting in the permit phase.
type WaitingPod interface {
	// GetPod returns a reference to the waiting pod.
	GetPod() *v1.Pod
	// GetPendingPlugins returns the names of the pending Permit plugins.
	GetPendingPlugins() []string
	// Allow declares the waiting pod is allowed to be scheduled by the plugin named as "pluginName".
	// If this is the last remaining plugin to allow, then a success signal is delivered
	// to unblock the pod.
	Allow(pluginName string)
	// Reject declares the waiting pod unschedulable.
	Reject(pluginName, msg string)
}
   323  
// Plugin is the parent type for all the scheduling framework plugins.
// Every plugin interface in this file embeds it.
type Plugin interface {
	// Name returns the name of the plugin.
	Name() string
}
   328  
// PreEnqueuePlugin is an interface that must be implemented by "PreEnqueue" plugins.
// These plugins are called prior to adding Pods to activeQ.
// Note: a PreEnqueue plugin is expected to be lightweight and efficient, so it's not expected to
// involve expensive calls like accessing external endpoints; otherwise it'd block other
// Pods' enqueuing in event handlers.
type PreEnqueuePlugin interface {
	Plugin
	// PreEnqueue is called prior to adding Pods to activeQ.
	PreEnqueue(ctx context.Context, p *v1.Pod) *Status
}
   339  
// LessFunc is the function type used to sort pod info. It takes two
// QueuedPodInfo values and returns a bool.
type LessFunc func(podInfo1, podInfo2 *QueuedPodInfo) bool
   342  
// QueueSortPlugin is an interface that must be implemented by "QueueSort" plugins.
// These plugins are used to sort pods in the scheduling queue. Only one queue sort
// plugin may be enabled at a time.
type QueueSortPlugin interface {
	Plugin
	// Less is used to sort pods in the scheduling queue.
	Less(*QueuedPodInfo, *QueuedPodInfo) bool
}
   351  
// EnqueueExtensions is an optional interface that plugins can implement to efficiently
// move unschedulable Pods in internal scheduling queues.
// In the scheduler, Pods can be unschedulable by PreEnqueue, PreFilter, Filter, Reserve, and Permit plugins,
// and Pods rejected by these plugins are requeued based on this extension point.
// Failures from other extension points are regarded as temporary errors (e.g., network failure),
// and the scheduler requeues Pods without this extension point - always requeue Pods to activeQ after backoff.
// This is because such temporary errors cannot be resolved by specific cluster events,
// and we have no choice but to keep retrying scheduling until the failure is resolved.
//
// Plugins that make pod unschedulable (PreEnqueue, PreFilter, Filter, Reserve, and Permit plugins) should implement this interface,
// otherwise the default implementation will be used, which is less efficient in requeueing Pods rejected by the plugin.
// And, if plugins other than above extension points support this interface, they are just ignored.
type EnqueueExtensions interface {
	Plugin
	// EventsToRegister returns a series of possible events that may make a Pod
	// that was failed by this plugin schedulable. Each event has a callback function that
	// filters out events to reduce useless retry of Pod's scheduling.
	// The events will be registered when instantiating the internal scheduling queue,
	// and leveraged to build event handlers dynamically.
	// Note: the returned list needs to be static (not depend on configuration parameters);
	// otherwise it would lead to undefined behavior.
	//
	// Appropriate implementation of this function will make Pod's re-scheduling accurate and performant.
	EventsToRegister() []ClusterEventWithHint
}
   377  
// PreFilterExtensions is an interface that is included in plugins that allow specifying
// callbacks to make incremental updates to its supposedly pre-calculated
// state.
type PreFilterExtensions interface {
	// AddPod is called by the framework while trying to evaluate the impact
	// of adding podInfoToAdd to the node while scheduling podToSchedule.
	AddPod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToAdd *PodInfo, nodeInfo *NodeInfo) *Status
	// RemovePod is called by the framework while trying to evaluate the impact
	// of removing podInfoToRemove from the node while scheduling podToSchedule.
	RemovePod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToRemove *PodInfo, nodeInfo *NodeInfo) *Status
}
   389  
// PreFilterPlugin is an interface that must be implemented by "PreFilter" plugins.
// These plugins are called at the beginning of the scheduling cycle.
type PreFilterPlugin interface {
	Plugin
	// PreFilter is called at the beginning of the scheduling cycle. All PreFilter
	// plugins must return success or the pod will be rejected. PreFilter could optionally
	// return a PreFilterResult to influence which nodes to evaluate downstream. This is useful
	// for cases where it is possible to determine the subset of nodes to process in O(1) time.
	// When it returns Skip status, the returned PreFilterResult and the other fields in status are ignored,
	// and the coupled Filter plugin/PreFilterExtensions() will be skipped in this scheduling cycle.
	PreFilter(ctx context.Context, state *CycleState, p *v1.Pod) (*PreFilterResult, *Status)
	// PreFilterExtensions returns a PreFilterExtensions interface if the plugin implements one,
	// or nil if it does not. A Pre-filter plugin can provide extensions to incrementally
	// modify its pre-processed info. The framework guarantees that the extensions
	// AddPod/RemovePod will only be called after PreFilter, possibly on a cloned
	// CycleState, and may call those functions more than once before calling
	// Filter again on a specific node.
	PreFilterExtensions() PreFilterExtensions
}
   409  
// FilterPlugin is an interface for Filter plugins. These plugins are called at the
// filter extension point for filtering out hosts that cannot run a pod.
// This concept used to be called 'predicate' in the original scheduler.
// These plugins should return "Success", "Unschedulable" or "Error" in Status.code.
// However, the scheduler accepts other valid codes as well.
// Anything other than "Success" will lead to exclusion of the given host from
// running the pod.
type FilterPlugin interface {
	Plugin
	// Filter is called by the scheduling framework.
	// All FilterPlugins should return "Success" to declare that
	// the given node fits the pod. If Filter doesn't return "Success",
	// it should return "Unschedulable", "UnschedulableAndUnresolvable" or "Error".
	// For the node being evaluated, Filter plugins should look at the passed
	// nodeInfo reference for this particular node's information (e.g., pods
	// considered to be running on the node) instead of looking it up in the
	// NodeInfoSnapshot because we don't guarantee that they will be the same.
	// For example, during preemption, we may pass a copy of the original
	// nodeInfo object that has some pods removed from it to evaluate the
	// possibility of preempting them to schedule the target pod.
	Filter(ctx context.Context, state *CycleState, pod *v1.Pod, nodeInfo *NodeInfo) *Status
}
   432  
// PostFilterPlugin is an interface for "PostFilter" plugins. These plugins are called
// after a pod cannot be scheduled.
type PostFilterPlugin interface {
	Plugin
	// PostFilter is called by the scheduling framework.
	// A PostFilter plugin should return one of the following statuses:
	// - Unschedulable: the plugin gets executed successfully but the pod cannot be made schedulable.
	// - Success: the plugin gets executed successfully and the pod can be made schedulable.
	// - Error: the plugin aborts due to some internal error.
	//
	// Informational plugins should be configured ahead of other ones, and always return Unschedulable status.
	// Optionally, a non-nil PostFilterResult may be returned along with a Success status. For example,
	// a preemption plugin may choose to return nominatedNodeName, so that framework can reuse that to update the
	// preemptor pod's .status.nominatedNodeName field.
	PostFilter(ctx context.Context, state *CycleState, pod *v1.Pod, filteredNodeStatusMap NodeToStatusMap) (*PostFilterResult, *Status)
}
   449  
// PreScorePlugin is an interface for "PreScore" plugin. PreScore is an
// informational extension point. Plugins will be called with a list of nodes
// that passed the filtering phase. A plugin may use this data to update internal
// state or to generate logs/metrics.
type PreScorePlugin interface {
	Plugin
	// PreScore is called by the scheduling framework after a list of nodes
	// passed the filtering phase. All prescore plugins must return success or
	// the pod will be rejected.
	// When it returns Skip status, the other fields in status are ignored,
	// and the coupled Score plugin will be skipped in this scheduling cycle.
	PreScore(ctx context.Context, state *CycleState, pod *v1.Pod, nodes []*v1.Node) *Status
}
   463  
// ScoreExtensions is an interface for Score extended functionality.
type ScoreExtensions interface {
	// NormalizeScore is called for all node scores produced by the same plugin's "Score"
	// method. A successful run of NormalizeScore will update the scores list and return
	// a success status.
	// Note that scores is a NodeScoreList, i.e. a slice, so updates to its
	// elements are visible to the caller.
	NormalizeScore(ctx context.Context, state *CycleState, p *v1.Pod, scores NodeScoreList) *Status
}
   471  
// ScorePlugin is an interface that must be implemented by "Score" plugins to rank
// nodes that passed the filtering phase.
type ScorePlugin interface {
	Plugin
	// Score is called on each filtered node. It must return success and an integer
	// indicating the rank of the node. All scoring plugins must return success or
	// the pod will be rejected.
	Score(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (int64, *Status)

	// ScoreExtensions returns a ScoreExtensions interface if the plugin implements one,
	// or nil if it does not.
	ScoreExtensions() ScoreExtensions
}
   484  
// ReservePlugin is an interface for plugins with Reserve and Unreserve
// methods. These are meant to update the state of the plugin. This concept
// used to be called 'assume' in the original scheduler. These plugins should
// return only Success or Error in Status.code. However, the scheduler accepts
// other valid codes as well. Anything other than Success will lead to
// rejection of the pod.
type ReservePlugin interface {
	Plugin
	// Reserve is called by the scheduling framework when the scheduler cache is
	// updated. If this method returns a failed Status, the scheduler will call
	// the Unreserve method for all enabled ReservePlugins.
	Reserve(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
	// Unreserve is called by the scheduling framework when a reserved pod was
	// rejected, an error occurred during reservation of subsequent plugins, or
	// in a later phase. The Unreserve method implementation must be idempotent
	// and may be called by the scheduler even if the corresponding Reserve
	// method for the same plugin was not called.
	// Unlike Reserve, it returns no Status.
	Unreserve(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string)
}
   504  
// PreBindPlugin is an interface that must be implemented by "PreBind" plugins.
// These plugins are called before a pod is bound.
type PreBindPlugin interface {
	Plugin
	// PreBind is called before binding a pod. All prebind plugins must return
	// success or the pod will be rejected and won't be sent for binding.
	PreBind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
}
   513  
// PostBindPlugin is an interface that must be implemented by "PostBind" plugins.
// These plugins are called after a pod is successfully bound to a node.
type PostBindPlugin interface {
	Plugin
	// PostBind is called after a pod is successfully bound. These plugins are
	// informational. A common application of this extension point is for cleaning
	// up. If a plugin needs to clean up its state after a pod is scheduled and
	// bound, PostBind is the extension point that it should register.
	// PostBind returns no Status.
	PostBind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string)
}
   524  
// PermitPlugin is an interface that must be implemented by "Permit" plugins.
// These plugins are called before a pod is bound to a node.
type PermitPlugin interface {
	Plugin
	// Permit is called before binding a pod (and before prebind plugins). Permit
	// plugins are used to prevent or delay the binding of a Pod. A permit plugin
	// must return success or wait with timeout duration, or the pod will be rejected.
	// The pod will also be rejected if the wait times out or the pod is rejected while
	// waiting. Note that if the plugin returns "wait", the framework will wait only
	// after running the remaining plugins given that no other plugin rejects the pod.
	Permit(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (*Status, time.Duration)
}
   537  
// BindPlugin is an interface that must be implemented by "Bind" plugins. Bind
// plugins are used to bind a pod to a Node.
type BindPlugin interface {
	Plugin
	// Bind is not called until all pre-bind plugins have completed. Each
	// bind plugin is called in the configured order. A bind plugin may choose whether
	// or not to handle the given Pod. If a bind plugin chooses to handle a Pod, the
	// remaining bind plugins are skipped. When a bind plugin does not handle a pod,
	// it must return Skip in its Status code. If a bind plugin returns an Error, the
	// pod is rejected and will not be bound.
	Bind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
}
   550  
// Framework manages the set of plugins in use by the scheduling framework.
// Configured plugins are called at specified points in a scheduling context.
// It embeds Handle, so every Framework also provides the Handle methods.
type Framework interface {
	Handle

	// PreEnqueuePlugins returns the registered preEnqueue plugins.
	PreEnqueuePlugins() []PreEnqueuePlugin

	// EnqueueExtensions returns the registered Enqueue extensions.
	EnqueueExtensions() []EnqueueExtensions

	// QueueSortFunc returns the function to sort pods in scheduling queue.
	QueueSortFunc() LessFunc

	// RunPreFilterPlugins runs the set of configured PreFilter plugins. It returns
	// *Status and its code is set to non-success if any of the plugins returns
	// anything but Success. If a non-success status is returned, then the scheduling
	// cycle is aborted.
	// It also returns a PreFilterResult, which may influence what or how many nodes to
	// evaluate downstream.
	RunPreFilterPlugins(ctx context.Context, state *CycleState, pod *v1.Pod) (*PreFilterResult, *Status)

	// RunPostFilterPlugins runs the set of configured PostFilter plugins.
	// PostFilter plugins can either be informational, in which case they should be configured
	// to execute first and return Unschedulable status, or ones that try to change the
	// cluster state to make the pod potentially schedulable in a future scheduling cycle.
	RunPostFilterPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, filteredNodeStatusMap NodeToStatusMap) (*PostFilterResult, *Status)

	// RunPreBindPlugins runs the set of configured PreBind plugins. It returns
	// *Status and its code is set to non-success if any of the plugins returns
	// anything but Success. If the Status code is "Unschedulable", it is
	// considered as a scheduling check failure, otherwise, it is considered as an
	// internal error. In either case the pod is not going to be bound.
	RunPreBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status

	// RunPostBindPlugins runs the set of configured PostBind plugins.
	RunPostBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string)

	// RunReservePluginsReserve runs the Reserve method of the set of
	// configured Reserve plugins. If any of these calls returns an error, it
	// does not continue running the remaining ones and returns the error. In
	// such case, pod will not be scheduled.
	RunReservePluginsReserve(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status

	// RunReservePluginsUnreserve runs the Unreserve method of the set of
	// configured Reserve plugins.
	RunReservePluginsUnreserve(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string)

	// RunPermitPlugins runs the set of configured Permit plugins. If any of these
	// plugins returns a status other than "Success" or "Wait", it does not continue
	// running the remaining plugins and returns an error. Otherwise, if any of the
	// plugins returns "Wait", then this function will create and add waiting pod
	// to a map of currently waiting pods and return status with "Wait" code.
	// Pod will remain a waiting pod for the minimum duration returned by the Permit plugins.
	RunPermitPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status

	// WaitOnPermit will block, if the pod is a waiting pod, until the waiting pod is rejected or allowed.
	WaitOnPermit(ctx context.Context, pod *v1.Pod) *Status

	// RunBindPlugins runs the set of configured Bind plugins. A Bind plugin may choose
	// whether or not to handle the given Pod. If a Bind plugin chooses to skip the
	// binding, it should return code=5("skip") status. Otherwise, it should return "Error"
	// or "Success". If none of the plugins handled binding, RunBindPlugins returns
	// code=5("skip") status.
	RunBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status

	// HasFilterPlugins returns true if at least one Filter plugin is defined.
	HasFilterPlugins() bool

	// HasPostFilterPlugins returns true if at least one PostFilter plugin is defined.
	HasPostFilterPlugins() bool

	// HasScorePlugins returns true if at least one Score plugin is defined.
	HasScorePlugins() bool

	// ListPlugins returns a map of extension point name to list of configured Plugins.
	ListPlugins() *config.Plugins

	// ProfileName returns the profile name associated to a profile.
	ProfileName() string

	// PercentageOfNodesToScore returns percentageOfNodesToScore associated to a profile.
	PercentageOfNodesToScore() *int32

	// SetPodNominator sets the PodNominator.
	SetPodNominator(nominator PodNominator)
}
   638  
// Handle provides data and some tools that plugins can use. It is
// passed to the plugin factories at the time of plugin initialization. Plugins
// must store and use this handle to call framework functions.
//
// Handle embeds PodNominator and PluginsRunner, so every method of those two
// interfaces is also part of Handle's method set.
type Handle interface {
	// PodNominator abstracts operations to maintain nominated Pods.
	PodNominator
	// PluginsRunner abstracts operations to run some plugins.
	PluginsRunner
	// SnapshotSharedLister returns listers from the latest NodeInfo Snapshot. The snapshot
	// is taken at the beginning of a scheduling cycle and remains unchanged until
	// a pod finishes "Permit" point. There is no guarantee that the information
	// remains unchanged in the binding phase of scheduling, so plugins in the binding
	// cycle (pre-bind/bind/post-bind/un-reserve plugins) should not use it;
	// otherwise a concurrent read/write error might occur. They should use the
	// scheduler cache instead.
	SnapshotSharedLister() SharedLister

	// IterateOverWaitingPods acquires a read lock and iterates over the WaitingPods map,
	// invoking callback for each waiting pod.
	IterateOverWaitingPods(callback func(WaitingPod))

	// GetWaitingPod returns a waiting pod given its UID.
	GetWaitingPod(uid types.UID) WaitingPod

	// RejectWaitingPod rejects a waiting pod given its UID.
	// The return value indicates if the pod is waiting or not.
	RejectWaitingPod(uid types.UID) bool

	// ClientSet returns a kubernetes clientSet.
	ClientSet() clientset.Interface

	// KubeConfig returns the raw kube config.
	KubeConfig() *restclient.Config

	// EventRecorder returns an event recorder.
	EventRecorder() events.EventRecorder

	// SharedInformerFactory returns a shared informer factory.
	SharedInformerFactory() informers.SharedInformerFactory

	// RunFilterPluginsWithNominatedPods runs the set of configured filter plugins for nominated pod on the given node.
	RunFilterPluginsWithNominatedPods(ctx context.Context, state *CycleState, pod *v1.Pod, info *NodeInfo) *Status

	// Extenders returns registered scheduler extenders.
	Extenders() []Extender

	// Parallelizer returns a parallelizer holding parallelism for scheduler.
	Parallelizer() parallelize.Parallelizer
}
   686  
// PreFilterResult wraps needed info for scheduler framework to act upon PreFilter phase.
// A nil *PreFilterResult is a valid value: AllNodes treats it the same as a
// result whose NodeNames is nil.
type PreFilterResult struct {
	// The set of nodes that should be considered downstream; if nil then
	// all nodes are eligible. Note that only a nil set means "all nodes":
	// AllNodes reports false for a non-nil set, even an empty one.
	NodeNames sets.Set[string]
}
   693  
   694  func (p *PreFilterResult) AllNodes() bool {
   695  	return p == nil || p.NodeNames == nil
   696  }
   697  
   698  func (p *PreFilterResult) Merge(in *PreFilterResult) *PreFilterResult {
   699  	if p.AllNodes() && in.AllNodes() {
   700  		return nil
   701  	}
   702  
   703  	r := PreFilterResult{}
   704  	if p.AllNodes() {
   705  		r.NodeNames = in.NodeNames.Clone()
   706  		return &r
   707  	}
   708  	if in.AllNodes() {
   709  		r.NodeNames = p.NodeNames.Clone()
   710  		return &r
   711  	}
   712  
   713  	r.NodeNames = p.NodeNames.Intersection(in.NodeNames)
   714  	return &r
   715  }
   716  
// NominatingMode is the mode carried by a NominatingInfo.
// NOTE(review): the precise effect of each mode is decided by the code that
// consumes NominatingInfo, which is not part of this declaration — confirm
// against that code before relying on the descriptions below.
type NominatingMode int

const (
	// ModeNoop is the zero value of NominatingMode. It is also what
	// (*NominatingInfo).Mode reports for a nil receiver.
	ModeNoop NominatingMode = iota
	// ModeOverride is the mode set by NewPostFilterResultWithNominatedNode.
	// Presumably it requests that the pod's nominated node name be replaced
	// by NominatingInfo.NominatedNodeName — TODO confirm with the consumer.
	ModeOverride
)
   723  
// NominatingInfo pairs a nominated node name with a NominatingMode.
//
// NominatedNodeName holds the node name; NominatingMode is the value returned
// by the Mode method for a non-nil receiver. A nil *NominatingInfo is usable:
// Mode reports ModeNoop for it.
type NominatingInfo struct {
	NominatedNodeName string
	NominatingMode    NominatingMode
}
   728  
// PostFilterResult wraps needed info for scheduler framework to act upon PostFilter phase.
type PostFilterResult struct {
	// NominatingInfo is embedded by pointer, so its fields
	// (NominatedNodeName, NominatingMode) and its Mode method are promoted
	// onto PostFilterResult. The pointer may be nil.
	*NominatingInfo
}
   733  
   734  func NewPostFilterResultWithNominatedNode(name string) *PostFilterResult {
   735  	return &PostFilterResult{
   736  		NominatingInfo: &NominatingInfo{
   737  			NominatedNodeName: name,
   738  			NominatingMode:    ModeOverride,
   739  		},
   740  	}
   741  }
   742  
   743  func (ni *NominatingInfo) Mode() NominatingMode {
   744  	if ni == nil {
   745  		return ModeNoop
   746  	}
   747  	return ni.NominatingMode
   748  }
   749  
// PodNominator abstracts operations to maintain nominated Pods.
type PodNominator interface {
	// AddNominatedPod adds the given pod to the nominator or
	// updates it if it already exists. nominatingInfo accompanies the pod;
	// how it is interpreted is up to the implementation.
	AddNominatedPod(logger klog.Logger, pod *PodInfo, nominatingInfo *NominatingInfo)
	// DeleteNominatedPodIfExists deletes nominatedPod from internal cache. It's a no-op if it doesn't exist.
	DeleteNominatedPodIfExists(pod *v1.Pod)
	// UpdateNominatedPod updates the entry for oldPod with newPodInfo.
	UpdateNominatedPod(logger klog.Logger, oldPod *v1.Pod, newPodInfo *PodInfo)
	// NominatedPodsForNode returns nominatedPods on the given node.
	NominatedPodsForNode(nodeName string) []*PodInfo
}
   762  
// PluginsRunner abstracts operations to run some plugins.
// This is used by preemption PostFilter plugins when evaluating the feasibility of
// scheduling the pod on nodes when certain running pods get evicted.
type PluginsRunner interface {
	// RunPreScorePlugins runs the set of configured PreScore plugins. If any
	// of these plugins returns any status other than "Success", the given pod is rejected.
	// The arguments are, in order: the context, the cycle state, the pod, and a list of nodes.
	RunPreScorePlugins(context.Context, *CycleState, *v1.Pod, []*v1.Node) *Status
	// RunScorePlugins runs the set of configured scoring plugins.
	// It returns a list that stores scores from each plugin and total score for each Node.
	// It also returns *Status, which is set to non-success if any of the plugins returns
	// a non-success status.
	// The arguments are, in order: the context, the cycle state, the pod, and a list of nodes.
	RunScorePlugins(context.Context, *CycleState, *v1.Pod, []*v1.Node) ([]NodePluginScores, *Status)
	// RunFilterPlugins runs the set of configured Filter plugins for pod on
	// the given node. Note that for the node being evaluated, the passed nodeInfo
	// reference could be different from the one in NodeInfoSnapshot map (e.g., pods
	// considered to be running on the node could be different). For example, during
	// preemption, we may pass a copy of the original nodeInfo object that has some pods
	// removed from it to evaluate the possibility of preempting them to
	// schedule the target pod.
	// The arguments are, in order: the context, the cycle state, the pod, and the node's NodeInfo.
	RunFilterPlugins(context.Context, *CycleState, *v1.Pod, *NodeInfo) *Status
	// RunPreFilterExtensionAddPod calls the AddPod interface for the set of configured
	// PreFilter plugins. It returns directly if any of the plugins return any
	// status other than Success.
	RunPreFilterExtensionAddPod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToAdd *PodInfo, nodeInfo *NodeInfo) *Status
	// RunPreFilterExtensionRemovePod calls the RemovePod interface for the set of configured
	// PreFilter plugins. It returns directly if any of the plugins return any
	// status other than Success.
	RunPreFilterExtensionRemovePod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToRemove *PodInfo, nodeInfo *NodeInfo) *Status
}