// Source: k8s.io/kubernetes@v1.29.3/pkg/scheduler/framework/interface.go

/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// This file defines the scheduling framework plugin interfaces.

package framework

import (
	"context"
	"errors"
	"math"
	"strings"
	"sync"
	"time"

	"github.com/google/go-cmp/cmp"
	"github.com/google/go-cmp/cmp/cmpopts"
	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/client-go/informers"
	clientset "k8s.io/client-go/kubernetes"
	restclient "k8s.io/client-go/rest"
	"k8s.io/client-go/tools/events"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/scheduler/apis/config"
	"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
)

// NodeScoreList declares a list of nodes and their scores.
type NodeScoreList []NodeScore

// NodeScore is a struct with node name and score.
type NodeScore struct {
	// Name is the node name.
	Name string
	// Score is the score of the node.
	Score int64
}

// NodeToStatusMap declares map from node name to its status.
type NodeToStatusMap map[string]*Status

// NodePluginScores is a struct with node name and scores for that node.
type NodePluginScores struct {
	// Name is node name.
	Name string
	// Scores is scores from plugins and extenders.
	Scores []PluginScore
	// TotalScore is the total score in Scores.
	TotalScore int64
}

// PluginScore is a struct with plugin/extender name and score.
type PluginScore struct {
	// Name is the name of plugin or extender.
	Name string
	// Score is the score from that plugin or extender.
	Score int64
}

// Code is the Status code/type which is returned from plugins.
type Code int

// These are predefined codes used in a Status.
// Note: when you add a new status, you have to add it in `codes` slice below.
const (
	// Success means that plugin ran correctly and found pod schedulable.
	// NOTE: A nil status is also considered as "Success".
	Success Code = iota
	// Error is one of the failures, used for internal plugin errors, unexpected input, etc.
	// Plugin shouldn't return this code for expected failures, like Unschedulable.
	// Since it's the unexpected failure, the scheduling queue registers the pod without unschedulable plugins.
	// Meaning, the Pod will be requeued to activeQ/backoffQ soon.
	Error
	// Unschedulable is one of the failures, used when a plugin finds a pod unschedulable.
	// If it's returned from PreFilter or Filter, the scheduler might attempt to
	// run other postFilter plugins like preemption to get this pod scheduled.
	// Use UnschedulableAndUnresolvable to make the scheduler skip other postFilter plugins.
	// The accompanying status message should explain why the pod is unschedulable.
	//
	// We regard the backoff as a penalty of wasting the scheduling cycle.
	// When the scheduling queue requeues Pods which were rejected with Unschedulable in the last scheduling,
	// the Pod goes through backoff.
	Unschedulable
	// UnschedulableAndUnresolvable is used when a plugin finds a pod unschedulable and
	// other postFilter plugins like preemption would not change anything.
	// Plugins should return Unschedulable if it is possible that the pod can get scheduled
	// after running other postFilter plugins.
	// The accompanying status message should explain why the pod is unschedulable.
	//
	// We regard the backoff as a penalty of wasting the scheduling cycle.
	// When the scheduling queue requeues Pods which were rejected with Unschedulable in the last scheduling,
	// the Pod goes through backoff.
	UnschedulableAndUnresolvable
	// Wait is used when a Permit plugin finds a pod scheduling should wait.
	Wait
	// Skip is used in the following scenarios:
	// - when a Bind plugin chooses to skip binding.
	// - when a PreFilter plugin returns Skip so that coupled Filter plugin/PreFilterExtensions() will be skipped.
	// - when a PreScore plugin returns Skip so that coupled Score plugin will be skipped.
	Skip
	// Pending means that the scheduling process is finished successfully,
	// but the plugin wants to stop the scheduling cycle/binding cycle here.
	//
	// For example, the DRA plugin sometimes needs to wait for the external device driver
	// to provision the resource for the Pod.
	// It's different from when to return Unschedulable/UnschedulableAndUnresolvable,
	// because in this case, the scheduler decides where the Pod can go successfully,
	// but we need to wait for the external component to do something based on that scheduling result.
	//
	// We regard the backoff as a penalty of wasting the scheduling cycle.
	// In the case of returning Pending, we cannot say the scheduling cycle is wasted
	// because the scheduling result is used to move the Pod's scheduling forward,
	// even though that particular scheduling cycle failed.
	// So, Pods rejected for such reasons don't need to suffer a penalty (backoff).
	// When the scheduling queue requeues Pods which were rejected with Pending in the last scheduling,
	// the Pod goes to activeQ directly ignoring backoff.
	Pending
)

// This list should be exactly the same as the codes iota defined above in the same order.
133 var codes = []string{"Success", "Error", "Unschedulable", "UnschedulableAndUnresolvable", "Wait", "Skip", "Pending"} 134 135 func (c Code) String() string { 136 return codes[c] 137 } 138 139 const ( 140 // MaxNodeScore is the maximum score a Score plugin is expected to return. 141 MaxNodeScore int64 = 100 142 143 // MinNodeScore is the minimum score a Score plugin is expected to return. 144 MinNodeScore int64 = 0 145 146 // MaxTotalScore is the maximum total score. 147 MaxTotalScore int64 = math.MaxInt64 148 ) 149 150 // PodsToActivateKey is a reserved state key for stashing pods. 151 // If the stashed pods are present in unschedulablePods or backoffQ,they will be 152 // activated (i.e., moved to activeQ) in two phases: 153 // - end of a scheduling cycle if it succeeds (will be cleared from `PodsToActivate` if activated) 154 // - end of a binding cycle if it succeeds 155 var PodsToActivateKey StateKey = "kubernetes.io/pods-to-activate" 156 157 // PodsToActivate stores pods to be activated. 158 type PodsToActivate struct { 159 sync.Mutex 160 // Map is keyed with namespaced pod name, and valued with the pod. 161 Map map[string]*v1.Pod 162 } 163 164 // Clone just returns the same state. 165 func (s *PodsToActivate) Clone() StateData { 166 return s 167 } 168 169 // NewPodsToActivate instantiates a PodsToActivate object. 170 func NewPodsToActivate() *PodsToActivate { 171 return &PodsToActivate{Map: make(map[string]*v1.Pod)} 172 } 173 174 // Status indicates the result of running a plugin. It consists of a code, a 175 // message, (optionally) an error, and a plugin name it fails by. 176 // When the status code is not Success, the reasons should explain why. 177 // And, when code is Success, all the other fields should be empty. 178 // NOTE: A nil Status is also considered as Success. 179 type Status struct { 180 code Code 181 reasons []string 182 err error 183 // plugin is an optional field that records the plugin name causes this status. 
184 // It's set by the framework when code is Unschedulable, UnschedulableAndUnresolvable or Pending. 185 plugin string 186 } 187 188 func (s *Status) WithError(err error) *Status { 189 s.err = err 190 return s 191 } 192 193 // Code returns code of the Status. 194 func (s *Status) Code() Code { 195 if s == nil { 196 return Success 197 } 198 return s.code 199 } 200 201 // Message returns a concatenated message on reasons of the Status. 202 func (s *Status) Message() string { 203 if s == nil { 204 return "" 205 } 206 return strings.Join(s.Reasons(), ", ") 207 } 208 209 // SetPlugin sets the given plugin name to s.plugin. 210 func (s *Status) SetPlugin(plugin string) { 211 s.plugin = plugin 212 } 213 214 // WithPlugin sets the given plugin name to s.plugin, 215 // and returns the given status object. 216 func (s *Status) WithPlugin(plugin string) *Status { 217 s.SetPlugin(plugin) 218 return s 219 } 220 221 // Plugin returns the plugin name which caused this status. 222 func (s *Status) Plugin() string { 223 return s.plugin 224 } 225 226 // Reasons returns reasons of the Status. 227 func (s *Status) Reasons() []string { 228 if s.err != nil { 229 return append([]string{s.err.Error()}, s.reasons...) 230 } 231 return s.reasons 232 } 233 234 // AppendReason appends given reason to the Status. 235 func (s *Status) AppendReason(reason string) { 236 s.reasons = append(s.reasons, reason) 237 } 238 239 // IsSuccess returns true if and only if "Status" is nil or Code is "Success". 240 func (s *Status) IsSuccess() bool { 241 return s.Code() == Success 242 } 243 244 // IsWait returns true if and only if "Status" is non-nil and its Code is "Wait". 245 func (s *Status) IsWait() bool { 246 return s.Code() == Wait 247 } 248 249 // IsSkip returns true if and only if "Status" is non-nil and its Code is "Skip". 
250 func (s *Status) IsSkip() bool { 251 return s.Code() == Skip 252 } 253 254 // IsRejected returns true if "Status" is Unschedulable (Unschedulable, UnschedulableAndUnresolvable, or Pending). 255 func (s *Status) IsRejected() bool { 256 code := s.Code() 257 return code == Unschedulable || code == UnschedulableAndUnresolvable || code == Pending 258 } 259 260 // AsError returns nil if the status is a success, a wait or a skip; otherwise returns an "error" object 261 // with a concatenated message on reasons of the Status. 262 func (s *Status) AsError() error { 263 if s.IsSuccess() || s.IsWait() || s.IsSkip() { 264 return nil 265 } 266 if s.err != nil { 267 return s.err 268 } 269 return errors.New(s.Message()) 270 } 271 272 // Equal checks equality of two statuses. This is useful for testing with 273 // cmp.Equal. 274 func (s *Status) Equal(x *Status) bool { 275 if s == nil || x == nil { 276 return s.IsSuccess() && x.IsSuccess() 277 } 278 if s.code != x.code { 279 return false 280 } 281 if !cmp.Equal(s.err, x.err, cmpopts.EquateErrors()) { 282 return false 283 } 284 if !cmp.Equal(s.reasons, x.reasons) { 285 return false 286 } 287 return cmp.Equal(s.plugin, x.plugin) 288 } 289 290 // NewStatus makes a Status out of the given arguments and returns its pointer. 291 func NewStatus(code Code, reasons ...string) *Status { 292 s := &Status{ 293 code: code, 294 reasons: reasons, 295 } 296 return s 297 } 298 299 // AsStatus wraps an error in a Status. 300 func AsStatus(err error) *Status { 301 if err == nil { 302 return nil 303 } 304 return &Status{ 305 code: Error, 306 err: err, 307 } 308 } 309 310 // WaitingPod represents a pod currently waiting in the permit phase. 311 type WaitingPod interface { 312 // GetPod returns a reference to the waiting pod. 313 GetPod() *v1.Pod 314 // GetPendingPlugins returns a list of pending Permit plugin's name. 
315 GetPendingPlugins() []string 316 // Allow declares the waiting pod is allowed to be scheduled by the plugin named as "pluginName". 317 // If this is the last remaining plugin to allow, then a success signal is delivered 318 // to unblock the pod. 319 Allow(pluginName string) 320 // Reject declares the waiting pod unschedulable. 321 Reject(pluginName, msg string) 322 } 323 324 // Plugin is the parent type for all the scheduling framework plugins. 325 type Plugin interface { 326 Name() string 327 } 328 329 // PreEnqueuePlugin is an interface that must be implemented by "PreEnqueue" plugins. 330 // These plugins are called prior to adding Pods to activeQ. 331 // Note: an preEnqueue plugin is expected to be lightweight and efficient, so it's not expected to 332 // involve expensive calls like accessing external endpoints; otherwise it'd block other 333 // Pods' enqueuing in event handlers. 334 type PreEnqueuePlugin interface { 335 Plugin 336 // PreEnqueue is called prior to adding Pods to activeQ. 337 PreEnqueue(ctx context.Context, p *v1.Pod) *Status 338 } 339 340 // LessFunc is the function to sort pod info 341 type LessFunc func(podInfo1, podInfo2 *QueuedPodInfo) bool 342 343 // QueueSortPlugin is an interface that must be implemented by "QueueSort" plugins. 344 // These plugins are used to sort pods in the scheduling queue. Only one queue sort 345 // plugin may be enabled at a time. 346 type QueueSortPlugin interface { 347 Plugin 348 // Less are used to sort pods in the scheduling queue. 349 Less(*QueuedPodInfo, *QueuedPodInfo) bool 350 } 351 352 // EnqueueExtensions is an optional interface that plugins can implement to efficiently 353 // move unschedulable Pods in internal scheduling queues. 354 // In the scheduler, Pods can be unschedulable by PreEnqueue, PreFilter, Filter, Reserve, and Permit plugins, 355 // and Pods rejected by these plugins are requeued based on this extension point. 
// Failures from other extension points are regarded as temporary errors (e.g., network failure),
// and the scheduler requeues Pods without this extension point - always requeues Pods to activeQ after backoff.
// This is because such temporary errors cannot be resolved by specific cluster events,
// and we have no choice but to keep retrying scheduling until the failure is resolved.
//
// Plugins that make pod unschedulable (PreEnqueue, PreFilter, Filter, Reserve, and Permit plugins) should implement this interface,
// otherwise the default implementation will be used, which is less efficient in requeueing Pods rejected by the plugin.
// And, if plugins other than above extension points support this interface, they are just ignored.
type EnqueueExtensions interface {
	Plugin
	// EventsToRegister returns a series of possible events that may make a Pod
	// that was failed by this plugin schedulable. Each event has a callback function that
	// filters out events to reduce useless retries of the Pod's scheduling.
	// The events will be registered when instantiating the internal scheduling queue,
	// and leveraged to build event handlers dynamically.
	// Note: the returned list needs to be static (not depend on configuration parameters);
	// otherwise it would lead to undefined behavior.
	//
	// Appropriate implementation of this function will make Pod's re-scheduling accurate and performant.
	EventsToRegister() []ClusterEventWithHint
}

// PreFilterExtensions is an interface that is included in plugins that allow specifying
// callbacks to make incremental updates to its supposedly pre-calculated
// state.
type PreFilterExtensions interface {
	// AddPod is called by the framework while trying to evaluate the impact
	// of adding podInfoToAdd to the node while scheduling podToSchedule.
	AddPod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToAdd *PodInfo, nodeInfo *NodeInfo) *Status
	// RemovePod is called by the framework while trying to evaluate the impact
	// of removing podInfoToRemove from the node while scheduling podToSchedule.
	RemovePod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToRemove *PodInfo, nodeInfo *NodeInfo) *Status
}

// PreFilterPlugin is an interface that must be implemented by "PreFilter" plugins.
// These plugins are called at the beginning of the scheduling cycle.
type PreFilterPlugin interface {
	Plugin
	// PreFilter is called at the beginning of the scheduling cycle. All PreFilter
	// plugins must return success or the pod will be rejected. PreFilter could optionally
	// return a PreFilterResult to influence which nodes to evaluate downstream. This is useful
	// for cases where it is possible to determine the subset of nodes to process in O(1) time.
	// When it returns Skip status, returned PreFilterResult and other fields in status are just ignored,
	// and coupled Filter plugin/PreFilterExtensions() will be skipped in this scheduling cycle.
	PreFilter(ctx context.Context, state *CycleState, p *v1.Pod) (*PreFilterResult, *Status)
	// PreFilterExtensions returns a PreFilterExtensions interface if the plugin implements one,
	// or nil if it does not. A Pre-filter plugin can provide extensions to incrementally
	// modify its pre-processed info. The framework guarantees that the extensions
	// AddPod/RemovePod will only be called after PreFilter, possibly on a cloned
	// CycleState, and may call those functions more than once before calling
	// Filter again on a specific node.
	PreFilterExtensions() PreFilterExtensions
}

// FilterPlugin is an interface for Filter plugins. These plugins are called at the
// filter extension point for filtering out hosts that cannot run a pod.
// This concept used to be called 'predicate' in the original scheduler.
// These plugins should return "Success", "Unschedulable" or "Error" in Status.code.
// However, the scheduler accepts other valid codes as well.
// Anything other than "Success" will lead to exclusion of the given host from
// running the pod.
type FilterPlugin interface {
	Plugin
	// Filter is called by the scheduling framework.
	// All FilterPlugins should return "Success" to declare that
	// the given node fits the pod. If Filter doesn't return "Success",
	// it should return "Unschedulable", "UnschedulableAndUnresolvable" or "Error".
	// For the node being evaluated, Filter plugins should look at the passed
	// nodeInfo reference for this particular node's information (e.g., pods
	// considered to be running on the node) instead of looking it up in the
	// NodeInfoSnapshot because we don't guarantee that they will be the same.
	// For example, during preemption, we may pass a copy of the original
	// nodeInfo object that has some pods removed from it to evaluate the
	// possibility of preempting them to schedule the target pod.
	Filter(ctx context.Context, state *CycleState, pod *v1.Pod, nodeInfo *NodeInfo) *Status
}

// PostFilterPlugin is an interface for "PostFilter" plugins. These plugins are called
// after a pod cannot be scheduled.
type PostFilterPlugin interface {
	Plugin
	// PostFilter is called by the scheduling framework.
	// A PostFilter plugin should return one of the following statuses:
	// - Unschedulable: the plugin gets executed successfully but the pod cannot be made schedulable.
	// - Success: the plugin gets executed successfully and the pod can be made schedulable.
	// - Error: the plugin aborts due to some internal error.
	//
	// Informational plugins should be configured ahead of other ones, and always return Unschedulable status.
	// Optionally, a non-nil PostFilterResult may be returned along with a Success status. For example,
	// a preemption plugin may choose to return nominatedNodeName, so that framework can reuse that to update the
	// preemptor pod's .status.nominatedNodeName field.
	PostFilter(ctx context.Context, state *CycleState, pod *v1.Pod, filteredNodeStatusMap NodeToStatusMap) (*PostFilterResult, *Status)
}

// PreScorePlugin is an interface for "PreScore" plugin. PreScore is an
// informational extension point. Plugins will be called with a list of nodes
// that passed the filtering phase. A plugin may use this data to update internal
// state or to generate logs/metrics.
type PreScorePlugin interface {
	Plugin
	// PreScore is called by the scheduling framework after a list of nodes
	// passed the filtering phase. All prescore plugins must return success or
	// the pod will be rejected.
	// When it returns Skip status, other fields in status are just ignored,
	// and coupled Score plugin will be skipped in this scheduling cycle.
	PreScore(ctx context.Context, state *CycleState, pod *v1.Pod, nodes []*v1.Node) *Status
}

// ScoreExtensions is an interface for Score extended functionality.
type ScoreExtensions interface {
	// NormalizeScore is called for all node scores produced by the same plugin's "Score"
	// method. A successful run of NormalizeScore will update the scores list and return
	// a success status.
	NormalizeScore(ctx context.Context, state *CycleState, p *v1.Pod, scores NodeScoreList) *Status
}

// ScorePlugin is an interface that must be implemented by "Score" plugins to rank
// nodes that passed the filtering phase.
type ScorePlugin interface {
	Plugin
	// Score is called on each filtered node. It must return success and an integer
	// indicating the rank of the node. All scoring plugins must return success or
	// the pod will be rejected.
	Score(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (int64, *Status)

	// ScoreExtensions returns a ScoreExtensions interface if it implements one, or nil if it does not.
	ScoreExtensions() ScoreExtensions
}

// ReservePlugin is an interface for plugins with Reserve and Unreserve
// methods. These are meant to update the state of the plugin. This concept
// used to be called 'assume' in the original scheduler. These plugins should
// return only Success or Error in Status.code. However, the scheduler accepts
// other valid codes as well. Anything other than Success will lead to
// rejection of the pod.
type ReservePlugin interface {
	Plugin
	// Reserve is called by the scheduling framework when the scheduler cache is
	// updated. If this method returns a failed Status, the scheduler will call
	// the Unreserve method for all enabled ReservePlugins.
	Reserve(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
	// Unreserve is called by the scheduling framework when a reserved pod was
	// rejected, an error occurred during reservation of subsequent plugins, or
	// in a later phase. The Unreserve method implementation must be idempotent
	// and may be called by the scheduler even if the corresponding Reserve
	// method for the same plugin was not called.
	Unreserve(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string)
}

// PreBindPlugin is an interface that must be implemented by "PreBind" plugins.
// These plugins are called before a pod is bound.
type PreBindPlugin interface {
	Plugin
	// PreBind is called before binding a pod. All prebind plugins must return
	// success or the pod will be rejected and won't be sent for binding.
	PreBind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
}

// PostBindPlugin is an interface that must be implemented by "PostBind" plugins.
// These plugins are called after a pod is successfully bound to a node.
type PostBindPlugin interface {
	Plugin
	// PostBind is called after a pod is successfully bound. These plugins are
	// informational. A common application of this extension point is for cleaning
	// up. If a plugin needs to clean-up its state after a pod is scheduled and
	// bound, PostBind is the extension point that it should register.
	PostBind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string)
}

// PermitPlugin is an interface that must be implemented by "Permit" plugins.
// These plugins are called before a pod is bound to a node.
type PermitPlugin interface {
	Plugin
	// Permit is called before binding a pod (and before prebind plugins). Permit
	// plugins are used to prevent or delay the binding of a Pod. A permit plugin
	// must return success or wait with timeout duration, or the pod will be rejected.
	// The pod will also be rejected if the wait times out or the pod is rejected while
	// waiting. Note that if the plugin returns "wait", the framework will wait only
	// after running the remaining plugins given that no other plugin rejects the pod.
	Permit(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) (*Status, time.Duration)
}

// BindPlugin is an interface that must be implemented by "Bind" plugins. Bind
// plugins are used to bind a pod to a Node.
type BindPlugin interface {
	Plugin
	// Bind plugins will not be called until all pre-bind plugins have completed. Each
	// bind plugin is called in the configured order. A bind plugin may choose whether
	// or not to handle the given Pod. If a bind plugin chooses to handle a Pod, the
	// remaining bind plugins are skipped. When a bind plugin does not handle a pod,
	// it must return Skip in its Status code. If a bind plugin returns an Error, the
	// pod is rejected and will not be bound.
	Bind(ctx context.Context, state *CycleState, p *v1.Pod, nodeName string) *Status
}

// Framework manages the set of plugins in use by the scheduling framework.
// Configured plugins are called at specified points in a scheduling context.
type Framework interface {
	Handle

	// PreEnqueuePlugins returns the registered preEnqueue plugins.
	PreEnqueuePlugins() []PreEnqueuePlugin

	// EnqueueExtensions returns the registered Enqueue extensions.
	EnqueueExtensions() []EnqueueExtensions

	// QueueSortFunc returns the function to sort pods in scheduling queue.
	QueueSortFunc() LessFunc

	// RunPreFilterPlugins runs the set of configured PreFilter plugins. It returns
	// *Status and its code is set to non-success if any of the plugins returns
	// anything but Success. If a non-success status is returned, then the scheduling
	// cycle is aborted.
	// It also returns a PreFilterResult, which may influence what or how many nodes to
	// evaluate downstream.
	RunPreFilterPlugins(ctx context.Context, state *CycleState, pod *v1.Pod) (*PreFilterResult, *Status)

	// RunPostFilterPlugins runs the set of configured PostFilter plugins.
	// PostFilter plugins can either be informational, in which case they should be configured
	// to execute first and return Unschedulable status, or ones that try to change the
	// cluster state to make the pod potentially schedulable in a future scheduling cycle.
	RunPostFilterPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, filteredNodeStatusMap NodeToStatusMap) (*PostFilterResult, *Status)

	// RunPreBindPlugins runs the set of configured PreBind plugins. It returns
	// *Status and its code is set to non-success if any of the plugins returns
	// anything but Success. If the Status code is "Unschedulable", it is
	// considered as a scheduling check failure, otherwise, it is considered as an
	// internal error. In either case the pod is not going to be bound.
	RunPreBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status

	// RunPostBindPlugins runs the set of configured PostBind plugins.
	RunPostBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string)

	// RunReservePluginsReserve runs the Reserve method of the set of
	// configured Reserve plugins. If any of these calls returns an error, it
	// does not continue running the remaining ones and returns the error. In
	// such case, pod will not be scheduled.
	RunReservePluginsReserve(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status

	// RunReservePluginsUnreserve runs the Unreserve method of the set of
	// configured Reserve plugins.
	RunReservePluginsUnreserve(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string)

	// RunPermitPlugins runs the set of configured Permit plugins. If any of these
	// plugins returns a status other than "Success" or "Wait", it does not continue
	// running the remaining plugins and returns an error. Otherwise, if any of the
	// plugins returns "Wait", then this function will create and add waiting pod
	// to a map of currently waiting pods and return status with "Wait" code.
	// Pod will remain waiting pod for the minimum duration returned by the Permit plugins.
	RunPermitPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status

	// WaitOnPermit will block, if the pod is a waiting pod, until the waiting pod is rejected or allowed.
	WaitOnPermit(ctx context.Context, pod *v1.Pod) *Status

	// RunBindPlugins runs the set of configured Bind plugins. A Bind plugin may choose
	// whether or not to handle the given Pod. If a Bind plugin chooses to skip the
	// binding, it should return code=5("skip") status. Otherwise, it should return "Error"
	// or "Success". If none of the plugins handled binding, RunBindPlugins returns
	// code=5("skip") status.
	RunBindPlugins(ctx context.Context, state *CycleState, pod *v1.Pod, nodeName string) *Status

	// HasFilterPlugins returns true if at least one Filter plugin is defined.
	HasFilterPlugins() bool

	// HasPostFilterPlugins returns true if at least one PostFilter plugin is defined.
	HasPostFilterPlugins() bool

	// HasScorePlugins returns true if at least one Score plugin is defined.
	HasScorePlugins() bool

	// ListPlugins returns a map of extension point name to list of configured Plugins.
	ListPlugins() *config.Plugins

	// ProfileName returns the profile name associated to a profile.
	ProfileName() string

	// PercentageOfNodesToScore returns percentageOfNodesToScore associated to a profile.
	PercentageOfNodesToScore() *int32

	// SetPodNominator sets the PodNominator.
	SetPodNominator(nominator PodNominator)
}

// Handle provides data and some tools that plugins can use. It is
// passed to the plugin factories at the time of plugin initialization. Plugins
// must store and use this handle to call framework functions.
type Handle interface {
	// PodNominator abstracts operations to maintain nominated Pods.
	PodNominator
	// PluginsRunner abstracts operations to run some plugins.
	PluginsRunner
	// SnapshotSharedLister returns listers from the latest NodeInfo Snapshot. The snapshot
	// is taken at the beginning of a scheduling cycle and remains unchanged until
	// a pod finishes "Permit" point. There is no guarantee that the information
	// remains unchanged in the binding phase of scheduling, so plugins in the binding
	// cycle (pre-bind/bind/post-bind/un-reserve plugin) should not use it,
	// otherwise a concurrent read/write error might occur, they should use scheduler
	// cache instead.
	SnapshotSharedLister() SharedLister

	// IterateOverWaitingPods acquires a read lock and iterates over the WaitingPods map.
	IterateOverWaitingPods(callback func(WaitingPod))

	// GetWaitingPod returns a waiting pod given its UID.
	GetWaitingPod(uid types.UID) WaitingPod

	// RejectWaitingPod rejects a waiting pod given its UID.
	// The return value indicates if the pod is waiting or not.
	RejectWaitingPod(uid types.UID) bool

	// ClientSet returns a kubernetes clientSet.
	ClientSet() clientset.Interface

	// KubeConfig returns the raw kube config.
	KubeConfig() *restclient.Config

	// EventRecorder returns an event recorder.
	EventRecorder() events.EventRecorder

	// SharedInformerFactory returns a shared informer factory.
	SharedInformerFactory() informers.SharedInformerFactory

	// RunFilterPluginsWithNominatedPods runs the set of configured filter plugins for nominated pod on the given node.
	RunFilterPluginsWithNominatedPods(ctx context.Context, state *CycleState, pod *v1.Pod, info *NodeInfo) *Status

	// Extenders returns registered scheduler extenders.
	Extenders() []Extender

	// Parallelizer returns a parallelizer holding parallelism for scheduler.
	Parallelizer() parallelize.Parallelizer
}

// PreFilterResult wraps needed info for scheduler framework to act upon PreFilter phase.
type PreFilterResult struct {
	// The set of nodes that should be considered downstream; if nil then
	// all nodes are eligible.
	NodeNames sets.Set[string]
}

// AllNodes reports whether the result places no restriction on the nodes to
// consider, i.e. whether p itself is nil or p.NodeNames is nil.
// It is safe to call on a nil receiver.
func (p *PreFilterResult) AllNodes() bool {
	return p == nil || p.NodeNames == nil
}

// Merge combines p with in:
//   - if both allow all nodes, it returns nil;
//   - if exactly one of them allows all nodes, it returns a newly allocated
//     PreFilterResult holding a clone of the other one's NodeNames;
//   - otherwise it returns a newly allocated PreFilterResult holding the
//     intersection of both NodeNames sets.
//
// Both p and in may be nil.
func (p *PreFilterResult) Merge(in *PreFilterResult) *PreFilterResult {
	if p.AllNodes() && in.AllNodes() {
		return nil
	}

	r := PreFilterResult{}
	if p.AllNodes() {
		r.NodeNames = in.NodeNames.Clone()
		return &r
	}
	if in.AllNodes() {
		r.NodeNames = p.NodeNames.Clone()
		return &r
	}

	r.NodeNames = p.NodeNames.Intersection(in.NodeNames)
	return &r
}

// NominatingMode is the mode carried by a NominatingInfo.
type NominatingMode int

const (
	// ModeNoop is the zero value; it is also what Mode() reports for a nil NominatingInfo.
	ModeNoop NominatingMode = iota
	// ModeOverride is the mode set by NewPostFilterResultWithNominatedNode.
	ModeOverride
)

// NominatingInfo pairs a nominated node name with a NominatingMode.
type NominatingInfo struct {
	NominatedNodeName string
	NominatingMode    NominatingMode
}

// PostFilterResult wraps needed info for scheduler framework to act upon PostFilter phase.
type PostFilterResult struct {
	*NominatingInfo
}

// NewPostFilterResultWithNominatedNode returns a PostFilterResult whose
// NominatingInfo holds the given node name with mode ModeOverride.
func NewPostFilterResultWithNominatedNode(name string) *PostFilterResult {
	return &PostFilterResult{
		NominatingInfo: &NominatingInfo{
			NominatedNodeName: name,
			NominatingMode:    ModeOverride,
		},
	}
}

// Mode returns the NominatingMode of ni, or ModeNoop when ni is nil.
func (ni *NominatingInfo) Mode() NominatingMode {
	if ni == nil {
		return ModeNoop
	}
	return ni.NominatingMode
}

// PodNominator abstracts operations to maintain nominated Pods.
type PodNominator interface {
	// AddNominatedPod adds the given pod to the nominator or
	// updates it if it already exists.
	AddNominatedPod(logger klog.Logger, pod *PodInfo, nominatingInfo *NominatingInfo)
	// DeleteNominatedPodIfExists deletes nominatedPod from internal cache. It's a no-op if it doesn't exist.
	DeleteNominatedPodIfExists(pod *v1.Pod)
	// UpdateNominatedPod updates the <oldPod> with <newPodInfo>.
	UpdateNominatedPod(logger klog.Logger, oldPod *v1.Pod, newPodInfo *PodInfo)
	// NominatedPodsForNode returns nominatedPods on the given node.
	NominatedPodsForNode(nodeName string) []*PodInfo
}

// PluginsRunner abstracts operations to run some plugins.
// This is used by preemption PostFilter plugins when evaluating the feasibility of
// scheduling the pod on nodes when certain running pods get evicted.
type PluginsRunner interface {
	// RunPreScorePlugins runs the set of configured PreScore plugins. If any
	// of these plugins returns any status other than "Success", the given pod is rejected.
	RunPreScorePlugins(context.Context, *CycleState, *v1.Pod, []*v1.Node) *Status
	// RunScorePlugins runs the set of configured scoring plugins.
	// It returns a list that stores scores from each plugin and total score for each Node.
	// It also returns *Status, which is set to non-success if any of the plugins returns
	// a non-success status.
	RunScorePlugins(context.Context, *CycleState, *v1.Pod, []*v1.Node) ([]NodePluginScores, *Status)
	// RunFilterPlugins runs the set of configured Filter plugins for pod on
	// the given node. Note that for the node being evaluated, the passed nodeInfo
	// reference could be different from the one in NodeInfoSnapshot map (e.g., pods
	// considered to be running on the node could be different). For example, during
	// preemption, we may pass a copy of the original nodeInfo object that has some pods
	// removed from it to evaluate the possibility of preempting them to
	// schedule the target pod.
	RunFilterPlugins(context.Context, *CycleState, *v1.Pod, *NodeInfo) *Status
	// RunPreFilterExtensionAddPod calls the AddPod interface for the set of configured
	// PreFilter plugins. It returns directly if any of the plugins return any
	// status other than Success.
786 RunPreFilterExtensionAddPod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToAdd *PodInfo, nodeInfo *NodeInfo) *Status 787 // RunPreFilterExtensionRemovePod calls the RemovePod interface for the set of configured 788 // PreFilter plugins. It returns directly if any of the plugins return any 789 // status other than Success. 790 RunPreFilterExtensionRemovePod(ctx context.Context, state *CycleState, podToSchedule *v1.Pod, podInfoToRemove *PodInfo, nodeInfo *NodeInfo) *Status 791 }