k8s.io/kubernetes@v1.29.3/pkg/scheduler/scheduler.go (about) 1 /* 2 Copyright 2014 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package scheduler 18 19 import ( 20 "context" 21 "errors" 22 "fmt" 23 "time" 24 25 v1 "k8s.io/api/core/v1" 26 "k8s.io/apimachinery/pkg/api/meta" 27 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 28 "k8s.io/apimachinery/pkg/util/wait" 29 utilfeature "k8s.io/apiserver/pkg/util/feature" 30 "k8s.io/client-go/dynamic/dynamicinformer" 31 "k8s.io/client-go/informers" 32 coreinformers "k8s.io/client-go/informers/core/v1" 33 clientset "k8s.io/client-go/kubernetes" 34 restclient "k8s.io/client-go/rest" 35 "k8s.io/client-go/tools/cache" 36 "k8s.io/klog/v2" 37 configv1 "k8s.io/kube-scheduler/config/v1" 38 "k8s.io/kubernetes/pkg/features" 39 schedulerapi "k8s.io/kubernetes/pkg/scheduler/apis/config" 40 "k8s.io/kubernetes/pkg/scheduler/apis/config/scheme" 41 "k8s.io/kubernetes/pkg/scheduler/framework" 42 "k8s.io/kubernetes/pkg/scheduler/framework/parallelize" 43 frameworkplugins "k8s.io/kubernetes/pkg/scheduler/framework/plugins" 44 "k8s.io/kubernetes/pkg/scheduler/framework/plugins/noderesources" 45 frameworkruntime "k8s.io/kubernetes/pkg/scheduler/framework/runtime" 46 internalcache "k8s.io/kubernetes/pkg/scheduler/internal/cache" 47 cachedebugger "k8s.io/kubernetes/pkg/scheduler/internal/cache/debugger" 48 internalqueue "k8s.io/kubernetes/pkg/scheduler/internal/queue" 49 "k8s.io/kubernetes/pkg/scheduler/metrics" 50 "k8s.io/kubernetes/pkg/scheduler/profile" 51 ) 52 53 const ( 54 // Duration the scheduler will wait before expiring an assumed pod. 55 // See issue #106361 for more details about this parameter and its value. 56 durationToExpireAssumedPod time.Duration = 0 57 ) 58 59 // ErrNoNodesAvailable is used to describe the error that no nodes available to schedule pods. 60 var ErrNoNodesAvailable = fmt.Errorf("no nodes available to schedule pods") 61 62 // Scheduler watches for new unscheduled pods. It attempts to find 63 // nodes that they fit on and writes bindings back to the api server. 64 type Scheduler struct { 65 // It is expected that changes made via Cache will be observed 66 // by NodeLister and Algorithm. 67 Cache internalcache.Cache 68 69 Extenders []framework.Extender 70 71 // NextPod should be a function that blocks until the next pod 72 // is available. We don't use a channel for this, because scheduling 73 // a pod may take some amount of time and we don't want pods to get 74 // stale while they sit in a channel. 75 NextPod func(logger klog.Logger) (*framework.QueuedPodInfo, error) 76 77 // FailureHandler is called upon a scheduling failure. 78 FailureHandler FailureHandlerFn 79 80 // SchedulePod tries to schedule the given pod to one of the nodes in the node list. 81 // Return a struct of ScheduleResult with the name of suggested host on success, 82 // otherwise will return a FitError with reasons. 83 SchedulePod func(ctx context.Context, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) (ScheduleResult, error) 84 85 // Close this to shut down the scheduler. 86 StopEverything <-chan struct{} 87 88 // SchedulingQueue holds pods to be scheduled 89 SchedulingQueue internalqueue.SchedulingQueue 90 91 // Profiles are the scheduling profiles. 92 Profiles profile.Map 93 94 client clientset.Interface 95 96 nodeInfoSnapshot *internalcache.Snapshot 97 98 percentageOfNodesToScore int32 99 100 nextStartNodeIndex int 101 102 // logger *must* be initialized when creating a Scheduler, 103 // otherwise logging functions will access a nil sink and 104 // panic. 105 logger klog.Logger 106 107 // registeredHandlers contains the registrations of all handlers. It's used to check if all handlers have finished syncing before the scheduling cycles start. 108 registeredHandlers []cache.ResourceEventHandlerRegistration 109 } 110 111 func (sched *Scheduler) applyDefaultHandlers() { 112 sched.SchedulePod = sched.schedulePod 113 sched.FailureHandler = sched.handleSchedulingFailure 114 } 115 116 type schedulerOptions struct { 117 componentConfigVersion string 118 kubeConfig *restclient.Config 119 // Overridden by profile level percentageOfNodesToScore if set in v1. 120 percentageOfNodesToScore int32 121 podInitialBackoffSeconds int64 122 podMaxBackoffSeconds int64 123 podMaxInUnschedulablePodsDuration time.Duration 124 // Contains out-of-tree plugins to be merged with the in-tree registry. 125 frameworkOutOfTreeRegistry frameworkruntime.Registry 126 profiles []schedulerapi.KubeSchedulerProfile 127 extenders []schedulerapi.Extender 128 frameworkCapturer FrameworkCapturer 129 parallelism int32 130 applyDefaultProfile bool 131 } 132 133 // Option configures a Scheduler 134 type Option func(*schedulerOptions) 135 136 // ScheduleResult represents the result of scheduling a pod. 137 type ScheduleResult struct { 138 // Name of the selected node. 139 SuggestedHost string 140 // The number of nodes the scheduler evaluated the pod against in the filtering 141 // phase and beyond. 142 EvaluatedNodes int 143 // The number of nodes out of the evaluated ones that fit the pod. 144 FeasibleNodes int 145 // The nominating info for scheduling cycle. 146 nominatingInfo *framework.NominatingInfo 147 } 148 149 // WithComponentConfigVersion sets the component config version to the 150 // KubeSchedulerConfiguration version used. The string should be the full 151 // scheme group/version of the external type we converted from (for example 152 // "kubescheduler.config.k8s.io/v1") 153 func WithComponentConfigVersion(apiVersion string) Option { 154 return func(o *schedulerOptions) { 155 o.componentConfigVersion = apiVersion 156 } 157 } 158 159 // WithKubeConfig sets the kube config for Scheduler. 160 func WithKubeConfig(cfg *restclient.Config) Option { 161 return func(o *schedulerOptions) { 162 o.kubeConfig = cfg 163 } 164 } 165 166 // WithProfiles sets profiles for Scheduler. By default, there is one profile 167 // with the name "default-scheduler". 168 func WithProfiles(p ...schedulerapi.KubeSchedulerProfile) Option { 169 return func(o *schedulerOptions) { 170 o.profiles = p 171 o.applyDefaultProfile = false 172 } 173 } 174 175 // WithParallelism sets the parallelism for all scheduler algorithms. Default is 16. 176 func WithParallelism(threads int32) Option { 177 return func(o *schedulerOptions) { 178 o.parallelism = threads 179 } 180 } 181 182 // WithPercentageOfNodesToScore sets percentageOfNodesToScore for Scheduler. 183 // The default value of 0 will use an adaptive percentage: 50 - (num of nodes)/125. 184 func WithPercentageOfNodesToScore(percentageOfNodesToScore *int32) Option { 185 return func(o *schedulerOptions) { 186 if percentageOfNodesToScore != nil { 187 o.percentageOfNodesToScore = *percentageOfNodesToScore 188 } 189 } 190 } 191 192 // WithFrameworkOutOfTreeRegistry sets the registry for out-of-tree plugins. Those plugins 193 // will be appended to the default registry. 194 func WithFrameworkOutOfTreeRegistry(registry frameworkruntime.Registry) Option { 195 return func(o *schedulerOptions) { 196 o.frameworkOutOfTreeRegistry = registry 197 } 198 } 199 200 // WithPodInitialBackoffSeconds sets podInitialBackoffSeconds for Scheduler, the default value is 1 201 func WithPodInitialBackoffSeconds(podInitialBackoffSeconds int64) Option { 202 return func(o *schedulerOptions) { 203 o.podInitialBackoffSeconds = podInitialBackoffSeconds 204 } 205 } 206 207 // WithPodMaxBackoffSeconds sets podMaxBackoffSeconds for Scheduler, the default value is 10 208 func WithPodMaxBackoffSeconds(podMaxBackoffSeconds int64) Option { 209 return func(o *schedulerOptions) { 210 o.podMaxBackoffSeconds = podMaxBackoffSeconds 211 } 212 } 213 214 // WithPodMaxInUnschedulablePodsDuration sets podMaxInUnschedulablePodsDuration for PriorityQueue. 215 func WithPodMaxInUnschedulablePodsDuration(duration time.Duration) Option { 216 return func(o *schedulerOptions) { 217 o.podMaxInUnschedulablePodsDuration = duration 218 } 219 } 220 221 // WithExtenders sets extenders for the Scheduler 222 func WithExtenders(e ...schedulerapi.Extender) Option { 223 return func(o *schedulerOptions) { 224 o.extenders = e 225 } 226 } 227 228 // FrameworkCapturer is used for registering a notify function in building framework. 229 type FrameworkCapturer func(schedulerapi.KubeSchedulerProfile) 230 231 // WithBuildFrameworkCapturer sets a notify function for getting buildFramework details. 232 func WithBuildFrameworkCapturer(fc FrameworkCapturer) Option { 233 return func(o *schedulerOptions) { 234 o.frameworkCapturer = fc 235 } 236 } 237 238 var defaultSchedulerOptions = schedulerOptions{ 239 percentageOfNodesToScore: schedulerapi.DefaultPercentageOfNodesToScore, 240 podInitialBackoffSeconds: int64(internalqueue.DefaultPodInitialBackoffDuration.Seconds()), 241 podMaxBackoffSeconds: int64(internalqueue.DefaultPodMaxBackoffDuration.Seconds()), 242 podMaxInUnschedulablePodsDuration: internalqueue.DefaultPodMaxInUnschedulablePodsDuration, 243 parallelism: int32(parallelize.DefaultParallelism), 244 // Ideally we would statically set the default profile here, but we can't because 245 // creating the default profile may require testing feature gates, which may get 246 // set dynamically in tests. Therefore, we delay creating it until New is actually 247 // invoked. 248 applyDefaultProfile: true, 249 } 250 251 // New returns a Scheduler 252 func New(ctx context.Context, 253 client clientset.Interface, 254 informerFactory informers.SharedInformerFactory, 255 dynInformerFactory dynamicinformer.DynamicSharedInformerFactory, 256 recorderFactory profile.RecorderFactory, 257 opts ...Option) (*Scheduler, error) { 258 259 logger := klog.FromContext(ctx) 260 stopEverything := ctx.Done() 261 262 options := defaultSchedulerOptions 263 for _, opt := range opts { 264 opt(&options) 265 } 266 267 if options.applyDefaultProfile { 268 var versionedCfg configv1.KubeSchedulerConfiguration 269 scheme.Scheme.Default(&versionedCfg) 270 cfg := schedulerapi.KubeSchedulerConfiguration{} 271 if err := scheme.Scheme.Convert(&versionedCfg, &cfg, nil); err != nil { 272 return nil, err 273 } 274 options.profiles = cfg.Profiles 275 } 276 277 registry := frameworkplugins.NewInTreeRegistry() 278 if err := registry.Merge(options.frameworkOutOfTreeRegistry); err != nil { 279 return nil, err 280 } 281 282 metrics.Register() 283 284 extenders, err := buildExtenders(logger, options.extenders, options.profiles) 285 if err != nil { 286 return nil, fmt.Errorf("couldn't build extenders: %w", err) 287 } 288 289 podLister := informerFactory.Core().V1().Pods().Lister() 290 nodeLister := informerFactory.Core().V1().Nodes().Lister() 291 292 snapshot := internalcache.NewEmptySnapshot() 293 metricsRecorder := metrics.NewMetricsAsyncRecorder(1000, time.Second, stopEverything) 294 295 profiles, err := profile.NewMap(ctx, options.profiles, registry, recorderFactory, 296 frameworkruntime.WithComponentConfigVersion(options.componentConfigVersion), 297 frameworkruntime.WithClientSet(client), 298 frameworkruntime.WithKubeConfig(options.kubeConfig), 299 frameworkruntime.WithInformerFactory(informerFactory), 300 frameworkruntime.WithSnapshotSharedLister(snapshot), 301 frameworkruntime.WithCaptureProfile(frameworkruntime.CaptureProfile(options.frameworkCapturer)), 302 frameworkruntime.WithParallelism(int(options.parallelism)), 303 frameworkruntime.WithExtenders(extenders), 304 frameworkruntime.WithMetricsRecorder(metricsRecorder), 305 ) 306 if err != nil { 307 return nil, fmt.Errorf("initializing profiles: %v", err) 308 } 309 310 if len(profiles) == 0 { 311 return nil, errors.New("at least one profile is required") 312 } 313 314 preEnqueuePluginMap := make(map[string][]framework.PreEnqueuePlugin) 315 queueingHintsPerProfile := make(internalqueue.QueueingHintMapPerProfile) 316 for profileName, profile := range profiles { 317 preEnqueuePluginMap[profileName] = profile.PreEnqueuePlugins() 318 queueingHintsPerProfile[profileName] = buildQueueingHintMap(profile.EnqueueExtensions()) 319 } 320 321 podQueue := internalqueue.NewSchedulingQueue( 322 profiles[options.profiles[0].SchedulerName].QueueSortFunc(), 323 informerFactory, 324 internalqueue.WithPodInitialBackoffDuration(time.Duration(options.podInitialBackoffSeconds)*time.Second), 325 internalqueue.WithPodMaxBackoffDuration(time.Duration(options.podMaxBackoffSeconds)*time.Second), 326 internalqueue.WithPodLister(podLister), 327 internalqueue.WithPodMaxInUnschedulablePodsDuration(options.podMaxInUnschedulablePodsDuration), 328 internalqueue.WithPreEnqueuePluginMap(preEnqueuePluginMap), 329 internalqueue.WithQueueingHintMapPerProfile(queueingHintsPerProfile), 330 internalqueue.WithPluginMetricsSamplePercent(pluginMetricsSamplePercent), 331 internalqueue.WithMetricsRecorder(*metricsRecorder), 332 ) 333 334 for _, fwk := range profiles { 335 fwk.SetPodNominator(podQueue) 336 } 337 338 schedulerCache := internalcache.New(ctx, durationToExpireAssumedPod) 339 340 // Setup cache debugger. 341 debugger := cachedebugger.New(nodeLister, podLister, schedulerCache, podQueue) 342 debugger.ListenForSignal(ctx) 343 344 sched := &Scheduler{ 345 Cache: schedulerCache, 346 client: client, 347 nodeInfoSnapshot: snapshot, 348 percentageOfNodesToScore: options.percentageOfNodesToScore, 349 Extenders: extenders, 350 StopEverything: stopEverything, 351 SchedulingQueue: podQueue, 352 Profiles: profiles, 353 logger: logger, 354 } 355 sched.NextPod = podQueue.Pop 356 sched.applyDefaultHandlers() 357 358 if err = addAllEventHandlers(sched, informerFactory, dynInformerFactory, unionedGVKs(queueingHintsPerProfile)); err != nil { 359 return nil, fmt.Errorf("adding event handlers: %w", err) 360 } 361 362 return sched, nil 363 } 364 365 // defaultQueueingHintFn is the default queueing hint function. 366 // It always returns Queue as the queueing hint. 367 var defaultQueueingHintFn = func(_ klog.Logger, _ *v1.Pod, _, _ interface{}) (framework.QueueingHint, error) { 368 return framework.Queue, nil 369 } 370 371 func buildQueueingHintMap(es []framework.EnqueueExtensions) internalqueue.QueueingHintMap { 372 queueingHintMap := make(internalqueue.QueueingHintMap) 373 for _, e := range es { 374 events := e.EventsToRegister() 375 376 // This will happen when plugin registers with empty events, it's usually the case a pod 377 // will become reschedulable only for self-update, e.g. schedulingGates plugin, the pod 378 // will enter into the activeQ via priorityQueue.Update(). 379 if len(events) == 0 { 380 continue 381 } 382 383 // Note: Rarely, a plugin implements EnqueueExtensions but returns nil. 384 // We treat it as: the plugin is not interested in any event, and hence pod failed by that plugin 385 // cannot be moved by any regular cluster event. 386 // So, we can just ignore such EventsToRegister here. 387 388 for _, event := range events { 389 fn := event.QueueingHintFn 390 if fn == nil || !utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints) { 391 fn = defaultQueueingHintFn 392 } 393 394 queueingHintMap[event.Event] = append(queueingHintMap[event.Event], &internalqueue.QueueingHintFunction{ 395 PluginName: e.Name(), 396 QueueingHintFn: fn, 397 }) 398 } 399 } 400 return queueingHintMap 401 } 402 403 // Run begins watching and scheduling. It starts scheduling and blocked until the context is done. 404 func (sched *Scheduler) Run(ctx context.Context) { 405 logger := klog.FromContext(ctx) 406 sched.SchedulingQueue.Run(logger) 407 408 // We need to start scheduleOne loop in a dedicated goroutine, 409 // because scheduleOne function hangs on getting the next item 410 // from the SchedulingQueue. 411 // If there are no new pods to schedule, it will be hanging there 412 // and if done in this goroutine it will be blocking closing 413 // SchedulingQueue, in effect causing a deadlock on shutdown. 414 go wait.UntilWithContext(ctx, sched.scheduleOne, 0) 415 416 <-ctx.Done() 417 sched.SchedulingQueue.Close() 418 } 419 420 // NewInformerFactory creates a SharedInformerFactory and initializes a scheduler specific 421 // in-place podInformer. 422 func NewInformerFactory(cs clientset.Interface, resyncPeriod time.Duration) informers.SharedInformerFactory { 423 informerFactory := informers.NewSharedInformerFactory(cs, resyncPeriod) 424 informerFactory.InformerFor(&v1.Pod{}, newPodInformer) 425 return informerFactory 426 } 427 428 func buildExtenders(logger klog.Logger, extenders []schedulerapi.Extender, profiles []schedulerapi.KubeSchedulerProfile) ([]framework.Extender, error) { 429 var fExtenders []framework.Extender 430 if len(extenders) == 0 { 431 return nil, nil 432 } 433 434 var ignoredExtendedResources []string 435 var ignorableExtenders []framework.Extender 436 for i := range extenders { 437 logger.V(2).Info("Creating extender", "extender", extenders[i]) 438 extender, err := NewHTTPExtender(&extenders[i]) 439 if err != nil { 440 return nil, err 441 } 442 if !extender.IsIgnorable() { 443 fExtenders = append(fExtenders, extender) 444 } else { 445 ignorableExtenders = append(ignorableExtenders, extender) 446 } 447 for _, r := range extenders[i].ManagedResources { 448 if r.IgnoredByScheduler { 449 ignoredExtendedResources = append(ignoredExtendedResources, r.Name) 450 } 451 } 452 } 453 // place ignorable extenders to the tail of extenders 454 fExtenders = append(fExtenders, ignorableExtenders...) 455 456 // If there are any extended resources found from the Extenders, append them to the pluginConfig for each profile. 457 // This should only have an effect on ComponentConfig, where it is possible to configure Extenders and 458 // plugin args (and in which case the extender ignored resources take precedence). 459 if len(ignoredExtendedResources) == 0 { 460 return fExtenders, nil 461 } 462 463 for i := range profiles { 464 prof := &profiles[i] 465 var found = false 466 for k := range prof.PluginConfig { 467 if prof.PluginConfig[k].Name == noderesources.Name { 468 // Update the existing args 469 pc := &prof.PluginConfig[k] 470 args, ok := pc.Args.(*schedulerapi.NodeResourcesFitArgs) 471 if !ok { 472 return nil, fmt.Errorf("want args to be of type NodeResourcesFitArgs, got %T", pc.Args) 473 } 474 args.IgnoredResources = ignoredExtendedResources 475 found = true 476 break 477 } 478 } 479 if !found { 480 return nil, fmt.Errorf("can't find NodeResourcesFitArgs in plugin config") 481 } 482 } 483 return fExtenders, nil 484 } 485 486 type FailureHandlerFn func(ctx context.Context, fwk framework.Framework, podInfo *framework.QueuedPodInfo, status *framework.Status, nominatingInfo *framework.NominatingInfo, start time.Time) 487 488 func unionedGVKs(queueingHintsPerProfile internalqueue.QueueingHintMapPerProfile) map[framework.GVK]framework.ActionType { 489 gvkMap := make(map[framework.GVK]framework.ActionType) 490 for _, queueingHints := range queueingHintsPerProfile { 491 for evt := range queueingHints { 492 if _, ok := gvkMap[evt.Resource]; ok { 493 gvkMap[evt.Resource] |= evt.ActionType 494 } else { 495 gvkMap[evt.Resource] = evt.ActionType 496 } 497 } 498 } 499 return gvkMap 500 } 501 502 // newPodInformer creates a shared index informer that returns only non-terminal pods. 503 // The PodInformer allows indexers to be added, but note that only non-conflict indexers are allowed. 504 func newPodInformer(cs clientset.Interface, resyncPeriod time.Duration) cache.SharedIndexInformer { 505 selector := fmt.Sprintf("status.phase!=%v,status.phase!=%v", v1.PodSucceeded, v1.PodFailed) 506 tweakListOptions := func(options *metav1.ListOptions) { 507 options.FieldSelector = selector 508 } 509 informer := coreinformers.NewFilteredPodInformer(cs, metav1.NamespaceAll, resyncPeriod, cache.Indexers{}, tweakListOptions) 510 511 // Dropping `.metadata.managedFields` to improve memory usage. 512 // The Extract workflow (i.e. `ExtractPod`) should be unused. 513 trim := func(obj interface{}) (interface{}, error) { 514 if accessor, err := meta.Accessor(obj); err == nil { 515 accessor.SetManagedFields(nil) 516 } 517 return obj, nil 518 } 519 informer.SetTransform(trim) 520 return informer 521 }