k8s.io/kubernetes@v1.29.3/pkg/scheduler/internal/queue/scheduling_queue.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// This file contains structures that implement scheduling queue types.
// Scheduling queues hold pods waiting to be scheduled. This file implements a
// priority queue which has two sub-queues and an additional data structure,
// namely: activeQ, backoffQ and unschedulablePods.
//   - activeQ holds pods that are being considered for scheduling.
//   - backoffQ holds pods that moved from unschedulablePods and will move to
//     activeQ when their backoff periods complete.
//   - unschedulablePods holds pods that were already attempted for scheduling and
//     are currently determined to be unschedulable.

package queue

import (
	"container/list"
	"context"
	"fmt"
	"math/rand"
	"reflect"
	"sync"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/client-go/informers"
	listersv1 "k8s.io/client-go/listers/core/v1"
	"k8s.io/client-go/tools/cache"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/features"
	"k8s.io/kubernetes/pkg/scheduler/framework"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity"
	"k8s.io/kubernetes/pkg/scheduler/internal/heap"
	"k8s.io/kubernetes/pkg/scheduler/metrics"
	"k8s.io/kubernetes/pkg/scheduler/util"
	"k8s.io/utils/clock"
)

const (
	// DefaultPodMaxInUnschedulablePodsDuration is the default value for the maximum
	// time a pod can stay in unschedulablePods. If a pod stays in unschedulablePods
	// for longer than this value, the pod will be moved from unschedulablePods to
	// backoffQ or activeQ. If this value is empty, the default value (5min)
	// will be used.
	DefaultPodMaxInUnschedulablePodsDuration time.Duration = 5 * time.Minute
	// Scheduling queue names
	activeQ           = "Active"
	backoffQ          = "Backoff"
	unschedulablePods = "Unschedulable"

	preEnqueue = "PreEnqueue"
)

const (
	// DefaultPodInitialBackoffDuration is the default value for the initial backoff duration
	// for unschedulable pods. To change the default podInitialBackoffDurationSeconds used by the
	// scheduler, update the ComponentConfig value in defaults.go
	DefaultPodInitialBackoffDuration time.Duration = 1 * time.Second
	// DefaultPodMaxBackoffDuration is the default value for the max backoff duration
	// for unschedulable pods. To change the default podMaxBackoffDurationSeconds used by the
	// scheduler, update the ComponentConfig value in defaults.go
	DefaultPodMaxBackoffDuration time.Duration = 10 * time.Second
)
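// A worked sketch (added for illustration; not part of the original file) of
// how these two defaults interact with the doubling logic in
// calculateBackoffDuration further below: an unschedulable pod's backoff
// grows per attempt as
//
//	attempt 1: 1s
//	attempt 2: 2s
//	attempt 3: 4s
//	attempt 4: 8s
//	attempt 5+: capped at DefaultPodMaxBackoffDuration (10s)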
// PreEnqueueCheck is a function type. It's used to build functions that
// run against a Pod; based on the result of the check, the caller can choose
// to enqueue or skip the Pod.
type PreEnqueueCheck func(pod *v1.Pod) bool

// SchedulingQueue is an interface for a queue to store pods waiting to be scheduled.
// The interface follows a pattern similar to cache.FIFO and cache.Heap and
// makes it easy to use those data structures as a SchedulingQueue.
type SchedulingQueue interface {
	framework.PodNominator
	Add(logger klog.Logger, pod *v1.Pod) error
	// Activate moves the given pods to activeQ iff they're in unschedulablePods or backoffQ.
	// The passed-in pods are originally compiled from plugins that want to activate Pods,
	// by injecting the pods through a reserved CycleState struct (PodsToActivate).
	Activate(logger klog.Logger, pods map[string]*v1.Pod)
	// AddUnschedulableIfNotPresent adds an unschedulable pod back to the scheduling queue.
	// The podSchedulingCycle represents the current scheduling cycle number, which can be
	// returned by calling SchedulingCycle().
	AddUnschedulableIfNotPresent(logger klog.Logger, pod *framework.QueuedPodInfo, podSchedulingCycle int64) error
	// SchedulingCycle returns the current number of the scheduling cycle, which is
	// cached by the scheduling queue. Normally, incrementing this number whenever
	// a pod is popped (e.g. via Pop()) is enough.
	SchedulingCycle() int64
	// Pop removes the head of the queue and returns it. It blocks if the
	// queue is empty and waits until a new item is added to the queue.
	Pop(logger klog.Logger) (*framework.QueuedPodInfo, error)
	// Done must be called for a pod returned by Pop. This allows the queue to
	// keep track of which pods are currently being processed.
	Done(types.UID)
	Update(logger klog.Logger, oldPod, newPod *v1.Pod) error
	Delete(pod *v1.Pod) error
	// TODO(sanposhiho): move all PreEnqueueCheck to Requeue and delete it from this parameter eventually.
	// Some PreEnqueueChecks include event filtering logic based on some in-tree plugins,
	// which badly affects other plugins.
	// See https://github.com/kubernetes/kubernetes/issues/110175
	MoveAllToActiveOrBackoffQueue(logger klog.Logger, event framework.ClusterEvent, oldObj, newObj interface{}, preCheck PreEnqueueCheck)
	AssignedPodAdded(logger klog.Logger, pod *v1.Pod)
	AssignedPodUpdated(logger klog.Logger, oldPod, newPod *v1.Pod)
	PendingPods() ([]*v1.Pod, string)
	// Close closes the SchedulingQueue so that the goroutine which is
	// waiting to pop items can exit gracefully.
	Close()
	// Run starts the goroutines managing the queue.
	Run(logger klog.Logger)
}

// NewSchedulingQueue initializes a priority queue as a new scheduling queue.
func NewSchedulingQueue(
	lessFn framework.LessFunc,
	informerFactory informers.SharedInformerFactory,
	opts ...Option) SchedulingQueue {
	return NewPriorityQueue(lessFn, informerFactory, opts...)
}

// NominatedNodeName returns the nominated node name of a Pod.
func NominatedNodeName(pod *v1.Pod) string {
	return pod.Status.NominatedNodeName
}
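// A minimal sketch of a PreEnqueueCheck (illustrative; the predicate is an
// assumption, not taken from the original file). A caller of
// MoveAllToActiveOrBackoffQueue could pass this to re-enqueue only pods that
// express node affinity:
//
//	var hasNodeAffinity PreEnqueueCheck = func(pod *v1.Pod) bool {
//		return pod.Spec.Affinity != nil && pod.Spec.Affinity.NodeAffinity != nil
//	}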
// PriorityQueue implements a scheduling queue.
// The head of PriorityQueue is the highest priority pending pod. This structure
// has two sub-queues and an additional data structure, namely: activeQ,
// backoffQ and unschedulablePods.
//   - activeQ holds pods that are being considered for scheduling.
//   - backoffQ holds pods that moved from unschedulablePods and will move to
//     activeQ when their backoff periods complete.
//   - unschedulablePods holds pods that were already attempted for scheduling and
//     are currently determined to be unschedulable.
type PriorityQueue struct {
	*nominator

	stop  chan struct{}
	clock clock.Clock

	// pod initial backoff duration.
	podInitialBackoffDuration time.Duration
	// pod maximum backoff duration.
	podMaxBackoffDuration time.Duration
	// the maximum time a pod can stay in the unschedulablePods.
	podMaxInUnschedulablePodsDuration time.Duration

	cond sync.Cond

	// inFlightPods holds the UID of all pods which have been popped out for which Done
	// hasn't been called yet - in other words, all pods that are currently being
	// processed (being scheduled, in permit, or in the binding cycle).
	//
	// The values in the map are the entry of each pod in the inFlightEvents list.
	// The value of that entry is the *v1.Pod at the time that scheduling of that
	// pod started, which can be useful for logging or debugging.
	inFlightPods map[types.UID]*list.Element

	// inFlightEvents holds the events received by the scheduling queue
	// (entry value is clusterEvent) together with in-flight pods (entry
	// value is *v1.Pod). Entries get added at the end while the mutex is
	// locked, so they get serialized.
	//
	// The pod entries are added in Pop and used to track which events
	// occurred after the pod scheduling attempt for that pod started.
	// They get removed when the scheduling attempt is done, at which
	// point all events that occurred in the meantime are processed.
	//
	// After removal of a pod, events at the start of the list are no
	// longer needed because all of the other in-flight pods started
	// later. Those events can be removed.
	inFlightEvents *list.List

	// activeQ is a heap structure that the scheduler actively looks at to find pods to
	// schedule. The head of the heap is the highest priority pod.
	activeQ *heap.Heap
	// podBackoffQ is a heap ordered by backoff expiry. Pods which have completed backoff
	// are popped from this heap before the scheduler looks at activeQ.
	podBackoffQ *heap.Heap
	// unschedulablePods holds pods that have been tried and determined unschedulable.
	unschedulablePods *UnschedulablePods
	// schedulingCycle represents the sequence number of the scheduling cycle and is incremented
	// when a pod is popped.
	schedulingCycle int64
	// moveRequestCycle caches the sequence number of the scheduling cycle when we
	// received a move request. Unschedulable pods in and before this scheduling
	// cycle will be put back to activeQueue if we were trying to schedule them
	// when we received the move request.
	// TODO: this will be removed after SchedulingQueueHint goes to stable and the feature gate is removed.
	moveRequestCycle int64

	// preEnqueuePluginMap is keyed with profile name, valued with registered preEnqueue plugins.
	preEnqueuePluginMap map[string][]framework.PreEnqueuePlugin
	// queueingHintMap is keyed with profile name, valued with registered queueing hint functions.
	queueingHintMap QueueingHintMapPerProfile

	// closed indicates that the queue is closed.
	// It is mainly used to let Pop() exit its control loop while waiting for an item.
	closed bool

	nsLister listersv1.NamespaceLister

	metricsRecorder metrics.MetricAsyncRecorder
	// pluginMetricsSamplePercent is the percentage of plugin metrics to be sampled.
	pluginMetricsSamplePercent int

	// isSchedulingQueueHintEnabled indicates whether the feature gate for the scheduling queue is enabled.
	isSchedulingQueueHintEnabled bool
}

// QueueingHintFunction is the wrapper of QueueingHintFn that has PluginName.
type QueueingHintFunction struct {
	PluginName     string
	QueueingHintFn framework.QueueingHintFn
}

// clusterEvent has the event and involved objects.
type clusterEvent struct {
	event framework.ClusterEvent
	// oldObj is the object that involved this event.
	oldObj interface{}
	// newObj is the object that involved this event.
	newObj interface{}
}

type priorityQueueOptions struct {
	clock                             clock.Clock
	podInitialBackoffDuration         time.Duration
	podMaxBackoffDuration             time.Duration
	podMaxInUnschedulablePodsDuration time.Duration
	podLister                         listersv1.PodLister
	metricsRecorder                   metrics.MetricAsyncRecorder
	pluginMetricsSamplePercent        int
	preEnqueuePluginMap               map[string][]framework.PreEnqueuePlugin
	queueingHintMap                   QueueingHintMapPerProfile
}

// Option configures a PriorityQueue
type Option func(*priorityQueueOptions)

// WithClock sets the clock for PriorityQueue; the default clock is clock.RealClock.
func WithClock(clock clock.Clock) Option {
	return func(o *priorityQueueOptions) {
		o.clock = clock
	}
}

// WithPodInitialBackoffDuration sets the pod initial backoff duration for PriorityQueue.
func WithPodInitialBackoffDuration(duration time.Duration) Option {
	return func(o *priorityQueueOptions) {
		o.podInitialBackoffDuration = duration
	}
}

// WithPodMaxBackoffDuration sets the pod max backoff duration for PriorityQueue.
func WithPodMaxBackoffDuration(duration time.Duration) Option {
	return func(o *priorityQueueOptions) {
		o.podMaxBackoffDuration = duration
	}
}

// WithPodLister sets the pod lister for PriorityQueue.
func WithPodLister(pl listersv1.PodLister) Option {
	return func(o *priorityQueueOptions) {
		o.podLister = pl
	}
}

// WithPodMaxInUnschedulablePodsDuration sets podMaxInUnschedulablePodsDuration for PriorityQueue.
func WithPodMaxInUnschedulablePodsDuration(duration time.Duration) Option {
	return func(o *priorityQueueOptions) {
		o.podMaxInUnschedulablePodsDuration = duration
	}
}

// QueueingHintMapPerProfile is keyed with profile name, valued with queueing hint map registered for the profile.
type QueueingHintMapPerProfile map[string]QueueingHintMap

// QueueingHintMap is keyed with ClusterEvent, valued with queueing hint functions registered for the event.
type QueueingHintMap map[framework.ClusterEvent][]*QueueingHintFunction

// WithQueueingHintMapPerProfile sets queueingHintMap for PriorityQueue.
func WithQueueingHintMapPerProfile(m QueueingHintMapPerProfile) Option {
	return func(o *priorityQueueOptions) {
		o.queueingHintMap = m
	}
}

// WithPreEnqueuePluginMap sets preEnqueuePluginMap for PriorityQueue.
func WithPreEnqueuePluginMap(m map[string][]framework.PreEnqueuePlugin) Option {
	return func(o *priorityQueueOptions) {
		o.preEnqueuePluginMap = m
	}
}
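// A hypothetical usage sketch (not from the original file) of how the
// functional options above compose; lessFn and informerFactory are assumed to
// be supplied by the caller:
//
//	q := NewPriorityQueue(lessFn, informerFactory,
//		WithPodInitialBackoffDuration(2*time.Second),
//		WithPodMaxBackoffDuration(30*time.Second),
//		WithPodMaxInUnschedulablePodsDuration(10*time.Minute),
//	)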
// WithMetricsRecorder sets the metrics recorder.
func WithMetricsRecorder(recorder metrics.MetricAsyncRecorder) Option {
	return func(o *priorityQueueOptions) {
		o.metricsRecorder = recorder
	}
}

// WithPluginMetricsSamplePercent sets the percentage of plugin metrics to be sampled.
func WithPluginMetricsSamplePercent(percent int) Option {
	return func(o *priorityQueueOptions) {
		o.pluginMetricsSamplePercent = percent
	}
}

var defaultPriorityQueueOptions = priorityQueueOptions{
	clock:                             clock.RealClock{},
	podInitialBackoffDuration:         DefaultPodInitialBackoffDuration,
	podMaxBackoffDuration:             DefaultPodMaxBackoffDuration,
	podMaxInUnschedulablePodsDuration: DefaultPodMaxInUnschedulablePodsDuration,
}

// Making sure that PriorityQueue implements SchedulingQueue.
var _ SchedulingQueue = &PriorityQueue{}

// newQueuedPodInfoForLookup builds a QueuedPodInfo object for a lookup in the queue.
func newQueuedPodInfoForLookup(pod *v1.Pod, plugins ...string) *framework.QueuedPodInfo {
	// Since this is only used for a lookup in the queue, we only need to set the Pod,
	// and so we avoid creating a full PodInfo, which is expensive to instantiate frequently.
	return &framework.QueuedPodInfo{
		PodInfo:              &framework.PodInfo{Pod: pod},
		UnschedulablePlugins: sets.New(plugins...),
	}
}

// NewPriorityQueue creates a PriorityQueue object.
func NewPriorityQueue(
	lessFn framework.LessFunc,
	informerFactory informers.SharedInformerFactory,
	opts ...Option,
) *PriorityQueue {
	options := defaultPriorityQueueOptions
	if options.podLister == nil {
		options.podLister = informerFactory.Core().V1().Pods().Lister()
	}
	for _, opt := range opts {
		opt(&options)
	}

	comp := func(podInfo1, podInfo2 interface{}) bool {
		pInfo1 := podInfo1.(*framework.QueuedPodInfo)
		pInfo2 := podInfo2.(*framework.QueuedPodInfo)
		return lessFn(pInfo1, pInfo2)
	}

	pq := &PriorityQueue{
		nominator:                         newPodNominator(options.podLister),
		clock:                             options.clock,
		stop:                              make(chan struct{}),
		podInitialBackoffDuration:         options.podInitialBackoffDuration,
		podMaxBackoffDuration:             options.podMaxBackoffDuration,
		podMaxInUnschedulablePodsDuration: options.podMaxInUnschedulablePodsDuration,
		activeQ:                           heap.NewWithRecorder(podInfoKeyFunc, comp, metrics.NewActivePodsRecorder()),
		unschedulablePods:                 newUnschedulablePods(metrics.NewUnschedulablePodsRecorder(), metrics.NewGatedPodsRecorder()),
		inFlightPods:                      make(map[types.UID]*list.Element),
		inFlightEvents:                    list.New(),
		preEnqueuePluginMap:               options.preEnqueuePluginMap,
		queueingHintMap:                   options.queueingHintMap,
		metricsRecorder:                   options.metricsRecorder,
		pluginMetricsSamplePercent:        options.pluginMetricsSamplePercent,
		moveRequestCycle:                  -1,
		isSchedulingQueueHintEnabled:      utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints),
	}
	pq.cond.L = &pq.lock
	pq.podBackoffQ = heap.NewWithRecorder(podInfoKeyFunc, pq.podsCompareBackoffCompleted, metrics.NewBackoffPodsRecorder())
	pq.nsLister = informerFactory.Core().V1().Namespaces().Lister()

	return pq
}

// Run starts the goroutines that periodically flush podBackoffQ (pods whose
// backoff has completed move to activeQ) and unschedulablePods (pods that
// stayed too long move to backoffQ or activeQ).
func (p *PriorityQueue) Run(logger klog.Logger) {
	go wait.Until(func() {
		p.flushBackoffQCompleted(logger)
	}, 1.0*time.Second, p.stop)
	go wait.Until(func() {
		p.flushUnschedulablePodsLeftover(logger)
	}, 30*time.Second, p.stop)
}
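// A minimal lifecycle sketch (illustrative only): the scheduler's main loop
// pops a pod, attempts to schedule it, and must call Done (or
// AddUnschedulableIfNotPresent, which calls done internally) once the attempt
// finishes. Error handling is elided.
//
//	q.Run(logger)
//	for {
//		pInfo, err := q.Pop(logger)
//		if err != nil || pInfo == nil { // a nil pInfo means the queue was closed
//			break
//		}
//		// ... run the scheduling cycle for pInfo.Pod ...
//		q.Done(pInfo.Pod.UID)
//	}
//	q.Close()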
// queueingStrategy indicates how the scheduling queue should enqueue the Pod from the unschedulable pod pool.
type queueingStrategy int

const (
	// queueSkip indicates that the scheduling queue should skip requeuing the Pod to activeQ/backoffQ.
	queueSkip queueingStrategy = iota
	// queueAfterBackoff indicates that the scheduling queue should requeue the Pod after its backoff is completed.
	queueAfterBackoff
	// queueImmediately indicates that the scheduling queue should skip backoff and requeue the Pod immediately to activeQ.
	queueImmediately
)
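// An illustrative sketch (not from the original file) of a QueueingHintFn as
// consumed by isPodWorthRequeuing below. A plugin that rejected a Pod for
// lack of CPU might requeue only on node events that could help; the hint
// logic shown is a simplified assumption, not the real NodeResourcesFit
// implementation.
//
//	func nodeResourceHint(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
//		node, ok := newObj.(*v1.Node)
//		if !ok {
//			return framework.Queue, fmt.Errorf("unexpected object type %T", newObj)
//		}
//		if node.Status.Allocatable.Cpu().IsZero() {
//			// The new node has no allocatable CPU, so it can't help this Pod.
//			return framework.QueueSkip, nil
//		}
//		return framework.Queue, nil
//	}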
// isPodWorthRequeuing calls the QueueingHintFn of only the plugins registered in pInfo.UnschedulablePlugins and pInfo.PendingPlugins.
//
// If any of pInfo.PendingPlugins return Queue,
// the scheduling queue is supposed to enqueue this Pod to activeQ, skipping backoffQ.
// If any of pInfo.UnschedulablePlugins return Queue,
// the scheduling queue is supposed to enqueue this Pod to activeQ/backoffQ depending on the remaining backoff time of the Pod.
// If all QueueingHintFns return Skip, the scheduling queue enqueues the Pod back to the unschedulable Pod pool
// because no plugin changes the scheduling result via the event.
func (p *PriorityQueue) isPodWorthRequeuing(logger klog.Logger, pInfo *framework.QueuedPodInfo, event framework.ClusterEvent, oldObj, newObj interface{}) queueingStrategy {
	rejectorPlugins := pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins)
	if rejectorPlugins.Len() == 0 {
		logger.V(6).Info("Worth requeuing because no failed plugins", "pod", klog.KObj(pInfo.Pod))
		return queueAfterBackoff
	}

	if event.IsWildCard() {
		// The wildcard event is a special one, used when someone wants to force all Pods to move to activeQ/backoffQ.
		// We return queueAfterBackoff in this case, while resetting all blocked plugins.
		logger.V(6).Info("Worth requeuing because the event is wildcard", "pod", klog.KObj(pInfo.Pod))
		return queueAfterBackoff
	}

	hintMap, ok := p.queueingHintMap[pInfo.Pod.Spec.SchedulerName]
	if !ok {
		// shouldn't reach here unless there is a bug.
		logger.Error(nil, "No QueueingHintMap is registered for this profile", "profile", pInfo.Pod.Spec.SchedulerName, "pod", klog.KObj(pInfo.Pod))
		return queueAfterBackoff
	}

	pod := pInfo.Pod
	queueStrategy := queueSkip
	for eventToMatch, hintfns := range hintMap {
		if eventToMatch.Resource != event.Resource || eventToMatch.ActionType&event.ActionType == 0 {
			continue
		}

		for _, hintfn := range hintfns {
			if !rejectorPlugins.Has(hintfn.PluginName) {
				// skip if it's not a hintfn from the rejectorPlugins.
				continue
			}

			hint, err := hintfn.QueueingHintFn(logger, pod, oldObj, newObj)
			if err != nil {
				// If the QueueingHintFn returned an error, we should treat the event as Queue so that we can prevent
				// the Pod from being stuck in the unschedulable pod pool.
				oldObjMeta, newObjMeta, asErr := util.As[klog.KMetadata](oldObj, newObj)
				if asErr != nil {
					logger.Error(err, "QueueingHintFn returns error", "event", event, "plugin", hintfn.PluginName, "pod", klog.KObj(pod))
				} else {
					logger.Error(err, "QueueingHintFn returns error", "event", event, "plugin", hintfn.PluginName, "pod", klog.KObj(pod), "oldObj", klog.KObj(oldObjMeta), "newObj", klog.KObj(newObjMeta))
				}
				hint = framework.Queue
			}
			if hint == framework.QueueSkip {
				continue
			}

			if pInfo.PendingPlugins.Has(hintfn.PluginName) {
				// interprets Queue from the Pending plugin as queueImmediately.
				// We can return immediately because queueImmediately is the highest priority.
				return queueImmediately
			}

			// interprets Queue from the unschedulable plugin as queueAfterBackoff.

			if pInfo.PendingPlugins.Len() == 0 {
				// We can return immediately because this Pod has no Pending plugins registered
				// (only Pending plugins can cause queueImmediately), and queueAfterBackoff is
				// the second highest priority.
				return queueAfterBackoff
			}

			// We can't return immediately because there are some Pending plugins registered in this Pod.
			// We need to check if those plugins return Queue or not and, if they do, we return queueImmediately.
			queueStrategy = queueAfterBackoff
		}
	}

	return queueStrategy
}

// runPreEnqueuePlugins iterates the PreEnqueue function of each registered PreEnqueuePlugin.
// It returns true if all PreEnqueue functions run successfully; otherwise returns false
// upon the first failure.
// Note: we need to associate the failed plugin to `pInfo`, so that the pod can be moved back
// to activeQ by a related cluster event.
func (p *PriorityQueue) runPreEnqueuePlugins(ctx context.Context, pInfo *framework.QueuedPodInfo) bool {
	logger := klog.FromContext(ctx)
	var s *framework.Status
	pod := pInfo.Pod
	startTime := p.clock.Now()
	defer func() {
		metrics.FrameworkExtensionPointDuration.WithLabelValues(preEnqueue, s.Code().String(), pod.Spec.SchedulerName).Observe(metrics.SinceInSeconds(startTime))
	}()

	shouldRecordMetric := rand.Intn(100) < p.pluginMetricsSamplePercent
	for _, pl := range p.preEnqueuePluginMap[pod.Spec.SchedulerName] {
		s = p.runPreEnqueuePlugin(ctx, pl, pod, shouldRecordMetric)
		if s.IsSuccess() {
			continue
		}
		pInfo.UnschedulablePlugins.Insert(pl.Name())
		metrics.UnschedulableReason(pl.Name(), pod.Spec.SchedulerName).Inc()
		if s.Code() == framework.Error {
			logger.Error(s.AsError(), "Unexpected error running PreEnqueue plugin", "pod", klog.KObj(pod), "plugin", pl.Name())
		} else {
			logger.Info("Status after running PreEnqueue plugin", "pod", klog.KObj(pod), "plugin", pl.Name(), "status", s)
		}
		return false
	}
	return true
}

func (p *PriorityQueue) runPreEnqueuePlugin(ctx context.Context, pl framework.PreEnqueuePlugin, pod *v1.Pod, shouldRecordMetric bool) *framework.Status {
	if !shouldRecordMetric {
		return pl.PreEnqueue(ctx, pod)
	}
	startTime := p.clock.Now()
	s := pl.PreEnqueue(ctx, pod)
	p.metricsRecorder.ObservePluginDurationAsync(preEnqueue, pl.Name(), s.Code().String(), p.clock.Since(startTime).Seconds())
	return s
}
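// A hypothetical PreEnqueuePlugin (illustrative only) of the kind iterated by
// runPreEnqueuePlugins above: it gates pods carrying a "paused" scheduling
// gate. The plugin name and gate value are assumptions for the sketch.
//
//	type pauseGate struct{}
//
//	func (pg *pauseGate) Name() string { return "PauseGate" }
//
//	func (pg *pauseGate) PreEnqueue(ctx context.Context, pod *v1.Pod) *framework.Status {
//		for _, g := range pod.Spec.SchedulingGates {
//			if g.Name == "example.com/paused" {
//				return framework.NewStatus(framework.UnschedulableAndUnresolvable, "pod is paused")
//			}
//		}
//		return nil
//	}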
// addToActiveQ tries to add a pod to the active queue. It returns 2 parameters:
// 1. a boolean flag to indicate whether the pod is added successfully.
// 2. an error for the caller to act on.
func (p *PriorityQueue) addToActiveQ(logger klog.Logger, pInfo *framework.QueuedPodInfo) (bool, error) {
	pInfo.Gated = !p.runPreEnqueuePlugins(context.Background(), pInfo)
	if pInfo.Gated {
		// Add the Pod to unschedulablePods if it's not passing PreEnqueuePlugins.
		p.unschedulablePods.addOrUpdate(pInfo)
		return false, nil
	}
	if pInfo.InitialAttemptTimestamp == nil {
		now := p.clock.Now()
		pInfo.InitialAttemptTimestamp = &now
	}
	if err := p.activeQ.Add(pInfo); err != nil {
		logger.Error(err, "Error adding pod to the active queue", "pod", klog.KObj(pInfo.Pod))
		return false, err
	}
	return true, nil
}

// Add adds a pod to the active queue. It should be called only when a new pod
// is added, so there is no chance the pod is already in the active/unschedulable/backoff queues.
func (p *PriorityQueue) Add(logger klog.Logger, pod *v1.Pod) error {
	p.lock.Lock()
	defer p.lock.Unlock()

	pInfo := p.newQueuedPodInfo(pod)
	gated := pInfo.Gated
	if added, err := p.addToActiveQ(logger, pInfo); !added {
		return err
	}
	if p.unschedulablePods.get(pod) != nil {
		logger.Error(nil, "Error: pod is already in the unschedulable queue", "pod", klog.KObj(pod))
		p.unschedulablePods.delete(pod, gated)
	}
	// Delete pod from backoffQ if it is backing off
	if err := p.podBackoffQ.Delete(pInfo); err == nil {
		logger.Error(nil, "Error: pod is already in the podBackoff queue", "pod", klog.KObj(pod))
	}
	logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", PodAdd, "queue", activeQ)
	metrics.SchedulerQueueIncomingPods.WithLabelValues("active", PodAdd).Inc()
	p.addNominatedPodUnlocked(logger, pInfo.PodInfo, nil)
	p.cond.Broadcast()

	return nil
}

// Activate moves the given pods to activeQ iff they're in unschedulablePods or backoffQ.
func (p *PriorityQueue) Activate(logger klog.Logger, pods map[string]*v1.Pod) {
	p.lock.Lock()
	defer p.lock.Unlock()

	activated := false
	for _, pod := range pods {
		if p.activate(logger, pod) {
			activated = true
		}
	}

	if activated {
		p.cond.Broadcast()
	}
}
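// How Activate is typically fed (a sketch under assumptions; the map key
// format doesn't matter because Activate only iterates the values): a plugin
// records pods in the reserved PodsToActivate CycleState entry, which the
// scheduler later passes to Activate.
//
//	if c, err := state.Read(framework.PodsToActivateKey); err == nil {
//		if s, ok := c.(*framework.PodsToActivate); ok {
//			s.Lock()
//			s.Map[util.GetPodFullName(podToActivate)] = podToActivate
//			s.Unlock()
//		}
//	}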
func (p *PriorityQueue) activate(logger klog.Logger, pod *v1.Pod) bool {
	// Verify if the pod is present in activeQ.
	if _, exists, _ := p.activeQ.Get(newQueuedPodInfoForLookup(pod)); exists {
		// No need to activate if it's already present in activeQ.
		return false
	}
	var pInfo *framework.QueuedPodInfo
	// Verify if the pod is present in unschedulablePods or backoffQ.
	if pInfo = p.unschedulablePods.get(pod); pInfo == nil {
		// If the pod doesn't belong to unschedulablePods or backoffQ, don't activate it.
		if obj, exists, _ := p.podBackoffQ.Get(newQueuedPodInfoForLookup(pod)); !exists {
			logger.Error(nil, "To-activate pod does not exist in unschedulablePods or backoffQ", "pod", klog.KObj(pod))
			return false
		} else {
			pInfo = obj.(*framework.QueuedPodInfo)
		}
	}

	if pInfo == nil {
		// Redundant safe check. We shouldn't reach here.
		logger.Error(nil, "Internal error: cannot obtain pInfo")
		return false
	}

	gated := pInfo.Gated
	if added, _ := p.addToActiveQ(logger, pInfo); !added {
		return false
	}
	p.unschedulablePods.delete(pInfo.Pod, gated)
	p.podBackoffQ.Delete(pInfo)
	metrics.SchedulerQueueIncomingPods.WithLabelValues("active", ForceActivate).Inc()
	p.addNominatedPodUnlocked(logger, pInfo.PodInfo, nil)
	return true
}

// isPodBackingoff returns true if a pod is still waiting for its backoff timer.
// If this returns true, the pod should not be re-tried.
func (p *PriorityQueue) isPodBackingoff(podInfo *framework.QueuedPodInfo) bool {
	if podInfo.Gated {
		return false
	}
	boTime := p.getBackoffTime(podInfo)
	return boTime.After(p.clock.Now())
}

// SchedulingCycle returns the current scheduling cycle.
func (p *PriorityQueue) SchedulingCycle() int64 {
	p.lock.RLock()
	defer p.lock.RUnlock()
	return p.schedulingCycle
}
// determineSchedulingHintForInFlightPod looks at the unschedulable plugins of the given Pod
// and determines the scheduling hint for this Pod while checking the events that happened
// while the Pod was in flight.
func (p *PriorityQueue) determineSchedulingHintForInFlightPod(logger klog.Logger, pInfo *framework.QueuedPodInfo) queueingStrategy {
	logger.V(5).Info("Checking events for in-flight pod", "pod", klog.KObj(pInfo.Pod), "unschedulablePlugins", pInfo.UnschedulablePlugins, "inFlightEventsSize", p.inFlightEvents.Len(), "inFlightPodsSize", len(p.inFlightPods))

	// AddUnschedulableIfNotPresent is called with the Pod at the end of scheduling or binding.
	// So, given pInfo should have been Pop()ed before,
	// we can assume pInfo must be recorded in inFlightPods and thus inFlightEvents.
	inFlightPod, ok := p.inFlightPods[pInfo.Pod.UID]
	if !ok {
		// This can happen while updating a pod. In that case pInfo.UnschedulablePlugins should
		// be empty. If it is not, we may have a problem.
		if len(pInfo.UnschedulablePlugins) != 0 {
			logger.Error(nil, "In flight Pod isn't found in the scheduling queue. If you see this error log, it's likely a bug in the scheduler.", "pod", klog.KObj(pInfo.Pod))
			return queueAfterBackoff
		}
		if p.inFlightEvents.Len() > len(p.inFlightPods) {
			return queueAfterBackoff
		}
		return queueSkip
	}

	rejectorPlugins := pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins)
	if len(rejectorPlugins) == 0 {
		// No failed plugins are associated with this Pod.
		// Meaning something unusual (a temporary failure on kube-apiserver, etc) happened and this Pod gets moved back to the queue.
		// In this case, we should retry scheduling it because this Pod may not be retried until the next flush.
		return queueAfterBackoff
	}

	// check if there is an event that makes this Pod schedulable based on pInfo.UnschedulablePlugins.
	queueingStrategy := queueSkip
	for event := inFlightPod.Next(); event != nil; event = event.Next() {
		e, ok := event.Value.(*clusterEvent)
		if !ok {
			// Must be another in-flight Pod (*v1.Pod). Can be ignored.
			continue
		}
		logger.V(5).Info("Checking event for in-flight pod", "pod", klog.KObj(pInfo.Pod), "event", e.event.Label)

		switch p.isPodWorthRequeuing(logger, pInfo, e.event, e.oldObj, e.newObj) {
		case queueSkip:
			continue
		case queueImmediately:
			// queueImmediately is the highest priority.
			// No need to go through the rest of the events.
			return queueImmediately
		case queueAfterBackoff:
			// replace schedulingHint with queueAfterBackoff
			queueingStrategy = queueAfterBackoff
			if pInfo.PendingPlugins.Len() == 0 {
				// We can return immediately because this Pod has no Pending plugins registered
				// (only Pending plugins can cause queueImmediately), and queueAfterBackoff is
				// the second highest priority.
				return queueAfterBackoff
			}
		}
	}
	return queueingStrategy
}

// addUnschedulableWithoutQueueingHint inserts a pod that cannot be scheduled into
// the queue, unless it is already in the queue. Normally, PriorityQueue puts
// unschedulable pods in `unschedulablePods`. But if there has been a recent move
// request, then the pod is put in `podBackoffQ`.
// TODO: This function is called only when p.isSchedulingQueueHintEnabled is false,
// and this will be removed after SchedulingQueueHint goes to stable and the feature gate is removed.
func (p *PriorityQueue) addUnschedulableWithoutQueueingHint(logger klog.Logger, pInfo *framework.QueuedPodInfo, podSchedulingCycle int64) error {
	pod := pInfo.Pod
	// Refresh the timestamp since the pod is re-added.
	pInfo.Timestamp = p.clock.Now()

	// When the queueing hint is enabled, the two sets are used differently.
	// But when it isn't enabled, we use all of them as UnschedulablePlugins so that we don't break the old behaviour.
	rejectorPlugins := pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins)

	// If a move request has been received, move it to the BackoffQ, otherwise move
	// it to unschedulablePods.
	for plugin := range rejectorPlugins {
		metrics.UnschedulableReason(plugin, pInfo.Pod.Spec.SchedulerName).Inc()
	}
	if p.moveRequestCycle >= podSchedulingCycle || len(rejectorPlugins) == 0 {
		// Two cases to move a Pod to the active/backoff queue:
		// - The Pod is rejected by some plugins, but a move request is received after this Pod's scheduling cycle is started.
		//   In this case, the received event may make the Pod schedulable and we should retry scheduling it.
		// - No unschedulable plugins are associated with this Pod,
		//   meaning something unusual (a temporary failure on kube-apiserver, etc) happened and this Pod gets moved back to the queue.
		//   In this case, we should retry scheduling it because this Pod may not be retried until the next flush.
		if err := p.podBackoffQ.Add(pInfo); err != nil {
			return fmt.Errorf("error adding pod %v to the backoff queue: %v", klog.KObj(pod), err)
		}
		logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", ScheduleAttemptFailure, "queue", backoffQ)
		metrics.SchedulerQueueIncomingPods.WithLabelValues("backoff", ScheduleAttemptFailure).Inc()
	} else {
		p.unschedulablePods.addOrUpdate(pInfo)
		logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", ScheduleAttemptFailure, "queue", unschedulablePods)
		metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", ScheduleAttemptFailure).Inc()
	}

	p.addNominatedPodUnlocked(logger, pInfo.PodInfo, nil)
	return nil
}
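// A usage sketch (hypothetical; the plugin name is chosen for illustration)
// of the failure path handled by AddUnschedulableIfNotPresent below:
//
//	pInfo, _ := q.Pop(logger)
//	cycle := q.SchedulingCycle()
//	// ... the scheduling attempt fails; record the rejecting plugin ...
//	pInfo.UnschedulablePlugins.Insert("NodeResourcesFit")
//	_ = q.AddUnschedulableIfNotPresent(logger, pInfo, cycle)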
// AddUnschedulableIfNotPresent inserts a pod that cannot be scheduled into
// the queue, unless it is already in the queue. Normally, PriorityQueue puts
// unschedulable pods in `unschedulablePods`. But if there has been a recent move
// request, then the pod is put in `podBackoffQ`.
func (p *PriorityQueue) AddUnschedulableIfNotPresent(logger klog.Logger, pInfo *framework.QueuedPodInfo, podSchedulingCycle int64) error {
	p.lock.Lock()
	defer p.lock.Unlock()

	// In any case, this Pod will be moved back to the queue and we should call Done.
	defer p.done(pInfo.Pod.UID)

	pod := pInfo.Pod
	if p.unschedulablePods.get(pod) != nil {
		return fmt.Errorf("Pod %v is already present in unschedulable queue", klog.KObj(pod))
	}

	if _, exists, _ := p.activeQ.Get(pInfo); exists {
		return fmt.Errorf("Pod %v is already present in the active queue", klog.KObj(pod))
	}
	if _, exists, _ := p.podBackoffQ.Get(pInfo); exists {
		return fmt.Errorf("Pod %v is already present in the backoff queue", klog.KObj(pod))
	}

	if !p.isSchedulingQueueHintEnabled {
		// fall back to the old behavior which doesn't depend on the queueing hint.
		return p.addUnschedulableWithoutQueueingHint(logger, pInfo, podSchedulingCycle)
	}

	// Refresh the timestamp since the pod is re-added.
	pInfo.Timestamp = p.clock.Now()

	// If a move request has been received, move it to the BackoffQ, otherwise move
	// it to unschedulablePods.
	rejectorPlugins := pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins)
	for plugin := range rejectorPlugins {
		metrics.UnschedulableReason(plugin, pInfo.Pod.Spec.SchedulerName).Inc()
	}

	// We check whether this Pod may change its scheduling result by any of the events that happened during scheduling.
	schedulingHint := p.determineSchedulingHintForInFlightPod(logger, pInfo)

	// In this case, we try to requeue this Pod to activeQ/backoffQ.
	queue := p.requeuePodViaQueueingHint(logger, pInfo, schedulingHint, ScheduleAttemptFailure)
	logger.V(3).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", ScheduleAttemptFailure, "queue", queue, "schedulingCycle", podSchedulingCycle, "hint", schedulingHint, "unschedulable plugins", rejectorPlugins)
	if queue == activeQ {
		// When the Pod is moved to activeQ, we need to let p.cond know so that the Pod can be popped out.
		p.cond.Broadcast()
	}

	p.addNominatedPodUnlocked(logger, pInfo.PodInfo, nil)
	return nil
}

// flushBackoffQCompleted moves all pods from backoffQ which have completed backoff into activeQ.
func (p *PriorityQueue) flushBackoffQCompleted(logger klog.Logger) {
	p.lock.Lock()
	defer p.lock.Unlock()
	activated := false
	for {
		rawPodInfo := p.podBackoffQ.Peek()
		if rawPodInfo == nil {
			break
		}
		pInfo := rawPodInfo.(*framework.QueuedPodInfo)
		pod := pInfo.Pod
		if p.isPodBackingoff(pInfo) {
			break
		}
		_, err := p.podBackoffQ.Pop()
		if err != nil {
			logger.Error(err, "Unable to pop pod from backoff queue despite backoff completion", "pod", klog.KObj(pod))
			break
		}
		if added, _ := p.addToActiveQ(logger, pInfo); added {
			logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", BackoffComplete, "queue", activeQ)
			metrics.SchedulerQueueIncomingPods.WithLabelValues("active", BackoffComplete).Inc()
			activated = true
		}
	}

	if activated {
		p.cond.Broadcast()
	}
}
// flushUnschedulablePodsLeftover moves pods which stay in unschedulablePods
// longer than podMaxInUnschedulablePodsDuration to backoffQ or activeQ.
func (p *PriorityQueue) flushUnschedulablePodsLeftover(logger klog.Logger) {
	p.lock.Lock()
	defer p.lock.Unlock()

	var podsToMove []*framework.QueuedPodInfo
	currentTime := p.clock.Now()
	for _, pInfo := range p.unschedulablePods.podInfoMap {
		lastScheduleTime := pInfo.Timestamp
		if currentTime.Sub(lastScheduleTime) > p.podMaxInUnschedulablePodsDuration {
			podsToMove = append(podsToMove, pInfo)
		}
	}

	if len(podsToMove) > 0 {
		p.movePodsToActiveOrBackoffQueue(logger, podsToMove, UnschedulableTimeout, nil, nil)
	}
}

// Pop removes the head of the active queue and returns it. It blocks if the
// activeQ is empty and waits until a new item is added to the queue. It
// increments the scheduling cycle when a pod is popped.
func (p *PriorityQueue) Pop(logger klog.Logger) (*framework.QueuedPodInfo, error) {
	p.lock.Lock()
	defer p.lock.Unlock()
	for p.activeQ.Len() == 0 {
		// When the queue is empty, invocation of Pop() is blocked until a new item is enqueued.
		// When Close() is called, the p.closed is set and the condition is broadcast,
		// which causes this loop to continue and return from the Pop().
		if p.closed {
			logger.V(2).Info("Scheduling queue is closed")
			return nil, nil
		}
		p.cond.Wait()
	}
	obj, err := p.activeQ.Pop()
	if err != nil {
		return nil, err
	}
	pInfo := obj.(*framework.QueuedPodInfo)
	pInfo.Attempts++
	p.schedulingCycle++
	// In flight, no concurrent events yet.
	if p.isSchedulingQueueHintEnabled {
		p.inFlightPods[pInfo.Pod.UID] = p.inFlightEvents.PushBack(pInfo.Pod)
	}

	// Update metrics and reset the set of unschedulable plugins for the next attempt.
	for plugin := range pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins) {
		metrics.UnschedulableReason(plugin, pInfo.Pod.Spec.SchedulerName).Dec()
	}
	pInfo.UnschedulablePlugins.Clear()
	pInfo.PendingPlugins.Clear()

	return pInfo, nil
}

// Done must be called for a pod returned by Pop. This allows the queue to
// keep track of which pods are currently being processed.
func (p *PriorityQueue) Done(pod types.UID) {
	p.lock.Lock()
	defer p.lock.Unlock()

	p.done(pod)
}

func (p *PriorityQueue) done(pod types.UID) {
	if !p.isSchedulingQueueHintEnabled {
		// do nothing if schedulingQueueHint is disabled.
		// In that case, we don't have inFlightPods and inFlightEvents.
		return
	}
	inFlightPod, ok := p.inFlightPods[pod]
	if !ok {
		// This Pod is already done()ed.
		return
	}
	delete(p.inFlightPods, pod)

	// Remove the pod from the list.
	p.inFlightEvents.Remove(inFlightPod)

	// Remove events which are only referred to by this Pod
	// so that the inFlightEvents list doesn't grow infinitely.
	// If the pod was at the head of the list, then all
	// events between it and the next pod are no longer needed
	// and can be removed.
	for {
		e := p.inFlightEvents.Front()
		if e == nil {
			// Empty list.
			break
		}
		if _, ok := e.Value.(*clusterEvent); !ok {
			// A pod, must stop pruning.
			break
		}
		p.inFlightEvents.Remove(e)
	}
}
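// A worked example (added for illustration) of the pruning in done() above:
// suppose inFlightEvents is [podA, event1, podB, event2] and Done(podA) is
// called. podA's element is removed, and event1 is then pruned because it
// sits at the head with no earlier in-flight pod left to consume it; the list
// becomes [podB, event2]. event2 stays because podB started before it.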
// isPodUpdated checks if the pod is updated in a way that it may have become
// schedulable. It drops the status of the pod and compares it with the old version,
// except for pod.status.resourceClaimStatuses: changing that may have an
// effect on scheduling.
func isPodUpdated(oldPod, newPod *v1.Pod) bool {
	strip := func(pod *v1.Pod) *v1.Pod {
		p := pod.DeepCopy()
		p.ResourceVersion = ""
		p.Generation = 0
		p.Status = v1.PodStatus{
			ResourceClaimStatuses: pod.Status.ResourceClaimStatuses,
		}
		p.ManagedFields = nil
		p.Finalizers = nil
		return p
	}
	return !reflect.DeepEqual(strip(oldPod), strip(newPod))
}

// Update updates a pod in the active or backoff queue if present. Otherwise, it removes
// the item from the unschedulable queue if the pod is updated in a way that may make it
// schedulable and adds the updated one to the active queue.
// If the pod is not present in any of the queues, it is added to the active queue.
func (p *PriorityQueue) Update(logger klog.Logger, oldPod, newPod *v1.Pod) error {
	p.lock.Lock()
	defer p.lock.Unlock()

	if oldPod != nil {
		oldPodInfo := newQueuedPodInfoForLookup(oldPod)
		// If the pod is already in the active queue, just update it there.
		if oldPodInfo, exists, _ := p.activeQ.Get(oldPodInfo); exists {
			pInfo := updatePod(oldPodInfo, newPod)
			p.updateNominatedPodUnlocked(logger, oldPod, pInfo.PodInfo)
			return p.activeQ.Update(pInfo)
		}

		// If the pod is in the backoff queue, update it there.
		if oldPodInfo, exists, _ := p.podBackoffQ.Get(oldPodInfo); exists {
			pInfo := updatePod(oldPodInfo, newPod)
			p.updateNominatedPodUnlocked(logger, oldPod, pInfo.PodInfo)
			return p.podBackoffQ.Update(pInfo)
		}
	}

	// If the pod is in the unschedulable queue, updating it may make it schedulable.
	if usPodInfo := p.unschedulablePods.get(newPod); usPodInfo != nil {
		pInfo := updatePod(usPodInfo, newPod)
		p.updateNominatedPodUnlocked(logger, oldPod, pInfo.PodInfo)
		if isPodUpdated(oldPod, newPod) {
			gated := usPodInfo.Gated
			if p.isPodBackingoff(usPodInfo) {
				if err := p.podBackoffQ.Add(pInfo); err != nil {
					return err
				}
				p.unschedulablePods.delete(usPodInfo.Pod, gated)
				logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", PodUpdate, "queue", backoffQ)
			} else {
				if added, err := p.addToActiveQ(logger, pInfo); !added {
					return err
				}
				p.unschedulablePods.delete(usPodInfo.Pod, gated)
				logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", BackoffComplete, "queue", activeQ)
				p.cond.Broadcast()
			}
		} else {
			// Pod update didn't make it schedulable, keep it in the unschedulable queue.
			p.unschedulablePods.addOrUpdate(pInfo)
		}

		return nil
	}
	// If pod is not in any of the queues, we put it in the active queue.
	pInfo := p.newQueuedPodInfo(newPod)
	if added, err := p.addToActiveQ(logger, pInfo); !added {
		return err
	}
	p.addNominatedPodUnlocked(logger, pInfo.PodInfo, nil)
	logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", PodUpdate, "queue", activeQ)
	p.cond.Broadcast()
	return nil
}
// Delete deletes the item from any of the queues. It assumes the pod is
// only in one of them.
func (p *PriorityQueue) Delete(pod *v1.Pod) error {
	p.lock.Lock()
	defer p.lock.Unlock()
	p.deleteNominatedPodIfExistsUnlocked(pod)
	pInfo := newQueuedPodInfoForLookup(pod)
	if err := p.activeQ.Delete(pInfo); err != nil {
		// The item was probably not found in the activeQ.
		p.podBackoffQ.Delete(pInfo)
		if pInfo = p.unschedulablePods.get(pod); pInfo != nil {
			p.unschedulablePods.delete(pod, pInfo.Gated)
		}
	}
	return nil
}

// AssignedPodAdded is called when a bound pod is added. Creation of this pod
// may make pending pods with matching affinity terms schedulable.
func (p *PriorityQueue) AssignedPodAdded(logger klog.Logger, pod *v1.Pod) {
	p.lock.Lock()
	p.movePodsToActiveOrBackoffQueue(logger, p.getUnschedulablePodsWithMatchingAffinityTerm(logger, pod), AssignedPodAdd, nil, pod)
	p.lock.Unlock()
}

// isPodResourcesResizedDown returns true if a pod CPU and/or memory resize request has been
// admitted by kubelet, is 'InProgress', and results in a net sizing down of updated resources.
// It returns false if either CPU or memory resource is net resized up, or if no resize is in progress.
func isPodResourcesResizedDown(pod *v1.Pod) bool {
	if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
		// TODO(vinaykul,wangchen615,InPlacePodVerticalScaling): Fix this to determine when a
		// pod is truly resized down (might need oldPod if we cannot determine from Status alone)
		if pod.Status.Resize == v1.PodResizeStatusInProgress {
			return true
		}
	}
	return false
}

// AssignedPodUpdated is called when a bound pod is updated. Change of labels
// may make pending pods with matching affinity terms schedulable.
func (p *PriorityQueue) AssignedPodUpdated(logger klog.Logger, oldPod, newPod *v1.Pod) {
	p.lock.Lock()
	if isPodResourcesResizedDown(newPod) {
		p.moveAllToActiveOrBackoffQueue(logger, AssignedPodUpdate, oldPod, newPod, nil)
	} else {
		p.movePodsToActiveOrBackoffQueue(logger, p.getUnschedulablePodsWithMatchingAffinityTerm(logger, newPod), AssignedPodUpdate, oldPod, newPod)
	}
	p.lock.Unlock()
}

// moveAllToActiveOrBackoffQueue moves all pods from unschedulablePods to activeQ or backoffQ.
// This function adds all pods and then signals the condition variable to ensure that
// if Pop() is waiting for an item, it receives the signal after all the pods are in the
// queue and the head is the highest priority pod.
// NOTE: this function assumes a lock has been acquired in the caller.
func (p *PriorityQueue) moveAllToActiveOrBackoffQueue(logger klog.Logger, event framework.ClusterEvent, oldObj, newObj interface{}, preCheck PreEnqueueCheck) {
	unschedulablePods := make([]*framework.QueuedPodInfo, 0, len(p.unschedulablePods.podInfoMap))
	for _, pInfo := range p.unschedulablePods.podInfoMap {
		if preCheck == nil || preCheck(pInfo.Pod) {
			unschedulablePods = append(unschedulablePods, pInfo)
		}
	}
	p.movePodsToActiveOrBackoffQueue(logger, unschedulablePods, event, oldObj, newObj)
}
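// A sketch of a typical caller (illustrative; NodeAdd stands in for one of
// the cluster events defined alongside this package, and the preCheck is the
// hypothetical hasNodeAffinity from the PreEnqueueCheck example above):
//
//	func onNodeAdd(logger klog.Logger, q SchedulingQueue, node *v1.Node) {
//		// A nil preCheck would re-evaluate every unschedulable pod.
//		q.MoveAllToActiveOrBackoffQueue(logger, NodeAdd, nil, node, hasNodeAffinity)
//	}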
// MoveAllToActiveOrBackoffQueue moves all pods from unschedulablePods to activeQ or backoffQ.
// This function adds all pods and then signals the condition variable to ensure that
// if Pop() is waiting for an item, it receives the signal after all the pods are in the
// queue and the head is the highest priority pod.
func (p *PriorityQueue) MoveAllToActiveOrBackoffQueue(logger klog.Logger, event framework.ClusterEvent, oldObj, newObj interface{}, preCheck PreEnqueueCheck) {
	p.lock.Lock()
	defer p.lock.Unlock()
	p.moveAllToActiveOrBackoffQueue(logger, event, oldObj, newObj, preCheck)
}

// requeuePodViaQueueingHint tries to requeue a Pod to activeQ, backoffQ or the unschedulable pod pool based on schedulingHint.
// It returns the name of the queue the Pod goes to.
//
// NOTE: this function assumes the lock has been acquired in the caller
func (p *PriorityQueue) requeuePodViaQueueingHint(logger klog.Logger, pInfo *framework.QueuedPodInfo, strategy queueingStrategy, event string) string {
	if strategy == queueSkip {
		p.unschedulablePods.addOrUpdate(pInfo)
		metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", event).Inc()
		return unschedulablePods
	}

	pod := pInfo.Pod
	if strategy == queueAfterBackoff && p.isPodBackingoff(pInfo) {
		if err := p.podBackoffQ.Add(pInfo); err != nil {
			logger.Error(err, "Error adding pod to the backoff queue, queue this Pod to unschedulable pod pool", "pod", klog.KObj(pod))
			p.unschedulablePods.addOrUpdate(pInfo)
			return unschedulablePods
		}

		metrics.SchedulerQueueIncomingPods.WithLabelValues("backoff", event).Inc()
		return backoffQ
	}

	// We reach here if the strategy is queueImmediately, or if it is queueAfterBackoff but the pod is not backing off.

	added, err := p.addToActiveQ(logger, pInfo)
	if err != nil {
		logger.Error(err, "Error adding pod to the active queue, queue this Pod to unschedulable pod pool", "pod", klog.KObj(pod))
	}
	if added {
		metrics.SchedulerQueueIncomingPods.WithLabelValues("active", event).Inc()
		return activeQ
	}
	if pInfo.Gated {
		// In case the pod is gated, the Pod is pushed back to the unschedulable Pods pool in addToActiveQ.
		return unschedulablePods
	}

	p.unschedulablePods.addOrUpdate(pInfo)
	metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", ScheduleAttemptFailure).Inc()
	return unschedulablePods
}
// NOTE: this function assumes the lock has been acquired in the caller
func (p *PriorityQueue) movePodsToActiveOrBackoffQueue(logger klog.Logger, podInfoList []*framework.QueuedPodInfo, event framework.ClusterEvent, oldObj, newObj interface{}) {
	activated := false
	for _, pInfo := range podInfoList {
		schedulingHint := p.isPodWorthRequeuing(logger, pInfo, event, oldObj, newObj)
		if schedulingHint == queueSkip {
			// QueueingHintFn determined that this Pod isn't worth putting into activeQ or backoffQ for this event.
			logger.V(5).Info("Event is not making pod schedulable", "pod", klog.KObj(pInfo.Pod), "event", event.Label)
			continue
		}

		p.unschedulablePods.delete(pInfo.Pod, pInfo.Gated)
		queue := p.requeuePodViaQueueingHint(logger, pInfo, schedulingHint, event.Label)
		logger.V(4).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", event.Label, "queue", queue, "hint", schedulingHint)
		if queue == activeQ {
			activated = true
		}
	}

	p.moveRequestCycle = p.schedulingCycle

	if p.isSchedulingQueueHintEnabled && len(p.inFlightPods) != 0 {
		logger.V(5).Info("Event received while pods are in flight", "event", event.Label, "numPods", len(p.inFlightPods))
		// AddUnschedulableIfNotPresent might get called for in-flight Pods later, and in
		// AddUnschedulableIfNotPresent we need to know whether events were
		// observed while scheduling them.
		p.inFlightEvents.PushBack(&clusterEvent{
			event:  event,
			oldObj: oldObj,
			newObj: newObj,
		})
	}

	if activated {
		p.cond.Broadcast()
	}
}

// getUnschedulablePodsWithMatchingAffinityTerm returns unschedulable pods which have
// any affinity term that matches "pod".
// NOTE: this function assumes the lock has been acquired in the caller.
func (p *PriorityQueue) getUnschedulablePodsWithMatchingAffinityTerm(logger klog.Logger, pod *v1.Pod) []*framework.QueuedPodInfo {
	nsLabels := interpodaffinity.GetNamespaceLabelsSnapshot(logger, pod.Namespace, p.nsLister)

	var podsToMove []*framework.QueuedPodInfo
	for _, pInfo := range p.unschedulablePods.podInfoMap {
		for _, term := range pInfo.RequiredAffinityTerms {
			if term.Matches(pod, nsLabels) {
				podsToMove = append(podsToMove, pInfo)
				break
			}
		}
	}
	return podsToMove
}

var pendingPodsSummary = "activeQ:%v; backoffQ:%v; unschedulablePods:%v"

// PendingPods returns all the pending pods in the queue, accompanied by a debugging
// string showing the number of pods in each queue.
// This function is used for debugging purposes in the scheduler cache dumper and comparer.
func (p *PriorityQueue) PendingPods() ([]*v1.Pod, string) {
	p.lock.RLock()
	defer p.lock.RUnlock()
	var result []*v1.Pod
	for _, pInfo := range p.activeQ.List() {
		result = append(result, pInfo.(*framework.QueuedPodInfo).Pod)
	}
	for _, pInfo := range p.podBackoffQ.List() {
		result = append(result, pInfo.(*framework.QueuedPodInfo).Pod)
	}
	for _, pInfo := range p.unschedulablePods.podInfoMap {
		result = append(result, pInfo.Pod)
	}
	return result, fmt.Sprintf(pendingPodsSummary, p.activeQ.Len(), p.podBackoffQ.Len(), len(p.unschedulablePods.podInfoMap))
}

// Close closes the priority queue.
func (p *PriorityQueue) Close() {
	p.lock.Lock()
	defer p.lock.Unlock()
	close(p.stop)
	p.closed = true
	p.cond.Broadcast()
}

// DeleteNominatedPodIfExists deletes <pod> from nominatedPods.
func (npm *nominator) DeleteNominatedPodIfExists(pod *v1.Pod) {
	npm.lock.Lock()
	npm.deleteNominatedPodIfExistsUnlocked(pod)
	npm.lock.Unlock()
}

func (npm *nominator) deleteNominatedPodIfExistsUnlocked(pod *v1.Pod) {
	npm.delete(pod)
}
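// Example debug usage (hypothetical) of PendingPods above, similar to how the
// scheduler's cache dumper consumes it:
//
//	pods, summary := q.PendingPods()
//	logger.Info("Dump of pending pods", "count", len(pods), "summary", summary)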
// AddNominatedPod adds a pod to the nominated pods of the given node.
// This is called during the preemption process after a node is nominated to run
// the pod. We update the structure before sending a request to update the pod
// object to avoid races with the following scheduling cycles.
func (npm *nominator) AddNominatedPod(logger klog.Logger, pi *framework.PodInfo, nominatingInfo *framework.NominatingInfo) {
	npm.lock.Lock()
	npm.addNominatedPodUnlocked(logger, pi, nominatingInfo)
	npm.lock.Unlock()
}

// NominatedPodsForNode returns a copy of pods that are nominated to run on the given node,
// but they are waiting for other pods to be removed from the node.
func (npm *nominator) NominatedPodsForNode(nodeName string) []*framework.PodInfo {
	npm.lock.RLock()
	defer npm.lock.RUnlock()
	// Make a copy of the nominated Pods so the caller can mutate safely.
	pods := make([]*framework.PodInfo, len(npm.nominatedPods[nodeName]))
	for i := 0; i < len(pods); i++ {
		pods[i] = npm.nominatedPods[nodeName][i].DeepCopy()
	}
	return pods
}

func (p *PriorityQueue) podsCompareBackoffCompleted(podInfo1, podInfo2 interface{}) bool {
	pInfo1 := podInfo1.(*framework.QueuedPodInfo)
	pInfo2 := podInfo2.(*framework.QueuedPodInfo)
	bo1 := p.getBackoffTime(pInfo1)
	bo2 := p.getBackoffTime(pInfo2)
	return bo1.Before(bo2)
}

// newQueuedPodInfo builds a QueuedPodInfo object.
func (p *PriorityQueue) newQueuedPodInfo(pod *v1.Pod, plugins ...string) *framework.QueuedPodInfo {
	now := p.clock.Now()
	// ignore this err since apiserver doesn't properly validate affinity terms
	// and we can't fix the validation for backwards compatibility.
	podInfo, _ := framework.NewPodInfo(pod)
	return &framework.QueuedPodInfo{
		PodInfo:                 podInfo,
		Timestamp:               now,
		InitialAttemptTimestamp: nil,
		UnschedulablePlugins:    sets.New(plugins...),
	}
}

// getBackoffTime returns the time that podInfo completes backoff
func (p *PriorityQueue) getBackoffTime(podInfo *framework.QueuedPodInfo) time.Time {
	duration := p.calculateBackoffDuration(podInfo)
	backoffTime := podInfo.Timestamp.Add(duration)
	return backoffTime
}

// calculateBackoffDuration is a helper function for calculating the backoffDuration
// based on the number of attempts the pod has made.
func (p *PriorityQueue) calculateBackoffDuration(podInfo *framework.QueuedPodInfo) time.Duration {
	duration := p.podInitialBackoffDuration
	for i := 1; i < podInfo.Attempts; i++ {
		// Use subtraction instead of addition or multiplication to avoid overflow.
		if duration > p.podMaxBackoffDuration-duration {
			return p.podMaxBackoffDuration
		}
		duration += duration
	}
	return duration
}

func updatePod(oldPodInfo interface{}, newPod *v1.Pod) *framework.QueuedPodInfo {
	pInfo := oldPodInfo.(*framework.QueuedPodInfo)
	pInfo.Update(newPod)
	return pInfo
}
1312 // UnschedulablePods holds pods that cannot be scheduled. This data structure
1313 // is used to implement unschedulablePods.
1314 type UnschedulablePods struct {
1315 	// podInfoMap is a map keyed by a pod's full name, and the value is a pointer to the QueuedPodInfo.
1316 	podInfoMap map[string]*framework.QueuedPodInfo
1317 	keyFunc    func(*v1.Pod) string
1318 	// unschedulableRecorder/gatedRecorder update the counters when elements of podInfoMap
1319 	// are added or removed; they do nothing if nil.
1320 	unschedulableRecorder, gatedRecorder metrics.MetricRecorder
1321 }
1322 
1323 // addOrUpdate adds a pod to the unschedulable podInfoMap.
1324 func (u *UnschedulablePods) addOrUpdate(pInfo *framework.QueuedPodInfo) {
1325 	podID := u.keyFunc(pInfo.Pod)
1326 	if _, exists := u.podInfoMap[podID]; !exists {
1327 		if pInfo.Gated && u.gatedRecorder != nil {
1328 			u.gatedRecorder.Inc()
1329 		} else if !pInfo.Gated && u.unschedulableRecorder != nil {
1330 			u.unschedulableRecorder.Inc()
1331 		}
1332 	}
1333 	u.podInfoMap[podID] = pInfo
1334 }
1335 
1336 // delete deletes a pod from the unschedulable podInfoMap.
1337 // The `gated` parameter is used to figure out which metric should be decremented.
1338 func (u *UnschedulablePods) delete(pod *v1.Pod, gated bool) {
1339 	podID := u.keyFunc(pod)
1340 	if _, exists := u.podInfoMap[podID]; exists {
1341 		if gated && u.gatedRecorder != nil {
1342 			u.gatedRecorder.Dec()
1343 		} else if !gated && u.unschedulableRecorder != nil {
1344 			u.unschedulableRecorder.Dec()
1345 		}
1346 	}
1347 	delete(u.podInfoMap, podID)
1348 }
1349 
1350 // get returns the QueuedPodInfo if a pod with the same key as the key of the given "pod"
1351 // is found in the map. It returns nil otherwise.
1352 func (u *UnschedulablePods) get(pod *v1.Pod) *framework.QueuedPodInfo {
1353 	podKey := u.keyFunc(pod)
1354 	if pInfo, exists := u.podInfoMap[podKey]; exists {
1355 		return pInfo
1356 	}
1357 	return nil
1358 }
1359 
1360 // clear removes all the entries from the unschedulable podInfoMap.
1361 func (u *UnschedulablePods) clear() {
1362 	u.podInfoMap = make(map[string]*framework.QueuedPodInfo)
1363 	if u.unschedulableRecorder != nil {
1364 		u.unschedulableRecorder.Clear()
1365 	}
1366 	if u.gatedRecorder != nil {
1367 		u.gatedRecorder.Clear()
1368 	}
1369 }
1370 
1371 // newUnschedulablePods initializes a new UnschedulablePods object.
1372 func newUnschedulablePods(unschedulableRecorder, gatedRecorder metrics.MetricRecorder) *UnschedulablePods {
1373 	return &UnschedulablePods{
1374 		podInfoMap:            make(map[string]*framework.QueuedPodInfo),
1375 		keyFunc:               util.GetPodFullName,
1376 		unschedulableRecorder: unschedulableRecorder,
1377 		gatedRecorder:         gatedRecorder,
1378 	}
1379 }
1380 
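// An illustrative sketch (not part of the upstream file): because both
// recorders are nil-checked, UnschedulablePods can be exercised without any
// metrics wired up, which is also how unit tests typically drive it. The
// function name exampleUnschedulablePods is hypothetical.
func exampleUnschedulablePods(pInfo *framework.QueuedPodInfo) {
	u := newUnschedulablePods(nil, nil) // nil recorders: metric updates are skipped
	u.addOrUpdate(pInfo)                // keyed by util.GetPodFullName(pInfo.Pod)
	if got := u.get(pInfo.Pod); got != nil {
		// Pass Gated through so the matching counter would be decremented
		// if recorders were configured.
		u.delete(pInfo.Pod, got.Gated)
	}
}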
1381 // nominator is a structure that stores pods nominated to run on nodes.
1382 // It exists because the nominatedNodeName of pod objects stored in the structure
1383 // may differ from what the scheduler has here. We should be able to find pods
1384 // by their UID and update/delete them.
1385 type nominator struct {
1386 	// podLister is used to verify whether the given pod is alive.
1387 	podLister listersv1.PodLister
1388 	// nominatedPods is a map keyed by a node name and the value is a list of
1389 	// pods which are nominated to run on the node. These are pods which can be in
1390 	// the activeQ or unschedulablePods.
1391 	nominatedPods map[string][]*framework.PodInfo
1392 	// nominatedPodToNode is a map keyed by a pod's UID; the value is the node name where the pod
1393 	// is nominated.
1394 	nominatedPodToNode map[types.UID]string
1395 
1396 	lock sync.RWMutex
1397 }
1398 
1399 func (npm *nominator) addNominatedPodUnlocked(logger klog.Logger, pi *framework.PodInfo, nominatingInfo *framework.NominatingInfo) {
1400 	// Always delete the pod if it already exists, to ensure we never store more than
1401 	// one instance of the pod.
1402 	npm.delete(pi.Pod)
1403 
1404 	var nodeName string
1405 	if nominatingInfo.Mode() == framework.ModeOverride {
1406 		nodeName = nominatingInfo.NominatedNodeName
1407 	} else if nominatingInfo.Mode() == framework.ModeNoop {
1408 		if pi.Pod.Status.NominatedNodeName == "" {
1409 			return
1410 		}
1411 		nodeName = pi.Pod.Status.NominatedNodeName
1412 	}
1413 
1414 	if npm.podLister != nil {
1415 		// If the pod was removed or if it was already scheduled, don't nominate it.
1416 		updatedPod, err := npm.podLister.Pods(pi.Pod.Namespace).Get(pi.Pod.Name)
1417 		if err != nil {
1418 			logger.V(4).Info("Pod doesn't exist in podLister, aborted adding it to the nominator", "pod", klog.KObj(pi.Pod))
1419 			return
1420 		}
1421 		if updatedPod.Spec.NodeName != "" {
1422 			logger.V(4).Info("Pod is already scheduled to a node, aborted adding it to the nominator", "pod", klog.KObj(pi.Pod), "node", updatedPod.Spec.NodeName)
1423 			return
1424 		}
1425 	}
1426 
1427 	npm.nominatedPodToNode[pi.Pod.UID] = nodeName
1428 	for _, npi := range npm.nominatedPods[nodeName] {
1429 		if npi.Pod.UID == pi.Pod.UID {
1430 			logger.V(4).Info("Pod already exists in the nominator", "pod", klog.KObj(npi.Pod))
1431 			return
1432 		}
1433 	}
1434 	npm.nominatedPods[nodeName] = append(npm.nominatedPods[nodeName], pi)
1435 }
1436 
1437 func (npm *nominator) delete(p *v1.Pod) {
1438 	nnn, ok := npm.nominatedPodToNode[p.UID]
1439 	if !ok {
1440 		return
1441 	}
1442 	for i, np := range npm.nominatedPods[nnn] {
1443 		if np.Pod.UID == p.UID {
1444 			npm.nominatedPods[nnn] = append(npm.nominatedPods[nnn][:i], npm.nominatedPods[nnn][i+1:]...)
1445 			if len(npm.nominatedPods[nnn]) == 0 {
1446 				delete(npm.nominatedPods, nnn)
1447 			}
1448 			break
1449 		}
1450 	}
1451 	delete(npm.nominatedPodToNode, p.UID)
1452 }
1453 
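// An illustrative sketch (not part of the upstream file) of how
// addNominatedPodUnlocked above resolves the target node, pulled out for
// clarity: ModeOverride takes the name from the NominatingInfo, ModeNoop falls
// back to the pod's status, and an empty fallback means there is nothing to
// record. The function name resolveNominatedNodeName is hypothetical.
func resolveNominatedNodeName(pod *v1.Pod, ni *framework.NominatingInfo) (string, bool) {
	if ni.Mode() == framework.ModeOverride {
		return ni.NominatedNodeName, true
	}
	// ModeNoop: keep whatever nominated node the pod already carries, if any.
	if nnn := pod.Status.NominatedNodeName; nnn != "" {
		return nnn, true
	}
	return "", false
}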
1454 // UpdateNominatedPod updates the <oldPod> with <newPodInfo>.
1455 func (npm *nominator) UpdateNominatedPod(logger klog.Logger, oldPod *v1.Pod, newPodInfo *framework.PodInfo) {
1456 	npm.lock.Lock()
1457 	defer npm.lock.Unlock()
1458 	npm.updateNominatedPodUnlocked(logger, oldPod, newPodInfo)
1459 }
1460 
1461 func (npm *nominator) updateNominatedPodUnlocked(logger klog.Logger, oldPod *v1.Pod, newPodInfo *framework.PodInfo) {
1462 	// In some cases, an Update event with no "NominatedNode" present is received right
1463 	// after a node ("NominatedNode") is reserved for this pod in memory.
1464 	// In this case, we need to keep reserving the NominatedNode when updating the pod pointer.
1465 	var nominatingInfo *framework.NominatingInfo
1466 	// We won't fall into the `if` block below if the Update event represents:
1467 	// (1) NominatedNode info is added
1468 	// (2) NominatedNode info is updated
1469 	// (3) NominatedNode info is removed
1470 	if NominatedNodeName(oldPod) == "" && NominatedNodeName(newPodInfo.Pod) == "" {
1471 		if nnn, ok := npm.nominatedPodToNode[oldPod.UID]; ok {
1472 			// This is the only case in which we should continue reserving the NominatedNode.
1473 			nominatingInfo = &framework.NominatingInfo{
1474 				NominatingMode:    framework.ModeOverride,
1475 				NominatedNodeName: nnn,
1476 			}
1477 		}
1478 	}
1479 	// We update irrespective of whether the nominatedNodeName changed, to ensure
1480 	// that the pod pointer is updated.
1481 	npm.delete(oldPod)
1482 	npm.addNominatedPodUnlocked(logger, newPodInfo, nominatingInfo)
1483 }
1484 
1485 // NewPodNominator creates a nominator as the backing of framework.PodNominator.
1486 // A podLister is passed in to check whether the pod exists
1487 // before adding its nominatedNode info.
1488 func NewPodNominator(podLister listersv1.PodLister) framework.PodNominator {
1489 	return newPodNominator(podLister)
1490 }
1491 
1492 func newPodNominator(podLister listersv1.PodLister) *nominator {
1493 	return &nominator{
1494 		podLister:          podLister,
1495 		nominatedPods:      make(map[string][]*framework.PodInfo),
1496 		nominatedPodToNode: make(map[types.UID]string),
1497 	}
1498 }
1499 
1500 func podInfoKeyFunc(obj interface{}) (string, error) {
1501 	return cache.MetaNamespaceKeyFunc(obj.(*framework.QueuedPodInfo).Pod)
1502 }
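// An illustrative sketch (not part of the upstream file): wiring a nominator
// from a SharedInformerFactory's pod lister, mirroring how a caller might
// obtain the PodNominator backing a scheduling queue. The function name
// examplePodNominator is hypothetical.
func examplePodNominator(informerFactory informers.SharedInformerFactory) framework.PodNominator {
	podLister := informerFactory.Core().V1().Pods().Lister()
	return NewPodNominator(podLister)
}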