k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/scheduler/internal/queue/scheduling_queue.go

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  // This file contains structures that implement scheduling queue types.
    18  // Scheduling queues hold pods waiting to be scheduled. This file implements a
     19  // priority queue which has two sub queues and an additional data structure,
    20  // namely: activeQ, backoffQ and unschedulablePods.
    21  // - activeQ holds pods that are being considered for scheduling.
    22  // - backoffQ holds pods that moved from unschedulablePods and will move to
    23  //   activeQ when their backoff periods complete.
    24  // - unschedulablePods holds pods that were already attempted for scheduling and
    25  //   are currently determined to be unschedulable.
    26  
    27  package queue
    28  
    29  import (
    30  	"container/list"
    31  	"context"
    32  	"fmt"
    33  	"math/rand"
    34  	"reflect"
    35  	"sync"
    36  	"time"
    37  
    38  	v1 "k8s.io/api/core/v1"
    39  	"k8s.io/apimachinery/pkg/types"
    40  	"k8s.io/apimachinery/pkg/util/sets"
    41  	"k8s.io/apimachinery/pkg/util/wait"
    42  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    43  	"k8s.io/client-go/informers"
    44  	listersv1 "k8s.io/client-go/listers/core/v1"
    45  	"k8s.io/client-go/tools/cache"
    46  	"k8s.io/klog/v2"
    47  	"k8s.io/kubernetes/pkg/features"
    48  	"k8s.io/kubernetes/pkg/scheduler/framework"
    49  	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity"
    50  	"k8s.io/kubernetes/pkg/scheduler/internal/heap"
    51  	"k8s.io/kubernetes/pkg/scheduler/metrics"
    52  	"k8s.io/kubernetes/pkg/scheduler/util"
    53  	"k8s.io/utils/clock"
    54  )
    55  
    56  const (
    57  	// DefaultPodMaxInUnschedulablePodsDuration is the default value for the maximum
    58  	// time a pod can stay in unschedulablePods. If a pod stays in unschedulablePods
    59  	// for longer than this value, the pod will be moved from unschedulablePods to
     60  // backoffQ or activeQ. If the podMaxInUnschedulablePodsDuration option is
     61  // left unset, this default value (5min) will be used.
    62  	DefaultPodMaxInUnschedulablePodsDuration time.Duration = 5 * time.Minute
    63  	// Scheduling queue names
    64  	activeQ           = "Active"
    65  	backoffQ          = "Backoff"
    66  	unschedulablePods = "Unschedulable"
    67  
    68  	preEnqueue = "PreEnqueue"
    69  )
    70  
    71  const (
    72  	// DefaultPodInitialBackoffDuration is the default value for the initial backoff duration
    73  	// for unschedulable pods. To change the default podInitialBackoffDurationSeconds used by the
    74  	// scheduler, update the ComponentConfig value in defaults.go
    75  	DefaultPodInitialBackoffDuration time.Duration = 1 * time.Second
    76  	// DefaultPodMaxBackoffDuration is the default value for the max backoff duration
    77  	// for unschedulable pods. To change the default podMaxBackoffDurationSeconds used by the
    78  	// scheduler, update the ComponentConfig value in defaults.go
    79  	DefaultPodMaxBackoffDuration time.Duration = 10 * time.Second
    80  )
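
// The per-pod backoff grows exponentially with the number of scheduling
// attempts, starting at podInitialBackoffDuration and capped at
// podMaxBackoffDuration. A minimal sketch of that doubling, assuming the
// defaults above (the real computation lives in calculateBackoffDuration
// further down in this file):
//
//	backoff := initial                 // 1s on the first retry
//	for i := 1; i < attempts; i++ {
//		if backoff > max-backoff {     // doubling again would overshoot the cap
//			backoff = max              // capped at 10s
//			break
//		}
//		backoff += backoff             // 1s, 2s, 4s, 8s, 10s, 10s, ...
//	}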
    81  
     82  // PreEnqueueCheck is a function type. It's used to build functions that
     83  // run against a Pod, letting the caller choose whether to enqueue or skip
     84  // the Pod based on the check result.
    85  type PreEnqueueCheck func(pod *v1.Pod) bool
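
// A PreEnqueueCheck is typically passed to MoveAllToActiveOrBackoffQueue.
// An illustrative sketch that only requeues pods which either have no
// nominated node or nominated the node that triggered the event (nodeName
// here is a hypothetical variable from the caller):
//
//	check := PreEnqueueCheck(func(pod *v1.Pod) bool {
//		nnn := pod.Status.NominatedNodeName
//		return nnn == "" || nnn == nodeName
//	})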
    86  
    87  // SchedulingQueue is an interface for a queue to store pods waiting to be scheduled.
    88  // The interface follows a pattern similar to cache.FIFO and cache.Heap and
    89  // makes it easy to use those data structures as a SchedulingQueue.
    90  type SchedulingQueue interface {
    91  	framework.PodNominator
    92  	Add(logger klog.Logger, pod *v1.Pod) error
    93  	// Activate moves the given pods to activeQ iff they're in unschedulablePods or backoffQ.
    94  	// The passed-in pods are originally compiled from plugins that want to activate Pods,
    95  	// by injecting the pods through a reserved CycleState struct (PodsToActivate).
    96  	Activate(logger klog.Logger, pods map[string]*v1.Pod)
    97  	// AddUnschedulableIfNotPresent adds an unschedulable pod back to scheduling queue.
    98  	// The podSchedulingCycle represents the current scheduling cycle number which can be
    99  	// returned by calling SchedulingCycle().
   100  	AddUnschedulableIfNotPresent(logger klog.Logger, pod *framework.QueuedPodInfo, podSchedulingCycle int64) error
    101  	// SchedulingCycle returns the current scheduling cycle number, which is
    102  	// cached by the scheduling queue. Normally, incrementing this number whenever
    103  	// a pod is popped (e.g. via Pop()) is enough.
   104  	SchedulingCycle() int64
   105  	// Pop removes the head of the queue and returns it. It blocks if the
   106  	// queue is empty and waits until a new item is added to the queue.
   107  	Pop(logger klog.Logger) (*framework.QueuedPodInfo, error)
    108  	// Done must be called for a pod returned by Pop. This allows the queue to
   109  	// keep track of which pods are currently being processed.
   110  	Done(types.UID)
   111  	Update(logger klog.Logger, oldPod, newPod *v1.Pod) error
   112  	Delete(pod *v1.Pod) error
    113  	// TODO(sanposhiho): move all PreEnqueueCheck to Requeue and delete it from this parameter eventually.
    114  	// Some PreEnqueueChecks include event filtering logic based on some in-tree plugins
    115  	// and that affects other plugins badly.
   116  	// See https://github.com/kubernetes/kubernetes/issues/110175
   117  	MoveAllToActiveOrBackoffQueue(logger klog.Logger, event framework.ClusterEvent, oldObj, newObj interface{}, preCheck PreEnqueueCheck)
   118  	AssignedPodAdded(logger klog.Logger, pod *v1.Pod)
   119  	AssignedPodUpdated(logger klog.Logger, oldPod, newPod *v1.Pod)
   120  	PendingPods() ([]*v1.Pod, string)
   121  	PodsInActiveQ() []*v1.Pod
   122  	// Close closes the SchedulingQueue so that the goroutine which is
   123  	// waiting to pop items can exit gracefully.
   124  	Close()
   125  	// Run starts the goroutines managing the queue.
   126  	Run(logger klog.Logger)
   127  }
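
// The expected Pop/Done lifecycle, roughly as the scheduler's main loop
// drives it; a simplified sketch (q, logger and scheduleOne are assumed to
// exist in the caller):
//
//	podInfo, err := q.Pop(logger)
//	if err != nil || podInfo == nil { // podInfo is nil once the queue is closed
//		return
//	}
//	scheduleOne(podInfo)    // a failed attempt ends in AddUnschedulableIfNotPresent
//	q.Done(podInfo.Pod.UID) // mark the in-flight attempt as finished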
   128  
   129  // NewSchedulingQueue initializes a priority queue as a new scheduling queue.
   130  func NewSchedulingQueue(
   131  	lessFn framework.LessFunc,
   132  	informerFactory informers.SharedInformerFactory,
   133  	opts ...Option) SchedulingQueue {
   134  	return NewPriorityQueue(lessFn, informerFactory, opts...)
   135  }
   136  
   137  // NominatedNodeName returns nominated node name of a Pod.
   138  func NominatedNodeName(pod *v1.Pod) string {
   139  	return pod.Status.NominatedNodeName
   140  }
   141  
   142  // PriorityQueue implements a scheduling queue.
   143  // The head of PriorityQueue is the highest priority pending pod. This structure
    144  // has two sub queues and an additional data structure, namely: activeQ,
   145  // backoffQ and unschedulablePods.
   146  //   - activeQ holds pods that are being considered for scheduling.
   147  //   - backoffQ holds pods that moved from unschedulablePods and will move to
   148  //     activeQ when their backoff periods complete.
   149  //   - unschedulablePods holds pods that were already attempted for scheduling and
   150  //     are currently determined to be unschedulable.
   151  type PriorityQueue struct {
   152  	*nominator
   153  
   154  	stop  chan struct{}
   155  	clock clock.Clock
   156  
   157  	// pod initial backoff duration.
   158  	podInitialBackoffDuration time.Duration
   159  	// pod maximum backoff duration.
   160  	podMaxBackoffDuration time.Duration
   161  	// the maximum time a pod can stay in the unschedulablePods.
   162  	podMaxInUnschedulablePodsDuration time.Duration
   163  
   164  	cond sync.Cond
   165  
   166  	// inFlightPods holds the UID of all pods which have been popped out for which Done
   167  	// hasn't been called yet - in other words, all pods that are currently being
   168  	// processed (being scheduled, in permit, or in the binding cycle).
   169  	//
   170  	// The values in the map are the entry of each pod in the inFlightEvents list.
   171  	// The value of that entry is the *v1.Pod at the time that scheduling of that
   172  	// pod started, which can be useful for logging or debugging.
   173  	inFlightPods map[types.UID]*list.Element
   174  
   175  	// inFlightEvents holds the events received by the scheduling queue
   176  	// (entry value is clusterEvent) together with in-flight pods (entry
   177  	// value is *v1.Pod). Entries get added at the end while the mutex is
   178  	// locked, so they get serialized.
   179  	//
   180  	// The pod entries are added in Pop and used to track which events
    181  	// occurred after the scheduling attempt for that pod started.
   182  	// They get removed when the scheduling attempt is done, at which
   183  	// point all events that occurred in the meantime are processed.
   184  	//
   185  	// After removal of a pod, events at the start of the list are no
   186  	// longer needed because all of the other in-flight pods started
   187  	// later. Those events can be removed.
   188  	inFlightEvents *list.List
   189  
   190  	// activeQ is heap structure that scheduler actively looks at to find pods to
   191  	// schedule. Head of heap is the highest priority pod.
   192  	activeQ *heap.Heap
   193  	// podBackoffQ is a heap ordered by backoff expiry. Pods which have completed backoff
   194  	// are popped from this heap before the scheduler looks at activeQ
   195  	podBackoffQ *heap.Heap
   196  	// unschedulablePods holds pods that have been tried and determined unschedulable.
   197  	unschedulablePods *UnschedulablePods
   198  	// schedulingCycle represents sequence number of scheduling cycle and is incremented
   199  	// when a pod is popped.
   200  	schedulingCycle int64
   201  	// moveRequestCycle caches the sequence number of scheduling cycle when we
   202  	// received a move request. Unschedulable pods in and before this scheduling
   203  	// cycle will be put back to activeQueue if we were trying to schedule them
   204  	// when we received move request.
   205  	// TODO: this will be removed after SchedulingQueueHint goes to stable and the feature gate is removed.
   206  	moveRequestCycle int64
   207  
   208  	// preEnqueuePluginMap is keyed with profile name, valued with registered preEnqueue plugins.
   209  	preEnqueuePluginMap map[string][]framework.PreEnqueuePlugin
   210  	// queueingHintMap is keyed with profile name, valued with registered queueing hint functions.
   211  	queueingHintMap QueueingHintMapPerProfile
   212  
   213  	// closed indicates that the queue is closed.
   214  	// It is mainly used to let Pop() exit its control loop while waiting for an item.
   215  	closed bool
   216  
   217  	nsLister listersv1.NamespaceLister
   218  
   219  	metricsRecorder metrics.MetricAsyncRecorder
   220  	// pluginMetricsSamplePercent is the percentage of plugin metrics to be sampled.
   221  	pluginMetricsSamplePercent int
   222  
    223  	// isSchedulingQueueHintEnabled indicates whether the SchedulerQueueingHints feature gate is enabled.
   224  	isSchedulingQueueHintEnabled bool
   225  }
   226  
   227  // QueueingHintFunction is the wrapper of QueueingHintFn that has PluginName.
   228  type QueueingHintFunction struct {
   229  	PluginName     string
   230  	QueueingHintFn framework.QueueingHintFn
   231  }
   232  
   233  // clusterEvent has the event and involved objects.
   234  type clusterEvent struct {
   235  	event framework.ClusterEvent
    236  	// oldObj is the object involved in this event.
    237  	oldObj interface{}
    238  	// newObj is the object involved in this event.
   239  	newObj interface{}
   240  }
   241  
   242  type priorityQueueOptions struct {
   243  	clock                             clock.Clock
   244  	podInitialBackoffDuration         time.Duration
   245  	podMaxBackoffDuration             time.Duration
   246  	podMaxInUnschedulablePodsDuration time.Duration
   247  	podLister                         listersv1.PodLister
   248  	metricsRecorder                   metrics.MetricAsyncRecorder
   249  	pluginMetricsSamplePercent        int
   250  	preEnqueuePluginMap               map[string][]framework.PreEnqueuePlugin
   251  	queueingHintMap                   QueueingHintMapPerProfile
   252  }
   253  
   254  // Option configures a PriorityQueue
   255  type Option func(*priorityQueueOptions)
   256  
   257  // WithClock sets clock for PriorityQueue, the default clock is clock.RealClock.
   258  func WithClock(clock clock.Clock) Option {
   259  	return func(o *priorityQueueOptions) {
   260  		o.clock = clock
   261  	}
   262  }
   263  
   264  // WithPodInitialBackoffDuration sets pod initial backoff duration for PriorityQueue.
   265  func WithPodInitialBackoffDuration(duration time.Duration) Option {
   266  	return func(o *priorityQueueOptions) {
   267  		o.podInitialBackoffDuration = duration
   268  	}
   269  }
   270  
   271  // WithPodMaxBackoffDuration sets pod max backoff duration for PriorityQueue.
   272  func WithPodMaxBackoffDuration(duration time.Duration) Option {
   273  	return func(o *priorityQueueOptions) {
   274  		o.podMaxBackoffDuration = duration
   275  	}
   276  }
   277  
   278  // WithPodLister sets pod lister for PriorityQueue.
   279  func WithPodLister(pl listersv1.PodLister) Option {
   280  	return func(o *priorityQueueOptions) {
   281  		o.podLister = pl
   282  	}
   283  }
   284  
   285  // WithPodMaxInUnschedulablePodsDuration sets podMaxInUnschedulablePodsDuration for PriorityQueue.
   286  func WithPodMaxInUnschedulablePodsDuration(duration time.Duration) Option {
   287  	return func(o *priorityQueueOptions) {
   288  		o.podMaxInUnschedulablePodsDuration = duration
   289  	}
   290  }
   291  
   292  // QueueingHintMapPerProfile is keyed with profile name, valued with queueing hint map registered for the profile.
   293  type QueueingHintMapPerProfile map[string]QueueingHintMap
   294  
   295  // QueueingHintMap is keyed with ClusterEvent, valued with queueing hint functions registered for the event.
   296  type QueueingHintMap map[framework.ClusterEvent][]*QueueingHintFunction
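
// An illustrative sketch of wiring a queueing hint for one profile; the
// profile name, plugin name and exampleHintFn are hypothetical:
//
//	m := QueueingHintMapPerProfile{
//		"default-scheduler": QueueingHintMap{
//			{Resource: framework.Node, ActionType: framework.Add}: {
//				&QueueingHintFunction{
//					PluginName:     "ExamplePlugin",
//					QueueingHintFn: exampleHintFn, // a framework.QueueingHintFn
//				},
//			},
//		},
//	}
//	q := NewPriorityQueue(lessFn, informerFactory, WithQueueingHintMapPerProfile(m))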
   297  
   298  // WithQueueingHintMapPerProfile sets queueingHintMap for PriorityQueue.
   299  func WithQueueingHintMapPerProfile(m QueueingHintMapPerProfile) Option {
   300  	return func(o *priorityQueueOptions) {
   301  		o.queueingHintMap = m
   302  	}
   303  }
   304  
   305  // WithPreEnqueuePluginMap sets preEnqueuePluginMap for PriorityQueue.
   306  func WithPreEnqueuePluginMap(m map[string][]framework.PreEnqueuePlugin) Option {
   307  	return func(o *priorityQueueOptions) {
   308  		o.preEnqueuePluginMap = m
   309  	}
   310  }
   311  
   312  // WithMetricsRecorder sets metrics recorder.
   313  func WithMetricsRecorder(recorder metrics.MetricAsyncRecorder) Option {
   314  	return func(o *priorityQueueOptions) {
   315  		o.metricsRecorder = recorder
   316  	}
   317  }
   318  
   319  // WithPluginMetricsSamplePercent sets the percentage of plugin metrics to be sampled.
   320  func WithPluginMetricsSamplePercent(percent int) Option {
   321  	return func(o *priorityQueueOptions) {
   322  		o.pluginMetricsSamplePercent = percent
   323  	}
   324  }
   325  
   326  var defaultPriorityQueueOptions = priorityQueueOptions{
   327  	clock:                             clock.RealClock{},
   328  	podInitialBackoffDuration:         DefaultPodInitialBackoffDuration,
   329  	podMaxBackoffDuration:             DefaultPodMaxBackoffDuration,
   330  	podMaxInUnschedulablePodsDuration: DefaultPodMaxInUnschedulablePodsDuration,
   331  }
   332  
   333  // Making sure that PriorityQueue implements SchedulingQueue.
   334  var _ SchedulingQueue = &PriorityQueue{}
   335  
   336  // newQueuedPodInfoForLookup builds a QueuedPodInfo object for a lookup in the queue.
   337  func newQueuedPodInfoForLookup(pod *v1.Pod, plugins ...string) *framework.QueuedPodInfo {
   338  	// Since this is only used for a lookup in the queue, we only need to set the Pod,
   339  	// and so we avoid creating a full PodInfo, which is expensive to instantiate frequently.
   340  	return &framework.QueuedPodInfo{
   341  		PodInfo:              &framework.PodInfo{Pod: pod},
   342  		UnschedulablePlugins: sets.New(plugins...),
   343  	}
   344  }
   345  
   346  // NewPriorityQueue creates a PriorityQueue object.
   347  func NewPriorityQueue(
   348  	lessFn framework.LessFunc,
   349  	informerFactory informers.SharedInformerFactory,
   350  	opts ...Option,
   351  ) *PriorityQueue {
   352  	options := defaultPriorityQueueOptions
   353  	if options.podLister == nil {
   354  		options.podLister = informerFactory.Core().V1().Pods().Lister()
   355  	}
   356  	for _, opt := range opts {
   357  		opt(&options)
   358  	}
   359  
   360  	comp := func(podInfo1, podInfo2 interface{}) bool {
   361  		pInfo1 := podInfo1.(*framework.QueuedPodInfo)
   362  		pInfo2 := podInfo2.(*framework.QueuedPodInfo)
   363  		return lessFn(pInfo1, pInfo2)
   364  	}
   365  
   366  	pq := &PriorityQueue{
   367  		nominator:                         newPodNominator(options.podLister),
   368  		clock:                             options.clock,
   369  		stop:                              make(chan struct{}),
   370  		podInitialBackoffDuration:         options.podInitialBackoffDuration,
   371  		podMaxBackoffDuration:             options.podMaxBackoffDuration,
   372  		podMaxInUnschedulablePodsDuration: options.podMaxInUnschedulablePodsDuration,
   373  		activeQ:                           heap.NewWithRecorder(podInfoKeyFunc, comp, metrics.NewActivePodsRecorder()),
   374  		unschedulablePods:                 newUnschedulablePods(metrics.NewUnschedulablePodsRecorder(), metrics.NewGatedPodsRecorder()),
   375  		inFlightPods:                      make(map[types.UID]*list.Element),
   376  		inFlightEvents:                    list.New(),
   377  		preEnqueuePluginMap:               options.preEnqueuePluginMap,
   378  		queueingHintMap:                   options.queueingHintMap,
   379  		metricsRecorder:                   options.metricsRecorder,
   380  		pluginMetricsSamplePercent:        options.pluginMetricsSamplePercent,
   381  		moveRequestCycle:                  -1,
   382  		isSchedulingQueueHintEnabled:      utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints),
   383  	}
   384  	pq.cond.L = &pq.lock
   385  	pq.podBackoffQ = heap.NewWithRecorder(podInfoKeyFunc, pq.podsCompareBackoffCompleted, metrics.NewBackoffPodsRecorder())
   386  	pq.nsLister = informerFactory.Core().V1().Namespaces().Lister()
   387  
   388  	return pq
   389  }
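
// Constructing a queue with non-default knobs is just a matter of passing
// Options; an illustrative sketch (lessFn, informerFactory and logger are
// assumed to be defined by the caller):
//
//	q := NewPriorityQueue(lessFn, informerFactory,
//		WithPodInitialBackoffDuration(2*time.Second),
//		WithPodMaxBackoffDuration(30*time.Second),
//		WithPodMaxInUnschedulablePodsDuration(10*time.Minute),
//	)
//	q.Run(logger)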
   390  
    391  // Run starts the goroutines that periodically flush completed backoffs from podBackoffQ to activeQ and move stale pods out of unschedulablePods.
   392  func (p *PriorityQueue) Run(logger klog.Logger) {
   393  	go wait.Until(func() {
   394  		p.flushBackoffQCompleted(logger)
    395  	}, 1*time.Second, p.stop)
   396  	go wait.Until(func() {
   397  		p.flushUnschedulablePodsLeftover(logger)
   398  	}, 30*time.Second, p.stop)
   399  }
   400  
    401  // queueingStrategy indicates how the scheduling queue should enqueue the Pod from the unschedulable pod pool.
   402  type queueingStrategy int
   403  
   404  const (
   405  	// queueSkip indicates that the scheduling queue should skip requeuing the Pod to activeQ/backoffQ.
   406  	queueSkip queueingStrategy = iota
   407  	// queueAfterBackoff indicates that the scheduling queue should requeue the Pod after backoff is completed.
   408  	queueAfterBackoff
   409  	// queueImmediately indicates that the scheduling queue should skip backoff and requeue the Pod immediately to activeQ.
   410  	queueImmediately
   411  )
   412  
    413  // isEventOfInterest returns true if the event is of interest to some plugins.
   414  func (p *PriorityQueue) isEventOfInterest(logger klog.Logger, event framework.ClusterEvent) bool {
   415  	if event.IsWildCard() {
   416  		return true
   417  	}
   418  
   419  	for _, hintMap := range p.queueingHintMap {
   420  		for eventToMatch := range hintMap {
   421  			if eventToMatch.Match(event) {
    422  				// Some plugins are interested in this event.
   423  				return true
   424  			}
   425  		}
   426  	}
   427  
    428  	logger.V(6).Info("Received an event that no enabled plugin is interested in", "event", event)
   429  
   430  	return false
   431  }
   432  
    433  // isPodWorthRequeuing calls the QueueingHintFn of only the plugins registered in pInfo.UnschedulablePlugins and pInfo.PendingPlugins.
    434  //
    435  // If any of pInfo.PendingPlugins return Queue,
    436  // the scheduling queue is supposed to enqueue this Pod to activeQ, skipping backoffQ.
    437  // If any of pInfo.UnschedulablePlugins return Queue,
    438  // the scheduling queue is supposed to enqueue this Pod to activeQ/backoffQ depending on the remaining backoff time of the Pod.
    439  // If all QueueingHintFns return Skip, the scheduling queue enqueues the Pod back to the unschedulable Pod pool
    440  // because no plugin changes the scheduling result via the event.
   441  func (p *PriorityQueue) isPodWorthRequeuing(logger klog.Logger, pInfo *framework.QueuedPodInfo, event framework.ClusterEvent, oldObj, newObj interface{}) queueingStrategy {
   442  	rejectorPlugins := pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins)
   443  	if rejectorPlugins.Len() == 0 {
   444  		logger.V(6).Info("Worth requeuing because no failed plugins", "pod", klog.KObj(pInfo.Pod))
   445  		return queueAfterBackoff
   446  	}
   447  
   448  	if event.IsWildCard() {
    449  		// The wildcard event is a special one: someone wants to force all Pods to move to activeQ/backoffQ.
    450  		// We return queueAfterBackoff in this case, while resetting all blocked plugins.
   451  		logger.V(6).Info("Worth requeuing because the event is wildcard", "pod", klog.KObj(pInfo.Pod))
   452  		return queueAfterBackoff
   453  	}
   454  
   455  	hintMap, ok := p.queueingHintMap[pInfo.Pod.Spec.SchedulerName]
   456  	if !ok {
    457  		// shouldn't reach here unless there is a bug.
   458  		logger.Error(nil, "No QueueingHintMap is registered for this profile", "profile", pInfo.Pod.Spec.SchedulerName, "pod", klog.KObj(pInfo.Pod))
   459  		return queueAfterBackoff
   460  	}
   461  
   462  	pod := pInfo.Pod
   463  	queueStrategy := queueSkip
   464  	for eventToMatch, hintfns := range hintMap {
   465  		if !eventToMatch.Match(event) {
   466  			continue
   467  		}
   468  
   469  		for _, hintfn := range hintfns {
   470  			if !rejectorPlugins.Has(hintfn.PluginName) {
    471  				// skip if this hintfn is not from one of the rejector plugins.
   472  				continue
   473  			}
   474  
   475  			hint, err := hintfn.QueueingHintFn(logger, pod, oldObj, newObj)
   476  			if err != nil {
   477  				// If the QueueingHintFn returned an error, we should treat the event as Queue so that we can prevent
   478  				// the Pod from being stuck in the unschedulable pod pool.
   479  				oldObjMeta, newObjMeta, asErr := util.As[klog.KMetadata](oldObj, newObj)
   480  				if asErr != nil {
   481  					logger.Error(err, "QueueingHintFn returns error", "event", event, "plugin", hintfn.PluginName, "pod", klog.KObj(pod))
   482  				} else {
   483  					logger.Error(err, "QueueingHintFn returns error", "event", event, "plugin", hintfn.PluginName, "pod", klog.KObj(pod), "oldObj", klog.KObj(oldObjMeta), "newObj", klog.KObj(newObjMeta))
   484  				}
   485  				hint = framework.Queue
   486  			}
   487  			if hint == framework.QueueSkip {
   488  				continue
   489  			}
   490  
   491  			if pInfo.PendingPlugins.Has(hintfn.PluginName) {
   492  				// interprets Queue from the Pending plugin as queueImmediately.
   493  				// We can return immediately because queueImmediately is the highest priority.
   494  				return queueImmediately
   495  			}
   496  
   497  			// interprets Queue from the unschedulable plugin as queueAfterBackoff.
   498  
   499  			if pInfo.PendingPlugins.Len() == 0 {
    500  				// We can return immediately because this Pod has no Pending plugins registered (they are the only ones that can produce queueImmediately),
   501  				// and queueAfterBackoff is the second highest priority.
   502  				return queueAfterBackoff
   503  			}
   504  
   505  			// We can't return immediately because there are some Pending plugins registered in this Pod.
   506  			// We need to check if those plugins return Queue or not and if they do, we return queueImmediately.
   507  			queueStrategy = queueAfterBackoff
   508  		}
   509  	}
   510  
   511  	return queueStrategy
   512  }
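
// Each QueueingHintFn consulted above has the signature
// func(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error).
// An illustrative sketch for a Node/Add event; nodeCouldFit is a hypothetical
// plugin-specific helper:
//
//	func exampleHintFn(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
//		node, ok := newObj.(*v1.Node)
//		if !ok {
//			// Returning an error is treated as Queue by the caller above.
//			return framework.Queue, fmt.Errorf("unexpected object type %T", newObj)
//		}
//		if nodeCouldFit(pod, node) {
//			return framework.Queue, nil // worth another scheduling attempt
//		}
//		return framework.QueueSkip, nil // this event can't change the result
//	}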
   513  
    514  // runPreEnqueuePlugins iterates the PreEnqueue function of each registered PreEnqueuePlugin.
    515  // It returns true if all PreEnqueue functions run successfully; otherwise returns false
   516  // upon the first failure.
   517  // Note: we need to associate the failed plugin to `pInfo`, so that the pod can be moved back
   518  // to activeQ by related cluster event.
   519  func (p *PriorityQueue) runPreEnqueuePlugins(ctx context.Context, pInfo *framework.QueuedPodInfo) bool {
   520  	logger := klog.FromContext(ctx)
   521  	var s *framework.Status
   522  	pod := pInfo.Pod
   523  	startTime := p.clock.Now()
   524  	defer func() {
   525  		metrics.FrameworkExtensionPointDuration.WithLabelValues(preEnqueue, s.Code().String(), pod.Spec.SchedulerName).Observe(metrics.SinceInSeconds(startTime))
   526  	}()
   527  
   528  	shouldRecordMetric := rand.Intn(100) < p.pluginMetricsSamplePercent
   529  	for _, pl := range p.preEnqueuePluginMap[pod.Spec.SchedulerName] {
   530  		s = p.runPreEnqueuePlugin(ctx, pl, pod, shouldRecordMetric)
   531  		if s.IsSuccess() {
   532  			continue
   533  		}
   534  		pInfo.UnschedulablePlugins.Insert(pl.Name())
   535  		metrics.UnschedulableReason(pl.Name(), pod.Spec.SchedulerName).Inc()
   536  		if s.Code() == framework.Error {
   537  			logger.Error(s.AsError(), "Unexpected error running PreEnqueue plugin", "pod", klog.KObj(pod), "plugin", pl.Name())
   538  		} else {
   539  			logger.V(4).Info("Status after running PreEnqueue plugin", "pod", klog.KObj(pod), "plugin", pl.Name(), "status", s)
   540  		}
   541  		return false
   542  	}
   543  	return true
   544  }
   545  
   546  func (p *PriorityQueue) runPreEnqueuePlugin(ctx context.Context, pl framework.PreEnqueuePlugin, pod *v1.Pod, shouldRecordMetric bool) *framework.Status {
   547  	if !shouldRecordMetric {
   548  		return pl.PreEnqueue(ctx, pod)
   549  	}
   550  	startTime := p.clock.Now()
   551  	s := pl.PreEnqueue(ctx, pod)
   552  	p.metricsRecorder.ObservePluginDurationAsync(preEnqueue, pl.Name(), s.Code().String(), p.clock.Since(startTime).Seconds())
   553  	return s
   554  }
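
// A PreEnqueuePlugin implements a single gate check. An illustrative sketch
// (the in-tree SchedulingGates plugin works along these lines, but this exact
// type is hypothetical):
//
//	type gateChecker struct{}
//
//	func (g *gateChecker) Name() string { return "GateChecker" }
//
//	func (g *gateChecker) PreEnqueue(ctx context.Context, pod *v1.Pod) *framework.Status {
//		if len(pod.Spec.SchedulingGates) == 0 {
//			return nil // success: the Pod may enter activeQ
//		}
//		return framework.NewStatus(framework.UnschedulableAndUnresolvable, "pod has scheduling gates")
//	}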
   555  
    556  // addToActiveQ tries to add pod to active queue. It returns 2 values:
   557  // 1. a boolean flag to indicate whether the pod is added successfully.
   558  // 2. an error for the caller to act on.
   559  func (p *PriorityQueue) addToActiveQ(logger klog.Logger, pInfo *framework.QueuedPodInfo) (bool, error) {
   560  	pInfo.Gated = !p.runPreEnqueuePlugins(context.Background(), pInfo)
   561  	if pInfo.Gated {
   562  		// Add the Pod to unschedulablePods if it's not passing PreEnqueuePlugins.
   563  		p.unschedulablePods.addOrUpdate(pInfo)
   564  		return false, nil
   565  	}
   566  	if pInfo.InitialAttemptTimestamp == nil {
   567  		now := p.clock.Now()
   568  		pInfo.InitialAttemptTimestamp = &now
   569  	}
   570  	if err := p.activeQ.Add(pInfo); err != nil {
   571  		logger.Error(err, "Error adding pod to the active queue", "pod", klog.KObj(pInfo.Pod))
   572  		return false, err
   573  	}
   574  	return true, nil
   575  }
   576  
   577  // Add adds a pod to the active queue. It should be called only when a new pod
    578  // is added, so there is no chance the pod is already in the active/unschedulable/backoff queues.
   579  func (p *PriorityQueue) Add(logger klog.Logger, pod *v1.Pod) error {
   580  	p.lock.Lock()
   581  	defer p.lock.Unlock()
   582  
   583  	pInfo := p.newQueuedPodInfo(pod)
   584  	gated := pInfo.Gated
   585  	if added, err := p.addToActiveQ(logger, pInfo); !added {
   586  		return err
   587  	}
   588  	if p.unschedulablePods.get(pod) != nil {
   589  		logger.Error(nil, "Error: pod is already in the unschedulable queue", "pod", klog.KObj(pod))
   590  		p.unschedulablePods.delete(pod, gated)
   591  	}
   592  	// Delete pod from backoffQ if it is backing off
   593  	if err := p.podBackoffQ.Delete(pInfo); err == nil {
   594  		logger.Error(nil, "Error: pod is already in the podBackoff queue", "pod", klog.KObj(pod))
   595  	}
   596  	logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", PodAdd, "queue", activeQ)
   597  	metrics.SchedulerQueueIncomingPods.WithLabelValues("active", PodAdd).Inc()
   598  	p.addNominatedPodUnlocked(logger, pInfo.PodInfo, nil)
   599  	p.cond.Broadcast()
   600  
   601  	return nil
   602  }
   603  
   604  // Activate moves the given pods to activeQ iff they're in unschedulablePods or backoffQ.
   605  func (p *PriorityQueue) Activate(logger klog.Logger, pods map[string]*v1.Pod) {
   606  	p.lock.Lock()
   607  	defer p.lock.Unlock()
   608  
   609  	activated := false
   610  	for _, pod := range pods {
   611  		if p.activate(logger, pod) {
   612  			activated = true
   613  		}
   614  	}
   615  
   616  	if activated {
   617  		p.cond.Broadcast()
   618  	}
   619  }
   620  
   621  func (p *PriorityQueue) activate(logger klog.Logger, pod *v1.Pod) bool {
   622  	// Verify if the pod is present in activeQ.
   623  	if _, exists, _ := p.activeQ.Get(newQueuedPodInfoForLookup(pod)); exists {
   624  		// No need to activate if it's already present in activeQ.
   625  		return false
   626  	}
   627  	var pInfo *framework.QueuedPodInfo
   628  	// Verify if the pod is present in unschedulablePods or backoffQ.
   629  	if pInfo = p.unschedulablePods.get(pod); pInfo == nil {
   630  		// If the pod doesn't belong to unschedulablePods or backoffQ, don't activate it.
   631  		if obj, exists, _ := p.podBackoffQ.Get(newQueuedPodInfoForLookup(pod)); !exists {
   632  			logger.Error(nil, "To-activate pod does not exist in unschedulablePods or backoffQ", "pod", klog.KObj(pod))
   633  			return false
   634  		} else {
   635  			pInfo = obj.(*framework.QueuedPodInfo)
   636  		}
   637  	}
   638  
   639  	if pInfo == nil {
   640  		// Redundant safe check. We shouldn't reach here.
   641  		logger.Error(nil, "Internal error: cannot obtain pInfo")
   642  		return false
   643  	}
   644  
   645  	gated := pInfo.Gated
   646  	if added, _ := p.addToActiveQ(logger, pInfo); !added {
   647  		return false
   648  	}
   649  	p.unschedulablePods.delete(pInfo.Pod, gated)
   650  	p.podBackoffQ.Delete(pInfo)
   651  	metrics.SchedulerQueueIncomingPods.WithLabelValues("active", ForceActivate).Inc()
   652  	p.addNominatedPodUnlocked(logger, pInfo.PodInfo, nil)
   653  	return true
   654  }
   655  
   656  // isPodBackingoff returns true if a pod is still waiting for its backoff timer.
   657  // If this returns true, the pod should not be re-tried.
   658  func (p *PriorityQueue) isPodBackingoff(podInfo *framework.QueuedPodInfo) bool {
   659  	if podInfo.Gated {
   660  		return false
   661  	}
   662  	boTime := p.getBackoffTime(podInfo)
   663  	return boTime.After(p.clock.Now())
   664  }
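
// getBackoffTime (defined later in this file) derives the expiry from the
// pod's last-failure timestamp, conceptually:
//
//	boTime := podInfo.Timestamp.Add(p.calculateBackoffDuration(podInfo))
//
// For example, with the default durations a pod whose third attempt failed at
// t=0 is backing off until t=4s (1s doubled twice).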
   665  
   666  // SchedulingCycle returns current scheduling cycle.
   667  func (p *PriorityQueue) SchedulingCycle() int64 {
   668  	p.lock.RLock()
   669  	defer p.lock.RUnlock()
   670  	return p.schedulingCycle
   671  }
   672  
   673  // determineSchedulingHintForInFlightPod looks at the unschedulable plugins of the given Pod
    674  // and determines the scheduling hint for this Pod while checking the events that happened while it was in flight.
   675  func (p *PriorityQueue) determineSchedulingHintForInFlightPod(logger klog.Logger, pInfo *framework.QueuedPodInfo) queueingStrategy {
   676  	logger.V(5).Info("Checking events for in-flight pod", "pod", klog.KObj(pInfo.Pod), "unschedulablePlugins", pInfo.UnschedulablePlugins, "inFlightEventsSize", p.inFlightEvents.Len(), "inFlightPodsSize", len(p.inFlightPods))
   677  
   678  	// AddUnschedulableIfNotPresent is called with the Pod at the end of scheduling or binding.
    679  	// So, given that pInfo must have been Pop()ed before,
   680  	// we can assume pInfo must be recorded in inFlightPods and thus inFlightEvents.
   681  	inFlightPod, ok := p.inFlightPods[pInfo.Pod.UID]
   682  	if !ok {
   683  		// This can happen while updating a pod. In that case pInfo.UnschedulablePlugins should
   684  		// be empty. If it is not, we may have a problem.
   685  		if len(pInfo.UnschedulablePlugins) != 0 {
   686  			logger.Error(nil, "In flight Pod isn't found in the scheduling queue. If you see this error log, it's likely a bug in the scheduler.", "pod", klog.KObj(pInfo.Pod))
   687  			return queueAfterBackoff
   688  		}
   689  		if p.inFlightEvents.Len() > len(p.inFlightPods) {
   690  			return queueAfterBackoff
   691  		}
   692  		return queueSkip
   693  	}
   694  
   695  	rejectorPlugins := pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins)
   696  	if len(rejectorPlugins) == 0 {
   697  		// No failed plugins are associated with this Pod.
    698  		// Meaning something unusual (a temporary failure on kube-apiserver, etc) happened and this Pod gets moved back to the queue.
   699  		// In this case, we should retry scheduling it because this Pod may not be retried until the next flush.
   700  		return queueAfterBackoff
   701  	}
   702  
   703  	// check if there is an event that makes this Pod schedulable based on pInfo.UnschedulablePlugins.
   704  	queueingStrategy := queueSkip
   705  	for event := inFlightPod.Next(); event != nil; event = event.Next() {
   706  		e, ok := event.Value.(*clusterEvent)
   707  		if !ok {
   708  			// Must be another in-flight Pod (*v1.Pod). Can be ignored.
   709  			continue
   710  		}
   711  		logger.V(5).Info("Checking event for in-flight pod", "pod", klog.KObj(pInfo.Pod), "event", e.event.Label)
   712  
   713  		switch p.isPodWorthRequeuing(logger, pInfo, e.event, e.oldObj, e.newObj) {
   714  		case queueSkip:
   715  			continue
   716  		case queueImmediately:
   717  			// queueImmediately is the highest priority.
   718  			// No need to go through the rest of the events.
   719  			return queueImmediately
   720  		case queueAfterBackoff:
   721  			// replace schedulingHint with queueAfterBackoff
   722  			queueingStrategy = queueAfterBackoff
   723  			if pInfo.PendingPlugins.Len() == 0 {
    724  				// We can return immediately because this Pod has no Pending plugins registered (they are the only ones that can produce queueImmediately),
   725  				// and queueAfterBackoff is the second highest priority.
   726  				return queueAfterBackoff
   727  			}
   728  		}
   729  	}
   730  	return queueingStrategy
   731  }
   732  
    733  // addUnschedulableWithoutQueueingHint inserts a pod that cannot be scheduled into
   734  // the queue, unless it is already in the queue. Normally, PriorityQueue puts
   735  // unschedulable pods in `unschedulablePods`. But if there has been a recent move
   736  // request, then the pod is put in `podBackoffQ`.
   737  // TODO: This function is called only when p.isSchedulingQueueHintEnabled is false,
   738  // and this will be removed after SchedulingQueueHint goes to stable and the feature gate is removed.
   739  func (p *PriorityQueue) addUnschedulableWithoutQueueingHint(logger klog.Logger, pInfo *framework.QueuedPodInfo, podSchedulingCycle int64) error {
   740  	pod := pInfo.Pod
   741  	// Refresh the timestamp since the pod is re-added.
   742  	pInfo.Timestamp = p.clock.Now()
   743  
    744  	// When the queueing hint is enabled, unschedulable plugins and pending plugins are used differently.
    745  	// But when the queueing hint isn't enabled, we use all of them as UnschedulablePlugins so that we don't break the old behaviour.
   746  	rejectorPlugins := pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins)
   747  
   748  	// If a move request has been received, move it to the BackoffQ, otherwise move
   749  	// it to unschedulablePods.
   750  	for plugin := range rejectorPlugins {
   751  		metrics.UnschedulableReason(plugin, pInfo.Pod.Spec.SchedulerName).Inc()
   752  	}
   753  	if p.moveRequestCycle >= podSchedulingCycle || len(rejectorPlugins) == 0 {
   754  		// Two cases to move a Pod to the active/backoff queue:
    755  		// - The Pod is rejected by some plugins, but a move request is received after this Pod's scheduling cycle is started.
    756  		//   In this case, the received event may make the Pod schedulable and we should retry scheduling it.
   757  		// - No unschedulable plugins are associated with this Pod,
    758  		//   meaning something unusual (a temporary failure on kube-apiserver, etc) happened and this Pod gets moved back to the queue.
   759  		//   In this case, we should retry scheduling it because this Pod may not be retried until the next flush.
   760  		if err := p.podBackoffQ.Add(pInfo); err != nil {
   761  			return fmt.Errorf("error adding pod %v to the backoff queue: %v", klog.KObj(pod), err)
   762  		}
   763  		logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", ScheduleAttemptFailure, "queue", backoffQ)
   764  		metrics.SchedulerQueueIncomingPods.WithLabelValues("backoff", ScheduleAttemptFailure).Inc()
   765  	} else {
   766  		p.unschedulablePods.addOrUpdate(pInfo)
   767  		logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", ScheduleAttemptFailure, "queue", unschedulablePods)
   768  		metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", ScheduleAttemptFailure).Inc()
   769  	}
   770  
   771  	p.addNominatedPodUnlocked(logger, pInfo.PodInfo, nil)
   772  	return nil
   773  }
   774  
   775  // AddUnschedulableIfNotPresent inserts a pod that cannot be scheduled into
   776  // the queue, unless it is already in the queue. Normally, PriorityQueue puts
   777  // unschedulable pods in `unschedulablePods`. But if there has been a recent move
   778  // request, then the pod is put in `podBackoffQ`.
   779  func (p *PriorityQueue) AddUnschedulableIfNotPresent(logger klog.Logger, pInfo *framework.QueuedPodInfo, podSchedulingCycle int64) error {
   780  	p.lock.Lock()
   781  	defer p.lock.Unlock()
   782  
   783  	// In any case, this Pod will be moved back to the queue and we should call Done.
   784  	defer p.done(pInfo.Pod.UID)
   785  
   786  	pod := pInfo.Pod
   787  	if p.unschedulablePods.get(pod) != nil {
   788  		return fmt.Errorf("Pod %v is already present in unschedulable queue", klog.KObj(pod))
   789  	}
   790  
   791  	if _, exists, _ := p.activeQ.Get(pInfo); exists {
   792  		return fmt.Errorf("Pod %v is already present in the active queue", klog.KObj(pod))
   793  	}
   794  	if _, exists, _ := p.podBackoffQ.Get(pInfo); exists {
   795  		return fmt.Errorf("Pod %v is already present in the backoff queue", klog.KObj(pod))
   796  	}
   797  
   798  	if !p.isSchedulingQueueHintEnabled {
   799  		// fall back to the old behavior which doesn't depend on the queueing hint.
   800  		return p.addUnschedulableWithoutQueueingHint(logger, pInfo, podSchedulingCycle)
   801  	}
   802  
   803  	// Refresh the timestamp since the pod is re-added.
   804  	pInfo.Timestamp = p.clock.Now()
   805  
   806  	// If a move request has been received, move it to the BackoffQ, otherwise move
   807  	// it to unschedulablePods.
   808  	rejectorPlugins := pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins)
   809  	for plugin := range rejectorPlugins {
   810  		metrics.UnschedulableReason(plugin, pInfo.Pod.Spec.SchedulerName).Inc()
   811  	}
   812  
   813  	// We check whether this Pod may change its scheduling result by any of events that happened during scheduling.
   814  	schedulingHint := p.determineSchedulingHintForInFlightPod(logger, pInfo)
   815  
   816  	// In this case, we try to requeue this Pod to activeQ/backoffQ.
   817  	queue := p.requeuePodViaQueueingHint(logger, pInfo, schedulingHint, ScheduleAttemptFailure)
   818  	logger.V(3).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", ScheduleAttemptFailure, "queue", queue, "schedulingCycle", podSchedulingCycle, "hint", schedulingHint, "unschedulable plugins", rejectorPlugins)
   819  	if queue == activeQ {
   820  		// When the Pod is moved to activeQ, need to let p.cond know so that the Pod will be pop()ed out.
   821  		p.cond.Broadcast()
   822  	}
   823  
   824  	p.addNominatedPodUnlocked(logger, pInfo.PodInfo, nil)
   825  	return nil
   826  }
   827  
    828  // flushBackoffQCompleted moves all pods from backoffQ which have completed backoff into activeQ.
   829  func (p *PriorityQueue) flushBackoffQCompleted(logger klog.Logger) {
   830  	p.lock.Lock()
   831  	defer p.lock.Unlock()
   832  	activated := false
   833  	for {
   834  		rawPodInfo := p.podBackoffQ.Peek()
   835  		if rawPodInfo == nil {
   836  			break
   837  		}
   838  		pInfo := rawPodInfo.(*framework.QueuedPodInfo)
   839  		pod := pInfo.Pod
   840  		if p.isPodBackingoff(pInfo) {
   841  			break
   842  		}
   843  		_, err := p.podBackoffQ.Pop()
   844  		if err != nil {
   845  			logger.Error(err, "Unable to pop pod from backoff queue despite backoff completion", "pod", klog.KObj(pod))
   846  			break
   847  		}
   848  		if added, _ := p.addToActiveQ(logger, pInfo); added {
   849  			logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", BackoffComplete, "queue", activeQ)
   850  			metrics.SchedulerQueueIncomingPods.WithLabelValues("active", BackoffComplete).Inc()
   851  			activated = true
   852  		}
   853  	}
   854  
   855  	if activated {
   856  		p.cond.Broadcast()
   857  	}
   858  }
   859  
   860  // flushUnschedulablePodsLeftover moves pods which stay in unschedulablePods
   861  // longer than podMaxInUnschedulablePodsDuration to backoffQ or activeQ.
   862  func (p *PriorityQueue) flushUnschedulablePodsLeftover(logger klog.Logger) {
   863  	p.lock.Lock()
   864  	defer p.lock.Unlock()
   865  
   866  	var podsToMove []*framework.QueuedPodInfo
   867  	currentTime := p.clock.Now()
   868  	for _, pInfo := range p.unschedulablePods.podInfoMap {
   869  		lastScheduleTime := pInfo.Timestamp
   870  		if currentTime.Sub(lastScheduleTime) > p.podMaxInUnschedulablePodsDuration {
   871  			podsToMove = append(podsToMove, pInfo)
   872  		}
   873  	}
   874  
   875  	if len(podsToMove) > 0 {
   876  		p.movePodsToActiveOrBackoffQueue(logger, podsToMove, UnschedulableTimeout, nil, nil)
   877  	}
   878  }
   879  
   880  // Pop removes the head of the active queue and returns it. It blocks if the
   881  // activeQ is empty and waits until a new item is added to the queue. It
   882  // increments scheduling cycle when a pod is popped.
   883  func (p *PriorityQueue) Pop(logger klog.Logger) (*framework.QueuedPodInfo, error) {
   884  	p.lock.Lock()
   885  	defer p.lock.Unlock()
   886  	for p.activeQ.Len() == 0 {
   887  		// When the queue is empty, invocation of Pop() is blocked until new item is enqueued.
   888  		// When Close() is called, the p.closed is set and the condition is broadcast,
   889  		// which causes this loop to continue and return from the Pop().
   890  		if p.closed {
   891  			logger.V(2).Info("Scheduling queue is closed")
   892  			return nil, nil
   893  		}
   894  		p.cond.Wait()
   895  	}
   896  	obj, err := p.activeQ.Pop()
   897  	if err != nil {
   898  		return nil, err
   899  	}
   900  	pInfo := obj.(*framework.QueuedPodInfo)
   901  	pInfo.Attempts++
   902  	p.schedulingCycle++
   903  	// In flight, no concurrent events yet.
   904  	if p.isSchedulingQueueHintEnabled {
   905  		p.inFlightPods[pInfo.Pod.UID] = p.inFlightEvents.PushBack(pInfo.Pod)
   906  	}
   907  
   908  	// Update metrics and reset the set of unschedulable plugins for the next attempt.
   909  	for plugin := range pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins) {
   910  		metrics.UnschedulableReason(plugin, pInfo.Pod.Spec.SchedulerName).Dec()
   911  	}
   912  	pInfo.UnschedulablePlugins.Clear()
   913  	pInfo.PendingPlugins.Clear()
   914  
   915  	return pInfo, nil
   916  }
   917  
    918  // Done must be called for a pod returned by Pop. This allows the queue to
   919  // keep track of which pods are currently being processed.
   920  func (p *PriorityQueue) Done(pod types.UID) {
   921  	p.lock.Lock()
   922  	defer p.lock.Unlock()
   923  
   924  	p.done(pod)
   925  }
   926  
   927  func (p *PriorityQueue) done(pod types.UID) {
   928  	if !p.isSchedulingQueueHintEnabled {
   929  		// do nothing if schedulingQueueHint is disabled.
   930  		// In that case, we don't have inFlightPods and inFlightEvents.
   931  		return
   932  	}
   933  	inFlightPod, ok := p.inFlightPods[pod]
   934  	if !ok {
   935  		// This Pod is already done()ed.
   936  		return
   937  	}
   938  	delete(p.inFlightPods, pod)
   939  
   940  	// Remove the pod from the list.
   941  	p.inFlightEvents.Remove(inFlightPod)
   942  
   943  	// Remove events which are only referred to by this Pod
   944  	// so that the inFlightEvents list doesn't grow infinitely.
   945  	// If the pod was at the head of the list, then all
   946  	// events between it and the next pod are no longer needed
   947  	// and can be removed.
   948  	for {
   949  		e := p.inFlightEvents.Front()
   950  		if e == nil {
   951  			// Empty list.
   952  			break
   953  		}
   954  		if _, ok := e.Value.(*clusterEvent); !ok {
   955  			// A pod, must stop pruning.
   956  			break
   957  		}
   958  		p.inFlightEvents.Remove(e)
   959  	}
   960  }
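
// A worked example of the pruning above: with inFlightEvents equal to
// [podA, e1, podB, e2] (pods and cluster events interleaved in arrival
// order), done(podA) first unlinks podA, leaving [e1, podB, e2]. e1 is now at
// the front and no earlier in-flight pod can still need it, so it is pruned,
// giving [podB, e2]. Pruning stops at podB, because podB must still see e2
// when its own scheduling attempt finishes.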
   961  
   962  // isPodUpdated checks if the pod is updated in a way that it may have become
    963  // schedulable. It drops the status of the pod and compares it with the old version,
   964  // except for pod.status.resourceClaimStatuses: changing that may have an
   965  // effect on scheduling.
   966  func isPodUpdated(oldPod, newPod *v1.Pod) bool {
   967  	strip := func(pod *v1.Pod) *v1.Pod {
   968  		p := pod.DeepCopy()
   969  		p.ResourceVersion = ""
   970  		p.Generation = 0
   971  		p.Status = v1.PodStatus{
   972  			ResourceClaimStatuses: pod.Status.ResourceClaimStatuses,
   973  		}
   974  		p.ManagedFields = nil
   975  		p.Finalizers = nil
   976  		return p
   977  	}
   978  	return !reflect.DeepEqual(strip(oldPod), strip(newPod))
   979  }
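
// For example, a write that only bumps metadata is invisible here, while a
// label edit is not; a minimal sketch (pod is an assumed existing *v1.Pod):
//
//	oldPod := pod.DeepCopy()
//	newPod := pod.DeepCopy()
//	newPod.ResourceVersion = "12345"
//	_ = isPodUpdated(oldPod, newPod) // false: ResourceVersion is stripped before comparing
//	newPod.Labels = map[string]string{"tier": "web"}
//	_ = isPodUpdated(oldPod, newPod) // true: labels survive strip() and differ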
   980  
   981  // Update updates a pod in the active or backoff queue if present. Otherwise, it removes
   982  // the item from the unschedulable queue if pod is updated in a way that it may
   983  // become schedulable and adds the updated one to the active queue.
   984  // If pod is not present in any of the queues, it is added to the active queue.
   985  func (p *PriorityQueue) Update(logger klog.Logger, oldPod, newPod *v1.Pod) error {
   986  	p.lock.Lock()
   987  	defer p.lock.Unlock()
   988  
   989  	if oldPod != nil {
   990  		oldPodInfo := newQueuedPodInfoForLookup(oldPod)
   991  		// If the pod is already in the active queue, just update it there.
   992  		if oldPodInfo, exists, _ := p.activeQ.Get(oldPodInfo); exists {
   993  			pInfo := updatePod(oldPodInfo, newPod)
   994  			p.updateNominatedPodUnlocked(logger, oldPod, pInfo.PodInfo)
   995  			return p.activeQ.Update(pInfo)
   996  		}
   997  
   998  		// If the pod is in the backoff queue, update it there.
   999  		if oldPodInfo, exists, _ := p.podBackoffQ.Get(oldPodInfo); exists {
  1000  			pInfo := updatePod(oldPodInfo, newPod)
  1001  			p.updateNominatedPodUnlocked(logger, oldPod, pInfo.PodInfo)
  1002  			return p.podBackoffQ.Update(pInfo)
  1003  		}
  1004  	}
  1005  
  1006  	// If the pod is in the unschedulable queue, updating it may make it schedulable.
  1007  	if usPodInfo := p.unschedulablePods.get(newPod); usPodInfo != nil {
  1008  		pInfo := updatePod(usPodInfo, newPod)
  1009  		p.updateNominatedPodUnlocked(logger, oldPod, pInfo.PodInfo)
  1010  		if isPodUpdated(oldPod, newPod) {
  1011  			gated := usPodInfo.Gated
  1012  			if p.isPodBackingoff(usPodInfo) {
  1013  				if err := p.podBackoffQ.Add(pInfo); err != nil {
  1014  					return err
  1015  				}
  1016  				p.unschedulablePods.delete(usPodInfo.Pod, gated)
  1017  				logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", PodUpdate, "queue", backoffQ)
  1018  			} else {
  1019  				if added, err := p.addToActiveQ(logger, pInfo); !added {
  1020  					return err
  1021  				}
  1022  				p.unschedulablePods.delete(usPodInfo.Pod, gated)
  1023  				logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", BackoffComplete, "queue", activeQ)
  1024  				p.cond.Broadcast()
  1025  			}
  1026  		} else {
  1027  			// Pod update didn't make it schedulable, keep it in the unschedulable queue.
  1028  			p.unschedulablePods.addOrUpdate(pInfo)
  1029  		}
  1030  
  1031  		return nil
  1032  	}
  1033  	// If pod is not in any of the queues, we put it in the active queue.
  1034  	pInfo := p.newQueuedPodInfo(newPod)
  1035  	if added, err := p.addToActiveQ(logger, pInfo); !added {
  1036  		return err
  1037  	}
  1038  	p.addNominatedPodUnlocked(logger, pInfo.PodInfo, nil)
  1039  	logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", PodUpdate, "queue", activeQ)
  1040  	p.cond.Broadcast()
  1041  	return nil
  1042  }
  1043  
  1044  // Delete deletes the item from either of the two queues. It assumes the pod is
  1045  // only in one queue.
  1046  func (p *PriorityQueue) Delete(pod *v1.Pod) error {
  1047  	p.lock.Lock()
  1048  	defer p.lock.Unlock()
  1049  	p.deleteNominatedPodIfExistsUnlocked(pod)
  1050  	pInfo := newQueuedPodInfoForLookup(pod)
  1051  	if err := p.activeQ.Delete(pInfo); err != nil {
  1052  		// The item was probably not found in the activeQ.
  1053  		p.podBackoffQ.Delete(pInfo)
  1054  		if pInfo = p.unschedulablePods.get(pod); pInfo != nil {
  1055  			p.unschedulablePods.delete(pod, pInfo.Gated)
  1056  		}
  1057  	}
  1058  	return nil
  1059  }
  1060  
  1061  // AssignedPodAdded is called when a bound pod is added. Creation of this pod
  1062  // may make pending pods with matching affinity terms schedulable.
  1063  func (p *PriorityQueue) AssignedPodAdded(logger klog.Logger, pod *v1.Pod) {
  1064  	p.lock.Lock()
  1065  	p.movePodsToActiveOrBackoffQueue(logger, p.getUnschedulablePodsWithMatchingAffinityTerm(logger, pod), AssignedPodAdd, nil, pod)
  1066  	p.lock.Unlock()
  1067  }
  1068  
  1069  // isPodResourcesResizedDown returns true if a pod CPU and/or memory resize request has been
  1070  // admitted by kubelet, is 'InProgress', and results in a net sizing down of updated resources.
  1071  // It returns false if either CPU or memory resource is net resized up, or if no resize is in progress.
  1072  func isPodResourcesResizedDown(pod *v1.Pod) bool {
  1073  	if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
  1074  		// TODO(vinaykul,wangchen615,InPlacePodVerticalScaling): Fix this to determine when a
  1075  		// pod is truly resized down (might need oldPod if we cannot determine from Status alone)
  1076  		if pod.Status.Resize == v1.PodResizeStatusInProgress {
  1077  			return true
  1078  		}
  1079  	}
  1080  	return false
  1081  }
  1082  
  1083  // AssignedPodUpdated is called when a bound pod is updated. Change of labels
  1084  // may make pending pods with matching affinity terms schedulable.
  1085  func (p *PriorityQueue) AssignedPodUpdated(logger klog.Logger, oldPod, newPod *v1.Pod) {
  1086  	p.lock.Lock()
  1087  	if isPodResourcesResizedDown(newPod) {
  1088  		p.moveAllToActiveOrBackoffQueue(logger, AssignedPodUpdate, oldPod, newPod, nil)
  1089  	} else {
  1090  		p.movePodsToActiveOrBackoffQueue(logger, p.getUnschedulablePodsWithMatchingAffinityTerm(logger, newPod), AssignedPodUpdate, oldPod, newPod)
  1091  	}
  1092  	p.lock.Unlock()
  1093  }
  1094  
  1095  // NOTE: this function assumes a lock has been acquired in the caller.
  1096  // moveAllToActiveOrBackoffQueue moves all pods from unschedulablePods to activeQ or backoffQ.
  1097  // This function adds all pods and then signals the condition variable to ensure that
  1098  // if Pop() is waiting for an item, it receives the signal after all the pods are in the
  1099  // queue and the head is the highest priority pod.
  1100  func (p *PriorityQueue) moveAllToActiveOrBackoffQueue(logger klog.Logger, event framework.ClusterEvent, oldObj, newObj interface{}, preCheck PreEnqueueCheck) {
  1101  	if !p.isEventOfInterest(logger, event) {
  1102  		// No plugin is interested in this event.
  1103  		// Return early before iterating all pods in unschedulablePods for preCheck.
  1104  		return
  1105  	}
  1106  
  1107  	unschedulablePods := make([]*framework.QueuedPodInfo, 0, len(p.unschedulablePods.podInfoMap))
  1108  	for _, pInfo := range p.unschedulablePods.podInfoMap {
  1109  		if preCheck == nil || preCheck(pInfo.Pod) {
  1110  			unschedulablePods = append(unschedulablePods, pInfo)
  1111  		}
  1112  	}
  1113  	p.movePodsToActiveOrBackoffQueue(logger, unschedulablePods, event, oldObj, newObj)
  1114  }
  1115  
  1116  // MoveAllToActiveOrBackoffQueue moves all pods from unschedulablePods to activeQ or backoffQ.
  1117  // This function adds all pods and then signals the condition variable to ensure that
  1118  // if Pop() is waiting for an item, it receives the signal after all the pods are in the
  1119  // queue and the head is the highest priority pod.
  1120  func (p *PriorityQueue) MoveAllToActiveOrBackoffQueue(logger klog.Logger, event framework.ClusterEvent, oldObj, newObj interface{}, preCheck PreEnqueueCheck) {
  1121  	p.lock.Lock()
  1122  	defer p.lock.Unlock()
  1123  	p.moveAllToActiveOrBackoffQueue(logger, event, oldObj, newObj, preCheck)
  1124  }
  1125  
   1126  // requeuePodViaQueueingHint tries to requeue a Pod to activeQ, backoffQ or the unschedulable pod pool based on schedulingHint.
   1127  // It returns the name of the queue the Pod goes to.
  1128  //
  1129  // NOTE: this function assumes lock has been acquired in caller
  1130  func (p *PriorityQueue) requeuePodViaQueueingHint(logger klog.Logger, pInfo *framework.QueuedPodInfo, strategy queueingStrategy, event string) string {
  1131  	if strategy == queueSkip {
  1132  		p.unschedulablePods.addOrUpdate(pInfo)
  1133  		metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", event).Inc()
  1134  		return unschedulablePods
  1135  	}
  1136  
  1137  	pod := pInfo.Pod
  1138  	if strategy == queueAfterBackoff && p.isPodBackingoff(pInfo) {
  1139  		if err := p.podBackoffQ.Add(pInfo); err != nil {
  1140  			logger.Error(err, "Error adding pod to the backoff queue, queue this Pod to unschedulable pod pool", "pod", klog.KObj(pod))
  1141  			p.unschedulablePods.addOrUpdate(pInfo)
  1142  			return unschedulablePods
  1143  		}
  1144  
  1145  		metrics.SchedulerQueueIncomingPods.WithLabelValues("backoff", event).Inc()
  1146  		return backoffQ
  1147  	}
  1148  
   1149  	// We reach here if strategy is queueImmediately, or strategy is queueAfterBackoff but the pod is not backing off.
  1150  
  1151  	added, err := p.addToActiveQ(logger, pInfo)
  1152  	if err != nil {
  1153  		logger.Error(err, "Error adding pod to the active queue, queue this Pod to unschedulable pod pool", "pod", klog.KObj(pod))
  1154  	}
  1155  	if added {
  1156  		metrics.SchedulerQueueIncomingPods.WithLabelValues("active", event).Inc()
  1157  		return activeQ
  1158  	}
  1159  	if pInfo.Gated {
  1160  		// If the pod is gated, it was already pushed back to the unschedulable Pods pool in addToActiveQ.
  1161  		return unschedulablePods
  1162  	}
  1163  
  1164  	p.unschedulablePods.addOrUpdate(pInfo)
  1165  	metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", ScheduleAttemptFailure).Inc()
  1166  	return unschedulablePods
  1167  }
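
        // For illustration, the routing implemented above can be summarized as follows
        // (queueSkip, queueAfterBackoff and queueImmediately are the queueingStrategy
        // values defined in this package):
        //
        //	queueSkip                            -> unschedulablePods
        //	queueAfterBackoff, still backing off -> backoffQ (unschedulablePods on Add error)
        //	queueAfterBackoff, backoff finished  -> activeQ (unschedulablePods if not added)
        //	queueImmediately                     -> activeQ (unschedulablePods if not added)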
  1168  
  1169  // NOTE: this function assumes the lock has been acquired in the caller.
  1170  func (p *PriorityQueue) movePodsToActiveOrBackoffQueue(logger klog.Logger, podInfoList []*framework.QueuedPodInfo, event framework.ClusterEvent, oldObj, newObj interface{}) {
  1171  	if !p.isEventOfInterest(logger, event) {
  1172  		// No plugin is interested in this event.
  1173  		return
  1174  	}
  1175  
  1176  	activated := false
  1177  	for _, pInfo := range podInfoList {
  1178  		// Since there may be many gated pods and they will not move from the
  1179  		// unschedulable pool, we skip calling the expensive isPodWorthRequeuing.
  1180  		if pInfo.Gated {
  1181  			continue
  1182  		}
  1183  		schedulingHint := p.isPodWorthRequeuing(logger, pInfo, event, oldObj, newObj)
  1184  		if schedulingHint == queueSkip {
  1185  			// QueueingHintFn determined that this Pod isn't worth putting into activeQ or backoffQ for this event.
  1186  			logger.V(5).Info("Event is not making pod schedulable", "pod", klog.KObj(pInfo.Pod), "event", event.Label)
  1187  			continue
  1188  		}
  1189  
  1190  		p.unschedulablePods.delete(pInfo.Pod, pInfo.Gated)
  1191  		queue := p.requeuePodViaQueueingHint(logger, pInfo, schedulingHint, event.Label)
  1192  		logger.V(4).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", event.Label, "queue", queue, "hint", schedulingHint)
  1193  		if queue == activeQ {
  1194  			activated = true
  1195  		}
  1196  	}
  1197  
  1198  	p.moveRequestCycle = p.schedulingCycle
  1199  
  1200  	if p.isSchedulingQueueHintEnabled && len(p.inFlightPods) != 0 {
  1201  		logger.V(5).Info("Event received while pods are in flight", "event", event.Label, "numPods", len(p.inFlightPods))
  1202  		// AddUnschedulableIfNotPresent might get called for in-flight Pods later, and in
  1203  		// AddUnschedulableIfNotPresent we need to know whether events were
  1204  		// observed while scheduling them.
  1205  		p.inFlightEvents.PushBack(&clusterEvent{
  1206  			event:  event,
  1207  			oldObj: oldObj,
  1208  			newObj: newObj,
  1209  		})
  1210  	}
  1211  
  1212  	if activated {
  1213  		p.cond.Broadcast()
  1214  	}
  1215  }
  1216  
  1217  // getUnschedulablePodsWithMatchingAffinityTerm returns unschedulable pods which have
  1218  // any affinity term that matches "pod".
  1219  // NOTE: this function assumes the lock has been acquired in the caller.
  1220  func (p *PriorityQueue) getUnschedulablePodsWithMatchingAffinityTerm(logger klog.Logger, pod *v1.Pod) []*framework.QueuedPodInfo {
  1221  	nsLabels := interpodaffinity.GetNamespaceLabelsSnapshot(logger, pod.Namespace, p.nsLister)
  1222  
  1223  	var podsToMove []*framework.QueuedPodInfo
  1224  	for _, pInfo := range p.unschedulablePods.podInfoMap {
  1225  		for _, term := range pInfo.RequiredAffinityTerms {
  1226  			if term.Matches(pod, nsLabels) {
  1227  				podsToMove = append(podsToMove, pInfo)
  1228  				break
  1229  			}
  1230  		}
  1232  	}
  1233  	return podsToMove
  1234  }
  1235  
  1236  // PodsInActiveQ returns all the Pods in the activeQ.
  1237  // This function is only used in tests.
  1238  func (p *PriorityQueue) PodsInActiveQ() []*v1.Pod {
  1239  	p.lock.RLock()
  1240  	defer p.lock.RUnlock()
  1241  	var result []*v1.Pod
  1242  	for _, pInfo := range p.activeQ.List() {
  1243  		result = append(result, pInfo.(*framework.QueuedPodInfo).Pod)
  1244  	}
  1245  	return result
  1246  }
  1247  
  1248  const pendingPodsSummary = "activeQ:%v; backoffQ:%v; unschedulablePods:%v"
  1249  
  1250  // PendingPods returns all the pending pods in the queue, accompanied by a debugging
  1251  // string showing the number of pods in each queue.
  1252  // This function is used for debugging purposes in the scheduler cache dumper and comparer.
  1253  func (p *PriorityQueue) PendingPods() ([]*v1.Pod, string) {
  1254  	p.lock.RLock()
  1255  	defer p.lock.RUnlock()
  1256  	var result []*v1.Pod
  1257  	for _, pInfo := range p.activeQ.List() {
  1258  		result = append(result, pInfo.(*framework.QueuedPodInfo).Pod)
  1259  	}
  1260  	for _, pInfo := range p.podBackoffQ.List() {
  1261  		result = append(result, pInfo.(*framework.QueuedPodInfo).Pod)
  1262  	}
  1263  	for _, pInfo := range p.unschedulablePods.podInfoMap {
  1264  		result = append(result, pInfo.Pod)
  1265  	}
  1266  	return result, fmt.Sprintf(pendingPodsSummary, p.activeQ.Len(), p.podBackoffQ.Len(), len(p.unschedulablePods.podInfoMap))
  1267  }
  1268  
  1269  // Close closes the priority queue.
  1270  func (p *PriorityQueue) Close() {
  1271  	p.lock.Lock()
  1272  	defer p.lock.Unlock()
  1273  	close(p.stop)
  1274  	p.closed = true
  1275  	p.cond.Broadcast()
  1276  }
  1277  
  1278  // DeleteNominatedPodIfExists deletes <pod> from nominatedPods.
  1279  func (npm *nominator) DeleteNominatedPodIfExists(pod *v1.Pod) {
  1280  	npm.lock.Lock()
  1281  	npm.deleteNominatedPodIfExistsUnlocked(pod)
  1282  	npm.lock.Unlock()
  1283  }
  1284  
  1285  func (npm *nominator) deleteNominatedPodIfExistsUnlocked(pod *v1.Pod) {
  1286  	npm.delete(pod)
  1287  }
  1288  
  1289  // AddNominatedPod adds a pod to the nominated pods of the given node.
  1290  // This is called during the preemption process after a node is nominated to run
  1291  // the pod. We update the structure before sending a request to update the pod
  1292  // object to avoid races with the following scheduling cycles.
  1293  func (npm *nominator) AddNominatedPod(logger klog.Logger, pi *framework.PodInfo, nominatingInfo *framework.NominatingInfo) {
  1294  	npm.lock.Lock()
  1295  	npm.addNominatedPodUnlocked(logger, pi, nominatingInfo)
  1296  	npm.lock.Unlock()
  1297  }
  1298  
  1299  // NominatedPodsForNode returns a copy of pods that are nominated to run on the given node,
  1300  // but are waiting for other pods to be removed from the node.
  1301  func (npm *nominator) NominatedPodsForNode(nodeName string) []*framework.PodInfo {
  1302  	npm.lock.RLock()
  1303  	defer npm.lock.RUnlock()
  1304  	// Make a copy of the nominated Pods so the caller can mutate safely.
  1305  	pods := make([]*framework.PodInfo, len(npm.nominatedPods[nodeName]))
  1306  	for i := 0; i < len(pods); i++ {
  1307  		pods[i] = npm.nominatedPods[nodeName][i].DeepCopy()
  1308  	}
  1309  	return pods
  1310  }
  1311  
  1312  func (p *PriorityQueue) podsCompareBackoffCompleted(podInfo1, podInfo2 interface{}) bool {
  1313  	pInfo1 := podInfo1.(*framework.QueuedPodInfo)
  1314  	pInfo2 := podInfo2.(*framework.QueuedPodInfo)
  1315  	bo1 := p.getBackoffTime(pInfo1)
  1316  	bo2 := p.getBackoffTime(pInfo2)
  1317  	return bo1.Before(bo2)
  1318  }
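
        // podsCompareBackoffCompleted above serves as the ordering of the podBackoffQ heap,
        // so the pod whose backoff completes earliest sits at the head and is the first one
        // flushed back to activeQ. A minimal sketch (pInfoA and pInfoB are hypothetical
        // *framework.QueuedPodInfo values with equal Attempts, pInfoA enqueued one second
        // before pInfoB, so pInfoA's backoff window also ends one second earlier):
        //
        //	earlier := p.podsCompareBackoffCompleted(pInfoA, pInfoB) // true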
  1319  
  1320  // newQueuedPodInfo builds a QueuedPodInfo object.
  1321  func (p *PriorityQueue) newQueuedPodInfo(pod *v1.Pod, plugins ...string) *framework.QueuedPodInfo {
  1322  	now := p.clock.Now()
  1323  	// Ignore this error since the apiserver doesn't properly validate affinity terms
  1324  	// and we can't fix the validation for backwards compatibility.
  1325  	podInfo, _ := framework.NewPodInfo(pod)
  1326  	return &framework.QueuedPodInfo{
  1327  		PodInfo:                 podInfo,
  1328  		Timestamp:               now,
  1329  		InitialAttemptTimestamp: nil,
  1330  		UnschedulablePlugins:    sets.New(plugins...),
  1331  	}
  1332  }
  1333  
  1334  // getBackoffTime returns the time when podInfo completes backoff.
  1335  func (p *PriorityQueue) getBackoffTime(podInfo *framework.QueuedPodInfo) time.Time {
  1336  	duration := p.calculateBackoffDuration(podInfo)
  1337  	backoffTime := podInfo.Timestamp.Add(duration)
  1338  	return backoffTime
  1339  }
  1340  
  1341  // calculateBackoffDuration is a helper function for calculating the backoff duration
  1342  // based on the number of attempts the pod has made, capped at podMaxBackoffDuration.
  1343  func (p *PriorityQueue) calculateBackoffDuration(podInfo *framework.QueuedPodInfo) time.Duration {
  1344  	duration := p.podInitialBackoffDuration
  1345  	for i := 1; i < podInfo.Attempts; i++ {
  1346  		// Use subtraction instead of addition or multiplication to avoid overflow.
  1347  		if duration > p.podMaxBackoffDuration-duration {
  1348  			return p.podMaxBackoffDuration
  1349  		}
  1350  		duration += duration
  1351  	}
  1352  	return duration
  1353  }
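
        // As a worked example, assuming the package defaults of a 1s initial and a 10s max
        // backoff: attempt 1 -> 1s, attempt 2 -> 2s, attempt 3 -> 4s, attempt 4 -> 8s; from
        // attempt 5 on, doubling would exceed the cap, so the guard
        // `duration > p.podMaxBackoffDuration-duration` fires and 10s is returned. Writing
        // the check as a subtraction avoids overflowing time.Duration on repeated doubling.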
  1354  
  1355  func updatePod(oldPodInfo interface{}, newPod *v1.Pod) *framework.QueuedPodInfo {
  1356  	pInfo := oldPodInfo.(*framework.QueuedPodInfo)
  1357  	pInfo.Update(newPod)
  1358  	return pInfo
  1359  }
  1360  
  1361  // UnschedulablePods holds pods that cannot be scheduled. This data structure
  1362  // is used to implement unschedulablePods.
  1363  type UnschedulablePods struct {
  1364  	// podInfoMap is a map keyed by a pod's full-name and the value is a pointer to the QueuedPodInfo.
  1365  	podInfoMap map[string]*framework.QueuedPodInfo
  1366  	keyFunc    func(*v1.Pod) string
  1367  	// unschedulableRecorder/gatedRecorder updates the counter when elements of the map
  1368  	// get added or removed, and does nothing if it's nil.
  1369  	unschedulableRecorder, gatedRecorder metrics.MetricRecorder
  1370  }
  1371  
  1372  // addOrUpdate adds a pod to the unschedulable podInfoMap, or updates the stored entry if the pod already exists.
  1373  func (u *UnschedulablePods) addOrUpdate(pInfo *framework.QueuedPodInfo) {
  1374  	podID := u.keyFunc(pInfo.Pod)
  1375  	if _, exists := u.podInfoMap[podID]; !exists {
  1376  		if pInfo.Gated && u.gatedRecorder != nil {
  1377  			u.gatedRecorder.Inc()
  1378  		} else if !pInfo.Gated && u.unschedulableRecorder != nil {
  1379  			u.unschedulableRecorder.Inc()
  1380  		}
  1381  	}
  1382  	u.podInfoMap[podID] = pInfo
  1383  }
  1384  
  1385  // delete deletes a pod from the unschedulable podInfoMap.
  1386  // The `gated` parameter is used to figure out which metric should be decreased.
  1387  func (u *UnschedulablePods) delete(pod *v1.Pod, gated bool) {
  1388  	podID := u.keyFunc(pod)
  1389  	if _, exists := u.podInfoMap[podID]; exists {
  1390  		if gated && u.gatedRecorder != nil {
  1391  			u.gatedRecorder.Dec()
  1392  		} else if !gated && u.unschedulableRecorder != nil {
  1393  			u.unschedulableRecorder.Dec()
  1394  		}
  1395  	}
  1396  	delete(u.podInfoMap, podID)
  1397  }
  1398  
  1399  // get returns the QueuedPodInfo if a pod with the same key as the given "pod"
  1400  // is found in the map. It returns nil otherwise.
  1401  func (u *UnschedulablePods) get(pod *v1.Pod) *framework.QueuedPodInfo {
  1402  	podKey := u.keyFunc(pod)
  1403  	if pInfo, exists := u.podInfoMap[podKey]; exists {
  1404  		return pInfo
  1405  	}
  1406  	return nil
  1407  }
  1408  
  1409  // clear removes all the entries from the unschedulable podInfoMap.
  1410  func (u *UnschedulablePods) clear() {
  1411  	u.podInfoMap = make(map[string]*framework.QueuedPodInfo)
  1412  	if u.unschedulableRecorder != nil {
  1413  		u.unschedulableRecorder.Clear()
  1414  	}
  1415  	if u.gatedRecorder != nil {
  1416  		u.gatedRecorder.Clear()
  1417  	}
  1418  }
  1419  
  1420  // newUnschedulablePods initializes a new object of UnschedulablePods.
  1421  func newUnschedulablePods(unschedulableRecorder, gatedRecorder metrics.MetricRecorder) *UnschedulablePods {
  1422  	return &UnschedulablePods{
  1423  		podInfoMap:            make(map[string]*framework.QueuedPodInfo),
  1424  		keyFunc:               util.GetPodFullName,
  1425  		unschedulableRecorder: unschedulableRecorder,
  1426  		gatedRecorder:         gatedRecorder,
  1427  	}
  1428  }
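
        // A minimal usage sketch (the recorder constructors mirror how NewPriorityQueue wires
        // this up; nil recorders are also valid, e.g. in tests; pInfo stands for a hypothetical
        // *framework.QueuedPodInfo):
        //
        //	u := newUnschedulablePods(metrics.NewUnschedulablePodsRecorder(), metrics.NewGatedPodsRecorder())
        //	u.addOrUpdate(pInfo)             // first insert increments the matching gauge
        //	_ = u.get(pInfo.Pod)             // returns the stored *framework.QueuedPodInfo
        //	u.delete(pInfo.Pod, pInfo.Gated) // decrements the matching gauge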
  1429  
  1430  // nominator is a structure that stores pods nominated to run on nodes.
  1431  // It exists because the nominatedNodeName of pod objects stored in the structure
  1432  // may be different from what the scheduler has here. We should be able to find pods
  1433  // by their UID and update/delete them.
  1434  type nominator struct {
  1435  	// podLister is used to verify if the given pod is alive.
  1436  	podLister listersv1.PodLister
  1437  	// nominatedPods is a map keyed by a node name and the value is a list of
  1438  	// pods which are nominated to run on the node. These are pods which can be in
  1439  	// the activeQ or unschedulablePods.
  1440  	nominatedPods map[string][]*framework.PodInfo
  1441  	// nominatedPodToNode is a map keyed by a Pod's UID, and the value is the name of
  1442  	// the node where it is nominated.
  1443  	nominatedPodToNode map[types.UID]string
  1444  
  1445  	lock sync.RWMutex
  1446  }
  1447  
  1448  func (npm *nominator) addNominatedPodUnlocked(logger klog.Logger, pi *framework.PodInfo, nominatingInfo *framework.NominatingInfo) {
  1449  	// Always delete the pod if it already exists, to ensure we never store more than
  1450  	// one instance of the pod.
  1451  	npm.delete(pi.Pod)
  1452  
  1453  	var nodeName string
  1454  	if nominatingInfo.Mode() == framework.ModeOverride {
  1455  		nodeName = nominatingInfo.NominatedNodeName
  1456  	} else if nominatingInfo.Mode() == framework.ModeNoop {
  1457  		if pi.Pod.Status.NominatedNodeName == "" {
  1458  			return
  1459  		}
  1460  		nodeName = pi.Pod.Status.NominatedNodeName
  1461  	}
  1462  
  1463  	if npm.podLister != nil {
  1464  		// If the pod was removed or if it was already scheduled, don't nominate it.
  1465  		updatedPod, err := npm.podLister.Pods(pi.Pod.Namespace).Get(pi.Pod.Name)
  1466  		if err != nil {
  1467  			logger.V(4).Info("Pod doesn't exist in podLister, aborted adding it to the nominator", "pod", klog.KObj(pi.Pod))
  1468  			return
  1469  		}
  1470  		if updatedPod.Spec.NodeName != "" {
  1471  			logger.V(4).Info("Pod is already scheduled to a node, aborted adding it to the nominator", "pod", klog.KObj(pi.Pod), "node", updatedPod.Spec.NodeName)
  1472  			return
  1473  		}
  1474  	}
  1475  
  1476  	npm.nominatedPodToNode[pi.Pod.UID] = nodeName
  1477  	for _, npi := range npm.nominatedPods[nodeName] {
  1478  		if npi.Pod.UID == pi.Pod.UID {
  1479  			logger.V(4).Info("Pod already exists in the nominator", "pod", klog.KObj(npi.Pod))
  1480  			return
  1481  		}
  1482  	}
  1483  	npm.nominatedPods[nodeName] = append(npm.nominatedPods[nodeName], pi)
  1484  }
  1485  
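        // delete removes the pod from the nominator: it drops the pod from the per-node list
        // (removing the node's entry entirely once its list becomes empty) and from the
        // UID-to-node index. It is a no-op if the pod was never nominated.
        // NOTE: this function assumes the lock has been acquired in the caller.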
  1486  func (npm *nominator) delete(p *v1.Pod) {
  1487  	nnn, ok := npm.nominatedPodToNode[p.UID]
  1488  	if !ok {
  1489  		return
  1490  	}
  1491  	for i, np := range npm.nominatedPods[nnn] {
  1492  		if np.Pod.UID == p.UID {
  1493  			npm.nominatedPods[nnn] = append(npm.nominatedPods[nnn][:i], npm.nominatedPods[nnn][i+1:]...)
  1494  			if len(npm.nominatedPods[nnn]) == 0 {
  1495  				delete(npm.nominatedPods, nnn)
  1496  			}
  1497  			break
  1498  		}
  1499  	}
  1500  	delete(npm.nominatedPodToNode, p.UID)
  1501  }
  1502  
  1503  // UpdateNominatedPod updates the <oldPod> with <newPodInfo>.
  1504  func (npm *nominator) UpdateNominatedPod(logger klog.Logger, oldPod *v1.Pod, newPodInfo *framework.PodInfo) {
  1505  	npm.lock.Lock()
  1506  	defer npm.lock.Unlock()
  1507  	npm.updateNominatedPodUnlocked(logger, oldPod, newPodInfo)
  1508  }
  1509  
  1510  func (npm *nominator) updateNominatedPodUnlocked(logger klog.Logger, oldPod *v1.Pod, newPodInfo *framework.PodInfo) {
  1511  	// In some cases, an Update event with no "NominatedNode" present is received right
  1512  	// after a node ("NominatedNode") is reserved for this pod in memory.
  1513  	// In this case, we need to keep reserving the NominatedNode when updating the pod pointer.
  1514  	var nominatingInfo *framework.NominatingInfo
  1515  	// We won't fall into the `if` block below if the Update event represents:
  1516  	// (1) NominatedNode info is added
  1517  	// (2) NominatedNode info is updated
  1518  	// (3) NominatedNode info is removed
  1519  	if NominatedNodeName(oldPod) == "" && NominatedNodeName(newPodInfo.Pod) == "" {
  1520  		if nnn, ok := npm.nominatedPodToNode[oldPod.UID]; ok {
  1521  			// This is the only case we should continue reserving the NominatedNode
  1522  			nominatingInfo = &framework.NominatingInfo{
  1523  				NominatingMode:    framework.ModeOverride,
  1524  				NominatedNodeName: nnn,
  1525  			}
  1526  		}
  1527  	}
  1528  	// We update irrespective of whether the nominatedNodeName changed or not, to ensure
  1529  	// that the pod pointer is updated.
  1530  	npm.delete(oldPod)
  1531  	npm.addNominatedPodUnlocked(logger, newPodInfo, nominatingInfo)
  1532  }
  1533  
  1534  // NewPodNominator creates a nominator as a backing of framework.PodNominator.
  1535  // A podLister is passed in to check whether the pod exists
  1536  // before adding its nominatedNode info.
  1537  func NewPodNominator(podLister listersv1.PodLister) framework.PodNominator {
  1538  	return newPodNominator(podLister)
  1539  }
  1540  
  1541  func newPodNominator(podLister listersv1.PodLister) *nominator {
  1542  	return &nominator{
  1543  		podLister:          podLister,
  1544  		nominatedPods:      make(map[string][]*framework.PodInfo),
  1545  		nominatedPodToNode: make(map[types.UID]string),
  1546  	}
  1547  }
  1548  
  1549  func podInfoKeyFunc(obj interface{}) (string, error) {
  1550  	return cache.MetaNamespaceKeyFunc(obj.(*framework.QueuedPodInfo).Pod)
  1551  }