k8s.io/kubernetes@v1.29.3/pkg/scheduler/internal/queue/scheduling_queue.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// This file contains structures that implement scheduling queue types.
// Scheduling queues hold pods waiting to be scheduled. This file implements a
// priority queue which has two sub-queues and an additional data structure,
// namely: activeQ, backoffQ and unschedulablePods.
//   - activeQ holds pods that are being considered for scheduling.
//   - backoffQ holds pods that moved from unschedulablePods and will move to
//     activeQ when their backoff periods complete.
//   - unschedulablePods holds pods that were already attempted for scheduling and
//     are currently determined to be unschedulable.

package queue

import (
	"container/list"
	"context"
	"fmt"
	"math/rand"
	"reflect"
	"sync"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/sets"
	"k8s.io/apimachinery/pkg/util/wait"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	"k8s.io/client-go/informers"
	listersv1 "k8s.io/client-go/listers/core/v1"
	"k8s.io/client-go/tools/cache"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/features"
	"k8s.io/kubernetes/pkg/scheduler/framework"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/interpodaffinity"
	"k8s.io/kubernetes/pkg/scheduler/internal/heap"
	"k8s.io/kubernetes/pkg/scheduler/metrics"
	"k8s.io/kubernetes/pkg/scheduler/util"
	"k8s.io/utils/clock"
)

const (
	// DefaultPodMaxInUnschedulablePodsDuration is the default value for the maximum
	// time a pod can stay in unschedulablePods. If a pod stays in unschedulablePods
	// for longer than this value, the pod will be moved from unschedulablePods to
	// backoffQ or activeQ. If this value is empty, the default value (5min)
	// will be used.
	DefaultPodMaxInUnschedulablePodsDuration time.Duration = 5 * time.Minute
	// Scheduling queue names
	activeQ           = "Active"
	backoffQ          = "Backoff"
	unschedulablePods = "Unschedulable"

	preEnqueue = "PreEnqueue"
)

const (
	// DefaultPodInitialBackoffDuration is the default value for the initial backoff duration
	// for unschedulable pods. To change the default podInitialBackoffDurationSeconds used by the
	// scheduler, update the ComponentConfig value in defaults.go
	DefaultPodInitialBackoffDuration time.Duration = 1 * time.Second
	// DefaultPodMaxBackoffDuration is the default value for the max backoff duration
	// for unschedulable pods. To change the default podMaxBackoffDurationSeconds used by the
	// scheduler, update the ComponentConfig value in defaults.go
	DefaultPodMaxBackoffDuration time.Duration = 10 * time.Second
)
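// A worked sketch (added for illustration; not part of the original file) of
// how these two defaults interact with the doubling logic in
// calculateBackoffDuration further below: an unschedulable pod's backoff
// grows per attempt as
//
//	attempt 1: 1s
//	attempt 2: 2s
//	attempt 3: 4s
//	attempt 4: 8s
//	attempt 5+: capped at DefaultPodMaxBackoffDuration (10s)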
// PreEnqueueCheck is a function type. It's used to build functions that
// run against a Pod; based on the result of the check, the caller can choose
// to enqueue or skip the Pod.
type PreEnqueueCheck func(pod *v1.Pod) bool

// SchedulingQueue is an interface for a queue to store pods waiting to be scheduled.
// The interface follows a pattern similar to cache.FIFO and cache.Heap and
// makes it easy to use those data structures as a SchedulingQueue.
type SchedulingQueue interface {
	framework.PodNominator
	Add(logger klog.Logger, pod *v1.Pod) error
	// Activate moves the given pods to activeQ iff they're in unschedulablePods or backoffQ.
	// The passed-in pods are originally compiled from plugins that want to activate Pods,
	// by injecting the pods through a reserved CycleState struct (PodsToActivate).
	Activate(logger klog.Logger, pods map[string]*v1.Pod)
	// AddUnschedulableIfNotPresent adds an unschedulable pod back to the scheduling queue.
	// The podSchedulingCycle represents the current scheduling cycle number, which can be
	// returned by calling SchedulingCycle().
	AddUnschedulableIfNotPresent(logger klog.Logger, pod *framework.QueuedPodInfo, podSchedulingCycle int64) error
	// SchedulingCycle returns the current number of the scheduling cycle, which is
	// cached by the scheduling queue. Normally, incrementing this number whenever
	// a pod is popped (e.g. via Pop()) is enough.
	SchedulingCycle() int64
	// Pop removes the head of the queue and returns it. It blocks if the
	// queue is empty and waits until a new item is added to the queue.
	Pop(logger klog.Logger) (*framework.QueuedPodInfo, error)
	// Done must be called for a pod returned by Pop. This allows the queue to
	// keep track of which pods are currently being processed.
	Done(types.UID)
	Update(logger klog.Logger, oldPod, newPod *v1.Pod) error
	Delete(pod *v1.Pod) error
	// TODO(sanposhiho): move all PreEnqueueCheck to Requeue and delete it from this parameter eventually.
	// Some PreEnqueueChecks include event filtering logic based on some in-tree plugins,
	// which badly affects other plugins.
	// See https://github.com/kubernetes/kubernetes/issues/110175
	MoveAllToActiveOrBackoffQueue(logger klog.Logger, event framework.ClusterEvent, oldObj, newObj interface{}, preCheck PreEnqueueCheck)
	AssignedPodAdded(logger klog.Logger, pod *v1.Pod)
	AssignedPodUpdated(logger klog.Logger, oldPod, newPod *v1.Pod)
	PendingPods() ([]*v1.Pod, string)
	// Close closes the SchedulingQueue so that the goroutine which is
	// waiting to pop items can exit gracefully.
	Close()
	// Run starts the goroutines managing the queue.
	Run(logger klog.Logger)
}

// NewSchedulingQueue initializes a priority queue as a new scheduling queue.
func NewSchedulingQueue(
	lessFn framework.LessFunc,
	informerFactory informers.SharedInformerFactory,
	opts ...Option) SchedulingQueue {
	return NewPriorityQueue(lessFn, informerFactory, opts...)
}

// NominatedNodeName returns the nominated node name of a Pod.
func NominatedNodeName(pod *v1.Pod) string {
	return pod.Status.NominatedNodeName
}
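// A minimal sketch of a PreEnqueueCheck (illustrative; the predicate is an
// assumption, not taken from the original file). A caller of
// MoveAllToActiveOrBackoffQueue could pass this to re-enqueue only pods that
// express node affinity:
//
//	var hasNodeAffinity PreEnqueueCheck = func(pod *v1.Pod) bool {
//		return pod.Spec.Affinity != nil && pod.Spec.Affinity.NodeAffinity != nil
//	}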
// PriorityQueue implements a scheduling queue.
// The head of PriorityQueue is the highest priority pending pod. This structure
// has two sub-queues and an additional data structure, namely: activeQ,
// backoffQ and unschedulablePods.
//   - activeQ holds pods that are being considered for scheduling.
//   - backoffQ holds pods that moved from unschedulablePods and will move to
//     activeQ when their backoff periods complete.
//   - unschedulablePods holds pods that were already attempted for scheduling and
//     are currently determined to be unschedulable.
type PriorityQueue struct {
	*nominator

	stop  chan struct{}
	clock clock.Clock

	// pod initial backoff duration.
	podInitialBackoffDuration time.Duration
	// pod maximum backoff duration.
	podMaxBackoffDuration time.Duration
	// the maximum time a pod can stay in the unschedulablePods.
	podMaxInUnschedulablePodsDuration time.Duration

	cond sync.Cond

	// inFlightPods holds the UID of all pods which have been popped out for which Done
	// hasn't been called yet - in other words, all pods that are currently being
	// processed (being scheduled, in permit, or in the binding cycle).
	//
	// The values in the map are the entry of each pod in the inFlightEvents list.
	// The value of that entry is the *v1.Pod at the time that scheduling of that
	// pod started, which can be useful for logging or debugging.
	inFlightPods map[types.UID]*list.Element

	// inFlightEvents holds the events received by the scheduling queue
	// (entry value is clusterEvent) together with in-flight pods (entry
	// value is *v1.Pod). Entries get added at the end while the mutex is
	// locked, so they get serialized.
	//
	// The pod entries are added in Pop and used to track which events
	// occurred after the pod scheduling attempt for that pod started.
	// They get removed when the scheduling attempt is done, at which
	// point all events that occurred in the meantime are processed.
	//
	// After removal of a pod, events at the start of the list are no
	// longer needed because all of the other in-flight pods started
	// later. Those events can be removed.
	inFlightEvents *list.List

	// activeQ is a heap structure that the scheduler actively looks at to find pods to
	// schedule. The head of the heap is the highest priority pod.
	activeQ *heap.Heap
	// podBackoffQ is a heap ordered by backoff expiry. Pods which have completed backoff
	// are popped from this heap before the scheduler looks at activeQ.
	podBackoffQ *heap.Heap
	// unschedulablePods holds pods that have been tried and determined unschedulable.
	unschedulablePods *UnschedulablePods
	// schedulingCycle represents the sequence number of the scheduling cycle and is incremented
	// when a pod is popped.
	schedulingCycle int64
	// moveRequestCycle caches the sequence number of the scheduling cycle when we
	// received a move request. Unschedulable pods in and before this scheduling
	// cycle will be put back to activeQueue if we were trying to schedule them
	// when we received the move request.
	// TODO: this will be removed after SchedulingQueueHint goes to stable and the feature gate is removed.
	moveRequestCycle int64

	// preEnqueuePluginMap is keyed with profile name, valued with registered preEnqueue plugins.
	preEnqueuePluginMap map[string][]framework.PreEnqueuePlugin
	// queueingHintMap is keyed with profile name, valued with registered queueing hint functions.
	queueingHintMap QueueingHintMapPerProfile

	// closed indicates that the queue is closed.
	// It is mainly used to let Pop() exit its control loop while waiting for an item.
	closed bool

	nsLister listersv1.NamespaceLister

	metricsRecorder metrics.MetricAsyncRecorder
	// pluginMetricsSamplePercent is the percentage of plugin metrics to be sampled.
	pluginMetricsSamplePercent int

	// isSchedulingQueueHintEnabled indicates whether the feature gate for the scheduling queue is enabled.
	isSchedulingQueueHintEnabled bool
}

// QueueingHintFunction is the wrapper of QueueingHintFn that has PluginName.
type QueueingHintFunction struct {
	PluginName     string
	QueueingHintFn framework.QueueingHintFn
}

// clusterEvent has the event and involved objects.
type clusterEvent struct {
	event framework.ClusterEvent
	// oldObj is the object that involved this event.
	oldObj interface{}
	// newObj is the object that involved this event.
	newObj interface{}
}

type priorityQueueOptions struct {
	clock                             clock.Clock
	podInitialBackoffDuration         time.Duration
	podMaxBackoffDuration             time.Duration
	podMaxInUnschedulablePodsDuration time.Duration
	podLister                         listersv1.PodLister
	metricsRecorder                   metrics.MetricAsyncRecorder
	pluginMetricsSamplePercent        int
	preEnqueuePluginMap               map[string][]framework.PreEnqueuePlugin
	queueingHintMap                   QueueingHintMapPerProfile
}

// Option configures a PriorityQueue
type Option func(*priorityQueueOptions)

// WithClock sets the clock for PriorityQueue; the default clock is clock.RealClock.
func WithClock(clock clock.Clock) Option {
	return func(o *priorityQueueOptions) {
		o.clock = clock
	}
}

// WithPodInitialBackoffDuration sets the pod initial backoff duration for PriorityQueue.
func WithPodInitialBackoffDuration(duration time.Duration) Option {
	return func(o *priorityQueueOptions) {
		o.podInitialBackoffDuration = duration
	}
}

// WithPodMaxBackoffDuration sets the pod max backoff duration for PriorityQueue.
func WithPodMaxBackoffDuration(duration time.Duration) Option {
	return func(o *priorityQueueOptions) {
		o.podMaxBackoffDuration = duration
	}
}

// WithPodLister sets the pod lister for PriorityQueue.
func WithPodLister(pl listersv1.PodLister) Option {
	return func(o *priorityQueueOptions) {
		o.podLister = pl
	}
}

// WithPodMaxInUnschedulablePodsDuration sets podMaxInUnschedulablePodsDuration for PriorityQueue.
func WithPodMaxInUnschedulablePodsDuration(duration time.Duration) Option {
	return func(o *priorityQueueOptions) {
		o.podMaxInUnschedulablePodsDuration = duration
	}
}

// QueueingHintMapPerProfile is keyed with profile name, valued with queueing hint map registered for the profile.
type QueueingHintMapPerProfile map[string]QueueingHintMap

// QueueingHintMap is keyed with ClusterEvent, valued with queueing hint functions registered for the event.
type QueueingHintMap map[framework.ClusterEvent][]*QueueingHintFunction

// WithQueueingHintMapPerProfile sets queueingHintMap for PriorityQueue.
func WithQueueingHintMapPerProfile(m QueueingHintMapPerProfile) Option {
	return func(o *priorityQueueOptions) {
		o.queueingHintMap = m
	}
}

// WithPreEnqueuePluginMap sets preEnqueuePluginMap for PriorityQueue.
func WithPreEnqueuePluginMap(m map[string][]framework.PreEnqueuePlugin) Option {
	return func(o *priorityQueueOptions) {
		o.preEnqueuePluginMap = m
	}
}
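// A hypothetical usage sketch (not from the original file) of how the
// functional options above compose; lessFn and informerFactory are assumed to
// be supplied by the caller:
//
//	q := NewPriorityQueue(lessFn, informerFactory,
//		WithPodInitialBackoffDuration(2*time.Second),
//		WithPodMaxBackoffDuration(30*time.Second),
//		WithPodMaxInUnschedulablePodsDuration(10*time.Minute),
//	)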
// WithMetricsRecorder sets the metrics recorder.
func WithMetricsRecorder(recorder metrics.MetricAsyncRecorder) Option {
	return func(o *priorityQueueOptions) {
		o.metricsRecorder = recorder
	}
}

// WithPluginMetricsSamplePercent sets the percentage of plugin metrics to be sampled.
func WithPluginMetricsSamplePercent(percent int) Option {
	return func(o *priorityQueueOptions) {
		o.pluginMetricsSamplePercent = percent
	}
}

var defaultPriorityQueueOptions = priorityQueueOptions{
	clock:                             clock.RealClock{},
	podInitialBackoffDuration:         DefaultPodInitialBackoffDuration,
	podMaxBackoffDuration:             DefaultPodMaxBackoffDuration,
	podMaxInUnschedulablePodsDuration: DefaultPodMaxInUnschedulablePodsDuration,
}

// Making sure that PriorityQueue implements SchedulingQueue.
var _ SchedulingQueue = &PriorityQueue{}

// newQueuedPodInfoForLookup builds a QueuedPodInfo object for a lookup in the queue.
func newQueuedPodInfoForLookup(pod *v1.Pod, plugins ...string) *framework.QueuedPodInfo {
	// Since this is only used for a lookup in the queue, we only need to set the Pod,
	// and so we avoid creating a full PodInfo, which is expensive to instantiate frequently.
	return &framework.QueuedPodInfo{
		PodInfo:              &framework.PodInfo{Pod: pod},
		UnschedulablePlugins: sets.New(plugins...),
	}
}

// NewPriorityQueue creates a PriorityQueue object.
func NewPriorityQueue(
	lessFn framework.LessFunc,
	informerFactory informers.SharedInformerFactory,
	opts ...Option,
) *PriorityQueue {
	options := defaultPriorityQueueOptions
	if options.podLister == nil {
		options.podLister = informerFactory.Core().V1().Pods().Lister()
	}
	for _, opt := range opts {
		opt(&options)
	}

	comp := func(podInfo1, podInfo2 interface{}) bool {
		pInfo1 := podInfo1.(*framework.QueuedPodInfo)
		pInfo2 := podInfo2.(*framework.QueuedPodInfo)
		return lessFn(pInfo1, pInfo2)
	}

	pq := &PriorityQueue{
		nominator:                         newPodNominator(options.podLister),
		clock:                             options.clock,
		stop:                              make(chan struct{}),
		podInitialBackoffDuration:         options.podInitialBackoffDuration,
		podMaxBackoffDuration:             options.podMaxBackoffDuration,
		podMaxInUnschedulablePodsDuration: options.podMaxInUnschedulablePodsDuration,
		activeQ:                           heap.NewWithRecorder(podInfoKeyFunc, comp, metrics.NewActivePodsRecorder()),
		unschedulablePods:                 newUnschedulablePods(metrics.NewUnschedulablePodsRecorder(), metrics.NewGatedPodsRecorder()),
		inFlightPods:                      make(map[types.UID]*list.Element),
		inFlightEvents:                    list.New(),
		preEnqueuePluginMap:               options.preEnqueuePluginMap,
		queueingHintMap:                   options.queueingHintMap,
		metricsRecorder:                   options.metricsRecorder,
		pluginMetricsSamplePercent:        options.pluginMetricsSamplePercent,
		moveRequestCycle:                  -1,
		isSchedulingQueueHintEnabled:      utilfeature.DefaultFeatureGate.Enabled(features.SchedulerQueueingHints),
	}
	pq.cond.L = &pq.lock
	pq.podBackoffQ = heap.NewWithRecorder(podInfoKeyFunc, pq.podsCompareBackoffCompleted, metrics.NewBackoffPodsRecorder())
	pq.nsLister = informerFactory.Core().V1().Namespaces().Lister()

	return pq
}

// Run starts the goroutines that periodically flush podBackoffQ (pods whose
// backoff has completed move to activeQ) and unschedulablePods (pods that
// stayed too long move to backoffQ or activeQ).
func (p *PriorityQueue) Run(logger klog.Logger) {
	go wait.Until(func() {
		p.flushBackoffQCompleted(logger)
	}, 1.0*time.Second, p.stop)
	go wait.Until(func() {
		p.flushUnschedulablePodsLeftover(logger)
	}, 30*time.Second, p.stop)
}
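// A minimal lifecycle sketch (illustrative only): the scheduler's main loop
// pops a pod, attempts to schedule it, and must call Done (or
// AddUnschedulableIfNotPresent, which calls done internally) once the attempt
// finishes. Error handling is elided.
//
//	q.Run(logger)
//	for {
//		pInfo, err := q.Pop(logger)
//		if err != nil || pInfo == nil { // a nil pInfo means the queue was closed
//			break
//		}
//		// ... run the scheduling cycle for pInfo.Pod ...
//		q.Done(pInfo.Pod.UID)
//	}
//	q.Close()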
// queueingStrategy indicates how the scheduling queue should enqueue the Pod from the unschedulable pod pool.
type queueingStrategy int

const (
	// queueSkip indicates that the scheduling queue should skip requeuing the Pod to activeQ/backoffQ.
	queueSkip queueingStrategy = iota
	// queueAfterBackoff indicates that the scheduling queue should requeue the Pod after its backoff is completed.
	queueAfterBackoff
	// queueImmediately indicates that the scheduling queue should skip backoff and requeue the Pod immediately to activeQ.
	queueImmediately
)
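// An illustrative sketch (not from the original file) of a QueueingHintFn as
// consumed by isPodWorthRequeuing below. A plugin that rejected a Pod for
// lack of CPU might requeue only on node events that could help; the hint
// logic shown is a simplified assumption, not the real NodeResourcesFit
// implementation.
//
//	func nodeResourceHint(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
//		node, ok := newObj.(*v1.Node)
//		if !ok {
//			return framework.Queue, fmt.Errorf("unexpected object type %T", newObj)
//		}
//		if node.Status.Allocatable.Cpu().IsZero() {
//			// The new node has no allocatable CPU, so it can't help this Pod.
//			return framework.QueueSkip, nil
//		}
//		return framework.Queue, nil
//	}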
// isPodWorthRequeuing calls the QueueingHintFn of only the plugins registered in pInfo.UnschedulablePlugins and pInfo.PendingPlugins.
//
// If any of pInfo.PendingPlugins return Queue,
// the scheduling queue is supposed to enqueue this Pod to activeQ, skipping backoffQ.
// If any of pInfo.UnschedulablePlugins return Queue,
// the scheduling queue is supposed to enqueue this Pod to activeQ/backoffQ depending on the remaining backoff time of the Pod.
// If all QueueingHintFns return Skip, the scheduling queue enqueues the Pod back to the unschedulable Pod pool
// because no plugin changes the scheduling result via the event.
func (p *PriorityQueue) isPodWorthRequeuing(logger klog.Logger, pInfo *framework.QueuedPodInfo, event framework.ClusterEvent, oldObj, newObj interface{}) queueingStrategy {
	rejectorPlugins := pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins)
	if rejectorPlugins.Len() == 0 {
		logger.V(6).Info("Worth requeuing because no failed plugins", "pod", klog.KObj(pInfo.Pod))
		return queueAfterBackoff
	}

	if event.IsWildCard() {
		// The wildcard event is a special one, used when someone wants to force all Pods to move to activeQ/backoffQ.
		// We return queueAfterBackoff in this case, while resetting all blocked plugins.
		logger.V(6).Info("Worth requeuing because the event is wildcard", "pod", klog.KObj(pInfo.Pod))
		return queueAfterBackoff
	}

	hintMap, ok := p.queueingHintMap[pInfo.Pod.Spec.SchedulerName]
	if !ok {
		// shouldn't reach here unless there is a bug.
		logger.Error(nil, "No QueueingHintMap is registered for this profile", "profile", pInfo.Pod.Spec.SchedulerName, "pod", klog.KObj(pInfo.Pod))
		return queueAfterBackoff
	}

	pod := pInfo.Pod
	queueStrategy := queueSkip
	for eventToMatch, hintfns := range hintMap {
		if eventToMatch.Resource != event.Resource || eventToMatch.ActionType&event.ActionType == 0 {
			continue
		}

		for _, hintfn := range hintfns {
			if !rejectorPlugins.Has(hintfn.PluginName) {
				// skip if it's not a hintfn from the rejectorPlugins.
				continue
			}

			hint, err := hintfn.QueueingHintFn(logger, pod, oldObj, newObj)
			if err != nil {
				// If the QueueingHintFn returned an error, we should treat the event as Queue so that we can prevent
				// the Pod from being stuck in the unschedulable pod pool.
				oldObjMeta, newObjMeta, asErr := util.As[klog.KMetadata](oldObj, newObj)
				if asErr != nil {
					logger.Error(err, "QueueingHintFn returns error", "event", event, "plugin", hintfn.PluginName, "pod", klog.KObj(pod))
				} else {
					logger.Error(err, "QueueingHintFn returns error", "event", event, "plugin", hintfn.PluginName, "pod", klog.KObj(pod), "oldObj", klog.KObj(oldObjMeta), "newObj", klog.KObj(newObjMeta))
				}
				hint = framework.Queue
			}
			if hint == framework.QueueSkip {
				continue
			}

			if pInfo.PendingPlugins.Has(hintfn.PluginName) {
				// interprets Queue from the Pending plugin as queueImmediately.
				// We can return immediately because queueImmediately is the highest priority.
				return queueImmediately
			}

			// interprets Queue from the unschedulable plugin as queueAfterBackoff.

			if pInfo.PendingPlugins.Len() == 0 {
				// We can return immediately because this Pod has no Pending plugins registered
				// (only Pending plugins can cause queueImmediately), and queueAfterBackoff is
				// the second highest priority.
				return queueAfterBackoff
			}

			// We can't return immediately because there are some Pending plugins registered in this Pod.
			// We need to check if those plugins return Queue or not and, if they do, we return queueImmediately.
			queueStrategy = queueAfterBackoff
		}
	}

	return queueStrategy
}

// runPreEnqueuePlugins iterates the PreEnqueue function of each registered PreEnqueuePlugin.
// It returns true if all PreEnqueue functions run successfully; otherwise returns false
// upon the first failure.
// Note: we need to associate the failed plugin to `pInfo`, so that the pod can be moved back
// to activeQ by a related cluster event.
func (p *PriorityQueue) runPreEnqueuePlugins(ctx context.Context, pInfo *framework.QueuedPodInfo) bool {
	logger := klog.FromContext(ctx)
	var s *framework.Status
	pod := pInfo.Pod
	startTime := p.clock.Now()
	defer func() {
		metrics.FrameworkExtensionPointDuration.WithLabelValues(preEnqueue, s.Code().String(), pod.Spec.SchedulerName).Observe(metrics.SinceInSeconds(startTime))
	}()

	shouldRecordMetric := rand.Intn(100) < p.pluginMetricsSamplePercent
	for _, pl := range p.preEnqueuePluginMap[pod.Spec.SchedulerName] {
		s = p.runPreEnqueuePlugin(ctx, pl, pod, shouldRecordMetric)
		if s.IsSuccess() {
			continue
		}
		pInfo.UnschedulablePlugins.Insert(pl.Name())
		metrics.UnschedulableReason(pl.Name(), pod.Spec.SchedulerName).Inc()
		if s.Code() == framework.Error {
			logger.Error(s.AsError(), "Unexpected error running PreEnqueue plugin", "pod", klog.KObj(pod), "plugin", pl.Name())
		} else {
			logger.Info("Status after running PreEnqueue plugin", "pod", klog.KObj(pod), "plugin", pl.Name(), "status", s)
		}
		return false
	}
	return true
}

func (p *PriorityQueue) runPreEnqueuePlugin(ctx context.Context, pl framework.PreEnqueuePlugin, pod *v1.Pod, shouldRecordMetric bool) *framework.Status {
	if !shouldRecordMetric {
		return pl.PreEnqueue(ctx, pod)
	}
	startTime := p.clock.Now()
	s := pl.PreEnqueue(ctx, pod)
	p.metricsRecorder.ObservePluginDurationAsync(preEnqueue, pl.Name(), s.Code().String(), p.clock.Since(startTime).Seconds())
	return s
}
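// A hypothetical PreEnqueuePlugin (illustrative only) of the kind iterated by
// runPreEnqueuePlugins above: it gates pods carrying a "paused" scheduling
// gate. The plugin name and gate value are assumptions for the sketch.
//
//	type pauseGate struct{}
//
//	func (pg *pauseGate) Name() string { return "PauseGate" }
//
//	func (pg *pauseGate) PreEnqueue(ctx context.Context, pod *v1.Pod) *framework.Status {
//		for _, g := range pod.Spec.SchedulingGates {
//			if g.Name == "example.com/paused" {
//				return framework.NewStatus(framework.UnschedulableAndUnresolvable, "pod is paused")
//			}
//		}
//		return nil
//	}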
// addToActiveQ tries to add a pod to the active queue. It returns 2 parameters:
// 1. a boolean flag to indicate whether the pod is added successfully.
// 2. an error for the caller to act on.
func (p *PriorityQueue) addToActiveQ(logger klog.Logger, pInfo *framework.QueuedPodInfo) (bool, error) {
	pInfo.Gated = !p.runPreEnqueuePlugins(context.Background(), pInfo)
	if pInfo.Gated {
		// Add the Pod to unschedulablePods if it's not passing PreEnqueuePlugins.
		p.unschedulablePods.addOrUpdate(pInfo)
		return false, nil
	}
	if pInfo.InitialAttemptTimestamp == nil {
		now := p.clock.Now()
		pInfo.InitialAttemptTimestamp = &now
	}
	if err := p.activeQ.Add(pInfo); err != nil {
		logger.Error(err, "Error adding pod to the active queue", "pod", klog.KObj(pInfo.Pod))
		return false, err
	}
	return true, nil
}

// Add adds a pod to the active queue. It should be called only when a new pod
// is added, so there is no chance the pod is already in the active/unschedulable/backoff queues.
func (p *PriorityQueue) Add(logger klog.Logger, pod *v1.Pod) error {
	p.lock.Lock()
	defer p.lock.Unlock()

	pInfo := p.newQueuedPodInfo(pod)
	gated := pInfo.Gated
	if added, err := p.addToActiveQ(logger, pInfo); !added {
		return err
	}
	if p.unschedulablePods.get(pod) != nil {
		logger.Error(nil, "Error: pod is already in the unschedulable queue", "pod", klog.KObj(pod))
		p.unschedulablePods.delete(pod, gated)
	}
	// Delete pod from backoffQ if it is backing off
	if err := p.podBackoffQ.Delete(pInfo); err == nil {
		logger.Error(nil, "Error: pod is already in the podBackoff queue", "pod", klog.KObj(pod))
	}
	logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", PodAdd, "queue", activeQ)
	metrics.SchedulerQueueIncomingPods.WithLabelValues("active", PodAdd).Inc()
	p.addNominatedPodUnlocked(logger, pInfo.PodInfo, nil)
	p.cond.Broadcast()

	return nil
}

// Activate moves the given pods to activeQ iff they're in unschedulablePods or backoffQ.
func (p *PriorityQueue) Activate(logger klog.Logger, pods map[string]*v1.Pod) {
	p.lock.Lock()
	defer p.lock.Unlock()

	activated := false
	for _, pod := range pods {
		if p.activate(logger, pod) {
			activated = true
		}
	}

	if activated {
		p.cond.Broadcast()
	}
}
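// How Activate is typically fed (a sketch under assumptions; the map key
// format doesn't matter because Activate only iterates the values): a plugin
// records pods in the reserved PodsToActivate CycleState entry, which the
// scheduler later passes to Activate.
//
//	if c, err := state.Read(framework.PodsToActivateKey); err == nil {
//		if s, ok := c.(*framework.PodsToActivate); ok {
//			s.Lock()
//			s.Map[util.GetPodFullName(podToActivate)] = podToActivate
//			s.Unlock()
//		}
//	}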
func (p *PriorityQueue) activate(logger klog.Logger, pod *v1.Pod) bool {
	// Verify if the pod is present in activeQ.
	if _, exists, _ := p.activeQ.Get(newQueuedPodInfoForLookup(pod)); exists {
		// No need to activate if it's already present in activeQ.
		return false
	}
	var pInfo *framework.QueuedPodInfo
	// Verify if the pod is present in unschedulablePods or backoffQ.
	if pInfo = p.unschedulablePods.get(pod); pInfo == nil {
		// If the pod doesn't belong to unschedulablePods or backoffQ, don't activate it.
		if obj, exists, _ := p.podBackoffQ.Get(newQueuedPodInfoForLookup(pod)); !exists {
			logger.Error(nil, "To-activate pod does not exist in unschedulablePods or backoffQ", "pod", klog.KObj(pod))
			return false
		} else {
			pInfo = obj.(*framework.QueuedPodInfo)
		}
	}

	if pInfo == nil {
		// Redundant safe check. We shouldn't reach here.
		logger.Error(nil, "Internal error: cannot obtain pInfo")
		return false
	}

	gated := pInfo.Gated
	if added, _ := p.addToActiveQ(logger, pInfo); !added {
		return false
	}
	p.unschedulablePods.delete(pInfo.Pod, gated)
	p.podBackoffQ.Delete(pInfo)
	metrics.SchedulerQueueIncomingPods.WithLabelValues("active", ForceActivate).Inc()
	p.addNominatedPodUnlocked(logger, pInfo.PodInfo, nil)
	return true
}

// isPodBackingoff returns true if a pod is still waiting for its backoff timer.
// If this returns true, the pod should not be re-tried.
func (p *PriorityQueue) isPodBackingoff(podInfo *framework.QueuedPodInfo) bool {
	if podInfo.Gated {
		return false
	}
	boTime := p.getBackoffTime(podInfo)
	return boTime.After(p.clock.Now())
}

// SchedulingCycle returns the current scheduling cycle.
func (p *PriorityQueue) SchedulingCycle() int64 {
	p.lock.RLock()
	defer p.lock.RUnlock()
	return p.schedulingCycle
}
// determineSchedulingHintForInFlightPod looks at the unschedulable plugins of the given Pod
// and determines the scheduling hint for this Pod while checking the events that happened
// while the Pod was in flight.
func (p *PriorityQueue) determineSchedulingHintForInFlightPod(logger klog.Logger, pInfo *framework.QueuedPodInfo) queueingStrategy {
	logger.V(5).Info("Checking events for in-flight pod", "pod", klog.KObj(pInfo.Pod), "unschedulablePlugins", pInfo.UnschedulablePlugins, "inFlightEventsSize", p.inFlightEvents.Len(), "inFlightPodsSize", len(p.inFlightPods))

	// AddUnschedulableIfNotPresent is called with the Pod at the end of scheduling or binding.
	// So, given pInfo should have been Pop()ed before,
	// we can assume pInfo must be recorded in inFlightPods and thus inFlightEvents.
	inFlightPod, ok := p.inFlightPods[pInfo.Pod.UID]
	if !ok {
		// This can happen while updating a pod. In that case pInfo.UnschedulablePlugins should
		// be empty. If it is not, we may have a problem.
		if len(pInfo.UnschedulablePlugins) != 0 {
			logger.Error(nil, "In flight Pod isn't found in the scheduling queue. If you see this error log, it's likely a bug in the scheduler.", "pod", klog.KObj(pInfo.Pod))
			return queueAfterBackoff
		}
		if p.inFlightEvents.Len() > len(p.inFlightPods) {
			return queueAfterBackoff
		}
		return queueSkip
	}

	rejectorPlugins := pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins)
	if len(rejectorPlugins) == 0 {
		// No failed plugins are associated with this Pod.
		// Meaning something unusual (a temporary failure on kube-apiserver, etc) happened and this Pod gets moved back to the queue.
		// In this case, we should retry scheduling it because this Pod may not be retried until the next flush.
		return queueAfterBackoff
	}

	// check if there is an event that makes this Pod schedulable based on pInfo.UnschedulablePlugins.
	queueingStrategy := queueSkip
	for event := inFlightPod.Next(); event != nil; event = event.Next() {
		e, ok := event.Value.(*clusterEvent)
		if !ok {
			// Must be another in-flight Pod (*v1.Pod). Can be ignored.
			continue
		}
		logger.V(5).Info("Checking event for in-flight pod", "pod", klog.KObj(pInfo.Pod), "event", e.event.Label)

		switch p.isPodWorthRequeuing(logger, pInfo, e.event, e.oldObj, e.newObj) {
		case queueSkip:
			continue
		case queueImmediately:
			// queueImmediately is the highest priority.
			// No need to go through the rest of the events.
			return queueImmediately
		case queueAfterBackoff:
			// replace schedulingHint with queueAfterBackoff
			queueingStrategy = queueAfterBackoff
			if pInfo.PendingPlugins.Len() == 0 {
				// We can return immediately because this Pod has no Pending plugins registered
				// (only Pending plugins can cause queueImmediately), and queueAfterBackoff is
				// the second highest priority.
				return queueAfterBackoff
			}
		}
	}
	return queueingStrategy
}

// addUnschedulableWithoutQueueingHint inserts a pod that cannot be scheduled into
// the queue, unless it is already in the queue. Normally, PriorityQueue puts
// unschedulable pods in `unschedulablePods`. But if there has been a recent move
// request, then the pod is put in `podBackoffQ`.
// TODO: This function is called only when p.isSchedulingQueueHintEnabled is false,
// and this will be removed after SchedulingQueueHint goes to stable and the feature gate is removed.
func (p *PriorityQueue) addUnschedulableWithoutQueueingHint(logger klog.Logger, pInfo *framework.QueuedPodInfo, podSchedulingCycle int64) error {
	pod := pInfo.Pod
	// Refresh the timestamp since the pod is re-added.
	pInfo.Timestamp = p.clock.Now()

	// When the queueing hint is enabled, the two sets are used differently.
	// But when it isn't enabled, we use all of them as UnschedulablePlugins so that we don't break the old behaviour.
	rejectorPlugins := pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins)

	// If a move request has been received, move it to the BackoffQ, otherwise move
	// it to unschedulablePods.
	for plugin := range rejectorPlugins {
		metrics.UnschedulableReason(plugin, pInfo.Pod.Spec.SchedulerName).Inc()
	}
	if p.moveRequestCycle >= podSchedulingCycle || len(rejectorPlugins) == 0 {
		// Two cases to move a Pod to the active/backoff queue:
		// - The Pod is rejected by some plugins, but a move request is received after this Pod's scheduling cycle is started.
		//   In this case, the received event may make the Pod schedulable and we should retry scheduling it.
		// - No unschedulable plugins are associated with this Pod,
		//   meaning something unusual (a temporary failure on kube-apiserver, etc) happened and this Pod gets moved back to the queue.
		//   In this case, we should retry scheduling it because this Pod may not be retried until the next flush.
		if err := p.podBackoffQ.Add(pInfo); err != nil {
			return fmt.Errorf("error adding pod %v to the backoff queue: %v", klog.KObj(pod), err)
		}
		logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", ScheduleAttemptFailure, "queue", backoffQ)
		metrics.SchedulerQueueIncomingPods.WithLabelValues("backoff", ScheduleAttemptFailure).Inc()
	} else {
		p.unschedulablePods.addOrUpdate(pInfo)
		logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", ScheduleAttemptFailure, "queue", unschedulablePods)
		metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", ScheduleAttemptFailure).Inc()
	}

	p.addNominatedPodUnlocked(logger, pInfo.PodInfo, nil)
	return nil
}
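// A usage sketch (hypothetical; the plugin name is chosen for illustration)
// of the failure path handled by AddUnschedulableIfNotPresent below:
//
//	pInfo, _ := q.Pop(logger)
//	cycle := q.SchedulingCycle()
//	// ... the scheduling attempt fails; record the rejecting plugin ...
//	pInfo.UnschedulablePlugins.Insert("NodeResourcesFit")
//	_ = q.AddUnschedulableIfNotPresent(logger, pInfo, cycle)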
// AddUnschedulableIfNotPresent inserts a pod that cannot be scheduled into
// the queue, unless it is already in the queue. Normally, PriorityQueue puts
// unschedulable pods in `unschedulablePods`. But if there has been a recent move
// request, then the pod is put in `podBackoffQ`.
func (p *PriorityQueue) AddUnschedulableIfNotPresent(logger klog.Logger, pInfo *framework.QueuedPodInfo, podSchedulingCycle int64) error {
	p.lock.Lock()
	defer p.lock.Unlock()

	// In any case, this Pod will be moved back to the queue and we should call Done.
	defer p.done(pInfo.Pod.UID)

	pod := pInfo.Pod
	if p.unschedulablePods.get(pod) != nil {
		return fmt.Errorf("Pod %v is already present in unschedulable queue", klog.KObj(pod))
	}

	if _, exists, _ := p.activeQ.Get(pInfo); exists {
		return fmt.Errorf("Pod %v is already present in the active queue", klog.KObj(pod))
	}
	if _, exists, _ := p.podBackoffQ.Get(pInfo); exists {
		return fmt.Errorf("Pod %v is already present in the backoff queue", klog.KObj(pod))
	}

	if !p.isSchedulingQueueHintEnabled {
		// fall back to the old behavior which doesn't depend on the queueing hint.
		return p.addUnschedulableWithoutQueueingHint(logger, pInfo, podSchedulingCycle)
	}

	// Refresh the timestamp since the pod is re-added.
	pInfo.Timestamp = p.clock.Now()

	// If a move request has been received, move it to the BackoffQ, otherwise move
	// it to unschedulablePods.
	rejectorPlugins := pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins)
	for plugin := range rejectorPlugins {
		metrics.UnschedulableReason(plugin, pInfo.Pod.Spec.SchedulerName).Inc()
	}

	// We check whether this Pod may change its scheduling result by any of the events that happened during scheduling.
	schedulingHint := p.determineSchedulingHintForInFlightPod(logger, pInfo)

	// In this case, we try to requeue this Pod to activeQ/backoffQ.
	queue := p.requeuePodViaQueueingHint(logger, pInfo, schedulingHint, ScheduleAttemptFailure)
	logger.V(3).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", ScheduleAttemptFailure, "queue", queue, "schedulingCycle", podSchedulingCycle, "hint", schedulingHint, "unschedulable plugins", rejectorPlugins)
	if queue == activeQ {
		// When the Pod is moved to activeQ, we need to let p.cond know so that the Pod can be popped out.
		p.cond.Broadcast()
	}

	p.addNominatedPodUnlocked(logger, pInfo.PodInfo, nil)
	return nil
}

// flushBackoffQCompleted moves all pods from backoffQ which have completed backoff into activeQ.
func (p *PriorityQueue) flushBackoffQCompleted(logger klog.Logger) {
	p.lock.Lock()
	defer p.lock.Unlock()
	activated := false
	for {
		rawPodInfo := p.podBackoffQ.Peek()
		if rawPodInfo == nil {
			break
		}
		pInfo := rawPodInfo.(*framework.QueuedPodInfo)
		pod := pInfo.Pod
		if p.isPodBackingoff(pInfo) {
			break
		}
		_, err := p.podBackoffQ.Pop()
		if err != nil {
			logger.Error(err, "Unable to pop pod from backoff queue despite backoff completion", "pod", klog.KObj(pod))
			break
		}
		if added, _ := p.addToActiveQ(logger, pInfo); added {
			logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pod), "event", BackoffComplete, "queue", activeQ)
			metrics.SchedulerQueueIncomingPods.WithLabelValues("active", BackoffComplete).Inc()
			activated = true
		}
	}

	if activated {
		p.cond.Broadcast()
	}
}
// flushUnschedulablePodsLeftover moves pods which stay in unschedulablePods
// longer than podMaxInUnschedulablePodsDuration to backoffQ or activeQ.
func (p *PriorityQueue) flushUnschedulablePodsLeftover(logger klog.Logger) {
	p.lock.Lock()
	defer p.lock.Unlock()

	var podsToMove []*framework.QueuedPodInfo
	currentTime := p.clock.Now()
	for _, pInfo := range p.unschedulablePods.podInfoMap {
		lastScheduleTime := pInfo.Timestamp
		if currentTime.Sub(lastScheduleTime) > p.podMaxInUnschedulablePodsDuration {
			podsToMove = append(podsToMove, pInfo)
		}
	}

	if len(podsToMove) > 0 {
		p.movePodsToActiveOrBackoffQueue(logger, podsToMove, UnschedulableTimeout, nil, nil)
	}
}

// Pop removes the head of the active queue and returns it. It blocks if the
// activeQ is empty and waits until a new item is added to the queue. It
// increments the scheduling cycle when a pod is popped.
func (p *PriorityQueue) Pop(logger klog.Logger) (*framework.QueuedPodInfo, error) {
	p.lock.Lock()
	defer p.lock.Unlock()
	for p.activeQ.Len() == 0 {
		// When the queue is empty, invocation of Pop() is blocked until a new item is enqueued.
		// When Close() is called, the p.closed is set and the condition is broadcast,
		// which causes this loop to continue and return from the Pop().
		if p.closed {
			logger.V(2).Info("Scheduling queue is closed")
			return nil, nil
		}
		p.cond.Wait()
	}
	obj, err := p.activeQ.Pop()
	if err != nil {
		return nil, err
	}
	pInfo := obj.(*framework.QueuedPodInfo)
	pInfo.Attempts++
	p.schedulingCycle++
	// In flight, no concurrent events yet.
	if p.isSchedulingQueueHintEnabled {
		p.inFlightPods[pInfo.Pod.UID] = p.inFlightEvents.PushBack(pInfo.Pod)
	}

	// Update metrics and reset the set of unschedulable plugins for the next attempt.
	for plugin := range pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins) {
		metrics.UnschedulableReason(plugin, pInfo.Pod.Spec.SchedulerName).Dec()
	}
	pInfo.UnschedulablePlugins.Clear()
	pInfo.PendingPlugins.Clear()

	return pInfo, nil
}

// Done must be called for a pod returned by Pop. This allows the queue to
// keep track of which pods are currently being processed.
func (p *PriorityQueue) Done(pod types.UID) {
	p.lock.Lock()
	defer p.lock.Unlock()

	p.done(pod)
}

func (p *PriorityQueue) done(pod types.UID) {
	if !p.isSchedulingQueueHintEnabled {
		// do nothing if schedulingQueueHint is disabled.
		// In that case, we don't have inFlightPods and inFlightEvents.
		return
	}
	inFlightPod, ok := p.inFlightPods[pod]
	if !ok {
		// This Pod is already done()ed.
		return
	}
	delete(p.inFlightPods, pod)

	// Remove the pod from the list.
	p.inFlightEvents.Remove(inFlightPod)

	// Remove events which are only referred to by this Pod
	// so that the inFlightEvents list doesn't grow infinitely.
	// If the pod was at the head of the list, then all
	// events between it and the next pod are no longer needed
	// and can be removed.
	for {
		e := p.inFlightEvents.Front()
		if e == nil {
			// Empty list.
			break
		}
		if _, ok := e.Value.(*clusterEvent); !ok {
			// A pod, must stop pruning.
			break
		}
		p.inFlightEvents.Remove(e)
	}
}
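// A worked example (added for illustration) of the pruning in done() above:
// suppose inFlightEvents is [podA, event1, podB, event2] and Done(podA) is
// called. podA's element is removed, and event1 is then pruned because it
// sits at the head with no earlier in-flight pod left to consume it; the list
// becomes [podB, event2]. event2 stays because podB started before it.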
// isPodUpdated checks if the pod is updated in a way that it may have become
// schedulable. It drops the status of the pod and compares it with the old version,
// except for pod.status.resourceClaimStatuses: changing that may have an
// effect on scheduling.
func isPodUpdated(oldPod, newPod *v1.Pod) bool {
	strip := func(pod *v1.Pod) *v1.Pod {
		p := pod.DeepCopy()
		p.ResourceVersion = ""
		p.Generation = 0
		p.Status = v1.PodStatus{
			ResourceClaimStatuses: pod.Status.ResourceClaimStatuses,
		}
		p.ManagedFields = nil
		p.Finalizers = nil
		return p
	}
	return !reflect.DeepEqual(strip(oldPod), strip(newPod))
}

// Update updates a pod in the active or backoff queue if present. Otherwise, it removes
// the item from the unschedulable queue if the pod is updated in a way that may make it
// schedulable and adds the updated one to the active queue.
// If the pod is not present in any of the queues, it is added to the active queue.
func (p *PriorityQueue) Update(logger klog.Logger, oldPod, newPod *v1.Pod) error {
	p.lock.Lock()
	defer p.lock.Unlock()

	if oldPod != nil {
		oldPodInfo := newQueuedPodInfoForLookup(oldPod)
		// If the pod is already in the active queue, just update it there.
		if oldPodInfo, exists, _ := p.activeQ.Get(oldPodInfo); exists {
			pInfo := updatePod(oldPodInfo, newPod)
			p.updateNominatedPodUnlocked(logger, oldPod, pInfo.PodInfo)
			return p.activeQ.Update(pInfo)
		}

		// If the pod is in the backoff queue, update it there.
		if oldPodInfo, exists, _ := p.podBackoffQ.Get(oldPodInfo); exists {
			pInfo := updatePod(oldPodInfo, newPod)
			p.updateNominatedPodUnlocked(logger, oldPod, pInfo.PodInfo)
			return p.podBackoffQ.Update(pInfo)
		}
	}

	// If the pod is in the unschedulable queue, updating it may make it schedulable.
	if usPodInfo := p.unschedulablePods.get(newPod); usPodInfo != nil {
		pInfo := updatePod(usPodInfo, newPod)
		p.updateNominatedPodUnlocked(logger, oldPod, pInfo.PodInfo)
		if isPodUpdated(oldPod, newPod) {
			gated := usPodInfo.Gated
			if p.isPodBackingoff(usPodInfo) {
				if err := p.podBackoffQ.Add(pInfo); err != nil {
					return err
				}
				p.unschedulablePods.delete(usPodInfo.Pod, gated)
				logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", PodUpdate, "queue", backoffQ)
			} else {
				if added, err := p.addToActiveQ(logger, pInfo); !added {
					return err
				}
				p.unschedulablePods.delete(usPodInfo.Pod, gated)
				logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", BackoffComplete, "queue", activeQ)
				p.cond.Broadcast()
			}
		} else {
			// Pod update didn't make it schedulable, keep it in the unschedulable queue.
			p.unschedulablePods.addOrUpdate(pInfo)
		}

		return nil
	}
	// If pod is not in any of the queues, we put it in the active queue.
	pInfo := p.newQueuedPodInfo(newPod)
	if added, err := p.addToActiveQ(logger, pInfo); !added {
		return err
	}
	p.addNominatedPodUnlocked(logger, pInfo.PodInfo, nil)
	logger.V(5).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", PodUpdate, "queue", activeQ)
	p.cond.Broadcast()
	return nil
}
// Delete deletes the item from any of the queues. It assumes the pod is
// only in one of them.
func (p *PriorityQueue) Delete(pod *v1.Pod) error {
	p.lock.Lock()
	defer p.lock.Unlock()
	p.deleteNominatedPodIfExistsUnlocked(pod)
	pInfo := newQueuedPodInfoForLookup(pod)
	if err := p.activeQ.Delete(pInfo); err != nil {
		// The item was probably not found in the activeQ.
		p.podBackoffQ.Delete(pInfo)
		if pInfo = p.unschedulablePods.get(pod); pInfo != nil {
			p.unschedulablePods.delete(pod, pInfo.Gated)
		}
	}
	return nil
}

// AssignedPodAdded is called when a bound pod is added. Creation of this pod
// may make pending pods with matching affinity terms schedulable.
func (p *PriorityQueue) AssignedPodAdded(logger klog.Logger, pod *v1.Pod) {
	p.lock.Lock()
	p.movePodsToActiveOrBackoffQueue(logger, p.getUnschedulablePodsWithMatchingAffinityTerm(logger, pod), AssignedPodAdd, nil, pod)
	p.lock.Unlock()
}

// isPodResourcesResizedDown returns true if a pod CPU and/or memory resize request has been
// admitted by kubelet, is 'InProgress', and results in a net sizing down of updated resources.
// It returns false if either CPU or memory resource is net resized up, or if no resize is in progress.
func isPodResourcesResizedDown(pod *v1.Pod) bool {
	if utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScaling) {
		// TODO(vinaykul,wangchen615,InPlacePodVerticalScaling): Fix this to determine when a
		// pod is truly resized down (might need oldPod if we cannot determine from Status alone)
		if pod.Status.Resize == v1.PodResizeStatusInProgress {
			return true
		}
	}
	return false
}

// AssignedPodUpdated is called when a bound pod is updated. Change of labels
// may make pending pods with matching affinity terms schedulable.
func (p *PriorityQueue) AssignedPodUpdated(logger klog.Logger, oldPod, newPod *v1.Pod) {
	p.lock.Lock()
	if isPodResourcesResizedDown(newPod) {
		p.moveAllToActiveOrBackoffQueue(logger, AssignedPodUpdate, oldPod, newPod, nil)
	} else {
		p.movePodsToActiveOrBackoffQueue(logger, p.getUnschedulablePodsWithMatchingAffinityTerm(logger, newPod), AssignedPodUpdate, oldPod, newPod)
	}
	p.lock.Unlock()
}

// moveAllToActiveOrBackoffQueue moves all pods from unschedulablePods to activeQ or backoffQ.
// This function adds all pods and then signals the condition variable to ensure that
// if Pop() is waiting for an item, it receives the signal after all the pods are in the
// queue and the head is the highest priority pod.
// NOTE: this function assumes a lock has been acquired in the caller.
func (p *PriorityQueue) moveAllToActiveOrBackoffQueue(logger klog.Logger, event framework.ClusterEvent, oldObj, newObj interface{}, preCheck PreEnqueueCheck) {
	unschedulablePods := make([]*framework.QueuedPodInfo, 0, len(p.unschedulablePods.podInfoMap))
	for _, pInfo := range p.unschedulablePods.podInfoMap {
		if preCheck == nil || preCheck(pInfo.Pod) {
			unschedulablePods = append(unschedulablePods, pInfo)
		}
	}
	p.movePodsToActiveOrBackoffQueue(logger, unschedulablePods, event, oldObj, newObj)
}
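// A sketch of a typical caller (illustrative; NodeAdd stands in for one of
// the cluster events defined alongside this package, and the preCheck is the
// hypothetical hasNodeAffinity from the PreEnqueueCheck example above):
//
//	func onNodeAdd(logger klog.Logger, q SchedulingQueue, node *v1.Node) {
//		// A nil preCheck would re-evaluate every unschedulable pod.
//		q.MoveAllToActiveOrBackoffQueue(logger, NodeAdd, nil, node, hasNodeAffinity)
//	}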
// MoveAllToActiveOrBackoffQueue moves all pods from unschedulablePods to activeQ or backoffQ.
// This function adds all pods and then signals the condition variable to ensure that
// if Pop() is waiting for an item, it receives the signal after all the pods are in the
// queue and the head is the highest priority pod.
func (p *PriorityQueue) MoveAllToActiveOrBackoffQueue(logger klog.Logger, event framework.ClusterEvent, oldObj, newObj interface{}, preCheck PreEnqueueCheck) {
	p.lock.Lock()
	defer p.lock.Unlock()
	p.moveAllToActiveOrBackoffQueue(logger, event, oldObj, newObj, preCheck)
}

// requeuePodViaQueueingHint tries to requeue a Pod to activeQ, backoffQ or the unschedulable pod pool based on schedulingHint.
// It returns the name of the queue the Pod goes to.
//
// NOTE: this function assumes the lock has been acquired in the caller
func (p *PriorityQueue) requeuePodViaQueueingHint(logger klog.Logger, pInfo *framework.QueuedPodInfo, strategy queueingStrategy, event string) string {
	if strategy == queueSkip {
		p.unschedulablePods.addOrUpdate(pInfo)
		metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", event).Inc()
		return unschedulablePods
	}

	pod := pInfo.Pod
	if strategy == queueAfterBackoff && p.isPodBackingoff(pInfo) {
		if err := p.podBackoffQ.Add(pInfo); err != nil {
			logger.Error(err, "Error adding pod to the backoff queue, queue this Pod to unschedulable pod pool", "pod", klog.KObj(pod))
			p.unschedulablePods.addOrUpdate(pInfo)
			return unschedulablePods
		}

		metrics.SchedulerQueueIncomingPods.WithLabelValues("backoff", event).Inc()
		return backoffQ
	}

	// We reach here if the strategy is queueImmediately, or if it is queueAfterBackoff but the pod is not backing off.

	added, err := p.addToActiveQ(logger, pInfo)
	if err != nil {
		logger.Error(err, "Error adding pod to the active queue, queue this Pod to unschedulable pod pool", "pod", klog.KObj(pod))
	}
	if added {
		metrics.SchedulerQueueIncomingPods.WithLabelValues("active", event).Inc()
		return activeQ
	}
	if pInfo.Gated {
		// In case the pod is gated, the Pod is pushed back to the unschedulable Pods pool in addToActiveQ.
		return unschedulablePods
	}

	p.unschedulablePods.addOrUpdate(pInfo)
	metrics.SchedulerQueueIncomingPods.WithLabelValues("unschedulable", ScheduleAttemptFailure).Inc()
	return unschedulablePods
}
// NOTE: this function assumes the lock has been acquired in the caller
func (p *PriorityQueue) movePodsToActiveOrBackoffQueue(logger klog.Logger, podInfoList []*framework.QueuedPodInfo, event framework.ClusterEvent, oldObj, newObj interface{}) {
	activated := false
	for _, pInfo := range podInfoList {
		schedulingHint := p.isPodWorthRequeuing(logger, pInfo, event, oldObj, newObj)
		if schedulingHint == queueSkip {
			// QueueingHintFn determined that this Pod isn't worth putting into activeQ or backoffQ for this event.
			logger.V(5).Info("Event is not making pod schedulable", "pod", klog.KObj(pInfo.Pod), "event", event.Label)
			continue
		}

		p.unschedulablePods.delete(pInfo.Pod, pInfo.Gated)
		queue := p.requeuePodViaQueueingHint(logger, pInfo, schedulingHint, event.Label)
		logger.V(4).Info("Pod moved to an internal scheduling queue", "pod", klog.KObj(pInfo.Pod), "event", event.Label, "queue", queue, "hint", schedulingHint)
		if queue == activeQ {
			activated = true
		}
	}

	p.moveRequestCycle = p.schedulingCycle

	if p.isSchedulingQueueHintEnabled && len(p.inFlightPods) != 0 {
		logger.V(5).Info("Event received while pods are in flight", "event", event.Label, "numPods", len(p.inFlightPods))
		// AddUnschedulableIfNotPresent might get called for in-flight Pods later, and in
		// AddUnschedulableIfNotPresent we need to know whether events were
		// observed while scheduling them.
		p.inFlightEvents.PushBack(&clusterEvent{
			event:  event,
			oldObj: oldObj,
			newObj: newObj,
		})
	}

	if activated {
		p.cond.Broadcast()
	}
}

// getUnschedulablePodsWithMatchingAffinityTerm returns unschedulable pods which have
// any affinity term that matches "pod".
// NOTE: this function assumes the lock has been acquired in the caller.
func (p *PriorityQueue) getUnschedulablePodsWithMatchingAffinityTerm(logger klog.Logger, pod *v1.Pod) []*framework.QueuedPodInfo {
	nsLabels := interpodaffinity.GetNamespaceLabelsSnapshot(logger, pod.Namespace, p.nsLister)

	var podsToMove []*framework.QueuedPodInfo
	for _, pInfo := range p.unschedulablePods.podInfoMap {
		for _, term := range pInfo.RequiredAffinityTerms {
			if term.Matches(pod, nsLabels) {
				podsToMove = append(podsToMove, pInfo)
				break
			}
		}
	}
	return podsToMove
}

var pendingPodsSummary = "activeQ:%v; backoffQ:%v; unschedulablePods:%v"

// PendingPods returns all the pending pods in the queue, accompanied by a debugging
// string showing the number of pods in each queue.
// This function is used for debugging purposes in the scheduler cache dumper and comparer.
func (p *PriorityQueue) PendingPods() ([]*v1.Pod, string) {
	p.lock.RLock()
	defer p.lock.RUnlock()
	var result []*v1.Pod
	for _, pInfo := range p.activeQ.List() {
		result = append(result, pInfo.(*framework.QueuedPodInfo).Pod)
	}
	for _, pInfo := range p.podBackoffQ.List() {
		result = append(result, pInfo.(*framework.QueuedPodInfo).Pod)
	}
	for _, pInfo := range p.unschedulablePods.podInfoMap {
		result = append(result, pInfo.Pod)
	}
	return result, fmt.Sprintf(pendingPodsSummary, p.activeQ.Len(), p.podBackoffQ.Len(), len(p.unschedulablePods.podInfoMap))
}

// Close closes the priority queue.
func (p *PriorityQueue) Close() {
	p.lock.Lock()
	defer p.lock.Unlock()
	close(p.stop)
	p.closed = true
	p.cond.Broadcast()
}

// DeleteNominatedPodIfExists deletes <pod> from nominatedPods.
func (npm *nominator) DeleteNominatedPodIfExists(pod *v1.Pod) {
	npm.lock.Lock()
	npm.deleteNominatedPodIfExistsUnlocked(pod)
	npm.lock.Unlock()
}

func (npm *nominator) deleteNominatedPodIfExistsUnlocked(pod *v1.Pod) {
	npm.delete(pod)
}
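// Example debug usage (hypothetical) of PendingPods above, similar to how the
// scheduler's cache dumper consumes it:
//
//	pods, summary := q.PendingPods()
//	logger.Info("Dump of pending pods", "count", len(pods), "summary", summary)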
// AddNominatedPod adds a pod to the nominated pods of the given node.
// This is called during the preemption process after a node is nominated to run
// the pod. We update the structure before sending a request to update the pod
// object to avoid races with the following scheduling cycles.
func (npm *nominator) AddNominatedPod(logger klog.Logger, pi *framework.PodInfo, nominatingInfo *framework.NominatingInfo) {
	npm.lock.Lock()
	npm.addNominatedPodUnlocked(logger, pi, nominatingInfo)
	npm.lock.Unlock()
}

// NominatedPodsForNode returns a copy of pods that are nominated to run on the given node,
// but they are waiting for other pods to be removed from the node.
func (npm *nominator) NominatedPodsForNode(nodeName string) []*framework.PodInfo {
	npm.lock.RLock()
	defer npm.lock.RUnlock()
	// Make a copy of the nominated Pods so the caller can mutate safely.
	pods := make([]*framework.PodInfo, len(npm.nominatedPods[nodeName]))
	for i := 0; i < len(pods); i++ {
		pods[i] = npm.nominatedPods[nodeName][i].DeepCopy()
	}
	return pods
}

func (p *PriorityQueue) podsCompareBackoffCompleted(podInfo1, podInfo2 interface{}) bool {
	pInfo1 := podInfo1.(*framework.QueuedPodInfo)
	pInfo2 := podInfo2.(*framework.QueuedPodInfo)
	bo1 := p.getBackoffTime(pInfo1)
	bo2 := p.getBackoffTime(pInfo2)
	return bo1.Before(bo2)
}

// newQueuedPodInfo builds a QueuedPodInfo object.
func (p *PriorityQueue) newQueuedPodInfo(pod *v1.Pod, plugins ...string) *framework.QueuedPodInfo {
	now := p.clock.Now()
	// ignore this err since apiserver doesn't properly validate affinity terms
	// and we can't fix the validation for backwards compatibility.
	podInfo, _ := framework.NewPodInfo(pod)
	return &framework.QueuedPodInfo{
		PodInfo:                 podInfo,
		Timestamp:               now,
		InitialAttemptTimestamp: nil,
		UnschedulablePlugins:    sets.New(plugins...),
	}
}

// getBackoffTime returns the time that podInfo completes backoff
func (p *PriorityQueue) getBackoffTime(podInfo *framework.QueuedPodInfo) time.Time {
	duration := p.calculateBackoffDuration(podInfo)
	backoffTime := podInfo.Timestamp.Add(duration)
	return backoffTime
}

// calculateBackoffDuration is a helper function for calculating the backoffDuration
// based on the number of attempts the pod has made.
func (p *PriorityQueue) calculateBackoffDuration(podInfo *framework.QueuedPodInfo) time.Duration {
	duration := p.podInitialBackoffDuration
	for i := 1; i < podInfo.Attempts; i++ {
		// Use subtraction instead of addition or multiplication to avoid overflow.
		if duration > p.podMaxBackoffDuration-duration {
			return p.podMaxBackoffDuration
		}
		duration += duration
	}
	return duration
}

func updatePod(oldPodInfo interface{}, newPod *v1.Pod) *framework.QueuedPodInfo {
	pInfo := oldPodInfo.(*framework.QueuedPodInfo)
	pInfo.Update(newPod)
	return pInfo
}
1312 // UnschedulablePods holds pods that cannot be scheduled. This data structure
1313 // is used to implement unschedulablePods.
1314 type UnschedulablePods struct {
1315 	// podInfoMap is a map keyed by a pod's full name, and the value is a pointer to the QueuedPodInfo.
1316 	podInfoMap map[string]*framework.QueuedPodInfo
1317 	keyFunc    func(*v1.Pod) string
1318 	// unschedulableRecorder/gatedRecorder update the counters when elements of podInfoMap
1319 	// are added or removed; they do nothing if nil.
1320 	unschedulableRecorder, gatedRecorder metrics.MetricRecorder
1321 }
1322 
1323 // addOrUpdate adds a pod to the unschedulable podInfoMap.
1324 func (u *UnschedulablePods) addOrUpdate(pInfo *framework.QueuedPodInfo) {
1325 	podID := u.keyFunc(pInfo.Pod)
1326 	if _, exists := u.podInfoMap[podID]; !exists {
1327 		if pInfo.Gated && u.gatedRecorder != nil {
1328 			u.gatedRecorder.Inc()
1329 		} else if !pInfo.Gated && u.unschedulableRecorder != nil {
1330 			u.unschedulableRecorder.Inc()
1331 		}
1332 	}
1333 	u.podInfoMap[podID] = pInfo
1334 }
1335 
1336 // delete deletes a pod from the unschedulable podInfoMap.
1337 // The `gated` parameter is used to figure out which metric should be decremented.
1338 func (u *UnschedulablePods) delete(pod *v1.Pod, gated bool) {
1339 	podID := u.keyFunc(pod)
1340 	if _, exists := u.podInfoMap[podID]; exists {
1341 		if gated && u.gatedRecorder != nil {
1342 			u.gatedRecorder.Dec()
1343 		} else if !gated && u.unschedulableRecorder != nil {
1344 			u.unschedulableRecorder.Dec()
1345 		}
1346 	}
1347 	delete(u.podInfoMap, podID)
1348 }
1349 
1350 // get returns the QueuedPodInfo if a pod with the same key as the key of the given "pod"
1351 // is found in the map. It returns nil otherwise.
1352 func (u *UnschedulablePods) get(pod *v1.Pod) *framework.QueuedPodInfo {
1353 	podKey := u.keyFunc(pod)
1354 	if pInfo, exists := u.podInfoMap[podKey]; exists {
1355 		return pInfo
1356 	}
1357 	return nil
1358 }
1359 
1360 // clear removes all the entries from the unschedulable podInfoMap.
1361 func (u *UnschedulablePods) clear() {
1362 	u.podInfoMap = make(map[string]*framework.QueuedPodInfo)
1363 	if u.unschedulableRecorder != nil {
1364 		u.unschedulableRecorder.Clear()
1365 	}
1366 	if u.gatedRecorder != nil {
1367 		u.gatedRecorder.Clear()
1368 	}
1369 }
1370 
1371 // newUnschedulablePods initializes a new UnschedulablePods object.
1372 func newUnschedulablePods(unschedulableRecorder, gatedRecorder metrics.MetricRecorder) *UnschedulablePods {
1373 	return &UnschedulablePods{
1374 		podInfoMap:            make(map[string]*framework.QueuedPodInfo),
1375 		keyFunc:               util.GetPodFullName,
1376 		unschedulableRecorder: unschedulableRecorder,
1377 		gatedRecorder:         gatedRecorder,
1378 	}
1379 }
1380 
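// An illustrative sketch (not part of the upstream file): because both
// recorders are nil-checked, UnschedulablePods can be exercised without any
// metrics wired up, which is also how unit tests typically drive it. The
// function name exampleUnschedulablePods is hypothetical.
func exampleUnschedulablePods(pInfo *framework.QueuedPodInfo) {
	u := newUnschedulablePods(nil, nil) // nil recorders: metric updates are skipped
	u.addOrUpdate(pInfo)                // keyed by util.GetPodFullName(pInfo.Pod)
	if got := u.get(pInfo.Pod); got != nil {
		// Pass Gated through so the matching counter would be decremented
		// if recorders were configured.
		u.delete(pInfo.Pod, got.Gated)
	}
}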
1381 // nominator is a structure that stores pods nominated to run on nodes.
1382 // It exists because the nominatedNodeName of pod objects stored in the structure
1383 // may differ from what the scheduler has here. We should be able to find pods
1384 // by their UID and update/delete them.
1385 type nominator struct {
1386 	// podLister is used to verify whether the given pod is alive.
1387 	podLister listersv1.PodLister
1388 	// nominatedPods is a map keyed by a node name and the value is a list of
1389 	// pods which are nominated to run on the node. These are pods which can be in
1390 	// the activeQ or unschedulablePods.
1391 	nominatedPods map[string][]*framework.PodInfo
1392 	// nominatedPodToNode is a map keyed by a pod's UID; the value is the node name where the pod
1393 	// is nominated.
1394 	nominatedPodToNode map[types.UID]string
1395 
1396 	lock sync.RWMutex
1397 }
1398 
1399 func (npm *nominator) addNominatedPodUnlocked(logger klog.Logger, pi *framework.PodInfo, nominatingInfo *framework.NominatingInfo) {
1400 	// Always delete the pod if it already exists, to ensure we never store more than
1401 	// one instance of the pod.
1402 	npm.delete(pi.Pod)
1403 
1404 	var nodeName string
1405 	if nominatingInfo.Mode() == framework.ModeOverride {
1406 		nodeName = nominatingInfo.NominatedNodeName
1407 	} else if nominatingInfo.Mode() == framework.ModeNoop {
1408 		if pi.Pod.Status.NominatedNodeName == "" {
1409 			return
1410 		}
1411 		nodeName = pi.Pod.Status.NominatedNodeName
1412 	}
1413 
1414 	if npm.podLister != nil {
1415 		// If the pod was removed or if it was already scheduled, don't nominate it.
1416 		updatedPod, err := npm.podLister.Pods(pi.Pod.Namespace).Get(pi.Pod.Name)
1417 		if err != nil {
1418 			logger.V(4).Info("Pod doesn't exist in podLister, aborted adding it to the nominator", "pod", klog.KObj(pi.Pod))
1419 			return
1420 		}
1421 		if updatedPod.Spec.NodeName != "" {
1422 			logger.V(4).Info("Pod is already scheduled to a node, aborted adding it to the nominator", "pod", klog.KObj(pi.Pod), "node", updatedPod.Spec.NodeName)
1423 			return
1424 		}
1425 	}
1426 
1427 	npm.nominatedPodToNode[pi.Pod.UID] = nodeName
1428 	for _, npi := range npm.nominatedPods[nodeName] {
1429 		if npi.Pod.UID == pi.Pod.UID {
1430 			logger.V(4).Info("Pod already exists in the nominator", "pod", klog.KObj(npi.Pod))
1431 			return
1432 		}
1433 	}
1434 	npm.nominatedPods[nodeName] = append(npm.nominatedPods[nodeName], pi)
1435 }
1436 
1437 func (npm *nominator) delete(p *v1.Pod) {
1438 	nnn, ok := npm.nominatedPodToNode[p.UID]
1439 	if !ok {
1440 		return
1441 	}
1442 	for i, np := range npm.nominatedPods[nnn] {
1443 		if np.Pod.UID == p.UID {
1444 			npm.nominatedPods[nnn] = append(npm.nominatedPods[nnn][:i], npm.nominatedPods[nnn][i+1:]...)
1445 			if len(npm.nominatedPods[nnn]) == 0 {
1446 				delete(npm.nominatedPods, nnn)
1447 			}
1448 			break
1449 		}
1450 	}
1451 	delete(npm.nominatedPodToNode, p.UID)
1452 }
1453 
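// An illustrative sketch (not part of the upstream file) of how
// addNominatedPodUnlocked above resolves the target node, pulled out for
// clarity: ModeOverride takes the name from the NominatingInfo, ModeNoop falls
// back to the pod's status, and an empty fallback means there is nothing to
// record. The function name resolveNominatedNodeName is hypothetical.
func resolveNominatedNodeName(pod *v1.Pod, ni *framework.NominatingInfo) (string, bool) {
	if ni.Mode() == framework.ModeOverride {
		return ni.NominatedNodeName, true
	}
	// ModeNoop: keep whatever nominated node the pod already carries, if any.
	if nnn := pod.Status.NominatedNodeName; nnn != "" {
		return nnn, true
	}
	return "", false
}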
1454 // UpdateNominatedPod updates the <oldPod> with <newPodInfo>.
1455 func (npm *nominator) UpdateNominatedPod(logger klog.Logger, oldPod *v1.Pod, newPodInfo *framework.PodInfo) {
1456 	npm.lock.Lock()
1457 	defer npm.lock.Unlock()
1458 	npm.updateNominatedPodUnlocked(logger, oldPod, newPodInfo)
1459 }
1460 
1461 func (npm *nominator) updateNominatedPodUnlocked(logger klog.Logger, oldPod *v1.Pod, newPodInfo *framework.PodInfo) {
1462 	// In some cases, an Update event with no "NominatedNode" present is received right
1463 	// after a node ("NominatedNode") is reserved for this pod in memory.
1464 	// In this case, we need to keep reserving the NominatedNode when updating the pod pointer.
1465 	var nominatingInfo *framework.NominatingInfo
1466 	// We won't fall into the `if` block below if the Update event represents:
1467 	// (1) NominatedNode info is added
1468 	// (2) NominatedNode info is updated
1469 	// (3) NominatedNode info is removed
1470 	if NominatedNodeName(oldPod) == "" && NominatedNodeName(newPodInfo.Pod) == "" {
1471 		if nnn, ok := npm.nominatedPodToNode[oldPod.UID]; ok {
1472 			// This is the only case in which we should continue reserving the NominatedNode.
1473 			nominatingInfo = &framework.NominatingInfo{
1474 				NominatingMode:    framework.ModeOverride,
1475 				NominatedNodeName: nnn,
1476 			}
1477 		}
1478 	}
1479 	// We update irrespective of whether the nominatedNodeName changed, to ensure
1480 	// that the pod pointer is updated.
1481 	npm.delete(oldPod)
1482 	npm.addNominatedPodUnlocked(logger, newPodInfo, nominatingInfo)
1483 }
1484 
1485 // NewPodNominator creates a nominator as the backing of framework.PodNominator.
1486 // A podLister is passed in to check whether the pod exists
1487 // before adding its nominatedNode info.
1488 func NewPodNominator(podLister listersv1.PodLister) framework.PodNominator {
1489 	return newPodNominator(podLister)
1490 }
1491 
1492 func newPodNominator(podLister listersv1.PodLister) *nominator {
1493 	return &nominator{
1494 		podLister:          podLister,
1495 		nominatedPods:      make(map[string][]*framework.PodInfo),
1496 		nominatedPodToNode: make(map[types.UID]string),
1497 	}
1498 }
1499 
1500 func podInfoKeyFunc(obj interface{}) (string, error) {
1501 	return cache.MetaNamespaceKeyFunc(obj.(*framework.QueuedPodInfo).Pod)
1502 }
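// An illustrative sketch (not part of the upstream file): wiring a nominator
// from a SharedInformerFactory's pod lister, mirroring how a caller might
// obtain the PodNominator backing a scheduling queue. The function name
// examplePodNominator is hypothetical.
func examplePodNominator(informerFactory informers.SharedInformerFactory) framework.PodNominator {
	podLister := informerFactory.Core().V1().Pods().Lister()
	return NewPodNominator(podLister)
}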