k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/scheduler/schedule_one.go

/*
Copyright 2014 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package scheduler

import (
	"container/heap"
	"context"
	"errors"
	"fmt"
	"math/rand"
	"strconv"
	"sync"
	"sync/atomic"
	"time"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/sets"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/klog/v2"
	extenderv1 "k8s.io/kube-scheduler/extender/v1"
	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/kubernetes/pkg/apis/core/validation"
	"k8s.io/kubernetes/pkg/scheduler/framework"
	"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
	internalqueue "k8s.io/kubernetes/pkg/scheduler/internal/queue"
	"k8s.io/kubernetes/pkg/scheduler/metrics"
	"k8s.io/kubernetes/pkg/scheduler/util"
	utiltrace "k8s.io/utils/trace"
)

const (
	// Percentage of plugin metrics to be sampled.
	pluginMetricsSamplePercent = 10
	// minFeasibleNodesToFind is the minimum number of nodes that would be scored
	// in each scheduling cycle. This is a semi-arbitrary value to ensure that a
	// certain minimum of nodes are checked for feasibility. This in turn helps
	// ensure a minimum level of spreading.
	minFeasibleNodesToFind = 100
	// minFeasibleNodesPercentageToFind is the minimum percentage of nodes that
	// would be scored in each scheduling cycle. This is a semi-arbitrary value
	// to ensure that a certain minimum of nodes are checked for feasibility.
	// This in turn helps ensure a minimum level of spreading.
	minFeasibleNodesPercentageToFind = 5
	// numberOfHighestScoredNodesToReport is the number of node scores
	// to be included in ScheduleResult.
	numberOfHighestScoredNodesToReport = 3
)

// ScheduleOne does the entire scheduling workflow for a single pod. It is serialized on the scheduling algorithm's host fitting.
func (sched *Scheduler) ScheduleOne(ctx context.Context) {
	logger := klog.FromContext(ctx)
	podInfo, err := sched.NextPod(logger)
	if err != nil {
		logger.Error(err, "Error while retrieving next pod from scheduling queue")
		return
	}
	// pod could be nil when schedulerQueue is closed
	if podInfo == nil || podInfo.Pod == nil {
		return
	}

	pod := podInfo.Pod
	// TODO(knelasevero): Remove duplicated keys from log entry calls
	// When contextualized logging hits GA
	// https://github.com/kubernetes/kubernetes/issues/111672
	logger = klog.LoggerWithValues(logger, "pod", klog.KObj(pod))
	ctx = klog.NewContext(ctx, logger)
	logger.V(4).Info("About to try and schedule pod", "pod", klog.KObj(pod))

	fwk, err := sched.frameworkForPod(pod)
	if err != nil {
		// This shouldn't happen, because we only accept for scheduling the pods
		// which specify a scheduler name that matches one of the profiles.
		logger.Error(err, "Error occurred")
		return
	}
	if sched.skipPodSchedule(ctx, fwk, pod) {
		return
	}

	logger.V(3).Info("Attempting to schedule pod", "pod", klog.KObj(pod))

	// Synchronously attempt to find a fit for the pod.
	start := time.Now()
	state := framework.NewCycleState()
	state.SetRecordPluginMetrics(rand.Intn(100) < pluginMetricsSamplePercent)

	// Initialize an empty podsToActivate struct, which will be filled up by plugins or stay empty.
	podsToActivate := framework.NewPodsToActivate()
	state.Write(framework.PodsToActivateKey, podsToActivate)

	schedulingCycleCtx, cancel := context.WithCancel(ctx)
	defer cancel()

	scheduleResult, assumedPodInfo, status := sched.schedulingCycle(schedulingCycleCtx, state, fwk, podInfo, start, podsToActivate)
	if !status.IsSuccess() {
		sched.FailureHandler(schedulingCycleCtx, fwk, assumedPodInfo, status, scheduleResult.nominatingInfo, start)
		return
	}

	// bind the pod to its host asynchronously (we can do this b/c of the assumption step above).
	go func() {
		bindingCycleCtx, cancel := context.WithCancel(ctx)
		defer cancel()

		metrics.Goroutines.WithLabelValues(metrics.Binding).Inc()
		defer metrics.Goroutines.WithLabelValues(metrics.Binding).Dec()

		status := sched.bindingCycle(bindingCycleCtx, state, fwk, scheduleResult, assumedPodInfo, start, podsToActivate)
		if !status.IsSuccess() {
			sched.handleBindingCycleError(bindingCycleCtx, state, fwk, assumedPodInfo, start, scheduleResult, status)
			return
		}
		// Usually, DonePod is called inside the scheduling queue,
		// but in this case, we need to call it here because this Pod won't go back to the scheduling queue.
		sched.SchedulingQueue.Done(assumedPodInfo.Pod.UID)
	}()
}

var clearNominatedNode = &framework.NominatingInfo{NominatingMode: framework.ModeOverride, NominatedNodeName: ""}

// schedulingCycle tries to schedule a single Pod.
func (sched *Scheduler) schedulingCycle(
	ctx context.Context,
	state *framework.CycleState,
	fwk framework.Framework,
	podInfo *framework.QueuedPodInfo,
	start time.Time,
	podsToActivate *framework.PodsToActivate,
) (ScheduleResult, *framework.QueuedPodInfo, *framework.Status) {
	logger := klog.FromContext(ctx)
	pod := podInfo.Pod
	scheduleResult, err := sched.SchedulePod(ctx, fwk, state, pod)
	if err != nil {
		defer func() {
			metrics.SchedulingAlgorithmLatency.Observe(metrics.SinceInSeconds(start))
		}()
		if err == ErrNoNodesAvailable {
			status := framework.NewStatus(framework.UnschedulableAndUnresolvable).WithError(err)
			return ScheduleResult{nominatingInfo: clearNominatedNode}, podInfo, status
		}

		fitError, ok := err.(*framework.FitError)
		if !ok {
			logger.Error(err, "Error selecting node for pod", "pod", klog.KObj(pod))
			return ScheduleResult{nominatingInfo: clearNominatedNode}, podInfo, framework.AsStatus(err)
		}

		// SchedulePod() may have failed because the pod would not fit on any host, so we try to
		// preempt, with the expectation that the next time the pod is tried for scheduling it
		// will fit due to the preemption. It is also possible that a different pod will schedule
		// into the resources that were preempted, but this is harmless.

		if !fwk.HasPostFilterPlugins() {
			logger.V(3).Info("No PostFilter plugins are registered, so no preemption will be performed")
			return ScheduleResult{}, podInfo, framework.NewStatus(framework.Unschedulable).WithError(err)
		}

		// Run PostFilter plugins to attempt to make the pod schedulable in a future scheduling cycle.
		result, status := fwk.RunPostFilterPlugins(ctx, state, pod, fitError.Diagnosis.NodeToStatusMap)
		msg := status.Message()
		fitError.Diagnosis.PostFilterMsg = msg
		if status.Code() == framework.Error {
			logger.Error(nil, "Status after running PostFilter plugins for pod", "pod", klog.KObj(pod), "status", msg)
		} else {
			logger.V(5).Info("Status after running PostFilter plugins for pod", "pod", klog.KObj(pod), "status", msg)
		}

		var nominatingInfo *framework.NominatingInfo
		if result != nil {
			nominatingInfo = result.NominatingInfo
		}
		return ScheduleResult{nominatingInfo: nominatingInfo}, podInfo, framework.NewStatus(framework.Unschedulable).WithError(err)
	}

	metrics.SchedulingAlgorithmLatency.Observe(metrics.SinceInSeconds(start))
	// Tell the cache to assume that a pod now is running on a given node, even though it hasn't been bound yet.
	// This allows us to keep scheduling without waiting on binding to occur.
	assumedPodInfo := podInfo.DeepCopy()
	assumedPod := assumedPodInfo.Pod
	// assume modifies `assumedPod` by setting NodeName=scheduleResult.SuggestedHost
	err = sched.assume(logger, assumedPod, scheduleResult.SuggestedHost)
	if err != nil {
		// This is most probably a result of a BUG in the retrying logic.
		// We report an error here so that pod scheduling can be retried.
		// This relies on the fact that Error will check if the pod has been bound
		// to a node and if so will not add it back to the unscheduled pods queue
		// (otherwise this would cause an infinite loop).
		return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, framework.AsStatus(err)
	}

	// Run the Reserve method of reserve plugins.
	if sts := fwk.RunReservePluginsReserve(ctx, state, assumedPod, scheduleResult.SuggestedHost); !sts.IsSuccess() {
		// trigger un-reserve to clean up state associated with the reserved Pod
		fwk.RunReservePluginsUnreserve(ctx, state, assumedPod, scheduleResult.SuggestedHost)
		if forgetErr := sched.Cache.ForgetPod(logger, assumedPod); forgetErr != nil {
			logger.Error(forgetErr, "Scheduler cache ForgetPod failed")
		}

		if sts.IsRejected() {
			fitErr := &framework.FitError{
				NumAllNodes: 1,
				Pod:         pod,
				Diagnosis: framework.Diagnosis{
					NodeToStatusMap: framework.NodeToStatusMap{scheduleResult.SuggestedHost: sts},
				},
			}
			fitErr.Diagnosis.AddPluginStatus(sts)
			return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, framework.NewStatus(sts.Code()).WithError(fitErr)
		}
		return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, sts
	}

	// Run "permit" plugins.
	runPermitStatus := fwk.RunPermitPlugins(ctx, state, assumedPod, scheduleResult.SuggestedHost)
	if !runPermitStatus.IsWait() && !runPermitStatus.IsSuccess() {
		// trigger un-reserve to clean up state associated with the reserved Pod
		fwk.RunReservePluginsUnreserve(ctx, state, assumedPod, scheduleResult.SuggestedHost)
		if forgetErr := sched.Cache.ForgetPod(logger, assumedPod); forgetErr != nil {
			logger.Error(forgetErr, "Scheduler cache ForgetPod failed")
		}

		if runPermitStatus.IsRejected() {
			fitErr := &framework.FitError{
				NumAllNodes: 1,
				Pod:         pod,
				Diagnosis: framework.Diagnosis{
					NodeToStatusMap: framework.NodeToStatusMap{scheduleResult.SuggestedHost: runPermitStatus},
				},
			}
			fitErr.Diagnosis.AddPluginStatus(runPermitStatus)
			return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, framework.NewStatus(runPermitStatus.Code()).WithError(fitErr)
		}

		return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, runPermitStatus
	}

	// At the end of a successful scheduling cycle, pop and move up Pods if needed.
	if len(podsToActivate.Map) != 0 {
		sched.SchedulingQueue.Activate(logger, podsToActivate.Map)
		// Clear the entries after activation.
		podsToActivate.Map = make(map[string]*v1.Pod)
	}

	return scheduleResult, assumedPodInfo, nil
}
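
// NOTE: RunPermitPlugins in schedulingCycle above may leave the Pod in a waiting
// state rather than allowing or rejecting it outright; the actual wait happens in
// bindingCycle below via WaitOnPermit, which runs in the separate binding goroutine,
// so a long Permit wait does not block the scheduling of other Pods.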

// bindingCycle tries to bind an assumed Pod.
func (sched *Scheduler) bindingCycle(
	ctx context.Context,
	state *framework.CycleState,
	fwk framework.Framework,
	scheduleResult ScheduleResult,
	assumedPodInfo *framework.QueuedPodInfo,
	start time.Time,
	podsToActivate *framework.PodsToActivate) *framework.Status {
	logger := klog.FromContext(ctx)

	assumedPod := assumedPodInfo.Pod

	// Run "permit" plugins.
	if status := fwk.WaitOnPermit(ctx, assumedPod); !status.IsSuccess() {
		if status.IsRejected() {
			fitErr := &framework.FitError{
				NumAllNodes: 1,
				Pod:         assumedPodInfo.Pod,
				Diagnosis: framework.Diagnosis{
					NodeToStatusMap:      framework.NodeToStatusMap{scheduleResult.SuggestedHost: status},
					UnschedulablePlugins: sets.New(status.Plugin()),
				},
			}
			return framework.NewStatus(status.Code()).WithError(fitErr)
		}
		return status
	}

	// Run "prebind" plugins.
	if status := fwk.RunPreBindPlugins(ctx, state, assumedPod, scheduleResult.SuggestedHost); !status.IsSuccess() {
		return status
	}

	// Run "bind" plugins.
	if status := sched.bind(ctx, fwk, assumedPod, scheduleResult.SuggestedHost, state); !status.IsSuccess() {
		return status
	}

	// Calculating nodeResourceString can be heavy. Avoid it if klog verbosity is below 2.
	logger.V(2).Info("Successfully bound pod to node", "pod", klog.KObj(assumedPod), "node", scheduleResult.SuggestedHost, "evaluatedNodes", scheduleResult.EvaluatedNodes, "feasibleNodes", scheduleResult.FeasibleNodes)
	metrics.PodScheduled(fwk.ProfileName(), metrics.SinceInSeconds(start))
	metrics.PodSchedulingAttempts.Observe(float64(assumedPodInfo.Attempts))
	if assumedPodInfo.InitialAttemptTimestamp != nil {
		metrics.PodSchedulingDuration.WithLabelValues(getAttemptsLabel(assumedPodInfo)).Observe(metrics.SinceInSeconds(*assumedPodInfo.InitialAttemptTimestamp))
		metrics.PodSchedulingSLIDuration.WithLabelValues(getAttemptsLabel(assumedPodInfo)).Observe(metrics.SinceInSeconds(*assumedPodInfo.InitialAttemptTimestamp))
	}
	// Run "postbind" plugins.
	fwk.RunPostBindPlugins(ctx, state, assumedPod, scheduleResult.SuggestedHost)

	// At the end of a successful binding cycle, move up Pods if needed.
	if len(podsToActivate.Map) != 0 {
		sched.SchedulingQueue.Activate(logger, podsToActivate.Map)
		// Unlike the logic in schedulingCycle(), we don't bother deleting the entries
		// as `podsToActivate.Map` is no longer consumed.
	}

	return nil
}

func (sched *Scheduler) handleBindingCycleError(
	ctx context.Context,
	state *framework.CycleState,
	fwk framework.Framework,
	podInfo *framework.QueuedPodInfo,
	start time.Time,
	scheduleResult ScheduleResult,
	status *framework.Status) {
	logger := klog.FromContext(ctx)

	assumedPod := podInfo.Pod
	// trigger un-reserve plugins to clean up state associated with the reserved Pod
	fwk.RunReservePluginsUnreserve(ctx, state, assumedPod, scheduleResult.SuggestedHost)
	if forgetErr := sched.Cache.ForgetPod(logger, assumedPod); forgetErr != nil {
		logger.Error(forgetErr, "scheduler cache ForgetPod failed")
	} else {
		// "Forget"ing an assumed Pod in binding cycle should be treated as a PodDelete event,
		// as the assumed Pod had occupied a certain amount of resources in scheduler cache.
		//
		// Avoid moving the assumed Pod itself as it's always Unschedulable.
		// It's intentional to "defer" this operation; otherwise MoveAllToActiveOrBackoffQueue() would
		// add this event to in-flight events and thus move the assumed pod to backoffQ anyways if the plugins don't have appropriate QueueingHint.
		if status.IsRejected() {
			defer sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, internalqueue.AssignedPodDelete, assumedPod, nil, func(pod *v1.Pod) bool {
				return assumedPod.UID != pod.UID
			})
		} else {
			sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, internalqueue.AssignedPodDelete, assumedPod, nil, nil)
		}
	}

	sched.FailureHandler(ctx, fwk, podInfo, status, clearNominatedNode, start)
}

func (sched *Scheduler) frameworkForPod(pod *v1.Pod) (framework.Framework, error) {
	fwk, ok := sched.Profiles[pod.Spec.SchedulerName]
	if !ok {
		return nil, fmt.Errorf("profile not found for scheduler name %q", pod.Spec.SchedulerName)
	}
	return fwk, nil
}

// skipPodSchedule returns true if we could skip scheduling the pod for specified cases.
func (sched *Scheduler) skipPodSchedule(ctx context.Context, fwk framework.Framework, pod *v1.Pod) bool {
	// Case 1: pod is being deleted.
	if pod.DeletionTimestamp != nil {
		fwk.EventRecorder().Eventf(pod, nil, v1.EventTypeWarning, "FailedScheduling", "Scheduling", "skip schedule deleting pod: %v/%v", pod.Namespace, pod.Name)
		klog.FromContext(ctx).V(3).Info("Skip schedule deleting pod", "pod", klog.KObj(pod))
		return true
	}

	// Case 2: pod that has been assumed could be skipped.
	// An assumed pod can be added again to the scheduling queue if it got an update event
	// during its previous scheduling cycle but before getting assumed.
	isAssumed, err := sched.Cache.IsAssumedPod(pod)
	if err != nil {
		// TODO(91633): pass ctx into a revised HandleError
		utilruntime.HandleError(fmt.Errorf("failed to check whether pod %s/%s is assumed: %v", pod.Namespace, pod.Name, err))
		return false
	}
	return isAssumed
}

// schedulePod tries to schedule the given pod to one of the nodes in the node list.
// If it succeeds, it will return the name of the node.
// If it fails, it will return a FitError with reasons.
func (sched *Scheduler) schedulePod(ctx context.Context, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) (result ScheduleResult, err error) {
	trace := utiltrace.New("Scheduling", utiltrace.Field{Key: "namespace", Value: pod.Namespace}, utiltrace.Field{Key: "name", Value: pod.Name})
	defer trace.LogIfLong(100 * time.Millisecond)
	if err := sched.Cache.UpdateSnapshot(klog.FromContext(ctx), sched.nodeInfoSnapshot); err != nil {
		return result, err
	}
	trace.Step("Snapshotting scheduler cache and node infos done")

	if sched.nodeInfoSnapshot.NumNodes() == 0 {
		return result, ErrNoNodesAvailable
	}

	feasibleNodes, diagnosis, err := sched.findNodesThatFitPod(ctx, fwk, state, pod)
	if err != nil {
		return result, err
	}
	trace.Step("Computing predicates done")

	if len(feasibleNodes) == 0 {
		return result, &framework.FitError{
			Pod:         pod,
			NumAllNodes: sched.nodeInfoSnapshot.NumNodes(),
			Diagnosis:   diagnosis,
		}
	}

	// When only one node remains after filtering, just use it.
	if len(feasibleNodes) == 1 {
		return ScheduleResult{
			SuggestedHost:  feasibleNodes[0].Node().Name,
			EvaluatedNodes: diagnosis.EvaluatedNodes,
			FeasibleNodes:  1,
		}, nil
	}

	priorityList, err := prioritizeNodes(ctx, sched.Extenders, fwk, state, pod, feasibleNodes)
	if err != nil {
		return result, err
	}

	host, _, err := selectHost(priorityList, numberOfHighestScoredNodesToReport)
	trace.Step("Prioritizing done")

	return ScheduleResult{
		SuggestedHost:  host,
		EvaluatedNodes: diagnosis.EvaluatedNodes,
		FeasibleNodes:  len(feasibleNodes),
	}, err
}

// Filters the nodes to find the ones that fit the pod based on the framework
// filter plugins and filter extenders.
func (sched *Scheduler) findNodesThatFitPod(ctx context.Context, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) ([]*framework.NodeInfo, framework.Diagnosis, error) {
	logger := klog.FromContext(ctx)

	allNodes, err := sched.nodeInfoSnapshot.NodeInfos().List()
	if err != nil {
		return nil, framework.Diagnosis{}, err
	}

	diagnosis := framework.Diagnosis{
		NodeToStatusMap: make(framework.NodeToStatusMap, len(allNodes)),
	}
	// Run "prefilter" plugins.
	preRes, s := fwk.RunPreFilterPlugins(ctx, state, pod)
	if !s.IsSuccess() {
		if !s.IsRejected() {
			return nil, diagnosis, s.AsError()
		}
		// All nodes in NodeToStatusMap will have the same status so that they can be handled in the preemption.
		// Some non-trivial refactoring is needed to avoid this copy.
		for _, n := range allNodes {
			diagnosis.NodeToStatusMap[n.Node().Name] = s
		}

		// Record the messages from PreFilter in Diagnosis.PreFilterMsg.
		msg := s.Message()
		diagnosis.PreFilterMsg = msg
		logger.V(5).Info("Status after running PreFilter plugins for pod", "pod", klog.KObj(pod), "status", msg)
		diagnosis.AddPluginStatus(s)
		return nil, diagnosis, nil
	}

	// "NominatedNodeName" can potentially be set in a previous scheduling cycle as a result of preemption.
	// This node is likely the only candidate that will fit the pod, and hence we try it first before iterating over all nodes.
	if len(pod.Status.NominatedNodeName) > 0 {
		feasibleNodes, err := sched.evaluateNominatedNode(ctx, pod, fwk, state, diagnosis)
		if err != nil {
			logger.Error(err, "Evaluation failed on nominated node", "pod", klog.KObj(pod), "node", pod.Status.NominatedNodeName)
		}
		// Nominated node passes all the filters, scheduler is good to assign this node to the pod.
		if len(feasibleNodes) != 0 {
			return feasibleNodes, diagnosis, nil
		}
	}

	nodes := allNodes
	if !preRes.AllNodes() {
		nodes = make([]*framework.NodeInfo, 0, len(preRes.NodeNames))
		for _, n := range allNodes {
			if !preRes.NodeNames.Has(n.Node().Name) {
				// We consider Nodes that are filtered out by PreFilterResult as rejected via UnschedulableAndUnresolvable.
				// We have to record them in NodeToStatusMap so that they won't be considered as candidates in the preemption.
				diagnosis.NodeToStatusMap[n.Node().Name] = framework.NewStatus(framework.UnschedulableAndUnresolvable, "node is filtered out by the prefilter result")
				continue
			}
			nodes = append(nodes, n)
		}
	}
	feasibleNodes, err := sched.findNodesThatPassFilters(ctx, fwk, state, pod, &diagnosis, nodes)
	// always try to update the sched.nextStartNodeIndex regardless of whether an error has occurred
	// this is helpful to make sure that all the nodes have a chance to be searched
	processedNodes := len(feasibleNodes) + len(diagnosis.NodeToStatusMap)
	sched.nextStartNodeIndex = (sched.nextStartNodeIndex + processedNodes) % len(nodes)
	if err != nil {
		return nil, diagnosis, err
	}

	feasibleNodesAfterExtender, err := findNodesThatPassExtenders(ctx, sched.Extenders, pod, feasibleNodes, diagnosis.NodeToStatusMap)
	if err != nil {
		return nil, diagnosis, err
	}
	if len(feasibleNodesAfterExtender) != len(feasibleNodes) {
		// Extenders filtered out some nodes.
		//
		// Extender doesn't support any kind of requeueing feature like EnqueueExtensions in the scheduling framework.
		// When Extenders reject some Nodes and the pod ends up being unschedulable,
		// we put framework.ExtenderName to pInfo.UnschedulablePlugins.
		// This Pod will be requeued from unschedulable pod pool to activeQ/backoffQ
		// by any kind of cluster events.
		// https://github.com/kubernetes/kubernetes/issues/122019
		if diagnosis.UnschedulablePlugins == nil {
			diagnosis.UnschedulablePlugins = sets.New[string]()
		}
		diagnosis.UnschedulablePlugins.Insert(framework.ExtenderName)
	}

	return feasibleNodesAfterExtender, diagnosis, nil
}

func (sched *Scheduler) evaluateNominatedNode(ctx context.Context, pod *v1.Pod, fwk framework.Framework, state *framework.CycleState, diagnosis framework.Diagnosis) ([]*framework.NodeInfo, error) {
	nnn := pod.Status.NominatedNodeName
	nodeInfo, err := sched.nodeInfoSnapshot.Get(nnn)
	if err != nil {
		return nil, err
	}
	node := []*framework.NodeInfo{nodeInfo}
	feasibleNodes, err := sched.findNodesThatPassFilters(ctx, fwk, state, pod, &diagnosis, node)
	if err != nil {
		return nil, err
	}

	feasibleNodes, err = findNodesThatPassExtenders(ctx, sched.Extenders, pod, feasibleNodes, diagnosis.NodeToStatusMap)
	if err != nil {
		return nil, err
	}

	return feasibleNodes, nil
}
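
// NOTE: evaluateNominatedNode runs the filters against only the single nominated
// node. If that node turns out to be infeasible (for example, because preempted
// victims have not finished their graceful termination yet), findNodesThatFitPod
// falls back to evaluating the remaining nodes.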

// hasScoring checks if scoring nodes is configured.
func (sched *Scheduler) hasScoring(fwk framework.Framework) bool {
	if fwk.HasScorePlugins() {
		return true
	}
	for _, extender := range sched.Extenders {
		if extender.IsPrioritizer() {
			return true
		}
	}
	return false
}

// hasExtenderFilters checks if any extenders filter nodes.
func (sched *Scheduler) hasExtenderFilters() bool {
	for _, extender := range sched.Extenders {
		if extender.IsFilter() {
			return true
		}
	}
	return false
}

// findNodesThatPassFilters finds the nodes that fit the filter plugins.
func (sched *Scheduler) findNodesThatPassFilters(
	ctx context.Context,
	fwk framework.Framework,
	state *framework.CycleState,
	pod *v1.Pod,
	diagnosis *framework.Diagnosis,
	nodes []*framework.NodeInfo) ([]*framework.NodeInfo, error) {
	numAllNodes := len(nodes)
	numNodesToFind := sched.numFeasibleNodesToFind(fwk.PercentageOfNodesToScore(), int32(numAllNodes))
	if !sched.hasExtenderFilters() && !sched.hasScoring(fwk) {
		numNodesToFind = 1
	}

	// Create feasible list with enough space to avoid growing it
	// and allow assigning.
	feasibleNodes := make([]*framework.NodeInfo, numNodesToFind)

	if !fwk.HasFilterPlugins() {
		for i := range feasibleNodes {
			feasibleNodes[i] = nodes[(sched.nextStartNodeIndex+i)%numAllNodes]
		}
		diagnosis.EvaluatedNodes = int(numNodesToFind)
		return feasibleNodes, nil
	}

	errCh := parallelize.NewErrorChannel()
	var feasibleNodesLen int32
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	type nodeStatus struct {
		node   string
		status *framework.Status
	}
	result := make([]*nodeStatus, numAllNodes)
	checkNode := func(i int) {
		// We check the nodes starting from where we left off in the previous scheduling cycle;
		// this is to make sure all nodes have the same chance of being examined across pods.
		nodeInfo := nodes[(sched.nextStartNodeIndex+i)%numAllNodes]
		status := fwk.RunFilterPluginsWithNominatedPods(ctx, state, pod, nodeInfo)
		if status.Code() == framework.Error {
			errCh.SendErrorWithCancel(status.AsError(), cancel)
			return
		}
		if status.IsSuccess() {
			length := atomic.AddInt32(&feasibleNodesLen, 1)
			if length > numNodesToFind {
				cancel()
				atomic.AddInt32(&feasibleNodesLen, -1)
			} else {
				feasibleNodes[length-1] = nodeInfo
			}
		} else {
			result[i] = &nodeStatus{node: nodeInfo.Node().Name, status: status}
		}
	}

	beginCheckNode := time.Now()
	statusCode := framework.Success
	defer func() {
		// We record Filter extension point latency here instead of in framework.go because framework.RunFilterPlugins
		// function is called for each node, whereas we want to have an overall latency for all nodes per scheduling cycle.
		// Note that this latency also includes latency for `addNominatedPods`, which calls framework.RunPreFilterAddPod.
		metrics.FrameworkExtensionPointDuration.WithLabelValues(metrics.Filter, statusCode.String(), fwk.ProfileName()).Observe(metrics.SinceInSeconds(beginCheckNode))
	}()

	// Stops searching for more nodes once the configured number of feasible nodes
	// are found.
	fwk.Parallelizer().Until(ctx, numAllNodes, checkNode, metrics.Filter)
	feasibleNodes = feasibleNodes[:feasibleNodesLen]
	diagnosis.EvaluatedNodes = int(feasibleNodesLen)
	for _, item := range result {
		if item == nil {
			continue
		}
		diagnosis.NodeToStatusMap[item.node] = item.status
		diagnosis.EvaluatedNodes++
		diagnosis.AddPluginStatus(item.status)
	}
	if err := errCh.ReceiveError(); err != nil {
		statusCode = framework.Error
		return feasibleNodes, err
	}
	return feasibleNodes, nil
}

// numFeasibleNodesToFind returns the number of feasible nodes that, once found,
// make the scheduler stop searching for more feasible nodes.
func (sched *Scheduler) numFeasibleNodesToFind(percentageOfNodesToScore *int32, numAllNodes int32) (numNodes int32) {
	if numAllNodes < minFeasibleNodesToFind {
		return numAllNodes
	}

	// Use profile percentageOfNodesToScore if it's set. Otherwise, use global percentageOfNodesToScore.
	var percentage int32
	if percentageOfNodesToScore != nil {
		percentage = *percentageOfNodesToScore
	} else {
		percentage = sched.percentageOfNodesToScore
	}

	if percentage == 0 {
		percentage = int32(50) - numAllNodes/125
		if percentage < minFeasibleNodesPercentageToFind {
			percentage = minFeasibleNodesPercentageToFind
		}
	}

	numNodes = numAllNodes * percentage / 100
	if numNodes < minFeasibleNodesToFind {
		return minFeasibleNodesToFind
	}

	return numNodes
}
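
// Illustrative arithmetic for numFeasibleNodesToFind, assuming the adaptive
// default (no percentageOfNodesToScore configured on the profile or globally):
// with 5000 nodes the percentage is 50 - 5000/125 = 10, so about 500 nodes are
// filtered before the search stops; with 500 nodes it is 50 - 500/125 = 46,
// i.e. 230 nodes; clusters of 100 nodes or fewer are always searched in full.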

func findNodesThatPassExtenders(ctx context.Context, extenders []framework.Extender, pod *v1.Pod, feasibleNodes []*framework.NodeInfo, statuses framework.NodeToStatusMap) ([]*framework.NodeInfo, error) {
	logger := klog.FromContext(ctx)
	// Extenders are called sequentially.
	// Nodes in the original feasibleNodes can be excluded by one extender, and the
	// (possibly shrinking) list is passed on to the next extender.
	for _, extender := range extenders {
		if len(feasibleNodes) == 0 {
			break
		}
		if !extender.IsInterested(pod) {
			continue
		}

		// Status of failed nodes in failedAndUnresolvableMap will be added or overwritten in <statuses>,
		// so that the scheduler framework can respect the UnschedulableAndUnresolvable status for
		// particular nodes, and this may eventually improve preemption efficiency.
		// Note: users are recommended to configure the extenders that may return UnschedulableAndUnresolvable
		// status ahead of others.
		feasibleList, failedMap, failedAndUnresolvableMap, err := extender.Filter(pod, feasibleNodes)
		if err != nil {
			if extender.IsIgnorable() {
				logger.Info("Skipping extender as it returned error and has ignorable flag set", "extender", extender, "err", err)
				continue
			}
			return nil, err
		}

		for failedNodeName, failedMsg := range failedAndUnresolvableMap {
			var aggregatedReasons []string
			if _, found := statuses[failedNodeName]; found {
				aggregatedReasons = statuses[failedNodeName].Reasons()
			}
			aggregatedReasons = append(aggregatedReasons, failedMsg)
			statuses[failedNodeName] = framework.NewStatus(framework.UnschedulableAndUnresolvable, aggregatedReasons...)
		}

		for failedNodeName, failedMsg := range failedMap {
			if _, found := failedAndUnresolvableMap[failedNodeName]; found {
				// failedAndUnresolvableMap takes precedence over failedMap
				// note that this only happens if the extender returns the node in both maps
				continue
			}
			if _, found := statuses[failedNodeName]; !found {
				statuses[failedNodeName] = framework.NewStatus(framework.Unschedulable, failedMsg)
			} else {
				statuses[failedNodeName].AppendReason(failedMsg)
			}
		}

		feasibleNodes = feasibleList
	}
	return feasibleNodes, nil
}

// prioritizeNodes prioritizes the nodes by running the score plugins,
// which return a score for each node from the call to RunScorePlugins().
// The scores from each plugin are added together to make the score for that node, then
// any extenders are run as well.
// All scores are finally combined (added) to get the total weighted scores of all nodes.
func prioritizeNodes(
	ctx context.Context,
	extenders []framework.Extender,
	fwk framework.Framework,
	state *framework.CycleState,
	pod *v1.Pod,
	nodes []*framework.NodeInfo,
) ([]framework.NodePluginScores, error) {
	logger := klog.FromContext(ctx)
	// If no priority configs are provided, then all nodes will have a score of one.
	// This is required to generate the priority list in the required format.
	if len(extenders) == 0 && !fwk.HasScorePlugins() {
		result := make([]framework.NodePluginScores, 0, len(nodes))
		for i := range nodes {
			result = append(result, framework.NodePluginScores{
				Name:       nodes[i].Node().Name,
				TotalScore: 1,
			})
		}
		return result, nil
	}

	// Run PreScore plugins.
	preScoreStatus := fwk.RunPreScorePlugins(ctx, state, pod, nodes)
	if !preScoreStatus.IsSuccess() {
		return nil, preScoreStatus.AsError()
	}

	// Run the Score plugins.
	nodesScores, scoreStatus := fwk.RunScorePlugins(ctx, state, pod, nodes)
	if !scoreStatus.IsSuccess() {
		return nil, scoreStatus.AsError()
	}

	// Additional details logged at level 10 if enabled.
	loggerVTen := logger.V(10)
	if loggerVTen.Enabled() {
		for _, nodeScore := range nodesScores {
			for _, pluginScore := range nodeScore.Scores {
				loggerVTen.Info("Plugin scored node for pod", "pod", klog.KObj(pod), "plugin", pluginScore.Name, "node", nodeScore.Name, "score", pluginScore.Score)
			}
		}
	}

	if len(extenders) != 0 && nodes != nil {
		// allNodeExtendersScores has all extenders scores for all nodes.
		// It is keyed with node name.
		allNodeExtendersScores := make(map[string]*framework.NodePluginScores, len(nodes))
		var mu sync.Mutex
		var wg sync.WaitGroup
		for i := range extenders {
			if !extenders[i].IsInterested(pod) {
				continue
			}
			wg.Add(1)
			go func(extIndex int) {
				metrics.Goroutines.WithLabelValues(metrics.PrioritizingExtender).Inc()
				defer func() {
					metrics.Goroutines.WithLabelValues(metrics.PrioritizingExtender).Dec()
					wg.Done()
				}()
				prioritizedList, weight, err := extenders[extIndex].Prioritize(pod, nodes)
				if err != nil {
					// Prioritization errors from extender can be ignored, let k8s/other extenders determine the priorities
					logger.V(5).Info("Failed to run extender's priority function. No score given by this extender.", "error", err, "pod", klog.KObj(pod), "extender", extenders[extIndex].Name())
					return
				}
				mu.Lock()
				defer mu.Unlock()
				for i := range *prioritizedList {
					nodename := (*prioritizedList)[i].Host
					score := (*prioritizedList)[i].Score
					if loggerVTen.Enabled() {
						loggerVTen.Info("Extender scored node for pod", "pod", klog.KObj(pod), "extender", extenders[extIndex].Name(), "node", nodename, "score", score)
					}

					// MaxExtenderPriority may diverge from the max priority used in the scheduler and defined by MaxNodeScore,
					// therefore we need to scale the score returned by extenders to the score range used by the scheduler.
					finalscore := score * weight * (framework.MaxNodeScore / extenderv1.MaxExtenderPriority)

					if allNodeExtendersScores[nodename] == nil {
						allNodeExtendersScores[nodename] = &framework.NodePluginScores{
							Name:   nodename,
							Scores: make([]framework.PluginScore, 0, len(extenders)),
						}
					}
					allNodeExtendersScores[nodename].Scores = append(allNodeExtendersScores[nodename].Scores, framework.PluginScore{
						Name:  extenders[extIndex].Name(),
						Score: finalscore,
					})
					allNodeExtendersScores[nodename].TotalScore += finalscore
				}
			}(i)
		}
		// wait for all go routines to finish
		wg.Wait()
		for i := range nodesScores {
			if score, ok := allNodeExtendersScores[nodes[i].Node().Name]; ok {
				nodesScores[i].Scores = append(nodesScores[i].Scores, score.Scores...)
				nodesScores[i].TotalScore += score.TotalScore
			}
		}
	}

	if loggerVTen.Enabled() {
		for i := range nodesScores {
			loggerVTen.Info("Calculated node's final score for pod", "pod", klog.KObj(pod), "node", nodesScores[i].Name, "score", nodesScores[i].TotalScore)
		}
	}
	return nodesScores, nil
}
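
// Extenders report scores on the 0..extenderv1.MaxExtenderPriority scale (10),
// while score plugins use 0..framework.MaxNodeScore (100), hence the scaling in
// prioritizeNodes above: for example, an extender score of 5 with weight 2
// becomes 5 * 2 * (100/10) = 100 on the node score scale.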

var errEmptyPriorityList = errors.New("empty priorityList")

// selectHost takes a prioritized list of nodes and then picks one
// in a reservoir sampling manner from the nodes that had the highest score.
// It also returns the top {count} Nodes,
// and the top of the list will always be the selected host.
func selectHost(nodeScoreList []framework.NodePluginScores, count int) (string, []framework.NodePluginScores, error) {
	if len(nodeScoreList) == 0 {
		return "", nil, errEmptyPriorityList
	}

	var h nodeScoreHeap = nodeScoreList
	heap.Init(&h)
	cntOfMaxScore := 1
	selectedIndex := 0
	// The top of the heap is the NodeScoreResult with the highest score.
	sortedNodeScoreList := make([]framework.NodePluginScores, 0, count)
	sortedNodeScoreList = append(sortedNodeScoreList, heap.Pop(&h).(framework.NodePluginScores))

	// This for-loop will continue until all Nodes with the highest scores get checked for a reservoir sampling,
	// and sortedNodeScoreList gets (count - 1) elements.
	for ns := heap.Pop(&h).(framework.NodePluginScores); ; ns = heap.Pop(&h).(framework.NodePluginScores) {
		if ns.TotalScore != sortedNodeScoreList[0].TotalScore && len(sortedNodeScoreList) == count {
			break
		}

		if ns.TotalScore == sortedNodeScoreList[0].TotalScore {
			cntOfMaxScore++
			if rand.Intn(cntOfMaxScore) == 0 {
				// Replace the candidate with probability of 1/cntOfMaxScore
				selectedIndex = cntOfMaxScore - 1
			}
		}

		sortedNodeScoreList = append(sortedNodeScoreList, ns)

		if h.Len() == 0 {
			break
		}
	}

	if selectedIndex != 0 {
		// replace the first one with selected one
		previous := sortedNodeScoreList[0]
		sortedNodeScoreList[0] = sortedNodeScoreList[selectedIndex]
		sortedNodeScoreList[selectedIndex] = previous
	}

	if len(sortedNodeScoreList) > count {
		sortedNodeScoreList = sortedNodeScoreList[:count]
	}

	return sortedNodeScoreList[0].Name, sortedNodeScoreList, nil
}
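
// The reservoir sampling in selectHost gives every max-scoring node an equal
// 1/n chance of being picked: the k-th node seen with the top score replaces
// the current candidate with probability 1/k and survives the remaining
// replacements with probability k/(k+1) * ... * (n-1)/n, which multiplies out
// to 1/n overall.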

// nodeScoreHeap is a heap of framework.NodePluginScores.
type nodeScoreHeap []framework.NodePluginScores

// nodeScoreHeap implements heap.Interface.
var _ heap.Interface = &nodeScoreHeap{}

func (h nodeScoreHeap) Len() int           { return len(h) }
func (h nodeScoreHeap) Less(i, j int) bool { return h[i].TotalScore > h[j].TotalScore }
func (h nodeScoreHeap) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }

func (h *nodeScoreHeap) Push(x interface{}) {
	*h = append(*h, x.(framework.NodePluginScores))
}

func (h *nodeScoreHeap) Pop() interface{} {
	old := *h
	n := len(old)
	x := old[n-1]
	*h = old[0 : n-1]
	return x
}

// assume signals to the cache that a pod is already in the cache, so that binding can be asynchronous.
// assume modifies `assumed`.
func (sched *Scheduler) assume(logger klog.Logger, assumed *v1.Pod, host string) error {
	// Optimistically assume that the binding will succeed and send it to apiserver
	// in the background.
	// If the binding fails, scheduler will release resources allocated to assumed pod
	// immediately.
	assumed.Spec.NodeName = host

	if err := sched.Cache.AssumePod(logger, assumed); err != nil {
		logger.Error(err, "Scheduler cache AssumePod failed")
		return err
	}
	// if "assumed" is a nominated pod, we should remove it from internal cache
	if sched.SchedulingQueue != nil {
		sched.SchedulingQueue.DeleteNominatedPodIfExists(assumed)
	}

	return nil
}

// bind binds a pod to a given node defined in a binding object.
// The precedence for binding is: (1) extenders and (2) framework plugins.
// We expect this to run asynchronously, so we handle binding metrics internally.
func (sched *Scheduler) bind(ctx context.Context, fwk framework.Framework, assumed *v1.Pod, targetNode string, state *framework.CycleState) (status *framework.Status) {
	logger := klog.FromContext(ctx)
	defer func() {
		sched.finishBinding(logger, fwk, assumed, targetNode, status)
	}()

	bound, err := sched.extendersBinding(logger, assumed, targetNode)
	if bound {
		return framework.AsStatus(err)
	}
	return fwk.RunBindPlugins(ctx, state, assumed, targetNode)
}

// TODO(#87159): Move this to a Plugin.
func (sched *Scheduler) extendersBinding(logger klog.Logger, pod *v1.Pod, node string) (bool, error) {
	for _, extender := range sched.Extenders {
		if !extender.IsBinder() || !extender.IsInterested(pod) {
			continue
		}
		err := extender.Bind(&v1.Binding{
			ObjectMeta: metav1.ObjectMeta{Namespace: pod.Namespace, Name: pod.Name, UID: pod.UID},
			Target:     v1.ObjectReference{Kind: "Node", Name: node},
		})
		if err != nil && extender.IsIgnorable() {
			logger.Info("Skipping extender in bind as it returned error and has ignorable flag set", "extender", extender, "err", err)
			continue
		}
		return true, err
	}
	return false, nil
}

func (sched *Scheduler) finishBinding(logger klog.Logger, fwk framework.Framework, assumed *v1.Pod, targetNode string, status *framework.Status) {
	if finErr := sched.Cache.FinishBinding(logger, assumed); finErr != nil {
		logger.Error(finErr, "Scheduler cache FinishBinding failed")
	}
	if !status.IsSuccess() {
		logger.V(1).Info("Failed to bind pod", "pod", klog.KObj(assumed))
		return
	}

	fwk.EventRecorder().Eventf(assumed, nil, v1.EventTypeNormal, "Scheduled", "Binding", "Successfully assigned %v/%v to %v", assumed.Namespace, assumed.Name, targetNode)
}

func getAttemptsLabel(p *framework.QueuedPodInfo) string {
	// We break down the pod scheduling duration by attempts capped to a limit
	// to avoid ending up with a high cardinality metric.
	if p.Attempts >= 15 {
		return "15+"
	}
	return strconv.Itoa(p.Attempts)
}

// handleSchedulingFailure records an event for the pod that indicates the
// pod has failed to schedule. Also, update the pod condition and nominated node name if set.
func (sched *Scheduler) handleSchedulingFailure(ctx context.Context, fwk framework.Framework, podInfo *framework.QueuedPodInfo, status *framework.Status, nominatingInfo *framework.NominatingInfo, start time.Time) {
	calledDone := false
	defer func() {
		if !calledDone {
			// Basically, AddUnschedulableIfNotPresent calls DonePod internally.
			// But, AddUnschedulableIfNotPresent isn't called in some corner cases.
			// Here, we call DonePod explicitly to avoid leaking the pod.
			sched.SchedulingQueue.Done(podInfo.Pod.UID)
		}
	}()

	logger := klog.FromContext(ctx)
	reason := v1.PodReasonSchedulerError
	if status.IsRejected() {
		reason = v1.PodReasonUnschedulable
	}

	switch reason {
	case v1.PodReasonUnschedulable:
		metrics.PodUnschedulable(fwk.ProfileName(), metrics.SinceInSeconds(start))
	case v1.PodReasonSchedulerError:
		metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start))
	}

	pod := podInfo.Pod
	err := status.AsError()
	errMsg := status.Message()

	if err == ErrNoNodesAvailable {
		logger.V(2).Info("Unable to schedule pod; no nodes are registered to the cluster; waiting", "pod", klog.KObj(pod))
	} else if fitError, ok := err.(*framework.FitError); ok { // Inject UnschedulablePlugins to PodInfo, which will be used later for moving Pods between queues efficiently.
		podInfo.UnschedulablePlugins = fitError.Diagnosis.UnschedulablePlugins
		podInfo.PendingPlugins = fitError.Diagnosis.PendingPlugins
		logger.V(2).Info("Unable to schedule pod; no fit; waiting", "pod", klog.KObj(pod), "err", errMsg)
	} else {
		logger.Error(err, "Error scheduling pod; retrying", "pod", klog.KObj(pod))
	}

	// Check if the Pod exists in informer cache.
	podLister := fwk.SharedInformerFactory().Core().V1().Pods().Lister()
	cachedPod, e := podLister.Pods(pod.Namespace).Get(pod.Name)
	if e != nil {
		logger.Info("Pod doesn't exist in informer cache", "pod", klog.KObj(pod), "err", e)
		// We need to call DonePod here because we don't call AddUnschedulableIfNotPresent in this case.
	} else {
		// In the case of extender, the pod may have been bound successfully, but timed out returning its response to the scheduler.
		// It could result in the live version carrying .spec.nodeName, which is inconsistent with the internally-queued version.
		if len(cachedPod.Spec.NodeName) != 0 {
			logger.Info("Pod has been assigned to node. Abort adding it back to queue.", "pod", klog.KObj(pod), "node", cachedPod.Spec.NodeName)
			// We need to call DonePod here because we don't call AddUnschedulableIfNotPresent in this case.
		} else {
			// As <cachedPod> is from SharedInformer, we need to do a DeepCopy() here.
			// ignore this err since apiserver doesn't properly validate affinity terms
			// and we can't fix the validation for backwards compatibility.
			podInfo.PodInfo, _ = framework.NewPodInfo(cachedPod.DeepCopy())
			if err := sched.SchedulingQueue.AddUnschedulableIfNotPresent(logger, podInfo, sched.SchedulingQueue.SchedulingCycle()); err != nil {
				logger.Error(err, "Error occurred")
			}
			calledDone = true
		}
	}

	// Update the scheduling queue with the nominated pod information. Without
	// this, there would be a race condition between the next scheduling cycle
	// and the time the scheduler receives a Pod Update for the nominated pod.
	// Here we check for nil only for tests.
	if sched.SchedulingQueue != nil {
		logger := klog.FromContext(ctx)
		sched.SchedulingQueue.AddNominatedPod(logger, podInfo.PodInfo, nominatingInfo)
	}

	if err == nil {
		// Only tests can reach here.
		return
	}

	msg := truncateMessage(errMsg)
	fwk.EventRecorder().Eventf(pod, nil, v1.EventTypeWarning, "FailedScheduling", "Scheduling", msg)
	if err := updatePod(ctx, sched.client, pod, &v1.PodCondition{
		Type:    v1.PodScheduled,
		Status:  v1.ConditionFalse,
		Reason:  reason,
		Message: errMsg,
	}, nominatingInfo); err != nil {
		klog.FromContext(ctx).Error(err, "Error updating pod", "pod", klog.KObj(pod))
	}
}

// truncateMessage truncates a message if it hits the NoteLengthLimit.
func truncateMessage(message string) string {
	max := validation.NoteLengthLimit
	if len(message) <= max {
		return message
	}
	suffix := " ..."
	return message[:max-len(suffix)] + suffix
}

func updatePod(ctx context.Context, client clientset.Interface, pod *v1.Pod, condition *v1.PodCondition, nominatingInfo *framework.NominatingInfo) error {
	logger := klog.FromContext(ctx)
	logger.V(3).Info("Updating pod condition", "pod", klog.KObj(pod), "conditionType", condition.Type, "conditionStatus", condition.Status, "conditionReason", condition.Reason)
	podStatusCopy := pod.Status.DeepCopy()
	// NominatedNodeName is updated only if we are trying to set it, and the value is
	// different from the existing one.
	nnnNeedsUpdate := nominatingInfo.Mode() == framework.ModeOverride && pod.Status.NominatedNodeName != nominatingInfo.NominatedNodeName
	if !podutil.UpdatePodCondition(podStatusCopy, condition) && !nnnNeedsUpdate {
		return nil
	}
	if nnnNeedsUpdate {
		podStatusCopy.NominatedNodeName = nominatingInfo.NominatedNodeName
	}
	return util.PatchPodStatus(ctx, client, pod, podStatusCopy)
}