k8s.io/kubernetes@v1.29.3/pkg/scheduler/schedule_one.go

     1  /*
     2  Copyright 2014 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package scheduler
    18  
    19  import (
    20  	"container/heap"
    21  	"context"
    22  	"errors"
    23  	"fmt"
    24  	"math/rand"
    25  	"strconv"
    26  	"sync"
    27  	"sync/atomic"
    28  	"time"
    29  
    30  	v1 "k8s.io/api/core/v1"
    31  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    32  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    33  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    34  	"k8s.io/apimachinery/pkg/util/sets"
    35  	clientset "k8s.io/client-go/kubernetes"
    36  	"k8s.io/klog/v2"
    37  	extenderv1 "k8s.io/kube-scheduler/extender/v1"
    38  	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
    39  	"k8s.io/kubernetes/pkg/apis/core/validation"
    40  	"k8s.io/kubernetes/pkg/scheduler/framework"
    41  	"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
    42  	internalqueue "k8s.io/kubernetes/pkg/scheduler/internal/queue"
    43  	"k8s.io/kubernetes/pkg/scheduler/metrics"
    44  	"k8s.io/kubernetes/pkg/scheduler/util"
    45  	utiltrace "k8s.io/utils/trace"
    46  )
    47  
    48  const (
    49  	// Percentage of plugin metrics to be sampled.
    50  	pluginMetricsSamplePercent = 10
    51  	// minFeasibleNodesToFind is the minimum number of nodes that would be scored
    52  	// in each scheduling cycle. This is a semi-arbitrary value to ensure that a
    53  	// certain minimum of nodes are checked for feasibility. This in turn helps
    54  	// ensure a minimum level of spreading.
    55  	minFeasibleNodesToFind = 100
    56  	// minFeasibleNodesPercentageToFind is the minimum percentage of nodes that
    57  	// would be scored in each scheduling cycle. This is a semi-arbitrary value
    58  	// to ensure that a certain minimum of nodes are checked for feasibility.
    59  	// This in turn helps ensure a minimum level of spreading.
    60  	minFeasibleNodesPercentageToFind = 5
    61  	// numberOfHighestScoredNodesToReport is the number of node scores
    62  	// to be included in ScheduleResult.
    63  	numberOfHighestScoredNodesToReport = 3
    64  )
    65  
     66  // scheduleOne does the entire scheduling workflow for a single pod. It is serialized on the scheduling algorithm's host-fitting step: only one scheduling cycle runs at a time.
    67  func (sched *Scheduler) scheduleOne(ctx context.Context) {
    68  	logger := klog.FromContext(ctx)
    69  	podInfo, err := sched.NextPod(logger)
    70  	if err != nil {
    71  		logger.Error(err, "Error while retrieving next pod from scheduling queue")
    72  		return
    73  	}
     74  	// podInfo or its Pod could be nil when the scheduling queue is closed
    75  	if podInfo == nil || podInfo.Pod == nil {
    76  		return
    77  	}
    78  
    79  	pod := podInfo.Pod
    80  	// TODO(knelasevero): Remove duplicated keys from log entry calls
    81  	// When contextualized logging hits GA
    82  	// https://github.com/kubernetes/kubernetes/issues/111672
    83  	logger = klog.LoggerWithValues(logger, "pod", klog.KObj(pod))
    84  	ctx = klog.NewContext(ctx, logger)
    85  	logger.V(4).Info("About to try and schedule pod", "pod", klog.KObj(pod))
    86  
    87  	fwk, err := sched.frameworkForPod(pod)
    88  	if err != nil {
     89  		// This shouldn't happen, because we only accept pods for scheduling
     90  		// that specify a scheduler name matching one of the profiles.
    91  		logger.Error(err, "Error occurred")
    92  		return
    93  	}
    94  	if sched.skipPodSchedule(ctx, fwk, pod) {
    95  		return
    96  	}
    97  
    98  	logger.V(3).Info("Attempting to schedule pod", "pod", klog.KObj(pod))
    99  
   100  	// Synchronously attempt to find a fit for the pod.
   101  	start := time.Now()
   102  	state := framework.NewCycleState()
   103  	state.SetRecordPluginMetrics(rand.Intn(100) < pluginMetricsSamplePercent)
   104  
   105  	// Initialize an empty podsToActivate struct, which will be filled up by plugins or stay empty.
   106  	podsToActivate := framework.NewPodsToActivate()
   107  	state.Write(framework.PodsToActivateKey, podsToActivate)
   108  
   109  	schedulingCycleCtx, cancel := context.WithCancel(ctx)
   110  	defer cancel()
   111  
   112  	scheduleResult, assumedPodInfo, status := sched.schedulingCycle(schedulingCycleCtx, state, fwk, podInfo, start, podsToActivate)
   113  	if !status.IsSuccess() {
   114  		sched.FailureHandler(schedulingCycleCtx, fwk, assumedPodInfo, status, scheduleResult.nominatingInfo, start)
   115  		return
   116  	}
   117  
    118  	// Bind the pod to its host asynchronously (this is safe because of the assume step above).
   119  	go func() {
   120  		bindingCycleCtx, cancel := context.WithCancel(ctx)
   121  		defer cancel()
   122  
   123  		metrics.Goroutines.WithLabelValues(metrics.Binding).Inc()
   124  		defer metrics.Goroutines.WithLabelValues(metrics.Binding).Dec()
   125  
   126  		status := sched.bindingCycle(bindingCycleCtx, state, fwk, scheduleResult, assumedPodInfo, start, podsToActivate)
   127  		if !status.IsSuccess() {
   128  			sched.handleBindingCycleError(bindingCycleCtx, state, fwk, assumedPodInfo, start, scheduleResult, status)
   129  			return
   130  		}
   131  		// Usually, DonePod is called inside the scheduling queue,
   132  		// but in this case, we need to call it here because this Pod won't go back to the scheduling queue.
   133  		sched.SchedulingQueue.Done(assumedPodInfo.Pod.UID)
   134  	}()
   135  }
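         // scheduleOne is driven in a loop by the scheduler's Run method. A minimal sketch
         // of that wiring (the exact call site lives in scheduler.go; this line is an
         // illustration, not a quote of it):
         //
         //	// Keep running scheduling cycles, one at a time, until ctx is cancelled.
         //	go wait.UntilWithContext(ctx, sched.scheduleOne, 0)
         //
         // With a period of 0, wait.UntilWithContext re-invokes scheduleOne immediately after
         // each attempt, so at most one scheduling cycle is in flight at a time, while binding
         // cycles from earlier attempts may still be running in their own goroutines.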
   136  
   137  var clearNominatedNode = &framework.NominatingInfo{NominatingMode: framework.ModeOverride, NominatedNodeName: ""}
   138  
   139  // schedulingCycle tries to schedule a single Pod.
   140  func (sched *Scheduler) schedulingCycle(
   141  	ctx context.Context,
   142  	state *framework.CycleState,
   143  	fwk framework.Framework,
   144  	podInfo *framework.QueuedPodInfo,
   145  	start time.Time,
   146  	podsToActivate *framework.PodsToActivate,
   147  ) (ScheduleResult, *framework.QueuedPodInfo, *framework.Status) {
   148  	logger := klog.FromContext(ctx)
   149  	pod := podInfo.Pod
   150  	scheduleResult, err := sched.SchedulePod(ctx, fwk, state, pod)
   151  	if err != nil {
   152  		if err == ErrNoNodesAvailable {
   153  			status := framework.NewStatus(framework.UnschedulableAndUnresolvable).WithError(err)
   154  			return ScheduleResult{nominatingInfo: clearNominatedNode}, podInfo, status
   155  		}
   156  
   157  		fitError, ok := err.(*framework.FitError)
   158  		if !ok {
   159  			logger.Error(err, "Error selecting node for pod", "pod", klog.KObj(pod))
   160  			return ScheduleResult{nominatingInfo: clearNominatedNode}, podInfo, framework.AsStatus(err)
   161  		}
   162  
   163  		// SchedulePod() may have failed because the pod would not fit on any host, so we try to
   164  		// preempt, with the expectation that the next time the pod is tried for scheduling it
   165  		// will fit due to the preemption. It is also possible that a different pod will schedule
   166  		// into the resources that were preempted, but this is harmless.
   167  
   168  		if !fwk.HasPostFilterPlugins() {
   169  			logger.V(3).Info("No PostFilter plugins are registered, so no preemption will be performed")
   170  			return ScheduleResult{}, podInfo, framework.NewStatus(framework.Unschedulable).WithError(err)
   171  		}
   172  
   173  		// Run PostFilter plugins to attempt to make the pod schedulable in a future scheduling cycle.
   174  		result, status := fwk.RunPostFilterPlugins(ctx, state, pod, fitError.Diagnosis.NodeToStatusMap)
   175  		msg := status.Message()
   176  		fitError.Diagnosis.PostFilterMsg = msg
   177  		if status.Code() == framework.Error {
   178  			logger.Error(nil, "Status after running PostFilter plugins for pod", "pod", klog.KObj(pod), "status", msg)
   179  		} else {
   180  			logger.V(5).Info("Status after running PostFilter plugins for pod", "pod", klog.KObj(pod), "status", msg)
   181  		}
   182  
   183  		var nominatingInfo *framework.NominatingInfo
   184  		if result != nil {
   185  			nominatingInfo = result.NominatingInfo
   186  		}
   187  		return ScheduleResult{nominatingInfo: nominatingInfo}, podInfo, framework.NewStatus(framework.Unschedulable).WithError(err)
   188  	}
   189  
   190  	metrics.SchedulingAlgorithmLatency.Observe(metrics.SinceInSeconds(start))
    191  	// Tell the cache to assume that a pod is now running on a given node, even though it hasn't been bound yet.
   192  	// This allows us to keep scheduling without waiting on binding to occur.
   193  	assumedPodInfo := podInfo.DeepCopy()
   194  	assumedPod := assumedPodInfo.Pod
   195  	// assume modifies `assumedPod` by setting NodeName=scheduleResult.SuggestedHost
   196  	err = sched.assume(logger, assumedPod, scheduleResult.SuggestedHost)
   197  	if err != nil {
    198  		// This is most probably the result of a BUG in the retrying logic.
   199  		// We report an error here so that pod scheduling can be retried.
   200  		// This relies on the fact that Error will check if the pod has been bound
   201  		// to a node and if so will not add it back to the unscheduled pods queue
   202  		// (otherwise this would cause an infinite loop).
   203  		return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, framework.AsStatus(err)
   204  	}
   205  
   206  	// Run the Reserve method of reserve plugins.
   207  	if sts := fwk.RunReservePluginsReserve(ctx, state, assumedPod, scheduleResult.SuggestedHost); !sts.IsSuccess() {
   208  		// trigger un-reserve to clean up state associated with the reserved Pod
   209  		fwk.RunReservePluginsUnreserve(ctx, state, assumedPod, scheduleResult.SuggestedHost)
   210  		if forgetErr := sched.Cache.ForgetPod(logger, assumedPod); forgetErr != nil {
   211  			logger.Error(forgetErr, "Scheduler cache ForgetPod failed")
   212  		}
   213  
   214  		if sts.IsRejected() {
   215  			fitErr := &framework.FitError{
   216  				NumAllNodes: 1,
   217  				Pod:         pod,
   218  				Diagnosis: framework.Diagnosis{
   219  					NodeToStatusMap: framework.NodeToStatusMap{scheduleResult.SuggestedHost: sts},
   220  				},
   221  			}
   222  			fitErr.Diagnosis.AddPluginStatus(sts)
   223  			return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, framework.NewStatus(sts.Code()).WithError(fitErr)
   224  		}
   225  		return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, sts
   226  	}
   227  
   228  	// Run "permit" plugins.
   229  	runPermitStatus := fwk.RunPermitPlugins(ctx, state, assumedPod, scheduleResult.SuggestedHost)
   230  	if !runPermitStatus.IsWait() && !runPermitStatus.IsSuccess() {
   231  		// trigger un-reserve to clean up state associated with the reserved Pod
   232  		fwk.RunReservePluginsUnreserve(ctx, state, assumedPod, scheduleResult.SuggestedHost)
   233  		if forgetErr := sched.Cache.ForgetPod(logger, assumedPod); forgetErr != nil {
   234  			logger.Error(forgetErr, "Scheduler cache ForgetPod failed")
   235  		}
   236  
   237  		if runPermitStatus.IsRejected() {
   238  			fitErr := &framework.FitError{
   239  				NumAllNodes: 1,
   240  				Pod:         pod,
   241  				Diagnosis: framework.Diagnosis{
   242  					NodeToStatusMap: framework.NodeToStatusMap{scheduleResult.SuggestedHost: runPermitStatus},
   243  				},
   244  			}
   245  			fitErr.Diagnosis.AddPluginStatus(runPermitStatus)
   246  			return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, framework.NewStatus(runPermitStatus.Code()).WithError(fitErr)
   247  		}
   248  
   249  		return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, runPermitStatus
   250  	}
   251  
   252  	// At the end of a successful scheduling cycle, pop and move up Pods if needed.
   253  	if len(podsToActivate.Map) != 0 {
   254  		sched.SchedulingQueue.Activate(logger, podsToActivate.Map)
   255  		// Clear the entries after activation.
   256  		podsToActivate.Map = make(map[string]*v1.Pod)
   257  	}
   258  
   259  	return scheduleResult, assumedPodInfo, nil
   260  }
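         // Taken together with bindingCycle below, the extension points for a pod that
         // schedules successfully run in this order: PreFilter, Filter, PreScore and Score
         // inside SchedulePod (with PostFilter only on a fit failure), then assume, Reserve
         // and Permit in the scheduling cycle above, and finally WaitOnPermit, PreBind, Bind
         // and PostBind in the binding cycle, which runs in its own goroutine.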
   261  
   262  // bindingCycle tries to bind an assumed Pod.
   263  func (sched *Scheduler) bindingCycle(
   264  	ctx context.Context,
   265  	state *framework.CycleState,
   266  	fwk framework.Framework,
   267  	scheduleResult ScheduleResult,
   268  	assumedPodInfo *framework.QueuedPodInfo,
   269  	start time.Time,
   270  	podsToActivate *framework.PodsToActivate) *framework.Status {
   271  	logger := klog.FromContext(ctx)
   272  
   273  	assumedPod := assumedPodInfo.Pod
   274  
   275  	// Run "permit" plugins.
   276  	if status := fwk.WaitOnPermit(ctx, assumedPod); !status.IsSuccess() {
   277  		if status.IsRejected() {
   278  			fitErr := &framework.FitError{
   279  				NumAllNodes: 1,
   280  				Pod:         assumedPodInfo.Pod,
   281  				Diagnosis: framework.Diagnosis{
   282  					NodeToStatusMap:      framework.NodeToStatusMap{scheduleResult.SuggestedHost: status},
   283  					UnschedulablePlugins: sets.New(status.Plugin()),
   284  				},
   285  			}
   286  			return framework.NewStatus(status.Code()).WithError(fitErr)
   287  		}
   288  		return status
   289  	}
   290  
   291  	// Run "prebind" plugins.
   292  	if status := fwk.RunPreBindPlugins(ctx, state, assumedPod, scheduleResult.SuggestedHost); !status.IsSuccess() {
   293  		return status
   294  	}
   295  
   296  	// Run "bind" plugins.
   297  	if status := sched.bind(ctx, fwk, assumedPod, scheduleResult.SuggestedHost, state); !status.IsSuccess() {
   298  		return status
   299  	}
   300  
   301  	// Calculating nodeResourceString can be heavy. Avoid it if klog verbosity is below 2.
   302  	logger.V(2).Info("Successfully bound pod to node", "pod", klog.KObj(assumedPod), "node", scheduleResult.SuggestedHost, "evaluatedNodes", scheduleResult.EvaluatedNodes, "feasibleNodes", scheduleResult.FeasibleNodes)
   303  	metrics.PodScheduled(fwk.ProfileName(), metrics.SinceInSeconds(start))
   304  	metrics.PodSchedulingAttempts.Observe(float64(assumedPodInfo.Attempts))
   305  	if assumedPodInfo.InitialAttemptTimestamp != nil {
   306  		metrics.PodSchedulingDuration.WithLabelValues(getAttemptsLabel(assumedPodInfo)).Observe(metrics.SinceInSeconds(*assumedPodInfo.InitialAttemptTimestamp))
   307  		metrics.PodSchedulingSLIDuration.WithLabelValues(getAttemptsLabel(assumedPodInfo)).Observe(metrics.SinceInSeconds(*assumedPodInfo.InitialAttemptTimestamp))
   308  	}
   309  	// Run "postbind" plugins.
   310  	fwk.RunPostBindPlugins(ctx, state, assumedPod, scheduleResult.SuggestedHost)
   311  
   312  	// At the end of a successful binding cycle, move up Pods if needed.
   313  	if len(podsToActivate.Map) != 0 {
   314  		sched.SchedulingQueue.Activate(logger, podsToActivate.Map)
   315  		// Unlike the logic in schedulingCycle(), we don't bother deleting the entries
   316  		// as `podsToActivate.Map` is no longer consumed.
   317  	}
   318  
   319  	return nil
   320  }
   321  
   322  func (sched *Scheduler) handleBindingCycleError(
   323  	ctx context.Context,
   324  	state *framework.CycleState,
   325  	fwk framework.Framework,
   326  	podInfo *framework.QueuedPodInfo,
   327  	start time.Time,
   328  	scheduleResult ScheduleResult,
   329  	status *framework.Status) {
   330  	logger := klog.FromContext(ctx)
   331  
   332  	assumedPod := podInfo.Pod
   333  	// trigger un-reserve plugins to clean up state associated with the reserved Pod
   334  	fwk.RunReservePluginsUnreserve(ctx, state, assumedPod, scheduleResult.SuggestedHost)
   335  	if forgetErr := sched.Cache.ForgetPod(logger, assumedPod); forgetErr != nil {
   336  		logger.Error(forgetErr, "scheduler cache ForgetPod failed")
   337  	} else {
    338  		// Forgetting an assumed Pod in the binding cycle should be treated as a PodDelete event,
    339  		// as the assumed Pod had occupied a certain amount of resources in the scheduler cache.
    340  		//
    341  		// Avoid moving the assumed Pod itself as it's always Unschedulable.
    342  		// It's intentional to "defer" this operation; otherwise MoveAllToActiveOrBackoffQueue() would
    343  		// update `q.moveRequest` and thus move the assumed pod to backoffQ anyway.
   344  		if status.IsRejected() {
   345  			defer sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, internalqueue.AssignedPodDelete, assumedPod, nil, func(pod *v1.Pod) bool {
   346  				return assumedPod.UID != pod.UID
   347  			})
   348  		} else {
   349  			sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, internalqueue.AssignedPodDelete, assumedPod, nil, nil)
   350  		}
   351  	}
   352  
   353  	sched.FailureHandler(ctx, fwk, podInfo, status, clearNominatedNode, start)
   354  }
   355  
   356  func (sched *Scheduler) frameworkForPod(pod *v1.Pod) (framework.Framework, error) {
   357  	fwk, ok := sched.Profiles[pod.Spec.SchedulerName]
   358  	if !ok {
   359  		return nil, fmt.Errorf("profile not found for scheduler name %q", pod.Spec.SchedulerName)
   360  	}
   361  	return fwk, nil
   362  }
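         // For illustration only (a hypothetical pod, not taken from this package's tests):
         //
         //	pod := &v1.Pod{Spec: v1.PodSpec{SchedulerName: "default-scheduler"}}
         //	fwk, err := sched.frameworkForPod(pod)
         //	// fwk is the profile registered under "default-scheduler"; a scheduler name
         //	// with no matching profile yields an error and the pod is not scheduled here.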
   363  
   364  // skipPodSchedule returns true if we could skip scheduling the pod for specified cases.
   365  func (sched *Scheduler) skipPodSchedule(ctx context.Context, fwk framework.Framework, pod *v1.Pod) bool {
   366  	// Case 1: pod is being deleted.
   367  	if pod.DeletionTimestamp != nil {
   368  		fwk.EventRecorder().Eventf(pod, nil, v1.EventTypeWarning, "FailedScheduling", "Scheduling", "skip schedule deleting pod: %v/%v", pod.Namespace, pod.Name)
   369  		klog.FromContext(ctx).V(3).Info("Skip schedule deleting pod", "pod", klog.KObj(pod))
   370  		return true
   371  	}
   372  
    373  	// Case 2: a pod that has already been assumed can be skipped.
    374  	// An assumed pod can be re-added to the scheduling queue if it received an update event
    375  	// during its previous scheduling cycle but before getting assumed.
   376  	isAssumed, err := sched.Cache.IsAssumedPod(pod)
   377  	if err != nil {
   378  		// TODO(91633): pass ctx into a revised HandleError
   379  		utilruntime.HandleError(fmt.Errorf("failed to check whether pod %s/%s is assumed: %v", pod.Namespace, pod.Name, err))
   380  		return false
   381  	}
   382  	return isAssumed
   383  }
   384  
   385  // schedulePod tries to schedule the given pod to one of the nodes in the node list.
   386  // If it succeeds, it will return the name of the node.
   387  // If it fails, it will return a FitError with reasons.
   388  func (sched *Scheduler) schedulePod(ctx context.Context, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) (result ScheduleResult, err error) {
   389  	trace := utiltrace.New("Scheduling", utiltrace.Field{Key: "namespace", Value: pod.Namespace}, utiltrace.Field{Key: "name", Value: pod.Name})
   390  	defer trace.LogIfLong(100 * time.Millisecond)
   391  	if err := sched.Cache.UpdateSnapshot(klog.FromContext(ctx), sched.nodeInfoSnapshot); err != nil {
   392  		return result, err
   393  	}
   394  	trace.Step("Snapshotting scheduler cache and node infos done")
   395  
   396  	if sched.nodeInfoSnapshot.NumNodes() == 0 {
   397  		return result, ErrNoNodesAvailable
   398  	}
   399  
   400  	feasibleNodes, diagnosis, err := sched.findNodesThatFitPod(ctx, fwk, state, pod)
   401  	if err != nil {
   402  		return result, err
   403  	}
   404  	trace.Step("Computing predicates done")
   405  
   406  	if len(feasibleNodes) == 0 {
   407  		return result, &framework.FitError{
   408  			Pod:         pod,
   409  			NumAllNodes: sched.nodeInfoSnapshot.NumNodes(),
   410  			Diagnosis:   diagnosis,
   411  		}
   412  	}
   413  
    414  	// When only one node is left after filtering, just use it.
   415  	if len(feasibleNodes) == 1 {
   416  		return ScheduleResult{
   417  			SuggestedHost:  feasibleNodes[0].Name,
   418  			EvaluatedNodes: 1 + len(diagnosis.NodeToStatusMap),
   419  			FeasibleNodes:  1,
   420  		}, nil
   421  	}
   422  
   423  	priorityList, err := prioritizeNodes(ctx, sched.Extenders, fwk, state, pod, feasibleNodes)
   424  	if err != nil {
   425  		return result, err
   426  	}
   427  
   428  	host, _, err := selectHost(priorityList, numberOfHighestScoredNodesToReport)
   429  	trace.Step("Prioritizing done")
   430  
   431  	return ScheduleResult{
   432  		SuggestedHost:  host,
   433  		EvaluatedNodes: len(feasibleNodes) + len(diagnosis.NodeToStatusMap),
   434  		FeasibleNodes:  len(feasibleNodes),
   435  	}, err
   436  }
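         // In short, schedulePod is: snapshot the cache, filter (findNodesThatFitPod),
         // score (prioritizeNodes), then pick a host (selectHost). The single-feasible-node
         // fast path above skips scoring entirely.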
   437  
   438  // Filters the nodes to find the ones that fit the pod based on the framework
   439  // filter plugins and filter extenders.
   440  func (sched *Scheduler) findNodesThatFitPod(ctx context.Context, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) ([]*v1.Node, framework.Diagnosis, error) {
   441  	logger := klog.FromContext(ctx)
   442  	diagnosis := framework.Diagnosis{
   443  		NodeToStatusMap: make(framework.NodeToStatusMap),
   444  	}
   445  
   446  	allNodes, err := sched.nodeInfoSnapshot.NodeInfos().List()
   447  	if err != nil {
   448  		return nil, diagnosis, err
   449  	}
   450  	// Run "prefilter" plugins.
   451  	preRes, s := fwk.RunPreFilterPlugins(ctx, state, pod)
   452  	if !s.IsSuccess() {
   453  		if !s.IsRejected() {
   454  			return nil, diagnosis, s.AsError()
   455  		}
    456  		// All nodes in NodeToStatusMap will have the same status so that they can be handled in preemption.
    457  		// Some non-trivial refactoring is needed to avoid this copy.
   458  		for _, n := range allNodes {
   459  			diagnosis.NodeToStatusMap[n.Node().Name] = s
   460  		}
   461  
   462  		// Record the messages from PreFilter in Diagnosis.PreFilterMsg.
   463  		msg := s.Message()
   464  		diagnosis.PreFilterMsg = msg
   465  		logger.V(5).Info("Status after running PreFilter plugins for pod", "pod", klog.KObj(pod), "status", msg)
   466  		diagnosis.AddPluginStatus(s)
   467  		return nil, diagnosis, nil
   468  	}
   469  
   470  	// "NominatedNodeName" can potentially be set in a previous scheduling cycle as a result of preemption.
   471  	// This node is likely the only candidate that will fit the pod, and hence we try it first before iterating over all nodes.
   472  	if len(pod.Status.NominatedNodeName) > 0 {
   473  		feasibleNodes, err := sched.evaluateNominatedNode(ctx, pod, fwk, state, diagnosis)
   474  		if err != nil {
   475  			logger.Error(err, "Evaluation failed on nominated node", "pod", klog.KObj(pod), "node", pod.Status.NominatedNodeName)
   476  		}
    477  		// The nominated node passes all the filters; the scheduler is good to assign this node to the pod.
   478  		if len(feasibleNodes) != 0 {
   479  			return feasibleNodes, diagnosis, nil
   480  		}
   481  	}
   482  
   483  	nodes := allNodes
   484  	if !preRes.AllNodes() {
   485  		nodes = make([]*framework.NodeInfo, 0, len(preRes.NodeNames))
   486  		for n := range preRes.NodeNames {
   487  			nInfo, err := sched.nodeInfoSnapshot.NodeInfos().Get(n)
   488  			if err != nil {
   489  				return nil, diagnosis, err
   490  			}
   491  			nodes = append(nodes, nInfo)
   492  		}
   493  	}
   494  	feasibleNodes, err := sched.findNodesThatPassFilters(ctx, fwk, state, pod, &diagnosis, nodes)
    495  	// Always try to update sched.nextStartNodeIndex regardless of whether an error has occurred.
    496  	// This helps make sure that all the nodes have a chance to be searched.
   497  	processedNodes := len(feasibleNodes) + len(diagnosis.NodeToStatusMap)
   498  	sched.nextStartNodeIndex = (sched.nextStartNodeIndex + processedNodes) % len(nodes)
   499  	if err != nil {
   500  		return nil, diagnosis, err
   501  	}
   502  
   503  	feasibleNodes, err = findNodesThatPassExtenders(ctx, sched.Extenders, pod, feasibleNodes, diagnosis.NodeToStatusMap)
   504  	if err != nil {
   505  		return nil, diagnosis, err
   506  	}
   507  	return feasibleNodes, diagnosis, nil
   508  }
   509  
   510  func (sched *Scheduler) evaluateNominatedNode(ctx context.Context, pod *v1.Pod, fwk framework.Framework, state *framework.CycleState, diagnosis framework.Diagnosis) ([]*v1.Node, error) {
   511  	nnn := pod.Status.NominatedNodeName
   512  	nodeInfo, err := sched.nodeInfoSnapshot.Get(nnn)
   513  	if err != nil {
   514  		return nil, err
   515  	}
   516  	node := []*framework.NodeInfo{nodeInfo}
   517  	feasibleNodes, err := sched.findNodesThatPassFilters(ctx, fwk, state, pod, &diagnosis, node)
   518  	if err != nil {
   519  		return nil, err
   520  	}
   521  
   522  	feasibleNodes, err = findNodesThatPassExtenders(ctx, sched.Extenders, pod, feasibleNodes, diagnosis.NodeToStatusMap)
   523  	if err != nil {
   524  		return nil, err
   525  	}
   526  
   527  	return feasibleNodes, nil
   528  }
   529  
   530  // findNodesThatPassFilters finds the nodes that fit the filter plugins.
   531  func (sched *Scheduler) findNodesThatPassFilters(
   532  	ctx context.Context,
   533  	fwk framework.Framework,
   534  	state *framework.CycleState,
   535  	pod *v1.Pod,
   536  	diagnosis *framework.Diagnosis,
   537  	nodes []*framework.NodeInfo) ([]*v1.Node, error) {
   538  	numAllNodes := len(nodes)
   539  	numNodesToFind := sched.numFeasibleNodesToFind(fwk.PercentageOfNodesToScore(), int32(numAllNodes))
   540  
    541  	// Create the feasible list with enough space to avoid growing it
    542  	// and to allow assigning by index.
   543  	feasibleNodes := make([]*v1.Node, numNodesToFind)
   544  
   545  	if !fwk.HasFilterPlugins() {
   546  		for i := range feasibleNodes {
   547  			feasibleNodes[i] = nodes[(sched.nextStartNodeIndex+i)%numAllNodes].Node()
   548  		}
   549  		return feasibleNodes, nil
   550  	}
   551  
   552  	errCh := parallelize.NewErrorChannel()
   553  	var statusesLock sync.Mutex
   554  	var feasibleNodesLen int32
   555  	ctx, cancel := context.WithCancel(ctx)
   556  	defer cancel()
   557  	checkNode := func(i int) {
    558  		// We check the nodes starting from where we left off in the previous scheduling cycle;
    559  		// this is to make sure all nodes have the same chance of being examined across pods.
   560  		nodeInfo := nodes[(sched.nextStartNodeIndex+i)%numAllNodes]
   561  		status := fwk.RunFilterPluginsWithNominatedPods(ctx, state, pod, nodeInfo)
   562  		if status.Code() == framework.Error {
   563  			errCh.SendErrorWithCancel(status.AsError(), cancel)
   564  			return
   565  		}
   566  		if status.IsSuccess() {
   567  			length := atomic.AddInt32(&feasibleNodesLen, 1)
   568  			if length > numNodesToFind {
   569  				cancel()
   570  				atomic.AddInt32(&feasibleNodesLen, -1)
   571  			} else {
   572  				feasibleNodes[length-1] = nodeInfo.Node()
   573  			}
   574  		} else {
   575  			statusesLock.Lock()
   576  			diagnosis.NodeToStatusMap[nodeInfo.Node().Name] = status
   577  			diagnosis.AddPluginStatus(status)
   578  			statusesLock.Unlock()
   579  		}
   580  	}
   581  
   582  	beginCheckNode := time.Now()
   583  	statusCode := framework.Success
   584  	defer func() {
   585  		// We record Filter extension point latency here instead of in framework.go because framework.RunFilterPlugins
   586  		// function is called for each node, whereas we want to have an overall latency for all nodes per scheduling cycle.
   587  		// Note that this latency also includes latency for `addNominatedPods`, which calls framework.RunPreFilterAddPod.
   588  		metrics.FrameworkExtensionPointDuration.WithLabelValues(metrics.Filter, statusCode.String(), fwk.ProfileName()).Observe(metrics.SinceInSeconds(beginCheckNode))
   589  	}()
   590  
    591  	// Stop searching for more nodes once the configured number of feasible nodes
    592  	// is found.
   593  	fwk.Parallelizer().Until(ctx, numAllNodes, checkNode, metrics.Filter)
   594  	feasibleNodes = feasibleNodes[:feasibleNodesLen]
   595  	if err := errCh.ReceiveError(); err != nil {
   596  		statusCode = framework.Error
   597  		return feasibleNodes, err
   598  	}
   599  	return feasibleNodes, nil
   600  }
   601  
    602  // numFeasibleNodesToFind returns the number of feasible nodes that, once found, makes the scheduler
    603  // stop searching for more feasible nodes.
   604  func (sched *Scheduler) numFeasibleNodesToFind(percentageOfNodesToScore *int32, numAllNodes int32) (numNodes int32) {
   605  	if numAllNodes < minFeasibleNodesToFind {
   606  		return numAllNodes
   607  	}
   608  
   609  	// Use profile percentageOfNodesToScore if it's set. Otherwise, use global percentageOfNodesToScore.
   610  	var percentage int32
   611  	if percentageOfNodesToScore != nil {
   612  		percentage = *percentageOfNodesToScore
   613  	} else {
   614  		percentage = sched.percentageOfNodesToScore
   615  	}
   616  
   617  	if percentage == 0 {
   618  		percentage = int32(50) - numAllNodes/125
   619  		if percentage < minFeasibleNodesPercentageToFind {
   620  			percentage = minFeasibleNodesPercentageToFind
   621  		}
   622  	}
   623  
   624  	numNodes = numAllNodes * percentage / 100
   625  	if numNodes < minFeasibleNodesToFind {
   626  		return minFeasibleNodesToFind
   627  	}
   628  
   629  	return numNodes
   630  }
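         // A worked example of the adaptive percentage above (cluster sizes are illustrative):
         //
         //	// 5000 nodes, percentageOfNodesToScore left unset (0):
         //	//   percentage = 50 - 5000/125 = 10  ->  5000 * 10 / 100 = 500 nodes searched.
         //	// 200 nodes:
         //	//   percentage = 50 - 200/125 = 49   ->  200 * 49 / 100 = 98, which is below
         //	//   minFeasibleNodesToFind, so 100 nodes are searched.
         //	// Fewer than 100 nodes: every node is searched.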
   631  
   632  func findNodesThatPassExtenders(ctx context.Context, extenders []framework.Extender, pod *v1.Pod, feasibleNodes []*v1.Node, statuses framework.NodeToStatusMap) ([]*v1.Node, error) {
   633  	logger := klog.FromContext(ctx)
    634  	// Extenders are called sequentially.
    635  	// Each extender can exclude nodes from the current feasibleNodes, so the list passed
    636  	// on to the next extender only ever shrinks.
   637  	for _, extender := range extenders {
   638  		if len(feasibleNodes) == 0 {
   639  			break
   640  		}
   641  		if !extender.IsInterested(pod) {
   642  			continue
   643  		}
   644  
   645  		// Status of failed nodes in failedAndUnresolvableMap will be added or overwritten in <statuses>,
   646  		// so that the scheduler framework can respect the UnschedulableAndUnresolvable status for
   647  		// particular nodes, and this may eventually improve preemption efficiency.
    648  		// Note: it is recommended to configure extenders that may return UnschedulableAndUnresolvable
    649  		// status ahead of the others.
   650  		feasibleList, failedMap, failedAndUnresolvableMap, err := extender.Filter(pod, feasibleNodes)
   651  		if err != nil {
   652  			if extender.IsIgnorable() {
   653  				logger.Info("Skipping extender as it returned error and has ignorable flag set", "extender", extender, "err", err)
   654  				continue
   655  			}
   656  			return nil, err
   657  		}
   658  
   659  		for failedNodeName, failedMsg := range failedAndUnresolvableMap {
   660  			var aggregatedReasons []string
   661  			if _, found := statuses[failedNodeName]; found {
   662  				aggregatedReasons = statuses[failedNodeName].Reasons()
   663  			}
   664  			aggregatedReasons = append(aggregatedReasons, failedMsg)
   665  			statuses[failedNodeName] = framework.NewStatus(framework.UnschedulableAndUnresolvable, aggregatedReasons...)
   666  		}
   667  
   668  		for failedNodeName, failedMsg := range failedMap {
   669  			if _, found := failedAndUnresolvableMap[failedNodeName]; found {
   670  				// failedAndUnresolvableMap takes precedence over failedMap
   671  				// note that this only happens if the extender returns the node in both maps
   672  				continue
   673  			}
   674  			if _, found := statuses[failedNodeName]; !found {
   675  				statuses[failedNodeName] = framework.NewStatus(framework.Unschedulable, failedMsg)
   676  			} else {
   677  				statuses[failedNodeName].AppendReason(failedMsg)
   678  			}
   679  		}
   680  
   681  		feasibleNodes = feasibleList
   682  	}
   683  	return feasibleNodes, nil
   684  }
   685  
    686  // prioritizeNodes prioritizes the nodes by running the score plugins,
    687  // which return a score for each node from the call to RunScorePlugins().
    688  // The scores from each plugin are added together to make the score for that node; then
    689  // any extenders are run as well.
    690  // All scores are finally combined (added) to get the total weighted score for each node.
   691  func prioritizeNodes(
   692  	ctx context.Context,
   693  	extenders []framework.Extender,
   694  	fwk framework.Framework,
   695  	state *framework.CycleState,
   696  	pod *v1.Pod,
   697  	nodes []*v1.Node,
   698  ) ([]framework.NodePluginScores, error) {
   699  	logger := klog.FromContext(ctx)
    700  	// If no score plugins or extenders are configured, then all nodes will have a score of one.
    701  	// This is required to generate the priority list in the required format.
   702  	if len(extenders) == 0 && !fwk.HasScorePlugins() {
   703  		result := make([]framework.NodePluginScores, 0, len(nodes))
   704  		for i := range nodes {
   705  			result = append(result, framework.NodePluginScores{
   706  				Name:       nodes[i].Name,
   707  				TotalScore: 1,
   708  			})
   709  		}
   710  		return result, nil
   711  	}
   712  
   713  	// Run PreScore plugins.
   714  	preScoreStatus := fwk.RunPreScorePlugins(ctx, state, pod, nodes)
   715  	if !preScoreStatus.IsSuccess() {
   716  		return nil, preScoreStatus.AsError()
   717  	}
   718  
   719  	// Run the Score plugins.
   720  	nodesScores, scoreStatus := fwk.RunScorePlugins(ctx, state, pod, nodes)
   721  	if !scoreStatus.IsSuccess() {
   722  		return nil, scoreStatus.AsError()
   723  	}
   724  
   725  	// Additional details logged at level 10 if enabled.
   726  	loggerVTen := logger.V(10)
   727  	if loggerVTen.Enabled() {
   728  		for _, nodeScore := range nodesScores {
   729  			for _, pluginScore := range nodeScore.Scores {
   730  				loggerVTen.Info("Plugin scored node for pod", "pod", klog.KObj(pod), "plugin", pluginScore.Name, "node", nodeScore.Name, "score", pluginScore.Score)
   731  			}
   732  		}
   733  	}
   734  
   735  	if len(extenders) != 0 && nodes != nil {
   736  		// allNodeExtendersScores has all extenders scores for all nodes.
   737  		// It is keyed with node name.
   738  		allNodeExtendersScores := make(map[string]*framework.NodePluginScores, len(nodes))
   739  		var mu sync.Mutex
   740  		var wg sync.WaitGroup
   741  		for i := range extenders {
   742  			if !extenders[i].IsInterested(pod) {
   743  				continue
   744  			}
   745  			wg.Add(1)
   746  			go func(extIndex int) {
   747  				metrics.Goroutines.WithLabelValues(metrics.PrioritizingExtender).Inc()
   748  				defer func() {
   749  					metrics.Goroutines.WithLabelValues(metrics.PrioritizingExtender).Dec()
   750  					wg.Done()
   751  				}()
   752  				prioritizedList, weight, err := extenders[extIndex].Prioritize(pod, nodes)
   753  				if err != nil {
   754  					// Prioritization errors from extender can be ignored, let k8s/other extenders determine the priorities
   755  					logger.V(5).Info("Failed to run extender's priority function. No score given by this extender.", "error", err, "pod", klog.KObj(pod), "extender", extenders[extIndex].Name())
   756  					return
   757  				}
   758  				mu.Lock()
   759  				defer mu.Unlock()
   760  				for i := range *prioritizedList {
   761  					nodename := (*prioritizedList)[i].Host
   762  					score := (*prioritizedList)[i].Score
   763  					if loggerVTen.Enabled() {
   764  						loggerVTen.Info("Extender scored node for pod", "pod", klog.KObj(pod), "extender", extenders[extIndex].Name(), "node", nodename, "score", score)
   765  					}
   766  
   767  					// MaxExtenderPriority may diverge from the max priority used in the scheduler and defined by MaxNodeScore,
   768  					// therefore we need to scale the score returned by extenders to the score range used by the scheduler.
   769  					finalscore := score * weight * (framework.MaxNodeScore / extenderv1.MaxExtenderPriority)
   770  
   771  					if allNodeExtendersScores[nodename] == nil {
   772  						allNodeExtendersScores[nodename] = &framework.NodePluginScores{
   773  							Name:   nodename,
   774  							Scores: make([]framework.PluginScore, 0, len(extenders)),
   775  						}
   776  					}
   777  					allNodeExtendersScores[nodename].Scores = append(allNodeExtendersScores[nodename].Scores, framework.PluginScore{
   778  						Name:  extenders[extIndex].Name(),
   779  						Score: finalscore,
   780  					})
   781  					allNodeExtendersScores[nodename].TotalScore += finalscore
   782  				}
   783  			}(i)
   784  		}
    785  		// Wait for all goroutines to finish.
   786  		wg.Wait()
   787  		for i := range nodesScores {
   788  			if score, ok := allNodeExtendersScores[nodes[i].Name]; ok {
   789  				nodesScores[i].Scores = append(nodesScores[i].Scores, score.Scores...)
   790  				nodesScores[i].TotalScore += score.TotalScore
   791  			}
   792  		}
   793  	}
   794  
   795  	if loggerVTen.Enabled() {
   796  		for i := range nodesScores {
   797  			loggerVTen.Info("Calculated node's final score for pod", "pod", klog.KObj(pod), "node", nodesScores[i].Name, "score", nodesScores[i].TotalScore)
   798  		}
   799  	}
   800  	return nodesScores, nil
   801  }
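         // Note on the extender score scaling above: extenders report priorities on a
         // 0..extenderv1.MaxExtenderPriority scale, while plugins score on a
         // 0..framework.MaxNodeScore scale. As an illustrative calculation, assuming those
         // constants are 10 and 100 respectively, an extender priority of 5 with weight 2
         // contributes 5 * 2 * (100 / 10) = 100 to that node's TotalScore.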
   802  
   803  var errEmptyPriorityList = errors.New("empty priorityList")
   804  
   805  // selectHost takes a prioritized list of nodes and then picks one
   806  // in a reservoir sampling manner from the nodes that had the highest score.
   807  // It also returns the top {count} Nodes,
    808  // and the top of the list will always be the selected host.
   809  func selectHost(nodeScoreList []framework.NodePluginScores, count int) (string, []framework.NodePluginScores, error) {
   810  	if len(nodeScoreList) == 0 {
   811  		return "", nil, errEmptyPriorityList
   812  	}
   813  
   814  	var h nodeScoreHeap = nodeScoreList
   815  	heap.Init(&h)
   816  	cntOfMaxScore := 1
   817  	selectedIndex := 0
    818  	// The top of the heap is the NodePluginScores entry with the highest score.
   819  	sortedNodeScoreList := make([]framework.NodePluginScores, 0, count)
   820  	sortedNodeScoreList = append(sortedNodeScoreList, heap.Pop(&h).(framework.NodePluginScores))
   821  
    822  	// This for-loop continues until all Nodes with the highest score have been considered for reservoir sampling,
    823  	// and sortedNodeScoreList has gathered at least count elements (or the heap is exhausted).
   824  	for ns := heap.Pop(&h).(framework.NodePluginScores); ; ns = heap.Pop(&h).(framework.NodePluginScores) {
   825  		if ns.TotalScore != sortedNodeScoreList[0].TotalScore && len(sortedNodeScoreList) == count {
   826  			break
   827  		}
   828  
   829  		if ns.TotalScore == sortedNodeScoreList[0].TotalScore {
   830  			cntOfMaxScore++
   831  			if rand.Intn(cntOfMaxScore) == 0 {
   832  				// Replace the candidate with probability of 1/cntOfMaxScore
   833  				selectedIndex = cntOfMaxScore - 1
   834  			}
   835  		}
   836  
   837  		sortedNodeScoreList = append(sortedNodeScoreList, ns)
   838  
   839  		if h.Len() == 0 {
   840  			break
   841  		}
   842  	}
   843  
   844  	if selectedIndex != 0 {
    845  		// Replace the first one with the selected one.
   846  		previous := sortedNodeScoreList[0]
   847  		sortedNodeScoreList[0] = sortedNodeScoreList[selectedIndex]
   848  		sortedNodeScoreList[selectedIndex] = previous
   849  	}
   850  
   851  	if len(sortedNodeScoreList) > count {
   852  		sortedNodeScoreList = sortedNodeScoreList[:count]
   853  	}
   854  
   855  	return sortedNodeScoreList[0].Name, sortedNodeScoreList, nil
   856  }
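         // A minimal usage sketch with made-up scores: when two nodes tie for the highest
         // score, either may be returned as the host with equal probability, and the winner
         // is always first in the returned list.
         //
         //	scores := []framework.NodePluginScores{
         //		{Name: "node-a", TotalScore: 90},
         //		{Name: "node-b", TotalScore: 90},
         //		{Name: "node-c", TotalScore: 70},
         //	}
         //	host, top, err := selectHost(scores, numberOfHighestScoredNodesToReport)
         //	// host is "node-a" or "node-b"; top holds at most 3 entries, with host first.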
   857  
   858  // nodeScoreHeap is a heap of framework.NodePluginScores.
   859  type nodeScoreHeap []framework.NodePluginScores
   860  
   861  // nodeScoreHeap implements heap.Interface.
   862  var _ heap.Interface = &nodeScoreHeap{}
   863  
   864  func (h nodeScoreHeap) Len() int           { return len(h) }
   865  func (h nodeScoreHeap) Less(i, j int) bool { return h[i].TotalScore > h[j].TotalScore }
   866  func (h nodeScoreHeap) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }
   867  
   868  func (h *nodeScoreHeap) Push(x interface{}) {
   869  	*h = append(*h, x.(framework.NodePluginScores))
   870  }
   871  
   872  func (h *nodeScoreHeap) Pop() interface{} {
   873  	old := *h
   874  	n := len(old)
   875  	x := old[n-1]
   876  	*h = old[0 : n-1]
   877  	return x
   878  }
   879  
   880  // assume signals to the cache that a pod is already in the cache, so that binding can be asynchronous.
   881  // assume modifies `assumed`.
   882  func (sched *Scheduler) assume(logger klog.Logger, assumed *v1.Pod, host string) error {
   883  	// Optimistically assume that the binding will succeed and send it to apiserver
   884  	// in the background.
    885  	// If the binding fails, the scheduler will release the resources allocated to the assumed pod
    886  	// immediately.
   887  	assumed.Spec.NodeName = host
   888  
   889  	if err := sched.Cache.AssumePod(logger, assumed); err != nil {
   890  		logger.Error(err, "Scheduler cache AssumePod failed")
   891  		return err
   892  	}
    893  	// If "assumed" is a nominated pod, we should remove it from the internal cache.
   894  	if sched.SchedulingQueue != nil {
   895  		sched.SchedulingQueue.DeleteNominatedPodIfExists(assumed)
   896  	}
   897  
   898  	return nil
   899  }
   900  
   901  // bind binds a pod to a given node defined in a binding object.
   902  // The precedence for binding is: (1) extenders and (2) framework plugins.
   903  // We expect this to run asynchronously, so we handle binding metrics internally.
   904  func (sched *Scheduler) bind(ctx context.Context, fwk framework.Framework, assumed *v1.Pod, targetNode string, state *framework.CycleState) (status *framework.Status) {
   905  	logger := klog.FromContext(ctx)
   906  	defer func() {
   907  		sched.finishBinding(logger, fwk, assumed, targetNode, status)
   908  	}()
   909  
   910  	bound, err := sched.extendersBinding(assumed, targetNode)
   911  	if bound {
   912  		return framework.AsStatus(err)
   913  	}
   914  	return fwk.RunBindPlugins(ctx, state, assumed, targetNode)
   915  }
   916  
   917  // TODO(#87159): Move this to a Plugin.
   918  func (sched *Scheduler) extendersBinding(pod *v1.Pod, node string) (bool, error) {
   919  	for _, extender := range sched.Extenders {
   920  		if !extender.IsBinder() || !extender.IsInterested(pod) {
   921  			continue
   922  		}
   923  		return true, extender.Bind(&v1.Binding{
   924  			ObjectMeta: metav1.ObjectMeta{Namespace: pod.Namespace, Name: pod.Name, UID: pod.UID},
   925  			Target:     v1.ObjectReference{Kind: "Node", Name: node},
   926  		})
   927  	}
   928  	return false, nil
   929  }
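         // For illustration (a hypothetical configuration): if two extenders are configured
         // and only the second is a binder interested in the pod, extendersBinding delegates
         // the binding to that extender and returns true, so RunBindPlugins is never called
         // for this pod. With no interested binder extenders it returns false and the
         // framework's bind plugins take over.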
   930  
   931  func (sched *Scheduler) finishBinding(logger klog.Logger, fwk framework.Framework, assumed *v1.Pod, targetNode string, status *framework.Status) {
   932  	if finErr := sched.Cache.FinishBinding(logger, assumed); finErr != nil {
   933  		logger.Error(finErr, "Scheduler cache FinishBinding failed")
   934  	}
   935  	if !status.IsSuccess() {
   936  		logger.V(1).Info("Failed to bind pod", "pod", klog.KObj(assumed))
   937  		return
   938  	}
   939  
   940  	fwk.EventRecorder().Eventf(assumed, nil, v1.EventTypeNormal, "Scheduled", "Binding", "Successfully assigned %v/%v to %v", assumed.Namespace, assumed.Name, targetNode)
   941  }
   942  
   943  func getAttemptsLabel(p *framework.QueuedPodInfo) string {
    944  	// We break down the pod scheduling duration by attempts, capped to a limit
    945  	// to avoid ending up with a high-cardinality metric.
   946  	if p.Attempts >= 15 {
   947  		return "15+"
   948  	}
   949  	return strconv.Itoa(p.Attempts)
   950  }
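         // For example, a pod on its 3rd attempt is labeled "3", while the 15th and any
         // later attempts all share the "15+" bucket.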
   951  
    952  // handleSchedulingFailure records an event for the pod that indicates the
    953  // pod has failed to schedule. It also updates the pod condition and the nominated node name, if set.
   954  func (sched *Scheduler) handleSchedulingFailure(ctx context.Context, fwk framework.Framework, podInfo *framework.QueuedPodInfo, status *framework.Status, nominatingInfo *framework.NominatingInfo, start time.Time) {
   955  	calledDone := false
   956  	defer func() {
   957  		if !calledDone {
   958  			// Basically, AddUnschedulableIfNotPresent calls DonePod internally.
   959  			// But, AddUnschedulableIfNotPresent isn't called in some corner cases.
   960  			// Here, we call DonePod explicitly to avoid leaking the pod.
   961  			sched.SchedulingQueue.Done(podInfo.Pod.UID)
   962  		}
   963  	}()
   964  
   965  	logger := klog.FromContext(ctx)
   966  	reason := v1.PodReasonSchedulerError
   967  	if status.IsRejected() {
   968  		reason = v1.PodReasonUnschedulable
   969  	}
   970  
   971  	switch reason {
   972  	case v1.PodReasonUnschedulable:
   973  		metrics.PodUnschedulable(fwk.ProfileName(), metrics.SinceInSeconds(start))
   974  	case v1.PodReasonSchedulerError:
   975  		metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start))
   976  	}
   977  
   978  	pod := podInfo.Pod
   979  	err := status.AsError()
   980  	errMsg := status.Message()
   981  
   982  	if err == ErrNoNodesAvailable {
   983  		logger.V(2).Info("Unable to schedule pod; no nodes are registered to the cluster; waiting", "pod", klog.KObj(pod))
   984  	} else if fitError, ok := err.(*framework.FitError); ok { // Inject UnschedulablePlugins to PodInfo, which will be used later for moving Pods between queues efficiently.
   985  		podInfo.UnschedulablePlugins = fitError.Diagnosis.UnschedulablePlugins
   986  		podInfo.PendingPlugins = fitError.Diagnosis.PendingPlugins
   987  		logger.V(2).Info("Unable to schedule pod; no fit; waiting", "pod", klog.KObj(pod), "err", errMsg)
   988  	} else if apierrors.IsNotFound(err) {
   989  		logger.V(2).Info("Unable to schedule pod, possibly due to node not found; waiting", "pod", klog.KObj(pod), "err", errMsg)
   990  		if errStatus, ok := err.(apierrors.APIStatus); ok && errStatus.Status().Details.Kind == "node" {
   991  			nodeName := errStatus.Status().Details.Name
    992  			// When the node is not found, we do not remove it right away. We try to get
    993  			// the node again, and only if it is still not found do we remove it from the scheduler cache.
   994  			_, err := fwk.ClientSet().CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
   995  			if err != nil && apierrors.IsNotFound(err) {
   996  				node := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}}
   997  				if err := sched.Cache.RemoveNode(logger, &node); err != nil {
   998  					logger.V(4).Info("Node is not found; failed to remove it from the cache", "node", node.Name)
   999  				}
  1000  			}
  1001  		}
  1002  	} else {
  1003  		logger.Error(err, "Error scheduling pod; retrying", "pod", klog.KObj(pod))
  1004  	}
  1005  
  1006  	// Check if the Pod exists in informer cache.
  1007  	podLister := fwk.SharedInformerFactory().Core().V1().Pods().Lister()
  1008  	cachedPod, e := podLister.Pods(pod.Namespace).Get(pod.Name)
  1009  	if e != nil {
  1010  		logger.Info("Pod doesn't exist in informer cache", "pod", klog.KObj(pod), "err", e)
  1011  		// We need to call DonePod here because we don't call AddUnschedulableIfNotPresent in this case.
  1012  	} else {
   1013  		// In the case of an extender, the pod may have been bound successfully, but the response timed out before reaching the scheduler.
   1014  		// This could result in the live version carrying .spec.nodeName, which is inconsistent with the internally queued version.
  1015  		if len(cachedPod.Spec.NodeName) != 0 {
  1016  			logger.Info("Pod has been assigned to node. Abort adding it back to queue.", "pod", klog.KObj(pod), "node", cachedPod.Spec.NodeName)
  1017  			// We need to call DonePod here because we don't call AddUnschedulableIfNotPresent in this case.
  1018  		} else {
  1019  			// As <cachedPod> is from SharedInformer, we need to do a DeepCopy() here.
   1020  			// Ignore this err since the apiserver doesn't properly validate affinity terms
   1021  			// and we can't fix the validation for backwards-compatibility reasons.
  1022  			podInfo.PodInfo, _ = framework.NewPodInfo(cachedPod.DeepCopy())
  1023  			if err := sched.SchedulingQueue.AddUnschedulableIfNotPresent(logger, podInfo, sched.SchedulingQueue.SchedulingCycle()); err != nil {
  1024  				logger.Error(err, "Error occurred")
  1025  			}
  1026  			calledDone = true
  1027  		}
  1028  	}
  1029  
  1030  	// Update the scheduling queue with the nominated pod information. Without
  1031  	// this, there would be a race condition between the next scheduling cycle
  1032  	// and the time the scheduler receives a Pod Update for the nominated pod.
  1033  	// Here we check for nil only for tests.
  1034  	if sched.SchedulingQueue != nil {
  1035  		logger := klog.FromContext(ctx)
  1036  		sched.SchedulingQueue.AddNominatedPod(logger, podInfo.PodInfo, nominatingInfo)
  1037  	}
  1038  
  1039  	if err == nil {
  1040  		// Only tests can reach here.
  1041  		return
  1042  	}
  1043  
  1044  	msg := truncateMessage(errMsg)
  1045  	fwk.EventRecorder().Eventf(pod, nil, v1.EventTypeWarning, "FailedScheduling", "Scheduling", msg)
  1046  	if err := updatePod(ctx, sched.client, pod, &v1.PodCondition{
  1047  		Type:    v1.PodScheduled,
  1048  		Status:  v1.ConditionFalse,
  1049  		Reason:  reason,
  1050  		Message: errMsg,
  1051  	}, nominatingInfo); err != nil {
  1052  		klog.FromContext(ctx).Error(err, "Error updating pod", "pod", klog.KObj(pod))
  1053  	}
  1054  }
  1055  
  1056  // truncateMessage truncates a message if it hits the NoteLengthLimit.
  1057  func truncateMessage(message string) string {
  1058  	max := validation.NoteLengthLimit
  1059  	if len(message) <= max {
  1060  		return message
  1061  	}
  1062  	suffix := " ..."
  1063  	return message[:max-len(suffix)] + suffix
  1064  }
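         // For example, a message longer than validation.NoteLengthLimit is cut to
         // NoteLengthLimit-4 characters and suffixed with " ...", so the result never
         // exceeds the limit.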
  1065  
  1066  func updatePod(ctx context.Context, client clientset.Interface, pod *v1.Pod, condition *v1.PodCondition, nominatingInfo *framework.NominatingInfo) error {
  1067  	logger := klog.FromContext(ctx)
  1068  	logger.V(3).Info("Updating pod condition", "pod", klog.KObj(pod), "conditionType", condition.Type, "conditionStatus", condition.Status, "conditionReason", condition.Reason)
  1069  	podStatusCopy := pod.Status.DeepCopy()
  1070  	// NominatedNodeName is updated only if we are trying to set it, and the value is
  1071  	// different from the existing one.
  1072  	nnnNeedsUpdate := nominatingInfo.Mode() == framework.ModeOverride && pod.Status.NominatedNodeName != nominatingInfo.NominatedNodeName
  1073  	if !podutil.UpdatePodCondition(podStatusCopy, condition) && !nnnNeedsUpdate {
  1074  		return nil
  1075  	}
  1076  	if nnnNeedsUpdate {
  1077  		podStatusCopy.NominatedNodeName = nominatingInfo.NominatedNodeName
  1078  	}
  1079  	return util.PatchPodStatus(ctx, client, pod, podStatusCopy)
   1080  	return util.PatchPodStatus(ctx, client, pod, podStatusCopy)
   1081  }