k8s.io/kubernetes@v1.29.3/pkg/scheduler/framework/preemption/preemption.go

     1  /*
     2  Copyright 2021 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package preemption
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	"fmt"
    23  	"math"
    24  	"sync"
    25  	"sync/atomic"
    26  
    27  	v1 "k8s.io/api/core/v1"
    28  	policy "k8s.io/api/policy/v1"
    29  	"k8s.io/apimachinery/pkg/labels"
    30  	utilerrors "k8s.io/apimachinery/pkg/util/errors"
    31  	"k8s.io/apiserver/pkg/util/feature"
    32  	corelisters "k8s.io/client-go/listers/core/v1"
    33  	policylisters "k8s.io/client-go/listers/policy/v1"
    34  	corev1helpers "k8s.io/component-helpers/scheduling/corev1"
    35  	"k8s.io/klog/v2"
    36  	extenderv1 "k8s.io/kube-scheduler/extender/v1"
    37  	apipod "k8s.io/kubernetes/pkg/api/v1/pod"
    38  	"k8s.io/kubernetes/pkg/features"
    39  	"k8s.io/kubernetes/pkg/scheduler/framework"
    40  	"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
    41  	"k8s.io/kubernetes/pkg/scheduler/metrics"
    42  	"k8s.io/kubernetes/pkg/scheduler/util"
    43  )
    44  
    45  // Candidate represents a nominated node on which the preemptor can be scheduled,
    46  // along with the list of victims that should be evicted for the preemptor to fit the node.
    47  type Candidate interface {
    48  	// Victims wraps a list of to-be-preempted Pods and the number of PDB violations.
    49  	Victims() *extenderv1.Victims
    50  	// Name returns the target node name where the preemptor gets nominated to run.
    51  	Name() string
    52  }
    53  
    54  type candidate struct {
    55  	victims *extenderv1.Victims
    56  	name    string
    57  }
    58  
    59  // Victims returns s.victims.
    60  func (s *candidate) Victims() *extenderv1.Victims {
    61  	return s.victims
    62  }
    63  
    64  // Name returns s.name.
    65  func (s *candidate) Name() string {
    66  	return s.name
    67  }
    68  
    69  type candidateList struct {
    70  	idx   int32
    71  	items []Candidate
    72  }
    73  
    74  func newCandidateList(size int32) *candidateList {
    75  	return &candidateList{idx: -1, items: make([]Candidate, size)}
    76  }
    77  
    78  // add adds a new candidate to the internal array atomically.
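        // If more add() calls arrive than the list has capacity for, the extra candidates
        // are silently dropped; size() caps the reported count at len(items) accordingly.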
    79  func (cl *candidateList) add(c *candidate) {
    80  	if idx := atomic.AddInt32(&cl.idx, 1); idx < int32(len(cl.items)) {
    81  		cl.items[idx] = c
    82  	}
    83  }
    84  
    85  // size returns the number of candidates stored. Note that some add() operations
    86  // might still be executing when this is called, so care must be taken to
    87  // ensure that all add() operations complete before accessing the elements of
    88  // the list.
    89  func (cl *candidateList) size() int32 {
    90  	n := atomic.LoadInt32(&cl.idx) + 1
    91  	if n >= int32(len(cl.items)) {
    92  		n = int32(len(cl.items))
    93  	}
    94  	return n
    95  }
    96  
    97  // get returns the internal candidate array. This function is NOT atomic and
    98  // assumes that all add() operations have been completed.
    99  func (cl *candidateList) get() []Candidate {
   100  	return cl.items[:cl.size()]
   101  }
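
        // For illustration, the intended usage pattern (mirroring DryRunPreemption below) is:
        // call add() concurrently from the parallelized workers, and call get() only after the
        // parallel loop has returned. A minimal sketch, where evaluate is a hypothetical helper
        // returning a *candidate or nil:
        //
        //	cl := newCandidateList(numCandidates)
        //	fh.Parallelizer().Until(ctx, len(nodes), func(i int) {
        //		if c := evaluate(nodes[i]); c != nil {
        //			cl.add(c)
        //		}
        //	}, pluginName)
        //	results := cl.get() // safe: all add() calls have completed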
   102  
   103  // Interface is expected to be implemented by different preemption plugins, as these member
   104  // methods may behave differently from the default preemption behavior.
   105  type Interface interface {
   106  	// GetOffsetAndNumCandidates chooses a random offset and calculates the number of candidates that should be
   107  	// shortlisted for dry running preemption.
   108  	GetOffsetAndNumCandidates(nodes int32) (int32, int32)
   109  	// CandidatesToVictimsMap builds a map from the target node to a list of to-be-preempted Pods and the number of PDB violations.
   110  	CandidatesToVictimsMap(candidates []Candidate) map[string]*extenderv1.Victims
   111  	// PodEligibleToPreemptOthers returns one bool and one string. The bool indicates whether this pod should be considered for
   112  	// preempting other pods or not. The string includes the reason if this pod isn't eligible.
   113  	PodEligibleToPreemptOthers(pod *v1.Pod, nominatedNodeStatus *framework.Status) (bool, string)
   114  	// SelectVictimsOnNode finds minimum set of pods on the given node that should be preempted in order to make enough room
   115  	// for "pod" to be scheduled.
   116  	// Note that both `state` and `nodeInfo` are deep copied.
   117  	SelectVictimsOnNode(ctx context.Context, state *framework.CycleState,
   118  		pod *v1.Pod, nodeInfo *framework.NodeInfo, pdbs []*policy.PodDisruptionBudget) ([]*v1.Pod, int, *framework.Status)
   119  	// OrderedScoreFuncs returns a list of ordered score functions to select a preferable node where victims will be preempted.
   120  	// The ordered score functions will be processed one by one iff we find more than one node with the highest score.
   121  	// The default score functions will be processed if nil is returned here, for backwards compatibility.
   122  	OrderedScoreFuncs(ctx context.Context, nodesToVictims map[string]*extenderv1.Victims) []func(node string) int64
   123  }
   124  
   125  type Evaluator struct {
   126  	PluginName string
   127  	Handler    framework.Handle
   128  	PodLister  corelisters.PodLister
   129  	PdbLister  policylisters.PodDisruptionBudgetLister
   130  	State      *framework.CycleState
   131  	Interface
   132  }
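
        // As a rough sketch of how this Evaluator is typically wired (the plugin type and its
        // field names, such as pl.handle, pl.podLister and pl.pdbLister, are illustrative and not
        // part of this package), a PostFilter plugin in its own package embeds its Interface
        // implementation and delegates to Preempt:
        //
        //	func (pl *myPreemptionPlugin) PostFilter(ctx context.Context, state *framework.CycleState,
        //		pod *v1.Pod, m framework.NodeToStatusMap) (*framework.PostFilterResult, *framework.Status) {
        //		pe := preemption.Evaluator{
        //			PluginName: pl.Name(),
        //			Handler:    pl.handle,    // framework.Handle captured at plugin construction
        //			PodLister:  pl.podLister, // from the shared informer factory
        //			PdbLister:  pl.pdbLister,
        //			State:      state,
        //			Interface:  pl, // pl implements preemption.Interface
        //		}
        //		return pe.Preempt(ctx, pod, m)
        //	}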
   133  
   134  // Preempt returns a PostFilterResult carrying the suggested nominatedNodeName, along with a Status.
   135  // The semantics of the returned <PostFilterResult, Status> vary across different scenarios:
   136  //
   137  //   - <nil, Error>. This denotes it's a transient/rare error that may be self-healed in future cycles.
   138  //
   139  //   - <nil, Unschedulable>. This status is mostly as expected like the preemptor is waiting for the
   140  //     victims to be fully terminated.
   141  //
   142  //   - In both cases above, a nil PostFilterResult is returned to keep the pod's nominatedNodeName unchanged.
   143  //
   144  //   - <non-nil PostFilterResult, Unschedulable>. It indicates the pod cannot be scheduled even with preemption.
   145  //     In this case, a non-nil PostFilterResult is returned and result.NominatingMode instructs how to deal with
   146  //     the nominatedNodeName.
   147  //
   148  //   - <non-nil PostFilterResult, Success>. It's the regular happy path
   149  //     and the non-empty nominatedNodeName will be applied to the preemptor pod.
   150  func (ev *Evaluator) Preempt(ctx context.Context, pod *v1.Pod, m framework.NodeToStatusMap) (*framework.PostFilterResult, *framework.Status) {
   151  	logger := klog.FromContext(ctx)
   152  
   153  	// 0) Fetch the latest version of <pod>.
   154  	// It's safe to directly fetch the pod here because the informer cache has already been
   155  	// initialized when creating the Scheduler obj.
   156  	// However, tests may need to manually initialize the shared pod informer.
   157  	podNamespace, podName := pod.Namespace, pod.Name
   158  	pod, err := ev.PodLister.Pods(pod.Namespace).Get(pod.Name)
   159  	if err != nil {
   160  		logger.Error(err, "Could not get the updated preemptor pod object", "pod", klog.KRef(podNamespace, podName))
   161  		return nil, framework.AsStatus(err)
   162  	}
   163  
   164  	// 1) Ensure the preemptor is eligible to preempt other pods.
   165  	if ok, msg := ev.PodEligibleToPreemptOthers(pod, m[pod.Status.NominatedNodeName]); !ok {
   166  		logger.V(5).Info("Pod is not eligible for preemption", "pod", klog.KObj(pod), "reason", msg)
   167  		return nil, framework.NewStatus(framework.Unschedulable, msg)
   168  	}
   169  
   170  	// 2) Find all preemption candidates.
   171  	candidates, nodeToStatusMap, err := ev.findCandidates(ctx, pod, m)
   172  	if err != nil && len(candidates) == 0 {
   173  		return nil, framework.AsStatus(err)
   174  	}
   175  
   176  	// Return a FitError only when there are no candidates that fit the pod.
   177  	if len(candidates) == 0 {
   178  		fitError := &framework.FitError{
   179  			Pod:         pod,
   180  			NumAllNodes: len(nodeToStatusMap),
   181  			Diagnosis: framework.Diagnosis{
   182  				NodeToStatusMap: nodeToStatusMap,
   183  				// Leave UnschedulablePlugins or PendingPlugins as nil as it won't be used on moving Pods.
   184  			},
   185  		}
   186  		// Return an empty nominatedNodeName to clear the pod's nominatedNodeName status, if applicable.
   187  		return framework.NewPostFilterResultWithNominatedNode(""), framework.NewStatus(framework.Unschedulable, fitError.Error())
   188  	}
   189  
   190  	// 3) Interact with registered Extenders to filter out some candidates if needed.
   191  	candidates, status := ev.callExtenders(logger, pod, candidates)
   192  	if !status.IsSuccess() {
   193  		return nil, status
   194  	}
   195  
   196  	// 4) Find the best candidate.
   197  	bestCandidate := ev.SelectCandidate(ctx, candidates)
   198  	if bestCandidate == nil || len(bestCandidate.Name()) == 0 {
   199  		return nil, framework.NewStatus(framework.Unschedulable, "no candidate node for preemption")
   200  	}
   201  
   202  	// 5) Perform preparation work before nominating the selected candidate.
   203  	if status := ev.prepareCandidate(ctx, bestCandidate, pod, ev.PluginName); !status.IsSuccess() {
   204  		return nil, status
   205  	}
   206  
   207  	return framework.NewPostFilterResultWithNominatedNode(bestCandidate.Name()), framework.NewStatus(framework.Success)
   208  }
   209  
   210  // findCandidates calculates a slice of preemption candidates.
   211  // Each candidate names a node where evicting the listed victims would make the given <pod> schedulable.
   212  func (ev *Evaluator) findCandidates(ctx context.Context, pod *v1.Pod, m framework.NodeToStatusMap) ([]Candidate, framework.NodeToStatusMap, error) {
   213  	allNodes, err := ev.Handler.SnapshotSharedLister().NodeInfos().List()
   214  	if err != nil {
   215  		return nil, nil, err
   216  	}
   217  	if len(allNodes) == 0 {
   218  		return nil, nil, errors.New("no nodes available")
   219  	}
   220  	logger := klog.FromContext(ctx)
   221  	potentialNodes, unschedulableNodeStatus := nodesWherePreemptionMightHelp(allNodes, m)
   222  	if len(potentialNodes) == 0 {
   223  		logger.V(3).Info("Preemption will not help schedule pod on any node", "pod", klog.KObj(pod))
   224  		// In this case, we should clean up any existing nominated node name of the pod.
   225  		if err := util.ClearNominatedNodeName(ctx, ev.Handler.ClientSet(), pod); err != nil {
   226  			logger.Error(err, "Could not clear the nominatedNodeName field of pod", "pod", klog.KObj(pod))
   227  			// We do not return as this error is not critical.
   228  		}
   229  		return nil, unschedulableNodeStatus, nil
   230  	}
   231  
   232  	pdbs, err := getPodDisruptionBudgets(ev.PdbLister)
   233  	if err != nil {
   234  		return nil, nil, err
   235  	}
   236  
   237  	offset, numCandidates := ev.GetOffsetAndNumCandidates(int32(len(potentialNodes)))
   238  	if loggerV := logger.V(5); loggerV.Enabled() {
   239  		var sample []string
   240  		for i := offset; i < offset+10 && i < int32(len(potentialNodes)); i++ {
   241  			sample = append(sample, potentialNodes[i].Node().Name)
   242  		}
   243  		loggerV.Info("Selected candidates from a pool of nodes", "potentialNodesCount", len(potentialNodes), "offset", offset, "sampleLength", len(sample), "sample", sample, "candidates", numCandidates)
   244  	}
   245  	candidates, nodeStatuses, err := ev.DryRunPreemption(ctx, pod, potentialNodes, pdbs, offset, numCandidates)
   246  	for node, nodeStatus := range unschedulableNodeStatus {
   247  		nodeStatuses[node] = nodeStatus
   248  	}
   249  	return candidates, nodeStatuses, err
   250  }
   251  
   252  // callExtenders calls the given <extenders> to select the list of feasible candidates.
   253  // We only check <candidates> with extenders that support preemption.
   254  // Extenders which do not support preemption may later prevent the preemptor from being scheduled on the nominated
   255  // node. In that case, the scheduler will find a different host for the preemptor in subsequent scheduling cycles.
   256  func (ev *Evaluator) callExtenders(logger klog.Logger, pod *v1.Pod, candidates []Candidate) ([]Candidate, *framework.Status) {
   257  	extenders := ev.Handler.Extenders()
   258  	nodeLister := ev.Handler.SnapshotSharedLister().NodeInfos()
   259  	if len(extenders) == 0 {
   260  		return candidates, nil
   261  	}
   262  
   263  	// Migrate the candidate slice to a victimsMap to adapt to the Extender interface.
   264  	// This is only applicable to candidate slices whose entries have unique nominated node names.
   265  	victimsMap := ev.CandidatesToVictimsMap(candidates)
   266  	if len(victimsMap) == 0 {
   267  		return candidates, nil
   268  	}
   269  	for _, extender := range extenders {
   270  		if !extender.SupportsPreemption() || !extender.IsInterested(pod) {
   271  			continue
   272  		}
   273  		nodeNameToVictims, err := extender.ProcessPreemption(pod, victimsMap, nodeLister)
   274  		if err != nil {
   275  			if extender.IsIgnorable() {
   276  				logger.Info("Skipped extender as it returned error and has ignorable flag set",
   277  					"extender", extender.Name(), "err", err)
   278  				continue
   279  			}
   280  			return nil, framework.AsStatus(err)
   281  		}
   282  		// Check if the returned victims are valid.
   283  		for nodeName, victims := range nodeNameToVictims {
   284  			if victims == nil || len(victims.Pods) == 0 {
   285  				if extender.IsIgnorable() {
   286  					delete(nodeNameToVictims, nodeName)
   287  					logger.Info("Ignored node for which the extender didn't report victims", "node", klog.KRef("", nodeName), "extender", extender.Name())
   288  					continue
   289  				}
   290  				return nil, framework.AsStatus(fmt.Errorf("expected at least one victim pod on node %q", nodeName))
   291  			}
   292  		}
   293  
   294  		// Replace victimsMap with the new result after preemption, so the
   295  		// rest of the extenders can continue to use it as a parameter.
   296  		victimsMap = nodeNameToVictims
   297  
   298  		// If the node list becomes empty, no preemption can happen regardless of other extenders.
   299  		if len(victimsMap) == 0 {
   300  			break
   301  		}
   302  	}
   303  
   304  	var newCandidates []Candidate
   305  	for nodeName := range victimsMap {
   306  		newCandidates = append(newCandidates, &candidate{
   307  			victims: victimsMap[nodeName],
   308  			name:    nodeName,
   309  		})
   310  	}
   311  	return newCandidates, nil
   312  }
   313  
   314  // SelectCandidate chooses the best-fit candidate from the given <candidates> and returns it.
   315  // NOTE: This method is exported for easier testing in default preemption.
   316  func (ev *Evaluator) SelectCandidate(ctx context.Context, candidates []Candidate) Candidate {
   317  	logger := klog.FromContext(ctx)
   318  
   319  	if len(candidates) == 0 {
   320  		return nil
   321  	}
   322  	if len(candidates) == 1 {
   323  		return candidates[0]
   324  	}
   325  
   326  	victimsMap := ev.CandidatesToVictimsMap(candidates)
   327  	scoreFuncs := ev.OrderedScoreFuncs(ctx, victimsMap)
   328  	candidateNode := pickOneNodeForPreemption(logger, victimsMap, scoreFuncs)
   329  
   330  	// Same as CandidatesToVictimsMap, this logic is not applicable to out-of-tree
   331  	// preemption plugins that exercise different candidates on the same nominated node.
   332  	if victims := victimsMap[candidateNode]; victims != nil {
   333  		return &candidate{
   334  			victims: victims,
   335  			name:    candidateNode,
   336  		}
   337  	}
   338  
   339  	// We shouldn't reach here.
   340  	logger.Error(errors.New("no candidate selected"), "Should not reach here", "candidates", candidates)
   341  	// To not break the whole flow, return the first candidate.
   342  	return candidates[0]
   343  }
   344  
   345  // prepareCandidate does some preparation work before nominating the selected candidate:
   346  // - Evict the victim pods
   347  // - Reject the victim pods if they are in waitingPod map
   348  // - Clear the low-priority pods' nominatedNodeName status if needed
   349  func (ev *Evaluator) prepareCandidate(ctx context.Context, c Candidate, pod *v1.Pod, pluginName string) *framework.Status {
   350  	fh := ev.Handler
   351  	cs := ev.Handler.ClientSet()
   352  
   353  	ctx, cancel := context.WithCancel(ctx)
   354  	defer cancel()
   355  	logger := klog.FromContext(ctx)
   356  	errCh := parallelize.NewErrorChannel()
   357  	preemptPod := func(index int) {
   358  		victim := c.Victims().Pods[index]
   359  		// If the victim is a WaitingPod, send a reject message to the PermitPlugin.
   360  		// Otherwise we should delete the victim.
   361  		if waitingPod := fh.GetWaitingPod(victim.UID); waitingPod != nil {
   362  			waitingPod.Reject(pluginName, "preempted")
   363  			logger.V(2).Info("Preemptor pod rejected a waiting pod", "preemptor", klog.KObj(pod), "waitingPod", klog.KObj(victim), "node", c.Name())
   364  		} else {
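        			// With the PodDisruptionConditions feature enabled, record a DisruptionTarget
        			// condition on the victim's status before deleting it, so the disruption is
        			// attributable to scheduler preemption.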
   365  			if feature.DefaultFeatureGate.Enabled(features.PodDisruptionConditions) {
   366  				condition := &v1.PodCondition{
   367  					Type:    v1.DisruptionTarget,
   368  					Status:  v1.ConditionTrue,
   369  					Reason:  v1.PodReasonPreemptionByScheduler,
   370  					Message: fmt.Sprintf("%s: preempting to accommodate a higher priority pod", pod.Spec.SchedulerName),
   371  				}
   372  				newStatus := pod.Status.DeepCopy()
   373  				updated := apipod.UpdatePodCondition(newStatus, condition)
   374  				if updated {
   375  					if err := util.PatchPodStatus(ctx, cs, victim, newStatus); err != nil {
   376  						logger.Error(err, "Could not add DisruptionTarget condition due to preemption", "pod", klog.KObj(victim), "preemptor", klog.KObj(pod))
   377  						errCh.SendErrorWithCancel(err, cancel)
   378  						return
   379  					}
   380  				}
   381  			}
   382  			if err := util.DeletePod(ctx, cs, victim); err != nil {
   383  				logger.Error(err, "Failed to preempt pod", "pod", klog.KObj(victim), "preemptor", klog.KObj(pod))
   384  				errCh.SendErrorWithCancel(err, cancel)
   385  				return
   386  			}
   387  			logger.V(2).Info("Preemptor Pod preempted victim Pod", "preemptor", klog.KObj(pod), "victim", klog.KObj(victim), "node", c.Name())
   388  		}
   389  
   390  		fh.EventRecorder().Eventf(victim, pod, v1.EventTypeNormal, "Preempted", "Preempting", "Preempted by pod %v on node %v", pod.UID, c.Name())
   391  	}
   392  
   393  	fh.Parallelizer().Until(ctx, len(c.Victims().Pods), preemptPod, ev.PluginName)
   394  	if err := errCh.ReceiveError(); err != nil {
   395  		return framework.AsStatus(err)
   396  	}
   397  
   398  	metrics.PreemptionVictims.Observe(float64(len(c.Victims().Pods)))
   399  
   400  	// Lower-priority pods nominated to run on this node may no longer fit on
   401  	// this node, so we should remove their nomination. Removing their
   402  	// nomination updates these pods and moves them to the active queue. It
   403  	// lets the scheduler find another place for them.
   404  	nominatedPods := getLowerPriorityNominatedPods(logger, fh, pod, c.Name())
   405  	if err := util.ClearNominatedNodeName(ctx, cs, nominatedPods...); err != nil {
   406  		logger.Error(err, "Cannot clear 'NominatedNodeName' field")
   407  		// We do not return as this error is not critical.
   408  	}
   409  
   410  	return nil
   411  }
   412  
   413  // nodesWherePreemptionMightHelp returns a list of nodes whose failed predicates
   414  // may be satisfied by removing pods from the node.
   415  func nodesWherePreemptionMightHelp(nodes []*framework.NodeInfo, m framework.NodeToStatusMap) ([]*framework.NodeInfo, framework.NodeToStatusMap) {
   416  	var potentialNodes []*framework.NodeInfo
   417  	nodeStatuses := make(framework.NodeToStatusMap)
   418  	for _, node := range nodes {
   419  		name := node.Node().Name
   420  		// We rely on the status by each plugin - 'Unschedulable' or 'UnschedulableAndUnresolvable'
   421  		// to determine whether preemption may help or not on the node.
   422  		if m[name].Code() == framework.UnschedulableAndUnresolvable {
   423  			nodeStatuses[node.Node().Name] = framework.NewStatus(framework.UnschedulableAndUnresolvable, "Preemption is not helpful for scheduling")
   424  			continue
   425  		}
   426  		potentialNodes = append(potentialNodes, node)
   427  	}
   428  	return potentialNodes, nodeStatuses
   429  }
   430  
   431  func getPodDisruptionBudgets(pdbLister policylisters.PodDisruptionBudgetLister) ([]*policy.PodDisruptionBudget, error) {
   432  	if pdbLister != nil {
   433  		return pdbLister.List(labels.Everything())
   434  	}
   435  	return nil, nil
   436  }
   437  
   438  // pickOneNodeForPreemption chooses one node among the given nodes.
   439  // It assumes pods in each map entry are ordered by decreasing priority.
   440  // If scoreFuncs is not empty, it picks a node based on the scores that scoreFuncs returns.
   441  // If scoreFuncs is empty,
   442  // it picks a node based on the following criteria:
   443  // 1. A node with the minimum number of PDB violations is picked.
   444  // 2. A node whose highest-priority victim has the lowest priority is picked.
   445  // 3. Ties are broken by the sum of priorities of all victims.
   446  // 4. If there are still ties, the node with the minimum number of victims is picked.
   447  // 5. If there are still ties, the node with the latest start time of all highest-priority victims is picked.
   448  // 6. If there are still ties, the first such node is picked (sort of randomly).
   449  // The nodes that survive each scoring round become the candidate list for the next round,
   450  // progressively narrowing ties down; a worked example follows the function body.
   451  func pickOneNodeForPreemption(logger klog.Logger, nodesToVictims map[string]*extenderv1.Victims, scoreFuncs []func(node string) int64) string {
   452  	if len(nodesToVictims) == 0 {
   453  		return ""
   454  	}
   455  
   456  	allCandidates := make([]string, 0, len(nodesToVictims))
   457  	for node := range nodesToVictims {
   458  		allCandidates = append(allCandidates, node)
   459  	}
   460  
   461  	if len(scoreFuncs) == 0 {
   462  		minNumPDBViolatingScoreFunc := func(node string) int64 {
   463  			// The smaller the NumPDBViolations, the higher the score.
   464  			return -nodesToVictims[node].NumPDBViolations
   465  		}
   466  		minHighestPriorityScoreFunc := func(node string) int64 {
   467  			// highestPodPriority is the highest priority among the victims on this node.
   468  			highestPodPriority := corev1helpers.PodPriority(nodesToVictims[node].Pods[0])
   469  			// The smaller the highestPodPriority, the higher the score.
   470  			return -int64(highestPodPriority)
   471  		}
   472  		minSumPrioritiesScoreFunc := func(node string) int64 {
   473  			var sumPriorities int64
   474  			for _, pod := range nodesToVictims[node].Pods {
   475  				// We add MaxInt32+1 to all priorities to make all of them >= 0. This is
   476  				// needed so that a node with a few pods with negative priority is not
   477  				// picked over a node with a smaller number of pods with the same negative
   478  				// priority (and similar scenarios).
   479  				sumPriorities += int64(corev1helpers.PodPriority(pod)) + int64(math.MaxInt32+1)
   480  			}
   481  			// The smaller the sumPriorities, the higher the score.
   482  			return -sumPriorities
   483  		}
   484  		minNumPodsScoreFunc := func(node string) int64 {
   485  			// The smaller the length of pods, the higher the score.
   486  			return -int64(len(nodesToVictims[node].Pods))
   487  		}
   488  		latestStartTimeScoreFunc := func(node string) int64 {
   489  			// Get the earliest start time of all pods on the current node.
   490  			earliestStartTimeOnNode := util.GetEarliestPodStartTime(nodesToVictims[node])
   491  			if earliestStartTimeOnNode == nil {
   492  				logger.Error(errors.New("earliestStartTime is nil for node"), "Should not reach here", "node", node)
   493  				return int64(math.MinInt64)
   494  			}
   495  			// The bigger the earliestStartTimeOnNode, the higher the score.
   496  			return earliestStartTimeOnNode.UnixNano()
   497  		}
   498  
   499  		// Each scoreFunc scores the nodes according to specific rules and keeps the name of the node
   500  		// with the highest score. If and only if a scoreFunc yields more than one node with the highest
   501  		// score, the next scoreFunc is executed, in order of precedence.
   502  		scoreFuncs = []func(string) int64{
   503  			// A node with a minimum number of PDB violations is preferable.
   504  			minNumPDBViolatingScoreFunc,
   505  			// A node with a minimum highest priority victim is preferable.
   506  			minHighestPriorityScoreFunc,
   507  			// A node with the smallest sum of priorities is preferable.
   508  			minSumPrioritiesScoreFunc,
   509  			// A node with the minimum number of pods is preferable.
   510  			minNumPodsScoreFunc,
   511  			// A node with the latest start time of all highest priority victims is preferable.
   512  			latestStartTimeScoreFunc,
   513  			// If there are still ties, then the first Node in the list is selected.
   514  		}
   515  	}
   516  
   517  	for _, f := range scoreFuncs {
   518  		selectedNodes := []string{}
   519  		maxScore := int64(math.MinInt64)
   520  		for _, node := range allCandidates {
   521  			score := f(node)
   522  			if score > maxScore {
   523  				maxScore = score
   524  				selectedNodes = []string{}
   525  			}
   526  			if score == maxScore {
   527  				selectedNodes = append(selectedNodes, node)
   528  			}
   529  		}
   530  		if len(selectedNodes) == 1 {
   531  			return selectedNodes[0]
   532  		}
   533  		allCandidates = selectedNodes
   534  	}
   535  
   536  	return allCandidates[0]
   537  }
   538  
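
        // As a worked example of the cascade above (hypothetical values): suppose "n1" and "n2"
        // both have zero PDB violations, the highest victim priority on "n1" is 100 and on "n2"
        // is 50, and victimsWithPriorities is a hypothetical helper returning victim pods ordered
        // by decreasing priority:
        //
        //	nodesToVictims := map[string]*extenderv1.Victims{
        //		"n1": {Pods: victimsWithPriorities(100, 10), NumPDBViolations: 0},
        //		"n2": {Pods: victimsWithPriorities(50, 10), NumPDBViolations: 0},
        //	}
        //	// minNumPDBViolatingScoreFunc ties (0 vs 0); minHighestPriorityScoreFunc then scores
        //	// n1 at -100 and n2 at -50, so "n2" wins and the remaining funcs are not consulted.
        //	node := pickOneNodeForPreemption(logger, nodesToVictims, nil)
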
   539  // getLowerPriorityNominatedPods returns pods whose priority is smaller than the
   540  // priority of the given "pod" and that are nominated to run on the given node.
   541  // Note: We could possibly check if the nominated lower priority pods still fit
   542  // and return those that no longer fit, but that would require lots of
   543  // manipulation of NodeInfo and PreFilter state per nominated pod. It may not be
   544  // worth the complexity, especially because we generally expect to have a very
   545  // small number of nominated pods per node.
   546  func getLowerPriorityNominatedPods(logger klog.Logger, pn framework.PodNominator, pod *v1.Pod, nodeName string) []*v1.Pod {
   547  	podInfos := pn.NominatedPodsForNode(nodeName)
   548  
   549  	if len(podInfos) == 0 {
   550  		return nil
   551  	}
   552  
   553  	var lowerPriorityPods []*v1.Pod
   554  	podPriority := corev1helpers.PodPriority(pod)
   555  	for _, pi := range podInfos {
   556  		if corev1helpers.PodPriority(pi.Pod) < podPriority {
   557  			lowerPriorityPods = append(lowerPriorityPods, pi.Pod)
   558  		}
   559  	}
   560  	return lowerPriorityPods
   561  }
   562  
   563  // DryRunPreemption simulates preemption logic on <potentialNodes> in parallel,
   564  // and returns preemption candidates and a map indicating filtered node statuses.
   565  // The number of candidates depends on the constraints defined in the plugin's args. In the returned list of
   566  // candidates, ones that do not violate PDB are preferred over ones that do.
   567  // NOTE: This method is exported for easier testing in default preemption.
   568  func (ev *Evaluator) DryRunPreemption(ctx context.Context, pod *v1.Pod, potentialNodes []*framework.NodeInfo,
   569  	pdbs []*policy.PodDisruptionBudget, offset int32, numCandidates int32) ([]Candidate, framework.NodeToStatusMap, error) {
   570  	fh := ev.Handler
   571  	nonViolatingCandidates := newCandidateList(numCandidates)
   572  	violatingCandidates := newCandidateList(numCandidates)
   573  	ctx, cancel := context.WithCancel(ctx)
   574  	defer cancel()
   575  	nodeStatuses := make(framework.NodeToStatusMap)
   576  	var statusesLock sync.Mutex
   577  	var errs []error
   578  	checkNode := func(i int) {
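        		// Candidates are examined starting at the random offset and wrapping around the
        		// potential-node list; e.g. with 5 potential nodes and offset 3, the indices
        		// visited are 3, 4, 0, 1, 2 (illustrative numbers).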
   579  		nodeInfoCopy := potentialNodes[(int(offset)+i)%len(potentialNodes)].Snapshot()
   580  		stateCopy := ev.State.Clone()
   581  		pods, numPDBViolations, status := ev.SelectVictimsOnNode(ctx, stateCopy, pod, nodeInfoCopy, pdbs)
   582  		if status.IsSuccess() && len(pods) != 0 {
   583  			victims := extenderv1.Victims{
   584  				Pods:             pods,
   585  				NumPDBViolations: int64(numPDBViolations),
   586  			}
   587  			c := &candidate{
   588  				victims: &victims,
   589  				name:    nodeInfoCopy.Node().Name,
   590  			}
   591  			if numPDBViolations == 0 {
   592  				nonViolatingCandidates.add(c)
   593  			} else {
   594  				violatingCandidates.add(c)
   595  			}
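        			// Stop the dry run early: once there is at least one non-PDB-violating candidate
        			// and the combined number of candidates reaches numCandidates, cancel the context
        			// so the remaining nodes are not evaluated.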
   596  			nvcSize, vcSize := nonViolatingCandidates.size(), violatingCandidates.size()
   597  			if nvcSize > 0 && nvcSize+vcSize >= numCandidates {
   598  				cancel()
   599  			}
   600  			return
   601  		}
   602  		if status.IsSuccess() && len(pods) == 0 {
   603  			status = framework.AsStatus(fmt.Errorf("expected at least one victim pod on node %q", nodeInfoCopy.Node().Name))
   604  		}
   605  		statusesLock.Lock()
   606  		if status.Code() == framework.Error {
   607  			errs = append(errs, status.AsError())
   608  		}
   609  		nodeStatuses[nodeInfoCopy.Node().Name] = status
   610  		statusesLock.Unlock()
   611  	}
   612  	fh.Parallelizer().Until(ctx, len(potentialNodes), checkNode, ev.PluginName)
   613  	return append(nonViolatingCandidates.get(), violatingCandidates.get()...), nodeStatuses, utilerrors.NewAggregate(errs)
   614  }