k8s.io/kubernetes@v1.29.3/pkg/scheduler/framework/preemption/preemption.go

/*
Copyright 2021 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package preemption

import (
	"context"
	"errors"
	"fmt"
	"math"
	"sync"
	"sync/atomic"

	v1 "k8s.io/api/core/v1"
	policy "k8s.io/api/policy/v1"
	"k8s.io/apimachinery/pkg/labels"
	utilerrors "k8s.io/apimachinery/pkg/util/errors"
	"k8s.io/apiserver/pkg/util/feature"
	corelisters "k8s.io/client-go/listers/core/v1"
	policylisters "k8s.io/client-go/listers/policy/v1"
	corev1helpers "k8s.io/component-helpers/scheduling/corev1"
	"k8s.io/klog/v2"
	extenderv1 "k8s.io/kube-scheduler/extender/v1"
	apipod "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/kubernetes/pkg/features"
	"k8s.io/kubernetes/pkg/scheduler/framework"
	"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
	"k8s.io/kubernetes/pkg/scheduler/metrics"
	"k8s.io/kubernetes/pkg/scheduler/util"
)

// Candidate represents a nominated node on which the preemptor can be scheduled,
// along with the list of victims that should be evicted for the preemptor to fit the node.
type Candidate interface {
	// Victims wraps a list of to-be-preempted Pods and the number of PDB violations.
	Victims() *extenderv1.Victims
	// Name returns the target node name where the preemptor gets nominated to run.
	Name() string
}

type candidate struct {
	victims *extenderv1.Victims
	name    string
}

// Victims returns s.victims.
func (s *candidate) Victims() *extenderv1.Victims {
	return s.victims
}

// Name returns s.name.
func (s *candidate) Name() string {
	return s.name
}

type candidateList struct {
	idx   int32
	items []Candidate
}

func newCandidateList(size int32) *candidateList {
	return &candidateList{idx: -1, items: make([]Candidate, size)}
}

// add adds a new candidate to the internal array atomically.
func (cl *candidateList) add(c *candidate) {
	if idx := atomic.AddInt32(&cl.idx, 1); idx < int32(len(cl.items)) {
		cl.items[idx] = c
	}
}

// size returns the number of candidates stored. Note that some add() operations
// might still be executing when this is called, so care must be taken to
// ensure that all add() operations complete before accessing the elements of
// the list.
func (cl *candidateList) size() int32 {
	n := atomic.LoadInt32(&cl.idx) + 1
	if n >= int32(len(cl.items)) {
		n = int32(len(cl.items))
	}
	return n
}

// get returns the internal candidate array. This function is NOT atomic and
// assumes that all add() operations have been completed.
func (cl *candidateList) get() []Candidate {
	return cl.items[:cl.size()]
}

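// Illustrative sketch of how candidateList is intended to be used (hypothetical
// helper names; the real caller is DryRunPreemption further below): many
// goroutines call add() concurrently, and get() is only read after the parallel
// work has finished.
//
//	cl := newCandidateList(numCandidates)
//	ev.Handler.Parallelizer().Until(ctx, len(nodes), func(i int) {
//		if c := evaluateNode(nodes[i]); c != nil {
//			cl.add(c) // safe to call from multiple goroutines
//		}
//	}, "preemption-example")
//	candidates := cl.get() // safe only once Until has returned
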
// Interface is expected to be implemented by different preemption plugins as all those member
// methods might have different behavior compared with the default preemption.
type Interface interface {
	// GetOffsetAndNumCandidates chooses a random offset and calculates the number of candidates that should be
	// shortlisted for dry running preemption.
	GetOffsetAndNumCandidates(nodes int32) (int32, int32)
	// CandidatesToVictimsMap builds a map from the target node to a list of to-be-preempted Pods and the number of PDB violations.
	CandidatesToVictimsMap(candidates []Candidate) map[string]*extenderv1.Victims
	// PodEligibleToPreemptOthers returns one bool and one string. The bool indicates whether this pod should be considered for
	// preempting other pods or not. The string includes the reason if this pod isn't eligible.
	PodEligibleToPreemptOthers(pod *v1.Pod, nominatedNodeStatus *framework.Status) (bool, string)
	// SelectVictimsOnNode finds a minimum set of pods on the given node that should be preempted in order to make enough room
	// for "pod" to be scheduled.
	// Note that both `state` and `nodeInfo` are deep copied.
	SelectVictimsOnNode(ctx context.Context, state *framework.CycleState,
		pod *v1.Pod, nodeInfo *framework.NodeInfo, pdbs []*policy.PodDisruptionBudget) ([]*v1.Pod, int, *framework.Status)
	// OrderedScoreFuncs returns a list of ordered score functions to select the preferable node where victims will be preempted.
	// The ordered score functions will be processed one by one iff we find more than one node with the highest score.
	// Default score functions will be processed if nil is returned here for backwards-compatibility.
	OrderedScoreFuncs(ctx context.Context, nodesToVictims map[string]*extenderv1.Victims) []func(node string) int64
}

type Evaluator struct {
	PluginName string
	Handler    framework.Handle
	PodLister  corelisters.PodLister
	PdbLister  policylisters.PodDisruptionBudgetLister
	State      *framework.CycleState
	Interface
}

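// Illustrative sketch of how a PostFilter plugin is expected to drive Evaluator
// (the plugin type, its fields and the wiring are assumptions modeled on the
// in-tree default preemption plugin, not verbatim upstream code). The plugin
// implements Interface and delegates its PostFilter extension point to Preempt:
//
//	func (pl *MyPreemptionPlugin) PostFilter(ctx context.Context, state *framework.CycleState,
//		pod *v1.Pod, m framework.NodeToStatusMap) (*framework.PostFilterResult, *framework.Status) {
//		ev := Evaluator{
//			PluginName: pl.Name(),
//			Handler:    pl.handle,
//			PodLister:  pl.podLister,
//			PdbLister:  pl.pdbLister,
//			State:      state,
//			Interface:  pl, // pl supplies GetOffsetAndNumCandidates, SelectVictimsOnNode, etc.
//		}
//		return ev.Preempt(ctx, pod, m)
//	}
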
// Preempt returns a PostFilterResult carrying suggested nominatedNodeName, along with a Status.
// The semantics of the returned <PostFilterResult, Status> vary across scenarios:
//
//   - <nil, Error>. This denotes it's a transient/rare error that may be self-healed in future cycles.
//
//   - <nil, Unschedulable>. This status is mostly as expected like the preemptor is waiting for the
//     victims to be fully terminated.
//
//   - In both cases above, a nil PostFilterResult is returned to keep the pod's nominatedNodeName unchanged.
//
//   - <non-nil PostFilterResult, Unschedulable>. It indicates the pod cannot be scheduled even with preemption.
//     In this case, a non-nil PostFilterResult is returned and result.NominatingMode instructs how to deal with
//     the nominatedNodeName.
//
//   - <non-nil PostFilterResult, Success>. It's the regular happy path
//     and the non-empty nominatedNodeName will be applied to the preemptor pod.
func (ev *Evaluator) Preempt(ctx context.Context, pod *v1.Pod, m framework.NodeToStatusMap) (*framework.PostFilterResult, *framework.Status) {
	logger := klog.FromContext(ctx)

	// 0) Fetch the latest version of <pod>.
	// It's safe to directly fetch the pod here because the informer cache has already been
	// initialized when creating the Scheduler obj.
	// However, tests may need to manually initialize the shared pod informer.
	podNamespace, podName := pod.Namespace, pod.Name
	pod, err := ev.PodLister.Pods(pod.Namespace).Get(pod.Name)
	if err != nil {
		logger.Error(err, "Could not get the updated preemptor pod object", "pod", klog.KRef(podNamespace, podName))
		return nil, framework.AsStatus(err)
	}

	// 1) Ensure the preemptor is eligible to preempt other pods.
	if ok, msg := ev.PodEligibleToPreemptOthers(pod, m[pod.Status.NominatedNodeName]); !ok {
		logger.V(5).Info("Pod is not eligible for preemption", "pod", klog.KObj(pod), "reason", msg)
		return nil, framework.NewStatus(framework.Unschedulable, msg)
	}

	// 2) Find all preemption candidates.
	candidates, nodeToStatusMap, err := ev.findCandidates(ctx, pod, m)
	if err != nil && len(candidates) == 0 {
		return nil, framework.AsStatus(err)
	}

	// Return a FitError only when there are no candidates that fit the pod.
	if len(candidates) == 0 {
		fitError := &framework.FitError{
			Pod:         pod,
			NumAllNodes: len(nodeToStatusMap),
			Diagnosis: framework.Diagnosis{
				NodeToStatusMap: nodeToStatusMap,
				// Leave UnschedulablePlugins or PendingPlugins as nil as it won't be used on moving Pods.
			},
		}
		// Specify nominatedNodeName to clear the pod's nominatedNodeName status, if applicable.
		return framework.NewPostFilterResultWithNominatedNode(""), framework.NewStatus(framework.Unschedulable, fitError.Error())
	}

	// 3) Interact with registered Extenders to filter out some candidates if needed.
	candidates, status := ev.callExtenders(logger, pod, candidates)
	if !status.IsSuccess() {
		return nil, status
	}

	// 4) Find the best candidate.
	bestCandidate := ev.SelectCandidate(ctx, candidates)
	if bestCandidate == nil || len(bestCandidate.Name()) == 0 {
		return nil, framework.NewStatus(framework.Unschedulable, "no candidate node for preemption")
	}

	// 5) Perform preparation work before nominating the selected candidate.
	if status := ev.prepareCandidate(ctx, bestCandidate, pod, ev.PluginName); !status.IsSuccess() {
		return nil, status
	}

	return framework.NewPostFilterResultWithNominatedNode(bestCandidate.Name()), framework.NewStatus(framework.Success)
}

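// Illustrative sketch of one way GetOffsetAndNumCandidates can be implemented,
// as used by findCandidates below (a simplified approximation of the in-tree
// default preemption behavior; the percentage/floor parameters and the helper
// name are assumptions, and rand refers to math/rand):
//
//	func getOffsetAndNumCandidates(numNodes, percentage, minAbsolute int32) (int32, int32) {
//		n := numNodes * percentage / 100
//		if n < minAbsolute {
//			n = minAbsolute
//		}
//		if n > numNodes {
//			n = numNodes
//		}
//		// The random offset spreads dry-run evaluation across the node list.
//		return rand.Int31n(numNodes), n
//	}
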
// findCandidates calculates a slice of preemption candidates.
// Each candidate is executable to make the given <pod> schedulable.
func (ev *Evaluator) findCandidates(ctx context.Context, pod *v1.Pod, m framework.NodeToStatusMap) ([]Candidate, framework.NodeToStatusMap, error) {
	allNodes, err := ev.Handler.SnapshotSharedLister().NodeInfos().List()
	if err != nil {
		return nil, nil, err
	}
	if len(allNodes) == 0 {
		return nil, nil, errors.New("no nodes available")
	}
	logger := klog.FromContext(ctx)
	potentialNodes, unschedulableNodeStatus := nodesWherePreemptionMightHelp(allNodes, m)
	if len(potentialNodes) == 0 {
		logger.V(3).Info("Preemption will not help schedule pod on any node", "pod", klog.KObj(pod))
		// In this case, we should clean-up any existing nominated node name of the pod.
		if err := util.ClearNominatedNodeName(ctx, ev.Handler.ClientSet(), pod); err != nil {
			logger.Error(err, "Could not clear the nominatedNodeName field of pod", "pod", klog.KObj(pod))
			// We do not return as this error is not critical.
		}
		return nil, unschedulableNodeStatus, nil
	}

	pdbs, err := getPodDisruptionBudgets(ev.PdbLister)
	if err != nil {
		return nil, nil, err
	}

	offset, numCandidates := ev.GetOffsetAndNumCandidates(int32(len(potentialNodes)))
	if loggerV := logger.V(5); logger.Enabled() {
		var sample []string
		for i := offset; i < offset+10 && i < int32(len(potentialNodes)); i++ {
			sample = append(sample, potentialNodes[i].Node().Name)
		}
		loggerV.Info("Selected candidates from a pool of nodes", "potentialNodesCount", len(potentialNodes), "offset", offset, "sampleLength", len(sample), "sample", sample, "candidates", numCandidates)
	}
	candidates, nodeStatuses, err := ev.DryRunPreemption(ctx, pod, potentialNodes, pdbs, offset, numCandidates)
	for node, nodeStatus := range unschedulableNodeStatus {
		nodeStatuses[node] = nodeStatus
	}
	return candidates, nodeStatuses, err
}

// callExtenders calls given <extenders> to select the list of feasible candidates.
// We will only check <candidates> with extenders that support preemption.
// Extenders which do not support preemption may later prevent the preemptor from being scheduled on the nominated
// node. In that case, the scheduler will find a different host for the preemptor in subsequent scheduling cycles.
func (ev *Evaluator) callExtenders(logger klog.Logger, pod *v1.Pod, candidates []Candidate) ([]Candidate, *framework.Status) {
	extenders := ev.Handler.Extenders()
	nodeLister := ev.Handler.SnapshotSharedLister().NodeInfos()
	if len(extenders) == 0 {
		return candidates, nil
	}

	// Migrate the candidate slice to victimsMap to adapt to the Extender interface.
	// It's only applicable for candidate slices that have unique nominated node names.
	victimsMap := ev.CandidatesToVictimsMap(candidates)
	if len(victimsMap) == 0 {
		return candidates, nil
	}
	for _, extender := range extenders {
		if !extender.SupportsPreemption() || !extender.IsInterested(pod) {
			continue
		}
		nodeNameToVictims, err := extender.ProcessPreemption(pod, victimsMap, nodeLister)
		if err != nil {
			if extender.IsIgnorable() {
				logger.Info("Skipped extender as it returned error and has ignorable flag set",
					"extender", extender.Name(), "err", err)
				continue
			}
			return nil, framework.AsStatus(err)
		}
		// Check if the returned victims are valid.
		for nodeName, victims := range nodeNameToVictims {
			if victims == nil || len(victims.Pods) == 0 {
				if extender.IsIgnorable() {
					delete(nodeNameToVictims, nodeName)
					logger.Info("Ignored node for which the extender didn't report victims", "node", klog.KRef("", nodeName), "extender", extender.Name())
					continue
				}
				return nil, framework.AsStatus(fmt.Errorf("expected at least one victim pod on node %q", nodeName))
			}
		}

		// Replace victimsMap with the new result after preemption, so that the
		// rest of the extenders can continue to use it as a parameter.
		victimsMap = nodeNameToVictims

		// If the node list becomes empty, no preemption can happen regardless of other extenders.
		if len(victimsMap) == 0 {
			break
		}
	}

	var newCandidates []Candidate
	for nodeName := range victimsMap {
		newCandidates = append(newCandidates, &candidate{
			victims: victimsMap[nodeName],
			name:    nodeName,
		})
	}
	return newCandidates, nil
}

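// Illustrative sketch of the shape CandidatesToVictimsMap is expected to return
// when every candidate targets a distinct node (the helper name is hypothetical;
// out-of-tree plugins that produce several candidates per node may return nil instead):
//
//	func candidatesToVictimsMap(candidates []Candidate) map[string]*extenderv1.Victims {
//		m := make(map[string]*extenderv1.Victims, len(candidates))
//		for _, c := range candidates {
//			m[c.Name()] = c.Victims()
//		}
//		return m
//	}
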
// SelectCandidate chooses the best-fit candidate from the given <candidates> and returns it.
// NOTE: This method is exported for easier testing in default preemption.
func (ev *Evaluator) SelectCandidate(ctx context.Context, candidates []Candidate) Candidate {
	logger := klog.FromContext(ctx)

	if len(candidates) == 0 {
		return nil
	}
	if len(candidates) == 1 {
		return candidates[0]
	}

	victimsMap := ev.CandidatesToVictimsMap(candidates)
	scoreFuncs := ev.OrderedScoreFuncs(ctx, victimsMap)
	candidateNode := pickOneNodeForPreemption(logger, victimsMap, scoreFuncs)

	// Same as CandidatesToVictimsMap, this logic is not applicable for out-of-tree
	// preemption plugins that exercise different candidates on the same nominated node.
	if victims := victimsMap[candidateNode]; victims != nil {
		return &candidate{
			victims: victims,
			name:    candidateNode,
		}
	}

	// We shouldn't reach here.
	logger.Error(errors.New("no candidate selected"), "Should not reach here", "candidates", candidates)
	// To not break the whole flow, return the first candidate.
	return candidates[0]
}

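// Illustrative sketch of an out-of-tree plugin overriding OrderedScoreFuncs
// (hypothetical receiver and scoring rules; returning nil instead keeps the
// built-in cascade in pickOneNodeForPreemption):
//
//	func (pl *MyPreemptionPlugin) OrderedScoreFuncs(ctx context.Context,
//		nodesToVictims map[string]*extenderv1.Victims) []func(node string) int64 {
//		return []func(node string) int64{
//			// Prefer nodes whose victims violate the fewest PDBs.
//			func(node string) int64 { return -nodesToVictims[node].NumPDBViolations },
//			// Then prefer nodes with fewer victims overall.
//			func(node string) int64 { return -int64(len(nodesToVictims[node].Pods)) },
//		}
//	}
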
// prepareCandidate does some preparation work before nominating the selected candidate:
// - Evict the victim pods
// - Reject the victim pods if they are in waitingPod map
// - Clear the low-priority pods' nominatedNodeName status if needed
func (ev *Evaluator) prepareCandidate(ctx context.Context, c Candidate, pod *v1.Pod, pluginName string) *framework.Status {
	fh := ev.Handler
	cs := ev.Handler.ClientSet()

	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	logger := klog.FromContext(ctx)
	errCh := parallelize.NewErrorChannel()
	preemptPod := func(index int) {
		victim := c.Victims().Pods[index]
		// If the victim is a WaitingPod, send a reject message to the PermitPlugin.
		// Otherwise we should delete the victim.
		if waitingPod := fh.GetWaitingPod(victim.UID); waitingPod != nil {
			waitingPod.Reject(pluginName, "preempted")
			logger.V(2).Info("Preemptor pod rejected a waiting pod", "preemptor", klog.KObj(pod), "waitingPod", klog.KObj(victim), "node", c.Name())
		} else {
			if feature.DefaultFeatureGate.Enabled(features.PodDisruptionConditions) {
				condition := &v1.PodCondition{
					Type:    v1.DisruptionTarget,
					Status:  v1.ConditionTrue,
					Reason:  v1.PodReasonPreemptionByScheduler,
					Message: fmt.Sprintf("%s: preempting to accommodate a higher priority pod", pod.Spec.SchedulerName),
				}
				newStatus := pod.Status.DeepCopy()
				updated := apipod.UpdatePodCondition(newStatus, condition)
				if updated {
					if err := util.PatchPodStatus(ctx, cs, victim, newStatus); err != nil {
						logger.Error(err, "Could not add DisruptionTarget condition due to preemption", "pod", klog.KObj(victim), "preemptor", klog.KObj(pod))
						errCh.SendErrorWithCancel(err, cancel)
						return
					}
				}
			}
			if err := util.DeletePod(ctx, cs, victim); err != nil {
				logger.Error(err, "Preempted pod", "pod", klog.KObj(victim), "preemptor", klog.KObj(pod))
				errCh.SendErrorWithCancel(err, cancel)
				return
			}
			logger.V(2).Info("Preemptor Pod preempted victim Pod", "preemptor", klog.KObj(pod), "victim", klog.KObj(victim), "node", c.Name())
		}

		fh.EventRecorder().Eventf(victim, pod, v1.EventTypeNormal, "Preempted", "Preempting", "Preempted by pod %v on node %v", pod.UID, c.Name())
	}

	fh.Parallelizer().Until(ctx, len(c.Victims().Pods), preemptPod, ev.PluginName)
	if err := errCh.ReceiveError(); err != nil {
		return framework.AsStatus(err)
	}

	metrics.PreemptionVictims.Observe(float64(len(c.Victims().Pods)))

	// Lower priority pods nominated to run on this node may no longer fit on
	// this node. So, we should remove their nomination. Removing their
	// nomination updates these pods and moves them to the active queue. It
	// lets the scheduler find another place for them.
	nominatedPods := getLowerPriorityNominatedPods(logger, fh, pod, c.Name())
	if err := util.ClearNominatedNodeName(ctx, cs, nominatedPods...); err != nil {
		logger.Error(err, "Cannot clear 'NominatedNodeName' field")
		// We do not return as this error is not critical.
	}

	return nil
}

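// Illustrative sketch of what the DisruptionTarget condition added by
// prepareCandidate looks like on a victim Pod once patched, roughly as rendered
// by `kubectl get pod -o yaml` (the scheduler name "default-scheduler" is an
// example value, not something this file guarantees):
//
//	status:
//	  conditions:
//	  - type: DisruptionTarget
//	    status: "True"
//	    reason: PreemptionByScheduler
//	    message: "default-scheduler: preempting to accommodate a higher priority pod"
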
// nodesWherePreemptionMightHelp returns a list of nodes with failed predicates
// that may be satisfied by removing pods from the node.
func nodesWherePreemptionMightHelp(nodes []*framework.NodeInfo, m framework.NodeToStatusMap) ([]*framework.NodeInfo, framework.NodeToStatusMap) {
	var potentialNodes []*framework.NodeInfo
	nodeStatuses := make(framework.NodeToStatusMap)
	for _, node := range nodes {
		name := node.Node().Name
		// We rely on the status by each plugin - 'Unschedulable' or 'UnschedulableAndUnresolvable'
		// to determine whether preemption may help or not on the node.
		if m[name].Code() == framework.UnschedulableAndUnresolvable {
			nodeStatuses[node.Node().Name] = framework.NewStatus(framework.UnschedulableAndUnresolvable, "Preemption is not helpful for scheduling")
			continue
		}
		potentialNodes = append(potentialNodes, node)
	}
	return potentialNodes, nodeStatuses
}

func getPodDisruptionBudgets(pdbLister policylisters.PodDisruptionBudgetLister) ([]*policy.PodDisruptionBudget, error) {
	if pdbLister != nil {
		return pdbLister.List(labels.Everything())
	}
	return nil, nil
}

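// Illustrative walk-through of the tie-breaking cascade implemented by
// pickOneNodeForPreemption below (node names and numbers are made up):
//
//	node-a: victims violate 0 PDBs, highest victim priority 100
//	node-b: victims violate 0 PDBs, highest victim priority  50
//	node-c: victims violate 1 PDB,  highest victim priority  50
//
// The first criterion (fewest PDB violations) keeps node-a and node-b and drops
// node-c; the second (lowest highest-victim priority) keeps only node-b, so
// node-b is selected and the remaining criteria are never evaluated.
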
// pickOneNodeForPreemption chooses one node among the given nodes.
// It assumes pods in each map entry are ordered by decreasing priority.
// If scoreFuncs is not empty, it picks a node based on the scores that scoreFuncs return.
// If scoreFuncs is empty, it picks a node based on the following criteria:
//  1. A node with the minimum number of PDB violations.
//  2. A node with the minimum highest-priority victim is picked.
//  3. Ties are broken by the sum of priorities of all victims.
//  4. If there are still ties, the node with the minimum number of victims is picked.
//  5. If there are still ties, the node with the latest start time of all highest-priority victims is picked.
//  6. If there are still ties, the first such node is picked (sort of randomly).
func pickOneNodeForPreemption(logger klog.Logger, nodesToVictims map[string]*extenderv1.Victims, scoreFuncs []func(node string) int64) string {
	if len(nodesToVictims) == 0 {
		return ""
	}

	allCandidates := make([]string, 0, len(nodesToVictims))
	for node := range nodesToVictims {
		allCandidates = append(allCandidates, node)
	}

	if len(scoreFuncs) == 0 {
		minNumPDBViolatingScoreFunc := func(node string) int64 {
			// The smaller the NumPDBViolations, the higher the score.
			return -nodesToVictims[node].NumPDBViolations
		}
		minHighestPriorityScoreFunc := func(node string) int64 {
			// highestPodPriority is the highest priority among the victims on this node.
			highestPodPriority := corev1helpers.PodPriority(nodesToVictims[node].Pods[0])
			// The smaller the highestPodPriority, the higher the score.
			return -int64(highestPodPriority)
		}
		minSumPrioritiesScoreFunc := func(node string) int64 {
			var sumPriorities int64
			for _, pod := range nodesToVictims[node].Pods {
				// We add MaxInt32+1 to all priorities to make all of them >= 0. This is
				// needed so that a node with a few pods with negative priority is not
				// picked over a node with a smaller number of pods with the same negative
				// priority (and similar scenarios).
				sumPriorities += int64(corev1helpers.PodPriority(pod)) + int64(math.MaxInt32+1)
			}
			// The smaller the sumPriorities, the higher the score.
			return -sumPriorities
		}
		minNumPodsScoreFunc := func(node string) int64 {
			// The smaller the length of pods, the higher the score.
			return -int64(len(nodesToVictims[node].Pods))
		}
		latestStartTimeScoreFunc := func(node string) int64 {
			// Get the earliest start time of all pods on the current node.
			earliestStartTimeOnNode := util.GetEarliestPodStartTime(nodesToVictims[node])
			if earliestStartTimeOnNode == nil {
				logger.Error(errors.New("earliestStartTime is nil for node"), "Should not reach here", "node", node)
				return int64(math.MinInt64)
			}
			// The bigger the earliestStartTimeOnNode, the higher the score.
			return earliestStartTimeOnNode.UnixNano()
		}

		// Each scoreFunc scores the nodes according to specific rules and keeps the name of the node
		// with the highest score. If and only if the scoreFunc has more than one node with the highest
		// score, we will execute the next scoreFunc in order of precedence.
		scoreFuncs = []func(string) int64{
			// A node with a minimum number of PDB violations is preferable.
			minNumPDBViolatingScoreFunc,
			// A node with a minimum highest-priority victim is preferable.
			minHighestPriorityScoreFunc,
			// A node with the smallest sum of priorities is preferable.
			minSumPrioritiesScoreFunc,
			// A node with the minimum number of pods is preferable.
			minNumPodsScoreFunc,
			// A node with the latest start time of all highest-priority victims is preferable.
			latestStartTimeScoreFunc,
			// If there are still ties, then the first Node in the list is selected.
		}
	}

	for _, f := range scoreFuncs {
		selectedNodes := []string{}
		maxScore := int64(math.MinInt64)
		for _, node := range allCandidates {
			score := f(node)
			if score > maxScore {
				maxScore = score
				selectedNodes = []string{}
			}
			if score == maxScore {
				selectedNodes = append(selectedNodes, node)
			}
		}
		if len(selectedNodes) == 1 {
			return selectedNodes[0]
		}
		allCandidates = selectedNodes
	}

	return allCandidates[0]
}

// getLowerPriorityNominatedPods returns pods whose priority is smaller than the
// priority of the given "pod" and are nominated to run on the given node.
// Note: We could possibly check if the nominated lower priority pods still fit
// and return those that no longer fit, but that would require lots of
// manipulation of NodeInfo and PreFilter state per nominated pod. It may not be
// worth the complexity, especially because we generally expect to have a very
// small number of nominated pods per node.
func getLowerPriorityNominatedPods(logger klog.Logger, pn framework.PodNominator, pod *v1.Pod, nodeName string) []*v1.Pod {
	podInfos := pn.NominatedPodsForNode(nodeName)

	if len(podInfos) == 0 {
		return nil
	}

	var lowerPriorityPods []*v1.Pod
	podPriority := corev1helpers.PodPriority(pod)
	for _, pi := range podInfos {
		if corev1helpers.PodPriority(pi.Pod) < podPriority {
			lowerPriorityPods = append(lowerPriorityPods, pi.Pod)
		}
	}
	return lowerPriorityPods
}

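// Illustrative note for DryRunPreemption below (made-up numbers): candidate
// nodes are visited starting at the random offset and wrap around via
// (int(offset)+i) % len(potentialNodes). For example, with len(potentialNodes) == 5
// and offset == 3, worker index i = 0..4 visits nodes 3, 4, 0, 1, 2. Evaluation
// is cancelled early once numCandidates candidates (at least one of them not
// violating any PDB) have been collected, so the random offset spreads which
// nodes get dry-run across scheduling cycles.
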
// DryRunPreemption simulates Preemption logic on <potentialNodes> in parallel,
// returns preemption candidates and a map indicating filtered nodes statuses.
// The number of candidates depends on the constraints defined in the plugin's args. In the returned list of
// candidates, ones that do not violate PDB are preferred over ones that do.
// NOTE: This method is exported for easier testing in default preemption.
func (ev *Evaluator) DryRunPreemption(ctx context.Context, pod *v1.Pod, potentialNodes []*framework.NodeInfo,
	pdbs []*policy.PodDisruptionBudget, offset int32, numCandidates int32) ([]Candidate, framework.NodeToStatusMap, error) {
	fh := ev.Handler
	nonViolatingCandidates := newCandidateList(numCandidates)
	violatingCandidates := newCandidateList(numCandidates)
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	nodeStatuses := make(framework.NodeToStatusMap)
	var statusesLock sync.Mutex
	var errs []error
	checkNode := func(i int) {
		nodeInfoCopy := potentialNodes[(int(offset)+i)%len(potentialNodes)].Snapshot()
		stateCopy := ev.State.Clone()
		pods, numPDBViolations, status := ev.SelectVictimsOnNode(ctx, stateCopy, pod, nodeInfoCopy, pdbs)
		if status.IsSuccess() && len(pods) != 0 {
			victims := extenderv1.Victims{
				Pods:             pods,
				NumPDBViolations: int64(numPDBViolations),
			}
			c := &candidate{
				victims: &victims,
				name:    nodeInfoCopy.Node().Name,
			}
			if numPDBViolations == 0 {
				nonViolatingCandidates.add(c)
			} else {
				violatingCandidates.add(c)
			}
			nvcSize, vcSize := nonViolatingCandidates.size(), violatingCandidates.size()
			if nvcSize > 0 && nvcSize+vcSize >= numCandidates {
				cancel()
			}
			return
		}
		if status.IsSuccess() && len(pods) == 0 {
			status = framework.AsStatus(fmt.Errorf("expected at least one victim pod on node %q", nodeInfoCopy.Node().Name))
		}
		statusesLock.Lock()
		if status.Code() == framework.Error {
			errs = append(errs, status.AsError())
		}
		nodeStatuses[nodeInfoCopy.Node().Name] = status
		statusesLock.Unlock()
	}
	fh.Parallelizer().Until(ctx, len(potentialNodes), checkNode, ev.PluginName)
	return append(nonViolatingCandidates.get(), violatingCandidates.get()...), nodeStatuses, utilerrors.NewAggregate(errs)
}