k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption.go

/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package defaultpreemption

import (
	"context"
	"fmt"
	"math/rand"
	"sort"

	v1 "k8s.io/api/core/v1"
	policy "k8s.io/api/policy/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/client-go/informers"
	corelisters "k8s.io/client-go/listers/core/v1"
	policylisters "k8s.io/client-go/listers/policy/v1"
	corev1helpers "k8s.io/component-helpers/scheduling/corev1"
	"k8s.io/klog/v2"
	extenderv1 "k8s.io/kube-scheduler/extender/v1"
	"k8s.io/kubernetes/pkg/scheduler/apis/config"
	"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
	"k8s.io/kubernetes/pkg/scheduler/framework"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
	"k8s.io/kubernetes/pkg/scheduler/framework/preemption"
	"k8s.io/kubernetes/pkg/scheduler/metrics"
	"k8s.io/kubernetes/pkg/scheduler/util"
)

// Name of the plugin used in the plugin registry and configurations.
const Name = names.DefaultPreemption

// DefaultPreemption is a PostFilter plugin that implements the preemption logic.
type DefaultPreemption struct {
	fh        framework.Handle
	fts       feature.Features
	args      config.DefaultPreemptionArgs
	podLister corelisters.PodLister
	pdbLister policylisters.PodDisruptionBudgetLister
}

var _ framework.PostFilterPlugin = &DefaultPreemption{}

// Name returns the name of the plugin. It is used in logs, etc.
func (pl *DefaultPreemption) Name() string {
	return Name
}

// New initializes a new plugin and returns it.
func New(_ context.Context, dpArgs runtime.Object, fh framework.Handle, fts feature.Features) (framework.Plugin, error) {
	args, ok := dpArgs.(*config.DefaultPreemptionArgs)
	if !ok {
		return nil, fmt.Errorf("got args of type %T, want *DefaultPreemptionArgs", dpArgs)
	}
	if err := validation.ValidateDefaultPreemptionArgs(nil, args); err != nil {
		return nil, err
	}
	pl := DefaultPreemption{
		fh:        fh,
		fts:       fts,
		args:      *args,
		podLister: fh.SharedInformerFactory().Core().V1().Pods().Lister(),
		pdbLister: getPDBLister(fh.SharedInformerFactory()),
	}
	return &pl, nil
}
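// Illustrative note (a sketch, not part of the upstream file): New expects a
// *config.DefaultPreemptionArgs object, typically populated from the scheduler
// configuration. Assuming the commonly used defaults of 10% / 100 nodes:
//
//	args := &config.DefaultPreemptionArgs{
//		MinCandidateNodesPercentage: 10,
//		MinCandidateNodesAbsolute:   100,
//	}
//	plugin, err := New(ctx, args, fh, fts) // fh and fts are supplied by the framework
//
// ValidateDefaultPreemptionArgs rejects percentages outside [0, 100] and negative
// absolute values, so a misconfigured profile fails at plugin construction rather
// than at preemption time.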
// PostFilter is invoked at the postFilter extension point.
func (pl *DefaultPreemption) PostFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, m framework.NodeToStatusMap) (*framework.PostFilterResult, *framework.Status) {
	defer func() {
		metrics.PreemptionAttempts.Inc()
	}()

	pe := preemption.Evaluator{
		PluginName: names.DefaultPreemption,
		Handler:    pl.fh,
		PodLister:  pl.podLister,
		PdbLister:  pl.pdbLister,
		State:      state,
		Interface:  pl,
	}

	result, status := pe.Preempt(ctx, pod, m)
	msg := status.Message()
	if len(msg) > 0 {
		return result, framework.NewStatus(status.Code(), "preemption: "+msg)
	}
	return result, status
}

// calculateNumCandidates returns the number of candidates the FindCandidates
// method must produce from dry running based on the constraints given by
// <minCandidateNodesPercentage> and <minCandidateNodesAbsolute>. The number of
// candidates returned will never be greater than <numNodes>.
func (pl *DefaultPreemption) calculateNumCandidates(numNodes int32) int32 {
	n := (numNodes * pl.args.MinCandidateNodesPercentage) / 100
	if n < pl.args.MinCandidateNodesAbsolute {
		n = pl.args.MinCandidateNodesAbsolute
	}
	if n > numNodes {
		n = numNodes
	}
	return n
}

// GetOffsetAndNumCandidates chooses a random offset and calculates the number
// of candidates that should be shortlisted for dry running preemption.
func (pl *DefaultPreemption) GetOffsetAndNumCandidates(numNodes int32) (int32, int32) {
	return rand.Int31n(numNodes), pl.calculateNumCandidates(numNodes)
}
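// Worked example (assumed values, not part of the upstream file): with 500
// potential nodes, MinCandidateNodesPercentage=10 and MinCandidateNodesAbsolute=100,
// the percentage-based count is 500*10/100 = 50, which is raised to the absolute
// floor of 100 and then capped at numNodes, so
//
//	offset, n := pl.GetOffsetAndNumCandidates(500) // offset is in [0, 500), n == 100
//
// The random offset spreads dry-run preemption across the node list so that
// repeated attempts do not always start evaluating from the same nodes.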
// CandidatesToVictimsMap converts the given candidates to a map from node name to victims.
// This function is not applicable for out-of-tree preemption plugins that exercise
// different preemption candidates on the same nominated node.
func (pl *DefaultPreemption) CandidatesToVictimsMap(candidates []preemption.Candidate) map[string]*extenderv1.Victims {
	m := make(map[string]*extenderv1.Victims, len(candidates))
	for _, c := range candidates {
		m[c.Name()] = c.Victims()
	}
	return m
}

// SelectVictimsOnNode finds a minimum set of pods on the given node that should be
// preempted in order to make enough room for "pod" to be scheduled.
func (pl *DefaultPreemption) SelectVictimsOnNode(
	ctx context.Context,
	state *framework.CycleState,
	pod *v1.Pod,
	nodeInfo *framework.NodeInfo,
	pdbs []*policy.PodDisruptionBudget) ([]*v1.Pod, int, *framework.Status) {
	logger := klog.FromContext(ctx)
	var potentialVictims []*framework.PodInfo
	removePod := func(rpi *framework.PodInfo) error {
		if err := nodeInfo.RemovePod(logger, rpi.Pod); err != nil {
			return err
		}
		status := pl.fh.RunPreFilterExtensionRemovePod(ctx, state, pod, rpi, nodeInfo)
		if !status.IsSuccess() {
			return status.AsError()
		}
		return nil
	}
	addPod := func(api *framework.PodInfo) error {
		nodeInfo.AddPodInfo(api)
		status := pl.fh.RunPreFilterExtensionAddPod(ctx, state, pod, api, nodeInfo)
		if !status.IsSuccess() {
			return status.AsError()
		}
		return nil
	}
	// As the first step, remove all the lower priority pods from the node and
	// check if the given pod can be scheduled.
	podPriority := corev1helpers.PodPriority(pod)
	for _, pi := range nodeInfo.Pods {
		if corev1helpers.PodPriority(pi.Pod) < podPriority {
			potentialVictims = append(potentialVictims, pi)
			if err := removePod(pi); err != nil {
				return nil, 0, framework.AsStatus(err)
			}
		}
	}

	// No potential victims were found, so we don't need to evaluate the node again since its state didn't change.
	if len(potentialVictims) == 0 {
		return nil, 0, framework.NewStatus(framework.UnschedulableAndUnresolvable, "No preemption victims found for incoming pod")
	}

	// If the new pod does not fit after removing all the lower priority pods,
	// we are almost done and this node is not suitable for preemption. The only
	// condition that we could check is if the "pod" is failing to schedule due to
	// inter-pod affinity to one or more victims, but we have decided not to
	// support this case for performance reasons. Having affinity to lower
	// priority pods is not a recommended configuration anyway.
	if status := pl.fh.RunFilterPluginsWithNominatedPods(ctx, state, pod, nodeInfo); !status.IsSuccess() {
		return nil, 0, status
	}
	var victims []*v1.Pod
	numViolatingVictim := 0
	sort.Slice(potentialVictims, func(i, j int) bool { return util.MoreImportantPod(potentialVictims[i].Pod, potentialVictims[j].Pod) })
	// Try to reprieve as many pods as possible. We first try to reprieve the PDB
	// violating victims and then other non-violating ones. In both cases, we start
	// from the highest priority victims.
	violatingVictims, nonViolatingVictims := filterPodsWithPDBViolation(potentialVictims, pdbs)
	reprievePod := func(pi *framework.PodInfo) (bool, error) {
		if err := addPod(pi); err != nil {
			return false, err
		}
		status := pl.fh.RunFilterPluginsWithNominatedPods(ctx, state, pod, nodeInfo)
		fits := status.IsSuccess()
		if !fits {
			if err := removePod(pi); err != nil {
				return false, err
			}
			rpi := pi.Pod
			victims = append(victims, rpi)
			logger.V(5).Info("Pod is a potential preemption victim on node", "pod", klog.KObj(rpi), "node", klog.KObj(nodeInfo.Node()))
		}
		return fits, nil
	}
	for _, p := range violatingVictims {
		if fits, err := reprievePod(p); err != nil {
			return nil, 0, framework.AsStatus(err)
		} else if !fits {
			numViolatingVictim++
		}
	}
	// Now we try to reprieve non-violating victims.
	for _, p := range nonViolatingVictims {
		if _, err := reprievePod(p); err != nil {
			return nil, 0, framework.AsStatus(err)
		}
	}
	return victims, numViolatingVictim, framework.NewStatus(framework.Success)
}
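// Illustrative scenario (assumed values, not part of the upstream file): suppose the
// node runs three lower-priority pods A (whose PDB would be violated if it were
// evicted), B and C, and the incoming pod only needs the resources held by one of
// them. All three are removed first; reprieve is then attempted in the order A
// (violating victims first), then B and C (non-violating, most important first).
// If the pod still fits after A and B are added back, only C ends up in victims and
// numViolatingVictim stays 0.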
// PodEligibleToPreemptOthers returns one bool and one string. The bool
// indicates whether this pod should be considered for preempting other pods or
// not. The string includes the reason if this pod isn't eligible.
// There are several reasons:
//  1. The pod has a preemptionPolicy of Never.
//  2. The pod has already preempted other pods and the victims are in their graceful termination period.
//     Currently we check the node that is nominated for this pod, and as long as there are
//     terminating pods on this node, we don't attempt to preempt more pods.
func (pl *DefaultPreemption) PodEligibleToPreemptOthers(pod *v1.Pod, nominatedNodeStatus *framework.Status) (bool, string) {
	if pod.Spec.PreemptionPolicy != nil && *pod.Spec.PreemptionPolicy == v1.PreemptNever {
		return false, "not eligible due to preemptionPolicy=Never."
	}

	nodeInfos := pl.fh.SnapshotSharedLister().NodeInfos()
	nomNodeName := pod.Status.NominatedNodeName
	if len(nomNodeName) > 0 {
		// If the pod's nominated node is considered as UnschedulableAndUnresolvable by the filters,
		// then the pod should be considered for preempting again.
		if nominatedNodeStatus.Code() == framework.UnschedulableAndUnresolvable {
			return true, ""
		}

		if nodeInfo, _ := nodeInfos.Get(nomNodeName); nodeInfo != nil {
			podPriority := corev1helpers.PodPriority(pod)
			for _, p := range nodeInfo.Pods {
				if corev1helpers.PodPriority(p.Pod) < podPriority && podTerminatingByPreemption(p.Pod, pl.fts.EnablePodDisruptionConditions) {
					// There is a terminating pod on the nominated node.
					return false, "not eligible due to a terminating pod on the nominated node."
				}
			}
		}
	}
	return true, ""
}

// OrderedScoreFuncs returns a list of ordered score functions to select the preferable node where victims will be preempted.
func (pl *DefaultPreemption) OrderedScoreFuncs(ctx context.Context, nodesToVictims map[string]*extenderv1.Victims) []func(node string) int64 {
	return nil
}

// podTerminatingByPreemption returns the pod's terminating state if the feature PodDisruptionConditions is not enabled.
// Otherwise, it additionally checks if the termination state is caused by scheduler preemption.
func podTerminatingByPreemption(p *v1.Pod, enablePodDisruptionConditions bool) bool {
	if p.DeletionTimestamp == nil {
		return false
	}

	if !enablePodDisruptionConditions {
		return true
	}

	for _, condition := range p.Status.Conditions {
		if condition.Type == v1.DisruptionTarget {
			return condition.Status == v1.ConditionTrue && condition.Reason == v1.PodReasonPreemptionByScheduler
		}
	}
	return false
}
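// Illustrative sketch (assumed values, not part of the upstream file): with
// PodDisruptionConditions enabled, only a deletion marked by the scheduler's own
// preemption counts, e.g.:
//
//	p := &v1.Pod{
//		ObjectMeta: metav1.ObjectMeta{DeletionTimestamp: &metav1.Time{Time: time.Now()}},
//		Status: v1.PodStatus{Conditions: []v1.PodCondition{{
//			Type:   v1.DisruptionTarget,
//			Status: v1.ConditionTrue,
//			Reason: v1.PodReasonPreemptionByScheduler,
//		}}},
//	}
//	podTerminatingByPreemption(p, true) // true
//
// A pod being deleted for another reason (for example via the eviction API) carries a
// different DisruptionTarget reason and is therefore not counted as a prior
// preemption victim by PodEligibleToPreemptOthers.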
// filterPodsWithPDBViolation groups the given "pods" into two groups of "violatingPods"
// and "nonViolatingPods" based on whether their PDBs will be violated if they are
// preempted.
// This function is stable and does not change the order of received pods. So, if it
// receives a sorted list, grouping will preserve the order of the input list.
func filterPodsWithPDBViolation(podInfos []*framework.PodInfo, pdbs []*policy.PodDisruptionBudget) (violatingPodInfos, nonViolatingPodInfos []*framework.PodInfo) {
	pdbsAllowed := make([]int32, len(pdbs))
	for i, pdb := range pdbs {
		pdbsAllowed[i] = pdb.Status.DisruptionsAllowed
	}

	for _, podInfo := range podInfos {
		pod := podInfo.Pod
		pdbForPodIsViolated := false
		// A pod with no labels will not match any PDB. So, no need to check.
		if len(pod.Labels) != 0 {
			for i, pdb := range pdbs {
				if pdb.Namespace != pod.Namespace {
					continue
				}
				selector, err := metav1.LabelSelectorAsSelector(pdb.Spec.Selector)
				if err != nil {
					// This object has an invalid selector; it does not match the pod.
					continue
				}
				// A PDB with a nil or empty selector matches nothing.
				if selector.Empty() || !selector.Matches(labels.Set(pod.Labels)) {
					continue
				}

				// Existing in DisruptedPods means it has been processed in the API server,
				// so we don't treat it as a violating case.
				if _, exist := pdb.Status.DisruptedPods[pod.Name]; exist {
					continue
				}
				// Only decrement the matched pdb when it's not in its <DisruptedPods>;
				// otherwise we may over-decrement the budget number.
				pdbsAllowed[i]--
				// We have found a matching PDB.
				if pdbsAllowed[i] < 0 {
					pdbForPodIsViolated = true
				}
			}
		}
		if pdbForPodIsViolated {
			violatingPodInfos = append(violatingPodInfos, podInfo)
		} else {
			nonViolatingPodInfos = append(nonViolatingPodInfos, podInfo)
		}
	}
	return violatingPodInfos, nonViolatingPodInfos
}

func getPDBLister(informerFactory informers.SharedInformerFactory) policylisters.PodDisruptionBudgetLister {
	return informerFactory.Policy().V1().PodDisruptionBudgets().Lister()
}
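// Worked example for filterPodsWithPDBViolation (assumed values, not part of the
// upstream file): a PDB with Status.DisruptionsAllowed=1 that matches three potential
// victims in the same namespace. The first matching pod decrements the remaining
// budget to 0 and is still grouped as non-violating; the second and third drive the
// budget negative and are grouped as violating. Pods already listed in
// Status.DisruptedPods are skipped so their disruption is not counted twice.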