k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/scheduler/schedule_one.go

/*
Copyright 2014 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package scheduler

import (
	"container/heap"
	"context"
	"errors"
	"fmt"
	"math/rand"
	"strconv"
	"sync"
	"sync/atomic"
	"time"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/sets"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/klog/v2"
	extenderv1 "k8s.io/kube-scheduler/extender/v1"
	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/kubernetes/pkg/apis/core/validation"
	"k8s.io/kubernetes/pkg/scheduler/framework"
	"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
	internalqueue "k8s.io/kubernetes/pkg/scheduler/internal/queue"
	"k8s.io/kubernetes/pkg/scheduler/metrics"
	"k8s.io/kubernetes/pkg/scheduler/util"
	utiltrace "k8s.io/utils/trace"
)

const (
	// Percentage of plugin metrics to be sampled.
	pluginMetricsSamplePercent = 10
	// minFeasibleNodesToFind is the minimum number of nodes that would be scored
	// in each scheduling cycle. This is a semi-arbitrary value to ensure that a
	// certain minimum of nodes are checked for feasibility. This in turn helps
	// ensure a minimum level of spreading.
	minFeasibleNodesToFind = 100
	// minFeasibleNodesPercentageToFind is the minimum percentage of nodes that
	// would be scored in each scheduling cycle. This is a semi-arbitrary value
	// to ensure that a certain minimum of nodes are checked for feasibility.
	// This in turn helps ensure a minimum level of spreading.
	minFeasibleNodesPercentageToFind = 5
	// numberOfHighestScoredNodesToReport is the number of node scores
	// to be included in ScheduleResult.
	numberOfHighestScoredNodesToReport = 3
)

// ScheduleOne does the entire scheduling workflow for a single pod. It is serialized on the scheduling algorithm's host fitting.
func (sched *Scheduler) ScheduleOne(ctx context.Context) {
	logger := klog.FromContext(ctx)
	podInfo, err := sched.NextPod(logger)
	if err != nil {
		logger.Error(err, "Error while retrieving next pod from scheduling queue")
		return
	}
	// pod could be nil when schedulerQueue is closed
	if podInfo == nil || podInfo.Pod == nil {
		return
	}

	pod := podInfo.Pod
	// TODO(knelasevero): Remove duplicated keys from log entry calls
	// When contextualized logging hits GA
	// https://github.com/kubernetes/kubernetes/issues/111672
	logger = klog.LoggerWithValues(logger, "pod", klog.KObj(pod))
	ctx = klog.NewContext(ctx, logger)
	logger.V(4).Info("About to try and schedule pod", "pod", klog.KObj(pod))

	fwk, err := sched.frameworkForPod(pod)
	if err != nil {
		// This shouldn't happen, because we only accept for scheduling the pods
		// which specify a scheduler name that matches one of the profiles.
		logger.Error(err, "Error occurred")
		return
	}
	if sched.skipPodSchedule(ctx, fwk, pod) {
		return
	}

	logger.V(3).Info("Attempting to schedule pod", "pod", klog.KObj(pod))

	// Synchronously attempt to find a fit for the pod.
	start := time.Now()
	state := framework.NewCycleState()
	state.SetRecordPluginMetrics(rand.Intn(100) < pluginMetricsSamplePercent)

	// Initialize an empty podsToActivate struct, which will be filled up by plugins or stay empty.
	podsToActivate := framework.NewPodsToActivate()
	state.Write(framework.PodsToActivateKey, podsToActivate)

	schedulingCycleCtx, cancel := context.WithCancel(ctx)
	defer cancel()

	scheduleResult, assumedPodInfo, status := sched.schedulingCycle(schedulingCycleCtx, state, fwk, podInfo, start, podsToActivate)
	if !status.IsSuccess() {
		sched.FailureHandler(schedulingCycleCtx, fwk, assumedPodInfo, status, scheduleResult.nominatingInfo, start)
		return
	}

	// bind the pod to its host asynchronously (we can do this b/c of the assumption step above).
	go func() {
		bindingCycleCtx, cancel := context.WithCancel(ctx)
		defer cancel()

		metrics.Goroutines.WithLabelValues(metrics.Binding).Inc()
		defer metrics.Goroutines.WithLabelValues(metrics.Binding).Dec()

		status := sched.bindingCycle(bindingCycleCtx, state, fwk, scheduleResult, assumedPodInfo, start, podsToActivate)
		if !status.IsSuccess() {
			sched.handleBindingCycleError(bindingCycleCtx, state, fwk, assumedPodInfo, start, scheduleResult, status)
			return
		}
		// Usually, DonePod is called inside the scheduling queue,
		// but in this case, we need to call it here because this Pod won't go back to the scheduling queue.
		sched.SchedulingQueue.Done(assumedPodInfo.Pod.UID)
	}()
}

var clearNominatedNode = &framework.NominatingInfo{NominatingMode: framework.ModeOverride, NominatedNodeName: ""}

// schedulingCycle tries to schedule a single Pod.
func (sched *Scheduler) schedulingCycle(
	ctx context.Context,
	state *framework.CycleState,
	fwk framework.Framework,
	podInfo *framework.QueuedPodInfo,
	start time.Time,
	podsToActivate *framework.PodsToActivate,
) (ScheduleResult, *framework.QueuedPodInfo, *framework.Status) {
	logger := klog.FromContext(ctx)
	pod := podInfo.Pod
	scheduleResult, err := sched.SchedulePod(ctx, fwk, state, pod)
	if err != nil {
		defer func() {
			metrics.SchedulingAlgorithmLatency.Observe(metrics.SinceInSeconds(start))
		}()
		if err == ErrNoNodesAvailable {
			status := framework.NewStatus(framework.UnschedulableAndUnresolvable).WithError(err)
			return ScheduleResult{nominatingInfo: clearNominatedNode}, podInfo, status
		}

		fitError, ok := err.(*framework.FitError)
		if !ok {
			logger.Error(err, "Error selecting node for pod", "pod", klog.KObj(pod))
			return ScheduleResult{nominatingInfo: clearNominatedNode}, podInfo, framework.AsStatus(err)
		}

		// SchedulePod() may have failed because the pod would not fit on any host, so we try to
		// preempt, with the expectation that the next time the pod is tried for scheduling it
		// will fit due to the preemption. It is also possible that a different pod will schedule
		// into the resources that were preempted, but this is harmless.

		if !fwk.HasPostFilterPlugins() {
			logger.V(3).Info("No PostFilter plugins are registered, so no preemption will be performed")
			return ScheduleResult{}, podInfo, framework.NewStatus(framework.Unschedulable).WithError(err)
		}

		// Run PostFilter plugins to attempt to make the pod schedulable in a future scheduling cycle.
		result, status := fwk.RunPostFilterPlugins(ctx, state, pod, fitError.Diagnosis.NodeToStatusMap)
		msg := status.Message()
		fitError.Diagnosis.PostFilterMsg = msg
		if status.Code() == framework.Error {
			logger.Error(nil, "Status after running PostFilter plugins for pod", "pod", klog.KObj(pod), "status", msg)
		} else {
			logger.V(5).Info("Status after running PostFilter plugins for pod", "pod", klog.KObj(pod), "status", msg)
		}

		var nominatingInfo *framework.NominatingInfo
		if result != nil {
			nominatingInfo = result.NominatingInfo
		}
		return ScheduleResult{nominatingInfo: nominatingInfo}, podInfo, framework.NewStatus(framework.Unschedulable).WithError(err)
	}

	metrics.SchedulingAlgorithmLatency.Observe(metrics.SinceInSeconds(start))
	// Tell the cache to assume that a pod now is running on a given node, even though it hasn't been bound yet.
	// This allows us to keep scheduling without waiting on binding to occur.
	assumedPodInfo := podInfo.DeepCopy()
	assumedPod := assumedPodInfo.Pod
	// assume modifies `assumedPod` by setting NodeName=scheduleResult.SuggestedHost
	err = sched.assume(logger, assumedPod, scheduleResult.SuggestedHost)
	if err != nil {
		// This is most probably a result of a BUG in the retrying logic.
		// We report an error here so that pod scheduling can be retried.
		// This relies on the fact that Error will check if the pod has been bound
		// to a node and if so will not add it back to the unscheduled pods queue
		// (otherwise this would cause an infinite loop).
		return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, framework.AsStatus(err)
	}

	// Run the Reserve method of reserve plugins.
	if sts := fwk.RunReservePluginsReserve(ctx, state, assumedPod, scheduleResult.SuggestedHost); !sts.IsSuccess() {
		// trigger un-reserve to clean up state associated with the reserved Pod
		fwk.RunReservePluginsUnreserve(ctx, state, assumedPod, scheduleResult.SuggestedHost)
		if forgetErr := sched.Cache.ForgetPod(logger, assumedPod); forgetErr != nil {
			logger.Error(forgetErr, "Scheduler cache ForgetPod failed")
		}

		if sts.IsRejected() {
			fitErr := &framework.FitError{
				NumAllNodes: 1,
				Pod:         pod,
				Diagnosis: framework.Diagnosis{
					NodeToStatusMap: framework.NodeToStatusMap{scheduleResult.SuggestedHost: sts},
				},
			}
			fitErr.Diagnosis.AddPluginStatus(sts)
			return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, framework.NewStatus(sts.Code()).WithError(fitErr)
		}
		return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, sts
	}

	// Run "permit" plugins.
	runPermitStatus := fwk.RunPermitPlugins(ctx, state, assumedPod, scheduleResult.SuggestedHost)
	if !runPermitStatus.IsWait() && !runPermitStatus.IsSuccess() {
		// trigger un-reserve to clean up state associated with the reserved Pod
		fwk.RunReservePluginsUnreserve(ctx, state, assumedPod, scheduleResult.SuggestedHost)
		if forgetErr := sched.Cache.ForgetPod(logger, assumedPod); forgetErr != nil {
			logger.Error(forgetErr, "Scheduler cache ForgetPod failed")
		}

		if runPermitStatus.IsRejected() {
			fitErr := &framework.FitError{
				NumAllNodes: 1,
				Pod:         pod,
				Diagnosis: framework.Diagnosis{
					NodeToStatusMap: framework.NodeToStatusMap{scheduleResult.SuggestedHost: runPermitStatus},
				},
			}
			fitErr.Diagnosis.AddPluginStatus(runPermitStatus)
			return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, framework.NewStatus(runPermitStatus.Code()).WithError(fitErr)
		}

		return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, runPermitStatus
	}

	// At the end of a successful scheduling cycle, pop and move up Pods if needed.
	if len(podsToActivate.Map) != 0 {
		sched.SchedulingQueue.Activate(logger, podsToActivate.Map)
		// Clear the entries after activation.
		podsToActivate.Map = make(map[string]*v1.Pod)
	}

	return scheduleResult, assumedPodInfo, nil
}
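
// NOTE: RunPermitPlugins in schedulingCycle above may leave the Pod in a waiting
// state rather than allowing or rejecting it outright; the actual wait happens in
// bindingCycle below via WaitOnPermit, which runs in the separate binding goroutine,
// so a long Permit wait does not block the scheduling of other Pods.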

// bindingCycle tries to bind an assumed Pod.
func (sched *Scheduler) bindingCycle(
	ctx context.Context,
	state *framework.CycleState,
	fwk framework.Framework,
	scheduleResult ScheduleResult,
	assumedPodInfo *framework.QueuedPodInfo,
	start time.Time,
	podsToActivate *framework.PodsToActivate) *framework.Status {
	logger := klog.FromContext(ctx)

	assumedPod := assumedPodInfo.Pod

	// Run "permit" plugins.
	if status := fwk.WaitOnPermit(ctx, assumedPod); !status.IsSuccess() {
		if status.IsRejected() {
			fitErr := &framework.FitError{
				NumAllNodes: 1,
				Pod:         assumedPodInfo.Pod,
				Diagnosis: framework.Diagnosis{
					NodeToStatusMap:      framework.NodeToStatusMap{scheduleResult.SuggestedHost: status},
					UnschedulablePlugins: sets.New(status.Plugin()),
				},
			}
			return framework.NewStatus(status.Code()).WithError(fitErr)
		}
		return status
	}

	// Run "prebind" plugins.
	if status := fwk.RunPreBindPlugins(ctx, state, assumedPod, scheduleResult.SuggestedHost); !status.IsSuccess() {
		return status
	}

	// Run "bind" plugins.
	if status := sched.bind(ctx, fwk, assumedPod, scheduleResult.SuggestedHost, state); !status.IsSuccess() {
		return status
	}

	// Calculating nodeResourceString can be heavy. Avoid it if klog verbosity is below 2.
	logger.V(2).Info("Successfully bound pod to node", "pod", klog.KObj(assumedPod), "node", scheduleResult.SuggestedHost, "evaluatedNodes", scheduleResult.EvaluatedNodes, "feasibleNodes", scheduleResult.FeasibleNodes)
	metrics.PodScheduled(fwk.ProfileName(), metrics.SinceInSeconds(start))
	metrics.PodSchedulingAttempts.Observe(float64(assumedPodInfo.Attempts))
	if assumedPodInfo.InitialAttemptTimestamp != nil {
		metrics.PodSchedulingDuration.WithLabelValues(getAttemptsLabel(assumedPodInfo)).Observe(metrics.SinceInSeconds(*assumedPodInfo.InitialAttemptTimestamp))
		metrics.PodSchedulingSLIDuration.WithLabelValues(getAttemptsLabel(assumedPodInfo)).Observe(metrics.SinceInSeconds(*assumedPodInfo.InitialAttemptTimestamp))
	}
	// Run "postbind" plugins.
	fwk.RunPostBindPlugins(ctx, state, assumedPod, scheduleResult.SuggestedHost)

	// At the end of a successful binding cycle, move up Pods if needed.
	if len(podsToActivate.Map) != 0 {
		sched.SchedulingQueue.Activate(logger, podsToActivate.Map)
		// Unlike the logic in schedulingCycle(), we don't bother deleting the entries
		// as `podsToActivate.Map` is no longer consumed.
	}

	return nil
}

func (sched *Scheduler) handleBindingCycleError(
	ctx context.Context,
	state *framework.CycleState,
	fwk framework.Framework,
	podInfo *framework.QueuedPodInfo,
	start time.Time,
	scheduleResult ScheduleResult,
	status *framework.Status) {
	logger := klog.FromContext(ctx)

	assumedPod := podInfo.Pod
	// trigger un-reserve plugins to clean up state associated with the reserved Pod
	fwk.RunReservePluginsUnreserve(ctx, state, assumedPod, scheduleResult.SuggestedHost)
	if forgetErr := sched.Cache.ForgetPod(logger, assumedPod); forgetErr != nil {
		logger.Error(forgetErr, "scheduler cache ForgetPod failed")
	} else {
		// "Forget"ing an assumed Pod in binding cycle should be treated as a PodDelete event,
		// as the assumed Pod had occupied a certain amount of resources in scheduler cache.
		//
		// Avoid moving the assumed Pod itself as it's always Unschedulable.
		// It's intentional to "defer" this operation; otherwise MoveAllToActiveOrBackoffQueue() would
		// add this event to in-flight events and thus move the assumed pod to backoffQ anyways if the plugins don't have appropriate QueueingHint.
		if status.IsRejected() {
			defer sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, internalqueue.AssignedPodDelete, assumedPod, nil, func(pod *v1.Pod) bool {
				return assumedPod.UID != pod.UID
			})
		} else {
			sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, internalqueue.AssignedPodDelete, assumedPod, nil, nil)
		}
	}

	sched.FailureHandler(ctx, fwk, podInfo, status, clearNominatedNode, start)
}

func (sched *Scheduler) frameworkForPod(pod *v1.Pod) (framework.Framework, error) {
	fwk, ok := sched.Profiles[pod.Spec.SchedulerName]
	if !ok {
		return nil, fmt.Errorf("profile not found for scheduler name %q", pod.Spec.SchedulerName)
	}
	return fwk, nil
}

// skipPodSchedule returns true if we could skip scheduling the pod for specified cases.
func (sched *Scheduler) skipPodSchedule(ctx context.Context, fwk framework.Framework, pod *v1.Pod) bool {
	// Case 1: pod is being deleted.
	if pod.DeletionTimestamp != nil {
		fwk.EventRecorder().Eventf(pod, nil, v1.EventTypeWarning, "FailedScheduling", "Scheduling", "skip schedule deleting pod: %v/%v", pod.Namespace, pod.Name)
		klog.FromContext(ctx).V(3).Info("Skip schedule deleting pod", "pod", klog.KObj(pod))
		return true
	}

	// Case 2: pod that has been assumed could be skipped.
	// An assumed pod can be added again to the scheduling queue if it got an update event
	// during its previous scheduling cycle but before getting assumed.
	isAssumed, err := sched.Cache.IsAssumedPod(pod)
	if err != nil {
		// TODO(91633): pass ctx into a revised HandleError
		utilruntime.HandleError(fmt.Errorf("failed to check whether pod %s/%s is assumed: %v", pod.Namespace, pod.Name, err))
		return false
	}
	return isAssumed
}

// schedulePod tries to schedule the given pod to one of the nodes in the node list.
// If it succeeds, it will return the name of the node.
// If it fails, it will return a FitError with reasons.
func (sched *Scheduler) schedulePod(ctx context.Context, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) (result ScheduleResult, err error) {
	trace := utiltrace.New("Scheduling", utiltrace.Field{Key: "namespace", Value: pod.Namespace}, utiltrace.Field{Key: "name", Value: pod.Name})
	defer trace.LogIfLong(100 * time.Millisecond)
	if err := sched.Cache.UpdateSnapshot(klog.FromContext(ctx), sched.nodeInfoSnapshot); err != nil {
		return result, err
	}
	trace.Step("Snapshotting scheduler cache and node infos done")

	if sched.nodeInfoSnapshot.NumNodes() == 0 {
		return result, ErrNoNodesAvailable
	}

	feasibleNodes, diagnosis, err := sched.findNodesThatFitPod(ctx, fwk, state, pod)
	if err != nil {
		return result, err
	}
	trace.Step("Computing predicates done")

	if len(feasibleNodes) == 0 {
		return result, &framework.FitError{
			Pod:         pod,
			NumAllNodes: sched.nodeInfoSnapshot.NumNodes(),
			Diagnosis:   diagnosis,
		}
	}

	// When only one node remains after filtering, just use it.
	if len(feasibleNodes) == 1 {
		return ScheduleResult{
			SuggestedHost:  feasibleNodes[0].Node().Name,
			EvaluatedNodes: diagnosis.EvaluatedNodes,
			FeasibleNodes:  1,
		}, nil
	}

	priorityList, err := prioritizeNodes(ctx, sched.Extenders, fwk, state, pod, feasibleNodes)
	if err != nil {
		return result, err
	}

	host, _, err := selectHost(priorityList, numberOfHighestScoredNodesToReport)
	trace.Step("Prioritizing done")

	return ScheduleResult{
		SuggestedHost:  host,
		EvaluatedNodes: diagnosis.EvaluatedNodes,
		FeasibleNodes:  len(feasibleNodes),
	}, err
}

// Filters the nodes to find the ones that fit the pod based on the framework
// filter plugins and filter extenders.
func (sched *Scheduler) findNodesThatFitPod(ctx context.Context, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) ([]*framework.NodeInfo, framework.Diagnosis, error) {
	logger := klog.FromContext(ctx)

	allNodes, err := sched.nodeInfoSnapshot.NodeInfos().List()
	if err != nil {
		return nil, framework.Diagnosis{}, err
	}

	diagnosis := framework.Diagnosis{
		NodeToStatusMap: make(framework.NodeToStatusMap, len(allNodes)),
	}
	// Run "prefilter" plugins.
	preRes, s := fwk.RunPreFilterPlugins(ctx, state, pod)
	if !s.IsSuccess() {
		if !s.IsRejected() {
			return nil, diagnosis, s.AsError()
		}
		// All nodes in NodeToStatusMap will have the same status so that they can be handled in the preemption.
		// Some non-trivial refactoring is needed to avoid this copy.
		for _, n := range allNodes {
			diagnosis.NodeToStatusMap[n.Node().Name] = s
		}

		// Record the messages from PreFilter in Diagnosis.PreFilterMsg.
		msg := s.Message()
		diagnosis.PreFilterMsg = msg
		logger.V(5).Info("Status after running PreFilter plugins for pod", "pod", klog.KObj(pod), "status", msg)
		diagnosis.AddPluginStatus(s)
		return nil, diagnosis, nil
	}

	// "NominatedNodeName" can potentially be set in a previous scheduling cycle as a result of preemption.
	// This node is likely the only candidate that will fit the pod, and hence we try it first before iterating over all nodes.
	if len(pod.Status.NominatedNodeName) > 0 {
		feasibleNodes, err := sched.evaluateNominatedNode(ctx, pod, fwk, state, diagnosis)
		if err != nil {
			logger.Error(err, "Evaluation failed on nominated node", "pod", klog.KObj(pod), "node", pod.Status.NominatedNodeName)
		}
		// Nominated node passes all the filters, scheduler is good to assign this node to the pod.
		if len(feasibleNodes) != 0 {
			return feasibleNodes, diagnosis, nil
		}
	}

	nodes := allNodes
	if !preRes.AllNodes() {
		nodes = make([]*framework.NodeInfo, 0, len(preRes.NodeNames))
		for _, n := range allNodes {
			if !preRes.NodeNames.Has(n.Node().Name) {
				// We consider Nodes that are filtered out by PreFilterResult as rejected via UnschedulableAndUnresolvable.
				// We have to record them in NodeToStatusMap so that they won't be considered as candidates in the preemption.
				diagnosis.NodeToStatusMap[n.Node().Name] = framework.NewStatus(framework.UnschedulableAndUnresolvable, "node is filtered out by the prefilter result")
				continue
			}
			nodes = append(nodes, n)
		}
	}
	feasibleNodes, err := sched.findNodesThatPassFilters(ctx, fwk, state, pod, &diagnosis, nodes)
	// always try to update the sched.nextStartNodeIndex regardless of whether an error has occurred
	// this is helpful to make sure that all the nodes have a chance to be searched
	processedNodes := len(feasibleNodes) + len(diagnosis.NodeToStatusMap)
	sched.nextStartNodeIndex = (sched.nextStartNodeIndex + processedNodes) % len(nodes)
	if err != nil {
		return nil, diagnosis, err
	}

	feasibleNodesAfterExtender, err := findNodesThatPassExtenders(ctx, sched.Extenders, pod, feasibleNodes, diagnosis.NodeToStatusMap)
	if err != nil {
		return nil, diagnosis, err
	}
	if len(feasibleNodesAfterExtender) != len(feasibleNodes) {
		// Extenders filtered out some nodes.
		//
		// Extender doesn't support any kind of requeueing feature like EnqueueExtensions in the scheduling framework.
		// When Extenders reject some Nodes and the pod ends up being unschedulable,
		// we put framework.ExtenderName to pInfo.UnschedulablePlugins.
		// This Pod will be requeued from unschedulable pod pool to activeQ/backoffQ
		// by any kind of cluster events.
		// https://github.com/kubernetes/kubernetes/issues/122019
		if diagnosis.UnschedulablePlugins == nil {
			diagnosis.UnschedulablePlugins = sets.New[string]()
		}
		diagnosis.UnschedulablePlugins.Insert(framework.ExtenderName)
	}

	return feasibleNodesAfterExtender, diagnosis, nil
}

func (sched *Scheduler) evaluateNominatedNode(ctx context.Context, pod *v1.Pod, fwk framework.Framework, state *framework.CycleState, diagnosis framework.Diagnosis) ([]*framework.NodeInfo, error) {
	nnn := pod.Status.NominatedNodeName
	nodeInfo, err := sched.nodeInfoSnapshot.Get(nnn)
	if err != nil {
		return nil, err
	}
	node := []*framework.NodeInfo{nodeInfo}
	feasibleNodes, err := sched.findNodesThatPassFilters(ctx, fwk, state, pod, &diagnosis, node)
	if err != nil {
		return nil, err
	}

	feasibleNodes, err = findNodesThatPassExtenders(ctx, sched.Extenders, pod, feasibleNodes, diagnosis.NodeToStatusMap)
	if err != nil {
		return nil, err
	}

	return feasibleNodes, nil
}
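
// NOTE: evaluateNominatedNode runs the filters against only the single nominated
// node. If that node turns out to be infeasible (for example, because preempted
// victims have not finished their graceful termination yet), findNodesThatFitPod
// falls back to evaluating the remaining nodes.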

// hasScoring checks if scoring nodes is configured.
func (sched *Scheduler) hasScoring(fwk framework.Framework) bool {
	if fwk.HasScorePlugins() {
		return true
	}
	for _, extender := range sched.Extenders {
		if extender.IsPrioritizer() {
			return true
		}
	}
	return false
}

// hasExtenderFilters checks if any extenders filter nodes.
func (sched *Scheduler) hasExtenderFilters() bool {
	for _, extender := range sched.Extenders {
		if extender.IsFilter() {
			return true
		}
	}
	return false
}

// findNodesThatPassFilters finds the nodes that fit the filter plugins.
func (sched *Scheduler) findNodesThatPassFilters(
	ctx context.Context,
	fwk framework.Framework,
	state *framework.CycleState,
	pod *v1.Pod,
	diagnosis *framework.Diagnosis,
	nodes []*framework.NodeInfo) ([]*framework.NodeInfo, error) {
	numAllNodes := len(nodes)
	numNodesToFind := sched.numFeasibleNodesToFind(fwk.PercentageOfNodesToScore(), int32(numAllNodes))
	if !sched.hasExtenderFilters() && !sched.hasScoring(fwk) {
		numNodesToFind = 1
	}

	// Create feasible list with enough space to avoid growing it
	// and allow assigning.
	feasibleNodes := make([]*framework.NodeInfo, numNodesToFind)

	if !fwk.HasFilterPlugins() {
		for i := range feasibleNodes {
			feasibleNodes[i] = nodes[(sched.nextStartNodeIndex+i)%numAllNodes]
		}
		diagnosis.EvaluatedNodes = int(numNodesToFind)
		return feasibleNodes, nil
	}

	errCh := parallelize.NewErrorChannel()
	var feasibleNodesLen int32
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	type nodeStatus struct {
		node   string
		status *framework.Status
	}
	result := make([]*nodeStatus, numAllNodes)
	checkNode := func(i int) {
		// We check the nodes starting from where we left off in the previous scheduling cycle;
		// this is to make sure all nodes have the same chance of being examined across pods.
		nodeInfo := nodes[(sched.nextStartNodeIndex+i)%numAllNodes]
		status := fwk.RunFilterPluginsWithNominatedPods(ctx, state, pod, nodeInfo)
		if status.Code() == framework.Error {
			errCh.SendErrorWithCancel(status.AsError(), cancel)
			return
		}
		if status.IsSuccess() {
			length := atomic.AddInt32(&feasibleNodesLen, 1)
			if length > numNodesToFind {
				cancel()
				atomic.AddInt32(&feasibleNodesLen, -1)
			} else {
				feasibleNodes[length-1] = nodeInfo
			}
		} else {
			result[i] = &nodeStatus{node: nodeInfo.Node().Name, status: status}
		}
	}

	beginCheckNode := time.Now()
	statusCode := framework.Success
	defer func() {
		// We record Filter extension point latency here instead of in framework.go because framework.RunFilterPlugins
		// function is called for each node, whereas we want to have an overall latency for all nodes per scheduling cycle.
		// Note that this latency also includes latency for `addNominatedPods`, which calls framework.RunPreFilterAddPod.
		metrics.FrameworkExtensionPointDuration.WithLabelValues(metrics.Filter, statusCode.String(), fwk.ProfileName()).Observe(metrics.SinceInSeconds(beginCheckNode))
	}()

	// Stops searching for more nodes once the configured number of feasible nodes
	// are found.
	fwk.Parallelizer().Until(ctx, numAllNodes, checkNode, metrics.Filter)
	feasibleNodes = feasibleNodes[:feasibleNodesLen]
	diagnosis.EvaluatedNodes = int(feasibleNodesLen)
	for _, item := range result {
		if item == nil {
			continue
		}
		diagnosis.NodeToStatusMap[item.node] = item.status
		diagnosis.EvaluatedNodes++
		diagnosis.AddPluginStatus(item.status)
	}
	if err := errCh.ReceiveError(); err != nil {
		statusCode = framework.Error
		return feasibleNodes, err
	}
	return feasibleNodes, nil
}

// numFeasibleNodesToFind returns the number of feasible nodes that, once found,
// make the scheduler stop searching for more feasible nodes.
func (sched *Scheduler) numFeasibleNodesToFind(percentageOfNodesToScore *int32, numAllNodes int32) (numNodes int32) {
	if numAllNodes < minFeasibleNodesToFind {
		return numAllNodes
	}

	// Use profile percentageOfNodesToScore if it's set. Otherwise, use global percentageOfNodesToScore.
	var percentage int32
	if percentageOfNodesToScore != nil {
		percentage = *percentageOfNodesToScore
	} else {
		percentage = sched.percentageOfNodesToScore
	}

	if percentage == 0 {
		percentage = int32(50) - numAllNodes/125
		if percentage < minFeasibleNodesPercentageToFind {
			percentage = minFeasibleNodesPercentageToFind
		}
	}

	numNodes = numAllNodes * percentage / 100
	if numNodes < minFeasibleNodesToFind {
		return minFeasibleNodesToFind
	}

	return numNodes
}
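
// Illustrative arithmetic for numFeasibleNodesToFind, assuming the adaptive
// default (no percentageOfNodesToScore configured on the profile or globally):
// with 5000 nodes the percentage is 50 - 5000/125 = 10, so about 500 nodes are
// filtered before the search stops; with 500 nodes it is 50 - 500/125 = 46,
// i.e. 230 nodes; clusters of 100 nodes or fewer are always searched in full.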

func findNodesThatPassExtenders(ctx context.Context, extenders []framework.Extender, pod *v1.Pod, feasibleNodes []*framework.NodeInfo, statuses framework.NodeToStatusMap) ([]*framework.NodeInfo, error) {
	logger := klog.FromContext(ctx)
	// Extenders are called sequentially.
	// Nodes in the original feasibleNodes can be excluded by one extender, and the
	// (possibly shrinking) list is passed on to the next extender.
	for _, extender := range extenders {
		if len(feasibleNodes) == 0 {
			break
		}
		if !extender.IsInterested(pod) {
			continue
		}

		// Status of failed nodes in failedAndUnresolvableMap will be added or overwritten in <statuses>,
		// so that the scheduler framework can respect the UnschedulableAndUnresolvable status for
		// particular nodes, and this may eventually improve preemption efficiency.
		// Note: users are recommended to configure the extenders that may return UnschedulableAndUnresolvable
		// status ahead of others.
		feasibleList, failedMap, failedAndUnresolvableMap, err := extender.Filter(pod, feasibleNodes)
		if err != nil {
			if extender.IsIgnorable() {
				logger.Info("Skipping extender as it returned error and has ignorable flag set", "extender", extender, "err", err)
				continue
			}
			return nil, err
		}

		for failedNodeName, failedMsg := range failedAndUnresolvableMap {
			var aggregatedReasons []string
			if _, found := statuses[failedNodeName]; found {
				aggregatedReasons = statuses[failedNodeName].Reasons()
			}
			aggregatedReasons = append(aggregatedReasons, failedMsg)
			statuses[failedNodeName] = framework.NewStatus(framework.UnschedulableAndUnresolvable, aggregatedReasons...)
		}

		for failedNodeName, failedMsg := range failedMap {
			if _, found := failedAndUnresolvableMap[failedNodeName]; found {
				// failedAndUnresolvableMap takes precedence over failedMap
				// note that this only happens if the extender returns the node in both maps
				continue
			}
			if _, found := statuses[failedNodeName]; !found {
				statuses[failedNodeName] = framework.NewStatus(framework.Unschedulable, failedMsg)
			} else {
				statuses[failedNodeName].AppendReason(failedMsg)
			}
		}

		feasibleNodes = feasibleList
	}
	return feasibleNodes, nil
}

// prioritizeNodes prioritizes the nodes by running the score plugins,
// which return a score for each node from the call to RunScorePlugins().
// The scores from each plugin are added together to make the score for that node, then
// any extenders are run as well.
// All scores are finally combined (added) to get the total weighted scores of all nodes.
func prioritizeNodes(
	ctx context.Context,
	extenders []framework.Extender,
	fwk framework.Framework,
	state *framework.CycleState,
	pod *v1.Pod,
	nodes []*framework.NodeInfo,
) ([]framework.NodePluginScores, error) {
	logger := klog.FromContext(ctx)
	// If no priority configs are provided, then all nodes will have a score of one.
	// This is required to generate the priority list in the required format.
	if len(extenders) == 0 && !fwk.HasScorePlugins() {
		result := make([]framework.NodePluginScores, 0, len(nodes))
		for i := range nodes {
			result = append(result, framework.NodePluginScores{
				Name:       nodes[i].Node().Name,
				TotalScore: 1,
			})
		}
		return result, nil
	}

	// Run PreScore plugins.
	preScoreStatus := fwk.RunPreScorePlugins(ctx, state, pod, nodes)
	if !preScoreStatus.IsSuccess() {
		return nil, preScoreStatus.AsError()
	}

	// Run the Score plugins.
	nodesScores, scoreStatus := fwk.RunScorePlugins(ctx, state, pod, nodes)
	if !scoreStatus.IsSuccess() {
		return nil, scoreStatus.AsError()
	}

	// Additional details logged at level 10 if enabled.
	loggerVTen := logger.V(10)
	if loggerVTen.Enabled() {
		for _, nodeScore := range nodesScores {
			for _, pluginScore := range nodeScore.Scores {
				loggerVTen.Info("Plugin scored node for pod", "pod", klog.KObj(pod), "plugin", pluginScore.Name, "node", nodeScore.Name, "score", pluginScore.Score)
			}
		}
	}

	if len(extenders) != 0 && nodes != nil {
		// allNodeExtendersScores has all extenders scores for all nodes.
		// It is keyed with node name.
		allNodeExtendersScores := make(map[string]*framework.NodePluginScores, len(nodes))
		var mu sync.Mutex
		var wg sync.WaitGroup
		for i := range extenders {
			if !extenders[i].IsInterested(pod) {
				continue
			}
			wg.Add(1)
			go func(extIndex int) {
				metrics.Goroutines.WithLabelValues(metrics.PrioritizingExtender).Inc()
				defer func() {
					metrics.Goroutines.WithLabelValues(metrics.PrioritizingExtender).Dec()
					wg.Done()
				}()
				prioritizedList, weight, err := extenders[extIndex].Prioritize(pod, nodes)
				if err != nil {
					// Prioritization errors from extender can be ignored, let k8s/other extenders determine the priorities
					logger.V(5).Info("Failed to run extender's priority function. No score given by this extender.", "error", err, "pod", klog.KObj(pod), "extender", extenders[extIndex].Name())
					return
				}
				mu.Lock()
				defer mu.Unlock()
				for i := range *prioritizedList {
					nodename := (*prioritizedList)[i].Host
					score := (*prioritizedList)[i].Score
					if loggerVTen.Enabled() {
						loggerVTen.Info("Extender scored node for pod", "pod", klog.KObj(pod), "extender", extenders[extIndex].Name(), "node", nodename, "score", score)
					}

					// MaxExtenderPriority may diverge from the max priority used in the scheduler and defined by MaxNodeScore,
					// therefore we need to scale the score returned by extenders to the score range used by the scheduler.
					finalscore := score * weight * (framework.MaxNodeScore / extenderv1.MaxExtenderPriority)

					if allNodeExtendersScores[nodename] == nil {
						allNodeExtendersScores[nodename] = &framework.NodePluginScores{
							Name:   nodename,
							Scores: make([]framework.PluginScore, 0, len(extenders)),
						}
					}
					allNodeExtendersScores[nodename].Scores = append(allNodeExtendersScores[nodename].Scores, framework.PluginScore{
						Name:  extenders[extIndex].Name(),
						Score: finalscore,
					})
					allNodeExtendersScores[nodename].TotalScore += finalscore
				}
			}(i)
		}
		// wait for all go routines to finish
		wg.Wait()
		for i := range nodesScores {
			if score, ok := allNodeExtendersScores[nodes[i].Node().Name]; ok {
				nodesScores[i].Scores = append(nodesScores[i].Scores, score.Scores...)
				nodesScores[i].TotalScore += score.TotalScore
			}
		}
	}

	if loggerVTen.Enabled() {
		for i := range nodesScores {
			loggerVTen.Info("Calculated node's final score for pod", "pod", klog.KObj(pod), "node", nodesScores[i].Name, "score", nodesScores[i].TotalScore)
		}
	}
	return nodesScores, nil
}
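
// Extenders report scores on the 0..extenderv1.MaxExtenderPriority scale (10),
// while score plugins use 0..framework.MaxNodeScore (100), hence the scaling in
// prioritizeNodes above: for example, an extender score of 5 with weight 2
// becomes 5 * 2 * (100/10) = 100 on the node score scale.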

var errEmptyPriorityList = errors.New("empty priorityList")

// selectHost takes a prioritized list of nodes and then picks one
// in a reservoir sampling manner from the nodes that had the highest score.
// It also returns the top {count} Nodes,
// and the top of the list will always be the selected host.
func selectHost(nodeScoreList []framework.NodePluginScores, count int) (string, []framework.NodePluginScores, error) {
	if len(nodeScoreList) == 0 {
		return "", nil, errEmptyPriorityList
	}

	var h nodeScoreHeap = nodeScoreList
	heap.Init(&h)
	cntOfMaxScore := 1
	selectedIndex := 0
	// The top of the heap is the NodeScoreResult with the highest score.
	sortedNodeScoreList := make([]framework.NodePluginScores, 0, count)
	sortedNodeScoreList = append(sortedNodeScoreList, heap.Pop(&h).(framework.NodePluginScores))

	// This for-loop will continue until all Nodes with the highest scores get checked for a reservoir sampling,
	// and sortedNodeScoreList gets (count - 1) elements.
	for ns := heap.Pop(&h).(framework.NodePluginScores); ; ns = heap.Pop(&h).(framework.NodePluginScores) {
		if ns.TotalScore != sortedNodeScoreList[0].TotalScore && len(sortedNodeScoreList) == count {
			break
		}

		if ns.TotalScore == sortedNodeScoreList[0].TotalScore {
			cntOfMaxScore++
			if rand.Intn(cntOfMaxScore) == 0 {
				// Replace the candidate with probability of 1/cntOfMaxScore
				selectedIndex = cntOfMaxScore - 1
			}
		}

		sortedNodeScoreList = append(sortedNodeScoreList, ns)

		if h.Len() == 0 {
			break
		}
	}

	if selectedIndex != 0 {
		// replace the first one with selected one
		previous := sortedNodeScoreList[0]
		sortedNodeScoreList[0] = sortedNodeScoreList[selectedIndex]
		sortedNodeScoreList[selectedIndex] = previous
	}

	if len(sortedNodeScoreList) > count {
		sortedNodeScoreList = sortedNodeScoreList[:count]
	}

	return sortedNodeScoreList[0].Name, sortedNodeScoreList, nil
}
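
// The reservoir sampling in selectHost gives every max-scoring node an equal
// 1/n chance of being picked: the k-th node seen with the top score replaces
// the current candidate with probability 1/k and survives the remaining
// replacements with probability k/(k+1) * ... * (n-1)/n, which multiplies out
// to 1/n overall.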

// nodeScoreHeap is a heap of framework.NodePluginScores.
type nodeScoreHeap []framework.NodePluginScores

// nodeScoreHeap implements heap.Interface.
var _ heap.Interface = &nodeScoreHeap{}

func (h nodeScoreHeap) Len() int           { return len(h) }
func (h nodeScoreHeap) Less(i, j int) bool { return h[i].TotalScore > h[j].TotalScore }
func (h nodeScoreHeap) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }

func (h *nodeScoreHeap) Push(x interface{}) {
	*h = append(*h, x.(framework.NodePluginScores))
}

func (h *nodeScoreHeap) Pop() interface{} {
	old := *h
	n := len(old)
	x := old[n-1]
	*h = old[0 : n-1]
	return x
}

// assume signals to the cache that a pod is already in the cache, so that binding can be asynchronous.
// assume modifies `assumed`.
func (sched *Scheduler) assume(logger klog.Logger, assumed *v1.Pod, host string) error {
	// Optimistically assume that the binding will succeed and send it to apiserver
	// in the background.
	// If the binding fails, scheduler will release resources allocated to assumed pod
	// immediately.
	assumed.Spec.NodeName = host

	if err := sched.Cache.AssumePod(logger, assumed); err != nil {
		logger.Error(err, "Scheduler cache AssumePod failed")
		return err
	}
	// if "assumed" is a nominated pod, we should remove it from internal cache
	if sched.SchedulingQueue != nil {
		sched.SchedulingQueue.DeleteNominatedPodIfExists(assumed)
	}

	return nil
}

// bind binds a pod to a given node defined in a binding object.
// The precedence for binding is: (1) extenders and (2) framework plugins.
// We expect this to run asynchronously, so we handle binding metrics internally.
func (sched *Scheduler) bind(ctx context.Context, fwk framework.Framework, assumed *v1.Pod, targetNode string, state *framework.CycleState) (status *framework.Status) {
	logger := klog.FromContext(ctx)
	defer func() {
		sched.finishBinding(logger, fwk, assumed, targetNode, status)
	}()

	bound, err := sched.extendersBinding(logger, assumed, targetNode)
	if bound {
		return framework.AsStatus(err)
	}
	return fwk.RunBindPlugins(ctx, state, assumed, targetNode)
}

// TODO(#87159): Move this to a Plugin.
func (sched *Scheduler) extendersBinding(logger klog.Logger, pod *v1.Pod, node string) (bool, error) {
	for _, extender := range sched.Extenders {
		if !extender.IsBinder() || !extender.IsInterested(pod) {
			continue
		}
		err := extender.Bind(&v1.Binding{
			ObjectMeta: metav1.ObjectMeta{Namespace: pod.Namespace, Name: pod.Name, UID: pod.UID},
			Target:     v1.ObjectReference{Kind: "Node", Name: node},
		})
		if err != nil && extender.IsIgnorable() {
			logger.Info("Skipping extender in bind as it returned error and has ignorable flag set", "extender", extender, "err", err)
			continue
		}
		return true, err
	}
	return false, nil
}

func (sched *Scheduler) finishBinding(logger klog.Logger, fwk framework.Framework, assumed *v1.Pod, targetNode string, status *framework.Status) {
	if finErr := sched.Cache.FinishBinding(logger, assumed); finErr != nil {
		logger.Error(finErr, "Scheduler cache FinishBinding failed")
	}
	if !status.IsSuccess() {
		logger.V(1).Info("Failed to bind pod", "pod", klog.KObj(assumed))
		return
	}

	fwk.EventRecorder().Eventf(assumed, nil, v1.EventTypeNormal, "Scheduled", "Binding", "Successfully assigned %v/%v to %v", assumed.Namespace, assumed.Name, targetNode)
}

func getAttemptsLabel(p *framework.QueuedPodInfo) string {
	// We break down the pod scheduling duration by attempts capped to a limit
	// to avoid ending up with a high cardinality metric.
	if p.Attempts >= 15 {
		return "15+"
	}
	return strconv.Itoa(p.Attempts)
}

// handleSchedulingFailure records an event for the pod that indicates the
// pod has failed to schedule. Also, update the pod condition and nominated node name if set.
func (sched *Scheduler) handleSchedulingFailure(ctx context.Context, fwk framework.Framework, podInfo *framework.QueuedPodInfo, status *framework.Status, nominatingInfo *framework.NominatingInfo, start time.Time) {
	calledDone := false
	defer func() {
		if !calledDone {
			// Basically, AddUnschedulableIfNotPresent calls DonePod internally.
			// But, AddUnschedulableIfNotPresent isn't called in some corner cases.
			// Here, we call DonePod explicitly to avoid leaking the pod.
			sched.SchedulingQueue.Done(podInfo.Pod.UID)
		}
	}()

	logger := klog.FromContext(ctx)
	reason := v1.PodReasonSchedulerError
	if status.IsRejected() {
		reason = v1.PodReasonUnschedulable
	}

	switch reason {
	case v1.PodReasonUnschedulable:
		metrics.PodUnschedulable(fwk.ProfileName(), metrics.SinceInSeconds(start))
	case v1.PodReasonSchedulerError:
		metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start))
	}

	pod := podInfo.Pod
	err := status.AsError()
	errMsg := status.Message()

	if err == ErrNoNodesAvailable {
		logger.V(2).Info("Unable to schedule pod; no nodes are registered to the cluster; waiting", "pod", klog.KObj(pod))
	} else if fitError, ok := err.(*framework.FitError); ok { // Inject UnschedulablePlugins to PodInfo, which will be used later for moving Pods between queues efficiently.
		podInfo.UnschedulablePlugins = fitError.Diagnosis.UnschedulablePlugins
		podInfo.PendingPlugins = fitError.Diagnosis.PendingPlugins
		logger.V(2).Info("Unable to schedule pod; no fit; waiting", "pod", klog.KObj(pod), "err", errMsg)
	} else {
		logger.Error(err, "Error scheduling pod; retrying", "pod", klog.KObj(pod))
	}

	// Check if the Pod exists in informer cache.
	podLister := fwk.SharedInformerFactory().Core().V1().Pods().Lister()
	cachedPod, e := podLister.Pods(pod.Namespace).Get(pod.Name)
	if e != nil {
		logger.Info("Pod doesn't exist in informer cache", "pod", klog.KObj(pod), "err", e)
		// We need to call DonePod here because we don't call AddUnschedulableIfNotPresent in this case.
	} else {
		// In the case of extender, the pod may have been bound successfully, but timed out returning its response to the scheduler.
		// It could result in the live version carrying .spec.nodeName, which is inconsistent with the internally-queued version.
		if len(cachedPod.Spec.NodeName) != 0 {
			logger.Info("Pod has been assigned to node. Abort adding it back to queue.", "pod", klog.KObj(pod), "node", cachedPod.Spec.NodeName)
			// We need to call DonePod here because we don't call AddUnschedulableIfNotPresent in this case.
		} else {
			// As <cachedPod> is from SharedInformer, we need to do a DeepCopy() here.
			// ignore this err since apiserver doesn't properly validate affinity terms
			// and we can't fix the validation for backwards compatibility.
			podInfo.PodInfo, _ = framework.NewPodInfo(cachedPod.DeepCopy())
			if err := sched.SchedulingQueue.AddUnschedulableIfNotPresent(logger, podInfo, sched.SchedulingQueue.SchedulingCycle()); err != nil {
				logger.Error(err, "Error occurred")
			}
			calledDone = true
		}
	}

	// Update the scheduling queue with the nominated pod information. Without
	// this, there would be a race condition between the next scheduling cycle
	// and the time the scheduler receives a Pod Update for the nominated pod.
	// Here we check for nil only for tests.
	if sched.SchedulingQueue != nil {
		logger := klog.FromContext(ctx)
		sched.SchedulingQueue.AddNominatedPod(logger, podInfo.PodInfo, nominatingInfo)
	}

	if err == nil {
		// Only tests can reach here.
		return
	}

	msg := truncateMessage(errMsg)
	fwk.EventRecorder().Eventf(pod, nil, v1.EventTypeWarning, "FailedScheduling", "Scheduling", msg)
	if err := updatePod(ctx, sched.client, pod, &v1.PodCondition{
		Type:    v1.PodScheduled,
		Status:  v1.ConditionFalse,
		Reason:  reason,
		Message: errMsg,
	}, nominatingInfo); err != nil {
		klog.FromContext(ctx).Error(err, "Error updating pod", "pod", klog.KObj(pod))
	}
}

// truncateMessage truncates a message if it hits the NoteLengthLimit.
func truncateMessage(message string) string {
	max := validation.NoteLengthLimit
	if len(message) <= max {
		return message
	}
	suffix := " ..."
	return message[:max-len(suffix)] + suffix
}

func updatePod(ctx context.Context, client clientset.Interface, pod *v1.Pod, condition *v1.PodCondition, nominatingInfo *framework.NominatingInfo) error {
	logger := klog.FromContext(ctx)
	logger.V(3).Info("Updating pod condition", "pod", klog.KObj(pod), "conditionType", condition.Type, "conditionStatus", condition.Status, "conditionReason", condition.Reason)
	podStatusCopy := pod.Status.DeepCopy()
	// NominatedNodeName is updated only if we are trying to set it, and the value is
	// different from the existing one.
	nnnNeedsUpdate := nominatingInfo.Mode() == framework.ModeOverride && pod.Status.NominatedNodeName != nominatingInfo.NominatedNodeName
	if !podutil.UpdatePodCondition(podStatusCopy, condition) && !nnnNeedsUpdate {
		return nil
	}
	if nnnNeedsUpdate {
		podStatusCopy.NominatedNodeName = nominatingInfo.NominatedNodeName
	}
	return util.PatchPodStatus(ctx, client, pod, podStatusCopy)
}