k8s.io/kubernetes@v1.29.3/pkg/scheduler/schedule_one.go

/*
Copyright 2014 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package scheduler

import (
	"container/heap"
	"context"
	"errors"
	"fmt"
	"math/rand"
	"strconv"
	"sync"
	"sync/atomic"
	"time"

	v1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/sets"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/klog/v2"
	extenderv1 "k8s.io/kube-scheduler/extender/v1"
	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/kubernetes/pkg/apis/core/validation"
	"k8s.io/kubernetes/pkg/scheduler/framework"
	"k8s.io/kubernetes/pkg/scheduler/framework/parallelize"
	internalqueue "k8s.io/kubernetes/pkg/scheduler/internal/queue"
	"k8s.io/kubernetes/pkg/scheduler/metrics"
	"k8s.io/kubernetes/pkg/scheduler/util"
	utiltrace "k8s.io/utils/trace"
)

const (
	// Percentage of plugin metrics to be sampled.
	pluginMetricsSamplePercent = 10
	// minFeasibleNodesToFind is the minimum number of nodes that would be scored
	// in each scheduling cycle. This is a semi-arbitrary value to ensure that a
	// certain minimum of nodes are checked for feasibility. This in turn helps
	// ensure a minimum level of spreading.
	minFeasibleNodesToFind = 100
	// minFeasibleNodesPercentageToFind is the minimum percentage of nodes that
	// would be scored in each scheduling cycle. This is a semi-arbitrary value
	// to ensure that a certain minimum of nodes are checked for feasibility.
	// This in turn helps ensure a minimum level of spreading.
	minFeasibleNodesPercentageToFind = 5
	// numberOfHighestScoredNodesToReport is the number of node scores
	// to be included in ScheduleResult.
	numberOfHighestScoredNodesToReport = 3
)

// scheduleOne does the entire scheduling workflow for a single pod. It is serialized on the scheduling algorithm's host fitting.
func (sched *Scheduler) scheduleOne(ctx context.Context) {
	logger := klog.FromContext(ctx)
	podInfo, err := sched.NextPod(logger)
	if err != nil {
		logger.Error(err, "Error while retrieving next pod from scheduling queue")
		return
	}
	// pod could be nil when schedulerQueue is closed
	if podInfo == nil || podInfo.Pod == nil {
		return
	}

	pod := podInfo.Pod
	// TODO(knelasevero): Remove duplicated keys from log entry calls
	// When contextualized logging hits GA
	// https://github.com/kubernetes/kubernetes/issues/111672
	logger = klog.LoggerWithValues(logger, "pod", klog.KObj(pod))
	ctx = klog.NewContext(ctx, logger)
	logger.V(4).Info("About to try and schedule pod", "pod", klog.KObj(pod))

	fwk, err := sched.frameworkForPod(pod)
	if err != nil {
		// This shouldn't happen, because we only accept for scheduling the pods
		// which specify a scheduler name that matches one of the profiles.
		logger.Error(err, "Error occurred")
		return
	}
	if sched.skipPodSchedule(ctx, fwk, pod) {
		return
	}

	logger.V(3).Info("Attempting to schedule pod", "pod", klog.KObj(pod))

	// Synchronously attempt to find a fit for the pod.
	start := time.Now()
	state := framework.NewCycleState()
	state.SetRecordPluginMetrics(rand.Intn(100) < pluginMetricsSamplePercent)

	// Initialize an empty podsToActivate struct, which will be filled up by plugins or stay empty.
	podsToActivate := framework.NewPodsToActivate()
	state.Write(framework.PodsToActivateKey, podsToActivate)

	schedulingCycleCtx, cancel := context.WithCancel(ctx)
	defer cancel()

	scheduleResult, assumedPodInfo, status := sched.schedulingCycle(schedulingCycleCtx, state, fwk, podInfo, start, podsToActivate)
	if !status.IsSuccess() {
		sched.FailureHandler(schedulingCycleCtx, fwk, assumedPodInfo, status, scheduleResult.nominatingInfo, start)
		return
	}

	// bind the pod to its host asynchronously (we can do this b/c of the assumption step above).
	go func() {
		bindingCycleCtx, cancel := context.WithCancel(ctx)
		defer cancel()

		metrics.Goroutines.WithLabelValues(metrics.Binding).Inc()
		defer metrics.Goroutines.WithLabelValues(metrics.Binding).Dec()

		status := sched.bindingCycle(bindingCycleCtx, state, fwk, scheduleResult, assumedPodInfo, start, podsToActivate)
		if !status.IsSuccess() {
			sched.handleBindingCycleError(bindingCycleCtx, state, fwk, assumedPodInfo, start, scheduleResult, status)
			return
		}
		// Usually, DonePod is called inside the scheduling queue,
		// but in this case, we need to call it here because this Pod won't go back to the scheduling queue.
		sched.SchedulingQueue.Done(assumedPodInfo.Pod.UID)
	}()
}

var clearNominatedNode = &framework.NominatingInfo{NominatingMode: framework.ModeOverride, NominatedNodeName: ""}

// schedulingCycle tries to schedule a single Pod.
func (sched *Scheduler) schedulingCycle(
	ctx context.Context,
	state *framework.CycleState,
	fwk framework.Framework,
	podInfo *framework.QueuedPodInfo,
	start time.Time,
	podsToActivate *framework.PodsToActivate,
) (ScheduleResult, *framework.QueuedPodInfo, *framework.Status) {
	logger := klog.FromContext(ctx)
	pod := podInfo.Pod
	scheduleResult, err := sched.SchedulePod(ctx, fwk, state, pod)
	if err != nil {
		if err == ErrNoNodesAvailable {
			status := framework.NewStatus(framework.UnschedulableAndUnresolvable).WithError(err)
			return ScheduleResult{nominatingInfo: clearNominatedNode}, podInfo, status
		}

		fitError, ok := err.(*framework.FitError)
		if !ok {
			logger.Error(err, "Error selecting node for pod", "pod", klog.KObj(pod))
			return ScheduleResult{nominatingInfo: clearNominatedNode}, podInfo, framework.AsStatus(err)
		}

		// SchedulePod() may have failed because the pod would not fit on any host, so we try to
		// preempt, with the expectation that the next time the pod is tried for scheduling it
		// will fit due to the preemption. It is also possible that a different pod will schedule
		// into the resources that were preempted, but this is harmless.

		if !fwk.HasPostFilterPlugins() {
			logger.V(3).Info("No PostFilter plugins are registered, so no preemption will be performed")
			return ScheduleResult{}, podInfo, framework.NewStatus(framework.Unschedulable).WithError(err)
		}

		// Run PostFilter plugins to attempt to make the pod schedulable in a future scheduling cycle.
		result, status := fwk.RunPostFilterPlugins(ctx, state, pod, fitError.Diagnosis.NodeToStatusMap)
		msg := status.Message()
		fitError.Diagnosis.PostFilterMsg = msg
		if status.Code() == framework.Error {
			logger.Error(nil, "Status after running PostFilter plugins for pod", "pod", klog.KObj(pod), "status", msg)
		} else {
			logger.V(5).Info("Status after running PostFilter plugins for pod", "pod", klog.KObj(pod), "status", msg)
		}

		var nominatingInfo *framework.NominatingInfo
		if result != nil {
			nominatingInfo = result.NominatingInfo
		}
		return ScheduleResult{nominatingInfo: nominatingInfo}, podInfo, framework.NewStatus(framework.Unschedulable).WithError(err)
	}

	metrics.SchedulingAlgorithmLatency.Observe(metrics.SinceInSeconds(start))
	// Tell the cache to assume that a pod now is running on a given node, even though it hasn't been bound yet.
	// This allows us to keep scheduling without waiting on binding to occur.
	assumedPodInfo := podInfo.DeepCopy()
	assumedPod := assumedPodInfo.Pod
	// assume modifies `assumedPod` by setting NodeName=scheduleResult.SuggestedHost
	err = sched.assume(logger, assumedPod, scheduleResult.SuggestedHost)
	if err != nil {
		// This is most probably result of a BUG in retrying logic.
		// We report an error here so that pod scheduling can be retried.
		// This relies on the fact that Error will check if the pod has been bound
		// to a node and if so will not add it back to the unscheduled pods queue
		// (otherwise this would cause an infinite loop).
		return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, framework.AsStatus(err)
	}

	// Run the Reserve method of reserve plugins.
	if sts := fwk.RunReservePluginsReserve(ctx, state, assumedPod, scheduleResult.SuggestedHost); !sts.IsSuccess() {
		// trigger un-reserve to clean up state associated with the reserved Pod
		fwk.RunReservePluginsUnreserve(ctx, state, assumedPod, scheduleResult.SuggestedHost)
		if forgetErr := sched.Cache.ForgetPod(logger, assumedPod); forgetErr != nil {
			logger.Error(forgetErr, "Scheduler cache ForgetPod failed")
		}

		if sts.IsRejected() {
			fitErr := &framework.FitError{
				NumAllNodes: 1,
				Pod:         pod,
				Diagnosis: framework.Diagnosis{
					NodeToStatusMap: framework.NodeToStatusMap{scheduleResult.SuggestedHost: sts},
				},
			}
			fitErr.Diagnosis.AddPluginStatus(sts)
			return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, framework.NewStatus(sts.Code()).WithError(fitErr)
		}
		return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, sts
	}

	// Run "permit" plugins.
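	// Permit plugins may approve, reject, or ask the Pod to wait; a Wait decision
	// is resolved later in the binding cycle via WaitOnPermit.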
	runPermitStatus := fwk.RunPermitPlugins(ctx, state, assumedPod, scheduleResult.SuggestedHost)
	if !runPermitStatus.IsWait() && !runPermitStatus.IsSuccess() {
		// trigger un-reserve to clean up state associated with the reserved Pod
		fwk.RunReservePluginsUnreserve(ctx, state, assumedPod, scheduleResult.SuggestedHost)
		if forgetErr := sched.Cache.ForgetPod(logger, assumedPod); forgetErr != nil {
			logger.Error(forgetErr, "Scheduler cache ForgetPod failed")
		}

		if runPermitStatus.IsRejected() {
			fitErr := &framework.FitError{
				NumAllNodes: 1,
				Pod:         pod,
				Diagnosis: framework.Diagnosis{
					NodeToStatusMap: framework.NodeToStatusMap{scheduleResult.SuggestedHost: runPermitStatus},
				},
			}
			fitErr.Diagnosis.AddPluginStatus(runPermitStatus)
			return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, framework.NewStatus(runPermitStatus.Code()).WithError(fitErr)
		}

		return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, runPermitStatus
	}

	// At the end of a successful scheduling cycle, pop and move up Pods if needed.
	if len(podsToActivate.Map) != 0 {
		sched.SchedulingQueue.Activate(logger, podsToActivate.Map)
		// Clear the entries after activation.
		podsToActivate.Map = make(map[string]*v1.Pod)
	}

	return scheduleResult, assumedPodInfo, nil
}

// bindingCycle tries to bind an assumed Pod.
func (sched *Scheduler) bindingCycle(
	ctx context.Context,
	state *framework.CycleState,
	fwk framework.Framework,
	scheduleResult ScheduleResult,
	assumedPodInfo *framework.QueuedPodInfo,
	start time.Time,
	podsToActivate *framework.PodsToActivate) *framework.Status {
	logger := klog.FromContext(ctx)

	assumedPod := assumedPodInfo.Pod

	// Run "permit" plugins.
	if status := fwk.WaitOnPermit(ctx, assumedPod); !status.IsSuccess() {
		if status.IsRejected() {
			fitErr := &framework.FitError{
				NumAllNodes: 1,
				Pod:         assumedPodInfo.Pod,
				Diagnosis: framework.Diagnosis{
					NodeToStatusMap:      framework.NodeToStatusMap{scheduleResult.SuggestedHost: status},
					UnschedulablePlugins: sets.New(status.Plugin()),
				},
			}
			return framework.NewStatus(status.Code()).WithError(fitErr)
		}
		return status
	}

	// Run "prebind" plugins.
	if status := fwk.RunPreBindPlugins(ctx, state, assumedPod, scheduleResult.SuggestedHost); !status.IsSuccess() {
		return status
	}

	// Run "bind" plugins.
	if status := sched.bind(ctx, fwk, assumedPod, scheduleResult.SuggestedHost, state); !status.IsSuccess() {
		return status
	}

	// Calculating nodeResourceString can be heavy. Avoid it if klog verbosity is below 2.
	logger.V(2).Info("Successfully bound pod to node", "pod", klog.KObj(assumedPod), "node", scheduleResult.SuggestedHost, "evaluatedNodes", scheduleResult.EvaluatedNodes, "feasibleNodes", scheduleResult.FeasibleNodes)
	metrics.PodScheduled(fwk.ProfileName(), metrics.SinceInSeconds(start))
	metrics.PodSchedulingAttempts.Observe(float64(assumedPodInfo.Attempts))
	if assumedPodInfo.InitialAttemptTimestamp != nil {
		metrics.PodSchedulingDuration.WithLabelValues(getAttemptsLabel(assumedPodInfo)).Observe(metrics.SinceInSeconds(*assumedPodInfo.InitialAttemptTimestamp))
		metrics.PodSchedulingSLIDuration.WithLabelValues(getAttemptsLabel(assumedPodInfo)).Observe(metrics.SinceInSeconds(*assumedPodInfo.InitialAttemptTimestamp))
	}
	// Run "postbind" plugins.
	fwk.RunPostBindPlugins(ctx, state, assumedPod, scheduleResult.SuggestedHost)

	// At the end of a successful binding cycle, move up Pods if needed.
	if len(podsToActivate.Map) != 0 {
		sched.SchedulingQueue.Activate(logger, podsToActivate.Map)
		// Unlike the logic in schedulingCycle(), we don't bother deleting the entries
		// as `podsToActivate.Map` is no longer consumed.
	}

	return nil
}

func (sched *Scheduler) handleBindingCycleError(
	ctx context.Context,
	state *framework.CycleState,
	fwk framework.Framework,
	podInfo *framework.QueuedPodInfo,
	start time.Time,
	scheduleResult ScheduleResult,
	status *framework.Status) {
	logger := klog.FromContext(ctx)

	assumedPod := podInfo.Pod
	// trigger un-reserve plugins to clean up state associated with the reserved Pod
	fwk.RunReservePluginsUnreserve(ctx, state, assumedPod, scheduleResult.SuggestedHost)
	if forgetErr := sched.Cache.ForgetPod(logger, assumedPod); forgetErr != nil {
		logger.Error(forgetErr, "scheduler cache ForgetPod failed")
	} else {
		// "Forget"ing an assumed Pod in binding cycle should be treated as a PodDelete event,
		// as the assumed Pod had occupied a certain amount of resources in scheduler cache.
		//
		// Avoid moving the assumed Pod itself as it's always Unschedulable.
		// It's intentional to "defer" this operation; otherwise MoveAllToActiveOrBackoffQueue() would
		// update `q.moveRequest` and thus move the assumed pod to backoffQ anyways.
		if status.IsRejected() {
			defer sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, internalqueue.AssignedPodDelete, assumedPod, nil, func(pod *v1.Pod) bool {
				return assumedPod.UID != pod.UID
			})
		} else {
			sched.SchedulingQueue.MoveAllToActiveOrBackoffQueue(logger, internalqueue.AssignedPodDelete, assumedPod, nil, nil)
		}
	}

	sched.FailureHandler(ctx, fwk, podInfo, status, clearNominatedNode, start)
}

func (sched *Scheduler) frameworkForPod(pod *v1.Pod) (framework.Framework, error) {
	fwk, ok := sched.Profiles[pod.Spec.SchedulerName]
	if !ok {
		return nil, fmt.Errorf("profile not found for scheduler name %q", pod.Spec.SchedulerName)
	}
	return fwk, nil
}

// skipPodSchedule returns true if we could skip scheduling the pod for specified cases.
func (sched *Scheduler) skipPodSchedule(ctx context.Context, fwk framework.Framework, pod *v1.Pod) bool {
	// Case 1: pod is being deleted.
	if pod.DeletionTimestamp != nil {
		fwk.EventRecorder().Eventf(pod, nil, v1.EventTypeWarning, "FailedScheduling", "Scheduling", "skip schedule deleting pod: %v/%v", pod.Namespace, pod.Name)
		klog.FromContext(ctx).V(3).Info("Skip schedule deleting pod", "pod", klog.KObj(pod))
		return true
	}

	// Case 2: pod that has been assumed could be skipped.
	// An assumed pod can be added again to the scheduling queue if it got an update event
	// during its previous scheduling cycle but before getting assumed.
	isAssumed, err := sched.Cache.IsAssumedPod(pod)
	if err != nil {
		// TODO(91633): pass ctx into a revised HandleError
		utilruntime.HandleError(fmt.Errorf("failed to check whether pod %s/%s is assumed: %v", pod.Namespace, pod.Name, err))
		return false
	}
	return isAssumed
}

// schedulePod tries to schedule the given pod to one of the nodes in the node list.
// If it succeeds, it will return the name of the node.
// If it fails, it will return a FitError with reasons.
func (sched *Scheduler) schedulePod(ctx context.Context, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) (result ScheduleResult, err error) {
	trace := utiltrace.New("Scheduling", utiltrace.Field{Key: "namespace", Value: pod.Namespace}, utiltrace.Field{Key: "name", Value: pod.Name})
	defer trace.LogIfLong(100 * time.Millisecond)
	if err := sched.Cache.UpdateSnapshot(klog.FromContext(ctx), sched.nodeInfoSnapshot); err != nil {
		return result, err
	}
	trace.Step("Snapshotting scheduler cache and node infos done")

	if sched.nodeInfoSnapshot.NumNodes() == 0 {
		return result, ErrNoNodesAvailable
	}

	feasibleNodes, diagnosis, err := sched.findNodesThatFitPod(ctx, fwk, state, pod)
	if err != nil {
		return result, err
	}
	trace.Step("Computing predicates done")

	if len(feasibleNodes) == 0 {
		return result, &framework.FitError{
			Pod:         pod,
			NumAllNodes: sched.nodeInfoSnapshot.NumNodes(),
			Diagnosis:   diagnosis,
		}
	}

	// When only one node after predicate, just use it.
	if len(feasibleNodes) == 1 {
		return ScheduleResult{
			SuggestedHost:  feasibleNodes[0].Name,
			EvaluatedNodes: 1 + len(diagnosis.NodeToStatusMap),
			FeasibleNodes:  1,
		}, nil
	}

	priorityList, err := prioritizeNodes(ctx, sched.Extenders, fwk, state, pod, feasibleNodes)
	if err != nil {
		return result, err
	}

	host, _, err := selectHost(priorityList, numberOfHighestScoredNodesToReport)
	trace.Step("Prioritizing done")

	return ScheduleResult{
		SuggestedHost:  host,
		EvaluatedNodes: len(feasibleNodes) + len(diagnosis.NodeToStatusMap),
		FeasibleNodes:  len(feasibleNodes),
	}, err
}

// Filters the nodes to find the ones that fit the pod based on the framework
// filter plugins and filter extenders.
func (sched *Scheduler) findNodesThatFitPod(ctx context.Context, fwk framework.Framework, state *framework.CycleState, pod *v1.Pod) ([]*v1.Node, framework.Diagnosis, error) {
	logger := klog.FromContext(ctx)
	diagnosis := framework.Diagnosis{
		NodeToStatusMap: make(framework.NodeToStatusMap),
	}

	allNodes, err := sched.nodeInfoSnapshot.NodeInfos().List()
	if err != nil {
		return nil, diagnosis, err
	}
	// Run "prefilter" plugins.
	preRes, s := fwk.RunPreFilterPlugins(ctx, state, pod)
	if !s.IsSuccess() {
		if !s.IsRejected() {
			return nil, diagnosis, s.AsError()
		}
		// All nodes in NodeToStatusMap will have the same status so that they can be handled in the preemption.
		// Some non trivial refactoring is needed to avoid this copy.
		for _, n := range allNodes {
			diagnosis.NodeToStatusMap[n.Node().Name] = s
		}

		// Record the messages from PreFilter in Diagnosis.PreFilterMsg.
		msg := s.Message()
		diagnosis.PreFilterMsg = msg
		logger.V(5).Info("Status after running PreFilter plugins for pod", "pod", klog.KObj(pod), "status", msg)
		diagnosis.AddPluginStatus(s)
		return nil, diagnosis, nil
	}

	// "NominatedNodeName" can potentially be set in a previous scheduling cycle as a result of preemption.
	// This node is likely the only candidate that will fit the pod, and hence we try it first before iterating over all nodes.
	if len(pod.Status.NominatedNodeName) > 0 {
		feasibleNodes, err := sched.evaluateNominatedNode(ctx, pod, fwk, state, diagnosis)
		if err != nil {
			logger.Error(err, "Evaluation failed on nominated node", "pod", klog.KObj(pod), "node", pod.Status.NominatedNodeName)
		}
		// Nominated node passes all the filters, scheduler is good to assign this node to the pod.
		if len(feasibleNodes) != 0 {
			return feasibleNodes, diagnosis, nil
		}
	}

	nodes := allNodes
	if !preRes.AllNodes() {
		nodes = make([]*framework.NodeInfo, 0, len(preRes.NodeNames))
		for n := range preRes.NodeNames {
			nInfo, err := sched.nodeInfoSnapshot.NodeInfos().Get(n)
			if err != nil {
				return nil, diagnosis, err
			}
			nodes = append(nodes, nInfo)
		}
	}
	feasibleNodes, err := sched.findNodesThatPassFilters(ctx, fwk, state, pod, &diagnosis, nodes)
	// always try to update the sched.nextStartNodeIndex regardless of whether an error has occurred
	// this is helpful to make sure that all the nodes have a chance to be searched
	processedNodes := len(feasibleNodes) + len(diagnosis.NodeToStatusMap)
	sched.nextStartNodeIndex = (sched.nextStartNodeIndex + processedNodes) % len(nodes)
	if err != nil {
		return nil, diagnosis, err
	}

	feasibleNodes, err = findNodesThatPassExtenders(ctx, sched.Extenders, pod, feasibleNodes, diagnosis.NodeToStatusMap)
	if err != nil {
		return nil, diagnosis, err
	}
	return feasibleNodes, diagnosis, nil
}

func (sched *Scheduler) evaluateNominatedNode(ctx context.Context, pod *v1.Pod, fwk framework.Framework, state *framework.CycleState, diagnosis framework.Diagnosis) ([]*v1.Node, error) {
	nnn := pod.Status.NominatedNodeName
	nodeInfo, err := sched.nodeInfoSnapshot.Get(nnn)
	if err != nil {
		return nil, err
	}
	node := []*framework.NodeInfo{nodeInfo}
	feasibleNodes, err := sched.findNodesThatPassFilters(ctx, fwk, state, pod, &diagnosis, node)
	if err != nil {
		return nil, err
	}

	feasibleNodes, err = findNodesThatPassExtenders(ctx, sched.Extenders, pod, feasibleNodes, diagnosis.NodeToStatusMap)
	if err != nil {
		return nil, err
	}

	return feasibleNodes, nil
}

// findNodesThatPassFilters finds the nodes that fit the filter plugins.
func (sched *Scheduler) findNodesThatPassFilters(
	ctx context.Context,
	fwk framework.Framework,
	state *framework.CycleState,
	pod *v1.Pod,
	diagnosis *framework.Diagnosis,
	nodes []*framework.NodeInfo) ([]*v1.Node, error) {
	numAllNodes := len(nodes)
	numNodesToFind := sched.numFeasibleNodesToFind(fwk.PercentageOfNodesToScore(), int32(numAllNodes))

	// Create feasible list with enough space to avoid growing it
	// and allow assigning.
	feasibleNodes := make([]*v1.Node, numNodesToFind)

	if !fwk.HasFilterPlugins() {
		for i := range feasibleNodes {
			feasibleNodes[i] = nodes[(sched.nextStartNodeIndex+i)%numAllNodes].Node()
		}
		return feasibleNodes, nil
	}

	errCh := parallelize.NewErrorChannel()
	var statusesLock sync.Mutex
	var feasibleNodesLen int32
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()
	checkNode := func(i int) {
		// We check the nodes starting from where we left off in the previous scheduling cycle,
		// this is to make sure all nodes have the same chance of being examined across pods.
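		// For example, with nextStartNodeIndex=3 and numAllNodes=5, the indices
		// visited across the cycle are 3, 4, 0, 1, 2 (evaluated in parallel).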
		nodeInfo := nodes[(sched.nextStartNodeIndex+i)%numAllNodes]
		status := fwk.RunFilterPluginsWithNominatedPods(ctx, state, pod, nodeInfo)
		if status.Code() == framework.Error {
			errCh.SendErrorWithCancel(status.AsError(), cancel)
			return
		}
		if status.IsSuccess() {
			length := atomic.AddInt32(&feasibleNodesLen, 1)
			if length > numNodesToFind {
				cancel()
				atomic.AddInt32(&feasibleNodesLen, -1)
			} else {
				feasibleNodes[length-1] = nodeInfo.Node()
			}
		} else {
			statusesLock.Lock()
			diagnosis.NodeToStatusMap[nodeInfo.Node().Name] = status
			diagnosis.AddPluginStatus(status)
			statusesLock.Unlock()
		}
	}

	beginCheckNode := time.Now()
	statusCode := framework.Success
	defer func() {
		// We record Filter extension point latency here instead of in framework.go because framework.RunFilterPlugins
		// function is called for each node, whereas we want to have an overall latency for all nodes per scheduling cycle.
		// Note that this latency also includes latency for `addNominatedPods`, which calls framework.RunPreFilterAddPod.
		metrics.FrameworkExtensionPointDuration.WithLabelValues(metrics.Filter, statusCode.String(), fwk.ProfileName()).Observe(metrics.SinceInSeconds(beginCheckNode))
	}()

	// Stops searching for more nodes once the configured number of feasible nodes
	// are found.
	fwk.Parallelizer().Until(ctx, numAllNodes, checkNode, metrics.Filter)
	feasibleNodes = feasibleNodes[:feasibleNodesLen]
	if err := errCh.ReceiveError(); err != nil {
		statusCode = framework.Error
		return feasibleNodes, err
	}
	return feasibleNodes, nil
}

// numFeasibleNodesToFind returns the number of feasible nodes that once found, the scheduler stops
// its search for more feasible nodes.
func (sched *Scheduler) numFeasibleNodesToFind(percentageOfNodesToScore *int32, numAllNodes int32) (numNodes int32) {
	if numAllNodes < minFeasibleNodesToFind {
		return numAllNodes
	}

	// Use profile percentageOfNodesToScore if it's set. Otherwise, use global percentageOfNodesToScore.
	var percentage int32
	if percentageOfNodesToScore != nil {
		percentage = *percentageOfNodesToScore
	} else {
		percentage = sched.percentageOfNodesToScore
	}

	if percentage == 0 {
		percentage = int32(50) - numAllNodes/125
		if percentage < minFeasibleNodesPercentageToFind {
			percentage = minFeasibleNodesPercentageToFind
		}
	}

	numNodes = numAllNodes * percentage / 100
	if numNodes < minFeasibleNodesToFind {
		return minFeasibleNodesToFind
	}

	return numNodes
}

func findNodesThatPassExtenders(ctx context.Context, extenders []framework.Extender, pod *v1.Pod, feasibleNodes []*v1.Node, statuses framework.NodeToStatusMap) ([]*v1.Node, error) {
	logger := klog.FromContext(ctx)
	// Extenders are called sequentially.
	// Nodes in original feasibleNodes can be excluded in one extender, and pass on to the next
	// extender in a decreasing manner.
	for _, extender := range extenders {
		if len(feasibleNodes) == 0 {
			break
		}
		if !extender.IsInterested(pod) {
			continue
		}

		// Status of failed nodes in failedAndUnresolvableMap will be added or overwritten in <statuses>,
		// so that the scheduler framework can respect the UnschedulableAndUnresolvable status for
		// particular nodes, and this may eventually improve preemption efficiency.
		// Note: users are recommended to configure the extenders that may return UnschedulableAndUnresolvable
		// status ahead of others.
		feasibleList, failedMap, failedAndUnresolvableMap, err := extender.Filter(pod, feasibleNodes)
		if err != nil {
			if extender.IsIgnorable() {
				logger.Info("Skipping extender as it returned error and has ignorable flag set", "extender", extender, "err", err)
				continue
			}
			return nil, err
		}

		for failedNodeName, failedMsg := range failedAndUnresolvableMap {
			var aggregatedReasons []string
			if _, found := statuses[failedNodeName]; found {
				aggregatedReasons = statuses[failedNodeName].Reasons()
			}
			aggregatedReasons = append(aggregatedReasons, failedMsg)
			statuses[failedNodeName] = framework.NewStatus(framework.UnschedulableAndUnresolvable, aggregatedReasons...)
		}

		for failedNodeName, failedMsg := range failedMap {
			if _, found := failedAndUnresolvableMap[failedNodeName]; found {
				// failedAndUnresolvableMap takes precedence over failedMap
				// note that this only happens if the extender returns the node in both maps
				continue
			}
			if _, found := statuses[failedNodeName]; !found {
				statuses[failedNodeName] = framework.NewStatus(framework.Unschedulable, failedMsg)
			} else {
				statuses[failedNodeName].AppendReason(failedMsg)
			}
		}

		feasibleNodes = feasibleList
	}
	return feasibleNodes, nil
}

// prioritizeNodes prioritizes the nodes by running the score plugins,
// which return a score for each node from the call to RunScorePlugins().
// The scores from each plugin are added together to make the score for that node, then
// any extenders are run as well.
// All scores are finally combined (added) to get the total weighted scores of all nodes
func prioritizeNodes(
	ctx context.Context,
	extenders []framework.Extender,
	fwk framework.Framework,
	state *framework.CycleState,
	pod *v1.Pod,
	nodes []*v1.Node,
) ([]framework.NodePluginScores, error) {
	logger := klog.FromContext(ctx)
	// If no priority configs are provided, then all nodes will have a score of one.
	// This is required to generate the priority list in the required format
	if len(extenders) == 0 && !fwk.HasScorePlugins() {
		result := make([]framework.NodePluginScores, 0, len(nodes))
		for i := range nodes {
			result = append(result, framework.NodePluginScores{
				Name:       nodes[i].Name,
				TotalScore: 1,
			})
		}
		return result, nil
	}

	// Run PreScore plugins.
	preScoreStatus := fwk.RunPreScorePlugins(ctx, state, pod, nodes)
	if !preScoreStatus.IsSuccess() {
		return nil, preScoreStatus.AsError()
	}

	// Run the Score plugins.
	nodesScores, scoreStatus := fwk.RunScorePlugins(ctx, state, pod, nodes)
	if !scoreStatus.IsSuccess() {
		return nil, scoreStatus.AsError()
	}

	// Additional details logged at level 10 if enabled.
	loggerVTen := logger.V(10)
	if loggerVTen.Enabled() {
		for _, nodeScore := range nodesScores {
			for _, pluginScore := range nodeScore.Scores {
				loggerVTen.Info("Plugin scored node for pod", "pod", klog.KObj(pod), "plugin", pluginScore.Name, "node", nodeScore.Name, "score", pluginScore.Score)
			}
		}
	}

	if len(extenders) != 0 && nodes != nil {
		// allNodeExtendersScores has all extenders scores for all nodes.
		// It is keyed with node name.
		allNodeExtendersScores := make(map[string]*framework.NodePluginScores, len(nodes))
		var mu sync.Mutex
		var wg sync.WaitGroup
		for i := range extenders {
			if !extenders[i].IsInterested(pod) {
				continue
			}
			wg.Add(1)
			go func(extIndex int) {
				metrics.Goroutines.WithLabelValues(metrics.PrioritizingExtender).Inc()
				defer func() {
					metrics.Goroutines.WithLabelValues(metrics.PrioritizingExtender).Dec()
					wg.Done()
				}()
				prioritizedList, weight, err := extenders[extIndex].Prioritize(pod, nodes)
				if err != nil {
					// Prioritization errors from extender can be ignored, let k8s/other extenders determine the priorities
					logger.V(5).Info("Failed to run extender's priority function. No score given by this extender.", "error", err, "pod", klog.KObj(pod), "extender", extenders[extIndex].Name())
					return
				}
				mu.Lock()
				defer mu.Unlock()
				for i := range *prioritizedList {
					nodename := (*prioritizedList)[i].Host
					score := (*prioritizedList)[i].Score
					if loggerVTen.Enabled() {
						loggerVTen.Info("Extender scored node for pod", "pod", klog.KObj(pod), "extender", extenders[extIndex].Name(), "node", nodename, "score", score)
					}

					// MaxExtenderPriority may diverge from the max priority used in the scheduler and defined by MaxNodeScore,
					// therefore we need to scale the score returned by extenders to the score range used by the scheduler.
					finalscore := score * weight * (framework.MaxNodeScore / extenderv1.MaxExtenderPriority)

					if allNodeExtendersScores[nodename] == nil {
						allNodeExtendersScores[nodename] = &framework.NodePluginScores{
							Name:   nodename,
							Scores: make([]framework.PluginScore, 0, len(extenders)),
						}
					}
					allNodeExtendersScores[nodename].Scores = append(allNodeExtendersScores[nodename].Scores, framework.PluginScore{
						Name:  extenders[extIndex].Name(),
						Score: finalscore,
					})
					allNodeExtendersScores[nodename].TotalScore += finalscore
				}
			}(i)
		}
		// wait for all go routines to finish
		wg.Wait()
		for i := range nodesScores {
			if score, ok := allNodeExtendersScores[nodes[i].Name]; ok {
				nodesScores[i].Scores = append(nodesScores[i].Scores, score.Scores...)
				nodesScores[i].TotalScore += score.TotalScore
			}
		}
	}

	if loggerVTen.Enabled() {
		for i := range nodesScores {
			loggerVTen.Info("Calculated node's final score for pod", "pod", klog.KObj(pod), "node", nodesScores[i].Name, "score", nodesScores[i].TotalScore)
		}
	}
	return nodesScores, nil
}

var errEmptyPriorityList = errors.New("empty priorityList")

// selectHost takes a prioritized list of nodes and then picks one
// in a reservoir sampling manner from the nodes that had the highest score.
// It also returns the top {count} Nodes,
// and the top of the list will be always the selected host.
func selectHost(nodeScoreList []framework.NodePluginScores, count int) (string, []framework.NodePluginScores, error) {
	if len(nodeScoreList) == 0 {
		return "", nil, errEmptyPriorityList
	}

	var h nodeScoreHeap = nodeScoreList
	heap.Init(&h)
	cntOfMaxScore := 1
	selectedIndex := 0
	// The top of the heap is the NodeScoreResult with the highest score.
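	// When k nodes are tied at that highest score, the reservoir sampling below
	// selects each of them with probability 1/k.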
	sortedNodeScoreList := make([]framework.NodePluginScores, 0, count)
	sortedNodeScoreList = append(sortedNodeScoreList, heap.Pop(&h).(framework.NodePluginScores))

	// This for-loop will continue until all Nodes with the highest scores get checked for a reservoir sampling,
	// and sortedNodeScoreList gets (count - 1) elements.
	for ns := heap.Pop(&h).(framework.NodePluginScores); ; ns = heap.Pop(&h).(framework.NodePluginScores) {
		if ns.TotalScore != sortedNodeScoreList[0].TotalScore && len(sortedNodeScoreList) == count {
			break
		}

		if ns.TotalScore == sortedNodeScoreList[0].TotalScore {
			cntOfMaxScore++
			if rand.Intn(cntOfMaxScore) == 0 {
				// Replace the candidate with probability of 1/cntOfMaxScore
				selectedIndex = cntOfMaxScore - 1
			}
		}

		sortedNodeScoreList = append(sortedNodeScoreList, ns)

		if h.Len() == 0 {
			break
		}
	}

	if selectedIndex != 0 {
		// replace the first one with selected one
		previous := sortedNodeScoreList[0]
		sortedNodeScoreList[0] = sortedNodeScoreList[selectedIndex]
		sortedNodeScoreList[selectedIndex] = previous
	}

	if len(sortedNodeScoreList) > count {
		sortedNodeScoreList = sortedNodeScoreList[:count]
	}

	return sortedNodeScoreList[0].Name, sortedNodeScoreList, nil
}

// nodeScoreHeap is a heap of framework.NodePluginScores.
type nodeScoreHeap []framework.NodePluginScores

// nodeScoreHeap implements heap.Interface.
var _ heap.Interface = &nodeScoreHeap{}

func (h nodeScoreHeap) Len() int           { return len(h) }
func (h nodeScoreHeap) Less(i, j int) bool { return h[i].TotalScore > h[j].TotalScore }
func (h nodeScoreHeap) Swap(i, j int)      { h[i], h[j] = h[j], h[i] }

func (h *nodeScoreHeap) Push(x interface{}) {
	*h = append(*h, x.(framework.NodePluginScores))
}

func (h *nodeScoreHeap) Pop() interface{} {
	old := *h
	n := len(old)
	x := old[n-1]
	*h = old[0 : n-1]
	return x
}

// assume signals to the cache that a pod is already in the cache, so that binding can be asynchronous.
// assume modifies `assumed`.
func (sched *Scheduler) assume(logger klog.Logger, assumed *v1.Pod, host string) error {
	// Optimistically assume that the binding will succeed and send it to apiserver
	// in the background.
	// If the binding fails, scheduler will release resources allocated to assumed pod
	// immediately.
	assumed.Spec.NodeName = host

	if err := sched.Cache.AssumePod(logger, assumed); err != nil {
		logger.Error(err, "Scheduler cache AssumePod failed")
		return err
	}
	// if "assumed" is a nominated pod, we should remove it from internal cache
	if sched.SchedulingQueue != nil {
		sched.SchedulingQueue.DeleteNominatedPodIfExists(assumed)
	}

	return nil
}

// bind binds a pod to a given node defined in a binding object.
// The precedence for binding is: (1) extenders and (2) framework plugins.
// We expect this to run asynchronously, so we handle binding metrics internally.
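// If an extender binds the pod, the framework's Bind plugins are not run.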
func (sched *Scheduler) bind(ctx context.Context, fwk framework.Framework, assumed *v1.Pod, targetNode string, state *framework.CycleState) (status *framework.Status) {
	logger := klog.FromContext(ctx)
	defer func() {
		sched.finishBinding(logger, fwk, assumed, targetNode, status)
	}()

	bound, err := sched.extendersBinding(assumed, targetNode)
	if bound {
		return framework.AsStatus(err)
	}
	return fwk.RunBindPlugins(ctx, state, assumed, targetNode)
}

// TODO(#87159): Move this to a Plugin.
func (sched *Scheduler) extendersBinding(pod *v1.Pod, node string) (bool, error) {
	for _, extender := range sched.Extenders {
		if !extender.IsBinder() || !extender.IsInterested(pod) {
			continue
		}
		return true, extender.Bind(&v1.Binding{
			ObjectMeta: metav1.ObjectMeta{Namespace: pod.Namespace, Name: pod.Name, UID: pod.UID},
			Target:     v1.ObjectReference{Kind: "Node", Name: node},
		})
	}
	return false, nil
}

func (sched *Scheduler) finishBinding(logger klog.Logger, fwk framework.Framework, assumed *v1.Pod, targetNode string, status *framework.Status) {
	if finErr := sched.Cache.FinishBinding(logger, assumed); finErr != nil {
		logger.Error(finErr, "Scheduler cache FinishBinding failed")
	}
	if !status.IsSuccess() {
		logger.V(1).Info("Failed to bind pod", "pod", klog.KObj(assumed))
		return
	}

	fwk.EventRecorder().Eventf(assumed, nil, v1.EventTypeNormal, "Scheduled", "Binding", "Successfully assigned %v/%v to %v", assumed.Namespace, assumed.Name, targetNode)
}

func getAttemptsLabel(p *framework.QueuedPodInfo) string {
	// We breakdown the pod scheduling duration by attempts capped to a limit
	// to avoid ending up with a high cardinality metric.
	if p.Attempts >= 15 {
		return "15+"
	}
	return strconv.Itoa(p.Attempts)
}

// handleSchedulingFailure records an event for the pod that indicates the
// pod has failed to schedule. Also, update the pod condition and nominated node name if set.
func (sched *Scheduler) handleSchedulingFailure(ctx context.Context, fwk framework.Framework, podInfo *framework.QueuedPodInfo, status *framework.Status, nominatingInfo *framework.NominatingInfo, start time.Time) {
	calledDone := false
	defer func() {
		if !calledDone {
			// Basically, AddUnschedulableIfNotPresent calls DonePod internally.
			// But, AddUnschedulableIfNotPresent isn't called in some corner cases.
			// Here, we call DonePod explicitly to avoid leaking the pod.
			sched.SchedulingQueue.Done(podInfo.Pod.UID)
		}
	}()

	logger := klog.FromContext(ctx)
	reason := v1.PodReasonSchedulerError
	if status.IsRejected() {
		reason = v1.PodReasonUnschedulable
	}

	switch reason {
	case v1.PodReasonUnschedulable:
		metrics.PodUnschedulable(fwk.ProfileName(), metrics.SinceInSeconds(start))
	case v1.PodReasonSchedulerError:
		metrics.PodScheduleError(fwk.ProfileName(), metrics.SinceInSeconds(start))
	}

	pod := podInfo.Pod
	err := status.AsError()
	errMsg := status.Message()

	if err == ErrNoNodesAvailable {
		logger.V(2).Info("Unable to schedule pod; no nodes are registered to the cluster; waiting", "pod", klog.KObj(pod))
	} else if fitError, ok := err.(*framework.FitError); ok { // Inject UnschedulablePlugins to PodInfo, which will be used later for moving Pods between queues efficiently.
		podInfo.UnschedulablePlugins = fitError.Diagnosis.UnschedulablePlugins
		podInfo.PendingPlugins = fitError.Diagnosis.PendingPlugins
		logger.V(2).Info("Unable to schedule pod; no fit; waiting", "pod", klog.KObj(pod), "err", errMsg)
	} else if apierrors.IsNotFound(err) {
		logger.V(2).Info("Unable to schedule pod, possibly due to node not found; waiting", "pod", klog.KObj(pod), "err", errMsg)
		if errStatus, ok := err.(apierrors.APIStatus); ok && errStatus.Status().Details.Kind == "node" {
			nodeName := errStatus.Status().Details.Name
			// when node is not found, We do not remove the node right away. Trying again to get
			// the node and if the node is still not found, then remove it from the scheduler cache.
			_, err := fwk.ClientSet().CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
			if err != nil && apierrors.IsNotFound(err) {
				node := v1.Node{ObjectMeta: metav1.ObjectMeta{Name: nodeName}}
				if err := sched.Cache.RemoveNode(logger, &node); err != nil {
					logger.V(4).Info("Node is not found; failed to remove it from the cache", "node", node.Name)
				}
			}
		}
	} else {
		logger.Error(err, "Error scheduling pod; retrying", "pod", klog.KObj(pod))
	}

	// Check if the Pod exists in informer cache.
	podLister := fwk.SharedInformerFactory().Core().V1().Pods().Lister()
	cachedPod, e := podLister.Pods(pod.Namespace).Get(pod.Name)
	if e != nil {
		logger.Info("Pod doesn't exist in informer cache", "pod", klog.KObj(pod), "err", e)
		// We need to call DonePod here because we don't call AddUnschedulableIfNotPresent in this case.
	} else {
		// In the case of extender, the pod may have been bound successfully, but timed out returning its response to the scheduler.
		// It could result in the live version to carry .spec.nodeName, and that's inconsistent with the internal-queued version.
		if len(cachedPod.Spec.NodeName) != 0 {
			logger.Info("Pod has been assigned to node. Abort adding it back to queue.", "pod", klog.KObj(pod), "node", cachedPod.Spec.NodeName)
			// We need to call DonePod here because we don't call AddUnschedulableIfNotPresent in this case.
		} else {
			// As <cachedPod> is from SharedInformer, we need to do a DeepCopy() here.
			// ignore this err since apiserver doesn't properly validate affinity terms
			// and we can't fix the validation for backwards compatibility.
			podInfo.PodInfo, _ = framework.NewPodInfo(cachedPod.DeepCopy())
			if err := sched.SchedulingQueue.AddUnschedulableIfNotPresent(logger, podInfo, sched.SchedulingQueue.SchedulingCycle()); err != nil {
				logger.Error(err, "Error occurred")
			}
			calledDone = true
		}
	}

	// Update the scheduling queue with the nominated pod information. Without
	// this, there would be a race condition between the next scheduling cycle
	// and the time the scheduler receives a Pod Update for the nominated pod.
	// Here we check for nil only for tests.
	if sched.SchedulingQueue != nil {
		logger := klog.FromContext(ctx)
		sched.SchedulingQueue.AddNominatedPod(logger, podInfo.PodInfo, nominatingInfo)
	}

	if err == nil {
		// Only tests can reach here.
		return
	}

	msg := truncateMessage(errMsg)
	fwk.EventRecorder().Eventf(pod, nil, v1.EventTypeWarning, "FailedScheduling", "Scheduling", msg)
	if err := updatePod(ctx, sched.client, pod, &v1.PodCondition{
		Type:    v1.PodScheduled,
		Status:  v1.ConditionFalse,
		Reason:  reason,
		Message: errMsg,
	}, nominatingInfo); err != nil {
		klog.FromContext(ctx).Error(err, "Error updating pod", "pod", klog.KObj(pod))
	}
}

// truncateMessage truncates a message if it hits the NoteLengthLimit.
func truncateMessage(message string) string {
	max := validation.NoteLengthLimit
	if len(message) <= max {
		return message
	}
	suffix := " ..."
	return message[:max-len(suffix)] + suffix
}

func updatePod(ctx context.Context, client clientset.Interface, pod *v1.Pod, condition *v1.PodCondition, nominatingInfo *framework.NominatingInfo) error {
	logger := klog.FromContext(ctx)
	logger.V(3).Info("Updating pod condition", "pod", klog.KObj(pod), "conditionType", condition.Type, "conditionStatus", condition.Status, "conditionReason", condition.Reason)
	podStatusCopy := pod.Status.DeepCopy()
	// NominatedNodeName is updated only if we are trying to set it, and the value is
	// different from the existing one.
	nnnNeedsUpdate := nominatingInfo.Mode() == framework.ModeOverride && pod.Status.NominatedNodeName != nominatingInfo.NominatedNodeName
	if !podutil.UpdatePodCondition(podStatusCopy, condition) && !nnnNeedsUpdate {
		return nil
	}
	if nnnNeedsUpdate {
		podStatusCopy.NominatedNodeName = nominatingInfo.NominatedNodeName
	}
	return util.PatchPodStatus(ctx, client, pod, podStatusCopy)
}