k8s.io/kubernetes@v1.29.3/pkg/controller/nodelifecycle/node_lifecycle_controller.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// The Controller sets taints on nodes.
// Tainted nodes should not be used for new workloads and
// some effort should be given to getting existing workloads
// off of tainted nodes.

package nodelifecycle

import (
	"context"
	"fmt"
	"sync"
	"time"

	"k8s.io/klog/v2"

	coordv1 "k8s.io/api/coordination/v1"
	v1 "k8s.io/api/core/v1"
	apiequality "k8s.io/apimachinery/pkg/api/equality"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	appsv1informers "k8s.io/client-go/informers/apps/v1"
	coordinformers "k8s.io/client-go/informers/coordination/v1"
	coreinformers "k8s.io/client-go/informers/core/v1"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/scheme"
	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
	appsv1listers "k8s.io/client-go/listers/apps/v1"
	coordlisters "k8s.io/client-go/listers/coordination/v1"
	corelisters "k8s.io/client-go/listers/core/v1"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/record"
	"k8s.io/client-go/util/flowcontrol"
	"k8s.io/client-go/util/workqueue"
	nodetopology "k8s.io/component-helpers/node/topology"
	kubeletapis "k8s.io/kubelet/pkg/apis"
	"k8s.io/kubernetes/pkg/controller"
	"k8s.io/kubernetes/pkg/controller/nodelifecycle/scheduler"
	"k8s.io/kubernetes/pkg/controller/tainteviction"
	controllerutil "k8s.io/kubernetes/pkg/controller/util/node"
	"k8s.io/kubernetes/pkg/features"
	taintutils "k8s.io/kubernetes/pkg/util/taints"
)

func init() {
	// Register prometheus metrics
	Register()
}

var (
	// UnreachableTaintTemplate is the taint for when a node becomes unreachable.
71 UnreachableTaintTemplate = &v1.Taint{ 72 Key: v1.TaintNodeUnreachable, 73 Effect: v1.TaintEffectNoExecute, 74 } 75 76 // NotReadyTaintTemplate is the taint for when a node is not ready for 77 // executing pods 78 NotReadyTaintTemplate = &v1.Taint{ 79 Key: v1.TaintNodeNotReady, 80 Effect: v1.TaintEffectNoExecute, 81 } 82 83 // map {NodeConditionType: {ConditionStatus: TaintKey}} 84 // represents which NodeConditionType under which ConditionStatus should be 85 // tainted with which TaintKey 86 // for certain NodeConditionType, there are multiple {ConditionStatus,TaintKey} pairs 87 nodeConditionToTaintKeyStatusMap = map[v1.NodeConditionType]map[v1.ConditionStatus]string{ 88 v1.NodeReady: { 89 v1.ConditionFalse: v1.TaintNodeNotReady, 90 v1.ConditionUnknown: v1.TaintNodeUnreachable, 91 }, 92 v1.NodeMemoryPressure: { 93 v1.ConditionTrue: v1.TaintNodeMemoryPressure, 94 }, 95 v1.NodeDiskPressure: { 96 v1.ConditionTrue: v1.TaintNodeDiskPressure, 97 }, 98 v1.NodeNetworkUnavailable: { 99 v1.ConditionTrue: v1.TaintNodeNetworkUnavailable, 100 }, 101 v1.NodePIDPressure: { 102 v1.ConditionTrue: v1.TaintNodePIDPressure, 103 }, 104 } 105 106 taintKeyToNodeConditionMap = map[string]v1.NodeConditionType{ 107 v1.TaintNodeNotReady: v1.NodeReady, 108 v1.TaintNodeUnreachable: v1.NodeReady, 109 v1.TaintNodeNetworkUnavailable: v1.NodeNetworkUnavailable, 110 v1.TaintNodeMemoryPressure: v1.NodeMemoryPressure, 111 v1.TaintNodeDiskPressure: v1.NodeDiskPressure, 112 v1.TaintNodePIDPressure: v1.NodePIDPressure, 113 } 114 ) 115 116 // ZoneState is the state of a given zone. 117 type ZoneState string 118 119 const ( 120 stateInitial = ZoneState("Initial") 121 stateNormal = ZoneState("Normal") 122 stateFullDisruption = ZoneState("FullDisruption") 123 statePartialDisruption = ZoneState("PartialDisruption") 124 ) 125 126 const ( 127 // The amount of time the nodecontroller should sleep between retrying node health updates 128 retrySleepTime = 20 * time.Millisecond 129 nodeNameKeyIndex = "spec.nodeName" 130 // podUpdateWorkerSizes assumes that in most cases pod will be handled by monitorNodeHealth pass. 131 // Pod update workers will only handle lagging cache pods. 4 workers should be enough. 132 podUpdateWorkerSize = 4 133 // nodeUpdateWorkerSize defines the size of workers for node update or/and pod update. 134 nodeUpdateWorkerSize = 8 135 136 // taintEvictionController is defined here in order to prevent imports of 137 // k8s.io/kubernetes/cmd/kube-controller-manager/names which would result in validation errors. 138 // This constant will be removed upon graduation of the SeparateTaintEvictionController feature. 139 taintEvictionController = "taint-eviction-controller" 140 ) 141 142 // labelReconcileInfo lists Node labels to reconcile, and how to reconcile them. 143 // primaryKey and secondaryKey are keys of labels to reconcile. 144 // - If both keys exist, but their values don't match. Use the value from the 145 // primaryKey as the source of truth to reconcile. 146 // - If ensureSecondaryExists is true, and the secondaryKey does not 147 // exist, secondaryKey will be added with the value of the primaryKey. 148 var labelReconcileInfo = []struct { 149 primaryKey string 150 secondaryKey string 151 ensureSecondaryExists bool 152 }{ 153 { 154 // Reconcile the beta and the stable OS label using the stable label as the source of truth. 
155 // TODO(#89477): no earlier than 1.22: drop the beta labels if they differ from the GA labels 156 primaryKey: v1.LabelOSStable, 157 secondaryKey: kubeletapis.LabelOS, 158 ensureSecondaryExists: true, 159 }, 160 { 161 // Reconcile the beta and the stable arch label using the stable label as the source of truth. 162 // TODO(#89477): no earlier than 1.22: drop the beta labels if they differ from the GA labels 163 primaryKey: v1.LabelArchStable, 164 secondaryKey: kubeletapis.LabelArch, 165 ensureSecondaryExists: true, 166 }, 167 } 168 169 type nodeHealthData struct { 170 probeTimestamp metav1.Time 171 readyTransitionTimestamp metav1.Time 172 status *v1.NodeStatus 173 lease *coordv1.Lease 174 } 175 176 func (n *nodeHealthData) deepCopy() *nodeHealthData { 177 if n == nil { 178 return nil 179 } 180 return &nodeHealthData{ 181 probeTimestamp: n.probeTimestamp, 182 readyTransitionTimestamp: n.readyTransitionTimestamp, 183 status: n.status.DeepCopy(), 184 lease: n.lease.DeepCopy(), 185 } 186 } 187 188 type nodeHealthMap struct { 189 lock sync.RWMutex 190 nodeHealths map[string]*nodeHealthData 191 } 192 193 func newNodeHealthMap() *nodeHealthMap { 194 return &nodeHealthMap{ 195 nodeHealths: make(map[string]*nodeHealthData), 196 } 197 } 198 199 // getDeepCopy - returns copy of node health data. 200 // It prevents data being changed after retrieving it from the map. 201 func (n *nodeHealthMap) getDeepCopy(name string) *nodeHealthData { 202 n.lock.RLock() 203 defer n.lock.RUnlock() 204 return n.nodeHealths[name].deepCopy() 205 } 206 207 func (n *nodeHealthMap) set(name string, data *nodeHealthData) { 208 n.lock.Lock() 209 defer n.lock.Unlock() 210 n.nodeHealths[name] = data 211 } 212 213 type podUpdateItem struct { 214 namespace string 215 name string 216 } 217 218 // Controller is the controller that manages node's life cycle. 219 type Controller struct { 220 taintManager *tainteviction.Controller 221 222 podLister corelisters.PodLister 223 podInformerSynced cache.InformerSynced 224 kubeClient clientset.Interface 225 226 // This timestamp is to be used instead of LastProbeTime stored in Condition. We do this 227 // to avoid the problem with time skew across the cluster. 228 now func() metav1.Time 229 230 enterPartialDisruptionFunc func(nodeNum int) float32 231 enterFullDisruptionFunc func(nodeNum int) float32 232 computeZoneStateFunc func(nodeConditions []*v1.NodeCondition) (int, ZoneState) 233 234 knownNodeSet map[string]*v1.Node 235 // per Node map storing last observed health together with a local time when it was observed. 236 nodeHealthMap *nodeHealthMap 237 238 // evictorLock protects zonePodEvictor and zoneNoExecuteTainter. 239 evictorLock sync.Mutex 240 // workers that are responsible for tainting nodes. 241 zoneNoExecuteTainter map[string]*scheduler.RateLimitedTimedQueue 242 243 nodesToRetry sync.Map 244 245 zoneStates map[string]ZoneState 246 247 daemonSetStore appsv1listers.DaemonSetLister 248 daemonSetInformerSynced cache.InformerSynced 249 250 leaseLister coordlisters.LeaseLister 251 leaseInformerSynced cache.InformerSynced 252 nodeLister corelisters.NodeLister 253 nodeInformerSynced cache.InformerSynced 254 255 getPodsAssignedToNode func(nodeName string) ([]*v1.Pod, error) 256 257 broadcaster record.EventBroadcaster 258 recorder record.EventRecorder 259 260 // Value controlling Controller monitoring period, i.e. how often does Controller 261 // check node health signal posted from kubelet. This value should be lower than 262 // nodeMonitorGracePeriod. 
263 // TODO: Change node health monitor to watch based. 264 nodeMonitorPeriod time.Duration 265 266 // When node is just created, e.g. cluster bootstrap or node creation, we give 267 // a longer grace period. 268 nodeStartupGracePeriod time.Duration 269 270 // Controller will not proactively sync node health, but will monitor node 271 // health signal updated from kubelet. There are 2 kinds of node healthiness 272 // signals: NodeStatus and NodeLease. If it doesn't receive update for this amount 273 // of time, it will start posting "NodeReady==ConditionUnknown". The amount of 274 // time before which Controller start evicting pods is controlled via flag 275 // 'pod-eviction-timeout'. 276 // Note: be cautious when changing the constant, it must work with 277 // nodeStatusUpdateFrequency in kubelet and renewInterval in NodeLease 278 // controller. The node health signal update frequency is the minimal of the 279 // two. 280 // There are several constraints: 281 // 1. nodeMonitorGracePeriod must be N times more than the node health signal 282 // update frequency, where N means number of retries allowed for kubelet to 283 // post node status/lease. It is pointless to make nodeMonitorGracePeriod 284 // be less than the node health signal update frequency, since there will 285 // only be fresh values from Kubelet at an interval of node health signal 286 // update frequency. 287 // 2. nodeMonitorGracePeriod can't be too large for user experience - larger 288 // value takes longer for user to see up-to-date node health. 289 nodeMonitorGracePeriod time.Duration 290 291 // Number of workers Controller uses to process node monitor health updates. 292 // Defaults to scheduler.UpdateWorkerSize. 293 nodeUpdateWorkerSize int 294 295 evictionLimiterQPS float32 296 secondaryEvictionLimiterQPS float32 297 largeClusterThreshold int32 298 unhealthyZoneThreshold float32 299 300 nodeUpdateQueue workqueue.Interface 301 podUpdateQueue workqueue.RateLimitingInterface 302 } 303 304 // NewNodeLifecycleController returns a new taint controller. 
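// An illustrative wiring sketch (not part of this file; names outside this
// package are assumptions): the kube-controller-manager constructs and starts
// the controller roughly like this, with informers coming from a shared
// factory and the timing/rate values taken from its flags (usual defaults
// shown):
//
//	nc, err := NewNodeLifecycleController(ctx,
//	    informerFactory.Coordination().V1().Leases(),
//	    informerFactory.Core().V1().Pods(),
//	    informerFactory.Core().V1().Nodes(),
//	    informerFactory.Apps().V1().DaemonSets(),
//	    kubeClient,
//	    5*time.Second,  // nodeMonitorPeriod (--node-monitor-period)
//	    time.Minute,    // nodeStartupGracePeriod (--node-startup-grace-period)
//	    40*time.Second, // nodeMonitorGracePeriod (--node-monitor-grace-period)
//	    0.1,            // evictionLimiterQPS (--node-eviction-rate)
//	    0.01,           // secondaryEvictionLimiterQPS (--secondary-node-eviction-rate)
//	    50,             // largeClusterThreshold (--large-cluster-size-threshold)
//	    0.55,           // unhealthyZoneThreshold (--unhealthy-zone-threshold)
//	)
//	if err != nil {
//	    return err
//	}
//	go nc.Run(ctx)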
305 func NewNodeLifecycleController( 306 ctx context.Context, 307 leaseInformer coordinformers.LeaseInformer, 308 podInformer coreinformers.PodInformer, 309 nodeInformer coreinformers.NodeInformer, 310 daemonSetInformer appsv1informers.DaemonSetInformer, 311 kubeClient clientset.Interface, 312 nodeMonitorPeriod time.Duration, 313 nodeStartupGracePeriod time.Duration, 314 nodeMonitorGracePeriod time.Duration, 315 evictionLimiterQPS float32, 316 secondaryEvictionLimiterQPS float32, 317 largeClusterThreshold int32, 318 unhealthyZoneThreshold float32, 319 ) (*Controller, error) { 320 logger := klog.FromContext(ctx) 321 if kubeClient == nil { 322 logger.Error(nil, "kubeClient is nil when starting nodelifecycle Controller") 323 klog.FlushAndExit(klog.ExitFlushTimeout, 1) 324 } 325 326 eventBroadcaster := record.NewBroadcaster() 327 recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "node-controller"}) 328 329 nc := &Controller{ 330 kubeClient: kubeClient, 331 now: metav1.Now, 332 knownNodeSet: make(map[string]*v1.Node), 333 nodeHealthMap: newNodeHealthMap(), 334 broadcaster: eventBroadcaster, 335 recorder: recorder, 336 nodeMonitorPeriod: nodeMonitorPeriod, 337 nodeStartupGracePeriod: nodeStartupGracePeriod, 338 nodeMonitorGracePeriod: nodeMonitorGracePeriod, 339 nodeUpdateWorkerSize: nodeUpdateWorkerSize, 340 zoneNoExecuteTainter: make(map[string]*scheduler.RateLimitedTimedQueue), 341 nodesToRetry: sync.Map{}, 342 zoneStates: make(map[string]ZoneState), 343 evictionLimiterQPS: evictionLimiterQPS, 344 secondaryEvictionLimiterQPS: secondaryEvictionLimiterQPS, 345 largeClusterThreshold: largeClusterThreshold, 346 unhealthyZoneThreshold: unhealthyZoneThreshold, 347 nodeUpdateQueue: workqueue.NewNamed("node_lifecycle_controller"), 348 podUpdateQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "node_lifecycle_controller_pods"), 349 } 350 351 nc.enterPartialDisruptionFunc = nc.ReducedQPSFunc 352 nc.enterFullDisruptionFunc = nc.HealthyQPSFunc 353 nc.computeZoneStateFunc = nc.ComputeZoneState 354 355 podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 356 AddFunc: func(obj interface{}) { 357 pod := obj.(*v1.Pod) 358 nc.podUpdated(nil, pod) 359 }, 360 UpdateFunc: func(prev, obj interface{}) { 361 prevPod := prev.(*v1.Pod) 362 newPod := obj.(*v1.Pod) 363 nc.podUpdated(prevPod, newPod) 364 }, 365 DeleteFunc: func(obj interface{}) { 366 pod, isPod := obj.(*v1.Pod) 367 // We can get DeletedFinalStateUnknown instead of *v1.Pod here and we need to handle that correctly. 
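			// (A DeletedFinalStateUnknown is delivered when the informer missed the delete
			// event, e.g. across a relist; its Obj field carries the last known object,
			// which is why both type assertions below are needed.)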
368 if !isPod { 369 deletedState, ok := obj.(cache.DeletedFinalStateUnknown) 370 if !ok { 371 logger.Error(nil, "Received unexpected object", "object", obj) 372 return 373 } 374 pod, ok = deletedState.Obj.(*v1.Pod) 375 if !ok { 376 logger.Error(nil, "DeletedFinalStateUnknown contained non-Pod object", "object", deletedState.Obj) 377 return 378 } 379 } 380 nc.podUpdated(pod, nil) 381 }, 382 }) 383 nc.podInformerSynced = podInformer.Informer().HasSynced 384 podInformer.Informer().AddIndexers(cache.Indexers{ 385 nodeNameKeyIndex: func(obj interface{}) ([]string, error) { 386 pod, ok := obj.(*v1.Pod) 387 if !ok { 388 return []string{}, nil 389 } 390 if len(pod.Spec.NodeName) == 0 { 391 return []string{}, nil 392 } 393 return []string{pod.Spec.NodeName}, nil 394 }, 395 }) 396 397 podIndexer := podInformer.Informer().GetIndexer() 398 nc.getPodsAssignedToNode = func(nodeName string) ([]*v1.Pod, error) { 399 objs, err := podIndexer.ByIndex(nodeNameKeyIndex, nodeName) 400 if err != nil { 401 return nil, err 402 } 403 pods := make([]*v1.Pod, 0, len(objs)) 404 for _, obj := range objs { 405 pod, ok := obj.(*v1.Pod) 406 if !ok { 407 continue 408 } 409 pods = append(pods, pod) 410 } 411 return pods, nil 412 } 413 nc.podLister = podInformer.Lister() 414 nc.nodeLister = nodeInformer.Lister() 415 416 if !utilfeature.DefaultFeatureGate.Enabled(features.SeparateTaintEvictionController) { 417 logger.Info("Running TaintEvictionController as part of NodeLifecyleController") 418 tm, err := tainteviction.New(ctx, kubeClient, podInformer, nodeInformer, taintEvictionController) 419 if err != nil { 420 return nil, err 421 } 422 nc.taintManager = tm 423 } 424 425 logger.Info("Controller will reconcile labels") 426 nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ 427 AddFunc: controllerutil.CreateAddNodeHandler(func(node *v1.Node) error { 428 nc.nodeUpdateQueue.Add(node.Name) 429 return nil 430 }), 431 UpdateFunc: controllerutil.CreateUpdateNodeHandler(func(_, newNode *v1.Node) error { 432 nc.nodeUpdateQueue.Add(newNode.Name) 433 return nil 434 }), 435 DeleteFunc: controllerutil.CreateDeleteNodeHandler(logger, func(node *v1.Node) error { 436 nc.nodesToRetry.Delete(node.Name) 437 return nil 438 }), 439 }) 440 441 nc.leaseLister = leaseInformer.Lister() 442 nc.leaseInformerSynced = leaseInformer.Informer().HasSynced 443 444 nc.nodeInformerSynced = nodeInformer.Informer().HasSynced 445 446 nc.daemonSetStore = daemonSetInformer.Lister() 447 nc.daemonSetInformerSynced = daemonSetInformer.Informer().HasSynced 448 449 return nc, nil 450 } 451 452 // Run starts an asynchronous loop that monitors the status of cluster nodes. 453 func (nc *Controller) Run(ctx context.Context) { 454 defer utilruntime.HandleCrash() 455 456 // Start events processing pipeline. 457 nc.broadcaster.StartStructuredLogging(0) 458 logger := klog.FromContext(ctx) 459 logger.Info("Sending events to api server") 460 nc.broadcaster.StartRecordingToSink( 461 &v1core.EventSinkImpl{ 462 Interface: v1core.New(nc.kubeClient.CoreV1().RESTClient()).Events(""), 463 }) 464 defer nc.broadcaster.Shutdown() 465 466 // Close node update queue to cleanup go routine. 
	defer nc.nodeUpdateQueue.ShutDown()
	defer nc.podUpdateQueue.ShutDown()

	logger.Info("Starting node controller")
	defer logger.Info("Shutting down node controller")

	if !cache.WaitForNamedCacheSync("taint", ctx.Done(), nc.leaseInformerSynced, nc.nodeInformerSynced, nc.podInformerSynced, nc.daemonSetInformerSynced) {
		return
	}

	if !utilfeature.DefaultFeatureGate.Enabled(features.SeparateTaintEvictionController) {
		logger.Info("Starting", "controller", taintEvictionController)
		go nc.taintManager.Run(ctx)
	}

	// Start workers to reconcile labels and/or update the NoSchedule taint for nodes.
	for i := 0; i < nodeUpdateWorkerSize; i++ {
		// Thanks to "workqueue", each worker just needs to get an item from the queue:
		// the item is flagged while it is being processed, so if a new event comes in,
		// the item is re-queued until "Done" is called. As a result no more than one
		// worker handles the same item and no event is missed.
		go wait.UntilWithContext(ctx, nc.doNodeProcessingPassWorker, time.Second)
	}

	for i := 0; i < podUpdateWorkerSize; i++ {
		go wait.UntilWithContext(ctx, nc.doPodProcessingWorker, time.Second)
	}

	// Handling taint based evictions. Because we don't want a dedicated logic in TaintManager for NC-originated
	// taints and we normally don't rate limit evictions caused by taints, we need to rate limit adding taints.
	go wait.UntilWithContext(ctx, nc.doNoExecuteTaintingPass, scheduler.NodeEvictionPeriod)

	// Incorporate the results of node health signal pushed from kubelet to master.
	go wait.UntilWithContext(ctx, func(ctx context.Context) {
		if err := nc.monitorNodeHealth(ctx); err != nil {
			logger.Error(err, "Error monitoring node health")
		}
	}, nc.nodeMonitorPeriod)

	<-ctx.Done()
}

func (nc *Controller) doNodeProcessingPassWorker(ctx context.Context) {
	logger := klog.FromContext(ctx)
	for {
		obj, shutdown := nc.nodeUpdateQueue.Get()
		// "nodeUpdateQueue" will be shut down when "stopCh" is closed;
		// we do not need to re-check "stopCh" again.
		if shutdown {
			return
		}
		nodeName := obj.(string)
		if err := nc.doNoScheduleTaintingPass(ctx, nodeName); err != nil {
			logger.Error(err, "Failed to taint NoSchedule on node, requeue it", "node", klog.KRef("", nodeName))
			// TODO(k82cn): Add nodeName back to the queue
		}
		// TODO: re-evaluate whether there are any labels that need to be
		// reconciled in 1.19. Remove this function if it's no longer necessary.
		if err := nc.reconcileNodeLabels(ctx, nodeName); err != nil {
			logger.Error(err, "Failed to reconcile labels for node, requeue it", "node", klog.KRef("", nodeName))
			// TODO(yujuhong): Add nodeName back to the queue
		}
		nc.nodeUpdateQueue.Done(nodeName)
	}
}

func (nc *Controller) doNoScheduleTaintingPass(ctx context.Context, nodeName string) error {
	node, err := nc.nodeLister.Get(nodeName)
	if err != nil {
		// If the node is not found, just ignore it.
		if apierrors.IsNotFound(err) {
			return nil
		}
		return err
	}

	// Map the node's conditions to taints.
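	// For example (illustrative): a node reporting MemoryPressure=True maps to
	// node.kubernetes.io/memory-pressure:NoSchedule, and Ready=Unknown maps to
	// node.kubernetes.io/unreachable:NoSchedule, per nodeConditionToTaintKeyStatusMap above.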
	var taints []v1.Taint
	for _, condition := range node.Status.Conditions {
		if taintMap, found := nodeConditionToTaintKeyStatusMap[condition.Type]; found {
			if taintKey, found := taintMap[condition.Status]; found {
				taints = append(taints, v1.Taint{
					Key:    taintKey,
					Effect: v1.TaintEffectNoSchedule,
				})
			}
		}
	}
	if node.Spec.Unschedulable {
		// If unschedulable, append the related taint.
		taints = append(taints, v1.Taint{
			Key:    v1.TaintNodeUnschedulable,
			Effect: v1.TaintEffectNoSchedule,
		})
	}

	// Get the existing taints of the node.
	nodeTaints := taintutils.TaintSetFilter(node.Spec.Taints, func(t *v1.Taint) bool {
		// Only NoSchedule taints are candidates to be compared with "taints" later.
		if t.Effect != v1.TaintEffectNoSchedule {
			return false
		}
		// Find the unschedulable taint of the node.
		if t.Key == v1.TaintNodeUnschedulable {
			return true
		}
		// Find the node condition taints of the node.
		_, found := taintKeyToNodeConditionMap[t.Key]
		return found
	})
	taintsToAdd, taintsToDel := taintutils.TaintSetDiff(taints, nodeTaints)
	// If there is nothing to add or delete, return directly.
	if len(taintsToAdd) == 0 && len(taintsToDel) == 0 {
		return nil
	}
	if !controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, taintsToAdd, taintsToDel, node) {
		return fmt.Errorf("failed to swap taints of node %+v", node)
	}
	return nil
}

func (nc *Controller) doNoExecuteTaintingPass(ctx context.Context) {
	// Extract the keys of the map in order to not hold
	// the evictorLock for the entire function and hold it
	// only when necessary.
	var zoneNoExecuteTainterKeys []string
	func() {
		nc.evictorLock.Lock()
		defer nc.evictorLock.Unlock()

		zoneNoExecuteTainterKeys = make([]string, 0, len(nc.zoneNoExecuteTainter))
		for k := range nc.zoneNoExecuteTainter {
			zoneNoExecuteTainterKeys = append(zoneNoExecuteTainterKeys, k)
		}
	}()
	logger := klog.FromContext(ctx)
	for _, k := range zoneNoExecuteTainterKeys {
		var zoneNoExecuteTainterWorker *scheduler.RateLimitedTimedQueue
		func() {
			nc.evictorLock.Lock()
			defer nc.evictorLock.Unlock()
			// Extracting the value without checking if the key
			// exists or not is safe to do here since zones do
			// not get removed, and consequently pod evictors for
			// these zones also do not get removed, only added.
			zoneNoExecuteTainterWorker = nc.zoneNoExecuteTainter[k]
		}()
		// Function should return 'false' and a time after which it should be retried, or 'true' if it shouldn't (it succeeded).
		zoneNoExecuteTainterWorker.Try(logger, func(value scheduler.TimedValue) (bool, time.Duration) {
			node, err := nc.nodeLister.Get(value.Value)
			if apierrors.IsNotFound(err) {
				logger.Info("Node no longer present in nodeLister", "node", klog.KRef("", value.Value))
				return true, 0
			} else if err != nil {
				logger.Info("Failed to get Node from the nodeLister", "node", klog.KRef("", value.Value), "err", err)
				// retry in 50 milliseconds
				return false, 50 * time.Millisecond
			}
			_, condition := controllerutil.GetNodeCondition(&node.Status, v1.NodeReady)
			// Because we want to mimic NodeStatus.Condition["Ready"] we make "unreachable" and "not ready" taints mutually exclusive.
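			// For example (illustrative): a node still carrying
			// node.kubernetes.io/not-ready:NoExecute when its Ready condition flips to
			// Unknown gets not-ready removed and unreachable added in a single swap below,
			// so at most one of the two NoExecute taints is present at any time.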
627 taintToAdd := v1.Taint{} 628 oppositeTaint := v1.Taint{} 629 switch condition.Status { 630 case v1.ConditionFalse: 631 taintToAdd = *NotReadyTaintTemplate 632 oppositeTaint = *UnreachableTaintTemplate 633 case v1.ConditionUnknown: 634 taintToAdd = *UnreachableTaintTemplate 635 oppositeTaint = *NotReadyTaintTemplate 636 default: 637 // It seems that the Node is ready again, so there's no need to taint it. 638 logger.V(4).Info("Node was in a taint queue, but it's ready now. Ignoring taint request", "node", klog.KRef("", value.Value)) 639 return true, 0 640 } 641 result := controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{&oppositeTaint}, node) 642 if result { 643 // Count the number of evictions. 644 zone := nodetopology.GetZoneKey(node) 645 evictionsTotal.WithLabelValues(zone).Inc() 646 } 647 648 return result, 0 649 }) 650 } 651 } 652 653 // monitorNodeHealth verifies node health are constantly updated by kubelet, and if not, post "NodeReady==ConditionUnknown". 654 // This function will 655 // - add nodes which are not ready or not reachable for a long period of time to a rate-limited 656 // queue so that NoExecute taints can be added by the goroutine running the doNoExecuteTaintingPass function, 657 // - update the PodReady condition Pods according to the state of the Node Ready condition. 658 func (nc *Controller) monitorNodeHealth(ctx context.Context) error { 659 start := nc.now() 660 defer func() { 661 updateAllNodesHealthDuration.Observe(time.Since(start.Time).Seconds()) 662 }() 663 664 // We are listing nodes from local cache as we can tolerate some small delays 665 // comparing to state from etcd and there is eventual consistency anyway. 666 nodes, err := nc.nodeLister.List(labels.Everything()) 667 if err != nil { 668 return err 669 } 670 added, deleted, newZoneRepresentatives := nc.classifyNodes(nodes) 671 logger := klog.FromContext(ctx) 672 for i := range newZoneRepresentatives { 673 nc.addPodEvictorForNewZone(logger, newZoneRepresentatives[i]) 674 } 675 for i := range added { 676 logger.V(1).Info("Controller observed a new Node", "node", klog.KRef("", added[i].Name)) 677 controllerutil.RecordNodeEvent(ctx, nc.recorder, added[i].Name, string(added[i].UID), v1.EventTypeNormal, "RegisteredNode", fmt.Sprintf("Registered Node %v in Controller", added[i].Name)) 678 nc.knownNodeSet[added[i].Name] = added[i] 679 nc.addPodEvictorForNewZone(logger, added[i]) 680 nc.markNodeAsReachable(ctx, added[i]) 681 } 682 683 for i := range deleted { 684 logger.V(1).Info("Controller observed a Node deletion", "node", klog.KRef("", deleted[i].Name)) 685 controllerutil.RecordNodeEvent(ctx, nc.recorder, deleted[i].Name, string(deleted[i].UID), v1.EventTypeNormal, "RemovingNode", fmt.Sprintf("Removing Node %v from Controller", deleted[i].Name)) 686 delete(nc.knownNodeSet, deleted[i].Name) 687 } 688 689 var zoneToNodeConditionsLock sync.Mutex 690 zoneToNodeConditions := map[string][]*v1.NodeCondition{} 691 updateNodeFunc := func(piece int) { 692 start := nc.now() 693 defer func() { 694 updateNodeHealthDuration.Observe(time.Since(start.Time).Seconds()) 695 }() 696 697 var observedReadyCondition v1.NodeCondition 698 var currentReadyCondition *v1.NodeCondition 699 node := nodes[piece].DeepCopy() 700 701 if err := wait.PollImmediate(retrySleepTime, retrySleepTime*scheduler.NodeHealthUpdateRetry, func() (bool, error) { 702 var err error 703 _, observedReadyCondition, currentReadyCondition, err = nc.tryUpdateNodeHealth(ctx, node) 704 if err == nil { 705 return 
true, nil 706 } 707 name := node.Name 708 node, err = nc.kubeClient.CoreV1().Nodes().Get(ctx, name, metav1.GetOptions{}) 709 if err != nil { 710 logger.Error(nil, "Failed while getting a Node to retry updating node health. Probably Node was deleted", "node", klog.KRef("", name)) 711 return false, err 712 } 713 return false, nil 714 }); err != nil { 715 logger.Error(err, "Update health of Node from Controller error, Skipping - no pods will be evicted", "node", klog.KObj(node)) 716 return 717 } 718 719 // Some nodes may be excluded from disruption checking 720 if !isNodeExcludedFromDisruptionChecks(node) { 721 zoneToNodeConditionsLock.Lock() 722 zoneToNodeConditions[nodetopology.GetZoneKey(node)] = append(zoneToNodeConditions[nodetopology.GetZoneKey(node)], currentReadyCondition) 723 zoneToNodeConditionsLock.Unlock() 724 } 725 726 if currentReadyCondition != nil { 727 pods, err := nc.getPodsAssignedToNode(node.Name) 728 if err != nil { 729 utilruntime.HandleError(fmt.Errorf("unable to list pods of node %v: %v", node.Name, err)) 730 if currentReadyCondition.Status != v1.ConditionTrue && observedReadyCondition.Status == v1.ConditionTrue { 731 // If error happened during node status transition (Ready -> NotReady) 732 // we need to mark node for retry to force MarkPodsNotReady execution 733 // in the next iteration. 734 nc.nodesToRetry.Store(node.Name, struct{}{}) 735 } 736 return 737 } 738 nc.processTaintBaseEviction(ctx, node, &observedReadyCondition) 739 740 _, needsRetry := nc.nodesToRetry.Load(node.Name) 741 switch { 742 case currentReadyCondition.Status != v1.ConditionTrue && observedReadyCondition.Status == v1.ConditionTrue: 743 // Report node event only once when status changed. 744 controllerutil.RecordNodeStatusChange(logger, nc.recorder, node, "NodeNotReady") 745 fallthrough 746 case needsRetry && observedReadyCondition.Status != v1.ConditionTrue: 747 if err = controllerutil.MarkPodsNotReady(ctx, nc.kubeClient, nc.recorder, pods, node.Name); err != nil { 748 utilruntime.HandleError(fmt.Errorf("unable to mark all pods NotReady on node %v: %v; queuing for retry", node.Name, err)) 749 nc.nodesToRetry.Store(node.Name, struct{}{}) 750 return 751 } 752 } 753 } 754 nc.nodesToRetry.Delete(node.Name) 755 } 756 757 // Marking the pods not ready on a node requires looping over them and 758 // updating each pod's status one at a time. This is performed serially, and 759 // can take a while if we're processing each node serially as well. So we 760 // process them with bounded concurrency instead, since most of the time is 761 // spent waiting on io. 762 workqueue.ParallelizeUntil(ctx, nc.nodeUpdateWorkerSize, len(nodes), updateNodeFunc) 763 764 nc.handleDisruption(ctx, zoneToNodeConditions, nodes) 765 766 return nil 767 } 768 769 func (nc *Controller) processTaintBaseEviction(ctx context.Context, node *v1.Node, observedReadyCondition *v1.NodeCondition) { 770 decisionTimestamp := nc.now() 771 // Check eviction timeout against decisionTimestamp 772 logger := klog.FromContext(ctx) 773 switch observedReadyCondition.Status { 774 case v1.ConditionFalse: 775 // We want to update the taint straight away if Node is already tainted with the UnreachableTaint 776 if taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) { 777 taintToAdd := *NotReadyTaintTemplate 778 if !controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{UnreachableTaintTemplate}, node) { 779 logger.Error(nil, "Failed to instantly swap UnreachableTaint to NotReadyTaint. 
Will try again in the next cycle") 780 } 781 } else if nc.markNodeForTainting(node, v1.ConditionFalse) { 782 logger.V(2).Info("Node is NotReady. Adding it to the Taint queue", "node", klog.KObj(node), "timeStamp", decisionTimestamp) 783 } 784 case v1.ConditionUnknown: 785 // We want to update the taint straight away if Node is already tainted with the UnreachableTaint 786 if taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) { 787 taintToAdd := *UnreachableTaintTemplate 788 if !controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{NotReadyTaintTemplate}, node) { 789 logger.Error(nil, "Failed to instantly swap NotReadyTaint to UnreachableTaint. Will try again in the next cycle") 790 } 791 } else if nc.markNodeForTainting(node, v1.ConditionUnknown) { 792 logger.V(2).Info("Node is unresponsive. Adding it to the Taint queue", "node", klog.KObj(node), "timeStamp", decisionTimestamp) 793 } 794 case v1.ConditionTrue: 795 removed, err := nc.markNodeAsReachable(ctx, node) 796 if err != nil { 797 logger.Error(nil, "Failed to remove taints from node. Will retry in next iteration", "node", klog.KObj(node)) 798 } 799 if removed { 800 logger.V(2).Info("Node is healthy again, removing all taints", "node", klog.KObj(node)) 801 } 802 } 803 } 804 805 // labelNodeDisruptionExclusion is a label on nodes that controls whether they are 806 // excluded from being considered for disruption checks by the node controller. 807 const labelNodeDisruptionExclusion = "node.kubernetes.io/exclude-disruption" 808 809 func isNodeExcludedFromDisruptionChecks(node *v1.Node) bool { 810 if _, ok := node.Labels[labelNodeDisruptionExclusion]; ok { 811 return true 812 } 813 return false 814 } 815 816 // tryUpdateNodeHealth checks a given node's conditions and tries to update it. Returns grace period to 817 // which given node is entitled, state of current and last observed Ready Condition, and an error if it occurred. 818 func (nc *Controller) tryUpdateNodeHealth(ctx context.Context, node *v1.Node) (time.Duration, v1.NodeCondition, *v1.NodeCondition, error) { 819 nodeHealth := nc.nodeHealthMap.getDeepCopy(node.Name) 820 defer func() { 821 nc.nodeHealthMap.set(node.Name, nodeHealth) 822 }() 823 824 var gracePeriod time.Duration 825 var observedReadyCondition v1.NodeCondition 826 _, currentReadyCondition := controllerutil.GetNodeCondition(&node.Status, v1.NodeReady) 827 if currentReadyCondition == nil { 828 // If ready condition is nil, then kubelet (or nodecontroller) never posted node status. 829 // A fake ready condition is created, where LastHeartbeatTime and LastTransitionTime is set 830 // to node.CreationTimestamp to avoid handle the corner case. 831 observedReadyCondition = v1.NodeCondition{ 832 Type: v1.NodeReady, 833 Status: v1.ConditionUnknown, 834 LastHeartbeatTime: node.CreationTimestamp, 835 LastTransitionTime: node.CreationTimestamp, 836 } 837 gracePeriod = nc.nodeStartupGracePeriod 838 if nodeHealth != nil { 839 nodeHealth.status = &node.Status 840 } else { 841 nodeHealth = &nodeHealthData{ 842 status: &node.Status, 843 probeTimestamp: node.CreationTimestamp, 844 readyTransitionTimestamp: node.CreationTimestamp, 845 } 846 } 847 } else { 848 // If ready condition is not nil, make a copy of it, since we may modify it in place later. 
		observedReadyCondition = *currentReadyCondition
		gracePeriod = nc.nodeMonitorGracePeriod
	}
	// There are the following cases to check:
	// - both saved and new status have no Ready Condition set - we leave everything as it is,
	// - saved status has no Ready Condition, but the current one does - Controller was restarted with Node data already present in etcd,
	// - saved status has some Ready Condition, but the current one does not - it's an error, but we fill it up because that's probably a good thing to do,
	// - both saved and current statuses have Ready Conditions and they have the same LastProbeTime - nothing happened on that Node, it may be
	//   unresponsive, so we leave it as it is,
	// - both saved and current statuses have Ready Conditions, they have different LastProbeTimes, but the same Ready Condition State -
	//   everything is in order, no transition occurred, we update only probeTimestamp,
	// - both saved and current statuses have Ready Conditions, different LastProbeTimes and different Ready Condition States -
	//   the Ready Condition changed its state since we last saw it, so we update both probeTimestamp and readyTransitionTimestamp.
	// TODO: things to consider:
	// - if 'LastProbeTime' has gone back in time it's probably an error, currently we ignore it,
	// - currently the only correct Ready State transition outside of the Node Controller is marking the Node ready by the Kubelet; we don't check
	//   if that's the case, but it does not seem necessary.
	var savedCondition *v1.NodeCondition
	var savedLease *coordv1.Lease
	if nodeHealth != nil {
		_, savedCondition = controllerutil.GetNodeCondition(nodeHealth.status, v1.NodeReady)
		savedLease = nodeHealth.lease
	}
	logger := klog.FromContext(ctx)
	if nodeHealth == nil {
		logger.Info("Missing timestamp for Node. Assuming now as a timestamp", "node", klog.KObj(node))
		nodeHealth = &nodeHealthData{
			status:                   &node.Status,
			probeTimestamp:           nc.now(),
			readyTransitionTimestamp: nc.now(),
		}
	} else if savedCondition == nil && currentReadyCondition != nil {
		logger.V(1).Info("Creating timestamp entry for newly observed Node", "node", klog.KObj(node))
		nodeHealth = &nodeHealthData{
			status:                   &node.Status,
			probeTimestamp:           nc.now(),
			readyTransitionTimestamp: nc.now(),
		}
	} else if savedCondition != nil && currentReadyCondition == nil {
		logger.Error(nil, "ReadyCondition was removed from Status of Node", "node", klog.KObj(node))
		// TODO: figure out what to do in this case. For now we do the same thing as above.
		nodeHealth = &nodeHealthData{
			status:                   &node.Status,
			probeTimestamp:           nc.now(),
			readyTransitionTimestamp: nc.now(),
		}
	} else if savedCondition != nil && currentReadyCondition != nil && savedCondition.LastHeartbeatTime != currentReadyCondition.LastHeartbeatTime {
		var transitionTime metav1.Time
		// If the ReadyCondition changed since the last time we checked, we update the transition timestamp to "now",
		// otherwise we leave it as it is.
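		// (Kubelet heartbeats bump LastHeartbeatTime on every status update, while
		// LastTransitionTime only moves when the Ready status actually flips, which is
		// why the comparison below detects a real transition.)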
899 if savedCondition.LastTransitionTime != currentReadyCondition.LastTransitionTime { 900 logger.V(3).Info("ReadyCondition for Node transitioned from savedCondition to currentReadyCondition", "node", klog.KObj(node), "savedCondition", savedCondition, "currentReadyCondition", currentReadyCondition) 901 transitionTime = nc.now() 902 } else { 903 transitionTime = nodeHealth.readyTransitionTimestamp 904 } 905 if loggerV := logger.V(5); loggerV.Enabled() { 906 loggerV.Info("Node ReadyCondition updated. Updating timestamp", "node", klog.KObj(node), "nodeHealthStatus", nodeHealth.status, "nodeStatus", node.Status) 907 } else { 908 logger.V(3).Info("Node ReadyCondition updated. Updating timestamp", "node", klog.KObj(node)) 909 } 910 nodeHealth = &nodeHealthData{ 911 status: &node.Status, 912 probeTimestamp: nc.now(), 913 readyTransitionTimestamp: transitionTime, 914 } 915 } 916 // Always update the probe time if node lease is renewed. 917 // Note: If kubelet never posted the node status, but continues renewing the 918 // heartbeat leases, the node controller will assume the node is healthy and 919 // take no action. 920 observedLease, _ := nc.leaseLister.Leases(v1.NamespaceNodeLease).Get(node.Name) 921 if observedLease != nil && (savedLease == nil || savedLease.Spec.RenewTime.Before(observedLease.Spec.RenewTime)) { 922 nodeHealth.lease = observedLease 923 nodeHealth.probeTimestamp = nc.now() 924 } 925 926 if nc.now().After(nodeHealth.probeTimestamp.Add(gracePeriod)) { 927 // NodeReady condition or lease was last set longer ago than gracePeriod, so 928 // update it to Unknown (regardless of its current value) in the master. 929 930 nodeConditionTypes := []v1.NodeConditionType{ 931 v1.NodeReady, 932 v1.NodeMemoryPressure, 933 v1.NodeDiskPressure, 934 v1.NodePIDPressure, 935 // We don't change 'NodeNetworkUnavailable' condition, as it's managed on a control plane level. 936 // v1.NodeNetworkUnavailable, 937 } 938 939 nowTimestamp := nc.now() 940 for _, nodeConditionType := range nodeConditionTypes { 941 _, currentCondition := controllerutil.GetNodeCondition(&node.Status, nodeConditionType) 942 if currentCondition == nil { 943 logger.V(2).Info("Condition of node was never updated by kubelet", "nodeConditionType", nodeConditionType, "node", klog.KObj(node)) 944 node.Status.Conditions = append(node.Status.Conditions, v1.NodeCondition{ 945 Type: nodeConditionType, 946 Status: v1.ConditionUnknown, 947 Reason: "NodeStatusNeverUpdated", 948 Message: "Kubelet never posted node status.", 949 LastHeartbeatTime: node.CreationTimestamp, 950 LastTransitionTime: nowTimestamp, 951 }) 952 } else { 953 logger.V(2).Info("Node hasn't been updated", 954 "node", klog.KObj(node), "duration", nc.now().Time.Sub(nodeHealth.probeTimestamp.Time), "nodeConditionType", nodeConditionType, "currentCondition", currentCondition) 955 if currentCondition.Status != v1.ConditionUnknown { 956 currentCondition.Status = v1.ConditionUnknown 957 currentCondition.Reason = "NodeStatusUnknown" 958 currentCondition.Message = "Kubelet stopped posting node status." 959 currentCondition.LastTransitionTime = nowTimestamp 960 } 961 } 962 } 963 // We need to update currentReadyCondition due to its value potentially changed. 
964 _, currentReadyCondition = controllerutil.GetNodeCondition(&node.Status, v1.NodeReady) 965 966 if !apiequality.Semantic.DeepEqual(currentReadyCondition, &observedReadyCondition) { 967 if _, err := nc.kubeClient.CoreV1().Nodes().UpdateStatus(ctx, node, metav1.UpdateOptions{}); err != nil { 968 logger.Error(err, "Error updating node", "node", klog.KObj(node)) 969 return gracePeriod, observedReadyCondition, currentReadyCondition, err 970 } 971 nodeHealth = &nodeHealthData{ 972 status: &node.Status, 973 probeTimestamp: nodeHealth.probeTimestamp, 974 readyTransitionTimestamp: nc.now(), 975 lease: observedLease, 976 } 977 return gracePeriod, observedReadyCondition, currentReadyCondition, nil 978 } 979 } 980 981 return gracePeriod, observedReadyCondition, currentReadyCondition, nil 982 } 983 984 func (nc *Controller) handleDisruption(ctx context.Context, zoneToNodeConditions map[string][]*v1.NodeCondition, nodes []*v1.Node) { 985 newZoneStates := map[string]ZoneState{} 986 allAreFullyDisrupted := true 987 logger := klog.FromContext(ctx) 988 for k, v := range zoneToNodeConditions { 989 zoneSize.WithLabelValues(k).Set(float64(len(v))) 990 unhealthy, newState := nc.computeZoneStateFunc(v) 991 zoneHealth.WithLabelValues(k).Set(float64(100*(len(v)-unhealthy)) / float64(len(v))) 992 unhealthyNodes.WithLabelValues(k).Set(float64(unhealthy)) 993 if newState != stateFullDisruption { 994 allAreFullyDisrupted = false 995 } 996 newZoneStates[k] = newState 997 if _, had := nc.zoneStates[k]; !had { 998 logger.Error(nil, "Setting initial state for unseen zone", "zone", k) 999 nc.zoneStates[k] = stateInitial 1000 } 1001 } 1002 1003 allWasFullyDisrupted := true 1004 for k, v := range nc.zoneStates { 1005 if _, have := zoneToNodeConditions[k]; !have { 1006 zoneSize.WithLabelValues(k).Set(0) 1007 zoneHealth.WithLabelValues(k).Set(100) 1008 unhealthyNodes.WithLabelValues(k).Set(0) 1009 delete(nc.zoneStates, k) 1010 continue 1011 } 1012 if v != stateFullDisruption { 1013 allWasFullyDisrupted = false 1014 break 1015 } 1016 } 1017 1018 // At least one node was responding in previous pass or in the current pass. Semantics is as follows: 1019 // - if the new state is "partialDisruption" we call a user defined function that returns a new limiter to use, 1020 // - if the new state is "normal" we resume normal operation (go back to default limiter settings), 1021 // - if new state is "fullDisruption" we restore normal eviction rate, 1022 // - unless all zones in the cluster are in "fullDisruption" - in that case we stop all evictions. 1023 if !allAreFullyDisrupted || !allWasFullyDisrupted { 1024 // We're switching to full disruption mode 1025 if allAreFullyDisrupted { 1026 logger.Info("Controller detected that all Nodes are not-Ready. Entering master disruption mode") 1027 for i := range nodes { 1028 _, err := nc.markNodeAsReachable(ctx, nodes[i]) 1029 if err != nil { 1030 logger.Error(nil, "Failed to remove taints from Node", "node", klog.KObj(nodes[i])) 1031 } 1032 } 1033 // We stop all evictions. 1034 for k := range nc.zoneStates { 1035 nc.zoneNoExecuteTainter[k].SwapLimiter(0) 1036 } 1037 for k := range nc.zoneStates { 1038 nc.zoneStates[k] = stateFullDisruption 1039 } 1040 // All rate limiters are updated, so we can return early here. 1041 return 1042 } 1043 // We're exiting full disruption mode 1044 if allWasFullyDisrupted { 1045 logger.Info("Controller detected that some Nodes are Ready. Exiting master disruption mode") 1046 // When exiting disruption mode update probe timestamps on all Nodes. 
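			// (Without this reset every node would immediately look stale: its last
			// probeTimestamp predates the disruption, so the grace period would already
			// have expired for all of them.)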
1047 now := nc.now() 1048 for i := range nodes { 1049 v := nc.nodeHealthMap.getDeepCopy(nodes[i].Name) 1050 v.probeTimestamp = now 1051 v.readyTransitionTimestamp = now 1052 nc.nodeHealthMap.set(nodes[i].Name, v) 1053 } 1054 // We reset all rate limiters to settings appropriate for the given state. 1055 for k := range nc.zoneStates { 1056 nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newZoneStates[k]) 1057 nc.zoneStates[k] = newZoneStates[k] 1058 } 1059 return 1060 } 1061 // We know that there's at least one not-fully disrupted so, 1062 // we can use default behavior for rate limiters 1063 for k, v := range nc.zoneStates { 1064 newState := newZoneStates[k] 1065 if v == newState { 1066 continue 1067 } 1068 logger.Info("Controller detected that zone is now in new state", "zone", k, "newState", newState) 1069 nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newState) 1070 nc.zoneStates[k] = newState 1071 } 1072 } 1073 } 1074 1075 func (nc *Controller) podUpdated(oldPod, newPod *v1.Pod) { 1076 if newPod == nil { 1077 return 1078 } 1079 if len(newPod.Spec.NodeName) != 0 && (oldPod == nil || newPod.Spec.NodeName != oldPod.Spec.NodeName) { 1080 podItem := podUpdateItem{newPod.Namespace, newPod.Name} 1081 nc.podUpdateQueue.Add(podItem) 1082 } 1083 } 1084 1085 func (nc *Controller) doPodProcessingWorker(ctx context.Context) { 1086 for { 1087 obj, shutdown := nc.podUpdateQueue.Get() 1088 // "podUpdateQueue" will be shutdown when "stopCh" closed; 1089 // we do not need to re-check "stopCh" again. 1090 if shutdown { 1091 return 1092 } 1093 1094 podItem := obj.(podUpdateItem) 1095 nc.processPod(ctx, podItem) 1096 } 1097 } 1098 1099 // processPod is processing events of assigning pods to nodes. In particular: 1100 // 1. for NodeReady=true node, taint eviction for this pod will be cancelled 1101 // 2. for NodeReady=false or unknown node, taint eviction of pod will happen and pod will be marked as not ready 1102 // 3. if node doesn't exist in cache, it will be skipped. 1103 func (nc *Controller) processPod(ctx context.Context, podItem podUpdateItem) { 1104 defer nc.podUpdateQueue.Done(podItem) 1105 pod, err := nc.podLister.Pods(podItem.namespace).Get(podItem.name) 1106 logger := klog.FromContext(ctx) 1107 if err != nil { 1108 if apierrors.IsNotFound(err) { 1109 // If the pod was deleted, there is no need to requeue. 1110 return 1111 } 1112 logger.Info("Failed to read pod", "pod", klog.KRef(podItem.namespace, podItem.name), "err", err) 1113 nc.podUpdateQueue.AddRateLimited(podItem) 1114 return 1115 } 1116 1117 nodeName := pod.Spec.NodeName 1118 1119 nodeHealth := nc.nodeHealthMap.getDeepCopy(nodeName) 1120 if nodeHealth == nil { 1121 // Node data is not gathered yet or node has been removed in the meantime. 1122 return 1123 } 1124 1125 _, err = nc.nodeLister.Get(nodeName) 1126 if err != nil { 1127 logger.Info("Failed to read node", "node", klog.KRef("", nodeName), "err", err) 1128 nc.podUpdateQueue.AddRateLimited(podItem) 1129 return 1130 } 1131 1132 _, currentReadyCondition := controllerutil.GetNodeCondition(nodeHealth.status, v1.NodeReady) 1133 if currentReadyCondition == nil { 1134 // Lack of NodeReady condition may only happen after node addition (or if it will be maliciously deleted). 1135 // In both cases, the pod will be handled correctly (evicted if needed) during processing 1136 // of the next node update event. 
		return
	}

	pods := []*v1.Pod{pod}
	if currentReadyCondition.Status != v1.ConditionTrue {
		if err := controllerutil.MarkPodsNotReady(ctx, nc.kubeClient, nc.recorder, pods, nodeName); err != nil {
			logger.Info("Unable to mark pod NotReady on node", "pod", klog.KRef(podItem.namespace, podItem.name), "node", klog.KRef("", nodeName), "err", err)
			nc.podUpdateQueue.AddRateLimited(podItem)
		}
	}
}

func (nc *Controller) setLimiterInZone(zone string, zoneSize int, state ZoneState) {
	switch state {
	case stateNormal:
		nc.zoneNoExecuteTainter[zone].SwapLimiter(nc.evictionLimiterQPS)
	case statePartialDisruption:
		nc.zoneNoExecuteTainter[zone].SwapLimiter(
			nc.enterPartialDisruptionFunc(zoneSize))
	case stateFullDisruption:
		nc.zoneNoExecuteTainter[zone].SwapLimiter(
			nc.enterFullDisruptionFunc(zoneSize))
	}
}

// classifyNodes classifies allNodes into three categories:
// 1. added: the nodes that are in 'allNodes', but not in 'knownNodeSet'
// 2. deleted: the nodes that are in 'knownNodeSet', but not in 'allNodes'
// 3. newZoneRepresentatives: the nodes that are in both 'knownNodeSet' and 'allNodes', but whose zone has no zone state yet
func (nc *Controller) classifyNodes(allNodes []*v1.Node) (added, deleted, newZoneRepresentatives []*v1.Node) {
	for i := range allNodes {
		if _, has := nc.knownNodeSet[allNodes[i].Name]; !has {
			added = append(added, allNodes[i])
		} else {
			// Currently, we only consider a new zone as updated.
			zone := nodetopology.GetZoneKey(allNodes[i])
			if _, found := nc.zoneStates[zone]; !found {
				newZoneRepresentatives = append(newZoneRepresentatives, allNodes[i])
			}
		}
	}

	// If there's a difference between the lengths of known Nodes and observed nodes
	// we must have removed some Node.
	if len(nc.knownNodeSet)+len(added) != len(allNodes) {
		knowSetCopy := map[string]*v1.Node{}
		for k, v := range nc.knownNodeSet {
			knowSetCopy[k] = v
		}
		for i := range allNodes {
			delete(knowSetCopy, allNodes[i].Name)
		}
		for i := range knowSetCopy {
			deleted = append(deleted, knowSetCopy[i])
		}
	}
	return
}

// HealthyQPSFunc returns the default value for the cluster eviction rate - we take
// nodeNum for consistency with ReducedQPSFunc.
func (nc *Controller) HealthyQPSFunc(nodeNum int) float32 {
	return nc.evictionLimiterQPS
}

// ReducedQPSFunc returns the QPS to use when the cluster is large, to make
// evictions slower; if the cluster is small, it stops evictions altogether.
func (nc *Controller) ReducedQPSFunc(nodeNum int) float32 {
	if int32(nodeNum) > nc.largeClusterThreshold {
		return nc.secondaryEvictionLimiterQPS
	}
	return 0
}

// addPodEvictorForNewZone checks if a new zone appeared, and if so adds a new evictor.
func (nc *Controller) addPodEvictorForNewZone(logger klog.Logger, node *v1.Node) {
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	zone := nodetopology.GetZoneKey(node)
	if _, found := nc.zoneStates[zone]; !found {
		nc.zoneStates[zone] = stateInitial
		nc.zoneNoExecuteTainter[zone] =
			scheduler.NewRateLimitedTimedQueue(
				flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, scheduler.EvictionRateLimiterBurst))
		// Init the metric for the new zone.
		logger.Info("Initializing eviction metric for zone", "zone", zone)
		evictionsTotal.WithLabelValues(zone).Add(0)
	}
}

func (nc *Controller) markNodeForTainting(node *v1.Node, status v1.ConditionStatus) bool {
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	if status == v1.ConditionFalse {
		if !taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) {
			nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Remove(node.Name)
		}
	}

	if status == v1.ConditionUnknown {
		if !taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) {
			nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Remove(node.Name)
		}
	}

	return nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Add(node.Name, string(node.UID))
}

func (nc *Controller) markNodeAsReachable(ctx context.Context, node *v1.Node) (bool, error) {
	err := controller.RemoveTaintOffNode(ctx, nc.kubeClient, node.Name, node, UnreachableTaintTemplate)
	logger := klog.FromContext(ctx)
	if err != nil {
		logger.Error(err, "Failed to remove taint from node", "node", klog.KObj(node))
		return false, err
	}
	err = controller.RemoveTaintOffNode(ctx, nc.kubeClient, node.Name, node, NotReadyTaintTemplate)
	if err != nil {
		logger.Error(err, "Failed to remove taint from node", "node", klog.KObj(node))
		return false, err
	}
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()

	return nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Remove(node.Name), nil
}

// ComputeZoneState takes the NodeReady conditions of all Nodes in a given zone and
// returns the number of NotReady Nodes together with the zone's state.
// The zone is considered:
// - fullyDisrupted if there are no Ready Nodes,
// - partiallyDisrupted if more than two Nodes are not Ready and at least nc.unhealthyZoneThreshold percent of Nodes are not Ready,
// - normal otherwise
func (nc *Controller) ComputeZoneState(nodeReadyConditions []*v1.NodeCondition) (int, ZoneState) {
	readyNodes := 0
	notReadyNodes := 0
	for i := range nodeReadyConditions {
		if nodeReadyConditions[i] != nil && nodeReadyConditions[i].Status == v1.ConditionTrue {
			readyNodes++
		} else {
			notReadyNodes++
		}
	}
	switch {
	case readyNodes == 0 && notReadyNodes > 0:
		return notReadyNodes, stateFullDisruption
	case notReadyNodes > 2 && float32(notReadyNodes)/float32(notReadyNodes+readyNodes) >= nc.unhealthyZoneThreshold:
		return notReadyNodes, statePartialDisruption
	default:
		return notReadyNodes, stateNormal
	}
}

// reconcileNodeLabels reconciles node labels.
func (nc *Controller) reconcileNodeLabels(ctx context.Context, nodeName string) error {
	node, err := nc.nodeLister.Get(nodeName)
	if err != nil {
		// If the node is not found, just ignore it.
		if apierrors.IsNotFound(err) {
			return nil
		}
		return err
	}

	if node.Labels == nil {
		// Nothing to reconcile.
		return nil
	}

	labelsToUpdate := map[string]string{}
	for _, r := range labelReconcileInfo {
		primaryValue, primaryExists := node.Labels[r.primaryKey]
		secondaryValue, secondaryExists := node.Labels[r.secondaryKey]

		if !primaryExists {
			// The primary label key does not exist. This should not happen
			// within our supported version skew range, when no external
			// components/factors are modifying the node object. Ignore this case.
			continue
		}
		if secondaryExists && primaryValue != secondaryValue {
			// Secondary label exists, but is not consistent with the primary
			// label. Need to reconcile.
			labelsToUpdate[r.secondaryKey] = primaryValue

		} else if !secondaryExists && r.ensureSecondaryExists {
			// Apply the secondary label based on the primary label.
			labelsToUpdate[r.secondaryKey] = primaryValue
		}
	}

	if len(labelsToUpdate) == 0 {
		return nil
	}
	if !controllerutil.AddOrUpdateLabelsOnNode(ctx, nc.kubeClient, labelsToUpdate, node) {
		return fmt.Errorf("failed to update labels for node %+v", node)
	}
	return nil
}
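// Illustrative sketch (not part of the original file): how the zone-state logic
// above behaves for a 10-node zone, assuming the usual kube-controller-manager
// defaults of unhealthyZoneThreshold=0.55, largeClusterThreshold=50,
// evictionLimiterQPS=0.1 and secondaryEvictionLimiterQPS=0.01:
//
//	nc := &Controller{
//	    unhealthyZoneThreshold:      0.55,
//	    largeClusterThreshold:       50,
//	    evictionLimiterQPS:          0.1,
//	    secondaryEvictionLimiterQPS: 0.01,
//	}
//	conds := make([]*v1.NodeCondition, 0, 10)
//	for i := 0; i < 10; i++ {
//	    status := v1.ConditionTrue
//	    if i < 6 { // 6 of 10 nodes NotReady: more than 2 nodes and 60% >= 55%
//	        status = v1.ConditionFalse
//	    }
//	    conds = append(conds, &v1.NodeCondition{Type: v1.NodeReady, Status: status})
//	}
//	notReady, state := nc.ComputeZoneState(conds) // notReady == 6, state == statePartialDisruption
//	_ = notReady
//
// In PartialDisruption, setLimiterInZone consults ReducedQPSFunc: with only 10
// nodes (<= largeClusterThreshold) it returns 0, i.e. NoExecute tainting in that
// zone stops entirely; a zone larger than 50 nodes would instead be slowed to
// secondaryEvictionLimiterQPS (0.01 QPS).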