// k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/controller/nodelifecycle/node_lifecycle_controller.go

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// The Controller sets taints on nodes. Tainted nodes should not be
// used for new workloads, and some effort should be given to getting
// existing workloads off of tainted nodes.

package nodelifecycle

import (
	"context"
	"fmt"
	"sync"
	"time"

	"k8s.io/klog/v2"

	coordv1 "k8s.io/api/coordination/v1"
	v1 "k8s.io/api/core/v1"
	apiequality "k8s.io/apimachinery/pkg/api/equality"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	utilfeature "k8s.io/apiserver/pkg/util/feature"
	appsv1informers "k8s.io/client-go/informers/apps/v1"
	coordinformers "k8s.io/client-go/informers/coordination/v1"
	coreinformers "k8s.io/client-go/informers/core/v1"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/scheme"
	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
	appsv1listers "k8s.io/client-go/listers/apps/v1"
	coordlisters "k8s.io/client-go/listers/coordination/v1"
	corelisters "k8s.io/client-go/listers/core/v1"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/record"
	"k8s.io/client-go/util/flowcontrol"
	"k8s.io/client-go/util/workqueue"
	nodetopology "k8s.io/component-helpers/node/topology"
	kubeletapis "k8s.io/kubelet/pkg/apis"
	"k8s.io/kubernetes/pkg/controller"
	"k8s.io/kubernetes/pkg/controller/nodelifecycle/scheduler"
	"k8s.io/kubernetes/pkg/controller/tainteviction"
	controllerutil "k8s.io/kubernetes/pkg/controller/util/node"
	"k8s.io/kubernetes/pkg/features"
	taintutils "k8s.io/kubernetes/pkg/util/taints"
)

func init() {
	// Register prometheus metrics.
	Register()
}

var (
	// UnreachableTaintTemplate is the taint for when a node becomes unreachable.
	UnreachableTaintTemplate = &v1.Taint{
		Key:    v1.TaintNodeUnreachable,
		Effect: v1.TaintEffectNoExecute,
	}

	// NotReadyTaintTemplate is the taint for when a node is not ready for
	// executing pods.
	NotReadyTaintTemplate = &v1.Taint{
		Key:    v1.TaintNodeNotReady,
		Effect: v1.TaintEffectNoExecute,
	}

	// map {NodeConditionType: {ConditionStatus: TaintKey}}
	// represents which NodeConditionType under which ConditionStatus should be
	// tainted with which TaintKey.
	// For certain NodeConditionTypes, there are multiple {ConditionStatus, TaintKey} pairs.
	nodeConditionToTaintKeyStatusMap = map[v1.NodeConditionType]map[v1.ConditionStatus]string{
		v1.NodeReady: {
			v1.ConditionFalse:   v1.TaintNodeNotReady,
			v1.ConditionUnknown: v1.TaintNodeUnreachable,
		},
		v1.NodeMemoryPressure: {
			v1.ConditionTrue: v1.TaintNodeMemoryPressure,
		},
		v1.NodeDiskPressure: {
			v1.ConditionTrue: v1.TaintNodeDiskPressure,
		},
		v1.NodeNetworkUnavailable: {
			v1.ConditionTrue: v1.TaintNodeNetworkUnavailable,
		},
		v1.NodePIDPressure: {
			v1.ConditionTrue: v1.TaintNodePIDPressure,
		},
	}

	taintKeyToNodeConditionMap = map[string]v1.NodeConditionType{
		v1.TaintNodeNotReady:           v1.NodeReady,
		v1.TaintNodeUnreachable:        v1.NodeReady,
		v1.TaintNodeNetworkUnavailable: v1.NodeNetworkUnavailable,
		v1.TaintNodeMemoryPressure:     v1.NodeMemoryPressure,
		v1.TaintNodeDiskPressure:       v1.NodeDiskPressure,
		v1.TaintNodePIDPressure:        v1.NodePIDPressure,
	}
)

// ZoneState is the state of a given zone.
type ZoneState string

const (
	stateInitial           = ZoneState("Initial")
	stateNormal            = ZoneState("Normal")
	stateFullDisruption    = ZoneState("FullDisruption")
	statePartialDisruption = ZoneState("PartialDisruption")
)

const (
	// The amount of time the nodecontroller should sleep between retrying node health updates.
	retrySleepTime   = 20 * time.Millisecond
	nodeNameKeyIndex = "spec.nodeName"
	// podUpdateWorkerSize assumes that in most cases pods will be handled by the monitorNodeHealth pass.
	// Pod update workers will only handle lagging cache pods. 4 workers should be enough.
	podUpdateWorkerSize = 4
	// nodeUpdateWorkerSize defines the size of workers for node update and/or pod update.
	nodeUpdateWorkerSize = 8

	// taintEvictionController is defined here in order to prevent imports of
	// k8s.io/kubernetes/cmd/kube-controller-manager/names which would result in validation errors.
	// This constant will be removed upon graduation of the SeparateTaintEvictionController feature.
	taintEvictionController = "taint-eviction-controller"
)
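// conditionTaintsExample is an editor's illustration, not part of the
// upstream controller: it shows how nodeConditionToTaintKeyStatusMap is
// consumed. Each (condition type, condition status) pair present in the map
// yields exactly one NoSchedule taint; for example, a NodeMemoryPressure
// condition with status True maps to the v1.TaintNodeMemoryPressure key.
func conditionTaintsExample(conditions []v1.NodeCondition) []v1.Taint {
	var taints []v1.Taint
	for _, condition := range conditions {
		if taintMap, found := nodeConditionToTaintKeyStatusMap[condition.Type]; found {
			if taintKey, found := taintMap[condition.Status]; found {
				taints = append(taints, v1.Taint{Key: taintKey, Effect: v1.TaintEffectNoSchedule})
			}
		}
	}
	return taints
}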
// labelReconcileInfo lists Node labels to reconcile, and how to reconcile them.
// primaryKey and secondaryKey are keys of labels to reconcile.
//   - If both keys exist, but their values don't match, use the value from the
//     primaryKey as the source of truth to reconcile.
//   - If ensureSecondaryExists is true, and the secondaryKey does not
//     exist, secondaryKey will be added with the value of the primaryKey.
var labelReconcileInfo = []struct {
	primaryKey            string
	secondaryKey          string
	ensureSecondaryExists bool
}{
	{
		// Reconcile the beta and the stable OS label using the stable label as the source of truth.
		// TODO(#89477): no earlier than 1.22: drop the beta labels if they differ from the GA labels.
		primaryKey:            v1.LabelOSStable,
		secondaryKey:          kubeletapis.LabelOS,
		ensureSecondaryExists: true,
	},
	{
		// Reconcile the beta and the stable arch label using the stable label as the source of truth.
		// TODO(#89477): no earlier than 1.22: drop the beta labels if they differ from the GA labels.
		primaryKey:            v1.LabelArchStable,
		secondaryKey:          kubeletapis.LabelArch,
		ensureSecondaryExists: true,
	},
}

type nodeHealthData struct {
	probeTimestamp           metav1.Time
	readyTransitionTimestamp metav1.Time
	status                   *v1.NodeStatus
	lease                    *coordv1.Lease
}

func (n *nodeHealthData) deepCopy() *nodeHealthData {
	if n == nil {
		return nil
	}
	return &nodeHealthData{
		probeTimestamp:           n.probeTimestamp,
		readyTransitionTimestamp: n.readyTransitionTimestamp,
		status:                   n.status.DeepCopy(),
		lease:                    n.lease.DeepCopy(),
	}
}

type nodeHealthMap struct {
	lock        sync.RWMutex
	nodeHealths map[string]*nodeHealthData
}

func newNodeHealthMap() *nodeHealthMap {
	return &nodeHealthMap{
		nodeHealths: make(map[string]*nodeHealthData),
	}
}

// getDeepCopy returns a copy of the node health data.
// It prevents the data from being changed after retrieving it from the map.
func (n *nodeHealthMap) getDeepCopy(name string) *nodeHealthData {
	n.lock.RLock()
	defer n.lock.RUnlock()
	return n.nodeHealths[name].deepCopy()
}

func (n *nodeHealthMap) set(name string, data *nodeHealthData) {
	n.lock.Lock()
	defer n.lock.Unlock()
	n.nodeHealths[name] = data
}
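// nodeHealthMapExample is an editor's sketch, not upstream code, showing the
// access pattern nodeHealthMap is built for: callers mutate only a private
// deep copy and publish it back via set, so concurrent readers never observe
// a partially updated nodeHealthData.
func nodeHealthMapExample(m *nodeHealthMap, name string) {
	health := m.getDeepCopy(name) // private copy; safe to mutate
	if health == nil {
		health = &nodeHealthData{}
	}
	health.probeTimestamp = metav1.Now()
	m.set(name, health) // publish the updated copy under the write lock
}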
type podUpdateItem struct {
	namespace string
	name      string
}

// Controller is the controller that manages a node's life cycle.
type Controller struct {
	taintManager *tainteviction.Controller

	podLister         corelisters.PodLister
	podInformerSynced cache.InformerSynced
	kubeClient        clientset.Interface

	// This timestamp is to be used instead of LastProbeTime stored in Condition. We do this
	// to avoid the problem with time skew across the cluster.
	now func() metav1.Time

	enterPartialDisruptionFunc func(nodeNum int) float32
	enterFullDisruptionFunc    func(nodeNum int) float32
	computeZoneStateFunc       func(nodeConditions []*v1.NodeCondition) (int, ZoneState)

	knownNodeSet map[string]*v1.Node
	// per-Node map storing the last observed health together with a local time when it was observed.
	nodeHealthMap *nodeHealthMap

	// evictorLock protects zonePodEvictor and zoneNoExecuteTainter.
	evictorLock sync.Mutex
	// workers that are responsible for tainting nodes.
	zoneNoExecuteTainter map[string]*scheduler.RateLimitedTimedQueue

	nodesToRetry sync.Map

	zoneStates map[string]ZoneState

	daemonSetStore          appsv1listers.DaemonSetLister
	daemonSetInformerSynced cache.InformerSynced

	leaseLister         coordlisters.LeaseLister
	leaseInformerSynced cache.InformerSynced
	nodeLister          corelisters.NodeLister
	nodeInformerSynced  cache.InformerSynced

	getPodsAssignedToNode func(nodeName string) ([]*v1.Pod, error)

	broadcaster record.EventBroadcaster
	recorder    record.EventRecorder

	// Value controlling the Controller monitoring period, i.e. how often the
	// Controller checks the node health signal posted by the kubelet. This value
	// should be lower than nodeMonitorGracePeriod.
	// TODO: Change node health monitor to watch based.
	nodeMonitorPeriod time.Duration

	// When a node is just created, e.g. at cluster bootstrap or node creation,
	// we give it a longer grace period.
	nodeStartupGracePeriod time.Duration

	// The Controller does not proactively sync node health; it monitors the node
	// health signals updated by the kubelet. There are 2 kinds of node health
	// signals: NodeStatus and NodeLease. If the Controller does not receive an
	// update within this amount of time, it starts posting
	// "NodeReady==ConditionUnknown". The amount of time before the Controller
	// starts evicting pods is controlled via the flag 'pod-eviction-timeout'.
	// Note: be cautious when changing the constant; it must work with
	// nodeStatusUpdateFrequency in the kubelet and renewInterval in the NodeLease
	// controller. The node health signal update frequency is the minimum of the
	// two.
	// There are several constraints:
	// 1. nodeMonitorGracePeriod must be N times more than the node health signal
	//    update frequency, where N is the number of retries allowed for the kubelet
	//    to post node status/lease. It is pointless to make nodeMonitorGracePeriod
	//    less than the node health signal update frequency, since there will
	//    only be fresh values from the kubelet at an interval of the node health
	//    signal update frequency.
	// 2. nodeMonitorGracePeriod can't be too large for user experience - a larger
	//    value means it takes longer for the user to see up-to-date node health.
	nodeMonitorGracePeriod time.Duration

	// Number of workers the Controller uses to process node monitor health updates.
	// Defaults to scheduler.UpdateWorkerSize.
	nodeUpdateWorkerSize int

	evictionLimiterQPS          float32
	secondaryEvictionLimiterQPS float32
	largeClusterThreshold       int32
	unhealthyZoneThreshold      float32

	nodeUpdateQueue workqueue.TypedInterface[string]
	podUpdateQueue  workqueue.TypedRateLimitingInterface[podUpdateItem]
}
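// minimumGracePeriodExample is an editor's illustration, not upstream code,
// of constraint 1 above: the grace period must give the kubelet N chances to
// post a fresh health signal. With the kubelet's default 10s update
// frequency and N=4 retries this yields 40s, which matches the
// kube-controller-manager default for --node-monitor-grace-period.
func minimumGracePeriodExample(signalUpdateFrequency time.Duration, retries int) time.Duration {
	return time.Duration(retries) * signalUpdateFrequency
}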
// NewNodeLifecycleController returns a new node lifecycle controller.
func NewNodeLifecycleController(
	ctx context.Context,
	leaseInformer coordinformers.LeaseInformer,
	podInformer coreinformers.PodInformer,
	nodeInformer coreinformers.NodeInformer,
	daemonSetInformer appsv1informers.DaemonSetInformer,
	kubeClient clientset.Interface,
	nodeMonitorPeriod time.Duration,
	nodeStartupGracePeriod time.Duration,
	nodeMonitorGracePeriod time.Duration,
	evictionLimiterQPS float32,
	secondaryEvictionLimiterQPS float32,
	largeClusterThreshold int32,
	unhealthyZoneThreshold float32,
) (*Controller, error) {
	logger := klog.FromContext(ctx)
	if kubeClient == nil {
		logger.Error(nil, "kubeClient is nil when starting nodelifecycle Controller")
		klog.FlushAndExit(klog.ExitFlushTimeout, 1)
	}

	eventBroadcaster := record.NewBroadcaster(record.WithContext(ctx))
	recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "node-controller"})

	nc := &Controller{
		kubeClient:                  kubeClient,
		now:                         metav1.Now,
		knownNodeSet:                make(map[string]*v1.Node),
		nodeHealthMap:               newNodeHealthMap(),
		broadcaster:                 eventBroadcaster,
		recorder:                    recorder,
		nodeMonitorPeriod:           nodeMonitorPeriod,
		nodeStartupGracePeriod:      nodeStartupGracePeriod,
		nodeMonitorGracePeriod:      nodeMonitorGracePeriod,
		nodeUpdateWorkerSize:        nodeUpdateWorkerSize,
		zoneNoExecuteTainter:        make(map[string]*scheduler.RateLimitedTimedQueue),
		nodesToRetry:                sync.Map{},
		zoneStates:                  make(map[string]ZoneState),
		evictionLimiterQPS:          evictionLimiterQPS,
		secondaryEvictionLimiterQPS: secondaryEvictionLimiterQPS,
		largeClusterThreshold:       largeClusterThreshold,
		unhealthyZoneThreshold:      unhealthyZoneThreshold,
		nodeUpdateQueue:             workqueue.NewTypedWithConfig(workqueue.TypedQueueConfig[string]{Name: "node_lifecycle_controller"}),
		podUpdateQueue: workqueue.NewTypedRateLimitingQueueWithConfig(
			workqueue.DefaultTypedControllerRateLimiter[podUpdateItem](),
			workqueue.TypedRateLimitingQueueConfig[podUpdateItem]{
				Name: "node_lifecycle_controller_pods",
			},
		),
	}

	nc.enterPartialDisruptionFunc = nc.ReducedQPSFunc
	nc.enterFullDisruptionFunc = nc.HealthyQPSFunc
	nc.computeZoneStateFunc = nc.ComputeZoneState

	podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			pod := obj.(*v1.Pod)
			nc.podUpdated(nil, pod)
		},
		UpdateFunc: func(prev, obj interface{}) {
			prevPod := prev.(*v1.Pod)
			newPod := obj.(*v1.Pod)
			nc.podUpdated(prevPod, newPod)
		},
		DeleteFunc: func(obj interface{}) {
			pod, isPod := obj.(*v1.Pod)
			// We can get DeletedFinalStateUnknown instead of *v1.Pod here, and we need to handle that correctly.
			if !isPod {
				deletedState, ok := obj.(cache.DeletedFinalStateUnknown)
				if !ok {
					logger.Error(nil, "Received unexpected object", "object", obj)
					return
				}
				pod, ok = deletedState.Obj.(*v1.Pod)
				if !ok {
					logger.Error(nil, "DeletedFinalStateUnknown contained non-Pod object", "object", deletedState.Obj)
					return
				}
			}
			nc.podUpdated(pod, nil)
		},
	})
	nc.podInformerSynced = podInformer.Informer().HasSynced
	podInformer.Informer().AddIndexers(cache.Indexers{
		nodeNameKeyIndex: func(obj interface{}) ([]string, error) {
			pod, ok := obj.(*v1.Pod)
			if !ok {
				return []string{}, nil
			}
			if len(pod.Spec.NodeName) == 0 {
				return []string{}, nil
			}
			return []string{pod.Spec.NodeName}, nil
		},
	})

	podIndexer := podInformer.Informer().GetIndexer()
	nc.getPodsAssignedToNode = func(nodeName string) ([]*v1.Pod, error) {
		objs, err := podIndexer.ByIndex(nodeNameKeyIndex, nodeName)
		if err != nil {
			return nil, err
		}
		pods := make([]*v1.Pod, 0, len(objs))
		for _, obj := range objs {
			pod, ok := obj.(*v1.Pod)
			if !ok {
				continue
			}
			pods = append(pods, pod)
		}
		return pods, nil
	}
	nc.podLister = podInformer.Lister()
	nc.nodeLister = nodeInformer.Lister()

	if !utilfeature.DefaultFeatureGate.Enabled(features.SeparateTaintEvictionController) {
		logger.Info("Running TaintEvictionController as part of NodeLifecycleController")
		tm, err := tainteviction.New(ctx, kubeClient, podInformer, nodeInformer, taintEvictionController)
		if err != nil {
			return nil, err
		}
		nc.taintManager = tm
	}

	logger.Info("Controller will reconcile labels")
	nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: controllerutil.CreateAddNodeHandler(func(node *v1.Node) error {
			nc.nodeUpdateQueue.Add(node.Name)
			return nil
		}),
		UpdateFunc: controllerutil.CreateUpdateNodeHandler(func(_, newNode *v1.Node) error {
			nc.nodeUpdateQueue.Add(newNode.Name)
			return nil
		}),
		DeleteFunc: controllerutil.CreateDeleteNodeHandler(logger, func(node *v1.Node) error {
			nc.nodesToRetry.Delete(node.Name)
			return nil
		}),
	})

	nc.leaseLister = leaseInformer.Lister()
	nc.leaseInformerSynced = leaseInformer.Informer().HasSynced

	nc.nodeInformerSynced = nodeInformer.Informer().HasSynced

	nc.daemonSetStore = daemonSetInformer.Lister()
	nc.daemonSetInformerSynced = daemonSetInformer.Informer().HasSynced

	return nc, nil
}
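// Wiring sketch (editor's illustration, not upstream code). A caller such as
// kube-controller-manager builds the controller from a shared informer
// factory; the parameter values below are the kube-controller-manager
// defaults and are shown only as an example:
//
//	factory := informers.NewSharedInformerFactory(client, 0) // k8s.io/client-go/informers
//	nc, err := NewNodeLifecycleController(ctx,
//		factory.Coordination().V1().Leases(),
//		factory.Core().V1().Pods(),
//		factory.Core().V1().Nodes(),
//		factory.Apps().V1().DaemonSets(),
//		client,
//		5*time.Second,  // nodeMonitorPeriod
//		time.Minute,    // nodeStartupGracePeriod
//		40*time.Second, // nodeMonitorGracePeriod
//		0.1,            // evictionLimiterQPS
//		0.01,           // secondaryEvictionLimiterQPS
//		50,             // largeClusterThreshold
//		0.55,           // unhealthyZoneThreshold
//	)
//	if err != nil {
//		// handle the error
//	}
//	factory.Start(ctx.Done())
//	go nc.Run(ctx)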
// Run starts an asynchronous loop that monitors the status of cluster nodes.
func (nc *Controller) Run(ctx context.Context) {
	defer utilruntime.HandleCrash()

	// Start the events processing pipeline.
	nc.broadcaster.StartStructuredLogging(3)
	logger := klog.FromContext(ctx)
	logger.Info("Sending events to api server")
	nc.broadcaster.StartRecordingToSink(
		&v1core.EventSinkImpl{
			Interface: v1core.New(nc.kubeClient.CoreV1().RESTClient()).Events(""),
		})
	defer nc.broadcaster.Shutdown()

	// Close the update queues to clean up their goroutines.
	defer nc.nodeUpdateQueue.ShutDown()
	defer nc.podUpdateQueue.ShutDown()

	logger.Info("Starting node controller")
	defer logger.Info("Shutting down node controller")

	if !cache.WaitForNamedCacheSync("taint", ctx.Done(), nc.leaseInformerSynced, nc.nodeInformerSynced, nc.podInformerSynced, nc.daemonSetInformerSynced) {
		return
	}

	if !utilfeature.DefaultFeatureGate.Enabled(features.SeparateTaintEvictionController) {
		logger.Info("Starting", "controller", taintEvictionController)
		go nc.taintManager.Run(ctx)
	}

	// Start workers to reconcile labels and/or update the NoSchedule taint for nodes.
	for i := 0; i < nodeUpdateWorkerSize; i++ {
		// Thanks to "workqueue", each worker just needs to get an item from the
		// queue: the item is flagged when taken from the queue, so if a new event
		// comes in, the item is re-queued until "Done". Hence no more than one
		// worker handles the same item, and no event is missed.
		go wait.UntilWithContext(ctx, nc.doNodeProcessingPassWorker, time.Second)
	}

	for i := 0; i < podUpdateWorkerSize; i++ {
		go wait.UntilWithContext(ctx, nc.doPodProcessingWorker, time.Second)
	}

	// Handle taint-based evictions. Because we don't want dedicated logic in the
	// TaintManager for NC-originated taints, and we normally don't rate limit
	// evictions caused by taints, we need to rate limit adding taints.
	go wait.UntilWithContext(ctx, nc.doNoExecuteTaintingPass, scheduler.NodeEvictionPeriod)

	// Incorporate the results of the node health signal pushed from the kubelet to the master.
	go wait.UntilWithContext(ctx, func(ctx context.Context) {
		if err := nc.monitorNodeHealth(ctx); err != nil {
			logger.Error(err, "Error monitoring node health")
		}
	}, nc.nodeMonitorPeriod)

	<-ctx.Done()
}

func (nc *Controller) doNodeProcessingPassWorker(ctx context.Context) {
	logger := klog.FromContext(ctx)
	for {
		obj, shutdown := nc.nodeUpdateQueue.Get()
		// "nodeUpdateQueue" will be shut down when "stopCh" is closed;
		// we do not need to re-check "stopCh" again.
		if shutdown {
			return
		}
		nodeName := obj
		if err := nc.doNoScheduleTaintingPass(ctx, nodeName); err != nil {
			logger.Error(err, "Failed to taint NoSchedule on node, requeue it", "node", klog.KRef("", nodeName))
			// TODO(k82cn): Add nodeName back to the queue
		}
		// TODO: re-evaluate whether there are any labels that need to be
		// reconciled in 1.19. Remove this function if it's no longer necessary.
		if err := nc.reconcileNodeLabels(ctx, nodeName); err != nil {
			logger.Error(err, "Failed to reconcile labels for node, requeue it", "node", klog.KRef("", nodeName))
			// TODO(yujuhong): Add nodeName back to the queue
		}
		nc.nodeUpdateQueue.Done(nodeName)
	}
}
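// nodeQueueContractExample is an editor's sketch, not upstream code, of the
// workqueue contract doNodeProcessingPassWorker relies on: a key re-Added
// between Get and Done is held back until Done is called, so the same node
// is never processed by two workers concurrently and no update is dropped.
// It also sketches the requeue-on-error behavior the TODOs above refer to.
func nodeQueueContractExample(q workqueue.TypedInterface[string], process func(string) error) {
	for {
		nodeName, shutdown := q.Get()
		if shutdown {
			return
		}
		if err := process(nodeName); err != nil {
			q.Add(nodeName) // re-queue; delivery is deferred until Done below
		}
		q.Done(nodeName)
	}
}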
func (nc *Controller) doNoScheduleTaintingPass(ctx context.Context, nodeName string) error {
	node, err := nc.nodeLister.Get(nodeName)
	if err != nil {
		// If the node is not found, just ignore it.
		if apierrors.IsNotFound(err) {
			return nil
		}
		return err
	}

	// Map the node's conditions to taints.
	var taints []v1.Taint
	for _, condition := range node.Status.Conditions {
		if taintMap, found := nodeConditionToTaintKeyStatusMap[condition.Type]; found {
			if taintKey, found := taintMap[condition.Status]; found {
				taints = append(taints, v1.Taint{
					Key:    taintKey,
					Effect: v1.TaintEffectNoSchedule,
				})
			}
		}
	}
	if node.Spec.Unschedulable {
		// If unschedulable, append the related taint.
		taints = append(taints, v1.Taint{
			Key:    v1.TaintNodeUnschedulable,
			Effect: v1.TaintEffectNoSchedule,
		})
	}

	// Get the existing taints of the node.
	nodeTaints := taintutils.TaintSetFilter(node.Spec.Taints, func(t *v1.Taint) bool {
		// Only NoSchedule taints are candidates to be compared with "taints" later.
		if t.Effect != v1.TaintEffectNoSchedule {
			return false
		}
		// Find the unschedulable taint of the node.
		if t.Key == v1.TaintNodeUnschedulable {
			return true
		}
		// Find the node condition taints of the node.
		_, found := taintKeyToNodeConditionMap[t.Key]
		return found
	})
	taintsToAdd, taintsToDel := taintutils.TaintSetDiff(taints, nodeTaints)
	// If there is nothing to add or delete, return nil directly.
	if len(taintsToAdd) == 0 && len(taintsToDel) == 0 {
		return nil
	}
	if !controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, taintsToAdd, taintsToDel, node) {
		return fmt.Errorf("failed to swap taints of node %+v", node)
	}
	return nil
}
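// taintSetDiffExample is an editor's illustration, not upstream code, of the
// TaintSetDiff contract used above: taints present only in the desired set
// come back as additions, taints present only in the current set come back
// as deletions, and taints in both sets are left alone.
func taintSetDiffExample() (add, del []*v1.Taint) {
	desired := []v1.Taint{{Key: v1.TaintNodeUnschedulable, Effect: v1.TaintEffectNoSchedule}}
	current := []v1.Taint{{Key: v1.TaintNodeMemoryPressure, Effect: v1.TaintEffectNoSchedule}}
	// add holds the unschedulable taint, del the stale memory-pressure taint.
	return taintutils.TaintSetDiff(desired, current)
}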
func (nc *Controller) doNoExecuteTaintingPass(ctx context.Context) {
	// Extract the keys of the map in order to not hold
	// the evictorLock for the entire function, and hold it
	// only when necessary.
	var zoneNoExecuteTainterKeys []string
	func() {
		nc.evictorLock.Lock()
		defer nc.evictorLock.Unlock()

		zoneNoExecuteTainterKeys = make([]string, 0, len(nc.zoneNoExecuteTainter))
		for k := range nc.zoneNoExecuteTainter {
			zoneNoExecuteTainterKeys = append(zoneNoExecuteTainterKeys, k)
		}
	}()
	logger := klog.FromContext(ctx)
	for _, k := range zoneNoExecuteTainterKeys {
		var zoneNoExecuteTainterWorker *scheduler.RateLimitedTimedQueue
		func() {
			nc.evictorLock.Lock()
			defer nc.evictorLock.Unlock()
			// Extracting the value without checking if the key
			// exists or not is safe to do here since zones do
			// not get removed, and consequently pod evictors for
			// these zones also do not get removed, only added.
			zoneNoExecuteTainterWorker = nc.zoneNoExecuteTainter[k]
		}()
		// The function should return 'false' and a time after which it should be retried, or 'true' if it shouldn't (it succeeded).
		zoneNoExecuteTainterWorker.Try(logger, func(value scheduler.TimedValue) (bool, time.Duration) {
			node, err := nc.nodeLister.Get(value.Value)
			if apierrors.IsNotFound(err) {
				logger.Info("Node no longer present in nodeLister", "node", klog.KRef("", value.Value))
				return true, 0
			} else if err != nil {
				logger.Info("Failed to get Node from the nodeLister", "node", klog.KRef("", value.Value), "err", err)
				// retry in 50 milliseconds
				return false, 50 * time.Millisecond
			}
			_, condition := controllerutil.GetNodeCondition(&node.Status, v1.NodeReady)
			if condition == nil {
				logger.Info("Failed to get NodeCondition from the node status", "node", klog.KRef("", value.Value))
				// retry in 50 milliseconds
				return false, 50 * time.Millisecond
			}
			// Because we want to mimic NodeStatus.Condition["Ready"], we make the "unreachable" and "not ready" taints mutually exclusive.
			taintToAdd := v1.Taint{}
			oppositeTaint := v1.Taint{}
			switch condition.Status {
			case v1.ConditionFalse:
				taintToAdd = *NotReadyTaintTemplate
				oppositeTaint = *UnreachableTaintTemplate
			case v1.ConditionUnknown:
				taintToAdd = *UnreachableTaintTemplate
				oppositeTaint = *NotReadyTaintTemplate
			default:
				// It seems that the Node is ready again, so there's no need to taint it.
				logger.V(4).Info("Node was in a taint queue, but it's ready now. Ignoring taint request", "node", klog.KRef("", value.Value))
				return true, 0
			}
			result := controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{&oppositeTaint}, node)
			if result {
				// Count the number of evictions.
				zone := nodetopology.GetZoneKey(node)
				evictionsTotal.WithLabelValues(zone).Inc()
			}

			return result, 0
		})
	}
}
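// noExecuteTaintInvariantExample is an editor's sketch, not upstream code,
// of the invariant doNoExecuteTaintingPass maintains: because one NoExecute
// taint is always swapped for the other, a node never carries the not-ready
// and unreachable taints at the same time.
func noExecuteTaintInvariantExample(node *v1.Node) bool {
	notReady := taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate)
	unreachable := taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate)
	return !(notReady && unreachable)
}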
// monitorNodeHealth verifies that node health is constantly updated by the kubelet, and if not, posts "NodeReady==ConditionUnknown".
// This function will
//   - add nodes which are not ready or not reachable for a long period of time to a rate-limited
//     queue so that NoExecute taints can be added by the goroutine running the doNoExecuteTaintingPass function,
//   - update the PodReady condition of Pods according to the state of the Node Ready condition.
func (nc *Controller) monitorNodeHealth(ctx context.Context) error {
	start := nc.now()
	defer func() {
		updateAllNodesHealthDuration.Observe(time.Since(start.Time).Seconds())
	}()

	// We are listing nodes from the local cache as we can tolerate some small delays
	// compared to the state in etcd, and there is eventual consistency anyway.
	nodes, err := nc.nodeLister.List(labels.Everything())
	if err != nil {
		return err
	}
	added, deleted, newZoneRepresentatives := nc.classifyNodes(nodes)
	logger := klog.FromContext(ctx)
	for i := range newZoneRepresentatives {
		nc.addPodEvictorForNewZone(logger, newZoneRepresentatives[i])
	}
	for i := range added {
		logger.V(1).Info("Controller observed a new Node", "node", klog.KRef("", added[i].Name))
		controllerutil.RecordNodeEvent(ctx, nc.recorder, added[i].Name, string(added[i].UID), v1.EventTypeNormal, "RegisteredNode", fmt.Sprintf("Registered Node %v in Controller", added[i].Name))
		nc.knownNodeSet[added[i].Name] = added[i]
		nc.addPodEvictorForNewZone(logger, added[i])
		nc.markNodeAsReachable(ctx, added[i])
	}

	for i := range deleted {
		logger.V(1).Info("Controller observed a Node deletion", "node", klog.KRef("", deleted[i].Name))
		controllerutil.RecordNodeEvent(ctx, nc.recorder, deleted[i].Name, string(deleted[i].UID), v1.EventTypeNormal, "RemovingNode", fmt.Sprintf("Removing Node %v from Controller", deleted[i].Name))
		delete(nc.knownNodeSet, deleted[i].Name)
	}

	var zoneToNodeConditionsLock sync.Mutex
	zoneToNodeConditions := map[string][]*v1.NodeCondition{}
	updateNodeFunc := func(piece int) {
		start := nc.now()
		defer func() {
			updateNodeHealthDuration.Observe(time.Since(start.Time).Seconds())
		}()

		var observedReadyCondition v1.NodeCondition
		var currentReadyCondition *v1.NodeCondition
		node := nodes[piece].DeepCopy()

		if err := wait.PollImmediate(retrySleepTime, retrySleepTime*scheduler.NodeHealthUpdateRetry, func() (bool, error) {
			var err error
			_, observedReadyCondition, currentReadyCondition, err = nc.tryUpdateNodeHealth(ctx, node)
			if err == nil {
				return true, nil
			}
			name := node.Name
			node, err = nc.kubeClient.CoreV1().Nodes().Get(ctx, name, metav1.GetOptions{})
			if err != nil {
				logger.Error(nil, "Failed while getting a Node to retry updating node health. Probably Node was deleted", "node", klog.KRef("", name))
				return false, err
			}
			return false, nil
		}); err != nil {
			logger.Error(err, "Update health of Node from Controller error, Skipping - no pods will be evicted", "node", klog.KObj(node))
			return
		}

		// Some nodes may be excluded from disruption checking.
		if !isNodeExcludedFromDisruptionChecks(node) {
			zoneToNodeConditionsLock.Lock()
			zoneToNodeConditions[nodetopology.GetZoneKey(node)] = append(zoneToNodeConditions[nodetopology.GetZoneKey(node)], currentReadyCondition)
			zoneToNodeConditionsLock.Unlock()
		}

		if currentReadyCondition != nil {
			pods, err := nc.getPodsAssignedToNode(node.Name)
			if err != nil {
				utilruntime.HandleError(fmt.Errorf("unable to list pods of node %v: %v", node.Name, err))
				if currentReadyCondition.Status != v1.ConditionTrue && observedReadyCondition.Status == v1.ConditionTrue {
					// If an error happened during the node status transition (Ready -> NotReady),
					// we need to mark the node for retry to force MarkPodsNotReady execution
					// in the next iteration.
					nc.nodesToRetry.Store(node.Name, struct{}{})
				}
				return
			}
			nc.processTaintBaseEviction(ctx, node, &observedReadyCondition)

			_, needsRetry := nc.nodesToRetry.Load(node.Name)
			switch {
			case currentReadyCondition.Status != v1.ConditionTrue && observedReadyCondition.Status == v1.ConditionTrue:
				// Report the node event only once when the status changed.
				controllerutil.RecordNodeStatusChange(logger, nc.recorder, node, "NodeNotReady")
				fallthrough
			case needsRetry && observedReadyCondition.Status != v1.ConditionTrue:
				if err = controllerutil.MarkPodsNotReady(ctx, nc.kubeClient, nc.recorder, pods, node.Name); err != nil {
					utilruntime.HandleError(fmt.Errorf("unable to mark all pods NotReady on node %v: %v; queuing for retry", node.Name, err))
					nc.nodesToRetry.Store(node.Name, struct{}{})
					return
				}
			}
		}
		nc.nodesToRetry.Delete(node.Name)
	}

	// Marking the pods not ready on a node requires looping over them and
	// updating each pod's status one at a time. This is performed serially, and
	// can take a while if we're processing each node serially as well. So we
	// process them with bounded concurrency instead, since most of the time is
	// spent waiting on io.
	workqueue.ParallelizeUntil(ctx, nc.nodeUpdateWorkerSize, len(nodes), updateNodeFunc)

	nc.handleDisruption(ctx, zoneToNodeConditions, nodes)

	return nil
}

func (nc *Controller) processTaintBaseEviction(ctx context.Context, node *v1.Node, observedReadyCondition *v1.NodeCondition) {
	decisionTimestamp := nc.now()
	// Check the eviction timeout against decisionTimestamp.
	logger := klog.FromContext(ctx)
	switch observedReadyCondition.Status {
	case v1.ConditionFalse:
		// We want to update the taint straight away if the Node is already tainted with the UnreachableTaint.
		if taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) {
			taintToAdd := *NotReadyTaintTemplate
			if !controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{UnreachableTaintTemplate}, node) {
				logger.Error(nil, "Failed to instantly swap UnreachableTaint to NotReadyTaint. Will try again in the next cycle")
			}
		} else if nc.markNodeForTainting(node, v1.ConditionFalse) {
			logger.V(2).Info("Node is NotReady. Adding it to the Taint queue", "node", klog.KObj(node), "timeStamp", decisionTimestamp)
		}
	case v1.ConditionUnknown:
		// We want to update the taint straight away if the Node is already tainted with the NotReadyTaint.
		if taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) {
			taintToAdd := *UnreachableTaintTemplate
			if !controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{NotReadyTaintTemplate}, node) {
				logger.Error(nil, "Failed to instantly swap NotReadyTaint to UnreachableTaint. Will try again in the next cycle")
			}
		} else if nc.markNodeForTainting(node, v1.ConditionUnknown) {
			logger.V(2).Info("Node is unresponsive. Adding it to the Taint queue", "node", klog.KObj(node), "timeStamp", decisionTimestamp)
		}
	case v1.ConditionTrue:
		removed, err := nc.markNodeAsReachable(ctx, node)
		if err != nil {
			logger.Error(nil, "Failed to remove taints from node. Will retry in next iteration", "node", klog.KObj(node))
		}
		if removed {
			logger.V(2).Info("Node is healthy again, removing all taints", "node", klog.KObj(node))
		}
	}
}
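// parallelizeExample is an editor's sketch, not upstream code, of the
// bounded-concurrency pattern monitorNodeHealth uses above: ParallelizeUntil
// runs the per-node function with at most `workers` goroutines, so slow,
// IO-bound status updates for many nodes overlap instead of running serially.
func parallelizeExample(ctx context.Context, workers int, nodes []*v1.Node, process func(*v1.Node)) {
	workqueue.ParallelizeUntil(ctx, workers, len(nodes), func(i int) {
		process(nodes[i])
	})
}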
// labelNodeDisruptionExclusion is a label on nodes that controls whether they are
// excluded from being considered for disruption checks by the node controller.
const labelNodeDisruptionExclusion = "node.kubernetes.io/exclude-disruption"

func isNodeExcludedFromDisruptionChecks(node *v1.Node) bool {
	if _, ok := node.Labels[labelNodeDisruptionExclusion]; ok {
		return true
	}
	return false
}

// tryUpdateNodeHealth checks a given node's conditions and tries to update it. Returns the grace period to
// which the given node is entitled, the last observed and current Ready Conditions, and an error if one occurred.
func (nc *Controller) tryUpdateNodeHealth(ctx context.Context, node *v1.Node) (time.Duration, v1.NodeCondition, *v1.NodeCondition, error) {
	nodeHealth := nc.nodeHealthMap.getDeepCopy(node.Name)
	defer func() {
		nc.nodeHealthMap.set(node.Name, nodeHealth)
	}()

	var gracePeriod time.Duration
	var observedReadyCondition v1.NodeCondition
	_, currentReadyCondition := controllerutil.GetNodeCondition(&node.Status, v1.NodeReady)
	if currentReadyCondition == nil {
		// If the ready condition is nil, then the kubelet (or nodecontroller) never posted the node status.
		// A fake ready condition is created, where LastHeartbeatTime and LastTransitionTime are set
		// to node.CreationTimestamp, to avoid handling the corner case.
		observedReadyCondition = v1.NodeCondition{
			Type:               v1.NodeReady,
			Status:             v1.ConditionUnknown,
			LastHeartbeatTime:  node.CreationTimestamp,
			LastTransitionTime: node.CreationTimestamp,
		}
		gracePeriod = nc.nodeStartupGracePeriod
		if nodeHealth != nil {
			nodeHealth.status = &node.Status
		} else {
			nodeHealth = &nodeHealthData{
				status:                   &node.Status,
				probeTimestamp:           node.CreationTimestamp,
				readyTransitionTimestamp: node.CreationTimestamp,
			}
		}
	} else {
		// If the ready condition is not nil, make a copy of it, since we may modify it in place later.
		observedReadyCondition = *currentReadyCondition
		gracePeriod = nc.nodeMonitorGracePeriod
	}
	// There are the following cases to check:
	// - both the saved and new status have no Ready Condition set - we leave everything as it is,
	// - the saved status has no Ready Condition, but the current one does - the Controller was restarted with Node data already present in etcd,
	// - the saved status has some Ready Condition, but the current one does not - it's an error, but we fill it up because that's probably a good thing to do,
	// - both the saved and current statuses have Ready Conditions and they have the same LastProbeTime - nothing happened on that Node, it may be
	//   unresponsive, so we leave it as it is,
	// - both the saved and current statuses have Ready Conditions, they have different LastProbeTimes, but the same Ready Condition state -
	//   everything is in order, no transition occurred, we update only probeTimestamp,
	// - both the saved and current statuses have Ready Conditions, different LastProbeTimes and different Ready Condition states -
	//   the Ready Condition changed its state since we last saw it, so we update both probeTimestamp and readyTransitionTimestamp.
	// TODO: things to consider:
	// - if 'LastProbeTime' has gone back in time, it's probably an error, currently we ignore it,
	// - currently the only correct Ready State transition outside of the Node Controller is marking it ready by the kubelet. We don't check
	//   if that's the case, but it does not seem necessary.
	var savedCondition *v1.NodeCondition
	var savedLease *coordv1.Lease
	if nodeHealth != nil {
		_, savedCondition = controllerutil.GetNodeCondition(nodeHealth.status, v1.NodeReady)
		savedLease = nodeHealth.lease
	}
	logger := klog.FromContext(ctx)
	if nodeHealth == nil {
		logger.Info("Missing timestamp for Node. Assuming now as a timestamp", "node", klog.KObj(node))
		nodeHealth = &nodeHealthData{
			status:                   &node.Status,
			probeTimestamp:           nc.now(),
			readyTransitionTimestamp: nc.now(),
		}
	} else if savedCondition == nil && currentReadyCondition != nil {
		logger.V(1).Info("Creating timestamp entry for newly observed Node", "node", klog.KObj(node))
		nodeHealth = &nodeHealthData{
			status:                   &node.Status,
			probeTimestamp:           nc.now(),
			readyTransitionTimestamp: nc.now(),
		}
	} else if savedCondition != nil && currentReadyCondition == nil {
		logger.Error(nil, "ReadyCondition was removed from Status of Node", "node", klog.KObj(node))
		// TODO: figure out what to do in this case. For now we do the same thing as above.
		nodeHealth = &nodeHealthData{
			status:                   &node.Status,
			probeTimestamp:           nc.now(),
			readyTransitionTimestamp: nc.now(),
		}
	} else if savedCondition != nil && currentReadyCondition != nil && savedCondition.LastHeartbeatTime != currentReadyCondition.LastHeartbeatTime {
		var transitionTime metav1.Time
		// If the ReadyCondition changed since the last time we checked, we update the transition timestamp to "now",
		// otherwise we leave it as it is.
		if savedCondition.LastTransitionTime != currentReadyCondition.LastTransitionTime {
			logger.V(3).Info("ReadyCondition for Node transitioned from savedCondition to currentReadyCondition", "node", klog.KObj(node), "savedCondition", savedCondition, "currentReadyCondition", currentReadyCondition)
			transitionTime = nc.now()
		} else {
			transitionTime = nodeHealth.readyTransitionTimestamp
		}
		if loggerV := logger.V(5); loggerV.Enabled() {
			loggerV.Info("Node ReadyCondition updated. Updating timestamp", "node", klog.KObj(node), "nodeHealthStatus", nodeHealth.status, "nodeStatus", node.Status)
		} else {
			logger.V(3).Info("Node ReadyCondition updated. Updating timestamp", "node", klog.KObj(node))
		}
		nodeHealth = &nodeHealthData{
			status:                   &node.Status,
			probeTimestamp:           nc.now(),
			readyTransitionTimestamp: transitionTime,
		}
	}
	// Always update the probe time if the node lease is renewed.
	// Note: If the kubelet never posted the node status, but continues renewing the
	// heartbeat leases, the node controller will assume the node is healthy and
	// take no action.
	observedLease, _ := nc.leaseLister.Leases(v1.NamespaceNodeLease).Get(node.Name)
	if observedLease != nil && (savedLease == nil || savedLease.Spec.RenewTime.Before(observedLease.Spec.RenewTime)) {
		nodeHealth.lease = observedLease
		nodeHealth.probeTimestamp = nc.now()
	}

	if nc.now().After(nodeHealth.probeTimestamp.Add(gracePeriod)) {
		// The NodeReady condition or lease was last set longer ago than gracePeriod, so
		// update it to Unknown (regardless of its current value) in the master.

		nodeConditionTypes := []v1.NodeConditionType{
			v1.NodeReady,
			v1.NodeMemoryPressure,
			v1.NodeDiskPressure,
			v1.NodePIDPressure,
			// We don't change the 'NodeNetworkUnavailable' condition, as it's managed on a control plane level.
			// v1.NodeNetworkUnavailable,
		}

		nowTimestamp := nc.now()
		for _, nodeConditionType := range nodeConditionTypes {
			_, currentCondition := controllerutil.GetNodeCondition(&node.Status, nodeConditionType)
			if currentCondition == nil {
				logger.V(2).Info("Condition of node was never updated by kubelet", "nodeConditionType", nodeConditionType, "node", klog.KObj(node))
				node.Status.Conditions = append(node.Status.Conditions, v1.NodeCondition{
					Type:               nodeConditionType,
					Status:             v1.ConditionUnknown,
					Reason:             "NodeStatusNeverUpdated",
					Message:            "Kubelet never posted node status.",
					LastHeartbeatTime:  node.CreationTimestamp,
					LastTransitionTime: nowTimestamp,
				})
			} else {
				logger.V(2).Info("Node hasn't been updated",
					"node", klog.KObj(node), "duration", nc.now().Time.Sub(nodeHealth.probeTimestamp.Time), "nodeConditionType", nodeConditionType, "currentCondition", currentCondition)
				if currentCondition.Status != v1.ConditionUnknown {
					currentCondition.Status = v1.ConditionUnknown
					currentCondition.Reason = "NodeStatusUnknown"
					currentCondition.Message = "Kubelet stopped posting node status."
					currentCondition.LastTransitionTime = nowTimestamp
				}
			}
		}
		// We need to update currentReadyCondition because its value may have changed.
		_, currentReadyCondition = controllerutil.GetNodeCondition(&node.Status, v1.NodeReady)

		if !apiequality.Semantic.DeepEqual(currentReadyCondition, &observedReadyCondition) {
			if _, err := nc.kubeClient.CoreV1().Nodes().UpdateStatus(ctx, node, metav1.UpdateOptions{}); err != nil {
				logger.Error(err, "Error updating node", "node", klog.KObj(node))
				return gracePeriod, observedReadyCondition, currentReadyCondition, err
			}
			nodeHealth = &nodeHealthData{
				status:                   &node.Status,
				probeTimestamp:           nodeHealth.probeTimestamp,
				readyTransitionTimestamp: nc.now(),
				lease:                    observedLease,
			}
			return gracePeriod, observedReadyCondition, currentReadyCondition, nil
		}
	}

	return gracePeriod, observedReadyCondition, currentReadyCondition, nil
}
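// probeExpiredExample is an editor's sketch, not upstream code, of the
// staleness test at the heart of tryUpdateNodeHealth: a node's conditions are
// forced to Unknown once the last accepted health signal (a status update or
// a renewed lease, whichever bumped probeTimestamp most recently) is older
// than the applicable grace period.
func probeExpiredExample(now, probeTimestamp metav1.Time, gracePeriod time.Duration) bool {
	return now.After(probeTimestamp.Add(gracePeriod))
}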
func (nc *Controller) handleDisruption(ctx context.Context, zoneToNodeConditions map[string][]*v1.NodeCondition, nodes []*v1.Node) {
	newZoneStates := map[string]ZoneState{}
	allAreFullyDisrupted := true
	logger := klog.FromContext(ctx)
	for k, v := range zoneToNodeConditions {
		zoneSize.WithLabelValues(k).Set(float64(len(v)))
		unhealthy, newState := nc.computeZoneStateFunc(v)
		zoneHealth.WithLabelValues(k).Set(float64(100*(len(v)-unhealthy)) / float64(len(v)))
		unhealthyNodes.WithLabelValues(k).Set(float64(unhealthy))
		if newState != stateFullDisruption {
			allAreFullyDisrupted = false
		}
		newZoneStates[k] = newState
		if _, had := nc.zoneStates[k]; !had {
			logger.Error(nil, "Setting initial state for unseen zone", "zone", k)
			nc.zoneStates[k] = stateInitial
		}
	}

	allWasFullyDisrupted := true
	for k, v := range nc.zoneStates {
		if _, have := zoneToNodeConditions[k]; !have {
			zoneSize.WithLabelValues(k).Set(0)
			zoneHealth.WithLabelValues(k).Set(100)
			unhealthyNodes.WithLabelValues(k).Set(0)
			delete(nc.zoneStates, k)
			continue
		}
		if v != stateFullDisruption {
			allWasFullyDisrupted = false
			break
		}
	}

	// At least one node was responding in the previous pass or in the current pass. The semantics are as follows:
	// - if the new state is "partialDisruption", we call a user-defined function that returns a new limiter to use,
	// - if the new state is "normal", we resume normal operation (go back to default limiter settings),
	// - if the new state is "fullDisruption", we restore the normal eviction rate,
	//   unless all zones in the cluster are in "fullDisruption" - in that case we stop all evictions.
	if !allAreFullyDisrupted || !allWasFullyDisrupted {
		// We're switching to full disruption mode.
		if allAreFullyDisrupted {
			logger.Info("Controller detected that all Nodes are not-Ready. Entering master disruption mode")
			for i := range nodes {
				_, err := nc.markNodeAsReachable(ctx, nodes[i])
				if err != nil {
					logger.Error(nil, "Failed to remove taints from Node", "node", klog.KObj(nodes[i]))
				}
			}
			// We stop all evictions.
			for k := range nc.zoneStates {
				nc.zoneNoExecuteTainter[k].SwapLimiter(0)
			}
			for k := range nc.zoneStates {
				nc.zoneStates[k] = stateFullDisruption
			}
			// All rate limiters are updated, so we can return early here.
			return
		}
		// We're exiting full disruption mode.
		if allWasFullyDisrupted {
			logger.Info("Controller detected that some Nodes are Ready. Exiting master disruption mode")
			// When exiting disruption mode, update probe timestamps on all Nodes.
			now := nc.now()
			for i := range nodes {
				v := nc.nodeHealthMap.getDeepCopy(nodes[i].Name)
				v.probeTimestamp = now
				v.readyTransitionTimestamp = now
				nc.nodeHealthMap.set(nodes[i].Name, v)
			}
			// We reset all rate limiters to settings appropriate for the given state.
			for k := range nc.zoneStates {
				nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newZoneStates[k])
				nc.zoneStates[k] = newZoneStates[k]
			}
			return
		}
		// We know that there's at least one not-fully-disrupted zone, so
		// we can use the default behavior for rate limiters.
		for k, v := range nc.zoneStates {
			newState := newZoneStates[k]
			if v == newState {
				continue
			}
			logger.Info("Controller detected that zone is now in new state", "zone", k, "newState", newState)
			nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newState)
			nc.zoneStates[k] = newState
		}
	}
}

func (nc *Controller) podUpdated(oldPod, newPod *v1.Pod) {
	if newPod == nil {
		return
	}
	if len(newPod.Spec.NodeName) != 0 && (oldPod == nil || newPod.Spec.NodeName != oldPod.Spec.NodeName) {
		podItem := podUpdateItem{newPod.Namespace, newPod.Name}
		nc.podUpdateQueue.Add(podItem)
	}
}

func (nc *Controller) doPodProcessingWorker(ctx context.Context) {
	for {
		obj, shutdown := nc.podUpdateQueue.Get()
		// "podUpdateQueue" will be shut down when "stopCh" is closed;
		// we do not need to re-check "stopCh" again.
		if shutdown {
			return
		}

		podItem := obj
		nc.processPod(ctx, podItem)
	}
}
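// podUpdateEnqueueExample is an editor's sketch, not upstream code, of the
// filter podUpdated applies above: only pods that were just assigned to a
// node, or reassigned to a different node, are queued for processing.
func podUpdateEnqueueExample(oldPod, newPod *v1.Pod) bool {
	return newPod != nil && len(newPod.Spec.NodeName) != 0 &&
		(oldPod == nil || newPod.Spec.NodeName != oldPod.Spec.NodeName)
}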
// processPod processes events of assigning pods to nodes. In particular:
// 1. for a NodeReady=true node, taint eviction for this pod will be cancelled,
// 2. for a NodeReady=false or unknown node, taint eviction of the pod will happen and the pod will be marked as not ready,
// 3. if the node doesn't exist in the cache, it will be skipped.
func (nc *Controller) processPod(ctx context.Context, podItem podUpdateItem) {
	defer nc.podUpdateQueue.Done(podItem)
	pod, err := nc.podLister.Pods(podItem.namespace).Get(podItem.name)
	logger := klog.FromContext(ctx)
	if err != nil {
		if apierrors.IsNotFound(err) {
			// If the pod was deleted, there is no need to requeue.
			return
		}
		logger.Info("Failed to read pod", "pod", klog.KRef(podItem.namespace, podItem.name), "err", err)
		nc.podUpdateQueue.AddRateLimited(podItem)
		return
	}

	nodeName := pod.Spec.NodeName

	nodeHealth := nc.nodeHealthMap.getDeepCopy(nodeName)
	if nodeHealth == nil {
		// Node data is not gathered yet or node has been removed in the meantime.
		return
	}

	_, err = nc.nodeLister.Get(nodeName)
	if err != nil {
		logger.Info("Failed to read node", "node", klog.KRef("", nodeName), "err", err)
		nc.podUpdateQueue.AddRateLimited(podItem)
		return
	}

	_, currentReadyCondition := controllerutil.GetNodeCondition(nodeHealth.status, v1.NodeReady)
	if currentReadyCondition == nil {
		// Lack of a NodeReady condition may only happen after node addition (or if it was maliciously deleted).
		// In both cases, the pod will be handled correctly (evicted if needed) during processing
		// of the next node update event.
		return
	}

	pods := []*v1.Pod{pod}
	if currentReadyCondition.Status != v1.ConditionTrue {
		if err := controllerutil.MarkPodsNotReady(ctx, nc.kubeClient, nc.recorder, pods, nodeName); err != nil {
			logger.Info("Unable to mark pod NotReady on node", "pod", klog.KRef(podItem.namespace, podItem.name), "node", klog.KRef("", nodeName), "err", err)
			nc.podUpdateQueue.AddRateLimited(podItem)
		}
	}
}

func (nc *Controller) setLimiterInZone(zone string, zoneSize int, state ZoneState) {
	switch state {
	case stateNormal:
		nc.zoneNoExecuteTainter[zone].SwapLimiter(nc.evictionLimiterQPS)
	case statePartialDisruption:
		nc.zoneNoExecuteTainter[zone].SwapLimiter(
			nc.enterPartialDisruptionFunc(zoneSize))
	case stateFullDisruption:
		nc.zoneNoExecuteTainter[zone].SwapLimiter(
			nc.enterFullDisruptionFunc(zoneSize))
	}
}
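// Editor's note on effective rates (illustration, not upstream code): with
// the kube-controller-manager defaults (evictionLimiterQPS=0.1,
// secondaryEvictionLimiterQPS=0.01, largeClusterThreshold=50),
// setLimiterInZone yields:
//   - Normal zone:            0.1 taint operations/s,
//   - PartialDisruption zone: 0.01/s if the zone has more than 50 nodes,
//     otherwise 0 (evictions stop entirely; see ReducedQPSFunc below),
//   - FullDisruption zone:    back to 0.1/s, unless every zone is fully
//     disrupted, in which case handleDisruption has already set all
//     limiters to 0.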
// classifyNodes classifies allNodes into three categories:
//  1. added: the nodes that are in 'allNodes', but not in 'knownNodeSet',
//  2. deleted: the nodes that are in 'knownNodeSet', but not in 'allNodes',
//  3. newZoneRepresentatives: the nodes that are in both 'knownNodeSet' and 'allNodes', but have no zone states.
func (nc *Controller) classifyNodes(allNodes []*v1.Node) (added, deleted, newZoneRepresentatives []*v1.Node) {
	for i := range allNodes {
		if _, has := nc.knownNodeSet[allNodes[i].Name]; !has {
			added = append(added, allNodes[i])
		} else {
			// Currently, we only consider a new zone as updated.
			zone := nodetopology.GetZoneKey(allNodes[i])
			if _, found := nc.zoneStates[zone]; !found {
				newZoneRepresentatives = append(newZoneRepresentatives, allNodes[i])
			}
		}
	}

	// If there's a difference between the lengths of known Nodes and observed nodes,
	// we must have removed some Node.
	if len(nc.knownNodeSet)+len(added) != len(allNodes) {
		knowSetCopy := map[string]*v1.Node{}
		for k, v := range nc.knownNodeSet {
			knowSetCopy[k] = v
		}
		for i := range allNodes {
			delete(knowSetCopy, allNodes[i].Name)
		}
		for i := range knowSetCopy {
			deleted = append(deleted, knowSetCopy[i])
		}
	}
	return
}

// HealthyQPSFunc returns the default value for the cluster eviction rate - we take
// nodeNum for consistency with ReducedQPSFunc.
func (nc *Controller) HealthyQPSFunc(nodeNum int) float32 {
	return nc.evictionLimiterQPS
}

// ReducedQPSFunc returns the reduced QPS: if the cluster is large, make
// evictions slower; if it is small, stop evictions altogether.
func (nc *Controller) ReducedQPSFunc(nodeNum int) float32 {
	if int32(nodeNum) > nc.largeClusterThreshold {
		return nc.secondaryEvictionLimiterQPS
	}
	return 0
}

// addPodEvictorForNewZone checks if a new zone appeared, and if so, adds a new evictor.
func (nc *Controller) addPodEvictorForNewZone(logger klog.Logger, node *v1.Node) {
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	zone := nodetopology.GetZoneKey(node)
	if _, found := nc.zoneStates[zone]; !found {
		nc.zoneStates[zone] = stateInitial
		nc.zoneNoExecuteTainter[zone] =
			scheduler.NewRateLimitedTimedQueue(
				flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, scheduler.EvictionRateLimiterBurst))
		// Init the metric for the new zone.
		logger.Info("Initializing eviction metric for zone", "zone", zone)
		evictionsTotal.WithLabelValues(zone).Add(0)
	}
}

func (nc *Controller) markNodeForTainting(node *v1.Node, status v1.ConditionStatus) bool {
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()
	if status == v1.ConditionFalse {
		if !taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) {
			nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Remove(node.Name)
		}
	}

	if status == v1.ConditionUnknown {
		if !taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) {
			nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Remove(node.Name)
		}
	}

	return nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Add(node.Name, string(node.UID))
}

func (nc *Controller) markNodeAsReachable(ctx context.Context, node *v1.Node) (bool, error) {
	err := controller.RemoveTaintOffNode(ctx, nc.kubeClient, node.Name, node, UnreachableTaintTemplate)
	logger := klog.FromContext(ctx)
	if err != nil {
		logger.Error(err, "Failed to remove taint from node", "node", klog.KObj(node))
		return false, err
	}
	err = controller.RemoveTaintOffNode(ctx, nc.kubeClient, node.Name, node, NotReadyTaintTemplate)
	if err != nil {
		logger.Error(err, "Failed to remove taint from node", "node", klog.KObj(node))
		return false, err
	}
	nc.evictorLock.Lock()
	defer nc.evictorLock.Unlock()

	return nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Remove(node.Name), nil
}
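// newZoneLimiterExample is an editor's sketch, not upstream code, of the
// rate limiter installed for a newly observed zone by addPodEvictorForNewZone:
// a token bucket refilled at qps tokens per second with a burst of
// scheduler.EvictionRateLimiterBurst, so taint operations in a fresh zone are
// paced exactly like in an established Normal zone.
func newZoneLimiterExample(qps float32) *scheduler.RateLimitedTimedQueue {
	return scheduler.NewRateLimitedTimedQueue(
		flowcontrol.NewTokenBucketRateLimiter(qps, scheduler.EvictionRateLimiterBurst))
}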
// ComputeZoneState computes the state of a zone from the NodeReady conditions of its Nodes,
// returning the number of NotReady Nodes together with the zone state.
// The zone is considered:
// - fullyDisrupted if there are no Ready Nodes,
// - partiallyDisrupted if more than two Nodes are NotReady and at least the nc.unhealthyZoneThreshold fraction of Nodes is NotReady,
// - normal otherwise.
func (nc *Controller) ComputeZoneState(nodeReadyConditions []*v1.NodeCondition) (int, ZoneState) {
	readyNodes := 0
	notReadyNodes := 0
	for i := range nodeReadyConditions {
		if nodeReadyConditions[i] != nil && nodeReadyConditions[i].Status == v1.ConditionTrue {
			readyNodes++
		} else {
			notReadyNodes++
		}
	}
	switch {
	case readyNodes == 0 && notReadyNodes > 0:
		return notReadyNodes, stateFullDisruption
	case notReadyNodes > 2 && float32(notReadyNodes)/float32(notReadyNodes+readyNodes) >= nc.unhealthyZoneThreshold:
		return notReadyNodes, statePartialDisruption
	default:
		return notReadyNodes, stateNormal
	}
}

// reconcileNodeLabels reconciles node labels.
func (nc *Controller) reconcileNodeLabels(ctx context.Context, nodeName string) error {
	node, err := nc.nodeLister.Get(nodeName)
	if err != nil {
		// If the node is not found, just ignore it.
		if apierrors.IsNotFound(err) {
			return nil
		}
		return err
	}

	if node.Labels == nil {
		// Nothing to reconcile.
		return nil
	}

	labelsToUpdate := map[string]string{}
	for _, r := range labelReconcileInfo {
		primaryValue, primaryExists := node.Labels[r.primaryKey]
		secondaryValue, secondaryExists := node.Labels[r.secondaryKey]

		if !primaryExists {
			// The primary label key does not exist. This should not happen
			// within our supported version skew range, when no external
			// components/factors are modifying the node object. Ignore this case.
			continue
		}
		if secondaryExists && primaryValue != secondaryValue {
			// The secondary label exists, but is not consistent with the primary
			// label. Need to reconcile.
			labelsToUpdate[r.secondaryKey] = primaryValue
		} else if !secondaryExists && r.ensureSecondaryExists {
			// Apply the secondary label based on the primary label.
			labelsToUpdate[r.secondaryKey] = primaryValue
		}
	}

	if len(labelsToUpdate) == 0 {
		return nil
	}
	if !controllerutil.AddOrUpdateLabelsOnNode(ctx, nc.kubeClient, labelsToUpdate, node) {
		return fmt.Errorf("failed to update labels for node %+v", node)
	}
	return nil
}
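// computeZoneStateExample is an editor's worked example, not upstream code:
// in a 10-node zone with 6 NotReady Nodes and the kube-controller-manager
// default unhealthyZoneThreshold of 0.55, 6 > 2 and 6/10 >= 0.55, so the
// zone is classified as PartialDisruption.
func computeZoneStateExample(nc *Controller) (int, ZoneState) {
	conditions := make([]*v1.NodeCondition, 0, 10)
	for i := 0; i < 4; i++ {
		conditions = append(conditions, &v1.NodeCondition{Type: v1.NodeReady, Status: v1.ConditionTrue})
	}
	for i := 0; i < 6; i++ {
		conditions = append(conditions, &v1.NodeCondition{Type: v1.NodeReady, Status: v1.ConditionFalse})
	}
	return nc.ComputeZoneState(conditions) // returns (6, statePartialDisruption)
}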