k8s.io/kubernetes@v1.29.3/pkg/controller/nodelifecycle/node_lifecycle_controller.go

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  // The Controller sets taints on nodes.
    18  // Tainted nodes should not be used for new workloads, and
    19  // some effort should be made to move existing workloads
    20  // off of tainted nodes.
    21  
    22  package nodelifecycle
    23  
    24  import (
    25  	"context"
    26  	"fmt"
    27  	"sync"
    28  	"time"
    29  
    30  	"k8s.io/klog/v2"
    31  
    32  	coordv1 "k8s.io/api/coordination/v1"
    33  	v1 "k8s.io/api/core/v1"
    34  	apiequality "k8s.io/apimachinery/pkg/api/equality"
    35  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    36  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    37  	"k8s.io/apimachinery/pkg/labels"
    38  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    39  	"k8s.io/apimachinery/pkg/util/wait"
    40  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    41  	appsv1informers "k8s.io/client-go/informers/apps/v1"
    42  	coordinformers "k8s.io/client-go/informers/coordination/v1"
    43  	coreinformers "k8s.io/client-go/informers/core/v1"
    44  	clientset "k8s.io/client-go/kubernetes"
    45  	"k8s.io/client-go/kubernetes/scheme"
    46  	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
    47  	appsv1listers "k8s.io/client-go/listers/apps/v1"
    48  	coordlisters "k8s.io/client-go/listers/coordination/v1"
    49  	corelisters "k8s.io/client-go/listers/core/v1"
    50  	"k8s.io/client-go/tools/cache"
    51  	"k8s.io/client-go/tools/record"
    52  	"k8s.io/client-go/util/flowcontrol"
    53  	"k8s.io/client-go/util/workqueue"
    54  	nodetopology "k8s.io/component-helpers/node/topology"
    55  	kubeletapis "k8s.io/kubelet/pkg/apis"
    56  	"k8s.io/kubernetes/pkg/controller"
    57  	"k8s.io/kubernetes/pkg/controller/nodelifecycle/scheduler"
    58  	"k8s.io/kubernetes/pkg/controller/tainteviction"
    59  	controllerutil "k8s.io/kubernetes/pkg/controller/util/node"
    60  	"k8s.io/kubernetes/pkg/features"
    61  	taintutils "k8s.io/kubernetes/pkg/util/taints"
    62  )
    63  
    64  func init() {
    65  	// Register prometheus metrics
    66  	Register()
    67  }
    68  
    69  var (
    70  	// UnreachableTaintTemplate is the taint for when a node becomes unreachable.
    71  	UnreachableTaintTemplate = &v1.Taint{
    72  		Key:    v1.TaintNodeUnreachable,
    73  		Effect: v1.TaintEffectNoExecute,
    74  	}
    75  
    76  	// NotReadyTaintTemplate is the taint for when a node is not ready for
    77  	// executing pods
    78  	NotReadyTaintTemplate = &v1.Taint{
    79  		Key:    v1.TaintNodeNotReady,
    80  		Effect: v1.TaintEffectNoExecute,
    81  	}
    82  
    83  	// map {NodeConditionType: {ConditionStatus: TaintKey}}
    84  	// describes which TaintKey a node should get for a given NodeConditionType
    85  	// under a given ConditionStatus.
    86  	// For certain NodeConditionTypes there are multiple {ConditionStatus, TaintKey} pairs.
    87  	nodeConditionToTaintKeyStatusMap = map[v1.NodeConditionType]map[v1.ConditionStatus]string{
    88  		v1.NodeReady: {
    89  			v1.ConditionFalse:   v1.TaintNodeNotReady,
    90  			v1.ConditionUnknown: v1.TaintNodeUnreachable,
    91  		},
    92  		v1.NodeMemoryPressure: {
    93  			v1.ConditionTrue: v1.TaintNodeMemoryPressure,
    94  		},
    95  		v1.NodeDiskPressure: {
    96  			v1.ConditionTrue: v1.TaintNodeDiskPressure,
    97  		},
    98  		v1.NodeNetworkUnavailable: {
    99  			v1.ConditionTrue: v1.TaintNodeNetworkUnavailable,
   100  		},
   101  		v1.NodePIDPressure: {
   102  			v1.ConditionTrue: v1.TaintNodePIDPressure,
   103  		},
   104  	}
   105  
   106  	taintKeyToNodeConditionMap = map[string]v1.NodeConditionType{
   107  		v1.TaintNodeNotReady:           v1.NodeReady,
   108  		v1.TaintNodeUnreachable:        v1.NodeReady,
   109  		v1.TaintNodeNetworkUnavailable: v1.NodeNetworkUnavailable,
   110  		v1.TaintNodeMemoryPressure:     v1.NodeMemoryPressure,
   111  		v1.TaintNodeDiskPressure:       v1.NodeDiskPressure,
   112  		v1.TaintNodePIDPressure:        v1.NodePIDPressure,
   113  	}
   114  )
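
         // For example, a node reporting the condition Ready=False is expected to carry the
         // node.kubernetes.io/not-ready:NoSchedule taint (added by doNoScheduleTaintingPass),
         // while Ready=Unknown maps to node.kubernetes.io/unreachable; the NoExecute variants
         // of these two taints are applied separately via the templates above.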
   115  
   116  // ZoneState is the state of a given zone.
   117  type ZoneState string
   118  
   119  const (
   120  	stateInitial           = ZoneState("Initial")
   121  	stateNormal            = ZoneState("Normal")
   122  	stateFullDisruption    = ZoneState("FullDisruption")
   123  	statePartialDisruption = ZoneState("PartialDisruption")
   124  )
   125  
   126  const (
   127  	// The amount of time the node controller should sleep between retries of node health updates
   128  	retrySleepTime   = 20 * time.Millisecond
   129  	nodeNameKeyIndex = "spec.nodeName"
   130  	// podUpdateWorkerSize assumes that in most cases pods will be handled by the monitorNodeHealth pass.
   131  	// Pod update workers will only handle lagging cache pods. 4 workers should be enough.
   132  	podUpdateWorkerSize = 4
   133  	// nodeUpdateWorkerSize defines the number of workers handling node updates.
   134  	nodeUpdateWorkerSize = 8
   135  
   136  	// taintEvictionController is defined here in order to prevent imports of
   137  	// k8s.io/kubernetes/cmd/kube-controller-manager/names which would result in validation errors.
   138  	// This constant will be removed upon graduation of the SeparateTaintEvictionController feature.
   139  	taintEvictionController = "taint-eviction-controller"
   140  )
   141  
   142  // labelReconcileInfo lists Node labels to reconcile, and how to reconcile them.
   143  // primaryKey and secondaryKey are keys of labels to reconcile.
   144  //   - If both keys exist but their values don't match, the value from the
   145  //     primaryKey is used as the source of truth to reconcile.
   146  //   - If ensureSecondaryExists is true and the secondaryKey does not
   147  //     exist, the secondaryKey will be added with the value of the primaryKey.
   148  var labelReconcileInfo = []struct {
   149  	primaryKey            string
   150  	secondaryKey          string
   151  	ensureSecondaryExists bool
   152  }{
   153  	{
   154  		// Reconcile the beta and the stable OS label using the stable label as the source of truth.
   155  		// TODO(#89477): no earlier than 1.22: drop the beta labels if they differ from the GA labels
   156  		primaryKey:            v1.LabelOSStable,
   157  		secondaryKey:          kubeletapis.LabelOS,
   158  		ensureSecondaryExists: true,
   159  	},
   160  	{
   161  		// Reconcile the beta and the stable arch label using the stable label as the source of truth.
   162  		// TODO(#89477): no earlier than 1.22: drop the beta labels if they differ from the GA labels
   163  		primaryKey:            v1.LabelArchStable,
   164  		secondaryKey:          kubeletapis.LabelArch,
   165  		ensureSecondaryExists: true,
   166  	},
   167  }
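
         // For illustration: a node labeled kubernetes.io/os=linux but beta.kubernetes.io/os=windows
         // has the beta label rewritten to "linux"; if the beta label is missing entirely, it is
         // added with the value "linux" because ensureSecondaryExists is true.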
   168  
   169  type nodeHealthData struct {
   170  	probeTimestamp           metav1.Time
   171  	readyTransitionTimestamp metav1.Time
   172  	status                   *v1.NodeStatus
   173  	lease                    *coordv1.Lease
   174  }
   175  
   176  func (n *nodeHealthData) deepCopy() *nodeHealthData {
   177  	if n == nil {
   178  		return nil
   179  	}
   180  	return &nodeHealthData{
   181  		probeTimestamp:           n.probeTimestamp,
   182  		readyTransitionTimestamp: n.readyTransitionTimestamp,
   183  		status:                   n.status.DeepCopy(),
   184  		lease:                    n.lease.DeepCopy(),
   185  	}
   186  }
   187  
   188  type nodeHealthMap struct {
   189  	lock        sync.RWMutex
   190  	nodeHealths map[string]*nodeHealthData
   191  }
   192  
   193  func newNodeHealthMap() *nodeHealthMap {
   194  	return &nodeHealthMap{
   195  		nodeHealths: make(map[string]*nodeHealthData),
   196  	}
   197  }
   198  
   199  // getDeepCopy returns a copy of the node health data.
   200  // It prevents the data from being changed after it is retrieved from the map.
   201  func (n *nodeHealthMap) getDeepCopy(name string) *nodeHealthData {
   202  	n.lock.RLock()
   203  	defer n.lock.RUnlock()
   204  	return n.nodeHealths[name].deepCopy()
   205  }
   206  
   207  func (n *nodeHealthMap) set(name string, data *nodeHealthData) {
   208  	n.lock.Lock()
   209  	defer n.lock.Unlock()
   210  	n.nodeHealths[name] = data
   211  }
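
         // Typical usage is read-modify-write: callers fetch a copy with getDeepCopy, mutate the
         // copy (e.g. handleDisruption bumps probeTimestamp when exiting full disruption), and
         // store it back with set; the deep copy keeps concurrent readers from observing the
         // mutation before it is written back.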
   212  
   213  type podUpdateItem struct {
   214  	namespace string
   215  	name      string
   216  }
   217  
   218  // Controller is the controller that manages node's life cycle.
   219  type Controller struct {
   220  	taintManager *tainteviction.Controller
   221  
   222  	podLister         corelisters.PodLister
   223  	podInformerSynced cache.InformerSynced
   224  	kubeClient        clientset.Interface
   225  
   226  	// This timestamp is to be used instead of LastProbeTime stored in Condition. We do this
   227  	// to avoid the problem with time skew across the cluster.
   228  	now func() metav1.Time
   229  
   230  	enterPartialDisruptionFunc func(nodeNum int) float32
   231  	enterFullDisruptionFunc    func(nodeNum int) float32
   232  	computeZoneStateFunc       func(nodeConditions []*v1.NodeCondition) (int, ZoneState)
   233  
   234  	knownNodeSet map[string]*v1.Node
   235  	// per Node map storing last observed health together with a local time when it was observed.
   236  	nodeHealthMap *nodeHealthMap
   237  
   238  	// evictorLock protects zonePodEvictor and zoneNoExecuteTainter.
   239  	evictorLock sync.Mutex
   240  	// workers that are responsible for tainting nodes.
   241  	zoneNoExecuteTainter map[string]*scheduler.RateLimitedTimedQueue
   242  
   243  	nodesToRetry sync.Map
   244  
   245  	zoneStates map[string]ZoneState
   246  
   247  	daemonSetStore          appsv1listers.DaemonSetLister
   248  	daemonSetInformerSynced cache.InformerSynced
   249  
   250  	leaseLister         coordlisters.LeaseLister
   251  	leaseInformerSynced cache.InformerSynced
   252  	nodeLister          corelisters.NodeLister
   253  	nodeInformerSynced  cache.InformerSynced
   254  
   255  	getPodsAssignedToNode func(nodeName string) ([]*v1.Pod, error)
   256  
   257  	broadcaster record.EventBroadcaster
   258  	recorder    record.EventRecorder
   259  
   260  	// Value controlling the Controller monitoring period, i.e. how often the Controller
   261  	// checks the node health signal posted by the kubelet. This value should be lower than
   262  	// nodeMonitorGracePeriod.
   263  	// TODO: Change node health monitor to watch based.
   264  	nodeMonitorPeriod time.Duration
   265  
   266  	// When a node is newly created, e.g. at cluster bootstrap or on node registration,
   267  	// we give it a longer grace period.
   268  	nodeStartupGracePeriod time.Duration
   269  
   270  	// The Controller does not proactively sync node health; it monitors the node
   271  	// health signals updated by the kubelet. There are 2 kinds of node healthiness
   272  	// signals: NodeStatus and NodeLease. If the Controller doesn't receive an update
   273  	// for this amount of time, it will start posting "NodeReady==ConditionUnknown".
   274  	// The amount of time before the Controller starts evicting pods is controlled
   275  	// via the flag 'pod-eviction-timeout'.
   276  	// Note: be cautious when changing the constant, it must work with
   277  	// nodeStatusUpdateFrequency in kubelet and renewInterval in the NodeLease
   278  	// controller. The node health signal update frequency is the minimum of the
   279  	// two.
   280  	// There are several constraints:
   281  	// 1. nodeMonitorGracePeriod must be N times more than the node health signal
   282  	//    update frequency, where N is the number of retries allowed for the kubelet
   283  	//    to post node status/lease. It is pointless to make nodeMonitorGracePeriod
   284  	//    less than the node health signal update frequency, since there will
   285  	//    only be fresh values from the kubelet at an interval of the node health
   286  	//    signal update frequency.
   287  	// 2. nodeMonitorGracePeriod can't be too large for user experience - a larger
   288  	//    value means it takes longer for the user to see up-to-date node health.
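         	// For example (illustrative values only): if the kubelet renews its lease or
         	// posts status roughly every 10s and we want to tolerate about 4 missed
         	// updates, nodeMonitorGracePeriod should be at least ~40s.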
   289  	nodeMonitorGracePeriod time.Duration
   290  
   291  	// Number of workers the Controller uses to process node monitor health updates.
   292  	// Defaults to nodeUpdateWorkerSize.
   293  	nodeUpdateWorkerSize int
   294  
   295  	evictionLimiterQPS          float32
   296  	secondaryEvictionLimiterQPS float32
   297  	largeClusterThreshold       int32
   298  	unhealthyZoneThreshold      float32
   299  
   300  	nodeUpdateQueue workqueue.Interface
   301  	podUpdateQueue  workqueue.RateLimitingInterface
   302  }
   303  
   304  // NewNodeLifecycleController returns a new node lifecycle controller.
   305  func NewNodeLifecycleController(
   306  	ctx context.Context,
   307  	leaseInformer coordinformers.LeaseInformer,
   308  	podInformer coreinformers.PodInformer,
   309  	nodeInformer coreinformers.NodeInformer,
   310  	daemonSetInformer appsv1informers.DaemonSetInformer,
   311  	kubeClient clientset.Interface,
   312  	nodeMonitorPeriod time.Duration,
   313  	nodeStartupGracePeriod time.Duration,
   314  	nodeMonitorGracePeriod time.Duration,
   315  	evictionLimiterQPS float32,
   316  	secondaryEvictionLimiterQPS float32,
   317  	largeClusterThreshold int32,
   318  	unhealthyZoneThreshold float32,
   319  ) (*Controller, error) {
   320  	logger := klog.FromContext(ctx)
   321  	if kubeClient == nil {
   322  		logger.Error(nil, "kubeClient is nil when starting nodelifecycle Controller")
   323  		klog.FlushAndExit(klog.ExitFlushTimeout, 1)
   324  	}
   325  
   326  	eventBroadcaster := record.NewBroadcaster()
   327  	recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "node-controller"})
   328  
   329  	nc := &Controller{
   330  		kubeClient:                  kubeClient,
   331  		now:                         metav1.Now,
   332  		knownNodeSet:                make(map[string]*v1.Node),
   333  		nodeHealthMap:               newNodeHealthMap(),
   334  		broadcaster:                 eventBroadcaster,
   335  		recorder:                    recorder,
   336  		nodeMonitorPeriod:           nodeMonitorPeriod,
   337  		nodeStartupGracePeriod:      nodeStartupGracePeriod,
   338  		nodeMonitorGracePeriod:      nodeMonitorGracePeriod,
   339  		nodeUpdateWorkerSize:        nodeUpdateWorkerSize,
   340  		zoneNoExecuteTainter:        make(map[string]*scheduler.RateLimitedTimedQueue),
   341  		nodesToRetry:                sync.Map{},
   342  		zoneStates:                  make(map[string]ZoneState),
   343  		evictionLimiterQPS:          evictionLimiterQPS,
   344  		secondaryEvictionLimiterQPS: secondaryEvictionLimiterQPS,
   345  		largeClusterThreshold:       largeClusterThreshold,
   346  		unhealthyZoneThreshold:      unhealthyZoneThreshold,
   347  		nodeUpdateQueue:             workqueue.NewNamed("node_lifecycle_controller"),
   348  		podUpdateQueue:              workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "node_lifecycle_controller_pods"),
   349  	}
   350  
   351  	nc.enterPartialDisruptionFunc = nc.ReducedQPSFunc
   352  	nc.enterFullDisruptionFunc = nc.HealthyQPSFunc
   353  	nc.computeZoneStateFunc = nc.ComputeZoneState
   354  
   355  	podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   356  		AddFunc: func(obj interface{}) {
   357  			pod := obj.(*v1.Pod)
   358  			nc.podUpdated(nil, pod)
   359  		},
   360  		UpdateFunc: func(prev, obj interface{}) {
   361  			prevPod := prev.(*v1.Pod)
   362  			newPod := obj.(*v1.Pod)
   363  			nc.podUpdated(prevPod, newPod)
   364  		},
   365  		DeleteFunc: func(obj interface{}) {
   366  			pod, isPod := obj.(*v1.Pod)
   367  			// We can get DeletedFinalStateUnknown instead of *v1.Pod here and we need to handle that correctly.
   368  			if !isPod {
   369  				deletedState, ok := obj.(cache.DeletedFinalStateUnknown)
   370  				if !ok {
   371  					logger.Error(nil, "Received unexpected object", "object", obj)
   372  					return
   373  				}
   374  				pod, ok = deletedState.Obj.(*v1.Pod)
   375  				if !ok {
   376  					logger.Error(nil, "DeletedFinalStateUnknown contained non-Pod object", "object", deletedState.Obj)
   377  					return
   378  				}
   379  			}
   380  			nc.podUpdated(pod, nil)
   381  		},
   382  	})
   383  	nc.podInformerSynced = podInformer.Informer().HasSynced
   384  	podInformer.Informer().AddIndexers(cache.Indexers{
   385  		nodeNameKeyIndex: func(obj interface{}) ([]string, error) {
   386  			pod, ok := obj.(*v1.Pod)
   387  			if !ok {
   388  				return []string{}, nil
   389  			}
   390  			if len(pod.Spec.NodeName) == 0 {
   391  				return []string{}, nil
   392  			}
   393  			return []string{pod.Spec.NodeName}, nil
   394  		},
   395  	})
   396  
   397  	podIndexer := podInformer.Informer().GetIndexer()
   398  	nc.getPodsAssignedToNode = func(nodeName string) ([]*v1.Pod, error) {
   399  		objs, err := podIndexer.ByIndex(nodeNameKeyIndex, nodeName)
   400  		if err != nil {
   401  			return nil, err
   402  		}
   403  		pods := make([]*v1.Pod, 0, len(objs))
   404  		for _, obj := range objs {
   405  			pod, ok := obj.(*v1.Pod)
   406  			if !ok {
   407  				continue
   408  			}
   409  			pods = append(pods, pod)
   410  		}
   411  		return pods, nil
   412  	}
   413  	nc.podLister = podInformer.Lister()
   414  	nc.nodeLister = nodeInformer.Lister()
   415  
   416  	if !utilfeature.DefaultFeatureGate.Enabled(features.SeparateTaintEvictionController) {
   417  		logger.Info("Running TaintEvictionController as part of NodeLifecycleController")
   418  		tm, err := tainteviction.New(ctx, kubeClient, podInformer, nodeInformer, taintEvictionController)
   419  		if err != nil {
   420  			return nil, err
   421  		}
   422  		nc.taintManager = tm
   423  	}
   424  
   425  	logger.Info("Controller will reconcile labels")
   426  	nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   427  		AddFunc: controllerutil.CreateAddNodeHandler(func(node *v1.Node) error {
   428  			nc.nodeUpdateQueue.Add(node.Name)
   429  			return nil
   430  		}),
   431  		UpdateFunc: controllerutil.CreateUpdateNodeHandler(func(_, newNode *v1.Node) error {
   432  			nc.nodeUpdateQueue.Add(newNode.Name)
   433  			return nil
   434  		}),
   435  		DeleteFunc: controllerutil.CreateDeleteNodeHandler(logger, func(node *v1.Node) error {
   436  			nc.nodesToRetry.Delete(node.Name)
   437  			return nil
   438  		}),
   439  	})
   440  
   441  	nc.leaseLister = leaseInformer.Lister()
   442  	nc.leaseInformerSynced = leaseInformer.Informer().HasSynced
   443  
   444  	nc.nodeInformerSynced = nodeInformer.Informer().HasSynced
   445  
   446  	nc.daemonSetStore = daemonSetInformer.Lister()
   447  	nc.daemonSetInformerSynced = daemonSetInformer.Informer().HasSynced
   448  
   449  	return nc, nil
   450  }
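
         // A minimal construction sketch (hypothetical wiring; the real setup lives in
         // kube-controller-manager, and the numeric values below merely mirror common
         // flag defaults):
         //
         //	factory := informers.NewSharedInformerFactory(client, 0)
         //	nc, err := NewNodeLifecycleController(ctx,
         //		factory.Coordination().V1().Leases(),
         //		factory.Core().V1().Pods(),
         //		factory.Core().V1().Nodes(),
         //		factory.Apps().V1().DaemonSets(),
         //		client,
         //		5*time.Second,  // nodeMonitorPeriod
         //		60*time.Second, // nodeStartupGracePeriod
         //		40*time.Second, // nodeMonitorGracePeriod
         //		0.1, 0.01,      // evictionLimiterQPS, secondaryEvictionLimiterQPS
         //		50, 0.55,       // largeClusterThreshold, unhealthyZoneThreshold
         //	)
         //	if err == nil {
         //		factory.Start(ctx.Done())
         //		go nc.Run(ctx)
         //	}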
   451  
   452  // Run starts an asynchronous loop that monitors the status of cluster nodes.
   453  func (nc *Controller) Run(ctx context.Context) {
   454  	defer utilruntime.HandleCrash()
   455  
   456  	// Start events processing pipeline.
   457  	nc.broadcaster.StartStructuredLogging(0)
   458  	logger := klog.FromContext(ctx)
   459  	logger.Info("Sending events to api server")
   460  	nc.broadcaster.StartRecordingToSink(
   461  		&v1core.EventSinkImpl{
   462  			Interface: v1core.New(nc.kubeClient.CoreV1().RESTClient()).Events(""),
   463  		})
   464  	defer nc.broadcaster.Shutdown()
   465  
   466  	// Close the node update queue to clean up the goroutine.
   467  	defer nc.nodeUpdateQueue.ShutDown()
   468  	defer nc.podUpdateQueue.ShutDown()
   469  
   470  	logger.Info("Starting node controller")
   471  	defer logger.Info("Shutting down node controller")
   472  
   473  	if !cache.WaitForNamedCacheSync("taint", ctx.Done(), nc.leaseInformerSynced, nc.nodeInformerSynced, nc.podInformerSynced, nc.daemonSetInformerSynced) {
   474  		return
   475  	}
   476  
   477  	if !utilfeature.DefaultFeatureGate.Enabled(features.SeparateTaintEvictionController) {
   478  		logger.Info("Starting", "controller", taintEvictionController)
   479  		go nc.taintManager.Run(ctx)
   480  	}
   481  
   482  	// Start workers to reconcile labels and/or update NoSchedule taint for nodes.
   483  	for i := 0; i < nodeUpdateWorkerSize; i++ {
   484  		// Thanks to "workqueue", each worker just needs to get an item from the queue, because
   485  		// the item is flagged while it is being handled: if a new event comes in, the item will
   486  		// be re-queued until "Done" is called, so no more than one worker handles the same item
   487  		// and no event is missed.
   488  		go wait.UntilWithContext(ctx, nc.doNodeProcessingPassWorker, time.Second)
   489  	}
   490  
   491  	for i := 0; i < podUpdateWorkerSize; i++ {
   492  		go wait.UntilWithContext(ctx, nc.doPodProcessingWorker, time.Second)
   493  	}
   494  
   495  	// Handle taint-based evictions. Because we don't want dedicated logic in the TaintManager for NC-originated
   496  	// taints, and we normally don't rate limit evictions caused by taints, we need to rate limit the adding of taints.
   497  	go wait.UntilWithContext(ctx, nc.doNoExecuteTaintingPass, scheduler.NodeEvictionPeriod)
   498  
   499  	// Incorporate the results of the node health signal pushed from the kubelet to the master.
   500  	go wait.UntilWithContext(ctx, func(ctx context.Context) {
   501  		if err := nc.monitorNodeHealth(ctx); err != nil {
   502  			logger.Error(err, "Error monitoring node health")
   503  		}
   504  	}, nc.nodeMonitorPeriod)
   505  
   506  	<-ctx.Done()
   507  }
   508  
   509  func (nc *Controller) doNodeProcessingPassWorker(ctx context.Context) {
   510  	logger := klog.FromContext(ctx)
   511  	for {
   512  		obj, shutdown := nc.nodeUpdateQueue.Get()
   513  		// "nodeUpdateQueue" will be shut down when the context is cancelled;
   514  		// we do not need to re-check the context here.
   515  		if shutdown {
   516  			return
   517  		}
   518  		nodeName := obj.(string)
   519  		if err := nc.doNoScheduleTaintingPass(ctx, nodeName); err != nil {
   520  			logger.Error(err, "Failed to taint NoSchedule on node, requeue it", "node", klog.KRef("", nodeName))
   521  			// TODO(k82cn): Add nodeName back to the queue
   522  		}
   523  		// TODO: re-evaluate whether there are any labels that need to be
   524  		// reconciled in 1.19. Remove this function if it's no longer necessary.
   525  		if err := nc.reconcileNodeLabels(ctx, nodeName); err != nil {
   526  			logger.Error(err, "Failed to reconcile labels for node, requeue it", "node", klog.KRef("", nodeName))
   527  			// TODO(yujuhong): Add nodeName back to the queue
   528  		}
   529  		nc.nodeUpdateQueue.Done(nodeName)
   530  	}
   531  }
   532  
   533  func (nc *Controller) doNoScheduleTaintingPass(ctx context.Context, nodeName string) error {
   534  	node, err := nc.nodeLister.Get(nodeName)
   535  	if err != nil {
   536  		// If node not found, just ignore it.
   537  		if apierrors.IsNotFound(err) {
   538  			return nil
   539  		}
   540  		return err
   541  	}
   542  
   543  	// Map node's condition to Taints.
   544  	var taints []v1.Taint
   545  	for _, condition := range node.Status.Conditions {
   546  		if taintMap, found := nodeConditionToTaintKeyStatusMap[condition.Type]; found {
   547  			if taintKey, found := taintMap[condition.Status]; found {
   548  				taints = append(taints, v1.Taint{
   549  					Key:    taintKey,
   550  					Effect: v1.TaintEffectNoSchedule,
   551  				})
   552  			}
   553  		}
   554  	}
   555  	if node.Spec.Unschedulable {
   556  		// If unschedulable, append related taint.
   557  		taints = append(taints, v1.Taint{
   558  			Key:    v1.TaintNodeUnschedulable,
   559  			Effect: v1.TaintEffectNoSchedule,
   560  		})
   561  	}
   562  
   563  	// Get the existing taints of the node.
   564  	nodeTaints := taintutils.TaintSetFilter(node.Spec.Taints, func(t *v1.Taint) bool {
   565  		// only NoSchedule taints are candidates to be compared with "taints" later
   566  		if t.Effect != v1.TaintEffectNoSchedule {
   567  			return false
   568  		}
   569  		// Find unschedulable taint of node.
   570  		if t.Key == v1.TaintNodeUnschedulable {
   571  			return true
   572  		}
   573  		// Find node condition taints of node.
   574  		_, found := taintKeyToNodeConditionMap[t.Key]
   575  		return found
   576  	})
   577  	taintsToAdd, taintsToDel := taintutils.TaintSetDiff(taints, nodeTaints)
   578  	// If there is nothing to add or delete, return directly.
   579  	if len(taintsToAdd) == 0 && len(taintsToDel) == 0 {
   580  		return nil
   581  	}
   582  	if !controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, taintsToAdd, taintsToDel, node) {
   583  		return fmt.Errorf("failed to swap taints of node %+v", node)
   584  	}
   585  	return nil
   586  }
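
         // As an illustrative walk-through: a cordoned node reporting MemoryPressure=True should
         // carry {node.kubernetes.io/memory-pressure, node.kubernetes.io/unschedulable}:NoSchedule;
         // if it still carries a stale node.kubernetes.io/not-ready:NoSchedule taint, TaintSetDiff
         // places the missing desired taints in taintsToAdd and the stale one in taintsToDel.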
   587  
   588  func (nc *Controller) doNoExecuteTaintingPass(ctx context.Context) {
   589  	// Extract the keys of the map so that we do not hold
   590  	// the evictorLock for the entire function, but only
   591  	// when necessary.
   592  	var zoneNoExecuteTainterKeys []string
   593  	func() {
   594  		nc.evictorLock.Lock()
   595  		defer nc.evictorLock.Unlock()
   596  
   597  		zoneNoExecuteTainterKeys = make([]string, 0, len(nc.zoneNoExecuteTainter))
   598  		for k := range nc.zoneNoExecuteTainter {
   599  			zoneNoExecuteTainterKeys = append(zoneNoExecuteTainterKeys, k)
   600  		}
   601  	}()
   602  	logger := klog.FromContext(ctx)
   603  	for _, k := range zoneNoExecuteTainterKeys {
   604  		var zoneNoExecuteTainterWorker *scheduler.RateLimitedTimedQueue
   605  		func() {
   606  			nc.evictorLock.Lock()
   607  			defer nc.evictorLock.Unlock()
   608  			// Extracting the value without checking if the key
   609  			// exists or not is safe to do here since zones do
   610  			// not get removed, and consequently pod evictors for
   611  			// these zones also do not get removed, only added.
   612  			zoneNoExecuteTainterWorker = nc.zoneNoExecuteTainter[k]
   613  		}()
   614  		// Function should return 'false' and a time after which it should be retried, or 'true' if it shouldn't (it succeeded).
   615  		zoneNoExecuteTainterWorker.Try(logger, func(value scheduler.TimedValue) (bool, time.Duration) {
   616  			node, err := nc.nodeLister.Get(value.Value)
   617  			if apierrors.IsNotFound(err) {
   618  				logger.Info("Node no longer present in nodeLister", "node", klog.KRef("", value.Value))
   619  				return true, 0
   620  			} else if err != nil {
   621  				logger.Info("Failed to get Node from the nodeLister", "node", klog.KRef("", value.Value), "err", err)
   622  				// retry in 50 milliseconds
   623  				return false, 50 * time.Millisecond
   624  			}
   625  			_, condition := controllerutil.GetNodeCondition(&node.Status, v1.NodeReady)
   626  			// Because we want to mimic NodeStatus.Condition["Ready"] we make "unreachable" and "not ready" taints mutually exclusive.
   627  			taintToAdd := v1.Taint{}
   628  			oppositeTaint := v1.Taint{}
   629  			switch condition.Status {
   630  			case v1.ConditionFalse:
   631  				taintToAdd = *NotReadyTaintTemplate
   632  				oppositeTaint = *UnreachableTaintTemplate
   633  			case v1.ConditionUnknown:
   634  				taintToAdd = *UnreachableTaintTemplate
   635  				oppositeTaint = *NotReadyTaintTemplate
   636  			default:
   637  				// It seems that the Node is ready again, so there's no need to taint it.
   638  				logger.V(4).Info("Node was in a taint queue, but it's ready now. Ignoring taint request", "node", klog.KRef("", value.Value))
   639  				return true, 0
   640  			}
   641  			result := controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{&oppositeTaint}, node)
   642  			if result {
   643  				// Count the number of evictions.
   644  				zone := nodetopology.GetZoneKey(node)
   645  				evictionsTotal.WithLabelValues(zone).Inc()
   646  			}
   647  
   648  			return result, 0
   649  		})
   650  	}
   651  }
   652  
   653  // monitorNodeHealth verifies that node health is constantly updated by the kubelet, and if not, posts "NodeReady==ConditionUnknown".
   654  // This function will
   655  //   - add nodes which are not ready or not reachable for a long period of time to a rate-limited
   656  //     queue so that NoExecute taints can be added by the goroutine running the doNoExecuteTaintingPass function,
   657  //   - update the PodReady condition of Pods according to the state of the Node Ready condition.
   658  func (nc *Controller) monitorNodeHealth(ctx context.Context) error {
   659  	start := nc.now()
   660  	defer func() {
   661  		updateAllNodesHealthDuration.Observe(time.Since(start.Time).Seconds())
   662  	}()
   663  
   664  	// We are listing nodes from the local cache as we can tolerate some small delays
   665  	// compared to the state in etcd, and there is eventual consistency anyway.
   666  	nodes, err := nc.nodeLister.List(labels.Everything())
   667  	if err != nil {
   668  		return err
   669  	}
   670  	added, deleted, newZoneRepresentatives := nc.classifyNodes(nodes)
   671  	logger := klog.FromContext(ctx)
   672  	for i := range newZoneRepresentatives {
   673  		nc.addPodEvictorForNewZone(logger, newZoneRepresentatives[i])
   674  	}
   675  	for i := range added {
   676  		logger.V(1).Info("Controller observed a new Node", "node", klog.KRef("", added[i].Name))
   677  		controllerutil.RecordNodeEvent(ctx, nc.recorder, added[i].Name, string(added[i].UID), v1.EventTypeNormal, "RegisteredNode", fmt.Sprintf("Registered Node %v in Controller", added[i].Name))
   678  		nc.knownNodeSet[added[i].Name] = added[i]
   679  		nc.addPodEvictorForNewZone(logger, added[i])
   680  		nc.markNodeAsReachable(ctx, added[i])
   681  	}
   682  
   683  	for i := range deleted {
   684  		logger.V(1).Info("Controller observed a Node deletion", "node", klog.KRef("", deleted[i].Name))
   685  		controllerutil.RecordNodeEvent(ctx, nc.recorder, deleted[i].Name, string(deleted[i].UID), v1.EventTypeNormal, "RemovingNode", fmt.Sprintf("Removing Node %v from Controller", deleted[i].Name))
   686  		delete(nc.knownNodeSet, deleted[i].Name)
   687  	}
   688  
   689  	var zoneToNodeConditionsLock sync.Mutex
   690  	zoneToNodeConditions := map[string][]*v1.NodeCondition{}
   691  	updateNodeFunc := func(piece int) {
   692  		start := nc.now()
   693  		defer func() {
   694  			updateNodeHealthDuration.Observe(time.Since(start.Time).Seconds())
   695  		}()
   696  
   697  		var observedReadyCondition v1.NodeCondition
   698  		var currentReadyCondition *v1.NodeCondition
   699  		node := nodes[piece].DeepCopy()
   700  
   701  		if err := wait.PollImmediate(retrySleepTime, retrySleepTime*scheduler.NodeHealthUpdateRetry, func() (bool, error) {
   702  			var err error
   703  			_, observedReadyCondition, currentReadyCondition, err = nc.tryUpdateNodeHealth(ctx, node)
   704  			if err == nil {
   705  				return true, nil
   706  			}
   707  			name := node.Name
   708  			node, err = nc.kubeClient.CoreV1().Nodes().Get(ctx, name, metav1.GetOptions{})
   709  			if err != nil {
   710  				logger.Error(nil, "Failed while getting a Node to retry updating node health. Probably Node was deleted", "node", klog.KRef("", name))
   711  				return false, err
   712  			}
   713  			return false, nil
   714  		}); err != nil {
   715  			logger.Error(err, "Update health of Node from Controller error, Skipping - no pods will be evicted", "node", klog.KObj(node))
   716  			return
   717  		}
   718  
   719  		// Some nodes may be excluded from disruption checking
   720  		if !isNodeExcludedFromDisruptionChecks(node) {
   721  			zoneToNodeConditionsLock.Lock()
   722  			zoneToNodeConditions[nodetopology.GetZoneKey(node)] = append(zoneToNodeConditions[nodetopology.GetZoneKey(node)], currentReadyCondition)
   723  			zoneToNodeConditionsLock.Unlock()
   724  		}
   725  
   726  		if currentReadyCondition != nil {
   727  			pods, err := nc.getPodsAssignedToNode(node.Name)
   728  			if err != nil {
   729  				utilruntime.HandleError(fmt.Errorf("unable to list pods of node %v: %v", node.Name, err))
   730  				if currentReadyCondition.Status != v1.ConditionTrue && observedReadyCondition.Status == v1.ConditionTrue {
   731  				// If an error happened during the node status transition (Ready -> NotReady),
   732  				// we need to mark the node for retry to force MarkPodsNotReady execution
   733  				// in the next iteration.
   734  					nc.nodesToRetry.Store(node.Name, struct{}{})
   735  				}
   736  				return
   737  			}
   738  			nc.processTaintBaseEviction(ctx, node, &observedReadyCondition)
   739  
   740  			_, needsRetry := nc.nodesToRetry.Load(node.Name)
   741  			switch {
   742  			case currentReadyCondition.Status != v1.ConditionTrue && observedReadyCondition.Status == v1.ConditionTrue:
   743  				// Report node event only once when status changed.
   744  				controllerutil.RecordNodeStatusChange(logger, nc.recorder, node, "NodeNotReady")
   745  				fallthrough
   746  			case needsRetry && observedReadyCondition.Status != v1.ConditionTrue:
   747  				if err = controllerutil.MarkPodsNotReady(ctx, nc.kubeClient, nc.recorder, pods, node.Name); err != nil {
   748  					utilruntime.HandleError(fmt.Errorf("unable to mark all pods NotReady on node %v: %v; queuing for retry", node.Name, err))
   749  					nc.nodesToRetry.Store(node.Name, struct{}{})
   750  					return
   751  				}
   752  			}
   753  		}
   754  		nc.nodesToRetry.Delete(node.Name)
   755  	}
   756  
   757  	// Marking the pods not ready on a node requires looping over them and
   758  	// updating each pod's status one at a time. This is performed serially, and
   759  	// can take a while if we're processing each node serially as well. So we
   760  	// process them with bounded concurrency instead, since most of the time is
   761  	// spent waiting on io.
   762  	workqueue.ParallelizeUntil(ctx, nc.nodeUpdateWorkerSize, len(nodes), updateNodeFunc)
   763  
   764  	nc.handleDisruption(ctx, zoneToNodeConditions, nodes)
   765  
   766  	return nil
   767  }
   768  
   769  func (nc *Controller) processTaintBaseEviction(ctx context.Context, node *v1.Node, observedReadyCondition *v1.NodeCondition) {
   770  	decisionTimestamp := nc.now()
   771  	// Check eviction timeout against decisionTimestamp
   772  	logger := klog.FromContext(ctx)
   773  	switch observedReadyCondition.Status {
   774  	case v1.ConditionFalse:
   775  		// We want to update the taint straight away if Node is already tainted with the UnreachableTaint
   776  		if taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) {
   777  			taintToAdd := *NotReadyTaintTemplate
   778  			if !controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{UnreachableTaintTemplate}, node) {
   779  				logger.Error(nil, "Failed to instantly swap UnreachableTaint to NotReadyTaint. Will try again in the next cycle")
   780  			}
   781  		} else if nc.markNodeForTainting(node, v1.ConditionFalse) {
   782  			logger.V(2).Info("Node is NotReady. Adding it to the Taint queue", "node", klog.KObj(node), "timeStamp", decisionTimestamp)
   783  		}
   784  	case v1.ConditionUnknown:
   785  		// We want to update the taint straight away if the Node is already tainted with the NotReadyTaint
   786  		if taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) {
   787  			taintToAdd := *UnreachableTaintTemplate
   788  			if !controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{NotReadyTaintTemplate}, node) {
   789  				logger.Error(nil, "Failed to instantly swap NotReadyTaint to UnreachableTaint. Will try again in the next cycle")
   790  			}
   791  		} else if nc.markNodeForTainting(node, v1.ConditionUnknown) {
   792  			logger.V(2).Info("Node is unresponsive. Adding it to the Taint queue", "node", klog.KObj(node), "timeStamp", decisionTimestamp)
   793  		}
   794  	case v1.ConditionTrue:
   795  		removed, err := nc.markNodeAsReachable(ctx, node)
   796  		if err != nil {
   797  			logger.Error(nil, "Failed to remove taints from node. Will retry in next iteration", "node", klog.KObj(node))
   798  		}
   799  		if removed {
   800  			logger.V(2).Info("Node is healthy again, removing all taints", "node", klog.KObj(node))
   801  		}
   802  	}
   803  }
   804  
   805  // labelNodeDisruptionExclusion is a label on nodes that controls whether they are
   806  // excluded from being considered for disruption checks by the node controller.
   807  const labelNodeDisruptionExclusion = "node.kubernetes.io/exclude-disruption"
   808  
   809  func isNodeExcludedFromDisruptionChecks(node *v1.Node) bool {
   810  	if _, ok := node.Labels[labelNodeDisruptionExclusion]; ok {
   811  		return true
   812  	}
   813  	return false
   814  }
   815  
   816  // tryUpdateNodeHealth checks a given node's conditions and tries to update it. It returns the grace period to
   817  // which the given node is entitled, the last observed and the current Ready Condition, and an error if one occurred.
   818  func (nc *Controller) tryUpdateNodeHealth(ctx context.Context, node *v1.Node) (time.Duration, v1.NodeCondition, *v1.NodeCondition, error) {
   819  	nodeHealth := nc.nodeHealthMap.getDeepCopy(node.Name)
   820  	defer func() {
   821  		nc.nodeHealthMap.set(node.Name, nodeHealth)
   822  	}()
   823  
   824  	var gracePeriod time.Duration
   825  	var observedReadyCondition v1.NodeCondition
   826  	_, currentReadyCondition := controllerutil.GetNodeCondition(&node.Status, v1.NodeReady)
   827  	if currentReadyCondition == nil {
   828  		// If the ready condition is nil, then the kubelet (or nodecontroller) never posted the node status.
   829  		// A fake ready condition is created, with LastHeartbeatTime and LastTransitionTime set
   830  		// to node.CreationTimestamp, to avoid having to handle this corner case.
   831  		observedReadyCondition = v1.NodeCondition{
   832  			Type:               v1.NodeReady,
   833  			Status:             v1.ConditionUnknown,
   834  			LastHeartbeatTime:  node.CreationTimestamp,
   835  			LastTransitionTime: node.CreationTimestamp,
   836  		}
   837  		gracePeriod = nc.nodeStartupGracePeriod
   838  		if nodeHealth != nil {
   839  			nodeHealth.status = &node.Status
   840  		} else {
   841  			nodeHealth = &nodeHealthData{
   842  				status:                   &node.Status,
   843  				probeTimestamp:           node.CreationTimestamp,
   844  				readyTransitionTimestamp: node.CreationTimestamp,
   845  			}
   846  		}
   847  	} else {
   848  		// If ready condition is not nil, make a copy of it, since we may modify it in place later.
   849  		observedReadyCondition = *currentReadyCondition
   850  		gracePeriod = nc.nodeMonitorGracePeriod
   851  	}
   852  	// There are the following cases to check:
   853  	// - both saved and new status have no Ready Condition set - we leave everything as it is,
   854  	// - saved status has no Ready Condition, but the current one does - the Controller was restarted with Node data already present in etcd,
   855  	// - saved status has some Ready Condition, but the current one does not - it's an error, but we fill it in because that's probably a good thing to do,
   856  	// - both saved and current statuses have Ready Conditions and they have the same LastProbeTime - nothing happened on that Node, it may be
   857  	//   unresponsive, so we leave it as it is,
   858  	// - both saved and current statuses have Ready Conditions, they have different LastProbeTimes, but the same Ready Condition State -
   859  	//   everything's in order, no transition occurred, we update only probeTimestamp,
   860  	// - both saved and current statuses have Ready Conditions, different LastProbeTimes and different Ready Condition States -
   861  	//   the Ready Condition changed its state since we last saw it, so we update both probeTimestamp and readyTransitionTimestamp.
   862  	// TODO: things to consider:
   863  	//   - if 'LastProbeTime' has gone back in time it's probably an error; currently we ignore it,
   864  	//   - currently the only correct Ready State transition outside of the Node Controller is the Kubelet marking the Node ready; we don't check
   865  	//     if that's the case, but it does not seem necessary.
   866  	var savedCondition *v1.NodeCondition
   867  	var savedLease *coordv1.Lease
   868  	if nodeHealth != nil {
   869  		_, savedCondition = controllerutil.GetNodeCondition(nodeHealth.status, v1.NodeReady)
   870  		savedLease = nodeHealth.lease
   871  	}
   872  	logger := klog.FromContext(ctx)
   873  	if nodeHealth == nil {
   874  		logger.Info("Missing timestamp for Node. Assuming now as a timestamp", "node", klog.KObj(node))
   875  		nodeHealth = &nodeHealthData{
   876  			status:                   &node.Status,
   877  			probeTimestamp:           nc.now(),
   878  			readyTransitionTimestamp: nc.now(),
   879  		}
   880  	} else if savedCondition == nil && currentReadyCondition != nil {
   881  		logger.V(1).Info("Creating timestamp entry for newly observed Node", "node", klog.KObj(node))
   882  		nodeHealth = &nodeHealthData{
   883  			status:                   &node.Status,
   884  			probeTimestamp:           nc.now(),
   885  			readyTransitionTimestamp: nc.now(),
   886  		}
   887  	} else if savedCondition != nil && currentReadyCondition == nil {
   888  		logger.Error(nil, "ReadyCondition was removed from Status of Node", "node", klog.KObj(node))
   889  		// TODO: figure out what to do in this case. For now we do the same thing as above.
   890  		nodeHealth = &nodeHealthData{
   891  			status:                   &node.Status,
   892  			probeTimestamp:           nc.now(),
   893  			readyTransitionTimestamp: nc.now(),
   894  		}
   895  	} else if savedCondition != nil && currentReadyCondition != nil && savedCondition.LastHeartbeatTime != currentReadyCondition.LastHeartbeatTime {
   896  		var transitionTime metav1.Time
   897  		// If ReadyCondition changed since the last time we checked, we update the transition timestamp to "now",
   898  		// otherwise we leave it as it is.
   899  		if savedCondition.LastTransitionTime != currentReadyCondition.LastTransitionTime {
   900  			logger.V(3).Info("ReadyCondition for Node transitioned from savedCondition to currentReadyCondition", "node", klog.KObj(node), "savedCondition", savedCondition, "currentReadyCondition", currentReadyCondition)
   901  			transitionTime = nc.now()
   902  		} else {
   903  			transitionTime = nodeHealth.readyTransitionTimestamp
   904  		}
   905  		if loggerV := logger.V(5); loggerV.Enabled() {
   906  			loggerV.Info("Node ReadyCondition updated. Updating timestamp", "node", klog.KObj(node), "nodeHealthStatus", nodeHealth.status, "nodeStatus", node.Status)
   907  		} else {
   908  			logger.V(3).Info("Node ReadyCondition updated. Updating timestamp", "node", klog.KObj(node))
   909  		}
   910  		nodeHealth = &nodeHealthData{
   911  			status:                   &node.Status,
   912  			probeTimestamp:           nc.now(),
   913  			readyTransitionTimestamp: transitionTime,
   914  		}
   915  	}
   916  	// Always update the probe time if node lease is renewed.
   917  	// Note: If kubelet never posted the node status, but continues renewing the
   918  	// heartbeat leases, the node controller will assume the node is healthy and
   919  	// take no action.
   920  	observedLease, _ := nc.leaseLister.Leases(v1.NamespaceNodeLease).Get(node.Name)
   921  	if observedLease != nil && (savedLease == nil || savedLease.Spec.RenewTime.Before(observedLease.Spec.RenewTime)) {
   922  		nodeHealth.lease = observedLease
   923  		nodeHealth.probeTimestamp = nc.now()
   924  	}
   925  
   926  	if nc.now().After(nodeHealth.probeTimestamp.Add(gracePeriod)) {
   927  		// NodeReady condition or lease was last set longer ago than gracePeriod, so
   928  		// update it to Unknown (regardless of its current value) in the master.
   929  
   930  		nodeConditionTypes := []v1.NodeConditionType{
   931  			v1.NodeReady,
   932  			v1.NodeMemoryPressure,
   933  			v1.NodeDiskPressure,
   934  			v1.NodePIDPressure,
   935  			// We don't change 'NodeNetworkUnavailable' condition, as it's managed on a control plane level.
   936  			// v1.NodeNetworkUnavailable,
   937  		}
   938  
   939  		nowTimestamp := nc.now()
   940  		for _, nodeConditionType := range nodeConditionTypes {
   941  			_, currentCondition := controllerutil.GetNodeCondition(&node.Status, nodeConditionType)
   942  			if currentCondition == nil {
   943  				logger.V(2).Info("Condition of node was never updated by kubelet", "nodeConditionType", nodeConditionType, "node", klog.KObj(node))
   944  				node.Status.Conditions = append(node.Status.Conditions, v1.NodeCondition{
   945  					Type:               nodeConditionType,
   946  					Status:             v1.ConditionUnknown,
   947  					Reason:             "NodeStatusNeverUpdated",
   948  					Message:            "Kubelet never posted node status.",
   949  					LastHeartbeatTime:  node.CreationTimestamp,
   950  					LastTransitionTime: nowTimestamp,
   951  				})
   952  			} else {
   953  				logger.V(2).Info("Node hasn't been updated",
   954  					"node", klog.KObj(node), "duration", nc.now().Time.Sub(nodeHealth.probeTimestamp.Time), "nodeConditionType", nodeConditionType, "currentCondition", currentCondition)
   955  				if currentCondition.Status != v1.ConditionUnknown {
   956  					currentCondition.Status = v1.ConditionUnknown
   957  					currentCondition.Reason = "NodeStatusUnknown"
   958  					currentCondition.Message = "Kubelet stopped posting node status."
   959  					currentCondition.LastTransitionTime = nowTimestamp
   960  				}
   961  			}
   962  		}
   963  		// We need to update currentReadyCondition because its value may have changed.
   964  		_, currentReadyCondition = controllerutil.GetNodeCondition(&node.Status, v1.NodeReady)
   965  
   966  		if !apiequality.Semantic.DeepEqual(currentReadyCondition, &observedReadyCondition) {
   967  			if _, err := nc.kubeClient.CoreV1().Nodes().UpdateStatus(ctx, node, metav1.UpdateOptions{}); err != nil {
   968  				logger.Error(err, "Error updating node", "node", klog.KObj(node))
   969  				return gracePeriod, observedReadyCondition, currentReadyCondition, err
   970  			}
   971  			nodeHealth = &nodeHealthData{
   972  				status:                   &node.Status,
   973  				probeTimestamp:           nodeHealth.probeTimestamp,
   974  				readyTransitionTimestamp: nc.now(),
   975  				lease:                    observedLease,
   976  			}
   977  			return gracePeriod, observedReadyCondition, currentReadyCondition, nil
   978  		}
   979  	}
   980  
   981  	return gracePeriod, observedReadyCondition, currentReadyCondition, nil
   982  }
   983  
   984  func (nc *Controller) handleDisruption(ctx context.Context, zoneToNodeConditions map[string][]*v1.NodeCondition, nodes []*v1.Node) {
   985  	newZoneStates := map[string]ZoneState{}
   986  	allAreFullyDisrupted := true
   987  	logger := klog.FromContext(ctx)
   988  	for k, v := range zoneToNodeConditions {
   989  		zoneSize.WithLabelValues(k).Set(float64(len(v)))
   990  		unhealthy, newState := nc.computeZoneStateFunc(v)
   991  		zoneHealth.WithLabelValues(k).Set(float64(100*(len(v)-unhealthy)) / float64(len(v)))
   992  		unhealthyNodes.WithLabelValues(k).Set(float64(unhealthy))
   993  		if newState != stateFullDisruption {
   994  			allAreFullyDisrupted = false
   995  		}
   996  		newZoneStates[k] = newState
   997  		if _, had := nc.zoneStates[k]; !had {
   998  			logger.Error(nil, "Setting initial state for unseen zone", "zone", k)
   999  			nc.zoneStates[k] = stateInitial
  1000  		}
  1001  	}
  1002  
  1003  	allWasFullyDisrupted := true
  1004  	for k, v := range nc.zoneStates {
  1005  		if _, have := zoneToNodeConditions[k]; !have {
  1006  			zoneSize.WithLabelValues(k).Set(0)
  1007  			zoneHealth.WithLabelValues(k).Set(100)
  1008  			unhealthyNodes.WithLabelValues(k).Set(0)
  1009  			delete(nc.zoneStates, k)
  1010  			continue
  1011  		}
  1012  		if v != stateFullDisruption {
  1013  			allWasFullyDisrupted = false
  1014  			break
  1015  		}
  1016  	}
  1017  
  1018  	// At least one node was responding in the previous pass or in the current pass. The semantics are as follows:
  1019  	// - if the new state is "partialDisruption" we call a user-defined function that returns a new limiter to use,
  1020  	// - if the new state is "normal" we resume normal operation (go back to default limiter settings),
  1021  	// - if the new state is "fullDisruption" we restore the normal eviction rate,
  1022  	//   - unless all zones in the cluster are in "fullDisruption" - in that case we stop all evictions.
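         	// For example: with zones A=Normal and B=FullDisruption (but not every zone fully
         	// disrupted), zone B's limiter is swapped to enterFullDisruptionFunc (the normal
         	// eviction QPS), while only a cluster-wide full disruption sets all limiters to 0.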
  1023  	if !allAreFullyDisrupted || !allWasFullyDisrupted {
  1024  		// We're switching to full disruption mode
  1025  		if allAreFullyDisrupted {
  1026  			logger.Info("Controller detected that all Nodes are not-Ready. Entering master disruption mode")
  1027  			for i := range nodes {
  1028  				_, err := nc.markNodeAsReachable(ctx, nodes[i])
  1029  				if err != nil {
  1030  					logger.Error(nil, "Failed to remove taints from Node", "node", klog.KObj(nodes[i]))
  1031  				}
  1032  			}
  1033  			// We stop all evictions.
  1034  			for k := range nc.zoneStates {
  1035  				nc.zoneNoExecuteTainter[k].SwapLimiter(0)
  1036  			}
  1037  			for k := range nc.zoneStates {
  1038  				nc.zoneStates[k] = stateFullDisruption
  1039  			}
  1040  			// All rate limiters are updated, so we can return early here.
  1041  			return
  1042  		}
  1043  		// We're exiting full disruption mode
  1044  		if allWasFullyDisrupted {
  1045  			logger.Info("Controller detected that some Nodes are Ready. Exiting master disruption mode")
  1046  			// When exiting disruption mode update probe timestamps on all Nodes.
  1047  			now := nc.now()
  1048  			for i := range nodes {
  1049  				v := nc.nodeHealthMap.getDeepCopy(nodes[i].Name)
  1050  				v.probeTimestamp = now
  1051  				v.readyTransitionTimestamp = now
  1052  				nc.nodeHealthMap.set(nodes[i].Name, v)
  1053  			}
  1054  			// We reset all rate limiters to settings appropriate for the given state.
  1055  			for k := range nc.zoneStates {
  1056  				nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newZoneStates[k])
  1057  				nc.zoneStates[k] = newZoneStates[k]
  1058  			}
  1059  			return
  1060  		}
  1061  		// We know that there's at least one zone that is not fully disrupted, so
  1062  		// we can use the default behavior for rate limiters.
  1063  		for k, v := range nc.zoneStates {
  1064  			newState := newZoneStates[k]
  1065  			if v == newState {
  1066  				continue
  1067  			}
  1068  			logger.Info("Controller detected that zone is now in new state", "zone", k, "newState", newState)
  1069  			nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newState)
  1070  			nc.zoneStates[k] = newState
  1071  		}
  1072  	}
  1073  }
  1074  
  1075  func (nc *Controller) podUpdated(oldPod, newPod *v1.Pod) {
  1076  	if newPod == nil {
  1077  		return
  1078  	}
  1079  	if len(newPod.Spec.NodeName) != 0 && (oldPod == nil || newPod.Spec.NodeName != oldPod.Spec.NodeName) {
  1080  		podItem := podUpdateItem{newPod.Namespace, newPod.Name}
  1081  		nc.podUpdateQueue.Add(podItem)
  1082  	}
  1083  }
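
         // In effect, a pod is enqueued only when it gains a node assignment or moves between
         // nodes: e.g. the scheduler binding a pending pod changes Spec.NodeName from "" to a
         // node name, which triggers processPod for that pod.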
  1084  
  1085  func (nc *Controller) doPodProcessingWorker(ctx context.Context) {
  1086  	for {
  1087  		obj, shutdown := nc.podUpdateQueue.Get()
  1088  		// "podUpdateQueue" will be shut down when the context is cancelled;
  1089  		// we do not need to re-check the context here.
  1090  		if shutdown {
  1091  			return
  1092  		}
  1093  
  1094  		podItem := obj.(podUpdateItem)
  1095  		nc.processPod(ctx, podItem)
  1096  	}
  1097  }
  1098  
  1099  // processPod processes events of pods being assigned to nodes. In particular:
  1100  // 1. for a NodeReady=true node, taint eviction for this pod will be cancelled
  1101  // 2. for a NodeReady=false or unknown node, taint eviction of the pod will happen and the pod will be marked as not ready
  1102  // 3. if the node doesn't exist in the cache, it will be skipped.
  1103  func (nc *Controller) processPod(ctx context.Context, podItem podUpdateItem) {
  1104  	defer nc.podUpdateQueue.Done(podItem)
  1105  	pod, err := nc.podLister.Pods(podItem.namespace).Get(podItem.name)
  1106  	logger := klog.FromContext(ctx)
  1107  	if err != nil {
  1108  		if apierrors.IsNotFound(err) {
  1109  			// If the pod was deleted, there is no need to requeue.
  1110  			return
  1111  		}
  1112  		logger.Info("Failed to read pod", "pod", klog.KRef(podItem.namespace, podItem.name), "err", err)
  1113  		nc.podUpdateQueue.AddRateLimited(podItem)
  1114  		return
  1115  	}
  1116  
  1117  	nodeName := pod.Spec.NodeName
  1118  
  1119  	nodeHealth := nc.nodeHealthMap.getDeepCopy(nodeName)
  1120  	if nodeHealth == nil {
  1121  		// Node data is not gathered yet or node has been removed in the meantime.
  1122  		return
  1123  	}
  1124  
  1125  	_, err = nc.nodeLister.Get(nodeName)
  1126  	if err != nil {
  1127  		logger.Info("Failed to read node", "node", klog.KRef("", nodeName), "err", err)
  1128  		nc.podUpdateQueue.AddRateLimited(podItem)
  1129  		return
  1130  	}
  1131  
  1132  	_, currentReadyCondition := controllerutil.GetNodeCondition(nodeHealth.status, v1.NodeReady)
  1133  	if currentReadyCondition == nil {
  1134  		// Lack of a NodeReady condition may only happen after node addition (or if it was maliciously deleted).
  1135  		// In both cases, the pod will be handled correctly (evicted if needed) during processing
  1136  		// of the next node update event.
  1137  		return
  1138  	}
  1139  
  1140  	pods := []*v1.Pod{pod}
  1141  	if currentReadyCondition.Status != v1.ConditionTrue {
  1142  		if err := controllerutil.MarkPodsNotReady(ctx, nc.kubeClient, nc.recorder, pods, nodeName); err != nil {
  1143  			logger.Info("Unable to mark pod NotReady on node", "pod", klog.KRef(podItem.namespace, podItem.name), "node", klog.KRef("", nodeName), "err", err)
  1144  			nc.podUpdateQueue.AddRateLimited(podItem)
  1145  		}
  1146  	}
  1147  }
  1148  
  1149  func (nc *Controller) setLimiterInZone(zone string, zoneSize int, state ZoneState) {
  1150  	switch state {
  1151  	case stateNormal:
  1152  		nc.zoneNoExecuteTainter[zone].SwapLimiter(nc.evictionLimiterQPS)
  1153  	case statePartialDisruption:
  1154  		nc.zoneNoExecuteTainter[zone].SwapLimiter(
  1155  			nc.enterPartialDisruptionFunc(zoneSize))
  1156  	case stateFullDisruption:
  1157  		nc.zoneNoExecuteTainter[zone].SwapLimiter(
  1158  			nc.enterFullDisruptionFunc(zoneSize))
  1159  	}
  1160  }
  1161  
  1162  // classifyNodes classifies allNodes into three categories:
  1163  //  1. added: nodes that are in 'allNodes' but not in 'knownNodeSet'
  1164  //  2. deleted: nodes that are in 'knownNodeSet' but not in 'allNodes'
  1165  //  3. newZoneRepresentatives: nodes that are in both 'knownNodeSet' and 'allNodes', but whose zone has no zone state yet
  1166  func (nc *Controller) classifyNodes(allNodes []*v1.Node) (added, deleted, newZoneRepresentatives []*v1.Node) {
  1167  	for i := range allNodes {
  1168  		if _, has := nc.knownNodeSet[allNodes[i].Name]; !has {
  1169  			added = append(added, allNodes[i])
  1170  		} else {
  1171  			// Currently, we only consider new zone as updated.
  1172  			zone := nodetopology.GetZoneKey(allNodes[i])
  1173  			if _, found := nc.zoneStates[zone]; !found {
  1174  				newZoneRepresentatives = append(newZoneRepresentatives, allNodes[i])
  1175  			}
  1176  		}
  1177  	}
  1178  
  1179  	// If there's a difference between lengths of known Nodes and observed nodes
  1180  	// we must have removed some Node.
  1181  	if len(nc.knownNodeSet)+len(added) != len(allNodes) {
  1182  		knowSetCopy := map[string]*v1.Node{}
  1183  		for k, v := range nc.knownNodeSet {
  1184  			knowSetCopy[k] = v
  1185  		}
  1186  		for i := range allNodes {
  1187  			delete(knowSetCopy, allNodes[i].Name)
  1188  		}
  1189  		for i := range knowSetCopy {
  1190  			deleted = append(deleted, knowSetCopy[i])
  1191  		}
  1192  	}
  1193  	return
  1194  }
  1195  
  1196  // HealthyQPSFunc returns the default value for cluster eviction rate - we take
  1197  // nodeNum for consistency with ReducedQPSFunc.
  1198  func (nc *Controller) HealthyQPSFunc(nodeNum int) float32 {
  1199  	return nc.evictionLimiterQPS
  1200  }
  1201  
  1202  // ReducedQPSFunc returns the reduced QPS: when the cluster is large it makes
  1203  // evictions slower, and when it is small it stops evictions altogether.
  1204  func (nc *Controller) ReducedQPSFunc(nodeNum int) float32 {
  1205  	if int32(nodeNum) > nc.largeClusterThreshold {
  1206  		return nc.secondaryEvictionLimiterQPS
  1207  	}
  1208  	return 0
  1209  }
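
         // Worked example (illustrative numbers): with largeClusterThreshold=50 and
         // secondaryEvictionLimiterQPS=0.01, a partially disrupted 100-node zone is tainted at
         // 0.01 QPS (roughly one node per 100s), while a 40-node zone's rate drops to 0 and no
         // further NoExecute taints are added for that zone.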
  1210  
  1211  // addPodEvictorForNewZone checks if a new zone appeared, and if so adds a new evictor.
  1212  func (nc *Controller) addPodEvictorForNewZone(logger klog.Logger, node *v1.Node) {
  1213  	nc.evictorLock.Lock()
  1214  	defer nc.evictorLock.Unlock()
  1215  	zone := nodetopology.GetZoneKey(node)
  1216  	if _, found := nc.zoneStates[zone]; !found {
  1217  		nc.zoneStates[zone] = stateInitial
  1218  		nc.zoneNoExecuteTainter[zone] =
  1219  			scheduler.NewRateLimitedTimedQueue(
  1220  				flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, scheduler.EvictionRateLimiterBurst))
  1221  		// Init the metric for the new zone.
  1222  		logger.Info("Initializing eviction metric for zone", "zone", zone)
  1223  		evictionsTotal.WithLabelValues(zone).Add(0)
  1224  	}
  1225  }
  1226  
  1227  func (nc *Controller) markNodeForTainting(node *v1.Node, status v1.ConditionStatus) bool {
  1228  	nc.evictorLock.Lock()
  1229  	defer nc.evictorLock.Unlock()
  1230  	if status == v1.ConditionFalse {
  1231  		if !taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) {
  1232  			nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Remove(node.Name)
  1233  		}
  1234  	}
  1235  
  1236  	if status == v1.ConditionUnknown {
  1237  		if !taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) {
  1238  			nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Remove(node.Name)
  1239  		}
  1240  	}
  1241  
  1242  	return nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Add(node.Name, string(node.UID))
  1243  }
  1244  
  1245  func (nc *Controller) markNodeAsReachable(ctx context.Context, node *v1.Node) (bool, error) {
  1246  	err := controller.RemoveTaintOffNode(ctx, nc.kubeClient, node.Name, node, UnreachableTaintTemplate)
  1247  	logger := klog.FromContext(ctx)
  1248  	if err != nil {
  1249  		logger.Error(err, "Failed to remove taint from node", "node", klog.KObj(node))
  1250  		return false, err
  1251  	}
  1252  	err = controller.RemoveTaintOffNode(ctx, nc.kubeClient, node.Name, node, NotReadyTaintTemplate)
  1253  	if err != nil {
  1254  		logger.Error(err, "Failed to remove taint from node", "node", klog.KObj(node))
  1255  		return false, err
  1256  	}
  1257  	nc.evictorLock.Lock()
  1258  	defer nc.evictorLock.Unlock()
  1259  
  1260  	return nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Remove(node.Name), nil
  1261  }
  1262  
  1263  // ComputeZoneState takes the NodeReadyConditions for all Nodes in a given zone and returns
  1264  // the number of not-Ready Nodes together with the zone's state. The zone is considered:
  1265  // - fullyDisrupted if there are no Ready Nodes,
  1266  // - partiallyDisrupted if more than two Nodes and at least nc.unhealthyZoneThreshold percent of Nodes are not Ready,
  1267  // - normal otherwise
  1268  func (nc *Controller) ComputeZoneState(nodeReadyConditions []*v1.NodeCondition) (int, ZoneState) {
  1269  	readyNodes := 0
  1270  	notReadyNodes := 0
  1271  	for i := range nodeReadyConditions {
  1272  		if nodeReadyConditions[i] != nil && nodeReadyConditions[i].Status == v1.ConditionTrue {
  1273  			readyNodes++
  1274  		} else {
  1275  			notReadyNodes++
  1276  		}
  1277  	}
  1278  	switch {
  1279  	case readyNodes == 0 && notReadyNodes > 0:
  1280  		return notReadyNodes, stateFullDisruption
  1281  	case notReadyNodes > 2 && float32(notReadyNodes)/float32(notReadyNodes+readyNodes) >= nc.unhealthyZoneThreshold:
  1282  		return notReadyNodes, statePartialDisruption
  1283  	default:
  1284  		return notReadyNodes, stateNormal
  1285  	}
  1286  }
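
         // Worked example: with unhealthyZoneThreshold=0.55 and 10 Nodes in a zone, 4 not-Ready
         // Nodes give 4/10=0.4 < 0.55, so the zone stays Normal; 6 not-Ready Nodes give 0.6 >= 0.55
         // (and 6 > 2), so the zone becomes PartialDisruption; if all 10 are not-Ready the zone is
         // FullDisruption.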
  1287  
  1288  // reconcileNodeLabels reconciles node labels.
  1289  func (nc *Controller) reconcileNodeLabels(ctx context.Context, nodeName string) error {
  1290  	node, err := nc.nodeLister.Get(nodeName)
  1291  	if err != nil {
  1292  		// If node not found, just ignore it.
  1293  		if apierrors.IsNotFound(err) {
  1294  			return nil
  1295  		}
  1296  		return err
  1297  	}
  1298  
  1299  	if node.Labels == nil {
  1300  		// Nothing to reconcile.
  1301  		return nil
  1302  	}
  1303  
  1304  	labelsToUpdate := map[string]string{}
  1305  	for _, r := range labelReconcileInfo {
  1306  		primaryValue, primaryExists := node.Labels[r.primaryKey]
  1307  		secondaryValue, secondaryExists := node.Labels[r.secondaryKey]
  1308  
  1309  		if !primaryExists {
  1310  			// The primary label key does not exist. This should not happen
  1311  			// within our supported version skew range, when no external
  1312  			// components/factors are modifying the node object. Ignore this case.
  1313  			continue
  1314  		}
  1315  		if secondaryExists && primaryValue != secondaryValue {
  1316  			// The secondary label exists, but is not consistent with the primary
  1317  			// label. Need to reconcile.
  1318  			labelsToUpdate[r.secondaryKey] = primaryValue
  1319  
  1320  		} else if !secondaryExists && r.ensureSecondaryExists {
  1321  			// Apply secondary label based on primary label.
  1322  			labelsToUpdate[r.secondaryKey] = primaryValue
  1323  		}
  1324  	}
  1325  
  1326  	if len(labelsToUpdate) == 0 {
  1327  		return nil
  1328  	}
  1329  	if !controllerutil.AddOrUpdateLabelsOnNode(ctx, nc.kubeClient, labelsToUpdate, node) {
  1330  		return fmt.Errorf("failed to update labels for node %+v", node)
  1331  	}
  1332  	return nil
  1333  }