k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/controller/nodelifecycle/node_lifecycle_controller.go (about)

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
     17  // The Controller sets taints on nodes.
     18  // Tainted nodes should not be used for new workloads and
     19  // some effort should be given to getting existing
     20  // workloads off of tainted nodes.
    21  
    22  package nodelifecycle
    23  
    24  import (
    25  	"context"
    26  	"fmt"
    27  	"sync"
    28  	"time"
    29  
    30  	"k8s.io/klog/v2"
    31  
    32  	coordv1 "k8s.io/api/coordination/v1"
    33  	v1 "k8s.io/api/core/v1"
    34  	apiequality "k8s.io/apimachinery/pkg/api/equality"
    35  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    36  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    37  	"k8s.io/apimachinery/pkg/labels"
    38  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    39  	"k8s.io/apimachinery/pkg/util/wait"
    40  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    41  	appsv1informers "k8s.io/client-go/informers/apps/v1"
    42  	coordinformers "k8s.io/client-go/informers/coordination/v1"
    43  	coreinformers "k8s.io/client-go/informers/core/v1"
    44  	clientset "k8s.io/client-go/kubernetes"
    45  	"k8s.io/client-go/kubernetes/scheme"
    46  	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
    47  	appsv1listers "k8s.io/client-go/listers/apps/v1"
    48  	coordlisters "k8s.io/client-go/listers/coordination/v1"
    49  	corelisters "k8s.io/client-go/listers/core/v1"
    50  	"k8s.io/client-go/tools/cache"
    51  	"k8s.io/client-go/tools/record"
    52  	"k8s.io/client-go/util/flowcontrol"
    53  	"k8s.io/client-go/util/workqueue"
    54  	nodetopology "k8s.io/component-helpers/node/topology"
    55  	kubeletapis "k8s.io/kubelet/pkg/apis"
    56  	"k8s.io/kubernetes/pkg/controller"
    57  	"k8s.io/kubernetes/pkg/controller/nodelifecycle/scheduler"
    58  	"k8s.io/kubernetes/pkg/controller/tainteviction"
    59  	controllerutil "k8s.io/kubernetes/pkg/controller/util/node"
    60  	"k8s.io/kubernetes/pkg/features"
    61  	taintutils "k8s.io/kubernetes/pkg/util/taints"
    62  )
    63  
    64  func init() {
    65  	// Register prometheus metrics
    66  	Register()
    67  }
    68  
    69  var (
    70  	// UnreachableTaintTemplate is the taint for when a node becomes unreachable.
    71  	UnreachableTaintTemplate = &v1.Taint{
    72  		Key:    v1.TaintNodeUnreachable,
    73  		Effect: v1.TaintEffectNoExecute,
    74  	}
    75  
     76  	// NotReadyTaintTemplate is the taint for when a node is not ready to
     77  	// execute pods.
    78  	NotReadyTaintTemplate = &v1.Taint{
    79  		Key:    v1.TaintNodeNotReady,
    80  		Effect: v1.TaintEffectNoExecute,
    81  	}
    82  
    83  	// map {NodeConditionType: {ConditionStatus: TaintKey}}
     84  	// represents which NodeConditionType under which ConditionStatus should be
     85  	// tainted with which TaintKey.
     86  	// For certain NodeConditionTypes, there are multiple {ConditionStatus, TaintKey} pairs.
    87  	nodeConditionToTaintKeyStatusMap = map[v1.NodeConditionType]map[v1.ConditionStatus]string{
    88  		v1.NodeReady: {
    89  			v1.ConditionFalse:   v1.TaintNodeNotReady,
    90  			v1.ConditionUnknown: v1.TaintNodeUnreachable,
    91  		},
    92  		v1.NodeMemoryPressure: {
    93  			v1.ConditionTrue: v1.TaintNodeMemoryPressure,
    94  		},
    95  		v1.NodeDiskPressure: {
    96  			v1.ConditionTrue: v1.TaintNodeDiskPressure,
    97  		},
    98  		v1.NodeNetworkUnavailable: {
    99  			v1.ConditionTrue: v1.TaintNodeNetworkUnavailable,
   100  		},
   101  		v1.NodePIDPressure: {
   102  			v1.ConditionTrue: v1.TaintNodePIDPressure,
   103  		},
   104  	}
   105  
   106  	taintKeyToNodeConditionMap = map[string]v1.NodeConditionType{
   107  		v1.TaintNodeNotReady:           v1.NodeReady,
   108  		v1.TaintNodeUnreachable:        v1.NodeReady,
   109  		v1.TaintNodeNetworkUnavailable: v1.NodeNetworkUnavailable,
   110  		v1.TaintNodeMemoryPressure:     v1.NodeMemoryPressure,
   111  		v1.TaintNodeDiskPressure:       v1.NodeDiskPressure,
   112  		v1.TaintNodePIDPressure:        v1.NodePIDPressure,
   113  	}
   114  )
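
// Illustrative sketch (not part of the controller): for a node whose Ready
// condition is Unknown, the lookup in nodeConditionToTaintKeyStatusMap above yields
// the unreachable taint key, which doNoScheduleTaintingPass turns into a NoSchedule taint:
//
//	if taintMap, ok := nodeConditionToTaintKeyStatusMap[v1.NodeReady]; ok {
//		key := taintMap[v1.ConditionUnknown] // v1.TaintNodeUnreachable
//		_ = v1.Taint{Key: key, Effect: v1.TaintEffectNoSchedule}
//	}
//
// The NoExecute flavors of the not-ready/unreachable taints are applied separately
// by doNoExecuteTaintingPass using UnreachableTaintTemplate and NotReadyTaintTemplate.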
   115  
   116  // ZoneState is the state of a given zone.
   117  type ZoneState string
   118  
   119  const (
   120  	stateInitial           = ZoneState("Initial")
   121  	stateNormal            = ZoneState("Normal")
   122  	stateFullDisruption    = ZoneState("FullDisruption")
   123  	statePartialDisruption = ZoneState("PartialDisruption")
   124  )
   125  
   126  const (
    127  	// The amount of time the node controller should sleep between retries of node health updates.
   128  	retrySleepTime   = 20 * time.Millisecond
   129  	nodeNameKeyIndex = "spec.nodeName"
    130  	// podUpdateWorkerSize assumes that in most cases pods will be handled by the monitorNodeHealth pass.
    131  	// Pod update workers will only handle lagging cache pods. 4 workers should be enough.
   132  	podUpdateWorkerSize = 4
    133  	// nodeUpdateWorkerSize defines the number of workers for node and/or pod updates.
   134  	nodeUpdateWorkerSize = 8
   135  
   136  	// taintEvictionController is defined here in order to prevent imports of
   137  	// k8s.io/kubernetes/cmd/kube-controller-manager/names which would result in validation errors.
   138  	// This constant will be removed upon graduation of the SeparateTaintEvictionController feature.
   139  	taintEvictionController = "taint-eviction-controller"
   140  )
   141  
   142  // labelReconcileInfo lists Node labels to reconcile, and how to reconcile them.
   143  // primaryKey and secondaryKey are keys of labels to reconcile.
    144  //   - If both keys exist but their values don't match, the value from the
    145  //     primaryKey is used as the source of truth to reconcile.
   146  //   - If ensureSecondaryExists is true, and the secondaryKey does not
   147  //     exist, secondaryKey will be added with the value of the primaryKey.
   148  var labelReconcileInfo = []struct {
   149  	primaryKey            string
   150  	secondaryKey          string
   151  	ensureSecondaryExists bool
   152  }{
   153  	{
   154  		// Reconcile the beta and the stable OS label using the stable label as the source of truth.
   155  		// TODO(#89477): no earlier than 1.22: drop the beta labels if they differ from the GA labels
   156  		primaryKey:            v1.LabelOSStable,
   157  		secondaryKey:          kubeletapis.LabelOS,
   158  		ensureSecondaryExists: true,
   159  	},
   160  	{
   161  		// Reconcile the beta and the stable arch label using the stable label as the source of truth.
   162  		// TODO(#89477): no earlier than 1.22: drop the beta labels if they differ from the GA labels
   163  		primaryKey:            v1.LabelArchStable,
   164  		secondaryKey:          kubeletapis.LabelArch,
   165  		ensureSecondaryExists: true,
   166  	},
   167  }
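
// Illustrative example of the reconciliation above (a sketch, not executed here):
// for a node labeled with the stable OS label but missing the beta one, e.g.
//
//	node.Labels = map[string]string{v1.LabelOSStable: "linux"} // kubeletapis.LabelOS absent
//
// reconcileNodeLabels computes labelsToUpdate = {kubeletapis.LabelOS: "linux"},
// because the stable (primary) label is the source of truth and ensureSecondaryExists
// is true; a mismatching beta value is likewise overwritten with the stable value.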
   168  
   169  type nodeHealthData struct {
   170  	probeTimestamp           metav1.Time
   171  	readyTransitionTimestamp metav1.Time
   172  	status                   *v1.NodeStatus
   173  	lease                    *coordv1.Lease
   174  }
   175  
   176  func (n *nodeHealthData) deepCopy() *nodeHealthData {
   177  	if n == nil {
   178  		return nil
   179  	}
   180  	return &nodeHealthData{
   181  		probeTimestamp:           n.probeTimestamp,
   182  		readyTransitionTimestamp: n.readyTransitionTimestamp,
   183  		status:                   n.status.DeepCopy(),
   184  		lease:                    n.lease.DeepCopy(),
   185  	}
   186  }
   187  
   188  type nodeHealthMap struct {
   189  	lock        sync.RWMutex
   190  	nodeHealths map[string]*nodeHealthData
   191  }
   192  
   193  func newNodeHealthMap() *nodeHealthMap {
   194  	return &nodeHealthMap{
   195  		nodeHealths: make(map[string]*nodeHealthData),
   196  	}
   197  }
   198  
    199  // getDeepCopy returns a copy of the node health data.
    200  // It prevents the data from being changed after it is retrieved from the map.
   201  func (n *nodeHealthMap) getDeepCopy(name string) *nodeHealthData {
   202  	n.lock.RLock()
   203  	defer n.lock.RUnlock()
   204  	return n.nodeHealths[name].deepCopy()
   205  }
   206  
   207  func (n *nodeHealthMap) set(name string, data *nodeHealthData) {
   208  	n.lock.Lock()
   209  	defer n.lock.Unlock()
   210  	n.nodeHealths[name] = data
   211  }
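
// Usage sketch (illustrative): callers read a deep copy, mutate it locally, and
// write the copy back, so concurrent readers never observe partial updates. The
// variable name healthMap below is hypothetical:
//
//	if health := healthMap.getDeepCopy("node-a"); health != nil {
//		health.probeTimestamp = metav1.Now()
//		healthMap.set("node-a", health)
//	}
//
// This mirrors how monitorNodeHealth and handleDisruption update per-node health data.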
   212  
   213  type podUpdateItem struct {
   214  	namespace string
   215  	name      string
   216  }
   217  
   218  // Controller is the controller that manages node's life cycle.
   219  type Controller struct {
   220  	taintManager *tainteviction.Controller
   221  
   222  	podLister         corelisters.PodLister
   223  	podInformerSynced cache.InformerSynced
   224  	kubeClient        clientset.Interface
   225  
   226  	// This timestamp is to be used instead of LastProbeTime stored in Condition. We do this
   227  	// to avoid the problem with time skew across the cluster.
   228  	now func() metav1.Time
   229  
   230  	enterPartialDisruptionFunc func(nodeNum int) float32
   231  	enterFullDisruptionFunc    func(nodeNum int) float32
   232  	computeZoneStateFunc       func(nodeConditions []*v1.NodeCondition) (int, ZoneState)
   233  
   234  	knownNodeSet map[string]*v1.Node
   235  	// per Node map storing last observed health together with a local time when it was observed.
   236  	nodeHealthMap *nodeHealthMap
   237  
   238  	// evictorLock protects zonePodEvictor and zoneNoExecuteTainter.
   239  	evictorLock sync.Mutex
   240  	// workers that are responsible for tainting nodes.
   241  	zoneNoExecuteTainter map[string]*scheduler.RateLimitedTimedQueue
   242  
   243  	nodesToRetry sync.Map
   244  
   245  	zoneStates map[string]ZoneState
   246  
   247  	daemonSetStore          appsv1listers.DaemonSetLister
   248  	daemonSetInformerSynced cache.InformerSynced
   249  
   250  	leaseLister         coordlisters.LeaseLister
   251  	leaseInformerSynced cache.InformerSynced
   252  	nodeLister          corelisters.NodeLister
   253  	nodeInformerSynced  cache.InformerSynced
   254  
   255  	getPodsAssignedToNode func(nodeName string) ([]*v1.Pod, error)
   256  
   257  	broadcaster record.EventBroadcaster
   258  	recorder    record.EventRecorder
   259  
    260  	// Value controlling the Controller monitoring period, i.e. how often the Controller
    261  	// checks the node health signal posted by the kubelet. This value should be lower than
    262  	// nodeMonitorGracePeriod.
   263  	// TODO: Change node health monitor to watch based.
   264  	nodeMonitorPeriod time.Duration
   265  
    266  	// When a node is newly created, e.g. during cluster bootstrap or node creation, we give
    267  	// it a longer grace period.
   268  	nodeStartupGracePeriod time.Duration
   269  
    270  	// Controller will not proactively sync node health, but will monitor node
    271  	// health signals updated from the kubelet. There are 2 kinds of node healthiness
    272  	// signals: NodeStatus and NodeLease. If it doesn't receive an update for this amount
    273  	// of time, it will start posting "NodeReady==ConditionUnknown". The amount of
    274  	// time before which the Controller starts evicting pods is controlled via the flag
    275  	// 'pod-eviction-timeout'.
    276  	// Note: be cautious when changing the constant, it must work with
    277  	// nodeStatusUpdateFrequency in the kubelet and renewInterval in the NodeLease
    278  	// controller. The node health signal update frequency is the minimum of the
    279  	// two.
    280  	// There are several constraints:
    281  	// 1. nodeMonitorGracePeriod must be N times more than the node health signal
    282  	//    update frequency, where N is the number of retries allowed for the kubelet to
    283  	//    post node status/lease. It is pointless to make nodeMonitorGracePeriod
    284  	//    less than the node health signal update frequency, since there will
    285  	//    only be fresh values from the kubelet at an interval of the node health signal
    286  	//    update frequency.
    287  	// 2. nodeMonitorGracePeriod can't be too large for user experience - a larger
    288  	//    value takes longer for the user to see up-to-date node health.
   289  	nodeMonitorGracePeriod time.Duration
   290  
   291  	// Number of workers Controller uses to process node monitor health updates.
   292  	// Defaults to scheduler.UpdateWorkerSize.
   293  	nodeUpdateWorkerSize int
   294  
   295  	evictionLimiterQPS          float32
   296  	secondaryEvictionLimiterQPS float32
   297  	largeClusterThreshold       int32
   298  	unhealthyZoneThreshold      float32
   299  
   300  	nodeUpdateQueue workqueue.TypedInterface[string]
   301  	podUpdateQueue  workqueue.TypedRateLimitingInterface[podUpdateItem]
   302  }
   303  
    304  // NewNodeLifecycleController returns a new node lifecycle Controller.
   305  func NewNodeLifecycleController(
   306  	ctx context.Context,
   307  	leaseInformer coordinformers.LeaseInformer,
   308  	podInformer coreinformers.PodInformer,
   309  	nodeInformer coreinformers.NodeInformer,
   310  	daemonSetInformer appsv1informers.DaemonSetInformer,
   311  	kubeClient clientset.Interface,
   312  	nodeMonitorPeriod time.Duration,
   313  	nodeStartupGracePeriod time.Duration,
   314  	nodeMonitorGracePeriod time.Duration,
   315  	evictionLimiterQPS float32,
   316  	secondaryEvictionLimiterQPS float32,
   317  	largeClusterThreshold int32,
   318  	unhealthyZoneThreshold float32,
   319  ) (*Controller, error) {
   320  	logger := klog.FromContext(ctx)
   321  	if kubeClient == nil {
   322  		logger.Error(nil, "kubeClient is nil when starting nodelifecycle Controller")
   323  		klog.FlushAndExit(klog.ExitFlushTimeout, 1)
   324  	}
   325  
   326  	eventBroadcaster := record.NewBroadcaster(record.WithContext(ctx))
   327  	recorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "node-controller"})
   328  
   329  	nc := &Controller{
   330  		kubeClient:                  kubeClient,
   331  		now:                         metav1.Now,
   332  		knownNodeSet:                make(map[string]*v1.Node),
   333  		nodeHealthMap:               newNodeHealthMap(),
   334  		broadcaster:                 eventBroadcaster,
   335  		recorder:                    recorder,
   336  		nodeMonitorPeriod:           nodeMonitorPeriod,
   337  		nodeStartupGracePeriod:      nodeStartupGracePeriod,
   338  		nodeMonitorGracePeriod:      nodeMonitorGracePeriod,
   339  		nodeUpdateWorkerSize:        nodeUpdateWorkerSize,
   340  		zoneNoExecuteTainter:        make(map[string]*scheduler.RateLimitedTimedQueue),
   341  		nodesToRetry:                sync.Map{},
   342  		zoneStates:                  make(map[string]ZoneState),
   343  		evictionLimiterQPS:          evictionLimiterQPS,
   344  		secondaryEvictionLimiterQPS: secondaryEvictionLimiterQPS,
   345  		largeClusterThreshold:       largeClusterThreshold,
   346  		unhealthyZoneThreshold:      unhealthyZoneThreshold,
   347  		nodeUpdateQueue:             workqueue.NewTypedWithConfig(workqueue.TypedQueueConfig[string]{Name: "node_lifecycle_controller"}),
   348  		podUpdateQueue: workqueue.NewTypedRateLimitingQueueWithConfig(
   349  			workqueue.DefaultTypedControllerRateLimiter[podUpdateItem](),
   350  			workqueue.TypedRateLimitingQueueConfig[podUpdateItem]{
   351  				Name: "node_lifecycle_controller_pods",
   352  			},
   353  		),
   354  	}
   355  
   356  	nc.enterPartialDisruptionFunc = nc.ReducedQPSFunc
   357  	nc.enterFullDisruptionFunc = nc.HealthyQPSFunc
   358  	nc.computeZoneStateFunc = nc.ComputeZoneState
   359  
   360  	podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   361  		AddFunc: func(obj interface{}) {
   362  			pod := obj.(*v1.Pod)
   363  			nc.podUpdated(nil, pod)
   364  		},
   365  		UpdateFunc: func(prev, obj interface{}) {
   366  			prevPod := prev.(*v1.Pod)
   367  			newPod := obj.(*v1.Pod)
   368  			nc.podUpdated(prevPod, newPod)
   369  		},
   370  		DeleteFunc: func(obj interface{}) {
   371  			pod, isPod := obj.(*v1.Pod)
   372  			// We can get DeletedFinalStateUnknown instead of *v1.Pod here and we need to handle that correctly.
   373  			if !isPod {
   374  				deletedState, ok := obj.(cache.DeletedFinalStateUnknown)
   375  				if !ok {
   376  					logger.Error(nil, "Received unexpected object", "object", obj)
   377  					return
   378  				}
   379  				pod, ok = deletedState.Obj.(*v1.Pod)
   380  				if !ok {
   381  					logger.Error(nil, "DeletedFinalStateUnknown contained non-Pod object", "object", deletedState.Obj)
   382  					return
   383  				}
   384  			}
   385  			nc.podUpdated(pod, nil)
   386  		},
   387  	})
   388  	nc.podInformerSynced = podInformer.Informer().HasSynced
   389  	podInformer.Informer().AddIndexers(cache.Indexers{
   390  		nodeNameKeyIndex: func(obj interface{}) ([]string, error) {
   391  			pod, ok := obj.(*v1.Pod)
   392  			if !ok {
   393  				return []string{}, nil
   394  			}
   395  			if len(pod.Spec.NodeName) == 0 {
   396  				return []string{}, nil
   397  			}
   398  			return []string{pod.Spec.NodeName}, nil
   399  		},
   400  	})
   401  
   402  	podIndexer := podInformer.Informer().GetIndexer()
   403  	nc.getPodsAssignedToNode = func(nodeName string) ([]*v1.Pod, error) {
   404  		objs, err := podIndexer.ByIndex(nodeNameKeyIndex, nodeName)
   405  		if err != nil {
   406  			return nil, err
   407  		}
   408  		pods := make([]*v1.Pod, 0, len(objs))
   409  		for _, obj := range objs {
   410  			pod, ok := obj.(*v1.Pod)
   411  			if !ok {
   412  				continue
   413  			}
   414  			pods = append(pods, pod)
   415  		}
   416  		return pods, nil
   417  	}
   418  	nc.podLister = podInformer.Lister()
   419  	nc.nodeLister = nodeInformer.Lister()
   420  
   421  	if !utilfeature.DefaultFeatureGate.Enabled(features.SeparateTaintEvictionController) {
    422  		logger.Info("Running TaintEvictionController as part of NodeLifecycleController")
   423  		tm, err := tainteviction.New(ctx, kubeClient, podInformer, nodeInformer, taintEvictionController)
   424  		if err != nil {
   425  			return nil, err
   426  		}
   427  		nc.taintManager = tm
   428  	}
   429  
   430  	logger.Info("Controller will reconcile labels")
   431  	nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   432  		AddFunc: controllerutil.CreateAddNodeHandler(func(node *v1.Node) error {
   433  			nc.nodeUpdateQueue.Add(node.Name)
   434  			return nil
   435  		}),
   436  		UpdateFunc: controllerutil.CreateUpdateNodeHandler(func(_, newNode *v1.Node) error {
   437  			nc.nodeUpdateQueue.Add(newNode.Name)
   438  			return nil
   439  		}),
   440  		DeleteFunc: controllerutil.CreateDeleteNodeHandler(logger, func(node *v1.Node) error {
   441  			nc.nodesToRetry.Delete(node.Name)
   442  			return nil
   443  		}),
   444  	})
   445  
   446  	nc.leaseLister = leaseInformer.Lister()
   447  	nc.leaseInformerSynced = leaseInformer.Informer().HasSynced
   448  
   449  	nc.nodeInformerSynced = nodeInformer.Informer().HasSynced
   450  
   451  	nc.daemonSetStore = daemonSetInformer.Lister()
   452  	nc.daemonSetInformerSynced = daemonSetInformer.Informer().HasSynced
   453  
   454  	return nc, nil
   455  }
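
// Construction sketch (illustrative, not part of this package): wiring the
// controller with a client-go SharedInformerFactory, using values close to the
// kube-controller-manager defaults. The informers import alias and the concrete
// durations/QPS values below are assumptions, not prescribed by this file.
//
//	factory := informers.NewSharedInformerFactory(kubeClient, 0)
//	nc, err := NewNodeLifecycleController(
//		ctx,
//		factory.Coordination().V1().Leases(),
//		factory.Core().V1().Pods(),
//		factory.Core().V1().Nodes(),
//		factory.Apps().V1().DaemonSets(),
//		kubeClient,
//		5*time.Second,  // nodeMonitorPeriod
//		60*time.Second, // nodeStartupGracePeriod
//		40*time.Second, // nodeMonitorGracePeriod
//		0.1,            // evictionLimiterQPS
//		0.01,           // secondaryEvictionLimiterQPS
//		50,             // largeClusterThreshold
//		0.55,           // unhealthyZoneThreshold
//	)
//	if err != nil {
//		// handle error
//	}
//	factory.Start(ctx.Done())
//	go nc.Run(ctx)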
   456  
   457  // Run starts an asynchronous loop that monitors the status of cluster nodes.
   458  func (nc *Controller) Run(ctx context.Context) {
   459  	defer utilruntime.HandleCrash()
   460  
   461  	// Start events processing pipeline.
   462  	nc.broadcaster.StartStructuredLogging(3)
   463  	logger := klog.FromContext(ctx)
   464  	logger.Info("Sending events to api server")
   465  	nc.broadcaster.StartRecordingToSink(
   466  		&v1core.EventSinkImpl{
   467  			Interface: v1core.New(nc.kubeClient.CoreV1().RESTClient()).Events(""),
   468  		})
   469  	defer nc.broadcaster.Shutdown()
   470  
    471  	// Close the update queues to clean up their goroutines.
   472  	defer nc.nodeUpdateQueue.ShutDown()
   473  	defer nc.podUpdateQueue.ShutDown()
   474  
   475  	logger.Info("Starting node controller")
   476  	defer logger.Info("Shutting down node controller")
   477  
   478  	if !cache.WaitForNamedCacheSync("taint", ctx.Done(), nc.leaseInformerSynced, nc.nodeInformerSynced, nc.podInformerSynced, nc.daemonSetInformerSynced) {
   479  		return
   480  	}
   481  
   482  	if !utilfeature.DefaultFeatureGate.Enabled(features.SeparateTaintEvictionController) {
   483  		logger.Info("Starting", "controller", taintEvictionController)
   484  		go nc.taintManager.Run(ctx)
   485  	}
   486  
    487  	// Start workers to reconcile labels and/or update the NoSchedule taint for nodes.
   488  	for i := 0; i < nodeUpdateWorkerSize; i++ {
    489  		// Thanks to "workqueue", each worker just needs to get an item from the queue, because
    490  		// the item is flagged when taken from the queue: if a new event comes in, the new item will
    491  		// be re-queued until "Done", so no more than one worker handles the same item and
    492  		// no event is missed.
   493  		go wait.UntilWithContext(ctx, nc.doNodeProcessingPassWorker, time.Second)
   494  	}
   495  
   496  	for i := 0; i < podUpdateWorkerSize; i++ {
   497  		go wait.UntilWithContext(ctx, nc.doPodProcessingWorker, time.Second)
   498  	}
   499  
    500  	// Handle taint-based evictions. Because we don't want dedicated logic in the TaintManager for NC-originated
    501  	// taints and we normally don't rate limit evictions caused by taints, we need to rate limit adding taints.
   502  	go wait.UntilWithContext(ctx, nc.doNoExecuteTaintingPass, scheduler.NodeEvictionPeriod)
   503  
   504  	// Incorporate the results of node health signal pushed from kubelet to master.
   505  	go wait.UntilWithContext(ctx, func(ctx context.Context) {
   506  		if err := nc.monitorNodeHealth(ctx); err != nil {
   507  			logger.Error(err, "Error monitoring node health")
   508  		}
   509  	}, nc.nodeMonitorPeriod)
   510  
   511  	<-ctx.Done()
   512  }
   513  
   514  func (nc *Controller) doNodeProcessingPassWorker(ctx context.Context) {
   515  	logger := klog.FromContext(ctx)
   516  	for {
   517  		obj, shutdown := nc.nodeUpdateQueue.Get()
    518  		// "nodeUpdateQueue" will be shut down when "stopCh" is closed;
    519  		// we do not need to re-check "stopCh" again.
   520  		if shutdown {
   521  			return
   522  		}
   523  		nodeName := obj
   524  		if err := nc.doNoScheduleTaintingPass(ctx, nodeName); err != nil {
   525  			logger.Error(err, "Failed to taint NoSchedule on node, requeue it", "node", klog.KRef("", nodeName))
   526  			// TODO(k82cn): Add nodeName back to the queue
   527  		}
    528  		// TODO: re-evaluate whether there are any labels that need to be
    529  		// reconciled in 1.19. Remove this function if it's no longer necessary.
   530  		if err := nc.reconcileNodeLabels(ctx, nodeName); err != nil {
   531  			logger.Error(err, "Failed to reconcile labels for node, requeue it", "node", klog.KRef("", nodeName))
   532  			// TODO(yujuhong): Add nodeName back to the queue
   533  		}
   534  		nc.nodeUpdateQueue.Done(nodeName)
   535  	}
   536  }
   537  
   538  func (nc *Controller) doNoScheduleTaintingPass(ctx context.Context, nodeName string) error {
   539  	node, err := nc.nodeLister.Get(nodeName)
   540  	if err != nil {
   541  		// If node not found, just ignore it.
   542  		if apierrors.IsNotFound(err) {
   543  			return nil
   544  		}
   545  		return err
   546  	}
   547  
   548  	// Map node's condition to Taints.
   549  	var taints []v1.Taint
   550  	for _, condition := range node.Status.Conditions {
   551  		if taintMap, found := nodeConditionToTaintKeyStatusMap[condition.Type]; found {
   552  			if taintKey, found := taintMap[condition.Status]; found {
   553  				taints = append(taints, v1.Taint{
   554  					Key:    taintKey,
   555  					Effect: v1.TaintEffectNoSchedule,
   556  				})
   557  			}
   558  		}
   559  	}
   560  	if node.Spec.Unschedulable {
   561  		// If unschedulable, append related taint.
   562  		taints = append(taints, v1.Taint{
   563  			Key:    v1.TaintNodeUnschedulable,
   564  			Effect: v1.TaintEffectNoSchedule,
   565  		})
   566  	}
   567  
    568  	// Get the existing taints of the node.
   569  	nodeTaints := taintutils.TaintSetFilter(node.Spec.Taints, func(t *v1.Taint) bool {
   570  		// only NoSchedule taints are candidates to be compared with "taints" later
   571  		if t.Effect != v1.TaintEffectNoSchedule {
   572  			return false
   573  		}
   574  		// Find unschedulable taint of node.
   575  		if t.Key == v1.TaintNodeUnschedulable {
   576  			return true
   577  		}
   578  		// Find node condition taints of node.
   579  		_, found := taintKeyToNodeConditionMap[t.Key]
   580  		return found
   581  	})
   582  	taintsToAdd, taintsToDel := taintutils.TaintSetDiff(taints, nodeTaints)
    583  	// If there is nothing to add or delete, return nil directly.
   584  	if len(taintsToAdd) == 0 && len(taintsToDel) == 0 {
   585  		return nil
   586  	}
   587  	if !controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, taintsToAdd, taintsToDel, node) {
   588  		return fmt.Errorf("failed to swap taints of node %+v", node)
   589  	}
   590  	return nil
   591  }
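
// Worked example (illustrative): for a node with Spec.Unschedulable=true and a
// MemoryPressure=True condition, the desired NoSchedule set built above is
// {node.kubernetes.io/unschedulable, node.kubernetes.io/memory-pressure}. If the
// node currently carries only the unschedulable taint, TaintSetDiff returns the
// memory-pressure taint in taintsToAdd and nothing in taintsToDel, and
// SwapNodeControllerTaint patches the node accordingly.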
   592  
   593  func (nc *Controller) doNoExecuteTaintingPass(ctx context.Context) {
    594  	// Extract the keys of the map in order to not hold
    595  	// the evictorLock for the entire function and hold it
    596  	// only when necessary.
   597  	var zoneNoExecuteTainterKeys []string
   598  	func() {
   599  		nc.evictorLock.Lock()
   600  		defer nc.evictorLock.Unlock()
   601  
   602  		zoneNoExecuteTainterKeys = make([]string, 0, len(nc.zoneNoExecuteTainter))
   603  		for k := range nc.zoneNoExecuteTainter {
   604  			zoneNoExecuteTainterKeys = append(zoneNoExecuteTainterKeys, k)
   605  		}
   606  	}()
   607  	logger := klog.FromContext(ctx)
   608  	for _, k := range zoneNoExecuteTainterKeys {
   609  		var zoneNoExecuteTainterWorker *scheduler.RateLimitedTimedQueue
   610  		func() {
   611  			nc.evictorLock.Lock()
   612  			defer nc.evictorLock.Unlock()
   613  			// Extracting the value without checking if the key
   614  			// exists or not is safe to do here since zones do
   615  			// not get removed, and consequently pod evictors for
   616  			// these zones also do not get removed, only added.
   617  			zoneNoExecuteTainterWorker = nc.zoneNoExecuteTainter[k]
   618  		}()
   619  		// Function should return 'false' and a time after which it should be retried, or 'true' if it shouldn't (it succeeded).
   620  		zoneNoExecuteTainterWorker.Try(logger, func(value scheduler.TimedValue) (bool, time.Duration) {
   621  			node, err := nc.nodeLister.Get(value.Value)
   622  			if apierrors.IsNotFound(err) {
   623  				logger.Info("Node no longer present in nodeLister", "node", klog.KRef("", value.Value))
   624  				return true, 0
   625  			} else if err != nil {
   626  				logger.Info("Failed to get Node from the nodeLister", "node", klog.KRef("", value.Value), "err", err)
   627  				// retry in 50 millisecond
   628  				return false, 50 * time.Millisecond
   629  			}
   630  			_, condition := controllerutil.GetNodeCondition(&node.Status, v1.NodeReady)
   631  			if condition == nil {
   632  				logger.Info("Failed to get NodeCondition from the node status", "node", klog.KRef("", value.Value))
   633  				// retry in 50 millisecond
   634  				return false, 50 * time.Millisecond
   635  			}
   636  			// Because we want to mimic NodeStatus.Condition["Ready"] we make "unreachable" and "not ready" taints mutually exclusive.
   637  			taintToAdd := v1.Taint{}
   638  			oppositeTaint := v1.Taint{}
   639  			switch condition.Status {
   640  			case v1.ConditionFalse:
   641  				taintToAdd = *NotReadyTaintTemplate
   642  				oppositeTaint = *UnreachableTaintTemplate
   643  			case v1.ConditionUnknown:
   644  				taintToAdd = *UnreachableTaintTemplate
   645  				oppositeTaint = *NotReadyTaintTemplate
   646  			default:
   647  				// It seems that the Node is ready again, so there's no need to taint it.
   648  				logger.V(4).Info("Node was in a taint queue, but it's ready now. Ignoring taint request", "node", klog.KRef("", value.Value))
   649  				return true, 0
   650  			}
   651  			result := controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{&oppositeTaint}, node)
   652  			if result {
   653  				// Count the number of evictions.
   654  				zone := nodetopology.GetZoneKey(node)
   655  				evictionsTotal.WithLabelValues(zone).Inc()
   656  			}
   657  
   658  			return result, 0
   659  		})
   660  	}
   661  }
   662  
    663  // monitorNodeHealth verifies that node health is constantly updated by the kubelet, and if not, posts "NodeReady==ConditionUnknown".
    664  // This function will
    665  //   - add nodes which are not ready or not reachable for a long period of time to a rate-limited
    666  //     queue so that NoExecute taints can be added by the goroutine running the doNoExecuteTaintingPass function,
    667  //   - update the PodReady condition of Pods according to the state of the Node Ready condition.
   668  func (nc *Controller) monitorNodeHealth(ctx context.Context) error {
   669  	start := nc.now()
   670  	defer func() {
   671  		updateAllNodesHealthDuration.Observe(time.Since(start.Time).Seconds())
   672  	}()
   673  
    674  	// We are listing nodes from the local cache as we can tolerate some small delays
    675  	// compared to the state in etcd, and there is eventual consistency anyway.
   676  	nodes, err := nc.nodeLister.List(labels.Everything())
   677  	if err != nil {
   678  		return err
   679  	}
   680  	added, deleted, newZoneRepresentatives := nc.classifyNodes(nodes)
   681  	logger := klog.FromContext(ctx)
   682  	for i := range newZoneRepresentatives {
   683  		nc.addPodEvictorForNewZone(logger, newZoneRepresentatives[i])
   684  	}
   685  	for i := range added {
   686  		logger.V(1).Info("Controller observed a new Node", "node", klog.KRef("", added[i].Name))
   687  		controllerutil.RecordNodeEvent(ctx, nc.recorder, added[i].Name, string(added[i].UID), v1.EventTypeNormal, "RegisteredNode", fmt.Sprintf("Registered Node %v in Controller", added[i].Name))
   688  		nc.knownNodeSet[added[i].Name] = added[i]
   689  		nc.addPodEvictorForNewZone(logger, added[i])
   690  		nc.markNodeAsReachable(ctx, added[i])
   691  	}
   692  
   693  	for i := range deleted {
   694  		logger.V(1).Info("Controller observed a Node deletion", "node", klog.KRef("", deleted[i].Name))
   695  		controllerutil.RecordNodeEvent(ctx, nc.recorder, deleted[i].Name, string(deleted[i].UID), v1.EventTypeNormal, "RemovingNode", fmt.Sprintf("Removing Node %v from Controller", deleted[i].Name))
   696  		delete(nc.knownNodeSet, deleted[i].Name)
   697  	}
   698  
   699  	var zoneToNodeConditionsLock sync.Mutex
   700  	zoneToNodeConditions := map[string][]*v1.NodeCondition{}
   701  	updateNodeFunc := func(piece int) {
   702  		start := nc.now()
   703  		defer func() {
   704  			updateNodeHealthDuration.Observe(time.Since(start.Time).Seconds())
   705  		}()
   706  
   707  		var observedReadyCondition v1.NodeCondition
   708  		var currentReadyCondition *v1.NodeCondition
   709  		node := nodes[piece].DeepCopy()
   710  
   711  		if err := wait.PollImmediate(retrySleepTime, retrySleepTime*scheduler.NodeHealthUpdateRetry, func() (bool, error) {
   712  			var err error
   713  			_, observedReadyCondition, currentReadyCondition, err = nc.tryUpdateNodeHealth(ctx, node)
   714  			if err == nil {
   715  				return true, nil
   716  			}
   717  			name := node.Name
   718  			node, err = nc.kubeClient.CoreV1().Nodes().Get(ctx, name, metav1.GetOptions{})
   719  			if err != nil {
   720  				logger.Error(nil, "Failed while getting a Node to retry updating node health. Probably Node was deleted", "node", klog.KRef("", name))
   721  				return false, err
   722  			}
   723  			return false, nil
   724  		}); err != nil {
   725  			logger.Error(err, "Update health of Node from Controller error, Skipping - no pods will be evicted", "node", klog.KObj(node))
   726  			return
   727  		}
   728  
   729  		// Some nodes may be excluded from disruption checking
   730  		if !isNodeExcludedFromDisruptionChecks(node) {
   731  			zoneToNodeConditionsLock.Lock()
   732  			zoneToNodeConditions[nodetopology.GetZoneKey(node)] = append(zoneToNodeConditions[nodetopology.GetZoneKey(node)], currentReadyCondition)
   733  			zoneToNodeConditionsLock.Unlock()
   734  		}
   735  
   736  		if currentReadyCondition != nil {
   737  			pods, err := nc.getPodsAssignedToNode(node.Name)
   738  			if err != nil {
   739  				utilruntime.HandleError(fmt.Errorf("unable to list pods of node %v: %v", node.Name, err))
   740  				if currentReadyCondition.Status != v1.ConditionTrue && observedReadyCondition.Status == v1.ConditionTrue {
   741  					// If error happened during node status transition (Ready -> NotReady)
   742  					// we need to mark node for retry to force MarkPodsNotReady execution
   743  					// in the next iteration.
   744  					nc.nodesToRetry.Store(node.Name, struct{}{})
   745  				}
   746  				return
   747  			}
   748  			nc.processTaintBaseEviction(ctx, node, &observedReadyCondition)
   749  
   750  			_, needsRetry := nc.nodesToRetry.Load(node.Name)
   751  			switch {
   752  			case currentReadyCondition.Status != v1.ConditionTrue && observedReadyCondition.Status == v1.ConditionTrue:
   753  				// Report node event only once when status changed.
   754  				controllerutil.RecordNodeStatusChange(logger, nc.recorder, node, "NodeNotReady")
   755  				fallthrough
   756  			case needsRetry && observedReadyCondition.Status != v1.ConditionTrue:
   757  				if err = controllerutil.MarkPodsNotReady(ctx, nc.kubeClient, nc.recorder, pods, node.Name); err != nil {
   758  					utilruntime.HandleError(fmt.Errorf("unable to mark all pods NotReady on node %v: %v; queuing for retry", node.Name, err))
   759  					nc.nodesToRetry.Store(node.Name, struct{}{})
   760  					return
   761  				}
   762  			}
   763  		}
   764  		nc.nodesToRetry.Delete(node.Name)
   765  	}
   766  
   767  	// Marking the pods not ready on a node requires looping over them and
   768  	// updating each pod's status one at a time. This is performed serially, and
   769  	// can take a while if we're processing each node serially as well. So we
   770  	// process them with bounded concurrency instead, since most of the time is
   771  	// spent waiting on io.
   772  	workqueue.ParallelizeUntil(ctx, nc.nodeUpdateWorkerSize, len(nodes), updateNodeFunc)
   773  
   774  	nc.handleDisruption(ctx, zoneToNodeConditions, nodes)
   775  
   776  	return nil
   777  }
   778  
   779  func (nc *Controller) processTaintBaseEviction(ctx context.Context, node *v1.Node, observedReadyCondition *v1.NodeCondition) {
   780  	decisionTimestamp := nc.now()
   781  	// Check eviction timeout against decisionTimestamp
   782  	logger := klog.FromContext(ctx)
   783  	switch observedReadyCondition.Status {
   784  	case v1.ConditionFalse:
   785  		// We want to update the taint straight away if Node is already tainted with the UnreachableTaint
   786  		if taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) {
   787  			taintToAdd := *NotReadyTaintTemplate
   788  			if !controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{UnreachableTaintTemplate}, node) {
   789  				logger.Error(nil, "Failed to instantly swap UnreachableTaint to NotReadyTaint. Will try again in the next cycle")
   790  			}
   791  		} else if nc.markNodeForTainting(node, v1.ConditionFalse) {
   792  			logger.V(2).Info("Node is NotReady. Adding it to the Taint queue", "node", klog.KObj(node), "timeStamp", decisionTimestamp)
   793  		}
   794  	case v1.ConditionUnknown:
   795  		// We want to update the taint straight away if Node is already tainted with the UnreachableTaint
   796  		if taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) {
   797  			taintToAdd := *UnreachableTaintTemplate
   798  			if !controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{NotReadyTaintTemplate}, node) {
   799  				logger.Error(nil, "Failed to instantly swap NotReadyTaint to UnreachableTaint. Will try again in the next cycle")
   800  			}
   801  		} else if nc.markNodeForTainting(node, v1.ConditionUnknown) {
   802  			logger.V(2).Info("Node is unresponsive. Adding it to the Taint queue", "node", klog.KObj(node), "timeStamp", decisionTimestamp)
   803  		}
   804  	case v1.ConditionTrue:
   805  		removed, err := nc.markNodeAsReachable(ctx, node)
   806  		if err != nil {
   807  			logger.Error(nil, "Failed to remove taints from node. Will retry in next iteration", "node", klog.KObj(node))
   808  		}
   809  		if removed {
   810  			logger.V(2).Info("Node is healthy again, removing all taints", "node", klog.KObj(node))
   811  		}
   812  	}
   813  }
   814  
   815  // labelNodeDisruptionExclusion is a label on nodes that controls whether they are
   816  // excluded from being considered for disruption checks by the node controller.
   817  const labelNodeDisruptionExclusion = "node.kubernetes.io/exclude-disruption"
   818  
   819  func isNodeExcludedFromDisruptionChecks(node *v1.Node) bool {
   820  	if _, ok := node.Labels[labelNodeDisruptionExclusion]; ok {
   821  		return true
   822  	}
   823  	return false
   824  }
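
// Illustrative usage: exclusion only requires the label key to be present; the
// value is ignored by the check above. For example (the node name is a placeholder):
//
//	kubectl label node <node-name> node.kubernetes.io/exclude-disruption=true
//
// Excluded nodes still get health updates and taints, but their Ready conditions
// are not fed into the per-zone disruption computation in handleDisruption.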
   825  
    826  // tryUpdateNodeHealth checks a given node's conditions and tries to update it. Returns the grace period to
    827  // which the given node is entitled, the last observed and the current Ready Condition, and an error if one occurred.
   828  func (nc *Controller) tryUpdateNodeHealth(ctx context.Context, node *v1.Node) (time.Duration, v1.NodeCondition, *v1.NodeCondition, error) {
   829  	nodeHealth := nc.nodeHealthMap.getDeepCopy(node.Name)
   830  	defer func() {
   831  		nc.nodeHealthMap.set(node.Name, nodeHealth)
   832  	}()
   833  
   834  	var gracePeriod time.Duration
   835  	var observedReadyCondition v1.NodeCondition
   836  	_, currentReadyCondition := controllerutil.GetNodeCondition(&node.Status, v1.NodeReady)
   837  	if currentReadyCondition == nil {
    838  		// If the ready condition is nil, then the kubelet (or nodecontroller) never posted the node status.
    839  		// A fake ready condition is created, where LastHeartbeatTime and LastTransitionTime are set
    840  		// to node.CreationTimestamp, to avoid handling the corner case.
   841  		observedReadyCondition = v1.NodeCondition{
   842  			Type:               v1.NodeReady,
   843  			Status:             v1.ConditionUnknown,
   844  			LastHeartbeatTime:  node.CreationTimestamp,
   845  			LastTransitionTime: node.CreationTimestamp,
   846  		}
   847  		gracePeriod = nc.nodeStartupGracePeriod
   848  		if nodeHealth != nil {
   849  			nodeHealth.status = &node.Status
   850  		} else {
   851  			nodeHealth = &nodeHealthData{
   852  				status:                   &node.Status,
   853  				probeTimestamp:           node.CreationTimestamp,
   854  				readyTransitionTimestamp: node.CreationTimestamp,
   855  			}
   856  		}
   857  	} else {
   858  		// If ready condition is not nil, make a copy of it, since we may modify it in place later.
   859  		observedReadyCondition = *currentReadyCondition
   860  		gracePeriod = nc.nodeMonitorGracePeriod
   861  	}
    862  	// There are the following cases to check:
    863  	// - both saved and new status have no Ready Condition set - we leave everything as it is,
    864  	// - saved status has no Ready Condition, but the current one does - Controller was restarted with Node data already present in etcd,
    865  	// - saved status has some Ready Condition, but the current one does not - it's an error, but we fill it up because that's probably a good thing to do,
    866  	// - both saved and current statuses have Ready Conditions and they have the same LastProbeTime - nothing happened on that Node, it may be
    867  	//   unresponsive, so we leave it as it is,
    868  	// - both saved and current statuses have Ready Conditions, they have different LastProbeTimes, but the same Ready Condition State -
    869  	//   everything's in order, no transition occurred, we update only probeTimestamp,
    870  	// - both saved and current statuses have Ready Conditions, different LastProbeTimes and different Ready Condition States -
    871  	//   the Ready Condition changed its state since we last saw it, so we update both probeTimestamp and readyTransitionTimestamp.
    872  	// TODO: things to consider:
    873  	//   - if 'LastProbeTime' has gone back in time it's probably an error, currently we ignore it,
    874  	//   - currently the only correct Ready State transition outside of the Node Controller is the Kubelet marking it ready; we don't check
    875  	//     if that's the case, but it does not seem necessary.
   876  	var savedCondition *v1.NodeCondition
   877  	var savedLease *coordv1.Lease
   878  	if nodeHealth != nil {
   879  		_, savedCondition = controllerutil.GetNodeCondition(nodeHealth.status, v1.NodeReady)
   880  		savedLease = nodeHealth.lease
   881  	}
   882  	logger := klog.FromContext(ctx)
   883  	if nodeHealth == nil {
   884  		logger.Info("Missing timestamp for Node. Assuming now as a timestamp", "node", klog.KObj(node))
   885  		nodeHealth = &nodeHealthData{
   886  			status:                   &node.Status,
   887  			probeTimestamp:           nc.now(),
   888  			readyTransitionTimestamp: nc.now(),
   889  		}
   890  	} else if savedCondition == nil && currentReadyCondition != nil {
   891  		logger.V(1).Info("Creating timestamp entry for newly observed Node", "node", klog.KObj(node))
   892  		nodeHealth = &nodeHealthData{
   893  			status:                   &node.Status,
   894  			probeTimestamp:           nc.now(),
   895  			readyTransitionTimestamp: nc.now(),
   896  		}
   897  	} else if savedCondition != nil && currentReadyCondition == nil {
   898  		logger.Error(nil, "ReadyCondition was removed from Status of Node", "node", klog.KObj(node))
   899  		// TODO: figure out what to do in this case. For now we do the same thing as above.
   900  		nodeHealth = &nodeHealthData{
   901  			status:                   &node.Status,
   902  			probeTimestamp:           nc.now(),
   903  			readyTransitionTimestamp: nc.now(),
   904  		}
   905  	} else if savedCondition != nil && currentReadyCondition != nil && savedCondition.LastHeartbeatTime != currentReadyCondition.LastHeartbeatTime {
   906  		var transitionTime metav1.Time
   907  		// If ReadyCondition changed since the last time we checked, we update the transition timestamp to "now",
   908  		// otherwise we leave it as it is.
   909  		if savedCondition.LastTransitionTime != currentReadyCondition.LastTransitionTime {
   910  			logger.V(3).Info("ReadyCondition for Node transitioned from savedCondition to currentReadyCondition", "node", klog.KObj(node), "savedCondition", savedCondition, "currentReadyCondition", currentReadyCondition)
   911  			transitionTime = nc.now()
   912  		} else {
   913  			transitionTime = nodeHealth.readyTransitionTimestamp
   914  		}
   915  		if loggerV := logger.V(5); loggerV.Enabled() {
   916  			loggerV.Info("Node ReadyCondition updated. Updating timestamp", "node", klog.KObj(node), "nodeHealthStatus", nodeHealth.status, "nodeStatus", node.Status)
   917  		} else {
   918  			logger.V(3).Info("Node ReadyCondition updated. Updating timestamp", "node", klog.KObj(node))
   919  		}
   920  		nodeHealth = &nodeHealthData{
   921  			status:                   &node.Status,
   922  			probeTimestamp:           nc.now(),
   923  			readyTransitionTimestamp: transitionTime,
   924  		}
   925  	}
   926  	// Always update the probe time if node lease is renewed.
   927  	// Note: If kubelet never posted the node status, but continues renewing the
   928  	// heartbeat leases, the node controller will assume the node is healthy and
   929  	// take no action.
   930  	observedLease, _ := nc.leaseLister.Leases(v1.NamespaceNodeLease).Get(node.Name)
   931  	if observedLease != nil && (savedLease == nil || savedLease.Spec.RenewTime.Before(observedLease.Spec.RenewTime)) {
   932  		nodeHealth.lease = observedLease
   933  		nodeHealth.probeTimestamp = nc.now()
   934  	}
   935  
   936  	if nc.now().After(nodeHealth.probeTimestamp.Add(gracePeriod)) {
   937  		// NodeReady condition or lease was last set longer ago than gracePeriod, so
   938  		// update it to Unknown (regardless of its current value) in the master.
   939  
   940  		nodeConditionTypes := []v1.NodeConditionType{
   941  			v1.NodeReady,
   942  			v1.NodeMemoryPressure,
   943  			v1.NodeDiskPressure,
   944  			v1.NodePIDPressure,
   945  			// We don't change 'NodeNetworkUnavailable' condition, as it's managed on a control plane level.
   946  			// v1.NodeNetworkUnavailable,
   947  		}
   948  
   949  		nowTimestamp := nc.now()
   950  		for _, nodeConditionType := range nodeConditionTypes {
   951  			_, currentCondition := controllerutil.GetNodeCondition(&node.Status, nodeConditionType)
   952  			if currentCondition == nil {
   953  				logger.V(2).Info("Condition of node was never updated by kubelet", "nodeConditionType", nodeConditionType, "node", klog.KObj(node))
   954  				node.Status.Conditions = append(node.Status.Conditions, v1.NodeCondition{
   955  					Type:               nodeConditionType,
   956  					Status:             v1.ConditionUnknown,
   957  					Reason:             "NodeStatusNeverUpdated",
   958  					Message:            "Kubelet never posted node status.",
   959  					LastHeartbeatTime:  node.CreationTimestamp,
   960  					LastTransitionTime: nowTimestamp,
   961  				})
   962  			} else {
   963  				logger.V(2).Info("Node hasn't been updated",
   964  					"node", klog.KObj(node), "duration", nc.now().Time.Sub(nodeHealth.probeTimestamp.Time), "nodeConditionType", nodeConditionType, "currentCondition", currentCondition)
   965  				if currentCondition.Status != v1.ConditionUnknown {
   966  					currentCondition.Status = v1.ConditionUnknown
   967  					currentCondition.Reason = "NodeStatusUnknown"
   968  					currentCondition.Message = "Kubelet stopped posting node status."
   969  					currentCondition.LastTransitionTime = nowTimestamp
   970  				}
   971  			}
   972  		}
    973  		// We need to update currentReadyCondition because its value may have changed.
   974  		_, currentReadyCondition = controllerutil.GetNodeCondition(&node.Status, v1.NodeReady)
   975  
   976  		if !apiequality.Semantic.DeepEqual(currentReadyCondition, &observedReadyCondition) {
   977  			if _, err := nc.kubeClient.CoreV1().Nodes().UpdateStatus(ctx, node, metav1.UpdateOptions{}); err != nil {
   978  				logger.Error(err, "Error updating node", "node", klog.KObj(node))
   979  				return gracePeriod, observedReadyCondition, currentReadyCondition, err
   980  			}
   981  			nodeHealth = &nodeHealthData{
   982  				status:                   &node.Status,
   983  				probeTimestamp:           nodeHealth.probeTimestamp,
   984  				readyTransitionTimestamp: nc.now(),
   985  				lease:                    observedLease,
   986  			}
   987  			return gracePeriod, observedReadyCondition, currentReadyCondition, nil
   988  		}
   989  	}
   990  
   991  	return gracePeriod, observedReadyCondition, currentReadyCondition, nil
   992  }
   993  
   994  func (nc *Controller) handleDisruption(ctx context.Context, zoneToNodeConditions map[string][]*v1.NodeCondition, nodes []*v1.Node) {
   995  	newZoneStates := map[string]ZoneState{}
   996  	allAreFullyDisrupted := true
   997  	logger := klog.FromContext(ctx)
   998  	for k, v := range zoneToNodeConditions {
   999  		zoneSize.WithLabelValues(k).Set(float64(len(v)))
  1000  		unhealthy, newState := nc.computeZoneStateFunc(v)
  1001  		zoneHealth.WithLabelValues(k).Set(float64(100*(len(v)-unhealthy)) / float64(len(v)))
  1002  		unhealthyNodes.WithLabelValues(k).Set(float64(unhealthy))
  1003  		if newState != stateFullDisruption {
  1004  			allAreFullyDisrupted = false
  1005  		}
  1006  		newZoneStates[k] = newState
  1007  		if _, had := nc.zoneStates[k]; !had {
  1008  			logger.Error(nil, "Setting initial state for unseen zone", "zone", k)
  1009  			nc.zoneStates[k] = stateInitial
  1010  		}
  1011  	}
  1012  
  1013  	allWasFullyDisrupted := true
  1014  	for k, v := range nc.zoneStates {
  1015  		if _, have := zoneToNodeConditions[k]; !have {
  1016  			zoneSize.WithLabelValues(k).Set(0)
  1017  			zoneHealth.WithLabelValues(k).Set(100)
  1018  			unhealthyNodes.WithLabelValues(k).Set(0)
  1019  			delete(nc.zoneStates, k)
  1020  			continue
  1021  		}
  1022  		if v != stateFullDisruption {
  1023  			allWasFullyDisrupted = false
  1024  			break
  1025  		}
  1026  	}
  1027  
   1028  	// At least one node was responding in the previous pass or in the current pass. The semantics are as follows:
   1029  	// - if the new state is "partialDisruption" we call a user defined function that returns a new limiter to use,
   1030  	// - if the new state is "normal" we resume normal operation (go back to default limiter settings),
   1031  	// - if the new state is "fullDisruption" we restore the normal eviction rate,
   1032  	//   - unless all zones in the cluster are in "fullDisruption" - in that case we stop all evictions.
  1033  	if !allAreFullyDisrupted || !allWasFullyDisrupted {
  1034  		// We're switching to full disruption mode
  1035  		if allAreFullyDisrupted {
  1036  			logger.Info("Controller detected that all Nodes are not-Ready. Entering master disruption mode")
  1037  			for i := range nodes {
  1038  				_, err := nc.markNodeAsReachable(ctx, nodes[i])
  1039  				if err != nil {
  1040  					logger.Error(nil, "Failed to remove taints from Node", "node", klog.KObj(nodes[i]))
  1041  				}
  1042  			}
  1043  			// We stop all evictions.
  1044  			for k := range nc.zoneStates {
  1045  				nc.zoneNoExecuteTainter[k].SwapLimiter(0)
  1046  			}
  1047  			for k := range nc.zoneStates {
  1048  				nc.zoneStates[k] = stateFullDisruption
  1049  			}
  1050  			// All rate limiters are updated, so we can return early here.
  1051  			return
  1052  		}
  1053  		// We're exiting full disruption mode
  1054  		if allWasFullyDisrupted {
  1055  			logger.Info("Controller detected that some Nodes are Ready. Exiting master disruption mode")
  1056  			// When exiting disruption mode update probe timestamps on all Nodes.
  1057  			now := nc.now()
  1058  			for i := range nodes {
  1059  				v := nc.nodeHealthMap.getDeepCopy(nodes[i].Name)
  1060  				v.probeTimestamp = now
  1061  				v.readyTransitionTimestamp = now
  1062  				nc.nodeHealthMap.set(nodes[i].Name, v)
  1063  			}
  1064  			// We reset all rate limiters to settings appropriate for the given state.
  1065  			for k := range nc.zoneStates {
  1066  				nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newZoneStates[k])
  1067  				nc.zoneStates[k] = newZoneStates[k]
  1068  			}
  1069  			return
  1070  		}
  1071  		// We know that there's at least one not-fully disrupted so,
  1072  		// we can use default behavior for rate limiters
  1073  		for k, v := range nc.zoneStates {
  1074  			newState := newZoneStates[k]
  1075  			if v == newState {
  1076  				continue
  1077  			}
  1078  			logger.Info("Controller detected that zone is now in new state", "zone", k, "newState", newState)
  1079  			nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newState)
  1080  			nc.zoneStates[k] = newState
  1081  		}
  1082  	}
  1083  }
  1084  
  1085  func (nc *Controller) podUpdated(oldPod, newPod *v1.Pod) {
  1086  	if newPod == nil {
  1087  		return
  1088  	}
  1089  	if len(newPod.Spec.NodeName) != 0 && (oldPod == nil || newPod.Spec.NodeName != oldPod.Spec.NodeName) {
  1090  		podItem := podUpdateItem{newPod.Namespace, newPod.Name}
  1091  		nc.podUpdateQueue.Add(podItem)
  1092  	}
  1093  }
  1094  
  1095  func (nc *Controller) doPodProcessingWorker(ctx context.Context) {
  1096  	for {
  1097  		obj, shutdown := nc.podUpdateQueue.Get()
   1098  		// "podUpdateQueue" will be shut down when "stopCh" is closed;
   1099  		// we do not need to re-check "stopCh" again.
  1100  		if shutdown {
  1101  			return
  1102  		}
  1103  
  1104  		podItem := obj
  1105  		nc.processPod(ctx, podItem)
  1106  	}
  1107  }
  1108  
   1109  // processPod processes events of assigning pods to nodes. In particular:
   1110  // 1. for a NodeReady=true node, taint eviction for this pod will be cancelled
   1111  // 2. for a NodeReady=false or unknown node, taint eviction of the pod will happen and the pod will be marked as not ready
   1112  // 3. if the node doesn't exist in the cache, it will be skipped.
  1113  func (nc *Controller) processPod(ctx context.Context, podItem podUpdateItem) {
  1114  	defer nc.podUpdateQueue.Done(podItem)
  1115  	pod, err := nc.podLister.Pods(podItem.namespace).Get(podItem.name)
  1116  	logger := klog.FromContext(ctx)
  1117  	if err != nil {
  1118  		if apierrors.IsNotFound(err) {
  1119  			// If the pod was deleted, there is no need to requeue.
  1120  			return
  1121  		}
  1122  		logger.Info("Failed to read pod", "pod", klog.KRef(podItem.namespace, podItem.name), "err", err)
  1123  		nc.podUpdateQueue.AddRateLimited(podItem)
  1124  		return
  1125  	}
  1126  
  1127  	nodeName := pod.Spec.NodeName
  1128  
  1129  	nodeHealth := nc.nodeHealthMap.getDeepCopy(nodeName)
  1130  	if nodeHealth == nil {
  1131  		// Node data is not gathered yet or node has been removed in the meantime.
  1132  		return
  1133  	}
  1134  
  1135  	_, err = nc.nodeLister.Get(nodeName)
  1136  	if err != nil {
  1137  		logger.Info("Failed to read node", "node", klog.KRef("", nodeName), "err", err)
  1138  		nc.podUpdateQueue.AddRateLimited(podItem)
  1139  		return
  1140  	}
  1141  
  1142  	_, currentReadyCondition := controllerutil.GetNodeCondition(nodeHealth.status, v1.NodeReady)
  1143  	if currentReadyCondition == nil {
   1144  		// Lack of a NodeReady condition may only happen after node addition (or if it was maliciously deleted).
   1145  		// In both cases, the pod will be handled correctly (evicted if needed) during processing
   1146  		// of the next node update event.
  1147  		return
  1148  	}
  1149  
  1150  	pods := []*v1.Pod{pod}
  1151  	if currentReadyCondition.Status != v1.ConditionTrue {
  1152  		if err := controllerutil.MarkPodsNotReady(ctx, nc.kubeClient, nc.recorder, pods, nodeName); err != nil {
  1153  			logger.Info("Unable to mark pod NotReady on node", "pod", klog.KRef(podItem.namespace, podItem.name), "node", klog.KRef("", nodeName), "err", err)
  1154  			nc.podUpdateQueue.AddRateLimited(podItem)
  1155  		}
  1156  	}
  1157  }
  1158  
  1159  func (nc *Controller) setLimiterInZone(zone string, zoneSize int, state ZoneState) {
  1160  	switch state {
  1161  	case stateNormal:
  1162  		nc.zoneNoExecuteTainter[zone].SwapLimiter(nc.evictionLimiterQPS)
  1163  	case statePartialDisruption:
  1164  		nc.zoneNoExecuteTainter[zone].SwapLimiter(
  1165  			nc.enterPartialDisruptionFunc(zoneSize))
  1166  	case stateFullDisruption:
  1167  		nc.zoneNoExecuteTainter[zone].SwapLimiter(
  1168  			nc.enterFullDisruptionFunc(zoneSize))
  1169  	}
  1170  }
  1171  
   1172  // classifyNodes classifies allNodes into three categories:
   1173  //  1. added: nodes that are in 'allNodes' but not in 'knownNodeSet'
   1174  //  2. deleted: nodes that are in 'knownNodeSet' but not in 'allNodes'
   1175  //  3. newZoneRepresentatives: nodes that are in both 'knownNodeSet' and 'allNodes' but have no zone state yet
  1176  func (nc *Controller) classifyNodes(allNodes []*v1.Node) (added, deleted, newZoneRepresentatives []*v1.Node) {
  1177  	for i := range allNodes {
  1178  		if _, has := nc.knownNodeSet[allNodes[i].Name]; !has {
  1179  			added = append(added, allNodes[i])
  1180  		} else {
  1181  			// Currently, we only consider new zone as updated.
  1182  			zone := nodetopology.GetZoneKey(allNodes[i])
  1183  			if _, found := nc.zoneStates[zone]; !found {
  1184  				newZoneRepresentatives = append(newZoneRepresentatives, allNodes[i])
  1185  			}
  1186  		}
  1187  	}
  1188  
  1189  	// If there's a difference between lengths of known Nodes and observed nodes
  1190  	// we must have removed some Node.
  1191  	if len(nc.knownNodeSet)+len(added) != len(allNodes) {
  1192  		knowSetCopy := map[string]*v1.Node{}
  1193  		for k, v := range nc.knownNodeSet {
  1194  			knowSetCopy[k] = v
  1195  		}
  1196  		for i := range allNodes {
  1197  			delete(knowSetCopy, allNodes[i].Name)
  1198  		}
  1199  		for i := range knowSetCopy {
  1200  			deleted = append(deleted, knowSetCopy[i])
  1201  		}
  1202  	}
  1203  	return
  1204  }
  1205  
  1206  // HealthyQPSFunc returns the default value for cluster eviction rate - we take
  1207  // nodeNum for consistency with ReducedQPSFunc.
  1208  func (nc *Controller) HealthyQPSFunc(nodeNum int) float32 {
  1209  	return nc.evictionLimiterQPS
  1210  }
  1211  
   1212  // ReducedQPSFunc returns the QPS to use when the cluster is large, to make
   1213  // evictions slower; if the cluster is small, it stops evictions altogether.
  1214  func (nc *Controller) ReducedQPSFunc(nodeNum int) float32 {
  1215  	if int32(nodeNum) > nc.largeClusterThreshold {
  1216  		return nc.secondaryEvictionLimiterQPS
  1217  	}
  1218  	return 0
  1219  }
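
// Worked example (illustrative, assuming evictionLimiterQPS=0.1,
// secondaryEvictionLimiterQPS=0.01 and largeClusterThreshold=50, the usual
// kube-controller-manager defaults): a partially disrupted zone with 100 nodes is
// tainted at 0.01 QPS (roughly one node every 100 seconds), while a partially
// disrupted zone with 40 nodes gets 0 QPS, i.e. NoExecute tainting stops entirely
// until the zone recovers.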
  1220  
   1221  // addPodEvictorForNewZone checks if a new zone appeared, and if so adds a new evictor.
  1222  func (nc *Controller) addPodEvictorForNewZone(logger klog.Logger, node *v1.Node) {
  1223  	nc.evictorLock.Lock()
  1224  	defer nc.evictorLock.Unlock()
  1225  	zone := nodetopology.GetZoneKey(node)
  1226  	if _, found := nc.zoneStates[zone]; !found {
  1227  		nc.zoneStates[zone] = stateInitial
  1228  		nc.zoneNoExecuteTainter[zone] =
  1229  			scheduler.NewRateLimitedTimedQueue(
  1230  				flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, scheduler.EvictionRateLimiterBurst))
  1231  		// Init the metric for the new zone.
  1232  		logger.Info("Initializing eviction metric for zone", "zone", zone)
  1233  		evictionsTotal.WithLabelValues(zone).Add(0)
  1234  	}
  1235  }
  1236  
  1237  func (nc *Controller) markNodeForTainting(node *v1.Node, status v1.ConditionStatus) bool {
  1238  	nc.evictorLock.Lock()
  1239  	defer nc.evictorLock.Unlock()
  1240  	if status == v1.ConditionFalse {
  1241  		if !taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) {
  1242  			nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Remove(node.Name)
  1243  		}
  1244  	}
  1245  
  1246  	if status == v1.ConditionUnknown {
  1247  		if !taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) {
  1248  			nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Remove(node.Name)
  1249  		}
  1250  	}
  1251  
  1252  	return nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Add(node.Name, string(node.UID))
  1253  }
  1254  
  1255  func (nc *Controller) markNodeAsReachable(ctx context.Context, node *v1.Node) (bool, error) {
  1256  	err := controller.RemoveTaintOffNode(ctx, nc.kubeClient, node.Name, node, UnreachableTaintTemplate)
  1257  	logger := klog.FromContext(ctx)
  1258  	if err != nil {
  1259  		logger.Error(err, "Failed to remove taint from node", "node", klog.KObj(node))
  1260  		return false, err
  1261  	}
  1262  	err = controller.RemoveTaintOffNode(ctx, nc.kubeClient, node.Name, node, NotReadyTaintTemplate)
  1263  	if err != nil {
  1264  		logger.Error(err, "Failed to remove taint from node", "node", klog.KObj(node))
  1265  		return false, err
  1266  	}
  1267  	nc.evictorLock.Lock()
  1268  	defer nc.evictorLock.Unlock()
  1269  
  1270  	return nc.zoneNoExecuteTainter[nodetopology.GetZoneKey(node)].Remove(node.Name), nil
  1271  }
  1272  
   1273  // ComputeZoneState takes a slice of NodeReadyConditions for all Nodes in a given zone and
   1274  // returns the number of not Ready Nodes and the zone state. The zone is considered:
   1275  // - fullyDisrupted if there are no Ready Nodes,
   1276  // - partiallyDisrupted if more than two Nodes are not Ready and at least nc.unhealthyZoneThreshold (as a fraction) of Nodes are not Ready,
   1277  // - normal otherwise
  1278  func (nc *Controller) ComputeZoneState(nodeReadyConditions []*v1.NodeCondition) (int, ZoneState) {
  1279  	readyNodes := 0
  1280  	notReadyNodes := 0
  1281  	for i := range nodeReadyConditions {
  1282  		if nodeReadyConditions[i] != nil && nodeReadyConditions[i].Status == v1.ConditionTrue {
  1283  			readyNodes++
  1284  		} else {
  1285  			notReadyNodes++
  1286  		}
  1287  	}
  1288  	switch {
  1289  	case readyNodes == 0 && notReadyNodes > 0:
  1290  		return notReadyNodes, stateFullDisruption
  1291  	case notReadyNodes > 2 && float32(notReadyNodes)/float32(notReadyNodes+readyNodes) >= nc.unhealthyZoneThreshold:
  1292  		return notReadyNodes, statePartialDisruption
  1293  	default:
  1294  		return notReadyNodes, stateNormal
  1295  	}
  1296  }
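
// Worked example (illustrative, with unhealthyZoneThreshold=0.55): in a zone with
// 10 reporting Nodes of which 6 are not Ready, 6/10 = 0.6 >= 0.55 and 6 > 2, so the
// zone is PartialDisruption. With only 2 of 10 not Ready the zone stays Normal, and
// a small zone needs more than two not-Ready Nodes before it can be considered
// partially disrupted at all. If none of the Nodes are Ready, the zone is FullDisruption.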
  1297  
  1298  // reconcileNodeLabels reconciles node labels.
  1299  func (nc *Controller) reconcileNodeLabels(ctx context.Context, nodeName string) error {
  1300  	node, err := nc.nodeLister.Get(nodeName)
  1301  	if err != nil {
  1302  		// If node not found, just ignore it.
  1303  		if apierrors.IsNotFound(err) {
  1304  			return nil
  1305  		}
  1306  		return err
  1307  	}
  1308  
  1309  	if node.Labels == nil {
  1310  		// Nothing to reconcile.
  1311  		return nil
  1312  	}
  1313  
  1314  	labelsToUpdate := map[string]string{}
  1315  	for _, r := range labelReconcileInfo {
  1316  		primaryValue, primaryExists := node.Labels[r.primaryKey]
  1317  		secondaryValue, secondaryExists := node.Labels[r.secondaryKey]
  1318  
  1319  		if !primaryExists {
  1320  			// The primary label key does not exist. This should not happen
  1321  			// within our supported version skew range, when no external
   1322  			// components/factors are modifying the node object. Ignore this case.
  1323  			continue
  1324  		}
  1325  		if secondaryExists && primaryValue != secondaryValue {
   1326  			// Secondary label exists, but is not consistent with the primary
  1327  			// label. Need to reconcile.
  1328  			labelsToUpdate[r.secondaryKey] = primaryValue
  1329  
  1330  		} else if !secondaryExists && r.ensureSecondaryExists {
  1331  			// Apply secondary label based on primary label.
  1332  			labelsToUpdate[r.secondaryKey] = primaryValue
  1333  		}
  1334  	}
  1335  
  1336  	if len(labelsToUpdate) == 0 {
  1337  		return nil
  1338  	}
  1339  	if !controllerutil.AddOrUpdateLabelsOnNode(ctx, nc.kubeClient, labelsToUpdate, node) {
   1340  		return fmt.Errorf("failed to update labels for node %+v", node)
  1341  	}
  1342  	return nil
  1343  }