k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/controller/daemon/daemon_controller.go

/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package daemon

import (
	"context"
	"fmt"
	"reflect"
	"sort"
	"sync"
	"time"

	apps "k8s.io/api/apps/v1"
	v1 "k8s.io/api/core/v1"
	apiequality "k8s.io/apimachinery/pkg/api/equality"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	utilerrors "k8s.io/apimachinery/pkg/util/errors"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	appsinformers "k8s.io/client-go/informers/apps/v1"
	coreinformers "k8s.io/client-go/informers/core/v1"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/scheme"
	unversionedapps "k8s.io/client-go/kubernetes/typed/apps/v1"
	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
	appslisters "k8s.io/client-go/listers/apps/v1"
	corelisters "k8s.io/client-go/listers/core/v1"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/record"
	"k8s.io/client-go/util/flowcontrol"
	"k8s.io/client-go/util/workqueue"
	v1helper "k8s.io/component-helpers/scheduling/corev1"
	"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
	"k8s.io/klog/v2"
	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/kubernetes/pkg/controller"
	"k8s.io/kubernetes/pkg/controller/daemon/util"
)

const (
	// BurstReplicas is a rate limiter for booting pods on a lot of nodes.
	// The value of 250 is chosen because values that are too high can cause registry DoS issues.
	BurstReplicas = 250

	// StatusUpdateRetries limits the number of retries if sending a status update to the API server fails.
	StatusUpdateRetries = 1

	// BackoffGCInterval is the time that has to pass before the next iteration of backoff GC is run.
	BackoffGCInterval = 1 * time.Minute
)

// Reasons for DaemonSet events
const (
	// SelectingAllReason is added to an event when a DaemonSet selects all Pods.
	SelectingAllReason = "SelectingAll"
	// FailedPlacementReason is added to an event when a DaemonSet can't schedule a Pod to a specified node.
	FailedPlacementReason = "FailedPlacement"
	// FailedDaemonPodReason is added to an event when the status of a Pod of a DaemonSet is 'Failed'.
	FailedDaemonPodReason = "FailedDaemonPod"
	// SucceededDaemonPodReason is added to an event when the status of a Pod of a DaemonSet is 'Succeeded'.
	SucceededDaemonPodReason = "SucceededDaemonPod"
)

// controllerKind contains the schema.GroupVersionKind for this controller type.
var controllerKind = apps.SchemeGroupVersion.WithKind("DaemonSet")

// DaemonSetsController is responsible for synchronizing DaemonSet objects stored
// in the system with actual running pods.
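//
// Each sync adopts or orphans pods via ControllerRef, creates or deletes daemon
// pods per node (manage), applies the configured update strategy, and then writes
// the DaemonSet status back to the API server. This is a summary of the sync flow
// implemented below.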
type DaemonSetsController struct {
	kubeClient clientset.Interface

	eventBroadcaster record.EventBroadcaster
	eventRecorder    record.EventRecorder

	podControl controller.PodControlInterface
	crControl  controller.ControllerRevisionControlInterface

	// A dsc is temporarily suspended after creating/deleting these many replicas.
	// It resumes normal action after observing the watch events for them.
	burstReplicas int

	// To allow injection of syncDaemonSet for testing.
	syncHandler func(ctx context.Context, dsKey string) error
	// used for unit testing
	enqueueDaemonSet func(ds *apps.DaemonSet)
	// A TTLCache of pod creates/deletes each ds expects to see
	expectations controller.ControllerExpectationsInterface
	// dsLister can list/get daemonsets from the shared informer's store
	dsLister appslisters.DaemonSetLister
	// dsStoreSynced returns true if the daemonset store has been synced at least once.
	// Added as a member to the struct to allow injection for testing.
	dsStoreSynced cache.InformerSynced
	// historyLister can list/get ControllerRevisions from the shared informer's store
	historyLister appslisters.ControllerRevisionLister
	// historyStoreSynced returns true if the history store has been synced at least once.
	// Added as a member to the struct to allow injection for testing.
	historyStoreSynced cache.InformerSynced
	// podLister can list/get pods from the shared informer's store
	podLister corelisters.PodLister
	// podStoreSynced returns true if the pod store has been synced at least once.
	// Added as a member to the struct to allow injection for testing.
	podStoreSynced cache.InformerSynced
	// nodeLister can list/get nodes from the shared informer's store
	nodeLister corelisters.NodeLister
	// nodeStoreSynced returns true if the node store has been synced at least once.
	// Added as a member to the struct to allow injection for testing.
	nodeStoreSynced cache.InformerSynced

	// DaemonSet keys that need to be synced.
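	// The queue deduplicates keys, so a DaemonSet that is enqueued many times while a
	// worker is busy is still synced only once, and failed syncs are retried with
	// rate-limited backoff via AddRateLimited.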
	queue workqueue.TypedRateLimitingInterface[string]

	failedPodsBackoff *flowcontrol.Backoff
}

// NewDaemonSetsController creates a new DaemonSetsController
func NewDaemonSetsController(
	ctx context.Context,
	daemonSetInformer appsinformers.DaemonSetInformer,
	historyInformer appsinformers.ControllerRevisionInformer,
	podInformer coreinformers.PodInformer,
	nodeInformer coreinformers.NodeInformer,
	kubeClient clientset.Interface,
	failedPodsBackoff *flowcontrol.Backoff,
) (*DaemonSetsController, error) {
	eventBroadcaster := record.NewBroadcaster(record.WithContext(ctx))
	logger := klog.FromContext(ctx)
	dsc := &DaemonSetsController{
		kubeClient:       kubeClient,
		eventBroadcaster: eventBroadcaster,
		eventRecorder:    eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "daemonset-controller"}),
		podControl: controller.RealPodControl{
			KubeClient: kubeClient,
			Recorder:   eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "daemonset-controller"}),
		},
		crControl: controller.RealControllerRevisionControl{
			KubeClient: kubeClient,
		},
		burstReplicas: BurstReplicas,
		expectations:  controller.NewControllerExpectations(),
		queue: workqueue.NewTypedRateLimitingQueueWithConfig(
			workqueue.DefaultTypedControllerRateLimiter[string](),
			workqueue.TypedRateLimitingQueueConfig[string]{
				Name: "daemonset",
			},
		),
	}

	daemonSetInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			dsc.addDaemonset(logger, obj)
		},
		UpdateFunc: func(oldObj, newObj interface{}) {
			dsc.updateDaemonset(logger, oldObj, newObj)
		},
		DeleteFunc: func(obj interface{}) {
			dsc.deleteDaemonset(logger, obj)
		},
	})
	dsc.dsLister = daemonSetInformer.Lister()
	dsc.dsStoreSynced = daemonSetInformer.Informer().HasSynced

	historyInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			dsc.addHistory(logger, obj)
		},
		UpdateFunc: func(oldObj, newObj interface{}) {
			dsc.updateHistory(logger, oldObj, newObj)
		},
		DeleteFunc: func(obj interface{}) {
			dsc.deleteHistory(logger, obj)
		},
	})
	dsc.historyLister = historyInformer.Lister()
	dsc.historyStoreSynced = historyInformer.Informer().HasSynced

	// Watch for creation/deletion of pods. The reason we watch is that we don't want a daemon set to create/delete
	// more pods until all the effects (expectations) of a daemon set's create/delete have been observed.
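	// In short: before mutating pods, the sync loop records how many creates/deletes it
	// expects (SetExpectations); the pod event handlers mark them observed
	// (CreationObserved/DeletionObserved); and manage/rollingUpdate only run again once
	// SatisfiedExpectations reports that all of them have arrived or timed out.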
	podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			dsc.addPod(logger, obj)
		},
		UpdateFunc: func(oldObj, newObj interface{}) {
			dsc.updatePod(logger, oldObj, newObj)
		},
		DeleteFunc: func(obj interface{}) {
			dsc.deletePod(logger, obj)
		},
	})
	dsc.podLister = podInformer.Lister()
	dsc.podStoreSynced = podInformer.Informer().HasSynced

	nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			dsc.addNode(logger, obj)
		},
		UpdateFunc: func(oldObj, newObj interface{}) {
			dsc.updateNode(logger, oldObj, newObj)
		},
	},
	)
	dsc.nodeStoreSynced = nodeInformer.Informer().HasSynced
	dsc.nodeLister = nodeInformer.Lister()

	dsc.syncHandler = dsc.syncDaemonSet
	dsc.enqueueDaemonSet = dsc.enqueue

	dsc.failedPodsBackoff = failedPodsBackoff

	return dsc, nil
}

func (dsc *DaemonSetsController) addDaemonset(logger klog.Logger, obj interface{}) {
	ds := obj.(*apps.DaemonSet)
	logger.V(4).Info("Adding daemon set", "daemonset", klog.KObj(ds))
	dsc.enqueueDaemonSet(ds)
}

func (dsc *DaemonSetsController) updateDaemonset(logger klog.Logger, old, cur interface{}) {
	oldDS := old.(*apps.DaemonSet)
	curDS := cur.(*apps.DaemonSet)

	// TODO: make a KEP and fix informers to always call the delete event handler on re-create
	if curDS.UID != oldDS.UID {
		key, err := controller.KeyFunc(oldDS)
		if err != nil {
			utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", oldDS, err))
			return
		}
		dsc.deleteDaemonset(logger, cache.DeletedFinalStateUnknown{
			Key: key,
			Obj: oldDS,
		})
	}

	logger.V(4).Info("Updating daemon set", "daemonset", klog.KObj(oldDS))
	dsc.enqueueDaemonSet(curDS)
}

func (dsc *DaemonSetsController) deleteDaemonset(logger klog.Logger, obj interface{}) {
	ds, ok := obj.(*apps.DaemonSet)
	if !ok {
		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %#v", obj))
			return
		}
		ds, ok = tombstone.Obj.(*apps.DaemonSet)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a DaemonSet %#v", obj))
			return
		}
	}
	logger.V(4).Info("Deleting daemon set", "daemonset", klog.KObj(ds))

	key, err := controller.KeyFunc(ds)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", ds, err))
		return
	}

	// Delete expectations for the DaemonSet so if we create a new one with the same name it starts clean
	dsc.expectations.DeleteExpectations(logger, key)

	dsc.queue.Add(key)
}

// Run begins watching and syncing daemon sets.
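//
// A minimal usage sketch (illustrative only; f and client stand for an already
// constructed SharedInformerFactory and clientset, and the real wiring lives in
// kube-controller-manager):
//
//	dsc, _ := NewDaemonSetsController(ctx,
//		f.Apps().V1().DaemonSets(), f.Apps().V1().ControllerRevisions(),
//		f.Core().V1().Pods(), f.Core().V1().Nodes(),
//		client, flowcontrol.NewBackOff(1*time.Second, 15*time.Minute))
//	f.Start(ctx.Done())
//	go dsc.Run(ctx, 2)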
func (dsc *DaemonSetsController) Run(ctx context.Context, workers int) {
	defer utilruntime.HandleCrash()

	dsc.eventBroadcaster.StartStructuredLogging(3)
	dsc.eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: dsc.kubeClient.CoreV1().Events("")})
	defer dsc.eventBroadcaster.Shutdown()

	defer dsc.queue.ShutDown()

	logger := klog.FromContext(ctx)
	logger.Info("Starting daemon sets controller")
	defer logger.Info("Shutting down daemon sets controller")

	if !cache.WaitForNamedCacheSync("daemon sets", ctx.Done(), dsc.podStoreSynced, dsc.nodeStoreSynced, dsc.historyStoreSynced, dsc.dsStoreSynced) {
		return
	}

	for i := 0; i < workers; i++ {
		go wait.UntilWithContext(ctx, dsc.runWorker, time.Second)
	}

	go wait.Until(dsc.failedPodsBackoff.GC, BackoffGCInterval, ctx.Done())

	<-ctx.Done()
}

func (dsc *DaemonSetsController) runWorker(ctx context.Context) {
	for dsc.processNextWorkItem(ctx) {
	}
}

// processNextWorkItem deals with one key off the queue. It returns false when it's time to quit.
func (dsc *DaemonSetsController) processNextWorkItem(ctx context.Context) bool {
	dsKey, quit := dsc.queue.Get()
	if quit {
		return false
	}
	defer dsc.queue.Done(dsKey)

	err := dsc.syncHandler(ctx, dsKey)
	if err == nil {
		dsc.queue.Forget(dsKey)
		return true
	}

	utilruntime.HandleError(fmt.Errorf("%v failed with: %v", dsKey, err))
	dsc.queue.AddRateLimited(dsKey)

	return true
}

func (dsc *DaemonSetsController) enqueue(ds *apps.DaemonSet) {
	key, err := controller.KeyFunc(ds)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", ds, err))
		return
	}

	// TODO: Handle overlapping controllers better. See comment in ReplicationManager.
	dsc.queue.Add(key)
}

func (dsc *DaemonSetsController) enqueueDaemonSetAfter(obj interface{}, after time.Duration) {
	key, err := controller.KeyFunc(obj)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %+v: %v", obj, err))
		return
	}

	// TODO: Handle overlapping controllers better. See comment in ReplicationManager.
	dsc.queue.AddAfter(key, after)
}

// getDaemonSetsForPod returns a list of DaemonSets that potentially match the pod.
func (dsc *DaemonSetsController) getDaemonSetsForPod(pod *v1.Pod) []*apps.DaemonSet {
	sets, err := dsc.dsLister.GetPodDaemonSets(pod)
	if err != nil {
		return nil
	}
	if len(sets) > 1 {
		// ControllerRef will ensure we don't do anything crazy, but more than one
		// item in this list nevertheless constitutes user error.
		utilruntime.HandleError(fmt.Errorf("user error! more than one daemon is selecting pods with labels: %+v", pod.Labels))
	}
	return sets
}

// getDaemonSetsForHistory returns a list of DaemonSets that potentially
// match a ControllerRevision.
func (dsc *DaemonSetsController) getDaemonSetsForHistory(logger klog.Logger, history *apps.ControllerRevision) []*apps.DaemonSet {
	daemonSets, err := dsc.dsLister.GetHistoryDaemonSets(history)
	if err != nil || len(daemonSets) == 0 {
		return nil
	}
	if len(daemonSets) > 1 {
		// ControllerRef will ensure we don't do anything crazy, but more than one
		// item in this list nevertheless constitutes user error.
		logger.V(4).Info("Found more than one DaemonSet selecting the ControllerRevision. This is potentially a user error",
			"controllerRevision", klog.KObj(history), "labels", history.Labels)
	}
	return daemonSets
}

// addHistory enqueues the DaemonSet that manages a ControllerRevision when the ControllerRevision is created
// or when the controller manager is restarted.
func (dsc *DaemonSetsController) addHistory(logger klog.Logger, obj interface{}) {
	history := obj.(*apps.ControllerRevision)
	if history.DeletionTimestamp != nil {
		// On a restart of the controller manager, it's possible for an object to
		// show up in a state that is already pending deletion.
		dsc.deleteHistory(logger, history)
		return
	}

	// If it has a ControllerRef, that's all that matters.
	if controllerRef := metav1.GetControllerOf(history); controllerRef != nil {
		ds := dsc.resolveControllerRef(history.Namespace, controllerRef)
		if ds == nil {
			return
		}
		logger.V(4).Info("Observed a ControllerRevision", "controllerRevision", klog.KObj(history))
		return
	}

	// Otherwise, it's an orphan. Get a list of all matching DaemonSets and sync
	// them to see if anyone wants to adopt it.
	daemonSets := dsc.getDaemonSetsForHistory(logger, history)
	if len(daemonSets) == 0 {
		return
	}
	logger.V(4).Info("Orphan ControllerRevision added", "controllerRevision", klog.KObj(history))
	for _, ds := range daemonSets {
		dsc.enqueueDaemonSet(ds)
	}
}

// updateHistory figures out what DaemonSet(s) manage a ControllerRevision when the ControllerRevision
// is updated and wakes them up. If anything of the ControllerRevision has changed, we need to awaken
// both the old and new DaemonSets.
func (dsc *DaemonSetsController) updateHistory(logger klog.Logger, old, cur interface{}) {
	curHistory := cur.(*apps.ControllerRevision)
	oldHistory := old.(*apps.ControllerRevision)
	if curHistory.ResourceVersion == oldHistory.ResourceVersion {
		// Periodic resync will send update events for all known ControllerRevisions.
		return
	}

	curControllerRef := metav1.GetControllerOf(curHistory)
	oldControllerRef := metav1.GetControllerOf(oldHistory)
	controllerRefChanged := !reflect.DeepEqual(curControllerRef, oldControllerRef)
	if controllerRefChanged && oldControllerRef != nil {
		// The ControllerRef was changed. Sync the old controller, if any.
		if ds := dsc.resolveControllerRef(oldHistory.Namespace, oldControllerRef); ds != nil {
			dsc.enqueueDaemonSet(ds)
		}
	}

	// If it has a ControllerRef, that's all that matters.
	if curControllerRef != nil {
		ds := dsc.resolveControllerRef(curHistory.Namespace, curControllerRef)
		if ds == nil {
			return
		}
		logger.V(4).Info("Observed an update to a ControllerRevision", "controllerRevision", klog.KObj(curHistory))
		dsc.enqueueDaemonSet(ds)
		return
	}

	// Otherwise, it's an orphan. If anything changed, sync matching controllers
	// to see if anyone wants to adopt it now.
	labelChanged := !reflect.DeepEqual(curHistory.Labels, oldHistory.Labels)
	if labelChanged || controllerRefChanged {
		daemonSets := dsc.getDaemonSetsForHistory(logger, curHistory)
		if len(daemonSets) == 0 {
			return
		}
		logger.V(4).Info("Orphan ControllerRevision updated", "controllerRevision", klog.KObj(curHistory))
		for _, ds := range daemonSets {
			dsc.enqueueDaemonSet(ds)
		}
	}
}

// deleteHistory enqueues the DaemonSet that manages a ControllerRevision when
// the ControllerRevision is deleted. obj could be an *app.ControllerRevision, or
// a DeletionFinalStateUnknown marker item.
func (dsc *DaemonSetsController) deleteHistory(logger klog.Logger, obj interface{}) {
	history, ok := obj.(*apps.ControllerRevision)

	// When a delete is dropped, the relist will notice a ControllerRevision in the store not
	// in the list, leading to the insertion of a tombstone object which contains
	// the deleted key/value. Note that this value might be stale. If the ControllerRevision
	// changed labels the new DaemonSet will not be woken up till the periodic resync.
	if !ok {
		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("Couldn't get object from tombstone %#v", obj))
			return
		}
		history, ok = tombstone.Obj.(*apps.ControllerRevision)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("Tombstone contained object that is not a ControllerRevision %#v", obj))
			return
		}
	}

	controllerRef := metav1.GetControllerOf(history)
	if controllerRef == nil {
		// No controller should care about orphans being deleted.
		return
	}
	ds := dsc.resolveControllerRef(history.Namespace, controllerRef)
	if ds == nil {
		return
	}
	logger.V(4).Info("ControllerRevision deleted", "controllerRevision", klog.KObj(history))
	dsc.enqueueDaemonSet(ds)
}

func (dsc *DaemonSetsController) addPod(logger klog.Logger, obj interface{}) {
	pod := obj.(*v1.Pod)

	if pod.DeletionTimestamp != nil {
		// on a restart of the controller manager, it's possible a new pod shows up in a state that
		// is already pending deletion. Prevent the pod from being a creation observation.
		dsc.deletePod(logger, pod)
		return
	}

	// If it has a ControllerRef, that's all that matters.
	if controllerRef := metav1.GetControllerOf(pod); controllerRef != nil {
		ds := dsc.resolveControllerRef(pod.Namespace, controllerRef)
		if ds == nil {
			return
		}
		dsKey, err := controller.KeyFunc(ds)
		if err != nil {
			return
		}
		logger.V(4).Info("Pod added", "pod", klog.KObj(pod))
		dsc.expectations.CreationObserved(logger, dsKey)
		dsc.enqueueDaemonSet(ds)
		return
	}

	// Otherwise, it's an orphan. Get a list of all matching DaemonSets and sync
	// them to see if anyone wants to adopt it.
	// DO NOT observe creation because no controller should be waiting for an
	// orphan.
	dss := dsc.getDaemonSetsForPod(pod)
	if len(dss) == 0 {
		return
	}
	logger.V(4).Info("Orphan Pod added", "pod", klog.KObj(pod))
	for _, ds := range dss {
		dsc.enqueueDaemonSet(ds)
	}
}

// When a pod is updated, figure out what sets manage it and wake them
// up. If the labels of the pod have changed we need to awaken both the old
// and new set. old and cur must be *v1.Pod types.
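// A readiness transition additionally schedules a delayed resync so that
// minReadySeconds-based availability is re-evaluated once the pod has been
// ready long enough (see the MinReadySeconds handling below).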
func (dsc *DaemonSetsController) updatePod(logger klog.Logger, old, cur interface{}) {
	curPod := cur.(*v1.Pod)
	oldPod := old.(*v1.Pod)
	if curPod.ResourceVersion == oldPod.ResourceVersion {
		// Periodic resync will send update events for all known pods.
		// Two different versions of the same pod will always have different RVs.
		return
	}

	if curPod.DeletionTimestamp != nil {
		// when a pod is deleted gracefully its deletion timestamp is first modified to reflect a grace period,
		// and after such time has passed, the kubelet actually deletes it from the store. We receive an update
		// for modification of the deletion timestamp and expect a ds to create more replicas asap, not wait
		// until the kubelet actually deletes the pod.
		dsc.deletePod(logger, curPod)
		return
	}

	curControllerRef := metav1.GetControllerOf(curPod)
	oldControllerRef := metav1.GetControllerOf(oldPod)
	controllerRefChanged := !reflect.DeepEqual(curControllerRef, oldControllerRef)
	if controllerRefChanged && oldControllerRef != nil {
		// The ControllerRef was changed. Sync the old controller, if any.
		if ds := dsc.resolveControllerRef(oldPod.Namespace, oldControllerRef); ds != nil {
			dsc.enqueueDaemonSet(ds)
		}
	}

	// If it has a ControllerRef, that's all that matters.
	if curControllerRef != nil {
		ds := dsc.resolveControllerRef(curPod.Namespace, curControllerRef)
		if ds == nil {
			return
		}
		logger.V(4).Info("Pod updated", "pod", klog.KObj(curPod))
		dsc.enqueueDaemonSet(ds)
		changedToReady := !podutil.IsPodReady(oldPod) && podutil.IsPodReady(curPod)
		// See https://github.com/kubernetes/kubernetes/pull/38076 for more details
		if changedToReady && ds.Spec.MinReadySeconds > 0 {
			// Add a second to avoid milliseconds skew in AddAfter.
			// See https://github.com/kubernetes/kubernetes/issues/39785#issuecomment-279959133 for more info.
			dsc.enqueueDaemonSetAfter(ds, (time.Duration(ds.Spec.MinReadySeconds)*time.Second)+time.Second)
		}
		return
	}

	// Otherwise, it's an orphan. If anything changed, sync matching controllers
	// to see if anyone wants to adopt it now.
	dss := dsc.getDaemonSetsForPod(curPod)
	if len(dss) == 0 {
		return
	}
	logger.V(4).Info("Orphan Pod updated", "pod", klog.KObj(curPod))
	labelChanged := !reflect.DeepEqual(curPod.Labels, oldPod.Labels)
	if labelChanged || controllerRefChanged {
		for _, ds := range dss {
			dsc.enqueueDaemonSet(ds)
		}
	}
}

func (dsc *DaemonSetsController) deletePod(logger klog.Logger, obj interface{}) {
	pod, ok := obj.(*v1.Pod)
	// When a delete is dropped, the relist will notice a pod in the store not
	// in the list, leading to the insertion of a tombstone object which contains
	// the deleted key/value. Note that this value might be stale. If the pod
	// changed labels the new daemonset will not be woken up till the periodic
	// resync.
	if !ok {
		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %#v", obj))
			return
		}
		pod, ok = tombstone.Obj.(*v1.Pod)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a pod %#v", obj))
			return
		}
	}

	controllerRef := metav1.GetControllerOf(pod)
	if controllerRef == nil {
		// No controller should care about orphans being deleted.
		return
	}
	ds := dsc.resolveControllerRef(pod.Namespace, controllerRef)
	if ds == nil {
		return
	}
	dsKey, err := controller.KeyFunc(ds)
	if err != nil {
		return
	}
	logger.V(4).Info("Pod deleted", "pod", klog.KObj(pod))
	dsc.expectations.DeletionObserved(logger, dsKey)
	dsc.enqueueDaemonSet(ds)
}

func (dsc *DaemonSetsController) addNode(logger klog.Logger, obj interface{}) {
	// TODO: it'd be nice to pass a hint with these enqueues, so that each ds would only examine the added node (unless it has other work to do, too).
	dsList, err := dsc.dsLister.List(labels.Everything())
	if err != nil {
		logger.V(4).Info("Error enqueueing daemon sets", "err", err)
		return
	}
	node := obj.(*v1.Node)
	for _, ds := range dsList {
		if shouldRun, _ := NodeShouldRunDaemonPod(node, ds); shouldRun {
			dsc.enqueueDaemonSet(ds)
		}
	}
}

// shouldIgnoreNodeUpdate returns true if Node labels and taints have not changed, otherwise returns false.
// If other calling functions need to use other properties of Node, shouldIgnoreNodeUpdate needs to be updated.
func shouldIgnoreNodeUpdate(oldNode, curNode v1.Node) bool {
	return apiequality.Semantic.DeepEqual(oldNode.Labels, curNode.Labels) &&
		apiequality.Semantic.DeepEqual(oldNode.Spec.Taints, curNode.Spec.Taints)
}

func (dsc *DaemonSetsController) updateNode(logger klog.Logger, old, cur interface{}) {
	oldNode := old.(*v1.Node)
	curNode := cur.(*v1.Node)
	if shouldIgnoreNodeUpdate(*oldNode, *curNode) {
		return
	}

	dsList, err := dsc.dsLister.List(labels.Everything())
	if err != nil {
		logger.V(4).Info("Error listing daemon sets", "err", err)
		return
	}
	// TODO: it'd be nice to pass a hint with these enqueues, so that each ds would only examine the updated node (unless it has other work to do, too).
	for _, ds := range dsList {
		// If NodeShouldRunDaemonPod needs to use properties of the node other than Labels and Taints
		// (which are mutable), shouldIgnoreNodeUpdate needs to be updated as well.
		oldShouldRun, oldShouldContinueRunning := NodeShouldRunDaemonPod(oldNode, ds)
		currentShouldRun, currentShouldContinueRunning := NodeShouldRunDaemonPod(curNode, ds)
		if (oldShouldRun != currentShouldRun) || (oldShouldContinueRunning != currentShouldContinueRunning) {
			dsc.enqueueDaemonSet(ds)
		}
	}
}

// getDaemonPods returns daemon pods owned by the given ds.
// This also reconciles ControllerRef by adopting/orphaning.
// Note that returned Pods are pointers to objects in the cache.
// If you want to modify one, you need to deep-copy it first.
func (dsc *DaemonSetsController) getDaemonPods(ctx context.Context, ds *apps.DaemonSet) ([]*v1.Pod, error) {
	selector, err := metav1.LabelSelectorAsSelector(ds.Spec.Selector)
	if err != nil {
		return nil, err
	}

	// List all pods to include those that don't match the selector anymore but
	// have a ControllerRef pointing to this controller.
	pods, err := dsc.podLister.Pods(ds.Namespace).List(labels.Everything())
	if err != nil {
		return nil, err
	}
	// If any adoptions are attempted, we should first recheck for deletion with
	// an uncached quorum read sometime after listing Pods (see #42639).
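	// RecheckDeletionTimestamp wraps this fresh GET as the canAdopt check, so we never
	// adopt pods on behalf of a DaemonSet that is being deleted, or that was deleted and
	// recreated under the same name (the UID check below catches the recreate case).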
	dsNotDeleted := controller.RecheckDeletionTimestamp(func(ctx context.Context) (metav1.Object, error) {
		fresh, err := dsc.kubeClient.AppsV1().DaemonSets(ds.Namespace).Get(ctx, ds.Name, metav1.GetOptions{})
		if err != nil {
			return nil, err
		}
		if fresh.UID != ds.UID {
			return nil, fmt.Errorf("original DaemonSet %v/%v is gone: got uid %v, wanted %v", ds.Namespace, ds.Name, fresh.UID, ds.UID)
		}
		return fresh, nil
	})

	// Use ControllerRefManager to adopt/orphan as needed.
	cm := controller.NewPodControllerRefManager(dsc.podControl, ds, selector, controllerKind, dsNotDeleted)
	return cm.ClaimPods(ctx, pods)
}

// getNodesToDaemonPods returns a map from nodes to daemon pods (corresponding to ds) created for the nodes.
// This also reconciles ControllerRef by adopting/orphaning.
// Note that returned Pods are pointers to objects in the cache.
// If you want to modify one, you need to deep-copy it first.
func (dsc *DaemonSetsController) getNodesToDaemonPods(ctx context.Context, ds *apps.DaemonSet, includeDeletedTerminal bool) (map[string][]*v1.Pod, error) {
	claimedPods, err := dsc.getDaemonPods(ctx, ds)
	if err != nil {
		return nil, err
	}
	// Group Pods by Node name.
	nodeToDaemonPods := make(map[string][]*v1.Pod)
	logger := klog.FromContext(ctx)
	for _, pod := range claimedPods {
		if !includeDeletedTerminal && podutil.IsPodTerminal(pod) && pod.DeletionTimestamp != nil {
			// This Pod has a finalizer or is already scheduled for deletion from the
			// store by the kubelet or the Pod GC. The DS controller doesn't have
			// anything else to do with it.
			continue
		}
		nodeName, err := util.GetTargetNodeName(pod)
		if err != nil {
			logger.V(4).Info("Failed to get target node name of Pod in DaemonSet",
				"pod", klog.KObj(pod), "daemonset", klog.KObj(ds))
			continue
		}

		nodeToDaemonPods[nodeName] = append(nodeToDaemonPods[nodeName], pod)
	}

	return nodeToDaemonPods, nil
}

// resolveControllerRef returns the controller referenced by a ControllerRef,
// or nil if the ControllerRef could not be resolved to a matching controller
// of the correct Kind.
func (dsc *DaemonSetsController) resolveControllerRef(namespace string, controllerRef *metav1.OwnerReference) *apps.DaemonSet {
	// We can't look up by UID, so look up by Name and then verify UID.
	// Don't even try to look up by Name if it's the wrong Kind.
	if controllerRef.Kind != controllerKind.Kind {
		return nil
	}
	ds, err := dsc.dsLister.DaemonSets(namespace).Get(controllerRef.Name)
	if err != nil {
		return nil
	}
	if ds.UID != controllerRef.UID {
		// The controller we found with this Name is not the same one that the
		// ControllerRef points to.
		return nil
	}
	return ds
}

// podsShouldBeOnNode figures out the DaemonSet pods to be created and deleted on the given node:
//   - nodesNeedingDaemonPods: the nodes on which a daemon pod needs to be started
//   - podsToDelete: the pods that need to be deleted on the node
func (dsc *DaemonSetsController) podsShouldBeOnNode(
	logger klog.Logger,
	node *v1.Node,
	nodeToDaemonPods map[string][]*v1.Pod,
	ds *apps.DaemonSet,
	hash string,
) (nodesNeedingDaemonPods, podsToDelete []string) {

	shouldRun, shouldContinueRunning := NodeShouldRunDaemonPod(node, ds)
	daemonPods, exists := nodeToDaemonPods[node.Name]

	switch {
	case shouldRun && !exists:
		// If daemon pod is supposed to be running on node, but isn't, create daemon pod.
		nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, node.Name)
	case shouldContinueRunning:
		// If a daemon pod failed, delete it.
		// If no daemon pod is left on this node, a new one will be created in the next sync loop.
		var daemonPodsRunning []*v1.Pod
		for _, pod := range daemonPods {
			if pod.DeletionTimestamp != nil {
				continue
			}
			if pod.Status.Phase == v1.PodFailed {
				// This is a critical place where DS is often fighting with kubelet that rejects pods.
				// We need to avoid hot looping and backoff.
				backoffKey := failedPodsBackoffKey(ds, node.Name)

				now := dsc.failedPodsBackoff.Clock.Now()
				inBackoff := dsc.failedPodsBackoff.IsInBackOffSinceUpdate(backoffKey, now)
				if inBackoff {
					delay := dsc.failedPodsBackoff.Get(backoffKey)
					logger.V(4).Info("Deleting failed pod on node has been limited by backoff",
						"pod", klog.KObj(pod), "node", klog.KObj(node), "currentDelay", delay)
					dsc.enqueueDaemonSetAfter(ds, delay)
					continue
				}

				dsc.failedPodsBackoff.Next(backoffKey, now)

				msg := fmt.Sprintf("Found failed daemon pod %s/%s on node %s, will try to kill it", pod.Namespace, pod.Name, node.Name)
				logger.V(2).Info("Found failed daemon pod on node, will try to kill it", "pod", klog.KObj(pod), "node", klog.KObj(node))
				// Emit an event so that it's discoverable to users.
				dsc.eventRecorder.Eventf(ds, v1.EventTypeWarning, FailedDaemonPodReason, msg)
				podsToDelete = append(podsToDelete, pod.Name)
			} else if pod.Status.Phase == v1.PodSucceeded {
				msg := fmt.Sprintf("Found succeeded daemon pod %s/%s on node %s, will try to delete it", pod.Namespace, pod.Name, node.Name)
				logger.V(2).Info("Found succeeded daemon pod on node, will try to delete it", "pod", klog.KObj(pod), "node", klog.KObj(node))
				// Emit an event so that it's discoverable to users.
				dsc.eventRecorder.Eventf(ds, v1.EventTypeNormal, SucceededDaemonPodReason, msg)
				podsToDelete = append(podsToDelete, pod.Name)
			} else {
				daemonPodsRunning = append(daemonPodsRunning, pod)
			}
		}

		// When surge is not enabled, if there is more than 1 running pod on a node, delete all but the oldest.
		if !util.AllowsSurge(ds) {
			if len(daemonPodsRunning) <= 1 {
				// There are no excess pods to be pruned, and no pods to create.
				break
			}

			sort.Sort(podByCreationTimestampAndPhase(daemonPodsRunning))
			for i := 1; i < len(daemonPodsRunning); i++ {
				podsToDelete = append(podsToDelete, daemonPodsRunning[i].Name)
			}
			break
		}

		if len(daemonPodsRunning) <= 1 {
			// There are no excess pods to be pruned.
			if len(daemonPodsRunning) == 0 && shouldRun {
				// We are surging so we need to have at least one non-deleted pod on the node.
				nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, node.Name)
			}
			break
		}

		// When surge is enabled, we allow 2 pods if and only if the oldest pod matching the current hash state
		// is not ready AND the oldest pod that doesn't match the current hash state is ready. All other pods are
		// deleted. If neither pod is ready, only the one matching the current hash revision is kept.
		var oldestNewPod, oldestOldPod *v1.Pod
		sort.Sort(podByCreationTimestampAndPhase(daemonPodsRunning))
		for _, pod := range daemonPodsRunning {
			if pod.Labels[apps.ControllerRevisionHashLabelKey] == hash {
				if oldestNewPod == nil {
					oldestNewPod = pod
					continue
				}
			} else {
				if oldestOldPod == nil {
					oldestOldPod = pod
					continue
				}
			}
			podsToDelete = append(podsToDelete, pod.Name)
		}
		if oldestNewPod != nil && oldestOldPod != nil {
			switch {
			case !podutil.IsPodReady(oldestOldPod):
				logger.V(5).Info("Pod from daemonset is no longer ready and will be replaced with newer pod", "oldPod", klog.KObj(oldestOldPod), "daemonset", klog.KObj(ds), "newPod", klog.KObj(oldestNewPod))
				podsToDelete = append(podsToDelete, oldestOldPod.Name)
			case podutil.IsPodAvailable(oldestNewPod, ds.Spec.MinReadySeconds, metav1.Time{Time: dsc.failedPodsBackoff.Clock.Now()}):
				logger.V(5).Info("Pod from daemonset is now ready and will replace older pod", "newPod", klog.KObj(oldestNewPod), "daemonset", klog.KObj(ds), "oldPod", klog.KObj(oldestOldPod))
				podsToDelete = append(podsToDelete, oldestOldPod.Name)
			}
		}

	case !shouldContinueRunning && exists:
		// If daemon pod isn't supposed to run on node, but it is, delete all daemon pods on node.
		for _, pod := range daemonPods {
			if pod.DeletionTimestamp != nil {
				continue
			}
			podsToDelete = append(podsToDelete, pod.Name)
		}
	}

	return nodesNeedingDaemonPods, podsToDelete
}

func (dsc *DaemonSetsController) updateDaemonSet(ctx context.Context, ds *apps.DaemonSet, nodeList []*v1.Node, hash, key string, old []*apps.ControllerRevision) error {
	err := dsc.manage(ctx, ds, nodeList, hash)
	if err != nil {
		return err
	}

	// Process rolling updates if we're ready.
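	// For the OnDelete strategy there is nothing to do here: pods are only replaced
	// after the user deletes them, so that case in the switch below is intentionally
	// empty. Only RollingUpdate actively deletes old-hash pods via rollingUpdate.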
	if dsc.expectations.SatisfiedExpectations(klog.FromContext(ctx), key) {
		switch ds.Spec.UpdateStrategy.Type {
		case apps.OnDeleteDaemonSetStrategyType:
		case apps.RollingUpdateDaemonSetStrategyType:
			err = dsc.rollingUpdate(ctx, ds, nodeList, hash)
		}
		if err != nil {
			return err
		}
	}

	err = dsc.cleanupHistory(ctx, ds, old)
	if err != nil {
		return fmt.Errorf("failed to clean up revisions of DaemonSet: %w", err)
	}

	return nil
}

// manage manages the scheduling and running of Pods of ds on nodes.
// After figuring out which nodes should run a Pod of ds but are not yet running one and
// which nodes should not run a Pod of ds but currently are running one, it calls function
// syncNodes with a list of pods to remove and a list of nodes to run a Pod of ds.
func (dsc *DaemonSetsController) manage(ctx context.Context, ds *apps.DaemonSet, nodeList []*v1.Node, hash string) error {
	// Find out the pods which are created for the nodes by DaemonSet.
	nodeToDaemonPods, err := dsc.getNodesToDaemonPods(ctx, ds, false)
	if err != nil {
		return fmt.Errorf("couldn't get node to daemon pod mapping for daemon set %q: %v", ds.Name, err)
	}

	// For each node, if the node is running the daemon pod but isn't supposed to, kill the daemon
	// pod. If the node is supposed to run the daemon pod, but isn't, create the daemon pod on the node.
	logger := klog.FromContext(ctx)
	var nodesNeedingDaemonPods, podsToDelete []string
	for _, node := range nodeList {
		nodesNeedingDaemonPodsOnNode, podsToDeleteOnNode := dsc.podsShouldBeOnNode(
			logger, node, nodeToDaemonPods, ds, hash)

		nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, nodesNeedingDaemonPodsOnNode...)
		podsToDelete = append(podsToDelete, podsToDeleteOnNode...)
	}

	// Remove unscheduled pods assigned to nodes that no longer exist when daemonset pods are scheduled by the scheduler.
	// If a node doesn't exist, its pods are never scheduled and can't be deleted by PodGCController.
	podsToDelete = append(podsToDelete, getUnscheduledPodsWithoutNode(nodeList, nodeToDaemonPods)...)

	// Label new pods using the hash label value of the current history when creating them
	if err = dsc.syncNodes(ctx, ds, podsToDelete, nodesNeedingDaemonPods, hash); err != nil {
		return err
	}

	return nil
}

// syncNodes deletes the given pods and creates new daemon set pods on the given nodes;
// it returns an aggregate of any errors encountered.
func (dsc *DaemonSetsController) syncNodes(ctx context.Context, ds *apps.DaemonSet, podsToDelete, nodesNeedingDaemonPods []string, hash string) error {
	// We need to set expectations before creating/deleting pods to avoid race conditions.
	logger := klog.FromContext(ctx)
	dsKey, err := controller.KeyFunc(ds)
	if err != nil {
		return fmt.Errorf("couldn't get key for object %#v: %v", ds, err)
	}

	createDiff := len(nodesNeedingDaemonPods)
	deleteDiff := len(podsToDelete)

	if createDiff > dsc.burstReplicas {
		createDiff = dsc.burstReplicas
	}
	if deleteDiff > dsc.burstReplicas {
		deleteDiff = dsc.burstReplicas
	}

	dsc.expectations.SetExpectations(logger, dsKey, createDiff, deleteDiff)

	// Error channel to communicate back failures; make the buffer big enough to avoid any blocking.
	errCh := make(chan error, createDiff+deleteDiff)

	logger.V(4).Info("Nodes needing daemon pods for daemon set, creating", "daemonset", klog.KObj(ds), "needCount", nodesNeedingDaemonPods, "createCount", createDiff)
	createWait := sync.WaitGroup{}
	// If the returned error is not nil we have a parse error.
	// The controller handles this via the hash.
	generation, err := util.GetTemplateGeneration(ds)
	if err != nil {
		generation = nil
	}
	template := util.CreatePodTemplate(ds.Spec.Template, generation, hash)
	// Batch the pod creates. Batch sizes start at SlowStartInitialBatchSize
	// and double with each successful iteration in a kind of "slow start".
	// This handles attempts to start large numbers of pods that would
	// likely all fail with the same error. For example a project with a
	// low quota that attempts to create a large number of pods will be
	// prevented from spamming the API service with the pod create requests
	// after one of its pods fails. Conveniently, this also prevents the
	// event spam that those failures would generate.
	batchSize := min(createDiff, controller.SlowStartInitialBatchSize)
	for pos := 0; createDiff > pos; batchSize, pos = min(2*batchSize, createDiff-(pos+batchSize)), pos+batchSize {
		errorCount := len(errCh)
		createWait.Add(batchSize)
		for i := pos; i < pos+batchSize; i++ {
			go func(ix int) {
				defer createWait.Done()

				podTemplate := template.DeepCopy()
				// The pod's NodeAffinity will be updated to make sure the Pod is bound
				// to the target node by default scheduler. It is safe to do so because there
				// should be no conflicting node affinity with the target node.
				podTemplate.Spec.Affinity = util.ReplaceDaemonSetPodNodeNameNodeAffinity(
					podTemplate.Spec.Affinity, nodesNeedingDaemonPods[ix])

				err := dsc.podControl.CreatePods(ctx, ds.Namespace, podTemplate,
					ds, metav1.NewControllerRef(ds, controllerKind))

				if err != nil {
					if apierrors.HasStatusCause(err, v1.NamespaceTerminatingCause) {
						// If the namespace is being torn down, we can safely ignore
						// this error since all subsequent creations will fail.
						return
					}
				}
				if err != nil {
					logger.V(2).Info("Failed creation, decrementing expectations for daemon set", "daemonset", klog.KObj(ds))
					dsc.expectations.CreationObserved(logger, dsKey)
					errCh <- err
					utilruntime.HandleError(err)
				}
			}(i)
		}
		createWait.Wait()
		// Any skipped pods that we never attempted to start shouldn't be expected.
		skippedPods := createDiff - (batchSize + pos)
		if errorCount < len(errCh) && skippedPods > 0 {
			logger.V(2).Info("Slow-start failure. Skipping creation of pods, decrementing expectations for daemon set", "skippedPods", skippedPods, "daemonset", klog.KObj(ds))
			dsc.expectations.LowerExpectations(logger, dsKey, skippedPods, 0)
			// The skipped pods will be retried later. The next controller resync will
			// retry the slow start process.
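			// For example, with createDiff == 10 and an initial batch size of 1, the
			// batches are 1, 2, 4 and then the remaining 3 pods, assuming every batch
			// succeeds; the first failing batch stops the ramp-up here.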
			break
		}
	}

	logger.V(4).Info("Pods to delete for daemon set, deleting", "daemonset", klog.KObj(ds), "toDeleteCount", podsToDelete, "deleteCount", deleteDiff)
	deleteWait := sync.WaitGroup{}
	deleteWait.Add(deleteDiff)
	for i := 0; i < deleteDiff; i++ {
		go func(ix int) {
			defer deleteWait.Done()
			if err := dsc.podControl.DeletePod(ctx, ds.Namespace, podsToDelete[ix], ds); err != nil {
				dsc.expectations.DeletionObserved(logger, dsKey)
				if !apierrors.IsNotFound(err) {
					logger.V(2).Info("Failed deletion, decremented expectations for daemon set", "daemonset", klog.KObj(ds))
					errCh <- err
					utilruntime.HandleError(err)
				}
			}
		}(i)
	}
	deleteWait.Wait()

	// Collect errors if any for proper reporting/retry logic in the controller.
	errors := []error{}
	close(errCh)
	for err := range errCh {
		errors = append(errors, err)
	}
	return utilerrors.NewAggregate(errors)
}

func storeDaemonSetStatus(
	ctx context.Context,
	dsClient unversionedapps.DaemonSetInterface,
	ds *apps.DaemonSet,
	desiredNumberScheduled,
	currentNumberScheduled,
	numberMisscheduled,
	numberReady,
	updatedNumberScheduled,
	numberAvailable,
	numberUnavailable int,
	updateObservedGen bool) error {
	if int(ds.Status.DesiredNumberScheduled) == desiredNumberScheduled &&
		int(ds.Status.CurrentNumberScheduled) == currentNumberScheduled &&
		int(ds.Status.NumberMisscheduled) == numberMisscheduled &&
		int(ds.Status.NumberReady) == numberReady &&
		int(ds.Status.UpdatedNumberScheduled) == updatedNumberScheduled &&
		int(ds.Status.NumberAvailable) == numberAvailable &&
		int(ds.Status.NumberUnavailable) == numberUnavailable &&
		ds.Status.ObservedGeneration >= ds.Generation {
		return nil
	}

	toUpdate := ds.DeepCopy()

	var updateErr, getErr error
	for i := 0; ; i++ {
		if updateObservedGen {
			toUpdate.Status.ObservedGeneration = ds.Generation
		}
		toUpdate.Status.DesiredNumberScheduled = int32(desiredNumberScheduled)
		toUpdate.Status.CurrentNumberScheduled = int32(currentNumberScheduled)
		toUpdate.Status.NumberMisscheduled = int32(numberMisscheduled)
		toUpdate.Status.NumberReady = int32(numberReady)
		toUpdate.Status.UpdatedNumberScheduled = int32(updatedNumberScheduled)
		toUpdate.Status.NumberAvailable = int32(numberAvailable)
		toUpdate.Status.NumberUnavailable = int32(numberUnavailable)

		if _, updateErr = dsClient.UpdateStatus(ctx, toUpdate, metav1.UpdateOptions{}); updateErr == nil {
			return nil
		}

		// Stop retrying if we exceed StatusUpdateRetries - the DaemonSet will be requeued with a rate limit.
		if i >= StatusUpdateRetries {
			break
		}
		// Update the set with the latest resource version for the next poll.
		if toUpdate, getErr = dsClient.Get(ctx, ds.Name, metav1.GetOptions{}); getErr != nil {
			// If the GET fails we can't trust the DaemonSet status anymore. This error
			// is bound to be more interesting than the update failure.
			return getErr
		}
	}
	return updateErr
}

func (dsc *DaemonSetsController) updateDaemonSetStatus(ctx context.Context, ds *apps.DaemonSet, nodeList []*v1.Node, hash string, updateObservedGen bool) error {
	logger := klog.FromContext(ctx)
	logger.V(4).Info("Updating daemon set status")
	nodeToDaemonPods, err := dsc.getNodesToDaemonPods(ctx, ds, false)
	if err != nil {
		return fmt.Errorf("couldn't get node to daemon pod mapping for daemon set %q: %v", ds.Name, err)
	}

	var desiredNumberScheduled, currentNumberScheduled, numberMisscheduled, numberReady, updatedNumberScheduled, numberAvailable int
	now := dsc.failedPodsBackoff.Clock.Now()
	for _, node := range nodeList {
		shouldRun, _ := NodeShouldRunDaemonPod(node, ds)
		scheduled := len(nodeToDaemonPods[node.Name]) > 0

		if shouldRun {
			desiredNumberScheduled++
			if !scheduled {
				continue
			}

			currentNumberScheduled++
			// Sort the daemon pods by creation time, so that the oldest is first.
			daemonPods := nodeToDaemonPods[node.Name]
			sort.Sort(podByCreationTimestampAndPhase(daemonPods))
			pod := daemonPods[0]
			if podutil.IsPodReady(pod) {
				numberReady++
				if podutil.IsPodAvailable(pod, ds.Spec.MinReadySeconds, metav1.Time{Time: now}) {
					numberAvailable++
				}
			}
			// If the returned error is not nil we have a parse error.
			// The controller handles this via the hash.
			generation, err := util.GetTemplateGeneration(ds)
			if err != nil {
				generation = nil
			}
			if util.IsPodUpdated(pod, hash, generation) {
				updatedNumberScheduled++
			}
		} else {
			if scheduled {
				numberMisscheduled++
			}
		}
	}
	numberUnavailable := desiredNumberScheduled - numberAvailable

	err = storeDaemonSetStatus(ctx, dsc.kubeClient.AppsV1().DaemonSets(ds.Namespace), ds, desiredNumberScheduled, currentNumberScheduled, numberMisscheduled, numberReady, updatedNumberScheduled, numberAvailable, numberUnavailable, updateObservedGen)
	if err != nil {
		return fmt.Errorf("error storing status for daemon set %#v: %w", ds, err)
	}

	// Resync the DaemonSet after MinReadySeconds as a last line of defense to guard against clock-skew.
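	// For example, with MinReadySeconds: 30, a pod that just became ready counts toward
	// numberReady but not yet toward numberAvailable, so we requeue after 30s to recompute
	// availability even if no further pod events arrive.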
	if ds.Spec.MinReadySeconds > 0 && numberReady != numberAvailable {
		dsc.enqueueDaemonSetAfter(ds, time.Duration(ds.Spec.MinReadySeconds)*time.Second)
	}
	return nil
}

func (dsc *DaemonSetsController) syncDaemonSet(ctx context.Context, key string) error {
	logger := klog.FromContext(ctx)
	startTime := dsc.failedPodsBackoff.Clock.Now()

	defer func() {
		logger.V(4).Info("Finished syncing daemon set", "daemonset", key, "time", dsc.failedPodsBackoff.Clock.Now().Sub(startTime))
	}()

	namespace, name, err := cache.SplitMetaNamespaceKey(key)
	if err != nil {
		return err
	}
	ds, err := dsc.dsLister.DaemonSets(namespace).Get(name)
	if apierrors.IsNotFound(err) {
		logger.V(3).Info("Daemon set has been deleted", "daemonset", key)
		dsc.expectations.DeleteExpectations(logger, key)
		return nil
	}
	if err != nil {
		return fmt.Errorf("unable to retrieve ds %v from store: %v", key, err)
	}

	nodeList, err := dsc.nodeLister.List(labels.Everything())
	if err != nil {
		return fmt.Errorf("couldn't get list of nodes when syncing daemon set %#v: %v", ds, err)
	}

	everything := metav1.LabelSelector{}
	if reflect.DeepEqual(ds.Spec.Selector, &everything) {
		dsc.eventRecorder.Eventf(ds, v1.EventTypeWarning, SelectingAllReason, "This daemon set is selecting all pods. A non-empty selector is required.")
		return nil
	}

	// Don't process a daemon set until all its creations and deletions have been processed.
	// For example if daemon set foo asked for 3 new daemon pods in the previous call to manage,
	// then we do not want to call manage on foo until the daemon pods have been created.
	dsKey, err := controller.KeyFunc(ds)
	if err != nil {
		return fmt.Errorf("couldn't get key for object %#v: %v", ds, err)
	}

	// If the DaemonSet is being deleted (either by foreground deletion or
	// orphan deletion), we cannot be sure if the DaemonSet history objects
	// it owned still exist -- those history objects can either be deleted
	// or orphaned. Garbage collector doesn't guarantee that it will delete
	// DaemonSet pods before deleting DaemonSet history objects, because
	// DaemonSet history doesn't own DaemonSet pods. We cannot reliably
	// calculate the status of a DaemonSet being deleted. Therefore, return
	// here without updating status for the DaemonSet being deleted.
	if ds.DeletionTimestamp != nil {
		return nil
	}

	// Construct histories of the DaemonSet, and get the hash of current history
	cur, old, err := dsc.constructHistory(ctx, ds)
	if err != nil {
		return fmt.Errorf("failed to construct revisions of DaemonSet: %v", err)
	}
	hash := cur.Labels[apps.DefaultDaemonSetUniqueLabelKey]

	if !dsc.expectations.SatisfiedExpectations(logger, dsKey) {
		// Only update status. Don't raise observedGeneration since controller didn't process object of that generation.
		return dsc.updateDaemonSetStatus(ctx, ds, nodeList, hash, false)
	}

	err = dsc.updateDaemonSet(ctx, ds, nodeList, hash, dsKey, old)
	statusErr := dsc.updateDaemonSetStatus(ctx, ds, nodeList, hash, true)
	switch {
	case err != nil && statusErr != nil:
		// If there was an error, and we failed to update status,
		// log it and return the original error.
		logger.Error(statusErr, "Failed to update status", "daemonSet", klog.KObj(ds))
		return err
	case err != nil:
		return err
	case statusErr != nil:
		return statusErr
	}

	return nil
}

// NodeShouldRunDaemonPod checks a set of preconditions against a (node, daemonset) pair and returns a
// summary. Returned booleans are:
//   - shouldRun:
//     Returns true when a daemonset should run on the node if a daemonset pod is not already
//     running on that node.
//   - shouldContinueRunning:
//     Returns true when a daemonset should continue running on a node if a daemonset pod is already
//     running on that node.
func NodeShouldRunDaemonPod(node *v1.Node, ds *apps.DaemonSet) (bool, bool) {
	pod := NewPod(ds, node.Name)

	// If the daemon set specifies a node name, check that it matches with node.Name.
	if !(ds.Spec.Template.Spec.NodeName == "" || ds.Spec.Template.Spec.NodeName == node.Name) {
		return false, false
	}

	taints := node.Spec.Taints
	fitsNodeName, fitsNodeAffinity, fitsTaints := predicates(pod, node, taints)
	if !fitsNodeName || !fitsNodeAffinity {
		return false, false
	}

	if !fitsTaints {
		// Scheduled daemon pods should continue running if they tolerate the NoExecute taints.
		_, hasUntoleratedTaint := v1helper.FindMatchingUntoleratedTaint(taints, pod.Spec.Tolerations, func(t *v1.Taint) bool {
			return t.Effect == v1.TaintEffectNoExecute
		})
		return false, !hasUntoleratedTaint
	}

	return true, true
}

// predicates checks if a DaemonSet's pod can run on a node.
func predicates(pod *v1.Pod, node *v1.Node, taints []v1.Taint) (fitsNodeName, fitsNodeAffinity, fitsTaints bool) {
	fitsNodeName = len(pod.Spec.NodeName) == 0 || pod.Spec.NodeName == node.Name
	// Ignore parsing errors for backwards compatibility.
	fitsNodeAffinity, _ = nodeaffinity.GetRequiredNodeAffinity(pod).Match(node)
	_, hasUntoleratedTaint := v1helper.FindMatchingUntoleratedTaint(taints, pod.Spec.Tolerations, func(t *v1.Taint) bool {
		return t.Effect == v1.TaintEffectNoExecute || t.Effect == v1.TaintEffectNoSchedule
	})
	fitsTaints = !hasUntoleratedTaint
	return
}

// NewPod creates a new daemon pod for the given DaemonSet, targeted at nodeName.
func NewPod(ds *apps.DaemonSet, nodeName string) *v1.Pod {
	newPod := &v1.Pod{Spec: ds.Spec.Template.Spec, ObjectMeta: ds.Spec.Template.ObjectMeta}
	newPod.Namespace = ds.Namespace
	newPod.Spec.NodeName = nodeName

	// Add default tolerations for DaemonSet pods.
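	// These typically include tolerations for node.kubernetes.io/not-ready and
	// unreachable (NoExecute) as well as several node-pressure NoSchedule taints,
	// so daemon pods keep running on nodes that other workloads would be evicted
	// from; see util.AddOrUpdateDaemonPodTolerations for the exact list.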
	util.AddOrUpdateDaemonPodTolerations(&newPod.Spec)

	return newPod
}

type podByCreationTimestampAndPhase []*v1.Pod

func (o podByCreationTimestampAndPhase) Len() int      { return len(o) }
func (o podByCreationTimestampAndPhase) Swap(i, j int) { o[i], o[j] = o[j], o[i] }

func (o podByCreationTimestampAndPhase) Less(i, j int) bool {
	// Scheduled Pod first
	if len(o[i].Spec.NodeName) != 0 && len(o[j].Spec.NodeName) == 0 {
		return true
	}

	if len(o[i].Spec.NodeName) == 0 && len(o[j].Spec.NodeName) != 0 {
		return false
	}

	if o[i].CreationTimestamp.Equal(&o[j].CreationTimestamp) {
		return o[i].Name < o[j].Name
	}
	return o[i].CreationTimestamp.Before(&o[j].CreationTimestamp)
}

func failedPodsBackoffKey(ds *apps.DaemonSet, nodeName string) string {
	return fmt.Sprintf("%s/%d/%s", ds.UID, ds.Status.ObservedGeneration, nodeName)
}

// getUnscheduledPodsWithoutNode returns a list of unscheduled pods assigned to nodes that no longer exist.
// Returned pods can't be deleted by PodGCController so they should be deleted by DaemonSetController.
func getUnscheduledPodsWithoutNode(runningNodesList []*v1.Node, nodeToDaemonPods map[string][]*v1.Pod) []string {
	var results []string
	isNodeRunning := make(map[string]bool, len(runningNodesList))
	for _, node := range runningNodesList {
		isNodeRunning[node.Name] = true
	}

	for n, pods := range nodeToDaemonPods {
		if isNodeRunning[n] {
			continue
		}
		for _, pod := range pods {
			if len(pod.Spec.NodeName) == 0 {
				results = append(results, pod.Name)
			}
		}
	}

	return results
}