k8s.io/kubernetes@v1.29.3/pkg/controller/daemon/daemon_controller.go

k8s.io/kubernetes@v1.29.3/pkg/controller/daemon/daemon_controller.go (about)

     1  /*
     2  Copyright 2015 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package daemon
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"reflect"
    23  	"sort"
    24  	"sync"
    25  	"time"
    26  
    27  	"k8s.io/klog/v2"
    28  
    29  	apps "k8s.io/api/apps/v1"
    30  	v1 "k8s.io/api/core/v1"
    31  	apiequality "k8s.io/apimachinery/pkg/api/equality"
    32  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    33  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    34  	"k8s.io/apimachinery/pkg/labels"
    35  	utilerrors "k8s.io/apimachinery/pkg/util/errors"
    36  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    37  	"k8s.io/apimachinery/pkg/util/wait"
    38  	appsinformers "k8s.io/client-go/informers/apps/v1"
    39  	coreinformers "k8s.io/client-go/informers/core/v1"
    40  	clientset "k8s.io/client-go/kubernetes"
    41  	"k8s.io/client-go/kubernetes/scheme"
    42  	unversionedapps "k8s.io/client-go/kubernetes/typed/apps/v1"
    43  	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
    44  	appslisters "k8s.io/client-go/listers/apps/v1"
    45  	corelisters "k8s.io/client-go/listers/core/v1"
    46  	"k8s.io/client-go/tools/cache"
    47  	"k8s.io/client-go/tools/record"
    48  	"k8s.io/client-go/util/flowcontrol"
    49  	"k8s.io/client-go/util/workqueue"
    50  	v1helper "k8s.io/component-helpers/scheduling/corev1"
    51  	"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
    52  	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
    53  	"k8s.io/kubernetes/pkg/controller"
    54  	"k8s.io/kubernetes/pkg/controller/daemon/util"
    55  	"k8s.io/utils/integer"
    56  )
    57  
const (
	// BurstReplicas is the value the constructor assigns to the controller's
	// burstReplicas field: the number of pod creations/deletions after which
	// the controller pauses until it has observed the matching watch events.
	// The value of 250 is chosen b/c values that are too high can cause registry DoS issues.
	BurstReplicas = 250

	// StatusUpdateRetries limits the number of retries if sending a status update to the API server fails.
	StatusUpdateRetries = 1

	// BackoffGCInterval is the time that has to pass before the next iteration of backoff GC is run.
	// Run uses it as the period for calling failedPodsBackoff.GC.
	BackoffGCInterval = 1 * time.Minute
)
    69  
// Reasons for DaemonSet events. Each constant is the reason string attached to
// an event emitted for a DaemonSet.
const (
	// SelectingAllReason is added to an event when a DaemonSet selects all Pods.
	SelectingAllReason = "SelectingAll"
	// FailedPlacementReason is added to an event when a DaemonSet can't schedule a Pod to a specified node.
	FailedPlacementReason = "FailedPlacement"
	// FailedDaemonPodReason is added to an event when the status of a Pod of a DaemonSet is 'Failed'.
	FailedDaemonPodReason = "FailedDaemonPod"
	// SucceededDaemonPodReason is added to an event when the status of a Pod of a DaemonSet is 'Succeeded'.
	SucceededDaemonPodReason = "SucceededDaemonPod"
)
    81  
// controllerKind contains the schema.GroupVersionKind for this controller type
// (apps/v1 DaemonSet). It is handed to the pod ControllerRefManager when
// claiming pods for a DaemonSet.
var controllerKind = apps.SchemeGroupVersion.WithKind("DaemonSet")
    84  
// DaemonSetsController is responsible for synchronizing DaemonSet objects stored
// in the system with actual running pods.
type DaemonSetsController struct {
	// kubeClient talks to the API server; it backs the event sink started in
	// Run and the uncached DaemonSet read performed before adopting pods.
	kubeClient clientset.Interface

	// eventBroadcaster is started in Run and shut down when Run returns;
	// eventRecorder is a recorder created from it with the
	// "daemonset-controller" component as the event source.
	eventBroadcaster record.EventBroadcaster
	eventRecorder    record.EventRecorder

	// podControl is passed to the pod ControllerRefManager when claiming pods.
	// crControl is the ControllerRevision counterpart built from kubeClient.
	podControl controller.PodControlInterface
	crControl  controller.ControllerRevisionControlInterface

	// A dsc is temporarily suspended after creating/deleting this many replicas.
	// It resumes normal action after observing the watch events for them.
	burstReplicas int

	// To allow injection of syncDaemonSet for testing.
	syncHandler func(ctx context.Context, dsKey string) error
	// enqueueDaemonSet adds a DaemonSet to the work queue. It defaults to
	// dsc.enqueue and is a field so it can be replaced in unit tests.
	enqueueDaemonSet func(ds *apps.DaemonSet)
	// A TTLCache of pod creates/deletes each ds expects to see
	expectations controller.ControllerExpectationsInterface
	// dsLister can list/get daemonsets from the shared informer's store
	dsLister appslisters.DaemonSetLister
	// dsStoreSynced returns true if the daemonset store has been synced at least once.
	// Added as a member to the struct to allow injection for testing.
	dsStoreSynced cache.InformerSynced
	// historyLister can list/get ControllerRevisions (history) from the shared informer's store
	historyLister appslisters.ControllerRevisionLister
	// historyStoreSynced returns true if the history store has been synced at least once.
	// Added as a member to the struct to allow injection for testing.
	historyStoreSynced cache.InformerSynced
	// podLister can list/get pods from the shared informer's store
	podLister corelisters.PodLister
	// podStoreSynced returns true if the pod store has been synced at least once.
	// Added as a member to the struct to allow injection for testing.
	podStoreSynced cache.InformerSynced
	// nodeLister can list/get nodes from the shared informer's store
	nodeLister corelisters.NodeLister
	// nodeStoreSynced returns true if the node store has been synced at least once.
	// Added as a member to the struct to allow injection for testing.
	nodeStoreSynced cache.InformerSynced

	// DaemonSet keys that need to be synced.
	queue workqueue.RateLimitingInterface

	// failedPodsBackoff is supplied by the caller of NewDaemonSetsController;
	// Run garbage-collects it every BackoffGCInterval.
	// NOTE(review): presumably throttles handling of failed daemon pods — confirm
	// against the code that consumes it.
	failedPodsBackoff *flowcontrol.Backoff
}
   132  
   133  // NewDaemonSetsController creates a new DaemonSetsController
   134  func NewDaemonSetsController(
   135  	ctx context.Context,
   136  	daemonSetInformer appsinformers.DaemonSetInformer,
   137  	historyInformer appsinformers.ControllerRevisionInformer,
   138  	podInformer coreinformers.PodInformer,
   139  	nodeInformer coreinformers.NodeInformer,
   140  	kubeClient clientset.Interface,
   141  	failedPodsBackoff *flowcontrol.Backoff,
   142  ) (*DaemonSetsController, error) {
   143  	eventBroadcaster := record.NewBroadcaster()
   144  	logger := klog.FromContext(ctx)
   145  	dsc := &DaemonSetsController{
   146  		kubeClient:       kubeClient,
   147  		eventBroadcaster: eventBroadcaster,
   148  		eventRecorder:    eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "daemonset-controller"}),
   149  		podControl: controller.RealPodControl{
   150  			KubeClient: kubeClient,
   151  			Recorder:   eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "daemonset-controller"}),
   152  		},
   153  		crControl: controller.RealControllerRevisionControl{
   154  			KubeClient: kubeClient,
   155  		},
   156  		burstReplicas: BurstReplicas,
   157  		expectations:  controller.NewControllerExpectations(),
   158  		queue:         workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "daemonset"),
   159  	}
   160  
   161  	daemonSetInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   162  		AddFunc: func(obj interface{}) {
   163  			dsc.addDaemonset(logger, obj)
   164  		},
   165  		UpdateFunc: func(oldObj, newObj interface{}) {
   166  			dsc.updateDaemonset(logger, oldObj, newObj)
   167  		},
   168  		DeleteFunc: func(obj interface{}) {
   169  			dsc.deleteDaemonset(logger, obj)
   170  		},
   171  	})
   172  	dsc.dsLister = daemonSetInformer.Lister()
   173  	dsc.dsStoreSynced = daemonSetInformer.Informer().HasSynced
   174  
   175  	historyInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   176  		AddFunc: func(obj interface{}) {
   177  			dsc.addHistory(logger, obj)
   178  		},
   179  		UpdateFunc: func(oldObj, newObj interface{}) {
   180  			dsc.updateHistory(logger, oldObj, newObj)
   181  		},
   182  		DeleteFunc: func(obj interface{}) {
   183  			dsc.deleteHistory(logger, obj)
   184  		},
   185  	})
   186  	dsc.historyLister = historyInformer.Lister()
   187  	dsc.historyStoreSynced = historyInformer.Informer().HasSynced
   188  
   189  	// Watch for creation/deletion of pods. The reason we watch is that we don't want a daemon set to create/delete
   190  	// more pods until all the effects (expectations) of a daemon set's create/delete have been observed.
   191  	podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   192  		AddFunc: func(obj interface{}) {
   193  			dsc.addPod(logger, obj)
   194  		},
   195  		UpdateFunc: func(oldObj, newObj interface{}) {
   196  			dsc.updatePod(logger, oldObj, newObj)
   197  		},
   198  		DeleteFunc: func(obj interface{}) {
   199  			dsc.deletePod(logger, obj)
   200  		},
   201  	})
   202  	dsc.podLister = podInformer.Lister()
   203  	dsc.podStoreSynced = podInformer.Informer().HasSynced
   204  
   205  	nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   206  		AddFunc: func(obj interface{}) {
   207  			dsc.addNode(logger, obj)
   208  		},
   209  		UpdateFunc: func(oldObj, newObj interface{}) {
   210  			dsc.updateNode(logger, oldObj, newObj)
   211  		},
   212  	},
   213  	)
   214  	dsc.nodeStoreSynced = nodeInformer.Informer().HasSynced
   215  	dsc.nodeLister = nodeInformer.Lister()
   216  
   217  	dsc.syncHandler = dsc.syncDaemonSet
   218  	dsc.enqueueDaemonSet = dsc.enqueue
   219  
   220  	dsc.failedPodsBackoff = failedPodsBackoff
   221  
   222  	return dsc, nil
   223  }
   224  
   225  func (dsc *DaemonSetsController) addDaemonset(logger klog.Logger, obj interface{}) {
   226  	ds := obj.(*apps.DaemonSet)
   227  	logger.V(4).Info("Adding daemon set", "daemonset", klog.KObj(ds))
   228  	dsc.enqueueDaemonSet(ds)
   229  }
   230  
   231  func (dsc *DaemonSetsController) updateDaemonset(logger klog.Logger, cur, old interface{}) {
   232  	oldDS := old.(*apps.DaemonSet)
   233  	curDS := cur.(*apps.DaemonSet)
   234  
   235  	// TODO: make a KEP and fix informers to always call the delete event handler on re-create
   236  	if curDS.UID != oldDS.UID {
   237  		key, err := controller.KeyFunc(oldDS)
   238  		if err != nil {
   239  			utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", oldDS, err))
   240  			return
   241  		}
   242  		dsc.deleteDaemonset(logger, cache.DeletedFinalStateUnknown{
   243  			Key: key,
   244  			Obj: oldDS,
   245  		})
   246  	}
   247  
   248  	logger.V(4).Info("Updating daemon set", "daemonset", klog.KObj(oldDS))
   249  	dsc.enqueueDaemonSet(curDS)
   250  }
   251  
   252  func (dsc *DaemonSetsController) deleteDaemonset(logger klog.Logger, obj interface{}) {
   253  	ds, ok := obj.(*apps.DaemonSet)
   254  	if !ok {
   255  		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
   256  		if !ok {
   257  			utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %#v", obj))
   258  			return
   259  		}
   260  		ds, ok = tombstone.Obj.(*apps.DaemonSet)
   261  		if !ok {
   262  			utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a DaemonSet %#v", obj))
   263  			return
   264  		}
   265  	}
   266  	logger.V(4).Info("Deleting daemon set", "daemonset", klog.KObj(ds))
   267  
   268  	key, err := controller.KeyFunc(ds)
   269  	if err != nil {
   270  		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", ds, err))
   271  		return
   272  	}
   273  
   274  	// Delete expectations for the DaemonSet so if we create a new one with the same name it starts clean
   275  	dsc.expectations.DeleteExpectations(logger, key)
   276  
   277  	dsc.queue.Add(key)
   278  }
   279  
// Run begins watching and syncing daemon sets. It starts event recording,
// waits for the pod, node, history and daemon set caches to sync, launches
// `workers` worker goroutines plus the failed-pods backoff GC loop, and then
// blocks until ctx is cancelled.
func (dsc *DaemonSetsController) Run(ctx context.Context, workers int) {
	defer utilruntime.HandleCrash()

	// Start logging events and sending them to the API server; the broadcaster
	// is shut down when Run returns.
	dsc.eventBroadcaster.StartStructuredLogging(0)
	dsc.eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: dsc.kubeClient.CoreV1().Events("")})
	defer dsc.eventBroadcaster.Shutdown()

	// Deferred calls run in reverse order, so on exit the shutdown message is
	// logged first, then the queue is shut down, then the broadcaster.
	defer dsc.queue.ShutDown()

	logger := klog.FromContext(ctx)
	logger.Info("Starting daemon sets controller")
	defer logger.Info("Shutting down daemon sets controller")

	// Do not start any worker until every informer cache has synced; give up
	// if the wait reports failure.
	if !cache.WaitForNamedCacheSync("daemon sets", ctx.Done(), dsc.podStoreSynced, dsc.nodeStoreSynced, dsc.historyStoreSynced, dsc.dsStoreSynced) {
		return
	}

	// Each worker runs runWorker under wait.UntilWithContext with a one second period.
	for i := 0; i < workers; i++ {
		go wait.UntilWithContext(ctx, dsc.runWorker, time.Second)
	}

	// Periodically garbage-collect the failed-pods backoff until ctx is done.
	go wait.Until(dsc.failedPodsBackoff.GC, BackoffGCInterval, ctx.Done())

	// Block until shutdown is requested.
	<-ctx.Done()
}
   306  
   307  func (dsc *DaemonSetsController) runWorker(ctx context.Context) {
   308  	for dsc.processNextWorkItem(ctx) {
   309  	}
   310  }
   311  
   312  // processNextWorkItem deals with one key off the queue.  It returns false when it's time to quit.
   313  func (dsc *DaemonSetsController) processNextWorkItem(ctx context.Context) bool {
   314  	dsKey, quit := dsc.queue.Get()
   315  	if quit {
   316  		return false
   317  	}
   318  	defer dsc.queue.Done(dsKey)
   319  
   320  	err := dsc.syncHandler(ctx, dsKey.(string))
   321  	if err == nil {
   322  		dsc.queue.Forget(dsKey)
   323  		return true
   324  	}
   325  
   326  	utilruntime.HandleError(fmt.Errorf("%v failed with : %v", dsKey, err))
   327  	dsc.queue.AddRateLimited(dsKey)
   328  
   329  	return true
   330  }
   331  
   332  func (dsc *DaemonSetsController) enqueue(ds *apps.DaemonSet) {
   333  	key, err := controller.KeyFunc(ds)
   334  	if err != nil {
   335  		utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %#v: %v", ds, err))
   336  		return
   337  	}
   338  
   339  	// TODO: Handle overlapping controllers better. See comment in ReplicationManager.
   340  	dsc.queue.Add(key)
   341  }
   342  
   343  func (dsc *DaemonSetsController) enqueueDaemonSetAfter(obj interface{}, after time.Duration) {
   344  	key, err := controller.KeyFunc(obj)
   345  	if err != nil {
   346  		utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err))
   347  		return
   348  	}
   349  
   350  	// TODO: Handle overlapping controllers better. See comment in ReplicationManager.
   351  	dsc.queue.AddAfter(key, after)
   352  }
   353  
   354  // getDaemonSetsForPod returns a list of DaemonSets that potentially match the pod.
   355  func (dsc *DaemonSetsController) getDaemonSetsForPod(pod *v1.Pod) []*apps.DaemonSet {
   356  	sets, err := dsc.dsLister.GetPodDaemonSets(pod)
   357  	if err != nil {
   358  		return nil
   359  	}
   360  	if len(sets) > 1 {
   361  		// ControllerRef will ensure we don't do anything crazy, but more than one
   362  		// item in this list nevertheless constitutes user error.
   363  		utilruntime.HandleError(fmt.Errorf("user error! more than one daemon is selecting pods with labels: %+v", pod.Labels))
   364  	}
   365  	return sets
   366  }
   367  
   368  // getDaemonSetsForHistory returns a list of DaemonSets that potentially
   369  // match a ControllerRevision.
   370  func (dsc *DaemonSetsController) getDaemonSetsForHistory(logger klog.Logger, history *apps.ControllerRevision) []*apps.DaemonSet {
   371  	daemonSets, err := dsc.dsLister.GetHistoryDaemonSets(history)
   372  	if err != nil || len(daemonSets) == 0 {
   373  		return nil
   374  	}
   375  	if len(daemonSets) > 1 {
   376  		// ControllerRef will ensure we don't do anything crazy, but more than one
   377  		// item in this list nevertheless constitutes user error.
   378  		logger.V(4).Info("Found more than one DaemonSet selecting the ControllerRevision. This is potentially a user error",
   379  			"controllerRevision", klog.KObj(history), "labels", history.Labels)
   380  	}
   381  	return daemonSets
   382  }
   383  
   384  // addHistory enqueues the DaemonSet that manages a ControllerRevision when the ControllerRevision is created
   385  // or when the controller manager is restarted.
   386  func (dsc *DaemonSetsController) addHistory(logger klog.Logger, obj interface{}) {
   387  	history := obj.(*apps.ControllerRevision)
   388  	if history.DeletionTimestamp != nil {
   389  		// On a restart of the controller manager, it's possible for an object to
   390  		// show up in a state that is already pending deletion.
   391  		dsc.deleteHistory(logger, history)
   392  		return
   393  	}
   394  
   395  	// If it has a ControllerRef, that's all that matters.
   396  	if controllerRef := metav1.GetControllerOf(history); controllerRef != nil {
   397  		ds := dsc.resolveControllerRef(history.Namespace, controllerRef)
   398  		if ds == nil {
   399  			return
   400  		}
   401  		logger.V(4).Info("Observed a ControllerRevision", "controllerRevision", klog.KObj(history))
   402  		return
   403  	}
   404  
   405  	// Otherwise, it's an orphan. Get a list of all matching DaemonSets and sync
   406  	// them to see if anyone wants to adopt it.
   407  	daemonSets := dsc.getDaemonSetsForHistory(logger, history)
   408  	if len(daemonSets) == 0 {
   409  		return
   410  	}
   411  	logger.V(4).Info("Orphan ControllerRevision added", "controllerRevision", klog.KObj(history))
   412  	for _, ds := range daemonSets {
   413  		dsc.enqueueDaemonSet(ds)
   414  	}
   415  }
   416  
   417  // updateHistory figures out what DaemonSet(s) manage a ControllerRevision when the ControllerRevision
   418  // is updated and wake them up. If anything of the ControllerRevision has changed, we need to  awaken
   419  // both the old and new DaemonSets.
   420  func (dsc *DaemonSetsController) updateHistory(logger klog.Logger, old, cur interface{}) {
   421  	curHistory := cur.(*apps.ControllerRevision)
   422  	oldHistory := old.(*apps.ControllerRevision)
   423  	if curHistory.ResourceVersion == oldHistory.ResourceVersion {
   424  		// Periodic resync will send update events for all known ControllerRevisions.
   425  		return
   426  	}
   427  
   428  	curControllerRef := metav1.GetControllerOf(curHistory)
   429  	oldControllerRef := metav1.GetControllerOf(oldHistory)
   430  	controllerRefChanged := !reflect.DeepEqual(curControllerRef, oldControllerRef)
   431  	if controllerRefChanged && oldControllerRef != nil {
   432  		// The ControllerRef was changed. Sync the old controller, if any.
   433  		if ds := dsc.resolveControllerRef(oldHistory.Namespace, oldControllerRef); ds != nil {
   434  			dsc.enqueueDaemonSet(ds)
   435  		}
   436  	}
   437  
   438  	// If it has a ControllerRef, that's all that matters.
   439  	if curControllerRef != nil {
   440  		ds := dsc.resolveControllerRef(curHistory.Namespace, curControllerRef)
   441  		if ds == nil {
   442  			return
   443  		}
   444  		logger.V(4).Info("Observed an update to a ControllerRevision", "controllerRevision", klog.KObj(curHistory))
   445  		dsc.enqueueDaemonSet(ds)
   446  		return
   447  	}
   448  
   449  	// Otherwise, it's an orphan. If anything changed, sync matching controllers
   450  	// to see if anyone wants to adopt it now.
   451  	labelChanged := !reflect.DeepEqual(curHistory.Labels, oldHistory.Labels)
   452  	if labelChanged || controllerRefChanged {
   453  		daemonSets := dsc.getDaemonSetsForHistory(logger, curHistory)
   454  		if len(daemonSets) == 0 {
   455  			return
   456  		}
   457  		logger.V(4).Info("Orphan ControllerRevision updated", "controllerRevision", klog.KObj(curHistory))
   458  		for _, ds := range daemonSets {
   459  			dsc.enqueueDaemonSet(ds)
   460  		}
   461  	}
   462  }
   463  
   464  // deleteHistory enqueues the DaemonSet that manages a ControllerRevision when
   465  // the ControllerRevision is deleted. obj could be an *app.ControllerRevision, or
   466  // a DeletionFinalStateUnknown marker item.
   467  func (dsc *DaemonSetsController) deleteHistory(logger klog.Logger, obj interface{}) {
   468  	history, ok := obj.(*apps.ControllerRevision)
   469  
   470  	// When a delete is dropped, the relist will notice a ControllerRevision in the store not
   471  	// in the list, leading to the insertion of a tombstone object which contains
   472  	// the deleted key/value. Note that this value might be stale. If the ControllerRevision
   473  	// changed labels the new DaemonSet will not be woken up till the periodic resync.
   474  	if !ok {
   475  		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
   476  		if !ok {
   477  			utilruntime.HandleError(fmt.Errorf("Couldn't get object from tombstone %#v", obj))
   478  			return
   479  		}
   480  		history, ok = tombstone.Obj.(*apps.ControllerRevision)
   481  		if !ok {
   482  			utilruntime.HandleError(fmt.Errorf("Tombstone contained object that is not a ControllerRevision %#v", obj))
   483  			return
   484  		}
   485  	}
   486  
   487  	controllerRef := metav1.GetControllerOf(history)
   488  	if controllerRef == nil {
   489  		// No controller should care about orphans being deleted.
   490  		return
   491  	}
   492  	ds := dsc.resolveControllerRef(history.Namespace, controllerRef)
   493  	if ds == nil {
   494  		return
   495  	}
   496  	logger.V(4).Info("ControllerRevision deleted", "controllerRevision", klog.KObj(history))
   497  	dsc.enqueueDaemonSet(ds)
   498  }
   499  
   500  func (dsc *DaemonSetsController) addPod(logger klog.Logger, obj interface{}) {
   501  	pod := obj.(*v1.Pod)
   502  
   503  	if pod.DeletionTimestamp != nil {
   504  		// on a restart of the controller manager, it's possible a new pod shows up in a state that
   505  		// is already pending deletion. Prevent the pod from being a creation observation.
   506  		dsc.deletePod(logger, pod)
   507  		return
   508  	}
   509  
   510  	// If it has a ControllerRef, that's all that matters.
   511  	if controllerRef := metav1.GetControllerOf(pod); controllerRef != nil {
   512  		ds := dsc.resolveControllerRef(pod.Namespace, controllerRef)
   513  		if ds == nil {
   514  			return
   515  		}
   516  		dsKey, err := controller.KeyFunc(ds)
   517  		if err != nil {
   518  			return
   519  		}
   520  		logger.V(4).Info("Pod added", "pod", klog.KObj(pod))
   521  		dsc.expectations.CreationObserved(logger, dsKey)
   522  		dsc.enqueueDaemonSet(ds)
   523  		return
   524  	}
   525  
   526  	// Otherwise, it's an orphan. Get a list of all matching DaemonSets and sync
   527  	// them to see if anyone wants to adopt it.
   528  	// DO NOT observe creation because no controller should be waiting for an
   529  	// orphan.
   530  	dss := dsc.getDaemonSetsForPod(pod)
   531  	if len(dss) == 0 {
   532  		return
   533  	}
   534  	logger.V(4).Info("Orphan Pod added", "pod", klog.KObj(pod))
   535  	for _, ds := range dss {
   536  		dsc.enqueueDaemonSet(ds)
   537  	}
   538  }
   539  
   540  // When a pod is updated, figure out what sets manage it and wake them
   541  // up. If the labels of the pod have changed we need to awaken both the old
   542  // and new set. old and cur must be *v1.Pod types.
   543  func (dsc *DaemonSetsController) updatePod(logger klog.Logger, old, cur interface{}) {
   544  	curPod := cur.(*v1.Pod)
   545  	oldPod := old.(*v1.Pod)
   546  	if curPod.ResourceVersion == oldPod.ResourceVersion {
   547  		// Periodic resync will send update events for all known pods.
   548  		// Two different versions of the same pod will always have different RVs.
   549  		return
   550  	}
   551  
   552  	if curPod.DeletionTimestamp != nil {
   553  		// when a pod is deleted gracefully its deletion timestamp is first modified to reflect a grace period,
   554  		// and after such time has passed, the kubelet actually deletes it from the store. We receive an update
   555  		// for modification of the deletion timestamp and expect an ds to create more replicas asap, not wait
   556  		// until the kubelet actually deletes the pod.
   557  		dsc.deletePod(logger, curPod)
   558  		return
   559  	}
   560  
   561  	curControllerRef := metav1.GetControllerOf(curPod)
   562  	oldControllerRef := metav1.GetControllerOf(oldPod)
   563  	controllerRefChanged := !reflect.DeepEqual(curControllerRef, oldControllerRef)
   564  	if controllerRefChanged && oldControllerRef != nil {
   565  		// The ControllerRef was changed. Sync the old controller, if any.
   566  		if ds := dsc.resolveControllerRef(oldPod.Namespace, oldControllerRef); ds != nil {
   567  			dsc.enqueueDaemonSet(ds)
   568  		}
   569  	}
   570  
   571  	// If it has a ControllerRef, that's all that matters.
   572  	if curControllerRef != nil {
   573  		ds := dsc.resolveControllerRef(curPod.Namespace, curControllerRef)
   574  		if ds == nil {
   575  			return
   576  		}
   577  		logger.V(4).Info("Pod updated", "pod", klog.KObj(curPod))
   578  		dsc.enqueueDaemonSet(ds)
   579  		changedToReady := !podutil.IsPodReady(oldPod) && podutil.IsPodReady(curPod)
   580  		// See https://github.com/kubernetes/kubernetes/pull/38076 for more details
   581  		if changedToReady && ds.Spec.MinReadySeconds > 0 {
   582  			// Add a second to avoid milliseconds skew in AddAfter.
   583  			// See https://github.com/kubernetes/kubernetes/issues/39785#issuecomment-279959133 for more info.
   584  			dsc.enqueueDaemonSetAfter(ds, (time.Duration(ds.Spec.MinReadySeconds)*time.Second)+time.Second)
   585  		}
   586  		return
   587  	}
   588  
   589  	// Otherwise, it's an orphan. If anything changed, sync matching controllers
   590  	// to see if anyone wants to adopt it now.
   591  	dss := dsc.getDaemonSetsForPod(curPod)
   592  	if len(dss) == 0 {
   593  		return
   594  	}
   595  	logger.V(4).Info("Orphan Pod updated", "pod", klog.KObj(curPod))
   596  	labelChanged := !reflect.DeepEqual(curPod.Labels, oldPod.Labels)
   597  	if labelChanged || controllerRefChanged {
   598  		for _, ds := range dss {
   599  			dsc.enqueueDaemonSet(ds)
   600  		}
   601  	}
   602  }
   603  
   604  func (dsc *DaemonSetsController) deletePod(logger klog.Logger, obj interface{}) {
   605  	pod, ok := obj.(*v1.Pod)
   606  	// When a delete is dropped, the relist will notice a pod in the store not
   607  	// in the list, leading to the insertion of a tombstone object which contains
   608  	// the deleted key/value. Note that this value might be stale. If the pod
   609  	// changed labels the new daemonset will not be woken up till the periodic
   610  	// resync.
   611  	if !ok {
   612  		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
   613  		if !ok {
   614  			utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %#v", obj))
   615  			return
   616  		}
   617  		pod, ok = tombstone.Obj.(*v1.Pod)
   618  		if !ok {
   619  			utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a pod %#v", obj))
   620  			return
   621  		}
   622  	}
   623  
   624  	controllerRef := metav1.GetControllerOf(pod)
   625  	if controllerRef == nil {
   626  		// No controller should care about orphans being deleted.
   627  		return
   628  	}
   629  	ds := dsc.resolveControllerRef(pod.Namespace, controllerRef)
   630  	if ds == nil {
   631  		return
   632  	}
   633  	dsKey, err := controller.KeyFunc(ds)
   634  	if err != nil {
   635  		return
   636  	}
   637  	logger.V(4).Info("Pod deleted", "pod", klog.KObj(pod))
   638  	dsc.expectations.DeletionObserved(logger, dsKey)
   639  	dsc.enqueueDaemonSet(ds)
   640  }
   641  
   642  func (dsc *DaemonSetsController) addNode(logger klog.Logger, obj interface{}) {
   643  	// TODO: it'd be nice to pass a hint with these enqueues, so that each ds would only examine the added node (unless it has other work to do, too).
   644  	dsList, err := dsc.dsLister.List(labels.Everything())
   645  	if err != nil {
   646  		logger.V(4).Info("Error enqueueing daemon sets", "err", err)
   647  		return
   648  	}
   649  	node := obj.(*v1.Node)
   650  	for _, ds := range dsList {
   651  		if shouldRun, _ := NodeShouldRunDaemonPod(node, ds); shouldRun {
   652  			dsc.enqueueDaemonSet(ds)
   653  		}
   654  	}
   655  }
   656  
   657  // nodeInSameCondition returns true if all effective types ("Status" is true) equals;
   658  // otherwise, returns false.
   659  func nodeInSameCondition(old []v1.NodeCondition, cur []v1.NodeCondition) bool {
   660  	if len(old) == 0 && len(cur) == 0 {
   661  		return true
   662  	}
   663  
   664  	c1map := map[v1.NodeConditionType]v1.ConditionStatus{}
   665  	for _, c := range old {
   666  		if c.Status == v1.ConditionTrue {
   667  			c1map[c.Type] = c.Status
   668  		}
   669  	}
   670  
   671  	for _, c := range cur {
   672  		if c.Status != v1.ConditionTrue {
   673  			continue
   674  		}
   675  
   676  		if _, found := c1map[c.Type]; !found {
   677  			return false
   678  		}
   679  
   680  		delete(c1map, c.Type)
   681  	}
   682  
   683  	return len(c1map) == 0
   684  }
   685  
   686  func shouldIgnoreNodeUpdate(oldNode, curNode v1.Node) bool {
   687  	if !nodeInSameCondition(oldNode.Status.Conditions, curNode.Status.Conditions) {
   688  		return false
   689  	}
   690  	oldNode.ResourceVersion = curNode.ResourceVersion
   691  	oldNode.Status.Conditions = curNode.Status.Conditions
   692  	return apiequality.Semantic.DeepEqual(oldNode, curNode)
   693  }
   694  
   695  func (dsc *DaemonSetsController) updateNode(logger klog.Logger, old, cur interface{}) {
   696  	oldNode := old.(*v1.Node)
   697  	curNode := cur.(*v1.Node)
   698  	if shouldIgnoreNodeUpdate(*oldNode, *curNode) {
   699  		return
   700  	}
   701  
   702  	dsList, err := dsc.dsLister.List(labels.Everything())
   703  	if err != nil {
   704  		logger.V(4).Info("Error listing daemon sets", "err", err)
   705  		return
   706  	}
   707  	// TODO: it'd be nice to pass a hint with these enqueues, so that each ds would only examine the added node (unless it has other work to do, too).
   708  	for _, ds := range dsList {
   709  		oldShouldRun, oldShouldContinueRunning := NodeShouldRunDaemonPod(oldNode, ds)
   710  		currentShouldRun, currentShouldContinueRunning := NodeShouldRunDaemonPod(curNode, ds)
   711  		if (oldShouldRun != currentShouldRun) || (oldShouldContinueRunning != currentShouldContinueRunning) {
   712  			dsc.enqueueDaemonSet(ds)
   713  		}
   714  	}
   715  }
   716  
   717  // getDaemonPods returns daemon pods owned by the given ds.
   718  // This also reconciles ControllerRef by adopting/orphaning.
   719  // Note that returned Pods are pointers to objects in the cache.
   720  // If you want to modify one, you need to deep-copy it first.
   721  func (dsc *DaemonSetsController) getDaemonPods(ctx context.Context, ds *apps.DaemonSet) ([]*v1.Pod, error) {
   722  	selector, err := metav1.LabelSelectorAsSelector(ds.Spec.Selector)
   723  	if err != nil {
   724  		return nil, err
   725  	}
   726  
   727  	// List all pods to include those that don't match the selector anymore but
   728  	// have a ControllerRef pointing to this controller.
   729  	pods, err := dsc.podLister.Pods(ds.Namespace).List(labels.Everything())
   730  	if err != nil {
   731  		return nil, err
   732  	}
   733  	// If any adoptions are attempted, we should first recheck for deletion with
   734  	// an uncached quorum read sometime after listing Pods (see #42639).
   735  	dsNotDeleted := controller.RecheckDeletionTimestamp(func(ctx context.Context) (metav1.Object, error) {
   736  		fresh, err := dsc.kubeClient.AppsV1().DaemonSets(ds.Namespace).Get(ctx, ds.Name, metav1.GetOptions{})
   737  		if err != nil {
   738  			return nil, err
   739  		}
   740  		if fresh.UID != ds.UID {
   741  			return nil, fmt.Errorf("original DaemonSet %v/%v is gone: got uid %v, wanted %v", ds.Namespace, ds.Name, fresh.UID, ds.UID)
   742  		}
   743  		return fresh, nil
   744  	})
   745  
   746  	// Use ControllerRefManager to adopt/orphan as needed.
   747  	cm := controller.NewPodControllerRefManager(dsc.podControl, ds, selector, controllerKind, dsNotDeleted)
   748  	return cm.ClaimPods(ctx, pods)
   749  }
   750  
   751  // getNodesToDaemonPods returns a map from nodes to daemon pods (corresponding to ds) created for the nodes.
   752  // This also reconciles ControllerRef by adopting/orphaning.
   753  // Note that returned Pods are pointers to objects in the cache.
   754  // If you want to modify one, you need to deep-copy it first.
   755  func (dsc *DaemonSetsController) getNodesToDaemonPods(ctx context.Context, ds *apps.DaemonSet, includeDeletedTerminal bool) (map[string][]*v1.Pod, error) {
   756  	claimedPods, err := dsc.getDaemonPods(ctx, ds)
   757  	if err != nil {
   758  		return nil, err
   759  	}
   760  	// Group Pods by Node name.
   761  	nodeToDaemonPods := make(map[string][]*v1.Pod)
   762  	logger := klog.FromContext(ctx)
   763  	for _, pod := range claimedPods {
   764  		if !includeDeletedTerminal && podutil.IsPodTerminal(pod) && pod.DeletionTimestamp != nil {
   765  			// This Pod has a finalizer or is already scheduled for deletion from the
   766  			// store by the kubelet or the Pod GC. The DS controller doesn't have
   767  			// anything else to do with it.
   768  			continue
   769  		}
   770  		nodeName, err := util.GetTargetNodeName(pod)
   771  		if err != nil {
   772  			logger.V(4).Info("Failed to get target node name of Pod in DaemonSet",
   773  				"pod", klog.KObj(pod), "daemonset", klog.KObj(ds))
   774  			continue
   775  		}
   776  
   777  		nodeToDaemonPods[nodeName] = append(nodeToDaemonPods[nodeName], pod)
   778  	}
   779  
   780  	return nodeToDaemonPods, nil
   781  }
   782  
   783  // resolveControllerRef returns the controller referenced by a ControllerRef,
   784  // or nil if the ControllerRef could not be resolved to a matching controller
   785  // of the correct Kind.
   786  func (dsc *DaemonSetsController) resolveControllerRef(namespace string, controllerRef *metav1.OwnerReference) *apps.DaemonSet {
   787  	// We can't look up by UID, so look up by Name and then verify UID.
   788  	// Don't even try to look up by Name if it's the wrong Kind.
   789  	if controllerRef.Kind != controllerKind.Kind {
   790  		return nil
   791  	}
   792  	ds, err := dsc.dsLister.DaemonSets(namespace).Get(controllerRef.Name)
   793  	if err != nil {
   794  		return nil
   795  	}
   796  	if ds.UID != controllerRef.UID {
   797  		// The controller we found with this Name is not the same one that the
   798  		// ControllerRef points to.
   799  		return nil
   800  	}
   801  	return ds
   802  }
   803  
   804  // podsShouldBeOnNode figures out the DaemonSet pods to be created and deleted on the given node:
   805  //   - nodesNeedingDaemonPods: the pods need to start on the node
   806  //   - podsToDelete: the Pods need to be deleted on the node
   807  //   - err: unexpected error
   808  func (dsc *DaemonSetsController) podsShouldBeOnNode(
   809  	logger klog.Logger,
   810  	node *v1.Node,
   811  	nodeToDaemonPods map[string][]*v1.Pod,
   812  	ds *apps.DaemonSet,
   813  	hash string,
   814  ) (nodesNeedingDaemonPods, podsToDelete []string) {
   815  
   816  	shouldRun, shouldContinueRunning := NodeShouldRunDaemonPod(node, ds)
   817  	daemonPods, exists := nodeToDaemonPods[node.Name]
   818  
   819  	switch {
   820  	case shouldRun && !exists:
   821  		// If daemon pod is supposed to be running on node, but isn't, create daemon pod.
   822  		nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, node.Name)
   823  	case shouldContinueRunning:
   824  		// If a daemon pod failed, delete it
   825  		// If there's non-daemon pods left on this node, we will create it in the next sync loop
   826  		var daemonPodsRunning []*v1.Pod
   827  		for _, pod := range daemonPods {
   828  			if pod.DeletionTimestamp != nil {
   829  				continue
   830  			}
   831  			if pod.Status.Phase == v1.PodFailed {
   832  				// This is a critical place where DS is often fighting with kubelet that rejects pods.
   833  				// We need to avoid hot looping and backoff.
   834  				backoffKey := failedPodsBackoffKey(ds, node.Name)
   835  
   836  				now := dsc.failedPodsBackoff.Clock.Now()
   837  				inBackoff := dsc.failedPodsBackoff.IsInBackOffSinceUpdate(backoffKey, now)
   838  				if inBackoff {
   839  					delay := dsc.failedPodsBackoff.Get(backoffKey)
   840  					logger.V(4).Info("Deleting failed pod on node has been limited by backoff",
   841  						"pod", klog.KObj(pod), "node", klog.KObj(node), "currentDelay", delay)
   842  					dsc.enqueueDaemonSetAfter(ds, delay)
   843  					continue
   844  				}
   845  
   846  				dsc.failedPodsBackoff.Next(backoffKey, now)
   847  
   848  				msg := fmt.Sprintf("Found failed daemon pod %s/%s on node %s, will try to kill it", pod.Namespace, pod.Name, node.Name)
   849  				logger.V(2).Info("Found failed daemon pod on node, will try to kill it", "pod", klog.KObj(pod), "node", klog.KObj(node))
   850  				// Emit an event so that it's discoverable to users.
   851  				dsc.eventRecorder.Eventf(ds, v1.EventTypeWarning, FailedDaemonPodReason, msg)
   852  				podsToDelete = append(podsToDelete, pod.Name)
   853  			} else if pod.Status.Phase == v1.PodSucceeded {
   854  				msg := fmt.Sprintf("Found succeeded daemon pod %s/%s on node %s, will try to delete it", pod.Namespace, pod.Name, node.Name)
   855  				logger.V(2).Info("Found succeeded daemon pod on node, will try to delete it", "pod", klog.KObj(pod), "node", klog.KObj(node))
   856  				// Emit an event so that it's discoverable to users.
   857  				dsc.eventRecorder.Eventf(ds, v1.EventTypeNormal, SucceededDaemonPodReason, msg)
   858  				podsToDelete = append(podsToDelete, pod.Name)
   859  			} else {
   860  				daemonPodsRunning = append(daemonPodsRunning, pod)
   861  			}
   862  		}
   863  
   864  		// When surge is not enabled, if there is more than 1 running pod on a node delete all but the oldest
   865  		if !util.AllowsSurge(ds) {
   866  			if len(daemonPodsRunning) <= 1 {
   867  				// There are no excess pods to be pruned, and no pods to create
   868  				break
   869  			}
   870  
   871  			sort.Sort(podByCreationTimestampAndPhase(daemonPodsRunning))
   872  			for i := 1; i < len(daemonPodsRunning); i++ {
   873  				podsToDelete = append(podsToDelete, daemonPodsRunning[i].Name)
   874  			}
   875  			break
   876  		}
   877  
   878  		if len(daemonPodsRunning) <= 1 {
   879  			// // There are no excess pods to be pruned
   880  			if len(daemonPodsRunning) == 0 && shouldRun {
   881  				// We are surging so we need to have at least one non-deleted pod on the node
   882  				nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, node.Name)
   883  			}
   884  			break
   885  		}
   886  
   887  		// When surge is enabled, we allow 2 pods if and only if the oldest pod matching the current hash state
   888  		// is not ready AND the oldest pod that doesn't match the current hash state is ready. All other pods are
   889  		// deleted. If neither pod is ready, only the one matching the current hash revision is kept.
   890  		var oldestNewPod, oldestOldPod *v1.Pod
   891  		sort.Sort(podByCreationTimestampAndPhase(daemonPodsRunning))
   892  		for _, pod := range daemonPodsRunning {
   893  			if pod.Labels[apps.ControllerRevisionHashLabelKey] == hash {
   894  				if oldestNewPod == nil {
   895  					oldestNewPod = pod
   896  					continue
   897  				}
   898  			} else {
   899  				if oldestOldPod == nil {
   900  					oldestOldPod = pod
   901  					continue
   902  				}
   903  			}
   904  			podsToDelete = append(podsToDelete, pod.Name)
   905  		}
   906  		if oldestNewPod != nil && oldestOldPod != nil {
   907  			switch {
   908  			case !podutil.IsPodReady(oldestOldPod):
   909  				logger.V(5).Info("Pod from daemonset is no longer ready and will be replaced with newer pod", "oldPod", klog.KObj(oldestOldPod), "daemonset", klog.KObj(ds), "newPod", klog.KObj(oldestNewPod))
   910  				podsToDelete = append(podsToDelete, oldestOldPod.Name)
   911  			case podutil.IsPodAvailable(oldestNewPod, ds.Spec.MinReadySeconds, metav1.Time{Time: dsc.failedPodsBackoff.Clock.Now()}):
   912  				logger.V(5).Info("Pod from daemonset is now ready and will replace older pod", "newPod", klog.KObj(oldestNewPod), "daemonset", klog.KObj(ds), "oldPod", klog.KObj(oldestOldPod))
   913  				podsToDelete = append(podsToDelete, oldestOldPod.Name)
   914  			}
   915  		}
   916  
   917  	case !shouldContinueRunning && exists:
   918  		// If daemon pod isn't supposed to run on node, but it is, delete all daemon pods on node.
   919  		for _, pod := range daemonPods {
   920  			if pod.DeletionTimestamp != nil {
   921  				continue
   922  			}
   923  			podsToDelete = append(podsToDelete, pod.Name)
   924  		}
   925  	}
   926  
   927  	return nodesNeedingDaemonPods, podsToDelete
   928  }
   929  
   930  func (dsc *DaemonSetsController) updateDaemonSet(ctx context.Context, ds *apps.DaemonSet, nodeList []*v1.Node, hash, key string, old []*apps.ControllerRevision) error {
   931  	err := dsc.manage(ctx, ds, nodeList, hash)
   932  	if err != nil {
   933  		return err
   934  	}
   935  
   936  	// Process rolling updates if we're ready.
   937  	if dsc.expectations.SatisfiedExpectations(klog.FromContext(ctx), key) {
   938  		switch ds.Spec.UpdateStrategy.Type {
   939  		case apps.OnDeleteDaemonSetStrategyType:
   940  		case apps.RollingUpdateDaemonSetStrategyType:
   941  			err = dsc.rollingUpdate(ctx, ds, nodeList, hash)
   942  		}
   943  		if err != nil {
   944  			return err
   945  		}
   946  	}
   947  
   948  	err = dsc.cleanupHistory(ctx, ds, old)
   949  	if err != nil {
   950  		return fmt.Errorf("failed to clean up revisions of DaemonSet: %w", err)
   951  	}
   952  
   953  	return nil
   954  }
   955  
   956  // manage manages the scheduling and running of Pods of ds on nodes.
   957  // After figuring out which nodes should run a Pod of ds but not yet running one and
   958  // which nodes should not run a Pod of ds but currently running one, it calls function
   959  // syncNodes with a list of pods to remove and a list of nodes to run a Pod of ds.
   960  func (dsc *DaemonSetsController) manage(ctx context.Context, ds *apps.DaemonSet, nodeList []*v1.Node, hash string) error {
   961  	// Find out the pods which are created for the nodes by DaemonSet.
   962  	nodeToDaemonPods, err := dsc.getNodesToDaemonPods(ctx, ds, false)
   963  	if err != nil {
   964  		return fmt.Errorf("couldn't get node to daemon pod mapping for daemon set %q: %v", ds.Name, err)
   965  	}
   966  
   967  	// For each node, if the node is running the daemon pod but isn't supposed to, kill the daemon
   968  	// pod. If the node is supposed to run the daemon pod, but isn't, create the daemon pod on the node.
   969  	logger := klog.FromContext(ctx)
   970  	var nodesNeedingDaemonPods, podsToDelete []string
   971  	for _, node := range nodeList {
   972  		nodesNeedingDaemonPodsOnNode, podsToDeleteOnNode := dsc.podsShouldBeOnNode(
   973  			logger, node, nodeToDaemonPods, ds, hash)
   974  
   975  		nodesNeedingDaemonPods = append(nodesNeedingDaemonPods, nodesNeedingDaemonPodsOnNode...)
   976  		podsToDelete = append(podsToDelete, podsToDeleteOnNode...)
   977  	}
   978  
   979  	// Remove unscheduled pods assigned to not existing nodes when daemonset pods are scheduled by scheduler.
   980  	// If node doesn't exist then pods are never scheduled and can't be deleted by PodGCController.
   981  	podsToDelete = append(podsToDelete, getUnscheduledPodsWithoutNode(nodeList, nodeToDaemonPods)...)
   982  
   983  	// Label new pods using the hash label value of the current history when creating them
   984  	if err = dsc.syncNodes(ctx, ds, podsToDelete, nodesNeedingDaemonPods, hash); err != nil {
   985  		return err
   986  	}
   987  
   988  	return nil
   989  }
   990  
   991  // syncNodes deletes given pods and creates new daemon set pods on the given nodes
   992  // returns slice with errors if any
   993  func (dsc *DaemonSetsController) syncNodes(ctx context.Context, ds *apps.DaemonSet, podsToDelete, nodesNeedingDaemonPods []string, hash string) error {
   994  	// We need to set expectations before creating/deleting pods to avoid race conditions.
   995  	logger := klog.FromContext(ctx)
   996  	dsKey, err := controller.KeyFunc(ds)
   997  	if err != nil {
   998  		return fmt.Errorf("couldn't get key for object %#v: %v", ds, err)
   999  	}
  1000  
  1001  	createDiff := len(nodesNeedingDaemonPods)
  1002  	deleteDiff := len(podsToDelete)
  1003  
  1004  	if createDiff > dsc.burstReplicas {
  1005  		createDiff = dsc.burstReplicas
  1006  	}
  1007  	if deleteDiff > dsc.burstReplicas {
  1008  		deleteDiff = dsc.burstReplicas
  1009  	}
  1010  
  1011  	dsc.expectations.SetExpectations(logger, dsKey, createDiff, deleteDiff)
  1012  
  1013  	// error channel to communicate back failures.  make the buffer big enough to avoid any blocking
  1014  	errCh := make(chan error, createDiff+deleteDiff)
  1015  
  1016  	logger.V(4).Info("Nodes needing daemon pods for daemon set, creating", "daemonset", klog.KObj(ds), "needCount", nodesNeedingDaemonPods, "createCount", createDiff)
  1017  	createWait := sync.WaitGroup{}
  1018  	// If the returned error is not nil we have a parse error.
  1019  	// The controller handles this via the hash.
  1020  	generation, err := util.GetTemplateGeneration(ds)
  1021  	if err != nil {
  1022  		generation = nil
  1023  	}
  1024  	template := util.CreatePodTemplate(ds.Spec.Template, generation, hash)
  1025  	// Batch the pod creates. Batch sizes start at SlowStartInitialBatchSize
  1026  	// and double with each successful iteration in a kind of "slow start".
  1027  	// This handles attempts to start large numbers of pods that would
  1028  	// likely all fail with the same error. For example a project with a
  1029  	// low quota that attempts to create a large number of pods will be
  1030  	// prevented from spamming the API service with the pod create requests
  1031  	// after one of its pods fails.  Conveniently, this also prevents the
  1032  	// event spam that those failures would generate.
  1033  	batchSize := integer.IntMin(createDiff, controller.SlowStartInitialBatchSize)
  1034  	for pos := 0; createDiff > pos; batchSize, pos = integer.IntMin(2*batchSize, createDiff-(pos+batchSize)), pos+batchSize {
  1035  		errorCount := len(errCh)
  1036  		createWait.Add(batchSize)
  1037  		for i := pos; i < pos+batchSize; i++ {
  1038  			go func(ix int) {
  1039  				defer createWait.Done()
  1040  
  1041  				podTemplate := template.DeepCopy()
  1042  				// The pod's NodeAffinity will be updated to make sure the Pod is bound
  1043  				// to the target node by default scheduler. It is safe to do so because there
  1044  				// should be no conflicting node affinity with the target node.
  1045  				podTemplate.Spec.Affinity = util.ReplaceDaemonSetPodNodeNameNodeAffinity(
  1046  					podTemplate.Spec.Affinity, nodesNeedingDaemonPods[ix])
  1047  
  1048  				err := dsc.podControl.CreatePods(ctx, ds.Namespace, podTemplate,
  1049  					ds, metav1.NewControllerRef(ds, controllerKind))
  1050  
  1051  				if err != nil {
  1052  					if apierrors.HasStatusCause(err, v1.NamespaceTerminatingCause) {
  1053  						// If the namespace is being torn down, we can safely ignore
  1054  						// this error since all subsequent creations will fail.
  1055  						return
  1056  					}
  1057  				}
  1058  				if err != nil {
  1059  					logger.V(2).Info("Failed creation, decrementing expectations for daemon set", "daemonset", klog.KObj(ds))
  1060  					dsc.expectations.CreationObserved(logger, dsKey)
  1061  					errCh <- err
  1062  					utilruntime.HandleError(err)
  1063  				}
  1064  			}(i)
  1065  		}
  1066  		createWait.Wait()
  1067  		// any skipped pods that we never attempted to start shouldn't be expected.
  1068  		skippedPods := createDiff - (batchSize + pos)
  1069  		if errorCount < len(errCh) && skippedPods > 0 {
  1070  			logger.V(2).Info("Slow-start failure. Skipping creation pods, decrementing expectations for daemon set", "skippedPods", skippedPods, "daemonset", klog.KObj(ds))
  1071  			dsc.expectations.LowerExpectations(logger, dsKey, skippedPods, 0)
  1072  			// The skipped pods will be retried later. The next controller resync will
  1073  			// retry the slow start process.
  1074  			break
  1075  		}
  1076  	}
  1077  
  1078  	logger.V(4).Info("Pods to delete for daemon set, deleting", "daemonset", klog.KObj(ds), "toDeleteCount", podsToDelete, "deleteCount", deleteDiff)
  1079  	deleteWait := sync.WaitGroup{}
  1080  	deleteWait.Add(deleteDiff)
  1081  	for i := 0; i < deleteDiff; i++ {
  1082  		go func(ix int) {
  1083  			defer deleteWait.Done()
  1084  			if err := dsc.podControl.DeletePod(ctx, ds.Namespace, podsToDelete[ix], ds); err != nil {
  1085  				dsc.expectations.DeletionObserved(logger, dsKey)
  1086  				if !apierrors.IsNotFound(err) {
  1087  					logger.V(2).Info("Failed deletion, decremented expectations for daemon set", "daemonset", klog.KObj(ds))
  1088  					errCh <- err
  1089  					utilruntime.HandleError(err)
  1090  				}
  1091  			}
  1092  		}(i)
  1093  	}
  1094  	deleteWait.Wait()
  1095  
  1096  	// collect errors if any for proper reporting/retry logic in the controller
  1097  	errors := []error{}
  1098  	close(errCh)
  1099  	for err := range errCh {
  1100  		errors = append(errors, err)
  1101  	}
  1102  	return utilerrors.NewAggregate(errors)
  1103  }
  1104  
  1105  func storeDaemonSetStatus(
  1106  	ctx context.Context,
  1107  	dsClient unversionedapps.DaemonSetInterface,
  1108  	ds *apps.DaemonSet, desiredNumberScheduled,
  1109  	currentNumberScheduled,
  1110  	numberMisscheduled,
  1111  	numberReady,
  1112  	updatedNumberScheduled,
  1113  	numberAvailable,
  1114  	numberUnavailable int,
  1115  	updateObservedGen bool) error {
  1116  	if int(ds.Status.DesiredNumberScheduled) == desiredNumberScheduled &&
  1117  		int(ds.Status.CurrentNumberScheduled) == currentNumberScheduled &&
  1118  		int(ds.Status.NumberMisscheduled) == numberMisscheduled &&
  1119  		int(ds.Status.NumberReady) == numberReady &&
  1120  		int(ds.Status.UpdatedNumberScheduled) == updatedNumberScheduled &&
  1121  		int(ds.Status.NumberAvailable) == numberAvailable &&
  1122  		int(ds.Status.NumberUnavailable) == numberUnavailable &&
  1123  		ds.Status.ObservedGeneration >= ds.Generation {
  1124  		return nil
  1125  	}
  1126  
  1127  	toUpdate := ds.DeepCopy()
  1128  
  1129  	var updateErr, getErr error
  1130  	for i := 0; ; i++ {
  1131  		if updateObservedGen {
  1132  			toUpdate.Status.ObservedGeneration = ds.Generation
  1133  		}
  1134  		toUpdate.Status.DesiredNumberScheduled = int32(desiredNumberScheduled)
  1135  		toUpdate.Status.CurrentNumberScheduled = int32(currentNumberScheduled)
  1136  		toUpdate.Status.NumberMisscheduled = int32(numberMisscheduled)
  1137  		toUpdate.Status.NumberReady = int32(numberReady)
  1138  		toUpdate.Status.UpdatedNumberScheduled = int32(updatedNumberScheduled)
  1139  		toUpdate.Status.NumberAvailable = int32(numberAvailable)
  1140  		toUpdate.Status.NumberUnavailable = int32(numberUnavailable)
  1141  
  1142  		if _, updateErr = dsClient.UpdateStatus(ctx, toUpdate, metav1.UpdateOptions{}); updateErr == nil {
  1143  			return nil
  1144  		}
  1145  
  1146  		// Stop retrying if we exceed statusUpdateRetries - the DaemonSet will be requeued with a rate limit.
  1147  		if i >= StatusUpdateRetries {
  1148  			break
  1149  		}
  1150  		// Update the set with the latest resource version for the next poll
  1151  		if toUpdate, getErr = dsClient.Get(ctx, ds.Name, metav1.GetOptions{}); getErr != nil {
  1152  			// If the GET fails we can't trust status.Replicas anymore. This error
  1153  			// is bound to be more interesting than the update failure.
  1154  			return getErr
  1155  		}
  1156  	}
  1157  	return updateErr
  1158  }
  1159  
  1160  func (dsc *DaemonSetsController) updateDaemonSetStatus(ctx context.Context, ds *apps.DaemonSet, nodeList []*v1.Node, hash string, updateObservedGen bool) error {
  1161  	logger := klog.FromContext(ctx)
  1162  	logger.V(4).Info("Updating daemon set status")
  1163  	nodeToDaemonPods, err := dsc.getNodesToDaemonPods(ctx, ds, false)
  1164  	if err != nil {
  1165  		return fmt.Errorf("couldn't get node to daemon pod mapping for daemon set %q: %v", ds.Name, err)
  1166  	}
  1167  
  1168  	var desiredNumberScheduled, currentNumberScheduled, numberMisscheduled, numberReady, updatedNumberScheduled, numberAvailable int
  1169  	now := dsc.failedPodsBackoff.Clock.Now()
  1170  	for _, node := range nodeList {
  1171  		shouldRun, _ := NodeShouldRunDaemonPod(node, ds)
  1172  		scheduled := len(nodeToDaemonPods[node.Name]) > 0
  1173  
  1174  		if shouldRun {
  1175  			desiredNumberScheduled++
  1176  			if !scheduled {
  1177  				continue
  1178  			}
  1179  
  1180  			currentNumberScheduled++
  1181  			// Sort the daemon pods by creation time, so that the oldest is first.
  1182  			daemonPods, _ := nodeToDaemonPods[node.Name]
  1183  			sort.Sort(podByCreationTimestampAndPhase(daemonPods))
  1184  			pod := daemonPods[0]
  1185  			if podutil.IsPodReady(pod) {
  1186  				numberReady++
  1187  				if podutil.IsPodAvailable(pod, ds.Spec.MinReadySeconds, metav1.Time{Time: now}) {
  1188  					numberAvailable++
  1189  				}
  1190  			}
  1191  			// If the returned error is not nil we have a parse error.
  1192  			// The controller handles this via the hash.
  1193  			generation, err := util.GetTemplateGeneration(ds)
  1194  			if err != nil {
  1195  				generation = nil
  1196  			}
  1197  			if util.IsPodUpdated(pod, hash, generation) {
  1198  				updatedNumberScheduled++
  1199  			}
  1200  		} else {
  1201  			if scheduled {
  1202  				numberMisscheduled++
  1203  			}
  1204  		}
  1205  	}
  1206  	numberUnavailable := desiredNumberScheduled - numberAvailable
  1207  
  1208  	err = storeDaemonSetStatus(ctx, dsc.kubeClient.AppsV1().DaemonSets(ds.Namespace), ds, desiredNumberScheduled, currentNumberScheduled, numberMisscheduled, numberReady, updatedNumberScheduled, numberAvailable, numberUnavailable, updateObservedGen)
  1209  	if err != nil {
  1210  		return fmt.Errorf("error storing status for daemon set %#v: %w", ds, err)
  1211  	}
  1212  
  1213  	// Resync the DaemonSet after MinReadySeconds as a last line of defense to guard against clock-skew.
  1214  	if ds.Spec.MinReadySeconds > 0 && numberReady != numberAvailable {
  1215  		dsc.enqueueDaemonSetAfter(ds, time.Duration(ds.Spec.MinReadySeconds)*time.Second)
  1216  	}
  1217  	return nil
  1218  }
  1219  
  1220  func (dsc *DaemonSetsController) syncDaemonSet(ctx context.Context, key string) error {
  1221  	logger := klog.FromContext(ctx)
  1222  	startTime := dsc.failedPodsBackoff.Clock.Now()
  1223  
  1224  	defer func() {
  1225  		logger.V(4).Info("Finished syncing daemon set", "daemonset", key, "time", dsc.failedPodsBackoff.Clock.Now().Sub(startTime))
  1226  	}()
  1227  
  1228  	namespace, name, err := cache.SplitMetaNamespaceKey(key)
  1229  	if err != nil {
  1230  		return err
  1231  	}
  1232  	ds, err := dsc.dsLister.DaemonSets(namespace).Get(name)
  1233  	if apierrors.IsNotFound(err) {
  1234  		logger.V(3).Info("Daemon set has been deleted", "daemonset", key)
  1235  		dsc.expectations.DeleteExpectations(logger, key)
  1236  		return nil
  1237  	}
  1238  	if err != nil {
  1239  		return fmt.Errorf("unable to retrieve ds %v from store: %v", key, err)
  1240  	}
  1241  
  1242  	nodeList, err := dsc.nodeLister.List(labels.Everything())
  1243  	if err != nil {
  1244  		return fmt.Errorf("couldn't get list of nodes when syncing daemon set %#v: %v", ds, err)
  1245  	}
  1246  
  1247  	everything := metav1.LabelSelector{}
  1248  	if reflect.DeepEqual(ds.Spec.Selector, &everything) {
  1249  		dsc.eventRecorder.Eventf(ds, v1.EventTypeWarning, SelectingAllReason, "This daemon set is selecting all pods. A non-empty selector is required.")
  1250  		return nil
  1251  	}
  1252  
  1253  	// Don't process a daemon set until all its creations and deletions have been processed.
  1254  	// For example if daemon set foo asked for 3 new daemon pods in the previous call to manage,
  1255  	// then we do not want to call manage on foo until the daemon pods have been created.
  1256  	dsKey, err := controller.KeyFunc(ds)
  1257  	if err != nil {
  1258  		return fmt.Errorf("couldn't get key for object %#v: %v", ds, err)
  1259  	}
  1260  
  1261  	// If the DaemonSet is being deleted (either by foreground deletion or
  1262  	// orphan deletion), we cannot be sure if the DaemonSet history objects
  1263  	// it owned still exist -- those history objects can either be deleted
  1264  	// or orphaned. Garbage collector doesn't guarantee that it will delete
  1265  	// DaemonSet pods before deleting DaemonSet history objects, because
  1266  	// DaemonSet history doesn't own DaemonSet pods. We cannot reliably
  1267  	// calculate the status of a DaemonSet being deleted. Therefore, return
  1268  	// here without updating status for the DaemonSet being deleted.
  1269  	if ds.DeletionTimestamp != nil {
  1270  		return nil
  1271  	}
  1272  
  1273  	// Construct histories of the DaemonSet, and get the hash of current history
  1274  	cur, old, err := dsc.constructHistory(ctx, ds)
  1275  	if err != nil {
  1276  		return fmt.Errorf("failed to construct revisions of DaemonSet: %v", err)
  1277  	}
  1278  	hash := cur.Labels[apps.DefaultDaemonSetUniqueLabelKey]
  1279  
  1280  	if !dsc.expectations.SatisfiedExpectations(logger, dsKey) {
  1281  		// Only update status. Don't raise observedGeneration since controller didn't process object of that generation.
  1282  		return dsc.updateDaemonSetStatus(ctx, ds, nodeList, hash, false)
  1283  	}
  1284  
  1285  	err = dsc.updateDaemonSet(ctx, ds, nodeList, hash, dsKey, old)
  1286  	statusErr := dsc.updateDaemonSetStatus(ctx, ds, nodeList, hash, true)
  1287  	switch {
  1288  	case err != nil && statusErr != nil:
  1289  		// If there was an error, and we failed to update status,
  1290  		// log it and return the original error.
  1291  		logger.Error(statusErr, "Failed to update status", "daemonSet", klog.KObj(ds))
  1292  		return err
  1293  	case err != nil:
  1294  		return err
  1295  	case statusErr != nil:
  1296  		return statusErr
  1297  	}
  1298  
  1299  	return nil
  1300  }
  1301  
  1302  // NodeShouldRunDaemonPod checks a set of preconditions against a (node,daemonset) and returns a
  1303  // summary. Returned booleans are:
  1304  //   - shouldRun:
  1305  //     Returns true when a daemonset should run on the node if a daemonset pod is not already
  1306  //     running on that node.
  1307  //   - shouldContinueRunning:
  1308  //     Returns true when a daemonset should continue running on a node if a daemonset pod is already
  1309  //     running on that node.
  1310  func NodeShouldRunDaemonPod(node *v1.Node, ds *apps.DaemonSet) (bool, bool) {
  1311  	pod := NewPod(ds, node.Name)
  1312  
  1313  	// If the daemon set specifies a node name, check that it matches with node.Name.
  1314  	if !(ds.Spec.Template.Spec.NodeName == "" || ds.Spec.Template.Spec.NodeName == node.Name) {
  1315  		return false, false
  1316  	}
  1317  
  1318  	taints := node.Spec.Taints
  1319  	fitsNodeName, fitsNodeAffinity, fitsTaints := predicates(pod, node, taints)
  1320  	if !fitsNodeName || !fitsNodeAffinity {
  1321  		return false, false
  1322  	}
  1323  
  1324  	if !fitsTaints {
  1325  		// Scheduled daemon pods should continue running if they tolerate NoExecute taint.
  1326  		_, hasUntoleratedTaint := v1helper.FindMatchingUntoleratedTaint(taints, pod.Spec.Tolerations, func(t *v1.Taint) bool {
  1327  			return t.Effect == v1.TaintEffectNoExecute
  1328  		})
  1329  		return false, !hasUntoleratedTaint
  1330  	}
  1331  
  1332  	return true, true
  1333  }
  1334  
  1335  // predicates checks if a DaemonSet's pod can run on a node.
  1336  func predicates(pod *v1.Pod, node *v1.Node, taints []v1.Taint) (fitsNodeName, fitsNodeAffinity, fitsTaints bool) {
  1337  	fitsNodeName = len(pod.Spec.NodeName) == 0 || pod.Spec.NodeName == node.Name
  1338  	// Ignore parsing errors for backwards compatibility.
  1339  	fitsNodeAffinity, _ = nodeaffinity.GetRequiredNodeAffinity(pod).Match(node)
  1340  	_, hasUntoleratedTaint := v1helper.FindMatchingUntoleratedTaint(taints, pod.Spec.Tolerations, func(t *v1.Taint) bool {
  1341  		return t.Effect == v1.TaintEffectNoExecute || t.Effect == v1.TaintEffectNoSchedule
  1342  	})
  1343  	fitsTaints = !hasUntoleratedTaint
  1344  	return
  1345  }
  1346  
  1347  // NewPod creates a new pod
  1348  func NewPod(ds *apps.DaemonSet, nodeName string) *v1.Pod {
  1349  	newPod := &v1.Pod{Spec: ds.Spec.Template.Spec, ObjectMeta: ds.Spec.Template.ObjectMeta}
  1350  	newPod.Namespace = ds.Namespace
  1351  	newPod.Spec.NodeName = nodeName
  1352  
  1353  	// Added default tolerations for DaemonSet pods.
  1354  	util.AddOrUpdateDaemonPodTolerations(&newPod.Spec)
  1355  
  1356  	return newPod
  1357  }
  1358  
  1359  type podByCreationTimestampAndPhase []*v1.Pod
  1360  
  1361  func (o podByCreationTimestampAndPhase) Len() int      { return len(o) }
  1362  func (o podByCreationTimestampAndPhase) Swap(i, j int) { o[i], o[j] = o[j], o[i] }
  1363  
  1364  func (o podByCreationTimestampAndPhase) Less(i, j int) bool {
  1365  	// Scheduled Pod first
  1366  	if len(o[i].Spec.NodeName) != 0 && len(o[j].Spec.NodeName) == 0 {
  1367  		return true
  1368  	}
  1369  
  1370  	if len(o[i].Spec.NodeName) == 0 && len(o[j].Spec.NodeName) != 0 {
  1371  		return false
  1372  	}
  1373  
  1374  	if o[i].CreationTimestamp.Equal(&o[j].CreationTimestamp) {
  1375  		return o[i].Name < o[j].Name
  1376  	}
  1377  	return o[i].CreationTimestamp.Before(&o[j].CreationTimestamp)
  1378  }
  1379  
  1380  func failedPodsBackoffKey(ds *apps.DaemonSet, nodeName string) string {
  1381  	return fmt.Sprintf("%s/%d/%s", ds.UID, ds.Status.ObservedGeneration, nodeName)
  1382  }
  1383  
  1384  // getUnscheduledPodsWithoutNode returns list of unscheduled pods assigned to not existing nodes.
  1385  // Returned pods can't be deleted by PodGCController so they should be deleted by DaemonSetController.
  1386  func getUnscheduledPodsWithoutNode(runningNodesList []*v1.Node, nodeToDaemonPods map[string][]*v1.Pod) []string {
  1387  	var results []string
  1388  	isNodeRunning := make(map[string]bool, len(runningNodesList))
  1389  	for _, node := range runningNodesList {
  1390  		isNodeRunning[node.Name] = true
  1391  	}
  1392  
  1393  	for n, pods := range nodeToDaemonPods {
  1394  		if isNodeRunning[n] {
  1395  			continue
  1396  		}
  1397  		for _, pod := range pods {
  1398  			if len(pod.Spec.NodeName) == 0 {
  1399  				results = append(results, pod.Name)
  1400  			}
  1401  		}
  1402  	}
  1403  
  1404  	return results
  1405  }