k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/controller/replicaset/replica_set.go

/*
Copyright 2016 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// ### ATTENTION ###
//
// This code implements both ReplicaSet and ReplicationController.
//
// For RC, the objects are converted on the way in and out (see ../replication/),
// as if ReplicationController were just an older API version of ReplicaSet.
// However, RC and RS still have separate storage and separate instantiations
// of the ReplicaSetController object.
//
// Use rsc.Kind in log messages rather than hard-coding "ReplicaSet".

package replicaset

import (
	"context"
	"fmt"
	"reflect"
	"sort"
	"strings"
	"sync"
	"time"

	apps "k8s.io/api/apps/v1"
	v1 "k8s.io/api/core/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/types"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	appsinformers "k8s.io/client-go/informers/apps/v1"
	coreinformers "k8s.io/client-go/informers/core/v1"
	clientset "k8s.io/client-go/kubernetes"
	"k8s.io/client-go/kubernetes/scheme"
	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
	appslisters "k8s.io/client-go/listers/apps/v1"
	corelisters "k8s.io/client-go/listers/core/v1"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/tools/record"
	"k8s.io/client-go/util/workqueue"
	"k8s.io/component-base/metrics/legacyregistry"
	"k8s.io/klog/v2"
	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/kubernetes/pkg/controller"
	"k8s.io/kubernetes/pkg/controller/replicaset/metrics"
)

const (
	// Realistic value of the burstReplica field for the replica set manager based off
	// performance requirements for kubernetes 1.0.
	BurstReplicas = 500

	// The number of times we retry updating a ReplicaSet's status.
	statusUpdateRetries = 1

	// controllerUIDIndex is the name for the ReplicaSet store's index function,
	// which is to index by ReplicaSet's controllerUID.
	controllerUIDIndex = "controllerUID"
)

// ReplicaSetController is responsible for synchronizing ReplicaSet objects stored
// in the system with actual running pods.
type ReplicaSetController struct {
	// GroupVersionKind indicates the controller type.
	// Different instances of this struct may handle different GVKs.
	// For example, this struct can be used (with adapters) to handle ReplicationController.
	schema.GroupVersionKind

	kubeClient clientset.Interface
	podControl controller.PodControlInterface

	eventBroadcaster record.EventBroadcaster

	// A ReplicaSet is temporarily suspended after creating/deleting this many replicas.
	// It resumes normal action after observing the watch events for them.
	burstReplicas int
	// To allow injection of syncReplicaSet for testing.
	syncHandler func(ctx context.Context, rsKey string) error

	// A TTLCache of pod creates/deletes each rc expects to see.
	expectations *controller.UIDTrackingControllerExpectations

	// A store of ReplicaSets, populated by the shared informer passed to NewReplicaSetController
	rsLister appslisters.ReplicaSetLister
	// rsListerSynced returns true if the ReplicaSet store has been synced at least once.
	// Added as a member to the struct to allow injection for testing.
	rsListerSynced cache.InformerSynced
	rsIndexer      cache.Indexer

	// A store of pods, populated by the shared informer passed to NewReplicaSetController
	podLister corelisters.PodLister
	// podListerSynced returns true if the pod store has been synced at least once.
	// Added as a member to the struct to allow injection for testing.
	podListerSynced cache.InformerSynced

	// Controllers that need to be synced
	queue workqueue.TypedRateLimitingInterface[string]
}

// NewReplicaSetController configures a replica set controller with the specified event recorder
func NewReplicaSetController(ctx context.Context, rsInformer appsinformers.ReplicaSetInformer, podInformer coreinformers.PodInformer, kubeClient clientset.Interface, burstReplicas int) *ReplicaSetController {
	logger := klog.FromContext(ctx)
	eventBroadcaster := record.NewBroadcaster(record.WithContext(ctx))
	if err := metrics.Register(legacyregistry.Register); err != nil {
		logger.Error(err, "unable to register metrics")
	}
	return NewBaseController(logger, rsInformer, podInformer, kubeClient, burstReplicas,
		apps.SchemeGroupVersion.WithKind("ReplicaSet"),
		"replicaset_controller",
		"replicaset",
		controller.RealPodControl{
			KubeClient: kubeClient,
			Recorder:   eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "replicaset-controller"}),
		},
		eventBroadcaster,
	)
}

// NewBaseController is the implementation of NewReplicaSetController with additional injected
// parameters so that it can also serve as the implementation of NewReplicationController.
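// NewBaseController itself starts no goroutines; the caller must start the informers
// and then call Run to begin processing.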
func NewBaseController(logger klog.Logger, rsInformer appsinformers.ReplicaSetInformer, podInformer coreinformers.PodInformer, kubeClient clientset.Interface, burstReplicas int,
	gvk schema.GroupVersionKind, metricOwnerName, queueName string, podControl controller.PodControlInterface, eventBroadcaster record.EventBroadcaster) *ReplicaSetController {

	rsc := &ReplicaSetController{
		GroupVersionKind: gvk,
		kubeClient:       kubeClient,
		podControl:       podControl,
		eventBroadcaster: eventBroadcaster,
		burstReplicas:    burstReplicas,
		expectations:     controller.NewUIDTrackingControllerExpectations(controller.NewControllerExpectations()),
		queue: workqueue.NewTypedRateLimitingQueueWithConfig(
			workqueue.DefaultTypedControllerRateLimiter[string](),
			workqueue.TypedRateLimitingQueueConfig[string]{Name: queueName},
		),
	}

	rsInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			rsc.addRS(logger, obj)
		},
		UpdateFunc: func(oldObj, newObj interface{}) {
			rsc.updateRS(logger, oldObj, newObj)
		},
		DeleteFunc: func(obj interface{}) {
			rsc.deleteRS(logger, obj)
		},
	})
	rsInformer.Informer().AddIndexers(cache.Indexers{
		controllerUIDIndex: func(obj interface{}) ([]string, error) {
			rs, ok := obj.(*apps.ReplicaSet)
			if !ok {
				return []string{}, nil
			}
			controllerRef := metav1.GetControllerOf(rs)
			if controllerRef == nil {
				return []string{}, nil
			}
			return []string{string(controllerRef.UID)}, nil
		},
	})
	rsc.rsIndexer = rsInformer.Informer().GetIndexer()
	rsc.rsLister = rsInformer.Lister()
	rsc.rsListerSynced = rsInformer.Informer().HasSynced

	podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			rsc.addPod(logger, obj)
		},
		// This invokes the ReplicaSet for every pod change, eg: host assignment. Though this might seem like
		// overkill, the most frequent pod update is status, and the associated ReplicaSet will only list from
		// local storage, so it should be ok.
		UpdateFunc: func(oldObj, newObj interface{}) {
			rsc.updatePod(logger, oldObj, newObj)
		},
		DeleteFunc: func(obj interface{}) {
			rsc.deletePod(logger, obj)
		},
	})
	rsc.podLister = podInformer.Lister()
	rsc.podListerSynced = podInformer.Informer().HasSynced

	rsc.syncHandler = rsc.syncReplicaSet

	return rsc
}

// Run begins watching and syncing.
func (rsc *ReplicaSetController) Run(ctx context.Context, workers int) {
	defer utilruntime.HandleCrash()

	// Start events processing pipeline.
	rsc.eventBroadcaster.StartStructuredLogging(3)
	rsc.eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: rsc.kubeClient.CoreV1().Events("")})
	defer rsc.eventBroadcaster.Shutdown()

	defer rsc.queue.ShutDown()

	controllerName := strings.ToLower(rsc.Kind)
	logger := klog.FromContext(ctx)
	logger.Info("Starting controller", "name", controllerName)
	defer logger.Info("Shutting down controller", "name", controllerName)

	if !cache.WaitForNamedCacheSync(rsc.Kind, ctx.Done(), rsc.podListerSynced, rsc.rsListerSynced) {
		return
	}

	for i := 0; i < workers; i++ {
		go wait.UntilWithContext(ctx, rsc.worker, time.Second)
	}

	<-ctx.Done()
}

// getReplicaSetsWithSameController returns a list of ReplicaSets with the same
// owner as the given ReplicaSet.
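// It looks the owner up via the controllerUID index registered on the ReplicaSet
// informer in NewBaseController.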
func (rsc *ReplicaSetController) getReplicaSetsWithSameController(logger klog.Logger, rs *apps.ReplicaSet) []*apps.ReplicaSet {
	controllerRef := metav1.GetControllerOf(rs)
	if controllerRef == nil {
		utilruntime.HandleError(fmt.Errorf("ReplicaSet has no controller: %v", rs))
		return nil
	}

	objects, err := rsc.rsIndexer.ByIndex(controllerUIDIndex, string(controllerRef.UID))
	if err != nil {
		utilruntime.HandleError(err)
		return nil
	}
	relatedRSs := make([]*apps.ReplicaSet, 0, len(objects))
	for _, obj := range objects {
		relatedRSs = append(relatedRSs, obj.(*apps.ReplicaSet))
	}

	if klogV := logger.V(2); klogV.Enabled() {
		klogV.Info("Found related ReplicaSets", "replicaSet", klog.KObj(rs), "relatedReplicaSets", klog.KObjSlice(relatedRSs))
	}

	return relatedRSs
}

// getPodReplicaSets returns a list of ReplicaSets matching the given pod.
func (rsc *ReplicaSetController) getPodReplicaSets(pod *v1.Pod) []*apps.ReplicaSet {
	rss, err := rsc.rsLister.GetPodReplicaSets(pod)
	if err != nil {
		return nil
	}
	if len(rss) > 1 {
		// ControllerRef will ensure we don't do anything crazy, but more than one
		// item in this list nevertheless constitutes user error.
		utilruntime.HandleError(fmt.Errorf("user error! more than one %v is selecting pods with labels: %+v", rsc.Kind, pod.Labels))
	}
	return rss
}

// resolveControllerRef returns the controller referenced by a ControllerRef,
// or nil if the ControllerRef could not be resolved to a matching controller
// of the correct Kind.
func (rsc *ReplicaSetController) resolveControllerRef(namespace string, controllerRef *metav1.OwnerReference) *apps.ReplicaSet {
	// We can't look up by UID, so look up by Name and then verify UID.
	// Don't even try to look up by Name if it's the wrong Kind.
	if controllerRef.Kind != rsc.Kind {
		return nil
	}
	rs, err := rsc.rsLister.ReplicaSets(namespace).Get(controllerRef.Name)
	if err != nil {
		return nil
	}
	if rs.UID != controllerRef.UID {
		// The controller we found with this Name is not the same one that the
		// ControllerRef points to.
		return nil
	}
	return rs
}

func (rsc *ReplicaSetController) enqueueRS(rs *apps.ReplicaSet) {
	key, err := controller.KeyFunc(rs)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", rs, err))
		return
	}

	rsc.queue.Add(key)
}

func (rsc *ReplicaSetController) enqueueRSAfter(rs *apps.ReplicaSet, duration time.Duration) {
	key, err := controller.KeyFunc(rs)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", rs, err))
		return
	}

	rsc.queue.AddAfter(key, duration)
}

func (rsc *ReplicaSetController) addRS(logger klog.Logger, obj interface{}) {
	rs := obj.(*apps.ReplicaSet)
	logger.V(4).Info("Adding", "replicaSet", klog.KObj(rs))
	rsc.enqueueRS(rs)
}

// callback when RS is updated
func (rsc *ReplicaSetController) updateRS(logger klog.Logger, old, cur interface{}) {
	oldRS := old.(*apps.ReplicaSet)
	curRS := cur.(*apps.ReplicaSet)

	// TODO: make a KEP and fix informers to always call the delete event handler on re-create
	if curRS.UID != oldRS.UID {
		key, err := controller.KeyFunc(oldRS)
		if err != nil {
			utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", oldRS, err))
			return
		}
		rsc.deleteRS(logger, cache.DeletedFinalStateUnknown{
			Key: key,
			Obj: oldRS,
		})
	}

	// You might imagine that we only really need to enqueue the
	// replica set when Spec changes, but it is safer to sync any
	// time this function is triggered. That way a full informer
	// resync can requeue any replica sets that don't yet have pods
	// but whose last attempts at creating a pod have failed (since
	// we don't block on creation of pods) instead of those
	// replica sets stalling indefinitely. Enqueueing every time
	// does result in some spurious syncs (like when Status.Replicas
	// is updated and the watch notification from it retriggers
	// this function), but in general extra resyncs shouldn't be
	// that bad as ReplicaSets that haven't met expectations yet won't
	// sync, and all the listing is done using local stores.
	if *(oldRS.Spec.Replicas) != *(curRS.Spec.Replicas) {
		logger.V(4).Info("replicaSet updated. Desired pod count change.", "replicaSet", klog.KObj(oldRS), "oldReplicas", *(oldRS.Spec.Replicas), "newReplicas", *(curRS.Spec.Replicas))
	}
	rsc.enqueueRS(curRS)
}

func (rsc *ReplicaSetController) deleteRS(logger klog.Logger, obj interface{}) {
	rs, ok := obj.(*apps.ReplicaSet)
	if !ok {
		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %#v", obj))
			return
		}
		rs, ok = tombstone.Obj.(*apps.ReplicaSet)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a ReplicaSet %#v", obj))
			return
		}
	}

	key, err := controller.KeyFunc(rs)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", rs, err))
		return
	}

	logger.V(4).Info("Deleting", "replicaSet", klog.KObj(rs))

	// Delete expectations for the ReplicaSet so if we create a new one with the same name it starts clean
	rsc.expectations.DeleteExpectations(logger, key)

	rsc.queue.Add(key)
}

// When a pod is created, enqueue the replica set that manages it and update its expectations.
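// Observing the creation here decrements the ReplicaSet's creation expectations,
// which is what allows its next sync to proceed once all expected pods have been seen.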
func (rsc *ReplicaSetController) addPod(logger klog.Logger, obj interface{}) {
	pod := obj.(*v1.Pod)

	if pod.DeletionTimestamp != nil {
		// on a restart of the controller manager, it's possible a new pod shows up in a state that
		// is already pending deletion. Prevent the pod from being a creation observation.
		rsc.deletePod(logger, pod)
		return
	}

	// If it has a ControllerRef, that's all that matters.
	if controllerRef := metav1.GetControllerOf(pod); controllerRef != nil {
		rs := rsc.resolveControllerRef(pod.Namespace, controllerRef)
		if rs == nil {
			return
		}
		rsKey, err := controller.KeyFunc(rs)
		if err != nil {
			return
		}
		logger.V(4).Info("Pod created", "pod", klog.KObj(pod), "detail", pod)
		rsc.expectations.CreationObserved(logger, rsKey)
		rsc.queue.Add(rsKey)
		return
	}

	// Otherwise, it's an orphan. Get a list of all matching ReplicaSets and sync
	// them to see if anyone wants to adopt it.
	// DO NOT observe creation because no controller should be waiting for an
	// orphan.
	rss := rsc.getPodReplicaSets(pod)
	if len(rss) == 0 {
		return
	}
	logger.V(4).Info("Orphan Pod created", "pod", klog.KObj(pod), "detail", pod)
	for _, rs := range rss {
		rsc.enqueueRS(rs)
	}
}

// When a pod is updated, figure out what replica set/s manage it and wake them
// up. If the labels of the pod have changed we need to awaken both the old
// and new replica set. old and cur must be *v1.Pod types.
func (rsc *ReplicaSetController) updatePod(logger klog.Logger, old, cur interface{}) {
	curPod := cur.(*v1.Pod)
	oldPod := old.(*v1.Pod)
	if curPod.ResourceVersion == oldPod.ResourceVersion {
		// Periodic resync will send update events for all known pods.
		// Two different versions of the same pod will always have different RVs.
		return
	}

	labelChanged := !reflect.DeepEqual(curPod.Labels, oldPod.Labels)
	if curPod.DeletionTimestamp != nil {
		// when a pod is deleted gracefully its deletion timestamp is first modified to reflect a grace period,
		// and after such time has passed, the kubelet actually deletes it from the store. We receive an update
		// for modification of the deletion timestamp and expect an rs to create more replicas asap, not wait
		// until the kubelet actually deletes the pod. This is different from the Phase of a pod changing, because
		// an rs never initiates a phase change, and so is never asleep waiting for the same.
		rsc.deletePod(logger, curPod)
		if labelChanged {
			// we don't need to check the oldPod.DeletionTimestamp because DeletionTimestamp cannot be unset.
			rsc.deletePod(logger, oldPod)
		}
		return
	}

	curControllerRef := metav1.GetControllerOf(curPod)
	oldControllerRef := metav1.GetControllerOf(oldPod)
	controllerRefChanged := !reflect.DeepEqual(curControllerRef, oldControllerRef)
	if controllerRefChanged && oldControllerRef != nil {
		// The ControllerRef was changed. Sync the old controller, if any.
		if rs := rsc.resolveControllerRef(oldPod.Namespace, oldControllerRef); rs != nil {
			rsc.enqueueRS(rs)
		}
	}

	// If it has a ControllerRef, that's all that matters.
	if curControllerRef != nil {
		rs := rsc.resolveControllerRef(curPod.Namespace, curControllerRef)
		if rs == nil {
			return
		}
		logger.V(4).Info("Pod objectMeta updated.", "pod", klog.KObj(oldPod), "oldObjectMeta", oldPod.ObjectMeta, "curObjectMeta", curPod.ObjectMeta)
		rsc.enqueueRS(rs)
		// TODO: MinReadySeconds in the Pod will generate an Available condition to be added in
		// the Pod status which in turn will trigger a requeue of the owning replica set thus
		// having its status updated with the newly available replica. For now, we can fake the
		// update by resyncing the controller MinReadySeconds after it is requeued because
		// a Pod transitioned to Ready.
		// Note that this still suffers from #29229, we are just moving the problem one level
		// "closer" to kubelet (from the deployment to the replica set controller).
		if !podutil.IsPodReady(oldPod) && podutil.IsPodReady(curPod) && rs.Spec.MinReadySeconds > 0 {
			logger.V(2).Info("pod will be enqueued after a while for availability check", "duration", rs.Spec.MinReadySeconds, "kind", rsc.Kind, "pod", klog.KObj(oldPod))
			// Add a second to avoid milliseconds skew in AddAfter.
			// See https://github.com/kubernetes/kubernetes/issues/39785#issuecomment-279959133 for more info.
			rsc.enqueueRSAfter(rs, (time.Duration(rs.Spec.MinReadySeconds)*time.Second)+time.Second)
		}
		return
	}

	// Otherwise, it's an orphan. If anything changed, sync matching controllers
	// to see if anyone wants to adopt it now.
	if labelChanged || controllerRefChanged {
		rss := rsc.getPodReplicaSets(curPod)
		if len(rss) == 0 {
			return
		}
		logger.V(4).Info("Orphan Pod objectMeta updated.", "pod", klog.KObj(oldPod), "oldObjectMeta", oldPod.ObjectMeta, "curObjectMeta", curPod.ObjectMeta)
		for _, rs := range rss {
			rsc.enqueueRS(rs)
		}
	}
}

// When a pod is deleted, enqueue the replica set that manages the pod and update its expectations.
// obj could be a *v1.Pod, or a DeletionFinalStateUnknown marker item.
func (rsc *ReplicaSetController) deletePod(logger klog.Logger, obj interface{}) {
	pod, ok := obj.(*v1.Pod)

	// When a delete is dropped, the relist will notice a pod in the store not
	// in the list, leading to the insertion of a tombstone object which contains
	// the deleted key/value. Note that this value might be stale. If the pod
	// changed labels the new ReplicaSet will not be woken up till the periodic resync.
	if !ok {
		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %+v", obj))
			return
		}
		pod, ok = tombstone.Obj.(*v1.Pod)
		if !ok {
			utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a pod %#v", obj))
			return
		}
	}

	controllerRef := metav1.GetControllerOf(pod)
	if controllerRef == nil {
		// No controller should care about orphans being deleted.
		return
	}
	rs := rsc.resolveControllerRef(pod.Namespace, controllerRef)
	if rs == nil {
		return
	}
	rsKey, err := controller.KeyFunc(rs)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %#v: %v", rs, err))
		return
	}
	logger.V(4).Info("Pod deleted", "delete_by", utilruntime.GetCaller(), "deletion_timestamp", pod.DeletionTimestamp, "pod", klog.KObj(pod))
	rsc.expectations.DeletionObserved(logger, rsKey, controller.PodKey(pod))
	rsc.queue.Add(rsKey)
}

// worker runs a worker thread that just dequeues items, processes them, and marks them done.
// It enforces that the syncHandler is never invoked concurrently with the same key.
func (rsc *ReplicaSetController) worker(ctx context.Context) {
	for rsc.processNextWorkItem(ctx) {
	}
}

func (rsc *ReplicaSetController) processNextWorkItem(ctx context.Context) bool {
	key, quit := rsc.queue.Get()
	if quit {
		return false
	}
	defer rsc.queue.Done(key)

	err := rsc.syncHandler(ctx, key)
	if err == nil {
		rsc.queue.Forget(key)
		return true
	}

	utilruntime.HandleError(fmt.Errorf("sync %q failed with %v", key, err))
	rsc.queue.AddRateLimited(key)

	return true
}

// manageReplicas checks and updates replicas for the given ReplicaSet.
// Does NOT modify <filteredPods>.
// It will requeue the replica set in case of an error while creating/deleting pods.
func (rsc *ReplicaSetController) manageReplicas(ctx context.Context, filteredPods []*v1.Pod, rs *apps.ReplicaSet) error {
	diff := len(filteredPods) - int(*(rs.Spec.Replicas))
	rsKey, err := controller.KeyFunc(rs)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for %v %#v: %v", rsc.Kind, rs, err))
		return nil
	}
	logger := klog.FromContext(ctx)
	if diff < 0 {
		diff *= -1
		if diff > rsc.burstReplicas {
			diff = rsc.burstReplicas
		}
		// TODO: Track UIDs of creates just like deletes. The problem currently
		// is we'd need to wait on the result of a create to record the pod's
		// UID, which would require locking *across* the create, which will turn
		// into a performance bottleneck. We should generate a UID for the pod
		// beforehand and store it via ExpectCreations.
		rsc.expectations.ExpectCreations(logger, rsKey, diff)
		logger.V(2).Info("Too few replicas", "replicaSet", klog.KObj(rs), "need", *(rs.Spec.Replicas), "creating", diff)
		// Batch the pod creates. Batch sizes start at SlowStartInitialBatchSize
		// and double with each successful iteration in a kind of "slow start".
		// This handles attempts to start large numbers of pods that would
		// likely all fail with the same error. For example a project with a
		// low quota that attempts to create a large number of pods will be
		// prevented from spamming the API service with the pod create requests
		// after one of its pods fails. Conveniently, this also prevents the
		// event spam that those failures would generate.
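		// slowStartBatch reports how many create calls actually succeeded so that the
		// expectations for any skipped pods can be rolled back below.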
		successfulCreations, err := slowStartBatch(diff, controller.SlowStartInitialBatchSize, func() error {
			err := rsc.podControl.CreatePods(ctx, rs.Namespace, &rs.Spec.Template, rs, metav1.NewControllerRef(rs, rsc.GroupVersionKind))
			if err != nil {
				if apierrors.HasStatusCause(err, v1.NamespaceTerminatingCause) {
					// if the namespace is being terminated, we don't have to do
					// anything because any creation will fail
					return nil
				}
			}
			return err
		})

		// Any skipped pods that we never attempted to start shouldn't be expected.
		// The skipped pods will be retried later. The next controller resync will
		// retry the slow start process.
		if skippedPods := diff - successfulCreations; skippedPods > 0 {
			logger.V(2).Info("Slow-start failure. Skipping creation of pods, decrementing expectations", "podsSkipped", skippedPods, "kind", rsc.Kind, "replicaSet", klog.KObj(rs))
			for i := 0; i < skippedPods; i++ {
				// Decrement the expected number of creates because the informer won't observe this pod
				rsc.expectations.CreationObserved(logger, rsKey)
			}
		}
		return err
	} else if diff > 0 {
		if diff > rsc.burstReplicas {
			diff = rsc.burstReplicas
		}
		logger.V(2).Info("Too many replicas", "replicaSet", klog.KObj(rs), "need", *(rs.Spec.Replicas), "deleting", diff)

		relatedPods, err := rsc.getIndirectlyRelatedPods(logger, rs)
		utilruntime.HandleError(err)

		// Choose which Pods to delete, preferring those in earlier phases of startup.
		podsToDelete := getPodsToDelete(filteredPods, relatedPods, diff)

		// Snapshot the UIDs (ns/name) of the pods we're expecting to see
		// deleted, so we know to record their expectations exactly once either
		// when we see it as an update of the deletion timestamp, or as a delete.
		// Note that if the labels on a pod/rs change in a way that the pod gets
		// orphaned, the rs will only wake up after the expectations have
		// expired even if other pods are deleted.
		rsc.expectations.ExpectDeletions(logger, rsKey, getPodKeys(podsToDelete))

		errCh := make(chan error, diff)
		var wg sync.WaitGroup
		wg.Add(diff)
		for _, pod := range podsToDelete {
			go func(targetPod *v1.Pod) {
				defer wg.Done()
				if err := rsc.podControl.DeletePod(ctx, rs.Namespace, targetPod.Name, rs); err != nil {
					// Decrement the expected number of deletes because the informer won't observe this deletion
					podKey := controller.PodKey(targetPod)
					rsc.expectations.DeletionObserved(logger, rsKey, podKey)
					if !apierrors.IsNotFound(err) {
						logger.V(2).Info("Failed to delete pod, decremented expectations", "pod", podKey, "kind", rsc.Kind, "replicaSet", klog.KObj(rs))
						errCh <- err
					}
				}
			}(pod)
		}
		wg.Wait()

		select {
		case err := <-errCh:
			// all errors have been reported before and they're likely to be the same, so we'll only return the first one we hit.
			if err != nil {
				return err
			}
		default:
		}
	}

	return nil
}

// syncReplicaSet will sync the ReplicaSet with the given key if it has had its expectations fulfilled,
// meaning it did not expect to see any more of its pods created or deleted. This function is not meant to be
// invoked concurrently with the same key.
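// The sync lists pods in the ReplicaSet's namespace, claims the active ones that match
// its selector, reconciles the replica count via manageReplicas, and finally updates
// the ReplicaSet's status.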
func (rsc *ReplicaSetController) syncReplicaSet(ctx context.Context, key string) error {
	logger := klog.FromContext(ctx)
	startTime := time.Now()
	defer func() {
		logger.Info("Finished syncing", "kind", rsc.Kind, "key", key, "duration", time.Since(startTime))
	}()

	namespace, name, err := cache.SplitMetaNamespaceKey(key)
	if err != nil {
		return err
	}
	rs, err := rsc.rsLister.ReplicaSets(namespace).Get(name)
	if apierrors.IsNotFound(err) {
		logger.V(4).Info("deleted", "kind", rsc.Kind, "key", key)
		rsc.expectations.DeleteExpectations(logger, key)
		return nil
	}
	if err != nil {
		return err
	}

	rsNeedsSync := rsc.expectations.SatisfiedExpectations(logger, key)
	selector, err := metav1.LabelSelectorAsSelector(rs.Spec.Selector)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("error converting pod selector to selector for rs %v/%v: %v", namespace, name, err))
		return nil
	}

	// list all pods to include the pods that don't match the rs's selector
	// anymore but have a stale controller ref.
	// TODO: Do the List and Filter in a single pass, or use an index.
	allPods, err := rsc.podLister.Pods(rs.Namespace).List(labels.Everything())
	if err != nil {
		return err
	}
	// Ignore inactive pods.
	filteredPods := controller.FilterActivePods(logger, allPods)

	// NOTE: filteredPods are pointing to objects from cache - if you need to
	// modify them, you need to copy it first.
	filteredPods, err = rsc.claimPods(ctx, rs, selector, filteredPods)
	if err != nil {
		return err
	}

	var manageReplicasErr error
	if rsNeedsSync && rs.DeletionTimestamp == nil {
		manageReplicasErr = rsc.manageReplicas(ctx, filteredPods, rs)
	}
	rs = rs.DeepCopy()
	newStatus := calculateStatus(rs, filteredPods, manageReplicasErr)

	// Always updates status as pods come up or die.
	updatedRS, err := updateReplicaSetStatus(logger, rsc.kubeClient.AppsV1().ReplicaSets(rs.Namespace), rs, newStatus)
	if err != nil {
		// Multiple things could lead to this update failing. Returning an error
		// causes a requeue without forcing a hotloop.
		return err
	}
	// Resync the ReplicaSet after MinReadySeconds as a last line of defense to guard against clock-skew.
	if manageReplicasErr == nil && updatedRS.Spec.MinReadySeconds > 0 &&
		updatedRS.Status.ReadyReplicas == *(updatedRS.Spec.Replicas) &&
		updatedRS.Status.AvailableReplicas != *(updatedRS.Spec.Replicas) {
		rsc.queue.AddAfter(key, time.Duration(updatedRS.Spec.MinReadySeconds)*time.Second)
	}
	return manageReplicasErr
}

func (rsc *ReplicaSetController) claimPods(ctx context.Context, rs *apps.ReplicaSet, selector labels.Selector, filteredPods []*v1.Pod) ([]*v1.Pod, error) {
	// If any adoptions are attempted, we should first recheck for deletion with
	// an uncached quorum read sometime after listing Pods (see #42639).
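	// canAdoptFunc re-reads the ReplicaSet from the API server and checks its UID, so pods
	// are never adopted on behalf of a ReplicaSet that has been deleted (and possibly
	// recreated) in the meantime.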
	canAdoptFunc := controller.RecheckDeletionTimestamp(func(ctx context.Context) (metav1.Object, error) {
		fresh, err := rsc.kubeClient.AppsV1().ReplicaSets(rs.Namespace).Get(ctx, rs.Name, metav1.GetOptions{})
		if err != nil {
			return nil, err
		}
		if fresh.UID != rs.UID {
			return nil, fmt.Errorf("original %v %v/%v is gone: got uid %v, wanted %v", rsc.Kind, rs.Namespace, rs.Name, fresh.UID, rs.UID)
		}
		return fresh, nil
	})
	cm := controller.NewPodControllerRefManager(rsc.podControl, rs, selector, rsc.GroupVersionKind, canAdoptFunc)
	return cm.ClaimPods(ctx, filteredPods)
}

// slowStartBatch tries to call the provided function a total of 'count' times,
// starting slow to check for errors, then speeding up if calls succeed.
//
// It groups the calls into batches, starting with a group of initialBatchSize.
// Within each batch, it may call the function multiple times concurrently.
//
// If a whole batch succeeds, the next batch may get exponentially larger.
// If there are any failures in a batch, all remaining batches are skipped
// after waiting for the current batch to complete.
//
// It returns the number of successful calls to the function.
func slowStartBatch(count int, initialBatchSize int, fn func() error) (int, error) {
	remaining := count
	successes := 0
	for batchSize := min(remaining, initialBatchSize); batchSize > 0; batchSize = min(2*batchSize, remaining) {
		errCh := make(chan error, batchSize)
		var wg sync.WaitGroup
		wg.Add(batchSize)
		for i := 0; i < batchSize; i++ {
			go func() {
				defer wg.Done()
				if err := fn(); err != nil {
					errCh <- err
				}
			}()
		}
		wg.Wait()
		curSuccesses := batchSize - len(errCh)
		successes += curSuccesses
		if len(errCh) > 0 {
			return successes, <-errCh
		}
		remaining -= batchSize
	}
	return successes, nil
}

// getIndirectlyRelatedPods returns all pods that are owned by any ReplicaSet
// that is owned by the given ReplicaSet's owner.
func (rsc *ReplicaSetController) getIndirectlyRelatedPods(logger klog.Logger, rs *apps.ReplicaSet) ([]*v1.Pod, error) {
	var relatedPods []*v1.Pod
	seen := make(map[types.UID]*apps.ReplicaSet)
	for _, relatedRS := range rsc.getReplicaSetsWithSameController(logger, rs) {
		selector, err := metav1.LabelSelectorAsSelector(relatedRS.Spec.Selector)
		if err != nil {
			// This object has an invalid selector, it does not match any pods
			continue
		}
		pods, err := rsc.podLister.Pods(relatedRS.Namespace).List(selector)
		if err != nil {
			return nil, err
		}
		for _, pod := range pods {
			if otherRS, found := seen[pod.UID]; found {
				logger.V(5).Info("Pod is owned by both", "pod", klog.KObj(pod), "kind", rsc.Kind, "replicaSets", klog.KObjSlice([]klog.KMetadata{otherRS, relatedRS}))
				continue
			}
			seen[pod.UID] = relatedRS
			relatedPods = append(relatedPods, pod)
		}
	}
	logger.V(4).Info("Found related pods", "kind", rsc.Kind, "replicaSet", klog.KObj(rs), "pods", klog.KObjSlice(relatedPods))
	return relatedPods, nil
}

func getPodsToDelete(filteredPods, relatedPods []*v1.Pod, diff int) []*v1.Pod {
	// No need to sort pods if we are about to delete all of them.
	// diff will always be <= len(filteredPods), so no need to handle the > case.
	if diff < len(filteredPods) {
		podsWithRanks := getPodsRankedByRelatedPodsOnSameNode(filteredPods, relatedPods)
		sort.Sort(podsWithRanks)
		reportSortingDeletionAgeRatioMetric(filteredPods, diff)
	}
	return filteredPods[:diff]
}

func reportSortingDeletionAgeRatioMetric(filteredPods []*v1.Pod, diff int) {
	now := time.Now()
	youngestTime := time.Time{}
	// first we need to check all of the ready pods to get the youngest, as they may not necessarily be sorted by timestamp alone
	for _, pod := range filteredPods {
		if pod.CreationTimestamp.Time.After(youngestTime) && podutil.IsPodReady(pod) {
			youngestTime = pod.CreationTimestamp.Time
		}
	}

	// for each pod chosen for deletion, report the ratio of its age to the youngest pod's age
	for _, pod := range filteredPods[:diff] {
		if !podutil.IsPodReady(pod) {
			continue
		}
		// divide as floats so the ratio keeps its fractional part
		ratio := float64(now.Sub(pod.CreationTimestamp.Time).Milliseconds()) / float64(now.Sub(youngestTime).Milliseconds())
		metrics.SortingDeletionAgeRatio.Observe(ratio)
	}
}

// getPodsRankedByRelatedPodsOnSameNode returns an ActivePodsWithRanks value
// that wraps podsToRank and assigns each pod a rank equal to the number of
// active pods in relatedPods that are colocated on the same node with the pod.
// relatedPods generally should be a superset of podsToRank.
func getPodsRankedByRelatedPodsOnSameNode(podsToRank, relatedPods []*v1.Pod) controller.ActivePodsWithRanks {
	podsOnNode := make(map[string]int)
	for _, pod := range relatedPods {
		if controller.IsPodActive(pod) {
			podsOnNode[pod.Spec.NodeName]++
		}
	}
	ranks := make([]int, len(podsToRank))
	for i, pod := range podsToRank {
		ranks[i] = podsOnNode[pod.Spec.NodeName]
	}
	return controller.ActivePodsWithRanks{Pods: podsToRank, Rank: ranks, Now: metav1.Now()}
}

func getPodKeys(pods []*v1.Pod) []string {
	podKeys := make([]string, 0, len(pods))
	for _, pod := range pods {
		podKeys = append(podKeys, controller.PodKey(pod))
	}
	return podKeys
}
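// A minimal wiring sketch (illustrative only, not part of this package's API): assuming a
// client-go clientset and a shared informer factory, a caller such as the controller
// manager constructs and runs this controller roughly as follows. The names "clientset",
// "factory", and the worker count are placeholders.
//
//	factory := informers.NewSharedInformerFactory(clientset, 0)
//	rsc := replicaset.NewReplicaSetController(
//		ctx,
//		factory.Apps().V1().ReplicaSets(),
//		factory.Core().V1().Pods(),
//		clientset,
//		replicaset.BurstReplicas,
//	)
//	factory.Start(ctx.Done())
//	go rsc.Run(ctx, 5)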