k8s.io/kubernetes@v1.29.3/pkg/controller/garbagecollector/garbagecollector.go

     1  /*
     2  Copyright 2016 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package garbagecollector
    18  
    19  import (
    20  	"context"
    21  	goerrors "errors"
    22  	"fmt"
    23  	"reflect"
    24  	"sync"
    25  	"time"
    26  
    27  	v1 "k8s.io/api/core/v1"
    28  	"k8s.io/apimachinery/pkg/api/errors"
    29  	"k8s.io/apimachinery/pkg/api/meta"
    30  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    31  	"k8s.io/apimachinery/pkg/runtime/schema"
    32  	"k8s.io/apimachinery/pkg/types"
    33  	utilerrors "k8s.io/apimachinery/pkg/util/errors"
    34  	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    35  	"k8s.io/apimachinery/pkg/util/sets"
    36  	"k8s.io/apimachinery/pkg/util/wait"
    37  	"k8s.io/client-go/discovery"
    38  	clientset "k8s.io/client-go/kubernetes" // import known versions
    39  	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
    40  	"k8s.io/client-go/metadata"
    41  	"k8s.io/client-go/tools/cache"
    42  	"k8s.io/client-go/tools/record"
    43  	"k8s.io/client-go/util/workqueue"
    44  	"k8s.io/controller-manager/controller"
    45  	"k8s.io/controller-manager/pkg/informerfactory"
    46  	"k8s.io/klog/v2"
    47  	c "k8s.io/kubernetes/pkg/controller"
    48  	"k8s.io/kubernetes/pkg/controller/apis/config/scheme"
    49  	"k8s.io/kubernetes/pkg/controller/garbagecollector/metrics"
    50  )
    51  
    52  // ResourceResyncTime defines the resync period of the garbage collector's informers.
    53  const ResourceResyncTime time.Duration = 0
    54  
    55  // GarbageCollector runs reflectors to watch for changes of managed API
    56  // objects, funnels the results to a single-threaded dependencyGraphBuilder,
    57  // which builds a graph caching the dependencies among objects. Triggered by the
    58  // graph changes, the dependencyGraphBuilder enqueues objects that can
    59  // potentially be garbage-collected to the `attemptToDelete` queue, and enqueues
    60  // objects whose dependents need to be orphaned to the `attemptToOrphan` queue.
    61  // The GarbageCollector has workers that consume these two queues and send
    62  // requests to the API server to delete or update objects accordingly.
    63  // Note that having the dependencyGraphBuilder notify the garbage collector
    64  // ensures that the garbage collector operates with a graph that is at least as
    65  // up to date as the notification itself.
    66  type GarbageCollector struct {
    67  	restMapper     meta.ResettableRESTMapper
    68  	metadataClient metadata.Interface
    69  	// garbage collector attempts to delete the items in the attemptToDelete queue when the time is ripe.
    70  	attemptToDelete workqueue.RateLimitingInterface
    71  	// garbage collector attempts to orphan the dependents of the items in the attemptToOrphan queue, then deletes the items.
    72  	attemptToOrphan        workqueue.RateLimitingInterface
    73  	dependencyGraphBuilder *GraphBuilder
    74  	// GC caches the owners that do not exist according to the API server.
    75  	absentOwnerCache *ReferenceCache
    76  
    77  	kubeClient       clientset.Interface
    78  	eventBroadcaster record.EventBroadcaster
    79  
    80  	workerLock sync.RWMutex
    81  }
    82  
    83  var _ controller.Interface = (*GarbageCollector)(nil)
    84  var _ controller.Debuggable = (*GarbageCollector)(nil)
    85  
    86  // NewGarbageCollector creates a new GarbageCollector.
    87  func NewGarbageCollector(
    88  	kubeClient clientset.Interface,
    89  	metadataClient metadata.Interface,
    90  	mapper meta.ResettableRESTMapper,
    91  	ignoredResources map[schema.GroupResource]struct{},
    92  	sharedInformers informerfactory.InformerFactory,
    93  	informersStarted <-chan struct{},
    94  ) (*GarbageCollector, error) {
    95  
    96  	eventBroadcaster := record.NewBroadcaster()
    97  	eventRecorder := eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "garbage-collector-controller"})
    98  
    99  	attemptToDelete := workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "garbage_collector_attempt_to_delete")
   100  	attemptToOrphan := workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "garbage_collector_attempt_to_orphan")
   101  	absentOwnerCache := NewReferenceCache(500)
   102  	gc := &GarbageCollector{
   103  		metadataClient:   metadataClient,
   104  		restMapper:       mapper,
   105  		attemptToDelete:  attemptToDelete,
   106  		attemptToOrphan:  attemptToOrphan,
   107  		absentOwnerCache: absentOwnerCache,
   108  		kubeClient:       kubeClient,
   109  		eventBroadcaster: eventBroadcaster,
   110  	}
   111  	gc.dependencyGraphBuilder = &GraphBuilder{
   112  		eventRecorder:    eventRecorder,
   113  		metadataClient:   metadataClient,
   114  		informersStarted: informersStarted,
   115  		restMapper:       mapper,
   116  		graphChanges:     workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "garbage_collector_graph_changes"),
   117  		uidToNode: &concurrentUIDToNode{
   118  			uidToNode: make(map[types.UID]*node),
   119  		},
   120  		attemptToDelete:  attemptToDelete,
   121  		attemptToOrphan:  attemptToOrphan,
   122  		absentOwnerCache: absentOwnerCache,
   123  		sharedInformers:  sharedInformers,
   124  		ignoredResources: ignoredResources,
   125  	}
   126  
   127  	metrics.Register()
   128  
   129  	return gc, nil
   130  }
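
// exampleRunGarbageCollector is an illustrative sketch, not upstream wiring:
// it shows roughly how a caller could drive the collector once constructed.
// The real setup lives in kube-controller-manager; the function name, worker
// count, and resync period here are assumptions made for the example.
func exampleRunGarbageCollector(ctx context.Context, gc *GarbageCollector, discoveryClient discovery.ServerResourcesInterface) {
	// Run starts the dependency graph builder and the delete/orphan workers,
	// blocking until ctx is cancelled.
	go gc.Run(ctx, 5)
	// Sync periodically re-reads discovery and resyncs the resource monitors.
	// Per the Sync docs, this discovery client should not be shared with
	// gc.restMapper.
	go gc.Sync(ctx, discoveryClient, 30*time.Second)
	<-ctx.Done()
}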
   131  
   132  // resyncMonitors starts or stops resource monitors as needed to ensure that all
   133  // (and only) those resources present in the map are monitored.
   134  func (gc *GarbageCollector) resyncMonitors(logger klog.Logger, deletableResources map[schema.GroupVersionResource]struct{}) error {
   135  	if err := gc.dependencyGraphBuilder.syncMonitors(logger, deletableResources); err != nil {
   136  		return err
   137  	}
   138  	gc.dependencyGraphBuilder.startMonitors(logger)
   139  	return nil
   140  }
   141  
   142  // Run starts garbage collector workers.
   143  func (gc *GarbageCollector) Run(ctx context.Context, workers int) {
   144  	defer utilruntime.HandleCrash()
   145  	defer gc.attemptToDelete.ShutDown()
   146  	defer gc.attemptToOrphan.ShutDown()
   147  	defer gc.dependencyGraphBuilder.graphChanges.ShutDown()
   148  
   149  	// Start events processing pipeline.
   150  	gc.eventBroadcaster.StartStructuredLogging(0)
   151  	gc.eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: gc.kubeClient.CoreV1().Events("")})
   152  	defer gc.eventBroadcaster.Shutdown()
   153  
   154  	logger := klog.FromContext(ctx)
   155  	logger.Info("Starting controller", "controller", "garbagecollector")
   156  	defer logger.Info("Shutting down controller", "controller", "garbagecollector")
   157  
   158  	go gc.dependencyGraphBuilder.Run(ctx)
   159  
   160  	if !cache.WaitForNamedCacheSync("garbage collector", ctx.Done(), func() bool {
   161  		return gc.dependencyGraphBuilder.IsSynced(logger)
   162  	}) {
   163  		return
   164  	}
   165  
   166  	logger.Info("All resource monitors have synced. Proceeding to collect garbage")
   167  
   168  	// gc workers
   169  	for i := 0; i < workers; i++ {
   170  		go wait.UntilWithContext(ctx, gc.runAttemptToDeleteWorker, 1*time.Second)
   171  		go wait.Until(func() { gc.runAttemptToOrphanWorker(logger) }, 1*time.Second, ctx.Done())
   172  	}
   173  
   174  	<-ctx.Done()
   175  }
   176  
   177  // Sync periodically resyncs the garbage collector when new resources are
   178  // observed from discovery. When new resources are detected, Sync will pause all
   179  // GC workers, reset gc.restMapper, and resync the monitors.
   180  //
   181  // Note that discoveryClient should NOT be shared with gc.restMapper, otherwise
   182  // the mapper's underlying discovery client will be unnecessarily reset during
   183  // the course of detecting new resources.
   184  func (gc *GarbageCollector) Sync(ctx context.Context, discoveryClient discovery.ServerResourcesInterface, period time.Duration) {
   185  	oldResources := make(map[schema.GroupVersionResource]struct{})
   186  	wait.UntilWithContext(ctx, func(ctx context.Context) {
   187  		logger := klog.FromContext(ctx)
   188  
   189  		// Get the current resource list from discovery.
   190  		newResources, err := GetDeletableResources(logger, discoveryClient)
   191  
   192  		if len(newResources) == 0 {
   193  			logger.V(2).Info("no resources reported by discovery, skipping garbage collector sync")
   194  			metrics.GarbageCollectorResourcesSyncError.Inc()
   195  			return
   196  		}
   197  		if groupLookupFailures, isLookupFailure := discovery.GroupDiscoveryFailedErrorGroups(err); isLookupFailure {
   198  			// In partial discovery cases, preserve existing synced informers for resources in the failed groups, so resyncMonitors will only add informers for newly seen resources
   199  			for k, v := range oldResources {
   200  				if _, failed := groupLookupFailures[k.GroupVersion()]; failed && gc.dependencyGraphBuilder.IsResourceSynced(k) {
   201  					newResources[k] = v
   202  				}
   203  			}
   204  		}
   205  
   206  		// Decide whether discovery has reported a change.
   207  		if reflect.DeepEqual(oldResources, newResources) {
   208  			logger.V(5).Info("no resource updates from discovery, skipping garbage collector sync")
   209  			return
   210  		}
   211  
   212  		// Ensure workers are paused to avoid processing events before informers
   213  		// have resynced.
   214  		gc.workerLock.Lock()
   215  		defer gc.workerLock.Unlock()
   216  
   217  		// Once we get here, we should not unpause workers until we've successfully synced
   218  		attempt := 0
   219  		wait.PollImmediateUntilWithContext(ctx, 100*time.Millisecond, func(ctx context.Context) (bool, error) {
   220  			attempt++
   221  
   222  			// On a reattempt, check if available resources have changed
   223  			if attempt > 1 {
   224  				newResources, err = GetDeletableResources(logger, discoveryClient)
   225  
   226  				if len(newResources) == 0 {
   227  					logger.V(2).Info("no resources reported by discovery", "attempt", attempt)
   228  					metrics.GarbageCollectorResourcesSyncError.Inc()
   229  					return false, nil
   230  				}
   231  				if groupLookupFailures, isLookupFailure := discovery.GroupDiscoveryFailedErrorGroups(err); isLookupFailure {
   232  					// In partial discovery cases, preserve existing synced informers for resources in the failed groups, so resyncMonitors will only add informers for newly seen resources
   233  					for k, v := range oldResources {
   234  						if _, failed := groupLookupFailures[k.GroupVersion()]; failed && gc.dependencyGraphBuilder.IsResourceSynced(k) {
   235  							newResources[k] = v
   236  						}
   237  					}
   238  				}
   239  			}
   240  
   241  			logger.V(2).Info(
   242  				"syncing garbage collector with updated resources from discovery",
   243  				"attempt", attempt,
   244  				"diff", printDiff(oldResources, newResources),
   245  			)
   246  
   247  			// Resetting the REST mapper will also invalidate the underlying discovery
   248  			// client. This is a leaky abstraction and assumes behavior about the REST
   249  			// mapper, but we'll deal with it for now.
   250  			gc.restMapper.Reset()
   251  			logger.V(4).Info("reset restmapper")
   252  
   253  			// Perform the monitor resync and wait for controllers to report cache sync.
   254  			//
   255  			// NOTE: It's possible that newResources will diverge from the resources
   256  			// discovered by restMapper during the call to Reset, since they are
   257  			// distinct discovery clients invalidated at different times. For example,
   258  			// newResources may contain resources not returned in the restMapper's
   259  			// discovery call if the resources appeared in-between the calls. In that
   260  			// case, the restMapper will fail to map some of newResources until the next
   261  			// attempt.
   262  			if err := gc.resyncMonitors(logger, newResources); err != nil {
   263  				utilruntime.HandleError(fmt.Errorf("failed to sync resource monitors (attempt %d): %v", attempt, err))
   264  				metrics.GarbageCollectorResourcesSyncError.Inc()
   265  				return false, nil
   266  			}
   267  			logger.V(4).Info("resynced monitors")
   268  
   269  			// wait for caches to fill for a while (our sync period) before attempting to rediscover resources and retry syncing.
   270  			// this protects us from deadlocks where available resources changed and one of our informer caches will never fill.
   271  			// informers keep attempting to sync in the background, so retrying doesn't interrupt them.
   272  			// the call to resyncMonitors on the reattempt will no-op for resources that still exist.
   273  			// note that workers stay paused until we successfully resync.
   274  			if !cache.WaitForNamedCacheSync("garbage collector", waitForStopOrTimeout(ctx.Done(), period), func() bool {
   275  				return gc.dependencyGraphBuilder.IsSynced(logger)
   276  			}) {
   277  				utilruntime.HandleError(fmt.Errorf("timed out waiting for dependency graph builder sync during GC sync (attempt %d)", attempt))
   278  				metrics.GarbageCollectorResourcesSyncError.Inc()
   279  				return false, nil
   280  			}
   281  
   282  			// success, break out of the loop
   283  			return true, nil
   284  		})
   285  
   286  		// Finally, keep track of our new state. Do this after all preceding steps
   287  		// have succeeded to ensure we'll retry on subsequent syncs if an error
   288  		// occurred.
   289  		oldResources = newResources
   290  		logger.V(2).Info("synced garbage collector")
   291  	}, period)
   292  }
   293  
   294  // printDiff returns a human-readable summary of what resources were added and removed
   295  func printDiff(oldResources, newResources map[schema.GroupVersionResource]struct{}) string {
   296  	removed := sets.NewString()
   297  	for oldResource := range oldResources {
   298  		if _, ok := newResources[oldResource]; !ok {
   299  			removed.Insert(fmt.Sprintf("%+v", oldResource))
   300  		}
   301  	}
   302  	added := sets.NewString()
   303  	for newResource := range newResources {
   304  		if _, ok := oldResources[newResource]; !ok {
   305  			added.Insert(fmt.Sprintf("%+v", newResource))
   306  		}
   307  	}
   308  	return fmt.Sprintf("added: %v, removed: %v", added.List(), removed.List())
   309  }
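
// examplePrintDiff is a hypothetical helper (not used by the controller) that
// shows the shape of printDiff's output; the formatting follows fmt's %+v verb
// for GroupVersionResource values.
func examplePrintDiff() string {
	oldResources := map[schema.GroupVersionResource]struct{}{}
	newResources := map[schema.GroupVersionResource]struct{}{
		{Group: "apps", Version: "v1", Resource: "deployments"}: {},
	}
	// Returns: added: [{Group:apps Version:v1 Resource:deployments}], removed: []
	return printDiff(oldResources, newResources)
}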
   310  
   311  // waitForStopOrTimeout returns a stop channel that closes when the provided stop channel closes or when the specified timeout is reached
   312  func waitForStopOrTimeout(stopCh <-chan struct{}, timeout time.Duration) <-chan struct{} {
   313  	stopChWithTimeout := make(chan struct{})
   314  	go func() {
   315  		select {
   316  		case <-stopCh:
   317  		case <-time.After(timeout):
   318  		}
   319  		close(stopChWithTimeout)
   320  	}()
   321  	return stopChWithTimeout
   322  }
   323  
   324  // IsSynced returns true if dependencyGraphBuilder is synced.
   325  func (gc *GarbageCollector) IsSynced(logger klog.Logger) bool {
   326  	return gc.dependencyGraphBuilder.IsSynced(logger)
   327  }
   328  
   329  func (gc *GarbageCollector) runAttemptToDeleteWorker(ctx context.Context) {
   330  	for gc.processAttemptToDeleteWorker(ctx) {
   331  	}
   332  }
   333  
   334  var enqueuedVirtualDeleteEventErr = goerrors.New("enqueued virtual delete event")
   335  
   336  var namespacedOwnerOfClusterScopedObjectErr = goerrors.New("cluster-scoped objects cannot refer to namespaced owners")
   337  
   338  func (gc *GarbageCollector) processAttemptToDeleteWorker(ctx context.Context) bool {
   339  	item, quit := gc.attemptToDelete.Get()
   340  	gc.workerLock.RLock()
   341  	defer gc.workerLock.RUnlock()
   342  	if quit {
   343  		return false
   344  	}
   345  	defer gc.attemptToDelete.Done(item)
   346  
   347  	action := gc.attemptToDeleteWorker(ctx, item)
   348  	switch action {
   349  	case forgetItem:
   350  		gc.attemptToDelete.Forget(item)
   351  	case requeueItem:
   352  		gc.attemptToDelete.AddRateLimited(item)
   353  	}
   354  
   355  	return true
   356  }
   357  
   358  type workQueueItemAction int
   359  
   360  const (
   361  	requeueItem = iota
   362  	forgetItem
   363  )
   364  
   365  func (gc *GarbageCollector) attemptToDeleteWorker(ctx context.Context, item interface{}) workQueueItemAction {
   366  	n, ok := item.(*node)
   367  	if !ok {
   368  		utilruntime.HandleError(fmt.Errorf("expect *node, got %#v", item))
   369  		return forgetItem
   370  	}
   371  
   372  	logger := klog.FromContext(ctx)
   373  
   374  	if !n.isObserved() {
   375  		nodeFromGraph, existsInGraph := gc.dependencyGraphBuilder.uidToNode.Read(n.identity.UID)
   376  		if !existsInGraph {
   377  			// this can happen if attemptToDelete loops on a requeued virtual node because attemptToDeleteItem returned an error,
   378  			// and in the meantime a deletion of the real object associated with that uid was observed
   379  			logger.V(5).Info("item no longer in the graph, skipping attemptToDeleteItem", "item", n.identity)
   380  			return forgetItem
   381  		}
   382  		if nodeFromGraph.isObserved() {
   383  			// this can happen if attemptToDelete loops on a requeued virtual node because attemptToDeleteItem returned an error,
   384  			// and in the meantime the real object associated with that uid was observed
   385  			logger.V(5).Info("item no longer virtual in the graph, skipping attemptToDeleteItem on virtual node", "item", n.identity)
   386  			return forgetItem
   387  		}
   388  	}
   389  
   390  	err := gc.attemptToDeleteItem(ctx, n)
   391  	if err == enqueuedVirtualDeleteEventErr {
   392  		// a virtual event was produced and will be handled by processGraphChanges, no need to requeue this node
   393  		return forgetItem
   394  	} else if err == namespacedOwnerOfClusterScopedObjectErr {
   395  		// a cluster-scoped object referring to a namespaced owner is an error that will not resolve on retry, no need to requeue this node
   396  		return forgetItem
   397  	} else if err != nil {
   398  		if _, ok := err.(*restMappingError); ok {
   399  			// There are at least two ways this can happen:
   400  			// 1. The reference is to an object of a custom type that has not yet been
   401  			//    recognized by gc.restMapper (this is a transient error).
   402  			// 2. The reference is to an invalid group/version. We don't currently
   403  			//    have a way to distinguish this from a valid type we will recognize
   404  			//    after the next discovery sync.
   405  			// For now, record the error and retry.
   406  			logger.V(5).Error(err, "error syncing item", "item", n.identity)
   407  		} else {
   408  			utilruntime.HandleError(fmt.Errorf("error syncing item %s: %v", n, err))
   409  		}
   410  		// retry if garbage collection of an object failed.
   411  		return requeueItem
   412  	} else if !n.isObserved() {
   413  		// requeue if item hasn't been observed via an informer event yet.
   414  		// otherwise a virtual node for an item added AND removed during watch reestablishment can get stuck in the graph and never removed.
   415  		// see https://issue.k8s.io/56121
   416  		logger.V(5).Info("item hasn't been observed via informer yet", "item", n.identity)
   417  		return requeueItem
   418  	}
   419  
   420  	return forgetItem
   421  }
   422  
   423  // isDangling checks whether a reference points to an object that doesn't exist.
   424  // If isDangling looks up the referenced object at the API server, it also
   425  // returns its latest state.
   426  func (gc *GarbageCollector) isDangling(ctx context.Context, reference metav1.OwnerReference, item *node) (
   427  	dangling bool, owner *metav1.PartialObjectMetadata, err error) {
   428  
   429  	logger := klog.FromContext(ctx)
   430  	// check for recorded absent cluster-scoped parent
   431  	absentOwnerCacheKey := objectReference{OwnerReference: ownerReferenceCoordinates(reference)}
   432  	if gc.absentOwnerCache.Has(absentOwnerCacheKey) {
   433  		logger.V(5).Info("according to the absentOwnerCache, item's owner does not exist",
   434  			"item", item.identity,
   435  			"owner", reference,
   436  		)
   437  		return true, nil, nil
   438  	}
   439  
   440  	// check for recorded absent namespaced parent
   441  	absentOwnerCacheKey.Namespace = item.identity.Namespace
   442  	if gc.absentOwnerCache.Has(absentOwnerCacheKey) {
   443  		logger.V(5).Info("according to the absentOwnerCache, item's owner does not exist in namespace",
   444  			"item", item.identity,
   445  			"owner", reference,
   446  		)
   447  		return true, nil, nil
   448  	}
   449  
   450  	// TODO: we need to verify the reference resource is supported by the
   451  	// system. If it's not a valid resource, the garbage collector should i)
   452  	// ignore the reference when deciding if the object should be deleted, and
   453  	// ii) update the object to remove such references. This is to
   454  	// prevent objects having references to an old resource from being
   455  	// deleted during a cluster upgrade.
   456  	resource, namespaced, err := gc.apiResource(reference.APIVersion, reference.Kind)
   457  	if err != nil {
   458  		return false, nil, err
   459  	}
   460  	if !namespaced {
   461  		absentOwnerCacheKey.Namespace = ""
   462  	}
   463  
   464  	if len(item.identity.Namespace) == 0 && namespaced {
   465  		// item is a cluster-scoped object referring to a namespace-scoped owner, which is not valid.
   466  		// return a marker error, rather than retrying on the lookup failure forever.
   467  		logger.V(2).Info("item is cluster-scoped, but refers to a namespaced owner",
   468  			"item", item.identity,
   469  			"owner", reference,
   470  		)
   471  		return false, nil, namespacedOwnerOfClusterScopedObjectErr
   472  	}
   473  
   474  	// TODO: It's only necessary to talk to the API server if the owner node
   475  	// is a "virtual" node. The local graph could lag behind the real
   476  	// status, but in practice, the difference is small.
   477  	owner, err = gc.metadataClient.Resource(resource).Namespace(resourceDefaultNamespace(namespaced, item.identity.Namespace)).Get(ctx, reference.Name, metav1.GetOptions{})
   478  	switch {
   479  	case errors.IsNotFound(err):
   480  		gc.absentOwnerCache.Add(absentOwnerCacheKey)
   481  		logger.V(5).Info("item's owner is not found",
   482  			"item", item.identity,
   483  			"owner", reference,
   484  		)
   485  		return true, nil, nil
   486  	case err != nil:
   487  		return false, nil, err
   488  	}
   489  
   490  	if owner.GetUID() != reference.UID {
   491  		logger.V(5).Info("item's owner is not found, UID mismatch",
   492  			"item", item.identity,
   493  			"owner", reference,
   494  		)
   495  		gc.absentOwnerCache.Add(absentOwnerCacheKey)
   496  		return true, nil, nil
   497  	}
   498  	return false, owner, nil
   499  }
   500  
   501  // classifyReferences classifies the latestReferences into three categories:
   502  // solid: the owner exists, and is not "waitingForDependentsDeletion"
   503  // dangling: the owner does not exist
   504  // waitingForDependentsDeletion: the owner exists, its deletionTimestamp is non-nil, and it has
   505  // FinalizerDeletingDependents
   506  // This function communicates with the server.
   507  func (gc *GarbageCollector) classifyReferences(ctx context.Context, item *node, latestReferences []metav1.OwnerReference) (
   508  	solid, dangling, waitingForDependentsDeletion []metav1.OwnerReference, err error) {
   509  	for _, reference := range latestReferences {
   510  		isDangling, owner, err := gc.isDangling(ctx, reference, item)
   511  		if err != nil {
   512  			return nil, nil, nil, err
   513  		}
   514  		if isDangling {
   515  			dangling = append(dangling, reference)
   516  			continue
   517  		}
   518  
   519  		ownerAccessor, err := meta.Accessor(owner)
   520  		if err != nil {
   521  			return nil, nil, nil, err
   522  		}
   523  		if ownerAccessor.GetDeletionTimestamp() != nil && hasDeleteDependentsFinalizer(ownerAccessor) {
   524  			waitingForDependentsDeletion = append(waitingForDependentsDeletion, reference)
   525  		} else {
   526  			solid = append(solid, reference)
   527  		}
   528  	}
   529  	return solid, dangling, waitingForDependentsDeletion, nil
   530  }
   531  
   532  func ownerRefsToUIDs(refs []metav1.OwnerReference) []types.UID {
   533  	var ret []types.UID
   534  	for _, ref := range refs {
   535  		ret = append(ret, ref.UID)
   536  	}
   537  	return ret
   538  }
   539  
   540  // attemptToDeleteItem looks up the live API object associated with the node,
   541  // and issues a delete IFF the uid matches, the item is not blocked on deleting dependents,
   542  // and all owner references are dangling.
   543  //
   544  // if the API get request returns a NotFound error, or the retrieved item's uid does not match,
   545  // a virtual delete event for the node is enqueued and enqueuedVirtualDeleteEventErr is returned.
   546  func (gc *GarbageCollector) attemptToDeleteItem(ctx context.Context, item *node) error {
   547  	logger := klog.FromContext(ctx)
   548  
   549  	logger.V(2).Info("Processing item",
   550  		"item", item.identity,
   551  		"virtual", !item.isObserved(),
   552  	)
   553  
   554  	// "being deleted" is a one-way trip to the final deletion. We'll just wait for the final deletion, and then process the object's dependents.
   555  	if item.isBeingDeleted() && !item.isDeletingDependents() {
   556  		logger.V(5).Info("processing item returned at once, because its DeletionTimestamp is non-nil",
   557  			"item", item.identity,
   558  		)
   559  		return nil
   560  	}
   561  	// TODO: It's only necessary to talk to the API server if this is a
   562  	// "virtual" node. The local graph could lag behind the real status, but in
   563  	// practice, the difference is small.
   564  	latest, err := gc.getObject(item.identity)
   565  	switch {
   566  	case errors.IsNotFound(err):
   567  		// the GraphBuilder can add "virtual" node for an owner that doesn't
   568  		// exist yet, so we need to enqueue a virtual Delete event to remove
   569  		// the virtual node from GraphBuilder.uidToNode.
   570  		logger.V(5).Info("item not found, generating a virtual delete event",
   571  			"item", item.identity,
   572  		)
   573  		gc.dependencyGraphBuilder.enqueueVirtualDeleteEvent(item.identity)
   574  		return enqueuedVirtualDeleteEventErr
   575  	case err != nil:
   576  		return err
   577  	}
   578  
   579  	if latest.GetUID() != item.identity.UID {
   580  		logger.V(5).Info("UID doesn't match, item not found, generating a virtual delete event",
   581  			"item", item.identity,
   582  		)
   583  		gc.dependencyGraphBuilder.enqueueVirtualDeleteEvent(item.identity)
   584  		return enqueuedVirtualDeleteEventErr
   585  	}
   586  
   587  	// TODO: attemptToOrphanWorker() routine is similar. Consider merging
   588  	// attemptToOrphanWorker() into attemptToDeleteItem() as well.
   589  	if item.isDeletingDependents() {
   590  		return gc.processDeletingDependentsItem(logger, item)
   591  	}
   592  
   593  	// compute if we should delete the item
   594  	ownerReferences := latest.GetOwnerReferences()
   595  	if len(ownerReferences) == 0 {
   596  		logger.V(2).Info("item doesn't have an owner, continue on next item",
   597  			"item", item.identity,
   598  		)
   599  		return nil
   600  	}
   601  
   602  	solid, dangling, waitingForDependentsDeletion, err := gc.classifyReferences(ctx, item, ownerReferences)
   603  	if err != nil {
   604  		return err
   605  	}
   606  	logger.V(5).Info("classify item's references",
   607  		"item", item.identity,
   608  		"solid", solid,
   609  		"dangling", dangling,
   610  		"waitingForDependentsDeletion", waitingForDependentsDeletion,
   611  	)
   612  
   613  	switch {
   614  	case len(solid) != 0:
   615  		logger.V(2).Info("item has at least one existing owner, will not garbage collect",
   616  			"item", item.identity,
   617  			"owner", solid,
   618  		)
   619  		if len(dangling) == 0 && len(waitingForDependentsDeletion) == 0 {
   620  			return nil
   621  		}
   622  		logger.V(2).Info("remove dangling references and waiting references for item",
   623  			"item", item.identity,
   624  			"dangling", dangling,
   625  			"waitingForDependentsDeletion", waitingForDependentsDeletion,
   626  		)
   627  		// waitingForDependentsDeletion needs to be deleted from the
   628  		// ownerReferences, otherwise the referenced objects will be stuck with
   629  		// the FinalizerDeletingDependents and never get deleted.
   630  		ownerUIDs := append(ownerRefsToUIDs(dangling), ownerRefsToUIDs(waitingForDependentsDeletion)...)
   631  		p, err := c.GenerateDeleteOwnerRefStrategicMergeBytes(item.identity.UID, ownerUIDs)
   632  		if err != nil {
   633  			return err
   634  		}
   635  		_, err = gc.patch(item, p, func(n *node) ([]byte, error) {
   636  			return gc.deleteOwnerRefJSONMergePatch(n, ownerUIDs...)
   637  		})
   638  		return err
   639  	case len(waitingForDependentsDeletion) != 0 && item.dependentsLength() != 0:
   640  		deps := item.getDependents()
   641  		for _, dep := range deps {
   642  			if dep.isDeletingDependents() {
   643  				// this cycle detection has false positives; we need to
   644  				// apply a more rigorous detection if this turns out to be a
   645  				// problem.
   646  				// multiple workers run attemptToDeleteItem in parallel, so
   647  				// the cycle detection can fail in a race condition.
   648  				logger.V(2).Info("processing item, some of its owners and its dependent have FinalizerDeletingDependents, to prevent potential cycle, its ownerReferences are going to be modified to be non-blocking, then the item is going to be deleted with Foreground",
   649  					"item", item.identity,
   650  					"dependent", dep.identity,
   651  				)
   652  				patch, err := item.unblockOwnerReferencesStrategicMergePatch()
   653  				if err != nil {
   654  					return err
   655  				}
   656  				if _, err := gc.patch(item, patch, gc.unblockOwnerReferencesJSONMergePatch); err != nil {
   657  					return err
   658  				}
   659  				break
   660  			}
   661  		}
   662  		logger.V(2).Info("at least one owner of item has FinalizerDeletingDependents, and the item itself has dependents, so it is going to be deleted in Foreground",
   663  			"item", item.identity,
   664  		)
   665  		// the deletion event will be observed by the graphBuilder, so the item
   666  		// will be processed again in processDeletingDependentsItem. If it
   667  		// doesn't have dependents, the function will remove the
   668  		// FinalizerDeletingDependents from the item, resulting in the final
   669  		// deletion of the item.
   670  		policy := metav1.DeletePropagationForeground
   671  		return gc.deleteObject(item.identity, &policy)
   672  	default:
   673  		// item doesn't have any solid owner, so it needs to be garbage
   674  		// collected. Also, none of item's owners is waiting for the deletion of
   675  		// the dependents, so set propagationPolicy based on existing finalizers.
   676  		var policy metav1.DeletionPropagation
   677  		switch {
   678  		case hasOrphanFinalizer(latest):
   679  			// if an existing orphan finalizer is already on the object, honor it.
   680  			policy = metav1.DeletePropagationOrphan
   681  		case hasDeleteDependentsFinalizer(latest):
   682  			// if an existing foreground finalizer is already on the object, honor it.
   683  			policy = metav1.DeletePropagationForeground
   684  		default:
   685  			// otherwise, default to background.
   686  			policy = metav1.DeletePropagationBackground
   687  		}
   688  		logger.V(2).Info("Deleting item",
   689  			"item", item.identity,
   690  			"propagationPolicy", policy,
   691  		)
   692  		return gc.deleteObject(item.identity, &policy)
   693  	}
   694  }
   695  
   696  // processDeletingDependentsItem processes an item that's waiting for its dependents to be deleted.
   697  func (gc *GarbageCollector) processDeletingDependentsItem(logger klog.Logger, item *node) error {
   698  	blockingDependents := item.blockingDependents()
   699  	if len(blockingDependents) == 0 {
   700  		logger.V(2).Info("remove DeleteDependents finalizer for item", "item", item.identity)
   701  		return gc.removeFinalizer(logger, item, metav1.FinalizerDeleteDependents)
   702  	}
   703  	for _, dep := range blockingDependents {
   704  		if !dep.isDeletingDependents() {
   705  			logger.V(2).Info("adding dependent to attemptToDelete, because its owner is deletingDependents",
   706  				"item", item.identity,
   707  				"dependent", dep.identity,
   708  			)
   709  			gc.attemptToDelete.Add(dep)
   710  		}
   711  	}
   712  	return nil
   713  }
   714  
   715  // dependents are copies of pointers to the owner's dependents; they don't need to be locked.
   716  func (gc *GarbageCollector) orphanDependents(logger klog.Logger, owner objectReference, dependents []*node) error {
   717  	errCh := make(chan error, len(dependents))
   718  	wg := sync.WaitGroup{}
   719  	wg.Add(len(dependents))
   720  	for i := range dependents {
   721  		go func(dependent *node) {
   722  			defer wg.Done()
   723  			// the dependent.identity.UID is used as precondition
   724  			p, err := c.GenerateDeleteOwnerRefStrategicMergeBytes(dependent.identity.UID, []types.UID{owner.UID})
   725  			if err != nil {
   726  				errCh <- fmt.Errorf("orphaning %s failed, %v", dependent.identity, err)
   727  				return
   728  			}
   729  			_, err = gc.patch(dependent, p, func(n *node) ([]byte, error) {
   730  				return gc.deleteOwnerRefJSONMergePatch(n, owner.UID)
   731  			})
   732  			// note that if the target ownerReference doesn't exist in the
   733  			// dependent, strategic merge patch will NOT return an error.
   734  			if err != nil && !errors.IsNotFound(err) {
   735  				errCh <- fmt.Errorf("orphaning %s failed, %v", dependent.identity, err)
   736  			}
   737  		}(dependents[i])
   738  	}
   739  	wg.Wait()
   740  	close(errCh)
   741  
   742  	var errorsSlice []error
   743  	for e := range errCh {
   744  		errorsSlice = append(errorsSlice, e)
   745  	}
   746  
   747  	if len(errorsSlice) != 0 {
   748  		return fmt.Errorf("failed to orphan dependents of owner %s, got errors: %s", owner, utilerrors.NewAggregate(errorsSlice).Error())
   749  	}
   750  	logger.V(5).Info("successfully updated all dependents", "owner", owner)
   751  	return nil
   752  }
   753  
   754  func (gc *GarbageCollector) runAttemptToOrphanWorker(logger klog.Logger) {
   755  	for gc.processAttemptToOrphanWorker(logger) {
   756  	}
   757  }
   758  
   759  // processAttemptToOrphanWorker dequeues a node from the attemptToOrphan queue, finds its
   760  // dependents based on the graph maintained by the GC, removes the owner from the
   761  // OwnerReferences of its dependents, and finally updates the owner to remove
   762  // the "Orphan" finalizer. The node is added back to the attemptToOrphan queue if any of
   763  // these steps fails.
   764  func (gc *GarbageCollector) processAttemptToOrphanWorker(logger klog.Logger) bool {
   765  	item, quit := gc.attemptToOrphan.Get()
   766  	gc.workerLock.RLock()
   767  	defer gc.workerLock.RUnlock()
   768  	if quit {
   769  		return false
   770  	}
   771  	defer gc.attemptToOrphan.Done(item)
   772  
   773  	action := gc.attemptToOrphanWorker(logger, item)
   774  	switch action {
   775  	case forgetItem:
   776  		gc.attemptToOrphan.Forget(item)
   777  	case requeueItem:
   778  		gc.attemptToOrphan.AddRateLimited(item)
   779  	}
   780  
   781  	return true
   782  }
   783  
   784  func (gc *GarbageCollector) attemptToOrphanWorker(logger klog.Logger, item interface{}) workQueueItemAction {
   785  	owner, ok := item.(*node)
   786  	if !ok {
   787  		utilruntime.HandleError(fmt.Errorf("expect *node, got %#v", item))
   788  		return forgetItem
   789  	}
   790  	// we don't need to lock each element, because they never get updated
   791  	owner.dependentsLock.RLock()
   792  	dependents := make([]*node, 0, len(owner.dependents))
   793  	for dependent := range owner.dependents {
   794  		dependents = append(dependents, dependent)
   795  	}
   796  	owner.dependentsLock.RUnlock()
   797  
   798  	err := gc.orphanDependents(logger, owner.identity, dependents)
   799  	if err != nil {
   800  		utilruntime.HandleError(fmt.Errorf("orphanDependents for %s failed with %v", owner.identity, err))
   801  		return requeueItem
   802  	}
   803  	// update the owner: remove the orphan finalizer (FinalizerOrphanDependents) from its finalizers list
   804  	err = gc.removeFinalizer(logger, owner, metav1.FinalizerOrphanDependents)
   805  	if err != nil {
   806  		utilruntime.HandleError(fmt.Errorf("removeOrphanFinalizer for %s failed with %v", owner.identity, err))
   807  		return requeueItem
   808  	}
   809  	return forgetItem
   810  }
   811  
   812  // *FOR TEST USE ONLY*
   813  // GraphHasUID returns whether the GraphBuilder has a particular UID stored in its
   814  // uidToNode graph. It's useful for debugging.
   815  // This method is used by integration tests.
   816  func (gc *GarbageCollector) GraphHasUID(u types.UID) bool {
   817  	_, ok := gc.dependencyGraphBuilder.uidToNode.Read(u)
   818  	return ok
   819  }
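
// exampleWaitForGraphUID is an illustrative sketch of how an integration test
// might use GraphHasUID: poll until the dependency graph has observed a given
// UID. The function name, interval, and timeout are assumptions for this
// example, not what the real tests use.
func exampleWaitForGraphUID(ctx context.Context, gc *GarbageCollector, uid types.UID) error {
	return wait.PollUntilContextTimeout(ctx, 1*time.Second, 30*time.Second, true,
		func(ctx context.Context) (bool, error) {
			// Done once the UID shows up in the GraphBuilder's uidToNode graph.
			return gc.GraphHasUID(uid), nil
		})
}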
   820  
   821  // GetDeletableResources returns all resources from discoveryClient that the
   822  // garbage collector should recognize and work with. More specifically, all
   823  // preferred resources which support the 'delete', 'list', and 'watch' verbs.
   824  //
   825  // If an error was encountered fetching resources from the server,
   826  // it is included as well, along with any resources that were successfully resolved.
   827  //
   828  // All discovery errors are considered temporary. Upon encountering any error,
   829  // GetDeletableResources will log and return any discovered resources it was
   830  // able to process (which may be none).
   831  func GetDeletableResources(logger klog.Logger, discoveryClient discovery.ServerResourcesInterface) (map[schema.GroupVersionResource]struct{}, error) {
   832  	preferredResources, lookupErr := discoveryClient.ServerPreferredResources()
   833  	if lookupErr != nil {
   834  		if groupLookupFailures, isLookupFailure := discovery.GroupDiscoveryFailedErrorGroups(lookupErr); isLookupFailure {
   835  			logger.Info("failed to discover some groups", "groups", groupLookupFailures)
   836  		} else {
   837  			logger.Info("failed to discover preferred resources", "error", lookupErr)
   838  		}
   839  	}
   840  	if preferredResources == nil {
   841  		return map[schema.GroupVersionResource]struct{}{}, lookupErr
   842  	}
   843  
   844  	// This is extracted from discovery.GroupVersionResources to allow tolerating
   845  	// failures on a per-resource basis.
   846  	deletableResources := discovery.FilteredBy(discovery.SupportsAllVerbs{Verbs: []string{"delete", "list", "watch"}}, preferredResources)
   847  	deletableGroupVersionResources := map[schema.GroupVersionResource]struct{}{}
   848  	for _, rl := range deletableResources {
   849  		gv, err := schema.ParseGroupVersion(rl.GroupVersion)
   850  		if err != nil {
   851  			logger.Info("ignoring invalid discovered resource", "groupversion", rl.GroupVersion, "error", err)
   852  			continue
   853  		}
   854  		for i := range rl.APIResources {
   855  			deletableGroupVersionResources[schema.GroupVersionResource{Group: gv.Group, Version: gv.Version, Resource: rl.APIResources[i].Name}] = struct{}{}
   856  		}
   857  	}
   858  
   859  	return deletableGroupVersionResources, lookupErr
   860  }
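
// exampleSeedMonitors is a hypothetical sketch (not part of the controller) of
// feeding one discovery pass into the resource monitors via resyncMonitors,
// roughly the step Sync performs every period. The name and the handling of
// partial discovery failures below are assumptions for illustration.
func exampleSeedMonitors(ctx context.Context, gc *GarbageCollector, discoveryClient discovery.ServerResourcesInterface) error {
	logger := klog.FromContext(ctx)
	deletableResources, err := GetDeletableResources(logger, discoveryClient)
	if len(deletableResources) == 0 {
		// Nothing usable came back; surface the discovery error rather than
		// wiping out all existing monitors.
		return fmt.Errorf("no deletable resources discovered: %v", err)
	}
	if err != nil {
		// Discovery errors are treated as temporary; resources that were
		// resolved are still returned and can be monitored.
		logger.V(2).Info("proceeding despite partial discovery failure", "error", err)
	}
	return gc.resyncMonitors(logger, deletableResources)
}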
   861  
   862  func (gc *GarbageCollector) Name() string {
   863  	return "garbagecollector"
   864  }