k8s.io/kubernetes@v1.29.3/pkg/controller/resourceclaim/controller.go

     1  /*
     2  Copyright 2020 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package resourceclaim
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	"fmt"
    23  	"strings"
    24  	"time"
    25  
    26  	v1 "k8s.io/api/core/v1"
    27  	resourcev1alpha2 "k8s.io/api/resource/v1alpha2"
    28  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    29  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    30  	"k8s.io/apimachinery/pkg/types"
    31  	"k8s.io/apimachinery/pkg/util/runtime"
    32  	"k8s.io/apimachinery/pkg/util/wait"
    33  	corev1apply "k8s.io/client-go/applyconfigurations/core/v1"
    34  	v1informers "k8s.io/client-go/informers/core/v1"
    35  	resourcev1alpha2informers "k8s.io/client-go/informers/resource/v1alpha2"
    36  	clientset "k8s.io/client-go/kubernetes"
    37  	"k8s.io/client-go/kubernetes/scheme"
    38  	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
    39  	v1listers "k8s.io/client-go/listers/core/v1"
    40  	resourcev1alpha2listers "k8s.io/client-go/listers/resource/v1alpha2"
    41  	"k8s.io/client-go/tools/cache"
    42  	"k8s.io/client-go/tools/record"
    43  	"k8s.io/client-go/util/workqueue"
    44  	"k8s.io/dynamic-resource-allocation/resourceclaim"
    45  	"k8s.io/klog/v2"
    46  	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
    47  	"k8s.io/kubernetes/pkg/controller/resourceclaim/metrics"
    48  	"k8s.io/utils/pointer"
    49  )
    50  
    51  const (
    52  	// podResourceClaimIndex is the lookup name for the index function which indexes pods by the ResourceClaims that they reference.
    53  	podResourceClaimIndex = "pod-resource-claim-index"
    54  
    55  	// podResourceClaimAnnotation is the special annotation that generated
    56  	// ResourceClaims get. Its value is the pod.spec.resourceClaims[].name
    57  	// for which it was generated. This is used only inside the controller
    58  	// and not documented as part of the Kubernetes API.
    59  	podResourceClaimAnnotation = "resource.kubernetes.io/pod-claim-name"
    60  
    61  	// claimPodOwnerIndex is used to find ResourceClaims which have
    62  	// a specific pod as owner. Values for this index are the pod UID.
    63  	claimPodOwnerIndex = "claim-pod-owner-index"
    64  
    65  	// Field manager used to update the pod status.
    66  	fieldManager = "ResourceClaimController"
    67  
    68  	maxUIDCacheEntries = 500
    69  )
    70  
    71  // Controller creates ResourceClaims for ResourceClaimTemplates in a pod spec.
    72  type Controller struct {
    73  	// kubeClient is the kube API client used to communicate with the API
    74  	// server.
    75  	kubeClient clientset.Interface
    76  
    77  	// claimLister is the shared ResourceClaim lister used to fetch and store ResourceClaim
    78  	// objects from the API server. It is shared with other controllers and
    79  	// therefore the ResourceClaim objects in its store should be treated as immutable.
    80  	claimLister  resourcev1alpha2listers.ResourceClaimLister
    81  	claimsSynced cache.InformerSynced
    82  	claimCache   cache.MutationCache
    83  
    84  	// podLister is the shared Pod lister used to fetch Pod
    85  	// objects from the API server. It is shared with other controllers and
    86  	// therefore the Pod objects in its store should be treated as immutable.
    87  	podLister v1listers.PodLister
    88  	podSynced cache.InformerSynced
    89  
    90  	// podSchedulingLister is the shared PodSchedulingContext lister used to
    91  	// fetch scheduling objects from the API server. It is shared with other
    92  	// controllers and therefore the objects in its store should be treated
    93  	// as immutable.
    94  	podSchedulingLister resourcev1alpha2listers.PodSchedulingContextLister
    95  	podSchedulingSynced cache.InformerSynced
    96  
    97  	// templateLister is the shared ResourceClaimTemplate lister used to
    98  	// fetch template objects from the API server. It is shared with other
    99  	// controllers and therefore the objects in its store should be treated
   100  	// as immutable.
   101  	templateLister  resourcev1alpha2listers.ResourceClaimTemplateLister
   102  	templatesSynced cache.InformerSynced
   103  
   104  	// podIndexer has the common PodResourceClaim indexer installed to
   105  	// limit iteration over pods to those of interest.
   106  	podIndexer cache.Indexer
   107  
   108  	// recorder is used to record events in the API server
   109  	recorder record.EventRecorder
   110  
   111  	queue workqueue.RateLimitingInterface
   112  
   113  	// The deletedObjects cache keeps track of Pods which are known to have
   114  	// existed and to have been removed since. For those we can be sure
   115  	// that a ReservedFor entry needs to be removed.
   116  	deletedObjects *uidCache
   117  }
   118  
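        // Work queue keys are prefixed so that syncHandler can tell pods and claims
        // apart, for example "pod:<namespace>/<name>" or "claim:<namespace>/<name>".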
   119  const (
   120  	claimKeyPrefix = "claim:"
   121  	podKeyPrefix   = "pod:"
   122  )
   123  
   124  // NewController creates a ResourceClaim controller.
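        //
        // A minimal usage sketch (hypothetical names; the client and shared informer
        // factory are assumed to be created elsewhere, and the factory must be
        // started after the controller has registered its event handlers):
        //
        //	controller, err := NewController(klog.FromContext(ctx), client,
        //		informerFactory.Core().V1().Pods(),
        //		informerFactory.Resource().V1alpha2().PodSchedulingContexts(),
        //		informerFactory.Resource().V1alpha2().ResourceClaims(),
        //		informerFactory.Resource().V1alpha2().ResourceClaimTemplates())
        //	if err != nil {
        //		return err
        //	}
        //	informerFactory.Start(ctx.Done())
        //	go controller.Run(ctx, 2 /* workers */)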
   125  func NewController(
   126  	logger klog.Logger,
   127  	kubeClient clientset.Interface,
   128  	podInformer v1informers.PodInformer,
   129  	podSchedulingInformer resourcev1alpha2informers.PodSchedulingContextInformer,
   130  	claimInformer resourcev1alpha2informers.ResourceClaimInformer,
   131  	templateInformer resourcev1alpha2informers.ResourceClaimTemplateInformer) (*Controller, error) {
   132  
   133  	ec := &Controller{
   134  		kubeClient:          kubeClient,
   135  		podLister:           podInformer.Lister(),
   136  		podIndexer:          podInformer.Informer().GetIndexer(),
   137  		podSynced:           podInformer.Informer().HasSynced,
   138  		podSchedulingLister: podSchedulingInformer.Lister(),
   139  		podSchedulingSynced: podSchedulingInformer.Informer().HasSynced,
   140  		claimLister:         claimInformer.Lister(),
   141  		claimsSynced:        claimInformer.Informer().HasSynced,
   142  		templateLister:      templateInformer.Lister(),
   143  		templatesSynced:     templateInformer.Informer().HasSynced,
   144  		queue:               workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "resource_claim"),
   145  		deletedObjects:      newUIDCache(maxUIDCacheEntries),
   146  	}
   147  
   148  	metrics.RegisterMetrics()
   149  
   150  	if _, err := podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   151  		AddFunc: func(obj interface{}) {
   152  			ec.enqueuePod(logger, obj, false)
   153  		},
   154  		UpdateFunc: func(old, updated interface{}) {
   155  			ec.enqueuePod(logger, updated, false)
   156  		},
   157  		DeleteFunc: func(obj interface{}) {
   158  			ec.enqueuePod(logger, obj, true)
   159  		},
   160  	}); err != nil {
   161  		return nil, err
   162  	}
   163  	if _, err := claimInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   164  		AddFunc: func(obj interface{}) {
   165  			logger.V(6).Info("new claim", "claimDump", obj)
   166  			ec.enqueueResourceClaim(logger, obj, false)
   167  		},
   168  		UpdateFunc: func(old, updated interface{}) {
   169  			logger.V(6).Info("updated claim", "claimDump", updated)
   170  			ec.enqueueResourceClaim(logger, updated, false)
   171  		},
   172  		DeleteFunc: func(obj interface{}) {
   173  			logger.V(6).Info("deleted claim", "claimDump", obj)
   174  			ec.enqueueResourceClaim(logger, obj, true)
   175  		},
   176  	}); err != nil {
   177  		return nil, err
   178  	}
   179  	if err := ec.podIndexer.AddIndexers(cache.Indexers{podResourceClaimIndex: podResourceClaimIndexFunc}); err != nil {
   180  		return nil, fmt.Errorf("could not initialize ResourceClaim controller: %w", err)
   181  	}
   182  
   183  	// The mutation cache acts as an additional layer on top of the informer
   184  	// cache: after the controller creates an object, the mutation cache keeps
   185  	// returning that object until the informer catches up. That is necessary
   186  	// when a ResourceClaim got created, updating the pod status fails,
   187  	// and then a retry occurs before the informer cache is updated.
   188  	// Without the mutation cache, the controller would create another claim
   189  	// instead of continuing with the existing one.
   190  	claimInformerCache := claimInformer.Informer().GetIndexer()
   191  	if err := claimInformerCache.AddIndexers(cache.Indexers{claimPodOwnerIndex: claimPodOwnerIndexFunc}); err != nil {
   192  		return nil, fmt.Errorf("could not initialize ResourceClaim controller: %w", err)
   193  	}
   194  	ec.claimCache = cache.NewIntegerResourceVersionMutationCache(claimInformerCache, claimInformerCache,
   195  		// Very long time to live, unlikely to be needed because
   196  		// the informer cache should get updated soon.
   197  		time.Hour,
   198  		// Allow storing objects not in the underlying cache - that's the point...
   199  		// It's safe because in case of a race (claim is in mutation cache, claim
   200  		// gets deleted, controller updates status based on mutation cache) the
   201  		// "bad" pod status will get detected and fixed when the informer catches up.
   202  		true,
   203  	)
   204  
   205  	return ec, nil
   206  }
   207  
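        // enqueuePod adds work items for a new, updated, or deleted pod: the pod
        // itself if it still needs processing, and the ResourceClaims it references
        // if their reservations may need to be released.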
   208  func (ec *Controller) enqueuePod(logger klog.Logger, obj interface{}, deleted bool) {
   209  	if d, ok := obj.(cache.DeletedFinalStateUnknown); ok {
   210  		obj = d.Obj
   211  	}
   212  	pod, ok := obj.(*v1.Pod)
   213  	if !ok {
   214  		// Not a pod?!
   215  		logger.Error(nil, "enqueuePod called for unexpected object", "type", fmt.Sprintf("%T", obj))
   216  		return
   217  	}
   218  
   219  	if len(pod.Spec.ResourceClaims) == 0 {
   220  		// Nothing to do for it at all.
   221  		return
   222  	}
   223  
   224  	if deleted {
   225  		logger.V(6).Info("pod got deleted", "pod", klog.KObj(pod))
   226  		ec.deletedObjects.Add(pod.UID)
   227  	}
   228  
   229  	logger.V(6).Info("pod with resource claims changed", "pod", klog.KObj(pod), "deleted", deleted)
   230  
   231  	// Release reservations of a deleted or completed pod?
   232  	if needsClaims, reason := podNeedsClaims(pod, deleted); !needsClaims {
   233  		for _, podClaim := range pod.Spec.ResourceClaims {
   234  			claimName, _, err := resourceclaim.Name(pod, &podClaim)
   235  			switch {
   236  			case err != nil:
   237  				// Either the claim was not created (nothing to do here) or
   238  				// the API changed. The latter will also get reported elsewhere,
   239  				// so here it's just a debug message.
   240  				logger.V(6).Info("Nothing to do for claim during pod change", "err", err, "reason", reason)
   241  			case claimName != nil:
   242  				key := claimKeyPrefix + pod.Namespace + "/" + *claimName
   243  				logger.V(6).Info("Process claim", "pod", klog.KObj(pod), "key", key, "reason", reason)
   244  				ec.queue.Add(key)
   245  			default:
   246  				// Nothing to do, claim wasn't generated.
   247  				logger.V(6).Info("Nothing to do for skipped claim during pod change", "reason", reason)
   248  			}
   249  		}
   250  	}
   251  
   252  	needsWork, reason := ec.podNeedsWork(pod)
   253  	if needsWork {
   254  		logger.V(6).Info("enqueueing pod", "pod", klog.KObj(pod), "reason", reason)
   255  		ec.queue.Add(podKeyPrefix + pod.Namespace + "/" + pod.Name)
   256  		return
   257  	}
   258  	logger.V(6).Info("not enqueueing pod", "pod", klog.KObj(pod), "reason", reason)
   259  }
   260  
   261  func podNeedsClaims(pod *v1.Pod, deleted bool) (bool, string) {
   262  	if deleted {
   263  		return false, "pod got removed"
   264  	}
   265  	if podutil.IsPodTerminal(pod) {
   266  		return false, "pod has terminated"
   267  	}
   268  	if pod.DeletionTimestamp != nil && pod.Spec.NodeName == "" {
   269  		return false, "pod got deleted before scheduling"
   270  	}
   271  	// Still needs claims.
   272  	return true, "pod might run"
   273  }
   274  
   275  // podNeedsWork checks whether a new or modified pod needs to be processed
   276  // further by a worker. It returns a boolean with the result and an explanation
   277  // for it.
   278  func (ec *Controller) podNeedsWork(pod *v1.Pod) (bool, string) {
   279  	if pod.DeletionTimestamp != nil {
   280  		// Nothing else to do for the pod.
   281  		return false, "pod is deleted"
   282  	}
   283  
   284  	for _, podClaim := range pod.Spec.ResourceClaims {
   285  		claimName, checkOwner, err := resourceclaim.Name(pod, &podClaim)
   286  		if err != nil {
   287  			return true, err.Error()
   288  		}
   289  		// If the claimName is nil, then it has been determined before
   290  		// that the claim is not needed.
   291  		if claimName == nil {
   292  			return false, "claim is not needed"
   293  		}
   294  		claim, err := ec.claimLister.ResourceClaims(pod.Namespace).Get(*claimName)
   295  		if apierrors.IsNotFound(err) {
   296  			if podClaim.Source.ResourceClaimTemplateName != nil {
   297  				return true, "must create ResourceClaim from template"
   298  			}
   299  			// User needs to create claim.
   300  			return false, "claim is missing and must be created by user"
   301  		}
   302  		if err != nil {
   303  			// Shouldn't happen.
   304  			return true, fmt.Sprintf("internal error while checking for claim: %v", err)
   305  		}
   306  
   307  		if checkOwner &&
   308  			resourceclaim.IsForPod(pod, claim) != nil {
   309  			// Cannot proceed with the pod unless that other claim gets deleted.
   310  			return false, "conflicting claim needs to be removed by user"
   311  		}
   312  
   313  		// This check skips over the reasons below that only apply
   314  		// when a pod has been scheduled already. We need to keep checking
   315  		// for more claims that might need to be created.
   316  		if pod.Spec.NodeName == "" {
   317  			continue
   318  		}
   319  
   320  		// Create PodSchedulingContext if the pod got scheduled without triggering
   321  		// delayed allocation.
   322  		//
   323  		// This can happen when:
   324  		// - a user created a pod with spec.nodeName set, perhaps for testing
   325  		// - some scheduler was used which is unaware of DRA
   326  		// - DRA was not enabled in kube-scheduler (version skew, configuration)
   327  		if claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer &&
   328  			claim.Status.Allocation == nil {
   329  			scheduling, err := ec.podSchedulingLister.PodSchedulingContexts(pod.Namespace).Get(pod.Name)
   330  			if apierrors.IsNotFound(err) {
   331  				return true, "need to create PodSchedulingContext for scheduled pod"
   332  			}
   333  			if err != nil {
   334  				// Shouldn't happen.
   335  				return true, fmt.Sprintf("internal error while checking for PodSchedulingContext: %v", err)
   336  			}
   337  			if scheduling.Spec.SelectedNode != pod.Spec.NodeName {
   338  				// Need to update PodSchedulingContext.
   339  				return true, "need to update PodSchedulingContext for scheduled pod"
   340  			}
   341  		}
   342  		if claim.Status.Allocation != nil &&
   343  			!resourceclaim.IsReservedForPod(pod, claim) &&
   344  			resourceclaim.CanBeReserved(claim) {
   345  			// Need to reserve it.
   346  			return true, "need to reserve claim for pod"
   347  		}
   348  	}
   349  
   350  	return false, "nothing to do"
   351  }
   352  
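        // enqueueResourceClaim queues a new or updated claim so that stale
        // ReservedFor entries get cleaned up, and additionally re-queues all
        // known pods which reference the claim.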
   353  func (ec *Controller) enqueueResourceClaim(logger klog.Logger, obj interface{}, deleted bool) {
   354  	if d, ok := obj.(cache.DeletedFinalStateUnknown); ok {
   355  		obj = d.Obj
   356  	}
   357  	claim, ok := obj.(*resourcev1alpha2.ResourceClaim)
   358  	if !ok {
   359  		return
   360  	}
   361  
   362  	if !deleted {
   363  		// When starting up, we have to check all claims to find those with
   364  		// stale pods in ReservedFor. During an update, a pod which already
   365  		// no longer exists might get added to ReservedFor.
   366  		key := claimKeyPrefix + claim.Namespace + "/" + claim.Name
   367  		logger.V(6).Info("enqueueing new or updated claim", "claim", klog.KObj(claim), "key", key)
   368  		ec.queue.Add(key)
   369  	} else {
   370  		logger.V(6).Info("not enqueueing deleted claim", "claim", klog.KObj(claim))
   371  	}
   372  
   373  	// Also check whether this causes work for any of the currently
   374  	// known pods which use the ResourceClaim.
   375  	objs, err := ec.podIndexer.ByIndex(podResourceClaimIndex, fmt.Sprintf("%s/%s", claim.Namespace, claim.Name))
   376  	if err != nil {
   377  		logger.Error(err, "listing pods from cache")
   378  		return
   379  	}
   380  	if len(objs) == 0 {
   381  		logger.V(6).Info("claim got deleted while not needed by any pod, nothing to do", "claim", klog.KObj(claim))
   382  		return
   383  	}
   384  	for _, obj := range objs {
   385  		ec.enqueuePod(logger, obj, false)
   386  	}
   387  }
   388  
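        // Run starts the event broadcaster and the given number of worker
        // goroutines, then blocks until the context gets canceled.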
   389  func (ec *Controller) Run(ctx context.Context, workers int) {
   390  	defer runtime.HandleCrash()
   391  	defer ec.queue.ShutDown()
   392  
   393  	logger := klog.FromContext(ctx)
   394  	logger.Info("Starting resource claim controller")
   395  	defer logger.Info("Shutting down resource claim controller")
   396  
   397  	eventBroadcaster := record.NewBroadcaster()
   398  	eventBroadcaster.StartLogging(klog.Infof)
   399  	eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: ec.kubeClient.CoreV1().Events("")})
   400  	ec.recorder = eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "resource_claim"})
   401  	defer eventBroadcaster.Shutdown()
   402  
   403  	if !cache.WaitForNamedCacheSync("resource_claim", ctx.Done(), ec.podSynced, ec.claimsSynced) {
   404  		return
   405  	}
   406  
   407  	for i := 0; i < workers; i++ {
   408  		go wait.UntilWithContext(ctx, ec.runWorker, time.Second)
   409  	}
   410  
   411  	<-ctx.Done()
   412  }
   413  
   414  func (ec *Controller) runWorker(ctx context.Context) {
   415  	for ec.processNextWorkItem(ctx) {
   416  	}
   417  }
   418  
   419  func (ec *Controller) processNextWorkItem(ctx context.Context) bool {
   420  	key, shutdown := ec.queue.Get()
   421  	if shutdown {
   422  		return false
   423  	}
   424  	defer ec.queue.Done(key)
   425  
   426  	err := ec.syncHandler(ctx, key.(string))
   427  	if err == nil {
   428  		ec.queue.Forget(key)
   429  		return true
   430  	}
   431  
   432  	runtime.HandleError(fmt.Errorf("%v failed with: %v", key, err))
   433  	ec.queue.AddRateLimited(key)
   434  
   435  	return true
   436  }
   437  
   438  // syncHandler is invoked for each work item which might need to be processed.
   439  // If an error is returned from this function, the item will be requeued.
   440  func (ec *Controller) syncHandler(ctx context.Context, key string) error {
   441  	sep := strings.Index(key, ":")
   442  	if sep < 0 {
   443  		return fmt.Errorf("unexpected key: %s", key)
   444  	}
   445  	prefix, object := key[0:sep+1], key[sep+1:]
   446  	namespace, name, err := cache.SplitMetaNamespaceKey(object)
   447  	if err != nil {
   448  		return err
   449  	}
   450  
   451  	switch prefix {
   452  	case podKeyPrefix:
   453  		return ec.syncPod(ctx, namespace, name)
   454  	case claimKeyPrefix:
   455  		return ec.syncClaim(ctx, namespace, name)
   456  	default:
   457  		return fmt.Errorf("unexpected key prefix: %s", prefix)
   458  	}
   459  
   460  }
   461  
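        // syncPod creates ResourceClaims for the pod's claim templates, records the
        // generated names in the pod status and, once the pod is scheduled, ensures
        // that a PodSchedulingContext exists and reserves allocated claims for the pod.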
   462  func (ec *Controller) syncPod(ctx context.Context, namespace, name string) error {
   463  	logger := klog.LoggerWithValues(klog.FromContext(ctx), "pod", klog.KRef(namespace, name))
   464  	ctx = klog.NewContext(ctx, logger)
   465  	pod, err := ec.podLister.Pods(namespace).Get(name)
   466  	if err != nil {
   467  		if apierrors.IsNotFound(err) {
   468  			logger.V(5).Info("nothing to do for pod, it is gone")
   469  			return nil
   470  		}
   471  		return err
   472  	}
   473  
   474  	// Ignore pods which are already getting deleted.
   475  	if pod.DeletionTimestamp != nil {
   476  		logger.V(5).Info("nothing to do for pod, it is marked for deletion")
   477  		return nil
   478  	}
   479  
   480  	var newPodClaims map[string]string
   481  	for _, podClaim := range pod.Spec.ResourceClaims {
   482  		if err := ec.handleClaim(ctx, pod, podClaim, &newPodClaims); err != nil {
   483  			if ec.recorder != nil {
   484  				ec.recorder.Event(pod, v1.EventTypeWarning, "FailedResourceClaimCreation", fmt.Sprintf("PodResourceClaim %s: %v", podClaim.Name, err))
   485  			}
   486  			return fmt.Errorf("pod %s/%s, PodResourceClaim %s: %v", namespace, name, podClaim.Name, err)
   487  		}
   488  	}
   489  
   490  	if newPodClaims != nil {
   491  		// Patch the pod status with the new information about
   492  		// generated ResourceClaims.
   493  		statuses := make([]*corev1apply.PodResourceClaimStatusApplyConfiguration, 0, len(newPodClaims))
   494  		for podClaimName, resourceClaimName := range newPodClaims {
   495  			statuses = append(statuses, corev1apply.PodResourceClaimStatus().WithName(podClaimName).WithResourceClaimName(resourceClaimName))
   496  		}
   497  		podApply := corev1apply.Pod(name, namespace).WithStatus(corev1apply.PodStatus().WithResourceClaimStatuses(statuses...))
   498  		if _, err := ec.kubeClient.CoreV1().Pods(namespace).ApplyStatus(ctx, podApply, metav1.ApplyOptions{FieldManager: fieldManager, Force: true}); err != nil {
   499  			return fmt.Errorf("update pod %s/%s ResourceClaimStatuses: %v", namespace, name, err)
   500  		}
   501  	}
   502  
   503  	if pod.Spec.NodeName == "" {
   504  		// Scheduler will handle PodSchedulingContext and reservations.
   505  		logger.V(5).Info("nothing to do for pod, scheduler will deal with it")
   506  		return nil
   507  	}
   508  
   509  	for _, podClaim := range pod.Spec.ResourceClaims {
   510  		claimName, checkOwner, err := resourceclaim.Name(pod, &podClaim)
   511  		if err != nil {
   512  			return err
   513  		}
   514  		// If nil, then it has been determined that the claim is not needed
   515  		// and can be skipped.
   516  		if claimName == nil {
   517  			continue
   518  		}
   519  		claim, err := ec.claimLister.ResourceClaims(pod.Namespace).Get(*claimName)
   520  		if apierrors.IsNotFound(err) {
   521  			return nil
   522  		}
   523  		if err != nil {
   524  			return fmt.Errorf("retrieve claim: %v", err)
   525  		}
   526  		if checkOwner {
   527  			if err := resourceclaim.IsForPod(pod, claim); err != nil {
   528  				return err
   529  			}
   530  		}
   531  		if claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer &&
   532  			claim.Status.Allocation == nil {
   533  			logger.V(5).Info("create PodSchedulingContext because claim needs to be allocated", "resourceClaim", klog.KObj(claim))
   534  			return ec.ensurePodSchedulingContext(ctx, pod)
   535  		}
   536  		if claim.Status.Allocation != nil &&
   537  			!resourceclaim.IsReservedForPod(pod, claim) &&
   538  			resourceclaim.CanBeReserved(claim) {
   539  			logger.V(5).Info("reserve claim for pod", "resourceClaim", klog.KObj(claim))
   540  			if err := ec.reserveForPod(ctx, pod, claim); err != nil {
   541  				return err
   542  			}
   543  		}
   544  	}
   545  
   546  	return nil
   547  }
   548  
   549  // handleClaim is invoked for each resource claim of a pod.
   550  func (ec *Controller) handleClaim(ctx context.Context, pod *v1.Pod, podClaim v1.PodResourceClaim, newPodClaims *map[string]string) error {
   551  	logger := klog.LoggerWithValues(klog.FromContext(ctx), "podClaim", podClaim.Name)
   552  	ctx = klog.NewContext(ctx, logger)
   553  	logger.V(5).Info("checking", "podClaim", podClaim.Name)
   554  
   555  	// resourceclaim.Name checks for the situation that the client doesn't
   556  	// know some future addition to the API. Therefore it gets called here
   557  	// even if there is no template to work on, because if some new field
   558  	// gets added, the expectation might be that the controller does
   559  	// something for it.
   560  	claimName, mustCheckOwner, err := resourceclaim.Name(pod, &podClaim)
   561  	switch {
   562  	case errors.Is(err, resourceclaim.ErrClaimNotFound):
   563  		// Continue below.
   564  	case err != nil:
   565  		return fmt.Errorf("checking for claim before creating it: %v", err)
   566  	case claimName == nil:
   567  		// Nothing to do, no claim needed.
   568  		return nil
   569  	case *claimName != "":
   570  		claimName := *claimName
   571  		// The ResourceClaim should exist because it is recorded in the pod.status.resourceClaimStatuses,
   572  		// but perhaps it was deleted accidentally. In that case we re-create it.
   573  		claim, err := ec.claimLister.ResourceClaims(pod.Namespace).Get(claimName)
   574  		if err != nil && !apierrors.IsNotFound(err) {
   575  			return err
   576  		}
   577  		if claim != nil {
   578  			var err error
   579  			if mustCheckOwner {
   580  				err = resourceclaim.IsForPod(pod, claim)
   581  			}
   582  			if err == nil {
   583  				// Already created, nothing more to do.
   584  				logger.V(5).Info("claim already created", "podClaim", podClaim.Name, "resourceClaim", claimName)
   585  				return nil
   586  			}
   587  			logger.Error(err, "claim that was created for the pod is no longer owned by the pod, creating a new one", "podClaim", podClaim.Name, "resourceClaim", claimName)
   588  		}
   589  	}
   590  
   591  	templateName := podClaim.Source.ResourceClaimTemplateName
   592  	if templateName == nil {
   593  		// Nothing to do.
   594  		return nil
   595  	}
   596  
   597  	// Before we create a new ResourceClaim, check if there is an orphaned one.
   598  	// This covers the case that the controller has created it, but then fails
   599  	// before it can update the pod status.
   600  	claim, err := ec.findPodResourceClaim(pod, podClaim)
   601  	if err != nil {
   602  		return fmt.Errorf("finding ResourceClaim for claim %s in pod %s/%s failed: %v", podClaim.Name, pod.Namespace, pod.Name, err)
   603  	}
   604  
   605  	if claim == nil {
   606  		template, err := ec.templateLister.ResourceClaimTemplates(pod.Namespace).Get(*templateName)
   607  		if err != nil {
   608  			return fmt.Errorf("resource claim template %q: %v", *templateName, err)
   609  		}
   610  
   611  		// Create the ResourceClaim with pod as owner, with a generated name that uses
   612  		// <pod>-<claim name> as base.
   613  		isTrue := true
   614  		annotations := template.Spec.ObjectMeta.Annotations
   615  		if annotations == nil {
   616  			annotations = make(map[string]string)
   617  		}
   618  		annotations[podResourceClaimAnnotation] = podClaim.Name
   619  		generateName := pod.Name + "-" + podClaim.Name + "-"
   620  		maxBaseLen := 57 // Leave space for hyphen and 5 random characters in a name with 63 characters.
   621  		if len(generateName) > maxBaseLen {
   622  			// We could leave truncation to the apiserver, but as
   623  			// it removes at the end, we would lose everything
   624  			// from the pod claim name when the pod name is long.
   625  			// We can do better and truncate both strings,
   626  			// proportional to their length.
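        			// For example (hypothetical lengths), a 40-character pod name and
        			// a 30-character claim name yield a 72-character base; the pod part
        			// gets truncated to 40*57/72 = 31 characters and the claim part to
        			// 30*57/72 = 23, which keeps the base within maxBaseLen.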
   627  			generateName = pod.Name[0:len(pod.Name)*maxBaseLen/len(generateName)] +
   628  				"-" +
   629  				podClaim.Name[0:len(podClaim.Name)*maxBaseLen/len(generateName)]
   630  		}
   631  		claim = &resourcev1alpha2.ResourceClaim{
   632  			ObjectMeta: metav1.ObjectMeta{
   633  				GenerateName: generateName,
   634  				OwnerReferences: []metav1.OwnerReference{
   635  					{
   636  						APIVersion:         "v1",
   637  						Kind:               "Pod",
   638  						Name:               pod.Name,
   639  						UID:                pod.UID,
   640  						Controller:         &isTrue,
   641  						BlockOwnerDeletion: &isTrue,
   642  					},
   643  				},
   644  				Annotations: annotations,
   645  				Labels:      template.Spec.ObjectMeta.Labels,
   646  			},
   647  			Spec: template.Spec.Spec,
   648  		}
   649  		metrics.ResourceClaimCreateAttempts.Inc()
   650  		claimName := claim.Name
   651  		claim, err = ec.kubeClient.ResourceV1alpha2().ResourceClaims(pod.Namespace).Create(ctx, claim, metav1.CreateOptions{})
   652  		if err != nil {
   653  			metrics.ResourceClaimCreateFailures.Inc()
   654  			return fmt.Errorf("create ResourceClaim %s: %v", claimName, err)
   655  		}
   656  		ec.claimCache.Mutation(claim)
   657  	}
   658  
   659  	// Remember the new ResourceClaim for a batch PodStatus update in our caller.
   660  	if *newPodClaims == nil {
   661  		*newPodClaims = make(map[string]string)
   662  	}
   663  	(*newPodClaims)[podClaim.Name] = claim.Name
   664  
   665  	return nil
   666  }
   667  
   668  // findPodResourceClaim looks for an existing ResourceClaim with the right
   669  // annotation (ties it to the pod claim) and the right ownership (ties it to
   670  // the pod).
   671  func (ec *Controller) findPodResourceClaim(pod *v1.Pod, podClaim v1.PodResourceClaim) (*resourcev1alpha2.ResourceClaim, error) {
   672  	// Only claims owned by the pod will get returned here.
   673  	claims, err := ec.claimCache.ByIndex(claimPodOwnerIndex, string(pod.UID))
   674  	if err != nil {
   675  		return nil, err
   676  	}
   677  	deterministicName := pod.Name + "-" + podClaim.Name // Kubernetes <= 1.27 behavior.
   678  	for _, claimObj := range claims {
   679  		claim, ok := claimObj.(*resourcev1alpha2.ResourceClaim)
   680  		if !ok {
   681  			return nil, fmt.Errorf("unexpected object of type %T returned by claim cache", claimObj)
   682  		}
   683  		podClaimName, ok := claim.Annotations[podResourceClaimAnnotation]
   684  		if ok && podClaimName != podClaim.Name {
   685  			continue
   686  		}
   687  
   688  		// No annotation? It might be a ResourceClaim created for
   689  		// the pod with a previous Kubernetes release where the
   690  		// ResourceClaim name was deterministic, in which case
   691  		// we have to use it and update the new pod status
   692  		// field accordingly.
   693  		if !ok && claim.Name != deterministicName {
   694  			continue
   695  		}
   696  
   697  		// Pick the first one that matches. There shouldn't be more than one. If there is,
   698  		// then all others will be ignored until the pod gets deleted. Then they also get
   699  		// cleaned up.
   700  		return claim, nil
   701  	}
   702  	return nil, nil
   703  }
   704  
   705  func (ec *Controller) ensurePodSchedulingContext(ctx context.Context, pod *v1.Pod) error {
   706  	scheduling, err := ec.podSchedulingLister.PodSchedulingContexts(pod.Namespace).Get(pod.Name)
   707  	if err != nil && !apierrors.IsNotFound(err) {
   708  		return fmt.Errorf("retrieve PodSchedulingContext: %v", err)
   709  	}
   710  	if scheduling == nil {
   711  		scheduling = &resourcev1alpha2.PodSchedulingContext{
   712  			ObjectMeta: metav1.ObjectMeta{
   713  				Name:      pod.Name,
   714  				Namespace: pod.Namespace,
   715  				OwnerReferences: []metav1.OwnerReference{
   716  					{
   717  						APIVersion: "v1",
   718  						Kind:       "Pod",
   719  						Name:       pod.Name,
   720  						UID:        pod.UID,
   721  						Controller: pointer.Bool(true),
   722  					},
   723  				},
   724  			},
   725  			Spec: resourcev1alpha2.PodSchedulingContextSpec{
   726  				SelectedNode: pod.Spec.NodeName,
   727  				// There is no need for negotiation about
   728  				// potential and suitable nodes anymore, so
   729  				// PotentialNodes can be left empty.
   730  			},
   731  		}
   732  		if _, err := ec.kubeClient.ResourceV1alpha2().PodSchedulingContexts(pod.Namespace).Create(ctx, scheduling, metav1.CreateOptions{}); err != nil {
   733  			return fmt.Errorf("create PodSchedulingContext: %v", err)
   734  		}
   735  		return nil
   736  	}
   737  
   738  	if scheduling.Spec.SelectedNode != pod.Spec.NodeName {
   739  		scheduling := scheduling.DeepCopy()
   740  		scheduling.Spec.SelectedNode = pod.Spec.NodeName
   741  		if _, err := ec.kubeClient.ResourceV1alpha2().PodSchedulingContexts(pod.Namespace).Update(ctx, scheduling, metav1.UpdateOptions{}); err != nil {
   742  			return fmt.Errorf("update spec.selectedNode in PodSchedulingContext: %v", err)
   743  		}
   744  	}
   745  
   746  	return nil
   747  }
   748  
   749  func (ec *Controller) reserveForPod(ctx context.Context, pod *v1.Pod, claim *resourcev1alpha2.ResourceClaim) error {
   750  	claim = claim.DeepCopy()
   751  	claim.Status.ReservedFor = append(claim.Status.ReservedFor,
   752  		resourcev1alpha2.ResourceClaimConsumerReference{
   753  			Resource: "pods",
   754  			Name:     pod.Name,
   755  			UID:      pod.UID,
   756  		})
   757  	if _, err := ec.kubeClient.ResourceV1alpha2().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{}); err != nil {
   758  		return fmt.Errorf("reserve claim for pod: %v", err)
   759  	}
   760  	return nil
   761  }
   762  
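        // syncClaim removes stale pods from the claim's ReservedFor list, requests
        // deallocation once the last consumer of a claim with delayed allocation is
        // gone, and deletes generated claims whose owning pod will not run anymore.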
   763  func (ec *Controller) syncClaim(ctx context.Context, namespace, name string) error {
   764  	logger := klog.LoggerWithValues(klog.FromContext(ctx), "claim", klog.KRef(namespace, name))
   765  	ctx = klog.NewContext(ctx, logger)
   766  	claim, err := ec.claimLister.ResourceClaims(namespace).Get(name)
   767  	if err != nil {
   768  		if apierrors.IsNotFound(err) {
   769  			logger.V(5).Info("nothing to do for claim, it is gone")
   770  			return nil
   771  		}
   772  		return err
   773  	}
   774  
   775  	// Check if the ReservedFor entries are all still valid.
   776  	valid := make([]resourcev1alpha2.ResourceClaimConsumerReference, 0, len(claim.Status.ReservedFor))
   777  	for _, reservedFor := range claim.Status.ReservedFor {
   778  		if reservedFor.APIGroup == "" &&
   779  			reservedFor.Resource == "pods" {
   780  			// A pod falls into one of three categories:
   781  			// - we have it in our cache -> don't remove it until we are told that it got removed
   782  			// - we don't have it in our cache anymore, but we have seen it before -> it was deleted, remove it
   783  			// - not in our cache, not seen -> double-check with API server before removal
   784  
   785  			keepEntry := true
   786  
   787  			// Tracking deleted pods in the LRU cache is an
   788  			// optimization. Without this cache, the code would
   789  			// have to do the API call below for every deleted pod
   790  			// to ensure that the pod really doesn't exist. With
   791  			// the cache, most of the time the pod will be recorded
   792  			// as deleted and the API call can be avoided.
   793  			if ec.deletedObjects.Has(reservedFor.UID) {
   794  				// We know that the pod was deleted. This is
   795  				// easy to check and thus is done first.
   796  				keepEntry = false
   797  			} else {
   798  				pod, err := ec.podLister.Pods(claim.Namespace).Get(reservedFor.Name)
   799  				switch {
   800  				case err != nil && !apierrors.IsNotFound(err):
   801  					return err
   802  				case err != nil:
   803  					// We might not have it in our informer cache
   804  					// yet. Removing the pod while the scheduler is
   805  					// scheduling it would be bad. We have to be
   806  					// absolutely sure and thus have to check with
   807  					// the API server.
   808  					pod, err := ec.kubeClient.CoreV1().Pods(claim.Namespace).Get(ctx, reservedFor.Name, metav1.GetOptions{})
   809  					if err != nil && !apierrors.IsNotFound(err) {
   810  						return err
   811  					}
   812  					if pod == nil || pod.UID != reservedFor.UID {
   813  						logger.V(6).Info("remove reservation because pod is gone or got replaced", "pod", klog.KObj(pod), "claim", klog.KRef(namespace, name))
   814  						keepEntry = false
   815  					}
   816  				case pod.UID != reservedFor.UID:
   817  					logger.V(6).Info("remove reservation because pod got replaced with new instance", "pod", klog.KObj(pod), "claim", klog.KRef(namespace, name))
   818  					keepEntry = false
   819  				case isPodDone(pod):
   820  					logger.V(6).Info("remove reservation because pod will not run anymore", "pod", klog.KObj(pod), "claim", klog.KRef(namespace, name))
   821  					keepEntry = false
   822  				}
   823  			}
   824  
   825  			if keepEntry {
   826  				valid = append(valid, reservedFor)
   827  			}
   828  			continue
   829  		}
   830  
   831  		// TODO: support generic object lookup
   832  		return fmt.Errorf("unsupported ReservedFor entry: %v", reservedFor)
   833  	}
   834  
   835  	logger.V(5).Info("claim reserved for counts", "currentCount", len(claim.Status.ReservedFor), "claim", klog.KRef(namespace, name), "updatedCount", len(valid))
   836  	if len(valid) < len(claim.Status.ReservedFor) {
   837  		// TODO (#113700): patch
   838  		claim := claim.DeepCopy()
   839  		claim.Status.ReservedFor = valid
   840  
   841  		// When a ResourceClaim uses delayed allocation, then it makes sense to
   842  		// deallocate the claim as soon as the last consumer stops using
   843  		// it. This ensures that the claim can be allocated again as needed by
   844  		// some future consumer instead of trying to schedule that consumer
   845  		// onto the node that was chosen for the previous consumer. It also
   846  		// releases the underlying resources for use by other claims.
   847  		//
   848  		// This has to be triggered by the transition from "was being used" to
   849  		// "is not used anymore" because a DRA driver is not required to set
   850  		// `status.reservedFor` together with `status.allocation`, i.e. a claim
   851  		// that is "currently unused" should not get deallocated.
   852  		//
   853  		// This does not matter for claims that were created for a pod. For
   854  		// those, the resource claim controller will trigger deletion when the
   855  		// pod is done. However, it doesn't hurt to also trigger deallocation
   856  		// for such claims and not checking for them keeps this code simpler.
   857  		if len(valid) == 0 &&
   858  			claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer {
   859  			claim.Status.DeallocationRequested = true
   860  		}
   861  
   862  		_, err := ec.kubeClient.ResourceV1alpha2().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{})
   863  		if err != nil {
   864  			return err
   865  		}
   866  	}
   867  
   868  	if len(valid) == 0 {
   869  		// Claim is not reserved. If it was generated for a pod and
   870  		// that pod is not going to run, the claim can be
   871  		// deleted. Normally the garbage collector does that, but the
   872  		// pod itself might not get deleted for a while.
   873  		podName, podUID := owningPod(claim)
   874  		if podName != "" {
   875  			pod, err := ec.podLister.Pods(claim.Namespace).Get(podName)
   876  			switch {
   877  			case err == nil:
   878  				// Pod already replaced or not going to run?
   879  				if pod.UID != podUID || isPodDone(pod) {
   880  					// We are certain that the owning pod is not going to need
   881  					// the claim and therefore remove the claim.
   882  					logger.V(5).Info("deleting unused generated claim", "claim", klog.KObj(claim), "pod", klog.KObj(pod))
   883  					err := ec.kubeClient.ResourceV1alpha2().ResourceClaims(claim.Namespace).Delete(ctx, claim.Name, metav1.DeleteOptions{})
   884  					if err != nil {
   885  						return fmt.Errorf("delete claim: %v", err)
   886  					}
   887  				} else {
   888  					logger.V(6).Info("wrong pod content, not deleting claim", "claim", klog.KObj(claim), "podUID", podUID, "podContent", pod)
   889  				}
   890  			case apierrors.IsNotFound(err):
   891  				// We might not know the pod *yet*. Instead of doing an expensive API call,
   892  				// let the garbage collector handle the case that the pod is truly gone.
   893  				logger.V(5).Info("pod for claim not found", "claim", klog.KObj(claim), "pod", klog.KRef(claim.Namespace, podName))
   894  			default:
   895  				return fmt.Errorf("lookup pod: %v", err)
   896  			}
   897  		} else {
   898  			logger.V(5).Info("claim not generated for a pod", "claim", klog.KObj(claim))
   899  		}
   900  	}
   901  
   902  	return nil
   903  }
   904  
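        // owningPod returns the name and UID of the pod which is recorded as the
        // controlling owner of the claim, or empty values if there is no such owner.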
   905  func owningPod(claim *resourcev1alpha2.ResourceClaim) (string, types.UID) {
   906  	for _, owner := range claim.OwnerReferences {
   907  		if pointer.BoolDeref(owner.Controller, false) &&
   908  			owner.APIVersion == "v1" &&
   909  			owner.Kind == "Pod" {
   910  			return owner.Name, owner.UID
   911  		}
   912  	}
   913  	return "", ""
   914  }
   915  
   916  // podResourceClaimIndexFunc is an index function that returns ResourceClaim keys (=
   917  // namespace/name) for the ResourceClaims or ResourceClaimTemplates referenced by a given pod.
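        // For example, a pod in namespace "default" whose status records the generated
        // claim name "my-pod-my-claim-abc12" (hypothetical names) is indexed under the
        // key "default/my-pod-my-claim-abc12".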
   918  func podResourceClaimIndexFunc(obj interface{}) ([]string, error) {
   919  	pod, ok := obj.(*v1.Pod)
   920  	if !ok {
   921  		return []string{}, nil
   922  	}
   923  	keys := []string{}
   924  	for _, podClaim := range pod.Spec.ResourceClaims {
   925  		claimName, _, err := resourceclaim.Name(pod, &podClaim)
   926  		if err != nil || claimName == nil {
   927  			// Index functions are not supposed to fail; the caller will panic.
   928  			// For both error reasons (claim not created yet, unknown API)
   929  			// we simply don't index.
   930  			continue
   931  		}
   932  		keys = append(keys, fmt.Sprintf("%s/%s", pod.Namespace, *claimName))
   933  	}
   934  	return keys, nil
   935  }
   936  
   937  // isPodDone returns true if it is certain that none of the containers are running and never will run.
   938  func isPodDone(pod *v1.Pod) bool {
   939  	return podutil.IsPodPhaseTerminal(pod.Status.Phase) ||
   940  		// Deleted and not scheduled:
   941  		pod.DeletionTimestamp != nil && pod.Spec.NodeName == ""
   942  }
   943  
   944  // claimPodOwnerIndexFunc is an index function that returns the pod UIDs of
   945  // all pods which own the resource claim. Should only be one, though.
   946  func claimPodOwnerIndexFunc(obj interface{}) ([]string, error) {
   947  	claim, ok := obj.(*resourcev1alpha2.ResourceClaim)
   948  	if !ok {
   949  		return nil, nil
   950  	}
   951  	var keys []string
   952  	for _, owner := range claim.OwnerReferences {
   953  		if owner.Controller != nil &&
   954  			*owner.Controller &&
   955  			owner.APIVersion == "v1" &&
   956  			owner.Kind == "Pod" {
   957  			keys = append(keys, string(owner.UID))
   958  		}
   959  	}
   960  	return keys, nil
   961  }