k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/controller/resourceclaim/controller.go

     1  /*
     2  Copyright 2020 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package resourceclaim
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	"fmt"
    23  	"slices"
    24  	"strings"
    25  	"time"
    26  
    27  	v1 "k8s.io/api/core/v1"
    28  	resourcev1alpha2 "k8s.io/api/resource/v1alpha2"
    29  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    30  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    31  	"k8s.io/apimachinery/pkg/types"
    32  	"k8s.io/apimachinery/pkg/util/runtime"
    33  	"k8s.io/apimachinery/pkg/util/wait"
    34  	corev1apply "k8s.io/client-go/applyconfigurations/core/v1"
    35  	v1informers "k8s.io/client-go/informers/core/v1"
    36  	resourcev1alpha2informers "k8s.io/client-go/informers/resource/v1alpha2"
    37  	clientset "k8s.io/client-go/kubernetes"
    38  	"k8s.io/client-go/kubernetes/scheme"
    39  	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
    40  	v1listers "k8s.io/client-go/listers/core/v1"
    41  	resourcev1alpha2listers "k8s.io/client-go/listers/resource/v1alpha2"
    42  	"k8s.io/client-go/tools/cache"
    43  	"k8s.io/client-go/tools/record"
    44  	"k8s.io/client-go/util/workqueue"
    45  	"k8s.io/dynamic-resource-allocation/resourceclaim"
    46  	"k8s.io/klog/v2"
    47  	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
    48  	"k8s.io/kubernetes/pkg/controller/resourceclaim/metrics"
    49  	"k8s.io/utils/pointer"
    50  )
    51  
    52  const (
    53  	// podResourceClaimIndex is the lookup name for the index function which indexes by pod ResourceClaim templates.
    54  	podResourceClaimIndex = "pod-resource-claim-index"
    55  
    56  	// podResourceClaimAnnotation is the special annotation that generated
    57  	// ResourceClaims get. Its value is the pod.spec.resourceClaims[].name
    58  	// for which it was generated. This is used only inside the controller
    59  	// and not documented as part of the Kubernetes API.
    60  	podResourceClaimAnnotation = "resource.kubernetes.io/pod-claim-name"
    61  
    62  	// claimPodOwnerIndex is used to find ResourceClaims which have
     63  	// a specific pod as owner. Values for this index are pod UIDs.
    64  	claimPodOwnerIndex = "claim-pod-owner-index"
    65  
    66  	// Field manager used to update the pod status.
    67  	fieldManager = "ResourceClaimController"
    68  
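         	// maxUIDCacheEntries limits the size of the LRU cache which tracks
         	// the UIDs of deleted pods (see deletedObjects below).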
    69  	maxUIDCacheEntries = 500
    70  )
    71  
    72  // Controller creates ResourceClaims for ResourceClaimTemplates in a pod spec.
    73  type Controller struct {
    74  	// kubeClient is the kube API client used to communicate with the API
    75  	// server.
    76  	kubeClient clientset.Interface
    77  
    78  	// claimLister is the shared ResourceClaim lister used to fetch and store ResourceClaim
    79  	// objects from the API server. It is shared with other controllers and
    80  	// therefore the ResourceClaim objects in its store should be treated as immutable.
    81  	claimLister  resourcev1alpha2listers.ResourceClaimLister
    82  	claimsSynced cache.InformerSynced
    83  	claimCache   cache.MutationCache
    84  
    85  	// podLister is the shared Pod lister used to fetch Pod
    86  	// objects from the API server. It is shared with other controllers and
    87  	// therefore the Pod objects in its store should be treated as immutable.
    88  	podLister v1listers.PodLister
    89  	podSynced cache.InformerSynced
    90  
     91  	// podSchedulingLister is the shared PodSchedulingContext lister used to
    92  	// fetch scheduling objects from the API server. It is shared with other
    93  	// controllers and therefore the objects in its store should be treated
    94  	// as immutable.
    95  	podSchedulingLister resourcev1alpha2listers.PodSchedulingContextLister
    96  	podSchedulingSynced cache.InformerSynced
    97  
    98  	// templateLister is the shared ResourceClaimTemplate lister used to
    99  	// fetch template objects from the API server. It is shared with other
   100  	// controllers and therefore the objects in its store should be treated
   101  	// as immutable.
   102  	templateLister  resourcev1alpha2listers.ResourceClaimTemplateLister
   103  	templatesSynced cache.InformerSynced
   104  
    105  	// podIndexer has the common PodResourceClaim indexer installed to
    106  	// limit iteration over pods to those of interest.
   107  	podIndexer cache.Indexer
   108  
   109  	// recorder is used to record events in the API server
   110  	recorder record.EventRecorder
   111  
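         	// queue contains the work items, each a prefixed namespace/name key
         	// for either a Pod or a ResourceClaim.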
   112  	queue workqueue.TypedRateLimitingInterface[string]
   113  
    114  	// The deletedObjects cache keeps track of Pods which are known to have
    115  	// existed and to have been removed. For those we can be sure
    116  	// that a ReservedFor entry needs to be removed.
   117  	deletedObjects *uidCache
   118  }
   119  
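         // Work queue keys are prefixed so that workers can tell ResourceClaims
         // and Pods apart, for example "claim:default/my-claim" or "pod:default/my-pod".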
   120  const (
   121  	claimKeyPrefix = "claim:"
   122  	podKeyPrefix   = "pod:"
   123  )
   124  
   125  // NewController creates a ResourceClaim controller.
   126  func NewController(
   127  	logger klog.Logger,
   128  	kubeClient clientset.Interface,
   129  	podInformer v1informers.PodInformer,
   130  	podSchedulingInformer resourcev1alpha2informers.PodSchedulingContextInformer,
   131  	claimInformer resourcev1alpha2informers.ResourceClaimInformer,
   132  	templateInformer resourcev1alpha2informers.ResourceClaimTemplateInformer) (*Controller, error) {
   133  
   134  	ec := &Controller{
   135  		kubeClient:          kubeClient,
   136  		podLister:           podInformer.Lister(),
   137  		podIndexer:          podInformer.Informer().GetIndexer(),
   138  		podSynced:           podInformer.Informer().HasSynced,
   139  		podSchedulingLister: podSchedulingInformer.Lister(),
   140  		podSchedulingSynced: podSchedulingInformer.Informer().HasSynced,
   141  		claimLister:         claimInformer.Lister(),
   142  		claimsSynced:        claimInformer.Informer().HasSynced,
   143  		templateLister:      templateInformer.Lister(),
   144  		templatesSynced:     templateInformer.Informer().HasSynced,
   145  		queue: workqueue.NewTypedRateLimitingQueueWithConfig(
   146  			workqueue.DefaultTypedControllerRateLimiter[string](),
   147  			workqueue.TypedRateLimitingQueueConfig[string]{Name: "resource_claim"},
   148  		),
   149  		deletedObjects: newUIDCache(maxUIDCacheEntries),
   150  	}
   151  
   152  	metrics.RegisterMetrics()
   153  
   154  	if _, err := podInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   155  		AddFunc: func(obj interface{}) {
   156  			ec.enqueuePod(logger, obj, false)
   157  		},
   158  		UpdateFunc: func(old, updated interface{}) {
   159  			ec.enqueuePod(logger, updated, false)
   160  		},
   161  		DeleteFunc: func(obj interface{}) {
   162  			ec.enqueuePod(logger, obj, true)
   163  		},
   164  	}); err != nil {
   165  		return nil, err
   166  	}
   167  	if _, err := claimInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
   168  		AddFunc: func(obj interface{}) {
   169  			logger.V(6).Info("new claim", "claimDump", obj)
   170  			ec.enqueueResourceClaim(logger, obj, false)
   171  		},
   172  		UpdateFunc: func(old, updated interface{}) {
   173  			logger.V(6).Info("updated claim", "claimDump", updated)
   174  			ec.enqueueResourceClaim(logger, updated, false)
   175  		},
   176  		DeleteFunc: func(obj interface{}) {
   177  			logger.V(6).Info("deleted claim", "claimDump", obj)
   178  			ec.enqueueResourceClaim(logger, obj, true)
   179  		},
   180  	}); err != nil {
   181  		return nil, err
   182  	}
   183  	if err := ec.podIndexer.AddIndexers(cache.Indexers{podResourceClaimIndex: podResourceClaimIndexFunc}); err != nil {
   184  		return nil, fmt.Errorf("could not initialize ResourceClaim controller: %w", err)
   185  	}
   186  
    187  	// The mutation cache acts as an additional layer on top of the informer
    188  	// cache: after a create made by the controller, it returns that
    189  	// object until the informer catches up. That is necessary
    190  	// when a ResourceClaim got created, updating the pod status fails,
    191  	// and then a retry occurs before the informer cache is updated.
    192  	// In that scenario, the controller would create another claim
    193  	// instead of continuing with the existing one.
   194  	claimInformerCache := claimInformer.Informer().GetIndexer()
   195  	if err := claimInformerCache.AddIndexers(cache.Indexers{claimPodOwnerIndex: claimPodOwnerIndexFunc}); err != nil {
   196  		return nil, fmt.Errorf("could not initialize ResourceClaim controller: %w", err)
   197  	}
   198  	ec.claimCache = cache.NewIntegerResourceVersionMutationCache(claimInformerCache, claimInformerCache,
   199  		// Very long time to live, unlikely to be needed because
   200  		// the informer cache should get updated soon.
   201  		time.Hour,
   202  		// Allow storing objects not in the underlying cache - that's the point...
   203  		// It's safe because in case of a race (claim is in mutation cache, claim
   204  		// gets deleted, controller updates status based on mutation cache) the
   205  		// "bad" pod status will get detected and fixed when the informer catches up.
   206  		true,
   207  	)
   208  
   209  	return ec, nil
   210  }
   211  
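         // enqueuePod decides whether an added, updated or deleted pod requires
         // further work and, if so, queues the pod itself and/or the claims whose
         // reservations may have to be released.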
   212  func (ec *Controller) enqueuePod(logger klog.Logger, obj interface{}, deleted bool) {
   213  	if d, ok := obj.(cache.DeletedFinalStateUnknown); ok {
   214  		obj = d.Obj
   215  	}
   216  	pod, ok := obj.(*v1.Pod)
   217  	if !ok {
   218  		// Not a pod?!
   219  		logger.Error(nil, "enqueuePod called for unexpected object", "type", fmt.Sprintf("%T", obj))
   220  		return
   221  	}
   222  
   223  	if len(pod.Spec.ResourceClaims) == 0 {
   224  		// Nothing to do for it at all.
   225  		return
   226  	}
   227  
   228  	if deleted {
   229  		logger.V(6).Info("pod got deleted", "pod", klog.KObj(pod))
   230  		ec.deletedObjects.Add(pod.UID)
   231  	}
   232  
   233  	logger.V(6).Info("pod with resource claims changed", "pod", klog.KObj(pod), "deleted", deleted)
   234  
   235  	// Release reservations of a deleted or completed pod?
   236  	if needsClaims, reason := podNeedsClaims(pod, deleted); !needsClaims {
   237  		for _, podClaim := range pod.Spec.ResourceClaims {
   238  			claimName, _, err := resourceclaim.Name(pod, &podClaim)
   239  			switch {
   240  			case err != nil:
   241  				// Either the claim was not created (nothing to do here) or
    242  				// the API changed. The latter will also get reported elsewhere,
   243  				// so here it's just a debug message.
   244  				logger.V(6).Info("Nothing to do for claim during pod change", "err", err, "reason", reason)
   245  			case claimName != nil:
   246  				key := claimKeyPrefix + pod.Namespace + "/" + *claimName
   247  				logger.V(6).Info("Process claim", "pod", klog.KObj(pod), "key", key, "reason", reason)
   248  				ec.queue.Add(key)
   249  			default:
   250  				// Nothing to do, claim wasn't generated.
   251  				logger.V(6).Info("Nothing to do for skipped claim during pod change", "reason", reason)
   252  			}
   253  		}
   254  	}
   255  
   256  	needsWork, reason := ec.podNeedsWork(pod)
   257  	if needsWork {
   258  		logger.V(6).Info("enqueing pod", "pod", klog.KObj(pod), "reason", reason)
   259  		ec.queue.Add(podKeyPrefix + pod.Namespace + "/" + pod.Name)
   260  		return
   261  	}
   262  	logger.V(6).Info("not enqueing pod", "pod", klog.KObj(pod), "reason", reason)
   263  }
   264  
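         // podNeedsClaims checks whether the pod might still run and therefore
         // still needs its ResourceClaims. It returns the result together with a
         // human-readable reason.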
   265  func podNeedsClaims(pod *v1.Pod, deleted bool) (bool, string) {
   266  	if deleted {
   267  		return false, "pod got removed"
   268  	}
   269  	if podutil.IsPodTerminal(pod) {
   270  		return false, "pod has terminated"
   271  	}
   272  	if pod.DeletionTimestamp != nil && pod.Spec.NodeName == "" {
   273  		return false, "pod got deleted before scheduling"
   274  	}
   275  	// Still needs claims.
   276  	return true, "pod might run"
   277  }
   278  
   279  // podNeedsWork checks whether a new or modified pod needs to be processed
   280  // further by a worker. It returns a boolean with the result and an explanation
   281  // for it.
   282  func (ec *Controller) podNeedsWork(pod *v1.Pod) (bool, string) {
   283  	if pod.DeletionTimestamp != nil {
   284  		// Nothing else to do for the pod.
   285  		return false, "pod is deleted"
   286  	}
   287  
   288  	for _, podClaim := range pod.Spec.ResourceClaims {
   289  		claimName, checkOwner, err := resourceclaim.Name(pod, &podClaim)
   290  		if err != nil {
   291  			return true, err.Error()
   292  		}
   293  		// If the claimName is nil, then it has been determined before
   294  		// that the claim is not needed.
   295  		if claimName == nil {
   296  			return false, "claim is not needed"
   297  		}
   298  		claim, err := ec.claimLister.ResourceClaims(pod.Namespace).Get(*claimName)
   299  		if apierrors.IsNotFound(err) {
   300  			if podClaim.Source.ResourceClaimTemplateName != nil {
   301  				return true, "must create ResourceClaim from template"
   302  			}
   303  			// User needs to create claim.
   304  			return false, "claim is missing and must be created by user"
   305  		}
   306  		if err != nil {
   307  			// Shouldn't happen.
   308  			return true, fmt.Sprintf("internal error while checking for claim: %v", err)
   309  		}
   310  
   311  		if checkOwner &&
   312  			resourceclaim.IsForPod(pod, claim) != nil {
   313  			// Cannot proceed with the pod unless that other claim gets deleted.
   314  			return false, "conflicting claim needs to be removed by user"
   315  		}
   316  
   317  		// This check skips over the reasons below that only apply
   318  		// when a pod has been scheduled already. We need to keep checking
   319  		// for more claims that might need to be created.
   320  		if pod.Spec.NodeName == "" {
   321  			continue
   322  		}
   323  
   324  		// Create PodSchedulingContext if the pod got scheduled without triggering
   325  		// delayed allocation.
   326  		//
   327  		// These can happen when:
   328  		// - a user created a pod with spec.nodeName set, perhaps for testing
   329  		// - some scheduler was used which is unaware of DRA
   330  		// - DRA was not enabled in kube-scheduler (version skew, configuration)
   331  		if claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer &&
   332  			claim.Status.Allocation == nil {
   333  			scheduling, err := ec.podSchedulingLister.PodSchedulingContexts(pod.Namespace).Get(pod.Name)
   334  			if apierrors.IsNotFound(err) {
   335  				return true, "need to create PodSchedulingContext for scheduled pod"
   336  			}
   337  			if err != nil {
   338  				// Shouldn't happen.
   339  				return true, fmt.Sprintf("internal error while checking for PodSchedulingContext: %v", err)
   340  			}
   341  			if scheduling.Spec.SelectedNode != pod.Spec.NodeName {
   342  				// Need to update PodSchedulingContext.
   343  				return true, "need to updated PodSchedulingContext for scheduled pod"
   344  			}
   345  		}
   346  		if claim.Status.Allocation != nil &&
   347  			!resourceclaim.IsReservedForPod(pod, claim) &&
   348  			resourceclaim.CanBeReserved(claim) {
   349  			// Need to reserve it.
   350  			return true, "need to reserve claim for pod"
   351  		}
   352  	}
   353  
   354  	return false, "nothing to do"
   355  }
   356  
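         // enqueueResourceClaim queues a new or updated claim for syncing and
         // re-evaluates all known pods which reference it. Deleted claims are not
         // queued themselves, but the referencing pods still get checked.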
   357  func (ec *Controller) enqueueResourceClaim(logger klog.Logger, obj interface{}, deleted bool) {
   358  	if d, ok := obj.(cache.DeletedFinalStateUnknown); ok {
   359  		obj = d.Obj
   360  	}
   361  	claim, ok := obj.(*resourcev1alpha2.ResourceClaim)
   362  	if !ok {
   363  		return
   364  	}
   365  
   366  	if !deleted {
    367  		// When starting up, we have to check all claims to find those with
    368  		// stale pods in ReservedFor. During a claim update, a pod might get
    369  		// added to ReservedFor even though it no longer exists.
   370  		key := claimKeyPrefix + claim.Namespace + "/" + claim.Name
   371  		logger.V(6).Info("enqueing new or updated claim", "claim", klog.KObj(claim), "key", key)
   372  		ec.queue.Add(key)
   373  	} else {
   374  		logger.V(6).Info("not enqueing deleted claim", "claim", klog.KObj(claim))
   375  	}
   376  
   377  	// Also check whether this causes work for any of the currently
   378  	// known pods which use the ResourceClaim.
   379  	objs, err := ec.podIndexer.ByIndex(podResourceClaimIndex, fmt.Sprintf("%s/%s", claim.Namespace, claim.Name))
   380  	if err != nil {
   381  		logger.Error(err, "listing pods from cache")
   382  		return
   383  	}
   384  	if len(objs) == 0 {
   385  		logger.V(6).Info("claim got deleted while not needed by any pod, nothing to do", "claim", klog.KObj(claim))
   386  		return
   387  	}
   388  	for _, obj := range objs {
   389  		ec.enqueuePod(logger, obj, false)
   390  	}
   391  }
   392  
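         // Run starts the event broadcaster, waits for the informer caches to
         // sync and then runs the given number of workers until the context gets
         // canceled.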
   393  func (ec *Controller) Run(ctx context.Context, workers int) {
   394  	defer runtime.HandleCrash()
   395  	defer ec.queue.ShutDown()
   396  
   397  	logger := klog.FromContext(ctx)
   398  	logger.Info("Starting resource claim controller")
   399  	defer logger.Info("Shutting down resource claim controller")
   400  
   401  	eventBroadcaster := record.NewBroadcaster(record.WithContext(ctx))
   402  	eventBroadcaster.StartLogging(klog.Infof)
   403  	eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: ec.kubeClient.CoreV1().Events("")})
   404  	ec.recorder = eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "resource_claim"})
   405  	defer eventBroadcaster.Shutdown()
   406  
   407  	if !cache.WaitForNamedCacheSync("resource_claim", ctx.Done(), ec.podSynced, ec.podSchedulingSynced, ec.claimsSynced, ec.templatesSynced) {
   408  		return
   409  	}
   410  
   411  	for i := 0; i < workers; i++ {
   412  		go wait.UntilWithContext(ctx, ec.runWorker, time.Second)
   413  	}
   414  
   415  	<-ctx.Done()
   416  }
   417  
   418  func (ec *Controller) runWorker(ctx context.Context) {
   419  	for ec.processNextWorkItem(ctx) {
   420  	}
   421  }
   422  
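         // processNextWorkItem takes one key from the queue and hands it to
         // syncHandler. Failed items get requeued with rate limiting. It returns
         // false only once the queue has been shut down.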
   423  func (ec *Controller) processNextWorkItem(ctx context.Context) bool {
   424  	key, shutdown := ec.queue.Get()
   425  	if shutdown {
   426  		return false
   427  	}
   428  	defer ec.queue.Done(key)
   429  
   430  	err := ec.syncHandler(ctx, key)
   431  	if err == nil {
   432  		ec.queue.Forget(key)
   433  		return true
   434  	}
   435  
   436  	runtime.HandleError(fmt.Errorf("%v failed with: %v", key, err))
   437  	ec.queue.AddRateLimited(key)
   438  
   439  	return true
   440  }
   441  
   442  // syncHandler is invoked for each work item which might need to be processed.
   443  // If an error is returned from this function, the item will be requeued.
   444  func (ec *Controller) syncHandler(ctx context.Context, key string) error {
   445  	sep := strings.Index(key, ":")
   446  	if sep < 0 {
   447  		return fmt.Errorf("unexpected key: %s", key)
   448  	}
   449  	prefix, object := key[0:sep+1], key[sep+1:]
   450  	namespace, name, err := cache.SplitMetaNamespaceKey(object)
   451  	if err != nil {
   452  		return err
   453  	}
   454  
   455  	switch prefix {
   456  	case podKeyPrefix:
   457  		return ec.syncPod(ctx, namespace, name)
   458  	case claimKeyPrefix:
   459  		return ec.syncClaim(ctx, namespace, name)
   460  	default:
   461  		return fmt.Errorf("unexpected key prefix: %s", prefix)
   462  	}
   463  
   464  }
   465  
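         // syncPod creates missing ResourceClaims from templates, records the
         // generated names in the pod status and, for pods which are already
         // scheduled, ensures that claims get allocated and reserved.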
   466  func (ec *Controller) syncPod(ctx context.Context, namespace, name string) error {
   467  	logger := klog.LoggerWithValues(klog.FromContext(ctx), "pod", klog.KRef(namespace, name))
   468  	ctx = klog.NewContext(ctx, logger)
   469  	pod, err := ec.podLister.Pods(namespace).Get(name)
   470  	if err != nil {
   471  		if apierrors.IsNotFound(err) {
   472  			logger.V(5).Info("nothing to do for pod, it is gone")
   473  			return nil
   474  		}
   475  		return err
   476  	}
   477  
   478  	// Ignore pods which are already getting deleted.
   479  	if pod.DeletionTimestamp != nil {
   480  		logger.V(5).Info("nothing to do for pod, it is marked for deletion")
   481  		return nil
   482  	}
   483  
   484  	var newPodClaims map[string]string
   485  	for _, podClaim := range pod.Spec.ResourceClaims {
   486  		if err := ec.handleClaim(ctx, pod, podClaim, &newPodClaims); err != nil {
   487  			if ec.recorder != nil {
   488  				ec.recorder.Event(pod, v1.EventTypeWarning, "FailedResourceClaimCreation", fmt.Sprintf("PodResourceClaim %s: %v", podClaim.Name, err))
   489  			}
   490  			return fmt.Errorf("pod %s/%s, PodResourceClaim %s: %v", namespace, name, podClaim.Name, err)
   491  		}
   492  	}
   493  
   494  	if newPodClaims != nil {
   495  		// Patch the pod status with the new information about
   496  		// generated ResourceClaims.
   497  		statuses := make([]*corev1apply.PodResourceClaimStatusApplyConfiguration, 0, len(newPodClaims))
   498  		for podClaimName, resourceClaimName := range newPodClaims {
   499  			statuses = append(statuses, corev1apply.PodResourceClaimStatus().WithName(podClaimName).WithResourceClaimName(resourceClaimName))
   500  		}
   501  		podApply := corev1apply.Pod(name, namespace).WithStatus(corev1apply.PodStatus().WithResourceClaimStatuses(statuses...))
   502  		if _, err := ec.kubeClient.CoreV1().Pods(namespace).ApplyStatus(ctx, podApply, metav1.ApplyOptions{FieldManager: fieldManager, Force: true}); err != nil {
   503  			return fmt.Errorf("update pod %s/%s ResourceClaimStatuses: %v", namespace, name, err)
   504  		}
   505  	}
   506  
   507  	if pod.Spec.NodeName == "" {
   508  		// Scheduler will handle PodSchedulingContext and reservations.
   509  		logger.V(5).Info("nothing to do for pod, scheduler will deal with it")
   510  		return nil
   511  	}
   512  
   513  	for _, podClaim := range pod.Spec.ResourceClaims {
   514  		claimName, checkOwner, err := resourceclaim.Name(pod, &podClaim)
   515  		if err != nil {
   516  			return err
   517  		}
   518  		// If nil, then it has been determined that the claim is not needed
   519  		// and can be skipped.
   520  		if claimName == nil {
   521  			continue
   522  		}
   523  		claim, err := ec.claimLister.ResourceClaims(pod.Namespace).Get(*claimName)
   524  		if apierrors.IsNotFound(err) {
   525  			return nil
   526  		}
   527  		if err != nil {
   528  			return fmt.Errorf("retrieve claim: %v", err)
   529  		}
   530  		if checkOwner {
   531  			if err := resourceclaim.IsForPod(pod, claim); err != nil {
   532  				return err
   533  			}
   534  		}
   535  		if claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer &&
   536  			claim.Status.Allocation == nil {
   537  			logger.V(5).Info("create PodSchedulingContext because claim needs to be allocated", "resourceClaim", klog.KObj(claim))
   538  			return ec.ensurePodSchedulingContext(ctx, pod)
   539  		}
   540  		if claim.Status.Allocation != nil &&
   541  			!resourceclaim.IsReservedForPod(pod, claim) &&
   542  			resourceclaim.CanBeReserved(claim) {
   543  			logger.V(5).Info("reserve claim for pod", "resourceClaim", klog.KObj(claim))
   544  			if err := ec.reserveForPod(ctx, pod, claim); err != nil {
   545  				return err
   546  			}
   547  		}
   548  	}
   549  
   550  	return nil
   551  }
   552  
    553  // handleClaim is invoked for each resource claim of a pod.
   554  func (ec *Controller) handleClaim(ctx context.Context, pod *v1.Pod, podClaim v1.PodResourceClaim, newPodClaims *map[string]string) error {
   555  	logger := klog.LoggerWithValues(klog.FromContext(ctx), "podClaim", podClaim.Name)
   556  	ctx = klog.NewContext(ctx, logger)
   557  	logger.V(5).Info("checking", "podClaim", podClaim.Name)
   558  
   559  	// resourceclaim.Name checks for the situation that the client doesn't
   560  	// know some future addition to the API. Therefore it gets called here
   561  	// even if there is no template to work on, because if some new field
   562  	// gets added, the expectation might be that the controller does
   563  	// something for it.
   564  	claimName, mustCheckOwner, err := resourceclaim.Name(pod, &podClaim)
   565  	switch {
   566  	case errors.Is(err, resourceclaim.ErrClaimNotFound):
   567  		// Continue below.
   568  	case err != nil:
   569  		return fmt.Errorf("checking for claim before creating it: %v", err)
   570  	case claimName == nil:
   571  		// Nothing to do, no claim needed.
   572  		return nil
   573  	case *claimName != "":
   574  		claimName := *claimName
   575  		// The ResourceClaim should exist because it is recorded in the pod.status.resourceClaimStatuses,
   576  		// but perhaps it was deleted accidentally. In that case we re-create it.
   577  		claim, err := ec.claimLister.ResourceClaims(pod.Namespace).Get(claimName)
   578  		if err != nil && !apierrors.IsNotFound(err) {
   579  			return err
   580  		}
   581  		if claim != nil {
   582  			var err error
   583  			if mustCheckOwner {
   584  				err = resourceclaim.IsForPod(pod, claim)
   585  			}
   586  			if err == nil {
   587  				// Already created, nothing more to do.
   588  				logger.V(5).Info("claim already created", "podClaim", podClaim.Name, "resourceClaim", claimName)
   589  				return nil
   590  			}
   591  			logger.Error(err, "claim that was created for the pod is no longer owned by the pod, creating a new one", "podClaim", podClaim.Name, "resourceClaim", claimName)
   592  		}
   593  	}
   594  
   595  	templateName := podClaim.Source.ResourceClaimTemplateName
   596  	if templateName == nil {
   597  		// Nothing to do.
   598  		return nil
   599  	}
   600  
   601  	// Before we create a new ResourceClaim, check if there is an orphaned one.
   602  	// This covers the case that the controller has created it, but then fails
   603  	// before it can update the pod status.
   604  	claim, err := ec.findPodResourceClaim(pod, podClaim)
   605  	if err != nil {
   606  		return fmt.Errorf("finding ResourceClaim for claim %s in pod %s/%s failed: %v", podClaim.Name, pod.Namespace, pod.Name, err)
   607  	}
   608  
   609  	if claim == nil {
   610  		template, err := ec.templateLister.ResourceClaimTemplates(pod.Namespace).Get(*templateName)
   611  		if err != nil {
   612  			return fmt.Errorf("resource claim template %q: %v", *templateName, err)
   613  		}
   614  
   615  		// Create the ResourceClaim with pod as owner, with a generated name that uses
   616  		// <pod>-<claim name> as base.
   617  		isTrue := true
   618  		annotations := template.Spec.ObjectMeta.Annotations
   619  		if annotations == nil {
   620  			annotations = make(map[string]string)
   621  		}
   622  		annotations[podResourceClaimAnnotation] = podClaim.Name
   623  		generateName := pod.Name + "-" + podClaim.Name + "-"
   624  		maxBaseLen := 57 // Leave space for hyphen and 5 random characters in a name with 63 characters.
   625  		if len(generateName) > maxBaseLen {
   626  			// We could leave truncation to the apiserver, but as
    627  			// it removes at the end, we would lose everything
   628  			// from the pod claim name when the pod name is long.
   629  			// We can do better and truncate both strings,
   630  			// proportional to their length.
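         			// For example, with a 40-character pod name and a 30-character
         			// claim name (len(generateName) == 72), the slices keep
         			// 40*57/72 == 31 and 30*57/72 == 23 characters, which yields
         			// a 55-character base.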
   631  			generateName = pod.Name[0:len(pod.Name)*maxBaseLen/len(generateName)] +
   632  				"-" +
   633  				podClaim.Name[0:len(podClaim.Name)*maxBaseLen/len(generateName)]
   634  		}
   635  		claim = &resourcev1alpha2.ResourceClaim{
   636  			ObjectMeta: metav1.ObjectMeta{
   637  				GenerateName: generateName,
   638  				OwnerReferences: []metav1.OwnerReference{
   639  					{
   640  						APIVersion:         "v1",
   641  						Kind:               "Pod",
   642  						Name:               pod.Name,
   643  						UID:                pod.UID,
   644  						Controller:         &isTrue,
   645  						BlockOwnerDeletion: &isTrue,
   646  					},
   647  				},
   648  				Annotations: annotations,
   649  				Labels:      template.Spec.ObjectMeta.Labels,
   650  			},
   651  			Spec: template.Spec.Spec,
   652  		}
   653  		metrics.ResourceClaimCreateAttempts.Inc()
   654  		claimName := claim.Name
   655  		claim, err = ec.kubeClient.ResourceV1alpha2().ResourceClaims(pod.Namespace).Create(ctx, claim, metav1.CreateOptions{})
   656  		if err != nil {
   657  			metrics.ResourceClaimCreateFailures.Inc()
   658  			return fmt.Errorf("create ResourceClaim %s: %v", claimName, err)
   659  		}
   660  		ec.claimCache.Mutation(claim)
   661  	}
   662  
   663  	// Remember the new ResourceClaim for a batch PodStatus update in our caller.
   664  	if *newPodClaims == nil {
   665  		*newPodClaims = make(map[string]string)
   666  	}
   667  	(*newPodClaims)[podClaim.Name] = claim.Name
   668  
   669  	return nil
   670  }
   671  
   672  // findPodResourceClaim looks for an existing ResourceClaim with the right
   673  // annotation (ties it to the pod claim) and the right ownership (ties it to
   674  // the pod).
   675  func (ec *Controller) findPodResourceClaim(pod *v1.Pod, podClaim v1.PodResourceClaim) (*resourcev1alpha2.ResourceClaim, error) {
   676  	// Only claims owned by the pod will get returned here.
   677  	claims, err := ec.claimCache.ByIndex(claimPodOwnerIndex, string(pod.UID))
   678  	if err != nil {
   679  		return nil, err
   680  	}
   681  	deterministicName := pod.Name + "-" + podClaim.Name // Kubernetes <= 1.27 behavior.
   682  	for _, claimObj := range claims {
   683  		claim, ok := claimObj.(*resourcev1alpha2.ResourceClaim)
   684  		if !ok {
   685  			return nil, fmt.Errorf("unexpected object of type %T returned by claim cache", claimObj)
   686  		}
   687  		podClaimName, ok := claim.Annotations[podResourceClaimAnnotation]
   688  		if ok && podClaimName != podClaim.Name {
   689  			continue
   690  		}
   691  
    692  		// No annotation? It might be a ResourceClaim created for
   693  		// the pod with a previous Kubernetes release where the
   694  		// ResourceClaim name was deterministic, in which case
   695  		// we have to use it and update the new pod status
   696  		// field accordingly.
   697  		if !ok && claim.Name != deterministicName {
   698  			continue
   699  		}
   700  
   701  		// Pick the first one that matches. There shouldn't be more than one. If there is,
   702  		// then all others will be ignored until the pod gets deleted. Then they also get
   703  		// cleaned up.
   704  		return claim, nil
   705  	}
   706  	return nil, nil
   707  }
   708  
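         // ensurePodSchedulingContext creates a PodSchedulingContext for a pod
         // which got scheduled without one, or updates spec.selectedNode so that
         // it matches the node chosen for the pod.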
   709  func (ec *Controller) ensurePodSchedulingContext(ctx context.Context, pod *v1.Pod) error {
   710  	scheduling, err := ec.podSchedulingLister.PodSchedulingContexts(pod.Namespace).Get(pod.Name)
   711  	if err != nil && !apierrors.IsNotFound(err) {
   712  		return fmt.Errorf("retrieve PodSchedulingContext: %v", err)
   713  	}
   714  	if scheduling == nil {
   715  		scheduling = &resourcev1alpha2.PodSchedulingContext{
   716  			ObjectMeta: metav1.ObjectMeta{
   717  				Name:      pod.Name,
   718  				Namespace: pod.Namespace,
   719  				OwnerReferences: []metav1.OwnerReference{
   720  					{
   721  						APIVersion: "v1",
   722  						Kind:       "Pod",
   723  						Name:       pod.Name,
   724  						UID:        pod.UID,
   725  						Controller: pointer.Bool(true),
   726  					},
   727  				},
   728  			},
   729  			Spec: resourcev1alpha2.PodSchedulingContextSpec{
   730  				SelectedNode: pod.Spec.NodeName,
   731  				// There is no need for negotiation about
   732  				// potential and suitable nodes anymore, so
   733  				// PotentialNodes can be left empty.
   734  			},
   735  		}
   736  		if _, err := ec.kubeClient.ResourceV1alpha2().PodSchedulingContexts(pod.Namespace).Create(ctx, scheduling, metav1.CreateOptions{}); err != nil {
   737  			return fmt.Errorf("create PodSchedulingContext: %v", err)
   738  		}
   739  		return nil
   740  	}
   741  
   742  	if scheduling.Spec.SelectedNode != pod.Spec.NodeName {
   743  		scheduling := scheduling.DeepCopy()
   744  		scheduling.Spec.SelectedNode = pod.Spec.NodeName
   745  		if _, err := ec.kubeClient.ResourceV1alpha2().PodSchedulingContexts(pod.Namespace).Update(ctx, scheduling, metav1.UpdateOptions{}); err != nil {
   746  			return fmt.Errorf("update spec.selectedNode in PodSchedulingContext: %v", err)
   747  		}
   748  	}
   749  
   750  	return nil
   751  }
   752  
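         // reserveForPod adds the pod to the claim's status.reservedFor list,
         // marking the claim as in use by the pod.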
   753  func (ec *Controller) reserveForPod(ctx context.Context, pod *v1.Pod, claim *resourcev1alpha2.ResourceClaim) error {
   754  	claim = claim.DeepCopy()
   755  	claim.Status.ReservedFor = append(claim.Status.ReservedFor,
   756  		resourcev1alpha2.ResourceClaimConsumerReference{
   757  			Resource: "pods",
   758  			Name:     pod.Name,
   759  			UID:      pod.UID,
   760  		})
   761  	if _, err := ec.kubeClient.ResourceV1alpha2().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{}); err != nil {
   762  		return fmt.Errorf("reserve claim for pod: %v", err)
   763  	}
   764  	return nil
   765  }
   766  
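         // syncClaim removes stale entries from the claim's status.reservedFor,
         // triggers deallocation or finalizer removal once the claim is no longer
         // reserved, and deletes generated claims whose owning pod will not run.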
   767  func (ec *Controller) syncClaim(ctx context.Context, namespace, name string) error {
   768  	logger := klog.LoggerWithValues(klog.FromContext(ctx), "claim", klog.KRef(namespace, name))
   769  	ctx = klog.NewContext(ctx, logger)
   770  	claim, err := ec.claimLister.ResourceClaims(namespace).Get(name)
   771  	if err != nil {
   772  		if apierrors.IsNotFound(err) {
   773  			logger.V(5).Info("nothing to do for claim, it is gone")
   774  			return nil
   775  		}
   776  		return err
   777  	}
   778  
   779  	// Check if the ReservedFor entries are all still valid.
   780  	valid := make([]resourcev1alpha2.ResourceClaimConsumerReference, 0, len(claim.Status.ReservedFor))
   781  	for _, reservedFor := range claim.Status.ReservedFor {
   782  		if reservedFor.APIGroup == "" &&
   783  			reservedFor.Resource == "pods" {
   784  			// A pod falls into one of three categories:
   785  			// - we have it in our cache -> don't remove it until we are told that it got removed
   786  			// - we don't have it in our cache anymore, but we have seen it before -> it was deleted, remove it
   787  			// - not in our cache, not seen -> double-check with API server before removal
   788  
   789  			keepEntry := true
   790  
   791  			// Tracking deleted pods in the LRU cache is an
   792  			// optimization. Without this cache, the code would
   793  			// have to do the API call below for every deleted pod
   794  			// to ensure that the pod really doesn't exist. With
   795  			// the cache, most of the time the pod will be recorded
   796  			// as deleted and the API call can be avoided.
   797  			if ec.deletedObjects.Has(reservedFor.UID) {
   798  				// We know that the pod was deleted. This is
   799  				// easy to check and thus is done first.
   800  				keepEntry = false
   801  			} else {
   802  				pod, err := ec.podLister.Pods(claim.Namespace).Get(reservedFor.Name)
   803  				switch {
   804  				case err != nil && !apierrors.IsNotFound(err):
   805  					return err
   806  				case err != nil:
   807  					// We might not have it in our informer cache
   808  					// yet. Removing the pod while the scheduler is
   809  					// scheduling it would be bad. We have to be
   810  					// absolutely sure and thus have to check with
   811  					// the API server.
   812  					pod, err := ec.kubeClient.CoreV1().Pods(claim.Namespace).Get(ctx, reservedFor.Name, metav1.GetOptions{})
   813  					if err != nil && !apierrors.IsNotFound(err) {
   814  						return err
   815  					}
   816  					if pod == nil || pod.UID != reservedFor.UID {
   817  						logger.V(6).Info("remove reservation because pod is gone or got replaced", "pod", klog.KObj(pod), "claim", klog.KRef(namespace, name))
   818  						keepEntry = false
   819  					}
   820  				case pod.UID != reservedFor.UID:
   821  					logger.V(6).Info("remove reservation because pod got replaced with new instance", "pod", klog.KObj(pod), "claim", klog.KRef(namespace, name))
   822  					keepEntry = false
   823  				case isPodDone(pod):
   824  					logger.V(6).Info("remove reservation because pod will not run anymore", "pod", klog.KObj(pod), "claim", klog.KRef(namespace, name))
   825  					keepEntry = false
   826  				}
   827  			}
   828  
   829  			if keepEntry {
   830  				valid = append(valid, reservedFor)
   831  			}
   832  			continue
   833  		}
   834  
   835  		// TODO: support generic object lookup
   836  		return fmt.Errorf("unsupported ReservedFor entry: %v", reservedFor)
   837  	}
   838  
   839  	builtinControllerFinalizer := slices.Index(claim.Finalizers, resourcev1alpha2.Finalizer)
   840  	logger.V(5).Info("claim reserved for counts", "currentCount", len(claim.Status.ReservedFor), "claim", klog.KRef(namespace, name), "updatedCount", len(valid), "builtinController", builtinControllerFinalizer >= 0)
   841  	if len(valid) < len(claim.Status.ReservedFor) {
   842  		// This is not using a patch because we want the update to fail if anything
   843  		// changed in the meantime.
   844  		claim := claim.DeepCopy()
   845  		claim.Status.ReservedFor = valid
   846  
   847  		// When a ResourceClaim uses delayed allocation, then it makes sense to
   848  		// deallocate the claim as soon as the last consumer stops using
   849  		// it. This ensures that the claim can be allocated again as needed by
   850  		// some future consumer instead of trying to schedule that consumer
   851  		// onto the node that was chosen for the previous consumer. It also
   852  		// releases the underlying resources for use by other claims.
   853  		//
   854  		// This has to be triggered by the transition from "was being used" to
   855  		// "is not used anymore" because a DRA driver is not required to set
   856  		// `status.reservedFor` together with `status.allocation`, i.e. a claim
   857  		// that is "currently unused" should not get deallocated.
   858  		//
   859  		// This does not matter for claims that were created for a pod. For
   860  		// those, the resource claim controller will trigger deletion when the
   861  		// pod is done. However, it doesn't hurt to also trigger deallocation
   862  		// for such claims and not checking for them keeps this code simpler.
   863  		if len(valid) == 0 {
   864  			if builtinControllerFinalizer >= 0 {
   865  				if claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer ||
   866  					claim.DeletionTimestamp != nil {
   867  					// Allocated by scheduler with structured parameters. We can "deallocate"
   868  					// by clearing the allocation.
   869  					claim.Status.Allocation = nil
   870  				}
   871  			} else if claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer {
   872  				// DRA driver controller in the control plane
   873  				// needs to do the deallocation.
   874  				claim.Status.DeallocationRequested = true
   875  			}
   876  			// In all other cases, we keep the claim allocated, in particular for immediate allocation
   877  			// with a control plane controller.
   878  		}
   879  
   880  		claim, err := ec.kubeClient.ResourceV1alpha2().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{})
   881  		if err != nil {
   882  			return err
   883  		}
   884  
   885  		// Now also remove the finalizer if it is not needed anymore.
   886  		// Note that the index may have changed as a result of the UpdateStatus call.
   887  		builtinControllerFinalizer := slices.Index(claim.Finalizers, resourcev1alpha2.Finalizer)
   888  		if builtinControllerFinalizer >= 0 && claim.Status.Allocation == nil {
   889  			claim.Finalizers = slices.Delete(claim.Finalizers, builtinControllerFinalizer, builtinControllerFinalizer+1)
   890  			if _, err := ec.kubeClient.ResourceV1alpha2().ResourceClaims(claim.Namespace).Update(ctx, claim, metav1.UpdateOptions{}); err != nil {
   891  				return err
   892  			}
   893  		}
   894  	} else if builtinControllerFinalizer >= 0 && claim.DeletionTimestamp != nil && len(valid) == 0 {
   895  		claim := claim.DeepCopy()
   896  		if claim.Status.Allocation != nil {
   897  			// This can happen when a claim with immediate allocation
   898  			// stopped being used, remained allocated, and then got
   899  			// deleted. As above we then need to clear the allocation.
   900  			claim.Status.Allocation = nil
   901  			var err error
   902  			claim, err = ec.kubeClient.ResourceV1alpha2().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{})
   903  			if err != nil {
   904  				return err
   905  			}
   906  		}
   907  		// Whether it was allocated or not, remove the finalizer to unblock removal.
   908  		claim.Finalizers = slices.Delete(claim.Finalizers, builtinControllerFinalizer, builtinControllerFinalizer+1)
   909  		_, err := ec.kubeClient.ResourceV1alpha2().ResourceClaims(claim.Namespace).Update(ctx, claim, metav1.UpdateOptions{})
   910  		if err != nil {
   911  			return err
   912  		}
   913  	}
   914  
   915  	if len(valid) == 0 {
   916  		// Claim is not reserved. If it was generated for a pod and
   917  		// that pod is not going to run, the claim can be
   918  		// deleted. Normally the garbage collector does that, but the
   919  		// pod itself might not get deleted for a while.
   920  		podName, podUID := owningPod(claim)
   921  		if podName != "" {
   922  			pod, err := ec.podLister.Pods(claim.Namespace).Get(podName)
   923  			switch {
   924  			case err == nil:
   925  				// Pod already replaced or not going to run?
   926  				if pod.UID != podUID || isPodDone(pod) {
   927  					// We are certain that the owning pod is not going to need
   928  					// the claim and therefore remove the claim.
   929  					logger.V(5).Info("deleting unused generated claim", "claim", klog.KObj(claim), "pod", klog.KObj(pod))
   930  					err := ec.kubeClient.ResourceV1alpha2().ResourceClaims(claim.Namespace).Delete(ctx, claim.Name, metav1.DeleteOptions{})
   931  					if err != nil {
   932  						return fmt.Errorf("delete claim: %v", err)
   933  					}
   934  				} else {
   935  					logger.V(6).Info("wrong pod content, not deleting claim", "claim", klog.KObj(claim), "podUID", podUID, "podContent", pod)
   936  				}
   937  			case apierrors.IsNotFound(err):
   938  				// We might not know the pod *yet*. Instead of doing an expensive API call,
   939  				// let the garbage collector handle the case that the pod is truly gone.
   940  				logger.V(5).Info("pod for claim not found", "claim", klog.KObj(claim), "pod", klog.KRef(claim.Namespace, podName))
   941  			default:
   942  				return fmt.Errorf("lookup pod: %v", err)
   943  			}
   944  		} else {
   945  			logger.V(5).Info("claim not generated for a pod", "claim", klog.KObj(claim))
   946  		}
   947  	}
   948  
   949  	return nil
   950  }
   951  
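         // owningPod returns the name and UID of the pod which is the controlling
         // owner of the claim, or empty values if there is none.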
   952  func owningPod(claim *resourcev1alpha2.ResourceClaim) (string, types.UID) {
   953  	for _, owner := range claim.OwnerReferences {
   954  		if pointer.BoolDeref(owner.Controller, false) &&
   955  			owner.APIVersion == "v1" &&
   956  			owner.Kind == "Pod" {
   957  			return owner.Name, owner.UID
   958  		}
   959  	}
   960  	return "", ""
   961  }
   962  
    963  // podResourceClaimIndexFunc is an index function that returns ResourceClaim keys (=
    964  // namespace/name) for ResourceClaims or ResourceClaimTemplates in a given pod.
   965  func podResourceClaimIndexFunc(obj interface{}) ([]string, error) {
   966  	pod, ok := obj.(*v1.Pod)
   967  	if !ok {
   968  		return []string{}, nil
   969  	}
   970  	keys := []string{}
   971  	for _, podClaim := range pod.Spec.ResourceClaims {
   972  		claimName, _, err := resourceclaim.Name(pod, &podClaim)
   973  		if err != nil || claimName == nil {
   974  			// Index functions are not supposed to fail, the caller will panic.
   975  			// For both error reasons (claim not created yet, unknown API)
   976  			// we simply don't index.
   977  			continue
   978  		}
   979  		keys = append(keys, fmt.Sprintf("%s/%s", pod.Namespace, *claimName))
   980  	}
   981  	return keys, nil
   982  }
   983  
   984  // isPodDone returns true if it is certain that none of the containers are running and never will run.
   985  func isPodDone(pod *v1.Pod) bool {
   986  	return podutil.IsPodPhaseTerminal(pod.Status.Phase) ||
   987  		// Deleted and not scheduled:
   988  		pod.DeletionTimestamp != nil && pod.Spec.NodeName == ""
   989  }
   990  
   991  // claimPodOwnerIndexFunc is an index function that returns the pod UIDs of
   992  // all pods which own the resource claim. Should only be one, though.
   993  func claimPodOwnerIndexFunc(obj interface{}) ([]string, error) {
   994  	claim, ok := obj.(*resourcev1alpha2.ResourceClaim)
   995  	if !ok {
   996  		return nil, nil
   997  	}
   998  	var keys []string
   999  	for _, owner := range claim.OwnerReferences {
  1000  		if owner.Controller != nil &&
  1001  			*owner.Controller &&
  1002  			owner.APIVersion == "v1" &&
  1003  			owner.Kind == "Pod" {
  1004  			keys = append(keys, string(owner.UID))
  1005  		}
  1006  	}
  1007  	return keys, nil
  1008  }