k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go

     1  /*
     2  Copyright 2022 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package dynamicresources
    18  
    19  import (
    20  	"context"
    21  	"encoding/json"
    22  	"errors"
    23  	"fmt"
    24  	"slices"
    25  	"sort"
    26  	"sync"
    27  
    28  	"github.com/google/go-cmp/cmp"
    29  
    30  	v1 "k8s.io/api/core/v1"
    31  	resourcev1alpha2 "k8s.io/api/resource/v1alpha2"
    32  	apiequality "k8s.io/apimachinery/pkg/api/equality"
    33  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    34  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    35  	"k8s.io/apimachinery/pkg/labels"
    36  	"k8s.io/apimachinery/pkg/runtime"
    37  	"k8s.io/apimachinery/pkg/runtime/schema"
    38  	"k8s.io/apimachinery/pkg/types"
    39  	"k8s.io/apimachinery/pkg/util/sets"
    40  	resourcev1alpha2apply "k8s.io/client-go/applyconfigurations/resource/v1alpha2"
    41  	"k8s.io/client-go/kubernetes"
    42  	resourcev1alpha2listers "k8s.io/client-go/listers/resource/v1alpha2"
    43  	"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
    44  	"k8s.io/dynamic-resource-allocation/resourceclaim"
    45  	"k8s.io/klog/v2"
    46  	"k8s.io/kubernetes/pkg/scheduler/framework"
    47  	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
    48  	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
    49  	schedutil "k8s.io/kubernetes/pkg/scheduler/util"
    50  	"k8s.io/kubernetes/pkg/scheduler/util/assumecache"
    51  	"k8s.io/utils/ptr"
    52  )
    53  
    54  const (
    55  	// Name is the name of the plugin used in Registry and configurations.
    56  	Name = names.DynamicResources
    57  
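        	// stateKey is the key under which this plugin stores its per-cycle
        	// stateData in the framework.CycleState.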
    58  	stateKey framework.StateKey = Name
    59  )
    60  
     61  // stateData is the state initialized in the PreFilter phase. Because we save the
     62  // pointer in framework.CycleState, later phases don't need to call Write to
     63  // update the value.
    64  type stateData struct {
    65  	// preScored is true if PreScore was invoked.
    66  	preScored bool
    67  
    68  	// A copy of all claims for the Pod (i.e. 1:1 match with
    69  	// pod.Spec.ResourceClaims), initially with the status from the start
    70  	// of the scheduling cycle. Each claim instance is read-only because it
    71  	// might come from the informer cache. The instances get replaced when
    72  	// the plugin itself successfully does an Update.
    73  	//
    74  	// Empty if the Pod has no claims.
    75  	claims []*resourcev1alpha2.ResourceClaim
    76  
    77  	// podSchedulingState keeps track of the PodSchedulingContext
    78  	// (if one exists) and the changes made to it.
    79  	podSchedulingState podSchedulingState
    80  
     81  	// resources contains the information about available and allocated resources
     82  	// when structured parameters are used and the pod needs this information.
    83  	resources resources
    84  
    85  	// mutex must be locked while accessing any of the fields below.
    86  	mutex sync.Mutex
    87  
    88  	// The indices of all claims that:
    89  	// - are allocated
    90  	// - use delayed allocation or the builtin controller
    91  	// - were not available on at least one node
    92  	//
    93  	// Set in parallel during Filter, so write access there must be
    94  	// protected by the mutex. Used by PostFilter.
    95  	unavailableClaims sets.Set[int]
    96  
    97  	informationsForClaim []informationForClaim
    98  }
    99  
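        // Clone implements framework.StateData. The same pointer is returned because
        // later phases work on the shared state directly; fields written in parallel
        // (see unavailableClaims) are protected by the mutex.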
   100  func (d *stateData) Clone() framework.StateData {
   101  	return d
   102  }
   103  
   104  type informationForClaim struct {
   105  	// The availableOnNode node filter of the claim converted from the
   106  	// v1 API to nodeaffinity.NodeSelector by PreFilter for repeated
    107  	// evaluation in Filter. Nil for claims which don't have one.
   108  	availableOnNode *nodeaffinity.NodeSelector
   109  
    110  	// The status of the claim, retrieved from the
    111  	// schedulingCtx by PreFilter for repeated
    112  	// evaluation in Filter. Nil for claims which don't have one.
   113  	status *resourcev1alpha2.ResourceClaimSchedulingStatus
   114  
   115  	// structuredParameters is true if the claim is handled via the builtin
   116  	// controller.
   117  	structuredParameters bool
   118  	controller           *claimController
   119  
    120  	// Set by Reserve, published by PreBind.
   121  	allocation           *resourcev1alpha2.AllocationResult
   122  	allocationDriverName string
   123  }
   124  
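        // podSchedulingState accumulates the changes which this plugin wants to make
        // to the pod's PodSchedulingContext and publishes them in a single API call.
        //
        // For illustration only (the values are made up): an object created by the
        // scheduler and then filled in by a DRA driver might look roughly like
        //
        //	apiVersion: resource.k8s.io/v1alpha2
        //	kind: PodSchedulingContext
        //	metadata:
        //	  name: my-pod           # same name/namespace as the Pod, owned by it
        //	  namespace: default
        //	spec:
        //	  selectedNode: node-1
        //	  potentialNodes: ["node-1", "node-2"]
        //	status:
        //	  resourceClaims:        # added by the DRA driver(s)
        //	  - name: my-claim
        //	    unsuitableNodes: ["node-2"]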
   125  type podSchedulingState struct {
   126  	// A pointer to the PodSchedulingContext object for the pod, if one exists
   127  	// in the API server.
   128  	//
    129  	// Conceptually, this object belongs in the scheduler framework
    130  	// where it might get shared by different plugins. But in practice,
    131  	// it is currently only used by dynamic resource allocation and thus
   132  	// managed entirely here.
   133  	schedulingCtx *resourcev1alpha2.PodSchedulingContext
   134  
   135  	// selectedNode is set if (and only if) a node has been selected.
   136  	selectedNode *string
   137  
   138  	// potentialNodes is set if (and only if) the potential nodes field
   139  	// needs to be updated or set.
   140  	potentialNodes *[]string
   141  }
   142  
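        // isDirty returns true if there are changes to the PodSchedulingContext spec
        // which have not been published to the API server yet.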
   143  func (p *podSchedulingState) isDirty() bool {
   144  	return p.selectedNode != nil ||
   145  		p.potentialNodes != nil
   146  }
   147  
   148  // init checks whether there is already a PodSchedulingContext object.
    149  // Must not be called concurrently.
   150  func (p *podSchedulingState) init(ctx context.Context, pod *v1.Pod, podSchedulingContextLister resourcev1alpha2listers.PodSchedulingContextLister) error {
   151  	schedulingCtx, err := podSchedulingContextLister.PodSchedulingContexts(pod.Namespace).Get(pod.Name)
   152  	switch {
   153  	case apierrors.IsNotFound(err):
   154  		return nil
   155  	case err != nil:
   156  		return err
   157  	default:
   158  		// We have an object, but it might be obsolete.
   159  		if !metav1.IsControlledBy(schedulingCtx, pod) {
   160  			return fmt.Errorf("PodSchedulingContext object with UID %s is not owned by Pod %s/%s", schedulingCtx.UID, pod.Namespace, pod.Name)
   161  		}
   162  	}
   163  	p.schedulingCtx = schedulingCtx
   164  	return nil
   165  }
   166  
   167  // publish creates or updates the PodSchedulingContext object, if necessary.
   168  // Must not be called concurrently.
   169  func (p *podSchedulingState) publish(ctx context.Context, pod *v1.Pod, clientset kubernetes.Interface) error {
   170  	if !p.isDirty() {
   171  		return nil
   172  	}
   173  
   174  	var err error
   175  	logger := klog.FromContext(ctx)
   176  	if p.schedulingCtx != nil {
   177  		// Update it.
   178  		schedulingCtx := p.schedulingCtx.DeepCopy()
   179  		if p.selectedNode != nil {
   180  			schedulingCtx.Spec.SelectedNode = *p.selectedNode
   181  		}
   182  		if p.potentialNodes != nil {
   183  			schedulingCtx.Spec.PotentialNodes = *p.potentialNodes
   184  		}
   185  		if loggerV := logger.V(6); loggerV.Enabled() {
   186  			// At a high enough log level, dump the entire object.
   187  			loggerV.Info("Updating PodSchedulingContext", "podSchedulingCtx", klog.KObj(schedulingCtx), "podSchedulingCtxObject", klog.Format(schedulingCtx))
   188  		} else {
   189  			logger.V(5).Info("Updating PodSchedulingContext", "podSchedulingCtx", klog.KObj(schedulingCtx))
   190  		}
   191  		_, err = clientset.ResourceV1alpha2().PodSchedulingContexts(schedulingCtx.Namespace).Update(ctx, schedulingCtx, metav1.UpdateOptions{})
   192  		if apierrors.IsConflict(err) {
   193  			// We don't use SSA by default for performance reasons
   194  			// (https://github.com/kubernetes/kubernetes/issues/113700#issuecomment-1698563918)
   195  			// because most of the time an Update doesn't encounter
   196  			// a conflict and is faster.
   197  			//
   198  			// We could return an error here and rely on
   199  			// backoff+retry, but scheduling attempts are expensive
   200  			// and the backoff delay would cause a (small)
   201  			// slowdown. Therefore we fall back to SSA here if needed.
   202  			//
   203  			// Using SSA instead of Get+Update has the advantage that
   204  			// there is no delay for the Get. SSA is safe because only
   205  			// the scheduler updates these fields.
   206  			spec := resourcev1alpha2apply.PodSchedulingContextSpec()
   207  			spec.SelectedNode = p.selectedNode
   208  			if p.potentialNodes != nil {
   209  				spec.PotentialNodes = *p.potentialNodes
   210  			} else {
   211  				// Unchanged. Has to be set because the object that we send
   212  				// must represent the "fully specified intent". Not sending
   213  				// the list would clear it.
   214  				spec.PotentialNodes = p.schedulingCtx.Spec.PotentialNodes
   215  			}
   216  			schedulingCtxApply := resourcev1alpha2apply.PodSchedulingContext(pod.Name, pod.Namespace).WithSpec(spec)
   217  
   218  			if loggerV := logger.V(6); loggerV.Enabled() {
   219  				// At a high enough log level, dump the entire object.
   220  				loggerV.Info("Patching PodSchedulingContext", "podSchedulingCtx", klog.KObj(pod), "podSchedulingCtxApply", klog.Format(schedulingCtxApply))
   221  			} else {
   222  				logger.V(5).Info("Patching PodSchedulingContext", "podSchedulingCtx", klog.KObj(pod))
   223  			}
   224  			_, err = clientset.ResourceV1alpha2().PodSchedulingContexts(pod.Namespace).Apply(ctx, schedulingCtxApply, metav1.ApplyOptions{FieldManager: "kube-scheduler", Force: true})
   225  		}
   226  
   227  	} else {
   228  		// Create it.
   229  		schedulingCtx := &resourcev1alpha2.PodSchedulingContext{
   230  			ObjectMeta: metav1.ObjectMeta{
   231  				Name:            pod.Name,
   232  				Namespace:       pod.Namespace,
   233  				OwnerReferences: []metav1.OwnerReference{*metav1.NewControllerRef(pod, schema.GroupVersionKind{Version: "v1", Kind: "Pod"})},
   234  			},
   235  		}
   236  		if p.selectedNode != nil {
   237  			schedulingCtx.Spec.SelectedNode = *p.selectedNode
   238  		}
   239  		if p.potentialNodes != nil {
   240  			schedulingCtx.Spec.PotentialNodes = *p.potentialNodes
   241  		}
   242  		if loggerV := logger.V(6); loggerV.Enabled() {
   243  			// At a high enough log level, dump the entire object.
   244  			loggerV.Info("Creating PodSchedulingContext", "podSchedulingCtx", klog.KObj(schedulingCtx), "podSchedulingCtxObject", klog.Format(schedulingCtx))
   245  		} else {
   246  			logger.V(5).Info("Creating PodSchedulingContext", "podSchedulingCtx", klog.KObj(schedulingCtx))
   247  		}
   248  		_, err = clientset.ResourceV1alpha2().PodSchedulingContexts(schedulingCtx.Namespace).Create(ctx, schedulingCtx, metav1.CreateOptions{})
   249  	}
   250  	if err != nil {
   251  		return err
   252  	}
   253  	p.potentialNodes = nil
   254  	p.selectedNode = nil
   255  	return nil
   256  }
   257  
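        // statusForClaim returns the entry in the PodSchedulingContext status for the
        // given pod claim name, or nil if there is none (yet).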
   258  func statusForClaim(schedulingCtx *resourcev1alpha2.PodSchedulingContext, podClaimName string) *resourcev1alpha2.ResourceClaimSchedulingStatus {
   259  	if schedulingCtx == nil {
   260  		return nil
   261  	}
   262  	for _, status := range schedulingCtx.Status.ResourceClaims {
   263  		if status.Name == podClaimName {
   264  			return &status
   265  		}
   266  	}
   267  	return nil
   268  }
   269  
   270  // dynamicResources is a plugin that ensures that ResourceClaims are allocated.
   271  type dynamicResources struct {
   272  	enabled                    bool
   273  	fh                         framework.Handle
   274  	clientset                  kubernetes.Interface
   275  	claimLister                resourcev1alpha2listers.ResourceClaimLister
   276  	classLister                resourcev1alpha2listers.ResourceClassLister
   277  	podSchedulingContextLister resourcev1alpha2listers.PodSchedulingContextLister
   278  	claimParametersLister      resourcev1alpha2listers.ResourceClaimParametersLister
   279  	classParametersLister      resourcev1alpha2listers.ResourceClassParametersLister
   280  	resourceSliceLister        resourcev1alpha2listers.ResourceSliceLister
   281  	claimNameLookup            *resourceclaim.Lookup
   282  
   283  	// claimAssumeCache enables temporarily storing a newer claim object
   284  	// while the scheduler has allocated it and the corresponding object
   285  	// update from the apiserver has not been processed by the claim
   286  	// informer callbacks. Claims get added here in PreBind and removed by
   287  	// the informer callback (based on the "newer than" comparison in the
   288  	// assume cache).
   289  	//
   290  	// It uses cache.MetaNamespaceKeyFunc to generate object names, which
   291  	// therefore are "<namespace>/<name>".
   292  	//
   293  	// This is necessary to ensure that reconstructing the resource usage
   294  	// at the start of a pod scheduling cycle doesn't reuse the resources
   295  	// assigned to such a claim. Alternatively, claim allocation state
   296  	// could also get tracked across pod scheduling cycles, but that
   297  	// - adds complexity (need to carefully sync state with informer events
   298  	//   for claims and ResourceSlices)
   299  	// - would make integration with cluster autoscaler harder because it would need
   300  	//   to trigger informer callbacks.
   301  	//
   302  	// When implementing cluster autoscaler support, this assume cache or
   303  	// something like it (see https://github.com/kubernetes/kubernetes/pull/112202)
   304  	// might have to be managed by the cluster autoscaler.
   305  	claimAssumeCache *assumecache.AssumeCache
   306  
    307  	// inFlightAllocations is a map from claim UIDs to claim objects for those claims
   308  	// for which allocation was triggered during a scheduling cycle and the
   309  	// corresponding claim status update call in PreBind has not been done
   310  	// yet. If another pod needs the claim, the pod is treated as "not
   311  	// schedulable yet". The cluster event for the claim status update will
   312  	// make it schedulable.
   313  	//
   314  	// This mechanism avoids the following problem:
   315  	// - Pod A triggers allocation for claim X.
   316  	// - Pod B shares access to that claim and gets scheduled because
   317  	//   the claim is assumed to be allocated.
   318  	// - PreBind for pod B is called first, tries to update reservedFor and
   319  	//   fails because the claim is not really allocated yet.
   320  	//
   321  	// We could avoid the ordering problem by allowing either pod A or pod B
   322  	// to set the allocation. But that is more complicated and leads to another
   323  	// problem:
   324  	// - Pod A and B get scheduled as above.
   325  	// - PreBind for pod A gets called first, then fails with a temporary API error.
   326  	//   It removes the updated claim from the assume cache because of that.
   327  	// - PreBind for pod B gets called next and succeeds with adding the
   328  	//   allocation and its own reservedFor entry.
   329  	// - The assume cache is now not reflecting that the claim is allocated,
   330  	//   which could lead to reusing the same resource for some other claim.
   331  	//
   332  	// A sync.Map is used because in practice sharing of a claim between
    333  	// pods is expected to be rare compared to per-pod claims, so we end up
   334  	// hitting the "multiple goroutines read, write, and overwrite entries
   335  	// for disjoint sets of keys" case that sync.Map is optimized for.
   336  	inFlightAllocations sync.Map
   337  }
   338  
   339  // New initializes a new plugin and returns it.
   340  func New(ctx context.Context, plArgs runtime.Object, fh framework.Handle, fts feature.Features) (framework.Plugin, error) {
   341  	if !fts.EnableDynamicResourceAllocation {
   342  		// Disabled, won't do anything.
   343  		return &dynamicResources{}, nil
   344  	}
   345  
   346  	logger := klog.FromContext(ctx)
   347  	pl := &dynamicResources{
   348  		enabled:                    true,
   349  		fh:                         fh,
   350  		clientset:                  fh.ClientSet(),
   351  		claimLister:                fh.SharedInformerFactory().Resource().V1alpha2().ResourceClaims().Lister(),
   352  		classLister:                fh.SharedInformerFactory().Resource().V1alpha2().ResourceClasses().Lister(),
   353  		podSchedulingContextLister: fh.SharedInformerFactory().Resource().V1alpha2().PodSchedulingContexts().Lister(),
   354  		claimParametersLister:      fh.SharedInformerFactory().Resource().V1alpha2().ResourceClaimParameters().Lister(),
   355  		classParametersLister:      fh.SharedInformerFactory().Resource().V1alpha2().ResourceClassParameters().Lister(),
   356  		resourceSliceLister:        fh.SharedInformerFactory().Resource().V1alpha2().ResourceSlices().Lister(),
   357  		claimNameLookup:            resourceclaim.NewNameLookup(fh.ClientSet()),
   358  		claimAssumeCache:           assumecache.NewAssumeCache(logger, fh.SharedInformerFactory().Resource().V1alpha2().ResourceClaims().Informer(), "claim", "", nil),
   359  	}
   360  
   361  	return pl, nil
   362  }
   363  
   364  var _ framework.PreEnqueuePlugin = &dynamicResources{}
   365  var _ framework.PreFilterPlugin = &dynamicResources{}
   366  var _ framework.FilterPlugin = &dynamicResources{}
   367  var _ framework.PostFilterPlugin = &dynamicResources{}
   368  var _ framework.PreScorePlugin = &dynamicResources{}
   369  var _ framework.ReservePlugin = &dynamicResources{}
   370  var _ framework.EnqueueExtensions = &dynamicResources{}
   371  var _ framework.PreBindPlugin = &dynamicResources{}
   372  var _ framework.PostBindPlugin = &dynamicResources{}
   373  
   374  // Name returns name of the plugin. It is used in logs, etc.
   375  func (pl *dynamicResources) Name() string {
   376  	return Name
   377  }
   378  
    379  // EventsToRegister returns the possible events that may make a Pod
    380  // that was rejected by this plugin schedulable.
   381  func (pl *dynamicResources) EventsToRegister() []framework.ClusterEventWithHint {
   382  	if !pl.enabled {
   383  		return nil
   384  	}
   385  
   386  	events := []framework.ClusterEventWithHint{
    387  		// Adding or updating claim or class parameters may make pods
    388  		// schedulable which depend on claims using those parameters.
   389  		{Event: framework.ClusterEvent{Resource: framework.ResourceClaimParameters, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterClaimParametersChange},
   390  		{Event: framework.ClusterEvent{Resource: framework.ResourceClassParameters, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterClassParametersChange},
   391  
   392  		// Allocation is tracked in ResourceClaims, so any changes may make the pods schedulable.
   393  		{Event: framework.ClusterEvent{Resource: framework.ResourceClaim, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterClaimChange},
   394  		// When a driver has provided additional information, a pod waiting for that information
   395  		// may be schedulable.
   396  		{Event: framework.ClusterEvent{Resource: framework.PodSchedulingContext, ActionType: framework.Add | framework.Update}, QueueingHintFn: pl.isSchedulableAfterPodSchedulingContextChange},
   397  		// A resource might depend on node labels for topology filtering.
   398  		// A new or updated node may make pods schedulable.
   399  		//
   400  		// A note about UpdateNodeTaint event:
   401  		// NodeAdd QueueingHint isn't always called because of the internal feature called preCheck.
   402  		// As a common problematic scenario,
   403  		// when a node is added but not ready, NodeAdd event is filtered out by preCheck and doesn't arrive.
   404  		// In such cases, this plugin may miss some events that actually make pods schedulable.
   405  		// As a workaround, we add UpdateNodeTaint event to catch the case.
   406  		// We can remove UpdateNodeTaint when we remove the preCheck feature.
   407  		// See: https://github.com/kubernetes/kubernetes/issues/110175
   408  		{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel | framework.UpdateNodeTaint}},
   409  		// A pod might be waiting for a class to get created or modified.
   410  		{Event: framework.ClusterEvent{Resource: framework.ResourceClass, ActionType: framework.Add | framework.Update}},
   411  	}
   412  	return events
   413  }
   414  
   415  // PreEnqueue checks if there are known reasons why a pod currently cannot be
   416  // scheduled. When this fails, one of the registered events can trigger another
   417  // attempt.
   418  func (pl *dynamicResources) PreEnqueue(ctx context.Context, pod *v1.Pod) (status *framework.Status) {
   419  	if err := pl.foreachPodResourceClaim(pod, nil); err != nil {
   420  		return statusUnschedulable(klog.FromContext(ctx), err.Error())
   421  	}
   422  	return nil
   423  }
   424  
   425  // isSchedulableAfterClaimParametersChange is invoked for add and update claim parameters events reported by
   426  // an informer. It checks whether that change made a previously unschedulable
   427  // pod schedulable. It errs on the side of letting a pod scheduling attempt
    428  // happen. A delete event will not invoke it, so newObj will never be nil.
   429  func (pl *dynamicResources) isSchedulableAfterClaimParametersChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
   430  	originalParameters, modifiedParameters, err := schedutil.As[*resourcev1alpha2.ResourceClaimParameters](oldObj, newObj)
   431  	if err != nil {
   432  		// Shouldn't happen.
   433  		return framework.Queue, fmt.Errorf("unexpected object in isSchedulableAfterClaimParametersChange: %w", err)
   434  	}
   435  
   436  	usesParameters := false
   437  	if err := pl.foreachPodResourceClaim(pod, func(_ string, claim *resourcev1alpha2.ResourceClaim) {
   438  		ref := claim.Spec.ParametersRef
   439  		if ref == nil {
   440  			return
   441  		}
   442  
   443  		// Using in-tree parameters directly?
   444  		if ref.APIGroup == resourcev1alpha2.SchemeGroupVersion.Group &&
   445  			ref.Kind == "ResourceClaimParameters" {
   446  			if modifiedParameters.Name == ref.Name {
   447  				usesParameters = true
   448  			}
   449  			return
   450  		}
   451  
   452  		// Need to look for translated parameters.
   453  		generatedFrom := modifiedParameters.GeneratedFrom
   454  		if generatedFrom == nil {
   455  			return
   456  		}
   457  		if generatedFrom.APIGroup == ref.APIGroup &&
   458  			generatedFrom.Kind == ref.Kind &&
   459  			generatedFrom.Name == ref.Name {
   460  			usesParameters = true
   461  		}
   462  	}); err != nil {
   463  		// This is not an unexpected error: we know that
   464  		// foreachPodResourceClaim only returns errors for "not
   465  		// schedulable".
    466  		logger.V(4).Info("pod is not schedulable", "pod", klog.KObj(pod), "claimParameters", klog.KObj(modifiedParameters), "reason", err.Error())
   467  		return framework.QueueSkip, nil
   468  	}
   469  
   470  	if !usesParameters {
    471  		// These were not the parameters the pod was waiting for.
   472  		logger.V(6).Info("unrelated claim parameters got modified", "pod", klog.KObj(pod), "claimParameters", klog.KObj(modifiedParameters))
   473  		return framework.QueueSkip, nil
   474  	}
   475  
   476  	if originalParameters == nil {
   477  		logger.V(4).Info("claim parameters for pod got created", "pod", klog.KObj(pod), "claimParameters", klog.KObj(modifiedParameters))
   478  		return framework.Queue, nil
   479  	}
   480  
    481  	// Modifications may or may not be relevant. If the requests
    482  	// are the same as before, then something else must have changed
   483  	// and we don't care.
   484  	if apiequality.Semantic.DeepEqual(&originalParameters.DriverRequests, &modifiedParameters.DriverRequests) {
   485  		logger.V(6).Info("claim parameters for pod got modified where the pod doesn't care", "pod", klog.KObj(pod), "claimParameters", klog.KObj(modifiedParameters))
   486  		return framework.QueueSkip, nil
   487  	}
   488  
   489  	logger.V(4).Info("requests in claim parameters for pod got updated", "pod", klog.KObj(pod), "claimParameters", klog.KObj(modifiedParameters))
   490  	return framework.Queue, nil
   491  }
   492  
   493  // isSchedulableAfterClassParametersChange is invoked for add and update class parameters events reported by
   494  // an informer. It checks whether that change made a previously unschedulable
   495  // pod schedulable. It errs on the side of letting a pod scheduling attempt
    496  // happen. A delete event will not invoke it, so newObj will never be nil.
   497  func (pl *dynamicResources) isSchedulableAfterClassParametersChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
   498  	originalParameters, modifiedParameters, err := schedutil.As[*resourcev1alpha2.ResourceClassParameters](oldObj, newObj)
   499  	if err != nil {
   500  		// Shouldn't happen.
   501  		return framework.Queue, fmt.Errorf("unexpected object in isSchedulableAfterClassParametersChange: %w", err)
   502  	}
   503  
   504  	usesParameters := false
   505  	if err := pl.foreachPodResourceClaim(pod, func(_ string, claim *resourcev1alpha2.ResourceClaim) {
   506  		class, err := pl.classLister.Get(claim.Spec.ResourceClassName)
   507  		if err != nil {
   508  			if !apierrors.IsNotFound(err) {
   509  				logger.Error(err, "look up resource class")
   510  			}
   511  			return
   512  		}
   513  		ref := class.ParametersRef
   514  		if ref == nil {
   515  			return
   516  		}
   517  
   518  		// Using in-tree parameters directly?
   519  		if ref.APIGroup == resourcev1alpha2.SchemeGroupVersion.Group &&
   520  			ref.Kind == "ResourceClassParameters" {
   521  			if modifiedParameters.Name == ref.Name {
   522  				usesParameters = true
   523  			}
   524  			return
   525  		}
   526  
   527  		// Need to look for translated parameters.
   528  		generatedFrom := modifiedParameters.GeneratedFrom
   529  		if generatedFrom == nil {
   530  			return
   531  		}
   532  		if generatedFrom.APIGroup == ref.APIGroup &&
   533  			generatedFrom.Kind == ref.Kind &&
   534  			generatedFrom.Name == ref.Name {
   535  			usesParameters = true
   536  		}
   537  	}); err != nil {
   538  		// This is not an unexpected error: we know that
   539  		// foreachPodResourceClaim only returns errors for "not
   540  		// schedulable".
   541  		logger.V(4).Info("pod is not schedulable", "pod", klog.KObj(pod), "classParameters", klog.KObj(modifiedParameters), "reason", err.Error())
   542  		return framework.QueueSkip, nil
   543  	}
   544  
   545  	if !usesParameters {
    546  		// These were not the parameters the pod was waiting for.
   547  		logger.V(6).Info("unrelated class parameters got modified", "pod", klog.KObj(pod), "classParameters", klog.KObj(modifiedParameters))
   548  		return framework.QueueSkip, nil
   549  	}
   550  
   551  	if originalParameters == nil {
    552  		logger.V(4).Info("class parameters for pod got created", "pod", klog.KObj(pod), "classParameters", klog.KObj(modifiedParameters))
   553  		return framework.Queue, nil
   554  	}
   555  
    556  	// Modifications may or may not be relevant. If the filters
    557  	// are the same as before, then something else must have changed
    558  	// and we don't care.
   559  	if apiequality.Semantic.DeepEqual(&originalParameters.Filters, &modifiedParameters.Filters) {
   560  		logger.V(6).Info("class parameters for pod got modified where the pod doesn't care", "pod", klog.KObj(pod), "classParameters", klog.KObj(modifiedParameters))
   561  		return framework.QueueSkip, nil
   562  	}
   563  
   564  	logger.V(4).Info("filters in class parameters for pod got updated", "pod", klog.KObj(pod), "classParameters", klog.KObj(modifiedParameters))
   565  	return framework.Queue, nil
   566  }
   567  
   568  // isSchedulableAfterClaimChange is invoked for add and update claim events reported by
   569  // an informer. It checks whether that change made a previously unschedulable
   570  // pod schedulable. It errs on the side of letting a pod scheduling attempt
   571  // happen. The delete claim event will not invoke it, so newObj will never be nil.
   572  func (pl *dynamicResources) isSchedulableAfterClaimChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
   573  	originalClaim, modifiedClaim, err := schedutil.As[*resourcev1alpha2.ResourceClaim](oldObj, newObj)
   574  	if err != nil {
   575  		// Shouldn't happen.
   576  		return framework.Queue, fmt.Errorf("unexpected object in isSchedulableAfterClaimChange: %w", err)
   577  	}
   578  
   579  	usesClaim := false
   580  	if err := pl.foreachPodResourceClaim(pod, func(_ string, claim *resourcev1alpha2.ResourceClaim) {
   581  		if claim.UID == modifiedClaim.UID {
   582  			usesClaim = true
   583  		}
   584  	}); err != nil {
   585  		// This is not an unexpected error: we know that
   586  		// foreachPodResourceClaim only returns errors for "not
   587  		// schedulable".
   588  		logger.V(4).Info("pod is not schedulable", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim), "reason", err.Error())
   589  		return framework.QueueSkip, nil
   590  	}
   591  
   592  	if originalClaim != nil &&
   593  		resourceclaim.IsAllocatedWithStructuredParameters(originalClaim) &&
   594  		modifiedClaim.Status.Allocation == nil {
   595  		// A claim with structured parameters was deallocated. This might have made
   596  		// resources available for other pods.
   597  		//
   598  		// TODO (https://github.com/kubernetes/kubernetes/issues/123697):
   599  		// check that the pending claims depend on structured parameters (depends on refactoring foreachPodResourceClaim, see other TODO).
   600  		//
   601  		// There is a small race here:
   602  		// - The dynamicresources plugin allocates claim A and updates the assume cache.
   603  		// - A second pod gets marked as unschedulable based on that assume cache.
   604  		// - Before the informer cache here catches up, the pod runs, terminates and
   605  		//   the claim gets deallocated without ever sending the claim status with
   606  		//   allocation to the scheduler.
   607  		// - The comparison below is for a *very* old claim with no allocation and the
   608  		//   new claim where the allocation is already removed again, so no
   609  		//   RemovedClaimAllocation event gets emitted.
   610  		//
   611  		// This is extremely unlikely and thus a fix is not needed for alpha in Kubernetes 1.30.
   612  		// TODO (https://github.com/kubernetes/kubernetes/issues/123698): The solution is to somehow integrate the assume cache
   613  		// into the event mechanism. This can be tackled together with adding autoscaler
   614  		// support, which also needs to do something with the assume cache.
   615  		logger.V(6).Info("claim with structured parameters got deallocated", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
   616  		return framework.Queue, nil
   617  	}
   618  
   619  	if !usesClaim {
   620  		// This was not the claim the pod was waiting for.
   621  		logger.V(6).Info("unrelated claim got modified", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
   622  		return framework.QueueSkip, nil
   623  	}
   624  
   625  	if originalClaim == nil {
   626  		logger.V(4).Info("claim for pod got created", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
   627  		return framework.Queue, nil
   628  	}
   629  
    630  	// Modifications may or may not be relevant. If the status
    631  	// is the same as before, then something else must have changed
   632  	// and we don't care. What happens in practice is that the
   633  	// resource driver adds the finalizer.
   634  	if apiequality.Semantic.DeepEqual(&originalClaim.Status, &modifiedClaim.Status) {
   635  		if loggerV := logger.V(7); loggerV.Enabled() {
   636  			// Log more information.
   637  			loggerV.Info("claim for pod got modified where the pod doesn't care", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim), "diff", cmp.Diff(originalClaim, modifiedClaim))
   638  		} else {
   639  			logger.V(6).Info("claim for pod got modified where the pod doesn't care", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
   640  		}
   641  		return framework.QueueSkip, nil
   642  	}
   643  
   644  	logger.V(4).Info("status of claim for pod got updated", "pod", klog.KObj(pod), "claim", klog.KObj(modifiedClaim))
   645  	return framework.Queue, nil
   646  }
   647  
   648  // isSchedulableAfterPodSchedulingContextChange is invoked for all
   649  // PodSchedulingContext events reported by an informer. It checks whether that
   650  // change made a previously unschedulable pod schedulable (updated) or a new
   651  // attempt is needed to re-create the object (deleted). It errs on the side of
   652  // letting a pod scheduling attempt happen.
   653  func (pl *dynamicResources) isSchedulableAfterPodSchedulingContextChange(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
   654  	// Deleted? That can happen because we ourselves delete the PodSchedulingContext while
   655  	// working on the pod. This can be ignored.
   656  	if oldObj != nil && newObj == nil {
   657  		logger.V(4).Info("PodSchedulingContext got deleted")
   658  		return framework.QueueSkip, nil
   659  	}
   660  
   661  	oldPodScheduling, newPodScheduling, err := schedutil.As[*resourcev1alpha2.PodSchedulingContext](oldObj, newObj)
   662  	if err != nil {
   663  		// Shouldn't happen.
   664  		return framework.Queue, fmt.Errorf("unexpected object in isSchedulableAfterPodSchedulingContextChange: %w", err)
   665  	}
   666  	podScheduling := newPodScheduling // Never nil because deletes are handled above.
   667  
   668  	if podScheduling.Name != pod.Name || podScheduling.Namespace != pod.Namespace {
   669  		logger.V(7).Info("PodSchedulingContext for unrelated pod got modified", "pod", klog.KObj(pod), "podScheduling", klog.KObj(podScheduling))
   670  		return framework.QueueSkip, nil
   671  	}
   672  
   673  	// If the drivers have provided information about all
   674  	// unallocated claims with delayed allocation, then the next
   675  	// scheduling attempt is able to pick a node, so we let it run
    676  	// immediately if this occurred for the first time; otherwise
    677  	// we allow backoff.
   678  	pendingDelayedClaims := 0
   679  	if err := pl.foreachPodResourceClaim(pod, func(podResourceName string, claim *resourcev1alpha2.ResourceClaim) {
   680  		if claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer &&
   681  			claim.Status.Allocation == nil &&
   682  			!podSchedulingHasClaimInfo(podScheduling, podResourceName) {
   683  			pendingDelayedClaims++
   684  		}
   685  	}); err != nil {
   686  		// This is not an unexpected error: we know that
   687  		// foreachPodResourceClaim only returns errors for "not
   688  		// schedulable".
   689  		logger.V(4).Info("pod is not schedulable, keep waiting", "pod", klog.KObj(pod), "reason", err.Error())
   690  		return framework.QueueSkip, nil
   691  	}
   692  
   693  	// Some driver responses missing?
   694  	if pendingDelayedClaims > 0 {
   695  		// We could start a pod scheduling attempt to refresh the
   696  		// potential nodes list.  But pod scheduling attempts are
   697  		// expensive and doing them too often causes the pod to enter
   698  		// backoff. Let's wait instead for all drivers to reply.
   699  		if loggerV := logger.V(6); loggerV.Enabled() {
   700  			loggerV.Info("PodSchedulingContext with missing resource claim information, keep waiting", "pod", klog.KObj(pod), "podSchedulingDiff", cmp.Diff(oldPodScheduling, podScheduling))
   701  		} else {
   702  			logger.V(5).Info("PodSchedulingContext with missing resource claim information, keep waiting", "pod", klog.KObj(pod))
   703  		}
   704  		return framework.QueueSkip, nil
   705  	}
   706  
   707  	if oldPodScheduling == nil /* create */ ||
   708  		len(oldPodScheduling.Status.ResourceClaims) < len(podScheduling.Status.ResourceClaims) /* new information and not incomplete (checked above) */ {
   709  		// This definitely is new information for the scheduler. Try again immediately.
   710  		logger.V(4).Info("PodSchedulingContext for pod has all required information, schedule immediately", "pod", klog.KObj(pod))
   711  		return framework.Queue, nil
   712  	}
   713  
   714  	// The other situation where the scheduler needs to do
   715  	// something immediately is when the selected node doesn't
    716  	// work: waiting in the backoff queue only helps if resources
    717  	// on the selected node eventually become available again. It's
   718  	// much more likely, in particular when trying to fill up the
   719  	// cluster, that the choice simply didn't work out. The risk
   720  	// here is that in a situation where the cluster really is
   721  	// full, backoff won't be used because the scheduler keeps
   722  	// trying different nodes. This should not happen when it has
   723  	// full knowledge about resource availability (=
   724  	// PodSchedulingContext.*.UnsuitableNodes is complete) but may happen
   725  	// when it doesn't (= PodSchedulingContext.*.UnsuitableNodes had to be
   726  	// truncated).
   727  	//
   728  	// Truncation only happens for very large clusters and then may slow
   729  	// down scheduling, but should not break it completely. This is
   730  	// acceptable while DRA is alpha and will be investigated further
   731  	// before moving DRA to beta.
   732  	if podScheduling.Spec.SelectedNode != "" {
   733  		for _, claimStatus := range podScheduling.Status.ResourceClaims {
   734  			if slices.Contains(claimStatus.UnsuitableNodes, podScheduling.Spec.SelectedNode) {
   735  				logger.V(5).Info("PodSchedulingContext has unsuitable selected node, schedule immediately", "pod", klog.KObj(pod), "selectedNode", podScheduling.Spec.SelectedNode, "podResourceName", claimStatus.Name)
   736  				return framework.Queue, nil
   737  			}
   738  		}
   739  	}
   740  
   741  	// Update with only the spec modified?
   742  	if oldPodScheduling != nil &&
   743  		!apiequality.Semantic.DeepEqual(&oldPodScheduling.Spec, &podScheduling.Spec) &&
   744  		apiequality.Semantic.DeepEqual(&oldPodScheduling.Status, &podScheduling.Status) {
   745  		logger.V(5).Info("PodSchedulingContext has only the scheduler spec changes, ignore the update", "pod", klog.KObj(pod))
   746  		return framework.QueueSkip, nil
   747  	}
   748  
   749  	// Once we get here, all changes which are known to require special responses
   750  	// have been checked for. Whatever the change was, we don't know exactly how
   751  	// to handle it and thus return Queue. This will cause the
   752  	// scheduler to treat the event as if no event hint callback had been provided.
   753  	// Developers who want to investigate this can enable a diff at log level 6.
   754  	if loggerV := logger.V(6); loggerV.Enabled() {
   755  		loggerV.Info("PodSchedulingContext for pod with unknown changes, maybe schedule", "pod", klog.KObj(pod), "podSchedulingDiff", cmp.Diff(oldPodScheduling, podScheduling))
   756  	} else {
   757  		logger.V(5).Info("PodSchedulingContext for pod with unknown changes, maybe schedule", "pod", klog.KObj(pod))
   758  	}
   759  	return framework.Queue, nil
   760  
   761  }
   762  
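        // podSchedulingHasClaimInfo returns true if the PodSchedulingContext status
        // already contains an entry for the given pod resource claim name.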
   763  func podSchedulingHasClaimInfo(podScheduling *resourcev1alpha2.PodSchedulingContext, podResourceName string) bool {
   764  	for _, claimStatus := range podScheduling.Status.ResourceClaims {
   765  		if claimStatus.Name == podResourceName {
   766  			return true
   767  		}
   768  	}
   769  	return false
   770  }
   771  
    772  // podResourceClaims returns the ResourceClaims for all pod.Spec.ResourceClaims.
   773  func (pl *dynamicResources) podResourceClaims(pod *v1.Pod) ([]*resourcev1alpha2.ResourceClaim, error) {
   774  	claims := make([]*resourcev1alpha2.ResourceClaim, 0, len(pod.Spec.ResourceClaims))
   775  	if err := pl.foreachPodResourceClaim(pod, func(_ string, claim *resourcev1alpha2.ResourceClaim) {
   776  		// We store the pointer as returned by the lister. The
   777  		// assumption is that if a claim gets modified while our code
   778  		// runs, the cache will store a new pointer, not mutate the
   779  		// existing object that we point to here.
   780  		claims = append(claims, claim)
   781  	}); err != nil {
   782  		return nil, err
   783  	}
   784  	return claims, nil
   785  }
   786  
   787  // foreachPodResourceClaim checks that each ResourceClaim for the pod exists.
   788  // It calls an optional handler for those claims that it finds.
   789  func (pl *dynamicResources) foreachPodResourceClaim(pod *v1.Pod, cb func(podResourceName string, claim *resourcev1alpha2.ResourceClaim)) error {
   790  	for _, resource := range pod.Spec.ResourceClaims {
   791  		claimName, mustCheckOwner, err := pl.claimNameLookup.Name(pod, &resource)
   792  		if err != nil {
   793  			return err
   794  		}
   795  		// The claim name might be nil if no underlying resource claim
   796  		// was generated for the referenced claim. There are valid use
   797  		// cases when this might happen, so we simply skip it.
   798  		if claimName == nil {
   799  			continue
   800  		}
   801  		claim, err := pl.claimLister.ResourceClaims(pod.Namespace).Get(*claimName)
   802  		if err != nil {
   803  			return err
   804  		}
   805  
   806  		if claim.DeletionTimestamp != nil {
   807  			return fmt.Errorf("resourceclaim %q is being deleted", claim.Name)
   808  		}
   809  
   810  		if mustCheckOwner {
   811  			if err := resourceclaim.IsForPod(pod, claim); err != nil {
   812  				return err
   813  			}
   814  		}
   815  		if cb != nil {
   816  			cb(resource.Name, claim)
   817  		}
   818  	}
   819  	return nil
   820  }
   821  
    822  // PreFilter is invoked at the prefilter extension point to check if the pod has all
   823  // immediate claims bound. UnschedulableAndUnresolvable is returned if
   824  // the pod cannot be scheduled at the moment on any node.
   825  func (pl *dynamicResources) PreFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
   826  	if !pl.enabled {
   827  		return nil, framework.NewStatus(framework.Skip)
   828  	}
   829  	logger := klog.FromContext(ctx)
   830  
   831  	// If the pod does not reference any claim, we don't need to do
   832  	// anything for it. We just initialize an empty state to record that
   833  	// observation for the other functions. This gets updated below
   834  	// if we get that far.
   835  	s := &stateData{}
   836  	state.Write(stateKey, s)
   837  
   838  	claims, err := pl.podResourceClaims(pod)
   839  	if err != nil {
   840  		return nil, statusUnschedulable(logger, err.Error())
   841  	}
   842  	logger.V(5).Info("pod resource claims", "pod", klog.KObj(pod), "resourceclaims", klog.KObjSlice(claims))
   843  
   844  	// If the pod does not reference any claim,
   845  	// DynamicResources Filter has nothing to do with the Pod.
   846  	if len(claims) == 0 {
   847  		return nil, framework.NewStatus(framework.Skip)
   848  	}
   849  
   850  	// Fetch PodSchedulingContext, it's going to be needed when checking claims.
   851  	if err := s.podSchedulingState.init(ctx, pod, pl.podSchedulingContextLister); err != nil {
   852  		return nil, statusError(logger, err)
   853  	}
   854  
   855  	s.informationsForClaim = make([]informationForClaim, len(claims))
   856  	needResourceInformation := false
   857  	for index, claim := range claims {
   858  		if claim.Status.DeallocationRequested {
   859  			// This will get resolved by the resource driver.
   860  			return nil, statusUnschedulable(logger, "resourceclaim must be reallocated", "pod", klog.KObj(pod), "resourceclaim", klog.KObj(claim))
   861  		}
   862  		if claim.Status.Allocation != nil &&
   863  			!resourceclaim.CanBeReserved(claim) &&
   864  			!resourceclaim.IsReservedForPod(pod, claim) {
   865  			// Resource is in use. The pod has to wait.
   866  			return nil, statusUnschedulable(logger, "resourceclaim in use", "pod", klog.KObj(pod), "resourceclaim", klog.KObj(claim))
   867  		}
   868  
   869  		if claim.Status.Allocation != nil {
   870  			if claim.Status.Allocation.AvailableOnNodes != nil {
   871  				nodeSelector, err := nodeaffinity.NewNodeSelector(claim.Status.Allocation.AvailableOnNodes)
   872  				if err != nil {
   873  					return nil, statusError(logger, err)
   874  				}
   875  				s.informationsForClaim[index].availableOnNode = nodeSelector
   876  			}
   877  
   878  			// The claim was allocated by the scheduler if it has the finalizer that is
   879  			// reserved for Kubernetes.
   880  			s.informationsForClaim[index].structuredParameters = slices.Contains(claim.Finalizers, resourcev1alpha2.Finalizer)
   881  		} else {
   882  			// The ResourceClass might have a node filter. This is
   883  			// useful for trimming the initial set of potential
   884  			// nodes before we ask the driver(s) for information
   885  			// about the specific pod.
   886  			class, err := pl.classLister.Get(claim.Spec.ResourceClassName)
   887  			if err != nil {
   888  				// If the class cannot be retrieved, allocation cannot proceed.
   889  				if apierrors.IsNotFound(err) {
   890  					// Here we mark the pod as "unschedulable", so it'll sleep in
    891  					// the unschedulable queue until a ResourceClass event occurs.
   892  					return nil, statusUnschedulable(logger, fmt.Sprintf("resource class %s does not exist", claim.Spec.ResourceClassName))
   893  				}
   894  				// Other error, retry with backoff.
   895  				return nil, statusError(logger, fmt.Errorf("look up resource class: %v", err))
   896  			}
   897  			if class.SuitableNodes != nil {
   898  				selector, err := nodeaffinity.NewNodeSelector(class.SuitableNodes)
   899  				if err != nil {
   900  					return nil, statusError(logger, err)
   901  				}
   902  				s.informationsForClaim[index].availableOnNode = selector
   903  			}
   904  			s.informationsForClaim[index].status = statusForClaim(s.podSchedulingState.schedulingCtx, pod.Spec.ResourceClaims[index].Name)
   905  
   906  			if class.StructuredParameters != nil && *class.StructuredParameters {
   907  				s.informationsForClaim[index].structuredParameters = true
   908  
   909  				// Allocation in flight? Better wait for that
   910  				// to finish, see inFlightAllocations
   911  				// documentation for details.
   912  				if _, found := pl.inFlightAllocations.Load(claim.UID); found {
   913  					return nil, statusUnschedulable(logger, fmt.Sprintf("resource claim %s is in the process of being allocated", klog.KObj(claim)))
   914  				}
   915  
   916  				// We need the claim and class parameters. If
   917  				// they don't exist yet, the pod has to wait.
   918  				//
   919  				// TODO (https://github.com/kubernetes/kubernetes/issues/123697):
   920  				// check this already in foreachPodResourceClaim, together with setting up informationsForClaim.
   921  				// Then PreEnqueue will also check for existence of parameters.
   922  				classParameters, claimParameters, status := pl.lookupParameters(logger, class, claim)
   923  				if status != nil {
   924  					return nil, status
   925  				}
   926  				controller, err := newClaimController(logger, class, classParameters, claimParameters)
   927  				if err != nil {
   928  					return nil, statusError(logger, err)
   929  				}
   930  				s.informationsForClaim[index].controller = controller
   931  				needResourceInformation = true
   932  			} else if claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeImmediate {
   933  				// This will get resolved by the resource driver.
   934  				return nil, statusUnschedulable(logger, "unallocated immediate resourceclaim", "pod", klog.KObj(pod), "resourceclaim", klog.KObj(claim))
   935  			}
   936  		}
   937  	}
   938  
   939  	if needResourceInformation {
   940  		// Doing this over and over again for each pod could be avoided
   941  		// by parsing once when creating the plugin and then updating
   942  		// that state in informer callbacks. But that would cause
   943  		// problems for using the plugin in the Cluster Autoscaler. If
   944  		// this step here turns out to be expensive, we may have to
   945  		// maintain and update state more persistently.
   946  		//
   947  		// Claims are treated as "allocated" if they are in the assume cache
   948  		// or currently their allocation is in-flight.
   949  		resources, err := newResourceModel(logger, pl.resourceSliceLister, pl.claimAssumeCache, &pl.inFlightAllocations)
   950  		logger.V(5).Info("Resource usage", "resources", klog.Format(resources))
   951  		if err != nil {
   952  			return nil, statusError(logger, err)
   953  		}
   954  		s.resources = resources
   955  	}
   956  
   957  	s.claims = claims
   958  	return nil, nil
   959  }
   960  
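        // lookupParameters retrieves the class and claim parameters referenced by a
        // claim with structured parameters. A non-nil status means that the pod cannot
        // be scheduled yet (parameters missing) or that an error occurred.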
   961  func (pl *dynamicResources) lookupParameters(logger klog.Logger, class *resourcev1alpha2.ResourceClass, claim *resourcev1alpha2.ResourceClaim) (classParameters *resourcev1alpha2.ResourceClassParameters, claimParameters *resourcev1alpha2.ResourceClaimParameters, status *framework.Status) {
   962  	classParameters, status = pl.lookupClassParameters(logger, class)
   963  	if status != nil {
   964  		return
   965  	}
   966  	claimParameters, status = pl.lookupClaimParameters(logger, class, claim)
   967  	return
   968  }
   969  
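        // lookupClassParameters resolves the ResourceClassParameters for a class:
        // without a ParametersRef it returns empty defaults, for an in-tree reference
        // it uses the referenced object directly, otherwise it searches for generated
        // parameters whose GeneratedFrom matches the reference.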
   970  func (pl *dynamicResources) lookupClassParameters(logger klog.Logger, class *resourcev1alpha2.ResourceClass) (*resourcev1alpha2.ResourceClassParameters, *framework.Status) {
   971  	defaultClassParameters := resourcev1alpha2.ResourceClassParameters{}
   972  
   973  	if class.ParametersRef == nil {
   974  		return &defaultClassParameters, nil
   975  	}
   976  
   977  	if class.ParametersRef.APIGroup == resourcev1alpha2.SchemeGroupVersion.Group &&
   978  		class.ParametersRef.Kind == "ResourceClassParameters" {
   979  		// Use the parameters which were referenced directly.
   980  		parameters, err := pl.classParametersLister.ResourceClassParameters(class.ParametersRef.Namespace).Get(class.ParametersRef.Name)
   981  		if err != nil {
   982  			if apierrors.IsNotFound(err) {
   983  				return nil, statusUnschedulable(logger, fmt.Sprintf("class parameters %s not found", klog.KRef(class.ParametersRef.Namespace, class.ParametersRef.Name)))
   984  			}
    985  			return nil, statusError(logger, fmt.Errorf("get class parameters %s: %v", klog.KRef(class.ParametersRef.Namespace, class.ParametersRef.Name), err))
   986  		}
   987  		return parameters, nil
   988  	}
   989  
   990  	// TODO (https://github.com/kubernetes/kubernetes/issues/123731): use an indexer
   991  	allParameters, err := pl.classParametersLister.ResourceClassParameters(class.Namespace).List(labels.Everything())
   992  	if err != nil {
   993  		return nil, statusError(logger, fmt.Errorf("listing class parameters failed: %v", err))
   994  	}
   995  	for _, parameters := range allParameters {
   996  		if parameters.GeneratedFrom == nil {
   997  			continue
   998  		}
   999  		if parameters.GeneratedFrom.APIGroup == class.ParametersRef.APIGroup &&
  1000  			parameters.GeneratedFrom.Kind == class.ParametersRef.Kind &&
  1001  			parameters.GeneratedFrom.Name == class.ParametersRef.Name &&
  1002  			parameters.GeneratedFrom.Namespace == class.ParametersRef.Namespace {
  1003  			return parameters, nil
  1004  		}
  1005  	}
  1006  	return nil, statusUnschedulable(logger, fmt.Sprintf("generated class parameters for %s.%s %s not found", class.ParametersRef.Kind, class.ParametersRef.APIGroup, klog.KRef(class.Namespace, class.ParametersRef.Name)))
  1007  }
  1008  
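        // lookupClaimParameters resolves the ResourceClaimParameters for a claim in the
        // same way as lookupClassParameters does for classes. Without a ParametersRef it
        // falls back to default parameters which request any named resource advertised
        // by the class's driver.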
  1009  func (pl *dynamicResources) lookupClaimParameters(logger klog.Logger, class *resourcev1alpha2.ResourceClass, claim *resourcev1alpha2.ResourceClaim) (*resourcev1alpha2.ResourceClaimParameters, *framework.Status) {
  1010  	defaultClaimParameters := resourcev1alpha2.ResourceClaimParameters{
  1011  		Shareable: true,
  1012  		DriverRequests: []resourcev1alpha2.DriverRequests{
  1013  			{
  1014  				DriverName: class.DriverName,
  1015  				Requests: []resourcev1alpha2.ResourceRequest{
  1016  					{
  1017  						ResourceRequestModel: resourcev1alpha2.ResourceRequestModel{
  1018  							// TODO: This only works because NamedResources is
  1019  							// the only model currently implemented. We need to
  1020  							// match the default to how the resources of this
   1021  							// class are being advertised in a ResourceSlice.
  1022  							NamedResources: &resourcev1alpha2.NamedResourcesRequest{
  1023  								Selector: "true",
  1024  							},
  1025  						},
  1026  					},
  1027  				},
  1028  			},
  1029  		},
  1030  	}
  1031  
  1032  	if claim.Spec.ParametersRef == nil {
  1033  		return &defaultClaimParameters, nil
  1034  	}
  1035  	if claim.Spec.ParametersRef.APIGroup == resourcev1alpha2.SchemeGroupVersion.Group &&
  1036  		claim.Spec.ParametersRef.Kind == "ResourceClaimParameters" {
  1037  		// Use the parameters which were referenced directly.
  1038  		parameters, err := pl.claimParametersLister.ResourceClaimParameters(claim.Namespace).Get(claim.Spec.ParametersRef.Name)
  1039  		if err != nil {
  1040  			if apierrors.IsNotFound(err) {
  1041  				return nil, statusUnschedulable(logger, fmt.Sprintf("claim parameters %s not found", klog.KRef(claim.Namespace, claim.Spec.ParametersRef.Name)))
  1042  			}
  1043  			return nil, statusError(logger, fmt.Errorf("get claim parameters %s: %v", klog.KRef(claim.Namespace, claim.Spec.ParametersRef.Name), err))
  1044  		}
  1045  		return parameters, nil
  1046  	}
  1047  
  1048  	// TODO (https://github.com/kubernetes/kubernetes/issues/123731): use an indexer
  1049  	allParameters, err := pl.claimParametersLister.ResourceClaimParameters(claim.Namespace).List(labels.Everything())
  1050  	if err != nil {
  1051  		return nil, statusError(logger, fmt.Errorf("listing claim parameters failed: %v", err))
  1052  	}
  1053  	for _, parameters := range allParameters {
  1054  		if parameters.GeneratedFrom == nil {
  1055  			continue
  1056  		}
  1057  		if parameters.GeneratedFrom.APIGroup == claim.Spec.ParametersRef.APIGroup &&
  1058  			parameters.GeneratedFrom.Kind == claim.Spec.ParametersRef.Kind &&
  1059  			parameters.GeneratedFrom.Name == claim.Spec.ParametersRef.Name {
  1060  			return parameters, nil
  1061  		}
  1062  	}
  1063  	return nil, statusUnschedulable(logger, fmt.Sprintf("generated claim parameters for %s.%s %s not found", claim.Spec.ParametersRef.Kind, claim.Spec.ParametersRef.APIGroup, klog.KRef(claim.Namespace, claim.Spec.ParametersRef.Name)))
  1064  }
  1065  
  1066  // PreFilterExtensions returns prefilter extensions, pod add and remove.
  1067  func (pl *dynamicResources) PreFilterExtensions() framework.PreFilterExtensions {
  1068  	return nil
  1069  }
  1070  
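        // getStateData returns the stateData which PreFilter stored in the CycleState.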
  1071  func getStateData(cs *framework.CycleState) (*stateData, error) {
  1072  	state, err := cs.Read(stateKey)
  1073  	if err != nil {
  1074  		return nil, err
  1075  	}
  1076  	s, ok := state.(*stateData)
  1077  	if !ok {
  1078  		return nil, errors.New("unable to convert state into stateData")
  1079  	}
  1080  	return s, nil
  1081  }
  1082  
   1083  // Filter is invoked at the filter extension point.
   1084  // It evaluates if a pod can fit based on the resources it requests,
  1085  // for both allocated and unallocated claims.
  1086  //
  1087  // For claims that are bound, then it checks that the node affinity is
  1088  // satisfied by the given node.
  1089  //
  1090  // For claims that are unbound, it checks whether the claim might get allocated
  1091  // for the node.
  1092  func (pl *dynamicResources) Filter(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
  1093  	if !pl.enabled {
  1094  		return nil
  1095  	}
  1096  	state, err := getStateData(cs)
  1097  	if err != nil {
  1098  		return statusError(klog.FromContext(ctx), err)
  1099  	}
  1100  	if len(state.claims) == 0 {
  1101  		return nil
  1102  	}
  1103  
  1104  	logger := klog.FromContext(ctx)
  1105  	node := nodeInfo.Node()
  1106  
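        	// unavailableClaims collects the indices of claims which are already
        	// allocated, but whose allocation cannot be used on this node. PostFilter
        	// may later pick one of them and request its deallocation.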
  1107  	var unavailableClaims []int
  1108  	for index, claim := range state.claims {
  1109  		logger.V(10).Info("filtering based on resource claims of the pod", "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaim", klog.KObj(claim))
  1110  		switch {
  1111  		case claim.Status.Allocation != nil:
  1112  			if nodeSelector := state.informationsForClaim[index].availableOnNode; nodeSelector != nil {
  1113  				if !nodeSelector.Match(node) {
  1114  					logger.V(5).Info("AvailableOnNodes does not match", "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaim", klog.KObj(claim))
  1115  					unavailableClaims = append(unavailableClaims, index)
  1116  				}
  1117  			}
  1118  		case claim.Status.DeallocationRequested:
  1119  			// We shouldn't get here. PreFilter already checked this.
  1120  			return statusUnschedulable(logger, "resourceclaim must be reallocated", "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaim", klog.KObj(claim))
  1121  		case claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer ||
  1122  			state.informationsForClaim[index].structuredParameters:
  1123  			if selector := state.informationsForClaim[index].availableOnNode; selector != nil {
  1124  				if matches := selector.Match(node); !matches {
  1125  					return statusUnschedulable(logger, "excluded by resource class node filter", "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclassName", claim.Spec.ResourceClassName)
  1126  				}
  1127  			}
  1128  			// Can the builtin controller tell us whether the node is suitable?
  1129  			if state.informationsForClaim[index].structuredParameters {
  1130  				suitable, err := state.informationsForClaim[index].controller.nodeIsSuitable(ctx, node.Name, state.resources)
  1131  				if err != nil {
  1132  					// An error indicates that something wasn't configured correctly, for example
  1133  					// writing a CEL expression which doesn't handle a map lookup error. Normally
  1134  					// this should never fail. We could return an error here, but then the pod
  1135  					// would get retried. Instead we ignore the node.
  1136  					return statusUnschedulable(logger, fmt.Sprintf("checking structured parameters failed: %v", err), "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaim", klog.KObj(claim))
  1137  				}
  1138  				if !suitable {
  1139  					return statusUnschedulable(logger, "resourceclaim cannot be allocated for the node (unsuitable)", "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaim", klog.KObj(claim))
  1140  				}
  1141  			} else {
  1142  				if status := state.informationsForClaim[index].status; status != nil {
  1143  					for _, unsuitableNode := range status.UnsuitableNodes {
  1144  						if node.Name == unsuitableNode {
  1145  							return statusUnschedulable(logger, "resourceclaim cannot be allocated for the node (unsuitable)", "pod", klog.KObj(pod), "node", klog.KObj(node), "resourceclaim", klog.KObj(claim), "unsuitablenodes", status.UnsuitableNodes)
  1146  						}
  1147  					}
  1148  				}
  1149  			}
  1150  		default:
  1151  			// This claim should have been handled above.
  1152  			// Immediate allocation with control plane controller
  1153  			// was already checked for in PreFilter.
  1154  			return statusError(logger, fmt.Errorf("internal error, unexpected allocation mode %v", claim.Spec.AllocationMode))
  1155  		}
  1156  	}
  1157  
  1158  	if len(unavailableClaims) > 0 {
  1159  		state.mutex.Lock()
  1160  		defer state.mutex.Unlock()
  1161  		if state.unavailableClaims == nil {
  1162  			state.unavailableClaims = sets.New[int]()
  1163  		}
  1164  
  1165  		for _, index := range unavailableClaims {
  1166  			claim := state.claims[index]
  1167  			// Deallocation makes more sense for claims with
  1168  			// delayed allocation. Claims with immediate allocation
  1169  			// would just get allocated again for a random node,
  1170  			// which is unlikely to help the pod.
  1171  			//
  1172  			// Claims with builtin controller are handled like
  1173  			// claims with delayed allocation.
  1174  			if claim.Spec.AllocationMode == resourcev1alpha2.AllocationModeWaitForFirstConsumer ||
  1175  				state.informationsForClaim[index].controller != nil {
  1176  				state.unavailableClaims.Insert(index)
  1177  			}
  1178  		}
  1179  		return statusUnschedulable(logger, "resourceclaim not available on the node", "pod", klog.KObj(pod))
  1180  	}
  1181  
  1182  	return nil
  1183  }
  1184  
  1185  // PostFilter checks whether there are allocated claims that could get
  1186  // deallocated to help get the Pod schedulable. If yes, it picks one and
  1187  // requests its deallocation.  This only gets called when filtering found no
  1188  // suitable node.
  1189  func (pl *dynamicResources) PostFilter(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, filteredNodeStatusMap framework.NodeToStatusMap) (*framework.PostFilterResult, *framework.Status) {
  1190  	if !pl.enabled {
  1191  		return nil, framework.NewStatus(framework.Unschedulable, "plugin disabled")
  1192  	}
  1193  	logger := klog.FromContext(ctx)
  1194  	state, err := getStateData(cs)
  1195  	if err != nil {
  1196  		return nil, statusError(logger, err)
  1197  	}
  1198  	if len(state.claims) == 0 {
  1199  		return nil, framework.NewStatus(framework.Unschedulable, "no new claims to deallocate")
  1200  	}
  1201  
  1202  	// Iterating over a map is random. This is intentional here, we want to
  1203  	// pick one claim randomly because there is no better heuristic.
  1204  	for index := range state.unavailableClaims {
  1205  		claim := state.claims[index]
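        		// Only touch claims which no other pod is using: either nobody has
        		// reserved the claim, or the only reservation is for this pod.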
  1206  		if len(claim.Status.ReservedFor) == 0 ||
  1207  			len(claim.Status.ReservedFor) == 1 && claim.Status.ReservedFor[0].UID == pod.UID {
  1208  			// Is the claim handled by the builtin controller?
  1209  			// Then we can simply clear the allocation. Once the
  1210  			// claim informer catches up, the controllers will
  1211  			// be notified about this change.
  1212  			clearAllocation := state.informationsForClaim[index].structuredParameters
  1213  
  1214  			// Before we tell a driver to deallocate a claim, we
  1215  			// have to stop telling it to allocate. Otherwise,
  1216  			// depending on timing, it will deallocate the claim,
  1217  			// see a PodSchedulingContext with selected node, and
  1218  			// allocate again for that same node.
  1219  			if !clearAllocation &&
  1220  				state.podSchedulingState.schedulingCtx != nil &&
  1221  				state.podSchedulingState.schedulingCtx.Spec.SelectedNode != "" {
  1222  				state.podSchedulingState.selectedNode = ptr.To("")
  1223  				if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil {
  1224  					return nil, statusError(logger, err)
  1225  				}
  1226  			}
  1227  
  1228  			claim := claim.DeepCopy()
  1229  			claim.Status.ReservedFor = nil
  1230  			if clearAllocation {
  1231  				claim.Status.DriverName = ""
  1232  				claim.Status.Allocation = nil
  1233  			} else {
  1234  				claim.Status.DeallocationRequested = true
  1235  			}
  1236  			logger.V(5).Info("Requesting deallocation of ResourceClaim", "pod", klog.KObj(pod), "resourceclaim", klog.KObj(claim))
  1237  			if _, err := pl.clientset.ResourceV1alpha2().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{}); err != nil {
  1238  				return nil, statusError(logger, err)
  1239  			}
  1240  			return nil, framework.NewStatus(framework.Unschedulable, "deallocation of ResourceClaim completed")
  1241  		}
  1242  	}
  1243  	return nil, framework.NewStatus(framework.Unschedulable, "still not schedulable")
  1244  }
  1245  
  1246  // PreScore is passed a list of all nodes that would fit the pod. Not all
  1247  // claims are necessarily allocated yet, so here we can record the potential
  1248  // nodes for those which are pending.
  1249  func (pl *dynamicResources) PreScore(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodes []*framework.NodeInfo) *framework.Status {
  1250  	if !pl.enabled {
  1251  		return nil
  1252  	}
  1253  	state, err := getStateData(cs)
  1254  	if err != nil {
  1255  		return statusError(klog.FromContext(ctx), err)
  1256  	}
  1257  	defer func() {
  1258  		state.preScored = true
  1259  	}()
  1260  	if len(state.claims) == 0 {
  1261  		return nil
  1262  	}
  1263  
  1264  	logger := klog.FromContext(ctx)
  1265  	pending := false
  1266  	for index, claim := range state.claims {
  1267  		if claim.Status.Allocation == nil &&
  1268  			state.informationsForClaim[index].controller == nil {
  1269  			pending = true
  1270  			break
  1271  		}
  1272  	}
  1273  	if !pending {
  1274  		logger.V(5).Info("no pending claims with control plane controller", "pod", klog.KObj(pod))
  1275  		return nil
  1276  	}
  1277  
  1278  	if haveAllPotentialNodes(state.podSchedulingState.schedulingCtx, nodes) {
  1279  		logger.V(5).Info("all potential nodes already set", "pod", klog.KObj(pod), "potentialnodes", klog.KObjSlice(nodes))
  1280  		return nil
  1281  	}
  1282  
  1283  	// Remember the potential nodes. The object will get created or
  1284  	// updated in Reserve. This is both an optimization and
  1285  	// covers the case that PreScore doesn't get called when there
  1286  	// is only a single node.
  1287  	logger.V(5).Info("remembering potential nodes", "pod", klog.KObj(pod), "potentialnodes", klog.KObjSlice(nodes))
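        	// The PotentialNodes list of a PodSchedulingContext is limited in size
        	// (PodSchedulingNodeListMaxSize), so the candidate list may have to be
        	// truncated.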
  1288  	numNodes := len(nodes)
  1289  	if numNodes > resourcev1alpha2.PodSchedulingNodeListMaxSize {
  1290  		numNodes = resourcev1alpha2.PodSchedulingNodeListMaxSize
  1291  	}
  1292  	potentialNodes := make([]string, 0, numNodes)
  1293  	if numNodes == len(nodes) {
  1294  		// Copy all node names.
  1295  		for _, node := range nodes {
  1296  			potentialNodes = append(potentialNodes, node.Node().Name)
  1297  		}
  1298  	} else {
  1299  		// Select a random subset of the nodes to comply with
  1300  		// the PotentialNodes length limit. Randomization is
  1301  		// done for us by Go which iterates over map entries
  1302  		// randomly.
  1303  		nodeNames := map[string]struct{}{}
  1304  		for _, node := range nodes {
  1305  			nodeNames[node.Node().Name] = struct{}{}
  1306  		}
  1307  		for nodeName := range nodeNames {
  1308  			if len(potentialNodes) >= resourcev1alpha2.PodSchedulingNodeListMaxSize {
  1309  				break
  1310  			}
  1311  			potentialNodes = append(potentialNodes, nodeName)
  1312  		}
  1313  	}
  1314  	sort.Strings(potentialNodes)
  1315  	state.podSchedulingState.potentialNodes = &potentialNodes
  1316  	return nil
  1317  }
  1318  
  1319  func haveAllPotentialNodes(schedulingCtx *resourcev1alpha2.PodSchedulingContext, nodes []*framework.NodeInfo) bool {
  1320  	if schedulingCtx == nil {
  1321  		return false
  1322  	}
  1323  	for _, node := range nodes {
  1324  		if !slices.Contains(schedulingCtx.Spec.PotentialNodes, node.Node().Name) {
  1325  			return false
  1326  		}
  1327  	}
  1328  	return true
  1329  }
  1330  
  1331  // Reserve reserves claims for the pod.
  1332  func (pl *dynamicResources) Reserve(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) (status *framework.Status) {
  1333  	if !pl.enabled {
  1334  		return nil
  1335  	}
  1336  	state, err := getStateData(cs)
  1337  	if err != nil {
  1338  		return statusError(klog.FromContext(ctx), err)
  1339  	}
  1340  	if len(state.claims) == 0 {
  1341  		return nil
  1342  	}
  1343  
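        	// Classify the claims: already allocated (nothing to do here), handled by
        	// the builtin controller (allocated further below), or pending delayed
        	// allocation through an external control plane controller (may need a
        	// PodSchedulingContext update).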
  1344  	numDelayedAllocationPending := 0
  1345  	numClaimsWithStatusInfo := 0
  1346  	claimsWithBuiltinController := make([]int, 0, len(state.claims))
  1347  	logger := klog.FromContext(ctx)
  1348  	for index, claim := range state.claims {
  1349  		if claim.Status.Allocation != nil {
  1350  			// Allocated, but perhaps not reserved yet. We checked in PreFilter that
  1351  			// the pod could reserve the claim. Instead of reserving here by
  1352  			// updating the ResourceClaim status, we assume that reserving
  1353  			// will work and only do it for real during binding. If it fails at
  1354  			// that time, some other pod was faster and we have to try again.
  1355  			continue
  1356  		}
  1357  
  1358  		// Do we have the builtin controller?
  1359  		if state.informationsForClaim[index].controller != nil {
  1360  			claimsWithBuiltinController = append(claimsWithBuiltinController, index)
  1361  			continue
  1362  		}
  1363  
  1364  		// Must be delayed allocation with control plane controller.
  1365  		numDelayedAllocationPending++
  1366  
  1367  		// Did the driver provide information that steered node
  1368  		// selection towards a node that it can support?
  1369  		if statusForClaim(state.podSchedulingState.schedulingCtx, pod.Spec.ResourceClaims[index].Name) != nil {
  1370  			numClaimsWithStatusInfo++
  1371  		}
  1372  	}
  1373  
  1374  	if numDelayedAllocationPending == 0 && len(claimsWithBuiltinController) == 0 {
  1375  		// Nothing left to do.
  1376  		return nil
  1377  	}
  1378  
  1379  	if !state.preScored && numDelayedAllocationPending > 0 {
  1380  		// There was only one candidate that passed the Filters and
  1381  		// therefore PreScore was not called.
  1382  		//
  1383  		// We need to ask whether that node is suitable, otherwise the
  1384  		// scheduler will pick it forever even when it cannot satisfy
  1385  		// the claim.
  1386  		if state.podSchedulingState.schedulingCtx == nil ||
  1387  			!slices.Contains(state.podSchedulingState.schedulingCtx.Spec.PotentialNodes, nodeName) {
  1388  			potentialNodes := []string{nodeName}
  1389  			state.podSchedulingState.potentialNodes = &potentialNodes
  1390  			logger.V(5).Info("asking for information about single potential node", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName})
  1391  		}
  1392  	}
  1393  
  1394  	// Prepare allocation of claims handled by the scheduler.
  1395  	for _, index := range claimsWithBuiltinController {
  1396  		claim := state.claims[index]
  1397  		driverName, allocation, err := state.informationsForClaim[index].controller.allocate(ctx, nodeName, state.resources)
  1398  		if err != nil {
  1399  			// We checked before that the node is suitable. This shouldn't have failed,
  1400  			// so treat this as an error.
  1401  			return statusError(logger, fmt.Errorf("claim allocation failed unexpectedly: %v", err))
  1402  		}
  1403  		state.informationsForClaim[index].allocation = allocation
  1404  		state.informationsForClaim[index].allocationDriverName = driverName
  1405  		// Strictly speaking, we don't need to store the full modified object.
  1406  		// The allocation would be enough. The full object is useful for
  1407  		// debugging and testing, so let's make it realistic.
  1408  		claim = claim.DeepCopy()
  1409  		claim.Finalizers = append(claim.Finalizers, resourcev1alpha2.Finalizer)
  1410  		claim.Status.DriverName = driverName
  1411  		claim.Status.Allocation = allocation
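        		// Remember the in-flight allocation. The entry gets removed again either
        		// in Unreserve (allocation abandoned) or in bindClaim (result written to
        		// the API server).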
  1412  		pl.inFlightAllocations.Store(claim.UID, claim)
  1413  		logger.V(5).Info("Reserved resource in allocation result", "claim", klog.KObj(claim), "driver", driverName, "allocation", klog.Format(allocation))
  1414  	}
  1415  
  1416  	// When there is only one pending resource, we can go ahead with
  1417  	// requesting allocation even when we don't have the information from
  1418  	// the driver yet. Otherwise we wait for information before blindly
  1419  	// making a decision that might have to be reversed later.
  1420  	//
  1421  	// If all pending claims are handled with the builtin controller,
  1422  	// there is no need for a PodSchedulingContext change.
  1423  	if numDelayedAllocationPending == 1 && len(claimsWithBuiltinController) == 0 ||
  1424  		numClaimsWithStatusInfo+len(claimsWithBuiltinController) == numDelayedAllocationPending && len(claimsWithBuiltinController) < numDelayedAllocationPending {
  1425  		// TODO: can we increase the chance that the scheduler picks
  1426  		// the same node as before when allocation is on-going,
  1427  		// assuming that that node still fits the pod?  Picking a
  1428  		// different node may lead to some claims being allocated for
  1429  		// one node and others for another, which then would have to be
  1430  		// resolved with deallocation.
  1431  		if state.podSchedulingState.schedulingCtx == nil ||
  1432  			state.podSchedulingState.schedulingCtx.Spec.SelectedNode != nodeName {
  1433  			state.podSchedulingState.selectedNode = &nodeName
  1434  			logger.V(5).Info("start allocation", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName})
  1435  			// The actual publish happens in PreBind or Unreserve.
  1436  			return nil
  1437  		}
  1438  	}
  1439  
  1440  	// May have been modified earlier in PreScore or above.
  1441  	if state.podSchedulingState.isDirty() {
  1442  		// The actual publish happens in PreBind or Unreserve.
  1443  		return nil
  1444  	}
  1445  
  1446  	// If all pending claims are handled with the builtin controller, then
  1447  	// we can allow the pod to proceed. Allocating and reserving the claims
  1448  	// will be done in PreBind.
  1449  	if numDelayedAllocationPending == 0 {
  1450  		return nil
  1451  	}
  1452  
  1453  	// More than one pending claim and not enough information about all of them.
  1454  	//
  1455  	// TODO: can or should we ensure that schedulingCtx gets aborted while
  1456  	// waiting for resources *before* triggering delayed volume
  1457  	// provisioning?  On the one hand, volume provisioning is currently
  1458  	// irreversible, so it better should come last. On the other hand,
  1459  	// triggering both in parallel might be faster.
  1460  	return statusPending(logger, "waiting for resource driver to provide information", "pod", klog.KObj(pod))
  1461  }
  1462  
  1463  // Unreserve removes the pod from the ReservedFor field of all its claims.
  1464  // It's idempotent, and does nothing if no state is found for the given pod.
  1465  func (pl *dynamicResources) Unreserve(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) {
  1466  	if !pl.enabled {
  1467  		return
  1468  	}
  1469  	state, err := getStateData(cs)
  1470  	if err != nil {
  1471  		return
  1472  	}
  1473  	if len(state.claims) == 0 {
  1474  		return
  1475  	}
  1476  
  1477  	logger := klog.FromContext(ctx)
  1478  
  1479  	// Was publishing delayed? If yes, do it now.
  1480  	//
  1481  	// The most common scenario is that a different set of potential nodes
  1482  	// was identified. This revised set needs to be published to enable DRA
  1483  	// drivers to provide better guidance for future scheduling attempts.
  1484  	if state.podSchedulingState.isDirty() {
  1485  		if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil {
  1486  			logger.Error(err, "publish PodSchedulingContext")
  1487  		}
  1488  	}
  1489  
  1490  	for index, claim := range state.claims {
  1491  		// If allocation was in-flight, then it's not anymore and we need to revert the
  1492  		// claim object in the assume cache to what it was before.
  1493  		if state.informationsForClaim[index].controller != nil {
  1494  			if _, found := pl.inFlightAllocations.LoadAndDelete(state.claims[index].UID); found {
  1495  				pl.claimAssumeCache.Restore(claim.Namespace + "/" + claim.Name)
  1496  			}
  1497  		}
  1498  
  1499  		if claim.Status.Allocation != nil &&
  1500  			resourceclaim.IsReservedForPod(pod, claim) {
  1501  			// Remove pod from ReservedFor. A strategic-merge-patch is used
  1502  			// because that allows removing an individual entry without having
  1503  			// the latest slice.
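        			// With placeholder UIDs, the rendered patch looks like:
        			//
        			//	{"metadata": {"uid": "<claim-uid>"},
        			//	 "status": {"reservedFor": [{"$patch": "delete", "uid": "<pod-uid>"}]}}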
  1504  			patch := fmt.Sprintf(`{"metadata": {"uid": %q}, "status": { "reservedFor": [ {"$patch": "delete", "uid": %q} ] }}`,
  1505  				claim.UID,
  1506  				pod.UID,
  1507  			)
  1508  			logger.V(5).Info("unreserve", "resourceclaim", klog.KObj(claim), "pod", klog.KObj(pod))
  1509  			claim, err := pl.clientset.ResourceV1alpha2().ResourceClaims(claim.Namespace).Patch(ctx, claim.Name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{}, "status")
  1510  			if err != nil {
  1511  				// We will get here again when pod scheduling is retried.
  1512  				logger.Error(err, "unreserve", "resourceclaim", klog.KObj(claim))
  1513  			}
  1514  		}
  1515  	}
  1516  }
  1517  
  1518  // PreBind gets called in a separate goroutine after it has been determined
  1519  // that the pod should get bound to this node. Because Reserve did not actually
  1520  // reserve claims, we need to do it now. For claims with the builtin controller,
  1521  // we also handle the allocation.
  1522  //
  1523  // If anything fails, we return an error and
  1524  // the pod will have to go into the backoff queue. The scheduler will call
  1525  // Unreserve as part of the error handling.
  1526  func (pl *dynamicResources) PreBind(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status {
  1527  	if !pl.enabled {
  1528  		return nil
  1529  	}
  1530  	state, err := getStateData(cs)
  1531  	if err != nil {
  1532  		return statusError(klog.FromContext(ctx), err)
  1533  	}
  1534  	if len(state.claims) == 0 {
  1535  		return nil
  1536  	}
  1537  
  1538  	logger := klog.FromContext(ctx)
  1539  
  1540  	// Was publishing delayed? If yes, do it now and then cause binding to stop.
  1541  	// This will not happen if all claims get handled by builtin controllers.
  1542  	if state.podSchedulingState.isDirty() {
  1543  		if err := state.podSchedulingState.publish(ctx, pod, pl.clientset); err != nil {
  1544  			return statusError(logger, err)
  1545  		}
  1546  		return statusPending(logger, "waiting for resource driver", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName})
  1547  	}
  1548  
  1549  	for index, claim := range state.claims {
  1550  		if !resourceclaim.IsReservedForPod(pod, claim) {
  1551  			claim, err := pl.bindClaim(ctx, state, index, pod, nodeName)
  1552  			if err != nil {
  1553  				return statusError(logger, err)
  1554  			}
  1555  			state.claims[index] = claim
  1556  		}
  1557  	}
  1558  	// If we get here, we know that reserving the claims for
  1559  	// the pod worked and we can proceed with binding them.
  1560  	return nil
  1561  }
  1562  
  1563  // bindClaim gets called by PreBind for a claim which is not reserved for the pod yet.
  1564  // It might not even be allocated. bindClaim then ensures that the allocation
  1565  // and reservation are recorded. This finishes the work started in Reserve.
  1566  func (pl *dynamicResources) bindClaim(ctx context.Context, state *stateData, index int, pod *v1.Pod, nodeName string) (patchedClaim *resourcev1alpha2.ResourceClaim, finalErr error) {
  1567  	logger := klog.FromContext(ctx)
  1568  	claim := state.claims[index]
  1569  	allocationPatch := ""
  1570  
  1571  	allocation := state.informationsForClaim[index].allocation
  1572  	logger.V(5).Info("preparing claim status patch", "claim", klog.KObj(state.claims[index]), "allocation", klog.Format(allocation))
  1573  
  1574  	// Do we need to store an allocation result from Reserve?
  1575  	if allocation != nil {
  1576  		buffer, err := json.Marshal(allocation)
  1577  		if err != nil {
  1578  			return nil, fmt.Errorf("marshaling AllocationResult failed: %v", err)
  1579  		}
  1580  		allocationPatch = fmt.Sprintf(`"driverName": %q, "allocation": %s, `, state.informationsForClaim[index].allocationDriverName, string(buffer))
  1581  
  1582  		// The finalizer needs to be added in a normal update. Using a simple update is fine
  1583  		// because we don't expect concurrent modifications while the claim is not allocated
  1584  		// yet. If there are any, we want to fail.
  1585  		//
  1586  		// If we were interrupted in the past, it might already be set and we simply continue.
  1587  		if !slices.Contains(claim.Finalizers, resourcev1alpha2.Finalizer) {
  1588  			claim := state.claims[index].DeepCopy()
  1589  			claim.Finalizers = append(claim.Finalizers, resourcev1alpha2.Finalizer)
  1590  			if _, err := pl.clientset.ResourceV1alpha2().ResourceClaims(claim.Namespace).Update(ctx, claim, metav1.UpdateOptions{}); err != nil {
  1591  				return nil, fmt.Errorf("add finalizer: %v", err)
  1592  			}
  1593  		}
  1594  	}
  1595  
  1596  	// The claim might be stale, for example because the claim can get shared and some
  1597  	// other goroutine has updated it in the meantime. We therefore cannot use
  1598  	// SSA here to add the pod because then we would have to send the entire slice
  1599  	// or use different field manager strings for each entry.
  1600  	//
  1601  	// With a strategic-merge-patch, we can simply send one new entry. The apiserver
  1602  	// validation will catch if two goroutines try to do that at the same time and
  1603  	// the claim cannot be shared.
  1604  	//
  1605  	// Note that this also works when the allocation result gets added twice because
  1606  	// two pods both started using a shared claim: the first pod to get here adds the
  1607  	// allocation result. The second pod then only adds itself to reservedFor.
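        	// With allocationPatch set and placeholder values, the rendered status patch
        	// looks roughly like:
        	//
        	//	{"metadata": {"uid": "<claim-uid>"},
        	//	 "status": {"driverName": "<driver>", "allocation": {...},
        	//	            "reservedFor": [{"resource": "pods", "name": "<pod>", "uid": "<pod-uid>"}]}}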
  1608  	patch := fmt.Sprintf(`{"metadata": {"uid": %q}, "status": {%s "reservedFor": [ {"resource": "pods", "name": %q, "uid": %q} ] }}`,
  1609  		claim.UID,
  1610  		allocationPatch,
  1611  		pod.Name,
  1612  		pod.UID,
  1613  	)
  1614  	if loggerV := logger.V(6); loggerV.Enabled() {
  1615  		logger.V(5).Info("reserve", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName}, "resourceclaim", klog.KObj(claim), "patch", patch)
  1616  	} else {
  1617  		logger.V(5).Info("reserve", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName}, "resourceclaim", klog.KObj(claim))
  1618  	}
  1619  	claim, err := pl.clientset.ResourceV1alpha2().ResourceClaims(claim.Namespace).Patch(ctx, claim.Name, types.StrategicMergePatchType, []byte(patch), metav1.PatchOptions{}, "status")
  1620  	logger.V(5).Info("reserved", "pod", klog.KObj(pod), "node", klog.ObjectRef{Name: nodeName}, "resourceclaim", klog.Format(claim), "err", err)
  1621  	if allocationPatch != "" {
  1622  		// The scheduler was handling allocation. Now that has
  1623  		// completed, either successfully or with a failure.
  1624  		if err == nil {
  1625  			// This can fail, but only for reasons that are okay (concurrent delete or update).
  1626  			// Shouldn't happen in this case.
  1627  			if err := pl.claimAssumeCache.Assume(claim); err != nil {
  1628  				logger.V(5).Info("Claim not stored in assume cache", "err", err)
  1629  			}
  1630  		}
  1631  		pl.inFlightAllocations.Delete(claim.UID)
  1632  	}
  1633  	return claim, err
  1634  }
  1635  
  1636  // PostBind is called after a pod is successfully bound to a node. Now we are
  1637  // sure that a PodSchedulingContext object, if it exists, is definitely not going
  1638  // to be needed anymore and can be deleted. This is a one-shot attempt; there
  1639  // won't be any retries. This is okay because deletion should usually work and
  1640  // in those cases where it doesn't, the garbage collector will eventually clean up.
  1641  func (pl *dynamicResources) PostBind(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) {
  1642  	if !pl.enabled {
  1643  		return
  1644  	}
  1645  	state, err := getStateData(cs)
  1646  	if err != nil {
  1647  		return
  1648  	}
  1649  	if len(state.claims) == 0 {
  1650  		return
  1651  	}
  1652  
  1653  	// We cannot know for sure whether the PodSchedulingContext object exists. We
  1654  	// might have created it in a previous scheduling cycle and not have it in
  1655  	// our informer cache yet. Let's try to delete it, just to be on the safe
  1656  	// side.
  1657  	logger := klog.FromContext(ctx)
  1658  	err = pl.clientset.ResourceV1alpha2().PodSchedulingContexts(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{})
  1659  	switch {
  1660  	case apierrors.IsNotFound(err):
  1661  		logger.V(5).Info("no PodSchedulingContext object to delete")
  1662  	case err != nil:
  1663  		logger.Error(err, "delete PodSchedulingContext")
  1664  	default:
  1665  		logger.V(5).Info("PodSchedulingContext object deleted")
  1666  	}
  1667  }
  1668  
  1669  // statusUnschedulable ensures that there is a log message associated with the
  1670  // line where the status originated.
  1671  func statusUnschedulable(logger klog.Logger, reason string, kv ...interface{}) *framework.Status {
  1672  	if loggerV := logger.V(5); loggerV.Enabled() {
  1673  		helper, loggerV := loggerV.WithCallStackHelper()
  1674  		helper()
  1675  		kv = append(kv, "reason", reason)
  1676  		// nolint: logcheck // warns because it cannot check key/values
  1677  		loggerV.Info("pod unschedulable", kv...)
  1678  	}
  1679  	return framework.NewStatus(framework.UnschedulableAndUnresolvable, reason)
  1680  }
  1681  
  1682  // statusPending ensures that there is a log message associated with the
  1683  // line where the status originated.
  1684  func statusPending(logger klog.Logger, reason string, kv ...interface{}) *framework.Status {
  1685  	if loggerV := logger.V(5); loggerV.Enabled() {
  1686  		helper, loggerV := loggerV.WithCallStackHelper()
  1687  		helper()
  1688  		kv = append(kv, "reason", reason)
  1689  		// nolint: logcheck // warns because it cannot check key/values
  1690  		loggerV.Info("pod waiting for external component", kv...)
  1691  	}
  1692  
  1693  	// When we return Pending, we want to block the Pod at the same time.
  1694  	return framework.NewStatus(framework.Pending, reason)
  1695  }
  1696  
  1697  // statusError ensures that there is a log message associated with the
  1698  // line where the error originated.
  1699  func statusError(logger klog.Logger, err error, kv ...interface{}) *framework.Status {
  1700  	if loggerV := logger.V(5); loggerV.Enabled() {
  1701  		helper, loggerV := loggerV.WithCallStackHelper()
  1702  		helper()
  1703  		// nolint: logcheck // warns because it cannot check key/values
  1704  		loggerV.Error(err, "dynamic resource plugin failed", kv...)
  1705  	}
  1706  	return framework.AsStatus(err)
  1707  }