k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/scheduler/framework/plugins/volumebinding/binder.go (about)

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package volumebinding
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	"fmt"
    23  	"sort"
    24  	"strings"
    25  	"time"
    26  
    27  	v1 "k8s.io/api/core/v1"
    28  	storagev1 "k8s.io/api/storage/v1"
    29  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    30  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    31  	"k8s.io/apimachinery/pkg/labels"
    32  	"k8s.io/apimachinery/pkg/util/sets"
    33  	"k8s.io/apimachinery/pkg/util/wait"
    34  	"k8s.io/apiserver/pkg/storage"
    35  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    36  	coreinformers "k8s.io/client-go/informers/core/v1"
    37  	storageinformers "k8s.io/client-go/informers/storage/v1"
    38  	clientset "k8s.io/client-go/kubernetes"
    39  	corelisters "k8s.io/client-go/listers/core/v1"
    40  	storagelisters "k8s.io/client-go/listers/storage/v1"
    41  	"k8s.io/component-helpers/storage/ephemeral"
    42  	"k8s.io/component-helpers/storage/volume"
    43  	csitrans "k8s.io/csi-translation-lib"
    44  	csiplugins "k8s.io/csi-translation-lib/plugins"
    45  	"k8s.io/klog/v2"
    46  	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
    47  	"k8s.io/kubernetes/pkg/features"
    48  	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding/metrics"
    49  	"k8s.io/kubernetes/pkg/scheduler/util/assumecache"
    50  	"k8s.io/kubernetes/pkg/volume/util"
    51  )
    52  
    53  // ConflictReason is used for the special strings which explain why
    54  // volume binding is impossible for a node.
    55  type ConflictReason string
    56  
    57  // ConflictReasons contains all reasons that explain why volume binding is impossible for a node.
    58  type ConflictReasons []ConflictReason
    59  
    60  func (reasons ConflictReasons) Len() int           { return len(reasons) }
    61  func (reasons ConflictReasons) Less(i, j int) bool { return reasons[i] < reasons[j] }
    62  func (reasons ConflictReasons) Swap(i, j int)      { reasons[i], reasons[j] = reasons[j], reasons[i] }
    63  
const (
	// ErrReasonBindConflict is used for VolumeBindingNoMatch predicate error.
	ErrReasonBindConflict ConflictReason = "node(s) didn't find available persistent volumes to bind"
	// ErrReasonNodeConflict is used for VolumeNodeAffinityConflict predicate error.
	ErrReasonNodeConflict ConflictReason = "node(s) had volume node affinity conflict"
	// ErrReasonNotEnoughSpace is used when a pod cannot start on a node because not enough storage space is available.
	// Unlike the two constants above it is declared without an explicit type,
	// i.e. it is an untyped string constant.
	ErrReasonNotEnoughSpace = "node(s) did not have enough free storage"
	// ErrReasonPVNotExist is used when a pod has one or more PVC(s) bound to non-existent persistent volume(s).
	// Like ErrReasonNotEnoughSpace, it is an untyped string constant.
	ErrReasonPVNotExist = "node(s) unavailable due to one or more pvc(s) bound to non-existent pv(s)"
)
    74  
// BindingInfo holds a binding between PV and PVC, i.e. one proposed pairing
// of a claim with the volume chosen to satisfy it.
type BindingInfo struct {
	// PVC that needs to be bound
	pvc *v1.PersistentVolumeClaim

	// Proposed PV to bind to this PVC
	pv *v1.PersistentVolume
}
    83  
    84  // StorageClassName returns the name of the storage class.
    85  func (b *BindingInfo) StorageClassName() string {
    86  	return b.pv.Spec.StorageClassName
    87  }
    88  
// StorageResource represents storage resource: a requested amount paired
// with an available capacity, both as plain int64 values.
type StorageResource struct {
	// Requested is the requested amount. BindingInfo.StorageResource fills it
	// with the Value() of the PVC's storage request.
	Requested int64
	// Capacity is the available amount. BindingInfo.StorageResource fills it
	// with the Value() of the PV's storage capacity.
	Capacity int64
}
    94  
    95  // StorageResource returns storage resource.
    96  func (b *BindingInfo) StorageResource() *StorageResource {
    97  	// both fields are mandatory
    98  	requestedQty := b.pvc.Spec.Resources.Requests[v1.ResourceName(v1.ResourceStorage)]
    99  	capacityQty := b.pv.Spec.Capacity[v1.ResourceName(v1.ResourceStorage)]
   100  	return &StorageResource{
   101  		Requested: requestedQty.Value(),
   102  		Capacity:  capacityQty.Value(),
   103  	}
   104  }
   105  
// PodVolumes holds pod's volumes information used in volume scheduling.
// It is produced by FindPodVolumes for a given node and later rewritten by
// AssumePodVolumes with the assumed (cached) objects.
type PodVolumes struct {
	// StaticBindings are binding decisions for PVCs which can be bound to
	// pre-provisioned static PVs.
	StaticBindings []*BindingInfo
	// DynamicProvisions are PVCs that require dynamic provisioning
	DynamicProvisions []*v1.PersistentVolumeClaim
}
   114  
// InTreeToCSITranslator contains methods required to check migratable status
// and perform translations from InTree PV's to CSI
type InTreeToCSITranslator interface {
	// IsPVMigratable presumably reports whether the given in-tree PV can be
	// migrated to CSI — confirm against the csi-translation-lib implementation.
	IsPVMigratable(pv *v1.PersistentVolume) bool
	// GetInTreePluginNameFromSpec presumably resolves the in-tree plugin name
	// for a PV or an inline volume — confirm against the implementation.
	GetInTreePluginNameFromSpec(pv *v1.PersistentVolume, vol *v1.Volume) (string, error)
	// TranslateInTreePVToCSI returns a PV (presumably the CSI equivalent of
	// the given in-tree PV) or an error.
	TranslateInTreePVToCSI(pv *v1.PersistentVolume) (*v1.PersistentVolume, error)
}
   122  
// SchedulerVolumeBinder is used by the scheduler VolumeBinding plugin to
// handle PVC/PV binding and dynamic provisioning. The binding decisions are
// integrated into the pod scheduling workflow so that the PV NodeAffinity is
// also considered along with the pod's other scheduling requirements.
//
// This integrates into the existing scheduler workflow as follows:
//  1. The scheduler takes a Pod off the scheduler queue and processes it serially:
//     a. Invokes all pre-filter plugins for the pod. GetPodVolumeClaims() is invoked
//     here, pod volume information will be saved in current scheduling cycle state for later use.
//     If pod has bound immediate PVCs, GetEligibleNodes() is invoked to potentially reduce
//     down the list of eligible nodes based on the bound PV's NodeAffinity (if any).
//     b. Invokes all filter plugins, parallelized across nodes.  FindPodVolumes() is invoked here.
//     c. Invokes all score plugins.  Future/TBD
//     d. Selects the best node for the Pod.
//     e. Invokes all reserve plugins. AssumePodVolumes() is invoked here.
//     i.  If PVC binding is required, cache in-memory only:
//     * For manual binding: update PV objects for prebinding to the corresponding PVCs.
//     * For dynamic provisioning: update PVC object with a selected node from d)
//     * For the pod, which PVCs and PVs need API updates.
//     ii. Afterwards, the main scheduler caches the Pod->Node binding in the scheduler's pod cache,
//     This is handled in the scheduler and not here.
//     f. Asynchronously bind volumes and pod in a separate goroutine
//     i.  BindPodVolumes() is called first in PreBind phase. It makes all the necessary API updates and waits for
//     PV controller to fully bind and provision the PVCs. If binding fails, the Pod is sent
//     back through the scheduler.
//     ii. After BindPodVolumes() is complete, then the scheduler does the final Pod->Node binding.
//  2. Once all the assume operations are done in e), the scheduler processes the next Pod in the scheduler queue
//     while the actual binding operation occurs in the background.
type SchedulerVolumeBinder interface {
	// GetPodVolumeClaims returns a pod's PVCs separated into bound, unbound with delayed binding (including provisioning),
	// unbound with immediate binding (including prebound) and PVs that belong to storage classes of unbound PVCs with delayed binding.
	GetPodVolumeClaims(logger klog.Logger, pod *v1.Pod) (podVolumeClaims *PodVolumeClaims, err error)

	// GetEligibleNodes checks the existing bound claims of the pod to determine if the list of nodes can be
	// potentially reduced down to a subset of eligible nodes based on the bound claims which then can be used
	// in subsequent scheduling stages.
	//
	// If eligibleNodes is 'nil', then it indicates that such eligible node reduction cannot be made
	// and all nodes should be considered.
	GetEligibleNodes(logger klog.Logger, boundClaims []*v1.PersistentVolumeClaim) (eligibleNodes sets.Set[string])

	// FindPodVolumes checks if all of a Pod's PVCs can be satisfied by the
	// node and returns pod's volumes information.
	//
	// If a PVC is bound, it checks if the PV's NodeAffinity matches the Node.
	// Otherwise, it tries to find an available PV to bind to the PVC.
	//
	// It returns an error when something went wrong or a list of reasons why the node is
	// (currently) not usable for the pod.
	//
	// If the CSIStorageCapacity feature is enabled, then it also checks for sufficient storage
	// for volumes that still need to be created.
	//
	// This function is called by the scheduler VolumeBinding plugin and can be called in parallel
	FindPodVolumes(logger klog.Logger, pod *v1.Pod, podVolumeClaims *PodVolumeClaims, node *v1.Node) (podVolumes *PodVolumes, reasons ConflictReasons, err error)

	// AssumePodVolumes will:
	// 1. Take the PV matches for unbound PVCs and update the PV cache assuming
	// that the PV is prebound to the PVC.
	// 2. Take the PVCs that need provisioning and update the PVC cache with related
	// annotations set.
	//
	// It returns true if all volumes are fully bound
	//
	// This function is called serially.
	AssumePodVolumes(logger klog.Logger, assumedPod *v1.Pod, nodeName string, podVolumes *PodVolumes) (allFullyBound bool, err error)

	// RevertAssumedPodVolumes will revert assumed PV and PVC cache.
	RevertAssumedPodVolumes(podVolumes *PodVolumes)

	// BindPodVolumes will:
	// 1. Initiate the volume binding by making the API call to prebind the PV
	// to its matching PVC.
	// 2. Trigger the volume provisioning by making the API call to set related
	// annotations on the PVC
	// 3. Wait for PVCs to be completely bound by the PV controller
	//
	// This function can be called in parallel.
	BindPodVolumes(ctx context.Context, assumedPod *v1.Pod, podVolumes *PodVolumes) error
}
   203  
// PodVolumeClaims holds a pod's PVCs grouped by binding state, together with
// the candidate PVs for the claims that use delayed binding. It is the result
// type of SchedulerVolumeBinder.GetPodVolumeClaims and an input to FindPodVolumes.
type PodVolumeClaims struct {
	// boundClaims are the pod's bound PVCs.
	boundClaims []*v1.PersistentVolumeClaim
	// unboundClaimsDelayBinding are the pod's unbound with delayed binding (including provisioning) PVCs.
	unboundClaimsDelayBinding []*v1.PersistentVolumeClaim
	// unboundClaimsImmediate are the pod's unbound with immediate binding PVCs (i.e., supposed to be bound already) .
	unboundClaimsImmediate []*v1.PersistentVolumeClaim
	// unboundVolumesDelayBinding are PVs that belong to storage classes of the pod's unbound PVCs with delayed binding.
	// NOTE(review): the map key is presumably the storage class name — confirm against the code that populates it.
	unboundVolumesDelayBinding map[string][]*v1.PersistentVolume
}
   214  
// volumeBinder is the SchedulerVolumeBinder implementation constructed by
// NewVolumeBinder. It combines an API client, informer-backed listers and
// assume caches for PVs and PVCs.
type volumeBinder struct {
	// kubeClient is used for the PV/PVC Update calls made while binding.
	kubeClient clientset.Interface

	// Listers obtained from the informers passed to NewVolumeBinder.
	classLister   storagelisters.StorageClassLister
	podLister     corelisters.PodLister
	nodeLister    corelisters.NodeLister
	csiNodeLister storagelisters.CSINodeLister

	// Assume caches built on top of the PVC and PV informers.
	pvcCache *PVCAssumeCache
	pvCache  *PVAssumeCache

	// Amount of time to wait for the bind operation to succeed
	bindTimeout time.Duration

	// translator is initialized with csitrans.New() in NewVolumeBinder.
	translator InTreeToCSITranslator

	// Listers taken from the CapacityCheck argument of NewVolumeBinder.
	csiDriverLister          storagelisters.CSIDriverLister
	csiStorageCapacityLister storagelisters.CSIStorageCapacityLister
}

// Compile-time assertion that *volumeBinder implements SchedulerVolumeBinder.
var _ SchedulerVolumeBinder = &volumeBinder{}
   236  
// CapacityCheck contains additional parameters for NewVolumeBinder that
// are only needed when checking volume sizes against available storage
// capacity is desired.
//
// NewVolumeBinder calls Lister() on both informers unconditionally, so both
// fields must be set.
type CapacityCheck struct {
	// CSIDriverInformer supplies the CSIDriver lister used by the binder.
	CSIDriverInformer storageinformers.CSIDriverInformer
	// CSIStorageCapacityInformer supplies the CSIStorageCapacity lister used by the binder.
	CSIStorageCapacityInformer storageinformers.CSIStorageCapacityInformer
}
   244  
   245  // NewVolumeBinder sets up all the caches needed for the scheduler to make volume binding decisions.
   246  //
   247  // capacityCheck determines how storage capacity is checked (CSIStorageCapacity feature).
   248  func NewVolumeBinder(
   249  	logger klog.Logger,
   250  	kubeClient clientset.Interface,
   251  	podInformer coreinformers.PodInformer,
   252  	nodeInformer coreinformers.NodeInformer,
   253  	csiNodeInformer storageinformers.CSINodeInformer,
   254  	pvcInformer coreinformers.PersistentVolumeClaimInformer,
   255  	pvInformer coreinformers.PersistentVolumeInformer,
   256  	storageClassInformer storageinformers.StorageClassInformer,
   257  	capacityCheck CapacityCheck,
   258  	bindTimeout time.Duration) SchedulerVolumeBinder {
   259  	b := &volumeBinder{
   260  		kubeClient:    kubeClient,
   261  		podLister:     podInformer.Lister(),
   262  		classLister:   storageClassInformer.Lister(),
   263  		nodeLister:    nodeInformer.Lister(),
   264  		csiNodeLister: csiNodeInformer.Lister(),
   265  		pvcCache:      NewPVCAssumeCache(logger, pvcInformer.Informer()),
   266  		pvCache:       NewPVAssumeCache(logger, pvInformer.Informer()),
   267  		bindTimeout:   bindTimeout,
   268  		translator:    csitrans.New(),
   269  	}
   270  
   271  	b.csiDriverLister = capacityCheck.CSIDriverInformer.Lister()
   272  	b.csiStorageCapacityLister = capacityCheck.CSIStorageCapacityInformer.Lister()
   273  
   274  	return b
   275  }
   276  
// FindPodVolumes finds the matching PVs for PVCs and nodes to provision PVs
// for the given pod and node. If the node does not fit, conflict reasons are
// returned.
//
// All three results are named return values that are finalized by deferred
// functions, so every exit path uses a naked return:
//   - podVolumes is filled from the local staticBindings/dynamicProvisions,
//   - the failure metric is incremented when err is non-nil,
//   - reasons is derived from the local boolean flags when err is nil.
func (b *volumeBinder) FindPodVolumes(logger klog.Logger, pod *v1.Pod, podVolumeClaims *PodVolumeClaims, node *v1.Node) (podVolumes *PodVolumes, reasons ConflictReasons, err error) {
	podVolumes = &PodVolumes{}

	// Warning: Below log needs high verbosity as it can be printed several times (#60933).
	logger.V(5).Info("FindPodVolumes", "pod", klog.KObj(pod), "node", klog.KObj(node))

	// Initialize to true for pods that don't have volumes. These
	// booleans get translated into reason strings when the function
	// returns without an error.
	unboundVolumesSatisfied := true
	boundVolumesSatisfied := true
	sufficientStorage := true
	boundPVsFound := true
	// Deferred functions run in reverse order of registration, so this one
	// runs last: it converts the flags into conflict reasons.
	defer func() {
		if err != nil {
			return
		}
		if !boundVolumesSatisfied {
			reasons = append(reasons, ErrReasonNodeConflict)
		}
		if !unboundVolumesSatisfied {
			reasons = append(reasons, ErrReasonBindConflict)
		}
		if !sufficientStorage {
			reasons = append(reasons, ErrReasonNotEnoughSpace)
		}
		if !boundPVsFound {
			reasons = append(reasons, ErrReasonPVNotExist)
		}
	}()

	// Count failures of this stage under the "predicate" label.
	defer func() {
		if err != nil {
			metrics.VolumeSchedulingStageFailed.WithLabelValues("predicate").Inc()
		}
	}()

	var (
		staticBindings    []*BindingInfo
		dynamicProvisions []*v1.PersistentVolumeClaim
	)
	// Registered last, so it runs first on return: publish the collected
	// bindings and provisions through podVolumes.
	defer func() {
		// Although we do not distinguish nil from empty in this function, for
		// easier testing, we normalize empty to nil.
		if len(staticBindings) == 0 {
			staticBindings = nil
		}
		if len(dynamicProvisions) == 0 {
			dynamicProvisions = nil
		}
		podVolumes.StaticBindings = staticBindings
		podVolumes.DynamicProvisions = dynamicProvisions
	}()

	// Check PV node affinity on bound volumes
	if len(podVolumeClaims.boundClaims) > 0 {
		boundVolumesSatisfied, boundPVsFound, err = b.checkBoundClaims(logger, podVolumeClaims.boundClaims, node, pod)
		if err != nil {
			return
		}
	}

	// Find matching volumes and node for unbound claims
	if len(podVolumeClaims.unboundClaimsDelayBinding) > 0 {
		var (
			claimsToFindMatching []*v1.PersistentVolumeClaim
			claimsToProvision    []*v1.PersistentVolumeClaim
		)

		// Filter out claims to provision
		for _, claim := range podVolumeClaims.unboundClaimsDelayBinding {
			// A claim that already carries the selected-node annotation goes
			// straight to provisioning, but only on that node.
			if selectedNode, ok := claim.Annotations[volume.AnnSelectedNode]; ok {
				if selectedNode != node.Name {
					// Fast path, skip unmatched node.
					unboundVolumesSatisfied = false
					return
				}
				claimsToProvision = append(claimsToProvision, claim)
			} else {
				claimsToFindMatching = append(claimsToFindMatching, claim)
			}
		}

		// Find matching volumes
		if len(claimsToFindMatching) > 0 {
			var unboundClaims []*v1.PersistentVolumeClaim
			unboundVolumesSatisfied, staticBindings, unboundClaims, err = b.findMatchingVolumes(logger, pod, claimsToFindMatching, podVolumeClaims.unboundVolumesDelayBinding, node)
			if err != nil {
				return
			}
			// Claims returned as still unbound are handed to the provisioning check below.
			claimsToProvision = append(claimsToProvision, unboundClaims...)
		}

		// Check for claims to provision. This is the first time where we potentially
		// find out that storage is not sufficient for the node.
		// Note that this overwrites unboundVolumesSatisfied from the matching step.
		if len(claimsToProvision) > 0 {
			unboundVolumesSatisfied, sufficientStorage, dynamicProvisions, err = b.checkVolumeProvisions(logger, pod, claimsToProvision, node)
			if err != nil {
				return
			}
		}
	}

	return
}
   385  
   386  // GetEligibleNodes checks the existing bound claims of the pod to determine if the list of nodes can be
   387  // potentially reduced down to a subset of eligible nodes based on the bound claims which then can be used
   388  // in subsequent scheduling stages.
   389  //
   390  // Returning 'nil' for eligibleNodes indicates that such eligible node reduction cannot be made and all nodes
   391  // should be considered.
   392  func (b *volumeBinder) GetEligibleNodes(logger klog.Logger, boundClaims []*v1.PersistentVolumeClaim) (eligibleNodes sets.Set[string]) {
   393  	if len(boundClaims) == 0 {
   394  		return
   395  	}
   396  
   397  	var errs []error
   398  	for _, pvc := range boundClaims {
   399  		pvName := pvc.Spec.VolumeName
   400  		pv, err := b.pvCache.GetPV(pvName)
   401  		if err != nil {
   402  			errs = append(errs, err)
   403  			continue
   404  		}
   405  
   406  		// if the PersistentVolume is local and has node affinity matching specific node(s),
   407  		// add them to the eligible nodes
   408  		nodeNames := util.GetLocalPersistentVolumeNodeNames(pv)
   409  		if len(nodeNames) != 0 {
   410  			// on the first found list of eligible nodes for the local PersistentVolume,
   411  			// insert to the eligible node set.
   412  			if eligibleNodes == nil {
   413  				eligibleNodes = sets.New(nodeNames...)
   414  			} else {
   415  				// for subsequent finding of eligible nodes for the local PersistentVolume,
   416  				// take the intersection of the nodes with the existing eligible nodes
   417  				// for cases if PV1 has node affinity to node1 and PV2 has node affinity to node2,
   418  				// then the eligible node list should be empty.
   419  				eligibleNodes = eligibleNodes.Intersection(sets.New(nodeNames...))
   420  			}
   421  		}
   422  	}
   423  
   424  	if len(errs) > 0 {
   425  		logger.V(4).Info("GetEligibleNodes: one or more error occurred finding eligible nodes", "error", errs)
   426  		return nil
   427  	}
   428  
   429  	if eligibleNodes != nil {
   430  		logger.V(4).Info("GetEligibleNodes: reduced down eligible nodes", "nodes", eligibleNodes)
   431  	}
   432  	return
   433  }
   434  
   435  // AssumePodVolumes will take the matching PVs and PVCs to provision in pod's
   436  // volume information for the chosen node, and:
   437  // 1. Update the pvCache with the new prebound PV.
   438  // 2. Update the pvcCache with the new PVCs with annotations set
   439  // 3. Update PodVolumes again with cached API updates for PVs and PVCs.
   440  func (b *volumeBinder) AssumePodVolumes(logger klog.Logger, assumedPod *v1.Pod, nodeName string, podVolumes *PodVolumes) (allFullyBound bool, err error) {
   441  	logger.V(4).Info("AssumePodVolumes", "pod", klog.KObj(assumedPod), "node", klog.KRef("", nodeName))
   442  	defer func() {
   443  		if err != nil {
   444  			metrics.VolumeSchedulingStageFailed.WithLabelValues("assume").Inc()
   445  		}
   446  	}()
   447  
   448  	if allBound := b.arePodVolumesBound(logger, assumedPod); allBound {
   449  		logger.V(4).Info("AssumePodVolumes: all PVCs bound and nothing to do", "pod", klog.KObj(assumedPod), "node", klog.KRef("", nodeName))
   450  		return true, nil
   451  	}
   452  
   453  	// Assume PV
   454  	newBindings := []*BindingInfo{}
   455  	for _, binding := range podVolumes.StaticBindings {
   456  		newPV, dirty, err := volume.GetBindVolumeToClaim(binding.pv, binding.pvc)
   457  		logger.V(5).Info("AssumePodVolumes: GetBindVolumeToClaim",
   458  			"pod", klog.KObj(assumedPod),
   459  			"PV", klog.KObj(binding.pv),
   460  			"PVC", klog.KObj(binding.pvc),
   461  			"newPV", klog.KObj(newPV),
   462  			"dirty", dirty,
   463  		)
   464  		if err != nil {
   465  			logger.Error(err, "AssumePodVolumes: fail to GetBindVolumeToClaim")
   466  			b.revertAssumedPVs(newBindings)
   467  			return false, err
   468  		}
   469  		// TODO: can we assume every time?
   470  		if dirty {
   471  			err = b.pvCache.Assume(newPV)
   472  			if err != nil {
   473  				b.revertAssumedPVs(newBindings)
   474  				return false, err
   475  			}
   476  		}
   477  		newBindings = append(newBindings, &BindingInfo{pv: newPV, pvc: binding.pvc})
   478  	}
   479  
   480  	// Assume PVCs
   481  	newProvisionedPVCs := []*v1.PersistentVolumeClaim{}
   482  	for _, claim := range podVolumes.DynamicProvisions {
   483  		// The claims from method args can be pointing to watcher cache. We must not
   484  		// modify these, therefore create a copy.
   485  		claimClone := claim.DeepCopy()
   486  		metav1.SetMetaDataAnnotation(&claimClone.ObjectMeta, volume.AnnSelectedNode, nodeName)
   487  		err = b.pvcCache.Assume(claimClone)
   488  		if err != nil {
   489  			b.revertAssumedPVs(newBindings)
   490  			b.revertAssumedPVCs(newProvisionedPVCs)
   491  			return
   492  		}
   493  
   494  		newProvisionedPVCs = append(newProvisionedPVCs, claimClone)
   495  	}
   496  
   497  	podVolumes.StaticBindings = newBindings
   498  	podVolumes.DynamicProvisions = newProvisionedPVCs
   499  	return
   500  }
   501  
   502  // RevertAssumedPodVolumes will revert assumed PV and PVC cache.
   503  func (b *volumeBinder) RevertAssumedPodVolumes(podVolumes *PodVolumes) {
   504  	b.revertAssumedPVs(podVolumes.StaticBindings)
   505  	b.revertAssumedPVCs(podVolumes.DynamicProvisions)
   506  }
   507  
   508  // BindPodVolumes gets the cached bindings and PVCs to provision in pod's volumes information,
   509  // makes the API update for those PVs/PVCs, and waits for the PVCs to be completely bound
   510  // by the PV controller.
   511  func (b *volumeBinder) BindPodVolumes(ctx context.Context, assumedPod *v1.Pod, podVolumes *PodVolumes) (err error) {
   512  	logger := klog.FromContext(ctx)
   513  	logger.V(4).Info("BindPodVolumes", "pod", klog.KObj(assumedPod), "node", klog.KRef("", assumedPod.Spec.NodeName))
   514  
   515  	defer func() {
   516  		if err != nil {
   517  			metrics.VolumeSchedulingStageFailed.WithLabelValues("bind").Inc()
   518  		}
   519  	}()
   520  
   521  	bindings := podVolumes.StaticBindings
   522  	claimsToProvision := podVolumes.DynamicProvisions
   523  
   524  	// Start API operations
   525  	err = b.bindAPIUpdate(ctx, assumedPod, bindings, claimsToProvision)
   526  	if err != nil {
   527  		return err
   528  	}
   529  
   530  	err = wait.PollUntilContextTimeout(ctx, time.Second, b.bindTimeout, false, func(ctx context.Context) (bool, error) {
   531  		b, err := b.checkBindings(logger, assumedPod, bindings, claimsToProvision)
   532  		return b, err
   533  	})
   534  	if err != nil {
   535  		return fmt.Errorf("binding volumes: %w", err)
   536  	}
   537  	return nil
   538  }
   539  
   540  func getPodName(pod *v1.Pod) string {
   541  	return pod.Namespace + "/" + pod.Name
   542  }
   543  
   544  func getPVCName(pvc *v1.PersistentVolumeClaim) string {
   545  	return pvc.Namespace + "/" + pvc.Name
   546  }
   547  
   548  // bindAPIUpdate makes the API update for those PVs/PVCs.
   549  func (b *volumeBinder) bindAPIUpdate(ctx context.Context, pod *v1.Pod, bindings []*BindingInfo, claimsToProvision []*v1.PersistentVolumeClaim) error {
   550  	logger := klog.FromContext(ctx)
   551  	podName := getPodName(pod)
   552  	if bindings == nil {
   553  		return fmt.Errorf("failed to get cached bindings for pod %q", podName)
   554  	}
   555  	if claimsToProvision == nil {
   556  		return fmt.Errorf("failed to get cached claims to provision for pod %q", podName)
   557  	}
   558  
   559  	lastProcessedBinding := 0
   560  	lastProcessedProvisioning := 0
   561  	defer func() {
   562  		// only revert assumed cached updates for volumes we haven't successfully bound
   563  		if lastProcessedBinding < len(bindings) {
   564  			b.revertAssumedPVs(bindings[lastProcessedBinding:])
   565  		}
   566  		// only revert assumed cached updates for claims we haven't updated,
   567  		if lastProcessedProvisioning < len(claimsToProvision) {
   568  			b.revertAssumedPVCs(claimsToProvision[lastProcessedProvisioning:])
   569  		}
   570  	}()
   571  
   572  	var (
   573  		binding *BindingInfo
   574  		i       int
   575  		claim   *v1.PersistentVolumeClaim
   576  	)
   577  
   578  	// Do the actual prebinding. Let the PV controller take care of the rest
   579  	// There is no API rollback if the actual binding fails
   580  	for _, binding = range bindings {
   581  		// TODO: does it hurt if we make an api call and nothing needs to be updated?
   582  		logger.V(5).Info("Updating PersistentVolume: binding to claim", "pod", klog.KObj(pod), "PV", klog.KObj(binding.pv), "PVC", klog.KObj(binding.pvc))
   583  		newPV, err := b.kubeClient.CoreV1().PersistentVolumes().Update(ctx, binding.pv, metav1.UpdateOptions{})
   584  		if err != nil {
   585  			logger.V(4).Info("Updating PersistentVolume: binding to claim failed", "pod", klog.KObj(pod), "PV", klog.KObj(binding.pv), "PVC", klog.KObj(binding.pvc), "err", err)
   586  			return err
   587  		}
   588  
   589  		logger.V(2).Info("Updated PersistentVolume with claim. Waiting for binding to complete", "pod", klog.KObj(pod), "PV", klog.KObj(binding.pv), "PVC", klog.KObj(binding.pvc))
   590  		// Save updated object from apiserver for later checking.
   591  		binding.pv = newPV
   592  		lastProcessedBinding++
   593  	}
   594  
   595  	// Update claims objects to trigger volume provisioning. Let the PV controller take care of the rest
   596  	// PV controller is expected to signal back by removing related annotations if actual provisioning fails
   597  	for i, claim = range claimsToProvision {
   598  		logger.V(5).Info("Updating claims objects to trigger volume provisioning", "pod", klog.KObj(pod), "PVC", klog.KObj(claim))
   599  		newClaim, err := b.kubeClient.CoreV1().PersistentVolumeClaims(claim.Namespace).Update(ctx, claim, metav1.UpdateOptions{})
   600  		if err != nil {
   601  			logger.V(4).Info("Updating PersistentVolumeClaim: binding to volume failed", "PVC", klog.KObj(claim), "err", err)
   602  			return err
   603  		}
   604  
   605  		// Save updated object from apiserver for later checking.
   606  		claimsToProvision[i] = newClaim
   607  		lastProcessedProvisioning++
   608  	}
   609  
   610  	return nil
   611  }
   612  
var (
	// versioner is used by checkBindings to compare the resource versions of
	// the objects returned by the API server with the ones seen in the caches.
	versioner = storage.APIObjectVersioner{}
)
   616  
   617  // checkBindings runs through all the PVCs in the Pod and checks:
   618  // * if the PVC is fully bound
   619  // * if there are any conditions that require binding to fail and be retried
   620  //
   621  // It returns true when all of the Pod's PVCs are fully bound, and error if
   622  // binding (and scheduling) needs to be retried
   623  // Note that it checks on API objects not PV/PVC cache, this is because
   624  // PV/PVC cache can be assumed again in main scheduler loop, we must check
   625  // latest state in API server which are shared with PV controller and
   626  // provisioners
// checkBindings reports whether all of the given PV/PVC bindings and all of
// the claims to provision are fully bound, judged from the API objects held
// in the PV and PVC caches.
//
// It returns:
//   - (true, nil) when every PVC that was checked is fully bound;
//   - (false, nil) when the outcome is not known yet: a cached API object is
//     older than the object passed in, a PVC is not fully bound, or the PV
//     named by a provisioned claim is not in the PV cache;
//   - (false, err) otherwise: bindings or claimsToProvision is nil, the
//     node, a PV or a PVC cannot be fetched, the pod no longer exists, CSI
//     translation fails, a PV's node affinity does not match the node, a
//     PV's ClaimRef was cleared, or a claim's selectedNode annotation no
//     longer equals pod.Spec.NodeName.
func (b *volumeBinder) checkBindings(logger klog.Logger, pod *v1.Pod, bindings []*BindingInfo, claimsToProvision []*v1.PersistentVolumeClaim) (bool, error) {
	podName := getPodName(pod)
	// nil (as opposed to empty) slices are treated as missing cached data.
	if bindings == nil {
		return false, fmt.Errorf("failed to get cached bindings for pod %q", podName)
	}
	if claimsToProvision == nil {
		return false, fmt.Errorf("failed to get cached claims to provision for pod %q", podName)
	}

	node, err := b.nodeLister.Get(pod.Spec.NodeName)
	if err != nil {
		return false, fmt.Errorf("failed to get node %q: %w", pod.Spec.NodeName, err)
	}

	// A failed CSINode lookup is only logged; csiNode is still handed to
	// tryTranslatePVToCSI below.
	csiNode, err := b.csiNodeLister.Get(node.Name)
	if err != nil {
		// TODO: return the error once CSINode is created by default
		logger.V(4).Info("Could not get a CSINode object for the node", "node", klog.KObj(node), "err", err)
	}

	// Check for any conditions that might require scheduling retry

	// When pod is deleted, binding operation should be cancelled. There is no
	// need to check PV/PVC bindings any more.
	_, err = b.podLister.Pods(pod.Namespace).Get(pod.Name)
	if err != nil {
		if apierrors.IsNotFound(err) {
			return false, fmt.Errorf("pod does not exist any more: %w", err)
		}
		// Any other lister error is logged and the check continues.
		logger.Error(err, "Failed to get pod from the lister", "pod", klog.KObj(pod))
	}

	// First pass: bindings of claims to existing PVs.
	for _, binding := range bindings {
		pv, err := b.pvCache.GetAPIPV(binding.pv.Name)
		if err != nil {
			return false, fmt.Errorf("failed to check binding: %w", err)
		}

		pvc, err := b.pvcCache.GetAPIPVC(getPVCName(binding.pvc))
		if err != nil {
			return false, fmt.Errorf("failed to check binding: %w", err)
		}

		// Because we updated PV in apiserver, skip if API object is older
		// and wait for new API object propagated from apiserver.
		if versioner.CompareResourceVersion(binding.pv, pv) > 0 {
			return false, nil
		}

		pv, err = b.tryTranslatePVToCSI(pv, csiNode)
		if err != nil {
			return false, fmt.Errorf("failed to translate pv to csi: %w", err)
		}

		// Check PV's node affinity (the node might not have the proper label)
		if err := volume.CheckNodeAffinity(pv, node.Labels); err != nil {
			return false, fmt.Errorf("pv %q node affinity doesn't match node %q: %w", pv.Name, node.Name, err)
		}

		// Check if pv.ClaimRef got dropped by unbindVolume()
		if pv.Spec.ClaimRef == nil || pv.Spec.ClaimRef.UID == "" {
			return false, fmt.Errorf("ClaimRef got reset for pv %q", pv.Name)
		}

		// Check if pvc is fully bound
		if !b.isPVCFullyBound(pvc) {
			return false, nil
		}
	}

	// Second pass: claims that are waiting for dynamic provisioning.
	for _, claim := range claimsToProvision {
		pvc, err := b.pvcCache.GetAPIPVC(getPVCName(claim))
		if err != nil {
			return false, fmt.Errorf("failed to check provisioning pvc: %w", err)
		}

		// Because we updated PVC in apiserver, skip if API object is older
		// and wait for new API object propagated from apiserver.
		if versioner.CompareResourceVersion(claim, pvc) > 0 {
			return false, nil
		}

		// Check if selectedNode annotation is still set
		if pvc.Annotations == nil {
			return false, fmt.Errorf("selectedNode annotation reset for PVC %q", pvc.Name)
		}
		selectedNode := pvc.Annotations[volume.AnnSelectedNode]
		if selectedNode != pod.Spec.NodeName {
			// If provisioner fails to provision a volume, selectedNode
			// annotation will be removed to signal back to the scheduler to
			// retry.
			return false, fmt.Errorf("provisioning failed for PVC %q", pvc.Name)
		}

		// If the PVC is bound to a PV, check its node affinity
		if pvc.Spec.VolumeName != "" {
			pv, err := b.pvCache.GetAPIPV(pvc.Spec.VolumeName)
			if err != nil {
				if errors.Is(err, assumecache.ErrNotFound) {
					// We tolerate NotFound error here, because PV is possibly
					// not found because of API delay, we can check next time.
					// And if PV does not exist because it's deleted, PVC will
					// be unbound eventually.
					return false, nil
				}
				return false, fmt.Errorf("failed to get pv %q from cache: %w", pvc.Spec.VolumeName, err)
			}

			// Unlike the first pass, the translation error is returned
			// without additional context.
			pv, err = b.tryTranslatePVToCSI(pv, csiNode)
			if err != nil {
				return false, err
			}

			if err := volume.CheckNodeAffinity(pv, node.Labels); err != nil {
				return false, fmt.Errorf("pv %q node affinity doesn't match node %q: %w", pv.Name, node.Name, err)
			}
		}

		// Check if pvc is fully bound
		if !b.isPVCFullyBound(pvc) {
			return false, nil
		}
	}

	// All pvs and pvcs that we operated on are bound
	logger.V(2).Info("All PVCs for pod are bound", "pod", klog.KObj(pod))
	return true, nil
}
   755  
   756  func (b *volumeBinder) isVolumeBound(logger klog.Logger, pod *v1.Pod, vol *v1.Volume) (bound bool, pvc *v1.PersistentVolumeClaim, err error) {
   757  	pvcName := ""
   758  	isEphemeral := false
   759  	switch {
   760  	case vol.PersistentVolumeClaim != nil:
   761  		pvcName = vol.PersistentVolumeClaim.ClaimName
   762  	case vol.Ephemeral != nil:
   763  		// Generic ephemeral inline volumes also use a PVC,
   764  		// just with a computed name, and...
   765  		pvcName = ephemeral.VolumeClaimName(pod, vol)
   766  		isEphemeral = true
   767  	default:
   768  		return true, nil, nil
   769  	}
   770  
   771  	bound, pvc, err = b.isPVCBound(logger, pod.Namespace, pvcName)
   772  	// ... the PVC must be owned by the pod.
   773  	if isEphemeral && err == nil && pvc != nil {
   774  		if err := ephemeral.VolumeIsForPod(pod, pvc); err != nil {
   775  			return false, nil, err
   776  		}
   777  	}
   778  	return
   779  }
   780  
   781  func (b *volumeBinder) isPVCBound(logger klog.Logger, namespace, pvcName string) (bool, *v1.PersistentVolumeClaim, error) {
   782  	claim := &v1.PersistentVolumeClaim{
   783  		ObjectMeta: metav1.ObjectMeta{
   784  			Name:      pvcName,
   785  			Namespace: namespace,
   786  		},
   787  	}
   788  	pvcKey := getPVCName(claim)
   789  	pvc, err := b.pvcCache.GetPVC(pvcKey)
   790  	if err != nil || pvc == nil {
   791  		return false, nil, fmt.Errorf("error getting PVC %q: %v", pvcKey, err)
   792  	}
   793  
   794  	fullyBound := b.isPVCFullyBound(pvc)
   795  	if fullyBound {
   796  		logger.V(5).Info("PVC is fully bound to PV", "PVC", klog.KObj(pvc), "PV", klog.KRef("", pvc.Spec.VolumeName))
   797  	} else {
   798  		if pvc.Spec.VolumeName != "" {
   799  			logger.V(5).Info("PVC is not fully bound to PV", "PVC", klog.KObj(pvc), "PV", klog.KRef("", pvc.Spec.VolumeName))
   800  		} else {
   801  			logger.V(5).Info("PVC is not bound", "PVC", klog.KObj(pvc))
   802  		}
   803  	}
   804  	return fullyBound, pvc, nil
   805  }
   806  
   807  func (b *volumeBinder) isPVCFullyBound(pvc *v1.PersistentVolumeClaim) bool {
   808  	return pvc.Spec.VolumeName != "" && metav1.HasAnnotation(pvc.ObjectMeta, volume.AnnBindCompleted)
   809  }
   810  
   811  // arePodVolumesBound returns true if all volumes are fully bound
   812  func (b *volumeBinder) arePodVolumesBound(logger klog.Logger, pod *v1.Pod) bool {
   813  	for _, vol := range pod.Spec.Volumes {
   814  		if isBound, _, _ := b.isVolumeBound(logger, pod, &vol); !isBound {
   815  			// Pod has at least one PVC that needs binding
   816  			return false
   817  		}
   818  	}
   819  	return true
   820  }
   821  
   822  // GetPodVolumeClaims returns a pod's PVCs separated into bound, unbound with delayed binding (including provisioning),
   823  // unbound with immediate binding (including prebound) and PVs that belong to storage classes of unbound PVCs with delayed binding.
   824  func (b *volumeBinder) GetPodVolumeClaims(logger klog.Logger, pod *v1.Pod) (podVolumeClaims *PodVolumeClaims, err error) {
   825  	podVolumeClaims = &PodVolumeClaims{
   826  		boundClaims:               []*v1.PersistentVolumeClaim{},
   827  		unboundClaimsImmediate:    []*v1.PersistentVolumeClaim{},
   828  		unboundClaimsDelayBinding: []*v1.PersistentVolumeClaim{},
   829  	}
   830  
   831  	for _, vol := range pod.Spec.Volumes {
   832  		volumeBound, pvc, err := b.isVolumeBound(logger, pod, &vol)
   833  		if err != nil {
   834  			return podVolumeClaims, err
   835  		}
   836  		if pvc == nil {
   837  			continue
   838  		}
   839  		if volumeBound {
   840  			podVolumeClaims.boundClaims = append(podVolumeClaims.boundClaims, pvc)
   841  		} else {
   842  			delayBindingMode, err := volume.IsDelayBindingMode(pvc, b.classLister)
   843  			if err != nil {
   844  				return podVolumeClaims, err
   845  			}
   846  			// Prebound PVCs are treated as unbound immediate binding
   847  			if delayBindingMode && pvc.Spec.VolumeName == "" {
   848  				// Scheduler path
   849  				podVolumeClaims.unboundClaimsDelayBinding = append(podVolumeClaims.unboundClaimsDelayBinding, pvc)
   850  			} else {
   851  				// !delayBindingMode || pvc.Spec.VolumeName != ""
   852  				// Immediate binding should have already been bound
   853  				podVolumeClaims.unboundClaimsImmediate = append(podVolumeClaims.unboundClaimsImmediate, pvc)
   854  			}
   855  		}
   856  	}
   857  
   858  	podVolumeClaims.unboundVolumesDelayBinding = map[string][]*v1.PersistentVolume{}
   859  	for _, pvc := range podVolumeClaims.unboundClaimsDelayBinding {
   860  		// Get storage class name from each PVC
   861  		storageClassName := volume.GetPersistentVolumeClaimClass(pvc)
   862  		podVolumeClaims.unboundVolumesDelayBinding[storageClassName] = b.pvCache.ListPVs(storageClassName)
   863  	}
   864  	return podVolumeClaims, nil
   865  }
   866  
   867  func (b *volumeBinder) checkBoundClaims(logger klog.Logger, claims []*v1.PersistentVolumeClaim, node *v1.Node, pod *v1.Pod) (bool, bool, error) {
   868  	csiNode, err := b.csiNodeLister.Get(node.Name)
   869  	if err != nil {
   870  		// TODO: return the error once CSINode is created by default
   871  		logger.V(4).Info("Could not get a CSINode object for the node", "node", klog.KObj(node), "err", err)
   872  	}
   873  
   874  	for _, pvc := range claims {
   875  		pvName := pvc.Spec.VolumeName
   876  		pv, err := b.pvCache.GetPV(pvName)
   877  		if err != nil {
   878  			if errors.Is(err, assumecache.ErrNotFound) {
   879  				err = nil
   880  			}
   881  			return true, false, err
   882  		}
   883  
   884  		pv, err = b.tryTranslatePVToCSI(pv, csiNode)
   885  		if err != nil {
   886  			return false, true, err
   887  		}
   888  
   889  		err = volume.CheckNodeAffinity(pv, node.Labels)
   890  		if err != nil {
   891  			logger.V(4).Info("PersistentVolume and node mismatch for pod", "PV", klog.KRef("", pvName), "node", klog.KObj(node), "pod", klog.KObj(pod), "err", err)
   892  			return false, true, nil
   893  		}
   894  		logger.V(5).Info("PersistentVolume and node matches for pod", "PV", klog.KRef("", pvName), "node", klog.KObj(node), "pod", klog.KObj(pod))
   895  	}
   896  
   897  	logger.V(4).Info("All bound volumes for pod match with node", "pod", klog.KObj(pod), "node", klog.KObj(node))
   898  	return true, true, nil
   899  }
   900  
   901  // findMatchingVolumes tries to find matching volumes for given claims,
   902  // and return unbound claims for further provision.
   903  func (b *volumeBinder) findMatchingVolumes(logger klog.Logger, pod *v1.Pod, claimsToBind []*v1.PersistentVolumeClaim, unboundVolumesDelayBinding map[string][]*v1.PersistentVolume, node *v1.Node) (foundMatches bool, bindings []*BindingInfo, unboundClaims []*v1.PersistentVolumeClaim, err error) {
   904  	// Sort all the claims by increasing size request to get the smallest fits
   905  	sort.Sort(byPVCSize(claimsToBind))
   906  
   907  	chosenPVs := map[string]*v1.PersistentVolume{}
   908  
   909  	foundMatches = true
   910  
   911  	for _, pvc := range claimsToBind {
   912  		// Get storage class name from each PVC
   913  		storageClassName := volume.GetPersistentVolumeClaimClass(pvc)
   914  		pvs := unboundVolumesDelayBinding[storageClassName]
   915  
   916  		// Find a matching PV
   917  		pv, err := volume.FindMatchingVolume(pvc, pvs, node, chosenPVs, true)
   918  		if err != nil {
   919  			return false, nil, nil, err
   920  		}
   921  		if pv == nil {
   922  			logger.V(4).Info("No matching volumes for pod", "pod", klog.KObj(pod), "PVC", klog.KObj(pvc), "node", klog.KObj(node))
   923  			unboundClaims = append(unboundClaims, pvc)
   924  			foundMatches = false
   925  			continue
   926  		}
   927  
   928  		// matching PV needs to be excluded so we don't select it again
   929  		chosenPVs[pv.Name] = pv
   930  		bindings = append(bindings, &BindingInfo{pv: pv, pvc: pvc})
   931  		logger.V(5).Info("Found matching PV for PVC for pod", "PV", klog.KObj(pv), "PVC", klog.KObj(pvc), "node", klog.KObj(node), "pod", klog.KObj(pod))
   932  	}
   933  
   934  	if foundMatches {
   935  		logger.V(4).Info("Found matching volumes for pod", "pod", klog.KObj(pod), "node", klog.KObj(node))
   936  	}
   937  
   938  	return
   939  }
   940  
   941  // checkVolumeProvisions checks given unbound claims (the claims have gone through func
   942  // findMatchingVolumes, and do not have matching volumes for binding), and return true
   943  // if all of the claims are eligible for dynamic provision.
   944  func (b *volumeBinder) checkVolumeProvisions(logger klog.Logger, pod *v1.Pod, claimsToProvision []*v1.PersistentVolumeClaim, node *v1.Node) (provisionSatisfied, sufficientStorage bool, dynamicProvisions []*v1.PersistentVolumeClaim, err error) {
   945  	dynamicProvisions = []*v1.PersistentVolumeClaim{}
   946  
   947  	// We return early with provisionedClaims == nil if a check
   948  	// fails or we encounter an error.
   949  	for _, claim := range claimsToProvision {
   950  		pvcName := getPVCName(claim)
   951  		className := volume.GetPersistentVolumeClaimClass(claim)
   952  		if className == "" {
   953  			return false, false, nil, fmt.Errorf("no class for claim %q", pvcName)
   954  		}
   955  
   956  		class, err := b.classLister.Get(className)
   957  		if err != nil {
   958  			return false, false, nil, fmt.Errorf("failed to find storage class %q", className)
   959  		}
   960  		provisioner := class.Provisioner
   961  		if provisioner == "" || provisioner == volume.NotSupportedProvisioner {
   962  			logger.V(4).Info("Storage class of claim does not support dynamic provisioning", "storageClassName", className, "PVC", klog.KObj(claim))
   963  			return false, true, nil, nil
   964  		}
   965  
   966  		// Check if the node can satisfy the topology requirement in the class
   967  		if !v1helper.MatchTopologySelectorTerms(class.AllowedTopologies, labels.Set(node.Labels)) {
   968  			logger.V(4).Info("Node cannot satisfy provisioning topology requirements of claim", "node", klog.KObj(node), "PVC", klog.KObj(claim))
   969  			return false, true, nil, nil
   970  		}
   971  
   972  		// Check storage capacity.
   973  		sufficient, err := b.hasEnoughCapacity(logger, provisioner, claim, class, node)
   974  		if err != nil {
   975  			return false, false, nil, err
   976  		}
   977  		if !sufficient {
   978  			// hasEnoughCapacity logs an explanation.
   979  			return true, false, nil, nil
   980  		}
   981  
   982  		dynamicProvisions = append(dynamicProvisions, claim)
   983  
   984  	}
   985  	logger.V(4).Info("Provisioning for claims of pod that has no matching volumes...", "claimCount", len(claimsToProvision), "pod", klog.KObj(pod), "node", klog.KObj(node))
   986  
   987  	return true, true, dynamicProvisions, nil
   988  }
   989  
   990  func (b *volumeBinder) revertAssumedPVs(bindings []*BindingInfo) {
   991  	for _, BindingInfo := range bindings {
   992  		b.pvCache.Restore(BindingInfo.pv.Name)
   993  	}
   994  }
   995  
   996  func (b *volumeBinder) revertAssumedPVCs(claims []*v1.PersistentVolumeClaim) {
   997  	for _, claim := range claims {
   998  		b.pvcCache.Restore(getPVCName(claim))
   999  	}
  1000  }
  1001  
  1002  // hasEnoughCapacity checks whether the provisioner has enough capacity left for a new volume of the given size
  1003  // that is available from the node.
  1004  func (b *volumeBinder) hasEnoughCapacity(logger klog.Logger, provisioner string, claim *v1.PersistentVolumeClaim, storageClass *storagev1.StorageClass, node *v1.Node) (bool, error) {
  1005  	quantity, ok := claim.Spec.Resources.Requests[v1.ResourceStorage]
  1006  	if !ok {
  1007  		// No capacity to check for.
  1008  		return true, nil
  1009  	}
  1010  
  1011  	// Only enabled for CSI drivers which opt into it.
  1012  	driver, err := b.csiDriverLister.Get(provisioner)
  1013  	if err != nil {
  1014  		if apierrors.IsNotFound(err) {
  1015  			// Either the provisioner is not a CSI driver or the driver does not
  1016  			// opt into storage capacity scheduling. Either way, skip
  1017  			// capacity checking.
  1018  			return true, nil
  1019  		}
  1020  		return false, err
  1021  	}
  1022  	if driver.Spec.StorageCapacity == nil || !*driver.Spec.StorageCapacity {
  1023  		return true, nil
  1024  	}
  1025  
  1026  	// Look for a matching CSIStorageCapacity object(s).
  1027  	// TODO (for beta): benchmark this and potentially introduce some kind of lookup structure (https://github.com/kubernetes/enhancements/issues/1698#issuecomment-654356718).
  1028  	capacities, err := b.csiStorageCapacityLister.List(labels.Everything())
  1029  	if err != nil {
  1030  		return false, err
  1031  	}
  1032  
  1033  	sizeInBytes := quantity.Value()
  1034  	for _, capacity := range capacities {
  1035  		if capacity.StorageClassName == storageClass.Name &&
  1036  			capacitySufficient(capacity, sizeInBytes) &&
  1037  			b.nodeHasAccess(logger, node, capacity) {
  1038  			// Enough capacity found.
  1039  			return true, nil
  1040  		}
  1041  	}
  1042  
  1043  	// TODO (?): this doesn't give any information about which pools where considered and why
  1044  	// they had to be rejected. Log that above? But that might be a lot of log output...
  1045  	logger.V(4).Info("Node has no accessible CSIStorageCapacity with enough capacity for PVC",
  1046  		"node", klog.KObj(node), "PVC", klog.KObj(claim), "size", sizeInBytes, "storageClass", klog.KObj(storageClass))
  1047  	return false, nil
  1048  }
  1049  
  1050  func capacitySufficient(capacity *storagev1.CSIStorageCapacity, sizeInBytes int64) bool {
  1051  	limit := capacity.Capacity
  1052  	if capacity.MaximumVolumeSize != nil {
  1053  		// Prefer MaximumVolumeSize if available, it is more precise.
  1054  		limit = capacity.MaximumVolumeSize
  1055  	}
  1056  	return limit != nil && limit.Value() >= sizeInBytes
  1057  }
  1058  
  1059  func (b *volumeBinder) nodeHasAccess(logger klog.Logger, node *v1.Node, capacity *storagev1.CSIStorageCapacity) bool {
  1060  	if capacity.NodeTopology == nil {
  1061  		// Unavailable
  1062  		return false
  1063  	}
  1064  	// Only matching by label is supported.
  1065  	selector, err := metav1.LabelSelectorAsSelector(capacity.NodeTopology)
  1066  	if err != nil {
  1067  		logger.Error(err, "Unexpected error converting to a label selector", "nodeTopology", capacity.NodeTopology)
  1068  		return false
  1069  	}
  1070  	return selector.Matches(labels.Set(node.Labels))
  1071  }
  1072  
  1073  type byPVCSize []*v1.PersistentVolumeClaim
  1074  
  1075  func (a byPVCSize) Len() int {
  1076  	return len(a)
  1077  }
  1078  
  1079  func (a byPVCSize) Swap(i, j int) {
  1080  	a[i], a[j] = a[j], a[i]
  1081  }
  1082  
  1083  func (a byPVCSize) Less(i, j int) bool {
  1084  	iSize := a[i].Spec.Resources.Requests[v1.ResourceStorage]
  1085  	jSize := a[j].Spec.Resources.Requests[v1.ResourceStorage]
  1086  	// return true if iSize is less than jSize
  1087  	return iSize.Cmp(jSize) == -1
  1088  }
  1089  
  1090  // isCSIMigrationOnForPlugin checks if CSI migration is enabled for a given plugin.
  1091  func isCSIMigrationOnForPlugin(pluginName string) bool {
  1092  	switch pluginName {
  1093  	case csiplugins.AWSEBSInTreePluginName:
  1094  		return true
  1095  	case csiplugins.GCEPDInTreePluginName:
  1096  		return true
  1097  	case csiplugins.AzureDiskInTreePluginName:
  1098  		return true
  1099  	case csiplugins.CinderInTreePluginName:
  1100  		return true
  1101  	case csiplugins.PortworxVolumePluginName:
  1102  		return utilfeature.DefaultFeatureGate.Enabled(features.CSIMigrationPortworx)
  1103  	}
  1104  	return false
  1105  }
  1106  
  1107  // isPluginMigratedToCSIOnNode checks if an in-tree plugin has been migrated to a CSI driver on the node.
  1108  func isPluginMigratedToCSIOnNode(pluginName string, csiNode *storagev1.CSINode) bool {
  1109  	if csiNode == nil {
  1110  		return false
  1111  	}
  1112  
  1113  	csiNodeAnn := csiNode.GetAnnotations()
  1114  	if csiNodeAnn == nil {
  1115  		return false
  1116  	}
  1117  
  1118  	var mpaSet sets.Set[string]
  1119  	mpa := csiNodeAnn[v1.MigratedPluginsAnnotationKey]
  1120  	if len(mpa) == 0 {
  1121  		mpaSet = sets.New[string]()
  1122  	} else {
  1123  		tok := strings.Split(mpa, ",")
  1124  		mpaSet = sets.New(tok...)
  1125  	}
  1126  
  1127  	return mpaSet.Has(pluginName)
  1128  }
  1129  
  1130  // tryTranslatePVToCSI will translate the in-tree PV to CSI if it meets the criteria. If not, it returns the unmodified in-tree PV.
  1131  func (b *volumeBinder) tryTranslatePVToCSI(pv *v1.PersistentVolume, csiNode *storagev1.CSINode) (*v1.PersistentVolume, error) {
  1132  	if !b.translator.IsPVMigratable(pv) {
  1133  		return pv, nil
  1134  	}
  1135  
  1136  	pluginName, err := b.translator.GetInTreePluginNameFromSpec(pv, nil)
  1137  	if err != nil {
  1138  		return nil, fmt.Errorf("could not get plugin name from pv: %v", err)
  1139  	}
  1140  
  1141  	if !isCSIMigrationOnForPlugin(pluginName) {
  1142  		return pv, nil
  1143  	}
  1144  
  1145  	if !isPluginMigratedToCSIOnNode(pluginName, csiNode) {
  1146  		return pv, nil
  1147  	}
  1148  
  1149  	transPV, err := b.translator.TranslateInTreePVToCSI(pv)
  1150  	if err != nil {
  1151  		return nil, fmt.Errorf("could not translate pv: %v", err)
  1152  	}
  1153  
  1154  	return transPV, nil
  1155  }