volcano.sh/volcano@v1.9.0/pkg/scheduler/capabilities/volumebinding/binder.go (about)

     1  /*
     2  Copyright 2017 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package volumebinding
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"sort"
    23  	"strings"
    24  	"time"
    25  
    26  	v1 "k8s.io/api/core/v1"
    27  	storagev1 "k8s.io/api/storage/v1"
    28  	storagev1beta1 "k8s.io/api/storage/v1beta1"
    29  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    30  	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    31  	"k8s.io/apimachinery/pkg/labels"
    32  	"k8s.io/apimachinery/pkg/util/sets"
    33  	"k8s.io/apimachinery/pkg/util/wait"
    34  	"k8s.io/apiserver/pkg/storage"
    35  	utilfeature "k8s.io/apiserver/pkg/util/feature"
    36  	coreinformers "k8s.io/client-go/informers/core/v1"
    37  	storageinformers "k8s.io/client-go/informers/storage/v1"
    38  	storageinformersv1beta1 "k8s.io/client-go/informers/storage/v1beta1"
    39  	clientset "k8s.io/client-go/kubernetes"
    40  	corelisters "k8s.io/client-go/listers/core/v1"
    41  	storagelisters "k8s.io/client-go/listers/storage/v1"
    42  	storagelistersv1beta1 "k8s.io/client-go/listers/storage/v1beta1"
    43  	"k8s.io/component-helpers/storage/ephemeral"
    44  	"k8s.io/component-helpers/storage/volume"
    45  	csitrans "k8s.io/csi-translation-lib"
    46  	csiplugins "k8s.io/csi-translation-lib/plugins"
    47  	"k8s.io/klog/v2"
    48  	v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
    49  	"k8s.io/kubernetes/pkg/features"
    50  	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding/metrics"
    51  	"k8s.io/kubernetes/pkg/volume/util"
    52  )
    53  
    54  // ConflictReason is used for the special strings which explain why
    55  // volume binding is impossible for a node.
    56  type ConflictReason string
    57  
    58  // ConflictReasons contains all reasons that explain why volume binding is impossible for a node.
    59  type ConflictReasons []ConflictReason
    60  
    61  func (reasons ConflictReasons) Len() int           { return len(reasons) }
    62  func (reasons ConflictReasons) Less(i, j int) bool { return reasons[i] < reasons[j] }
    63  func (reasons ConflictReasons) Swap(i, j int)      { reasons[i], reasons[j] = reasons[j], reasons[i] }
    64  
// Conflict reasons appended to the result of FindPodVolumes when a node
// cannot satisfy a pod's volume requirements.
const (
	// ErrReasonBindConflict is used for VolumeBindingNoMatch predicate error.
	ErrReasonBindConflict ConflictReason = "node(s) didn't find available persistent volumes to bind"
	// ErrReasonNodeConflict is used for VolumeNodeAffinityConflict predicate error.
	ErrReasonNodeConflict ConflictReason = "node(s) had volume node affinity conflict"
	// ErrReasonNotEnoughSpace is used when a pod cannot start on a node because not enough storage space is available.
	// NOTE(review): declared without an explicit type, so this is an untyped
	// string constant rather than a ConflictReason.
	ErrReasonNotEnoughSpace = "node(s) did not have enough free storage"
	// ErrReasonPVNotExist is used when a pod has one or more PVC(s) bound to non-existent persistent volume(s).
	// NOTE(review): untyped string constant, like ErrReasonNotEnoughSpace.
	ErrReasonPVNotExist = "node(s) unavailable due to one or more pvc(s) bound to non-existent pv(s)"
)
    75  
// BindingInfo holds a binding between PV and PVC: a claim that still needs
// to be bound and the volume proposed for it.
type BindingInfo struct {
	// PVC that needs to be bound
	pvc *v1.PersistentVolumeClaim

	// Proposed PV to bind to this PVC
	pv *v1.PersistentVolume
}
    84  
    85  // StorageClassName returns the name of the storage class.
    86  func (b *BindingInfo) StorageClassName() string {
    87  	return b.pv.Spec.StorageClassName
    88  }
    89  
// StorageResource represents storage resource.
//
// As populated by BindingInfo.StorageResource, Requested is the storage
// request taken from the PVC and Capacity is the storage capacity taken from
// the proposed PV.
type StorageResource struct {
	Requested int64
	Capacity  int64
}
    95  
    96  // StorageResource returns storage resource.
    97  func (b *BindingInfo) StorageResource() *StorageResource {
    98  	// both fields are mandatory
    99  	requestedQty := b.pvc.Spec.Resources.Requests[v1.ResourceName(v1.ResourceStorage)]
   100  	capacityQty := b.pv.Spec.Capacity[v1.ResourceName(v1.ResourceStorage)]
   101  	return &StorageResource{
   102  		Requested: requestedQty.Value(),
   103  		Capacity:  capacityQty.Value(),
   104  	}
   105  }
   106  
// PodVolumes holds pod's volumes information used in volume scheduling.
// FindPodVolumes fills it in for a candidate node; AssumePodVolumes later
// replaces both fields with the objects it assumed in the caches.
type PodVolumes struct {
	// StaticBindings are binding decisions for PVCs which can be bound to
	// pre-provisioned static PVs.
	StaticBindings []*BindingInfo
	// DynamicProvisions are PVCs that require dynamic provisioning
	DynamicProvisions []*v1.PersistentVolumeClaim
}
   115  
// InTreeToCSITranslator contains methods required to check migratable status
// and perform translations from InTree PV's to CSI.
// NewVolumeBinder supplies csitrans.New() as the implementation.
type InTreeToCSITranslator interface {
	IsPVMigratable(pv *v1.PersistentVolume) bool
	GetInTreePluginNameFromSpec(pv *v1.PersistentVolume, vol *v1.Volume) (string, error)
	TranslateInTreePVToCSI(pv *v1.PersistentVolume) (*v1.PersistentVolume, error)
}
   123  
// SchedulerVolumeBinder is used by the scheduler VolumeBinding plugin to
// handle PVC/PV binding and dynamic provisioning. The binding decisions are
// integrated into the pod scheduling workflow so that the PV NodeAffinity is
// also considered along with the pod's other scheduling requirements.
//
// This integrates into the existing scheduler workflow as follows:
//  1. The scheduler takes a Pod off the scheduler queue and processes it serially:
//     a. Invokes all pre-filter plugins for the pod. GetPodVolumeClaims() is invoked
//     here, pod volume information will be saved in current scheduling cycle state for later use.
//     If pod has bound immediate PVCs, GetEligibleNodes() is invoked to potentially reduce
//     down the list of eligible nodes based on the bound PV's NodeAffinity (if any).
//     b. Invokes all filter plugins, parallelized across nodes.  FindPodVolumes() is invoked here.
//     c. Invokes all score plugins.  Future/TBD
//     d. Selects the best node for the Pod.
//     e. Invokes all reserve plugins. AssumePodVolumes() is invoked here.
//     i.  If PVC binding is required, cache in-memory only:
//     * For manual binding: update PV objects for prebinding to the corresponding PVCs.
//     * For dynamic provisioning: update PVC object with the node selected in d)
//     * For the pod, which PVCs and PVs need API updates.
//     ii. Afterwards, the main scheduler caches the Pod->Node binding in the scheduler's pod cache,
//     This is handled in the scheduler and not here.
//     f. Asynchronously bind volumes and pod in a separate goroutine
//     i.  BindPodVolumes() is called first in PreBind phase. It makes all the necessary API updates and waits for
//     PV controller to fully bind and provision the PVCs. If binding fails, the Pod is sent
//     back through the scheduler.
//     ii. After BindPodVolumes() is complete, then the scheduler does the final Pod->Node binding.
//  2. Once all the assume operations are done in e), the scheduler processes the next Pod in the scheduler queue
//     while the actual binding operation occurs in the background.
type SchedulerVolumeBinder interface {
	// GetPodVolumeClaims returns a pod's PVCs separated into bound, unbound with delayed binding (including provisioning),
	// unbound with immediate binding (including prebound) and PVs that belong to storage classes of unbound PVCs with delayed binding.
	GetPodVolumeClaims(logger klog.Logger, pod *v1.Pod) (podVolumeClaims *PodVolumeClaims, err error)

	// GetEligibleNodes checks the existing bound claims of the pod to determine if the list of nodes can be
	// potentially reduced down to a subset of eligible nodes based on the bound claims which then can be used
	// in subsequent scheduling stages.
	//
	// If eligibleNodes is 'nil', then it indicates that such eligible node reduction cannot be made
	// and all nodes should be considered.
	GetEligibleNodes(logger klog.Logger, boundClaims []*v1.PersistentVolumeClaim) (eligibleNodes sets.Set[string])

	// FindPodVolumes checks if all of a Pod's PVCs can be satisfied by the
	// node and returns pod's volumes information.
	//
	// If a PVC is bound, it checks if the PV's NodeAffinity matches the Node.
	// Otherwise, it tries to find an available PV to bind to the PVC.
	//
	// It returns an error when something went wrong or a list of reasons why the node is
	// (currently) not usable for the pod.
	//
	// If the CSIStorageCapacity feature is enabled, then it also checks for sufficient storage
	// for volumes that still need to be created.
	//
	// This function is called by the scheduler VolumeBinding plugin and can be called in parallel
	FindPodVolumes(logger klog.Logger, pod *v1.Pod, podVolumeClaims *PodVolumeClaims, node *v1.Node) (podVolumes *PodVolumes, reasons ConflictReasons, err error)

	// AssumePodVolumes will:
	// 1. Take the PV matches for unbound PVCs and update the PV cache assuming
	// that the PV is prebound to the PVC.
	// 2. Take the PVCs that need provisioning and update the PVC cache with related
	// annotations set.
	//
	// It returns true if all volumes are fully bound
	//
	// This function is called serially.
	AssumePodVolumes(logger klog.Logger, assumedPod *v1.Pod, nodeName string, podVolumes *PodVolumes) (allFullyBound bool, err error)

	// RevertAssumedPodVolumes will revert assumed PV and PVC cache.
	RevertAssumedPodVolumes(podVolumes *PodVolumes)

	// BindPodVolumes will:
	// 1. Initiate the volume binding by making the API call to prebind the PV
	// to its matching PVC.
	// 2. Trigger the volume provisioning by making the API call to set related
	// annotations on the PVC
	// 3. Wait for PVCs to be completely bound by the PV controller
	//
	// This function can be called in parallel.
	BindPodVolumes(ctx context.Context, assumedPod *v1.Pod, podVolumes *PodVolumes) error
}
   204  
// PodVolumeClaims holds a pod's PVCs grouped by binding state, together with
// the candidate PVs for its delayed-binding claims. FindPodVolumes reads
// these fields when evaluating a node.
type PodVolumeClaims struct {
	// boundClaims are the pod's bound PVCs.
	boundClaims []*v1.PersistentVolumeClaim
	// unboundClaimsDelayBinding are the pod's unbound with delayed binding (including provisioning) PVCs.
	unboundClaimsDelayBinding []*v1.PersistentVolumeClaim
	// unboundClaimsImmediate are the pod's unbound with immediate binding PVCs (i.e., supposed to be bound already) .
	unboundClaimsImmediate []*v1.PersistentVolumeClaim
	// unboundVolumesDelayBinding are PVs that belong to storage classes of the pod's unbound PVCs with delayed binding.
	// NOTE(review): the map key is presumably the storage class name — confirm against GetPodVolumeClaims.
	unboundVolumesDelayBinding map[string][]*v1.PersistentVolume
}
   215  
// volumeBinder is the SchedulerVolumeBinder implementation returned by
// NewVolumeBinder.
type volumeBinder struct {
	// Client used to send PV and PVC updates to the API server.
	kubeClient clientset.Interface

	// Informer-backed listers for storage classes, pods, nodes and CSINodes.
	classLister   storagelisters.StorageClassLister
	podLister     corelisters.PodLister
	nodeLister    corelisters.NodeLister
	csiNodeLister storagelisters.CSINodeLister

	// Assume caches layered on the PVC and PV informers.
	pvcCache PVCAssumeCache
	pvCache  PVAssumeCache

	// Amount of time to wait for the bind operation to succeed
	bindTimeout time.Duration

	// Translates in-tree PVs to their CSI form.
	translator InTreeToCSITranslator

	// Storage capacity checking. The two listers are only set when
	// NewVolumeBinder receives a non-nil CapacityCheck, in which case
	// capacityCheckEnabled is true.
	capacityCheckEnabled     bool
	csiDriverLister          storagelisters.CSIDriverLister
	csiStorageCapacityLister storagelistersv1beta1.CSIStorageCapacityLister
}

// Compile-time check that *volumeBinder implements SchedulerVolumeBinder.
var _ SchedulerVolumeBinder = &volumeBinder{}
   238  
// CapacityCheck contains additional parameters for NewVolumeBinder that
// are only needed when checking volume sizes against available storage
// capacity is desired.
//
// NewVolumeBinder takes the CSIDriver and CSIStorageCapacity listers from
// these informers when a non-nil CapacityCheck is supplied.
type CapacityCheck struct {
	CSIDriverInformer          storageinformers.CSIDriverInformer
	CSIStorageCapacityInformer storageinformersv1beta1.CSIStorageCapacityInformer
}
   246  
   247  // NewVolumeBinder sets up all the caches needed for the scheduler to make volume binding decisions.
   248  //
   249  // capacityCheck determines how storage capacity is checked (CSIStorageCapacity feature).
   250  func NewVolumeBinder(
   251  	logger klog.Logger,
   252  	kubeClient clientset.Interface,
   253  	podInformer coreinformers.PodInformer,
   254  	nodeInformer coreinformers.NodeInformer,
   255  	csiNodeInformer storageinformers.CSINodeInformer,
   256  	pvcInformer coreinformers.PersistentVolumeClaimInformer,
   257  	pvInformer coreinformers.PersistentVolumeInformer,
   258  	storageClassInformer storageinformers.StorageClassInformer,
   259  	capacityCheck *CapacityCheck,
   260  	bindTimeout time.Duration) SchedulerVolumeBinder {
   261  	b := &volumeBinder{
   262  		kubeClient:    kubeClient,
   263  		podLister:     podInformer.Lister(),
   264  		classLister:   storageClassInformer.Lister(),
   265  		nodeLister:    nodeInformer.Lister(),
   266  		csiNodeLister: csiNodeInformer.Lister(),
   267  		pvcCache:      NewPVCAssumeCache(logger, pvcInformer.Informer()),
   268  		pvCache:       NewPVAssumeCache(logger, pvInformer.Informer()),
   269  		bindTimeout:   bindTimeout,
   270  		translator:    csitrans.New(),
   271  	}
   272  
   273  	if capacityCheck != nil {
   274  		b.capacityCheckEnabled = true
   275  		b.csiDriverLister = capacityCheck.CSIDriverInformer.Lister()
   276  		b.csiStorageCapacityLister = capacityCheck.CSIStorageCapacityInformer.Lister()
   277  	}
   278  
   279  	return b
   280  }
   281  
   282  // FindPodVolumes finds the matching PVs for PVCs and nodes to provision PVs
   283  // for the given pod and node. If the node does not fit, conflict reasons are
   284  // returned.
   285  func (b *volumeBinder) FindPodVolumes(logger klog.Logger, pod *v1.Pod, podVolumeClaims *PodVolumeClaims, node *v1.Node) (podVolumes *PodVolumes, reasons ConflictReasons, err error) {
   286  	podVolumes = &PodVolumes{}
   287  
   288  	// Warning: Below log needs high verbosity as it can be printed several times (#60933).
   289  	logger.V(5).Info("FindPodVolumes", "pod", klog.KObj(pod), "node", klog.KObj(node))
   290  
   291  	// Initialize to true for pods that don't have volumes. These
   292  	// booleans get translated into reason strings when the function
   293  	// returns without an error.
   294  	unboundVolumesSatisfied := true
   295  	boundVolumesSatisfied := true
   296  	sufficientStorage := true
   297  	boundPVsFound := true
   298  	defer func() {
   299  		if err != nil {
   300  			return
   301  		}
   302  		if !boundVolumesSatisfied {
   303  			reasons = append(reasons, ErrReasonNodeConflict)
   304  		}
   305  		if !unboundVolumesSatisfied {
   306  			reasons = append(reasons, ErrReasonBindConflict)
   307  		}
   308  		if !sufficientStorage {
   309  			reasons = append(reasons, ErrReasonNotEnoughSpace)
   310  		}
   311  		if !boundPVsFound {
   312  			reasons = append(reasons, ErrReasonPVNotExist)
   313  		}
   314  	}()
   315  
   316  	defer func() {
   317  		if err != nil {
   318  			metrics.VolumeSchedulingStageFailed.WithLabelValues("predicate").Inc()
   319  		}
   320  	}()
   321  
   322  	var (
   323  		staticBindings    []*BindingInfo
   324  		dynamicProvisions []*v1.PersistentVolumeClaim
   325  	)
   326  	defer func() {
   327  		// Although we do not distinguish nil from empty in this function, for
   328  		// easier testing, we normalize empty to nil.
   329  		if len(staticBindings) == 0 {
   330  			staticBindings = nil
   331  		}
   332  		if len(dynamicProvisions) == 0 {
   333  			dynamicProvisions = nil
   334  		}
   335  		podVolumes.StaticBindings = staticBindings
   336  		podVolumes.DynamicProvisions = dynamicProvisions
   337  	}()
   338  
   339  	// Check PV node affinity on bound volumes
   340  	if len(podVolumeClaims.boundClaims) > 0 {
   341  		boundVolumesSatisfied, boundPVsFound, err = b.checkBoundClaims(logger, podVolumeClaims.boundClaims, node, pod)
   342  		if err != nil {
   343  			return
   344  		}
   345  	}
   346  
   347  	// Find matching volumes and node for unbound claims
   348  	if len(podVolumeClaims.unboundClaimsDelayBinding) > 0 {
   349  		var (
   350  			claimsToFindMatching []*v1.PersistentVolumeClaim
   351  			claimsToProvision    []*v1.PersistentVolumeClaim
   352  		)
   353  
   354  		// Filter out claims to provision
   355  		for _, claim := range podVolumeClaims.unboundClaimsDelayBinding {
   356  			if selectedNode, ok := claim.Annotations[volume.AnnSelectedNode]; ok {
   357  				if selectedNode != node.Name {
   358  					// Fast path, skip unmatched node.
   359  					unboundVolumesSatisfied = false
   360  					return
   361  				}
   362  				claimsToProvision = append(claimsToProvision, claim)
   363  			} else {
   364  				claimsToFindMatching = append(claimsToFindMatching, claim)
   365  			}
   366  		}
   367  
   368  		// Find matching volumes
   369  		if len(claimsToFindMatching) > 0 {
   370  			var unboundClaims []*v1.PersistentVolumeClaim
   371  			unboundVolumesSatisfied, staticBindings, unboundClaims, err = b.findMatchingVolumes(logger, pod, claimsToFindMatching, podVolumeClaims.unboundVolumesDelayBinding, node)
   372  			if err != nil {
   373  				return
   374  			}
   375  			claimsToProvision = append(claimsToProvision, unboundClaims...)
   376  		}
   377  
   378  		// Check for claims to provision. This is the first time where we potentially
   379  		// find out that storage is not sufficient for the node.
   380  		if len(claimsToProvision) > 0 {
   381  			unboundVolumesSatisfied, sufficientStorage, dynamicProvisions, err = b.checkVolumeProvisions(logger, pod, claimsToProvision, node)
   382  			if err != nil {
   383  				return
   384  			}
   385  		}
   386  	}
   387  
   388  	return
   389  }
   390  
   391  // GetEligibleNodes checks the existing bound claims of the pod to determine if the list of nodes can be
   392  // potentially reduced down to a subset of eligible nodes based on the bound claims which then can be used
   393  // in subsequent scheduling stages.
   394  //
   395  // Returning 'nil' for eligibleNodes indicates that such eligible node reduction cannot be made and all nodes
   396  // should be considered.
   397  func (b *volumeBinder) GetEligibleNodes(logger klog.Logger, boundClaims []*v1.PersistentVolumeClaim) (eligibleNodes sets.Set[string]) {
   398  	if len(boundClaims) == 0 {
   399  		return
   400  	}
   401  
   402  	var errs []error
   403  	for _, pvc := range boundClaims {
   404  		pvName := pvc.Spec.VolumeName
   405  		pv, err := b.pvCache.GetPV(pvName)
   406  		if err != nil {
   407  			errs = append(errs, err)
   408  			continue
   409  		}
   410  
   411  		// if the PersistentVolume is local and has node affinity matching specific node(s),
   412  		// add them to the eligible nodes
   413  		nodeNames := util.GetLocalPersistentVolumeNodeNames(pv)
   414  		if len(nodeNames) != 0 {
   415  			// on the first found list of eligible nodes for the local PersistentVolume,
   416  			// insert to the eligible node set.
   417  			if eligibleNodes == nil {
   418  				eligibleNodes = sets.New(nodeNames...)
   419  			} else {
   420  				// for subsequent finding of eligible nodes for the local PersistentVolume,
   421  				// take the intersection of the nodes with the existing eligible nodes
   422  				// for cases if PV1 has node affinity to node1 and PV2 has node affinity to node2,
   423  				// then the eligible node list should be empty.
   424  				eligibleNodes = eligibleNodes.Intersection(sets.New(nodeNames...))
   425  			}
   426  		}
   427  	}
   428  
   429  	if len(errs) > 0 {
   430  		logger.V(4).Info("GetEligibleNodes: one or more error occurred finding eligible nodes", "error", errs)
   431  		return nil
   432  	}
   433  
   434  	if eligibleNodes != nil {
   435  		logger.V(4).Info("GetEligibleNodes: reduced down eligible nodes", "nodes", eligibleNodes)
   436  	}
   437  	return
   438  }
   439  
   440  // AssumePodVolumes will take the matching PVs and PVCs to provision in pod's
   441  // volume information for the chosen node, and:
   442  // 1. Update the pvCache with the new prebound PV.
   443  // 2. Update the pvcCache with the new PVCs with annotations set
   444  // 3. Update PodVolumes again with cached API updates for PVs and PVCs.
   445  func (b *volumeBinder) AssumePodVolumes(logger klog.Logger, assumedPod *v1.Pod, nodeName string, podVolumes *PodVolumes) (allFullyBound bool, err error) {
   446  	logger.V(4).Info("AssumePodVolumes", "pod", klog.KObj(assumedPod), "node", klog.KRef("", nodeName))
   447  	defer func() {
   448  		if err != nil {
   449  			metrics.VolumeSchedulingStageFailed.WithLabelValues("assume").Inc()
   450  		}
   451  	}()
   452  
   453  	if allBound := b.arePodVolumesBound(logger, assumedPod); allBound {
   454  		logger.V(4).Info("AssumePodVolumes: all PVCs bound and nothing to do", "pod", klog.KObj(assumedPod), "node", klog.KRef("", nodeName))
   455  		return true, nil
   456  	}
   457  
   458  	// Assume PV
   459  	newBindings := []*BindingInfo{}
   460  	for _, binding := range podVolumes.StaticBindings {
   461  		newPV, dirty, err := volume.GetBindVolumeToClaim(binding.pv, binding.pvc)
   462  		logger.V(5).Info("AssumePodVolumes: GetBindVolumeToClaim",
   463  			"pod", klog.KObj(assumedPod),
   464  			"PV", klog.KObj(binding.pv),
   465  			"PVC", klog.KObj(binding.pvc),
   466  			"newPV", klog.KObj(newPV),
   467  			"dirty", dirty,
   468  		)
   469  		if err != nil {
   470  			logger.Error(err, "AssumePodVolumes: fail to GetBindVolumeToClaim")
   471  			b.revertAssumedPVs(newBindings)
   472  			return false, err
   473  		}
   474  		// TODO: can we assume every time?
   475  		if dirty {
   476  			err = b.pvCache.Assume(newPV)
   477  			if err != nil {
   478  				b.revertAssumedPVs(newBindings)
   479  				return false, err
   480  			}
   481  		}
   482  		newBindings = append(newBindings, &BindingInfo{pv: newPV, pvc: binding.pvc})
   483  	}
   484  
   485  	// Assume PVCs
   486  	newProvisionedPVCs := []*v1.PersistentVolumeClaim{}
   487  	for _, claim := range podVolumes.DynamicProvisions {
   488  		// The claims from method args can be pointing to watcher cache. We must not
   489  		// modify these, therefore create a copy.
   490  		claimClone := claim.DeepCopy()
   491  		metav1.SetMetaDataAnnotation(&claimClone.ObjectMeta, volume.AnnSelectedNode, nodeName)
   492  		err = b.pvcCache.Assume(claimClone)
   493  		if err != nil {
   494  			b.revertAssumedPVs(newBindings)
   495  			b.revertAssumedPVCs(newProvisionedPVCs)
   496  			return
   497  		}
   498  
   499  		newProvisionedPVCs = append(newProvisionedPVCs, claimClone)
   500  	}
   501  
   502  	podVolumes.StaticBindings = newBindings
   503  	podVolumes.DynamicProvisions = newProvisionedPVCs
   504  	return
   505  }
   506  
   507  // RevertAssumedPodVolumes will revert assumed PV and PVC cache.
   508  func (b *volumeBinder) RevertAssumedPodVolumes(podVolumes *PodVolumes) {
   509  	b.revertAssumedPVs(podVolumes.StaticBindings)
   510  	b.revertAssumedPVCs(podVolumes.DynamicProvisions)
   511  }
   512  
   513  // BindPodVolumes gets the cached bindings and PVCs to provision in pod's volumes information,
   514  // makes the API update for those PVs/PVCs, and waits for the PVCs to be completely bound
   515  // by the PV controller.
   516  func (b *volumeBinder) BindPodVolumes(ctx context.Context, assumedPod *v1.Pod, podVolumes *PodVolumes) (err error) {
   517  	logger := klog.FromContext(ctx)
   518  	logger.V(4).Info("BindPodVolumes", "pod", klog.KObj(assumedPod), "node", klog.KRef("", assumedPod.Spec.NodeName))
   519  
   520  	defer func() {
   521  		if err != nil {
   522  			metrics.VolumeSchedulingStageFailed.WithLabelValues("bind").Inc()
   523  		}
   524  	}()
   525  
   526  	if podVolumes == nil {
   527  		klog.Infof("BindPodVolumes for pod(%s): pod volumes is nil", assumedPod.Name)
   528  		return nil
   529  	}
   530  
   531  	bindings := podVolumes.StaticBindings
   532  	claimsToProvision := podVolumes.DynamicProvisions
   533  
   534  	// Start API operations
   535  	err = b.bindAPIUpdate(ctx, assumedPod, bindings, claimsToProvision)
   536  	if err != nil {
   537  		return err
   538  	}
   539  
   540  	err = wait.PollUntilContextTimeout(ctx, time.Second, b.bindTimeout, false, func(ctx context.Context) (bool, error) {
   541  		b, err := b.checkBindings(logger, assumedPod, bindings, claimsToProvision)
   542  		return b, err
   543  	})
   544  	if err != nil {
   545  		return fmt.Errorf("binding volumes: %w", err)
   546  	}
   547  	return nil
   548  }
   549  
   550  func getPodName(pod *v1.Pod) string {
   551  	return pod.Namespace + "/" + pod.Name
   552  }
   553  
   554  func getPVCName(pvc *v1.PersistentVolumeClaim) string {
   555  	return pvc.Namespace + "/" + pvc.Name
   556  }
   557  
   558  // bindAPIUpdate makes the API update for those PVs/PVCs.
   559  func (b *volumeBinder) bindAPIUpdate(ctx context.Context, pod *v1.Pod, bindings []*BindingInfo, claimsToProvision []*v1.PersistentVolumeClaim) error {
   560  	logger := klog.FromContext(ctx)
   561  	podName := getPodName(pod)
   562  	if bindings == nil {
   563  		return fmt.Errorf("failed to get cached bindings for pod %q", podName)
   564  	}
   565  	if claimsToProvision == nil {
   566  		return fmt.Errorf("failed to get cached claims to provision for pod %q", podName)
   567  	}
   568  
   569  	lastProcessedBinding := 0
   570  	lastProcessedProvisioning := 0
   571  	defer func() {
   572  		// only revert assumed cached updates for volumes we haven't successfully bound
   573  		if lastProcessedBinding < len(bindings) {
   574  			b.revertAssumedPVs(bindings[lastProcessedBinding:])
   575  		}
   576  		// only revert assumed cached updates for claims we haven't updated,
   577  		if lastProcessedProvisioning < len(claimsToProvision) {
   578  			b.revertAssumedPVCs(claimsToProvision[lastProcessedProvisioning:])
   579  		}
   580  	}()
   581  
   582  	var (
   583  		binding *BindingInfo
   584  		i       int
   585  		claim   *v1.PersistentVolumeClaim
   586  	)
   587  
   588  	// Do the actual prebinding. Let the PV controller take care of the rest
   589  	// There is no API rollback if the actual binding fails
   590  	for _, binding = range bindings {
   591  		// TODO: does it hurt if we make an api call and nothing needs to be updated?
   592  		logger.V(5).Info("Updating PersistentVolume: binding to claim", "pod", klog.KObj(pod), "PV", klog.KObj(binding.pv), "PVC", klog.KObj(binding.pvc))
   593  		newPV, err := b.kubeClient.CoreV1().PersistentVolumes().Update(ctx, binding.pv, metav1.UpdateOptions{})
   594  		if err != nil {
   595  			logger.V(4).Info("Updating PersistentVolume: binding to claim failed", "pod", klog.KObj(pod), "PV", klog.KObj(binding.pv), "PVC", klog.KObj(binding.pvc), "err", err)
   596  			return err
   597  		}
   598  
   599  		logger.V(2).Info("Updated PersistentVolume with claim. Waiting for binding to complete", "pod", klog.KObj(pod), "PV", klog.KObj(binding.pv), "PVC", klog.KObj(binding.pvc))
   600  		// Save updated object from apiserver for later checking.
   601  		binding.pv = newPV
   602  		lastProcessedBinding++
   603  	}
   604  
   605  	// Update claims objects to trigger volume provisioning. Let the PV controller take care of the rest
   606  	// PV controller is expected to signal back by removing related annotations if actual provisioning fails
   607  	for i, claim = range claimsToProvision {
   608  		logger.V(5).Info("Updating claims objects to trigger volume provisioning", "pod", klog.KObj(pod), "PVC", klog.KObj(claim))
   609  		newClaim, err := b.kubeClient.CoreV1().PersistentVolumeClaims(claim.Namespace).Update(ctx, claim, metav1.UpdateOptions{})
   610  		if err != nil {
   611  			logger.V(4).Info("Updating PersistentVolumeClaim: binding to volume failed", "PVC", klog.KObj(claim), "err", err)
   612  			return err
   613  		}
   614  
   615  		// Save updated object from apiserver for later checking.
   616  		claimsToProvision[i] = newClaim
   617  		lastProcessedProvisioning++
   618  	}
   619  
   620  	return nil
   621  }
   622  
   623  var (
   624  	versioner = storage.APIObjectVersioner{}
   625  )
   626  
   627  // checkBindings runs through all the PVCs in the Pod and checks:
   628  // * if the PVC is fully bound
   629  // * if there are any conditions that require binding to fail and be retried
   630  //
   631  // It returns true when all of the Pod's PVCs are fully bound, and error if
   632  // binding (and scheduling) needs to be retried
   633  // Note that it checks on API objects not PV/PVC cache, this is because
   634  // PV/PVC cache can be assumed again in main scheduler loop, we must check
   635  // latest state in API server which are shared with PV controller and
   636  // provisioners
   637  func (b *volumeBinder) checkBindings(logger klog.Logger, pod *v1.Pod, bindings []*BindingInfo, claimsToProvision []*v1.PersistentVolumeClaim) (bool, error) {
   638  	podName := getPodName(pod)
   639  	if bindings == nil {
   640  		return false, fmt.Errorf("failed to get cached bindings for pod %q", podName)
   641  	}
   642  	if claimsToProvision == nil {
   643  		return false, fmt.Errorf("failed to get cached claims to provision for pod %q", podName)
   644  	}
   645  
   646  	node, err := b.nodeLister.Get(pod.Spec.NodeName)
   647  	if err != nil {
   648  		return false, fmt.Errorf("failed to get node %q: %w", pod.Spec.NodeName, err)
   649  	}
   650  
   651  	csiNode, err := b.csiNodeLister.Get(node.Name)
   652  	if err != nil {
   653  		// TODO: return the error once CSINode is created by default
   654  		logger.V(4).Info("Could not get a CSINode object for the node", "node", klog.KObj(node), "err", err)
   655  	}
   656  
   657  	// Check for any conditions that might require scheduling retry
   658  
   659  	// When pod is deleted, binding operation should be cancelled. There is no
   660  	// need to check PV/PVC bindings any more.
   661  	_, err = b.podLister.Pods(pod.Namespace).Get(pod.Name)
   662  	if err != nil {
   663  		if apierrors.IsNotFound(err) {
   664  			return false, fmt.Errorf("pod does not exist any more: %w", err)
   665  		}
   666  		logger.Error(err, "Failed to get pod from the lister", "pod", klog.KObj(pod))
   667  	}
   668  
   669  	for _, binding := range bindings {
   670  		pv, err := b.pvCache.GetAPIPV(binding.pv.Name)
   671  		if err != nil {
   672  			return false, fmt.Errorf("failed to check binding: %w", err)
   673  		}
   674  
   675  		pvc, err := b.pvcCache.GetAPIPVC(getPVCName(binding.pvc))
   676  		if err != nil {
   677  			return false, fmt.Errorf("failed to check binding: %w", err)
   678  		}
   679  
   680  		// Because we updated PV in apiserver, skip if API object is older
   681  		// and wait for new API object propagated from apiserver.
   682  		if versioner.CompareResourceVersion(binding.pv, pv) > 0 {
   683  			return false, nil
   684  		}
   685  
   686  		pv, err = b.tryTranslatePVToCSI(pv, csiNode)
   687  		if err != nil {
   688  			return false, fmt.Errorf("failed to translate pv to csi: %w", err)
   689  		}
   690  
   691  		// Check PV's node affinity (the node might not have the proper label)
   692  		if err := volume.CheckNodeAffinity(pv, node.Labels); err != nil {
   693  			return false, fmt.Errorf("pv %q node affinity doesn't match node %q: %w", pv.Name, node.Name, err)
   694  		}
   695  
   696  		// Check if pv.ClaimRef got dropped by unbindVolume()
   697  		if pv.Spec.ClaimRef == nil || pv.Spec.ClaimRef.UID == "" {
   698  			return false, fmt.Errorf("ClaimRef got reset for pv %q", pv.Name)
   699  		}
   700  
   701  		// Check if pvc is fully bound
   702  		if !b.isPVCFullyBound(pvc) {
   703  			return false, nil
   704  		}
   705  	}
   706  
   707  	for _, claim := range claimsToProvision {
   708  		pvc, err := b.pvcCache.GetAPIPVC(getPVCName(claim))
   709  		if err != nil {
   710  			return false, fmt.Errorf("failed to check provisioning pvc: %w", err)
   711  		}
   712  
   713  		// Because we updated PVC in apiserver, skip if API object is older
   714  		// and wait for new API object propagated from apiserver.
   715  		if versioner.CompareResourceVersion(claim, pvc) > 0 {
   716  			return false, nil
   717  		}
   718  
   719  		// Check if selectedNode annotation is still set
   720  		if pvc.Annotations == nil {
   721  			return false, fmt.Errorf("selectedNode annotation reset for PVC %q", pvc.Name)
   722  		}
   723  		selectedNode := pvc.Annotations[volume.AnnSelectedNode]
   724  		if selectedNode != pod.Spec.NodeName {
   725  			// If provisioner fails to provision a volume, selectedNode
   726  			// annotation will be removed to signal back to the scheduler to
   727  			// retry.
   728  			return false, fmt.Errorf("provisioning failed for PVC %q", pvc.Name)
   729  		}
   730  
   731  		// If the PVC is bound to a PV, check its node affinity
   732  		if pvc.Spec.VolumeName != "" {
   733  			pv, err := b.pvCache.GetAPIPV(pvc.Spec.VolumeName)
   734  			if err != nil {
   735  				if _, ok := err.(*errNotFound); ok {
   736  					// We tolerate NotFound error here, because PV is possibly
   737  					// not found because of API delay, we can check next time.
   738  					// And if PV does not exist because it's deleted, PVC will
   739  					// be unbound eventually.
   740  					return false, nil
   741  				}
   742  				return false, fmt.Errorf("failed to get pv %q from cache: %w", pvc.Spec.VolumeName, err)
   743  			}
   744  
   745  			pv, err = b.tryTranslatePVToCSI(pv, csiNode)
   746  			if err != nil {
   747  				return false, err
   748  			}
   749  
   750  			if err := volume.CheckNodeAffinity(pv, node.Labels); err != nil {
   751  				return false, fmt.Errorf("pv %q node affinity doesn't match node %q: %w", pv.Name, node.Name, err)
   752  			}
   753  		}
   754  
   755  		// Check if pvc is fully bound
   756  		if !b.isPVCFullyBound(pvc) {
   757  			return false, nil
   758  		}
   759  	}
   760  
   761  	// All pvs and pvcs that we operated on are bound
   762  	logger.V(2).Info("All PVCs for pod are bound", "pod", klog.KObj(pod))
   763  	return true, nil
   764  }
   765  
   766  func (b *volumeBinder) isVolumeBound(logger klog.Logger, pod *v1.Pod, vol *v1.Volume) (bound bool, pvc *v1.PersistentVolumeClaim, err error) {
   767  	pvcName := ""
   768  	isEphemeral := false
   769  	switch {
   770  	case vol.PersistentVolumeClaim != nil:
   771  		pvcName = vol.PersistentVolumeClaim.ClaimName
   772  	case vol.Ephemeral != nil:
   773  		// Generic ephemeral inline volumes also use a PVC,
   774  		// just with a computed name, and...
   775  		pvcName = ephemeral.VolumeClaimName(pod, vol)
   776  		isEphemeral = true
   777  	default:
   778  		return true, nil, nil
   779  	}
   780  
   781  	bound, pvc, err = b.isPVCBound(logger, pod.Namespace, pvcName)
   782  	// ... the PVC must be owned by the pod.
   783  	if isEphemeral && err == nil && pvc != nil {
   784  		if err := ephemeral.VolumeIsForPod(pod, pvc); err != nil {
   785  			return false, nil, err
   786  		}
   787  	}
   788  	return
   789  }
   790  
   791  func (b *volumeBinder) isPVCBound(logger klog.Logger, namespace, pvcName string) (bool, *v1.PersistentVolumeClaim, error) {
   792  	claim := &v1.PersistentVolumeClaim{
   793  		ObjectMeta: metav1.ObjectMeta{
   794  			Name:      pvcName,
   795  			Namespace: namespace,
   796  		},
   797  	}
   798  	pvcKey := getPVCName(claim)
   799  	pvc, err := b.pvcCache.GetPVC(pvcKey)
   800  	if err != nil || pvc == nil {
   801  		return false, nil, fmt.Errorf("error getting PVC %q: %v", pvcKey, err)
   802  	}
   803  
   804  	fullyBound := b.isPVCFullyBound(pvc)
   805  	if fullyBound {
   806  		logger.V(5).Info("PVC is fully bound to PV", "PVC", klog.KObj(pvc), "PV", klog.KRef("", pvc.Spec.VolumeName))
   807  	} else {
   808  		if pvc.Spec.VolumeName != "" {
   809  			logger.V(5).Info("PVC is not fully bound to PV", "PVC", klog.KObj(pvc), "PV", klog.KRef("", pvc.Spec.VolumeName))
   810  		} else {
   811  			logger.V(5).Info("PVC is not bound", "PVC", klog.KObj(pvc))
   812  		}
   813  	}
   814  	return fullyBound, pvc, nil
   815  }
   816  
   817  func (b *volumeBinder) isPVCFullyBound(pvc *v1.PersistentVolumeClaim) bool {
   818  	return pvc.Spec.VolumeName != "" && metav1.HasAnnotation(pvc.ObjectMeta, volume.AnnBindCompleted)
   819  }
   820  
   821  // arePodVolumesBound returns true if all volumes are fully bound
   822  func (b *volumeBinder) arePodVolumesBound(logger klog.Logger, pod *v1.Pod) bool {
   823  	for _, vol := range pod.Spec.Volumes {
   824  		if isBound, _, _ := b.isVolumeBound(logger, pod, &vol); !isBound {
   825  			// Pod has at least one PVC that needs binding
   826  			return false
   827  		}
   828  	}
   829  	return true
   830  }
   831  
   832  // GetPodVolumeClaims returns a pod's PVCs separated into bound, unbound with delayed binding (including provisioning),
   833  // unbound with immediate binding (including prebound) and PVs that belong to storage classes of unbound PVCs with delayed binding.
   834  func (b *volumeBinder) GetPodVolumeClaims(logger klog.Logger, pod *v1.Pod) (podVolumeClaims *PodVolumeClaims, err error) {
   835  	podVolumeClaims = &PodVolumeClaims{
   836  		boundClaims:               []*v1.PersistentVolumeClaim{},
   837  		unboundClaimsImmediate:    []*v1.PersistentVolumeClaim{},
   838  		unboundClaimsDelayBinding: []*v1.PersistentVolumeClaim{},
   839  	}
   840  
   841  	for _, vol := range pod.Spec.Volumes {
   842  		volumeBound, pvc, err := b.isVolumeBound(logger, pod, &vol)
   843  		if err != nil {
   844  			return podVolumeClaims, err
   845  		}
   846  		if pvc == nil {
   847  			continue
   848  		}
   849  		if volumeBound {
   850  			podVolumeClaims.boundClaims = append(podVolumeClaims.boundClaims, pvc)
   851  		} else {
   852  			delayBindingMode, err := volume.IsDelayBindingMode(pvc, b.classLister)
   853  			if err != nil {
   854  				return podVolumeClaims, err
   855  			}
   856  			// Prebound PVCs are treated as unbound immediate binding
   857  			if delayBindingMode && pvc.Spec.VolumeName == "" {
   858  				// Scheduler path
   859  				podVolumeClaims.unboundClaimsDelayBinding = append(podVolumeClaims.unboundClaimsDelayBinding, pvc)
   860  			} else {
   861  				// !delayBindingMode || pvc.Spec.VolumeName != ""
   862  				// Immediate binding should have already been bound
   863  				podVolumeClaims.unboundClaimsImmediate = append(podVolumeClaims.unboundClaimsImmediate, pvc)
   864  			}
   865  		}
   866  	}
   867  
   868  	podVolumeClaims.unboundVolumesDelayBinding = map[string][]*v1.PersistentVolume{}
   869  	for _, pvc := range podVolumeClaims.unboundClaimsDelayBinding {
   870  		// Get storage class name from each PVC
   871  		storageClassName := volume.GetPersistentVolumeClaimClass(pvc)
   872  		podVolumeClaims.unboundVolumesDelayBinding[storageClassName] = b.pvCache.ListPVs(storageClassName)
   873  	}
   874  	return podVolumeClaims, nil
   875  }
   876  
   877  func (b *volumeBinder) checkBoundClaims(logger klog.Logger, claims []*v1.PersistentVolumeClaim, node *v1.Node, pod *v1.Pod) (bool, bool, error) {
   878  	csiNode, err := b.csiNodeLister.Get(node.Name)
   879  	if err != nil {
   880  		// TODO: return the error once CSINode is created by default
   881  		logger.V(4).Info("Could not get a CSINode object for the node", "node", klog.KObj(node), "err", err)
   882  	}
   883  
   884  	for _, pvc := range claims {
   885  		pvName := pvc.Spec.VolumeName
   886  		pv, err := b.pvCache.GetPV(pvName)
   887  		if err != nil {
   888  			if _, ok := err.(*errNotFound); ok {
   889  				err = nil
   890  			}
   891  			return true, false, err
   892  		}
   893  
   894  		pv, err = b.tryTranslatePVToCSI(pv, csiNode)
   895  		if err != nil {
   896  			return false, true, err
   897  		}
   898  
   899  		err = volume.CheckNodeAffinity(pv, node.Labels)
   900  		if err != nil {
   901  			logger.V(4).Info("PersistentVolume and node mismatch for pod", "PV", klog.KRef("", pvName), "node", klog.KObj(node), "pod", klog.KObj(pod), "err", err)
   902  			return false, true, nil
   903  		}
   904  		logger.V(5).Info("PersistentVolume and node matches for pod", "PV", klog.KRef("", pvName), "node", klog.KObj(node), "pod", klog.KObj(pod))
   905  	}
   906  
   907  	logger.V(4).Info("All bound volumes for pod match with node", "pod", klog.KObj(pod), "node", klog.KObj(node))
   908  	return true, true, nil
   909  }
   910  
   911  // findMatchingVolumes tries to find matching volumes for given claims,
   912  // and return unbound claims for further provision.
   913  func (b *volumeBinder) findMatchingVolumes(logger klog.Logger, pod *v1.Pod, claimsToBind []*v1.PersistentVolumeClaim, unboundVolumesDelayBinding map[string][]*v1.PersistentVolume, node *v1.Node) (foundMatches bool, bindings []*BindingInfo, unboundClaims []*v1.PersistentVolumeClaim, err error) {
   914  	// Sort all the claims by increasing size request to get the smallest fits
   915  	sort.Sort(byPVCSize(claimsToBind))
   916  
   917  	chosenPVs := map[string]*v1.PersistentVolume{}
   918  
   919  	foundMatches = true
   920  
   921  	for _, pvc := range claimsToBind {
   922  		// Get storage class name from each PVC
   923  		storageClassName := volume.GetPersistentVolumeClaimClass(pvc)
   924  		pvs := unboundVolumesDelayBinding[storageClassName]
   925  
   926  		// Find a matching PV
   927  		pv, err := volume.FindMatchingVolume(pvc, pvs, node, chosenPVs, true)
   928  		if err != nil {
   929  			return false, nil, nil, err
   930  		}
   931  		if pv == nil {
   932  			logger.V(4).Info("No matching volumes for pod", "pod", klog.KObj(pod), "PVC", klog.KObj(pvc), "node", klog.KObj(node))
   933  			unboundClaims = append(unboundClaims, pvc)
   934  			foundMatches = false
   935  			continue
   936  		}
   937  
   938  		// matching PV needs to be excluded so we don't select it again
   939  		chosenPVs[pv.Name] = pv
   940  		bindings = append(bindings, &BindingInfo{pv: pv, pvc: pvc})
   941  		logger.V(5).Info("Found matching PV for PVC for pod", "PV", klog.KObj(pv), "PVC", klog.KObj(pvc), "node", klog.KObj(node), "pod", klog.KObj(pod))
   942  	}
   943  
   944  	if foundMatches {
   945  		logger.V(4).Info("Found matching volumes for pod", "pod", klog.KObj(pod), "node", klog.KObj(node))
   946  	}
   947  
   948  	return
   949  }
   950  
   951  // checkVolumeProvisions checks given unbound claims (the claims have gone through func
   952  // findMatchingVolumes, and do not have matching volumes for binding), and return true
   953  // if all of the claims are eligible for dynamic provision.
   954  func (b *volumeBinder) checkVolumeProvisions(logger klog.Logger, pod *v1.Pod, claimsToProvision []*v1.PersistentVolumeClaim, node *v1.Node) (provisionSatisfied, sufficientStorage bool, dynamicProvisions []*v1.PersistentVolumeClaim, err error) {
   955  	dynamicProvisions = []*v1.PersistentVolumeClaim{}
   956  
   957  	// We return early with provisionedClaims == nil if a check
   958  	// fails or we encounter an error.
   959  	for _, claim := range claimsToProvision {
   960  		pvcName := getPVCName(claim)
   961  		className := volume.GetPersistentVolumeClaimClass(claim)
   962  		if className == "" {
   963  			return false, false, nil, fmt.Errorf("no class for claim %q", pvcName)
   964  		}
   965  
   966  		class, err := b.classLister.Get(className)
   967  		if err != nil {
   968  			return false, false, nil, fmt.Errorf("failed to find storage class %q", className)
   969  		}
   970  		provisioner := class.Provisioner
   971  		if provisioner == "" || provisioner == volume.NotSupportedProvisioner {
   972  			logger.V(4).Info("Storage class of claim does not support dynamic provisioning", "storageClassName", className, "PVC", klog.KObj(claim))
   973  			return false, true, nil, nil
   974  		}
   975  
   976  		// Check if the node can satisfy the topology requirement in the class
   977  		if !v1helper.MatchTopologySelectorTerms(class.AllowedTopologies, labels.Set(node.Labels)) {
   978  			logger.V(4).Info("Node cannot satisfy provisioning topology requirements of claim", "node", klog.KObj(node), "PVC", klog.KObj(claim))
   979  			return false, true, nil, nil
   980  		}
   981  
   982  		// Check storage capacity.
   983  		sufficient, err := b.hasEnoughCapacity(logger, provisioner, claim, class, node)
   984  		if err != nil {
   985  			return false, false, nil, err
   986  		}
   987  		if !sufficient {
   988  			// hasEnoughCapacity logs an explanation.
   989  			return true, false, nil, nil
   990  		}
   991  
   992  		dynamicProvisions = append(dynamicProvisions, claim)
   993  	}
   994  	logger.V(4).Info("Provisioning for claims of pod that has no matching volumes...", "claimCount", len(claimsToProvision), "pod", klog.KObj(pod), "node", klog.KObj(node))
   995  
   996  	return true, true, dynamicProvisions, nil
   997  }
   998  
   999  func (b *volumeBinder) revertAssumedPVs(bindings []*BindingInfo) {
  1000  	for _, BindingInfo := range bindings {
  1001  		b.pvCache.Restore(BindingInfo.pv.Name)
  1002  	}
  1003  }
  1004  
  1005  func (b *volumeBinder) revertAssumedPVCs(claims []*v1.PersistentVolumeClaim) {
  1006  	for _, claim := range claims {
  1007  		b.pvcCache.Restore(getPVCName(claim))
  1008  	}
  1009  }
  1010  
  1011  // hasEnoughCapacity checks whether the provisioner has enough capacity left for a new volume of the given size
  1012  // that is available from the node.
  1013  func (b *volumeBinder) hasEnoughCapacity(logger klog.Logger, provisioner string, claim *v1.PersistentVolumeClaim, storageClass *storagev1.StorageClass, node *v1.Node) (bool, error) {
  1014  	// This is an optional feature. If disabled, we assume that
  1015  	// there is enough storage.
  1016  	if !b.capacityCheckEnabled {
  1017  		return true, nil
  1018  	}
  1019  
  1020  	quantity, ok := claim.Spec.Resources.Requests[v1.ResourceStorage]
  1021  	if !ok {
  1022  		// No capacity to check for.
  1023  		return true, nil
  1024  	}
  1025  
  1026  	// Only enabled for CSI drivers which opt into it.
  1027  	driver, err := b.csiDriverLister.Get(provisioner)
  1028  	if err != nil {
  1029  		if apierrors.IsNotFound(err) {
  1030  			// Either the provisioner is not a CSI driver or the driver does not
  1031  			// opt into storage capacity scheduling. Either way, skip
  1032  			// capacity checking.
  1033  			return true, nil
  1034  		}
  1035  		return false, err
  1036  	}
  1037  	if driver.Spec.StorageCapacity == nil || !*driver.Spec.StorageCapacity {
  1038  		return true, nil
  1039  	}
  1040  
  1041  	// Look for a matching CSIStorageCapacity object(s).
  1042  	// TODO (for beta): benchmark this and potentially introduce some kind of lookup structure (https://github.com/kubernetes/enhancements/issues/1698#issuecomment-654356718).
  1043  	capacities, err := b.csiStorageCapacityLister.List(labels.Everything())
  1044  	if err != nil {
  1045  		return false, err
  1046  	}
  1047  
  1048  	sizeInBytes := quantity.Value()
  1049  	for _, capacity := range capacities {
  1050  		if capacity.StorageClassName == storageClass.Name &&
  1051  			capacitySufficient(capacity, sizeInBytes) &&
  1052  			b.nodeHasAccess(logger, node, capacity) {
  1053  			// Enough capacity found.
  1054  			return true, nil
  1055  		}
  1056  	}
  1057  
  1058  	// TODO (?): this doesn't give any information about which pools where considered and why
  1059  	// they had to be rejected. Log that above? But that might be a lot of log output...
  1060  	logger.V(4).Info("Node has no accessible CSIStorageCapacity with enough capacity for PVC",
  1061  		"node", klog.KObj(node), "PVC", klog.KObj(claim), "size", sizeInBytes, "storageClass", klog.KObj(storageClass))
  1062  	return false, nil
  1063  }
  1064  
  1065  func capacitySufficient(capacity *storagev1beta1.CSIStorageCapacity, sizeInBytes int64) bool {
  1066  	limit := capacity.Capacity
  1067  	if capacity.MaximumVolumeSize != nil {
  1068  		// Prefer MaximumVolumeSize if available, it is more precise.
  1069  		limit = capacity.MaximumVolumeSize
  1070  	}
  1071  	return limit != nil && limit.Value() >= sizeInBytes
  1072  }
  1073  
  1074  func (b *volumeBinder) nodeHasAccess(logger klog.Logger, node *v1.Node, capacity *storagev1beta1.CSIStorageCapacity) bool {
  1075  	if capacity.NodeTopology == nil {
  1076  		// Unavailable
  1077  		return false
  1078  	}
  1079  	// Only matching by label is supported.
  1080  	selector, err := metav1.LabelSelectorAsSelector(capacity.NodeTopology)
  1081  	if err != nil {
  1082  		logger.Error(err, "Unexpected error converting to a label selector", "nodeTopology", capacity.NodeTopology)
  1083  		return false
  1084  	}
  1085  	return selector.Matches(labels.Set(node.Labels))
  1086  }
  1087  
  1088  type byPVCSize []*v1.PersistentVolumeClaim
  1089  
  1090  func (a byPVCSize) Len() int {
  1091  	return len(a)
  1092  }
  1093  
  1094  func (a byPVCSize) Swap(i, j int) {
  1095  	a[i], a[j] = a[j], a[i]
  1096  }
  1097  
  1098  func (a byPVCSize) Less(i, j int) bool {
  1099  	iSize := a[i].Spec.Resources.Requests[v1.ResourceStorage]
  1100  	jSize := a[j].Spec.Resources.Requests[v1.ResourceStorage]
  1101  	// return true if iSize is less than jSize
  1102  	return iSize.Cmp(jSize) == -1
  1103  }
  1104  
  1105  // isCSIMigrationOnForPlugin checks if CSI migration is enabled for a given plugin.
  1106  func isCSIMigrationOnForPlugin(pluginName string) bool {
  1107  	switch pluginName {
  1108  	case csiplugins.AWSEBSInTreePluginName:
  1109  		return true
  1110  	case csiplugins.GCEPDInTreePluginName:
  1111  		return true
  1112  	case csiplugins.AzureDiskInTreePluginName:
  1113  		return true
  1114  	case csiplugins.CinderInTreePluginName:
  1115  		return true
  1116  	case csiplugins.PortworxVolumePluginName:
  1117  		return utilfeature.DefaultFeatureGate.Enabled(features.CSIMigrationPortworx)
  1118  	case csiplugins.RBDVolumePluginName:
  1119  		return utilfeature.DefaultFeatureGate.Enabled(features.CSIMigrationRBD)
  1120  	}
  1121  	return false
  1122  }
  1123  
  1124  // isPluginMigratedToCSIOnNode checks if an in-tree plugin has been migrated to a CSI driver on the node.
  1125  func isPluginMigratedToCSIOnNode(pluginName string, csiNode *storagev1.CSINode) bool {
  1126  	if csiNode == nil {
  1127  		return false
  1128  	}
  1129  
  1130  	csiNodeAnn := csiNode.GetAnnotations()
  1131  	if csiNodeAnn == nil {
  1132  		return false
  1133  	}
  1134  
  1135  	var mpaSet sets.Set[string]
  1136  	mpa := csiNodeAnn[v1.MigratedPluginsAnnotationKey]
  1137  	if len(mpa) == 0 {
  1138  		mpaSet = sets.New[string]()
  1139  	} else {
  1140  		tok := strings.Split(mpa, ",")
  1141  		mpaSet = sets.New(tok...)
  1142  	}
  1143  
  1144  	return mpaSet.Has(pluginName)
  1145  }
  1146  
  1147  // tryTranslatePVToCSI will translate the in-tree PV to CSI if it meets the criteria. If not, it returns the unmodified in-tree PV.
  1148  func (b *volumeBinder) tryTranslatePVToCSI(pv *v1.PersistentVolume, csiNode *storagev1.CSINode) (*v1.PersistentVolume, error) {
  1149  	if !b.translator.IsPVMigratable(pv) {
  1150  		return pv, nil
  1151  	}
  1152  
  1153  	pluginName, err := b.translator.GetInTreePluginNameFromSpec(pv, nil)
  1154  	if err != nil {
  1155  		return nil, fmt.Errorf("could not get plugin name from pv: %v", err)
  1156  	}
  1157  
  1158  	if !isCSIMigrationOnForPlugin(pluginName) {
  1159  		return pv, nil
  1160  	}
  1161  
  1162  	if !isPluginMigratedToCSIOnNode(pluginName, csiNode) {
  1163  		return pv, nil
  1164  	}
  1165  
  1166  	transPV, err := b.translator.TranslateInTreePVToCSI(pv)
  1167  	if err != nil {
  1168  		return nil, fmt.Errorf("could not translate pv: %v", err)
  1169  	}
  1170  
  1171  	return transPV, nil
  1172  }