volcano.sh/volcano@v1.9.0/pkg/scheduler/capabilities/volumebinding/binder.go (about) 1 /* 2 Copyright 2017 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package volumebinding 18 19 import ( 20 "context" 21 "fmt" 22 "sort" 23 "strings" 24 "time" 25 26 v1 "k8s.io/api/core/v1" 27 storagev1 "k8s.io/api/storage/v1" 28 storagev1beta1 "k8s.io/api/storage/v1beta1" 29 apierrors "k8s.io/apimachinery/pkg/api/errors" 30 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 31 "k8s.io/apimachinery/pkg/labels" 32 "k8s.io/apimachinery/pkg/util/sets" 33 "k8s.io/apimachinery/pkg/util/wait" 34 "k8s.io/apiserver/pkg/storage" 35 utilfeature "k8s.io/apiserver/pkg/util/feature" 36 coreinformers "k8s.io/client-go/informers/core/v1" 37 storageinformers "k8s.io/client-go/informers/storage/v1" 38 storageinformersv1beta1 "k8s.io/client-go/informers/storage/v1beta1" 39 clientset "k8s.io/client-go/kubernetes" 40 corelisters "k8s.io/client-go/listers/core/v1" 41 storagelisters "k8s.io/client-go/listers/storage/v1" 42 storagelistersv1beta1 "k8s.io/client-go/listers/storage/v1beta1" 43 "k8s.io/component-helpers/storage/ephemeral" 44 "k8s.io/component-helpers/storage/volume" 45 csitrans "k8s.io/csi-translation-lib" 46 csiplugins "k8s.io/csi-translation-lib/plugins" 47 "k8s.io/klog/v2" 48 v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper" 49 "k8s.io/kubernetes/pkg/features" 50 "k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding/metrics" 51 
"k8s.io/kubernetes/pkg/volume/util"
)

// ConflictReason is a human-readable string explaining why volume binding is
// impossible for a node.
type ConflictReason string

// ConflictReasons is a list of ConflictReason values. Its Len, Less and Swap
// methods satisfy sort.Interface, ordering the reasons by string comparison.
type ConflictReasons []ConflictReason

// Len returns the number of reasons in the list.
func (r ConflictReasons) Len() int {
	return len(r)
}

// Less reports whether the reason at index i sorts before the one at index j.
func (r ConflictReasons) Less(i, j int) bool {
	return r[i] < r[j]
}

// Swap exchanges the reasons at indexes i and j.
func (r ConflictReasons) Swap(i, j int) {
	r[i], r[j] = r[j], r[i]
}

const (
	// ErrReasonBindConflict is used for VolumeBindingNoMatch predicate error.
	ErrReasonBindConflict ConflictReason = "node(s) didn't find available persistent volumes to bind"
	// ErrReasonNodeConflict is used for VolumeNodeAffinityConflict predicate error.
	ErrReasonNodeConflict ConflictReason = "node(s) had volume node affinity conflict"
	// ErrReasonNotEnoughSpace is used when a pod cannot start on a node
	// because not enough storage space is available.
	ErrReasonNotEnoughSpace = "node(s) did not have enough free storage"
	// ErrReasonPVNotExist is used when a pod has one or more PVC(s) bound to
	// non-existent persistent volume(s).
	ErrReasonPVNotExist = "node(s) unavailable due to one or more pvc(s) bound to non-existent pv(s)"
)

// BindingInfo pairs a PVC with the PV that is proposed to satisfy it.
type BindingInfo struct {
	// pvc is the claim that needs to be bound.
	pvc *v1.PersistentVolumeClaim

	// pv is the volume proposed for binding to pvc.
	pv *v1.PersistentVolume
}

// StorageClassName returns the storage class name recorded in the proposed
// PV's spec.
func (b *BindingInfo) StorageClassName() string {
	return b.pv.Spec.StorageClassName
}

// StorageResource carries the storage quantity requested by a claim and the
// storage capacity offered by a volume, both as plain integers.
type StorageResource struct {
	Requested int64
	Capacity  int64
}

// StorageResource returns storage resource.
97 func (b *BindingInfo) StorageResource() *StorageResource { 98 // both fields are mandatory 99 requestedQty := b.pvc.Spec.Resources.Requests[v1.ResourceName(v1.ResourceStorage)] 100 capacityQty := b.pv.Spec.Capacity[v1.ResourceName(v1.ResourceStorage)] 101 return &StorageResource{ 102 Requested: requestedQty.Value(), 103 Capacity: capacityQty.Value(), 104 } 105 } 106 107 // PodVolumes holds pod's volumes information used in volume scheduling. 108 type PodVolumes struct { 109 // StaticBindings are binding decisions for PVCs which can be bound to 110 // pre-provisioned static PVs. 111 StaticBindings []*BindingInfo 112 // DynamicProvisions are PVCs that require dynamic provisioning 113 DynamicProvisions []*v1.PersistentVolumeClaim 114 } 115 116 // InTreeToCSITranslator contains methods required to check migratable status 117 // and perform translations from InTree PV's to CSI 118 type InTreeToCSITranslator interface { 119 IsPVMigratable(pv *v1.PersistentVolume) bool 120 GetInTreePluginNameFromSpec(pv *v1.PersistentVolume, vol *v1.Volume) (string, error) 121 TranslateInTreePVToCSI(pv *v1.PersistentVolume) (*v1.PersistentVolume, error) 122 } 123 124 // SchedulerVolumeBinder is used by the scheduler VolumeBinding plugin to 125 // handle PVC/PV binding and dynamic provisioning. The binding decisions are 126 // integrated into the pod scheduling workflow so that the PV NodeAffinity is 127 // also considered along with the pod's other scheduling requirements. 128 // 129 // This integrates into the existing scheduler workflow as follows: 130 // 1. The scheduler takes a Pod off the scheduler queue and processes it serially: 131 // a. Invokes all pre-filter plugins for the pod. GetPodVolumeClaims() is invoked 132 // here, pod volume information will be saved in current scheduling cycle state for later use. 
133 // If pod has bound immediate PVCs, GetEligibleNodes() is invoked to potentially reduce 134 // down the list of eligible nodes based on the bound PV's NodeAffinity (if any). 135 // b. Invokes all filter plugins, parallelized across nodes. FindPodVolumes() is invoked here. 136 // c. Invokes all score plugins. Future/TBD 137 // d. Selects the best node for the Pod. 138 // e. Invokes all reserve plugins. AssumePodVolumes() is invoked here. 139 // i. If PVC binding is required, cache in-memory only: 140 // * For manual binding: update PV objects for prebinding to the corresponding PVCs. 141 // * For dynamic provisioning: update PVC object with a selected node from c) 142 // * For the pod, which PVCs and PVs need API updates. 143 // ii. Afterwards, the main scheduler caches the Pod->Node binding in the scheduler's pod cache, 144 // This is handled in the scheduler and not here. 145 // f. Asynchronously bind volumes and pod in a separate goroutine 146 // i. BindPodVolumes() is called first in PreBind phase. It makes all the necessary API updates and waits for 147 // PV controller to fully bind and provision the PVCs. If binding fails, the Pod is sent 148 // back through the scheduler. 149 // ii. After BindPodVolumes() is complete, then the scheduler does the final Pod->Node binding. 150 // 2. Once all the assume operations are done in e), the scheduler processes the next Pod in the scheduler queue 151 // while the actual binding operation occurs in the background. 152 type SchedulerVolumeBinder interface { 153 // GetPodVolumeClaims returns a pod's PVCs separated into bound, unbound with delayed binding (including provisioning), 154 // unbound with immediate binding (including prebound) and PVs that belong to storage classes of unbound PVCs with delayed binding. 
155 GetPodVolumeClaims(logger klog.Logger, pod *v1.Pod) (podVolumeClaims *PodVolumeClaims, err error) 156 157 // GetEligibleNodes checks the existing bound claims of the pod to determine if the list of nodes can be 158 // potentially reduced down to a subset of eligible nodes based on the bound claims which then can be used 159 // in subsequent scheduling stages. 160 // 161 // If eligibleNodes is 'nil', then it indicates that such eligible node reduction cannot be made 162 // and all nodes should be considered. 163 GetEligibleNodes(logger klog.Logger, boundClaims []*v1.PersistentVolumeClaim) (eligibleNodes sets.Set[string]) 164 165 // FindPodVolumes checks if all of a Pod's PVCs can be satisfied by the 166 // node and returns pod's volumes information. 167 // 168 // If a PVC is bound, it checks if the PV's NodeAffinity matches the Node. 169 // Otherwise, it tries to find an available PV to bind to the PVC. 170 // 171 // It returns an error when something went wrong or a list of reasons why the node is 172 // (currently) not usable for the pod. 173 // 174 // If the CSIStorageCapacity feature is enabled, then it also checks for sufficient storage 175 // for volumes that still need to be created. 176 // 177 // This function is called by the scheduler VolumeBinding plugin and can be called in parallel 178 FindPodVolumes(logger klog.Logger, pod *v1.Pod, podVolumeClaims *PodVolumeClaims, node *v1.Node) (podVolumes *PodVolumes, reasons ConflictReasons, err error) 179 180 // AssumePodVolumes will: 181 // 1. Take the PV matches for unbound PVCs and update the PV cache assuming 182 // that the PV is prebound to the PVC. 183 // 2. Take the PVCs that need provisioning and update the PVC cache with related 184 // annotations set. 185 // 186 // It returns true if all volumes are fully bound 187 // 188 // This function is called serially. 
189 AssumePodVolumes(logger klog.Logger, assumedPod *v1.Pod, nodeName string, podVolumes *PodVolumes) (allFullyBound bool, err error) 190 191 // RevertAssumedPodVolumes will revert assumed PV and PVC cache. 192 RevertAssumedPodVolumes(podVolumes *PodVolumes) 193 194 // BindPodVolumes will: 195 // 1. Initiate the volume binding by making the API call to prebind the PV 196 // to its matching PVC. 197 // 2. Trigger the volume provisioning by making the API call to set related 198 // annotations on the PVC 199 // 3. Wait for PVCs to be completely bound by the PV controller 200 // 201 // This function can be called in parallel. 202 BindPodVolumes(ctx context.Context, assumedPod *v1.Pod, podVolumes *PodVolumes) error 203 } 204 205 type PodVolumeClaims struct { 206 // boundClaims are the pod's bound PVCs. 207 boundClaims []*v1.PersistentVolumeClaim 208 // unboundClaimsDelayBinding are the pod's unbound with delayed binding (including provisioning) PVCs. 209 unboundClaimsDelayBinding []*v1.PersistentVolumeClaim 210 // unboundClaimsImmediate are the pod's unbound with immediate binding PVCs (i.e., supposed to be bound already) . 211 unboundClaimsImmediate []*v1.PersistentVolumeClaim 212 // unboundVolumesDelayBinding are PVs that belong to storage classes of the pod's unbound PVCs with delayed binding. 
213 unboundVolumesDelayBinding map[string][]*v1.PersistentVolume 214 } 215 216 type volumeBinder struct { 217 kubeClient clientset.Interface 218 219 classLister storagelisters.StorageClassLister 220 podLister corelisters.PodLister 221 nodeLister corelisters.NodeLister 222 csiNodeLister storagelisters.CSINodeLister 223 224 pvcCache PVCAssumeCache 225 pvCache PVAssumeCache 226 227 // Amount of time to wait for the bind operation to succeed 228 bindTimeout time.Duration 229 230 translator InTreeToCSITranslator 231 232 capacityCheckEnabled bool 233 csiDriverLister storagelisters.CSIDriverLister 234 csiStorageCapacityLister storagelistersv1beta1.CSIStorageCapacityLister 235 } 236 237 var _ SchedulerVolumeBinder = &volumeBinder{} 238 239 // CapacityCheck contains additional parameters for NewVolumeBinder that 240 // are only needed when checking volume sizes against available storage 241 // capacity is desired. 242 type CapacityCheck struct { 243 CSIDriverInformer storageinformers.CSIDriverInformer 244 CSIStorageCapacityInformer storageinformersv1beta1.CSIStorageCapacityInformer 245 } 246 247 // NewVolumeBinder sets up all the caches needed for the scheduler to make volume binding decisions. 248 // 249 // capacityCheck determines how storage capacity is checked (CSIStorageCapacity feature). 
250 func NewVolumeBinder( 251 logger klog.Logger, 252 kubeClient clientset.Interface, 253 podInformer coreinformers.PodInformer, 254 nodeInformer coreinformers.NodeInformer, 255 csiNodeInformer storageinformers.CSINodeInformer, 256 pvcInformer coreinformers.PersistentVolumeClaimInformer, 257 pvInformer coreinformers.PersistentVolumeInformer, 258 storageClassInformer storageinformers.StorageClassInformer, 259 capacityCheck *CapacityCheck, 260 bindTimeout time.Duration) SchedulerVolumeBinder { 261 b := &volumeBinder{ 262 kubeClient: kubeClient, 263 podLister: podInformer.Lister(), 264 classLister: storageClassInformer.Lister(), 265 nodeLister: nodeInformer.Lister(), 266 csiNodeLister: csiNodeInformer.Lister(), 267 pvcCache: NewPVCAssumeCache(logger, pvcInformer.Informer()), 268 pvCache: NewPVAssumeCache(logger, pvInformer.Informer()), 269 bindTimeout: bindTimeout, 270 translator: csitrans.New(), 271 } 272 273 if capacityCheck != nil { 274 b.capacityCheckEnabled = true 275 b.csiDriverLister = capacityCheck.CSIDriverInformer.Lister() 276 b.csiStorageCapacityLister = capacityCheck.CSIStorageCapacityInformer.Lister() 277 } 278 279 return b 280 } 281 282 // FindPodVolumes finds the matching PVs for PVCs and nodes to provision PVs 283 // for the given pod and node. If the node does not fit, conflict reasons are 284 // returned. 285 func (b *volumeBinder) FindPodVolumes(logger klog.Logger, pod *v1.Pod, podVolumeClaims *PodVolumeClaims, node *v1.Node) (podVolumes *PodVolumes, reasons ConflictReasons, err error) { 286 podVolumes = &PodVolumes{} 287 288 // Warning: Below log needs high verbosity as it can be printed several times (#60933). 289 logger.V(5).Info("FindPodVolumes", "pod", klog.KObj(pod), "node", klog.KObj(node)) 290 291 // Initialize to true for pods that don't have volumes. These 292 // booleans get translated into reason strings when the function 293 // returns without an error. 
294 unboundVolumesSatisfied := true 295 boundVolumesSatisfied := true 296 sufficientStorage := true 297 boundPVsFound := true 298 defer func() { 299 if err != nil { 300 return 301 } 302 if !boundVolumesSatisfied { 303 reasons = append(reasons, ErrReasonNodeConflict) 304 } 305 if !unboundVolumesSatisfied { 306 reasons = append(reasons, ErrReasonBindConflict) 307 } 308 if !sufficientStorage { 309 reasons = append(reasons, ErrReasonNotEnoughSpace) 310 } 311 if !boundPVsFound { 312 reasons = append(reasons, ErrReasonPVNotExist) 313 } 314 }() 315 316 defer func() { 317 if err != nil { 318 metrics.VolumeSchedulingStageFailed.WithLabelValues("predicate").Inc() 319 } 320 }() 321 322 var ( 323 staticBindings []*BindingInfo 324 dynamicProvisions []*v1.PersistentVolumeClaim 325 ) 326 defer func() { 327 // Although we do not distinguish nil from empty in this function, for 328 // easier testing, we normalize empty to nil. 329 if len(staticBindings) == 0 { 330 staticBindings = nil 331 } 332 if len(dynamicProvisions) == 0 { 333 dynamicProvisions = nil 334 } 335 podVolumes.StaticBindings = staticBindings 336 podVolumes.DynamicProvisions = dynamicProvisions 337 }() 338 339 // Check PV node affinity on bound volumes 340 if len(podVolumeClaims.boundClaims) > 0 { 341 boundVolumesSatisfied, boundPVsFound, err = b.checkBoundClaims(logger, podVolumeClaims.boundClaims, node, pod) 342 if err != nil { 343 return 344 } 345 } 346 347 // Find matching volumes and node for unbound claims 348 if len(podVolumeClaims.unboundClaimsDelayBinding) > 0 { 349 var ( 350 claimsToFindMatching []*v1.PersistentVolumeClaim 351 claimsToProvision []*v1.PersistentVolumeClaim 352 ) 353 354 // Filter out claims to provision 355 for _, claim := range podVolumeClaims.unboundClaimsDelayBinding { 356 if selectedNode, ok := claim.Annotations[volume.AnnSelectedNode]; ok { 357 if selectedNode != node.Name { 358 // Fast path, skip unmatched node. 
359 unboundVolumesSatisfied = false 360 return 361 } 362 claimsToProvision = append(claimsToProvision, claim) 363 } else { 364 claimsToFindMatching = append(claimsToFindMatching, claim) 365 } 366 } 367 368 // Find matching volumes 369 if len(claimsToFindMatching) > 0 { 370 var unboundClaims []*v1.PersistentVolumeClaim 371 unboundVolumesSatisfied, staticBindings, unboundClaims, err = b.findMatchingVolumes(logger, pod, claimsToFindMatching, podVolumeClaims.unboundVolumesDelayBinding, node) 372 if err != nil { 373 return 374 } 375 claimsToProvision = append(claimsToProvision, unboundClaims...) 376 } 377 378 // Check for claims to provision. This is the first time where we potentially 379 // find out that storage is not sufficient for the node. 380 if len(claimsToProvision) > 0 { 381 unboundVolumesSatisfied, sufficientStorage, dynamicProvisions, err = b.checkVolumeProvisions(logger, pod, claimsToProvision, node) 382 if err != nil { 383 return 384 } 385 } 386 } 387 388 return 389 } 390 391 // GetEligibleNodes checks the existing bound claims of the pod to determine if the list of nodes can be 392 // potentially reduced down to a subset of eligible nodes based on the bound claims which then can be used 393 // in subsequent scheduling stages. 394 // 395 // Returning 'nil' for eligibleNodes indicates that such eligible node reduction cannot be made and all nodes 396 // should be considered. 
397 func (b *volumeBinder) GetEligibleNodes(logger klog.Logger, boundClaims []*v1.PersistentVolumeClaim) (eligibleNodes sets.Set[string]) { 398 if len(boundClaims) == 0 { 399 return 400 } 401 402 var errs []error 403 for _, pvc := range boundClaims { 404 pvName := pvc.Spec.VolumeName 405 pv, err := b.pvCache.GetPV(pvName) 406 if err != nil { 407 errs = append(errs, err) 408 continue 409 } 410 411 // if the PersistentVolume is local and has node affinity matching specific node(s), 412 // add them to the eligible nodes 413 nodeNames := util.GetLocalPersistentVolumeNodeNames(pv) 414 if len(nodeNames) != 0 { 415 // on the first found list of eligible nodes for the local PersistentVolume, 416 // insert to the eligible node set. 417 if eligibleNodes == nil { 418 eligibleNodes = sets.New(nodeNames...) 419 } else { 420 // for subsequent finding of eligible nodes for the local PersistentVolume, 421 // take the intersection of the nodes with the existing eligible nodes 422 // for cases if PV1 has node affinity to node1 and PV2 has node affinity to node2, 423 // then the eligible node list should be empty. 424 eligibleNodes = eligibleNodes.Intersection(sets.New(nodeNames...)) 425 } 426 } 427 } 428 429 if len(errs) > 0 { 430 logger.V(4).Info("GetEligibleNodes: one or more error occurred finding eligible nodes", "error", errs) 431 return nil 432 } 433 434 if eligibleNodes != nil { 435 logger.V(4).Info("GetEligibleNodes: reduced down eligible nodes", "nodes", eligibleNodes) 436 } 437 return 438 } 439 440 // AssumePodVolumes will take the matching PVs and PVCs to provision in pod's 441 // volume information for the chosen node, and: 442 // 1. Update the pvCache with the new prebound PV. 443 // 2. Update the pvcCache with the new PVCs with annotations set 444 // 3. Update PodVolumes again with cached API updates for PVs and PVCs. 
445 func (b *volumeBinder) AssumePodVolumes(logger klog.Logger, assumedPod *v1.Pod, nodeName string, podVolumes *PodVolumes) (allFullyBound bool, err error) { 446 logger.V(4).Info("AssumePodVolumes", "pod", klog.KObj(assumedPod), "node", klog.KRef("", nodeName)) 447 defer func() { 448 if err != nil { 449 metrics.VolumeSchedulingStageFailed.WithLabelValues("assume").Inc() 450 } 451 }() 452 453 if allBound := b.arePodVolumesBound(logger, assumedPod); allBound { 454 logger.V(4).Info("AssumePodVolumes: all PVCs bound and nothing to do", "pod", klog.KObj(assumedPod), "node", klog.KRef("", nodeName)) 455 return true, nil 456 } 457 458 // Assume PV 459 newBindings := []*BindingInfo{} 460 for _, binding := range podVolumes.StaticBindings { 461 newPV, dirty, err := volume.GetBindVolumeToClaim(binding.pv, binding.pvc) 462 logger.V(5).Info("AssumePodVolumes: GetBindVolumeToClaim", 463 "pod", klog.KObj(assumedPod), 464 "PV", klog.KObj(binding.pv), 465 "PVC", klog.KObj(binding.pvc), 466 "newPV", klog.KObj(newPV), 467 "dirty", dirty, 468 ) 469 if err != nil { 470 logger.Error(err, "AssumePodVolumes: fail to GetBindVolumeToClaim") 471 b.revertAssumedPVs(newBindings) 472 return false, err 473 } 474 // TODO: can we assume every time? 475 if dirty { 476 err = b.pvCache.Assume(newPV) 477 if err != nil { 478 b.revertAssumedPVs(newBindings) 479 return false, err 480 } 481 } 482 newBindings = append(newBindings, &BindingInfo{pv: newPV, pvc: binding.pvc}) 483 } 484 485 // Assume PVCs 486 newProvisionedPVCs := []*v1.PersistentVolumeClaim{} 487 for _, claim := range podVolumes.DynamicProvisions { 488 // The claims from method args can be pointing to watcher cache. We must not 489 // modify these, therefore create a copy. 
490 claimClone := claim.DeepCopy() 491 metav1.SetMetaDataAnnotation(&claimClone.ObjectMeta, volume.AnnSelectedNode, nodeName) 492 err = b.pvcCache.Assume(claimClone) 493 if err != nil { 494 b.revertAssumedPVs(newBindings) 495 b.revertAssumedPVCs(newProvisionedPVCs) 496 return 497 } 498 499 newProvisionedPVCs = append(newProvisionedPVCs, claimClone) 500 } 501 502 podVolumes.StaticBindings = newBindings 503 podVolumes.DynamicProvisions = newProvisionedPVCs 504 return 505 } 506 507 // RevertAssumedPodVolumes will revert assumed PV and PVC cache. 508 func (b *volumeBinder) RevertAssumedPodVolumes(podVolumes *PodVolumes) { 509 b.revertAssumedPVs(podVolumes.StaticBindings) 510 b.revertAssumedPVCs(podVolumes.DynamicProvisions) 511 } 512 513 // BindPodVolumes gets the cached bindings and PVCs to provision in pod's volumes information, 514 // makes the API update for those PVs/PVCs, and waits for the PVCs to be completely bound 515 // by the PV controller. 516 func (b *volumeBinder) BindPodVolumes(ctx context.Context, assumedPod *v1.Pod, podVolumes *PodVolumes) (err error) { 517 logger := klog.FromContext(ctx) 518 logger.V(4).Info("BindPodVolumes", "pod", klog.KObj(assumedPod), "node", klog.KRef("", assumedPod.Spec.NodeName)) 519 520 defer func() { 521 if err != nil { 522 metrics.VolumeSchedulingStageFailed.WithLabelValues("bind").Inc() 523 } 524 }() 525 526 if podVolumes == nil { 527 klog.Infof("BindPodVolumes for pod(%s): pod volumes is nil", assumedPod.Name) 528 return nil 529 } 530 531 bindings := podVolumes.StaticBindings 532 claimsToProvision := podVolumes.DynamicProvisions 533 534 // Start API operations 535 err = b.bindAPIUpdate(ctx, assumedPod, bindings, claimsToProvision) 536 if err != nil { 537 return err 538 } 539 540 err = wait.PollUntilContextTimeout(ctx, time.Second, b.bindTimeout, false, func(ctx context.Context) (bool, error) { 541 b, err := b.checkBindings(logger, assumedPod, bindings, claimsToProvision) 542 return b, err 543 }) 544 if err != nil { 545 
return fmt.Errorf("binding volumes: %w", err) 546 } 547 return nil 548 } 549 550 func getPodName(pod *v1.Pod) string { 551 return pod.Namespace + "/" + pod.Name 552 } 553 554 func getPVCName(pvc *v1.PersistentVolumeClaim) string { 555 return pvc.Namespace + "/" + pvc.Name 556 } 557 558 // bindAPIUpdate makes the API update for those PVs/PVCs. 559 func (b *volumeBinder) bindAPIUpdate(ctx context.Context, pod *v1.Pod, bindings []*BindingInfo, claimsToProvision []*v1.PersistentVolumeClaim) error { 560 logger := klog.FromContext(ctx) 561 podName := getPodName(pod) 562 if bindings == nil { 563 return fmt.Errorf("failed to get cached bindings for pod %q", podName) 564 } 565 if claimsToProvision == nil { 566 return fmt.Errorf("failed to get cached claims to provision for pod %q", podName) 567 } 568 569 lastProcessedBinding := 0 570 lastProcessedProvisioning := 0 571 defer func() { 572 // only revert assumed cached updates for volumes we haven't successfully bound 573 if lastProcessedBinding < len(bindings) { 574 b.revertAssumedPVs(bindings[lastProcessedBinding:]) 575 } 576 // only revert assumed cached updates for claims we haven't updated, 577 if lastProcessedProvisioning < len(claimsToProvision) { 578 b.revertAssumedPVCs(claimsToProvision[lastProcessedProvisioning:]) 579 } 580 }() 581 582 var ( 583 binding *BindingInfo 584 i int 585 claim *v1.PersistentVolumeClaim 586 ) 587 588 // Do the actual prebinding. Let the PV controller take care of the rest 589 // There is no API rollback if the actual binding fails 590 for _, binding = range bindings { 591 // TODO: does it hurt if we make an api call and nothing needs to be updated? 
592 logger.V(5).Info("Updating PersistentVolume: binding to claim", "pod", klog.KObj(pod), "PV", klog.KObj(binding.pv), "PVC", klog.KObj(binding.pvc)) 593 newPV, err := b.kubeClient.CoreV1().PersistentVolumes().Update(ctx, binding.pv, metav1.UpdateOptions{}) 594 if err != nil { 595 logger.V(4).Info("Updating PersistentVolume: binding to claim failed", "pod", klog.KObj(pod), "PV", klog.KObj(binding.pv), "PVC", klog.KObj(binding.pvc), "err", err) 596 return err 597 } 598 599 logger.V(2).Info("Updated PersistentVolume with claim. Waiting for binding to complete", "pod", klog.KObj(pod), "PV", klog.KObj(binding.pv), "PVC", klog.KObj(binding.pvc)) 600 // Save updated object from apiserver for later checking. 601 binding.pv = newPV 602 lastProcessedBinding++ 603 } 604 605 // Update claims objects to trigger volume provisioning. Let the PV controller take care of the rest 606 // PV controller is expected to signal back by removing related annotations if actual provisioning fails 607 for i, claim = range claimsToProvision { 608 logger.V(5).Info("Updating claims objects to trigger volume provisioning", "pod", klog.KObj(pod), "PVC", klog.KObj(claim)) 609 newClaim, err := b.kubeClient.CoreV1().PersistentVolumeClaims(claim.Namespace).Update(ctx, claim, metav1.UpdateOptions{}) 610 if err != nil { 611 logger.V(4).Info("Updating PersistentVolumeClaim: binding to volume failed", "PVC", klog.KObj(claim), "err", err) 612 return err 613 } 614 615 // Save updated object from apiserver for later checking. 
616 claimsToProvision[i] = newClaim 617 lastProcessedProvisioning++ 618 } 619 620 return nil 621 } 622 623 var ( 624 versioner = storage.APIObjectVersioner{} 625 ) 626 627 // checkBindings runs through all the PVCs in the Pod and checks: 628 // * if the PVC is fully bound 629 // * if there are any conditions that require binding to fail and be retried 630 // 631 // It returns true when all of the Pod's PVCs are fully bound, and error if 632 // binding (and scheduling) needs to be retried 633 // Note that it checks on API objects not PV/PVC cache, this is because 634 // PV/PVC cache can be assumed again in main scheduler loop, we must check 635 // latest state in API server which are shared with PV controller and 636 // provisioners 637 func (b *volumeBinder) checkBindings(logger klog.Logger, pod *v1.Pod, bindings []*BindingInfo, claimsToProvision []*v1.PersistentVolumeClaim) (bool, error) { 638 podName := getPodName(pod) 639 if bindings == nil { 640 return false, fmt.Errorf("failed to get cached bindings for pod %q", podName) 641 } 642 if claimsToProvision == nil { 643 return false, fmt.Errorf("failed to get cached claims to provision for pod %q", podName) 644 } 645 646 node, err := b.nodeLister.Get(pod.Spec.NodeName) 647 if err != nil { 648 return false, fmt.Errorf("failed to get node %q: %w", pod.Spec.NodeName, err) 649 } 650 651 csiNode, err := b.csiNodeLister.Get(node.Name) 652 if err != nil { 653 // TODO: return the error once CSINode is created by default 654 logger.V(4).Info("Could not get a CSINode object for the node", "node", klog.KObj(node), "err", err) 655 } 656 657 // Check for any conditions that might require scheduling retry 658 659 // When pod is deleted, binding operation should be cancelled. There is no 660 // need to check PV/PVC bindings any more. 
661 _, err = b.podLister.Pods(pod.Namespace).Get(pod.Name) 662 if err != nil { 663 if apierrors.IsNotFound(err) { 664 return false, fmt.Errorf("pod does not exist any more: %w", err) 665 } 666 logger.Error(err, "Failed to get pod from the lister", "pod", klog.KObj(pod)) 667 } 668 669 for _, binding := range bindings { 670 pv, err := b.pvCache.GetAPIPV(binding.pv.Name) 671 if err != nil { 672 return false, fmt.Errorf("failed to check binding: %w", err) 673 } 674 675 pvc, err := b.pvcCache.GetAPIPVC(getPVCName(binding.pvc)) 676 if err != nil { 677 return false, fmt.Errorf("failed to check binding: %w", err) 678 } 679 680 // Because we updated PV in apiserver, skip if API object is older 681 // and wait for new API object propagated from apiserver. 682 if versioner.CompareResourceVersion(binding.pv, pv) > 0 { 683 return false, nil 684 } 685 686 pv, err = b.tryTranslatePVToCSI(pv, csiNode) 687 if err != nil { 688 return false, fmt.Errorf("failed to translate pv to csi: %w", err) 689 } 690 691 // Check PV's node affinity (the node might not have the proper label) 692 if err := volume.CheckNodeAffinity(pv, node.Labels); err != nil { 693 return false, fmt.Errorf("pv %q node affinity doesn't match node %q: %w", pv.Name, node.Name, err) 694 } 695 696 // Check if pv.ClaimRef got dropped by unbindVolume() 697 if pv.Spec.ClaimRef == nil || pv.Spec.ClaimRef.UID == "" { 698 return false, fmt.Errorf("ClaimRef got reset for pv %q", pv.Name) 699 } 700 701 // Check if pvc is fully bound 702 if !b.isPVCFullyBound(pvc) { 703 return false, nil 704 } 705 } 706 707 for _, claim := range claimsToProvision { 708 pvc, err := b.pvcCache.GetAPIPVC(getPVCName(claim)) 709 if err != nil { 710 return false, fmt.Errorf("failed to check provisioning pvc: %w", err) 711 } 712 713 // Because we updated PVC in apiserver, skip if API object is older 714 // and wait for new API object propagated from apiserver. 
715 if versioner.CompareResourceVersion(claim, pvc) > 0 { 716 return false, nil 717 } 718 719 // Check if selectedNode annotation is still set 720 if pvc.Annotations == nil { 721 return false, fmt.Errorf("selectedNode annotation reset for PVC %q", pvc.Name) 722 } 723 selectedNode := pvc.Annotations[volume.AnnSelectedNode] 724 if selectedNode != pod.Spec.NodeName { 725 // If provisioner fails to provision a volume, selectedNode 726 // annotation will be removed to signal back to the scheduler to 727 // retry. 728 return false, fmt.Errorf("provisioning failed for PVC %q", pvc.Name) 729 } 730 731 // If the PVC is bound to a PV, check its node affinity 732 if pvc.Spec.VolumeName != "" { 733 pv, err := b.pvCache.GetAPIPV(pvc.Spec.VolumeName) 734 if err != nil { 735 if _, ok := err.(*errNotFound); ok { 736 // We tolerate NotFound error here, because PV is possibly 737 // not found because of API delay, we can check next time. 738 // And if PV does not exist because it's deleted, PVC will 739 // be unbound eventually. 
740 return false, nil 741 } 742 return false, fmt.Errorf("failed to get pv %q from cache: %w", pvc.Spec.VolumeName, err) 743 } 744 745 pv, err = b.tryTranslatePVToCSI(pv, csiNode) 746 if err != nil { 747 return false, err 748 } 749 750 if err := volume.CheckNodeAffinity(pv, node.Labels); err != nil { 751 return false, fmt.Errorf("pv %q node affinity doesn't match node %q: %w", pv.Name, node.Name, err) 752 } 753 } 754 755 // Check if pvc is fully bound 756 if !b.isPVCFullyBound(pvc) { 757 return false, nil 758 } 759 } 760 761 // All pvs and pvcs that we operated on are bound 762 logger.V(2).Info("All PVCs for pod are bound", "pod", klog.KObj(pod)) 763 return true, nil 764 } 765 766 func (b *volumeBinder) isVolumeBound(logger klog.Logger, pod *v1.Pod, vol *v1.Volume) (bound bool, pvc *v1.PersistentVolumeClaim, err error) { 767 pvcName := "" 768 isEphemeral := false 769 switch { 770 case vol.PersistentVolumeClaim != nil: 771 pvcName = vol.PersistentVolumeClaim.ClaimName 772 case vol.Ephemeral != nil: 773 // Generic ephemeral inline volumes also use a PVC, 774 // just with a computed name, and... 775 pvcName = ephemeral.VolumeClaimName(pod, vol) 776 isEphemeral = true 777 default: 778 return true, nil, nil 779 } 780 781 bound, pvc, err = b.isPVCBound(logger, pod.Namespace, pvcName) 782 // ... the PVC must be owned by the pod. 
783 if isEphemeral && err == nil && pvc != nil { 784 if err := ephemeral.VolumeIsForPod(pod, pvc); err != nil { 785 return false, nil, err 786 } 787 } 788 return 789 } 790 791 func (b *volumeBinder) isPVCBound(logger klog.Logger, namespace, pvcName string) (bool, *v1.PersistentVolumeClaim, error) { 792 claim := &v1.PersistentVolumeClaim{ 793 ObjectMeta: metav1.ObjectMeta{ 794 Name: pvcName, 795 Namespace: namespace, 796 }, 797 } 798 pvcKey := getPVCName(claim) 799 pvc, err := b.pvcCache.GetPVC(pvcKey) 800 if err != nil || pvc == nil { 801 return false, nil, fmt.Errorf("error getting PVC %q: %v", pvcKey, err) 802 } 803 804 fullyBound := b.isPVCFullyBound(pvc) 805 if fullyBound { 806 logger.V(5).Info("PVC is fully bound to PV", "PVC", klog.KObj(pvc), "PV", klog.KRef("", pvc.Spec.VolumeName)) 807 } else { 808 if pvc.Spec.VolumeName != "" { 809 logger.V(5).Info("PVC is not fully bound to PV", "PVC", klog.KObj(pvc), "PV", klog.KRef("", pvc.Spec.VolumeName)) 810 } else { 811 logger.V(5).Info("PVC is not bound", "PVC", klog.KObj(pvc)) 812 } 813 } 814 return fullyBound, pvc, nil 815 } 816 817 func (b *volumeBinder) isPVCFullyBound(pvc *v1.PersistentVolumeClaim) bool { 818 return pvc.Spec.VolumeName != "" && metav1.HasAnnotation(pvc.ObjectMeta, volume.AnnBindCompleted) 819 } 820 821 // arePodVolumesBound returns true if all volumes are fully bound 822 func (b *volumeBinder) arePodVolumesBound(logger klog.Logger, pod *v1.Pod) bool { 823 for _, vol := range pod.Spec.Volumes { 824 if isBound, _, _ := b.isVolumeBound(logger, pod, &vol); !isBound { 825 // Pod has at least one PVC that needs binding 826 return false 827 } 828 } 829 return true 830 } 831 832 // GetPodVolumeClaims returns a pod's PVCs separated into bound, unbound with delayed binding (including provisioning), 833 // unbound with immediate binding (including prebound) and PVs that belong to storage classes of unbound PVCs with delayed binding. 
834 func (b *volumeBinder) GetPodVolumeClaims(logger klog.Logger, pod *v1.Pod) (podVolumeClaims *PodVolumeClaims, err error) { 835 podVolumeClaims = &PodVolumeClaims{ 836 boundClaims: []*v1.PersistentVolumeClaim{}, 837 unboundClaimsImmediate: []*v1.PersistentVolumeClaim{}, 838 unboundClaimsDelayBinding: []*v1.PersistentVolumeClaim{}, 839 } 840 841 for _, vol := range pod.Spec.Volumes { 842 volumeBound, pvc, err := b.isVolumeBound(logger, pod, &vol) 843 if err != nil { 844 return podVolumeClaims, err 845 } 846 if pvc == nil { 847 continue 848 } 849 if volumeBound { 850 podVolumeClaims.boundClaims = append(podVolumeClaims.boundClaims, pvc) 851 } else { 852 delayBindingMode, err := volume.IsDelayBindingMode(pvc, b.classLister) 853 if err != nil { 854 return podVolumeClaims, err 855 } 856 // Prebound PVCs are treated as unbound immediate binding 857 if delayBindingMode && pvc.Spec.VolumeName == "" { 858 // Scheduler path 859 podVolumeClaims.unboundClaimsDelayBinding = append(podVolumeClaims.unboundClaimsDelayBinding, pvc) 860 } else { 861 // !delayBindingMode || pvc.Spec.VolumeName != "" 862 // Immediate binding should have already been bound 863 podVolumeClaims.unboundClaimsImmediate = append(podVolumeClaims.unboundClaimsImmediate, pvc) 864 } 865 } 866 } 867 868 podVolumeClaims.unboundVolumesDelayBinding = map[string][]*v1.PersistentVolume{} 869 for _, pvc := range podVolumeClaims.unboundClaimsDelayBinding { 870 // Get storage class name from each PVC 871 storageClassName := volume.GetPersistentVolumeClaimClass(pvc) 872 podVolumeClaims.unboundVolumesDelayBinding[storageClassName] = b.pvCache.ListPVs(storageClassName) 873 } 874 return podVolumeClaims, nil 875 } 876 877 func (b *volumeBinder) checkBoundClaims(logger klog.Logger, claims []*v1.PersistentVolumeClaim, node *v1.Node, pod *v1.Pod) (bool, bool, error) { 878 csiNode, err := b.csiNodeLister.Get(node.Name) 879 if err != nil { 880 // TODO: return the error once CSINode is created by default 881 
logger.V(4).Info("Could not get a CSINode object for the node", "node", klog.KObj(node), "err", err) 882 } 883 884 for _, pvc := range claims { 885 pvName := pvc.Spec.VolumeName 886 pv, err := b.pvCache.GetPV(pvName) 887 if err != nil { 888 if _, ok := err.(*errNotFound); ok { 889 err = nil 890 } 891 return true, false, err 892 } 893 894 pv, err = b.tryTranslatePVToCSI(pv, csiNode) 895 if err != nil { 896 return false, true, err 897 } 898 899 err = volume.CheckNodeAffinity(pv, node.Labels) 900 if err != nil { 901 logger.V(4).Info("PersistentVolume and node mismatch for pod", "PV", klog.KRef("", pvName), "node", klog.KObj(node), "pod", klog.KObj(pod), "err", err) 902 return false, true, nil 903 } 904 logger.V(5).Info("PersistentVolume and node matches for pod", "PV", klog.KRef("", pvName), "node", klog.KObj(node), "pod", klog.KObj(pod)) 905 } 906 907 logger.V(4).Info("All bound volumes for pod match with node", "pod", klog.KObj(pod), "node", klog.KObj(node)) 908 return true, true, nil 909 } 910 911 // findMatchingVolumes tries to find matching volumes for given claims, 912 // and return unbound claims for further provision. 
913 func (b *volumeBinder) findMatchingVolumes(logger klog.Logger, pod *v1.Pod, claimsToBind []*v1.PersistentVolumeClaim, unboundVolumesDelayBinding map[string][]*v1.PersistentVolume, node *v1.Node) (foundMatches bool, bindings []*BindingInfo, unboundClaims []*v1.PersistentVolumeClaim, err error) { 914 // Sort all the claims by increasing size request to get the smallest fits 915 sort.Sort(byPVCSize(claimsToBind)) 916 917 chosenPVs := map[string]*v1.PersistentVolume{} 918 919 foundMatches = true 920 921 for _, pvc := range claimsToBind { 922 // Get storage class name from each PVC 923 storageClassName := volume.GetPersistentVolumeClaimClass(pvc) 924 pvs := unboundVolumesDelayBinding[storageClassName] 925 926 // Find a matching PV 927 pv, err := volume.FindMatchingVolume(pvc, pvs, node, chosenPVs, true) 928 if err != nil { 929 return false, nil, nil, err 930 } 931 if pv == nil { 932 logger.V(4).Info("No matching volumes for pod", "pod", klog.KObj(pod), "PVC", klog.KObj(pvc), "node", klog.KObj(node)) 933 unboundClaims = append(unboundClaims, pvc) 934 foundMatches = false 935 continue 936 } 937 938 // matching PV needs to be excluded so we don't select it again 939 chosenPVs[pv.Name] = pv 940 bindings = append(bindings, &BindingInfo{pv: pv, pvc: pvc}) 941 logger.V(5).Info("Found matching PV for PVC for pod", "PV", klog.KObj(pv), "PVC", klog.KObj(pvc), "node", klog.KObj(node), "pod", klog.KObj(pod)) 942 } 943 944 if foundMatches { 945 logger.V(4).Info("Found matching volumes for pod", "pod", klog.KObj(pod), "node", klog.KObj(node)) 946 } 947 948 return 949 } 950 951 // checkVolumeProvisions checks given unbound claims (the claims have gone through func 952 // findMatchingVolumes, and do not have matching volumes for binding), and return true 953 // if all of the claims are eligible for dynamic provision. 
954 func (b *volumeBinder) checkVolumeProvisions(logger klog.Logger, pod *v1.Pod, claimsToProvision []*v1.PersistentVolumeClaim, node *v1.Node) (provisionSatisfied, sufficientStorage bool, dynamicProvisions []*v1.PersistentVolumeClaim, err error) { 955 dynamicProvisions = []*v1.PersistentVolumeClaim{} 956 957 // We return early with provisionedClaims == nil if a check 958 // fails or we encounter an error. 959 for _, claim := range claimsToProvision { 960 pvcName := getPVCName(claim) 961 className := volume.GetPersistentVolumeClaimClass(claim) 962 if className == "" { 963 return false, false, nil, fmt.Errorf("no class for claim %q", pvcName) 964 } 965 966 class, err := b.classLister.Get(className) 967 if err != nil { 968 return false, false, nil, fmt.Errorf("failed to find storage class %q", className) 969 } 970 provisioner := class.Provisioner 971 if provisioner == "" || provisioner == volume.NotSupportedProvisioner { 972 logger.V(4).Info("Storage class of claim does not support dynamic provisioning", "storageClassName", className, "PVC", klog.KObj(claim)) 973 return false, true, nil, nil 974 } 975 976 // Check if the node can satisfy the topology requirement in the class 977 if !v1helper.MatchTopologySelectorTerms(class.AllowedTopologies, labels.Set(node.Labels)) { 978 logger.V(4).Info("Node cannot satisfy provisioning topology requirements of claim", "node", klog.KObj(node), "PVC", klog.KObj(claim)) 979 return false, true, nil, nil 980 } 981 982 // Check storage capacity. 983 sufficient, err := b.hasEnoughCapacity(logger, provisioner, claim, class, node) 984 if err != nil { 985 return false, false, nil, err 986 } 987 if !sufficient { 988 // hasEnoughCapacity logs an explanation. 
989 return true, false, nil, nil 990 } 991 992 dynamicProvisions = append(dynamicProvisions, claim) 993 } 994 logger.V(4).Info("Provisioning for claims of pod that has no matching volumes...", "claimCount", len(claimsToProvision), "pod", klog.KObj(pod), "node", klog.KObj(node)) 995 996 return true, true, dynamicProvisions, nil 997 } 998 999 func (b *volumeBinder) revertAssumedPVs(bindings []*BindingInfo) { 1000 for _, BindingInfo := range bindings { 1001 b.pvCache.Restore(BindingInfo.pv.Name) 1002 } 1003 } 1004 1005 func (b *volumeBinder) revertAssumedPVCs(claims []*v1.PersistentVolumeClaim) { 1006 for _, claim := range claims { 1007 b.pvcCache.Restore(getPVCName(claim)) 1008 } 1009 } 1010 1011 // hasEnoughCapacity checks whether the provisioner has enough capacity left for a new volume of the given size 1012 // that is available from the node. 1013 func (b *volumeBinder) hasEnoughCapacity(logger klog.Logger, provisioner string, claim *v1.PersistentVolumeClaim, storageClass *storagev1.StorageClass, node *v1.Node) (bool, error) { 1014 // This is an optional feature. If disabled, we assume that 1015 // there is enough storage. 1016 if !b.capacityCheckEnabled { 1017 return true, nil 1018 } 1019 1020 quantity, ok := claim.Spec.Resources.Requests[v1.ResourceStorage] 1021 if !ok { 1022 // No capacity to check for. 1023 return true, nil 1024 } 1025 1026 // Only enabled for CSI drivers which opt into it. 1027 driver, err := b.csiDriverLister.Get(provisioner) 1028 if err != nil { 1029 if apierrors.IsNotFound(err) { 1030 // Either the provisioner is not a CSI driver or the driver does not 1031 // opt into storage capacity scheduling. Either way, skip 1032 // capacity checking. 1033 return true, nil 1034 } 1035 return false, err 1036 } 1037 if driver.Spec.StorageCapacity == nil || !*driver.Spec.StorageCapacity { 1038 return true, nil 1039 } 1040 1041 // Look for a matching CSIStorageCapacity object(s). 
1042 // TODO (for beta): benchmark this and potentially introduce some kind of lookup structure (https://github.com/kubernetes/enhancements/issues/1698#issuecomment-654356718). 1043 capacities, err := b.csiStorageCapacityLister.List(labels.Everything()) 1044 if err != nil { 1045 return false, err 1046 } 1047 1048 sizeInBytes := quantity.Value() 1049 for _, capacity := range capacities { 1050 if capacity.StorageClassName == storageClass.Name && 1051 capacitySufficient(capacity, sizeInBytes) && 1052 b.nodeHasAccess(logger, node, capacity) { 1053 // Enough capacity found. 1054 return true, nil 1055 } 1056 } 1057 1058 // TODO (?): this doesn't give any information about which pools where considered and why 1059 // they had to be rejected. Log that above? But that might be a lot of log output... 1060 logger.V(4).Info("Node has no accessible CSIStorageCapacity with enough capacity for PVC", 1061 "node", klog.KObj(node), "PVC", klog.KObj(claim), "size", sizeInBytes, "storageClass", klog.KObj(storageClass)) 1062 return false, nil 1063 } 1064 1065 func capacitySufficient(capacity *storagev1beta1.CSIStorageCapacity, sizeInBytes int64) bool { 1066 limit := capacity.Capacity 1067 if capacity.MaximumVolumeSize != nil { 1068 // Prefer MaximumVolumeSize if available, it is more precise. 1069 limit = capacity.MaximumVolumeSize 1070 } 1071 return limit != nil && limit.Value() >= sizeInBytes 1072 } 1073 1074 func (b *volumeBinder) nodeHasAccess(logger klog.Logger, node *v1.Node, capacity *storagev1beta1.CSIStorageCapacity) bool { 1075 if capacity.NodeTopology == nil { 1076 // Unavailable 1077 return false 1078 } 1079 // Only matching by label is supported. 
1080 selector, err := metav1.LabelSelectorAsSelector(capacity.NodeTopology) 1081 if err != nil { 1082 logger.Error(err, "Unexpected error converting to a label selector", "nodeTopology", capacity.NodeTopology) 1083 return false 1084 } 1085 return selector.Matches(labels.Set(node.Labels)) 1086 } 1087 1088 type byPVCSize []*v1.PersistentVolumeClaim 1089 1090 func (a byPVCSize) Len() int { 1091 return len(a) 1092 } 1093 1094 func (a byPVCSize) Swap(i, j int) { 1095 a[i], a[j] = a[j], a[i] 1096 } 1097 1098 func (a byPVCSize) Less(i, j int) bool { 1099 iSize := a[i].Spec.Resources.Requests[v1.ResourceStorage] 1100 jSize := a[j].Spec.Resources.Requests[v1.ResourceStorage] 1101 // return true if iSize is less than jSize 1102 return iSize.Cmp(jSize) == -1 1103 } 1104 1105 // isCSIMigrationOnForPlugin checks if CSI migration is enabled for a given plugin. 1106 func isCSIMigrationOnForPlugin(pluginName string) bool { 1107 switch pluginName { 1108 case csiplugins.AWSEBSInTreePluginName: 1109 return true 1110 case csiplugins.GCEPDInTreePluginName: 1111 return true 1112 case csiplugins.AzureDiskInTreePluginName: 1113 return true 1114 case csiplugins.CinderInTreePluginName: 1115 return true 1116 case csiplugins.PortworxVolumePluginName: 1117 return utilfeature.DefaultFeatureGate.Enabled(features.CSIMigrationPortworx) 1118 case csiplugins.RBDVolumePluginName: 1119 return utilfeature.DefaultFeatureGate.Enabled(features.CSIMigrationRBD) 1120 } 1121 return false 1122 } 1123 1124 // isPluginMigratedToCSIOnNode checks if an in-tree plugin has been migrated to a CSI driver on the node. 
1125 func isPluginMigratedToCSIOnNode(pluginName string, csiNode *storagev1.CSINode) bool { 1126 if csiNode == nil { 1127 return false 1128 } 1129 1130 csiNodeAnn := csiNode.GetAnnotations() 1131 if csiNodeAnn == nil { 1132 return false 1133 } 1134 1135 var mpaSet sets.Set[string] 1136 mpa := csiNodeAnn[v1.MigratedPluginsAnnotationKey] 1137 if len(mpa) == 0 { 1138 mpaSet = sets.New[string]() 1139 } else { 1140 tok := strings.Split(mpa, ",") 1141 mpaSet = sets.New(tok...) 1142 } 1143 1144 return mpaSet.Has(pluginName) 1145 } 1146 1147 // tryTranslatePVToCSI will translate the in-tree PV to CSI if it meets the criteria. If not, it returns the unmodified in-tree PV. 1148 func (b *volumeBinder) tryTranslatePVToCSI(pv *v1.PersistentVolume, csiNode *storagev1.CSINode) (*v1.PersistentVolume, error) { 1149 if !b.translator.IsPVMigratable(pv) { 1150 return pv, nil 1151 } 1152 1153 pluginName, err := b.translator.GetInTreePluginNameFromSpec(pv, nil) 1154 if err != nil { 1155 return nil, fmt.Errorf("could not get plugin name from pv: %v", err) 1156 } 1157 1158 if !isCSIMigrationOnForPlugin(pluginName) { 1159 return pv, nil 1160 } 1161 1162 if !isPluginMigratedToCSIOnNode(pluginName, csiNode) { 1163 return pv, nil 1164 } 1165 1166 transPV, err := b.translator.TranslateInTreePVToCSI(pv) 1167 if err != nil { 1168 return nil, fmt.Errorf("could not translate pv: %v", err) 1169 } 1170 1171 return transPV, nil 1172 }