k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/scheduler/framework/plugins/nodevolumelimits/csi.go (about) 1 /* 2 Copyright 2019 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package nodevolumelimits 18 19 import ( 20 "context" 21 "fmt" 22 23 v1 "k8s.io/api/core/v1" 24 storagev1 "k8s.io/api/storage/v1" 25 apierrors "k8s.io/apimachinery/pkg/api/errors" 26 "k8s.io/apimachinery/pkg/runtime" 27 "k8s.io/apimachinery/pkg/util/rand" 28 corelisters "k8s.io/client-go/listers/core/v1" 29 storagelisters "k8s.io/client-go/listers/storage/v1" 30 ephemeral "k8s.io/component-helpers/storage/ephemeral" 31 storagehelpers "k8s.io/component-helpers/storage/volume" 32 csitrans "k8s.io/csi-translation-lib" 33 "k8s.io/klog/v2" 34 "k8s.io/kubernetes/pkg/scheduler/framework" 35 "k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature" 36 "k8s.io/kubernetes/pkg/scheduler/framework/plugins/names" 37 "k8s.io/kubernetes/pkg/scheduler/util" 38 volumeutil "k8s.io/kubernetes/pkg/volume/util" 39 ) 40 41 // InTreeToCSITranslator contains methods required to check migratable status 42 // and perform translations from InTree PV's to CSI 43 type InTreeToCSITranslator interface { 44 IsPVMigratable(pv *v1.PersistentVolume) bool 45 IsInlineMigratable(vol *v1.Volume) bool 46 IsMigratableIntreePluginByName(inTreePluginName string) bool 47 GetInTreePluginNameFromSpec(pv *v1.PersistentVolume, vol *v1.Volume) (string, error) 48 
GetCSINameFromInTreeName(pluginName string) (string, error) 49 TranslateInTreePVToCSI(pv *v1.PersistentVolume) (*v1.PersistentVolume, error) 50 TranslateInTreeInlineVolumeToCSI(volume *v1.Volume, podNamespace string) (*v1.PersistentVolume, error) 51 } 52 53 // CSILimits is a plugin that checks node volume limits. 54 type CSILimits struct { 55 csiNodeLister storagelisters.CSINodeLister 56 pvLister corelisters.PersistentVolumeLister 57 pvcLister corelisters.PersistentVolumeClaimLister 58 scLister storagelisters.StorageClassLister 59 60 randomVolumeIDPrefix string 61 62 translator InTreeToCSITranslator 63 } 64 65 var _ framework.PreFilterPlugin = &CSILimits{} 66 var _ framework.FilterPlugin = &CSILimits{} 67 var _ framework.EnqueueExtensions = &CSILimits{} 68 69 // CSIName is the name of the plugin used in the plugin registry and configurations. 70 const CSIName = names.NodeVolumeLimits 71 72 // Name returns name of the plugin. It is used in logs, etc. 73 func (pl *CSILimits) Name() string { 74 return CSIName 75 } 76 77 // EventsToRegister returns the possible events that may make a Pod. 78 // failed by this plugin schedulable. 79 func (pl *CSILimits) EventsToRegister() []framework.ClusterEventWithHint { 80 return []framework.ClusterEventWithHint{ 81 // We don't register any `QueueingHintFn` intentionally 82 // because any new CSINode could make pods that were rejected by CSI volumes schedulable. 
83 {Event: framework.ClusterEvent{Resource: framework.CSINode, ActionType: framework.Add}}, 84 {Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.Delete}, QueueingHintFn: pl.isSchedulableAfterPodDeleted}, 85 {Event: framework.ClusterEvent{Resource: framework.PersistentVolumeClaim, ActionType: framework.Add}}, 86 } 87 } 88 89 func (pl *CSILimits) isSchedulableAfterPodDeleted(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) { 90 deletedPod, _, err := util.As[*v1.Pod](oldObj, newObj) 91 if err != nil { 92 return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterPodDeleted: %w", err) 93 } 94 95 if len(deletedPod.Spec.Volumes) == 0 { 96 return framework.QueueSkip, nil 97 } 98 99 if deletedPod.Spec.NodeName == "" { 100 return framework.QueueSkip, nil 101 } 102 103 for _, vol := range deletedPod.Spec.Volumes { 104 if vol.PersistentVolumeClaim != nil || vol.Ephemeral != nil || pl.translator.IsInlineMigratable(&vol) { 105 return framework.Queue, nil 106 } 107 } 108 109 logger.V(5).Info("The deleted pod does not impact the scheduling of the unscheduled pod", "deletedPod", klog.KObj(pod), "pod", klog.KObj(deletedPod)) 110 return framework.QueueSkip, nil 111 } 112 113 // PreFilter invoked at the prefilter extension point 114 // 115 // If the pod haven't those types of volumes, we'll skip the Filter phase 116 func (pl *CSILimits) PreFilter(ctx context.Context, _ *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) { 117 volumes := pod.Spec.Volumes 118 for i := range volumes { 119 vol := &volumes[i] 120 if vol.PersistentVolumeClaim != nil || vol.Ephemeral != nil || pl.translator.IsInlineMigratable(vol) { 121 return nil, nil 122 } 123 } 124 125 return nil, framework.NewStatus(framework.Skip) 126 } 127 128 // PreFilterExtensions returns prefilter extensions, pod add and remove. 
func (pl *CSILimits) PreFilterExtensions() framework.PreFilterExtensions {
	// No AddPod/RemovePod extensions are needed by this plugin.
	return nil
}

// Filter invoked at the filter extension point.
func (pl *CSILimits) Filter(ctx context.Context, _ *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
	// If the new pod doesn't have any volume attached to it, the predicate will always be true
	if len(pod.Spec.Volumes) == 0 {
		return nil
	}

	node := nodeInfo.Node()

	logger := klog.FromContext(ctx)

	// If CSINode doesn't exist, the predicate may read the limits from Node object
	csiNode, err := pl.csiNodeLister.Get(node.Name)
	if err != nil {
		// TODO: return the error once CSINode is created by default (2 releases)
		logger.V(5).Info("Could not get a CSINode object for the node", "node", klog.KObj(node), "err", err)
	}

	// newVolumes maps volumeUniqueName -> volumeLimitKey for the incoming pod's
	// CSI (or CSI-migrated) volumes.
	newVolumes := make(map[string]string)
	if err := pl.filterAttachableVolumes(logger, pod, csiNode, true /* new pod */, newVolumes); err != nil {
		if apierrors.IsNotFound(err) {
			// PVC is not found. This Pod will never be schedulable until PVC is created.
			return framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error())
		}
		return framework.AsStatus(err)
	}

	// If the pod doesn't have any new CSI volumes, the predicate will always be true
	if len(newVolumes) == 0 {
		return nil
	}

	// If the node doesn't have volume limits, the predicate will always be true
	nodeVolumeLimits := getVolumeLimits(nodeInfo, csiNode)
	if len(nodeVolumeLimits) == 0 {
		return nil
	}

	// Collect the volumes already attached via pods running on this node.
	attachedVolumes := make(map[string]string)
	for _, existingPod := range nodeInfo.Pods {
		if err := pl.filterAttachableVolumes(logger, existingPod.Pod, csiNode, false /* existing pod */, attachedVolumes); err != nil {
			return framework.AsStatus(err)
		}
	}

	attachedVolumeCount := map[string]int{}
	for volumeUniqueName, volumeLimitKey := range attachedVolumes {
		// Don't count single volume used in multiple pods more than once
		delete(newVolumes, volumeUniqueName)
		attachedVolumeCount[volumeLimitKey]++
	}

	// Count the genuinely new volumes per limit key and compare against the
	// node's per-driver attach limits.
	newVolumeCount := map[string]int{}
	for _, volumeLimitKey := range newVolumes {
		newVolumeCount[volumeLimitKey]++
	}

	for volumeLimitKey, count := range newVolumeCount {
		maxVolumeLimit, ok := nodeVolumeLimits[v1.ResourceName(volumeLimitKey)]
		if ok {
			currentVolumeCount := attachedVolumeCount[volumeLimitKey]
			logger.V(5).Info("Found plugin volume limits", "node", node.Name, "volumeLimitKey", volumeLimitKey,
				"maxLimits", maxVolumeLimit, "currentVolumeCount", currentVolumeCount, "newVolumeCount", count,
				"pod", klog.KObj(pod))
			if currentVolumeCount+count > int(maxVolumeLimit) {
				return framework.NewStatus(framework.Unschedulable, ErrReasonMaxVolumeCountExceeded)
			}
		}
	}

	return nil
}

// filterAttachableVolumes walks the pod's volumes and records every CSI (or
// CSI-migrated in-tree) volume into result as volumeUniqueName -> volumeLimitKey.
// When newPod is true, a missing PVC is a hard error; for existing pods it is
// only logged, since the volume cannot be reliably counted.
func (pl *CSILimits) filterAttachableVolumes(
	logger klog.Logger, pod *v1.Pod, csiNode *storagev1.CSINode, newPod bool, result map[string]string) error {
	for _, vol := range pod.Spec.Volumes {
		pvcName := ""
		isEphemeral := false
		switch {
		case vol.PersistentVolumeClaim != nil:
			// Normal CSI volume can only be used through PVC
			pvcName = vol.PersistentVolumeClaim.ClaimName
		case vol.Ephemeral != nil:
			// Generic ephemeral inline volumes also use a PVC,
			// just with a computed name and certain ownership.
			// That is checked below once the pvc object is
			// retrieved.
			pvcName = ephemeral.VolumeClaimName(pod, &vol)
			isEphemeral = true
		default:
			// Inline Volume does not have PVC.
			// Need to check if CSI migration is enabled for this inline volume.
			// - If the volume is migratable and CSI migration is enabled, need to count it
			//   as well.
			// - If the volume is not migratable, it will be count in non_csi filter.
			if err := pl.checkAttachableInlineVolume(logger, &vol, csiNode, pod, result); err != nil {
				return err
			}

			continue
		}

		if pvcName == "" {
			return fmt.Errorf("PersistentVolumeClaim had no name")
		}

		pvc, err := pl.pvcLister.PersistentVolumeClaims(pod.Namespace).Get(pvcName)

		if err != nil {
			if newPod {
				// The PVC is required to proceed with
				// scheduling of a new pod because it cannot
				// run without it. Bail out immediately.
				return fmt.Errorf("looking up PVC %s/%s: %w", pod.Namespace, pvcName, err)
			}
			// If the PVC is invalid, we don't count the volume because
			// there's no guarantee that it belongs to the running predicate.
			logger.V(5).Info("Unable to look up PVC info", "pod", klog.KObj(pod), "PVC", klog.KRef(pod.Namespace, pvcName))
			continue
		}

		// The PVC for an ephemeral volume must be owned by the pod.
		if isEphemeral {
			if err := ephemeral.VolumeIsForPod(pod, pvc); err != nil {
				return err
			}
		}

		driverName, volumeHandle := pl.getCSIDriverInfo(logger, csiNode, pvc)
		if driverName == "" || volumeHandle == "" {
			logger.V(5).Info("Could not find a CSI driver name or volume handle, not counting volume")
			continue
		}

		volumeUniqueName := fmt.Sprintf("%s/%s", driverName, volumeHandle)
		volumeLimitKey := volumeutil.GetCSIAttachLimitKey(driverName)
		result[volumeUniqueName] = volumeLimitKey
	}
	return nil
}

// checkAttachableInlineVolume takes an inline volume and add to the result map if the
// volume is migratable and CSI migration for this plugin has been enabled.
func (pl *CSILimits) checkAttachableInlineVolume(logger klog.Logger, vol *v1.Volume, csiNode *storagev1.CSINode,
	pod *v1.Pod, result map[string]string) error {
	if !pl.translator.IsInlineMigratable(vol) {
		return nil
	}
	// Check if the intree provisioner CSI migration has been enabled.
	inTreeProvisionerName, err := pl.translator.GetInTreePluginNameFromSpec(nil, vol)
	if err != nil {
		return fmt.Errorf("looking up provisioner name for volume %s: %w", vol.Name, err)
	}
	if !isCSIMigrationOn(csiNode, inTreeProvisionerName) {
		csiNodeName := ""
		if csiNode != nil {
			csiNodeName = csiNode.Name
		}
		logger.V(5).Info("CSI Migration is not enabled for provisioner", "provisioner", inTreeProvisionerName,
			"pod", klog.KObj(pod), "csiNode", csiNodeName)
		return nil
	}
	// Do translation for the in-tree volume.
	translatedPV, err := pl.translator.TranslateInTreeInlineVolumeToCSI(vol, pod.Namespace)
	if err != nil || translatedPV == nil {
		return fmt.Errorf("converting volume(%s) from inline to csi: %w", vol.Name, err)
	}
	driverName, err := pl.translator.GetCSINameFromInTreeName(inTreeProvisionerName)
	if err != nil {
		return fmt.Errorf("looking up CSI driver name for provisioner %s: %w", inTreeProvisionerName, err)
	}
	// TranslateInTreeInlineVolumeToCSI should translate inline volume to CSI. If it is not set,
	// the volume does not support inline. Skip the count.
	if translatedPV.Spec.PersistentVolumeSource.CSI == nil {
		return nil
	}
	volumeUniqueName := fmt.Sprintf("%s/%s", driverName, translatedPV.Spec.PersistentVolumeSource.CSI.VolumeHandle)
	volumeLimitKey := volumeutil.GetCSIAttachLimitKey(driverName)
	result[volumeUniqueName] = volumeLimitKey
	return nil
}

// getCSIDriverInfo returns the CSI driver name and volume ID of a given PVC.
// If the PVC is from a migrated in-tree plugin, this function will return
// the information of the CSI driver that the plugin has been migrated to.
// Returns empty strings when the volume should not be counted.
func (pl *CSILimits) getCSIDriverInfo(logger klog.Logger, csiNode *storagev1.CSINode, pvc *v1.PersistentVolumeClaim) (string, string) {
	pvName := pvc.Spec.VolumeName

	if pvName == "" {
		logger.V(5).Info("Persistent volume had no name for claim", "PVC", klog.KObj(pvc))
		return pl.getCSIDriverInfoFromSC(logger, csiNode, pvc)
	}

	pv, err := pl.pvLister.Get(pvName)
	if err != nil {
		logger.V(5).Info("Unable to look up PV info for PVC and PV", "PVC", klog.KObj(pvc), "PV", klog.KRef("", pvName))
		// If we can't fetch PV associated with PVC, may be it got deleted
		// or PVC was prebound to a PVC that hasn't been created yet.
		// fallback to using StorageClass for volume counting
		return pl.getCSIDriverInfoFromSC(logger, csiNode, pvc)
	}

	csiSource := pv.Spec.PersistentVolumeSource.CSI
	if csiSource == nil {
		// We make a fast path for non-CSI volumes that aren't migratable
		if !pl.translator.IsPVMigratable(pv) {
			return "", ""
		}

		pluginName, err := pl.translator.GetInTreePluginNameFromSpec(pv, nil)
		if err != nil {
			logger.V(5).Info("Unable to look up plugin name from PV spec", "err", err)
			return "", ""
		}

		if !isCSIMigrationOn(csiNode, pluginName) {
			logger.V(5).Info("CSI Migration of plugin is not enabled", "plugin", pluginName)
			return "", ""
		}

		csiPV, err := pl.translator.TranslateInTreePVToCSI(pv)
		if err != nil {
			logger.V(5).Info("Unable to translate in-tree volume to CSI", "err", err)
			return "", ""
		}

		if csiPV.Spec.PersistentVolumeSource.CSI == nil {
			logger.V(5).Info("Unable to get a valid volume source for translated PV", "PV", pvName)
			return "", ""
		}

		csiSource = csiPV.Spec.PersistentVolumeSource.CSI
	}

	return csiSource.Driver, csiSource.VolumeHandle
}

// getCSIDriverInfoFromSC returns the CSI driver name and a random volume ID of a given PVC's StorageClass.
func (pl *CSILimits) getCSIDriverInfoFromSC(logger klog.Logger, csiNode *storagev1.CSINode, pvc *v1.PersistentVolumeClaim) (string, string) {
	namespace := pvc.Namespace
	pvcName := pvc.Name
	scName := storagehelpers.GetPersistentVolumeClaimClass(pvc)

	// If StorageClass is not set or not found, then PVC must be using immediate binding mode
	// and hence it must be bound before scheduling. So it is safe to not count it.
	if scName == "" {
		logger.V(5).Info("PVC has no StorageClass", "PVC", klog.KObj(pvc))
		return "", ""
	}

	storageClass, err := pl.scLister.Get(scName)
	if err != nil {
		logger.V(5).Info("Could not get StorageClass for PVC", "PVC", klog.KObj(pvc), "err", err)
		return "", ""
	}

	// We use random prefix to avoid conflict with volume IDs. If PVC is bound during the execution of the
	// predicate and there is another pod on the same node that uses same volume, then we will overcount
	// the volume and consider both volumes as different.
	volumeHandle := fmt.Sprintf("%s-%s/%s", pl.randomVolumeIDPrefix, namespace, pvcName)

	provisioner := storageClass.Provisioner
	if pl.translator.IsMigratableIntreePluginByName(provisioner) {
		if !isCSIMigrationOn(csiNode, provisioner) {
			logger.V(5).Info("CSI Migration of provisioner is not enabled", "provisioner", provisioner)
			return "", ""
		}

		driverName, err := pl.translator.GetCSINameFromInTreeName(provisioner)
		if err != nil {
			logger.V(5).Info("Unable to look up driver name from provisioner name", "provisioner", provisioner, "err", err)
			return "", ""
		}
		return driverName, volumeHandle
	}

	return provisioner, volumeHandle
}

// NewCSI initializes a new plugin and returns it.
413 func NewCSI(_ context.Context, _ runtime.Object, handle framework.Handle, fts feature.Features) (framework.Plugin, error) { 414 informerFactory := handle.SharedInformerFactory() 415 pvLister := informerFactory.Core().V1().PersistentVolumes().Lister() 416 pvcLister := informerFactory.Core().V1().PersistentVolumeClaims().Lister() 417 csiNodesLister := informerFactory.Storage().V1().CSINodes().Lister() 418 scLister := informerFactory.Storage().V1().StorageClasses().Lister() 419 csiTranslator := csitrans.New() 420 421 return &CSILimits{ 422 csiNodeLister: csiNodesLister, 423 pvLister: pvLister, 424 pvcLister: pvcLister, 425 scLister: scLister, 426 randomVolumeIDPrefix: rand.String(32), 427 translator: csiTranslator, 428 }, nil 429 } 430 431 func getVolumeLimits(nodeInfo *framework.NodeInfo, csiNode *storagev1.CSINode) map[v1.ResourceName]int64 { 432 // TODO: stop getting values from Node object in v1.18 433 nodeVolumeLimits := volumeLimits(nodeInfo) 434 if csiNode != nil { 435 for i := range csiNode.Spec.Drivers { 436 d := csiNode.Spec.Drivers[i] 437 if d.Allocatable != nil && d.Allocatable.Count != nil { 438 // TODO: drop GetCSIAttachLimitKey once we don't get values from Node object (v1.18) 439 k := v1.ResourceName(volumeutil.GetCSIAttachLimitKey(d.Name)) 440 nodeVolumeLimits[k] = int64(*d.Allocatable.Count) 441 } 442 } 443 } 444 return nodeVolumeLimits 445 }