k8s.io/kubernetes@v1.31.0-alpha.0.0.20240520171757-56147500dadc/pkg/scheduler/framework/plugins/nodevolumelimits/csi.go

/*
Copyright 2019 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package nodevolumelimits

import (
	"context"
	"fmt"

	v1 "k8s.io/api/core/v1"
	storagev1 "k8s.io/api/storage/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/util/rand"
	corelisters "k8s.io/client-go/listers/core/v1"
	storagelisters "k8s.io/client-go/listers/storage/v1"
	ephemeral "k8s.io/component-helpers/storage/ephemeral"
	storagehelpers "k8s.io/component-helpers/storage/volume"
	csitrans "k8s.io/csi-translation-lib"
	"k8s.io/klog/v2"
	"k8s.io/kubernetes/pkg/scheduler/framework"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
	"k8s.io/kubernetes/pkg/scheduler/util"
	volumeutil "k8s.io/kubernetes/pkg/volume/util"
)

// InTreeToCSITranslator contains methods required to check migratable status
// and to perform translations from in-tree PVs to CSI.
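// The translator returned by csitrans.New (see NewCSI below) satisfies this
// interface.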
type InTreeToCSITranslator interface {
	IsPVMigratable(pv *v1.PersistentVolume) bool
	IsInlineMigratable(vol *v1.Volume) bool
	IsMigratableIntreePluginByName(inTreePluginName string) bool
	GetInTreePluginNameFromSpec(pv *v1.PersistentVolume, vol *v1.Volume) (string, error)
	GetCSINameFromInTreeName(pluginName string) (string, error)
	TranslateInTreePVToCSI(pv *v1.PersistentVolume) (*v1.PersistentVolume, error)
	TranslateInTreeInlineVolumeToCSI(volume *v1.Volume, podNamespace string) (*v1.PersistentVolume, error)
}

// CSILimits is a plugin that checks node volume limits.
type CSILimits struct {
	csiNodeLister storagelisters.CSINodeLister
	pvLister      corelisters.PersistentVolumeLister
	pvcLister     corelisters.PersistentVolumeClaimLister
	scLister      storagelisters.StorageClassLister

	// randomVolumeIDPrefix is used to build pseudo volume handles for PVCs that
	// are not yet bound to a PV, so they cannot collide with real volume IDs
	// (see getCSIDriverInfoFromSC).
	randomVolumeIDPrefix string

	// translator checks whether volumes are covered by CSI migration and
	// translates in-tree volumes to their CSI counterparts.
	translator InTreeToCSITranslator
}

var _ framework.PreFilterPlugin = &CSILimits{}
var _ framework.FilterPlugin = &CSILimits{}
var _ framework.EnqueueExtensions = &CSILimits{}

// CSIName is the name of the plugin used in the plugin registry and configurations.
const CSIName = names.NodeVolumeLimits

// Name returns the name of the plugin. It is used in logs, etc.
func (pl *CSILimits) Name() string {
	return CSIName
}

// EventsToRegister returns the possible events that may make a Pod
// failed by this plugin schedulable.
func (pl *CSILimits) EventsToRegister() []framework.ClusterEventWithHint {
	return []framework.ClusterEventWithHint{
		// We don't register any `QueueingHintFn` intentionally
		// because any new CSINode could make pods that were rejected by CSI volumes schedulable.
		{Event: framework.ClusterEvent{Resource: framework.CSINode, ActionType: framework.Add}},
		{Event: framework.ClusterEvent{Resource: framework.Pod, ActionType: framework.Delete}, QueueingHintFn: pl.isSchedulableAfterPodDeleted},
		{Event: framework.ClusterEvent{Resource: framework.PersistentVolumeClaim, ActionType: framework.Add}},
	}
}

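// isSchedulableAfterPodDeleted is the QueueingHintFn for Pod delete events: a
// pod previously rejected by this plugin may become schedulable once another
// pod that was consuming volumes on a node is deleted.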
func (pl *CSILimits) isSchedulableAfterPodDeleted(logger klog.Logger, pod *v1.Pod, oldObj, newObj interface{}) (framework.QueueingHint, error) {
	deletedPod, _, err := util.As[*v1.Pod](oldObj, newObj)
	if err != nil {
		return framework.Queue, fmt.Errorf("unexpected objects in isSchedulableAfterPodDeleted: %w", err)
	}

	if len(deletedPod.Spec.Volumes) == 0 {
		return framework.QueueSkip, nil
	}

	if deletedPod.Spec.NodeName == "" {
		return framework.QueueSkip, nil
	}

	for _, vol := range deletedPod.Spec.Volumes {
		if vol.PersistentVolumeClaim != nil || vol.Ephemeral != nil || pl.translator.IsInlineMigratable(&vol) {
			return framework.Queue, nil
		}
	}

	logger.V(5).Info("The deleted pod does not impact the scheduling of the unscheduled pod", "deletedPod", klog.KObj(deletedPod), "pod", klog.KObj(pod))
	return framework.QueueSkip, nil
}

// PreFilter is invoked at the prefilter extension point.
//
// If the pod doesn't have any volume of the types handled by this plugin
// (PVC-backed, generic ephemeral, or migratable in-tree inline), we'll skip
// the Filter phase by returning a Skip status.
func (pl *CSILimits) PreFilter(ctx context.Context, _ *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
	volumes := pod.Spec.Volumes
	for i := range volumes {
		vol := &volumes[i]
		if vol.PersistentVolumeClaim != nil || vol.Ephemeral != nil || pl.translator.IsInlineMigratable(vol) {
			return nil, nil
		}
	}

	return nil, framework.NewStatus(framework.Skip)
}

// PreFilterExtensions returns prefilter extensions, pod add and remove.
func (pl *CSILimits) PreFilterExtensions() framework.PreFilterExtensions {
	return nil
}

// Filter is invoked at the filter extension point.
func (pl *CSILimits) Filter(ctx context.Context, _ *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
	// If the new pod doesn't have any volume attached to it, the predicate will always be true
	if len(pod.Spec.Volumes) == 0 {
		return nil
	}

	node := nodeInfo.Node()

	logger := klog.FromContext(ctx)

	// If CSINode doesn't exist, the predicate may read the limits from Node object
	csiNode, err := pl.csiNodeLister.Get(node.Name)
	if err != nil {
		// TODO: return the error once CSINode is created by default (2 releases)
		logger.V(5).Info("Could not get a CSINode object for the node", "node", klog.KObj(node), "err", err)
	}

	newVolumes := make(map[string]string)
	if err := pl.filterAttachableVolumes(logger, pod, csiNode, true /* new pod */, newVolumes); err != nil {
		if apierrors.IsNotFound(err) {
			// PVC is not found. This Pod will never be schedulable until PVC is created.
			return framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error())
		}
		return framework.AsStatus(err)
	}

	// If the pod doesn't have any new CSI volumes, the predicate will always be true
	if len(newVolumes) == 0 {
		return nil
	}

	// If the node doesn't have volume limits, the predicate will always be true
	nodeVolumeLimits := getVolumeLimits(nodeInfo, csiNode)
	if len(nodeVolumeLimits) == 0 {
		return nil
	}

	attachedVolumes := make(map[string]string)
	for _, existingPod := range nodeInfo.Pods {
		if err := pl.filterAttachableVolumes(logger, existingPod.Pod, csiNode, false /* existing pod */, attachedVolumes); err != nil {
			return framework.AsStatus(err)
		}
	}

	attachedVolumeCount := map[string]int{}
	for volumeUniqueName, volumeLimitKey := range attachedVolumes {
		// Don't count single volume used in multiple pods more than once
		delete(newVolumes, volumeUniqueName)
		attachedVolumeCount[volumeLimitKey]++
	}

	newVolumeCount := map[string]int{}
	for _, volumeLimitKey := range newVolumes {
		newVolumeCount[volumeLimitKey]++
	}

	for volumeLimitKey, count := range newVolumeCount {
		maxVolumeLimit, ok := nodeVolumeLimits[v1.ResourceName(volumeLimitKey)]
		if ok {
			currentVolumeCount := attachedVolumeCount[volumeLimitKey]
			logger.V(5).Info("Found plugin volume limits", "node", node.Name, "volumeLimitKey", volumeLimitKey,
				"maxLimits", maxVolumeLimit, "currentVolumeCount", currentVolumeCount, "newVolumeCount", count,
				"pod", klog.KObj(pod))
			if currentVolumeCount+count > int(maxVolumeLimit) {
				return framework.NewStatus(framework.Unschedulable, ErrReasonMaxVolumeCountExceeded)
			}
		}
	}

	return nil
}

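// filterAttachableVolumes records, in result, every CSI volume used by the pod
// that counts against a node attach limit (PVC-backed volumes, generic
// ephemeral volumes, and migratable in-tree inline volumes), keyed by its
// unique "driver/handle" name and mapped to the driver's attach-limit key.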
func (pl *CSILimits) filterAttachableVolumes(
	logger klog.Logger, pod *v1.Pod, csiNode *storagev1.CSINode, newPod bool, result map[string]string) error {
	for _, vol := range pod.Spec.Volumes {
		pvcName := ""
		isEphemeral := false
		switch {
		case vol.PersistentVolumeClaim != nil:
			// Normal CSI volume can only be used through PVC
			pvcName = vol.PersistentVolumeClaim.ClaimName
		case vol.Ephemeral != nil:
			// Generic ephemeral inline volumes also use a PVC,
			// just with a computed name and certain ownership.
			// That is checked below once the pvc object is
			// retrieved.
			pvcName = ephemeral.VolumeClaimName(pod, &vol)
			isEphemeral = true
		default:
			// Inline volumes do not have a PVC.
			// Need to check if CSI migration is enabled for this inline volume.
			// - If the volume is migratable and CSI migration is enabled, it needs
			//   to be counted here as well.
			// - If the volume is not migratable, it will be counted by the non-CSI filter.
			if err := pl.checkAttachableInlineVolume(logger, &vol, csiNode, pod, result); err != nil {
				return err
			}

			continue
		}

		if pvcName == "" {
			return fmt.Errorf("PersistentVolumeClaim had no name")
		}

		pvc, err := pl.pvcLister.PersistentVolumeClaims(pod.Namespace).Get(pvcName)

		if err != nil {
			if newPod {
				// The PVC is required to proceed with
				// scheduling of a new pod because it cannot
				// run without it. Bail out immediately.
				return fmt.Errorf("looking up PVC %s/%s: %w", pod.Namespace, pvcName, err)
			}
			// The PVC for an existing pod could not be looked up. Don't count the
			// volume: we cannot tell which CSI driver, if any, it would count against.
			logger.V(5).Info("Unable to look up PVC info", "pod", klog.KObj(pod), "PVC", klog.KRef(pod.Namespace, pvcName))
			continue
		}

		// The PVC for an ephemeral volume must be owned by the pod.
		if isEphemeral {
			if err := ephemeral.VolumeIsForPod(pod, pvc); err != nil {
				return err
			}
		}

		driverName, volumeHandle := pl.getCSIDriverInfo(logger, csiNode, pvc)
		if driverName == "" || volumeHandle == "" {
			logger.V(5).Info("Could not find a CSI driver name or volume handle, not counting volume")
			continue
		}

		volumeUniqueName := fmt.Sprintf("%s/%s", driverName, volumeHandle)
		volumeLimitKey := volumeutil.GetCSIAttachLimitKey(driverName)
		result[volumeUniqueName] = volumeLimitKey
	}
	return nil
}

// checkAttachableInlineVolume takes an inline volume and adds it to the result
// map if the volume is migratable and CSI migration for this plugin has been enabled.
func (pl *CSILimits) checkAttachableInlineVolume(logger klog.Logger, vol *v1.Volume, csiNode *storagev1.CSINode,
	pod *v1.Pod, result map[string]string) error {
	if !pl.translator.IsInlineMigratable(vol) {
		return nil
	}
	// Check if CSI migration has been enabled for the in-tree provisioner.
	inTreeProvisionerName, err := pl.translator.GetInTreePluginNameFromSpec(nil, vol)
	if err != nil {
		return fmt.Errorf("looking up provisioner name for volume %s: %w", vol.Name, err)
	}
	if !isCSIMigrationOn(csiNode, inTreeProvisionerName) {
		csiNodeName := ""
		if csiNode != nil {
			csiNodeName = csiNode.Name
		}
		logger.V(5).Info("CSI Migration is not enabled for provisioner", "provisioner", inTreeProvisionerName,
			"pod", klog.KObj(pod), "csiNode", csiNodeName)
		return nil
	}
	// Do translation for the in-tree volume.
	translatedPV, err := pl.translator.TranslateInTreeInlineVolumeToCSI(vol, pod.Namespace)
	if err != nil || translatedPV == nil {
		return fmt.Errorf("converting volume(%s) from inline to csi: %w", vol.Name, err)
	}
	driverName, err := pl.translator.GetCSINameFromInTreeName(inTreeProvisionerName)
	if err != nil {
		return fmt.Errorf("looking up CSI driver name for provisioner %s: %w", inTreeProvisionerName, err)
	}
	// TranslateInTreeInlineVolumeToCSI should translate the inline volume to CSI. If
	// the CSI source is not set, the volume does not support inline use; skip counting it.
	if translatedPV.Spec.PersistentVolumeSource.CSI == nil {
		return nil
	}
	volumeUniqueName := fmt.Sprintf("%s/%s", driverName, translatedPV.Spec.PersistentVolumeSource.CSI.VolumeHandle)
	volumeLimitKey := volumeutil.GetCSIAttachLimitKey(driverName)
	result[volumeUniqueName] = volumeLimitKey
	return nil
}

// getCSIDriverInfo returns the CSI driver name and volume ID of a given PVC.
// If the PVC is from a migrated in-tree plugin, this function will return
// the information of the CSI driver that the plugin has been migrated to.
func (pl *CSILimits) getCSIDriverInfo(logger klog.Logger, csiNode *storagev1.CSINode, pvc *v1.PersistentVolumeClaim) (string, string) {
	pvName := pvc.Spec.VolumeName

	if pvName == "" {
		logger.V(5).Info("Persistent volume had no name for claim", "PVC", klog.KObj(pvc))
		return pl.getCSIDriverInfoFromSC(logger, csiNode, pvc)
	}

	pv, err := pl.pvLister.Get(pvName)
	if err != nil {
		logger.V(5).Info("Unable to look up PV info for PVC and PV", "PVC", klog.KObj(pvc), "PV", klog.KRef("", pvName))
		// If we can't fetch the PV associated with the PVC, maybe it got deleted
		// or the PVC was prebound to a PV that hasn't been created yet.
		// Fall back to using the StorageClass for volume counting.
		return pl.getCSIDriverInfoFromSC(logger, csiNode, pvc)
	}

	csiSource := pv.Spec.PersistentVolumeSource.CSI
	if csiSource == nil {
		// We make a fast path for non-CSI volumes that aren't migratable
		if !pl.translator.IsPVMigratable(pv) {
			return "", ""
		}

		pluginName, err := pl.translator.GetInTreePluginNameFromSpec(pv, nil)
		if err != nil {
			logger.V(5).Info("Unable to look up plugin name from PV spec", "err", err)
			return "", ""
		}

		if !isCSIMigrationOn(csiNode, pluginName) {
			logger.V(5).Info("CSI Migration of plugin is not enabled", "plugin", pluginName)
			return "", ""
		}

		csiPV, err := pl.translator.TranslateInTreePVToCSI(pv)
		if err != nil {
			logger.V(5).Info("Unable to translate in-tree volume to CSI", "err", err)
			return "", ""
		}

		if csiPV.Spec.PersistentVolumeSource.CSI == nil {
			logger.V(5).Info("Unable to get a valid volume source for translated PV", "PV", pvName)
			return "", ""
		}

		csiSource = csiPV.Spec.PersistentVolumeSource.CSI
	}

	return csiSource.Driver, csiSource.VolumeHandle
}

// getCSIDriverInfoFromSC returns the CSI driver name and a random volume ID of a given PVC's StorageClass.
func (pl *CSILimits) getCSIDriverInfoFromSC(logger klog.Logger, csiNode *storagev1.CSINode, pvc *v1.PersistentVolumeClaim) (string, string) {
	namespace := pvc.Namespace
	pvcName := pvc.Name
	scName := storagehelpers.GetPersistentVolumeClaimClass(pvc)

	// If StorageClass is not set or not found, then PVC must be using immediate binding mode
	// and hence it must be bound before scheduling. So it is safe to not count it.
	if scName == "" {
		logger.V(5).Info("PVC has no StorageClass", "PVC", klog.KObj(pvc))
		return "", ""
	}

	storageClass, err := pl.scLister.Get(scName)
	if err != nil {
		logger.V(5).Info("Could not get StorageClass for PVC", "PVC", klog.KObj(pvc), "err", err)
		return "", ""
	}

	// We use a random prefix to avoid conflicts with real volume IDs. If the PVC is bound during
	// the execution of the predicate and another pod on the same node uses the same volume, then
	// we will overcount the volume and consider the two as different volumes.
	volumeHandle := fmt.Sprintf("%s-%s/%s", pl.randomVolumeIDPrefix, namespace, pvcName)

	provisioner := storageClass.Provisioner
	if pl.translator.IsMigratableIntreePluginByName(provisioner) {
		if !isCSIMigrationOn(csiNode, provisioner) {
			logger.V(5).Info("CSI Migration of provisioner is not enabled", "provisioner", provisioner)
			return "", ""
		}

		driverName, err := pl.translator.GetCSINameFromInTreeName(provisioner)
		if err != nil {
			logger.V(5).Info("Unable to look up driver name from provisioner name", "provisioner", provisioner, "err", err)
			return "", ""
		}
		return driverName, volumeHandle
	}

	return provisioner, volumeHandle
}

// NewCSI initializes a new plugin and returns it.
func NewCSI(_ context.Context, _ runtime.Object, handle framework.Handle, fts feature.Features) (framework.Plugin, error) {
	informerFactory := handle.SharedInformerFactory()
	pvLister := informerFactory.Core().V1().PersistentVolumes().Lister()
	pvcLister := informerFactory.Core().V1().PersistentVolumeClaims().Lister()
	csiNodesLister := informerFactory.Storage().V1().CSINodes().Lister()
	scLister := informerFactory.Storage().V1().StorageClasses().Lister()
	csiTranslator := csitrans.New()

	return &CSILimits{
		csiNodeLister:        csiNodesLister,
		pvLister:             pvLister,
		pvcLister:            pvcLister,
		scLister:             scLister,
		randomVolumeIDPrefix: rand.String(32),
		translator:           csiTranslator,
	}, nil
}
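
// As a rough usage sketch, the scheduler's in-tree plugin registry wires this
// factory up along the lines of the snippet below. The registry and the
// FactoryAdapter helper live outside this file, and fts is the feature gate
// struct passed to the registry, so treat this as illustrative rather than
// authoritative:
//
//	registry := frameworkruntime.Registry{
//		nodevolumelimits.CSIName: frameworkruntime.FactoryAdapter(fts, nodevolumelimits.NewCSI),
//	}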

// getVolumeLimits returns the volume limits of the node, keyed by the
// attach-limit resource name of each CSI driver. Limits reported on the Node
// object are used as the starting point and are overridden by the Allocatable
// counts that drivers report on the CSINode object.
func getVolumeLimits(nodeInfo *framework.NodeInfo, csiNode *storagev1.CSINode) map[v1.ResourceName]int64 {
	// TODO: stop getting values from Node object in v1.18
	nodeVolumeLimits := volumeLimits(nodeInfo)
	if csiNode != nil {
		for i := range csiNode.Spec.Drivers {
			d := csiNode.Spec.Drivers[i]
			if d.Allocatable != nil && d.Allocatable.Count != nil {
				// TODO: drop GetCSIAttachLimitKey once we don't get values from Node object (v1.18)
				k := v1.ResourceName(volumeutil.GetCSIAttachLimitKey(d.Name))
				nodeVolumeLimits[k] = int64(*d.Allocatable.Count)
			}
		}
	}
	return nodeVolumeLimits
}
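
// For reference, a CSINode driver entry that produces such an override looks
// roughly like the following (a hand-written example with an illustrative
// driver name and node ID, not taken from this repository):
//
//	spec:
//	  drivers:
//	  - name: ebs.csi.aws.com
//	    nodeID: i-0123456789abcdef0
//	    allocatable:
//	      count: 25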