volcano.sh/volcano@v1.9.0/pkg/scheduler/capabilities/volumebinding/volume_binding.go (about)

     1  /*
     2  Copyright 2019 The Kubernetes Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package volumebinding
    18  
    19  import (
    20  	"context"
    21  	"errors"
    22  	"fmt"
    23  	"sync"
    24  	"time"
    25  
    26  	v1 "k8s.io/api/core/v1"
    27  	apierrors "k8s.io/apimachinery/pkg/api/errors"
    28  	"k8s.io/apimachinery/pkg/runtime"
    29  	corelisters "k8s.io/client-go/listers/core/v1"
    30  	"k8s.io/component-helpers/storage/ephemeral"
    31  	"k8s.io/klog/v2"
    32  	"k8s.io/kubernetes/pkg/scheduler/apis/config"
    33  	"k8s.io/kubernetes/pkg/scheduler/apis/config/validation"
    34  	"k8s.io/kubernetes/pkg/scheduler/framework"
    35  	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
    36  	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
    37  	"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
    38  
    39  	"volcano.sh/volcano/cmd/scheduler/app/options"
    40  )
    41  
const (
	// stateKey is the key under which this plugin stores its *stateData in
	// the framework.CycleState. It reuses the plugin name.
	stateKey framework.StateKey = Name

	// maxUtilization is not referenced in this file.
	// NOTE(review): presumably the upper bound (in percent) used by the
	// capacity scorer elsewhere in this package - confirm.
	maxUtilization = 100
)
    47  
// stateData is the per-scheduling-cycle state of the plugin.
//
// The state is written to framework.CycleState in the PreFilter phase.
// Because a pointer is stored (and Clone returns that same pointer), later
// phases mutate the value in place and do not need to call Write again.
type stateData struct {
	// allBound is set in Reserve and read in PreBind; when true PreBind has
	// nothing to bind.
	allBound bool
	// podVolumesByNode holds the pod's volume information found in the Filter
	// phase for each node, keyed by node name.
	// It is initialized in the PreFilter phase when the pod references PVCs;
	// for pods without PVCs PreFilter stores a zero stateData and this map
	// stays nil.
	podVolumesByNode map[string]*PodVolumes
	// podVolumeClaims is the classification of the pod's claims computed in
	// PreFilter and handed to the binder in Filter.
	podVolumeClaims *PodVolumeClaims
	// The embedded mutex guards the writes to podVolumesByNode performed by
	// Filter.
	sync.Mutex
}
    60  
// Clone implements framework.StateData. It returns the receiver itself, not a
// copy, so every clone of the cycle state shares the same stateData (and the
// same mutex).
func (d *stateData) Clone() framework.StateData {
	return d
}
    64  
// VolumeBinding is a plugin that binds pod volumes in scheduling.
// In the Filter phase, pod binding cache is created for the pod and used in
// Reserve and PreBind phases.
type VolumeBinding struct {
	// Binder performs the actual volume lookups, assumptions and bindings.
	Binder SchedulerVolumeBinder
	// PVCLister is used by podHasPVCs to look up the claims a pod references.
	PVCLister corelisters.PersistentVolumeClaimLister
	// scorer is nil unless New was called with EnableVolumeCapacityPriority;
	// Score returns 0 when it is nil.
	scorer volumeCapacityScorer
	// fts holds the feature settings passed to New.
	fts feature.Features
}
    74  
    75  var _ framework.PreFilterPlugin = &VolumeBinding{}
    76  var _ framework.FilterPlugin = &VolumeBinding{}
    77  var _ framework.ReservePlugin = &VolumeBinding{}
    78  var _ framework.PreBindPlugin = &VolumeBinding{}
    79  var _ framework.ScorePlugin = &VolumeBinding{}
    80  var _ framework.EnqueueExtensions = &VolumeBinding{}
    81  
// Name is the name of the plugin used in Registry and configurations.
// It is also reused as the CycleState key (see stateKey).
const Name = names.VolumeBinding
    84  
// Name returns the name of the plugin (the package-level Name constant).
// It is used in logs, etc.
func (pl *VolumeBinding) Name() string {
	return Name
}
    89  
    90  // EventsToRegister returns the possible events that may make a Pod
    91  // failed by this plugin schedulable.
    92  func (pl *VolumeBinding) EventsToRegister() []framework.ClusterEventWithHint {
    93  	events := []framework.ClusterEventWithHint{
    94  		// Pods may fail because of missing or mis-configured storage class
    95  		// (e.g., allowedTopologies, volumeBindingMode), and hence may become
    96  		// schedulable upon StorageClass Add or Update events.
    97  		{Event: framework.ClusterEvent{Resource: framework.StorageClass, ActionType: framework.Add | framework.Update}},
    98  		// We bind PVCs with PVs, so any changes may make the pods schedulable.
    99  		{Event: framework.ClusterEvent{Resource: framework.PersistentVolumeClaim, ActionType: framework.Add | framework.Update}},
   100  		{Event: framework.ClusterEvent{Resource: framework.PersistentVolume, ActionType: framework.Add | framework.Update}},
   101  		// Pods may fail to find available PVs because the node labels do not
   102  		// match the storage class's allowed topologies or PV's node affinity.
   103  		// A new or updated node may make pods schedulable.
   104  		{Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel}},
   105  		// We rely on CSI node to translate in-tree PV to CSI.
   106  		{Event: framework.ClusterEvent{Resource: framework.CSINode, ActionType: framework.Add | framework.Update}},
   107  		// When CSIStorageCapacity is enabled, pods may become schedulable
   108  		// on CSI driver & storage capacity changes.
   109  		{Event: framework.ClusterEvent{Resource: framework.CSIDriver, ActionType: framework.Add | framework.Update}},
   110  		{Event: framework.ClusterEvent{Resource: framework.CSIStorageCapacity, ActionType: framework.Add | framework.Update}},
   111  	}
   112  	return events
   113  }
   114  
   115  // podHasPVCs returns 2 values:
   116  // - the first one to denote if the given "pod" has any PVC defined.
   117  // - the second one to return any error if the requested PVC is illegal.
   118  func (pl *VolumeBinding) podHasPVCs(pod *v1.Pod) (bool, error) {
   119  	hasPVC := false
   120  	for _, vol := range pod.Spec.Volumes {
   121  		var pvcName string
   122  		isEphemeral := false
   123  		switch {
   124  		case vol.PersistentVolumeClaim != nil:
   125  			pvcName = vol.PersistentVolumeClaim.ClaimName
   126  		case vol.Ephemeral != nil:
   127  			pvcName = ephemeral.VolumeClaimName(pod, &vol)
   128  			isEphemeral = true
   129  		default:
   130  			// Volume is not using a PVC, ignore
   131  			continue
   132  		}
   133  		hasPVC = true
   134  		pvc, err := pl.PVCLister.PersistentVolumeClaims(pod.Namespace).Get(pvcName)
   135  		if err != nil {
   136  			// The error usually has already enough context ("persistentvolumeclaim "myclaim" not found"),
   137  			// but we can do better for generic ephemeral inline volumes where that situation
   138  			// is normal directly after creating a pod.
   139  			if isEphemeral && apierrors.IsNotFound(err) {
   140  				err = fmt.Errorf("waiting for ephemeral volume controller to create the persistentvolumeclaim %q", pvcName)
   141  			}
   142  			return hasPVC, err
   143  		}
   144  
   145  		if pvc.Status.Phase == v1.ClaimLost {
   146  			return hasPVC, fmt.Errorf("persistentvolumeclaim %q bound to non-existent persistentvolume %q", pvc.Name, pvc.Spec.VolumeName)
   147  		}
   148  
   149  		if pvc.DeletionTimestamp != nil {
   150  			return hasPVC, fmt.Errorf("persistentvolumeclaim %q is being deleted", pvc.Name)
   151  		}
   152  
   153  		if isEphemeral {
   154  			if err := ephemeral.VolumeIsForPod(pod, pvc); err != nil {
   155  				return hasPVC, err
   156  			}
   157  		}
   158  	}
   159  	return hasPVC, nil
   160  }
   161  
   162  // PreFilter invoked at the prefilter extension point to check if pod has all
   163  // immediate PVCs bound. If not all immediate PVCs are bound, an
   164  // UnschedulableAndUnresolvable is returned.
   165  func (pl *VolumeBinding) PreFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
   166  	logger := klog.FromContext(ctx)
   167  	// If pod does not reference any PVC, we don't need to do anything.
   168  	if hasPVC, err := pl.podHasPVCs(pod); err != nil {
   169  		return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error())
   170  	} else if !hasPVC {
   171  		state.Write(stateKey, &stateData{})
   172  		return nil, framework.NewStatus(framework.Skip)
   173  	}
   174  	podVolumeClaims, err := pl.Binder.GetPodVolumeClaims(logger, pod)
   175  	if err != nil {
   176  		return nil, framework.AsStatus(err)
   177  	}
   178  	if len(podVolumeClaims.unboundClaimsImmediate) > 0 {
   179  		// Return UnschedulableAndUnresolvable error if immediate claims are
   180  		// not bound. Pod will be moved to active/backoff queues once these
   181  		// claims are bound by PV controller.
   182  		status := framework.NewStatus(framework.UnschedulableAndUnresolvable)
   183  		status.AppendReason("pod has unbound immediate PersistentVolumeClaims")
   184  		return nil, status
   185  	}
   186  	// Attempt to reduce down the number of nodes to consider in subsequent scheduling stages if pod has bound claims.
   187  	var result *framework.PreFilterResult
   188  	if eligibleNodes := pl.Binder.GetEligibleNodes(logger, podVolumeClaims.boundClaims); eligibleNodes != nil {
   189  		result = &framework.PreFilterResult{
   190  			NodeNames: eligibleNodes,
   191  		}
   192  	}
   193  
   194  	state.Write(stateKey, &stateData{
   195  		podVolumesByNode: make(map[string]*PodVolumes),
   196  		podVolumeClaims: &PodVolumeClaims{
   197  			boundClaims:                podVolumeClaims.boundClaims,
   198  			unboundClaimsDelayBinding:  podVolumeClaims.unboundClaimsDelayBinding,
   199  			unboundVolumesDelayBinding: podVolumeClaims.unboundVolumesDelayBinding,
   200  		},
   201  	})
   202  	return result, nil
   203  }
   204  
// PreFilterExtensions returns prefilter extensions, pod add and remove.
// This plugin provides none, so it always returns nil.
func (pl *VolumeBinding) PreFilterExtensions() framework.PreFilterExtensions {
	return nil
}
   209  
   210  func getStateData(cs *framework.CycleState) (*stateData, error) {
   211  	state, err := cs.Read(stateKey)
   212  	if err != nil {
   213  		return nil, err
   214  	}
   215  	s, ok := state.(*stateData)
   216  	if !ok {
   217  		return nil, errors.New("unable to convert state into stateData")
   218  	}
   219  	return s, nil
   220  }
   221  
   222  // Filter invoked at the filter extension point.
   223  // It evaluates if a pod can fit due to the volumes it requests,
   224  // for both bound and unbound PVCs.
   225  //
   226  // For PVCs that are bound, then it checks that the corresponding PV's node affinity is
   227  // satisfied by the given node.
   228  //
   229  // For PVCs that are unbound, it tries to find available PVs that can satisfy the PVC requirements
   230  // and that the PV node affinity is satisfied by the given node.
   231  //
   232  // If storage capacity tracking is enabled, then enough space has to be available
   233  // for the node and volumes that still need to be created.
   234  //
   235  // The predicate returns true if all bound PVCs have compatible PVs with the node, and if all unbound
   236  // PVCs can be matched with an available and node-compatible PV.
   237  func (pl *VolumeBinding) Filter(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
   238  	logger := klog.FromContext(ctx)
   239  	node := nodeInfo.Node()
   240  
   241  	state, err := getStateData(cs)
   242  	if err != nil {
   243  		return framework.AsStatus(err)
   244  	}
   245  
   246  	podVolumes, reasons, err := pl.Binder.FindPodVolumes(logger, pod, state.podVolumeClaims, node)
   247  
   248  	if err != nil {
   249  		return framework.AsStatus(err)
   250  	}
   251  
   252  	if len(reasons) > 0 {
   253  		status := framework.NewStatus(framework.UnschedulableAndUnresolvable)
   254  		for _, reason := range reasons {
   255  			status.AppendReason(string(reason))
   256  		}
   257  		return status
   258  	}
   259  
   260  	// multiple goroutines call `Filter` on different nodes simultaneously and the `CycleState` may be duplicated, so we must use a local lock here
   261  	state.Lock()
   262  	state.podVolumesByNode[node.Name] = podVolumes
   263  	state.Unlock()
   264  	return nil
   265  }
   266  
   267  // Score invoked at the score extension point.
   268  func (pl *VolumeBinding) Score(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) {
   269  	if pl.scorer == nil {
   270  		return 0, nil
   271  	}
   272  	state, err := getStateData(cs)
   273  	if err != nil {
   274  		return 0, framework.AsStatus(err)
   275  	}
   276  	podVolumes, ok := state.podVolumesByNode[nodeName]
   277  	if !ok {
   278  		return 0, nil
   279  	}
   280  	// group by storage class
   281  	classResources := make(classResourceMap)
   282  	for _, staticBinding := range podVolumes.StaticBindings {
   283  		class := staticBinding.StorageClassName()
   284  		storageResource := staticBinding.StorageResource()
   285  		if _, ok := classResources[class]; !ok {
   286  			classResources[class] = &StorageResource{
   287  				Requested: 0,
   288  				Capacity:  0,
   289  			}
   290  		}
   291  		classResources[class].Requested += storageResource.Requested
   292  		classResources[class].Capacity += storageResource.Capacity
   293  	}
   294  	return pl.scorer(classResources), nil
   295  }
   296  
// ScoreExtensions of the Score plugin.
// This plugin provides none, so it always returns nil.
func (pl *VolumeBinding) ScoreExtensions() framework.ScoreExtensions {
	return nil
}
   301  
   302  // Reserve reserves volumes of pod and saves binding status in cycle state.
   303  func (pl *VolumeBinding) Reserve(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status {
   304  	state, err := getStateData(cs)
   305  	if err != nil {
   306  		return framework.AsStatus(err)
   307  	}
   308  	// we don't need to hold the lock as only one node will be reserved for the given pod
   309  	podVolumes, ok := state.podVolumesByNode[nodeName]
   310  	if ok {
   311  		allBound, err := pl.Binder.AssumePodVolumes(klog.FromContext(ctx), pod, nodeName, podVolumes)
   312  		if err != nil {
   313  			return framework.AsStatus(err)
   314  		}
   315  		state.allBound = allBound
   316  	} else {
   317  		// may not exist if the pod does not reference any PVC
   318  		state.allBound = true
   319  	}
   320  	return nil
   321  }
   322  
   323  // PreBind will make the API update with the assumed bindings and wait until
   324  // the PV controller has completely finished the binding operation.
   325  //
   326  // If binding errors, times out or gets undone, then an error will be returned to
   327  // retry scheduling.
   328  func (pl *VolumeBinding) PreBind(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status {
   329  	s, err := getStateData(cs)
   330  	if err != nil {
   331  		return framework.AsStatus(err)
   332  	}
   333  	if s.allBound {
   334  		// no need to bind volumes
   335  		return nil
   336  	}
   337  	// we don't need to hold the lock as only one node will be pre-bound for the given pod
   338  	podVolumes, ok := s.podVolumesByNode[nodeName]
   339  	if !ok {
   340  		return framework.AsStatus(fmt.Errorf("no pod volumes found for node %q", nodeName))
   341  	}
   342  	logger := klog.FromContext(ctx)
   343  	logger.V(5).Info("Trying to bind volumes for pod", "pod", klog.KObj(pod))
   344  	err = pl.Binder.BindPodVolumes(ctx, pod, podVolumes)
   345  	if err != nil {
   346  		logger.V(5).Info("Failed to bind volumes for pod", "pod", klog.KObj(pod), "err", err)
   347  		return framework.AsStatus(err)
   348  	}
   349  	logger.V(5).Info("Success binding volumes for pod", "pod", klog.KObj(pod))
   350  	return nil
   351  }
   352  
   353  // Unreserve clears assumed PV and PVC cache.
   354  // It's idempotent, and does nothing if no cache found for the given pod.
   355  func (pl *VolumeBinding) Unreserve(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) {
   356  	s, err := getStateData(cs)
   357  	if err != nil {
   358  		return
   359  	}
   360  	// we don't need to hold the lock as only one node may be unreserved
   361  	podVolumes, ok := s.podVolumesByNode[nodeName]
   362  	if !ok {
   363  		return
   364  	}
   365  	pl.Binder.RevertAssumedPodVolumes(podVolumes)
   366  }
   367  
   368  // New initializes a new plugin and returns it.
   369  func New(ctx context.Context, plArgs runtime.Object, fh framework.Handle, fts feature.Features) (framework.Plugin, error) {
   370  	args, ok := plArgs.(*config.VolumeBindingArgs)
   371  	if !ok {
   372  		return nil, fmt.Errorf("want args to be of type VolumeBindingArgs, got %T", plArgs)
   373  	}
   374  	if err := validation.ValidateVolumeBindingArgsWithOptions(nil, args, validation.VolumeBindingArgsValidationOptions{
   375  		AllowVolumeCapacityPriority: fts.EnableVolumeCapacityPriority,
   376  	}); err != nil {
   377  		return nil, err
   378  	}
   379  	podInformer := fh.SharedInformerFactory().Core().V1().Pods()
   380  	nodeInformer := fh.SharedInformerFactory().Core().V1().Nodes()
   381  	pvcInformer := fh.SharedInformerFactory().Core().V1().PersistentVolumeClaims()
   382  	pvInformer := fh.SharedInformerFactory().Core().V1().PersistentVolumes()
   383  	storageClassInformer := fh.SharedInformerFactory().Storage().V1().StorageClasses()
   384  	csiNodeInformer := fh.SharedInformerFactory().Storage().V1().CSINodes()
   385  	var capacityCheck *CapacityCheck
   386  	if options.ServerOpts.EnableCSIStorage {
   387  		capacityCheck = &CapacityCheck{
   388  			CSIDriverInformer:          fh.SharedInformerFactory().Storage().V1().CSIDrivers(),
   389  			CSIStorageCapacityInformer: fh.SharedInformerFactory().Storage().V1beta1().CSIStorageCapacities(),
   390  		}
   391  	}
   392  	binder := NewVolumeBinder(klog.FromContext(ctx), fh.ClientSet(), podInformer, nodeInformer, csiNodeInformer, pvcInformer, pvInformer, storageClassInformer, capacityCheck, time.Duration(args.BindTimeoutSeconds)*time.Second)
   393  
   394  	// build score function
   395  	var scorer volumeCapacityScorer
   396  	if fts.EnableVolumeCapacityPriority {
   397  		shape := make(helper.FunctionShape, 0, len(args.Shape))
   398  		for _, point := range args.Shape {
   399  			shape = append(shape, helper.FunctionShapePoint{
   400  				Utilization: int64(point.Utilization),
   401  				Score:       int64(point.Score) * (framework.MaxNodeScore / config.MaxCustomPriorityScore),
   402  			})
   403  		}
   404  		scorer = buildScorerFunction(shape)
   405  	}
   406  	return &VolumeBinding{
   407  		Binder:    binder,
   408  		PVCLister: pvcInformer.Lister(),
   409  		scorer:    scorer,
   410  		fts:       fts,
   411  	}, nil
   412  }