volcano.sh/volcano@v1.9.0/pkg/scheduler/capabilities/volumebinding/volume_binding.go (about) 1 /* 2 Copyright 2019 The Kubernetes Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package volumebinding 18 19 import ( 20 "context" 21 "errors" 22 "fmt" 23 "sync" 24 "time" 25 26 v1 "k8s.io/api/core/v1" 27 apierrors "k8s.io/apimachinery/pkg/api/errors" 28 "k8s.io/apimachinery/pkg/runtime" 29 corelisters "k8s.io/client-go/listers/core/v1" 30 "k8s.io/component-helpers/storage/ephemeral" 31 "k8s.io/klog/v2" 32 "k8s.io/kubernetes/pkg/scheduler/apis/config" 33 "k8s.io/kubernetes/pkg/scheduler/apis/config/validation" 34 "k8s.io/kubernetes/pkg/scheduler/framework" 35 "k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature" 36 "k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper" 37 "k8s.io/kubernetes/pkg/scheduler/framework/plugins/names" 38 39 "volcano.sh/volcano/cmd/scheduler/app/options" 40 ) 41 42 const ( 43 stateKey framework.StateKey = Name 44 45 maxUtilization = 100 46 ) 47 48 // the state is initialized in PreFilter phase. because we save the pointer in 49 // framework.CycleState, in the later phases we don't need to call Write method 50 // to update the value 51 type stateData struct { 52 allBound bool 53 // podVolumesByNode holds the pod's volume information found in the Filter 54 // phase for each node 55 // it's initialized in the PreFilter phase 56 podVolumesByNode map[string]*PodVolumes 57 podVolumeClaims *PodVolumeClaims 58 sync.Mutex 59 } 60 61 func (d *stateData) Clone() framework.StateData { 62 return d 63 } 64 65 // VolumeBinding is a plugin that binds pod volumes in scheduling. 66 // In the Filter phase, pod binding cache is created for the pod and used in 67 // Reserve and PreBind phases. 68 type VolumeBinding struct { 69 Binder SchedulerVolumeBinder 70 PVCLister corelisters.PersistentVolumeClaimLister 71 scorer volumeCapacityScorer 72 fts feature.Features 73 } 74 75 var _ framework.PreFilterPlugin = &VolumeBinding{} 76 var _ framework.FilterPlugin = &VolumeBinding{} 77 var _ framework.ReservePlugin = &VolumeBinding{} 78 var _ framework.PreBindPlugin = &VolumeBinding{} 79 var _ framework.ScorePlugin = &VolumeBinding{} 80 var _ framework.EnqueueExtensions = &VolumeBinding{} 81 82 // Name is the name of the plugin used in Registry and configurations. 83 const Name = names.VolumeBinding 84 85 // Name returns name of the plugin. It is used in logs, etc. 86 func (pl *VolumeBinding) Name() string { 87 return Name 88 } 89 90 // EventsToRegister returns the possible events that may make a Pod 91 // failed by this plugin schedulable. 92 func (pl *VolumeBinding) EventsToRegister() []framework.ClusterEventWithHint { 93 events := []framework.ClusterEventWithHint{ 94 // Pods may fail because of missing or mis-configured storage class 95 // (e.g., allowedTopologies, volumeBindingMode), and hence may become 96 // schedulable upon StorageClass Add or Update events. 97 {Event: framework.ClusterEvent{Resource: framework.StorageClass, ActionType: framework.Add | framework.Update}}, 98 // We bind PVCs with PVs, so any changes may make the pods schedulable. 99 {Event: framework.ClusterEvent{Resource: framework.PersistentVolumeClaim, ActionType: framework.Add | framework.Update}}, 100 {Event: framework.ClusterEvent{Resource: framework.PersistentVolume, ActionType: framework.Add | framework.Update}}, 101 // Pods may fail to find available PVs because the node labels do not 102 // match the storage class's allowed topologies or PV's node affinity. 103 // A new or updated node may make pods schedulable. 104 {Event: framework.ClusterEvent{Resource: framework.Node, ActionType: framework.Add | framework.UpdateNodeLabel}}, 105 // We rely on CSI node to translate in-tree PV to CSI. 106 {Event: framework.ClusterEvent{Resource: framework.CSINode, ActionType: framework.Add | framework.Update}}, 107 // When CSIStorageCapacity is enabled, pods may become schedulable 108 // on CSI driver & storage capacity changes. 109 {Event: framework.ClusterEvent{Resource: framework.CSIDriver, ActionType: framework.Add | framework.Update}}, 110 {Event: framework.ClusterEvent{Resource: framework.CSIStorageCapacity, ActionType: framework.Add | framework.Update}}, 111 } 112 return events 113 } 114 115 // podHasPVCs returns 2 values: 116 // - the first one to denote if the given "pod" has any PVC defined. 117 // - the second one to return any error if the requested PVC is illegal. 118 func (pl *VolumeBinding) podHasPVCs(pod *v1.Pod) (bool, error) { 119 hasPVC := false 120 for _, vol := range pod.Spec.Volumes { 121 var pvcName string 122 isEphemeral := false 123 switch { 124 case vol.PersistentVolumeClaim != nil: 125 pvcName = vol.PersistentVolumeClaim.ClaimName 126 case vol.Ephemeral != nil: 127 pvcName = ephemeral.VolumeClaimName(pod, &vol) 128 isEphemeral = true 129 default: 130 // Volume is not using a PVC, ignore 131 continue 132 } 133 hasPVC = true 134 pvc, err := pl.PVCLister.PersistentVolumeClaims(pod.Namespace).Get(pvcName) 135 if err != nil { 136 // The error usually has already enough context ("persistentvolumeclaim "myclaim" not found"), 137 // but we can do better for generic ephemeral inline volumes where that situation 138 // is normal directly after creating a pod. 139 if isEphemeral && apierrors.IsNotFound(err) { 140 err = fmt.Errorf("waiting for ephemeral volume controller to create the persistentvolumeclaim %q", pvcName) 141 } 142 return hasPVC, err 143 } 144 145 if pvc.Status.Phase == v1.ClaimLost { 146 return hasPVC, fmt.Errorf("persistentvolumeclaim %q bound to non-existent persistentvolume %q", pvc.Name, pvc.Spec.VolumeName) 147 } 148 149 if pvc.DeletionTimestamp != nil { 150 return hasPVC, fmt.Errorf("persistentvolumeclaim %q is being deleted", pvc.Name) 151 } 152 153 if isEphemeral { 154 if err := ephemeral.VolumeIsForPod(pod, pvc); err != nil { 155 return hasPVC, err 156 } 157 } 158 } 159 return hasPVC, nil 160 } 161 162 // PreFilter invoked at the prefilter extension point to check if pod has all 163 // immediate PVCs bound. If not all immediate PVCs are bound, an 164 // UnschedulableAndUnresolvable is returned. 165 func (pl *VolumeBinding) PreFilter(ctx context.Context, state *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) { 166 logger := klog.FromContext(ctx) 167 // If pod does not reference any PVC, we don't need to do anything. 168 if hasPVC, err := pl.podHasPVCs(pod); err != nil { 169 return nil, framework.NewStatus(framework.UnschedulableAndUnresolvable, err.Error()) 170 } else if !hasPVC { 171 state.Write(stateKey, &stateData{}) 172 return nil, framework.NewStatus(framework.Skip) 173 } 174 podVolumeClaims, err := pl.Binder.GetPodVolumeClaims(logger, pod) 175 if err != nil { 176 return nil, framework.AsStatus(err) 177 } 178 if len(podVolumeClaims.unboundClaimsImmediate) > 0 { 179 // Return UnschedulableAndUnresolvable error if immediate claims are 180 // not bound. Pod will be moved to active/backoff queues once these 181 // claims are bound by PV controller. 182 status := framework.NewStatus(framework.UnschedulableAndUnresolvable) 183 status.AppendReason("pod has unbound immediate PersistentVolumeClaims") 184 return nil, status 185 } 186 // Attempt to reduce down the number of nodes to consider in subsequent scheduling stages if pod has bound claims. 187 var result *framework.PreFilterResult 188 if eligibleNodes := pl.Binder.GetEligibleNodes(logger, podVolumeClaims.boundClaims); eligibleNodes != nil { 189 result = &framework.PreFilterResult{ 190 NodeNames: eligibleNodes, 191 } 192 } 193 194 state.Write(stateKey, &stateData{ 195 podVolumesByNode: make(map[string]*PodVolumes), 196 podVolumeClaims: &PodVolumeClaims{ 197 boundClaims: podVolumeClaims.boundClaims, 198 unboundClaimsDelayBinding: podVolumeClaims.unboundClaimsDelayBinding, 199 unboundVolumesDelayBinding: podVolumeClaims.unboundVolumesDelayBinding, 200 }, 201 }) 202 return result, nil 203 } 204 205 // PreFilterExtensions returns prefilter extensions, pod add and remove. 206 func (pl *VolumeBinding) PreFilterExtensions() framework.PreFilterExtensions { 207 return nil 208 } 209 210 func getStateData(cs *framework.CycleState) (*stateData, error) { 211 state, err := cs.Read(stateKey) 212 if err != nil { 213 return nil, err 214 } 215 s, ok := state.(*stateData) 216 if !ok { 217 return nil, errors.New("unable to convert state into stateData") 218 } 219 return s, nil 220 } 221 222 // Filter invoked at the filter extension point. 223 // It evaluates if a pod can fit due to the volumes it requests, 224 // for both bound and unbound PVCs. 225 // 226 // For PVCs that are bound, then it checks that the corresponding PV's node affinity is 227 // satisfied by the given node. 228 // 229 // For PVCs that are unbound, it tries to find available PVs that can satisfy the PVC requirements 230 // and that the PV node affinity is satisfied by the given node. 231 // 232 // If storage capacity tracking is enabled, then enough space has to be available 233 // for the node and volumes that still need to be created. 234 // 235 // The predicate returns true if all bound PVCs have compatible PVs with the node, and if all unbound 236 // PVCs can be matched with an available and node-compatible PV. 237 func (pl *VolumeBinding) Filter(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status { 238 logger := klog.FromContext(ctx) 239 node := nodeInfo.Node() 240 241 state, err := getStateData(cs) 242 if err != nil { 243 return framework.AsStatus(err) 244 } 245 246 podVolumes, reasons, err := pl.Binder.FindPodVolumes(logger, pod, state.podVolumeClaims, node) 247 248 if err != nil { 249 return framework.AsStatus(err) 250 } 251 252 if len(reasons) > 0 { 253 status := framework.NewStatus(framework.UnschedulableAndUnresolvable) 254 for _, reason := range reasons { 255 status.AppendReason(string(reason)) 256 } 257 return status 258 } 259 260 // multiple goroutines call `Filter` on different nodes simultaneously and the `CycleState` may be duplicated, so we must use a local lock here 261 state.Lock() 262 state.podVolumesByNode[node.Name] = podVolumes 263 state.Unlock() 264 return nil 265 } 266 267 // Score invoked at the score extension point. 268 func (pl *VolumeBinding) Score(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) (int64, *framework.Status) { 269 if pl.scorer == nil { 270 return 0, nil 271 } 272 state, err := getStateData(cs) 273 if err != nil { 274 return 0, framework.AsStatus(err) 275 } 276 podVolumes, ok := state.podVolumesByNode[nodeName] 277 if !ok { 278 return 0, nil 279 } 280 // group by storage class 281 classResources := make(classResourceMap) 282 for _, staticBinding := range podVolumes.StaticBindings { 283 class := staticBinding.StorageClassName() 284 storageResource := staticBinding.StorageResource() 285 if _, ok := classResources[class]; !ok { 286 classResources[class] = &StorageResource{ 287 Requested: 0, 288 Capacity: 0, 289 } 290 } 291 classResources[class].Requested += storageResource.Requested 292 classResources[class].Capacity += storageResource.Capacity 293 } 294 return pl.scorer(classResources), nil 295 } 296 297 // ScoreExtensions of the Score plugin. 298 func (pl *VolumeBinding) ScoreExtensions() framework.ScoreExtensions { 299 return nil 300 } 301 302 // Reserve reserves volumes of pod and saves binding status in cycle state. 303 func (pl *VolumeBinding) Reserve(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status { 304 state, err := getStateData(cs) 305 if err != nil { 306 return framework.AsStatus(err) 307 } 308 // we don't need to hold the lock as only one node will be reserved for the given pod 309 podVolumes, ok := state.podVolumesByNode[nodeName] 310 if ok { 311 allBound, err := pl.Binder.AssumePodVolumes(klog.FromContext(ctx), pod, nodeName, podVolumes) 312 if err != nil { 313 return framework.AsStatus(err) 314 } 315 state.allBound = allBound 316 } else { 317 // may not exist if the pod does not reference any PVC 318 state.allBound = true 319 } 320 return nil 321 } 322 323 // PreBind will make the API update with the assumed bindings and wait until 324 // the PV controller has completely finished the binding operation. 325 // 326 // If binding errors, times out or gets undone, then an error will be returned to 327 // retry scheduling. 328 func (pl *VolumeBinding) PreBind(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status { 329 s, err := getStateData(cs) 330 if err != nil { 331 return framework.AsStatus(err) 332 } 333 if s.allBound { 334 // no need to bind volumes 335 return nil 336 } 337 // we don't need to hold the lock as only one node will be pre-bound for the given pod 338 podVolumes, ok := s.podVolumesByNode[nodeName] 339 if !ok { 340 return framework.AsStatus(fmt.Errorf("no pod volumes found for node %q", nodeName)) 341 } 342 logger := klog.FromContext(ctx) 343 logger.V(5).Info("Trying to bind volumes for pod", "pod", klog.KObj(pod)) 344 err = pl.Binder.BindPodVolumes(ctx, pod, podVolumes) 345 if err != nil { 346 logger.V(5).Info("Failed to bind volumes for pod", "pod", klog.KObj(pod), "err", err) 347 return framework.AsStatus(err) 348 } 349 logger.V(5).Info("Success binding volumes for pod", "pod", klog.KObj(pod)) 350 return nil 351 } 352 353 // Unreserve clears assumed PV and PVC cache. 354 // It's idempotent, and does nothing if no cache found for the given pod. 355 func (pl *VolumeBinding) Unreserve(ctx context.Context, cs *framework.CycleState, pod *v1.Pod, nodeName string) { 356 s, err := getStateData(cs) 357 if err != nil { 358 return 359 } 360 // we don't need to hold the lock as only one node may be unreserved 361 podVolumes, ok := s.podVolumesByNode[nodeName] 362 if !ok { 363 return 364 } 365 pl.Binder.RevertAssumedPodVolumes(podVolumes) 366 } 367 368 // New initializes a new plugin and returns it. 369 func New(ctx context.Context, plArgs runtime.Object, fh framework.Handle, fts feature.Features) (framework.Plugin, error) { 370 args, ok := plArgs.(*config.VolumeBindingArgs) 371 if !ok { 372 return nil, fmt.Errorf("want args to be of type VolumeBindingArgs, got %T", plArgs) 373 } 374 if err := validation.ValidateVolumeBindingArgsWithOptions(nil, args, validation.VolumeBindingArgsValidationOptions{ 375 AllowVolumeCapacityPriority: fts.EnableVolumeCapacityPriority, 376 }); err != nil { 377 return nil, err 378 } 379 podInformer := fh.SharedInformerFactory().Core().V1().Pods() 380 nodeInformer := fh.SharedInformerFactory().Core().V1().Nodes() 381 pvcInformer := fh.SharedInformerFactory().Core().V1().PersistentVolumeClaims() 382 pvInformer := fh.SharedInformerFactory().Core().V1().PersistentVolumes() 383 storageClassInformer := fh.SharedInformerFactory().Storage().V1().StorageClasses() 384 csiNodeInformer := fh.SharedInformerFactory().Storage().V1().CSINodes() 385 var capacityCheck *CapacityCheck 386 if options.ServerOpts.EnableCSIStorage { 387 capacityCheck = &CapacityCheck{ 388 CSIDriverInformer: fh.SharedInformerFactory().Storage().V1().CSIDrivers(), 389 CSIStorageCapacityInformer: fh.SharedInformerFactory().Storage().V1beta1().CSIStorageCapacities(), 390 } 391 } 392 binder := NewVolumeBinder(klog.FromContext(ctx), fh.ClientSet(), podInformer, nodeInformer, csiNodeInformer, pvcInformer, pvInformer, storageClassInformer, capacityCheck, time.Duration(args.BindTimeoutSeconds)*time.Second) 393 394 // build score function 395 var scorer volumeCapacityScorer 396 if fts.EnableVolumeCapacityPriority { 397 shape := make(helper.FunctionShape, 0, len(args.Shape)) 398 for _, point := range args.Shape { 399 shape = append(shape, helper.FunctionShapePoint{ 400 Utilization: int64(point.Utilization), 401 Score: int64(point.Score) * (framework.MaxNodeScore / config.MaxCustomPriorityScore), 402 }) 403 } 404 scorer = buildScorerFunction(shape) 405 } 406 return &VolumeBinding{ 407 Binder: binder, 408 PVCLister: pvcInformer.Lister(), 409 scorer: scorer, 410 fts: fts, 411 }, nil 412 }