volcano.sh/volcano@v1.9.0/pkg/scheduler/plugins/deviceshare/deviceshare.go (about)

     1  /*
     2  Copyright 2024 The Volcano Authors.
     3  
     4  Licensed under the Apache License, Version 2.0 (the "License");
     5  you may not use this file except in compliance with the License.
     6  You may obtain a copy of the License at
     7  
     8      http://www.apache.org/licenses/LICENSE-2.0
     9  
    10  Unless required by applicable law or agreed to in writing, software
    11  distributed under the License is distributed on an "AS IS" BASIS,
    12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    13  See the License for the specific language governing permissions and
    14  limitations under the License.
    15  */
    16  
    17  package deviceshare
    18  
    19  import (
    20  	"context"
    21  	"fmt"
    22  	"math"
    23  	"reflect"
    24  
    25  	v1 "k8s.io/api/core/v1"
    26  	"k8s.io/klog/v2"
    27  	k8sframework "k8s.io/kubernetes/pkg/scheduler/framework"
    28  
    29  	"volcano.sh/volcano/pkg/scheduler/api"
    30  	"volcano.sh/volcano/pkg/scheduler/api/devices"
    31  	"volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/gpushare"
    32  	"volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/vgpu"
    33  	"volcano.sh/volcano/pkg/scheduler/framework"
    34  )
    35  
    36  // PluginName indicates name of volcano scheduler plugin.
    37  const (
    38  	PluginName = "deviceshare"
    39  	// GPUSharingPredicate is the key for enabling GPU Sharing Predicate in YAML
    40  	GPUSharingPredicate = "deviceshare.GPUSharingEnable"
    41  	NodeLockEnable      = "deviceshare.NodeLockEnable"
    42  	GPUNumberPredicate  = "deviceshare.GPUNumberEnable"
    43  
    44  	VGPUEnable = "deviceshare.VGPUEnable"
    45  
    46  	SchedulePolicyArgument = "deviceshare.SchedulePolicy"
    47  	ScheduleWeight         = "deviceshare.ScheduleWeight"
    48  )
    49  
    50  type deviceSharePlugin struct {
    51  	// Arguments given for the plugin
    52  	pluginArguments framework.Arguments
    53  	schedulePolicy  string
    54  	scheduleWeight  int
    55  }
    56  
    57  // New return priority plugin
    58  func New(arguments framework.Arguments) framework.Plugin {
    59  	dsp := &deviceSharePlugin{pluginArguments: arguments, schedulePolicy: "", scheduleWeight: 0}
    60  	enablePredicate(dsp)
    61  	return dsp
    62  }
    63  
    64  func (dp *deviceSharePlugin) Name() string {
    65  	return PluginName
    66  }
    67  
    68  func enablePredicate(dsp *deviceSharePlugin) {
    69  	// Checks whether predicate.GPUSharingEnable is provided or not, if given, modifies the value in predicateEnable struct.
    70  	args := dsp.pluginArguments
    71  	args.GetBool(&gpushare.GpuSharingEnable, GPUSharingPredicate)
    72  	args.GetBool(&gpushare.GpuNumberEnable, GPUNumberPredicate)
    73  	args.GetBool(&gpushare.NodeLockEnable, NodeLockEnable)
    74  	args.GetBool(&vgpu.VGPUEnable, VGPUEnable)
    75  
    76  	_, ok := args[SchedulePolicyArgument]
    77  	if ok {
    78  		dsp.schedulePolicy = args[SchedulePolicyArgument].(string)
    79  	}
    80  	args.GetInt(&dsp.scheduleWeight, ScheduleWeight)
    81  
    82  	if gpushare.GpuSharingEnable && gpushare.GpuNumberEnable {
    83  		klog.Fatal("can not define true in both gpu sharing and gpu number")
    84  	}
    85  	if (gpushare.GpuSharingEnable || gpushare.GpuNumberEnable) && vgpu.VGPUEnable {
    86  		klog.Fatal("gpu-share and vgpu can't be used together")
    87  	}
    88  }
    89  
    90  func createStatus(code int, reason string) *api.Status {
    91  	status := api.Status{
    92  		Code:   code,
    93  		Reason: reason,
    94  	}
    95  	return &status
    96  }
    97  
    98  func getDeviceScore(ctx context.Context, pod *v1.Pod, node *api.NodeInfo, schedulePolicy string) (int64, *k8sframework.Status) {
    99  	s := float64(0)
   100  	for _, devices := range node.Others {
   101  		if devices.(api.Devices).HasDeviceRequest(pod) {
   102  			ns := devices.(api.Devices).ScoreNode(pod, schedulePolicy)
   103  			s += ns
   104  		}
   105  	}
   106  	klog.V(4).Infof("deviceScore for task %s/%s is: %v", pod.Namespace, pod.Name, s)
   107  	return int64(math.Floor(s + 0.5)), nil
   108  }
   109  
   110  func (dp *deviceSharePlugin) OnSessionOpen(ssn *framework.Session) {
   111  	// Register event handlers to update task info in PodLister & nodeMap
   112  	ssn.AddPredicateFn(dp.Name(), func(task *api.TaskInfo, node *api.NodeInfo) ([]*api.Status, error) {
   113  		predicateStatus := make([]*api.Status, 0)
   114  		// Check PredicateWithCache
   115  		for _, val := range api.RegisteredDevices {
   116  			if dev, ok := node.Others[val].(api.Devices); ok {
   117  				if reflect.ValueOf(dev).IsNil() {
   118  					// TODO When a pod requests a device of the current type, but the current node does not have such a device, an error is thrown
   119  					if dev == nil || dev.HasDeviceRequest(task.Pod) {
   120  						predicateStatus = append(predicateStatus, &api.Status{
   121  							Code:   devices.Unschedulable,
   122  							Reason: "node not initialized with device" + val,
   123  						})
   124  						return predicateStatus, fmt.Errorf("node not initialized with device %s", val)
   125  					}
   126  					klog.V(4).Infof("pod %s/%s did not request device %s on %s, skipping it", task.Pod.Namespace, task.Pod.Name, val, node.Name)
   127  					continue
   128  				}
   129  				code, msg, err := dev.FilterNode(task.Pod, dp.schedulePolicy)
   130  				if err != nil {
   131  					predicateStatus = append(predicateStatus, createStatus(code, msg))
   132  					return predicateStatus, err
   133  				}
   134  				filterNodeStatus := createStatus(code, msg)
   135  				if filterNodeStatus.Code != api.Success {
   136  					predicateStatus = append(predicateStatus, filterNodeStatus)
   137  					return predicateStatus, fmt.Errorf("plugin device filternode predicates failed %s", msg)
   138  				}
   139  			} else {
   140  				klog.Warningf("Devices %s assertion conversion failed, skip", val)
   141  			}
   142  		}
   143  
   144  		klog.V(4).Infof("checkDevices predicates Task <%s/%s> on Node <%s>: fit ",
   145  			task.Namespace, task.Name, node.Name)
   146  
   147  		return predicateStatus, nil
   148  	})
   149  
   150  	ssn.AddNodeOrderFn(dp.Name(), func(task *api.TaskInfo, node *api.NodeInfo) (float64, error) {
   151  		// DeviceScore
   152  		if len(dp.schedulePolicy) > 0 {
   153  			score, status := getDeviceScore(context.TODO(), task.Pod, node, dp.schedulePolicy)
   154  			if !status.IsSuccess() {
   155  				klog.Warningf("Node: %s, Calculate Device Score Failed because of Error: %v", node.Name, status.AsError())
   156  				return 0, status.AsError()
   157  			}
   158  
   159  			// TODO: we should use a seperate plugin for devices, and seperate them from predicates and nodeOrder plugin.
   160  			nodeScore := float64(score) * float64(dp.scheduleWeight)
   161  			klog.V(5).Infof("Node: %s, task<%s/%s> Device Score weight %d, score: %f", node.Name, task.Namespace, task.Name, dp.scheduleWeight, nodeScore)
   162  		}
   163  		return 0, nil
   164  	})
   165  }
   166  
   167  func (dp *deviceSharePlugin) OnSessionClose(ssn *framework.Session) {}