volcano.sh/volcano@v1.9.0/pkg/scheduler/plugins/deviceshare/deviceshare.go (about) 1 /* 2 Copyright 2024 The Volcano Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package deviceshare 18 19 import ( 20 "context" 21 "fmt" 22 "math" 23 "reflect" 24 25 v1 "k8s.io/api/core/v1" 26 "k8s.io/klog/v2" 27 k8sframework "k8s.io/kubernetes/pkg/scheduler/framework" 28 29 "volcano.sh/volcano/pkg/scheduler/api" 30 "volcano.sh/volcano/pkg/scheduler/api/devices" 31 "volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/gpushare" 32 "volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/vgpu" 33 "volcano.sh/volcano/pkg/scheduler/framework" 34 ) 35 36 // PluginName indicates name of volcano scheduler plugin. 37 const ( 38 PluginName = "deviceshare" 39 // GPUSharingPredicate is the key for enabling GPU Sharing Predicate in YAML 40 GPUSharingPredicate = "deviceshare.GPUSharingEnable" 41 NodeLockEnable = "deviceshare.NodeLockEnable" 42 GPUNumberPredicate = "deviceshare.GPUNumberEnable" 43 44 VGPUEnable = "deviceshare.VGPUEnable" 45 46 SchedulePolicyArgument = "deviceshare.SchedulePolicy" 47 ScheduleWeight = "deviceshare.ScheduleWeight" 48 ) 49 50 type deviceSharePlugin struct { 51 // Arguments given for the plugin 52 pluginArguments framework.Arguments 53 schedulePolicy string 54 scheduleWeight int 55 } 56 57 // New return priority plugin 58 func New(arguments framework.Arguments) framework.Plugin { 59 dsp := &deviceSharePlugin{pluginArguments: arguments, schedulePolicy: "", scheduleWeight: 0} 60 enablePredicate(dsp) 61 return dsp 62 } 63 64 func (dp *deviceSharePlugin) Name() string { 65 return PluginName 66 } 67 68 func enablePredicate(dsp *deviceSharePlugin) { 69 // Checks whether predicate.GPUSharingEnable is provided or not, if given, modifies the value in predicateEnable struct. 70 args := dsp.pluginArguments 71 args.GetBool(&gpushare.GpuSharingEnable, GPUSharingPredicate) 72 args.GetBool(&gpushare.GpuNumberEnable, GPUNumberPredicate) 73 args.GetBool(&gpushare.NodeLockEnable, NodeLockEnable) 74 args.GetBool(&vgpu.VGPUEnable, VGPUEnable) 75 76 _, ok := args[SchedulePolicyArgument] 77 if ok { 78 dsp.schedulePolicy = args[SchedulePolicyArgument].(string) 79 } 80 args.GetInt(&dsp.scheduleWeight, ScheduleWeight) 81 82 if gpushare.GpuSharingEnable && gpushare.GpuNumberEnable { 83 klog.Fatal("can not define true in both gpu sharing and gpu number") 84 } 85 if (gpushare.GpuSharingEnable || gpushare.GpuNumberEnable) && vgpu.VGPUEnable { 86 klog.Fatal("gpu-share and vgpu can't be used together") 87 } 88 } 89 90 func createStatus(code int, reason string) *api.Status { 91 status := api.Status{ 92 Code: code, 93 Reason: reason, 94 } 95 return &status 96 } 97 98 func getDeviceScore(ctx context.Context, pod *v1.Pod, node *api.NodeInfo, schedulePolicy string) (int64, *k8sframework.Status) { 99 s := float64(0) 100 for _, devices := range node.Others { 101 if devices.(api.Devices).HasDeviceRequest(pod) { 102 ns := devices.(api.Devices).ScoreNode(pod, schedulePolicy) 103 s += ns 104 } 105 } 106 klog.V(4).Infof("deviceScore for task %s/%s is: %v", pod.Namespace, pod.Name, s) 107 return int64(math.Floor(s + 0.5)), nil 108 } 109 110 func (dp *deviceSharePlugin) OnSessionOpen(ssn *framework.Session) { 111 // Register event handlers to update task info in PodLister & nodeMap 112 ssn.AddPredicateFn(dp.Name(), func(task *api.TaskInfo, node *api.NodeInfo) ([]*api.Status, error) { 113 predicateStatus := make([]*api.Status, 0) 114 // Check PredicateWithCache 115 for _, val := range api.RegisteredDevices { 116 if dev, ok := node.Others[val].(api.Devices); ok { 117 if reflect.ValueOf(dev).IsNil() { 118 // TODO When a pod requests a device of the current type, but the current node does not have such a device, an error is thrown 119 if dev == nil || dev.HasDeviceRequest(task.Pod) { 120 predicateStatus = append(predicateStatus, &api.Status{ 121 Code: devices.Unschedulable, 122 Reason: "node not initialized with device" + val, 123 }) 124 return predicateStatus, fmt.Errorf("node not initialized with device %s", val) 125 } 126 klog.V(4).Infof("pod %s/%s did not request device %s on %s, skipping it", task.Pod.Namespace, task.Pod.Name, val, node.Name) 127 continue 128 } 129 code, msg, err := dev.FilterNode(task.Pod, dp.schedulePolicy) 130 if err != nil { 131 predicateStatus = append(predicateStatus, createStatus(code, msg)) 132 return predicateStatus, err 133 } 134 filterNodeStatus := createStatus(code, msg) 135 if filterNodeStatus.Code != api.Success { 136 predicateStatus = append(predicateStatus, filterNodeStatus) 137 return predicateStatus, fmt.Errorf("plugin device filternode predicates failed %s", msg) 138 } 139 } else { 140 klog.Warningf("Devices %s assertion conversion failed, skip", val) 141 } 142 } 143 144 klog.V(4).Infof("checkDevices predicates Task <%s/%s> on Node <%s>: fit ", 145 task.Namespace, task.Name, node.Name) 146 147 return predicateStatus, nil 148 }) 149 150 ssn.AddNodeOrderFn(dp.Name(), func(task *api.TaskInfo, node *api.NodeInfo) (float64, error) { 151 // DeviceScore 152 if len(dp.schedulePolicy) > 0 { 153 score, status := getDeviceScore(context.TODO(), task.Pod, node, dp.schedulePolicy) 154 if !status.IsSuccess() { 155 klog.Warningf("Node: %s, Calculate Device Score Failed because of Error: %v", node.Name, status.AsError()) 156 return 0, status.AsError() 157 } 158 159 // TODO: we should use a seperate plugin for devices, and seperate them from predicates and nodeOrder plugin. 160 nodeScore := float64(score) * float64(dp.scheduleWeight) 161 klog.V(5).Infof("Node: %s, task<%s/%s> Device Score weight %d, score: %f", node.Name, task.Namespace, task.Name, dp.scheduleWeight, nodeScore) 162 } 163 return 0, nil 164 }) 165 } 166 167 func (dp *deviceSharePlugin) OnSessionClose(ssn *framework.Session) {}