volcano.sh/volcano@v1.9.0/pkg/scheduler/api/devices/nvidia/gpushare/device_info.go (about) 1 /* 2 Copyright 2023 The Volcano Authors. 3 4 Licensed under the Apache License, Version 2.0 (the "License"); 5 you may not use this file except in compliance with the License. 6 You may obtain a copy of the License at 7 8 http://www.apache.org/licenses/LICENSE-2.0 9 10 Unless required by applicable law or agreed to in writing, software 11 distributed under the License is distributed on an "AS IS" BASIS, 12 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 See the License for the specific language governing permissions and 14 limitations under the License. 15 */ 16 17 package gpushare 18 19 import ( 20 "context" 21 "fmt" 22 23 "github.com/pkg/errors" 24 v1 "k8s.io/api/core/v1" 25 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 26 "k8s.io/apimachinery/pkg/types" 27 "k8s.io/client-go/kubernetes" 28 "k8s.io/klog/v2" 29 30 "volcano.sh/volcano/pkg/scheduler/api/devices" 31 "volcano.sh/volcano/pkg/scheduler/plugins/util/nodelock" 32 ) 33 34 // GPUDevice include gpu id, memory and the pods that are sharing it. 35 type GPUDevice struct { 36 // GPU ID 37 ID int 38 // The pods that are sharing this GPU 39 PodMap map[string]*v1.Pod 40 // memory per card 41 Memory uint 42 } 43 44 type GPUDevices struct { 45 Name string 46 47 Device map[int]*GPUDevice 48 } 49 50 // NewGPUDevice creates a device 51 func NewGPUDevice(id int, mem uint) *GPUDevice { 52 return &GPUDevice{ 53 ID: id, 54 Memory: mem, 55 PodMap: map[string]*v1.Pod{}, 56 } 57 } 58 59 func NewGPUDevices(name string, node *v1.Node) *GPUDevices { 60 if node == nil { 61 return nil 62 } 63 memory, ok := node.Status.Capacity[VolcanoGPUResource] 64 if !ok { 65 return nil 66 } 67 totalMemory := memory.Value() 68 69 res, ok := node.Status.Capacity[VolcanoGPUNumber] 70 if !ok { 71 return nil 72 } 73 gpuNumber := res.Value() 74 if gpuNumber == 0 { 75 klog.Warningf("invalid %s=%s", VolcanoGPUNumber, res.String()) 76 return nil 77 } 78 79 memoryPerCard := uint(totalMemory / gpuNumber) 80 gpudevices := GPUDevices{} 81 gpudevices.Device = make(map[int]*GPUDevice) 82 gpudevices.Name = name 83 for i := 0; i < int(gpuNumber); i++ { 84 gpudevices.Device[i] = NewGPUDevice(i, memoryPerCard) 85 } 86 unhealthyGPUs := getUnhealthyGPUs(&gpudevices, node) 87 for i := range unhealthyGPUs { 88 klog.V(4).Infof("delete unhealthy gpu id %d from GPUDevices", unhealthyGPUs[i]) 89 delete(gpudevices.Device, unhealthyGPUs[i]) 90 } 91 return &gpudevices 92 } 93 94 // GetIgnoredDevices return device names which wish vc-scheduler to ignore 95 func (gs *GPUDevices) GetIgnoredDevices() []string { 96 return []string{""} 97 } 98 99 // AddResource adds the pod to GPU pool if it is assigned 100 func (gs *GPUDevices) AddResource(pod *v1.Pod) { 101 gpuRes := getGPUMemoryOfPod(pod) 102 if gpuRes > 0 { 103 ids := GetGPUIndex(pod) 104 for _, id := range ids { 105 if dev := gs.Device[id]; dev != nil { 106 dev.PodMap[string(pod.UID)] = pod 107 } 108 } 109 } 110 } 111 112 // SubResource frees the gpu hold by the pod 113 func (gs *GPUDevices) SubResource(pod *v1.Pod) { 114 gpuRes := getGPUMemoryOfPod(pod) 115 if gpuRes > 0 { 116 ids := GetGPUIndex(pod) 117 for _, id := range ids { 118 if dev := gs.Device[id]; dev != nil { 119 delete(dev.PodMap, string(pod.UID)) 120 } 121 } 122 } 123 } 124 125 func (gs *GPUDevices) HasDeviceRequest(pod *v1.Pod) bool { 126 if GpuSharingEnable && getGPUMemoryOfPod(pod) > 0 || 127 GpuNumberEnable && getGPUNumberOfPod(pod) > 0 { 128 return true 129 } 130 return false 131 } 132 133 func (gs *GPUDevices) Release(kubeClient kubernetes.Interface, pod *v1.Pod) error { 134 ids := GetGPUIndex(pod) 135 patch := RemoveGPUIndexPatch() 136 _, err := kubeClient.CoreV1().Pods(pod.Namespace).Patch(context.TODO(), pod.Name, types.JSONPatchType, []byte(patch), metav1.PatchOptions{}) 137 if err != nil { 138 return errors.Errorf("patch pod %s failed with patch %s: %v", pod.Name, patch, err) 139 } 140 141 for _, id := range ids { 142 if dev, ok := gs.Device[id]; ok { 143 delete(dev.PodMap, string(pod.UID)) 144 } 145 } 146 147 klog.V(4).Infof("predicates with gpu sharing, update pod %s/%s deallocate from node [%s]", pod.Namespace, pod.Name, gs.Name) 148 return nil 149 } 150 151 func (gs *GPUDevices) FilterNode(pod *v1.Pod, schedulePolicy string) (int, string, error) { 152 klog.V(4).Infoln("DeviceSharing:Into FitInPod", pod.Name) 153 if GpuSharingEnable { 154 fit, err := checkNodeGPUSharingPredicate(pod, gs) 155 if err != nil || !fit { 156 klog.Errorln("deviceSharing err=", err.Error()) 157 return devices.Unschedulable, fmt.Sprintf("GpuShare %s", err.Error()), err 158 } 159 } 160 if GpuNumberEnable { 161 fit, err := checkNodeGPUNumberPredicate(pod, gs) 162 if err != nil || !fit { 163 klog.Errorln("deviceSharing err=", err.Error()) 164 return devices.Unschedulable, fmt.Sprintf("GpuNumber %s", err.Error()), err 165 } 166 } 167 klog.V(4).Infoln("DeviceSharing:FitInPod successed") 168 return devices.Success, "", nil 169 } 170 171 func (gs *GPUDevices) GetStatus() string { 172 return "" 173 } 174 175 func (gs *GPUDevices) ScoreNode(pod *v1.Pod, schedulePolicy string) float64 { 176 return 0 177 } 178 179 func (gs *GPUDevices) Allocate(kubeClient kubernetes.Interface, pod *v1.Pod) error { 180 klog.V(4).Infoln("DeviceSharing:Into AllocateToPod", pod.Name) 181 if getGPUMemoryOfPod(pod) > 0 { 182 if NodeLockEnable { 183 nodelock.UseClient(kubeClient) 184 err := nodelock.LockNode(gs.Name, "gpu") 185 if err != nil { 186 return errors.Errorf("node %s locked for lockname gpushare %s", gs.Name, err.Error()) 187 } 188 } 189 ids := predicateGPUbyMemory(pod, gs) 190 if len(ids) == 0 { 191 return errors.Errorf("the node %s can't place the pod %s in ns %s", pod.Spec.NodeName, pod.Name, pod.Namespace) 192 } 193 id := ids[0] 194 patch := AddGPUIndexPatch([]int{id}) 195 pod, err := kubeClient.CoreV1().Pods(pod.Namespace).Patch(context.TODO(), pod.Name, types.JSONPatchType, []byte(patch), metav1.PatchOptions{}) 196 if err != nil { 197 return errors.Errorf("patch pod %s failed with patch %s: %v", pod.Name, patch, err) 198 } 199 dev, ok := gs.Device[id] 200 if !ok { 201 return errors.Errorf("failed to get GPU %d from node %s", id, gs.Name) 202 } 203 dev.PodMap[string(pod.UID)] = pod 204 klog.V(4).Infof("predicates with gpu sharing, update pod %s/%s allocate to node [%s]", pod.Namespace, pod.Name, gs.Name) 205 } 206 if getGPUNumberOfPod(pod) > 0 { 207 ids := predicateGPUbyNumber(pod, gs) 208 if len(ids) == 0 { 209 return errors.Errorf("the node %s can't place the pod %s in ns %s", pod.Spec.NodeName, pod.Name, pod.Namespace) 210 } 211 patch := AddGPUIndexPatch(ids) 212 pod, err := kubeClient.CoreV1().Pods(pod.Namespace).Patch(context.TODO(), pod.Name, types.JSONPatchType, []byte(patch), metav1.PatchOptions{}) 213 if err != nil { 214 return errors.Errorf("patch pod %s failed with patch %s: %v", pod.Name, patch, err) 215 } 216 for _, id := range ids { 217 dev, ok := gs.Device[id] 218 if !ok { 219 return errors.Errorf("failed to get GPU %d from node %s", id, gs.Name) 220 } 221 dev.PodMap[string(pod.UID)] = pod 222 } 223 klog.V(4).Infof("predicates with gpu number, update pod %s/%s allocate to node [%s]", pod.Namespace, pod.Name, gs.Name) 224 } 225 return nil 226 }