github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/runsc/specutils/nvidia.go (about) 1 // Copyright 2023 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package specutils 16 17 import ( 18 "fmt" 19 "path" 20 "path/filepath" 21 "regexp" 22 "strconv" 23 "strings" 24 25 "github.com/MerlinKodo/gvisor/runsc/config" 26 specs "github.com/opencontainers/runtime-spec/specs-go" 27 ) 28 29 const nvdEnvVar = "NVIDIA_VISIBLE_DEVICES" 30 31 // GPUFunctionalityRequested returns true if the user intends for the sandbox 32 // to have access to GPU functionality (e.g. access to /dev/nvidiactl), 33 // irrespective of whether or not they want access to any specific GPU. 34 func GPUFunctionalityRequested(spec *specs.Spec, conf *config.Config) bool { 35 if !conf.NVProxy { 36 // nvproxy disabled. 37 return false 38 } 39 if !conf.NVProxyDocker { 40 // nvproxy enabled in non-Docker mode. 41 return true 42 } 43 // nvproxy enabled in Docker mode. 44 // GPU access is only requested if NVIDIA_VISIBLE_DEVICES is non-empty 45 // and set to a value that doesn't mean "no GPU". 46 if spec.Process == nil { 47 return false 48 } 49 nvd, _ := EnvVar(spec.Process.Env, nvdEnvVar) 50 // A value of "none" means "no GPU device, but still access to driver 51 // functionality", so it is not a value we check for here. 52 return nvd != "" && nvd != "void" 53 } 54 55 // FindAllGPUDevices returns the Nvidia GPU device minor numbers of all GPUs 56 // mounted in the provided rootfs. 57 func FindAllGPUDevices(rootfs string) ([]uint32, error) { 58 devPathPrefix := path.Join(rootfs, "dev/nvidia") 59 nvidiaDeviceRegex := regexp.MustCompile(fmt.Sprintf(`^%s(\d+)$`, devPathPrefix)) 60 paths, err := filepath.Glob(devPathPrefix + "*") 61 if err != nil { 62 return nil, fmt.Errorf("enumerating Nvidia device files: %w", err) 63 } 64 var devMinors []uint32 65 for _, path := range paths { 66 if ms := nvidiaDeviceRegex.FindStringSubmatch(path); ms != nil { 67 index, err := strconv.ParseUint(ms[1], 10, 32) 68 if err != nil { 69 return nil, fmt.Errorf("invalid host device file %q: %w", path, err) 70 } 71 devMinors = append(devMinors, uint32(index)) 72 } 73 } 74 return devMinors, nil 75 } 76 77 // NvidiaDeviceList returns the list of devices that should be visible to the 78 // sandbox. In Docker mode, this is the set of devices specified in 79 // NVIDIA_VISIBLE_DEVICES. In non-Docker mode, this is all Nvidia devices, as 80 // we cannot know the set of usable GPUs until subcontainer creation. 81 func NvidiaDeviceList(spec *specs.Spec, conf *config.Config) (string, error) { 82 if !GPUFunctionalityRequested(spec, conf) { 83 return "", nil 84 } 85 if !conf.NVProxyDocker { 86 // nvproxy enabled in non-Docker mode. 87 // Return all GPUs on the machine. 88 return "all", nil 89 } 90 // nvproxy is enabled in Docker mode. 91 nvd, _ := EnvVar(spec.Process.Env, nvdEnvVar) 92 if nvd == "none" { 93 return "", nil 94 } 95 if nvd == "all" { 96 return "all", nil 97 } 98 // Expect nvd to be a list of indices; UUIDs aren't supported 99 // yet. 100 for _, gpuDev := range strings.Split(nvd, ",") { 101 // Validate gpuDev. We only support the following formats for now: 102 // * GPU indices (e.g. 0,1,2) 103 // * GPU UUIDs (e.g. GPU-fef8089b) 104 // 105 // We do not support MIG devices yet. 106 if strings.HasPrefix(gpuDev, "GPU-") { 107 continue 108 } 109 _, err := strconv.ParseUint(gpuDev, 10, 32) 110 if err != nil { 111 return "", fmt.Errorf("invalid %q in NVIDIA_VISIBLE_DEVICES %q: %w", gpuDev, nvd, err) 112 } 113 } 114 return nvd, nil 115 }