gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/specutils/nvidia.go (about) 1 // Copyright 2023 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package specutils 16 17 import ( 18 "fmt" 19 "strconv" 20 "strings" 21 22 specs "github.com/opencontainers/runtime-spec/specs-go" 23 "gvisor.dev/gvisor/pkg/log" 24 "gvisor.dev/gvisor/runsc/config" 25 ) 26 27 const nvdEnvVar = "NVIDIA_VISIBLE_DEVICES" 28 29 // AnnotationNVProxy enables nvproxy. 30 const AnnotationNVProxy = "dev.gvisor.internal.nvproxy" 31 32 // NVProxyEnabled checks both the nvproxy annotation and conf.NVProxy to see if nvproxy is enabled. 33 func NVProxyEnabled(spec *specs.Spec, conf *config.Config) bool { 34 if conf.NVProxy { 35 return true 36 } 37 val, ok := spec.Annotations[AnnotationNVProxy] 38 if !ok { 39 return false 40 } 41 ret, err := strconv.ParseBool(val) 42 if err != nil { 43 log.Warningf("nvproxy annotation set to invalid value %q: %w. Skipping.", val, err) 44 } 45 return ret 46 } 47 48 // GPUFunctionalityRequested returns true if the container should have access 49 // to GPU functionality. 50 func GPUFunctionalityRequested(spec *specs.Spec, conf *config.Config) bool { 51 if !NVProxyEnabled(spec, conf) { 52 // nvproxy disabled. 53 return false 54 } 55 // In GKE, the nvidia_gpu device plugin injects NVIDIA devices into 56 // spec.Linux.Devices when GPUs are allocated to a container. 57 if spec.Linux != nil { 58 for _, dev := range spec.Linux.Devices { 59 if dev.Path == "/dev/nvidiactl" { 60 return true 61 } 62 } 63 } 64 return gpuFunctionalityRequestedViaHook(spec, conf) 65 } 66 67 // GPUFunctionalityRequestedViaHook returns true if the container should have 68 // access to GPU functionality configured via nvidia-container-runtime-hook. 69 // This hook is used by: 70 // - Docker when using `--gpus` flag from the CLI. 71 // - nvidia-container-runtime when using its legacy mode. 72 func GPUFunctionalityRequestedViaHook(spec *specs.Spec, conf *config.Config) bool { 73 if !NVProxyEnabled(spec, conf) { 74 // nvproxy disabled. 75 return false 76 } 77 return gpuFunctionalityRequestedViaHook(spec, conf) 78 } 79 80 // Precondition: NVProxyEnabled(spec, conf). 81 func gpuFunctionalityRequestedViaHook(spec *specs.Spec, conf *config.Config) bool { 82 if !isNvidiaHookPresent(spec, conf) { 83 return false 84 } 85 // In Docker mode, GPU access is only requested if NVIDIA_VISIBLE_DEVICES is 86 // non-empty and set to a value that doesn't mean "no GPU". 87 if spec.Process == nil { 88 return false 89 } 90 nvd, _ := EnvVar(spec.Process.Env, nvdEnvVar) 91 // A value of "none" means "no GPU device, but still access to driver 92 // functionality", so it is not a value we check for here. 93 return nvd != "" && nvd != "void" 94 } 95 96 func isNvidiaHookPresent(spec *specs.Spec, conf *config.Config) bool { 97 if conf.NVProxyDocker { 98 // This has the effect of injecting the nvidia-container-runtime-hook. 99 return true 100 } 101 102 if spec.Hooks != nil { 103 for _, h := range spec.Hooks.Prestart { 104 if strings.HasSuffix(h.Path, "/nvidia-container-runtime-hook") { 105 return true 106 } 107 } 108 } 109 return false 110 } 111 112 // ParseNvidiaVisibleDevices parses NVIDIA_VISIBLE_DEVICES env var and returns 113 // the devices specified in it. This can be passed to nvidia-container-cli. 114 // 115 // Precondition: conf.NVProxyDocker && GPUFunctionalityRequested(spec, conf). 116 func ParseNvidiaVisibleDevices(spec *specs.Spec) (string, error) { 117 nvd, _ := EnvVar(spec.Process.Env, nvdEnvVar) 118 if nvd == "none" { 119 return "", nil 120 } 121 if nvd == "all" { 122 return "all", nil 123 } 124 // Expect nvd to be a list of indices; UUIDs aren't supported 125 // yet. 126 for _, gpuDev := range strings.Split(nvd, ",") { 127 // Validate gpuDev. We only support the following formats for now: 128 // * GPU indices (e.g. 0,1,2) 129 // * GPU UUIDs (e.g. GPU-fef8089b) 130 // 131 // We do not support MIG devices yet. 132 if strings.HasPrefix(gpuDev, "GPU-") { 133 continue 134 } 135 _, err := strconv.ParseUint(gpuDev, 10, 32) 136 if err != nil { 137 return "", fmt.Errorf("invalid %q in NVIDIA_VISIBLE_DEVICES %q: %w", gpuDev, nvd, err) 138 } 139 } 140 return nvd, nil 141 }