gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/specutils/nvidia.go (about)

     1  // Copyright 2023 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package specutils
    16  
    17  import (
    18  	"fmt"
    19  	"strconv"
    20  	"strings"
    21  
    22  	specs "github.com/opencontainers/runtime-spec/specs-go"
    23  	"gvisor.dev/gvisor/pkg/log"
    24  	"gvisor.dev/gvisor/runsc/config"
    25  )
    26  
    27  const nvdEnvVar = "NVIDIA_VISIBLE_DEVICES"
    28  
    29  // AnnotationNVProxy enables nvproxy.
    30  const AnnotationNVProxy = "dev.gvisor.internal.nvproxy"
    31  
    32  // NVProxyEnabled checks both the nvproxy annotation and conf.NVProxy to see if nvproxy is enabled.
    33  func NVProxyEnabled(spec *specs.Spec, conf *config.Config) bool {
    34  	if conf.NVProxy {
    35  		return true
    36  	}
    37  	val, ok := spec.Annotations[AnnotationNVProxy]
    38  	if !ok {
    39  		return false
    40  	}
    41  	ret, err := strconv.ParseBool(val)
    42  	if err != nil {
    43  		log.Warningf("nvproxy annotation set to invalid value %q: %w. Skipping.", val, err)
    44  	}
    45  	return ret
    46  }
    47  
    48  // GPUFunctionalityRequested returns true if the container should have access
    49  // to GPU functionality.
    50  func GPUFunctionalityRequested(spec *specs.Spec, conf *config.Config) bool {
    51  	if !NVProxyEnabled(spec, conf) {
    52  		// nvproxy disabled.
    53  		return false
    54  	}
    55  	// In GKE, the nvidia_gpu device plugin injects NVIDIA devices into
    56  	// spec.Linux.Devices when GPUs are allocated to a container.
    57  	if spec.Linux != nil {
    58  		for _, dev := range spec.Linux.Devices {
    59  			if dev.Path == "/dev/nvidiactl" {
    60  				return true
    61  			}
    62  		}
    63  	}
    64  	return gpuFunctionalityRequestedViaHook(spec, conf)
    65  }
    66  
    67  // GPUFunctionalityRequestedViaHook returns true if the container should have
    68  // access to GPU functionality configured via nvidia-container-runtime-hook.
    69  // This hook is used by:
    70  // - Docker when using `--gpus` flag from the CLI.
    71  // - nvidia-container-runtime when using its legacy mode.
    72  func GPUFunctionalityRequestedViaHook(spec *specs.Spec, conf *config.Config) bool {
    73  	if !NVProxyEnabled(spec, conf) {
    74  		// nvproxy disabled.
    75  		return false
    76  	}
    77  	return gpuFunctionalityRequestedViaHook(spec, conf)
    78  }
    79  
    80  // Precondition: NVProxyEnabled(spec, conf).
    81  func gpuFunctionalityRequestedViaHook(spec *specs.Spec, conf *config.Config) bool {
    82  	if !isNvidiaHookPresent(spec, conf) {
    83  		return false
    84  	}
    85  	// In Docker mode, GPU access is only requested if NVIDIA_VISIBLE_DEVICES is
    86  	// non-empty and set to a value that doesn't mean "no GPU".
    87  	if spec.Process == nil {
    88  		return false
    89  	}
    90  	nvd, _ := EnvVar(spec.Process.Env, nvdEnvVar)
    91  	// A value of "none" means "no GPU device, but still access to driver
    92  	// functionality", so it is not a value we check for here.
    93  	return nvd != "" && nvd != "void"
    94  }
    95  
    96  func isNvidiaHookPresent(spec *specs.Spec, conf *config.Config) bool {
    97  	if conf.NVProxyDocker {
    98  		// This has the effect of injecting the nvidia-container-runtime-hook.
    99  		return true
   100  	}
   101  
   102  	if spec.Hooks != nil {
   103  		for _, h := range spec.Hooks.Prestart {
   104  			if strings.HasSuffix(h.Path, "/nvidia-container-runtime-hook") {
   105  				return true
   106  			}
   107  		}
   108  	}
   109  	return false
   110  }
   111  
   112  // ParseNvidiaVisibleDevices parses NVIDIA_VISIBLE_DEVICES env var and returns
   113  // the devices specified in it. This can be passed to nvidia-container-cli.
   114  //
   115  // Precondition: conf.NVProxyDocker && GPUFunctionalityRequested(spec, conf).
   116  func ParseNvidiaVisibleDevices(spec *specs.Spec) (string, error) {
   117  	nvd, _ := EnvVar(spec.Process.Env, nvdEnvVar)
   118  	if nvd == "none" {
   119  		return "", nil
   120  	}
   121  	if nvd == "all" {
   122  		return "all", nil
   123  	}
   124  	// Expect nvd to be a list of indices; UUIDs aren't supported
   125  	// yet.
   126  	for _, gpuDev := range strings.Split(nvd, ",") {
   127  		// Validate gpuDev. We only support the following formats for now:
   128  		// * GPU indices (e.g. 0,1,2)
   129  		// * GPU UUIDs (e.g. GPU-fef8089b)
   130  		//
   131  		// We do not support MIG devices yet.
   132  		if strings.HasPrefix(gpuDev, "GPU-") {
   133  			continue
   134  		}
   135  		_, err := strconv.ParseUint(gpuDev, 10, 32)
   136  		if err != nil {
   137  			return "", fmt.Errorf("invalid %q in NVIDIA_VISIBLE_DEVICES %q: %w", gpuDev, nvd, err)
   138  		}
   139  	}
   140  	return nvd, nil
   141  }