github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/runsc/specutils/nvidia.go (about)

     1  // Copyright 2023 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package specutils
    16  
    17  import (
    18  	"fmt"
    19  	"path"
    20  	"path/filepath"
    21  	"regexp"
    22  	"strconv"
    23  	"strings"
    24  
    25  	"github.com/MerlinKodo/gvisor/runsc/config"
    26  	specs "github.com/opencontainers/runtime-spec/specs-go"
    27  )
    28  
// nvdEnvVar is the name of the environment variable that the functions in
// this file look up in the container process environment to decide whether
// GPU access is requested and which devices are listed.
const nvdEnvVar = "NVIDIA_VISIBLE_DEVICES"
    30  
    31  // GPUFunctionalityRequested returns true if the user intends for the sandbox
    32  // to have access to GPU functionality (e.g. access to /dev/nvidiactl),
    33  // irrespective of whether or not they want access to any specific GPU.
    34  func GPUFunctionalityRequested(spec *specs.Spec, conf *config.Config) bool {
    35  	if !conf.NVProxy {
    36  		// nvproxy disabled.
    37  		return false
    38  	}
    39  	if !conf.NVProxyDocker {
    40  		// nvproxy enabled in non-Docker mode.
    41  		return true
    42  	}
    43  	// nvproxy enabled in Docker mode.
    44  	// GPU access is only requested if NVIDIA_VISIBLE_DEVICES is non-empty
    45  	// and set to a value that doesn't mean "no GPU".
    46  	if spec.Process == nil {
    47  		return false
    48  	}
    49  	nvd, _ := EnvVar(spec.Process.Env, nvdEnvVar)
    50  	// A value of "none" means "no GPU device, but still access to driver
    51  	// functionality", so it is not a value we check for here.
    52  	return nvd != "" && nvd != "void"
    53  }
    54  
    55  // FindAllGPUDevices returns the Nvidia GPU device minor numbers of all GPUs
    56  // mounted in the provided rootfs.
    57  func FindAllGPUDevices(rootfs string) ([]uint32, error) {
    58  	devPathPrefix := path.Join(rootfs, "dev/nvidia")
    59  	nvidiaDeviceRegex := regexp.MustCompile(fmt.Sprintf(`^%s(\d+)$`, devPathPrefix))
    60  	paths, err := filepath.Glob(devPathPrefix + "*")
    61  	if err != nil {
    62  		return nil, fmt.Errorf("enumerating Nvidia device files: %w", err)
    63  	}
    64  	var devMinors []uint32
    65  	for _, path := range paths {
    66  		if ms := nvidiaDeviceRegex.FindStringSubmatch(path); ms != nil {
    67  			index, err := strconv.ParseUint(ms[1], 10, 32)
    68  			if err != nil {
    69  				return nil, fmt.Errorf("invalid host device file %q: %w", path, err)
    70  			}
    71  			devMinors = append(devMinors, uint32(index))
    72  		}
    73  	}
    74  	return devMinors, nil
    75  }
    76  
    77  // NvidiaDeviceList returns the list of devices that should be visible to the
    78  // sandbox. In Docker mode, this is the set of devices specified in
    79  // NVIDIA_VISIBLE_DEVICES. In non-Docker mode, this is all Nvidia devices, as
    80  // we cannot know the set of usable GPUs until subcontainer creation.
    81  func NvidiaDeviceList(spec *specs.Spec, conf *config.Config) (string, error) {
    82  	if !GPUFunctionalityRequested(spec, conf) {
    83  		return "", nil
    84  	}
    85  	if !conf.NVProxyDocker {
    86  		// nvproxy enabled in non-Docker mode.
    87  		// Return all GPUs on the machine.
    88  		return "all", nil
    89  	}
    90  	// nvproxy is enabled in Docker mode.
    91  	nvd, _ := EnvVar(spec.Process.Env, nvdEnvVar)
    92  	if nvd == "none" {
    93  		return "", nil
    94  	}
    95  	if nvd == "all" {
    96  		return "all", nil
    97  	}
    98  	// Expect nvd to be a list of indices; UUIDs aren't supported
    99  	// yet.
   100  	for _, gpuDev := range strings.Split(nvd, ",") {
   101  		// Validate gpuDev. We only support the following formats for now:
   102  		// * GPU indices (e.g. 0,1,2)
   103  		// * GPU UUIDs (e.g. GPU-fef8089b)
   104  		//
   105  		// We do not support MIG devices yet.
   106  		if strings.HasPrefix(gpuDev, "GPU-") {
   107  			continue
   108  		}
   109  		_, err := strconv.ParseUint(gpuDev, 10, 32)
   110  		if err != nil {
   111  			return "", fmt.Errorf("invalid %q in NVIDIA_VISIBLE_DEVICES %q: %w", gpuDev, nvd, err)
   112  		}
   113  	}
   114  	return nvd, nil
   115  }