github.com/ttpreport/gvisor-ligolo@v0.0.0-20240123134145-a858404967ba/runsc/specutils/nvidia.go (about)

     1  // Copyright 2023 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package specutils
    16  
    17  import (
    18  	"fmt"
    19  	"path/filepath"
    20  	"regexp"
    21  	"strconv"
    22  	"strings"
    23  
    24  	specs "github.com/opencontainers/runtime-spec/specs-go"
    25  	"github.com/ttpreport/gvisor-ligolo/pkg/log"
    26  	"github.com/ttpreport/gvisor-ligolo/runsc/config"
    27  )
    28  
// nvdEnvVar is the name of the container environment variable that this file
// consults (in nvproxy Docker mode) to decide which Nvidia GPUs, if any, the
// container is asking for.
const nvdEnvVar = "NVIDIA_VISIBLE_DEVICES"
    30  
    31  // GPUFunctionalityRequested returns true if the user intends for the sandbox
    32  // to have access to GPU functionality (e.g. access to /dev/nvidiactl),
    33  // irrespective of whether or not they want access to any specific GPU.
    34  func GPUFunctionalityRequested(spec *specs.Spec, conf *config.Config) bool {
    35  	if !conf.NVProxy {
    36  		// nvproxy disabled.
    37  		return false
    38  	}
    39  	if !conf.NVProxyDocker {
    40  		// nvproxy enabled in non-Docker mode.
    41  		return true
    42  	}
    43  	// nvproxy enabled in Docker mode.
    44  	// GPU access is only requested if NVIDIA_VISIBLE_DEVICES is non-empty
    45  	// and set to a value that doesn't mean "no GPU".
    46  	if spec.Process == nil {
    47  		return false
    48  	}
    49  	nvd, _ := EnvVar(spec.Process.Env, nvdEnvVar)
    50  	// A value of "none" means "no GPU device, but still access to driver
    51  	// functionality", so it is not a value we check for here.
    52  	return nvd != "" && nvd != "void"
    53  }
    54  
    55  // CanAccessAtLeastOneGPU returns true if the sandbox and container should
    56  // be able to access at least one Nvidia GPU. This is a function of the
    57  // sandbox configuration and the container spec's NVIDIA_VISIBLE_DEVICES
    58  // environment variable.
    59  func CanAccessAtLeastOneGPU(spec *specs.Spec, conf *config.Config) bool {
    60  	gpus, err := NvidiaDeviceNumbers(spec, conf)
    61  	if err != nil {
    62  		log.Warningf("Cannot determine if the container should have access to GPUs: %v", err)
    63  		return false
    64  	}
    65  	return len(gpus) > 0
    66  }
    67  
    68  // nvidiaDeviceRegex matches Nvidia GPU device paths.
    69  var nvidiaDeviceRegex = regexp.MustCompile(`^/dev/nvidia(\d+)$`)
    70  
    71  // findAllGPUDevices returns the Nvidia GPU device minor numbers of all GPUs
    72  // on the machine.
    73  func findAllGPUDevices() ([]uint32, error) {
    74  	paths, err := filepath.Glob("/dev/nvidia*")
    75  	if err != nil {
    76  		return nil, fmt.Errorf("enumerating Nvidia device files: %w", err)
    77  	}
    78  	var devMinors []uint32
    79  	for _, path := range paths {
    80  		if ms := nvidiaDeviceRegex.FindStringSubmatch(path); ms != nil {
    81  			index, err := strconv.ParseUint(ms[1], 10, 32)
    82  			if err != nil {
    83  				return nil, fmt.Errorf("invalid host device file %q: %w", path, err)
    84  			}
    85  			devMinors = append(devMinors, uint32(index))
    86  		}
    87  	}
    88  	return devMinors, nil
    89  }
    90  
    91  // NvidiaDeviceNumbers returns the Nvidia GPU device minor numbers that
    92  // should be visible to the specified container.
    93  // In Docker mode, this is the set of devices specified in
    94  // NVIDIA_VISIBLE_DEVICES.
    95  // In non-Docker mode, this is all Nvidia devices, as we cannot know the set
    96  // of usable GPUs until subcontainer creation.
    97  func NvidiaDeviceNumbers(spec *specs.Spec, conf *config.Config) ([]uint32, error) {
    98  	if !GPUFunctionalityRequested(spec, conf) {
    99  		return nil, nil
   100  	}
   101  	if !conf.NVProxyDocker {
   102  		// nvproxy enabled in non-Docker mode.
   103  		// Return all GPUs on the machine.
   104  		return findAllGPUDevices()
   105  	}
   106  	// nvproxy is enabled in Docker mode.
   107  	nvd, _ := EnvVar(spec.Process.Env, nvdEnvVar)
   108  	if nvd == "none" {
   109  		return nil, nil
   110  	}
   111  	if nvd == "all" {
   112  		return findAllGPUDevices()
   113  	}
   114  	var devMinors []uint32
   115  	// Expect nvd to be a list of indices; UUIDs aren't supported
   116  	// yet.
   117  	for _, indexStr := range strings.Split(nvd, ",") {
   118  		index, err := strconv.ParseUint(indexStr, 10, 32)
   119  		if err != nil {
   120  			return nil, fmt.Errorf("invalid %q in NVIDIA_VISIBLE_DEVICES %q: %w", indexStr, nvd, err)
   121  		}
   122  		devMinors = append(devMinors, uint32(index))
   123  	}
   124  	return devMinors, nil
   125  }