gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/test/dockerutil/gpu.go (about)

     1  // Copyright 2023 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package dockerutil provides utility functions for GPU tests.
    16  package dockerutil
    17  
    18  import (
    19  	"flag"
    20  	"fmt"
    21  	"os"
    22  
    23  	"github.com/docker/docker/api/types/container"
    24  	"github.com/docker/docker/api/types/mount"
    25  )
    26  
    27  // Flags.
    28  var (
    29  	setCOSGPU = flag.Bool("cos-gpu", false, "set to configure GPU settings for COS, as opposed to Docker")
    30  )
    31  
    32  // AllGPUCapabilities is the environment variable that enables all NVIDIA GPU
    33  // capabilities within a container.
    34  const AllGPUCapabilities = "NVIDIA_DRIVER_CAPABILITIES=all"
    35  
    36  // GPURunOpts returns Docker run options with GPU support enabled.
    37  func GPURunOpts() RunOpts {
    38  	if !*setCOSGPU {
    39  		return RunOpts{
    40  			Env: []string{AllGPUCapabilities},
    41  			DeviceRequests: []container.DeviceRequest{
    42  				{
    43  					Count:        -1,
    44  					Capabilities: [][]string{{"gpu"}},
    45  					Options:      map[string]string{},
    46  				},
    47  			},
    48  		}
    49  	}
    50  
    51  	// COS has specific settings since it has a custom installer for GPU drivers.
    52  	// See: https://cloud.google.com/container-optimized-os/docs/how-to/run-gpus#install-driver
    53  	devices := []container.DeviceMapping{}
    54  	var nvidiaDevices []string
    55  	for i := 0; true; i++ {
    56  		devicePath := fmt.Sprintf("/dev/nvidia%d", i)
    57  		if _, err := os.Stat(devicePath); err != nil {
    58  			break
    59  		}
    60  		nvidiaDevices = append(nvidiaDevices, devicePath)
    61  	}
    62  	nvidiaDevices = append(nvidiaDevices, "/dev/nvidia-uvm", "/dev/nvidiactl")
    63  	for _, device := range nvidiaDevices {
    64  		devices = append(devices, container.DeviceMapping{
    65  			PathOnHost:        device,
    66  			PathInContainer:   device,
    67  			CgroupPermissions: "rwm",
    68  		})
    69  	}
    70  
    71  	var mounts []mount.Mount
    72  	for _, nvidiaBin := range []string{
    73  		"/home/kubernetes/bin/nvidia/bin",
    74  		"/var/lib/nvidia/bin",
    75  	} {
    76  		if st, err := os.Stat(nvidiaBin); err == nil && st.IsDir() {
    77  			mounts = append(mounts, mount.Mount{
    78  				Source:   nvidiaBin,
    79  				Target:   "/usr/local/nvidia/bin",
    80  				Type:     mount.TypeBind,
    81  				ReadOnly: true,
    82  			})
    83  		}
    84  	}
    85  	for _, nvidiaLib64 := range []string{
    86  		"/home/kubernetes/bin/nvidia/lib64",
    87  		"/var/lib/nvidia/lib64",
    88  	} {
    89  		if st, err := os.Stat(nvidiaLib64); err == nil && st.IsDir() {
    90  			mounts = append(mounts, mount.Mount{
    91  				Source:   nvidiaLib64,
    92  				Target:   "/usr/local/nvidia/lib64",
    93  				Type:     mount.TypeBind,
    94  				ReadOnly: true,
    95  			})
    96  		}
    97  	}
    98  
    99  	return RunOpts{
   100  		Env:     []string{AllGPUCapabilities},
   101  		Mounts:  mounts,
   102  		Devices: devices,
   103  	}
   104  }
   105  
   106  // NumGPU crudely estimates the number of NVIDIA GPUs on the host.
   107  func NumGPU() int {
   108  	numGPU := 0
   109  	for {
   110  		_, err := os.Stat(fmt.Sprintf("/dev/nvidia%d", numGPU))
   111  		if err != nil {
   112  			break
   113  		}
   114  		numGPU++
   115  	}
   116  	return numGPU
   117  }