gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/test/dockerutil/gpu.go

// Copyright 2023 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package dockerutil provides utility functions for GPU tests.
package dockerutil

import (
	"flag"
	"fmt"
	"os"

	"github.com/docker/docker/api/types/container"
	"github.com/docker/docker/api/types/mount"
)

// Flags.
var (
	setCOSGPU = flag.Bool("cos-gpu", false, "set to configure GPU settings for COS, as opposed to Docker")
)

// AllGPUCapabilities is the environment variable that enables all NVIDIA GPU
// capabilities within a container.
const AllGPUCapabilities = "NVIDIA_DRIVER_CAPABILITIES=all"

// GPURunOpts returns Docker run options with GPU support enabled.
func GPURunOpts() RunOpts {
	if !*setCOSGPU {
		// Default path: rely on Docker's native GPU support. Count == -1
		// requests all GPUs, mirroring `docker run --gpus all`.
		return RunOpts{
			Env: []string{AllGPUCapabilities},
			DeviceRequests: []container.DeviceRequest{
				{
					Count:        -1,
					Capabilities: [][]string{{"gpu"}},
					Options:      map[string]string{},
				},
			},
		}
	}

	// COS has specific settings since it has a custom installer for GPU drivers.
	// See: https://cloud.google.com/container-optimized-os/docs/how-to/run-gpus#install-driver
	devices := []container.DeviceMapping{}
	var nvidiaDevices []string
	// Probe /dev/nvidia0, /dev/nvidia1, ... until the first missing device.
	for i := 0; true; i++ {
		devicePath := fmt.Sprintf("/dev/nvidia%d", i)
		if _, err := os.Stat(devicePath); err != nil {
			break
		}
		nvidiaDevices = append(nvidiaDevices, devicePath)
	}
	// The UVM and control devices are not per-GPU; map them unconditionally.
	nvidiaDevices = append(nvidiaDevices, "/dev/nvidia-uvm", "/dev/nvidiactl")
	for _, device := range nvidiaDevices {
		devices = append(devices, container.DeviceMapping{
			PathOnHost:        device,
			PathInContainer:   device,
			CgroupPermissions: "rwm",
		})
	}

	// Bind-mount the COS driver installer's binary and library directories
	// into the container at the standard NVIDIA locations.
	var mounts []mount.Mount
	for _, nvidiaBin := range []string{
		"/home/kubernetes/bin/nvidia/bin",
		"/var/lib/nvidia/bin",
	} {
		if st, err := os.Stat(nvidiaBin); err == nil && st.IsDir() {
			mounts = append(mounts, mount.Mount{
				Source:   nvidiaBin,
				Target:   "/usr/local/nvidia/bin",
				Type:     mount.TypeBind,
				ReadOnly: true,
			})
		}
	}
	for _, nvidiaLib64 := range []string{
		"/home/kubernetes/bin/nvidia/lib64",
		"/var/lib/nvidia/lib64",
	} {
		if st, err := os.Stat(nvidiaLib64); err == nil && st.IsDir() {
			mounts = append(mounts, mount.Mount{
				Source:   nvidiaLib64,
				Target:   "/usr/local/nvidia/lib64",
				Type:     mount.TypeBind,
				ReadOnly: true,
			})
		}
	}

	return RunOpts{
		Env:     []string{AllGPUCapabilities},
		Mounts:  mounts,
		Devices: devices,
	}
}

// NumGPU crudely estimates the number of NVIDIA GPUs on the host by counting
// consecutive /dev/nvidia%d device files, starting at /dev/nvidia0.
func NumGPU() int {
	numGPU := 0
	for {
		_, err := os.Stat(fmt.Sprintf("/dev/nvidia%d", numGPU))
		if err != nil {
			break
		}
		numGPU++
	}
	return numGPU
}
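
// ---
// Illustrative sketch (not part of the file above): how the non-COS
// DeviceRequest produced by GPURunOpts maps onto a container.HostConfig when
// creating a container directly with the Docker Go client. The standalone
// main package, the "nvidia/cuda" image name, and the nvidia-smi command are
// assumptions for the sake of a self-contained example; starting and waiting
// on the container are omitted.
package main

import (
	"context"
	"log"

	"github.com/docker/docker/api/types/container"
	"github.com/docker/docker/client"
)

func main() {
	ctx := context.Background()
	cli, err := client.NewClientWithOpts(client.FromEnv, client.WithAPIVersionNegotiation())
	if err != nil {
		log.Fatal(err)
	}
	defer cli.Close()

	// Same shape as the DeviceRequest built in GPURunOpts: Count == -1
	// requests every GPU on the host, like `docker run --gpus all`.
	hostCfg := &container.HostConfig{
		Resources: container.Resources{
			DeviceRequests: []container.DeviceRequest{{
				Count:        -1,
				Capabilities: [][]string{{"gpu"}},
			}},
		},
	}
	cfg := &container.Config{
		Image: "nvidia/cuda", // hypothetical image
		Env:   []string{"NVIDIA_DRIVER_CAPABILITIES=all"},
		Cmd:   []string{"nvidia-smi"},
	}
	resp, err := cli.ContainerCreate(ctx, cfg, hostCfg, nil, nil, "")
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("created container %s", resp.ID)
}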
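
// ---
// Illustrative sketch (not part of the file above): how a test might consume
// these helpers, skipping when the host exposes no /dev/nvidia* devices and
// otherwise requesting GPU-enabled run options. The test name and body are
// hypothetical; dockerutil's container-spawning helpers are not shown here.
package dockerutil_test

import (
	"testing"

	"gvisor.dev/gvisor/pkg/test/dockerutil"
)

func TestWithGPU(t *testing.T) {
	if dockerutil.NumGPU() == 0 {
		t.Skip("no /dev/nvidia* devices found on the host")
	}
	opts := dockerutil.GPURunOpts()
	_ = opts // pass to the package's container helpers when running the workload
}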