github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/sys/pci.go (about) 1 // Copyright 2023 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 package sys 16 17 import ( 18 "errors" 19 "fmt" 20 "path" 21 regex "regexp" 22 23 "golang.org/x/sys/unix" 24 "github.com/metacubex/gvisor/pkg/abi/linux" 25 "github.com/metacubex/gvisor/pkg/context" 26 "github.com/metacubex/gvisor/pkg/fsutil" 27 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/kernfs" 28 "github.com/metacubex/gvisor/pkg/sentry/kernel/auth" 29 ) 30 31 const ( 32 pciMainBusDevicePath = "/sys/devices/pci0000:00" 33 accelDevice = "accel" 34 vfioDevice = "vfio-dev" 35 ) 36 37 var ( 38 // Matches PCI device addresses in the main domain. 39 pciDeviceRegex = regex.MustCompile(`0000:([a-fA-F0-9]{2}|[a-fA-F0-9]{4}):[a-fA-F0-9]{2}\.[a-fA-F0-9]{1,2}`) 40 // Matches the directories for the main bus (i.e. pci000:00), 41 // individual devices (e.g. 00:00:04.0), accel (TPU v4), and vfio (TPU v5) 42 sysDevicesDirRegex = regex.MustCompile(`pci0000:00|accel|vfio|(0000:([a-fA-F0-9]{2}|[a-fA-F0-9]{4}):[a-fA-F0-9]{2}\.[a-fA-F0-9]{1,2})`) 43 // Files allowlisted for host passthrough. These files are read-only. 44 sysDevicesFiles = map[string]any{ 45 "vendor": nil, "device": nil, "subsystem_vendor": nil, "subsystem_device": nil, 46 "revision": nil, "class": nil, "numa_node": nil, 47 "resource": nil, "pci_address": nil, "dev": nil, "driver_version": nil, 48 "reset_count": nil, "write_open_count": nil, "status": nil, 49 "is_device_owned": nil, "device_owner": nil, "framework_version": nil, 50 "user_mem_ranges": nil, "interrupt_counts": nil, "chip_model": nil, 51 "bar_offsets": nil, "bar_sizes": nil, "resource0": nil, "resource1": nil, 52 "resource2": nil, "resource3": nil, "resource4": nil, "resource5": nil, 53 } 54 ) 55 56 // Creates TPU devices' symlinks under /sys/class/. TPU device types that are 57 // not present on host will be ignored. 58 // 59 // TPU v4 symlinks are created at /sys/class/accel/accel#. 60 // TPU v5 symlinks go to /sys/class/vfio-dev/vfio#. 61 func (fs *filesystem) newDeviceClassDir(ctx context.Context, creds *auth.Credentials, tpuDeviceTypes []string, pciMainBusDevicePath string) (map[string]map[string]kernfs.Inode, error) { 62 dirs := map[string]map[string]kernfs.Inode{} 63 pciDents, err := hostDirEntries(pciMainBusDevicePath) 64 if err != nil { 65 return nil, err 66 } 67 for _, pciDent := range pciDents { 68 for _, tpuDeviceType := range tpuDeviceTypes { 69 subPath := path.Join(pciMainBusDevicePath, pciDent, tpuDeviceType) 70 dirs[tpuDeviceType] = map[string]kernfs.Inode{} 71 deviceDents, err := hostDirEntries(subPath) 72 if err != nil { 73 // Skips the path that doesn't exist. 74 if err == unix.ENOENT { 75 continue 76 } 77 return nil, err 78 } 79 if numOfDeviceDents := len(deviceDents); numOfDeviceDents != 1 { 80 return nil, fmt.Errorf("exactly one entry is expected at %v while there are %d", subPath, numOfDeviceDents) 81 } 82 dirs[tpuDeviceType][deviceDents[0]] = kernfs.NewStaticSymlink(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), fmt.Sprintf("../../devices/pci0000:00/%s/%s/%s", pciDent, tpuDeviceType, deviceDents[0])) 83 } 84 } 85 if len(dirs) == 0 { 86 return nil, errors.New("no TPU device sysfile is found") 87 } 88 return dirs, nil 89 } 90 91 // Create /sys/bus/pci/devices symlinks. 92 func (fs *filesystem) newBusPCIDevicesDir(ctx context.Context, creds *auth.Credentials, pciMainBusDevicePath string) (map[string]kernfs.Inode, error) { 93 pciDevicesDir := map[string]kernfs.Inode{} 94 pciDents, err := hostDirEntries(pciMainBusDevicePath) 95 if err != nil { 96 return nil, err 97 } 98 for _, pciDent := range pciDents { 99 pciDevicesDir[pciDent] = kernfs.NewStaticSymlink(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), fmt.Sprintf("../../../devices/pci0000:00/%s", pciDent)) 100 } 101 102 return pciDevicesDir, nil 103 } 104 105 // Recursively build out sysfs directories according to the allowlisted files, 106 // directories, and symlinks defined in this package. 107 func (fs *filesystem) mirrorPCIBusDeviceDir(ctx context.Context, creds *auth.Credentials, dir string, iommuGroups map[string]string) (map[string]kernfs.Inode, error) { 108 subs := map[string]kernfs.Inode{} 109 dents, err := hostDirEntries(dir) 110 if err != nil { 111 return nil, err 112 } 113 for _, dent := range dents { 114 dentPath := path.Join(dir, dent) 115 dentMode, err := hostFileMode(dentPath) 116 if err != nil { 117 return nil, err 118 } 119 switch dentMode { 120 case unix.S_IFDIR: 121 if match := sysDevicesDirRegex.MatchString(dent); !match { 122 continue 123 } 124 contents, err := fs.mirrorPCIBusDeviceDir(ctx, creds, dentPath, iommuGroups) 125 if err != nil { 126 return nil, err 127 } 128 subs[dent] = fs.newDir(ctx, creds, defaultSysMode, contents) 129 case unix.S_IFREG: 130 if _, ok := sysDevicesFiles[dent]; ok { 131 subs[dent] = fs.newHostFile(ctx, creds, defaultSysMode, dentPath) 132 } 133 case unix.S_IFLNK: 134 linkContent := "" 135 switch { 136 case pciDeviceRegex.MatchString(dent) || dent == "device": 137 pciDeviceName, err := pciDeviceName(dir) 138 if err != nil { 139 return nil, err 140 } 141 // Both the device and PCI address entries are links to the original PCI 142 // device directory that's at the same place earlier in the dir tree. 143 linkContent = fmt.Sprintf("../../../%s", pciDeviceName) 144 case dent == "iommu_group": 145 pciDeviceName, err := pciDeviceName(dir) 146 if err != nil { 147 return nil, err 148 } 149 iommuGroupNum, exist := iommuGroups[pciDeviceName] 150 if !exist { 151 return nil, fmt.Errorf("no IOMMU group is found for device %v", pciDeviceName) 152 } 153 linkContent = fmt.Sprintf("../../../kernel/iommu_groups/%s", iommuGroupNum) 154 default: 155 continue 156 } 157 subs[dent] = kernfs.NewStaticSymlink(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linkContent) 158 } 159 } 160 return subs, nil 161 } 162 163 // Infer a PCI device's name from its path. 164 func pciDeviceName(pciDevicePath string) (string, error) { 165 pciDeviceName := pciDeviceRegex.FindString(pciDevicePath) 166 if pciDeviceName == "" { 167 return "", fmt.Errorf("no valid device name for the device path at %v", pciDevicePath) 168 } 169 return pciDeviceName, nil 170 } 171 172 func hostFileMode(path string) (uint32, error) { 173 fd, err := unix.Openat(-1, path, unix.O_RDONLY|unix.O_NOFOLLOW|unix.O_PATH, 0) 174 if err != nil { 175 return 0, err 176 } 177 stat := unix.Stat_t{} 178 if err := unix.Fstat(fd, &stat); err != nil { 179 return 0, err 180 } 181 return stat.Mode & unix.S_IFMT, nil 182 } 183 184 func hostDirEntries(path string) ([]string, error) { 185 fd, err := unix.Openat(-1, path, unix.O_RDONLY|unix.O_NOFOLLOW, 0) 186 if err != nil { 187 return nil, err 188 } 189 defer unix.Close(fd) 190 return fsutil.DirentNames(fd) 191 }