gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/fsimpl/sys/sys.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package sys implements sysfs. 16 package sys 17 18 import ( 19 "bytes" 20 "fmt" 21 "os" 22 "path" 23 "strconv" 24 25 "golang.org/x/sys/unix" 26 "gvisor.dev/gvisor/pkg/abi/linux" 27 "gvisor.dev/gvisor/pkg/context" 28 "gvisor.dev/gvisor/pkg/coverage" 29 "gvisor.dev/gvisor/pkg/errors/linuxerr" 30 "gvisor.dev/gvisor/pkg/log" 31 "gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs" 32 "gvisor.dev/gvisor/pkg/sentry/kernel" 33 "gvisor.dev/gvisor/pkg/sentry/kernel/auth" 34 "gvisor.dev/gvisor/pkg/sentry/vfs" 35 ) 36 37 const ( 38 // Name is the default filesystem name. 39 Name = "sysfs" 40 defaultSysMode = linux.FileMode(0444) 41 defaultSysDirMode = linux.FileMode(0755) 42 defaultMaxCachedDentries = uint64(1000) 43 iommuGroupSysPath = "/sys/kernel/iommu_groups/" 44 ) 45 46 // FilesystemType implements vfs.FilesystemType. 47 // 48 // +stateify savable 49 type FilesystemType struct{} 50 51 // InternalData contains internal data passed in via 52 // vfs.GetFilesystemOptions.InternalData. 53 // 54 // +stateify savable 55 type InternalData struct { 56 // ProductName is the value to be set to devices/virtual/dmi/id/product_name. 57 ProductName string 58 // EnableTPUProxyPaths is whether to populate sysfs paths used by hardware 59 // accelerators. 60 EnableTPUProxyPaths bool 61 // TestSysfsPathPrefix is a prefix for the sysfs paths. It is useful for 62 // unit testing. 63 TestSysfsPathPrefix string 64 } 65 66 // filesystem implements vfs.FilesystemImpl. 67 // 68 // +stateify savable 69 type filesystem struct { 70 kernfs.Filesystem 71 72 devMinor uint32 73 } 74 75 // Name implements vfs.FilesystemType.Name. 76 func (FilesystemType) Name() string { 77 return Name 78 } 79 80 // Release implements vfs.FilesystemType.Release. 81 func (FilesystemType) Release(ctx context.Context) {} 82 83 // GetFilesystem implements vfs.FilesystemType.GetFilesystem. 84 func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { 85 devMinor, err := vfsObj.GetAnonBlockDevMinor() 86 if err != nil { 87 return nil, nil, err 88 } 89 90 mopts := vfs.GenericParseMountOptions(opts.Data) 91 maxCachedDentries := defaultMaxCachedDentries 92 if str, ok := mopts["dentry_cache_limit"]; ok { 93 delete(mopts, "dentry_cache_limit") 94 maxCachedDentries, err = strconv.ParseUint(str, 10, 64) 95 if err != nil { 96 ctx.Warningf("sys.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) 97 return nil, nil, linuxerr.EINVAL 98 } 99 } 100 101 fs := &filesystem{ 102 devMinor: devMinor, 103 } 104 fs.MaxCachedDentries = maxCachedDentries 105 fs.VFSFilesystem().Init(vfsObj, &fsType, fs) 106 107 k := kernel.KernelFromContext(ctx) 108 fsDirChildren := make(map[string]kernfs.Inode) 109 // Create an empty directory to serve as the mount point for cgroupfs when 110 // cgroups are available. This emulates Linux behaviour, see 111 // kernel/cgroup.c:cgroup_init(). Note that in Linux, userspace (typically 112 // the init process) is ultimately responsible for actually mounting 113 // cgroupfs, but the kernel creates the mountpoint. For the sentry, the 114 // launcher mounts cgroupfs. 115 if k.CgroupRegistry() != nil { 116 fsDirChildren["cgroup"] = fs.newCgroupDir(ctx, creds, defaultSysDirMode, nil) 117 } 118 119 classSub := map[string]kernfs.Inode{ 120 "power_supply": fs.newDir(ctx, creds, defaultSysDirMode, nil), 121 } 122 devicesSub := map[string]kernfs.Inode{ 123 "system": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ 124 "cpu": cpuDir(ctx, fs, creds), 125 }), 126 } 127 128 productName := "" 129 busSub := make(map[string]kernfs.Inode) 130 kernelSub := kernelDir(ctx, fs, creds) 131 if opts.InternalData != nil { 132 idata := opts.InternalData.(*InternalData) 133 productName = idata.ProductName 134 if idata.EnableTPUProxyPaths { 135 deviceToIommuGroup, err := pciDeviceIOMMUGroups(path.Join(idata.TestSysfsPathPrefix, iommuGroupSysPath)) 136 if err != nil { 137 return nil, nil, err 138 } 139 pciPath := path.Join(idata.TestSysfsPathPrefix, pciMainBusDevicePath) 140 pciMainBusSub, err := fs.mirrorPCIBusDeviceDir(ctx, creds, pciPath, deviceToIommuGroup) 141 if err != nil { 142 return nil, nil, err 143 } 144 devicesSub["pci0000:00"] = fs.newDir(ctx, creds, defaultSysDirMode, pciMainBusSub) 145 146 deviceDirs, err := fs.newDeviceClassDir(ctx, creds, []string{accelDevice, vfioDevice}, pciPath) 147 if err != nil { 148 return nil, nil, err 149 } 150 151 for tpuDeviceType, symlinkDir := range deviceDirs { 152 classSub[tpuDeviceType] = fs.newDir(ctx, creds, defaultSysDirMode, symlinkDir) 153 } 154 pciDevicesSub, err := fs.newBusPCIDevicesDir(ctx, creds, pciPath) 155 if err != nil { 156 return nil, nil, err 157 } 158 busSub["pci"] = fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ 159 "devices": fs.newDir(ctx, creds, defaultSysDirMode, pciDevicesSub), 160 }) 161 iommuPath := path.Join(idata.TestSysfsPathPrefix, iommuGroupSysPath) 162 iommuGroups, err := fs.mirrorIOMMUGroups(ctx, creds, iommuPath) 163 if err != nil { 164 return nil, nil, err 165 } 166 kernelSub["iommu_groups"] = fs.newDir(ctx, creds, defaultSysDirMode, iommuGroups) 167 } 168 } 169 170 if len(productName) > 0 { 171 log.Debugf("Setting product_name: %q", productName) 172 classSub["dmi"] = fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ 173 "id": kernfs.NewStaticSymlink(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "../../devices/virtual/dmi/id"), 174 }) 175 devicesSub["virtual"] = fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ 176 "dmi": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ 177 "id": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ 178 "product_name": fs.newStaticFile(ctx, creds, defaultSysMode, productName+"\n"), 179 }), 180 }), 181 }) 182 } 183 root := fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ 184 "block": fs.newDir(ctx, creds, defaultSysDirMode, nil), 185 "bus": fs.newDir(ctx, creds, defaultSysDirMode, busSub), 186 "class": fs.newDir(ctx, creds, defaultSysDirMode, classSub), 187 "dev": fs.newDir(ctx, creds, defaultSysDirMode, nil), 188 "devices": fs.newDir(ctx, creds, defaultSysDirMode, devicesSub), 189 "firmware": fs.newDir(ctx, creds, defaultSysDirMode, nil), 190 "fs": fs.newDir(ctx, creds, defaultSysDirMode, fsDirChildren), 191 "kernel": fs.newDir(ctx, creds, defaultSysDirMode, kernelSub), 192 "module": fs.newDir(ctx, creds, defaultSysDirMode, nil), 193 "power": fs.newDir(ctx, creds, defaultSysDirMode, nil), 194 }) 195 var rootD kernfs.Dentry 196 rootD.InitRoot(&fs.Filesystem, root) 197 return fs.VFSFilesystem(), rootD.VFSDentry(), nil 198 } 199 200 func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) kernfs.Inode { 201 k := kernel.KernelFromContext(ctx) 202 maxCPUCores := k.ApplicationCores() 203 children := map[string]kernfs.Inode{ 204 "online": fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)), 205 "possible": fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)), 206 "present": fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)), 207 } 208 for i := uint(0); i < maxCPUCores; i++ { 209 children[fmt.Sprintf("cpu%d", i)] = fs.newDir(ctx, creds, linux.FileMode(0555), nil) 210 } 211 return fs.newDir(ctx, creds, defaultSysDirMode, children) 212 } 213 214 // Returns a map from a PCI device name to its IOMMU group if available. 215 func pciDeviceIOMMUGroups(iommuGroupsPath string) (map[string]string, error) { 216 // IOMMU groups are organized as iommu_group_path/$GROUP, where $GROUP is 217 // the IOMMU group number of which the device is a member. 218 iommuGroupNums, err := hostDirEntries(iommuGroupsPath) 219 if err != nil { 220 // When IOMMU is not enabled, skip the rest of the process. 221 if err == unix.ENOENT { 222 return nil, nil 223 } 224 return nil, err 225 } 226 // The returned map from PCI device name to its IOMMU group. 227 iommuGroups := map[string]string{} 228 for _, iommuGroupNum := range iommuGroupNums { 229 groupDevicesPath := path.Join(iommuGroupsPath, iommuGroupNum, "devices") 230 pciDeviceNames, err := hostDirEntries(groupDevicesPath) 231 if err != nil { 232 return nil, err 233 } 234 // An IOMMU group may include multiple devices. 235 for _, pciDeviceName := range pciDeviceNames { 236 iommuGroups[pciDeviceName] = iommuGroupNum 237 } 238 } 239 return iommuGroups, nil 240 } 241 242 func kernelDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) map[string]kernfs.Inode { 243 // Set up /sys/kernel/debug/kcov. Technically, debugfs should be 244 // mounted at debug/, but for our purposes, it is sufficient to keep it 245 // in sys. 246 children := make(map[string]kernfs.Inode) 247 if coverage.KcovSupported() { 248 log.Debugf("Set up /sys/kernel/debug/kcov") 249 children["debug"] = fs.newDir(ctx, creds, linux.FileMode(0700), map[string]kernfs.Inode{ 250 "kcov": fs.newKcovFile(ctx, creds), 251 }) 252 } 253 return children 254 } 255 256 // Recursively build out IOMMU directories from the host. 257 func (fs *filesystem) mirrorIOMMUGroups(ctx context.Context, creds *auth.Credentials, dir string) (map[string]kernfs.Inode, error) { 258 subs := map[string]kernfs.Inode{} 259 dents, err := hostDirEntries(dir) 260 if err != nil { 261 // TPU before v5 doesn't need IOMMU, skip the whole process for the backward compatibility when the directory can't be found. 262 if err == unix.ENOENT { 263 log.Debugf("Skip the path at %v which cannot be found.", dir) 264 return nil, nil 265 } 266 return nil, err 267 } 268 for _, dent := range dents { 269 absPath := path.Join(dir, dent) 270 mode, err := hostFileMode(absPath) 271 if err != nil { 272 return nil, err 273 } 274 switch mode { 275 case unix.S_IFDIR: 276 contents, err := fs.mirrorIOMMUGroups(ctx, creds, absPath) 277 if err != nil { 278 return nil, err 279 } 280 subs[dent] = fs.newDir(ctx, creds, defaultSysMode, contents) 281 case unix.S_IFREG: 282 subs[dent] = fs.newHostFile(ctx, creds, defaultSysMode, absPath) 283 case unix.S_IFLNK: 284 if pciDeviceRegex.MatchString(dent) { 285 subs[dent] = kernfs.NewStaticSymlink(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), fmt.Sprintf("../../../../devices/pci0000:00/%s", dent)) 286 } 287 } 288 } 289 return subs, nil 290 } 291 292 // Release implements vfs.FilesystemImpl.Release. 293 func (fs *filesystem) Release(ctx context.Context) { 294 fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) 295 fs.Filesystem.Release(ctx) 296 } 297 298 // MountOptions implements vfs.FilesystemImpl.MountOptions. 299 func (fs *filesystem) MountOptions() string { 300 return fmt.Sprintf("dentry_cache_limit=%d", fs.MaxCachedDentries) 301 } 302 303 // dir implements kernfs.Inode. 304 // 305 // +stateify savable 306 type dir struct { 307 dirRefs 308 kernfs.InodeAlwaysValid 309 kernfs.InodeAttrs 310 kernfs.InodeDirectoryNoNewChildren 311 kernfs.InodeNotAnonymous 312 kernfs.InodeNotSymlink 313 kernfs.InodeTemporary 314 kernfs.InodeWatches 315 kernfs.OrderedChildren 316 317 locks vfs.FileLocks 318 } 319 320 func (fs *filesystem) newDir(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode { 321 d := &dir{} 322 d.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) 323 d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) 324 d.InitRefs() 325 d.IncLinks(d.OrderedChildren.Populate(contents)) 326 return d 327 } 328 329 func (fs *filesystem) newCgroupDir(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode { 330 d := &cgroupDir{} 331 d.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) 332 d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) 333 d.InitRefs() 334 d.IncLinks(d.OrderedChildren.Populate(contents)) 335 return d 336 } 337 338 // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. 339 func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { 340 return linuxerr.EPERM 341 } 342 343 // Open implements kernfs.Inode.Open. 344 func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { 345 opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | 346 linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK | linux.O_NOCTTY 347 fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ 348 SeekEnd: kernfs.SeekEndStaticEntries, 349 }) 350 if err != nil { 351 return nil, err 352 } 353 return fd.VFSFileDescription(), nil 354 } 355 356 // DecRef implements kernfs.Inode.DecRef. 357 func (d *dir) DecRef(ctx context.Context) { 358 d.dirRefs.DecRef(func() { d.Destroy(ctx) }) 359 } 360 361 // StatFS implements kernfs.Inode.StatFS. 362 func (d *dir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { 363 return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil 364 } 365 366 // cgroupDir implements kernfs.Inode. 367 // 368 // +stateify savable 369 type cgroupDir struct { 370 dir 371 } 372 373 // StatFS implements kernfs.Inode.StatFS. 374 func (d *cgroupDir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { 375 return vfs.GenericStatFS(linux.TMPFS_MAGIC), nil 376 } 377 378 // cpuFile implements kernfs.Inode. 379 // 380 // +stateify savable 381 type cpuFile struct { 382 implStatFS 383 kernfs.DynamicBytesFile 384 385 maxCores uint 386 } 387 388 // Generate implements vfs.DynamicBytesSource.Generate. 389 func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error { 390 fmt.Fprintf(buf, "0-%d\n", c.maxCores-1) 391 return nil 392 } 393 394 func (fs *filesystem) newCPUFile(ctx context.Context, creds *auth.Credentials, maxCores uint, mode linux.FileMode) kernfs.Inode { 395 c := &cpuFile{maxCores: maxCores} 396 c.DynamicBytesFile.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), c, mode) 397 return c 398 } 399 400 // +stateify savable 401 type implStatFS struct{} 402 403 // StatFS implements kernfs.Inode.StatFS. 404 func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { 405 return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil 406 } 407 408 // +stateify savable 409 type staticFile struct { 410 kernfs.DynamicBytesFile 411 vfs.StaticData 412 } 413 414 func (fs *filesystem) newStaticFile(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, data string) kernfs.Inode { 415 s := &staticFile{StaticData: vfs.StaticData{Data: data}} 416 s.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), s, mode) 417 return s 418 } 419 420 // hostFile is an inode whose contents are generated by reading from the 421 // host. 422 // 423 // +stateify savable 424 type hostFile struct { 425 kernfs.DynamicBytesFile 426 hostPath string 427 } 428 429 func (hf *hostFile) Generate(ctx context.Context, buf *bytes.Buffer) error { 430 fd, err := unix.Openat(-1, hf.hostPath, unix.O_RDONLY|unix.O_NOFOLLOW, 0) 431 if err != nil { 432 return err 433 } 434 file := os.NewFile(uintptr(fd), hf.hostPath) 435 defer file.Close() 436 _, err = buf.ReadFrom(file) 437 return err 438 } 439 440 func (fs *filesystem) newHostFile(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, hostPath string) kernfs.Inode { 441 hf := &hostFile{hostPath: hostPath} 442 hf.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), hf, mode) 443 return hf 444 }