github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/sys/sys.go (about) 1 // Copyright 2019 The gVisor Authors. 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 // Package sys implements sysfs. 16 package sys 17 18 import ( 19 "bytes" 20 "fmt" 21 "os" 22 "path" 23 "strconv" 24 25 "golang.org/x/sys/unix" 26 "github.com/metacubex/gvisor/pkg/abi/linux" 27 "github.com/metacubex/gvisor/pkg/context" 28 "github.com/metacubex/gvisor/pkg/coverage" 29 "github.com/metacubex/gvisor/pkg/errors/linuxerr" 30 "github.com/metacubex/gvisor/pkg/log" 31 "github.com/metacubex/gvisor/pkg/sentry/fsimpl/kernfs" 32 "github.com/metacubex/gvisor/pkg/sentry/kernel" 33 "github.com/metacubex/gvisor/pkg/sentry/kernel/auth" 34 "github.com/metacubex/gvisor/pkg/sentry/vfs" 35 ) 36 37 const ( 38 // Name is the default filesystem name. 39 Name = "sysfs" 40 defaultSysMode = linux.FileMode(0444) 41 defaultSysDirMode = linux.FileMode(0755) 42 defaultMaxCachedDentries = uint64(1000) 43 iommuGroupSysPath = "/sys/kernel/iommu_groups/" 44 ) 45 46 // FilesystemType implements vfs.FilesystemType. 47 // 48 // +stateify savable 49 type FilesystemType struct{} 50 51 // InternalData contains internal data passed in via 52 // vfs.GetFilesystemOptions.InternalData. 53 // 54 // +stateify savable 55 type InternalData struct { 56 // ProductName is the value to be set to devices/virtual/dmi/id/product_name. 57 ProductName string 58 // EnableTPUProxyPaths is whether to populate sysfs paths used by hardware 59 // accelerators. 60 EnableTPUProxyPaths bool 61 // TestSysfsPathPrefix is a prefix for the sysfs paths. It is useful for 62 // unit testing. 63 TestSysfsPathPrefix string 64 } 65 66 // filesystem implements vfs.FilesystemImpl. 67 // 68 // +stateify savable 69 type filesystem struct { 70 kernfs.Filesystem 71 72 devMinor uint32 73 } 74 75 // Name implements vfs.FilesystemType.Name. 76 func (FilesystemType) Name() string { 77 return Name 78 } 79 80 // Release implements vfs.FilesystemType.Release. 81 func (FilesystemType) Release(ctx context.Context) {} 82 83 // GetFilesystem implements vfs.FilesystemType.GetFilesystem. 84 func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { 85 devMinor, err := vfsObj.GetAnonBlockDevMinor() 86 if err != nil { 87 return nil, nil, err 88 } 89 90 mopts := vfs.GenericParseMountOptions(opts.Data) 91 maxCachedDentries := defaultMaxCachedDentries 92 if str, ok := mopts["dentry_cache_limit"]; ok { 93 delete(mopts, "dentry_cache_limit") 94 maxCachedDentries, err = strconv.ParseUint(str, 10, 64) 95 if err != nil { 96 ctx.Warningf("sys.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str) 97 return nil, nil, linuxerr.EINVAL 98 } 99 } 100 101 fs := &filesystem{ 102 devMinor: devMinor, 103 } 104 fs.MaxCachedDentries = maxCachedDentries 105 fs.VFSFilesystem().Init(vfsObj, &fsType, fs) 106 107 k := kernel.KernelFromContext(ctx) 108 fsDirChildren := make(map[string]kernfs.Inode) 109 // Create an empty directory to serve as the mount point for cgroupfs when 110 // cgroups are available. This emulates Linux behaviour, see 111 // kernel/cgroup.c:cgroup_init(). Note that in Linux, userspace (typically 112 // the init process) is ultimately responsible for actually mounting 113 // cgroupfs, but the kernel creates the mountpoint. For the sentry, the 114 // launcher mounts cgroupfs. 115 if k.CgroupRegistry() != nil { 116 fsDirChildren["cgroup"] = fs.newCgroupDir(ctx, creds, defaultSysDirMode, nil) 117 } 118 119 classSub := map[string]kernfs.Inode{ 120 "power_supply": fs.newDir(ctx, creds, defaultSysDirMode, nil), 121 "net": fs.newDir(ctx, creds, defaultSysDirMode, fs.newNetDir(ctx, creds, defaultSysDirMode)), 122 } 123 devicesSub := map[string]kernfs.Inode{ 124 "system": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ 125 "cpu": cpuDir(ctx, fs, creds), 126 }), 127 } 128 129 productName := "" 130 busSub := make(map[string]kernfs.Inode) 131 kernelSub := kernelDir(ctx, fs, creds) 132 if opts.InternalData != nil { 133 idata := opts.InternalData.(*InternalData) 134 productName = idata.ProductName 135 if idata.EnableTPUProxyPaths { 136 deviceToIommuGroup, err := pciDeviceIOMMUGroups(path.Join(idata.TestSysfsPathPrefix, iommuGroupSysPath)) 137 if err != nil { 138 return nil, nil, err 139 } 140 pciPath := path.Join(idata.TestSysfsPathPrefix, pciMainBusDevicePath) 141 pciMainBusSub, err := fs.mirrorPCIBusDeviceDir(ctx, creds, pciPath, deviceToIommuGroup) 142 if err != nil { 143 return nil, nil, err 144 } 145 devicesSub["pci0000:00"] = fs.newDir(ctx, creds, defaultSysDirMode, pciMainBusSub) 146 147 deviceDirs, err := fs.newDeviceClassDir(ctx, creds, []string{accelDevice, vfioDevice}, pciPath) 148 if err != nil { 149 return nil, nil, err 150 } 151 152 for tpuDeviceType, symlinkDir := range deviceDirs { 153 classSub[tpuDeviceType] = fs.newDir(ctx, creds, defaultSysDirMode, symlinkDir) 154 } 155 pciDevicesSub, err := fs.newBusPCIDevicesDir(ctx, creds, pciPath) 156 if err != nil { 157 return nil, nil, err 158 } 159 busSub["pci"] = fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ 160 "devices": fs.newDir(ctx, creds, defaultSysDirMode, pciDevicesSub), 161 }) 162 iommuPath := path.Join(idata.TestSysfsPathPrefix, iommuGroupSysPath) 163 iommuGroups, err := fs.mirrorIOMMUGroups(ctx, creds, iommuPath) 164 if err != nil { 165 return nil, nil, err 166 } 167 kernelSub["iommu_groups"] = fs.newDir(ctx, creds, defaultSysDirMode, iommuGroups) 168 } 169 } 170 171 if len(productName) > 0 { 172 log.Debugf("Setting product_name: %q", productName) 173 classSub["dmi"] = fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ 174 "id": kernfs.NewStaticSymlink(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "../../devices/virtual/dmi/id"), 175 }) 176 devicesSub["virtual"] = fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ 177 "dmi": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ 178 "id": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ 179 "product_name": fs.newStaticFile(ctx, creds, defaultSysMode, productName+"\n"), 180 }), 181 }), 182 }) 183 } 184 root := fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{ 185 "block": fs.newDir(ctx, creds, defaultSysDirMode, nil), 186 "bus": fs.newDir(ctx, creds, defaultSysDirMode, busSub), 187 "class": fs.newDir(ctx, creds, defaultSysDirMode, classSub), 188 "dev": fs.newDir(ctx, creds, defaultSysDirMode, nil), 189 "devices": fs.newDir(ctx, creds, defaultSysDirMode, devicesSub), 190 "firmware": fs.newDir(ctx, creds, defaultSysDirMode, nil), 191 "fs": fs.newDir(ctx, creds, defaultSysDirMode, fsDirChildren), 192 "kernel": fs.newDir(ctx, creds, defaultSysDirMode, kernelSub), 193 "module": fs.newDir(ctx, creds, defaultSysDirMode, nil), 194 "power": fs.newDir(ctx, creds, defaultSysDirMode, nil), 195 }) 196 var rootD kernfs.Dentry 197 rootD.InitRoot(&fs.Filesystem, root) 198 return fs.VFSFilesystem(), rootD.VFSDentry(), nil 199 } 200 201 func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) kernfs.Inode { 202 k := kernel.KernelFromContext(ctx) 203 maxCPUCores := k.ApplicationCores() 204 children := map[string]kernfs.Inode{ 205 "online": fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)), 206 "possible": fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)), 207 "present": fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)), 208 } 209 for i := uint(0); i < maxCPUCores; i++ { 210 children[fmt.Sprintf("cpu%d", i)] = fs.newDir(ctx, creds, linux.FileMode(0555), nil) 211 } 212 return fs.newDir(ctx, creds, defaultSysDirMode, children) 213 } 214 215 // Returns a map from a PCI device name to its IOMMU group if available. 216 func pciDeviceIOMMUGroups(iommuGroupsPath string) (map[string]string, error) { 217 // IOMMU groups are organized as iommu_group_path/$GROUP, where $GROUP is 218 // the IOMMU group number of which the device is a member. 219 iommuGroupNums, err := hostDirEntries(iommuGroupsPath) 220 if err != nil { 221 // When IOMMU is not enabled, skip the rest of the process. 222 if err == unix.ENOENT { 223 return nil, nil 224 } 225 return nil, err 226 } 227 // The returned map from PCI device name to its IOMMU group. 228 iommuGroups := map[string]string{} 229 for _, iommuGroupNum := range iommuGroupNums { 230 groupDevicesPath := path.Join(iommuGroupsPath, iommuGroupNum, "devices") 231 pciDeviceNames, err := hostDirEntries(groupDevicesPath) 232 if err != nil { 233 return nil, err 234 } 235 // An IOMMU group may include multiple devices. 236 for _, pciDeviceName := range pciDeviceNames { 237 iommuGroups[pciDeviceName] = iommuGroupNum 238 } 239 } 240 return iommuGroups, nil 241 } 242 243 func kernelDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) map[string]kernfs.Inode { 244 // Set up /sys/kernel/debug/kcov. Technically, debugfs should be 245 // mounted at debug/, but for our purposes, it is sufficient to keep it 246 // in sys. 247 children := make(map[string]kernfs.Inode) 248 if coverage.KcovSupported() { 249 log.Debugf("Set up /sys/kernel/debug/kcov") 250 children["debug"] = fs.newDir(ctx, creds, linux.FileMode(0700), map[string]kernfs.Inode{ 251 "kcov": fs.newKcovFile(ctx, creds), 252 }) 253 } 254 return children 255 } 256 257 // Recursively build out IOMMU directories from the host. 258 func (fs *filesystem) mirrorIOMMUGroups(ctx context.Context, creds *auth.Credentials, dir string) (map[string]kernfs.Inode, error) { 259 subs := map[string]kernfs.Inode{} 260 dents, err := hostDirEntries(dir) 261 if err != nil { 262 // TPU before v5 doesn't need IOMMU, skip the whole process for the backward compatibility when the directory can't be found. 263 if err == unix.ENOENT { 264 log.Debugf("Skip the path at %v which cannot be found.", dir) 265 return nil, nil 266 } 267 return nil, err 268 } 269 for _, dent := range dents { 270 absPath := path.Join(dir, dent) 271 mode, err := hostFileMode(absPath) 272 if err != nil { 273 return nil, err 274 } 275 switch mode { 276 case unix.S_IFDIR: 277 contents, err := fs.mirrorIOMMUGroups(ctx, creds, absPath) 278 if err != nil { 279 return nil, err 280 } 281 subs[dent] = fs.newDir(ctx, creds, defaultSysMode, contents) 282 case unix.S_IFREG: 283 subs[dent] = fs.newHostFile(ctx, creds, defaultSysMode, absPath) 284 case unix.S_IFLNK: 285 if pciDeviceRegex.MatchString(dent) { 286 subs[dent] = kernfs.NewStaticSymlink(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), fmt.Sprintf("../../../../devices/pci0000:00/%s", dent)) 287 } 288 } 289 } 290 return subs, nil 291 } 292 293 // Release implements vfs.FilesystemImpl.Release. 294 func (fs *filesystem) Release(ctx context.Context) { 295 fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor) 296 fs.Filesystem.Release(ctx) 297 } 298 299 // MountOptions implements vfs.FilesystemImpl.MountOptions. 300 func (fs *filesystem) MountOptions() string { 301 return fmt.Sprintf("dentry_cache_limit=%d", fs.MaxCachedDentries) 302 } 303 304 // dir implements kernfs.Inode. 305 // 306 // +stateify savable 307 type dir struct { 308 dirRefs 309 kernfs.InodeAlwaysValid 310 kernfs.InodeAttrs 311 kernfs.InodeDirectoryNoNewChildren 312 kernfs.InodeNotAnonymous 313 kernfs.InodeNotSymlink 314 kernfs.InodeTemporary 315 kernfs.InodeWatches 316 kernfs.OrderedChildren 317 318 locks vfs.FileLocks 319 } 320 321 func (fs *filesystem) newDir(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode { 322 d := &dir{} 323 d.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) 324 d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) 325 d.InitRefs() 326 d.IncLinks(d.OrderedChildren.Populate(contents)) 327 return d 328 } 329 330 func (fs *filesystem) newCgroupDir(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode { 331 d := &cgroupDir{} 332 d.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755) 333 d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{}) 334 d.InitRefs() 335 d.IncLinks(d.OrderedChildren.Populate(contents)) 336 return d 337 } 338 339 // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed. 340 func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error { 341 return linuxerr.EPERM 342 } 343 344 // Open implements kernfs.Inode.Open. 345 func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) { 346 opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | 347 linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK | linux.O_NOCTTY 348 fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{ 349 SeekEnd: kernfs.SeekEndStaticEntries, 350 }) 351 if err != nil { 352 return nil, err 353 } 354 return fd.VFSFileDescription(), nil 355 } 356 357 // DecRef implements kernfs.Inode.DecRef. 358 func (d *dir) DecRef(ctx context.Context) { 359 d.dirRefs.DecRef(func() { d.Destroy(ctx) }) 360 } 361 362 // StatFS implements kernfs.Inode.StatFS. 363 func (d *dir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { 364 return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil 365 } 366 367 // cgroupDir implements kernfs.Inode. 368 // 369 // +stateify savable 370 type cgroupDir struct { 371 dir 372 } 373 374 // StatFS implements kernfs.Inode.StatFS. 375 func (d *cgroupDir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) { 376 return vfs.GenericStatFS(linux.TMPFS_MAGIC), nil 377 } 378 379 // cpuFile implements kernfs.Inode. 380 // 381 // +stateify savable 382 type cpuFile struct { 383 implStatFS 384 kernfs.DynamicBytesFile 385 386 maxCores uint 387 } 388 389 // Generate implements vfs.DynamicBytesSource.Generate. 390 func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error { 391 fmt.Fprintf(buf, "0-%d\n", c.maxCores-1) 392 return nil 393 } 394 395 func (fs *filesystem) newCPUFile(ctx context.Context, creds *auth.Credentials, maxCores uint, mode linux.FileMode) kernfs.Inode { 396 c := &cpuFile{maxCores: maxCores} 397 c.DynamicBytesFile.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), c, mode) 398 return c 399 } 400 401 // +stateify savable 402 type implStatFS struct{} 403 404 // StatFS implements kernfs.Inode.StatFS. 405 func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) { 406 return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil 407 } 408 409 // +stateify savable 410 type staticFile struct { 411 kernfs.DynamicBytesFile 412 vfs.StaticData 413 } 414 415 func (fs *filesystem) newStaticFile(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, data string) kernfs.Inode { 416 s := &staticFile{StaticData: vfs.StaticData{Data: data}} 417 s.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), s, mode) 418 return s 419 } 420 421 // hostFile is an inode whose contents are generated by reading from the 422 // host. 423 // 424 // +stateify savable 425 type hostFile struct { 426 kernfs.DynamicBytesFile 427 hostPath string 428 } 429 430 func (hf *hostFile) Generate(ctx context.Context, buf *bytes.Buffer) error { 431 fd, err := unix.Openat(-1, hf.hostPath, unix.O_RDONLY|unix.O_NOFOLLOW, 0) 432 if err != nil { 433 return err 434 } 435 file := os.NewFile(uintptr(fd), hf.hostPath) 436 defer file.Close() 437 _, err = buf.ReadFrom(file) 438 return err 439 } 440 441 func (fs *filesystem) newHostFile(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, hostPath string) kernfs.Inode { 442 hf := &hostFile{hostPath: hostPath} 443 hf.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), hf, mode) 444 return hf 445 }