github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/sys/sys.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package sys implements sysfs.
    16  package sys
    17  
    18  import (
    19  	"bytes"
    20  	"fmt"
    21  	"os"
    22  	"path"
    23  	"strconv"
    24  
    25  	"golang.org/x/sys/unix"
    26  	"github.com/metacubex/gvisor/pkg/abi/linux"
    27  	"github.com/metacubex/gvisor/pkg/context"
    28  	"github.com/metacubex/gvisor/pkg/coverage"
    29  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    30  	"github.com/metacubex/gvisor/pkg/log"
    31  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/kernfs"
    32  	"github.com/metacubex/gvisor/pkg/sentry/kernel"
    33  	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
    34  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    35  )
    36  
    37  const (
    38  	// Name is the default filesystem name.
    39  	Name                     = "sysfs"
    40  	defaultSysMode           = linux.FileMode(0444)
    41  	defaultSysDirMode        = linux.FileMode(0755)
    42  	defaultMaxCachedDentries = uint64(1000)
    43  	iommuGroupSysPath        = "/sys/kernel/iommu_groups/"
    44  )
    45  
// FilesystemType implements vfs.FilesystemType for sysfs. The type itself
// holds no state; per-mount state lives in the filesystem value built by
// GetFilesystem.
//
// +stateify savable
type FilesystemType struct{}
    50  
// InternalData contains internal data passed in via
// vfs.GetFilesystemOptions.InternalData. When that field is non-nil,
// GetFilesystem type-asserts it to *InternalData.
//
// +stateify savable
type InternalData struct {
	// ProductName is the value to be set to devices/virtual/dmi/id/product_name.
	// When it is empty, the dmi entries are not created.
	ProductName string
	// EnableTPUProxyPaths is whether to populate sysfs paths used by hardware
	// accelerators (PCI bus/device directories and IOMMU groups).
	EnableTPUProxyPaths bool
	// TestSysfsPathPrefix is a prefix for the sysfs paths. It is useful for
	// unit testing. It is joined in front of the PCI and IOMMU paths that are
	// read when EnableTPUProxyPaths is set.
	TestSysfsPathPrefix string
}
    65  
// filesystem implements vfs.FilesystemImpl on top of kernfs.Filesystem.
//
// +stateify savable
type filesystem struct {
	kernfs.Filesystem

	// devMinor is the anonymous block device minor number obtained in
	// GetFilesystem. It is used when initializing every inode created by this
	// filesystem and is handed back to the VirtualFilesystem in Release.
	devMinor uint32
}
    74  
// Name implements vfs.FilesystemType.Name. It returns the package-level Name
// constant ("sysfs").
func (FilesystemType) Name() string {
	return Name
}
    79  
// Release implements vfs.FilesystemType.Release. It is a no-op because
// FilesystemType holds no resources.
func (FilesystemType) Release(ctx context.Context) {}
    82  
    83  // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
    84  func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
    85  	devMinor, err := vfsObj.GetAnonBlockDevMinor()
    86  	if err != nil {
    87  		return nil, nil, err
    88  	}
    89  
    90  	mopts := vfs.GenericParseMountOptions(opts.Data)
    91  	maxCachedDentries := defaultMaxCachedDentries
    92  	if str, ok := mopts["dentry_cache_limit"]; ok {
    93  		delete(mopts, "dentry_cache_limit")
    94  		maxCachedDentries, err = strconv.ParseUint(str, 10, 64)
    95  		if err != nil {
    96  			ctx.Warningf("sys.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str)
    97  			return nil, nil, linuxerr.EINVAL
    98  		}
    99  	}
   100  
   101  	fs := &filesystem{
   102  		devMinor: devMinor,
   103  	}
   104  	fs.MaxCachedDentries = maxCachedDentries
   105  	fs.VFSFilesystem().Init(vfsObj, &fsType, fs)
   106  
   107  	k := kernel.KernelFromContext(ctx)
   108  	fsDirChildren := make(map[string]kernfs.Inode)
   109  	// Create an empty directory to serve as the mount point for cgroupfs when
   110  	// cgroups are available. This emulates Linux behaviour, see
   111  	// kernel/cgroup.c:cgroup_init(). Note that in Linux, userspace (typically
   112  	// the init process) is ultimately responsible for actually mounting
   113  	// cgroupfs, but the kernel creates the mountpoint. For the sentry, the
   114  	// launcher mounts cgroupfs.
   115  	if k.CgroupRegistry() != nil {
   116  		fsDirChildren["cgroup"] = fs.newCgroupDir(ctx, creds, defaultSysDirMode, nil)
   117  	}
   118  
   119  	classSub := map[string]kernfs.Inode{
   120  		"power_supply": fs.newDir(ctx, creds, defaultSysDirMode, nil),
   121  		"net":          fs.newDir(ctx, creds, defaultSysDirMode, fs.newNetDir(ctx, creds, defaultSysDirMode)),
   122  	}
   123  	devicesSub := map[string]kernfs.Inode{
   124  		"system": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
   125  			"cpu": cpuDir(ctx, fs, creds),
   126  		}),
   127  	}
   128  
   129  	productName := ""
   130  	busSub := make(map[string]kernfs.Inode)
   131  	kernelSub := kernelDir(ctx, fs, creds)
   132  	if opts.InternalData != nil {
   133  		idata := opts.InternalData.(*InternalData)
   134  		productName = idata.ProductName
   135  		if idata.EnableTPUProxyPaths {
   136  			deviceToIommuGroup, err := pciDeviceIOMMUGroups(path.Join(idata.TestSysfsPathPrefix, iommuGroupSysPath))
   137  			if err != nil {
   138  				return nil, nil, err
   139  			}
   140  			pciPath := path.Join(idata.TestSysfsPathPrefix, pciMainBusDevicePath)
   141  			pciMainBusSub, err := fs.mirrorPCIBusDeviceDir(ctx, creds, pciPath, deviceToIommuGroup)
   142  			if err != nil {
   143  				return nil, nil, err
   144  			}
   145  			devicesSub["pci0000:00"] = fs.newDir(ctx, creds, defaultSysDirMode, pciMainBusSub)
   146  
   147  			deviceDirs, err := fs.newDeviceClassDir(ctx, creds, []string{accelDevice, vfioDevice}, pciPath)
   148  			if err != nil {
   149  				return nil, nil, err
   150  			}
   151  
   152  			for tpuDeviceType, symlinkDir := range deviceDirs {
   153  				classSub[tpuDeviceType] = fs.newDir(ctx, creds, defaultSysDirMode, symlinkDir)
   154  			}
   155  			pciDevicesSub, err := fs.newBusPCIDevicesDir(ctx, creds, pciPath)
   156  			if err != nil {
   157  				return nil, nil, err
   158  			}
   159  			busSub["pci"] = fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
   160  				"devices": fs.newDir(ctx, creds, defaultSysDirMode, pciDevicesSub),
   161  			})
   162  			iommuPath := path.Join(idata.TestSysfsPathPrefix, iommuGroupSysPath)
   163  			iommuGroups, err := fs.mirrorIOMMUGroups(ctx, creds, iommuPath)
   164  			if err != nil {
   165  				return nil, nil, err
   166  			}
   167  			kernelSub["iommu_groups"] = fs.newDir(ctx, creds, defaultSysDirMode, iommuGroups)
   168  		}
   169  	}
   170  
   171  	if len(productName) > 0 {
   172  		log.Debugf("Setting product_name: %q", productName)
   173  		classSub["dmi"] = fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
   174  			"id": kernfs.NewStaticSymlink(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "../../devices/virtual/dmi/id"),
   175  		})
   176  		devicesSub["virtual"] = fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
   177  			"dmi": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
   178  				"id": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
   179  					"product_name": fs.newStaticFile(ctx, creds, defaultSysMode, productName+"\n"),
   180  				}),
   181  			}),
   182  		})
   183  	}
   184  	root := fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
   185  		"block":    fs.newDir(ctx, creds, defaultSysDirMode, nil),
   186  		"bus":      fs.newDir(ctx, creds, defaultSysDirMode, busSub),
   187  		"class":    fs.newDir(ctx, creds, defaultSysDirMode, classSub),
   188  		"dev":      fs.newDir(ctx, creds, defaultSysDirMode, nil),
   189  		"devices":  fs.newDir(ctx, creds, defaultSysDirMode, devicesSub),
   190  		"firmware": fs.newDir(ctx, creds, defaultSysDirMode, nil),
   191  		"fs":       fs.newDir(ctx, creds, defaultSysDirMode, fsDirChildren),
   192  		"kernel":   fs.newDir(ctx, creds, defaultSysDirMode, kernelSub),
   193  		"module":   fs.newDir(ctx, creds, defaultSysDirMode, nil),
   194  		"power":    fs.newDir(ctx, creds, defaultSysDirMode, nil),
   195  	})
   196  	var rootD kernfs.Dentry
   197  	rootD.InitRoot(&fs.Filesystem, root)
   198  	return fs.VFSFilesystem(), rootD.VFSDentry(), nil
   199  }
   200  
   201  func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) kernfs.Inode {
   202  	k := kernel.KernelFromContext(ctx)
   203  	maxCPUCores := k.ApplicationCores()
   204  	children := map[string]kernfs.Inode{
   205  		"online":   fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)),
   206  		"possible": fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)),
   207  		"present":  fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)),
   208  	}
   209  	for i := uint(0); i < maxCPUCores; i++ {
   210  		children[fmt.Sprintf("cpu%d", i)] = fs.newDir(ctx, creds, linux.FileMode(0555), nil)
   211  	}
   212  	return fs.newDir(ctx, creds, defaultSysDirMode, children)
   213  }
   214  
   215  // Returns a map from a PCI device name to its IOMMU group if available.
   216  func pciDeviceIOMMUGroups(iommuGroupsPath string) (map[string]string, error) {
   217  	// IOMMU groups are organized as iommu_group_path/$GROUP, where $GROUP is
   218  	// the IOMMU group number of which the device is a member.
   219  	iommuGroupNums, err := hostDirEntries(iommuGroupsPath)
   220  	if err != nil {
   221  		// When IOMMU is not enabled, skip the rest of the process.
   222  		if err == unix.ENOENT {
   223  			return nil, nil
   224  		}
   225  		return nil, err
   226  	}
   227  	// The returned map from PCI device name to its IOMMU group.
   228  	iommuGroups := map[string]string{}
   229  	for _, iommuGroupNum := range iommuGroupNums {
   230  		groupDevicesPath := path.Join(iommuGroupsPath, iommuGroupNum, "devices")
   231  		pciDeviceNames, err := hostDirEntries(groupDevicesPath)
   232  		if err != nil {
   233  			return nil, err
   234  		}
   235  		// An IOMMU group may include multiple devices.
   236  		for _, pciDeviceName := range pciDeviceNames {
   237  			iommuGroups[pciDeviceName] = iommuGroupNum
   238  		}
   239  	}
   240  	return iommuGroups, nil
   241  }
   242  
   243  func kernelDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) map[string]kernfs.Inode {
   244  	// Set up /sys/kernel/debug/kcov. Technically, debugfs should be
   245  	// mounted at debug/, but for our purposes, it is sufficient to keep it
   246  	// in sys.
   247  	children := make(map[string]kernfs.Inode)
   248  	if coverage.KcovSupported() {
   249  		log.Debugf("Set up /sys/kernel/debug/kcov")
   250  		children["debug"] = fs.newDir(ctx, creds, linux.FileMode(0700), map[string]kernfs.Inode{
   251  			"kcov": fs.newKcovFile(ctx, creds),
   252  		})
   253  	}
   254  	return children
   255  }
   256  
   257  // Recursively build out IOMMU directories from the host.
   258  func (fs *filesystem) mirrorIOMMUGroups(ctx context.Context, creds *auth.Credentials, dir string) (map[string]kernfs.Inode, error) {
   259  	subs := map[string]kernfs.Inode{}
   260  	dents, err := hostDirEntries(dir)
   261  	if err != nil {
   262  		// TPU before v5 doesn't need IOMMU, skip the whole process for the backward compatibility when the directory can't be found.
   263  		if err == unix.ENOENT {
   264  			log.Debugf("Skip the path at %v which cannot be found.", dir)
   265  			return nil, nil
   266  		}
   267  		return nil, err
   268  	}
   269  	for _, dent := range dents {
   270  		absPath := path.Join(dir, dent)
   271  		mode, err := hostFileMode(absPath)
   272  		if err != nil {
   273  			return nil, err
   274  		}
   275  		switch mode {
   276  		case unix.S_IFDIR:
   277  			contents, err := fs.mirrorIOMMUGroups(ctx, creds, absPath)
   278  			if err != nil {
   279  				return nil, err
   280  			}
   281  			subs[dent] = fs.newDir(ctx, creds, defaultSysMode, contents)
   282  		case unix.S_IFREG:
   283  			subs[dent] = fs.newHostFile(ctx, creds, defaultSysMode, absPath)
   284  		case unix.S_IFLNK:
   285  			if pciDeviceRegex.MatchString(dent) {
   286  				subs[dent] = kernfs.NewStaticSymlink(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), fmt.Sprintf("../../../../devices/pci0000:00/%s", dent))
   287  			}
   288  		}
   289  	}
   290  	return subs, nil
   291  }
   292  
// Release implements vfs.FilesystemImpl.Release. It returns the anonymous
// block device minor acquired in GetFilesystem to the VirtualFilesystem and
// then releases the embedded kernfs.Filesystem.
func (fs *filesystem) Release(ctx context.Context) {
	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
	fs.Filesystem.Release(ctx)
}
   298  
   299  // MountOptions implements vfs.FilesystemImpl.MountOptions.
   300  func (fs *filesystem) MountOptions() string {
   301  	return fmt.Sprintf("dentry_cache_limit=%d", fs.MaxCachedDentries)
   302  }
   303  
// dir implements kernfs.Inode for sysfs directories. Its children are fixed
// at creation time (see newDir), and attribute changes are rejected (see
// SetStat).
//
// +stateify savable
type dir struct {
	dirRefs
	kernfs.InodeAlwaysValid
	kernfs.InodeAttrs
	kernfs.InodeDirectoryNoNewChildren
	kernfs.InodeNotAnonymous
	kernfs.InodeNotSymlink
	kernfs.InodeTemporary
	kernfs.InodeWatches
	kernfs.OrderedChildren

	// locks is passed to the directory file descriptions created in Open.
	locks vfs.FileLocks
}
   320  
   321  func (fs *filesystem) newDir(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode {
   322  	d := &dir{}
   323  	d.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
   324  	d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
   325  	d.InitRefs()
   326  	d.IncLinks(d.OrderedChildren.Populate(contents))
   327  	return d
   328  }
   329  
   330  func (fs *filesystem) newCgroupDir(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode {
   331  	d := &cgroupDir{}
   332  	d.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
   333  	d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
   334  	d.InitRefs()
   335  	d.IncLinks(d.OrderedChildren.Populate(contents))
   336  	return d
   337  }
   338  
// SetStat implements kernfs.Inode.SetStat. Inode attributes may not be
// changed, so every request is rejected with EPERM.
func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
	return linuxerr.EPERM
}
   343  
   344  // Open implements kernfs.Inode.Open.
   345  func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
   346  	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC |
   347  		linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK | linux.O_NOCTTY
   348  	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
   349  		SeekEnd: kernfs.SeekEndStaticEntries,
   350  	})
   351  	if err != nil {
   352  		return nil, err
   353  	}
   354  	return fd.VFSFileDescription(), nil
   355  }
   356  
// DecRef implements kernfs.Inode.DecRef. It forwards to dirRefs.DecRef,
// supplying d.Destroy(ctx) as the callback.
func (d *dir) DecRef(ctx context.Context) {
	d.dirRefs.DecRef(func() { d.Destroy(ctx) })
}
   361  
// StatFS implements kernfs.Inode.StatFS. It reports generic filesystem
// statistics tagged with the sysfs magic number.
func (d *dir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
	return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil
}
   366  
// cgroupDir implements kernfs.Inode. It is a dir whose StatFS is overridden;
// GetFilesystem uses it for the fs/cgroup directory that serves as the
// cgroupfs mount point.
//
// +stateify savable
type cgroupDir struct {
	dir
}
   373  
// StatFS implements kernfs.Inode.StatFS. Unlike dir.StatFS, it reports the
// tmpfs magic number rather than the sysfs one.
//
// NOTE(review): presumably this mirrors Linux, where /sys/fs/cgroup is
// typically a tmpfs mount — confirm.
func (d *cgroupDir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
	return vfs.GenericStatFS(linux.TMPFS_MAGIC), nil
}
   378  
// cpuFile implements kernfs.Inode. Its contents are a CPU range generated on
// each read by Generate.
//
// +stateify savable
type cpuFile struct {
	implStatFS
	kernfs.DynamicBytesFile

	// maxCores is the number of cores advertised by this file; Generate
	// reports the range 0 through maxCores-1.
	maxCores uint
}
   388  
   389  // Generate implements vfs.DynamicBytesSource.Generate.
   390  func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error {
   391  	fmt.Fprintf(buf, "0-%d\n", c.maxCores-1)
   392  	return nil
   393  }
   394  
   395  func (fs *filesystem) newCPUFile(ctx context.Context, creds *auth.Credentials, maxCores uint, mode linux.FileMode) kernfs.Inode {
   396  	c := &cpuFile{maxCores: maxCores}
   397  	c.DynamicBytesFile.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), c, mode)
   398  	return c
   399  }
   400  
// implStatFS provides a StatFS method reporting the sysfs magic number. It
// is meant to be embedded in inode types (e.g. cpuFile).
//
// +stateify savable
type implStatFS struct{}
   403  
// StatFS implements kernfs.Inode.StatFS. It reports generic filesystem
// statistics tagged with the sysfs magic number.
func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
	return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil
}
   408  
// staticFile is an inode whose data source is the embedded vfs.StaticData,
// which holds the string supplied to newStaticFile.
//
// +stateify savable
type staticFile struct {
	kernfs.DynamicBytesFile
	vfs.StaticData
}
   414  
   415  func (fs *filesystem) newStaticFile(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, data string) kernfs.Inode {
   416  	s := &staticFile{StaticData: vfs.StaticData{Data: data}}
   417  	s.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), s, mode)
   418  	return s
   419  }
   420  
// hostFile is an inode whose contents are generated by reading from the
// host.
//
// +stateify savable
type hostFile struct {
	kernfs.DynamicBytesFile
	// hostPath is the host path that Generate opens and reads on every call.
	hostPath string
}
   429  
   430  func (hf *hostFile) Generate(ctx context.Context, buf *bytes.Buffer) error {
   431  	fd, err := unix.Openat(-1, hf.hostPath, unix.O_RDONLY|unix.O_NOFOLLOW, 0)
   432  	if err != nil {
   433  		return err
   434  	}
   435  	file := os.NewFile(uintptr(fd), hf.hostPath)
   436  	defer file.Close()
   437  	_, err = buf.ReadFrom(file)
   438  	return err
   439  }
   440  
   441  func (fs *filesystem) newHostFile(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, hostPath string) kernfs.Inode {
   442  	hf := &hostFile{hostPath: hostPath}
   443  	hf.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), hf, mode)
   444  	return hf
   445  }