gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/fsimpl/sys/sys.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package sys implements sysfs.
    16  package sys
    17  
    18  import (
    19  	"bytes"
    20  	"fmt"
    21  	"os"
    22  	"path"
    23  	"strconv"
    24  
    25  	"golang.org/x/sys/unix"
    26  	"gvisor.dev/gvisor/pkg/abi/linux"
    27  	"gvisor.dev/gvisor/pkg/context"
    28  	"gvisor.dev/gvisor/pkg/coverage"
    29  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    30  	"gvisor.dev/gvisor/pkg/log"
    31  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
    32  	"gvisor.dev/gvisor/pkg/sentry/kernel"
    33  	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
    34  	"gvisor.dev/gvisor/pkg/sentry/vfs"
    35  )
    36  
    37  const (
    38  	// Name is the default filesystem name.
    39  	Name                     = "sysfs"
    40  	defaultSysMode           = linux.FileMode(0444)
    41  	defaultSysDirMode        = linux.FileMode(0755)
    42  	defaultMaxCachedDentries = uint64(1000)
    43  	iommuGroupSysPath        = "/sys/kernel/iommu_groups/"
    44  )
    45  
// FilesystemType implements vfs.FilesystemType.
//
// The type carries no state of its own; everything a mount needs is built by
// GetFilesystem.
//
// +stateify savable
type FilesystemType struct{}
    50  
// InternalData contains internal data passed in via
// vfs.GetFilesystemOptions.InternalData.
//
// +stateify savable
type InternalData struct {
	// ProductName is the value to be set to devices/virtual/dmi/id/product_name.
	// When it is empty, the dmi entries are not created at all.
	ProductName string
	// EnableTPUProxyPaths is whether to populate sysfs paths used by hardware
	// accelerators. When set, GetFilesystem mirrors PCI device and IOMMU
	// group directories from the host.
	EnableTPUProxyPaths bool
	// TestSysfsPathPrefix is a prefix for the sysfs paths. It is useful for
	// unit testing. It is joined in front of the host paths that are read
	// when EnableTPUProxyPaths is set.
	TestSysfsPathPrefix string
}
    65  
// filesystem implements vfs.FilesystemImpl.
//
// +stateify savable
type filesystem struct {
	kernfs.Filesystem

	// devMinor is the anonymous block device minor number obtained in
	// GetFilesystem. It is used as the device minor of every inode created by
	// this filesystem and is handed back in Release.
	devMinor uint32
}
    74  
    75  // Name implements vfs.FilesystemType.Name.
    76  func (FilesystemType) Name() string {
    77  	return Name
    78  }
    79  
    80  // Release implements vfs.FilesystemType.Release.
    81  func (FilesystemType) Release(ctx context.Context) {}
    82  
    83  // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
    84  func (fsType FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
    85  	devMinor, err := vfsObj.GetAnonBlockDevMinor()
    86  	if err != nil {
    87  		return nil, nil, err
    88  	}
    89  
    90  	mopts := vfs.GenericParseMountOptions(opts.Data)
    91  	maxCachedDentries := defaultMaxCachedDentries
    92  	if str, ok := mopts["dentry_cache_limit"]; ok {
    93  		delete(mopts, "dentry_cache_limit")
    94  		maxCachedDentries, err = strconv.ParseUint(str, 10, 64)
    95  		if err != nil {
    96  			ctx.Warningf("sys.FilesystemType.GetFilesystem: invalid dentry cache limit: dentry_cache_limit=%s", str)
    97  			return nil, nil, linuxerr.EINVAL
    98  		}
    99  	}
   100  
   101  	fs := &filesystem{
   102  		devMinor: devMinor,
   103  	}
   104  	fs.MaxCachedDentries = maxCachedDentries
   105  	fs.VFSFilesystem().Init(vfsObj, &fsType, fs)
   106  
   107  	k := kernel.KernelFromContext(ctx)
   108  	fsDirChildren := make(map[string]kernfs.Inode)
   109  	// Create an empty directory to serve as the mount point for cgroupfs when
   110  	// cgroups are available. This emulates Linux behaviour, see
   111  	// kernel/cgroup.c:cgroup_init(). Note that in Linux, userspace (typically
   112  	// the init process) is ultimately responsible for actually mounting
   113  	// cgroupfs, but the kernel creates the mountpoint. For the sentry, the
   114  	// launcher mounts cgroupfs.
   115  	if k.CgroupRegistry() != nil {
   116  		fsDirChildren["cgroup"] = fs.newCgroupDir(ctx, creds, defaultSysDirMode, nil)
   117  	}
   118  
   119  	classSub := map[string]kernfs.Inode{
   120  		"power_supply": fs.newDir(ctx, creds, defaultSysDirMode, nil),
   121  	}
   122  	devicesSub := map[string]kernfs.Inode{
   123  		"system": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
   124  			"cpu": cpuDir(ctx, fs, creds),
   125  		}),
   126  	}
   127  
   128  	productName := ""
   129  	busSub := make(map[string]kernfs.Inode)
   130  	kernelSub := kernelDir(ctx, fs, creds)
   131  	if opts.InternalData != nil {
   132  		idata := opts.InternalData.(*InternalData)
   133  		productName = idata.ProductName
   134  		if idata.EnableTPUProxyPaths {
   135  			deviceToIommuGroup, err := pciDeviceIOMMUGroups(path.Join(idata.TestSysfsPathPrefix, iommuGroupSysPath))
   136  			if err != nil {
   137  				return nil, nil, err
   138  			}
   139  			pciPath := path.Join(idata.TestSysfsPathPrefix, pciMainBusDevicePath)
   140  			pciMainBusSub, err := fs.mirrorPCIBusDeviceDir(ctx, creds, pciPath, deviceToIommuGroup)
   141  			if err != nil {
   142  				return nil, nil, err
   143  			}
   144  			devicesSub["pci0000:00"] = fs.newDir(ctx, creds, defaultSysDirMode, pciMainBusSub)
   145  
   146  			deviceDirs, err := fs.newDeviceClassDir(ctx, creds, []string{accelDevice, vfioDevice}, pciPath)
   147  			if err != nil {
   148  				return nil, nil, err
   149  			}
   150  
   151  			for tpuDeviceType, symlinkDir := range deviceDirs {
   152  				classSub[tpuDeviceType] = fs.newDir(ctx, creds, defaultSysDirMode, symlinkDir)
   153  			}
   154  			pciDevicesSub, err := fs.newBusPCIDevicesDir(ctx, creds, pciPath)
   155  			if err != nil {
   156  				return nil, nil, err
   157  			}
   158  			busSub["pci"] = fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
   159  				"devices": fs.newDir(ctx, creds, defaultSysDirMode, pciDevicesSub),
   160  			})
   161  			iommuPath := path.Join(idata.TestSysfsPathPrefix, iommuGroupSysPath)
   162  			iommuGroups, err := fs.mirrorIOMMUGroups(ctx, creds, iommuPath)
   163  			if err != nil {
   164  				return nil, nil, err
   165  			}
   166  			kernelSub["iommu_groups"] = fs.newDir(ctx, creds, defaultSysDirMode, iommuGroups)
   167  		}
   168  	}
   169  
   170  	if len(productName) > 0 {
   171  		log.Debugf("Setting product_name: %q", productName)
   172  		classSub["dmi"] = fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
   173  			"id": kernfs.NewStaticSymlink(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), "../../devices/virtual/dmi/id"),
   174  		})
   175  		devicesSub["virtual"] = fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
   176  			"dmi": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
   177  				"id": fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
   178  					"product_name": fs.newStaticFile(ctx, creds, defaultSysMode, productName+"\n"),
   179  				}),
   180  			}),
   181  		})
   182  	}
   183  	root := fs.newDir(ctx, creds, defaultSysDirMode, map[string]kernfs.Inode{
   184  		"block":    fs.newDir(ctx, creds, defaultSysDirMode, nil),
   185  		"bus":      fs.newDir(ctx, creds, defaultSysDirMode, busSub),
   186  		"class":    fs.newDir(ctx, creds, defaultSysDirMode, classSub),
   187  		"dev":      fs.newDir(ctx, creds, defaultSysDirMode, nil),
   188  		"devices":  fs.newDir(ctx, creds, defaultSysDirMode, devicesSub),
   189  		"firmware": fs.newDir(ctx, creds, defaultSysDirMode, nil),
   190  		"fs":       fs.newDir(ctx, creds, defaultSysDirMode, fsDirChildren),
   191  		"kernel":   fs.newDir(ctx, creds, defaultSysDirMode, kernelSub),
   192  		"module":   fs.newDir(ctx, creds, defaultSysDirMode, nil),
   193  		"power":    fs.newDir(ctx, creds, defaultSysDirMode, nil),
   194  	})
   195  	var rootD kernfs.Dentry
   196  	rootD.InitRoot(&fs.Filesystem, root)
   197  	return fs.VFSFilesystem(), rootD.VFSDentry(), nil
   198  }
   199  
   200  func cpuDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) kernfs.Inode {
   201  	k := kernel.KernelFromContext(ctx)
   202  	maxCPUCores := k.ApplicationCores()
   203  	children := map[string]kernfs.Inode{
   204  		"online":   fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)),
   205  		"possible": fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)),
   206  		"present":  fs.newCPUFile(ctx, creds, maxCPUCores, linux.FileMode(0444)),
   207  	}
   208  	for i := uint(0); i < maxCPUCores; i++ {
   209  		children[fmt.Sprintf("cpu%d", i)] = fs.newDir(ctx, creds, linux.FileMode(0555), nil)
   210  	}
   211  	return fs.newDir(ctx, creds, defaultSysDirMode, children)
   212  }
   213  
   214  // Returns a map from a PCI device name to its IOMMU group if available.
   215  func pciDeviceIOMMUGroups(iommuGroupsPath string) (map[string]string, error) {
   216  	// IOMMU groups are organized as iommu_group_path/$GROUP, where $GROUP is
   217  	// the IOMMU group number of which the device is a member.
   218  	iommuGroupNums, err := hostDirEntries(iommuGroupsPath)
   219  	if err != nil {
   220  		// When IOMMU is not enabled, skip the rest of the process.
   221  		if err == unix.ENOENT {
   222  			return nil, nil
   223  		}
   224  		return nil, err
   225  	}
   226  	// The returned map from PCI device name to its IOMMU group.
   227  	iommuGroups := map[string]string{}
   228  	for _, iommuGroupNum := range iommuGroupNums {
   229  		groupDevicesPath := path.Join(iommuGroupsPath, iommuGroupNum, "devices")
   230  		pciDeviceNames, err := hostDirEntries(groupDevicesPath)
   231  		if err != nil {
   232  			return nil, err
   233  		}
   234  		// An IOMMU group may include multiple devices.
   235  		for _, pciDeviceName := range pciDeviceNames {
   236  			iommuGroups[pciDeviceName] = iommuGroupNum
   237  		}
   238  	}
   239  	return iommuGroups, nil
   240  }
   241  
   242  func kernelDir(ctx context.Context, fs *filesystem, creds *auth.Credentials) map[string]kernfs.Inode {
   243  	// Set up /sys/kernel/debug/kcov. Technically, debugfs should be
   244  	// mounted at debug/, but for our purposes, it is sufficient to keep it
   245  	// in sys.
   246  	children := make(map[string]kernfs.Inode)
   247  	if coverage.KcovSupported() {
   248  		log.Debugf("Set up /sys/kernel/debug/kcov")
   249  		children["debug"] = fs.newDir(ctx, creds, linux.FileMode(0700), map[string]kernfs.Inode{
   250  			"kcov": fs.newKcovFile(ctx, creds),
   251  		})
   252  	}
   253  	return children
   254  }
   255  
   256  // Recursively build out IOMMU directories from the host.
   257  func (fs *filesystem) mirrorIOMMUGroups(ctx context.Context, creds *auth.Credentials, dir string) (map[string]kernfs.Inode, error) {
   258  	subs := map[string]kernfs.Inode{}
   259  	dents, err := hostDirEntries(dir)
   260  	if err != nil {
   261  		// TPU before v5 doesn't need IOMMU, skip the whole process for the backward compatibility when the directory can't be found.
   262  		if err == unix.ENOENT {
   263  			log.Debugf("Skip the path at %v which cannot be found.", dir)
   264  			return nil, nil
   265  		}
   266  		return nil, err
   267  	}
   268  	for _, dent := range dents {
   269  		absPath := path.Join(dir, dent)
   270  		mode, err := hostFileMode(absPath)
   271  		if err != nil {
   272  			return nil, err
   273  		}
   274  		switch mode {
   275  		case unix.S_IFDIR:
   276  			contents, err := fs.mirrorIOMMUGroups(ctx, creds, absPath)
   277  			if err != nil {
   278  				return nil, err
   279  			}
   280  			subs[dent] = fs.newDir(ctx, creds, defaultSysMode, contents)
   281  		case unix.S_IFREG:
   282  			subs[dent] = fs.newHostFile(ctx, creds, defaultSysMode, absPath)
   283  		case unix.S_IFLNK:
   284  			if pciDeviceRegex.MatchString(dent) {
   285  				subs[dent] = kernfs.NewStaticSymlink(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), fmt.Sprintf("../../../../devices/pci0000:00/%s", dent))
   286  			}
   287  		}
   288  	}
   289  	return subs, nil
   290  }
   291  
   292  // Release implements vfs.FilesystemImpl.Release.
   293  func (fs *filesystem) Release(ctx context.Context) {
   294  	fs.Filesystem.VFSFilesystem().VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
   295  	fs.Filesystem.Release(ctx)
   296  }
   297  
   298  // MountOptions implements vfs.FilesystemImpl.MountOptions.
   299  func (fs *filesystem) MountOptions() string {
   300  	return fmt.Sprintf("dentry_cache_limit=%d", fs.MaxCachedDentries)
   301  }
   302  
// dir implements kernfs.Inode.
//
// It is the directory inode used throughout sysfs. Its children are fixed
// when the directory is created (see newDir) and attribute changes are
// rejected by SetStat.
//
// +stateify savable
type dir struct {
	dirRefs
	kernfs.InodeAlwaysValid
	kernfs.InodeAttrs
	kernfs.InodeDirectoryNoNewChildren
	kernfs.InodeNotAnonymous
	kernfs.InodeNotSymlink
	kernfs.InodeTemporary
	kernfs.InodeWatches
	kernfs.OrderedChildren

	// locks is handed to every directory file description created by Open.
	locks vfs.FileLocks
}
   319  
   320  func (fs *filesystem) newDir(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode {
   321  	d := &dir{}
   322  	d.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
   323  	d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
   324  	d.InitRefs()
   325  	d.IncLinks(d.OrderedChildren.Populate(contents))
   326  	return d
   327  }
   328  
   329  func (fs *filesystem) newCgroupDir(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, contents map[string]kernfs.Inode) kernfs.Inode {
   330  	d := &cgroupDir{}
   331  	d.InodeAttrs.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), linux.ModeDirectory|0755)
   332  	d.OrderedChildren.Init(kernfs.OrderedChildrenOptions{})
   333  	d.InitRefs()
   334  	d.IncLinks(d.OrderedChildren.Populate(contents))
   335  	return d
   336  }
   337  
   338  // SetStat implements kernfs.Inode.SetStat not allowing inode attributes to be changed.
   339  func (*dir) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
   340  	return linuxerr.EPERM
   341  }
   342  
   343  // Open implements kernfs.Inode.Open.
   344  func (d *dir) Open(ctx context.Context, rp *vfs.ResolvingPath, kd *kernfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
   345  	opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC |
   346  		linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NONBLOCK | linux.O_NOCTTY
   347  	fd, err := kernfs.NewGenericDirectoryFD(rp.Mount(), kd, &d.OrderedChildren, &d.locks, &opts, kernfs.GenericDirectoryFDOptions{
   348  		SeekEnd: kernfs.SeekEndStaticEntries,
   349  	})
   350  	if err != nil {
   351  		return nil, err
   352  	}
   353  	return fd.VFSFileDescription(), nil
   354  }
   355  
   356  // DecRef implements kernfs.Inode.DecRef.
   357  func (d *dir) DecRef(ctx context.Context) {
   358  	d.dirRefs.DecRef(func() { d.Destroy(ctx) })
   359  }
   360  
   361  // StatFS implements kernfs.Inode.StatFS.
   362  func (d *dir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
   363  	return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil
   364  }
   365  
// cgroupDir implements kernfs.Inode.
//
// It behaves like dir except that its StatFS reports the tmpfs magic number
// instead of the sysfs one. GetFilesystem uses it for the fs/cgroup mount
// point.
//
// +stateify savable
type cgroupDir struct {
	dir
}
   372  
   373  // StatFS implements kernfs.Inode.StatFS.
   374  func (d *cgroupDir) StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error) {
   375  	return vfs.GenericStatFS(linux.TMPFS_MAGIC), nil
   376  }
   377  
// cpuFile implements kernfs.Inode.
//
// Its contents are produced on demand by Generate and describe a CPU range
// starting at 0.
//
// +stateify savable
type cpuFile struct {
	implStatFS
	kernfs.DynamicBytesFile

	// maxCores is the number of CPUs; the highest CPU index reported by
	// Generate is maxCores-1.
	maxCores uint
}
   387  
   388  // Generate implements vfs.DynamicBytesSource.Generate.
   389  func (c *cpuFile) Generate(ctx context.Context, buf *bytes.Buffer) error {
   390  	fmt.Fprintf(buf, "0-%d\n", c.maxCores-1)
   391  	return nil
   392  }
   393  
   394  func (fs *filesystem) newCPUFile(ctx context.Context, creds *auth.Credentials, maxCores uint, mode linux.FileMode) kernfs.Inode {
   395  	c := &cpuFile{maxCores: maxCores}
   396  	c.DynamicBytesFile.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), c, mode)
   397  	return c
   398  }
   399  
// implStatFS supplies a StatFS method reporting the sysfs magic number. It is
// meant to be embedded in inode types, as cpuFile does.
//
// +stateify savable
type implStatFS struct{}
   402  
   403  // StatFS implements kernfs.Inode.StatFS.
   404  func (*implStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
   405  	return vfs.GenericStatFS(linux.SYSFS_MAGIC), nil
   406  }
   407  
// staticFile is an inode whose contents are the fixed string held in the
// embedded vfs.StaticData.
//
// +stateify savable
type staticFile struct {
	kernfs.DynamicBytesFile
	vfs.StaticData
}
   413  
   414  func (fs *filesystem) newStaticFile(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, data string) kernfs.Inode {
   415  	s := &staticFile{StaticData: vfs.StaticData{Data: data}}
   416  	s.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), s, mode)
   417  	return s
   418  }
   419  
// hostFile is an inode whose contents are generated by reading from the
// host.
//
// +stateify savable
type hostFile struct {
	kernfs.DynamicBytesFile
	// hostPath is the path of the host file that Generate opens and reads.
	hostPath string
}
   428  
   429  func (hf *hostFile) Generate(ctx context.Context, buf *bytes.Buffer) error {
   430  	fd, err := unix.Openat(-1, hf.hostPath, unix.O_RDONLY|unix.O_NOFOLLOW, 0)
   431  	if err != nil {
   432  		return err
   433  	}
   434  	file := os.NewFile(uintptr(fd), hf.hostPath)
   435  	defer file.Close()
   436  	_, err = buf.ReadFrom(file)
   437  	return err
   438  }
   439  
   440  func (fs *filesystem) newHostFile(ctx context.Context, creds *auth.Credentials, mode linux.FileMode, hostPath string) kernfs.Inode {
   441  	hf := &hostFile{hostPath: hostPath}
   442  	hf.Init(ctx, creds, linux.UNNAMED_MAJOR, fs.devMinor, fs.NextIno(), hf, mode)
   443  	return hf
   444  }