github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/erofs/erofs.go

github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/erofs/erofs.go (about)

     1  // Copyright 2023 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package erofs implements erofs.
    16  package erofs
    17  
    18  import (
    19  	"os"
    20  	"runtime"
    21  	"strconv"
    22  	"sync"
    23  	"sync/atomic"
    24  
    25  	"github.com/metacubex/gvisor/pkg/abi/linux"
    26  	"github.com/metacubex/gvisor/pkg/cleanup"
    27  	"github.com/metacubex/gvisor/pkg/context"
    28  	"github.com/metacubex/gvisor/pkg/erofs"
    29  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    30  	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
    31  	"github.com/metacubex/gvisor/pkg/sentry/memmap"
    32  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    33  )
    34  
// Name is the filesystem name. It is part of the interface used by users,
// e.g. via annotations, and shouldn't change.
const Name = "erofs"

// Mount option names for EROFS.
const (
	// moptImageFD is the key of the mount option that carries the file
	// descriptor of the EROFS image, written as "ifd=<file descriptor>".
	moptImageFD = "ifd"
)
    43  
// FilesystemType implements vfs.FilesystemType.
//
// It carries no fields; all per-mount state lives in the filesystem value
// created by GetFilesystem.
//
// +stateify savable
type FilesystemType struct{}
    48  
// filesystem implements vfs.FilesystemImpl.
//
// +stateify savable
type filesystem struct {
	// vfsfs is the vfs.Filesystem backed by this implementation. It is
	// initialized by FilesystemType.GetFilesystem.
	vfsfs vfs.Filesystem

	// Immutable options. mopts is the raw mount option string
	// (vfs.GetFilesystemOptions.Data) and iopts is the InternalData that was
	// passed to FilesystemType.GetFilesystem.
	mopts string
	iopts InternalFilesystemOptions

	// devMinor is the filesystem's minor device number. devMinor is immutable.
	devMinor uint32

	// root is the root dentry. root is immutable.
	root *dentry

	// image is the EROFS image. image is immutable.
	image *erofs.Image

	// mf implements memmap.File for this image.
	mf imageMemmapFile

	// inodeBuckets contains the inodes in use. Multiple buckets are used to
	// reduce lock contention. The bucket for a given nid is chosen by
	// filesystem.inodeBucket.
	inodeBuckets []inodeBucket
}
    76  
// InternalFilesystemOptions may be passed as
// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
// GetFilesystem rejects any other non-nil InternalData type with EINVAL.
//
// +stateify savable
type InternalFilesystemOptions struct {
	// If UniqueID is non-empty, it is an opaque string used to reassociate the
	// filesystem with a new image FD during restoration from checkpoint.
	UniqueID vfs.RestoreID
}
    86  
    87  // Name implements vfs.FilesystemType.Name.
    88  func (FilesystemType) Name() string {
    89  	return Name
    90  }
    91  
    92  // Release implements vfs.FilesystemType.Release.
    93  func (FilesystemType) Release(ctx context.Context) {}
    94  
    95  // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
    96  func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
    97  	mopts := vfs.GenericParseMountOptions(opts.Data)
    98  
    99  	var cu cleanup.Cleanup
   100  	defer cu.Clean()
   101  
   102  	fd, err := getFDFromMountOptionsMap(ctx, mopts)
   103  	if err != nil {
   104  		return nil, nil, err
   105  	}
   106  
   107  	f := os.NewFile(uintptr(fd), "EROFS image file")
   108  	image, err := erofs.OpenImage(f)
   109  	if err != nil {
   110  		f.Close()
   111  		return nil, nil, err
   112  	}
   113  	cu.Add(func() { image.Close() })
   114  
   115  	iopts, ok := opts.InternalData.(InternalFilesystemOptions)
   116  	if opts.InternalData != nil && !ok {
   117  		ctx.Warningf("erofs.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted erofs.InternalFilesystemOptions", opts.InternalData)
   118  		return nil, nil, linuxerr.EINVAL
   119  	}
   120  
   121  	devMinor, err := vfsObj.GetAnonBlockDevMinor()
   122  	if err != nil {
   123  		return nil, nil, err
   124  	}
   125  
   126  	fs := &filesystem{
   127  		mopts:    opts.Data,
   128  		iopts:    iopts,
   129  		image:    image,
   130  		devMinor: devMinor,
   131  		mf:       imageMemmapFile{image: image},
   132  	}
   133  	fs.vfsfs.Init(vfsObj, &fstype, fs)
   134  	cu.Add(func() { fs.vfsfs.DecRef(ctx) })
   135  
   136  	fs.inodeBuckets = make([]inodeBucket, runtime.GOMAXPROCS(0))
   137  	for i := range fs.inodeBuckets {
   138  		fs.inodeBuckets[i].init()
   139  	}
   140  
   141  	root, err := fs.newDentry(image.RootNid())
   142  	if err != nil {
   143  		return nil, nil, err
   144  	}
   145  
   146  	// Increase the root's reference count to 2. One reference is returned to
   147  	// the caller, and the other is held by fs.
   148  	root.IncRef()
   149  	fs.root = root
   150  
   151  	cu.Release()
   152  	return &fs.vfsfs, &root.vfsd, nil
   153  }
   154  
   155  func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int, error) {
   156  	ifdstr, ok := mopts[moptImageFD]
   157  	if !ok {
   158  		ctx.Warningf("erofs.getFDFromMountOptionsMap: image FD must be specified as '%s=<file descriptor>'", moptImageFD)
   159  		return -1, linuxerr.EINVAL
   160  	}
   161  	delete(mopts, moptImageFD)
   162  
   163  	ifd, err := strconv.Atoi(ifdstr)
   164  	if err != nil {
   165  		ctx.Warningf("erofs.getFDFromMountOptionsMap: invalid image FD: %s=%s", moptImageFD, ifdstr)
   166  		return -1, linuxerr.EINVAL
   167  	}
   168  
   169  	return ifd, nil
   170  }
   171  
   172  // Release implements vfs.FilesystemImpl.Release.
   173  func (fs *filesystem) Release(ctx context.Context) {
   174  	// An extra reference was held by the filesystem on the root.
   175  	if fs.root != nil {
   176  		fs.root.DecRef(ctx)
   177  	}
   178  	fs.image.Close()
   179  	fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
   180  }
   181  
   182  func (fs *filesystem) statFS() linux.Statfs {
   183  	blockSize := int64(fs.image.BlockSize())
   184  	return linux.Statfs{
   185  		Type:         erofs.SuperBlockMagicV1,
   186  		NameLength:   erofs.MaxNameLen,
   187  		BlockSize:    blockSize,
   188  		FragmentSize: blockSize,
   189  		Blocks:       uint64(fs.image.Blocks()),
   190  	}
   191  }
   192  
// inodeBucket is one shard of the filesystem's table of in-use inodes (see
// filesystem.inodeBuckets). Each bucket has its own lock.
//
// +stateify savable
type inodeBucket struct {
	// mu protects inodeMap.
	mu sync.RWMutex `state:"nosave"`

	// inodeMap contains the inodes indexed by nid.
	// +checklocks:mu
	inodeMap map[uint64]*inode
}
   202  
   203  func (ib *inodeBucket) init() {
   204  	ib.inodeMap = make(map[uint64]*inode) // +checklocksignore
   205  }
   206  
   207  // getInode returns the inode identified by nid. A reference on inode is also
   208  // returned to caller.
   209  func (ib *inodeBucket) getInode(nid uint64) *inode {
   210  	ib.mu.RLock()
   211  	defer ib.mu.RUnlock()
   212  	i := ib.inodeMap[nid]
   213  	if i != nil {
   214  		i.IncRef()
   215  	}
   216  	return i
   217  }
   218  
   219  // addInode adds the inode identified by nid into the bucket. It will first check
   220  // whether the old inode exists. If not, it will call newInode() to get the new inode.
   221  // The inode eventually saved in the bucket will be returned with a reference for caller.
   222  func (ib *inodeBucket) addInode(nid uint64, newInode func() *inode) *inode {
   223  	ib.mu.Lock()
   224  	defer ib.mu.Unlock()
   225  	if i, ok := ib.inodeMap[nid]; ok {
   226  		i.IncRef()
   227  		return i
   228  	}
   229  	i := newInode()
   230  	ib.inodeMap[nid] = i
   231  	return i
   232  }
   233  
   234  // removeInode removes the inode identified by nid.
   235  func (ib *inodeBucket) removeInode(nid uint64) {
   236  	ib.mu.Lock()
   237  	delete(ib.inodeMap, nid)
   238  	ib.mu.Unlock()
   239  }
   240  
   241  func (fs *filesystem) inodeBucket(nid uint64) *inodeBucket {
   242  	bucket := nid % uint64(len(fs.inodeBuckets))
   243  	return &fs.inodeBuckets[bucket]
   244  }
   245  
// inode represents a filesystem object.
//
// Each dentry holds a reference on the inode it represents. An inode will
// be dropped once its reference count reaches zero. We do not cache inodes
// directly. The caching policy is implemented on top of dentries.
//
// +stateify savable
type inode struct {
	// Inode is the underlying inode object obtained from the image (see
	// filesystem.getInode).
	erofs.Inode

	// inodeRefs is the reference count.
	inodeRefs

	// fs is the owning filesystem.
	fs *filesystem

	// dirMu protects dirents. dirents is immutable after creation.
	dirMu sync.RWMutex `state:"nosave"`
	// +checklocks:dirMu
	dirents []vfs.Dirent `state:"nosave"`

	// mapsMu protects mappings.
	mapsMu sync.Mutex `state:"nosave"`

	// mappings tracks the mappings of the file into memmap.MappingSpaces
	// if this inode represents a regular file.
	// +checklocks:mapsMu
	mappings memmap.MappingSet

	// locks supports POSIX and BSD style locks.
	locks vfs.FileLocks

	// watches holds the inotify watches for this inode.
	watches vfs.Watches
}
   281  
   282  // getInode returns the inode identified by nid. A reference on inode is also
   283  // returned to caller.
   284  func (fs *filesystem) getInode(nid uint64) (*inode, error) {
   285  	bucket := fs.inodeBucket(nid)
   286  
   287  	// Fast path, inode already exists.
   288  	if i := bucket.getInode(nid); i != nil {
   289  		return i, nil
   290  	}
   291  
   292  	// Slow path, create a new inode.
   293  	//
   294  	// Construct the underlying inode object from the image without taking
   295  	// the bucket lock first to reduce the contention.
   296  	ino, err := fs.image.Inode(nid)
   297  	if err != nil {
   298  		return nil, err
   299  	}
   300  	return bucket.addInode(nid, func() *inode {
   301  		i := &inode{
   302  			Inode: ino,
   303  			fs:    fs,
   304  		}
   305  		i.InitRefs()
   306  		return i
   307  	}), nil
   308  
   309  }
   310  
   311  // DecRef should be called when you're finished with an inode.
   312  func (i *inode) DecRef(ctx context.Context) {
   313  	i.inodeRefs.DecRef(func() {
   314  		nid := i.Nid()
   315  		i.fs.inodeBucket(nid).removeInode(nid)
   316  	})
   317  }
   318  
   319  func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
   320  	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(i.Mode()), auth.KUID(i.UID()), auth.KGID(i.GID()))
   321  }
   322  
   323  func (i *inode) statTo(stat *linux.Statx) {
   324  	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK |
   325  		linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE |
   326  		linux.STATX_BLOCKS | linux.STATX_ATIME | linux.STATX_CTIME |
   327  		linux.STATX_MTIME
   328  	stat.Blksize = i.fs.image.BlockSize()
   329  	stat.Nlink = i.Nlink()
   330  	stat.UID = i.UID()
   331  	stat.GID = i.GID()
   332  	stat.Mode = i.Mode()
   333  	stat.Ino = i.Nid()
   334  	stat.Size = i.Size()
   335  	stat.Blocks = (stat.Size + 511) / 512
   336  	stat.Mtime = linux.StatxTimestamp{
   337  		Sec:  int64(i.Mtime()),
   338  		Nsec: i.MtimeNsec(),
   339  	}
   340  	stat.Atime = stat.Mtime
   341  	stat.Ctime = stat.Mtime
   342  	stat.DevMajor = linux.UNNAMED_MAJOR
   343  	stat.DevMinor = i.fs.devMinor
   344  }
   345  
   346  func (i *inode) fileType() uint16 {
   347  	return i.Mode() & linux.S_IFMT
   348  }
   349  
// dentry implements vfs.DentryImpl.
//
// The filesystem is read-only and currently we never drop the cached dentries
// until the filesystem is unmounted. The reference model works like this:
//
//   - The initial reference count of each dentry is one, which is the reference
//     held by the parent (so when the reference count is one, it also means that
//     this is a cached dentry, i.e. not in use).
//
//   - When a dentry is used (e.g. opened by someone), its reference count will
//     be increased and the new reference is held by the caller.
//
//   - The reference count of the root dentry is two. One reference is returned
//     to the caller of `GetFilesystem()`, and the other is held by `fs`.
//
// TODO: This can lead to unbounded memory growth in sentry due to the ever-growing
// dentry tree. We should have a dentry LRU cache, similar to what fsimpl/gofer does.
//
// +stateify savable
type dentry struct {
	// vfsd is the vfs.Dentry for which this dentry is the implementation (see
	// newDentry).
	vfsd vfs.Dentry

	// dentryRefs is the reference count.
	dentryRefs

	// parent is this dentry's parent directory. If this dentry is
	// a file system root, parent is nil.
	parent atomic.Pointer[dentry] `state:".(*dentry)"`

	// name is this dentry's name in its parent. If this dentry is
	// a file system root, name is the empty string.
	name string

	// inode is the inode represented by this dentry.
	inode *inode

	// dirMu serializes changes to the dentry tree.
	dirMu sync.RWMutex `state:"nosave"`

	// childMap contains the mappings of child names to dentries if this
	// dentry represents a directory.
	// +checklocks:dirMu
	childMap map[string]*dentry
}
   394  
   395  // The caller is expected to handle dentry insertion into dentry tree.
   396  func (fs *filesystem) newDentry(nid uint64) (*dentry, error) {
   397  	i, err := fs.getInode(nid)
   398  	if err != nil {
   399  		return nil, err
   400  	}
   401  	d := &dentry{
   402  		inode: i,
   403  	}
   404  	d.InitRefs()
   405  	d.vfsd.Init(d)
   406  	return d, nil
   407  }
   408  
   409  // DecRef implements vfs.DentryImpl.DecRef.
   410  func (d *dentry) DecRef(ctx context.Context) {
   411  	d.dentryRefs.DecRef(func() {
   412  		d.dirMu.Lock()
   413  		for _, c := range d.childMap {
   414  			c.DecRef(ctx)
   415  		}
   416  		d.childMap = nil
   417  		d.dirMu.Unlock()
   418  		d.inode.DecRef(ctx)
   419  	})
   420  }
   421  
   422  // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
   423  func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
   424  	if d.inode.IsDir() {
   425  		events |= linux.IN_ISDIR
   426  	}
   427  	// The ordering below is important, Linux always notifies the parent first.
   428  	if parent := d.parent.Load(); parent != nil {
   429  		parent.inode.watches.Notify(ctx, d.name, events, cookie, et, false)
   430  	}
   431  	d.inode.watches.Notify(ctx, "", events, cookie, et, false)
   432  }
   433  
   434  // Watches implements vfs.DentryImpl.Watches.
   435  func (d *dentry) Watches() *vfs.Watches {
   436  	return &d.inode.watches
   437  }
   438  
   439  // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
   440  func (d *dentry) OnZeroWatches(ctx context.Context) {}
   441  
   442  func (d *dentry) open(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
   443  	ats := vfs.AccessTypesForOpenFlags(opts)
   444  	if err := d.inode.checkPermissions(rp.Credentials(), ats); err != nil {
   445  		return nil, err
   446  	}
   447  
   448  	switch d.inode.fileType() {
   449  	case linux.S_IFREG:
   450  		if ats&vfs.MayWrite != 0 {
   451  			return nil, linuxerr.EROFS
   452  		}
   453  		var fd regularFileFD
   454  		fd.LockFD.Init(&d.inode.locks)
   455  		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil {
   456  			return nil, err
   457  		}
   458  		return &fd.vfsfd, nil
   459  
   460  	case linux.S_IFDIR:
   461  		// Can't open directories with O_CREAT.
   462  		if opts.Flags&linux.O_CREAT != 0 {
   463  			return nil, linuxerr.EISDIR
   464  		}
   465  		// Can't open directories writably.
   466  		if ats&vfs.MayWrite != 0 {
   467  			return nil, linuxerr.EISDIR
   468  		}
   469  		if opts.Flags&linux.O_DIRECT != 0 {
   470  			return nil, linuxerr.EINVAL
   471  		}
   472  		var fd directoryFD
   473  		fd.LockFD.Init(&d.inode.locks)
   474  		if err := fd.vfsfd.Init(&fd, opts.Flags, rp.Mount(), &d.vfsd, &vfs.FileDescriptionOptions{AllowDirectIO: true}); err != nil {
   475  			return nil, err
   476  		}
   477  		return &fd.vfsfd, nil
   478  
   479  	case linux.S_IFLNK:
   480  		// Can't open symlinks without O_PATH, which is handled at the VFS layer.
   481  		return nil, linuxerr.ELOOP
   482  
   483  	default:
   484  		return nil, linuxerr.ENXIO
   485  	}
   486  }
   487  
// fileDescription holds the state and default method implementations shared
// by this package's file descriptions.
// NOTE(review): presumably embedded by regularFileFD and directoryFD, which
// are defined outside this file — confirm.
//
// +stateify savable
type fileDescription struct {
	// vfsfd is the vfs.FileDescription that this implementation backs.
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.LockFD

	// lockLogging is a one-shot guard; its use is not visible in this file.
	lockLogging sync.Once `state:"nosave"`
}
   496  
   497  func (fd *fileDescription) filesystem() *filesystem {
   498  	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
   499  }
   500  
   501  func (fd *fileDescription) dentry() *dentry {
   502  	return fd.vfsfd.Dentry().Impl().(*dentry)
   503  }
   504  
   505  func (fd *fileDescription) inode() *inode {
   506  	return fd.dentry().inode
   507  }
   508  
   509  // Stat implements vfs.FileDescriptionImpl.Stat.
   510  func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
   511  	var stat linux.Statx
   512  	fd.inode().statTo(&stat)
   513  	return stat, nil
   514  }
   515  
   516  // SetStat implements vfs.FileDescriptionImpl.SetStat.
   517  func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
   518  	return linuxerr.EROFS
   519  }
   520  
   521  // StatFS implements vfs.FileDescriptionImpl.StatFS.
   522  func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
   523  	return fd.filesystem().statFS(), nil
   524  }
   525  
   526  // ListXattr implements vfs.FileDescriptionImpl.ListXattr.
   527  func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
   528  	return nil, linuxerr.ENOTSUP
   529  }
   530  
   531  // GetXattr implements vfs.FileDescriptionImpl.GetXattr.
   532  func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
   533  	return "", linuxerr.ENOTSUP
   534  }
   535  
   536  // SetXattr implements vfs.FileDescriptionImpl.SetXattr.
   537  func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
   538  	return linuxerr.EROFS
   539  }
   540  
   541  // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
   542  func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
   543  	return linuxerr.EROFS
   544  }
   545  
   546  // Sync implements vfs.FileDescriptionImpl.Sync.
   547  func (*fileDescription) Sync(context.Context) error {
   548  	return nil
   549  }
   550  
   551  // Release implements vfs.FileDescriptionImpl.Release.
   552  func (*fileDescription) Release(ctx context.Context) {}