github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fsimpl/ext/filesystem.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package ext
    16  
    17  import (
    18  	"errors"
    19  	"io"
    20  
    21  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    22  	"github.com/SagerNet/gvisor/pkg/context"
    23  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    24  	"github.com/SagerNet/gvisor/pkg/fspath"
    25  	"github.com/SagerNet/gvisor/pkg/sentry/fsimpl/ext/disklayout"
    26  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    27  	"github.com/SagerNet/gvisor/pkg/sentry/socket/unix/transport"
    28  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    29  	"github.com/SagerNet/gvisor/pkg/sync"
    30  	"github.com/SagerNet/gvisor/pkg/syserror"
    31  )
    32  
    33  var (
    34  	// errResolveDirent indicates that the vfs.ResolvingPath.Component() does
    35  	// not exist on the dentry tree but does exist on disk. So it has to be read in
    36  	// using the in-memory dirent and added to the dentry tree. Usually indicates
    37  	// the need to lock filesystem.mu for writing.
    38  	errResolveDirent = errors.New("resolve path component using dirent")
    39  )
    40  
// filesystem implements vfs.FilesystemImpl.
//
// +stateify savable
type filesystem struct {
	// vfsfs is the vfs.Filesystem this implementation is embedded in.
	vfsfs vfs.Filesystem

	// mu serializes changes to the Dentry tree.
	mu sync.RWMutex `state:"nosave"`

	// dev represents the underlying fs device. It does not require protection
	// because io.ReaderAt permits concurrent read calls to it. It translates to
	// the pread syscall which passes on the read request directly to the device
	// driver. Device drivers are intelligent in serving multiple concurrent read
	// requests in the optimal order (taking locality into consideration).
	dev io.ReaderAt

	// inodeCache maps absolute inode numbers to the corresponding inode struct.
	// Inodes should be removed from this map once their reference count hits 0.
	//
	// Protected by mu because most additions (see IterDirents) and all removals
	// correspond to a change in the dentry tree.
	inodeCache map[uint32]*inode

	// sb represents the filesystem superblock. Immutable after initialization.
	sb disklayout.SuperBlock

	// bgs represents all the block group descriptors for the filesystem.
	// Immutable after initialization.
	bgs []disklayout.BlockGroup

	// devMinor is this filesystem's device minor number. Immutable after
	// initialization.
	devMinor uint32
}

// Compile-time check that *filesystem implements vfs.FilesystemImpl.
var _ vfs.FilesystemImpl = (*filesystem)(nil)
    78  
    79  // stepLocked resolves rp.Component() in parent directory vfsd. The write
    80  // parameter passed tells if the caller has acquired filesystem.mu for writing
    81  // or not. If set to true, an existing inode on disk can be added to the dentry
    82  // tree if not present already.
    83  //
    84  // stepLocked is loosely analogous to fs/namei.c:walk_component().
    85  //
    86  // Preconditions:
    87  // * filesystem.mu must be locked (for writing if write param is true).
    88  // * !rp.Done().
    89  // * inode == vfsd.Impl().(*Dentry).inode.
    90  func stepLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry, inode *inode, write bool) (*vfs.Dentry, *inode, error) {
    91  	if !inode.isDir() {
    92  		return nil, nil, syserror.ENOTDIR
    93  	}
    94  	if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
    95  		return nil, nil, err
    96  	}
    97  
    98  	for {
    99  		name := rp.Component()
   100  		if name == "." {
   101  			rp.Advance()
   102  			return vfsd, inode, nil
   103  		}
   104  		d := vfsd.Impl().(*dentry)
   105  		if name == ".." {
   106  			isRoot, err := rp.CheckRoot(ctx, vfsd)
   107  			if err != nil {
   108  				return nil, nil, err
   109  			}
   110  			if isRoot || d.parent == nil {
   111  				rp.Advance()
   112  				return vfsd, inode, nil
   113  			}
   114  			if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
   115  				return nil, nil, err
   116  			}
   117  			rp.Advance()
   118  			return &d.parent.vfsd, d.parent.inode, nil
   119  		}
   120  
   121  		dir := inode.impl.(*directory)
   122  		child, ok := dir.childCache[name]
   123  		if !ok {
   124  			// We may need to instantiate a new dentry for this child.
   125  			childDirent, ok := dir.childMap[name]
   126  			if !ok {
   127  				// The underlying inode does not exist on disk.
   128  				return nil, nil, syserror.ENOENT
   129  			}
   130  
   131  			if !write {
   132  				// filesystem.mu must be held for writing to add to the dentry tree.
   133  				return nil, nil, errResolveDirent
   134  			}
   135  
   136  			// Create and add the component's dirent to the dentry tree.
   137  			fs := rp.Mount().Filesystem().Impl().(*filesystem)
   138  			childInode, err := fs.getOrCreateInodeLocked(childDirent.diskDirent.Inode())
   139  			if err != nil {
   140  				return nil, nil, err
   141  			}
   142  			// incRef because this is being added to the dentry tree.
   143  			childInode.incRef()
   144  			child = newDentry(childInode)
   145  			child.parent = d
   146  			child.name = name
   147  			dir.childCache[name] = child
   148  		}
   149  		if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
   150  			return nil, nil, err
   151  		}
   152  		if child.inode.isSymlink() && rp.ShouldFollowSymlink() {
   153  			if err := rp.HandleSymlink(child.inode.impl.(*symlink).target); err != nil {
   154  				return nil, nil, err
   155  			}
   156  			continue
   157  		}
   158  		rp.Advance()
   159  		return &child.vfsd, child.inode, nil
   160  	}
   161  }
   162  
   163  // walkLocked resolves rp to an existing file. The write parameter
   164  // passed tells if the caller has acquired filesystem.mu for writing or not.
   165  // If set to true, additions can be made to the dentry tree while walking.
   166  // If errResolveDirent is returned, the walk needs to be continued with an
   167  // upgraded filesystem.mu.
   168  //
   169  // walkLocked is loosely analogous to Linux's fs/namei.c:path_lookupat().
   170  //
   171  // Preconditions:
   172  // * filesystem.mu must be locked (for writing if write param is true).
   173  func walkLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
   174  	vfsd := rp.Start()
   175  	inode := vfsd.Impl().(*dentry).inode
   176  	for !rp.Done() {
   177  		var err error
   178  		vfsd, inode, err = stepLocked(ctx, rp, vfsd, inode, write)
   179  		if err != nil {
   180  			return nil, nil, err
   181  		}
   182  	}
   183  	if rp.MustBeDir() && !inode.isDir() {
   184  		return nil, nil, syserror.ENOTDIR
   185  	}
   186  	return vfsd, inode, nil
   187  }
   188  
   189  // walkParentLocked resolves all but the last path component of rp to an
   190  // existing directory. It does not check that the returned directory is
   191  // searchable by the provider of rp. The write parameter passed tells if the
   192  // caller has acquired filesystem.mu for writing or not. If set to true,
   193  // additions can be made to the dentry tree while walking.
   194  // If errResolveDirent is returned, the walk needs to be continued with an
   195  // upgraded filesystem.mu.
   196  //
   197  // walkParentLocked is loosely analogous to Linux's fs/namei.c:path_parentat().
   198  //
   199  // Preconditions:
   200  // * filesystem.mu must be locked (for writing if write param is true).
   201  // * !rp.Done().
   202  func walkParentLocked(ctx context.Context, rp *vfs.ResolvingPath, write bool) (*vfs.Dentry, *inode, error) {
   203  	vfsd := rp.Start()
   204  	inode := vfsd.Impl().(*dentry).inode
   205  	for !rp.Final() {
   206  		var err error
   207  		vfsd, inode, err = stepLocked(ctx, rp, vfsd, inode, write)
   208  		if err != nil {
   209  			return nil, nil, err
   210  		}
   211  	}
   212  	if !inode.isDir() {
   213  		return nil, nil, syserror.ENOTDIR
   214  	}
   215  	return vfsd, inode, nil
   216  }
   217  
   218  // walk resolves rp to an existing file. If parent is set to true, it resolves
   219  // the rp till the parent of the last component which should be an existing
   220  // directory. If parent is false then resolves rp entirely. Attemps to resolve
   221  // the path as far as it can with a read lock and upgrades the lock if needed.
   222  func (fs *filesystem) walk(ctx context.Context, rp *vfs.ResolvingPath, parent bool) (*vfs.Dentry, *inode, error) {
   223  	var (
   224  		vfsd  *vfs.Dentry
   225  		inode *inode
   226  		err   error
   227  	)
   228  
   229  	// Try walking with the hopes that all dentries have already been pulled out
   230  	// of disk. This reduces congestion (allows concurrent walks).
   231  	fs.mu.RLock()
   232  	if parent {
   233  		vfsd, inode, err = walkParentLocked(ctx, rp, false)
   234  	} else {
   235  		vfsd, inode, err = walkLocked(ctx, rp, false)
   236  	}
   237  	fs.mu.RUnlock()
   238  
   239  	if err == errResolveDirent {
   240  		// Upgrade lock and continue walking. Lock upgrading in the middle of the
   241  		// walk is fine as this is a read only filesystem.
   242  		fs.mu.Lock()
   243  		if parent {
   244  			vfsd, inode, err = walkParentLocked(ctx, rp, true)
   245  		} else {
   246  			vfsd, inode, err = walkLocked(ctx, rp, true)
   247  		}
   248  		fs.mu.Unlock()
   249  	}
   250  
   251  	return vfsd, inode, err
   252  }
   253  
   254  // getOrCreateInodeLocked gets the inode corresponding to the inode number passed in.
   255  // It creates a new one with the given inode number if one does not exist.
   256  // The caller must increment the ref count if adding this to the dentry tree.
   257  //
   258  // Precondition: must be holding fs.mu for writing.
   259  func (fs *filesystem) getOrCreateInodeLocked(inodeNum uint32) (*inode, error) {
   260  	if in, ok := fs.inodeCache[inodeNum]; ok {
   261  		return in, nil
   262  	}
   263  
   264  	in, err := newInode(fs, inodeNum)
   265  	if err != nil {
   266  		return nil, err
   267  	}
   268  
   269  	fs.inodeCache[inodeNum] = in
   270  	return in, nil
   271  }
   272  
   273  // statTo writes the statfs fields to the output parameter.
   274  func (fs *filesystem) statTo(stat *linux.Statfs) {
   275  	stat.Type = uint64(fs.sb.Magic())
   276  	stat.BlockSize = int64(fs.sb.BlockSize())
   277  	stat.Blocks = fs.sb.BlocksCount()
   278  	stat.BlocksFree = fs.sb.FreeBlocksCount()
   279  	stat.BlocksAvailable = fs.sb.FreeBlocksCount()
   280  	stat.Files = uint64(fs.sb.InodesCount())
   281  	stat.FilesFree = uint64(fs.sb.FreeInodesCount())
   282  	stat.NameLength = disklayout.MaxFileName
   283  	stat.FragmentSize = int64(fs.sb.BlockSize())
   284  	// TODO(b/134676337): Set Statfs.Flags and Statfs.FSID.
   285  }
   286  
   287  // AccessAt implements vfs.Filesystem.Impl.AccessAt.
   288  func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
   289  	_, inode, err := fs.walk(ctx, rp, false)
   290  	if err != nil {
   291  		return err
   292  	}
   293  	return inode.checkPermissions(rp.Credentials(), ats)
   294  }
   295  
   296  // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
   297  func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
   298  	vfsd, inode, err := fs.walk(ctx, rp, false)
   299  	if err != nil {
   300  		return nil, err
   301  	}
   302  
   303  	if opts.CheckSearchable {
   304  		if !inode.isDir() {
   305  			return nil, syserror.ENOTDIR
   306  		}
   307  		if err := inode.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
   308  			return nil, err
   309  		}
   310  	}
   311  
   312  	inode.incRef()
   313  	return vfsd, nil
   314  }
   315  
   316  // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
   317  func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
   318  	vfsd, inode, err := fs.walk(ctx, rp, true)
   319  	if err != nil {
   320  		return nil, err
   321  	}
   322  	inode.incRef()
   323  	return vfsd, nil
   324  }
   325  
   326  // OpenAt implements vfs.FilesystemImpl.OpenAt.
   327  func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
   328  	vfsd, inode, err := fs.walk(ctx, rp, false)
   329  	if err != nil {
   330  		return nil, err
   331  	}
   332  
   333  	// EROFS is returned if write access is needed.
   334  	if vfs.MayWriteFileWithOpenFlags(opts.Flags) || opts.Flags&(linux.O_CREAT|linux.O_EXCL|linux.O_TMPFILE) != 0 {
   335  		return nil, linuxerr.EROFS
   336  	}
   337  	return inode.open(rp, vfsd, &opts)
   338  }
   339  
   340  // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
   341  func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
   342  	_, inode, err := fs.walk(ctx, rp, false)
   343  	if err != nil {
   344  		return "", err
   345  	}
   346  	symlink, ok := inode.impl.(*symlink)
   347  	if !ok {
   348  		return "", linuxerr.EINVAL
   349  	}
   350  	return symlink.target, nil
   351  }
   352  
   353  // StatAt implements vfs.FilesystemImpl.StatAt.
   354  func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
   355  	_, inode, err := fs.walk(ctx, rp, false)
   356  	if err != nil {
   357  		return linux.Statx{}, err
   358  	}
   359  	var stat linux.Statx
   360  	inode.statTo(&stat)
   361  	return stat, nil
   362  }
   363  
   364  // StatFSAt implements vfs.FilesystemImpl.StatFSAt.
   365  func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
   366  	if _, _, err := fs.walk(ctx, rp, false); err != nil {
   367  		return linux.Statfs{}, err
   368  	}
   369  
   370  	var stat linux.Statfs
   371  	fs.statTo(&stat)
   372  	return stat, nil
   373  }
   374  
   375  // Release implements vfs.FilesystemImpl.Release.
   376  func (fs *filesystem) Release(ctx context.Context) {
   377  	fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
   378  }
   379  
// Sync implements vfs.FilesystemImpl.Sync.
//
// It does nothing and always returns nil.
func (fs *filesystem) Sync(ctx context.Context) error {
	// This is a readonly filesystem for now, so there is nothing to flush.
	return nil
}
   385  
   386  // The vfs.FilesystemImpl functions below return EROFS because their respective
   387  // man pages say that EROFS must be returned if the path resolves to a file on
   388  // this read-only filesystem.
   389  
   390  // LinkAt implements vfs.FilesystemImpl.LinkAt.
   391  func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
   392  	if rp.Done() {
   393  		return syserror.EEXIST
   394  	}
   395  
   396  	if _, _, err := fs.walk(ctx, rp, true); err != nil {
   397  		return err
   398  	}
   399  
   400  	return linuxerr.EROFS
   401  }
   402  
   403  // MkdirAt implements vfs.FilesystemImpl.MkdirAt.
   404  func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
   405  	if rp.Done() {
   406  		return syserror.EEXIST
   407  	}
   408  
   409  	if _, _, err := fs.walk(ctx, rp, true); err != nil {
   410  		return err
   411  	}
   412  
   413  	return linuxerr.EROFS
   414  }
   415  
   416  // MknodAt implements vfs.FilesystemImpl.MknodAt.
   417  func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
   418  	if rp.Done() {
   419  		return syserror.EEXIST
   420  	}
   421  
   422  	_, _, err := fs.walk(ctx, rp, true)
   423  	if err != nil {
   424  		return err
   425  	}
   426  
   427  	return linuxerr.EROFS
   428  }
   429  
   430  // RenameAt implements vfs.FilesystemImpl.RenameAt.
   431  func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
   432  	if rp.Done() {
   433  		return syserror.ENOENT
   434  	}
   435  
   436  	_, _, err := fs.walk(ctx, rp, false)
   437  	if err != nil {
   438  		return err
   439  	}
   440  
   441  	return linuxerr.EROFS
   442  }
   443  
   444  // RmdirAt implements vfs.FilesystemImpl.RmdirAt.
   445  func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
   446  	_, inode, err := fs.walk(ctx, rp, false)
   447  	if err != nil {
   448  		return err
   449  	}
   450  
   451  	if !inode.isDir() {
   452  		return syserror.ENOTDIR
   453  	}
   454  
   455  	return linuxerr.EROFS
   456  }
   457  
   458  // SetStatAt implements vfs.FilesystemImpl.SetStatAt.
   459  func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
   460  	_, _, err := fs.walk(ctx, rp, false)
   461  	if err != nil {
   462  		return err
   463  	}
   464  
   465  	return linuxerr.EROFS
   466  }
   467  
   468  // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
   469  func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
   470  	if rp.Done() {
   471  		return syserror.EEXIST
   472  	}
   473  
   474  	_, _, err := fs.walk(ctx, rp, true)
   475  	if err != nil {
   476  		return err
   477  	}
   478  
   479  	return linuxerr.EROFS
   480  }
   481  
   482  // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
   483  func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
   484  	_, inode, err := fs.walk(ctx, rp, false)
   485  	if err != nil {
   486  		return err
   487  	}
   488  
   489  	if inode.isDir() {
   490  		return syserror.EISDIR
   491  	}
   492  
   493  	return linuxerr.EROFS
   494  }
   495  
   496  // BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
   497  func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
   498  	_, inode, err := fs.walk(ctx, rp, false)
   499  	if err != nil {
   500  		return nil, err
   501  	}
   502  	if err := inode.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
   503  		return nil, err
   504  	}
   505  
   506  	// TODO(b/134676337): Support sockets.
   507  	return nil, linuxerr.ECONNREFUSED
   508  }
   509  
   510  // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
   511  func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
   512  	_, _, err := fs.walk(ctx, rp, false)
   513  	if err != nil {
   514  		return nil, err
   515  	}
   516  	return nil, linuxerr.ENOTSUP
   517  }
   518  
   519  // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
   520  func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
   521  	_, _, err := fs.walk(ctx, rp, false)
   522  	if err != nil {
   523  		return "", err
   524  	}
   525  	return "", linuxerr.ENOTSUP
   526  }
   527  
   528  // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
   529  func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
   530  	_, _, err := fs.walk(ctx, rp, false)
   531  	if err != nil {
   532  		return err
   533  	}
   534  	return linuxerr.ENOTSUP
   535  }
   536  
   537  // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
   538  func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
   539  	_, _, err := fs.walk(ctx, rp, false)
   540  	if err != nil {
   541  		return err
   542  	}
   543  	return linuxerr.ENOTSUP
   544  }
   545  
   546  // PrependPath implements vfs.FilesystemImpl.PrependPath.
   547  func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
   548  	fs.mu.RLock()
   549  	defer fs.mu.RUnlock()
   550  	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b)
   551  }
   552  
// MountOptions implements vfs.FilesystemImpl.MountOptions.
//
// It always returns the empty string.
func (fs *filesystem) MountOptions() string {
	return ""
}