github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fsimpl/tmpfs/tmpfs.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package tmpfs provides an in-memory filesystem whose contents are
    16  // application-mutable, consistent with Linux's tmpfs.
    17  //
    18  // Lock order:
    19  //
    20  // filesystem.mu
    21  //   inode.mu
    22  //     regularFileFD.offMu
    23  //       *** "memmap.Mappable locks" below this point
    24  //       regularFile.mapsMu
    25  //         *** "memmap.Mappable locks taken by Translate" below this point
    26  //         regularFile.dataMu
    27  //     directory.iterMu
    28  package tmpfs
    29  
    30  import (
    31  	"fmt"
    32  	"math"
    33  	"strconv"
    34  	"strings"
    35  	"sync/atomic"
    36  
    37  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    38  	"github.com/SagerNet/gvisor/pkg/context"
    39  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    40  	"github.com/SagerNet/gvisor/pkg/hostarch"
    41  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    42  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/time"
    43  	"github.com/SagerNet/gvisor/pkg/sentry/pgalloc"
    44  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    45  	"github.com/SagerNet/gvisor/pkg/sentry/vfs/memxattr"
    46  	"github.com/SagerNet/gvisor/pkg/sync"
    47  	"github.com/SagerNet/gvisor/pkg/syserror"
    48  )
    49  
// Name is the default filesystem name. It is the value returned by
// FilesystemType.Name.
const Name = "tmpfs"
    52  
// FilesystemType implements vfs.FilesystemType. It has no fields; all
// per-filesystem configuration is passed to GetFilesystem.
//
// +stateify savable
type FilesystemType struct{}
    57  
// filesystem implements vfs.FilesystemImpl.
//
// +stateify savable
type filesystem struct {
	// vfsfs is the vfs.Filesystem for which this filesystem is the
	// implementation. It is initialized by GetFilesystem.
	vfsfs vfs.Filesystem

	// mfp is used to allocate memory that stores regular file contents. mfp is
	// immutable.
	mfp pgalloc.MemoryFileProvider

	// clock is a realtime clock used to set timestamps in file operations.
	clock time.Clock

	// devMinor is the filesystem's minor device number. devMinor is immutable.
	devMinor uint32

	// mopts contains the tmpfs-specific mount options passed to this
	// filesystem. Immutable.
	mopts string

	// mu serializes changes to the Dentry tree.
	mu sync.RWMutex `state:"nosave"`

	// nextInoMinusOne is the most recently allocated inode number;
	// inode.init atomically increments it and uses the result as the new
	// inode's number.
	nextInoMinusOne uint64 // accessed using atomic memory operations

	// root is the dentry at the root of the filesystem. It is set by
	// GetFilesystem.
	root *dentry
}
    85  
// Name implements vfs.FilesystemType.Name. It returns the package-level Name
// constant.
func (FilesystemType) Name() string {
	return Name
}
    90  
// Release implements vfs.FilesystemType.Release. It is a no-op.
func (FilesystemType) Release(ctx context.Context) {}
    93  
// FilesystemOpts is used to pass configuration data to tmpfs. GetFilesystem
// reads it from vfs.GetFilesystemOptions.InternalData.
//
// +stateify savable
type FilesystemOpts struct {
	// RootFileType is the FileType of the filesystem root. Valid values
	// are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR.
	RootFileType uint16

	// RootSymlinkTarget is the target of the root symlink. Only valid if
	// RootFileType == S_IFLNK.
	RootSymlinkTarget string

	// FilesystemType allows setting a different FilesystemType for this
	// tmpfs filesystem. This allows tmpfs to "impersonate" other
	// filesystems, like ramdiskfs and cgroupfs. If nil, the receiver of
	// GetFilesystem is used.
	FilesystemType vfs.FilesystemType
}
   111  
   112  // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
   113  func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, _ string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
   114  	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
   115  	if mfp == nil {
   116  		panic("MemoryFileProviderFromContext returned nil")
   117  	}
   118  
   119  	rootFileType := uint16(linux.S_IFDIR)
   120  	newFSType := vfs.FilesystemType(&fstype)
   121  	tmpfsOpts, ok := opts.InternalData.(FilesystemOpts)
   122  	if ok {
   123  		if tmpfsOpts.RootFileType != 0 {
   124  			rootFileType = tmpfsOpts.RootFileType
   125  		}
   126  		if tmpfsOpts.FilesystemType != nil {
   127  			newFSType = tmpfsOpts.FilesystemType
   128  		}
   129  	}
   130  
   131  	mopts := vfs.GenericParseMountOptions(opts.Data)
   132  	rootMode := linux.FileMode(0777)
   133  	if rootFileType == linux.S_IFDIR {
   134  		rootMode = 01777
   135  	}
   136  	modeStr, ok := mopts["mode"]
   137  	if ok {
   138  		delete(mopts, "mode")
   139  		mode, err := strconv.ParseUint(modeStr, 8, 32)
   140  		if err != nil {
   141  			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid mode: %q", modeStr)
   142  			return nil, nil, linuxerr.EINVAL
   143  		}
   144  		rootMode = linux.FileMode(mode & 07777)
   145  	}
   146  	rootKUID := creds.EffectiveKUID
   147  	uidStr, ok := mopts["uid"]
   148  	if ok {
   149  		delete(mopts, "uid")
   150  		uid, err := strconv.ParseUint(uidStr, 10, 32)
   151  		if err != nil {
   152  			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid uid: %q", uidStr)
   153  			return nil, nil, linuxerr.EINVAL
   154  		}
   155  		kuid := creds.UserNamespace.MapToKUID(auth.UID(uid))
   156  		if !kuid.Ok() {
   157  			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped uid: %d", uid)
   158  			return nil, nil, linuxerr.EINVAL
   159  		}
   160  		rootKUID = kuid
   161  	}
   162  	rootKGID := creds.EffectiveKGID
   163  	gidStr, ok := mopts["gid"]
   164  	if ok {
   165  		delete(mopts, "gid")
   166  		gid, err := strconv.ParseUint(gidStr, 10, 32)
   167  		if err != nil {
   168  			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid gid: %q", gidStr)
   169  			return nil, nil, linuxerr.EINVAL
   170  		}
   171  		kgid := creds.UserNamespace.MapToKGID(auth.GID(gid))
   172  		if !kgid.Ok() {
   173  			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped gid: %d", gid)
   174  			return nil, nil, linuxerr.EINVAL
   175  		}
   176  		rootKGID = kgid
   177  	}
   178  	if len(mopts) != 0 {
   179  		ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unknown options: %v", mopts)
   180  		return nil, nil, linuxerr.EINVAL
   181  	}
   182  
   183  	devMinor, err := vfsObj.GetAnonBlockDevMinor()
   184  	if err != nil {
   185  		return nil, nil, err
   186  	}
   187  	clock := time.RealtimeClockFromContext(ctx)
   188  	fs := filesystem{
   189  		mfp:      mfp,
   190  		clock:    clock,
   191  		devMinor: devMinor,
   192  		mopts:    opts.Data,
   193  	}
   194  	fs.vfsfs.Init(vfsObj, newFSType, &fs)
   195  
   196  	var root *dentry
   197  	switch rootFileType {
   198  	case linux.S_IFREG:
   199  		root = fs.newDentry(fs.newRegularFile(rootKUID, rootKGID, rootMode, nil /* parentDir */))
   200  	case linux.S_IFLNK:
   201  		root = fs.newDentry(fs.newSymlink(rootKUID, rootKGID, rootMode, tmpfsOpts.RootSymlinkTarget, nil /* parentDir */))
   202  	case linux.S_IFDIR:
   203  		root = &fs.newDirectory(rootKUID, rootKGID, rootMode, nil /* parentDir */).dentry
   204  	default:
   205  		fs.vfsfs.DecRef(ctx)
   206  		return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType)
   207  	}
   208  	fs.root = root
   209  	return &fs.vfsfs, &root.vfsd, nil
   210  }
   211  
// NewFilesystem returns a new tmpfs filesystem. It calls
// FilesystemType.GetFilesystem with an empty source and zero-valued options,
// so the root is a directory owned by the effective KUID/KGID of creds.
func NewFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials) (*vfs.Filesystem, *vfs.Dentry, error) {
	return FilesystemType{}.GetFilesystem(ctx, vfsObj, creds, "", vfs.GetFilesystemOptions{})
}
   216  
   217  // Release implements vfs.FilesystemImpl.Release.
   218  func (fs *filesystem) Release(ctx context.Context) {
   219  	fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
   220  	fs.mu.Lock()
   221  	if fs.root.inode.isDir() {
   222  		fs.root.releaseChildrenLocked(ctx)
   223  	}
   224  	fs.mu.Unlock()
   225  }
   226  
   227  // releaseChildrenLocked is called on the mount point by filesystem.Release() to
   228  // destroy all objects in the mount. It performs a depth-first walk of the
   229  // filesystem and "unlinks" everything by decrementing link counts
   230  // appropriately. There should be no open file descriptors when this is called,
   231  // so each inode should only have one outstanding reference that is removed once
   232  // its link count hits zero.
   233  //
   234  // Note that we do not update filesystem state precisely while tearing down (for
   235  // instance, the child maps are ignored)--we only care to remove all remaining
   236  // references so that every filesystem object gets destroyed. Also note that we
   237  // do not need to trigger DecRef on the mount point itself or any child mount;
   238  // these are taken care of by the destructor of the enclosing MountNamespace.
   239  //
   240  // Precondition: filesystem.mu is held.
   241  func (d *dentry) releaseChildrenLocked(ctx context.Context) {
   242  	dir := d.inode.impl.(*directory)
   243  	for _, child := range dir.childMap {
   244  		if child.inode.isDir() {
   245  			child.releaseChildrenLocked(ctx)
   246  			child.inode.decLinksLocked(ctx) // link for child/.
   247  			dir.inode.decLinksLocked(ctx)   // link for child/..
   248  		}
   249  		child.inode.decLinksLocked(ctx) // link for child
   250  	}
   251  }
   252  
// globalStatfs is the statfs(2) result returned by fileDescription.StatFS.
// It is immutable.
var globalStatfs = linux.Statfs{
	Type:         linux.TMPFS_MAGIC,
	BlockSize:    hostarch.PageSize,
	FragmentSize: hostarch.PageSize,
	NameLength:   linux.NAME_MAX,

	// tmpfs currently does not support configurable size limits. In Linux,
	// such a tmpfs mount will return f_blocks == f_bfree == f_bavail == 0 from
	// statfs(2). However, many applications treat this as having a size limit
	// of 0. To work around this, claim to have a very large but non-zero size,
	// chosen to ensure that BlockSize * Blocks does not overflow int64 (which
	// applications may also handle incorrectly).
	// TODO(b/29637826): allow configuring a tmpfs size and enforce it.
	Blocks:          math.MaxInt64 / hostarch.PageSize,
	BlocksFree:      math.MaxInt64 / hostarch.PageSize,
	BlocksAvailable: math.MaxInt64 / hostarch.PageSize,
}
   271  
// dentry implements vfs.DentryImpl.
//
// +stateify savable
type dentry struct {
	// vfsd is the vfs.Dentry for which this dentry is the implementation. It
	// is initialized by filesystem.newDentry.
	vfsd vfs.Dentry

	// parent is this dentry's parent directory. Each referenced dentry holds a
	// reference on parent.dentry. If this dentry is a filesystem root, parent
	// is nil. parent is protected by filesystem.mu.
	parent *dentry

	// name is the name of this dentry in its parent. If this dentry is a
	// filesystem root, name is the empty string. name is protected by
	// filesystem.mu.
	name string

	// dentryEntry (ugh) links dentries into their parent directory.childList.
	dentryEntry

	// inode is the inode represented by this dentry. Multiple Dentries may
	// share a single non-directory inode (with hard links). inode is
	// immutable.
	//
	// tmpfs doesn't count references on dentries; because the dentry tree is
	// the sole source of truth, it is by definition always consistent with the
	// state of the filesystem. However, it does count references on inodes,
	// because inode resources are released when all references are dropped.
	// dentry therefore forwards reference counting directly to inode.
	inode *inode
}
   302  
   303  func (fs *filesystem) newDentry(inode *inode) *dentry {
   304  	d := &dentry{
   305  		inode: inode,
   306  	}
   307  	d.vfsd.Init(d)
   308  	return d
   309  }
   310  
// IncRef implements vfs.DentryImpl.IncRef. The reference is taken on the
// underlying inode, since tmpfs does not count references on dentries.
func (d *dentry) IncRef() {
	d.inode.incRef()
}
   315  
// TryIncRef implements vfs.DentryImpl.TryIncRef. The attempt is forwarded to
// the underlying inode.
func (d *dentry) TryIncRef() bool {
	return d.inode.tryIncRef()
}
   320  
// DecRef implements vfs.DentryImpl.DecRef. The reference is dropped from the
// underlying inode.
func (d *dentry) DecRef(ctx context.Context) {
	d.inode.decRef(ctx)
}
   325  
   326  // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
   327  func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
   328  	if d.inode.isDir() {
   329  		events |= linux.IN_ISDIR
   330  	}
   331  
   332  	// tmpfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates
   333  	// that d was deleted.
   334  	deleted := d.vfsd.IsDead()
   335  
   336  	d.inode.fs.mu.RLock()
   337  	// The ordering below is important, Linux always notifies the parent first.
   338  	if d.parent != nil {
   339  		d.parent.inode.watches.Notify(ctx, d.name, events, cookie, et, deleted)
   340  	}
   341  	d.inode.watches.Notify(ctx, "", events, cookie, et, deleted)
   342  	d.inode.fs.mu.RUnlock()
   343  }
   344  
// Watches implements vfs.DentryImpl.Watches. The returned watch set belongs
// to the inode, so it is shared by every dentry that refers to that inode.
func (d *dentry) Watches() *vfs.Watches {
	return &d.inode.watches
}
   349  
// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. It is a no-op.
func (d *dentry) OnZeroWatches(context.Context) {}
   352  
// inode represents a filesystem object.
//
// +stateify savable
type inode struct {
	// fs is the owning filesystem. fs is immutable.
	fs *filesystem

	// A reference is held on all inodes as long as they are reachable in the
	// filesystem tree, i.e. nlink is nonzero. This reference is dropped when
	// nlink reaches 0.
	refs inodeRefs

	// xattrs implements extended attributes.
	//
	// TODO(b/148380782): Support xattrs other than user.*
	xattrs memxattr.SimpleExtendedAttributes

	// Inode metadata. Writing multiple fields atomically requires holding
	// mu, otherwise atomic operations can be used.
	mu    sync.Mutex `state:"nosave"`
	mode  uint32     // file type and mode
	nlink uint32     // protected by filesystem.mu instead of inode.mu
	uid   uint32     // auth.KUID, but stored as raw uint32 for sync/atomic
	gid   uint32     // auth.KGID, but ...
	ino   uint64     // immutable

	// Linux's tmpfs has no concept of btime.
	atime int64 // nanoseconds
	ctime int64 // nanoseconds
	mtime int64 // nanoseconds

	// locks holds the file locks for this inode.
	locks vfs.FileLocks

	// Inotify watches for this inode.
	watches vfs.Watches

	// impl is the type-specific part of the inode. The type switches in this
	// file expect one of *regularFile, *directory, *symlink, *namedPipe,
	// *socketFile, or *deviceFile.
	impl interface{} // immutable
}
   391  
// maxLinks is the largest link count an inode may have; it is the maximum
// value representable by inode.nlink.
const maxLinks = math.MaxUint32
   393  
   394  func (i *inode) init(impl interface{}, fs *filesystem, kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, parentDir *directory) {
   395  	if mode.FileType() == 0 {
   396  		panic("file type is required in FileMode")
   397  	}
   398  
   399  	// Inherit the group and setgid bit as in fs/inode.c:inode_init_owner().
   400  	if parentDir != nil && parentDir.inode.mode&linux.S_ISGID == linux.S_ISGID {
   401  		kgid = auth.KGID(parentDir.inode.gid)
   402  		if mode&linux.S_IFDIR == linux.S_IFDIR {
   403  			mode |= linux.S_ISGID
   404  		}
   405  	}
   406  
   407  	i.fs = fs
   408  	i.mode = uint32(mode)
   409  	i.uid = uint32(kuid)
   410  	i.gid = uint32(kgid)
   411  	i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1)
   412  	// Tmpfs creation sets atime, ctime, and mtime to current time.
   413  	now := fs.clock.Now().Nanoseconds()
   414  	i.atime = now
   415  	i.ctime = now
   416  	i.mtime = now
   417  	// i.nlink initialized by caller
   418  	i.impl = impl
   419  	i.refs.InitRefs()
   420  }
   421  
   422  // incLinksLocked increments i's link count.
   423  //
   424  // Preconditions:
   425  // * filesystem.mu must be locked for writing.
   426  // * i.nlink != 0.
   427  // * i.nlink < maxLinks.
   428  func (i *inode) incLinksLocked() {
   429  	if i.nlink == 0 {
   430  		panic("tmpfs.inode.incLinksLocked() called with no existing links")
   431  	}
   432  	if i.nlink == maxLinks {
   433  		panic("tmpfs.inode.incLinksLocked() called with maximum link count")
   434  	}
   435  	atomic.AddUint32(&i.nlink, 1)
   436  }
   437  
   438  // decLinksLocked decrements i's link count. If the link count reaches 0, we
   439  // remove a reference on i as well.
   440  //
   441  // Preconditions:
   442  // * filesystem.mu must be locked for writing.
   443  // * i.nlink != 0.
   444  func (i *inode) decLinksLocked(ctx context.Context) {
   445  	if i.nlink == 0 {
   446  		panic("tmpfs.inode.decLinksLocked() called with no existing links")
   447  	}
   448  	if atomic.AddUint32(&i.nlink, ^uint32(0)) == 0 {
   449  		i.decRef(ctx)
   450  	}
   451  }
   452  
// incRef takes a reference on i.
func (i *inode) incRef() {
	i.refs.IncRef()
}
   456  
// tryIncRef attempts to take a reference on i and reports whether it
// succeeded, as decided by i.refs.TryIncRef.
func (i *inode) tryIncRef() bool {
	return i.refs.TryIncRef()
}
   460  
   461  func (i *inode) decRef(ctx context.Context) {
   462  	i.refs.DecRef(func() {
   463  		i.watches.HandleDeletion(ctx)
   464  		if regFile, ok := i.impl.(*regularFile); ok {
   465  			// Release memory used by regFile to store data. Since regFile is
   466  			// no longer usable, we don't need to grab any locks or update any
   467  			// metadata.
   468  			regFile.data.DropAll(regFile.memFile)
   469  		}
   470  	})
   471  }
   472  
   473  func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
   474  	mode := linux.FileMode(atomic.LoadUint32(&i.mode))
   475  	return vfs.GenericCheckPermissions(creds, ats, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid)))
   476  }
   477  
   478  // Go won't inline this function, and returning linux.Statx (which is quite
   479  // big) means spending a lot of time in runtime.duffcopy(), so instead it's an
   480  // output parameter.
   481  //
   482  // Note that Linux does not guarantee to return consistent data (in the case of
   483  // a concurrent modification), so we do not require holding inode.mu.
   484  func (i *inode) statTo(stat *linux.Statx) {
   485  	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK |
   486  		linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE |
   487  		linux.STATX_BLOCKS | linux.STATX_ATIME | linux.STATX_CTIME |
   488  		linux.STATX_MTIME
   489  	stat.Blksize = hostarch.PageSize
   490  	stat.Nlink = atomic.LoadUint32(&i.nlink)
   491  	stat.UID = atomic.LoadUint32(&i.uid)
   492  	stat.GID = atomic.LoadUint32(&i.gid)
   493  	stat.Mode = uint16(atomic.LoadUint32(&i.mode))
   494  	stat.Ino = i.ino
   495  	stat.Atime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&i.atime))
   496  	stat.Ctime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&i.ctime))
   497  	stat.Mtime = linux.NsecToStatxTimestamp(atomic.LoadInt64(&i.mtime))
   498  	stat.DevMajor = linux.UNNAMED_MAJOR
   499  	stat.DevMinor = i.fs.devMinor
   500  	switch impl := i.impl.(type) {
   501  	case *regularFile:
   502  		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
   503  		stat.Size = uint64(atomic.LoadUint64(&impl.size))
   504  		// TODO(jamieliu): This should be impl.data.Span() / 512, but this is
   505  		// too expensive to compute here. Cache it in regularFile.
   506  		stat.Blocks = allocatedBlocksForSize(stat.Size)
   507  	case *directory:
   508  		// "20" is mm/shmem.c:BOGO_DIRENT_SIZE.
   509  		stat.Size = 20 * (2 + uint64(atomic.LoadInt64(&impl.numChildren)))
   510  		// stat.Blocks is 0.
   511  	case *symlink:
   512  		stat.Size = uint64(len(impl.target))
   513  		// stat.Blocks is 0.
   514  	case *namedPipe, *socketFile:
   515  		// stat.Size and stat.Blocks are 0.
   516  	case *deviceFile:
   517  		// stat.Size and stat.Blocks are 0.
   518  		stat.RdevMajor = impl.major
   519  		stat.RdevMinor = impl.minor
   520  	default:
   521  		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
   522  	}
   523  }
   524  
// setStat applies the attribute changes requested in opts.Stat to i.
//
// An empty mask is a no-op. Only mode, uid, gid, atime, mtime, ctime and size
// may be changed; any other mask bit yields EPERM. Permission is checked with
// vfs.CheckSetStat against a snapshot of the mode and owner taken before
// i.mu is acquired.
//
// Size changes are only supported on regular files; directories yield EISDIR
// and all other file types EINVAL. A size change that actually modified the
// file, or any uid/gid change, causes the mode to be passed through
// vfs.ClearSUIDAndSGID. ctime is set to the current time for any change
// unless the caller supplied ctime explicitly; mtime is set to the current
// time for a size change unless the caller supplied mtime explicitly.
func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions) error {
	stat := &opts.Stat
	if stat.Mask == 0 {
		return nil
	}
	// Reject attempts to set anything other than the supported attributes.
	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 {
		return linuxerr.EPERM
	}
	mode := linux.FileMode(atomic.LoadUint32(&i.mode))
	if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil {
		return err
	}

	i.mu.Lock()
	defer i.mu.Unlock()
	var (
		needsMtimeBump bool
		needsCtimeBump bool
	)
	// clearSID records that the mode still has to be passed through
	// vfs.ClearSUIDAndSGID before returning.
	clearSID := false
	mask := stat.Mask
	if mask&linux.STATX_SIZE != 0 {
		switch impl := i.impl.(type) {
		case *regularFile:
			updated, err := impl.truncateLocked(stat.Size)
			if err != nil {
				return err
			}
			if updated {
				clearSID = true
				needsMtimeBump = true
				needsCtimeBump = true
			}
		case *directory:
			return syserror.EISDIR
		default:
			return linuxerr.EINVAL
		}
	}
	if mask&linux.STATX_UID != 0 {
		atomic.StoreUint32(&i.uid, stat.UID)
		needsCtimeBump = true
		clearSID = true
	}
	if mask&linux.STATX_GID != 0 {
		atomic.StoreUint32(&i.gid, stat.GID)
		needsCtimeBump = true
		clearSID = true
	}
	if mask&linux.STATX_MODE != 0 {
		// Replace the permission bits while keeping the file type. A
		// compare-and-swap loop is used so that the update is based on the
		// mode value actually stored at the time of the swap.
		for {
			old := atomic.LoadUint32(&i.mode)
			ft := old & linux.S_IFMT
			newMode := ft | uint32(stat.Mode & ^uint16(linux.S_IFMT))
			if clearSID {
				newMode = vfs.ClearSUIDAndSGID(newMode)
			}
			if swapped := atomic.CompareAndSwapUint32(&i.mode, old, newMode); swapped {
				// The clearing, if any, has been folded into this update.
				clearSID = false
				break
			}
		}
		needsCtimeBump = true
	}
	now := i.fs.clock.Now().Nanoseconds()
	if mask&linux.STATX_ATIME != 0 {
		if stat.Atime.Nsec == linux.UTIME_NOW {
			atomic.StoreInt64(&i.atime, now)
		} else {
			atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped())
		}
		needsCtimeBump = true
	}
	if mask&linux.STATX_MTIME != 0 {
		if stat.Mtime.Nsec == linux.UTIME_NOW {
			atomic.StoreInt64(&i.mtime, now)
		} else {
			atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped())
		}
		needsCtimeBump = true
		// Ignore the mtime bump, since we just set it ourselves.
		needsMtimeBump = false
	}
	if mask&linux.STATX_CTIME != 0 {
		if stat.Ctime.Nsec == linux.UTIME_NOW {
			atomic.StoreInt64(&i.ctime, now)
		} else {
			atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped())
		}
		// Ignore the ctime bump, since we just set it ourselves.
		needsCtimeBump = false
	}

	// We may have to clear the SUID/SGID bits, but didn't do so as part of
	// STATX_MODE.
	if clearSID {
		for {
			old := atomic.LoadUint32(&i.mode)
			newMode := vfs.ClearSUIDAndSGID(old)
			if swapped := atomic.CompareAndSwapUint32(&i.mode, old, newMode); swapped {
				break
			}
		}
		// Note that this re-arms the ctime bump even if ctime was set
		// explicitly above.
		needsCtimeBump = true
	}

	if needsMtimeBump {
		atomic.StoreInt64(&i.mtime, now)
	}
	if needsCtimeBump {
		atomic.StoreInt64(&i.ctime, now)
	}

	return nil
}
   640  
   641  // allocatedBlocksForSize returns the number of 512B blocks needed to
   642  // accommodate the given size in bytes, as appropriate for struct
   643  // stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
   644  // size is independent of the "preferred block size for I/O", struct
   645  // stat::st_blksize and struct statx::stx_blksize.)
   646  func allocatedBlocksForSize(size uint64) uint64 {
   647  	return (size + 511) / 512
   648  }
   649  
   650  func (i *inode) direntType() uint8 {
   651  	switch impl := i.impl.(type) {
   652  	case *regularFile:
   653  		return linux.DT_REG
   654  	case *directory:
   655  		return linux.DT_DIR
   656  	case *symlink:
   657  		return linux.DT_LNK
   658  	case *socketFile:
   659  		return linux.DT_SOCK
   660  	case *namedPipe:
   661  		return linux.DT_FIFO
   662  	case *deviceFile:
   663  		switch impl.kind {
   664  		case vfs.BlockDevice:
   665  			return linux.DT_BLK
   666  		case vfs.CharDevice:
   667  			return linux.DT_CHR
   668  		default:
   669  			panic(fmt.Sprintf("unknown vfs.DeviceKind: %v", impl.kind))
   670  		}
   671  	default:
   672  		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
   673  	}
   674  }
   675  
   676  func (i *inode) isDir() bool {
   677  	mode := linux.FileMode(atomic.LoadUint32(&i.mode))
   678  	return mode.FileType() == linux.S_IFDIR
   679  }
   680  
   681  func (i *inode) touchAtime(mnt *vfs.Mount) {
   682  	if mnt.Flags.NoATime {
   683  		return
   684  	}
   685  	if err := mnt.CheckBeginWrite(); err != nil {
   686  		return
   687  	}
   688  	now := i.fs.clock.Now().Nanoseconds()
   689  	i.mu.Lock()
   690  	atomic.StoreInt64(&i.atime, now)
   691  	i.mu.Unlock()
   692  	mnt.EndWrite()
   693  }
   694  
   695  // Preconditions: The caller has called vfs.Mount.CheckBeginWrite().
   696  func (i *inode) touchCtime() {
   697  	now := i.fs.clock.Now().Nanoseconds()
   698  	i.mu.Lock()
   699  	atomic.StoreInt64(&i.ctime, now)
   700  	i.mu.Unlock()
   701  }
   702  
   703  // Preconditions: The caller has called vfs.Mount.CheckBeginWrite().
   704  func (i *inode) touchCMtime() {
   705  	now := i.fs.clock.Now().Nanoseconds()
   706  	i.mu.Lock()
   707  	atomic.StoreInt64(&i.mtime, now)
   708  	atomic.StoreInt64(&i.ctime, now)
   709  	i.mu.Unlock()
   710  }
   711  
   712  // Preconditions:
   713  // * The caller has called vfs.Mount.CheckBeginWrite().
   714  // * inode.mu must be locked.
   715  func (i *inode) touchCMtimeLocked() {
   716  	now := i.fs.clock.Now().Nanoseconds()
   717  	atomic.StoreInt64(&i.mtime, now)
   718  	atomic.StoreInt64(&i.ctime, now)
   719  }
   720  
   721  func checkXattrName(name string) error {
   722  	// Linux's tmpfs supports "security" and "trusted" xattr namespaces, and
   723  	// (depending on build configuration) POSIX ACL xattr namespaces
   724  	// ("system.posix_acl_access" and "system.posix_acl_default"). We don't
   725  	// support POSIX ACLs or the "security" namespace (b/148380782).
   726  	if strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) {
   727  		return nil
   728  	}
   729  	// We support the "user" namespace because we have tests that depend on
   730  	// this feature.
   731  	if strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
   732  		return nil
   733  	}
   734  	return syserror.EOPNOTSUPP
   735  }
   736  
// listXattr returns the result of listing i's extended attributes for creds;
// size is passed through to i.xattrs.ListXattr unchanged.
func (i *inode) listXattr(creds *auth.Credentials, size uint64) ([]string, error) {
	return i.xattrs.ListXattr(creds, size)
}
   740  
   741  func (i *inode) getXattr(creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
   742  	if err := checkXattrName(opts.Name); err != nil {
   743  		return "", err
   744  	}
   745  	mode := linux.FileMode(atomic.LoadUint32(&i.mode))
   746  	kuid := auth.KUID(atomic.LoadUint32(&i.uid))
   747  	kgid := auth.KGID(atomic.LoadUint32(&i.gid))
   748  	if err := vfs.GenericCheckPermissions(creds, vfs.MayRead, mode, kuid, kgid); err != nil {
   749  		return "", err
   750  	}
   751  	return i.xattrs.GetXattr(creds, mode, kuid, opts)
   752  }
   753  
   754  func (i *inode) setXattr(creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
   755  	if err := checkXattrName(opts.Name); err != nil {
   756  		return err
   757  	}
   758  	mode := linux.FileMode(atomic.LoadUint32(&i.mode))
   759  	kuid := auth.KUID(atomic.LoadUint32(&i.uid))
   760  	kgid := auth.KGID(atomic.LoadUint32(&i.gid))
   761  	if err := vfs.GenericCheckPermissions(creds, vfs.MayWrite, mode, kuid, kgid); err != nil {
   762  		return err
   763  	}
   764  	return i.xattrs.SetXattr(creds, mode, kuid, opts)
   765  }
   766  
   767  func (i *inode) removeXattr(creds *auth.Credentials, name string) error {
   768  	if err := checkXattrName(name); err != nil {
   769  		return err
   770  	}
   771  	mode := linux.FileMode(atomic.LoadUint32(&i.mode))
   772  	kuid := auth.KUID(atomic.LoadUint32(&i.uid))
   773  	kgid := auth.KGID(atomic.LoadUint32(&i.gid))
   774  	if err := vfs.GenericCheckPermissions(creds, vfs.MayWrite, mode, kuid, kgid); err != nil {
   775  		return err
   776  	}
   777  	return i.xattrs.RemoveXattr(creds, mode, kuid, name)
   778  }
   779  
// fileDescription is embedded by tmpfs implementations of
// vfs.FileDescriptionImpl. It provides the methods defined on it in this
// file (stat, statfs, xattr and sync handling) on top of the embedded vfs
// defaults.
//
// +stateify savable
type fileDescription struct {
	// vfsfd is the vfs.FileDescription through which the mount and dentry of
	// this file description are looked up.
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.LockFD
}
   789  
// filesystem returns the tmpfs filesystem of the mount that fd was opened
// on. The type assertion panics if that mount's filesystem is not a tmpfs
// *filesystem.
func (fd *fileDescription) filesystem() *filesystem {
	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
}
   793  
// dentry returns the tmpfs dentry that fd refers to. The type assertion
// panics if fd's dentry is not a tmpfs *dentry.
func (fd *fileDescription) dentry() *dentry {
	return fd.vfsfd.Dentry().Impl().(*dentry)
}
   797  
// inode returns the inode of the dentry that fd refers to.
func (fd *fileDescription) inode() *inode {
	return fd.dentry().inode
}
   801  
// Stat implements vfs.FileDescriptionImpl.Stat. opts is ignored: the result
// is always whatever inode.statTo fills in, and the error is always nil.
func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
	var stat linux.Statx
	fd.inode().statTo(&stat)
	return stat, nil
}
   808  
   809  // SetStat implements vfs.FileDescriptionImpl.SetStat.
   810  func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
   811  	creds := auth.CredentialsFromContext(ctx)
   812  	d := fd.dentry()
   813  	if err := d.inode.setStat(ctx, creds, &opts); err != nil {
   814  		return err
   815  	}
   816  
   817  	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
   818  		d.InotifyWithParent(ctx, ev, 0, vfs.InodeEvent)
   819  	}
   820  	return nil
   821  }
   822  
// StatFS implements vfs.FileDescriptionImpl.StatFS. It returns a copy of
// globalStatfs regardless of which file fd refers to.
func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
	return globalStatfs, nil
}
   827  
// ListXattr implements vfs.FileDescriptionImpl.ListXattr. It forwards to
// inode.listXattr using the credentials found in ctx.
func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
	return fd.inode().listXattr(auth.CredentialsFromContext(ctx), size)
}
   832  
// GetXattr implements vfs.FileDescriptionImpl.GetXattr. It forwards to
// inode.getXattr using the credentials found in ctx.
func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
	return fd.inode().getXattr(auth.CredentialsFromContext(ctx), &opts)
}
   837  
   838  // SetXattr implements vfs.FileDescriptionImpl.SetXattr.
   839  func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
   840  	d := fd.dentry()
   841  	if err := d.inode.setXattr(auth.CredentialsFromContext(ctx), &opts); err != nil {
   842  		return err
   843  	}
   844  
   845  	// Generate inotify events.
   846  	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
   847  	return nil
   848  }
   849  
   850  // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
   851  func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
   852  	d := fd.dentry()
   853  	if err := d.inode.removeXattr(auth.CredentialsFromContext(ctx), name); err != nil {
   854  		return err
   855  	}
   856  
   857  	// Generate inotify events.
   858  	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
   859  	return nil
   860  }
   861  
// Sync implements vfs.FileDescriptionImpl.Sync. It does nothing because all
// filesystem state is in-memory, and always returns nil.
func (*fileDescription) Sync(context.Context) error {
	return nil
}