github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/fsimpl/tmpfs/tmpfs.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package tmpfs provides an in-memory filesystem whose contents are
    16  // application-mutable, consistent with Linux's tmpfs.
    17  //
    18  // Lock order:
    19  //
    20  //	filesystem.mu
    21  //		inode.mu
    22  //		  regularFileFD.offMu
    23  //		    *** "memmap.Mappable locks" below this point
    24  //		    regularFile.mapsMu
    25  //		      *** "memmap.Mappable locks taken by Translate" below this point
    26  //		      regularFile.dataMu
    27  //		        fs.pagesUsedMu
    28  //		  directory.iterMu
    29  package tmpfs
    30  
    31  import (
    32  	"fmt"
    33  	"math"
    34  	"strconv"
    35  	"strings"
    36  
    37  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    38  	"github.com/MerlinKodo/gvisor/pkg/atomicbitops"
    39  	"github.com/MerlinKodo/gvisor/pkg/context"
    40  	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
    41  	"github.com/MerlinKodo/gvisor/pkg/fd"
    42  	"github.com/MerlinKodo/gvisor/pkg/hostarch"
    43  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth"
    44  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/time"
    45  	"github.com/MerlinKodo/gvisor/pkg/sentry/pgalloc"
    46  	"github.com/MerlinKodo/gvisor/pkg/sentry/usage"
    47  	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
    48  	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs/memxattr"
    49  )
    50  
    51  // Name is the default filesystem name.
    52  const Name = "tmpfs"
    53  
// FilesystemType implements vfs.FilesystemType.
//
// The type itself holds no state; everything needed to build a mount is
// passed to GetFilesystem.
//
// +stateify savable
type FilesystemType struct{}
    58  
// filesystem implements vfs.FilesystemImpl.
//
// +stateify savable
type filesystem struct {
	// vfsfs is the vfs.Filesystem for this mount. It is initialized by
	// GetFilesystem, which returns a pointer to it to the caller.
	vfsfs vfs.Filesystem

	// mf is used to allocate memory that stores regular file contents. mf is
	// immutable, except it may be changed during restore.
	mf *pgalloc.MemoryFile `state:"nosave"`

	// privateMF indicates whether mf is private to this tmpfs mount. If so,
	// tmpfs takes ownership of mf. privateMF is immutable.
	privateMF bool

	// mfp is used to provide mf, when privateMF == false. This is required to
	// re-provide mf on restore. mfp is immutable.
	mfp pgalloc.MemoryFileProvider

	// clock is a realtime clock used to set timestamps in file operations.
	clock time.Clock

	// devMinor is the filesystem's minor device number. devMinor is immutable.
	devMinor uint32

	// mopts contains the tmpfs-specific mount options passed to this
	// filesystem. Immutable.
	mopts string

	// usage is the memory accounting category under which pages backing
	// files in this filesystem are accounted.
	usage usage.MemoryKind

	// mu serializes changes to the Dentry tree.
	mu filesystemRWMutex `state:"nosave"`

	// nextInoMinusOne is the most recently assigned inode number; inode.init
	// takes nextInoMinusOne.Add(1) as the number for each new inode.
	nextInoMinusOne atomicbitops.Uint64 // accessed using atomic memory operations

	// root is the dentry at the root of this filesystem. It is assigned by
	// GetFilesystem once the root inode has been created.
	root *dentry

	// maxFilenameLen is the longest filename permitted. GetFilesystem sets it
	// to linux.NAME_MAX unless FilesystemOpts.MaxFilenameLen is positive.
	maxFilenameLen int

	// maxSizeInPages is the maximum permissible size for the tmpfs in terms of pages.
	// This field is immutable.
	maxSizeInPages uint64

	// pagesUsed is the number of pages used by this filesystem.
	pagesUsed atomicbitops.Uint64

	// allowXattrPrefix is a set of xattr namespace prefixes that this
	// tmpfs mount will allow. It is immutable.
	allowXattrPrefix map[string]struct{}
}
   111  
// Name implements vfs.FilesystemType.Name. It always returns the package-level
// Name constant.
func (FilesystemType) Name() string {
	return Name
}
   116  
// Release implements vfs.FilesystemType.Release. It is a no-op.
func (FilesystemType) Release(ctx context.Context) {}
   119  
// FilesystemOpts is used to pass configuration data to tmpfs. GetFilesystem
// reads it from vfs.GetFilesystemOptions.InternalData.
//
// +stateify savable
type FilesystemOpts struct {
	// RootFileType is the FileType of the filesystem root. Valid values
	// are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR (a zero value
	// selects the default).
	RootFileType uint16

	// RootSymlinkTarget is the target of the root symlink. Only valid if
	// RootFileType == S_IFLNK.
	RootSymlinkTarget string

	// FilesystemType allows setting a different FilesystemType for this
	// tmpfs filesystem. This allows tmpfs to "impersonate" other
	// filesystems, like ramdiskfs and cgroupfs.
	FilesystemType vfs.FilesystemType

	// Usage is the memory accounting category under which pages backing files in
	// the filesystem are accounted. If nil, usage.Tmpfs is used.
	Usage *usage.MemoryKind

	// MaxFilenameLen is the maximum filename length allowed by the tmpfs.
	// Values <= 0 are ignored and linux.NAME_MAX is used instead.
	MaxFilenameLen int

	// FilestoreFD is the FD for the memory file that will be used to store file
	// data. If this is nil, then MemoryFileProviderFromContext() is used.
	FilestoreFD *fd.FD

	// DisableDefaultSizeLimit disables setting a default size limit. In Linux,
	// SB_KERNMOUNT has this effect on tmpfs mounts; see mm/shmem.c:shmem_fill_super().
	DisableDefaultSizeLimit bool

	// AllowXattrPrefix is a set of xattr namespace prefixes that this
	// tmpfs mount will allow. These are added to the prefixes that
	// GetFilesystem allows by default.
	AllowXattrPrefix []string
}
   156  
// defaultSizeLimit is the size limit, in bytes, applied to tmpfs mounts that
// have no size= mount option. Zero means no default has been configured. It
// is immutable after initialization.
var defaultSizeLimit uint64
   159  
// SetDefaultSizeLimit configures the size limit to be used for tmpfs mounts
// that do not specify a size= mount option. This must be called only once,
// before any tmpfs filesystems are created.
//
// The value is stored without synchronization, which is why it must not be
// called concurrently with filesystem creation.
func SetDefaultSizeLimit(sizeLimit uint64) {
	defaultSizeLimit = sizeLimit
}
   166  
   167  func getDefaultSizeLimit(disable bool) uint64 {
   168  	if disable || defaultSizeLimit == 0 {
   169  		// The size limit is used to populate statfs(2) results. If Linux tmpfs is
   170  		// mounted with no size option, then statfs(2) returns f_blocks == f_bfree
   171  		// == f_bavail == 0. However, many applications treat this as having a size
   172  		// limit of 0. To work around this, return a very large but non-zero size
   173  		// limit, chosen to ensure that it does not overflow int64.
   174  		return math.MaxInt64
   175  	}
   176  	return defaultSizeLimit
   177  }
   178  
   179  // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
   180  func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, _ string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
   181  	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
   182  	if mfp == nil {
   183  		panic("MemoryFileProviderFromContext returned nil")
   184  	}
   185  	mf := mfp.MemoryFile()
   186  	privateMF := false
   187  
   188  	rootFileType := uint16(linux.S_IFDIR)
   189  	disableDefaultSizeLimit := false
   190  	newFSType := vfs.FilesystemType(&fstype)
   191  
   192  	// By default we support only "trusted" and "user" namespaces. Linux
   193  	// also supports "security" and (if configured) POSIX ACL namespaces
   194  	// "system.posix_acl_access" and "system.posix_acl_default".
   195  	allowXattrPrefix := map[string]struct{}{
   196  		linux.XATTR_TRUSTED_PREFIX: struct{}{},
   197  		linux.XATTR_USER_PREFIX:    struct{}{},
   198  		// The "security" namespace is allowed, but it always returns an error.
   199  		linux.XATTR_SECURITY_PREFIX: struct{}{},
   200  	}
   201  
   202  	tmpfsOpts, tmpfsOptsOk := opts.InternalData.(FilesystemOpts)
   203  	if tmpfsOptsOk {
   204  		if tmpfsOpts.RootFileType != 0 {
   205  			rootFileType = tmpfsOpts.RootFileType
   206  		}
   207  		if tmpfsOpts.FilesystemType != nil {
   208  			newFSType = tmpfsOpts.FilesystemType
   209  		}
   210  		disableDefaultSizeLimit = tmpfsOpts.DisableDefaultSizeLimit
   211  		if tmpfsOpts.FilestoreFD != nil {
   212  			mfOpts := pgalloc.MemoryFileOpts{
   213  				// tmpfsOpts.FilestoreFD may be backed by a file on disk (not memfd),
   214  				// which needs to be decommited on destroy to release disk space.
   215  				DecommitOnDestroy: true,
   216  				// sentry's seccomp filters don't allow the mmap(2) syscalls that
   217  				// pgalloc.IMAWorkAroundForMemFile() uses. Users of tmpfsOpts.FilestoreFD
   218  				// are expected to have performed the work around outside the sandbox.
   219  				DisableIMAWorkAround: true,
   220  				// Custom filestore FDs are usually backed by files on disk. Ideally we
   221  				// would confirm with fstatfs(2) but that is prohibited by seccomp.
   222  				DiskBackedFile: true,
   223  			}
   224  			var err error
   225  			mf, err = pgalloc.NewMemoryFile(tmpfsOpts.FilestoreFD.ReleaseToFile("overlay-filestore"), mfOpts)
   226  			if err != nil {
   227  				ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: pgalloc.NewMemoryFile failed: %v", err)
   228  				return nil, nil, err
   229  			}
   230  			privateMF = true
   231  		}
   232  
   233  		for _, xattr := range tmpfsOpts.AllowXattrPrefix {
   234  			allowXattrPrefix[xattr] = struct{}{}
   235  		}
   236  	}
   237  
   238  	mopts := vfs.GenericParseMountOptions(opts.Data)
   239  	rootMode := linux.FileMode(0777)
   240  	if rootFileType == linux.S_IFDIR {
   241  		rootMode = 01777
   242  	}
   243  	modeStr, ok := mopts["mode"]
   244  	if ok {
   245  		delete(mopts, "mode")
   246  		mode, err := strconv.ParseUint(modeStr, 8, 32)
   247  		if err != nil {
   248  			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid mode: %q", modeStr)
   249  			return nil, nil, linuxerr.EINVAL
   250  		}
   251  		rootMode = linux.FileMode(mode & 07777)
   252  	}
   253  	rootKUID := creds.EffectiveKUID
   254  	uidStr, ok := mopts["uid"]
   255  	if ok {
   256  		delete(mopts, "uid")
   257  		uid, err := strconv.ParseUint(uidStr, 10, 32)
   258  		if err != nil {
   259  			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid uid: %q", uidStr)
   260  			return nil, nil, linuxerr.EINVAL
   261  		}
   262  		kuid := creds.UserNamespace.MapToKUID(auth.UID(uid))
   263  		if !kuid.Ok() {
   264  			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped uid: %d", uid)
   265  			return nil, nil, linuxerr.EINVAL
   266  		}
   267  		rootKUID = kuid
   268  	}
   269  	rootKGID := creds.EffectiveKGID
   270  	gidStr, ok := mopts["gid"]
   271  	if ok {
   272  		delete(mopts, "gid")
   273  		gid, err := strconv.ParseUint(gidStr, 10, 32)
   274  		if err != nil {
   275  			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid gid: %q", gidStr)
   276  			return nil, nil, linuxerr.EINVAL
   277  		}
   278  		kgid := creds.UserNamespace.MapToKGID(auth.GID(gid))
   279  		if !kgid.Ok() {
   280  			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped gid: %d", gid)
   281  			return nil, nil, linuxerr.EINVAL
   282  		}
   283  		rootKGID = kgid
   284  	}
   285  	maxSizeInPages := getDefaultSizeLimit(disableDefaultSizeLimit) / hostarch.PageSize
   286  	maxSizeStr, ok := mopts["size"]
   287  	if ok {
   288  		delete(mopts, "size")
   289  		maxSizeInBytes, err := parseSize(maxSizeStr)
   290  		if err != nil {
   291  			ctx.Debugf("tmpfs.FilesystemType.GetFilesystem: parseSize() failed: %v", err)
   292  			return nil, nil, linuxerr.EINVAL
   293  		}
   294  		// Convert size in bytes to nearest Page Size bytes
   295  		// as Linux allocates memory in terms of Page size.
   296  		maxSizeInPages, ok = hostarch.ToPagesRoundUp(maxSizeInBytes)
   297  		if !ok {
   298  			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: Pages RoundUp Overflow error: %q", ok)
   299  			return nil, nil, linuxerr.EINVAL
   300  		}
   301  	}
   302  
   303  	if len(mopts) != 0 {
   304  		ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unknown options: %v", mopts)
   305  		return nil, nil, linuxerr.EINVAL
   306  	}
   307  
   308  	devMinor, err := vfsObj.GetAnonBlockDevMinor()
   309  	if err != nil {
   310  		return nil, nil, err
   311  	}
   312  	clock := time.RealtimeClockFromContext(ctx)
   313  	memUsage := usage.Tmpfs
   314  	if tmpfsOpts.Usage != nil {
   315  		memUsage = *tmpfsOpts.Usage
   316  	}
   317  	fs := filesystem{
   318  		mf:               mf,
   319  		privateMF:        privateMF,
   320  		mfp:              mfp,
   321  		clock:            clock,
   322  		devMinor:         devMinor,
   323  		mopts:            opts.Data,
   324  		usage:            memUsage,
   325  		maxFilenameLen:   linux.NAME_MAX,
   326  		maxSizeInPages:   maxSizeInPages,
   327  		allowXattrPrefix: allowXattrPrefix,
   328  	}
   329  	fs.vfsfs.Init(vfsObj, newFSType, &fs)
   330  	if tmpfsOptsOk && tmpfsOpts.MaxFilenameLen > 0 {
   331  		fs.maxFilenameLen = tmpfsOpts.MaxFilenameLen
   332  	}
   333  
   334  	var root *dentry
   335  	switch rootFileType {
   336  	case linux.S_IFREG:
   337  		root = fs.newDentry(fs.newRegularFile(rootKUID, rootKGID, rootMode, nil /* parentDir */))
   338  	case linux.S_IFLNK:
   339  		root = fs.newDentry(fs.newSymlink(rootKUID, rootKGID, rootMode, tmpfsOpts.RootSymlinkTarget, nil /* parentDir */))
   340  	case linux.S_IFDIR:
   341  		root = &fs.newDirectory(rootKUID, rootKGID, rootMode, nil /* parentDir */).dentry
   342  	default:
   343  		fs.vfsfs.DecRef(ctx)
   344  		return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType)
   345  	}
   346  	fs.root = root
   347  	return &fs.vfsfs, &root.vfsd, nil
   348  }
   349  
   350  // Release implements vfs.FilesystemImpl.Release.
   351  func (fs *filesystem) Release(ctx context.Context) {
   352  	fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
   353  	fs.mu.Lock()
   354  	if fs.root.inode.isDir() {
   355  		fs.root.releaseChildrenLocked(ctx)
   356  	}
   357  	fs.mu.Unlock()
   358  	if fs.privateMF {
   359  		fs.mf.Destroy()
   360  	}
   361  }
   362  
   363  // releaseChildrenLocked is called on the mount point by filesystem.Release() to
   364  // destroy all objects in the mount. It performs a depth-first walk of the
   365  // filesystem and "unlinks" everything by decrementing link counts
   366  // appropriately. There should be no open file descriptors when this is called,
   367  // so each inode should only have one outstanding reference that is removed once
   368  // its link count hits zero.
   369  //
   370  // Note that we do not update filesystem state precisely while tearing down (for
   371  // instance, the child maps are ignored)--we only care to remove all remaining
   372  // references so that every filesystem object gets destroyed. Also note that we
   373  // do not need to trigger DecRef on the mount point itself or any child mount;
   374  // these are taken care of by the destructor of the enclosing MountNamespace.
   375  //
   376  // Precondition: filesystem.mu is held.
   377  func (d *dentry) releaseChildrenLocked(ctx context.Context) {
   378  	dir := d.inode.impl.(*directory)
   379  	for _, child := range dir.childMap {
   380  		if child.inode.isDir() {
   381  			child.releaseChildrenLocked(ctx)
   382  			child.inode.decLinksLocked(ctx) // link for child/.
   383  			dir.inode.decLinksLocked(ctx)   // link for child/..
   384  		}
   385  		child.inode.decLinksLocked(ctx) // link for child
   386  	}
   387  }
   388  
   389  func (fs *filesystem) statFS() linux.Statfs {
   390  	st := linux.Statfs{
   391  		Type:         linux.TMPFS_MAGIC,
   392  		BlockSize:    hostarch.PageSize,
   393  		FragmentSize: hostarch.PageSize,
   394  		NameLength:   linux.NAME_MAX,
   395  	}
   396  
   397  	// If size is set for tmpfs return set values.
   398  	st.Blocks = fs.maxSizeInPages
   399  	pagesUsed := fs.pagesUsed.Load()
   400  	st.BlocksFree = fs.maxSizeInPages - pagesUsed
   401  	st.BlocksAvailable = fs.maxSizeInPages - pagesUsed
   402  	return st
   403  }
   404  
// dentry implements vfs.DentryImpl. A dentry names one inode within its parent
// directory; reference counting on a dentry is forwarded to that inode.
//
// +stateify savable
type dentry struct {
	// vfsd is the vfs.Dentry for this dentry. newDentry initializes it
	// with this dentry as its implementation.
	vfsd vfs.Dentry

	// parent is this dentry's parent directory. Each referenced dentry holds a
	// reference on parent.dentry. If this dentry is a filesystem root, parent
	// is nil. parent is protected by filesystem.mu.
	parent *dentry

	// name is the name of this dentry in its parent. If this dentry is a
	// filesystem root, name is the empty string. name is protected by
	// filesystem.mu.
	name string

	// dentryEntry (ugh) links dentries into their parent directory.childList.
	dentryEntry

	// inode is the inode represented by this dentry. Multiple Dentries may
	// share a single non-directory inode (with hard links). inode is
	// immutable.
	//
	// tmpfs doesn't count references on dentries; because the dentry tree is
	// the sole source of truth, it is by definition always consistent with the
	// state of the filesystem. However, it does count references on inodes,
	// because inode resources are released when all references are dropped.
	// dentry therefore forwards reference counting directly to inode.
	inode *inode
}
   435  
   436  func (fs *filesystem) newDentry(inode *inode) *dentry {
   437  	d := &dentry{
   438  		inode: inode,
   439  	}
   440  	d.vfsd.Init(d)
   441  	return d
   442  }
   443  
// IncRef implements vfs.DentryImpl.IncRef. The reference is taken on the
// underlying inode, since tmpfs does not count references on dentries.
func (d *dentry) IncRef() {
	d.inode.incRef()
}
   448  
// TryIncRef implements vfs.DentryImpl.TryIncRef. It forwards to the
// underlying inode and reports whether a reference was acquired.
func (d *dentry) TryIncRef() bool {
	return d.inode.tryIncRef()
}
   453  
// DecRef implements vfs.DentryImpl.DecRef. The reference is dropped on the
// underlying inode.
func (d *dentry) DecRef(ctx context.Context) {
	d.inode.decRef(ctx)
}
   458  
   459  // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
   460  func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
   461  	if d.inode.isDir() {
   462  		events |= linux.IN_ISDIR
   463  	}
   464  
   465  	// tmpfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates
   466  	// that d was deleted.
   467  	deleted := d.vfsd.IsDead()
   468  
   469  	d.inode.fs.mu.RLock()
   470  	// The ordering below is important, Linux always notifies the parent first.
   471  	if d.parent != nil {
   472  		d.parent.inode.watches.Notify(ctx, d.name, events, cookie, et, deleted)
   473  	}
   474  	d.inode.watches.Notify(ctx, "", events, cookie, et, deleted)
   475  	d.inode.fs.mu.RUnlock()
   476  }
   477  
// Watches implements vfs.DentryImpl.Watches. The watch set lives on the
// inode, so every dentry of the same inode returns the same set.
func (d *dentry) Watches() *vfs.Watches {
	return &d.inode.watches
}
   482  
// OnZeroWatches implements vfs.DentryImpl.OnZeroWatches. It is a no-op.
func (d *dentry) OnZeroWatches(context.Context) {}
   485  
// inode represents a filesystem object.
//
// +stateify savable
type inode struct {
	// fs is the owning filesystem. fs is immutable.
	fs *filesystem

	// A reference is held on all inodes as long as they are reachable in the
	// filesystem tree, i.e. nlink is nonzero. This reference is dropped when
	// nlink reaches 0.
	refs inodeRefs

	// xattrs implements extended attributes.
	//
	// TODO(b/148380782): Support xattrs other than user.*
	xattrs memxattr.SimpleExtendedAttributes

	// Inode metadata. Writing multiple fields atomically requires holding
	// mu, otherwise atomic operations can be used.
	mu    inodeMutex          `state:"nosave"`
	mode  atomicbitops.Uint32 // file type and mode
	nlink atomicbitops.Uint32 // protected by filesystem.mu instead of inode.mu
	uid   atomicbitops.Uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
	gid   atomicbitops.Uint32 // auth.KGID, but ...
	ino   uint64              // immutable

	// Timestamps, set to the creation time by init.
	// Linux's tmpfs has no concept of btime.
	atime atomicbitops.Int64 // nanoseconds
	ctime atomicbitops.Int64 // nanoseconds
	mtime atomicbitops.Int64 // nanoseconds

	locks vfs.FileLocks

	// Inotify watches for this inode.
	watches vfs.Watches

	// impl is the file-type-specific object this inode belongs to, for
	// example *regularFile, *directory, *symlink, *namedPipe, *socketFile
	// or *deviceFile (the types that statTo and direntType switch on).
	impl any // immutable
}
   524  
// maxLinks is the largest link count an inode can hold; incLinksLocked panics
// rather than increment past it.
const maxLinks = math.MaxUint32
   526  
   527  func (i *inode) init(impl any, fs *filesystem, kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, parentDir *directory) {
   528  	if mode.FileType() == 0 {
   529  		panic("file type is required in FileMode")
   530  	}
   531  
   532  	// Inherit the group and setgid bit as in fs/inode.c:inode_init_owner().
   533  	if parentDir != nil && parentDir.inode.mode.Load()&linux.S_ISGID == linux.S_ISGID {
   534  		kgid = auth.KGID(parentDir.inode.gid.Load())
   535  		if mode&linux.S_IFDIR == linux.S_IFDIR {
   536  			mode |= linux.S_ISGID
   537  		}
   538  	}
   539  
   540  	i.fs = fs
   541  	i.mode = atomicbitops.FromUint32(uint32(mode))
   542  	i.uid = atomicbitops.FromUint32(uint32(kuid))
   543  	i.gid = atomicbitops.FromUint32(uint32(kgid))
   544  	i.ino = fs.nextInoMinusOne.Add(1)
   545  	// Tmpfs creation sets atime, ctime, and mtime to current time.
   546  	now := fs.clock.Now().Nanoseconds()
   547  	i.atime = atomicbitops.FromInt64(now)
   548  	i.ctime = atomicbitops.FromInt64(now)
   549  	i.mtime = atomicbitops.FromInt64(now)
   550  	// i.nlink initialized by caller
   551  	i.impl = impl
   552  	i.refs.InitRefs()
   553  }
   554  
   555  // incLinksLocked increments i's link count.
   556  //
   557  // Preconditions:
   558  //   - filesystem.mu must be locked for writing.
   559  //   - i.mu must be lcoked.
   560  //   - i.nlink != 0.
   561  //   - i.nlink < maxLinks.
   562  func (i *inode) incLinksLocked() {
   563  	if i.nlink.RacyLoad() == 0 {
   564  		panic("tmpfs.inode.incLinksLocked() called with no existing links")
   565  	}
   566  	if i.nlink.RacyLoad() == maxLinks {
   567  		panic("tmpfs.inode.incLinksLocked() called with maximum link count")
   568  	}
   569  	i.nlink.Add(1)
   570  }
   571  
   572  // decLinksLocked decrements i's link count. If the link count reaches 0, we
   573  // remove a reference on i as well.
   574  //
   575  // Preconditions:
   576  //   - filesystem.mu must be locked for writing.
   577  //   - i.mu must be lcoked.
   578  //   - i.nlink != 0.
   579  func (i *inode) decLinksLocked(ctx context.Context) {
   580  	if i.nlink.RacyLoad() == 0 {
   581  		panic("tmpfs.inode.decLinksLocked() called with no existing links")
   582  	}
   583  	if i.nlink.Add(^uint32(0)) == 0 {
   584  		i.decRef(ctx)
   585  	}
   586  }
   587  
// incRef takes a reference on i.
func (i *inode) incRef() {
	i.refs.IncRef()
}
   591  
// tryIncRef attempts to take a reference on i and reports whether it
// succeeded, as decided by i.refs.TryIncRef.
func (i *inode) tryIncRef() bool {
	return i.refs.TryIncRef()
}
   595  
   596  func (i *inode) decRef(ctx context.Context) {
   597  	i.refs.DecRef(func() {
   598  		i.watches.HandleDeletion(ctx)
   599  		// Remove pages used if child being removed is a SymLink or Regular File.
   600  		switch impl := i.impl.(type) {
   601  		case *symlink:
   602  			if len(impl.target) >= shortSymlinkLen {
   603  				impl.inode.fs.unaccountPages(1)
   604  			}
   605  		case *regularFile:
   606  			// Release memory used by regFile to store data. Since regFile is
   607  			// no longer usable, we don't need to grab any locks or update any
   608  			// metadata.
   609  			pagesDec := impl.data.DropAll(i.fs.mf)
   610  			impl.inode.fs.unaccountPages(pagesDec)
   611  		}
   612  
   613  	})
   614  }
   615  
   616  func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
   617  	mode := linux.FileMode(i.mode.Load())
   618  	return vfs.GenericCheckPermissions(creds, ats, mode, auth.KUID(i.uid.Load()), auth.KGID(i.gid.Load()))
   619  }
   620  
   621  // Go won't inline this function, and returning linux.Statx (which is quite
   622  // big) means spending a lot of time in runtime.duffcopy(), so instead it's an
   623  // output parameter.
   624  //
   625  // Note that Linux does not guarantee to return consistent data (in the case of
   626  // a concurrent modification), so we do not require holding inode.mu.
   627  func (i *inode) statTo(stat *linux.Statx) {
   628  	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK |
   629  		linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE |
   630  		linux.STATX_BLOCKS | linux.STATX_ATIME | linux.STATX_CTIME |
   631  		linux.STATX_MTIME
   632  	stat.Blksize = hostarch.PageSize
   633  	stat.Nlink = i.nlink.Load()
   634  	stat.UID = i.uid.Load()
   635  	stat.GID = i.gid.Load()
   636  	stat.Mode = uint16(i.mode.Load())
   637  	stat.Ino = i.ino
   638  	stat.Atime = linux.NsecToStatxTimestamp(i.atime.Load())
   639  	stat.Ctime = linux.NsecToStatxTimestamp(i.ctime.Load())
   640  	stat.Mtime = linux.NsecToStatxTimestamp(i.mtime.Load())
   641  	stat.DevMajor = linux.UNNAMED_MAJOR
   642  	stat.DevMinor = i.fs.devMinor
   643  	switch impl := i.impl.(type) {
   644  	case *regularFile:
   645  		stat.Size = uint64(impl.size.Load())
   646  		// TODO(jamieliu): This should be impl.data.Span() / 512, but this is
   647  		// too expensive to compute here. Cache it in regularFile.
   648  		stat.Blocks = allocatedBlocksForSize(stat.Size)
   649  	case *directory:
   650  		stat.Size = direntSize * (2 + uint64(impl.numChildren.Load()))
   651  		// stat.Blocks is 0.
   652  	case *symlink:
   653  		stat.Size = uint64(len(impl.target))
   654  		// stat.Blocks is 0.
   655  	case *namedPipe, *socketFile:
   656  		// stat.Size and stat.Blocks are 0.
   657  	case *deviceFile:
   658  		// stat.Size and stat.Blocks are 0.
   659  		stat.RdevMajor = impl.major
   660  		stat.RdevMinor = impl.minor
   661  	default:
   662  		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
   663  	}
   664  }
   665  
// setStat applies the attribute changes requested in opts.Stat to i.
//
// Only the mode, uid, gid, atime, mtime, ctime and size bits may be set in
// the mask; any other bit yields EPERM. An empty mask is a no-op. Permission
// to make the change is checked with vfs.CheckSetStat before anything is
// modified. A size change is only valid on a regular file: it returns EISDIR
// on a directory and EINVAL on any other file type.
//
// All updates are made while holding i.mu.
func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions) error {
	stat := &opts.Stat
	if stat.Mask == 0 {
		return nil
	}
	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 {
		return linuxerr.EPERM
	}
	mode := linux.FileMode(i.mode.Load())
	if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(i.uid.Load()), auth.KGID(i.gid.Load())); err != nil {
		return err
	}

	i.mu.Lock()
	defer i.mu.Unlock()
	// needsMtimeBump / needsCtimeBump record whether mtime / ctime must be set
	// to "now" at the end because of some other change. They are cleared again
	// if the caller sets the corresponding timestamp explicitly.
	var (
		needsMtimeBump bool
		needsCtimeBump bool
	)
	// clearSID records that the SUID/SGID bits must be cleared, which is the
	// case after a truncation that changed the file or a change of owner or
	// group.
	clearSID := false
	mask := stat.Mask
	if mask&linux.STATX_SIZE != 0 {
		switch impl := i.impl.(type) {
		case *regularFile:
			updated, err := impl.truncateLocked(stat.Size)
			if err != nil {
				return err
			}
			if updated {
				clearSID = true
				needsMtimeBump = true
				needsCtimeBump = true
			}
		case *directory:
			return linuxerr.EISDIR
		default:
			return linuxerr.EINVAL
		}
	}
	if mask&linux.STATX_UID != 0 {
		i.uid.Store(stat.UID)
		needsCtimeBump = true
		clearSID = true
	}
	if mask&linux.STATX_GID != 0 {
		i.gid.Store(stat.GID)
		needsCtimeBump = true
		clearSID = true
	}
	if mask&linux.STATX_MODE != 0 {
		// Replace the permission bits while keeping the file type bits. The
		// compare-and-swap loop retries if mode changed in between.
		for {
			old := i.mode.Load()
			ft := old & linux.S_IFMT
			newMode := ft | uint32(stat.Mode & ^uint16(linux.S_IFMT))
			if clearSID {
				newMode = vfs.ClearSUIDAndSGID(newMode)
			}
			if swapped := i.mode.CompareAndSwap(old, newMode); swapped {
				// The SUID/SGID clearing (if any) was folded into this
				// update, so it need not be repeated below.
				clearSID = false
				break
			}
		}
		needsCtimeBump = true
	}
	now := i.fs.clock.Now().Nanoseconds()
	if mask&linux.STATX_ATIME != 0 {
		if stat.Atime.Nsec == linux.UTIME_NOW {
			i.atime.Store(now)
		} else {
			i.atime.Store(stat.Atime.ToNsecCapped())
		}
		needsCtimeBump = true
	}
	if mask&linux.STATX_MTIME != 0 {
		if stat.Mtime.Nsec == linux.UTIME_NOW {
			i.mtime.Store(now)
		} else {
			i.mtime.Store(stat.Mtime.ToNsecCapped())
		}
		needsCtimeBump = true
		// Ignore the mtime bump, since we just set it ourselves.
		needsMtimeBump = false
	}
	if mask&linux.STATX_CTIME != 0 {
		if stat.Ctime.Nsec == linux.UTIME_NOW {
			i.ctime.Store(now)
		} else {
			i.ctime.Store(stat.Ctime.ToNsecCapped())
		}
		// Ignore the ctime bump, since we just set it ourselves.
		needsCtimeBump = false
	}

	// We may have to clear the SUID/SGID bits, but didn't do so as part of
	// STATX_MODE.
	if clearSID {
		for {
			old := i.mode.Load()
			newMode := vfs.ClearSUIDAndSGID(old)
			if swapped := i.mode.CompareAndSwap(old, newMode); swapped {
				break
			}
		}
		// Note that this overrides an explicit STATX_CTIME handled above.
		needsCtimeBump = true
	}

	if needsMtimeBump {
		i.mtime.Store(now)
	}
	if needsCtimeBump {
		i.ctime.Store(now)
	}

	return nil
}
   781  
   782  // allocatedBlocksForSize returns the number of 512B blocks needed to
   783  // accommodate the given size in bytes, as appropriate for struct
   784  // stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
   785  // size is independent of the "preferred block size for I/O", struct
   786  // stat::st_blksize and struct statx::stx_blksize.)
   787  func allocatedBlocksForSize(size uint64) uint64 {
   788  	return (size + 511) / 512
   789  }
   790  
   791  func (i *inode) direntType() uint8 {
   792  	switch impl := i.impl.(type) {
   793  	case *regularFile:
   794  		return linux.DT_REG
   795  	case *directory:
   796  		return linux.DT_DIR
   797  	case *symlink:
   798  		return linux.DT_LNK
   799  	case *socketFile:
   800  		return linux.DT_SOCK
   801  	case *namedPipe:
   802  		return linux.DT_FIFO
   803  	case *deviceFile:
   804  		switch impl.kind {
   805  		case vfs.BlockDevice:
   806  			return linux.DT_BLK
   807  		case vfs.CharDevice:
   808  			return linux.DT_CHR
   809  		default:
   810  			panic(fmt.Sprintf("unknown vfs.DeviceKind: %v", impl.kind))
   811  		}
   812  	default:
   813  		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
   814  	}
   815  }
   816  
   817  func (i *inode) isDir() bool {
   818  	mode := linux.FileMode(i.mode.Load())
   819  	return mode.FileType() == linux.S_IFDIR
   820  }
   821  
   822  func (i *inode) touchAtime(mnt *vfs.Mount) {
   823  	if mnt.Flags.NoATime {
   824  		return
   825  	}
   826  	if err := mnt.CheckBeginWrite(); err != nil {
   827  		return
   828  	}
   829  	now := i.fs.clock.Now().Nanoseconds()
   830  	i.mu.Lock()
   831  	i.atime.Store(now)
   832  	i.mu.Unlock()
   833  	mnt.EndWrite()
   834  }
   835  
   836  // Preconditions: The caller has called vfs.Mount.CheckBeginWrite().
   837  func (i *inode) touchCtime() {
   838  	now := i.fs.clock.Now().Nanoseconds()
   839  	i.mu.Lock()
   840  	i.ctime.Store(now)
   841  	i.mu.Unlock()
   842  }
   843  
   844  // Preconditions: The caller has called vfs.Mount.CheckBeginWrite().
   845  func (i *inode) touchCMtime() {
   846  	now := i.fs.clock.Now().Nanoseconds()
   847  	i.mu.Lock()
   848  	i.mtime.Store(now)
   849  	i.ctime.Store(now)
   850  	i.mu.Unlock()
   851  }
   852  
   853  // Preconditions:
   854  //   - The caller has called vfs.Mount.CheckBeginWrite().
   855  //   - inode.mu must be locked.
   856  func (i *inode) touchCMtimeLocked() {
   857  	now := i.fs.clock.Now().Nanoseconds()
   858  	i.mtime.Store(now)
   859  	i.ctime.Store(now)
   860  }
   861  
   862  func (i *inode) checkXattrPrefix(name string) error {
   863  	for prefix := range i.fs.allowXattrPrefix {
   864  		if strings.HasPrefix(name, prefix) {
   865  			return nil
   866  		}
   867  	}
   868  	return linuxerr.EOPNOTSUPP
   869  }
   870  
   871  func (i *inode) listXattr(creds *auth.Credentials, size uint64) ([]string, error) {
   872  	return i.xattrs.ListXattr(creds, size)
   873  }
   874  
   875  func (i *inode) getXattr(creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
   876  	if err := i.checkXattrPrefix(opts.Name); err != nil {
   877  		return "", err
   878  	}
   879  	mode := linux.FileMode(i.mode.Load())
   880  	kuid := auth.KUID(i.uid.Load())
   881  	kgid := auth.KGID(i.gid.Load())
   882  	if err := vfs.GenericCheckPermissions(creds, vfs.MayRead, mode, kuid, kgid); err != nil {
   883  		return "", err
   884  	}
   885  	return i.xattrs.GetXattr(creds, mode, kuid, opts)
   886  }
   887  
   888  func (i *inode) setXattr(creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
   889  	if err := i.checkXattrPrefix(opts.Name); err != nil {
   890  		return err
   891  	}
   892  	mode := linux.FileMode(i.mode.Load())
   893  	kuid := auth.KUID(i.uid.Load())
   894  	kgid := auth.KGID(i.gid.Load())
   895  	if err := vfs.GenericCheckPermissions(creds, vfs.MayWrite, mode, kuid, kgid); err != nil {
   896  		return err
   897  	}
   898  	return i.xattrs.SetXattr(creds, mode, kuid, opts)
   899  }
   900  
   901  func (i *inode) removeXattr(creds *auth.Credentials, name string) error {
   902  	if err := i.checkXattrPrefix(name); err != nil {
   903  		return err
   904  	}
   905  	mode := linux.FileMode(i.mode.Load())
   906  	kuid := auth.KUID(i.uid.Load())
   907  	kgid := auth.KGID(i.gid.Load())
   908  	if err := vfs.GenericCheckPermissions(creds, vfs.MayWrite, mode, kuid, kgid); err != nil {
   909  		return err
   910  	}
   911  	return i.xattrs.RemoveXattr(creds, mode, kuid, name)
   912  }
   913  
// fileDescription is embedded by tmpfs implementations of
// vfs.FileDescriptionImpl. It carries the stat, statfs, xattr and sync
// methods defined on it in this file, plus accessors for the tmpfs
// filesystem, dentry and inode behind the file description.
//
// +stateify savable
type fileDescription struct {
	// vfsfd is the VFS file description; the accessors on fileDescription
	// reach the mount and dentry through it.
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.LockFD
}
   923  
   924  func (fd *fileDescription) filesystem() *filesystem {
   925  	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
   926  }
   927  
   928  func (fd *fileDescription) dentry() *dentry {
   929  	return fd.vfsfd.Dentry().Impl().(*dentry)
   930  }
   931  
   932  func (fd *fileDescription) inode() *inode {
   933  	return fd.dentry().inode
   934  }
   935  
   936  // Stat implements vfs.FileDescriptionImpl.Stat.
   937  func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
   938  	var stat linux.Statx
   939  	fd.inode().statTo(&stat)
   940  	return stat, nil
   941  }
   942  
   943  // SetStat implements vfs.FileDescriptionImpl.SetStat.
   944  func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
   945  	return fd.dentry().inode.setStat(ctx, auth.CredentialsFromContext(ctx), &opts)
   946  }
   947  
   948  // StatFS implements vfs.FileDescriptionImpl.StatFS.
   949  func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
   950  	return fd.filesystem().statFS(), nil
   951  }
   952  
   953  // ListXattr implements vfs.FileDescriptionImpl.ListXattr.
   954  func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
   955  	return fd.inode().listXattr(auth.CredentialsFromContext(ctx), size)
   956  }
   957  
   958  // GetXattr implements vfs.FileDescriptionImpl.GetXattr.
   959  func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
   960  	return fd.inode().getXattr(auth.CredentialsFromContext(ctx), &opts)
   961  }
   962  
   963  // SetXattr implements vfs.FileDescriptionImpl.SetXattr.
   964  func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
   965  	return fd.dentry().inode.setXattr(auth.CredentialsFromContext(ctx), &opts)
   966  }
   967  
   968  // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
   969  func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
   970  	return fd.dentry().inode.removeXattr(auth.CredentialsFromContext(ctx), name)
   971  }
   972  
   973  // Sync implements vfs.FileDescriptionImpl.Sync. It does nothing because all
   974  // filesystem state is in-memory.
   975  func (*fileDescription) Sync(context.Context) error {
   976  	return nil
   977  }
   978  
   979  // parseSize converts size in string to an integer bytes.
   980  // Supported suffixes in string are:K, M, G, T, P, E.
   981  func parseSize(s string) (uint64, error) {
   982  	if len(s) == 0 {
   983  		return 0, fmt.Errorf("size parameter empty")
   984  	}
   985  	suffix := s[len(s)-1]
   986  	count := 1
   987  	switch suffix {
   988  	case 'e', 'E':
   989  		count = count << 10
   990  		fallthrough
   991  	case 'p', 'P':
   992  		count = count << 10
   993  		fallthrough
   994  	case 't', 'T':
   995  		count = count << 10
   996  		fallthrough
   997  	case 'g', 'G':
   998  		count = count << 10
   999  		fallthrough
  1000  	case 'm', 'M':
  1001  		count = count << 10
  1002  		fallthrough
  1003  	case 'k', 'K':
  1004  		count = count << 10
  1005  		s = s[:len(s)-1]
  1006  	}
  1007  	byteTmp, err := strconv.ParseUint(s, 10, 64)
  1008  	if err != nil {
  1009  		return 0, linuxerr.EINVAL
  1010  	}
  1011  	// Check for overflow.
  1012  	bytes := byteTmp * uint64(count)
  1013  	if byteTmp != 0 && bytes/byteTmp != uint64(count) {
  1014  		return 0, fmt.Errorf("size overflow")
  1015  	}
  1016  	return bytes, err
  1017  }