github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/tmpfs/tmpfs.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package tmpfs provides an in-memory filesystem whose contents are
    16  // application-mutable, consistent with Linux's tmpfs.
    17  //
    18  // Lock order:
    19  //
    20  //	filesystem.mu
    21  //		inode.mu
    22  //		  regularFileFD.offMu
    23  //		    *** "memmap.Mappable locks" below this point
    24  //		    regularFile.mapsMu
    25  //		      *** "memmap.Mappable locks taken by Translate" below this point
    26  //		      regularFile.dataMu
    27  //		        fs.pagesUsedMu
    28  //		  directory.iterMu
    29  package tmpfs
    30  
    31  import (
    32  	"fmt"
    33  	"math"
    34  	"strconv"
    35  	"strings"
    36  	"sync/atomic"
    37  
    38  	"github.com/metacubex/gvisor/pkg/abi/linux"
    39  	"github.com/metacubex/gvisor/pkg/atomicbitops"
    40  	"github.com/metacubex/gvisor/pkg/context"
    41  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    42  	"github.com/metacubex/gvisor/pkg/hostarch"
    43  	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
    44  	"github.com/metacubex/gvisor/pkg/sentry/kernel/time"
    45  	"github.com/metacubex/gvisor/pkg/sentry/pgalloc"
    46  	"github.com/metacubex/gvisor/pkg/sentry/usage"
    47  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    48  	"github.com/metacubex/gvisor/pkg/sentry/vfs/memxattr"
    49  )
    50  
// Name is the default filesystem name. It is the value returned by
// FilesystemType.Name.
const Name = "tmpfs"
    53  
// FilesystemType implements vfs.FilesystemType.
//
// The type has no fields; per-mount configuration reaches GetFilesystem
// through its vfs.GetFilesystemOptions argument.
//
// +stateify savable
type FilesystemType struct{}
    58  
// filesystem implements vfs.FilesystemImpl.
//
// Instances are created by FilesystemType.GetFilesystem.
//
// +stateify savable
type filesystem struct {
	vfsfs vfs.Filesystem

	// mf is used to allocate memory that stores regular file contents. mf is
	// immutable, except it is changed during restore.
	mf *pgalloc.MemoryFile `state:".(string)"`

	// clock is a realtime clock used to set timestamps in file operations.
	clock time.Clock

	// devMinor is the filesystem's minor device number. devMinor is immutable.
	devMinor uint32

	// mopts contains the tmpfs-specific mount options passed to this
	// filesystem. Immutable.
	mopts string

	// usage is the memory accounting category under which pages backing
	// files in this filesystem are accounted.
	usage usage.MemoryKind

	// mu serializes changes to the Dentry tree.
	mu filesystemRWMutex `state:"nosave"`

	// nextInoMinusOne is the source of inode numbers: inode.init assigns each
	// new inode the result of nextInoMinusOne.Add(1).
	nextInoMinusOne atomicbitops.Uint64 // accessed using atomic memory operations

	// root is the root dentry of the filesystem. It is set by GetFilesystem
	// once the root inode has been created.
	root *dentry

	// maxFilenameLen is the maximum filename length for this filesystem.
	// GetFilesystem sets it to linux.NAME_MAX unless a positive
	// FilesystemOpts.MaxFilenameLen is supplied.
	maxFilenameLen int

	// maxSizeInPages is the maximum permissible size for the tmpfs in terms of pages.
	// This field is immutable.
	maxSizeInPages uint64

	// pagesUsed is the number of pages used by this filesystem.
	pagesUsed atomicbitops.Uint64

	// allowXattrPrefix is a set of xattr namespace prefixes that this
	// tmpfs mount will allow. It is immutable.
	allowXattrPrefix map[string]struct{}
}
   103  
// Name implements vfs.FilesystemType.Name. It always returns the package-level
// Name constant.
func (FilesystemType) Name() string {
	return Name
}
   108  
// Release implements vfs.FilesystemType.Release. It is a no-op.
func (FilesystemType) Release(ctx context.Context) {}
   111  
// FilesystemOpts is used to pass configuration data to tmpfs.
//
// GetFilesystem looks for a FilesystemOpts value in
// vfs.GetFilesystemOptions.InternalData; if none is present, the defaults
// described on each field apply.
//
// +stateify savable
type FilesystemOpts struct {
	// RootFileType is the FileType of the filesystem root. Valid values
	// are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR.
	RootFileType uint16

	// RootSymlinkTarget is the target of the root symlink. Only valid if
	// RootFileType == S_IFLNK.
	RootSymlinkTarget string

	// FilesystemType allows setting a different FilesystemType for this
	// tmpfs filesystem. This allows tmpfs to "impersonate" other
	// filesystems, like ramdiskfs and cgroupfs.
	FilesystemType vfs.FilesystemType

	// Usage is the memory accounting category under which pages backing files in
	// the filesystem are accounted. If nil, usage.Tmpfs is used.
	Usage *usage.MemoryKind

	// MaxFilenameLen is the maximum filename length allowed by the tmpfs.
	// Values <= 0 leave the default of linux.NAME_MAX in place.
	MaxFilenameLen int

	// MemoryFile is the memory file that will be used to store file data. If
	// this is nil, then MemoryFileFromContext() is used.
	MemoryFile *pgalloc.MemoryFile

	// DisableDefaultSizeLimit disables setting a default size limit. In Linux,
	// SB_KERNMOUNT has this effect on tmpfs mounts; see mm/shmem.c:shmem_fill_super().
	DisableDefaultSizeLimit bool

	// AllowXattrPrefix is a set of xattr namespace prefixes that this
	// tmpfs mount will allow. These are added to the prefixes that
	// GetFilesystem allows by default.
	AllowXattrPrefix []string
}
   148  
// defaultSizeLimit is the default size limit, in bytes, applied to tmpfs
// mounts that do not pass a size= mount option. It is immutable after
// initialization. A value of 0 means that no default has been configured; see
// getDefaultSizeLimit.
var defaultSizeLimit uint64
   151  
// SetDefaultSizeLimit configures the size limit to be used for tmpfs mounts
// that do not specify a size= mount option. This must be called only once,
// before any tmpfs filesystems are created.
//
// sizeLimit is expressed in bytes (GetFilesystem divides it by the page size).
// The value is stored with a plain assignment, without synchronization.
func SetDefaultSizeLimit(sizeLimit uint64) {
	defaultSizeLimit = sizeLimit
}
   158  
   159  func getDefaultSizeLimit(disable bool) uint64 {
   160  	if disable || defaultSizeLimit == 0 {
   161  		// The size limit is used to populate statfs(2) results. If Linux tmpfs is
   162  		// mounted with no size option, then statfs(2) returns f_blocks == f_bfree
   163  		// == f_bavail == 0. However, many applications treat this as having a size
   164  		// limit of 0. To work around this, return a very large but non-zero size
   165  		// limit, chosen to ensure that it does not overflow int64.
   166  		return math.MaxInt64
   167  	}
   168  	return defaultSizeLimit
   169  }
   170  
   171  // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
   172  func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, _ string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
   173  	mf := pgalloc.MemoryFileFromContext(ctx)
   174  	if mf == nil {
   175  		panic("CtxMemoryFile returned nil")
   176  	}
   177  	rootFileType := uint16(linux.S_IFDIR)
   178  	disableDefaultSizeLimit := false
   179  	newFSType := vfs.FilesystemType(&fstype)
   180  
   181  	// By default we support only "trusted" and "user" namespaces. Linux
   182  	// also supports "security" and (if configured) POSIX ACL namespaces
   183  	// "system.posix_acl_access" and "system.posix_acl_default".
   184  	allowXattrPrefix := map[string]struct{}{
   185  		linux.XATTR_TRUSTED_PREFIX: struct{}{},
   186  		linux.XATTR_USER_PREFIX:    struct{}{},
   187  		// The "security" namespace is allowed, but it always returns an error.
   188  		linux.XATTR_SECURITY_PREFIX: struct{}{},
   189  	}
   190  
   191  	tmpfsOpts, tmpfsOptsOk := opts.InternalData.(FilesystemOpts)
   192  	if tmpfsOptsOk {
   193  		if tmpfsOpts.RootFileType != 0 {
   194  			rootFileType = tmpfsOpts.RootFileType
   195  		}
   196  		if tmpfsOpts.FilesystemType != nil {
   197  			newFSType = tmpfsOpts.FilesystemType
   198  		}
   199  		disableDefaultSizeLimit = tmpfsOpts.DisableDefaultSizeLimit
   200  		if tmpfsOpts.MemoryFile != nil {
   201  			mf = tmpfsOpts.MemoryFile
   202  		}
   203  		for _, xattr := range tmpfsOpts.AllowXattrPrefix {
   204  			allowXattrPrefix[xattr] = struct{}{}
   205  		}
   206  	}
   207  
   208  	mopts := vfs.GenericParseMountOptions(opts.Data)
   209  	rootMode := linux.FileMode(0777)
   210  	if rootFileType == linux.S_IFDIR {
   211  		rootMode = 01777
   212  	}
   213  	modeStr, ok := mopts["mode"]
   214  	if ok {
   215  		delete(mopts, "mode")
   216  		mode, err := strconv.ParseUint(modeStr, 8, 32)
   217  		if err != nil {
   218  			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid mode: %q", modeStr)
   219  			return nil, nil, linuxerr.EINVAL
   220  		}
   221  		rootMode = linux.FileMode(mode & 07777)
   222  	}
   223  	rootKUID := creds.EffectiveKUID
   224  	uidStr, ok := mopts["uid"]
   225  	if ok {
   226  		delete(mopts, "uid")
   227  		uid, err := strconv.ParseUint(uidStr, 10, 32)
   228  		if err != nil {
   229  			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid uid: %q", uidStr)
   230  			return nil, nil, linuxerr.EINVAL
   231  		}
   232  		kuid := creds.UserNamespace.MapToKUID(auth.UID(uid))
   233  		if !kuid.Ok() {
   234  			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped uid: %d", uid)
   235  			return nil, nil, linuxerr.EINVAL
   236  		}
   237  		rootKUID = kuid
   238  	}
   239  	rootKGID := creds.EffectiveKGID
   240  	gidStr, ok := mopts["gid"]
   241  	if ok {
   242  		delete(mopts, "gid")
   243  		gid, err := strconv.ParseUint(gidStr, 10, 32)
   244  		if err != nil {
   245  			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid gid: %q", gidStr)
   246  			return nil, nil, linuxerr.EINVAL
   247  		}
   248  		kgid := creds.UserNamespace.MapToKGID(auth.GID(gid))
   249  		if !kgid.Ok() {
   250  			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped gid: %d", gid)
   251  			return nil, nil, linuxerr.EINVAL
   252  		}
   253  		rootKGID = kgid
   254  	}
   255  	maxSizeInPages := getDefaultSizeLimit(disableDefaultSizeLimit) / hostarch.PageSize
   256  	maxSizeStr, ok := mopts["size"]
   257  	if ok {
   258  		delete(mopts, "size")
   259  		maxSizeInBytes, err := parseSize(maxSizeStr)
   260  		if err != nil {
   261  			ctx.Debugf("tmpfs.FilesystemType.GetFilesystem: parseSize() failed: %v", err)
   262  			return nil, nil, linuxerr.EINVAL
   263  		}
   264  		// Convert size in bytes to nearest Page Size bytes
   265  		// as Linux allocates memory in terms of Page size.
   266  		maxSizeInPages, ok = hostarch.ToPagesRoundUp(maxSizeInBytes)
   267  		if !ok {
   268  			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: Pages RoundUp Overflow error: %q", ok)
   269  			return nil, nil, linuxerr.EINVAL
   270  		}
   271  	}
   272  
   273  	if len(mopts) != 0 {
   274  		ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unknown options: %v", mopts)
   275  		return nil, nil, linuxerr.EINVAL
   276  	}
   277  
   278  	devMinor, err := vfsObj.GetAnonBlockDevMinor()
   279  	if err != nil {
   280  		return nil, nil, err
   281  	}
   282  	clock := time.RealtimeClockFromContext(ctx)
   283  	memUsage := usage.Tmpfs
   284  	if tmpfsOpts.Usage != nil {
   285  		memUsage = *tmpfsOpts.Usage
   286  	}
   287  	fs := filesystem{
   288  		mf:               mf,
   289  		clock:            clock,
   290  		devMinor:         devMinor,
   291  		mopts:            opts.Data,
   292  		usage:            memUsage,
   293  		maxFilenameLen:   linux.NAME_MAX,
   294  		maxSizeInPages:   maxSizeInPages,
   295  		allowXattrPrefix: allowXattrPrefix,
   296  	}
   297  	fs.vfsfs.Init(vfsObj, newFSType, &fs)
   298  	if tmpfsOptsOk && tmpfsOpts.MaxFilenameLen > 0 {
   299  		fs.maxFilenameLen = tmpfsOpts.MaxFilenameLen
   300  	}
   301  
   302  	var root *dentry
   303  	switch rootFileType {
   304  	case linux.S_IFREG:
   305  		root = fs.newDentry(fs.newRegularFile(rootKUID, rootKGID, rootMode, nil /* parentDir */))
   306  	case linux.S_IFLNK:
   307  		root = fs.newDentry(fs.newSymlink(rootKUID, rootKGID, rootMode, tmpfsOpts.RootSymlinkTarget, nil /* parentDir */))
   308  	case linux.S_IFDIR:
   309  		root = &fs.newDirectory(rootKUID, rootKGID, rootMode, nil /* parentDir */).dentry
   310  	default:
   311  		fs.vfsfs.DecRef(ctx)
   312  		return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType)
   313  	}
   314  	fs.root = root
   315  	return &fs.vfsfs, &root.vfsd, nil
   316  }
   317  
   318  // Release implements vfs.FilesystemImpl.Release.
   319  func (fs *filesystem) Release(ctx context.Context) {
   320  	fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
   321  	fs.mu.Lock()
   322  	if fs.root.inode.isDir() {
   323  		fs.root.releaseChildrenLocked(ctx)
   324  	}
   325  	fs.mu.Unlock()
   326  	if fs.mf.RestoreID() != "" {
   327  		// If RestoreID is set, then this is a private MemoryFile which needs to be
   328  		// destroyed since this tmpfs is the only user.
   329  		fs.mf.Destroy()
   330  	}
   331  }
   332  
   333  // releaseChildrenLocked is called on the mount point by filesystem.Release() to
   334  // destroy all objects in the mount. It performs a depth-first walk of the
   335  // filesystem and "unlinks" everything by decrementing link counts
   336  // appropriately. There should be no open file descriptors when this is called,
   337  // so each inode should only have one outstanding reference that is removed once
   338  // its link count hits zero.
   339  //
   340  // Note that we do not update filesystem state precisely while tearing down (for
   341  // instance, the child maps are ignored)--we only care to remove all remaining
   342  // references so that every filesystem object gets destroyed. Also note that we
   343  // do not need to trigger DecRef on the mount point itself or any child mount;
   344  // these are taken care of by the destructor of the enclosing MountNamespace.
   345  //
   346  // Precondition: filesystem.mu is held.
   347  func (d *dentry) releaseChildrenLocked(ctx context.Context) {
   348  	dir := d.inode.impl.(*directory)
   349  	for _, child := range dir.childMap {
   350  		if child.inode.isDir() {
   351  			child.releaseChildrenLocked(ctx)
   352  			child.inode.decLinksLocked(ctx) // link for child/.
   353  			dir.inode.decLinksLocked(ctx)   // link for child/..
   354  		}
   355  		child.inode.decLinksLocked(ctx) // link for child
   356  	}
   357  }
   358  
   359  func (fs *filesystem) statFS() linux.Statfs {
   360  	st := linux.Statfs{
   361  		Type:         linux.TMPFS_MAGIC,
   362  		BlockSize:    hostarch.PageSize,
   363  		FragmentSize: hostarch.PageSize,
   364  		NameLength:   linux.NAME_MAX,
   365  	}
   366  
   367  	// If size is set for tmpfs return set values.
   368  	st.Blocks = fs.maxSizeInPages
   369  	pagesUsed := fs.pagesUsed.Load()
   370  	st.BlocksFree = fs.maxSizeInPages - pagesUsed
   371  	st.BlocksAvailable = fs.maxSizeInPages - pagesUsed
   372  	return st
   373  }
   374  
// dentry implements vfs.DentryImpl.
//
// A dentry names an inode within the filesystem tree. Reference counting
// calls on a dentry are forwarded to its inode.
//
// +stateify savable
type dentry struct {
	vfsd vfs.Dentry

	// parent is this dentry's parent directory. Each referenced dentry holds a
	// reference on parent.dentry. If this dentry is a filesystem root, parent
	// is nil. parent is protected by filesystem.mu.
	parent atomic.Pointer[dentry] `state:".(*dentry)"`

	// name is the name of this dentry in its parent. If this dentry is a
	// filesystem root, name is the empty string. name is protected by
	// filesystem.mu.
	name string

	// dentryEntry (ugh) links dentries into their parent directory.childList.
	dentryEntry

	// inode is the inode represented by this dentry. Multiple Dentries may
	// share a single non-directory inode (with hard links). inode is
	// immutable.
	//
	// tmpfs doesn't count references on dentries; because the dentry tree is
	// the sole source of truth, it is by definition always consistent with the
	// state of the filesystem. However, it does count references on inodes,
	// because inode resources are released when all references are dropped.
	// dentry therefore forwards reference counting directly to inode.
	inode *inode
}
   405  
   406  func (fs *filesystem) newDentry(inode *inode) *dentry {
   407  	d := &dentry{
   408  		inode: inode,
   409  	}
   410  	d.vfsd.Init(d)
   411  	return d
   412  }
   413  
// IncRef implements vfs.DentryImpl.IncRef. The reference is taken on the
// underlying inode.
func (d *dentry) IncRef() {
	d.inode.incRef()
}
   418  
// TryIncRef implements vfs.DentryImpl.TryIncRef. It forwards to the
// underlying inode and returns its result.
func (d *dentry) TryIncRef() bool {
	return d.inode.tryIncRef()
}
   423  
// DecRef implements vfs.DentryImpl.DecRef. The reference is dropped on the
// underlying inode.
func (d *dentry) DecRef(ctx context.Context) {
	d.inode.decRef(ctx)
}
   428  
   429  // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
   430  func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
   431  	if d.inode.isDir() {
   432  		events |= linux.IN_ISDIR
   433  	}
   434  
   435  	// tmpfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates
   436  	// that d was deleted.
   437  	deleted := d.vfsd.IsDead()
   438  
   439  	d.inode.fs.mu.RLock()
   440  	// The ordering below is important, Linux always notifies the parent first.
   441  	parent := d.parent.Load()
   442  	if parent != nil {
   443  		parent.inode.watches.Notify(ctx, d.name, events, cookie, et, deleted)
   444  	}
   445  	d.inode.watches.Notify(ctx, "", events, cookie, et, deleted)
   446  	d.inode.fs.mu.RUnlock()
   447  }
   448  
// Watches implements vfs.DentryImpl.Watches. The returned watch set belongs
// to the inode, so all dentries of the same inode share it.
func (d *dentry) Watches() *vfs.Watches {
	return &d.inode.watches
}
   453  
// OnZeroWatches implements vfs.Dentry.OnZeroWatches. It is a no-op.
func (d *dentry) OnZeroWatches(context.Context) {}
   456  
// inode represents a filesystem object.
//
// The concrete object (regular file, directory, symlink, ...) is stored in
// impl; the remaining fields hold the metadata common to all of them.
//
// +stateify savable
type inode struct {
	// fs is the owning filesystem. fs is immutable.
	fs *filesystem

	// A reference is held on all inodes as long as they are reachable in the
	// filesystem tree, i.e. nlink is nonzero. This reference is dropped when
	// nlink reaches 0.
	refs inodeRefs

	// xattrs implements extended attributes.
	//
	// TODO(b/148380782): Support xattrs other than user.*
	xattrs memxattr.SimpleExtendedAttributes

	// Inode metadata. Writing multiple fields atomically requires holding
	// mu, otherwise atomic operations can be used.
	mu    inodeMutex          `state:"nosave"`
	mode  atomicbitops.Uint32 // file type and mode
	nlink atomicbitops.Uint32 // protected by filesystem.mu instead of inode.mu
	uid   atomicbitops.Uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
	gid   atomicbitops.Uint32 // auth.KGID, but ...
	ino   uint64              // immutable

	// Linux's tmpfs has no concept of btime.
	atime atomicbitops.Int64 // nanoseconds
	ctime atomicbitops.Int64 // nanoseconds
	mtime atomicbitops.Int64 // nanoseconds

	// locks is the vfs.FileLocks state associated with this inode.
	locks vfs.FileLocks

	// Inotify watches for this inode.
	watches vfs.Watches

	// impl is the type-specific part of the inode; statTo and direntType
	// switch on its dynamic type.
	impl any // immutable
}
   495  
// maxLinks is the largest link count an inode can have; incLinksLocked panics
// rather than incrementing past it.
const maxLinks = math.MaxUint32
   497  
   498  func (i *inode) init(impl any, fs *filesystem, kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, parentDir *directory) {
   499  	if mode.FileType() == 0 {
   500  		panic("file type is required in FileMode")
   501  	}
   502  
   503  	// Inherit the group and setgid bit as in fs/inode.c:inode_init_owner().
   504  	if parentDir != nil && parentDir.inode.mode.Load()&linux.S_ISGID == linux.S_ISGID {
   505  		kgid = auth.KGID(parentDir.inode.gid.Load())
   506  		if mode&linux.S_IFDIR == linux.S_IFDIR {
   507  			mode |= linux.S_ISGID
   508  		}
   509  	}
   510  
   511  	i.fs = fs
   512  	i.mode = atomicbitops.FromUint32(uint32(mode))
   513  	i.uid = atomicbitops.FromUint32(uint32(kuid))
   514  	i.gid = atomicbitops.FromUint32(uint32(kgid))
   515  	i.ino = fs.nextInoMinusOne.Add(1)
   516  	// Tmpfs creation sets atime, ctime, and mtime to current time.
   517  	now := fs.clock.Now().Nanoseconds()
   518  	i.atime = atomicbitops.FromInt64(now)
   519  	i.ctime = atomicbitops.FromInt64(now)
   520  	i.mtime = atomicbitops.FromInt64(now)
   521  	// i.nlink initialized by caller
   522  	i.impl = impl
   523  	i.refs.InitRefs()
   524  }
   525  
   526  // incLinksLocked increments i's link count.
   527  //
   528  // Preconditions:
   529  //   - filesystem.mu must be locked for writing.
   530  //   - i.mu must be lcoked.
   531  //   - i.nlink != 0.
   532  //   - i.nlink < maxLinks.
   533  func (i *inode) incLinksLocked() {
   534  	if i.nlink.RacyLoad() == 0 {
   535  		panic("tmpfs.inode.incLinksLocked() called with no existing links")
   536  	}
   537  	if i.nlink.RacyLoad() == maxLinks {
   538  		panic("tmpfs.inode.incLinksLocked() called with maximum link count")
   539  	}
   540  	i.nlink.Add(1)
   541  }
   542  
   543  // decLinksLocked decrements i's link count. If the link count reaches 0, we
   544  // remove a reference on i as well.
   545  //
   546  // Preconditions:
   547  //   - filesystem.mu must be locked for writing.
   548  //   - i.mu must be lcoked.
   549  //   - i.nlink != 0.
   550  func (i *inode) decLinksLocked(ctx context.Context) {
   551  	if i.nlink.RacyLoad() == 0 {
   552  		panic("tmpfs.inode.decLinksLocked() called with no existing links")
   553  	}
   554  	if i.nlink.Add(^uint32(0)) == 0 {
   555  		i.decRef(ctx)
   556  	}
   557  }
   558  
// incRef takes a reference on i.
func (i *inode) incRef() {
	i.refs.IncRef()
}
   562  
// tryIncRef forwards to i.refs.TryIncRef and returns its result.
func (i *inode) tryIncRef() bool {
	return i.refs.TryIncRef()
}
   566  
   567  func (i *inode) decRef(ctx context.Context) {
   568  	i.refs.DecRef(func() {
   569  		i.watches.HandleDeletion(ctx)
   570  		// Remove pages used if child being removed is a SymLink or Regular File.
   571  		switch impl := i.impl.(type) {
   572  		case *symlink:
   573  			if len(impl.target) >= shortSymlinkLen {
   574  				impl.inode.fs.unaccountPages(1)
   575  			}
   576  		case *regularFile:
   577  			// Release memory used by regFile to store data. Since regFile is
   578  			// no longer usable, we don't need to grab any locks or update any
   579  			// metadata.
   580  			pagesDec := impl.data.DropAll(i.fs.mf)
   581  			impl.inode.fs.unaccountPages(pagesDec)
   582  		}
   583  
   584  	})
   585  }
   586  
   587  func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
   588  	mode := linux.FileMode(i.mode.Load())
   589  	return vfs.GenericCheckPermissions(creds, ats, mode, auth.KUID(i.uid.Load()), auth.KGID(i.gid.Load()))
   590  }
   591  
   592  // Go won't inline this function, and returning linux.Statx (which is quite
   593  // big) means spending a lot of time in runtime.duffcopy(), so instead it's an
   594  // output parameter.
   595  //
   596  // Note that Linux does not guarantee to return consistent data (in the case of
   597  // a concurrent modification), so we do not require holding inode.mu.
   598  func (i *inode) statTo(stat *linux.Statx) {
   599  	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK |
   600  		linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE |
   601  		linux.STATX_BLOCKS | linux.STATX_ATIME | linux.STATX_CTIME |
   602  		linux.STATX_MTIME
   603  	stat.Blksize = hostarch.PageSize
   604  	stat.Nlink = i.nlink.Load()
   605  	stat.UID = i.uid.Load()
   606  	stat.GID = i.gid.Load()
   607  	stat.Mode = uint16(i.mode.Load())
   608  	stat.Ino = i.ino
   609  	stat.Atime = linux.NsecToStatxTimestamp(i.atime.Load())
   610  	stat.Ctime = linux.NsecToStatxTimestamp(i.ctime.Load())
   611  	stat.Mtime = linux.NsecToStatxTimestamp(i.mtime.Load())
   612  	stat.DevMajor = linux.UNNAMED_MAJOR
   613  	stat.DevMinor = i.fs.devMinor
   614  	switch impl := i.impl.(type) {
   615  	case *regularFile:
   616  		stat.Size = uint64(impl.size.Load())
   617  		// TODO(jamieliu): This should be impl.data.Span() / 512, but this is
   618  		// too expensive to compute here. Cache it in regularFile.
   619  		stat.Blocks = allocatedBlocksForSize(stat.Size)
   620  	case *directory:
   621  		stat.Size = direntSize * (2 + uint64(impl.numChildren.Load()))
   622  		// stat.Blocks is 0.
   623  	case *symlink:
   624  		stat.Size = uint64(len(impl.target))
   625  		// stat.Blocks is 0.
   626  	case *namedPipe, *socketFile:
   627  		// stat.Size and stat.Blocks are 0.
   628  	case *deviceFile:
   629  		// stat.Size and stat.Blocks are 0.
   630  		stat.RdevMajor = impl.major
   631  		stat.RdevMinor = impl.minor
   632  	default:
   633  		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
   634  	}
   635  }
   636  
// setStat applies the attribute changes requested in opts.Stat to i.
//
// Only the mode, uid, gid, atime, mtime, ctime and size fields may be set; a
// mask containing any other bit yields EPERM. An empty mask is a no-op.
// Permission to make the change is checked with vfs.CheckSetStat before
// anything is modified. A size change is only possible on regular files
// (EISDIR for directories, EINVAL for everything else).
//
// Side effects beyond the requested fields:
//   - Changing the owner or group, or a truncation that changed the file,
//     clears the SUID/SGID bits.
//   - mtime and ctime are bumped to the current time where a change implies
//     it, unless the caller set that timestamp explicitly in the same call.
//
// i.mu is held for the duration of the update.
func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions) error {
	stat := &opts.Stat
	if stat.Mask == 0 {
		return nil
	}
	// Reject requests for fields that tmpfs cannot set.
	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 {
		return linuxerr.EPERM
	}
	mode := linux.FileMode(i.mode.Load())
	if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(i.uid.Load()), auth.KGID(i.gid.Load())); err != nil {
		return err
	}

	i.mu.Lock()
	defer i.mu.Unlock()
	var (
		needsMtimeBump bool
		needsCtimeBump bool
	)
	// clearSID records that SUID/SGID must be cleared. It is reset once the
	// STATX_MODE branch has done so; otherwise it is handled near the end.
	clearSID := false
	mask := stat.Mask
	if mask&linux.STATX_SIZE != 0 {
		switch impl := i.impl.(type) {
		case *regularFile:
			updated, err := impl.truncateLocked(stat.Size)
			if err != nil {
				return err
			}
			if updated {
				clearSID = true
				needsMtimeBump = true
				needsCtimeBump = true
			}
		case *directory:
			return linuxerr.EISDIR
		default:
			return linuxerr.EINVAL
		}
	}
	if mask&linux.STATX_UID != 0 {
		i.uid.Store(stat.UID)
		needsCtimeBump = true
		clearSID = true
	}
	if mask&linux.STATX_GID != 0 {
		i.gid.Store(stat.GID)
		needsCtimeBump = true
		clearSID = true
	}
	if mask&linux.STATX_MODE != 0 {
		// Update the permission bits with a compare-and-swap loop, keeping
		// the file type bits of the current mode.
		for {
			old := i.mode.Load()
			ft := old & linux.S_IFMT
			newMode := ft | uint32(stat.Mode & ^uint16(linux.S_IFMT))
			if clearSID {
				newMode = vfs.ClearSUIDAndSGID(newMode)
			}
			if swapped := i.mode.CompareAndSwap(old, newMode); swapped {
				clearSID = false
				break
			}
		}
		needsCtimeBump = true
	}
	// A single timestamp is used for every "now" written below.
	now := i.fs.clock.Now().Nanoseconds()
	if mask&linux.STATX_ATIME != 0 {
		if stat.Atime.Nsec == linux.UTIME_NOW {
			i.atime.Store(now)
		} else {
			i.atime.Store(stat.Atime.ToNsecCapped())
		}
		needsCtimeBump = true
	}
	if mask&linux.STATX_MTIME != 0 {
		if stat.Mtime.Nsec == linux.UTIME_NOW {
			i.mtime.Store(now)
		} else {
			i.mtime.Store(stat.Mtime.ToNsecCapped())
		}
		needsCtimeBump = true
		// Ignore the mtime bump, since we just set it ourselves.
		needsMtimeBump = false
	}
	if mask&linux.STATX_CTIME != 0 {
		if stat.Ctime.Nsec == linux.UTIME_NOW {
			i.ctime.Store(now)
		} else {
			i.ctime.Store(stat.Ctime.ToNsecCapped())
		}
		// Ignore the ctime bump, since we just set it ourselves.
		needsCtimeBump = false
	}

	// We may have to clear the SUID/SGID bits, but didn't do so as part of
	// STATX_MODE.
	if clearSID {
		for {
			old := i.mode.Load()
			newMode := vfs.ClearSUIDAndSGID(old)
			if swapped := i.mode.CompareAndSwap(old, newMode); swapped {
				break
			}
		}
		// Note that this re-enables the ctime bump even if ctime was set
		// explicitly above.
		needsCtimeBump = true
	}

	if needsMtimeBump {
		i.mtime.Store(now)
	}
	if needsCtimeBump {
		i.ctime.Store(now)
	}

	return nil
}
   752  
   753  // allocatedBlocksForSize returns the number of 512B blocks needed to
   754  // accommodate the given size in bytes, as appropriate for struct
   755  // stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
   756  // size is independent of the "preferred block size for I/O", struct
   757  // stat::st_blksize and struct statx::stx_blksize.)
   758  func allocatedBlocksForSize(size uint64) uint64 {
   759  	return (size + 511) / 512
   760  }
   761  
   762  func (i *inode) direntType() uint8 {
   763  	switch impl := i.impl.(type) {
   764  	case *regularFile:
   765  		return linux.DT_REG
   766  	case *directory:
   767  		return linux.DT_DIR
   768  	case *symlink:
   769  		return linux.DT_LNK
   770  	case *socketFile:
   771  		return linux.DT_SOCK
   772  	case *namedPipe:
   773  		return linux.DT_FIFO
   774  	case *deviceFile:
   775  		switch impl.kind {
   776  		case vfs.BlockDevice:
   777  			return linux.DT_BLK
   778  		case vfs.CharDevice:
   779  			return linux.DT_CHR
   780  		default:
   781  			panic(fmt.Sprintf("unknown vfs.DeviceKind: %v", impl.kind))
   782  		}
   783  	default:
   784  		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
   785  	}
   786  }
   787  
   788  func (i *inode) isDir() bool {
   789  	mode := linux.FileMode(i.mode.Load())
   790  	return mode.FileType() == linux.S_IFDIR
   791  }
   792  
   793  func (i *inode) touchAtime(mnt *vfs.Mount) {
   794  	if mnt.Options().Flags.NoATime {
   795  		return
   796  	}
   797  	if err := mnt.CheckBeginWrite(); err != nil {
   798  		return
   799  	}
   800  	now := i.fs.clock.Now().Nanoseconds()
   801  	i.mu.Lock()
   802  	i.atime.Store(now)
   803  	i.mu.Unlock()
   804  	mnt.EndWrite()
   805  }
   806  
   807  // Preconditions: The caller has called vfs.Mount.CheckBeginWrite().
   808  func (i *inode) touchCtime() {
   809  	now := i.fs.clock.Now().Nanoseconds()
   810  	i.mu.Lock()
   811  	i.ctime.Store(now)
   812  	i.mu.Unlock()
   813  }
   814  
   815  // Preconditions: The caller has called vfs.Mount.CheckBeginWrite().
   816  func (i *inode) touchCMtime() {
   817  	now := i.fs.clock.Now().Nanoseconds()
   818  	i.mu.Lock()
   819  	i.mtime.Store(now)
   820  	i.ctime.Store(now)
   821  	i.mu.Unlock()
   822  }
   823  
   824  // Preconditions:
   825  //   - The caller has called vfs.Mount.CheckBeginWrite().
   826  //   - inode.mu must be locked.
   827  func (i *inode) touchCMtimeLocked() {
   828  	now := i.fs.clock.Now().Nanoseconds()
   829  	i.mtime.Store(now)
   830  	i.ctime.Store(now)
   831  }
   832  
   833  func (i *inode) checkXattrPrefix(name string) error {
   834  	for prefix := range i.fs.allowXattrPrefix {
   835  		if strings.HasPrefix(name, prefix) {
   836  			return nil
   837  		}
   838  	}
   839  	return linuxerr.EOPNOTSUPP
   840  }
   841  
   842  func (i *inode) listXattr(creds *auth.Credentials, size uint64) ([]string, error) {
   843  	return i.xattrs.ListXattr(creds, size)
   844  }
   845  
   846  func (i *inode) getXattr(creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
   847  	if err := i.checkXattrPrefix(opts.Name); err != nil {
   848  		return "", err
   849  	}
   850  	mode := linux.FileMode(i.mode.Load())
   851  	kuid := auth.KUID(i.uid.Load())
   852  	kgid := auth.KGID(i.gid.Load())
   853  	if err := vfs.GenericCheckPermissions(creds, vfs.MayRead, mode, kuid, kgid); err != nil {
   854  		return "", err
   855  	}
   856  	return i.xattrs.GetXattr(creds, mode, kuid, opts)
   857  }
   858  
   859  func (i *inode) setXattr(creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
   860  	if err := i.checkXattrPrefix(opts.Name); err != nil {
   861  		return err
   862  	}
   863  	mode := linux.FileMode(i.mode.Load())
   864  	kuid := auth.KUID(i.uid.Load())
   865  	kgid := auth.KGID(i.gid.Load())
   866  	if err := vfs.GenericCheckPermissions(creds, vfs.MayWrite, mode, kuid, kgid); err != nil {
   867  		return err
   868  	}
   869  	return i.xattrs.SetXattr(creds, mode, kuid, opts)
   870  }
   871  
   872  func (i *inode) removeXattr(creds *auth.Credentials, name string) error {
   873  	if err := i.checkXattrPrefix(name); err != nil {
   874  		return err
   875  	}
   876  	mode := linux.FileMode(i.mode.Load())
   877  	kuid := auth.KUID(i.uid.Load())
   878  	kgid := auth.KGID(i.gid.Load())
   879  	if err := vfs.GenericCheckPermissions(creds, vfs.MayWrite, mode, kuid, kgid); err != nil {
   880  		return err
   881  	}
   882  	return i.xattrs.RemoveXattr(creds, mode, kuid, name)
   883  }
   884  
   885  // fileDescription is embedded by tmpfs implementations of
   886  // vfs.FileDescriptionImpl.
   887  //
   888  // +stateify savable
   889  type fileDescription struct {
   890  	vfsfd vfs.FileDescription
   891  	vfs.FileDescriptionDefaultImpl
   892  	vfs.LockFD
   893  }
   894  
   895  func (fd *fileDescription) filesystem() *filesystem {
   896  	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
   897  }
   898  
   899  func (fd *fileDescription) dentry() *dentry {
   900  	return fd.vfsfd.Dentry().Impl().(*dentry)
   901  }
   902  
   903  func (fd *fileDescription) inode() *inode {
   904  	return fd.dentry().inode
   905  }
   906  
   907  // Stat implements vfs.FileDescriptionImpl.Stat.
   908  func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
   909  	var stat linux.Statx
   910  	fd.inode().statTo(&stat)
   911  	return stat, nil
   912  }
   913  
   914  // SetStat implements vfs.FileDescriptionImpl.SetStat.
   915  func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
   916  	return fd.dentry().inode.setStat(ctx, auth.CredentialsFromContext(ctx), &opts)
   917  }
   918  
   919  // StatFS implements vfs.FileDescriptionImpl.StatFS.
   920  func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
   921  	return fd.filesystem().statFS(), nil
   922  }
   923  
   924  // ListXattr implements vfs.FileDescriptionImpl.ListXattr.
   925  func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
   926  	return fd.inode().listXattr(auth.CredentialsFromContext(ctx), size)
   927  }
   928  
   929  // GetXattr implements vfs.FileDescriptionImpl.GetXattr.
   930  func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
   931  	return fd.inode().getXattr(auth.CredentialsFromContext(ctx), &opts)
   932  }
   933  
   934  // SetXattr implements vfs.FileDescriptionImpl.SetXattr.
   935  func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
   936  	return fd.dentry().inode.setXattr(auth.CredentialsFromContext(ctx), &opts)
   937  }
   938  
   939  // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
   940  func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
   941  	return fd.dentry().inode.removeXattr(auth.CredentialsFromContext(ctx), name)
   942  }
   943  
   944  // Sync implements vfs.FileDescriptionImpl.Sync. It does nothing because all
   945  // filesystem state is in-memory.
   946  func (*fileDescription) Sync(context.Context) error {
   947  	return nil
   948  }
   949  
   950  // parseSize converts size in string to an integer bytes.
   951  // Supported suffixes in string are:K, M, G, T, P, E.
   952  func parseSize(s string) (uint64, error) {
   953  	if len(s) == 0 {
   954  		return 0, fmt.Errorf("size parameter empty")
   955  	}
   956  	suffix := s[len(s)-1]
   957  	count := 1
   958  	switch suffix {
   959  	case 'e', 'E':
   960  		count = count << 10
   961  		fallthrough
   962  	case 'p', 'P':
   963  		count = count << 10
   964  		fallthrough
   965  	case 't', 'T':
   966  		count = count << 10
   967  		fallthrough
   968  	case 'g', 'G':
   969  		count = count << 10
   970  		fallthrough
   971  	case 'm', 'M':
   972  		count = count << 10
   973  		fallthrough
   974  	case 'k', 'K':
   975  		count = count << 10
   976  		s = s[:len(s)-1]
   977  	}
   978  	byteTmp, err := strconv.ParseUint(s, 10, 64)
   979  	if err != nil {
   980  		return 0, linuxerr.EINVAL
   981  	}
   982  	// Check for overflow.
   983  	bytes := byteTmp * uint64(count)
   984  	if byteTmp != 0 && bytes/byteTmp != uint64(count) {
   985  		return 0, fmt.Errorf("size overflow")
   986  	}
   987  	return bytes, err
   988  }