github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/fsimpl/tmpfs/tmpfs.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package tmpfs provides an in-memory filesystem whose contents are
    16  // application-mutable, consistent with Linux's tmpfs.
    17  //
    18  // Lock order:
    19  //
    20  //	filesystem.mu
    21  //		inode.mu
    22  //		  regularFileFD.offMu
    23  //		    *** "memmap.Mappable locks" below this point
    24  //		    regularFile.mapsMu
    25  //		      *** "memmap.Mappable locks taken by Translate" below this point
    26  //		      regularFile.dataMu
    27  //		        fs.pagesUsedMu
    28  //		  directory.iterMu
    29  package tmpfs
    30  
    31  import (
    32  	"fmt"
    33  	"math"
    34  	"strconv"
    35  	"strings"
    36  
    37  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    38  	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
    39  	"github.com/nicocha30/gvisor-ligolo/pkg/context"
    40  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    41  	"github.com/nicocha30/gvisor-ligolo/pkg/fd"
    42  	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
    43  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
    44  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/time"
    45  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/pgalloc"
    46  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/usage"
    47  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
    48  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs/memxattr"
    49  )
    50  
// Name is the default filesystem name. It is the value returned by
// FilesystemType.Name.
const Name = "tmpfs"
    53  
// FilesystemType implements vfs.FilesystemType.
//
// The type itself is stateless; every call to GetFilesystem constructs a new
// filesystem value.
//
// +stateify savable
type FilesystemType struct{}
    58  
// filesystem implements vfs.FilesystemImpl.
//
// +stateify savable
type filesystem struct {
	// vfsfs is the embedded vfs.Filesystem; a pointer to it is what
	// GetFilesystem hands back to the caller.
	vfsfs vfs.Filesystem

	// mf is used to allocate memory that stores regular file contents. mf is
	// immutable, except it may be changed during restore.
	mf *pgalloc.MemoryFile `state:"nosave"`

	// privateMF indicates whether mf is private to this tmpfs mount. If so,
	// tmpfs takes ownership of mf (and destroys it in Release). privateMF is
	// immutable.
	privateMF bool

	// mfp is used to provide mf, when privateMF == false. This is required to
	// re-provide mf on restore. mfp is immutable.
	mfp pgalloc.MemoryFileProvider

	// clock is a realtime clock used to set timestamps in file operations.
	clock time.Clock

	// devMinor is the filesystem's minor device number. devMinor is immutable.
	devMinor uint32

	// mopts contains the tmpfs-specific mount options passed to this
	// filesystem. Immutable.
	mopts string

	// usage is the memory accounting category under which pages backing
	// files in this filesystem are accounted.
	usage usage.MemoryKind

	// mu serializes changes to the Dentry tree.
	mu filesystemRWMutex `state:"nosave"`

	// nextInoMinusOne is the most recently assigned inode number; inode.init
	// obtains the next one with Add(1). It is accessed using atomic memory
	// operations.
	nextInoMinusOne atomicbitops.Uint64 // accessed using atomic memory operations

	// root is the dentry at the root of this filesystem. It is assigned at
	// the end of a successful GetFilesystem.
	root *dentry

	// maxFilenameLen is the longest filename accepted by this filesystem. It
	// defaults to linux.NAME_MAX and may be overridden through
	// FilesystemOpts.MaxFilenameLen.
	maxFilenameLen int

	// maxSizeInPages is the maximum permissible size for the tmpfs in terms of pages.
	// This field is immutable.
	maxSizeInPages uint64

	// pagesUsed is the number of pages used by this filesystem.
	pagesUsed atomicbitops.Uint64
}
   107  
// Name implements vfs.FilesystemType.Name. It always reports the package-level
// Name constant.
func (FilesystemType) Name() string {
	return Name
}
   112  
// Release implements vfs.FilesystemType.Release. FilesystemType holds no
// resources, so there is nothing to release.
func (FilesystemType) Release(ctx context.Context) {}
   115  
// FilesystemOpts is used to pass configuration data to tmpfs. GetFilesystem
// reads it from vfs.GetFilesystemOptions.InternalData.
//
// +stateify savable
type FilesystemOpts struct {
	// RootFileType is the FileType of the filesystem root. Valid values
	// are: S_IFDIR, S_IFREG, and S_IFLNK. Defaults to S_IFDIR.
	RootFileType uint16

	// RootSymlinkTarget is the target of the root symlink. Only valid if
	// RootFileType == S_IFLNK.
	RootSymlinkTarget string

	// FilesystemType allows setting a different FilesystemType for this
	// tmpfs filesystem. This allows tmpfs to "impersonate" other
	// filesystems, like ramdiskfs and cgroupfs.
	FilesystemType vfs.FilesystemType

	// Usage is the memory accounting category under which pages backing files in
	// the filesystem are accounted. If nil, usage.Tmpfs is used.
	Usage *usage.MemoryKind

	// MaxFilenameLen is the maximum filename length allowed by the tmpfs.
	// Values <= 0 leave the default of linux.NAME_MAX in place.
	MaxFilenameLen int

	// FilestoreFD is the FD for the memory file that will be used to store file
	// data. If this is nil, then MemoryFileProviderFromContext() is used.
	FilestoreFD *fd.FD

	// DisableDefaultSizeLimit disables setting a default size limit. In Linux,
	// SB_KERNMOUNT has this effect on tmpfs mounts; see mm/shmem.c:shmem_fill_super().
	DisableDefaultSizeLimit bool
}
   148  
// defaultSizeLimit is the default size limit mount option, in bytes. It is
// immutable after initialization. A value of 0 means that no default has been
// configured, in which case getDefaultSizeLimit falls back to math.MaxInt64.
var defaultSizeLimit uint64
   151  
// SetDefaultSizeLimit configures the size limit to be used for tmpfs mounts
// that do not specify a size= mount option. This must be called only once,
// before any tmpfs filesystems are created.
//
// The value is stored without synchronization, which is why it must not be
// called concurrently with filesystem creation.
func SetDefaultSizeLimit(sizeLimit uint64) {
	defaultSizeLimit = sizeLimit
}
   158  
   159  func getDefaultSizeLimit(disable bool) uint64 {
   160  	if disable || defaultSizeLimit == 0 {
   161  		// The size limit is used to populate statfs(2) results. If Linux tmpfs is
   162  		// mounted with no size option, then statfs(2) returns f_blocks == f_bfree
   163  		// == f_bavail == 0. However, many applications treat this as having a size
   164  		// limit of 0. To work around this, return a very large but non-zero size
   165  		// limit, chosen to ensure that it does not overflow int64.
   166  		return math.MaxInt64
   167  	}
   168  	return defaultSizeLimit
   169  }
   170  
   171  // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
   172  func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, _ string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
   173  	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
   174  	if mfp == nil {
   175  		panic("MemoryFileProviderFromContext returned nil")
   176  	}
   177  	mf := mfp.MemoryFile()
   178  	privateMF := false
   179  
   180  	rootFileType := uint16(linux.S_IFDIR)
   181  	disableDefaultSizeLimit := false
   182  	newFSType := vfs.FilesystemType(&fstype)
   183  	tmpfsOpts, tmpfsOptsOk := opts.InternalData.(FilesystemOpts)
   184  	if tmpfsOptsOk {
   185  		if tmpfsOpts.RootFileType != 0 {
   186  			rootFileType = tmpfsOpts.RootFileType
   187  		}
   188  		if tmpfsOpts.FilesystemType != nil {
   189  			newFSType = tmpfsOpts.FilesystemType
   190  		}
   191  		disableDefaultSizeLimit = tmpfsOpts.DisableDefaultSizeLimit
   192  		if tmpfsOpts.FilestoreFD != nil {
   193  			mfOpts := pgalloc.MemoryFileOpts{
   194  				// tmpfsOpts.FilestoreFD may be backed by a file on disk (not memfd),
   195  				// which needs to be decommited on destroy to release disk space.
   196  				DecommitOnDestroy: true,
   197  				// sentry's seccomp filters don't allow the mmap(2) syscalls that
   198  				// pgalloc.IMAWorkAroundForMemFile() uses. Users of tmpfsOpts.FilestoreFD
   199  				// are expected to have performed the work around outside the sandbox.
   200  				DisableIMAWorkAround: true,
   201  				// Custom filestore FDs are usually backed by files on disk. Ideally we
   202  				// would confirm with fstatfs(2) but that is prohibited by seccomp.
   203  				DiskBackedFile: true,
   204  			}
   205  			var err error
   206  			mf, err = pgalloc.NewMemoryFile(tmpfsOpts.FilestoreFD.ReleaseToFile("overlay-filestore"), mfOpts)
   207  			if err != nil {
   208  				ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: pgalloc.NewMemoryFile failed: %v", err)
   209  				return nil, nil, err
   210  			}
   211  			privateMF = true
   212  		}
   213  	}
   214  
   215  	mopts := vfs.GenericParseMountOptions(opts.Data)
   216  	rootMode := linux.FileMode(0777)
   217  	if rootFileType == linux.S_IFDIR {
   218  		rootMode = 01777
   219  	}
   220  	modeStr, ok := mopts["mode"]
   221  	if ok {
   222  		delete(mopts, "mode")
   223  		mode, err := strconv.ParseUint(modeStr, 8, 32)
   224  		if err != nil {
   225  			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid mode: %q", modeStr)
   226  			return nil, nil, linuxerr.EINVAL
   227  		}
   228  		rootMode = linux.FileMode(mode & 07777)
   229  	}
   230  	rootKUID := creds.EffectiveKUID
   231  	uidStr, ok := mopts["uid"]
   232  	if ok {
   233  		delete(mopts, "uid")
   234  		uid, err := strconv.ParseUint(uidStr, 10, 32)
   235  		if err != nil {
   236  			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid uid: %q", uidStr)
   237  			return nil, nil, linuxerr.EINVAL
   238  		}
   239  		kuid := creds.UserNamespace.MapToKUID(auth.UID(uid))
   240  		if !kuid.Ok() {
   241  			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped uid: %d", uid)
   242  			return nil, nil, linuxerr.EINVAL
   243  		}
   244  		rootKUID = kuid
   245  	}
   246  	rootKGID := creds.EffectiveKGID
   247  	gidStr, ok := mopts["gid"]
   248  	if ok {
   249  		delete(mopts, "gid")
   250  		gid, err := strconv.ParseUint(gidStr, 10, 32)
   251  		if err != nil {
   252  			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: invalid gid: %q", gidStr)
   253  			return nil, nil, linuxerr.EINVAL
   254  		}
   255  		kgid := creds.UserNamespace.MapToKGID(auth.GID(gid))
   256  		if !kgid.Ok() {
   257  			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unmapped gid: %d", gid)
   258  			return nil, nil, linuxerr.EINVAL
   259  		}
   260  		rootKGID = kgid
   261  	}
   262  	maxSizeInPages := getDefaultSizeLimit(disableDefaultSizeLimit) / hostarch.PageSize
   263  	maxSizeStr, ok := mopts["size"]
   264  	if ok {
   265  		delete(mopts, "size")
   266  		maxSizeInBytes, err := parseSize(maxSizeStr)
   267  		if err != nil {
   268  			ctx.Debugf("tmpfs.FilesystemType.GetFilesystem: parseSize() failed: %v", err)
   269  			return nil, nil, linuxerr.EINVAL
   270  		}
   271  		// Convert size in bytes to nearest Page Size bytes
   272  		// as Linux allocates memory in terms of Page size.
   273  		maxSizeInPages, ok = hostarch.ToPagesRoundUp(maxSizeInBytes)
   274  		if !ok {
   275  			ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: Pages RoundUp Overflow error: %q", ok)
   276  			return nil, nil, linuxerr.EINVAL
   277  		}
   278  	}
   279  
   280  	if len(mopts) != 0 {
   281  		ctx.Warningf("tmpfs.FilesystemType.GetFilesystem: unknown options: %v", mopts)
   282  		return nil, nil, linuxerr.EINVAL
   283  	}
   284  
   285  	devMinor, err := vfsObj.GetAnonBlockDevMinor()
   286  	if err != nil {
   287  		return nil, nil, err
   288  	}
   289  	clock := time.RealtimeClockFromContext(ctx)
   290  	memUsage := usage.Tmpfs
   291  	if tmpfsOpts.Usage != nil {
   292  		memUsage = *tmpfsOpts.Usage
   293  	}
   294  	fs := filesystem{
   295  		mf:             mf,
   296  		privateMF:      privateMF,
   297  		mfp:            mfp,
   298  		clock:          clock,
   299  		devMinor:       devMinor,
   300  		mopts:          opts.Data,
   301  		usage:          memUsage,
   302  		maxFilenameLen: linux.NAME_MAX,
   303  		maxSizeInPages: maxSizeInPages,
   304  	}
   305  	fs.vfsfs.Init(vfsObj, newFSType, &fs)
   306  	if tmpfsOptsOk && tmpfsOpts.MaxFilenameLen > 0 {
   307  		fs.maxFilenameLen = tmpfsOpts.MaxFilenameLen
   308  	}
   309  
   310  	var root *dentry
   311  	switch rootFileType {
   312  	case linux.S_IFREG:
   313  		root = fs.newDentry(fs.newRegularFile(rootKUID, rootKGID, rootMode, nil /* parentDir */))
   314  	case linux.S_IFLNK:
   315  		root = fs.newDentry(fs.newSymlink(rootKUID, rootKGID, rootMode, tmpfsOpts.RootSymlinkTarget, nil /* parentDir */))
   316  	case linux.S_IFDIR:
   317  		root = &fs.newDirectory(rootKUID, rootKGID, rootMode, nil /* parentDir */).dentry
   318  	default:
   319  		fs.vfsfs.DecRef(ctx)
   320  		return nil, nil, fmt.Errorf("invalid tmpfs root file type: %#o", rootFileType)
   321  	}
   322  	fs.root = root
   323  	return &fs.vfsfs, &root.vfsd, nil
   324  }
   325  
   326  // Release implements vfs.FilesystemImpl.Release.
   327  func (fs *filesystem) Release(ctx context.Context) {
   328  	fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
   329  	fs.mu.Lock()
   330  	if fs.root.inode.isDir() {
   331  		fs.root.releaseChildrenLocked(ctx)
   332  	}
   333  	fs.mu.Unlock()
   334  	if fs.privateMF {
   335  		fs.mf.Destroy()
   336  	}
   337  }
   338  
   339  // releaseChildrenLocked is called on the mount point by filesystem.Release() to
   340  // destroy all objects in the mount. It performs a depth-first walk of the
   341  // filesystem and "unlinks" everything by decrementing link counts
   342  // appropriately. There should be no open file descriptors when this is called,
   343  // so each inode should only have one outstanding reference that is removed once
   344  // its link count hits zero.
   345  //
   346  // Note that we do not update filesystem state precisely while tearing down (for
   347  // instance, the child maps are ignored)--we only care to remove all remaining
   348  // references so that every filesystem object gets destroyed. Also note that we
   349  // do not need to trigger DecRef on the mount point itself or any child mount;
   350  // these are taken care of by the destructor of the enclosing MountNamespace.
   351  //
   352  // Precondition: filesystem.mu is held.
   353  func (d *dentry) releaseChildrenLocked(ctx context.Context) {
   354  	dir := d.inode.impl.(*directory)
   355  	for _, child := range dir.childMap {
   356  		if child.inode.isDir() {
   357  			child.releaseChildrenLocked(ctx)
   358  			child.inode.decLinksLocked(ctx) // link for child/.
   359  			dir.inode.decLinksLocked(ctx)   // link for child/..
   360  		}
   361  		child.inode.decLinksLocked(ctx) // link for child
   362  	}
   363  }
   364  
   365  func (fs *filesystem) statFS() linux.Statfs {
   366  	st := linux.Statfs{
   367  		Type:         linux.TMPFS_MAGIC,
   368  		BlockSize:    hostarch.PageSize,
   369  		FragmentSize: hostarch.PageSize,
   370  		NameLength:   linux.NAME_MAX,
   371  	}
   372  
   373  	// If size is set for tmpfs return set values.
   374  	st.Blocks = fs.maxSizeInPages
   375  	pagesUsed := fs.pagesUsed.Load()
   376  	st.BlocksFree = fs.maxSizeInPages - pagesUsed
   377  	st.BlocksAvailable = fs.maxSizeInPages - pagesUsed
   378  	return st
   379  }
   380  
// dentry implements vfs.DentryImpl.
//
// +stateify savable
type dentry struct {
	// vfsd is the embedded vfs.Dentry; newDentry initializes it to point back
	// at this dentry.
	vfsd vfs.Dentry

	// parent is this dentry's parent directory. Each referenced dentry holds a
	// reference on parent.dentry. If this dentry is a filesystem root, parent
	// is nil. parent is protected by filesystem.mu.
	parent *dentry

	// name is the name of this dentry in its parent. If this dentry is a
	// filesystem root, name is the empty string. name is protected by
	// filesystem.mu.
	name string

	// dentryEntry (ugh) links dentries into their parent directory.childList.
	dentryEntry

	// inode is the inode represented by this dentry. Multiple Dentries may
	// share a single non-directory inode (with hard links). inode is
	// immutable.
	//
	// tmpfs doesn't count references on dentries; because the dentry tree is
	// the sole source of truth, it is by definition always consistent with the
	// state of the filesystem. However, it does count references on inodes,
	// because inode resources are released when all references are dropped.
	// dentry therefore forwards reference counting directly to inode.
	inode *inode
}
   411  
   412  func (fs *filesystem) newDentry(inode *inode) *dentry {
   413  	d := &dentry{
   414  		inode: inode,
   415  	}
   416  	d.vfsd.Init(d)
   417  	return d
   418  }
   419  
// IncRef implements vfs.DentryImpl.IncRef. The reference is taken on the
// underlying inode, since tmpfs does not count references on dentries.
func (d *dentry) IncRef() {
	d.inode.incRef()
}
   424  
// TryIncRef implements vfs.DentryImpl.TryIncRef. It forwards to the underlying
// inode and reports whether a reference was acquired.
func (d *dentry) TryIncRef() bool {
	return d.inode.tryIncRef()
}
   429  
// DecRef implements vfs.DentryImpl.DecRef. The reference is dropped on the
// underlying inode.
func (d *dentry) DecRef(ctx context.Context) {
	d.inode.decRef(ctx)
}
   434  
   435  // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
   436  func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
   437  	if d.inode.isDir() {
   438  		events |= linux.IN_ISDIR
   439  	}
   440  
   441  	// tmpfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates
   442  	// that d was deleted.
   443  	deleted := d.vfsd.IsDead()
   444  
   445  	d.inode.fs.mu.RLock()
   446  	// The ordering below is important, Linux always notifies the parent first.
   447  	if d.parent != nil {
   448  		d.parent.inode.watches.Notify(ctx, d.name, events, cookie, et, deleted)
   449  	}
   450  	d.inode.watches.Notify(ctx, "", events, cookie, et, deleted)
   451  	d.inode.fs.mu.RUnlock()
   452  }
   453  
// Watches implements vfs.DentryImpl.Watches. The watch set lives on the inode,
// so every dentry that refers to the same inode returns the same set.
func (d *dentry) Watches() *vfs.Watches {
	return &d.inode.watches
}
   458  
// OnZeroWatches implements vfs.Dentry.OnZeroWatches. tmpfs takes no action
// here.
func (d *dentry) OnZeroWatches(context.Context) {}
   461  
// inode represents a filesystem object.
//
// +stateify savable
type inode struct {
	// fs is the owning filesystem. fs is immutable.
	fs *filesystem

	// A reference is held on all inodes as long as they are reachable in the
	// filesystem tree, i.e. nlink is nonzero. This reference is dropped when
	// nlink reaches 0.
	refs inodeRefs

	// xattrs implements extended attributes.
	//
	// TODO(b/148380782): Support xattrs other than user.*
	xattrs memxattr.SimpleExtendedAttributes

	// Inode metadata. Writing multiple fields atomically requires holding
	// mu, otherwise atomic operations can be used.
	mu    inodeMutex          `state:"nosave"`
	mode  atomicbitops.Uint32 // file type and mode
	nlink atomicbitops.Uint32 // protected by filesystem.mu instead of inode.mu
	uid   atomicbitops.Uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
	gid   atomicbitops.Uint32 // auth.KGID, but ...
	ino   uint64              // immutable

	// Linux's tmpfs has no concept of btime.
	atime atomicbitops.Int64 // nanoseconds
	ctime atomicbitops.Int64 // nanoseconds
	mtime atomicbitops.Int64 // nanoseconds

	// locks holds the file locks associated with this inode.
	locks vfs.FileLocks

	// Inotify watches for this inode.
	watches vfs.Watches

	// impl is the type-specific implementation of this inode (for example
	// *regularFile, *directory or *symlink); code that needs type-specific
	// behavior type-switches on it.
	impl any // immutable
}
   500  
// maxLinks is the largest link count an inode may have; incLinksLocked panics
// rather than go past it. It is the maximum value of the uint32 nlink field.
const maxLinks = math.MaxUint32
   502  
   503  func (i *inode) init(impl any, fs *filesystem, kuid auth.KUID, kgid auth.KGID, mode linux.FileMode, parentDir *directory) {
   504  	if mode.FileType() == 0 {
   505  		panic("file type is required in FileMode")
   506  	}
   507  
   508  	// Inherit the group and setgid bit as in fs/inode.c:inode_init_owner().
   509  	if parentDir != nil && parentDir.inode.mode.Load()&linux.S_ISGID == linux.S_ISGID {
   510  		kgid = auth.KGID(parentDir.inode.gid.Load())
   511  		if mode&linux.S_IFDIR == linux.S_IFDIR {
   512  			mode |= linux.S_ISGID
   513  		}
   514  	}
   515  
   516  	i.fs = fs
   517  	i.mode = atomicbitops.FromUint32(uint32(mode))
   518  	i.uid = atomicbitops.FromUint32(uint32(kuid))
   519  	i.gid = atomicbitops.FromUint32(uint32(kgid))
   520  	i.ino = fs.nextInoMinusOne.Add(1)
   521  	// Tmpfs creation sets atime, ctime, and mtime to current time.
   522  	now := fs.clock.Now().Nanoseconds()
   523  	i.atime = atomicbitops.FromInt64(now)
   524  	i.ctime = atomicbitops.FromInt64(now)
   525  	i.mtime = atomicbitops.FromInt64(now)
   526  	// i.nlink initialized by caller
   527  	i.impl = impl
   528  	i.refs.InitRefs()
   529  }
   530  
   531  // incLinksLocked increments i's link count.
   532  //
   533  // Preconditions:
   534  //   - filesystem.mu must be locked for writing.
   535  //   - i.mu must be lcoked.
   536  //   - i.nlink != 0.
   537  //   - i.nlink < maxLinks.
   538  func (i *inode) incLinksLocked() {
   539  	if i.nlink.RacyLoad() == 0 {
   540  		panic("tmpfs.inode.incLinksLocked() called with no existing links")
   541  	}
   542  	if i.nlink.RacyLoad() == maxLinks {
   543  		panic("tmpfs.inode.incLinksLocked() called with maximum link count")
   544  	}
   545  	i.nlink.Add(1)
   546  }
   547  
   548  // decLinksLocked decrements i's link count. If the link count reaches 0, we
   549  // remove a reference on i as well.
   550  //
   551  // Preconditions:
   552  //   - filesystem.mu must be locked for writing.
   553  //   - i.mu must be lcoked.
   554  //   - i.nlink != 0.
   555  func (i *inode) decLinksLocked(ctx context.Context) {
   556  	if i.nlink.RacyLoad() == 0 {
   557  		panic("tmpfs.inode.decLinksLocked() called with no existing links")
   558  	}
   559  	if i.nlink.Add(^uint32(0)) == 0 {
   560  		i.decRef(ctx)
   561  	}
   562  }
   563  
// incRef takes an additional reference on i.
func (i *inode) incRef() {
	i.refs.IncRef()
}
   567  
// tryIncRef attempts to take a reference on i and reports whether it
// succeeded.
func (i *inode) tryIncRef() bool {
	return i.refs.TryIncRef()
}
   571  
   572  func (i *inode) decRef(ctx context.Context) {
   573  	i.refs.DecRef(func() {
   574  		i.watches.HandleDeletion(ctx)
   575  		// Remove pages used if child being removed is a SymLink or Regular File.
   576  		switch impl := i.impl.(type) {
   577  		case *symlink:
   578  			if len(impl.target) >= shortSymlinkLen {
   579  				impl.inode.fs.unaccountPages(1)
   580  			}
   581  		case *regularFile:
   582  			// Release memory used by regFile to store data. Since regFile is
   583  			// no longer usable, we don't need to grab any locks or update any
   584  			// metadata.
   585  			pagesDec := impl.data.DropAll(i.fs.mf)
   586  			impl.inode.fs.unaccountPages(pagesDec)
   587  		}
   588  
   589  	})
   590  }
   591  
   592  func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
   593  	mode := linux.FileMode(i.mode.Load())
   594  	return vfs.GenericCheckPermissions(creds, ats, mode, auth.KUID(i.uid.Load()), auth.KGID(i.gid.Load()))
   595  }
   596  
   597  // Go won't inline this function, and returning linux.Statx (which is quite
   598  // big) means spending a lot of time in runtime.duffcopy(), so instead it's an
   599  // output parameter.
   600  //
   601  // Note that Linux does not guarantee to return consistent data (in the case of
   602  // a concurrent modification), so we do not require holding inode.mu.
   603  func (i *inode) statTo(stat *linux.Statx) {
   604  	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK |
   605  		linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_SIZE |
   606  		linux.STATX_BLOCKS | linux.STATX_ATIME | linux.STATX_CTIME |
   607  		linux.STATX_MTIME
   608  	stat.Blksize = hostarch.PageSize
   609  	stat.Nlink = i.nlink.Load()
   610  	stat.UID = i.uid.Load()
   611  	stat.GID = i.gid.Load()
   612  	stat.Mode = uint16(i.mode.Load())
   613  	stat.Ino = i.ino
   614  	stat.Atime = linux.NsecToStatxTimestamp(i.atime.Load())
   615  	stat.Ctime = linux.NsecToStatxTimestamp(i.ctime.Load())
   616  	stat.Mtime = linux.NsecToStatxTimestamp(i.mtime.Load())
   617  	stat.DevMajor = linux.UNNAMED_MAJOR
   618  	stat.DevMinor = i.fs.devMinor
   619  	switch impl := i.impl.(type) {
   620  	case *regularFile:
   621  		stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS
   622  		stat.Size = uint64(impl.size.Load())
   623  		// TODO(jamieliu): This should be impl.data.Span() / 512, but this is
   624  		// too expensive to compute here. Cache it in regularFile.
   625  		stat.Blocks = allocatedBlocksForSize(stat.Size)
   626  	case *directory:
   627  		stat.Size = direntSize * (2 + uint64(impl.numChildren.Load()))
   628  		// stat.Blocks is 0.
   629  	case *symlink:
   630  		stat.Size = uint64(len(impl.target))
   631  		// stat.Blocks is 0.
   632  	case *namedPipe, *socketFile:
   633  		// stat.Size and stat.Blocks are 0.
   634  	case *deviceFile:
   635  		// stat.Size and stat.Blocks are 0.
   636  		stat.RdevMajor = impl.major
   637  		stat.RdevMinor = impl.minor
   638  	default:
   639  		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
   640  	}
   641  }
   642  
// setStat applies the attribute changes requested in opts.Stat to i.
//
// Only mode, uid, gid, atime, mtime, ctime and size may be changed; any other
// bit in the mask fails with EPERM. The request is validated with
// vfs.CheckSetStat before anything is modified. Size changes are supported on
// regular files only (EISDIR for directories, EINVAL for everything else).
//
// Side effects beyond the requested fields:
//   - Changing the owner, the group, or actually changing the size clears the
//     SUID/SGID bits.
//   - ctime is bumped to the current time for any change unless the caller
//     set ctime explicitly; mtime is bumped on a size change unless the
//     caller set mtime explicitly.
//
// An empty mask is a no-op that returns nil.
func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions) error {
	stat := &opts.Stat
	if stat.Mask == 0 {
		return nil
	}
	// Reject any attribute outside the supported set.
	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 {
		return linuxerr.EPERM
	}
	mode := linux.FileMode(i.mode.Load())
	if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(i.uid.Load()), auth.KGID(i.gid.Load())); err != nil {
		return err
	}

	i.mu.Lock()
	defer i.mu.Unlock()
	var (
		needsMtimeBump bool
		needsCtimeBump bool
	)
	// clearSID records that the SUID/SGID bits must be cleared; it is reset
	// once the mode update below has taken care of it.
	clearSID := false
	mask := stat.Mask
	if mask&linux.STATX_SIZE != 0 {
		switch impl := i.impl.(type) {
		case *regularFile:
			updated, err := impl.truncateLocked(stat.Size)
			if err != nil {
				return err
			}
			// Only a truncate that changed something counts as a
			// modification.
			if updated {
				clearSID = true
				needsMtimeBump = true
				needsCtimeBump = true
			}
		case *directory:
			return linuxerr.EISDIR
		default:
			return linuxerr.EINVAL
		}
	}
	if mask&linux.STATX_UID != 0 {
		i.uid.Store(stat.UID)
		needsCtimeBump = true
		clearSID = true
	}
	if mask&linux.STATX_GID != 0 {
		i.gid.Store(stat.GID)
		needsCtimeBump = true
		clearSID = true
	}
	if mask&linux.STATX_MODE != 0 {
		// Replace the permission bits while preserving the file type. The
		// compare-and-swap loop retries if the mode changed underneath us.
		for {
			old := i.mode.Load()
			ft := old & linux.S_IFMT
			newMode := ft | uint32(stat.Mode & ^uint16(linux.S_IFMT))
			if clearSID {
				newMode = vfs.ClearSUIDAndSGID(newMode)
			}
			if swapped := i.mode.CompareAndSwap(old, newMode); swapped {
				// The bits were cleared as part of this update (if
				// required), so the fallback below is not needed.
				clearSID = false
				break
			}
		}
		needsCtimeBump = true
	}
	// A single timestamp is used for every "now" in this call.
	now := i.fs.clock.Now().Nanoseconds()
	if mask&linux.STATX_ATIME != 0 {
		if stat.Atime.Nsec == linux.UTIME_NOW {
			i.atime.Store(now)
		} else {
			i.atime.Store(stat.Atime.ToNsecCapped())
		}
		needsCtimeBump = true
	}
	if mask&linux.STATX_MTIME != 0 {
		if stat.Mtime.Nsec == linux.UTIME_NOW {
			i.mtime.Store(now)
		} else {
			i.mtime.Store(stat.Mtime.ToNsecCapped())
		}
		needsCtimeBump = true
		// Ignore the mtime bump, since we just set it ourselves.
		needsMtimeBump = false
	}
	if mask&linux.STATX_CTIME != 0 {
		if stat.Ctime.Nsec == linux.UTIME_NOW {
			i.ctime.Store(now)
		} else {
			i.ctime.Store(stat.Ctime.ToNsecCapped())
		}
		// Ignore the ctime bump, since we just set it ourselves.
		needsCtimeBump = false
	}

	// We may have to clear the SUID/SGID bits, but didn't do so as part of
	// STATX_MODE.
	if clearSID {
		for {
			old := i.mode.Load()
			newMode := vfs.ClearSUIDAndSGID(old)
			if swapped := i.mode.CompareAndSwap(old, newMode); swapped {
				break
			}
		}
		// Note that this re-arms the ctime bump even if the caller supplied
		// an explicit ctime above.
		needsCtimeBump = true
	}

	if needsMtimeBump {
		i.mtime.Store(now)
	}
	if needsCtimeBump {
		i.ctime.Store(now)
	}

	return nil
}
   758  
   759  // allocatedBlocksForSize returns the number of 512B blocks needed to
   760  // accommodate the given size in bytes, as appropriate for struct
   761  // stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block
   762  // size is independent of the "preferred block size for I/O", struct
   763  // stat::st_blksize and struct statx::stx_blksize.)
   764  func allocatedBlocksForSize(size uint64) uint64 {
   765  	return (size + 511) / 512
   766  }
   767  
   768  func (i *inode) direntType() uint8 {
   769  	switch impl := i.impl.(type) {
   770  	case *regularFile:
   771  		return linux.DT_REG
   772  	case *directory:
   773  		return linux.DT_DIR
   774  	case *symlink:
   775  		return linux.DT_LNK
   776  	case *socketFile:
   777  		return linux.DT_SOCK
   778  	case *namedPipe:
   779  		return linux.DT_FIFO
   780  	case *deviceFile:
   781  		switch impl.kind {
   782  		case vfs.BlockDevice:
   783  			return linux.DT_BLK
   784  		case vfs.CharDevice:
   785  			return linux.DT_CHR
   786  		default:
   787  			panic(fmt.Sprintf("unknown vfs.DeviceKind: %v", impl.kind))
   788  		}
   789  	default:
   790  		panic(fmt.Sprintf("unknown inode type: %T", i.impl))
   791  	}
   792  }
   793  
   794  func (i *inode) isDir() bool {
   795  	mode := linux.FileMode(i.mode.Load())
   796  	return mode.FileType() == linux.S_IFDIR
   797  }
   798  
   799  func (i *inode) touchAtime(mnt *vfs.Mount) {
   800  	if mnt.Flags.NoATime {
   801  		return
   802  	}
   803  	if err := mnt.CheckBeginWrite(); err != nil {
   804  		return
   805  	}
   806  	now := i.fs.clock.Now().Nanoseconds()
   807  	i.mu.Lock()
   808  	i.atime.Store(now)
   809  	i.mu.Unlock()
   810  	mnt.EndWrite()
   811  }
   812  
   813  // Preconditions: The caller has called vfs.Mount.CheckBeginWrite().
   814  func (i *inode) touchCtime() {
   815  	now := i.fs.clock.Now().Nanoseconds()
   816  	i.mu.Lock()
   817  	i.ctime.Store(now)
   818  	i.mu.Unlock()
   819  }
   820  
   821  // Preconditions: The caller has called vfs.Mount.CheckBeginWrite().
   822  func (i *inode) touchCMtime() {
   823  	now := i.fs.clock.Now().Nanoseconds()
   824  	i.mu.Lock()
   825  	i.mtime.Store(now)
   826  	i.ctime.Store(now)
   827  	i.mu.Unlock()
   828  }
   829  
   830  // Preconditions:
   831  //   - The caller has called vfs.Mount.CheckBeginWrite().
   832  //   - inode.mu must be locked.
   833  func (i *inode) touchCMtimeLocked() {
   834  	now := i.fs.clock.Now().Nanoseconds()
   835  	i.mtime.Store(now)
   836  	i.ctime.Store(now)
   837  }
   838  
   839  func checkXattrName(name string) error {
   840  	// Linux's tmpfs supports "security" and "trusted" xattr namespaces, and
   841  	// (depending on build configuration) POSIX ACL xattr namespaces
   842  	// ("system.posix_acl_access" and "system.posix_acl_default"). We don't
   843  	// support POSIX ACLs or the "security" namespace (b/148380782).
   844  	if strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) {
   845  		return nil
   846  	}
   847  	// We support the "user" namespace because we have tests that depend on
   848  	// this feature.
   849  	if strings.HasPrefix(name, linux.XATTR_USER_PREFIX) {
   850  		return nil
   851  	}
   852  	return linuxerr.EOPNOTSUPP
   853  }
   854  
   855  func (i *inode) listXattr(creds *auth.Credentials, size uint64) ([]string, error) {
   856  	return i.xattrs.ListXattr(creds, size)
   857  }
   858  
   859  func (i *inode) getXattr(creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
   860  	if err := checkXattrName(opts.Name); err != nil {
   861  		return "", err
   862  	}
   863  	mode := linux.FileMode(i.mode.Load())
   864  	kuid := auth.KUID(i.uid.Load())
   865  	kgid := auth.KGID(i.gid.Load())
   866  	if err := vfs.GenericCheckPermissions(creds, vfs.MayRead, mode, kuid, kgid); err != nil {
   867  		return "", err
   868  	}
   869  	return i.xattrs.GetXattr(creds, mode, kuid, opts)
   870  }
   871  
   872  func (i *inode) setXattr(creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
   873  	if err := checkXattrName(opts.Name); err != nil {
   874  		return err
   875  	}
   876  	mode := linux.FileMode(i.mode.Load())
   877  	kuid := auth.KUID(i.uid.Load())
   878  	kgid := auth.KGID(i.gid.Load())
   879  	if err := vfs.GenericCheckPermissions(creds, vfs.MayWrite, mode, kuid, kgid); err != nil {
   880  		return err
   881  	}
   882  	return i.xattrs.SetXattr(creds, mode, kuid, opts)
   883  }
   884  
   885  func (i *inode) removeXattr(creds *auth.Credentials, name string) error {
   886  	if err := checkXattrName(name); err != nil {
   887  		return err
   888  	}
   889  	mode := linux.FileMode(i.mode.Load())
   890  	kuid := auth.KUID(i.uid.Load())
   891  	kgid := auth.KGID(i.gid.Load())
   892  	if err := vfs.GenericCheckPermissions(creds, vfs.MayWrite, mode, kuid, kgid); err != nil {
   893  		return err
   894  	}
   895  	return i.xattrs.RemoveXattr(creds, mode, kuid, name)
   896  }
   897  
// fileDescription is embedded by tmpfs implementations of
// vfs.FileDescriptionImpl. Its methods provide access to the tmpfs
// filesystem, dentry and inode behind the file description, along with stat,
// xattr and sync operations.
//
// +stateify savable
type fileDescription struct {
	// vfsfd is the vfs.FileDescription whose Mount and Dentry are used to
	// locate the tmpfs filesystem, dentry and inode for this file
	// description.
	vfsfd vfs.FileDescription
	// NOTE(review): presumably embedded to supply default implementations of
	// vfs.FileDescriptionImpl methods not defined on fileDescription; confirm
	// against the vfs package.
	vfs.FileDescriptionDefaultImpl
	// NOTE(review): presumably embedded to supply file locking support;
	// confirm against the vfs package.
	vfs.LockFD
}
   907  
   908  func (fd *fileDescription) filesystem() *filesystem {
   909  	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
   910  }
   911  
   912  func (fd *fileDescription) dentry() *dentry {
   913  	return fd.vfsfd.Dentry().Impl().(*dentry)
   914  }
   915  
   916  func (fd *fileDescription) inode() *inode {
   917  	return fd.dentry().inode
   918  }
   919  
   920  // Stat implements vfs.FileDescriptionImpl.Stat.
   921  func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
   922  	var stat linux.Statx
   923  	fd.inode().statTo(&stat)
   924  	return stat, nil
   925  }
   926  
   927  // SetStat implements vfs.FileDescriptionImpl.SetStat.
   928  func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
   929  	return fd.dentry().inode.setStat(ctx, auth.CredentialsFromContext(ctx), &opts)
   930  }
   931  
   932  // StatFS implements vfs.FileDescriptionImpl.StatFS.
   933  func (fd *fileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
   934  	return fd.filesystem().statFS(), nil
   935  }
   936  
   937  // ListXattr implements vfs.FileDescriptionImpl.ListXattr.
   938  func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
   939  	return fd.inode().listXattr(auth.CredentialsFromContext(ctx), size)
   940  }
   941  
   942  // GetXattr implements vfs.FileDescriptionImpl.GetXattr.
   943  func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
   944  	return fd.inode().getXattr(auth.CredentialsFromContext(ctx), &opts)
   945  }
   946  
   947  // SetXattr implements vfs.FileDescriptionImpl.SetXattr.
   948  func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
   949  	return fd.dentry().inode.setXattr(auth.CredentialsFromContext(ctx), &opts)
   950  }
   951  
   952  // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
   953  func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
   954  	return fd.dentry().inode.removeXattr(auth.CredentialsFromContext(ctx), name)
   955  }
   956  
   957  // Sync implements vfs.FileDescriptionImpl.Sync. It does nothing because all
   958  // filesystem state is in-memory.
   959  func (*fileDescription) Sync(context.Context) error {
   960  	return nil
   961  }
   962  
   963  // parseSize converts size in string to an integer bytes.
   964  // Supported suffixes in string are:K, M, G, T, P, E.
   965  func parseSize(s string) (uint64, error) {
   966  	if len(s) == 0 {
   967  		return 0, fmt.Errorf("size parameter empty")
   968  	}
   969  	suffix := s[len(s)-1]
   970  	count := 1
   971  	switch suffix {
   972  	case 'e', 'E':
   973  		count = count << 10
   974  		fallthrough
   975  	case 'p', 'P':
   976  		count = count << 10
   977  		fallthrough
   978  	case 't', 'T':
   979  		count = count << 10
   980  		fallthrough
   981  	case 'g', 'G':
   982  		count = count << 10
   983  		fallthrough
   984  	case 'm', 'M':
   985  		count = count << 10
   986  		fallthrough
   987  	case 'k', 'K':
   988  		count = count << 10
   989  		s = s[:len(s)-1]
   990  	}
   991  	byteTmp, err := strconv.ParseUint(s, 10, 64)
   992  	if err != nil {
   993  		return 0, linuxerr.EINVAL
   994  	}
   995  	// Check for overflow.
   996  	bytes := byteTmp * uint64(count)
   997  	if byteTmp != 0 && bytes/byteTmp != uint64(count) {
   998  		return 0, fmt.Errorf("size overflow")
   999  	}
  1000  	return bytes, err
  1001  }