github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/gofer/gofer.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package gofer provides a filesystem implementation that is backed by a 9p
    16  // server, interchangeably referred to as "gofers" throughout this package.
    17  //
    18  // Lock order:
    19  //
    20  //	regularFileFD/directoryFD.mu
    21  //	  filesystem.renameMu
    22  //	    dentry.cachingMu
    23  //	      dentryCache.mu
    24  //	      dentry.opMu
    25  //	        dentry.childrenMu
    26  //	        filesystem.syncMu
    27  //	        dentry.metadataMu
    28  //	          *** "memmap.Mappable locks" below this point
    29  //	          dentry.mapsMu
    30  //	            *** "memmap.Mappable locks taken by Translate" below this point
    31  //	            dentry.handleMu
    32  //	              dentry.dataMu
    33  //	          filesystem.inoMu
    34  //	specialFileFD.mu
    35  //	  specialFileFD.bufMu
    36  //
    37  // Locking dentry.opMu and dentry.metadataMu in multiple dentries requires that
    38  // either ancestor dentries are locked before descendant dentries, or that
    39  // filesystem.renameMu is locked for writing.
    40  package gofer
    41  
    42  import (
    43  	"fmt"
    44  	"path"
    45  	"strconv"
    46  	"strings"
    47  	"sync/atomic"
    48  
    49  	"golang.org/x/sys/unix"
    50  	"github.com/metacubex/gvisor/pkg/abi/linux"
    51  	"github.com/metacubex/gvisor/pkg/atomicbitops"
    52  	"github.com/metacubex/gvisor/pkg/cleanup"
    53  	"github.com/metacubex/gvisor/pkg/context"
    54  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    55  	"github.com/metacubex/gvisor/pkg/hostarch"
    56  	"github.com/metacubex/gvisor/pkg/lisafs"
    57  	"github.com/metacubex/gvisor/pkg/log"
    58  	"github.com/metacubex/gvisor/pkg/refs"
    59  	fslock "github.com/metacubex/gvisor/pkg/sentry/fsimpl/lock"
    60  	"github.com/metacubex/gvisor/pkg/sentry/fsutil"
    61  	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
    62  	"github.com/metacubex/gvisor/pkg/sentry/kernel/pipe"
    63  	ktime "github.com/metacubex/gvisor/pkg/sentry/kernel/time"
    64  	"github.com/metacubex/gvisor/pkg/sentry/memmap"
    65  	"github.com/metacubex/gvisor/pkg/sentry/pgalloc"
    66  	"github.com/metacubex/gvisor/pkg/sentry/socket/unix/transport"
    67  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    68  	"github.com/metacubex/gvisor/pkg/sync"
    69  	"github.com/metacubex/gvisor/pkg/unet"
    70  )
    71  
// Name is the default filesystem name. It is the value returned by
// FilesystemType.Name.
const Name = "9p"
    74  
// Mount option names for goferfs.
//
// As parsed by GetFilesystem and getFDFromMountOptionsMap:
//
//   - "trans" must be present and equal to transportModeFD.
//   - "rfdno" and "wfdno" must both be present, parse as integers and be equal;
//     their common value is the FD used to talk to the server.
//   - "aname" is the attach path; it must be absolute and defaults to "/".
//   - "dfltuid" and "dfltgid" are base-10 32-bit unsigned IDs.
//   - "cache" selects the InteropMode (see the cache* constants).
//   - The remaining options are flags: only their presence is checked.
const (
	moptTransport                = "trans"
	moptReadFD                   = "rfdno"
	moptWriteFD                  = "wfdno"
	moptAname                    = "aname"
	moptDfltUID                  = "dfltuid"
	moptDfltGID                  = "dfltgid"
	moptCache                    = "cache"
	moptForcePageCache           = "force_page_cache"
	moptLimitHostFDTranslation   = "limit_host_fd_translation"
	moptOverlayfsStaleRead       = "overlayfs_stale_read"
	moptDisableFileHandleSharing = "disable_file_handle_sharing"
	moptDisableFifoOpen          = "disable_fifo_open"

	// Directfs options.
	moptDirectfs = "directfs"
)
    93  
// Valid values for the "cache" mount option.
//
// GetFilesystem maps them to InteropModeExclusive, InteropModeWritethrough and
// InteropModeShared respectively; any other value is rejected with EINVAL.
const (
	cacheFSCache             = "fscache"
	cacheFSCacheWritethrough = "fscache_writethrough"
	cacheRemoteRevalidating  = "remote_revalidating"
)
   100  
// SupportedMountOptions is the set of mount options that can be set externally.
// Both entries are presence-only flags as far as GetFilesystem is concerned.
var SupportedMountOptions = []string{moptOverlayfsStaleRead, moptDisableFileHandleSharing}
   103  
// Default cache sizing.
//
// defaultMaxCachedDentries is the maxCachedDentries value GetFilesystem uses
// for a filesystem-private dentry cache when no global dentry cache has been
// configured. maxCachedNegativeChildren is not referenced in this part of the
// file; presumably it bounds the per-directory negative-name cache (see
// stringFixedCache) -- TODO confirm against its users.
const (
	defaultMaxCachedDentries  = 1000
	maxCachedNegativeChildren = 1000
)
   108  
// stringFixedCache is a fixed sized cache, once initialized,
// its size never changes.
//
// The zero value is uninitialized (isInited reports false); init allocates all
// list elements up front and add recycles the tail element instead of
// allocating. size records the number of elements passed to init.
//
// +stateify savable
type stringFixedCache struct {
	// namesList stores negative names with fifo list.
	// name stored in namesList only means it used to be negative
	// at the moment you pushed it to the list.
	namesList stringList
	size      uint64
}
   120  
   121  func (cache *stringFixedCache) isInited() bool {
   122  	return cache.size != 0
   123  }
   124  
   125  func (cache *stringFixedCache) init(size uint64) {
   126  	elements := make([]stringListElem, size)
   127  	for i := uint64(0); i < size; i++ {
   128  		cache.namesList.PushFront(&elements[i])
   129  	}
   130  	cache.size = size
   131  }
   132  
   133  // Update will push name to the front of the list,
   134  // and pop the tail value.
   135  func (cache *stringFixedCache) add(name string) string {
   136  	tail := cache.namesList.Back()
   137  	victimName := tail.str
   138  	tail.str = name
   139  	cache.namesList.Remove(tail)
   140  	cache.namesList.PushFront(tail)
   141  	return victimName
   142  }
   143  
// dentryCache is a cache of dentries. A single instance may be shared by
// every gofer filesystem (see globalDentryCache), or be private to one
// filesystem when no global cache has been configured.
//
// +stateify savable
type dentryCache struct {
	// mu protects the below fields.
	mu sync.Mutex `state:"nosave"`
	// dentries contains all dentries with 0 references. Due to race conditions,
	// it may also contain dentries with non-zero references.
	dentries dentryList
	// dentriesLen is the number of dentries in dentries.
	dentriesLen uint64
	// maxCachedDentries is the maximum number of cacheable dentries.
	maxCachedDentries uint64
}
   156  
   157  // SetDentryCacheSize sets the size of the global gofer dentry cache.
   158  func SetDentryCacheSize(size int) {
   159  	if size < 0 {
   160  		return
   161  	}
   162  	if globalDentryCache != nil {
   163  		log.Warningf("Global dentry cache has already been initialized. Ignoring subsequent attempt.")
   164  		return
   165  	}
   166  	globalDentryCache = &dentryCache{maxCachedDentries: uint64(size)}
   167  }
   168  
// globalDentryCache is a global cache of dentries across all gofers. It is
// nil until SetDentryCacheSize creates it; while it is nil, GetFilesystem
// gives each filesystem its own cache instead.
var globalDentryCache *dentryCache
   171  
// Valid values for "trans" mount option. "fd" is the only transport
// accepted by getFDFromMountOptionsMap.
const transportModeFD = "fd"
   174  
// FilesystemType implements vfs.FilesystemType.
//
// It carries no state; all per-mount state lives in the filesystem created
// by GetFilesystem.
//
// +stateify savable
type FilesystemType struct{}
   179  
// filesystem implements vfs.FilesystemImpl.
//
// +stateify savable
type filesystem struct {
	// vfsfs is the vfs.Filesystem backed by this implementation. It is
	// initialized by GetFilesystem, which returns a pointer to it.
	vfsfs vfs.Filesystem

	// mf is used to allocate memory that caches regular file contents. mf is
	// immutable.
	mf *pgalloc.MemoryFile `state:"nosave"`

	// Immutable options. opts holds the parsed mount options; iopts holds the
	// options passed through vfs.GetFilesystemOptions.InternalData.
	opts  filesystemOptions
	iopts InternalFilesystemOptions

	// client is the LISAFS client used for communicating with the server. client
	// is immutable.
	client *lisafs.Client `state:"nosave"`

	// clock is a realtime clock used to set timestamps in file operations.
	clock ktime.Clock

	// devMinor is the filesystem's minor device number. devMinor is immutable.
	devMinor uint32

	// root is the root dentry. root is immutable.
	root *dentry

	// renameMu serves two purposes:
	//
	//	- It synchronizes path resolution with renaming initiated by this
	//		client.
	//
	//	- It is held by path resolution to ensure that reachable dentries remain
	//		valid. A dentry is reachable by path resolution if it has a non-zero
	//		reference count (such that it is usable as vfs.ResolvingPath.Start() or
	//		is reachable from its children), or if it is a child dentry (such that
	//		it is reachable from its parent).
	renameMu sync.RWMutex `state:"nosave"`

	// dentryCache is the dentry cache used by this filesystem: the global
	// cache if one was configured via SetDentryCacheSize, otherwise a cache
	// private to this filesystem.
	dentryCache *dentryCache

	// syncableDentries contains all non-synthetic dentries. specialFileFDs
	// contains all open specialFileFDs. These fields are protected by syncMu.
	syncMu           sync.Mutex `state:"nosave"`
	syncableDentries dentryList
	specialFileFDs   specialFDList

	// inoByKey maps previously-observed device ID and host inode numbers to
	// internal inode numbers assigned to those files. inoByKey is not preserved
	// across checkpoint/restore because inode numbers may be reused between
	// different gofer processes, so inode numbers may be repeated for different
	// files across checkpoint/restore. inoByKey is protected by inoMu.
	inoMu    sync.Mutex        `state:"nosave"`
	inoByKey map[inoKey]uint64 `state:"nosave"`

	// lastIno is the last inode number assigned to a file. lastIno is accessed
	// using atomic memory operations.
	lastIno atomicbitops.Uint64

	// savedDentryRW records open read/write handles during save/restore.
	savedDentryRW map[*dentry]savedDentryRW

	// released is nonzero once filesystem.Release has been called.
	released atomicbitops.Int32
}
   245  
// filesystemOptions holds the mount options parsed by GetFilesystem.
//
// +stateify savable
type filesystemOptions struct {
	// fd is the server connection FD taken from the "rfdno"/"wfdno" options.
	// aname is the cleaned, absolute attach path ("/" by default). dfltuid and
	// dfltgid come from the "dfltuid"/"dfltgid" options and default to
	// _V9FS_DEFUID and _V9FS_DEFGID.
	fd      int
	aname   string
	interop InteropMode // derived from the "cache" mount option
	dfltuid auth.KUID
	dfltgid auth.KGID

	// If forcePageCache is true, host FDs may not be used for application
	// memory mappings even if available; instead, the client must perform its
	// own caching of regular file pages. This is primarily useful for testing.
	forcePageCache bool

	// If limitHostFDTranslation is true, apply maxFillRange() constraints to
	// host FD mappings returned by dentry.(memmap.Mappable).Translate(). This
	// makes memory accounting behavior more consistent between cases where
	// host FDs are / are not available, but may increase the frequency of
	// sentry-handled page faults on files for which a host FD is available.
	limitHostFDTranslation bool

	// If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote
	// filesystem may not be coherent with writable host FDs opened later, so
	// all uses of the former must be replaced by uses of the latter. This is
	// usually only the case when the remote filesystem is a Linux overlayfs
	// mount. (Prior to Linux 4.18, patch series centered on commit
	// d1d04ef8572b "ovl: stack file ops", both I/O and memory mappings were
	// incoherent between pre-copy-up and post-copy-up FDs; after that patch
	// series, only memory mappings are incoherent.)
	overlayfsStaleRead bool

	// If regularFilesUseSpecialFileFD is true, application FDs representing
	// regular files will use distinct file handles for each FD, in the same
	// way that application FDs representing "special files" such as sockets
	// do. Note that this disables client caching for regular files. This option
	// may regress performance due to excessive Open RPCs. This option is not
	// supported with overlayfsStaleRead for now.
	//
	// It is set by the "disable_file_handle_sharing" mount option.
	regularFilesUseSpecialFileFD bool

	// If disableFifoOpen is true, application attempts to open(2) a host FIFO
	// are disallowed.
	disableFifoOpen bool

	// directfs holds options for directfs mode.
	directfs directfsOpts
}
   291  
// directfsOpts holds the options for directfs mode.
//
// +stateify savable
type directfsOpts struct {
	// If directfs is enabled, the gofer client does not make RPCs to the gofer
	// process. Instead, it makes host syscalls to perform file operations.
	//
	// enabled is set by the presence of the "directfs" mount option.
	enabled bool
}
   298  
// InteropMode controls the client's interaction with other remote filesystem
// users.
//
// It is derived from the "cache" mount option by GetFilesystem and defaults
// to InteropModeExclusive when that option is absent.
//
// +stateify savable
type InteropMode uint32
   304  
// Valid InteropMode values. InteropModeExclusive is the zero value.
const (
	// InteropModeExclusive is appropriate when the filesystem client is the
	// only user of the remote filesystem.
	//
	//	- The client may cache arbitrary filesystem state (file data, metadata,
	//		filesystem structure, etc.).
	//
	//	- Client changes to filesystem state may be sent to the remote
	//		filesystem asynchronously, except when server permission checks are
	//		necessary.
	//
	//	- File timestamps are based on client clocks. This ensures that users of
	//		the client observe timestamps that are coherent with their own clocks
	//		and consistent with Linux's semantics (in particular, it is not always
	//		possible for clients to set arbitrary atimes and mtimes depending on the
	//		remote filesystem implementation, and never possible for clients to set
	//		arbitrary ctimes.)
	InteropModeExclusive InteropMode = iota

	// InteropModeWritethrough is appropriate when there are read-only users of
	// the remote filesystem that expect to observe changes made by the
	// filesystem client.
	//
	//	- The client may cache arbitrary filesystem state.
	//
	//	- Client changes to filesystem state must be sent to the remote
	//		filesystem synchronously.
	//
	//	- File timestamps are based on client clocks. As a corollary, access
	//		timestamp changes from other remote filesystem users will not be visible
	//		to the client.
	InteropModeWritethrough

	// InteropModeShared is appropriate when there are users of the remote
	// filesystem that may mutate its state other than the client.
	//
	//	- The client must verify ("revalidate") cached filesystem state before
	//		using it.
	//
	//	- Client changes to filesystem state must be sent to the remote
	//		filesystem synchronously.
	//
	//	- File timestamps are based on server clocks. This is necessary to
	//		ensure that timestamp changes are synchronized between remote filesystem
	//		users.
	//
	// Note that the correctness of InteropModeShared depends on the server
	// correctly implementing 9P fids (i.e. each fid immutably represents a
	// single filesystem object), even in the presence of remote filesystem
	// mutations from other users. If this is violated, the behavior of the
	// client is undefined.
	InteropModeShared
)
   358  
// InternalFilesystemOptions may be passed as
// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
//
// GetFilesystem accepts either nil InternalData (the zero value of this type
// is then used) or a value of exactly this type; anything else is rejected
// with EINVAL.
//
// +stateify savable
type InternalFilesystemOptions struct {
	// If UniqueID is non-empty, it is an opaque string used to reassociate the
	// filesystem with a new server FD during restoration from checkpoint.
	UniqueID vfs.RestoreID

	// If LeakConnection is true, do not close the connection to the server
	// when the Filesystem is released. This is necessary for deployments in
	// which servers can handle only a single client and report failure if that
	// client disconnects.
	LeakConnection bool

	// If OpenSocketsByConnecting is true, silently translate attempts to open
	// files identifying as sockets to connect RPCs.
	OpenSocketsByConnecting bool
}
   378  
// _V9FS_DEFUID and _V9FS_DEFGID (from Linux's fs/9p/v9fs.h) are the default
// UIDs and GIDs used for files that do not provide a specific owner or group
// respectively.
//
// GetFilesystem uses them when the "dfltuid" / "dfltgid" mount options are
// absent.
const (
	// uint32(-2) doesn't work in Go.
	// 4294967294 is that same value (2^32 - 2) written out explicitly.
	_V9FS_DEFUID = auth.KUID(4294967294)
	_V9FS_DEFGID = auth.KGID(4294967294)
)
   387  
// Name implements vfs.FilesystemType.Name.
//
// It returns the package-level Name constant ("9p").
func (FilesystemType) Name() string {
	return Name
}
   392  
// Release implements vfs.FilesystemType.Release.
//
// It is a no-op: FilesystemType holds no resources.
func (FilesystemType) Release(ctx context.Context) {}
   395  
   396  // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
   397  func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
   398  	mf := pgalloc.MemoryFileFromContext(ctx)
   399  	if mf == nil {
   400  		ctx.Warningf("gofer.FilesystemType.GetFilesystem: CtxMemoryFile is nil")
   401  		return nil, nil, linuxerr.EINVAL
   402  	}
   403  
   404  	mopts := vfs.GenericParseMountOptions(opts.Data)
   405  	var fsopts filesystemOptions
   406  
   407  	fd, err := getFDFromMountOptionsMap(ctx, mopts)
   408  	if err != nil {
   409  		return nil, nil, err
   410  	}
   411  	fsopts.fd = fd
   412  
   413  	// Get the attach name.
   414  	fsopts.aname = "/"
   415  	if aname, ok := mopts[moptAname]; ok {
   416  		delete(mopts, moptAname)
   417  		if !path.IsAbs(aname) {
   418  			ctx.Warningf("gofer.FilesystemType.GetFilesystem: aname is not absolute: %s=%s", moptAname, aname)
   419  			return nil, nil, linuxerr.EINVAL
   420  		}
   421  		fsopts.aname = path.Clean(aname)
   422  	}
   423  
   424  	// Parse the cache policy. For historical reasons, this defaults to the
   425  	// least generally-applicable option, InteropModeExclusive.
   426  	fsopts.interop = InteropModeExclusive
   427  	if cache, ok := mopts[moptCache]; ok {
   428  		delete(mopts, moptCache)
   429  		switch cache {
   430  		case cacheFSCache:
   431  			fsopts.interop = InteropModeExclusive
   432  		case cacheFSCacheWritethrough:
   433  			fsopts.interop = InteropModeWritethrough
   434  		case cacheRemoteRevalidating:
   435  			fsopts.interop = InteropModeShared
   436  		default:
   437  			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid cache policy: %s=%s", moptCache, cache)
   438  			return nil, nil, linuxerr.EINVAL
   439  		}
   440  	}
   441  
   442  	// Parse the default UID and GID.
   443  	fsopts.dfltuid = _V9FS_DEFUID
   444  	if dfltuidstr, ok := mopts[moptDfltUID]; ok {
   445  		delete(mopts, moptDfltUID)
   446  		dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32)
   447  		if err != nil {
   448  			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: %s=%s", moptDfltUID, dfltuidstr)
   449  			return nil, nil, linuxerr.EINVAL
   450  		}
   451  		// In Linux, dfltuid is interpreted as a UID and is converted to a KUID
   452  		// in the caller's user namespace, but goferfs isn't
   453  		// application-mountable.
   454  		fsopts.dfltuid = auth.KUID(dfltuid)
   455  	}
   456  	fsopts.dfltgid = _V9FS_DEFGID
   457  	if dfltgidstr, ok := mopts[moptDfltGID]; ok {
   458  		delete(mopts, moptDfltGID)
   459  		dfltgid, err := strconv.ParseUint(dfltgidstr, 10, 32)
   460  		if err != nil {
   461  			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: %s=%s", moptDfltGID, dfltgidstr)
   462  			return nil, nil, linuxerr.EINVAL
   463  		}
   464  		fsopts.dfltgid = auth.KGID(dfltgid)
   465  	}
   466  
   467  	// Handle simple flags.
   468  	if _, ok := mopts[moptDisableFileHandleSharing]; ok {
   469  		delete(mopts, moptDisableFileHandleSharing)
   470  		fsopts.regularFilesUseSpecialFileFD = true
   471  	}
   472  	if _, ok := mopts[moptDisableFifoOpen]; ok {
   473  		delete(mopts, moptDisableFifoOpen)
   474  		fsopts.disableFifoOpen = true
   475  	}
   476  	if _, ok := mopts[moptForcePageCache]; ok {
   477  		delete(mopts, moptForcePageCache)
   478  		fsopts.forcePageCache = true
   479  	}
   480  	if _, ok := mopts[moptLimitHostFDTranslation]; ok {
   481  		delete(mopts, moptLimitHostFDTranslation)
   482  		fsopts.limitHostFDTranslation = true
   483  	}
   484  	if _, ok := mopts[moptOverlayfsStaleRead]; ok {
   485  		delete(mopts, moptOverlayfsStaleRead)
   486  		fsopts.overlayfsStaleRead = true
   487  	}
   488  	if _, ok := mopts[moptDirectfs]; ok {
   489  		delete(mopts, moptDirectfs)
   490  		fsopts.directfs.enabled = true
   491  	}
   492  	// fsopts.regularFilesUseSpecialFileFD can only be enabled by specifying
   493  	// "cache=none".
   494  
   495  	// Check for unparsed options.
   496  	if len(mopts) != 0 {
   497  		ctx.Warningf("gofer.FilesystemType.GetFilesystem: unknown options: %v", mopts)
   498  		return nil, nil, linuxerr.EINVAL
   499  	}
   500  
   501  	// Validation.
   502  	if fsopts.regularFilesUseSpecialFileFD && fsopts.overlayfsStaleRead {
   503  		// These options are not supported together. To support this, when a dentry
   504  		// is opened writably for the first time, we need to iterate over all the
   505  		// specialFileFDs of that dentry that represent a regular file and call
   506  		// fd.hostFileMapper.RegenerateMappings(writable_fd).
   507  		ctx.Warningf("gofer.FilesystemType.GetFilesystem: regularFilesUseSpecialFileFD and overlayfsStaleRead options are not supported together.")
   508  		return nil, nil, linuxerr.EINVAL
   509  	}
   510  
   511  	// Handle internal options.
   512  	iopts, ok := opts.InternalData.(InternalFilesystemOptions)
   513  	if opts.InternalData != nil && !ok {
   514  		ctx.Warningf("gofer.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted gofer.InternalFilesystemOptions", opts.InternalData)
   515  		return nil, nil, linuxerr.EINVAL
   516  	}
   517  	// If !ok, iopts being the zero value is correct.
   518  
   519  	// Construct the filesystem object.
   520  	devMinor, err := vfsObj.GetAnonBlockDevMinor()
   521  	if err != nil {
   522  		return nil, nil, err
   523  	}
   524  	fs := &filesystem{
   525  		mf:       mf,
   526  		opts:     fsopts,
   527  		iopts:    iopts,
   528  		clock:    ktime.RealtimeClockFromContext(ctx),
   529  		devMinor: devMinor,
   530  		inoByKey: make(map[inoKey]uint64),
   531  	}
   532  
   533  	// Did the user configure a global dentry cache?
   534  	if globalDentryCache != nil {
   535  		fs.dentryCache = globalDentryCache
   536  	} else {
   537  		fs.dentryCache = &dentryCache{maxCachedDentries: defaultMaxCachedDentries}
   538  	}
   539  
   540  	fs.vfsfs.Init(vfsObj, &fstype, fs)
   541  
   542  	rootInode, rootHostFD, err := fs.initClientAndGetRoot(ctx)
   543  	if err != nil {
   544  		fs.vfsfs.DecRef(ctx)
   545  		return nil, nil, err
   546  	}
   547  	if fs.opts.directfs.enabled {
   548  		fs.root, err = fs.getDirectfsRootDentry(ctx, rootHostFD, fs.client.NewFD(rootInode.ControlFD))
   549  	} else {
   550  		fs.root, err = fs.newLisafsDentry(ctx, &rootInode)
   551  	}
   552  	if err != nil {
   553  		fs.vfsfs.DecRef(ctx)
   554  		return nil, nil, err
   555  	}
   556  	// Set the root's reference count to 2. One reference is returned to the
   557  	// caller, and the other is held by fs to prevent the root from being "cached"
   558  	// and subsequently evicted.
   559  	fs.root.refs = atomicbitops.FromInt64(2)
   560  	return &fs.vfsfs, &fs.root.vfsd, nil
   561  }
   562  
// initClientAndGetRoot initializes fs.client and returns the root inode for
// this mount point. It handles the attach point (fs.opts.aname) resolution.
//
// The second return value is the host FD for the mount point. It is only
// non-negative in directfs mode; in every other case, including all error
// returns, it is -1.
func (fs *filesystem) initClientAndGetRoot(ctx context.Context) (lisafs.Inode, int, error) {
	sock, err := unet.NewSocket(fs.opts.fd)
	if err != nil {
		return lisafs.Inode{}, -1, err
	}

	// The rest of the function runs bracketed by an uninterruptible-sleep
	// start/finish pair on ctx.
	ctx.UninterruptibleSleepStart(false)
	defer ctx.UninterruptibleSleepFinish(false)

	var (
		rootInode  lisafs.Inode
		rootHostFD int
	)
	fs.client, rootInode, rootHostFD, err = lisafs.NewClient(sock)
	if err != nil {
		return lisafs.Inode{}, -1, err
	}

	// On any failure below, close the host FD (if there is one) and the
	// control FD named by rootInode. The closure reads rootHostFD and rootInode
	// when it runs, so it observes later assignments to them. fs.client itself
	// is not closed here.
	cu := cleanup.Make(func() {
		if rootHostFD >= 0 {
			_ = unix.Close(rootHostFD)
		}
		rootControlFD := fs.client.NewFD(rootInode.ControlFD)
		rootControlFD.Close(ctx, false /* flush */)
	})
	defer cu.Clean()

	if fs.opts.directfs.enabled {
		// directfs only supports attaching at "/" and requires the host FD.
		if fs.opts.aname != "/" {
			log.Warningf("directfs does not support aname filesystem option: aname=%q", fs.opts.aname)
			return lisafs.Inode{}, -1, unix.EINVAL
		}
		if rootHostFD < 0 {
			log.Warningf("Mount RPC did not return host FD to mount point with directfs enabled")
			return lisafs.Inode{}, -1, unix.EINVAL
		}
	} else {
		if rootHostFD >= 0 {
			log.Warningf("Mount RPC returned a host FD to mount point without directfs, we didn't ask for it")
			_ = unix.Close(rootHostFD)
			// Mark the FD as gone so the cleanup above does not close it again.
			rootHostFD = -1
		}
		// Use flipcall channels with lisafs because it makes a lot of RPCs.
		if err := fs.client.StartChannels(); err != nil {
			return lisafs.Inode{}, -1, err
		}
		rootInode, err = fs.handleAnameLisafs(ctx, rootInode)
		if err != nil {
			return lisafs.Inode{}, -1, err
		}
	}
	// Success: the caller now owns rootInode and rootHostFD.
	cu.Release()
	return rootInode, rootHostFD, nil
}
   619  
   620  func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int, error) {
   621  	// Check that the transport is "fd".
   622  	trans, ok := mopts[moptTransport]
   623  	if !ok || trans != transportModeFD {
   624  		ctx.Warningf("gofer.getFDFromMountOptionsMap: transport must be specified as '%s=%s'", moptTransport, transportModeFD)
   625  		return -1, linuxerr.EINVAL
   626  	}
   627  	delete(mopts, moptTransport)
   628  
   629  	// Check that read and write FDs are provided and identical.
   630  	rfdstr, ok := mopts[moptReadFD]
   631  	if !ok {
   632  		ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD must be specified as '%s=<file descriptor>'", moptReadFD)
   633  		return -1, linuxerr.EINVAL
   634  	}
   635  	delete(mopts, moptReadFD)
   636  	rfd, err := strconv.Atoi(rfdstr)
   637  	if err != nil {
   638  		ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid read FD: %s=%s", moptReadFD, rfdstr)
   639  		return -1, linuxerr.EINVAL
   640  	}
   641  	wfdstr, ok := mopts[moptWriteFD]
   642  	if !ok {
   643  		ctx.Warningf("gofer.getFDFromMountOptionsMap: write FD must be specified as '%s=<file descriptor>'", moptWriteFD)
   644  		return -1, linuxerr.EINVAL
   645  	}
   646  	delete(mopts, moptWriteFD)
   647  	wfd, err := strconv.Atoi(wfdstr)
   648  	if err != nil {
   649  		ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid write FD: %s=%s", moptWriteFD, wfdstr)
   650  		return -1, linuxerr.EINVAL
   651  	}
   652  	if rfd != wfd {
   653  		ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD (%d) and write FD (%d) must be equal", rfd, wfd)
   654  		return -1, linuxerr.EINVAL
   655  	}
   656  	return rfd, nil
   657  }
   658  
// Release implements vfs.FilesystemImpl.Release.
//
// It marks the filesystem as released, flushes and drops cached file data
// for every syncable dentry, closes their host FDs, optionally drops the
// remaining dentry references (only when leak checking is enabled), closes
// the server connection unless iopts.LeakConnection is set, and finally
// returns the anonymous block device minor number.
func (fs *filesystem) Release(ctx context.Context) {
	fs.released.Store(1)

	mf := fs.mf
	fs.syncMu.Lock()
	for elem := fs.syncableDentries.Front(); elem != nil; elem = elem.Next() {
		d := elem.d
		// handleMu is taken before dataMu, matching the package lock order.
		d.handleMu.Lock()
		d.dataMu.Lock()
		if d.isWriteHandleOk() {
			// Write dirty cached data to the remote file.
			h := d.writeHandle()
			if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil {
				// Best effort: a failed flush is logged and the teardown continues.
				log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err)
			}
			// TODO(jamieliu): Do we need to flushf/fsync d?
		}
		// Discard cached pages.
		d.cache.DropAll(mf)
		d.dirty.RemoveAll()
		d.dataMu.Unlock()
		// Close host FDs if they exist.
		d.closeHostFDs()
		d.handleMu.Unlock()
	}
	// There can't be any specialFileFDs still using fs, since each such
	// FileDescription would hold a reference on a Mount holding a reference on
	// fs.
	fs.syncMu.Unlock()

	// If leak checking is enabled, release all outstanding references in the
	// filesystem. We deliberately avoid doing this outside of leak checking; we
	// have released all external resources above rather than relying on dentry
	// destructors. fs.root may be nil if creating the client or initializing the
	// root dentry failed in GetFilesystem.
	if refs.GetLeakMode() != refs.NoLeakChecking && fs.root != nil {
		fs.renameMu.Lock()
		fs.root.releaseSyntheticRecursiveLocked(ctx)
		fs.evictAllCachedDentriesLocked(ctx)
		fs.renameMu.Unlock()

		// An extra reference was held by the filesystem on the root to prevent it from
		// being cached/evicted.
		fs.root.DecRef(ctx)
	}

	if !fs.iopts.LeakConnection {
		// Close the connection to the server. This implicitly closes all FDs.
		// fs.client is nil if GetFilesystem failed before the client was created.
		if fs.client != nil {
			fs.client.Close()
		}
	}

	fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
}
   715  
   716  // releaseSyntheticRecursiveLocked traverses the tree with root d and decrements
   717  // the reference count on every synthetic dentry. Synthetic dentries have one
   718  // reference for existence that should be dropped during filesystem.Release.
   719  //
   720  // Precondition: d.fs.renameMu is locked for writing.
   721  func (d *dentry) releaseSyntheticRecursiveLocked(ctx context.Context) {
   722  	if d.isSynthetic() {
   723  		d.decRefNoCaching()
   724  		d.checkCachingLocked(ctx, true /* renameMuWriteLocked */)
   725  	}
   726  	if d.isDir() {
   727  		var children []*dentry
   728  		d.childrenMu.Lock()
   729  		for _, child := range d.children {
   730  			children = append(children, child)
   731  		}
   732  		d.childrenMu.Unlock()
   733  		for _, child := range children {
   734  			if child != nil {
   735  				child.releaseSyntheticRecursiveLocked(ctx)
   736  			}
   737  		}
   738  	}
   739  }
   740  
// inoKey is the key used to identify the inode backed by this dentry.
//
// It combines an inode number with the major and minor numbers of the device
// that inode lives on, and is the key type of filesystem.inoByKey.
//
// +stateify savable
type inoKey struct {
	ino      uint64
	devMinor uint32
	devMajor uint32
}
   749  
   750  func inoKeyFromStatx(stat *linux.Statx) inoKey {
   751  	return inoKey{
   752  		ino:      stat.Ino,
   753  		devMinor: stat.DevMinor,
   754  		devMajor: stat.DevMajor,
   755  	}
   756  }
   757  
   758  func inoKeyFromStat(stat *unix.Stat_t) inoKey {
   759  	return inoKey{
   760  		ino:      stat.Ino,
   761  		devMinor: unix.Minor(stat.Dev),
   762  		devMajor: unix.Major(stat.Dev),
   763  	}
   764  }
   765  
// dentry implements vfs.DentryImpl.
//
// +stateify savable
type dentry struct {
	// vfsd is the vfs.Dentry held by value inside this dentry. dentry.init
	// passes the dentry to vfsd.Init.
	vfsd vfs.Dentry

	// refs is the reference count. Each dentry holds a reference on its
	// parent, even if disowned. An additional reference is held on all
	// synthetic dentries until they are unlinked or invalidated. When refs
	// reaches 0, the dentry may be added to the cache or destroyed. If refs ==
	// -1, the dentry has already been destroyed. refs is accessed using atomic
	// memory operations.
	refs atomicbitops.Int64

	// fs is the owning filesystem. fs is immutable.
	fs *filesystem

	// parent is this dentry's parent directory. Each dentry holds a reference
	// on its parent. If this dentry is a filesystem root, parent is nil.
	// parent is protected by filesystem.renameMu.
	parent atomic.Pointer[dentry] `state:".(*dentry)"`

	// name is the name of this dentry in its parent. If this dentry is a
	// filesystem root, name is the empty string. name is protected by
	// filesystem.renameMu.
	name string

	// inoKey is used to identify this dentry's inode.
	inoKey inoKey

	// If deleted is non-zero, the file represented by this dentry has been
	// deleted. deleted is accessed using atomic memory operations.
	deleted atomicbitops.Uint32

	// cachingMu is used to synchronize concurrent dentry caching attempts on
	// this dentry.
	cachingMu sync.Mutex `state:"nosave"`

	// If cached is true, this dentry is part of filesystem.dentryCache. cached
	// is protected by cachingMu.
	cached bool

	// cacheEntry links dentry into filesystem.dentryCache.dentries. It is
	// protected by filesystem.dentryCache.mu.
	cacheEntry dentryListElem

	// syncableListEntry links dentry into filesystem.syncableDentries. It is
	// protected by filesystem.syncMu.
	syncableListEntry dentryListElem

	// opMu synchronizes operations on this dentry. Operations that mutate
	// the dentry tree must hold this lock for writing. Operations that
	// only read the tree must hold for reading.
	opMu sync.RWMutex `state:"nosave"`

	// childrenMu protects the cached children data for this dentry.
	childrenMu sync.Mutex `state:"nosave"`

	// If this dentry represents a directory, children contains:
	//
	//	- Mappings of child filenames to dentries representing those children.
	//
	//	- Mappings of child filenames that are known not to exist to nil
	//		dentries (only if InteropModeShared is not in effect and the directory
	//		is not synthetic).
	//
	// +checklocks:childrenMu
	children map[string]*dentry

	// If this dentry represents a directory, negativeChildrenCache caches
	// the names of negative children. negativeChildrenCache is not saved since
	// dentry.prepareSaveRecursive() drops all negative children.
	//
	// +checklocks:childrenMu
	negativeChildrenCache stringFixedCache `state:"nosave"`
	// If this dentry represents a directory, negativeChildren is the number of
	// negative children cached in dentry.children. negativeChildren is not
	// saved since dentry.prepareSaveRecursive() drops all negative children.
	//
	// +checklocks:childrenMu
	negativeChildren int `state:"nosave"`

	// If this dentry represents a directory, syntheticChildren is the number
	// of child dentries for which dentry.isSynthetic() == true.
	//
	// +checklocks:childrenMu
	syntheticChildren int

	// If this dentry represents a directory,
	// dentry.cachedMetadataAuthoritative() == true, and dirents is not
	// nil, then dirents is a cache of all entries in the directory, in the
	// order they were returned by the server. childrenSet just stores the
	// `Name` field of all dirents in a set for fast query. dirents and
	// childrenSet share the same lifecycle.
	//
	// +checklocks:childrenMu
	dirents []vfs.Dirent `state:"nosave"`
	// +checklocks:childrenMu
	childrenSet map[string]struct{} `state:"nosave"`

	// Cached metadata; protected by metadataMu.
	// To access:
	//   - In situations where consistency is not required (like stat), these
	//     can be accessed using atomic operations only (without locking).
	//   - Lock metadataMu and can access without atomic operations.
	// To mutate:
	//   - Lock metadataMu and use atomic operations to update because we might
	//     have atomic readers that don't hold the lock.
	metadataMu sync.Mutex          `state:"nosave"`
	ino        uint64              // immutable
	mode       atomicbitops.Uint32 // type is immutable, perms are mutable
	uid        atomicbitops.Uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
	gid        atomicbitops.Uint32 // auth.KGID, but ...
	blockSize  atomicbitops.Uint32 // 0 if unknown
	// Timestamps, all nsecs from the Unix epoch.
	atime atomicbitops.Int64
	mtime atomicbitops.Int64
	ctime atomicbitops.Int64
	btime atomicbitops.Int64
	// File size, which differs from other metadata in two ways:
	//
	//	- We make a best-effort attempt to keep it up to date even if
	//		!dentry.cachedMetadataAuthoritative() for the sake of O_APPEND writes.
	//
	//	- size is protected by both metadataMu and dataMu (i.e. both must be
	//		locked to mutate it; locking either is sufficient to access it).
	size atomicbitops.Uint64
	// If this dentry does not represent a synthetic file, deleted is 0, and
	// atimeDirty/mtimeDirty are non-zero, atime/mtime may have diverged from the
	// remote file's timestamps, which should be updated when this dentry is
	// evicted.
	atimeDirty atomicbitops.Uint32
	mtimeDirty atomicbitops.Uint32

	// nlink counts the number of hard links to this dentry. It's updated and
	// accessed using atomic operations. It's not protected by metadataMu like the
	// other metadata fields.
	nlink atomicbitops.Uint32

	// mapsMu protects mappings.
	mapsMu sync.Mutex `state:"nosave"`

	// If this dentry represents a regular file, mappings tracks mappings of
	// the file into memmap.MappingSpaces. mappings is protected by mapsMu.
	mappings memmap.MappingSet

	//	- If this dentry represents a regular file or directory, readFD (if not
	//    -1) is a host FD used for reads by all regularFileFDs/directoryFDs
	//    representing this dentry.
	//
	//	- If this dentry represents a regular file, writeFD (if not -1) is a host
	//    FD used for writes by all regularFileFDs representing this dentry.
	//
	//	- If this dentry represents a regular file, mmapFD is the host FD used
	//		for memory mappings. If mmapFD is -1, no such FD is available, and the
	//		internal page cache implementation is used for memory mappings instead.
	//
	// These fields are protected by handleMu. readFD, writeFD, and mmapFD are
	// additionally written using atomic memory operations, allowing them to be
	// read (albeit racily) with atomic.LoadInt32() without locking handleMu.
	//
	// Once either readFD or writeFD transitions from closed (-1) to open, it
	// may be mutated with handleMu locked, but cannot be closed until the
	// dentry is destroyed.
	//
	// readFD and writeFD may or may not be the same file descriptor. mmapFD is
	// always either -1 or equal to readFD; if the file has been opened for
	// writing, it is additionally either -1 or equal to writeFD.
	handleMu sync.RWMutex       `state:"nosave"`
	readFD   atomicbitops.Int32 `state:"nosave"`
	writeFD  atomicbitops.Int32 `state:"nosave"`
	mmapFD   atomicbitops.Int32 `state:"nosave"`

	// dataMu protects cache, dirty, haveTarget and target. Together with
	// metadataMu it also protects size.
	dataMu sync.RWMutex `state:"nosave"`

	// If this dentry represents a regular file that is client-cached, cache
	// maps offsets into the cached file to offsets into
	// filesystem.mfp.MemoryFile() that store the file's data. cache is
	// protected by dataMu.
	cache fsutil.FileRangeSet

	// If this dentry represents a regular file that is client-cached, dirty
	// tracks dirty segments in cache. dirty is protected by dataMu.
	dirty fsutil.DirtySet

	// pf implements memmap.File for mappings of hostFD.
	pf dentryPlatformFile

	// If this dentry represents a symbolic link, InteropModeShared is not in
	// effect, and haveTarget is true, target is the symlink target. haveTarget
	// and target are protected by dataMu.
	haveTarget bool
	target     string

	// If this dentry represents a synthetic socket file, endpoint is the
	// transport endpoint bound to this file.
	endpoint transport.BoundEndpoint

	// If this dentry represents a synthetic named pipe, pipe is the pipe
	// endpoint bound to this file.
	pipe *pipe.VFSPipe

	locks vfs.FileLocks

	// Inotify watches for this dentry.
	//
	// Note that inotify may behave unexpectedly in the presence of hard links,
	// because dentries corresponding to the same file have separate inotify
	// watches when they should share the same set. This is the case because it is
	// impossible for us to know for sure whether two dentries correspond to the
	// same underlying file (see the gofer filesystem section of vfs/inotify.md for
	// a more in-depth discussion on this matter).
	watches vfs.Watches

	// impl is the specific dentry implementation for non-synthetic dentries.
	// impl is immutable.
	//
	// If impl is nil, this dentry represents a synthetic file, i.e. a
	// file that does not exist on the host filesystem. As of this writing, the
	// only files that can be synthetic are sockets, pipes, and directories.
	impl any
}
   987  
// stringListElem is a list element that carries a string.
// NOTE(review): stringEntry is declared elsewhere; presumably it provides the
// list linkage for this element. Confirm against its definition.
//
// +stateify savable
type stringListElem struct {
	// str is the string that this elem represents.
	str string
	stringEntry
}
   994  
// dentryListElem is a list element that refers to a dentry. A dentry holds
// two of them by value, cacheEntry and syncableListEntry, and dentry.init
// points each one's d field back at the owning dentry.
// NOTE(review): dentryEntry is declared elsewhere; presumably it provides the
// list linkage for this element. Confirm against its definition.
//
// +stateify savable
type dentryListElem struct {
	// d is the dentry that this elem represents.
	d *dentry
	dentryEntry
}
  1001  
  1002  func (fs *filesystem) inoFromKey(key inoKey) uint64 {
  1003  	fs.inoMu.Lock()
  1004  	defer fs.inoMu.Unlock()
  1005  
  1006  	if ino, ok := fs.inoByKey[key]; ok {
  1007  		return ino
  1008  	}
  1009  	ino := fs.nextIno()
  1010  	fs.inoByKey[key] = ino
  1011  	return ino
  1012  }
  1013  
// nextIno advances fs.lastIno by one and returns the updated value.
func (fs *filesystem) nextIno() uint64 {
	return fs.lastIno.Add(1)
}
  1017  
  1018  // init must be called before first use of d.
  1019  func (d *dentry) init(impl any) {
  1020  	d.pf.dentry = d
  1021  	d.cacheEntry.d = d
  1022  	d.syncableListEntry.d = d
  1023  	// Nested impl-inheritance pattern. In memory it looks like:
  1024  	// [[[ vfs.Dentry ] dentry ] dentryImpl ]
  1025  	// All 3 abstractions are allocated in one allocation. We achieve this by
  1026  	// making each outer dentry implementation hold the inner dentry by value.
  1027  	// Then the outer most dentry is allocated and we initialize fields inward.
  1028  	// Each inner dentry has a pointer to the next level of implementation.
  1029  	d.impl = impl
  1030  	d.vfsd.Init(d)
  1031  	refs.Register(d)
  1032  }
  1033  
// isSynthetic reports whether d represents a synthetic file, which is the
// case exactly when d.impl is nil.
func (d *dentry) isSynthetic() bool {
	return d.impl == nil
}
  1037  
  1038  func (d *dentry) cachedMetadataAuthoritative() bool {
  1039  	return d.fs.opts.interop != InteropModeShared || d.isSynthetic()
  1040  }
  1041  
  1042  // updateMetadataFromStatxLocked is called to update d's metadata after an update
  1043  // from the remote filesystem.
  1044  // Precondition: d.metadataMu must be locked.
  1045  // +checklocks:d.metadataMu
  1046  func (d *lisafsDentry) updateMetadataFromStatxLocked(stat *linux.Statx) {
  1047  	if stat.Mask&linux.STATX_TYPE != 0 {
  1048  		if got, want := stat.Mode&linux.FileTypeMask, d.fileType(); uint32(got) != want {
  1049  			panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got))
  1050  		}
  1051  	}
  1052  	if stat.Mask&linux.STATX_MODE != 0 {
  1053  		d.mode.Store(uint32(stat.Mode))
  1054  	}
  1055  	if stat.Mask&linux.STATX_UID != 0 {
  1056  		d.uid.Store(dentryUID(lisafs.UID(stat.UID)))
  1057  	}
  1058  	if stat.Mask&linux.STATX_GID != 0 {
  1059  		d.gid.Store(dentryGID(lisafs.GID(stat.GID)))
  1060  	}
  1061  	if stat.Blksize != 0 {
  1062  		d.blockSize.Store(stat.Blksize)
  1063  	}
  1064  	// Don't override newer client-defined timestamps with old server-defined
  1065  	// ones.
  1066  	if stat.Mask&linux.STATX_ATIME != 0 && d.atimeDirty.Load() == 0 {
  1067  		d.atime.Store(dentryTimestamp(stat.Atime))
  1068  	}
  1069  	if stat.Mask&linux.STATX_MTIME != 0 && d.mtimeDirty.Load() == 0 {
  1070  		d.mtime.Store(dentryTimestamp(stat.Mtime))
  1071  	}
  1072  	if stat.Mask&linux.STATX_CTIME != 0 {
  1073  		d.ctime.Store(dentryTimestamp(stat.Ctime))
  1074  	}
  1075  	if stat.Mask&linux.STATX_BTIME != 0 {
  1076  		d.btime.Store(dentryTimestamp(stat.Btime))
  1077  	}
  1078  	if stat.Mask&linux.STATX_NLINK != 0 {
  1079  		d.nlink.Store(stat.Nlink)
  1080  	}
  1081  	if stat.Mask&linux.STATX_SIZE != 0 {
  1082  		d.updateSizeLocked(stat.Size)
  1083  	}
  1084  }
  1085  
  1086  // updateMetadataFromStatLocked is similar to updateMetadataFromStatxLocked,
  1087  // except that it takes a unix.Stat_t argument.
  1088  // Precondition: d.metadataMu must be locked.
  1089  // +checklocks:d.metadataMu
  1090  func (d *directfsDentry) updateMetadataFromStatLocked(stat *unix.Stat_t) error {
  1091  	if got, want := stat.Mode&unix.S_IFMT, d.fileType(); got != want {
  1092  		panic(fmt.Sprintf("direct.dentry file type changed from %#o to %#o", want, got))
  1093  	}
  1094  	d.mode.Store(stat.Mode)
  1095  	d.uid.Store(stat.Uid)
  1096  	d.gid.Store(stat.Gid)
  1097  	d.blockSize.Store(uint32(stat.Blksize))
  1098  	// Don't override newer client-defined timestamps with old host-defined
  1099  	// ones.
  1100  	if d.atimeDirty.Load() == 0 {
  1101  		d.atime.Store(dentryTimestampFromUnix(stat.Atim))
  1102  	}
  1103  	if d.mtimeDirty.Load() == 0 {
  1104  		d.mtime.Store(dentryTimestampFromUnix(stat.Mtim))
  1105  	}
  1106  	d.ctime.Store(dentryTimestampFromUnix(stat.Ctim))
  1107  	d.nlink.Store(uint32(stat.Nlink))
  1108  	d.updateSizeLocked(uint64(stat.Size))
  1109  	return nil
  1110  }
  1111  
  1112  // Preconditions: !d.isSynthetic().
  1113  // Preconditions: d.metadataMu is locked.
  1114  // +checklocks:d.metadataMu
  1115  func (d *dentry) refreshSizeLocked(ctx context.Context) error {
  1116  	d.handleMu.RLock()
  1117  
  1118  	// Can use RacyLoad() because handleMu is locked.
  1119  	if d.writeFD.RacyLoad() < 0 {
  1120  		d.handleMu.RUnlock()
  1121  		// Use a suitable FD if we don't have a writable host FD.
  1122  		return d.updateMetadataLocked(ctx, noHandle)
  1123  	}
  1124  
  1125  	// Using statx(2) with a minimal mask is faster than fstat(2).
  1126  	var stat unix.Statx_t
  1127  	// Can use RacyLoad() because handleMu is locked.
  1128  	err := unix.Statx(int(d.writeFD.RacyLoad()), "", unix.AT_EMPTY_PATH, unix.STATX_SIZE, &stat)
  1129  	d.handleMu.RUnlock() // must be released before updateSizeLocked()
  1130  	if err != nil {
  1131  		return err
  1132  	}
  1133  	d.updateSizeLocked(stat.Size)
  1134  	return nil
  1135  }
  1136  
// updateMetadata locks d.metadataMu and refreshes d's cached metadata via
// d.updateMetadataLocked, passing noHandle. It returns that call's error.
//
// Preconditions: !d.isSynthetic().
func (d *dentry) updateMetadata(ctx context.Context) error {
	// d.metadataMu must be locked *before* we stat so that we do not end up
	// updating stale attributes in d.updateMetadataFromStatLocked().
	d.metadataMu.Lock()
	defer d.metadataMu.Unlock()
	return d.updateMetadataLocked(ctx, noHandle)
}
  1145  
// fileType returns the file type bits of d's cached mode, i.e. the mode
// masked with linux.S_IFMT. The mode is read atomically.
func (d *dentry) fileType() uint32 {
	return d.mode.Load() & linux.S_IFMT
}
  1149  
  1150  func (d *dentry) statTo(stat *linux.Statx) {
  1151  	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME
  1152  	stat.Blksize = d.blockSize.Load()
  1153  	stat.Nlink = d.nlink.Load()
  1154  	if stat.Nlink == 0 {
  1155  		// The remote filesystem doesn't support link count; just make
  1156  		// something up. This is consistent with Linux, where
  1157  		// fs/inode.c:inode_init_always() initializes link count to 1, and
  1158  		// fs/9p/vfs_inode_dotl.c:v9fs_stat2inode_dotl() doesn't touch it if
  1159  		// it's not provided by the remote filesystem.
  1160  		stat.Nlink = 1
  1161  	}
  1162  	stat.UID = d.uid.Load()
  1163  	stat.GID = d.gid.Load()
  1164  	stat.Mode = uint16(d.mode.Load())
  1165  	stat.Ino = uint64(d.ino)
  1166  	stat.Size = d.size.Load()
  1167  	// This is consistent with regularFileFD.Seek(), which treats regular files
  1168  	// as having no holes.
  1169  	stat.Blocks = (stat.Size + 511) / 512
  1170  	stat.Atime = linux.NsecToStatxTimestamp(d.atime.Load())
  1171  	stat.Btime = linux.NsecToStatxTimestamp(d.btime.Load())
  1172  	stat.Ctime = linux.NsecToStatxTimestamp(d.ctime.Load())
  1173  	stat.Mtime = linux.NsecToStatxTimestamp(d.mtime.Load())
  1174  	stat.DevMajor = linux.UNNAMED_MAJOR
  1175  	stat.DevMinor = d.fs.devMinor
  1176  }
  1177  
// setStat applies the attribute changes requested in opts.Stat to d. For
// non-synthetic dentries the change is first applied to the remote file via
// d.setStatLocked; d's cached metadata is then updated for every attribute
// that did not fail remotely, unless InteropModeShared is in effect.
//
// Note that opts.Stat is modified in place: the mask may gain STATX_MTIME or
// STATX_MODE, UTIME_NOW timestamps may be replaced with the client clock, and
// the mode may have its SUID/SGID bits cleared.
//
// setStat returns:
//   - nil immediately if the mask is empty;
//   - EPERM if the mask contains anything other than mode, UID, GID, atime,
//     mtime or size;
//   - the error from vfs.CheckSetStat or mnt.CheckBeginWrite;
//   - EISDIR/EINVAL when asked to resize a directory/other non-regular file;
//   - the error from d.prepareSetStat or d.setStatLocked;
//   - failureErr from d.setStatLocked if some attributes could not be set on
//     the remote filesystem (the others are still applied to the cache).
//
// Precondition: fs.renameMu is locked.
func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions, mnt *vfs.Mount) error {
	stat := &opts.Stat
	if stat.Mask == 0 {
		return nil
	}
	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 {
		return linuxerr.EPERM
	}
	mode := linux.FileMode(d.mode.Load())
	if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load())); err != nil {
		return err
	}
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()

	if stat.Mask&linux.STATX_SIZE != 0 {
		// Reject attempts to truncate files other than regular files, since
		// filesystem implementations may return the wrong errno.
		switch mode.FileType() {
		case linux.S_IFREG:
			// ok
		case linux.S_IFDIR:
			return linuxerr.EISDIR
		default:
			return linuxerr.EINVAL
		}
	}

	// now is only assigned when the cached metadata is authoritative; it is
	// only consumed (for ctime) on paths where that is the case.
	var now int64
	if d.cachedMetadataAuthoritative() {
		// Truncate updates mtime.
		if stat.Mask&(linux.STATX_SIZE|linux.STATX_MTIME) == linux.STATX_SIZE {
			stat.Mask |= linux.STATX_MTIME
			stat.Mtime = linux.StatxTimestamp{
				Nsec: linux.UTIME_NOW,
			}
		}

		// Use client clocks for timestamps.
		now = d.fs.clock.Now().Nanoseconds()
		if stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec == linux.UTIME_NOW {
			stat.Atime = linux.NsecToStatxTimestamp(now)
		}
		if stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec == linux.UTIME_NOW {
			stat.Mtime = linux.NsecToStatxTimestamp(now)
		}
	}

	d.metadataMu.Lock()
	defer d.metadataMu.Unlock()

	// As with Linux, if the UID, GID, or file size is changing, we have to
	// clear permission bits. Note that when set, clearSGID may cause
	// permissions to be updated.
	clearSGID := (stat.Mask&linux.STATX_UID != 0 && stat.UID != d.uid.Load()) ||
		(stat.Mask&linux.STATX_GID != 0 && stat.GID != d.gid.Load()) ||
		stat.Mask&linux.STATX_SIZE != 0
	if clearSGID {
		if stat.Mask&linux.STATX_MODE != 0 {
			stat.Mode = uint16(vfs.ClearSUIDAndSGID(uint32(stat.Mode)))
		} else {
			oldMode := d.mode.Load()
			if updatedMode := vfs.ClearSUIDAndSGID(oldMode); updatedMode != oldMode {
				stat.Mode = uint16(updatedMode)
				stat.Mask |= linux.STATX_MODE
			}
		}
	}

	// failureMask indicates which attributes could not be set on the remote
	// filesystem. p9 returns an error if any of the attributes could not be set
	// but that leads to inconsistency as the server could have set a few
	// attributes successfully but a later failure will cause the successful ones
	// to not be updated in the dentry cache.
	var failureMask uint32
	var failureErr error
	if !d.isSynthetic() {
		if stat.Mask != 0 {
			if err := d.prepareSetStat(ctx, stat); err != nil {
				return err
			}
			d.handleMu.RLock()
			if stat.Mask&linux.STATX_SIZE != 0 {
				// d.dataMu must be held around the update to both the remote
				// file's size and d.size to serialize with writeback (which
				// might otherwise write data back up to the old d.size after
				// the remote file has been truncated).
				d.dataMu.Lock()
			}
			var err error
			failureMask, failureErr, err = d.setStatLocked(ctx, stat)
			d.handleMu.RUnlock()
			if err != nil {
				if stat.Mask&linux.STATX_SIZE != 0 {
					d.dataMu.Unlock() // +checklocksforce: locked conditionally above
				}
				return err
			}
			if stat.Mask&linux.STATX_SIZE != 0 {
				if failureMask&linux.STATX_SIZE == 0 {
					// d.size should be kept up to date, and privatized
					// copy-on-write mappings of truncated pages need to be
					// invalidated, even if InteropModeShared is in effect.
					d.updateSizeAndUnlockDataMuLocked(stat.Size) // +checklocksforce: locked conditionally above
				} else {
					d.dataMu.Unlock() // +checklocksforce: locked conditionally above
				}
			}
		}
		if d.fs.opts.interop == InteropModeShared {
			// There's no point to updating d's metadata in this case since
			// it'll be overwritten by revalidation before the next time it's
			// used anyway. (InteropModeShared inhibits client caching of
			// regular file data, so there's no cache to truncate either.)
			return nil
		}
	}
	if stat.Mask&linux.STATX_MODE != 0 && failureMask&linux.STATX_MODE == 0 {
		d.mode.Store(d.fileType() | uint32(stat.Mode))
	}
	if stat.Mask&linux.STATX_UID != 0 && failureMask&linux.STATX_UID == 0 {
		d.uid.Store(stat.UID)
	}
	if stat.Mask&linux.STATX_GID != 0 && failureMask&linux.STATX_GID == 0 {
		d.gid.Store(stat.GID)
	}
	// Note that stat.Atime.Nsec and stat.Mtime.Nsec can't be UTIME_NOW because
	// if d.cachedMetadataAuthoritative() then we converted stat.Atime and
	// stat.Mtime to client-local timestamps above, and if
	// !d.cachedMetadataAuthoritative() then we returned after calling
	// d.setStatLocked(). For the same reason, now must have been initialized.
	if stat.Mask&linux.STATX_ATIME != 0 && failureMask&linux.STATX_ATIME == 0 {
		d.atime.Store(stat.Atime.ToNsec())
		d.atimeDirty.Store(0)
	}
	if stat.Mask&linux.STATX_MTIME != 0 && failureMask&linux.STATX_MTIME == 0 {
		d.mtime.Store(stat.Mtime.ToNsec())
		d.mtimeDirty.Store(0)
	}
	d.ctime.Store(now)
	if failureMask != 0 {
		// Setting some attribute failed on the remote filesystem.
		return failureErr
	}
	return nil
}
  1327  
  1328  // doAllocate performs an allocate operation on d. Note that d.metadataMu will
  1329  // be held when allocate is called.
  1330  func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate func() error) error {
  1331  	d.metadataMu.Lock()
  1332  	defer d.metadataMu.Unlock()
  1333  
  1334  	// Allocating a smaller size is a noop.
  1335  	size := offset + length
  1336  	if d.cachedMetadataAuthoritative() && size <= d.size.RacyLoad() {
  1337  		return nil
  1338  	}
  1339  
  1340  	err := allocate()
  1341  	if err != nil {
  1342  		return err
  1343  	}
  1344  	d.updateSizeLocked(size)
  1345  	if d.cachedMetadataAuthoritative() {
  1346  		d.touchCMtimeLocked()
  1347  	}
  1348  	return nil
  1349  }
  1350  
// updateSizeLocked sets d's cached size to newSize. It locks d.dataMu and
// hands off to updateSizeAndUnlockDataMuLocked, which unlocks it again.
//
// Preconditions: d.metadataMu must be locked.
func (d *dentry) updateSizeLocked(newSize uint64) {
	d.dataMu.Lock()
	d.updateSizeAndUnlockDataMuLocked(newSize)
}
  1356  
// updateSizeAndUnlockDataMuLocked stores newSize as d's cached size and
// releases d.dataMu. If the size shrank, it then invalidates mappings of the
// pages that are no longer covered (including private ones) and re-acquires
// d.dataMu briefly to truncate the cache and mark the removed range clean.
//
// Preconditions: d.metadataMu and d.dataMu must be locked.
//
// Postconditions: d.dataMu is unlocked.
// +checklocksrelease:d.dataMu
func (d *dentry) updateSizeAndUnlockDataMuLocked(newSize uint64) {
	oldSize := d.size.RacyLoad()
	d.size.Store(newSize)
	// d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings
	// below. This allows concurrent calls to Read/Translate/etc. These
	// functions synchronize with truncation by refusing to use cache
	// contents beyond the new d.size. (We are still holding d.metadataMu,
	// so we can't race with Write or another truncate.)
	d.dataMu.Unlock()
	if newSize < oldSize {
		// Mappings are only affected if the page-aligned end of the file
		// moved.
		oldpgend, _ := hostarch.PageRoundUp(oldSize)
		newpgend, _ := hostarch.PageRoundUp(newSize)
		if oldpgend != newpgend {
			d.mapsMu.Lock()
			d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
				// Compare Linux's mm/truncate.c:truncate_setsize() =>
				// truncate_pagecache() =>
				// mm/memory.c:unmap_mapping_range(evencows=1).
				InvalidatePrivate: true,
			})
			d.mapsMu.Unlock()
		}
		// We are now guaranteed that there are no translations of
		// truncated pages, and can remove them from the cache. Since
		// truncated pages have been removed from the remote file, they
		// should be dropped without being written back.
		d.dataMu.Lock()
		d.cache.Truncate(newSize, d.fs.mf)
		d.dirty.KeepClean(memmap.MappableRange{newSize, oldpgend})
		d.dataMu.Unlock()
	}
}
  1393  
  1394  func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
  1395  	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(d.mode.Load()), auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load()))
  1396  }
  1397  
  1398  func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
  1399  	// Deny access to the "system" namespaces since applications
  1400  	// may expect these to affect kernel behavior in unimplemented ways
  1401  	// (b/148380782). Allow all other extended attributes to be passed through
  1402  	// to the remote filesystem. This is inconsistent with Linux's 9p client,
  1403  	// but consistent with other filesystems (e.g. FUSE).
  1404  	//
  1405  	// NOTE(b/202533394): Also disallow "trusted" namespace for now. This is
  1406  	// consistent with the VFS1 gofer client.
  1407  	if strings.HasPrefix(name, linux.XATTR_SYSTEM_PREFIX) || strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) {
  1408  		return linuxerr.EOPNOTSUPP
  1409  	}
  1410  	mode := linux.FileMode(d.mode.Load())
  1411  	kuid := auth.KUID(d.uid.Load())
  1412  	kgid := auth.KGID(d.gid.Load())
  1413  	if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
  1414  		return err
  1415  	}
  1416  	return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
  1417  }
  1418  
  1419  func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error {
  1420  	return vfs.CheckDeleteSticky(
  1421  		creds,
  1422  		linux.FileMode(d.mode.Load()),
  1423  		auth.KUID(d.uid.Load()),
  1424  		auth.KUID(child.uid.Load()),
  1425  		auth.KGID(child.gid.Load()),
  1426  	)
  1427  }
  1428  
  1429  func dentryUID(uid lisafs.UID) uint32 {
  1430  	if !uid.Ok() {
  1431  		return uint32(auth.OverflowUID)
  1432  	}
  1433  	return uint32(uid)
  1434  }
  1435  
  1436  func dentryGID(gid lisafs.GID) uint32 {
  1437  	if !gid.Ok() {
  1438  		return uint32(auth.OverflowGID)
  1439  	}
  1440  	return uint32(gid)
  1441  }
  1442  
  1443  // IncRef implements vfs.DentryImpl.IncRef.
  1444  func (d *dentry) IncRef() {
  1445  	// d.refs may be 0 if d.fs.renameMu is locked, which serializes against
  1446  	// d.checkCachingLocked().
  1447  	r := d.refs.Add(1)
  1448  	if d.LogRefs() {
  1449  		refs.LogIncRef(d, r)
  1450  	}
  1451  }
  1452  
  1453  // TryIncRef implements vfs.DentryImpl.TryIncRef.
  1454  func (d *dentry) TryIncRef() bool {
  1455  	for {
  1456  		r := d.refs.Load()
  1457  		if r <= 0 {
  1458  			return false
  1459  		}
  1460  		if d.refs.CompareAndSwap(r, r+1) {
  1461  			if d.LogRefs() {
  1462  				refs.LogTryIncRef(d, r+1)
  1463  			}
  1464  			return true
  1465  		}
  1466  	}
  1467  }
  1468  
  1469  // DecRef implements vfs.DentryImpl.DecRef.
  1470  func (d *dentry) DecRef(ctx context.Context) {
  1471  	if d.decRefNoCaching() == 0 {
  1472  		d.checkCachingLocked(ctx, false /* renameMuWriteLocked */)
  1473  	}
  1474  }
  1475  
  1476  // decRefNoCaching decrements d's reference count without calling
  1477  // d.checkCachingLocked, even if d's reference count reaches 0; callers are
  1478  // responsible for ensuring that d.checkCachingLocked will be called later.
  1479  func (d *dentry) decRefNoCaching() int64 {
  1480  	r := d.refs.Add(-1)
  1481  	if d.LogRefs() {
  1482  		refs.LogDecRef(d, r)
  1483  	}
  1484  	if r < 0 {
  1485  		panic("gofer.dentry.decRefNoCaching() called without holding a reference")
  1486  	}
  1487  	return r
  1488  }
  1489  
// RefType implements refs.CheckedObject.Type. It always returns
// "gofer.dentry".
func (d *dentry) RefType() string {
	return "gofer.dentry"
}
  1494  
  1495  // LeakMessage implements refs.CheckedObject.LeakMessage.
  1496  func (d *dentry) LeakMessage() string {
  1497  	return fmt.Sprintf("[gofer.dentry %p] reference count of %d instead of -1", d, d.refs.Load())
  1498  }
  1499  
// LogRefs implements refs.CheckedObject.LogRefs. It is hard-coded to false,
// which disables the reference logging in IncRef, TryIncRef and
// decRefNoCaching.
//
// This should only be set to true for debugging purposes, as it can generate an
// extremely large amount of output and drastically degrade performance.
func (d *dentry) LogRefs() bool {
	return false
}
  1507  
  1508  // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
  1509  func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
  1510  	if d.isDir() {
  1511  		events |= linux.IN_ISDIR
  1512  	}
  1513  
  1514  	d.fs.renameMu.RLock()
  1515  	// The ordering below is important, Linux always notifies the parent first.
  1516  	if parent := d.parent.Load(); parent != nil {
  1517  		parent.watches.Notify(ctx, d.name, events, cookie, et, d.isDeleted())
  1518  	}
  1519  	d.watches.Notify(ctx, "", events, cookie, et, d.isDeleted())
  1520  	d.fs.renameMu.RUnlock()
  1521  }
  1522  
  1523  // Watches implements vfs.DentryImpl.Watches.
  1524  func (d *dentry) Watches() *vfs.Watches {
  1525  	return &d.watches
  1526  }
  1527  
  1528  // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
  1529  //
  1530  // If no watches are left on this dentry and it has no references, cache it.
  1531  func (d *dentry) OnZeroWatches(ctx context.Context) {
  1532  	d.checkCachingLocked(ctx, false /* renameMuWriteLocked */)
  1533  }
  1534  
// checkCachingLocked should be called after d's reference count becomes 0 or
// it becomes disowned. Depending on d's state it removes d from the dentry
// cache, destroys d, evicts d, or inserts d at the front of the LRU cache
// (possibly evicting the least recently used cached dentry as a result).
//
// For performance, checkCachingLocked can also be called after d's reference
// count becomes non-zero, so that d can be removed from the LRU cache. This
// may help in reducing the size of the cache and hence reduce evictions. Note
// that this is not necessary for correctness.
//
// It may be called on a destroyed dentry. For example,
// renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times
// for the same dentry when the dentry is visited more than once in the same
// operation. One of the calls may destroy the dentry, so subsequent calls will
// do nothing.
//
// If renameMuWriteLocked is false, d.fs.renameMu is acquired for writing by
// this function on the paths that need it and released before returning.
//
// Preconditions: d.fs.renameMu must be locked for writing if
// renameMuWriteLocked is true; it may be temporarily unlocked.
func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked bool) {
	d.cachingMu.Lock()
	// Note that this local shadows the refs package for the rest of the
	// function.
	refs := d.refs.Load()
	if refs == -1 {
		// Dentry has already been destroyed.
		d.cachingMu.Unlock()
		return
	}
	if refs > 0 {
		// fs.dentryCache.dentries is permitted to contain dentries with non-zero
		// refs, which are skipped by fs.evictCachedDentryLocked() upon reaching
		// the end of the LRU. But it is still beneficial to remove d from the
		// cache as we are already holding d.cachingMu. Keeping a cleaner cache
		// also reduces the number of evictions (which is expensive as it acquires
		// fs.renameMu).
		d.removeFromCacheLocked()
		d.cachingMu.Unlock()
		return
	}
	// Deleted and invalidated dentries with zero references are no longer
	// reachable by path resolution and should be dropped immediately.
	if d.vfsd.IsDead() {
		d.removeFromCacheLocked()
		d.cachingMu.Unlock()
		if !renameMuWriteLocked {
			// Need to lock d.fs.renameMu for writing as needed by d.destroyLocked().
			d.fs.renameMu.Lock()
			defer d.fs.renameMu.Unlock()
			// Now that renameMu is locked for writing, no more refs can be taken on
			// d because path resolution requires renameMu for reading at least.
			if d.refs.Load() != 0 {
				// Destroy d only if its ref is still 0. If not, either someone took a
				// ref on it or it got destroyed before fs.renameMu could be acquired.
				return
			}
		}
		if d.isDeleted() {
			d.watches.HandleDeletion(ctx)
		}
		d.destroyLocked(ctx) // +checklocksforce: renameMu must be acquired at this point.
		return
	}
	if d.vfsd.IsEvictable() {
		d.cachingMu.Unlock()
		// Attempt to evict. d.evict() acquires d.fs.renameMu for writing itself,
		// so it is only used when the caller does not already hold it.
		if renameMuWriteLocked {
			d.evictLocked(ctx) // +checklocksforce: renameMu is locked in this case.
			return
		}
		d.evict(ctx)
		return
	}
	// If d still has inotify watches and it is not deleted or invalidated, it
	// can't be evicted. Otherwise, we will lose its watches, even if a new
	// dentry is created for the same file in the future. Note that the size of
	// d.watches cannot concurrently transition from zero to non-zero, because
	// adding a watch requires holding a reference on d.
	if d.watches.Size() > 0 {
		// As in the refs > 0 case, removing d is beneficial.
		d.removeFromCacheLocked()
		d.cachingMu.Unlock()
		return
	}

	// The filesystem has been released: do not cache d, destroy it right away.
	//
	// NOTE(review): unlike the IsDead branch above, d.refs is not re-checked
	// after d.fs.renameMu is acquired here, and d.destroyLocked() panics if
	// d.refs != 0. Presumably no new references can be taken once the
	// filesystem is released; confirm.
	if d.fs.released.Load() != 0 {
		d.cachingMu.Unlock()
		if !renameMuWriteLocked {
			// Need to lock d.fs.renameMu to access d.parent. Lock it for writing as
			// needed by d.destroyLocked() later.
			d.fs.renameMu.Lock()
			defer d.fs.renameMu.Unlock()
		}
		// Unlink d from its parent's children map before destroying it.
		if parent := d.parent.Load(); parent != nil {
			parent.childrenMu.Lock()
			delete(parent.children, d.name)
			parent.childrenMu.Unlock()
		}
		d.destroyLocked(ctx) // +checklocksforce: see above.
		return
	}

	d.fs.dentryCache.mu.Lock()
	// If d is already cached, just move it to the front of the LRU.
	if d.cached {
		d.fs.dentryCache.dentries.Remove(&d.cacheEntry)
		d.fs.dentryCache.dentries.PushFront(&d.cacheEntry)
		d.fs.dentryCache.mu.Unlock()
		d.cachingMu.Unlock()
		return
	}
	// Cache the dentry, then evict the least recently used cached dentry if
	// the cache becomes over-full.
	d.fs.dentryCache.dentries.PushFront(&d.cacheEntry)
	d.fs.dentryCache.dentriesLen++
	d.cached = true
	shouldEvict := d.fs.dentryCache.dentriesLen > d.fs.dentryCache.maxCachedDentries
	d.fs.dentryCache.mu.Unlock()
	d.cachingMu.Unlock()

	// The eviction itself happens after both dentryCache.mu and d.cachingMu
	// have been released.
	if shouldEvict {
		if !renameMuWriteLocked {
			// Need to lock d.fs.renameMu for writing as needed by
			// d.evictCachedDentryLocked().
			d.fs.renameMu.Lock()
			defer d.fs.renameMu.Unlock()
		}
		d.fs.evictCachedDentryLocked(ctx) // +checklocksforce: see above.
	}
}
  1660  
  1661  // Preconditions: d.cachingMu must be locked.
  1662  func (d *dentry) removeFromCacheLocked() {
  1663  	if d.cached {
  1664  		d.fs.dentryCache.mu.Lock()
  1665  		d.fs.dentryCache.dentries.Remove(&d.cacheEntry)
  1666  		d.fs.dentryCache.dentriesLen--
  1667  		d.fs.dentryCache.mu.Unlock()
  1668  		d.cached = false
  1669  	}
  1670  }
  1671  
  1672  // Precondition: fs.renameMu must be locked for writing; it may be temporarily
  1673  // unlocked.
  1674  // +checklocks:fs.renameMu
  1675  func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) {
  1676  	for fs.dentryCache.dentriesLen != 0 {
  1677  		fs.evictCachedDentryLocked(ctx)
  1678  	}
  1679  }
  1680  
  1681  // Preconditions:
  1682  //   - fs.renameMu must be locked for writing; it may be temporarily unlocked.
  1683  //
  1684  // +checklocks:fs.renameMu
  1685  func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) {
  1686  	fs.dentryCache.mu.Lock()
  1687  	victim := fs.dentryCache.dentries.Back()
  1688  	fs.dentryCache.mu.Unlock()
  1689  	if victim == nil {
  1690  		// fs.dentryCache.dentries may have become empty between when it was
  1691  		// checked and when we locked fs.dentryCache.mu.
  1692  		return
  1693  	}
  1694  
  1695  	if victim.d.fs == fs {
  1696  		victim.d.evictLocked(ctx) // +checklocksforce: owned as precondition, victim.fs == fs
  1697  		return
  1698  	}
  1699  
  1700  	// The dentry cache is shared between all gofer filesystems and the victim is
  1701  	// from another filesystem. Have that filesystem do the work. We unlock
  1702  	// fs.renameMu to prevent deadlock: two filesystems could otherwise wait on
  1703  	// each others' renameMu.
  1704  	fs.renameMu.Unlock()
  1705  	defer fs.renameMu.Lock()
  1706  	victim.d.evict(ctx)
  1707  }
  1708  
// evict acquires d.fs.renameMu for writing, calls d.evictLocked, and releases
// the lock again before returning.
//
// Preconditions:
//   - d.fs.renameMu must not be locked for writing.
func (d *dentry) evict(ctx context.Context) {
	d.fs.renameMu.Lock()
	defer d.fs.renameMu.Unlock()
	d.evictLocked(ctx)
}
  1716  
// evictLocked removes d from the dentry cache and, provided d has neither
// references nor inotify watches, invalidates it, unlinks it from its parent's
// children map and destroys it. If d has references or watches it is only
// removed from the cache.
//
// Preconditions:
//   - d.fs.renameMu must be locked for writing; it may be temporarily unlocked.
//
// +checklocks:d.fs.renameMu
func (d *dentry) evictLocked(ctx context.Context) {
	d.cachingMu.Lock()
	d.removeFromCacheLocked()
	// d.refs or d.watches.Size() may have become non-zero from an earlier path
	// resolution since it was inserted into fs.dentryCache.dentries.
	if d.refs.Load() != 0 || d.watches.Size() != 0 {
		d.cachingMu.Unlock()
		return
	}
	if parent := d.parent.Load(); parent != nil {
		parent.opMu.Lock()
		if !d.vfsd.IsDead() {
			// Note that d can't be a mount point (in any mount namespace), since VFS
			// holds references on mount points.
			rcs := d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &d.vfsd)
			for _, rc := range rcs {
				rc.DecRef(ctx)
			}

			parent.childrenMu.Lock()
			delete(parent.children, d.name)
			parent.childrenMu.Unlock()

			// We're only deleting the dentry, not the file it
			// represents, so we don't need to update
			// parent.dirents etc.
		}
		parent.opMu.Unlock()
	}
	// Safe to unlock cachingMu now that d.vfsd.IsDead(). Henceforth any
	// concurrent caching attempts on d will attempt to destroy it and so will
	// try to acquire fs.renameMu (which we have already acquired). Hence,
	// fs.renameMu will synchronize the destroy attempts.
	d.cachingMu.Unlock()
	d.destroyLocked(ctx) // +checklocksforce: owned as precondition.
}
  1757  
// destroyDisconnected destroys an uncached, unparented dentry. There are no
// locking preconditions.
//
// It writes dirty cached data back through the write handle (if one is
// usable), drops cached data, releases implementation resources and host FDs,
// removes non-synthetic dentries from the filesystem's set of syncable
// dentries, and finally marks d destroyed by setting d.refs to -1. A failure
// to write dirty data back is logged and otherwise ignored.
func (d *dentry) destroyDisconnected(ctx context.Context) {
	mf := d.fs.mf

	// handleMu is held until the FDs have been closed and reset below; dataMu
	// is held only while cached data is flushed and dropped.
	d.handleMu.Lock()
	d.dataMu.Lock()

	if d.isWriteHandleOk() {
		// Write dirty pages back to the remote filesystem.
		h := d.writeHandle()
		if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil {
			// NOTE(review): the message below names destroyLocked rather than
			// destroyDisconnected.
			log.Warningf("gofer.dentry.destroyLocked: failed to write dirty data back: %v", err)
		}
	}
	// Discard cached data.
	if !d.cache.IsEmpty() {
		mf.MarkAllUnevictable(d)
		d.cache.DropAll(mf)
		d.dirty.RemoveAll()
	}
	d.dataMu.Unlock()

	// Close any resources held by the implementation.
	d.destroyImpl(ctx)

	// Can use RacyLoad() because handleMu is locked.
	if d.readFD.RacyLoad() >= 0 {
		_ = unix.Close(int(d.readFD.RacyLoad()))
	}
	// readFD and writeFD may hold the same descriptor; close it only once.
	if d.writeFD.RacyLoad() >= 0 && d.readFD.RacyLoad() != d.writeFD.RacyLoad() {
		_ = unix.Close(int(d.writeFD.RacyLoad()))
	}
	d.readFD = atomicbitops.FromInt32(-1)
	d.writeFD = atomicbitops.FromInt32(-1)
	d.mmapFD = atomicbitops.FromInt32(-1)
	d.handleMu.Unlock()

	if !d.isSynthetic() {
		// Note that it's possible that d.atimeDirty or d.mtimeDirty are true,
		// i.e. client and server timestamps may differ (because e.g. a client
		// write was serviced by the page cache, and only written back to the
		// remote file later). Ideally, we'd write client timestamps back to
		// the remote filesystem so that timestamps for a new dentry
		// instantiated for the same file would remain coherent. Unfortunately,
		// this turns out to be too expensive in many cases, so for now we
		// don't do this.

		// Remove d from the set of syncable dentries.
		d.fs.syncMu.Lock()
		d.fs.syncableDentries.Remove(&d.syncableListEntry)
		d.fs.syncMu.Unlock()
	}

	// Drop references and stop tracking this child.
	d.refs.Store(-1)
	refs.Unregister(d)
}
  1816  
  1817  // destroyLocked destroys the dentry.
  1818  //
  1819  // Preconditions:
  1820  //   - d.fs.renameMu must be locked for writing; it may be temporarily unlocked.
  1821  //   - d.refs == 0.
  1822  //   - d.parent.children[d.name] != d, i.e. d is not reachable by path traversal
  1823  //     from its former parent dentry.
  1824  //
  1825  // +checklocks:d.fs.renameMu
  1826  func (d *dentry) destroyLocked(ctx context.Context) {
  1827  	switch d.refs.Load() {
  1828  	case 0:
  1829  		// Mark the dentry destroyed.
  1830  		d.refs.Store(-1)
  1831  	case -1:
  1832  		panic("dentry.destroyLocked() called on already destroyed dentry")
  1833  	default:
  1834  		panic("dentry.destroyLocked() called with references on the dentry")
  1835  	}
  1836  
  1837  	// Allow the following to proceed without renameMu locked to improve
  1838  	// scalability.
  1839  	d.fs.renameMu.Unlock()
  1840  
  1841  	// No locks need to be held during destoryDisconnected.
  1842  	d.destroyDisconnected(ctx)
  1843  
  1844  	d.fs.renameMu.Lock()
  1845  
  1846  	// Drop the reference held by d on its parent without recursively locking
  1847  	// d.fs.renameMu.
  1848  
  1849  	if parent := d.parent.Load(); parent != nil && parent.decRefNoCaching() == 0 {
  1850  		parent.checkCachingLocked(ctx, true /* renameMuWriteLocked */)
  1851  	}
  1852  }
  1853  
  1854  func (d *dentry) isDeleted() bool {
  1855  	return d.deleted.Load() != 0
  1856  }
  1857  
  1858  func (d *dentry) setDeleted() {
  1859  	d.deleted.Store(1)
  1860  }
  1861  
  1862  func (d *dentry) listXattr(ctx context.Context, size uint64) ([]string, error) {
  1863  	if d.isSynthetic() {
  1864  		return nil, nil
  1865  	}
  1866  
  1867  	return d.listXattrImpl(ctx, size)
  1868  }
  1869  
  1870  func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
  1871  	if d.isSynthetic() {
  1872  		return "", linuxerr.ENODATA
  1873  	}
  1874  	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
  1875  		return "", err
  1876  	}
  1877  	return d.getXattrImpl(ctx, opts)
  1878  }
  1879  
  1880  func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
  1881  	if d.isSynthetic() {
  1882  		return linuxerr.EPERM
  1883  	}
  1884  	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
  1885  		return err
  1886  	}
  1887  	return d.setXattrImpl(ctx, opts)
  1888  }
  1889  
  1890  func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error {
  1891  	if d.isSynthetic() {
  1892  		return linuxerr.EPERM
  1893  	}
  1894  	if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
  1895  		return err
  1896  	}
  1897  	return d.removeXattrImpl(ctx, name)
  1898  }
  1899  
// ensureSharedHandle makes sure that d holds handles usable for the requested
// kinds of access: reading if read is true, writing if write is true. If the
// handles d already has are sufficient and trunc is false, nothing is opened.
// If trunc is true a new handle is always opened, with trunc passed through
// to d.openHandle.
//
// On success d.readFD, d.writeFD and d.mmapFD are updated as needed, existing
// memory mappings are invalidated when they may refer to a superseded FD, and
// host FDs that are no longer needed are closed. An error from d.openHandle,
// RegenerateMappings or dup3 is returned to the caller.
//
// Preconditions:
//   - !d.isSynthetic().
//   - d.isRegularFile() || d.isDir().
//   - fs.renameMu is locked.
func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error {
	// O_TRUNC unconditionally requires us to obtain a new handle (opened with
	// O_TRUNC).
	if !trunc {
		// Fast path: check under the read lock only.
		d.handleMu.RLock()
		canReuseCurHandle := (!read || d.isReadHandleOk()) && (!write || d.isWriteHandleOk())
		d.handleMu.RUnlock()
		if canReuseCurHandle {
			// Current handles are sufficient.
			return nil
		}
	}

	// Re-check with handleMu held for writing, since the handles may have
	// changed between the RUnlock above and this Lock.
	d.handleMu.Lock()
	needNewHandle := (read && !d.isReadHandleOk()) || (write && !d.isWriteHandleOk()) || trunc
	if !needNewHandle {
		d.handleMu.Unlock()
		return nil
	}

	// FDs are collected here and closed only at the end of the function,
	// after handleMu has been released. The backing array avoids a heap
	// allocation for the (at most two) entries.
	var fdsToCloseArr [2]int32
	fdsToClose := fdsToCloseArr[:0]
	invalidateTranslations := false
	// Get a new handle. If this file has been opened for both reading and
	// writing, try to get a single handle that is usable for both:
	//
	//	- Writable memory mappings of a host FD require that the host FD is
	//		opened for both reading and writing.
	//
	//	- NOTE(b/141991141): Some filesystems may not ensure coherence
	//		between multiple handles for the same file.
	openReadable := d.isReadHandleOk() || read
	openWritable := d.isWriteHandleOk() || write
	h, err := d.openHandle(ctx, openReadable, openWritable, trunc)
	if linuxerr.Equals(linuxerr.EACCES, err) && (openReadable != read || openWritable != write) {
		// It may not be possible to use a single handle for both
		// reading and writing, since permissions on the file may have
		// changed to e.g. disallow reading after previously being
		// opened for reading. In this case, we have no choice but to
		// use separate handles for reading and writing.
		ctx.Debugf("gofer.dentry.ensureSharedHandle: bifurcating read/write handles for dentry %p", d)
		openReadable = read
		openWritable = write
		h, err = d.openHandle(ctx, openReadable, openWritable, trunc)
	}
	if err != nil {
		d.handleMu.Unlock()
		return err
	}

	// Update d.readFD and d.writeFD
	if h.fd >= 0 {
		if openReadable && openWritable && (d.readFD.RacyLoad() < 0 || d.writeFD.RacyLoad() < 0 || d.readFD.RacyLoad() != d.writeFD.RacyLoad()) {
			// Replace existing FDs with this one.
			if d.readFD.RacyLoad() >= 0 {
				// We already have a readable FD that may be in use by
				// concurrent callers of d.pf.FD().
				if d.fs.opts.overlayfsStaleRead {
					// If overlayfsStaleRead is in effect, then the new FD
					// may not be coherent with the existing one, so we
					// have no choice but to switch to mappings of the new
					// FD in both the application and sentry.
					if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil {
						d.handleMu.Unlock()
						ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err)
						h.close(ctx)
						return err
					}
					fdsToClose = append(fdsToClose, d.readFD.RacyLoad())
					invalidateTranslations = true
					d.readFD.Store(h.fd)
				} else {
					// Otherwise, we want to avoid invalidating existing
					// memmap.Translations (which is expensive); instead, use
					// dup3 to make the old file descriptor refer to the new
					// file description, then close the new file descriptor
					// (which is no longer needed). Racing callers of d.pf.FD()
					// may use the old or new file description, but this
					// doesn't matter since they refer to the same file, and
					// any racing mappings must be read-only.
					if err := unix.Dup3(int(h.fd), int(d.readFD.RacyLoad()), unix.O_CLOEXEC); err != nil {
						// Capture the FD number for the log message before
						// handleMu is released.
						oldFD := d.readFD.RacyLoad()
						d.handleMu.Unlock()
						ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, oldFD, err)
						h.close(ctx)
						return err
					}
					fdsToClose = append(fdsToClose, h.fd)
					h.fd = d.readFD.RacyLoad()
				}
			} else {
				d.readFD.Store(h.fd)
			}
			// A previous write-only FD that differs from the shared FD is no
			// longer needed.
			if d.writeFD.RacyLoad() != h.fd && d.writeFD.RacyLoad() >= 0 {
				fdsToClose = append(fdsToClose, d.writeFD.RacyLoad())
			}
			d.writeFD.Store(h.fd)
			d.mmapFD.Store(h.fd)
		} else if openReadable && d.readFD.RacyLoad() < 0 {
			readHandleWasOk := d.isReadHandleOk()
			d.readFD.Store(h.fd)
			// If the file has not been opened for writing, the new FD may
			// be used for read-only memory mappings. If the file was
			// previously opened for reading (without an FD), then existing
			// translations of the file may use the internal page cache;
			// invalidate those mappings.
			if !d.isWriteHandleOk() {
				invalidateTranslations = readHandleWasOk
				d.mmapFD.Store(h.fd)
			}
		} else if openWritable && d.writeFD.RacyLoad() < 0 {
			d.writeFD.Store(h.fd)
			if d.readFD.RacyLoad() >= 0 {
				// We have an existing read-only FD, but the file has just
				// been opened for writing, so we need to start supporting
				// writable memory mappings. However, the new FD is not
				// readable, so we have no FD that can be used to create
				// writable memory mappings. Switch to using the internal
				// page cache.
				invalidateTranslations = true
				d.mmapFD.Store(-1)
			}
		} else {
			// The new FD is not useful.
			fdsToClose = append(fdsToClose, h.fd)
		}
	} else if openWritable && d.writeFD.RacyLoad() < 0 && d.mmapFD.RacyLoad() >= 0 {
		// We have an existing read-only FD, but the file has just been
		// opened for writing, so we need to start supporting writable
		// memory mappings. However, we have no writable host FD. Switch to
		// using the internal page cache.
		invalidateTranslations = true
		d.mmapFD.Store(-1)
	}

	d.updateHandles(ctx, h, openReadable, openWritable)
	d.handleMu.Unlock()

	if invalidateTranslations {
		// Invalidate application mappings that may be using an old FD; they
		// will be replaced with mappings using the new FD after future calls
		// to d.Translate(). This requires holding d.mapsMu, which precedes
		// d.handleMu in the lock order.
		d.mapsMu.Lock()
		d.mappings.InvalidateAll(memmap.InvalidateOpts{})
		d.mapsMu.Unlock()
	}
	// Close superseded FDs; the result of close is not checked.
	for _, fd := range fdsToClose {
		unix.Close(int(fd))
	}

	return nil
}
  2057  
  2058  func (d *dentry) syncRemoteFile(ctx context.Context) error {
  2059  	d.handleMu.RLock()
  2060  	defer d.handleMu.RUnlock()
  2061  	return d.syncRemoteFileLocked(ctx)
  2062  }
  2063  
  2064  // Preconditions: d.handleMu must be locked.
  2065  func (d *dentry) syncRemoteFileLocked(ctx context.Context) error {
  2066  	// Prefer syncing write handles over read handles, since some remote
  2067  	// filesystem implementations may not sync changes made through write
  2068  	// handles otherwise.
  2069  	wh := d.writeHandle()
  2070  	wh.sync(ctx)
  2071  	rh := d.readHandle()
  2072  	rh.sync(ctx)
  2073  	return nil
  2074  }
  2075  
  2076  func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) error {
  2077  	d.handleMu.RLock()
  2078  	defer d.handleMu.RUnlock()
  2079  	if d.isWriteHandleOk() {
  2080  		// Write back dirty pages to the remote file.
  2081  		d.dataMu.Lock()
  2082  		h := d.writeHandle()
  2083  		err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), d.fs.mf, h.writeFromBlocksAt)
  2084  		d.dataMu.Unlock()
  2085  		if err != nil {
  2086  			return err
  2087  		}
  2088  	}
  2089  	if err := d.syncRemoteFileLocked(ctx); err != nil {
  2090  		if !forFilesystemSync {
  2091  			return err
  2092  		}
  2093  		// Only return err if we can reasonably have expected sync to succeed
  2094  		// (d is a regular file and was opened for writing).
  2095  		if d.isRegularFile() && d.isWriteHandleOk() {
  2096  			return err
  2097  		}
  2098  		ctx.Debugf("gofer.dentry.syncCachedFile: syncing non-writable or non-regular-file dentry failed: %v", err)
  2099  	}
  2100  	return nil
  2101  }
  2102  
  2103  // incLinks increments link count.
  2104  func (d *dentry) incLinks() {
  2105  	if d.nlink.Load() == 0 {
  2106  		// The remote filesystem doesn't support link count.
  2107  		return
  2108  	}
  2109  	d.nlink.Add(1)
  2110  }
  2111  
  2112  // decLinks decrements link count.
  2113  func (d *dentry) decLinks() {
  2114  	if d.nlink.Load() == 0 {
  2115  		// The remote filesystem doesn't support link count.
  2116  		return
  2117  	}
  2118  	d.nlink.Add(^uint32(0))
  2119  }
  2120  
// fileDescription is embedded by gofer implementations of
// vfs.FileDescriptionImpl. It supplies the methods shared by those
// implementations: stat, extended attribute and file locking operations.
//
// +stateify savable
type fileDescription struct {
	// vfsfd is the vfs.FileDescription through which the Mount, Dentry and
	// Impl associated with this file description are obtained.
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.LockFD

	// lockLogging guards the informational message logged by LockBSD and
	// LockPOSIX so that it is emitted only by the first such call on this
	// file description. It is excluded from saved state.
	lockLogging sync.Once `state:"nosave"`
}
  2132  
  2133  func (fd *fileDescription) filesystem() *filesystem {
  2134  	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
  2135  }
  2136  
  2137  func (fd *fileDescription) dentry() *dentry {
  2138  	return fd.vfsfd.Dentry().Impl().(*dentry)
  2139  }
  2140  
  2141  // Stat implements vfs.FileDescriptionImpl.Stat.
  2142  func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
  2143  	d := fd.dentry()
  2144  	const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME)
  2145  	if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC {
  2146  		// Use specialFileFD.handle.fileLisa for the Stat if available, for the
  2147  		// same reason that we try to use open FD in updateMetadataLocked().
  2148  		var err error
  2149  		if sffd, ok := fd.vfsfd.Impl().(*specialFileFD); ok {
  2150  			err = sffd.updateMetadata(ctx)
  2151  		} else {
  2152  			err = d.updateMetadata(ctx)
  2153  		}
  2154  		if err != nil {
  2155  			return linux.Statx{}, err
  2156  		}
  2157  	}
  2158  	var stat linux.Statx
  2159  	d.statTo(&stat)
  2160  	return stat, nil
  2161  }
  2162  
  2163  // SetStat implements vfs.FileDescriptionImpl.SetStat.
  2164  func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
  2165  	fs := fd.filesystem()
  2166  	fs.renameMu.RLock()
  2167  	defer fs.renameMu.RUnlock()
  2168  	return fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts, fd.vfsfd.Mount())
  2169  }
  2170  
  2171  // ListXattr implements vfs.FileDescriptionImpl.ListXattr.
  2172  func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
  2173  	return fd.dentry().listXattr(ctx, size)
  2174  }
  2175  
  2176  // GetXattr implements vfs.FileDescriptionImpl.GetXattr.
  2177  func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
  2178  	return fd.dentry().getXattr(ctx, auth.CredentialsFromContext(ctx), &opts)
  2179  }
  2180  
  2181  // SetXattr implements vfs.FileDescriptionImpl.SetXattr.
  2182  func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
  2183  	return fd.dentry().setXattr(ctx, auth.CredentialsFromContext(ctx), &opts)
  2184  }
  2185  
  2186  // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
  2187  func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
  2188  	return fd.dentry().removeXattr(ctx, auth.CredentialsFromContext(ctx), name)
  2189  }
  2190  
  2191  // LockBSD implements vfs.FileDescriptionImpl.LockBSD.
  2192  func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block bool) error {
  2193  	fd.lockLogging.Do(func() {
  2194  		log.Infof("File lock using gofer file handled internally.")
  2195  	})
  2196  	return fd.LockFD.LockBSD(ctx, uid, ownerPID, t, block)
  2197  }
  2198  
  2199  // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
  2200  func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block bool) error {
  2201  	fd.lockLogging.Do(func() {
  2202  		log.Infof("Range lock using gofer file handled internally.")
  2203  	})
  2204  	return fd.Locks().LockPOSIX(ctx, uid, ownerPID, t, r, block)
  2205  }
  2206  
  2207  // UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
  2208  func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error {
  2209  	return fd.Locks().UnlockPOSIX(ctx, uid, r)
  2210  }
  2211  
// resolvingPath is a thin wrapper around *vfs.ResolvingPath. In addition to
// the embedded path, it records the intent behind resolving it, i.e. whether
// the final path component is to be resolved as well.
type resolvingPath struct {
	*vfs.ResolvingPath

	// excludeLast indicates whether resolution should stop before the last
	// path component. If true, the last path component should remain
	// unresolved; if false, the whole path is resolved.
	excludeLast bool
}
  2221  
  2222  func resolvingPathFull(rp *vfs.ResolvingPath) resolvingPath {
  2223  	return resolvingPath{ResolvingPath: rp, excludeLast: false}
  2224  }
  2225  
  2226  func resolvingPathParent(rp *vfs.ResolvingPath) resolvingPath {
  2227  	return resolvingPath{ResolvingPath: rp, excludeLast: true}
  2228  }
  2229  
  2230  func (rp *resolvingPath) done() bool {
  2231  	if rp.excludeLast {
  2232  		return rp.Final()
  2233  	}
  2234  	return rp.Done()
  2235  }
  2236  
  2237  func (rp *resolvingPath) copy() resolvingPath {
  2238  	return resolvingPath{
  2239  		ResolvingPath: rp.ResolvingPath.Copy(),
  2240  		excludeLast:   rp.excludeLast,
  2241  	}
  2242  }
  2243  
  2244  // Precondition: !rp.done() && rp.Component() is not "." or "..".
  2245  func (rp *resolvingPath) getComponents(emit func(string) bool) {
  2246  	rp.GetComponents(rp.excludeLast, emit)
  2247  }