gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/fsimpl/gofer/gofer.go

gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/fsimpl/gofer/gofer.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package gofer provides a filesystem implementation that is backed by a 9p
    16  // server, interchangeably referred to as "gofers" throughout this package.
    17  //
    18  // Lock order:
    19  //
    20  //	regularFileFD/directoryFD.mu
    21  //	  filesystem.renameMu
    22  //	    dentry.cachingMu
    23  //	      dentryCache.mu
    24  //	      dentry.opMu
    25  //	        dentry.childrenMu
    26  //	        filesystem.syncMu
    27  //	        dentry.metadataMu
    28  //	          *** "memmap.Mappable locks" below this point
    29  //	          dentry.mapsMu
    30  //	            *** "memmap.Mappable locks taken by Translate" below this point
    31  //	            dentry.handleMu
    32  //	              dentry.dataMu
    33  //	          filesystem.inoMu
    34  //	specialFileFD.mu
    35  //	  specialFileFD.bufMu
    36  //
    37  // Locking dentry.opMu and dentry.metadataMu in multiple dentries requires that
    38  // either ancestor dentries are locked before descendant dentries, or that
    39  // filesystem.renameMu is locked for writing.
    40  package gofer
    41  
    42  import (
    43  	"fmt"
    44  	"path"
    45  	"strconv"
    46  	"strings"
    47  	"sync/atomic"
    48  
    49  	"golang.org/x/sys/unix"
    50  	"gvisor.dev/gvisor/pkg/abi/linux"
    51  	"gvisor.dev/gvisor/pkg/atomicbitops"
    52  	"gvisor.dev/gvisor/pkg/cleanup"
    53  	"gvisor.dev/gvisor/pkg/context"
    54  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    55  	"gvisor.dev/gvisor/pkg/hostarch"
    56  	"gvisor.dev/gvisor/pkg/lisafs"
    57  	"gvisor.dev/gvisor/pkg/log"
    58  	"gvisor.dev/gvisor/pkg/refs"
    59  	fslock "gvisor.dev/gvisor/pkg/sentry/fsimpl/lock"
    60  	"gvisor.dev/gvisor/pkg/sentry/fsutil"
    61  	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
    62  	"gvisor.dev/gvisor/pkg/sentry/kernel/pipe"
    63  	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
    64  	"gvisor.dev/gvisor/pkg/sentry/memmap"
    65  	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
    66  	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
    67  	"gvisor.dev/gvisor/pkg/sentry/vfs"
    68  	"gvisor.dev/gvisor/pkg/sync"
    69  	"gvisor.dev/gvisor/pkg/unet"
    70  )
    71  
// Name is the default filesystem name. It is the value returned by
// FilesystemType.Name.
const Name = "9p"
    74  
// Mount option names for goferfs.
//
// These are the keys that GetFilesystem looks up in (and removes from) the
// parsed mount data; any key left over afterwards causes the mount to fail
// with EINVAL.
//
//   - trans, rfdno, wfdno identify the server connection: trans must be "fd",
//     and rfdno and wfdno must both be present and name the same descriptor.
//   - aname is the absolute attach path; it defaults to "/".
//   - dfltuid and dfltgid are parsed as unsigned 32-bit decimal IDs.
//   - cache selects the InteropMode; dcache is the dentry cache size.
//   - The remaining options are flags that are enabled by their presence;
//     their values are ignored.
const (
	moptTransport                = "trans"
	moptReadFD                   = "rfdno"
	moptWriteFD                  = "wfdno"
	moptAname                    = "aname"
	moptDfltUID                  = "dfltuid"
	moptDfltGID                  = "dfltgid"
	moptCache                    = "cache"
	moptDcache                   = "dcache"
	moptForcePageCache           = "force_page_cache"
	moptLimitHostFDTranslation   = "limit_host_fd_translation"
	moptOverlayfsStaleRead       = "overlayfs_stale_read"
	moptDisableFileHandleSharing = "disable_file_handle_sharing"
	moptDisableFifoOpen          = "disable_fifo_open"

	// Directfs options.
	moptDirectfs = "directfs"
)
    94  
// Valid values for the "cache" mount option.
//
// GetFilesystem maps them to interop modes as follows: "fscache" selects
// InteropModeExclusive, "fscache_writethrough" selects
// InteropModeWritethrough, and "remote_revalidating" selects
// InteropModeShared. Any other value is rejected with EINVAL.
const (
	cacheFSCache             = "fscache"
	cacheFSCacheWritethrough = "fscache_writethrough"
	cacheRemoteRevalidating  = "remote_revalidating"
)
   101  
// SupportedMountOptions is the set of mount options that can be set
// externally: "overlayfs_stale_read", "disable_file_handle_sharing" and
// "dcache".
var SupportedMountOptions = []string{moptOverlayfsStaleRead, moptDisableFileHandleSharing, moptDcache}
   104  
// Default cache sizes.
//
// defaultMaxCachedDentries is the dentry cache size GetFilesystem uses when
// the "dcache" mount option is absent (or negative).
//
// maxCachedNegativeChildren is not referenced in this part of the file; it
// presumably bounds the number of negative child names remembered per
// directory (see stringFixedCache) -- TODO confirm against its users.
const (
	defaultMaxCachedDentries  = 1000
	maxCachedNegativeChildren = 1000
)
   109  
// stringFixedCache is a fixed-size cache of strings. Once initialized with
// init, its size never changes: add overwrites the oldest slot instead of
// growing the list.
//
// +stateify savable
type stringFixedCache struct {
	// namesList stores negative names in a FIFO list. The presence of a name
	// in namesList only means that it was negative at the moment it was
	// pushed onto the list.
	//
	// size is the number of elements allocated by init; it is zero until init
	// has been called with a non-zero size.
	namesList stringList
	size      uint64
}
   121  
   122  func (cache *stringFixedCache) isInited() bool {
   123  	return cache.size != 0
   124  }
   125  
   126  func (cache *stringFixedCache) init(size uint64) {
   127  	elements := make([]stringListElem, size)
   128  	for i := uint64(0); i < size; i++ {
   129  		cache.namesList.PushFront(&elements[i])
   130  	}
   131  	cache.size = size
   132  }
   133  
   134  // Update will push name to the front of the list,
   135  // and pop the tail value.
   136  func (cache *stringFixedCache) add(name string) string {
   137  	tail := cache.namesList.Back()
   138  	victimName := tail.str
   139  	tail.str = name
   140  	cache.namesList.Remove(tail)
   141  	cache.namesList.PushFront(tail)
   142  	return victimName
   143  }
   144  
// dentryCache tracks cached dentries, up to maxCachedDentries of them. A
// dentryCache is either shared by all gofer filesystems (globalDentryCache)
// or created for a single filesystem by GetFilesystem.
//
// +stateify savable
type dentryCache struct {
	// maxCachedDentries is the maximum number of cacheable dentries.
	// maxCachedDentries is immutable.
	maxCachedDentries uint64
	// mu protects the below fields.
	mu sync.Mutex `state:"nosave"`
	// dentries contains all dentries with 0 references. Due to race conditions,
	// it may also contain dentries with non-zero references.
	dentries dentryList
	// dentriesLen is the number of dentries in dentries.
	dentriesLen uint64
}
   158  
   159  // SetDentryCacheSize sets the size of the global gofer dentry cache.
   160  func SetDentryCacheSize(size int) {
   161  	if size < 0 {
   162  		return
   163  	}
   164  	if globalDentryCache != nil {
   165  		log.Warningf("Global dentry cache has already been initialized. Ignoring subsequent attempt.")
   166  		return
   167  	}
   168  	globalDentryCache = &dentryCache{maxCachedDentries: uint64(size)}
   169  }
   170  
// globalDentryCache is a global cache of dentries across all gofer clients.
// It is nil until SetDentryCacheSize is called with a non-negative size; while
// it is nil, GetFilesystem gives each filesystem its own dentryCache.
var globalDentryCache *dentryCache
   173  
// Valid values for "trans" mount option. "fd" is the only accepted transport;
// getFDFromMountOptionsMap rejects any other value.
const transportModeFD = "fd"
   176  
// FilesystemType implements vfs.FilesystemType. It carries no state; all
// per-mount state lives in the filesystem created by GetFilesystem.
//
// +stateify savable
type FilesystemType struct{}
   181  
// filesystem implements vfs.FilesystemImpl.
//
// +stateify savable
type filesystem struct {
	// vfsfs is the vfs.Filesystem backed by this implementation. It is
	// initialized by GetFilesystem.
	vfsfs vfs.Filesystem

	// mf is used to allocate memory that caches regular file contents. mf is
	// immutable.
	mf *pgalloc.MemoryFile `state:"nosave"`

	// Immutable options. opts holds the parsed mount options and iopts the
	// options passed as vfs.GetFilesystemOptions.InternalData.
	opts  filesystemOptions
	iopts InternalFilesystemOptions

	// client is the LISAFS client used for communicating with the server. client
	// is immutable.
	client *lisafs.Client `state:"nosave"`

	// clock is a realtime clock used to set timestamps in file operations.
	clock ktime.Clock

	// devMinor is the filesystem's minor device number. devMinor is immutable.
	devMinor uint32

	// root is the root dentry. root is immutable.
	root *dentry

	// renameMu serves two purposes:
	//
	//	- It synchronizes path resolution with renaming initiated by this
	//		client.
	//
	//	- It is held by path resolution to ensure that reachable dentries remain
	//		valid. A dentry is reachable by path resolution if it has a non-zero
	//		reference count (such that it is usable as vfs.ResolvingPath.Start() or
	//		is reachable from its children), or if it is a child dentry (such that
	//		it is reachable from its parent).
	renameMu sync.RWMutex `state:"nosave"`

	// dentryCache is the cache used for this filesystem's dentries. It is
	// globalDentryCache if one has been configured, and otherwise a cache
	// private to this filesystem sized by opts.dcache.
	dentryCache *dentryCache

	// syncableDentries contains all non-synthetic dentries. specialFileFDs
	// contains all open specialFileFDs. These fields are protected by syncMu.
	syncMu           sync.Mutex `state:"nosave"`
	syncableDentries dentryList
	specialFileFDs   specialFDList

	// inoByKey maps previously-observed device ID and host inode numbers to
	// internal inode numbers assigned to those files. inoByKey is not preserved
	// across checkpoint/restore because inode numbers may be reused between
	// different gofer processes, so inode numbers may be repeated for different
	// files across checkpoint/restore. inoByKey is protected by inoMu.
	inoMu    sync.Mutex        `state:"nosave"`
	inoByKey map[inoKey]uint64 `state:"nosave"`

	// lastIno is the last inode number assigned to a file. lastIno is accessed
	// using atomic memory operations.
	lastIno atomicbitops.Uint64

	// savedDentryRW records open read/write handles during save/restore.
	savedDentryRW map[*dentry]savedDentryRW

	// released is nonzero once filesystem.Release has been called.
	released atomicbitops.Int32
}
   247  
// filesystemOptions holds the options that GetFilesystem parses from the
// mount data.
//
// +stateify savable
type filesystemOptions struct {
	// fd is the file descriptor of the server connection (the "rfdno" and
	// "wfdno" options, which must be equal). aname is the cleaned absolute
	// attach path (the "aname" option, "/" by default). dfltuid and dfltgid
	// come from the "dfltuid" and "dfltgid" options and default to
	// _V9FS_DEFUID and _V9FS_DEFGID.
	fd      int
	aname   string
	interop InteropMode // derived from the "cache" mount option
	dfltuid auth.KUID
	dfltgid auth.KGID

	// dcache is the maximum number of dentries that can be cached. This is
	// effective only if globalDentryCache is not being used.
	dcache uint64

	// If forcePageCache is true, host FDs may not be used for application
	// memory mappings even if available; instead, the client must perform its
	// own caching of regular file pages. This is primarily useful for testing.
	forcePageCache bool

	// If limitHostFDTranslation is true, apply maxFillRange() constraints to
	// host FD mappings returned by dentry.(memmap.Mappable).Translate(). This
	// makes memory accounting behavior more consistent between cases where
	// host FDs are / are not available, but may increase the frequency of
	// sentry-handled page faults on files for which a host FD is available.
	limitHostFDTranslation bool

	// If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote
	// filesystem may not be coherent with writable host FDs opened later, so
	// all uses of the former must be replaced by uses of the latter. This is
	// usually only the case when the remote filesystem is a Linux overlayfs
	// mount. (Prior to Linux 4.18, patch series centered on commit
	// d1d04ef8572b "ovl: stack file ops", both I/O and memory mappings were
	// incoherent between pre-copy-up and post-copy-up FDs; after that patch
	// series, only memory mappings are incoherent.)
	overlayfsStaleRead bool

	// If regularFilesUseSpecialFileFD is true, application FDs representing
	// regular files will use distinct file handles for each FD, in the same
	// way that application FDs representing "special files" such as sockets
	// do. Note that this disables client caching for regular files. This option
	// may regress performance due to excessive Open RPCs. This option is not
	// supported with overlayfsStaleRead for now. It is set by the
	// "disable_file_handle_sharing" mount option.
	regularFilesUseSpecialFileFD bool

	// If disableFifoOpen is true, application attempts to open(2) a host FIFO
	// are disallowed.
	disableFifoOpen bool

	// directfs holds options for directfs mode.
	directfs directfsOpts
}
   297  
// directfsOpts holds the options for directfs mode.
//
// +stateify savable
type directfsOpts struct {
	// If directfs is enabled, the gofer client does not make RPCs to the gofer
	// process. Instead, it makes host syscalls to perform file operations.
	// enabled is set by the presence of the "directfs" mount option.
	enabled bool
}
   304  
// InteropMode controls the client's interaction with other remote filesystem
// users. It is selected by the "cache" mount option.
//
// +stateify savable
type InteropMode uint32
   310  
// Interop modes, in order of decreasing freedom to cache. GetFilesystem uses
// InteropModeExclusive when no "cache" mount option is given.
const (
	// InteropModeExclusive is appropriate when the filesystem client is the
	// only user of the remote filesystem. It is selected by "cache=fscache".
	//
	//	- The client may cache arbitrary filesystem state (file data, metadata,
	//		filesystem structure, etc.).
	//
	//	- Client changes to filesystem state may be sent to the remote
	//		filesystem asynchronously, except when server permission checks are
	//		necessary.
	//
	//	- File timestamps are based on client clocks. This ensures that users of
	//		the client observe timestamps that are coherent with their own clocks
	//		and consistent with Linux's semantics (in particular, it is not always
	//		possible for clients to set arbitrary atimes and mtimes depending on the
	//		remote filesystem implementation, and never possible for clients to set
	//		arbitrary ctimes.)
	InteropModeExclusive InteropMode = iota

	// InteropModeWritethrough is appropriate when there are read-only users of
	// the remote filesystem that expect to observe changes made by the
	// filesystem client. It is selected by "cache=fscache_writethrough".
	//
	//	- The client may cache arbitrary filesystem state.
	//
	//	- Client changes to filesystem state must be sent to the remote
	//		filesystem synchronously.
	//
	//	- File timestamps are based on client clocks. As a corollary, access
	//		timestamp changes from other remote filesystem users will not be visible
	//		to the client.
	InteropModeWritethrough

	// InteropModeShared is appropriate when there are users of the remote
	// filesystem that may mutate its state other than the client. It is
	// selected by "cache=remote_revalidating".
	//
	//	- The client must verify ("revalidate") cached filesystem state before
	//		using it.
	//
	//	- Client changes to filesystem state must be sent to the remote
	//		filesystem synchronously.
	//
	//	- File timestamps are based on server clocks. This is necessary to
	//		ensure that timestamp changes are synchronized between remote filesystem
	//		users.
	//
	// Note that the correctness of InteropModeShared depends on the server
	// correctly implementing 9P fids (i.e. each fid immutably represents a
	// single filesystem object), even in the presence of remote filesystem
	// mutations from other users. If this is violated, the behavior of the
	// client is undefined.
	InteropModeShared
)
   364  
// InternalFilesystemOptions may be passed as
// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem. If
// InternalData is nil, GetFilesystem uses the zero value; InternalData of any
// other type is rejected with EINVAL.
//
// +stateify savable
type InternalFilesystemOptions struct {
	// If UniqueID is non-empty, it is an opaque string used to reassociate the
	// filesystem with a new server FD during restoration from checkpoint.
	UniqueID vfs.RestoreID

	// If LeakConnection is true, do not close the connection to the server
	// when the Filesystem is released. This is necessary for deployments in
	// which servers can handle only a single client and report failure if that
	// client disconnects.
	LeakConnection bool

	// If OpenSocketsByConnecting is true, silently translate attempts to open
	// files identifying as sockets to connect RPCs.
	OpenSocketsByConnecting bool
}
   384  
// _V9FS_DEFUID and _V9FS_DEFGID (from Linux's fs/9p/v9fs.h) are the default
// UIDs and GIDs used for files that do not provide a specific owner or group
// respectively. GetFilesystem uses them when the "dfltuid" and "dfltgid" mount
// options are absent.
const (
	// 4294967294 is 2^32 - 2, i.e. -2 as an unsigned 32-bit value. It is
	// spelled out because the constant conversion uint32(-2) doesn't work in
	// Go.
	_V9FS_DEFUID = auth.KUID(4294967294)
	_V9FS_DEFGID = auth.KGID(4294967294)
)
   393  
// Name implements vfs.FilesystemType.Name. It returns the package-level Name
// constant.
func (FilesystemType) Name() string {
	return Name
}
   398  
// Release implements vfs.FilesystemType.Release. It is a no-op:
// FilesystemType has no fields and therefore nothing to release.
func (FilesystemType) Release(ctx context.Context) {}
   401  
   402  // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
   403  func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
   404  	mf := pgalloc.MemoryFileFromContext(ctx)
   405  	if mf == nil {
   406  		ctx.Warningf("gofer.FilesystemType.GetFilesystem: CtxMemoryFile is nil")
   407  		return nil, nil, linuxerr.EINVAL
   408  	}
   409  
   410  	mopts := vfs.GenericParseMountOptions(opts.Data)
   411  	var fsopts filesystemOptions
   412  
   413  	fd, err := getFDFromMountOptionsMap(ctx, mopts)
   414  	if err != nil {
   415  		return nil, nil, err
   416  	}
   417  	fsopts.fd = fd
   418  
   419  	// Get the attach name.
   420  	fsopts.aname = "/"
   421  	if aname, ok := mopts[moptAname]; ok {
   422  		delete(mopts, moptAname)
   423  		if !path.IsAbs(aname) {
   424  			ctx.Warningf("gofer.FilesystemType.GetFilesystem: aname is not absolute: %s=%s", moptAname, aname)
   425  			return nil, nil, linuxerr.EINVAL
   426  		}
   427  		fsopts.aname = path.Clean(aname)
   428  	}
   429  
   430  	// Parse the cache policy. For historical reasons, this defaults to the
   431  	// least generally-applicable option, InteropModeExclusive.
   432  	fsopts.interop = InteropModeExclusive
   433  	if cache, ok := mopts[moptCache]; ok {
   434  		delete(mopts, moptCache)
   435  		switch cache {
   436  		case cacheFSCache:
   437  			fsopts.interop = InteropModeExclusive
   438  		case cacheFSCacheWritethrough:
   439  			fsopts.interop = InteropModeWritethrough
   440  		case cacheRemoteRevalidating:
   441  			fsopts.interop = InteropModeShared
   442  		default:
   443  			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid cache policy: %s=%s", moptCache, cache)
   444  			return nil, nil, linuxerr.EINVAL
   445  		}
   446  	}
   447  
   448  	// Parse the dentry cache size.
   449  	fsopts.dcache = defaultMaxCachedDentries
   450  	if dcacheStr, ok := mopts[moptDcache]; ok {
   451  		delete(mopts, moptDcache)
   452  		dcache, err := strconv.ParseInt(dcacheStr, 10, 64)
   453  		if err != nil {
   454  			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid dcache: %s=%s", moptDcache, dcacheStr)
   455  			return nil, nil, linuxerr.EINVAL
   456  		}
   457  		if dcache >= 0 {
   458  			fsopts.dcache = uint64(dcache)
   459  		}
   460  	}
   461  
   462  	// Parse the default UID and GID.
   463  	fsopts.dfltuid = _V9FS_DEFUID
   464  	if dfltuidstr, ok := mopts[moptDfltUID]; ok {
   465  		delete(mopts, moptDfltUID)
   466  		dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32)
   467  		if err != nil {
   468  			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: %s=%s", moptDfltUID, dfltuidstr)
   469  			return nil, nil, linuxerr.EINVAL
   470  		}
   471  		// In Linux, dfltuid is interpreted as a UID and is converted to a KUID
   472  		// in the caller's user namespace, but goferfs isn't
   473  		// application-mountable.
   474  		fsopts.dfltuid = auth.KUID(dfltuid)
   475  	}
   476  	fsopts.dfltgid = _V9FS_DEFGID
   477  	if dfltgidstr, ok := mopts[moptDfltGID]; ok {
   478  		delete(mopts, moptDfltGID)
   479  		dfltgid, err := strconv.ParseUint(dfltgidstr, 10, 32)
   480  		if err != nil {
   481  			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: %s=%s", moptDfltGID, dfltgidstr)
   482  			return nil, nil, linuxerr.EINVAL
   483  		}
   484  		fsopts.dfltgid = auth.KGID(dfltgid)
   485  	}
   486  
   487  	// Handle simple flags.
   488  	if _, ok := mopts[moptDisableFileHandleSharing]; ok {
   489  		delete(mopts, moptDisableFileHandleSharing)
   490  		fsopts.regularFilesUseSpecialFileFD = true
   491  	}
   492  	if _, ok := mopts[moptDisableFifoOpen]; ok {
   493  		delete(mopts, moptDisableFifoOpen)
   494  		fsopts.disableFifoOpen = true
   495  	}
   496  	if _, ok := mopts[moptForcePageCache]; ok {
   497  		delete(mopts, moptForcePageCache)
   498  		fsopts.forcePageCache = true
   499  	}
   500  	if _, ok := mopts[moptLimitHostFDTranslation]; ok {
   501  		delete(mopts, moptLimitHostFDTranslation)
   502  		fsopts.limitHostFDTranslation = true
   503  	}
   504  	if _, ok := mopts[moptOverlayfsStaleRead]; ok {
   505  		delete(mopts, moptOverlayfsStaleRead)
   506  		fsopts.overlayfsStaleRead = true
   507  	}
   508  	if _, ok := mopts[moptDirectfs]; ok {
   509  		delete(mopts, moptDirectfs)
   510  		fsopts.directfs.enabled = true
   511  	}
   512  	// fsopts.regularFilesUseSpecialFileFD can only be enabled by specifying
   513  	// "cache=none".
   514  
   515  	// Check for unparsed options.
   516  	if len(mopts) != 0 {
   517  		ctx.Warningf("gofer.FilesystemType.GetFilesystem: unknown options: %v", mopts)
   518  		return nil, nil, linuxerr.EINVAL
   519  	}
   520  
   521  	// Validation.
   522  	if fsopts.regularFilesUseSpecialFileFD && fsopts.overlayfsStaleRead {
   523  		// These options are not supported together. To support this, when a dentry
   524  		// is opened writably for the first time, we need to iterate over all the
   525  		// specialFileFDs of that dentry that represent a regular file and call
   526  		// fd.hostFileMapper.RegenerateMappings(writable_fd).
   527  		ctx.Warningf("gofer.FilesystemType.GetFilesystem: regularFilesUseSpecialFileFD and overlayfsStaleRead options are not supported together.")
   528  		return nil, nil, linuxerr.EINVAL
   529  	}
   530  
   531  	// Handle internal options.
   532  	iopts, ok := opts.InternalData.(InternalFilesystemOptions)
   533  	if opts.InternalData != nil && !ok {
   534  		ctx.Warningf("gofer.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted gofer.InternalFilesystemOptions", opts.InternalData)
   535  		return nil, nil, linuxerr.EINVAL
   536  	}
   537  	// If !ok, iopts being the zero value is correct.
   538  
   539  	// Construct the filesystem object.
   540  	devMinor, err := vfsObj.GetAnonBlockDevMinor()
   541  	if err != nil {
   542  		return nil, nil, err
   543  	}
   544  	fs := &filesystem{
   545  		mf:       mf,
   546  		opts:     fsopts,
   547  		iopts:    iopts,
   548  		clock:    ktime.RealtimeClockFromContext(ctx),
   549  		devMinor: devMinor,
   550  		inoByKey: make(map[inoKey]uint64),
   551  	}
   552  
   553  	// Did the user configure a global dentry cache?
   554  	if globalDentryCache != nil {
   555  		fs.dentryCache = globalDentryCache
   556  	} else {
   557  		fs.dentryCache = &dentryCache{maxCachedDentries: fsopts.dcache}
   558  	}
   559  
   560  	fs.vfsfs.Init(vfsObj, &fstype, fs)
   561  
   562  	rootInode, rootHostFD, err := fs.initClientAndGetRoot(ctx)
   563  	if err != nil {
   564  		fs.vfsfs.DecRef(ctx)
   565  		return nil, nil, err
   566  	}
   567  	if fs.opts.directfs.enabled {
   568  		fs.root, err = fs.getDirectfsRootDentry(ctx, rootHostFD, fs.client.NewFD(rootInode.ControlFD))
   569  	} else {
   570  		fs.root, err = fs.newLisafsDentry(ctx, &rootInode)
   571  	}
   572  	if err != nil {
   573  		fs.vfsfs.DecRef(ctx)
   574  		return nil, nil, err
   575  	}
   576  	// Set the root's reference count to 2. One reference is returned to the
   577  	// caller, and the other is held by fs to prevent the root from being "cached"
   578  	// and subsequently evicted.
   579  	fs.root.refs = atomicbitops.FromInt64(2)
   580  	return &fs.vfsfs, &fs.root.vfsd, nil
   581  }
   582  
// initClientAndGetRoot initializes fs.client and returns the root inode for
// this mount point. It handles the attach point (fs.opts.aname) resolution.
//
// It returns the root inode and the host FD for the mount point. The host FD
// is -1 unless directfs is enabled. On error it returns a zero inode and -1,
// after closing the host FD (if any) and the root control FD.
func (fs *filesystem) initClientAndGetRoot(ctx context.Context) (lisafs.Inode, int, error) {
	sock, err := unet.NewSocket(fs.opts.fd)
	if err != nil {
		return lisafs.Inode{}, -1, err
	}

	// The remainder of this function is bracketed as an uninterruptible sleep.
	ctx.UninterruptibleSleepStart(false)
	defer ctx.UninterruptibleSleepFinish(false)

	var (
		rootInode  lisafs.Inode
		rootHostFD int
	)
	fs.client, rootInode, rootHostFD, err = lisafs.NewClient(sock)
	if err != nil {
		return lisafs.Inode{}, -1, err
	}

	// Undo the mount on any failure below. The closure reads rootHostFD and
	// rootInode when it runs, not when it is created, so it observes the later
	// assignments to those variables.
	cu := cleanup.Make(func() {
		if rootHostFD >= 0 {
			_ = unix.Close(rootHostFD)
		}
		rootControlFD := fs.client.NewFD(rootInode.ControlFD)
		rootControlFD.Close(ctx, false /* flush */)
	})
	defer cu.Clean()

	if fs.opts.directfs.enabled {
		// directfs requires the default attach point and a host FD for it.
		if fs.opts.aname != "/" {
			log.Warningf("directfs does not support aname filesystem option: aname=%q", fs.opts.aname)
			return lisafs.Inode{}, -1, unix.EINVAL
		}
		if rootHostFD < 0 {
			log.Warningf("Mount RPC did not return host FD to mount point with directfs enabled")
			return lisafs.Inode{}, -1, unix.EINVAL
		}
	} else {
		if rootHostFD >= 0 {
			log.Warningf("Mount RPC returned a host FD to mount point without directfs, we didn't ask for it")
			_ = unix.Close(rootHostFD)
			// Mark the FD as closed so that the cleanup above does not close it
			// a second time.
			rootHostFD = -1
		}
		// Use flipcall channels with lisafs because it makes a lot of RPCs.
		if err := fs.client.StartChannels(); err != nil {
			return lisafs.Inode{}, -1, err
		}
		// NOTE(review): rootInode is overwritten even when handleAnameLisafs
		// fails, so the cleanup closes whatever control FD the returned inode
		// carries -- confirm that this is the intended FD on the error path.
		rootInode, err = fs.handleAnameLisafs(ctx, rootInode)
		if err != nil {
			return lisafs.Inode{}, -1, err
		}
	}
	// Success: ownership of the FDs passes to the caller.
	cu.Release()
	return rootInode, rootHostFD, nil
}
   639  
   640  func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int, error) {
   641  	// Check that the transport is "fd".
   642  	trans, ok := mopts[moptTransport]
   643  	if !ok || trans != transportModeFD {
   644  		ctx.Warningf("gofer.getFDFromMountOptionsMap: transport must be specified as '%s=%s'", moptTransport, transportModeFD)
   645  		return -1, linuxerr.EINVAL
   646  	}
   647  	delete(mopts, moptTransport)
   648  
   649  	// Check that read and write FDs are provided and identical.
   650  	rfdstr, ok := mopts[moptReadFD]
   651  	if !ok {
   652  		ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD must be specified as '%s=<file descriptor>'", moptReadFD)
   653  		return -1, linuxerr.EINVAL
   654  	}
   655  	delete(mopts, moptReadFD)
   656  	rfd, err := strconv.Atoi(rfdstr)
   657  	if err != nil {
   658  		ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid read FD: %s=%s", moptReadFD, rfdstr)
   659  		return -1, linuxerr.EINVAL
   660  	}
   661  	wfdstr, ok := mopts[moptWriteFD]
   662  	if !ok {
   663  		ctx.Warningf("gofer.getFDFromMountOptionsMap: write FD must be specified as '%s=<file descriptor>'", moptWriteFD)
   664  		return -1, linuxerr.EINVAL
   665  	}
   666  	delete(mopts, moptWriteFD)
   667  	wfd, err := strconv.Atoi(wfdstr)
   668  	if err != nil {
   669  		ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid write FD: %s=%s", moptWriteFD, wfdstr)
   670  		return -1, linuxerr.EINVAL
   671  	}
   672  	if rfd != wfd {
   673  		ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD (%d) and write FD (%d) must be equal", rfd, wfd)
   674  		return -1, linuxerr.EINVAL
   675  	}
   676  	return rfd, nil
   677  }
   678  
// Release implements vfs.FilesystemImpl.Release.
//
// It marks the filesystem as released, writes back dirty cached data and
// drops cached pages for every syncable dentry, closes their host FDs,
// optionally drops outstanding dentry references (only when leak checking is
// enabled), closes the server connection unless iopts.LeakConnection is set,
// and finally returns the device minor number to the VFS.
func (fs *filesystem) Release(ctx context.Context) {
	fs.released.Store(1)

	mf := fs.mf
	fs.syncMu.Lock()
	for elem := fs.syncableDentries.Front(); elem != nil; elem = elem.Next() {
		d := elem.d
		// handleMu is taken before dataMu, per the package lock order.
		d.handleMu.Lock()
		d.dataMu.Lock()
		if d.isWriteHandleOk() {
			// Write dirty cached data to the remote file. A failure is logged
			// and otherwise ignored; the cached pages are dropped regardless.
			h := d.writeHandle()
			if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil {
				log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err)
			}
			// TODO(jamieliu): Do we need to flushf/fsync d?
		}
		// Discard cached pages.
		d.cache.DropAll(mf)
		d.dirty.RemoveAll()
		d.dataMu.Unlock()
		// Close host FDs if they exist.
		d.closeHostFDs()
		d.handleMu.Unlock()
	}
	// There can't be any specialFileFDs still using fs, since each such
	// FileDescription would hold a reference on a Mount holding a reference on
	// fs.
	fs.syncMu.Unlock()

	// If leak checking is enabled, release all outstanding references in the
	// filesystem. We deliberately avoid doing this outside of leak checking; we
	// have released all external resources above rather than relying on dentry
	// destructors. fs.root may be nil if creating the client or initializing the
	// root dentry failed in GetFilesystem.
	if refs.GetLeakMode() != refs.NoLeakChecking && fs.root != nil {
		fs.renameMu.Lock()
		fs.root.releaseSyntheticRecursiveLocked(ctx)
		fs.evictAllCachedDentriesLocked(ctx)
		fs.renameMu.Unlock()

		// An extra reference was held by the filesystem on the root to prevent it from
		// being cached/evicted.
		fs.root.DecRef(ctx)
	}

	if !fs.iopts.LeakConnection {
		// Close the connection to the server. This implicitly closes all FDs.
		// fs.client is checked for nil before it is closed.
		if fs.client != nil {
			fs.client.Close()
		}
	}

	fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
}
   735  
   736  // releaseSyntheticRecursiveLocked traverses the tree with root d and decrements
   737  // the reference count on every synthetic dentry. Synthetic dentries have one
   738  // reference for existence that should be dropped during filesystem.Release.
   739  //
   740  // Precondition: d.fs.renameMu is locked for writing.
   741  func (d *dentry) releaseSyntheticRecursiveLocked(ctx context.Context) {
   742  	if d.isSynthetic() {
   743  		d.decRefNoCaching()
   744  		d.checkCachingLocked(ctx, true /* renameMuWriteLocked */)
   745  	}
   746  	if d.isDir() {
   747  		var children []*dentry
   748  		d.childrenMu.Lock()
   749  		for _, child := range d.children {
   750  			children = append(children, child)
   751  		}
   752  		d.childrenMu.Unlock()
   753  		for _, child := range children {
   754  			if child != nil {
   755  				child.releaseSyntheticRecursiveLocked(ctx)
   756  			}
   757  		}
   758  	}
   759  }
   760  
// inoKey is the key used to identify the inode backed by this dentry.
//
// It is built from file attributes (see inoKeyFromStatx and inoKeyFromStat):
// ino is the inode number reported for the file, and devMinor/devMajor are
// the minor and major numbers of the device reported for the file. inoKey is
// comparable and is used as the key of filesystem.inoByKey.
//
// +stateify savable
type inoKey struct {
	ino      uint64
	devMinor uint32
	devMajor uint32
}
   769  
   770  func inoKeyFromStatx(stat *linux.Statx) inoKey {
   771  	return inoKey{
   772  		ino:      stat.Ino,
   773  		devMinor: stat.DevMinor,
   774  		devMajor: stat.DevMajor,
   775  	}
   776  }
   777  
   778  func inoKeyFromStat(stat *unix.Stat_t) inoKey {
   779  	return inoKey{
   780  		ino:      stat.Ino,
   781  		devMinor: unix.Minor(stat.Dev),
   782  		devMajor: unix.Major(stat.Dev),
   783  	}
   784  }
   785  
// dentry implements vfs.DentryImpl.
//
// +stateify savable
type dentry struct {
	// vfsd is the vfs.Dentry for which this dentry is the implementation; it
	// is initialized by dentry.init.
	vfsd vfs.Dentry

	// refs is the reference count. Each dentry holds a reference on its
	// parent, even if disowned. An additional reference is held on all
	// synthetic dentries until they are unlinked or invalidated. When refs
	// reaches 0, the dentry may be added to the cache or destroyed. If refs ==
	// -1, the dentry has already been destroyed. refs is accessed using atomic
	// memory operations.
	refs atomicbitops.Int64

	// fs is the owning filesystem. fs is immutable.
	fs *filesystem

	// parent is this dentry's parent directory. Each dentry holds a reference
	// on its parent. If this dentry is a filesystem root, parent is nil.
	// parent is protected by filesystem.renameMu.
	parent atomic.Pointer[dentry] `state:".(*dentry)"`

	// name is the name of this dentry in its parent. If this dentry is a
	// filesystem root, name is the empty string. name is protected by
	// filesystem.renameMu.
	name string

	// inoKey is used to identify this dentry's inode.
	inoKey inoKey

	// If deleted is non-zero, the file represented by this dentry has been
	// deleted. deleted is accessed using atomic memory operations.
	deleted atomicbitops.Uint32

	// cachingMu is used to synchronize concurrent dentry caching attempts on
	// this dentry.
	cachingMu sync.Mutex `state:"nosave"`

	// If cached is true, this dentry is part of filesystem.dentryCache. cached
	// is protected by cachingMu.
	cached bool

	// cacheEntry links dentry into filesystem.dentryCache.dentries. It is
	// protected by filesystem.dentryCache.mu.
	cacheEntry dentryListElem

	// syncableListEntry links dentry into filesystem.syncableDentries. It is
	// protected by filesystem.syncMu.
	syncableListEntry dentryListElem

	// opMu synchronizes operations on this dentry. Operations that mutate
	// the dentry tree must hold this lock for writing. Operations that
	// only read the tree must hold for reading.
	opMu sync.RWMutex `state:"nosave"`

	// childrenMu protects the cached children data for this dentry.
	childrenMu sync.Mutex `state:"nosave"`

	// If this dentry represents a directory, children contains:
	//
	//	- Mappings of child filenames to dentries representing those children.
	//
	//	- Mappings of child filenames that are known not to exist to nil
	//		dentries (only if InteropModeShared is not in effect and the directory
	//		is not synthetic).
	//
	// +checklocks:childrenMu
	children map[string]*dentry

	// If this dentry represents a directory, negativeChildrenCache caches
	// names of negative children. negativeChildrenCache is not saved since
	// dentry.prepareSaveRecursive() drops all negative children.
	//
	// +checklocks:childrenMu
	negativeChildrenCache stringFixedCache `state:"nosave"`
	// If this dentry represents a directory, negativeChildren is the number of
	// negative children cached in dentry.children. negativeChildren is not
	// saved since dentry.prepareSaveRecursive() drops all negative children.
	//
	// +checklocks:childrenMu
	negativeChildren int `state:"nosave"`

	// If this dentry represents a directory, syntheticChildren is the number
	// of child dentries for which dentry.isSynthetic() == true.
	//
	// +checklocks:childrenMu
	syntheticChildren int

	// If this dentry represents a directory,
	// dentry.cachedMetadataAuthoritative() == true, and dirents is not
	// nil, then dirents is a cache of all entries in the directory, in the
	// order they were returned by the server. childrenSet just stores the
	// `Name` field of all dirents in a set for fast query. dirents and
	// childrenSet share the same lifecycle.
	//
	// +checklocks:childrenMu
	dirents []vfs.Dirent `state:"nosave"`
	// +checklocks:childrenMu
	childrenSet map[string]struct{} `state:"nosave"`

	// Cached metadata; protected by metadataMu.
	// To access:
	//   - In situations where consistency is not required (like stat), these
	//     can be accessed using atomic operations only (without locking).
	//   - Lock metadataMu and can access without atomic operations.
	// To mutate:
	//   - Lock metadataMu and use atomic operations to update because we might
	//     have atomic readers that don't hold the lock.
	metadataMu sync.Mutex          `state:"nosave"`
	ino        uint64              // immutable
	mode       atomicbitops.Uint32 // type is immutable, perms are mutable
	uid        atomicbitops.Uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
	gid        atomicbitops.Uint32 // auth.KGID, but ...
	blockSize  atomicbitops.Uint32 // 0 if unknown
	// Timestamps, all nsecs from the Unix epoch.
	atime atomicbitops.Int64
	mtime atomicbitops.Int64
	ctime atomicbitops.Int64
	btime atomicbitops.Int64
	// File size, which differs from other metadata in two ways:
	//
	//	- We make a best-effort attempt to keep it up to date even if
	//		!dentry.cachedMetadataAuthoritative() for the sake of O_APPEND writes.
	//
	//	- size is protected by both metadataMu and dataMu (i.e. both must be
	//		locked to mutate it; locking either is sufficient to access it).
	size atomicbitops.Uint64
	// If this dentry does not represent a synthetic file, deleted is 0, and
	// atimeDirty/mtimeDirty are non-zero, atime/mtime may have diverged from the
	// remote file's timestamps, which should be updated when this dentry is
	// evicted.
	atimeDirty atomicbitops.Uint32
	mtimeDirty atomicbitops.Uint32

	// nlink counts the number of hard links to this dentry. It's updated and
	// accessed using atomic operations. It's not protected by metadataMu like the
	// other metadata fields.
	nlink atomicbitops.Uint32

	// mapsMu protects mappings.
	mapsMu sync.Mutex `state:"nosave"`

	// If this dentry represents a regular file, mappings tracks mappings of
	// the file into memmap.MappingSpaces. mappings is protected by mapsMu.
	mappings memmap.MappingSet

	//	- If this dentry represents a regular file or directory, readFD (if not
	//		-1) is a host FD used for reads by all regularFileFDs/directoryFDs
	//		representing this dentry.
	//
	//	- If this dentry represents a regular file, writeFD (if not -1) is a host
	//		FD used for writes by all regularFileFDs representing this dentry.
	//
	//	- If this dentry represents a regular file, mmapFD is the host FD used
	//		for memory mappings. If mmapFD is -1, no such FD is available, and the
	//		internal page cache implementation is used for memory mappings instead.
	//
	// These fields are protected by handleMu. readFD, writeFD, and mmapFD are
	// additionally written using atomic memory operations, allowing them to be
	// read (albeit racily) with atomic.LoadInt32() without locking handleMu.
	//
	// readFD and writeFD may or may not be the same file descriptor. Once either
	// transitions from closed (-1) to open, it may be mutated with handleMu
	// locked, but cannot be closed until the dentry is destroyed.
	//
	// mmapFD is always either -1 or equal to readFD; if the file has been
	// opened for writing, it is additionally either -1 or equal to writeFD.
	handleMu sync.RWMutex       `state:"nosave"`
	readFD   atomicbitops.Int32 `state:"nosave"`
	writeFD  atomicbitops.Int32 `state:"nosave"`
	mmapFD   atomicbitops.Int32 `state:"nosave"`

	// dataMu protects cache, dirty, haveTarget and target. Together with
	// metadataMu it also protects size (see the comment on size).
	dataMu sync.RWMutex `state:"nosave"`

	// If this dentry represents a regular file that is client-cached, cache
	// maps offsets into the cached file to offsets into
	// filesystem.mfp.MemoryFile() that store the file's data. cache is
	// protected by dataMu.
	cache fsutil.FileRangeSet

	// If this dentry represents a regular file that is client-cached, dirty
	// tracks dirty segments in cache. dirty is protected by dataMu.
	dirty fsutil.DirtySet

	// pf implements memmap.File for mappings of hostFD.
	pf dentryPlatformFile

	// If this dentry represents a symbolic link, InteropModeShared is not in
	// effect, and haveTarget is true, target is the symlink target. haveTarget
	// and target are protected by dataMu.
	haveTarget bool
	target     string

	// If this dentry represents a synthetic socket file, endpoint is the
	// transport endpoint bound to this file.
	endpoint transport.BoundEndpoint

	// If this dentry represents a synthetic named pipe, pipe is the pipe
	// endpoint bound to this file.
	pipe *pipe.VFSPipe

	// locks is the vfs.FileLocks instance associated with this dentry.
	locks vfs.FileLocks

	// Inotify watches for this dentry.
	//
	// Note that inotify may behave unexpectedly in the presence of hard links,
	// because dentries corresponding to the same file have separate inotify
	// watches when they should share the same set. This is the case because it is
	// impossible for us to know for sure whether two dentries correspond to the
	// same underlying file (see the gofer filesystem section of vfs/inotify.md for
	// a more in-depth discussion on this matter).
	watches vfs.Watches

	// impl is the specific dentry implementation for non-synthetic dentries.
	// impl is immutable.
	//
	// If impl is nil, this dentry represents a synthetic file, i.e. a
	// file that does not exist on the host filesystem. As of this writing, the
	// only files that can be synthetic are sockets, pipes, and directories.
	impl any
}
  1007  
// stringListElem is a list element carrying a single string. The embedded
// stringEntry is presumably the generated intrusive-list linkage (NOTE(review):
// its definition is not in this file; confirm).
//
// +stateify savable
type stringListElem struct {
	// str is the string that this elem represents.
	str string
	stringEntry
}
  1014  
// dentryListElem is a list element that refers back to a dentry. dentry embeds
// two of these by value (cacheEntry and syncableListEntry), and dentry.init
// points each one's d field at the owning dentry. The embedded dentryEntry is
// presumably the generated intrusive-list linkage (NOTE(review): its
// definition is not in this file; confirm).
//
// +stateify savable
type dentryListElem struct {
	// d is the dentry that this elem represents.
	d *dentry
	dentryEntry
}
  1021  
  1022  func (fs *filesystem) inoFromKey(key inoKey) uint64 {
  1023  	fs.inoMu.Lock()
  1024  	defer fs.inoMu.Unlock()
  1025  
  1026  	if ino, ok := fs.inoByKey[key]; ok {
  1027  		return ino
  1028  	}
  1029  	ino := fs.nextIno()
  1030  	fs.inoByKey[key] = ino
  1031  	return ino
  1032  }
  1033  
  1034  func (fs *filesystem) nextIno() uint64 {
  1035  	return fs.lastIno.Add(1)
  1036  }
  1037  
  1038  // init must be called before first use of d.
  1039  func (d *dentry) init(impl any) {
  1040  	d.pf.dentry = d
  1041  	d.cacheEntry.d = d
  1042  	d.syncableListEntry.d = d
  1043  	// Nested impl-inheritance pattern. In memory it looks like:
  1044  	// [[[ vfs.Dentry ] dentry ] dentryImpl ]
  1045  	// All 3 abstractions are allocated in one allocation. We achieve this by
  1046  	// making each outer dentry implementation hold the inner dentry by value.
  1047  	// Then the outer most dentry is allocated and we initialize fields inward.
  1048  	// Each inner dentry has a pointer to the next level of implementation.
  1049  	d.impl = impl
  1050  	d.vfsd.Init(d)
  1051  	refs.Register(d)
  1052  }
  1053  
  1054  func (d *dentry) isSynthetic() bool {
  1055  	return d.impl == nil
  1056  }
  1057  
  1058  func (d *dentry) cachedMetadataAuthoritative() bool {
  1059  	return d.fs.opts.interop != InteropModeShared || d.isSynthetic()
  1060  }
  1061  
  1062  // updateMetadataFromStatxLocked is called to update d's metadata after an update
  1063  // from the remote filesystem.
  1064  // Precondition: d.metadataMu must be locked.
  1065  // +checklocks:d.metadataMu
  1066  func (d *lisafsDentry) updateMetadataFromStatxLocked(stat *linux.Statx) {
  1067  	if stat.Mask&linux.STATX_TYPE != 0 {
  1068  		if got, want := stat.Mode&linux.FileTypeMask, d.fileType(); uint32(got) != want {
  1069  			panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got))
  1070  		}
  1071  	}
  1072  	if stat.Mask&linux.STATX_MODE != 0 {
  1073  		d.mode.Store(uint32(stat.Mode))
  1074  	}
  1075  	if stat.Mask&linux.STATX_UID != 0 {
  1076  		d.uid.Store(dentryUID(lisafs.UID(stat.UID)))
  1077  	}
  1078  	if stat.Mask&linux.STATX_GID != 0 {
  1079  		d.gid.Store(dentryGID(lisafs.GID(stat.GID)))
  1080  	}
  1081  	if stat.Blksize != 0 {
  1082  		d.blockSize.Store(stat.Blksize)
  1083  	}
  1084  	// Don't override newer client-defined timestamps with old server-defined
  1085  	// ones.
  1086  	if stat.Mask&linux.STATX_ATIME != 0 && d.atimeDirty.Load() == 0 {
  1087  		d.atime.Store(dentryTimestamp(stat.Atime))
  1088  	}
  1089  	if stat.Mask&linux.STATX_MTIME != 0 && d.mtimeDirty.Load() == 0 {
  1090  		d.mtime.Store(dentryTimestamp(stat.Mtime))
  1091  	}
  1092  	if stat.Mask&linux.STATX_CTIME != 0 {
  1093  		d.ctime.Store(dentryTimestamp(stat.Ctime))
  1094  	}
  1095  	if stat.Mask&linux.STATX_BTIME != 0 {
  1096  		d.btime.Store(dentryTimestamp(stat.Btime))
  1097  	}
  1098  	if stat.Mask&linux.STATX_NLINK != 0 {
  1099  		d.nlink.Store(stat.Nlink)
  1100  	}
  1101  	if stat.Mask&linux.STATX_SIZE != 0 {
  1102  		d.updateSizeLocked(stat.Size)
  1103  	}
  1104  }
  1105  
  1106  // updateMetadataFromStatLocked is similar to updateMetadataFromStatxLocked,
  1107  // except that it takes a unix.Stat_t argument.
  1108  // Precondition: d.metadataMu must be locked.
  1109  // +checklocks:d.metadataMu
  1110  func (d *directfsDentry) updateMetadataFromStatLocked(stat *unix.Stat_t) error {
  1111  	if got, want := stat.Mode&unix.S_IFMT, d.fileType(); got != want {
  1112  		panic(fmt.Sprintf("direct.dentry file type changed from %#o to %#o", want, got))
  1113  	}
  1114  	d.mode.Store(stat.Mode)
  1115  	d.uid.Store(stat.Uid)
  1116  	d.gid.Store(stat.Gid)
  1117  	d.blockSize.Store(uint32(stat.Blksize))
  1118  	// Don't override newer client-defined timestamps with old host-defined
  1119  	// ones.
  1120  	if d.atimeDirty.Load() == 0 {
  1121  		d.atime.Store(dentryTimestampFromUnix(stat.Atim))
  1122  	}
  1123  	if d.mtimeDirty.Load() == 0 {
  1124  		d.mtime.Store(dentryTimestampFromUnix(stat.Mtim))
  1125  	}
  1126  	d.ctime.Store(dentryTimestampFromUnix(stat.Ctim))
  1127  	d.nlink.Store(uint32(stat.Nlink))
  1128  	d.updateSizeLocked(uint64(stat.Size))
  1129  	return nil
  1130  }
  1131  
  1132  // Preconditions: !d.isSynthetic().
  1133  // Preconditions: d.metadataMu is locked.
  1134  // +checklocks:d.metadataMu
  1135  func (d *dentry) refreshSizeLocked(ctx context.Context) error {
  1136  	d.handleMu.RLock()
  1137  
  1138  	// Can use RacyLoad() because handleMu is locked.
  1139  	if d.writeFD.RacyLoad() < 0 {
  1140  		d.handleMu.RUnlock()
  1141  		// Use a suitable FD if we don't have a writable host FD.
  1142  		return d.updateMetadataLocked(ctx, noHandle)
  1143  	}
  1144  
  1145  	// Using statx(2) with a minimal mask is faster than fstat(2).
  1146  	var stat unix.Statx_t
  1147  	// Can use RacyLoad() because handleMu is locked.
  1148  	err := unix.Statx(int(d.writeFD.RacyLoad()), "", unix.AT_EMPTY_PATH, unix.STATX_SIZE, &stat)
  1149  	d.handleMu.RUnlock() // must be released before updateSizeLocked()
  1150  	if err != nil {
  1151  		return err
  1152  	}
  1153  	d.updateSizeLocked(stat.Size)
  1154  	return nil
  1155  }
  1156  
  1157  // Preconditions: !d.isSynthetic().
  1158  func (d *dentry) updateMetadata(ctx context.Context) error {
  1159  	// d.metadataMu must be locked *before* we stat so that we do not end up
  1160  	// updating stale attributes in d.updateMetadataFromStatLocked().
  1161  	d.metadataMu.Lock()
  1162  	defer d.metadataMu.Unlock()
  1163  	return d.updateMetadataLocked(ctx, noHandle)
  1164  }
  1165  
  1166  func (d *dentry) fileType() uint32 {
  1167  	return d.mode.Load() & linux.S_IFMT
  1168  }
  1169  
  1170  func (d *dentry) statTo(stat *linux.Statx) {
  1171  	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME
  1172  	stat.Blksize = d.blockSize.Load()
  1173  	stat.Nlink = d.nlink.Load()
  1174  	if stat.Nlink == 0 {
  1175  		// The remote filesystem doesn't support link count; just make
  1176  		// something up. This is consistent with Linux, where
  1177  		// fs/inode.c:inode_init_always() initializes link count to 1, and
  1178  		// fs/9p/vfs_inode_dotl.c:v9fs_stat2inode_dotl() doesn't touch it if
  1179  		// it's not provided by the remote filesystem.
  1180  		stat.Nlink = 1
  1181  	}
  1182  	stat.UID = d.uid.Load()
  1183  	stat.GID = d.gid.Load()
  1184  	stat.Mode = uint16(d.mode.Load())
  1185  	stat.Ino = uint64(d.ino)
  1186  	stat.Size = d.size.Load()
  1187  	// This is consistent with regularFileFD.Seek(), which treats regular files
  1188  	// as having no holes.
  1189  	stat.Blocks = (stat.Size + 511) / 512
  1190  	stat.Atime = linux.NsecToStatxTimestamp(d.atime.Load())
  1191  	stat.Btime = linux.NsecToStatxTimestamp(d.btime.Load())
  1192  	stat.Ctime = linux.NsecToStatxTimestamp(d.ctime.Load())
  1193  	stat.Mtime = linux.NsecToStatxTimestamp(d.mtime.Load())
  1194  	stat.DevMajor = linux.UNNAMED_MAJOR
  1195  	stat.DevMinor = d.fs.devMinor
  1196  }
  1197  
  1198  // Precondition: fs.renameMu is locked.
  1199  func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions, mnt *vfs.Mount) error {
  1200  	stat := &opts.Stat
  1201  	if stat.Mask == 0 {
  1202  		return nil
  1203  	}
  1204  	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 {
  1205  		return linuxerr.EPERM
  1206  	}
  1207  	mode := linux.FileMode(d.mode.Load())
  1208  	if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load())); err != nil {
  1209  		return err
  1210  	}
  1211  	if err := mnt.CheckBeginWrite(); err != nil {
  1212  		return err
  1213  	}
  1214  	defer mnt.EndWrite()
  1215  
  1216  	if stat.Mask&linux.STATX_SIZE != 0 {
  1217  		// Reject attempts to truncate files other than regular files, since
  1218  		// filesystem implementations may return the wrong errno.
  1219  		switch mode.FileType() {
  1220  		case linux.S_IFREG:
  1221  			// ok
  1222  		case linux.S_IFDIR:
  1223  			return linuxerr.EISDIR
  1224  		default:
  1225  			return linuxerr.EINVAL
  1226  		}
  1227  	}
  1228  
  1229  	var now int64
  1230  	if d.cachedMetadataAuthoritative() {
  1231  		// Truncate updates mtime.
  1232  		if stat.Mask&(linux.STATX_SIZE|linux.STATX_MTIME) == linux.STATX_SIZE {
  1233  			stat.Mask |= linux.STATX_MTIME
  1234  			stat.Mtime = linux.StatxTimestamp{
  1235  				Nsec: linux.UTIME_NOW,
  1236  			}
  1237  		}
  1238  
  1239  		// Use client clocks for timestamps.
  1240  		now = d.fs.clock.Now().Nanoseconds()
  1241  		if stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec == linux.UTIME_NOW {
  1242  			stat.Atime = linux.NsecToStatxTimestamp(now)
  1243  		}
  1244  		if stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec == linux.UTIME_NOW {
  1245  			stat.Mtime = linux.NsecToStatxTimestamp(now)
  1246  		}
  1247  	}
  1248  
  1249  	d.metadataMu.Lock()
  1250  	defer d.metadataMu.Unlock()
  1251  
  1252  	// As with Linux, if the UID, GID, or file size is changing, we have to
  1253  	// clear permission bits. Note that when set, clearSGID may cause
  1254  	// permissions to be updated.
  1255  	clearSGID := (stat.Mask&linux.STATX_UID != 0 && stat.UID != d.uid.Load()) ||
  1256  		(stat.Mask&linux.STATX_GID != 0 && stat.GID != d.gid.Load()) ||
  1257  		stat.Mask&linux.STATX_SIZE != 0
  1258  	if clearSGID {
  1259  		if stat.Mask&linux.STATX_MODE != 0 {
  1260  			stat.Mode = uint16(vfs.ClearSUIDAndSGID(uint32(stat.Mode)))
  1261  		} else {
  1262  			oldMode := d.mode.Load()
  1263  			if updatedMode := vfs.ClearSUIDAndSGID(oldMode); updatedMode != oldMode {
  1264  				stat.Mode = uint16(updatedMode)
  1265  				stat.Mask |= linux.STATX_MODE
  1266  			}
  1267  		}
  1268  	}
  1269  
  1270  	// failureMask indicates which attributes could not be set on the remote
  1271  	// filesystem. p9 returns an error if any of the attributes could not be set
  1272  	// but that leads to inconsistency as the server could have set a few
  1273  	// attributes successfully but a later failure will cause the successful ones
  1274  	// to not be updated in the dentry cache.
  1275  	var failureMask uint32
  1276  	var failureErr error
  1277  	if !d.isSynthetic() {
  1278  		if stat.Mask != 0 {
  1279  			if err := d.prepareSetStat(ctx, stat); err != nil {
  1280  				return err
  1281  			}
  1282  			d.handleMu.RLock()
  1283  			if stat.Mask&linux.STATX_SIZE != 0 {
  1284  				// d.dataMu must be held around the update to both the remote
  1285  				// file's size and d.size to serialize with writeback (which
  1286  				// might otherwise write data back up to the old d.size after
  1287  				// the remote file has been truncated).
  1288  				d.dataMu.Lock()
  1289  			}
  1290  			var err error
  1291  			failureMask, failureErr, err = d.setStatLocked(ctx, stat)
  1292  			d.handleMu.RUnlock()
  1293  			if err != nil {
  1294  				if stat.Mask&linux.STATX_SIZE != 0 {
  1295  					d.dataMu.Unlock() // +checklocksforce: locked conditionally above
  1296  				}
  1297  				return err
  1298  			}
  1299  			if stat.Mask&linux.STATX_SIZE != 0 {
  1300  				if failureMask&linux.STATX_SIZE == 0 {
  1301  					// d.size should be kept up to date, and privatized
  1302  					// copy-on-write mappings of truncated pages need to be
  1303  					// invalidated, even if InteropModeShared is in effect.
  1304  					d.updateSizeAndUnlockDataMuLocked(stat.Size) // +checklocksforce: locked conditionally above
  1305  				} else {
  1306  					d.dataMu.Unlock() // +checklocksforce: locked conditionally above
  1307  				}
  1308  			}
  1309  		}
  1310  		if d.fs.opts.interop == InteropModeShared {
  1311  			// There's no point to updating d's metadata in this case since
  1312  			// it'll be overwritten by revalidation before the next time it's
  1313  			// used anyway. (InteropModeShared inhibits client caching of
  1314  			// regular file data, so there's no cache to truncate either.)
  1315  			return nil
  1316  		}
  1317  	}
  1318  	if stat.Mask&linux.STATX_MODE != 0 && failureMask&linux.STATX_MODE == 0 {
  1319  		d.mode.Store(d.fileType() | uint32(stat.Mode))
  1320  	}
  1321  	if stat.Mask&linux.STATX_UID != 0 && failureMask&linux.STATX_UID == 0 {
  1322  		d.uid.Store(stat.UID)
  1323  	}
  1324  	if stat.Mask&linux.STATX_GID != 0 && failureMask&linux.STATX_GID == 0 {
  1325  		d.gid.Store(stat.GID)
  1326  	}
  1327  	// Note that stat.Atime.Nsec and stat.Mtime.Nsec can't be UTIME_NOW because
  1328  	// if d.cachedMetadataAuthoritative() then we converted stat.Atime and
  1329  	// stat.Mtime to client-local timestamps above, and if
  1330  	// !d.cachedMetadataAuthoritative() then we returned after calling
  1331  	// d.file.setAttr(). For the same reason, now must have been initialized.
  1332  	if stat.Mask&linux.STATX_ATIME != 0 && failureMask&linux.STATX_ATIME == 0 {
  1333  		d.atime.Store(stat.Atime.ToNsec())
  1334  		d.atimeDirty.Store(0)
  1335  	}
  1336  	if stat.Mask&linux.STATX_MTIME != 0 && failureMask&linux.STATX_MTIME == 0 {
  1337  		d.mtime.Store(stat.Mtime.ToNsec())
  1338  		d.mtimeDirty.Store(0)
  1339  	}
  1340  	d.ctime.Store(now)
  1341  	if failureMask != 0 {
  1342  		// Setting some attribute failed on the remote filesystem.
  1343  		return failureErr
  1344  	}
  1345  	return nil
  1346  }
  1347  
  1348  // doAllocate performs an allocate operation on d. Note that d.metadataMu will
  1349  // be held when allocate is called.
  1350  func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate func() error) error {
  1351  	d.metadataMu.Lock()
  1352  	defer d.metadataMu.Unlock()
  1353  
  1354  	// Allocating a smaller size is a noop.
  1355  	size := offset + length
  1356  	if d.cachedMetadataAuthoritative() && size <= d.size.RacyLoad() {
  1357  		return nil
  1358  	}
  1359  
  1360  	err := allocate()
  1361  	if err != nil {
  1362  		return err
  1363  	}
  1364  	d.updateSizeLocked(size)
  1365  	if d.cachedMetadataAuthoritative() {
  1366  		d.touchCMtimeLocked()
  1367  	}
  1368  	return nil
  1369  }
  1370  
// updateSizeLocked sets d.size to newSize. It acquires d.dataMu and hands it
// to updateSizeAndUnlockDataMuLocked, which performs the update and releases
// d.dataMu again; d.dataMu is therefore not held when this function returns.
//
// Preconditions: d.metadataMu must be locked.
func (d *dentry) updateSizeLocked(newSize uint64) {
	d.dataMu.Lock()
	d.updateSizeAndUnlockDataMuLocked(newSize)
}
  1376  
  1377  // Preconditions: d.metadataMu and d.dataMu must be locked.
  1378  //
  1379  // Postconditions: d.dataMu is unlocked.
  1380  // +checklocksrelease:d.dataMu
  1381  func (d *dentry) updateSizeAndUnlockDataMuLocked(newSize uint64) {
  1382  	oldSize := d.size.RacyLoad()
  1383  	d.size.Store(newSize)
  1384  	// d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings
  1385  	// below. This allows concurrent calls to Read/Translate/etc. These
  1386  	// functions synchronize with truncation by refusing to use cache
  1387  	// contents beyond the new d.size. (We are still holding d.metadataMu,
  1388  	// so we can't race with Write or another truncate.)
  1389  	d.dataMu.Unlock()
  1390  	if newSize < oldSize {
  1391  		oldpgend, _ := hostarch.PageRoundUp(oldSize)
  1392  		newpgend, _ := hostarch.PageRoundUp(newSize)
  1393  		if oldpgend != newpgend {
  1394  			d.mapsMu.Lock()
  1395  			d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
  1396  				// Compare Linux's mm/truncate.c:truncate_setsize() =>
  1397  				// truncate_pagecache() =>
  1398  				// mm/memory.c:unmap_mapping_range(evencows=1).
  1399  				InvalidatePrivate: true,
  1400  			})
  1401  			d.mapsMu.Unlock()
  1402  		}
  1403  		// We are now guaranteed that there are no translations of
  1404  		// truncated pages, and can remove them from the cache. Since
  1405  		// truncated pages have been removed from the remote file, they
  1406  		// should be dropped without being written back.
  1407  		d.dataMu.Lock()
  1408  		d.cache.Truncate(newSize, d.fs.mf)
  1409  		d.dirty.KeepClean(memmap.MappableRange{newSize, oldpgend})
  1410  		d.dataMu.Unlock()
  1411  	}
  1412  }
  1413  
  1414  func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
  1415  	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(d.mode.Load()), auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load()))
  1416  }
  1417  
  1418  func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
  1419  	// Deny access to the "system" namespaces since applications
  1420  	// may expect these to affect kernel behavior in unimplemented ways
  1421  	// (b/148380782). Allow all other extended attributes to be passed through
  1422  	// to the remote filesystem. This is inconsistent with Linux's 9p client,
  1423  	// but consistent with other filesystems (e.g. FUSE).
  1424  	//
  1425  	// NOTE(b/202533394): Also disallow "trusted" namespace for now. This is
  1426  	// consistent with the VFS1 gofer client.
  1427  	if strings.HasPrefix(name, linux.XATTR_SYSTEM_PREFIX) || strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) {
  1428  		return linuxerr.EOPNOTSUPP
  1429  	}
  1430  	mode := linux.FileMode(d.mode.Load())
  1431  	kuid := auth.KUID(d.uid.Load())
  1432  	kgid := auth.KGID(d.gid.Load())
  1433  	if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
  1434  		return err
  1435  	}
  1436  	return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
  1437  }
  1438  
  1439  func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error {
  1440  	return vfs.CheckDeleteSticky(
  1441  		creds,
  1442  		linux.FileMode(d.mode.Load()),
  1443  		auth.KUID(d.uid.Load()),
  1444  		auth.KUID(child.uid.Load()),
  1445  		auth.KGID(child.gid.Load()),
  1446  	)
  1447  }
  1448  
  1449  func dentryUID(uid lisafs.UID) uint32 {
  1450  	if !uid.Ok() {
  1451  		return uint32(auth.OverflowUID)
  1452  	}
  1453  	return uint32(uid)
  1454  }
  1455  
  1456  func dentryGID(gid lisafs.GID) uint32 {
  1457  	if !gid.Ok() {
  1458  		return uint32(auth.OverflowGID)
  1459  	}
  1460  	return uint32(gid)
  1461  }
  1462  
  1463  // IncRef implements vfs.DentryImpl.IncRef.
  1464  func (d *dentry) IncRef() {
  1465  	// d.refs may be 0 if d.fs.renameMu is locked, which serializes against
  1466  	// d.checkCachingLocked().
  1467  	r := d.refs.Add(1)
  1468  	if d.LogRefs() {
  1469  		refs.LogIncRef(d, r)
  1470  	}
  1471  }
  1472  
  1473  // TryIncRef implements vfs.DentryImpl.TryIncRef.
  1474  func (d *dentry) TryIncRef() bool {
  1475  	for {
  1476  		r := d.refs.Load()
  1477  		if r <= 0 {
  1478  			return false
  1479  		}
  1480  		if d.refs.CompareAndSwap(r, r+1) {
  1481  			if d.LogRefs() {
  1482  				refs.LogTryIncRef(d, r+1)
  1483  			}
  1484  			return true
  1485  		}
  1486  	}
  1487  }
  1488  
  1489  // DecRef implements vfs.DentryImpl.DecRef.
  1490  func (d *dentry) DecRef(ctx context.Context) {
  1491  	if d.decRefNoCaching() == 0 {
  1492  		d.checkCachingLocked(ctx, false /* renameMuWriteLocked */)
  1493  	}
  1494  }
  1495  
  1496  // decRefNoCaching decrements d's reference count without calling
  1497  // d.checkCachingLocked, even if d's reference count reaches 0; callers are
  1498  // responsible for ensuring that d.checkCachingLocked will be called later.
  1499  func (d *dentry) decRefNoCaching() int64 {
  1500  	r := d.refs.Add(-1)
  1501  	if d.LogRefs() {
  1502  		refs.LogDecRef(d, r)
  1503  	}
  1504  	if r < 0 {
  1505  		panic("gofer.dentry.decRefNoCaching() called without holding a reference")
  1506  	}
  1507  	return r
  1508  }
  1509  
  1510  // RefType implements refs.CheckedObject.Type.
  1511  func (d *dentry) RefType() string {
  1512  	return "gofer.dentry"
  1513  }
  1514  
  1515  // LeakMessage implements refs.CheckedObject.LeakMessage.
  1516  func (d *dentry) LeakMessage() string {
  1517  	return fmt.Sprintf("[gofer.dentry %p] reference count of %d instead of -1", d, d.refs.Load())
  1518  }
  1519  
  1520  // LogRefs implements refs.CheckedObject.LogRefs.
  1521  //
  1522  // This should only be set to true for debugging purposes, as it can generate an
  1523  // extremely large amount of output and drastically degrade performance.
  1524  func (d *dentry) LogRefs() bool {
  1525  	return false
  1526  }
  1527  
  1528  // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
  1529  func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
  1530  	if d.isDir() {
  1531  		events |= linux.IN_ISDIR
  1532  	}
  1533  
  1534  	d.fs.renameMu.RLock()
  1535  	// The ordering below is important, Linux always notifies the parent first.
  1536  	if parent := d.parent.Load(); parent != nil {
  1537  		parent.watches.Notify(ctx, d.name, events, cookie, et, d.isDeleted())
  1538  	}
  1539  	d.watches.Notify(ctx, "", events, cookie, et, d.isDeleted())
  1540  	d.fs.renameMu.RUnlock()
  1541  }
  1542  
  1543  // Watches implements vfs.DentryImpl.Watches.
  1544  func (d *dentry) Watches() *vfs.Watches {
  1545  	return &d.watches
  1546  }
  1547  
  1548  // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
  1549  //
  1550  // If no watches are left on this dentry and it has no references, cache it.
  1551  func (d *dentry) OnZeroWatches(ctx context.Context) {
  1552  	d.checkCachingLocked(ctx, false /* renameMuWriteLocked */)
  1553  }
  1554  
// checkCachingLocked should be called after d's reference count becomes 0 or
// it becomes disowned. Depending on d's state it does one of the following:
//
//   - nothing, if d has already been destroyed (refs == -1);
//   - removes d from the LRU cache, if d has references or inotify watches;
//   - destroys d, if d is dead (deleted or invalidated) or the filesystem has
//     been released;
//   - evicts d, if d.vfsd.IsEvictable();
//   - otherwise inserts d at (or moves d to) the front of the LRU cache and,
//     if the cache is now over-full, evicts the least recently used dentry.
//
// For performance, checkCachingLocked can also be called after d's reference
// count becomes non-zero, so that d can be removed from the LRU cache. This
// may help in reducing the size of the cache and hence reduce evictions. Note
// that this is not necessary for correctness.
//
// It may be called on a destroyed dentry. For example,
// renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times
// for the same dentry when the dentry is visited more than once in the same
// operation. One of the calls may destroy the dentry, so subsequent calls will
// do nothing.
//
// Preconditions: d.fs.renameMu must be locked for writing if
// renameMuWriteLocked is true; it may be temporarily unlocked.
func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked bool) {
	d.cachingMu.Lock()
	refs := d.refs.Load()
	if refs == -1 {
		// Dentry has already been destroyed.
		d.cachingMu.Unlock()
		return
	}
	if refs > 0 {
		// fs.dentryCache.dentries is permitted to contain dentries with non-zero
		// refs, which are skipped by fs.evictCachedDentryLocked() upon reaching
		// the end of the LRU. But it is still beneficial to remove d from the
		// cache as we are already holding d.cachingMu. Keeping a cleaner cache
		// also reduces the number of evictions (which is expensive as it acquires
		// fs.renameMu).
		d.removeFromCacheLocked()
		d.cachingMu.Unlock()
		return
	}
	// Deleted and invalidated dentries with zero references are no longer
	// reachable by path resolution and should be dropped immediately.
	if d.vfsd.IsDead() {
		d.removeFromCacheLocked()
		d.cachingMu.Unlock()
		if !renameMuWriteLocked {
			// Need to lock d.fs.renameMu for writing as needed by d.destroyLocked().
			d.fs.renameMu.Lock()
			defer d.fs.renameMu.Unlock()
			// Now that renameMu is locked for writing, no more refs can be taken on
			// d because path resolution requires renameMu for reading at least.
			if d.refs.Load() != 0 {
				// Destroy d only if its ref is still 0. If not, either someone took a
				// ref on it or it got destroyed before fs.renameMu could be acquired.
				return
			}
		}
		// Let watchers know the file is gone before tearing the dentry down.
		if d.isDeleted() {
			d.watches.HandleDeletion(ctx)
		}
		d.destroyLocked(ctx) // +checklocksforce: renameMu must be acquired at this point.
		return
	}
	if d.vfsd.IsEvictable() {
		d.cachingMu.Unlock()
		// Attempt to evict. Use the variant matching whether the caller already
		// holds renameMu for writing; d.evict() acquires it itself.
		if renameMuWriteLocked {
			d.evictLocked(ctx) // +checklocksforce: renameMu is locked in this case.
			return
		}
		d.evict(ctx)
		return
	}
	// If d still has inotify watches and it is not deleted or invalidated, it
	// can't be evicted. Otherwise, we will lose its watches, even if a new
	// dentry is created for the same file in the future. Note that the size of
	// d.watches cannot concurrently transition from zero to non-zero, because
	// adding a watch requires holding a reference on d.
	if d.watches.Size() > 0 {
		// As in the refs > 0 case, removing d is beneficial.
		d.removeFromCacheLocked()
		d.cachingMu.Unlock()
		return
	}

	// The filesystem has been released: there is no point in caching d, so
	// unlink it from its parent's children map and destroy it right away.
	if d.fs.released.Load() != 0 {
		d.cachingMu.Unlock()
		if !renameMuWriteLocked {
			// Need to lock d.fs.renameMu to access d.parent. Lock it for writing as
			// needed by d.destroyLocked() later.
			d.fs.renameMu.Lock()
			defer d.fs.renameMu.Unlock()
		}
		if parent := d.parent.Load(); parent != nil {
			parent.childrenMu.Lock()
			delete(parent.children, d.name)
			parent.childrenMu.Unlock()
		}
		d.destroyLocked(ctx) // +checklocksforce: see above.
		return
	}

	d.fs.dentryCache.mu.Lock()
	// If d is already cached, just move it to the front of the LRU.
	if d.cached {
		d.fs.dentryCache.dentries.Remove(&d.cacheEntry)
		d.fs.dentryCache.dentries.PushFront(&d.cacheEntry)
		d.fs.dentryCache.mu.Unlock()
		d.cachingMu.Unlock()
		return
	}
	// Cache the dentry, then evict the least recently used cached dentry if
	// the cache becomes over-full.
	d.fs.dentryCache.dentries.PushFront(&d.cacheEntry)
	d.fs.dentryCache.dentriesLen++
	d.cached = true
	shouldEvict := d.fs.dentryCache.dentriesLen > d.fs.dentryCache.maxCachedDentries
	d.fs.dentryCache.mu.Unlock()
	d.cachingMu.Unlock()

	// The eviction itself happens after both dentryCache.mu and d.cachingMu
	// have been released.
	if shouldEvict {
		if !renameMuWriteLocked {
			// Need to lock d.fs.renameMu for writing as needed by
			// d.evictCachedDentryLocked().
			d.fs.renameMu.Lock()
			defer d.fs.renameMu.Unlock()
		}
		d.fs.evictCachedDentryLocked(ctx) // +checklocksforce: see above.
	}
}
  1680  
  1681  // Preconditions: d.cachingMu must be locked.
  1682  func (d *dentry) removeFromCacheLocked() {
  1683  	if d.cached {
  1684  		d.fs.dentryCache.mu.Lock()
  1685  		d.fs.dentryCache.dentries.Remove(&d.cacheEntry)
  1686  		d.fs.dentryCache.dentriesLen--
  1687  		d.fs.dentryCache.mu.Unlock()
  1688  		d.cached = false
  1689  	}
  1690  }
  1691  
  1692  // Precondition: fs.renameMu must be locked for writing; it may be temporarily
  1693  // unlocked.
  1694  // +checklocks:fs.renameMu
  1695  func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) {
  1696  	for fs.dentryCache.dentriesLen != 0 {
  1697  		fs.evictCachedDentryLocked(ctx)
  1698  	}
  1699  }
  1700  
  1701  // Preconditions:
  1702  //   - fs.renameMu must be locked for writing; it may be temporarily unlocked.
  1703  //
  1704  // +checklocks:fs.renameMu
  1705  func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) {
  1706  	fs.dentryCache.mu.Lock()
  1707  	victim := fs.dentryCache.dentries.Back()
  1708  	fs.dentryCache.mu.Unlock()
  1709  	if victim == nil {
  1710  		// fs.dentryCache.dentries may have become empty between when it was
  1711  		// checked and when we locked fs.dentryCache.mu.
  1712  		return
  1713  	}
  1714  
  1715  	if victim.d.fs == fs {
  1716  		victim.d.evictLocked(ctx) // +checklocksforce: owned as precondition, victim.fs == fs
  1717  		return
  1718  	}
  1719  
  1720  	// The dentry cache is shared between all gofer filesystems and the victim is
  1721  	// from another filesystem. Have that filesystem do the work. We unlock
  1722  	// fs.renameMu to prevent deadlock: two filesystems could otherwise wait on
  1723  	// each others' renameMu.
  1724  	fs.renameMu.Unlock()
  1725  	defer fs.renameMu.Lock()
  1726  	victim.d.evict(ctx)
  1727  }
  1728  
// evict acquires d.fs.renameMu for writing, evicts d via d.evictLocked, and
// releases the lock again when done.
//
// Preconditions:
//   - d.fs.renameMu must not be locked for writing.
func (d *dentry) evict(ctx context.Context) {
	d.fs.renameMu.Lock()
	defer d.fs.renameMu.Unlock()
	d.evictLocked(ctx)
}
  1736  
// evictLocked removes d from the LRU cache and, provided d has neither
// references nor inotify watches, invalidates it in VFS, unlinks it from its
// parent's children map, and destroys it.
//
// Preconditions:
//   - d.fs.renameMu must be locked for writing; it may be temporarily unlocked.
//
// +checklocks:d.fs.renameMu
func (d *dentry) evictLocked(ctx context.Context) {
	d.cachingMu.Lock()
	d.removeFromCacheLocked()
	// d.refs or d.watches.Size() may have become non-zero from an earlier path
	// resolution since it was inserted into fs.dentryCache.dentries.
	if d.refs.Load() != 0 || d.watches.Size() != 0 {
		d.cachingMu.Unlock()
		return
	}
	if parent := d.parent.Load(); parent != nil {
		parent.opMu.Lock()
		if !d.vfsd.IsDead() {
			// Note that d can't be a mount point (in any mount namespace), since VFS
			// holds references on mount points.
			rcs := d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &d.vfsd)
			for _, rc := range rcs {
				rc.DecRef(ctx)
			}

			parent.childrenMu.Lock()
			delete(parent.children, d.name)
			parent.childrenMu.Unlock()

			// We're only deleting the dentry, not the file it
			// represents, so we don't need to update
			// victim parent.dirents etc.
		}
		parent.opMu.Unlock()
	}
	// Safe to unlock cachingMu now that d.vfsd.IsDead(). Henceforth any
	// concurrent caching attempts on d will attempt to destroy it and so will
	// try to acquire fs.renameMu (which we have already acquired). Hence,
	// fs.renameMu will synchronize the destroy attempts.
	d.cachingMu.Unlock()
	d.destroyLocked(ctx) // +checklocksforce: owned as precondition.
}
  1777  
  1778  // destroyDisconnected destroys an uncached, unparented dentry. There are no
  1779  // locking preconditions.
  1780  func (d *dentry) destroyDisconnected(ctx context.Context) {
  1781  	mf := d.fs.mf
  1782  
  1783  	d.handleMu.Lock()
  1784  	d.dataMu.Lock()
  1785  
  1786  	if d.isWriteHandleOk() {
  1787  		// Write dirty pages back to the remote filesystem.
  1788  		h := d.writeHandle()
  1789  		if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil {
  1790  			log.Warningf("gofer.dentry.destroyLocked: failed to write dirty data back: %v", err)
  1791  		}
  1792  	}
  1793  	// Discard cached data.
  1794  	if !d.cache.IsEmpty() {
  1795  		mf.MarkAllUnevictable(d)
  1796  		d.cache.DropAll(mf)
  1797  		d.dirty.RemoveAll()
  1798  	}
  1799  	d.dataMu.Unlock()
  1800  
  1801  	// Close any resources held by the implementation.
  1802  	d.destroyImpl(ctx)
  1803  
  1804  	// Can use RacyLoad() because handleMu is locked.
  1805  	if d.readFD.RacyLoad() >= 0 {
  1806  		_ = unix.Close(int(d.readFD.RacyLoad()))
  1807  	}
  1808  	if d.writeFD.RacyLoad() >= 0 && d.readFD.RacyLoad() != d.writeFD.RacyLoad() {
  1809  		_ = unix.Close(int(d.writeFD.RacyLoad()))
  1810  	}
  1811  	d.readFD = atomicbitops.FromInt32(-1)
  1812  	d.writeFD = atomicbitops.FromInt32(-1)
  1813  	d.mmapFD = atomicbitops.FromInt32(-1)
  1814  	d.handleMu.Unlock()
  1815  
  1816  	if !d.isSynthetic() {
  1817  		// Note that it's possible that d.atimeDirty or d.mtimeDirty are true,
  1818  		// i.e. client and server timestamps may differ (because e.g. a client
  1819  		// write was serviced by the page cache, and only written back to the
  1820  		// remote file later). Ideally, we'd write client timestamps back to
  1821  		// the remote filesystem so that timestamps for a new dentry
  1822  		// instantiated for the same file would remain coherent. Unfortunately,
  1823  		// this turns out to be too expensive in many cases, so for now we
  1824  		// don't do this.
  1825  
  1826  		// Remove d from the set of syncable dentries.
  1827  		d.fs.syncMu.Lock()
  1828  		d.fs.syncableDentries.Remove(&d.syncableListEntry)
  1829  		d.fs.syncMu.Unlock()
  1830  	}
  1831  
  1832  	// Drop references and stop tracking this child.
  1833  	d.refs.Store(-1)
  1834  	refs.Unregister(d)
  1835  }
  1836  
// destroyLocked destroys the dentry and then drops the reference that d held
// on its parent, re-running the caching check on the parent if that was the
// parent's last reference.
//
// It panics if d has already been destroyed (refs == -1) or still has
// references (refs > 0).
//
// Preconditions:
//   - d.fs.renameMu must be locked for writing; it may be temporarily unlocked.
//   - d.refs == 0.
//   - d.parent.children[d.name] != d, i.e. d is not reachable by path traversal
//     from its former parent dentry.
//
// +checklocks:d.fs.renameMu
func (d *dentry) destroyLocked(ctx context.Context) {
	switch d.refs.Load() {
	case 0:
		// Mark the dentry destroyed.
		d.refs.Store(-1)
	case -1:
		panic("dentry.destroyLocked() called on already destroyed dentry")
	default:
		panic("dentry.destroyLocked() called with references on the dentry")
	}

	// Allow the following to proceed without renameMu locked to improve
	// scalability.
	d.fs.renameMu.Unlock()

	// No locks need to be held during destroyDisconnected.
	d.destroyDisconnected(ctx)

	// Re-acquire renameMu so that the caller's locking expectations still hold
	// on return.
	d.fs.renameMu.Lock()

	// Drop the reference held by d on its parent without recursively locking
	// d.fs.renameMu.

	if parent := d.parent.Load(); parent != nil && parent.decRefNoCaching() == 0 {
		parent.checkCachingLocked(ctx, true /* renameMuWriteLocked */)
	}
}
  1873  
  1874  func (d *dentry) isDeleted() bool {
  1875  	return d.deleted.Load() != 0
  1876  }
  1877  
  1878  func (d *dentry) setDeleted() {
  1879  	d.deleted.Store(1)
  1880  }
  1881  
  1882  func (d *dentry) listXattr(ctx context.Context, size uint64) ([]string, error) {
  1883  	if d.isSynthetic() {
  1884  		return nil, nil
  1885  	}
  1886  
  1887  	return d.listXattrImpl(ctx, size)
  1888  }
  1889  
  1890  func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
  1891  	if d.isSynthetic() {
  1892  		return "", linuxerr.ENODATA
  1893  	}
  1894  	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
  1895  		return "", err
  1896  	}
  1897  	return d.getXattrImpl(ctx, opts)
  1898  }
  1899  
  1900  func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
  1901  	if d.isSynthetic() {
  1902  		return linuxerr.EPERM
  1903  	}
  1904  	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
  1905  		return err
  1906  	}
  1907  	return d.setXattrImpl(ctx, opts)
  1908  }
  1909  
  1910  func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error {
  1911  	if d.isSynthetic() {
  1912  		return linuxerr.EPERM
  1913  	}
  1914  	if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
  1915  		return err
  1916  	}
  1917  	return d.removeXattrImpl(ctx, name)
  1918  }
  1919  
// ensureSharedHandle makes sure that d's shared handle(s) are usable for the
// requested access: reading if read is true, writing if write is true. If
// trunc is true a new handle is always opened, with trunc passed through to
// d.openHandle. When a new handle is opened, d.readFD, d.writeFD and d.mmapFD
// are updated as needed, and existing memory mappings are invalidated when
// they may refer to an FD (or to the internal page cache) that is no longer
// the right backing for them.
//
// It returns any error from d.openHandle, from regenerating sentry mappings,
// or from dup3; in those cases d's existing handles are left in place.
//
// Preconditions:
//   - !d.isSynthetic().
//   - d.isRegularFile() || d.isDir().
//   - fs.renameMu is locked.
func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error {
	// O_TRUNC unconditionally requires us to obtain a new handle (opened with
	// O_TRUNC).
	if !trunc {
		// Fast path: check under the read lock whether the handles we already
		// have are enough.
		d.handleMu.RLock()
		canReuseCurHandle := (!read || d.isReadHandleOk()) && (!write || d.isWriteHandleOk())
		d.handleMu.RUnlock()
		if canReuseCurHandle {
			// Current handles are sufficient.
			return nil
		}
	}

	// Slow path: take the write lock and re-check, since another caller may
	// have installed suitable handles in the meantime.
	d.handleMu.Lock()
	needNewHandle := (read && !d.isReadHandleOk()) || (write && !d.isWriteHandleOk()) || trunc
	if !needNewHandle {
		d.handleMu.Unlock()
		return nil
	}

	// FDs collected here are closed only after d.handleMu has been released.
	var fdsToCloseArr [2]int32
	fdsToClose := fdsToCloseArr[:0]
	invalidateTranslations := false
	// Get a new handle. If this file has been opened for both reading and
	// writing, try to get a single handle that is usable for both:
	//
	//	- Writable memory mappings of a host FD require that the host FD is
	//		opened for both reading and writing.
	//
	//	- NOTE(b/141991141): Some filesystems may not ensure coherence
	//		between multiple handles for the same file.
	openReadable := d.isReadHandleOk() || read
	openWritable := d.isWriteHandleOk() || write
	h, err := d.openHandle(ctx, openReadable, openWritable, trunc)
	if linuxerr.Equals(linuxerr.EACCES, err) && (openReadable != read || openWritable != write) {
		// It may not be possible to use a single handle for both
		// reading and writing, since permissions on the file may have
		// changed to e.g. disallow reading after previously being
		// opened for reading. In this case, we have no choice but to
		// use separate handles for reading and writing.
		ctx.Debugf("gofer.dentry.ensureSharedHandle: bifurcating read/write handles for dentry %p", d)
		openReadable = read
		openWritable = write
		h, err = d.openHandle(ctx, openReadable, openWritable, trunc)
	}
	if err != nil {
		d.handleMu.Unlock()
		return err
	}

	// Update d.readFD and d.writeFD
	if h.fd >= 0 {
		if openReadable && openWritable && (d.readFD.RacyLoad() < 0 || d.writeFD.RacyLoad() < 0 || d.readFD.RacyLoad() != d.writeFD.RacyLoad()) {
			// Replace existing FDs with this one.
			if d.readFD.RacyLoad() >= 0 {
				// We already have a readable FD that may be in use by
				// concurrent callers of d.pf.FD().
				if d.fs.opts.overlayfsStaleRead {
					// If overlayfsStaleRead is in effect, then the new FD
					// may not be coherent with the existing one, so we
					// have no choice but to switch to mappings of the new
					// FD in both the application and sentry.
					if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil {
						d.handleMu.Unlock()
						ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err)
						h.close(ctx)
						return err
					}
					fdsToClose = append(fdsToClose, d.readFD.RacyLoad())
					invalidateTranslations = true
					d.readFD.Store(h.fd)
				} else {
					// Otherwise, we want to avoid invalidating existing
					// memmap.Translations (which is expensive); instead, use
					// dup3 to make the old file descriptor refer to the new
					// file description, then close the new file descriptor
					// (which is no longer needed). Racing callers of d.pf.FD()
					// may use the old or new file description, but this
					// doesn't matter since they refer to the same file, and
					// any racing mappings must be read-only.
					if err := unix.Dup3(int(h.fd), int(d.readFD.RacyLoad()), unix.O_CLOEXEC); err != nil {
						oldFD := d.readFD.RacyLoad()
						d.handleMu.Unlock()
						ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, oldFD, err)
						h.close(ctx)
						return err
					}
					// The old FD number now refers to the new file
					// description; keep using that number and schedule the
					// freshly opened one for closing.
					fdsToClose = append(fdsToClose, h.fd)
					h.fd = d.readFD.RacyLoad()
				}
			} else {
				d.readFD.Store(h.fd)
			}
			// Retire a distinct, previously write-only FD.
			if d.writeFD.RacyLoad() != h.fd && d.writeFD.RacyLoad() >= 0 {
				fdsToClose = append(fdsToClose, d.writeFD.RacyLoad())
			}
			d.writeFD.Store(h.fd)
			d.mmapFD.Store(h.fd)
		} else if openReadable && d.readFD.RacyLoad() < 0 {
			readHandleWasOk := d.isReadHandleOk()
			d.readFD.Store(h.fd)
			// If the file has not been opened for writing, the new FD may
			// be used for read-only memory mappings. If the file was
			// previously opened for reading (without an FD), then existing
			// translations of the file may use the internal page cache;
			// invalidate those mappings.
			if !d.isWriteHandleOk() {
				invalidateTranslations = readHandleWasOk
				d.mmapFD.Store(h.fd)
			}
		} else if openWritable && d.writeFD.RacyLoad() < 0 {
			d.writeFD.Store(h.fd)
			if d.readFD.RacyLoad() >= 0 {
				// We have an existing read-only FD, but the file has just
				// been opened for writing, so we need to start supporting
				// writable memory mappings. However, the new FD is not
				// readable, so we have no FD that can be used to create
				// writable memory mappings. Switch to using the internal
				// page cache.
				invalidateTranslations = true
				d.mmapFD.Store(-1)
			}
		} else {
			// The new FD is not useful.
			fdsToClose = append(fdsToClose, h.fd)
		}
	} else if openWritable && d.writeFD.RacyLoad() < 0 && d.mmapFD.RacyLoad() >= 0 {
		// We have an existing read-only FD, but the file has just been
		// opened for writing, so we need to start supporting writable
		// memory mappings. However, we have no writable host FD. Switch to
		// using the internal page cache.
		invalidateTranslations = true
		d.mmapFD.Store(-1)
	}

	d.updateHandles(ctx, h, openReadable, openWritable)
	d.handleMu.Unlock()

	if invalidateTranslations {
		// Invalidate application mappings that may be using an old FD; they
		// will be replaced with mappings using the new FD after future calls
		// to d.Translate(). This requires holding d.mapsMu, which precedes
		// d.handleMu in the lock order.
		d.mapsMu.Lock()
		d.mappings.InvalidateAll(memmap.InvalidateOpts{})
		d.mapsMu.Unlock()
	}
	// Close superseded FDs last; close errors are ignored.
	for _, fd := range fdsToClose {
		unix.Close(int(fd))
	}

	return nil
}
  2077  
  2078  func (d *dentry) syncRemoteFile(ctx context.Context) error {
  2079  	d.handleMu.RLock()
  2080  	defer d.handleMu.RUnlock()
  2081  	return d.syncRemoteFileLocked(ctx)
  2082  }
  2083  
// syncRemoteFileLocked syncs d's write handle and then its read handle.
//
// NOTE(review): whatever wh.sync and rh.sync yield is discarded and this
// function always returns nil, so callers that check its error (such as
// syncCachedFile) can never observe a failure from here. Confirm whether this
// best-effort behavior is intentional.
//
// Preconditions: d.handleMu must be locked.
func (d *dentry) syncRemoteFileLocked(ctx context.Context) error {
	// Prefer syncing write handles over read handles, since some remote
	// filesystem implementations may not sync changes made through write
	// handles otherwise.
	wh := d.writeHandle()
	wh.sync(ctx)
	rh := d.readHandle()
	rh.sync(ctx)
	return nil
}
  2095  
  2096  func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) error {
  2097  	d.handleMu.RLock()
  2098  	defer d.handleMu.RUnlock()
  2099  	if d.isWriteHandleOk() {
  2100  		// Write back dirty pages to the remote file.
  2101  		d.dataMu.Lock()
  2102  		h := d.writeHandle()
  2103  		err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), d.fs.mf, h.writeFromBlocksAt)
  2104  		d.dataMu.Unlock()
  2105  		if err != nil {
  2106  			return err
  2107  		}
  2108  	}
  2109  	if err := d.syncRemoteFileLocked(ctx); err != nil {
  2110  		if !forFilesystemSync {
  2111  			return err
  2112  		}
  2113  		// Only return err if we can reasonably have expected sync to succeed
  2114  		// (d is a regular file and was opened for writing).
  2115  		if d.isRegularFile() && d.isWriteHandleOk() {
  2116  			return err
  2117  		}
  2118  		ctx.Debugf("gofer.dentry.syncCachedFile: syncing non-writable or non-regular-file dentry failed: %v", err)
  2119  	}
  2120  	return nil
  2121  }
  2122  
  2123  // incLinks increments link count.
  2124  func (d *dentry) incLinks() {
  2125  	if d.nlink.Load() == 0 {
  2126  		// The remote filesystem doesn't support link count.
  2127  		return
  2128  	}
  2129  	d.nlink.Add(1)
  2130  }
  2131  
  2132  // decLinks decrements link count.
  2133  func (d *dentry) decLinks() {
  2134  	if d.nlink.Load() == 0 {
  2135  		// The remote filesystem doesn't support link count.
  2136  		return
  2137  	}
  2138  	d.nlink.Add(^uint32(0))
  2139  }
  2140  
// fileDescription is embedded by gofer implementations of
// vfs.FileDescriptionImpl. It bundles the VFS file description together with
// the default implementation and lock state shared by those implementations.
//
// +stateify savable
type fileDescription struct {
	// vfsfd is the VFS-level file description; it is used to reach the mount,
	// the dentry and the concrete FD implementation.
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.LockFD

	// lockLogging ensures that the informational message emitted by LockBSD
	// and LockPOSIX is logged only once. It is not saved.
	lockLogging sync.Once `state:"nosave"`
}
  2152  
  2153  func (fd *fileDescription) filesystem() *filesystem {
  2154  	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
  2155  }
  2156  
  2157  func (fd *fileDescription) dentry() *dentry {
  2158  	return fd.vfsfd.Dentry().Impl().(*dentry)
  2159  }
  2160  
  2161  // Stat implements vfs.FileDescriptionImpl.Stat.
  2162  func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
  2163  	d := fd.dentry()
  2164  	const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME)
  2165  	if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC {
  2166  		// Use specialFileFD.handle.fileLisa for the Stat if available, for the
  2167  		// same reason that we try to use open FD in updateMetadataLocked().
  2168  		var err error
  2169  		if sffd, ok := fd.vfsfd.Impl().(*specialFileFD); ok {
  2170  			err = sffd.updateMetadata(ctx)
  2171  		} else {
  2172  			err = d.updateMetadata(ctx)
  2173  		}
  2174  		if err != nil {
  2175  			return linux.Statx{}, err
  2176  		}
  2177  	}
  2178  	var stat linux.Statx
  2179  	d.statTo(&stat)
  2180  	return stat, nil
  2181  }
  2182  
  2183  // SetStat implements vfs.FileDescriptionImpl.SetStat.
  2184  func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
  2185  	fs := fd.filesystem()
  2186  	fs.renameMu.RLock()
  2187  	defer fs.renameMu.RUnlock()
  2188  	return fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts, fd.vfsfd.Mount())
  2189  }
  2190  
  2191  // ListXattr implements vfs.FileDescriptionImpl.ListXattr.
  2192  func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
  2193  	return fd.dentry().listXattr(ctx, size)
  2194  }
  2195  
  2196  // GetXattr implements vfs.FileDescriptionImpl.GetXattr.
  2197  func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
  2198  	return fd.dentry().getXattr(ctx, auth.CredentialsFromContext(ctx), &opts)
  2199  }
  2200  
  2201  // SetXattr implements vfs.FileDescriptionImpl.SetXattr.
  2202  func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
  2203  	return fd.dentry().setXattr(ctx, auth.CredentialsFromContext(ctx), &opts)
  2204  }
  2205  
  2206  // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
  2207  func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
  2208  	return fd.dentry().removeXattr(ctx, auth.CredentialsFromContext(ctx), name)
  2209  }
  2210  
  2211  // LockBSD implements vfs.FileDescriptionImpl.LockBSD.
  2212  func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block bool) error {
  2213  	fd.lockLogging.Do(func() {
  2214  		log.Infof("File lock using gofer file handled internally.")
  2215  	})
  2216  	return fd.LockFD.LockBSD(ctx, uid, ownerPID, t, block)
  2217  }
  2218  
  2219  // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
  2220  func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block bool) error {
  2221  	fd.lockLogging.Do(func() {
  2222  		log.Infof("Range lock using gofer file handled internally.")
  2223  	})
  2224  	return fd.Locks().LockPOSIX(ctx, uid, ownerPID, t, r, block)
  2225  }
  2226  
  2227  // UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
  2228  func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error {
  2229  	return fd.Locks().UnlockPOSIX(ctx, uid, r)
  2230  }
  2231  
// resolvingPath is just a wrapper around *vfs.ResolvingPath. It additionally
// holds some information around the intent behind resolving the path, which
// determines when resolution is considered finished (see done) and which
// components are emitted (see getComponents).
type resolvingPath struct {
	*vfs.ResolvingPath

	// excludeLast indicates whether the intent is to resolve until the last path
	// component. If true, the last path component should remain unresolved.
	excludeLast bool
}
  2241  
  2242  func resolvingPathFull(rp *vfs.ResolvingPath) resolvingPath {
  2243  	return resolvingPath{ResolvingPath: rp, excludeLast: false}
  2244  }
  2245  
  2246  func resolvingPathParent(rp *vfs.ResolvingPath) resolvingPath {
  2247  	return resolvingPath{ResolvingPath: rp, excludeLast: true}
  2248  }
  2249  
  2250  func (rp *resolvingPath) done() bool {
  2251  	if rp.excludeLast {
  2252  		return rp.Final()
  2253  	}
  2254  	return rp.Done()
  2255  }
  2256  
  2257  func (rp *resolvingPath) copy() resolvingPath {
  2258  	return resolvingPath{
  2259  		ResolvingPath: rp.ResolvingPath.Copy(),
  2260  		excludeLast:   rp.excludeLast,
  2261  	}
  2262  }
  2263  
  2264  // Precondition: !rp.done() && rp.Component() is not "." or "..".
  2265  func (rp *resolvingPath) getComponents(emit func(string) bool) {
  2266  	rp.GetComponents(rp.excludeLast, emit)
  2267  }