github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/pkg/sentry/fsimpl/gofer/gofer.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package gofer provides a filesystem implementation that is backed by a 9p
    16  // server, interchangeably referred to as "gofers" throughout this package.
    17  //
    18  // Lock order:
    19  //
    20  //	regularFileFD/directoryFD.mu
    21  //	  filesystem.renameMu
    22  //	    dentry.cachingMu
    23  //	      dentryCache.mu
    24  //	      dentry.opMu
    25  //	        dentry.childrenMu
    26  //	        filesystem.syncMu
    27  //	        dentry.metadataMu
    28  //	          *** "memmap.Mappable locks" below this point
    29  //	          dentry.mapsMu
    30  //	            *** "memmap.Mappable locks taken by Translate" below this point
    31  //	            dentry.handleMu
    32  //	              dentry.dataMu
    33  //	          filesystem.inoMu
    34  //	specialFileFD.mu
    35  //	  specialFileFD.bufMu
    36  //
    37  // Locking dentry.opMu and dentry.metadataMu in multiple dentries requires that
    38  // either ancestor dentries are locked before descendant dentries, or that
    39  // filesystem.renameMu is locked for writing.
    40  package gofer
    41  
    42  import (
    43  	"fmt"
    44  	"path"
    45  	"strconv"
    46  	"strings"
    47  
    48  	"golang.org/x/sys/unix"
    49  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    50  	"github.com/nicocha30/gvisor-ligolo/pkg/atomicbitops"
    51  	"github.com/nicocha30/gvisor-ligolo/pkg/cleanup"
    52  	"github.com/nicocha30/gvisor-ligolo/pkg/context"
    53  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    54  	"github.com/nicocha30/gvisor-ligolo/pkg/hostarch"
    55  	"github.com/nicocha30/gvisor-ligolo/pkg/lisafs"
    56  	"github.com/nicocha30/gvisor-ligolo/pkg/log"
    57  	"github.com/nicocha30/gvisor-ligolo/pkg/refs"
    58  	fslock "github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/lock"
    59  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsutil"
    60  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
    61  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/pipe"
    62  	ktime "github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/time"
    63  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/memmap"
    64  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/pgalloc"
    65  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/socket/unix/transport"
    66  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
    67  	"github.com/nicocha30/gvisor-ligolo/pkg/sync"
    68  	"github.com/nicocha30/gvisor-ligolo/pkg/unet"
    69  )
    70  
// Name is the default filesystem name. It is the value returned by
// FilesystemType.Name.
const Name = "9p"

// Mount option names for goferfs. Each of these is a key that
// FilesystemType.GetFilesystem looks up in (and removes from) the parsed mount
// option map; any key left over afterwards is rejected as unknown.
const (
	// moptTransport must be present and equal to transportModeFD.
	moptTransport = "trans"
	// moptReadFD and moptWriteFD must both be present and must name the same
	// file descriptor.
	moptReadFD  = "rfdno"
	moptWriteFD = "wfdno"
	// moptAname is the attach name; it must be an absolute path and defaults
	// to "/".
	moptAname = "aname"
	// moptDfltUID and moptDfltGID override the default owner and group.
	moptDfltUID = "dfltuid"
	moptDfltGID = "dfltgid"
	// moptCache selects the InteropMode; see the cache* constants below.
	moptCache = "cache"

	// The remaining options are presence-only flags: their value is ignored
	// and each one sets the correspondingly named filesystemOptions field.
	moptForcePageCache           = "force_page_cache"
	moptLimitHostFDTranslation   = "limit_host_fd_translation"
	moptOverlayfsStaleRead       = "overlayfs_stale_read"
	moptDisableFileHandleSharing = "disable_file_handle_sharing"
	moptDisableFifoOpen          = "disable_fifo_open"

	// Directfs options.
	moptDirectfs = "directfs"
)

// Valid values for the "cache" mount option.
const (
	// cacheFSCache selects InteropModeExclusive.
	cacheFSCache = "fscache"
	// cacheFSCacheWritethrough selects InteropModeWritethrough.
	cacheFSCacheWritethrough = "fscache_writethrough"
	// cacheRemoteRevalidating selects InteropModeShared.
	cacheRemoteRevalidating = "remote_revalidating"
)

const (
	// defaultMaxCachedDentries is the capacity of the per-filesystem dentry
	// cache that GetFilesystem creates when no global dentry cache has been
	// configured through SetDentryCacheSize.
	defaultMaxCachedDentries = 1000
	// maxCachedNegativeChildren is not referenced in this part of the file.
	// NOTE(review): presumably the size passed to stringFixedCache.init for a
	// directory's negative-children cache -- confirm against the caller.
	maxCachedNegativeChildren = 1000
)
   104  
// stringFixedCache is a fixed sized FIFO cache of names. Once initialized via
// init, its size never changes: add reuses the oldest list element instead of
// allocating a new one.
//
// +stateify savable
type stringFixedCache struct {
	// namesList stores negative names in a FIFO list, most recently added
	// first. A name stored in namesList only means that it used to be
	// negative at the moment it was pushed to the list.
	namesList stringList
	// size is the number of elements init placed in namesList. It is zero
	// until init is called, which is what isInited tests.
	size uint64
}
   116  
   117  func (cache *stringFixedCache) isInited() bool {
   118  	return cache.size != 0
   119  }
   120  
   121  func (cache *stringFixedCache) init(size uint64) {
   122  	elements := make([]stringListElem, size)
   123  	for i := uint64(0); i < size; i++ {
   124  		cache.namesList.PushFront(&elements[i])
   125  	}
   126  	cache.size = size
   127  }
   128  
   129  // Update will push name to the front of the list,
   130  // and pop the tail value.
   131  func (cache *stringFixedCache) add(name string) string {
   132  	tail := cache.namesList.Back()
   133  	victimName := tail.str
   134  	tail.str = name
   135  	cache.namesList.Remove(tail)
   136  	cache.namesList.PushFront(tail)
   137  	return victimName
   138  }
   139  
// dentryCache is a bounded list of dentries that are kept alive after their
// reference count drops to zero. A dentryCache is either private to one
// filesystem or, when SetDentryCacheSize has been called, shared by all gofer
// filesystems as globalDentryCache.
//
// +stateify savable
type dentryCache struct {
	// mu protects the below fields.
	mu sync.Mutex `state:"nosave"`
	// dentries contains all dentries with 0 references. Due to race conditions,
	// it may also contain dentries with non-zero references.
	dentries dentryList
	// dentriesLen is the number of dentries in dentries.
	dentriesLen uint64
	// maxCachedDentries is the maximum number of cachable dentries.
	maxCachedDentries uint64
}
   152  
   153  // SetDentryCacheSize sets the size of the global gofer dentry cache.
   154  func SetDentryCacheSize(size int) {
   155  	if size < 0 {
   156  		return
   157  	}
   158  	if globalDentryCache != nil {
   159  		log.Warningf("Global dentry cache has already been initialized. Ignoring subsequent attempt.")
   160  		return
   161  	}
   162  	globalDentryCache = &dentryCache{maxCachedDentries: uint64(size)}
   163  }
   164  
// globalDentryCache is a global cache of dentries across all gofers. It is nil
// unless SetDentryCacheSize has been called, in which case GetFilesystem uses
// it instead of creating a per-filesystem cache.
var globalDentryCache *dentryCache
   167  
// Valid values for "trans" mount option. "fd" is the only transport accepted
// by getFDFromMountOptionsMap.
const transportModeFD = "fd"
   170  
// FilesystemType implements vfs.FilesystemType. It carries no state; all
// per-mount state lives in the filesystem created by GetFilesystem.
//
// +stateify savable
type FilesystemType struct{}
   175  
// filesystem implements vfs.FilesystemImpl. One filesystem is created per
// successful call to FilesystemType.GetFilesystem.
//
// +stateify savable
type filesystem struct {
	vfsfs vfs.Filesystem

	// mfp is used to allocate memory that caches regular file contents. mfp is
	// immutable.
	mfp pgalloc.MemoryFileProvider

	// Immutable options. opts is parsed from the mount option string; iopts is
	// taken from vfs.GetFilesystemOptions.InternalData.
	opts  filesystemOptions
	iopts InternalFilesystemOptions

	// client is the LISAFS client used for communicating with the server. client
	// is immutable. It may be nil if creating the client failed in
	// GetFilesystem.
	client *lisafs.Client `state:"nosave"`

	// clock is a realtime clock used to set timestamps in file operations.
	clock ktime.Clock

	// devMinor is the filesystem's minor device number. devMinor is immutable.
	// It is obtained from VirtualFilesystem.GetAnonBlockDevMinor and returned
	// with PutAnonBlockDevMinor in Release.
	devMinor uint32

	// root is the root dentry. root is immutable. It may be nil if creating
	// the client or initializing the root dentry failed in GetFilesystem.
	root *dentry

	// renameMu serves two purposes:
	//
	//	- It synchronizes path resolution with renaming initiated by this
	//		client.
	//
	//	- It is held by path resolution to ensure that reachable dentries remain
	//		valid. A dentry is reachable by path resolution if it has a non-zero
	//		reference count (such that it is usable as vfs.ResolvingPath.Start() or
	//		is reachable from its children), or if it is a child dentry (such that
	//		it is reachable from its parent).
	renameMu sync.RWMutex `state:"nosave"`

	// dentryCache is the cache for this filesystem's unreferenced dentries. It
	// is globalDentryCache if one was configured when the filesystem was
	// created, and otherwise a cache private to this filesystem.
	dentryCache *dentryCache

	// syncableDentries contains all non-synthetic dentries. specialFileFDs
	// contains all open specialFileFDs. These fields are protected by syncMu.
	syncMu           sync.Mutex `state:"nosave"`
	syncableDentries dentryList
	specialFileFDs   specialFDList

	// inoByKey maps previously-observed device ID and host inode numbers to
	// internal inode numbers assigned to those files. inoByKey is not preserved
	// across checkpoint/restore because inode numbers may be reused between
	// different gofer processes, so inode numbers may be repeated for different
	// files across checkpoint/restore. inoByKey is protected by inoMu.
	inoMu    sync.Mutex        `state:"nosave"`
	inoByKey map[inoKey]uint64 `state:"nosave"`

	// lastIno is the last inode number assigned to a file. lastIno is accessed
	// using atomic memory operations.
	lastIno atomicbitops.Uint64

	// savedDentryRW records open read/write handles during save/restore.
	savedDentryRW map[*dentry]savedDentryRW

	// released is nonzero once filesystem.Release has been called.
	released atomicbitops.Int32
}
   241  
// filesystemOptions holds the options parsed from the mount option string by
// FilesystemType.GetFilesystem.
//
// +stateify savable
type filesystemOptions struct {
	// fd is the host file descriptor given by the "rfdno"/"wfdno" mount
	// options; it is wrapped in a socket to talk to the server. aname is the
	// cleaned, absolute attach name ("/" by default). dfltuid and dfltgid
	// default to _V9FS_DEFUID and _V9FS_DEFGID.
	fd      int
	aname   string
	interop InteropMode // derived from the "cache" mount option
	dfltuid auth.KUID
	dfltgid auth.KGID

	// If forcePageCache is true, host FDs may not be used for application
	// memory mappings even if available; instead, the client must perform its
	// own caching of regular file pages. This is primarily useful for testing.
	forcePageCache bool

	// If limitHostFDTranslation is true, apply maxFillRange() constraints to
	// host FD mappings returned by dentry.(memmap.Mappable).Translate(). This
	// makes memory accounting behavior more consistent between cases where
	// host FDs are / are not available, but may increase the frequency of
	// sentry-handled page faults on files for which a host FD is available.
	limitHostFDTranslation bool

	// If overlayfsStaleRead is true, O_RDONLY host FDs provided by the remote
	// filesystem may not be coherent with writable host FDs opened later, so
	// all uses of the former must be replaced by uses of the latter. This is
	// usually only the case when the remote filesystem is a Linux overlayfs
	// mount. (Prior to Linux 4.18, patch series centered on commit
	// d1d04ef8572b "ovl: stack file ops", both I/O and memory mappings were
	// incoherent between pre-copy-up and post-copy-up FDs; after that patch
	// series, only memory mappings are incoherent.)
	overlayfsStaleRead bool

	// If regularFilesUseSpecialFileFD is true, application FDs representing
	// regular files will use distinct file handles for each FD, in the same
	// way that application FDs representing "special files" such as sockets
	// do. Note that this disables client caching for regular files. This option
	// may regress performance due to excessive Open RPCs. This option is not
	// supported with overlayfsStaleRead for now. It is set by the
	// "disable_file_handle_sharing" mount option.
	regularFilesUseSpecialFileFD bool

	// If disableFifoOpen is true, application attempts to open(2) a host FIFO
	// are disallowed.
	disableFifoOpen bool

	// directfs holds options for directfs mode.
	directfs directfsOpts
}
   287  
// directfsOpts holds the options that control directfs mode.
//
// +stateify savable
type directfsOpts struct {
	// If directfs is enabled, the gofer client does not make RPCs to the gofer
	// process. Instead, it makes host syscalls to perform file operations.
	// enabled is set by the "directfs" mount option.
	enabled bool
}
   294  
// InteropMode controls the client's interaction with other remote filesystem
// users. It is selected with the "cache" mount option.
//
// +stateify savable
type InteropMode uint32

const (
	// InteropModeExclusive is appropriate when the filesystem client is the
	// only user of the remote filesystem. It is selected by "cache=fscache"
	// and is also the default when no "cache" option is given.
	//
	//	- The client may cache arbitrary filesystem state (file data, metadata,
	//		filesystem structure, etc.).
	//
	//	- Client changes to filesystem state may be sent to the remote
	//		filesystem asynchronously, except when server permission checks are
	//		necessary.
	//
	//	- File timestamps are based on client clocks. This ensures that users of
	//		the client observe timestamps that are coherent with their own clocks
	//		and consistent with Linux's semantics (in particular, it is not always
	//		possible for clients to set arbitrary atimes and mtimes depending on the
	//		remote filesystem implementation, and never possible for clients to set
	//		arbitrary ctimes.)
	InteropModeExclusive InteropMode = iota

	// InteropModeWritethrough is appropriate when there are read-only users of
	// the remote filesystem that expect to observe changes made by the
	// filesystem client. It is selected by "cache=fscache_writethrough".
	//
	//	- The client may cache arbitrary filesystem state.
	//
	//	- Client changes to filesystem state must be sent to the remote
	//		filesystem synchronously.
	//
	//	- File timestamps are based on client clocks. As a corollary, access
	//		timestamp changes from other remote filesystem users will not be visible
	//		to the client.
	InteropModeWritethrough

	// InteropModeShared is appropriate when there are users of the remote
	// filesystem that may mutate its state other than the client. It is
	// selected by "cache=remote_revalidating".
	//
	//	- The client must verify ("revalidate") cached filesystem state before
	//		using it.
	//
	//	- Client changes to filesystem state must be sent to the remote
	//		filesystem synchronously.
	//
	//	- File timestamps are based on server clocks. This is necessary to
	//		ensure that timestamp changes are synchronized between remote filesystem
	//		users.
	//
	// Note that the correctness of InteropModeShared depends on the server
	// correctly implementing 9P fids (i.e. each fid immutably represents a
	// single filesystem object), even in the presence of remote filesystem
	// mutations from other users. If this is violated, the behavior of the
	// client is undefined.
	InteropModeShared
)
   354  
// InternalFilesystemOptions may be passed as
// vfs.GetFilesystemOptions.InternalData to FilesystemType.GetFilesystem.
// GetFilesystem rejects InternalData of any other non-nil type with EINVAL;
// when InternalData is nil, the zero value of this struct is used.
//
// +stateify savable
type InternalFilesystemOptions struct {
	// If UniqueID is non-empty, it is an opaque string used to reassociate the
	// filesystem with a new server FD during restoration from checkpoint.
	UniqueID string

	// If LeakConnection is true, do not close the connection to the server
	// when the Filesystem is released. This is necessary for deployments in
	// which servers can handle only a single client and report failure if that
	// client disconnects.
	LeakConnection bool

	// If OpenSocketsByConnecting is true, silently translate attempts to open
	// files identifying as sockets to connect RPCs.
	OpenSocketsByConnecting bool
}
   374  
// _V9FS_DEFUID and _V9FS_DEFGID (from Linux's fs/9p/v9fs.h) are the default
// UIDs and GIDs used for files that do not provide a specific owner or group
// respectively. GetFilesystem uses them when the "dfltuid" / "dfltgid" mount
// options are absent.
const (
	// uint32(-2) doesn't work in Go, so the value is spelled out:
	// 4294967294 == 2^32 - 2.
	_V9FS_DEFUID = auth.KUID(4294967294)
	_V9FS_DEFGID = auth.KGID(4294967294)
)
   383  
// Name implements vfs.FilesystemType.Name. It returns the package-level Name
// constant.
func (FilesystemType) Name() string {
	return Name
}
   388  
// Release implements vfs.FilesystemType.Release. It is a no-op because
// FilesystemType holds no resources.
func (FilesystemType) Release(ctx context.Context) {}
   391  
   392  // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
   393  func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
   394  	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
   395  	if mfp == nil {
   396  		ctx.Warningf("gofer.FilesystemType.GetFilesystem: context does not provide a pgalloc.MemoryFileProvider")
   397  		return nil, nil, linuxerr.EINVAL
   398  	}
   399  
   400  	mopts := vfs.GenericParseMountOptions(opts.Data)
   401  	var fsopts filesystemOptions
   402  
   403  	fd, err := getFDFromMountOptionsMap(ctx, mopts)
   404  	if err != nil {
   405  		return nil, nil, err
   406  	}
   407  	fsopts.fd = fd
   408  
   409  	// Get the attach name.
   410  	fsopts.aname = "/"
   411  	if aname, ok := mopts[moptAname]; ok {
   412  		delete(mopts, moptAname)
   413  		if !path.IsAbs(aname) {
   414  			ctx.Warningf("gofer.FilesystemType.GetFilesystem: aname is not absolute: %s=%s", moptAname, aname)
   415  			return nil, nil, linuxerr.EINVAL
   416  		}
   417  		fsopts.aname = path.Clean(aname)
   418  	}
   419  
   420  	// Parse the cache policy. For historical reasons, this defaults to the
   421  	// least generally-applicable option, InteropModeExclusive.
   422  	fsopts.interop = InteropModeExclusive
   423  	if cache, ok := mopts[moptCache]; ok {
   424  		delete(mopts, moptCache)
   425  		switch cache {
   426  		case cacheFSCache:
   427  			fsopts.interop = InteropModeExclusive
   428  		case cacheFSCacheWritethrough:
   429  			fsopts.interop = InteropModeWritethrough
   430  		case cacheRemoteRevalidating:
   431  			fsopts.interop = InteropModeShared
   432  		default:
   433  			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid cache policy: %s=%s", moptCache, cache)
   434  			return nil, nil, linuxerr.EINVAL
   435  		}
   436  	}
   437  
   438  	// Parse the default UID and GID.
   439  	fsopts.dfltuid = _V9FS_DEFUID
   440  	if dfltuidstr, ok := mopts[moptDfltUID]; ok {
   441  		delete(mopts, moptDfltUID)
   442  		dfltuid, err := strconv.ParseUint(dfltuidstr, 10, 32)
   443  		if err != nil {
   444  			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: %s=%s", moptDfltUID, dfltuidstr)
   445  			return nil, nil, linuxerr.EINVAL
   446  		}
   447  		// In Linux, dfltuid is interpreted as a UID and is converted to a KUID
   448  		// in the caller's user namespace, but goferfs isn't
   449  		// application-mountable.
   450  		fsopts.dfltuid = auth.KUID(dfltuid)
   451  	}
   452  	fsopts.dfltgid = _V9FS_DEFGID
   453  	if dfltgidstr, ok := mopts[moptDfltGID]; ok {
   454  		delete(mopts, moptDfltGID)
   455  		dfltgid, err := strconv.ParseUint(dfltgidstr, 10, 32)
   456  		if err != nil {
   457  			ctx.Warningf("gofer.FilesystemType.GetFilesystem: invalid default UID: %s=%s", moptDfltGID, dfltgidstr)
   458  			return nil, nil, linuxerr.EINVAL
   459  		}
   460  		fsopts.dfltgid = auth.KGID(dfltgid)
   461  	}
   462  
   463  	// Handle simple flags.
   464  	if _, ok := mopts[moptDisableFileHandleSharing]; ok {
   465  		delete(mopts, moptDisableFileHandleSharing)
   466  		fsopts.regularFilesUseSpecialFileFD = true
   467  	}
   468  	if _, ok := mopts[moptDisableFifoOpen]; ok {
   469  		delete(mopts, moptDisableFifoOpen)
   470  		fsopts.disableFifoOpen = true
   471  	}
   472  	if _, ok := mopts[moptForcePageCache]; ok {
   473  		delete(mopts, moptForcePageCache)
   474  		fsopts.forcePageCache = true
   475  	}
   476  	if _, ok := mopts[moptLimitHostFDTranslation]; ok {
   477  		delete(mopts, moptLimitHostFDTranslation)
   478  		fsopts.limitHostFDTranslation = true
   479  	}
   480  	if _, ok := mopts[moptOverlayfsStaleRead]; ok {
   481  		delete(mopts, moptOverlayfsStaleRead)
   482  		fsopts.overlayfsStaleRead = true
   483  	}
   484  	if _, ok := mopts[moptDirectfs]; ok {
   485  		delete(mopts, moptDirectfs)
   486  		fsopts.directfs.enabled = true
   487  	}
   488  	// fsopts.regularFilesUseSpecialFileFD can only be enabled by specifying
   489  	// "cache=none".
   490  
   491  	// Check for unparsed options.
   492  	if len(mopts) != 0 {
   493  		ctx.Warningf("gofer.FilesystemType.GetFilesystem: unknown options: %v", mopts)
   494  		return nil, nil, linuxerr.EINVAL
   495  	}
   496  
   497  	// Validation.
   498  	if fsopts.regularFilesUseSpecialFileFD && fsopts.overlayfsStaleRead {
   499  		// These options are not supported together. To support this, when a dentry
   500  		// is opened writably for the first time, we need to iterate over all the
   501  		// specialFileFDs of that dentry that represent a regular file and call
   502  		// fd.hostFileMapper.RegenerateMappings(writable_fd).
   503  		ctx.Warningf("gofer.FilesystemType.GetFilesystem: regularFilesUseSpecialFileFD and overlayfsStaleRead options are not supported together.")
   504  		return nil, nil, linuxerr.EINVAL
   505  	}
   506  
   507  	// Handle internal options.
   508  	iopts, ok := opts.InternalData.(InternalFilesystemOptions)
   509  	if opts.InternalData != nil && !ok {
   510  		ctx.Warningf("gofer.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted gofer.InternalFilesystemOptions", opts.InternalData)
   511  		return nil, nil, linuxerr.EINVAL
   512  	}
   513  	// If !ok, iopts being the zero value is correct.
   514  
   515  	// Construct the filesystem object.
   516  	devMinor, err := vfsObj.GetAnonBlockDevMinor()
   517  	if err != nil {
   518  		return nil, nil, err
   519  	}
   520  	fs := &filesystem{
   521  		mfp:      mfp,
   522  		opts:     fsopts,
   523  		iopts:    iopts,
   524  		clock:    ktime.RealtimeClockFromContext(ctx),
   525  		devMinor: devMinor,
   526  		inoByKey: make(map[inoKey]uint64),
   527  	}
   528  
   529  	// Did the user configure a global dentry cache?
   530  	if globalDentryCache != nil {
   531  		fs.dentryCache = globalDentryCache
   532  	} else {
   533  		fs.dentryCache = &dentryCache{maxCachedDentries: defaultMaxCachedDentries}
   534  	}
   535  
   536  	fs.vfsfs.Init(vfsObj, &fstype, fs)
   537  
   538  	rootInode, rootHostFD, err := fs.initClientAndGetRoot(ctx)
   539  	if err != nil {
   540  		fs.vfsfs.DecRef(ctx)
   541  		return nil, nil, err
   542  	}
   543  	if fs.opts.directfs.enabled {
   544  		fs.root, err = fs.getDirectfsRootDentry(ctx, rootHostFD, fs.client.NewFD(rootInode.ControlFD))
   545  	} else {
   546  		fs.root, err = fs.newLisafsDentry(ctx, &rootInode)
   547  	}
   548  	if err != nil {
   549  		fs.vfsfs.DecRef(ctx)
   550  		return nil, nil, err
   551  	}
   552  	// Set the root's reference count to 2. One reference is returned to the
   553  	// caller, and the other is held by fs to prevent the root from being "cached"
   554  	// and subsequently evicted.
   555  	fs.root.refs = atomicbitops.FromInt64(2)
   556  	return &fs.vfsfs, &fs.root.vfsd, nil
   557  }
   558  
   559  // initClientAndGetRoot initializes fs.client and returns the root inode for
   560  // this mount point. It handles the attach point (fs.opts.aname) resolution.
   561  func (fs *filesystem) initClientAndGetRoot(ctx context.Context) (lisafs.Inode, int, error) {
   562  	sock, err := unet.NewSocket(fs.opts.fd)
   563  	if err != nil {
   564  		return lisafs.Inode{}, -1, err
   565  	}
   566  
   567  	ctx.UninterruptibleSleepStart(false)
   568  	defer ctx.UninterruptibleSleepFinish(false)
   569  
   570  	var (
   571  		rootInode  lisafs.Inode
   572  		rootHostFD int
   573  	)
   574  	fs.client, rootInode, rootHostFD, err = lisafs.NewClient(sock)
   575  	if err != nil {
   576  		return lisafs.Inode{}, -1, err
   577  	}
   578  
   579  	cu := cleanup.Make(func() {
   580  		if rootHostFD >= 0 {
   581  			_ = unix.Close(rootHostFD)
   582  		}
   583  		rootControlFD := fs.client.NewFD(rootInode.ControlFD)
   584  		rootControlFD.Close(ctx, false /* flush */)
   585  	})
   586  	defer cu.Clean()
   587  
   588  	if fs.opts.directfs.enabled {
   589  		if fs.opts.aname != "/" {
   590  			log.Warningf("directfs does not support aname filesystem option: aname=%q", fs.opts.aname)
   591  			return lisafs.Inode{}, -1, unix.EINVAL
   592  		}
   593  		if rootHostFD < 0 {
   594  			log.Warningf("Mount RPC did not return host FD to mount point with directfs enabled")
   595  			return lisafs.Inode{}, -1, unix.EINVAL
   596  		}
   597  	} else {
   598  		if rootHostFD >= 0 {
   599  			log.Warningf("Mount RPC returned a host FD to mount point without directfs, we didn't ask for it")
   600  			_ = unix.Close(rootHostFD)
   601  			rootHostFD = -1
   602  		}
   603  		// Use flipcall channels with lisafs because it makes a lot of RPCs.
   604  		if err := fs.client.StartChannels(); err != nil {
   605  			return lisafs.Inode{}, -1, err
   606  		}
   607  		rootInode, err = fs.handleAnameLisafs(ctx, rootInode)
   608  		if err != nil {
   609  			return lisafs.Inode{}, -1, err
   610  		}
   611  	}
   612  	cu.Release()
   613  	return rootInode, rootHostFD, nil
   614  }
   615  
   616  func getFDFromMountOptionsMap(ctx context.Context, mopts map[string]string) (int, error) {
   617  	// Check that the transport is "fd".
   618  	trans, ok := mopts[moptTransport]
   619  	if !ok || trans != transportModeFD {
   620  		ctx.Warningf("gofer.getFDFromMountOptionsMap: transport must be specified as '%s=%s'", moptTransport, transportModeFD)
   621  		return -1, linuxerr.EINVAL
   622  	}
   623  	delete(mopts, moptTransport)
   624  
   625  	// Check that read and write FDs are provided and identical.
   626  	rfdstr, ok := mopts[moptReadFD]
   627  	if !ok {
   628  		ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD must be specified as '%s=<file descriptor>'", moptReadFD)
   629  		return -1, linuxerr.EINVAL
   630  	}
   631  	delete(mopts, moptReadFD)
   632  	rfd, err := strconv.Atoi(rfdstr)
   633  	if err != nil {
   634  		ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid read FD: %s=%s", moptReadFD, rfdstr)
   635  		return -1, linuxerr.EINVAL
   636  	}
   637  	wfdstr, ok := mopts[moptWriteFD]
   638  	if !ok {
   639  		ctx.Warningf("gofer.getFDFromMountOptionsMap: write FD must be specified as '%s=<file descriptor>'", moptWriteFD)
   640  		return -1, linuxerr.EINVAL
   641  	}
   642  	delete(mopts, moptWriteFD)
   643  	wfd, err := strconv.Atoi(wfdstr)
   644  	if err != nil {
   645  		ctx.Warningf("gofer.getFDFromMountOptionsMap: invalid write FD: %s=%s", moptWriteFD, wfdstr)
   646  		return -1, linuxerr.EINVAL
   647  	}
   648  	if rfd != wfd {
   649  		ctx.Warningf("gofer.getFDFromMountOptionsMap: read FD (%d) and write FD (%d) must be equal", rfd, wfd)
   650  		return -1, linuxerr.EINVAL
   651  	}
   652  	return rfd, nil
   653  }
   654  
   655  // Release implements vfs.FilesystemImpl.Release.
   656  func (fs *filesystem) Release(ctx context.Context) {
   657  	fs.released.Store(1)
   658  
   659  	mf := fs.mfp.MemoryFile()
   660  	fs.syncMu.Lock()
   661  	for elem := fs.syncableDentries.Front(); elem != nil; elem = elem.Next() {
   662  		d := elem.d
   663  		d.handleMu.Lock()
   664  		d.dataMu.Lock()
   665  		if d.isWriteHandleOk() {
   666  			// Write dirty cached data to the remote file.
   667  			h := d.writeHandle()
   668  			if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil {
   669  				log.Warningf("gofer.filesystem.Release: failed to flush dentry: %v", err)
   670  			}
   671  			// TODO(jamieliu): Do we need to flushf/fsync d?
   672  		}
   673  		// Discard cached pages.
   674  		d.cache.DropAll(mf)
   675  		d.dirty.RemoveAll()
   676  		d.dataMu.Unlock()
   677  		// Close host FDs if they exist. We can use RacyLoad() because d.handleMu
   678  		// is locked.
   679  		if d.readFD.RacyLoad() >= 0 {
   680  			_ = unix.Close(int(d.readFD.RacyLoad()))
   681  		}
   682  		if d.writeFD.RacyLoad() >= 0 && d.readFD.RacyLoad() != d.writeFD.RacyLoad() {
   683  			_ = unix.Close(int(d.writeFD.RacyLoad()))
   684  		}
   685  		d.readFD = atomicbitops.FromInt32(-1)
   686  		d.writeFD = atomicbitops.FromInt32(-1)
   687  		d.mmapFD = atomicbitops.FromInt32(-1)
   688  		d.handleMu.Unlock()
   689  	}
   690  	// There can't be any specialFileFDs still using fs, since each such
   691  	// FileDescription would hold a reference on a Mount holding a reference on
   692  	// fs.
   693  	fs.syncMu.Unlock()
   694  
   695  	// If leak checking is enabled, release all outstanding references in the
   696  	// filesystem. We deliberately avoid doing this outside of leak checking; we
   697  	// have released all external resources above rather than relying on dentry
   698  	// destructors. fs.root may be nil if creating the client or initializing the
   699  	// root dentry failed in GetFilesystem.
   700  	if refs.GetLeakMode() != refs.NoLeakChecking && fs.root != nil {
   701  		fs.renameMu.Lock()
   702  		fs.root.releaseSyntheticRecursiveLocked(ctx)
   703  		fs.evictAllCachedDentriesLocked(ctx)
   704  		fs.renameMu.Unlock()
   705  
   706  		// An extra reference was held by the filesystem on the root to prevent it from
   707  		// being cached/evicted.
   708  		fs.root.DecRef(ctx)
   709  	}
   710  
   711  	if !fs.iopts.LeakConnection {
   712  		// Close the connection to the server. This implicitly closes all FDs.
   713  		if fs.client != nil {
   714  			fs.client.Close()
   715  		}
   716  	}
   717  
   718  	fs.vfsfs.VirtualFilesystem().PutAnonBlockDevMinor(fs.devMinor)
   719  }
   720  
   721  // releaseSyntheticRecursiveLocked traverses the tree with root d and decrements
   722  // the reference count on every synthetic dentry. Synthetic dentries have one
   723  // reference for existence that should be dropped during filesystem.Release.
   724  //
   725  // Precondition: d.fs.renameMu is locked for writing.
   726  func (d *dentry) releaseSyntheticRecursiveLocked(ctx context.Context) {
   727  	if d.isSynthetic() {
   728  		d.decRefNoCaching()
   729  		d.checkCachingLocked(ctx, true /* renameMuWriteLocked */)
   730  	}
   731  	if d.isDir() {
   732  		var children []*dentry
   733  		d.childrenMu.Lock()
   734  		for _, child := range d.children {
   735  			children = append(children, child)
   736  		}
   737  		d.childrenMu.Unlock()
   738  		for _, child := range children {
   739  			if child != nil {
   740  				child.releaseSyntheticRecursiveLocked(ctx)
   741  			}
   742  		}
   743  	}
   744  }
   745  
// inoKey is the key used to identify the inode backed by this dentry. It
// combines the host inode number with the major and minor numbers of the
// device the file lives on, and serves as the key type of filesystem.inoByKey.
//
// +stateify savable
type inoKey struct {
	ino      uint64
	devMinor uint32
	devMajor uint32
}
   754  
   755  func inoKeyFromStatx(stat *linux.Statx) inoKey {
   756  	return inoKey{
   757  		ino:      stat.Ino,
   758  		devMinor: stat.DevMinor,
   759  		devMajor: stat.DevMajor,
   760  	}
   761  }
   762  
   763  func inoKeyFromStat(stat *unix.Stat_t) inoKey {
   764  	return inoKey{
   765  		ino:      stat.Ino,
   766  		devMinor: unix.Minor(stat.Dev),
   767  		devMajor: unix.Major(stat.Dev),
   768  	}
   769  }
   770  
// dentry implements vfs.DentryImpl.
//
// A dentry carries the cached state for one file in the gofer filesystem:
// its position in the tree, cached metadata, cached children (for
// directories), host FDs and cached data (for regular files), and inotify
// watches. Each group of fields documents the lock that protects it.
//
// +stateify savable
type dentry struct {
	vfsd vfs.Dentry

	// refs is the reference count. Each dentry holds a reference on its
	// parent, even if disowned. An additional reference is held on all
	// synthetic dentries until they are unlinked or invalidated. When refs
	// reaches 0, the dentry may be added to the cache or destroyed. If refs ==
	// -1, the dentry has already been destroyed. refs is accessed using atomic
	// memory operations.
	refs atomicbitops.Int64

	// fs is the owning filesystem. fs is immutable.
	fs *filesystem

	// parent is this dentry's parent directory. Each dentry holds a reference
	// on its parent. If this dentry is a filesystem root, parent is nil.
	// parent is protected by filesystem.renameMu.
	parent *dentry

	// name is the name of this dentry in its parent. If this dentry is a
	// filesystem root, name is the empty string. name is protected by
	// filesystem.renameMu.
	name string

	// inoKey is used to identify this dentry's inode.
	inoKey inoKey

	// If deleted is non-zero, the file represented by this dentry has been
	// deleted. deleted is accessed using atomic memory operations.
	deleted atomicbitops.Uint32

	// cachingMu is used to synchronize concurrent dentry caching attempts on
	// this dentry.
	cachingMu sync.Mutex `state:"nosave"`

	// If cached is true, this dentry is part of filesystem.dentryCache. cached
	// is protected by cachingMu.
	cached bool

	// cacheEntry links dentry into filesystem.dentryCache.dentries. It is
	// protected by filesystem.dentryCache.mu.
	cacheEntry dentryListElem

	// syncableListEntry links dentry into filesystem.syncableDentries. It is
	// protected by filesystem.syncMu.
	syncableListEntry dentryListElem

	// opMu synchronizes operations on this dentry. Operations that mutate
	// the dentry tree must hold this lock for writing. Operations that
	// only read the tree must hold for reading.
	opMu sync.RWMutex `state:"nosave"`

	// childrenMu protects the cached children data for this dentry.
	childrenMu sync.Mutex `state:"nosave"`

	// If this dentry represents a directory, children contains:
	//
	//	- Mappings of child filenames to dentries representing those children.
	//
	//	- Mappings of child filenames that are known not to exist to nil
	//		dentries (only if InteropModeShared is not in effect and the directory
	//		is not synthetic).
	//
	// +checklocks:childrenMu
	children map[string]*dentry

	// If this dentry represents a directory, negativeChildrenCache caches
	// the names of negative children.
	//
	// +checklocks:childrenMu
	negativeChildrenCache stringFixedCache
	// If this dentry represents a directory, negativeChildren is the number
	// of negative children cached in dentry.children.
	//
	// +checklocks:childrenMu
	negativeChildren int

	// If this dentry represents a directory, syntheticChildren is the number
	// of child dentries for which dentry.isSynthetic() == true.
	//
	// +checklocks:childrenMu
	syntheticChildren int

	// If this dentry represents a directory,
	// dentry.cachedMetadataAuthoritative() == true, and dirents is not
	// nil, then dirents is a cache of all entries in the directory, in the
	// order they were returned by the server. childrenSet just stores the
	// `Name` field of all dirents in a set for fast query. dirents and
	// childrenSet share the same lifecycle.
	//
	// +checklocks:childrenMu
	dirents []vfs.Dirent
	// +checklocks:childrenMu
	childrenSet map[string]struct{}

	// Cached metadata; protected by metadataMu.
	// To access:
	//   - In situations where consistency is not required (like stat), these
	//     can be accessed using atomic operations only (without locking).
	//   - Lock metadataMu and can access without atomic operations.
	// To mutate:
	//   - Lock metadataMu and use atomic operations to update because we might
	//     have atomic readers that don't hold the lock.
	metadataMu sync.Mutex          `state:"nosave"`
	ino        uint64              // immutable
	mode       atomicbitops.Uint32 // type is immutable, perms are mutable
	uid        atomicbitops.Uint32 // auth.KUID, but stored as raw uint32 for sync/atomic
	gid        atomicbitops.Uint32 // auth.KGID, but ...
	blockSize  atomicbitops.Uint32 // 0 if unknown
	// Timestamps, all nsecs from the Unix epoch.
	atime atomicbitops.Int64
	mtime atomicbitops.Int64
	ctime atomicbitops.Int64
	btime atomicbitops.Int64
	// File size, which differs from other metadata in two ways:
	//
	//	- We make a best-effort attempt to keep it up to date even if
	//		!dentry.cachedMetadataAuthoritative() for the sake of O_APPEND writes.
	//
	//	- size is protected by both metadataMu and dataMu (i.e. both must be
	//		locked to mutate it; locking either is sufficient to access it).
	size atomicbitops.Uint64
	// If this dentry does not represent a synthetic file, deleted is 0, and
	// atimeDirty/mtimeDirty are non-zero, atime/mtime may have diverged from the
	// remote file's timestamps, which should be updated when this dentry is
	// evicted.
	atimeDirty atomicbitops.Uint32
	mtimeDirty atomicbitops.Uint32

	// nlink counts the number of hard links to this dentry. It's updated and
	// accessed using atomic operations. It's not protected by metadataMu like the
	// other metadata fields.
	nlink atomicbitops.Uint32

	// mapsMu protects mappings.
	mapsMu sync.Mutex `state:"nosave"`

	// If this dentry represents a regular file, mappings tracks mappings of
	// the file into memmap.MappingSpaces. mappings is protected by mapsMu.
	mappings memmap.MappingSet

	//	- If this dentry represents a regular file or directory, readFD (if not
	//    -1) is a host FD used for reads by all regularFileFDs/directoryFDs
	//    representing this dentry.
	//
	//	- If this dentry represents a regular file, writeFD (if not -1) is a host
	//    FD used for writes by all regularFileFDs representing this dentry.
	//
	//	- If this dentry represents a regular file, mmapFD is the host FD used
	//		for memory mappings. If mmapFD is -1, no such FD is available, and the
	//		internal page cache implementation is used for memory mappings instead.
	//
	// These fields are protected by handleMu. readFD, writeFD, and mmapFD are
	// additionally written using atomic memory operations, allowing them to be
	// read (albeit racily) with atomic.LoadInt32() without locking handleMu.
	//
	// readFD and writeFD may or may not be the same file descriptor. Once either
	// transitions from closed (-1) to open, it may be mutated with handleMu
	// locked, but cannot be closed until the dentry is destroyed.
	//
	// mmapFD is always either -1 or equal to readFD; if the file has been
	// opened for writing, it is additionally either -1 or equal to writeFD.
	handleMu sync.RWMutex       `state:"nosave"`
	readFD   atomicbitops.Int32 `state:"nosave"`
	writeFD  atomicbitops.Int32 `state:"nosave"`
	mmapFD   atomicbitops.Int32 `state:"nosave"`

	// dataMu protects cache, dirty, haveTarget, and target; together with
	// metadataMu it also protects size.
	dataMu sync.RWMutex `state:"nosave"`

	// If this dentry represents a regular file that is client-cached, cache
	// maps offsets into the cached file to offsets into
	// filesystem.mfp.MemoryFile() that store the file's data. cache is
	// protected by dataMu.
	cache fsutil.FileRangeSet

	// If this dentry represents a regular file that is client-cached, dirty
	// tracks dirty segments in cache. dirty is protected by dataMu.
	dirty fsutil.DirtySet

	// pf implements platform.File for mappings of hostFD.
	pf dentryPlatformFile

	// If this dentry represents a symbolic link, InteropModeShared is not in
	// effect, and haveTarget is true, target is the symlink target. haveTarget
	// and target are protected by dataMu.
	haveTarget bool
	target     string

	// If this dentry represents a synthetic socket file, endpoint is the
	// transport endpoint bound to this file.
	endpoint transport.BoundEndpoint

	// If this dentry represents a synthetic named pipe, pipe is the pipe
	// endpoint bound to this file.
	pipe *pipe.VFSPipe

	// locks holds the file locks associated with this dentry.
	locks vfs.FileLocks

	// Inotify watches for this dentry.
	//
	// Note that inotify may behave unexpectedly in the presence of hard links,
	// because dentries corresponding to the same file have separate inotify
	// watches when they should share the same set. This is the case because it is
	// impossible for us to know for sure whether two dentries correspond to the
	// same underlying file (see the gofer filesystem section of vfs/inotify.md for
	// a more in-depth discussion on this matter).
	watches vfs.Watches

	// impl is the specific dentry implementation for non-synthetic dentries.
	// impl is immutable.
	//
	// If impl is nil, this dentry represents a synthetic file, i.e. a
	// file that does not exist on the host filesystem. As of this writing, the
	// only files that can be synthetic are sockets, pipes, and directories.
	impl any
}
   990  
// stringListElem is a list element that carries a string payload.
// NOTE(review): the embedded stringEntry presumably supplies the linked-list
// linkage and is defined elsewhere — confirm.
//
// +stateify savable
type stringListElem struct {
	// str is the string that this elem represents.
	str string
	stringEntry
}
   997  
// dentryListElem is a list element that points back at the dentry it belongs
// to. dentry embeds two of these by value (cacheEntry and syncableListEntry),
// and dentry.init sets their d fields to the owning dentry.
// NOTE(review): the embedded dentryEntry presumably supplies the linked-list
// linkage and is defined elsewhere — confirm.
//
// +stateify savable
type dentryListElem struct {
	// d is the dentry that this elem represents.
	d *dentry
	dentryEntry
}
  1004  
  1005  func (fs *filesystem) inoFromKey(key inoKey) uint64 {
  1006  	fs.inoMu.Lock()
  1007  	defer fs.inoMu.Unlock()
  1008  
  1009  	if ino, ok := fs.inoByKey[key]; ok {
  1010  		return ino
  1011  	}
  1012  	ino := fs.nextIno()
  1013  	fs.inoByKey[key] = ino
  1014  	return ino
  1015  }
  1016  
  1017  func (fs *filesystem) nextIno() uint64 {
  1018  	return fs.lastIno.Add(1)
  1019  }
  1020  
  1021  // init must be called before first use of d.
  1022  func (d *dentry) init(impl any) {
  1023  	d.pf.dentry = d
  1024  	d.cacheEntry.d = d
  1025  	d.syncableListEntry.d = d
  1026  	// Nested impl-inheritance pattern. In memory it looks like:
  1027  	// [[[ vfs.Dentry ] dentry ] dentryImpl ]
  1028  	// All 3 abstractions are allocated in one allocation. We achieve this by
  1029  	// making each outer dentry implementation hold the inner dentry by value.
  1030  	// Then the outer most dentry is allocated and we initialize fields inward.
  1031  	// Each inner dentry has a pointer to the next level of implementation.
  1032  	d.impl = impl
  1033  	d.vfsd.Init(d)
  1034  	refs.Register(d)
  1035  }
  1036  
  1037  func (d *dentry) isSynthetic() bool {
  1038  	return d.impl == nil
  1039  }
  1040  
  1041  func (d *dentry) cachedMetadataAuthoritative() bool {
  1042  	return d.fs.opts.interop != InteropModeShared || d.isSynthetic()
  1043  }
  1044  
  1045  // updateMetadataFromStatxLocked is called to update d's metadata after an update
  1046  // from the remote filesystem.
  1047  // Precondition: d.metadataMu must be locked.
  1048  // +checklocks:d.metadataMu
  1049  func (d *lisafsDentry) updateMetadataFromStatxLocked(stat *linux.Statx) {
  1050  	if stat.Mask&linux.STATX_TYPE != 0 {
  1051  		if got, want := stat.Mode&linux.FileTypeMask, d.fileType(); uint32(got) != want {
  1052  			panic(fmt.Sprintf("gofer.dentry file type changed from %#o to %#o", want, got))
  1053  		}
  1054  	}
  1055  	if stat.Mask&linux.STATX_MODE != 0 {
  1056  		d.mode.Store(uint32(stat.Mode))
  1057  	}
  1058  	if stat.Mask&linux.STATX_UID != 0 {
  1059  		d.uid.Store(dentryUID(lisafs.UID(stat.UID)))
  1060  	}
  1061  	if stat.Mask&linux.STATX_GID != 0 {
  1062  		d.gid.Store(dentryGID(lisafs.GID(stat.GID)))
  1063  	}
  1064  	if stat.Blksize != 0 {
  1065  		d.blockSize.Store(stat.Blksize)
  1066  	}
  1067  	// Don't override newer client-defined timestamps with old server-defined
  1068  	// ones.
  1069  	if stat.Mask&linux.STATX_ATIME != 0 && d.atimeDirty.Load() == 0 {
  1070  		d.atime.Store(dentryTimestamp(stat.Atime))
  1071  	}
  1072  	if stat.Mask&linux.STATX_MTIME != 0 && d.mtimeDirty.Load() == 0 {
  1073  		d.mtime.Store(dentryTimestamp(stat.Mtime))
  1074  	}
  1075  	if stat.Mask&linux.STATX_CTIME != 0 {
  1076  		d.ctime.Store(dentryTimestamp(stat.Ctime))
  1077  	}
  1078  	if stat.Mask&linux.STATX_BTIME != 0 {
  1079  		d.btime.Store(dentryTimestamp(stat.Btime))
  1080  	}
  1081  	if stat.Mask&linux.STATX_NLINK != 0 {
  1082  		d.nlink.Store(stat.Nlink)
  1083  	}
  1084  	if stat.Mask&linux.STATX_SIZE != 0 {
  1085  		d.updateSizeLocked(stat.Size)
  1086  	}
  1087  }
  1088  
  1089  // updateMetadataFromStatLocked is similar to updateMetadataFromStatxLocked,
  1090  // except that it takes a unix.Stat_t argument.
  1091  // Precondition: d.metadataMu must be locked.
  1092  // +checklocks:d.metadataMu
  1093  func (d *directfsDentry) updateMetadataFromStatLocked(stat *unix.Stat_t) error {
  1094  	if got, want := stat.Mode&unix.S_IFMT, d.fileType(); got != want {
  1095  		panic(fmt.Sprintf("direct.dentry file type changed from %#o to %#o", want, got))
  1096  	}
  1097  	d.mode.Store(stat.Mode)
  1098  	d.uid.Store(stat.Uid)
  1099  	d.gid.Store(stat.Gid)
  1100  	d.blockSize.Store(uint32(stat.Blksize))
  1101  	// Don't override newer client-defined timestamps with old host-defined
  1102  	// ones.
  1103  	if d.atimeDirty.Load() == 0 {
  1104  		d.atime.Store(dentryTimestampFromUnix(stat.Atim))
  1105  	}
  1106  	if d.mtimeDirty.Load() == 0 {
  1107  		d.mtime.Store(dentryTimestampFromUnix(stat.Mtim))
  1108  	}
  1109  	d.ctime.Store(dentryTimestampFromUnix(stat.Ctim))
  1110  	d.nlink.Store(uint32(stat.Nlink))
  1111  	d.updateSizeLocked(uint64(stat.Size))
  1112  	return nil
  1113  }
  1114  
  1115  // Preconditions: !d.isSynthetic().
  1116  // Preconditions: d.metadataMu is locked.
  1117  // +checklocks:d.metadataMu
  1118  func (d *dentry) refreshSizeLocked(ctx context.Context) error {
  1119  	d.handleMu.RLock()
  1120  
  1121  	// Can use RacyLoad() because handleMu is locked.
  1122  	if d.writeFD.RacyLoad() < 0 {
  1123  		d.handleMu.RUnlock()
  1124  		// Use a suitable FD if we don't have a writable host FD.
  1125  		return d.updateMetadataLocked(ctx, noHandle)
  1126  	}
  1127  
  1128  	// Using statx(2) with a minimal mask is faster than fstat(2).
  1129  	var stat unix.Statx_t
  1130  	// Can use RacyLoad() because handleMu is locked.
  1131  	err := unix.Statx(int(d.writeFD.RacyLoad()), "", unix.AT_EMPTY_PATH, unix.STATX_SIZE, &stat)
  1132  	d.handleMu.RUnlock() // must be released before updateSizeLocked()
  1133  	if err != nil {
  1134  		return err
  1135  	}
  1136  	d.updateSizeLocked(stat.Size)
  1137  	return nil
  1138  }
  1139  
// updateMetadata refreshes d's cached metadata by calling
// updateMetadataLocked with noHandle while holding d.metadataMu. It returns
// whatever error updateMetadataLocked returns.
//
// Preconditions: !d.isSynthetic().
func (d *dentry) updateMetadata(ctx context.Context) error {
	// d.metadataMu must be locked *before* we stat so that we do not end up
	// updating stale attributes in d.updateMetadataFromStatLocked().
	d.metadataMu.Lock()
	defer d.metadataMu.Unlock()
	return d.updateMetadataLocked(ctx, noHandle)
}
  1148  
  1149  func (d *dentry) fileType() uint32 {
  1150  	return d.mode.Load() & linux.S_IFMT
  1151  }
  1152  
  1153  func (d *dentry) statTo(stat *linux.Statx) {
  1154  	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_INO | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME
  1155  	stat.Blksize = d.blockSize.Load()
  1156  	stat.Nlink = d.nlink.Load()
  1157  	if stat.Nlink == 0 {
  1158  		// The remote filesystem doesn't support link count; just make
  1159  		// something up. This is consistent with Linux, where
  1160  		// fs/inode.c:inode_init_always() initializes link count to 1, and
  1161  		// fs/9p/vfs_inode_dotl.c:v9fs_stat2inode_dotl() doesn't touch it if
  1162  		// it's not provided by the remote filesystem.
  1163  		stat.Nlink = 1
  1164  	}
  1165  	stat.UID = d.uid.Load()
  1166  	stat.GID = d.gid.Load()
  1167  	stat.Mode = uint16(d.mode.Load())
  1168  	stat.Ino = uint64(d.ino)
  1169  	stat.Size = d.size.Load()
  1170  	// This is consistent with regularFileFD.Seek(), which treats regular files
  1171  	// as having no holes.
  1172  	stat.Blocks = (stat.Size + 511) / 512
  1173  	stat.Atime = linux.NsecToStatxTimestamp(d.atime.Load())
  1174  	stat.Btime = linux.NsecToStatxTimestamp(d.btime.Load())
  1175  	stat.Ctime = linux.NsecToStatxTimestamp(d.ctime.Load())
  1176  	stat.Mtime = linux.NsecToStatxTimestamp(d.mtime.Load())
  1177  	stat.DevMajor = linux.UNNAMED_MAJOR
  1178  	stat.DevMinor = d.fs.devMinor
  1179  }
  1180  
  1181  // Precondition: fs.renameMu is locked.
  1182  func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, opts *vfs.SetStatOptions, mnt *vfs.Mount) error {
  1183  	stat := &opts.Stat
  1184  	if stat.Mask == 0 {
  1185  		return nil
  1186  	}
  1187  	if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 {
  1188  		return linuxerr.EPERM
  1189  	}
  1190  	mode := linux.FileMode(d.mode.Load())
  1191  	if err := vfs.CheckSetStat(ctx, creds, opts, mode, auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load())); err != nil {
  1192  		return err
  1193  	}
  1194  	if err := mnt.CheckBeginWrite(); err != nil {
  1195  		return err
  1196  	}
  1197  	defer mnt.EndWrite()
  1198  
  1199  	if stat.Mask&linux.STATX_SIZE != 0 {
  1200  		// Reject attempts to truncate files other than regular files, since
  1201  		// filesystem implementations may return the wrong errno.
  1202  		switch mode.FileType() {
  1203  		case linux.S_IFREG:
  1204  			// ok
  1205  		case linux.S_IFDIR:
  1206  			return linuxerr.EISDIR
  1207  		default:
  1208  			return linuxerr.EINVAL
  1209  		}
  1210  	}
  1211  
  1212  	var now int64
  1213  	if d.cachedMetadataAuthoritative() {
  1214  		// Truncate updates mtime.
  1215  		if stat.Mask&(linux.STATX_SIZE|linux.STATX_MTIME) == linux.STATX_SIZE {
  1216  			stat.Mask |= linux.STATX_MTIME
  1217  			stat.Mtime = linux.StatxTimestamp{
  1218  				Nsec: linux.UTIME_NOW,
  1219  			}
  1220  		}
  1221  
  1222  		// Use client clocks for timestamps.
  1223  		now = d.fs.clock.Now().Nanoseconds()
  1224  		if stat.Mask&linux.STATX_ATIME != 0 && stat.Atime.Nsec == linux.UTIME_NOW {
  1225  			stat.Atime = linux.NsecToStatxTimestamp(now)
  1226  		}
  1227  		if stat.Mask&linux.STATX_MTIME != 0 && stat.Mtime.Nsec == linux.UTIME_NOW {
  1228  			stat.Mtime = linux.NsecToStatxTimestamp(now)
  1229  		}
  1230  	}
  1231  
  1232  	d.metadataMu.Lock()
  1233  	defer d.metadataMu.Unlock()
  1234  
  1235  	// As with Linux, if the UID, GID, or file size is changing, we have to
  1236  	// clear permission bits. Note that when set, clearSGID may cause
  1237  	// permissions to be updated.
  1238  	clearSGID := (stat.Mask&linux.STATX_UID != 0 && stat.UID != d.uid.Load()) ||
  1239  		(stat.Mask&linux.STATX_GID != 0 && stat.GID != d.gid.Load()) ||
  1240  		stat.Mask&linux.STATX_SIZE != 0
  1241  	if clearSGID {
  1242  		if stat.Mask&linux.STATX_MODE != 0 {
  1243  			stat.Mode = uint16(vfs.ClearSUIDAndSGID(uint32(stat.Mode)))
  1244  		} else {
  1245  			oldMode := d.mode.Load()
  1246  			if updatedMode := vfs.ClearSUIDAndSGID(oldMode); updatedMode != oldMode {
  1247  				stat.Mode = uint16(updatedMode)
  1248  				stat.Mask |= linux.STATX_MODE
  1249  			}
  1250  		}
  1251  	}
  1252  
  1253  	// failureMask indicates which attributes could not be set on the remote
  1254  	// filesystem. p9 returns an error if any of the attributes could not be set
  1255  	// but that leads to inconsistency as the server could have set a few
  1256  	// attributes successfully but a later failure will cause the successful ones
  1257  	// to not be updated in the dentry cache.
  1258  	var failureMask uint32
  1259  	var failureErr error
  1260  	if !d.isSynthetic() {
  1261  		if stat.Mask != 0 {
  1262  			if err := d.prepareSetStat(ctx, stat); err != nil {
  1263  				return err
  1264  			}
  1265  			d.handleMu.RLock()
  1266  			if stat.Mask&linux.STATX_SIZE != 0 {
  1267  				// d.dataMu must be held around the update to both the remote
  1268  				// file's size and d.size to serialize with writeback (which
  1269  				// might otherwise write data back up to the old d.size after
  1270  				// the remote file has been truncated).
  1271  				d.dataMu.Lock()
  1272  			}
  1273  			var err error
  1274  			failureMask, failureErr, err = d.setStatLocked(ctx, stat)
  1275  			d.handleMu.RUnlock()
  1276  			if err != nil {
  1277  				if stat.Mask&linux.STATX_SIZE != 0 {
  1278  					d.dataMu.Unlock() // +checklocksforce: locked conditionally above
  1279  				}
  1280  				return err
  1281  			}
  1282  			if stat.Mask&linux.STATX_SIZE != 0 {
  1283  				if failureMask&linux.STATX_SIZE == 0 {
  1284  					// d.size should be kept up to date, and privatized
  1285  					// copy-on-write mappings of truncated pages need to be
  1286  					// invalidated, even if InteropModeShared is in effect.
  1287  					d.updateSizeAndUnlockDataMuLocked(stat.Size) // +checklocksforce: locked conditionally above
  1288  				} else {
  1289  					d.dataMu.Unlock() // +checklocksforce: locked conditionally above
  1290  				}
  1291  			}
  1292  		}
  1293  		if d.fs.opts.interop == InteropModeShared {
  1294  			// There's no point to updating d's metadata in this case since
  1295  			// it'll be overwritten by revalidation before the next time it's
  1296  			// used anyway. (InteropModeShared inhibits client caching of
  1297  			// regular file data, so there's no cache to truncate either.)
  1298  			return nil
  1299  		}
  1300  	}
  1301  	if stat.Mask&linux.STATX_MODE != 0 && failureMask&linux.STATX_MODE == 0 {
  1302  		d.mode.Store(d.fileType() | uint32(stat.Mode))
  1303  	}
  1304  	if stat.Mask&linux.STATX_UID != 0 && failureMask&linux.STATX_UID == 0 {
  1305  		d.uid.Store(stat.UID)
  1306  	}
  1307  	if stat.Mask&linux.STATX_GID != 0 && failureMask&linux.STATX_GID == 0 {
  1308  		d.gid.Store(stat.GID)
  1309  	}
  1310  	// Note that stat.Atime.Nsec and stat.Mtime.Nsec can't be UTIME_NOW because
  1311  	// if d.cachedMetadataAuthoritative() then we converted stat.Atime and
  1312  	// stat.Mtime to client-local timestamps above, and if
  1313  	// !d.cachedMetadataAuthoritative() then we returned after calling
  1314  	// d.file.setAttr(). For the same reason, now must have been initialized.
  1315  	if stat.Mask&linux.STATX_ATIME != 0 && failureMask&linux.STATX_ATIME == 0 {
  1316  		d.atime.Store(stat.Atime.ToNsec())
  1317  		d.atimeDirty.Store(0)
  1318  	}
  1319  	if stat.Mask&linux.STATX_MTIME != 0 && failureMask&linux.STATX_MTIME == 0 {
  1320  		d.mtime.Store(stat.Mtime.ToNsec())
  1321  		d.mtimeDirty.Store(0)
  1322  	}
  1323  	d.ctime.Store(now)
  1324  	if failureMask != 0 {
  1325  		// Setting some attribute failed on the remote filesystem.
  1326  		return failureErr
  1327  	}
  1328  	return nil
  1329  }
  1330  
  1331  // doAllocate performs an allocate operation on d. Note that d.metadataMu will
  1332  // be held when allocate is called.
  1333  func (d *dentry) doAllocate(ctx context.Context, offset, length uint64, allocate func() error) error {
  1334  	d.metadataMu.Lock()
  1335  	defer d.metadataMu.Unlock()
  1336  
  1337  	// Allocating a smaller size is a noop.
  1338  	size := offset + length
  1339  	if d.cachedMetadataAuthoritative() && size <= d.size.RacyLoad() {
  1340  		return nil
  1341  	}
  1342  
  1343  	err := allocate()
  1344  	if err != nil {
  1345  		return err
  1346  	}
  1347  	d.updateSizeLocked(size)
  1348  	if d.cachedMetadataAuthoritative() {
  1349  		d.touchCMtimeLocked()
  1350  	}
  1351  	return nil
  1352  }
  1353  
// updateSizeLocked sets d's cached size to newSize. It acquires d.dataMu and
// hands it to updateSizeAndUnlockDataMuLocked, which releases it.
//
// Preconditions: d.metadataMu must be locked.
func (d *dentry) updateSizeLocked(newSize uint64) {
	d.dataMu.Lock()
	d.updateSizeAndUnlockDataMuLocked(newSize)
}
  1359  
  1360  // Preconditions: d.metadataMu and d.dataMu must be locked.
  1361  //
  1362  // Postconditions: d.dataMu is unlocked.
  1363  // +checklocksrelease:d.dataMu
  1364  func (d *dentry) updateSizeAndUnlockDataMuLocked(newSize uint64) {
  1365  	oldSize := d.size.RacyLoad()
  1366  	d.size.Store(newSize)
  1367  	// d.dataMu must be unlocked to lock d.mapsMu and invalidate mappings
  1368  	// below. This allows concurrent calls to Read/Translate/etc. These
  1369  	// functions synchronize with truncation by refusing to use cache
  1370  	// contents beyond the new d.size. (We are still holding d.metadataMu,
  1371  	// so we can't race with Write or another truncate.)
  1372  	d.dataMu.Unlock()
  1373  	if newSize < oldSize {
  1374  		oldpgend, _ := hostarch.PageRoundUp(oldSize)
  1375  		newpgend, _ := hostarch.PageRoundUp(newSize)
  1376  		if oldpgend != newpgend {
  1377  			d.mapsMu.Lock()
  1378  			d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
  1379  				// Compare Linux's mm/truncate.c:truncate_setsize() =>
  1380  				// truncate_pagecache() =>
  1381  				// mm/memory.c:unmap_mapping_range(evencows=1).
  1382  				InvalidatePrivate: true,
  1383  			})
  1384  			d.mapsMu.Unlock()
  1385  		}
  1386  		// We are now guaranteed that there are no translations of
  1387  		// truncated pages, and can remove them from the cache. Since
  1388  		// truncated pages have been removed from the remote file, they
  1389  		// should be dropped without being written back.
  1390  		d.dataMu.Lock()
  1391  		d.cache.Truncate(newSize, d.fs.mfp.MemoryFile())
  1392  		d.dirty.KeepClean(memmap.MappableRange{newSize, oldpgend})
  1393  		d.dataMu.Unlock()
  1394  	}
  1395  }
  1396  
  1397  func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
  1398  	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(d.mode.Load()), auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load()))
  1399  }
  1400  
  1401  func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
  1402  	// Deny access to the "security" and "system" namespaces since applications
  1403  	// may expect these to affect kernel behavior in unimplemented ways
  1404  	// (b/148380782). Allow all other extended attributes to be passed through
  1405  	// to the remote filesystem. This is inconsistent with Linux's 9p client,
  1406  	// but consistent with other filesystems (e.g. FUSE).
  1407  	//
  1408  	// NOTE(b/202533394): Also disallow "trusted" namespace for now. This is
  1409  	// consistent with the VFS1 gofer client.
  1410  	if strings.HasPrefix(name, linux.XATTR_SECURITY_PREFIX) || strings.HasPrefix(name, linux.XATTR_SYSTEM_PREFIX) || strings.HasPrefix(name, linux.XATTR_TRUSTED_PREFIX) {
  1411  		return linuxerr.EOPNOTSUPP
  1412  	}
  1413  	mode := linux.FileMode(d.mode.Load())
  1414  	kuid := auth.KUID(d.uid.Load())
  1415  	kgid := auth.KGID(d.gid.Load())
  1416  	if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
  1417  		return err
  1418  	}
  1419  	return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
  1420  }
  1421  
  1422  func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error {
  1423  	return vfs.CheckDeleteSticky(
  1424  		creds,
  1425  		linux.FileMode(d.mode.Load()),
  1426  		auth.KUID(d.uid.Load()),
  1427  		auth.KUID(child.uid.Load()),
  1428  		auth.KGID(child.gid.Load()),
  1429  	)
  1430  }
  1431  
  1432  func dentryUID(uid lisafs.UID) uint32 {
  1433  	if !uid.Ok() {
  1434  		return uint32(auth.OverflowUID)
  1435  	}
  1436  	return uint32(uid)
  1437  }
  1438  
  1439  func dentryGID(gid lisafs.GID) uint32 {
  1440  	if !gid.Ok() {
  1441  		return uint32(auth.OverflowGID)
  1442  	}
  1443  	return uint32(gid)
  1444  }
  1445  
  1446  // IncRef implements vfs.DentryImpl.IncRef.
  1447  func (d *dentry) IncRef() {
  1448  	// d.refs may be 0 if d.fs.renameMu is locked, which serializes against
  1449  	// d.checkCachingLocked().
  1450  	r := d.refs.Add(1)
  1451  	if d.LogRefs() {
  1452  		refs.LogIncRef(d, r)
  1453  	}
  1454  }
  1455  
  1456  // TryIncRef implements vfs.DentryImpl.TryIncRef.
  1457  func (d *dentry) TryIncRef() bool {
  1458  	for {
  1459  		r := d.refs.Load()
  1460  		if r <= 0 {
  1461  			return false
  1462  		}
  1463  		if d.refs.CompareAndSwap(r, r+1) {
  1464  			if d.LogRefs() {
  1465  				refs.LogTryIncRef(d, r+1)
  1466  			}
  1467  			return true
  1468  		}
  1469  	}
  1470  }
  1471  
  1472  // DecRef implements vfs.DentryImpl.DecRef.
  1473  func (d *dentry) DecRef(ctx context.Context) {
  1474  	if d.decRefNoCaching() == 0 {
  1475  		d.checkCachingLocked(ctx, false /* renameMuWriteLocked */)
  1476  	}
  1477  }
  1478  
  1479  // decRefNoCaching decrements d's reference count without calling
  1480  // d.checkCachingLocked, even if d's reference count reaches 0; callers are
  1481  // responsible for ensuring that d.checkCachingLocked will be called later.
  1482  func (d *dentry) decRefNoCaching() int64 {
  1483  	r := d.refs.Add(-1)
  1484  	if d.LogRefs() {
  1485  		refs.LogDecRef(d, r)
  1486  	}
  1487  	if r < 0 {
  1488  		panic("gofer.dentry.decRefNoCaching() called without holding a reference")
  1489  	}
  1490  	return r
  1491  }
  1492  
  1493  // RefType implements refs.CheckedObject.Type.
  1494  func (d *dentry) RefType() string {
  1495  	return "gofer.dentry"
  1496  }
  1497  
  1498  // LeakMessage implements refs.CheckedObject.LeakMessage.
  1499  func (d *dentry) LeakMessage() string {
  1500  	return fmt.Sprintf("[gofer.dentry %p] reference count of %d instead of -1", d, d.refs.Load())
  1501  }
  1502  
  1503  // LogRefs implements refs.CheckedObject.LogRefs.
  1504  //
  1505  // This should only be set to true for debugging purposes, as it can generate an
  1506  // extremely large amount of output and drastically degrade performance.
  1507  func (d *dentry) LogRefs() bool {
  1508  	return false
  1509  }
  1510  
  1511  // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
  1512  func (d *dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
  1513  	if d.isDir() {
  1514  		events |= linux.IN_ISDIR
  1515  	}
  1516  
  1517  	d.fs.renameMu.RLock()
  1518  	// The ordering below is important, Linux always notifies the parent first.
  1519  	if d.parent != nil {
  1520  		d.parent.watches.Notify(ctx, d.name, events, cookie, et, d.isDeleted())
  1521  	}
  1522  	d.watches.Notify(ctx, "", events, cookie, et, d.isDeleted())
  1523  	d.fs.renameMu.RUnlock()
  1524  }
  1525  
  1526  // Watches implements vfs.DentryImpl.Watches.
  1527  func (d *dentry) Watches() *vfs.Watches {
  1528  	return &d.watches
  1529  }
  1530  
  1531  // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
  1532  //
  1533  // If no watches are left on this dentry and it has no references, cache it.
  1534  func (d *dentry) OnZeroWatches(ctx context.Context) {
  1535  	d.checkCachingLocked(ctx, false /* renameMuWriteLocked */)
  1536  }
  1537  
// checkCachingLocked should be called after d's reference count becomes 0 or
// it becomes disowned.
//
// For performance, checkCachingLocked can also be called after d's reference
// count becomes non-zero, so that d can be removed from the LRU cache. This
// may help in reducing the size of the cache and hence reduce evictions. Note
// that this is not necessary for correctness.
//
// It may be called on a destroyed dentry. For example,
// renameMu[R]UnlockAndCheckCaching may call checkCachingLocked multiple times
// for the same dentry when the dentry is visited more than once in the same
// operation. One of the calls may destroy the dentry, so subsequent calls will
// do nothing.
//
// Depending on d's state, the outcome is one of the following, checked in this
// order:
//   - already destroyed (refs == -1): nothing is done;
//   - refs > 0: d is removed from the LRU cache;
//   - d.vfsd.IsDead(): d is removed from the cache and destroyed;
//   - d.vfsd.IsEvictable(): d is evicted;
//   - d still has watches: d is removed from the cache;
//   - the filesystem has been released: d is unlinked from its parent's
//     children map and destroyed;
//   - otherwise: d is inserted at (or moved to) the front of the LRU cache,
//     and one cached dentry is evicted if the cache is over-full.
//
// Preconditions: d.fs.renameMu must be locked for writing if
// renameMuWriteLocked is true; it may be temporarily unlocked.
func (d *dentry) checkCachingLocked(ctx context.Context, renameMuWriteLocked bool) {
	d.cachingMu.Lock()
	// Note: this local shadows the imported refs package for the rest of the
	// function.
	refs := d.refs.Load()
	if refs == -1 {
		// Dentry has already been destroyed.
		d.cachingMu.Unlock()
		return
	}
	if refs > 0 {
		// fs.dentryCache.dentries is permitted to contain dentries with non-zero
		// refs, which are skipped by fs.evictCachedDentryLocked() upon reaching
		// the end of the LRU. But it is still beneficial to remove d from the
		// cache as we are already holding d.cachingMu. Keeping a cleaner cache
		// also reduces the number of evictions (which is expensive as it acquires
		// fs.renameMu).
		d.removeFromCacheLocked()
		d.cachingMu.Unlock()
		return
	}
	// Deleted and invalidated dentries with zero references are no longer
	// reachable by path resolution and should be dropped immediately.
	if d.vfsd.IsDead() {
		d.removeFromCacheLocked()
		d.cachingMu.Unlock()
		if !renameMuWriteLocked {
			// Need to lock d.fs.renameMu for writing as needed by d.destroyLocked().
			d.fs.renameMu.Lock()
			defer d.fs.renameMu.Unlock()
			// Now that renameMu is locked for writing, no more refs can be taken on
			// d because path resolution requires renameMu for reading at least.
			if d.refs.Load() != 0 {
				// Destroy d only if its ref is still 0. If not, either someone took a
				// ref on it or it got destroyed before fs.renameMu could be acquired.
				return
			}
		}
		if d.isDeleted() {
			d.watches.HandleDeletion(ctx)
		}
		d.destroyLocked(ctx) // +checklocksforce: renameMu must be acquired at this point.
		return
	}
	if d.vfsd.IsEvictable() {
		d.cachingMu.Unlock()
		// Attempt to evict.
		if renameMuWriteLocked {
			d.evictLocked(ctx) // +checklocksforce: renameMu is locked in this case.
			return
		}
		// d.evict() acquires d.fs.renameMu for writing itself.
		d.evict(ctx)
		return
	}
	// If d still has inotify watches and it is not deleted or invalidated, it
	// can't be evicted. Otherwise, we will lose its watches, even if a new
	// dentry is created for the same file in the future. Note that the size of
	// d.watches cannot concurrently transition from zero to non-zero, because
	// adding a watch requires holding a reference on d.
	if d.watches.Size() > 0 {
		// As in the refs > 0 case, removing d is beneficial.
		d.removeFromCacheLocked()
		d.cachingMu.Unlock()
		return
	}

	// The filesystem has been released: destroy d rather than caching it.
	if d.fs.released.Load() != 0 {
		d.cachingMu.Unlock()
		if !renameMuWriteLocked {
			// Need to lock d.fs.renameMu to access d.parent. Lock it for writing as
			// needed by d.destroyLocked() later.
			d.fs.renameMu.Lock()
			defer d.fs.renameMu.Unlock()
		}
		if d.parent != nil {
			d.parent.childrenMu.Lock()
			delete(d.parent.children, d.name)
			d.parent.childrenMu.Unlock()
		}
		d.destroyLocked(ctx) // +checklocksforce: see above.
		return
	}

	d.fs.dentryCache.mu.Lock()
	// If d is already cached, just move it to the front of the LRU.
	if d.cached {
		d.fs.dentryCache.dentries.Remove(&d.cacheEntry)
		d.fs.dentryCache.dentries.PushFront(&d.cacheEntry)
		d.fs.dentryCache.mu.Unlock()
		d.cachingMu.Unlock()
		return
	}
	// Cache the dentry, then evict the least recently used cached dentry if
	// the cache becomes over-full.
	d.fs.dentryCache.dentries.PushFront(&d.cacheEntry)
	d.fs.dentryCache.dentriesLen++
	d.cached = true
	shouldEvict := d.fs.dentryCache.dentriesLen > d.fs.dentryCache.maxCachedDentries
	d.fs.dentryCache.mu.Unlock()
	d.cachingMu.Unlock()

	// The eviction itself happens after both dentryCache.mu and d.cachingMu
	// have been released.
	if shouldEvict {
		if !renameMuWriteLocked {
			// Need to lock d.fs.renameMu for writing as needed by
			// d.evictCachedDentryLocked().
			d.fs.renameMu.Lock()
			defer d.fs.renameMu.Unlock()
		}
		d.fs.evictCachedDentryLocked(ctx) // +checklocksforce: see above.
	}
}
  1663  
  1664  // Preconditions: d.cachingMu must be locked.
  1665  func (d *dentry) removeFromCacheLocked() {
  1666  	if d.cached {
  1667  		d.fs.dentryCache.mu.Lock()
  1668  		d.fs.dentryCache.dentries.Remove(&d.cacheEntry)
  1669  		d.fs.dentryCache.dentriesLen--
  1670  		d.fs.dentryCache.mu.Unlock()
  1671  		d.cached = false
  1672  	}
  1673  }
  1674  
  1675  // Precondition: fs.renameMu must be locked for writing; it may be temporarily
  1676  // unlocked.
  1677  // +checklocks:fs.renameMu
  1678  func (fs *filesystem) evictAllCachedDentriesLocked(ctx context.Context) {
  1679  	for fs.dentryCache.dentriesLen != 0 {
  1680  		fs.evictCachedDentryLocked(ctx)
  1681  	}
  1682  }
  1683  
  1684  // Preconditions:
  1685  //   - fs.renameMu must be locked for writing; it may be temporarily unlocked.
  1686  //
  1687  // +checklocks:fs.renameMu
  1688  func (fs *filesystem) evictCachedDentryLocked(ctx context.Context) {
  1689  	fs.dentryCache.mu.Lock()
  1690  	victim := fs.dentryCache.dentries.Back()
  1691  	fs.dentryCache.mu.Unlock()
  1692  	if victim == nil {
  1693  		// fs.dentryCache.dentries may have become empty between when it was
  1694  		// checked and when we locked fs.dentryCache.mu.
  1695  		return
  1696  	}
  1697  
  1698  	if victim.d.fs == fs {
  1699  		victim.d.evictLocked(ctx) // +checklocksforce: owned as precondition, victim.fs == fs
  1700  		return
  1701  	}
  1702  
  1703  	// The dentry cache is shared between all gofer filesystems and the victim is
  1704  	// from another filesystem. Have that filesystem do the work. We unlock
  1705  	// fs.renameMu to prevent deadlock: two filesystems could otherwise wait on
  1706  	// each others' renameMu.
  1707  	fs.renameMu.Unlock()
  1708  	defer fs.renameMu.Lock()
  1709  	victim.d.evict(ctx)
  1710  }
  1711  
// evict acquires d.fs.renameMu for writing, evicts d via d.evictLocked, and
// releases the lock again before returning.
//
// Preconditions:
//   - d.fs.renameMu must not be locked for writing.
func (d *dentry) evict(ctx context.Context) {
	d.fs.renameMu.Lock()
	defer d.fs.renameMu.Unlock()
	d.evictLocked(ctx)
}
  1719  
// evictLocked removes d from the dentry cache and, provided d has neither
// references nor inotify watches, invalidates it, unlinks it from its parent's
// children map and destroys it.
//
// Preconditions:
//   - d.fs.renameMu must be locked for writing; it may be temporarily unlocked.
//
// +checklocks:d.fs.renameMu
func (d *dentry) evictLocked(ctx context.Context) {
	d.cachingMu.Lock()
	d.removeFromCacheLocked()
	// d.refs or d.watches.Size() may have become non-zero from an earlier path
	// resolution since it was inserted into fs.dentryCache.dentries.
	if d.refs.Load() != 0 || d.watches.Size() != 0 {
		d.cachingMu.Unlock()
		return
	}
	if d.parent != nil {
		d.parent.opMu.Lock()
		if !d.vfsd.IsDead() {
			// Note that d can't be a mount point (in any mount namespace), since VFS
			// holds references on mount points.
			d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, &d.vfsd)

			d.parent.childrenMu.Lock()
			delete(d.parent.children, d.name)
			d.parent.childrenMu.Unlock()

			// We're only deleting the dentry, not the file it
			// represents, so we don't need to update
			// victim parent.dirents etc.
		}
		d.parent.opMu.Unlock()
	}
	// Safe to unlock cachingMu now that d.vfsd.IsDead(). Henceforth any
	// concurrent caching attempts on d will attempt to destroy it and so will
	// try to acquire fs.renameMu (which we have already acquired). Hence,
	// fs.renameMu will synchronize the destroy attempts.
	d.cachingMu.Unlock()
	d.destroyLocked(ctx) // +checklocksforce: owned as precondition.
}
  1757  
  1758  // destroyDisconnected destroys an uncached, unparented dentry. There are no
  1759  // locking preconditions.
  1760  func (d *dentry) destroyDisconnected(ctx context.Context) {
  1761  	mf := d.fs.mfp.MemoryFile()
  1762  
  1763  	d.handleMu.Lock()
  1764  	d.dataMu.Lock()
  1765  
  1766  	if d.isWriteHandleOk() {
  1767  		// Write dirty pages back to the remote filesystem.
  1768  		h := d.writeHandle()
  1769  		if err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), mf, h.writeFromBlocksAt); err != nil {
  1770  			log.Warningf("gofer.dentry.destroyLocked: failed to write dirty data back: %v", err)
  1771  		}
  1772  	}
  1773  	// Discard cached data.
  1774  	if !d.cache.IsEmpty() {
  1775  		mf.MarkAllUnevictable(d)
  1776  		d.cache.DropAll(mf)
  1777  		d.dirty.RemoveAll()
  1778  	}
  1779  	d.dataMu.Unlock()
  1780  
  1781  	// Close any resources held by the implementation.
  1782  	d.destroyImpl(ctx)
  1783  
  1784  	// Can use RacyLoad() because handleMu is locked.
  1785  	if d.readFD.RacyLoad() >= 0 {
  1786  		_ = unix.Close(int(d.readFD.RacyLoad()))
  1787  	}
  1788  	if d.writeFD.RacyLoad() >= 0 && d.readFD.RacyLoad() != d.writeFD.RacyLoad() {
  1789  		_ = unix.Close(int(d.writeFD.RacyLoad()))
  1790  	}
  1791  	d.readFD = atomicbitops.FromInt32(-1)
  1792  	d.writeFD = atomicbitops.FromInt32(-1)
  1793  	d.mmapFD = atomicbitops.FromInt32(-1)
  1794  	d.handleMu.Unlock()
  1795  
  1796  	if !d.isSynthetic() {
  1797  		// Note that it's possible that d.atimeDirty or d.mtimeDirty are true,
  1798  		// i.e. client and server timestamps may differ (because e.g. a client
  1799  		// write was serviced by the page cache, and only written back to the
  1800  		// remote file later). Ideally, we'd write client timestamps back to
  1801  		// the remote filesystem so that timestamps for a new dentry
  1802  		// instantiated for the same file would remain coherent. Unfortunately,
  1803  		// this turns out to be too expensive in many cases, so for now we
  1804  		// don't do this.
  1805  
  1806  		// Remove d from the set of syncable dentries.
  1807  		d.fs.syncMu.Lock()
  1808  		d.fs.syncableDentries.Remove(&d.syncableListEntry)
  1809  		d.fs.syncMu.Unlock()
  1810  	}
  1811  
  1812  	// Drop references and stop tracking this child.
  1813  	d.refs.Store(-1)
  1814  	refs.Unregister(d)
  1815  }
  1816  
  1817  // destroyLocked destroys the dentry.
  1818  //
  1819  // Preconditions:
  1820  //   - d.fs.renameMu must be locked for writing; it may be temporarily unlocked.
  1821  //   - d.refs == 0.
  1822  //   - d.parent.children[d.name] != d, i.e. d is not reachable by path traversal
  1823  //     from its former parent dentry.
  1824  //
  1825  // +checklocks:d.fs.renameMu
  1826  func (d *dentry) destroyLocked(ctx context.Context) {
  1827  	switch d.refs.Load() {
  1828  	case 0:
  1829  		// Mark the dentry destroyed.
  1830  		d.refs.Store(-1)
  1831  	case -1:
  1832  		panic("dentry.destroyLocked() called on already destroyed dentry")
  1833  	default:
  1834  		panic("dentry.destroyLocked() called with references on the dentry")
  1835  	}
  1836  
  1837  	// Allow the following to proceed without renameMu locked to improve
  1838  	// scalability.
  1839  	d.fs.renameMu.Unlock()
  1840  
  1841  	// No locks need to be held during destoryDisconnected.
  1842  	d.destroyDisconnected(ctx)
  1843  
  1844  	d.fs.renameMu.Lock()
  1845  
  1846  	// Drop the reference held by d on its parent without recursively locking
  1847  	// d.fs.renameMu.
  1848  	if d.parent != nil && d.parent.decRefNoCaching() == 0 {
  1849  		d.parent.checkCachingLocked(ctx, true /* renameMuWriteLocked */)
  1850  	}
  1851  }
  1852  
  1853  func (d *dentry) isDeleted() bool {
  1854  	return d.deleted.Load() != 0
  1855  }
  1856  
  1857  func (d *dentry) setDeleted() {
  1858  	d.deleted.Store(1)
  1859  }
  1860  
  1861  func (d *dentry) listXattr(ctx context.Context, size uint64) ([]string, error) {
  1862  	if d.isSynthetic() {
  1863  		return nil, nil
  1864  	}
  1865  
  1866  	return d.listXattrImpl(ctx, size)
  1867  }
  1868  
  1869  func (d *dentry) getXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
  1870  	if d.isSynthetic() {
  1871  		return "", linuxerr.ENODATA
  1872  	}
  1873  	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
  1874  		return "", err
  1875  	}
  1876  	return d.getXattrImpl(ctx, opts)
  1877  }
  1878  
  1879  func (d *dentry) setXattr(ctx context.Context, creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
  1880  	if d.isSynthetic() {
  1881  		return linuxerr.EPERM
  1882  	}
  1883  	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
  1884  		return err
  1885  	}
  1886  	return d.setXattrImpl(ctx, opts)
  1887  }
  1888  
  1889  func (d *dentry) removeXattr(ctx context.Context, creds *auth.Credentials, name string) error {
  1890  	if d.isSynthetic() {
  1891  		return linuxerr.EPERM
  1892  	}
  1893  	if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
  1894  		return err
  1895  	}
  1896  	return d.removeXattrImpl(ctx, name)
  1897  }
  1898  
// ensureSharedHandle makes sure that d's shared handle(s) are usable for the
// requested access: reading if read is true, writing if write is true. If the
// current handles already suffice and trunc is false, it returns without
// opening anything. Otherwise it opens a new handle through d.openHandle
// (passing trunc along), updates d.readFD, d.writeFD and d.mmapFD as needed,
// and invalidates existing memory mappings when they may refer to a stale FD.
//
// It returns any error from opening the handle or from re-pointing an
// existing host FD at the new one; on those error paths the newly opened
// handle is closed.
//
// Preconditions:
//   - !d.isSynthetic().
//   - d.isRegularFile() || d.isDir().
//   - fs.renameMu is locked.
func (d *dentry) ensureSharedHandle(ctx context.Context, read, write, trunc bool) error {
	// O_TRUNC unconditionally requires us to obtain a new handle (opened with
	// O_TRUNC).
	if !trunc {
		// Fast path: check under the read lock only.
		d.handleMu.RLock()
		canReuseCurHandle := (!read || d.isReadHandleOk()) && (!write || d.isWriteHandleOk())
		d.handleMu.RUnlock()
		if canReuseCurHandle {
			// Current handles are sufficient.
			return nil
		}
	}

	// Re-check under the write lock, since the handles may have been updated
	// between releasing the read lock and acquiring the write lock.
	d.handleMu.Lock()
	needNewHandle := (read && !d.isReadHandleOk()) || (write && !d.isWriteHandleOk()) || trunc
	if !needNewHandle {
		d.handleMu.Unlock()
		return nil
	}

	// FDs that become obsolete are collected here and closed only after
	// d.handleMu has been released. At most two are ever appended.
	var fdsToCloseArr [2]int32
	fdsToClose := fdsToCloseArr[:0]
	invalidateTranslations := false
	// Get a new handle. If this file has been opened for both reading and
	// writing, try to get a single handle that is usable for both:
	//
	//	- Writable memory mappings of a host FD require that the host FD is
	//		opened for both reading and writing.
	//
	//	- NOTE(b/141991141): Some filesystems may not ensure coherence
	//		between multiple handles for the same file.
	openReadable := d.isReadHandleOk() || read
	openWritable := d.isWriteHandleOk() || write
	h, err := d.openHandle(ctx, openReadable, openWritable, trunc)
	if linuxerr.Equals(linuxerr.EACCES, err) && (openReadable != read || openWritable != write) {
		// It may not be possible to use a single handle for both
		// reading and writing, since permissions on the file may have
		// changed to e.g. disallow reading after previously being
		// opened for reading. In this case, we have no choice but to
		// use separate handles for reading and writing.
		ctx.Debugf("gofer.dentry.ensureSharedHandle: bifurcating read/write handles for dentry %p", d)
		openReadable = read
		openWritable = write
		h, err = d.openHandle(ctx, openReadable, openWritable, trunc)
	}
	if err != nil {
		d.handleMu.Unlock()
		return err
	}

	// Update d.readFD and d.writeFD
	if h.fd >= 0 {
		if openReadable && openWritable && (d.readFD.RacyLoad() < 0 || d.writeFD.RacyLoad() < 0 || d.readFD.RacyLoad() != d.writeFD.RacyLoad()) {
			// Replace existing FDs with this one.
			if d.readFD.RacyLoad() >= 0 {
				// We already have a readable FD that may be in use by
				// concurrent callers of d.pf.FD().
				if d.fs.opts.overlayfsStaleRead {
					// If overlayfsStaleRead is in effect, then the new FD
					// may not be coherent with the existing one, so we
					// have no choice but to switch to mappings of the new
					// FD in both the application and sentry.
					if err := d.pf.hostFileMapper.RegenerateMappings(int(h.fd)); err != nil {
						d.handleMu.Unlock()
						ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to replace sentry mappings of old FD with mappings of new FD: %v", err)
						h.close(ctx)
						return err
					}
					fdsToClose = append(fdsToClose, d.readFD.RacyLoad())
					invalidateTranslations = true
					d.readFD.Store(h.fd)
				} else {
					// Otherwise, we want to avoid invalidating existing
					// memmap.Translations (which is expensive); instead, use
					// dup3 to make the old file descriptor refer to the new
					// file description, then close the new file descriptor
					// (which is no longer needed). Racing callers of d.pf.FD()
					// may use the old or new file description, but this
					// doesn't matter since they refer to the same file, and
					// any racing mappings must be read-only.
					if err := unix.Dup3(int(h.fd), int(d.readFD.RacyLoad()), unix.O_CLOEXEC); err != nil {
						// Capture the old FD for the log message before the
						// lock is released.
						oldFD := d.readFD.RacyLoad()
						d.handleMu.Unlock()
						ctx.Warningf("gofer.dentry.ensureSharedHandle: failed to dup fd %d to fd %d: %v", h.fd, oldFD, err)
						h.close(ctx)
						return err
					}
					fdsToClose = append(fdsToClose, h.fd)
					h.fd = d.readFD.RacyLoad()
				}
			} else {
				d.readFD.Store(h.fd)
			}
			// Retire a distinct, previously open write FD.
			if d.writeFD.RacyLoad() != h.fd && d.writeFD.RacyLoad() >= 0 {
				fdsToClose = append(fdsToClose, d.writeFD.RacyLoad())
			}
			d.writeFD.Store(h.fd)
			d.mmapFD.Store(h.fd)
		} else if openReadable && d.readFD.RacyLoad() < 0 {
			readHandleWasOk := d.isReadHandleOk()
			d.readFD.Store(h.fd)
			// If the file has not been opened for writing, the new FD may
			// be used for read-only memory mappings. If the file was
			// previously opened for reading (without an FD), then existing
			// translations of the file may use the internal page cache;
			// invalidate those mappings.
			if !d.isWriteHandleOk() {
				invalidateTranslations = readHandleWasOk
				d.mmapFD.Store(h.fd)
			}
		} else if openWritable && d.writeFD.RacyLoad() < 0 {
			d.writeFD.Store(h.fd)
			if d.readFD.RacyLoad() >= 0 {
				// We have an existing read-only FD, but the file has just
				// been opened for writing, so we need to start supporting
				// writable memory mappings. However, the new FD is not
				// readable, so we have no FD that can be used to create
				// writable memory mappings. Switch to using the internal
				// page cache.
				invalidateTranslations = true
				d.mmapFD.Store(-1)
			}
		} else {
			// The new FD is not useful.
			fdsToClose = append(fdsToClose, h.fd)
		}
	} else if openWritable && d.writeFD.RacyLoad() < 0 && d.mmapFD.RacyLoad() >= 0 {
		// We have an existing read-only FD, but the file has just been
		// opened for writing, so we need to start supporting writable
		// memory mappings. However, we have no writable host FD. Switch to
		// using the internal page cache.
		invalidateTranslations = true
		d.mmapFD.Store(-1)
	}

	d.updateHandles(ctx, h, openReadable, openWritable)
	d.handleMu.Unlock()

	if invalidateTranslations {
		// Invalidate application mappings that may be using an old FD; they
		// will be replaced with mappings using the new FD after future calls
		// to d.Translate(). This requires holding d.mapsMu, which precedes
		// d.handleMu in the lock order.
		d.mapsMu.Lock()
		d.mappings.InvalidateAll(memmap.InvalidateOpts{})
		d.mapsMu.Unlock()
	}
	// Close errors are ignored here; the FDs are no longer referenced by d.
	for _, fd := range fdsToClose {
		unix.Close(int(fd))
	}

	return nil
}
  2056  
// syncRemoteFile syncs d's remote file handles via d.syncRemoteFileLocked
// while holding d.handleMu for reading, and returns that call's result.
func (d *dentry) syncRemoteFile(ctx context.Context) error {
	d.handleMu.RLock()
	defer d.handleMu.RUnlock()
	return d.syncRemoteFileLocked(ctx)
}
  2062  
  2063  // Preconditions: d.handleMu must be locked.
  2064  func (d *dentry) syncRemoteFileLocked(ctx context.Context) error {
  2065  	// Prefer syncing write handles over read handles, since some remote
  2066  	// filesystem implementations may not sync changes made through write
  2067  	// handles otherwise.
  2068  	wh := d.writeHandle()
  2069  	wh.sync(ctx)
  2070  	rh := d.readHandle()
  2071  	rh.sync(ctx)
  2072  	return nil
  2073  }
  2074  
  2075  func (d *dentry) syncCachedFile(ctx context.Context, forFilesystemSync bool) error {
  2076  	d.handleMu.RLock()
  2077  	defer d.handleMu.RUnlock()
  2078  	if d.isWriteHandleOk() {
  2079  		// Write back dirty pages to the remote file.
  2080  		d.dataMu.Lock()
  2081  		h := d.writeHandle()
  2082  		err := fsutil.SyncDirtyAll(ctx, &d.cache, &d.dirty, d.size.Load(), d.fs.mfp.MemoryFile(), h.writeFromBlocksAt)
  2083  		d.dataMu.Unlock()
  2084  		if err != nil {
  2085  			return err
  2086  		}
  2087  	}
  2088  	if err := d.syncRemoteFileLocked(ctx); err != nil {
  2089  		if !forFilesystemSync {
  2090  			return err
  2091  		}
  2092  		// Only return err if we can reasonably have expected sync to succeed
  2093  		// (d is a regular file and was opened for writing).
  2094  		if d.isRegularFile() && d.isWriteHandleOk() {
  2095  			return err
  2096  		}
  2097  		ctx.Debugf("gofer.dentry.syncCachedFile: syncing non-writable or non-regular-file dentry failed: %v", err)
  2098  	}
  2099  	return nil
  2100  }
  2101  
  2102  // incLinks increments link count.
  2103  func (d *dentry) incLinks() {
  2104  	if d.nlink.Load() == 0 {
  2105  		// The remote filesystem doesn't support link count.
  2106  		return
  2107  	}
  2108  	d.nlink.Add(1)
  2109  }
  2110  
  2111  // decLinks decrements link count.
  2112  func (d *dentry) decLinks() {
  2113  	if d.nlink.Load() == 0 {
  2114  		// The remote filesystem doesn't support link count.
  2115  		return
  2116  	}
  2117  	d.nlink.Add(^uint32(0))
  2118  }
  2119  
// fileDescription is embedded by gofer implementations of
// vfs.FileDescriptionImpl. It bundles the VFS file description together with
// the default and lock-related method sets that the gofer FD types share.
//
// +stateify savable
type fileDescription struct {
	// vfsfd is the VFS-level file description; it is used to reach the
	// mount, filesystem and dentry backing this FD.
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.LockFD

	// lockLogging makes the "handled internally" lock message be logged at
	// most once per file description. It is not saved.
	lockLogging sync.Once `state:"nosave"`
}
  2131  
  2132  func (fd *fileDescription) filesystem() *filesystem {
  2133  	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
  2134  }
  2135  
  2136  func (fd *fileDescription) dentry() *dentry {
  2137  	return fd.vfsfd.Dentry().Impl().(*dentry)
  2138  }
  2139  
  2140  // Stat implements vfs.FileDescriptionImpl.Stat.
  2141  func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) {
  2142  	d := fd.dentry()
  2143  	const validMask = uint32(linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME | linux.STATX_SIZE | linux.STATX_BLOCKS | linux.STATX_BTIME)
  2144  	if !d.cachedMetadataAuthoritative() && opts.Mask&validMask != 0 && opts.Sync != linux.AT_STATX_DONT_SYNC {
  2145  		// Use specialFileFD.handle.fileLisa for the Stat if available, for the
  2146  		// same reason that we try to use open FD in updateMetadataLocked().
  2147  		var err error
  2148  		if sffd, ok := fd.vfsfd.Impl().(*specialFileFD); ok {
  2149  			err = sffd.updateMetadata(ctx)
  2150  		} else {
  2151  			err = d.updateMetadata(ctx)
  2152  		}
  2153  		if err != nil {
  2154  			return linux.Statx{}, err
  2155  		}
  2156  	}
  2157  	var stat linux.Statx
  2158  	d.statTo(&stat)
  2159  	return stat, nil
  2160  }
  2161  
  2162  // SetStat implements vfs.FileDescriptionImpl.SetStat.
  2163  func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error {
  2164  	fs := fd.filesystem()
  2165  	fs.renameMu.RLock()
  2166  	defer fs.renameMu.RUnlock()
  2167  	return fd.dentry().setStat(ctx, auth.CredentialsFromContext(ctx), &opts, fd.vfsfd.Mount())
  2168  }
  2169  
  2170  // ListXattr implements vfs.FileDescriptionImpl.ListXattr.
  2171  func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
  2172  	return fd.dentry().listXattr(ctx, size)
  2173  }
  2174  
  2175  // GetXattr implements vfs.FileDescriptionImpl.GetXattr.
  2176  func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
  2177  	return fd.dentry().getXattr(ctx, auth.CredentialsFromContext(ctx), &opts)
  2178  }
  2179  
  2180  // SetXattr implements vfs.FileDescriptionImpl.SetXattr.
  2181  func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
  2182  	return fd.dentry().setXattr(ctx, auth.CredentialsFromContext(ctx), &opts)
  2183  }
  2184  
  2185  // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
  2186  func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
  2187  	return fd.dentry().removeXattr(ctx, auth.CredentialsFromContext(ctx), name)
  2188  }
  2189  
  2190  // LockBSD implements vfs.FileDescriptionImpl.LockBSD.
  2191  func (fd *fileDescription) LockBSD(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, block bool) error {
  2192  	fd.lockLogging.Do(func() {
  2193  		log.Infof("File lock using gofer file handled internally.")
  2194  	})
  2195  	return fd.LockFD.LockBSD(ctx, uid, ownerPID, t, block)
  2196  }
  2197  
  2198  // LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
  2199  func (fd *fileDescription) LockPOSIX(ctx context.Context, uid fslock.UniqueID, ownerPID int32, t fslock.LockType, r fslock.LockRange, block bool) error {
  2200  	fd.lockLogging.Do(func() {
  2201  		log.Infof("Range lock using gofer file handled internally.")
  2202  	})
  2203  	return fd.Locks().LockPOSIX(ctx, uid, ownerPID, t, r, block)
  2204  }
  2205  
  2206  // UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
  2207  func (fd *fileDescription) UnlockPOSIX(ctx context.Context, uid fslock.UniqueID, r fslock.LockRange) error {
  2208  	return fd.Locks().UnlockPOSIX(ctx, uid, r)
  2209  }
  2210  
// resolvingPath is just a wrapper around *vfs.ResolvingPath. It additionally
// holds some information around the intent behind resolving the path.
//
// Use resolvingPathFull or resolvingPathParent to construct one.
type resolvingPath struct {
	// ResolvingPath is the wrapped VFS path; its methods are promoted.
	*vfs.ResolvingPath

	// excludeLast indicates whether the intent is to resolve until the last path
	// component. If true, the last path component should remain unresolved.
	excludeLast bool
}
  2220  
  2221  func resolvingPathFull(rp *vfs.ResolvingPath) resolvingPath {
  2222  	return resolvingPath{ResolvingPath: rp, excludeLast: false}
  2223  }
  2224  
  2225  func resolvingPathParent(rp *vfs.ResolvingPath) resolvingPath {
  2226  	return resolvingPath{ResolvingPath: rp, excludeLast: true}
  2227  }
  2228  
  2229  func (rp *resolvingPath) done() bool {
  2230  	if rp.excludeLast {
  2231  		return rp.Final()
  2232  	}
  2233  	return rp.Done()
  2234  }
  2235  
  2236  func (rp *resolvingPath) copy() resolvingPath {
  2237  	return resolvingPath{
  2238  		ResolvingPath: rp.ResolvingPath.Copy(),
  2239  		excludeLast:   rp.excludeLast,
  2240  	}
  2241  }
  2242  
  2243  // Precondition: !rp.done() && rp.Component() is not "." or "..".
  2244  func (rp *resolvingPath) getComponents(emit func(string) bool) {
  2245  	rp.GetComponents(rp.excludeLast, emit)
  2246  }