gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/pkg/sentry/fsimpl/overlay/filesystem.go (about)

     1  // Copyright 2020 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package overlay
    16  
    17  import (
    18  	"fmt"
    19  	"strings"
    20  
    21  	"gvisor.dev/gvisor/pkg/abi/linux"
    22  	"gvisor.dev/gvisor/pkg/atomicbitops"
    23  	"gvisor.dev/gvisor/pkg/context"
    24  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    25  	"gvisor.dev/gvisor/pkg/fspath"
    26  	"gvisor.dev/gvisor/pkg/log"
    27  	"gvisor.dev/gvisor/pkg/refs"
    28  	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
    29  	"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
    30  	"gvisor.dev/gvisor/pkg/sentry/vfs"
    31  	"gvisor.dev/gvisor/pkg/sync"
    32  )
    33  
    34  // _OVL_XATTR_PREFIX is an extended attribute key prefix to identify overlayfs
    35  // attributes.
    36  // Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_PREFIX
    37  const _OVL_XATTR_PREFIX = linux.XATTR_TRUSTED_PREFIX + "overlay."
    38  
    39  // _OVL_XATTR_OPAQUE is an extended attribute key whose value is set to "y" for
    40  // opaque directories.
    41  // Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_OPAQUE
    42  const _OVL_XATTR_OPAQUE = _OVL_XATTR_PREFIX + "opaque"
    43  
    44  func isWhiteout(stat *linux.Statx) bool {
    45  	return stat.Mode&linux.S_IFMT == linux.S_IFCHR && stat.RdevMajor == 0 && stat.RdevMinor == 0
    46  }
    47  
    48  // Sync implements vfs.FilesystemImpl.Sync.
    49  func (fs *filesystem) Sync(ctx context.Context) error {
    50  	if fs.opts.UpperRoot.Ok() {
    51  		return fs.opts.UpperRoot.Mount().Filesystem().Impl().Sync(ctx)
    52  	}
    53  	return nil
    54  }
    55  
    56  var dentrySlicePool = sync.Pool{
    57  	New: func() any {
    58  		ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity
    59  		return &ds
    60  	},
    61  }
    62  
    63  func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry {
    64  	if ds == nil {
    65  		ds = dentrySlicePool.Get().(*[]*dentry)
    66  	}
    67  	*ds = append(*ds, d)
    68  	return ds
    69  }
    70  
    71  // Preconditions: ds != nil.
    72  func putDentrySlice(ds *[]*dentry) {
    73  	// Allow dentries to be GC'd.
    74  	for i := range *ds {
    75  		(*ds)[i] = nil
    76  	}
    77  	*ds = (*ds)[:0]
    78  	dentrySlicePool.Put(ds)
    79  }
    80  
    81  // renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls
    82  // dentry.checkDropLocked on all dentries in *dsp with fs.renameMu locked for
    83  // writing.
    84  //
    85  // dsp is a pointer-to-pointer since defer evaluates its arguments immediately,
    86  // but dentry slices are allocated lazily, and it's much easier to say "defer
    87  // fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() {
    88  // fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this.
    89  //
    90  // +checklocksreleaseread:fs.renameMu
    91  func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, dsp **[]*dentry) {
    92  	fs.renameMu.RUnlock()
    93  	if *dsp == nil {
    94  		return
    95  	}
    96  	ds := **dsp
    97  	// Only go through calling dentry.checkDropLocked() (which requires
    98  	// re-locking renameMu) if we actually have any dentries with zero refs.
    99  	checkAny := false
   100  	for i := range ds {
   101  		if ds[i].refs.Load() == 0 {
   102  			checkAny = true
   103  			break
   104  		}
   105  	}
   106  	if checkAny {
   107  		fs.renameMu.Lock()
   108  		for _, d := range ds {
   109  			d.checkDropLocked(ctx)
   110  		}
   111  		fs.renameMu.Unlock()
   112  	}
   113  	putDentrySlice(*dsp)
   114  }
   115  
   116  // +checklocksrelease:fs.renameMu
   117  func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) {
   118  	if *ds == nil {
   119  		fs.renameMu.Unlock()
   120  		return
   121  	}
   122  	for _, d := range **ds {
   123  		d.checkDropLocked(ctx)
   124  	}
   125  	fs.renameMu.Unlock()
   126  	putDentrySlice(*ds)
   127  }
   128  
   129  // stepLocked resolves rp.Component() to an existing file, starting from the
   130  // given directory.
   131  //
   132  // Dentries which may have a reference count of zero, and which therefore
   133  // should be dropped once traversal is complete, are appended to ds.
   134  //
   135  // Preconditions:
   136  //   - fs.renameMu must be locked.
   137  //   - d.dirMu must be locked.
   138  //   - !rp.Done().
   139  func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, lookupLayer, bool, error) {
   140  	if !d.isDir() {
   141  		return nil, lookupLayerNone, false, linuxerr.ENOTDIR
   142  	}
   143  	if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
   144  		return nil, lookupLayerNone, false, err
   145  	}
   146  	name := rp.Component()
   147  	if name == "." {
   148  		rp.Advance()
   149  		return d, d.topLookupLayer(), false, nil
   150  	}
   151  	if name == ".." {
   152  		if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
   153  			return nil, lookupLayerNone, false, err
   154  		} else if isRoot || d.parent.Load() == nil {
   155  			rp.Advance()
   156  			return d, d.topLookupLayer(), false, nil
   157  		}
   158  		if err := rp.CheckMount(ctx, &d.parent.Load().vfsd); err != nil {
   159  			return nil, lookupLayerNone, false, err
   160  		}
   161  		rp.Advance()
   162  		parent := d.parent.Load()
   163  		return parent, parent.topLookupLayer(), false, nil
   164  	}
   165  	if uint64(len(name)) > fs.maxFilenameLen {
   166  		return nil, lookupLayerNone, false, linuxerr.ENAMETOOLONG
   167  	}
   168  	child, topLookupLayer, err := fs.getChildLocked(ctx, d, name, ds)
   169  	if err != nil {
   170  		return nil, topLookupLayer, false, err
   171  	}
   172  	if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
   173  		return nil, lookupLayerNone, false, err
   174  	}
   175  	if child.isSymlink() && rp.ShouldFollowSymlink() {
   176  		target, err := child.readlink(ctx)
   177  		if err != nil {
   178  			return nil, lookupLayerNone, false, err
   179  		}
   180  		followedSymlink, err := rp.HandleSymlink(target)
   181  		return d, topLookupLayer, followedSymlink, err
   182  	}
   183  	rp.Advance()
   184  	return child, topLookupLayer, false, nil
   185  }
   186  
   187  // Preconditions:
   188  //   - fs.renameMu must be locked.
   189  //   - d.dirMu must be locked.
   190  func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, lookupLayer, error) {
   191  	if child, ok := parent.children[name]; ok {
   192  		return child, child.topLookupLayer(), nil
   193  	}
   194  	child, topLookupLayer, err := fs.lookupLocked(ctx, parent, name)
   195  	if err != nil {
   196  		return nil, topLookupLayer, err
   197  	}
   198  	if parent.children == nil {
   199  		parent.children = make(map[string]*dentry)
   200  	}
   201  	parent.children[name] = child
   202  	// child's refcount is initially 0, so it may be dropped after traversal.
   203  	*ds = appendDentry(*ds, child)
   204  	return child, topLookupLayer, nil
   205  }
   206  
// lookupLocked builds a new dentry for the child of parent called name by
// visiting parent's layers through parent.iterLayers and merging what it finds
// on each of them.
//
// It returns the new dentry and the topmost lookup layer at which name was
// found. On success a reference is taken on parent and recorded as the child's
// parent. On failure the partially built dentry is destroyed and a nil dentry
// is returned; linuxerr.ENOENT is returned when the name does not exist in the
// overlay (including when it is hidden by a whiteout).
//
// Preconditions:
//   - fs.renameMu must be locked.
//   - parent.dirMu must be locked.
func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name string) (*dentry, lookupLayer, error) {
	childPath := fspath.Parse(name)
	child := fs.newDentry()
	topLookupLayer := lookupLayerNone
	var lookupErr error

	vfsObj := fs.vfsfs.VirtualFilesystem()
	// The callback returns true to continue with the next layer and false to
	// stop the iteration. Errors are reported through lookupErr.
	parent.iterLayers(func(parentVD vfs.VirtualDentry, isUpper bool) bool {
		childVD, err := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
			Root:  parentVD,
			Start: parentVD,
			Path:  childPath,
		}, &vfs.GetDentryOptions{})
		if linuxerr.Equals(linuxerr.ENOENT, err) || linuxerr.Equals(linuxerr.ENAMETOOLONG, err) {
			// The file doesn't exist on this layer. Proceed to the next one.
			return true
		}
		if err != nil {
			lookupErr = err
			return false
		}
		// Drop the lookup's reference when this layer is done; a layer that is
		// kept in child takes its own reference below.
		defer childVD.DecRef(ctx)

		mask := uint32(linux.STATX_TYPE)
		if topLookupLayer == lookupLayerNone {
			// Mode, UID, GID, and (for non-directories) inode number come from
			// the topmost layer on which the file exists.
			mask |= linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
		}
		stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
			Root:  childVD,
			Start: childVD,
		}, &vfs.StatOptions{
			Mask: mask,
		})
		if err != nil {
			lookupErr = err
			return false
		}
		// Every requested field must have been provided by the layer.
		if stat.Mask&mask != mask {
			lookupErr = linuxerr.EREMOTE
			return false
		}

		if isWhiteout(&stat) {
			// This is a whiteout, so it "doesn't exist" on this layer, and
			// layers below this one are ignored.
			if isUpper {
				topLookupLayer = lookupLayerUpperWhiteout
			}
			return false
		}
		isDir := stat.Mode&linux.S_IFMT == linux.S_IFDIR
		if topLookupLayer != lookupLayerNone && !isDir {
			// Directories are not merged with non-directory files from lower
			// layers; instead, layers including and below the first
			// non-directory file are ignored. (This file must be a directory
			// on previous layers, since lower layers aren't searched for
			// non-directory files.)
			return false
		}

		// Update child to include this layer.
		childVD.IncRef()
		if isUpper {
			child.upperVD = childVD
			child.copiedUp = atomicbitops.FromUint32(1)
		} else {
			child.lowerVDs = append(child.lowerVDs, childVD)
		}
		if topLookupLayer == lookupLayerNone {
			// This is the topmost layer containing the file: record where it
			// was found and take its metadata from here.
			if isUpper {
				topLookupLayer = lookupLayerUpper
			} else {
				topLookupLayer = lookupLayerLower
			}
			child.mode = atomicbitops.FromUint32(uint32(stat.Mode))
			child.uid = atomicbitops.FromUint32(stat.UID)
			child.gid = atomicbitops.FromUint32(stat.GID)
			child.devMajor = atomicbitops.FromUint32(stat.DevMajor)
			child.devMinor = atomicbitops.FromUint32(stat.DevMinor)
			child.ino = atomicbitops.FromUint64(stat.Ino)
		}

		// For non-directory files, only the topmost layer that contains a file
		// matters.
		if !isDir {
			return false
		}

		// Directories use the lowest layer inode and device numbers to generate a
		// filesystem local inode number. This way the inode number does not change
		// after copy ups.
		child.devMajor = atomicbitops.FromUint32(stat.DevMajor)
		child.devMinor = atomicbitops.FromUint32(stat.DevMinor)
		child.ino = atomicbitops.FromUint64(stat.Ino)

		// Directories are merged with directories from lower layers if they
		// are not explicitly opaque.
		opaqueVal, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{
			Root:  childVD,
			Start: childVD,
		}, &vfs.GetXattrOptions{
			Name: _OVL_XATTR_OPAQUE,
			Size: 1,
		})
		// Continue to lower layers unless the xattr was read successfully and
		// its value is "y".
		return !(err == nil && opaqueVal == "y")
	})

	if lookupErr != nil {
		child.destroyLocked(ctx)
		return nil, topLookupLayer, lookupErr
	}
	if !topLookupLayer.existsInOverlay() {
		child.destroyLocked(ctx)
		return nil, topLookupLayer, linuxerr.ENOENT
	}

	// Device and inode numbers were copied from the topmost layer above for
	// non-directories. They were copied from the bottommost layer for
	// directories. Override them if necessary. We can use RacyLoad() because
	// child is still being initialized.
	if child.isDir() {
		child.ino.Store(fs.newDirIno(child.devMajor.RacyLoad(), child.devMinor.RacyLoad(), child.ino.RacyLoad()))
		child.devMajor = atomicbitops.FromUint32(linux.UNNAMED_MAJOR)
		child.devMinor = atomicbitops.FromUint32(fs.dirDevMinor)
	} else if !child.upperVD.Ok() {
		// Non-directory that exists only on a lower layer: map the lower
		// layer's device number to an overlay-specific one.
		childDevMinor, err := fs.getLowerDevMinor(child.devMajor.RacyLoad(), child.devMinor.RacyLoad())
		if err != nil {
			ctx.Infof("overlay.filesystem.lookupLocked: failed to map lower layer device number (%d, %d) to an overlay-specific device number: %v", child.devMajor.RacyLoad(), child.devMinor.RacyLoad(), err)
			child.destroyLocked(ctx)
			return nil, topLookupLayer, err
		}
		child.devMajor = atomicbitops.FromUint32(linux.UNNAMED_MAJOR)
		child.devMinor = atomicbitops.FromUint32(childDevMinor)
	}

	// The child holds a reference on its parent.
	parent.IncRef()
	child.parent.Store(parent)
	child.name = name
	return child, topLookupLayer, nil
}
   352  
   353  // lookupLayerLocked is similar to lookupLocked, but only returns information
   354  // about the file rather than a dentry.
   355  //
   356  // Preconditions:
   357  //   - fs.renameMu must be locked.
   358  //   - parent.dirMu must be locked.
   359  func (fs *filesystem) lookupLayerLocked(ctx context.Context, parent *dentry, name string) (lookupLayer, error) {
   360  	childPath := fspath.Parse(name)
   361  	lookupLayer := lookupLayerNone
   362  	var lookupErr error
   363  
   364  	parent.iterLayers(func(parentVD vfs.VirtualDentry, isUpper bool) bool {
   365  		stat, err := fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{
   366  			Root:  parentVD,
   367  			Start: parentVD,
   368  			Path:  childPath,
   369  		}, &vfs.StatOptions{
   370  			Mask: linux.STATX_TYPE,
   371  		})
   372  		if linuxerr.Equals(linuxerr.ENOENT, err) || linuxerr.Equals(linuxerr.ENAMETOOLONG, err) {
   373  			// The file doesn't exist on this layer. Proceed to the next
   374  			// one.
   375  			return true
   376  		}
   377  		if err != nil {
   378  			lookupErr = err
   379  			return false
   380  		}
   381  		if stat.Mask&linux.STATX_TYPE == 0 {
   382  			// Linux's overlayfs tends to return EREMOTE in cases where a file
   383  			// is unusable for reasons that are not better captured by another
   384  			// errno.
   385  			lookupErr = linuxerr.EREMOTE
   386  			return false
   387  		}
   388  		if isWhiteout(&stat) {
   389  			// This is a whiteout, so it "doesn't exist" on this layer, and
   390  			// layers below this one are ignored.
   391  			if isUpper {
   392  				lookupLayer = lookupLayerUpperWhiteout
   393  			}
   394  			return false
   395  		}
   396  		// The file exists; we can stop searching.
   397  		if isUpper {
   398  			lookupLayer = lookupLayerUpper
   399  		} else {
   400  			lookupLayer = lookupLayerLower
   401  		}
   402  		return false
   403  	})
   404  
   405  	return lookupLayer, lookupErr
   406  }
   407  
   408  type lookupLayer int
   409  
   410  const (
   411  	// lookupLayerNone indicates that no file exists at the given path on the
   412  	// upper layer, and is either whited out or does not exist on lower layers.
   413  	// Therefore, the file does not exist in the overlay filesystem, and file
   414  	// creation may proceed normally (if an upper layer exists).
   415  	lookupLayerNone lookupLayer = iota
   416  
   417  	// lookupLayerLower indicates that no file exists at the given path on the
   418  	// upper layer, but exists on a lower layer. Therefore, the file exists in
   419  	// the overlay filesystem, but must be copied-up before mutation.
   420  	lookupLayerLower
   421  
   422  	// lookupLayerUpper indicates that a non-whiteout file exists at the given
   423  	// path on the upper layer. Therefore, the file exists in the overlay
   424  	// filesystem, and is already copied-up.
   425  	lookupLayerUpper
   426  
   427  	// lookupLayerUpperWhiteout indicates that a whiteout exists at the given
   428  	// path on the upper layer. Therefore, the file does not exist in the
   429  	// overlay filesystem, and file creation must remove the whiteout before
   430  	// proceeding.
   431  	lookupLayerUpperWhiteout
   432  )
   433  
   434  func (ll lookupLayer) existsInOverlay() bool {
   435  	return ll == lookupLayerLower || ll == lookupLayerUpper
   436  }
   437  
   438  // walkParentDirLocked resolves all but the last path component of rp to an
   439  // existing directory, starting from the given directory (which is usually
   440  // rp.Start().Impl().(*dentry)). It does not check that the returned directory
   441  // is searchable by the provider of rp.
   442  //
   443  // Preconditions:
   444  //   - fs.renameMu must be locked.
   445  //   - !rp.Done().
   446  func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
   447  	for !rp.Final() {
   448  		d.dirMu.Lock()
   449  		next, _, _, err := fs.stepLocked(ctx, rp, d, ds)
   450  		d.dirMu.Unlock()
   451  		if err != nil {
   452  			return nil, err
   453  		}
   454  		d = next
   455  	}
   456  	if !d.isDir() {
   457  		return nil, linuxerr.ENOTDIR
   458  	}
   459  	return d, nil
   460  }
   461  
   462  // resolveLocked resolves rp to an existing file.
   463  //
   464  // Preconditions: fs.renameMu must be locked.
   465  func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
   466  	d := rp.Start().Impl().(*dentry)
   467  	for !rp.Done() {
   468  		d.dirMu.Lock()
   469  		next, _, _, err := fs.stepLocked(ctx, rp, d, ds)
   470  		d.dirMu.Unlock()
   471  		if err != nil {
   472  			return nil, err
   473  		}
   474  		d = next
   475  	}
   476  	if rp.MustBeDir() && !d.isDir() {
   477  		return nil, linuxerr.ENOTDIR
   478  	}
   479  	return d, nil
   480  }
   481  
// createType tells doCreateAt what kind of file is being created.
type createType int

const (
	// createNonDirectory is used when creating anything other than a
	// directory (LinkAt and MknodAt pass it).
	createNonDirectory createType = iota
	// createDirectory is used by MkdirAt for an ordinary directory.
	createDirectory
	// createSyntheticMountpoint is used by MkdirAt when
	// opts.ForSyntheticMountpoint is set.
	createSyntheticMountpoint
)
   489  
// doCreateAt checks that creating a file at rp is permitted, then invokes
// create to do so.
//
// create is called with the (copied-up) parent directory, the name of the new
// file, and haveUpperWhiteout, which is true when a whiteout currently exists
// at that name on the upper layer. create is invoked with parent.dirMu locked
// and fs.renameMu locked for reading. After a successful create, the parent's
// cached dirents are invalidated and an IN_CREATE inotify event is generated
// on the parent.
//
// Preconditions:
//   - !rp.Done().
//   - For the final path component in rp, !rp.ShouldFollowSymlink().
func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, ct createType, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error {
	var ds *[]*dentry
	fs.renameMu.RLock()
	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
	start := rp.Start().Impl().(*dentry)
	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
	if err != nil {
		return err
	}
	name := rp.Component()
	// "." and ".." always refer to existing directories.
	if name == "." || name == ".." {
		return linuxerr.EEXIST
	}
	if uint64(len(name)) > fs.maxFilenameLen {
		return linuxerr.ENAMETOOLONG
	}
	if parent.vfsd.IsDead() {
		return linuxerr.ENOENT
	}

	// Search permission on the parent is checked before looking at its
	// children; write permission is only checked once we know the name is
	// free (see below).
	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
		return err
	}

	parent.dirMu.Lock()
	defer parent.dirMu.Unlock()

	// Determine if a file already exists at name.
	if _, ok := parent.children[name]; ok {
		return linuxerr.EEXIST
	}
	childLayer, err := fs.lookupLayerLocked(ctx, parent, name)
	if err != nil {
		return err
	}
	if childLayer.existsInOverlay() {
		return linuxerr.EEXIST
	}

	// A non-directory cannot be created at a path that must be a directory.
	if ct == createNonDirectory && rp.MustBeDir() {
		return linuxerr.ENOENT
	}

	mnt := rp.Mount()
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()
	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	// Ensure that the parent directory is copied-up so that we can create the
	// new file in the upper layer.
	if err := parent.copyUpMaybeSyntheticMountpointLocked(ctx, ct == createSyntheticMountpoint); err != nil {
		return err
	}

	// Finally create the new file.
	if err := create(parent, name, childLayer == lookupLayerUpperWhiteout); err != nil {
		return err
	}

	// Invalidate the cached directory entries, which no longer reflect the
	// directory's contents.
	parent.dirents = nil
	ev := linux.IN_CREATE
	if ct != createNonDirectory {
		ev |= linux.IN_ISDIR
	}
	parent.watches.Notify(ctx, name, uint32(ev), 0 /* cookie */, vfs.InodeEvent, false /* unlinked */)
	return nil
}
   566  
   567  // CreateWhiteout creates a whiteout at pop. Whiteouts are created with
   568  // character devices with device ID = 0.
   569  //
   570  // Preconditions: pop's parent directory has been copied up.
   571  func CreateWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, pop *vfs.PathOperation) error {
   572  	return vfsObj.MknodAt(ctx, creds, pop, &vfs.MknodOptions{
   573  		Mode: linux.S_IFCHR, // permissions == include/linux/fs.h:WHITEOUT_MODE == 0
   574  		// DevMajor == DevMinor == 0, from include/linux/fs.h:WHITEOUT_DEV
   575  	})
   576  }
   577  
   578  func (fs *filesystem) cleanupRecreateWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) {
   579  	if err := CreateWhiteout(ctx, vfsObj, fs.creds, pop); err != nil {
   580  		panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err))
   581  	}
   582  }
   583  
   584  // AccessAt implements vfs.Filesystem.Impl.AccessAt.
   585  func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
   586  	var ds *[]*dentry
   587  	fs.renameMu.RLock()
   588  	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
   589  	d, err := fs.resolveLocked(ctx, rp, &ds)
   590  	if err != nil {
   591  		return err
   592  	}
   593  	if err := d.checkPermissions(creds, ats); err != nil {
   594  		return err
   595  	}
   596  	if !ats.MayWrite() {
   597  		// Not requesting write permission.  Allow it.
   598  		return nil
   599  	}
   600  	if rp.Mount().ReadOnly() {
   601  		return linuxerr.EROFS
   602  	}
   603  	if !d.upperVD.Ok() && !d.canBeCopiedUp() {
   604  		// A lower layer file that can not be copied up, can not be written to.
   605  		// Error out here. Don't give the application false hopes.
   606  		return linuxerr.EACCES
   607  	}
   608  	return nil
   609  }
   610  
   611  // BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
   612  func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
   613  	var ds *[]*dentry
   614  	fs.renameMu.RLock()
   615  	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
   616  	d, err := fs.resolveLocked(ctx, rp, &ds)
   617  	if err != nil {
   618  		return nil, err
   619  	}
   620  	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
   621  		return nil, err
   622  	}
   623  	layerVD := d.topLayer()
   624  	return fs.vfsfs.VirtualFilesystem().BoundEndpointAt(ctx, fs.creds, &vfs.PathOperation{
   625  		Root:  layerVD,
   626  		Start: layerVD,
   627  	}, &opts)
   628  }
   629  
   630  // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
   631  func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
   632  	var ds *[]*dentry
   633  	fs.renameMu.RLock()
   634  	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
   635  	d, err := fs.resolveLocked(ctx, rp, &ds)
   636  	if err != nil {
   637  		return nil, err
   638  	}
   639  	if opts.CheckSearchable {
   640  		if !d.isDir() {
   641  			return nil, linuxerr.ENOTDIR
   642  		}
   643  		if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
   644  			return nil, err
   645  		}
   646  	}
   647  	d.IncRef()
   648  	return &d.vfsd, nil
   649  }
   650  
   651  // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
   652  func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
   653  	var ds *[]*dentry
   654  	fs.renameMu.RLock()
   655  	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
   656  	start := rp.Start().Impl().(*dentry)
   657  	d, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
   658  	if err != nil {
   659  		return nil, err
   660  	}
   661  	d.IncRef()
   662  	return &d.vfsd, nil
   663  }
   664  
// LinkAt implements vfs.FilesystemImpl.LinkAt.
//
// The link target vd must be on the same mount as rp (EXDEV otherwise) and
// must not be a directory (EPERM). The target is copied up first, and the link
// is then created on the upper layer. If a later step fails, the steps already
// performed on the upper layer are undone; a failure to undo them panics.
func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
	return fs.doCreateAt(ctx, rp, createNonDirectory, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
		if rp.Mount() != vd.Mount() {
			return linuxerr.EXDEV
		}
		old := vd.Dentry().Impl().(*dentry)
		if old.isDir() {
			return linuxerr.EPERM
		}
		// The link is made between upper layer files, so the existing file
		// must be present on the upper layer.
		if err := old.copyUpLocked(ctx); err != nil {
			return err
		}
		vfsObj := fs.vfsfs.VirtualFilesystem()
		newpop := vfs.PathOperation{
			Root:  parent.upperVD,
			Start: parent.upperVD,
			Path:  fspath.Parse(childName),
		}
		// Remove the whiteout occupying the new name, if any.
		if haveUpperWhiteout {
			if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil {
				return err
			}
		}
		if err := vfsObj.LinkAt(ctx, fs.creds, &vfs.PathOperation{
			Root:  old.upperVD,
			Start: old.upperVD,
		}, &newpop); err != nil {
			// Restore the whiteout removed above.
			if haveUpperWhiteout {
				fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop)
			}
			return err
		}
		creds := rp.Credentials()
		// Set the owner to the caller's effective UID/GID.
		if err := vfsObj.SetStatAt(ctx, fs.creds, &newpop, &vfs.SetStatOptions{
			Stat: linux.Statx{
				Mask: linux.STATX_UID | linux.STATX_GID,
				UID:  uint32(creds.EffectiveKUID),
				GID:  uint32(creds.EffectiveKGID),
			},
		}); err != nil {
			// Undo the link, then restore the whiteout if there was one.
			if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); cleanupErr != nil {
				panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr))
			} else if haveUpperWhiteout {
				fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop)
			}
			return err
		}
		// Notify watchers of the linked file that its attributes changed.
		old.watches.Notify(ctx, "", linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent, false /* unlinked */)
		return nil
	})
}
   717  
// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
//
// The directory is created on the upper layer. If it replaces a whiteout it is
// marked opaque, and if the parent has lower layers it is marked opaque on a
// best-effort basis. If a required step fails, the steps already performed on
// the upper layer are undone; a failure to undo them panics.
func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
	ct := createDirectory
	if opts.ForSyntheticMountpoint {
		ct = createSyntheticMountpoint
	}
	return fs.doCreateAt(ctx, rp, ct, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
		vfsObj := fs.vfsfs.VirtualFilesystem()
		pop := vfs.PathOperation{
			Root:  parent.upperVD,
			Start: parent.upperVD,
			Path:  fspath.Parse(childName),
		}
		// Remove the whiteout occupying the new name, if any.
		if haveUpperWhiteout {
			if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
				return err
			}
		}
		if err := vfsObj.MkdirAt(ctx, fs.creds, &pop, &opts); err != nil {
			// Restore the whiteout removed above.
			if haveUpperWhiteout {
				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
			}
			return err
		}

		// Apply the ownership/mode stat computed by parent.newChildOwnerStat
		// for the caller's credentials.
		if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
			Stat: parent.newChildOwnerStat(opts.Mode, rp.Credentials()),
		}); err != nil {
			// Undo the mkdir, then restore the whiteout if there was one.
			if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
				panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr))
			} else if haveUpperWhiteout {
				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
			}
			return err
		}
		if haveUpperWhiteout {
			// A whiteout is being replaced with this new directory. There may be
			// directories on lower layers (previously hidden by the whiteout) that
			// the new directory should not be merged with, so mark as opaque.
			// See fs/overlayfs/dir.c:ovl_create_over_whiteout() -> ovl_set_opaque().
			if err := vfsObj.SetXattrAt(ctx, fs.creds, &pop, &vfs.SetXattrOptions{
				Name:  _OVL_XATTR_OPAQUE,
				Value: "y",
			}); err != nil {
				// Opaqueness is required here, so undo the mkdir and restore
				// the whiteout.
				if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
					panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr))
				} else {
					fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
				}
				return err
			}
		} else if len(parent.lowerVDs) > 0 {
			// If haveUpperWhiteout is false and the parent is merged, then we should
			// apply an optimization. We know that nothing exists on the parent's
			// lower layers. Otherwise doCreateAt() would have failed with EEXIST.
			// Mark the new directory opaque to avoid unnecessary lower lookups in
			// fs.lookupLocked(). Allow it to fail since this is an optimization.
			// See fs/overlayfs/dir.c:ovl_create_upper() -> ovl_set_opaque().
			_ = vfsObj.SetXattrAt(ctx, fs.creds, &pop, &vfs.SetXattrOptions{
				Name:  _OVL_XATTR_OPAQUE,
				Value: "y",
			})
		}
		return nil
	})
}
   784  
   785  // MknodAt implements vfs.FilesystemImpl.MknodAt.
   786  func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
   787  	return fs.doCreateAt(ctx, rp, createNonDirectory, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
   788  		// Disallow attempts to create whiteouts.
   789  		if opts.Mode&linux.S_IFMT == linux.S_IFCHR && opts.DevMajor == 0 && opts.DevMinor == 0 {
   790  			return linuxerr.EPERM
   791  		}
   792  		vfsObj := fs.vfsfs.VirtualFilesystem()
   793  		pop := vfs.PathOperation{
   794  			Root:  parent.upperVD,
   795  			Start: parent.upperVD,
   796  			Path:  fspath.Parse(childName),
   797  		}
   798  		if haveUpperWhiteout {
   799  			if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
   800  				return err
   801  			}
   802  		}
   803  		if err := vfsObj.MknodAt(ctx, fs.creds, &pop, &opts); err != nil {
   804  			if haveUpperWhiteout {
   805  				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
   806  			}
   807  			return err
   808  		}
   809  		creds := rp.Credentials()
   810  		if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
   811  			Stat: parent.newChildOwnerStat(opts.Mode, creds),
   812  		}); err != nil {
   813  			if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
   814  				panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr))
   815  			} else if haveUpperWhiteout {
   816  				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
   817  			}
   818  			return err
   819  		}
   820  		return nil
   821  	})
   822  }
   823  
// OpenAt implements vfs.FilesystemImpl.OpenAt.
//
// The path is resolved with fs.renameMu held for reading. If the final
// component does not exist and O_CREAT was given, the file is created via
// createAndOpenLocked; otherwise the existing dentry is validated by
// ensureOpenableLocked and opened via openCopiedUp after renameMu has been
// released.
func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
	// mayCreate: O_CREAT is set. mustCreate: both O_CREAT and O_EXCL are set.
	mayCreate := opts.Flags&linux.O_CREAT != 0
	mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL)

	var ds *[]*dentry
	fs.renameMu.RLock()
	// unlock releases renameMu at most once, so it can be called explicitly
	// before openCopiedUp while still being safe to run again from the defer.
	unlocked := false
	unlock := func() {
		if !unlocked {
			fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
			unlocked = true
		}
	}
	defer unlock()

	start := rp.Start().Impl().(*dentry)
	if rp.Done() {
		// No path components remain: the open targets start itself, so
		// nothing can be created.
		if mayCreate && rp.MustBeDir() {
			return nil, linuxerr.EISDIR
		}
		if mustCreate {
			return nil, linuxerr.EEXIST
		}
		if err := start.ensureOpenableLocked(ctx, rp, &opts); err != nil {
			return nil, err
		}
		// Hold a reference on start across the unlocked open below.
		start.IncRef()
		defer start.DecRef(ctx)
		unlock()
		return start.openCopiedUp(ctx, rp, &opts)
	}

	// Resolution restarts here, from the symlink's parent, each time the
	// final component turns out to be a symlink that was followed.
afterTrailingSymlink:
	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
	if err != nil {
		return nil, err
	}
	// Check for search permission in the parent directory.
	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
		return nil, err
	}
	// Reject attempts to open directories with O_CREAT.
	if mayCreate && rp.MustBeDir() {
		return nil, linuxerr.EISDIR
	}
	// Determine whether or not we need to create a file.
	parent.dirMu.Lock()
	child, topLookupLayer, followedSymlink, err := fs.stepLocked(ctx, rp, parent, &ds)
	if followedSymlink {
		parent.dirMu.Unlock()
		if mustCreate {
			// EEXIST must be returned if an existing symlink is opened with O_EXCL.
			return nil, linuxerr.EEXIST
		}
		if err != nil {
			// If followedSymlink && err != nil, then this symlink resolution error
			// must be handled by the VFS layer.
			return nil, err
		}
		start = parent
		goto afterTrailingSymlink
	}
	if linuxerr.Equals(linuxerr.ENOENT, err) && mayCreate {
		// The final component is missing and creation was requested. The
		// file is created while parent.dirMu is still held; the last
		// argument reports whether the lookup stopped at an upper-layer
		// whiteout.
		fd, err := fs.createAndOpenLocked(ctx, rp, parent, &opts, &ds, topLookupLayer == lookupLayerUpperWhiteout)
		parent.dirMu.Unlock()
		return fd, err
	}
	parent.dirMu.Unlock()
	if err != nil {
		return nil, err
	}
	// From here on the final component exists.
	if mustCreate {
		return nil, linuxerr.EEXIST
	}
	if rp.MustBeDir() && !child.isDir() {
		return nil, linuxerr.ENOTDIR
	}
	if err := child.ensureOpenableLocked(ctx, rp, &opts); err != nil {
		return nil, err
	}
	// Hold a reference on child across the unlocked open below.
	child.IncRef()
	defer child.DecRef(ctx)
	unlock()
	return child.openCopiedUp(ctx, rp, &opts)
}
   910  
   911  // Preconditions: filesystem.renameMu must be locked.
   912  func (d *dentry) ensureOpenableLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) error {
   913  	ats := vfs.AccessTypesForOpenFlags(opts)
   914  	if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
   915  		return err
   916  	}
   917  	if d.isDir() {
   918  		if ats.MayWrite() {
   919  			return linuxerr.EISDIR
   920  		}
   921  		if opts.Flags&linux.O_CREAT != 0 {
   922  			return linuxerr.EISDIR
   923  		}
   924  		if opts.Flags&linux.O_DIRECT != 0 {
   925  			return linuxerr.EINVAL
   926  		}
   927  		return nil
   928  	}
   929  
   930  	if !ats.MayWrite() {
   931  		return nil
   932  	}
   933  
   934  	// Copy up!
   935  	if err := rp.Mount().CheckBeginWrite(); err != nil {
   936  		return err
   937  	}
   938  	defer rp.Mount().EndWrite()
   939  	return d.copyUpLocked(ctx)
   940  }
   941  
   942  // Preconditions: If vfs.AccessTypesForOpenFlags(opts).MayWrite(), then d has
   943  // been copied up.
   944  func (d *dentry) openCopiedUp(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
   945  	mnt := rp.Mount()
   946  
   947  	// Directory FDs open FDs from each layer when directory entries are read,
   948  	// so they don't require opening an FD from d.topLayer() up front.
   949  	ftype := d.mode.Load() & linux.S_IFMT
   950  	if ftype == linux.S_IFDIR {
   951  		fd := &directoryFD{}
   952  		fd.LockFD.Init(&d.locks)
   953  		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
   954  			UseDentryMetadata: true,
   955  		}); err != nil {
   956  			return nil, err
   957  		}
   958  		return &fd.vfsfd, nil
   959  	}
   960  
   961  	layerVD, isUpper := d.topLayerInfo()
   962  	layerFD, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
   963  		Root:  layerVD,
   964  		Start: layerVD,
   965  	}, opts)
   966  	if err != nil {
   967  		return nil, err
   968  	}
   969  	if ftype != linux.S_IFREG {
   970  		return layerFD, nil
   971  	}
   972  	layerFlags := layerFD.StatusFlags()
   973  	fd := &regularFileFD{
   974  		copiedUp:    isUpper,
   975  		cachedFD:    layerFD,
   976  		cachedFlags: layerFlags,
   977  	}
   978  	fd.LockFD.Init(&d.locks)
   979  	layerFDOpts := layerFD.Options()
   980  	if err := fd.vfsfd.Init(fd, layerFlags, mnt, &d.vfsd, &layerFDOpts); err != nil {
   981  		layerFD.DecRef(ctx)
   982  		return nil, err
   983  	}
   984  	return &fd.vfsfd, nil
   985  }
   986  
   987  // Preconditions:
   988  //   - parent.dirMu must be locked.
   989  //   - parent does not already contain a child named rp.Component().
   990  func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *dentry, opts *vfs.OpenOptions, ds **[]*dentry, haveUpperWhiteout bool) (*vfs.FileDescription, error) {
   991  	creds := rp.Credentials()
   992  	if err := parent.checkPermissions(creds, vfs.MayWrite); err != nil {
   993  		return nil, err
   994  	}
   995  	if parent.vfsd.IsDead() {
   996  		return nil, linuxerr.ENOENT
   997  	}
   998  	mnt := rp.Mount()
   999  	if err := mnt.CheckBeginWrite(); err != nil {
  1000  		return nil, err
  1001  	}
  1002  	defer mnt.EndWrite()
  1003  
  1004  	if err := parent.copyUpLocked(ctx); err != nil {
  1005  		return nil, err
  1006  	}
  1007  
  1008  	vfsObj := fs.vfsfs.VirtualFilesystem()
  1009  	childName := rp.Component()
  1010  	pop := vfs.PathOperation{
  1011  		Root:  parent.upperVD,
  1012  		Start: parent.upperVD,
  1013  		Path:  fspath.Parse(childName),
  1014  	}
  1015  	// Unlink the whiteout if it exists.
  1016  	if haveUpperWhiteout {
  1017  		if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
  1018  			log.Warningf("overlay.filesystem.createAndOpenLocked: failed to unlink whiteout: %v", err)
  1019  			return nil, err
  1020  		}
  1021  	}
  1022  	// Create the file on the upper layer, and get an FD representing it.
  1023  	upperFD, err := vfsObj.OpenAt(ctx, fs.creds, &pop, &vfs.OpenOptions{
  1024  		Flags: opts.Flags&^vfs.FileCreationFlags | linux.O_CREAT | linux.O_EXCL,
  1025  		Mode:  opts.Mode,
  1026  	})
  1027  	if err != nil {
  1028  		if haveUpperWhiteout {
  1029  			fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
  1030  		}
  1031  		return nil, err
  1032  	}
  1033  
  1034  	// Change the file's owner to the caller. We can't use upperFD.SetStat()
  1035  	// because it will pick up creds from ctx.
  1036  	if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
  1037  		Stat: parent.newChildOwnerStat(opts.Mode, creds),
  1038  	}); err != nil {
  1039  		if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
  1040  			panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr))
  1041  		} else if haveUpperWhiteout {
  1042  			fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
  1043  		}
  1044  		return nil, err
  1045  	}
  1046  	// Re-lookup to get a dentry representing the new file, which is needed for
  1047  	// the returned FD.
  1048  	child, _, err := fs.getChildLocked(ctx, parent, childName, ds)
  1049  	if err != nil {
  1050  		if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
  1051  			panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr))
  1052  		} else if haveUpperWhiteout {
  1053  			fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
  1054  		}
  1055  		return nil, err
  1056  	}
  1057  	// Finally construct the overlay FD. Below this point, we don't perform
  1058  	// cleanup (the file was created successfully even if we can no longer open
  1059  	// it for some reason).
  1060  	parent.dirents = nil
  1061  	upperFlags := upperFD.StatusFlags()
  1062  	fd := &regularFileFD{
  1063  		copiedUp:    true,
  1064  		cachedFD:    upperFD,
  1065  		cachedFlags: upperFlags,
  1066  	}
  1067  	fd.LockFD.Init(&child.locks)
  1068  	upperFDOpts := upperFD.Options()
  1069  	if err := fd.vfsfd.Init(fd, upperFlags, mnt, &child.vfsd, &upperFDOpts); err != nil {
  1070  		upperFD.DecRef(ctx)
  1071  		return nil, err
  1072  	}
  1073  	parent.watches.Notify(ctx, childName, linux.IN_CREATE, 0 /* cookie */, vfs.PathEvent, false /* unlinked */)
  1074  	return &fd.vfsfd, nil
  1075  }
  1076  
  1077  // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
  1078  func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
  1079  	var ds *[]*dentry
  1080  	fs.renameMu.RLock()
  1081  	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1082  	d, err := fs.resolveLocked(ctx, rp, &ds)
  1083  	if err != nil {
  1084  		return "", err
  1085  	}
  1086  	layerVD := d.topLayer()
  1087  	return fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
  1088  		Root:  layerVD,
  1089  		Start: layerVD,
  1090  	})
  1091  }
  1092  
  1093  // RenameAt implements vfs.FilesystemImpl.RenameAt.
  1094  func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
  1095  	// Resolve newParent first to verify that it's on this Mount.
  1096  	var ds *[]*dentry
  1097  	fs.renameMu.Lock()
  1098  	// We need to DecRef outside of fs.mu because forgetting a dead mountpoint
  1099  	// could result in this filesystem being released which acquires fs.mu.
  1100  	var toDecRef []refs.RefCounter
  1101  	defer func() {
  1102  		for _, ref := range toDecRef {
  1103  			ref.DecRef(ctx)
  1104  		}
  1105  	}()
  1106  	defer fs.renameMuUnlockAndCheckDrop(ctx, &ds)
  1107  	newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds)
  1108  	if err != nil {
  1109  		return err
  1110  	}
  1111  
  1112  	if opts.Flags&^linux.RENAME_NOREPLACE != 0 {
  1113  		return linuxerr.EINVAL
  1114  	}
  1115  
  1116  	newName := rp.Component()
  1117  	if newName == "." || newName == ".." {
  1118  		if opts.Flags&linux.RENAME_NOREPLACE != 0 {
  1119  			return linuxerr.EEXIST
  1120  		}
  1121  		return linuxerr.EBUSY
  1122  	}
  1123  	if uint64(len(newName)) > fs.maxFilenameLen {
  1124  		return linuxerr.ENAMETOOLONG
  1125  	}
  1126  	// Do not check for newName length, since different filesystem
  1127  	// implementations impose different name limits. upperfs.RenameAt() will fail
  1128  	// appropriately if it has to.
  1129  	mnt := rp.Mount()
  1130  	if mnt != oldParentVD.Mount() {
  1131  		return linuxerr.EXDEV
  1132  	}
  1133  	if err := mnt.CheckBeginWrite(); err != nil {
  1134  		return err
  1135  	}
  1136  	defer mnt.EndWrite()
  1137  
  1138  	oldParent := oldParentVD.Dentry().Impl().(*dentry)
  1139  	creds := rp.Credentials()
  1140  	if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
  1141  		return err
  1142  	}
  1143  	// We need a dentry representing the renamed file since, if it's a
  1144  	// directory, we need to check for write permission on it.
  1145  	oldParent.dirMu.Lock()
  1146  	defer oldParent.dirMu.Unlock()
  1147  	renamed, _, err := fs.getChildLocked(ctx, oldParent, oldName, &ds)
  1148  	if err != nil {
  1149  		return err
  1150  	}
  1151  	if err := oldParent.mayDelete(creds, renamed); err != nil {
  1152  		return err
  1153  	}
  1154  	if renamed.isDir() {
  1155  		if renamed == newParent || genericIsAncestorDentry(renamed, newParent) {
  1156  			return linuxerr.EINVAL
  1157  		}
  1158  		if oldParent != newParent {
  1159  			if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil {
  1160  				return err
  1161  			}
  1162  		}
  1163  	} else {
  1164  		if opts.MustBeDir || rp.MustBeDir() {
  1165  			return linuxerr.ENOTDIR
  1166  		}
  1167  	}
  1168  
  1169  	if oldParent != newParent {
  1170  		if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
  1171  			return err
  1172  		}
  1173  		newParent.dirMu.NestedLock(dirLockNew)
  1174  		defer newParent.dirMu.NestedUnlock(dirLockNew)
  1175  	}
  1176  	if newParent.vfsd.IsDead() {
  1177  		return linuxerr.ENOENT
  1178  	}
  1179  	var (
  1180  		replaced      *dentry
  1181  		replacedVFSD  *vfs.Dentry
  1182  		replacedLayer lookupLayer
  1183  		whiteouts     map[string]bool
  1184  	)
  1185  	replaced, replacedLayer, err = fs.getChildLocked(ctx, newParent, newName, &ds)
  1186  	if err != nil && !linuxerr.Equals(linuxerr.ENOENT, err) {
  1187  		return err
  1188  	}
  1189  	if replaced != nil {
  1190  		if opts.Flags&linux.RENAME_NOREPLACE != 0 {
  1191  			return linuxerr.EEXIST
  1192  		}
  1193  		replacedVFSD = &replaced.vfsd
  1194  		if replaced.isDir() {
  1195  			if !renamed.isDir() {
  1196  				return linuxerr.EISDIR
  1197  			}
  1198  			if genericIsAncestorDentry(replaced, renamed) {
  1199  				return linuxerr.ENOTEMPTY
  1200  			}
  1201  			replaced.dirMu.NestedLock(dirLockReplaced)
  1202  			defer replaced.dirMu.NestedUnlock(dirLockReplaced)
  1203  			whiteouts, err = replaced.collectWhiteoutsForRmdirLocked(ctx)
  1204  			if err != nil {
  1205  				return err
  1206  			}
  1207  		} else {
  1208  			if rp.MustBeDir() || renamed.isDir() {
  1209  				return linuxerr.ENOTDIR
  1210  			}
  1211  		}
  1212  	}
  1213  
  1214  	if oldParent == newParent && oldName == newName {
  1215  		return nil
  1216  	}
  1217  
  1218  	// renamed and oldParent need to be copied-up before they're renamed on the
  1219  	// upper layer.
  1220  	if err := renamed.copyUpLocked(ctx); err != nil {
  1221  		return err
  1222  	}
  1223  	// If renamed is a directory, all of its descendants need to be copied-up
  1224  	// before they're renamed on the upper layer.
  1225  	if renamed.isDir() {
  1226  		if err := renamed.copyUpDescendantsLocked(ctx, &ds); err != nil {
  1227  			return err
  1228  		}
  1229  	}
  1230  	// newParent must be copied-up before it can contain renamed on the upper
  1231  	// layer.
  1232  	if err := newParent.copyUpLocked(ctx); err != nil {
  1233  		return err
  1234  	}
  1235  	// If replaced exists, it doesn't need to be copied-up, but we do need to
  1236  	// serialize with copy-up. Holding renameMu for writing should be
  1237  	// sufficient, but out of an abundance of caution...
  1238  	if replaced != nil {
  1239  		replaced.copyMu.RLock()
  1240  		defer replaced.copyMu.RUnlock()
  1241  	}
  1242  
  1243  	vfsObj := rp.VirtualFilesystem()
  1244  	mntns := vfs.MountNamespaceFromContext(ctx)
  1245  	defer mntns.DecRef(ctx)
  1246  	if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
  1247  		return err
  1248  	}
  1249  
  1250  	newpop := vfs.PathOperation{
  1251  		Root:  newParent.upperVD,
  1252  		Start: newParent.upperVD,
  1253  		Path:  fspath.Parse(newName),
  1254  	}
  1255  
  1256  	needRecreateWhiteouts := false
  1257  	cleanupRecreateWhiteouts := func() {
  1258  		if !needRecreateWhiteouts {
  1259  			return
  1260  		}
  1261  		for whiteoutName, whiteoutUpper := range whiteouts {
  1262  			if !whiteoutUpper {
  1263  				continue
  1264  			}
  1265  			if err := CreateWhiteout(ctx, vfsObj, fs.creds, &vfs.PathOperation{
  1266  				Root:  replaced.upperVD,
  1267  				Start: replaced.upperVD,
  1268  				Path:  fspath.Parse(whiteoutName),
  1269  			}); err != nil && !linuxerr.Equals(linuxerr.EEXIST, err) {
  1270  				panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RenameAt failure: %v", err))
  1271  			}
  1272  		}
  1273  	}
  1274  	if renamed.isDir() {
  1275  		if replacedLayer == lookupLayerUpper {
  1276  			// Remove whiteouts from the directory being replaced.
  1277  			needRecreateWhiteouts = true
  1278  			for whiteoutName, whiteoutUpper := range whiteouts {
  1279  				if !whiteoutUpper {
  1280  					continue
  1281  				}
  1282  				if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{
  1283  					Root:  replaced.upperVD,
  1284  					Start: replaced.upperVD,
  1285  					Path:  fspath.Parse(whiteoutName),
  1286  				}); err != nil {
  1287  					vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
  1288  					cleanupRecreateWhiteouts()
  1289  					return err
  1290  				}
  1291  			}
  1292  		} else if replacedLayer == lookupLayerUpperWhiteout {
  1293  			// We need to explicitly remove the whiteout since otherwise rename
  1294  			// on the upper layer will fail with ENOTDIR.
  1295  			if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil {
  1296  				vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
  1297  				return err
  1298  			}
  1299  		}
  1300  	}
  1301  
  1302  	// Essentially no gVisor filesystem supports RENAME_WHITEOUT, so just do a
  1303  	// regular rename and create the whiteout at the origin manually. Unlike
  1304  	// RENAME_WHITEOUT, this isn't atomic with respect to other users of the
  1305  	// upper filesystem, but this is already the case for virtually all other
  1306  	// overlay filesystem operations too.
  1307  	oldpop := vfs.PathOperation{
  1308  		Root:  oldParent.upperVD,
  1309  		Start: oldParent.upperVD,
  1310  		Path:  fspath.Parse(oldName),
  1311  	}
  1312  	if err := vfsObj.RenameAt(ctx, creds, &oldpop, &newpop, &opts); err != nil {
  1313  		vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
  1314  		cleanupRecreateWhiteouts()
  1315  		return err
  1316  	}
  1317  
  1318  	// Below this point, the renamed dentry is now at newpop, and anything we
  1319  	// replaced is gone forever. Commit the rename, update the overlay
  1320  	// filesystem tree, and abandon attempts to recover from errors.
  1321  	toDecRef = vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD)
  1322  	delete(oldParent.children, oldName)
  1323  	if replaced != nil {
  1324  		// Lower dentries of replaced are not reachable from the overlay anymore.
  1325  		// NOTE(b/237573779): Ask lower filesystem to release resources for this
  1326  		// dentry whenever possible to reduce resource usage.
  1327  		for _, replaceLower := range replaced.lowerVDs {
  1328  			replaceLower.Dentry().MarkEvictable()
  1329  		}
  1330  		ds = appendDentry(ds, replaced)
  1331  	}
  1332  	if oldParent != newParent {
  1333  		newParent.dirents = nil
  1334  		// This can't drop the last reference on oldParent because one is held
  1335  		// by oldParentVD, so lock recursion is impossible.
  1336  		oldParent.DecRef(ctx)
  1337  		ds = appendDentry(ds, oldParent)
  1338  		newParent.IncRef()
  1339  		renamed.parent.Store(newParent)
  1340  	}
  1341  	renamed.name = newName
  1342  	if newParent.children == nil {
  1343  		newParent.children = make(map[string]*dentry)
  1344  	}
  1345  	newParent.children[newName] = renamed
  1346  	oldParent.dirents = nil
  1347  
  1348  	if err := CreateWhiteout(ctx, vfsObj, fs.creds, &oldpop); err != nil {
  1349  		panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout at origin after RenameAt: %v", err))
  1350  	}
  1351  	if renamed.isDir() {
  1352  		if err := vfsObj.SetXattrAt(ctx, fs.creds, &newpop, &vfs.SetXattrOptions{
  1353  			Name:  _OVL_XATTR_OPAQUE,
  1354  			Value: "y",
  1355  		}); err != nil {
  1356  			panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to make renamed directory opaque: %v", err))
  1357  		}
  1358  	}
  1359  
  1360  	vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir())
  1361  	return nil
  1362  }
  1363  
// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
//
// The directory named by rp is removed by deleting its upper-layer
// whiteouts and the upper-layer directory itself (when the child has an upper
// layer) and then creating a whiteout at its name in the parent's upper
// layer. "." is rejected with EINVAL and ".." with ENOTEMPTY.
func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
	var ds *[]*dentry
	fs.renameMu.RLock()
	// We need to DecRef outside of fs.renameMu because forgetting a dead
	// mountpoint could result in this filesystem being released which
	// acquires fs.renameMu. Deferred calls run in reverse order, so this runs
	// after the unlock deferred below.
	var toDecRef []refs.RefCounter
	defer func() {
		for _, ref := range toDecRef {
			ref.DecRef(ctx)
		}
	}()
	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
	start := rp.Start().Impl().(*dentry)
	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
	if err != nil {
		return err
	}
	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	if err := rp.Mount().CheckBeginWrite(); err != nil {
		return err
	}
	defer rp.Mount().EndWrite()
	name := rp.Component()
	if name == "." {
		return linuxerr.EINVAL
	}
	if name == ".." {
		return linuxerr.ENOTEMPTY
	}
	vfsObj := rp.VirtualFilesystem()
	mntns := vfs.MountNamespaceFromContext(ctx)
	defer mntns.DecRef(ctx)
	parent.dirMu.Lock()
	defer parent.dirMu.Unlock()

	// Ensure that parent is copied-up before potentially holding child.copyMu
	// below.
	if err := parent.copyUpLocked(ctx); err != nil {
		return err
	}

	// We need a dentry representing the child directory being removed in order
	// to verify that it's empty.
	child, _, err := fs.getChildLocked(ctx, parent, name, &ds)
	if err != nil {
		return err
	}
	if !child.isDir() {
		return linuxerr.ENOTDIR
	}
	if err := parent.mayDelete(rp.Credentials(), child); err != nil {
		return err
	}
	child.dirMu.NestedLock(dirLockChild)
	defer child.dirMu.NestedUnlock(dirLockChild)
	// whiteouts maps each whiteout name to a flag; entries whose flag is true
	// are the ones unlinked from child.upperVD below.
	whiteouts, err := child.collectWhiteoutsForRmdirLocked(ctx)
	if err != nil {
		return err
	}
	child.copyMu.RLock()
	defer child.copyMu.RUnlock()
	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
		return err
	}

	pop := vfs.PathOperation{
		Root:  parent.upperVD,
		Start: parent.upperVD,
		Path:  fspath.Parse(name),
	}
	if child.upperVD.Ok() {
		// cleanupRecreateWhiteouts restores the upper-layer whiteouts removed
		// below when the removal has to be abandoned. EEXIST is tolerated
		// since a whiteout may not have been unlinked yet when the failure
		// occurred.
		cleanupRecreateWhiteouts := func() {
			if !child.upperVD.Ok() {
				return
			}
			for whiteoutName, whiteoutUpper := range whiteouts {
				if !whiteoutUpper {
					continue
				}
				if err := CreateWhiteout(ctx, vfsObj, fs.creds, &vfs.PathOperation{
					Root:  child.upperVD,
					Start: child.upperVD,
					Path:  fspath.Parse(whiteoutName),
				}); err != nil && !linuxerr.Equals(linuxerr.EEXIST, err) {
					panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err))
				}
			}
		}
		// Remove existing whiteouts on the upper layer.
		for whiteoutName, whiteoutUpper := range whiteouts {
			if !whiteoutUpper {
				continue
			}
			if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{
				Root:  child.upperVD,
				Start: child.upperVD,
				Path:  fspath.Parse(whiteoutName),
			}); err != nil {
				vfsObj.AbortDeleteDentry(&child.vfsd)
				cleanupRecreateWhiteouts()
				return err
			}
		}
		// Remove the existing directory on the upper layer.
		if err := vfsObj.RmdirAt(ctx, fs.creds, &pop); err != nil {
			vfsObj.AbortDeleteDentry(&child.vfsd)
			cleanupRecreateWhiteouts()
			return err
		}
	}
	// Create a whiteout at the removed directory's name in the parent's upper
	// layer.
	if err := CreateWhiteout(ctx, vfsObj, fs.creds, &pop); err != nil {
		vfsObj.AbortDeleteDentry(&child.vfsd)
		if child.upperVD.Ok() {
			// Don't attempt to recover from this: the original directory is
			// already gone, so any dentries representing it are invalid, and
			// creating a new directory won't undo that.
			panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout after removing upper layer directory during RmdirAt: %v", err))
		}
		return err
	}

	// The removal succeeded: commit it and update the overlay's view of the
	// parent directory.
	toDecRef = vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
	delete(parent.children, name)
	ds = appendDentry(ds, child)
	parent.dirents = nil
	parent.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0 /* cookie */, vfs.InodeEvent, true /* unlinked */)
	return nil
}
  1495  
  1496  // SetStatAt implements vfs.FilesystemImpl.SetStatAt.
  1497  func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
  1498  	var ds *[]*dentry
  1499  	fs.renameMu.RLock()
  1500  	d, err := fs.resolveLocked(ctx, rp, &ds)
  1501  	if err != nil {
  1502  		fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1503  		return err
  1504  	}
  1505  	err = d.setStatLocked(ctx, rp, opts)
  1506  	fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1507  	if err != nil {
  1508  		return err
  1509  	}
  1510  
  1511  	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
  1512  		d.InotifyWithParent(ctx, ev, 0 /* cookie */, vfs.InodeEvent)
  1513  	}
  1514  	return nil
  1515  }
  1516  
  1517  // Precondition: d.fs.renameMu must be held for reading.
  1518  func (d *dentry) setStatLocked(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
  1519  	mode := linux.FileMode(d.mode.Load())
  1520  	if err := vfs.CheckSetStat(ctx, rp.Credentials(), &opts, mode, auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load())); err != nil {
  1521  		return err
  1522  	}
  1523  	mnt := rp.Mount()
  1524  	if err := mnt.CheckBeginWrite(); err != nil {
  1525  		return err
  1526  	}
  1527  	defer mnt.EndWrite()
  1528  	if err := d.copyUpLocked(ctx); err != nil {
  1529  		return err
  1530  	}
  1531  	// Changes to d's attributes are serialized by d.copyMu.
  1532  	d.copyMu.Lock()
  1533  	defer d.copyMu.Unlock()
  1534  	if err := d.fs.vfsfs.VirtualFilesystem().SetStatAt(ctx, d.fs.creds, &vfs.PathOperation{
  1535  		Root:  d.upperVD,
  1536  		Start: d.upperVD,
  1537  	}, &opts); err != nil {
  1538  		return err
  1539  	}
  1540  	d.updateAfterSetStatLocked(&opts)
  1541  	return nil
  1542  }
  1543  
  1544  // StatAt implements vfs.FilesystemImpl.StatAt.
  1545  func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
  1546  	var ds *[]*dentry
  1547  	fs.renameMu.RLock()
  1548  	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1549  	d, err := fs.resolveLocked(ctx, rp, &ds)
  1550  	if err != nil {
  1551  		return linux.Statx{}, err
  1552  	}
  1553  
  1554  	var stat linux.Statx
  1555  	if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 {
  1556  		layerVD := d.topLayer()
  1557  		stat, err = fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{
  1558  			Root:  layerVD,
  1559  			Start: layerVD,
  1560  		}, &vfs.StatOptions{
  1561  			Mask: layerMask,
  1562  			Sync: opts.Sync,
  1563  		})
  1564  		if err != nil {
  1565  			return linux.Statx{}, err
  1566  		}
  1567  	}
  1568  	d.statInternalTo(ctx, &opts, &stat)
  1569  	return stat, nil
  1570  }
  1571  
  1572  // StatFSAt implements vfs.FilesystemImpl.StatFSAt.
  1573  func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
  1574  	var ds *[]*dentry
  1575  	fs.renameMu.RLock()
  1576  	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1577  	_, err := fs.resolveLocked(ctx, rp, &ds)
  1578  	if err != nil {
  1579  		return linux.Statfs{}, err
  1580  	}
  1581  	return fs.statFS(ctx)
  1582  }
  1583  
  1584  // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
  1585  func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
  1586  	return fs.doCreateAt(ctx, rp, createNonDirectory, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
  1587  		vfsObj := fs.vfsfs.VirtualFilesystem()
  1588  		pop := vfs.PathOperation{
  1589  			Root:  parent.upperVD,
  1590  			Start: parent.upperVD,
  1591  			Path:  fspath.Parse(childName),
  1592  		}
  1593  		if haveUpperWhiteout {
  1594  			if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
  1595  				return err
  1596  			}
  1597  		}
  1598  		if err := vfsObj.SymlinkAt(ctx, fs.creds, &pop, target); err != nil {
  1599  			if haveUpperWhiteout {
  1600  				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
  1601  			}
  1602  			return err
  1603  		}
  1604  		creds := rp.Credentials()
  1605  		if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
  1606  			Stat: linux.Statx{
  1607  				Mask: linux.STATX_UID | linux.STATX_GID,
  1608  				UID:  uint32(creds.EffectiveKUID),
  1609  				GID:  uint32(creds.EffectiveKGID),
  1610  			},
  1611  		}); err != nil {
  1612  			if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
  1613  				panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr))
  1614  			} else if haveUpperWhiteout {
  1615  				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
  1616  			}
  1617  			return err
  1618  		}
  1619  		return nil
  1620  	})
  1621  }
  1622  
  1623  // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
  1624  func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
  1625  	var ds *[]*dentry
  1626  	fs.renameMu.RLock()
  1627  	// We need to DecRef outside of fs.renameMu because forgetting a dead
  1628  	// mountpoint could result in this filesystem being released which acquires
  1629  	// fs.renameMu.
  1630  	var toDecRef []refs.RefCounter
  1631  	defer func() {
  1632  		for _, ref := range toDecRef {
  1633  			ref.DecRef(ctx)
  1634  		}
  1635  	}()
  1636  	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1637  	start := rp.Start().Impl().(*dentry)
  1638  	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
  1639  	if err != nil {
  1640  		return err
  1641  	}
  1642  	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
  1643  		return err
  1644  	}
  1645  	if err := rp.Mount().CheckBeginWrite(); err != nil {
  1646  		return err
  1647  	}
  1648  	defer rp.Mount().EndWrite()
  1649  	name := rp.Component()
  1650  	if name == "." || name == ".." {
  1651  		return linuxerr.EISDIR
  1652  	}
  1653  	if rp.MustBeDir() {
  1654  		return linuxerr.ENOTDIR
  1655  	}
  1656  	vfsObj := rp.VirtualFilesystem()
  1657  	mntns := vfs.MountNamespaceFromContext(ctx)
  1658  	defer mntns.DecRef(ctx)
  1659  	parent.dirMu.Lock()
  1660  	defer parent.dirMu.Unlock()
  1661  
  1662  	// Ensure that parent is copied-up before potentially holding child.copyMu
  1663  	// below.
  1664  	if err := parent.copyUpLocked(ctx); err != nil {
  1665  		return err
  1666  	}
  1667  
  1668  	// We need a dentry representing the child being removed in order to verify
  1669  	// that it's not a directory.
  1670  	child, childLayer, err := fs.getChildLocked(ctx, parent, name, &ds)
  1671  	if err != nil {
  1672  		return err
  1673  	}
  1674  	if child.isDir() {
  1675  		return linuxerr.EISDIR
  1676  	}
  1677  	if err := parent.mayDelete(rp.Credentials(), child); err != nil {
  1678  		return err
  1679  	}
  1680  	// Hold child.copyMu to prevent it from being copied-up during
  1681  	// deletion.
  1682  	child.copyMu.RLock()
  1683  	defer child.copyMu.RUnlock()
  1684  	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
  1685  		return err
  1686  	}
  1687  
  1688  	pop := vfs.PathOperation{
  1689  		Root:  parent.upperVD,
  1690  		Start: parent.upperVD,
  1691  		Path:  fspath.Parse(name),
  1692  	}
  1693  	if childLayer == lookupLayerUpper {
  1694  		// Remove the existing file on the upper layer.
  1695  		if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
  1696  			vfsObj.AbortDeleteDentry(&child.vfsd)
  1697  			return err
  1698  		}
  1699  	}
  1700  	if err := CreateWhiteout(ctx, vfsObj, fs.creds, &pop); err != nil {
  1701  		vfsObj.AbortDeleteDentry(&child.vfsd)
  1702  		if childLayer == lookupLayerUpper {
  1703  			panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout after unlinking upper layer file during UnlinkAt: %v", err))
  1704  		}
  1705  		return err
  1706  	}
  1707  
  1708  	toDecRef = vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
  1709  	delete(parent.children, name)
  1710  	if !child.isDir() {
  1711  		// Once a whiteout is created, non-directory dentries on the lower layers
  1712  		// are no longer reachable from the overlayfs. Ask filesystems to release
  1713  		// their resources whenever possible.
  1714  		for _, lowerDentry := range child.lowerVDs {
  1715  			lowerDentry.Dentry().MarkEvictable()
  1716  		}
  1717  	}
  1718  	ds = appendDentry(ds, child)
  1719  	vfs.InotifyRemoveChild(ctx, &child.watches, &parent.watches, name)
  1720  	parent.dirents = nil
  1721  	return nil
  1722  }
  1723  
  1724  // isOverlayXattr returns whether the given extended attribute configures the
  1725  // overlay.
  1726  func isOverlayXattr(name string) bool {
  1727  	return strings.HasPrefix(name, _OVL_XATTR_PREFIX)
  1728  }
  1729  
  1730  // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
  1731  func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
  1732  	var ds *[]*dentry
  1733  	fs.renameMu.RLock()
  1734  	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1735  	d, err := fs.resolveLocked(ctx, rp, &ds)
  1736  	if err != nil {
  1737  		return nil, err
  1738  	}
  1739  
  1740  	return fs.listXattr(ctx, d, size)
  1741  }
  1742  
  1743  func (fs *filesystem) listXattr(ctx context.Context, d *dentry, size uint64) ([]string, error) {
  1744  	vfsObj := d.fs.vfsfs.VirtualFilesystem()
  1745  	top := d.topLayer()
  1746  	names, err := vfsObj.ListXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, size)
  1747  	if err != nil {
  1748  		return nil, err
  1749  	}
  1750  
  1751  	// Filter out all overlay attributes.
  1752  	n := 0
  1753  	for _, name := range names {
  1754  		if !isOverlayXattr(name) {
  1755  			names[n] = name
  1756  			n++
  1757  		}
  1758  	}
  1759  	return names[:n], err
  1760  }
  1761  
  1762  // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
  1763  func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
  1764  	var ds *[]*dentry
  1765  	fs.renameMu.RLock()
  1766  	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1767  	d, err := fs.resolveLocked(ctx, rp, &ds)
  1768  	if err != nil {
  1769  		return "", err
  1770  	}
  1771  
  1772  	return fs.getXattr(ctx, d, rp.Credentials(), &opts)
  1773  }
  1774  
  1775  func (fs *filesystem) getXattr(ctx context.Context, d *dentry, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
  1776  	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
  1777  		return "", err
  1778  	}
  1779  
  1780  	// Return EOPNOTSUPP when fetching an overlay attribute.
  1781  	// See fs/overlayfs/super.c:ovl_own_xattr_get().
  1782  	if isOverlayXattr(opts.Name) {
  1783  		return "", linuxerr.EOPNOTSUPP
  1784  	}
  1785  
  1786  	// Analogous to fs/overlayfs/super.c:ovl_other_xattr_get().
  1787  	vfsObj := d.fs.vfsfs.VirtualFilesystem()
  1788  	top := d.topLayer()
  1789  	return vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, opts)
  1790  }
  1791  
  1792  // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
  1793  func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
  1794  	var ds *[]*dentry
  1795  	fs.renameMu.RLock()
  1796  	d, err := fs.resolveLocked(ctx, rp, &ds)
  1797  	if err != nil {
  1798  		fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1799  		return err
  1800  	}
  1801  
  1802  	err = fs.setXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), &opts)
  1803  	fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1804  	if err != nil {
  1805  		return err
  1806  	}
  1807  
  1808  	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent)
  1809  	return nil
  1810  }
  1811  
  1812  // Precondition: fs.renameMu must be locked.
  1813  func (fs *filesystem) setXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
  1814  	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
  1815  		return err
  1816  	}
  1817  
  1818  	// Return EOPNOTSUPP when setting an overlay attribute.
  1819  	// See fs/overlayfs/super.c:ovl_own_xattr_set().
  1820  	if isOverlayXattr(opts.Name) {
  1821  		return linuxerr.EOPNOTSUPP
  1822  	}
  1823  
  1824  	// Analogous to fs/overlayfs/super.c:ovl_other_xattr_set().
  1825  	if err := mnt.CheckBeginWrite(); err != nil {
  1826  		return err
  1827  	}
  1828  	defer mnt.EndWrite()
  1829  	if err := d.copyUpLocked(ctx); err != nil {
  1830  		return err
  1831  	}
  1832  	vfsObj := d.fs.vfsfs.VirtualFilesystem()
  1833  	return vfsObj.SetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, opts)
  1834  }
  1835  
  1836  // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
  1837  func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
  1838  	var ds *[]*dentry
  1839  	fs.renameMu.RLock()
  1840  	d, err := fs.resolveLocked(ctx, rp, &ds)
  1841  	if err != nil {
  1842  		fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1843  		return err
  1844  	}
  1845  
  1846  	err = fs.removeXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), name)
  1847  	fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1848  	if err != nil {
  1849  		return err
  1850  	}
  1851  
  1852  	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent)
  1853  	return nil
  1854  }
  1855  
  1856  // Precondition: fs.renameMu must be locked.
  1857  func (fs *filesystem) removeXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, name string) error {
  1858  	if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
  1859  		return err
  1860  	}
  1861  
  1862  	// Like SetXattrAt, return EOPNOTSUPP when removing an overlay attribute.
  1863  	// Linux passes the remove request to xattr_handler->set.
  1864  	// See fs/xattr.c:vfs_removexattr().
  1865  	if isOverlayXattr(name) {
  1866  		return linuxerr.EOPNOTSUPP
  1867  	}
  1868  
  1869  	if err := mnt.CheckBeginWrite(); err != nil {
  1870  		return err
  1871  	}
  1872  	defer mnt.EndWrite()
  1873  	if err := d.copyUpLocked(ctx); err != nil {
  1874  		return err
  1875  	}
  1876  	vfsObj := d.fs.vfsfs.VirtualFilesystem()
  1877  	return vfsObj.RemoveXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, name)
  1878  }
  1879  
  1880  // PrependPath implements vfs.FilesystemImpl.PrependPath.
  1881  func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
  1882  	fs.renameMu.RLock()
  1883  	defer fs.renameMu.RUnlock()
  1884  	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b)
  1885  }
  1886  
  1887  // MountOptions implements vfs.FilesystemImpl.MountOptions.
  1888  func (fs *filesystem) MountOptions() string {
  1889  	// Return the mount options from the topmost layer.
  1890  	var vd vfs.VirtualDentry
  1891  	if fs.opts.UpperRoot.Ok() {
  1892  		vd = fs.opts.UpperRoot
  1893  	} else {
  1894  		vd = fs.opts.LowerRoots[0]
  1895  	}
  1896  	return vd.Mount().Filesystem().Impl().MountOptions()
  1897  }