github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/fsimpl/overlay/filesystem.go (about)

     1  // Copyright 2020 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package overlay
    16  
    17  import (
    18  	"fmt"
    19  	"strings"
    20  
    21  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    22  	"github.com/MerlinKodo/gvisor/pkg/atomicbitops"
    23  	"github.com/MerlinKodo/gvisor/pkg/context"
    24  	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
    25  	"github.com/MerlinKodo/gvisor/pkg/fspath"
    26  	"github.com/MerlinKodo/gvisor/pkg/log"
    27  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth"
    28  	"github.com/MerlinKodo/gvisor/pkg/sentry/socket/unix/transport"
    29  	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
    30  	"github.com/MerlinKodo/gvisor/pkg/sync"
    31  )
    32  
// _OVL_XATTR_PREFIX is the extended attribute key prefix that identifies
// overlayfs attributes. It is linux.XATTR_TRUSTED_PREFIX followed by
// "overlay.".
// Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_PREFIX
const _OVL_XATTR_PREFIX = linux.XATTR_TRUSTED_PREFIX + "overlay."

// _OVL_XATTR_OPAQUE is the extended attribute key whose value is set to "y"
// for opaque directories. lookupLocked stops merging a directory with lower
// layers once it finds this attribute set to "y".
// Linux: fs/overlayfs/overlayfs.h:OVL_XATTR_OPAQUE
const _OVL_XATTR_OPAQUE = _OVL_XATTR_PREFIX + "opaque"
    42  
    43  func isWhiteout(stat *linux.Statx) bool {
    44  	return stat.Mode&linux.S_IFMT == linux.S_IFCHR && stat.RdevMajor == 0 && stat.RdevMinor == 0
    45  }
    46  
    47  // Sync implements vfs.FilesystemImpl.Sync.
    48  func (fs *filesystem) Sync(ctx context.Context) error {
    49  	if fs.opts.UpperRoot.Ok() {
    50  		return fs.opts.UpperRoot.Mount().Filesystem().Impl().Sync(ctx)
    51  	}
    52  	return nil
    53  }
    54  
    55  var dentrySlicePool = sync.Pool{
    56  	New: func() any {
    57  		ds := make([]*dentry, 0, 4) // arbitrary non-zero initial capacity
    58  		return &ds
    59  	},
    60  }
    61  
    62  func appendDentry(ds *[]*dentry, d *dentry) *[]*dentry {
    63  	if ds == nil {
    64  		ds = dentrySlicePool.Get().(*[]*dentry)
    65  	}
    66  	*ds = append(*ds, d)
    67  	return ds
    68  }
    69  
    70  // Preconditions: ds != nil.
    71  func putDentrySlice(ds *[]*dentry) {
    72  	// Allow dentries to be GC'd.
    73  	for i := range *ds {
    74  		(*ds)[i] = nil
    75  	}
    76  	*ds = (*ds)[:0]
    77  	dentrySlicePool.Put(ds)
    78  }
    79  
    80  // renameMuRUnlockAndCheckDrop calls fs.renameMu.RUnlock(), then calls
    81  // dentry.checkDropLocked on all dentries in *dsp with fs.renameMu locked for
    82  // writing.
    83  //
    84  // dsp is a pointer-to-pointer since defer evaluates its arguments immediately,
    85  // but dentry slices are allocated lazily, and it's much easier to say "defer
    86  // fs.renameMuRUnlockAndCheckDrop(&ds)" than "defer func() {
    87  // fs.renameMuRUnlockAndCheckDrop(ds) }()" to work around this.
    88  //
    89  // +checklocksreleaseread:fs.renameMu
    90  func (fs *filesystem) renameMuRUnlockAndCheckDrop(ctx context.Context, dsp **[]*dentry) {
    91  	fs.renameMu.RUnlock()
    92  	if *dsp == nil {
    93  		return
    94  	}
    95  	ds := **dsp
    96  	// Only go through calling dentry.checkDropLocked() (which requires
    97  	// re-locking renameMu) if we actually have any dentries with zero refs.
    98  	checkAny := false
    99  	for i := range ds {
   100  		if ds[i].refs.Load() == 0 {
   101  			checkAny = true
   102  			break
   103  		}
   104  	}
   105  	if checkAny {
   106  		fs.renameMu.Lock()
   107  		for _, d := range ds {
   108  			d.checkDropLocked(ctx)
   109  		}
   110  		fs.renameMu.Unlock()
   111  	}
   112  	putDentrySlice(*dsp)
   113  }
   114  
   115  // +checklocksrelease:fs.renameMu
   116  func (fs *filesystem) renameMuUnlockAndCheckDrop(ctx context.Context, ds **[]*dentry) {
   117  	if *ds == nil {
   118  		fs.renameMu.Unlock()
   119  		return
   120  	}
   121  	for _, d := range **ds {
   122  		d.checkDropLocked(ctx)
   123  	}
   124  	fs.renameMu.Unlock()
   125  	putDentrySlice(*ds)
   126  }
   127  
   128  // stepLocked resolves rp.Component() to an existing file, starting from the
   129  // given directory.
   130  //
   131  // Dentries which may have a reference count of zero, and which therefore
   132  // should be dropped once traversal is complete, are appended to ds.
   133  //
   134  // Preconditions:
   135  //   - fs.renameMu must be locked.
   136  //   - d.dirMu must be locked.
   137  //   - !rp.Done().
   138  func (fs *filesystem) stepLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, lookupLayer, bool, error) {
   139  	if !d.isDir() {
   140  		return nil, lookupLayerNone, false, linuxerr.ENOTDIR
   141  	}
   142  	if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
   143  		return nil, lookupLayerNone, false, err
   144  	}
   145  	name := rp.Component()
   146  	if name == "." {
   147  		rp.Advance()
   148  		return d, d.topLookupLayer(), false, nil
   149  	}
   150  	if name == ".." {
   151  		if isRoot, err := rp.CheckRoot(ctx, &d.vfsd); err != nil {
   152  			return nil, lookupLayerNone, false, err
   153  		} else if isRoot || d.parent == nil {
   154  			rp.Advance()
   155  			return d, d.topLookupLayer(), false, nil
   156  		}
   157  		if err := rp.CheckMount(ctx, &d.parent.vfsd); err != nil {
   158  			return nil, lookupLayerNone, false, err
   159  		}
   160  		rp.Advance()
   161  		return d.parent, d.parent.topLookupLayer(), false, nil
   162  	}
   163  	if uint64(len(name)) > fs.maxFilenameLen {
   164  		return nil, lookupLayerNone, false, linuxerr.ENAMETOOLONG
   165  	}
   166  	child, topLookupLayer, err := fs.getChildLocked(ctx, d, name, ds)
   167  	if err != nil {
   168  		return nil, topLookupLayer, false, err
   169  	}
   170  	if err := rp.CheckMount(ctx, &child.vfsd); err != nil {
   171  		return nil, lookupLayerNone, false, err
   172  	}
   173  	if child.isSymlink() && rp.ShouldFollowSymlink() {
   174  		target, err := child.readlink(ctx)
   175  		if err != nil {
   176  			return nil, lookupLayerNone, false, err
   177  		}
   178  		followedSymlink, err := rp.HandleSymlink(target)
   179  		return d, topLookupLayer, followedSymlink, err
   180  	}
   181  	rp.Advance()
   182  	return child, topLookupLayer, false, nil
   183  }
   184  
   185  // Preconditions:
   186  //   - fs.renameMu must be locked.
   187  //   - d.dirMu must be locked.
   188  func (fs *filesystem) getChildLocked(ctx context.Context, parent *dentry, name string, ds **[]*dentry) (*dentry, lookupLayer, error) {
   189  	if child, ok := parent.children[name]; ok {
   190  		return child, child.topLookupLayer(), nil
   191  	}
   192  	child, topLookupLayer, err := fs.lookupLocked(ctx, parent, name)
   193  	if err != nil {
   194  		return nil, topLookupLayer, err
   195  	}
   196  	if parent.children == nil {
   197  		parent.children = make(map[string]*dentry)
   198  	}
   199  	parent.children[name] = child
   200  	// child's refcount is initially 0, so it may be dropped after traversal.
   201  	*ds = appendDentry(*ds, child)
   202  	return child, topLookupLayer, nil
   203  }
   204  
// lookupLocked builds a new dentry for the file called name in the directory
// parent by visiting parent's layers through parent.iterLayers and recording
// every layer that contributes to the file.
//
// It returns the new dentry together with the topmost layer on which the
// file was found. If a layer lookup fails, that error is returned; if the
// file does not exist in the overlay (including when it is whited out),
// ENOENT is returned. In both failure cases the partially-built dentry is
// destroyed. On success a reference is taken on parent, and the child's
// parent and name fields are set. The child is NOT inserted into
// parent.children; see getChildLocked.
//
// Preconditions:
//   - fs.renameMu must be locked.
//   - parent.dirMu must be locked.
func (fs *filesystem) lookupLocked(ctx context.Context, parent *dentry, name string) (*dentry, lookupLayer, error) {
	childPath := fspath.Parse(name)
	child := fs.newDentry()
	topLookupLayer := lookupLayerNone
	var lookupErr error

	vfsObj := fs.vfsfs.VirtualFilesystem()
	// The callback returns true to continue to the next layer and false to
	// stop iterating.
	// NOTE(review): the logic below relies on iterLayers visiting layers from
	// topmost to bottommost - confirm against dentry.iterLayers.
	parent.iterLayers(func(parentVD vfs.VirtualDentry, isUpper bool) bool {
		childVD, err := vfsObj.GetDentryAt(ctx, fs.creds, &vfs.PathOperation{
			Root:  parentVD,
			Start: parentVD,
			Path:  childPath,
		}, &vfs.GetDentryOptions{})
		if linuxerr.Equals(linuxerr.ENOENT, err) || linuxerr.Equals(linuxerr.ENAMETOOLONG, err) {
			// The file doesn't exist on this layer. Proceed to the next one.
			return true
		}
		if err != nil {
			lookupErr = err
			return false
		}
		// Drop the lookup reference when this callback returns; layers that
		// are kept by child take their own reference below.
		defer childVD.DecRef(ctx)

		mask := uint32(linux.STATX_TYPE)
		if topLookupLayer == lookupLayerNone {
			// Mode, UID, GID, and (for non-directories) inode number come from
			// the topmost layer on which the file exists.
			mask |= linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
		}
		stat, err := vfsObj.StatAt(ctx, fs.creds, &vfs.PathOperation{
			Root:  childVD,
			Start: childVD,
		}, &vfs.StatOptions{
			Mask: mask,
		})
		if err != nil {
			lookupErr = err
			return false
		}
		if stat.Mask&mask != mask {
			// The layer did not supply every field that was requested, so the
			// file is unusable here.
			lookupErr = linuxerr.EREMOTE
			return false
		}

		if isWhiteout(&stat) {
			// This is a whiteout, so it "doesn't exist" on this layer, and
			// layers below this one are ignored.
			if isUpper {
				topLookupLayer = lookupLayerUpperWhiteout
			}
			return false
		}
		isDir := stat.Mode&linux.S_IFMT == linux.S_IFDIR
		if topLookupLayer != lookupLayerNone && !isDir {
			// Directories are not merged with non-directory files from lower
			// layers; instead, layers including and below the first
			// non-directory file are ignored. (This file must be a directory
			// on previous layers, since lower layers aren't searched for
			// non-directory files.)
			return false
		}

		// Update child to include this layer.
		childVD.IncRef()
		if isUpper {
			child.upperVD = childVD
			child.copiedUp = atomicbitops.FromUint32(1)
		} else {
			child.lowerVDs = append(child.lowerVDs, childVD)
		}
		if topLookupLayer == lookupLayerNone {
			// This is the first (topmost) layer on which the file exists, so
			// it determines the file's metadata.
			if isUpper {
				topLookupLayer = lookupLayerUpper
			} else {
				topLookupLayer = lookupLayerLower
			}
			child.mode = atomicbitops.FromUint32(uint32(stat.Mode))
			child.uid = atomicbitops.FromUint32(stat.UID)
			child.gid = atomicbitops.FromUint32(stat.GID)
			child.devMajor = atomicbitops.FromUint32(stat.DevMajor)
			child.devMinor = atomicbitops.FromUint32(stat.DevMinor)
			child.ino = atomicbitops.FromUint64(stat.Ino)
		}

		// For non-directory files, only the topmost layer that contains a file
		// matters.
		if !isDir {
			return false
		}

		// Directories use the lowest layer inode and device numbers to generate a
		// filesystem local inode number. This way the inode number does not change
		// after copy ups. These fields are therefore overwritten on every
		// directory layer visited, leaving the values of the last one.
		child.devMajor = atomicbitops.FromUint32(stat.DevMajor)
		child.devMinor = atomicbitops.FromUint32(stat.DevMinor)
		child.ino = atomicbitops.FromUint64(stat.Ino)

		// Directories are merged with directories from lower layers if they
		// are not explicitly opaque.
		opaqueVal, err := vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{
			Root:  childVD,
			Start: childVD,
		}, &vfs.GetXattrOptions{
			Name: _OVL_XATTR_OPAQUE,
			Size: 1,
		})
		// Keep descending unless the attribute was read successfully and is
		// "y"; a failed read is treated as "not opaque".
		return !(err == nil && opaqueVal == "y")
	})

	if lookupErr != nil {
		child.destroyLocked(ctx)
		return nil, topLookupLayer, lookupErr
	}
	if !topLookupLayer.existsInOverlay() {
		// Either no layer has the file or an upper whiteout hides it.
		child.destroyLocked(ctx)
		return nil, topLookupLayer, linuxerr.ENOENT
	}

	// Device and inode numbers were copied from the topmost layer above for
	// non-directories. They were copied from the bottommost layer for
	// directories. Override them if necessary. We can use RacyLoad() because
	// child is still being initialized.
	if child.isDir() {
		child.ino.Store(fs.newDirIno(child.devMajor.RacyLoad(), child.devMinor.RacyLoad(), child.ino.RacyLoad()))
		child.devMajor = atomicbitops.FromUint32(linux.UNNAMED_MAJOR)
		child.devMinor = atomicbitops.FromUint32(fs.dirDevMinor)
	} else if !child.upperVD.Ok() {
		// Non-directory that exists only on a lower layer: map the lower
		// layer's device number to an overlay-specific one.
		childDevMinor, err := fs.getLowerDevMinor(child.devMajor.RacyLoad(), child.devMinor.RacyLoad())
		if err != nil {
			ctx.Infof("overlay.filesystem.lookupLocked: failed to map lower layer device number (%d, %d) to an overlay-specific device number: %v", child.devMajor.RacyLoad(), child.devMinor.RacyLoad(), err)
			child.destroyLocked(ctx)
			return nil, topLookupLayer, err
		}
		child.devMajor = atomicbitops.FromUint32(linux.UNNAMED_MAJOR)
		child.devMinor = atomicbitops.FromUint32(childDevMinor)
	}

	// The child holds a reference on its parent.
	parent.IncRef()
	child.parent = parent
	child.name = name
	return child, topLookupLayer, nil
}
   350  
   351  // lookupLayerLocked is similar to lookupLocked, but only returns information
   352  // about the file rather than a dentry.
   353  //
   354  // Preconditions:
   355  //   - fs.renameMu must be locked.
   356  //   - parent.dirMu must be locked.
   357  func (fs *filesystem) lookupLayerLocked(ctx context.Context, parent *dentry, name string) (lookupLayer, error) {
   358  	childPath := fspath.Parse(name)
   359  	lookupLayer := lookupLayerNone
   360  	var lookupErr error
   361  
   362  	parent.iterLayers(func(parentVD vfs.VirtualDentry, isUpper bool) bool {
   363  		stat, err := fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{
   364  			Root:  parentVD,
   365  			Start: parentVD,
   366  			Path:  childPath,
   367  		}, &vfs.StatOptions{
   368  			Mask: linux.STATX_TYPE,
   369  		})
   370  		if linuxerr.Equals(linuxerr.ENOENT, err) || linuxerr.Equals(linuxerr.ENAMETOOLONG, err) {
   371  			// The file doesn't exist on this layer. Proceed to the next
   372  			// one.
   373  			return true
   374  		}
   375  		if err != nil {
   376  			lookupErr = err
   377  			return false
   378  		}
   379  		if stat.Mask&linux.STATX_TYPE == 0 {
   380  			// Linux's overlayfs tends to return EREMOTE in cases where a file
   381  			// is unusable for reasons that are not better captured by another
   382  			// errno.
   383  			lookupErr = linuxerr.EREMOTE
   384  			return false
   385  		}
   386  		if isWhiteout(&stat) {
   387  			// This is a whiteout, so it "doesn't exist" on this layer, and
   388  			// layers below this one are ignored.
   389  			if isUpper {
   390  				lookupLayer = lookupLayerUpperWhiteout
   391  			}
   392  			return false
   393  		}
   394  		// The file exists; we can stop searching.
   395  		if isUpper {
   396  			lookupLayer = lookupLayerUpper
   397  		} else {
   398  			lookupLayer = lookupLayerLower
   399  		}
   400  		return false
   401  	})
   402  
   403  	return lookupLayer, lookupErr
   404  }
   405  
// lookupLayer describes where, if anywhere, a file was found when looking it
// up across the overlay's layers. It is returned by lookupLocked and
// lookupLayerLocked.
type lookupLayer int

const (
	// lookupLayerNone indicates that no file exists at the given path on the
	// upper layer, and is either whited out or does not exist on lower layers.
	// Therefore, the file does not exist in the overlay filesystem, and file
	// creation may proceed normally (if an upper layer exists).
	lookupLayerNone lookupLayer = iota

	// lookupLayerLower indicates that no file exists at the given path on the
	// upper layer, but exists on a lower layer. Therefore, the file exists in
	// the overlay filesystem, but must be copied-up before mutation.
	lookupLayerLower

	// lookupLayerUpper indicates that a non-whiteout file exists at the given
	// path on the upper layer. Therefore, the file exists in the overlay
	// filesystem, and is already copied-up.
	lookupLayerUpper

	// lookupLayerUpperWhiteout indicates that a whiteout exists at the given
	// path on the upper layer. Therefore, the file does not exist in the
	// overlay filesystem, and file creation must remove the whiteout before
	// proceeding.
	lookupLayerUpperWhiteout
)
   431  
   432  func (ll lookupLayer) existsInOverlay() bool {
   433  	return ll == lookupLayerLower || ll == lookupLayerUpper
   434  }
   435  
   436  // walkParentDirLocked resolves all but the last path component of rp to an
   437  // existing directory, starting from the given directory (which is usually
   438  // rp.Start().Impl().(*dentry)). It does not check that the returned directory
   439  // is searchable by the provider of rp.
   440  //
   441  // Preconditions:
   442  //   - fs.renameMu must be locked.
   443  //   - !rp.Done().
   444  func (fs *filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath, d *dentry, ds **[]*dentry) (*dentry, error) {
   445  	for !rp.Final() {
   446  		d.dirMu.Lock()
   447  		next, _, _, err := fs.stepLocked(ctx, rp, d, ds)
   448  		d.dirMu.Unlock()
   449  		if err != nil {
   450  			return nil, err
   451  		}
   452  		d = next
   453  	}
   454  	if !d.isDir() {
   455  		return nil, linuxerr.ENOTDIR
   456  	}
   457  	return d, nil
   458  }
   459  
   460  // resolveLocked resolves rp to an existing file.
   461  //
   462  // Preconditions: fs.renameMu must be locked.
   463  func (fs *filesystem) resolveLocked(ctx context.Context, rp *vfs.ResolvingPath, ds **[]*dentry) (*dentry, error) {
   464  	d := rp.Start().Impl().(*dentry)
   465  	for !rp.Done() {
   466  		d.dirMu.Lock()
   467  		next, _, _, err := fs.stepLocked(ctx, rp, d, ds)
   468  		d.dirMu.Unlock()
   469  		if err != nil {
   470  			return nil, err
   471  		}
   472  		d = next
   473  	}
   474  	if rp.MustBeDir() && !d.isDir() {
   475  		return nil, linuxerr.ENOTDIR
   476  	}
   477  	return d, nil
   478  }
   479  
// createType tells doCreateAt what kind of file is being created.
type createType int

const (
	// createNonDirectory is used when creating anything other than a
	// directory (LinkAt and MknodAt pass it). doCreateAt rejects it with
	// ENOENT when the path must resolve to a directory.
	createNonDirectory createType = iota

	// createDirectory is used by MkdirAt for ordinary directory creation.
	createDirectory

	// createSyntheticMountpoint is used by MkdirAt when
	// opts.ForSyntheticMountpoint is set; doCreateAt forwards this fact to
	// parent.copyUpMaybeSyntheticMountpointLocked.
	createSyntheticMountpoint
)
   487  
// doCreateAt checks that creating a file at rp is permitted, then invokes
// create to do so.
//
// create is called with the (already copied-up) parent directory, the name of
// the file to create, and whether a whiteout currently occupies that name on
// the upper layer. After create succeeds, an IN_CREATE inotify event is
// raised on the parent (with IN_ISDIR unless ct is createNonDirectory).
//
// Errors returned directly by doCreateAt include EEXIST (name is "." or "..",
// or the file already exists), ENAMETOOLONG, ENOENT (parent is dead, or a
// non-directory was requested at a path that must be a directory), and any
// error from permission checks, the mount write check, copy-up, or create.
//
// Preconditions:
//   - !rp.Done().
//   - For the final path component in rp, !rp.ShouldFollowSymlink().
func (fs *filesystem) doCreateAt(ctx context.Context, rp *vfs.ResolvingPath, ct createType, create func(parent *dentry, name string, haveUpperWhiteout bool) error) error {
	var ds *[]*dentry
	fs.renameMu.RLock()
	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
	start := rp.Start().Impl().(*dentry)
	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
	if err != nil {
		return err
	}
	name := rp.Component()
	if name == "." || name == ".." {
		return linuxerr.EEXIST
	}
	if uint64(len(name)) > fs.maxFilenameLen {
		return linuxerr.ENAMETOOLONG
	}
	if parent.vfsd.IsDead() {
		return linuxerr.ENOENT
	}

	// Search permission on the parent is required just to find out whether
	// the file exists; write permission is checked further below.
	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
		return err
	}

	parent.dirMu.Lock()
	defer parent.dirMu.Unlock()

	// Determine if a file already exists at name.
	if _, ok := parent.children[name]; ok {
		return linuxerr.EEXIST
	}
	childLayer, err := fs.lookupLayerLocked(ctx, parent, name)
	if err != nil {
		return err
	}
	if childLayer.existsInOverlay() {
		return linuxerr.EEXIST
	}

	if ct == createNonDirectory && rp.MustBeDir() {
		return linuxerr.ENOENT
	}

	mnt := rp.Mount()
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()
	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	// Ensure that the parent directory is copied-up so that we can create the
	// new file in the upper layer.
	if err := parent.copyUpMaybeSyntheticMountpointLocked(ctx, ct == createSyntheticMountpoint); err != nil {
		return err
	}

	// Finally create the new file.
	if err := create(parent, name, childLayer == lookupLayerUpperWhiteout); err != nil {
		return err
	}

	// NOTE(review): presumably this discards the parent's cached directory
	// entries so they are rebuilt to include the new file - confirm against
	// the users of dentry.dirents.
	parent.dirents = nil
	ev := linux.IN_CREATE
	if ct != createNonDirectory {
		ev |= linux.IN_ISDIR
	}
	parent.watches.Notify(ctx, name, uint32(ev), 0 /* cookie */, vfs.InodeEvent, false /* unlinked */)
	return nil
}
   564  
   565  // CreateWhiteout creates a whiteout at pop. Whiteouts are created with
   566  // character devices with device ID = 0.
   567  //
   568  // Preconditions: pop's parent directory has been copied up.
   569  func CreateWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, pop *vfs.PathOperation) error {
   570  	return vfsObj.MknodAt(ctx, creds, pop, &vfs.MknodOptions{
   571  		Mode: linux.S_IFCHR, // permissions == include/linux/fs.h:WHITEOUT_MODE == 0
   572  		// DevMajor == DevMinor == 0, from include/linux/fs.h:WHITEOUT_DEV
   573  	})
   574  }
   575  
   576  func (fs *filesystem) cleanupRecreateWhiteout(ctx context.Context, vfsObj *vfs.VirtualFilesystem, pop *vfs.PathOperation) {
   577  	if err := CreateWhiteout(ctx, vfsObj, fs.creds, pop); err != nil {
   578  		panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate whiteout after failed file creation: %v", err))
   579  	}
   580  }
   581  
   582  // AccessAt implements vfs.Filesystem.Impl.AccessAt.
   583  func (fs *filesystem) AccessAt(ctx context.Context, rp *vfs.ResolvingPath, creds *auth.Credentials, ats vfs.AccessTypes) error {
   584  	var ds *[]*dentry
   585  	fs.renameMu.RLock()
   586  	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
   587  	d, err := fs.resolveLocked(ctx, rp, &ds)
   588  	if err != nil {
   589  		return err
   590  	}
   591  	if err := d.checkPermissions(creds, ats); err != nil {
   592  		return err
   593  	}
   594  	if !ats.MayWrite() {
   595  		// Not requesting write permission.  Allow it.
   596  		return nil
   597  	}
   598  	if rp.Mount().ReadOnly() {
   599  		return linuxerr.EROFS
   600  	}
   601  	if !d.upperVD.Ok() && !d.canBeCopiedUp() {
   602  		// A lower layer file that can not be copied up, can not be written to.
   603  		// Error out here. Don't give the application false hopes.
   604  		return linuxerr.EACCES
   605  	}
   606  	return nil
   607  }
   608  
   609  // BoundEndpointAt implements vfs.FilesystemImpl.BoundEndpointAt.
   610  func (fs *filesystem) BoundEndpointAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.BoundEndpointOptions) (transport.BoundEndpoint, error) {
   611  	var ds *[]*dentry
   612  	fs.renameMu.RLock()
   613  	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
   614  	d, err := fs.resolveLocked(ctx, rp, &ds)
   615  	if err != nil {
   616  		return nil, err
   617  	}
   618  	if err := d.checkPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
   619  		return nil, err
   620  	}
   621  	layerVD := d.topLayer()
   622  	return fs.vfsfs.VirtualFilesystem().BoundEndpointAt(ctx, fs.creds, &vfs.PathOperation{
   623  		Root:  layerVD,
   624  		Start: layerVD,
   625  	}, &opts)
   626  }
   627  
   628  // GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
   629  func (fs *filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
   630  	var ds *[]*dentry
   631  	fs.renameMu.RLock()
   632  	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
   633  	d, err := fs.resolveLocked(ctx, rp, &ds)
   634  	if err != nil {
   635  		return nil, err
   636  	}
   637  	if opts.CheckSearchable {
   638  		if !d.isDir() {
   639  			return nil, linuxerr.ENOTDIR
   640  		}
   641  		if err := d.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
   642  			return nil, err
   643  		}
   644  	}
   645  	d.IncRef()
   646  	return &d.vfsd, nil
   647  }
   648  
   649  // GetParentDentryAt implements vfs.FilesystemImpl.GetParentDentryAt.
   650  func (fs *filesystem) GetParentDentryAt(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, error) {
   651  	var ds *[]*dentry
   652  	fs.renameMu.RLock()
   653  	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
   654  	start := rp.Start().Impl().(*dentry)
   655  	d, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
   656  	if err != nil {
   657  		return nil, err
   658  	}
   659  	d.IncRef()
   660  	return &d.vfsd, nil
   661  }
   662  
// LinkAt implements vfs.FilesystemImpl.LinkAt.
//
// The link is created on the upper layer, so the link target vd is copied up
// first. Linking across mounts fails with EXDEV and linking a directory fails
// with EPERM. If a later step fails after the upper layer has been modified,
// the modification is rolled back (including recreating a removed whiteout);
// a failed rollback panics because the overlay would be left inconsistent.
func (fs *filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
	return fs.doCreateAt(ctx, rp, createNonDirectory, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
		if rp.Mount() != vd.Mount() {
			return linuxerr.EXDEV
		}
		old := vd.Dentry().Impl().(*dentry)
		if old.isDir() {
			return linuxerr.EPERM
		}
		// The hard link is made on the upper layer, so the existing file must
		// have an upper-layer copy.
		if err := old.copyUpLocked(ctx); err != nil {
			return err
		}
		vfsObj := fs.vfsfs.VirtualFilesystem()
		newpop := vfs.PathOperation{
			Root:  parent.upperVD,
			Start: parent.upperVD,
			Path:  fspath.Parse(childName),
		}
		if haveUpperWhiteout {
			// The whiteout occupies the name of the new link; remove it first.
			if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil {
				return err
			}
		}
		if err := vfsObj.LinkAt(ctx, fs.creds, &vfs.PathOperation{
			Root:  old.upperVD,
			Start: old.upperVD,
		}, &newpop); err != nil {
			if haveUpperWhiteout {
				fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop)
			}
			return err
		}
		// The link was created with fs.creds; set its owner to the caller's
		// effective UID/GID.
		creds := rp.Credentials()
		if err := vfsObj.SetStatAt(ctx, fs.creds, &newpop, &vfs.SetStatOptions{
			Stat: linux.Statx{
				Mask: linux.STATX_UID | linux.STATX_GID,
				UID:  uint32(creds.EffectiveKUID),
				GID:  uint32(creds.EffectiveKGID),
			},
		}); err != nil {
			// Roll back: remove the new link, then restore the whiteout if one
			// was removed above.
			if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); cleanupErr != nil {
				panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after LinkAt metadata update failure: %v", cleanupErr))
			} else if haveUpperWhiteout {
				fs.cleanupRecreateWhiteout(ctx, vfsObj, &newpop)
			}
			return err
		}
		old.watches.Notify(ctx, "", linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent, false /* unlinked */)
		return nil
	})
}
   715  
// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
//
// The directory is created on the upper layer and then given the owner/mode
// computed by parent.newChildOwnerStat. When it replaces a whiteout it is
// marked opaque, and failure to do so is an error; when the parent has lower
// layers it is marked opaque as a best-effort optimization. If a required
// step fails after the upper layer has been modified, the modification is
// rolled back; a failed rollback panics because the overlay would be left
// inconsistent.
func (fs *filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
	ct := createDirectory
	if opts.ForSyntheticMountpoint {
		ct = createSyntheticMountpoint
	}
	return fs.doCreateAt(ctx, rp, ct, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
		vfsObj := fs.vfsfs.VirtualFilesystem()
		pop := vfs.PathOperation{
			Root:  parent.upperVD,
			Start: parent.upperVD,
			Path:  fspath.Parse(childName),
		}
		if haveUpperWhiteout {
			// The whiteout occupies the name of the new directory; remove it
			// first.
			if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
				return err
			}
		}
		if err := vfsObj.MkdirAt(ctx, fs.creds, &pop, &opts); err != nil {
			if haveUpperWhiteout {
				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
			}
			return err
		}

		// The directory was created with fs.creds; apply the ownership and
		// mode appropriate for the caller.
		if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
			Stat: parent.newChildOwnerStat(opts.Mode, rp.Credentials()),
		}); err != nil {
			// Roll back: remove the new directory, then restore the whiteout
			// if one was removed above.
			if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
				panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt metadata update failure: %v", cleanupErr))
			} else if haveUpperWhiteout {
				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
			}
			return err
		}
		if haveUpperWhiteout {
			// A whiteout is being replaced with this new directory. There may be
			// directories on lower layers (previously hidden by the whiteout) that
			// the new directory should not be merged with, so mark as opaque.
			// See fs/overlayfs/dir.c:ovl_create_over_whiteout() -> ovl_set_opaque().
			if err := vfsObj.SetXattrAt(ctx, fs.creds, &pop, &vfs.SetXattrOptions{
				Name:  _OVL_XATTR_OPAQUE,
				Value: "y",
			}); err != nil {
				// Without the opaque marker, hidden lower directories would
				// reappear, so undo the creation and restore the whiteout.
				if cleanupErr := vfsObj.RmdirAt(ctx, fs.creds, &pop); cleanupErr != nil {
					panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer directory after MkdirAt set-opaque failure: %v", cleanupErr))
				} else {
					fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
				}
				return err
			}
		} else if len(parent.lowerVDs) > 0 {
			// If haveUpperWhiteout is false and the parent is merged, then we should
			// apply an optimization. We know that nothing exists on the parent's
			// lower layers. Otherwise doCreateAt() would have failed with EEXIST.
			// Mark the new directory opaque to avoid unnecessary lower lookups in
			// fs.lookupLocked(). Allow it to fail since this is an optimization.
			// See fs/overlayfs/dir.c:ovl_create_upper() -> ovl_set_opaque().
			_ = vfsObj.SetXattrAt(ctx, fs.creds, &pop, &vfs.SetXattrOptions{
				Name:  _OVL_XATTR_OPAQUE,
				Value: "y",
			})
		}
		return nil
	})
}
   782  
   783  // MknodAt implements vfs.FilesystemImpl.MknodAt.
   784  func (fs *filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
   785  	return fs.doCreateAt(ctx, rp, createNonDirectory, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
   786  		// Disallow attempts to create whiteouts.
   787  		if opts.Mode&linux.S_IFMT == linux.S_IFCHR && opts.DevMajor == 0 && opts.DevMinor == 0 {
   788  			return linuxerr.EPERM
   789  		}
   790  		vfsObj := fs.vfsfs.VirtualFilesystem()
   791  		pop := vfs.PathOperation{
   792  			Root:  parent.upperVD,
   793  			Start: parent.upperVD,
   794  			Path:  fspath.Parse(childName),
   795  		}
   796  		if haveUpperWhiteout {
   797  			if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
   798  				return err
   799  			}
   800  		}
   801  		if err := vfsObj.MknodAt(ctx, fs.creds, &pop, &opts); err != nil {
   802  			if haveUpperWhiteout {
   803  				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
   804  			}
   805  			return err
   806  		}
   807  		creds := rp.Credentials()
   808  		if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
   809  			Stat: parent.newChildOwnerStat(opts.Mode, creds),
   810  		}); err != nil {
   811  			if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
   812  				panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after MknodAt metadata update failure: %v", cleanupErr))
   813  			} else if haveUpperWhiteout {
   814  				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
   815  			}
   816  			return err
   817  		}
   818  		return nil
   819  	})
   820  }
   821  
   822  // OpenAt implements vfs.FilesystemImpl.OpenAt.
   823  func (fs *filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
   824  	mayCreate := opts.Flags&linux.O_CREAT != 0
   825  	mustCreate := opts.Flags&(linux.O_CREAT|linux.O_EXCL) == (linux.O_CREAT | linux.O_EXCL)
   826  
   827  	var ds *[]*dentry
   828  	fs.renameMu.RLock()
   829  	unlocked := false
   830  	unlock := func() {
   831  		if !unlocked {
   832  			fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
   833  			unlocked = true
   834  		}
   835  	}
   836  	defer unlock()
   837  
   838  	start := rp.Start().Impl().(*dentry)
   839  	if rp.Done() {
   840  		if mayCreate && rp.MustBeDir() {
   841  			return nil, linuxerr.EISDIR
   842  		}
   843  		if mustCreate {
   844  			return nil, linuxerr.EEXIST
   845  		}
   846  		if err := start.ensureOpenableLocked(ctx, rp, &opts); err != nil {
   847  			return nil, err
   848  		}
   849  		start.IncRef()
   850  		defer start.DecRef(ctx)
   851  		unlock()
   852  		return start.openCopiedUp(ctx, rp, &opts)
   853  	}
   854  
   855  afterTrailingSymlink:
   856  	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
   857  	if err != nil {
   858  		return nil, err
   859  	}
   860  	// Check for search permission in the parent directory.
   861  	if err := parent.checkPermissions(rp.Credentials(), vfs.MayExec); err != nil {
   862  		return nil, err
   863  	}
   864  	// Reject attempts to open directories with O_CREAT.
   865  	if mayCreate && rp.MustBeDir() {
   866  		return nil, linuxerr.EISDIR
   867  	}
   868  	// Determine whether or not we need to create a file.
   869  	parent.dirMu.Lock()
   870  	child, topLookupLayer, followedSymlink, err := fs.stepLocked(ctx, rp, parent, &ds)
   871  	if followedSymlink {
   872  		parent.dirMu.Unlock()
   873  		if mustCreate {
   874  			// EEXIST must be returned if an existing symlink is opened with O_EXCL.
   875  			return nil, linuxerr.EEXIST
   876  		}
   877  		if err != nil {
   878  			// If followedSymlink && err != nil, then this symlink resolution error
   879  			// must be handled by the VFS layer.
   880  			return nil, err
   881  		}
   882  		start = parent
   883  		goto afterTrailingSymlink
   884  	}
   885  	if linuxerr.Equals(linuxerr.ENOENT, err) && mayCreate {
   886  		fd, err := fs.createAndOpenLocked(ctx, rp, parent, &opts, &ds, topLookupLayer == lookupLayerUpperWhiteout)
   887  		parent.dirMu.Unlock()
   888  		return fd, err
   889  	}
   890  	parent.dirMu.Unlock()
   891  	if err != nil {
   892  		return nil, err
   893  	}
   894  	if mustCreate {
   895  		return nil, linuxerr.EEXIST
   896  	}
   897  	if rp.MustBeDir() && !child.isDir() {
   898  		return nil, linuxerr.ENOTDIR
   899  	}
   900  	if err := child.ensureOpenableLocked(ctx, rp, &opts); err != nil {
   901  		return nil, err
   902  	}
   903  	child.IncRef()
   904  	defer child.DecRef(ctx)
   905  	unlock()
   906  	return child.openCopiedUp(ctx, rp, &opts)
   907  }
   908  
   909  // Preconditions: filesystem.renameMu must be locked.
   910  func (d *dentry) ensureOpenableLocked(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) error {
   911  	ats := vfs.AccessTypesForOpenFlags(opts)
   912  	if err := d.checkPermissions(rp.Credentials(), ats); err != nil {
   913  		return err
   914  	}
   915  	if d.isDir() {
   916  		if ats.MayWrite() {
   917  			return linuxerr.EISDIR
   918  		}
   919  		if opts.Flags&linux.O_CREAT != 0 {
   920  			return linuxerr.EISDIR
   921  		}
   922  		if opts.Flags&linux.O_DIRECT != 0 {
   923  			return linuxerr.EINVAL
   924  		}
   925  		return nil
   926  	}
   927  
   928  	if !ats.MayWrite() {
   929  		return nil
   930  	}
   931  
   932  	// Copy up!
   933  	if err := rp.Mount().CheckBeginWrite(); err != nil {
   934  		return err
   935  	}
   936  	defer rp.Mount().EndWrite()
   937  	return d.copyUpLocked(ctx)
   938  }
   939  
   940  // Preconditions: If vfs.AccessTypesForOpenFlags(opts).MayWrite(), then d has
   941  // been copied up.
   942  func (d *dentry) openCopiedUp(ctx context.Context, rp *vfs.ResolvingPath, opts *vfs.OpenOptions) (*vfs.FileDescription, error) {
   943  	mnt := rp.Mount()
   944  
   945  	// Directory FDs open FDs from each layer when directory entries are read,
   946  	// so they don't require opening an FD from d.topLayer() up front.
   947  	ftype := d.mode.Load() & linux.S_IFMT
   948  	if ftype == linux.S_IFDIR {
   949  		fd := &directoryFD{}
   950  		fd.LockFD.Init(&d.locks)
   951  		if err := fd.vfsfd.Init(fd, opts.Flags, mnt, &d.vfsd, &vfs.FileDescriptionOptions{
   952  			UseDentryMetadata: true,
   953  		}); err != nil {
   954  			return nil, err
   955  		}
   956  		return &fd.vfsfd, nil
   957  	}
   958  
   959  	layerVD, isUpper := d.topLayerInfo()
   960  	layerFD, err := rp.VirtualFilesystem().OpenAt(ctx, d.fs.creds, &vfs.PathOperation{
   961  		Root:  layerVD,
   962  		Start: layerVD,
   963  	}, opts)
   964  	if err != nil {
   965  		return nil, err
   966  	}
   967  	if ftype != linux.S_IFREG {
   968  		return layerFD, nil
   969  	}
   970  	layerFlags := layerFD.StatusFlags()
   971  	fd := &regularFileFD{
   972  		copiedUp:    isUpper,
   973  		cachedFD:    layerFD,
   974  		cachedFlags: layerFlags,
   975  	}
   976  	fd.LockFD.Init(&d.locks)
   977  	layerFDOpts := layerFD.Options()
   978  	if err := fd.vfsfd.Init(fd, layerFlags, mnt, &d.vfsd, &layerFDOpts); err != nil {
   979  		layerFD.DecRef(ctx)
   980  		return nil, err
   981  	}
   982  	return &fd.vfsfd, nil
   983  }
   984  
   985  // Preconditions:
   986  //   - parent.dirMu must be locked.
   987  //   - parent does not already contain a child named rp.Component().
   988  func (fs *filesystem) createAndOpenLocked(ctx context.Context, rp *vfs.ResolvingPath, parent *dentry, opts *vfs.OpenOptions, ds **[]*dentry, haveUpperWhiteout bool) (*vfs.FileDescription, error) {
   989  	creds := rp.Credentials()
   990  	if err := parent.checkPermissions(creds, vfs.MayWrite); err != nil {
   991  		return nil, err
   992  	}
   993  	if parent.vfsd.IsDead() {
   994  		return nil, linuxerr.ENOENT
   995  	}
   996  	mnt := rp.Mount()
   997  	if err := mnt.CheckBeginWrite(); err != nil {
   998  		return nil, err
   999  	}
  1000  	defer mnt.EndWrite()
  1001  
  1002  	if err := parent.copyUpLocked(ctx); err != nil {
  1003  		return nil, err
  1004  	}
  1005  
  1006  	vfsObj := fs.vfsfs.VirtualFilesystem()
  1007  	childName := rp.Component()
  1008  	pop := vfs.PathOperation{
  1009  		Root:  parent.upperVD,
  1010  		Start: parent.upperVD,
  1011  		Path:  fspath.Parse(childName),
  1012  	}
  1013  	// Unlink the whiteout if it exists.
  1014  	if haveUpperWhiteout {
  1015  		if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
  1016  			log.Warningf("overlay.filesystem.createAndOpenLocked: failed to unlink whiteout: %v", err)
  1017  			return nil, err
  1018  		}
  1019  	}
  1020  	// Create the file on the upper layer, and get an FD representing it.
  1021  	upperFD, err := vfsObj.OpenAt(ctx, fs.creds, &pop, &vfs.OpenOptions{
  1022  		Flags: opts.Flags&^vfs.FileCreationFlags | linux.O_CREAT | linux.O_EXCL,
  1023  		Mode:  opts.Mode,
  1024  	})
  1025  	if err != nil {
  1026  		if haveUpperWhiteout {
  1027  			fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
  1028  		}
  1029  		return nil, err
  1030  	}
  1031  
  1032  	// Change the file's owner to the caller. We can't use upperFD.SetStat()
  1033  	// because it will pick up creds from ctx.
  1034  	if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
  1035  		Stat: parent.newChildOwnerStat(opts.Mode, creds),
  1036  	}); err != nil {
  1037  		if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
  1038  			panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) metadata update failure: %v", cleanupErr))
  1039  		} else if haveUpperWhiteout {
  1040  			fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
  1041  		}
  1042  		return nil, err
  1043  	}
  1044  	// Re-lookup to get a dentry representing the new file, which is needed for
  1045  	// the returned FD.
  1046  	child, _, err := fs.getChildLocked(ctx, parent, childName, ds)
  1047  	if err != nil {
  1048  		if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
  1049  			panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after OpenAt(O_CREAT) dentry lookup failure: %v", cleanupErr))
  1050  		} else if haveUpperWhiteout {
  1051  			fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
  1052  		}
  1053  		return nil, err
  1054  	}
  1055  	// Finally construct the overlay FD. Below this point, we don't perform
  1056  	// cleanup (the file was created successfully even if we can no longer open
  1057  	// it for some reason).
  1058  	parent.dirents = nil
  1059  	upperFlags := upperFD.StatusFlags()
  1060  	fd := &regularFileFD{
  1061  		copiedUp:    true,
  1062  		cachedFD:    upperFD,
  1063  		cachedFlags: upperFlags,
  1064  	}
  1065  	fd.LockFD.Init(&child.locks)
  1066  	upperFDOpts := upperFD.Options()
  1067  	if err := fd.vfsfd.Init(fd, upperFlags, mnt, &child.vfsd, &upperFDOpts); err != nil {
  1068  		upperFD.DecRef(ctx)
  1069  		return nil, err
  1070  	}
  1071  	parent.watches.Notify(ctx, childName, linux.IN_CREATE, 0 /* cookie */, vfs.PathEvent, false /* unlinked */)
  1072  	return &fd.vfsfd, nil
  1073  }
  1074  
  1075  // ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
  1076  func (fs *filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
  1077  	var ds *[]*dentry
  1078  	fs.renameMu.RLock()
  1079  	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1080  	d, err := fs.resolveLocked(ctx, rp, &ds)
  1081  	if err != nil {
  1082  		return "", err
  1083  	}
  1084  	layerVD := d.topLayer()
  1085  	return fs.vfsfs.VirtualFilesystem().ReadlinkAt(ctx, d.fs.creds, &vfs.PathOperation{
  1086  		Root:  layerVD,
  1087  		Start: layerVD,
  1088  	})
  1089  }
  1090  
// RenameAt implements vfs.FilesystemImpl.RenameAt.
//
// The file named oldName in oldParentVD is moved to the final component of
// rp. fs.renameMu is held for writing for the whole operation. The rename
// itself is carried out on the upper layer, so the renamed file (plus all of
// its descendants, if it is a directory) and the new parent are copied up
// first. After the upper-layer rename succeeds, a whiteout is created at the
// old location and a renamed directory is marked opaque; failures of those
// final steps are treated as unrecoverable and panic.
//
// Only the RENAME_NOREPLACE flag is accepted; any other flag yields EINVAL.
func (fs *filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, oldParentVD vfs.VirtualDentry, oldName string, opts vfs.RenameOptions) error {
	// Resolve newParent first to verify that it's on this Mount.
	var ds *[]*dentry
	fs.renameMu.Lock()
	defer fs.renameMuUnlockAndCheckDrop(ctx, &ds)
	newParent, err := fs.walkParentDirLocked(ctx, rp, rp.Start().Impl().(*dentry), &ds)
	if err != nil {
		return err
	}

	// RENAME_NOREPLACE is the only supported flag.
	if opts.Flags&^linux.RENAME_NOREPLACE != 0 {
		return linuxerr.EINVAL
	}

	newName := rp.Component()
	if newName == "." || newName == ".." {
		if opts.Flags&linux.RENAME_NOREPLACE != 0 {
			return linuxerr.EEXIST
		}
		return linuxerr.EBUSY
	}
	if uint64(len(newName)) > fs.maxFilenameLen {
		return linuxerr.ENAMETOOLONG
	}
	// Beyond the fs.maxFilenameLen check above, no further name length
	// validation is done here, since different filesystem implementations
	// impose different name limits. upperfs.RenameAt() will fail
	// appropriately if it has to.
	mnt := rp.Mount()
	if mnt != oldParentVD.Mount() {
		return linuxerr.EXDEV
	}
	if err := mnt.CheckBeginWrite(); err != nil {
		return err
	}
	defer mnt.EndWrite()

	oldParent := oldParentVD.Dentry().Impl().(*dentry)
	creds := rp.Credentials()
	if err := oldParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
		return err
	}
	// We need a dentry representing the renamed file since, if it's a
	// directory, we need to check for write permission on it.
	oldParent.dirMu.Lock()
	defer oldParent.dirMu.Unlock()
	renamed, _, err := fs.getChildLocked(ctx, oldParent, oldName, &ds)
	if err != nil {
		return err
	}
	if err := oldParent.mayDelete(creds, renamed); err != nil {
		return err
	}
	if renamed.isDir() {
		// A directory cannot be moved into itself or one of its own
		// descendants.
		if renamed == newParent || genericIsAncestorDentry(renamed, newParent) {
			return linuxerr.EINVAL
		}
		if oldParent != newParent {
			if err := renamed.checkPermissions(creds, vfs.MayWrite); err != nil {
				return err
			}
		}
	} else {
		if opts.MustBeDir || rp.MustBeDir() {
			return linuxerr.ENOTDIR
		}
	}

	if oldParent != newParent {
		if err := newParent.checkPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
			return err
		}
		// oldParent.dirMu is already held; newParent.dirMu is taken as a
		// nested lock with its own lock class.
		newParent.dirMu.NestedLock(dirLockNew)
		defer newParent.dirMu.NestedUnlock(dirLockNew)
	}
	if newParent.vfsd.IsDead() {
		return linuxerr.ENOENT
	}
	var (
		// replaced is the existing dentry at the destination, or nil.
		replaced *dentry
		// replacedVFSD is &replaced.vfsd, or nil when nothing is replaced.
		replacedVFSD *vfs.Dentry
		// replacedLayer is the lookup layer reported for the destination
		// name by getChildLocked.
		replacedLayer lookupLayer
		// whiteouts is the result of collectWhiteoutsForRmdirLocked on a
		// replaced directory; the loops below only touch entries whose
		// value is true, unlinking/recreating them under replaced.upperVD.
		whiteouts map[string]bool
	)
	replaced, replacedLayer, err = fs.getChildLocked(ctx, newParent, newName, &ds)
	if err != nil && !linuxerr.Equals(linuxerr.ENOENT, err) {
		return err
	}
	if replaced != nil {
		if opts.Flags&linux.RENAME_NOREPLACE != 0 {
			return linuxerr.EEXIST
		}
		replacedVFSD = &replaced.vfsd
		if replaced.isDir() {
			if !renamed.isDir() {
				return linuxerr.EISDIR
			}
			if genericIsAncestorDentry(replaced, renamed) {
				return linuxerr.ENOTEMPTY
			}
			replaced.dirMu.NestedLock(dirLockReplaced)
			defer replaced.dirMu.NestedUnlock(dirLockReplaced)
			whiteouts, err = replaced.collectWhiteoutsForRmdirLocked(ctx)
			if err != nil {
				return err
			}
		} else {
			if rp.MustBeDir() || renamed.isDir() {
				return linuxerr.ENOTDIR
			}
		}
	}

	// Renaming a file onto itself is a no-op.
	if oldParent == newParent && oldName == newName {
		return nil
	}

	// renamed and oldParent need to be copied-up before they're renamed on the
	// upper layer.
	if err := renamed.copyUpLocked(ctx); err != nil {
		return err
	}
	// If renamed is a directory, all of its descendants need to be copied-up
	// before they're renamed on the upper layer.
	if renamed.isDir() {
		if err := renamed.copyUpDescendantsLocked(ctx, &ds); err != nil {
			return err
		}
	}
	// newParent must be copied-up before it can contain renamed on the upper
	// layer.
	if err := newParent.copyUpLocked(ctx); err != nil {
		return err
	}
	// If replaced exists, it doesn't need to be copied-up, but we do need to
	// serialize with copy-up. Holding renameMu for writing should be
	// sufficient, but out of an abundance of caution...
	if replaced != nil {
		replaced.copyMu.RLock()
		defer replaced.copyMu.RUnlock()
	}

	vfsObj := rp.VirtualFilesystem()
	mntns := vfs.MountNamespaceFromContext(ctx)
	defer mntns.DecRef(ctx)
	// From here until the commit below, every error return must first call
	// AbortRenameDentry to undo this preparation.
	if err := vfsObj.PrepareRenameDentry(mntns, &renamed.vfsd, replacedVFSD); err != nil {
		return err
	}

	// newpop addresses the destination name on the upper layer.
	newpop := vfs.PathOperation{
		Root:  newParent.upperVD,
		Start: newParent.upperVD,
		Path:  fspath.Parse(newName),
	}

	// cleanupRecreateWhiteouts recreates the upper-layer whiteouts that were
	// removed from the replaced directory. It does nothing unless
	// needRecreateWhiteouts has been set, and tolerates whiteouts that still
	// exist (EEXIST); any other failure panics.
	needRecreateWhiteouts := false
	cleanupRecreateWhiteouts := func() {
		if !needRecreateWhiteouts {
			return
		}
		for whiteoutName, whiteoutUpper := range whiteouts {
			if !whiteoutUpper {
				continue
			}
			if err := CreateWhiteout(ctx, vfsObj, fs.creds, &vfs.PathOperation{
				Root:  replaced.upperVD,
				Start: replaced.upperVD,
				Path:  fspath.Parse(whiteoutName),
			}); err != nil && !linuxerr.Equals(linuxerr.EEXIST, err) {
				panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RenameAt failure: %v", err))
			}
		}
	}
	if renamed.isDir() {
		if replacedLayer == lookupLayerUpper {
			// Remove whiteouts from the directory being replaced.
			needRecreateWhiteouts = true
			for whiteoutName, whiteoutUpper := range whiteouts {
				if !whiteoutUpper {
					continue
				}
				if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{
					Root:  replaced.upperVD,
					Start: replaced.upperVD,
					Path:  fspath.Parse(whiteoutName),
				}); err != nil {
					vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
					cleanupRecreateWhiteouts()
					return err
				}
			}
		} else if replacedLayer == lookupLayerUpperWhiteout {
			// We need to explicitly remove the whiteout since otherwise rename
			// on the upper layer will fail with ENOTDIR.
			if err := vfsObj.UnlinkAt(ctx, fs.creds, &newpop); err != nil {
				vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
				return err
			}
		}
	}

	// Essentially no gVisor filesystem supports RENAME_WHITEOUT, so just do a
	// regular rename and create the whiteout at the origin manually. Unlike
	// RENAME_WHITEOUT, this isn't atomic with respect to other users of the
	// upper filesystem, but this is already the case for virtually all other
	// overlay filesystem operations too.
	oldpop := vfs.PathOperation{
		Root:  oldParent.upperVD,
		Start: oldParent.upperVD,
		Path:  fspath.Parse(oldName),
	}
	// Note that the upper-layer rename is issued with the caller's creds,
	// unlike the surrounding upper-layer operations, which use fs.creds.
	if err := vfsObj.RenameAt(ctx, creds, &oldpop, &newpop, &opts); err != nil {
		vfsObj.AbortRenameDentry(&renamed.vfsd, replacedVFSD)
		cleanupRecreateWhiteouts()
		return err
	}

	// Below this point, the renamed dentry is now at newpop, and anything we
	// replaced is gone forever. Commit the rename, update the overlay
	// filesystem tree, and abandon attempts to recover from errors.
	vfsObj.CommitRenameReplaceDentry(ctx, &renamed.vfsd, replacedVFSD)
	delete(oldParent.children, oldName)
	if replaced != nil {
		// Lower dentries of replaced are not reachable from the overlay anymore.
		// NOTE(b/237573779): Ask lower filesystem to release resources for this
		// dentry whenever possible to reduce resource usage.
		for _, replaceLower := range replaced.lowerVDs {
			replaceLower.Dentry().MarkEvictable()
		}
		ds = appendDentry(ds, replaced)
	}
	if oldParent != newParent {
		newParent.dirents = nil
		// This can't drop the last reference on oldParent because one is held
		// by oldParentVD, so lock recursion is impossible.
		oldParent.DecRef(ctx)
		ds = appendDentry(ds, oldParent)
		// renamed now references newParent instead of oldParent.
		newParent.IncRef()
		renamed.parent = newParent
	}
	renamed.name = newName
	if newParent.children == nil {
		newParent.children = make(map[string]*dentry)
	}
	newParent.children[newName] = renamed
	// Reset oldParent's dirents now that one of its entries has moved.
	oldParent.dirents = nil

	// Create a whiteout at the old upper-layer location now that the entry
	// has moved away from it.
	if err := CreateWhiteout(ctx, vfsObj, fs.creds, &oldpop); err != nil {
		panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout at origin after RenameAt: %v", err))
	}
	// Mark a renamed directory opaque at its new location.
	if renamed.isDir() {
		if err := vfsObj.SetXattrAt(ctx, fs.creds, &newpop, &vfs.SetXattrOptions{
			Name:  _OVL_XATTR_OPAQUE,
			Value: "y",
		}); err != nil {
			panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to make renamed directory opaque: %v", err))
		}
	}

	vfs.InotifyRename(ctx, &renamed.watches, &oldParent.watches, &newParent.watches, oldName, newName, renamed.isDir())
	return nil
}
  1353  
  1354  // RmdirAt implements vfs.FilesystemImpl.RmdirAt.
  1355  func (fs *filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
  1356  	var ds *[]*dentry
  1357  	fs.renameMu.RLock()
  1358  	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1359  	start := rp.Start().Impl().(*dentry)
  1360  	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
  1361  	if err != nil {
  1362  		return err
  1363  	}
  1364  	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
  1365  		return err
  1366  	}
  1367  	if err := rp.Mount().CheckBeginWrite(); err != nil {
  1368  		return err
  1369  	}
  1370  	defer rp.Mount().EndWrite()
  1371  	name := rp.Component()
  1372  	if name == "." {
  1373  		return linuxerr.EINVAL
  1374  	}
  1375  	if name == ".." {
  1376  		return linuxerr.ENOTEMPTY
  1377  	}
  1378  	vfsObj := rp.VirtualFilesystem()
  1379  	mntns := vfs.MountNamespaceFromContext(ctx)
  1380  	defer mntns.DecRef(ctx)
  1381  	parent.dirMu.Lock()
  1382  	defer parent.dirMu.Unlock()
  1383  
  1384  	// Ensure that parent is copied-up before potentially holding child.copyMu
  1385  	// below.
  1386  	if err := parent.copyUpLocked(ctx); err != nil {
  1387  		return err
  1388  	}
  1389  
  1390  	// We need a dentry representing the child directory being removed in order
  1391  	// to verify that it's empty.
  1392  	child, _, err := fs.getChildLocked(ctx, parent, name, &ds)
  1393  	if err != nil {
  1394  		return err
  1395  	}
  1396  	if !child.isDir() {
  1397  		return linuxerr.ENOTDIR
  1398  	}
  1399  	if err := parent.mayDelete(rp.Credentials(), child); err != nil {
  1400  		return err
  1401  	}
  1402  	child.dirMu.NestedLock(dirLockChild)
  1403  	defer child.dirMu.NestedUnlock(dirLockChild)
  1404  	whiteouts, err := child.collectWhiteoutsForRmdirLocked(ctx)
  1405  	if err != nil {
  1406  		return err
  1407  	}
  1408  	child.copyMu.RLock()
  1409  	defer child.copyMu.RUnlock()
  1410  	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
  1411  		return err
  1412  	}
  1413  
  1414  	pop := vfs.PathOperation{
  1415  		Root:  parent.upperVD,
  1416  		Start: parent.upperVD,
  1417  		Path:  fspath.Parse(name),
  1418  	}
  1419  	if child.upperVD.Ok() {
  1420  		cleanupRecreateWhiteouts := func() {
  1421  			if !child.upperVD.Ok() {
  1422  				return
  1423  			}
  1424  			for whiteoutName, whiteoutUpper := range whiteouts {
  1425  				if !whiteoutUpper {
  1426  					continue
  1427  				}
  1428  				if err := CreateWhiteout(ctx, vfsObj, fs.creds, &vfs.PathOperation{
  1429  					Root:  child.upperVD,
  1430  					Start: child.upperVD,
  1431  					Path:  fspath.Parse(whiteoutName),
  1432  				}); err != nil && !linuxerr.Equals(linuxerr.EEXIST, err) {
  1433  					panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to recreate deleted whiteout after RmdirAt failure: %v", err))
  1434  				}
  1435  			}
  1436  		}
  1437  		// Remove existing whiteouts on the upper layer.
  1438  		for whiteoutName, whiteoutUpper := range whiteouts {
  1439  			if !whiteoutUpper {
  1440  				continue
  1441  			}
  1442  			if err := vfsObj.UnlinkAt(ctx, fs.creds, &vfs.PathOperation{
  1443  				Root:  child.upperVD,
  1444  				Start: child.upperVD,
  1445  				Path:  fspath.Parse(whiteoutName),
  1446  			}); err != nil {
  1447  				vfsObj.AbortDeleteDentry(&child.vfsd)
  1448  				cleanupRecreateWhiteouts()
  1449  				return err
  1450  			}
  1451  		}
  1452  		// Remove the existing directory on the upper layer.
  1453  		if err := vfsObj.RmdirAt(ctx, fs.creds, &pop); err != nil {
  1454  			vfsObj.AbortDeleteDentry(&child.vfsd)
  1455  			cleanupRecreateWhiteouts()
  1456  			return err
  1457  		}
  1458  	}
  1459  	if err := CreateWhiteout(ctx, vfsObj, fs.creds, &pop); err != nil {
  1460  		vfsObj.AbortDeleteDentry(&child.vfsd)
  1461  		if child.upperVD.Ok() {
  1462  			// Don't attempt to recover from this: the original directory is
  1463  			// already gone, so any dentries representing it are invalid, and
  1464  			// creating a new directory won't undo that.
  1465  			panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout after removing upper layer directory during RmdirAt: %v", err))
  1466  		}
  1467  		return err
  1468  	}
  1469  
  1470  	vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
  1471  	delete(parent.children, name)
  1472  	ds = appendDentry(ds, child)
  1473  	parent.dirents = nil
  1474  	parent.watches.Notify(ctx, name, linux.IN_DELETE|linux.IN_ISDIR, 0 /* cookie */, vfs.InodeEvent, true /* unlinked */)
  1475  	return nil
  1476  }
  1477  
  1478  // SetStatAt implements vfs.FilesystemImpl.SetStatAt.
  1479  func (fs *filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
  1480  	var ds *[]*dentry
  1481  	fs.renameMu.RLock()
  1482  	d, err := fs.resolveLocked(ctx, rp, &ds)
  1483  	if err != nil {
  1484  		fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1485  		return err
  1486  	}
  1487  	err = d.setStatLocked(ctx, rp, opts)
  1488  	fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1489  	if err != nil {
  1490  		return err
  1491  	}
  1492  
  1493  	if ev := vfs.InotifyEventFromStatMask(opts.Stat.Mask); ev != 0 {
  1494  		d.InotifyWithParent(ctx, ev, 0 /* cookie */, vfs.InodeEvent)
  1495  	}
  1496  	return nil
  1497  }
  1498  
  1499  // Precondition: d.fs.renameMu must be held for reading.
  1500  func (d *dentry) setStatLocked(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
  1501  	mode := linux.FileMode(d.mode.Load())
  1502  	if err := vfs.CheckSetStat(ctx, rp.Credentials(), &opts, mode, auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load())); err != nil {
  1503  		return err
  1504  	}
  1505  	mnt := rp.Mount()
  1506  	if err := mnt.CheckBeginWrite(); err != nil {
  1507  		return err
  1508  	}
  1509  	defer mnt.EndWrite()
  1510  	if err := d.copyUpLocked(ctx); err != nil {
  1511  		return err
  1512  	}
  1513  	// Changes to d's attributes are serialized by d.copyMu.
  1514  	d.copyMu.Lock()
  1515  	defer d.copyMu.Unlock()
  1516  	if err := d.fs.vfsfs.VirtualFilesystem().SetStatAt(ctx, d.fs.creds, &vfs.PathOperation{
  1517  		Root:  d.upperVD,
  1518  		Start: d.upperVD,
  1519  	}, &opts); err != nil {
  1520  		return err
  1521  	}
  1522  	d.updateAfterSetStatLocked(&opts)
  1523  	return nil
  1524  }
  1525  
  1526  // StatAt implements vfs.FilesystemImpl.StatAt.
  1527  func (fs *filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
  1528  	var ds *[]*dentry
  1529  	fs.renameMu.RLock()
  1530  	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1531  	d, err := fs.resolveLocked(ctx, rp, &ds)
  1532  	if err != nil {
  1533  		return linux.Statx{}, err
  1534  	}
  1535  
  1536  	var stat linux.Statx
  1537  	if layerMask := opts.Mask &^ statInternalMask; layerMask != 0 {
  1538  		layerVD := d.topLayer()
  1539  		stat, err = fs.vfsfs.VirtualFilesystem().StatAt(ctx, fs.creds, &vfs.PathOperation{
  1540  			Root:  layerVD,
  1541  			Start: layerVD,
  1542  		}, &vfs.StatOptions{
  1543  			Mask: layerMask,
  1544  			Sync: opts.Sync,
  1545  		})
  1546  		if err != nil {
  1547  			return linux.Statx{}, err
  1548  		}
  1549  	}
  1550  	d.statInternalTo(ctx, &opts, &stat)
  1551  	return stat, nil
  1552  }
  1553  
  1554  // StatFSAt implements vfs.FilesystemImpl.StatFSAt.
  1555  func (fs *filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
  1556  	var ds *[]*dentry
  1557  	fs.renameMu.RLock()
  1558  	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1559  	_, err := fs.resolveLocked(ctx, rp, &ds)
  1560  	if err != nil {
  1561  		return linux.Statfs{}, err
  1562  	}
  1563  	return fs.statFS(ctx)
  1564  }
  1565  
  1566  // SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
  1567  func (fs *filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
  1568  	return fs.doCreateAt(ctx, rp, createNonDirectory, func(parent *dentry, childName string, haveUpperWhiteout bool) error {
  1569  		vfsObj := fs.vfsfs.VirtualFilesystem()
  1570  		pop := vfs.PathOperation{
  1571  			Root:  parent.upperVD,
  1572  			Start: parent.upperVD,
  1573  			Path:  fspath.Parse(childName),
  1574  		}
  1575  		if haveUpperWhiteout {
  1576  			if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
  1577  				return err
  1578  			}
  1579  		}
  1580  		if err := vfsObj.SymlinkAt(ctx, fs.creds, &pop, target); err != nil {
  1581  			if haveUpperWhiteout {
  1582  				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
  1583  			}
  1584  			return err
  1585  		}
  1586  		creds := rp.Credentials()
  1587  		if err := vfsObj.SetStatAt(ctx, fs.creds, &pop, &vfs.SetStatOptions{
  1588  			Stat: linux.Statx{
  1589  				Mask: linux.STATX_UID | linux.STATX_GID,
  1590  				UID:  uint32(creds.EffectiveKUID),
  1591  				GID:  uint32(creds.EffectiveKGID),
  1592  			},
  1593  		}); err != nil {
  1594  			if cleanupErr := vfsObj.UnlinkAt(ctx, fs.creds, &pop); cleanupErr != nil {
  1595  				panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to delete upper layer file after SymlinkAt metadata update failure: %v", cleanupErr))
  1596  			} else if haveUpperWhiteout {
  1597  				fs.cleanupRecreateWhiteout(ctx, vfsObj, &pop)
  1598  			}
  1599  			return err
  1600  		}
  1601  		return nil
  1602  	})
  1603  }
  1604  
  1605  // UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
  1606  func (fs *filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
  1607  	var ds *[]*dentry
  1608  	fs.renameMu.RLock()
  1609  	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1610  	start := rp.Start().Impl().(*dentry)
  1611  	parent, err := fs.walkParentDirLocked(ctx, rp, start, &ds)
  1612  	if err != nil {
  1613  		return err
  1614  	}
  1615  	if err := parent.checkPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
  1616  		return err
  1617  	}
  1618  	if err := rp.Mount().CheckBeginWrite(); err != nil {
  1619  		return err
  1620  	}
  1621  	defer rp.Mount().EndWrite()
  1622  	name := rp.Component()
  1623  	if name == "." || name == ".." {
  1624  		return linuxerr.EISDIR
  1625  	}
  1626  	if rp.MustBeDir() {
  1627  		return linuxerr.ENOTDIR
  1628  	}
  1629  	vfsObj := rp.VirtualFilesystem()
  1630  	mntns := vfs.MountNamespaceFromContext(ctx)
  1631  	defer mntns.DecRef(ctx)
  1632  	parent.dirMu.Lock()
  1633  	defer parent.dirMu.Unlock()
  1634  
  1635  	// Ensure that parent is copied-up before potentially holding child.copyMu
  1636  	// below.
  1637  	if err := parent.copyUpLocked(ctx); err != nil {
  1638  		return err
  1639  	}
  1640  
  1641  	// We need a dentry representing the child being removed in order to verify
  1642  	// that it's not a directory.
  1643  	child, childLayer, err := fs.getChildLocked(ctx, parent, name, &ds)
  1644  	if err != nil {
  1645  		return err
  1646  	}
  1647  	if child.isDir() {
  1648  		return linuxerr.EISDIR
  1649  	}
  1650  	if err := parent.mayDelete(rp.Credentials(), child); err != nil {
  1651  		return err
  1652  	}
  1653  	// Hold child.copyMu to prevent it from being copied-up during
  1654  	// deletion.
  1655  	child.copyMu.RLock()
  1656  	defer child.copyMu.RUnlock()
  1657  	if err := vfsObj.PrepareDeleteDentry(mntns, &child.vfsd); err != nil {
  1658  		return err
  1659  	}
  1660  
  1661  	pop := vfs.PathOperation{
  1662  		Root:  parent.upperVD,
  1663  		Start: parent.upperVD,
  1664  		Path:  fspath.Parse(name),
  1665  	}
  1666  	if childLayer == lookupLayerUpper {
  1667  		// Remove the existing file on the upper layer.
  1668  		if err := vfsObj.UnlinkAt(ctx, fs.creds, &pop); err != nil {
  1669  			vfsObj.AbortDeleteDentry(&child.vfsd)
  1670  			return err
  1671  		}
  1672  	}
  1673  	if err := CreateWhiteout(ctx, vfsObj, fs.creds, &pop); err != nil {
  1674  		vfsObj.AbortDeleteDentry(&child.vfsd)
  1675  		if childLayer == lookupLayerUpper {
  1676  			panic(fmt.Sprintf("unrecoverable overlayfs inconsistency: failed to create whiteout after unlinking upper layer file during UnlinkAt: %v", err))
  1677  		}
  1678  		return err
  1679  	}
  1680  
  1681  	vfsObj.CommitDeleteDentry(ctx, &child.vfsd)
  1682  	delete(parent.children, name)
  1683  	if !child.isDir() {
  1684  		// Once a whiteout is created, non-directory dentries on the lower layers
  1685  		// are no longer reachable from the overlayfs. Ask filesystems to release
  1686  		// their resources whenever possible.
  1687  		for _, lowerDentry := range child.lowerVDs {
  1688  			lowerDentry.Dentry().MarkEvictable()
  1689  		}
  1690  	}
  1691  	ds = appendDentry(ds, child)
  1692  	vfs.InotifyRemoveChild(ctx, &child.watches, &parent.watches, name)
  1693  	parent.dirents = nil
  1694  	return nil
  1695  }
  1696  
  1697  // isOverlayXattr returns whether the given extended attribute configures the
  1698  // overlay.
  1699  func isOverlayXattr(name string) bool {
  1700  	return strings.HasPrefix(name, _OVL_XATTR_PREFIX)
  1701  }
  1702  
  1703  // ListXattrAt implements vfs.FilesystemImpl.ListXattrAt.
  1704  func (fs *filesystem) ListXattrAt(ctx context.Context, rp *vfs.ResolvingPath, size uint64) ([]string, error) {
  1705  	var ds *[]*dentry
  1706  	fs.renameMu.RLock()
  1707  	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1708  	d, err := fs.resolveLocked(ctx, rp, &ds)
  1709  	if err != nil {
  1710  		return nil, err
  1711  	}
  1712  
  1713  	return fs.listXattr(ctx, d, size)
  1714  }
  1715  
  1716  func (fs *filesystem) listXattr(ctx context.Context, d *dentry, size uint64) ([]string, error) {
  1717  	vfsObj := d.fs.vfsfs.VirtualFilesystem()
  1718  	top := d.topLayer()
  1719  	names, err := vfsObj.ListXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, size)
  1720  	if err != nil {
  1721  		return nil, err
  1722  	}
  1723  
  1724  	// Filter out all overlay attributes.
  1725  	n := 0
  1726  	for _, name := range names {
  1727  		if !isOverlayXattr(name) {
  1728  			names[n] = name
  1729  			n++
  1730  		}
  1731  	}
  1732  	return names[:n], err
  1733  }
  1734  
  1735  // GetXattrAt implements vfs.FilesystemImpl.GetXattrAt.
  1736  func (fs *filesystem) GetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetXattrOptions) (string, error) {
  1737  	var ds *[]*dentry
  1738  	fs.renameMu.RLock()
  1739  	defer fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1740  	d, err := fs.resolveLocked(ctx, rp, &ds)
  1741  	if err != nil {
  1742  		return "", err
  1743  	}
  1744  
  1745  	return fs.getXattr(ctx, d, rp.Credentials(), &opts)
  1746  }
  1747  
  1748  func (fs *filesystem) getXattr(ctx context.Context, d *dentry, creds *auth.Credentials, opts *vfs.GetXattrOptions) (string, error) {
  1749  	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayRead); err != nil {
  1750  		return "", err
  1751  	}
  1752  
  1753  	// Return EOPNOTSUPP when fetching an overlay attribute.
  1754  	// See fs/overlayfs/super.c:ovl_own_xattr_get().
  1755  	if isOverlayXattr(opts.Name) {
  1756  		return "", linuxerr.EOPNOTSUPP
  1757  	}
  1758  
  1759  	// Analogous to fs/overlayfs/super.c:ovl_other_xattr_get().
  1760  	vfsObj := d.fs.vfsfs.VirtualFilesystem()
  1761  	top := d.topLayer()
  1762  	return vfsObj.GetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: top, Start: top}, opts)
  1763  }
  1764  
  1765  // SetXattrAt implements vfs.FilesystemImpl.SetXattrAt.
  1766  func (fs *filesystem) SetXattrAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetXattrOptions) error {
  1767  	var ds *[]*dentry
  1768  	fs.renameMu.RLock()
  1769  	d, err := fs.resolveLocked(ctx, rp, &ds)
  1770  	if err != nil {
  1771  		fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1772  		return err
  1773  	}
  1774  
  1775  	err = fs.setXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), &opts)
  1776  	fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1777  	if err != nil {
  1778  		return err
  1779  	}
  1780  
  1781  	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent)
  1782  	return nil
  1783  }
  1784  
  1785  // Precondition: fs.renameMu must be locked.
  1786  func (fs *filesystem) setXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, opts *vfs.SetXattrOptions) error {
  1787  	if err := d.checkXattrPermissions(creds, opts.Name, vfs.MayWrite); err != nil {
  1788  		return err
  1789  	}
  1790  
  1791  	// Return EOPNOTSUPP when setting an overlay attribute.
  1792  	// See fs/overlayfs/super.c:ovl_own_xattr_set().
  1793  	if isOverlayXattr(opts.Name) {
  1794  		return linuxerr.EOPNOTSUPP
  1795  	}
  1796  
  1797  	// Analogous to fs/overlayfs/super.c:ovl_other_xattr_set().
  1798  	if err := mnt.CheckBeginWrite(); err != nil {
  1799  		return err
  1800  	}
  1801  	defer mnt.EndWrite()
  1802  	if err := d.copyUpLocked(ctx); err != nil {
  1803  		return err
  1804  	}
  1805  	vfsObj := d.fs.vfsfs.VirtualFilesystem()
  1806  	return vfsObj.SetXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, opts)
  1807  }
  1808  
  1809  // RemoveXattrAt implements vfs.FilesystemImpl.RemoveXattrAt.
  1810  func (fs *filesystem) RemoveXattrAt(ctx context.Context, rp *vfs.ResolvingPath, name string) error {
  1811  	var ds *[]*dentry
  1812  	fs.renameMu.RLock()
  1813  	d, err := fs.resolveLocked(ctx, rp, &ds)
  1814  	if err != nil {
  1815  		fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1816  		return err
  1817  	}
  1818  
  1819  	err = fs.removeXattrLocked(ctx, d, rp.Mount(), rp.Credentials(), name)
  1820  	fs.renameMuRUnlockAndCheckDrop(ctx, &ds)
  1821  	if err != nil {
  1822  		return err
  1823  	}
  1824  
  1825  	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0 /* cookie */, vfs.InodeEvent)
  1826  	return nil
  1827  }
  1828  
  1829  // Precondition: fs.renameMu must be locked.
  1830  func (fs *filesystem) removeXattrLocked(ctx context.Context, d *dentry, mnt *vfs.Mount, creds *auth.Credentials, name string) error {
  1831  	if err := d.checkXattrPermissions(creds, name, vfs.MayWrite); err != nil {
  1832  		return err
  1833  	}
  1834  
  1835  	// Like SetXattrAt, return EOPNOTSUPP when removing an overlay attribute.
  1836  	// Linux passes the remove request to xattr_handler->set.
  1837  	// See fs/xattr.c:vfs_removexattr().
  1838  	if isOverlayXattr(name) {
  1839  		return linuxerr.EOPNOTSUPP
  1840  	}
  1841  
  1842  	if err := mnt.CheckBeginWrite(); err != nil {
  1843  		return err
  1844  	}
  1845  	defer mnt.EndWrite()
  1846  	if err := d.copyUpLocked(ctx); err != nil {
  1847  		return err
  1848  	}
  1849  	vfsObj := d.fs.vfsfs.VirtualFilesystem()
  1850  	return vfsObj.RemoveXattrAt(ctx, fs.creds, &vfs.PathOperation{Root: d.upperVD, Start: d.upperVD}, name)
  1851  }
  1852  
  1853  // PrependPath implements vfs.FilesystemImpl.PrependPath.
  1854  func (fs *filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
  1855  	fs.renameMu.RLock()
  1856  	defer fs.renameMu.RUnlock()
  1857  	return genericPrependPath(vfsroot, vd.Mount(), vd.Dentry().Impl().(*dentry), b)
  1858  }
  1859  
  1860  // MountOptions implements vfs.FilesystemImpl.MountOptions.
  1861  func (fs *filesystem) MountOptions() string {
  1862  	// Return the mount options from the topmost layer.
  1863  	var vd vfs.VirtualDentry
  1864  	if fs.opts.UpperRoot.Ok() {
  1865  		vd = fs.opts.UpperRoot
  1866  	} else {
  1867  		vd = fs.opts.LowerRoots[0]
  1868  	}
  1869  	return vd.Mount().Filesystem().Impl().MountOptions()
  1870  }