github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fsimpl/overlay/overlay.go (about)

     1  // Copyright 2020 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package overlay provides an overlay filesystem implementation, which
    16  // synthesizes a filesystem by composing one or more immutable filesystems
    17  // ("lower layers") with an optional mutable filesystem ("upper layer").
    18  //
    19  // Lock order:
    20  //
    21  // directoryFD.mu / regularFileFD.mu
    22  //   filesystem.renameMu
    23  //     dentry.dirMu
    24  //       dentry.copyMu
    25  //         filesystem.devMu
    26  //         *** "memmap.Mappable locks" below this point
    27  //         dentry.mapsMu
    28  //           *** "memmap.Mappable locks taken by Translate" below this point
    29  //           dentry.dataMu
    30  //
    31  // Locking dentry.dirMu in multiple dentries requires that parent dentries are
    32  // locked before child dentries, and that filesystem.renameMu is locked to
    33  // stabilize this relationship.
    34  package overlay
    35  
    36  import (
    37  	"fmt"
    38  	"strings"
    39  	"sync/atomic"
    40  
    41  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    42  	"github.com/SagerNet/gvisor/pkg/context"
    43  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    44  	"github.com/SagerNet/gvisor/pkg/fspath"
    45  	"github.com/SagerNet/gvisor/pkg/refsvfs2"
    46  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    47  	"github.com/SagerNet/gvisor/pkg/sentry/memmap"
    48  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    49  	"github.com/SagerNet/gvisor/pkg/sync"
    50  )
    51  
// Name is the default filesystem name. It is the value returned by
// FilesystemType.Name.
const Name = "overlay"
    54  
// FilesystemType implements vfs.FilesystemType for overlay filesystems. It
// has no fields; all per-mount state lives in the filesystem value created by
// GetFilesystem.
//
// +stateify savable
type FilesystemType struct{}
    59  
    60  // Name implements vfs.FilesystemType.Name.
    61  func (FilesystemType) Name() string {
    62  	return Name
    63  }
    64  
    65  // Release implements FilesystemType.Release.
    66  func (FilesystemType) Release(ctx context.Context) {}
    67  
// FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to
// FilesystemType.GetFilesystem. It lets the caller hand over already-resolved
// layer roots instead of (not in addition to) the "upperdir"/"lowerdir" mount
// options.
//
// +stateify savable
type FilesystemOptions struct {
	// Callers passing FilesystemOptions to
	// overlay.FilesystemType.GetFilesystem() are responsible for ensuring that
	// the vfs.Mounts comprising the layers of the overlay filesystem do not
	// contain submounts.

	// If UpperRoot.Ok(), it is the root of the writable upper layer of the
	// overlay.
	UpperRoot vfs.VirtualDentry

	// LowerRoots contains the roots of the immutable lower layers of the
	// overlay. LowerRoots is immutable.
	LowerRoots []vfs.VirtualDentry
}
    86  
// filesystem implements vfs.FilesystemImpl.
//
// +stateify savable
type filesystem struct {
	vfsfs vfs.Filesystem

	// Immutable options.
	opts FilesystemOptions

	// creds is a copy of the filesystem's creator's credentials, which are
	// used for accesses to the filesystem's layers. creds is immutable.
	creds *auth.Credentials

	// privateDevMinors maps device numbers from layer filesystems to device
	// minor numbers assigned to files originating from that filesystem.
	//
	// For non-directory files, this remapping is necessary for lower layers
	// because a file on a lower layer, and that same file on an overlay, are
	// distinguishable because they will diverge after copy-up. (Once a
	// non-directory file has been copied up, its contents on the upper layer
	// completely determine its contents in the overlay, so this is no longer
	// true; but we still do the mapping for consistency.)
	//
	// For directories, this remapping may be necessary even if the directory
	// exists on the upper layer due to directory merging; rather than make the
	// mapping conditional on whether the directory is opaque, we again apply
	// the mapping unconditionally.
	//
	// privateDevMinors is protected by devMu.
	devMu            sync.Mutex `state:"nosave"`
	privateDevMinors map[layerDevNumber]uint32

	// renameMu synchronizes renaming with non-renaming operations in order to
	// ensure consistent lock ordering between dentry.dirMu in different
	// dentries.
	renameMu sync.RWMutex `state:"nosave"`
}
   124  
// layerDevNumber is a (major, minor) device number as reported by a layer
// filesystem. It is the key type of filesystem.privateDevMinors. The field
// order matters: getPrivateDevMinor builds values with a positional literal.
//
// +stateify savable
type layerDevNumber struct {
	major uint32
	minor uint32
}
   130  
   131  // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
   132  func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
   133  	mopts := vfs.GenericParseMountOptions(opts.Data)
   134  	fsoptsRaw := opts.InternalData
   135  	fsopts, ok := fsoptsRaw.(FilesystemOptions)
   136  	if fsoptsRaw != nil && !ok {
   137  		ctx.Infof("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw)
   138  		return nil, nil, linuxerr.EINVAL
   139  	}
   140  	vfsroot := vfs.RootFromContext(ctx)
   141  	if vfsroot.Ok() {
   142  		defer vfsroot.DecRef(ctx)
   143  	}
   144  
   145  	if upperPathname, ok := mopts["upperdir"]; ok {
   146  		if fsopts.UpperRoot.Ok() {
   147  			ctx.Infof("overlay.FilesystemType.GetFilesystem: both upperdir and FilesystemOptions.UpperRoot are specified")
   148  			return nil, nil, linuxerr.EINVAL
   149  		}
   150  		delete(mopts, "upperdir")
   151  		// Linux overlayfs also requires a workdir when upperdir is
   152  		// specified; we don't, so silently ignore this option.
   153  		delete(mopts, "workdir")
   154  		upperPath := fspath.Parse(upperPathname)
   155  		if !upperPath.Absolute {
   156  			ctx.Infof("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname)
   157  			return nil, nil, linuxerr.EINVAL
   158  		}
   159  		upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
   160  			Root:               vfsroot,
   161  			Start:              vfsroot,
   162  			Path:               upperPath,
   163  			FollowFinalSymlink: true,
   164  		}, &vfs.GetDentryOptions{
   165  			CheckSearchable: true,
   166  		})
   167  		if err != nil {
   168  			ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err)
   169  			return nil, nil, err
   170  		}
   171  		privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */)
   172  		upperRoot.DecRef(ctx)
   173  		if err != nil {
   174  			ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err)
   175  			return nil, nil, err
   176  		}
   177  		defer privateUpperRoot.DecRef(ctx)
   178  		fsopts.UpperRoot = privateUpperRoot
   179  	}
   180  
   181  	if lowerPathnamesStr, ok := mopts["lowerdir"]; ok {
   182  		if len(fsopts.LowerRoots) != 0 {
   183  			ctx.Infof("overlay.FilesystemType.GetFilesystem: both lowerdir and FilesystemOptions.LowerRoots are specified")
   184  			return nil, nil, linuxerr.EINVAL
   185  		}
   186  		delete(mopts, "lowerdir")
   187  		lowerPathnames := strings.Split(lowerPathnamesStr, ":")
   188  		for _, lowerPathname := range lowerPathnames {
   189  			lowerPath := fspath.Parse(lowerPathname)
   190  			if !lowerPath.Absolute {
   191  				ctx.Infof("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname)
   192  				return nil, nil, linuxerr.EINVAL
   193  			}
   194  			lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
   195  				Root:               vfsroot,
   196  				Start:              vfsroot,
   197  				Path:               lowerPath,
   198  				FollowFinalSymlink: true,
   199  			}, &vfs.GetDentryOptions{
   200  				CheckSearchable: true,
   201  			})
   202  			if err != nil {
   203  				ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err)
   204  				return nil, nil, err
   205  			}
   206  			privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */)
   207  			lowerRoot.DecRef(ctx)
   208  			if err != nil {
   209  				ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err)
   210  				return nil, nil, err
   211  			}
   212  			defer privateLowerRoot.DecRef(ctx)
   213  			fsopts.LowerRoots = append(fsopts.LowerRoots, privateLowerRoot)
   214  		}
   215  	}
   216  
   217  	if len(mopts) != 0 {
   218  		ctx.Infof("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts)
   219  		return nil, nil, linuxerr.EINVAL
   220  	}
   221  
   222  	if len(fsopts.LowerRoots) == 0 {
   223  		ctx.Infof("overlay.FilesystemType.GetFilesystem: at least one lower layer is required")
   224  		return nil, nil, linuxerr.EINVAL
   225  	}
   226  	if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() {
   227  		ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two lower layers are required when no upper layer is present")
   228  		return nil, nil, linuxerr.EINVAL
   229  	}
   230  	const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK
   231  	if len(fsopts.LowerRoots) > maxLowerLayers {
   232  		ctx.Infof("overlay.FilesystemType.GetFilesystem: %d lower layers specified, maximum %d", len(fsopts.LowerRoots), maxLowerLayers)
   233  		return nil, nil, linuxerr.EINVAL
   234  	}
   235  
   236  	// Take extra references held by the filesystem.
   237  	if fsopts.UpperRoot.Ok() {
   238  		fsopts.UpperRoot.IncRef()
   239  	}
   240  	for _, lowerRoot := range fsopts.LowerRoots {
   241  		lowerRoot.IncRef()
   242  	}
   243  
   244  	fs := &filesystem{
   245  		opts:             fsopts,
   246  		creds:            creds.Fork(),
   247  		privateDevMinors: make(map[layerDevNumber]uint32),
   248  	}
   249  	fs.vfsfs.Init(vfsObj, &fstype, fs)
   250  
   251  	// Construct the root dentry.
   252  	root := fs.newDentry()
   253  	root.refs = 1
   254  	if fs.opts.UpperRoot.Ok() {
   255  		fs.opts.UpperRoot.IncRef()
   256  		root.copiedUp = 1
   257  		root.upperVD = fs.opts.UpperRoot
   258  	}
   259  	for _, lowerRoot := range fs.opts.LowerRoots {
   260  		lowerRoot.IncRef()
   261  		root.lowerVDs = append(root.lowerVDs, lowerRoot)
   262  	}
   263  	rootTopVD := root.topLayer()
   264  	// Get metadata from the topmost layer. See fs.lookupLocked().
   265  	const rootStatMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
   266  	rootStat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
   267  		Root:  rootTopVD,
   268  		Start: rootTopVD,
   269  	}, &vfs.StatOptions{
   270  		Mask: rootStatMask,
   271  	})
   272  	if err != nil {
   273  		root.destroyLocked(ctx)
   274  		fs.vfsfs.DecRef(ctx)
   275  		return nil, nil, err
   276  	}
   277  	if rootStat.Mask&rootStatMask != rootStatMask {
   278  		root.destroyLocked(ctx)
   279  		fs.vfsfs.DecRef(ctx)
   280  		return nil, nil, linuxerr.EREMOTE
   281  	}
   282  	if isWhiteout(&rootStat) {
   283  		ctx.Infof("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout")
   284  		root.destroyLocked(ctx)
   285  		fs.vfsfs.DecRef(ctx)
   286  		return nil, nil, linuxerr.EINVAL
   287  	}
   288  	root.mode = uint32(rootStat.Mode)
   289  	root.uid = rootStat.UID
   290  	root.gid = rootStat.GID
   291  	root.devMajor = linux.UNNAMED_MAJOR
   292  	rootDevMinor, err := fs.getPrivateDevMinor(rootStat.DevMajor, rootStat.DevMinor)
   293  	if err != nil {
   294  		ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to get device number for root: %v", err)
   295  		root.destroyLocked(ctx)
   296  		fs.vfsfs.DecRef(ctx)
   297  		return nil, nil, err
   298  	}
   299  	root.devMinor = rootDevMinor
   300  	root.ino = rootStat.Ino
   301  
   302  	return &fs.vfsfs, &root.vfsd, nil
   303  }
   304  
   305  // clonePrivateMount creates a non-recursive bind mount rooted at vd, not
   306  // associated with any MountNamespace, and returns the root of the new mount.
   307  // (This is required to ensure that each layer of an overlay comprises only a
   308  // single mount, and therefore can't cross into e.g. the overlay filesystem
   309  // itself, risking lock recursion.) A reference is held on the returned
   310  // VirtualDentry.
   311  func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forceReadOnly bool) (vfs.VirtualDentry, error) {
   312  	oldmnt := vd.Mount()
   313  	opts := oldmnt.Options()
   314  	if forceReadOnly {
   315  		opts.ReadOnly = true
   316  	}
   317  	newmnt, err := vfsObj.NewDisconnectedMount(oldmnt.Filesystem(), vd.Dentry(), &opts)
   318  	if err != nil {
   319  		return vfs.VirtualDentry{}, err
   320  	}
   321  	// Take a reference on the dentry which will be owned by the returned
   322  	// VirtualDentry.
   323  	d := vd.Dentry()
   324  	d.IncRef()
   325  	return vfs.MakeVirtualDentry(newmnt, d), nil
   326  }
   327  
   328  // Release implements vfs.FilesystemImpl.Release.
   329  func (fs *filesystem) Release(ctx context.Context) {
   330  	vfsObj := fs.vfsfs.VirtualFilesystem()
   331  	for _, devMinor := range fs.privateDevMinors {
   332  		vfsObj.PutAnonBlockDevMinor(devMinor)
   333  	}
   334  	if fs.opts.UpperRoot.Ok() {
   335  		fs.opts.UpperRoot.DecRef(ctx)
   336  	}
   337  	for _, lowerRoot := range fs.opts.LowerRoots {
   338  		lowerRoot.DecRef(ctx)
   339  	}
   340  }
   341  
   342  func (fs *filesystem) statFS(ctx context.Context) (linux.Statfs, error) {
   343  	// Always statfs the root of the topmost layer. Compare Linux's
   344  	// fs/overlayfs/super.c:ovl_statfs().
   345  	var rootVD vfs.VirtualDentry
   346  	if fs.opts.UpperRoot.Ok() {
   347  		rootVD = fs.opts.UpperRoot
   348  	} else {
   349  		rootVD = fs.opts.LowerRoots[0]
   350  	}
   351  	fsstat, err := fs.vfsfs.VirtualFilesystem().StatFSAt(ctx, fs.creds, &vfs.PathOperation{
   352  		Root:  rootVD,
   353  		Start: rootVD,
   354  	})
   355  	if err != nil {
   356  		return linux.Statfs{}, err
   357  	}
   358  	fsstat.Type = linux.OVERLAYFS_SUPER_MAGIC
   359  	return fsstat, nil
   360  }
   361  
   362  func (fs *filesystem) getPrivateDevMinor(layerMajor, layerMinor uint32) (uint32, error) {
   363  	fs.devMu.Lock()
   364  	defer fs.devMu.Unlock()
   365  	orig := layerDevNumber{layerMajor, layerMinor}
   366  	if minor, ok := fs.privateDevMinors[orig]; ok {
   367  		return minor, nil
   368  	}
   369  	minor, err := fs.vfsfs.VirtualFilesystem().GetAnonBlockDevMinor()
   370  	if err != nil {
   371  		return 0, err
   372  	}
   373  	fs.privateDevMinors[orig] = minor
   374  	return minor, nil
   375  }
   376  
// dentry implements vfs.DentryImpl. Each dentry represents one file in the
// overlay and records which files on the overlay's layers make it up.
//
// +stateify savable
type dentry struct {
	vfsd vfs.Dentry

	// refs is the dentry's reference count, maintained by IncRef, TryIncRef,
	// DecRef and decRefLocked. destroyLocked stores -1 to mark the dentry as
	// destroyed.
	refs int64

	// fs is the owning filesystem. fs is immutable.
	fs *filesystem

	// mode, uid, and gid are the file mode, owner, and group of the file in
	// the topmost layer (and therefore the overlay file as well), and are used
	// for permission checks on this dentry. These fields are protected by
	// copyMu and accessed using atomic memory operations.
	mode uint32
	uid  uint32
	gid  uint32

	// copiedUp is 1 if this dentry has been copied-up (i.e. upperVD.Ok()) and
	// 0 otherwise. copiedUp is accessed using atomic memory operations.
	copiedUp uint32

	// parent is the dentry corresponding to this dentry's parent directory.
	// name is this dentry's name in parent. If this dentry is a filesystem
	// root, parent is nil and name is the empty string. parent and name are
	// protected by fs.renameMu.
	parent *dentry
	name   string

	// If this dentry represents a directory, children maps the names of
	// children for which dentries have been instantiated to those dentries,
	// and dirents (if not nil) is a cache of dirents as returned by
	// directoryFDs representing this directory. children is protected by
	// dirMu.
	dirMu    sync.Mutex `state:"nosave"`
	children map[string]*dentry
	dirents  []vfs.Dirent

	// upperVD and lowerVDs are the files from the overlay filesystem's layers
	// that comprise the file on the overlay filesystem.
	//
	// If !upperVD.Ok(), it can transition to a valid vfs.VirtualDentry (i.e.
	// be copied up) with copyMu locked for writing; otherwise, it is
	// immutable. lowerVDs is always immutable.
	copyMu   sync.RWMutex `state:"nosave"`
	upperVD  vfs.VirtualDentry
	lowerVDs []vfs.VirtualDentry

	// inlineLowerVDs backs lowerVDs in the common case where len(lowerVDs) <=
	// len(inlineLowerVDs).
	inlineLowerVDs [1]vfs.VirtualDentry

	// devMajor, devMinor, and ino are the device major/minor and inode numbers
	// used by this dentry. These fields are protected by copyMu and accessed
	// using atomic memory operations.
	devMajor uint32
	devMinor uint32
	ino      uint64

	// If this dentry represents a regular file, then:
	//
	// - mapsMu is used to synchronize between copy-up and memmap.Mappable
	// methods on dentry preceding mm.MemoryManager.activeMu in the lock order.
	//
	// - dataMu is used to synchronize between copy-up and
	// dentry.(memmap.Mappable).Translate.
	//
	// - lowerMappings tracks memory mappings of the file. lowerMappings is
	// used to invalidate mappings of the lower layer when the file is copied
	// up to ensure that they remain coherent with subsequent writes to the
	// file. (Note that, as of this writing, Linux overlayfs does not do this;
	// this feature is a gVisor extension.) lowerMappings is protected by
	// mapsMu.
	//
	// - If this dentry is copied-up, then wrappedMappable is the Mappable
	// obtained from a call to the current top layer's
	// FileDescription.ConfigureMMap(). Once wrappedMappable becomes non-nil
	// (from a call to regularFileFD.ensureMappable()), it cannot become nil.
	// wrappedMappable is protected by mapsMu and dataMu.
	//
	// - isMappable is non-zero iff wrappedMappable is non-nil. isMappable is
	// accessed using atomic memory operations.
	mapsMu          sync.Mutex `state:"nosave"`
	lowerMappings   memmap.MappingSet
	dataMu          sync.RWMutex `state:"nosave"`
	wrappedMappable memmap.Mappable
	isMappable      uint32

	locks vfs.FileLocks

	// watches is the set of inotify watches on the file represented by this
	// dentry.
	//
	// Note that hard links to the same file will not share the same set of
	// watches, due to the fact that we do not have inode structures in this
	// overlay implementation.
	watches vfs.Watches
}
   475  
   476  // newDentry creates a new dentry. The dentry initially has no references; it
   477  // is the caller's responsibility to set the dentry's reference count and/or
   478  // call dentry.destroy() as appropriate. The dentry is initially invalid in
   479  // that it contains no layers; the caller is responsible for setting them.
   480  func (fs *filesystem) newDentry() *dentry {
   481  	d := &dentry{
   482  		fs: fs,
   483  	}
   484  	d.lowerVDs = d.inlineLowerVDs[:0]
   485  	d.vfsd.Init(d)
   486  	refsvfs2.Register(d)
   487  	return d
   488  }
   489  
   490  // IncRef implements vfs.DentryImpl.IncRef.
   491  func (d *dentry) IncRef() {
   492  	// d.refs may be 0 if d.fs.renameMu is locked, which serializes against
   493  	// d.checkDropLocked().
   494  	r := atomic.AddInt64(&d.refs, 1)
   495  	if d.LogRefs() {
   496  		refsvfs2.LogIncRef(d, r)
   497  	}
   498  }
   499  
   500  // TryIncRef implements vfs.DentryImpl.TryIncRef.
   501  func (d *dentry) TryIncRef() bool {
   502  	for {
   503  		r := atomic.LoadInt64(&d.refs)
   504  		if r <= 0 {
   505  			return false
   506  		}
   507  		if atomic.CompareAndSwapInt64(&d.refs, r, r+1) {
   508  			if d.LogRefs() {
   509  				refsvfs2.LogTryIncRef(d, r+1)
   510  			}
   511  			return true
   512  		}
   513  	}
   514  }
   515  
   516  // DecRef implements vfs.DentryImpl.DecRef.
   517  func (d *dentry) DecRef(ctx context.Context) {
   518  	r := atomic.AddInt64(&d.refs, -1)
   519  	if d.LogRefs() {
   520  		refsvfs2.LogDecRef(d, r)
   521  	}
   522  	if r == 0 {
   523  		d.fs.renameMu.Lock()
   524  		d.checkDropLocked(ctx)
   525  		d.fs.renameMu.Unlock()
   526  	} else if r < 0 {
   527  		panic("overlay.dentry.DecRef() called without holding a reference")
   528  	}
   529  }
   530  
   531  func (d *dentry) decRefLocked(ctx context.Context) {
   532  	r := atomic.AddInt64(&d.refs, -1)
   533  	if d.LogRefs() {
   534  		refsvfs2.LogDecRef(d, r)
   535  	}
   536  	if r == 0 {
   537  		d.checkDropLocked(ctx)
   538  	} else if r < 0 {
   539  		panic("overlay.dentry.decRefLocked() called without holding a reference")
   540  	}
   541  }
   542  
   543  // checkDropLocked should be called after d's reference count becomes 0 or it
   544  // becomes deleted.
   545  //
   546  // Preconditions: d.fs.renameMu must be locked for writing.
   547  func (d *dentry) checkDropLocked(ctx context.Context) {
   548  	// Dentries with a positive reference count must be retained. (The only way
   549  	// to obtain a reference on a dentry with zero references is via path
   550  	// resolution, which requires renameMu, so if d.refs is zero then it will
   551  	// remain zero while we hold renameMu for writing.) Dentries with a
   552  	// negative reference count have already been destroyed.
   553  	if atomic.LoadInt64(&d.refs) != 0 {
   554  		return
   555  	}
   556  
   557  	// Make sure that we do not lose watches on dentries that have not been
   558  	// deleted. Note that overlayfs never calls VFS.InvalidateDentry(), so
   559  	// d.vfsd.IsDead() indicates that d was deleted.
   560  	if !d.vfsd.IsDead() && d.watches.Size() > 0 {
   561  		return
   562  	}
   563  
   564  	// Refs is still zero; destroy it.
   565  	d.destroyLocked(ctx)
   566  	return
   567  }
   568  
   569  // destroyLocked destroys the dentry.
   570  //
   571  // Preconditions:
   572  // * d.fs.renameMu must be locked for writing.
   573  // * d.refs == 0.
   574  func (d *dentry) destroyLocked(ctx context.Context) {
   575  	switch atomic.LoadInt64(&d.refs) {
   576  	case 0:
   577  		// Mark the dentry destroyed.
   578  		atomic.StoreInt64(&d.refs, -1)
   579  	case -1:
   580  		panic("overlay.dentry.destroyLocked() called on already destroyed dentry")
   581  	default:
   582  		panic("overlay.dentry.destroyLocked() called with references on the dentry")
   583  	}
   584  
   585  	if d.upperVD.Ok() {
   586  		d.upperVD.DecRef(ctx)
   587  	}
   588  	for _, lowerVD := range d.lowerVDs {
   589  		lowerVD.DecRef(ctx)
   590  	}
   591  
   592  	d.watches.HandleDeletion(ctx)
   593  
   594  	if d.parent != nil {
   595  		d.parent.dirMu.Lock()
   596  		if !d.vfsd.IsDead() {
   597  			delete(d.parent.children, d.name)
   598  		}
   599  		d.parent.dirMu.Unlock()
   600  		// Drop the reference held by d on its parent without recursively
   601  		// locking d.fs.renameMu.
   602  		d.parent.decRefLocked(ctx)
   603  	}
   604  	refsvfs2.Unregister(d)
   605  }
   606  
   607  // RefType implements refsvfs2.CheckedObject.Type.
   608  func (d *dentry) RefType() string {
   609  	return "overlay.dentry"
   610  }
   611  
   612  // LeakMessage implements refsvfs2.CheckedObject.LeakMessage.
   613  func (d *dentry) LeakMessage() string {
   614  	return fmt.Sprintf("[overlay.dentry %p] reference count of %d instead of -1", d, atomic.LoadInt64(&d.refs))
   615  }
   616  
   617  // LogRefs implements refsvfs2.CheckedObject.LogRefs.
   618  //
   619  // This should only be set to true for debugging purposes, as it can generate an
   620  // extremely large amount of output and drastically degrade performance.
   621  func (d *dentry) LogRefs() bool {
   622  	return false
   623  }
   624  
   625  // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
   626  func (d *dentry) InotifyWithParent(ctx context.Context, events uint32, cookie uint32, et vfs.EventType) {
   627  	if d.isDir() {
   628  		events |= linux.IN_ISDIR
   629  	}
   630  
   631  	// overlayfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates
   632  	// that d was deleted.
   633  	deleted := d.vfsd.IsDead()
   634  
   635  	d.fs.renameMu.RLock()
   636  	// The ordering below is important, Linux always notifies the parent first.
   637  	if d.parent != nil {
   638  		d.parent.watches.Notify(ctx, d.name, events, cookie, et, deleted)
   639  	}
   640  	d.watches.Notify(ctx, "", events, cookie, et, deleted)
   641  	d.fs.renameMu.RUnlock()
   642  }
   643  
   644  // Watches implements vfs.DentryImpl.Watches.
   645  func (d *dentry) Watches() *vfs.Watches {
   646  	return &d.watches
   647  }
   648  
   649  // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
   650  func (d *dentry) OnZeroWatches(ctx context.Context) {
   651  	if atomic.LoadInt64(&d.refs) == 0 {
   652  		d.fs.renameMu.Lock()
   653  		d.checkDropLocked(ctx)
   654  		d.fs.renameMu.Unlock()
   655  	}
   656  }
   657  
   658  // iterLayers invokes yield on each layer comprising d, from top to bottom. If
   659  // any call to yield returns false, iterLayer stops iteration.
   660  func (d *dentry) iterLayers(yield func(vd vfs.VirtualDentry, isUpper bool) bool) {
   661  	if d.isCopiedUp() {
   662  		if !yield(d.upperVD, true) {
   663  			return
   664  		}
   665  	}
   666  	for _, lowerVD := range d.lowerVDs {
   667  		if !yield(lowerVD, false) {
   668  			return
   669  		}
   670  	}
   671  }
   672  
   673  func (d *dentry) topLayerInfo() (vd vfs.VirtualDentry, isUpper bool) {
   674  	if d.isCopiedUp() {
   675  		return d.upperVD, true
   676  	}
   677  	return d.lowerVDs[0], false
   678  }
   679  
   680  func (d *dentry) topLayer() vfs.VirtualDentry {
   681  	vd, _ := d.topLayerInfo()
   682  	return vd
   683  }
   684  
   685  func (d *dentry) topLookupLayer() lookupLayer {
   686  	if d.upperVD.Ok() {
   687  		return lookupLayerUpper
   688  	}
   689  	return lookupLayerLower
   690  }
   691  
   692  func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
   693  	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(atomic.LoadUint32(&d.mode)), auth.KUID(atomic.LoadUint32(&d.uid)), auth.KGID(atomic.LoadUint32(&d.gid)))
   694  }
   695  
   696  func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
   697  	mode := linux.FileMode(atomic.LoadUint32(&d.mode))
   698  	kuid := auth.KUID(atomic.LoadUint32(&d.uid))
   699  	kgid := auth.KGID(atomic.LoadUint32(&d.gid))
   700  	if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
   701  		return err
   702  	}
   703  	return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
   704  }
   705  
// statInternalMask is the set of stat fields that is set by
// dentry.statInternalTo(): file type, mode, UID, GID and inode number.
const statInternalMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
   709  
   710  // statInternalTo writes fields to stat that are stored in d, and therefore do
   711  // not requiring invoking StatAt on the overlay's layers.
   712  func (d *dentry) statInternalTo(ctx context.Context, opts *vfs.StatOptions, stat *linux.Statx) {
   713  	stat.Mask |= statInternalMask
   714  	if d.isDir() {
   715  		// Linux sets nlink to 1 for merged directories
   716  		// (fs/overlayfs/inode.c:ovl_getattr()); we set it to 2 because this is
   717  		// correct more often ("." and the directory's entry in its parent),
   718  		// and some of our tests expect this.
   719  		stat.Nlink = 2
   720  	}
   721  	stat.UID = atomic.LoadUint32(&d.uid)
   722  	stat.GID = atomic.LoadUint32(&d.gid)
   723  	stat.Mode = uint16(atomic.LoadUint32(&d.mode))
   724  	stat.Ino = atomic.LoadUint64(&d.ino)
   725  	stat.DevMajor = atomic.LoadUint32(&d.devMajor)
   726  	stat.DevMinor = atomic.LoadUint32(&d.devMinor)
   727  }
   728  
   729  // Preconditions: d.copyMu must be locked for writing.
   730  func (d *dentry) updateAfterSetStatLocked(opts *vfs.SetStatOptions) {
   731  	if opts.Stat.Mask&linux.STATX_MODE != 0 {
   732  		atomic.StoreUint32(&d.mode, (d.mode&linux.S_IFMT)|uint32(opts.Stat.Mode&^linux.S_IFMT))
   733  	}
   734  	if opts.Stat.Mask&linux.STATX_UID != 0 {
   735  		atomic.StoreUint32(&d.uid, opts.Stat.UID)
   736  	}
   737  	if opts.Stat.Mask&linux.STATX_GID != 0 {
   738  		atomic.StoreUint32(&d.gid, opts.Stat.GID)
   739  	}
   740  }
   741  
   742  func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error {
   743  	return vfs.CheckDeleteSticky(
   744  		creds,
   745  		linux.FileMode(atomic.LoadUint32(&d.mode)),
   746  		auth.KUID(atomic.LoadUint32(&d.uid)),
   747  		auth.KUID(atomic.LoadUint32(&child.uid)),
   748  		auth.KGID(atomic.LoadUint32(&child.gid)),
   749  	)
   750  }
   751  
   752  // newChildOwnerStat returns a Statx for configuring the UID, GID, and mode of
   753  // children.
   754  func (d *dentry) newChildOwnerStat(mode linux.FileMode, creds *auth.Credentials) linux.Statx {
   755  	stat := linux.Statx{
   756  		Mask: uint32(linux.STATX_UID | linux.STATX_GID),
   757  		UID:  uint32(creds.EffectiveKUID),
   758  		GID:  uint32(creds.EffectiveKGID),
   759  	}
   760  	// Set GID and possibly the SGID bit if the parent is an SGID directory.
   761  	d.copyMu.RLock()
   762  	defer d.copyMu.RUnlock()
   763  	if atomic.LoadUint32(&d.mode)&linux.ModeSetGID == linux.ModeSetGID {
   764  		stat.GID = atomic.LoadUint32(&d.gid)
   765  		if stat.Mode&linux.ModeDirectory == linux.ModeDirectory {
   766  			stat.Mode = uint16(mode) | linux.ModeSetGID
   767  			stat.Mask |= linux.STATX_MODE
   768  		}
   769  	}
   770  	return stat
   771  }
   772  
// fileDescription is embedded by overlay implementations of
// vfs.FileDescriptionImpl. It provides accessors for the owning filesystem
// and dentry and the shared extended attribute methods.
//
// +stateify savable
type fileDescription struct {
	vfsfd vfs.FileDescription
	vfs.FileDescriptionDefaultImpl
	vfs.LockFD
}
   782  
   783  func (fd *fileDescription) filesystem() *filesystem {
   784  	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
   785  }
   786  
   787  func (fd *fileDescription) dentry() *dentry {
   788  	return fd.vfsfd.Dentry().Impl().(*dentry)
   789  }
   790  
   791  // ListXattr implements vfs.FileDescriptionImpl.ListXattr.
   792  func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
   793  	return fd.filesystem().listXattr(ctx, fd.dentry(), size)
   794  }
   795  
   796  // GetXattr implements vfs.FileDescriptionImpl.GetXattr.
   797  func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
   798  	return fd.filesystem().getXattr(ctx, fd.dentry(), auth.CredentialsFromContext(ctx), &opts)
   799  }
   800  
   801  // SetXattr implements vfs.FileDescriptionImpl.SetXattr.
   802  func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
   803  	fs := fd.filesystem()
   804  	d := fd.dentry()
   805  
   806  	fs.renameMu.RLock()
   807  	err := fs.setXattrLocked(ctx, d, fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), &opts)
   808  	fs.renameMu.RUnlock()
   809  	if err != nil {
   810  		return err
   811  	}
   812  
   813  	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
   814  	return nil
   815  }
   816  
   817  // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
   818  func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
   819  	fs := fd.filesystem()
   820  	d := fd.dentry()
   821  
   822  	fs.renameMu.RLock()
   823  	err := fs.removeXattrLocked(ctx, d, fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), name)
   824  	fs.renameMu.RUnlock()
   825  	if err != nil {
   826  		return err
   827  	}
   828  
   829  	d.InotifyWithParent(ctx, linux.IN_ATTRIB, 0, vfs.InodeEvent)
   830  	return nil
   831  }