github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/overlay/overlay.go (about)

     1  // Copyright 2020 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package overlay provides an overlay filesystem implementation, which
    16  // synthesizes a filesystem by composing one or more immutable filesystems
    17  // ("lower layers") with an optional mutable filesystem ("upper layer").
    18  //
    19  // Lock order:
    20  //
    21  //	directoryFD.mu / regularFileFD.mu
    22  //		filesystem.renameMu
    23  //			dentry.dirMu
    24  //		    dentry.copyMu
    25  //		      filesystem.devMu
    26  //		      *** "memmap.Mappable locks" below this point
    27  //		      dentry.mapsMu
    28  //		        *** "memmap.Mappable locks taken by Translate" below this point
    29  //		        dentry.dataMu
    30  //
    31  // Locking dentry.dirMu in multiple dentries requires that parent dentries are
    32  // locked before child dentries, and that filesystem.renameMu is locked to
    33  // stabilize this relationship.
    34  package overlay
    35  
    36  import (
    37  	"fmt"
    38  	"strings"
    39  	"sync/atomic"
    40  
    41  	"github.com/metacubex/gvisor/pkg/abi/linux"
    42  	"github.com/metacubex/gvisor/pkg/atomicbitops"
    43  	"github.com/metacubex/gvisor/pkg/context"
    44  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    45  	"github.com/metacubex/gvisor/pkg/fspath"
    46  	"github.com/metacubex/gvisor/pkg/refs"
    47  	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
    48  	"github.com/metacubex/gvisor/pkg/sentry/memmap"
    49  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    50  	"github.com/metacubex/gvisor/pkg/sync"
    51  )
    52  
// Name is the default filesystem name. It is the value returned by
// FilesystemType.Name.
const Name = "overlay"
    55  
// FilesystemType implements vfs.FilesystemType. It is an empty struct and
// carries no state of its own.
//
// +stateify savable
type FilesystemType struct{}
    60  
// Name implements vfs.FilesystemType.Name. It returns the package-level
// constant Name.
func (FilesystemType) Name() string {
	return Name
}
    65  
// Release implements vfs.FilesystemType.Release. It does nothing, since
// FilesystemType holds no resources.
func (FilesystemType) Release(ctx context.Context) {}
    68  
// FilesystemOptions may be passed as vfs.GetFilesystemOptions.InternalData to
// FilesystemType.GetFilesystem. Layers given here are an alternative to the
// "upperdir" and "lowerdir" mount options; GetFilesystem rejects a request
// that specifies the same kind of layer in both places.
//
// +stateify savable
type FilesystemOptions struct {
	// Callers passing FilesystemOptions to
	// overlay.FilesystemType.GetFilesystem() are responsible for ensuring that
	// the vfs.Mounts comprising the layers of the overlay filesystem do not
	// contain submounts.

	// If UpperRoot.Ok(), it is the root of the writable upper layer of the
	// overlay.
	UpperRoot vfs.VirtualDentry

	// LowerRoots contains the roots of the immutable lower layers of the
	// overlay. LowerRoots is immutable.
	LowerRoots []vfs.VirtualDentry
}
    87  
// filesystem implements vfs.FilesystemImpl.
//
// +stateify savable
type filesystem struct {
	// vfsfs is the vfs.Filesystem that this implementation backs; it is
	// initialized in FilesystemType.GetFilesystem.
	vfsfs vfs.Filesystem

	// Immutable options.
	opts FilesystemOptions

	// creds is a copy of the filesystem's creator's credentials, which are
	// used for accesses to the filesystem's layers. creds is immutable.
	creds *auth.Credentials

	// dirDevMinor is the device minor number used for directories. dirDevMinor
	// is immutable.
	dirDevMinor uint32

	// devMu protects lowerDevMinors.
	//
	// lowerDevMinors maps device numbers from lower layer filesystems to
	// device minor numbers assigned to non-directory files originating from
	// that filesystem. (This remapping is necessary for lower layers because a
	// file on a lower layer, and that same file on an overlay, are
	// distinguishable because they will diverge after copy-up; this isn't true
	// for non-directory files already on the upper layer.)
	devMu          devMutex `state:"nosave"`
	lowerDevMinors map[layerDevNumber]uint32

	// renameMu synchronizes renaming with non-renaming operations in order to
	// ensure consistent lock ordering between dentry.dirMu in different
	// dentries.
	renameMu renameRWMutex `state:"nosave"`

	// dirInoCache caches overlay-private directory inode numbers by mapped
	// bottommost device numbers and inode number. dirInoCache is protected by
	// dirInoCacheMu.
	dirInoCacheMu dirInoCacheMutex `state:"nosave"`
	dirInoCache   map[layerDevNoAndIno]uint64

	// lastDirIno is the last inode number assigned to a directory. lastDirIno
	// is protected by dirInoCacheMu.
	lastDirIno uint64

	// maxFilenameLen is the maximum filename length allowed by the overlayfs.
	// It starts at linux.NAME_MAX and is raised by updateMaxNameLen while the
	// filesystem is being constructed.
	maxFilenameLen uint64
}
   133  
// layerDevNumber is the device major/minor number pair of a layer
// filesystem. It is used as the key of filesystem.lowerDevMinors.
//
// +stateify savable
type layerDevNumber struct {
	major uint32
	minor uint32
}
   139  
// layerDevNoAndIno identifies a file on a layer by its device number and
// inode number. It is used as the key of filesystem.dirInoCache.
//
// +stateify savable
type layerDevNoAndIno struct {
	layerDevNumber
	ino uint64
}
   145  
   146  // GetFilesystem implements vfs.FilesystemType.GetFilesystem.
   147  func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) {
   148  	mopts := vfs.GenericParseMountOptions(opts.Data)
   149  	fsoptsRaw := opts.InternalData
   150  	fsopts, ok := fsoptsRaw.(FilesystemOptions)
   151  	if fsoptsRaw != nil && !ok {
   152  		ctx.Infof("overlay.FilesystemType.GetFilesystem: GetFilesystemOptions.InternalData has type %T, wanted overlay.FilesystemOptions or nil", fsoptsRaw)
   153  		return nil, nil, linuxerr.EINVAL
   154  	}
   155  	vfsroot := vfs.RootFromContext(ctx)
   156  	if vfsroot.Ok() {
   157  		defer vfsroot.DecRef(ctx)
   158  	}
   159  
   160  	if upperPathname, ok := mopts["upperdir"]; ok {
   161  		if fsopts.UpperRoot.Ok() {
   162  			ctx.Infof("overlay.FilesystemType.GetFilesystem: both upperdir and FilesystemOptions.UpperRoot are specified")
   163  			return nil, nil, linuxerr.EINVAL
   164  		}
   165  		delete(mopts, "upperdir")
   166  		// Linux overlayfs also requires a workdir when upperdir is
   167  		// specified; we don't, so silently ignore this option.
   168  		if workdir, ok := mopts["workdir"]; ok {
   169  			// Linux creates the "work" directory in `workdir`.
   170  			// Docker calls chown on it and fails if it doesn't
   171  			// exist.
   172  			workdirPath := fspath.Parse(workdir + "/work")
   173  			if !workdirPath.Absolute {
   174  				ctx.Infof("overlay.FilesystemType.GetFilesystem: workdir %q must be absolute", workdir)
   175  				return nil, nil, linuxerr.EINVAL
   176  			}
   177  			pop := vfs.PathOperation{
   178  				Root:               vfsroot,
   179  				Start:              vfsroot,
   180  				Path:               workdirPath,
   181  				FollowFinalSymlink: false,
   182  			}
   183  			mode := vfs.MkdirOptions{
   184  				Mode: linux.ModeUserAll,
   185  			}
   186  			if err := vfsObj.MkdirAt(ctx, creds, &pop, &mode); err != nil && err != linuxerr.EEXIST {
   187  				ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to create %s/work: %v", workdir, err)
   188  			}
   189  			delete(mopts, "workdir")
   190  		}
   191  		upperPath := fspath.Parse(upperPathname)
   192  		if !upperPath.Absolute {
   193  			ctx.Infof("overlay.FilesystemType.GetFilesystem: upperdir %q must be absolute", upperPathname)
   194  			return nil, nil, linuxerr.EINVAL
   195  		}
   196  		upperRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
   197  			Root:               vfsroot,
   198  			Start:              vfsroot,
   199  			Path:               upperPath,
   200  			FollowFinalSymlink: true,
   201  		}, &vfs.GetDentryOptions{
   202  			CheckSearchable: true,
   203  		})
   204  		if err != nil {
   205  			ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve upperdir %q: %v", upperPathname, err)
   206  			return nil, nil, err
   207  		}
   208  		// TODO(b/286942303): Only tmpfs supports whiteouts and
   209  		// trusted.overlay attributes. Don't allow to use non-tmpfs
   210  		// mounts on upper levels for mounts created through the mount
   211  		// syscall. In gVisor configs, users can specify any
   212  		// configurations on their own risk.
   213  		if !opts.InternalMount && upperRoot.Mount().Filesystem().FilesystemType().Name() != "tmpfs" {
   214  			return nil, nil, linuxerr.EINVAL
   215  		}
   216  		privateUpperRoot, err := clonePrivateMount(vfsObj, upperRoot, false /* forceReadOnly */)
   217  		upperRoot.DecRef(ctx)
   218  		if err != nil {
   219  			ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of upperdir %q: %v", upperPathname, err)
   220  			return nil, nil, err
   221  		}
   222  		defer privateUpperRoot.DecRef(ctx)
   223  		fsopts.UpperRoot = privateUpperRoot
   224  	}
   225  
   226  	if lowerPathnamesStr, ok := mopts["lowerdir"]; ok {
   227  		if len(fsopts.LowerRoots) != 0 {
   228  			ctx.Infof("overlay.FilesystemType.GetFilesystem: both lowerdir and FilesystemOptions.LowerRoots are specified")
   229  			return nil, nil, linuxerr.EINVAL
   230  		}
   231  		delete(mopts, "lowerdir")
   232  		lowerPathnames := strings.Split(lowerPathnamesStr, ":")
   233  		for _, lowerPathname := range lowerPathnames {
   234  			lowerPath := fspath.Parse(lowerPathname)
   235  			if !lowerPath.Absolute {
   236  				ctx.Infof("overlay.FilesystemType.GetFilesystem: lowerdir %q must be absolute", lowerPathname)
   237  				return nil, nil, linuxerr.EINVAL
   238  			}
   239  			lowerRoot, err := vfsObj.GetDentryAt(ctx, creds, &vfs.PathOperation{
   240  				Root:               vfsroot,
   241  				Start:              vfsroot,
   242  				Path:               lowerPath,
   243  				FollowFinalSymlink: true,
   244  			}, &vfs.GetDentryOptions{
   245  				CheckSearchable: true,
   246  			})
   247  			if err != nil {
   248  				ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to resolve lowerdir %q: %v", lowerPathname, err)
   249  				return nil, nil, err
   250  			}
   251  			privateLowerRoot, err := clonePrivateMount(vfsObj, lowerRoot, true /* forceReadOnly */)
   252  			lowerRoot.DecRef(ctx)
   253  			if err != nil {
   254  				ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to make private bind mount of lowerdir %q: %v", lowerPathname, err)
   255  				return nil, nil, err
   256  			}
   257  			defer privateLowerRoot.DecRef(ctx)
   258  			fsopts.LowerRoots = append(fsopts.LowerRoots, privateLowerRoot)
   259  		}
   260  	}
   261  
   262  	if len(mopts) != 0 {
   263  		ctx.Infof("overlay.FilesystemType.GetFilesystem: unused options: %v", mopts)
   264  		return nil, nil, linuxerr.EINVAL
   265  	}
   266  
   267  	if len(fsopts.LowerRoots) == 0 {
   268  		ctx.Infof("overlay.FilesystemType.GetFilesystem: at least one lower layer is required")
   269  		return nil, nil, linuxerr.EINVAL
   270  	}
   271  	if len(fsopts.LowerRoots) < 2 && !fsopts.UpperRoot.Ok() {
   272  		ctx.Infof("overlay.FilesystemType.GetFilesystem: at least two lower layers are required when no upper layer is present")
   273  		return nil, nil, linuxerr.EINVAL
   274  	}
   275  	const maxLowerLayers = 500 // Linux: fs/overlay/super.c:OVL_MAX_STACK
   276  	if len(fsopts.LowerRoots) > maxLowerLayers {
   277  		ctx.Infof("overlay.FilesystemType.GetFilesystem: %d lower layers specified, maximum %d", len(fsopts.LowerRoots), maxLowerLayers)
   278  		return nil, nil, linuxerr.EINVAL
   279  	}
   280  
   281  	// Allocate dirDevMinor. lowerDevMinors are allocated dynamically.
   282  	dirDevMinor, err := vfsObj.GetAnonBlockDevMinor()
   283  	if err != nil {
   284  		return nil, nil, err
   285  	}
   286  
   287  	// Take extra references held by the filesystem.
   288  	if fsopts.UpperRoot.Ok() {
   289  		fsopts.UpperRoot.IncRef()
   290  	}
   291  	for _, lowerRoot := range fsopts.LowerRoots {
   292  		lowerRoot.IncRef()
   293  	}
   294  
   295  	fs := &filesystem{
   296  		opts:           fsopts,
   297  		creds:          creds.Fork(),
   298  		dirDevMinor:    dirDevMinor,
   299  		lowerDevMinors: make(map[layerDevNumber]uint32),
   300  		dirInoCache:    make(map[layerDevNoAndIno]uint64),
   301  		maxFilenameLen: linux.NAME_MAX,
   302  	}
   303  	fs.vfsfs.Init(vfsObj, &fstype, fs)
   304  
   305  	// Configure max filename length. Similar to what Linux does in
   306  	// fs/overlayfs/super.c:ovl_fill_super() -> ... -> ovl_check_namelen().
   307  	if fsopts.UpperRoot.Ok() {
   308  		if err := fs.updateMaxNameLen(ctx, creds, vfsObj, fs.opts.UpperRoot); err != nil {
   309  			ctx.Debugf("overlay.FilesystemType.GetFilesystem: failed to StatFSAt on upper layer root: %v", err)
   310  		}
   311  	}
   312  	for _, lowerRoot := range fsopts.LowerRoots {
   313  		if err := fs.updateMaxNameLen(ctx, creds, vfsObj, lowerRoot); err != nil {
   314  			ctx.Debugf("overlay.FilesystemType.GetFilesystem: failed to StatFSAt on lower layer root: %v", err)
   315  		}
   316  	}
   317  
   318  	// Construct the root dentry.
   319  	root := fs.newDentry()
   320  	root.refs = atomicbitops.FromInt64(1)
   321  	if fs.opts.UpperRoot.Ok() {
   322  		fs.opts.UpperRoot.IncRef()
   323  		root.copiedUp = atomicbitops.FromUint32(1)
   324  		root.upperVD = fs.opts.UpperRoot
   325  	}
   326  	for _, lowerRoot := range fs.opts.LowerRoots {
   327  		lowerRoot.IncRef()
   328  		root.lowerVDs = append(root.lowerVDs, lowerRoot)
   329  	}
   330  	rootTopVD := root.topLayer()
   331  	// Get metadata from the topmost layer. See fs.lookupLocked().
   332  	const rootStatMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
   333  	rootStat, err := vfsObj.StatAt(ctx, creds, &vfs.PathOperation{
   334  		Root:  rootTopVD,
   335  		Start: rootTopVD,
   336  	}, &vfs.StatOptions{
   337  		Mask: rootStatMask,
   338  	})
   339  	if err != nil {
   340  		root.destroyLocked(ctx)
   341  		fs.vfsfs.DecRef(ctx)
   342  		return nil, nil, err
   343  	}
   344  	if rootStat.Mask&rootStatMask != rootStatMask {
   345  		root.destroyLocked(ctx)
   346  		fs.vfsfs.DecRef(ctx)
   347  		return nil, nil, linuxerr.EREMOTE
   348  	}
   349  	if isWhiteout(&rootStat) {
   350  		ctx.Infof("overlay.FilesystemType.GetFilesystem: filesystem root is a whiteout")
   351  		root.destroyLocked(ctx)
   352  		fs.vfsfs.DecRef(ctx)
   353  		return nil, nil, linuxerr.EINVAL
   354  	}
   355  	root.mode = atomicbitops.FromUint32(uint32(rootStat.Mode))
   356  	root.uid = atomicbitops.FromUint32(rootStat.UID)
   357  	root.gid = atomicbitops.FromUint32(rootStat.GID)
   358  	if rootStat.Mode&linux.S_IFMT == linux.S_IFDIR {
   359  		root.devMajor = atomicbitops.FromUint32(linux.UNNAMED_MAJOR)
   360  		root.devMinor = atomicbitops.FromUint32(fs.dirDevMinor)
   361  		// For root dir, it is okay to use top most level's stat to compute inode
   362  		// number because we don't allow copy ups on root dentries.
   363  		root.ino.Store(fs.newDirIno(rootStat.DevMajor, rootStat.DevMinor, rootStat.Ino))
   364  	} else if !root.upperVD.Ok() {
   365  		root.devMajor = atomicbitops.FromUint32(linux.UNNAMED_MAJOR)
   366  		rootDevMinor, err := fs.getLowerDevMinor(rootStat.DevMajor, rootStat.DevMinor)
   367  		if err != nil {
   368  			ctx.Infof("overlay.FilesystemType.GetFilesystem: failed to get device number for root: %v", err)
   369  			root.destroyLocked(ctx)
   370  			fs.vfsfs.DecRef(ctx)
   371  			return nil, nil, err
   372  		}
   373  		root.devMinor = atomicbitops.FromUint32(rootDevMinor)
   374  		root.ino.Store(rootStat.Ino)
   375  	} else {
   376  		root.devMajor = atomicbitops.FromUint32(rootStat.DevMajor)
   377  		root.devMinor = atomicbitops.FromUint32(rootStat.DevMinor)
   378  		root.ino.Store(rootStat.Ino)
   379  	}
   380  
   381  	return &fs.vfsfs, &root.vfsd, nil
   382  }
   383  
   384  // clonePrivateMount creates a non-recursive bind mount rooted at vd, not
   385  // associated with any MountNamespace, and returns the root of the new mount.
   386  // (This is required to ensure that each layer of an overlay comprises only a
   387  // single mount, and therefore can't cross into e.g. the overlay filesystem
   388  // itself, risking lock recursion.) A reference is held on the returned
   389  // VirtualDentry.
   390  func clonePrivateMount(vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry, forceReadOnly bool) (vfs.VirtualDentry, error) {
   391  	oldmnt := vd.Mount()
   392  	opts := oldmnt.Options()
   393  	if forceReadOnly {
   394  		opts.ReadOnly = true
   395  	}
   396  	newmnt := vfsObj.NewDisconnectedMount(oldmnt.Filesystem(), vd.Dentry(), &opts)
   397  	// Take a reference on the dentry which will be owned by the returned
   398  	// VirtualDentry.
   399  	d := vd.Dentry()
   400  	d.IncRef()
   401  	return vfs.MakeVirtualDentry(newmnt, d), nil
   402  }
   403  
   404  // Release implements vfs.FilesystemImpl.Release.
   405  func (fs *filesystem) Release(ctx context.Context) {
   406  	vfsObj := fs.vfsfs.VirtualFilesystem()
   407  	vfsObj.PutAnonBlockDevMinor(fs.dirDevMinor)
   408  	for _, lowerDevMinor := range fs.lowerDevMinors {
   409  		vfsObj.PutAnonBlockDevMinor(lowerDevMinor)
   410  	}
   411  	if fs.opts.UpperRoot.Ok() {
   412  		fs.opts.UpperRoot.DecRef(ctx)
   413  	}
   414  	for _, lowerRoot := range fs.opts.LowerRoots {
   415  		lowerRoot.DecRef(ctx)
   416  	}
   417  }
   418  
   419  // updateMaxNameLen is analogous to fs/overlayfs/super.c:ovl_check_namelen().
   420  func (fs *filesystem) updateMaxNameLen(ctx context.Context, creds *auth.Credentials, vfsObj *vfs.VirtualFilesystem, vd vfs.VirtualDentry) error {
   421  	statfs, err := vfsObj.StatFSAt(ctx, creds, &vfs.PathOperation{
   422  		Root:  vd,
   423  		Start: vd,
   424  	})
   425  	if err != nil {
   426  		return err
   427  	}
   428  	if statfs.NameLength > fs.maxFilenameLen {
   429  		fs.maxFilenameLen = statfs.NameLength
   430  	}
   431  	return nil
   432  }
   433  
   434  func (fs *filesystem) statFS(ctx context.Context) (linux.Statfs, error) {
   435  	// Always statfs the root of the topmost layer. Compare Linux's
   436  	// fs/overlayfs/super.c:ovl_statfs().
   437  	var rootVD vfs.VirtualDentry
   438  	if fs.opts.UpperRoot.Ok() {
   439  		rootVD = fs.opts.UpperRoot
   440  	} else {
   441  		rootVD = fs.opts.LowerRoots[0]
   442  	}
   443  	fsstat, err := fs.vfsfs.VirtualFilesystem().StatFSAt(ctx, fs.creds, &vfs.PathOperation{
   444  		Root:  rootVD,
   445  		Start: rootVD,
   446  	})
   447  	if err != nil {
   448  		return linux.Statfs{}, err
   449  	}
   450  	fsstat.Type = linux.OVERLAYFS_SUPER_MAGIC
   451  	return fsstat, nil
   452  }
   453  
   454  func (fs *filesystem) newDirIno(layerMajor, layerMinor uint32, layerIno uint64) uint64 {
   455  	fs.dirInoCacheMu.Lock()
   456  	defer fs.dirInoCacheMu.Unlock()
   457  	orig := layerDevNoAndIno{
   458  		layerDevNumber: layerDevNumber{layerMajor, layerMinor},
   459  		ino:            layerIno,
   460  	}
   461  	if ino, ok := fs.dirInoCache[orig]; ok {
   462  		return ino
   463  	}
   464  	fs.lastDirIno++
   465  	newIno := fs.lastDirIno
   466  	fs.dirInoCache[orig] = newIno
   467  	return newIno
   468  }
   469  
   470  func (fs *filesystem) getLowerDevMinor(layerMajor, layerMinor uint32) (uint32, error) {
   471  	fs.devMu.Lock()
   472  	defer fs.devMu.Unlock()
   473  	orig := layerDevNumber{layerMajor, layerMinor}
   474  	if minor, ok := fs.lowerDevMinors[orig]; ok {
   475  		return minor, nil
   476  	}
   477  	minor, err := fs.vfsfs.VirtualFilesystem().GetAnonBlockDevMinor()
   478  	if err != nil {
   479  		return 0, err
   480  	}
   481  	fs.lowerDevMinors[orig] = minor
   482  	return minor, nil
   483  }
   484  
   485  // IsDescendant implements vfs.FilesystemImpl.IsDescendant.
   486  func (fs *filesystem) IsDescendant(vfsroot, vd vfs.VirtualDentry) bool {
   487  	return genericIsDescendant(vfsroot.Dentry(), vd.Dentry().Impl().(*dentry))
   488  }
   489  
// dentry implements vfs.DentryImpl.
//
// +stateify savable
type dentry struct {
	vfsd vfs.Dentry

	// refs is the dentry's reference count. A value of -1 marks a dentry
	// that has been destroyed (see destroyLocked).
	refs atomicbitops.Int64

	// fs is the owning filesystem. fs is immutable.
	fs *filesystem

	// mode, uid, and gid are the file mode, owner, and group of the file in
	// the topmost layer (and therefore the overlay file as well), and are used
	// for permission checks on this dentry. These fields are protected by
	// copyMu.
	mode atomicbitops.Uint32
	uid  atomicbitops.Uint32
	gid  atomicbitops.Uint32

	// copiedUp is 1 if this dentry has been copied-up (i.e. upperVD.Ok()) and
	// 0 otherwise.
	copiedUp atomicbitops.Uint32

	// parent is the dentry corresponding to this dentry's parent directory.
	// name is this dentry's name in parent. If this dentry is a filesystem
	// root, parent is nil and name is the empty string. parent and name are
	// protected by fs.renameMu.
	parent atomic.Pointer[dentry] `state:".(*dentry)"`
	name   string

	// If this dentry represents a directory, children maps the names of
	// children for which dentries have been instantiated to those dentries,
	// and dirents (if not nil) is a cache of dirents as returned by
	// directoryFDs representing this directory. children is protected by
	// dirMu.
	dirMu    dirMutex `state:"nosave"`
	children map[string]*dentry
	dirents  []vfs.Dirent

	// upperVD and lowerVDs are the files from the overlay filesystem's layers
	// that comprise the file on the overlay filesystem.
	//
	// If !upperVD.Ok(), it can transition to a valid vfs.VirtualDentry (i.e.
	// be copied up) with copyMu locked for writing; otherwise, it is
	// immutable. lowerVDs is always immutable.
	copyMu   sync.RWMutex `state:"nosave"`
	upperVD  vfs.VirtualDentry
	lowerVDs []vfs.VirtualDentry

	// inlineLowerVDs backs lowerVDs in the common case where len(lowerVDs) <=
	// len(inlineLowerVDs).
	inlineLowerVDs [1]vfs.VirtualDentry

	// devMajor, devMinor, and ino are the device major/minor and inode numbers
	// used by this dentry. These fields are protected by copyMu.
	devMajor atomicbitops.Uint32
	devMinor atomicbitops.Uint32
	ino      atomicbitops.Uint64

	// If this dentry represents a regular file, then:
	//
	//	- mapsMu is used to synchronize between copy-up and memmap.Mappable
	//		methods on dentry preceding mm.MemoryManager.activeMu in the lock order.
	//
	//	- dataMu is used to synchronize between copy-up and
	//		dentry.(memmap.Mappable).Translate.
	//
	//	- lowerMappings tracks memory mappings of the file. lowerMappings is
	//		used to invalidate mappings of the lower layer when the file is copied
	//		up to ensure that they remain coherent with subsequent writes to the
	//		file. (Note that, as of this writing, Linux overlayfs does not do this;
	//		this feature is a gVisor extension.) lowerMappings is protected by
	//		mapsMu.
	//
	//	- If this dentry is copied-up, then wrappedMappable is the Mappable
	//		obtained from a call to the current top layer's
	//		FileDescription.ConfigureMMap(). Once wrappedMappable becomes non-nil
	//		(from a call to regularFileFD.ensureMappable()), it cannot become nil.
	//		wrappedMappable is protected by mapsMu and dataMu. In addition, it
	//		has to be immutable if copyMu is taken for write;
	//		copyUpMaybeSyntheticMountpointLocked relies on this behavior.
	//
	//	- isMappable is non-zero iff wrappedMappable is non-nil. isMappable is
	//		accessed using atomic memory operations.
	mapsMu          mapsMutex `state:"nosave"`
	lowerMappings   memmap.MappingSet
	dataMu          dataRWMutex `state:"nosave"`
	wrappedMappable memmap.Mappable
	isMappable      atomicbitops.Uint32

	// locks holds the file locks associated with this dentry.
	locks vfs.FileLocks

	// watches is the set of inotify watches on the file represented by this dentry.
	//
	// Note that hard links to the same file will not share the same set of
	// watches, due to the fact that we do not have inode structures in this
	// overlay implementation.
	watches vfs.Watches
}
   591  
   592  // newDentry creates a new dentry. The dentry initially has no references; it
   593  // is the caller's responsibility to set the dentry's reference count and/or
   594  // call dentry.destroy() as appropriate. The dentry is initially invalid in
   595  // that it contains no layers; the caller is responsible for setting them.
   596  func (fs *filesystem) newDentry() *dentry {
   597  	d := &dentry{
   598  		fs: fs,
   599  	}
   600  	d.lowerVDs = d.inlineLowerVDs[:0]
   601  	d.vfsd.Init(d)
   602  	refs.Register(d)
   603  	return d
   604  }
   605  
   606  // IncRef implements vfs.DentryImpl.IncRef.
   607  func (d *dentry) IncRef() {
   608  	// d.refs may be 0 if d.fs.renameMu is locked, which serializes against
   609  	// d.checkDropLocked().
   610  	r := d.refs.Add(1)
   611  	if d.LogRefs() {
   612  		refs.LogIncRef(d, r)
   613  	}
   614  }
   615  
   616  // TryIncRef implements vfs.DentryImpl.TryIncRef.
   617  func (d *dentry) TryIncRef() bool {
   618  	for {
   619  		r := d.refs.Load()
   620  		if r <= 0 {
   621  			return false
   622  		}
   623  		if d.refs.CompareAndSwap(r, r+1) {
   624  			if d.LogRefs() {
   625  				refs.LogTryIncRef(d, r+1)
   626  			}
   627  			return true
   628  		}
   629  	}
   630  }
   631  
   632  // DecRef implements vfs.DentryImpl.DecRef.
   633  func (d *dentry) DecRef(ctx context.Context) {
   634  	r := d.refs.Add(-1)
   635  	if d.LogRefs() {
   636  		refs.LogDecRef(d, r)
   637  	}
   638  	if r == 0 {
   639  		d.fs.renameMu.Lock()
   640  		d.checkDropLocked(ctx)
   641  		d.fs.renameMu.Unlock()
   642  	} else if r < 0 {
   643  		panic("overlay.dentry.DecRef() called without holding a reference")
   644  	}
   645  }
   646  
   647  func (d *dentry) decRefLocked(ctx context.Context) {
   648  	r := d.refs.Add(-1)
   649  	if d.LogRefs() {
   650  		refs.LogDecRef(d, r)
   651  	}
   652  	if r == 0 {
   653  		d.checkDropLocked(ctx)
   654  	} else if r < 0 {
   655  		panic("overlay.dentry.decRefLocked() called without holding a reference")
   656  	}
   657  }
   658  
   659  // checkDropLocked should be called after d's reference count becomes 0 or it
   660  // becomes deleted.
   661  //
   662  // Preconditions: d.fs.renameMu must be locked for writing.
   663  func (d *dentry) checkDropLocked(ctx context.Context) {
   664  	// Dentries with a positive reference count must be retained. (The only way
   665  	// to obtain a reference on a dentry with zero references is via path
   666  	// resolution, which requires renameMu, so if d.refs is zero then it will
   667  	// remain zero while we hold renameMu for writing.) Dentries with a
   668  	// negative reference count have already been destroyed.
   669  	if d.refs.Load() != 0 {
   670  		return
   671  	}
   672  
   673  	// Make sure that we do not lose watches on dentries that have not been
   674  	// deleted. Note that overlayfs never calls VFS.InvalidateDentry(), so
   675  	// d.vfsd.IsDead() indicates that d was deleted.
   676  	if !d.vfsd.IsDead() && d.watches.Size() > 0 {
   677  		return
   678  	}
   679  
   680  	// Refs is still zero; destroy it.
   681  	d.destroyLocked(ctx)
   682  	return
   683  }
   684  
   685  // destroyLocked destroys the dentry.
   686  //
   687  // Preconditions:
   688  //   - d.fs.renameMu must be locked for writing.
   689  //   - d.refs == 0.
   690  func (d *dentry) destroyLocked(ctx context.Context) {
   691  	switch d.refs.Load() {
   692  	case 0:
   693  		// Mark the dentry destroyed.
   694  		d.refs.Store(-1)
   695  	case -1:
   696  		panic("overlay.dentry.destroyLocked() called on already destroyed dentry")
   697  	default:
   698  		panic("overlay.dentry.destroyLocked() called with references on the dentry")
   699  	}
   700  
   701  	if d.upperVD.Ok() {
   702  		d.upperVD.DecRef(ctx)
   703  	}
   704  	for _, lowerVD := range d.lowerVDs {
   705  		lowerVD.DecRef(ctx)
   706  	}
   707  
   708  	d.watches.HandleDeletion(ctx)
   709  
   710  	if parent := d.parent.Load(); parent != nil {
   711  		parent.dirMu.Lock()
   712  		if !d.vfsd.IsDead() {
   713  			delete(parent.children, d.name)
   714  		}
   715  		parent.dirMu.Unlock()
   716  		// Drop the reference held by d on its parent without recursively
   717  		// locking d.fs.renameMu.
   718  		parent.decRefLocked(ctx)
   719  	}
   720  	refs.Unregister(d)
   721  }
   722  
// RefType implements refs.CheckedObject.Type. It returns the fixed type name
// "overlay.dentry".
func (d *dentry) RefType() string {
	return "overlay.dentry"
}
   727  
   728  // LeakMessage implements refs.CheckedObject.LeakMessage.
   729  func (d *dentry) LeakMessage() string {
   730  	return fmt.Sprintf("[overlay.dentry %p] reference count of %d instead of -1", d, d.refs.Load())
   731  }
   732  
// LogRefs implements refs.CheckedObject.LogRefs. It always returns false, so
// the reference-logging calls in IncRef, TryIncRef, DecRef and decRefLocked
// are skipped.
//
// This should only be set to true for debugging purposes, as it can generate an
// extremely large amount of output and drastically degrade performance.
func (d *dentry) LogRefs() bool {
	return false
}
   740  
   741  // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
   742  func (d *dentry) InotifyWithParent(ctx context.Context, events uint32, cookie uint32, et vfs.EventType) {
   743  	if d.isDir() {
   744  		events |= linux.IN_ISDIR
   745  	}
   746  
   747  	// overlayfs never calls VFS.InvalidateDentry(), so d.vfsd.IsDead() indicates
   748  	// that d was deleted.
   749  	deleted := d.vfsd.IsDead()
   750  
   751  	d.fs.renameMu.RLock()
   752  	// The ordering below is important, Linux always notifies the parent first.
   753  	if parent := d.parent.Load(); parent != nil {
   754  		parent.watches.Notify(ctx, d.name, events, cookie, et, deleted)
   755  	}
   756  	d.watches.Notify(ctx, "", events, cookie, et, deleted)
   757  	d.fs.renameMu.RUnlock()
   758  }
   759  
// Watches implements vfs.DentryImpl.Watches. It returns a pointer to the
// watch set embedded in d.
func (d *dentry) Watches() *vfs.Watches {
	return &d.watches
}
   764  
   765  // OnZeroWatches implements vfs.DentryImpl.OnZeroWatches.
   766  func (d *dentry) OnZeroWatches(ctx context.Context) {
   767  	if d.refs.Load() == 0 {
   768  		d.fs.renameMu.Lock()
   769  		d.checkDropLocked(ctx)
   770  		d.fs.renameMu.Unlock()
   771  	}
   772  }
   773  
   774  // iterLayers invokes yield on each layer comprising d, from top to bottom. If
   775  // any call to yield returns false, iterLayer stops iteration.
   776  func (d *dentry) iterLayers(yield func(vd vfs.VirtualDentry, isUpper bool) bool) {
   777  	if d.isCopiedUp() {
   778  		if !yield(d.upperVD, true) {
   779  			return
   780  		}
   781  	}
   782  	for _, lowerVD := range d.lowerVDs {
   783  		if !yield(lowerVD, false) {
   784  			return
   785  		}
   786  	}
   787  }
   788  
   789  func (d *dentry) topLayerInfo() (vd vfs.VirtualDentry, isUpper bool) {
   790  	if d.isCopiedUp() {
   791  		return d.upperVD, true
   792  	}
   793  	return d.lowerVDs[0], false
   794  }
   795  
   796  func (d *dentry) topLayer() vfs.VirtualDentry {
   797  	vd, _ := d.topLayerInfo()
   798  	return vd
   799  }
   800  
   801  func (d *dentry) topLookupLayer() lookupLayer {
   802  	if d.upperVD.Ok() {
   803  		return lookupLayerUpper
   804  	}
   805  	return lookupLayerLower
   806  }
   807  
   808  func (d *dentry) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes) error {
   809  	return vfs.GenericCheckPermissions(creds, ats, linux.FileMode(d.mode.Load()), auth.KUID(d.uid.Load()), auth.KGID(d.gid.Load()))
   810  }
   811  
   812  func (d *dentry) checkXattrPermissions(creds *auth.Credentials, name string, ats vfs.AccessTypes) error {
   813  	mode := linux.FileMode(d.mode.Load())
   814  	kuid := auth.KUID(d.uid.Load())
   815  	kgid := auth.KGID(d.gid.Load())
   816  	if err := vfs.GenericCheckPermissions(creds, ats, mode, kuid, kgid); err != nil {
   817  		return err
   818  	}
   819  	return vfs.CheckXattrPermissions(creds, ats, mode, kuid, name)
   820  }
   821  
   822  // statInternalMask is the set of stat fields that is set by
   823  // dentry.statInternalTo().
   824  const statInternalMask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO
   825  
   826  // statInternalTo writes fields to stat that are stored in d, and therefore do
   827  // not requiring invoking StatAt on the overlay's layers.
   828  func (d *dentry) statInternalTo(ctx context.Context, opts *vfs.StatOptions, stat *linux.Statx) {
   829  	stat.Mask |= statInternalMask
   830  	if d.isDir() {
   831  		// Linux sets nlink to 1 for merged directories
   832  		// (fs/overlayfs/inode.c:ovl_getattr()); we set it to 2 because this is
   833  		// correct more often ("." and the directory's entry in its parent),
   834  		// and some of our tests expect this.
   835  		stat.Nlink = 2
   836  	}
   837  	stat.UID = d.uid.Load()
   838  	stat.GID = d.gid.Load()
   839  	stat.Mode = uint16(d.mode.Load())
   840  	stat.Ino = d.ino.Load()
   841  	stat.DevMajor = d.devMajor.Load()
   842  	stat.DevMinor = d.devMinor.Load()
   843  }
   844  
   845  // Preconditions: d.copyMu must be locked for writing.
   846  func (d *dentry) updateAfterSetStatLocked(opts *vfs.SetStatOptions) {
   847  	if opts.Stat.Mask&linux.STATX_MODE != 0 {
   848  		d.mode.Store((d.mode.RacyLoad() & linux.S_IFMT) | uint32(opts.Stat.Mode&^linux.S_IFMT))
   849  	}
   850  	if opts.Stat.Mask&linux.STATX_UID != 0 {
   851  		d.uid.Store(opts.Stat.UID)
   852  	}
   853  	if opts.Stat.Mask&linux.STATX_GID != 0 {
   854  		d.gid.Store(opts.Stat.GID)
   855  	}
   856  }
   857  
   858  func (d *dentry) mayDelete(creds *auth.Credentials, child *dentry) error {
   859  	return vfs.CheckDeleteSticky(
   860  		creds,
   861  		linux.FileMode(d.mode.Load()),
   862  		auth.KUID(d.uid.Load()),
   863  		auth.KUID(child.uid.Load()),
   864  		auth.KGID(child.gid.Load()),
   865  	)
   866  }
   867  
   868  // newChildOwnerStat returns a Statx for configuring the UID, GID, and mode of
   869  // children.
   870  func (d *dentry) newChildOwnerStat(mode linux.FileMode, creds *auth.Credentials) linux.Statx {
   871  	stat := linux.Statx{
   872  		Mask: uint32(linux.STATX_UID | linux.STATX_GID),
   873  		UID:  uint32(creds.EffectiveKUID),
   874  		GID:  uint32(creds.EffectiveKGID),
   875  	}
   876  	// Set GID and possibly the SGID bit if the parent is an SGID directory.
   877  	d.copyMu.RLock()
   878  	defer d.copyMu.RUnlock()
   879  	if d.mode.Load()&linux.ModeSetGID == linux.ModeSetGID {
   880  		stat.GID = d.gid.Load()
   881  		if stat.Mode&linux.ModeDirectory == linux.ModeDirectory {
   882  			stat.Mode = uint16(mode) | linux.ModeSetGID
   883  			stat.Mask |= linux.STATX_MODE
   884  		}
   885  	}
   886  	return stat
   887  }
   888  
// fileDescription is embedded by overlay implementations of
// vfs.FileDescriptionImpl.
//
// +stateify savable
type fileDescription struct {
	// vfsfd is the vfs.FileDescription from which the filesystem() and
	// dentry() helpers derive the overlay filesystem and dentry.
	vfsfd vfs.FileDescription
	// FileDescriptionDefaultImpl and LockFD are embedded, so their methods
	// are promoted to fileDescription.
	vfs.FileDescriptionDefaultImpl
	vfs.LockFD
}
   898  
   899  func (fd *fileDescription) filesystem() *filesystem {
   900  	return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem)
   901  }
   902  
   903  func (fd *fileDescription) dentry() *dentry {
   904  	return fd.vfsfd.Dentry().Impl().(*dentry)
   905  }
   906  
   907  // ListXattr implements vfs.FileDescriptionImpl.ListXattr.
   908  func (fd *fileDescription) ListXattr(ctx context.Context, size uint64) ([]string, error) {
   909  	return fd.filesystem().listXattr(ctx, fd.dentry(), size)
   910  }
   911  
   912  // GetXattr implements vfs.FileDescriptionImpl.GetXattr.
   913  func (fd *fileDescription) GetXattr(ctx context.Context, opts vfs.GetXattrOptions) (string, error) {
   914  	return fd.filesystem().getXattr(ctx, fd.dentry(), auth.CredentialsFromContext(ctx), &opts)
   915  }
   916  
   917  // SetXattr implements vfs.FileDescriptionImpl.SetXattr.
   918  func (fd *fileDescription) SetXattr(ctx context.Context, opts vfs.SetXattrOptions) error {
   919  	fs := fd.filesystem()
   920  	fs.renameMu.RLock()
   921  	defer fs.renameMu.RUnlock()
   922  	return fs.setXattrLocked(ctx, fd.dentry(), fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), &opts)
   923  }
   924  
   925  // RemoveXattr implements vfs.FileDescriptionImpl.RemoveXattr.
   926  func (fd *fileDescription) RemoveXattr(ctx context.Context, name string) error {
   927  	fs := fd.filesystem()
   928  	fs.renameMu.RLock()
   929  	defer fs.renameMu.RUnlock()
   930  	return fs.removeXattrLocked(ctx, fd.dentry(), fd.vfsfd.Mount(), auth.CredentialsFromContext(ctx), name)
   931  }