github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/kernfs/kernfs.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package kernfs provides the tools to implement inode-based filesystems.
    16  // Kernfs has two main features:
    17  //
    18  //  1. The Inode interface, which maps VFS's path-based filesystem operations to
    19  //     specific filesystem nodes. Kernfs uses the Inode interface to provide a
    20  //     blanket implementation for the vfs.FilesystemImpl. Kernfs also serves as
    21  //     the synchronization mechanism for all filesystem operations by holding a
    22  //     filesystem-wide lock across all operations.
    23  //
    24  //  2. Various utility types which provide generic implementations for various
    25  //     parts of the Inode and vfs.FileDescription interfaces. Client filesystems
    26  //     based on kernfs can embed the appropriate set of these to avoid having to
    27  //     reimplement common filesystem operations. See inode_impl_util.go and
    28  //     fd_impl_util.go.
    29  //
    30  // Reference Model:
    31  //
    32  // Kernfs dentries represent named pointers to inodes. Kernfs is solely
    33  // responsible for maintaining and modifying its dentry tree; inode
    34  // implementations can not access the tree. Dentries and inodes have
    35  // independent lifetimes and reference counts. A child dentry unconditionally
    36  // holds a reference on its parent directory's dentry. A dentry also holds a
    37  // reference on the inode it points to (although that might not be the only
    38  // reference on the inode). Due to this inodes can outlive the dentries that
    39  // point to them. Multiple dentries can point to the same inode (for example,
    40  // in the case of hardlinks). File descriptors hold a reference to the dentry
    41  // they're opened on.
    42  //
    43  // Dentries are guaranteed to exist while holding Filesystem.mu for
    44  // reading. Dropping dentries requires holding Filesystem.mu for writing. To
    45  // queue dentries for destruction from a read critical section, see
    46  // Filesystem.deferDecRef.
    47  //
    48  // Lock ordering:
    49  //
    50  //	kernfs.Filesystem.mu
    51  //		kernel.TaskSet.mu
    52  //	  	kernel.Task.mu
    53  //		kernfs.Dentry.dirMu
    54  //	  	vfs.VirtualFilesystem.mountMu
    55  //	    	vfs.Dentry.mu
    56  //		(inode implementation locks, if any)
    57  //
    58  // kernfs.Filesystem.deferredDecRefsMu
    59  package kernfs
    60  
    61  import (
    62  	"fmt"
    63  	"sync/atomic"
    64  
    65  	"github.com/metacubex/gvisor/pkg/abi/linux"
    66  	"github.com/metacubex/gvisor/pkg/atomicbitops"
    67  	"github.com/metacubex/gvisor/pkg/context"
    68  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    69  	"github.com/metacubex/gvisor/pkg/fspath"
    70  	"github.com/metacubex/gvisor/pkg/refs"
    71  	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
    72  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    73  	"github.com/metacubex/gvisor/pkg/sync"
    74  )
    75  
// Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory
// filesystem. Concrete implementations are expected to embed this in their own
// Filesystem type.
//
// +stateify savable
type Filesystem struct {
	// vfsfs is the generic VFS filesystem object. A pointer to it is handed
	// out by VFSFilesystem.
	vfsfs vfs.Filesystem

	// deferredDecRefsMu protects deferredDecRefs.
	deferredDecRefsMu deferredDecRefsMutex `state:"nosave"`

	// deferredDecRefs is a list of reference-counted objects waiting to be
	// DecRef()ed: dentries, plus the file descriptions and virtual dentries
	// queued by SafeDecRefFD and SafeDecRef. This is used to defer dentry
	// destruction until mu can be acquired for writing. Protected by
	// deferredDecRefsMu.
	deferredDecRefs []refs.RefCounter

	// mu synchronizes the lifetime of Dentries on this filesystem. Holding it
	// for reading guarantees continued existence of any resolved dentries, but
	// the dentry tree may be modified.
	//
	// Kernfs dentries can only be DecRef()ed while holding mu for writing. For
	// example:
	//
	//   fs.mu.Lock()
	//   defer fs.mu.Unlock()
	//   ...
	//   dentry1.DecRef()
	//   defer dentry2.DecRef() // Ok, will run before Unlock.
	//
	// If discarding dentries in a read context, use Filesystem.deferDecRef. For
	// example:
	//
	//   fs.mu.RLock()
	//   defer fs.processDeferredDecRefs(ctx) // Runs last, after RUnlock.
	//   defer fs.mu.RUnlock()
	//   ...
	//   fs.deferDecRef(dentry)
	mu filesystemRWMutex `state:"nosave"`

	// nextInoMinusOne is used to allocate inode numbers on this filesystem:
	// NextIno atomically increments it and returns the new value. Must be
	// accessed by atomic operations.
	nextInoMinusOne atomicbitops.Uint64

	// cachedDentries contains all dentries with 0 references. (Due to race
	// conditions, it may also contain dentries with non-zero references.)
	// cachedDentriesLen is the number of dentries in cachedDentries. These
	// fields are protected by mu.
	cachedDentries    dentryList
	cachedDentriesLen uint64

	// MaxCachedDentries is the maximum size of cachedDentries. If not set,
	// defaults to 0 and kernfs does not cache any dentries. This is immutable.
	MaxCachedDentries uint64

	// root is the root dentry of this filesystem. Note that root may be nil for
	// filesystems on a disconnected mount without a root (e.g. pipefs, sockfs,
	// hostfs). Filesystem holds an extra reference on root to prevent it from
	// being destroyed prematurely (taken in Dentry.InitRoot). This is
	// immutable.
	root *Dentry
}
   135  
// deferDecRef defers dropping a dentry ref until the next call to
// processDeferredDecRefs{,Locked}. See comment on Filesystem.mu.
// This may be called while Filesystem.mu or Dentry.dirMu is locked.
//
// d is only appended to fs.deferredDecRefs here; its DecRef method is not
// invoked until the list is drained.
func (fs *Filesystem) deferDecRef(d refs.RefCounter) {
	// deferredDecRefsMu is held only around the append, never across a DecRef.
	fs.deferredDecRefsMu.Lock()
	fs.deferredDecRefs = append(fs.deferredDecRefs, d)
	fs.deferredDecRefsMu.Unlock()
}
   144  
   145  // SafeDecRefFD safely DecRef the FileDescription making sure DecRef is deferred
   146  // in case Filesystem.mu is held. See comment on Filesystem.mu.
   147  func (fs *Filesystem) SafeDecRefFD(ctx context.Context, fd *vfs.FileDescription) {
   148  	if d, ok := fd.Dentry().Impl().(*Dentry); ok && d.fs == fs {
   149  		// Only defer if dentry belongs to this filesystem, since locks cannot cross
   150  		// filesystems.
   151  		fs.deferDecRef(fd)
   152  		return
   153  	}
   154  	fd.DecRef(ctx)
   155  }
   156  
   157  // SafeDecRef safely DecRef the virtual dentry making sure DecRef is deferred
   158  // in case Filesystem.mu is held. See comment on Filesystem.mu.
   159  func (fs *Filesystem) SafeDecRef(ctx context.Context, vd vfs.VirtualDentry) {
   160  	if d, ok := vd.Dentry().Impl().(*Dentry); ok && d.fs == fs {
   161  		// Only defer if dentry belongs to this filesystem, since locks cannot cross
   162  		// filesystems.
   163  		fs.deferDecRef(&vd)
   164  		return
   165  	}
   166  	vd.DecRef(ctx)
   167  }
   168  
   169  // processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the
   170  // deferredDecRefs list. See comment on Filesystem.mu.
   171  //
   172  // Precondition: Filesystem.mu or Dentry.dirMu must NOT be locked.
   173  func (fs *Filesystem) processDeferredDecRefs(ctx context.Context) {
   174  	fs.deferredDecRefsMu.Lock()
   175  	for _, d := range fs.deferredDecRefs {
   176  		// Defer the DecRef call so that we are not holding deferredDecRefsMu
   177  		// when DecRef is called.
   178  		defer d.DecRef(ctx)
   179  	}
   180  	fs.deferredDecRefs = fs.deferredDecRefs[:0] // Keep slice memory for reuse.
   181  	fs.deferredDecRefsMu.Unlock()
   182  }
   183  
// VFSFilesystem returns the generic vfs filesystem object. The returned
// pointer refers to the vfsfs field embedded in fs, not to a copy.
func (fs *Filesystem) VFSFilesystem() *vfs.Filesystem {
	return &fs.vfsfs
}
   188  
// NextIno allocates a new inode number on this filesystem by atomically
// incrementing nextInoMinusOne and returning the incremented value.
func (fs *Filesystem) NextIno() uint64 {
	return fs.nextInoMinusOne.Add(1)
}
   193  
   194  // These consts are used in the Dentry.flags field.
   195  const (
   196  	// Dentry points to a directory inode.
   197  	dflagsIsDir = 1 << iota
   198  
   199  	// Dentry points to a symlink inode.
   200  	dflagsIsSymlink
   201  )
   202  
// Dentry implements vfs.DentryImpl.
//
// A kernfs dentry is similar to a dentry in a traditional filesystem: it's a
// named reference to an inode. A dentry generally lives as long as it's part of
// a mounted filesystem tree. Kernfs drops dentries once all references to them
// are dropped. Dentries hold a single reference to the inode they point
// to, and child dentries hold a reference on their parent.
//
// Must be initialized by Init prior to first use.
//
// +stateify savable
type Dentry struct {
	// vfsd is the generic VFS dentry backed by this Dentry. It is initialized
	// in Init and exposed through VFSDentry.
	vfsd vfs.Dentry

	// refs is the reference count. When refs reaches 0, the dentry may be
	// added to the cache or destroyed. If refs == -1, the dentry has already
	// been destroyed. refs are allowed to go to 0 and increase again. refs is
	// accessed using atomic memory operations.
	refs atomicbitops.Int64

	// fs is the owning filesystem. fs is immutable.
	fs *Filesystem

	// flags caches useful information about the dentry from the inode. See the
	// dflags* consts above.
	flags atomicbitops.Uint32

	// parent is the dentry of the directory containing this dentry. It is set
	// by insertChildLocked and is nil for dentries that were never inserted
	// under a parent (for example magic links as described in Inode.Getlink).
	parent atomic.Pointer[Dentry] `state:".(*Dentry)"`

	// name is the name this dentry was inserted under; it is set by
	// insertChildLocked together with parent. Per the comment on dirMu, it is
	// protected by the parent's dirMu.
	name string

	// If cached is true, dentryEntry links dentry into
	// Filesystem.cachedDentries. cached and dentryEntry are protected by
	// Filesystem.mu.
	cached bool
	dentryEntry

	// dirMu protects children and the names of child Dentries.
	//
	// Note that holding fs.mu for writing is not sufficient;
	// revalidateChildLocked(), which is a very hot path, may modify children with
	// fs.mu acquired for reading only.
	dirMu    sync.Mutex `state:"nosave"`
	children map[string]*Dentry

	// inode is the inode this dentry points to. It is set in Init, which takes
	// over the caller's reference; that reference is dropped in destroy.
	inode Inode

	// If deleted is non-zero, the file represented by this dentry has been
	// deleted. deleted is accessed using atomic memory operations.
	deleted atomicbitops.Uint32
}
   254  
   255  // IncRef implements vfs.DentryImpl.IncRef.
   256  func (d *Dentry) IncRef() {
   257  	// d.refs may be 0 if d.fs.mu is locked, which serializes against
   258  	// d.cacheLocked().
   259  	r := d.refs.Add(1)
   260  	if d.LogRefs() {
   261  		refs.LogIncRef(d, r)
   262  	}
   263  }
   264  
   265  // TryIncRef implements vfs.DentryImpl.TryIncRef.
   266  func (d *Dentry) TryIncRef() bool {
   267  	for {
   268  		r := d.refs.Load()
   269  		if r <= 0 {
   270  			return false
   271  		}
   272  		if d.refs.CompareAndSwap(r, r+1) {
   273  			if d.LogRefs() {
   274  				refs.LogTryIncRef(d, r+1)
   275  			}
   276  			return true
   277  		}
   278  	}
   279  }
   280  
   281  // DecRef implements vfs.DentryImpl.DecRef.
   282  func (d *Dentry) DecRef(ctx context.Context) {
   283  	r := d.refs.Add(-1)
   284  	if d.LogRefs() {
   285  		refs.LogDecRef(d, r)
   286  	}
   287  	if r == 0 {
   288  		if d.inode.Anonymous() {
   289  			// Nothing to cache. Skip right to destroy. This avoids
   290  			// taking fs.mu in the DecRef() path for anonymous
   291  			// inodes.
   292  			d.destroy(ctx)
   293  			return
   294  		}
   295  
   296  		d.fs.mu.Lock()
   297  		defer d.fs.mu.Unlock()
   298  		d.cacheLocked(ctx)
   299  	} else if r < 0 {
   300  		panic("kernfs.Dentry.DecRef() called without holding a reference")
   301  	}
   302  }
   303  
   304  func (d *Dentry) decRefLocked(ctx context.Context) {
   305  	r := d.refs.Add(-1)
   306  	if d.LogRefs() {
   307  		refs.LogDecRef(d, r)
   308  	}
   309  	if r == 0 {
   310  		d.cacheLocked(ctx)
   311  	} else if r < 0 {
   312  		panic("kernfs.Dentry.DecRef() called without holding a reference")
   313  	}
   314  }
   315  
   316  // cacheLocked should be called after d's reference count becomes 0. The ref
   317  // count check may happen before acquiring d.fs.mu so there might be a race
   318  // condition where the ref count is increased again by the time the caller
   319  // acquires d.fs.mu. This race is handled.
   320  // Only reachable dentries are added to the cache. However, a dentry might
   321  // become unreachable *while* it is in the cache due to invalidation.
   322  //
   323  // Preconditions: d.fs.mu must be locked for writing.
   324  func (d *Dentry) cacheLocked(ctx context.Context) {
   325  	// Dentries with a non-zero reference count must be retained. (The only way
   326  	// to obtain a reference on a dentry with zero references is via path
   327  	// resolution, which requires d.fs.mu, so if d.refs is zero then it will
   328  	// remain zero while we hold d.fs.mu for writing.)
   329  	refs := d.refs.Load()
   330  	if refs == -1 {
   331  		// Dentry has already been destroyed.
   332  		return
   333  	}
   334  	if refs > 0 {
   335  		if d.cached {
   336  			d.fs.cachedDentries.Remove(d)
   337  			d.fs.cachedDentriesLen--
   338  			d.cached = false
   339  		}
   340  		return
   341  	}
   342  	// If the dentry is deleted and invalidated or has no parent, then it is no
   343  	// longer reachable by path resolution and should be dropped immediately
   344  	// because it has zero references.
   345  	// Note that a dentry may not always have a parent; for example magic links
   346  	// as described in Inode.Getlink.
   347  	if isDead, parent := d.VFSDentry().IsDead(), d.parent.Load(); isDead || parent == nil {
   348  		if !isDead {
   349  			rcs := d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, d.VFSDentry())
   350  			for _, rc := range rcs {
   351  				d.fs.deferDecRef(rc)
   352  			}
   353  		}
   354  		if d.cached {
   355  			d.fs.cachedDentries.Remove(d)
   356  			d.fs.cachedDentriesLen--
   357  			d.cached = false
   358  		}
   359  		if d.isDeleted() {
   360  			d.inode.Watches().HandleDeletion(ctx)
   361  		}
   362  		d.destroy(ctx)
   363  		if parent != nil {
   364  			parent.decRefLocked(ctx)
   365  		}
   366  		return
   367  	}
   368  	if d.VFSDentry().IsEvictable() {
   369  		d.evictLocked(ctx)
   370  		return
   371  	}
   372  	// If d is already cached, just move it to the front of the LRU.
   373  	if d.cached {
   374  		d.fs.cachedDentries.Remove(d)
   375  		d.fs.cachedDentries.PushFront(d)
   376  		return
   377  	}
   378  	// Cache the dentry, then evict the least recently used cached dentry if
   379  	// the cache becomes over-full.
   380  	d.fs.cachedDentries.PushFront(d)
   381  	d.fs.cachedDentriesLen++
   382  	d.cached = true
   383  	if d.fs.cachedDentriesLen <= d.fs.MaxCachedDentries {
   384  		return
   385  	}
   386  	d.fs.evictCachedDentryLocked(ctx)
   387  	// Whether or not victim was destroyed, we brought fs.cachedDentriesLen
   388  	// back down to fs.opts.maxCachedDentries, so we don't loop.
   389  }
   390  
   391  // Preconditions:
   392  //   - fs.mu must be locked for writing.
   393  func (fs *Filesystem) evictCachedDentryLocked(ctx context.Context) {
   394  	// Evict the least recently used dentry because cache size is greater than
   395  	// max cache size (configured on mount).
   396  	fs.cachedDentries.Back().evictLocked(ctx)
   397  }
   398  
   399  // Preconditions:
   400  //   - d.fs.mu must be locked for writing.
   401  func (d *Dentry) evictLocked(ctx context.Context) {
   402  	if d == nil {
   403  		return
   404  	}
   405  	if d.cached {
   406  		d.fs.cachedDentries.Remove(d)
   407  		d.fs.cachedDentriesLen--
   408  		d.cached = false
   409  	}
   410  	// victim.refs may have become non-zero from an earlier path resolution
   411  	// after it was inserted into fs.cachedDentries.
   412  	if d.refs.Load() == 0 {
   413  		if !d.vfsd.IsDead() {
   414  			parent := d.parent.Load()
   415  			parent.dirMu.Lock()
   416  			// Note that victim can't be a mount point (in any mount
   417  			// namespace), since VFS holds references on mount points.
   418  			rcs := d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, d.VFSDentry())
   419  			for _, rc := range rcs {
   420  				d.fs.deferDecRef(rc)
   421  			}
   422  			delete(parent.children, d.name)
   423  			parent.dirMu.Unlock()
   424  		}
   425  		d.destroy(ctx)
   426  		if parent := d.parent.Load(); parent != nil {
   427  			parent.decRefLocked(ctx)
   428  		}
   429  	}
   430  }
   431  
   432  // destroy destroys the dentry.
   433  //
   434  // Preconditions:
   435  //   - d.refs == 0.
   436  //   - d should have been removed from d.parent.children, i.e. d is not reachable
   437  //     by path traversal.
   438  //   - d.vfsd.IsDead() is true.
   439  func (d *Dentry) destroy(ctx context.Context) {
   440  	switch refs := d.refs.Load(); refs {
   441  	case 0:
   442  		// Mark the dentry destroyed.
   443  		d.refs.Store(-1)
   444  	case -1:
   445  		panic("dentry.destroy() called on already destroyed dentry")
   446  	default:
   447  		panic("dentry.destroy() called with references on the dentry")
   448  	}
   449  
   450  	d.inode.DecRef(ctx) // IncRef from Init.
   451  
   452  	refs.Unregister(d)
   453  }
   454  
// RefType implements refs.CheckedObject.Type. It returns the fixed type name
// "kernfs.Dentry".
func (d *Dentry) RefType() string {
	return "kernfs.Dentry"
}
   459  
   460  // LeakMessage implements refs.CheckedObject.LeakMessage.
   461  func (d *Dentry) LeakMessage() string {
   462  	return fmt.Sprintf("[kernfs.Dentry %p] reference count of %d instead of -1", d, d.refs.Load())
   463  }
   464  
// LogRefs implements refs.CheckedObject.LogRefs.
//
// This should only be set to true for debugging purposes, as it can generate an
// extremely large amount of output and drastically degrade performance.
//
// It gates the refs.Log* calls made by IncRef, TryIncRef, DecRef and
// decRefLocked, and currently always returns false.
func (d *Dentry) LogRefs() bool {
	return false
}
   472  
   473  // InitRoot initializes this dentry as the root of the filesystem.
   474  //
   475  // Precondition: Caller must hold a reference on inode.
   476  //
   477  // Postcondition: Caller's reference on inode is transferred to the dentry.
   478  func (d *Dentry) InitRoot(fs *Filesystem, inode Inode) {
   479  	d.Init(fs, inode)
   480  	fs.root = d
   481  	// Hold an extra reference on the root dentry. It is held by fs to prevent the
   482  	// root from being "cached" and subsequently evicted.
   483  	d.IncRef()
   484  }
   485  
   486  // Init initializes this dentry.
   487  //
   488  // Precondition: Caller must hold a reference on inode.
   489  //
   490  // Postcondition: Caller's reference on inode is transferred to the dentry.
   491  func (d *Dentry) Init(fs *Filesystem, inode Inode) {
   492  	d.vfsd.Init(d)
   493  	d.fs = fs
   494  	d.inode = inode
   495  	d.refs.Store(1)
   496  	ftype := inode.Mode().FileType()
   497  	if ftype == linux.ModeDirectory {
   498  		d.flags = atomicbitops.FromUint32(d.flags.RacyLoad() | dflagsIsDir)
   499  	}
   500  	if ftype == linux.ModeSymlink {
   501  		d.flags = atomicbitops.FromUint32(d.flags.RacyLoad() | dflagsIsSymlink)
   502  	}
   503  	refs.Register(d)
   504  }
   505  
// VFSDentry returns the generic vfs dentry for this kernfs dentry. The
// returned pointer refers to the vfsd field embedded in d, not to a copy.
func (d *Dentry) VFSDentry() *vfs.Dentry {
	return &d.vfsd
}
   510  
// isDeleted reports whether the file represented by this dentry has been
// marked deleted, i.e. whether the atomically-loaded deleted flag is non-zero.
func (d *Dentry) isDeleted() bool {
	return d.deleted.Load() != 0
}
   514  
// setDeleted marks the file represented by this dentry as deleted by
// atomically storing 1 into the deleted flag.
func (d *Dentry) setDeleted() {
	d.deleted.Store(1)
}
   518  
// isDir checks whether the dentry points to a directory inode, using the
// dflagsIsDir bit that Init cached in d.flags.
func (d *Dentry) isDir() bool {
	return d.flags.Load()&dflagsIsDir != 0
}
   523  
// isSymlink checks whether the dentry points to a symlink inode, using the
// dflagsIsSymlink bit that Init cached in d.flags.
func (d *Dentry) isSymlink() bool {
	return d.flags.Load()&dflagsIsSymlink != 0
}
   528  
   529  // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
   530  func (d *Dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {
   531  	if d.isDir() {
   532  		events |= linux.IN_ISDIR
   533  	}
   534  
   535  	// Linux always notifies the parent first.
   536  
   537  	// Don't bother looking for a parent if the inode is anonymous. It
   538  	// won't have one.
   539  	if !d.inode.Anonymous() {
   540  		d.fs.mu.RLock()
   541  		if parent := d.parent.Load(); parent != nil {
   542  			parent.inode.Watches().Notify(ctx, d.name, events, cookie, et, d.isDeleted())
   543  		}
   544  		d.fs.mu.RUnlock()
   545  	}
   546  
   547  	d.inode.Watches().Notify(ctx, "", events, cookie, et, d.isDeleted())
   548  }
   549  
// Watches implements vfs.DentryImpl.Watches. The watch set belongs to the
// inode, so this simply forwards to d.inode.Watches().
func (d *Dentry) Watches() *vfs.Watches {
	return d.inode.Watches()
}
   554  
// OnZeroWatches implements vfs.Dentry.OnZeroWatches. It is intentionally a
// no-op for kernfs dentries.
func (d *Dentry) OnZeroWatches(context.Context) {}
   557  
// insertChild inserts child into the vfs dentry cache with the given name under
// this dentry. This does not update the directory inode, so calling this on its
// own isn't sufficient to insert a child into a directory.
//
// It acquires d.dirMu around a call to insertChildLocked, so the caller must
// not already hold d.dirMu.
//
// Preconditions:
//   - d must represent a directory inode.
//   - d.fs.mu must be locked for at least reading.
func (d *Dentry) insertChild(name string, child *Dentry) {
	d.dirMu.Lock()
	d.insertChildLocked(name, child)
	d.dirMu.Unlock()
}
   570  
   571  // insertChildLocked is equivalent to insertChild, with additional
   572  // preconditions.
   573  //
   574  // Preconditions:
   575  //   - d must represent a directory inode.
   576  //   - d.dirMu must be locked.
   577  //   - d.fs.mu must be locked for at least reading.
   578  func (d *Dentry) insertChildLocked(name string, child *Dentry) {
   579  	if !d.isDir() {
   580  		panic(fmt.Sprintf("insertChildLocked called on non-directory Dentry: %+v.", d))
   581  	}
   582  	d.IncRef() // DecRef in child's Dentry.destroy.
   583  	child.parent.Store(d)
   584  	child.name = name
   585  	if d.children == nil {
   586  		d.children = make(map[string]*Dentry)
   587  	}
   588  	d.children[name] = child
   589  }
   590  
// Inode returns the dentry's inode. No additional reference is taken on the
// inode for the caller.
func (d *Dentry) Inode() Inode {
	return d.inode
}
   595  
// FSLocalPath returns an absolute path to d, relative to the root of its
// filesystem. The path is assembled back-to-front: genericPrependPath
// prepends the components leading to d, then the leading '/' is prepended.
func (d *Dentry) FSLocalPath() string {
	var b fspath.Builder
	// The error from genericPrependPath is deliberately discarded.
	// NOTE(review): presumably it only signals where the walk stopped when
	// called with an empty vfsroot and nil mount — confirm against
	// genericPrependPath.
	_ = genericPrependPath(vfs.VirtualDentry{}, nil, d, &b)
	b.PrependByte('/')
	return b.String()
}
   604  
   605  // WalkDentryTree traverses p in the dentry tree for this filesystem. Note that
   606  // this only traverses the dentry tree and is not a general path traversal. No
   607  // symlinks and dynamic children are resolved, and no permission checks are
   608  // performed. The caller is responsible for ensuring the returned Dentry exists
   609  // for an appropriate lifetime.
   610  //
   611  // p is interpreted starting at d, and may be absolute or relative (absolute vs
   612  // relative paths both refer to the same target here, since p is absolute from
   613  // d). p may contain "." and "..", but will not allow traversal above d (similar
   614  // to ".." at the root dentry).
   615  //
   616  // This is useful for filesystem internals, where the filesystem may not be
   617  // mounted yet. For a mounted filesystem, use GetDentryAt.
   618  func (d *Dentry) WalkDentryTree(ctx context.Context, vfsObj *vfs.VirtualFilesystem, p fspath.Path) (*Dentry, error) {
   619  	d.fs.mu.RLock()
   620  	defer d.fs.processDeferredDecRefs(ctx)
   621  	defer d.fs.mu.RUnlock()
   622  
   623  	target := d
   624  
   625  	for pit := p.Begin; pit.Ok(); pit = pit.Next() {
   626  		pc := pit.String()
   627  
   628  		switch {
   629  		case target == nil:
   630  			return nil, linuxerr.ENOENT
   631  		case pc == ".":
   632  			// No-op, consume component and continue.
   633  		case pc == "..":
   634  			if target == d {
   635  				// Don't let .. traverse above the start point of the walk.
   636  				continue
   637  			}
   638  			target = target.parent.Load()
   639  			// Parent doesn't need revalidation since we revalidated it on the
   640  			// way to the child, and we're still holding fs.mu.
   641  		default:
   642  			var err error
   643  
   644  			d.dirMu.Lock()
   645  			target, err = d.fs.revalidateChildLocked(ctx, vfsObj, target, pc, target.children[pc])
   646  			d.dirMu.Unlock()
   647  
   648  			if err != nil {
   649  				return nil, err
   650  			}
   651  		}
   652  	}
   653  
   654  	if target == nil {
   655  		return nil, linuxerr.ENOENT
   656  	}
   657  
   658  	target.IncRef()
   659  	return target, nil
   660  }
   661  
// Parent returns the parent of this Dentry, or nil if it has none. This is
// not safe in general, the filesystem may concurrently move d elsewhere. The
// caller is responsible for ensuring the returned result remains valid while
// it is used. No reference is taken on the returned Dentry.
func (d *Dentry) Parent() *Dentry {
	return d.parent.Load()
}
   668  
// The Inode interface maps filesystem-level operations that operate on paths to
// equivalent operations on specific filesystem nodes.
//
// The interface methods are grouped into logical categories as sub interfaces
// below. Generally, an implementation for each sub interface can be provided by
// embedding an appropriate type from inode_impl_util.go. The sub interfaces
// are purely organizational. Methods declared directly in the main interface
// have no generic implementations, and should be explicitly provided by the
// client filesystem.
//
// Generally, implementations are not responsible for tasks that are common to
// all filesystems. These include:
//
//   - Checking that dentries passed to methods are of the appropriate file type.
//   - Checking permissions.
//
// Inode functions may be called holding filesystem wide locks and are not
// allowed to call vfs functions that may reenter, unless otherwise noted.
//
// Specific responsibilities of implementations are documented below.
type Inode interface {
	// Methods related to reference counting. A generic implementation is
	// provided by InodeNoopRefCount. These methods are generally called by the
	// equivalent Dentry methods.
	inodeRefs

	// Methods related to node metadata. A generic implementation is provided by
	// InodeAttrs. Note that a concrete filesystem using kernfs is responsible for
	// managing link counts.
	inodeMetadata

	// Method for inodes that represent symlink. InodeNotSymlink provides a
	// blanket implementation for all non-symlink inodes.
	inodeSymlink

	// Method for inodes that represent directories. InodeNotDirectory provides
	// a blanket implementation for all non-directory inodes.
	inodeDirectory

	// Open creates a file description for the filesystem object represented by
	// this inode. The returned file description should hold a reference on the
	// dentry for its lifetime.
	//
	// Precondition: rp.Done(). d must be the kernfs Dentry containing the
	// inode on which Open() is being called.
	Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)

	// StatFS returns filesystem statistics for the client filesystem. This
	// corresponds to vfs.FilesystemImpl.StatFSAt. If the client filesystem
	// doesn't support statfs(2), this should return ENOSYS.
	StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error)

	// Keep indicates whether the dentry created after Inode.Lookup should be
	// kept in the kernfs dentry tree.
	Keep() bool

	// Valid should return true if this inode is still valid, or needs to
	// be resolved again by a call to Lookup.
	Valid(ctx context.Context) bool

	// Watches returns the set of inotify watches associated with this inode.
	Watches() *vfs.Watches

	// Anonymous indicates that the Inode is anonymous. It will never have
	// a name or parent. Dentry.DecRef destroys dentries of anonymous inodes
	// directly instead of caching them.
	Anonymous() bool
}
   736  
// inodeRefs is the reference-counting portion of the Inode interface.
type inodeRefs interface {
	// IncRef takes a reference on the inode.
	IncRef()

	// DecRef drops a reference on the inode. Dentry.destroy uses it to
	// release the reference that was transferred to the dentry in Init.
	DecRef(ctx context.Context)

	// TryIncRef attempts to take a reference and reports whether it did.
	// NOTE(review): presumably it fails once the count has reached zero, as
	// Dentry.TryIncRef does — confirm against implementations.
	TryIncRef() bool
}
   742  
// inodeMetadata is the metadata portion of the Inode interface: permission
// checks and stat attributes.
type inodeMetadata interface {
	// CheckPermissions checks that creds may access this inode for the
	// requested access type, per the rules of
	// fs/namei.c:generic_permission().
	CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error

	// Mode returns the (struct stat)::st_mode value for this inode. This is
	// separated from Stat for performance.
	Mode() linux.FileMode

	// UID returns the (struct stat)::st_uid value for this inode. This is
	// separated from Stat for performance.
	UID() auth.KUID

	// GID returns the (struct stat)::st_gid value for this inode. This is
	// separated from Stat for performance.
	GID() auth.KGID

	// Stat returns the metadata for this inode. This corresponds to
	// vfs.FilesystemImpl.StatAt.
	Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error)

	// SetStat updates the metadata for this inode. This corresponds to
	// vfs.FilesystemImpl.SetStatAt. Implementations are responsible for checking
	// if the operation can be performed (see vfs.CheckSetStat() for common
	// checks).
	SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error
}
   771  
   772  // inodeDirectory groups the directory-specific methods of an inode: child
        // creation, removal, rename, lookup and directory iteration.
        //
        // Precondition: All methods in this interface may only be called on directory
   773  // inodes.
   774  type inodeDirectory interface {
   775  	// The New{File,Dir,Node,Link,Symlink} methods below should return a new inode
   776  	// that will be hashed into the dentry tree.
   777  	//
   778  	// These inode constructors are inode-level operations rather than
   779  	// filesystem-level operations to allow client filesystems to mix different
   780  	// implementations based on the new node's location in the
   781  	// filesystem.
   782  
   783  	// HasChildren returns true if the directory inode has any children.
   784  	HasChildren() bool
   785  
   786  	// NewFile creates a new regular file inode.
   787  	NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error)
   788  
   789  	// NewDir creates a new directory inode.
   790  	NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error)
   791  
   792  	// NewLink creates a new hardlink to a specified inode in this
   793  	// directory. Implementations should create a new kernfs Dentry pointing to
   794  	// target, and update target's link count.
   795  	NewLink(ctx context.Context, name string, target Inode) (Inode, error)
   796  
   797  	// NewSymlink creates a new symbolic link inode.
   798  	NewSymlink(ctx context.Context, name, target string) (Inode, error)
   799  
   800  	// NewNode creates a new filesystem node for a mknod syscall.
   801  	NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error)
   802  
   803  	// Unlink removes a child dentry from this directory inode.
   804  	Unlink(ctx context.Context, name string, child Inode) error
   805  
   806  	// RmDir removes an empty child directory from this directory
   807  	// inode. Implementations must update the parent directory's link count,
   808  	// if required. Implementations are not responsible for checking that child
   809  	// is a directory, or checking for an empty directory.
   810  	RmDir(ctx context.Context, name string, child Inode) error
   811  
   812  	// Rename is called on the source directory containing an inode being
   813  	// renamed. child points to the resolved child in the source directory.
   814  	// dstDir is guaranteed to be a directory inode.
        	//
        	// NOTE(review): oldname is presumably child's current name in this
        	// directory and newname its intended name in dstDir -- confirm against
        	// callers.
   815  	//
   816  	// On a successful call to Rename, the caller updates the dentry tree to
   817  	// reflect the name change.
   818  	//
   819  	// Precondition: Caller must serialize concurrent calls to Rename.
   820  	Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error
   821  
   822  	// Lookup should return an appropriate inode if name should resolve to a
   823  	// child of this directory inode. This gives the directory an opportunity
   824  	// on every lookup to resolve additional entries. This is only called when
   825  	// the inode is a directory.
   826  	//
   827  	// The child returned by Lookup will be hashed into the VFS dentry tree,
   828  	// at least for the duration of the current FS operation.
   829  	//
   830  	// Lookup must return the child with an extra reference whose ownership is
   831  	// transferred to the dentry that is created to point to that inode. If
   832  	// Inode.Keep returns false, that new dentry will be dropped at the end of
   833  	// the current filesystem operation (before returning back to the VFS
   834  	// layer) if no other ref is picked on that dentry. If Inode.Keep returns
   835  	// true, then the dentry will be cached into the dentry tree until it is
   836  	// Unlink'd or RmDir'd.
   837  	Lookup(ctx context.Context, name string) (Inode, error)
   838  
   839  	// IterDirents is used to iterate over dynamically created entries. It invokes
   840  	// callback on each entry in the directory represented by the Inode.
   841  	// 'offset' is the offset for the entire IterDirents call, which may include
   842  	// results from the caller (e.g. "." and ".."). 'relOffset' is the offset
   843  	// inside the entries returned by this IterDirents invocation. In other words,
   844  	// 'offset' should be used to calculate each vfs.Dirent.NextOff as well as
   845  	// the return value, while 'relOffset' is the place to start iteration.
   846  	IterDirents(ctx context.Context, mnt *vfs.Mount, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error)
   847  }
   848  
   849  type inodeSymlink interface {
        	// inodeSymlink groups the symlink-related methods of an inode. Both
        	// methods are also declared for inodes that are not symlinks, in which
        	// case they are expected to fail with EINVAL as described below.

   850  	// Readlink returns the target of a symbolic link. If an inode is not a
   851  	// symlink, the implementation should return EINVAL.
   852  	//
   853  	// Readlink is called with no kernfs locks held, so it may reenter if needed
   854  	// to resolve symlink targets.
   855  	Readlink(ctx context.Context, mnt *vfs.Mount) (string, error)
   856  
   857  	// Getlink returns the target of a symbolic link, as used by path
   858  	// resolution:
   859  	//
   860  	//	- If the inode is a "magic link" (a link whose target is most accurately
   861  	//		represented as a VirtualDentry), Getlink returns (ok VirtualDentry, "",
   862  	//		nil). A reference is taken on the returned VirtualDentry.
   863  	//
   864  	//	- If the inode is an ordinary symlink, Getlink returns (zero-value
   865  	//		VirtualDentry, symlink target, nil).
   866  	//
   867  	//	- If the inode is not a symlink, Getlink returns (zero-value
   868  	//		VirtualDentry, "", EINVAL).
   869  	Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error)
   870  }