github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fsimpl/kernfs/kernfs.go

github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/pkg/sentry/fsimpl/kernfs/kernfs.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  // Package kernfs provides the tools to implement inode-based filesystems.
    16  // Kernfs has two main features:
    17  //
    18  // 1. The Inode interface, which maps VFS2's path-based filesystem operations to
    19  //    specific filesystem nodes. Kernfs uses the Inode interface to provide a
    20  //    blanket implementation for the vfs.FilesystemImpl. Kernfs also serves as
    21  //    the synchronization mechanism for all filesystem operations by holding a
    22  //    filesystem-wide lock across all operations.
    23  //
    24  // 2. Various utility types which provide generic implementations for various
    25  //    parts of the Inode and vfs.FileDescription interfaces. Client filesystems
    26  //    based on kernfs can embed the appropriate set of these to avoid having to
    27  //    reimplement common filesystem operations. See inode_impl_util.go and
    28  //    fd_impl_util.go.
    29  //
    30  // Reference Model:
    31  //
    32  // Kernfs dentries represent named pointers to inodes. Kernfs is solely
    33  // responsible for maintaining and modifying its dentry tree; inode
    34  // implementations can not access the tree. Dentries and inodes have
    35  // independent lifetimes and reference counts. A child dentry unconditionally
    36  // holds a reference on its parent directory's dentry. A dentry also holds a
    37  // reference on the inode it points to (although that might not be the only
    38  // reference on the inode). Due to this inodes can outlive the dentries that
    39  // point to them. Multiple dentries can point to the same inode (for example,
    40  // in the case of hardlinks). File descriptors hold a reference to the dentry
    41  // they're opened on.
    42  //
    43  // Dentries are guaranteed to exist while holding Filesystem.mu for
    44  // reading. Dropping dentries requires holding Filesystem.mu for writing. To
    45  // queue dentries for destruction from a read critical section, see
    46  // Filesystem.deferDecRef.
    47  //
    48  // Lock ordering:
    49  //
    50  // kernfs.Filesystem.mu
    51  //   kernfs.Dentry.dirMu
    52  //     vfs.VirtualFilesystem.mountMu
    53  //       vfs.Dentry.mu
    54  //   (inode implementation locks, if any)
    55  // kernfs.Filesystem.droppedDentriesMu
    56  package kernfs
    57  
    58  import (
    59  	"fmt"
    60  	"sync/atomic"
    61  
    62  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    63  	"github.com/SagerNet/gvisor/pkg/context"
    64  	"github.com/SagerNet/gvisor/pkg/fspath"
    65  	"github.com/SagerNet/gvisor/pkg/refsvfs2"
    66  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    67  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    68  	"github.com/SagerNet/gvisor/pkg/sync"
    69  )
    70  
// Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory
// filesystem. Concrete implementations are expected to embed this in their own
// Filesystem type.
//
// +stateify savable
type Filesystem struct {
	// vfsfs is the generic VFS filesystem object; it is exposed to callers
	// through VFSFilesystem.
	vfsfs vfs.Filesystem

	// droppedDentriesMu protects droppedDentries.
	droppedDentriesMu sync.Mutex `state:"nosave"`

	// droppedDentries is a list of dentries waiting to be DecRef()ed. This is
	// used to defer dentry destruction until mu can be acquired for
	// writing. Protected by droppedDentriesMu.
	droppedDentries []*Dentry

	// mu synchronizes the lifetime of Dentries on this filesystem. Holding it
	// for reading guarantees continued existence of any resolved dentries, but
	// the dentry tree may be modified.
	//
	// Kernfs dentries can only be DecRef()ed while holding mu for writing. For
	// example:
	//
	//   fs.mu.Lock()
	//   defer fs.mu.Unlock()
	//   ...
	//   dentry1.DecRef()
	//   defer dentry2.DecRef() // Ok, will run before Unlock.
	//
	// If discarding dentries in a read context, use Filesystem.deferDecRef. For
	// example:
	//
	//   fs.mu.RLock()
	//   defer fs.processDeferredDecRefs()
	//   defer fs.mu.RUnlock()
	//   ...
	//   fs.deferDecRef(dentry)
	//
	// NOTE(review): Dentry.DecRef in this file acquires mu for writing itself
	// when the count reaches zero, while Dentry.decRefLocked expects the
	// caller to already hold it. The fs.mu.Lock() example above presumably
	// predates that split -- confirm which call is intended for callers that
	// already hold mu.
	mu sync.RWMutex `state:"nosave"`

	// nextInoMinusOne is used to allocate inode numbers on this filesystem.
	// Must be accessed by atomic operations.
	nextInoMinusOne uint64

	// cachedDentries contains all dentries with 0 references. (Due to race
	// conditions, it may also contain dentries with non-zero references.)
	// cachedDentriesLen is the number of dentries in cachedDentries. These
	// fields are protected by mu.
	cachedDentries    dentryList
	cachedDentriesLen uint64

	// MaxCachedDentries is the maximum size of cachedDentries. If not set,
	// defaults to 0 and kernfs does not cache any dentries. This is immutable.
	MaxCachedDentries uint64

	// root is the root dentry of this filesystem. Note that root may be nil for
	// filesystems on a disconnected mount without a root (e.g. pipefs, sockfs,
	// hostfs). Filesystem holds an extra reference on root to prevent it from
	// being destroyed prematurely. This is immutable.
	root *Dentry
}
   130  
   131  // deferDecRef defers dropping a dentry ref until the next call to
   132  // processDeferredDecRefs{,Locked}. See comment on Filesystem.mu.
   133  // This may be called while Filesystem.mu or Dentry.dirMu is locked.
   134  func (fs *Filesystem) deferDecRef(d *Dentry) {
   135  	fs.droppedDentriesMu.Lock()
   136  	fs.droppedDentries = append(fs.droppedDentries, d)
   137  	fs.droppedDentriesMu.Unlock()
   138  }
   139  
   140  // processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the
   141  // droppedDentries list. See comment on Filesystem.mu.
   142  //
   143  // Precondition: Filesystem.mu or Dentry.dirMu must NOT be locked.
   144  func (fs *Filesystem) processDeferredDecRefs(ctx context.Context) {
   145  	fs.droppedDentriesMu.Lock()
   146  	for _, d := range fs.droppedDentries {
   147  		// Defer the DecRef call so that we are not holding droppedDentriesMu
   148  		// when DecRef is called.
   149  		defer d.DecRef(ctx)
   150  	}
   151  	fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse.
   152  	fs.droppedDentriesMu.Unlock()
   153  }
   154  
   155  // VFSFilesystem returns the generic vfs filesystem object.
   156  func (fs *Filesystem) VFSFilesystem() *vfs.Filesystem {
   157  	return &fs.vfsfs
   158  }
   159  
   160  // NextIno allocates a new inode number on this filesystem.
   161  func (fs *Filesystem) NextIno() uint64 {
   162  	return atomic.AddUint64(&fs.nextInoMinusOne, 1)
   163  }
   164  
   165  // These consts are used in the Dentry.flags field.
   166  const (
   167  	// Dentry points to a directory inode.
   168  	dflagsIsDir = 1 << iota
   169  
   170  	// Dentry points to a symlink inode.
   171  	dflagsIsSymlink
   172  )
   173  
// Dentry implements vfs.DentryImpl.
//
// A kernfs dentry is similar to a dentry in a traditional filesystem: it's a
// named reference to an inode. A dentry generally lives as long as it's part of
// a mounted filesystem tree. Kernfs drops dentries once all references to them
// are dropped. Dentries hold a single reference to the inode they point
// to, and child dentries hold a reference on their parent.
//
// Must be initialized by Init prior to first use.
//
// +stateify savable
type Dentry struct {
	// vfsd is the generic VFS dentry for this kernfs dentry; it is initialized
	// by Init and exposed through VFSDentry.
	vfsd vfs.Dentry

	// refs is the reference count. When refs reaches 0, the dentry may be
	// added to the cache or destroyed. If refs == -1, the dentry has already
	// been destroyed. refs are allowed to go to 0 and increase again. refs is
	// accessed using atomic memory operations.
	refs int64

	// fs is the owning filesystem. fs is immutable.
	fs *Filesystem

	// flags caches useful information about the dentry from the inode. See the
	// dflags* consts above. Must be accessed by atomic ops.
	flags uint32

	// parent is the dentry of the directory this dentry was inserted into, and
	// name is the name it was inserted under; both are assigned by
	// insertChildLocked. parent is nil for dentries that were never inserted
	// under a directory.
	parent *Dentry
	name   string

	// If cached is true, dentryEntry links dentry into
	// Filesystem.cachedDentries. cached and dentryEntry are protected by
	// Filesystem.mu.
	cached bool
	dentryEntry

	// dirMu protects children and the names of child Dentries.
	//
	// Note that holding fs.mu for writing is not sufficient;
	// revalidateChildLocked(), which is a very hot path, may modify children with
	// fs.mu acquired for reading only.
	dirMu sync.Mutex `state:"nosave"`
	// children maps child names to child dentries. It is allocated lazily by
	// insertChildLocked, so it may be nil.
	children map[string]*Dentry

	// inode is the inode this dentry points to. It is set by Init and reset to
	// nil by destroyLocked.
	inode Inode
}
   220  
   221  // IncRef implements vfs.DentryImpl.IncRef.
   222  func (d *Dentry) IncRef() {
   223  	// d.refs may be 0 if d.fs.mu is locked, which serializes against
   224  	// d.cacheLocked().
   225  	r := atomic.AddInt64(&d.refs, 1)
   226  	if d.LogRefs() {
   227  		refsvfs2.LogIncRef(d, r)
   228  	}
   229  }
   230  
   231  // TryIncRef implements vfs.DentryImpl.TryIncRef.
   232  func (d *Dentry) TryIncRef() bool {
   233  	for {
   234  		r := atomic.LoadInt64(&d.refs)
   235  		if r <= 0 {
   236  			return false
   237  		}
   238  		if atomic.CompareAndSwapInt64(&d.refs, r, r+1) {
   239  			if d.LogRefs() {
   240  				refsvfs2.LogTryIncRef(d, r+1)
   241  			}
   242  			return true
   243  		}
   244  	}
   245  }
   246  
   247  // DecRef implements vfs.DentryImpl.DecRef.
   248  func (d *Dentry) DecRef(ctx context.Context) {
   249  	r := atomic.AddInt64(&d.refs, -1)
   250  	if d.LogRefs() {
   251  		refsvfs2.LogDecRef(d, r)
   252  	}
   253  	if r == 0 {
   254  		d.fs.mu.Lock()
   255  		d.cacheLocked(ctx)
   256  		d.fs.mu.Unlock()
   257  	} else if r < 0 {
   258  		panic("kernfs.Dentry.DecRef() called without holding a reference")
   259  	}
   260  }
   261  
   262  func (d *Dentry) decRefLocked(ctx context.Context) {
   263  	r := atomic.AddInt64(&d.refs, -1)
   264  	if d.LogRefs() {
   265  		refsvfs2.LogDecRef(d, r)
   266  	}
   267  	if r == 0 {
   268  		d.cacheLocked(ctx)
   269  	} else if r < 0 {
   270  		panic("kernfs.Dentry.DecRef() called without holding a reference")
   271  	}
   272  }
   273  
// cacheLocked should be called after d's reference count becomes 0. The ref
// count check may happen before acquiring d.fs.mu so there might be a race
// condition where the ref count is increased again by the time the caller
// acquires d.fs.mu. This race is handled.
// Only reachable dentries are added to the cache. However, a dentry might
// become unreachable *while* it is in the cache due to invalidation.
//
// Depending on d's state, this either does nothing (already destroyed),
// removes d from the cache (references were re-acquired), destroys d (dead or
// parentless), or inserts d at / moves d to the front of the LRU list,
// evicting one entry if the cache grew past MaxCachedDentries.
//
// Preconditions: d.fs.mu must be locked for writing.
func (d *Dentry) cacheLocked(ctx context.Context) {
	// Dentries with a non-zero reference count must be retained. (The only way
	// to obtain a reference on a dentry with zero references is via path
	// resolution, which requires d.fs.mu, so if d.refs is zero then it will
	// remain zero while we hold d.fs.mu for writing.)
	refs := atomic.LoadInt64(&d.refs)
	if refs == -1 {
		// Dentry has already been destroyed.
		return
	}
	if refs > 0 {
		// The dentry is in use again, so it must not sit in the LRU list.
		if d.cached {
			d.fs.cachedDentries.Remove(d)
			d.fs.cachedDentriesLen--
			d.cached = false
		}
		return
	}
	// If the dentry is deleted and invalidated or has no parent, then it is no
	// longer reachable by path resolution and should be dropped immediately
	// because it has zero references.
	// Note that a dentry may not always have a parent; for example magic links
	// as described in Inode.Getlink.
	if isDead := d.VFSDentry().IsDead(); isDead || d.parent == nil {
		if !isDead {
			// Parentless but not yet dead: invalidate it before destroying.
			d.fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, d.VFSDentry())
		}
		if d.cached {
			d.fs.cachedDentries.Remove(d)
			d.fs.cachedDentriesLen--
			d.cached = false
		}
		d.destroyLocked(ctx)
		return
	}
	// If d is already cached, just move it to the front of the LRU.
	if d.cached {
		d.fs.cachedDentries.Remove(d)
		d.fs.cachedDentries.PushFront(d)
		return
	}
	// Cache the dentry, then evict the least recently used cached dentry if
	// the cache becomes over-full.
	d.fs.cachedDentries.PushFront(d)
	d.fs.cachedDentriesLen++
	d.cached = true
	if d.fs.cachedDentriesLen <= d.fs.MaxCachedDentries {
		return
	}
	d.fs.evictCachedDentryLocked(ctx)
	// Whether or not the evicted dentry was destroyed,
	// evictCachedDentryLocked brought fs.cachedDentriesLen back down to
	// fs.MaxCachedDentries, so we don't loop.
}
   335  
   336  // Preconditions:
   337  // * fs.mu must be locked for writing.
   338  // * fs.cachedDentriesLen != 0.
   339  func (fs *Filesystem) evictCachedDentryLocked(ctx context.Context) {
   340  	// Evict the least recently used dentry because cache size is greater than
   341  	// max cache size (configured on mount).
   342  	victim := fs.cachedDentries.Back()
   343  	fs.cachedDentries.Remove(victim)
   344  	fs.cachedDentriesLen--
   345  	victim.cached = false
   346  	// victim.refs may have become non-zero from an earlier path resolution
   347  	// after it was inserted into fs.cachedDentries.
   348  	if atomic.LoadInt64(&victim.refs) == 0 {
   349  		if !victim.vfsd.IsDead() {
   350  			victim.parent.dirMu.Lock()
   351  			// Note that victim can't be a mount point (in any mount
   352  			// namespace), since VFS holds references on mount points.
   353  			fs.vfsfs.VirtualFilesystem().InvalidateDentry(ctx, victim.VFSDentry())
   354  			delete(victim.parent.children, victim.name)
   355  			victim.parent.dirMu.Unlock()
   356  		}
   357  		victim.destroyLocked(ctx)
   358  	}
   359  	// Whether or not victim was destroyed, we brought fs.cachedDentriesLen
   360  	// back down to fs.MaxCachedDentries, so we don't loop.
   361  }
   362  
   363  // destroyLocked destroys the dentry.
   364  //
   365  // Preconditions:
   366  // * d.fs.mu must be locked for writing.
   367  // * d.refs == 0.
   368  // * d should have been removed from d.parent.children, i.e. d is not reachable
   369  //   by path traversal.
   370  // * d.vfsd.IsDead() is true.
   371  func (d *Dentry) destroyLocked(ctx context.Context) {
   372  	refs := atomic.LoadInt64(&d.refs)
   373  	switch refs {
   374  	case 0:
   375  		// Mark the dentry destroyed.
   376  		atomic.StoreInt64(&d.refs, -1)
   377  	case -1:
   378  		panic("dentry.destroyLocked() called on already destroyed dentry")
   379  	default:
   380  		panic("dentry.destroyLocked() called with references on the dentry")
   381  	}
   382  
   383  	d.inode.DecRef(ctx) // IncRef from Init.
   384  	d.inode = nil
   385  
   386  	if d.parent != nil {
   387  		d.parent.decRefLocked(ctx)
   388  	}
   389  
   390  	refsvfs2.Unregister(d)
   391  }
   392  
   393  // RefType implements refsvfs2.CheckedObject.Type.
   394  func (d *Dentry) RefType() string {
   395  	return "kernfs.Dentry"
   396  }
   397  
   398  // LeakMessage implements refsvfs2.CheckedObject.LeakMessage.
   399  func (d *Dentry) LeakMessage() string {
   400  	return fmt.Sprintf("[kernfs.Dentry %p] reference count of %d instead of -1", d, atomic.LoadInt64(&d.refs))
   401  }
   402  
   403  // LogRefs implements refsvfs2.CheckedObject.LogRefs.
   404  //
   405  // This should only be set to true for debugging purposes, as it can generate an
   406  // extremely large amount of output and drastically degrade performance.
   407  func (d *Dentry) LogRefs() bool {
   408  	return false
   409  }
   410  
   411  // InitRoot initializes this dentry as the root of the filesystem.
   412  //
   413  // Precondition: Caller must hold a reference on inode.
   414  //
   415  // Postcondition: Caller's reference on inode is transferred to the dentry.
   416  func (d *Dentry) InitRoot(fs *Filesystem, inode Inode) {
   417  	d.Init(fs, inode)
   418  	fs.root = d
   419  	// Hold an extra reference on the root dentry. It is held by fs to prevent the
   420  	// root from being "cached" and subsequently evicted.
   421  	d.IncRef()
   422  }
   423  
   424  // Init initializes this dentry.
   425  //
   426  // Precondition: Caller must hold a reference on inode.
   427  //
   428  // Postcondition: Caller's reference on inode is transferred to the dentry.
   429  func (d *Dentry) Init(fs *Filesystem, inode Inode) {
   430  	d.vfsd.Init(d)
   431  	d.fs = fs
   432  	d.inode = inode
   433  	atomic.StoreInt64(&d.refs, 1)
   434  	ftype := inode.Mode().FileType()
   435  	if ftype == linux.ModeDirectory {
   436  		d.flags |= dflagsIsDir
   437  	}
   438  	if ftype == linux.ModeSymlink {
   439  		d.flags |= dflagsIsSymlink
   440  	}
   441  	refsvfs2.Register(d)
   442  }
   443  
   444  // VFSDentry returns the generic vfs dentry for this kernfs dentry.
   445  func (d *Dentry) VFSDentry() *vfs.Dentry {
   446  	return &d.vfsd
   447  }
   448  
   449  // isDir checks whether the dentry points to a directory inode.
   450  func (d *Dentry) isDir() bool {
   451  	return atomic.LoadUint32(&d.flags)&dflagsIsDir != 0
   452  }
   453  
   454  // isSymlink checks whether the dentry points to a symlink inode.
   455  func (d *Dentry) isSymlink() bool {
   456  	return atomic.LoadUint32(&d.flags)&dflagsIsSymlink != 0
   457  }
   458  
   459  // InotifyWithParent implements vfs.DentryImpl.InotifyWithParent.
   460  //
   461  // Although Linux technically supports inotify on pseudo filesystems (inotify
   462  // is implemented at the vfs layer), it is not particularly useful. It is left
   463  // unimplemented until someone actually needs it.
   464  func (d *Dentry) InotifyWithParent(ctx context.Context, events, cookie uint32, et vfs.EventType) {}
   465  
   466  // Watches implements vfs.DentryImpl.Watches.
   467  func (d *Dentry) Watches() *vfs.Watches {
   468  	return nil
   469  }
   470  
   471  // OnZeroWatches implements vfs.Dentry.OnZeroWatches.
   472  func (d *Dentry) OnZeroWatches(context.Context) {}
   473  
   474  // insertChild inserts child into the vfs dentry cache with the given name under
   475  // this dentry. This does not update the directory inode, so calling this on its
   476  // own isn't sufficient to insert a child into a directory.
   477  //
   478  // Preconditions:
   479  // * d must represent a directory inode.
   480  // * d.fs.mu must be locked for at least reading.
   481  func (d *Dentry) insertChild(name string, child *Dentry) {
   482  	d.dirMu.Lock()
   483  	d.insertChildLocked(name, child)
   484  	d.dirMu.Unlock()
   485  }
   486  
   487  // insertChildLocked is equivalent to insertChild, with additional
   488  // preconditions.
   489  //
   490  // Preconditions:
   491  // * d must represent a directory inode.
   492  // * d.dirMu must be locked.
   493  // * d.fs.mu must be locked for at least reading.
   494  func (d *Dentry) insertChildLocked(name string, child *Dentry) {
   495  	if !d.isDir() {
   496  		panic(fmt.Sprintf("insertChildLocked called on non-directory Dentry: %+v.", d))
   497  	}
   498  	d.IncRef() // DecRef in child's Dentry.destroy.
   499  	child.parent = d
   500  	child.name = name
   501  	if d.children == nil {
   502  		d.children = make(map[string]*Dentry)
   503  	}
   504  	d.children[name] = child
   505  }
   506  
   507  // Inode returns the dentry's inode.
   508  func (d *Dentry) Inode() Inode {
   509  	return d.inode
   510  }
   511  
   512  // FSLocalPath returns an absolute path to d, relative to the root of its
   513  // filesystem.
   514  func (d *Dentry) FSLocalPath() string {
   515  	var b fspath.Builder
   516  	_ = genericPrependPath(vfs.VirtualDentry{}, nil, d, &b)
   517  	b.PrependByte('/')
   518  	return b.String()
   519  }
   520  
// The Inode interface maps filesystem-level operations that operate on paths to
// equivalent operations on specific filesystem nodes.
//
// The interface methods are grouped into logical categories as sub interfaces
// below. Generally, an implementation for each sub interface can be provided by
// embedding an appropriate type from inode_impl_util.go. The sub interfaces
// are purely organizational. Methods declared directly in the main interface
// have no generic implementations, and should be explicitly provided by the
// client filesystem.
//
// Generally, implementations are not responsible for tasks that are common to
// all filesystems. These include:
//
// - Checking that dentries passed to methods are of the appropriate file type.
// - Checking permissions.
//
// Inode functions may be called holding filesystem wide locks and are not
// allowed to call vfs functions that may reenter, unless otherwise noted.
//
// Specific responsibilities of implementations are documented below.
type Inode interface {
	// Methods related to reference counting. A generic implementation is
	// provided by InodeNoopRefCount. These methods are generally called by the
	// equivalent Dentry methods.
	inodeRefs

	// Methods related to node metadata. A generic implementation is provided by
	// InodeAttrs. Note that a concrete filesystem using kernfs is responsible for
	// managing link counts.
	inodeMetadata

	// Method for inodes that represent symlink. InodeNotSymlink provides a
	// blanket implementation for all non-symlink inodes.
	inodeSymlink

	// Method for inodes that represent directories. InodeNotDirectory provides
	// a blanket implementation for all non-directory inodes.
	inodeDirectory

	// Open creates a file description for the filesystem object represented by
	// this inode. The returned file description should hold a reference on the
	// dentry for its lifetime.
	//
	// Precondition: rp.Done(). d must be the kernfs Dentry containing the
	// inode on which Open() is being called.
	Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error)

	// StatFS returns filesystem statistics for the client filesystem. This
	// corresponds to vfs.FilesystemImpl.StatFSAt. If the client filesystem
	// doesn't support statfs(2), this should return ENOSYS.
	StatFS(ctx context.Context, fs *vfs.Filesystem) (linux.Statfs, error)

	// Keep indicates whether the dentry created after Inode.Lookup should be
	// kept in the kernfs dentry tree.
	Keep() bool

	// Valid should return true if this inode is still valid, or needs to
	// be resolved again by a call to Lookup.
	Valid(ctx context.Context) bool
}
   581  
// inodeRefs groups the reference counting methods of the Inode interface.
type inodeRefs interface {
	// IncRef takes a reference on the inode.
	IncRef()
	// DecRef drops a reference on the inode. Dentry.destroyLocked calls it to
	// release the reference the dentry was given in Dentry.Init.
	DecRef(ctx context.Context)
	// TryIncRef attempts to take a reference on the inode and reports whether
	// it succeeded.
	// NOTE(review): presumably this fails once the count has reached zero,
	// mirroring Dentry.TryIncRef -- confirm against implementations.
	TryIncRef() bool
}
   587  
// inodeMetadata groups the methods of the Inode interface that deal with node
// metadata: permission checks, mode, and stat.
type inodeMetadata interface {
	// CheckPermissions checks that creds may access this inode for the
	// requested access type, per the rules of
	// fs/namei.c:generic_permission().
	CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error

	// Mode returns the (struct stat)::st_mode value for this inode. This is
	// separated from Stat for performance.
	Mode() linux.FileMode

	// Stat returns the metadata for this inode. This corresponds to
	// vfs.FilesystemImpl.StatAt.
	Stat(ctx context.Context, fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error)

	// SetStat updates the metadata for this inode. This corresponds to
	// vfs.FilesystemImpl.SetStatAt. Implementations are responsible for checking
	// if the operation can be performed (see vfs.CheckSetStat() for common
	// checks).
	SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error
}
   608  
// inodeDirectory groups the methods of the Inode interface that operate on
// directories.
//
// Precondition: All methods in this interface may only be called on directory
// inodes.
type inodeDirectory interface {
	// The New{File,Dir,Node,Link,Symlink} methods below should return a new inode
	// that will be hashed into the dentry tree.
	//
	// These inode constructors are inode-level operations rather than
	// filesystem-level operations to allow client filesystems to mix different
	// implementations based on the new node's location in the
	// filesystem.

	// HasChildren returns true if the directory inode has any children.
	HasChildren() bool

	// NewFile creates a new regular file inode.
	NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (Inode, error)

	// NewDir creates a new directory inode.
	NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (Inode, error)

	// NewLink creates a new hardlink to a specified inode in this
	// directory. Implementations should create a new kernfs Dentry pointing to
	// target, and update target's link count.
	NewLink(ctx context.Context, name string, target Inode) (Inode, error)

	// NewSymlink creates a new symbolic link inode.
	NewSymlink(ctx context.Context, name, target string) (Inode, error)

	// NewNode creates a new filesystem node for a mknod syscall.
	NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (Inode, error)

	// Unlink removes a child dentry from this directory inode.
	Unlink(ctx context.Context, name string, child Inode) error

	// RmDir removes an empty child directory from this directory
	// inode. Implementations must update the parent directory's link count,
	// if required. Implementations are not responsible for checking that child
	// is a directory, or for checking that it is empty.
	RmDir(ctx context.Context, name string, child Inode) error

	// Rename is called on the source directory containing an inode being
	// renamed. child should point to the resolved child in the source
	// directory.
	//
	// Precondition: Caller must serialize concurrent calls to Rename.
	Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error

	// Lookup should return an appropriate inode if name should resolve to a
	// child of this directory inode. This gives the directory an opportunity
	// on every lookup to resolve additional entries. This is only called when
	// the inode is a directory.
	//
	// The child returned by Lookup will be hashed into the VFS dentry tree,
	// at least for the duration of the current FS operation.
	//
	// Lookup must return the child with an extra reference whose ownership is
	// transferred to the dentry that is created to point to that inode. If
	// Inode.Keep returns false, that new dentry will be dropped at the end of
	// the current filesystem operation (before returning back to the VFS
	// layer) if no other ref is picked on that dentry. If Inode.Keep returns
	// true, then the dentry will be cached into the dentry tree until it is
	// Unlink'd or RmDir'd.
	Lookup(ctx context.Context, name string) (Inode, error)

	// IterDirents is used to iterate over dynamically created entries. It invokes
	// callback on each entry in the directory represented by the Inode.
	// 'offset' is the offset for the entire IterDirents call, which may include
	// results from the caller (e.g. "." and ".."). 'relOffset' is the offset
	// inside the entries returned by this IterDirents invocation. In other words,
	// 'offset' should be used to calculate each vfs.Dirent.NextOff as well as
	// the return value, while 'relOffset' is the place to start iteration.
	IterDirents(ctx context.Context, mnt *vfs.Mount, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error)
}
   682  
// inodeSymlink groups the methods of the Inode interface that resolve symbolic
// link targets.
type inodeSymlink interface {
	// Readlink returns the target of a symbolic link. If an inode is not a
	// symlink, the implementation should return EINVAL.
	//
	// Readlink is called with no kernfs locks held, so it may reenter if needed
	// to resolve symlink targets.
	Readlink(ctx context.Context, mnt *vfs.Mount) (string, error)

	// Getlink returns the target of a symbolic link, as used by path
	// resolution:
	//
	// - If the inode is a "magic link" (a link whose target is most accurately
	// represented as a VirtualDentry), Getlink returns (ok VirtualDentry, "",
	// nil). A reference is taken on the returned VirtualDentry.
	//
	// - If the inode is an ordinary symlink, Getlink returns (zero-value
	// VirtualDentry, symlink target, nil).
	//
	// - If the inode is not a symlink, Getlink returns (zero-value
	// VirtualDentry, "", EINVAL).
	Getlink(ctx context.Context, mnt *vfs.Mount) (vfs.VirtualDentry, string, error)
}