github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/fsimpl/kernfs/inode_impl_util.go (about)

     1  // Copyright 2019 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package kernfs
    16  
    17  import (
    18  	"fmt"
    19  
    20  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    21  	"github.com/MerlinKodo/gvisor/pkg/atomicbitops"
    22  	"github.com/MerlinKodo/gvisor/pkg/context"
    23  	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
    24  	"github.com/MerlinKodo/gvisor/pkg/hostarch"
    25  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth"
    26  	ktime "github.com/MerlinKodo/gvisor/pkg/sentry/kernel/time"
    27  	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
    28  	"github.com/MerlinKodo/gvisor/pkg/sync"
    29  )
    30  
    31  // InodeNoopRefCount partially implements the Inode interface, specifically the
    32  // inodeRefs sub interface. InodeNoopRefCount implements a simple reference
    33  // count for inodes, performing no extra actions when references are obtained or
    34  // released. This is suitable for simple file inodes that don't reference any
    35  // resources.
    36  //
    37  // +stateify savable
    38  type InodeNoopRefCount struct {
    39  	InodeTemporary
    40  }
    41  
    42  // IncRef implements Inode.IncRef.
    43  func (InodeNoopRefCount) IncRef() {
    44  }
    45  
    46  // DecRef implements Inode.DecRef.
    47  func (InodeNoopRefCount) DecRef(context.Context) {
    48  }
    49  
    50  // TryIncRef implements Inode.TryIncRef.
    51  func (InodeNoopRefCount) TryIncRef() bool {
    52  	return true
    53  }
    54  
    55  // InodeDirectoryNoNewChildren partially implements the Inode interface.
    56  // InodeDirectoryNoNewChildren represents a directory inode which does not
    57  // support creation of new children.
    58  //
    59  // +stateify savable
    60  type InodeDirectoryNoNewChildren struct{}
    61  
    62  // NewFile implements Inode.NewFile.
    63  func (InodeDirectoryNoNewChildren) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) {
    64  	return nil, linuxerr.EPERM
    65  }
    66  
    67  // NewDir implements Inode.NewDir.
    68  func (InodeDirectoryNoNewChildren) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) {
    69  	return nil, linuxerr.EPERM
    70  }
    71  
    72  // NewLink implements Inode.NewLink.
    73  func (InodeDirectoryNoNewChildren) NewLink(context.Context, string, Inode) (Inode, error) {
    74  	return nil, linuxerr.EPERM
    75  }
    76  
    77  // NewSymlink implements Inode.NewSymlink.
    78  func (InodeDirectoryNoNewChildren) NewSymlink(context.Context, string, string) (Inode, error) {
    79  	return nil, linuxerr.EPERM
    80  }
    81  
    82  // NewNode implements Inode.NewNode.
    83  func (InodeDirectoryNoNewChildren) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) {
    84  	return nil, linuxerr.EPERM
    85  }
    86  
    87  // InodeNotDirectory partially implements the Inode interface, specifically the
    88  // inodeDirectory and inodeDynamicDirectory sub interfaces. Inodes that do not
    89  // represent directories can embed this to provide no-op implementations for
    90  // directory-related functions.
    91  //
    92  // +stateify savable
    93  type InodeNotDirectory struct {
    94  	InodeAlwaysValid
    95  }
    96  
    97  // HasChildren implements Inode.HasChildren.
    98  func (InodeNotDirectory) HasChildren() bool {
    99  	return false
   100  }
   101  
   102  // NewFile implements Inode.NewFile.
   103  func (InodeNotDirectory) NewFile(context.Context, string, vfs.OpenOptions) (Inode, error) {
   104  	panic("NewFile called on non-directory inode")
   105  }
   106  
   107  // NewDir implements Inode.NewDir.
   108  func (InodeNotDirectory) NewDir(context.Context, string, vfs.MkdirOptions) (Inode, error) {
   109  	panic("NewDir called on non-directory inode")
   110  }
   111  
   112  // NewLink implements Inode.NewLinkink.
   113  func (InodeNotDirectory) NewLink(context.Context, string, Inode) (Inode, error) {
   114  	panic("NewLink called on non-directory inode")
   115  }
   116  
   117  // NewSymlink implements Inode.NewSymlink.
   118  func (InodeNotDirectory) NewSymlink(context.Context, string, string) (Inode, error) {
   119  	panic("NewSymlink called on non-directory inode")
   120  }
   121  
   122  // NewNode implements Inode.NewNode.
   123  func (InodeNotDirectory) NewNode(context.Context, string, vfs.MknodOptions) (Inode, error) {
   124  	panic("NewNode called on non-directory inode")
   125  }
   126  
   127  // Unlink implements Inode.Unlink.
   128  func (InodeNotDirectory) Unlink(context.Context, string, Inode) error {
   129  	panic("Unlink called on non-directory inode")
   130  }
   131  
   132  // RmDir implements Inode.RmDir.
   133  func (InodeNotDirectory) RmDir(context.Context, string, Inode) error {
   134  	panic("RmDir called on non-directory inode")
   135  }
   136  
   137  // Rename implements Inode.Rename.
   138  func (InodeNotDirectory) Rename(context.Context, string, string, Inode, Inode) error {
   139  	panic("Rename called on non-directory inode")
   140  }
   141  
   142  // Lookup implements Inode.Lookup.
   143  func (InodeNotDirectory) Lookup(ctx context.Context, name string) (Inode, error) {
   144  	panic("Lookup called on non-directory inode")
   145  }
   146  
   147  // IterDirents implements Inode.IterDirents.
   148  func (InodeNotDirectory) IterDirents(ctx context.Context, mnt *vfs.Mount, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
   149  	panic("IterDirents called on non-directory inode")
   150  }
   151  
   152  // InodeNotSymlink partially implements the Inode interface, specifically the
   153  // inodeSymlink sub interface. All inodes that are not symlinks may embed this
   154  // to return the appropriate errors from symlink-related functions.
   155  //
   156  // +stateify savable
   157  type InodeNotSymlink struct{}
   158  
   159  // Readlink implements Inode.Readlink.
   160  func (InodeNotSymlink) Readlink(context.Context, *vfs.Mount) (string, error) {
   161  	return "", linuxerr.EINVAL
   162  }
   163  
   164  // Getlink implements Inode.Getlink.
   165  func (InodeNotSymlink) Getlink(context.Context, *vfs.Mount) (vfs.VirtualDentry, string, error) {
   166  	return vfs.VirtualDentry{}, "", linuxerr.EINVAL
   167  }
   168  
// InodeAttrs partially implements the Inode interface, specifically the
// inodeMetadata sub interface. InodeAttrs provides functionality related to
// inode attributes.
//
// devMajor and devMinor are plain fields that are written by Init and only
// read by the other methods in this file. All remaining attributes are stored
// in atomic cells and are read and written without any additional locking.
// Init sets blockSize to hostarch.PageSize.
//
// Must be initialized by Init prior to first use.
//
// +stateify savable
type InodeAttrs struct {
	devMajor  uint32
	devMinor  uint32
	ino       atomicbitops.Uint64
	mode      atomicbitops.Uint32
	uid       atomicbitops.Uint32
	gid       atomicbitops.Uint32
	nlink     atomicbitops.Uint32
	blockSize atomicbitops.Uint32

	// Timestamps, all nsecs from the Unix epoch.
	atime atomicbitops.Int64
	mtime atomicbitops.Int64
	ctime atomicbitops.Int64
}
   191  
   192  // Init initializes this InodeAttrs.
   193  func (a *InodeAttrs) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, mode linux.FileMode) {
   194  	if mode.FileType() == 0 {
   195  		panic(fmt.Sprintf("No file type specified in 'mode' for InodeAttrs.Init(): mode=0%o", mode))
   196  	}
   197  
   198  	nlink := uint32(1)
   199  	if mode.FileType() == linux.ModeDirectory {
   200  		nlink = 2
   201  	}
   202  	a.devMajor = devMajor
   203  	a.devMinor = devMinor
   204  	a.ino.Store(ino)
   205  	a.mode.Store(uint32(mode))
   206  	a.uid.Store(uint32(creds.EffectiveKUID))
   207  	a.gid.Store(uint32(creds.EffectiveKGID))
   208  	a.nlink.Store(nlink)
   209  	a.blockSize.Store(hostarch.PageSize)
   210  	now := ktime.NowFromContext(ctx).Nanoseconds()
   211  	a.atime.Store(now)
   212  	a.mtime.Store(now)
   213  	a.ctime.Store(now)
   214  }
   215  
   216  // DevMajor returns the device major number.
   217  func (a *InodeAttrs) DevMajor() uint32 {
   218  	return a.devMajor
   219  }
   220  
   221  // DevMinor returns the device minor number.
   222  func (a *InodeAttrs) DevMinor() uint32 {
   223  	return a.devMinor
   224  }
   225  
   226  // Ino returns the inode id.
   227  func (a *InodeAttrs) Ino() uint64 {
   228  	return a.ino.Load()
   229  }
   230  
   231  // UID implements Inode.UID.
   232  func (a *InodeAttrs) UID() auth.KUID {
   233  	return auth.KUID(a.uid.Load())
   234  }
   235  
   236  // GID implements Inode.GID.
   237  func (a *InodeAttrs) GID() auth.KGID {
   238  	return auth.KGID(a.gid.Load())
   239  }
   240  
   241  // Mode implements Inode.Mode.
   242  func (a *InodeAttrs) Mode() linux.FileMode {
   243  	return linux.FileMode(a.mode.Load())
   244  }
   245  
   246  // Links returns the link count.
   247  func (a *InodeAttrs) Links() uint32 {
   248  	return a.nlink.Load()
   249  }
   250  
   251  // TouchAtime updates a.atime to the current time.
   252  func (a *InodeAttrs) TouchAtime(ctx context.Context, mnt *vfs.Mount) {
   253  	if mnt.Flags.NoATime || mnt.ReadOnly() {
   254  		return
   255  	}
   256  	if err := mnt.CheckBeginWrite(); err != nil {
   257  		return
   258  	}
   259  	a.atime.Store(ktime.NowFromContext(ctx).Nanoseconds())
   260  	mnt.EndWrite()
   261  }
   262  
   263  // TouchCMtime updates a.{c/m}time to the current time. The caller should
   264  // synchronize calls to this so that ctime and mtime are updated to the same
   265  // value.
   266  func (a *InodeAttrs) TouchCMtime(ctx context.Context) {
   267  	now := ktime.NowFromContext(ctx).Nanoseconds()
   268  	a.mtime.Store(now)
   269  	a.ctime.Store(now)
   270  }
   271  
   272  // Stat partially implements Inode.Stat. Note that this function doesn't provide
   273  // all the stat fields, and the embedder should consider extending the result
   274  // with filesystem-specific fields.
   275  func (a *InodeAttrs) Stat(context.Context, *vfs.Filesystem, vfs.StatOptions) (linux.Statx, error) {
   276  	var stat linux.Statx
   277  	stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_NLINK | linux.STATX_ATIME | linux.STATX_MTIME | linux.STATX_CTIME
   278  	stat.DevMajor = a.devMajor
   279  	stat.DevMinor = a.devMinor
   280  	stat.Ino = a.ino.Load()
   281  	stat.Mode = uint16(a.Mode())
   282  	stat.UID = a.uid.Load()
   283  	stat.GID = a.gid.Load()
   284  	stat.Nlink = a.nlink.Load()
   285  	stat.Blksize = a.blockSize.Load()
   286  	stat.Atime = linux.NsecToStatxTimestamp(a.atime.Load())
   287  	stat.Mtime = linux.NsecToStatxTimestamp(a.mtime.Load())
   288  	stat.Ctime = linux.NsecToStatxTimestamp(a.ctime.Load())
   289  	return stat, nil
   290  }
   291  
// SetStat implements Inode.SetStat.
//
// Only mode, uid, gid, atime, mtime and size may be named in opts.Stat.Mask;
// any other bit yields EPERM. A size change is accepted for non-directories
// (EISDIR for directories) but is otherwise ignored here. An empty mask is a
// successful no-op. Permission to make the change is decided by
// vfs.CheckSetStat. This method does not modify ctime.
//
// Each attribute is updated atomically on its own; the update as a whole is
// not atomic with respect to concurrent readers or other SetStat calls.
func (a *InodeAttrs) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error {
	if opts.Stat.Mask == 0 {
		return nil
	}

	// Note that not all fields are modifiable. For example, the file type and
	// inode numbers are immutable after node creation. Setting the size is often
	// allowed by kernfs files but does not do anything. If some other behavior is
	// needed, the embedder should consider extending SetStat.
	if opts.Stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_SIZE) != 0 {
		return linuxerr.EPERM
	}
	if opts.Stat.Mask&linux.STATX_SIZE != 0 && a.Mode().IsDir() {
		return linuxerr.EISDIR
	}
	// opts is passed by pointer to CheckSetStat; stat is copied from it only
	// afterwards, so the copy reflects whatever CheckSetStat left in opts.
	if err := vfs.CheckSetStat(ctx, creds, &opts, a.Mode(), auth.KUID(a.uid.Load()), auth.KGID(a.gid.Load())); err != nil {
		return err
	}

	// clearSID records that ownership changed, which requires the SUID/SGID
	// bits to be cleared from the mode.
	clearSID := false
	stat := opts.Stat
	if stat.Mask&linux.STATX_UID != 0 {
		a.uid.Store(stat.UID)
		clearSID = true
	}
	if stat.Mask&linux.STATX_GID != 0 {
		a.gid.Store(stat.GID)
		clearSID = true
	}
	if stat.Mask&linux.STATX_MODE != 0 {
		// Compare-and-swap loop: retry until the mode is replaced without a
		// concurrent modification in between.
		for {
			old := a.mode.Load()
			// Keep the existing file type bits; take everything else from
			// the requested mode.
			ft := old & linux.S_IFMT
			newMode := ft | uint32(stat.Mode & ^uint16(linux.S_IFMT))
			if clearSID {
				newMode = vfs.ClearSUIDAndSGID(newMode)
			}
			if swapped := a.mode.CompareAndSwap(old, newMode); swapped {
				// SUID/SGID clearing (if any) was folded into this update.
				clearSID = false
				break
			}
		}
	}

	// We may have to clear the SUID/SGID bits, but didn't do so as part of
	// STATX_MODE.
	if clearSID {
		for {
			old := a.mode.Load()
			newMode := vfs.ClearSUIDAndSGID(old)
			if swapped := a.mode.CompareAndSwap(old, newMode); swapped {
				break
			}
		}
	}

	// A single clock reading serves both timestamps when UTIME_NOW is
	// requested for them.
	now := ktime.NowFromContext(ctx).Nanoseconds()
	if stat.Mask&linux.STATX_ATIME != 0 {
		if stat.Atime.Nsec == linux.UTIME_NOW {
			stat.Atime = linux.NsecToStatxTimestamp(now)
		}
		a.atime.Store(stat.Atime.ToNsec())
	}
	if stat.Mask&linux.STATX_MTIME != 0 {
		if stat.Mtime.Nsec == linux.UTIME_NOW {
			stat.Mtime = linux.NsecToStatxTimestamp(now)
		}
		a.mtime.Store(stat.Mtime.ToNsec())
	}

	return nil
}
   365  
   366  // CheckPermissions implements Inode.CheckPermissions.
   367  func (a *InodeAttrs) CheckPermissions(_ context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error {
   368  	return vfs.GenericCheckPermissions(
   369  		creds,
   370  		ats,
   371  		a.Mode(),
   372  		auth.KUID(a.uid.Load()),
   373  		auth.KGID(a.gid.Load()),
   374  	)
   375  }
   376  
   377  // IncLinks implements Inode.IncLinks.
   378  func (a *InodeAttrs) IncLinks(n uint32) {
   379  	if a.nlink.Add(n) <= n {
   380  		panic("InodeLink.IncLinks called with no existing links")
   381  	}
   382  }
   383  
   384  // DecLinks implements Inode.DecLinks.
   385  func (a *InodeAttrs) DecLinks() {
   386  	if nlink := a.nlink.Add(^uint32(0)); nlink == ^uint32(0) {
   387  		// Negative overflow
   388  		panic("Inode.DecLinks called at 0 links")
   389  	}
   390  }
   391  
// slot is a single child entry tracked by OrderedChildren. Each slot is
// referenced both from OrderedChildren.set (keyed by name) and from
// OrderedChildren.order.
//
// static is true for children inserted through Populate; the owning
// OrderedChildren holds a reference on their inode and drops it in Destroy.
// The embedded slotEntry presumably provides the linkage used by slotList
// (its definition is not in this file).
//
// +stateify savable
type slot struct {
	name   string
	inode  Inode
	static bool
	slotEntry
}
   399  
// OrderedChildrenOptions contains initialization options for OrderedChildren.
// The zero value describes a non-writable OrderedChildren.
//
// +stateify savable
type OrderedChildrenOptions struct {
	// Writable indicates whether vfs.FilesystemImpl methods implemented by
	// OrderedChildren may modify the tracked children. This applies to
	// operations related to rename, unlink and rmdir. If an OrderedChildren is
	// not writable, these operations all fail with EPERM.
	//
	// Note that writable users must implement the sticky bit (I_SVTX).
	Writable bool
}
   412  
// inodeWithOrderedChildren allows extraction of an OrderedChildren from an
// Inode implementation. A concrete type that both implements the Inode
// interface and embeds OrderedChildren will be castable to this interface, and
// we can get to the embedded OrderedChildren through the orderedChildren
// method.
//
// Rename uses this interface to reach the destination directory's
// OrderedChildren.
type inodeWithOrderedChildren interface {
	Inode
	orderedChildren() *OrderedChildren
}
   422  
// OrderedChildren partially implements the Inode interface. OrderedChildren can
// be embedded in directory inodes to keep track of children in the
// directory, and can then be used to implement a generic directory FD -- see
// GenericDirectoryFD.
//
// OrderedChildren can represent a node in an Inode tree. The children inodes
// might be directories themselves using OrderedChildren; hence extending the
// tree. The parent inode (OrderedChildren user) holds a ref on all its static
// children. This lets the static inodes outlive their associated dentry.
// While the dentry might have to be regenerated via a Lookup() call, we can
// keep reusing the same static inode. These static children inodes are finally
// DecRef'd when this directory inode is being destroyed. This makes
// OrderedChildren suitable for static directory entries as well.
//
// Must be initialized with Init before first use.
//
// +stateify savable
type OrderedChildren struct {
	// Can children be modified by user syscalls? If set to false, interface
	// methods that would modify the children return EPERM. Immutable.
	writable bool

	// mu protects order and set. order lists the child slots in insertion
	// order, while set indexes the same slots by child name.
	mu    sync.RWMutex `state:"nosave"`
	order slotList
	set   map[string]*slot
}
   449  
   450  // orderedChildren implements inodeWithOrderedChildren.orderedChildren.
   451  func (o *OrderedChildren) orderedChildren() *OrderedChildren {
   452  	return o
   453  }
   454  
   455  // Init initializes an OrderedChildren.
   456  func (o *OrderedChildren) Init(opts OrderedChildrenOptions) {
   457  	o.writable = opts.Writable
   458  	o.set = make(map[string]*slot)
   459  }
   460  
   461  // Destroy clears the children stored in o. It should be called by structs
   462  // embedding OrderedChildren upon destruction, i.e. when their reference count
   463  // reaches zero.
   464  func (o *OrderedChildren) Destroy(ctx context.Context) {
   465  	o.mu.Lock()
   466  	defer o.mu.Unlock()
   467  	// Drop the ref that o owns on the static inodes it holds.
   468  	for _, s := range o.set {
   469  		if s.static {
   470  			s.inode.DecRef(ctx)
   471  		}
   472  	}
   473  	o.order.Reset()
   474  	o.set = nil
   475  }
   476  
   477  // Populate inserts static children into this OrderedChildren.
   478  // Populate returns the number of directories inserted, which the caller
   479  // may use to update the link count for the parent directory.
   480  //
   481  // Precondition:
   482  //   - d must represent a directory inode.
   483  //   - children must not contain any conflicting entries already in o.
   484  //   - Caller must hold a reference on all inodes passed.
   485  //
   486  // Postcondition: Caller's references on inodes are transferred to o.
   487  func (o *OrderedChildren) Populate(children map[string]Inode) uint32 {
   488  	var links uint32
   489  	for name, child := range children {
   490  		if child.Mode().IsDir() {
   491  			links++
   492  		}
   493  		if err := o.insert(name, child, true); err != nil {
   494  			panic(fmt.Sprintf("Collision when attempting to insert child %q (%+v)", name, child))
   495  		}
   496  	}
   497  	return links
   498  }
   499  
   500  // Lookup implements Inode.Lookup.
   501  func (o *OrderedChildren) Lookup(ctx context.Context, name string) (Inode, error) {
   502  	o.mu.RLock()
   503  	defer o.mu.RUnlock()
   504  
   505  	s, ok := o.set[name]
   506  	if !ok {
   507  		return nil, linuxerr.ENOENT
   508  	}
   509  
   510  	s.inode.IncRef() // This ref is passed to the dentry upon creation via Init.
   511  	return s.inode, nil
   512  }
   513  
   514  // ForEachChild calls fn on all childrens tracked by this ordered children.
   515  func (o *OrderedChildren) ForEachChild(fn func(string, Inode)) {
   516  	o.mu.RLock()
   517  	defer o.mu.RUnlock()
   518  
   519  	for name, slot := range o.set {
   520  		fn(name, slot.inode)
   521  	}
   522  }
   523  
   524  // IterDirents implements Inode.IterDirents.
   525  func (o *OrderedChildren) IterDirents(ctx context.Context, mnt *vfs.Mount, cb vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) {
   526  	// All entries from OrderedChildren have already been handled in
   527  	// GenericDirectoryFD.IterDirents.
   528  	return offset, nil
   529  }
   530  
   531  // HasChildren implements Inode.HasChildren.
   532  func (o *OrderedChildren) HasChildren() bool {
   533  	o.mu.RLock()
   534  	defer o.mu.RUnlock()
   535  	return len(o.set) > 0
   536  }
   537  
   538  // Insert inserts a dynamic child into o. This ignores the writability of o, as
   539  // this is not part of the vfs.FilesystemImpl interface, and is a lower-level operation.
   540  func (o *OrderedChildren) Insert(name string, child Inode) error {
   541  	return o.insert(name, child, false)
   542  }
   543  
   544  // Inserter is like Insert, but obtains the child to insert by calling
   545  // makeChild. makeChild is only called if the insert will succeed. This allows
   546  // the caller to atomically check and insert a child without having to
   547  // clean up the child on failure.
   548  func (o *OrderedChildren) Inserter(name string, makeChild func() Inode) (Inode, error) {
   549  	o.mu.Lock()
   550  	defer o.mu.Unlock()
   551  	if _, ok := o.set[name]; ok {
   552  		return nil, linuxerr.EEXIST
   553  	}
   554  
   555  	// Note: We must not fail after we call makeChild().
   556  
   557  	child := makeChild()
   558  	s := &slot{
   559  		name:   name,
   560  		inode:  child,
   561  		static: false,
   562  	}
   563  	o.order.PushBack(s)
   564  	o.set[name] = s
   565  	return child, nil
   566  }
   567  
   568  // insert inserts child into o.
   569  //
   570  // Precondition: Caller must be holding a ref on child if static is true.
   571  //
   572  // Postcondition: Caller's ref on child is transferred to o if static is true.
   573  func (o *OrderedChildren) insert(name string, child Inode, static bool) error {
   574  	o.mu.Lock()
   575  	defer o.mu.Unlock()
   576  	if _, ok := o.set[name]; ok {
   577  		return linuxerr.EEXIST
   578  	}
   579  	s := &slot{
   580  		name:   name,
   581  		inode:  child,
   582  		static: static,
   583  	}
   584  	o.order.PushBack(s)
   585  	o.set[name] = s
   586  	return nil
   587  }
   588  
   589  // Precondition: caller must hold o.mu for writing.
   590  func (o *OrderedChildren) removeLocked(name string) {
   591  	if s, ok := o.set[name]; ok {
   592  		if s.static {
   593  			panic(fmt.Sprintf("removeLocked called on a static inode: %v", s.inode))
   594  		}
   595  		delete(o.set, name)
   596  		o.order.Remove(s)
   597  	}
   598  }
   599  
   600  // Precondition: caller must hold o.mu for reading or writing.
   601  func (o *OrderedChildren) checkExistingLocked(name string, child Inode) error {
   602  	s, ok := o.set[name]
   603  	if !ok {
   604  		return linuxerr.ENOENT
   605  	}
   606  	if s.inode != child {
   607  		panic(fmt.Sprintf("Inode doesn't match what kernfs thinks! Name: %q, OrderedChild: %p, kernfs: %p", name, s.inode, child))
   608  	}
   609  	return nil
   610  }
   611  
   612  // Unlink implements Inode.Unlink.
   613  func (o *OrderedChildren) Unlink(ctx context.Context, name string, child Inode) error {
   614  	if !o.writable {
   615  		return linuxerr.EPERM
   616  	}
   617  	o.mu.Lock()
   618  	defer o.mu.Unlock()
   619  	if err := o.checkExistingLocked(name, child); err != nil {
   620  		return err
   621  	}
   622  
   623  	o.removeLocked(name)
   624  	return nil
   625  }
   626  
   627  // RmDir implements Inode.RmDir.
   628  func (o *OrderedChildren) RmDir(ctx context.Context, name string, child Inode) error {
   629  	// We're not responsible for checking that child is a directory, that it's
   630  	// empty, or updating any link counts; so this is the same as unlink.
   631  	return o.Unlink(ctx, name, child)
   632  }
   633  
// Rename implements Inode.Rename. It moves child from oldname in o to newname
// in the OrderedChildren of dstDir.
//
// It returns EPERM if either directory is not writable, EXDEV if dstDir does
// not expose an OrderedChildren, ENOENT if oldname is not tracked by o, and
// EEXIST if newname is already present in the destination.
//
// Precondition: Rename may only be called across two directory inodes with
// identical implementations of Rename. Practically, this means filesystems that
// implement Rename by embedding OrderedChildren for any directory
// implementation must use OrderedChildren for all directory implementations
// that will support Rename.
//
// Postcondition: reference on any replaced dentry transferred to caller.
//
// NOTE(review): this implementation never replaces an existing entry; a name
// collision in the destination yields EEXIST instead. Confirm whether the
// postcondition above still applies.
func (o *OrderedChildren) Rename(ctx context.Context, oldname, newname string, child, dstDir Inode) error {
	if !o.writable {
		return linuxerr.EPERM
	}
	dstIOC, ok := dstDir.(inodeWithOrderedChildren)
	if !ok {
		return linuxerr.EXDEV
	}
	dst := dstIOC.orderedChildren()
	if !dst.writable {
		return linuxerr.EPERM
	}

	// Note: There's a potential deadlock below if concurrent calls to Rename
	// refer to the same src and dst directories in reverse. We avoid any
	// ordering issues because the caller is required to serialize concurrent
	// calls to Rename in accordance with the interface declaration.
	o.mu.Lock()
	defer o.mu.Unlock()
	// Lock the destination only when it is a different object, since the
	// source lock is already held.
	if dst != o {
		dst.mu.Lock()
		defer dst.mu.Unlock()
	}

	// Ensure target inode exists in src.
	if err := o.checkExistingLocked(oldname, child); err != nil {
		return err
	}

	// Ensure no name collision in dst.
	if _, ok := dst.set[newname]; ok {
		return linuxerr.EEXIST
	}

	// Remove from src.
	o.removeLocked(oldname)

	// Add to dst. The new slot is non-static (static is left at its zero
	// value).
	s := &slot{
		name:  newname,
		inode: child,
	}
	dst.order.PushBack(s)
	dst.set[newname] = s

	return nil
}
   690  
   691  // nthLocked returns an iterator to the nth child tracked by this object. The
   692  // iterator is valid until the caller releases o.mu. Returns nil if the
   693  // requested index falls out of bounds.
   694  //
   695  // Preconditon: Caller must hold o.mu for reading.
   696  func (o *OrderedChildren) nthLocked(i int64) *slot {
   697  	for it := o.order.Front(); it != nil && i >= 0; it = it.Next() {
   698  		if i == 0 {
   699  			return it
   700  		}
   701  		i--
   702  	}
   703  	return nil
   704  }
   705  
   706  // InodeSymlink partially implements Inode interface for symlinks.
   707  //
   708  // +stateify savable
   709  type InodeSymlink struct {
   710  	InodeNotDirectory
   711  }
   712  
   713  // Open implements Inode.Open.
   714  func (InodeSymlink) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
   715  	return nil, linuxerr.ELOOP
   716  }
   717  
// StaticDirectory is a standard implementation of a directory with static
// contents.
//
// Its behavior is assembled from the embedded helpers: attributes come from
// InodeAttrs, children are tracked by OrderedChildren, and creating new
// children is refused by InodeDirectoryNoNewChildren. SetStat is overridden
// to always fail with EPERM.
//
// +stateify savable
type StaticDirectory struct {
	InodeAlwaysValid
	InodeAttrs
	InodeDirectoryNoNewChildren
	InodeNoStatFS
	InodeNotAnonymous
	InodeNotSymlink
	InodeTemporary
	InodeWatches
	OrderedChildren
	StaticDirectoryRefs

	// locks and fdOpts are handed to NewGenericDirectoryFD by Open.
	locks  vfs.FileLocks
	fdOpts GenericDirectoryFDOptions
}

// Compile-time check that *StaticDirectory satisfies Inode.
var _ Inode = (*StaticDirectory)(nil)
   739  
   740  // NewStaticDir creates a new static directory and returns its dentry.
   741  func NewStaticDir(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, children map[string]Inode, fdOpts GenericDirectoryFDOptions) Inode {
   742  	inode := &StaticDirectory{}
   743  	inode.Init(ctx, creds, devMajor, devMinor, ino, perm, fdOpts)
   744  	inode.InitRefs()
   745  
   746  	inode.OrderedChildren.Init(OrderedChildrenOptions{})
   747  	links := inode.OrderedChildren.Populate(children)
   748  	inode.IncLinks(links)
   749  
   750  	return inode
   751  }
   752  
   753  // Init initializes StaticDirectory.
   754  func (s *StaticDirectory) Init(ctx context.Context, creds *auth.Credentials, devMajor, devMinor uint32, ino uint64, perm linux.FileMode, fdOpts GenericDirectoryFDOptions) {
   755  	if perm&^linux.PermissionsMask != 0 {
   756  		panic(fmt.Sprintf("Only permission mask must be set: %x", perm&linux.PermissionsMask))
   757  	}
   758  	s.fdOpts = fdOpts
   759  	s.InodeAttrs.Init(ctx, creds, devMajor, devMinor, ino, linux.ModeDirectory|perm)
   760  }
   761  
   762  // Open implements Inode.Open.
   763  func (s *StaticDirectory) Open(ctx context.Context, rp *vfs.ResolvingPath, d *Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
   764  	fd, err := NewGenericDirectoryFD(rp.Mount(), d, &s.OrderedChildren, &s.locks, &opts, s.fdOpts)
   765  	if err != nil {
   766  		return nil, err
   767  	}
   768  	return fd.VFSFileDescription(), nil
   769  }
   770  
   771  // SetStat implements Inode.SetStat not allowing inode attributes to be changed.
   772  func (*StaticDirectory) SetStat(context.Context, *vfs.Filesystem, *auth.Credentials, vfs.SetStatOptions) error {
   773  	return linuxerr.EPERM
   774  }
   775  
   776  // DecRef implements Inode.DecRef.
   777  func (s *StaticDirectory) DecRef(ctx context.Context) {
   778  	s.StaticDirectoryRefs.DecRef(func() { s.Destroy(ctx) })
   779  }
   780  
   781  // InodeAlwaysValid partially implements Inode.
   782  //
   783  // +stateify savable
   784  type InodeAlwaysValid struct{}
   785  
   786  // Valid implements Inode.Valid.
   787  func (*InodeAlwaysValid) Valid(context.Context) bool {
   788  	return true
   789  }
   790  
   791  // InodeTemporary partially implements Inode.
   792  //
   793  // +stateify savable
   794  type InodeTemporary struct{}
   795  
   796  // Keep implements Inode.Keep.
   797  func (*InodeTemporary) Keep() bool {
   798  	return false
   799  }
   800  
   801  // InodeNoStatFS partially implements the Inode interface, where the client
   802  // filesystem doesn't support statfs(2).
   803  //
   804  // +stateify savable
   805  type InodeNoStatFS struct{}
   806  
   807  // StatFS implements Inode.StatFS.
   808  func (*InodeNoStatFS) StatFS(context.Context, *vfs.Filesystem) (linux.Statfs, error) {
   809  	return linux.Statfs{}, linuxerr.ENOSYS
   810  }
   811  
   812  // InodeWatches partially implements Inode.
   813  //
   814  // +stateify savable
   815  type InodeWatches struct {
   816  	watches vfs.Watches
   817  }
   818  
   819  // Watches implements Inode.Watches.
   820  func (i *InodeWatches) Watches() *vfs.Watches {
   821  	return &i.watches
   822  }
   823  
   824  // InodeAnonymous partially implements Inode.
   825  //
   826  // +stateify savable
   827  type InodeAnonymous struct{}
   828  
   829  // Anonymous implements Inode.Anonymous
   830  func (*InodeAnonymous) Anonymous() bool {
   831  	return true
   832  }
   833  
   834  // InodeNotAnonymous partially implements Inode.
   835  //
   836  // +stateify savable
   837  type InodeNotAnonymous struct{}
   838  
   839  // Anonymous implements Inode.Anonymous
   840  func (*InodeNotAnonymous) Anonymous() bool {
   841  	return false
   842  }