github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/pkg/sentry/fsimpl/gofer/directfs_dentry.go (about)

     1  // Copyright 2022 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package gofer
    16  
    17  import (
    18  	"fmt"
    19  	"math"
    20  	"path"
    21  	"path/filepath"
    22  
    23  	"golang.org/x/sys/unix"
    24  	"github.com/metacubex/gvisor/pkg/abi/linux"
    25  	"github.com/metacubex/gvisor/pkg/atomicbitops"
    26  	"github.com/metacubex/gvisor/pkg/context"
    27  	"github.com/metacubex/gvisor/pkg/fsutil"
    28  	"github.com/metacubex/gvisor/pkg/lisafs"
    29  	"github.com/metacubex/gvisor/pkg/log"
    30  	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
    31  	"github.com/metacubex/gvisor/pkg/sentry/socket/unix/transport"
    32  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    33  )
    34  
    35  // LINT.IfChange
    36  
const (
	// hostOpenFlags is OR'd into the flags of the host open calls made in this
	// file (see tryOpen, openHandle, openCreate and doRevalidationDirectfs):
	// O_NOFOLLOW and O_CLOEXEC.
	hostOpenFlags = unix.O_NOFOLLOW | unix.O_CLOEXEC
)
    40  
    41  // tryOpen tries to open() with different modes in the following order:
    42  //  1. RDONLY | NONBLOCK: for all files, directories, ro mounts, FIFOs.
    43  //     Use non-blocking to prevent getting stuck inside open(2) for
    44  //     FIFOs. This option has no effect on regular files.
    45  //  2. PATH: for symlinks, sockets.
    46  func tryOpen(open func(int) (int, error)) (int, error) {
    47  	flags := []int{
    48  		unix.O_RDONLY | unix.O_NONBLOCK,
    49  		unix.O_PATH,
    50  	}
    51  
    52  	var (
    53  		hostFD int
    54  		err    error
    55  	)
    56  	for _, flag := range flags {
    57  		hostFD, err = open(flag | hostOpenFlags)
    58  		if err == nil {
    59  			return hostFD, nil
    60  		}
    61  
    62  		if err == unix.ENOENT {
    63  			// File doesn't exist, no point in retrying.
    64  			break
    65  		}
    66  	}
    67  	return -1, err
    68  }
    69  
    70  // getDirectfsRootDentry creates a new dentry representing the root dentry for
    71  // this mountpoint. getDirectfsRootDentry takes ownership of rootHostFD and
    72  // rootControlFD.
    73  func (fs *filesystem) getDirectfsRootDentry(ctx context.Context, rootHostFD int, rootControlFD lisafs.ClientFD) (*dentry, error) {
    74  	d, err := fs.newDirectfsDentry(rootHostFD)
    75  	if err != nil {
    76  		log.Warningf("newDirectfsDentry failed for mount point dentry: %v", err)
    77  		rootControlFD.Close(ctx, false /* flush */)
    78  		return nil, err
    79  	}
    80  	d.impl.(*directfsDentry).controlFDLisa = rootControlFD
    81  	return d, nil
    82  }
    83  
// directfsDentry is a host dentry implementation. It represents a dentry
// backed by a host file descriptor. All operations are directly performed on
// the host. A gofer is only involved for some operations on the mount point
// dentry (when dentry.parent = nil). We are forced to fall back to the gofer
// due to the lack of procfs in the sandbox process.
//
// +stateify savable
type directfsDentry struct {
	// dentry is the embedded generic dentry state. newDirectfsDentry
	// initializes it with this directfsDentry as its implementation.
	dentry

	// controlFD is the host FD to this file. controlFD is immutable until
	// destruction, which is synchronized with dentry.handleMu.
	//
	// newDirectfsDentry stores the FD it is given here, destroy closes it and
	// resets the field to -1, and restoreFile installs a replacement FD.
	controlFD int

	// controlFDLisa is a lisafs control FD on this dentry.
	// This is used to fallback to using lisafs RPCs in the following cases:
	// * When parent dentry is required to perform operations but
	//   dentry.parent = nil (root dentry).
	// * For path-based syscalls (like connect(2) and bind(2)) on sockets.
	//
	// For the root dentry, controlFDLisa is always set and is immutable.
	// For sockets, controlFDLisa is protected by dentry.handleMu and is
	// immutable after initialization.
	//
	// For non-root dentries it is created on demand by ensureLisafsControlFD.
	controlFDLisa lisafs.ClientFD `state:"nosave"`
}
   109  
   110  // newDirectfsDentry creates a new dentry representing the given file. The dentry
   111  // initially has no references, but is not cached; it is the caller's
   112  // responsibility to set the dentry's reference count and/or call
   113  // dentry.checkCachingLocked() as appropriate.
   114  // newDirectDentry takes ownership of controlFD.
   115  func (fs *filesystem) newDirectfsDentry(controlFD int) (*dentry, error) {
   116  	var stat unix.Stat_t
   117  	if err := unix.Fstat(controlFD, &stat); err != nil {
   118  		log.Warningf("failed to fstat(2) FD %d: %v", controlFD, err)
   119  		_ = unix.Close(controlFD)
   120  		return nil, err
   121  	}
   122  	inoKey := inoKeyFromStat(&stat)
   123  	d := &directfsDentry{
   124  		dentry: dentry{
   125  			fs:        fs,
   126  			inoKey:    inoKey,
   127  			ino:       fs.inoFromKey(inoKey),
   128  			mode:      atomicbitops.FromUint32(stat.Mode),
   129  			uid:       atomicbitops.FromUint32(stat.Uid),
   130  			gid:       atomicbitops.FromUint32(stat.Gid),
   131  			blockSize: atomicbitops.FromUint32(uint32(stat.Blksize)),
   132  			readFD:    atomicbitops.FromInt32(-1),
   133  			writeFD:   atomicbitops.FromInt32(-1),
   134  			mmapFD:    atomicbitops.FromInt32(-1),
   135  			size:      atomicbitops.FromUint64(uint64(stat.Size)),
   136  			atime:     atomicbitops.FromInt64(dentryTimestampFromUnix(stat.Atim)),
   137  			mtime:     atomicbitops.FromInt64(dentryTimestampFromUnix(stat.Mtim)),
   138  			ctime:     atomicbitops.FromInt64(dentryTimestampFromUnix(stat.Ctim)),
   139  			nlink:     atomicbitops.FromUint32(uint32(stat.Nlink)),
   140  		},
   141  		controlFD: controlFD,
   142  	}
   143  	d.dentry.init(d)
   144  	fs.syncMu.Lock()
   145  	fs.syncableDentries.PushBack(&d.syncableListEntry)
   146  	fs.syncMu.Unlock()
   147  	return &d.dentry, nil
   148  }
   149  
   150  // Precondition: fs.renameMu is locked.
   151  func (d *directfsDentry) openHandle(ctx context.Context, flags uint32) (handle, error) {
   152  	parent := d.parent.Load()
   153  	if parent == nil {
   154  		// This is a mount point. We don't have parent. Fallback to using lisafs.
   155  		if !d.controlFDLisa.Ok() {
   156  			panic("directfsDentry.controlFDLisa is not set for mount point dentry")
   157  		}
   158  		openFD, hostFD, err := d.controlFDLisa.OpenAt(ctx, flags)
   159  		if err != nil {
   160  			return noHandle, err
   161  		}
   162  		d.fs.client.CloseFD(ctx, openFD, true /* flush */)
   163  		if hostFD < 0 {
   164  			log.Warningf("gofer did not donate an FD for mount point")
   165  			return noHandle, unix.EIO
   166  		}
   167  		return handle{fd: int32(hostFD)}, nil
   168  	}
   169  
   170  	// The only way to re-open an FD with different flags is via procfs or
   171  	// openat(2) from the parent. Procfs does not exist here. So use parent.
   172  	flags |= hostOpenFlags
   173  	openFD, err := unix.Openat(parent.impl.(*directfsDentry).controlFD, d.name, int(flags), 0)
   174  	if err != nil {
   175  		return noHandle, err
   176  	}
   177  	return handle{fd: int32(openFD)}, nil
   178  }
   179  
// ensureLisafsControlFD makes sure d.controlFDLisa is set, creating it on
// first use. If it is not already set, the names on the path from the mount
// point root down to d are collected by following parent pointers, and the
// root's lisafs control FD is asked to walk them; the control FD of the last
// component becomes d.controlFDLisa. Control FDs returned for the earlier
// components are closed before returning.
//
// It returns ENOENT if the walk reports a missing component and ELOOP if it
// reports a symlink component. It panics if the root dentry has no lisafs
// control FD.
//
// Precondition: fs.renameMu is locked.
func (d *directfsDentry) ensureLisafsControlFD(ctx context.Context) error {
	d.handleMu.Lock()
	defer d.handleMu.Unlock()
	if d.controlFDLisa.Ok() {
		return nil
	}

	// Collect path component names from d up to (but excluding) the root. They
	// are gathered leaf-first and reversed below.
	var names []string
	root := d
	for root.parent.Load() != nil {
		names = append(names, root.name)
		root = root.parent.Load().impl.(*directfsDentry)
	}
	if !root.controlFDLisa.Ok() {
		panic("controlFDLisa is not set for mount point dentry")
	}
	if len(names) == 0 {
		return nil // d == root
	}
	// Reverse names.
	last := len(names) - 1
	for i := 0; i < len(names)/2; i++ {
		names[i], names[last-i] = names[last-i], names[i]
	}
	status, inodes, err := root.controlFDLisa.WalkMultiple(ctx, names)
	if err != nil {
		return err
	}
	defer func() {
		// Close everything except for inodes[last] if it exists.
		// flush is true only for the final CloseFD call made by this loop.
		for i := 0; i < len(inodes) && i < last; i++ {
			flush := i == last-1 || i == len(inodes)-1
			d.fs.client.CloseFD(ctx, inodes[i].ControlFD, flush)
		}
	}()
	switch status {
	case lisafs.WalkComponentDoesNotExist:
		return unix.ENOENT
	case lisafs.WalkComponentSymlink:
		log.Warningf("intermediate path component was a symlink? names = %v, inodes = %+v", names, inodes)
		return unix.ELOOP
	case lisafs.WalkSuccess:
		// NOTE(review): indexing inodes[last] assumes a successful walk
		// returns one inode per name — confirm against lisafs.
		d.controlFDLisa = d.fs.client.NewFD(inodes[last].ControlFD)
		return nil
	}
	panic("unreachable")
}
   228  
// updateMetadataLocked refreshes d's cached metadata from an fstat(2) of a
// host FD. If h.fd is non-negative that FD is used; otherwise one is chosen
// from d.writeFD, d.readFD and d.controlFD, in that order of preference.
//
// Precondition: d.metadataMu must be locked.
//
// +checklocks:d.metadataMu
func (d *directfsDentry) updateMetadataLocked(h handle) error {
	// handleMuRLocked records whether d.handleMu is still read-locked when
	// fstat(2) is issued, i.e. when one of the shared handle FDs was chosen.
	handleMuRLocked := false
	if h.fd < 0 {
		// Use open FDs in preference to the control FD. Control FDs may be opened
		// with O_PATH. This may be significantly more efficient in some
		// implementations. Prefer a writable FD over a readable one since some
		// filesystem implementations may update a writable FD's metadata after
		// writes, without making metadata updates immediately visible to read-only
		// FDs representing the same file.
		d.handleMu.RLock()
		switch {
		case d.writeFD.RacyLoad() >= 0:
			h.fd = d.writeFD.RacyLoad()
			handleMuRLocked = true
		case d.readFD.RacyLoad() >= 0:
			h.fd = d.readFD.RacyLoad()
			handleMuRLocked = true
		default:
			// Falling back to the control FD: the read lock is dropped right
			// away rather than held across fstat(2).
			h.fd = int32(d.controlFD)
			d.handleMu.RUnlock()
		}
	}

	var stat unix.Stat_t
	err := unix.Fstat(int(h.fd), &stat)
	if handleMuRLocked {
		// handleMu must be released before updateMetadataFromStatLocked().
		d.handleMu.RUnlock() // +checklocksforce: complex case.
	}
	if err != nil {
		return err
	}
	return d.updateMetadataFromStatLocked(&stat)
}
   266  
   267  // Precondition: fs.renameMu is locked if d is a socket.
   268  func (d *directfsDentry) chmod(ctx context.Context, mode uint16) error {
   269  	if !d.isSocket() {
   270  		return unix.Fchmod(d.controlFD, uint32(mode))
   271  	}
   272  
   273  	// fchmod(2) on socket files created via bind(2) fails. We need to
   274  	// fchmodat(2) it from its parent.
   275  	if parent := d.parent.Load(); parent != nil {
   276  		// We have parent FD, just use that. Note that AT_SYMLINK_NOFOLLOW flag is
   277  		// currently not supported. So we don't use it.
   278  		return unix.Fchmodat(parent.impl.(*directfsDentry).controlFD, d.name, uint32(mode), 0 /* flags */)
   279  	}
   280  
   281  	// This is a mount point socket. We don't have a parent FD. Fallback to using
   282  	// lisafs.
   283  	if !d.controlFDLisa.Ok() {
   284  		panic("directfsDentry.controlFDLisa is not set for mount point socket")
   285  	}
   286  
   287  	return chmod(ctx, d.controlFDLisa, mode)
   288  }
   289  
   290  // Preconditions:
   291  //   - d.handleMu is locked if d is a regular file.
   292  //   - fs.renameMu is locked if d is a symlink.
   293  func (d *directfsDentry) utimensat(ctx context.Context, stat *linux.Statx) error {
   294  	if stat.Mask&(linux.STATX_ATIME|linux.STATX_MTIME) == 0 {
   295  		return nil
   296  	}
   297  
   298  	utimes := [2]unix.Timespec{
   299  		{Sec: 0, Nsec: unix.UTIME_OMIT},
   300  		{Sec: 0, Nsec: unix.UTIME_OMIT},
   301  	}
   302  	if stat.Mask&unix.STATX_ATIME != 0 {
   303  		utimes[0].Sec = stat.Atime.Sec
   304  		utimes[0].Nsec = int64(stat.Atime.Nsec)
   305  	}
   306  	if stat.Mask&unix.STATX_MTIME != 0 {
   307  		utimes[1].Sec = stat.Mtime.Sec
   308  		utimes[1].Nsec = int64(stat.Mtime.Nsec)
   309  	}
   310  
   311  	if !d.isSymlink() {
   312  		hostFD := d.controlFD
   313  		if d.isRegularFile() {
   314  			// utimensat(2) requires a writable FD for regular files. See BUGS
   315  			// section. dentry.prepareSetStat() should have acquired a writable FD.
   316  			hostFD = int(d.writeFD.RacyLoad())
   317  		}
   318  		// Non-symlinks can operate directly on the fd using an empty name.
   319  		return fsutil.Utimensat(hostFD, "", utimes, 0)
   320  	}
   321  
   322  	// utimensat operates different that other syscalls. To operate on a
   323  	// symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty
   324  	// name.
   325  	if parent := d.parent.Load(); parent != nil {
   326  		return fsutil.Utimensat(parent.impl.(*directfsDentry).controlFD, d.name, utimes, unix.AT_SYMLINK_NOFOLLOW)
   327  	}
   328  
   329  	// This is a mount point symlink. We don't have a parent FD. Fallback to
   330  	// using lisafs.
   331  	if !d.controlFDLisa.Ok() {
   332  		panic("directfsDentry.controlFDLisa is not set for mount point symlink")
   333  	}
   334  
   335  	setStat := linux.Statx{
   336  		Mask:  stat.Mask & (linux.STATX_ATIME | linux.STATX_MTIME),
   337  		Atime: stat.Atime,
   338  		Mtime: stat.Mtime,
   339  	}
   340  	_, failureErr, err := d.controlFDLisa.SetStat(ctx, &setStat)
   341  	if err != nil {
   342  		return err
   343  	}
   344  	return failureErr
   345  }
   346  
   347  // Precondition: fs.renameMu is locked.
   348  func (d *directfsDentry) prepareSetStat(ctx context.Context, stat *linux.Statx) error {
   349  	if stat.Mask&unix.STATX_SIZE != 0 ||
   350  		(stat.Mask&(unix.STATX_ATIME|unix.STATX_MTIME) != 0 && d.isRegularFile()) {
   351  		// Need to ensure a writable FD is available. See setStatLocked() to
   352  		// understand why.
   353  		return d.ensureSharedHandle(ctx, false /* read */, true /* write */, false /* trunc */)
   354  	}
   355  	return nil
   356  }
   357  
   358  // Preconditions:
   359  //   - d.handleMu is locked.
   360  //   - fs.renameMu is locked.
   361  func (d *directfsDentry) setStatLocked(ctx context.Context, stat *linux.Statx) (failureMask uint32, failureErr error) {
   362  	if stat.Mask&unix.STATX_MODE != 0 {
   363  		if err := d.chmod(ctx, stat.Mode&^unix.S_IFMT); err != nil {
   364  			failureMask |= unix.STATX_MODE
   365  			failureErr = err
   366  		}
   367  	}
   368  
   369  	if stat.Mask&unix.STATX_SIZE != 0 {
   370  		// ftruncate(2) requires a writable FD.
   371  		if err := unix.Ftruncate(int(d.writeFD.RacyLoad()), int64(stat.Size)); err != nil {
   372  			failureMask |= unix.STATX_SIZE
   373  			failureErr = err
   374  		}
   375  	}
   376  
   377  	if err := d.utimensat(ctx, stat); err != nil {
   378  		failureMask |= (stat.Mask & (unix.STATX_ATIME | unix.STATX_MTIME))
   379  		failureErr = err
   380  	}
   381  
   382  	if stat.Mask&(unix.STATX_UID|unix.STATX_GID) != 0 {
   383  		// "If the owner or group is specified as -1, then that ID is not changed"
   384  		// - chown(2)
   385  		uid := -1
   386  		if stat.Mask&unix.STATX_UID != 0 {
   387  			uid = int(stat.UID)
   388  		}
   389  		gid := -1
   390  		if stat.Mask&unix.STATX_GID != 0 {
   391  			gid = int(stat.GID)
   392  		}
   393  		if err := fchown(d.controlFD, uid, gid); err != nil {
   394  			failureMask |= stat.Mask & (unix.STATX_UID | unix.STATX_GID)
   395  			failureErr = err
   396  		}
   397  	}
   398  	return
   399  }
   400  
   401  func fchown(fd, uid, gid int) error {
   402  	return unix.Fchownat(fd, "", uid, gid, unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW)
   403  }
   404  
   405  // Precondition: d.handleMu must be locked.
   406  func (d *directfsDentry) destroy(ctx context.Context) {
   407  	if d.controlFD >= 0 {
   408  		_ = unix.Close(d.controlFD)
   409  		d.controlFD = -1
   410  	}
   411  	if d.controlFDLisa.Ok() {
   412  		d.controlFDLisa.Close(ctx, true /* flush */)
   413  	}
   414  }
   415  
   416  func (d *directfsDentry) getHostChild(name string) (*dentry, error) {
   417  	childFD, err := tryOpen(func(flags int) (int, error) {
   418  		return unix.Openat(d.controlFD, name, flags, 0)
   419  	})
   420  	if err != nil {
   421  		return nil, err
   422  	}
   423  	return d.fs.newDirectfsDentry(childFD)
   424  }
   425  
   426  func (d *directfsDentry) getXattr(name string, size uint64) (string, error) {
   427  	data := make([]byte, size)
   428  	if _, err := unix.Fgetxattr(d.controlFD, name, data); err != nil {
   429  		return "", err
   430  	}
   431  	return string(data), nil
   432  }
   433  
   434  // getCreatedChild opens the newly created child, sets its uid/gid, constructs
   435  // a disconnected dentry and returns it.
   436  func (d *directfsDentry) getCreatedChild(name string, uid, gid int, isDir bool) (*dentry, error) {
   437  	unlinkFlags := 0
   438  	extraOpenFlags := 0
   439  	if isDir {
   440  		extraOpenFlags |= unix.O_DIRECTORY
   441  		unlinkFlags |= unix.AT_REMOVEDIR
   442  	}
   443  	deleteChild := func() {
   444  		// Best effort attempt to remove the newly created child on failure.
   445  		if err := unix.Unlinkat(d.controlFD, name, unlinkFlags); err != nil {
   446  			log.Warningf("error unlinking newly created child %q after failure: %v", filepath.Join(genericDebugPathname(&d.dentry), name), err)
   447  		}
   448  	}
   449  
   450  	childFD, err := tryOpen(func(flags int) (int, error) {
   451  		return unix.Openat(d.controlFD, name, flags|extraOpenFlags, 0)
   452  	})
   453  	if err != nil {
   454  		deleteChild()
   455  		return nil, err
   456  	}
   457  
   458  	// "If the owner or group is specified as -1, then that ID is not changed"
   459  	// - chown(2). Only bother making the syscall if the owner is changing.
   460  	if uid != -1 || gid != -1 {
   461  		if err := fchown(childFD, uid, gid); err != nil {
   462  			deleteChild()
   463  			_ = unix.Close(childFD)
   464  			return nil, err
   465  		}
   466  	}
   467  	child, err := d.fs.newDirectfsDentry(childFD)
   468  	if err != nil {
   469  		// Ownership of childFD was passed to newDirectDentry(), so no need to
   470  		// clean that up.
   471  		deleteChild()
   472  		return nil, err
   473  	}
   474  	return child, nil
   475  }
   476  
   477  func (d *directfsDentry) mknod(ctx context.Context, name string, creds *auth.Credentials, opts *vfs.MknodOptions) (*dentry, error) {
   478  	if _, ok := opts.Endpoint.(transport.HostBoundEndpoint); ok {
   479  		return d.bindAt(ctx, name, creds, opts)
   480  	}
   481  
   482  	// From mknod(2) man page:
   483  	// "EPERM: [...] if the filesystem containing pathname does not support
   484  	// the type of node requested."
   485  	if opts.Mode.FileType() != linux.ModeRegular {
   486  		return nil, unix.EPERM
   487  	}
   488  
   489  	if err := unix.Mknodat(d.controlFD, name, uint32(opts.Mode), 0); err != nil {
   490  		return nil, err
   491  	}
   492  	return d.getCreatedChild(name, int(creds.EffectiveKUID), int(creds.EffectiveKGID), false /* isDir */)
   493  }
   494  
   495  // Precondition: opts.Endpoint != nil and is transport.HostBoundEndpoint type.
   496  func (d *directfsDentry) bindAt(ctx context.Context, name string, creds *auth.Credentials, opts *vfs.MknodOptions) (*dentry, error) {
   497  	// There are no filesystems mounted in the sandbox process's mount namespace.
   498  	// So we can't perform absolute path traversals. So fallback to using lisafs.
   499  	if err := d.ensureLisafsControlFD(ctx); err != nil {
   500  		return nil, err
   501  	}
   502  	sockType := opts.Endpoint.(transport.Endpoint).Type()
   503  	childInode, boundSocketFD, err := d.controlFDLisa.BindAt(ctx, sockType, name, opts.Mode, lisafs.UID(creds.EffectiveKUID), lisafs.GID(creds.EffectiveKGID))
   504  	if err != nil {
   505  		return nil, err
   506  	}
   507  	d.fs.client.CloseFD(ctx, childInode.ControlFD, true /* flush */)
   508  	// Update opts.Endpoint that it is bound.
   509  	hbep := opts.Endpoint.(transport.HostBoundEndpoint)
   510  	if err := hbep.SetBoundSocketFD(ctx, boundSocketFD); err != nil {
   511  		if err := unix.Unlinkat(d.controlFD, name, 0); err != nil {
   512  			log.Warningf("error unlinking newly created socket %q after failure: %v", filepath.Join(genericDebugPathname(&d.dentry), name), err)
   513  		}
   514  		return nil, err
   515  	}
   516  	// Socket already has the right UID/GID set, so use uid = gid = -1.
   517  	child, err := d.getCreatedChild(name, -1 /* uid */, -1 /* gid */, false /* isDir */)
   518  	if err != nil {
   519  		hbep.ResetBoundSocketFD(ctx)
   520  		return nil, err
   521  	}
   522  	// Set the endpoint on the newly created child dentry.
   523  	child.endpoint = opts.Endpoint
   524  	return child, nil
   525  }
   526  
   527  // Precondition: d.fs.renameMu must be locked.
   528  func (d *directfsDentry) link(target *directfsDentry, name string) (*dentry, error) {
   529  	// Using linkat(targetFD, "", newdirfd, name, AT_EMPTY_PATH) requires
   530  	// CAP_DAC_READ_SEARCH in the *root* userns. With directfs, the sandbox
   531  	// process has CAP_DAC_READ_SEARCH in its own userns. But the sandbox is
   532  	// running in a different userns. So we can't use AT_EMPTY_PATH. Fallback to
   533  	// using olddirfd to call linkat(2).
   534  	// Also note that d and target are from the same mount. Given target is a
   535  	// non-directory and d is a directory, target.parent must exist.
   536  	if err := unix.Linkat(target.parent.Load().impl.(*directfsDentry).controlFD, target.name, d.controlFD, name, 0); err != nil {
   537  		return nil, err
   538  	}
   539  	// Note that we don't need to set uid/gid for the new child. This is a hard
   540  	// link. The original file already has the right owner.
   541  	// TODO(gvisor.dev/issue/6739): Hard linked dentries should share the same
   542  	// inode fields.
   543  	return d.getCreatedChild(name, -1 /* uid */, -1 /* gid */, false /* isDir */)
   544  }
   545  
   546  func (d *directfsDentry) mkdir(name string, mode linux.FileMode, uid auth.KUID, gid auth.KGID) (*dentry, error) {
   547  	if err := unix.Mkdirat(d.controlFD, name, uint32(mode)); err != nil {
   548  		return nil, err
   549  	}
   550  	return d.getCreatedChild(name, int(uid), int(gid), true /* isDir */)
   551  }
   552  
   553  func (d *directfsDentry) symlink(name, target string, creds *auth.Credentials) (*dentry, error) {
   554  	if err := unix.Symlinkat(target, d.controlFD, name); err != nil {
   555  		return nil, err
   556  	}
   557  	return d.getCreatedChild(name, int(creds.EffectiveKUID), int(creds.EffectiveKGID), false /* isDir */)
   558  }
   559  
   560  func (d *directfsDentry) openCreate(name string, accessFlags uint32, mode linux.FileMode, uid auth.KUID, gid auth.KGID) (*dentry, handle, error) {
   561  	createFlags := unix.O_CREAT | unix.O_EXCL | int(accessFlags) | hostOpenFlags
   562  	childHandleFD, err := unix.Openat(d.controlFD, name, createFlags, uint32(mode&^linux.FileTypeMask))
   563  	if err != nil {
   564  		return nil, noHandle, err
   565  	}
   566  
   567  	child, err := d.getCreatedChild(name, int(uid), int(gid), false /* isDir */)
   568  	if err != nil {
   569  		_ = unix.Close(childHandleFD)
   570  		return nil, noHandle, err
   571  	}
   572  	return child, handle{fd: int32(childHandleFD)}, nil
   573  }
   574  
   575  func (d *directfsDentry) getDirentsLocked(recordDirent func(name string, key inoKey, dType uint8)) error {
   576  	readFD := int(d.readFD.RacyLoad())
   577  	if _, err := unix.Seek(readFD, 0, 0); err != nil {
   578  		return err
   579  	}
   580  
   581  	return fsutil.ForEachDirent(readFD, func(ino uint64, off int64, ftype uint8, name string, reclen uint16) {
   582  		// We also want the device ID, which annoyingly incurs an additional
   583  		// syscall per dirent.
   584  		// TODO(gvisor.dev/issue/6665): Get rid of per-dirent stat.
   585  		stat, err := fsutil.StatAt(d.controlFD, name)
   586  		if err != nil {
   587  			log.Warningf("Getdent64: skipping file %q with failed stat, err: %v", path.Join(genericDebugPathname(&d.dentry), name), err)
   588  			return
   589  		}
   590  		recordDirent(name, inoKeyFromStat(&stat), ftype)
   591  	})
   592  }
   593  
   594  // Precondition: fs.renameMu is locked.
   595  func (d *directfsDentry) connect(ctx context.Context, sockType linux.SockType) (int, error) {
   596  	// There are no filesystems mounted in the sandbox process's mount namespace.
   597  	// So we can't perform absolute path traversals. So fallback to using lisafs.
   598  	if err := d.ensureLisafsControlFD(ctx); err != nil {
   599  		return -1, err
   600  	}
   601  	return d.controlFDLisa.Connect(ctx, sockType)
   602  }
   603  
   604  func (d *directfsDentry) readlink() (string, error) {
   605  	// This is similar to what os.Readlink does.
   606  	for linkLen := 128; linkLen < math.MaxUint16; linkLen *= 2 {
   607  		b := make([]byte, linkLen)
   608  		n, err := unix.Readlinkat(d.controlFD, "", b)
   609  
   610  		if err != nil {
   611  			return "", err
   612  		}
   613  		if n < int(linkLen) {
   614  			return string(b[:n]), nil
   615  		}
   616  	}
   617  	return "", unix.ENOMEM
   618  }
   619  
   620  func (d *directfsDentry) statfs() (linux.Statfs, error) {
   621  	var statFS unix.Statfs_t
   622  	if err := unix.Fstatfs(d.controlFD, &statFS); err != nil {
   623  		return linux.Statfs{}, err
   624  	}
   625  	return linux.Statfs{
   626  		BlockSize:       statFS.Bsize,
   627  		FragmentSize:    statFS.Bsize,
   628  		Blocks:          statFS.Blocks,
   629  		BlocksFree:      statFS.Bfree,
   630  		BlocksAvailable: statFS.Bavail,
   631  		Files:           statFS.Files,
   632  		FilesFree:       statFS.Ffree,
   633  		NameLength:      uint64(statFS.Namelen),
   634  	}, nil
   635  }
   636  
   637  func (d *directfsDentry) restoreFile(ctx context.Context, controlFD int, opts *vfs.CompleteRestoreOptions) error {
   638  	if controlFD < 0 {
   639  		log.Warningf("directfsDentry.restoreFile called with invalid controlFD")
   640  		return unix.EINVAL
   641  	}
   642  	var stat unix.Stat_t
   643  	if err := unix.Fstat(controlFD, &stat); err != nil {
   644  		_ = unix.Close(controlFD)
   645  		return err
   646  	}
   647  
   648  	d.controlFD = controlFD
   649  	// We do not preserve inoKey across checkpoint/restore, so:
   650  	//
   651  	//	- We must assume that the host filesystem did not change in a way that
   652  	//		would invalidate dentries, since we can't revalidate dentries by
   653  	//		checking inoKey.
   654  	//
   655  	//	- We need to associate the new inoKey with the existing d.ino.
   656  	d.inoKey = inoKeyFromStat(&stat)
   657  	d.fs.inoMu.Lock()
   658  	d.fs.inoByKey[d.inoKey] = d.ino
   659  	d.fs.inoMu.Unlock()
   660  
   661  	// Check metadata stability before updating metadata.
   662  	d.metadataMu.Lock()
   663  	defer d.metadataMu.Unlock()
   664  	if d.isRegularFile() {
   665  		if opts.ValidateFileSizes {
   666  			if d.size.RacyLoad() != uint64(stat.Size) {
   667  				return vfs.ErrCorruption{fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: size changed from %d to %d", genericDebugPathname(&d.dentry), d.size.Load(), stat.Size)}
   668  			}
   669  		}
   670  		if opts.ValidateFileModificationTimestamps {
   671  			if want := dentryTimestampFromUnix(stat.Mtim); d.mtime.RacyLoad() != want {
   672  				return vfs.ErrCorruption{fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime changed from %+v to %+v", genericDebugPathname(&d.dentry), linux.NsecToStatxTimestamp(d.mtime.RacyLoad()), linux.NsecToStatxTimestamp(want))}
   673  			}
   674  		}
   675  	}
   676  	if !d.cachedMetadataAuthoritative() {
   677  		d.updateMetadataFromStatLocked(&stat)
   678  	}
   679  
   680  	if rw, ok := d.fs.savedDentryRW[&d.dentry]; ok {
   681  		if err := d.ensureSharedHandle(ctx, rw.read, rw.write, false /* trunc */); err != nil {
   682  			return err
   683  		}
   684  	}
   685  
   686  	return nil
   687  }
   688  
// doRevalidationDirectfs stats all dentries in `state`. It will update or
// invalidate dentries in the cache based on the result.
//
// The dentries in state.dentries are visited in order; each one is looked up
// by name relative to the control FD of the previous one, beginning at
// state.start. Revalidation stops, returning nil, at the first dentry found to
// be missing or replaced. Errors other than ENOENT from openat(2), and any
// error from fstat(2), are returned to the caller.
//
// Preconditions:
//   - fs.renameMu must be locked.
//   - InteropModeShared is in effect.
func doRevalidationDirectfs(ctx context.Context, vfsObj *vfs.VirtualFilesystem, state *revalidateState, ds **[]*dentry) error {
	// The walk begins at state.start. It is given the explicit name `start`
	// rather than `d` because `d` is used as the loop variable below; this
	// helps with readability and makes the code less error prone.
	start := state.start.impl.(*directfsDentry)
	if state.refreshStart {
		start.updateMetadata(ctx)
	}

	parent := start
	for _, d := range state.dentries {
		// ENOENT is not an error here: it means the file is gone, which is
		// handled below via `found`.
		childFD, err := unix.Openat(parent.controlFD, d.name, unix.O_PATH|hostOpenFlags, 0)
		if err != nil && err != unix.ENOENT {
			return err
		}

		var stat unix.Stat_t
		// Lock metadata *before* getting attributes for d.
		d.metadataMu.Lock()
		found := err == nil
		if found {
			// childFD is only needed for this fstat(2); close it right away.
			err = unix.Fstat(childFD, &stat)
			_ = unix.Close(childFD)
			if err != nil {
				d.metadataMu.Unlock()
				return err
			}
		}

		// Note that synthetic dentries will always fail this comparison check.
		if !found || d.inoKey != inoKeyFromStat(&stat) {
			d.metadataMu.Unlock()
			if !found && d.isSynthetic() {
				// We have a synthetic file, and no remote file has arisen to replace
				// it.
				return nil
			}
			// The file at this path has changed or no longer exists. Mark the
			// dentry invalidated.
			d.invalidate(ctx, vfsObj, ds)
			return nil
		}

		// The file at this path hasn't changed. Just update cached metadata.
		d.impl.(*directfsDentry).updateMetadataFromStatLocked(&stat) // +checklocksforce: d.metadataMu is locked above.
		d.metadataMu.Unlock()

		// Advance parent.
		parent = d.impl.(*directfsDentry)
	}
	return nil
}
   748  
   749  // LINT.ThenChange(../../../../runsc/fsgofer/lisafs.go)