github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/pkg/sentry/fsimpl/gofer/directfs_dentry.go (about)

     1  // Copyright 2022 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package gofer
    16  
    17  import (
    18  	"fmt"
    19  	"math"
    20  	"path"
    21  	"path/filepath"
    22  
    23  	"golang.org/x/sys/unix"
    24  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    25  	"github.com/MerlinKodo/gvisor/pkg/atomicbitops"
    26  	"github.com/MerlinKodo/gvisor/pkg/context"
    27  	"github.com/MerlinKodo/gvisor/pkg/fsutil"
    28  	"github.com/MerlinKodo/gvisor/pkg/lisafs"
    29  	"github.com/MerlinKodo/gvisor/pkg/log"
    30  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth"
    31  	"github.com/MerlinKodo/gvisor/pkg/sentry/socket/unix/transport"
    32  	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
    33  )
    34  
    35  // LINT.IfChange
    36  
    37  const (
    38  	hostOpenFlags = unix.O_NOFOLLOW | unix.O_CLOEXEC
    39  )
    40  
    41  // tryOpen tries to open() with different modes in the following order:
    42  //  1. RDONLY | NONBLOCK: for all files, directories, ro mounts, FIFOs.
    43  //     Use non-blocking to prevent getting stuck inside open(2) for
    44  //     FIFOs. This option has no effect on regular files.
    45  //  2. PATH: for symlinks, sockets.
    46  func tryOpen(open func(int) (int, error)) (int, error) {
    47  	flags := []int{
    48  		unix.O_RDONLY | unix.O_NONBLOCK,
    49  		unix.O_PATH,
    50  	}
    51  
    52  	var (
    53  		hostFD int
    54  		err    error
    55  	)
    56  	for _, flag := range flags {
    57  		hostFD, err = open(flag | hostOpenFlags)
    58  		if err == nil {
    59  			return hostFD, nil
    60  		}
    61  
    62  		if err == unix.ENOENT {
    63  			// File doesn't exist, no point in retrying.
    64  			break
    65  		}
    66  	}
    67  	return -1, err
    68  }
    69  
    70  // getDirectfsRootDentry creates a new dentry representing the root dentry for
    71  // this mountpoint. getDirectfsRootDentry takes ownership of rootHostFD and
    72  // rootControlFD.
    73  func (fs *filesystem) getDirectfsRootDentry(ctx context.Context, rootHostFD int, rootControlFD lisafs.ClientFD) (*dentry, error) {
    74  	d, err := fs.newDirectfsDentry(rootHostFD)
    75  	if err != nil {
    76  		log.Warningf("newDirectfsDentry failed for mount point dentry: %v", err)
    77  		rootControlFD.Close(ctx, false /* flush */)
    78  		return nil, err
    79  	}
    80  	d.impl.(*directfsDentry).controlFDLisa = rootControlFD
    81  	return d, nil
    82  }
    83  
    84  // directfsDentry is a host dentry implementation. It represents a dentry
    85  // backed by a host file descriptor. All operations are directly performed on
    86  // the host. A gofer is only involved for some operations on the mount point
    87  // dentry (when dentry.parent = nil). We are forced to fall back to the gofer
    88  // due to the lack of procfs in the sandbox process.
    89  //
    90  // +stateify savable
    91  type directfsDentry struct {
    92  	dentry
    93  
    94  	// controlFD is the host FD to this file. controlFD is immutable.
    95  	controlFD int
    96  
    97  	// controlFDLisa is a lisafs control FD on this dentry.
    98  	// This is used to fallback to using lisafs RPCs in the following cases:
    99  	// * When parent dentry is required to perform operations but
   100  	//   dentry.parent = nil (root dentry).
   101  	// * For path-based syscalls (like connect(2) and bind(2)) on sockets.
   102  	//
   103  	// For the root dentry, controlFDLisa is always set and is immutable.
   104  	// For sockets, controlFDLisa is protected by dentry.handleMu and is
   105  	// immutable after initialization.
   106  	controlFDLisa lisafs.ClientFD `state:"nosave"`
   107  }
   108  
   109  // newDirectfsDentry creates a new dentry representing the given file. The dentry
   110  // initially has no references, but is not cached; it is the caller's
   111  // responsibility to set the dentry's reference count and/or call
   112  // dentry.checkCachingLocked() as appropriate.
   113  // newDirectDentry takes ownership of controlFD.
   114  func (fs *filesystem) newDirectfsDentry(controlFD int) (*dentry, error) {
   115  	var stat unix.Stat_t
   116  	if err := unix.Fstat(controlFD, &stat); err != nil {
   117  		log.Warningf("failed to fstat(2) FD %d: %v", controlFD, err)
   118  		_ = unix.Close(controlFD)
   119  		return nil, err
   120  	}
   121  	inoKey := inoKeyFromStat(&stat)
   122  	d := &directfsDentry{
   123  		dentry: dentry{
   124  			fs:        fs,
   125  			inoKey:    inoKey,
   126  			ino:       fs.inoFromKey(inoKey),
   127  			mode:      atomicbitops.FromUint32(stat.Mode),
   128  			uid:       atomicbitops.FromUint32(stat.Uid),
   129  			gid:       atomicbitops.FromUint32(stat.Gid),
   130  			blockSize: atomicbitops.FromUint32(uint32(stat.Blksize)),
   131  			readFD:    atomicbitops.FromInt32(-1),
   132  			writeFD:   atomicbitops.FromInt32(-1),
   133  			mmapFD:    atomicbitops.FromInt32(-1),
   134  			size:      atomicbitops.FromUint64(uint64(stat.Size)),
   135  			atime:     atomicbitops.FromInt64(dentryTimestampFromUnix(stat.Atim)),
   136  			mtime:     atomicbitops.FromInt64(dentryTimestampFromUnix(stat.Mtim)),
   137  			ctime:     atomicbitops.FromInt64(dentryTimestampFromUnix(stat.Ctim)),
   138  			nlink:     atomicbitops.FromUint32(uint32(stat.Nlink)),
   139  		},
   140  		controlFD: controlFD,
   141  	}
   142  	d.dentry.init(d)
   143  	fs.syncMu.Lock()
   144  	fs.syncableDentries.PushBack(&d.syncableListEntry)
   145  	fs.syncMu.Unlock()
   146  	return &d.dentry, nil
   147  }
   148  
   149  // Precondition: fs.renameMu is locked.
   150  func (d *directfsDentry) openHandle(ctx context.Context, flags uint32) (handle, error) {
   151  	if d.parent == nil {
   152  		// This is a mount point. We don't have parent. Fallback to using lisafs.
   153  		if !d.controlFDLisa.Ok() {
   154  			panic("directfsDentry.controlFDLisa is not set for mount point dentry")
   155  		}
   156  		openFD, hostFD, err := d.controlFDLisa.OpenAt(ctx, flags)
   157  		if err != nil {
   158  			return noHandle, err
   159  		}
   160  		d.fs.client.CloseFD(ctx, openFD, true /* flush */)
   161  		if hostFD < 0 {
   162  			log.Warningf("gofer did not donate an FD for mount point")
   163  			return noHandle, unix.EIO
   164  		}
   165  		return handle{fd: int32(hostFD)}, nil
   166  	}
   167  
   168  	// The only way to re-open an FD with different flags is via procfs or
   169  	// openat(2) from the parent. Procfs does not exist here. So use parent.
   170  	flags |= hostOpenFlags
   171  	openFD, err := unix.Openat(d.parent.impl.(*directfsDentry).controlFD, d.name, int(flags), 0)
   172  	if err != nil {
   173  		return noHandle, err
   174  	}
   175  	return handle{fd: int32(openFD)}, nil
   176  }
   177  
   178  // Precondition: fs.renameMu is locked.
   179  func (d *directfsDentry) ensureLisafsControlFD(ctx context.Context) error {
   180  	d.handleMu.Lock()
   181  	defer d.handleMu.Unlock()
   182  	if d.controlFDLisa.Ok() {
   183  		return nil
   184  	}
   185  
   186  	var names []string
   187  	root := d
   188  	for root.parent != nil {
   189  		names = append(names, root.name)
   190  		root = root.parent.impl.(*directfsDentry)
   191  	}
   192  	if !root.controlFDLisa.Ok() {
   193  		panic("controlFDLisa is not set for mount point dentry")
   194  	}
   195  	if len(names) == 0 {
   196  		return nil // d == root
   197  	}
   198  	// Reverse names.
   199  	last := len(names) - 1
   200  	for i := 0; i < len(names)/2; i++ {
   201  		names[i], names[last-i] = names[last-i], names[i]
   202  	}
   203  	status, inodes, err := root.controlFDLisa.WalkMultiple(ctx, names)
   204  	if err != nil {
   205  		return err
   206  	}
   207  	defer func() {
   208  		// Close everything except for inodes[last] if it exists.
   209  		for i := 0; i < len(inodes) && i < last; i++ {
   210  			flush := i == last-1 || i == len(inodes)-1
   211  			d.fs.client.CloseFD(ctx, inodes[i].ControlFD, flush)
   212  		}
   213  	}()
   214  	switch status {
   215  	case lisafs.WalkComponentDoesNotExist:
   216  		return unix.ENOENT
   217  	case lisafs.WalkComponentSymlink:
   218  		log.Warningf("intermediate path component was a symlink? names = %v, inodes = %+v", names, inodes)
   219  		return unix.ELOOP
   220  	case lisafs.WalkSuccess:
   221  		d.controlFDLisa = d.fs.client.NewFD(inodes[last].ControlFD)
   222  		return nil
   223  	}
   224  	panic("unreachable")
   225  }
   226  
   227  // Precondition: d.metadataMu must be locked.
   228  //
   229  // +checklocks:d.metadataMu
   230  func (d *directfsDentry) updateMetadataLocked(h handle) error {
   231  	handleMuRLocked := false
   232  	if h.fd < 0 {
   233  		// Use open FDs in preferenece to the control FD. Control FDs may be opened
   234  		// with O_PATH. This may be significantly more efficient in some
   235  		// implementations. Prefer a writable FD over a readable one since some
   236  		// filesystem implementations may update a writable FD's metadata after
   237  		// writes, without making metadata updates immediately visible to read-only
   238  		// FDs representing the same file.
   239  		d.handleMu.RLock()
   240  		switch {
   241  		case d.writeFD.RacyLoad() >= 0:
   242  			h.fd = d.writeFD.RacyLoad()
   243  			handleMuRLocked = true
   244  		case d.readFD.RacyLoad() >= 0:
   245  			h.fd = d.readFD.RacyLoad()
   246  			handleMuRLocked = true
   247  		default:
   248  			h.fd = int32(d.controlFD)
   249  			d.handleMu.RUnlock()
   250  		}
   251  	}
   252  
   253  	var stat unix.Stat_t
   254  	err := unix.Fstat(int(h.fd), &stat)
   255  	if handleMuRLocked {
   256  		// handleMu must be released before updateMetadataFromStatLocked().
   257  		d.handleMu.RUnlock() // +checklocksforce: complex case.
   258  	}
   259  	if err != nil {
   260  		return err
   261  	}
   262  	return d.updateMetadataFromStatLocked(&stat)
   263  }
   264  
   265  // Precondition: fs.renameMu is locked if d is a socket.
   266  func (d *directfsDentry) chmod(ctx context.Context, mode uint16) error {
   267  	if !d.isSocket() {
   268  		return unix.Fchmod(d.controlFD, uint32(mode))
   269  	}
   270  
   271  	// fchmod(2) on socket files created via bind(2) fails. We need to
   272  	// fchmodat(2) it from its parent.
   273  	if d.parent != nil {
   274  		// We have parent FD, just use that. Note that AT_SYMLINK_NOFOLLOW flag is
   275  		// currently not supported. So we don't use it.
   276  		return unix.Fchmodat(d.parent.impl.(*directfsDentry).controlFD, d.name, uint32(mode), 0 /* flags */)
   277  	}
   278  
   279  	// This is a mount point socket. We don't have a parent FD. Fallback to using
   280  	// lisafs.
   281  	if !d.controlFDLisa.Ok() {
   282  		panic("directfsDentry.controlFDLisa is not set for mount point socket")
   283  	}
   284  
   285  	return chmod(ctx, d.controlFDLisa, mode)
   286  }
   287  
   288  // Preconditions:
   289  //   - d.handleMu is locked if d is a regular file.
   290  //   - fs.renameMu is locked if d is a symlink.
   291  func (d *directfsDentry) utimensat(ctx context.Context, stat *linux.Statx) error {
   292  	if stat.Mask&(linux.STATX_ATIME|linux.STATX_MTIME) == 0 {
   293  		return nil
   294  	}
   295  
   296  	utimes := [2]unix.Timespec{
   297  		{Sec: 0, Nsec: unix.UTIME_OMIT},
   298  		{Sec: 0, Nsec: unix.UTIME_OMIT},
   299  	}
   300  	if stat.Mask&unix.STATX_ATIME != 0 {
   301  		utimes[0].Sec = stat.Atime.Sec
   302  		utimes[0].Nsec = int64(stat.Atime.Nsec)
   303  	}
   304  	if stat.Mask&unix.STATX_MTIME != 0 {
   305  		utimes[1].Sec = stat.Mtime.Sec
   306  		utimes[1].Nsec = int64(stat.Mtime.Nsec)
   307  	}
   308  
   309  	if !d.isSymlink() {
   310  		hostFD := d.controlFD
   311  		if d.isRegularFile() {
   312  			// utimensat(2) requires a writable FD for regular files. See BUGS
   313  			// section. dentry.prepareSetStat() should have acquired a writable FD.
   314  			hostFD = int(d.writeFD.RacyLoad())
   315  		}
   316  		// Non-symlinks can operate directly on the fd using an empty name.
   317  		return fsutil.Utimensat(hostFD, "", utimes, 0)
   318  	}
   319  
   320  	// utimensat operates different that other syscalls. To operate on a
   321  	// symlink it *requires* AT_SYMLINK_NOFOLLOW with dirFD and a non-empty
   322  	// name.
   323  	if d.parent != nil {
   324  		return fsutil.Utimensat(d.parent.impl.(*directfsDentry).controlFD, d.name, utimes, unix.AT_SYMLINK_NOFOLLOW)
   325  	}
   326  
   327  	// This is a mount point symlink. We don't have a parent FD. Fallback to
   328  	// using lisafs.
   329  	if !d.controlFDLisa.Ok() {
   330  		panic("directfsDentry.controlFDLisa is not set for mount point symlink")
   331  	}
   332  
   333  	setStat := linux.Statx{
   334  		Mask:  stat.Mask & (linux.STATX_ATIME | linux.STATX_MTIME),
   335  		Atime: stat.Atime,
   336  		Mtime: stat.Mtime,
   337  	}
   338  	_, failureErr, err := d.controlFDLisa.SetStat(ctx, &setStat)
   339  	if err != nil {
   340  		return err
   341  	}
   342  	return failureErr
   343  }
   344  
   345  // Precondition: fs.renameMu is locked.
   346  func (d *directfsDentry) prepareSetStat(ctx context.Context, stat *linux.Statx) error {
   347  	if stat.Mask&unix.STATX_SIZE != 0 ||
   348  		(stat.Mask&(unix.STATX_ATIME|unix.STATX_MTIME) != 0 && d.isRegularFile()) {
   349  		// Need to ensure a writable FD is available. See setStatLocked() to
   350  		// understand why.
   351  		return d.ensureSharedHandle(ctx, false /* read */, true /* write */, false /* trunc */)
   352  	}
   353  	return nil
   354  }
   355  
   356  // Preconditions:
   357  //   - d.handleMu is locked.
   358  //   - fs.renameMu is locked.
   359  func (d *directfsDentry) setStatLocked(ctx context.Context, stat *linux.Statx) (failureMask uint32, failureErr error) {
   360  	if stat.Mask&unix.STATX_MODE != 0 {
   361  		if err := d.chmod(ctx, stat.Mode&^unix.S_IFMT); err != nil {
   362  			failureMask |= unix.STATX_MODE
   363  			failureErr = err
   364  		}
   365  	}
   366  
   367  	if stat.Mask&unix.STATX_SIZE != 0 {
   368  		// ftruncate(2) requires a writable FD.
   369  		if err := unix.Ftruncate(int(d.writeFD.RacyLoad()), int64(stat.Size)); err != nil {
   370  			failureMask |= unix.STATX_SIZE
   371  			failureErr = err
   372  		}
   373  	}
   374  
   375  	if err := d.utimensat(ctx, stat); err != nil {
   376  		failureMask |= (stat.Mask & (unix.STATX_ATIME | unix.STATX_MTIME))
   377  		failureErr = err
   378  	}
   379  
   380  	if stat.Mask&(unix.STATX_UID|unix.STATX_GID) != 0 {
   381  		// "If the owner or group is specified as -1, then that ID is not changed"
   382  		// - chown(2)
   383  		uid := -1
   384  		if stat.Mask&unix.STATX_UID != 0 {
   385  			uid = int(stat.UID)
   386  		}
   387  		gid := -1
   388  		if stat.Mask&unix.STATX_GID != 0 {
   389  			gid = int(stat.GID)
   390  		}
   391  		if err := fchown(d.controlFD, uid, gid); err != nil {
   392  			failureMask |= stat.Mask & (unix.STATX_UID | unix.STATX_GID)
   393  			failureErr = err
   394  		}
   395  	}
   396  	return
   397  }
   398  
   399  func fchown(fd, uid, gid int) error {
   400  	return unix.Fchownat(fd, "", uid, gid, unix.AT_EMPTY_PATH|unix.AT_SYMLINK_NOFOLLOW)
   401  }
   402  
   403  func (d *directfsDentry) destroy(ctx context.Context) {
   404  	if d.controlFD >= 0 {
   405  		_ = unix.Close(d.controlFD)
   406  	}
   407  	if d.controlFDLisa.Ok() {
   408  		d.controlFDLisa.Close(ctx, true /* flush */)
   409  	}
   410  }
   411  
   412  func (d *directfsDentry) getHostChild(name string) (*dentry, error) {
   413  	childFD, err := tryOpen(func(flags int) (int, error) {
   414  		return unix.Openat(d.controlFD, name, flags, 0)
   415  	})
   416  	if err != nil {
   417  		return nil, err
   418  	}
   419  	return d.fs.newDirectfsDentry(childFD)
   420  }
   421  
   422  // getCreatedChild opens the newly created child, sets its uid/gid, constructs
   423  // a disconnected dentry and returns it.
   424  func (d *directfsDentry) getCreatedChild(name string, uid, gid int, isDir bool) (*dentry, error) {
   425  	unlinkFlags := 0
   426  	extraOpenFlags := 0
   427  	if isDir {
   428  		extraOpenFlags |= unix.O_DIRECTORY
   429  		unlinkFlags |= unix.AT_REMOVEDIR
   430  	}
   431  	deleteChild := func() {
   432  		// Best effort attempt to remove the newly created child on failure.
   433  		if err := unix.Unlinkat(d.controlFD, name, unlinkFlags); err != nil {
   434  			log.Warningf("error unlinking newly created child %q after failure: %v", filepath.Join(genericDebugPathname(&d.dentry), name), err)
   435  		}
   436  	}
   437  
   438  	childFD, err := tryOpen(func(flags int) (int, error) {
   439  		return unix.Openat(d.controlFD, name, flags|extraOpenFlags, 0)
   440  	})
   441  	if err != nil {
   442  		deleteChild()
   443  		return nil, err
   444  	}
   445  
   446  	// "If the owner or group is specified as -1, then that ID is not changed"
   447  	// - chown(2). Only bother making the syscall if the owner is changing.
   448  	if uid != -1 || gid != -1 {
   449  		if err := fchown(childFD, uid, gid); err != nil {
   450  			deleteChild()
   451  			_ = unix.Close(childFD)
   452  			return nil, err
   453  		}
   454  	}
   455  	child, err := d.fs.newDirectfsDentry(childFD)
   456  	if err != nil {
   457  		// Ownership of childFD was passed to newDirectDentry(), so no need to
   458  		// clean that up.
   459  		deleteChild()
   460  		return nil, err
   461  	}
   462  	return child, nil
   463  }
   464  
   465  func (d *directfsDentry) mknod(ctx context.Context, name string, creds *auth.Credentials, opts *vfs.MknodOptions) (*dentry, error) {
   466  	if _, ok := opts.Endpoint.(transport.HostBoundEndpoint); ok {
   467  		return d.bindAt(ctx, name, creds, opts)
   468  	}
   469  
   470  	// From mknod(2) man page:
   471  	// "EPERM: [...] if the filesystem containing pathname does not support
   472  	// the type of node requested."
   473  	if opts.Mode.FileType() != linux.ModeRegular {
   474  		return nil, unix.EPERM
   475  	}
   476  
   477  	if err := unix.Mknodat(d.controlFD, name, uint32(opts.Mode), 0); err != nil {
   478  		return nil, err
   479  	}
   480  	return d.getCreatedChild(name, int(creds.EffectiveKUID), int(creds.EffectiveKGID), false /* isDir */)
   481  }
   482  
   483  // Precondition: opts.Endpoint != nil and is transport.HostBoundEndpoint type.
   484  func (d *directfsDentry) bindAt(ctx context.Context, name string, creds *auth.Credentials, opts *vfs.MknodOptions) (*dentry, error) {
   485  	// There are no filesystems mounted in the sandbox process's mount namespace.
   486  	// So we can't perform absolute path traversals. So fallback to using lisafs.
   487  	if err := d.ensureLisafsControlFD(ctx); err != nil {
   488  		return nil, err
   489  	}
   490  	sockType := opts.Endpoint.(transport.Endpoint).Type()
   491  	childInode, boundSocketFD, err := d.controlFDLisa.BindAt(ctx, sockType, name, opts.Mode, lisafs.UID(creds.EffectiveKUID), lisafs.GID(creds.EffectiveKGID))
   492  	if err != nil {
   493  		return nil, err
   494  	}
   495  	d.fs.client.CloseFD(ctx, childInode.ControlFD, true /* flush */)
   496  	// Update opts.Endpoint that it is bound.
   497  	hbep := opts.Endpoint.(transport.HostBoundEndpoint)
   498  	if err := hbep.SetBoundSocketFD(ctx, boundSocketFD); err != nil {
   499  		if err := unix.Unlinkat(d.controlFD, name, 0); err != nil {
   500  			log.Warningf("error unlinking newly created socket %q after failure: %v", filepath.Join(genericDebugPathname(&d.dentry), name), err)
   501  		}
   502  		return nil, err
   503  	}
   504  	// Socket already has the right UID/GID set, so use uid = gid = -1.
   505  	child, err := d.getCreatedChild(name, -1 /* uid */, -1 /* gid */, false /* isDir */)
   506  	if err != nil {
   507  		hbep.ResetBoundSocketFD(ctx)
   508  		return nil, err
   509  	}
   510  	// Set the endpoint on the newly created child dentry.
   511  	child.endpoint = opts.Endpoint
   512  	return child, nil
   513  }
   514  
   515  // Precondition: d.fs.renameMu must be locked.
   516  func (d *directfsDentry) link(target *directfsDentry, name string) (*dentry, error) {
   517  	// Using linkat(targetFD, "", newdirfd, name, AT_EMPTY_PATH) requires
   518  	// CAP_DAC_READ_SEARCH in the *root* userns. With directfs, the sandbox
   519  	// process has CAP_DAC_READ_SEARCH in its own userns. But the sandbox is
   520  	// running in a different userns. So we can't use AT_EMPTY_PATH. Fallback to
   521  	// using olddirfd to call linkat(2).
   522  	// Also note that d and target are from the same mount. Given target is a
   523  	// non-directory and d is a directory, target.parent must exist.
   524  	if err := unix.Linkat(target.parent.impl.(*directfsDentry).controlFD, target.name, d.controlFD, name, 0); err != nil {
   525  		return nil, err
   526  	}
   527  	// Note that we don't need to set uid/gid for the new child. This is a hard
   528  	// link. The original file already has the right owner.
   529  	return d.getCreatedChild(name, -1 /* uid */, -1 /* gid */, false /* isDir */)
   530  }
   531  
   532  func (d *directfsDentry) mkdir(name string, mode linux.FileMode, uid auth.KUID, gid auth.KGID) (*dentry, error) {
   533  	if err := unix.Mkdirat(d.controlFD, name, uint32(mode)); err != nil {
   534  		return nil, err
   535  	}
   536  	return d.getCreatedChild(name, int(uid), int(gid), true /* isDir */)
   537  }
   538  
   539  func (d *directfsDentry) symlink(name, target string, creds *auth.Credentials) (*dentry, error) {
   540  	if err := unix.Symlinkat(target, d.controlFD, name); err != nil {
   541  		return nil, err
   542  	}
   543  	return d.getCreatedChild(name, int(creds.EffectiveKUID), int(creds.EffectiveKGID), false /* isDir */)
   544  }
   545  
   546  func (d *directfsDentry) openCreate(name string, accessFlags uint32, mode linux.FileMode, uid auth.KUID, gid auth.KGID) (*dentry, handle, error) {
   547  	createFlags := unix.O_CREAT | unix.O_EXCL | int(accessFlags) | hostOpenFlags
   548  	childHandleFD, err := unix.Openat(d.controlFD, name, createFlags, uint32(mode&^linux.FileTypeMask))
   549  	if err != nil {
   550  		return nil, noHandle, err
   551  	}
   552  
   553  	child, err := d.getCreatedChild(name, int(uid), int(gid), false /* isDir */)
   554  	if err != nil {
   555  		_ = unix.Close(childHandleFD)
   556  		return nil, noHandle, err
   557  	}
   558  	return child, handle{fd: int32(childHandleFD)}, nil
   559  }
   560  
   561  func (d *directfsDentry) getDirentsLocked(recordDirent func(name string, key inoKey, dType uint8)) error {
   562  	readFD := int(d.readFD.RacyLoad())
   563  	if _, err := unix.Seek(readFD, 0, 0); err != nil {
   564  		return err
   565  	}
   566  
   567  	var direntsBuf [8192]byte
   568  	for {
   569  		n, err := unix.Getdents(readFD, direntsBuf[:])
   570  		if err != nil {
   571  			return err
   572  		}
   573  		if n <= 0 {
   574  			return nil
   575  		}
   576  
   577  		fsutil.ParseDirents(direntsBuf[:n], func(ino uint64, off int64, ftype uint8, name string, reclen uint16) bool {
   578  			// We also want the device ID, which annoyingly incurs an additional
   579  			// syscall per dirent.
   580  			// TODO(gvisor.dev/issue/6665): Get rid of per-dirent stat.
   581  			stat, err := fsutil.StatAt(d.controlFD, name)
   582  			if err != nil {
   583  				log.Warningf("Getdent64: skipping file %q with failed stat, err: %v", path.Join(genericDebugPathname(&d.dentry), name), err)
   584  				return true
   585  			}
   586  			recordDirent(name, inoKeyFromStat(&stat), ftype)
   587  			return true
   588  		})
   589  	}
   590  }
   591  
   592  // Precondition: fs.renameMu is locked.
   593  func (d *directfsDentry) connect(ctx context.Context, sockType linux.SockType) (int, error) {
   594  	// There are no filesystems mounted in the sandbox process's mount namespace.
   595  	// So we can't perform absolute path traversals. So fallback to using lisafs.
   596  	if err := d.ensureLisafsControlFD(ctx); err != nil {
   597  		return -1, err
   598  	}
   599  	return d.controlFDLisa.Connect(ctx, sockType)
   600  }
   601  
   602  func (d *directfsDentry) readlink() (string, error) {
   603  	// This is similar to what os.Readlink does.
   604  	for linkLen := 128; linkLen < math.MaxUint16; linkLen *= 2 {
   605  		b := make([]byte, linkLen)
   606  		n, err := unix.Readlinkat(d.controlFD, "", b)
   607  
   608  		if err != nil {
   609  			return "", err
   610  		}
   611  		if n < int(linkLen) {
   612  			return string(b[:n]), nil
   613  		}
   614  	}
   615  	return "", unix.ENOMEM
   616  }
   617  
   618  func (d *directfsDentry) statfs() (linux.Statfs, error) {
   619  	var statFS unix.Statfs_t
   620  	if err := unix.Fstatfs(d.controlFD, &statFS); err != nil {
   621  		return linux.Statfs{}, err
   622  	}
   623  	return linux.Statfs{
   624  		BlockSize:       statFS.Bsize,
   625  		FragmentSize:    statFS.Bsize,
   626  		Blocks:          statFS.Blocks,
   627  		BlocksFree:      statFS.Bfree,
   628  		BlocksAvailable: statFS.Bavail,
   629  		Files:           statFS.Files,
   630  		FilesFree:       statFS.Ffree,
   631  		NameLength:      uint64(statFS.Namelen),
   632  	}, nil
   633  }
   634  
   635  func (d *directfsDentry) restoreFile(ctx context.Context, controlFD int, opts *vfs.CompleteRestoreOptions) error {
   636  	if controlFD < 0 {
   637  		log.Warningf("directfsDentry.restoreFile called with invalid controlFD")
   638  		return unix.EINVAL
   639  	}
   640  	var stat unix.Stat_t
   641  	if err := unix.Fstat(controlFD, &stat); err != nil {
   642  		_ = unix.Close(controlFD)
   643  		return err
   644  	}
   645  
   646  	d.controlFD = controlFD
   647  	// We do not preserve inoKey across checkpoint/restore, so:
   648  	//
   649  	//	- We must assume that the host filesystem did not change in a way that
   650  	//		would invalidate dentries, since we can't revalidate dentries by
   651  	//		checking inoKey.
   652  	//
   653  	//	- We need to associate the new inoKey with the existing d.ino.
   654  	d.inoKey = inoKeyFromStat(&stat)
   655  	d.fs.inoMu.Lock()
   656  	d.fs.inoByKey[d.inoKey] = d.ino
   657  	d.fs.inoMu.Unlock()
   658  
   659  	// Check metadata stability before updating metadata.
   660  	d.metadataMu.Lock()
   661  	defer d.metadataMu.Unlock()
   662  	if d.isRegularFile() {
   663  		if opts.ValidateFileSizes {
   664  			if d.size.RacyLoad() != uint64(stat.Size) {
   665  				return vfs.ErrCorruption{fmt.Errorf("gofer.dentry(%q).restoreFile: file size validation failed: size changed from %d to %d", genericDebugPathname(&d.dentry), d.size.Load(), stat.Size)}
   666  			}
   667  		}
   668  		if opts.ValidateFileModificationTimestamps {
   669  			if want := dentryTimestampFromUnix(stat.Mtim); d.mtime.RacyLoad() != want {
   670  				return vfs.ErrCorruption{fmt.Errorf("gofer.dentry(%q).restoreFile: mtime validation failed: mtime changed from %+v to %+v", genericDebugPathname(&d.dentry), linux.NsecToStatxTimestamp(d.mtime.RacyLoad()), linux.NsecToStatxTimestamp(want))}
   671  			}
   672  		}
   673  	}
   674  	if !d.cachedMetadataAuthoritative() {
   675  		d.updateMetadataFromStatLocked(&stat)
   676  	}
   677  
   678  	if rw, ok := d.fs.savedDentryRW[&d.dentry]; ok {
   679  		if err := d.ensureSharedHandle(ctx, rw.read, rw.write, false /* trunc */); err != nil {
   680  			return err
   681  		}
   682  	}
   683  
   684  	return nil
   685  }
   686  
   687  // doRevalidationDirectfs stats all dentries in `state`. It will update or
   688  // invalidate dentries in the cache based on the result.
   689  //
   690  // Preconditions:
   691  //   - fs.renameMu must be locked.
   692  //   - InteropModeShared is in effect.
   693  func doRevalidationDirectfs(ctx context.Context, vfsObj *vfs.VirtualFilesystem, state *revalidateState, ds **[]*dentry) error {
   694  	// Explicitly declare start dentry, instead of using the function receiver.
   695  	// The function receiver has to be named `d` (to be consistent with other
   696  	// receivers). But `d` variable is also used below in various places. This
   697  	// helps with readability and makes code less error prone.
   698  	start := state.start.impl.(*directfsDentry)
   699  	if state.refreshStart {
   700  		start.updateMetadata(ctx)
   701  	}
   702  
   703  	parent := start
   704  	for _, d := range state.dentries {
   705  		childFD, err := unix.Openat(parent.controlFD, d.name, unix.O_PATH|hostOpenFlags, 0)
   706  		if err != nil && err != unix.ENOENT {
   707  			return err
   708  		}
   709  
   710  		var stat unix.Stat_t
   711  		// Lock metadata *before* getting attributes for d.
   712  		d.metadataMu.Lock()
   713  		found := err == nil
   714  		if found {
   715  			err = unix.Fstat(childFD, &stat)
   716  			_ = unix.Close(childFD)
   717  			if err != nil {
   718  				d.metadataMu.Unlock()
   719  				return err
   720  			}
   721  		}
   722  
   723  		// Note that synthetic dentries will always fail this comparison check.
   724  		if !found || d.inoKey != inoKeyFromStat(&stat) {
   725  			d.metadataMu.Unlock()
   726  			if !found && d.isSynthetic() {
   727  				// We have a synthetic file, and no remote file has arisen to replace
   728  				// it.
   729  				return nil
   730  			}
   731  			// The file at this path has changed or no longer exists. Mark the
   732  			// dentry invalidated.
   733  			d.invalidate(ctx, vfsObj, ds)
   734  			return nil
   735  		}
   736  
   737  		// The file at this path hasn't changed. Just update cached metadata.
   738  		d.impl.(*directfsDentry).updateMetadataFromStatLocked(&stat) // +checklocksforce: d.metadataMu is locked above.
   739  		d.metadataMu.Unlock()
   740  
   741  		// Advance parent.
   742  		parent = d.impl.(*directfsDentry)
   743  	}
   744  	return nil
   745  }
   746  
   747  // LINT.ThenChange(../../../../runsc/fsgofer/lisafs.go)