github.com/MerlinKodo/gvisor@v0.0.0-20231110090155-957f62ecf90e/runsc/boot/vfs.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package boot
    16  
    17  import (
    18  	"fmt"
    19  	"path"
    20  	"path/filepath"
    21  	"regexp"
    22  	"sort"
    23  	"strconv"
    24  	"strings"
    25  
    26  	"github.com/MerlinKodo/gvisor/pkg/abi/linux"
    27  	"github.com/MerlinKodo/gvisor/pkg/cleanup"
    28  	"github.com/MerlinKodo/gvisor/pkg/context"
    29  	"github.com/MerlinKodo/gvisor/pkg/errors/linuxerr"
    30  	"github.com/MerlinKodo/gvisor/pkg/fd"
    31  	"github.com/MerlinKodo/gvisor/pkg/fspath"
    32  	"github.com/MerlinKodo/gvisor/pkg/log"
    33  	"github.com/MerlinKodo/gvisor/pkg/sentry/devices/accel"
    34  	"github.com/MerlinKodo/gvisor/pkg/sentry/devices/memdev"
    35  	"github.com/MerlinKodo/gvisor/pkg/sentry/devices/nvproxy"
    36  	"github.com/MerlinKodo/gvisor/pkg/sentry/devices/ttydev"
    37  	"github.com/MerlinKodo/gvisor/pkg/sentry/devices/tundev"
    38  	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/cgroupfs"
    39  	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/devpts"
    40  	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/devtmpfs"
    41  	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/fuse"
    42  	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/gofer"
    43  	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/mqfs"
    44  	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/overlay"
    45  	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/proc"
    46  	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/sys"
    47  	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/tmpfs"
    48  	"github.com/MerlinKodo/gvisor/pkg/sentry/fsimpl/user"
    49  	"github.com/MerlinKodo/gvisor/pkg/sentry/inet"
    50  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel"
    51  	"github.com/MerlinKodo/gvisor/pkg/sentry/kernel/auth"
    52  	"github.com/MerlinKodo/gvisor/pkg/sentry/vfs"
    53  	"github.com/MerlinKodo/gvisor/runsc/config"
    54  	"github.com/MerlinKodo/gvisor/runsc/specutils"
    55  	specs "github.com/opencontainers/runtime-spec/specs-go"
    56  )
    57  
// Supported filesystems that map to different internal filesystems. These are
// mount type names that may appear in a container spec; for example, a mount
// of type Nonefs is mounted as sysfs by getMountNameAndOptions.
const (
	Bind   = "bind"
	Nonefs = "none"
)
    63  
    64  // SelfOverlayFilestorePrefix is the prefix in the file name of the
    65  // self overlay filestore file.
    66  const SelfOverlayFilestorePrefix = ".gvisor.overlay.img."
    67  
    68  // SelfOverlayFilestorePath returns the path at which the self overlay
    69  // filestore file is stored for a given mount.
    70  func SelfOverlayFilestorePath(mountSrc, sandboxID string) string {
    71  	// We will place the filestore file in a gVisor specific hidden file inside
    72  	// the mount being overlay-ed itself. The same volume can be overlay-ed by
    73  	// multiple sandboxes. So make the filestore file unique to a sandbox by
    74  	// suffixing the sandbox ID.
    75  	return path.Join(mountSrc, selfOverlayFilestoreName(sandboxID))
    76  }
    77  
    78  func selfOverlayFilestoreName(sandboxID string) string {
    79  	return SelfOverlayFilestorePrefix + sandboxID
    80  }
    81  
// tmpfs has some extra supported options that we must pass through.
// tmpfsAllowedData lists the keys of those options.
var tmpfsAllowedData = []string{"mode", "size", "uid", "gid"}
    84  
    85  func registerFilesystems(k *kernel.Kernel, info *containerInfo) error {
    86  	ctx := k.SupervisorContext()
    87  	creds := auth.NewRootCredentials(k.RootUserNamespace())
    88  	vfsObj := k.VFS()
    89  
    90  	vfsObj.MustRegisterFilesystemType(cgroupfs.Name, &cgroupfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
    91  		AllowUserMount: true,
    92  		AllowUserList:  true,
    93  	})
    94  	vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
    95  		AllowUserList: true,
    96  		// TODO(b/29356795): Users may mount this once the terminals are in a
    97  		//  usable state.
    98  		AllowUserMount: false,
    99  	})
   100  	vfsObj.MustRegisterFilesystemType(devtmpfs.Name, &devtmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   101  		AllowUserMount: true,
   102  		AllowUserList:  true,
   103  	})
   104  	vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   105  		AllowUserMount: true,
   106  		AllowUserList:  true,
   107  	})
   108  	vfsObj.MustRegisterFilesystemType(gofer.Name, &gofer.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   109  		AllowUserList: true,
   110  	})
   111  	vfsObj.MustRegisterFilesystemType(overlay.Name, &overlay.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   112  		AllowUserMount: true,
   113  		AllowUserList:  true,
   114  	})
   115  	vfsObj.MustRegisterFilesystemType(proc.Name, &proc.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   116  		AllowUserMount: true,
   117  		AllowUserList:  true,
   118  	})
   119  	vfsObj.MustRegisterFilesystemType(sys.Name, &sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   120  		AllowUserMount: true,
   121  		AllowUserList:  true,
   122  	})
   123  	vfsObj.MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   124  		AllowUserMount: true,
   125  		AllowUserList:  true,
   126  	})
   127  	vfsObj.MustRegisterFilesystemType(mqfs.Name, &mqfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   128  		AllowUserMount: true,
   129  		AllowUserList:  true,
   130  	})
   131  
   132  	// Register devices.
   133  	if err := memdev.Register(vfsObj); err != nil {
   134  		return fmt.Errorf("registering memdev: %w", err)
   135  	}
   136  	if err := ttydev.Register(vfsObj); err != nil {
   137  		return fmt.Errorf("registering ttydev: %w", err)
   138  	}
   139  	tunSupported := tundev.IsNetTunSupported(inet.StackFromContext(ctx))
   140  	if tunSupported {
   141  		if err := tundev.Register(vfsObj); err != nil {
   142  			return fmt.Errorf("registering tundev: %v", err)
   143  		}
   144  	}
   145  	if err := fuse.Register(vfsObj); err != nil {
   146  		return fmt.Errorf("registering fusedev: %w", err)
   147  	}
   148  
   149  	// Setup files in devtmpfs.
   150  	a, err := devtmpfs.NewAccessor(ctx, vfsObj, creds, devtmpfs.Name)
   151  	if err != nil {
   152  		return fmt.Errorf("creating devtmpfs accessor: %w", err)
   153  	}
   154  	defer a.Release(ctx)
   155  
   156  	if err := a.UserspaceInit(ctx); err != nil {
   157  		return fmt.Errorf("initializing userspace: %w", err)
   158  	}
   159  	if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil {
   160  		return fmt.Errorf("creating memdev devtmpfs files: %w", err)
   161  	}
   162  	if err := ttydev.CreateDevtmpfsFiles(ctx, a); err != nil {
   163  		return fmt.Errorf("creating ttydev devtmpfs files: %w", err)
   164  	}
   165  	if tunSupported {
   166  		if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil {
   167  			return fmt.Errorf("creating tundev devtmpfs files: %v", err)
   168  		}
   169  	}
   170  	if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil {
   171  		return fmt.Errorf("creating fusedev devtmpfs files: %w", err)
   172  	}
   173  
   174  	if err := nvproxyRegisterDevicesAndCreateFiles(ctx, info, k, vfsObj, a); err != nil {
   175  		return err
   176  	}
   177  
   178  	if err := tpuProxyRegisterDevicesAndCreateFiles(ctx, info, k, vfsObj, a); err != nil {
   179  		return err
   180  	}
   181  
   182  	return nil
   183  }
   184  
   185  func setupContainerVFS(ctx context.Context, info *containerInfo, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
   186  	// Create context with root credentials to mount the filesystem (the current
   187  	// user may not be privileged enough).
   188  	rootCreds := auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
   189  	rootProcArgs := *procArgs
   190  	rootProcArgs.WorkingDirectory = "/"
   191  	rootProcArgs.Credentials = rootCreds
   192  	rootProcArgs.Umask = 0022
   193  	rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
   194  	rootCtx := rootProcArgs.NewContext(mntr.k)
   195  
   196  	mns, err := mntr.mountAll(rootCtx, rootCreds, info.conf, &rootProcArgs)
   197  	if err != nil {
   198  		return fmt.Errorf("failed to setupFS: %w", err)
   199  	}
   200  	procArgs.MountNamespace = mns
   201  
   202  	mnsRoot := mns.Root(rootCtx)
   203  	defer mnsRoot.DecRef(rootCtx)
   204  
   205  	if err := createDeviceFiles(rootCtx, rootCreds, info, mntr.k.VFS(), mnsRoot); err != nil {
   206  		return fmt.Errorf("failed to create device files: %w", err)
   207  	}
   208  
   209  	// We are executing a file directly. Do not resolve the executable path.
   210  	if procArgs.File != nil {
   211  		return nil
   212  	}
   213  	// Resolve the executable path from working dir and environment.
   214  	resolved, err := user.ResolveExecutablePath(ctx, procArgs)
   215  	if err != nil {
   216  		return err
   217  	}
   218  	procArgs.Filename = resolved
   219  	return nil
   220  }
   221  
   222  // compileMounts returns the supported mounts from the mount spec, adding any
   223  // mandatory mounts that are required by the OCI specification.
   224  //
   225  // This function must NOT add/remove any gofer mounts or change their order.
   226  func compileMounts(spec *specs.Spec, conf *config.Config) []specs.Mount {
   227  	// Keep track of whether proc and sys were mounted.
   228  	var procMounted, sysMounted, devMounted, devptsMounted bool
   229  	var mounts []specs.Mount
   230  
   231  	// Mount all submounts from the spec.
   232  	for _, m := range spec.Mounts {
   233  		// Unconditionally drop any cgroupfs mounts. If requested, we'll add our
   234  		// own below.
   235  		if m.Type == cgroupfs.Name {
   236  			continue
   237  		}
   238  		switch filepath.Clean(m.Destination) {
   239  		case "/proc":
   240  			procMounted = true
   241  		case "/sys":
   242  			sysMounted = true
   243  		case "/dev":
   244  			m.Type = devtmpfs.Name
   245  			devMounted = true
   246  		case "/dev/pts":
   247  			m.Type = devpts.Name
   248  			devptsMounted = true
   249  		}
   250  		mounts = append(mounts, m)
   251  	}
   252  
   253  	// Mount proc and sys even if the user did not ask for it, as the spec
   254  	// says we SHOULD.
   255  	var mandatoryMounts []specs.Mount
   256  
   257  	if conf.Cgroupfs {
   258  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   259  			Type:        tmpfs.Name,
   260  			Destination: "/sys/fs/cgroup",
   261  		})
   262  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   263  			Type:        cgroupfs.Name,
   264  			Destination: "/sys/fs/cgroup/memory",
   265  			Options:     []string{"memory"},
   266  		})
   267  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   268  			Type:        cgroupfs.Name,
   269  			Destination: "/sys/fs/cgroup/cpu",
   270  			Options:     []string{"cpu"},
   271  		})
   272  	}
   273  
   274  	if !procMounted {
   275  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   276  			Type:        proc.Name,
   277  			Destination: "/proc",
   278  		})
   279  	}
   280  	if !sysMounted {
   281  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   282  			Type:        sys.Name,
   283  			Destination: "/sys",
   284  		})
   285  	}
   286  	if !devMounted {
   287  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   288  			Type:        devtmpfs.Name,
   289  			Destination: "/dev",
   290  		})
   291  	}
   292  	if !devptsMounted {
   293  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   294  			Type:        devpts.Name,
   295  			Destination: "/dev/pts",
   296  		})
   297  	}
   298  
   299  	// The mandatory mounts should be ordered right after the root, in case
   300  	// there are submounts of these mandatory mounts already in the spec.
   301  	mounts = append(mounts[:0], append(mandatoryMounts, mounts[0:]...)...)
   302  
   303  	return mounts
   304  }
   305  
   306  // goferMountData creates a slice of gofer mount data.
   307  func goferMountData(fd int, fa config.FileAccessType, conf *config.Config) []string {
   308  	opts := []string{
   309  		"trans=fd",
   310  		"rfdno=" + strconv.Itoa(fd),
   311  		"wfdno=" + strconv.Itoa(fd),
   312  	}
   313  	if fa == config.FileAccessShared {
   314  		opts = append(opts, "cache=remote_revalidating")
   315  	}
   316  	if conf.DirectFS {
   317  		opts = append(opts, "directfs")
   318  	}
   319  	if !conf.HostFifo.AllowOpen() {
   320  		opts = append(opts, "disable_fifo_open")
   321  	}
   322  	return opts
   323  }
   324  
   325  // parseAndFilterOptions parses a MountOptions slice and filters by the allowed
   326  // keys.
   327  func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) {
   328  	var out []string
   329  	for _, o := range opts {
   330  		ok, err := parseMountOption(o, allowedKeys...)
   331  		if err != nil {
   332  			return nil, err
   333  		}
   334  		if ok {
   335  			out = append(out, o)
   336  		}
   337  	}
   338  	return out, nil
   339  }
   340  
   341  func parseMountOption(opt string, allowedKeys ...string) (bool, error) {
   342  	kv := strings.SplitN(opt, "=", 3)
   343  	if len(kv) > 2 {
   344  		return false, fmt.Errorf("invalid option %q", opt)
   345  	}
   346  	return specutils.ContainsStr(allowedKeys, kv[0]), nil
   347  }
   348  
   349  type fdDispenser struct {
   350  	fds []*fd.FD
   351  }
   352  
   353  func (f *fdDispenser) remove() int {
   354  	return f.removeAsFD().Release()
   355  }
   356  
   357  func (f *fdDispenser) removeAsFD() *fd.FD {
   358  	if f.empty() {
   359  		panic("fdDispenser out of fds")
   360  	}
   361  	rv := f.fds[0]
   362  	f.fds = f.fds[1:]
   363  	return rv
   364  }
   365  
   366  func (f *fdDispenser) empty() bool {
   367  	return len(f.fds) == 0
   368  }
   369  
// containerMounter holds the state needed to build a container's filesystem:
// the root and submounts taken from the spec, the host FDs that back gofer
// and overlay mounts, and sandbox-wide information shared between containers.
type containerMounter struct {
	// root is the root filesystem section of the container spec; its Readonly
	// flag determines whether the root is mounted read-only.
	root *specs.Root

	// mounts is the set of submounts for the container. It's a copy from the spec
	// that may be freely modified without affecting the original spec.
	mounts []specs.Mount

	// fds is the list of FDs to be dispensed for mounts that require it.
	fds fdDispenser

	// overlayFilestoreFDs are the FDs to the regular files that will back the
	// tmpfs upper mount in the overlay mounts.
	overlayFilestoreFDs fdDispenser

	// overlayMediums contains information about how the gofer mounts have been
	// overlaid. The first entry is for rootfs and the following entries are for
	// bind mounts in `mounts` slice above (in the same order).
	overlayMediums []OverlayMedium

	// k is the kernel whose VFS the mounts are created in.
	k *kernel.Kernel

	// hints is the set of pod mount hints for the sandbox.
	hints *PodMountHints

	// sharedMounts is a map of shared mounts that can be reused across
	// containers.
	sharedMounts map[string]*vfs.Mount

	// productName is the value to show in
	// /sys/devices/virtual/dmi/id/product_name.
	productName string

	// sandboxID is the ID for the whole sandbox.
	sandboxID string
}
   405  
   406  func newContainerMounter(info *containerInfo, k *kernel.Kernel, hints *PodMountHints, sharedMounts map[string]*vfs.Mount, productName string, sandboxID string) *containerMounter {
   407  	return &containerMounter{
   408  		root:                info.spec.Root,
   409  		mounts:              compileMounts(info.spec, info.conf),
   410  		fds:                 fdDispenser{fds: info.goferFDs},
   411  		overlayFilestoreFDs: fdDispenser{fds: info.overlayFilestoreFDs},
   412  		overlayMediums:      info.overlayMediums,
   413  		k:                   k,
   414  		hints:               hints,
   415  		sharedMounts:        sharedMounts,
   416  		productName:         productName,
   417  		sandboxID:           sandboxID,
   418  	}
   419  }
   420  
   421  func (c *containerMounter) checkDispenser() error {
   422  	if !c.fds.empty() {
   423  		return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.fds)
   424  	}
   425  	return nil
   426  }
   427  
   428  func getMountAccessType(conf *config.Config, mount *specs.Mount, hint *MountHint) config.FileAccessType {
   429  	if hint != nil {
   430  		return hint.fileAccessType()
   431  	}
   432  	return conf.FileAccessMounts
   433  }
   434  
   435  func (c *containerMounter) mountAll(rootCtx context.Context, rootCreds *auth.Credentials, conf *config.Config, rootProcArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
   436  	log.Infof("Configuring container's file system")
   437  
   438  	mns, err := c.createMountNamespace(rootCtx, conf, rootCreds)
   439  	if err != nil {
   440  		return nil, fmt.Errorf("creating mount namespace: %w", err)
   441  	}
   442  	rootProcArgs.MountNamespace = mns
   443  
   444  	root := mns.Root(rootCtx)
   445  	defer root.DecRef(rootCtx)
   446  	if root.Mount().ReadOnly() {
   447  		// Switch to ReadWrite while we setup submounts.
   448  		if err := c.k.VFS().SetMountReadOnly(root.Mount(), false); err != nil {
   449  			return nil, fmt.Errorf(`failed to set mount at "/" readwrite: %w`, err)
   450  		}
   451  		// Restore back to ReadOnly at the end.
   452  		defer func() {
   453  			if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil {
   454  				panic(fmt.Sprintf(`failed to restore mount at "/" back to readonly: %v`, err))
   455  			}
   456  		}()
   457  	}
   458  
   459  	// Mount submounts.
   460  	if err := c.mountSubmounts(rootCtx, conf, mns, rootCreds); err != nil {
   461  		return nil, fmt.Errorf("mounting submounts: %w", err)
   462  	}
   463  
   464  	return mns, nil
   465  }
   466  
// createMountNamespace creates the container's root mount and namespace.
//
// The root filesystem is a gofer mount that uses the next FD from c.fds,
// optionally wrapped in an overlay when the first entry of c.overlayMediums is
// enabled. It is mounted disconnected and then connected at the root of a new
// mount namespace whose own root is a read-only tmpfs. The returned namespace
// carries a reference that belongs to the caller.
func (c *containerMounter) createMountNamespace(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
	ioFD := c.fds.remove()
	data := goferMountData(ioFD, conf.FileAccess, conf)

	// We can't check for overlayfs here because sandbox is chroot'ed and gofer
	// can only send mount options for specs.Mounts (specs.Root is missing
	// Options field). So assume root is always on top of overlayfs.
	data = append(data, "overlayfs_stale_read")

	// Configure the gofer dentry cache size.
	gofer.SetDentryCacheSize(conf.DCache)

	log.Infof("Mounting root with gofer, ioFD: %d", ioFD)
	opts := &vfs.MountOptions{
		ReadOnly: c.root.Readonly,
		GetFilesystemOptions: vfs.GetFilesystemOptions{
			Data: strings.Join(data, ","),
			InternalData: gofer.InternalFilesystemOptions{
				UniqueID: "/",
			},
		},
		InternalMount: true,
	}

	fsName := gofer.Name
	// The first overlay medium entry describes the rootfs.
	if c.overlayMediums[0].IsEnabled() {
		log.Infof("Adding overlay on top of root")
		var (
			err              error
			cleanup          func()
			overlayFilestore *fd.FD
		)
		// A filestore FD is only consumed when the overlay is backed by a host
		// file; otherwise overlayFilestore stays nil.
		if c.overlayMediums[0].IsBackedByHostFile() {
			overlayFilestore = c.overlayFilestoreFDs.removeAsFD()
		}
		opts, cleanup, err = c.configureOverlay(ctx, conf, creds, opts, fsName, overlayFilestore, c.overlayMediums[0])
		if err != nil {
			return nil, fmt.Errorf("mounting root with overlay: %w", err)
		}
		// cleanup must run only after opts have been used to mount the overlay
		// below, hence the defer.
		defer cleanup()
		fsName = overlay.Name
	}

	// The namespace root mount can't be changed, so let's mount a dummy
	// read-only tmpfs here. It simplifies creation of containers without
	// leaking the root file system.
	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "rootfs", "tmpfs",
		&vfs.MountOptions{ReadOnly: true}, c.k)
	if err != nil {
		return nil, fmt.Errorf("setting up mount namespace: %w", err)
	}
	// This DecRef is balanced by the IncRef right before the successful
	// return; on the error paths below it is the only release of mns here.
	defer mns.DecRef(ctx)

	mnt, err := c.k.VFS().MountDisconnected(ctx, creds, "root", fsName, opts)
	if err != nil {
		return nil, fmt.Errorf("creating root file system: %w", err)
	}
	defer mnt.DecRef(ctx)
	root := mns.Root(ctx)
	defer root.DecRef(ctx)
	// Connect the real root file system on top of the namespace's dummy root.
	target := &vfs.PathOperation{
		Root:  root,
		Start: root,
	}
	if err := c.k.VFS().ConnectMountAt(ctx, creds, mnt, target); err != nil {
		return nil, fmt.Errorf("mounting root file system: %w", err)
	}

	// Take the reference that is handed to the caller.
	mns.IncRef()
	return mns, nil
}
   539  
   540  // configureOverlay mounts the lower layer using "lowerOpts", mounts the upper
   541  // layer using tmpfs, and return overlay mount options. "cleanup" must be called
   542  // after the options have been used to mount the overlay, to release refs on
   543  // lower and upper mounts.
   544  func (c *containerMounter) configureOverlay(ctx context.Context, conf *config.Config, creds *auth.Credentials, lowerOpts *vfs.MountOptions, lowerFSName string, filestoreFD *fd.FD, medium OverlayMedium) (*vfs.MountOptions, func(), error) {
   545  	// First copy options from lower layer to upper layer and overlay. Clear
   546  	// filesystem specific options.
   547  	upperOpts := *lowerOpts
   548  	upperOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}
   549  
   550  	overlayOpts := *lowerOpts
   551  	overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}
   552  
   553  	// All writes go to the upper layer, be paranoid and make lower readonly.
   554  	lowerOpts.ReadOnly = true
   555  	lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts)
   556  	if err != nil {
   557  		return nil, nil, err
   558  	}
   559  	cu := cleanup.Make(func() { lower.DecRef(ctx) })
   560  	defer cu.Clean()
   561  
   562  	// Determine the lower layer's root's type.
   563  	lowerRootVD := vfs.MakeVirtualDentry(lower, lower.Root())
   564  	stat, err := c.k.VFS().StatAt(ctx, creds, &vfs.PathOperation{
   565  		Root:  lowerRootVD,
   566  		Start: lowerRootVD,
   567  	}, &vfs.StatOptions{
   568  		Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE | linux.STATX_TYPE,
   569  	})
   570  	if err != nil {
   571  		return nil, nil, fmt.Errorf("failed to stat lower layer's root: %v", err)
   572  	}
   573  	if stat.Mask&linux.STATX_TYPE == 0 {
   574  		return nil, nil, fmt.Errorf("failed to get file type of lower layer's root")
   575  	}
   576  	rootType := stat.Mode & linux.S_IFMT
   577  	if rootType != linux.S_IFDIR && rootType != linux.S_IFREG {
   578  		return nil, nil, fmt.Errorf("lower layer's root has unsupported file type %v", rootType)
   579  	}
   580  
   581  	// Upper is a tmpfs mount to keep all modifications inside the sandbox.
   582  	tmpfsOpts := tmpfs.FilesystemOpts{
   583  		RootFileType: uint16(rootType),
   584  		FilestoreFD:  filestoreFD,
   585  		// If a mount is being overlaid, it should not be limited by the default
   586  		// tmpfs size limit.
   587  		DisableDefaultSizeLimit: true,
   588  	}
   589  	upperOpts.GetFilesystemOptions.InternalData = tmpfsOpts
   590  	upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts)
   591  	if err != nil {
   592  		return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err)
   593  	}
   594  	cu.Add(func() { upper.DecRef(ctx) })
   595  
   596  	// If the overlay mount consists of a regular file, copy up its contents
   597  	// from the lower layer, since in the overlay the otherwise-empty upper
   598  	// layer file will take precedence.
   599  	upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root())
   600  	if rootType == linux.S_IFREG {
   601  		lowerFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{
   602  			Root:  lowerRootVD,
   603  			Start: lowerRootVD,
   604  		}, &vfs.OpenOptions{
   605  			Flags: linux.O_RDONLY,
   606  		})
   607  		if err != nil {
   608  			return nil, nil, fmt.Errorf("failed to open lower layer root for copying: %v", err)
   609  		}
   610  		defer lowerFD.DecRef(ctx)
   611  		upperFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{
   612  			Root:  upperRootVD,
   613  			Start: upperRootVD,
   614  		}, &vfs.OpenOptions{
   615  			Flags: linux.O_WRONLY,
   616  		})
   617  		if err != nil {
   618  			return nil, nil, fmt.Errorf("failed to open upper layer root for copying: %v", err)
   619  		}
   620  		defer upperFD.DecRef(ctx)
   621  		if _, err := vfs.CopyRegularFileData(ctx, upperFD, lowerFD); err != nil {
   622  			return nil, nil, fmt.Errorf("failed to copy up overlay file: %v", err)
   623  		}
   624  	}
   625  
   626  	// We need to hide the filestore from the containerized application.
   627  	if medium == SelfMedium {
   628  		if err := overlay.CreateWhiteout(ctx, c.k.VFS(), creds, &vfs.PathOperation{
   629  			Root:  upperRootVD,
   630  			Start: upperRootVD,
   631  			Path:  fspath.Parse(selfOverlayFilestoreName(c.sandboxID)),
   632  		}); err != nil {
   633  			return nil, nil, fmt.Errorf("failed to create whiteout to hide self overlay filestore: %w", err)
   634  		}
   635  	}
   636  
   637  	// Propagate the lower layer's root's owner, group, and mode to the upper
   638  	// layer's root for consistency with VFS1.
   639  	err = c.k.VFS().SetStatAt(ctx, creds, &vfs.PathOperation{
   640  		Root:  upperRootVD,
   641  		Start: upperRootVD,
   642  	}, &vfs.SetStatOptions{
   643  		Stat: linux.Statx{
   644  			Mask: (linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE) & stat.Mask,
   645  			UID:  stat.UID,
   646  			GID:  stat.GID,
   647  			Mode: stat.Mode,
   648  		},
   649  	})
   650  	if err != nil {
   651  		return nil, nil, err
   652  	}
   653  
   654  	// Configure overlay with both layers.
   655  	overlayOpts.GetFilesystemOptions.InternalData = overlay.FilesystemOptions{
   656  		UpperRoot:  upperRootVD,
   657  		LowerRoots: []vfs.VirtualDentry{lowerRootVD},
   658  	}
   659  	return &overlayOpts, cu.Release(), nil
   660  }
   661  
   662  func (c *containerMounter) mountSubmounts(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
   663  	mounts, err := c.prepareMounts()
   664  	if err != nil {
   665  		return err
   666  	}
   667  
   668  	for i := range mounts {
   669  		submount := &mounts[i]
   670  		log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.mount.Source, submount.mount.Destination, submount.mount.Type, submount.mount.Options)
   671  		var (
   672  			mnt *vfs.Mount
   673  			err error
   674  		)
   675  
   676  		if submount.hint != nil && submount.hint.shouldShareMount() {
   677  			sharedMount, ok := c.sharedMounts[submount.hint.Mount.Source]
   678  			if !ok {
   679  				return fmt.Errorf("shared mount %q not found", submount.hint.Name)
   680  			}
   681  			mnt, err = c.mountSharedSubmount(ctx, conf, mns, creds, submount.mount, submount.hint, sharedMount)
   682  			if err != nil {
   683  				return fmt.Errorf("mount shared mount %q to %q: %v", submount.hint.Name, submount.mount.Destination, err)
   684  			}
   685  		} else {
   686  			mnt, err = c.mountSubmount(ctx, conf, mns, creds, submount)
   687  			if err != nil {
   688  				return fmt.Errorf("mount submount %q: %w", submount.mount.Destination, err)
   689  			}
   690  		}
   691  
   692  		if mnt != nil && mnt.ReadOnly() {
   693  			// Switch to ReadWrite while we setup submounts.
   694  			if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil {
   695  				return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.mount.Destination, err)
   696  			}
   697  			// Restore back to ReadOnly at the end.
   698  			defer func() {
   699  				if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil {
   700  					panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.mount.Destination, err))
   701  				}
   702  			}()
   703  		}
   704  	}
   705  
   706  	if err := c.mountTmp(ctx, conf, creds, mns); err != nil {
   707  		return fmt.Errorf(`mount submount "/tmp": %w`, err)
   708  	}
   709  	return nil
   710  }
   711  
// mountInfo pairs a mount from the spec with the resources needed to mount
// it.
//
// fd is the host FD dispensed for a gofer mount and -1 for any other mount.
// hint is the pod mount hint found for the mount, if any. overlayMedium says
// whether and how the mount is overlaid (NoOverlay for non-gofer mounts), and
// overlayFilestoreFD is set only when that medium is backed by a host file.
type mountInfo struct {
	mount              *specs.Mount
	fd                 int
	hint               *MountHint
	overlayMedium      OverlayMedium
	overlayFilestoreFD *fd.FD
}
   719  
   720  func newNonGoferMountInfo(mount *specs.Mount) *mountInfo {
   721  	return &mountInfo{mount: mount, fd: -1}
   722  }
   723  
   724  func (c *containerMounter) prepareMounts() ([]mountInfo, error) {
   725  	// Associate bind mounts with their FDs before sorting since there is an
   726  	// undocumented assumption that FDs are dispensed in the order in which
   727  	// they are required by mounts.
   728  	var mounts []mountInfo
   729  	goferMntIdx := 1 // First index is for rootfs.
   730  	for i := range c.mounts {
   731  		m := &c.mounts[i]
   732  		specutils.MaybeConvertToBindMount(m)
   733  
   734  		// Only bind mounts use host FDs; see
   735  		// containerMounter.getMountNameAndOptions.
   736  		info := mountInfo{
   737  			mount:         m,
   738  			fd:            -1,
   739  			hint:          c.hints.FindMount(m),
   740  			overlayMedium: NoOverlay,
   741  		}
   742  		if specutils.IsGoferMount(*m) {
   743  			info.fd = c.fds.remove()
   744  			info.overlayMedium = c.overlayMediums[goferMntIdx]
   745  			if info.overlayMedium.IsBackedByHostFile() {
   746  				info.overlayFilestoreFD = c.overlayFilestoreFDs.removeAsFD()
   747  			}
   748  			goferMntIdx++
   749  		}
   750  		mounts = append(mounts, info)
   751  	}
   752  	if err := c.checkDispenser(); err != nil {
   753  		return nil, err
   754  	}
   755  
   756  	// Sort the mounts so that we don't place children before parents.
   757  	sort.Slice(mounts, func(i, j int) bool {
   758  		return len(mounts[i].mount.Destination) < len(mounts[j].mount.Destination)
   759  	})
   760  
   761  	return mounts, nil
   762  }
   763  
   764  func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountInfo) (*vfs.Mount, error) {
   765  	fsName, opts, err := getMountNameAndOptions(conf, submount, c.productName)
   766  	if err != nil {
   767  		return nil, fmt.Errorf("mountOptions failed: %w", err)
   768  	}
   769  	if len(fsName) == 0 {
   770  		// Filesystem is not supported (e.g. cgroup), just skip it.
   771  		return nil, nil
   772  	}
   773  
   774  	if err := c.makeMountPoint(ctx, creds, mns, submount.mount.Destination); err != nil {
   775  		return nil, fmt.Errorf("creating mount point %q: %w", submount.mount.Destination, err)
   776  	}
   777  
   778  	if submount.overlayMedium.IsEnabled() {
   779  		log.Infof("Adding overlay on top of mount %q", submount.mount.Destination)
   780  		var cleanup func()
   781  		opts, cleanup, err = c.configureOverlay(ctx, conf, creds, opts, fsName, submount.overlayFilestoreFD, submount.overlayMedium)
   782  		if err != nil {
   783  			return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.mount.Destination, err)
   784  		}
   785  		defer cleanup()
   786  		fsName = overlay.Name
   787  	}
   788  
   789  	root := mns.Root(ctx)
   790  	defer root.DecRef(ctx)
   791  	target := &vfs.PathOperation{
   792  		Root:  root,
   793  		Start: root,
   794  		Path:  fspath.Parse(submount.mount.Destination),
   795  	}
   796  	mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts)
   797  	if err != nil {
   798  		return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.mount.Destination, submount.mount.Type, err, opts)
   799  	}
   800  	log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.mount.Source, submount.mount.Destination, submount.mount.Type, opts.GetFilesystemOptions.Data)
   801  	return mnt, nil
   802  }
   803  
   804  // getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
   805  // used for mounts.
   806  func getMountNameAndOptions(conf *config.Config, m *mountInfo, productName string) (string, *vfs.MountOptions, error) {
   807  	fsName := m.mount.Type
   808  	var (
   809  		data         []string
   810  		internalData any
   811  	)
   812  
   813  	// Find filesystem name and FS specific data field.
   814  	switch m.mount.Type {
   815  	case devpts.Name, devtmpfs.Name, proc.Name:
   816  		// Nothing to do.
   817  
   818  	case Nonefs:
   819  		fsName = sys.Name
   820  
   821  	case sys.Name:
   822  		sysData := &sys.InternalData{EnableAccelSysfs: conf.TPUProxy}
   823  		if len(productName) > 0 {
   824  			sysData.ProductName = productName
   825  		}
   826  		internalData = sysData
   827  
   828  	case tmpfs.Name:
   829  		var err error
   830  		data, err = parseAndFilterOptions(m.mount.Options, tmpfsAllowedData...)
   831  		if err != nil {
   832  			return "", nil, err
   833  		}
   834  
   835  	case Bind:
   836  		fsName = gofer.Name
   837  		if m.fd < 0 {
   838  			// Check that an FD was provided to fails fast.
   839  			return "", nil, fmt.Errorf("gofer mount requires a connection FD")
   840  		}
   841  		data = goferMountData(m.fd, getMountAccessType(conf, m.mount, m.hint), conf)
   842  		internalData = gofer.InternalFilesystemOptions{
   843  			UniqueID: m.mount.Destination,
   844  		}
   845  
   846  	case cgroupfs.Name:
   847  		var err error
   848  		data, err = parseAndFilterOptions(m.mount.Options, cgroupfs.SupportedMountOptions...)
   849  		if err != nil {
   850  			return "", nil, err
   851  		}
   852  
   853  	default:
   854  		log.Warningf("ignoring unknown filesystem type %q", m.mount.Type)
   855  		return "", nil, nil
   856  	}
   857  
   858  	opts := ParseMountOptions(m.mount.Options)
   859  	opts.GetFilesystemOptions = vfs.GetFilesystemOptions{
   860  		Data:         strings.Join(data, ","),
   861  		InternalData: internalData,
   862  	}
   863  
   864  	return fsName, opts, nil
   865  }
   866  
   867  // ParseMountOptions converts specs.Mount.Options to vfs.MountOptions.
   868  func ParseMountOptions(opts []string) *vfs.MountOptions {
   869  	mountOpts := &vfs.MountOptions{
   870  		InternalMount: true,
   871  	}
   872  	// Note: update mountHint.CheckCompatible when more options are added.
   873  	for _, o := range opts {
   874  		switch o {
   875  		case "ro":
   876  			mountOpts.ReadOnly = true
   877  		case "noatime":
   878  			mountOpts.Flags.NoATime = true
   879  		case "noexec":
   880  			mountOpts.Flags.NoExec = true
   881  		case "rw", "atime", "exec":
   882  			// These use the default value and don't need to be set.
   883  		case "bind", "rbind":
   884  			// These are the same as a mount with type="bind".
   885  		default:
   886  			log.Warningf("ignoring unknown mount option %q", o)
   887  		}
   888  	}
   889  	return mountOpts
   890  }
   891  
   892  func parseKeyValue(s string) (string, string, bool) {
   893  	tokens := strings.SplitN(s, "=", 2)
   894  	if len(tokens) < 2 {
   895  		return "", "", false
   896  	}
   897  	return strings.TrimSpace(tokens[0]), strings.TrimSpace(tokens[1]), true
   898  }
   899  
   900  // mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so.
   901  // Technically we don't have to mount tmpfs at /tmp, as we could just rely on
   902  // the host /tmp, but this is a nice optimization, and fixes some apps that call
   903  // mknod in /tmp. It's unsafe to mount tmpfs if:
   904  //  1. /tmp is mounted explicitly: we should not override user's wish
   905  //  2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
   906  //
   907  // Note that when there are submounts inside of '/tmp', directories for the
   908  // mount points must be present, making '/tmp' not empty anymore.
   909  func (c *containerMounter) mountTmp(ctx context.Context, conf *config.Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
   910  	for _, m := range c.mounts {
   911  		// m.Destination has been cleaned, so it's to use equality here.
   912  		if m.Destination == "/tmp" {
   913  			log.Debugf(`Explict "/tmp" mount found, skipping internal tmpfs, mount: %+v`, m)
   914  			return nil
   915  		}
   916  	}
   917  
   918  	root := mns.Root(ctx)
   919  	defer root.DecRef(ctx)
   920  	pop := vfs.PathOperation{
   921  		Root:  root,
   922  		Start: root,
   923  		Path:  fspath.Parse("/tmp"),
   924  	}
   925  	fd, err := c.k.VFS().OpenAt(ctx, creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY})
   926  	switch {
   927  	case err == nil:
   928  		defer fd.DecRef(ctx)
   929  
   930  		err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
   931  			if dirent.Name != "." && dirent.Name != ".." {
   932  				return linuxerr.ENOTEMPTY
   933  			}
   934  			return nil
   935  		}))
   936  		switch {
   937  		case err == nil:
   938  			log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`)
   939  		case linuxerr.Equals(linuxerr.ENOTEMPTY, err):
   940  			// If more than "." and ".." is found, skip internal tmpfs to prevent
   941  			// hiding existing files.
   942  			log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`)
   943  			return nil
   944  		default:
   945  			return fmt.Errorf("fd.IterDirents failed: %v", err)
   946  		}
   947  		fallthrough
   948  
   949  	case linuxerr.Equals(linuxerr.ENOENT, err):
   950  		// No '/tmp' found (or fallthrough from above). It's safe to mount internal
   951  		// tmpfs.
   952  		tmpMount := specs.Mount{
   953  			Type:        tmpfs.Name,
   954  			Destination: "/tmp",
   955  			// Sticky bit is added to prevent accidental deletion of files from
   956  			// another user. This is normally done for /tmp.
   957  			Options: []string{"mode=01777"},
   958  		}
   959  		if _, err := c.mountSubmount(ctx, conf, mns, creds, newNonGoferMountInfo(&tmpMount)); err != nil {
   960  			return fmt.Errorf("mountSubmount failed: %v", err)
   961  		}
   962  		return nil
   963  
   964  	case linuxerr.Equals(linuxerr.ENOTDIR, err):
   965  		// Not a dir?! Let it be.
   966  		return nil
   967  
   968  	default:
   969  		return fmt.Errorf(`opening "/tmp" inside container: %w`, err)
   970  	}
   971  }
   972  
   973  // processHints processes annotations that container hints about how volumes
   974  // should be mounted (e.g. a volume shared between containers).
   975  // Precondition: Must be only called once during the loader sequence
   976  // for the root container.
   977  // Postcondition: Initialized l.sharedMounts on success.
   978  func (l *Loader) processHints(conf *config.Config, creds *auth.Credentials) error {
   979  	ctx := l.k.SupervisorContext()
   980  	var sharedMounts map[string]*vfs.Mount
   981  	for _, hint := range l.mountHints.Mounts {
   982  		if !hint.shouldShareMount() {
   983  			continue
   984  		}
   985  
   986  		log.Infof("Mounting master of shared mount %q from %q type %q", hint.Name, hint.Mount.Source, hint.Mount.Type)
   987  		mnt, err := l.mountSharedMaster(ctx, conf, hint, creds)
   988  		if err != nil {
   989  			return fmt.Errorf("mounting shared master %q: %v", hint.Name, err)
   990  		}
   991  		if sharedMounts == nil {
   992  			sharedMounts = make(map[string]*vfs.Mount)
   993  		}
   994  		sharedMounts[hint.Mount.Source] = mnt
   995  	}
   996  	l.sharedMounts = sharedMounts
   997  	return nil
   998  }
   999  
  1000  // mountSharedMaster mounts the master of a volume that is shared among
  1001  // containers in a pod.
  1002  func (l *Loader) mountSharedMaster(ctx context.Context, conf *config.Config, hint *MountHint, creds *auth.Credentials) (*vfs.Mount, error) {
  1003  	// Map mount type to filesystem name, and parse out the options that we are
  1004  	// capable of dealing with.
  1005  	mntInfo := newNonGoferMountInfo(&hint.Mount)
  1006  	fsName, opts, err := getMountNameAndOptions(conf, mntInfo, l.productName)
  1007  	if err != nil {
  1008  		return nil, err
  1009  	}
  1010  	if len(fsName) == 0 {
  1011  		return nil, fmt.Errorf("mount type not supported %q", hint.Mount.Type)
  1012  	}
  1013  	return l.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts)
  1014  }
  1015  
  1016  // mountSharedSubmount binds mount to a previously mounted volume that is shared
  1017  // among containers in the same pod.
  1018  func (c *containerMounter) mountSharedSubmount(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount *specs.Mount, srcHint *MountHint, srcMount *vfs.Mount) (*vfs.Mount, error) {
  1019  	if err := srcHint.checkCompatible(mount); err != nil {
  1020  		return nil, err
  1021  	}
  1022  
  1023  	// Ignore data and useOverlay because these were already applied to
  1024  	// the master mount.
  1025  	_, opts, err := getMountNameAndOptions(conf, newNonGoferMountInfo(mount), c.productName)
  1026  	if err != nil {
  1027  		return nil, err
  1028  	}
  1029  	newMnt := c.k.VFS().NewDisconnectedMount(srcMount.Filesystem(), srcMount.Root(), opts)
  1030  	defer newMnt.DecRef(ctx)
  1031  
  1032  	root := mns.Root(ctx)
  1033  	defer root.DecRef(ctx)
  1034  	target := &vfs.PathOperation{
  1035  		Root:  root,
  1036  		Start: root,
  1037  		Path:  fspath.Parse(mount.Destination),
  1038  	}
  1039  
  1040  	if err := c.makeMountPoint(ctx, creds, mns, mount.Destination); err != nil {
  1041  		return nil, fmt.Errorf("creating mount point %q: %w", mount.Destination, err)
  1042  	}
  1043  
  1044  	if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil {
  1045  		return nil, err
  1046  	}
  1047  	log.Infof("Mounted %q type shared bind to %q", mount.Destination, srcHint.Name)
  1048  	return newMnt, nil
  1049  }
  1050  
  1051  func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error {
  1052  	root := mns.Root(ctx)
  1053  	defer root.DecRef(ctx)
  1054  	target := &vfs.PathOperation{
  1055  		Root:  root,
  1056  		Start: root,
  1057  		Path:  fspath.Parse(dest),
  1058  	}
  1059  	// First check if mount point exists. When overlay is enabled, gofer doesn't
  1060  	// allow changes to the FS, making MakeSytheticMountpoint() ineffective
  1061  	// because MkdirAt fails with EROFS even if file exists.
  1062  	vd, err := c.k.VFS().GetDentryAt(ctx, creds, target, &vfs.GetDentryOptions{})
  1063  	if err == nil {
  1064  		// File exists, we're done.
  1065  		vd.DecRef(ctx)
  1066  		return nil
  1067  	}
  1068  	return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds)
  1069  }
  1070  
  1071  // configureRestore returns an updated context.Context including filesystem
  1072  // state used by restore defined by conf.
  1073  func (c *containerMounter) configureRestore(ctx context.Context) (context.Context, error) {
  1074  	fdmap := make(map[string]int)
  1075  	fdmap["/"] = c.fds.remove()
  1076  	mounts, err := c.prepareMounts()
  1077  	if err != nil {
  1078  		return ctx, err
  1079  	}
  1080  	for i := range c.mounts {
  1081  		submount := &mounts[i]
  1082  		if submount.fd >= 0 {
  1083  			fdmap[submount.mount.Destination] = submount.fd
  1084  		}
  1085  	}
  1086  	return context.WithValue(ctx, gofer.CtxRestoreServerFDMap, fdmap), nil
  1087  }
  1088  
  1089  func createDeviceFiles(ctx context.Context, creds *auth.Credentials, info *containerInfo, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry) error {
  1090  	if info.spec.Linux == nil {
  1091  		return nil
  1092  	}
  1093  	for _, dev := range info.spec.Linux.Devices {
  1094  		pop := vfs.PathOperation{
  1095  			Root:  root,
  1096  			Start: root,
  1097  			Path:  fspath.Parse(dev.Path),
  1098  		}
  1099  		opts := vfs.MknodOptions{
  1100  			Mode: linux.FileMode(dev.FileMode.Perm()),
  1101  		}
  1102  		// See https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md#devices.
  1103  		switch dev.Type {
  1104  		case "b":
  1105  			opts.Mode |= linux.S_IFBLK
  1106  			opts.DevMajor = uint32(dev.Major)
  1107  			opts.DevMinor = uint32(dev.Minor)
  1108  		case "c", "u":
  1109  			opts.Mode |= linux.S_IFCHR
  1110  			opts.DevMajor = uint32(dev.Major)
  1111  			opts.DevMinor = uint32(dev.Minor)
  1112  		case "p":
  1113  			opts.Mode |= linux.S_IFIFO
  1114  		default:
  1115  			return fmt.Errorf("specified device at %q has invalid type %q", dev.Path, dev.Type)
  1116  		}
  1117  		if dev.Path == "/dev/nvidia-uvm" && info.nvidiaUVMDevMajor != 0 && opts.DevMajor != info.nvidiaUVMDevMajor {
  1118  			// nvidia-uvm's major device number is dynamically assigned, so the
  1119  			// number that it has on the host may differ from the number that
  1120  			// it has in sentry VFS; switch from the former to the latter.
  1121  			log.Infof("Switching /dev/nvidia-uvm device major number from %d to %d", dev.Major, info.nvidiaUVMDevMajor)
  1122  			opts.DevMajor = info.nvidiaUVMDevMajor
  1123  		}
  1124  		if err := vfsObj.MkdirAllAt(ctx, path.Dir(dev.Path), root, creds, &vfs.MkdirOptions{
  1125  			Mode: 0o755,
  1126  		}, true /* mustBeDir */); err != nil {
  1127  			return fmt.Errorf("failed to create ancestor directories of %q: %w", dev.Path, err)
  1128  		}
  1129  		// EEXIST is silently ignored; compare
  1130  		// opencontainers/runc:libcontainer/rootfs_linux.go:createDeviceNode().
  1131  		created := true
  1132  		if err := vfsObj.MknodAt(ctx, creds, &pop, &opts); err != nil && !linuxerr.Equals(linuxerr.EEXIST, err) {
  1133  			if linuxerr.Equals(linuxerr.EEXIST, err) {
  1134  				created = false
  1135  			} else {
  1136  				return fmt.Errorf("failed to create device file at %q: %w", dev.Path, err)
  1137  			}
  1138  		}
  1139  		if created && (dev.UID != nil || dev.GID != nil) {
  1140  			var opts vfs.SetStatOptions
  1141  			if dev.UID != nil {
  1142  				opts.Stat.Mask |= linux.STATX_UID
  1143  				opts.Stat.UID = *dev.UID
  1144  			}
  1145  			if dev.GID != nil {
  1146  				opts.Stat.Mask |= linux.STATX_GID
  1147  				opts.Stat.GID = *dev.GID
  1148  			}
  1149  			if err := vfsObj.SetStatAt(ctx, creds, &pop, &opts); err != nil {
  1150  				return fmt.Errorf("failed to set UID/GID for device file %q: %w", dev.Path, err)
  1151  			}
  1152  		}
  1153  	}
  1154  	return nil
  1155  }
  1156  
  1157  func tpuProxyRegisterDevicesAndCreateFiles(ctx context.Context, info *containerInfo, k *kernel.Kernel, vfsObj *vfs.VirtualFilesystem, a *devtmpfs.Accessor) error {
  1158  	if !info.conf.TPUProxy {
  1159  		return nil
  1160  	}
  1161  	// At this point /dev/accel just contains the TPU devices have been mounted
  1162  	// into the sandbox chroot. Enumerate all of them and create sentry devices.
  1163  	paths, err := filepath.Glob("/dev/accel*")
  1164  	if err != nil {
  1165  		return fmt.Errorf("enumerating accel device files: %w", err)
  1166  	}
  1167  	for _, path := range paths {
  1168  		accelDeviceRegex := regexp.MustCompile(`^/dev/accel(\d+)$`)
  1169  		if ms := accelDeviceRegex.FindStringSubmatch(path); ms != nil {
  1170  			deviceNum, _ := strconv.ParseUint(ms[1], 10, 32)
  1171  			if err := accel.Register(vfsObj, uint32(deviceNum)); err != nil {
  1172  				return fmt.Errorf("registering accel driver: %w", err)
  1173  			}
  1174  			if err := accel.CreateDevtmpfsFile(ctx, a, uint32(deviceNum)); err != nil {
  1175  				return fmt.Errorf("creating accel device file %q: %w", deviceNum, err)
  1176  			}
  1177  		}
  1178  	}
  1179  	return nil
  1180  }
  1181  
  1182  func nvproxyRegisterDevicesAndCreateFiles(ctx context.Context, info *containerInfo, k *kernel.Kernel, vfsObj *vfs.VirtualFilesystem, a *devtmpfs.Accessor) error {
  1183  	if !specutils.GPUFunctionalityRequested(info.spec, info.conf) {
  1184  		return nil
  1185  	}
  1186  	uvmDevMajor, err := k.VFS().GetDynamicCharDevMajor()
  1187  	if err != nil {
  1188  		return fmt.Errorf("reserving device major number for nvidia-uvm: %w", err)
  1189  	}
  1190  	if err := nvproxy.Register(vfsObj, uvmDevMajor); err != nil {
  1191  		return fmt.Errorf("registering nvproxy driver: %w", err)
  1192  	}
  1193  	info.nvidiaUVMDevMajor = uvmDevMajor
  1194  	if info.conf.NVProxyDocker {
  1195  		// In Docker mode, create all the device files now.
  1196  		// In non-Docker mode, these are instead created as part of
  1197  		// `createDeviceFiles`, using the spec's Device list.
  1198  		minors, err := specutils.FindAllGPUDevices("/")
  1199  		if err != nil {
  1200  			return fmt.Errorf("getting nvidia devices: %w", err)
  1201  		}
  1202  		if err := nvproxy.CreateDriverDevtmpfsFiles(ctx, a, uvmDevMajor); err != nil {
  1203  			return fmt.Errorf("creating nvproxy devtmpfs files: %w", err)
  1204  		}
  1205  		for _, minor := range minors {
  1206  			if err := nvproxy.CreateIndexDevtmpfsFile(ctx, a, minor); err != nil {
  1207  				return fmt.Errorf("creating nvproxy devtmpfs file for device minor %d: %w", minor, err)
  1208  			}
  1209  		}
  1210  	}
  1211  	return nil
  1212  }