gvisor.dev/gvisor@v0.0.0-20240520182842-f9d4d51c7e0f/runsc/boot/vfs.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package boot
    16  
    17  import (
    18  	"fmt"
    19  	"os"
    20  	"path"
    21  	"path/filepath"
    22  	"regexp"
    23  	"slices"
    24  	"sort"
    25  	"strconv"
    26  	"strings"
    27  
    28  	specs "github.com/opencontainers/runtime-spec/specs-go"
    29  	"gvisor.dev/gvisor/pkg/abi/linux"
    30  	"gvisor.dev/gvisor/pkg/abi/nvgpu"
    31  	"gvisor.dev/gvisor/pkg/abi/tpu"
    32  	"gvisor.dev/gvisor/pkg/cleanup"
    33  	"gvisor.dev/gvisor/pkg/context"
    34  	"gvisor.dev/gvisor/pkg/devutil"
    35  	"gvisor.dev/gvisor/pkg/errors/linuxerr"
    36  	"gvisor.dev/gvisor/pkg/fd"
    37  	"gvisor.dev/gvisor/pkg/fspath"
    38  	"gvisor.dev/gvisor/pkg/log"
    39  	"gvisor.dev/gvisor/pkg/sentry/devices/accel"
    40  	"gvisor.dev/gvisor/pkg/sentry/devices/memdev"
    41  	"gvisor.dev/gvisor/pkg/sentry/devices/nvproxy"
    42  	"gvisor.dev/gvisor/pkg/sentry/devices/tpuproxy"
    43  	"gvisor.dev/gvisor/pkg/sentry/devices/ttydev"
    44  	"gvisor.dev/gvisor/pkg/sentry/devices/tundev"
    45  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/cgroupfs"
    46  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/dev"
    47  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts"
    48  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
    49  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/erofs"
    50  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/fuse"
    51  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer"
    52  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/mqfs"
    53  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/overlay"
    54  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
    55  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
    56  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
    57  	"gvisor.dev/gvisor/pkg/sentry/fsimpl/user"
    58  	"gvisor.dev/gvisor/pkg/sentry/inet"
    59  	"gvisor.dev/gvisor/pkg/sentry/kernel"
    60  	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
    61  	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
    62  	"gvisor.dev/gvisor/pkg/sentry/vfs"
    63  	"gvisor.dev/gvisor/runsc/config"
    64  	"gvisor.dev/gvisor/runsc/specutils"
    65  )
    66  
// Supported filesystems that map to different internal filesystems. Bind is
// the mount type name "bind" and Nonefs is the mount type name "none".
const (
	Bind   = "bind"
	Nonefs = "none"
)

// SelfFilestorePrefix is the prefix of the self filestore file name. The full
// file name is this prefix followed by the sandbox ID.
const SelfFilestorePrefix = ".gvisor.filestore."

// Glob patterns over sysfs paths of PCI devices. Judging by their names they
// locate TPU v4 ("accel") and TPU v5 ("vfio-dev") device directories; their
// users are outside this part of the file.
const (
	pciPathGlobTPUv4 = "/sys/devices/pci0000:00/*/accel/accel*"
	pciPathGlobTPUv5 = "/sys/devices/pci0000:00/*/vfio-dev/vfio*"
)
    80  
    81  // SelfFilestorePath returns the path at which the self filestore file is
    82  // stored for a given mount.
    83  func SelfFilestorePath(mountSrc, sandboxID string) string {
    84  	// We will place the filestore file in a gVisor specific hidden file inside
    85  	// the mount being overlaid itself. The same volume can be overlaid by
    86  	// multiple sandboxes. So make the filestore file unique to a sandbox by
    87  	// suffixing the sandbox ID.
    88  	return path.Join(mountSrc, selfFilestoreName(sandboxID))
    89  }
    90  
    91  func selfFilestoreName(sandboxID string) string {
    92  	return SelfFilestorePrefix + sandboxID
    93  }
    94  
// tmpfsAllowedData lists the extra option keys that tmpfs supports and that
// we must pass through to it.
var tmpfsAllowedData = []string{"mode", "size", "uid", "gid"}
    97  
    98  func registerFilesystems(k *kernel.Kernel, info *containerInfo) error {
    99  	ctx := k.SupervisorContext()
   100  	vfsObj := k.VFS()
   101  
   102  	vfsObj.MustRegisterFilesystemType(cgroupfs.Name, &cgroupfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   103  		AllowUserMount: true,
   104  		AllowUserList:  true,
   105  	})
   106  	vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   107  		AllowUserList: true,
   108  		// TODO(b/29356795): Users may mount this once the terminals are in a
   109  		//  usable state.
   110  		AllowUserMount: true,
   111  	})
   112  	vfsObj.MustRegisterFilesystemType(dev.Name, &dev.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{})
   113  	vfsObj.MustRegisterFilesystemType(devtmpfs.Name, &devtmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   114  		AllowUserMount: true,
   115  		AllowUserList:  true,
   116  	})
   117  	vfsObj.MustRegisterFilesystemType(erofs.Name, &erofs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   118  		AllowUserList: true,
   119  	})
   120  	vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   121  		AllowUserMount: true,
   122  		AllowUserList:  true,
   123  	})
   124  	vfsObj.MustRegisterFilesystemType(gofer.Name, &gofer.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   125  		AllowUserList: true,
   126  	})
   127  	vfsObj.MustRegisterFilesystemType(overlay.Name, &overlay.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   128  		AllowUserMount: true,
   129  		AllowUserList:  true,
   130  	})
   131  	vfsObj.MustRegisterFilesystemType(proc.Name, &proc.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   132  		AllowUserMount: true,
   133  		AllowUserList:  true,
   134  	})
   135  	vfsObj.MustRegisterFilesystemType(sys.Name, &sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   136  		AllowUserMount: true,
   137  		AllowUserList:  true,
   138  	})
   139  	vfsObj.MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   140  		AllowUserMount: true,
   141  		AllowUserList:  true,
   142  	})
   143  	vfsObj.MustRegisterFilesystemType(mqfs.Name, &mqfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   144  		AllowUserMount: true,
   145  		AllowUserList:  true,
   146  	})
   147  
   148  	// Register devices.
   149  	if err := memdev.Register(vfsObj); err != nil {
   150  		return fmt.Errorf("registering memdev: %w", err)
   151  	}
   152  	if err := ttydev.Register(vfsObj); err != nil {
   153  		return fmt.Errorf("registering ttydev: %w", err)
   154  	}
   155  	tunSupported := tundev.IsNetTunSupported(inet.StackFromContext(ctx))
   156  	if tunSupported {
   157  		if err := tundev.Register(vfsObj); err != nil {
   158  			return fmt.Errorf("registering tundev: %v", err)
   159  		}
   160  	}
   161  	if err := fuse.Register(vfsObj); err != nil {
   162  		return fmt.Errorf("registering fusedev: %w", err)
   163  	}
   164  
   165  	if err := nvproxyRegisterDevices(info, vfsObj); err != nil {
   166  		return err
   167  	}
   168  
   169  	if err := tpuProxyRegisterDevices(info, vfsObj); err != nil {
   170  		return err
   171  	}
   172  
   173  	return nil
   174  }
   175  
   176  func setupContainerVFS(ctx context.Context, info *containerInfo, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
   177  	// Create context with root credentials to mount the filesystem (the current
   178  	// user may not be privileged enough).
   179  	rootCreds := auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
   180  	rootProcArgs := *procArgs
   181  	rootProcArgs.WorkingDirectory = "/"
   182  	rootProcArgs.Credentials = rootCreds
   183  	rootProcArgs.Umask = 0022
   184  	rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
   185  	rootCtx := rootProcArgs.NewContext(mntr.k)
   186  
   187  	mns, err := mntr.mountAll(rootCtx, rootCreds, info.spec, info.conf, &rootProcArgs)
   188  	if err != nil {
   189  		return fmt.Errorf("failed to setupFS: %w", err)
   190  	}
   191  	procArgs.MountNamespace = mns
   192  
   193  	// If cgroups are mounted, then only check for the cgroup mounts per
   194  	// container. Otherwise the root cgroups will be enabled.
   195  	if mntr.cgroupsMounted {
   196  		cgroupRegistry := mntr.k.CgroupRegistry()
   197  		for _, ctrl := range kernel.CgroupCtrls {
   198  			cg, err := cgroupRegistry.FindCgroup(ctx, ctrl, "/"+mntr.containerID)
   199  			if err != nil {
   200  				return fmt.Errorf("cgroup mount for controller %v not found", ctrl)
   201  			}
   202  			if procArgs.InitialCgroups == nil {
   203  				procArgs.InitialCgroups = make(map[kernel.Cgroup]struct{}, len(kernel.CgroupCtrls))
   204  			}
   205  			procArgs.InitialCgroups[cg] = struct{}{}
   206  		}
   207  	}
   208  
   209  	mnsRoot := mns.Root(rootCtx)
   210  	defer mnsRoot.DecRef(rootCtx)
   211  
   212  	if err := createDeviceFiles(rootCtx, rootCreds, info, mntr.k.VFS(), mnsRoot); err != nil {
   213  		return fmt.Errorf("failed to create device files: %w", err)
   214  	}
   215  
   216  	// We are executing a file directly. Do not resolve the executable path.
   217  	if procArgs.File != nil {
   218  		return nil
   219  	}
   220  	// Resolve the executable path from working dir and environment.
   221  	resolved, err := user.ResolveExecutablePath(ctx, procArgs)
   222  	if err != nil {
   223  		return err
   224  	}
   225  	procArgs.Filename = resolved
   226  	return nil
   227  }
   228  
   229  // compileMounts returns the supported mounts from the mount spec, adding any
   230  // mandatory mounts that are required by the OCI specification.
   231  //
   232  // This function must NOT add/remove any gofer mounts or change their order.
   233  func compileMounts(spec *specs.Spec, conf *config.Config, containerID string) []specs.Mount {
   234  	// Keep track of whether proc and sys were mounted.
   235  	var procMounted, sysMounted, devMounted, devptsMounted, cgroupsMounted bool
   236  	var mounts []specs.Mount
   237  
   238  	// Mount all submounts from the spec.
   239  	for _, m := range spec.Mounts {
   240  		// Mount all the cgroup controllers when "/sys/fs/cgroup" mount
   241  		// is present. If any other cgroup controller mounts are there,
   242  		// it will be a no-op, drop them.
   243  		if m.Type == cgroupfs.Name && cgroupsMounted {
   244  			continue
   245  		}
   246  
   247  		switch filepath.Clean(m.Destination) {
   248  		case "/proc":
   249  			procMounted = true
   250  		case "/sys":
   251  			sysMounted = true
   252  		case "/dev":
   253  			m.Type = dev.Name
   254  			devMounted = true
   255  		case "/dev/pts":
   256  			m.Type = devpts.Name
   257  			devptsMounted = true
   258  		case "/sys/fs/cgroup":
   259  			cgroupsMounted = true
   260  		}
   261  
   262  		mounts = append(mounts, m)
   263  	}
   264  
   265  	// Mount proc and sys even if the user did not ask for it, as the spec
   266  	// says we SHOULD.
   267  	var mandatoryMounts []specs.Mount
   268  
   269  	if !procMounted {
   270  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   271  			Type:        proc.Name,
   272  			Destination: "/proc",
   273  		})
   274  	}
   275  	if !sysMounted {
   276  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   277  			Type:        sys.Name,
   278  			Destination: "/sys",
   279  		})
   280  	}
   281  	if !devMounted {
   282  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   283  			Type:        dev.Name,
   284  			Destination: "/dev",
   285  		})
   286  	}
   287  	if !devptsMounted {
   288  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   289  			Type:        devpts.Name,
   290  			Destination: "/dev/pts",
   291  		})
   292  	}
   293  
   294  	// The mandatory mounts should be ordered right after the root, in case
   295  	// there are submounts of these mandatory mounts already in the spec.
   296  	mounts = append(mounts[:0], append(mandatoryMounts, mounts[0:]...)...)
   297  
   298  	return mounts
   299  }
   300  
   301  // goferMountData creates a slice of gofer mount data.
   302  func goferMountData(fd int, fa config.FileAccessType, conf *config.Config) []string {
   303  	opts := []string{
   304  		"trans=fd",
   305  		"rfdno=" + strconv.Itoa(fd),
   306  		"wfdno=" + strconv.Itoa(fd),
   307  	}
   308  	if fa == config.FileAccessShared {
   309  		opts = append(opts, "cache=remote_revalidating")
   310  	}
   311  	if conf.DirectFS {
   312  		opts = append(opts, "directfs")
   313  	}
   314  	if !conf.HostFifo.AllowOpen() {
   315  		opts = append(opts, "disable_fifo_open")
   316  	}
   317  	return opts
   318  }
   319  
   320  // consumeMountOptions consumes mount options from opts based on allowedKeys
   321  // and returns the remaining and consumed options.
   322  func consumeMountOptions(opts []string, allowedKeys ...string) ([]string, []string, error) {
   323  	var rem, out []string
   324  	for _, o := range opts {
   325  		ok, err := parseMountOption(o, allowedKeys...)
   326  		if err != nil {
   327  			return nil, nil, err
   328  		}
   329  		if ok {
   330  			out = append(out, o)
   331  		} else {
   332  			rem = append(rem, o)
   333  		}
   334  	}
   335  	return rem, out, nil
   336  }
   337  
   338  func parseMountOption(opt string, allowedKeys ...string) (bool, error) {
   339  	kv := strings.SplitN(opt, "=", 3)
   340  	if len(kv) > 2 {
   341  		return false, fmt.Errorf("invalid option %q", opt)
   342  	}
   343  	return slices.Contains(allowedKeys, kv[0]), nil
   344  }
   345  
   346  type fdDispenser struct {
   347  	fds []*fd.FD
   348  }
   349  
   350  func (f *fdDispenser) remove() int {
   351  	return f.removeAsFD().Release()
   352  }
   353  
   354  func (f *fdDispenser) removeAsFD() *fd.FD {
   355  	if f.empty() {
   356  		panic("fdDispenser out of fds")
   357  	}
   358  	rv := f.fds[0]
   359  	f.fds = f.fds[1:]
   360  	return rv
   361  }
   362  
   363  func (f *fdDispenser) empty() bool {
   364  	return len(f.fds) == 0
   365  }
   366  
// containerMounter carries the state needed to set up the mounts of a single
// container: the mounts to create, the FDs donated for them and the kernel
// they are created in.
type containerMounter struct {
	// root is the root filesystem entry taken from the container's spec.
	root *specs.Root

	// mounts is the set of submounts for the container. It's a copy from the spec
	// that may be freely modified without affecting the original spec.
	mounts []specs.Mount

	// goferFDs is the list of FDs to be dispensed for gofer mounts.
	goferFDs fdDispenser

	// goferFilestoreFDs are FDs to the regular files that will back the tmpfs or
	// overlayfs mount for certain gofer mounts.
	goferFilestoreFDs fdDispenser

	// devGoferFD is the FD to attach the sandbox to the dev gofer.
	devGoferFD *fd.FD

	// goferMountConfs contains information about how the gofer mounts have been
	// configured. The first entry is for rootfs and the following entries are
	// for bind mounts in Spec.Mounts (in the same order).
	goferMountConfs []GoferMountConf

	// k is the kernel whose VFS the mounts are created in.
	k *kernel.Kernel

	// hints is the set of pod mount hints for the sandbox.
	hints *PodMountHints

	// sharedMounts is a map of shared mounts that can be reused across
	// containers.
	sharedMounts map[string]*vfs.Mount

	// productName is the value to show in
	// /sys/devices/virtual/dmi/id/product_name.
	productName string

	// containerID is the ID for the container.
	containerID string

	// sandboxID is the ID for the whole sandbox. containerName is the name of
	// the container; it is used as the ContainerName of the vfs.RestoreID
	// values handed to the filesystems mounted for this container.
	sandboxID     string
	containerName string

	// cgroupsMounted indicates if cgroups are mounted in the container.
	// This is used to set the InitialCgroups before starting the container
	// process.
	cgroupsMounted bool
}
   414  
   415  func newContainerMounter(info *containerInfo, k *kernel.Kernel, hints *PodMountHints, sharedMounts map[string]*vfs.Mount, productName string, sandboxID string) *containerMounter {
   416  	return &containerMounter{
   417  		root:              info.spec.Root,
   418  		mounts:            compileMounts(info.spec, info.conf, info.procArgs.ContainerID),
   419  		goferFDs:          fdDispenser{fds: info.goferFDs},
   420  		goferFilestoreFDs: fdDispenser{fds: info.goferFilestoreFDs},
   421  		devGoferFD:        info.devGoferFD,
   422  		goferMountConfs:   info.goferMountConfs,
   423  		k:                 k,
   424  		hints:             hints,
   425  		sharedMounts:      sharedMounts,
   426  		productName:       productName,
   427  		containerID:       info.cid,
   428  		sandboxID:         sandboxID,
   429  		containerName:     info.containerName,
   430  	}
   431  }
   432  
   433  func (c *containerMounter) checkDispenser() error {
   434  	if !c.goferFDs.empty() {
   435  		return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.goferFDs)
   436  	}
   437  	if !c.goferFilestoreFDs.empty() {
   438  		return fmt.Errorf("not all gofer Filestore FDs were consumed, remaining: %v", c.goferFilestoreFDs)
   439  	}
   440  	if c.devGoferFD != nil && c.devGoferFD.FD() >= 0 {
   441  		return fmt.Errorf("dev gofer FD was not consumed: %d", c.devGoferFD.FD())
   442  	}
   443  	return nil
   444  }
   445  
   446  func getMountAccessType(conf *config.Config, hint *MountHint) config.FileAccessType {
   447  	if hint != nil {
   448  		return hint.fileAccessType()
   449  	}
   450  	return conf.FileAccessMounts
   451  }
   452  
   453  func (c *containerMounter) mountAll(rootCtx context.Context, rootCreds *auth.Credentials, spec *specs.Spec, conf *config.Config, rootProcArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
   454  	log.Infof("Configuring container's file system")
   455  
   456  	mns, err := c.createMountNamespace(rootCtx, conf, rootCreds)
   457  	if err != nil {
   458  		return nil, fmt.Errorf("creating mount namespace: %w", err)
   459  	}
   460  	rootProcArgs.MountNamespace = mns
   461  
   462  	root := mns.Root(rootCtx)
   463  	defer root.DecRef(rootCtx)
   464  	if root.Mount().ReadOnly() {
   465  		// Switch to ReadWrite while we setup submounts.
   466  		if err := c.k.VFS().SetMountReadOnly(root.Mount(), false); err != nil {
   467  			return nil, fmt.Errorf(`failed to set mount at "/" readwrite: %w`, err)
   468  		}
   469  		// Restore back to ReadOnly at the end.
   470  		defer func() {
   471  			if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil {
   472  				panic(fmt.Sprintf(`failed to restore mount at "/" back to readonly: %v`, err))
   473  			}
   474  		}()
   475  	}
   476  
   477  	// Mount submounts.
   478  	if err := c.mountSubmounts(rootCtx, spec, conf, mns, rootCreds); err != nil {
   479  		return nil, fmt.Errorf("mounting submounts: %w", err)
   480  	}
   481  
   482  	return mns, nil
   483  }
   484  
   485  // createMountNamespace creates the container's root mount and namespace.
   486  func (c *containerMounter) createMountNamespace(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
   487  	ioFD := c.goferFDs.remove()
   488  	rootfsConf := c.goferMountConfs[0]
   489  
   490  	var (
   491  		fsName string
   492  		opts   *vfs.MountOptions
   493  	)
   494  	switch {
   495  	case rootfsConf.ShouldUseLisafs():
   496  		fsName = gofer.Name
   497  
   498  		data := goferMountData(ioFD, conf.FileAccess, conf)
   499  
   500  		// We can't check for overlayfs here because sandbox is chroot'ed and gofer
   501  		// can only send mount options for specs.Mounts (specs.Root is missing
   502  		// Options field). So assume root is always on top of overlayfs.
   503  		data = append(data, "overlayfs_stale_read")
   504  
   505  		// Configure the gofer dentry cache size.
   506  		gofer.SetDentryCacheSize(conf.DCache)
   507  
   508  		opts = &vfs.MountOptions{
   509  			ReadOnly: c.root.Readonly,
   510  			GetFilesystemOptions: vfs.GetFilesystemOptions{
   511  				InternalMount: true,
   512  				Data:          strings.Join(data, ","),
   513  				InternalData: gofer.InternalFilesystemOptions{
   514  					UniqueID: vfs.RestoreID{
   515  						ContainerName: c.containerName,
   516  						Path:          "/",
   517  					},
   518  				},
   519  			},
   520  		}
   521  
   522  	case rootfsConf.ShouldUseErofs():
   523  		fsName = erofs.Name
   524  		opts = &vfs.MountOptions{
   525  			ReadOnly: c.root.Readonly,
   526  			GetFilesystemOptions: vfs.GetFilesystemOptions{
   527  				InternalMount: true,
   528  				Data:          fmt.Sprintf("ifd=%d", ioFD),
   529  				InternalData: erofs.InternalFilesystemOptions{
   530  					UniqueID: vfs.RestoreID{
   531  						ContainerName: c.containerName,
   532  						Path:          "/",
   533  					},
   534  				},
   535  			},
   536  		}
   537  
   538  	default:
   539  		return nil, fmt.Errorf("unsupported rootfs config: %+v", rootfsConf)
   540  	}
   541  
   542  	log.Infof("Mounting root with %s, ioFD: %d", fsName, ioFD)
   543  
   544  	if rootfsConf.ShouldUseOverlayfs() {
   545  		log.Infof("Adding overlay on top of root")
   546  		var (
   547  			err         error
   548  			cleanup     func()
   549  			filestoreFD *fd.FD
   550  		)
   551  		if rootfsConf.IsFilestorePresent() {
   552  			filestoreFD = c.goferFilestoreFDs.removeAsFD()
   553  		}
   554  		opts, cleanup, err = c.configureOverlay(ctx, conf, creds, opts, fsName, filestoreFD, rootfsConf, "/")
   555  		if err != nil {
   556  			return nil, fmt.Errorf("mounting root with overlay: %w", err)
   557  		}
   558  		defer cleanup()
   559  		fsName = overlay.Name
   560  	}
   561  
   562  	// The namespace root mount can't be changed, so let's mount a dummy
   563  	// read-only tmpfs here. It simplifies creation of containers without
   564  	// leaking the root file system.
   565  	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "rootfs", "tmpfs",
   566  		&vfs.MountOptions{ReadOnly: true, Locked: true}, c.k)
   567  	if err != nil {
   568  		return nil, fmt.Errorf("setting up mount namespace: %w", err)
   569  	}
   570  	defer mns.DecRef(ctx)
   571  
   572  	mnt, err := c.k.VFS().MountDisconnected(ctx, creds, "root", fsName, opts)
   573  	if err != nil {
   574  		return nil, fmt.Errorf("creating root file system: %w", err)
   575  	}
   576  	defer mnt.DecRef(ctx)
   577  	root := mns.Root(ctx)
   578  	defer root.DecRef(ctx)
   579  	target := &vfs.PathOperation{
   580  		Root:  root,
   581  		Start: root,
   582  	}
   583  	if err := c.k.VFS().ConnectMountAt(ctx, creds, mnt, target); err != nil {
   584  		return nil, fmt.Errorf("mounting root file system: %w", err)
   585  	}
   586  
   587  	mns.IncRef()
   588  	return mns, nil
   589  }
   590  
   591  // configureOverlay mounts the lower layer using "lowerOpts", mounts the upper
   592  // layer using tmpfs, and return overlay mount options. "cleanup" must be called
   593  // after the options have been used to mount the overlay, to release refs on
   594  // lower and upper mounts.
   595  func (c *containerMounter) configureOverlay(ctx context.Context, conf *config.Config, creds *auth.Credentials, lowerOpts *vfs.MountOptions, lowerFSName string, filestoreFD *fd.FD, mountConf GoferMountConf, dst string) (*vfs.MountOptions, func(), error) {
   596  	// First copy options from lower layer to upper layer and overlay. Clear
   597  	// filesystem specific options.
   598  	upperOpts := *lowerOpts
   599  	upperOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{InternalMount: true}
   600  
   601  	overlayOpts := *lowerOpts
   602  	overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{InternalMount: true}
   603  
   604  	// All writes go to the upper layer, be paranoid and make lower readonly.
   605  	lowerOpts.ReadOnly = true
   606  	lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts)
   607  	if err != nil {
   608  		return nil, nil, err
   609  	}
   610  	cu := cleanup.Make(func() { lower.DecRef(ctx) })
   611  	defer cu.Clean()
   612  
   613  	// Determine the lower layer's root's type.
   614  	lowerRootVD := vfs.MakeVirtualDentry(lower, lower.Root())
   615  	stat, err := c.k.VFS().StatAt(ctx, creds, &vfs.PathOperation{
   616  		Root:  lowerRootVD,
   617  		Start: lowerRootVD,
   618  	}, &vfs.StatOptions{
   619  		Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE | linux.STATX_TYPE,
   620  	})
   621  	if err != nil {
   622  		return nil, nil, fmt.Errorf("failed to stat lower layer's root: %v", err)
   623  	}
   624  	if stat.Mask&linux.STATX_TYPE == 0 {
   625  		return nil, nil, fmt.Errorf("failed to get file type of lower layer's root")
   626  	}
   627  	rootType := stat.Mode & linux.S_IFMT
   628  	if rootType != linux.S_IFDIR && rootType != linux.S_IFREG {
   629  		return nil, nil, fmt.Errorf("lower layer's root has unsupported file type %v", rootType)
   630  	}
   631  
   632  	// Upper is a tmpfs mount to keep all modifications inside the sandbox.
   633  	tmpfsOpts := tmpfs.FilesystemOpts{
   634  		RootFileType: uint16(rootType),
   635  		// If a mount is being overlaid, it should not be limited by the default
   636  		// tmpfs size limit.
   637  		DisableDefaultSizeLimit: true,
   638  	}
   639  	if filestoreFD != nil {
   640  		// Create memory file for disk-backed overlays.
   641  		mf, err := createPrivateMemoryFile(filestoreFD.ReleaseToFile("overlay-filestore"), vfs.RestoreID{ContainerName: c.containerName, Path: dst})
   642  		if err != nil {
   643  			return nil, nil, fmt.Errorf("failed to create memory file for overlay: %v", err)
   644  		}
   645  		tmpfsOpts.MemoryFile = mf
   646  	}
   647  	upperOpts.GetFilesystemOptions.InternalData = tmpfsOpts
   648  	upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts)
   649  	if err != nil {
   650  		return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err)
   651  	}
   652  	cu.Add(func() { upper.DecRef(ctx) })
   653  
   654  	// If the overlay mount consists of a regular file, copy up its contents
   655  	// from the lower layer, since in the overlay the otherwise-empty upper
   656  	// layer file will take precedence.
   657  	upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root())
   658  	if rootType == linux.S_IFREG {
   659  		lowerFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{
   660  			Root:  lowerRootVD,
   661  			Start: lowerRootVD,
   662  		}, &vfs.OpenOptions{
   663  			Flags: linux.O_RDONLY,
   664  		})
   665  		if err != nil {
   666  			return nil, nil, fmt.Errorf("failed to open lower layer root for copying: %v", err)
   667  		}
   668  		defer lowerFD.DecRef(ctx)
   669  		upperFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{
   670  			Root:  upperRootVD,
   671  			Start: upperRootVD,
   672  		}, &vfs.OpenOptions{
   673  			Flags: linux.O_WRONLY,
   674  		})
   675  		if err != nil {
   676  			return nil, nil, fmt.Errorf("failed to open upper layer root for copying: %v", err)
   677  		}
   678  		defer upperFD.DecRef(ctx)
   679  		if _, err := vfs.CopyRegularFileData(ctx, upperFD, lowerFD); err != nil {
   680  			return nil, nil, fmt.Errorf("failed to copy up overlay file: %v", err)
   681  		}
   682  	}
   683  
   684  	// We need to hide the filestore from the containerized application.
   685  	if mountConf.IsSelfBacked() {
   686  		if err := overlay.CreateWhiteout(ctx, c.k.VFS(), creds, &vfs.PathOperation{
   687  			Root:  upperRootVD,
   688  			Start: upperRootVD,
   689  			Path:  fspath.Parse(selfFilestoreName(c.sandboxID)),
   690  		}); err != nil {
   691  			return nil, nil, fmt.Errorf("failed to create whiteout to hide self overlay filestore: %w", err)
   692  		}
   693  	}
   694  
   695  	// Propagate the lower layer's root's owner, group, and mode to the upper
   696  	// layer's root for consistency with VFS1.
   697  	err = c.k.VFS().SetStatAt(ctx, creds, &vfs.PathOperation{
   698  		Root:  upperRootVD,
   699  		Start: upperRootVD,
   700  	}, &vfs.SetStatOptions{
   701  		Stat: linux.Statx{
   702  			Mask: (linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE) & stat.Mask,
   703  			UID:  stat.UID,
   704  			GID:  stat.GID,
   705  			Mode: stat.Mode,
   706  		},
   707  	})
   708  	if err != nil {
   709  		return nil, nil, err
   710  	}
   711  
   712  	// Configure overlay with both layers.
   713  	overlayOpts.GetFilesystemOptions.InternalData = overlay.FilesystemOptions{
   714  		UpperRoot:  upperRootVD,
   715  		LowerRoots: []vfs.VirtualDentry{lowerRootVD},
   716  	}
   717  	return &overlayOpts, cu.Release(), nil
   718  }
   719  
   720  func (c *containerMounter) mountSubmounts(ctx context.Context, spec *specs.Spec, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
   721  	mounts, err := c.prepareMounts()
   722  	if err != nil {
   723  		return err
   724  	}
   725  
   726  	for i := range mounts {
   727  		submount := &mounts[i]
   728  		log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.mount.Source, submount.mount.Destination, submount.mount.Type, submount.mount.Options)
   729  		var (
   730  			mnt *vfs.Mount
   731  			err error
   732  		)
   733  
   734  		if submount.hint != nil && submount.hint.ShouldShareMount() {
   735  			sharedMount, err := c.getSharedMount(ctx, spec, conf, submount, creds)
   736  			if err != nil {
   737  				return fmt.Errorf("getting shared mount %q: %w", submount.hint.Name, err)
   738  			}
   739  			mnt, err = c.mountSharedSubmount(ctx, conf, mns, creds, submount, sharedMount)
   740  			if err != nil {
   741  				return fmt.Errorf("mount shared mount %q to %q: %v", submount.hint.Name, submount.mount.Destination, err)
   742  			}
   743  		} else if submount.mount.Type == cgroupfs.Name {
   744  			// Mount all the cgroups controllers.
   745  			if err := c.mountCgroupSubmounts(ctx, spec, conf, mns, creds, submount); err != nil {
   746  				return fmt.Errorf("mount cgroup %q: %w", submount.mount.Destination, err)
   747  			}
   748  		} else {
   749  			mnt, err = c.mountSubmount(ctx, spec, conf, mns, creds, submount)
   750  			if err != nil {
   751  				return fmt.Errorf("mount submount %q: %w", submount.mount.Destination, err)
   752  			}
   753  		}
   754  
   755  		if mnt != nil && mnt.ReadOnly() {
   756  			// Switch to ReadWrite while we setup submounts.
   757  			if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil {
   758  				return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.mount.Destination, err)
   759  			}
   760  			// Restore back to ReadOnly at the end.
   761  			defer func() {
   762  				if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil {
   763  					panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.mount.Destination, err))
   764  				}
   765  			}()
   766  		}
   767  	}
   768  
   769  	if err := c.mountTmp(ctx, spec, conf, creds, mns); err != nil {
   770  		return fmt.Errorf(`mount submount "/tmp": %w`, err)
   771  	}
   772  	return nil
   773  }
   774  
// mountInfo bundles a submount with the resources needed to mount it.
type mountInfo struct {
	// mount points at the submount's entry in containerMounter.mounts.
	mount *specs.Mount
	// goferFD is the gofer FD for this mount. It is only set for gofer mounts
	// whose mount configuration says to use lisafs.
	goferFD *fd.FD
	// hint is the pod mount hint found for the mount's source; it is nil-checked
	// before use.
	hint *MountHint
	// goferMountConf is the gofer mount configuration; only set for gofer
	// mounts.
	goferMountConf GoferMountConf
	// filestoreFD is the filestore FD for this mount. It is only set for gofer
	// mounts whose mount configuration reports a filestore.
	filestoreFD *fd.FD
}
   782  
   783  func (c *containerMounter) prepareMounts() ([]mountInfo, error) {
   784  	// If device gofer exists, connect to it.
   785  	if c.devGoferFD != nil {
   786  		if err := c.k.AddDevGofer(c.containerName, c.devGoferFD.Release()); err != nil {
   787  			return nil, err
   788  		}
   789  	}
   790  	// Associate bind mounts with their FDs before sorting since there is an
   791  	// undocumented assumption that FDs are dispensed in the order in which
   792  	// they are required by mounts.
   793  	var mounts []mountInfo
   794  	goferMntIdx := 1 // First index is for rootfs.
   795  	for i := range c.mounts {
   796  		info := mountInfo{
   797  			mount: &c.mounts[i],
   798  			hint:  c.hints.FindMount(c.mounts[i].Source),
   799  		}
   800  		specutils.MaybeConvertToBindMount(info.mount)
   801  		if specutils.IsGoferMount(*info.mount) {
   802  			info.goferMountConf = c.goferMountConfs[goferMntIdx]
   803  			if info.goferMountConf.ShouldUseLisafs() {
   804  				info.goferFD = c.goferFDs.removeAsFD()
   805  			}
   806  			if info.goferMountConf.IsFilestorePresent() {
   807  				info.filestoreFD = c.goferFilestoreFDs.removeAsFD()
   808  			}
   809  			if info.goferMountConf.ShouldUseTmpfs() {
   810  				specutils.ChangeMountType(info.mount, tmpfs.Name)
   811  			}
   812  			goferMntIdx++
   813  		}
   814  		mounts = append(mounts, info)
   815  	}
   816  	if err := c.checkDispenser(); err != nil {
   817  		return nil, err
   818  	}
   819  
   820  	// Sort the mounts so that we don't place children before parents.
   821  	sort.Slice(mounts, func(i, j int) bool {
   822  		return len(mounts[i].mount.Destination) < len(mounts[j].mount.Destination)
   823  	})
   824  
   825  	return mounts, nil
   826  }
   827  
   828  func (c *containerMounter) mountSubmount(ctx context.Context, spec *specs.Spec, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountInfo) (*vfs.Mount, error) {
   829  	fsName, opts, err := getMountNameAndOptions(spec, conf, submount, c.productName, c.containerName)
   830  	if err != nil {
   831  		return nil, fmt.Errorf("mountOptions failed: %w", err)
   832  	}
   833  	if len(fsName) == 0 {
   834  		// Filesystem is not supported (e.g. cgroup), just skip it.
   835  		return nil, nil
   836  	}
   837  
   838  	if err := c.makeMountPoint(ctx, creds, mns, submount.mount.Destination); err != nil {
   839  		return nil, fmt.Errorf("creating mount point %q: %w", submount.mount.Destination, err)
   840  	}
   841  
   842  	if submount.goferMountConf.ShouldUseOverlayfs() {
   843  		log.Infof("Adding overlay on top of mount %q", submount.mount.Destination)
   844  		var cleanup func()
   845  		opts, cleanup, err = c.configureOverlay(ctx, conf, creds, opts, fsName, submount.filestoreFD, submount.goferMountConf, submount.mount.Destination)
   846  		if err != nil {
   847  			return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.mount.Destination, err)
   848  		}
   849  		defer cleanup()
   850  		fsName = overlay.Name
   851  	}
   852  
   853  	root := mns.Root(ctx)
   854  	defer root.DecRef(ctx)
   855  	target := &vfs.PathOperation{
   856  		Root:  root,
   857  		Start: root,
   858  		Path:  fspath.Parse(submount.mount.Destination),
   859  	}
   860  	mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts)
   861  	if err != nil {
   862  		return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.mount.Destination, submount.mount.Type, err, opts)
   863  	}
   864  	log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.mount.Source, submount.mount.Destination, submount.mount.Type, opts.GetFilesystemOptions.Data)
   865  	return mnt, nil
   866  }
   867  
   868  // getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
   869  // used for mounts.
   870  func getMountNameAndOptions(spec *specs.Spec, conf *config.Config, m *mountInfo, productName, containerName string) (string, *vfs.MountOptions, error) {
   871  	fsName := m.mount.Type
   872  	var (
   873  		mopts        = m.mount.Options
   874  		data         []string
   875  		internalData any
   876  	)
   877  
   878  	// Find filesystem name and FS specific data field.
   879  	switch m.mount.Type {
   880  	case devpts.Name, dev.Name, proc.Name:
   881  		// Nothing to do.
   882  
   883  	case Nonefs:
   884  		fsName = sys.Name
   885  
   886  	case sys.Name:
   887  		sysData := &sys.InternalData{EnableTPUProxyPaths: specutils.TPUProxyIsEnabled(spec, conf)}
   888  		if len(productName) > 0 {
   889  			sysData.ProductName = productName
   890  		}
   891  		internalData = sysData
   892  
   893  	case tmpfs.Name:
   894  		var err error
   895  		mopts, data, err = consumeMountOptions(mopts, tmpfsAllowedData...)
   896  		if err != nil {
   897  			return "", nil, err
   898  		}
   899  		if m.filestoreFD != nil {
   900  			mf, err := createPrivateMemoryFile(m.filestoreFD.ReleaseToFile("tmpfs-filestore"), vfs.RestoreID{ContainerName: containerName, Path: m.mount.Destination})
   901  			if err != nil {
   902  				return "", nil, fmt.Errorf("failed to create memory file for tmpfs: %v", err)
   903  			}
   904  			internalData = tmpfs.FilesystemOpts{
   905  				MemoryFile: mf,
   906  				// If a mount is being overlaid with tmpfs, it should not be limited by
   907  				// the default tmpfs size limit.
   908  				DisableDefaultSizeLimit: true,
   909  			}
   910  		}
   911  
   912  	case Bind:
   913  		fsName = gofer.Name
   914  		if m.goferFD == nil {
   915  			// Check that an FD was provided to fails fast.
   916  			return "", nil, fmt.Errorf("gofer mount requires a connection FD")
   917  		}
   918  		var err error
   919  		mopts, data, err = consumeMountOptions(mopts, gofer.SupportedMountOptions...)
   920  		if err != nil {
   921  			return "", nil, err
   922  		}
   923  		data = append(data, goferMountData(m.goferFD.Release(), getMountAccessType(conf, m.hint), conf)...)
   924  		internalData = gofer.InternalFilesystemOptions{
   925  			UniqueID: vfs.RestoreID{
   926  				ContainerName: containerName,
   927  				Path:          m.mount.Destination,
   928  			},
   929  		}
   930  
   931  	case cgroupfs.Name:
   932  		var err error
   933  		mopts, data, err = consumeMountOptions(mopts, cgroupfs.SupportedMountOptions...)
   934  		if err != nil {
   935  			return "", nil, err
   936  		}
   937  
   938  	default:
   939  		log.Warningf("ignoring unknown filesystem type %q", m.mount.Type)
   940  		return "", nil, nil
   941  	}
   942  
   943  	opts := ParseMountOptions(mopts)
   944  	opts.GetFilesystemOptions = vfs.GetFilesystemOptions{
   945  		Data:          strings.Join(data, ","),
   946  		InternalData:  internalData,
   947  		InternalMount: true,
   948  	}
   949  
   950  	return fsName, opts, nil
   951  }
   952  
   953  // ParseMountOptions converts specs.Mount.Options to vfs.MountOptions.
   954  func ParseMountOptions(opts []string) *vfs.MountOptions {
   955  	mountOpts := &vfs.MountOptions{
   956  		GetFilesystemOptions: vfs.GetFilesystemOptions{
   957  			InternalMount: true,
   958  		},
   959  	}
   960  	// Note: update mountHint.CheckCompatible when more options are added.
   961  	for _, o := range opts {
   962  		switch o {
   963  		case "ro":
   964  			mountOpts.ReadOnly = true
   965  		case "noatime":
   966  			mountOpts.Flags.NoATime = true
   967  		case "noexec":
   968  			mountOpts.Flags.NoExec = true
   969  		case "rw", "atime", "exec":
   970  			// These use the default value and don't need to be set.
   971  		case "bind", "rbind":
   972  			// These are the same as a mount with type="bind".
   973  		default:
   974  			log.Warningf("ignoring unknown mount option %q", o)
   975  		}
   976  	}
   977  	return mountOpts
   978  }
   979  
   980  func parseKeyValue(s string) (string, string, bool) {
   981  	tokens := strings.SplitN(s, "=", 2)
   982  	if len(tokens) < 2 {
   983  		return "", "", false
   984  	}
   985  	return strings.TrimSpace(tokens[0]), strings.TrimSpace(tokens[1]), true
   986  }
   987  
   988  func createPrivateMemoryFile(file *os.File, restoreID vfs.RestoreID) (*pgalloc.MemoryFile, error) {
   989  	mfOpts := pgalloc.MemoryFileOpts{
   990  		// Private memory files are usually backed by files on disk. Ideally we
   991  		// would confirm with fstatfs(2) but that is prohibited by seccomp.
   992  		DiskBackedFile: true,
   993  		// Disk backed files need to be decommited on destroy to release disk space.
   994  		DecommitOnDestroy: true,
   995  		// sentry's seccomp filters don't allow the mmap(2) syscalls that
   996  		// pgalloc.IMAWorkAroundForMemFile() uses. Users of private memory files
   997  		// are expected to have performed the work around outside the sandbox.
   998  		DisableIMAWorkAround: true,
   999  		// Private memory files need to be restored correctly using this ID.
  1000  		RestoreID: restoreID.String(),
  1001  	}
  1002  	return pgalloc.NewMemoryFile(file, mfOpts)
  1003  }
  1004  
  1005  // mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so.
  1006  // Technically we don't have to mount tmpfs at /tmp, as we could just rely on
  1007  // the host /tmp, but this is a nice optimization, and fixes some apps that call
  1008  // mknod in /tmp. It's unsafe to mount tmpfs if:
  1009  //  1. /tmp is mounted explicitly: we should not override user's wish
  1010  //  2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
  1011  //
  1012  // Note that when there are submounts inside of '/tmp', directories for the
  1013  // mount points must be present, making '/tmp' not empty anymore.
  1014  func (c *containerMounter) mountTmp(ctx context.Context, spec *specs.Spec, conf *config.Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
  1015  	for _, m := range c.mounts {
  1016  		// m.Destination has been cleaned, so it's to use equality here.
  1017  		if m.Destination == "/tmp" {
  1018  			log.Debugf(`Explict "/tmp" mount found, skipping internal tmpfs, mount: %+v`, m)
  1019  			return nil
  1020  		}
  1021  	}
  1022  
  1023  	root := mns.Root(ctx)
  1024  	defer root.DecRef(ctx)
  1025  	pop := vfs.PathOperation{
  1026  		Root:  root,
  1027  		Start: root,
  1028  		Path:  fspath.Parse("/tmp"),
  1029  	}
  1030  	fd, err := c.k.VFS().OpenAt(ctx, creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY})
  1031  	switch {
  1032  	case err == nil:
  1033  		defer fd.DecRef(ctx)
  1034  
  1035  		err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
  1036  			if dirent.Name != "." && dirent.Name != ".." {
  1037  				return linuxerr.ENOTEMPTY
  1038  			}
  1039  			return nil
  1040  		}))
  1041  		switch {
  1042  		case err == nil:
  1043  			log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`)
  1044  		case linuxerr.Equals(linuxerr.ENOTEMPTY, err):
  1045  			// If more than "." and ".." is found, skip internal tmpfs to prevent
  1046  			// hiding existing files.
  1047  			log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`)
  1048  			return nil
  1049  		default:
  1050  			return fmt.Errorf("fd.IterDirents failed: %v", err)
  1051  		}
  1052  		fallthrough
  1053  
  1054  	case linuxerr.Equals(linuxerr.ENOENT, err):
  1055  		// No '/tmp' found (or fallthrough from above). It's safe to mount internal
  1056  		// tmpfs.
  1057  		tmpMount := specs.Mount{
  1058  			Type:        tmpfs.Name,
  1059  			Destination: "/tmp",
  1060  			// Sticky bit is added to prevent accidental deletion of files from
  1061  			// another user. This is normally done for /tmp.
  1062  			Options: []string{"mode=01777"},
  1063  		}
  1064  		if _, err := c.mountSubmount(ctx, spec, conf, mns, creds, &mountInfo{mount: &tmpMount}); err != nil {
  1065  			return fmt.Errorf("mountSubmount failed: %v", err)
  1066  		}
  1067  		return nil
  1068  
  1069  	case linuxerr.Equals(linuxerr.ENOTDIR, err):
  1070  		// Not a dir?! Let it be.
  1071  		return nil
  1072  
  1073  	default:
  1074  		return fmt.Errorf(`opening "/tmp" inside container: %w`, err)
  1075  	}
  1076  }
  1077  
  1078  func (c *containerMounter) getSharedMount(ctx context.Context, spec *specs.Spec, conf *config.Config, mount *mountInfo, creds *auth.Credentials) (*vfs.Mount, error) {
  1079  	sharedMount, ok := c.sharedMounts[mount.hint.Mount.Source]
  1080  	if ok {
  1081  		log.Infof("Using existing shared mount %q from %q type %q", mount.hint.Name, mount.hint.Mount.Source, mount.hint.Mount.Type)
  1082  		if mount.goferFD != nil {
  1083  			panic(fmt.Errorf("extra goferFD provided for shared mount %q", mount.hint.Name))
  1084  		}
  1085  		if mount.filestoreFD != nil {
  1086  			mount.filestoreFD.Close()
  1087  		}
  1088  		return sharedMount, nil
  1089  	}
  1090  	log.Infof("Mounting master of shared mount %q from %q type %q", mount.hint.Name, mount.hint.Mount.Source, mount.hint.Mount.Type)
  1091  	sharedMount, err := c.mountSharedMaster(ctx, spec, conf, mount, creds)
  1092  	if err != nil {
  1093  		return nil, fmt.Errorf("mounting shared master %q: %v", mount.hint.Name, err)
  1094  	}
  1095  	c.sharedMounts[mount.hint.Mount.Source] = sharedMount
  1096  	return sharedMount, nil
  1097  }
  1098  
  1099  // mountCgroupMounts mounts the cgroups which are shared across all containers.
  1100  // Postcondition: Initialized k.cgroupMounts on success.
  1101  func (l *Loader) mountCgroupMounts(conf *config.Config, creds *auth.Credentials) error {
  1102  	ctx := l.k.SupervisorContext()
  1103  	for _, sopts := range kernel.CgroupCtrls {
  1104  		mopts := &vfs.MountOptions{
  1105  			GetFilesystemOptions: vfs.GetFilesystemOptions{
  1106  				Data:          string(sopts),
  1107  				InternalMount: true,
  1108  			},
  1109  		}
  1110  		fs, root, err := l.k.VFS().NewFilesystem(ctx, creds, "cgroup", cgroupfs.Name, mopts)
  1111  		if err != nil {
  1112  			return err
  1113  		}
  1114  
  1115  		mount := l.k.VFS().NewDisconnectedMount(fs, root, mopts)
  1116  		// Private so that mounts created by containers do not appear
  1117  		// in other container's cgroup paths.
  1118  		l.k.VFS().SetMountPropagation(mount, linux.MS_PRIVATE, false)
  1119  		l.k.AddCgroupMount(string(sopts), &kernel.CgroupMount{
  1120  			Fs:    fs,
  1121  			Root:  root,
  1122  			Mount: mount,
  1123  		})
  1124  	}
  1125  	log.Infof("created cgroup mounts for controllers %v", kernel.CgroupCtrls)
  1126  	return nil
  1127  }
  1128  
// mountCgroupSubmounts mounts all the cgroup controller submounts for the
// container. The cgroup submounts are created under the root controller mount
// with containerID as the directory name and then bind mounts this directory
// inside the container's mount namespace.
//
// The spec's cgroup mount itself is mounted as a tmpfs (submount.mount.Type is
// overwritten in place), and each controller is bind mounted at
// "/sys/fs/cgroup/<controller>". On success c.cgroupsMounted is set to true.
func (c *containerMounter) mountCgroupSubmounts(ctx context.Context, spec *specs.Spec, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountInfo) error {
	root := mns.Root(ctx)
	defer root.DecRef(ctx)

	// Mount "/sys/fs/cgroup" in the container's mount namespace.
	// The mount type is switched to tmpfs so that mountSubmount creates a plain
	// tmpfs at the mount's destination to hold the controller directories.
	submount.mount.Type = tmpfs.Name
	mnt, err := c.mountSubmount(ctx, spec, conf, mns, creds, submount)
	if err != nil {
		return err
	}
	if mnt != nil && mnt.ReadOnly() {
		// Switch to ReadWrite while we setup submounts.
		if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil {
			return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.mount.Destination, err)
		}
		// Restore back to ReadOnly at the end.
		// This runs when the function returns, on both success and error paths.
		defer func() {
			if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil {
				panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.mount.Destination, err))
			}
		}()
	}

	// Mount all the cgroup controllers in the container's mount namespace.
	mountCtx := vfs.WithRoot(vfs.WithMountNamespace(ctx, mns), root)
	for _, ctrl := range kernel.CgroupCtrls {
		ctrlName := string(ctrl)
		cgroupMnt := c.k.GetCgroupMount(ctrlName)
		if cgroupMnt == nil {
			return fmt.Errorf("cgroup mount for controller %s not found", ctrlName)
		}

		// Create the container's directory relative to the root of the
		// controller's mount.
		cgroupMntVD := vfs.MakeVirtualDentry(cgroupMnt.Mount, cgroupMnt.Root)
		sourcePop := vfs.PathOperation{
			Root:  cgroupMntVD,
			Start: cgroupMntVD,
			// Use the containerID as the cgroup path.
			Path: fspath.Parse(c.containerID),
		}
		if err := c.k.VFS().MkdirAt(mountCtx, creds, &sourcePop, &vfs.MkdirOptions{
			Mode: 0755,
		}); err != nil {
			log.Infof("error in creating directory %v", err)
			return err
		}

		// Bind mount the new cgroup directory into the container's mount namespace.
		destination := "/sys/fs/cgroup/" + ctrlName
		if err := c.k.VFS().MakeSyntheticMountpoint(mountCtx, destination, root, creds); err != nil {
			// Log a warning, but attempt the mount anyway.
			log.Warningf("Failed to create mount point %q: %v", destination, err)
		}

		target := &vfs.PathOperation{
			Root:  root,
			Start: root,
			Path:  fspath.Parse(destination),
		}
		if err := c.k.VFS().BindAt(mountCtx, creds, &sourcePop, target, false); err != nil {
			log.Infof("error in bind mounting %v", err)
			return err
		}
	}
	// Record that this container's cgroup controllers have been mounted.
	c.cgroupsMounted = true
	return nil
}
  1199  
  1200  // mountSharedMaster mounts the master of a volume that is shared among
  1201  // containers in a pod.
  1202  func (c *containerMounter) mountSharedMaster(ctx context.Context, spec *specs.Spec, conf *config.Config, mntInfo *mountInfo, creds *auth.Credentials) (*vfs.Mount, error) {
  1203  	// Mount the master using the options from the hint (mount annotations).
  1204  	origOpts := mntInfo.mount.Options
  1205  	mntInfo.mount.Options = mntInfo.hint.Mount.Options
  1206  	fsName, opts, err := getMountNameAndOptions(spec, conf, mntInfo, c.productName, c.containerName)
  1207  	mntInfo.mount.Options = origOpts
  1208  	if err != nil {
  1209  		return nil, err
  1210  	}
  1211  	if len(fsName) == 0 {
  1212  		return nil, fmt.Errorf("mount type not supported %q", mntInfo.hint.Mount.Type)
  1213  	}
  1214  	return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts)
  1215  }
  1216  
  1217  // mountSharedSubmount binds mount to a previously mounted volume that is shared
  1218  // among containers in the same pod.
  1219  func (c *containerMounter) mountSharedSubmount(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mntInfo *mountInfo, sharedMount *vfs.Mount) (*vfs.Mount, error) {
  1220  	if err := mntInfo.hint.checkCompatible(mntInfo.mount); err != nil {
  1221  		return nil, err
  1222  	}
  1223  
  1224  	// Generate mount point specific opts using mntInfo.mount.
  1225  	opts := ParseMountOptions(mntInfo.mount.Options)
  1226  	newMnt := c.k.VFS().NewDisconnectedMount(sharedMount.Filesystem(), sharedMount.Root(), opts)
  1227  	defer newMnt.DecRef(ctx)
  1228  
  1229  	root := mns.Root(ctx)
  1230  	defer root.DecRef(ctx)
  1231  	target := &vfs.PathOperation{
  1232  		Root:  root,
  1233  		Start: root,
  1234  		Path:  fspath.Parse(mntInfo.mount.Destination),
  1235  	}
  1236  
  1237  	if err := c.makeMountPoint(ctx, creds, mns, mntInfo.mount.Destination); err != nil {
  1238  		return nil, fmt.Errorf("creating mount point %q: %w", mntInfo.mount.Destination, err)
  1239  	}
  1240  
  1241  	if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil {
  1242  		return nil, err
  1243  	}
  1244  	log.Infof("Mounted %q type shared bind to %q", mntInfo.mount.Destination, mntInfo.hint.Name)
  1245  	return newMnt, nil
  1246  }
  1247  
  1248  func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error {
  1249  	root := mns.Root(ctx)
  1250  	defer root.DecRef(ctx)
  1251  	target := &vfs.PathOperation{
  1252  		Root:  root,
  1253  		Start: root,
  1254  		Path:  fspath.Parse(dest),
  1255  	}
  1256  	// First check if mount point exists. When overlay is enabled, gofer doesn't
  1257  	// allow changes to the FS, making MakeSytheticMountpoint() ineffective
  1258  	// because MkdirAt fails with EROFS even if file exists.
  1259  	vd, err := c.k.VFS().GetDentryAt(ctx, creds, target, &vfs.GetDentryOptions{})
  1260  	if err == nil {
  1261  		// File exists, we're done.
  1262  		vd.DecRef(ctx)
  1263  		return nil
  1264  	}
  1265  	return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds)
  1266  }
  1267  
  1268  // configureRestore returns an updated context.Context including filesystem
  1269  // state used by restore defined by conf.
  1270  func (c *containerMounter) configureRestore(fdmap map[vfs.RestoreID]int, mfmap map[string]*pgalloc.MemoryFile) error {
  1271  	// Compare createMountNamespace(); rootfs always consumes a gofer FD and a
  1272  	// filestore FD is consumed if the rootfs GoferMountConf indicates so.
  1273  	rootKey := vfs.RestoreID{ContainerName: c.containerName, Path: "/"}
  1274  	fdmap[rootKey] = c.goferFDs.remove()
  1275  
  1276  	if rootfsConf := c.goferMountConfs[0]; rootfsConf.IsFilestorePresent() {
  1277  		mf, err := createPrivateMemoryFile(c.goferFilestoreFDs.removeAsFD().ReleaseToFile("overlay-filestore"), rootKey)
  1278  		if err != nil {
  1279  			return fmt.Errorf("failed to create private memory file for mount rootfs: %w", err)
  1280  		}
  1281  		mfmap[rootKey.String()] = mf
  1282  	}
  1283  	// prepareMounts() consumes the remaining FDs for submounts.
  1284  	mounts, err := c.prepareMounts()
  1285  	if err != nil {
  1286  		return err
  1287  	}
  1288  	for i := range mounts {
  1289  		submount := &mounts[i]
  1290  		if submount.goferFD != nil {
  1291  			key := vfs.RestoreID{ContainerName: c.containerName, Path: submount.mount.Destination}
  1292  			fdmap[key] = submount.goferFD.Release()
  1293  		}
  1294  		if submount.filestoreFD != nil {
  1295  			key := vfs.RestoreID{ContainerName: c.containerName, Path: submount.mount.Destination}
  1296  			mf, err := createPrivateMemoryFile(submount.filestoreFD.ReleaseToFile("overlay-filestore"), key)
  1297  			if err != nil {
  1298  				return fmt.Errorf("failed to create private memory file for mount %q: %w", submount.mount.Destination, err)
  1299  			}
  1300  			mfmap[key.String()] = mf
  1301  		}
  1302  	}
  1303  	return nil
  1304  }
  1305  
  1306  func createDeviceFiles(ctx context.Context, creds *auth.Credentials, info *containerInfo, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry) error {
  1307  	if info.spec.Linux != nil {
  1308  		// Create any device files specified in the spec.
  1309  		for _, dev := range info.spec.Linux.Devices {
  1310  			if err := createDeviceFile(ctx, creds, info, vfsObj, root, dev); err != nil {
  1311  				return err
  1312  			}
  1313  		}
  1314  	}
  1315  	if specutils.GPUFunctionalityRequestedViaHook(info.spec, info.conf) {
  1316  		// When using nvidia-container-runtime-hook, devices are not injected into
  1317  		// spec.Linux.Devices. So manually create appropriate device files.
  1318  		mode := os.FileMode(0666)
  1319  		nvidiaDevs := []specs.LinuxDevice{
  1320  			specs.LinuxDevice{Path: "/dev/nvidiactl", Type: "c", Major: nvgpu.NV_MAJOR_DEVICE_NUMBER, Minor: nvgpu.NV_CONTROL_DEVICE_MINOR, FileMode: &mode},
  1321  			specs.LinuxDevice{Path: "/dev/nvidia-uvm", Type: "c", Major: int64(info.nvidiaUVMDevMajor), Minor: nvgpu.NVIDIA_UVM_PRIMARY_MINOR_NUMBER, FileMode: &mode},
  1322  		}
  1323  		devClient := devutil.GoferClientFromContext(ctx)
  1324  		if devClient == nil {
  1325  			return fmt.Errorf("dev gofer client not found in context")
  1326  		}
  1327  		names, err := devClient.DirentNames(ctx)
  1328  		if err != nil {
  1329  			return fmt.Errorf("failed to get names of dirents from dev gofer: %w", err)
  1330  		}
  1331  		nvidiaDeviceRegex := regexp.MustCompile(`^nvidia(\d+)$`)
  1332  		for _, name := range names {
  1333  			ms := nvidiaDeviceRegex.FindStringSubmatch(name)
  1334  			if ms == nil {
  1335  				continue
  1336  			}
  1337  			minor, err := strconv.ParseUint(ms[1], 10, 32)
  1338  			if err != nil {
  1339  				return fmt.Errorf("invalid nvidia device name %q: %w", name, err)
  1340  			}
  1341  			nvidiaDevs = append(nvidiaDevs, specs.LinuxDevice{Path: fmt.Sprintf("/dev/nvidia%d", minor), Type: "c", Major: nvgpu.NV_MAJOR_DEVICE_NUMBER, Minor: int64(minor), FileMode: &mode})
  1342  		}
  1343  		for _, nvidiaDev := range nvidiaDevs {
  1344  			if err := createDeviceFile(ctx, creds, info, vfsObj, root, nvidiaDev); err != nil {
  1345  				return err
  1346  			}
  1347  		}
  1348  	}
  1349  	return nil
  1350  }
  1351  
  1352  func createDeviceFile(ctx context.Context, creds *auth.Credentials, info *containerInfo, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, devSpec specs.LinuxDevice) error {
  1353  	mode := linux.FileMode(devSpec.FileMode.Perm())
  1354  	var major, minor uint32
  1355  	// See https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md#devices.
  1356  	switch devSpec.Type {
  1357  	case "b":
  1358  		mode |= linux.S_IFBLK
  1359  		major = uint32(devSpec.Major)
  1360  		minor = uint32(devSpec.Minor)
  1361  	case "c", "u":
  1362  		mode |= linux.S_IFCHR
  1363  		major = uint32(devSpec.Major)
  1364  		minor = uint32(devSpec.Minor)
  1365  	case "p":
  1366  		mode |= linux.S_IFIFO
  1367  	default:
  1368  		return fmt.Errorf("specified device at %q has invalid type %q", devSpec.Path, devSpec.Type)
  1369  	}
  1370  	if devSpec.Path == "/dev/nvidia-uvm" && info.nvidiaUVMDevMajor != 0 && major != info.nvidiaUVMDevMajor {
  1371  		// nvidia-uvm's major device number is dynamically assigned, so the
  1372  		// number that it has on the host may differ from the number that
  1373  		// it has in sentry VFS; switch from the former to the latter.
  1374  		log.Infof("Switching /dev/nvidia-uvm device major number from %d to %d", devSpec.Major, info.nvidiaUVMDevMajor)
  1375  		major = info.nvidiaUVMDevMajor
  1376  	}
  1377  	return dev.CreateDeviceFile(ctx, vfsObj, creds, root, devSpec.Path, major, minor, mode, devSpec.UID, devSpec.GID)
  1378  }
  1379  
  1380  // registerTPUDevice registers a TPU device in vfsObj based on the given device ID.
  1381  func registerTPUDevice(vfsObj *vfs.VirtualFilesystem, minor uint32, deviceID int64) error {
  1382  	switch deviceID {
  1383  	case tpu.TPUV4DeviceID, tpu.TPUV4liteDeviceID:
  1384  		return accel.RegisterTPUDevice(vfsObj, minor, deviceID == tpu.TPUV4liteDeviceID)
  1385  	case tpu.TPUV5eDeviceID:
  1386  		return tpuproxy.RegisterTPUDevice(vfsObj, minor)
  1387  	default:
  1388  		return fmt.Errorf("unsupported TPU device with ID: 0x%x", deviceID)
  1389  	}
  1390  }
  1391  
// pathGlobToPathRegex maps each TPU PCI path glob to the regular expression
// that a matching path must satisfy. The single capture group of each
// expression is the trailing device number.
//
// TPU v4 devices are accessible via /sys/devices/pci0000:00/<pci_address>/accel/accel# on the host.
// TPU v5 devices are accessible via /sys/devices/pci0000:00/<pci_address>/vfio-dev/vfio# on the host.
//
// NOTE(review): the <pci_address> part is matched with \d, i.e. decimal digits
// only. Confirm that addresses containing the hex digits a-f never need to
// match; such paths are currently skipped.
var pathGlobToPathRegex = map[string]string{
	pciPathGlobTPUv4: `^/sys/devices/pci0000:00/\d+:\d+:\d+\.\d+/accel/accel(\d+)$`,
	pciPathGlobTPUv5: `^/sys/devices/pci0000:00/\d+:\d+:\d+\.\d+/vfio-dev/vfio(\d+)$`,
}
  1399  
  1400  func tpuProxyRegisterDevices(info *containerInfo, vfsObj *vfs.VirtualFilesystem) error {
  1401  	if !specutils.TPUProxyIsEnabled(info.spec, info.conf) {
  1402  		return nil
  1403  	}
  1404  	// Enumerate all potential PCI paths where TPU devices are available and register the found TPU devices.
  1405  	for pciPathGlobal, pathRegex := range pathGlobToPathRegex {
  1406  		pciAddrs, err := filepath.Glob(pciPathGlobal)
  1407  		if err != nil {
  1408  			return fmt.Errorf("enumerating PCI device files: %w", err)
  1409  		}
  1410  		pciPathRegex := regexp.MustCompile(pathRegex)
  1411  		for _, pciPath := range pciAddrs {
  1412  			ms := pciPathRegex.FindStringSubmatch(pciPath)
  1413  			if ms == nil {
  1414  				continue
  1415  			}
  1416  			deviceNum, err := strconv.ParseUint(ms[1], 10, 32)
  1417  			if err != nil {
  1418  				return fmt.Errorf("parsing PCI device number: %w", err)
  1419  			}
  1420  			var deviceIDBytes []byte
  1421  			if deviceIDBytes, err = os.ReadFile(path.Join(pciPath, "device/device")); err != nil {
  1422  				return fmt.Errorf("reading PCI device ID: %w", err)
  1423  			}
  1424  			deviceIDStr := strings.Replace(string(deviceIDBytes), "0x", "", -1)
  1425  			deviceID, err := strconv.ParseInt(strings.TrimSpace(deviceIDStr), 16, 64)
  1426  			if err != nil {
  1427  				return fmt.Errorf("parsing PCI device ID: %w", err)
  1428  			}
  1429  			if err := registerTPUDevice(vfsObj, uint32(deviceNum), deviceID); err != nil {
  1430  				return fmt.Errorf("registering TPU driver: %w", err)
  1431  			}
  1432  		}
  1433  	}
  1434  	if err := tpuproxy.RegisterVfioDevice(vfsObj); err != nil {
  1435  		return fmt.Errorf("registering vfio driver: %w", err)
  1436  	}
  1437  	return nil
  1438  }
  1439  
  1440  func nvproxyRegisterDevices(info *containerInfo, vfsObj *vfs.VirtualFilesystem) error {
  1441  	if !specutils.NVProxyEnabled(info.spec, info.conf) {
  1442  		return nil
  1443  	}
  1444  	uvmDevMajor, err := vfsObj.GetDynamicCharDevMajor()
  1445  	if err != nil {
  1446  		return fmt.Errorf("reserving device major number for nvidia-uvm: %w", err)
  1447  	}
  1448  	if err := nvproxy.Register(vfsObj, info.nvidiaDriverVersion, uvmDevMajor); err != nil {
  1449  		return fmt.Errorf("registering nvproxy driver: %w", err)
  1450  	}
  1451  	info.nvidiaUVMDevMajor = uvmDevMajor
  1452  	return nil
  1453  }