github.com/metacubex/gvisor@v0.0.0-20240320004321-933faba989ec/runsc/boot/vfs.go

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package boot
    16  
    17  import (
    18  	"fmt"
    19  	"os"
    20  	"path"
    21  	"path/filepath"
    22  	"regexp"
    23  	"sort"
    24  	"strconv"
    25  	"strings"
    26  
    27  	specs "github.com/opencontainers/runtime-spec/specs-go"
    28  	"github.com/metacubex/gvisor/pkg/abi/linux"
    29  	"github.com/metacubex/gvisor/pkg/abi/nvgpu"
    30  	"github.com/metacubex/gvisor/pkg/abi/tpu"
    31  	"github.com/metacubex/gvisor/pkg/cleanup"
    32  	"github.com/metacubex/gvisor/pkg/context"
    33  	"github.com/metacubex/gvisor/pkg/devutil"
    34  	"github.com/metacubex/gvisor/pkg/errors/linuxerr"
    35  	"github.com/metacubex/gvisor/pkg/fd"
    36  	"github.com/metacubex/gvisor/pkg/fspath"
    37  	"github.com/metacubex/gvisor/pkg/log"
    38  	"github.com/metacubex/gvisor/pkg/sentry/devices/accel"
    39  	"github.com/metacubex/gvisor/pkg/sentry/devices/memdev"
    40  	"github.com/metacubex/gvisor/pkg/sentry/devices/nvproxy"
    41  	"github.com/metacubex/gvisor/pkg/sentry/devices/tpuproxy"
    42  	"github.com/metacubex/gvisor/pkg/sentry/devices/ttydev"
    43  	"github.com/metacubex/gvisor/pkg/sentry/devices/tundev"
    44  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/cgroupfs"
    45  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/dev"
    46  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/devpts"
    47  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/devtmpfs"
    48  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/erofs"
    49  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/fuse"
    50  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/gofer"
    51  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/mqfs"
    52  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/overlay"
    53  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/proc"
    54  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/sys"
    55  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/tmpfs"
    56  	"github.com/metacubex/gvisor/pkg/sentry/fsimpl/user"
    57  	"github.com/metacubex/gvisor/pkg/sentry/inet"
    58  	"github.com/metacubex/gvisor/pkg/sentry/kernel"
    59  	"github.com/metacubex/gvisor/pkg/sentry/kernel/auth"
    60  	"github.com/metacubex/gvisor/pkg/sentry/pgalloc"
    61  	"github.com/metacubex/gvisor/pkg/sentry/vfs"
    62  	"github.com/metacubex/gvisor/runsc/config"
    63  	"github.com/metacubex/gvisor/runsc/specutils"
    64  )
    65  
    66  // Supported filesystems that map to different internal filesystems.
    67  const (
    68  	Bind   = "bind"
    69  	Nonefs = "none"
    70  )
    71  
    72  // SelfFilestorePrefix is the prefix of the self filestore file name.
    73  const SelfFilestorePrefix = ".gvisor.filestore."
    74  
    75  const (
    76  	pciPathGlobTPUv4 = "/sys/devices/pci0000:00/*/accel/accel*"
    77  	pciPathGlobTPUv5 = "/sys/devices/pci0000:00/*/vfio-dev/vfio*"
    78  )
    79  
    80  // SelfFilestorePath returns the path at which the self filestore file is
    81  // stored for a given mount.
    82  func SelfFilestorePath(mountSrc, sandboxID string) string {
    83  	// We will place the filestore file in a gVisor specific hidden file inside
    84  	// the mount being overlaid itself. The same volume can be overlaid by
    85  	// multiple sandboxes. So make the filestore file unique to a sandbox by
    86  	// suffixing the sandbox ID.
    87  	return path.Join(mountSrc, selfFilestoreName(sandboxID))
    88  }
    89  
    90  func selfFilestoreName(sandboxID string) string {
    91  	return SelfFilestorePrefix + sandboxID
    92  }
    93  
    94  // tmpfs has some extra supported options that we must pass through.
    95  var tmpfsAllowedData = []string{"mode", "size", "uid", "gid"}
    96  
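         // registerFilesystems registers the filesystem types and device drivers
         // used by the container with the sentry VFS.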
    97  func registerFilesystems(k *kernel.Kernel, info *containerInfo) error {
    98  	ctx := k.SupervisorContext()
    99  	vfsObj := k.VFS()
   100  
   101  	vfsObj.MustRegisterFilesystemType(cgroupfs.Name, &cgroupfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   102  		AllowUserMount: true,
   103  		AllowUserList:  true,
   104  	})
   105  	vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   106  		AllowUserList: true,
   107  		// TODO(b/29356795): Users may mount this once the terminals are in a
   108  		//  usable state.
   109  		AllowUserMount: true,
   110  	})
   111  	vfsObj.MustRegisterFilesystemType(dev.Name, &dev.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{})
   112  	vfsObj.MustRegisterFilesystemType(devtmpfs.Name, &devtmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   113  		AllowUserMount: true,
   114  		AllowUserList:  true,
   115  	})
   116  	vfsObj.MustRegisterFilesystemType(erofs.Name, &erofs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   117  		AllowUserList: true,
   118  	})
   119  	vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   120  		AllowUserMount: true,
   121  		AllowUserList:  true,
   122  	})
   123  	vfsObj.MustRegisterFilesystemType(gofer.Name, &gofer.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   124  		AllowUserList: true,
   125  	})
   126  	vfsObj.MustRegisterFilesystemType(overlay.Name, &overlay.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   127  		AllowUserMount: true,
   128  		AllowUserList:  true,
   129  	})
   130  	vfsObj.MustRegisterFilesystemType(proc.Name, &proc.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   131  		AllowUserMount: true,
   132  		AllowUserList:  true,
   133  	})
   134  	vfsObj.MustRegisterFilesystemType(sys.Name, &sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   135  		AllowUserMount: true,
   136  		AllowUserList:  true,
   137  	})
   138  	vfsObj.MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   139  		AllowUserMount: true,
   140  		AllowUserList:  true,
   141  	})
   142  	vfsObj.MustRegisterFilesystemType(mqfs.Name, &mqfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   143  		AllowUserMount: true,
   144  		AllowUserList:  true,
   145  	})
   146  
   147  	// Register devices.
   148  	if err := memdev.Register(vfsObj); err != nil {
   149  		return fmt.Errorf("registering memdev: %w", err)
   150  	}
   151  	if err := ttydev.Register(vfsObj); err != nil {
   152  		return fmt.Errorf("registering ttydev: %w", err)
   153  	}
   154  	tunSupported := tundev.IsNetTunSupported(inet.StackFromContext(ctx))
   155  	if tunSupported {
   156  		if err := tundev.Register(vfsObj); err != nil {
    157  			return fmt.Errorf("registering tundev: %w", err)
   158  		}
   159  	}
   160  	if err := fuse.Register(vfsObj); err != nil {
   161  		return fmt.Errorf("registering fusedev: %w", err)
   162  	}
   163  
   164  	if err := nvproxyRegisterDevices(info, vfsObj); err != nil {
   165  		return err
   166  	}
   167  
   168  	if err := tpuProxyRegisterDevices(info, vfsObj); err != nil {
   169  		return err
   170  	}
   171  
   172  	return nil
   173  }
   174  
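         // setupContainerVFS sets up the container's mount namespace, initial
         // cgroups, and device files, and resolves the executable path for the
         // container process.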
   175  func setupContainerVFS(ctx context.Context, info *containerInfo, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
   176  	// Create context with root credentials to mount the filesystem (the current
   177  	// user may not be privileged enough).
   178  	rootCreds := auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
   179  	rootProcArgs := *procArgs
   180  	rootProcArgs.WorkingDirectory = "/"
   181  	rootProcArgs.Credentials = rootCreds
   182  	rootProcArgs.Umask = 0022
   183  	rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
   184  	rootCtx := rootProcArgs.NewContext(mntr.k)
   185  
   186  	mns, err := mntr.mountAll(rootCtx, rootCreds, info.spec, info.conf, &rootProcArgs)
   187  	if err != nil {
   188  		return fmt.Errorf("failed to setupFS: %w", err)
   189  	}
   190  	procArgs.MountNamespace = mns
   191  
    192  	// If cgroups are mounted in the container, use the per-container cgroup
    193  	// mounts as the initial cgroups. Otherwise the root cgroups will be used.
   194  	if mntr.cgroupsMounted {
   195  		cgroupRegistry := mntr.k.CgroupRegistry()
   196  		for _, ctrl := range kernel.CgroupCtrls {
   197  			cg, err := cgroupRegistry.FindCgroup(ctx, ctrl, "/"+mntr.containerID)
   198  			if err != nil {
   199  				return fmt.Errorf("cgroup mount for controller %v not found", ctrl)
   200  			}
   201  			if procArgs.InitialCgroups == nil {
   202  				procArgs.InitialCgroups = make(map[kernel.Cgroup]struct{}, len(kernel.CgroupCtrls))
   203  			}
   204  			procArgs.InitialCgroups[cg] = struct{}{}
   205  		}
   206  	}
   207  
   208  	mnsRoot := mns.Root(rootCtx)
   209  	defer mnsRoot.DecRef(rootCtx)
   210  
   211  	if err := createDeviceFiles(rootCtx, rootCreds, info, mntr.k.VFS(), mnsRoot); err != nil {
   212  		return fmt.Errorf("failed to create device files: %w", err)
   213  	}
   214  
   215  	// We are executing a file directly. Do not resolve the executable path.
   216  	if procArgs.File != nil {
   217  		return nil
   218  	}
   219  	// Resolve the executable path from working dir and environment.
   220  	resolved, err := user.ResolveExecutablePath(ctx, procArgs)
   221  	if err != nil {
   222  		return err
   223  	}
   224  	procArgs.Filename = resolved
   225  	return nil
   226  }
   227  
   228  // compileMounts returns the supported mounts from the mount spec, adding any
   229  // mandatory mounts that are required by the OCI specification.
   230  //
   231  // This function must NOT add/remove any gofer mounts or change their order.
   232  func compileMounts(spec *specs.Spec, conf *config.Config, containerID string) []specs.Mount {
   233  	// Keep track of whether proc and sys were mounted.
   234  	var procMounted, sysMounted, devMounted, devptsMounted, cgroupsMounted bool
   235  	var mounts []specs.Mount
   236  
   237  	// Mount all submounts from the spec.
   238  	for _, m := range spec.Mounts {
    239  		// All the cgroup controllers are mounted when a "/sys/fs/cgroup" mount
    240  		// is present, so any additional cgroup controller mounts would be
    241  		// no-ops. Drop them.
   242  		if m.Type == cgroupfs.Name && cgroupsMounted {
   243  			continue
   244  		}
   245  
   246  		switch filepath.Clean(m.Destination) {
   247  		case "/proc":
   248  			procMounted = true
   249  		case "/sys":
   250  			sysMounted = true
   251  		case "/dev":
   252  			m.Type = dev.Name
   253  			devMounted = true
   254  		case "/dev/pts":
   255  			m.Type = devpts.Name
   256  			devptsMounted = true
   257  		case "/sys/fs/cgroup":
   258  			cgroupsMounted = true
   259  		}
   260  
   261  		mounts = append(mounts, m)
   262  	}
   263  
    264  	// Mount proc, sys, dev, and devpts even if the user did not ask for them,
    265  	// as the spec says we SHOULD.
   266  	var mandatoryMounts []specs.Mount
   267  
   268  	if !procMounted {
   269  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   270  			Type:        proc.Name,
   271  			Destination: "/proc",
   272  		})
   273  	}
   274  	if !sysMounted {
   275  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   276  			Type:        sys.Name,
   277  			Destination: "/sys",
   278  		})
   279  	}
   280  	if !devMounted {
   281  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   282  			Type:        dev.Name,
   283  			Destination: "/dev",
   284  		})
   285  	}
   286  	if !devptsMounted {
   287  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   288  			Type:        devpts.Name,
   289  			Destination: "/dev/pts",
   290  		})
   291  	}
   292  
   293  	// The mandatory mounts should be ordered right after the root, in case
   294  	// there are submounts of these mandatory mounts already in the spec.
   295  	mounts = append(mounts[:0], append(mandatoryMounts, mounts[0:]...)...)
   296  
   297  	return mounts
   298  }
   299  
   300  // goferMountData creates a slice of gofer mount data.
   301  func goferMountData(fd int, fa config.FileAccessType, conf *config.Config) []string {
   302  	opts := []string{
   303  		"trans=fd",
   304  		"rfdno=" + strconv.Itoa(fd),
   305  		"wfdno=" + strconv.Itoa(fd),
   306  	}
   307  	if fa == config.FileAccessShared {
   308  		opts = append(opts, "cache=remote_revalidating")
   309  	}
   310  	if conf.DirectFS {
   311  		opts = append(opts, "directfs")
   312  	}
   313  	if !conf.HostFifo.AllowOpen() {
   314  		opts = append(opts, "disable_fifo_open")
   315  	}
   316  	return opts
   317  }
   318  
   319  // consumeMountOptions consumes mount options from opts based on allowedKeys
   320  // and returns the remaining and consumed options.
   321  func consumeMountOptions(opts []string, allowedKeys ...string) ([]string, []string, error) {
   322  	var rem, out []string
   323  	for _, o := range opts {
   324  		ok, err := parseMountOption(o, allowedKeys...)
   325  		if err != nil {
   326  			return nil, nil, err
   327  		}
   328  		if ok {
   329  			out = append(out, o)
   330  		} else {
   331  			rem = append(rem, o)
   332  		}
   333  	}
   334  	return rem, out, nil
   335  }
   336  
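         // parseMountOption reports whether opt's key is one of allowedKeys. It
         // returns an error if opt is malformed.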
   337  func parseMountOption(opt string, allowedKeys ...string) (bool, error) {
   338  	kv := strings.SplitN(opt, "=", 3)
   339  	if len(kv) > 2 {
   340  		return false, fmt.Errorf("invalid option %q", opt)
   341  	}
   342  	return specutils.ContainsStr(allowedKeys, kv[0]), nil
   343  }
   344  
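         // fdDispenser dispenses a list of host FDs in the order they were provided.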
   345  type fdDispenser struct {
   346  	fds []*fd.FD
   347  }
   348  
   349  func (f *fdDispenser) remove() int {
   350  	return f.removeAsFD().Release()
   351  }
   352  
   353  func (f *fdDispenser) removeAsFD() *fd.FD {
   354  	if f.empty() {
   355  		panic("fdDispenser out of fds")
   356  	}
   357  	rv := f.fds[0]
   358  	f.fds = f.fds[1:]
   359  	return rv
   360  }
   361  
   362  func (f *fdDispenser) empty() bool {
   363  	return len(f.fds) == 0
   364  }
   365  
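         // containerMounter holds the state needed to configure a container's file
         // system and perform all of its mounts.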
   366  type containerMounter struct {
   367  	root *specs.Root
   368  
   369  	// mounts is the set of submounts for the container. It's a copy from the spec
   370  	// that may be freely modified without affecting the original spec.
   371  	mounts []specs.Mount
   372  
   373  	// goferFDs is the list of FDs to be dispensed for gofer mounts.
   374  	goferFDs fdDispenser
   375  
   376  	// goferFilestoreFDs are FDs to the regular files that will back the tmpfs or
   377  	// overlayfs mount for certain gofer mounts.
   378  	goferFilestoreFDs fdDispenser
   379  
   380  	// devGoferFD is the FD to attach the sandbox to the dev gofer.
   381  	devGoferFD *fd.FD
   382  
   383  	// goferMountConfs contains information about how the gofer mounts have been
   384  	// configured. The first entry is for rootfs and the following entries are
   385  	// for bind mounts in Spec.Mounts (in the same order).
   386  	goferMountConfs []GoferMountConf
   387  
   388  	k *kernel.Kernel
   389  
   390  	// hints is the set of pod mount hints for the sandbox.
   391  	hints *PodMountHints
   392  
   393  	// sharedMounts is a map of shared mounts that can be reused across
   394  	// containers.
   395  	sharedMounts map[string]*vfs.Mount
   396  
   397  	// productName is the value to show in
   398  	// /sys/devices/virtual/dmi/id/product_name.
   399  	productName string
   400  
   401  	// containerID is the ID for the container.
   402  	containerID string
   403  
   404  	// sandboxID is the ID for the whole sandbox.
   405  	sandboxID     string
   406  	containerName string
   407  
   408  	// cgroupsMounted indicates if cgroups are mounted in the container.
   409  	// This is used to set the InitialCgroups before starting the container
   410  	// process.
   411  	cgroupsMounted bool
   412  }
   413  
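         // newContainerMounter creates a containerMounter for the given container,
         // compiling its mounts from the spec.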
   414  func newContainerMounter(info *containerInfo, k *kernel.Kernel, hints *PodMountHints, sharedMounts map[string]*vfs.Mount, productName string, sandboxID string) *containerMounter {
   415  	return &containerMounter{
   416  		root:              info.spec.Root,
   417  		mounts:            compileMounts(info.spec, info.conf, info.procArgs.ContainerID),
   418  		goferFDs:          fdDispenser{fds: info.goferFDs},
   419  		goferFilestoreFDs: fdDispenser{fds: info.goferFilestoreFDs},
   420  		devGoferFD:        info.devGoferFD,
   421  		goferMountConfs:   info.goferMountConfs,
   422  		k:                 k,
   423  		hints:             hints,
   424  		sharedMounts:      sharedMounts,
   425  		productName:       productName,
   426  		containerID:       info.procArgs.ContainerID,
   427  		sandboxID:         sandboxID,
   428  		containerName:     info.containerName,
   429  	}
   430  }
   431  
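         // checkDispenser returns an error if any gofer FD, gofer filestore FD, or
         // dev gofer FD has not been consumed.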
   432  func (c *containerMounter) checkDispenser() error {
   433  	if !c.goferFDs.empty() {
   434  		return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.goferFDs)
   435  	}
   436  	if !c.goferFilestoreFDs.empty() {
   437  		return fmt.Errorf("not all gofer Filestore FDs were consumed, remaining: %v", c.goferFilestoreFDs)
   438  	}
   439  	if c.devGoferFD != nil && c.devGoferFD.FD() >= 0 {
   440  		return fmt.Errorf("dev gofer FD was not consumed: %d", c.devGoferFD.FD())
   441  	}
   442  	return nil
   443  }
   444  
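         // getMountAccessType returns the mount's file access type from its hint if
         // available, falling back to the config.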
   445  func getMountAccessType(conf *config.Config, hint *MountHint) config.FileAccessType {
   446  	if hint != nil {
   447  		return hint.fileAccessType()
   448  	}
   449  	return conf.FileAccessMounts
   450  }
   451  
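         // mountAll creates the container's mount namespace and mounts the root
         // filesystem and all submounts into it.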
   452  func (c *containerMounter) mountAll(rootCtx context.Context, rootCreds *auth.Credentials, spec *specs.Spec, conf *config.Config, rootProcArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
   453  	log.Infof("Configuring container's file system")
   454  
   455  	mns, err := c.createMountNamespace(rootCtx, conf, rootCreds)
   456  	if err != nil {
   457  		return nil, fmt.Errorf("creating mount namespace: %w", err)
   458  	}
   459  	rootProcArgs.MountNamespace = mns
   460  
   461  	root := mns.Root(rootCtx)
   462  	defer root.DecRef(rootCtx)
   463  	if root.Mount().ReadOnly() {
   464  		// Switch to ReadWrite while we setup submounts.
   465  		if err := c.k.VFS().SetMountReadOnly(root.Mount(), false); err != nil {
   466  			return nil, fmt.Errorf(`failed to set mount at "/" readwrite: %w`, err)
   467  		}
   468  		// Restore back to ReadOnly at the end.
   469  		defer func() {
   470  			if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil {
   471  				panic(fmt.Sprintf(`failed to restore mount at "/" back to readonly: %v`, err))
   472  			}
   473  		}()
   474  	}
   475  
   476  	// Mount submounts.
   477  	if err := c.mountSubmounts(rootCtx, spec, conf, mns, rootCreds); err != nil {
   478  		return nil, fmt.Errorf("mounting submounts: %w", err)
   479  	}
   480  
   481  	return mns, nil
   482  }
   483  
   484  // createMountNamespace creates the container's root mount and namespace.
   485  func (c *containerMounter) createMountNamespace(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
   486  	ioFD := c.goferFDs.remove()
   487  	rootfsConf := c.goferMountConfs[0]
   488  
   489  	var (
   490  		fsName string
   491  		opts   *vfs.MountOptions
   492  	)
   493  	switch {
   494  	case rootfsConf.ShouldUseLisafs():
   495  		fsName = gofer.Name
   496  
   497  		data := goferMountData(ioFD, conf.FileAccess, conf)
   498  
    499  		// We can't check for overlayfs here because the sandbox is chroot'ed and
    500  		// the gofer can only send mount options for specs.Mounts (specs.Root is
    501  		// missing the Options field). So assume root is always on top of overlayfs.
   502  		data = append(data, "overlayfs_stale_read")
   503  
   504  		// Configure the gofer dentry cache size.
   505  		gofer.SetDentryCacheSize(conf.DCache)
   506  
   507  		opts = &vfs.MountOptions{
   508  			ReadOnly: c.root.Readonly,
   509  			GetFilesystemOptions: vfs.GetFilesystemOptions{
   510  				InternalMount: true,
   511  				Data:          strings.Join(data, ","),
   512  				InternalData: gofer.InternalFilesystemOptions{
   513  					UniqueID: vfs.RestoreID{
   514  						ContainerName: c.containerName,
   515  						Path:          "/",
   516  					},
   517  				},
   518  			},
   519  		}
   520  
   521  	case rootfsConf.ShouldUseErofs():
   522  		fsName = erofs.Name
   523  		opts = &vfs.MountOptions{
   524  			ReadOnly: c.root.Readonly,
   525  			GetFilesystemOptions: vfs.GetFilesystemOptions{
   526  				InternalMount: true,
   527  				Data:          fmt.Sprintf("ifd=%d", ioFD),
   528  				InternalData: erofs.InternalFilesystemOptions{
   529  					UniqueID: vfs.RestoreID{
   530  						ContainerName: c.containerName,
   531  						Path:          "/",
   532  					},
   533  				},
   534  			},
   535  		}
   536  
   537  	default:
   538  		return nil, fmt.Errorf("unsupported rootfs config: %+v", rootfsConf)
   539  	}
   540  
   541  	log.Infof("Mounting root with %s, ioFD: %d", fsName, ioFD)
   542  
   543  	if rootfsConf.ShouldUseOverlayfs() {
   544  		log.Infof("Adding overlay on top of root")
   545  		var (
   546  			err         error
   547  			cleanup     func()
   548  			filestoreFD *fd.FD
   549  		)
   550  		if rootfsConf.IsFilestorePresent() {
   551  			filestoreFD = c.goferFilestoreFDs.removeAsFD()
   552  		}
   553  		opts, cleanup, err = c.configureOverlay(ctx, conf, creds, opts, fsName, filestoreFD, rootfsConf, "/")
   554  		if err != nil {
   555  			return nil, fmt.Errorf("mounting root with overlay: %w", err)
   556  		}
   557  		defer cleanup()
   558  		fsName = overlay.Name
   559  	}
   560  
   561  	// The namespace root mount can't be changed, so let's mount a dummy
   562  	// read-only tmpfs here. It simplifies creation of containers without
   563  	// leaking the root file system.
   564  	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "rootfs", "tmpfs",
   565  		&vfs.MountOptions{ReadOnly: true, Locked: true}, c.k)
   566  	if err != nil {
   567  		return nil, fmt.Errorf("setting up mount namespace: %w", err)
   568  	}
   569  	defer mns.DecRef(ctx)
   570  
   571  	mnt, err := c.k.VFS().MountDisconnected(ctx, creds, "root", fsName, opts)
   572  	if err != nil {
   573  		return nil, fmt.Errorf("creating root file system: %w", err)
   574  	}
   575  	defer mnt.DecRef(ctx)
   576  	root := mns.Root(ctx)
   577  	defer root.DecRef(ctx)
   578  	target := &vfs.PathOperation{
   579  		Root:  root,
   580  		Start: root,
   581  	}
   582  	if err := c.k.VFS().ConnectMountAt(ctx, creds, mnt, target); err != nil {
   583  		return nil, fmt.Errorf("mounting root file system: %w", err)
   584  	}
   585  
   586  	mns.IncRef()
   587  	return mns, nil
   588  }
   589  
    590  // configureOverlay mounts the lower layer using "lowerOpts", mounts the upper
    591  // layer using tmpfs, and returns the overlay mount options. "cleanup" must be
    592  // called after the options have been used to mount the overlay, to release
    593  // refs on the lower and upper mounts.
   594  func (c *containerMounter) configureOverlay(ctx context.Context, conf *config.Config, creds *auth.Credentials, lowerOpts *vfs.MountOptions, lowerFSName string, filestoreFD *fd.FD, mountConf GoferMountConf, dst string) (*vfs.MountOptions, func(), error) {
   595  	// First copy options from lower layer to upper layer and overlay. Clear
   596  	// filesystem specific options.
   597  	upperOpts := *lowerOpts
   598  	upperOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{InternalMount: true}
   599  
   600  	overlayOpts := *lowerOpts
   601  	overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{InternalMount: true}
   602  
   603  	// All writes go to the upper layer, be paranoid and make lower readonly.
   604  	lowerOpts.ReadOnly = true
   605  	lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts)
   606  	if err != nil {
   607  		return nil, nil, err
   608  	}
   609  	cu := cleanup.Make(func() { lower.DecRef(ctx) })
   610  	defer cu.Clean()
   611  
   612  	// Determine the lower layer's root's type.
   613  	lowerRootVD := vfs.MakeVirtualDentry(lower, lower.Root())
   614  	stat, err := c.k.VFS().StatAt(ctx, creds, &vfs.PathOperation{
   615  		Root:  lowerRootVD,
   616  		Start: lowerRootVD,
   617  	}, &vfs.StatOptions{
   618  		Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE | linux.STATX_TYPE,
   619  	})
   620  	if err != nil {
   621  		return nil, nil, fmt.Errorf("failed to stat lower layer's root: %v", err)
   622  	}
   623  	if stat.Mask&linux.STATX_TYPE == 0 {
   624  		return nil, nil, fmt.Errorf("failed to get file type of lower layer's root")
   625  	}
   626  	rootType := stat.Mode & linux.S_IFMT
   627  	if rootType != linux.S_IFDIR && rootType != linux.S_IFREG {
   628  		return nil, nil, fmt.Errorf("lower layer's root has unsupported file type %v", rootType)
   629  	}
   630  
   631  	// Upper is a tmpfs mount to keep all modifications inside the sandbox.
   632  	tmpfsOpts := tmpfs.FilesystemOpts{
   633  		RootFileType: uint16(rootType),
   634  		// If a mount is being overlaid, it should not be limited by the default
   635  		// tmpfs size limit.
   636  		DisableDefaultSizeLimit: true,
   637  	}
   638  	if filestoreFD != nil {
   639  		// Create memory file for disk-backed overlays.
   640  		mf, err := createPrivateMemoryFile(filestoreFD.ReleaseToFile("overlay-filestore"), vfs.RestoreID{ContainerName: c.containerName, Path: dst})
   641  		if err != nil {
   642  			return nil, nil, fmt.Errorf("failed to create memory file for overlay: %v", err)
   643  		}
   644  		tmpfsOpts.MemoryFile = mf
   645  	}
   646  	upperOpts.GetFilesystemOptions.InternalData = tmpfsOpts
   647  	upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts)
   648  	if err != nil {
   649  		return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err)
   650  	}
   651  	cu.Add(func() { upper.DecRef(ctx) })
   652  
   653  	// If the overlay mount consists of a regular file, copy up its contents
   654  	// from the lower layer, since in the overlay the otherwise-empty upper
   655  	// layer file will take precedence.
   656  	upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root())
   657  	if rootType == linux.S_IFREG {
   658  		lowerFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{
   659  			Root:  lowerRootVD,
   660  			Start: lowerRootVD,
   661  		}, &vfs.OpenOptions{
   662  			Flags: linux.O_RDONLY,
   663  		})
   664  		if err != nil {
   665  			return nil, nil, fmt.Errorf("failed to open lower layer root for copying: %v", err)
   666  		}
   667  		defer lowerFD.DecRef(ctx)
   668  		upperFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{
   669  			Root:  upperRootVD,
   670  			Start: upperRootVD,
   671  		}, &vfs.OpenOptions{
   672  			Flags: linux.O_WRONLY,
   673  		})
   674  		if err != nil {
   675  			return nil, nil, fmt.Errorf("failed to open upper layer root for copying: %v", err)
   676  		}
   677  		defer upperFD.DecRef(ctx)
   678  		if _, err := vfs.CopyRegularFileData(ctx, upperFD, lowerFD); err != nil {
   679  			return nil, nil, fmt.Errorf("failed to copy up overlay file: %v", err)
   680  		}
   681  	}
   682  
   683  	// We need to hide the filestore from the containerized application.
   684  	if mountConf.IsSelfBacked() {
   685  		if err := overlay.CreateWhiteout(ctx, c.k.VFS(), creds, &vfs.PathOperation{
   686  			Root:  upperRootVD,
   687  			Start: upperRootVD,
   688  			Path:  fspath.Parse(selfFilestoreName(c.sandboxID)),
   689  		}); err != nil {
   690  			return nil, nil, fmt.Errorf("failed to create whiteout to hide self overlay filestore: %w", err)
   691  		}
   692  	}
   693  
   694  	// Propagate the lower layer's root's owner, group, and mode to the upper
   695  	// layer's root for consistency with VFS1.
   696  	err = c.k.VFS().SetStatAt(ctx, creds, &vfs.PathOperation{
   697  		Root:  upperRootVD,
   698  		Start: upperRootVD,
   699  	}, &vfs.SetStatOptions{
   700  		Stat: linux.Statx{
   701  			Mask: (linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE) & stat.Mask,
   702  			UID:  stat.UID,
   703  			GID:  stat.GID,
   704  			Mode: stat.Mode,
   705  		},
   706  	})
   707  	if err != nil {
   708  		return nil, nil, err
   709  	}
   710  
   711  	// Configure overlay with both layers.
   712  	overlayOpts.GetFilesystemOptions.InternalData = overlay.FilesystemOptions{
   713  		UpperRoot:  upperRootVD,
   714  		LowerRoots: []vfs.VirtualDentry{lowerRootVD},
   715  	}
   716  	return &overlayOpts, cu.Release(), nil
   717  }
   718  
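         // mountSubmounts mounts all submounts for the container, handling shared
         // mounts, cgroup mounts, and the internal "/tmp" mount.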
   719  func (c *containerMounter) mountSubmounts(ctx context.Context, spec *specs.Spec, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
   720  	mounts, err := c.prepareMounts()
   721  	if err != nil {
   722  		return err
   723  	}
   724  
   725  	for i := range mounts {
   726  		submount := &mounts[i]
   727  		log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.mount.Source, submount.mount.Destination, submount.mount.Type, submount.mount.Options)
   728  		var (
   729  			mnt *vfs.Mount
   730  			err error
   731  		)
   732  
   733  		if submount.hint != nil && submount.hint.ShouldShareMount() {
   734  			sharedMount, err := c.getSharedMount(ctx, spec, conf, submount, creds)
   735  			if err != nil {
   736  				return fmt.Errorf("getting shared mount %q: %w", submount.hint.Name, err)
   737  			}
   738  			mnt, err = c.mountSharedSubmount(ctx, conf, mns, creds, submount, sharedMount)
   739  			if err != nil {
   740  				return fmt.Errorf("mount shared mount %q to %q: %v", submount.hint.Name, submount.mount.Destination, err)
   741  			}
   742  		} else if submount.mount.Type == cgroupfs.Name {
   743  			// Mount all the cgroups controllers.
   744  			if err := c.mountCgroupSubmounts(ctx, spec, conf, mns, creds, submount); err != nil {
   745  				return fmt.Errorf("mount cgroup %q: %w", submount.mount.Destination, err)
   746  			}
   747  		} else {
   748  			mnt, err = c.mountSubmount(ctx, spec, conf, mns, creds, submount)
   749  			if err != nil {
   750  				return fmt.Errorf("mount submount %q: %w", submount.mount.Destination, err)
   751  			}
   752  		}
   753  
   754  		if mnt != nil && mnt.ReadOnly() {
   755  			// Switch to ReadWrite while we setup submounts.
   756  			if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil {
   757  				return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.mount.Destination, err)
   758  			}
   759  			// Restore back to ReadOnly at the end.
   760  			defer func() {
   761  				if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil {
   762  					panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.mount.Destination, err))
   763  				}
   764  			}()
   765  		}
   766  	}
   767  
   768  	if err := c.mountTmp(ctx, spec, conf, creds, mns); err != nil {
   769  		return fmt.Errorf(`mount submount "/tmp": %w`, err)
   770  	}
   771  	return nil
   772  }
   773  
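         // mountInfo bundles a spec mount with its hint, gofer mount configuration,
         // and associated FDs.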
   774  type mountInfo struct {
   775  	mount          *specs.Mount
   776  	goferFD        *fd.FD
   777  	hint           *MountHint
   778  	goferMountConf GoferMountConf
   779  	filestoreFD    *fd.FD
   780  }
   781  
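         // prepareMounts associates each mount with its gofer configuration and FDs
         // and returns the mounts sorted so that parents are mounted before children.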
   782  func (c *containerMounter) prepareMounts() ([]mountInfo, error) {
   783  	// If device gofer exists, connect to it.
   784  	if c.devGoferFD != nil {
   785  		if err := c.k.AddDevGofer(c.containerID, c.devGoferFD.Release()); err != nil {
   786  			return nil, err
   787  		}
   788  	}
   789  	// Associate bind mounts with their FDs before sorting since there is an
   790  	// undocumented assumption that FDs are dispensed in the order in which
   791  	// they are required by mounts.
   792  	var mounts []mountInfo
   793  	goferMntIdx := 1 // First index is for rootfs.
   794  	for i := range c.mounts {
   795  		info := mountInfo{
   796  			mount: &c.mounts[i],
   797  			hint:  c.hints.FindMount(c.mounts[i].Source),
   798  		}
   799  		specutils.MaybeConvertToBindMount(info.mount)
   800  		if specutils.IsGoferMount(*info.mount) {
   801  			info.goferMountConf = c.goferMountConfs[goferMntIdx]
   802  			if info.goferMountConf.ShouldUseLisafs() {
   803  				info.goferFD = c.goferFDs.removeAsFD()
   804  			}
   805  			if info.goferMountConf.IsFilestorePresent() {
   806  				info.filestoreFD = c.goferFilestoreFDs.removeAsFD()
   807  			}
   808  			if info.goferMountConf.ShouldUseTmpfs() {
   809  				specutils.ChangeMountType(info.mount, tmpfs.Name)
   810  			}
   811  			goferMntIdx++
   812  		}
   813  		mounts = append(mounts, info)
   814  	}
   815  	if err := c.checkDispenser(); err != nil {
   816  		return nil, err
   817  	}
   818  
   819  	// Sort the mounts so that we don't place children before parents.
   820  	sort.Slice(mounts, func(i, j int) bool {
   821  		return len(mounts[i].mount.Destination) < len(mounts[j].mount.Destination)
   822  	})
   823  
   824  	return mounts, nil
   825  }
   826  
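         // mountSubmount mounts a single submount at its destination, optionally
         // adding an overlay on top of it.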
   827  func (c *containerMounter) mountSubmount(ctx context.Context, spec *specs.Spec, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountInfo) (*vfs.Mount, error) {
   828  	fsName, opts, err := getMountNameAndOptions(spec, conf, submount, c.productName, c.containerName)
   829  	if err != nil {
   830  		return nil, fmt.Errorf("mountOptions failed: %w", err)
   831  	}
   832  	if len(fsName) == 0 {
   833  		// Filesystem is not supported (e.g. cgroup), just skip it.
   834  		return nil, nil
   835  	}
   836  
   837  	if err := c.makeMountPoint(ctx, creds, mns, submount.mount.Destination); err != nil {
   838  		return nil, fmt.Errorf("creating mount point %q: %w", submount.mount.Destination, err)
   839  	}
   840  
   841  	if submount.goferMountConf.ShouldUseOverlayfs() {
   842  		log.Infof("Adding overlay on top of mount %q", submount.mount.Destination)
   843  		var cleanup func()
   844  		opts, cleanup, err = c.configureOverlay(ctx, conf, creds, opts, fsName, submount.filestoreFD, submount.goferMountConf, submount.mount.Destination)
   845  		if err != nil {
   846  			return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.mount.Destination, err)
   847  		}
   848  		defer cleanup()
   849  		fsName = overlay.Name
   850  	}
   851  
   852  	root := mns.Root(ctx)
   853  	defer root.DecRef(ctx)
   854  	target := &vfs.PathOperation{
   855  		Root:  root,
   856  		Start: root,
   857  		Path:  fspath.Parse(submount.mount.Destination),
   858  	}
   859  	mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts)
   860  	if err != nil {
   861  		return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.mount.Destination, submount.mount.Type, err, opts)
   862  	}
   863  	log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.mount.Source, submount.mount.Destination, submount.mount.Type, opts.GetFilesystemOptions.Data)
   864  	return mnt, nil
   865  }
   866  
    867  // getMountNameAndOptions retrieves the filesystem name and mount options to
    868  // be used for the given mount.
   869  func getMountNameAndOptions(spec *specs.Spec, conf *config.Config, m *mountInfo, productName, containerName string) (string, *vfs.MountOptions, error) {
   870  	fsName := m.mount.Type
   871  	var (
   872  		mopts        = m.mount.Options
   873  		data         []string
   874  		internalData any
   875  	)
   876  
   877  	// Find filesystem name and FS specific data field.
   878  	switch m.mount.Type {
   879  	case devpts.Name, dev.Name, proc.Name:
   880  		// Nothing to do.
   881  
   882  	case Nonefs:
   883  		fsName = sys.Name
   884  
   885  	case sys.Name:
   886  		sysData := &sys.InternalData{EnableTPUProxyPaths: specutils.TPUProxyIsEnabled(spec, conf)}
   887  		if len(productName) > 0 {
   888  			sysData.ProductName = productName
   889  		}
   890  		internalData = sysData
   891  
   892  	case tmpfs.Name:
   893  		var err error
   894  		mopts, data, err = consumeMountOptions(mopts, tmpfsAllowedData...)
   895  		if err != nil {
   896  			return "", nil, err
   897  		}
   898  		if m.filestoreFD != nil {
   899  			mf, err := createPrivateMemoryFile(m.filestoreFD.ReleaseToFile("tmpfs-filestore"), vfs.RestoreID{ContainerName: containerName, Path: m.mount.Destination})
   900  			if err != nil {
   901  				return "", nil, fmt.Errorf("failed to create memory file for tmpfs: %v", err)
   902  			}
   903  			internalData = tmpfs.FilesystemOpts{
   904  				MemoryFile: mf,
   905  				// If a mount is being overlaid with tmpfs, it should not be limited by
   906  				// the default tmpfs size limit.
   907  				DisableDefaultSizeLimit: true,
   908  			}
   909  		}
   910  
   911  	case Bind:
   912  		fsName = gofer.Name
   913  		if m.goferFD == nil {
    914  			// Check that an FD was provided so we fail fast.
   915  			return "", nil, fmt.Errorf("gofer mount requires a connection FD")
   916  		}
   917  		var err error
   918  		mopts, data, err = consumeMountOptions(mopts, gofer.SupportedMountOptions...)
   919  		if err != nil {
   920  			return "", nil, err
   921  		}
   922  		data = append(data, goferMountData(m.goferFD.Release(), getMountAccessType(conf, m.hint), conf)...)
   923  		internalData = gofer.InternalFilesystemOptions{
   924  			UniqueID: vfs.RestoreID{
   925  				ContainerName: containerName,
   926  				Path:          m.mount.Destination,
   927  			},
   928  		}
   929  
   930  	case cgroupfs.Name:
   931  		var err error
   932  		mopts, data, err = consumeMountOptions(mopts, cgroupfs.SupportedMountOptions...)
   933  		if err != nil {
   934  			return "", nil, err
   935  		}
   936  
   937  	default:
   938  		log.Warningf("ignoring unknown filesystem type %q", m.mount.Type)
   939  		return "", nil, nil
   940  	}
   941  
   942  	opts := ParseMountOptions(mopts)
   943  	opts.GetFilesystemOptions = vfs.GetFilesystemOptions{
   944  		Data:          strings.Join(data, ","),
   945  		InternalData:  internalData,
   946  		InternalMount: true,
   947  	}
   948  
   949  	return fsName, opts, nil
   950  }
   951  
   952  // ParseMountOptions converts specs.Mount.Options to vfs.MountOptions.
   953  func ParseMountOptions(opts []string) *vfs.MountOptions {
   954  	mountOpts := &vfs.MountOptions{
   955  		GetFilesystemOptions: vfs.GetFilesystemOptions{
   956  			InternalMount: true,
   957  		},
   958  	}
   959  	// Note: update mountHint.CheckCompatible when more options are added.
   960  	for _, o := range opts {
   961  		switch o {
   962  		case "ro":
   963  			mountOpts.ReadOnly = true
   964  		case "noatime":
   965  			mountOpts.Flags.NoATime = true
   966  		case "noexec":
   967  			mountOpts.Flags.NoExec = true
   968  		case "rw", "atime", "exec":
   969  			// These use the default value and don't need to be set.
   970  		case "bind", "rbind":
   971  			// These are the same as a mount with type="bind".
   972  		default:
   973  			log.Warningf("ignoring unknown mount option %q", o)
   974  		}
   975  	}
   976  	return mountOpts
   977  }
   978  
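         // parseKeyValue splits a "key=value" string and reports whether it was
         // well formed.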
   979  func parseKeyValue(s string) (string, string, bool) {
   980  	tokens := strings.SplitN(s, "=", 2)
   981  	if len(tokens) < 2 {
   982  		return "", "", false
   983  	}
   984  	return strings.TrimSpace(tokens[0]), strings.TrimSpace(tokens[1]), true
   985  }
   986  
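         // createPrivateMemoryFile creates a disk-backed pgalloc.MemoryFile from the
         // given file, using restoreID to identify it across save/restore.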
   987  func createPrivateMemoryFile(file *os.File, restoreID vfs.RestoreID) (*pgalloc.MemoryFile, error) {
   988  	mfOpts := pgalloc.MemoryFileOpts{
   989  		// Private memory files are usually backed by files on disk. Ideally we
   990  		// would confirm with fstatfs(2) but that is prohibited by seccomp.
   991  		DiskBackedFile: true,
    992  		// Disk-backed files need to be decommitted on destroy to release disk space.
   993  		DecommitOnDestroy: true,
   994  		// sentry's seccomp filters don't allow the mmap(2) syscalls that
   995  		// pgalloc.IMAWorkAroundForMemFile() uses. Users of private memory files
   996  		// are expected to have performed the work around outside the sandbox.
   997  		DisableIMAWorkAround: true,
   998  		// Private memory files need to be restored correctly using this ID.
   999  		RestoreID: restoreID.String(),
  1000  	}
  1001  	return pgalloc.NewMemoryFile(file, mfOpts)
  1002  }
  1003  
  1004  // mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so.
  1005  // Technically we don't have to mount tmpfs at /tmp, as we could just rely on
  1006  // the host /tmp, but this is a nice optimization, and fixes some apps that call
  1007  // mknod in /tmp. It's unsafe to mount tmpfs if:
   1008  //  1. /tmp is mounted explicitly: we should not override the user's wish
  1009  //  2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
  1010  //
  1011  // Note that when there are submounts inside of '/tmp', directories for the
  1012  // mount points must be present, making '/tmp' not empty anymore.
  1013  func (c *containerMounter) mountTmp(ctx context.Context, spec *specs.Spec, conf *config.Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
  1014  	for _, m := range c.mounts {
   1015  		// m.Destination has been cleaned, so it's safe to use equality here.
   1016  		if m.Destination == "/tmp" {
   1017  			log.Debugf(`Explicit "/tmp" mount found, skipping internal tmpfs, mount: %+v`, m)
  1018  			return nil
  1019  		}
  1020  	}
  1021  
  1022  	root := mns.Root(ctx)
  1023  	defer root.DecRef(ctx)
  1024  	pop := vfs.PathOperation{
  1025  		Root:  root,
  1026  		Start: root,
  1027  		Path:  fspath.Parse("/tmp"),
  1028  	}
  1029  	fd, err := c.k.VFS().OpenAt(ctx, creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY})
  1030  	switch {
  1031  	case err == nil:
  1032  		defer fd.DecRef(ctx)
  1033  
  1034  		err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
  1035  			if dirent.Name != "." && dirent.Name != ".." {
  1036  				return linuxerr.ENOTEMPTY
  1037  			}
  1038  			return nil
  1039  		}))
  1040  		switch {
  1041  		case err == nil:
  1042  			log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`)
  1043  		case linuxerr.Equals(linuxerr.ENOTEMPTY, err):
  1044  			// If more than "." and ".." is found, skip internal tmpfs to prevent
  1045  			// hiding existing files.
  1046  			log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`)
  1047  			return nil
  1048  		default:
  1049  			return fmt.Errorf("fd.IterDirents failed: %v", err)
  1050  		}
  1051  		fallthrough
  1052  
  1053  	case linuxerr.Equals(linuxerr.ENOENT, err):
  1054  		// No '/tmp' found (or fallthrough from above). It's safe to mount internal
  1055  		// tmpfs.
  1056  		tmpMount := specs.Mount{
  1057  			Type:        tmpfs.Name,
  1058  			Destination: "/tmp",
  1059  			// Sticky bit is added to prevent accidental deletion of files from
  1060  			// another user. This is normally done for /tmp.
  1061  			Options: []string{"mode=01777"},
  1062  		}
  1063  		if _, err := c.mountSubmount(ctx, spec, conf, mns, creds, &mountInfo{mount: &tmpMount}); err != nil {
  1064  			return fmt.Errorf("mountSubmount failed: %v", err)
  1065  		}
  1066  		return nil
  1067  
  1068  	case linuxerr.Equals(linuxerr.ENOTDIR, err):
  1069  		// Not a dir?! Let it be.
  1070  		return nil
  1071  
  1072  	default:
  1073  		return fmt.Errorf(`opening "/tmp" inside container: %w`, err)
  1074  	}
  1075  }
  1076  
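         // getSharedMount returns the shared mount for the given mount's hint,
         // creating and caching it on first use.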
  1077  func (c *containerMounter) getSharedMount(ctx context.Context, spec *specs.Spec, conf *config.Config, mount *mountInfo, creds *auth.Credentials) (*vfs.Mount, error) {
  1078  	sharedMount, ok := c.sharedMounts[mount.hint.Mount.Source]
  1079  	if ok {
  1080  		log.Infof("Using existing shared mount %q from %q type %q", mount.hint.Name, mount.hint.Mount.Source, mount.hint.Mount.Type)
  1081  		if mount.goferFD != nil {
  1082  			panic(fmt.Errorf("extra goferFD provided for shared mount %q", mount.hint.Name))
  1083  		}
  1084  		if mount.filestoreFD != nil {
  1085  			mount.filestoreFD.Close()
  1086  		}
  1087  		return sharedMount, nil
  1088  	}
  1089  	log.Infof("Mounting master of shared mount %q from %q type %q", mount.hint.Name, mount.hint.Mount.Source, mount.hint.Mount.Type)
  1090  	sharedMount, err := c.mountSharedMaster(ctx, spec, conf, mount, creds)
  1091  	if err != nil {
  1092  		return nil, fmt.Errorf("mounting shared master %q: %v", mount.hint.Name, err)
  1093  	}
  1094  	c.sharedMounts[mount.hint.Mount.Source] = sharedMount
  1095  	return sharedMount, nil
  1096  }
  1097  
  1098  // mountCgroupMounts mounts the cgroups which are shared across all containers.
  1099  // Postcondition: Initialized k.cgroupMounts on success.
  1100  func (l *Loader) mountCgroupMounts(conf *config.Config, creds *auth.Credentials) error {
  1101  	ctx := l.k.SupervisorContext()
  1102  	for _, sopts := range kernel.CgroupCtrls {
  1103  		mopts := &vfs.MountOptions{
  1104  			GetFilesystemOptions: vfs.GetFilesystemOptions{
  1105  				Data:          string(sopts),
  1106  				InternalMount: true,
  1107  			},
  1108  		}
  1109  		fs, root, err := l.k.VFS().NewFilesystem(ctx, creds, "cgroup", cgroupfs.Name, mopts)
  1110  		if err != nil {
  1111  			return err
  1112  		}
  1113  
  1114  		mount := l.k.VFS().NewDisconnectedMount(fs, root, mopts)
   1115  		// Private so that mounts created by one container do not appear
   1116  		// in other containers' cgroup paths.
  1117  		l.k.VFS().SetMountPropagation(mount, linux.MS_PRIVATE, false)
  1118  		l.k.AddCgroupMount(string(sopts), &kernel.CgroupMount{
  1119  			Fs:    fs,
  1120  			Root:  root,
  1121  			Mount: mount,
  1122  		})
  1123  	}
  1124  	log.Infof("created cgroup mounts for controllers %v", kernel.CgroupCtrls)
  1125  	return nil
  1126  }
  1127  
   1128  // mountCgroupSubmounts mounts all the cgroup controller submounts for the
   1129  // container. For each controller, a directory named after containerID is
   1130  // created under the root controller mount and then bind mounted inside the
   1131  // container's mount namespace.
  1132  func (c *containerMounter) mountCgroupSubmounts(ctx context.Context, spec *specs.Spec, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountInfo) error {
  1133  	root := mns.Root(ctx)
  1134  	defer root.DecRef(ctx)
  1135  
  1136  	// Mount "/sys/fs/cgroup" in the container's mount namespace.
  1137  	submount.mount.Type = tmpfs.Name
  1138  	mnt, err := c.mountSubmount(ctx, spec, conf, mns, creds, submount)
  1139  	if err != nil {
  1140  		return err
  1141  	}
  1142  	if mnt != nil && mnt.ReadOnly() {
  1143  		// Switch to ReadWrite while we setup submounts.
  1144  		if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil {
  1145  			return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.mount.Destination, err)
  1146  		}
  1147  		// Restore back to ReadOnly at the end.
  1148  		defer func() {
  1149  			if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil {
  1150  				panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.mount.Destination, err))
  1151  			}
  1152  		}()
  1153  	}
  1154  
  1155  	// Mount all the cgroup controllers in the container's mount namespace.
  1156  	mountCtx := vfs.WithRoot(vfs.WithMountNamespace(ctx, mns), root)
  1157  	for _, ctrl := range kernel.CgroupCtrls {
  1158  		ctrlName := string(ctrl)
  1159  		cgroupMnt := c.k.GetCgroupMount(ctrlName)
  1160  		if cgroupMnt == nil {
  1161  			return fmt.Errorf("cgroup mount for controller %s not found", ctrlName)
  1162  		}
  1163  
  1164  		cgroupMntVD := vfs.MakeVirtualDentry(cgroupMnt.Mount, cgroupMnt.Root)
  1165  		sourcePop := vfs.PathOperation{
  1166  			Root:  cgroupMntVD,
  1167  			Start: cgroupMntVD,
  1168  			// Use the containerID as the cgroup path.
  1169  			Path: fspath.Parse(c.containerID),
  1170  		}
  1171  		if err := c.k.VFS().MkdirAt(mountCtx, creds, &sourcePop, &vfs.MkdirOptions{
  1172  			Mode: 0755,
  1173  		}); err != nil {
   1174  			log.Infof("error creating directory: %v", err)
  1175  			return err
  1176  		}
  1177  
  1178  		// Bind mount the new cgroup directory into the container's mount namespace.
  1179  		destination := "/sys/fs/cgroup/" + ctrlName
  1180  		if err := c.k.VFS().MakeSyntheticMountpoint(mountCtx, destination, root, creds); err != nil {
  1181  			// Log a warning, but attempt the mount anyway.
  1182  			log.Warningf("Failed to create mount point %q: %v", destination, err)
  1183  		}
  1184  
  1185  		target := &vfs.PathOperation{
  1186  			Root:  root,
  1187  			Start: root,
  1188  			Path:  fspath.Parse(destination),
  1189  		}
  1190  		if err := c.k.VFS().BindAt(mountCtx, creds, &sourcePop, target, false); err != nil {
   1191  			log.Infof("error bind mounting: %v", err)
  1192  			return err
  1193  		}
  1194  	}
  1195  	c.cgroupsMounted = true
  1196  	return nil
  1197  }
  1198  
  1199  // mountSharedMaster mounts the master of a volume that is shared among
  1200  // containers in a pod.
  1201  func (c *containerMounter) mountSharedMaster(ctx context.Context, spec *specs.Spec, conf *config.Config, mntInfo *mountInfo, creds *auth.Credentials) (*vfs.Mount, error) {
  1202  	// Mount the master using the options from the hint (mount annotations).
  1203  	origOpts := mntInfo.mount.Options
  1204  	mntInfo.mount.Options = mntInfo.hint.Mount.Options
  1205  	fsName, opts, err := getMountNameAndOptions(spec, conf, mntInfo, c.productName, c.containerName)
  1206  	mntInfo.mount.Options = origOpts
  1207  	if err != nil {
  1208  		return nil, err
  1209  	}
  1210  	if len(fsName) == 0 {
  1211  		return nil, fmt.Errorf("mount type not supported %q", mntInfo.hint.Mount.Type)
  1212  	}
  1213  	return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts)
  1214  }
  1215  
   1216  // mountSharedSubmount bind mounts to a previously mounted volume that is
   1217  // shared among containers in the same pod.
  1218  func (c *containerMounter) mountSharedSubmount(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mntInfo *mountInfo, sharedMount *vfs.Mount) (*vfs.Mount, error) {
  1219  	if err := mntInfo.hint.checkCompatible(mntInfo.mount); err != nil {
  1220  		return nil, err
  1221  	}
  1222  
  1223  	// Generate mount point specific opts using mntInfo.mount.
  1224  	opts := ParseMountOptions(mntInfo.mount.Options)
  1225  	newMnt := c.k.VFS().NewDisconnectedMount(sharedMount.Filesystem(), sharedMount.Root(), opts)
  1226  	defer newMnt.DecRef(ctx)
  1227  
  1228  	root := mns.Root(ctx)
  1229  	defer root.DecRef(ctx)
  1230  	target := &vfs.PathOperation{
  1231  		Root:  root,
  1232  		Start: root,
  1233  		Path:  fspath.Parse(mntInfo.mount.Destination),
  1234  	}
  1235  
  1236  	if err := c.makeMountPoint(ctx, creds, mns, mntInfo.mount.Destination); err != nil {
  1237  		return nil, fmt.Errorf("creating mount point %q: %w", mntInfo.mount.Destination, err)
  1238  	}
  1239  
  1240  	if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil {
  1241  		return nil, err
  1242  	}
  1243  	log.Infof("Mounted %q type shared bind to %q", mntInfo.mount.Destination, mntInfo.hint.Name)
  1244  	return newMnt, nil
  1245  }
  1246  
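         // makeMountPoint ensures that a mount point exists at dest, creating a
         // synthetic one if necessary.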
  1247  func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error {
  1248  	root := mns.Root(ctx)
  1249  	defer root.DecRef(ctx)
  1250  	target := &vfs.PathOperation{
  1251  		Root:  root,
  1252  		Start: root,
  1253  		Path:  fspath.Parse(dest),
  1254  	}
   1255  	// First check if the mount point exists. When overlay is enabled, the gofer
   1256  	// doesn't allow changes to the FS, making MakeSyntheticMountpoint() ineffective
   1257  	// because MkdirAt fails with EROFS even if the file exists.
  1258  	vd, err := c.k.VFS().GetDentryAt(ctx, creds, target, &vfs.GetDentryOptions{})
  1259  	if err == nil {
  1260  		// File exists, we're done.
  1261  		vd.DecRef(ctx)
  1262  		return nil
  1263  	}
  1264  	return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds)
  1265  }
  1266  
   1267  // configureRestore returns an updated context.Context that includes the
   1268  // filesystem state (gofer FDs and filestore memory files) used by restore.
   1269  func (c *containerMounter) configureRestore(ctx context.Context) (context.Context, error) {
   1270  	// Compare with createMountNamespace(): rootfs always consumes a gofer FD and
   1271  	// a filestore FD is consumed if the rootfs GoferMountConf indicates so.
  1272  	fdmap := make(map[vfs.RestoreID]int)
  1273  
  1274  	rootKey := vfs.RestoreID{ContainerName: c.containerName, Path: "/"}
  1275  	fdmap[rootKey] = c.goferFDs.remove()
  1276  
  1277  	mfmap := make(map[string]*pgalloc.MemoryFile)
  1278  	if rootfsConf := c.goferMountConfs[0]; rootfsConf.IsFilestorePresent() {
  1279  		mf, err := createPrivateMemoryFile(c.goferFilestoreFDs.removeAsFD().ReleaseToFile("overlay-filestore"), rootKey)
  1280  		if err != nil {
  1281  			return ctx, fmt.Errorf("failed to create private memory file for mount rootfs: %w", err)
  1282  		}
  1283  		mfmap[rootKey.String()] = mf
  1284  	}
  1285  	// prepareMounts() consumes the remaining FDs for submounts.
  1286  	mounts, err := c.prepareMounts()
  1287  	if err != nil {
  1288  		return ctx, err
  1289  	}
  1290  	for i := range mounts {
  1291  		submount := &mounts[i]
  1292  		if submount.goferFD != nil {
  1293  			key := vfs.RestoreID{ContainerName: c.containerName, Path: submount.mount.Destination}
  1294  			fdmap[key] = submount.goferFD.Release()
  1295  		}
  1296  		if submount.filestoreFD != nil {
  1297  			key := vfs.RestoreID{ContainerName: c.containerName, Path: submount.mount.Destination}
  1298  			mf, err := createPrivateMemoryFile(submount.filestoreFD.ReleaseToFile("overlay-filestore"), key)
  1299  			if err != nil {
  1300  				return ctx, fmt.Errorf("failed to create private memory file for mount %q: %w", submount.mount.Destination, err)
  1301  			}
  1302  			mfmap[key.String()] = mf
  1303  		}
  1304  	}
  1305  	return context.WithValue(context.WithValue(ctx, vfs.CtxRestoreFilesystemFDMap, fdmap), pgalloc.CtxMemoryFileMap, mfmap), nil
  1306  }
  1307  
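         // createDeviceFiles creates the device files listed in the spec, plus the
         // nvidia device files when GPU functionality is requested via the hook.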
  1308  func createDeviceFiles(ctx context.Context, creds *auth.Credentials, info *containerInfo, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry) error {
  1309  	if info.spec.Linux != nil {
  1310  		// Create any device files specified in the spec.
  1311  		for _, dev := range info.spec.Linux.Devices {
  1312  			if err := createDeviceFile(ctx, creds, info, vfsObj, root, dev); err != nil {
  1313  				return err
  1314  			}
  1315  		}
  1316  	}
  1317  	if specutils.GPUFunctionalityRequestedViaHook(info.spec, info.conf) {
  1318  		// When using nvidia-container-runtime-hook, devices are not injected into
  1319  		// spec.Linux.Devices. So manually create appropriate device files.
  1320  		mode := os.FileMode(0666)
  1321  		nvidiaDevs := []specs.LinuxDevice{
  1322  			specs.LinuxDevice{Path: "/dev/nvidiactl", Type: "c", Major: nvgpu.NV_MAJOR_DEVICE_NUMBER, Minor: nvgpu.NV_CONTROL_DEVICE_MINOR, FileMode: &mode},
  1323  			specs.LinuxDevice{Path: "/dev/nvidia-uvm", Type: "c", Major: int64(info.nvidiaUVMDevMajor), Minor: nvgpu.NVIDIA_UVM_PRIMARY_MINOR_NUMBER, FileMode: &mode},
  1324  		}
  1325  		devClient := devutil.GoferClientFromContext(ctx)
  1326  		if devClient == nil {
  1327  			return fmt.Errorf("dev gofer client not found in context")
  1328  		}
  1329  		names, err := devClient.DirentNames(ctx)
  1330  		if err != nil {
  1331  			return fmt.Errorf("failed to get names of dirents from dev gofer: %w", err)
  1332  		}
  1333  		nvidiaDeviceRegex := regexp.MustCompile(`^nvidia(\d+)$`)
  1334  		for _, name := range names {
  1335  			ms := nvidiaDeviceRegex.FindStringSubmatch(name)
  1336  			if ms == nil {
  1337  				continue
  1338  			}
  1339  			minor, err := strconv.ParseUint(ms[1], 10, 32)
  1340  			if err != nil {
  1341  				return fmt.Errorf("invalid nvidia device name %q: %w", name, err)
  1342  			}
  1343  			nvidiaDevs = append(nvidiaDevs, specs.LinuxDevice{Path: fmt.Sprintf("/dev/nvidia%d", minor), Type: "c", Major: nvgpu.NV_MAJOR_DEVICE_NUMBER, Minor: int64(minor), FileMode: &mode})
  1344  		}
  1345  		for _, nvidiaDev := range nvidiaDevs {
  1346  			if err := createDeviceFile(ctx, creds, info, vfsObj, root, nvidiaDev); err != nil {
  1347  				return err
  1348  			}
  1349  		}
  1350  	}
  1351  	return nil
  1352  }
  1353  
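         // createDeviceFile creates the device file described by devSpec under root.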
  1354  func createDeviceFile(ctx context.Context, creds *auth.Credentials, info *containerInfo, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry, devSpec specs.LinuxDevice) error {
  1355  	mode := linux.FileMode(devSpec.FileMode.Perm())
  1356  	var major, minor uint32
  1357  	// See https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md#devices.
  1358  	switch devSpec.Type {
  1359  	case "b":
  1360  		mode |= linux.S_IFBLK
  1361  		major = uint32(devSpec.Major)
  1362  		minor = uint32(devSpec.Minor)
  1363  	case "c", "u":
  1364  		mode |= linux.S_IFCHR
  1365  		major = uint32(devSpec.Major)
  1366  		minor = uint32(devSpec.Minor)
  1367  	case "p":
  1368  		mode |= linux.S_IFIFO
  1369  	default:
  1370  		return fmt.Errorf("specified device at %q has invalid type %q", devSpec.Path, devSpec.Type)
  1371  	}
  1372  	if devSpec.Path == "/dev/nvidia-uvm" && info.nvidiaUVMDevMajor != 0 && major != info.nvidiaUVMDevMajor {
  1373  		// nvidia-uvm's major device number is dynamically assigned, so the
  1374  		// number that it has on the host may differ from the number that
  1375  		// it has in sentry VFS; switch from the former to the latter.
  1376  		log.Infof("Switching /dev/nvidia-uvm device major number from %d to %d", devSpec.Major, info.nvidiaUVMDevMajor)
  1377  		major = info.nvidiaUVMDevMajor
  1378  	}
  1379  	return dev.CreateDeviceFile(ctx, vfsObj, creds, root, devSpec.Path, major, minor, mode, devSpec.UID, devSpec.GID)
  1380  }
  1381  
  1382  // registerTPUDevice registers a TPU device in vfsObj based on the given device ID.
  1383  func registerTPUDevice(vfsObj *vfs.VirtualFilesystem, minor uint32, deviceID int64) error {
  1384  	switch deviceID {
  1385  	case tpu.TPUV4DeviceID, tpu.TPUV4liteDeviceID:
  1386  		return accel.RegisterTPUDevice(vfsObj, minor, deviceID == tpu.TPUV4liteDeviceID)
  1387  	case tpu.TPUV5eDeviceID:
  1388  		return tpuproxy.RegisterTPUDevice(vfsObj, minor)
  1389  	default:
  1390  		return fmt.Errorf("unsupported TPU device with ID: 0x%x", deviceID)
  1391  	}
  1392  }
  1393  
   1394  // pathGlobToPathRegex maps a TPU PCI path glob to its path regex.
   1395  // TPU v4 devices are accessible at /sys/devices/pci0000:00/<pci_address>/accel/accel# on the host.
   1396  // TPU v5 devices are accessible at /sys/devices/pci0000:00/<pci_address>/vfio-dev/vfio# on the host.
  1397  var pathGlobToPathRegex = map[string]string{
  1398  	pciPathGlobTPUv4: `^/sys/devices/pci0000:00/\d+:\d+:\d+\.\d+/accel/accel(\d+)$`,
  1399  	pciPathGlobTPUv5: `^/sys/devices/pci0000:00/\d+:\d+:\d+\.\d+/vfio-dev/vfio(\d+)$`,
  1400  }
  1401  
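         // tpuProxyRegisterDevices registers TPU devices found on the host PCI bus
         // when TPU proxy is enabled.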
  1402  func tpuProxyRegisterDevices(info *containerInfo, vfsObj *vfs.VirtualFilesystem) error {
  1403  	if !specutils.TPUProxyIsEnabled(info.spec, info.conf) {
  1404  		return nil
  1405  	}
  1406  	// Enumerate all potential PCI paths where TPU devices are available and register the found TPU devices.
  1407  	for pciPathGlobal, pathRegex := range pathGlobToPathRegex {
  1408  		pciAddrs, err := filepath.Glob(pciPathGlobal)
  1409  		if err != nil {
  1410  			return fmt.Errorf("enumerating PCI device files: %w", err)
  1411  		}
  1412  		pciPathRegex := regexp.MustCompile(pathRegex)
  1413  		for _, pciPath := range pciAddrs {
  1414  			ms := pciPathRegex.FindStringSubmatch(pciPath)
  1415  			if ms == nil {
  1416  				continue
  1417  			}
  1418  			deviceNum, err := strconv.ParseUint(ms[1], 10, 32)
  1419  			if err != nil {
  1420  				return fmt.Errorf("parsing PCI device number: %w", err)
  1421  			}
  1422  			var deviceIDBytes []byte
  1423  			if deviceIDBytes, err = os.ReadFile(path.Join(pciPath, "device/device")); err != nil {
  1424  				return fmt.Errorf("reading PCI device ID: %w", err)
  1425  			}
  1426  			deviceIDStr := strings.Replace(string(deviceIDBytes), "0x", "", -1)
  1427  			deviceID, err := strconv.ParseInt(strings.TrimSpace(deviceIDStr), 16, 64)
  1428  			if err != nil {
  1429  				return fmt.Errorf("parsing PCI device ID: %w", err)
  1430  			}
  1431  			if err := registerTPUDevice(vfsObj, uint32(deviceNum), deviceID); err != nil {
  1432  				return fmt.Errorf("registering TPU driver: %w", err)
  1433  			}
  1434  		}
  1435  	}
  1436  	if err := tpuproxy.RegisterVfioDevice(vfsObj); err != nil {
  1437  		return fmt.Errorf("registering vfio driver: %w", err)
  1438  	}
  1439  	return nil
  1440  }
  1441  
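         // nvproxyRegisterDevices registers nvproxy devices and records the
         // dynamically assigned nvidia-uvm device major number when nvproxy is enabled.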
  1442  func nvproxyRegisterDevices(info *containerInfo, vfsObj *vfs.VirtualFilesystem) error {
  1443  	if !specutils.NVProxyEnabled(info.spec, info.conf) {
  1444  		return nil
  1445  	}
  1446  	uvmDevMajor, err := vfsObj.GetDynamicCharDevMajor()
  1447  	if err != nil {
  1448  		return fmt.Errorf("reserving device major number for nvidia-uvm: %w", err)
  1449  	}
  1450  	if err := nvproxy.Register(vfsObj, info.nvidiaDriverVersion, uvmDevMajor); err != nil {
  1451  		return fmt.Errorf("registering nvproxy driver: %w", err)
  1452  	}
  1453  	info.nvidiaUVMDevMajor = uvmDevMajor
  1454  	return nil
  1455  }