github.com/nicocha30/gvisor-ligolo@v0.0.0-20230726075806-989fa2c0a413/runsc/boot/vfs.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package boot
    16  
    17  import (
    18  	"fmt"
    19  	"path"
    20  	"path/filepath"
    21  	"regexp"
    22  	"sort"
    23  	"strconv"
    24  	"strings"
    25  
    26  	specs "github.com/opencontainers/runtime-spec/specs-go"
    27  	"github.com/nicocha30/gvisor-ligolo/pkg/abi/linux"
    28  	"github.com/nicocha30/gvisor-ligolo/pkg/cleanup"
    29  	"github.com/nicocha30/gvisor-ligolo/pkg/context"
    30  	"github.com/nicocha30/gvisor-ligolo/pkg/errors/linuxerr"
    31  	"github.com/nicocha30/gvisor-ligolo/pkg/fd"
    32  	"github.com/nicocha30/gvisor-ligolo/pkg/fspath"
    33  	"github.com/nicocha30/gvisor-ligolo/pkg/log"
    34  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/devices/accel"
    35  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/devices/memdev"
    36  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/devices/nvproxy"
    37  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/devices/ttydev"
    38  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/devices/tundev"
    39  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/cgroupfs"
    40  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/devpts"
    41  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/devtmpfs"
    42  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/fuse"
    43  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/gofer"
    44  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/mqfs"
    45  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/overlay"
    46  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/proc"
    47  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/sys"
    48  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/tmpfs"
    49  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/fsimpl/user"
    50  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/inet"
    51  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel"
    52  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/kernel/auth"
    53  	"github.com/nicocha30/gvisor-ligolo/pkg/sentry/vfs"
    54  	"github.com/nicocha30/gvisor-ligolo/runsc/config"
    55  	"github.com/nicocha30/gvisor-ligolo/runsc/specutils"
    56  )
    57  
// Supported filesystems that map to different internal filesystems.
const (
	// Bind is the mount type of bind mounts. Mounts of this type are mounted
	// with the gofer filesystem.
	Bind   = "bind"
	// Nonefs is the "none" mount type. Mounts of this type are mounted with
	// the sys filesystem.
	Nonefs = "none"
)
    63  
    64  // SelfOverlayFilestorePrefix is the prefix in the file name of the
    65  // self overlay filestore file.
    66  const SelfOverlayFilestorePrefix = ".gvisor.overlay.img."
    67  
    68  // SelfOverlayFilestorePath returns the path at which the self overlay
    69  // filestore file is stored for a given mount.
    70  func SelfOverlayFilestorePath(mountSrc, sandboxID string) string {
    71  	// We will place the filestore file in a gVisor specific hidden file inside
    72  	// the mount being overlay-ed itself. The same volume can be overlay-ed by
    73  	// multiple sandboxes. So make the filestore file unique to a sandbox by
    74  	// suffixing the sandbox ID.
    75  	return path.Join(mountSrc, selfOverlayFilestoreName(sandboxID))
    76  }
    77  
    78  func selfOverlayFilestoreName(sandboxID string) string {
    79  	return SelfOverlayFilestorePrefix + sandboxID
    80  }
    81  
// tmpfs has some extra supported options that we must pass through.
// tmpfsAllowedData lists the option keys that are kept when the options of a
// tmpfs mount are filtered with parseAndFilterOptions.
var tmpfsAllowedData = []string{"mode", "size", "uid", "gid"}
    84  
    85  func registerFilesystems(k *kernel.Kernel, info *containerInfo) error {
    86  	ctx := k.SupervisorContext()
    87  	creds := auth.NewRootCredentials(k.RootUserNamespace())
    88  	vfsObj := k.VFS()
    89  
    90  	vfsObj.MustRegisterFilesystemType(cgroupfs.Name, &cgroupfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
    91  		AllowUserMount: true,
    92  		AllowUserList:  true,
    93  	})
    94  	vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
    95  		AllowUserList: true,
    96  		// TODO(b/29356795): Users may mount this once the terminals are in a
    97  		//  usable state.
    98  		AllowUserMount: false,
    99  	})
   100  	vfsObj.MustRegisterFilesystemType(devtmpfs.Name, &devtmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   101  		AllowUserMount: true,
   102  		AllowUserList:  true,
   103  	})
   104  	vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   105  		AllowUserMount: true,
   106  		AllowUserList:  true,
   107  	})
   108  	vfsObj.MustRegisterFilesystemType(gofer.Name, &gofer.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   109  		AllowUserList: true,
   110  	})
   111  	vfsObj.MustRegisterFilesystemType(overlay.Name, &overlay.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   112  		AllowUserMount: true,
   113  		AllowUserList:  true,
   114  	})
   115  	vfsObj.MustRegisterFilesystemType(proc.Name, &proc.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   116  		AllowUserMount: true,
   117  		AllowUserList:  true,
   118  	})
   119  	vfsObj.MustRegisterFilesystemType(sys.Name, &sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   120  		AllowUserMount: true,
   121  		AllowUserList:  true,
   122  	})
   123  	vfsObj.MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   124  		AllowUserMount: true,
   125  		AllowUserList:  true,
   126  	})
   127  	vfsObj.MustRegisterFilesystemType(mqfs.Name, &mqfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
   128  		AllowUserMount: true,
   129  		AllowUserList:  true,
   130  	})
   131  
   132  	// Register devices.
   133  	if err := memdev.Register(vfsObj); err != nil {
   134  		return fmt.Errorf("registering memdev: %w", err)
   135  	}
   136  	if err := ttydev.Register(vfsObj); err != nil {
   137  		return fmt.Errorf("registering ttydev: %w", err)
   138  	}
   139  	tunSupported := tundev.IsNetTunSupported(inet.StackFromContext(ctx))
   140  	if tunSupported {
   141  		if err := tundev.Register(vfsObj); err != nil {
   142  			return fmt.Errorf("registering tundev: %v", err)
   143  		}
   144  	}
   145  	if err := fuse.Register(vfsObj); err != nil {
   146  		return fmt.Errorf("registering fusedev: %w", err)
   147  	}
   148  
   149  	// Setup files in devtmpfs.
   150  	a, err := devtmpfs.NewAccessor(ctx, vfsObj, creds, devtmpfs.Name)
   151  	if err != nil {
   152  		return fmt.Errorf("creating devtmpfs accessor: %w", err)
   153  	}
   154  	defer a.Release(ctx)
   155  
   156  	if err := a.UserspaceInit(ctx); err != nil {
   157  		return fmt.Errorf("initializing userspace: %w", err)
   158  	}
   159  	if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil {
   160  		return fmt.Errorf("creating memdev devtmpfs files: %w", err)
   161  	}
   162  	if err := ttydev.CreateDevtmpfsFiles(ctx, a); err != nil {
   163  		return fmt.Errorf("creating ttydev devtmpfs files: %w", err)
   164  	}
   165  	if tunSupported {
   166  		if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil {
   167  			return fmt.Errorf("creating tundev devtmpfs files: %v", err)
   168  		}
   169  	}
   170  	if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil {
   171  		return fmt.Errorf("creating fusedev devtmpfs files: %w", err)
   172  	}
   173  
   174  	if err := nvproxyRegisterDevicesAndCreateFiles(ctx, info, k, vfsObj, a); err != nil {
   175  		return err
   176  	}
   177  
   178  	if err := tpuProxyRegisterDevicesAndCreateFiles(ctx, info, k, vfsObj, a); err != nil {
   179  		return err
   180  	}
   181  
   182  	return nil
   183  }
   184  
   185  func setupContainerVFS(ctx context.Context, info *containerInfo, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
   186  	// Create context with root credentials to mount the filesystem (the current
   187  	// user may not be privileged enough).
   188  	rootCreds := auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
   189  	rootProcArgs := *procArgs
   190  	rootProcArgs.WorkingDirectory = "/"
   191  	rootProcArgs.Credentials = rootCreds
   192  	rootProcArgs.Umask = 0022
   193  	rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
   194  	rootCtx := rootProcArgs.NewContext(mntr.k)
   195  
   196  	mns, err := mntr.mountAll(rootCtx, rootCreds, info.conf, &rootProcArgs)
   197  	if err != nil {
   198  		return fmt.Errorf("failed to setupFS: %w", err)
   199  	}
   200  	procArgs.MountNamespace = mns
   201  
   202  	mnsRoot := mns.Root()
   203  	mnsRoot.IncRef()
   204  	defer mnsRoot.DecRef(rootCtx)
   205  
   206  	if err := createDeviceFiles(rootCtx, rootCreds, info, mntr.k.VFS(), mnsRoot); err != nil {
   207  		return fmt.Errorf("failed to create device files: %w", err)
   208  	}
   209  
   210  	// We are executing a file directly. Do not resolve the executable path.
   211  	if procArgs.File != nil {
   212  		return nil
   213  	}
   214  	// Resolve the executable path from working dir and environment.
   215  	resolved, err := user.ResolveExecutablePath(ctx, procArgs)
   216  	if err != nil {
   217  		return err
   218  	}
   219  	procArgs.Filename = resolved
   220  	return nil
   221  }
   222  
   223  // compileMounts returns the supported mounts from the mount spec, adding any
   224  // mandatory mounts that are required by the OCI specification.
   225  //
   226  // This function must NOT add/remove any gofer mounts or change their order.
   227  func compileMounts(spec *specs.Spec, conf *config.Config) []specs.Mount {
   228  	// Keep track of whether proc and sys were mounted.
   229  	var procMounted, sysMounted, devMounted, devptsMounted bool
   230  	var mounts []specs.Mount
   231  
   232  	// Mount all submounts from the spec.
   233  	for _, m := range spec.Mounts {
   234  		// Unconditionally drop any cgroupfs mounts. If requested, we'll add our
   235  		// own below.
   236  		if m.Type == cgroupfs.Name {
   237  			continue
   238  		}
   239  		switch filepath.Clean(m.Destination) {
   240  		case "/proc":
   241  			procMounted = true
   242  		case "/sys":
   243  			sysMounted = true
   244  		case "/dev":
   245  			m.Type = devtmpfs.Name
   246  			devMounted = true
   247  		case "/dev/pts":
   248  			m.Type = devpts.Name
   249  			devptsMounted = true
   250  		}
   251  		mounts = append(mounts, m)
   252  	}
   253  
   254  	// Mount proc and sys even if the user did not ask for it, as the spec
   255  	// says we SHOULD.
   256  	var mandatoryMounts []specs.Mount
   257  
   258  	if conf.Cgroupfs {
   259  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   260  			Type:        tmpfs.Name,
   261  			Destination: "/sys/fs/cgroup",
   262  		})
   263  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   264  			Type:        cgroupfs.Name,
   265  			Destination: "/sys/fs/cgroup/memory",
   266  			Options:     []string{"memory"},
   267  		})
   268  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   269  			Type:        cgroupfs.Name,
   270  			Destination: "/sys/fs/cgroup/cpu",
   271  			Options:     []string{"cpu"},
   272  		})
   273  	}
   274  
   275  	if !procMounted {
   276  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   277  			Type:        proc.Name,
   278  			Destination: "/proc",
   279  		})
   280  	}
   281  	if !sysMounted {
   282  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   283  			Type:        sys.Name,
   284  			Destination: "/sys",
   285  		})
   286  	}
   287  	if !devMounted {
   288  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   289  			Type:        devtmpfs.Name,
   290  			Destination: "/dev",
   291  		})
   292  	}
   293  	if !devptsMounted {
   294  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   295  			Type:        devpts.Name,
   296  			Destination: "/dev/pts",
   297  		})
   298  	}
   299  
   300  	// The mandatory mounts should be ordered right after the root, in case
   301  	// there are submounts of these mandatory mounts already in the spec.
   302  	mounts = append(mounts[:0], append(mandatoryMounts, mounts[0:]...)...)
   303  
   304  	return mounts
   305  }
   306  
   307  // goferMountData creates a slice of gofer mount data.
   308  func goferMountData(fd int, fa config.FileAccessType, conf *config.Config) []string {
   309  	opts := []string{
   310  		"trans=fd",
   311  		"rfdno=" + strconv.Itoa(fd),
   312  		"wfdno=" + strconv.Itoa(fd),
   313  	}
   314  	if fa == config.FileAccessShared {
   315  		opts = append(opts, "cache=remote_revalidating")
   316  	}
   317  	if conf.DirectFS {
   318  		opts = append(opts, "directfs")
   319  	}
   320  	if !conf.HostFifo.AllowOpen() {
   321  		opts = append(opts, "disable_fifo_open")
   322  	}
   323  	return opts
   324  }
   325  
   326  // parseAndFilterOptions parses a MountOptions slice and filters by the allowed
   327  // keys.
   328  func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) {
   329  	var out []string
   330  	for _, o := range opts {
   331  		ok, err := parseMountOption(o, allowedKeys...)
   332  		if err != nil {
   333  			return nil, err
   334  		}
   335  		if ok {
   336  			out = append(out, o)
   337  		}
   338  	}
   339  	return out, nil
   340  }
   341  
   342  func parseMountOption(opt string, allowedKeys ...string) (bool, error) {
   343  	kv := strings.SplitN(opt, "=", 3)
   344  	if len(kv) > 2 {
   345  		return false, fmt.Errorf("invalid option %q", opt)
   346  	}
   347  	return specutils.ContainsStr(allowedKeys, kv[0]), nil
   348  }
   349  
   350  type fdDispenser struct {
   351  	fds []*fd.FD
   352  }
   353  
   354  func (f *fdDispenser) remove() int {
   355  	return f.removeAsFD().Release()
   356  }
   357  
   358  func (f *fdDispenser) removeAsFD() *fd.FD {
   359  	if f.empty() {
   360  		panic("fdDispenser out of fds")
   361  	}
   362  	rv := f.fds[0]
   363  	f.fds = f.fds[1:]
   364  	return rv
   365  }
   366  
   367  func (f *fdDispenser) empty() bool {
   368  	return len(f.fds) == 0
   369  }
   370  
// containerMounter holds the state needed to mount a container's root
// filesystem and submounts.
type containerMounter struct {
	// root is the root filesystem entry taken from the container's spec.
	root *specs.Root

	// mounts is the set of submounts for the container. It's a copy from the spec
	// that may be freely modified without affecting the original spec.
	mounts []specs.Mount

	// fds is the list of FDs to be dispensed for mounts that require it.
	fds fdDispenser

	// overlayFilestoreFDs are the FDs to the regular files that will back the
	// tmpfs upper mount in the overlay mounts.
	overlayFilestoreFDs fdDispenser

	// overlayMediums contains information about how the gofer mounts have been
	// overlaid. The first entry is for rootfs and the following entries are for
	// bind mounts in `mounts` slice above (in the same order).
	overlayMediums []OverlayMedium

	// k is the kernel whose VFS is used to create the mounts.
	k *kernel.Kernel

	// hints is consulted to find the MountHint, if any, that applies to each
	// submount.
	hints *PodMountHints

	// productName is the value to show in
	// /sys/devices/virtual/dmi/id/product_name.
	productName string

	// sandboxID is the ID for the whole sandbox.
	sandboxID string
}
   401  
   402  func newContainerMounter(info *containerInfo, k *kernel.Kernel, hints *PodMountHints, productName string, sandboxID string) *containerMounter {
   403  	return &containerMounter{
   404  		root:                info.spec.Root,
   405  		mounts:              compileMounts(info.spec, info.conf),
   406  		fds:                 fdDispenser{fds: info.goferFDs},
   407  		overlayFilestoreFDs: fdDispenser{fds: info.overlayFilestoreFDs},
   408  		overlayMediums:      info.overlayMediums,
   409  		k:                   k,
   410  		hints:               hints,
   411  		productName:         productName,
   412  		sandboxID:           sandboxID,
   413  	}
   414  }
   415  
   416  func (c *containerMounter) checkDispenser() error {
   417  	if !c.fds.empty() {
   418  		return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.fds)
   419  	}
   420  	return nil
   421  }
   422  
   423  func (c *containerMounter) getMountAccessType(conf *config.Config, mount *specs.Mount, hint *MountHint) config.FileAccessType {
   424  	if hint != nil {
   425  		return hint.fileAccessType()
   426  	}
   427  	return conf.FileAccessMounts
   428  }
   429  
   430  func (c *containerMounter) mountAll(rootCtx context.Context, rootCreds *auth.Credentials, conf *config.Config, rootProcArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
   431  	log.Infof("Configuring container's file system")
   432  
   433  	mns, err := c.createMountNamespace(rootCtx, conf, rootCreds)
   434  	if err != nil {
   435  		return nil, fmt.Errorf("creating mount namespace: %w", err)
   436  	}
   437  	rootProcArgs.MountNamespace = mns
   438  
   439  	root := mns.Root()
   440  	root.IncRef()
   441  	defer root.DecRef(rootCtx)
   442  	if root.Mount().ReadOnly() {
   443  		// Switch to ReadWrite while we setup submounts.
   444  		if err := c.k.VFS().SetMountReadOnly(root.Mount(), false); err != nil {
   445  			return nil, fmt.Errorf(`failed to set mount at "/" readwrite: %w`, err)
   446  		}
   447  		// Restore back to ReadOnly at the end.
   448  		defer func() {
   449  			if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil {
   450  				panic(fmt.Sprintf(`failed to restore mount at "/" back to readonly: %v`, err))
   451  			}
   452  		}()
   453  	}
   454  
   455  	// Mount submounts.
   456  	if err := c.mountSubmounts(rootCtx, conf, mns, rootCreds); err != nil {
   457  		return nil, fmt.Errorf("mounting submounts: %w", err)
   458  	}
   459  
   460  	return mns, nil
   461  }
   462  
   463  // createMountNamespace creates the container's root mount and namespace.
   464  func (c *containerMounter) createMountNamespace(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
   465  	ioFD := c.fds.remove()
   466  	data := goferMountData(ioFD, conf.FileAccess, conf)
   467  
   468  	// We can't check for overlayfs here because sandbox is chroot'ed and gofer
   469  	// can only send mount options for specs.Mounts (specs.Root is missing
   470  	// Options field). So assume root is always on top of overlayfs.
   471  	data = append(data, "overlayfs_stale_read")
   472  
   473  	// Configure the gofer dentry cache size.
   474  	gofer.SetDentryCacheSize(conf.DCache)
   475  
   476  	log.Infof("Mounting root with gofer, ioFD: %d", ioFD)
   477  	opts := &vfs.MountOptions{
   478  		ReadOnly: c.root.Readonly,
   479  		GetFilesystemOptions: vfs.GetFilesystemOptions{
   480  			Data: strings.Join(data, ","),
   481  			InternalData: gofer.InternalFilesystemOptions{
   482  				UniqueID: "/",
   483  			},
   484  		},
   485  		InternalMount: true,
   486  	}
   487  
   488  	fsName := gofer.Name
   489  	if c.overlayMediums[0].IsEnabled() {
   490  		log.Infof("Adding overlay on top of root")
   491  		var (
   492  			err              error
   493  			cleanup          func()
   494  			overlayFilestore *fd.FD
   495  		)
   496  		if c.overlayMediums[0].IsBackedByHostFile() {
   497  			overlayFilestore = c.overlayFilestoreFDs.removeAsFD()
   498  		}
   499  		opts, cleanup, err = c.configureOverlay(ctx, conf, creds, opts, fsName, overlayFilestore, c.overlayMediums[0])
   500  		if err != nil {
   501  			return nil, fmt.Errorf("mounting root with overlay: %w", err)
   502  		}
   503  		defer cleanup()
   504  		fsName = overlay.Name
   505  	}
   506  
   507  	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", fsName, opts)
   508  	if err != nil {
   509  		return nil, fmt.Errorf("setting up mount namespace: %w", err)
   510  	}
   511  	return mns, nil
   512  }
   513  
   514  // configureOverlay mounts the lower layer using "lowerOpts", mounts the upper
   515  // layer using tmpfs, and return overlay mount options. "cleanup" must be called
   516  // after the options have been used to mount the overlay, to release refs on
   517  // lower and upper mounts.
   518  func (c *containerMounter) configureOverlay(ctx context.Context, conf *config.Config, creds *auth.Credentials, lowerOpts *vfs.MountOptions, lowerFSName string, filestoreFD *fd.FD, medium OverlayMedium) (*vfs.MountOptions, func(), error) {
   519  	// First copy options from lower layer to upper layer and overlay. Clear
   520  	// filesystem specific options.
   521  	upperOpts := *lowerOpts
   522  	upperOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}
   523  
   524  	overlayOpts := *lowerOpts
   525  	overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}
   526  
   527  	// All writes go to the upper layer, be paranoid and make lower readonly.
   528  	lowerOpts.ReadOnly = true
   529  	lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts)
   530  	if err != nil {
   531  		return nil, nil, err
   532  	}
   533  	cu := cleanup.Make(func() { lower.DecRef(ctx) })
   534  	defer cu.Clean()
   535  
   536  	// Determine the lower layer's root's type.
   537  	lowerRootVD := vfs.MakeVirtualDentry(lower, lower.Root())
   538  	stat, err := c.k.VFS().StatAt(ctx, creds, &vfs.PathOperation{
   539  		Root:  lowerRootVD,
   540  		Start: lowerRootVD,
   541  	}, &vfs.StatOptions{
   542  		Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE | linux.STATX_TYPE,
   543  	})
   544  	if err != nil {
   545  		return nil, nil, fmt.Errorf("failed to stat lower layer's root: %v", err)
   546  	}
   547  	if stat.Mask&linux.STATX_TYPE == 0 {
   548  		return nil, nil, fmt.Errorf("failed to get file type of lower layer's root")
   549  	}
   550  	rootType := stat.Mode & linux.S_IFMT
   551  	if rootType != linux.S_IFDIR && rootType != linux.S_IFREG {
   552  		return nil, nil, fmt.Errorf("lower layer's root has unsupported file type %v", rootType)
   553  	}
   554  
   555  	// Upper is a tmpfs mount to keep all modifications inside the sandbox.
   556  	tmpfsOpts := tmpfs.FilesystemOpts{
   557  		RootFileType: uint16(rootType),
   558  		FilestoreFD:  filestoreFD,
   559  		// If a mount is being overlaid, it should not be limited by the default
   560  		// tmpfs size limit.
   561  		DisableDefaultSizeLimit: true,
   562  	}
   563  	upperOpts.GetFilesystemOptions.InternalData = tmpfsOpts
   564  	upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts)
   565  	if err != nil {
   566  		return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err)
   567  	}
   568  	cu.Add(func() { upper.DecRef(ctx) })
   569  
   570  	// If the overlay mount consists of a regular file, copy up its contents
   571  	// from the lower layer, since in the overlay the otherwise-empty upper
   572  	// layer file will take precedence.
   573  	upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root())
   574  	if rootType == linux.S_IFREG {
   575  		lowerFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{
   576  			Root:  lowerRootVD,
   577  			Start: lowerRootVD,
   578  		}, &vfs.OpenOptions{
   579  			Flags: linux.O_RDONLY,
   580  		})
   581  		if err != nil {
   582  			return nil, nil, fmt.Errorf("failed to open lower layer root for copying: %v", err)
   583  		}
   584  		defer lowerFD.DecRef(ctx)
   585  		upperFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{
   586  			Root:  upperRootVD,
   587  			Start: upperRootVD,
   588  		}, &vfs.OpenOptions{
   589  			Flags: linux.O_WRONLY,
   590  		})
   591  		if err != nil {
   592  			return nil, nil, fmt.Errorf("failed to open upper layer root for copying: %v", err)
   593  		}
   594  		defer upperFD.DecRef(ctx)
   595  		if _, err := vfs.CopyRegularFileData(ctx, upperFD, lowerFD); err != nil {
   596  			return nil, nil, fmt.Errorf("failed to copy up overlay file: %v", err)
   597  		}
   598  	}
   599  
   600  	// We need to hide the filestore from the containerized application.
   601  	if medium == SelfMedium {
   602  		if err := overlay.CreateWhiteout(ctx, c.k.VFS(), creds, &vfs.PathOperation{
   603  			Root:  upperRootVD,
   604  			Start: upperRootVD,
   605  			Path:  fspath.Parse(selfOverlayFilestoreName(c.sandboxID)),
   606  		}); err != nil {
   607  			return nil, nil, fmt.Errorf("failed to create whiteout to hide self overlay filestore: %w", err)
   608  		}
   609  	}
   610  
   611  	// Propagate the lower layer's root's owner, group, and mode to the upper
   612  	// layer's root for consistency with VFS1.
   613  	err = c.k.VFS().SetStatAt(ctx, creds, &vfs.PathOperation{
   614  		Root:  upperRootVD,
   615  		Start: upperRootVD,
   616  	}, &vfs.SetStatOptions{
   617  		Stat: linux.Statx{
   618  			Mask: (linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE) & stat.Mask,
   619  			UID:  stat.UID,
   620  			GID:  stat.GID,
   621  			Mode: stat.Mode,
   622  		},
   623  	})
   624  	if err != nil {
   625  		return nil, nil, err
   626  	}
   627  
   628  	// Configure overlay with both layers.
   629  	overlayOpts.GetFilesystemOptions.InternalData = overlay.FilesystemOptions{
   630  		UpperRoot:  upperRootVD,
   631  		LowerRoots: []vfs.VirtualDentry{lowerRootVD},
   632  	}
   633  	return &overlayOpts, cu.Release(), nil
   634  }
   635  
   636  func (c *containerMounter) mountSubmounts(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
   637  	mounts, err := c.prepareMounts()
   638  	if err != nil {
   639  		return err
   640  	}
   641  
   642  	for i := range mounts {
   643  		submount := &mounts[i]
   644  		log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.mount.Source, submount.mount.Destination, submount.mount.Type, submount.mount.Options)
   645  		var (
   646  			mnt *vfs.Mount
   647  			err error
   648  		)
   649  
   650  		if submount.hint != nil && submount.hint.shouldShareMount() {
   651  			mnt, err = c.mountSharedSubmount(ctx, conf, mns, creds, submount.mount, submount.hint)
   652  			if err != nil {
   653  				return fmt.Errorf("mount shared mount %q to %q: %v", submount.hint.name, submount.mount.Destination, err)
   654  			}
   655  		} else {
   656  			mnt, err = c.mountSubmount(ctx, conf, mns, creds, submount)
   657  			if err != nil {
   658  				return fmt.Errorf("mount submount %q: %w", submount.mount.Destination, err)
   659  			}
   660  		}
   661  
   662  		if mnt != nil && mnt.ReadOnly() {
   663  			// Switch to ReadWrite while we setup submounts.
   664  			if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil {
   665  				return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.mount.Destination, err)
   666  			}
   667  			// Restore back to ReadOnly at the end.
   668  			defer func() {
   669  				if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil {
   670  					panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.mount.Destination, err))
   671  				}
   672  			}()
   673  		}
   674  	}
   675  
   676  	if err := c.mountTmp(ctx, conf, creds, mns); err != nil {
   677  		return fmt.Errorf(`mount submount "\tmp": %w`, err)
   678  	}
   679  	return nil
   680  }
   681  
// mountInfo bundles a submount from the spec with the resources and settings
// that prepareMounts associated with it.
type mountInfo struct {
	// mount is the mount entry being described.
	mount              *specs.Mount
	// fd is the gofer FD dispensed for this mount, or -1 when the mount is not
	// a gofer mount.
	fd                 int
	// hint is the mount hint found for this mount; nil when there is none.
	hint               *MountHint
	// overlayMedium describes how the mount is overlaid. It is NoOverlay for
	// mounts that are not gofer mounts.
	overlayMedium      OverlayMedium
	// overlayFilestoreFD is the filestore FD dispensed for this mount; it is
	// only set when overlayMedium is backed by a host file.
	overlayFilestoreFD *fd.FD
}
   689  
   690  func newNonGoferMountInfo(mount *specs.Mount) *mountInfo {
   691  	return &mountInfo{mount: mount, fd: -1}
   692  }
   693  
   694  func (c *containerMounter) prepareMounts() ([]mountInfo, error) {
   695  	// Associate bind mounts with their FDs before sorting since there is an
   696  	// undocumented assumption that FDs are dispensed in the order in which
   697  	// they are required by mounts.
   698  	var mounts []mountInfo
   699  	goferMntIdx := 1 // First index is for rootfs.
   700  	for i := range c.mounts {
   701  		m := &c.mounts[i]
   702  		specutils.MaybeConvertToBindMount(m)
   703  
   704  		// Only bind mounts use host FDs; see
   705  		// containerMounter.getMountNameAndOptions.
   706  		info := mountInfo{
   707  			mount:         m,
   708  			fd:            -1,
   709  			hint:          c.hints.FindMount(m),
   710  			overlayMedium: NoOverlay,
   711  		}
   712  		if specutils.IsGoferMount(*m) {
   713  			info.fd = c.fds.remove()
   714  			info.overlayMedium = c.overlayMediums[goferMntIdx]
   715  			if info.overlayMedium.IsBackedByHostFile() {
   716  				info.overlayFilestoreFD = c.overlayFilestoreFDs.removeAsFD()
   717  			}
   718  			goferMntIdx++
   719  		}
   720  		mounts = append(mounts, info)
   721  	}
   722  	if err := c.checkDispenser(); err != nil {
   723  		return nil, err
   724  	}
   725  
   726  	// Sort the mounts so that we don't place children before parents.
   727  	sort.Slice(mounts, func(i, j int) bool {
   728  		return len(mounts[i].mount.Destination) < len(mounts[j].mount.Destination)
   729  	})
   730  
   731  	return mounts, nil
   732  }
   733  
   734  func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountInfo) (*vfs.Mount, error) {
   735  	fsName, opts, err := c.getMountNameAndOptions(conf, submount)
   736  	if err != nil {
   737  		return nil, fmt.Errorf("mountOptions failed: %w", err)
   738  	}
   739  	if len(fsName) == 0 {
   740  		// Filesystem is not supported (e.g. cgroup), just skip it.
   741  		return nil, nil
   742  	}
   743  
   744  	if err := c.makeMountPoint(ctx, creds, mns, submount.mount.Destination); err != nil {
   745  		return nil, fmt.Errorf("creating mount point %q: %w", submount.mount.Destination, err)
   746  	}
   747  
   748  	if submount.overlayMedium.IsEnabled() {
   749  		log.Infof("Adding overlay on top of mount %q", submount.mount.Destination)
   750  		var cleanup func()
   751  		opts, cleanup, err = c.configureOverlay(ctx, conf, creds, opts, fsName, submount.overlayFilestoreFD, submount.overlayMedium)
   752  		if err != nil {
   753  			return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.mount.Destination, err)
   754  		}
   755  		defer cleanup()
   756  		fsName = overlay.Name
   757  	}
   758  
   759  	root := mns.Root()
   760  	root.IncRef()
   761  	defer root.DecRef(ctx)
   762  	target := &vfs.PathOperation{
   763  		Root:  root,
   764  		Start: root,
   765  		Path:  fspath.Parse(submount.mount.Destination),
   766  	}
   767  	mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts)
   768  	if err != nil {
   769  		return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.mount.Destination, submount.mount.Type, err, opts)
   770  	}
   771  	log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.mount.Source, submount.mount.Destination, submount.mount.Type, opts.GetFilesystemOptions.Data)
   772  	return mnt, nil
   773  }
   774  
   775  // getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
   776  // used for mounts.
   777  func (c *containerMounter) getMountNameAndOptions(conf *config.Config, m *mountInfo) (string, *vfs.MountOptions, error) {
   778  	fsName := m.mount.Type
   779  	var (
   780  		data         []string
   781  		internalData any
   782  	)
   783  
   784  	// Find filesystem name and FS specific data field.
   785  	switch m.mount.Type {
   786  	case devpts.Name, devtmpfs.Name, proc.Name:
   787  		// Nothing to do.
   788  
   789  	case Nonefs:
   790  		fsName = sys.Name
   791  
   792  	case sys.Name:
   793  		sysData := &sys.InternalData{EnableAccelSysfs: conf.TPUProxy}
   794  		if len(c.productName) > 0 {
   795  			sysData.ProductName = c.productName
   796  		}
   797  		internalData = sysData
   798  
   799  	case tmpfs.Name:
   800  		var err error
   801  		data, err = parseAndFilterOptions(m.mount.Options, tmpfsAllowedData...)
   802  		if err != nil {
   803  			return "", nil, err
   804  		}
   805  
   806  	case Bind:
   807  		fsName = gofer.Name
   808  		if m.fd < 0 {
   809  			// Check that an FD was provided to fails fast.
   810  			return "", nil, fmt.Errorf("gofer mount requires a connection FD")
   811  		}
   812  		data = goferMountData(m.fd, c.getMountAccessType(conf, m.mount, m.hint), conf)
   813  		internalData = gofer.InternalFilesystemOptions{
   814  			UniqueID: m.mount.Destination,
   815  		}
   816  
   817  	case cgroupfs.Name:
   818  		var err error
   819  		data, err = parseAndFilterOptions(m.mount.Options, cgroupfs.SupportedMountOptions...)
   820  		if err != nil {
   821  			return "", nil, err
   822  		}
   823  
   824  	default:
   825  		log.Warningf("ignoring unknown filesystem type %q", m.mount.Type)
   826  		return "", nil, nil
   827  	}
   828  
   829  	opts := ParseMountOptions(m.mount.Options)
   830  	opts.GetFilesystemOptions = vfs.GetFilesystemOptions{
   831  		Data:         strings.Join(data, ","),
   832  		InternalData: internalData,
   833  	}
   834  
   835  	return fsName, opts, nil
   836  }
   837  
   838  // ParseMountOptions converts specs.Mount.Options to vfs.MountOptions.
   839  func ParseMountOptions(opts []string) *vfs.MountOptions {
   840  	mountOpts := &vfs.MountOptions{
   841  		InternalMount: true,
   842  	}
   843  	// Note: update mountHint.CheckCompatible when more options are added.
   844  	for _, o := range opts {
   845  		switch o {
   846  		case "ro":
   847  			mountOpts.ReadOnly = true
   848  		case "noatime":
   849  			mountOpts.Flags.NoATime = true
   850  		case "noexec":
   851  			mountOpts.Flags.NoExec = true
   852  		case "rw", "atime", "exec":
   853  			// These use the default value and don't need to be set.
   854  		case "bind", "rbind":
   855  			// These are the same as a mount with type="bind".
   856  		default:
   857  			log.Warningf("ignoring unknown mount option %q", o)
   858  		}
   859  	}
   860  	return mountOpts
   861  }
   862  
   863  func parseKeyValue(s string) (string, string, bool) {
   864  	tokens := strings.SplitN(s, "=", 2)
   865  	if len(tokens) < 2 {
   866  		return "", "", false
   867  	}
   868  	return strings.TrimSpace(tokens[0]), strings.TrimSpace(tokens[1]), true
   869  }
   870  
   871  // mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so.
   872  // Technically we don't have to mount tmpfs at /tmp, as we could just rely on
   873  // the host /tmp, but this is a nice optimization, and fixes some apps that call
   874  // mknod in /tmp. It's unsafe to mount tmpfs if:
   875  //  1. /tmp is mounted explicitly: we should not override user's wish
   876  //  2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
   877  //
   878  // Note that when there are submounts inside of '/tmp', directories for the
   879  // mount points must be present, making '/tmp' not empty anymore.
   880  func (c *containerMounter) mountTmp(ctx context.Context, conf *config.Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
   881  	for _, m := range c.mounts {
   882  		// m.Destination has been cleaned, so it's to use equality here.
   883  		if m.Destination == "/tmp" {
   884  			log.Debugf(`Explict "/tmp" mount found, skipping internal tmpfs, mount: %+v`, m)
   885  			return nil
   886  		}
   887  	}
   888  
   889  	root := mns.Root()
   890  	root.IncRef()
   891  	defer root.DecRef(ctx)
   892  	pop := vfs.PathOperation{
   893  		Root:  root,
   894  		Start: root,
   895  		Path:  fspath.Parse("/tmp"),
   896  	}
   897  	fd, err := c.k.VFS().OpenAt(ctx, creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY})
   898  	switch {
   899  	case err == nil:
   900  		defer fd.DecRef(ctx)
   901  
   902  		err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
   903  			if dirent.Name != "." && dirent.Name != ".." {
   904  				return linuxerr.ENOTEMPTY
   905  			}
   906  			return nil
   907  		}))
   908  		switch {
   909  		case err == nil:
   910  			log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`)
   911  		case linuxerr.Equals(linuxerr.ENOTEMPTY, err):
   912  			// If more than "." and ".." is found, skip internal tmpfs to prevent
   913  			// hiding existing files.
   914  			log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`)
   915  			return nil
   916  		default:
   917  			return fmt.Errorf("fd.IterDirents failed: %v", err)
   918  		}
   919  		fallthrough
   920  
   921  	case linuxerr.Equals(linuxerr.ENOENT, err):
   922  		// No '/tmp' found (or fallthrough from above). It's safe to mount internal
   923  		// tmpfs.
   924  		tmpMount := specs.Mount{
   925  			Type:        tmpfs.Name,
   926  			Destination: "/tmp",
   927  			// Sticky bit is added to prevent accidental deletion of files from
   928  			// another user. This is normally done for /tmp.
   929  			Options: []string{"mode=01777"},
   930  		}
   931  		if _, err := c.mountSubmount(ctx, conf, mns, creds, newNonGoferMountInfo(&tmpMount)); err != nil {
   932  			return fmt.Errorf("mountSubmount failed: %v", err)
   933  		}
   934  		return nil
   935  
   936  	case linuxerr.Equals(linuxerr.ENOTDIR, err):
   937  		// Not a dir?! Let it be.
   938  		return nil
   939  
   940  	default:
   941  		return fmt.Errorf(`opening "/tmp" inside container: %w`, err)
   942  	}
   943  }
   944  
   945  // processHints processes annotations that container hints about how volumes
   946  // should be mounted (e.g. a volume shared between containers). It must be
   947  // called for the root container only.
   948  func (c *containerMounter) processHints(conf *config.Config, creds *auth.Credentials) error {
   949  	ctx := c.k.SupervisorContext()
   950  	for _, hint := range c.hints.mounts {
   951  		if !hint.shouldShareMount() {
   952  			continue
   953  		}
   954  
   955  		log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
   956  		mnt, err := c.mountSharedMaster(ctx, conf, hint, creds)
   957  		if err != nil {
   958  			return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
   959  		}
   960  		hint.vfsMount = mnt
   961  	}
   962  	return nil
   963  }
   964  
   965  // mountSharedMaster mounts the master of a volume that is shared among
   966  // containers in a pod.
   967  func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *config.Config, hint *MountHint, creds *auth.Credentials) (*vfs.Mount, error) {
   968  	// Map mount type to filesystem name, and parse out the options that we are
   969  	// capable of dealing with.
   970  	mntInfo := newNonGoferMountInfo(&hint.mount)
   971  	fsName, opts, err := c.getMountNameAndOptions(conf, mntInfo)
   972  	if err != nil {
   973  		return nil, err
   974  	}
   975  	if len(fsName) == 0 {
   976  		return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
   977  	}
   978  	return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts)
   979  }
   980  
   981  // mountSharedSubmount binds mount to a previously mounted volume that is shared
   982  // among containers in the same pod.
   983  func (c *containerMounter) mountSharedSubmount(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount *specs.Mount, source *MountHint) (*vfs.Mount, error) {
   984  	if err := source.checkCompatible(mount); err != nil {
   985  		return nil, err
   986  	}
   987  
   988  	// Ignore data and useOverlay because these were already applied to
   989  	// the master mount.
   990  	_, opts, err := c.getMountNameAndOptions(conf, newNonGoferMountInfo(mount))
   991  	if err != nil {
   992  		return nil, err
   993  	}
   994  	newMnt := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts)
   995  	defer newMnt.DecRef(ctx)
   996  
   997  	root := mns.Root()
   998  	root.IncRef()
   999  	defer root.DecRef(ctx)
  1000  	target := &vfs.PathOperation{
  1001  		Root:  root,
  1002  		Start: root,
  1003  		Path:  fspath.Parse(mount.Destination),
  1004  	}
  1005  
  1006  	if err := c.makeMountPoint(ctx, creds, mns, mount.Destination); err != nil {
  1007  		return nil, fmt.Errorf("creating mount point %q: %w", mount.Destination, err)
  1008  	}
  1009  
  1010  	if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil {
  1011  		return nil, err
  1012  	}
  1013  	log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
  1014  	return newMnt, nil
  1015  }
  1016  
  1017  func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error {
  1018  	root := mns.Root()
  1019  	root.IncRef()
  1020  	defer root.DecRef(ctx)
  1021  	target := &vfs.PathOperation{
  1022  		Root:  root,
  1023  		Start: root,
  1024  		Path:  fspath.Parse(dest),
  1025  	}
  1026  	// First check if mount point exists. When overlay is enabled, gofer doesn't
  1027  	// allow changes to the FS, making MakeSytheticMountpoint() ineffective
  1028  	// because MkdirAt fails with EROFS even if file exists.
  1029  	vd, err := c.k.VFS().GetDentryAt(ctx, creds, target, &vfs.GetDentryOptions{})
  1030  	if err == nil {
  1031  		// File exists, we're done.
  1032  		vd.DecRef(ctx)
  1033  		return nil
  1034  	}
  1035  	return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds)
  1036  }
  1037  
  1038  // configureRestore returns an updated context.Context including filesystem
  1039  // state used by restore defined by conf.
  1040  func (c *containerMounter) configureRestore(ctx context.Context) (context.Context, error) {
  1041  	fdmap := make(map[string]int)
  1042  	fdmap["/"] = c.fds.remove()
  1043  	mounts, err := c.prepareMounts()
  1044  	if err != nil {
  1045  		return ctx, err
  1046  	}
  1047  	for i := range c.mounts {
  1048  		submount := &mounts[i]
  1049  		if submount.fd >= 0 {
  1050  			fdmap[submount.mount.Destination] = submount.fd
  1051  		}
  1052  	}
  1053  	return context.WithValue(ctx, gofer.CtxRestoreServerFDMap, fdmap), nil
  1054  }
  1055  
  1056  func createDeviceFiles(ctx context.Context, creds *auth.Credentials, info *containerInfo, vfsObj *vfs.VirtualFilesystem, root vfs.VirtualDentry) error {
  1057  	if info.spec.Linux == nil {
  1058  		return nil
  1059  	}
  1060  	for _, dev := range info.spec.Linux.Devices {
  1061  		pop := vfs.PathOperation{
  1062  			Root:  root,
  1063  			Start: root,
  1064  			Path:  fspath.Parse(dev.Path),
  1065  		}
  1066  		opts := vfs.MknodOptions{
  1067  			Mode: linux.FileMode(dev.FileMode.Perm()),
  1068  		}
  1069  		// See https://github.com/opencontainers/runtime-spec/blob/main/config-linux.md#devices.
  1070  		switch dev.Type {
  1071  		case "b":
  1072  			opts.Mode |= linux.S_IFBLK
  1073  			opts.DevMajor = uint32(dev.Major)
  1074  			opts.DevMinor = uint32(dev.Minor)
  1075  		case "c", "u":
  1076  			opts.Mode |= linux.S_IFCHR
  1077  			opts.DevMajor = uint32(dev.Major)
  1078  			opts.DevMinor = uint32(dev.Minor)
  1079  		case "p":
  1080  			opts.Mode |= linux.S_IFIFO
  1081  		default:
  1082  			return fmt.Errorf("specified device at %q has invalid type %q", dev.Path, dev.Type)
  1083  		}
  1084  		if dev.Path == "/dev/nvidia-uvm" && info.nvidiaUVMDevMajor != 0 && opts.DevMajor != info.nvidiaUVMDevMajor {
  1085  			// nvidia-uvm's major device number is dynamically assigned, so the
  1086  			// number that it has on the host may differ from the number that
  1087  			// it has in sentry VFS; switch from the former to the latter.
  1088  			log.Infof("Switching /dev/nvidia-uvm device major number from %d to %d", dev.Major, info.nvidiaUVMDevMajor)
  1089  			opts.DevMajor = info.nvidiaUVMDevMajor
  1090  		}
  1091  		if err := vfsObj.MkdirAllAt(ctx, path.Dir(dev.Path), root, creds, &vfs.MkdirOptions{
  1092  			Mode: 0o755,
  1093  		}, true /* mustBeDir */); err != nil {
  1094  			return fmt.Errorf("failed to create ancestor directories of %q: %w", dev.Path, err)
  1095  		}
  1096  		// EEXIST is silently ignored; compare
  1097  		// opencontainers/runc:libcontainer/rootfs_linux.go:createDeviceNode().
  1098  		created := true
  1099  		if err := vfsObj.MknodAt(ctx, creds, &pop, &opts); err != nil && !linuxerr.Equals(linuxerr.EEXIST, err) {
  1100  			if linuxerr.Equals(linuxerr.EEXIST, err) {
  1101  				created = false
  1102  			} else {
  1103  				return fmt.Errorf("failed to create device file at %q: %w", dev.Path, err)
  1104  			}
  1105  		}
  1106  		if created && (dev.UID != nil || dev.GID != nil) {
  1107  			var opts vfs.SetStatOptions
  1108  			if dev.UID != nil {
  1109  				opts.Stat.Mask |= linux.STATX_UID
  1110  				opts.Stat.UID = *dev.UID
  1111  			}
  1112  			if dev.GID != nil {
  1113  				opts.Stat.Mask |= linux.STATX_GID
  1114  				opts.Stat.GID = *dev.GID
  1115  			}
  1116  			if err := vfsObj.SetStatAt(ctx, creds, &pop, &opts); err != nil {
  1117  				return fmt.Errorf("failed to set UID/GID for device file %q: %w", dev.Path, err)
  1118  			}
  1119  		}
  1120  	}
  1121  	return nil
  1122  }
  1123  
  1124  func tpuProxyRegisterDevicesAndCreateFiles(ctx context.Context, info *containerInfo, k *kernel.Kernel, vfsObj *vfs.VirtualFilesystem, a *devtmpfs.Accessor) error {
  1125  	if !info.conf.TPUProxy {
  1126  		return nil
  1127  	}
  1128  	// At this point /dev/accel just contains the TPU devices have been mounted
  1129  	// into the sandbox chroot. Enumerate all of them and create sentry devices.
  1130  	paths, err := filepath.Glob("/dev/accel*")
  1131  	if err != nil {
  1132  		return fmt.Errorf("enumerating accel device files: %w", err)
  1133  	}
  1134  	for _, path := range paths {
  1135  		accelDeviceRegex := regexp.MustCompile(`^/dev/accel(\d+)$`)
  1136  		if ms := accelDeviceRegex.FindStringSubmatch(path); ms != nil {
  1137  			deviceNum, _ := strconv.ParseUint(ms[1], 10, 32)
  1138  			if err := accel.Register(vfsObj, uint32(deviceNum)); err != nil {
  1139  				return fmt.Errorf("registering accel driver: %w", err)
  1140  			}
  1141  			if err := accel.CreateDevtmpfsFile(ctx, a, uint32(deviceNum)); err != nil {
  1142  				return fmt.Errorf("creating accel device file %q: %w", deviceNum, err)
  1143  			}
  1144  		}
  1145  	}
  1146  	return nil
  1147  }
  1148  
  1149  func nvproxyRegisterDevicesAndCreateFiles(ctx context.Context, info *containerInfo, k *kernel.Kernel, vfsObj *vfs.VirtualFilesystem, a *devtmpfs.Accessor) error {
  1150  	if !specutils.GPUFunctionalityRequested(info.spec, info.conf) {
  1151  		return nil
  1152  	}
  1153  	uvmDevMajor, err := k.VFS().GetDynamicCharDevMajor()
  1154  	if err != nil {
  1155  		return fmt.Errorf("reserving device major number for nvidia-uvm: %w", err)
  1156  	}
  1157  	if err := nvproxy.Register(vfsObj, uvmDevMajor); err != nil {
  1158  		return fmt.Errorf("registering nvproxy driver: %w", err)
  1159  	}
  1160  	info.nvidiaUVMDevMajor = uvmDevMajor
  1161  	if info.conf.NVProxyDocker {
  1162  		// In Docker mode, create all the device files now.
  1163  		// In non-Docker mode, these are instead created as part of
  1164  		// `createDeviceFiles`, using the spec's Device list.
  1165  		nvd, err := specutils.NvidiaDeviceNumbers(info.spec, info.conf)
  1166  		if err != nil {
  1167  			return fmt.Errorf("getting nvidia devices: %w", err)
  1168  		}
  1169  		if err := nvproxy.CreateDriverDevtmpfsFiles(ctx, a, uvmDevMajor); err != nil {
  1170  			return fmt.Errorf("creating nvproxy devtmpfs files: %w", err)
  1171  		}
  1172  		for _, d := range nvd {
  1173  			if err := nvproxy.CreateIndexDevtmpfsFile(ctx, a, d); err != nil {
  1174  				return fmt.Errorf("creating nvproxy devtmpfs file for device %d: %w", d, err)
  1175  			}
  1176  		}
  1177  	}
  1178  	return nil
  1179  }