github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/runsc/boot/fs.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package boot
    16  
    17  import (
    18  	"fmt"
    19  	"path/filepath"
    20  	"sort"
    21  	"strconv"
    22  	"strings"
    23  
    24  	specs "github.com/opencontainers/runtime-spec/specs-go"
    25  	"golang.org/x/sys/unix"
    26  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    27  	"github.com/SagerNet/gvisor/pkg/context"
    28  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    29  	"github.com/SagerNet/gvisor/pkg/fd"
    30  	"github.com/SagerNet/gvisor/pkg/log"
    31  	"github.com/SagerNet/gvisor/pkg/sentry/fs"
    32  	"github.com/SagerNet/gvisor/pkg/sentry/fs/gofer"
    33  	"github.com/SagerNet/gvisor/pkg/sentry/fs/ramfs"
    34  	"github.com/SagerNet/gvisor/pkg/sentry/fs/user"
    35  	"github.com/SagerNet/gvisor/pkg/sentry/fsimpl/cgroupfs"
    36  	"github.com/SagerNet/gvisor/pkg/sentry/fsimpl/devpts"
    37  	"github.com/SagerNet/gvisor/pkg/sentry/fsimpl/devtmpfs"
    38  	gofervfs2 "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/gofer"
    39  	procvfs2 "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/proc"
    40  	sysvfs2 "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/sys"
    41  	tmpfsvfs2 "github.com/SagerNet/gvisor/pkg/sentry/fsimpl/tmpfs"
    42  	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
    43  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    44  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    45  	"github.com/SagerNet/gvisor/runsc/config"
    46  	"github.com/SagerNet/gvisor/runsc/specutils"
    47  
    48  	// Include filesystem types that OCI spec might mount.
    49  	_ "github.com/SagerNet/gvisor/pkg/sentry/fs/dev"
    50  	_ "github.com/SagerNet/gvisor/pkg/sentry/fs/host"
    51  	_ "github.com/SagerNet/gvisor/pkg/sentry/fs/proc"
    52  	_ "github.com/SagerNet/gvisor/pkg/sentry/fs/sys"
    53  	_ "github.com/SagerNet/gvisor/pkg/sentry/fs/tmpfs"
    54  	_ "github.com/SagerNet/gvisor/pkg/sentry/fs/tty"
    55  )
    56  
const (
	// rootDevice is the device name passed when mounting the container root.
	rootDevice = "9pfs-/"

	// MountPrefix is the annotation prefix for mount hints.
	MountPrefix = "dev.gvisor.spec.mount."

	// Mount types from the spec that map to a different internal filesystem
	// (see getMountNameAndOptions).
	bind   = "bind"
	nonefs = "none"
)
    68  
// tmpfsAllowedData lists the extra tmpfs option keys that must be passed
// through to the filesystem when it is mounted.
var tmpfsAllowedData = []string{"mode", "uid", "gid"}
    71  
    72  func addOverlay(ctx context.Context, conf *config.Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
    73  	// Upper layer uses the same flags as lower, but it must be read-write.
    74  	upperFlags := lowerFlags
    75  	upperFlags.ReadOnly = false
    76  
    77  	tmpFS := mustFindFilesystem("tmpfs")
    78  	if !fs.IsDir(lower.StableAttr) {
    79  		// Create overlay on top of mount file, e.g. /etc/hostname.
    80  		msrc := fs.NewCachingMountSource(ctx, tmpFS, upperFlags)
    81  		return fs.NewOverlayRootFile(ctx, msrc, lower, upperFlags)
    82  	}
    83  
    84  	// Create overlay on top of mount dir.
    85  	upper, err := tmpFS.Mount(ctx, name+"-upper", upperFlags, "", nil)
    86  	if err != nil {
    87  		return nil, fmt.Errorf("creating tmpfs overlay: %v", err)
    88  	}
    89  
    90  	// Replicate permissions and owner from lower to upper mount point.
    91  	attr, err := lower.UnstableAttr(ctx)
    92  	if err != nil {
    93  		return nil, fmt.Errorf("reading attributes from lower mount point: %v", err)
    94  	}
    95  	if !upper.InodeOperations.SetPermissions(ctx, upper, attr.Perms) {
    96  		return nil, fmt.Errorf("error setting permission to upper mount point")
    97  	}
    98  	if err := upper.InodeOperations.SetOwner(ctx, upper, attr.Owner); err != nil {
    99  		return nil, fmt.Errorf("setting owner to upper mount point: %v", err)
   100  	}
   101  
   102  	return fs.NewOverlayRoot(ctx, upper, lower, upperFlags)
   103  }
   104  
   105  // compileMounts returns the supported mounts from the mount spec, adding any
   106  // mandatory mounts that are required by the OCI specification.
   107  func compileMounts(spec *specs.Spec, conf *config.Config, vfs2Enabled bool) []specs.Mount {
   108  	// Keep track of whether proc and sys were mounted.
   109  	var procMounted, sysMounted, devMounted, devptsMounted bool
   110  	var mounts []specs.Mount
   111  
   112  	// Mount all submounts from the spec.
   113  	for _, m := range spec.Mounts {
   114  		if !specutils.IsSupportedDevMount(m, vfs2Enabled) {
   115  			log.Warningf("ignoring dev mount at %q", m.Destination)
   116  			continue
   117  		}
   118  		// Unconditionally drop any cgroupfs mounts. If requested, we'll add our
   119  		// own below.
   120  		if m.Type == cgroupfs.Name {
   121  			continue
   122  		}
   123  		switch filepath.Clean(m.Destination) {
   124  		case "/proc":
   125  			procMounted = true
   126  		case "/sys":
   127  			sysMounted = true
   128  		case "/dev":
   129  			m.Type = devtmpfs.Name
   130  			devMounted = true
   131  		case "/dev/pts":
   132  			m.Type = devpts.Name
   133  			devptsMounted = true
   134  		}
   135  		mounts = append(mounts, m)
   136  	}
   137  
   138  	// Mount proc and sys even if the user did not ask for it, as the spec
   139  	// says we SHOULD.
   140  	var mandatoryMounts []specs.Mount
   141  
   142  	if conf.Cgroupfs {
   143  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   144  			Type:        tmpfsvfs2.Name,
   145  			Destination: "/sys/fs/cgroup",
   146  		})
   147  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   148  			Type:        cgroupfs.Name,
   149  			Destination: "/sys/fs/cgroup/memory",
   150  			Options:     []string{"memory"},
   151  		})
   152  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   153  			Type:        cgroupfs.Name,
   154  			Destination: "/sys/fs/cgroup/cpu",
   155  			Options:     []string{"cpu"},
   156  		})
   157  	}
   158  
   159  	if !procMounted {
   160  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   161  			Type:        procvfs2.Name,
   162  			Destination: "/proc",
   163  		})
   164  	}
   165  	if !sysMounted {
   166  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   167  			Type:        sysvfs2.Name,
   168  			Destination: "/sys",
   169  		})
   170  	}
   171  	if !devMounted {
   172  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   173  			Type:        devtmpfs.Name,
   174  			Destination: "/dev",
   175  		})
   176  	}
   177  	if !devptsMounted {
   178  		mandatoryMounts = append(mandatoryMounts, specs.Mount{
   179  			Type:        devpts.Name,
   180  			Destination: "/dev/pts",
   181  		})
   182  	}
   183  
   184  	// The mandatory mounts should be ordered right after the root, in case
   185  	// there are submounts of these mandatory mounts already in the spec.
   186  	mounts = append(mounts[:0], append(mandatoryMounts, mounts[0:]...)...)
   187  
   188  	return mounts
   189  }
   190  
   191  // p9MountData creates a slice of p9 mount data.
   192  func p9MountData(fd int, fa config.FileAccessType, vfs2 bool) []string {
   193  	opts := []string{
   194  		"trans=fd",
   195  		"rfdno=" + strconv.Itoa(fd),
   196  		"wfdno=" + strconv.Itoa(fd),
   197  	}
   198  	if !vfs2 {
   199  		// privateunixsocket is always enabled in VFS2. VFS1 requires explicit
   200  		// enablement.
   201  		opts = append(opts, "privateunixsocket=true")
   202  	}
   203  	if fa == config.FileAccessShared {
   204  		opts = append(opts, "cache=remote_revalidating")
   205  	}
   206  	return opts
   207  }
   208  
   209  // parseAndFilterOptions parses a MountOptions slice and filters by the allowed
   210  // keys.
   211  func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) {
   212  	var out []string
   213  	for _, o := range opts {
   214  		ok, err := parseMountOption(o, allowedKeys...)
   215  		if err != nil {
   216  			return nil, err
   217  		}
   218  		if ok {
   219  			out = append(out, o)
   220  		}
   221  	}
   222  	return out, nil
   223  }
   224  
   225  func parseMountOption(opt string, allowedKeys ...string) (bool, error) {
   226  	kv := strings.SplitN(opt, "=", 3)
   227  	if len(kv) > 2 {
   228  		return false, fmt.Errorf("invalid option %q", opt)
   229  	}
   230  	return specutils.ContainsStr(allowedKeys, kv[0]), nil
   231  }
   232  
   233  // mountDevice returns a device string based on the fs type and target
   234  // of the mount.
   235  func mountDevice(m *specs.Mount) string {
   236  	if m.Type == bind {
   237  		// Make a device string that includes the target, which is consistent across
   238  		// S/R and uniquely identifies the connection.
   239  		return "9pfs-" + m.Destination
   240  	}
   241  	// All other fs types use device "none".
   242  	return "none"
   243  }
   244  
   245  func mountFlags(opts []string) fs.MountSourceFlags {
   246  	mf := fs.MountSourceFlags{}
   247  	// Note: changes to supported options must be reflected in
   248  	// isSupportedMountFlag() as well.
   249  	for _, o := range opts {
   250  		switch o {
   251  		case "rw":
   252  			mf.ReadOnly = false
   253  		case "ro":
   254  			mf.ReadOnly = true
   255  		case "noatime":
   256  			mf.NoAtime = true
   257  		case "noexec":
   258  			mf.NoExec = true
   259  		case "bind", "rbind":
   260  			// These are the same as a mount with type="bind".
   261  		default:
   262  			log.Warningf("ignoring unknown mount option %q", o)
   263  		}
   264  	}
   265  	return mf
   266  }
   267  
   268  func isSupportedMountFlag(fstype, opt string) bool {
   269  	switch opt {
   270  	case "rw", "ro", "noatime", "noexec":
   271  		return true
   272  	}
   273  	if fstype == tmpfsvfs2.Name {
   274  		ok, err := parseMountOption(opt, tmpfsAllowedData...)
   275  		return ok && err == nil
   276  	}
   277  	if fstype == cgroupfs.Name {
   278  		ok, err := parseMountOption(opt, cgroupfs.SupportedMountOptions...)
   279  		return ok && err == nil
   280  	}
   281  	return false
   282  }
   283  
   284  func mustFindFilesystem(name string) fs.Filesystem {
   285  	fs, ok := fs.FindFilesystem(name)
   286  	if !ok {
   287  		panic(fmt.Sprintf("could not find filesystem %q", name))
   288  	}
   289  	return fs
   290  }
   291  
   292  // addSubmountOverlay overlays the inode over a ramfs tree containing the given
   293  // paths.
   294  func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string, mf fs.MountSourceFlags) (*fs.Inode, error) {
   295  	// Construct a ramfs tree of mount points. The contents never
   296  	// change, so this can be fully caching. There's no real
   297  	// filesystem backing this tree, so we set the filesystem to
   298  	// nil.
   299  	msrc := fs.NewCachingMountSource(ctx, nil, fs.MountSourceFlags{})
   300  	mountTree, err := ramfs.MakeDirectoryTree(ctx, msrc, submounts)
   301  	if err != nil {
   302  		return nil, fmt.Errorf("creating mount tree: %v", err)
   303  	}
   304  	overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, mf)
   305  	if err != nil {
   306  		return nil, fmt.Errorf("adding mount overlay: %v", err)
   307  	}
   308  	return overlayInode, err
   309  }
   310  
   311  // subtargets takes a set of Mounts and returns only the targets that are
   312  // children of the given root. The returned paths are relative to the root.
   313  func subtargets(root string, mnts []specs.Mount) []string {
   314  	var targets []string
   315  	for _, mnt := range mnts {
   316  		if relPath, isSubpath := fs.IsSubpath(mnt.Destination, root); isSubpath {
   317  			targets = append(targets, relPath)
   318  		}
   319  	}
   320  	return targets
   321  }
   322  
   323  func setupContainerFS(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
   324  	if conf.VFS2 {
   325  		return setupContainerVFS2(ctx, conf, mntr, procArgs)
   326  	}
   327  	mns, err := mntr.setupFS(conf, procArgs)
   328  	if err != nil {
   329  		return err
   330  	}
   331  
   332  	// Set namespace here so that it can be found in ctx.
   333  	procArgs.MountNamespace = mns
   334  
   335  	// Resolve the executable path from working dir and environment.
   336  	resolved, err := user.ResolveExecutablePath(ctx, procArgs)
   337  	if err != nil {
   338  		return err
   339  	}
   340  	procArgs.Filename = resolved
   341  	return nil
   342  }
   343  
   344  func adjustDirentCache(k *kernel.Kernel) error {
   345  	var hl unix.Rlimit
   346  	if err := unix.Getrlimit(unix.RLIMIT_NOFILE, &hl); err != nil {
   347  		return fmt.Errorf("getting RLIMIT_NOFILE: %v", err)
   348  	}
   349  	if hl.Cur != unix.RLIM_INFINITY {
   350  		newSize := hl.Cur / 2
   351  		if newSize < gofer.DefaultDirentCacheSize {
   352  			log.Infof("Setting gofer dirent cache size to %d", newSize)
   353  			gofer.DefaultDirentCacheSize = newSize
   354  			k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize)
   355  		}
   356  	}
   357  	return nil
   358  }
   359  
   360  type fdDispenser struct {
   361  	fds []*fd.FD
   362  }
   363  
   364  func (f *fdDispenser) remove() int {
   365  	if f.empty() {
   366  		panic("fdDispenser out of fds")
   367  	}
   368  	rv := f.fds[0].Release()
   369  	f.fds = f.fds[1:]
   370  	return rv
   371  }
   372  
   373  func (f *fdDispenser) empty() bool {
   374  	return len(f.fds) == 0
   375  }
   376  
   377  type shareType int
   378  
   379  const (
   380  	invalid shareType = iota
   381  
   382  	// container shareType indicates that the mount is used by a single container.
   383  	container
   384  
   385  	// pod shareType indicates that the mount is used by more than one container
   386  	// inside the pod.
   387  	pod
   388  
   389  	// shared shareType indicates that the mount can also be shared with a process
   390  	// outside the pod, e.g. NFS.
   391  	shared
   392  )
   393  
   394  func parseShare(val string) (shareType, error) {
   395  	switch val {
   396  	case "container":
   397  		return container, nil
   398  	case "pod":
   399  		return pod, nil
   400  	case "shared":
   401  		return shared, nil
   402  	default:
   403  		return 0, fmt.Errorf("invalid share value %q", val)
   404  	}
   405  }
   406  
   407  func (s shareType) String() string {
   408  	switch s {
   409  	case invalid:
   410  		return "invalid"
   411  	case container:
   412  		return "container"
   413  	case pod:
   414  		return "pod"
   415  	case shared:
   416  		return "shared"
   417  	default:
   418  		return fmt.Sprintf("invalid share value %d", s)
   419  	}
   420  }
   421  
// mountHint represents extra information about mounts that are provided via
// annotations. They can override mount type, and provide sharing information
// so that mounts can be correctly shared inside the pod.
type mountHint struct {
	// name is the hint's name, taken from the annotation key.
	name string
	// share is the sharing mode parsed from the "share" annotation field.
	share shareType
	// mount holds the source, type and options set through annotations.
	mount specs.Mount

	// root is the inode where the volume is mounted. For mounts with 'pod' share
	// the volume is mounted once and then bind mounted inside the containers.
	root *fs.Inode

	// vfsMount is the master mount for the volume. For mounts with 'pod' share
	// the master volume is bind mounted inside the containers.
	vfsMount *vfs.Mount
}
   438  
   439  func (m *mountHint) setField(key, val string) error {
   440  	switch key {
   441  	case "source":
   442  		if len(val) == 0 {
   443  			return fmt.Errorf("source cannot be empty")
   444  		}
   445  		m.mount.Source = val
   446  	case "type":
   447  		return m.setType(val)
   448  	case "share":
   449  		share, err := parseShare(val)
   450  		if err != nil {
   451  			return err
   452  		}
   453  		m.share = share
   454  	case "options":
   455  		return m.setOptions(val)
   456  	default:
   457  		return fmt.Errorf("invalid mount annotation: %s=%s", key, val)
   458  	}
   459  	return nil
   460  }
   461  
   462  func (m *mountHint) setType(val string) error {
   463  	switch val {
   464  	case "tmpfs", "bind":
   465  		m.mount.Type = val
   466  	default:
   467  		return fmt.Errorf("invalid type %q", val)
   468  	}
   469  	return nil
   470  }
   471  
   472  func (m *mountHint) setOptions(val string) error {
   473  	opts := strings.Split(val, ",")
   474  	if err := specutils.ValidateMountOptions(opts); err != nil {
   475  		return err
   476  	}
   477  	// Sort options so it can be compared with container mount options later on.
   478  	sort.Strings(opts)
   479  	m.mount.Options = opts
   480  	return nil
   481  }
   482  
   483  func (m *mountHint) isSupported() bool {
   484  	return m.mount.Type == tmpfsvfs2.Name && m.share == pod
   485  }
   486  
   487  // checkCompatible verifies that shared mount is compatible with master.
   488  // For now enforce that all options are the same. Once bind mount is properly
   489  // supported, then we should ensure the master is less restrictive than the
   490  // container, e.g. master can be 'rw' while container mounts as 'ro'.
   491  func (m *mountHint) checkCompatible(mount *specs.Mount) error {
   492  	// Remove options that don't affect to mount's behavior.
   493  	masterOpts := filterUnsupportedOptions(&m.mount)
   494  	replicaOpts := filterUnsupportedOptions(mount)
   495  
   496  	if len(masterOpts) != len(replicaOpts) {
   497  		return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts)
   498  	}
   499  
   500  	sort.Strings(masterOpts)
   501  	sort.Strings(replicaOpts)
   502  	for i, opt := range masterOpts {
   503  		if opt != replicaOpts[i] {
   504  			return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, replicaOpts)
   505  		}
   506  	}
   507  	return nil
   508  }
   509  
   510  func (m *mountHint) fileAccessType() config.FileAccessType {
   511  	if m.share == container {
   512  		return config.FileAccessExclusive
   513  	}
   514  	return config.FileAccessShared
   515  }
   516  
   517  func filterUnsupportedOptions(mount *specs.Mount) []string {
   518  	rv := make([]string, 0, len(mount.Options))
   519  	for _, o := range mount.Options {
   520  		if isSupportedMountFlag(mount.Type, o) {
   521  			rv = append(rv, o)
   522  		}
   523  	}
   524  	return rv
   525  }
   526  
// podMountHints contains a collection of mountHints for the pod.
type podMountHints struct {
	// mounts maps each hint's name to the hint itself.
	mounts map[string]*mountHint
}
   531  
   532  func newPodMountHints(spec *specs.Spec) (*podMountHints, error) {
   533  	mnts := make(map[string]*mountHint)
   534  	for k, v := range spec.Annotations {
   535  		// Look for 'dev.gvisor.spec.mount' annotations and parse them.
   536  		if strings.HasPrefix(k, MountPrefix) {
   537  			// Remove the prefix and split the rest.
   538  			parts := strings.Split(k[len(MountPrefix):], ".")
   539  			if len(parts) != 2 {
   540  				return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v)
   541  			}
   542  			name := parts[0]
   543  			if len(name) == 0 {
   544  				return nil, fmt.Errorf("invalid mount name: %s", name)
   545  			}
   546  			mnt := mnts[name]
   547  			if mnt == nil {
   548  				mnt = &mountHint{name: name}
   549  				mnts[name] = mnt
   550  			}
   551  			if err := mnt.setField(parts[1], v); err != nil {
   552  				return nil, err
   553  			}
   554  		}
   555  	}
   556  
   557  	// Validate all hints after done parsing.
   558  	for name, m := range mnts {
   559  		log.Infof("Mount annotation found, name: %s, source: %q, type: %s, share: %v", name, m.mount.Source, m.mount.Type, m.share)
   560  		if m.share == invalid {
   561  			return nil, fmt.Errorf("share field for %q has not been set", m.name)
   562  		}
   563  		if len(m.mount.Source) == 0 {
   564  			return nil, fmt.Errorf("source field for %q has not been set", m.name)
   565  		}
   566  		if len(m.mount.Type) == 0 {
   567  			return nil, fmt.Errorf("type field for %q has not been set", m.name)
   568  		}
   569  
   570  		// Check for duplicate mount sources.
   571  		for name2, m2 := range mnts {
   572  			if name != name2 && m.mount.Source == m2.mount.Source {
   573  				return nil, fmt.Errorf("mounts %q and %q have the same mount source %q", m.name, m2.name, m.mount.Source)
   574  			}
   575  		}
   576  	}
   577  
   578  	return &podMountHints{mounts: mnts}, nil
   579  }
   580  
   581  func (p *podMountHints) findMount(mount *specs.Mount) *mountHint {
   582  	for _, m := range p.mounts {
   583  		if m.mount.Source == mount.Source {
   584  			return m
   585  		}
   586  	}
   587  	return nil
   588  }
   589  
// containerMounter holds the state needed to set up a container's filesystem.
type containerMounter struct {
	// root is the root filesystem description taken from the container spec.
	root *specs.Root

	// mounts is the set of submounts for the container. It's a copy from the spec
	// that may be freely modified without affecting the original spec.
	mounts []specs.Mount

	// fds is the list of FDs to be dispensed for mounts that require it.
	fds fdDispenser

	// k is the kernel used to create the contexts that mounting runs under.
	k *kernel.Kernel

	// hints holds the pod's mount hints, looked up by mount source.
	hints *podMountHints
}
   604  
   605  func newContainerMounter(info *containerInfo, k *kernel.Kernel, hints *podMountHints, vfs2Enabled bool) *containerMounter {
   606  	return &containerMounter{
   607  		root:   info.spec.Root,
   608  		mounts: compileMounts(info.spec, info.conf, vfs2Enabled),
   609  		fds:    fdDispenser{fds: info.goferFDs},
   610  		k:      k,
   611  		hints:  hints,
   612  	}
   613  }
   614  
   615  // processHints processes annotations that container hints about how volumes
   616  // should be mounted (e.g. a volume shared between containers). It must be
   617  // called for the root container only.
   618  func (c *containerMounter) processHints(conf *config.Config, creds *auth.Credentials) error {
   619  	if conf.VFS2 {
   620  		return c.processHintsVFS2(conf, creds)
   621  	}
   622  	ctx := c.k.SupervisorContext()
   623  	for _, hint := range c.hints.mounts {
   624  		// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
   625  		// common gofer to mount all shared volumes.
   626  		if hint.mount.Type != tmpfsvfs2.Name {
   627  			continue
   628  		}
   629  		log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
   630  		inode, err := c.mountSharedMaster(ctx, conf, hint)
   631  		if err != nil {
   632  			return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
   633  		}
   634  		hint.root = inode
   635  	}
   636  	return nil
   637  }
   638  
   639  // setupFS is used to set up the file system for all containers. This is the
   640  // main entry point method, with most of the other being internal only. It
   641  // returns the mount namespace that is created for the container.
   642  func (c *containerMounter) setupFS(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) {
   643  	log.Infof("Configuring container's file system")
   644  
   645  	// Create context with root credentials to mount the filesystem (the current
   646  	// user may not be privileged enough).
   647  	rootProcArgs := *procArgs
   648  	rootProcArgs.WorkingDirectory = "/"
   649  	rootProcArgs.Credentials = auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
   650  	rootProcArgs.Umask = 0022
   651  	rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
   652  	rootCtx := rootProcArgs.NewContext(c.k)
   653  
   654  	mns, err := c.createMountNamespace(rootCtx, conf)
   655  	if err != nil {
   656  		return nil, err
   657  	}
   658  
   659  	// Set namespace here so that it can be found in rootCtx.
   660  	rootProcArgs.MountNamespace = mns
   661  
   662  	if err := c.mountSubmounts(rootCtx, conf, mns); err != nil {
   663  		return nil, err
   664  	}
   665  	return mns, nil
   666  }
   667  
   668  func (c *containerMounter) createMountNamespace(ctx context.Context, conf *config.Config) (*fs.MountNamespace, error) {
   669  	rootInode, err := c.createRootMount(ctx, conf)
   670  	if err != nil {
   671  		return nil, fmt.Errorf("creating filesystem for container: %v", err)
   672  	}
   673  	mns, err := fs.NewMountNamespace(ctx, rootInode)
   674  	if err != nil {
   675  		return nil, fmt.Errorf("creating new mount namespace for container: %v", err)
   676  	}
   677  	return mns, nil
   678  }
   679  
   680  func (c *containerMounter) mountSubmounts(ctx context.Context, conf *config.Config, mns *fs.MountNamespace) error {
   681  	root := mns.Root()
   682  	defer root.DecRef(ctx)
   683  
   684  	for i := range c.mounts {
   685  		m := &c.mounts[i]
   686  		log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options)
   687  		if hint := c.hints.findMount(m); hint != nil && hint.isSupported() {
   688  			if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil {
   689  				return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err)
   690  			}
   691  		} else {
   692  			if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil {
   693  				return fmt.Errorf("mount submount %q: %v", m.Destination, err)
   694  			}
   695  		}
   696  	}
   697  
   698  	if err := c.mountTmp(ctx, conf, mns, root); err != nil {
   699  		return fmt.Errorf("mount submount %q: %v", "tmp", err)
   700  	}
   701  
   702  	if err := c.checkDispenser(); err != nil {
   703  		return err
   704  	}
   705  	return nil
   706  }
   707  
   708  func (c *containerMounter) checkDispenser() error {
   709  	if !c.fds.empty() {
   710  		return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.fds)
   711  	}
   712  	return nil
   713  }
   714  
   715  // mountSharedMaster mounts the master of a volume that is shared among
   716  // containers in a pod. It returns the root mount's inode.
   717  func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *config.Config, hint *mountHint) (*fs.Inode, error) {
   718  	// Map mount type to filesystem name, and parse out the options that we are
   719  	// capable of dealing with.
   720  	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, &hint.mount)
   721  	if err != nil {
   722  		return nil, err
   723  	}
   724  	if len(fsName) == 0 {
   725  		return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
   726  	}
   727  
   728  	// Mount with revalidate because it's shared among containers.
   729  	opts = append(opts, "cache=revalidate")
   730  
   731  	// All filesystem names should have been mapped to something we know.
   732  	filesystem := mustFindFilesystem(fsName)
   733  
   734  	mf := mountFlags(hint.mount.Options)
   735  	if useOverlay {
   736  		// All writes go to upper, be paranoid and make lower readonly.
   737  		mf.ReadOnly = true
   738  	}
   739  
   740  	inode, err := filesystem.Mount(ctx, mountDevice(&hint.mount), mf, strings.Join(opts, ","), nil)
   741  	if err != nil {
   742  		return nil, fmt.Errorf("creating mount %q: %v", hint.name, err)
   743  	}
   744  
   745  	if useOverlay {
   746  		log.Debugf("Adding overlay on top of shared mount %q", hint.name)
   747  		inode, err = addOverlay(ctx, conf, inode, hint.mount.Type, mf)
   748  		if err != nil {
   749  			return nil, err
   750  		}
   751  	}
   752  
   753  	return inode, nil
   754  }
   755  
   756  // createRootMount creates the root filesystem.
   757  func (c *containerMounter) createRootMount(ctx context.Context, conf *config.Config) (*fs.Inode, error) {
   758  	// First construct the filesystem from the spec.Root.
   759  	mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay}
   760  
   761  	fd := c.fds.remove()
   762  	log.Infof("Mounting root over 9P, ioFD: %d", fd)
   763  	p9FS := mustFindFilesystem("9p")
   764  	opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */)
   765  
   766  	// We can't check for overlayfs here because sandbox is chroot'ed and gofer
   767  	// can only send mount options for specs.Mounts (specs.Root is missing
   768  	// Options field). So assume root is always on top of overlayfs.
   769  	opts = append(opts, "overlayfs_stale_read")
   770  
   771  	rootInode, err := p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil)
   772  	if err != nil {
   773  		return nil, fmt.Errorf("creating root mount point: %v", err)
   774  	}
   775  
   776  	// We need to overlay the root on top of a ramfs with stub directories
   777  	// for submount paths.  "/dev" "/sys" "/proc" and "/tmp" are always
   778  	// mounted even if they are not in the spec.
   779  	submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp")
   780  	rootInode, err = addSubmountOverlay(ctx, rootInode, submounts, mf)
   781  	if err != nil {
   782  		return nil, fmt.Errorf("adding submount overlay: %v", err)
   783  	}
   784  
   785  	if conf.Overlay && !c.root.Readonly {
   786  		log.Debugf("Adding overlay on top of root mount")
   787  		// Overlay a tmpfs filesystem on top of the root.
   788  		rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf)
   789  		if err != nil {
   790  			return nil, err
   791  		}
   792  	}
   793  
   794  	log.Infof("Mounted %q to %q type root", c.root.Path, "/")
   795  	return rootInode, nil
   796  }
   797  
   798  // getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
   799  // used for mounts.
   800  func (c *containerMounter) getMountNameAndOptions(conf *config.Config, m *specs.Mount) (string, []string, bool, error) {
   801  	specutils.MaybeConvertToBindMount(m)
   802  
   803  	var (
   804  		fsName     string
   805  		opts       []string
   806  		useOverlay bool
   807  	)
   808  	switch m.Type {
   809  	case devpts.Name, devtmpfs.Name, procvfs2.Name, sysvfs2.Name:
   810  		fsName = m.Type
   811  	case nonefs:
   812  		fsName = sysvfs2.Name
   813  	case tmpfsvfs2.Name:
   814  		fsName = m.Type
   815  
   816  		var err error
   817  		opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
   818  		if err != nil {
   819  			return "", nil, false, err
   820  		}
   821  
   822  	case bind:
   823  		fd := c.fds.remove()
   824  		fsName = gofervfs2.Name
   825  		opts = p9MountData(fd, c.getMountAccessType(conf, m), conf.VFS2)
   826  		// If configured, add overlay to all writable mounts.
   827  		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
   828  	case cgroupfs.Name:
   829  		fsName = m.Type
   830  		var err error
   831  		opts, err = parseAndFilterOptions(m.Options, cgroupfs.SupportedMountOptions...)
   832  		if err != nil {
   833  			return "", nil, false, err
   834  		}
   835  	default:
   836  		log.Warningf("ignoring unknown filesystem type %q", m.Type)
   837  	}
   838  	return fsName, opts, useOverlay, nil
   839  }
   840  
   841  func (c *containerMounter) getMountAccessType(conf *config.Config, mount *specs.Mount) config.FileAccessType {
   842  	if hint := c.hints.findMount(mount); hint != nil {
   843  		return hint.fileAccessType()
   844  	}
   845  	return conf.FileAccessMounts
   846  }
   847  
   848  // mountSubmount mounts volumes inside the container's root. Because mounts may
   849  // be readonly, a lower ramfs overlay is added to create the mount point dir.
   850  // Another overlay is added with tmpfs on top if Config.Overlay is true.
   851  // 'm.Destination' must be an absolute path with '..' and symlinks resolved.
   852  func (c *containerMounter) mountSubmount(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent, m *specs.Mount) error {
   853  	// Map mount type to filesystem name, and parse out the options that we are
   854  	// capable of dealing with.
   855  	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
   856  	if err != nil {
   857  		return err
   858  	}
   859  	if fsName == "" {
   860  		// Filesystem is not supported (e.g. cgroup), just skip it.
   861  		return nil
   862  	}
   863  
   864  	// All filesystem names should have been mapped to something we know.
   865  	filesystem := mustFindFilesystem(fsName)
   866  
   867  	mf := mountFlags(m.Options)
   868  	if useOverlay {
   869  		// All writes go to upper, be paranoid and make lower readonly.
   870  		mf.ReadOnly = true
   871  	}
   872  
   873  	inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil)
   874  	if err != nil {
   875  		err := fmt.Errorf("creating mount with source %q: %v", m.Source, err)
   876  		// Check to see if this is a common error due to a Linux bug.
   877  		// This error is generated here in order to cause it to be
   878  		// printed to the user using Docker via 'runsc create' etc. rather
   879  		// than simply printed to the logs for the 'runsc boot' command.
   880  		//
   881  		// We check the error message string rather than type because the
   882  		// actual error types (unix.EIO, unix.EPIPE) are lost by file system
   883  		// implementation (e.g. p9).
   884  		// TODO(github.com/SagerNet/issue/1765): Remove message when bug is resolved.
   885  		if strings.Contains(err.Error(), unix.EIO.Error()) || strings.Contains(err.Error(), unix.EPIPE.Error()) {
   886  			return fmt.Errorf("%v: %s", err, specutils.FaqErrorMsg("memlock", "you may be encountering a Linux kernel bug"))
   887  		}
   888  		return err
   889  	}
   890  
   891  	// If there are submounts, we need to overlay the mount on top of a ramfs
   892  	// with stub directories for submount paths.
   893  	submounts := subtargets(m.Destination, c.mounts)
   894  	if len(submounts) > 0 {
   895  		log.Infof("Adding submount overlay over %q", m.Destination)
   896  		inode, err = addSubmountOverlay(ctx, inode, submounts, mf)
   897  		if err != nil {
   898  			return fmt.Errorf("adding submount overlay: %v", err)
   899  		}
   900  	}
   901  
   902  	if useOverlay {
   903  		log.Debugf("Adding overlay on top of mount %q", m.Destination)
   904  		inode, err = addOverlay(ctx, conf, inode, m.Type, mf)
   905  		if err != nil {
   906  			return err
   907  		}
   908  	}
   909  
   910  	maxTraversals := uint(0)
   911  	dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals)
   912  	if err != nil {
   913  		return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
   914  	}
   915  	defer dirent.DecRef(ctx)
   916  	if err := mns.Mount(ctx, dirent, inode); err != nil {
   917  		return fmt.Errorf("mount %q error: %v", m.Destination, err)
   918  	}
   919  
   920  	log.Infof("Mounted %q to %q type: %s, internal-options: %q", m.Source, m.Destination, m.Type, opts)
   921  	return nil
   922  }
   923  
   924  // mountSharedSubmount binds mount to a previously mounted volume that is shared
   925  // among containers in the same pod.
   926  func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount *specs.Mount, source *mountHint) error {
   927  	if err := source.checkCompatible(mount); err != nil {
   928  		return err
   929  	}
   930  
   931  	maxTraversals := uint(0)
   932  	target, err := mns.FindInode(ctx, root, root, mount.Destination, &maxTraversals)
   933  	if err != nil {
   934  		return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err)
   935  	}
   936  	defer target.DecRef(ctx)
   937  
   938  	// Take a ref on the inode that is about to be (re)-mounted.
   939  	source.root.IncRef()
   940  	if err := mns.Mount(ctx, target, source.root); err != nil {
   941  		source.root.DecRef(ctx)
   942  		return fmt.Errorf("bind mount %q error: %v", mount.Destination, err)
   943  	}
   944  
   945  	log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
   946  	return nil
   947  }
   948  
   949  // addRestoreMount adds a mount to the MountSources map used for restoring a
   950  // checkpointed container.
   951  func (c *containerMounter) addRestoreMount(conf *config.Config, renv *fs.RestoreEnvironment, m *specs.Mount) error {
   952  	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
   953  	if err != nil {
   954  		return err
   955  	}
   956  	if fsName == "" {
   957  		// Filesystem is not supported (e.g. cgroup), just skip it.
   958  		return nil
   959  	}
   960  
   961  	newMount := fs.MountArgs{
   962  		Dev:        mountDevice(m),
   963  		Flags:      mountFlags(m.Options),
   964  		DataString: strings.Join(opts, ","),
   965  	}
   966  	if useOverlay {
   967  		newMount.Flags.ReadOnly = true
   968  	}
   969  	renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount)
   970  	log.Infof("Added mount at %q: %+v", fsName, newMount)
   971  	return nil
   972  }
   973  
   974  // createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding
   975  // the mounts to the environment.
   976  func (c *containerMounter) createRestoreEnvironment(conf *config.Config) (*fs.RestoreEnvironment, error) {
   977  	renv := &fs.RestoreEnvironment{
   978  		MountSources: make(map[string][]fs.MountArgs),
   979  	}
   980  
   981  	// Add root mount.
   982  	fd := c.fds.remove()
   983  	opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */)
   984  
   985  	mf := fs.MountSourceFlags{}
   986  	if c.root.Readonly || conf.Overlay {
   987  		mf.ReadOnly = true
   988  	}
   989  
   990  	rootMount := fs.MountArgs{
   991  		Dev:        rootDevice,
   992  		Flags:      mf,
   993  		DataString: strings.Join(opts, ","),
   994  	}
   995  	renv.MountSources[gofervfs2.Name] = append(renv.MountSources[gofervfs2.Name], rootMount)
   996  
   997  	// Add submounts.
   998  	var tmpMounted bool
   999  	for i := range c.mounts {
  1000  		m := &c.mounts[i]
  1001  		if err := c.addRestoreMount(conf, renv, m); err != nil {
  1002  			return nil, err
  1003  		}
  1004  		if filepath.Clean(m.Destination) == "/tmp" {
  1005  			tmpMounted = true
  1006  		}
  1007  	}
  1008  
  1009  	// TODO(b/67958150): handle '/tmp' properly (see mountTmp()).
  1010  	if !tmpMounted {
  1011  		tmpMount := specs.Mount{
  1012  			Type:        tmpfsvfs2.Name,
  1013  			Destination: "/tmp",
  1014  		}
  1015  		if err := c.addRestoreMount(conf, renv, &tmpMount); err != nil {
  1016  			return nil, err
  1017  		}
  1018  	}
  1019  
  1020  	return renv, nil
  1021  }
  1022  
  1023  // mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so.
  1024  // Technically we don't have to mount tmpfs at /tmp, as we could just rely on
  1025  // the host /tmp, but this is a nice optimization, and fixes some apps that call
  1026  // mknod in /tmp. It's unsafe to mount tmpfs if:
  1027  //   1. /tmp is mounted explicitly: we should not override user's wish
  1028  //   2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
  1029  //
  1030  // Note that when there are submounts inside of '/tmp', directories for the
  1031  // mount points must be present, making '/tmp' not empty anymore.
  1032  func (c *containerMounter) mountTmp(ctx context.Context, conf *config.Config, mns *fs.MountNamespace, root *fs.Dirent) error {
  1033  	for _, m := range c.mounts {
  1034  		if filepath.Clean(m.Destination) == "/tmp" {
  1035  			log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m)
  1036  			return nil
  1037  		}
  1038  	}
  1039  
  1040  	maxTraversals := uint(0)
  1041  	tmp, err := mns.FindInode(ctx, root, root, "tmp", &maxTraversals)
  1042  	switch {
  1043  	case err == nil:
  1044  		// Found '/tmp' in filesystem, check if it's empty.
  1045  		defer tmp.DecRef(ctx)
  1046  		f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true})
  1047  		if err != nil {
  1048  			return err
  1049  		}
  1050  		defer f.DecRef(ctx)
  1051  		serializer := &fs.CollectEntriesSerializer{}
  1052  		if err := f.Readdir(ctx, serializer); err != nil {
  1053  			return err
  1054  		}
  1055  		// If more than "." and ".." is found, skip internal tmpfs to prevent hiding
  1056  		// existing files.
  1057  		if len(serializer.Order) > 2 {
  1058  			log.Infof("Skipping internal tmpfs on top %q, because it's not empty", "/tmp")
  1059  			return nil
  1060  		}
  1061  		log.Infof("Mounting internal tmpfs on top of empty %q", "/tmp")
  1062  		fallthrough
  1063  
  1064  	case linuxerr.Equals(linuxerr.ENOENT, err):
  1065  		// No '/tmp' found (or fallthrough from above). Safe to mount internal
  1066  		// tmpfs.
  1067  		tmpMount := specs.Mount{
  1068  			Type:        tmpfsvfs2.Name,
  1069  			Destination: "/tmp",
  1070  			// Sticky bit is added to prevent accidental deletion of files from
  1071  			// another user. This is normally done for /tmp.
  1072  			Options: []string{"mode=01777"},
  1073  		}
  1074  		return c.mountSubmount(ctx, conf, mns, root, &tmpMount)
  1075  
  1076  	default:
  1077  		return err
  1078  	}
  1079  }