github.com/SagerNet/gvisor@v0.0.0-20210707092255-7731c139d75c/runsc/boot/vfs.go (about)

     1  // Copyright 2018 The gVisor Authors.
     2  //
     3  // Licensed under the Apache License, Version 2.0 (the "License");
     4  // you may not use this file except in compliance with the License.
     5  // You may obtain a copy of the License at
     6  //
     7  //     http://www.apache.org/licenses/LICENSE-2.0
     8  //
     9  // Unless required by applicable law or agreed to in writing, software
    10  // distributed under the License is distributed on an "AS IS" BASIS,
    11  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    12  // See the License for the specific language governing permissions and
    13  // limitations under the License.
    14  
    15  package boot
    16  
    17  import (
    18  	"fmt"
    19  	"path"
    20  	"sort"
    21  	"strings"
    22  
    23  	specs "github.com/opencontainers/runtime-spec/specs-go"
    24  	"github.com/SagerNet/gvisor/pkg/abi/linux"
    25  	"github.com/SagerNet/gvisor/pkg/cleanup"
    26  	"github.com/SagerNet/gvisor/pkg/context"
    27  	"github.com/SagerNet/gvisor/pkg/errors/linuxerr"
    28  	"github.com/SagerNet/gvisor/pkg/fspath"
    29  	"github.com/SagerNet/gvisor/pkg/log"
    30  	"github.com/SagerNet/gvisor/pkg/sentry/devices/memdev"
    31  	"github.com/SagerNet/gvisor/pkg/sentry/devices/ttydev"
    32  	"github.com/SagerNet/gvisor/pkg/sentry/devices/tundev"
    33  	"github.com/SagerNet/gvisor/pkg/sentry/fs/user"
    34  	"github.com/SagerNet/gvisor/pkg/sentry/fsimpl/cgroupfs"
    35  	"github.com/SagerNet/gvisor/pkg/sentry/fsimpl/devpts"
    36  	"github.com/SagerNet/gvisor/pkg/sentry/fsimpl/devtmpfs"
    37  	"github.com/SagerNet/gvisor/pkg/sentry/fsimpl/fuse"
    38  	"github.com/SagerNet/gvisor/pkg/sentry/fsimpl/gofer"
    39  	"github.com/SagerNet/gvisor/pkg/sentry/fsimpl/overlay"
    40  	"github.com/SagerNet/gvisor/pkg/sentry/fsimpl/proc"
    41  	"github.com/SagerNet/gvisor/pkg/sentry/fsimpl/sys"
    42  	"github.com/SagerNet/gvisor/pkg/sentry/fsimpl/tmpfs"
    43  	"github.com/SagerNet/gvisor/pkg/sentry/fsimpl/verity"
    44  	"github.com/SagerNet/gvisor/pkg/sentry/inet"
    45  	"github.com/SagerNet/gvisor/pkg/sentry/kernel"
    46  	"github.com/SagerNet/gvisor/pkg/sentry/kernel/auth"
    47  	"github.com/SagerNet/gvisor/pkg/sentry/vfs"
    48  	"github.com/SagerNet/gvisor/runsc/config"
    49  	"github.com/SagerNet/gvisor/runsc/specutils"
    50  )
    51  
    52  func registerFilesystems(k *kernel.Kernel) error {
    53  	ctx := k.SupervisorContext()
    54  	creds := auth.NewRootCredentials(k.RootUserNamespace())
    55  	vfsObj := k.VFS()
    56  
    57  	vfsObj.MustRegisterFilesystemType(cgroupfs.Name, &cgroupfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
    58  		AllowUserMount: true,
    59  		AllowUserList:  true,
    60  	})
    61  	vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
    62  		AllowUserList: true,
    63  		// TODO(b/29356795): Users may mount this once the terminals are in a
    64  		//  usable state.
    65  		AllowUserMount: false,
    66  	})
    67  	vfsObj.MustRegisterFilesystemType(devtmpfs.Name, &devtmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
    68  		AllowUserMount: true,
    69  		AllowUserList:  true,
    70  	})
    71  	vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
    72  		AllowUserMount: true,
    73  		AllowUserList:  true,
    74  	})
    75  	vfsObj.MustRegisterFilesystemType(gofer.Name, &gofer.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
    76  		AllowUserList: true,
    77  	})
    78  	vfsObj.MustRegisterFilesystemType(overlay.Name, &overlay.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
    79  		AllowUserMount: true,
    80  		AllowUserList:  true,
    81  	})
    82  	vfsObj.MustRegisterFilesystemType(proc.Name, &proc.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
    83  		AllowUserMount: true,
    84  		AllowUserList:  true,
    85  	})
    86  	vfsObj.MustRegisterFilesystemType(sys.Name, &sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
    87  		AllowUserMount: true,
    88  		AllowUserList:  true,
    89  	})
    90  	vfsObj.MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
    91  		AllowUserMount: true,
    92  		AllowUserList:  true,
    93  	})
    94  	vfsObj.MustRegisterFilesystemType(verity.Name, &verity.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
    95  		AllowUserList:  true,
    96  		AllowUserMount: true,
    97  	})
    98  
    99  	// Setup files in devtmpfs.
   100  	if err := memdev.Register(vfsObj); err != nil {
   101  		return fmt.Errorf("registering memdev: %w", err)
   102  	}
   103  	if err := ttydev.Register(vfsObj); err != nil {
   104  		return fmt.Errorf("registering ttydev: %w", err)
   105  	}
   106  	tunSupported := tundev.IsNetTunSupported(inet.StackFromContext(ctx))
   107  	if tunSupported {
   108  		if err := tundev.Register(vfsObj); err != nil {
   109  			return fmt.Errorf("registering tundev: %v", err)
   110  		}
   111  	}
   112  
   113  	if kernel.FUSEEnabled {
   114  		if err := fuse.Register(vfsObj); err != nil {
   115  			return fmt.Errorf("registering fusedev: %w", err)
   116  		}
   117  	}
   118  
   119  	a, err := devtmpfs.NewAccessor(ctx, vfsObj, creds, devtmpfs.Name)
   120  	if err != nil {
   121  		return fmt.Errorf("creating devtmpfs accessor: %w", err)
   122  	}
   123  	defer a.Release(ctx)
   124  
   125  	if err := a.UserspaceInit(ctx); err != nil {
   126  		return fmt.Errorf("initializing userspace: %w", err)
   127  	}
   128  	if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil {
   129  		return fmt.Errorf("creating memdev devtmpfs files: %w", err)
   130  	}
   131  	if err := ttydev.CreateDevtmpfsFiles(ctx, a); err != nil {
   132  		return fmt.Errorf("creating ttydev devtmpfs files: %w", err)
   133  	}
   134  	if tunSupported {
   135  		if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil {
   136  			return fmt.Errorf("creating tundev devtmpfs files: %v", err)
   137  		}
   138  	}
   139  
   140  	if kernel.FUSEEnabled {
   141  		if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil {
   142  			return fmt.Errorf("creating fusedev devtmpfs files: %w", err)
   143  		}
   144  	}
   145  
   146  	return nil
   147  }
   148  
   149  func setupContainerVFS2(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
   150  	mns, err := mntr.mountAll(conf, procArgs)
   151  	if err != nil {
   152  		return fmt.Errorf("failed to setupFS: %w", err)
   153  	}
   154  	procArgs.MountNamespaceVFS2 = mns
   155  
   156  	// Resolve the executable path from working dir and environment.
   157  	resolved, err := user.ResolveExecutablePath(ctx, procArgs)
   158  	if err != nil {
   159  		return err
   160  	}
   161  	procArgs.Filename = resolved
   162  	return nil
   163  }
   164  
   165  func (c *containerMounter) mountAll(conf *config.Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
   166  	log.Infof("Configuring container's file system with VFS2")
   167  
   168  	// Create context with root credentials to mount the filesystem (the current
   169  	// user may not be privileged enough).
   170  	rootCreds := auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
   171  	rootProcArgs := *procArgs
   172  	rootProcArgs.WorkingDirectory = "/"
   173  	rootProcArgs.Credentials = rootCreds
   174  	rootProcArgs.Umask = 0022
   175  	rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
   176  	rootCtx := procArgs.NewContext(c.k)
   177  
   178  	mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds)
   179  	if err != nil {
   180  		return nil, fmt.Errorf("creating mount namespace: %w", err)
   181  	}
   182  	rootProcArgs.MountNamespaceVFS2 = mns
   183  
   184  	root := mns.Root()
   185  	root.IncRef()
   186  	defer root.DecRef(rootCtx)
   187  	if root.Mount().ReadOnly() {
   188  		// Switch to ReadWrite while we setup submounts.
   189  		if err := c.k.VFS().SetMountReadOnly(root.Mount(), false); err != nil {
   190  			return nil, fmt.Errorf(`failed to set mount at "/" readwrite: %w`, err)
   191  		}
   192  		// Restore back to ReadOnly at the end.
   193  		defer func() {
   194  			if err := c.k.VFS().SetMountReadOnly(root.Mount(), true); err != nil {
   195  				panic(fmt.Sprintf(`failed to restore mount at "/" back to readonly: %v`, err))
   196  			}
   197  		}()
   198  	}
   199  
   200  	// Mount submounts.
   201  	if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil {
   202  		return nil, fmt.Errorf("mounting submounts vfs2: %w", err)
   203  	}
   204  
   205  	return mns, nil
   206  }
   207  
   208  // createMountNamespaceVFS2 creates the container's root mount and namespace.
   209  func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
   210  	fd := c.fds.remove()
   211  	data := p9MountData(fd, conf.FileAccess, true /* vfs2 */)
   212  
   213  	// We can't check for overlayfs here because sandbox is chroot'ed and gofer
   214  	// can only send mount options for specs.Mounts (specs.Root is missing
   215  	// Options field). So assume root is always on top of overlayfs.
   216  	data = append(data, "overlayfs_stale_read")
   217  
   218  	log.Infof("Mounting root over 9P, ioFD: %d", fd)
   219  	opts := &vfs.MountOptions{
   220  		ReadOnly: c.root.Readonly,
   221  		GetFilesystemOptions: vfs.GetFilesystemOptions{
   222  			Data: strings.Join(data, ","),
   223  			InternalData: gofer.InternalFilesystemOptions{
   224  				UniqueID: "/",
   225  			},
   226  		},
   227  		InternalMount: true,
   228  	}
   229  
   230  	fsName := gofer.Name
   231  	if conf.Overlay && !c.root.Readonly {
   232  		log.Infof("Adding overlay on top of root")
   233  		var err error
   234  		var cleanup func()
   235  		opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
   236  		if err != nil {
   237  			return nil, fmt.Errorf("mounting root with overlay: %w", err)
   238  		}
   239  		defer cleanup()
   240  		fsName = overlay.Name
   241  	}
   242  
   243  	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", fsName, opts)
   244  	if err != nil {
   245  		return nil, fmt.Errorf("setting up mount namespace: %w", err)
   246  	}
   247  	return mns, nil
   248  }
   249  
// configureOverlay mounts the lower layer using "lowerOpts", mounts the upper
// layer using tmpfs, and return overlay mount options. "cleanup" must be called
// after the options have been used to mount the overlay, to release refs on
// lower and upper mounts.
//
// Note that lowerOpts is modified in place: its ReadOnly field is forced to
// true before the lower layer is mounted. The returned options are a copy of
// lowerOpts (taken before that change) with the filesystem-specific options
// replaced by the overlay configuration.
//
// The lower layer's root must be a directory or a regular file. For a regular
// file, its contents are copied into the upper layer's root. The lower root's
// owner, group and mode (as far as reported by stat) are applied to the upper
// root.
//
// On error, the returned options and cleanup function are nil.
func (c *containerMounter) configureOverlay(ctx context.Context, creds *auth.Credentials, lowerOpts *vfs.MountOptions, lowerFSName string) (*vfs.MountOptions, func(), error) {
	// First copy options from lower layer to upper layer and overlay. Clear
	// filesystem specific options.
	upperOpts := *lowerOpts
	upperOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}

	overlayOpts := *lowerOpts
	overlayOpts.GetFilesystemOptions = vfs.GetFilesystemOptions{}

	// All writes go to the upper layer, be paranoid and make lower readonly.
	lowerOpts.ReadOnly = true
	lower, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, lowerFSName, lowerOpts)
	if err != nil {
		return nil, nil, err
	}
	// cu accumulates the reference drops for the mounts created here. It is
	// run by the deferred Clean on every early return below.
	// NOTE(review): presumably cu.Release() at the end disarms this deferred
	// Clean and hands the accumulated cleanup to the caller; confirm against
	// the cleanup package.
	cu := cleanup.Make(func() { lower.DecRef(ctx) })
	defer cu.Clean()

	// Determine the lower layer's root's type.
	lowerRootVD := vfs.MakeVirtualDentry(lower, lower.Root())
	stat, err := c.k.VFS().StatAt(ctx, creds, &vfs.PathOperation{
		Root:  lowerRootVD,
		Start: lowerRootVD,
	}, &vfs.StatOptions{
		Mask: linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE | linux.STATX_TYPE,
	})
	if err != nil {
		return nil, nil, fmt.Errorf("failed to stat lower layer's root: %v", err)
	}
	// The file type is mandatory; UID/GID/mode are used only if reported.
	if stat.Mask&linux.STATX_TYPE == 0 {
		return nil, nil, fmt.Errorf("failed to get file type of lower layer's root")
	}
	rootType := stat.Mode & linux.S_IFMT
	if rootType != linux.S_IFDIR && rootType != linux.S_IFREG {
		return nil, nil, fmt.Errorf("lower layer's root has unsupported file type %v", rootType)
	}

	// Upper is a tmpfs mount to keep all modifications inside the sandbox.
	// Its root is created with the same file type as the lower layer's root.
	upperOpts.GetFilesystemOptions.InternalData = tmpfs.FilesystemOpts{
		RootFileType: uint16(rootType),
	}
	upper, err := c.k.VFS().MountDisconnected(ctx, creds, "" /* source */, tmpfs.Name, &upperOpts)
	if err != nil {
		return nil, nil, fmt.Errorf("failed to create upper layer for overlay, opts: %+v: %v", upperOpts, err)
	}
	cu.Add(func() { upper.DecRef(ctx) })

	// If the overlay mount consists of a regular file, copy up its contents
	// from the lower layer, since in the overlay the otherwise-empty upper
	// layer file will take precedence.
	upperRootVD := vfs.MakeVirtualDentry(upper, upper.Root())
	if rootType == linux.S_IFREG {
		lowerFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{
			Root:  lowerRootVD,
			Start: lowerRootVD,
		}, &vfs.OpenOptions{
			Flags: linux.O_RDONLY,
		})
		if err != nil {
			return nil, nil, fmt.Errorf("failed to open lower layer root for copying: %v", err)
		}
		defer lowerFD.DecRef(ctx)
		upperFD, err := c.k.VFS().OpenAt(ctx, creds, &vfs.PathOperation{
			Root:  upperRootVD,
			Start: upperRootVD,
		}, &vfs.OpenOptions{
			Flags: linux.O_WRONLY,
		})
		if err != nil {
			return nil, nil, fmt.Errorf("failed to open upper layer root for copying: %v", err)
		}
		defer upperFD.DecRef(ctx)
		if _, err := vfs.CopyRegularFileData(ctx, upperFD, lowerFD); err != nil {
			return nil, nil, fmt.Errorf("failed to copy up overlay file: %v", err)
		}
	}

	// Propagate the lower layer's root's owner, group, and mode to the upper
	// layer's root for consistency with VFS1. Only the attributes that stat
	// actually reported (stat.Mask) are set.
	err = c.k.VFS().SetStatAt(ctx, creds, &vfs.PathOperation{
		Root:  upperRootVD,
		Start: upperRootVD,
	}, &vfs.SetStatOptions{
		Stat: linux.Statx{
			Mask: (linux.STATX_UID | linux.STATX_GID | linux.STATX_MODE) & stat.Mask,
			UID:  stat.UID,
			GID:  stat.GID,
			Mode: stat.Mode,
		},
	})
	if err != nil {
		return nil, nil, err
	}

	// Configure overlay with both layers.
	overlayOpts.GetFilesystemOptions.InternalData = overlay.FilesystemOptions{
		UpperRoot:  upperRootVD,
		LowerRoots: []vfs.VirtualDentry{lowerRootVD},
	}
	return &overlayOpts, cu.Release(), nil
}
   355  
   356  func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
   357  	mounts, err := c.prepareMountsVFS2()
   358  	if err != nil {
   359  		return err
   360  	}
   361  
   362  	for i := range mounts {
   363  		submount := &mounts[i]
   364  		log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.mount.Source, submount.mount.Destination, submount.mount.Type, submount.mount.Options)
   365  		var (
   366  			mnt *vfs.Mount
   367  			err error
   368  		)
   369  
   370  		if hint := c.hints.findMount(submount.mount); hint != nil && hint.isSupported() {
   371  			mnt, err = c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.mount, hint)
   372  			if err != nil {
   373  				return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.mount.Destination, err)
   374  			}
   375  		} else {
   376  			mnt, err = c.mountSubmountVFS2(ctx, conf, mns, creds, submount)
   377  			if err != nil {
   378  				return fmt.Errorf("mount submount %q: %w", submount.mount.Destination, err)
   379  			}
   380  		}
   381  
   382  		if mnt != nil && mnt.ReadOnly() {
   383  			// Switch to ReadWrite while we setup submounts.
   384  			if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil {
   385  				return fmt.Errorf("failed to set mount at %q readwrite: %w", submount.mount.Destination, err)
   386  			}
   387  			// Restore back to ReadOnly at the end.
   388  			defer func() {
   389  				if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil {
   390  					panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.mount.Destination, err))
   391  				}
   392  			}()
   393  		}
   394  	}
   395  
   396  	if err := c.mountTmpVFS2(ctx, conf, creds, mns); err != nil {
   397  		return fmt.Errorf(`mount submount "\tmp": %w`, err)
   398  	}
   399  	return nil
   400  }
   401  
// mountAndFD pairs a mount from the container spec with the FD used to serve
// it.
type mountAndFD struct {
	// mount is the mount specification.
	mount *specs.Mount
	// fd is the FD associated with the mount. prepareMountsVFS2 sets it to -1
	// for mounts that are not bind mounts.
	fd int
}
   406  
   407  func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) {
   408  	// Associate bind mounts with their FDs before sorting since there is an
   409  	// undocumented assumption that FDs are dispensed in the order in which
   410  	// they are required by mounts.
   411  	var mounts []mountAndFD
   412  	for i := range c.mounts {
   413  		m := &c.mounts[i]
   414  		specutils.MaybeConvertToBindMount(m)
   415  
   416  		// Only bind mounts use host FDs; see
   417  		// containerMounter.getMountNameAndOptionsVFS2.
   418  		fd := -1
   419  		if m.Type == bind {
   420  			fd = c.fds.remove()
   421  		}
   422  		mounts = append(mounts, mountAndFD{
   423  			mount: m,
   424  			fd:    fd,
   425  		})
   426  	}
   427  	if err := c.checkDispenser(); err != nil {
   428  		return nil, err
   429  	}
   430  
   431  	// Sort the mounts so that we don't place children before parents.
   432  	sort.Slice(mounts, func(i, j int) bool {
   433  		return len(mounts[i].mount.Destination) < len(mounts[j].mount.Destination)
   434  	})
   435  
   436  	return mounts, nil
   437  }
   438  
   439  func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) (*vfs.Mount, error) {
   440  	fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, submount)
   441  	if err != nil {
   442  		return nil, fmt.Errorf("mountOptions failed: %w", err)
   443  	}
   444  	if len(fsName) == 0 {
   445  		// Filesystem is not supported (e.g. cgroup), just skip it.
   446  		return nil, nil
   447  	}
   448  
   449  	if err := c.makeMountPoint(ctx, creds, mns, submount.mount.Destination); err != nil {
   450  		return nil, fmt.Errorf("creating mount point %q: %w", submount.mount.Destination, err)
   451  	}
   452  
   453  	if useOverlay {
   454  		log.Infof("Adding overlay on top of mount %q", submount.mount.Destination)
   455  		var cleanup func()
   456  		opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
   457  		if err != nil {
   458  			return nil, fmt.Errorf("mounting volume with overlay at %q: %w", submount.mount.Destination, err)
   459  		}
   460  		defer cleanup()
   461  		fsName = overlay.Name
   462  	}
   463  
   464  	root := mns.Root()
   465  	root.IncRef()
   466  	defer root.DecRef(ctx)
   467  	target := &vfs.PathOperation{
   468  		Root:  root,
   469  		Start: root,
   470  		Path:  fspath.Parse(submount.mount.Destination),
   471  	}
   472  	mnt, err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts)
   473  	if err != nil {
   474  		return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.mount.Destination, submount.mount.Type, err, opts)
   475  	}
   476  	log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.mount.Source, submount.mount.Destination, submount.mount.Type, opts.GetFilesystemOptions.Data)
   477  	return mnt, nil
   478  }
   479  
// getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values
// used for mounts.
//
// An empty fsName with a nil error means the mount type is unknown and the
// mount should be skipped. useOverlay is only ever set for bind mounts, when
// conf.Overlay is enabled and the mount is not read-only.
//
// As a side effect, m.mount.Options is replaced by the options that remain
// after the verity options have been parsed out.
//
// When verity options are present, the returned fsName is verity.Name and the
// returned opts describe the verity mount, carrying the original filesystem
// name and options as the lower layer. Note that the ReadOnly/NoATime/NoExec
// settings parsed from the mount options are not copied into those new opts.
func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mountAndFD) (string, *vfs.MountOptions, bool, error) {
	fsName := m.mount.Type
	useOverlay := false
	var (
		data         []string
		internalData interface{}
	)

	// Strip verity options first so that the per-filesystem option handling
	// below only sees the remaining options.
	verityData, verityOpts, verityRequested, remainingMOpts, err := parseVerityMountOptions(m.mount.Options)
	if err != nil {
		return "", nil, false, err
	}
	m.mount.Options = remainingMOpts

	// Find filesystem name and FS specific data field.
	switch m.mount.Type {
	case devpts.Name, devtmpfs.Name, proc.Name, sys.Name:
		// Nothing to do.

	case nonefs:
		// Mounts of type nonefs are backed by sysfs.
		fsName = sys.Name

	case tmpfs.Name:
		var err error
		data, err = parseAndFilterOptions(m.mount.Options, tmpfsAllowedData...)
		if err != nil {
			return "", nil, false, err
		}

	case bind:
		// Bind mounts are served by the gofer over the mount's FD.
		fsName = gofer.Name
		if m.fd == 0 {
			// Check that an FD was provided to fail fast. Technically FD=0 is valid,
			// but unlikely to be correct in this context.
			return "", nil, false, fmt.Errorf("9P mount requires a connection FD")
		}
		data = p9MountData(m.fd, c.getMountAccessType(conf, m.mount), true /* vfs2 */)
		internalData = gofer.InternalFilesystemOptions{
			UniqueID: m.mount.Destination,
		}

		// If configured, add overlay to all writable mounts.
		useOverlay = conf.Overlay && !mountFlags(m.mount.Options).ReadOnly

	case cgroupfs.Name:
		var err error
		data, err = parseAndFilterOptions(m.mount.Options, cgroupfs.SupportedMountOptions...)
		if err != nil {
			return "", nil, false, err
		}

	default:
		log.Warningf("ignoring unknown filesystem type %q", m.mount.Type)
		return "", nil, false, nil
	}

	opts := &vfs.MountOptions{
		GetFilesystemOptions: vfs.GetFilesystemOptions{
			Data:         strings.Join(data, ","),
			InternalData: internalData,
		},
		InternalMount: true,
	}

	// Translate the generic mount options into mount flags. Options that are
	// not recognized here are only logged, not rejected.
	for _, o := range m.mount.Options {
		switch o {
		case "rw":
			opts.ReadOnly = false
		case "ro":
			opts.ReadOnly = true
		case "noatime":
			opts.Flags.NoATime = true
		case "noexec":
			opts.Flags.NoExec = true
		case "bind", "rbind":
			// These are the same as a mount with type="bind".
		default:
			log.Warningf("ignoring unknown mount option %q", o)
		}
	}

	if verityRequested {
		// Wrap the filesystem selected above in verity: it becomes the lower
		// layer, and the root name is the last element of the destination.
		verityData = verityData + "root_name=" + path.Base(m.mount.Destination)
		verityOpts.LowerName = fsName
		verityOpts.LowerGetFSOptions = opts.GetFilesystemOptions
		fsName = verity.Name
		opts = &vfs.MountOptions{
			GetFilesystemOptions: vfs.GetFilesystemOptions{
				Data:         verityData,
				InternalData: verityOpts,
			},
			InternalMount: true,
		}
	}

	return fsName, opts, useOverlay, nil
}
   579  
   580  func parseKeyValue(s string) (string, string, bool) {
   581  	tokens := strings.SplitN(s, "=", 2)
   582  	if len(tokens) < 2 {
   583  		return "", "", false
   584  	}
   585  	return strings.TrimSpace(tokens[0]), strings.TrimSpace(tokens[1]), true
   586  }
   587  
   588  // parseAndFilterOptions scans the provided mount options for verity-related
   589  // mount options. It returns the parsed set of verity mount options, as well as
   590  // the filtered set of mount options unrelated to verity.
   591  func parseVerityMountOptions(mopts []string) (string, verity.InternalFilesystemOptions, bool, []string, error) {
   592  	nonVerity := []string{}
   593  	found := false
   594  	var rootHash string
   595  	verityOpts := verity.InternalFilesystemOptions{
   596  		Action: verity.PanicOnViolation,
   597  	}
   598  	for _, o := range mopts {
   599  		if !strings.HasPrefix(o, "verity.") {
   600  			nonVerity = append(nonVerity, o)
   601  			continue
   602  		}
   603  
   604  		k, v, ok := parseKeyValue(o)
   605  		if !ok {
   606  			return "", verityOpts, found, nonVerity, fmt.Errorf("invalid verity mount option with no value: %q", o)
   607  		}
   608  
   609  		found = true
   610  		switch k {
   611  		case "verity.roothash":
   612  			rootHash = v
   613  		case "verity.action":
   614  			switch v {
   615  			case "error":
   616  				verityOpts.Action = verity.ErrorOnViolation
   617  			case "panic":
   618  				verityOpts.Action = verity.PanicOnViolation
   619  			default:
   620  				log.Warningf("Invalid verity action %q", v)
   621  				verityOpts.Action = verity.PanicOnViolation
   622  			}
   623  		default:
   624  			return "", verityOpts, found, nonVerity, fmt.Errorf("unknown verity mount option: %q", k)
   625  		}
   626  	}
   627  	verityOpts.AllowRuntimeEnable = len(rootHash) == 0
   628  	verityData := "root_hash=" + rootHash + ","
   629  	return verityData, verityOpts, found, nonVerity, nil
   630  }
   631  
// mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so.
// Technically we don't have to mount tmpfs at /tmp, as we could just rely on
// the host /tmp, but this is a nice optimization, and fixes some apps that call
// mknod in /tmp. It's unsafe to mount tmpfs if:
//   1. /tmp is mounted explicitly: we should not override user's wish
//   2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
//
// Note that when there are submounts inside of '/tmp', directories for the
// mount points must be present, making '/tmp' not empty anymore.
//
// The tmpfs is also mounted when '/tmp' does not exist. Nothing is mounted,
// and nil is returned, when '/tmp' exists but is not a directory.
func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
	for _, m := range c.mounts {
		// m.Destination has been cleaned, so it's safe to use equality here.
		if m.Destination == "/tmp" {
			log.Debugf(`Explict "/tmp" mount found, skipping internal tmpfs, mount: %+v`, m)
			return nil
		}
	}

	root := mns.Root()
	root.IncRef()
	defer root.DecRef(ctx)
	pop := vfs.PathOperation{
		Root:  root,
		Start: root,
		Path:  fspath.Parse("/tmp"),
	}
	// Open "/tmp" as a directory; the outcome decides which case below runs.
	fd, err := c.k.VFS().OpenAt(ctx, creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY})
	switch {
	case err == nil:
		defer fd.DecRef(ctx)

		// The callback aborts the iteration with ENOTEMPTY on the first entry
		// other than "." and "..", which is how a non-empty directory is
		// detected.
		err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
			if dirent.Name != "." && dirent.Name != ".." {
				return linuxerr.ENOTEMPTY
			}
			return nil
		}))
		switch {
		case err == nil:
			log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`)
		case linuxerr.Equals(linuxerr.ENOTEMPTY, err):
			// If more than "." and ".." is found, skip internal tmpfs to prevent
			// hiding existing files.
			log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`)
			return nil
		default:
			return err
		}
		// Only the empty-directory case reaches this point; continue into the
		// mounting code shared with the ENOENT case.
		fallthrough

	case linuxerr.Equals(linuxerr.ENOENT, err):
		// No '/tmp' found (or fallthrough from above). It's safe to mount internal
		// tmpfs.
		tmpMount := specs.Mount{
			Type:        tmpfs.Name,
			Destination: "/tmp",
			// Sticky bit is added to prevent accidental deletion of files from
			// another user. This is normally done for /tmp.
			Options: []string{"mode=01777"},
		}
		_, err := c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{mount: &tmpMount})
		return err

	case linuxerr.Equals(linuxerr.ENOTDIR, err):
		// Not a dir?! Let it be.
		return nil

	default:
		return fmt.Errorf(`opening "/tmp" inside container: %w`, err)
	}
}
   703  
   704  // processHintsVFS2 processes annotations that container hints about how volumes
   705  // should be mounted (e.g. a volume shared between containers). It must be
   706  // called for the root container only.
   707  func (c *containerMounter) processHintsVFS2(conf *config.Config, creds *auth.Credentials) error {
   708  	ctx := c.k.SupervisorContext()
   709  	for _, hint := range c.hints.mounts {
   710  		// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
   711  		// common gofer to mount all shared volumes.
   712  		if hint.mount.Type != tmpfs.Name {
   713  			continue
   714  		}
   715  
   716  		log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
   717  		mnt, err := c.mountSharedMasterVFS2(ctx, conf, hint, creds)
   718  		if err != nil {
   719  			return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
   720  		}
   721  		hint.vfsMount = mnt
   722  	}
   723  	return nil
   724  }
   725  
   726  // mountSharedMasterVFS2 mounts the master of a volume that is shared among
   727  // containers in a pod.
   728  func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *config.Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) {
   729  	// Map mount type to filesystem name, and parse out the options that we are
   730  	// capable of dealing with.
   731  	mntFD := &mountAndFD{mount: &hint.mount}
   732  	fsName, opts, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, mntFD)
   733  	if err != nil {
   734  		return nil, err
   735  	}
   736  	if len(fsName) == 0 {
   737  		return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
   738  	}
   739  
   740  	if useOverlay {
   741  		log.Infof("Adding overlay on top of shared mount %q", mntFD.mount.Destination)
   742  		var cleanup func()
   743  		opts, cleanup, err = c.configureOverlay(ctx, creds, opts, fsName)
   744  		if err != nil {
   745  			return nil, fmt.Errorf("mounting shared volume with overlay at %q: %w", mntFD.mount.Destination, err)
   746  		}
   747  		defer cleanup()
   748  		fsName = overlay.Name
   749  	}
   750  
   751  	return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts)
   752  }
   753  
   754  // mountSharedSubmount binds mount to a previously mounted volume that is shared
   755  // among containers in the same pod.
   756  func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount *specs.Mount, source *mountHint) (*vfs.Mount, error) {
   757  	if err := source.checkCompatible(mount); err != nil {
   758  		return nil, err
   759  	}
   760  
   761  	// Ignore data and useOverlay because these were already applied to
   762  	// the master mount.
   763  	_, opts, _, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{mount: mount})
   764  	if err != nil {
   765  		return nil, err
   766  	}
   767  	newMnt, err := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts)
   768  	if err != nil {
   769  		return nil, err
   770  	}
   771  	defer newMnt.DecRef(ctx)
   772  
   773  	root := mns.Root()
   774  	root.IncRef()
   775  	defer root.DecRef(ctx)
   776  	target := &vfs.PathOperation{
   777  		Root:  root,
   778  		Start: root,
   779  		Path:  fspath.Parse(mount.Destination),
   780  	}
   781  
   782  	if err := c.makeMountPoint(ctx, creds, mns, mount.Destination); err != nil {
   783  		return nil, fmt.Errorf("creating mount point %q: %w", mount.Destination, err)
   784  	}
   785  
   786  	if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil {
   787  		return nil, err
   788  	}
   789  	log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
   790  	return newMnt, nil
   791  }
   792  
   793  func (c *containerMounter) makeMountPoint(ctx context.Context, creds *auth.Credentials, mns *vfs.MountNamespace, dest string) error {
   794  	root := mns.Root()
   795  	root.IncRef()
   796  	defer root.DecRef(ctx)
   797  	target := &vfs.PathOperation{
   798  		Root:  root,
   799  		Start: root,
   800  		Path:  fspath.Parse(dest),
   801  	}
   802  	// First check if mount point exists. When overlay is enabled, gofer doesn't
   803  	// allow changes to the FS, making MakeSytheticMountpoint() ineffective
   804  	// because MkdirAt fails with EROFS even if file exists.
   805  	vd, err := c.k.VFS().GetDentryAt(ctx, creds, target, &vfs.GetDentryOptions{})
   806  	if err == nil {
   807  		// File exists, we're done.
   808  		vd.DecRef(ctx)
   809  		return nil
   810  	}
   811  	return c.k.VFS().MakeSyntheticMountpoint(ctx, dest, root, creds)
   812  }
   813  
   814  // configureRestore returns an updated context.Context including filesystem
   815  // state used by restore defined by conf.
   816  func (c *containerMounter) configureRestore(ctx context.Context) (context.Context, error) {
   817  	fdmap := make(map[string]int)
   818  	fdmap["/"] = c.fds.remove()
   819  	mounts, err := c.prepareMountsVFS2()
   820  	if err != nil {
   821  		return ctx, err
   822  	}
   823  	for i := range c.mounts {
   824  		submount := &mounts[i]
   825  		if submount.fd >= 0 {
   826  			fdmap[submount.mount.Destination] = submount.fd
   827  		}
   828  	}
   829  	return context.WithValue(ctx, gofer.CtxRestoreServerFDMap, fdmap), nil
   830  }