github.com/opencontainers/runc@v1.2.0-rc.1.0.20240520010911-492dc558cdd6/libcontainer/rootfs_linux.go (about)

     1  package libcontainer
     2  
     3  import (
     4  	"encoding/json"
     5  	"errors"
     6  	"fmt"
     7  	"os"
     8  	"path"
     9  	"path/filepath"
    10  	"strconv"
    11  	"strings"
    12  	"syscall"
    13  	"time"
    14  
    15  	securejoin "github.com/cyphar/filepath-securejoin"
    16  	"github.com/moby/sys/mountinfo"
    17  	"github.com/mrunalp/fileutils"
    18  	"github.com/opencontainers/runtime-spec/specs-go"
    19  	"github.com/opencontainers/selinux/go-selinux/label"
    20  	"github.com/sirupsen/logrus"
    21  	"golang.org/x/sys/unix"
    22  
    23  	"github.com/opencontainers/runc/libcontainer/cgroups"
    24  	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
    25  	"github.com/opencontainers/runc/libcontainer/configs"
    26  	"github.com/opencontainers/runc/libcontainer/devices"
    27  	"github.com/opencontainers/runc/libcontainer/userns"
    28  	"github.com/opencontainers/runc/libcontainer/utils"
    29  )
    30  
// defaultMountFlags is the baseline flag set (noexec, nosuid, nodev) used for
// mounts that runc creates on its own in this file, such as the tmpfs and the
// per-subsystem cgroup mounts set up by mountCgroupV1.
const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
    32  
// mountConfig contains mount data not specific to a mount point.
type mountConfig struct {
	// root is the path of the container rootfs (config.Rootfs).
	root string
	// label is the mount label (config.MountLabel) passed on to the mount
	// helpers.
	label string
	// cgroup2Path is the cgroup v2 path taken from the init config; it is
	// used as the bind source when emulating a cgroup namespace.
	cgroup2Path string
	// rootlessCgroups mirrors the init config's RootlessCgroups setting.
	rootlessCgroups bool
	// cgroupns is true when the container config requests a new cgroup
	// namespace (configs.NEWCGROUP).
	cgroupns bool
}
    41  
// mountEntry contains mount data specific to a mount point.
type mountEntry struct {
	// Mount is the configured mount. It is embedded as a pointer, so writes to
	// its fields through a mountEntry value are visible to every other holder
	// of the same *configs.Mount.
	*configs.Mount
	// srcFile is optional. When non-nil it carries an already-opened source
	// for the mount (prepareRootfs fills it in from the fd it receives over
	// the sync pipe), and the src* helpers use it instead of Source.
	srcFile *mountSource
}
    47  
    48  // srcName is only meant for error messages, it returns a "friendly" name.
    49  func (m mountEntry) srcName() string {
    50  	if m.srcFile != nil {
    51  		return m.srcFile.file.Name()
    52  	}
    53  	return m.Source
    54  }
    55  
    56  func (m mountEntry) srcStat() (os.FileInfo, *syscall.Stat_t, error) {
    57  	var (
    58  		st  os.FileInfo
    59  		err error
    60  	)
    61  	if m.srcFile != nil {
    62  		st, err = m.srcFile.file.Stat()
    63  	} else {
    64  		st, err = os.Stat(m.Source)
    65  	}
    66  	if err != nil {
    67  		return nil, nil, err
    68  	}
    69  	return st, st.Sys().(*syscall.Stat_t), nil
    70  }
    71  
    72  func (m mountEntry) srcStatfs() (*unix.Statfs_t, error) {
    73  	var st unix.Statfs_t
    74  	if m.srcFile != nil {
    75  		if err := unix.Fstatfs(int(m.srcFile.file.Fd()), &st); err != nil {
    76  			return nil, os.NewSyscallError("fstatfs", err)
    77  		}
    78  	} else {
    79  		if err := unix.Statfs(m.Source, &st); err != nil {
    80  			return nil, &os.PathError{Op: "statfs", Path: m.Source, Err: err}
    81  		}
    82  	}
    83  	return &st, nil
    84  }
    85  
    86  // needsSetupDev returns true if /dev needs to be set up.
    87  func needsSetupDev(config *configs.Config) bool {
    88  	for _, m := range config.Mounts {
    89  		if m.Device == "bind" && utils.CleanPath(m.Destination) == "/dev" {
    90  			return false
    91  		}
    92  	}
    93  	return true
    94  }
    95  
// prepareRootfs sets up the devices, mount points, and filesystems for use
// inside a new mount namespace. It doesn't set anything as ro. You must call
// finalizeRootfs after this function to finish setting up the rootfs.
//
// In order, it: prepares the root (prepareRoot); performs every configured
// mount, first requesting a source fd over pipe where one is needed;
// populates /dev unless needsSetupDev says otherwise; syncs with the parent
// so it can run its hooks; changes directory into the rootfs; runs the
// CreateContainer hooks; jails the process via msMoveRoot, pivotRoot or
// chroot; re-opens /dev/null if /dev was set up; and finally creates
// iConfig.Cwd when it is non-empty.
func prepareRootfs(pipe *syncSocket, iConfig *initConfig) (err error) {
	config := iConfig.Config
	if err := prepareRoot(config); err != nil {
		return fmt.Errorf("error preparing rootfs: %w", err)
	}

	// Settings shared by every mount performed below.
	mountConfig := &mountConfig{
		root:            config.Rootfs,
		label:           config.MountLabel,
		cgroup2Path:     iConfig.Cgroup2Path,
		rootlessCgroups: iConfig.RootlessCgroups,
		cgroupns:        config.Namespaces.Contains(configs.NEWCGROUP),
	}
	for _, m := range config.Mounts {
		entry := mountEntry{Mount: m}
		// Figure out whether we need to request runc to give us an
		// open_tree(2)-style mountfd. For idmapped mounts, this is always
		// necessary. For bind-mounts, this is only necessary if we cannot
		// resolve the parent mount (this is only hit if you are running in a
		// userns -- but for rootless the host-side thread can't help).
		wantSourceFile := m.IsIDMapped()
		if m.IsBind() && !config.RootlessEUID {
			if _, err := os.Stat(m.Source); err != nil {
				wantSourceFile = true
			}
		}
		if wantSourceFile {
			// Request a source file from the host.
			if err := writeSyncArg(pipe, procMountPlease, m); err != nil {
				return fmt.Errorf("failed to request mountfd for %q: %w", m.Source, err)
			}
			sync, err := readSyncFull(pipe, procMountFd)
			if err != nil {
				return fmt.Errorf("mountfd request for %q failed: %w", m.Source, err)
			}
			if sync.File == nil {
				return fmt.Errorf("mountfd request for %q: response missing attached fd", m.Source)
			}
			// NOTE(review): this defer sits inside the loop, so each received
			// file stays open until prepareRootfs returns (i.e. across the
			// hooks and the pivot below), not just until the end of this
			// iteration -- confirm that this lifetime is intended.
			defer sync.File.Close()
			// Sanity-check to make sure we didn't get the wrong fd back. Note
			// that while m.Source might contain symlinks, the (*os.File).Name
			// is based on the path provided to os.OpenFile, not what it
			// resolves to. So this should never happen.
			if sync.File.Name() != m.Source {
				return fmt.Errorf("returned mountfd for %q doesn't match requested mount configuration: mountfd path is %q", m.Source, sync.File.Name())
			}
			// Unmarshal the procMountFd argument (the file is sync.File).
			var src *mountSource
			if sync.Arg == nil {
				return fmt.Errorf("sync %q is missing an argument", sync.Type)
			}
			if err := json.Unmarshal(*sync.Arg, &src); err != nil {
				return fmt.Errorf("invalid mount fd response argument %q: %w", string(*sync.Arg), err)
			}
			// A JSON "null" argument unmarshals successfully but leaves src nil.
			if src == nil {
				return fmt.Errorf("mountfd request for %q: no mount source info received", m.Source)
			}
			src.file = sync.File
			entry.srcFile = src
		}
		if err := mountToRootfs(mountConfig, entry); err != nil {
			return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err)
		}
	}

	// Populate /dev ourselves unless the configuration bind-mounts it.
	setupDev := needsSetupDev(config)
	if setupDev {
		if err := createDevices(config); err != nil {
			return fmt.Errorf("error creating device nodes: %w", err)
		}
		if err := setupPtmx(config); err != nil {
			return fmt.Errorf("error setting up ptmx: %w", err)
		}
		if err := setupDevSymlinks(config.Rootfs); err != nil {
			return fmt.Errorf("error setting up /dev symlinks: %w", err)
		}
	}

	// Signal the parent to run the pre-start hooks.
	// The hooks are run after the mounts are setup, but before we switch to the new
	// root, so that the old root is still available in the hooks for any mount
	// manipulations.
	// Note that iConfig.Cwd is not guaranteed to exist here.
	if err := syncParentHooks(pipe); err != nil {
		return err
	}

	// The reason these operations are done here rather than in finalizeRootfs
	// is because the console-handling code gets quite sticky if we have to set
	// up the console before doing the pivot_root(2). This is because the
	// Console API has to also work with the ExecIn case, which means that the
	// API must be able to deal with being inside as well as outside the
	// container. It's just cleaner to do this here (at the expense of the
	// operation not being perfectly split).

	if err := unix.Chdir(config.Rootfs); err != nil {
		return &os.PathError{Op: "chdir", Path: config.Rootfs, Err: err}
	}

	// Run the CreateContainer hooks with the state updated to describe this
	// process: our pid and the "creating" status.
	s := iConfig.SpecState
	s.Pid = unix.Getpid()
	s.Status = specs.StateCreating
	if err := iConfig.Config.Hooks.Run(configs.CreateContainer, s); err != nil {
		return err
	}

	// Jail the process in the rootfs. Which mechanism is used depends on the
	// NoPivotRoot setting and on whether a new mount namespace was requested.
	if config.NoPivotRoot {
		err = msMoveRoot(config.Rootfs)
	} else if config.Namespaces.Contains(configs.NEWNS) {
		err = pivotRoot(config.Rootfs)
	} else {
		err = chroot()
	}
	if err != nil {
		return fmt.Errorf("error jailing process inside rootfs: %w", err)
	}

	if setupDev {
		if err := reOpenDevNull(); err != nil {
			return fmt.Errorf("error reopening /dev/null inside container: %w", err)
		}
	}

	if cwd := iConfig.Cwd; cwd != "" {
		// Note that spec.Process.Cwd can contain unclean value like  "../../../../foo/bar...".
		// However, we are safe to call MkDirAll directly because we are in the jail here.
		if err := os.MkdirAll(cwd, 0o755); err != nil {
			return err
		}
	}

	return nil
}
   232  
   233  // finalizeRootfs sets anything to ro if necessary. You must call
   234  // prepareRootfs first.
   235  func finalizeRootfs(config *configs.Config) (err error) {
   236  	// All tmpfs mounts and /dev were previously mounted as rw
   237  	// by mountPropagate. Remount them read-only as requested.
   238  	for _, m := range config.Mounts {
   239  		if m.Flags&unix.MS_RDONLY != unix.MS_RDONLY {
   240  			continue
   241  		}
   242  		if m.Device == "tmpfs" || utils.CleanPath(m.Destination) == "/dev" {
   243  			if err := remountReadonly(m); err != nil {
   244  				return err
   245  			}
   246  		}
   247  	}
   248  
   249  	// set rootfs ( / ) as readonly
   250  	if config.Readonlyfs {
   251  		if err := setReadonly(); err != nil {
   252  			return fmt.Errorf("error setting rootfs as readonly: %w", err)
   253  		}
   254  	}
   255  
   256  	if config.Umask != nil {
   257  		unix.Umask(int(*config.Umask))
   258  	} else {
   259  		unix.Umask(0o022)
   260  	}
   261  	return nil
   262  }
   263  
   264  // /tmp has to be mounted as private to allow MS_MOVE to work in all situations
   265  func prepareTmp(topTmpDir string) (string, error) {
   266  	tmpdir, err := os.MkdirTemp(topTmpDir, "runctop")
   267  	if err != nil {
   268  		return "", err
   269  	}
   270  	if err := mount(tmpdir, tmpdir, "bind", unix.MS_BIND, ""); err != nil {
   271  		return "", err
   272  	}
   273  	if err := mount("", tmpdir, "", uintptr(unix.MS_PRIVATE), ""); err != nil {
   274  		return "", err
   275  	}
   276  	return tmpdir, nil
   277  }
   278  
// cleanupTmp unmounts tmpdir and then removes it together with anything left
// inside. Both steps are best-effort: their errors are deliberately ignored.
func cleanupTmp(tmpdir string) {
	_ = unix.Unmount(tmpdir, 0)
	_ = os.RemoveAll(tmpdir)
}
   283  
   284  func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
   285  	binds, err := getCgroupMounts(m)
   286  	if err != nil {
   287  		return err
   288  	}
   289  	var merged []string
   290  	for _, b := range binds {
   291  		ss := filepath.Base(b.Destination)
   292  		if strings.Contains(ss, ",") {
   293  			merged = append(merged, ss)
   294  		}
   295  	}
   296  	tmpfs := &configs.Mount{
   297  		Source:           "tmpfs",
   298  		Device:           "tmpfs",
   299  		Destination:      m.Destination,
   300  		Flags:            defaultMountFlags,
   301  		Data:             "mode=755",
   302  		PropagationFlags: m.PropagationFlags,
   303  	}
   304  
   305  	if err := mountToRootfs(c, mountEntry{Mount: tmpfs}); err != nil {
   306  		return err
   307  	}
   308  
   309  	for _, b := range binds {
   310  		if c.cgroupns {
   311  			subsystemPath := filepath.Join(c.root, b.Destination)
   312  			if err := os.MkdirAll(subsystemPath, 0o755); err != nil {
   313  				return err
   314  			}
   315  			if err := utils.WithProcfd(c.root, b.Destination, func(dstFd string) error {
   316  				flags := defaultMountFlags
   317  				if m.Flags&unix.MS_RDONLY != 0 {
   318  					flags = flags | unix.MS_RDONLY
   319  				}
   320  				var (
   321  					source = "cgroup"
   322  					data   = filepath.Base(subsystemPath)
   323  				)
   324  				if data == "systemd" {
   325  					data = cgroups.CgroupNamePrefix + data
   326  					source = "systemd"
   327  				}
   328  				return mountViaFds(source, nil, b.Destination, dstFd, "cgroup", uintptr(flags), data)
   329  			}); err != nil {
   330  				return err
   331  			}
   332  		} else {
   333  			if err := mountToRootfs(c, mountEntry{Mount: b}); err != nil {
   334  				return err
   335  			}
   336  		}
   337  	}
   338  	for _, mc := range merged {
   339  		for _, ss := range strings.Split(mc, ",") {
   340  			// symlink(2) is very dumb, it will just shove the path into
   341  			// the link and doesn't do any checks or relative path
   342  			// conversion. Also, don't error out if the cgroup already exists.
   343  			if err := os.Symlink(mc, filepath.Join(c.root, m.Destination, ss)); err != nil && !os.IsExist(err) {
   344  				return err
   345  			}
   346  		}
   347  	}
   348  	return nil
   349  }
   350  
   351  func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
   352  	dest, err := securejoin.SecureJoin(c.root, m.Destination)
   353  	if err != nil {
   354  		return err
   355  	}
   356  	if err := os.MkdirAll(dest, 0o755); err != nil {
   357  		return err
   358  	}
   359  	err = utils.WithProcfd(c.root, m.Destination, func(dstFd string) error {
   360  		return mountViaFds(m.Source, nil, m.Destination, dstFd, "cgroup2", uintptr(m.Flags), m.Data)
   361  	})
   362  	if err == nil || !(errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY)) {
   363  		return err
   364  	}
   365  
   366  	// When we are in UserNS but CgroupNS is not unshared, we cannot mount
   367  	// cgroup2 (#2158), so fall back to bind mount.
   368  	bindM := &configs.Mount{
   369  		Device:           "bind",
   370  		Source:           fs2.UnifiedMountpoint,
   371  		Destination:      m.Destination,
   372  		Flags:            unix.MS_BIND | m.Flags,
   373  		PropagationFlags: m.PropagationFlags,
   374  	}
   375  	if c.cgroupns && c.cgroup2Path != "" {
   376  		// Emulate cgroupns by bind-mounting the container cgroup path
   377  		// rather than the whole /sys/fs/cgroup.
   378  		bindM.Source = c.cgroup2Path
   379  	}
   380  	// mountToRootfs() handles remounting for MS_RDONLY.
   381  	err = mountToRootfs(c, mountEntry{Mount: bindM})
   382  	if c.rootlessCgroups && errors.Is(err, unix.ENOENT) {
   383  		// ENOENT (for `src = c.cgroup2Path`) happens when rootless runc is being executed
   384  		// outside the userns+mountns.
   385  		//
   386  		// Mask `/sys/fs/cgroup` to ensure it is read-only, even when `/sys` is mounted
   387  		// with `rbind,ro` (`runc spec --rootless` produces `rbind,ro` for `/sys`).
   388  		err = utils.WithProcfd(c.root, m.Destination, func(procfd string) error {
   389  			return maskPath(procfd, c.label)
   390  		})
   391  	}
   392  	return err
   393  }
   394  
   395  func doTmpfsCopyUp(m mountEntry, rootfs, mountLabel string) (Err error) {
   396  	// Set up a scratch dir for the tmpfs on the host.
   397  	tmpdir, err := prepareTmp("/tmp")
   398  	if err != nil {
   399  		return fmt.Errorf("tmpcopyup: failed to setup tmpdir: %w", err)
   400  	}
   401  	defer cleanupTmp(tmpdir)
   402  	tmpDir, err := os.MkdirTemp(tmpdir, "runctmpdir")
   403  	if err != nil {
   404  		return fmt.Errorf("tmpcopyup: failed to create tmpdir: %w", err)
   405  	}
   406  	defer os.RemoveAll(tmpDir)
   407  
   408  	// Configure the *host* tmpdir as if it's the container mount. We change
   409  	// m.Destination since we are going to mount *on the host*.
   410  	oldDest := m.Destination
   411  	m.Destination = tmpDir
   412  	err = mountPropagate(m, "/", mountLabel)
   413  	m.Destination = oldDest
   414  	if err != nil {
   415  		return err
   416  	}
   417  	defer func() {
   418  		if Err != nil {
   419  			if err := unmount(tmpDir, unix.MNT_DETACH); err != nil {
   420  				logrus.Warnf("tmpcopyup: %v", err)
   421  			}
   422  		}
   423  	}()
   424  
   425  	return utils.WithProcfd(rootfs, m.Destination, func(dstFd string) (Err error) {
   426  		// Copy the container data to the host tmpdir. We append "/" to force
   427  		// CopyDirectory to resolve the symlink rather than trying to copy the
   428  		// symlink itself.
   429  		if err := fileutils.CopyDirectory(dstFd+"/", tmpDir); err != nil {
   430  			return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, dstFd, tmpDir, err)
   431  		}
   432  		// Now move the mount into the container.
   433  		if err := mountViaFds(tmpDir, nil, m.Destination, dstFd, "", unix.MS_MOVE, ""); err != nil {
   434  			return fmt.Errorf("tmpcopyup: failed to move mount: %w", err)
   435  		}
   436  		return nil
   437  	})
   438  }
   439  
const (
	// The atime "enum" flags (which are mutually exclusive): noatime,
	// relatime and strictatime.
	mntAtimeEnumFlags = unix.MS_NOATIME | unix.MS_RELATIME | unix.MS_STRICTATIME
	// All atime-related flags: the enum flags above plus nodiratime.
	mntAtimeFlags = mntAtimeEnumFlags | unix.MS_NODIRATIME
	// Flags which can be locked when inheriting mounts in a different userns.
	// In the kernel, these are the mounts that are locked using MNT_LOCK_*.
	// mountToRootfs uses this mask when retrying a failed bind remount.
	mntLockFlags = unix.MS_RDONLY | unix.MS_NODEV | unix.MS_NOEXEC |
		unix.MS_NOSUID | mntAtimeFlags
)
   450  
   451  func statfsToMountFlags(st unix.Statfs_t) int {
   452  	// From <linux/statfs.h>.
   453  	const ST_NOSYMFOLLOW = 0x2000 //nolint:revive
   454  
   455  	var flags int
   456  	for _, f := range []struct {
   457  		st, ms int
   458  	}{
   459  		// See calculate_f_flags() in fs/statfs.c.
   460  		{unix.ST_RDONLY, unix.MS_RDONLY},
   461  		{unix.ST_NOSUID, unix.MS_NOSUID},
   462  		{unix.ST_NODEV, unix.MS_NODEV},
   463  		{unix.ST_NOEXEC, unix.MS_NOEXEC},
   464  		{unix.ST_MANDLOCK, unix.MS_MANDLOCK},
   465  		{unix.ST_SYNCHRONOUS, unix.MS_SYNCHRONOUS},
   466  		{unix.ST_NOATIME, unix.MS_NOATIME},
   467  		{unix.ST_NODIRATIME, unix.MS_NODIRATIME},
   468  		{unix.ST_RELATIME, unix.MS_RELATIME},
   469  		{ST_NOSYMFOLLOW, unix.MS_NOSYMFOLLOW},
   470  		// There is no ST_STRICTATIME -- see below.
   471  	} {
   472  		if int(st.Flags)&f.st == f.st {
   473  			flags |= f.ms
   474  		}
   475  	}
   476  	// MS_STRICTATIME is a "fake" MS_* flag. It isn't stored in mnt->mnt_flags,
   477  	// and so it doesn't show up in statfs(2). If none of the other flags in
   478  	// atime enum are present, the mount is MS_STRICTATIME.
   479  	if flags&mntAtimeEnumFlags == 0 {
   480  		flags |= unix.MS_STRICTATIME
   481  	}
   482  	return flags
   483  }
   484  
// mountToRootfs performs the single mount described by m inside the rootfs
// c.root, dispatching on m.Device:
//
//   - "proc", "sysfs": the destination is joined lexically (no symlink
//     resolution), must not be an existing non-directory, and is mounted
//     without a mount label.
//   - "mqueue": mounted without a label, then labelled via SetFileLabel.
//   - "tmpfs": mounted directly, or through doTmpfsCopyUp when the copy-up
//     extension is set.
//   - "bind": the source must be stat-able; after the bind mount a separate
//     remount applies the requested flags, followed by optional relabelling
//     and setRecAttr.
//   - "cgroup": delegated to mountCgroupV2 or mountCgroupV1.
//   - anything else: the destination is created and mounted with the label.
//
// Every destination is first vetted by checkProcMount.
func mountToRootfs(c *mountConfig, m mountEntry) error {
	rootfs := c.root

	// procfs and sysfs are special because we need to ensure they are actually
	// mounted on a specific path in a container without any funny business.
	switch m.Device {
	case "proc", "sysfs":
		// If the destination already exists and is not a directory, we bail
		// out. This is to avoid mounting through a symlink or similar -- which
		// has been a "fun" attack scenario in the past.
		// TODO: This won't be necessary once we switch to libpathrs and we can
		//       stop all of these symlink-exchange attacks.
		dest := filepath.Clean(m.Destination)
		if !strings.HasPrefix(dest, rootfs) {
			// Do not use securejoin as it resolves symlinks.
			dest = filepath.Join(rootfs, dest)
		}
		if err := checkProcMount(rootfs, dest, m); err != nil {
			return err
		}
		// A missing destination is fine (it is created below); any other
		// Lstat failure is not.
		if fi, err := os.Lstat(dest); err != nil {
			if !os.IsNotExist(err) {
				return err
			}
		} else if !fi.IsDir() {
			return fmt.Errorf("filesystem %q must be mounted on ordinary directory", m.Device)
		}
		if err := os.MkdirAll(dest, 0o755); err != nil {
			return err
		}
		// Selinux kernels do not support labeling of /proc or /sys.
		return mountPropagate(m, rootfs, "")
	}

	// All other devices resolve the destination with securejoin before the
	// /proc check.
	mountLabel := c.label
	dest, err := securejoin.SecureJoin(rootfs, m.Destination)
	if err != nil {
		return err
	}
	if err := checkProcMount(rootfs, dest, m); err != nil {
		return err
	}

	switch m.Device {
	case "mqueue":
		if err := os.MkdirAll(dest, 0o755); err != nil {
			return err
		}
		if err := mountPropagate(m, rootfs, ""); err != nil {
			return err
		}
		return label.SetFileLabel(dest, mountLabel)
	case "tmpfs":
		// If the destination cannot be stat'ed it is created; otherwise its
		// current mode is prepended to the mount data as "mode=...".
		// NOTE(review): m.Data is reached through the embedded *configs.Mount,
		// so this assignment also changes the Mount held by the caller --
		// confirm that this is intended.
		if stat, err := os.Stat(dest); err != nil {
			if err := os.MkdirAll(dest, 0o755); err != nil {
				return err
			}
		} else {
			dt := fmt.Sprintf("mode=%04o", syscallMode(stat.Mode()))
			if m.Data != "" {
				dt = dt + "," + m.Data
			}
			m.Data = dt
		}

		if m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP {
			err = doTmpfsCopyUp(m, rootfs, mountLabel)
		} else {
			err = mountPropagate(m, rootfs, mountLabel)
		}

		return err
	case "bind":
		fi, _, err := m.srcStat()
		if err != nil {
			// error out if the source of a bind mount does not exist as we will be
			// unable to bind anything to it.
			return err
		}
		// The destination is created as a directory or a file to match the
		// source.
		if err := createIfNotExists(dest, fi.IsDir()); err != nil {
			return err
		}
		// open_tree()-related shenanigans are all handled in mountViaFds.
		if err := mountPropagate(m, rootfs, mountLabel); err != nil {
			return err
		}

		// The initial MS_BIND won't change the mount options, we need to do a
		// separate MS_BIND|MS_REMOUNT to apply the mount options. We skip
		// doing this if the user has not specified any mount flags at all
		// (including cleared flags) -- in which case we just keep the original
		// mount flags.
		//
		// Note that the fact we check whether any clearing flags are set is in
		// contrast to mount(8)'s current behaviour, but is what users probably
		// expect. See <https://github.com/util-linux/util-linux/issues/2433>.
		if m.Flags & ^(unix.MS_BIND|unix.MS_REC|unix.MS_REMOUNT) != 0 || m.ClearedFlags != 0 {
			if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error {
				flags := m.Flags | unix.MS_BIND | unix.MS_REMOUNT
				// The runtime-spec says we SHOULD map to the relevant mount(8)
				// behaviour. However, it's not clear whether we want the
				// "mount --bind -o ..." or "mount --bind -o remount,..."
				// behaviour here -- both of which are somewhat broken[1].
				//
				// So, if the user has passed "remount" as a mount option, we
				// implement the "mount --bind -o remount" behaviour, otherwise
				// we implement the spiritual intent of the "mount --bind -o"
				// behaviour, which should match what users expect. Maybe
				// mount(8) will eventually implement this behaviour too..
				//
				// [1]: https://github.com/util-linux/util-linux/issues/2433

				// Initially, we emulate "mount --bind -o ..." where we set
				// only the requested flags (clearing any existing flags). The
				// only difference from mount(8) is that we do this
				// unconditionally, regardless of whether any set-me mount
				// options have been requested.
				//
				// TODO: We are not doing any special handling of the atime
				// flags here, which means that the mount will inherit the old
				// atime flags if the user didn't explicitly request a
				// different set of flags. This also has the mount(8) bug where
				// "nodiratime,norelatime" will result in a
				// "nodiratime,relatime" mount.
				mountErr := mountViaFds("", nil, m.Destination, dstFd, "", uintptr(flags), "")
				if mountErr == nil {
					return nil
				}

				// If the mount failed, the mount may contain locked mount
				// flags. In that case, we emulate "mount --bind -o
				// remount,...", where we take the existing mount flags of the
				// mount and apply the request flags (including clearing flags)
				// on top. The main divergence we have from mount(8) here is
				// that we handle atimes correctly to make sure we error out if
				// we cannot fulfil the requested mount flags.

				st, err := m.srcStatfs()
				if err != nil {
					return err
				}
				srcFlags := statfsToMountFlags(*st)
				// If the user explicitly request one of the locked flags *not*
				// be set, we need to return an error to avoid producing mounts
				// that don't match the user's request.
				if srcFlags&m.ClearedFlags&mntLockFlags != 0 {
					return mountErr
				}

				// If an MS_*ATIME flag was requested, it must match the
				// existing one. This handles two separate kernel bugs, and
				// matches the logic of can_change_locked_flags() but without
				// these bugs:
				//
				// * (2.6.30+) Since commit 613cbe3d4870 ("Don't set relatime
				// when noatime is specified"), MS_RELATIME is ignored when
				// MS_NOATIME is set. This means that us inheriting MS_NOATIME
				// from a mount while requesting MS_RELATIME would *silently*
				// produce an MS_NOATIME mount.
				//
				// * (2.6.30+) Since its introduction in commit d0adde574b84
				// ("Add a strictatime mount option"), MS_STRICTATIME has
				// caused any passed MS_RELATIME and MS_NOATIME flags to be
				// ignored which results in us *silently* producing
				// MS_STRICTATIME mounts even if the user requested MS_RELATIME
				// or MS_NOATIME.
				if m.Flags&mntAtimeFlags != 0 && m.Flags&mntAtimeFlags != srcFlags&mntAtimeFlags {
					return mountErr
				}

				// Retry the mount with the existing lockable mount flags
				// applied.
				flags |= srcFlags & mntLockFlags
				mountErr = mountViaFds("", nil, m.Destination, dstFd, "", uintptr(flags), "")
				logrus.Debugf("remount retry: srcFlags=0x%x flagsSet=0x%x flagsClr=0x%x: %v", srcFlags, m.Flags, m.ClearedFlags, mountErr)
				return mountErr
			}); err != nil {
				return err
			}
		}

		// Relabelling is applied to the mount source, not the destination.
		if m.Relabel != "" {
			if err := label.Validate(m.Relabel); err != nil {
				return err
			}
			shared := label.IsShared(m.Relabel)
			if err := label.Relabel(m.Source, mountLabel, shared); err != nil {
				return err
			}
		}
		return setRecAttr(m.Mount, rootfs)
	case "cgroup":
		if cgroups.IsCgroup2UnifiedMode() {
			return mountCgroupV2(m.Mount, c)
		}
		return mountCgroupV1(m.Mount, c)
	default:
		if err := os.MkdirAll(dest, 0o755); err != nil {
			return err
		}
		return mountPropagate(m, rootfs, mountLabel)
	}
}
   688  
   689  func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
   690  	mounts, err := cgroups.GetCgroupMounts(false)
   691  	if err != nil {
   692  		return nil, err
   693  	}
   694  
   695  	// We don't need to use /proc/thread-self here because runc always runs
   696  	// with every thread in the same cgroup. This lets us avoid having to do
   697  	// runtime.LockOSThread.
   698  	cgroupPaths, err := cgroups.ParseCgroupFile("/proc/self/cgroup")
   699  	if err != nil {
   700  		return nil, err
   701  	}
   702  
   703  	var binds []*configs.Mount
   704  
   705  	for _, mm := range mounts {
   706  		dir, err := mm.GetOwnCgroup(cgroupPaths)
   707  		if err != nil {
   708  			return nil, err
   709  		}
   710  		relDir, err := filepath.Rel(mm.Root, dir)
   711  		if err != nil {
   712  			return nil, err
   713  		}
   714  		binds = append(binds, &configs.Mount{
   715  			Device:           "bind",
   716  			Source:           filepath.Join(mm.Mountpoint, relDir),
   717  			Destination:      filepath.Join(m.Destination, filepath.Base(mm.Mountpoint)),
   718  			Flags:            unix.MS_BIND | unix.MS_REC | m.Flags,
   719  			PropagationFlags: m.PropagationFlags,
   720  		})
   721  	}
   722  
   723  	return binds, nil
   724  }
   725  
// procRootIno is the inode number of the root of a procfs superblock.
//
// Taken from <include/linux/proc_ns.h>. If a file is on a filesystem of type
// PROC_SUPER_MAGIC, we're guaranteed that only the root of the superblock will
// have this inode number.
const procRootIno = 1
   730  
   731  // checkProcMount checks to ensure that the mount destination is not over the top of /proc.
   732  // dest is required to be an abs path and have any symlinks resolved before calling this function.
   733  //
   734  // If m is nil, don't stat the filesystem.  This is used for restore of a checkpoint.
   735  func checkProcMount(rootfs, dest string, m mountEntry) error {
   736  	const procPath = "/proc"
   737  	path, err := filepath.Rel(filepath.Join(rootfs, procPath), dest)
   738  	if err != nil {
   739  		return err
   740  	}
   741  	// pass if the mount path is located outside of /proc
   742  	if strings.HasPrefix(path, "..") {
   743  		return nil
   744  	}
   745  	if path == "." {
   746  		// Only allow bind-mounts on top of /proc, and only if the source is a
   747  		// procfs mount.
   748  		if m.IsBind() {
   749  			fsSt, err := m.srcStatfs()
   750  			if err != nil {
   751  				return err
   752  			}
   753  			if fsSt.Type == unix.PROC_SUPER_MAGIC {
   754  				if _, uSt, err := m.srcStat(); err != nil {
   755  					return err
   756  				} else if uSt.Ino != procRootIno {
   757  					// We cannot error out in this case, because we've
   758  					// supported these kinds of mounts for a long time.
   759  					// However, we would expect users to bind-mount the root of
   760  					// a real procfs on top of /proc in the container. We might
   761  					// want to block this in the future.
   762  					logrus.Warnf("bind-mount %v (source %v) is of type procfs but is not the root of a procfs (inode %d). Future versions of runc might block this configuration -- please report an issue to <https://github.com/opencontainers/runc> if you see this warning.", dest, m.srcName(), uSt.Ino)
   763  				}
   764  				return nil
   765  			}
   766  		} else if m.Device == "proc" {
   767  			// Fresh procfs-type mounts are always safe to mount on top of /proc.
   768  			return nil
   769  		}
   770  		return fmt.Errorf("%q cannot be mounted because it is not of type proc", dest)
   771  	}
   772  
   773  	// Here dest is definitely under /proc. Do not allow those,
   774  	// except for a few specific entries emulated by lxcfs.
   775  	validProcMounts := []string{
   776  		"/proc/cpuinfo",
   777  		"/proc/diskstats",
   778  		"/proc/meminfo",
   779  		"/proc/stat",
   780  		"/proc/swaps",
   781  		"/proc/uptime",
   782  		"/proc/loadavg",
   783  		"/proc/slabinfo",
   784  		"/proc/net/dev",
   785  		"/proc/sys/kernel/ns_last_pid",
   786  		"/proc/sys/crypto/fips_enabled",
   787  	}
   788  	for _, valid := range validProcMounts {
   789  		path, err := filepath.Rel(filepath.Join(rootfs, valid), dest)
   790  		if err != nil {
   791  			return err
   792  		}
   793  		if path == "." {
   794  			return nil
   795  		}
   796  	}
   797  
   798  	return fmt.Errorf("%q cannot be mounted because it is inside /proc", dest)
   799  }
   800  
   801  func setupDevSymlinks(rootfs string) error {
   802  	// In theory, these should be links to /proc/thread-self, but systems
   803  	// expect these to be /proc/self and this matches how most distributions
   804  	// work.
   805  	links := [][2]string{
   806  		{"/proc/self/fd", "/dev/fd"},
   807  		{"/proc/self/fd/0", "/dev/stdin"},
   808  		{"/proc/self/fd/1", "/dev/stdout"},
   809  		{"/proc/self/fd/2", "/dev/stderr"},
   810  	}
   811  	// kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink
   812  	// in /dev if it exists in /proc.
   813  	if _, err := os.Stat("/proc/kcore"); err == nil {
   814  		links = append(links, [2]string{"/proc/kcore", "/dev/core"})
   815  	}
   816  	for _, link := range links {
   817  		var (
   818  			src = link[0]
   819  			dst = filepath.Join(rootfs, link[1])
   820  		)
   821  		if err := os.Symlink(src, dst); err != nil && !os.IsExist(err) {
   822  			return err
   823  		}
   824  	}
   825  	return nil
   826  }
   827  
   828  // If stdin, stdout, and/or stderr are pointing to `/dev/null` in the parent's rootfs
   829  // this method will make them point to `/dev/null` in this container's rootfs.  This
   830  // needs to be called after we chroot/pivot into the container's rootfs so that any
   831  // symlinks are resolved locally.
   832  func reOpenDevNull() error {
   833  	var stat, devNullStat unix.Stat_t
   834  	file, err := os.OpenFile("/dev/null", os.O_RDWR, 0)
   835  	if err != nil {
   836  		return err
   837  	}
   838  	defer file.Close() //nolint: errcheck
   839  	if err := unix.Fstat(int(file.Fd()), &devNullStat); err != nil {
   840  		return &os.PathError{Op: "fstat", Path: file.Name(), Err: err}
   841  	}
   842  	for fd := 0; fd < 3; fd++ {
   843  		if err := unix.Fstat(fd, &stat); err != nil {
   844  			return &os.PathError{Op: "fstat", Path: "fd " + strconv.Itoa(fd), Err: err}
   845  		}
   846  		if stat.Rdev == devNullStat.Rdev {
   847  			// Close and re-open the fd.
   848  			if err := unix.Dup3(int(file.Fd()), fd, 0); err != nil {
   849  				return &os.PathError{
   850  					Op:   "dup3",
   851  					Path: "fd " + strconv.Itoa(int(file.Fd())),
   852  					Err:  err,
   853  				}
   854  			}
   855  		}
   856  	}
   857  	return nil
   858  }
   859  
   860  // Create the device nodes in the container.
   861  func createDevices(config *configs.Config) error {
   862  	useBindMount := userns.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER)
   863  	for _, node := range config.Devices {
   864  
   865  		// The /dev/ptmx device is setup by setupPtmx()
   866  		if utils.CleanPath(node.Path) == "/dev/ptmx" {
   867  			continue
   868  		}
   869  
   870  		// containers running in a user namespace are not allowed to mknod
   871  		// devices so we can just bind mount it from the host.
   872  		if err := createDeviceNode(config.Rootfs, node, useBindMount); err != nil {
   873  			return err
   874  		}
   875  	}
   876  	return nil
   877  }
   878  
   879  func bindMountDeviceNode(rootfs, dest string, node *devices.Device) error {
   880  	f, err := os.Create(dest)
   881  	if err != nil && !os.IsExist(err) {
   882  		return err
   883  	}
   884  	if f != nil {
   885  		_ = f.Close()
   886  	}
   887  	return utils.WithProcfd(rootfs, dest, func(dstFd string) error {
   888  		return mountViaFds(node.Path, nil, dest, dstFd, "bind", unix.MS_BIND, "")
   889  	})
   890  }
   891  
   892  // Creates the device node in the rootfs of the container.
   893  func createDeviceNode(rootfs string, node *devices.Device, bind bool) error {
   894  	if node.Path == "" {
   895  		// The node only exists for cgroup reasons, ignore it here.
   896  		return nil
   897  	}
   898  	dest, err := securejoin.SecureJoin(rootfs, node.Path)
   899  	if err != nil {
   900  		return err
   901  	}
   902  	if err := os.MkdirAll(filepath.Dir(dest), 0o755); err != nil {
   903  		return err
   904  	}
   905  	if bind {
   906  		return bindMountDeviceNode(rootfs, dest, node)
   907  	}
   908  	if err := mknodDevice(dest, node); err != nil {
   909  		if errors.Is(err, os.ErrExist) {
   910  			return nil
   911  		} else if errors.Is(err, os.ErrPermission) {
   912  			return bindMountDeviceNode(rootfs, dest, node)
   913  		}
   914  		return err
   915  	}
   916  	return nil
   917  }
   918  
   919  func mknodDevice(dest string, node *devices.Device) error {
   920  	fileMode := node.FileMode
   921  	switch node.Type {
   922  	case devices.BlockDevice:
   923  		fileMode |= unix.S_IFBLK
   924  	case devices.CharDevice:
   925  		fileMode |= unix.S_IFCHR
   926  	case devices.FifoDevice:
   927  		fileMode |= unix.S_IFIFO
   928  	default:
   929  		return fmt.Errorf("%c is not a valid device type for device %s", node.Type, node.Path)
   930  	}
   931  	dev, err := node.Mkdev()
   932  	if err != nil {
   933  		return err
   934  	}
   935  	if err := unix.Mknod(dest, uint32(fileMode), int(dev)); err != nil {
   936  		return &os.PathError{Op: "mknod", Path: dest, Err: err}
   937  	}
   938  	// Ensure permission bits (can be different because of umask).
   939  	if err := os.Chmod(dest, fileMode); err != nil {
   940  		return err
   941  	}
   942  	return os.Chown(dest, int(node.Uid), int(node.Gid))
   943  }
   944  
   945  // Get the parent mount point of directory passed in as argument. Also return
   946  // optional fields.
   947  func getParentMount(rootfs string) (string, string, error) {
   948  	mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(rootfs))
   949  	if err != nil {
   950  		return "", "", err
   951  	}
   952  	if len(mi) < 1 {
   953  		return "", "", fmt.Errorf("could not find parent mount of %s", rootfs)
   954  	}
   955  
   956  	// find the longest mount point
   957  	var idx, maxlen int
   958  	for i := range mi {
   959  		if len(mi[i].Mountpoint) > maxlen {
   960  			maxlen = len(mi[i].Mountpoint)
   961  			idx = i
   962  		}
   963  	}
   964  	return mi[idx].Mountpoint, mi[idx].Optional, nil
   965  }
   966  
   967  // Make parent mount private if it was shared
   968  func rootfsParentMountPrivate(rootfs string) error {
   969  	sharedMount := false
   970  
   971  	parentMount, optionalOpts, err := getParentMount(rootfs)
   972  	if err != nil {
   973  		return err
   974  	}
   975  
   976  	optsSplit := strings.Split(optionalOpts, " ")
   977  	for _, opt := range optsSplit {
   978  		if strings.HasPrefix(opt, "shared:") {
   979  			sharedMount = true
   980  			break
   981  		}
   982  	}
   983  
   984  	// Make parent mount PRIVATE if it was shared. It is needed for two
   985  	// reasons. First of all pivot_root() will fail if parent mount is
   986  	// shared. Secondly when we bind mount rootfs it will propagate to
   987  	// parent namespace and we don't want that to happen.
   988  	if sharedMount {
   989  		return mount("", parentMount, "", unix.MS_PRIVATE, "")
   990  	}
   991  
   992  	return nil
   993  }
   994  
   995  func prepareRoot(config *configs.Config) error {
   996  	flag := unix.MS_SLAVE | unix.MS_REC
   997  	if config.RootPropagation != 0 {
   998  		flag = config.RootPropagation
   999  	}
  1000  	if err := mount("", "/", "", uintptr(flag), ""); err != nil {
  1001  		return err
  1002  	}
  1003  
  1004  	// Make parent mount private to make sure following bind mount does
  1005  	// not propagate in other namespaces. Also it will help with kernel
  1006  	// check pass in pivot_root. (IS_SHARED(new_mnt->mnt_parent))
  1007  	if err := rootfsParentMountPrivate(config.Rootfs); err != nil {
  1008  		return err
  1009  	}
  1010  
  1011  	return mount(config.Rootfs, config.Rootfs, "bind", unix.MS_BIND|unix.MS_REC, "")
  1012  }
  1013  
  1014  func setReadonly() error {
  1015  	flags := uintptr(unix.MS_BIND | unix.MS_REMOUNT | unix.MS_RDONLY)
  1016  
  1017  	err := mount("", "/", "", flags, "")
  1018  	if err == nil {
  1019  		return nil
  1020  	}
  1021  	var s unix.Statfs_t
  1022  	if err := unix.Statfs("/", &s); err != nil {
  1023  		return &os.PathError{Op: "statfs", Path: "/", Err: err}
  1024  	}
  1025  	flags |= uintptr(s.Flags)
  1026  	return mount("", "/", "", flags, "")
  1027  }
  1028  
  1029  func setupPtmx(config *configs.Config) error {
  1030  	ptmx := filepath.Join(config.Rootfs, "dev/ptmx")
  1031  	if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) {
  1032  		return err
  1033  	}
  1034  	if err := os.Symlink("pts/ptmx", ptmx); err != nil {
  1035  		return err
  1036  	}
  1037  	return nil
  1038  }
  1039  
// pivotRoot will call pivot_root such that rootfs becomes the new root
// filesystem, and everything else is cleaned up.
//
// The old root is detached with a lazy unmount and the working directory is
// left at "/" of the new root. Errors from the raw syscalls are returned as
// *os.PathError; errors from mount/unmount are returned unchanged.
func pivotRoot(rootfs string) error {
	// While the documentation may claim otherwise, pivot_root(".", ".") is
	// actually valid. What this results in is / being the new root but
	// /proc/self/cwd being the old root. Since we can play around with the cwd
	// with pivot_root this allows us to pivot without creating directories in
	// the rootfs. Shout-outs to the LXC developers for giving us this idea.

	// Keep a handle on the current root so we can get back to it after the
	// pivot, when it is no longer reachable by path.
	oldroot, err := unix.Open("/", unix.O_DIRECTORY|unix.O_RDONLY, 0)
	if err != nil {
		return &os.PathError{Op: "open", Path: "/", Err: err}
	}
	defer unix.Close(oldroot) //nolint: errcheck

	newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0)
	if err != nil {
		return &os.PathError{Op: "open", Path: rootfs, Err: err}
	}
	defer unix.Close(newroot) //nolint: errcheck

	// Change to the new root so that the pivot_root actually acts on it.
	if err := unix.Fchdir(newroot); err != nil {
		return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(newroot), Err: err}
	}

	if err := unix.PivotRoot(".", "."); err != nil {
		return &os.PathError{Op: "pivot_root", Path: ".", Err: err}
	}

	// Currently our "." is oldroot (according to the current kernel code).
	// However, purely for safety, we will fchdir(oldroot) since there isn't
	// really any guarantee from the kernel what /proc/self/cwd will be after a
	// pivot_root(2).

	if err := unix.Fchdir(oldroot); err != nil {
		return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(oldroot), Err: err}
	}

	// Make oldroot rslave to make sure our unmounts don't propagate to the
	// host (and thus bork the machine). We don't use rprivate because this is
	// known to cause issues due to races where we still have a reference to a
	// mount while a process in the host namespace is trying to operate on
	// something it thinks has no mounts (devicemapper in particular).
	if err := mount("", ".", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
		return err
	}
	// Perform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd.
	if err := unmount(".", unix.MNT_DETACH); err != nil {
		return err
	}

	// Switch back to our shiny new root.
	if err := unix.Chdir("/"); err != nil {
		return &os.PathError{Op: "chdir", Path: "/", Err: err}
	}
	return nil
}
  1098  
  1099  func msMoveRoot(rootfs string) error {
  1100  	// Before we move the root and chroot we have to mask all "full" sysfs and
  1101  	// procfs mounts which exist on the host. This is because while the kernel
  1102  	// has protections against mounting procfs if it has masks, when using
  1103  	// chroot(2) the *host* procfs mount is still reachable in the mount
  1104  	// namespace and the kernel permits procfs mounts inside --no-pivot
  1105  	// containers.
  1106  	//
  1107  	// Users shouldn't be using --no-pivot except in exceptional circumstances,
  1108  	// but to avoid such a trivial security flaw we apply a best-effort
  1109  	// protection here. The kernel only allows a mount of a pseudo-filesystem
  1110  	// like procfs or sysfs if there is a *full* mount (the root of the
  1111  	// filesystem is mounted) without any other locked mount points covering a
  1112  	// subtree of the mount.
  1113  	//
  1114  	// So we try to unmount (or mount tmpfs on top of) any mountpoint which is
  1115  	// a full mount of either sysfs or procfs (since those are the most
  1116  	// concerning filesystems to us).
  1117  	mountinfos, err := mountinfo.GetMounts(func(info *mountinfo.Info) (skip, stop bool) {
  1118  		// Collect every sysfs and procfs filesystem, except for those which
  1119  		// are non-full mounts or are inside the rootfs of the container.
  1120  		if info.Root != "/" ||
  1121  			(info.FSType != "proc" && info.FSType != "sysfs") ||
  1122  			strings.HasPrefix(info.Mountpoint, rootfs) {
  1123  			skip = true
  1124  		}
  1125  		return
  1126  	})
  1127  	if err != nil {
  1128  		return err
  1129  	}
  1130  	for _, info := range mountinfos {
  1131  		p := info.Mountpoint
  1132  		// Be sure umount events are not propagated to the host.
  1133  		if err := mount("", p, "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
  1134  			if errors.Is(err, unix.ENOENT) {
  1135  				// If the mountpoint doesn't exist that means that we've
  1136  				// already blasted away some parent directory of the mountpoint
  1137  				// and so we don't care about this error.
  1138  				continue
  1139  			}
  1140  			return err
  1141  		}
  1142  		if err := unmount(p, unix.MNT_DETACH); err != nil {
  1143  			if !errors.Is(err, unix.EINVAL) && !errors.Is(err, unix.EPERM) {
  1144  				return err
  1145  			} else {
  1146  				// If we have not privileges for umounting (e.g. rootless), then
  1147  				// cover the path.
  1148  				if err := mount("tmpfs", p, "tmpfs", 0, ""); err != nil {
  1149  					return err
  1150  				}
  1151  			}
  1152  		}
  1153  	}
  1154  
  1155  	// Move the rootfs on top of "/" in our mount namespace.
  1156  	if err := mount(rootfs, "/", "", unix.MS_MOVE, ""); err != nil {
  1157  		return err
  1158  	}
  1159  	return chroot()
  1160  }
  1161  
  1162  func chroot() error {
  1163  	if err := unix.Chroot("."); err != nil {
  1164  		return &os.PathError{Op: "chroot", Path: ".", Err: err}
  1165  	}
  1166  	if err := unix.Chdir("/"); err != nil {
  1167  		return &os.PathError{Op: "chdir", Path: "/", Err: err}
  1168  	}
  1169  	return nil
  1170  }
  1171  
  1172  // createIfNotExists creates a file or a directory only if it does not already exist.
  1173  func createIfNotExists(path string, isDir bool) error {
  1174  	if _, err := os.Stat(path); err != nil {
  1175  		if os.IsNotExist(err) {
  1176  			if isDir {
  1177  				return os.MkdirAll(path, 0o755)
  1178  			}
  1179  			if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
  1180  				return err
  1181  			}
  1182  			f, err := os.OpenFile(path, os.O_CREATE, 0o755)
  1183  			if err != nil {
  1184  				return err
  1185  			}
  1186  			_ = f.Close()
  1187  		}
  1188  	}
  1189  	return nil
  1190  }
  1191  
  1192  // readonlyPath will make a path read only.
  1193  func readonlyPath(path string) error {
  1194  	if err := mount(path, path, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
  1195  		if errors.Is(err, os.ErrNotExist) {
  1196  			return nil
  1197  		}
  1198  		return err
  1199  	}
  1200  
  1201  	var s unix.Statfs_t
  1202  	if err := unix.Statfs(path, &s); err != nil {
  1203  		return &os.PathError{Op: "statfs", Path: path, Err: err}
  1204  	}
  1205  	flags := uintptr(s.Flags) & (unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC)
  1206  
  1207  	if err := mount(path, path, "", flags|unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY, ""); err != nil {
  1208  		return err
  1209  	}
  1210  
  1211  	return nil
  1212  }
  1213  
  1214  // remountReadonly will remount an existing mount point and ensure that it is read-only.
  1215  func remountReadonly(m *configs.Mount) error {
  1216  	var (
  1217  		dest  = m.Destination
  1218  		flags = m.Flags
  1219  	)
  1220  	for i := 0; i < 5; i++ {
  1221  		// There is a special case in the kernel for
  1222  		// MS_REMOUNT | MS_BIND, which allows us to change only the
  1223  		// flags even as an unprivileged user (i.e. user namespace)
  1224  		// assuming we don't drop any security related flags (nodev,
  1225  		// nosuid, etc.). So, let's use that case so that we can do
  1226  		// this re-mount without failing in a userns.
  1227  		flags |= unix.MS_REMOUNT | unix.MS_BIND | unix.MS_RDONLY
  1228  		if err := mount("", dest, "", uintptr(flags), ""); err != nil {
  1229  			if errors.Is(err, unix.EBUSY) {
  1230  				time.Sleep(100 * time.Millisecond)
  1231  				continue
  1232  			}
  1233  			return err
  1234  		}
  1235  		return nil
  1236  	}
  1237  	return fmt.Errorf("unable to mount %s as readonly max retries reached", dest)
  1238  }
  1239  
  1240  // maskPath masks the top of the specified path inside a container to avoid
  1241  // security issues from processes reading information from non-namespace aware
  1242  // mounts ( proc/kcore ).
  1243  // For files, maskPath bind mounts /dev/null over the top of the specified path.
  1244  // For directories, maskPath mounts read-only tmpfs over the top of the specified path.
  1245  func maskPath(path string, mountLabel string) error {
  1246  	if err := mount("/dev/null", path, "", unix.MS_BIND, ""); err != nil && !errors.Is(err, os.ErrNotExist) {
  1247  		if errors.Is(err, unix.ENOTDIR) {
  1248  			return mount("tmpfs", path, "tmpfs", unix.MS_RDONLY, label.FormatMountLabel("", mountLabel))
  1249  		}
  1250  		return err
  1251  	}
  1252  	return nil
  1253  }
  1254  
  1255  // writeSystemProperty writes the value to a path under /proc/sys as determined from the key.
  1256  // For e.g. net.ipv4.ip_forward translated to /proc/sys/net/ipv4/ip_forward.
  1257  func writeSystemProperty(key, value string) error {
  1258  	keyPath := strings.Replace(key, ".", "/", -1)
  1259  	return os.WriteFile(path.Join("/proc/sys", keyPath), []byte(value), 0o644)
  1260  }
  1261  
  1262  // Do the mount operation followed by additional mounts required to take care
  1263  // of propagation flags. This will always be scoped inside the container rootfs.
  1264  func mountPropagate(m mountEntry, rootfs string, mountLabel string) error {
  1265  	var (
  1266  		data  = label.FormatMountLabel(m.Data, mountLabel)
  1267  		flags = m.Flags
  1268  	)
  1269  	// Delay mounting the filesystem read-only if we need to do further
  1270  	// operations on it. We need to set up files in "/dev", and other tmpfs
  1271  	// mounts may need to be chmod-ed after mounting. These mounts will be
  1272  	// remounted ro later in finalizeRootfs(), if necessary.
  1273  	if m.Device == "tmpfs" || utils.CleanPath(m.Destination) == "/dev" {
  1274  		flags &= ^unix.MS_RDONLY
  1275  	}
  1276  
  1277  	// Because the destination is inside a container path which might be
  1278  	// mutating underneath us, we verify that we are actually going to mount
  1279  	// inside the container with WithProcfd() -- mounting through a procfd
  1280  	// mounts on the target.
  1281  	if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error {
  1282  		return mountViaFds(m.Source, m.srcFile, m.Destination, dstFd, m.Device, uintptr(flags), data)
  1283  	}); err != nil {
  1284  		return err
  1285  	}
  1286  	// We have to apply mount propagation flags in a separate WithProcfd() call
  1287  	// because the previous call invalidates the passed procfd -- the mount
  1288  	// target needs to be re-opened.
  1289  	if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error {
  1290  		for _, pflag := range m.PropagationFlags {
  1291  			if err := mountViaFds("", nil, m.Destination, dstFd, "", uintptr(pflag), ""); err != nil {
  1292  				return err
  1293  			}
  1294  		}
  1295  		return nil
  1296  	}); err != nil {
  1297  		return fmt.Errorf("change mount propagation through procfd: %w", err)
  1298  	}
  1299  	return nil
  1300  }
  1301  
  1302  func setRecAttr(m *configs.Mount, rootfs string) error {
  1303  	if m.RecAttr == nil {
  1304  		return nil
  1305  	}
  1306  	return utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
  1307  		return unix.MountSetattr(-1, procfd, unix.AT_RECURSIVE, m.RecAttr)
  1308  	})
  1309  }