github.com/rumpl/bof@v23.0.0-rc.2+incompatible/daemon/oci_linux.go (about)

     1  package daemon // import "github.com/docker/docker/daemon"
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"os"
     7  	"os/exec"
     8  	"path/filepath"
     9  	"sort"
    10  	"strconv"
    11  	"strings"
    12  
    13  	cdcgroups "github.com/containerd/cgroups"
    14  	"github.com/containerd/containerd/containers"
    15  	coci "github.com/containerd/containerd/oci"
    16  	"github.com/containerd/containerd/pkg/apparmor"
    17  	"github.com/containerd/containerd/pkg/userns"
    18  	containertypes "github.com/docker/docker/api/types/container"
    19  	"github.com/docker/docker/container"
    20  	dconfig "github.com/docker/docker/daemon/config"
    21  	"github.com/docker/docker/errdefs"
    22  	"github.com/docker/docker/oci"
    23  	"github.com/docker/docker/oci/caps"
    24  	"github.com/docker/docker/pkg/idtools"
    25  	"github.com/docker/docker/pkg/rootless/specconv"
    26  	"github.com/docker/docker/pkg/stringid"
    27  	volumemounts "github.com/docker/docker/volume/mounts"
    28  	"github.com/moby/sys/mount"
    29  	"github.com/moby/sys/mountinfo"
    30  	"github.com/opencontainers/runc/libcontainer/cgroups"
    31  	"github.com/opencontainers/runc/libcontainer/user"
    32  	specs "github.com/opencontainers/runtime-spec/specs-go"
    33  	"github.com/pkg/errors"
    34  	"github.com/sirupsen/logrus"
    35  	"golang.org/x/sys/unix"
    36  )
    37  
// inContainerInitPath is the path inside the container at which the
// docker-init binary is bind-mounted when the container runs with init
// enabled (see WithCommonOptions).
const inContainerInitPath = "/sbin/" + dconfig.DefaultInitBinary
    39  
    40  // WithRlimits sets the container's rlimits along with merging the daemon's rlimits
    41  func WithRlimits(daemon *Daemon, c *container.Container) coci.SpecOpts {
    42  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    43  		var rlimits []specs.POSIXRlimit
    44  
    45  		// We want to leave the original HostConfig alone so make a copy here
    46  		hostConfig := *c.HostConfig
    47  		// Merge with the daemon defaults
    48  		daemon.mergeUlimits(&hostConfig)
    49  		for _, ul := range hostConfig.Ulimits {
    50  			rlimits = append(rlimits, specs.POSIXRlimit{
    51  				Type: "RLIMIT_" + strings.ToUpper(ul.Name),
    52  				Soft: uint64(ul.Soft),
    53  				Hard: uint64(ul.Hard),
    54  			})
    55  		}
    56  
    57  		s.Process.Rlimits = rlimits
    58  		return nil
    59  	}
    60  }
    61  
    62  // WithLibnetwork sets the libnetwork hook
    63  func WithLibnetwork(daemon *Daemon, c *container.Container) coci.SpecOpts {
    64  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    65  		if s.Hooks == nil {
    66  			s.Hooks = &specs.Hooks{}
    67  		}
    68  		for _, ns := range s.Linux.Namespaces {
    69  			if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
    70  				target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
    71  				shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
    72  				s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
    73  					Path: target,
    74  					Args: []string{
    75  						"libnetwork-setkey",
    76  						"-exec-root=" + daemon.configStore.GetExecRoot(),
    77  						c.ID,
    78  						shortNetCtlrID,
    79  					},
    80  				})
    81  			}
    82  		}
    83  		return nil
    84  	}
    85  }
    86  
    87  // WithRootless sets the spec to the rootless configuration
    88  func WithRootless(daemon *Daemon) coci.SpecOpts {
    89  	return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    90  		var v2Controllers []string
    91  		if daemon.getCgroupDriver() == cgroupSystemdDriver {
    92  			if cdcgroups.Mode() != cdcgroups.Unified {
    93  				return errors.New("rootless systemd driver doesn't support cgroup v1")
    94  			}
    95  			rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
    96  			if rootlesskitParentEUID == "" {
    97  				return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
    98  			}
    99  			euid, err := strconv.Atoi(rootlesskitParentEUID)
   100  			if err != nil {
   101  				return errors.Wrap(err, "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value")
   102  			}
   103  			controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers", euid)
   104  			controllersFile, err := os.ReadFile(controllersPath)
   105  			if err != nil {
   106  				return err
   107  			}
   108  			v2Controllers = strings.Fields(string(controllersFile))
   109  		}
   110  		return specconv.ToRootless(s, v2Controllers)
   111  	}
   112  }
   113  
   114  // WithOOMScore sets the oom score
   115  func WithOOMScore(score *int) coci.SpecOpts {
   116  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   117  		s.Process.OOMScoreAdj = score
   118  		return nil
   119  	}
   120  }
   121  
   122  // WithSelinux sets the selinux labels
   123  func WithSelinux(c *container.Container) coci.SpecOpts {
   124  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   125  		s.Process.SelinuxLabel = c.GetProcessLabel()
   126  		s.Linux.MountLabel = c.MountLabel
   127  		return nil
   128  	}
   129  }
   130  
   131  // WithApparmor sets the apparmor profile
   132  func WithApparmor(c *container.Container) coci.SpecOpts {
   133  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   134  		if apparmor.HostSupports() {
   135  			var appArmorProfile string
   136  			if c.AppArmorProfile != "" {
   137  				appArmorProfile = c.AppArmorProfile
   138  			} else if c.HostConfig.Privileged {
   139  				appArmorProfile = unconfinedAppArmorProfile
   140  			} else {
   141  				appArmorProfile = defaultAppArmorProfile
   142  			}
   143  
   144  			if appArmorProfile == defaultAppArmorProfile {
   145  				// Unattended upgrades and other fun services can unload AppArmor
   146  				// profiles inadvertently. Since we cannot store our profile in
   147  				// /etc/apparmor.d, nor can we practically add other ways of
   148  				// telling the system to keep our profile loaded, in order to make
   149  				// sure that we keep the default profile enabled we dynamically
   150  				// reload it if necessary.
   151  				if err := ensureDefaultAppArmorProfile(); err != nil {
   152  					return err
   153  				}
   154  			}
   155  			s.Process.ApparmorProfile = appArmorProfile
   156  		}
   157  		return nil
   158  	}
   159  }
   160  
   161  // WithCapabilities sets the container's capabilties
   162  func WithCapabilities(c *container.Container) coci.SpecOpts {
   163  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   164  		capabilities, err := caps.TweakCapabilities(
   165  			caps.DefaultCapabilities(),
   166  			c.HostConfig.CapAdd,
   167  			c.HostConfig.CapDrop,
   168  			c.HostConfig.Privileged,
   169  		)
   170  		if err != nil {
   171  			return err
   172  		}
   173  		return oci.SetCapabilities(s, capabilities)
   174  	}
   175  }
   176  
   177  func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) {
   178  	p, err := getPath()
   179  	if err != nil {
   180  		return "", err
   181  	}
   182  	return c.GetResourcePath(p)
   183  }
   184  
   185  func getUser(c *container.Container, username string) (specs.User, error) {
   186  	var usr specs.User
   187  	passwdPath, err := resourcePath(c, user.GetPasswdPath)
   188  	if err != nil {
   189  		return usr, err
   190  	}
   191  	groupPath, err := resourcePath(c, user.GetGroupPath)
   192  	if err != nil {
   193  		return usr, err
   194  	}
   195  	execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
   196  	if err != nil {
   197  		return usr, err
   198  	}
   199  	usr.UID = uint32(execUser.Uid)
   200  	usr.GID = uint32(execUser.Gid)
   201  	usr.AdditionalGids = []uint32{usr.GID}
   202  
   203  	var addGroups []int
   204  	if len(c.HostConfig.GroupAdd) > 0 {
   205  		addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
   206  		if err != nil {
   207  			return usr, err
   208  		}
   209  	}
   210  	for _, g := range append(execUser.Sgids, addGroups...) {
   211  		usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
   212  	}
   213  	return usr, nil
   214  }
   215  
   216  func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
   217  	for i, n := range s.Linux.Namespaces {
   218  		if n.Type == ns.Type {
   219  			s.Linux.Namespaces[i] = ns
   220  			return
   221  		}
   222  	}
   223  	s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
   224  }
   225  
// WithNamespaces sets the container's namespaces (user, network, ipc, pid,
// uts and cgroup) on the spec according to the container's HostConfig modes.
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		// userNS records whether a private user namespace was set up; the
		// net/ipc/pid sections below must join the target's user namespace
		// too when sharing a namespace with another container.
		userNS := false
		// user
		if c.HostConfig.UsernsMode.IsPrivate() {
			uidMap := daemon.idMapping.UIDMaps
			if uidMap != nil {
				userNS = true
				ns := specs.LinuxNamespace{Type: "user"}
				setNamespace(s, ns)
				s.Linux.UIDMappings = specMapping(uidMap)
				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDMaps)
			}
		}
		// network
		if !c.Config.NetworkDisabled {
			ns := specs.LinuxNamespace{Type: "network"}
			// NetworkMode may be of the form "container:<id>".
			parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
			if parts[0] == "container" {
				nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
				if err != nil {
					return err
				}
				ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
				if userNS {
					// to share a net namespace, they must also share a user namespace
					nsUser := specs.LinuxNamespace{Type: "user"}
					nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
					setNamespace(s, nsUser)
				}
			} else if c.HostConfig.NetworkMode.IsHost() {
				ns.Path = c.NetworkSettings.SandboxKey
			}
			setNamespace(s, ns)
		}

		// ipc
		ipcMode := c.HostConfig.IpcMode
		if !ipcMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid IPC mode: %v", ipcMode))
		}
		switch {
		case ipcMode.IsContainer():
			// Join another container's IPC namespace by path.
			ns := specs.LinuxNamespace{Type: "ipc"}
			ic, err := daemon.getIpcContainer(ipcMode.Container())
			if err != nil {
				return errdefs.InvalidParameter(errors.Wrapf(err, "invalid IPC mode: %v", ipcMode))
			}
			ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
			setNamespace(s, ns)
			if userNS {
				// to share an IPC namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{Type: "user"}
				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
				setNamespace(s, nsUser)
			}
		case ipcMode.IsHost():
			oci.RemoveNamespace(s, "ipc")
		case ipcMode.IsEmpty():
			// A container was created by an older version of the daemon.
			// The default behavior used to be what is now called "shareable".
			fallthrough
		case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
			ns := specs.LinuxNamespace{Type: "ipc"}
			setNamespace(s, ns)
		}

		// pid
		if !c.HostConfig.PidMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid PID mode: %v", c.HostConfig.PidMode))
		}
		if c.HostConfig.PidMode.IsContainer() {
			// Join another container's PID namespace by path.
			pc, err := daemon.getPidContainer(c)
			if err != nil {
				return err
			}
			ns := specs.LinuxNamespace{
				Type: "pid",
				Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
			}
			setNamespace(s, ns)
			if userNS {
				// to share a PID namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{
					Type: "user",
					Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
				}
				setNamespace(s, nsUser)
			}
		} else if c.HostConfig.PidMode.IsHost() {
			oci.RemoveNamespace(s, "pid")
		} else {
			// Default: private PID namespace.
			ns := specs.LinuxNamespace{Type: "pid"}
			setNamespace(s, ns)
		}
		// uts
		if !c.HostConfig.UTSMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid UTS mode: %v", c.HostConfig.UTSMode))
		}
		if c.HostConfig.UTSMode.IsHost() {
			// Host UTS: drop the namespace and do not override the hostname.
			oci.RemoveNamespace(s, "uts")
			s.Hostname = ""
		}

		// cgroup
		if !c.HostConfig.CgroupnsMode.Valid() {
			return errdefs.InvalidParameter(errors.Errorf("invalid cgroup namespace mode: %v", c.HostConfig.CgroupnsMode))
		}
		if !c.HostConfig.CgroupnsMode.IsEmpty() {
			if c.HostConfig.CgroupnsMode.IsPrivate() {
				nsCgroup := specs.LinuxNamespace{Type: "cgroup"}
				setNamespace(s, nsCgroup)
			}
		}

		return nil
	}
}
   345  
   346  func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
   347  	var ids []specs.LinuxIDMapping
   348  	for _, item := range s {
   349  		ids = append(ids, specs.LinuxIDMapping{
   350  			HostID:      uint32(item.HostID),
   351  			ContainerID: uint32(item.ContainerID),
   352  			Size:        uint32(item.Size),
   353  		})
   354  	}
   355  	return ids
   356  }
   357  
   358  // Get the source mount point of directory passed in as argument. Also return
   359  // optional fields.
   360  func getSourceMount(source string) (string, string, error) {
   361  	// Ensure any symlinks are resolved.
   362  	sourcePath, err := filepath.EvalSymlinks(source)
   363  	if err != nil {
   364  		return "", "", err
   365  	}
   366  
   367  	mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
   368  	if err != nil {
   369  		return "", "", err
   370  	}
   371  	if len(mi) < 1 {
   372  		return "", "", fmt.Errorf("Can't find mount point of %s", source)
   373  	}
   374  
   375  	// find the longest mount point
   376  	var idx, maxlen int
   377  	for i := range mi {
   378  		if len(mi[i].Mountpoint) > maxlen {
   379  			maxlen = len(mi[i].Mountpoint)
   380  			idx = i
   381  		}
   382  	}
   383  	return mi[idx].Mountpoint, mi[idx].Optional, nil
   384  }
   385  
// Optional-field prefixes from /proc/self/mountinfo: "shared:" marks a
// shared mount, "master:" marks a slave mount.
const (
	sharedPropagationOption = "shared:"
	slavePropagationOption  = "master:"
)
   390  
   391  // hasMountInfoOption checks if any of the passed any of the given option values
   392  // are set in the passed in option string.
   393  func hasMountInfoOption(opts string, vals ...string) bool {
   394  	for _, opt := range strings.Split(opts, " ") {
   395  		for _, val := range vals {
   396  			if strings.HasPrefix(opt, val) {
   397  				return true
   398  			}
   399  		}
   400  	}
   401  	return false
   402  }
   403  
   404  // Ensure mount point on which path is mounted, is shared.
   405  func ensureShared(path string) error {
   406  	sourceMount, optionalOpts, err := getSourceMount(path)
   407  	if err != nil {
   408  		return err
   409  	}
   410  	// Make sure source mount point is shared.
   411  	if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
   412  		return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
   413  	}
   414  	return nil
   415  }
   416  
   417  // Ensure mount point on which path is mounted, is either shared or slave.
   418  func ensureSharedOrSlave(path string) error {
   419  	sourceMount, optionalOpts, err := getSourceMount(path)
   420  	if err != nil {
   421  		return err
   422  	}
   423  
   424  	if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
   425  		return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
   426  	}
   427  	return nil
   428  }
   429  
   430  // Get the set of mount flags that are set on the mount that contains the given
   431  // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
   432  // bind-mounting "with options" will not fail with user namespaces, due to
   433  // kernel restrictions that require user namespace mounts to preserve
   434  // CL_UNPRIVILEGED locked flags.
   435  func getUnprivilegedMountFlags(path string) ([]string, error) {
   436  	var statfs unix.Statfs_t
   437  	if err := unix.Statfs(path, &statfs); err != nil {
   438  		return nil, err
   439  	}
   440  
   441  	// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
   442  	unprivilegedFlags := map[uint64]string{
   443  		unix.MS_RDONLY:     "ro",
   444  		unix.MS_NODEV:      "nodev",
   445  		unix.MS_NOEXEC:     "noexec",
   446  		unix.MS_NOSUID:     "nosuid",
   447  		unix.MS_NOATIME:    "noatime",
   448  		unix.MS_RELATIME:   "relatime",
   449  		unix.MS_NODIRATIME: "nodiratime",
   450  	}
   451  
   452  	var flags []string
   453  	for mask, flag := range unprivilegedFlags {
   454  		if uint64(statfs.Flags)&mask == mask {
   455  			flags = append(flags, flag)
   456  		}
   457  	}
   458  
   459  	return flags, nil
   460  }
   461  
var (
	// mountPropagationMap maps propagation mode names (as used in mount
	// specs and the OCI RootfsPropagation field) to moby/sys/mount flags.
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}

	// mountPropagationReverseMap is the inverse of mountPropagationMap.
	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)
   481  
   482  // inSlice tests whether a string is contained in a slice of strings or not.
   483  // Comparison is case sensitive
   484  func inSlice(slice []string, s string) bool {
   485  	for _, ss := range slice {
   486  		if s == ss {
   487  			return true
   488  		}
   489  	}
   490  	return false
   491  }
   492  
// WithMounts sets the container's mounts: it gathers daemon/container mounts
// (volumes, ipc, tmpfs, secrets), merges them with the spec's default mounts,
// translates them into OCI mount entries with the right bind/propagation
// options, and applies read-only and cgroup adjustments.
func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
		if err := daemon.setupContainerMountsRoot(c); err != nil {
			return err
		}

		if err := daemon.setupIpcDirs(c); err != nil {
			return err
		}

		// err is a named return so this defer cleans up the secret dir on
		// any failure below.
		defer func() {
			if err != nil {
				daemon.cleanupSecretDir(c)
			}
		}()

		if err := daemon.setupSecretDir(c); err != nil {
			return err
		}

		ms, err := daemon.setupMounts(c)
		if err != nil {
			return err
		}

		// IPC mounts are only added for shareable/container modes.
		if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
			ms = append(ms, c.IpcMounts()...)
		}

		tmpfsMounts, err := c.TmpfsMounts()
		if err != nil {
			return err
		}
		ms = append(ms, tmpfsMounts...)

		secretMounts, err := c.SecretMounts()
		if err != nil {
			return err
		}
		ms = append(ms, secretMounts...)

		sort.Sort(mounts(ms))

		mounts := ms

		// Index user-supplied mounts by destination for override checks below.
		userMounts := make(map[string]struct{})
		for _, m := range mounts {
			userMounts[m.Destination] = struct{}{}
		}

		// Copy all mounts from spec to defaultMounts, except for
		//  - mounts overridden by a user supplied mount;
		//  - all mounts under /dev if a user supplied /dev is present;
		//  - /dev/shm, in case IpcMode is none.
		// While at it, also
		//  - set size for /dev/shm from shmsize.
		defaultMounts := s.Mounts[:0]
		_, mountDev := userMounts["/dev"]
		for _, m := range s.Mounts {
			if _, ok := userMounts[m.Destination]; ok {
				// filter out mount overridden by a user supplied mount
				continue
			}
			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
				// filter out everything under /dev if /dev is user-mounted
				continue
			}

			if m.Destination == "/dev/shm" {
				if c.HostConfig.IpcMode.IsNone() {
					// filter out /dev/shm for "none" IpcMode
					continue
				}
				// set size for /dev/shm mount from spec
				sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
				m.Options = append(m.Options, sizeOpt)
			}

			defaultMounts = append(defaultMounts, m)
		}

		s.Mounts = defaultMounts
		for _, m := range mounts {
			if m.Source == "tmpfs" {
				// tmpfs mount: merge default options with user-supplied data.
				data := m.Data
				parser := volumemounts.NewParser()
				options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
				if data != "" {
					options = append(options, strings.Split(data, ",")...)
				}

				merged, err := mount.MergeTmpfsOptions(options)
				if err != nil {
					return err
				}

				s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
				continue
			}

			mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}

			// Determine property of RootPropagation based on volume
			// properties. If a volume is shared, then keep root propagation
			// shared. This should work for slave and private volumes too.
			//
			// For slave volumes, it can be either [r]shared/[r]slave.
			//
			// For private volumes any root propagation value should work.
			pFlag := mountPropagationMap[m.Propagation]
			switch pFlag {
			case mount.SHARED, mount.RSHARED:
				if err := ensureShared(m.Source); err != nil {
					return err
				}
				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
				if rootpg != mount.SHARED && rootpg != mount.RSHARED {
					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
				}
			case mount.SLAVE, mount.RSLAVE:
				var fallback bool
				if err := ensureSharedOrSlave(m.Source); err != nil {
					// For backwards compatibility purposes, treat mounts from the daemon root
					// as special since we automatically add rslave propagation to these mounts
					// when the user did not set anything, so we should fallback to the old
					// behavior which is to use private propagation which is normally the
					// default.
					if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
						return err
					}

					cm, ok := c.MountPoints[m.Destination]
					if !ok {
						return err
					}
					if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
						// This means the user explicitly set a propagation, do not fallback in that case.
						return err
					}
					fallback = true
					logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
				}
				if !fallback {
					rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
					if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
						s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
					}
				}
			}

			bindMode := "rbind"
			if m.NonRecursive {
				bindMode = "bind"
			}
			opts := []string{bindMode}
			if !m.Writable {
				opts = append(opts, "ro")
			}
			if pFlag != 0 {
				opts = append(opts, mountPropagationReverseMap[pFlag])
			}

			// If we are using user namespaces, then we must make sure that we
			// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
			// "mount" when we bind-mount. The reason for this is that at the point
			// when runc sets up the root filesystem, it is already inside a user
			// namespace, and thus cannot change any flags that are locked.
			if daemon.configStore.RemappedRoot != "" || userns.RunningInUserNS() {
				unprivOpts, err := getUnprivilegedMountFlags(m.Source)
				if err != nil {
					return err
				}
				opts = append(opts, unprivOpts...)
			}

			mt.Options = opts
			s.Mounts = append(s.Mounts, mt)
		}

		if s.Root.Readonly {
			// For a read-only rootfs, make non-user default mounts read-only
			// too, except for the special filesystems listed below.
			for i, m := range s.Mounts {
				switch m.Destination {
				case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
					continue
				}
				if _, ok := userMounts[m.Destination]; !ok {
					if !inSlice(m.Options, "ro") {
						s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
					}
				}
			}
		}

		if c.HostConfig.Privileged {
			// clear readonly for /sys
			for i := range s.Mounts {
				if s.Mounts[i].Destination == "/sys" {
					clearReadOnly(&s.Mounts[i])
				}
			}
			s.Linux.ReadonlyPaths = nil
			s.Linux.MaskedPaths = nil
		}

		// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
		// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
		if uidMap := daemon.idMapping.UIDMaps; uidMap != nil || c.HostConfig.Privileged {
			for i, m := range s.Mounts {
				if m.Type == "cgroup" {
					clearReadOnly(&s.Mounts[i])
				}
			}
		}

		return nil
	}
}
   711  
   712  // sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually
   713  // exist, so do not add the default ones if running on an old kernel.
   714  func sysctlExists(s string) bool {
   715  	f := filepath.Join("/proc", "sys", strings.ReplaceAll(s, ".", "/"))
   716  	_, err := os.Stat(f)
   717  	return err == nil
   718  }
   719  
// WithCommonOptions sets common docker options: the rootfs, working
// directory, process args/env/tty, the optional docker-init entry point,
// hostname/domainname, and a few safe default network sysctls.
func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if c.BaseFS == nil {
			return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly nil")
		}
		linkedEnv, err := daemon.setupLinkedContainers(c)
		if err != nil {
			return err
		}
		s.Root = &specs.Root{
			Path:     c.BaseFS.Path(),
			Readonly: c.HostConfig.ReadonlyRootfs,
		}
		if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
			return err
		}
		cwd := c.Config.WorkingDir
		if len(cwd) == 0 {
			cwd = "/"
		}
		s.Process.Args = append([]string{c.Path}, c.Args...)

		// only add the custom init if it is specified and the container is running in its
		// own private pid namespace.  It does not make sense to add if it is running in the
		// host namespace or another container's pid namespace where we already have an init
		if c.HostConfig.PidMode.IsPrivate() {
			// Per-container Init wins; otherwise fall back to the daemon default.
			if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
				(c.HostConfig.Init == nil && daemon.configStore.Init) {
				s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
				path := daemon.configStore.InitPath
				if path == "" {
					path, err = exec.LookPath(dconfig.DefaultInitBinary)
					if err != nil {
						return err
					}
				}
				// Bind-mount the init binary read-only into the container.
				s.Mounts = append(s.Mounts, specs.Mount{
					Destination: inContainerInitPath,
					Type:        "bind",
					Source:      path,
					Options:     []string{"bind", "ro"},
				})
			}
		}
		s.Process.Cwd = cwd
		s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
		s.Process.Terminal = c.Config.Tty

		s.Hostname = c.Config.Hostname
		setLinuxDomainname(c, s)

		// Add default sysctls that are generally safe and useful; currently we
		// grant the capabilities to allow these anyway. You can override if
		// you want to restore the original behaviour.
		// We do not set network sysctls if network namespace is host, or if we are
		// joining an existing namespace, only if we create a new net namespace.
		if c.HostConfig.NetworkMode.IsPrivate() {
			// We cannot set up ping socket support in a user namespace
			userNS := daemon.configStore.RemappedRoot != "" && c.HostConfig.UsernsMode.IsPrivate()
			if !userNS && !userns.RunningInUserNS() && sysctlExists("net.ipv4.ping_group_range") {
				// allow unprivileged ICMP echo sockets without CAP_NET_RAW
				s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
			}
			// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
			if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
				s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
			}
		}

		return nil
	}
}
   793  
// WithCgroups sets the container's cgroups path (systemd "slice:prefix:id"
// form or a plain filesystem path) and, when CPU real-time limits are
// configured on the daemon, initializes the CPU RT controller hierarchy.
func WithCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var cgroupsPath string
		scopePrefix := "docker"
		parent := "/docker"
		useSystemd := UsingSystemd(daemon.configStore)
		if useSystemd {
			parent = "system.slice"
			if daemon.configStore.Rootless {
				parent = "user.slice"
			}
		}

		// Container-level parent overrides the daemon-level one.
		if c.HostConfig.CgroupParent != "" {
			parent = c.HostConfig.CgroupParent
		} else if daemon.configStore.CgroupParent != "" {
			parent = daemon.configStore.CgroupParent
		}

		if useSystemd {
			// systemd expects the "slice:prefix:name" form.
			cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
			logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
		} else {
			cgroupsPath = filepath.Join(parent, c.ID)
		}
		s.Linux.CgroupsPath = cgroupsPath

		// the rest is only needed for CPU RT controller

		if daemon.configStore.CPURealtimePeriod == 0 && daemon.configStore.CPURealtimeRuntime == 0 {
			return nil
		}

		p := cgroupsPath
		if useSystemd {
			initPath, err := cgroups.GetInitCgroup("cpu")
			if err != nil {
				return errors.Wrap(err, "unable to init CPU RT controller")
			}
			_, err = cgroups.GetOwnCgroup("cpu")
			if err != nil {
				return errors.Wrap(err, "unable to init CPU RT controller")
			}
			p = filepath.Join(initPath, s.Linux.CgroupsPath)
		}

		// Clean path to guard against things like ../../../BAD
		parentPath := filepath.Dir(p)
		if !filepath.IsAbs(parentPath) {
			parentPath = filepath.Clean("/" + parentPath)
		}

		mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu")
		if err != nil {
			return errors.Wrap(err, "unable to init CPU RT controller")
		}
		// When docker is run inside docker, the root is based of the host cgroup.
		// Should this be handled in runc/libcontainer/cgroups ?
		if strings.HasPrefix(root, "/docker/") {
			root = "/"
		}
		mnt = filepath.Join(mnt, root)

		if err := daemon.initCPURtController(mnt, parentPath); err != nil {
			return errors.Wrap(err, "unable to init CPU RT controller")
		}
		return nil
	}
}
   864  
   865  // WithDevices sets the container's devices
   866  func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
   867  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   868  		// Build lists of devices allowed and created within the container.
   869  		var devs []specs.LinuxDevice
   870  		devPermissions := s.Linux.Resources.Devices
   871  
   872  		if c.HostConfig.Privileged {
   873  			hostDevices, err := coci.HostDevices()
   874  			if err != nil {
   875  				return err
   876  			}
   877  			devs = append(devs, hostDevices...)
   878  
   879  			// adding device mappings in privileged containers
   880  			for _, deviceMapping := range c.HostConfig.Devices {
   881  				// issue a warning that custom cgroup permissions are ignored in privileged mode
   882  				if deviceMapping.CgroupPermissions != "rwm" {
   883  					logrus.WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
   884  				}
   885  				// issue a warning that the device path already exists via /dev mounting in privileged mode
   886  				if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
   887  					logrus.WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
   888  					continue
   889  				}
   890  				d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
   891  				if err != nil {
   892  					return err
   893  				}
   894  				devs = append(devs, d...)
   895  			}
   896  
   897  			devPermissions = []specs.LinuxDeviceCgroup{
   898  				{
   899  					Allow:  true,
   900  					Access: "rwm",
   901  				},
   902  			}
   903  		} else {
   904  			for _, deviceMapping := range c.HostConfig.Devices {
   905  				d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
   906  				if err != nil {
   907  					return err
   908  				}
   909  				devs = append(devs, d...)
   910  				devPermissions = append(devPermissions, dPermissions...)
   911  			}
   912  
   913  			var err error
   914  			devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
   915  			if err != nil {
   916  				return err
   917  			}
   918  		}
   919  
   920  		s.Linux.Devices = append(s.Linux.Devices, devs...)
   921  		s.Linux.Resources.Devices = devPermissions
   922  
   923  		for _, req := range c.HostConfig.DeviceRequests {
   924  			if err := daemon.handleDevice(req, s); err != nil {
   925  				return err
   926  			}
   927  		}
   928  		return nil
   929  	}
   930  }
   931  
   932  // WithResources applies the container resources
   933  func WithResources(c *container.Container) coci.SpecOpts {
   934  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   935  		r := c.HostConfig.Resources
   936  		weightDevices, err := getBlkioWeightDevices(r)
   937  		if err != nil {
   938  			return err
   939  		}
   940  		readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
   941  		if err != nil {
   942  			return err
   943  		}
   944  		writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
   945  		if err != nil {
   946  			return err
   947  		}
   948  		readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
   949  		if err != nil {
   950  			return err
   951  		}
   952  		writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
   953  		if err != nil {
   954  			return err
   955  		}
   956  
   957  		memoryRes := getMemoryResources(r)
   958  		cpuRes, err := getCPUResources(r)
   959  		if err != nil {
   960  			return err
   961  		}
   962  		blkioWeight := r.BlkioWeight
   963  
   964  		specResources := &specs.LinuxResources{
   965  			Memory: memoryRes,
   966  			CPU:    cpuRes,
   967  			BlockIO: &specs.LinuxBlockIO{
   968  				Weight:                  &blkioWeight,
   969  				WeightDevice:            weightDevices,
   970  				ThrottleReadBpsDevice:   readBpsDevice,
   971  				ThrottleWriteBpsDevice:  writeBpsDevice,
   972  				ThrottleReadIOPSDevice:  readIOpsDevice,
   973  				ThrottleWriteIOPSDevice: writeIOpsDevice,
   974  			},
   975  			Pids: getPidsLimit(r),
   976  		}
   977  
   978  		if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
   979  			specResources.Devices = s.Linux.Resources.Devices
   980  		}
   981  
   982  		s.Linux.Resources = specResources
   983  		return nil
   984  	}
   985  }
   986  
   987  // WithSysctls sets the container's sysctls
   988  func WithSysctls(c *container.Container) coci.SpecOpts {
   989  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   990  		// We merge the sysctls injected above with the HostConfig (latter takes
   991  		// precedence for backwards-compatibility reasons).
   992  		for k, v := range c.HostConfig.Sysctls {
   993  			s.Linux.Sysctl[k] = v
   994  		}
   995  		return nil
   996  	}
   997  }
   998  
   999  // WithUser sets the container's user
  1000  func WithUser(c *container.Container) coci.SpecOpts {
  1001  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  1002  		var err error
  1003  		s.Process.User, err = getUser(c, c.Config.User)
  1004  		return err
  1005  	}
  1006  }
  1007  
  1008  func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, err error) {
  1009  	var (
  1010  		opts []coci.SpecOpts
  1011  		s    = oci.DefaultSpec()
  1012  	)
  1013  	opts = append(opts,
  1014  		WithCommonOptions(daemon, c),
  1015  		WithCgroups(daemon, c),
  1016  		WithResources(c),
  1017  		WithSysctls(c),
  1018  		WithDevices(daemon, c),
  1019  		WithUser(c),
  1020  		WithRlimits(daemon, c),
  1021  		WithNamespaces(daemon, c),
  1022  		WithCapabilities(c),
  1023  		WithSeccomp(daemon, c),
  1024  		WithMounts(daemon, c),
  1025  		WithLibnetwork(daemon, c),
  1026  		WithApparmor(c),
  1027  		WithSelinux(c),
  1028  		WithOOMScore(&c.HostConfig.OomScoreAdj),
  1029  	)
  1030  	if c.NoNewPrivileges {
  1031  		opts = append(opts, coci.WithNoNewPrivileges)
  1032  	}
  1033  	if c.Config.Tty {
  1034  		opts = append(opts, WithConsoleSize(c))
  1035  	}
  1036  	// Set the masked and readonly paths with regard to the host config options if they are set.
  1037  	if c.HostConfig.MaskedPaths != nil {
  1038  		opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
  1039  	}
  1040  	if c.HostConfig.ReadonlyPaths != nil {
  1041  		opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
  1042  	}
  1043  	if daemon.configStore.Rootless {
  1044  		opts = append(opts, WithRootless(daemon))
  1045  	}
  1046  	return &s, coci.ApplyOpts(context.Background(), nil, &containers.Container{
  1047  		ID: c.ID,
  1048  	}, &s, opts...)
  1049  }
  1050  
  1051  func clearReadOnly(m *specs.Mount) {
  1052  	var opt []string
  1053  	for _, o := range m.Options {
  1054  		if o != "ro" {
  1055  			opt = append(opt, o)
  1056  		}
  1057  	}
  1058  	m.Options = opt
  1059  }
  1060  
  1061  // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
  1062  func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
  1063  	ulimits := c.Ulimits
  1064  	// Merge ulimits with daemon defaults
  1065  	ulIdx := make(map[string]struct{})
  1066  	for _, ul := range ulimits {
  1067  		ulIdx[ul.Name] = struct{}{}
  1068  	}
  1069  	for name, ul := range daemon.configStore.Ulimits {
  1070  		if _, exists := ulIdx[name]; !exists {
  1071  			ulimits = append(ulimits, ul)
  1072  		}
  1073  	}
  1074  	c.Ulimits = ulimits
  1075  }