github.com/jfrazelle/docker@v1.1.2-0.20210712172922-bf78e25fe508/daemon/oci_linux.go

     1  package daemon // import "github.com/docker/docker/daemon"
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"io/ioutil"
     7  	"os"
     8  	"os/exec"
     9  	"path/filepath"
    10  	"sort"
    11  	"strconv"
    12  	"strings"
    13  
    14  	cdcgroups "github.com/containerd/cgroups"
    15  	"github.com/containerd/containerd/containers"
    16  	coci "github.com/containerd/containerd/oci"
    17  	"github.com/containerd/containerd/pkg/apparmor"
    18  	"github.com/containerd/containerd/pkg/userns"
    19  	containertypes "github.com/docker/docker/api/types/container"
    20  	"github.com/docker/docker/container"
    21  	daemonconfig "github.com/docker/docker/daemon/config"
    22  	"github.com/docker/docker/oci"
    23  	"github.com/docker/docker/oci/caps"
    24  	"github.com/docker/docker/pkg/idtools"
    25  	"github.com/docker/docker/pkg/stringid"
    26  	"github.com/docker/docker/rootless/specconv"
    27  	volumemounts "github.com/docker/docker/volume/mounts"
    28  	"github.com/moby/sys/mount"
    29  	"github.com/moby/sys/mountinfo"
    30  	"github.com/opencontainers/runc/libcontainer/cgroups"
    31  	"github.com/opencontainers/runc/libcontainer/devices"
    32  	"github.com/opencontainers/runc/libcontainer/user"
    33  	specs "github.com/opencontainers/runtime-spec/specs-go"
    34  	"github.com/pkg/errors"
    35  	"github.com/sirupsen/logrus"
    36  	"golang.org/x/sys/unix"
    37  )
    38  
    39  const inContainerInitPath = "/sbin/" + daemonconfig.DefaultInitBinary
    40  
    41  // WithRlimits sets the container's rlimits, merging them with the daemon's default rlimits
    42  func WithRlimits(daemon *Daemon, c *container.Container) coci.SpecOpts {
    43  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    44  		var rlimits []specs.POSIXRlimit
    45  
    46  		// We want to leave the original HostConfig alone, so make a copy here
    47  		hostConfig := *c.HostConfig
    48  		// Merge with the daemon defaults
    49  		daemon.mergeUlimits(&hostConfig)
    50  		for _, ul := range hostConfig.Ulimits {
    51  			rlimits = append(rlimits, specs.POSIXRlimit{
    52  				Type: "RLIMIT_" + strings.ToUpper(ul.Name),
    53  				Soft: uint64(ul.Soft),
    54  				Hard: uint64(ul.Hard),
    55  			})
    56  		}
    57  
    58  		s.Process.Rlimits = rlimits
    59  		return nil
    60  	}
    61  }
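
// Illustrative sketch (not part of the upstream file): how a single
// "--ulimit nofile=1024:2048" entry is translated by the loop above. The
// limit name is upper-cased and prefixed with "RLIMIT_" to match the kernel
// resource names; the exampleRlimitMapping helper is hypothetical.
func exampleRlimitMapping() specs.POSIXRlimit {
	name, soft, hard := "nofile", int64(1024), int64(2048)
	return specs.POSIXRlimit{
		Type: "RLIMIT_" + strings.ToUpper(name), // "RLIMIT_NOFILE"
		Soft: uint64(soft),
		Hard: uint64(hard),
	}
}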
    62  
    63  // WithLibnetwork sets the libnetwork hook
    64  func WithLibnetwork(daemon *Daemon, c *container.Container) coci.SpecOpts {
    65  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    66  		if s.Hooks == nil {
    67  			s.Hooks = &specs.Hooks{}
    68  		}
    69  		for _, ns := range s.Linux.Namespaces {
    70  			if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
    71  				target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
    72  				shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
    73  				s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
    74  					Path: target,
    75  					Args: []string{
    76  						"libnetwork-setkey",
    77  						"-exec-root=" + daemon.configStore.GetExecRoot(),
    78  						c.ID,
    79  						shortNetCtlrID,
    80  					},
    81  				})
    82  			}
    83  		}
    84  		return nil
    85  	}
    86  }
    87  
    88  // WithRootless sets the spec to the rootless configuration
    89  func WithRootless(daemon *Daemon) coci.SpecOpts {
    90  	return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    91  		var v2Controllers []string
    92  		if daemon.getCgroupDriver() == cgroupSystemdDriver {
    93  			if cdcgroups.Mode() != cdcgroups.Unified {
    94  				return errors.New("rootless systemd driver doesn't support cgroup v1")
    95  			}
    96  			rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
    97  			if rootlesskitParentEUID == "" {
    98  				return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
    99  			}
   100  			euid, err := strconv.Atoi(rootlesskitParentEUID)
   101  			if err != nil {
   102  				return errors.Wrap(err, "invalid $ROOTLESSKIT_PARENT_EUID: must be a numeric value")
   103  			}
   104  			controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%d.slice/cgroup.controllers", euid)
   105  			controllersFile, err := ioutil.ReadFile(controllersPath)
   106  			if err != nil {
   107  				return err
   108  			}
   109  			v2Controllers = strings.Fields(string(controllersFile))
   110  		}
   111  		return specconv.ToRootless(s, v2Controllers)
   112  	}
   113  }
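
// Illustrative sketch (assumed file contents, not upstream code): the
// cgroup.controllers file read above is a single space-separated line such as
// "cpuset cpu io memory pids", so strings.Fields yields the delegated
// controller names that specconv.ToRootless receives.
func exampleParseControllers() []string {
	controllersFile := []byte("cpuset cpu io memory pids\n")
	return strings.Fields(string(controllersFile)) // ["cpuset" "cpu" "io" "memory" "pids"]
}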
   114  
   115  // WithOOMScore sets the OOM score adjustment
   116  func WithOOMScore(score *int) coci.SpecOpts {
   117  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   118  		s.Process.OOMScoreAdj = score
   119  		return nil
   120  	}
   121  }
   122  
   123  // WithSelinux sets the SELinux labels
   124  func WithSelinux(c *container.Container) coci.SpecOpts {
   125  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   126  		s.Process.SelinuxLabel = c.GetProcessLabel()
   127  		s.Linux.MountLabel = c.MountLabel
   128  		return nil
   129  	}
   130  }
   131  
   132  // WithApparmor sets the AppArmor profile
   133  func WithApparmor(c *container.Container) coci.SpecOpts {
   134  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   135  		if apparmor.HostSupports() {
   136  			var appArmorProfile string
   137  			if c.AppArmorProfile != "" {
   138  				appArmorProfile = c.AppArmorProfile
   139  			} else if c.HostConfig.Privileged {
   140  				appArmorProfile = unconfinedAppArmorProfile
   141  			} else {
   142  				appArmorProfile = defaultAppArmorProfile
   143  			}
   144  
   145  			if appArmorProfile == defaultAppArmorProfile {
   146  				// Unattended upgrades and other fun services can unload AppArmor
   147  				// profiles inadvertently. Since we cannot store our profile in
   148  				// /etc/apparmor.d, nor can we practically add other ways of
   149  				// telling the system to keep our profile loaded, in order to make
   150  				// sure that we keep the default profile enabled we dynamically
   151  				// reload it if necessary.
   152  				if err := ensureDefaultAppArmorProfile(); err != nil {
   153  					return err
   154  				}
   155  			}
   156  			s.Process.ApparmorProfile = appArmorProfile
   157  		}
   158  		return nil
   159  	}
   160  }
   161  
   162  // WithCapabilities sets the container's capabilities
   163  func WithCapabilities(c *container.Container) coci.SpecOpts {
   164  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   165  		capabilities, err := caps.TweakCapabilities(
   166  			caps.DefaultCapabilities(),
   167  			c.HostConfig.CapAdd,
   168  			c.HostConfig.CapDrop,
   169  			c.HostConfig.Privileged,
   170  		)
   171  		if err != nil {
   172  			return err
   173  		}
   174  		return oci.SetCapabilities(s, capabilities)
   175  	}
   176  }
   177  
   178  func resourcePath(c *container.Container, getPath func() (string, error)) (string, error) {
   179  	p, err := getPath()
   180  	if err != nil {
   181  		return "", err
   182  	}
   183  	return c.GetResourcePath(p)
   184  }
   185  
   186  func getUser(c *container.Container, username string) (specs.User, error) {
   187  	var usr specs.User
   188  	passwdPath, err := resourcePath(c, user.GetPasswdPath)
   189  	if err != nil {
   190  		return usr, err
   191  	}
   192  	groupPath, err := resourcePath(c, user.GetGroupPath)
   193  	if err != nil {
   194  		return usr, err
   195  	}
   196  	execUser, err := user.GetExecUserPath(username, nil, passwdPath, groupPath)
   197  	if err != nil {
   198  		return usr, err
   199  	}
   200  	usr.UID = uint32(execUser.Uid)
   201  	usr.GID = uint32(execUser.Gid)
   202  
   203  	var addGroups []int
   204  	if len(c.HostConfig.GroupAdd) > 0 {
   205  		addGroups, err = user.GetAdditionalGroupsPath(c.HostConfig.GroupAdd, groupPath)
   206  		if err != nil {
   207  			return usr, err
   208  		}
   209  	}
   210  	for _, g := range append(execUser.Sgids, addGroups...) {
   211  		usr.AdditionalGids = append(usr.AdditionalGids, uint32(g))
   212  	}
   213  	return usr, nil
   214  }
   215  
   216  func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
   217  	for i, n := range s.Linux.Namespaces {
   218  		if n.Type == ns.Type {
   219  			s.Linux.Namespaces[i] = ns
   220  			return
   221  		}
   222  	}
   223  	s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
   224  }
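
// Illustrative sketch (hypothetical PID): setNamespace replaces an existing
// entry of the same type rather than appending a duplicate, so joining
// another container's network namespace overwrites the default "network"
// entry instead of adding a second one.
func exampleSetNamespace() []specs.LinuxNamespace {
	s := &specs.Spec{Linux: &specs.Linux{
		Namespaces: []specs.LinuxNamespace{{Type: "network"}},
	}}
	setNamespace(s, specs.LinuxNamespace{Type: "network", Path: "/proc/1234/ns/net"})
	return s.Linux.Namespaces // still a single entry, now carrying the explicit path
}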
   225  
   226  // WithNamespaces sets the container's namespaces
   227  func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
   228  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   229  		userNS := false
   230  		// user
   231  		if c.HostConfig.UsernsMode.IsPrivate() {
   232  			uidMap := daemon.idMapping.UIDs()
   233  			if uidMap != nil {
   234  				userNS = true
   235  				ns := specs.LinuxNamespace{Type: "user"}
   236  				setNamespace(s, ns)
   237  				s.Linux.UIDMappings = specMapping(uidMap)
   238  				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDs())
   239  			}
   240  		}
   241  		// network
   242  		if !c.Config.NetworkDisabled {
   243  			ns := specs.LinuxNamespace{Type: "network"}
   244  			parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
   245  			if parts[0] == "container" {
   246  				nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
   247  				if err != nil {
   248  					return err
   249  				}
   250  				ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
   251  				if userNS {
   252  					// to share a net namespace, they must also share a user namespace
   253  					nsUser := specs.LinuxNamespace{Type: "user"}
   254  					nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
   255  					setNamespace(s, nsUser)
   256  				}
   257  			} else if c.HostConfig.NetworkMode.IsHost() {
   258  				ns.Path = c.NetworkSettings.SandboxKey
   259  			}
   260  			setNamespace(s, ns)
   261  		}
   262  
   263  		// ipc
   264  		ipcMode := c.HostConfig.IpcMode
   265  		switch {
   266  		case ipcMode.IsContainer():
   267  			ns := specs.LinuxNamespace{Type: "ipc"}
   268  			ic, err := daemon.getIpcContainer(ipcMode.Container())
   269  			if err != nil {
   270  				return err
   271  			}
   272  			ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
   273  			setNamespace(s, ns)
   274  			if userNS {
   275  				// to share an IPC namespace, they must also share a user namespace
   276  				nsUser := specs.LinuxNamespace{Type: "user"}
   277  				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
   278  				setNamespace(s, nsUser)
   279  			}
   280  		case ipcMode.IsHost():
   281  			oci.RemoveNamespace(s, "ipc")
   282  		case ipcMode.IsEmpty():
   283  			// A container was created by an older version of the daemon.
   284  			// The default behavior used to be what is now called "shareable".
   285  			fallthrough
   286  		case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
   287  			ns := specs.LinuxNamespace{Type: "ipc"}
   288  			setNamespace(s, ns)
   289  		default:
   290  			return fmt.Errorf("Invalid IPC mode: %v", ipcMode)
   291  		}
   292  
   293  		// pid
   294  		if c.HostConfig.PidMode.IsContainer() {
   295  			pc, err := daemon.getPidContainer(c)
   296  			if err != nil {
   297  				return err
   298  			}
   299  			ns := specs.LinuxNamespace{
   300  				Type: "pid",
   301  				Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
   302  			}
   303  			setNamespace(s, ns)
   304  			if userNS {
   305  				// to share a PID namespace, they must also share a user namespace
   306  				nsUser := specs.LinuxNamespace{
   307  					Type: "user",
   308  					Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
   309  				}
   310  				setNamespace(s, nsUser)
   311  			}
   312  		} else if c.HostConfig.PidMode.IsHost() {
   313  			oci.RemoveNamespace(s, "pid")
   314  		} else {
   315  			ns := specs.LinuxNamespace{Type: "pid"}
   316  			setNamespace(s, ns)
   317  		}
   318  		// uts
   319  		if c.HostConfig.UTSMode.IsHost() {
   320  			oci.RemoveNamespace(s, "uts")
   321  			s.Hostname = ""
   322  		}
   323  
   324  		// cgroup
   325  		if !c.HostConfig.CgroupnsMode.IsEmpty() {
   326  			cgroupNsMode := c.HostConfig.CgroupnsMode
   327  			if !cgroupNsMode.Valid() {
   328  				return fmt.Errorf("invalid cgroup namespace mode: %v", cgroupNsMode)
   329  			}
   330  			if cgroupNsMode.IsPrivate() {
   331  				nsCgroup := specs.LinuxNamespace{Type: "cgroup"}
   332  				setNamespace(s, nsCgroup)
   333  			}
   334  		}
   335  
   336  		return nil
   337  	}
   338  }
   339  
   340  func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
   341  	var ids []specs.LinuxIDMapping
   342  	for _, item := range s {
   343  		ids = append(ids, specs.LinuxIDMapping{
   344  			HostID:      uint32(item.HostID),
   345  			ContainerID: uint32(item.ContainerID),
   346  			Size:        uint32(item.Size),
   347  		})
   348  	}
   349  	return ids
   350  }
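
// Illustrative sketch (assumed values): with userns-remap enabled, container
// root (UID 0) is typically mapped onto an unprivileged host range, e.g. a
// 65536-wide subordinate range starting at host ID 100000.
func exampleSpecMapping() []specs.LinuxIDMapping {
	return specMapping([]idtools.IDMap{
		{ContainerID: 0, HostID: 100000, Size: 65536},
	})
	// => [{HostID: 100000, ContainerID: 0, Size: 65536}]
}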
   351  
   352  // getSourceMount returns the source mount point of the directory passed in as
   353  // an argument, along with the mount's optional fields.
   354  func getSourceMount(source string) (string, string, error) {
   355  	// Ensure any symlinks are resolved.
   356  	sourcePath, err := filepath.EvalSymlinks(source)
   357  	if err != nil {
   358  		return "", "", err
   359  	}
   360  
   361  	mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
   362  	if err != nil {
   363  		return "", "", err
   364  	}
   365  	if len(mi) < 1 {
   366  		return "", "", fmt.Errorf("Can't find mount point of %s", source)
   367  	}
   368  
   369  	// find the longest mount point
   370  	var idx, maxlen int
   371  	for i := range mi {
   372  		if len(mi[i].Mountpoint) > maxlen {
   373  			maxlen = len(mi[i].Mountpoint)
   374  			idx = i
   375  		}
   376  	}
   377  	return mi[idx].Mountpoint, mi[idx].Optional, nil
   378  }
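
// Illustrative sketch (hypothetical mount table): ParentsFilter returns every
// mount that is a parent of the resolved path, so selecting the longest
// mount point picks the innermost one. For /var/lib/docker/volumes/foo the
// candidates could be "/" and "/var/lib/docker"; the latter wins.
func exampleLongestMountpoint() string {
	candidates := []string{"/", "/var/lib/docker"}
	var idx, maxlen int
	for i := range candidates {
		if len(candidates[i]) > maxlen {
			maxlen = len(candidates[i])
			idx = i
		}
	}
	return candidates[idx] // "/var/lib/docker"
}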
   379  
   380  const (
   381  	sharedPropagationOption = "shared:"
   382  	slavePropagationOption  = "master:"
   383  )
   384  
   385  // hasMountInfoOption checks if any of the given option values are set in the
   386  // passed-in options string.
   387  func hasMountInfoOption(opts string, vals ...string) bool {
   388  	for _, opt := range strings.Split(opts, " ") {
   389  		for _, val := range vals {
   390  			if strings.HasPrefix(opt, val) {
   391  				return true
   392  			}
   393  		}
   394  	}
   395  	return false
   396  }
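
// Illustrative sketch (not upstream code): the optional-fields column of
// /proc/self/mountinfo carries propagation information such as "shared:1" or
// "master:2", so a prefix match is enough to classify the mount.
func exampleHasMountInfoOption() (bool, bool) {
	optional := "shared:1"
	isShared := hasMountInfoOption(optional, sharedPropagationOption) // true
	isSlave := hasMountInfoOption(optional, slavePropagationOption)   // false
	return isShared, isSlave
}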
   397  
   398  // Ensure that the mount point on which path is mounted is shared.
   399  func ensureShared(path string) error {
   400  	sourceMount, optionalOpts, err := getSourceMount(path)
   401  	if err != nil {
   402  		return err
   403  	}
   404  	// Make sure source mount point is shared.
   405  	if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
   406  		return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
   407  	}
   408  	return nil
   409  }
   410  
   411  // Ensure that the mount point on which path is mounted is either shared or slave.
   412  func ensureSharedOrSlave(path string) error {
   413  	sourceMount, optionalOpts, err := getSourceMount(path)
   414  	if err != nil {
   415  		return err
   416  	}
   417  
   418  	if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
   419  		return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
   420  	}
   421  	return nil
   422  }
   423  
   424  // Get the set of mount flags that are set on the mount that contains the given
   425  // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
   426  // bind-mounting "with options" will not fail with user namespaces, due to
   427  // kernel restrictions that require user namespace mounts to preserve
   428  // CL_UNPRIVILEGED locked flags.
   429  func getUnprivilegedMountFlags(path string) ([]string, error) {
   430  	var statfs unix.Statfs_t
   431  	if err := unix.Statfs(path, &statfs); err != nil {
   432  		return nil, err
   433  	}
   434  
   435  	// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
   436  	unprivilegedFlags := map[uint64]string{
   437  		unix.MS_RDONLY:     "ro",
   438  		unix.MS_NODEV:      "nodev",
   439  		unix.MS_NOEXEC:     "noexec",
   440  		unix.MS_NOSUID:     "nosuid",
   441  		unix.MS_NOATIME:    "noatime",
   442  		unix.MS_RELATIME:   "relatime",
   443  		unix.MS_NODIRATIME: "nodiratime",
   444  	}
   445  
   446  	var flags []string
   447  	for mask, flag := range unprivilegedFlags {
   448  		if uint64(statfs.Flags)&mask == mask {
   449  			flags = append(flags, flag)
   450  		}
   451  	}
   452  
   453  	return flags, nil
   454  }
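
// Illustrative sketch (assumed statfs flags): if the source filesystem is
// mounted nosuid,nodev,noexec, those options must be propagated to the bind
// mount so the CL_UNPRIVILEGED-locked flags are preserved inside the user
// namespace.
func exampleUnprivilegedFlags() []string {
	statfsFlags := uint64(unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC)
	var flags []string
	for mask, flag := range map[uint64]string{
		unix.MS_NODEV:  "nodev",
		unix.MS_NOEXEC: "noexec",
		unix.MS_NOSUID: "nosuid",
	} {
		if statfsFlags&mask == mask {
			flags = append(flags, flag)
		}
	}
	return flags // "nodev", "noexec", "nosuid" (map iteration order is not stable)
}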
   455  
   456  var (
   457  	mountPropagationMap = map[string]int{
   458  		"private":  mount.PRIVATE,
   459  		"rprivate": mount.RPRIVATE,
   460  		"shared":   mount.SHARED,
   461  		"rshared":  mount.RSHARED,
   462  		"slave":    mount.SLAVE,
   463  		"rslave":   mount.RSLAVE,
   464  	}
   465  
   466  	mountPropagationReverseMap = map[int]string{
   467  		mount.PRIVATE:  "private",
   468  		mount.RPRIVATE: "rprivate",
   469  		mount.SHARED:   "shared",
   470  		mount.RSHARED:  "rshared",
   471  		mount.SLAVE:    "slave",
   472  		mount.RSLAVE:   "rslave",
   473  	}
   474  )
   475  
   476  // inSlice tests whether a string is contained in a slice of strings.
   477  // The comparison is case-sensitive.
   478  func inSlice(slice []string, s string) bool {
   479  	for _, ss := range slice {
   480  		if s == ss {
   481  			return true
   482  		}
   483  	}
   484  	return false
   485  }
   486  
   487  // WithMounts sets the container's mounts
   488  func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
   489  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
   490  		if err := daemon.setupContainerMountsRoot(c); err != nil {
   491  			return err
   492  		}
   493  
   494  		if err := daemon.setupIpcDirs(c); err != nil {
   495  			return err
   496  		}
   497  
   498  		defer func() {
   499  			if err != nil {
   500  				daemon.cleanupSecretDir(c)
   501  			}
   502  		}()
   503  
   504  		if err := daemon.setupSecretDir(c); err != nil {
   505  			return err
   506  		}
   507  
   508  		ms, err := daemon.setupMounts(c)
   509  		if err != nil {
   510  			return err
   511  		}
   512  
   513  		if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
   514  			ms = append(ms, c.IpcMounts()...)
   515  		}
   516  
   517  		tmpfsMounts, err := c.TmpfsMounts()
   518  		if err != nil {
   519  			return err
   520  		}
   521  		ms = append(ms, tmpfsMounts...)
   522  
   523  		secretMounts, err := c.SecretMounts()
   524  		if err != nil {
   525  			return err
   526  		}
   527  		ms = append(ms, secretMounts...)
   528  
   529  		sort.Sort(mounts(ms))
   530  
   531  		mounts := ms
   532  
   533  		userMounts := make(map[string]struct{})
   534  		for _, m := range mounts {
   535  			userMounts[m.Destination] = struct{}{}
   536  		}
   537  
   538  		// Copy all mounts from spec to defaultMounts, except for
   539  		//  - mounts overridden by a user supplied mount;
   540  		//  - all mounts under /dev if a user supplied /dev is present;
   541  		//  - /dev/shm, in case IpcMode is none.
   542  		// While at it, also
   543  		//  - set size for /dev/shm from shmsize.
   544  		defaultMounts := s.Mounts[:0]
   545  		_, mountDev := userMounts["/dev"]
   546  		for _, m := range s.Mounts {
   547  			if _, ok := userMounts[m.Destination]; ok {
   548  				// filter out mount overridden by a user supplied mount
   549  				continue
   550  			}
   551  			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
   552  				// filter out everything under /dev if /dev is user-mounted
   553  				continue
   554  			}
   555  
   556  			if m.Destination == "/dev/shm" {
   557  				if c.HostConfig.IpcMode.IsNone() {
   558  					// filter out /dev/shm for "none" IpcMode
   559  					continue
   560  				}
   561  				// set size for /dev/shm mount from spec
   562  				sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
   563  				m.Options = append(m.Options, sizeOpt)
   564  			}
   565  
   566  			defaultMounts = append(defaultMounts, m)
   567  		}
   568  
   569  		s.Mounts = defaultMounts
   570  		for _, m := range mounts {
   571  			if m.Source == "tmpfs" {
   572  				data := m.Data
   573  				parser := volumemounts.NewParser("linux")
   574  				options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
   575  				if data != "" {
   576  					options = append(options, strings.Split(data, ",")...)
   577  				}
   578  
   579  				merged, err := mount.MergeTmpfsOptions(options)
   580  				if err != nil {
   581  					return err
   582  				}
   583  
   584  				s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
   585  				continue
   586  			}
   587  
   588  			mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}
   589  
   590  			// Determine property of RootPropagation based on volume
   591  			// properties. If a volume is shared, then keep root propagation
   592  			// shared. This should work for slave and private volumes too.
   593  			//
   594  			// For slave volumes, it can be either [r]shared/[r]slave.
   595  			//
   596  			// For private volumes any root propagation value should work.
   597  			pFlag := mountPropagationMap[m.Propagation]
   598  			switch pFlag {
   599  			case mount.SHARED, mount.RSHARED:
   600  				if err := ensureShared(m.Source); err != nil {
   601  					return err
   602  				}
   603  				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
   604  				if rootpg != mount.SHARED && rootpg != mount.RSHARED {
   605  					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
   606  				}
   607  			case mount.SLAVE, mount.RSLAVE:
   608  				var fallback bool
   609  				if err := ensureSharedOrSlave(m.Source); err != nil {
   610  					// For backwards-compatibility purposes, treat mounts from the daemon root
   611  					// as special: we automatically add rslave propagation to these mounts
   612  					// when the user did not set anything, so fall back to the old
   613  					// behavior of using private propagation, which is normally the
   614  					// default.
   615  					if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
   616  						return err
   617  					}
   618  
   619  					cm, ok := c.MountPoints[m.Destination]
   620  					if !ok {
   621  						return err
   622  					}
   623  					if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
   624  						// The user explicitly set a propagation; do not fall back in that case.
   625  						return err
   626  					}
   627  					fallback = true
   628  					logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
   629  				}
   630  				if !fallback {
   631  					rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
   632  					if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
   633  						s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
   634  					}
   635  				}
   636  			}
   637  
   638  			bindMode := "rbind"
   639  			if m.NonRecursive {
   640  				bindMode = "bind"
   641  			}
   642  			opts := []string{bindMode}
   643  			if !m.Writable {
   644  				opts = append(opts, "ro")
   645  			}
   646  			if pFlag != 0 {
   647  				opts = append(opts, mountPropagationReverseMap[pFlag])
   648  			}
   649  
   650  			// If we are using user namespaces, then we must make sure that we
   651  			// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
   652  			// "mount" when we bind-mount. The reason for this is that at the point
   653  			// when runc sets up the root filesystem, it is already inside a user
   654  			// namespace, and thus cannot change any flags that are locked.
   655  			if daemon.configStore.RemappedRoot != "" || userns.RunningInUserNS() {
   656  				unprivOpts, err := getUnprivilegedMountFlags(m.Source)
   657  				if err != nil {
   658  					return err
   659  				}
   660  				opts = append(opts, unprivOpts...)
   661  			}
   662  
   663  			mt.Options = opts
   664  			s.Mounts = append(s.Mounts, mt)
   665  		}
   666  
   667  		if s.Root.Readonly {
   668  			for i, m := range s.Mounts {
   669  				switch m.Destination {
   670  				case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
   671  					continue
   672  				}
   673  				if _, ok := userMounts[m.Destination]; !ok {
   674  					if !inSlice(m.Options, "ro") {
   675  						s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
   676  					}
   677  				}
   678  			}
   679  		}
   680  
   681  		if c.HostConfig.Privileged {
   682  			// clear readonly for /sys
   683  			for i := range s.Mounts {
   684  				if s.Mounts[i].Destination == "/sys" {
   685  					clearReadOnly(&s.Mounts[i])
   686  				}
   687  			}
   688  			s.Linux.ReadonlyPaths = nil
   689  			s.Linux.MaskedPaths = nil
   690  		}
   691  
   692  		// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
   693  		// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
   694  		if uidMap := daemon.idMapping.UIDs(); uidMap != nil || c.HostConfig.Privileged {
   695  			for i, m := range s.Mounts {
   696  				if m.Type == "cgroup" {
   697  					clearReadOnly(&s.Mounts[i])
   698  				}
   699  			}
   700  		}
   701  
   702  		return nil
   703  
   704  	}
   705  }
   706  
   707  // sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually
   708  // exist, so do not add the default ones if running on an old kernel.
   709  func sysctlExists(s string) bool {
   710  	f := filepath.Join("/proc", "sys", strings.Replace(s, ".", "/", -1))
   711  	_, err := os.Stat(f)
   712  	return err == nil
   713  }
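
// Illustrative sketch (not upstream code): a sysctl key maps to its /proc/sys
// path by replacing dots with slashes, e.g.
// "net.ipv4.ip_unprivileged_port_start" becomes
// "/proc/sys/net/ipv4/ip_unprivileged_port_start".
func exampleSysctlPath() string {
	s := "net.ipv4.ip_unprivileged_port_start"
	return filepath.Join("/proc", "sys", strings.Replace(s, ".", "/", -1))
}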
   714  
   715  // WithCommonOptions sets common docker options
   716  func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
   717  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   718  		if c.BaseFS == nil {
   719  			return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly nil")
   720  		}
   721  		linkedEnv, err := daemon.setupLinkedContainers(c)
   722  		if err != nil {
   723  			return err
   724  		}
   725  		s.Root = &specs.Root{
   726  			Path:     c.BaseFS.Path(),
   727  			Readonly: c.HostConfig.ReadonlyRootfs,
   728  		}
   729  		if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
   730  			return err
   731  		}
   732  		cwd := c.Config.WorkingDir
   733  		if len(cwd) == 0 {
   734  			cwd = "/"
   735  		}
   736  		s.Process.Args = append([]string{c.Path}, c.Args...)
   737  
   738  		// Only add the custom init if it is specified and the container is running in its
   739  		// own private PID namespace. It does not make sense to add it if it is running in the
   740  		// host namespace or in another container's PID namespace, where we already have an init.
   741  		if c.HostConfig.PidMode.IsPrivate() {
   742  			if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
   743  				(c.HostConfig.Init == nil && daemon.configStore.Init) {
   744  				s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
   745  				path := daemon.configStore.InitPath
   746  				if path == "" {
   747  					path, err = exec.LookPath(daemonconfig.DefaultInitBinary)
   748  					if err != nil {
   749  						return err
   750  					}
   751  				}
   752  				s.Mounts = append(s.Mounts, specs.Mount{
   753  					Destination: inContainerInitPath,
   754  					Type:        "bind",
   755  					Source:      path,
   756  					Options:     []string{"bind", "ro"},
   757  				})
   758  			}
   759  		}
   760  		s.Process.Cwd = cwd
   761  		s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
   762  		s.Process.Terminal = c.Config.Tty
   763  
   764  		s.Hostname = c.Config.Hostname
   765  		setLinuxDomainname(c, s)
   766  
   767  		// Add default sysctls that are generally safe and useful; currently we
   768  		// grant the capabilities to allow these anyway. You can override them if
   769  		// you want to restore the original behaviour.
   770  		// We only set network sysctls when we create a new network namespace, not
   771  		// when the network namespace is host or we are joining an existing namespace.
   772  		if c.HostConfig.NetworkMode.IsPrivate() {
   773  			// We cannot set up ping socket support in a user namespace
   774  			if !c.HostConfig.UsernsMode.IsPrivate() && sysctlExists("net.ipv4.ping_group_range") {
   775  				// allow unprivileged ICMP echo sockets without CAP_NET_RAW
   776  				s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
   777  			}
   778  			// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
   779  			if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
   780  				s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
   781  			}
   782  		}
   783  
   784  		return nil
   785  	}
   786  }
   787  
   788  // WithCgroups sets the container's cgroups
   789  func WithCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts {
   790  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   791  		var cgroupsPath string
   792  		scopePrefix := "docker"
   793  		parent := "/docker"
   794  		useSystemd := UsingSystemd(daemon.configStore)
   795  		if useSystemd {
   796  			parent = "system.slice"
   797  			if daemon.configStore.Rootless {
   798  				parent = "user.slice"
   799  			}
   800  		}
   801  
   802  		if c.HostConfig.CgroupParent != "" {
   803  			parent = c.HostConfig.CgroupParent
   804  		} else if daemon.configStore.CgroupParent != "" {
   805  			parent = daemon.configStore.CgroupParent
   806  		}
   807  
   808  		if useSystemd {
   809  			cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
   810  			logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
   811  		} else {
   812  			cgroupsPath = filepath.Join(parent, c.ID)
   813  		}
   814  		s.Linux.CgroupsPath = cgroupsPath
   815  
   816  		// the rest is only needed for CPU RT controller
   817  
   818  		if daemon.configStore.CPURealtimePeriod == 0 && daemon.configStore.CPURealtimeRuntime == 0 {
   819  			return nil
   820  		}
   821  
   822  		if cdcgroups.Mode() == cdcgroups.Unified {
   823  			return errors.New("daemon-scoped cpu-rt-period and cpu-rt-runtime are not implemented for cgroup v2")
   824  		}
   825  
   826  		// FIXME this is a very expensive way to check if cpu rt is supported
   827  		sysInfo := daemon.RawSysInfo(true)
   828  		if !sysInfo.CPURealtime {
   829  			return errors.New("daemon-scoped cpu-rt-period and cpu-rt-runtime are not supported by the kernel")
   830  		}
   831  
   832  		p := cgroupsPath
   833  		if useSystemd {
   834  			initPath, err := cgroups.GetInitCgroup("cpu")
   835  			if err != nil {
   836  				return errors.Wrap(err, "unable to init CPU RT controller")
   837  			}
   838  			_, err = cgroups.GetOwnCgroup("cpu")
   839  			if err != nil {
   840  				return errors.Wrap(err, "unable to init CPU RT controller")
   841  			}
   842  			p = filepath.Join(initPath, s.Linux.CgroupsPath)
   843  		}
   844  
   845  		// Clean path to guard against things like ../../../BAD
   846  		parentPath := filepath.Dir(p)
   847  		if !filepath.IsAbs(parentPath) {
   848  			parentPath = filepath.Clean("/" + parentPath)
   849  		}
   850  
   851  		mnt, root, err := cgroups.FindCgroupMountpointAndRoot("", "cpu")
   852  		if err != nil {
   853  			return errors.Wrap(err, "unable to init CPU RT controller")
   854  		}
   855  		// When docker is run inside docker, the root is based on the host cgroup.
   856  		// Should this be handled in runc/libcontainer/cgroups?
   857  		if strings.HasPrefix(root, "/docker/") {
   858  			root = "/"
   859  		}
   860  		mnt = filepath.Join(mnt, root)
   861  
   862  		if err := daemon.initCPURtController(mnt, parentPath); err != nil {
   863  			return errors.Wrap(err, "unable to init CPU RT controller")
   864  		}
   865  		return nil
   866  	}
   867  }
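
// Illustrative sketch (hypothetical container ID): the two cgroup path layouts
// produced above. The systemd driver uses a "parent:prefix:name" scope triple,
// while the cgroupfs driver uses a plain filesystem path.
func exampleCgroupsPaths() (string, string) {
	id := "0123456789ab"
	systemdStyle := "system.slice" + ":" + "docker" + ":" + id // "system.slice:docker:0123456789ab"
	cgroupfsStyle := filepath.Join("/docker", id)              // "/docker/0123456789ab"
	return systemdStyle, cgroupfsStyle
}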
   868  
   869  // WithDevices sets the container's devices
   870  func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
   871  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   872  		// Build lists of devices allowed and created within the container.
   873  		var devs []specs.LinuxDevice
   874  		devPermissions := s.Linux.Resources.Devices
   875  
   876  		if c.HostConfig.Privileged && !userns.RunningInUserNS() {
   877  			hostDevices, err := devices.HostDevices()
   878  			if err != nil {
   879  				return err
   880  			}
   881  			for _, d := range hostDevices {
   882  				devs = append(devs, oci.Device(d))
   883  			}
   884  
   885  			// adding device mappings in privileged containers
   886  			for _, deviceMapping := range c.HostConfig.Devices {
   887  				// issue a warning that custom cgroup permissions are ignored in privileged mode
   888  				if deviceMapping.CgroupPermissions != "rwm" {
   889  					logrus.WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
   890  				}
   891  				// issue a warning that the device path already exists via /dev mounting in privileged mode
   892  				if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
   893  					logrus.WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
   894  					continue
   895  				}
   896  				d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
   897  				if err != nil {
   898  					return err
   899  				}
   900  				devs = append(devs, d...)
   901  			}
   902  
   903  			devPermissions = []specs.LinuxDeviceCgroup{
   904  				{
   905  					Allow:  true,
   906  					Access: "rwm",
   907  				},
   908  			}
   909  		} else {
   910  			for _, deviceMapping := range c.HostConfig.Devices {
   911  				d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
   912  				if err != nil {
   913  					return err
   914  				}
   915  				devs = append(devs, d...)
   916  				devPermissions = append(devPermissions, dPermissions...)
   917  			}
   918  
   919  			var err error
   920  			devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
   921  			if err != nil {
   922  				return err
   923  			}
   924  		}
   925  
   926  		s.Linux.Devices = append(s.Linux.Devices, devs...)
   927  		s.Linux.Resources.Devices = devPermissions
   928  
   929  		for _, req := range c.HostConfig.DeviceRequests {
   930  			if err := daemon.handleDevice(req, s); err != nil {
   931  				return err
   932  			}
   933  		}
   934  		return nil
   935  	}
   936  }
   937  
   938  // WithResources applies the container resources
   939  func WithResources(c *container.Container) coci.SpecOpts {
   940  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   941  		r := c.HostConfig.Resources
   942  		weightDevices, err := getBlkioWeightDevices(r)
   943  		if err != nil {
   944  			return err
   945  		}
   946  		readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
   947  		if err != nil {
   948  			return err
   949  		}
   950  		writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
   951  		if err != nil {
   952  			return err
   953  		}
   954  		readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
   955  		if err != nil {
   956  			return err
   957  		}
   958  		writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
   959  		if err != nil {
   960  			return err
   961  		}
   962  
   963  		memoryRes := getMemoryResources(r)
   964  		cpuRes, err := getCPUResources(r)
   965  		if err != nil {
   966  			return err
   967  		}
   968  		blkioWeight := r.BlkioWeight
   969  
   970  		specResources := &specs.LinuxResources{
   971  			Memory: memoryRes,
   972  			CPU:    cpuRes,
   973  			BlockIO: &specs.LinuxBlockIO{
   974  				Weight:                  &blkioWeight,
   975  				WeightDevice:            weightDevices,
   976  				ThrottleReadBpsDevice:   readBpsDevice,
   977  				ThrottleWriteBpsDevice:  writeBpsDevice,
   978  				ThrottleReadIOPSDevice:  readIOpsDevice,
   979  				ThrottleWriteIOPSDevice: writeIOpsDevice,
   980  			},
   981  			Pids: getPidsLimit(r),
   982  		}
   983  
   984  		if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
   985  			specResources.Devices = s.Linux.Resources.Devices
   986  		}
   987  
   988  		s.Linux.Resources = specResources
   989  		return nil
   990  	}
   991  }
   992  
   993  // WithSysctls sets the container's sysctls
   994  func WithSysctls(c *container.Container) coci.SpecOpts {
   995  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   996  		// We merge the sysctls injected above with the HostConfig (the latter takes
   997  		// precedence for backwards-compatibility reasons).
   998  		for k, v := range c.HostConfig.Sysctls {
   999  			s.Linux.Sysctl[k] = v
  1000  		}
  1001  		return nil
  1002  	}
  1003  }
  1004  
  1005  // WithUser sets the container's user
  1006  func WithUser(c *container.Container) coci.SpecOpts {
  1007  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
  1008  		var err error
  1009  		s.Process.User, err = getUser(c, c.Config.User)
  1010  		return err
  1011  	}
  1012  }
  1013  
  1014  func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, err error) {
  1015  	var (
  1016  		opts []coci.SpecOpts
  1017  		s    = oci.DefaultSpec()
  1018  	)
  1019  	opts = append(opts,
  1020  		WithCommonOptions(daemon, c),
  1021  		WithCgroups(daemon, c),
  1022  		WithResources(c),
  1023  		WithSysctls(c),
  1024  		WithDevices(daemon, c),
  1025  		WithUser(c),
  1026  		WithRlimits(daemon, c),
  1027  		WithNamespaces(daemon, c),
  1028  		WithCapabilities(c),
  1029  		WithSeccomp(daemon, c),
  1030  		WithMounts(daemon, c),
  1031  		WithLibnetwork(daemon, c),
  1032  		WithApparmor(c),
  1033  		WithSelinux(c),
  1034  		WithOOMScore(&c.HostConfig.OomScoreAdj),
  1035  	)
  1036  	if c.NoNewPrivileges {
  1037  		opts = append(opts, coci.WithNoNewPrivileges)
  1038  	}
  1039  
  1040  	// Set the masked and readonly paths with regard to the host config options if they are set.
  1041  	if c.HostConfig.MaskedPaths != nil {
  1042  		opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
  1043  	}
  1044  	if c.HostConfig.ReadonlyPaths != nil {
  1045  		opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
  1046  	}
  1047  	if daemon.configStore.Rootless {
  1048  		opts = append(opts, WithRootless(daemon))
  1049  	}
  1050  	return &s, coci.ApplyOpts(context.Background(), nil, &containers.Container{
  1051  		ID: c.ID,
  1052  	}, &s, opts...)
  1053  }
  1054  
  1055  func clearReadOnly(m *specs.Mount) {
  1056  	var opt []string
  1057  	for _, o := range m.Options {
  1058  		if o != "ro" {
  1059  			opt = append(opt, o)
  1060  		}
  1061  	}
  1062  	m.Options = opt
  1063  }
  1064  
  1065  // mergeUlimits merges the Ulimits from HostConfig with the daemon defaults, and updates HostConfig
  1066  func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
  1067  	ulimits := c.Ulimits
  1068  	// Merge ulimits with daemon defaults
  1069  	ulIdx := make(map[string]struct{})
  1070  	for _, ul := range ulimits {
  1071  		ulIdx[ul.Name] = struct{}{}
  1072  	}
  1073  	for name, ul := range daemon.configStore.Ulimits {
  1074  		if _, exists := ulIdx[name]; !exists {
  1075  			ulimits = append(ulimits, ul)
  1076  		}
  1077  	}
  1078  	c.Ulimits = ulimits
  1079  }
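
// Illustrative sketch (names only, not upstream code): mergeUlimits keeps
// every limit that is already set on the container and appends only those
// daemon defaults whose name is not present yet, so a container-level
// "nofile" always wins over the daemon's default "nofile".
func exampleMergeUlimitNames(containerLimits, daemonDefaults []string) []string {
	seen := make(map[string]struct{})
	for _, name := range containerLimits {
		seen[name] = struct{}{}
	}
	merged := containerLimits
	for _, name := range daemonDefaults {
		if _, ok := seen[name]; !ok {
			merged = append(merged, name)
		}
	}
	return merged // e.g. (["nofile"], ["nofile", "nproc"]) -> ["nofile", "nproc"]
}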