github.com/docker/docker@v299999999.0.0-20200612211812-aaf470eca7b5+incompatible/daemon/oci_linux.go (about)

     1  package daemon // import "github.com/docker/docker/daemon"
     2  
     3  import (
     4  	"context"
     5  	"fmt"
     6  	"io"
     7  	"io/ioutil"
     8  	"os"
     9  	"os/exec"
    10  	"path/filepath"
    11  	"sort"
    12  	"strconv"
    13  	"strings"
    14  
    15  	"github.com/containerd/containerd/containers"
    16  	coci "github.com/containerd/containerd/oci"
    17  	containertypes "github.com/docker/docker/api/types/container"
    18  	"github.com/docker/docker/container"
    19  	daemonconfig "github.com/docker/docker/daemon/config"
    20  	"github.com/docker/docker/oci"
    21  	"github.com/docker/docker/oci/caps"
    22  	"github.com/docker/docker/pkg/idtools"
    23  	"github.com/docker/docker/pkg/stringid"
    24  	"github.com/docker/docker/rootless/specconv"
    25  	volumemounts "github.com/docker/docker/volume/mounts"
    26  	"github.com/moby/sys/mount"
    27  	"github.com/moby/sys/mountinfo"
    28  	"github.com/opencontainers/runc/libcontainer/apparmor"
    29  	"github.com/opencontainers/runc/libcontainer/cgroups"
    30  	"github.com/opencontainers/runc/libcontainer/devices"
    31  	rsystem "github.com/opencontainers/runc/libcontainer/system"
    32  	"github.com/opencontainers/runc/libcontainer/user"
    33  	specs "github.com/opencontainers/runtime-spec/specs-go"
    34  	"github.com/pkg/errors"
    35  	"github.com/sirupsen/logrus"
    36  	"golang.org/x/sys/unix"
    37  )
    38  
// inContainerInitPath is the path inside the container at which the
// docker-init binary is bind-mounted when an init process is requested
// (see WithCommonOptions).
const inContainerInitPath = "/sbin/" + daemonconfig.DefaultInitBinary
    40  
    41  // WithRlimits sets the container's rlimits along with merging the daemon's rlimits
    42  func WithRlimits(daemon *Daemon, c *container.Container) coci.SpecOpts {
    43  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    44  		var rlimits []specs.POSIXRlimit
    45  
    46  		// We want to leave the original HostConfig alone so make a copy here
    47  		hostConfig := *c.HostConfig
    48  		// Merge with the daemon defaults
    49  		daemon.mergeUlimits(&hostConfig)
    50  		for _, ul := range hostConfig.Ulimits {
    51  			rlimits = append(rlimits, specs.POSIXRlimit{
    52  				Type: "RLIMIT_" + strings.ToUpper(ul.Name),
    53  				Soft: uint64(ul.Soft),
    54  				Hard: uint64(ul.Hard),
    55  			})
    56  		}
    57  
    58  		s.Process.Rlimits = rlimits
    59  		return nil
    60  	}
    61  }
    62  
    63  // WithLibnetwork sets the libnetwork hook
    64  func WithLibnetwork(daemon *Daemon, c *container.Container) coci.SpecOpts {
    65  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    66  		if s.Hooks == nil {
    67  			s.Hooks = &specs.Hooks{}
    68  		}
    69  		for _, ns := range s.Linux.Namespaces {
    70  			if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
    71  				target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
    72  				shortNetCtlrID := stringid.TruncateID(daemon.netController.ID())
    73  				s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{
    74  					Path: target,
    75  					Args: []string{
    76  						"libnetwork-setkey",
    77  						"-exec-root=" + daemon.configStore.GetExecRoot(),
    78  						c.ID,
    79  						shortNetCtlrID,
    80  					},
    81  				})
    82  			}
    83  		}
    84  		return nil
    85  	}
    86  }
    87  
    88  // WithRootless sets the spec to the rootless configuration
    89  func WithRootless(daemon *Daemon) coci.SpecOpts {
    90  	return func(_ context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
    91  		var v2Controllers []string
    92  		if daemon.getCgroupDriver() == cgroupSystemdDriver {
    93  			if !cgroups.IsCgroup2UnifiedMode() {
    94  				return errors.New("rootless systemd driver doesn't support cgroup v1")
    95  			}
    96  			rootlesskitParentEUID := os.Getenv("ROOTLESSKIT_PARENT_EUID")
    97  			if rootlesskitParentEUID == "" {
    98  				return errors.New("$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)")
    99  			}
   100  			controllersPath := fmt.Sprintf("/sys/fs/cgroup/user.slice/user-%s.slice/cgroup.controllers", rootlesskitParentEUID)
   101  			controllersFile, err := ioutil.ReadFile(controllersPath)
   102  			if err != nil {
   103  				return err
   104  			}
   105  			v2Controllers = strings.Fields(string(controllersFile))
   106  		}
   107  		return specconv.ToRootless(s, v2Controllers)
   108  	}
   109  }
   110  
   111  // WithOOMScore sets the oom score
   112  func WithOOMScore(score *int) coci.SpecOpts {
   113  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   114  		s.Process.OOMScoreAdj = score
   115  		return nil
   116  	}
   117  }
   118  
   119  // WithSelinux sets the selinux labels
   120  func WithSelinux(c *container.Container) coci.SpecOpts {
   121  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   122  		s.Process.SelinuxLabel = c.GetProcessLabel()
   123  		s.Linux.MountLabel = c.MountLabel
   124  		return nil
   125  	}
   126  }
   127  
   128  // WithApparmor sets the apparmor profile
   129  func WithApparmor(c *container.Container) coci.SpecOpts {
   130  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   131  		if apparmor.IsEnabled() {
   132  			var appArmorProfile string
   133  			if c.AppArmorProfile != "" {
   134  				appArmorProfile = c.AppArmorProfile
   135  			} else if c.HostConfig.Privileged {
   136  				appArmorProfile = unconfinedAppArmorProfile
   137  			} else {
   138  				appArmorProfile = defaultAppArmorProfile
   139  			}
   140  
   141  			if appArmorProfile == defaultAppArmorProfile {
   142  				// Unattended upgrades and other fun services can unload AppArmor
   143  				// profiles inadvertently. Since we cannot store our profile in
   144  				// /etc/apparmor.d, nor can we practically add other ways of
   145  				// telling the system to keep our profile loaded, in order to make
   146  				// sure that we keep the default profile enabled we dynamically
   147  				// reload it if necessary.
   148  				if err := ensureDefaultAppArmorProfile(); err != nil {
   149  					return err
   150  				}
   151  			}
   152  			s.Process.ApparmorProfile = appArmorProfile
   153  		}
   154  		return nil
   155  	}
   156  }
   157  
   158  // WithCapabilities sets the container's capabilties
   159  func WithCapabilities(c *container.Container) coci.SpecOpts {
   160  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   161  		capabilities, err := caps.TweakCapabilities(
   162  			caps.DefaultCapabilities(),
   163  			c.HostConfig.CapAdd,
   164  			c.HostConfig.CapDrop,
   165  			c.HostConfig.Capabilities,
   166  			c.HostConfig.Privileged,
   167  		)
   168  		if err != nil {
   169  			return err
   170  		}
   171  		return oci.SetCapabilities(s, capabilities)
   172  	}
   173  }
   174  
   175  func readUserFile(c *container.Container, p string) (io.ReadCloser, error) {
   176  	fp, err := c.GetResourcePath(p)
   177  	if err != nil {
   178  		return nil, err
   179  	}
   180  	return os.Open(fp)
   181  }
   182  
// getUser resolves the uid, gid, and supplementary gids for the given
// username (or uid[:gid] spec) against the container's own passwd and group
// files. Entries from HostConfig.GroupAdd are resolved against the
// container's group file and appended to the supplementary gids.
func getUser(c *container.Container, username string) (uint32, uint32, []uint32, error) {
	passwdPath, err := user.GetPasswdPath()
	if err != nil {
		return 0, 0, nil, err
	}
	groupPath, err := user.GetGroupPath()
	if err != nil {
		return 0, 0, nil, err
	}
	// Open errors are deliberately ignored here: GetExecUser accepts nil
	// readers, so a missing passwd/group file simply means defaults apply.
	passwdFile, err := readUserFile(c, passwdPath)
	if err == nil {
		defer passwdFile.Close()
	}
	groupFile, err := readUserFile(c, groupPath)
	if err == nil {
		defer groupFile.Close()
	}

	execUser, err := user.GetExecUser(username, nil, passwdFile, groupFile)
	if err != nil {
		return 0, 0, nil, err
	}

	// The group file must be re-opened because GetExecUser has already
	// consumed the first reader.
	// todo: fix this double read by a change to libcontainer/user pkg
	groupFile, err = readUserFile(c, groupPath)
	if err == nil {
		defer groupFile.Close()
	}
	var addGroups []int
	if len(c.HostConfig.GroupAdd) > 0 {
		addGroups, err = user.GetAdditionalGroups(c.HostConfig.GroupAdd, groupFile)
		if err != nil {
			return 0, 0, nil, err
		}
	}
	uid := uint32(execUser.Uid)
	gid := uint32(execUser.Gid)
	sgids := append(execUser.Sgids, addGroups...)
	var additionalGids []uint32
	for _, g := range sgids {
		additionalGids = append(additionalGids, uint32(g))
	}
	return uid, gid, additionalGids, nil
}
   227  
   228  func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
   229  	for i, n := range s.Linux.Namespaces {
   230  		if n.Type == ns.Type {
   231  			s.Linux.Namespaces[i] = ns
   232  			return
   233  		}
   234  	}
   235  	s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
   236  }
   237  
// WithNamespaces sets the container's namespaces
func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		// userNS tracks whether this container gets its own user namespace;
		// joining another container's net/ipc/pid namespace then also
		// requires joining that container's user namespace.
		userNS := false
		// user
		if c.HostConfig.UsernsMode.IsPrivate() {
			uidMap := daemon.idMapping.UIDs()
			if uidMap != nil {
				userNS = true
				ns := specs.LinuxNamespace{Type: "user"}
				setNamespace(s, ns)
				s.Linux.UIDMappings = specMapping(uidMap)
				s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDs())
			}
		}
		// network
		if !c.Config.NetworkDisabled {
			ns := specs.LinuxNamespace{Type: "network"}
			parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
			if parts[0] == "container" {
				// Join the target container's network namespace via its
				// /proc/<pid>/ns/net path.
				nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
				if err != nil {
					return err
				}
				ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
				if userNS {
					// to share a net namespace, they must also share a user namespace
					nsUser := specs.LinuxNamespace{Type: "user"}
					nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
					setNamespace(s, nsUser)
				}
			} else if c.HostConfig.NetworkMode.IsHost() {
				ns.Path = c.NetworkSettings.SandboxKey
			}
			setNamespace(s, ns)
		}

		// ipc
		ipcMode := c.HostConfig.IpcMode
		switch {
		case ipcMode.IsContainer():
			ns := specs.LinuxNamespace{Type: "ipc"}
			ic, err := daemon.getIpcContainer(ipcMode.Container())
			if err != nil {
				return err
			}
			ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
			setNamespace(s, ns)
			if userNS {
				// to share an IPC namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{Type: "user"}
				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
				setNamespace(s, nsUser)
			}
		case ipcMode.IsHost():
			oci.RemoveNamespace(s, "ipc")
		case ipcMode.IsEmpty():
			// A container was created by an older version of the daemon.
			// The default behavior used to be what is now called "shareable".
			fallthrough
		case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
			ns := specs.LinuxNamespace{Type: "ipc"}
			setNamespace(s, ns)
		default:
			return fmt.Errorf("Invalid IPC mode: %v", ipcMode)
		}

		// pid
		if c.HostConfig.PidMode.IsContainer() {
			pc, err := daemon.getPidContainer(c)
			if err != nil {
				return err
			}
			ns := specs.LinuxNamespace{
				Type: "pid",
				Path: fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()),
			}
			setNamespace(s, ns)
			if userNS {
				// to share a PID namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{
					Type: "user",
					Path: fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()),
				}
				setNamespace(s, nsUser)
			}
		} else if c.HostConfig.PidMode.IsHost() {
			oci.RemoveNamespace(s, "pid")
		} else {
			ns := specs.LinuxNamespace{Type: "pid"}
			setNamespace(s, ns)
		}
		// uts
		if c.HostConfig.UTSMode.IsHost() {
			// Host UTS namespace: the spec must not carry a hostname.
			oci.RemoveNamespace(s, "uts")
			s.Hostname = ""
		}

		// cgroup
		if !c.HostConfig.CgroupnsMode.IsEmpty() {
			cgroupNsMode := c.HostConfig.CgroupnsMode
			if !cgroupNsMode.Valid() {
				return fmt.Errorf("invalid cgroup namespace mode: %v", cgroupNsMode)
			}
			if cgroupNsMode.IsPrivate() {
				nsCgroup := specs.LinuxNamespace{Type: "cgroup"}
				setNamespace(s, nsCgroup)
			}
		}

		return nil
	}
}
   351  
   352  func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
   353  	var ids []specs.LinuxIDMapping
   354  	for _, item := range s {
   355  		ids = append(ids, specs.LinuxIDMapping{
   356  			HostID:      uint32(item.HostID),
   357  			ContainerID: uint32(item.ContainerID),
   358  			Size:        uint32(item.Size),
   359  		})
   360  	}
   361  	return ids
   362  }
   363  
   364  // Get the source mount point of directory passed in as argument. Also return
   365  // optional fields.
   366  func getSourceMount(source string) (string, string, error) {
   367  	// Ensure any symlinks are resolved.
   368  	sourcePath, err := filepath.EvalSymlinks(source)
   369  	if err != nil {
   370  		return "", "", err
   371  	}
   372  
   373  	mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(sourcePath))
   374  	if err != nil {
   375  		return "", "", err
   376  	}
   377  	if len(mi) < 1 {
   378  		return "", "", fmt.Errorf("Can't find mount point of %s", source)
   379  	}
   380  
   381  	// find the longest mount point
   382  	var idx, maxlen int
   383  	for i := range mi {
   384  		if len(mi[i].Mountpoint) > maxlen {
   385  			maxlen = len(mi[i].Mountpoint)
   386  			idx = i
   387  		}
   388  	}
   389  	return mi[idx].Mountpoint, mi[idx].Optional, nil
   390  }
   391  
const (
	// Prefixes of the optional fields in /proc/self/mountinfo that mark a
	// mount's propagation ("shared:N" peer group, "master:N" slave of a
	// peer group); see proc(5).
	sharedPropagationOption = "shared:"
	slavePropagationOption  = "master:"
)
   396  
   397  // hasMountInfoOption checks if any of the passed any of the given option values
   398  // are set in the passed in option string.
   399  func hasMountInfoOption(opts string, vals ...string) bool {
   400  	for _, opt := range strings.Split(opts, " ") {
   401  		for _, val := range vals {
   402  			if strings.HasPrefix(opt, val) {
   403  				return true
   404  			}
   405  		}
   406  	}
   407  	return false
   408  }
   409  
   410  // Ensure mount point on which path is mounted, is shared.
   411  func ensureShared(path string) error {
   412  	sourceMount, optionalOpts, err := getSourceMount(path)
   413  	if err != nil {
   414  		return err
   415  	}
   416  	// Make sure source mount point is shared.
   417  	if !hasMountInfoOption(optionalOpts, sharedPropagationOption) {
   418  		return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
   419  	}
   420  	return nil
   421  }
   422  
   423  // Ensure mount point on which path is mounted, is either shared or slave.
   424  func ensureSharedOrSlave(path string) error {
   425  	sourceMount, optionalOpts, err := getSourceMount(path)
   426  	if err != nil {
   427  		return err
   428  	}
   429  
   430  	if !hasMountInfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
   431  		return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
   432  	}
   433  	return nil
   434  }
   435  
   436  // Get the set of mount flags that are set on the mount that contains the given
   437  // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
   438  // bind-mounting "with options" will not fail with user namespaces, due to
   439  // kernel restrictions that require user namespace mounts to preserve
   440  // CL_UNPRIVILEGED locked flags.
   441  func getUnprivilegedMountFlags(path string) ([]string, error) {
   442  	var statfs unix.Statfs_t
   443  	if err := unix.Statfs(path, &statfs); err != nil {
   444  		return nil, err
   445  	}
   446  
   447  	// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
   448  	unprivilegedFlags := map[uint64]string{
   449  		unix.MS_RDONLY:     "ro",
   450  		unix.MS_NODEV:      "nodev",
   451  		unix.MS_NOEXEC:     "noexec",
   452  		unix.MS_NOSUID:     "nosuid",
   453  		unix.MS_NOATIME:    "noatime",
   454  		unix.MS_RELATIME:   "relatime",
   455  		unix.MS_NODIRATIME: "nodiratime",
   456  	}
   457  
   458  	var flags []string
   459  	for mask, flag := range unprivilegedFlags {
   460  		if uint64(statfs.Flags)&mask == mask {
   461  			flags = append(flags, flag)
   462  		}
   463  	}
   464  
   465  	return flags, nil
   466  }
   467  
var (
	// mountPropagationMap maps user-facing propagation mode names to the
	// moby/sys/mount flag constants.
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}

	// mountPropagationReverseMap is the inverse of mountPropagationMap,
	// used to render a flag back into its spec string.
	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)
   487  
   488  // inSlice tests whether a string is contained in a slice of strings or not.
   489  // Comparison is case sensitive
   490  func inSlice(slice []string, s string) bool {
   491  	for _, ss := range slice {
   492  		if s == ss {
   493  			return true
   494  		}
   495  	}
   496  	return false
   497  }
   498  
// WithMounts sets the container's mounts
func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) {
		if err := daemon.setupContainerMountsRoot(c); err != nil {
			return err
		}

		if err := daemon.setupIpcDirs(c); err != nil {
			return err
		}

		// err is the named return value, so this cleanup fires on any
		// failure returned below, after the secret dir may have been set up.
		defer func() {
			if err != nil {
				daemon.cleanupSecretDir(c)
			}
		}()

		if err := daemon.setupSecretDir(c); err != nil {
			return err
		}

		ms, err := daemon.setupMounts(c)
		if err != nil {
			return err
		}

		// Private (and legacy-empty) IPC modes bring their own /dev/shm
		// handling; all other modes contribute IPC mounts here.
		if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
			ms = append(ms, c.IpcMounts()...)
		}

		tmpfsMounts, err := c.TmpfsMounts()
		if err != nil {
			return err
		}
		ms = append(ms, tmpfsMounts...)

		secretMounts, err := c.SecretMounts()
		if err != nil {
			return err
		}
		ms = append(ms, secretMounts...)

		sort.Sort(mounts(ms))

		mounts := ms

		// Index user-supplied destinations for the filtering below.
		userMounts := make(map[string]struct{})
		for _, m := range mounts {
			userMounts[m.Destination] = struct{}{}
		}

		// Copy all mounts from spec to defaultMounts, except for
		//  - mounts overridden by a user supplied mount;
		//  - all mounts under /dev if a user supplied /dev is present;
		//  - /dev/shm, in case IpcMode is none.
		// While at it, also
		//  - set size for /dev/shm from shmsize.
		defaultMounts := s.Mounts[:0]
		_, mountDev := userMounts["/dev"]
		for _, m := range s.Mounts {
			if _, ok := userMounts[m.Destination]; ok {
				// filter out mount overridden by a user supplied mount
				continue
			}
			if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
				// filter out everything under /dev if /dev is user-mounted
				continue
			}

			if m.Destination == "/dev/shm" {
				if c.HostConfig.IpcMode.IsNone() {
					// filter out /dev/shm for "none" IpcMode
					continue
				}
				// set size for /dev/shm mount from spec
				sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
				m.Options = append(m.Options, sizeOpt)
			}

			defaultMounts = append(defaultMounts, m)
		}

		s.Mounts = defaultMounts
		for _, m := range mounts {
			if m.Source == "tmpfs" {
				data := m.Data
				parser := volumemounts.NewParser("linux")
				options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
				if data != "" {
					options = append(options, strings.Split(data, ",")...)
				}

				merged, err := mount.MergeTmpfsOptions(options)
				if err != nil {
					return err
				}

				s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
				continue
			}

			mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}

			// Determine property of RootPropagation based on volume
			// properties. If a volume is shared, then keep root propagation
			// shared. This should work for slave and private volumes too.
			//
			// For slave volumes, it can be either [r]shared/[r]slave.
			//
			// For private volumes any root propagation value should work.
			pFlag := mountPropagationMap[m.Propagation]
			switch pFlag {
			case mount.SHARED, mount.RSHARED:
				if err := ensureShared(m.Source); err != nil {
					return err
				}
				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
				if rootpg != mount.SHARED && rootpg != mount.RSHARED {
					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
				}
			case mount.SLAVE, mount.RSLAVE:
				var fallback bool
				if err := ensureSharedOrSlave(m.Source); err != nil {
					// For backwards compatibility purposes, treat mounts from the daemon root
					// as special since we automatically add rslave propagation to these mounts
					// when the user did not set anything, so we should fallback to the old
					// behavior which is to use private propagation which is normally the
					// default.
					if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
						return err
					}

					cm, ok := c.MountPoints[m.Destination]
					if !ok {
						return err
					}
					if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
						// This means the user explicitly set a propagation, do not fallback in that case.
						return err
					}
					fallback = true
					logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
				}
				if !fallback {
					rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
					if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
						s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
					}
				}
			}

			bindMode := "rbind"
			if m.NonRecursive {
				bindMode = "bind"
			}
			opts := []string{bindMode}
			if !m.Writable {
				opts = append(opts, "ro")
			}
			if pFlag != 0 {
				opts = append(opts, mountPropagationReverseMap[pFlag])
			}

			// If we are using user namespaces, then we must make sure that we
			// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
			// "mount" when we bind-mount. The reason for this is that at the point
			// when runc sets up the root filesystem, it is already inside a user
			// namespace, and thus cannot change any flags that are locked.
			if daemon.configStore.RemappedRoot != "" {
				unprivOpts, err := getUnprivilegedMountFlags(m.Source)
				if err != nil {
					return err
				}
				opts = append(opts, unprivOpts...)
			}

			mt.Options = opts
			s.Mounts = append(s.Mounts, mt)
		}

		if s.Root.Readonly {
			// Re-mark default mounts read-only, except for the pseudo
			// filesystems below and anything the user supplied themselves.
			for i, m := range s.Mounts {
				switch m.Destination {
				case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
					continue
				}
				if _, ok := userMounts[m.Destination]; !ok {
					if !inSlice(m.Options, "ro") {
						s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
					}
				}
			}
		}

		if c.HostConfig.Privileged {
			// clear readonly for /sys
			for i := range s.Mounts {
				if s.Mounts[i].Destination == "/sys" {
					clearReadOnly(&s.Mounts[i])
				}
			}
			s.Linux.ReadonlyPaths = nil
			s.Linux.MaskedPaths = nil
		}

		// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
		// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
		if uidMap := daemon.idMapping.UIDs(); uidMap != nil || c.HostConfig.Privileged {
			for i, m := range s.Mounts {
				if m.Type == "cgroup" {
					clearReadOnly(&s.Mounts[i])
				}
			}
		}

		return nil

	}
}
   718  
   719  // sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually
   720  // exist, so do not add the default ones if running on an old kernel.
   721  func sysctlExists(s string) bool {
   722  	f := filepath.Join("/proc", "sys", strings.Replace(s, ".", "/", -1))
   723  	_, err := os.Stat(f)
   724  	return err == nil
   725  }
   726  
// WithCommonOptions sets common docker options
func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		if c.BaseFS == nil {
			return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly nil")
		}
		linkedEnv, err := daemon.setupLinkedContainers(c)
		if err != nil {
			return err
		}
		s.Root = &specs.Root{
			Path:     c.BaseFS.Path(),
			Readonly: c.HostConfig.ReadonlyRootfs,
		}
		if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
			return err
		}
		cwd := c.Config.WorkingDir
		if len(cwd) == 0 {
			cwd = "/"
		}
		s.Process.Args = append([]string{c.Path}, c.Args...)

		// only add the custom init if it is specified and the container is running in its
		// own private pid namespace.  It does not make sense to add if it is running in the
		// host namespace or another container's pid namespace where we already have an init
		if c.HostConfig.PidMode.IsPrivate() {
			if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
				(c.HostConfig.Init == nil && daemon.configStore.Init) {
				// Prepend the init binary so it becomes PID 1 and wraps the
				// original entrypoint.
				s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
				path := daemon.configStore.InitPath
				if path == "" {
					path, err = exec.LookPath(daemonconfig.DefaultInitBinary)
					if err != nil {
						return err
					}
				}
				// Bind-mount the host init binary into the container read-only.
				s.Mounts = append(s.Mounts, specs.Mount{
					Destination: inContainerInitPath,
					Type:        "bind",
					Source:      path,
					Options:     []string{"bind", "ro"},
				})
			}
		}
		s.Process.Cwd = cwd
		s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
		s.Process.Terminal = c.Config.Tty

		s.Hostname = c.Config.Hostname
		setLinuxDomainname(c, s)

		// Add default sysctls that are generally safe and useful; currently we
		// grant the capabilities to allow these anyway. You can override if
		// you want to restore the original behaviour.
		// We do not set network sysctls if network namespace is host, or if we are
		// joining an existing namespace, only if we create a new net namespace.
		if c.HostConfig.NetworkMode.IsPrivate() {
			// We cannot set up ping socket support in a user namespace
			if !c.HostConfig.UsernsMode.IsPrivate() && sysctlExists("net.ipv4.ping_group_range") {
				// allow unprivileged ICMP echo sockets without CAP_NET_RAW
				s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
			}
			// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
			if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
				s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
			}
		}

		return nil
	}
}
   799  
// WithCgroups sets the container's cgroups
func WithCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts {
	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
		var cgroupsPath string
		scopePrefix := "docker"
		parent := "/docker"
		useSystemd := UsingSystemd(daemon.configStore)
		if useSystemd {
			parent = "system.slice"
			if daemon.configStore.Rootless {
				parent = "user.slice"
			}
		}

		// Explicit cgroup parents override the defaults chosen above;
		// the container-level setting wins over the daemon-level one.
		if c.HostConfig.CgroupParent != "" {
			parent = c.HostConfig.CgroupParent
		} else if daemon.configStore.CgroupParent != "" {
			parent = daemon.configStore.CgroupParent
		}

		if useSystemd {
			// systemd expects the "slice:prefix:name" form.
			cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
			logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
		} else {
			cgroupsPath = filepath.Join(parent, c.ID)
		}
		s.Linux.CgroupsPath = cgroupsPath
		p := cgroupsPath
		if useSystemd {
			initPath, err := cgroups.GetInitCgroup("cpu")
			if err != nil {
				return err
			}
			// Probe our own cgroup; only the error matters here.
			_, err = cgroups.GetOwnCgroup("cpu")
			if err != nil {
				return err
			}
			p = filepath.Join(initPath, s.Linux.CgroupsPath)
		}

		// Clean path to guard against things like ../../../BAD
		parentPath := filepath.Dir(p)
		if !filepath.IsAbs(parentPath) {
			parentPath = filepath.Clean("/" + parentPath)
		}

		if err := daemon.initCgroupsPath(parentPath); err != nil {
			return fmt.Errorf("linux init cgroups path: %v", err)
		}
		return nil
	}
}
   852  
   853  // WithDevices sets the container's devices
   854  func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts {
   855  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   856  		// Build lists of devices allowed and created within the container.
   857  		var devs []specs.LinuxDevice
   858  		devPermissions := s.Linux.Resources.Devices
   859  
   860  		if c.HostConfig.Privileged && !rsystem.RunningInUserNS() {
   861  			hostDevices, err := devices.HostDevices()
   862  			if err != nil {
   863  				return err
   864  			}
   865  			for _, d := range hostDevices {
   866  				devs = append(devs, oci.Device(d))
   867  			}
   868  
   869  			// adding device mappings in privileged containers
   870  			for _, deviceMapping := range c.HostConfig.Devices {
   871  				// issue a warning that custom cgroup permissions are ignored in privileged mode
   872  				if deviceMapping.CgroupPermissions != "rwm" {
   873  					logrus.WithField("container", c.ID).Warnf("custom %s permissions for device %s are ignored in privileged mode", deviceMapping.CgroupPermissions, deviceMapping.PathOnHost)
   874  				}
   875  				// issue a warning that the device path already exists via /dev mounting in privileged mode
   876  				if deviceMapping.PathOnHost == deviceMapping.PathInContainer {
   877  					logrus.WithField("container", c.ID).Warnf("path in container %s already exists in privileged mode", deviceMapping.PathInContainer)
   878  					continue
   879  				}
   880  				d, _, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, "rwm")
   881  				if err != nil {
   882  					return err
   883  				}
   884  				devs = append(devs, d...)
   885  			}
   886  
   887  			devPermissions = []specs.LinuxDeviceCgroup{
   888  				{
   889  					Allow:  true,
   890  					Access: "rwm",
   891  				},
   892  			}
   893  		} else {
   894  			for _, deviceMapping := range c.HostConfig.Devices {
   895  				d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
   896  				if err != nil {
   897  					return err
   898  				}
   899  				devs = append(devs, d...)
   900  				devPermissions = append(devPermissions, dPermissions...)
   901  			}
   902  
   903  			var err error
   904  			devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
   905  			if err != nil {
   906  				return err
   907  			}
   908  		}
   909  
   910  		s.Linux.Devices = append(s.Linux.Devices, devs...)
   911  		s.Linux.Resources.Devices = devPermissions
   912  
   913  		for _, req := range c.HostConfig.DeviceRequests {
   914  			if err := daemon.handleDevice(req, s); err != nil {
   915  				return err
   916  			}
   917  		}
   918  		return nil
   919  	}
   920  }
   921  
   922  // WithResources applies the container resources
   923  func WithResources(c *container.Container) coci.SpecOpts {
   924  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   925  		r := c.HostConfig.Resources
   926  		weightDevices, err := getBlkioWeightDevices(r)
   927  		if err != nil {
   928  			return err
   929  		}
   930  		readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
   931  		if err != nil {
   932  			return err
   933  		}
   934  		writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
   935  		if err != nil {
   936  			return err
   937  		}
   938  		readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
   939  		if err != nil {
   940  			return err
   941  		}
   942  		writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
   943  		if err != nil {
   944  			return err
   945  		}
   946  
   947  		memoryRes := getMemoryResources(r)
   948  		cpuRes, err := getCPUResources(r)
   949  		if err != nil {
   950  			return err
   951  		}
   952  		blkioWeight := r.BlkioWeight
   953  
   954  		specResources := &specs.LinuxResources{
   955  			Memory: memoryRes,
   956  			CPU:    cpuRes,
   957  			BlockIO: &specs.LinuxBlockIO{
   958  				Weight:                  &blkioWeight,
   959  				WeightDevice:            weightDevices,
   960  				ThrottleReadBpsDevice:   readBpsDevice,
   961  				ThrottleWriteBpsDevice:  writeBpsDevice,
   962  				ThrottleReadIOPSDevice:  readIOpsDevice,
   963  				ThrottleWriteIOPSDevice: writeIOpsDevice,
   964  			},
   965  			Pids: getPidsLimit(r),
   966  		}
   967  
   968  		if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
   969  			specResources.Devices = s.Linux.Resources.Devices
   970  		}
   971  
   972  		s.Linux.Resources = specResources
   973  		return nil
   974  	}
   975  }
   976  
   977  // WithSysctls sets the container's sysctls
   978  func WithSysctls(c *container.Container) coci.SpecOpts {
   979  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   980  		// We merge the sysctls injected above with the HostConfig (latter takes
   981  		// precedence for backwards-compatibility reasons).
   982  		for k, v := range c.HostConfig.Sysctls {
   983  			s.Linux.Sysctl[k] = v
   984  		}
   985  		return nil
   986  	}
   987  }
   988  
   989  // WithUser sets the container's user
   990  func WithUser(c *container.Container) coci.SpecOpts {
   991  	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
   992  		uid, gid, additionalGids, err := getUser(c, c.Config.User)
   993  		if err != nil {
   994  			return err
   995  		}
   996  		s.Process.User.UID = uid
   997  		s.Process.User.GID = gid
   998  		s.Process.User.AdditionalGids = additionalGids
   999  		return nil
  1000  	}
  1001  }
  1002  
  1003  func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, err error) {
  1004  	var (
  1005  		opts []coci.SpecOpts
  1006  		s    = oci.DefaultSpec()
  1007  	)
  1008  	opts = append(opts,
  1009  		WithCommonOptions(daemon, c),
  1010  		WithCgroups(daemon, c),
  1011  		WithResources(c),
  1012  		WithSysctls(c),
  1013  		WithDevices(daemon, c),
  1014  		WithUser(c),
  1015  		WithRlimits(daemon, c),
  1016  		WithNamespaces(daemon, c),
  1017  		WithCapabilities(c),
  1018  		WithSeccomp(daemon, c),
  1019  		WithMounts(daemon, c),
  1020  		WithLibnetwork(daemon, c),
  1021  		WithApparmor(c),
  1022  		WithSelinux(c),
  1023  		WithOOMScore(&c.HostConfig.OomScoreAdj),
  1024  	)
  1025  	if c.NoNewPrivileges {
  1026  		opts = append(opts, coci.WithNoNewPrivileges)
  1027  	}
  1028  
  1029  	// Set the masked and readonly paths with regard to the host config options if they are set.
  1030  	if c.HostConfig.MaskedPaths != nil {
  1031  		opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths))
  1032  	}
  1033  	if c.HostConfig.ReadonlyPaths != nil {
  1034  		opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths))
  1035  	}
  1036  	if daemon.configStore.Rootless {
  1037  		opts = append(opts, WithRootless(daemon))
  1038  	}
  1039  	return &s, coci.ApplyOpts(context.Background(), nil, &containers.Container{
  1040  		ID: c.ID,
  1041  	}, &s, opts...)
  1042  }
  1043  
  1044  func clearReadOnly(m *specs.Mount) {
  1045  	var opt []string
  1046  	for _, o := range m.Options {
  1047  		if o != "ro" {
  1048  			opt = append(opt, o)
  1049  		}
  1050  	}
  1051  	m.Options = opt
  1052  }
  1053  
  1054  // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
  1055  func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
  1056  	ulimits := c.Ulimits
  1057  	// Merge ulimits with daemon defaults
  1058  	ulIdx := make(map[string]struct{})
  1059  	for _, ul := range ulimits {
  1060  		ulIdx[ul.Name] = struct{}{}
  1061  	}
  1062  	for name, ul := range daemon.configStore.Ulimits {
  1063  		if _, exists := ulIdx[name]; !exists {
  1064  			ulimits = append(ulimits, ul)
  1065  		}
  1066  	}
  1067  	c.Ulimits = ulimits
  1068  }