github.com/sams1990/dockerrepo@v17.12.1-ce-rc2+incompatible/daemon/oci_linux.go (about)

     1  package daemon
     2  
     3  import (
     4  	"fmt"
     5  	"io"
     6  	"os"
     7  	"os/exec"
     8  	"path/filepath"
     9  	"regexp"
    10  	"sort"
    11  	"strconv"
    12  	"strings"
    13  
    14  	containertypes "github.com/docker/docker/api/types/container"
    15  	"github.com/docker/docker/container"
    16  	"github.com/docker/docker/daemon/caps"
    17  	daemonconfig "github.com/docker/docker/daemon/config"
    18  	"github.com/docker/docker/oci"
    19  	"github.com/docker/docker/pkg/idtools"
    20  	"github.com/docker/docker/pkg/mount"
    21  	"github.com/docker/docker/volume"
    22  	"github.com/opencontainers/runc/libcontainer/apparmor"
    23  	"github.com/opencontainers/runc/libcontainer/cgroups"
    24  	"github.com/opencontainers/runc/libcontainer/devices"
    25  	"github.com/opencontainers/runc/libcontainer/user"
    26  	specs "github.com/opencontainers/runtime-spec/specs-go"
    27  	"github.com/pkg/errors"
    28  	"github.com/sirupsen/logrus"
    29  	"golang.org/x/sys/unix"
    30  )
    31  
    32  // nolint: gosimple
    33  var (
    34  	deviceCgroupRuleRegex = regexp.MustCompile("^([acb]) ([0-9]+|\\*):([0-9]+|\\*) ([rwm]{1,3})$")
    35  )
    36  
    37  func setResources(s *specs.Spec, r containertypes.Resources) error {
    38  	weightDevices, err := getBlkioWeightDevices(r)
    39  	if err != nil {
    40  		return err
    41  	}
    42  	readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
    43  	if err != nil {
    44  		return err
    45  	}
    46  	writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
    47  	if err != nil {
    48  		return err
    49  	}
    50  	readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
    51  	if err != nil {
    52  		return err
    53  	}
    54  	writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
    55  	if err != nil {
    56  		return err
    57  	}
    58  
    59  	memoryRes := getMemoryResources(r)
    60  	cpuRes, err := getCPUResources(r)
    61  	if err != nil {
    62  		return err
    63  	}
    64  	blkioWeight := r.BlkioWeight
    65  
    66  	specResources := &specs.LinuxResources{
    67  		Memory: memoryRes,
    68  		CPU:    cpuRes,
    69  		BlockIO: &specs.LinuxBlockIO{
    70  			Weight:                  &blkioWeight,
    71  			WeightDevice:            weightDevices,
    72  			ThrottleReadBpsDevice:   readBpsDevice,
    73  			ThrottleWriteBpsDevice:  writeBpsDevice,
    74  			ThrottleReadIOPSDevice:  readIOpsDevice,
    75  			ThrottleWriteIOPSDevice: writeIOpsDevice,
    76  		},
    77  		Pids: &specs.LinuxPids{
    78  			Limit: r.PidsLimit,
    79  		},
    80  	}
    81  
    82  	if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
    83  		specResources.Devices = s.Linux.Resources.Devices
    84  	}
    85  
    86  	s.Linux.Resources = specResources
    87  	return nil
    88  }
    89  
    90  func setDevices(s *specs.Spec, c *container.Container) error {
    91  	// Build lists of devices allowed and created within the container.
    92  	var devs []specs.LinuxDevice
    93  	devPermissions := s.Linux.Resources.Devices
    94  	if c.HostConfig.Privileged {
    95  		hostDevices, err := devices.HostDevices()
    96  		if err != nil {
    97  			return err
    98  		}
    99  		for _, d := range hostDevices {
   100  			devs = append(devs, oci.Device(d))
   101  		}
   102  		devPermissions = []specs.LinuxDeviceCgroup{
   103  			{
   104  				Allow:  true,
   105  				Access: "rwm",
   106  			},
   107  		}
   108  	} else {
   109  		for _, deviceMapping := range c.HostConfig.Devices {
   110  			d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
   111  			if err != nil {
   112  				return err
   113  			}
   114  			devs = append(devs, d...)
   115  			devPermissions = append(devPermissions, dPermissions...)
   116  		}
   117  
   118  		for _, deviceCgroupRule := range c.HostConfig.DeviceCgroupRules {
   119  			ss := deviceCgroupRuleRegex.FindAllStringSubmatch(deviceCgroupRule, -1)
   120  			if len(ss[0]) != 5 {
   121  				return fmt.Errorf("invalid device cgroup rule format: '%s'", deviceCgroupRule)
   122  			}
   123  			matches := ss[0]
   124  
   125  			dPermissions := specs.LinuxDeviceCgroup{
   126  				Allow:  true,
   127  				Type:   matches[1],
   128  				Access: matches[4],
   129  			}
   130  			if matches[2] == "*" {
   131  				major := int64(-1)
   132  				dPermissions.Major = &major
   133  			} else {
   134  				major, err := strconv.ParseInt(matches[2], 10, 64)
   135  				if err != nil {
   136  					return fmt.Errorf("invalid major value in device cgroup rule format: '%s'", deviceCgroupRule)
   137  				}
   138  				dPermissions.Major = &major
   139  			}
   140  			if matches[3] == "*" {
   141  				minor := int64(-1)
   142  				dPermissions.Minor = &minor
   143  			} else {
   144  				minor, err := strconv.ParseInt(matches[3], 10, 64)
   145  				if err != nil {
   146  					return fmt.Errorf("invalid minor value in device cgroup rule format: '%s'", deviceCgroupRule)
   147  				}
   148  				dPermissions.Minor = &minor
   149  			}
   150  			devPermissions = append(devPermissions, dPermissions)
   151  		}
   152  	}
   153  
   154  	s.Linux.Devices = append(s.Linux.Devices, devs...)
   155  	s.Linux.Resources.Devices = devPermissions
   156  	return nil
   157  }
   158  
   159  func (daemon *Daemon) setRlimits(s *specs.Spec, c *container.Container) error {
   160  	var rlimits []specs.POSIXRlimit
   161  
   162  	// We want to leave the original HostConfig alone so make a copy here
   163  	hostConfig := *c.HostConfig
   164  	// Merge with the daemon defaults
   165  	daemon.mergeUlimits(&hostConfig)
   166  	for _, ul := range hostConfig.Ulimits {
   167  		rlimits = append(rlimits, specs.POSIXRlimit{
   168  			Type: "RLIMIT_" + strings.ToUpper(ul.Name),
   169  			Soft: uint64(ul.Soft),
   170  			Hard: uint64(ul.Hard),
   171  		})
   172  	}
   173  
   174  	s.Process.Rlimits = rlimits
   175  	return nil
   176  }
   177  
   178  func setUser(s *specs.Spec, c *container.Container) error {
   179  	uid, gid, additionalGids, err := getUser(c, c.Config.User)
   180  	if err != nil {
   181  		return err
   182  	}
   183  	s.Process.User.UID = uid
   184  	s.Process.User.GID = gid
   185  	s.Process.User.AdditionalGids = additionalGids
   186  	return nil
   187  }
   188  
   189  func readUserFile(c *container.Container, p string) (io.ReadCloser, error) {
   190  	fp, err := c.GetResourcePath(p)
   191  	if err != nil {
   192  		return nil, err
   193  	}
   194  	return os.Open(fp)
   195  }
   196  
// getUser resolves username (a name, numeric uid, or "user:group" form —
// whatever user.GetExecUser accepts) against the container's own
// /etc/passwd and /etc/group, returning uid, gid and the supplementary
// gids (including any groups from HostConfig.GroupAdd).
func getUser(c *container.Container, username string) (uint32, uint32, []uint32, error) {
	passwdPath, err := user.GetPasswdPath()
	if err != nil {
		return 0, 0, nil, err
	}
	groupPath, err := user.GetGroupPath()
	if err != nil {
		return 0, 0, nil, err
	}
	// Open errors are deliberately tolerated: a nil reader makes
	// user.GetExecUser fall back to purely numeric resolution.
	passwdFile, err := readUserFile(c, passwdPath)
	if err == nil {
		defer passwdFile.Close()
	}
	groupFile, err := readUserFile(c, groupPath)
	if err == nil {
		defer groupFile.Close()
	}

	execUser, err := user.GetExecUser(username, nil, passwdFile, groupFile)
	if err != nil {
		return 0, 0, nil, err
	}

	// The first groupFile reader was consumed by GetExecUser, so the file
	// is opened a second time for GetAdditionalGroups.
	// todo: fix this double read by a change to libcontainer/user pkg
	groupFile, err = readUserFile(c, groupPath)
	if err == nil {
		defer groupFile.Close()
	}
	var addGroups []int
	if len(c.HostConfig.GroupAdd) > 0 {
		addGroups, err = user.GetAdditionalGroups(c.HostConfig.GroupAdd, groupFile)
		if err != nil {
			return 0, 0, nil, err
		}
	}
	uid := uint32(execUser.Uid)
	gid := uint32(execUser.Gid)
	sgids := append(execUser.Sgids, addGroups...)
	var additionalGids []uint32
	for _, g := range sgids {
		additionalGids = append(additionalGids, uint32(g))
	}
	return uid, gid, additionalGids, nil
}
   241  
   242  func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
   243  	for i, n := range s.Linux.Namespaces {
   244  		if n.Type == ns.Type {
   245  			s.Linux.Namespaces[i] = ns
   246  			return
   247  		}
   248  	}
   249  	s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
   250  }
   251  
   252  func setCapabilities(s *specs.Spec, c *container.Container) error {
   253  	var caplist []string
   254  	var err error
   255  	if c.HostConfig.Privileged {
   256  		caplist = caps.GetAllCapabilities()
   257  	} else {
   258  		caplist, err = caps.TweakCapabilities(s.Process.Capabilities.Effective, c.HostConfig.CapAdd, c.HostConfig.CapDrop)
   259  		if err != nil {
   260  			return err
   261  		}
   262  	}
   263  	s.Process.Capabilities.Effective = caplist
   264  	s.Process.Capabilities.Bounding = caplist
   265  	s.Process.Capabilities.Permitted = caplist
   266  	s.Process.Capabilities.Inheritable = caplist
   267  	return nil
   268  }
   269  
// setNamespaces populates s.Linux.Namespaces (plus ID mappings and
// hostname) from the container's user/network/ipc/pid/uts mode settings.
// Host modes remove the corresponding namespace from the default spec;
// "container:" modes join the target container's namespace through its
// /proc/<pid>/ns/* path.
func setNamespaces(daemon *Daemon, s *specs.Spec, c *container.Container) error {
	userNS := false
	// user
	if c.HostConfig.UsernsMode.IsPrivate() {
		uidMap := daemon.idMappings.UIDs()
		if uidMap != nil {
			userNS = true
			ns := specs.LinuxNamespace{Type: "user"}
			setNamespace(s, ns)
			s.Linux.UIDMappings = specMapping(uidMap)
			s.Linux.GIDMappings = specMapping(daemon.idMappings.GIDs())
		}
	}
	// network
	if !c.Config.NetworkDisabled {
		ns := specs.LinuxNamespace{Type: "network"}
		parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
		if parts[0] == "container" {
			nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
			if err != nil {
				return err
			}
			ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
			if userNS {
				// to share a net namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{Type: "user"}
				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
				setNamespace(s, nsUser)
			}
		} else if c.HostConfig.NetworkMode.IsHost() {
			ns.Path = c.NetworkSettings.SandboxKey
		}
		setNamespace(s, ns)
	}

	// ipc
	ipcMode := c.HostConfig.IpcMode
	switch {
	case ipcMode.IsContainer():
		ns := specs.LinuxNamespace{Type: "ipc"}
		ic, err := daemon.getIpcContainer(ipcMode.Container())
		if err != nil {
			return err
		}
		ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
		setNamespace(s, ns)
		if userNS {
			// to share an IPC namespace, they must also share a user namespace
			nsUser := specs.LinuxNamespace{Type: "user"}
			nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
			setNamespace(s, nsUser)
		}
	case ipcMode.IsHost():
		oci.RemoveNamespace(s, specs.LinuxNamespaceType("ipc"))
	case ipcMode.IsEmpty():
		// A container was created by an older version of the daemon.
		// The default behavior used to be what is now called "shareable".
		fallthrough
	case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
		ns := specs.LinuxNamespace{Type: "ipc"}
		setNamespace(s, ns)
	default:
		return fmt.Errorf("Invalid IPC mode: %v", ipcMode)
	}

	// pid
	if c.HostConfig.PidMode.IsContainer() {
		ns := specs.LinuxNamespace{Type: "pid"}
		pc, err := daemon.getPidContainer(c)
		if err != nil {
			return err
		}
		ns.Path = fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID())
		setNamespace(s, ns)
		if userNS {
			// to share a PID namespace, they must also share a user namespace
			nsUser := specs.LinuxNamespace{Type: "user"}
			nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID())
			setNamespace(s, nsUser)
		}
	} else if c.HostConfig.PidMode.IsHost() {
		oci.RemoveNamespace(s, specs.LinuxNamespaceType("pid"))
	} else {
		ns := specs.LinuxNamespace{Type: "pid"}
		setNamespace(s, ns)
	}
	// uts
	if c.HostConfig.UTSMode.IsHost() {
		oci.RemoveNamespace(s, specs.LinuxNamespaceType("uts"))
		// Sharing the host UTS namespace means the container cannot set
		// its own hostname.
		s.Hostname = ""
	}

	return nil
}
   364  
   365  func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
   366  	var ids []specs.LinuxIDMapping
   367  	for _, item := range s {
   368  		ids = append(ids, specs.LinuxIDMapping{
   369  			HostID:      uint32(item.HostID),
   370  			ContainerID: uint32(item.ContainerID),
   371  			Size:        uint32(item.Size),
   372  		})
   373  	}
   374  	return ids
   375  }
   376  
   377  func getMountInfo(mountinfo []*mount.Info, dir string) *mount.Info {
   378  	for _, m := range mountinfo {
   379  		if m.Mountpoint == dir {
   380  			return m
   381  		}
   382  	}
   383  	return nil
   384  }
   385  
   386  // Get the source mount point of directory passed in as argument. Also return
   387  // optional fields.
   388  func getSourceMount(source string) (string, string, error) {
   389  	// Ensure any symlinks are resolved.
   390  	sourcePath, err := filepath.EvalSymlinks(source)
   391  	if err != nil {
   392  		return "", "", err
   393  	}
   394  
   395  	mountinfos, err := mount.GetMounts()
   396  	if err != nil {
   397  		return "", "", err
   398  	}
   399  
   400  	mountinfo := getMountInfo(mountinfos, sourcePath)
   401  	if mountinfo != nil {
   402  		return sourcePath, mountinfo.Optional, nil
   403  	}
   404  
   405  	path := sourcePath
   406  	for {
   407  		path = filepath.Dir(path)
   408  
   409  		mountinfo = getMountInfo(mountinfos, path)
   410  		if mountinfo != nil {
   411  			return path, mountinfo.Optional, nil
   412  		}
   413  
   414  		if path == "/" {
   415  			break
   416  		}
   417  	}
   418  
   419  	// If we are here, we did not find parent mount. Something is wrong.
   420  	return "", "", fmt.Errorf("Could not find source mount of %s", source)
   421  }
   422  
// Prefixes of the mountinfo "optional fields" (see proc(5)) that identify
// a mount's propagation mode: "shared:N" marks a shared mount, "master:N"
// marks a slave mount.
const (
	sharedPropagationOption = "shared:"
	slavePropagationOption  = "master:"
)
   427  
   428  // hasMountinfoOption checks if any of the passed any of the given option values
   429  // are set in the passed in option string.
   430  func hasMountinfoOption(opts string, vals ...string) bool {
   431  	for _, opt := range strings.Split(opts, " ") {
   432  		for _, val := range vals {
   433  			if strings.HasPrefix(opt, val) {
   434  				return true
   435  			}
   436  		}
   437  	}
   438  	return false
   439  }
   440  
   441  // Ensure mount point on which path is mounted, is shared.
   442  func ensureShared(path string) error {
   443  	sourceMount, optionalOpts, err := getSourceMount(path)
   444  	if err != nil {
   445  		return err
   446  	}
   447  	// Make sure source mount point is shared.
   448  	if !hasMountinfoOption(optionalOpts, sharedPropagationOption) {
   449  		return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
   450  	}
   451  	return nil
   452  }
   453  
   454  // Ensure mount point on which path is mounted, is either shared or slave.
   455  func ensureSharedOrSlave(path string) error {
   456  	sourceMount, optionalOpts, err := getSourceMount(path)
   457  	if err != nil {
   458  		return err
   459  	}
   460  
   461  	if !hasMountinfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
   462  		return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
   463  	}
   464  	return nil
   465  }
   466  
   467  // Get the set of mount flags that are set on the mount that contains the given
   468  // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
   469  // bind-mounting "with options" will not fail with user namespaces, due to
   470  // kernel restrictions that require user namespace mounts to preserve
   471  // CL_UNPRIVILEGED locked flags.
   472  func getUnprivilegedMountFlags(path string) ([]string, error) {
   473  	var statfs unix.Statfs_t
   474  	if err := unix.Statfs(path, &statfs); err != nil {
   475  		return nil, err
   476  	}
   477  
   478  	// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
   479  	unprivilegedFlags := map[uint64]string{
   480  		unix.MS_RDONLY:     "ro",
   481  		unix.MS_NODEV:      "nodev",
   482  		unix.MS_NOEXEC:     "noexec",
   483  		unix.MS_NOSUID:     "nosuid",
   484  		unix.MS_NOATIME:    "noatime",
   485  		unix.MS_RELATIME:   "relatime",
   486  		unix.MS_NODIRATIME: "nodiratime",
   487  	}
   488  
   489  	var flags []string
   490  	for mask, flag := range unprivilegedFlags {
   491  		if uint64(statfs.Flags)&mask == mask {
   492  			flags = append(flags, flag)
   493  		}
   494  	}
   495  
   496  	return flags, nil
   497  }
   498  
var (
	// mountPropagationMap translates user-facing propagation mode names
	// into the mount package's flag constants. The zero value (name not
	// present) means "no explicit propagation requested".
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}

	// mountPropagationReverseMap is the inverse of mountPropagationMap,
	// used when writing propagation names back into the OCI spec.
	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)
   518  
   519  // inSlice tests whether a string is contained in a slice of strings or not.
   520  // Comparison is case sensitive
   521  func inSlice(slice []string, s string) bool {
   522  	for _, ss := range slice {
   523  		if s == ss {
   524  			return true
   525  		}
   526  	}
   527  	return false
   528  }
   529  
// setMounts merges the user-supplied mounts into the spec's default mount
// list, handling /dev and /dev/shm overrides, tmpfs mounts, bind-mount
// propagation (possibly adjusting the rootfs propagation to match),
// read-only rootfs option propagation, and the privileged/userns special
// cases for /sys and cgroup mounts.
func setMounts(daemon *Daemon, s *specs.Spec, c *container.Container, mounts []container.Mount) error {
	userMounts := make(map[string]struct{})
	for _, m := range mounts {
		userMounts[m.Destination] = struct{}{}
	}

	// Copy all mounts from spec to defaultMounts, except for
	//  - mounts overridden by a user supplied mount;
	//  - all mounts under /dev if a user supplied /dev is present;
	//  - /dev/shm, in case IpcMode is none.
	// While at it, also
	//  - set size for /dev/shm from shmsize.
	var defaultMounts []specs.Mount
	_, mountDev := userMounts["/dev"]
	for _, m := range s.Mounts {
		if _, ok := userMounts[m.Destination]; ok {
			// filter out mount overridden by a user supplied mount
			continue
		}
		if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
			// filter out everything under /dev if /dev is user-mounted
			continue
		}

		if m.Destination == "/dev/shm" {
			if c.HostConfig.IpcMode.IsNone() {
				// filter out /dev/shm for "none" IpcMode
				continue
			}
			// set size for /dev/shm mount from spec
			sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
			m.Options = append(m.Options, sizeOpt)
		}

		defaultMounts = append(defaultMounts, m)
	}

	s.Mounts = defaultMounts
	for _, m := range mounts {
		// Two user mounts at the same destination is a configuration error.
		for _, cm := range s.Mounts {
			if cm.Destination == m.Destination {
				return duplicateMountPointError(m.Destination)
			}
		}

		if m.Source == "tmpfs" {
			data := m.Data
			parser := volume.NewParser("linux")
			options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
			if data != "" {
				options = append(options, strings.Split(data, ",")...)
			}

			merged, err := mount.MergeTmpfsOptions(options)
			if err != nil {
				return err
			}

			s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
			continue
		}

		mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}

		// Determine property of RootPropagation based on volume
		// properties. If a volume is shared, then keep root propagation
		// shared. This should work for slave and private volumes too.
		//
		// For slave volumes, it can be either [r]shared/[r]slave.
		//
		// For private volumes any root propagation value should work.
		pFlag := mountPropagationMap[m.Propagation]
		switch pFlag {
		case mount.SHARED, mount.RSHARED:
			if err := ensureShared(m.Source); err != nil {
				return err
			}
			rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
			if rootpg != mount.SHARED && rootpg != mount.RSHARED {
				s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
			}
		case mount.SLAVE, mount.RSLAVE:
			var fallback bool
			if err := ensureSharedOrSlave(m.Source); err != nil {
				// For backwards compatability purposes, treat mounts from the daemon root
				// as special since we automatically add rslave propagation to these mounts
				// when the user did not set anything, so we should fallback to the old
				// behavior which is to use private propagation which is normally the
				// default.
				if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
					return err
				}

				cm, ok := c.MountPoints[m.Destination]
				if !ok {
					return err
				}
				if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
					// This means the user explicitly set a propagation, do not fallback in that case.
					return err
				}
				fallback = true
				logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
			}
			if !fallback {
				rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
				if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
					s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
				}
			}
		}

		opts := []string{"rbind"}
		if !m.Writable {
			opts = append(opts, "ro")
		}
		if pFlag != 0 {
			opts = append(opts, mountPropagationReverseMap[pFlag])
		}

		// If we are using user namespaces, then we must make sure that we
		// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
		// "mount" when we bind-mount. The reason for this is that at the point
		// when runc sets up the root filesystem, it is already inside a user
		// namespace, and thus cannot change any flags that are locked.
		if daemon.configStore.RemappedRoot != "" {
			unprivOpts, err := getUnprivilegedMountFlags(m.Source)
			if err != nil {
				return err
			}
			opts = append(opts, unprivOpts...)
		}

		mt.Options = opts
		s.Mounts = append(s.Mounts, mt)
	}

	if s.Root.Readonly {
		// Propagate the read-only rootfs to all non-user mounts except the
		// pseudo-filesystems that must stay writable.
		for i, m := range s.Mounts {
			switch m.Destination {
			case "/proc", "/dev/pts", "/dev/mqueue", "/dev":
				continue
			}
			if _, ok := userMounts[m.Destination]; !ok {
				if !inSlice(m.Options, "ro") {
					s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
				}
			}
		}
	}

	if c.HostConfig.Privileged {
		if !s.Root.Readonly {
			// clear readonly for /sys
			for i := range s.Mounts {
				if s.Mounts[i].Destination == "/sys" {
					clearReadOnly(&s.Mounts[i])
				}
			}
		}
		// Privileged containers see the full, unmasked host view.
		s.Linux.ReadonlyPaths = nil
		s.Linux.MaskedPaths = nil
	}

	// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
	// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
	if uidMap := daemon.idMappings.UIDs(); uidMap != nil || c.HostConfig.Privileged {
		for i, m := range s.Mounts {
			if m.Type == "cgroup" {
				clearReadOnly(&s.Mounts[i])
			}
		}
	}

	return nil
}
   706  
// populateCommonSpec fills in the platform-independent parts of the spec:
// rootfs path and read-only flag, working directory, process args
// (optionally prefixed with the docker-init shim), environment, terminal
// mode and hostname.
func (daemon *Daemon) populateCommonSpec(s *specs.Spec, c *container.Container) error {
	linkedEnv, err := daemon.setupLinkedContainers(c)
	if err != nil {
		return err
	}
	s.Root = &specs.Root{
		Path:     c.BaseFS.Path(),
		Readonly: c.HostConfig.ReadonlyRootfs,
	}
	if err := c.SetupWorkingDirectory(daemon.idMappings.RootPair()); err != nil {
		return err
	}
	cwd := c.Config.WorkingDir
	if len(cwd) == 0 {
		cwd = "/"
	}
	s.Process.Args = append([]string{c.Path}, c.Args...)

	// only add the custom init if it is specified and the container is running in its
	// own private pid namespace.  It does not make sense to add if it is running in the
	// host namespace or another container's pid namespace where we already have an init
	if c.HostConfig.PidMode.IsPrivate() {
		if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
			(c.HostConfig.Init == nil && daemon.configStore.Init) {
			s.Process.Args = append([]string{"/dev/init", "--", c.Path}, c.Args...)
			// An explicit daemon InitPath wins; otherwise resolve the
			// default init binary name on the daemon host's PATH.
			var path string
			if daemon.configStore.InitPath == "" {
				path, err = exec.LookPath(daemonconfig.DefaultInitBinary)
				if err != nil {
					return err
				}
			}
			if daemon.configStore.InitPath != "" {
				path = daemon.configStore.InitPath
			}
			// The init binary is bind-mounted read-only at /dev/init.
			s.Mounts = append(s.Mounts, specs.Mount{
				Destination: "/dev/init",
				Type:        "bind",
				Source:      path,
				Options:     []string{"bind", "ro"},
			})
		}
	}
	s.Process.Cwd = cwd
	s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
	s.Process.Terminal = c.Config.Tty
	s.Hostname = c.FullHostname()

	return nil
}
   757  
// createSpec assembles the full OCI runtime spec for container c, starting
// from the platform default spec and layering on cgroups, resources,
// devices, rlimits, user, namespaces, capabilities, seccomp, mounts,
// libnetwork hooks and the AppArmor/SELinux process labels. The order of
// these steps matters: e.g. resources must be set before devices, and
// mounts are assembled only after all IPC/secret/config dirs exist.
func (daemon *Daemon) createSpec(c *container.Container) (*specs.Spec, error) {
	s := oci.DefaultSpec()
	if err := daemon.populateCommonSpec(&s, c); err != nil {
		return nil, err
	}

	// Resolve the cgroup parent: container setting wins over the daemon
	// setting, which wins over the "/docker" (or systemd "system.slice")
	// default.
	var cgroupsPath string
	scopePrefix := "docker"
	parent := "/docker"
	useSystemd := UsingSystemd(daemon.configStore)
	if useSystemd {
		parent = "system.slice"
	}

	if c.HostConfig.CgroupParent != "" {
		parent = c.HostConfig.CgroupParent
	} else if daemon.configStore.CgroupParent != "" {
		parent = daemon.configStore.CgroupParent
	}

	// systemd cgroups use the "slice:prefix:name" form; cgroupfs uses a path.
	if useSystemd {
		cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
		logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
	} else {
		cgroupsPath = filepath.Join(parent, c.ID)
	}
	s.Linux.CgroupsPath = cgroupsPath

	if err := setResources(&s, c.HostConfig.Resources); err != nil {
		return nil, fmt.Errorf("linux runtime spec resources: %v", err)
	}
	s.Linux.Sysctl = c.HostConfig.Sysctls

	// Under systemd the spec path is relative to the init cgroup; resolve
	// it to an absolute cgroupfs path before initializing the hierarchy.
	p := s.Linux.CgroupsPath
	if useSystemd {
		initPath, err := cgroups.GetInitCgroup("cpu")
		if err != nil {
			return nil, err
		}
		// Probe our own cgroup to surface errors early.
		_, err = cgroups.GetOwnCgroup("cpu")
		if err != nil {
			return nil, err
		}
		p = filepath.Join(initPath, s.Linux.CgroupsPath)
	}

	// Clean path to guard against things like ../../../BAD
	parentPath := filepath.Dir(p)
	if !filepath.IsAbs(parentPath) {
		parentPath = filepath.Clean("/" + parentPath)
	}

	if err := daemon.initCgroupsPath(parentPath); err != nil {
		return nil, fmt.Errorf("linux init cgroups path: %v", err)
	}
	if err := setDevices(&s, c); err != nil {
		return nil, fmt.Errorf("linux runtime spec devices: %v", err)
	}
	if err := daemon.setRlimits(&s, c); err != nil {
		return nil, fmt.Errorf("linux runtime spec rlimits: %v", err)
	}
	if err := setUser(&s, c); err != nil {
		return nil, fmt.Errorf("linux spec user: %v", err)
	}
	if err := setNamespaces(daemon, &s, c); err != nil {
		return nil, fmt.Errorf("linux spec namespaces: %v", err)
	}
	if err := setCapabilities(&s, c); err != nil {
		return nil, fmt.Errorf("linux spec capabilities: %v", err)
	}
	if err := setSeccomp(daemon, &s, c); err != nil {
		return nil, fmt.Errorf("linux seccomp: %v", err)
	}

	if err := daemon.setupIpcDirs(c); err != nil {
		return nil, err
	}

	if err := daemon.setupSecretDir(c); err != nil {
		return nil, err
	}

	if err := daemon.setupConfigDir(c); err != nil {
		return nil, err
	}

	ms, err := daemon.setupMounts(c)
	if err != nil {
		return nil, err
	}

	// Private/empty IPC modes manage /dev/shm themselves; every other mode
	// gets the container's IPC mounts added here.
	if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
		ms = append(ms, c.IpcMounts()...)
	}

	tmpfsMounts, err := c.TmpfsMounts()
	if err != nil {
		return nil, err
	}
	ms = append(ms, tmpfsMounts...)

	if m := c.SecretMounts(); m != nil {
		ms = append(ms, m...)
	}

	ms = append(ms, c.ConfigMounts()...)

	sort.Sort(mounts(ms))
	if err := setMounts(daemon, &s, c, ms); err != nil {
		return nil, fmt.Errorf("linux mounts: %v", err)
	}

	// If the container gets its own network namespace, register a prestart
	// hook that re-executes the daemon binary to set up libnetwork keys.
	for _, ns := range s.Linux.Namespaces {
		if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
			target, err := os.Readlink(filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe"))
			if err != nil {
				return nil, err
			}

			s.Hooks = &specs.Hooks{
				Prestart: []specs.Hook{{
					Path: target, // FIXME: cross-platform
					Args: []string{"libnetwork-setkey", c.ID, daemon.netController.ID()},
				}},
			}
		}
	}

	if apparmor.IsEnabled() {
		// Profile precedence: explicit container profile, then "unconfined"
		// for privileged containers, then the docker default.
		var appArmorProfile string
		if c.AppArmorProfile != "" {
			appArmorProfile = c.AppArmorProfile
		} else if c.HostConfig.Privileged {
			appArmorProfile = "unconfined"
		} else {
			appArmorProfile = "docker-default"
		}

		if appArmorProfile == "docker-default" {
			// Unattended upgrades and other fun services can unload AppArmor
			// profiles inadvertently. Since we cannot store our profile in
			// /etc/apparmor.d, nor can we practically add other ways of
			// telling the system to keep our profile loaded, in order to make
			// sure that we keep the default profile enabled we dynamically
			// reload it if necessary.
			if err := ensureDefaultAppArmorProfile(); err != nil {
				return nil, err
			}
		}

		s.Process.ApparmorProfile = appArmorProfile
	}
	s.Process.SelinuxLabel = c.GetProcessLabel()
	s.Process.NoNewPrivileges = c.NoNewPrivileges
	s.Process.OOMScoreAdj = &c.HostConfig.OomScoreAdj
	s.Linux.MountLabel = c.MountLabel

	return &s, nil
}
   917  
   918  func clearReadOnly(m *specs.Mount) {
   919  	var opt []string
   920  	for _, o := range m.Options {
   921  		if o != "ro" {
   922  			opt = append(opt, o)
   923  		}
   924  	}
   925  	m.Options = opt
   926  }
   927  
   928  // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
   929  func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
   930  	ulimits := c.Ulimits
   931  	// Merge ulimits with daemon defaults
   932  	ulIdx := make(map[string]struct{})
   933  	for _, ul := range ulimits {
   934  		ulIdx[ul.Name] = struct{}{}
   935  	}
   936  	for name, ul := range daemon.configStore.Ulimits {
   937  		if _, exists := ulIdx[name]; !exists {
   938  			ulimits = append(ulimits, ul)
   939  		}
   940  	}
   941  	c.Ulimits = ulimits
   942  }