github.com/rhatdan/docker@v0.7.7-0.20180119204836-47a0dcbcd20a/daemon/oci_linux.go (about)

     1  package daemon
     2  
     3  import (
     4  	"fmt"
     5  	"io"
     6  	"os"
     7  	"os/exec"
     8  	"path/filepath"
     9  	"regexp"
    10  	"sort"
    11  	"strconv"
    12  	"strings"
    13  
    14  	containertypes "github.com/docker/docker/api/types/container"
    15  	"github.com/docker/docker/container"
    16  	"github.com/docker/docker/daemon/caps"
    17  	daemonconfig "github.com/docker/docker/daemon/config"
    18  	"github.com/docker/docker/oci"
    19  	"github.com/docker/docker/pkg/idtools"
    20  	"github.com/docker/docker/pkg/mount"
    21  	"github.com/docker/docker/volume"
    22  	"github.com/opencontainers/runc/libcontainer/apparmor"
    23  	"github.com/opencontainers/runc/libcontainer/cgroups"
    24  	"github.com/opencontainers/runc/libcontainer/devices"
    25  	"github.com/opencontainers/runc/libcontainer/user"
    26  	specs "github.com/opencontainers/runtime-spec/specs-go"
    27  	"github.com/sirupsen/logrus"
    28  	"golang.org/x/sys/unix"
    29  )
    30  
    31  // nolint: gosimple
    32  var (
    33  	deviceCgroupRuleRegex = regexp.MustCompile("^([acb]) ([0-9]+|\\*):([0-9]+|\\*) ([rwm]{1,3})$")
    34  )
    35  
    36  func setResources(s *specs.Spec, r containertypes.Resources) error {
    37  	weightDevices, err := getBlkioWeightDevices(r)
    38  	if err != nil {
    39  		return err
    40  	}
    41  	readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
    42  	if err != nil {
    43  		return err
    44  	}
    45  	writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
    46  	if err != nil {
    47  		return err
    48  	}
    49  	readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
    50  	if err != nil {
    51  		return err
    52  	}
    53  	writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
    54  	if err != nil {
    55  		return err
    56  	}
    57  
    58  	memoryRes := getMemoryResources(r)
    59  	cpuRes, err := getCPUResources(r)
    60  	if err != nil {
    61  		return err
    62  	}
    63  	blkioWeight := r.BlkioWeight
    64  
    65  	specResources := &specs.LinuxResources{
    66  		Memory: memoryRes,
    67  		CPU:    cpuRes,
    68  		BlockIO: &specs.LinuxBlockIO{
    69  			Weight:                  &blkioWeight,
    70  			WeightDevice:            weightDevices,
    71  			ThrottleReadBpsDevice:   readBpsDevice,
    72  			ThrottleWriteBpsDevice:  writeBpsDevice,
    73  			ThrottleReadIOPSDevice:  readIOpsDevice,
    74  			ThrottleWriteIOPSDevice: writeIOpsDevice,
    75  		},
    76  		Pids: &specs.LinuxPids{
    77  			Limit: r.PidsLimit,
    78  		},
    79  	}
    80  
    81  	if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
    82  		specResources.Devices = s.Linux.Resources.Devices
    83  	}
    84  
    85  	s.Linux.Resources = specResources
    86  	return nil
    87  }
    88  
    89  func setDevices(s *specs.Spec, c *container.Container) error {
    90  	// Build lists of devices allowed and created within the container.
    91  	var devs []specs.LinuxDevice
    92  	devPermissions := s.Linux.Resources.Devices
    93  	if c.HostConfig.Privileged {
    94  		hostDevices, err := devices.HostDevices()
    95  		if err != nil {
    96  			return err
    97  		}
    98  		for _, d := range hostDevices {
    99  			devs = append(devs, oci.Device(d))
   100  		}
   101  		devPermissions = []specs.LinuxDeviceCgroup{
   102  			{
   103  				Allow:  true,
   104  				Access: "rwm",
   105  			},
   106  		}
   107  	} else {
   108  		for _, deviceMapping := range c.HostConfig.Devices {
   109  			d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
   110  			if err != nil {
   111  				return err
   112  			}
   113  			devs = append(devs, d...)
   114  			devPermissions = append(devPermissions, dPermissions...)
   115  		}
   116  
   117  		for _, deviceCgroupRule := range c.HostConfig.DeviceCgroupRules {
   118  			ss := deviceCgroupRuleRegex.FindAllStringSubmatch(deviceCgroupRule, -1)
   119  			if len(ss[0]) != 5 {
   120  				return fmt.Errorf("invalid device cgroup rule format: '%s'", deviceCgroupRule)
   121  			}
   122  			matches := ss[0]
   123  
   124  			dPermissions := specs.LinuxDeviceCgroup{
   125  				Allow:  true,
   126  				Type:   matches[1],
   127  				Access: matches[4],
   128  			}
   129  			if matches[2] == "*" {
   130  				major := int64(-1)
   131  				dPermissions.Major = &major
   132  			} else {
   133  				major, err := strconv.ParseInt(matches[2], 10, 64)
   134  				if err != nil {
   135  					return fmt.Errorf("invalid major value in device cgroup rule format: '%s'", deviceCgroupRule)
   136  				}
   137  				dPermissions.Major = &major
   138  			}
   139  			if matches[3] == "*" {
   140  				minor := int64(-1)
   141  				dPermissions.Minor = &minor
   142  			} else {
   143  				minor, err := strconv.ParseInt(matches[3], 10, 64)
   144  				if err != nil {
   145  					return fmt.Errorf("invalid minor value in device cgroup rule format: '%s'", deviceCgroupRule)
   146  				}
   147  				dPermissions.Minor = &minor
   148  			}
   149  			devPermissions = append(devPermissions, dPermissions)
   150  		}
   151  	}
   152  
   153  	s.Linux.Devices = append(s.Linux.Devices, devs...)
   154  	s.Linux.Resources.Devices = devPermissions
   155  	return nil
   156  }
   157  
   158  func (daemon *Daemon) setRlimits(s *specs.Spec, c *container.Container) error {
   159  	var rlimits []specs.POSIXRlimit
   160  
   161  	// We want to leave the original HostConfig alone so make a copy here
   162  	hostConfig := *c.HostConfig
   163  	// Merge with the daemon defaults
   164  	daemon.mergeUlimits(&hostConfig)
   165  	for _, ul := range hostConfig.Ulimits {
   166  		rlimits = append(rlimits, specs.POSIXRlimit{
   167  			Type: "RLIMIT_" + strings.ToUpper(ul.Name),
   168  			Soft: uint64(ul.Soft),
   169  			Hard: uint64(ul.Hard),
   170  		})
   171  	}
   172  
   173  	s.Process.Rlimits = rlimits
   174  	return nil
   175  }
   176  
   177  func setUser(s *specs.Spec, c *container.Container) error {
   178  	uid, gid, additionalGids, err := getUser(c, c.Config.User)
   179  	if err != nil {
   180  		return err
   181  	}
   182  	s.Process.User.UID = uid
   183  	s.Process.User.GID = gid
   184  	s.Process.User.AdditionalGids = additionalGids
   185  	return nil
   186  }
   187  
   188  func readUserFile(c *container.Container, p string) (io.ReadCloser, error) {
   189  	fp, err := c.GetResourcePath(p)
   190  	if err != nil {
   191  		return nil, err
   192  	}
   193  	return os.Open(fp)
   194  }
   195  
// getUser resolves the container's configured user string (name, uid,
// "uid:gid", etc.) to a numeric UID, GID and list of supplementary GIDs,
// using the passwd and group files from the container's own rootfs.
func getUser(c *container.Container, username string) (uint32, uint32, []uint32, error) {
	passwdPath, err := user.GetPasswdPath()
	if err != nil {
		return 0, 0, nil, err
	}
	groupPath, err := user.GetGroupPath()
	if err != nil {
		return 0, 0, nil, err
	}
	// Best effort: failure to open passwd/group is deliberately ignored;
	// user.GetExecUser copes with unusable readers by falling back to
	// defaults, so the open errors are not propagated.
	passwdFile, err := readUserFile(c, passwdPath)
	if err == nil {
		defer passwdFile.Close()
	}
	groupFile, err := readUserFile(c, groupPath)
	if err == nil {
		defer groupFile.Close()
	}

	execUser, err := user.GetExecUser(username, nil, passwdFile, groupFile)
	if err != nil {
		return 0, 0, nil, err
	}

	// GetExecUser has consumed groupFile above, so it is reopened here for
	// the additional-groups lookup; both Closes run via defer at return.
	// todo: fix this double read by a change to libcontainer/user pkg
	groupFile, err = readUserFile(c, groupPath)
	if err == nil {
		defer groupFile.Close()
	}
	var addGroups []int
	if len(c.HostConfig.GroupAdd) > 0 {
		addGroups, err = user.GetAdditionalGroups(c.HostConfig.GroupAdd, groupFile)
		if err != nil {
			return 0, 0, nil, err
		}
	}
	uid := uint32(execUser.Uid)
	gid := uint32(execUser.Gid)
	// Merge the groups from the user's passwd/group entry with --group-add.
	sgids := append(execUser.Sgids, addGroups...)
	var additionalGids []uint32
	for _, g := range sgids {
		additionalGids = append(additionalGids, uint32(g))
	}
	return uid, gid, additionalGids, nil
}
   240  
   241  func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
   242  	for i, n := range s.Linux.Namespaces {
   243  		if n.Type == ns.Type {
   244  			s.Linux.Namespaces[i] = ns
   245  			return
   246  		}
   247  	}
   248  	s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
   249  }
   250  
   251  func setCapabilities(s *specs.Spec, c *container.Container) error {
   252  	var caplist []string
   253  	var err error
   254  	if c.HostConfig.Privileged {
   255  		caplist = caps.GetAllCapabilities()
   256  	} else {
   257  		caplist, err = caps.TweakCapabilities(s.Process.Capabilities.Effective, c.HostConfig.CapAdd, c.HostConfig.CapDrop)
   258  		if err != nil {
   259  			return err
   260  		}
   261  	}
   262  	s.Process.Capabilities.Effective = caplist
   263  	s.Process.Capabilities.Bounding = caplist
   264  	s.Process.Capabilities.Permitted = caplist
   265  	s.Process.Capabilities.Inheritable = caplist
   266  	return nil
   267  }
   268  
// setNamespaces configures the spec's Linux namespaces (user, network,
// ipc, pid, uts) according to the container's HostConfig modes: private
// namespaces are added, host modes remove the namespace from the spec,
// and container modes point the namespace path at the other container's
// /proc/<pid>/ns entry.
func setNamespaces(daemon *Daemon, s *specs.Spec, c *container.Container) error {
	userNS := false
	// user
	if c.HostConfig.UsernsMode.IsPrivate() {
		uidMap := daemon.idMappings.UIDs()
		if uidMap != nil {
			userNS = true
			ns := specs.LinuxNamespace{Type: "user"}
			setNamespace(s, ns)
			s.Linux.UIDMappings = specMapping(uidMap)
			s.Linux.GIDMappings = specMapping(daemon.idMappings.GIDs())
		}
	}
	// network
	if !c.Config.NetworkDisabled {
		ns := specs.LinuxNamespace{Type: "network"}
		// NetworkMode can be "container:<name|id>"; join that container's
		// network namespace in that case.
		parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
		if parts[0] == "container" {
			nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
			if err != nil {
				return err
			}
			ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
			if userNS {
				// to share a net namespace, they must also share a user namespace
				nsUser := specs.LinuxNamespace{Type: "user"}
				nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
				setNamespace(s, nsUser)
			}
		} else if c.HostConfig.NetworkMode.IsHost() {
			ns.Path = c.NetworkSettings.SandboxKey
		}
		setNamespace(s, ns)
	}

	// ipc
	ipcMode := c.HostConfig.IpcMode
	switch {
	case ipcMode.IsContainer():
		ns := specs.LinuxNamespace{Type: "ipc"}
		ic, err := daemon.getIpcContainer(ipcMode.Container())
		if err != nil {
			return err
		}
		ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
		setNamespace(s, ns)
		if userNS {
			// to share an IPC namespace, they must also share a user namespace
			nsUser := specs.LinuxNamespace{Type: "user"}
			nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
			setNamespace(s, nsUser)
		}
	case ipcMode.IsHost():
		oci.RemoveNamespace(s, specs.LinuxNamespaceType("ipc"))
	case ipcMode.IsEmpty():
		// A container was created by an older version of the daemon.
		// The default behavior used to be what is now called "shareable".
		fallthrough
	case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
		ns := specs.LinuxNamespace{Type: "ipc"}
		setNamespace(s, ns)
	default:
		return fmt.Errorf("Invalid IPC mode: %v", ipcMode)
	}

	// pid
	if c.HostConfig.PidMode.IsContainer() {
		ns := specs.LinuxNamespace{Type: "pid"}
		pc, err := daemon.getPidContainer(c)
		if err != nil {
			return err
		}
		ns.Path = fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID())
		setNamespace(s, ns)
		if userNS {
			// to share a PID namespace, they must also share a user namespace
			nsUser := specs.LinuxNamespace{Type: "user"}
			nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID())
			setNamespace(s, nsUser)
		}
	} else if c.HostConfig.PidMode.IsHost() {
		oci.RemoveNamespace(s, specs.LinuxNamespaceType("pid"))
	} else {
		ns := specs.LinuxNamespace{Type: "pid"}
		setNamespace(s, ns)
	}
	// uts
	if c.HostConfig.UTSMode.IsHost() {
		// Sharing the host UTS namespace means the hostname cannot be set.
		oci.RemoveNamespace(s, specs.LinuxNamespaceType("uts"))
		s.Hostname = ""
	}

	return nil
}
   363  
   364  func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
   365  	var ids []specs.LinuxIDMapping
   366  	for _, item := range s {
   367  		ids = append(ids, specs.LinuxIDMapping{
   368  			HostID:      uint32(item.HostID),
   369  			ContainerID: uint32(item.ContainerID),
   370  			Size:        uint32(item.Size),
   371  		})
   372  	}
   373  	return ids
   374  }
   375  
   376  func getMountInfo(mountinfo []*mount.Info, dir string) *mount.Info {
   377  	for _, m := range mountinfo {
   378  		if m.Mountpoint == dir {
   379  			return m
   380  		}
   381  	}
   382  	return nil
   383  }
   384  
   385  // Get the source mount point of directory passed in as argument. Also return
   386  // optional fields.
   387  func getSourceMount(source string) (string, string, error) {
   388  	// Ensure any symlinks are resolved.
   389  	sourcePath, err := filepath.EvalSymlinks(source)
   390  	if err != nil {
   391  		return "", "", err
   392  	}
   393  
   394  	mountinfos, err := mount.GetMounts()
   395  	if err != nil {
   396  		return "", "", err
   397  	}
   398  
   399  	mountinfo := getMountInfo(mountinfos, sourcePath)
   400  	if mountinfo != nil {
   401  		return sourcePath, mountinfo.Optional, nil
   402  	}
   403  
   404  	path := sourcePath
   405  	for {
   406  		path = filepath.Dir(path)
   407  
   408  		mountinfo = getMountInfo(mountinfos, path)
   409  		if mountinfo != nil {
   410  			return path, mountinfo.Optional, nil
   411  		}
   412  
   413  		if path == "/" {
   414  			break
   415  		}
   416  	}
   417  
   418  	// If we are here, we did not find parent mount. Something is wrong.
   419  	return "", "", fmt.Errorf("Could not find source mount of %s", source)
   420  }
   421  
   422  // Ensure mount point on which path is mounted, is shared.
   423  func ensureShared(path string) error {
   424  	sharedMount := false
   425  
   426  	sourceMount, optionalOpts, err := getSourceMount(path)
   427  	if err != nil {
   428  		return err
   429  	}
   430  	// Make sure source mount point is shared.
   431  	optsSplit := strings.Split(optionalOpts, " ")
   432  	for _, opt := range optsSplit {
   433  		if strings.HasPrefix(opt, "shared:") {
   434  			sharedMount = true
   435  			break
   436  		}
   437  	}
   438  
   439  	if !sharedMount {
   440  		return fmt.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
   441  	}
   442  	return nil
   443  }
   444  
   445  // Ensure mount point on which path is mounted, is either shared or slave.
   446  func ensureSharedOrSlave(path string) error {
   447  	sharedMount := false
   448  	slaveMount := false
   449  
   450  	sourceMount, optionalOpts, err := getSourceMount(path)
   451  	if err != nil {
   452  		return err
   453  	}
   454  	// Make sure source mount point is shared.
   455  	optsSplit := strings.Split(optionalOpts, " ")
   456  	for _, opt := range optsSplit {
   457  		if strings.HasPrefix(opt, "shared:") {
   458  			sharedMount = true
   459  			break
   460  		} else if strings.HasPrefix(opt, "master:") {
   461  			slaveMount = true
   462  			break
   463  		}
   464  	}
   465  
   466  	if !sharedMount && !slaveMount {
   467  		return fmt.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
   468  	}
   469  	return nil
   470  }
   471  
   472  // Get the set of mount flags that are set on the mount that contains the given
   473  // path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
   474  // bind-mounting "with options" will not fail with user namespaces, due to
   475  // kernel restrictions that require user namespace mounts to preserve
   476  // CL_UNPRIVILEGED locked flags.
   477  func getUnprivilegedMountFlags(path string) ([]string, error) {
   478  	var statfs unix.Statfs_t
   479  	if err := unix.Statfs(path, &statfs); err != nil {
   480  		return nil, err
   481  	}
   482  
   483  	// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
   484  	unprivilegedFlags := map[uint64]string{
   485  		unix.MS_RDONLY:     "ro",
   486  		unix.MS_NODEV:      "nodev",
   487  		unix.MS_NOEXEC:     "noexec",
   488  		unix.MS_NOSUID:     "nosuid",
   489  		unix.MS_NOATIME:    "noatime",
   490  		unix.MS_RELATIME:   "relatime",
   491  		unix.MS_NODIRATIME: "nodiratime",
   492  	}
   493  
   494  	var flags []string
   495  	for mask, flag := range unprivilegedFlags {
   496  		if uint64(statfs.Flags)&mask == mask {
   497  			flags = append(flags, flag)
   498  		}
   499  	}
   500  
   501  	return flags, nil
   502  }
   503  
var (
	// mountPropagationMap maps user-facing volume propagation mode names
	// to the mount package's propagation flag constants.
	mountPropagationMap = map[string]int{
		"private":  mount.PRIVATE,
		"rprivate": mount.RPRIVATE,
		"shared":   mount.SHARED,
		"rshared":  mount.RSHARED,
		"slave":    mount.SLAVE,
		"rslave":   mount.RSLAVE,
	}

	// mountPropagationReverseMap is the inverse of mountPropagationMap,
	// used to render a propagation flag back into its option-string form.
	mountPropagationReverseMap = map[int]string{
		mount.PRIVATE:  "private",
		mount.RPRIVATE: "rprivate",
		mount.SHARED:   "shared",
		mount.RSHARED:  "rshared",
		mount.SLAVE:    "slave",
		mount.RSLAVE:   "rslave",
	}
)
   523  
   524  // inSlice tests whether a string is contained in a slice of strings or not.
   525  // Comparison is case sensitive
   526  func inSlice(slice []string, s string) bool {
   527  	for _, ss := range slice {
   528  		if s == ss {
   529  			return true
   530  		}
   531  	}
   532  	return false
   533  }
   534  
// setMounts merges the given container mounts into the spec's mount list.
// It drops default mounts overridden by the user, applies tmpfs options,
// bind-mount flags, mount propagation (adjusting rootfs propagation when
// needed), read-only handling, and user-namespace flag preservation.
func setMounts(daemon *Daemon, s *specs.Spec, c *container.Container, mounts []container.Mount) error {
	userMounts := make(map[string]struct{})
	for _, m := range mounts {
		userMounts[m.Destination] = struct{}{}
	}

	// Copy all mounts from spec to defaultMounts, except for
	//  - mounts overriden by a user supplied mount;
	//  - all mounts under /dev if a user supplied /dev is present;
	//  - /dev/shm, in case IpcMode is none.
	// While at it, also
	//  - set size for /dev/shm from shmsize.
	var defaultMounts []specs.Mount
	_, mountDev := userMounts["/dev"]
	for _, m := range s.Mounts {
		if _, ok := userMounts[m.Destination]; ok {
			// filter out mount overridden by a user supplied mount
			continue
		}
		if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
			// filter out everything under /dev if /dev is user-mounted
			continue
		}

		if m.Destination == "/dev/shm" {
			if c.HostConfig.IpcMode.IsNone() {
				// filter out /dev/shm for "none" IpcMode
				continue
			}
			// set size for /dev/shm mount from spec
			sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
			m.Options = append(m.Options, sizeOpt)
		}

		defaultMounts = append(defaultMounts, m)
	}

	s.Mounts = defaultMounts
	for _, m := range mounts {
		// Reject two user mounts targeting the same destination.
		for _, cm := range s.Mounts {
			if cm.Destination == m.Destination {
				return duplicateMountPointError(m.Destination)
			}
		}

		if m.Source == "tmpfs" {
			data := m.Data
			parser := volume.NewParser("linux")
			options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
			if data != "" {
				options = append(options, strings.Split(data, ",")...)
			}

			merged, err := mount.MergeTmpfsOptions(options)
			if err != nil {
				return err
			}

			s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
			continue
		}

		mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}

		// Determine property of RootPropagation based on volume
		// properties. If a volume is shared, then keep root propagation
		// shared. This should work for slave and private volumes too.
		//
		// For slave volumes, it can be either [r]shared/[r]slave.
		//
		// For private volumes any root propagation value should work.
		pFlag := mountPropagationMap[m.Propagation]
		if pFlag == mount.SHARED || pFlag == mount.RSHARED {
			if err := ensureShared(m.Source); err != nil {
				return err
			}
			rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
			if rootpg != mount.SHARED && rootpg != mount.RSHARED {
				s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
			}
		} else if pFlag == mount.SLAVE || pFlag == mount.RSLAVE {
			if err := ensureSharedOrSlave(m.Source); err != nil {
				return err
			}
			rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
			if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
				s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
			}
		}

		opts := []string{"rbind"}
		if !m.Writable {
			opts = append(opts, "ro")
		}
		if pFlag != 0 {
			opts = append(opts, mountPropagationReverseMap[pFlag])
		}

		// If we are using user namespaces, then we must make sure that we
		// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
		// "mount" when we bind-mount. The reason for this is that at the point
		// when runc sets up the root filesystem, it is already inside a user
		// namespace, and thus cannot change any flags that are locked.
		if daemon.configStore.RemappedRoot != "" {
			unprivOpts, err := getUnprivilegedMountFlags(m.Source)
			if err != nil {
				return err
			}
			opts = append(opts, unprivOpts...)
		}

		mt.Options = opts
		s.Mounts = append(s.Mounts, mt)
	}

	if s.Root.Readonly {
		// With a read-only rootfs, force "ro" onto every non-user mount
		// except the pseudo-filesystems the runtime needs writable.
		for i, m := range s.Mounts {
			switch m.Destination {
			case "/proc", "/dev/pts", "/dev/mqueue", "/dev":
				continue
			}
			if _, ok := userMounts[m.Destination]; !ok {
				if !inSlice(m.Options, "ro") {
					s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
				}
			}
		}
	}

	if c.HostConfig.Privileged {
		if !s.Root.Readonly {
			// clear readonly for /sys
			for i := range s.Mounts {
				if s.Mounts[i].Destination == "/sys" {
					clearReadOnly(&s.Mounts[i])
				}
			}
		}
		// Privileged containers get no masked or read-only paths.
		s.Linux.ReadonlyPaths = nil
		s.Linux.MaskedPaths = nil
	}

	// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
	// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
	if uidMap := daemon.idMappings.UIDs(); uidMap != nil || c.HostConfig.Privileged {
		for i, m := range s.Mounts {
			if m.Type == "cgroup" {
				clearReadOnly(&s.Mounts[i])
			}
		}
	}

	return nil
}
   689  
// populateCommonSpec fills in the spec fields common to all containers:
// the rootfs, process args/env/cwd/terminal, hostname, and — for
// containers run with --init in a private pid namespace — the /dev/init
// bind mount and wrapper command line.
func (daemon *Daemon) populateCommonSpec(s *specs.Spec, c *container.Container) error {
	linkedEnv, err := daemon.setupLinkedContainers(c)
	if err != nil {
		return err
	}
	s.Root = &specs.Root{
		Path:     c.BaseFS.Path(),
		Readonly: c.HostConfig.ReadonlyRootfs,
	}
	if err := c.SetupWorkingDirectory(daemon.idMappings.RootPair()); err != nil {
		return err
	}
	cwd := c.Config.WorkingDir
	if len(cwd) == 0 {
		cwd = "/"
	}
	s.Process.Args = append([]string{c.Path}, c.Args...)

	// only add the custom init if it is specified and the container is running in its
	// own private pid namespace.  It does not make sense to add if it is running in the
	// host namespace or another container's pid namespace where we already have an init
	if c.HostConfig.PidMode.IsPrivate() {
		// Per-container setting wins; otherwise fall back to the daemon default.
		if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
			(c.HostConfig.Init == nil && daemon.configStore.Init) {
			s.Process.Args = append([]string{"/dev/init", "--", c.Path}, c.Args...)
			var path string
			if daemon.configStore.InitPath == "" {
				path, err = exec.LookPath(daemonconfig.DefaultInitBinary)
				if err != nil {
					return err
				}
			}
			if daemon.configStore.InitPath != "" {
				path = daemon.configStore.InitPath
			}
			// Bind-mount the init binary read-only at /dev/init.
			s.Mounts = append(s.Mounts, specs.Mount{
				Destination: "/dev/init",
				Type:        "bind",
				Source:      path,
				Options:     []string{"bind", "ro"},
			})
		}
	}
	s.Process.Cwd = cwd
	s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
	s.Process.Terminal = c.Config.Tty
	s.Hostname = c.FullHostname()

	return nil
}
   740  
// createSpec builds the complete OCI runtime spec for the container,
// starting from the daemon's default spec and layering on cgroups paths,
// resources, devices, rlimits, user, namespaces, capabilities, seccomp,
// mounts, the libnetwork prestart hook, AppArmor/SELinux labels and
// related process settings.
func (daemon *Daemon) createSpec(c *container.Container) (*specs.Spec, error) {
	s := oci.DefaultSpec()
	if err := daemon.populateCommonSpec(&s, c); err != nil {
		return nil, err
	}

	// Determine the cgroup parent: container setting wins over daemon
	// setting, which wins over the default ("/docker", or "system.slice"
	// when the daemon drives cgroups through systemd).
	var cgroupsPath string
	scopePrefix := "docker"
	parent := "/docker"
	useSystemd := UsingSystemd(daemon.configStore)
	if useSystemd {
		parent = "system.slice"
	}

	if c.HostConfig.CgroupParent != "" {
		parent = c.HostConfig.CgroupParent
	} else if daemon.configStore.CgroupParent != "" {
		parent = daemon.configStore.CgroupParent
	}

	if useSystemd {
		// systemd expects "slice:prefix:name" rather than a filesystem path.
		cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
		logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
	} else {
		cgroupsPath = filepath.Join(parent, c.ID)
	}
	s.Linux.CgroupsPath = cgroupsPath

	if err := setResources(&s, c.HostConfig.Resources); err != nil {
		return nil, fmt.Errorf("linux runtime spec resources: %v", err)
	}
	s.Linux.Sysctl = c.HostConfig.Sysctls

	// For systemd, resolve the actual cgroup filesystem path so the
	// parent directory can be pre-created below.
	p := s.Linux.CgroupsPath
	if useSystemd {
		initPath, err := cgroups.GetInitCgroup("cpu")
		if err != nil {
			return nil, err
		}
		_, err = cgroups.GetOwnCgroup("cpu")
		if err != nil {
			return nil, err
		}
		p = filepath.Join(initPath, s.Linux.CgroupsPath)
	}

	// Clean path to guard against things like ../../../BAD
	parentPath := filepath.Dir(p)
	if !filepath.IsAbs(parentPath) {
		parentPath = filepath.Clean("/" + parentPath)
	}

	if err := daemon.initCgroupsPath(parentPath); err != nil {
		return nil, fmt.Errorf("linux init cgroups path: %v", err)
	}
	if err := setDevices(&s, c); err != nil {
		return nil, fmt.Errorf("linux runtime spec devices: %v", err)
	}
	if err := daemon.setRlimits(&s, c); err != nil {
		return nil, fmt.Errorf("linux runtime spec rlimits: %v", err)
	}
	if err := setUser(&s, c); err != nil {
		return nil, fmt.Errorf("linux spec user: %v", err)
	}
	if err := setNamespaces(daemon, &s, c); err != nil {
		return nil, fmt.Errorf("linux spec namespaces: %v", err)
	}
	if err := setCapabilities(&s, c); err != nil {
		return nil, fmt.Errorf("linux spec capabilities: %v", err)
	}
	if err := setSeccomp(daemon, &s, c); err != nil {
		return nil, fmt.Errorf("linux seccomp: %v", err)
	}

	if err := daemon.setupIpcDirs(c); err != nil {
		return nil, err
	}

	if err := daemon.setupSecretDir(c); err != nil {
		return nil, err
	}

	if err := daemon.setupConfigDir(c); err != nil {
		return nil, err
	}

	// Collect all container mounts (volumes, ipc, tmpfs, secrets, configs),
	// sort them, and merge them into the spec.
	ms, err := daemon.setupMounts(c)
	if err != nil {
		return nil, err
	}

	if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
		ms = append(ms, c.IpcMounts()...)
	}

	tmpfsMounts, err := c.TmpfsMounts()
	if err != nil {
		return nil, err
	}
	ms = append(ms, tmpfsMounts...)

	if m := c.SecretMounts(); m != nil {
		ms = append(ms, m...)
	}

	ms = append(ms, c.ConfigMounts()...)

	sort.Sort(mounts(ms))
	if err := setMounts(daemon, &s, c, ms); err != nil {
		return nil, fmt.Errorf("linux mounts: %v", err)
	}

	// For containers getting a fresh network namespace, register a
	// prestart hook that re-execs the daemon binary to wire up libnetwork.
	for _, ns := range s.Linux.Namespaces {
		if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
			target, err := os.Readlink(filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe"))
			if err != nil {
				return nil, err
			}

			s.Hooks = &specs.Hooks{
				Prestart: []specs.Hook{{
					Path: target, // FIXME: cross-platform
					Args: []string{"libnetwork-setkey", c.ID, daemon.netController.ID()},
				}},
			}
		}
	}

	if apparmor.IsEnabled() {
		var appArmorProfile string
		if c.AppArmorProfile != "" {
			appArmorProfile = c.AppArmorProfile
		} else if c.HostConfig.Privileged {
			appArmorProfile = "unconfined"
		} else {
			appArmorProfile = "docker-default"
		}

		if appArmorProfile == "docker-default" {
			// Unattended upgrades and other fun services can unload AppArmor
			// profiles inadvertently. Since we cannot store our profile in
			// /etc/apparmor.d, nor can we practically add other ways of
			// telling the system to keep our profile loaded, in order to make
			// sure that we keep the default profile enabled we dynamically
			// reload it if necessary.
			if err := ensureDefaultAppArmorProfile(); err != nil {
				return nil, err
			}
		}

		s.Process.ApparmorProfile = appArmorProfile
	}
	s.Process.SelinuxLabel = c.GetProcessLabel()
	s.Process.NoNewPrivileges = c.NoNewPrivileges
	s.Process.OOMScoreAdj = &c.HostConfig.OomScoreAdj
	s.Linux.MountLabel = c.MountLabel

	return &s, nil
}
   900  
   901  func clearReadOnly(m *specs.Mount) {
   902  	var opt []string
   903  	for _, o := range m.Options {
   904  		if o != "ro" {
   905  			opt = append(opt, o)
   906  		}
   907  	}
   908  	m.Options = opt
   909  }
   910  
   911  // mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
   912  func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
   913  	ulimits := c.Ulimits
   914  	// Merge ulimits with daemon defaults
   915  	ulIdx := make(map[string]struct{})
   916  	for _, ul := range ulimits {
   917  		ulIdx[ul.Name] = struct{}{}
   918  	}
   919  	for name, ul := range daemon.configStore.Ulimits {
   920  		if _, exists := ulIdx[name]; !exists {
   921  			ulimits = append(ulimits, ul)
   922  		}
   923  	}
   924  	c.Ulimits = ulimits
   925  }